-rw-r--r--  .gitignore                       |   6
-rw-r--r--  data/enm-dump.rb                 |  16
-rw-r--r--  data/enm-import.rb               |  47
-rw-r--r--  lib/classification.rb            |   3
-rw-r--r--  lib/compound.rb                  |  20
-rw-r--r--  lib/crossvalidation.rb           | 122
-rw-r--r--  lib/dataset.rb                   | 199
-rw-r--r--  lib/feature.rb                   |  11
-rw-r--r--  lib/import.rb                    |  73
-rw-r--r--  lib/lazar.rb                     |   7
-rw-r--r--  lib/leave-one-out-validation.rb  | 115
-rw-r--r--  lib/model.rb                     | 127
-rw-r--r--  lib/nanoparticle.rb              |  69
-rw-r--r--  lib/opentox.rb                   |   6
-rw-r--r--  lib/regression.rb                |  35
-rw-r--r--  lib/substance.rb                 |  10
-rw-r--r--  lib/validation-statistics.rb     | 101
-rw-r--r--  lib/validation.rb                |  63
-rw-r--r--  test/classification.rb           |  12
-rw-r--r--  test/dataset.rb                  |  50
-rw-r--r--  test/nanoparticles.rb            |  34
-rw-r--r--  test/prediction_models.rb        |   1
-rw-r--r--  test/validation.rb               |  22
23 files changed, 633 insertions, 516 deletions
diff --git a/.gitignore b/.gitignore
index 791dc27..6e0f374 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,8 +1,5 @@
-last-utils
-libfminer
+R
openbabel
-fminer_debug.txt
-test/fminer_debug.txt
Gemfile.lock
*.gem
.bundle
@@ -11,3 +8,4 @@ pkg/*
.yardoc/
doc/
lazar.log
+data
diff --git a/data/enm-dump.rb b/data/enm-dump.rb
new file mode 100644
index 0000000..c1c25e7
--- /dev/null
+++ b/data/enm-dump.rb
@@ -0,0 +1,16 @@
+require 'json'
+
+#get list of bundle URIs
+`wget 'https://data.enanomapper.net/bundle?media=application%2Fjson' -O bundles.json`
+json = JSON.parse File.read('./bundles.json')
+json["dataset"].each do |dataset|
+ uri = dataset["URI"]
+ id = uri.split("/").last
+ `wget --header='accept:application/json' '#{uri}' -O 'bundle#{id}'`
+ `wget --header='accept:application/json' '#{dataset["summary"]}' -O 'summary#{id}.json'`
+ `wget --header='accept:application/json' '#{dataset["compound"]}' -O 'compound#{id}.json'`
+ `wget --header='accept:application/json' '#{dataset["substance"]}' -O 'substance#{id}.json'`
+ `wget --header='accept:application/json' '#{dataset["property"]}' -O 'property#{id}.json'`
+ `wget --header='accept:application/json' '#{dataset["dataset"]}' -O 'dataset#{id}.json'`
+ `wget --header='accept:application/json' '#{dataset["matrix"]}' -O 'matrix#{id}.json'`
+end
diff --git a/data/enm-import.rb b/data/enm-import.rb
new file mode 100644
index 0000000..37bc22b
--- /dev/null
+++ b/data/enm-import.rb
@@ -0,0 +1,47 @@
+require_relative '../lib/lazar.rb'
+include OpenTox
+$mongo.database.drop
+$gridfs = $mongo.database.fs
+
+#get list of bundle URIs
+bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"]
+bundles.each do |bundle|
+ uri = bundle["URI"]
+ nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"]+"?media=application%2Fjson"))["dataEntry"]
+ features = JSON.parse(RestClientWrapper.get(bundle["property"]+"?media=application%2Fjson"))["feature"]
+ nanoparticles.each do |np|
+ nanoparticle = Nanoparticle.find_or_create_by(
+ :name => np["values"]["https://data.enanomapper.net/identifier/name"],
+ :source => np["compound"]["URI"],
+ )
+ nanoparticle.bundles << uri
+ nanoparticle.bundles.uniq!
+ np["composition"].each do |comp|
+ case comp["relation"]
+ when "HAS_CORE"
+ nanoparticle.core = comp["component"]["compound"]["URI"]
+ when "HAS_COATING"
+ nanoparticle.coating << comp["component"]["compound"]["URI"]
+ end
+ end if np["composition"]
+ np["values"].each do |u,v|
+ if u.match(/property/)
+ name, unit, source = nil
+ features.each do |uri,feat|
+ if u.match(/#{uri}/)
+ name = feat["title"]
+ unit = feat["units"]
+ source = uri
+ end
+ end
+ feature = Feature.find_or_create_by(
+ :name => name,
+ :unit => unit,
+ :source => source
+ )
+ end
+ v.each{|value| nanoparticle.parse_ambit_value feature, value} if v.is_a? Array
+ end
+ nanoparticle.save!
+ end
+end
diff --git a/lib/classification.rb b/lib/classification.rb
index b9b66f0..93b4f0f 100644
--- a/lib/classification.rb
+++ b/lib/classification.rb
@@ -9,10 +9,9 @@ module OpenTox
sims = {}
neighbors.each do |n|
sim = n["tanimoto"]
- n["features"][feature_id].each do |act|
+ n["toxicities"][feature_id].each do |act|
sims[act] ||= []
sims[act] << sim
- #sims[act] << 0.5*sim+0.5 # scale to 1-0.5
end
end
sim_all = sims.collect{|a,s| s}.flatten
diff --git a/lib/compound.rb b/lib/compound.rb
index 2a79fd6..049d77b 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -2,10 +2,8 @@ CACTUS_URI="http://cactus.nci.nih.gov/chemical/structure/"
module OpenTox
- class Compound
+ class Compound < Substance
require_relative "unique_descriptors.rb"
- include OpenTox
-
DEFAULT_FINGERPRINT = "MP2D"
field :inchi, type: String
@@ -19,9 +17,6 @@ module OpenTox
field :sdf_id, type: BSON::ObjectId
field :fingerprints, type: Hash, default: {}
field :default_fingerprint_size, type: Integer
- field :physchem_descriptors, type: Hash, default: {}
- field :dataset_ids, type: Array, default: []
- field :features, type: Hash, default: {}
index({smiles: 1}, {unique: true})
@@ -293,8 +288,7 @@ module OpenTox
training_dataset.compounds.each do |compound|
candidate_fingerprint = compound.fingerprint params[:type]
sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f
- feature_values = training_dataset.values(compound,prediction_feature)
- neighbors << {"_id" => compound.id, "features" => {prediction_feature.id.to_s => feature_values}, "tanimoto" => sim} if sim >= params[:min_sim]
+ neighbors << {"_id" => compound.id, "toxicities" => {prediction_feature.id.to_s => compound.toxicities[prediction_feature.id.to_s]}, "tanimoto" => sim} if sim >= params[:min_sim]
end
neighbors.sort!{|a,b| b["tanimoto"] <=> a["tanimoto"]}
end
@@ -335,25 +329,25 @@ module OpenTox
'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [default_fingerprint_size, '$default_fingerprint_size']}, '$$common']}]}
}},
'_id' => 1,
- 'features' => 1,
+ 'toxicities' => 1,
'dataset_ids' => 1
}},
{'$match' => {'tanimoto' => {'$gte' => params[:min_sim]}}},
{'$sort' => {'tanimoto' => -1}}
]
- $mongo["compounds"].aggregate(aggregate).select{|r| r["dataset_ids"].include? params[:training_dataset_id]}
+ $mongo["substances"].aggregate(aggregate).select{|r| r["dataset_ids"].include? params[:training_dataset_id]}
end
- # Convert mg to mmol
+ # Convert mmol to mg
# @return [Float] value in mg
def mmol_to_mg mmol
mmol.to_f*molecular_weight
end
- # Convert mmol to mg
- # @return [Float] value in mg
+ # Convert mg to mmol
+ # @return [Float] value in mmol
def mg_to_mmol mg
mg.to_f/molecular_weight
end
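
The two doc comments above were swapped in the old source; the hunk corrects them: mmol_to_mg multiplies by the molecular weight (mg/mmol), mg_to_mmol divides by it. A minimal round-trip sketch, assuming aspirin's molecular weight of roughly 180.16 g/mol:

    # Sketch: round-trip unit conversion (values approximate).
    compound = OpenTox::Compound.from_smiles "CC(=O)Oc1ccccc1C(=O)O" # aspirin
    mg   = compound.mmol_to_mg 2.0  # 2 mmol * 180.16 mg/mmol => ~360.3 mg
    mmol = compound.mg_to_mmol mg   # ~360.3 mg / 180.16 mg/mmol => ~2.0 mmol
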
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index 6ffeb25..50afb6f 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -6,7 +6,7 @@ module OpenTox
field :folds, type: Integer
field :nr_instances, type: Integer
field :nr_unpredicted, type: Integer
- field :predictions, type: Array, default: []
+ field :predictions, type: Hash, default: {}
field :finished_at, type: Time
def time
@@ -22,8 +22,10 @@ module OpenTox
end
def self.create model, n=10
- model.training_dataset.features.first.nominal? ? klass = ClassificationCrossValidation : klass = RegressionCrossValidation
- bad_request_error "#{dataset.features.first} is neither nominal nor numeric." unless klass
+ klass = ClassificationCrossValidation if model.is_a? Model::LazarClassification
+ klass = RegressionCrossValidation if model.is_a? Model::LazarRegression
+ bad_request_error "Unknown model class #{model.class}." unless klass
+
cv = klass.new(
name: model.name,
model_id: model.id,
@@ -32,22 +34,22 @@ module OpenTox
cv.save # set created_at
nr_instances = 0
nr_unpredicted = 0
- predictions = []
+ predictions = {}
training_dataset = Dataset.find model.training_dataset_id
training_dataset.folds(n).each_with_index do |fold,fold_nr|
- #fork do # parallel execution of validations
+ #fork do # parallel execution of validations can lead to Rserve and memory problems
$logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started"
t = Time.now
validation = Validation.create(model, fold[0], fold[1],cv)
$logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds"
#end
end
- #Process.waitall
+ Process.waitall
cv.validation_ids = Validation.where(:crossvalidation_id => cv.id).distinct(:_id)
cv.validations.each do |validation|
nr_instances += validation.nr_instances
nr_unpredicted += validation.nr_unpredicted
- predictions += validation.predictions
+ predictions.merge! validation.predictions
end
cv.update_attributes(
nr_instances: nr_instances,
@@ -73,61 +75,8 @@ module OpenTox
# TODO auc, f-measure (usability??)
def statistics
- accept_values = Feature.find(model.prediction_feature_id).accept_values
- confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
- weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
- true_rate = {}
- predictivity = {}
- predictions.each do |pred|
- compound_id,activities,prediction,confidence = pred
- if activities and prediction #and confidence.numeric?
- if activities.uniq.size == 1
- activity = activities.uniq.first
- if prediction == activity
- if prediction == accept_values[0]
- confusion_matrix[0][0] += 1
- #weighted_confusion_matrix[0][0] += confidence
- elsif prediction == accept_values[1]
- confusion_matrix[1][1] += 1
- #weighted_confusion_matrix[1][1] += confidence
- end
- elsif prediction != activity
- if prediction == accept_values[0]
- confusion_matrix[0][1] += 1
- #weighted_confusion_matrix[0][1] += confidence
- elsif prediction == accept_values[1]
- confusion_matrix[1][0] += 1
- #weighted_confusion_matrix[1][0] += confidence
- end
- end
- end
- else
- nr_unpredicted += 1 if prediction.nil?
- end
- end
- true_rate = {}
- predictivity = {}
- accept_values.each_with_index do |v,i|
- true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
- predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
- end
- confidence_sum = 0
- #weighted_confusion_matrix.each do |r|
- #r.each do |c|
- #confidence_sum += c
- #end
- #end
- update_attributes(
- accept_values: accept_values,
- confusion_matrix: confusion_matrix,
- #weighted_confusion_matrix: weighted_confusion_matrix,
- accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
- #weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
- true_rate: true_rate,
- predictivity: predictivity,
- finished_at: Time.now
- )
- $logger.debug "Accuracy #{accuracy}"
+ stat = ValidationStatistics.classification(predictions, Feature.find(model.prediction_feature_id).accept_values)
+ update_attributes(stat)
end
def confidence_plot
@@ -169,52 +118,11 @@ module OpenTox
field :correlation_plot_id, type: BSON::ObjectId
def statistics
- rmse = 0
- mae = 0
- x = []
- y = []
- predictions.each do |pred|
- compound_id,activity,prediction,confidence = pred
- if activity and prediction
- unless activity == [nil]
- x << -Math.log10(activity.median)
- y << -Math.log10(prediction)
- error = Math.log10(prediction)-Math.log10(activity.median)
- rmse += error**2
- #weighted_rmse += confidence*error**2
- mae += error.abs
- #weighted_mae += confidence*error.abs
- #confidence_sum += confidence
- end
- else
- warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
- $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
- end
- end
- R.assign "measurement", x
- R.assign "prediction", y
- R.eval "r <- cor(measurement,prediction,use='complete')"
- r = R.eval("r").to_ruby
-
- mae = mae/predictions.size
- #weighted_mae = weighted_mae/confidence_sum
- rmse = Math.sqrt(rmse/predictions.size)
- #weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
- update_attributes(
- mae: mae,
- rmse: rmse,
- #weighted_mae: weighted_mae,
- #weighted_rmse: weighted_rmse,
- r_squared: r**2,
- finished_at: Time.now
- )
- $logger.debug "R^2 #{r**2}"
- $logger.debug "RMSE #{rmse}"
- $logger.debug "MAE #{mae}"
+ stat = ValidationStatistics.regression predictions
+ update_attributes(stat)
end
def misclassifications n=nil
- #n = predictions.size unless n
n ||= 10
model = Model::Lazar.find(self.model_id)
training_dataset = Dataset.find(model.training_dataset_id)
@@ -225,8 +133,7 @@ module OpenTox
neighbors = compound.send(model.neighbor_algorithm,model.neighbor_algorithm_parameters)
neighbors.collect! do |n|
neighbor = Compound.find(n[0])
- values = training_dataset.values(neighbor,prediction_feature)
- { :smiles => neighbor.smiles, :similarity => n[1], :measurements => values}
+ { :smiles => neighbor.smiles, :similarity => n[1], :measurements => neighbor.toxicities[prediction_feature.id.to_s]}
end
{
:smiles => compound.smiles,
@@ -297,5 +204,4 @@ module OpenTox
end
end
-
end
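
CrossValidation.create now picks the validation class from the model's class instead of inspecting the first training feature, and the per-fold prediction Arrays are replaced by one Hash keyed by substance id, merged across folds. A usage sketch, assuming a numeric endpoint (the CSV file name is hypothetical):

    training_dataset = OpenTox::Dataset.from_csv_file "EPAFHM.csv" # hypothetical file
    feature = training_dataset.features.first
    model = OpenTox::Model::LazarRegression.create feature, training_dataset
    cv = OpenTox::CrossValidation.create model, 10 # dispatches to RegressionCrossValidation
    puts cv.predictions.size # merged Hash of per-substance predictions
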
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 5d8aeaf..b51d74b 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -5,23 +5,28 @@ module OpenTox
class Dataset
- # associations like has_many, belongs_to deteriorate performance
+ field :substance_ids, type: Array, default: []
field :feature_ids, type: Array, default: []
- field :compound_ids, type: Array, default: []
- field :data_entries, type: Array, default: []
- field :source, type: String
# Readers
- # Get all compounds
def compounds
- @compounds ||= self.compound_ids.collect{|id| OpenTox::Compound.find id}
- @compounds
+ substances.select{|s| s.is_a? Compound}
+ end
+
+ def nanoparticles
+ substances.select{|s| s.is_a? Nanoparticle}
+ end
+
+ # Get all substances
+ def substances
+ @substances ||= substance_ids.collect{|id| OpenTox::Substance.find id}
+ @substances
end
# Get all features
def features
- @features ||= self.feature_ids.collect{|id| OpenTox::Feature.find(id)}
+ @features ||= feature_ids.collect{|id| OpenTox::Feature.find(id)}
@features
end
@@ -29,17 +34,15 @@ module OpenTox
# @param compound [OpenTox::Compound] OpenTox Compound object
# @param feature [OpenTox::Feature] OpenTox Feature object
# @return [Array] Data entry values
- def values(compound, feature)
- rows = compound_ids.each_index.select{|r| compound_ids[r] == compound.id }
- col = feature_ids.index feature.id
- rows.collect{|row| data_entries[row][col]}
- end
+ #def values(compound, feature)
+ #data_entries[compound.id.to_s][feature.id.to_s]
+ #end
# Writers
# Set compounds
def compounds=(compounds)
- self.compound_ids = compounds.collect{|c| c.id}
+ self.substance_ids = compounds.collect{|c| c.id}
end
# Set features
@@ -53,13 +56,7 @@ module OpenTox
# @param [Integer] number of folds
# @return [Array] Array with folds [training_dataset,test_dataset]
def folds n
- unique_compound_data = {}
- compound_ids.each_with_index do |cid,i|
- unique_compound_data[cid] ||= []
- unique_compound_data[cid] << data_entries[i]
- end
- unique_compound_ids = unique_compound_data.keys
- len = unique_compound_ids.size
+ len = self.substance_ids.size
indices = (0..len-1).to_a.shuffle
mid = (len/n)
chunks = []
@@ -68,24 +65,15 @@ module OpenTox
last = start+mid
last = last-1 unless len%n >= i
test_idxs = indices[start..last] || []
- test_cids = test_idxs.collect{|i| unique_compound_ids[i]}
+ test_cids = test_idxs.collect{|i| substance_ids[i]}
training_idxs = indices-test_idxs
- training_cids = training_idxs.collect{|i| unique_compound_ids[i]}
- chunk = [training_cids,test_cids].collect do |unique_cids|
- cids = []
- data_entries = []
- unique_cids.each do |cid|
- unique_compound_data[cid].each do |de|
- cids << cid
- data_entries << de
- end
- end
- dataset = self.class.new(:compound_ids => cids, :feature_ids => self.feature_ids, :data_entries => data_entries, :source => self.id )
+ training_cids = training_idxs.collect{|i| substance_ids[i]}
+ chunk = [training_cids,test_cids].collect do |cids|
+ dataset = self.class.create(:substance_ids => cids, :feature_ids => feature_ids, :source => self.id )
dataset.compounds.each do |compound|
compound.dataset_ids << dataset.id
compound.save
end
- dataset.save
dataset
end
start = last+1
@@ -94,41 +82,28 @@ module OpenTox
chunks
end
- # Diagnostics
-
- def duplicates feature=self.features.first
- col = feature_ids.index feature.id
- dups = {}
- compound_ids.each_with_index do |cid,i|
- rows = compound_ids.each_index.select{|r| compound_ids[r] == cid }
- values = rows.collect{|row| data_entries[row][col]}
- dups[cid] = values if values.size > 1
- end
- dups
- end
-
- def correlation_plot training_dataset
- # TODO: create/store svg
- R.assign "features", data_entries
- R.assign "activities", training_dataset.data_entries.collect{|de| de.first}
- R.eval "featurePlot(features,activities)"
- end
-
- def density_plot
- # TODO: create/store svg
- R.assign "acts", data_entries.collect{|r| r.first }#.compact
- R.eval "plot(density(-log(acts),na.rm= TRUE), main='-log(#{features.first.name})')"
- end
-
# Serialisation
# converts dataset to csv format including compound smiles as first column, other column headers are feature names
# @return [String]
def to_csv(inchi=false)
- CSV.generate() do |csv| #{:force_quotes=>true}
- csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name}
- compounds.each_with_index do |c,i|
- csv << [inchi ? c.inchi : c.smiles] + data_entries[i]
+ CSV.generate() do |csv|
+ compound = Substance.find(substance_ids.first).is_a? Compound
+ if compound
+ csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name}
+ else
+ csv << ["Name"] + features.collect{|f| f.name}
+ end
+ substances.each do |substance|
+ features.each do |f|
+ substance.toxicities[f.id.to_s].each do |v|
+ if compound
+ csv << [inchi ? substance.inchi : substance.smiles , v]
+ else
+ csv << [substance.name , v]
+ end
+ end if substance.toxicities[f.id.to_s]
+ end
end
end
end
@@ -144,7 +119,7 @@ module OpenTox
# Create a dataset from CSV file
# TODO: document structure
- def self.from_csv_file file, source=nil, bioassay=true#, layout={}
+ def self.from_csv_file file, source=nil
source ||= file
name = File.basename(file,".*")
dataset = self.find_by(:source => source, :name => name)
@@ -154,51 +129,40 @@ module OpenTox
$logger.debug "Parsing #{file}."
table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8'
dataset = self.new(:source => source, :name => name)
- dataset.parse_table table, bioassay#, layout
+ dataset.parse_table table
end
dataset
end
# parse data in tabular format (e.g. from csv)
# does a lot of guesswork in order to determine feature types
- def parse_table table, bioassay=true
+ def parse_table table
time = Time.now
# features
feature_names = table.shift.collect{|f| f.strip}
- warnings << "Duplicate features in table header." unless feature_names.size == feature_names.uniq.size
+ warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size
compound_format = feature_names.shift.strip
+ # TODO nanoparticles
bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i
-
numeric = []
# guess feature types
feature_names.each_with_index do |f,i|
metadata = {:name => f}
values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact
types = values.collect{|v| v.numeric? ? true : false}.uniq
+ feature = nil
if values.size == 0 # empty feature
elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes
metadata["numeric"] = true
numeric[i] = true
+ feature = NumericFeature.find_or_create_by(metadata)
else
metadata["nominal"] = true
metadata["accept_values"] = values
numeric[i] = false
- end
- if bioassay
- if metadata["numeric"]
- feature = NumericBioAssay.find_or_create_by(metadata)
- elsif metadata["nominal"]
- feature = NominalBioAssay.find_or_create_by(metadata)
- end
- else
- metadata.merge({:measured => false, :calculated => true})
- if metadata["numeric"]
- feature = NumericFeature.find_or_create_by(metadata)
- elsif metadata["nominal"]
- feature = NominalFeature.find_or_create_by(metadata)
- end
+ feature = NominalFeature.find_or_create_by(metadata)
end
feature_ids << feature.id if feature
end
@@ -211,59 +175,54 @@ module OpenTox
value_time = 0
# compounds and values
- self.data_entries = []
table.each_with_index do |vals,i|
ct = Time.now
identifier = vals.shift.strip
- warnings << "No feature values for compound at position #{i+2}." if vals.compact.empty?
+ warn "No feature values for compound at position #{i+2}." if vals.compact.empty?
begin
case compound_format
when /SMILES/i
compound = OpenTox::Compound.from_smiles(identifier)
when /InChI/i
compound = OpenTox::Compound.from_inchi(identifier)
+ # TODO nanoparticle
end
rescue
compound = nil
end
- if compound.nil?
- # compound parsers may return nil
- warnings << "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored."
+ if compound.nil? # compound parsers may return nil
+ warn "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored."
next
end
+ substance_ids << compound.id
compound.dataset_ids << self.id unless compound.dataset_ids.include? self.id
compound_time += Time.now-ct
r += 1
- unless vals.size == feature_ids.size # way cheaper than accessing features
- warnings << "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored."
+ unless vals.size == feature_ids.size
+ warn "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored."
next
end
- compound_ids << compound.id
- table.first.size == 0 ? self.data_entries << Array.new(0) : self.data_entries << Array.new(table.first.size-1)
-
vals.each_with_index do |v,j|
if v.blank?
- warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})."
+ warn "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})."
next
elsif numeric[j]
v = v.to_f
else
v = v.strip
end
- self.data_entries.last[j] = v
- #i = compound.feature_ids.index feature_ids[j]
- compound.features[feature_ids[j].to_s] ||= []
- compound.features[feature_ids[j].to_s] << v
+ compound.toxicities[feature_ids[j].to_s] ||= []
+ compound.toxicities[feature_ids[j].to_s] << v
compound.save
end
end
compounds.duplicates.each do |compound|
positions = []
compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == compound.inchi}
- warnings << "Duplicate compound #{compound.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
+ warn "Duplicate compound #{compound.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
end
$logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})"
@@ -273,52 +232,26 @@ module OpenTox
end
- # Fill unset data entries
- # @param any value
- def fill_nil_with n
- (0 .. compound_ids.size-1).each do |i|
- data_entries[i] ||= []
- (0 .. feature_ids.size-1).each do |j|
- data_entries[i][j] ||= n
- end
- end
- end
-
end
# Dataset for lazar predictions
- class LazarPrediction < Dataset
+ class LazarPrediction #< Dataset
field :creator, type: String
- field :prediction_feature_id, type: String
+ field :prediction_feature_id, type: BSON::ObjectId
+ field :predictions, type: Hash, default: {}
def prediction_feature
Feature.find prediction_feature_id
end
- end
-
- # Dataset for descriptors (physchem)
- class DescriptorDataset < Dataset
- field :feature_calculation_algorithm, type: String
-
- end
-
- class ScaledDataset < DescriptorDataset
-
- field :centers, type: Array, default: []
- field :scales, type: Array, default: []
+ def compounds
+ substances.select{|s| s.is_a? Compound}
+ end
- def original_value value, i
- value * scales[i] + centers[i]
+ def substances
+ predictions.keys.collect{|id| Substance.find id}
end
- end
- # Dataset for fminer descriptors
- class FminerDataset < DescriptorDataset
- field :training_algorithm, type: String
- field :training_dataset_id, type: BSON::ObjectId
- field :training_feature_id, type: BSON::ObjectId
- field :training_parameters, type: Hash
end
end
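
folds now partitions substance_ids directly; the per-compound duplicate bookkeeping could be dropped because measurements live in each substance's toxicities Hash rather than in the dataset's data_entries. A sketch of iterating the folds, assuming an already populated dataset:

    # Each fold is a pair [training_dataset, test_dataset].
    dataset.folds(10).each_with_index do |(training, test), i|
      puts "fold #{i}: #{training.substance_ids.size} training / #{test.substance_ids.size} test"
    end
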
diff --git a/lib/feature.rb b/lib/feature.rb
index b58946b..c6fb68a 100644
--- a/lib/feature.rb
+++ b/lib/feature.rb
@@ -6,6 +6,9 @@ module OpenTox
field :numeric, type: Boolean
field :measured, type: Boolean
field :calculated, type: Boolean
+ field :category, type: String
+ field :unit, type: String
+ field :conditions, type: Hash
end
# Feature for categorical variables
@@ -34,12 +37,4 @@ module OpenTox
end
end
- # Feature for categorical bioassay results
- class NominalBioAssay < NominalFeature
- end
-
- # Feature for quantitative bioassay results
- class NumericBioAssay < NumericFeature
- end
-
end
diff --git a/lib/import.rb b/lib/import.rb
new file mode 100644
index 0000000..9091207
--- /dev/null
+++ b/lib/import.rb
@@ -0,0 +1,73 @@
+module OpenTox
+
+ module Import
+
+ class Enanomapper
+ include OpenTox
+
+ def self.import
+ #get list of bundle URIs
+ bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"]
+ datasets = []
+ bundles.each do |bundle|
+ uri = bundle["URI"]
+ dataset = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"])
+ nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"]+"?media=application%2Fjson"))["dataEntry"]
+ features = JSON.parse(RestClientWrapper.get(bundle["property"]+"?media=application%2Fjson"))["feature"]
+ nanoparticles.each do |np|
+ nanoparticle = Nanoparticle.find_or_create_by(
+ :name => np["values"]["https://data.enanomapper.net/identifier/name"],
+ :source => np["compound"]["URI"],
+ )
+ dataset.substance_ids << nanoparticle.id
+ dataset.substance_ids.uniq!
+ studies = JSON.parse(RestClientWrapper.get(File.join(np["compound"]["URI"],"study")))["study"]
+ studies.each do |study|
+ study["effects"].each do |effect|
+ effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature
+ # TODO parse core/coating
+ # TODO parse proteomics, they come as a large textValue
+ $logger.debug File.join(np["compound"]["URI"],"study")
+ effect["conditions"].delete_if { |k, v| v.nil? }
+ feature = klass.find_or_create_by(
+ :source => File.join(np["compound"]["URI"],"study"),
+ :name => "#{study["protocol"]["category"]["title"]} #{study["protocol"]["endpoint"]}",
+ :unit => effect["result"]["unit"],
+ :category => study["protocol"]["topcategory"],
+ :conditions => effect["conditions"]
+ )
+ nanoparticle.parse_ambit_value feature, effect["result"]
+ dataset.feature_ids << feature.id
+ dataset.feature_ids.uniq!
+ end
+ end
+ end
+ dataset.save
+ datasets << dataset
+ end
+ datasets.collect{|d| d.id}
+ end
+
+ def self.dump
+ #get list of bundle URIs
+ `wget 'https://data.enanomapper.net/bundle?media=application%2Fjson' -O bundles.json`
+ json = JSON.parse File.read('./bundles.json')
+ json["dataset"].each do |dataset|
+ uri = dataset["URI"]
+ id = uri.split("/").last
+ `wget --header='accept:application/json' '#{uri}' -O 'bundle#{id}'`
+ `wget --header='accept:application/json' '#{dataset["summary"]}' -O 'summary#{id}.json'`
+ `wget --header='accept:application/json' '#{dataset["compound"]}' -O 'compound#{id}.json'`
+ `wget --header='accept:application/json' '#{dataset["substance"]}' -O 'substance#{id}.json'`
+ `wget --header='accept:application/json' '#{dataset["property"]}' -O 'property#{id}.json'`
+ `wget --header='accept:application/json' '#{dataset["dataset"]}' -O 'dataset#{id}.json'`
+ `wget --header='accept:application/json' '#{dataset["matrix"]}' -O 'matrix#{id}.json'`
+ end
+ end
+
+ end
+
+ end
+
+end
+
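
Import::Enanomapper moves the enm-import.rb logic into the library and returns the ids of the created datasets. A console sketch, assuming a reachable https://data.enanomapper.net and a configured MongoDB:

    require_relative "lib/lazar.rb"
    include OpenTox
    dataset_ids = Import::Enanomapper.import
    dataset_ids.each { |id| puts Dataset.find(id).name }
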
diff --git a/lib/lazar.rb b/lib/lazar.rb
index a28ba3a..8eb46e0 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -61,7 +61,8 @@ suppressPackageStartupMessages({
"
# OpenTox classes and includes
-CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules
+#CLASSES = ["Feature","Substance::Compound","Substance::Nanoparticle","Dataset","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules
+CLASSES = ["Feature","Substance","Dataset","LazarPrediction","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules
[ # be aware of the require sequence as it affects class/method overwrites
"overwrite.rb",
@@ -70,7 +71,9 @@ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","LeaveO
"opentox.rb",
"feature.rb",
"physchem.rb",
+ "substance.rb",
"compound.rb",
+ "nanoparticle.rb",
"dataset.rb",
"algorithm.rb",
"model.rb",
@@ -79,6 +82,8 @@ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","LeaveO
"validation.rb",
"crossvalidation.rb",
"leave-one-out-validation.rb",
+ "validation-statistics.rb",
"experiment.rb",
+ "import.rb",
].each{ |f| require_relative f }
OpenTox::PhysChem.descriptors # load descriptor features
diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb
index 0a131a4..ed917eb 100644
--- a/lib/leave-one-out-validation.rb
+++ b/lib/leave-one-out-validation.rb
@@ -6,22 +6,31 @@ module OpenTox
field :dataset_id, type: BSON::ObjectId
field :nr_instances, type: Integer
field :nr_unpredicted, type: Integer
- field :predictions, type: Array
+ field :predictions, type: Hash
field :finished_at, type: Time
def self.create model
+ $logger.debug "#{model.name}: LOO validation started"
+ t = Time.now
model.training_dataset.features.first.nominal? ? klass = ClassificationLeaveOneOutValidation : klass = RegressionLeaveOneOutValidation
loo = klass.new :model_id => model.id, :dataset_id => model.training_dataset_id
- compound_ids = model.training_dataset.compound_ids
predictions = model.predict model.training_dataset.compounds
- predictions = predictions.each_with_index {|p,i| p[:compound_id] = compound_ids[i]}
- predictions.select!{|p| p[:database_activities] and !p[:database_activities].empty?}
+ predictions.each{|cid,p| p.delete(:neighbors)}
+ nr_unpredicted = 0
+ predictions.each do |cid,prediction|
+ if prediction[:value]
+ prediction[:measured] = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s]
+ else
+ nr_unpredicted += 1
+ end
+ predictions.delete(cid) unless prediction[:value] and prediction[:measured]
+ end
loo.nr_instances = predictions.size
- predictions.select!{|p| p[:value]} # remove unpredicted
- loo.predictions = predictions#.sort{|a,b| b[:confidence] <=> a[:confidence]}
- loo.nr_unpredicted = loo.nr_instances - loo.predictions.size
+ loo.nr_unpredicted = nr_unpredicted
+ loo.predictions = predictions
loo.statistics
loo.save
+ $logger.debug "#{model.name}, LOO validation: #{Time.now-t} seconds"
loo
end
@@ -42,53 +51,8 @@ module OpenTox
field :confidence_plot_id, type: BSON::ObjectId
def statistics
- accept_values = Feature.find(model.prediction_feature_id).accept_values
- confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
- weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
- predictions.each do |pred|
- pred[:database_activities].each do |db_act|
- if pred[:value]
- if pred[:value] == db_act
- if pred[:value] == accept_values[0]
- confusion_matrix[0][0] += 1
- #weighted_confusion_matrix[0][0] += pred[:confidence]
- elsif pred[:value] == accept_values[1]
- confusion_matrix[1][1] += 1
- #weighted_confusion_matrix[1][1] += pred[:confidence]
- end
- else
- if pred[:value] == accept_values[0]
- confusion_matrix[0][1] += 1
- #weighted_confusion_matrix[0][1] += pred[:confidence]
- elsif pred[:value] == accept_values[1]
- confusion_matrix[1][0] += 1
- #weighted_confusion_matrix[1][0] += pred[:confidence]
- end
- end
- end
- end
- end
- accept_values.each_with_index do |v,i|
- true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
- predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
- end
- confidence_sum = 0
-# weighted_confusion_matrix.each do |r|
-# r.each do |c|
-# confidence_sum += c
-# end
-# end
- update_attributes(
- accept_values: accept_values,
- confusion_matrix: confusion_matrix,
-# weighted_confusion_matrix: weighted_confusion_matrix,
- accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
-# weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
- true_rate: true_rate,
- predictivity: predictivity,
- finished_at: Time.now
- )
- $logger.debug "Accuracy #{accuracy}"
+ stat = ValidationStatistics.classification(predictions, Feature.find(model.prediction_feature_id).accept_values)
+ update_attributes(stat)
end
def confidence_plot
@@ -123,52 +87,15 @@ module OpenTox
class RegressionLeaveOneOutValidation < LeaveOneOutValidation
-
- field :rmse, type: Float, default: 0.0
+ field :rmse, type: Float, default: 0
field :mae, type: Float, default: 0
- #field :weighted_rmse, type: Float, default: 0
- #field :weighted_mae, type: Float, default: 0
field :r_squared, type: Float
field :correlation_plot_id, type: BSON::ObjectId
field :confidence_plot_id, type: BSON::ObjectId
def statistics
- confidence_sum = 0
- predicted_values = []
- measured_values = []
- predictions.each do |pred|
- pred[:database_activities].each do |activity|
- if pred[:value]
- predicted_values << pred[:value]
- measured_values << activity
- error = Math.log10(pred[:value])-Math.log10(activity)
- self.rmse += error**2
- #self.weighted_rmse += pred[:confidence]*error**2
- self.mae += error.abs
- #self.weighted_mae += pred[:confidence]*error.abs
- #confidence_sum += pred[:confidence]
- end
- end
- if pred[:database_activities].empty?
- warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
- $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
- end
- end
- R.assign "measurement", measured_values
- R.assign "prediction", predicted_values
- R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
- r = R.eval("r").to_ruby
-
- self.mae = self.mae/predictions.size
- #self.weighted_mae = self.weighted_mae/confidence_sum
- self.rmse = Math.sqrt(self.rmse/predictions.size)
- #self.weighted_rmse = Math.sqrt(self.weighted_rmse/confidence_sum)
- self.r_squared = r**2
- self.finished_at = Time.now
- save
- $logger.debug "R^2 #{r**2}"
- $logger.debug "RMSE #{rmse}"
- $logger.debug "MAE #{mae}"
+ stat = ValidationStatistics.regression predictions
+ update_attributes(stat)
end
def correlation_plot
diff --git a/lib/model.rb b/lib/model.rb
index 8e657b8..b82f098 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -20,6 +20,10 @@ module OpenTox
def training_dataset
Dataset.find(training_dataset_id)
end
+
+ def prediction_feature
+ Feature.find(prediction_feature_id)
+ end
end
class Lazar < Model
@@ -31,12 +35,10 @@ module OpenTox
# Create a lazar model from a training_dataset and a feature_dataset
# @param [OpenTox::Dataset] training_dataset
# @return [OpenTox::Model::Lazar] Regression or classification model
- def initialize training_dataset, params={}
+ def initialize prediction_feature, training_dataset, params={}
super params
- # TODO document convention
- prediction_feature = training_dataset.features.first
# set defaults for empty parameters
self.prediction_feature_id ||= prediction_feature.id
self.training_dataset_id ||= training_dataset.id
@@ -48,7 +50,6 @@ module OpenTox
end
def predict_compound compound
- prediction_feature = Feature.find prediction_feature_id
neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters)
# remove neighbors without prediction_feature
# check for database activities (neighbors may include query compound)
@@ -56,12 +57,13 @@ module OpenTox
prediction = {}
if neighbors.collect{|n| n["_id"]}.include? compound.id
- database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["features"][prediction_feature.id.to_s].uniq
+ #TODO restrict to dataset features
+ database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["toxicities"][prediction_feature.id.to_s].uniq
prediction[:database_activities] = database_activities
prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound."
neighbors.delete_if{|n| n["_id"] == compound.id}
end
- neighbors.delete_if{|n| n['features'].empty? or n['features'][prediction_feature.id.to_s] == [nil] }
+ neighbors.delete_if{|n| n['toxicities'].empty? or n['toxicities'][prediction_feature.id.to_s] == [nil] }
if neighbors.empty?
prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset.",:neighbors => []})
else
@@ -78,62 +80,55 @@ module OpenTox
# parse data
compounds = []
- case object.class.to_s
- when "OpenTox::Compound"
+ if object.is_a? Substance
compounds = [object]
- when "Array"
+ elsif object.is_a? Array
compounds = object
- when "OpenTox::Dataset"
+ elsif object.is_a? Dataset
compounds = object.compounds
else
bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter."
end
# make predictions
- predictions = []
- predictions = compounds.collect{|c| predict_compound c}
+ predictions = {}
+ compounds.each do |c|
+ predictions[c.id.to_s] = predict_compound c
+ predictions[c.id.to_s][:prediction_feature_id] = prediction_feature_id
+ end
# serialize result
- case object.class.to_s
- when "OpenTox::Compound"
- prediction = predictions.first
+ if object.is_a? Substance
+ prediction = predictions[compounds.first.id.to_s]
prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} # sort according to similarity
return prediction
- when "Array"
+ elsif object.is_a? Array
return predictions
- when "OpenTox::Dataset"
+ elsif object.is_a? Dataset
+ predictions.each{|cid,p| p.delete(:neighbors)}
# prepare prediction dataset
measurement_feature = Feature.find prediction_feature_id
- prediction_feature = OpenTox::NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" )
- prediction_dataset = LazarPrediction.new(
+ prediction_feature = NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" )
+ prediction_dataset = LazarPrediction.create(
:name => "Lazar prediction for #{prediction_feature.name}",
:creator => __FILE__,
- :prediction_feature_id => prediction_feature.id
-
+ :prediction_feature_id => prediction_feature.id,
+ :predictions => predictions
)
- confidence_feature = OpenTox::NumericFeature.find_or_create_by( "name" => "Model RMSE" )
- warning_feature = OpenTox::NominalFeature.find_or_create_by("name" => "Warnings")
- prediction_dataset.features = [ prediction_feature, confidence_feature, measurement_feature, warning_feature ]
- prediction_dataset.compounds = compounds
- prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:rmse] , p[:dataset_activities].to_s, p[:warning]]}
- prediction_dataset.save
+
+ #prediction_dataset.save
return prediction_dataset
end
end
-
- def training_activities
- i = training_dataset.feature_ids.index prediction_feature_id
- training_dataset.data_entries.collect{|de| de[i]}
- end
end
class LazarClassification < Lazar
- def self.create training_dataset, params={}
- model = self.new training_dataset, params
+ def self.create prediction_feature, training_dataset, params={}
+ model = self.new prediction_feature, training_dataset, params
model.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" unless model.prediction_algorithm
model.neighbor_algorithm ||= "fingerprint_neighbors"
model.neighbor_algorithm_parameters ||= {}
@@ -151,8 +146,8 @@ module OpenTox
class LazarRegression < Lazar
- def self.create training_dataset, params={}
- model = self.new training_dataset, params
+ def self.create prediction_feature, training_dataset, params={}
+ model = self.new prediction_feature, training_dataset, params
model.neighbor_algorithm ||= "fingerprint_neighbors"
model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.local_fingerprint_regression"
model.neighbor_algorithm_parameters ||= {}
@@ -173,13 +168,13 @@ module OpenTox
include Mongoid::Document
include Mongoid::Timestamps
- # TODO field Validations
field :endpoint, type: String
field :species, type: String
field :source, type: String
field :unit, type: String
field :model_id, type: BSON::ObjectId
field :repeated_crossvalidation_id, type: BSON::ObjectId
+ field :leave_one_out_validation_id, type: BSON::ObjectId
def predict object
Lazar.find(model_id).predict object
@@ -201,12 +196,16 @@ module OpenTox
repeated_crossvalidation.crossvalidations
end
+ def leave_one_out_validation
+ LeaveOneOutValidation.find leave_one_out_validation_id
+ end
+
def regression?
- training_dataset.features.first.numeric?
+ model.is_a? LazarRegression
end
def classification?
- training_dataset.features.first.nominal?
+ model.is_a? LazarClassification
end
def self.from_csv_file file
@@ -214,19 +213,61 @@ module OpenTox
bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file
prediction_model = self.new JSON.parse(File.read(metadata_file))
training_dataset = Dataset.from_csv_file file
+ prediction_feature = training_dataset.features.first
model = nil
- if training_dataset.features.first.nominal?
- model = LazarClassification.create training_dataset
- elsif training_dataset.features.first.numeric?
- model = LazarRegression.create training_dataset
+ if prediction_feature.nominal?
+ model = LazarClassification.create prediction_feature, training_dataset
+ elsif prediction_feature.numeric?
+ model = LazarRegression.create prediction_feature, training_dataset
end
prediction_model[:model_id] = model.id
+ prediction_model[:prediction_feature_id] = prediction_feature.id
prediction_model[:repeated_crossvalidation_id] = RepeatedCrossValidation.create(model).id
+ prediction_model[:leave_one_out_validation_id] = LeaveOneOutValidation.create(model).id
prediction_model.save
prediction_model
end
end
+ class NanoLazar
+ include OpenTox
+ include Mongoid::Document
+ include Mongoid::Timestamps
+ store_in collection: "models"
+
+ field :name, type: String
+ field :creator, type: String, default: __FILE__
+ # datasets
+ field :training_dataset_id, type: BSON::ObjectId
+ # algorithms
+ field :prediction_algorithm, type: String
+ # prediction feature
+ field :prediction_feature_id, type: BSON::ObjectId
+ field :training_particle_ids, type: Array
+
+ def self.create_all
+ nanoparticles = Nanoparticle.all
+ toxfeatures = Nanoparticle.all.collect{|np| np.toxicities.keys}.flatten.uniq.collect{|id| Feature.find id}
+ tox = {}
+ toxfeatures.each do |t|
+ tox[t] = nanoparticles.select{|np| np.toxicities.keys.include? t.id.to_s}
+ end
+ tox.select!{|t,nps| nps.size > 50}
+ tox.collect do |t,nps|
+ find_or_create_by(:prediction_feature_id => t.id, :training_particle_ids => nps.collect{|np| np.id})
+ end
+ end
+
+ def predict nanoparticle
+ training = training_particle_ids.collect{|id| Nanoparticle.find id}
+ training_features = training.collect{|t| t.physchem_descriptors.keys}.flatten.uniq
+ query_features = nanoparticle.physchem_descriptors.keys
+ common_features = (training_features & query_features)
+ #p common_features
+ end
+
+ end
+
end
end
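
Model construction now takes the prediction feature as an explicit first argument instead of silently assuming training_dataset.features.first. A sketch of the new signature, assuming a nominal endpoint (the CSV file name is hypothetical):

    training_dataset = OpenTox::Dataset.from_csv_file "hamster_carcinogenicity.csv" # hypothetical file
    prediction_feature = training_dataset.features.first
    model = OpenTox::Model::LazarClassification.create prediction_feature, training_dataset
    prediction = model.predict OpenTox::Compound.from_smiles("c1ccccc1N")
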
diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb
new file mode 100644
index 0000000..b934bb3
--- /dev/null
+++ b/lib/nanoparticle.rb
@@ -0,0 +1,69 @@
+module OpenTox
+
+ class Nanoparticle < Substance
+ include OpenTox
+
+ field :core, type: String
+ field :coating, type: Array, default: []
+ field :bundles, type: Array, default: []
+
+ def nanoparticle_neighbors params
+ Dataset.find(params[:training_dataset_id]).nanoparticles
+ end
+
+ def add_feature feature, value
+ case feature.category
+ when "P-CHEM"
+ physchem_descriptors[feature.id.to_s] ||= []
+ physchem_descriptors[feature.id.to_s] << value
+ when "TOX"
+ toxicities[feature.id.to_s] ||= []
+ toxicities[feature.id.to_s] << value
+ else
+ warn "Unknown feature type '#{feature.category}'. Value '#{value}' not inserted."
+ end
+ save
+ end
+
+ def parse_ambit_value feature, v
+ v.delete "unit"
+ # TODO: mmol/log10 conversion
+ if v.keys == ["textValue"]
+ add_feature feature, v["textValue"]
+ elsif v.keys == ["loValue"]
+ add_feature feature, v["loValue"]
+ elsif v.keys.size == 2 and v["errorValue"]
+ add_feature feature, v["loValue"]
+ warn "Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'."
+ elsif v.keys.size == 2 and v["loQualifier"] == "mean"
+ add_feature feature, v["loValue"]
+ warn "'#{feature.name}' is a mean value. Original data is not available."
+ elsif v.keys.size == 2 and v["loQualifier"] #== ">="
+ warn "Only min value available for '#{feature.name}', entry ignored"
+ elsif v.keys.size == 2 and v["upQualifier"] #== ">="
+ warn "Only max value available for '#{feature.name}', entry ignored"
+ elsif v.keys.size == 3 and v["loValue"] and v["loQualifier"].nil? and v["upQualifier"].nil?
+ add_feature feature, v["loValue"]
+ warn "loQualifier and upQualifier are empty."
+ elsif v.keys.size == 3 and v["loValue"] and v["loQualifier"] == "" and v["upQualifier"] == ""
+ add_feature feature, v["loValue"]
+ warn "loQualifier and upQualifier are empty."
+ elsif v.keys.size == 4 and v["loValue"] and v["loQualifier"].nil? and v["upQualifier"].nil?
+ add_feature feature, v["loValue"]
+ warn "loQualifier and upQualifier are empty."
+ elsif v.size == 4 and v["loQualifier"] and v["upQualifier"] and v["loValue"] and v["upValue"]
+ add_feature feature, [v["loValue"],v["upValue"]].mean
+ warn "Using mean value of range #{v["loValue"]} - #{v["upValue"]} for '#{feature.name}'. Original data is not available."
+ elsif v.size == 4 and v["loQualifier"] == "mean" and v["errorValue"]
+ warn "'#{feature.name}' is a mean value. Original data is not available. Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'."
+ add_feature feature, v["loValue"]
+ elsif v == {} # do nothing
+ else
+ warn "Cannot parse Ambit eNanoMapper value '#{v}' for feature '#{feature.name}'."
+ end
+ end
+
+ end
+end
+
+
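
parse_ambit_value normalizes the many shapes an Ambit/eNanoMapper result hash can take down to a single value, and add_feature routes that value by feature.category into physchem_descriptors or toxicities. A sketch with a hand-made value hash (particle and feature names chosen for illustration):

    np = OpenTox::Nanoparticle.find_or_create_by(:name => "Example NP")
    zeta = OpenTox::NumericFeature.find_or_create_by(:name => "Zeta potential", :category => "P-CHEM")
    np.parse_ambit_value zeta, {"loValue" => -15.4, "unit" => "mV"}
    np.physchem_descriptors[zeta.id.to_s] # => [-15.4]; the "unit" key is stripped first
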
diff --git a/lib/opentox.rb b/lib/opentox.rb
index 186c87a..7d8a8a2 100644
--- a/lib/opentox.rb
+++ b/lib/opentox.rb
@@ -13,7 +13,13 @@ module OpenTox
include Mongoid::Timestamps
store_in collection: klass.downcase.pluralize
field :name, type: String
+ field :source, type: String
field :warnings, type: Array, default: []
+
+ def warn warning
+ $logger.warn warning
+ warnings << warning
+ end
end
OpenTox.const_set klass,c
end
diff --git a/lib/regression.rb b/lib/regression.rb
index 5021fb3..cb17f25 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -9,8 +9,8 @@ module OpenTox
neighbors = params[:neighbors]
neighbors.each do |row|
sim = row["tanimoto"]
- if row["features"][params[:prediction_feature_id].to_s]
- row["features"][params[:prediction_feature_id].to_s].each do |act|
+ if row["toxicities"][params[:prediction_feature_id].to_s]
+ row["toxicities"][params[:prediction_feature_id].to_s].each do |act|
weighted_sum += sim*Math.log10(act)
sim_sum += sim
end
@@ -32,8 +32,8 @@ module OpenTox
neighbors.each_with_index do |row,i|
neighbor = Compound.find row["_id"]
fingerprint = neighbor.fingerprint
- if row["features"][params[:prediction_feature_id].to_s]
- row["features"][params[:prediction_feature_id].to_s].each do |act|
+ if row["toxicities"][params[:prediction_feature_id].to_s]
+ row["toxicities"][params[:prediction_feature_id].to_s].each do |act|
activities << Math.log10(act)
weights << row["tanimoto"]
fingerprint_ids.each_with_index do |id,j|
@@ -79,21 +79,24 @@ module OpenTox
neighbors = params[:neighbors]
return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
- return {:value => neighbors.first["features"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1
+ return {:value => neighbors.first["toxicities"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1
activities = []
weights = []
physchem = {}
- neighbors.each_with_index do |row,i|
- neighbor = Compound.find row["_id"]
- if row["features"][params[:prediction_feature_id].to_s]
- row["features"][params[:prediction_feature_id].to_s].each do |act|
- activities << Math.log10(act)
- weights << row["tanimoto"] # TODO cosine ?
- neighbor.physchem.each do |pid,v| # insert physchem only if there is an activity
+ neighbors.each_with_index do |n,i|
+ if n["toxicities"][params[:prediction_feature_id].to_s]
+ n["toxicities"][params[:prediction_feature_id].to_s].each do |act|
+ # TODO fix!!!!
+ activities << -Math.log10(act)
+ #if act.numeric?
+ #activities << act
+ n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ?
+ neighbor = Substance.find(n["_id"])
+ neighbor.physchem_descriptors.each do |pid,v| # insert physchem only if there is an activity
physchem[pid] ||= []
- physchem[pid] << v
+ physchem[pid] += v
end
end
end
@@ -110,8 +113,8 @@ module OpenTox
return result
else
- data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid] }
- prediction = r_model_prediction method, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem[pid]}
+ data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid].collect{|v| "\"#{v.sub('[','').sub(']','')}\"" if v.is_a? String }}
+ prediction = r_model_prediction method, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem_descriptors[pid]}
if prediction.nil?
prediction = local_weighted_average(compound, params)
prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
@@ -127,6 +130,8 @@ module OpenTox
def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values
R.assign "weights", training_weights
r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})"
+ #p r_data_frame
+ File.open("tmp.R","w+"){|f| f.puts "data <- #{r_data_frame}\n"}
R.eval "data <- #{r_data_frame}"
R.assign "features", training_features
R.eval "names(data) <- append(c('activities'),features)" #
diff --git a/lib/substance.rb b/lib/substance.rb
new file mode 100644
index 0000000..82ca65d
--- /dev/null
+++ b/lib/substance.rb
@@ -0,0 +1,10 @@
+module OpenTox
+
+ class Substance
+ field :physchem_descriptors, type: Hash, default: {}
+ field :toxicities, type: Hash, default: {}
+ field :dataset_ids, type: Array, default: []
+ end
+
+end
+
diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
new file mode 100644
index 0000000..c6b2a07
--- /dev/null
+++ b/lib/validation-statistics.rb
@@ -0,0 +1,101 @@
+module OpenTox
+ class ValidationStatistics
+ include OpenTox
+ def self.classification predictions, accept_values
+ confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
+ weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
+ true_rate = {}
+ predictivity = {}
+ nr_instances = 0
+ predictions.each do |cid,pred|
+ # TODO use measured majority class
+ if pred[:measured].uniq.size == 1
+ m = pred[:measured].first
+ #pred[:measured].each do |m|
+ if pred[:value] == m
+ if pred[:value] == accept_values[0]
+ confusion_matrix[0][0] += 1
+ weighted_confusion_matrix[0][0] += pred[:probabilities][pred[:value]]
+ nr_instances += 1
+ elsif pred[:value] == accept_values[1]
+ confusion_matrix[1][1] += 1
+ weighted_confusion_matrix[1][1] += pred[:probabilities][pred[:value]]
+ nr_instances += 1
+ end
+ elsif pred[:value] != m
+ if pred[:value] == accept_values[0]
+ confusion_matrix[0][1] += 1
+ weighted_confusion_matrix[0][1] += pred[:probabilities][pred[:value]]
+ nr_instances += 1
+ elsif pred[:value] == accept_values[1]
+ confusion_matrix[1][0] += 1
+ weighted_confusion_matrix[1][0] += pred[:probabilities][pred[:value]]
+ nr_instances += 1
+ end
+ end
+ end
+ end
+ true_rate = {}
+ predictivity = {}
+ accept_values.each_with_index do |v,i|
+ true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
+ predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
+ end
+ confidence_sum = 0
+ weighted_confusion_matrix.each do |r|
+ r.each do |c|
+ confidence_sum += c
+ end
+ end
+ accuracy = (confusion_matrix[0][0]+confusion_matrix[1][1])/nr_instances.to_f
+ weighted_accuracy = (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f
+ $logger.debug "Accuracy #{accuracy}"
+ {
+ :accept_values => accept_values,
+ :confusion_matrix => confusion_matrix,
+ :weighted_confusion_matrix => weighted_confusion_matrix,
+ :accuracy => accuracy,
+ :weighted_accuracy => weighted_accuracy,
+ :true_rate => true_rate,
+ :predictivity => predictivity,
+ :finished_at => Time.now
+ }
+ end
+
+ def self.regression predictions
+ # TODO: prediction intervals
+ rmse = 0
+ mae = 0
+ x = []
+ y = []
+ predictions.each do |cid,pred|
+ if pred[:value] and pred[:measured] #and pred[:measured] != [nil]
+ x << -Math.log10(pred[:measured].median)
+ y << -Math.log10(pred[:value])
+ error = Math.log10(pred[:value])-Math.log10(pred[:measured].median)
+ rmse += error**2
+ mae += error.abs
+ else
+ warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
+ $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
+ end
+ end
+ R.assign "measurement", x
+ R.assign "prediction", y
+ R.eval "r <- cor(measurement,prediction,use='complete')"
+ r = R.eval("r").to_ruby
+
+ mae = mae/predictions.size
+ rmse = Math.sqrt(rmse/predictions.size)
+ $logger.debug "R^2 #{r**2}"
+ $logger.debug "RMSE #{rmse}"
+ $logger.debug "MAE #{mae}"
+ {
+ :mae => mae,
+ :rmse => rmse,
+ :r_squared => r**2,
+ :finished_at => Time.now
+ }
+ end
+ end
+end
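
ValidationStatistics.classification consumes the prediction format introduced in this commit ({substance_id => {:value, :measured, :probabilities}}) and returns the attribute Hash that the validation classes persist via update_attributes. A sketch on hand-made predictions:

    predictions = {
      "id1" => {:value => "active",   :measured => ["active"],   :probabilities => {"active" => 0.8}},
      "id2" => {:value => "inactive", :measured => ["active"],   :probabilities => {"inactive" => 0.6}},
      "id3" => {:value => "inactive", :measured => ["inactive"], :probabilities => {"inactive" => 0.9}},
    }
    stat = OpenTox::ValidationStatistics.classification predictions, ["active", "inactive"]
    stat[:accuracy]         # => 0.667 (2 of 3 correct)
    stat[:confusion_matrix] # => [[1, 0], [1, 1]]
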
diff --git a/lib/validation.rb b/lib/validation.rb
index b72d273..6b515e4 100644
--- a/lib/validation.rb
+++ b/lib/validation.rb
@@ -8,7 +8,7 @@ module OpenTox
field :test_dataset_id, type: BSON::ObjectId
field :nr_instances, type: Integer
field :nr_unpredicted, type: Integer
- field :predictions, type: Array
+ field :predictions, type: Hash
def prediction_dataset
Dataset.find prediction_dataset_id
@@ -27,32 +27,23 @@ module OpenTox
atts = model.attributes.dup # do not modify attributes from original model
atts["_id"] = BSON::ObjectId.new
atts[:training_dataset_id] = training_set.id
- validation_model = model.class.create training_set, atts
+ validation_model = model.class.create model.prediction_feature, training_set, atts
validation_model.save
- cids = test_set.compound_ids
-
- test_set_without_activities = Dataset.new(:compound_ids => cids.uniq) # remove duplicates and make sure that activities cannot be used
- prediction_dataset = validation_model.predict test_set_without_activities
- predictions = []
+ predictions = validation_model.predict test_set.compounds
+ predictions.each{|cid,p| p.delete(:neighbors)}
nr_unpredicted = 0
- activities = test_set.data_entries.collect{|de| de.first}
- prediction_dataset.data_entries.each_with_index do |de,i|
- if de[0] #and de[1]
- cid = prediction_dataset.compound_ids[i]
- rows = cids.each_index.select{|r| cids[r] == cid }
- activities = rows.collect{|r| test_set.data_entries[r][0]}
- prediction = de.first
- confidence = de[1]
- predictions << [prediction_dataset.compound_ids[i], activities, prediction, de[1]]
+ predictions.each do |cid,prediction|
+ if prediction[:value]
+ prediction[:measured] = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s]
else
nr_unpredicted += 1
end
+ predictions.delete(cid) unless prediction[:value] and prediction[:measured]
end
validation = self.new(
:model_id => validation_model.id,
- :prediction_dataset_id => prediction_dataset.id,
:test_dataset_id => test_set.id,
- :nr_instances => test_set.compound_ids.size,
+ :nr_instances => test_set.compounds.size,
:nr_unpredicted => nr_unpredicted,
:predictions => predictions#.sort{|a,b| p a; b[3] <=> a[3]} # sort according to confidence
)
@@ -67,42 +58,6 @@ module OpenTox
end
class RegressionValidation < Validation
-
- def statistics
- rmse = 0
- weighted_rmse = 0
- rse = 0
- weighted_rse = 0
- mae = 0
- weighted_mae = 0
- confidence_sum = 0
- predictions.each do |pred|
- compound_id,activity,prediction,confidence = pred
- if activity and prediction
- error = Math.log10(prediction)-Math.log10(activity.median)
- rmse += error**2
- weighted_rmse += confidence*error**2
- mae += error.abs
- weighted_mae += confidence*error.abs
- confidence_sum += confidence
- else
- warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
- $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
- end
- end
- x = predictions.collect{|p| p[1].median}
- y = predictions.collect{|p| p[2]}
- R.assign "measurement", x
- R.assign "prediction", y
- R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
- r = R.eval("r").to_ruby
-
- mae = mae/predictions.size
- weighted_mae = weighted_mae/confidence_sum
- rmse = Math.sqrt(rmse/predictions.size)
- weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
- { "R^2" => r**2, "RMSE" => rmse, "MAE" => mae }
- end
end
end
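The field type change above (Array to Hash) means predictions are now keyed by compound id, with the measurements attached to each entry. A minimal illustration of the shape the statistics code consumes; the id strings are made up for the example:

    predictions = {
      "57f9..." => {
        :value => 0.0113,                # predicted activity
        :measured => [0.011, 0.012],     # experimental values for the compound
        :prediction_feature_id => "57fa..."
      }
    }
    # entries lacking either a value or a measurement are dropped, as above:
    predictions.select! { |cid, p| p[:value] and p[:measured] }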
diff --git a/test/classification.rb b/test/classification.rb
index bedbe14..7412714 100644
--- a/test/classification.rb
+++ b/test/classification.rb
@@ -30,12 +30,14 @@ class LazarClassificationTest < MiniTest::Test
# make a dataset prediction
compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini.csv")
- prediction = model.predict compound_dataset
- assert_equal compound_dataset.compounds, prediction.compounds
+ prediction_dataset = model.predict compound_dataset
+ assert_equal compound_dataset.compounds, prediction_dataset.compounds
- assert_equal "Could not find similar compounds with experimental data in the training dataset.", prediction.data_entries[7][3]
- assert_equal "1 compounds have been removed from neighbors, because they have the same structure as the query compound.", prediction.data_entries[14][3]
+ cid = prediction_dataset.compounds[7].id.to_s
+ assert_equal "Could not find similar compounds with experimental data in the training dataset.", prediction_dataset.predictions[cid][:warning]
+ cid = prediction_dataset.compounds[9].id.to_s
+ assert_equal "1 compounds have been removed from neighbors, because they have the same structure as the query compound.", prediction_dataset.predictions[cid][:warning]
# cleanup
- [training_dataset,model,compound_dataset].each{|o| o.delete}
+ [training_dataset,model,compound_dataset,prediction_dataset].each{|o| o.delete}
end
end
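For dataset predictions the per-compound results, including warnings, are now looked up in the predictions Hash by compound id, as the assertions above do. A usage sketch, assuming model and compound_dataset as in the test:

    prediction_dataset = model.predict compound_dataset
    prediction_dataset.compounds.each do |compound|
      result = prediction_dataset.predictions[compound.id.to_s]
      puts "#{compound.smiles}: #{result[:value].inspect} #{result[:warning]}"
    end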
diff --git a/test/dataset.rb b/test/dataset.rb
index 297251e..a7b8769 100644
--- a/test/dataset.rb
+++ b/test/dataset.rb
@@ -36,38 +36,34 @@ class DatasetTest < MiniTest::Test
assert_equal Dataset, d.class
d.name = "Create dataset test"
- # features not set
- # << operator was removed for efficiency reasons (CH)
- #assert_raises BadRequestError do
- # d << [Compound.from_smiles("c1ccccc1NN"), 1,2]
- #end
-
# add data entries
- d.features = ["test1", "test2"].collect do |title|
+ features = ["test1", "test2"].collect do |title|
f = Feature.new
f.name = title
f.numeric = true
f.save
f
end
-
- # wrong feature size
- # << operator was removed for efficiency reasons (CH)
- #assert_raises BadRequestError do
- # d << [Compound.from_smiles("c1ccccc1NN"), 1,2,3]
- #end
# manual low-level insertions without consistency checks for runtime efficiency
+ compounds = ["c1ccccc1NN", "CC(C)N", "C1C(C)CCCC1"].collect do |smi|
+ Compound.from_smiles smi
+ end
data_entries = []
- d.compound_ids << Compound.from_smiles("c1ccccc1NN").id
data_entries << [1,2]
- d.compound_ids << Compound.from_smiles("CC(C)N").id
data_entries << [4,5]
- d.compound_ids << Compound.from_smiles("C1C(C)CCCC1").id
data_entries << [6,7]
- d.data_entries = data_entries
+ compounds.each_with_index do |c,i|
+ features.each_with_index do |f,j|
+ d.data_entries[c.id.to_s] ||= {}
+ d.data_entries[c.id.to_s][f.id.to_s] ||= []
+ d.data_entries[c.id.to_s][f.id.to_s] << data_entries[i][j]
+ end
+ end
+
assert_equal 3, d.compounds.size
assert_equal 2, d.features.size
- assert_equal [[1,2],[4,5],[6,7]], d.data_entries
+ assert_equal [1], d.data_entries[compounds.first.id.to_s][features.first.id.to_s]
+ assert_equal [7], d.data_entries[compounds.last.id.to_s][features.last.id.to_s]
d.save
# check if dataset has been saved correctly
@@ -89,8 +85,14 @@ class DatasetTest < MiniTest::Test
assert_equal "multicolumn", new_dataset.name
# get features
assert_equal 6, new_dataset.features.size
- assert_equal 7, new_dataset.compounds.size
- assert_equal ["1", nil, "false", nil, nil, 1.0], new_dataset.data_entries.last
+ assert_equal 5, new_dataset.compounds.size
+ de = new_dataset.data_entries[new_dataset.compounds.last.id.to_s]
+ fid = new_dataset.features.first.id.to_s
+ assert_equal ["1"], de[fid]
+ fid = new_dataset.features.last.id.to_s
+ assert_equal [1.0], de[fid]
+ fid = new_dataset.features[2].id.to_s
+ assert_equal ["false"], de[fid]
d.delete
end
@@ -117,7 +119,7 @@ class DatasetTest < MiniTest::Test
assert d.warnings.grep(/Duplicate compound/)
assert d.warnings.grep(/3, 5/)
assert_equal 6, d.features.size
- assert_equal 7, d.compounds.size
+ assert_equal 5, d.compounds.size
assert_equal 5, d.compounds.collect{|c| c.inchi}.uniq.size
assert_equal [["1", "1", "true", "true", "test", 1.1], ["1", "2", "false", "7.5", "test", 0.24], ["1", "3", "true", "5", "test", 3578.239], ["0", "4", "false", "false", "test", -2.35], ["1", "2", "true", "4", "test_2", 1], ["1", "2", "false", "false", "test", -1.5], ["1", nil, "false", nil, nil, 1.0]], d.data_entries
assert_equal "c1ccc[nH]1,1,,false,,,1.0", d.to_csv.split("\n")[7]
@@ -195,7 +197,7 @@ class DatasetTest < MiniTest::Test
assert_match "EPAFHM.mini.csv", d.source
assert_equal 1, d.features.size
feature = d.features.first
- assert_kind_of NumericBioAssay, feature
+ assert_kind_of NumericFeature, feature
assert_equal 0.0113, d.data_entries[0][0]
assert_equal 0.00323, d.data_entries[5][0]
d2 = Dataset.find d.id
@@ -207,10 +209,10 @@ class DatasetTest < MiniTest::Test
dataset = Dataset.from_csv_file File.join(DATA_DIR,"loael.csv")
dataset.folds(10).each do |fold|
fold.each do |d|
- assert_equal d.data_entries.size, d.compound_ids.size
- assert_operator d.compound_ids.size, :>=, d.compound_ids.uniq.size
+ assert_equal d.data_entries.size, d.compounds.size
+ assert_operator d.compounds.size, :>=, d.compounds.uniq.size
end
- assert_operator fold[0].compound_ids.uniq.size, :>=, fold[1].compound_ids.uniq.size
+ assert_operator fold[0].compounds.size, :>=, fold[1].compounds.size
end
#puts dataset.folds 10
end
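A minimal sketch of the nested data_entries layout the tests above rely on: compound id maps to feature id maps to an Array of values, so repeated measurements accumulate instead of overwriting each other. The string keys are illustrative; real code uses BSON object ids converted with to_s:

    data_entries = {}
    cid, fid = "compound-1", "feature-1"
    data_entries[cid] ||= {}
    data_entries[cid][fid] ||= []
    data_entries[cid][fid] << 1.1
    data_entries[cid][fid] << 0.9   # a duplicate compound/feature pair appends
    p data_entries[cid][fid]        # => [1.1, 0.9]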
diff --git a/test/nanoparticles.rb b/test/nanoparticles.rb
new file mode 100644
index 0000000..46073a9
--- /dev/null
+++ b/test/nanoparticles.rb
@@ -0,0 +1,34 @@
+require_relative "setup.rb"
+
+class NanoparticleTest < MiniTest::Test
+
+ def test_import
+ dataset_ids = Import::Enanomapper.import
+ assert_operator Nanoparticle.count, :>, 570, "Only #{Nanoparticle.count} nanoparticles imported"
+ assert_operator dataset_ids.size, :>, 8, "Only #{dataset_ids.size} bundles imported"
+ assert dataset_ids.collect{|d| Dataset.find(d).name}.include?("NanoWiki")
+ assert dataset_ids.collect{|d| Dataset.find(d).name}.include?("Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles")
+ p dataset_ids.collect{|d| {d => Dataset.find(d).name}}
+ dataset_ids.each do |d|
+ d = Dataset.find(d)
+ p d.name
+ puts d.to_csv
+ end
+ end
+
+ def test_export
+ Dataset.all.each do |d|
+ puts d.to_csv
+ end
+ end
+
+ def test_create_model
+ training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles")
+ model = Model::LazarRegression.create(training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", :neighbor_algorithm => "nanoparticle_neighbors")
+ nanoparticle = training_dataset.nanoparticles[-34]
+ prediction = model.predict nanoparticle
+ p prediction
+ refute_nil prediction[:value]
+ end
+
+end
diff --git a/test/prediction_models.rb b/test/prediction_models.rb
index a2e5fe2..49a2472 100644
--- a/test/prediction_models.rb
+++ b/test/prediction_models.rb
@@ -10,7 +10,6 @@ class PredictionModelTest < MiniTest::Test
assert pm.classification?
refute pm.regression?
pm.crossvalidations.each do |cv|
- p cv
 assert cv.accuracy > 0.74, "Crossvalidation accuracy (#{cv.accuracy}) should be larger than 0.74. This may happen due to an unfavorable training/test set split."
end
prediction = pm.predict Compound.from_smiles("CCCC(NN)C")
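A single-substance predict call returns the per-compound prediction structure used throughout this changeset. A sketch of inspecting it, assuming the keys shown elsewhere in the diff (:value, :warning):

    prediction = pm.predict Compound.from_smiles("CCCC(NN)C")
    puts prediction[:value]
    puts prediction[:warning] if prediction[:warning]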
diff --git a/test/validation.rb b/test/validation.rb
index d8eea59..baee2d1 100644
--- a/test/validation.rb
+++ b/test/validation.rb
@@ -6,17 +6,17 @@ class ValidationTest < MiniTest::Test
def test_default_classification_crossvalidation
dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
- model = Model::LazarClassification.create dataset
+ model = Model::LazarClassification.create dataset.features.first, dataset
cv = ClassificationCrossValidation.create model
- assert cv.accuracy > 0.7, "Accuracy (#{cv.accuracy}) should be larger than 0.7"
+ assert cv.accuracy > 0.7, "Accuracy (#{cv.accuracy}) should be larger than 0.7; this may occur due to an unfavorable training/test set split"
end
def test_default_regression_crossvalidation
dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
- model = Model::LazarRegression.create dataset
+ model = Model::LazarRegression.create dataset.features.first, dataset
cv = RegressionCrossValidation.create model
- assert cv.rmse < 1.5, "RMSE > 1.5"
- assert cv.mae < 1
+ assert cv.rmse < 1.5, "RMSE #{cv.rmse} should be smaller than 1.5; this may occur due to an unfavorable training/test set split"
+ assert cv.mae < 1, "MAE #{cv.mae} should be smaller than 1; this may occur due to an unfavorable training/test set split"
end
# parameters
@@ -30,7 +30,7 @@ class ValidationTest < MiniTest::Test
:type => "FP3"
}
}
- model = Model::LazarClassification.create dataset, params
+ model = Model::LazarClassification.create dataset.features.first, dataset, params
model.save
cv = ClassificationCrossValidation.create model
params = model.neighbor_algorithm_parameters
@@ -54,7 +54,7 @@ class ValidationTest < MiniTest::Test
:min_sim => 0.7,
}
}
- model = Model::LazarRegression.create dataset, params
+ model = Model::LazarRegression.create dataset.features.first, dataset, params
cv = RegressionCrossValidation.create model
cv.validation_ids.each do |vid|
model = Model::Lazar.find(Validation.find(vid).model_id)
@@ -70,7 +70,7 @@ class ValidationTest < MiniTest::Test
def test_physchem_regression_crossvalidation
training_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi.csv")
- model = Model::LazarRegression.create(training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression")
+ model = Model::LazarRegression.create(training_dataset.features.first, training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression")
cv = RegressionCrossValidation.create model
refute_nil cv.rmse
refute_nil cv.mae
@@ -80,7 +80,7 @@ class ValidationTest < MiniTest::Test
def test_classification_loo_validation
dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
- model = Model::LazarClassification.create dataset
+ model = Model::LazarClassification.create dataset.features.first, dataset
loo = ClassificationLeaveOneOutValidation.create model
assert_equal 14, loo.nr_unpredicted
refute_empty loo.confusion_matrix
@@ -89,7 +89,7 @@ class ValidationTest < MiniTest::Test
def test_regression_loo_validation
dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi.csv")
- model = Model::LazarRegression.create dataset
+ model = Model::LazarRegression.create dataset.features.first, dataset
loo = RegressionLeaveOneOutValidation.create model
assert loo.r_squared > 0.34
end
@@ -98,7 +98,7 @@ class ValidationTest < MiniTest::Test
def test_repeated_crossvalidation
dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
- model = Model::LazarClassification.create dataset
+ model = Model::LazarClassification.create dataset.features.first, dataset
repeated_cv = RepeatedCrossValidation.create model
repeated_cv.crossvalidations.each do |cv|
assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split"