Diffstat (limited to 'lib')
-rw-r--r--  lib/array.rb              30
-rw-r--r--  lib/dataset.rb             9
-rw-r--r--  lib/feature-selection.rb  56
-rw-r--r--  lib/feature_selection.rb  45
-rw-r--r--  lib/lazar.rb               2
-rw-r--r--  lib/model.rb             236
-rw-r--r--  lib/statistics.rb         12
7 files changed, 238 insertions(+), 152 deletions(-)
diff --git a/lib/array.rb b/lib/array.rb
index 55186bd..32a8e5d 100644
--- a/lib/array.rb
+++ b/lib/array.rb
@@ -1,3 +1,4 @@
+# TODO increase speed with vectors, matrices?
class Array
# Sum the size of single arrays in an array of arrays
@@ -74,14 +75,29 @@ class Array
end
end
- # Collect array with index
- # in analogy to each_with_index
- def collect_with_index
- result = []
- self.each_with_index do |elt, idx|
- result << yield(elt, idx)
+ def remove indices
+ out = self
+ indices.sort.reverse_each do |i|
+ out.delete_at i
end
- result
+ out
+ end
+
+ # Pearson correlation coefficient
+ # @param [Array] y
+ # @return [Numeric]
+ def r(y)
+ raise "Argument is not an Array!" unless y.class == Array
+ raise "Self array is empty!" if self.size == 0
+ raise "Array sizes do not match!" unless self.size == y.size
+
+ mean_x = self.inject(0) { |s, a| s += a } / self.size.to_f
+ mean_y = y.inject(0) { |s, a| s += a } / y.size.to_f
+ cov = self.zip(y).inject(0) { |s, a| s += (a[0] - mean_x) * (a[1] - mean_y) }
+ var_x = self.inject(0) { |s, a| s += (a - mean_x) ** 2 }
+ var_y = y.inject(0) { |s, a| s += (a - mean_y) ** 2 }
+ r = cov / Math.sqrt(var_x)
+ r /= Math.sqrt(var_y)
end
end
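A minimal usage sketch of the new Array helpers above, assuming the patched lib/array.rb is loaded; the values are made up for illustration:

    x = [1.0, 2.0, 3.0, 4.0]
    y = [2.0, 4.1, 5.9, 8.2]
    puts x.r(y)          # Pearson's r, close to 1.0 for this near-linear data
    a = [10, 20, 30, 40]
    p a.remove([1, 3])   # => [10, 30]; note that remove deletes in place, so a itself is modified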
diff --git a/lib/dataset.rb b/lib/dataset.rb
index e30f000..f40f054 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -12,7 +12,7 @@ class Dataset
@independent_variable_type = File.read(File.join(@dir,"independent_variable_type")).chomp
@lines = File.readlines(file)
@header = @lines.shift.split(",")
- @header.first.match(/ID/i) ? @has_id = true : @has_id = false
+ @header.first.match(/ID|SMILES/i) ? @has_id = true : @has_id = false
@dependent_variable_name = @header.pop
@ids = []
@dependent_variables = []
@@ -43,6 +43,13 @@ class Dataset
end
@independent_variables = Matrix[ *@independent_variables ]
columns = @independent_variables.column_vectors
+ stdev = columns.collect{|c| c.to_a.standard_deviation}
+ stdev.each_index.reverse_each do |i|
+ if stdev[i] == 0
+ @independent_variable_names.delete_at(i)
+ columns.delete_at(i)
+ end
+ end
@independent_variable_means = columns.collect{|c| c.to_a.mean}
@independent_variable_standard_deviations = columns.collect{|c| c.to_a.standard_deviation}
scaled_columns = []
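A standalone sketch of the zero-variance filter added above, with toy data and uniq.size standing in for the library's standard_deviation helper:

    require 'matrix'

    data    = Matrix[[1, 0, 2], [1, 0, 4], [1, 0, 6]]   # rows = compounds, columns = descriptors
    names   = ["constant", "all-zero", "informative"]
    columns = data.column_vectors
    keep    = columns.each_index.reject { |i| columns[i].to_a.uniq.size == 1 }
    puts keep.collect { |i| names[i] }.inspect          # => ["informative"]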
diff --git a/lib/feature-selection.rb b/lib/feature-selection.rb
new file mode 100644
index 0000000..8601737
--- /dev/null
+++ b/lib/feature-selection.rb
@@ -0,0 +1,56 @@
+module OpenTox
+ module Algorithm
+
+ # Feature selection algorithms
+ module FeatureSelection
+
+ class Supervised
+
+ # Select features correlated to the model's prediction feature
+ # @param [OpenTox::Model::Lazar] model
+ def self.correlation_filter model
+ relevant_features = {}
+ R.assign "dependent", model.dependent_variables.collect{|v| to_r(v)}
+ model.descriptor_weights = []
+ selected_variables = []
+ selected_descriptor_ids = []
+ model.independent_variables.each_with_index do |v,i|
+ v.collect!{|n| to_r(n)}
+ R.assign "independent", v
+ begin
+ R.eval "cor <- cor.test(dependent,independent,method = 'pearson',use='pairwise')"
+ pvalue = R.eval("cor$p.value").to_ruby
+ if pvalue <= 0.05
+ model.descriptor_weights << R.eval("cor$estimate").to_ruby**2
+ selected_variables << v
+ selected_descriptor_ids << model.descriptor_ids[i]
+ end
+ rescue
+ warn "Correlation of '#{model.prediction_feature.name}' (#{model.dependent_variables}) with (#{v}) failed."
+ end
+ end
+
+ model.independent_variables = selected_variables
+ model.descriptor_ids = selected_descriptor_ids
+ model
+ end
+
+ def self.to_r v
+ return 0 if v == false
+ return 1 if v == true
+ v
+ end
+
+ end
+
+ class Unsupervised
+ # Select non-redundant features (not yet implemented)
+ # @param [Array] independent_variables
+ def self.nonredundant independent_variables
+ end
+ end
+
+ end
+
+ end
+end
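A simplified, standalone illustration of the supervised correlation filter above: keep descriptors whose Pearson correlation with the dependent variable exceeds a fixed |r| cutoff. The actual implementation calls R's cor.test via Rserve and filters on the p-value; the data and the 0.7 cutoff here are made up:

    def pearson(x, y)
      mx = x.sum.to_f / x.size
      my = y.sum.to_f / y.size
      cov = x.zip(y).sum { |a, b| (a - mx) * (b - my) }
      cov / Math.sqrt(x.sum { |a| (a - mx)**2 } * y.sum { |b| (b - my)**2 })
    end

    dependent   = [0, 0, 1, 1, 1]
    descriptors = { "logP" => [0.1, 0.3, 0.8, 0.9, 0.7], "noise" => [5, 1, 4, 2, 3] }
    selected    = descriptors.select { |_, v| pearson(v, dependent).abs > 0.7 }
    puts selected.keys.inspect   # => ["logP"]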
diff --git a/lib/feature_selection.rb b/lib/feature_selection.rb
deleted file mode 100644
index c596b1f..0000000
--- a/lib/feature_selection.rb
+++ /dev/null
@@ -1,45 +0,0 @@
-module OpenTox
- module Algorithm
-
- # Feature selection algorithms
- class FeatureSelection
-
- # Select features correlated to the models prediction feature
- # @param [OpenTox::Model::Lazar]
- def self.correlation_filter model
- relevant_features = {}
- R.assign "dependent", model.dependent_variables.collect{|v| to_r(v)}
- model.descriptor_weights = []
- selected_variables = []
- selected_descriptor_ids = []
- model.independent_variables.each_with_index do |v,i|
- v.collect!{|n| to_r(n)}
- R.assign "independent", v
- begin
- R.eval "cor <- cor.test(dependent,independent,method = 'pearson',use='pairwise')"
- pvalue = R.eval("cor$p.value").to_ruby
- if pvalue <= 0.05
- model.descriptor_weights << R.eval("cor$estimate").to_ruby**2
- selected_variables << v
- selected_descriptor_ids << model.descriptor_ids[i]
- end
- rescue
- warn "Correlation of '#{model.prediction_feature.name}' (#{model.dependent_variables}) with (#{v}) failed."
- end
- end
-
- model.independent_variables = selected_variables
- model.descriptor_ids = selected_descriptor_ids
- model
- end
-
- def self.to_r v
- return 0 if v == false
- return 1 if v == true
- v
- end
-
- end
-
- end
-end
diff --git a/lib/lazar.rb b/lib/lazar.rb
index f8a2732..3247200 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -29,7 +29,7 @@ PUBCHEM_URI = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/"
[ "array.rb",
"compound.rb",
- "dataset.rb",
+ #"dataset.rb",
"similarity.rb",
"model.rb",
"statistics.rb",
diff --git a/lib/model.rb b/lib/model.rb
index 0e011c5..c1dcb4e 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -1,26 +1,13 @@
+require 'matrix'
+
class Model
def initialize dir
@dir = dir
- @dependent_variables = File.readlines(File.join(@dir,"dependent_variables")).collect{|v| v.chomp}
- @dependent_variable_type = File.read(File.join(@dir, "dependent_variable_type")).chomp
- if @dependent_variable_type == "binary"
- abort "Incorrect dependent variable values '#{@dependent_variables.uniq.sort.join(",")}' for #{@dependent_variable_type} values" unless @dependent_variables.uniq.sort == ["0","1"]
- @dependent_variables = @dependent_variables.collect{|v| v.to_i}
- elsif @dependent_variable_type == "numeric"
- # TODO check for floats
- @dependent_variables = @dependent_variables.collect{|v| v.to_f}
- end
- @independent_variable_type = File.read(File.join(@dir, "independent_variable_type")).chomp
- @independent_variables = []
- @smiles = []
- File.readlines(File.join(@dir,"independent_variables")).each do |line|
- items = line.chomp.split(",")
- @smiles << items.shift
- items.collect!{|v| v.to_f} if @independent_variable_type == "numeric"
- @independent_variables << items
- end
- @similarity_thresholds = File.readlines(File.join(@dir,"similarity_thresholds")).collect{|v| v.chomp.to_f}
+ @similarity_thresholds = File.readlines(File.join(@dir,"similarity-thresholds")).collect{|v| v.chomp.to_f}
+ @smiles = Vector[ *File.readlines(File.join(@dir,"smiles")).collect{|v| v.chomp} ]
+ @dependent_variables = Vector[ *File.readlines(File.join(@dir,"dependent-variables")).collect{|v| v.chomp} ]
+ abort "Unequal number of smiles (#{@smiles.size}) and dependent-variables (#{@dependent_variables.size})." unless @smiles.size == @dependent_variables.size
end
def crossvalidation folds=10
@@ -29,89 +16,124 @@ class Model
indices = (0..nr_instances-1).to_a.shuffle
mid = (nr_instances/folds)
start = 0
+ threads = []
0.upto(folds-1) do |i|
- t = Time.now
- print "Fold #{i}: "
- # split train data
- last = start+mid
- last = last-1 unless nr_instances%folds > i
- test_idxs = indices[start..last] || []
- idxs = {
- :test => test_idxs,
- :train => indices-test_idxs
- }
- start = last+1
- # write training/test data
- cv_dir = File.join(@dir,"crossvalidation",i.to_s)
- dirs = {}
- idxs.each do |t,idx|
- d = File.join cv_dir,t.to_s
- dirs[t] = d
- FileUtils.mkdir_p d
- File.open(File.join(d,"independent_variables"),"w+") do |f|
- idx.each do |i|
- f.print "#{@smiles[i]},"
- f.puts @independent_variables[i].join(",")
- end
- end
- File.open(File.join(d,"dependent_variables"),"w+"){ |f| f.puts idx.collect{|i| @dependent_variables[i]}.join("\n")}
- if t == :train
- File.open(File.join(d,"dependent_variable_type"),"w+"){ |f| f.puts @dependent_variable_type }
- File.open(File.join(d,"independent_variable_type"),"w+"){ |f| f.puts @independent_variable_type }
- File.open(File.join(d,"similarity_thresholds"),"w+"){ |f| f.puts @similarity_thresholds.join("\n") }
+ # split train data (computed outside the thread because `start` is shared across folds and must advance sequentially)
+ last = start+mid
+ last = last-1 unless nr_instances%folds > i
+ test_idxs = indices[start..last] || []
+ idxs = {
+ :test => test_idxs,
+ :train => indices-test_idxs
+ }
+ start = last+1
+ threads << Thread.new do
+ t = Time.now
+ puts "Fold #{i} started"
+ # write training/test data
+ cv_dir = File.join(@dir,"crossvalidation",i.to_s)
+ dirs = {}
+ idxs.each do |t,idx|
+ d = File.join cv_dir,t.to_s
+ dirs[t] = d
+ FileUtils.mkdir_p d
+ File.open(File.join(d,"independent-variables"),"w+") { |f| f.puts idx.collect{|i| @independent_variables[i].join(",")}.join("\n") }
+ File.open(File.join(d,"smiles"),"w+") { |f| f.puts idx.collect{|i| @smiles[i]}.join("\n") }
+ File.open(File.join(d,"dependent-variables"),"w+"){ |f| f.puts idx.collect{|i| @dependent_variables[i]}.join("\n")}
+ File.open(File.join(d,"similarity-thresholds"),"w+"){ |f| f.puts @similarity_thresholds.join("\n") } if t == :train
end
+ # predict
+ train_model = self.class.new dirs[:train]
+ train_model.batch_predict dirs[:test], File.join(dirs[:test],"predictions")
+ puts "Fold #{i}: #{(Time.now-t)/60} min"
+ end
+ end
+ threads.each(&:join)
+ puts "Total: #{(Time.now-start_time)/60} min"
+ end
+
+ def batch_predict dir, out=$stdout
+ prediction_smiles = File.readlines(File.join(dir,"smiles")).collect{|smi| smi.chomp}
+ File.open(out, "w+") do |f|
+ File.readlines(File.join(dir,"independent-variables")).each_with_index do |line,i|
+ variables = line.chomp.split(",")
+ f.puts predict(prediction_smiles[i],variables).join(",")
end
- # predict
- train_model = self.class.new dirs[:train]
- train_model.predict_fold File.join(dirs[:test],"independent_variables")
- puts Time.now-t
end
- puts "Total: #{Time.now-start_time}"
end
end
-class ClassificationModel < Model
+module Cosine
- def predict_fold independent_variable_file
- pred_dir = File.dirname independent_variable_file
- predictions = []
- File.readlines(independent_variable_file).each do |line|
- variables = line.chomp.split(",")
- smiles = variables.shift
- variables = variables.collect{|v| v.to_f} if @independent_variable_type == "numeric"
- predictions << predict(smiles,variables)
+ def preprocess
+ puts "Feature selection"
+ t = Time.now
+ @selected = (0..@independent_variables.first.size-1).to_a
+ columns = Matrix[ *@independent_variables ].column_vectors
+ columns.each_with_index do |c,i|
+ next unless @selected.include? i
+ p "#{i}/#{@selected.size}"
+ # remove variables with zero variances
+ if c.to_a.zero_variance?
+ @selected.delete i
+ next
+ end
+ # remove correlated variables
+ (i+1..columns.size-1).each do |j|
+ next unless @selected.include? j
+ @selected.delete(j) if c.to_a.r(columns[j].to_a).abs > 0.9
+ end
end
- File.open(File.join(pred_dir,"classification"),"w+") { |f| predictions.each {|p| f.puts p.join(",")} }
+ @selected.sort!
+ p
+ mat = @selected.collect{|i| @independent_variables[i]}
+ columns = Matrix[ *mat ].column_vectors
+ # means/standard deviations of the selected columns, referenced below and in predict
+ @selected_variable_means = columns.collect{|c| c.to_a.mean}
+ @selected_variable_standard_deviations = columns.collect{|c| c.to_a.standard_deviation}
+ scaled_columns = []
+ columns.each_with_index{|col,i| scaled_columns << col.collect{|v| v ? (v-@selected_variable_means[i])/@selected_variable_standard_deviations[i] : nil}}
+ @scaled_independent_variables = Matrix.columns(scaled_columns).to_a
+ p @scaled_independent_variables.size, @selected_variable_means.size, @selected_variable_standard_deviations.size
+ puts (Time.now-t)/60
end
- def predict_file independent_variable_file
- predictions = []
- File.readlines(independent_variable_file).each do |line|
- variables = line.chomp.split(",")
- variables = variables.collect{|v| v.to_f} if @independent_variable_type == "numeric"
- puts predict("",variables).join(",")
- end
+ def predict smiles, variables
+ variables.collect!{|v| v.to_f}
+ preprocess unless @scaled_independent_variables # lazy preprocessing
+ selected_variables = @selected.collect{|i| variables[i]}
+ scaled_variables = selected_variables.collect.with_index{|v,i| (v-@selected_variable_means[i])/@selected_variable_standard_deviations[i]}
+ similarities = @scaled_independent_variables.collect{|row| Similarity.cosine([row,scaled_variables])}
+ similarity_prediction smiles, similarities
end
+end
+
+module Tanimoto
+
def predict_smiles smiles
c = Compound.from_smiles(smiles)
- predict c.smiles, c.fingerprint
+ predict smiles, c.fingerprint
end
-
- def predict smiles, variables
- similarities = []
- @independent_variables.each do |row|
- if @independent_variable_type == "binary"
- similarities << Similarity.tanimoto([row, variables])
- elsif @independent_variable_type == "numeric"
- similarities << Similarity.cosine([row, variables])
- end
- end
- neighbor_idx = similarities.each_index.select{|i| similarities[i] > @similarity_thresholds[1]}
- neighbor_idx = similarities.each_index.select{|i| similarities[i] > @similarity_thresholds[0]} if neighbor_idx.size < 2 # lower similarity threshold
+ def predict smiles, fingerprint
+ similarities = @independent_variables.collect{|row| Similarity.tanimoto([row,fingerprint])}
+ similarity_prediction smiles, similarities
+ end
+end
+
+class ClassificationModel < Model
+
+ def initialize dir
+ super dir
+ abort "Incorrect binary dependent variable values (#{@dependent_variables.uniq.sort.join(",")}). Expecting 0 and 1." unless @dependent_variables.uniq.sort == ["0","1"]
+ @dependent_variables = @dependent_variables.collect{|v| v.to_i}
+ end
+
+ def similarity_prediction smiles, similarities
+ neighbor_idx = similarities.to_a.each_index.select{|i| similarities[i] > @similarity_thresholds[1]}
+ neighbor_idx = similarities.to_a.each_index.select{|i| similarities[i] > @similarity_thresholds[0]} if neighbor_idx.size < 2 # lower similarity threshold
neighbor_idx.select!{|i| @smiles[i] != smiles} # remove identical compounds
- experimental = @dependent_variables[@smiles.index(smiles)] if @smiles.include? smiles
+ experimental = @dependent_variables[@smiles.to_a.index(smiles)] if @smiles.include? smiles
return [smiles,experimental,nil,nil,nil,similarities.max,neighbor_idx.size] if neighbor_idx.size < 2
neighbor_dependent_variables = neighbor_idx.collect{|i| @dependent_variables[i]}
@@ -119,20 +141,17 @@ class ClassificationModel < Model
probabilities = weighted_majority_vote(neighbor_dependent_variables, neighbor_similarities)
probabilities[1] > probabilities[0] ? classification = 1 : classification = 0
- #p neighbor_dependent_variables.join ","
- #p neighbor_similarities.join ","
- #p neighbor_idx.collect{|i| @smiles[i]}
[ smiles, experimental, classification ] + probabilities + [ neighbor_similarities.max, neighbor_idx.size ]
end
-
+
# Weighted majority vote
- # @param [Array<0,1>] dependent_variables
+ # @param [Array<0,1>] neighbor_dependent_variables
# @param [Array<Float>] weights
# @return [Array] probabilities
- def weighted_majority_vote dependent_variables, weights
+ def weighted_majority_vote neighbor_dependent_variables, weights
w = []
- w[0] = weights.each_index.select{|i| dependent_variables[i] == 0}.collect{|i| weights[i]}
- w[1] = weights.each_index.select{|i| dependent_variables[i] == 1}.collect{|i| weights[i]}
+ w[0] = weights.each_index.select{|i| neighbor_dependent_variables[i] == 0}.collect{|i| weights[i]}
+ w[1] = weights.each_index.select{|i| neighbor_dependent_variables[i] == 1}.collect{|i| weights[i]}
weights_sum = weights.sum.to_f
weights_max = weights.max.to_f
probabilities = []
@@ -141,3 +160,36 @@ class ClassificationModel < Model
probabilities
end
end
+
+class RegressionModel < Model
+end
+
+class TanimotoClassificationModel < ClassificationModel
+ include Tanimoto
+
+ def initialize dir
+ super dir
+ @independent_variables = Vector[ *File.readlines(File.join(@dir,"independent-variables")).collect { |line| line.chomp.split(",") } ]
+ abort "Unequal number of dependent-variables (#{@dependent_variables.size}) and independent-variables rows (#{@independent_variables.size})." unless @dependent_variables.size == @independent_variables.size
+ end
+end
+
+class CosineClassificationModel < ClassificationModel
+ include Cosine
+
+ def initialize dir
+ super dir
+ @independent_variables = Matrix[
+ *File.readlines(File.join(@dir,"independent-variables")).collect { |line| line.chomp.split(",").collect{|v| v.to_f} }
+ ]
+ abort "Unequal number of dependent-variables (#{@dependent_variables.size}) and independent-variables rows (#{@independent_variables.row_vectors.size})." unless @dependent_variables.size == @independent_variables.row_vectors.size
+ abort "Unequal number of independent-variable-names (#{@independent_variable_names.size}) and independent-variables columns (#{@independent_variables.column_vectors.size})." unless @independent_variable_names.size == @independent_variables.row_vectors.size
+ end
+
+end
+
+class TanimotoRegressionModel < RegressionModel
+end
+
+class CosineRegressionModel < RegressionModel
+end
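A hypothetical usage sketch of the refactored model classes above; the training directory name and its file layout (smiles, dependent-variables, independent-variables, similarity-thresholds) are assumptions derived from the constructors, and lib/lazar.rb is assumed to be loadable from the working directory:

    require_relative "lazar"

    model = TanimotoClassificationModel.new "data/mutagenicity"   # hypothetical training directory
    model.crossvalidation 10                                      # threaded 10-fold crossvalidation
    # prediction row: smiles, experimental, classification, p(0), p(1), max similarity, neighbor count
    puts model.predict_smiles("c1ccccc1N").join(",")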
diff --git a/lib/statistics.rb b/lib/statistics.rb
index 15ea416..e14dc7c 100644
--- a/lib/statistics.rb
+++ b/lib/statistics.rb
@@ -3,7 +3,7 @@ class ClassificationStatistics
def initialize dir
@dir = dir
@folds = Dir[File.join(@dir,"[0-9]*")]
- @confusion_matrix_dir = File.join(@dir,"confusion_matrices")
+ @confusion_matrix_dir = File.join(@dir,"confusion-matrices")
@summaries_dir = File.join(@dir,"summaries")
end
@@ -17,12 +17,12 @@ class ClassificationStatistics
@folds.each do |dir|
test_dir = File.join(dir,"test")
- classifications = File.readlines(File.join(test_dir,"classification")).collect{|row| row.chomp.split(",")}
- measurements = File.readlines(File.join(test_dir,"dependent_variables")).collect{|v| v.to_i}
- similarity_thresholds = File.readlines(File.join(dir,"train","similarity_thresholds")).collect{|v| v.chomp.to_f}
+ classifications = File.readlines(File.join(test_dir,"predictions")).collect{|row| row.chomp.split(",")}
+ measurements = File.readlines(File.join(test_dir,"dependent-variables")).collect{|v| v.to_i}
+ similarity_thresholds = File.readlines(File.join(dir,"train","similarity-thresholds")).collect{|v| v.chomp.to_f}
classifications.each_with_index do |c,i|
- prediction = c[1]
- max_sim = c[4].to_f
+ prediction = c[2]
+ max_sim = c[5].to_f
unless prediction.empty?
prediction = prediction.to_i
if prediction == 1 and measurements[i] == 1