From a29eb3e38414cd252850c9c4fb356f8b2bef6fb4 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 12 Feb 2021 19:54:07 +0100 Subject: model.rb refactored, mp2d models updated --- lib/array.rb | 30 ++++-- lib/dataset.rb | 9 +- lib/feature-selection.rb | 56 +++++++++++ lib/feature_selection.rb | 45 --------- lib/lazar.rb | 2 +- lib/model.rb | 236 +++++++++++++++++++++++++++++------------------ lib/statistics.rb | 12 +-- 7 files changed, 238 insertions(+), 152 deletions(-) create mode 100644 lib/feature-selection.rb delete mode 100644 lib/feature_selection.rb (limited to 'lib') diff --git a/lib/array.rb b/lib/array.rb index 55186bd..32a8e5d 100644 --- a/lib/array.rb +++ b/lib/array.rb @@ -1,3 +1,4 @@ +# TODO increase speed with vectors, matrices? class Array # Sum the size of single arrays in an array of arrays @@ -74,14 +75,29 @@ class Array end end - # Collect array with index - # in analogy to each_with_index - def collect_with_index - result = [] - self.each_with_index do |elt, idx| - result << yield(elt, idx) + def remove indices + out = self + indices.sort.reverse_each do |i| + out.delete_at i end - result + out + end + + # Correlation coefficient + # @param [Array] + # @return [Numeric] + def r(y) + raise "Argument is not a Array class!" unless y.class == Array + raise "Self array is nil!" if self.size == 0 + raise "Argument array size is invalid!" unless self.size == y.size + + mean_x = self.inject(0) { |s, a| s += a } / self.size.to_f + mean_y = y.inject(0) { |s, a| s += a } / y.size.to_f + cov = self.zip(y).inject(0) { |s, a| s += (a[0] - mean_x) * (a[1] - mean_y) } + var_x = self.inject(0) { |s, a| s += (a - mean_x) ** 2 } + var_y = y.inject(0) { |s, a| s += (a - mean_y) ** 2 } + r = cov / Math.sqrt(var_x) + r /= Math.sqrt(var_y) end end diff --git a/lib/dataset.rb b/lib/dataset.rb index e30f000..f40f054 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -12,7 +12,7 @@ class Dataset @independent_variable_type = File.read(File.join(@dir,"independent_variable_type")).chomp @lines = File.readlines(file) @header = @lines.shift.split(",") - @header.first.match(/ID/i) ? @has_id = true : @has_id = false + @header.first.match(/ID|SMILES/i) ? 
@has_id = true : @has_id = false @dependent_variable_name = @header.pop @ids = [] @dependent_variables = [] @@ -43,6 +43,13 @@ class Dataset end @independent_variables = Matrix[ *@independent_variables ] columns = @independent_variables.column_vectors + stdev = columns.collect{|c| c.to_a.standard_deviation} + stdev.each_index.reverse_each do |i| + if stdev[i] == 0 + @independent_variable_names.delete_at(i) + columns.delete_at(i) + end + end @independent_variable_means = columns.collect{|c| c.to_a.mean} @independent_variable_standard_deviations = columns.collect{|c| c.to_a.standard_deviation} scaled_columns = [] diff --git a/lib/feature-selection.rb b/lib/feature-selection.rb new file mode 100644 index 0000000..8601737 --- /dev/null +++ b/lib/feature-selection.rb @@ -0,0 +1,56 @@ +module OpenTox + module Algorithm + + # Feature selection algorithms + module FeatureSelection + + class Supervised + + # Select features correlated to the models prediction feature + # @param [OpenTox::Model::Lazar] + def self.correlation_filter model + relevant_features = {} + R.assign "dependent", model.dependent_variables.collect{|v| to_r(v)} + model.descriptor_weights = [] + selected_variables = [] + selected_descriptor_ids = [] + model.independent_variables.each_with_index do |v,i| + v.collect!{|n| to_r(n)} + R.assign "independent", v + begin + R.eval "cor <- cor.test(dependent,independent,method = 'pearson',use='pairwise')" + pvalue = R.eval("cor$p.value").to_ruby + if pvalue <= 0.05 + model.descriptor_weights << R.eval("cor$estimate").to_ruby**2 + selected_variables << v + selected_descriptor_ids << model.descriptor_ids[i] + end + rescue + warn "Correlation of '#{model.prediction_feature.name}' (#{model.dependent_variables}) with (#{v}) failed." + end + end + + model.independent_variables = selected_variables + model.descriptor_ids = selected_descriptor_ids + model + end + + def self.to_r v + return 0 if v == false + return 1 if v == true + v + end + + end + + class Unsupervised + # Select features correlated to the models prediction feature + # @param [OpenTox::Model::Lazar] + def self.nonredundant independent_variables + end + end + + end + + end +end diff --git a/lib/feature_selection.rb b/lib/feature_selection.rb deleted file mode 100644 index c596b1f..0000000 --- a/lib/feature_selection.rb +++ /dev/null @@ -1,45 +0,0 @@ -module OpenTox - module Algorithm - - # Feature selection algorithms - class FeatureSelection - - # Select features correlated to the models prediction feature - # @param [OpenTox::Model::Lazar] - def self.correlation_filter model - relevant_features = {} - R.assign "dependent", model.dependent_variables.collect{|v| to_r(v)} - model.descriptor_weights = [] - selected_variables = [] - selected_descriptor_ids = [] - model.independent_variables.each_with_index do |v,i| - v.collect!{|n| to_r(n)} - R.assign "independent", v - begin - R.eval "cor <- cor.test(dependent,independent,method = 'pearson',use='pairwise')" - pvalue = R.eval("cor$p.value").to_ruby - if pvalue <= 0.05 - model.descriptor_weights << R.eval("cor$estimate").to_ruby**2 - selected_variables << v - selected_descriptor_ids << model.descriptor_ids[i] - end - rescue - warn "Correlation of '#{model.prediction_feature.name}' (#{model.dependent_variables}) with (#{v}) failed." 
- end - end - - model.independent_variables = selected_variables - model.descriptor_ids = selected_descriptor_ids - model - end - - def self.to_r v - return 0 if v == false - return 1 if v == true - v - end - - end - - end -end diff --git a/lib/lazar.rb b/lib/lazar.rb index f8a2732..3247200 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -29,7 +29,7 @@ PUBCHEM_URI = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/" [ "array.rb", "compound.rb", - "dataset.rb", + #"dataset.rb", "similarity.rb", "model.rb", "statistics.rb", diff --git a/lib/model.rb b/lib/model.rb index 0e011c5..c1dcb4e 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -1,26 +1,13 @@ +require 'matrix' + class Model def initialize dir @dir = dir - @dependent_variables = File.readlines(File.join(@dir,"dependent_variables")).collect{|v| v.chomp} - @dependent_variable_type = File.read(File.join(@dir, "dependent_variable_type")).chomp - if @dependent_variable_type == "binary" - abort "Incorrect dependent variable values '#{@dependent_variables.uniq.sort.join(",")}' for #{@dependent_variable_type} values" unless @dependent_variables.uniq.sort == ["0","1"] - @dependent_variables = @dependent_variables.collect{|v| v.to_i} - elsif @dependent_variable_type == "numeric" - # TODO check for floats - @dependent_variables = @dependent_variables.collect{|v| v.to_f} - end - @independent_variable_type = File.read(File.join(@dir, "independent_variable_type")).chomp - @independent_variables = [] - @smiles = [] - File.readlines(File.join(@dir,"independent_variables")).each do |line| - items = line.chomp.split(",") - @smiles << items.shift - items.collect!{|v| v.to_f} if @independent_variable_type == "numeric" - @independent_variables << items - end - @similarity_thresholds = File.readlines(File.join(@dir,"similarity_thresholds")).collect{|v| v.chomp.to_f} + @similarity_thresholds = File.readlines(File.join(@dir,"similarity-thresholds")).collect{|v| v.chomp.to_f} + @smiles = Vector[ *File.readlines(File.join(@dir,"smiles")).collect{|v| v.chomp} ] + @dependent_variables = Vector[ *File.readlines(File.join(@dir,"dependent-variables")).collect{|v| v.chomp} ] + abort "Unequal number of smiles (#{@smiles.size}) and dependent-variables (#{@dependent_variables.size})." 
unless @smiles.size == @dependent_variables.size end def crossvalidation folds=10 @@ -29,89 +16,124 @@ class Model indices = (0..nr_instances-1).to_a.shuffle mid = (nr_instances/folds) start = 0 + threads = [] 0.upto(folds-1) do |i| - t = Time.now - print "Fold #{i}: " - # split train data - last = start+mid - last = last-1 unless nr_instances%folds > i - test_idxs = indices[start..last] || [] - idxs = { - :test => test_idxs, - :train => indices-test_idxs - } - start = last+1 - # write training/test data - cv_dir = File.join(@dir,"crossvalidation",i.to_s) - dirs = {} - idxs.each do |t,idx| - d = File.join cv_dir,t.to_s - dirs[t] = d - FileUtils.mkdir_p d - File.open(File.join(d,"independent_variables"),"w+") do |f| - idx.each do |i| - f.print "#{@smiles[i]}," - f.puts @independent_variables[i].join(",") - end - end - File.open(File.join(d,"dependent_variables"),"w+"){ |f| f.puts idx.collect{|i| @dependent_variables[i]}.join("\n")} - if t == :train - File.open(File.join(d,"dependent_variable_type"),"w+"){ |f| f.puts @dependent_variable_type } - File.open(File.join(d,"independent_variable_type"),"w+"){ |f| f.puts @independent_variable_type } - File.open(File.join(d,"similarity_thresholds"),"w+"){ |f| f.puts @similarity_thresholds.join("\n") } + threads << Thread.new do + t = Time.now + puts "Fold #{i} started" + # split train data + last = start+mid + last = last-1 unless nr_instances%folds > i + test_idxs = indices[start..last] || [] + idxs = { + :test => test_idxs, + :train => indices-test_idxs + } + start = last+1 + # write training/test data + cv_dir = File.join(@dir,"crossvalidation",i.to_s) + dirs = {} + idxs.each do |t,idx| + d = File.join cv_dir,t.to_s + dirs[t] = d + FileUtils.mkdir_p d + File.open(File.join(d,"independent-variables"),"w+") { |f| f.puts idx.collect{|i| @independent_variables[i].join(",")}.join("\n") } + File.open(File.join(d,"smiles"),"w+") { |f| f.puts idx.collect{|i| @smiles[i]}.join("\n") } + File.open(File.join(d,"dependent-variables"),"w+"){ |f| f.puts idx.collect{|i| @dependent_variables[i]}.join("\n")} + File.open(File.join(d,"similarity-thresholds"),"w+"){ |f| f.puts @similarity_thresholds.join("\n") } if t == :train end + # predict + train_model = self.class.new dirs[:train] + train_model.batch_predict dirs[:test], File.join(dirs[:test],"predictions") + puts "Fold #{i}: #{(Time.now-t)/60} min" + end + end + threads.each(&:join) + puts "Total: #{(Time.now-start_time)/60} min" + end + + def batch_predict dir, out=$stdout + prediction_smiles = File.readlines(File.join(dir,"smiles")).collect{|smi| smi.chomp} + File.open(out, "w+") do |f| + File.readlines(File.join(dir,"independent-variables")).each_with_index do |line,i| + variables = line.chomp.split(",") + f.puts predict(prediction_smiles[i],variables).join(",") end - # predict - train_model = self.class.new dirs[:train] - train_model.predict_fold File.join(dirs[:test],"independent_variables") - puts Time.now-t end - puts "Total: #{Time.now-start_time}" end end -class ClassificationModel < Model +module Cosine - def predict_fold independent_variable_file - pred_dir = File.dirname independent_variable_file - predictions = [] - File.readlines(independent_variable_file).each do |line| - variables = line.chomp.split(",") - smiles = variables.shift - variables = variables.collect{|v| v.to_f} if @independent_variable_type == "numeric" - predictions << predict(smiles,variables) + def preprocess + puts "Feature selection" + t = Time.now + @selected = (0..@independent_variables.first.size-1).to_a + columns = Matrix[ 
*@independent_variables ].column_vectors + columns.each_with_index do |c,i| + next unless @selected.include? i + p "#{i}/#{@selected.size}" + # remove variables with zero variances + if c.to_a.zero_variance? + @selected.delete i + next + end + # remove correlated variables + (i+1..columns.size-1).each do |j| + next unless @selected.include? j + @selected.delete(j) if c.to_a.r(columns[j].to_a).abs > 0.9 + end end - File.open(File.join(pred_dir,"classification"),"w+") { |f| predictions.each {|p| f.puts p.join(",")} } + @selected.sort! + p + mat = @selected.collect{|i| @independent_variables[i]} + columns = Matrix[ *mat ].column_vectors + @independent_variable_means = columns.collect{|c| c.to_a.mean} + @independent_variable_standard_deviations = columns.collect{|c| c.to_a.standard_deviation} + scaled_columns = [] + columns.each_with_index{|col,i| scaled_columns << col.collect{|v| v ? (v-@selected_variable_means[i])/@selected_variable_standard_deviations[i] : nil}} + @scaled_independent_variables = Matrix.columns(scaled_columns).to_a + p @scaled_independent_variables.size, @selected_variable_means.size, @selected_variable_standard_deviations.size + puts (Time.now-t)/60 end - def predict_file independent_variable_file - predictions = [] - File.readlines(independent_variable_file).each do |line| - variables = line.chomp.split(",") - variables = variables.collect{|v| v.to_f} if @independent_variable_type == "numeric" - puts predict("",variables).join(",") - end + def predict smiles, variables + variables.collect!{|v| v.to_f} + preprocess unless @scaled_independent_variables # lazy preprocessing + selected_variables = @selected.collect{|i| variables[i]} + scaled_variables = selected_variables.each_with_index{|v,i| (v-@selected_variable_means[i])/@selected_variable_standard_deviations[i]} + similarities = @scaled_independent_variables.collect{|row| Similarity.cosine([row,scaled_variables])} + similarity_prediction smiles, similarities end +end + +module Tanimoto + def predict_smiles smiles c = Compound.from_smiles(smiles) - predict c.smiles, c.fingerprint + predict smiles, c.fingerprint end - - def predict smiles, variables - similarities = [] - @independent_variables.each do |row| - if @independent_variable_type == "binary" - similarities << Similarity.tanimoto([row, variables]) - elsif @independent_variable_type == "numeric" - similarities << Similarity.cosine([row, variables]) - end - end - neighbor_idx = similarities.each_index.select{|i| similarities[i] > @similarity_thresholds[1]} - neighbor_idx = similarities.each_index.select{|i| similarities[i] > @similarity_thresholds[0]} if neighbor_idx.size < 2 # lower similarity threshold + def predict smiles, fingerprint + similarities = @independent_variables.collect{|row| Similarity.tanimoto([row,fingerprint])} + similarity_prediction smiles, similarities + end +end + +class ClassificationModel < Model + + def initialize dir + super dir + abort "Incorrect binary dependent variable values (#{@dependent_variables.uniq.sort.join(",")}). Expecting 0 and 1." 
unless @dependent_variables.uniq.sort == ["0","1"] + @dependent_variables = @dependent_variables.collect{|v| v.to_i} + end + + def similarity_prediction smiles, similarities + neighbor_idx = similarities.to_a.each_index.select{|i| similarities[i] > @similarity_thresholds[1]} + neighbor_idx = similarities.to_a.each_index.select{|i| similarities[i] > @similarity_thresholds[0]} if neighbor_idx.size < 2 # lower similarity threshold neighbor_idx.select!{|i| @smiles[i] != smiles} # remove identical compounds - experimental = @dependent_variables[@smiles.index(smiles)] if @smiles.include? smiles + experimental = @dependent_variables[@smiles.to_a.index(smiles)] if @smiles.include? smiles return [smiles,experimental,nil,nil,nil,similarities.max,neighbor_idx.size] if neighbor_idx.size < 2 neighbor_dependent_variables = neighbor_idx.collect{|i| @dependent_variables[i]} @@ -119,20 +141,17 @@ class ClassificationModel < Model probabilities = weighted_majority_vote(neighbor_dependent_variables, neighbor_similarities) probabilities[1] > probabilities[0] ? classification = 1 : classification = 0 - #p neighbor_dependent_variables.join "," - #p neighbor_similarities.join "," - #p neighbor_idx.collect{|i| @smiles[i]} [ smiles, experimental, classification ] + probabilities + [ neighbor_similarities.max, neighbor_idx.size ] end - + # Weighted majority vote - # @param [Array<0,1>] dependent_variables + # @param [Array<0,1>] neighbor_dependent_variables # @param [Array] weights # @return [Array] probabilities - def weighted_majority_vote dependent_variables, weights + def weighted_majority_vote neighbor_dependent_variables, weights w = [] - w[0] = weights.each_index.select{|i| dependent_variables[i] == 0}.collect{|i| weights[i]} - w[1] = weights.each_index.select{|i| dependent_variables[i] == 1}.collect{|i| weights[i]} + w[0] = weights.each_index.select{|i| neighbor_dependent_variables[i] == 0}.collect{|i| weights[i]} + w[1] = weights.each_index.select{|i| neighbor_dependent_variables[i] == 1}.collect{|i| weights[i]} weights_sum = weights.sum.to_f weights_max = weights.max.to_f probabilities = [] @@ -141,3 +160,36 @@ class ClassificationModel < Model probabilities end end + +class RegressionModel < Model +end + +class TanimotoClassificationModel < ClassificationModel + include Tanimoto + + def initialize dir + super dir + @independent_variables = Vector[ *File.readlines(File.join(@dir,"independent-variables")).collect { |line| line.chomp.split(",") } ] + abort "Unequal number of dependent-variables (#{@dependent_variables.size}) and independent-variables rows (#{@independent_variables.size})." unless @dependent_variables.size == @independent_variables.size + end +end + +class CosineClassificationModel < ClassificationModel + include Cosine + + def initialize dir + super dir + @independent_variables = Matrix[ + *File.readlines(File.join(@dir,"independent-variables")).collect { |line| line.chomp.split(",").collect{|v| v.to_f} } + ] + abort "Unequal number of dependent-variables (#{@dependent_variables.size}) and independent-variables rows (#{@independent_variables.row_vectors.size})." unless @dependent_variables.size == @independent_variables.row_vectors.size + abort "Unequal number of independent-variable-names (#{@independent_variable_names.size}) and independent-variables columns (#{@independent_variables.column_vectors.size})." 
unless @independent_variable_names.size == @independent_variables.row_vectors.size + end + +end + +class TanimotoRegressionModel < RegressionModel +end + +class CosineRegressionModel < RegressionModel +end diff --git a/lib/statistics.rb b/lib/statistics.rb index 15ea416..e14dc7c 100644 --- a/lib/statistics.rb +++ b/lib/statistics.rb @@ -3,7 +3,7 @@ class ClassificationStatistics def initialize dir @dir = dir @folds = Dir[File.join(@dir,"[0-9]*")] - @confusion_matrix_dir = File.join(@dir,"confusion_matrices") + @confusion_matrix_dir = File.join(@dir,"confusion-matrices") @summaries_dir = File.join(@dir,"summaries") end @@ -17,12 +17,12 @@ class ClassificationStatistics @folds.each do |dir| test_dir = File.join(dir,"test") - classifications = File.readlines(File.join(test_dir,"classification")).collect{|row| row.chomp.split(",")} - measurements = File.readlines(File.join(test_dir,"dependent_variables")).collect{|v| v.to_i} - similarity_thresholds = File.readlines(File.join(dir,"train","similarity_thresholds")).collect{|v| v.chomp.to_f} + classifications = File.readlines(File.join(test_dir,"predictions")).collect{|row| row.chomp.split(",")} + measurements = File.readlines(File.join(test_dir,"dependent-variables")).collect{|v| v.to_i} + similarity_thresholds = File.readlines(File.join(dir,"train","similarity-thresholds")).collect{|v| v.chomp.to_f} classifications.each_with_index do |c,i| - prediction = c[1] - max_sim = c[4].to_f + prediction = c[2] + max_sim = c[5].to_f unless prediction.empty? prediction = prediction.to_i if prediction == 1 and measurements[i] == 1 -- cgit v1.2.3
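
Editor's usage note (not part of the patch): a minimal sketch of how the refactored model classes introduced above might be exercised. It assumes lib/lazar.rb (which requires model.rb and its dependencies) has been loaded, and that a model directory containing the files read by Model#initialize and TanimotoClassificationModel#initialize exists ("smiles", "dependent-variables", "independent-variables", "similarity-thresholds"). The directory names and the query SMILES below are placeholders, not files shipped with this commit.

    require_relative "lib/lazar"

    # MP2D fingerprint based classification; CosineClassificationModel would be
    # used analogously for numeric descriptor matrices.
    model = TanimotoClassificationModel.new "models/mutagenicity"   # hypothetical directory

    # Single prediction from a SMILES string; returns
    # [smiles, experimental, classification, p(0), p(1), max_similarity, nr_neighbors],
    # or nil entries for the prediction if fewer than 2 neighbors are found.
    p model.predict_smiles "c1ccccc1N"   # hypothetical query compound

    # Batch predictions for a query directory containing "smiles" and
    # "independent-variables"; results are written to the given output file.
    model.batch_predict "queries", File.join("queries","predictions")

    # 10-fold crossvalidation; folds are written to
    # <model_dir>/crossvalidation/<i>/{train,test} and predicted in parallel threads.
    model.crossvalidation 10

Note on the design as it appears in the patch: the Tanimoto and Cosine mixins only supply the similarity calculation (plus, for Cosine, the lazy feature selection and scaling in preprocess), while neighbor selection via the two similarity thresholds and the weighted majority vote live in ClassificationModel#similarity_prediction, so both model types share the same prediction logic.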