author    Christoph Helma <helma@in-silico.ch>    2021-03-18 16:48:36 +0100
committer Christoph Helma <helma@in-silico.ch>    2021-03-18 16:48:36 +0100
commit    83591831c6e36c36d87159acba6afdfedab95522 (patch)
tree      aeab99f16956468d432b24ecabf447fb06ab8e66 /lib
parent    1dcd741a5bff8dc41abf0840f59031eb557ff230 (diff)
fingerprint predictions added (single-input-file)
Diffstat (limited to 'lib')
-rw-r--r--  lib/model.rb  55
1 file changed, 43 insertions, 12 deletions
diff --git a/lib/model.rb b/lib/model.rb
index d62d889..1726690 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -6,7 +6,7 @@ end
class Model
- attr_reader :train, :dependent_variable_name, :independent_variable_names
+ attr_reader :train, :dependent_variable_name, :independent_variable_names, :minsim
def initialize file
puts "Reading training data from #{file}."
@@ -14,13 +14,14 @@ class Model
header = @train.shift
@dependent_variable_name = header[1]
@independent_variable_names = header[2..-1]
+ model_type
end
def model_type
puts "Determining model type."
- if dependent_variables.uniq == ["1","0"]
+ if dependent_variables.uniq.sort == ["0","1"]
@dependent_variable_type = "binary"
- @train.each {|t| t[1] == t[1].to_i}
+ @train.each {|t| t[1] = t[1].to_i}
elsif dependent_variables.collect{|v| v.numeric?}.uniq == [true]
@dependent_variable_type = "numeric"
@train.each {|t| t[1] = t[1].to_f }
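model_type calls numeric? on strings, which is not a core Ruby method; the predicate is presumably defined elsewhere in the library. A minimal sketch of such a helper, assuming it simply tests whether the string parses as a Float (Ruby >= 2.6 for the exception: false keyword):

# Hypothetical helper; the actual definition is expected elsewhere in the repository.
class String
  # true if the string parses as a floating point number
  def numeric?
    !Float(self, exception: false).nil?
  end
end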
@@ -29,25 +30,25 @@ class Model
end
if independent_variables.flatten.collect{|v| v.numeric?}.uniq == [false]
@independent_variable_type = "set"
+ @minsim = [0.5,0.2]
elsif independent_variables.flatten.collect{|v| v.numeric?}.uniq == [true]
@independent_variable_type = "numeric"
@train.each {|t| t[2..-1] = t[2..-1].collect{|v| v = v.to_f}}
+ @minsim = [0.9,0.7]
else
raise "Incorrect model independent variables [#{independent_variables.flatten.uniq}]. Must be either a set (fingerprints) or numeric."
end
end
def predict file
- model_type
puts "Reading prediction data from #{file}."
@batch = File.readlines(file).collect{|l| l.chomp.split(",")}
header = @batch.shift
- @batch_independent_variable_names = header[1..-1]
- unless (batch_independent_variables.flatten.collect{|v| v.numeric?}.uniq == [false] and @independent_variable_type == "set") or (batch_independent_variables.flatten.collect{|v| v.numeric?}.uniq == [true] and @independent_variable_type == "numeric")
+ unless (@batch.collect{|b| b[1..-1]}.flatten.collect{|v| v.numeric?}.uniq == [false] and @independent_variable_type == "set") or (batch_independent_variables.flatten.collect{|v| v.numeric?}.uniq == [true] and @independent_variable_type == "numeric")
raise "Incorrect batch independent variables [#{independent_variables.flatten.uniq}]. Must be #{@independent_variable_type}."
end
if @independent_variable_type == "numeric"
- @minsim = [0.9,0.7]
+ @batch_independent_variable_names = header[1..-1]
@batch.each {|t| t[1..-1] = t[1..-1].collect{|v| v = v.to_f}}
select(@independent_variable_names & @batch_independent_variable_names)
File.open(File.join(File.dirname(file),"common-variables.csv"),"w+") do |f|
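In the numeric branch the model keeps only descriptors present in both the training and the batch data: Array#& returns the ordered intersection of the two name lists, which is passed to select (defined elsewhere in model.rb) and recorded in common-variables.csv. A small illustration of the intersection step with hypothetical descriptor names:

train_names = ["logP", "MW", "TPSA"]
batch_names = ["MW", "HBD", "logP"]
train_names & batch_names  # => ["logP", "MW"]  (receiver order, duplicates removed)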
@@ -81,14 +82,22 @@ class Model
File.open(file.sub(".csv","-prediction.csv"),"w+") do |f|
f.puts ["Canonical SMILES","Experimental","Prediction","p-inactive","p-active","Max Simimilarity","Nr. Neighbors"].join(",")
@scaled_batch.each do |pred|
- classification(pred[0], @scaled_train.collect{|row| row[0..1] + [Similarity.cosine([row[2..-1],pred[1..-1]])]}).each do |pred|
+ classification(pred[0], @scaled_train.collect{|row| row[0..1] + [cosine([row[2..-1],pred[1..-1]])]}).each do |pred|
f.puts pred.join(",")
- #puts pred.join(",")
end
end
end
elsif @independent_variable_type == "set"
- @minsim = [0.5,0.2]
+ puts "Predicting #{file}."
+ File.open(file.sub(".csv","-prediction.csv"),"w+") do |f|
+ f.puts ["Canonical SMILES","Experimental","Prediction","p-inactive","p-active","Max Simimilarity","Nr. Neighbors"].join(",")
+ @batch.each do |fingerprints|
+ smi = fingerprints.shift
+ classification(smi, @train.collect{|row| row[0..1] + [tanimoto([row[2..-1],fingerprints])]}).each do |pred|
+ f.puts pred.join(",")
+ end
+ end
+ end
end
end
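The new set branch splits each batch row into a SMILES string and its fingerprint features and weights neighbors by the Tanimoto coefficient added further down in this diff (size of the intersection over size of the union of the two feature sets). A short worked example with hypothetical fingerprint fragments:

fp_a = ["C-C", "C-O", "C=O"]
fp_b = ["C-C", "C=O", "C-N"]
(fp_a & fp_b).size / (fp_a | fp_b).size.to_f  # => 0.5 (2 shared fragments out of 4 distinct)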
@@ -107,8 +116,7 @@ class Model
n = train.select{|row| row[2] > @minsim[0]}
n = train.select!{|row| row[2] > @minsim[1]} if n.size < 2
train = n
- #puts train.sort_by{|r| r[2]}[0..5].collect{|r| r.join(",")}.join("\n")
- #puts train.sort_by{|r| r[2]}.reverse.collect{|r| r.join(",")}.join("\n")
+ #p train.sort_by{|r| r[2]}[0..5]#.collect{|r| r.join(",")}.join("\n")
if train.size < 2
classification = nil
probabilities = [nil,nil]
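classification first keeps neighbors above the strict threshold @minsim[0] (0.5 for fingerprints, 0.9 for numeric descriptors) and, if fewer than two remain, falls back to the looser @minsim[1] (0.2 or 0.7); with still fewer than two neighbors no prediction is made. A condensed stand-alone sketch of that fallback, assuming rows of the form [smiles, activity, similarity] and using non-destructive select in both steps:

# Hypothetical helper mirroring the neighbor filtering above.
def select_neighbors rows, minsim
  n = rows.select { |r| r[2] > minsim[0] }                 # strict threshold first
  n = rows.select { |r| r[2] > minsim[1] } if n.size < 2   # relax if too few neighbors
  n
end
# select_neighbors(rows, [0.5, 0.2]) for fingerprint models, [0.9, 0.7] for numeric models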
@@ -122,6 +130,29 @@ class Model
end
end
+ # Get Euclidean distance
+ # @param [Array<Array<Float>>]
+ # @return [Float]
+ def euclid variables
+ sq = variables[0].zip(variables[1]).map{|a,b| (a - b) ** 2}
+ Math.sqrt(sq.inject(0) {|s,c| s + c})
+ end
+
+ # Get Tanimoto similarity
+ # @param [Array<Array<String>>]
+ # @return [Float]
+ def tanimoto fingerprints
+ ( fingerprints[0] & fingerprints[1] ).size/( fingerprints[0] | fingerprints[1] ).size.to_f
+ end
+
+ # Get cosine similarity
+ # http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity
+ # @param [Array<Array<Float>>]
+ # @return [Float]
+ def cosine variables
+ variables[0].dot_product(variables[1]) / (variables[0].magnitude * variables[1].magnitude)
+ end
+
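cosine calls dot_product and magnitude on plain arrays; neither is a core Array method, so both are presumably monkey-patched elsewhere in the library. A minimal sketch of such extensions, assuming equal-length numeric arrays:

# Hypothetical Array extensions; the real definitions are expected elsewhere in the repository.
class Array
  # sum of pairwise products with another equal-length array
  def dot_product other
    zip(other).map { |a, b| a * b }.sum
  end

  # Euclidean length of the vector
  def magnitude
    Math.sqrt(map { |x| x**2 }.sum)
  end
end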
def weighted_majority_vote neighbors
w = [neighbors.select{|n| n[1] == 0}.collect{|n| n[2]}, neighbors.select{|n| n[1] == 1}.collect{|n| n[2]}]
weights_sum = neighbors.collect{|n| n[2]}.sum.to_f
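The diff context ends before weighted_majority_vote finishes; the visible lines group the neighbors' similarities by class (index 0 = inactive, 1 = active) and sum all similarities. One plausible way such a similarity-weighted vote could conclude is sketched below, purely as an illustration and not as the file's actual continuation:

# Hypothetical continuation, for illustration only.
probabilities = w.collect { |weights| weights.sum / weights_sum }
classification = probabilities[1] > probabilities[0] ? 1 : 0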