# lib/model.rb

require "fileutils" # Model#crossvalidation creates fold directories with FileUtils.mkdir_p

# Base class for models stored as a directory of plain-text files.
#
# The model directory must contain:
# * +dependent_variables+       - one value per line
# * +dependent_variable_type+   - "binary" (values must be 0/1) or "numeric"
# * +independent_variables+     - one comma-separated line per instance; the
#   first field is kept in @smiles, the remaining fields are the variables
# * +independent_variable_type+ - "numeric" converts the variables to Float,
#   any other value leaves them as strings
# * +similarity_thresholds+     - one Float per line
#
# Subclasses must implement +predict_fold(independent_variable_file)+, which is
# called by #crossvalidation.
class Model

  # Loads the model data from +dir+.
  #
  # Aborts the process if the dependent variable type is "binary" and the
  # values are not exactly the set {0, 1}.
  #
  # @param dir [String] path of the model directory
  def initialize(dir)
    @dir = dir
    @dependent_variables = File.readlines(File.join(@dir, "dependent_variables")).collect { |v| v.chomp }
    @dependent_variable_type = File.read(File.join(@dir, "dependent_variable_type")).chomp
    if @dependent_variable_type == "binary"
      abort "Incorrect dependent variable values '#{@dependent_variables.uniq.sort.join(",")}' for #{@dependent_variable_type} values" unless @dependent_variables.uniq.sort == ["0","1"]
      @dependent_variables = @dependent_variables.collect { |v| v.to_i }
    elsif @dependent_variable_type == "numeric"
      # TODO: check for floats
      @dependent_variables = @dependent_variables.collect { |v| v.to_f }
    end
    @independent_variable_type = File.read(File.join(@dir, "independent_variable_type")).chomp
    @independent_variables = []
    @smiles = []
    File.readlines(File.join(@dir, "independent_variables")).each do |line|
      smiles, *values = line.chomp.split(",")
      values.collect! { |v| v.to_f } if @independent_variable_type == "numeric"
      @smiles << smiles
      @independent_variables << values
    end
    @similarity_thresholds = File.readlines(File.join(@dir, "similarity_thresholds")).collect { |v| v.chomp.to_f }
  end

  # Runs a crossvalidation over randomly shuffled instances.
  #
  # For every fold the training and test data are written to
  # "<dir>/crossvalidation/<fold>/train" and ".../test", a new model of the
  # same class is loaded from the training directory, and its +predict_fold+
  # is called with the test set's "independent_variables" file. Progress and
  # timings are printed to stdout.
  #
  # @param folds [Integer] number of folds
  # @return [nil]
  def crossvalidation(folds = 10)
    start_time = Time.now
    nr_instances = @independent_variables.size
    indices = (0...nr_instances).to_a.shuffle
    base_size = nr_instances / folds
    remainder = nr_instances % folds
    start = 0
    0.upto(folds - 1) do |fold|
      fold_start_time = Time.now
      print "Fold #{fold}: "
      # split train data: the first `remainder` folds get one extra instance,
      # so every instance ends up in exactly one test set
      test_size = remainder > fold ? base_size + 1 : base_size
      test_idxs = indices[start, test_size] || []
      start += test_size
      subsets = {
        :test => test_idxs,
        :train => indices - test_idxs
      }
      # write training/test data
      cv_dir = File.join(@dir, "crossvalidation", fold.to_s)
      dirs = {}
      subsets.each do |subset, subset_idxs|
        subset_dir = File.join(cv_dir, subset.to_s)
        dirs[subset] = subset_dir
        FileUtils.mkdir_p subset_dir
        File.open(File.join(subset_dir, "independent_variables"), "w+") do |f|
          subset_idxs.each do |idx|
            f.puts "#{@smiles[idx]},#{@independent_variables[idx].join(",")}"
          end
        end
        File.open(File.join(subset_dir, "dependent_variables"), "w+") do |f|
          f.puts subset_idxs.collect { |idx| @dependent_variables[idx] }.join("\n")
        end
        # only the training directory is loaded as a model, so only it needs the metadata files
        next unless subset == :train
        File.open(File.join(subset_dir, "dependent_variable_type"), "w+") { |f| f.puts @dependent_variable_type }
        File.open(File.join(subset_dir, "independent_variable_type"), "w+") { |f| f.puts @independent_variable_type }
        File.open(File.join(subset_dir, "similarity_thresholds"), "w+") { |f| f.puts @similarity_thresholds.join("\n") }
      end
      # predict
      train_model = self.class.new dirs[:train]
      train_model.predict_fold File.join(dirs[:test], "independent_variables")
      puts Time.now - fold_start_time
    end
    puts "Total: #{Time.now - start_time}"
  end
end

# Similarity-weighted nearest-neighbour classifier for 0/1 dependent variables.
#
# Uses the data loaded by Model#initialize (@smiles, @independent_variables,
# @dependent_variables, @independent_variable_type, @similarity_thresholds) and
# the Similarity and Compound helpers, which are defined outside this file.
class ClassificationModel < Model

  # Predicts every instance in +independent_variable_file+ and writes one
  # comma-separated prediction per line to a file named "classification" in
  # the same directory.
  #
  # @param independent_variable_file [String] path; each line is
  #   "<smiles>,<variable>,<variable>,..."
  # @return [Array<Array>] the predictions that were written
  def predict_fold(independent_variable_file)
    pred_dir = File.dirname independent_variable_file
    predictions = File.readlines(independent_variable_file).collect do |line|
      variables = line.chomp.split(",")
      smiles = variables.shift
      variables = variables.collect { |v| v.to_f } if @independent_variable_type == "numeric"
      predict(smiles, variables)
    end
    File.open(File.join(pred_dir, "classification"), "w+") do |f|
      predictions.each { |prediction| f.puts prediction.join(",") }
    end
  end

  # Predicts every line of +independent_variable_file+ and prints each
  # prediction as a comma-separated line on stdout.
  #
  # Unlike #predict_fold, lines contain only variables (no leading smiles
  # field); the prediction is made with an empty smiles string.
  #
  # @param independent_variable_file [String] path of the input file
  def predict_file(independent_variable_file)
    File.readlines(independent_variable_file).each do |line|
      variables = line.chomp.split(",")
      variables = variables.collect { |v| v.to_f } if @independent_variable_type == "numeric"
      puts predict("", variables).join(",")
    end
  end

  # Predicts a single compound given as a SMILES string.
  #
  # @param smiles [String]
  # @return [Array] see #predict
  def predict_smiles(smiles)
    compound = Compound.from_smiles(smiles)
    predict compound.smiles, compound.fingerprint
  end

  # Classifies one instance from its most similar training instances.
  #
  # Neighbours are the training instances whose similarity exceeds
  # @similarity_thresholds[1]; if fewer than two are found, the lower
  # @similarity_thresholds[0] is used instead. Training instances with the
  # same smiles as the query are then excluded.
  #
  # NOTE(review): identical compounds are removed after the threshold
  # fallback has been decided, so a query that only passes the high threshold
  # thanks to its own training entry is not retried with the lower threshold.
  # Confirm that this is intended.
  #
  # @param smiles [String] identifier of the query instance
  # @param variables [Array] independent variables of the query instance
  # @return [Array] [smiles, experimental value (nil if smiles is not in the
  #   training data), classification, probability of 0, probability of 1,
  #   maximum similarity, number of neighbours]; classification and both
  #   probabilities are nil when fewer than two neighbours remain
  def predict(smiles, variables)
    similarities =
      case @independent_variable_type
      when "binary"
        @independent_variables.collect { |row| Similarity.tanimoto([row, variables]) }
      when "numeric"
        @independent_variables.collect { |row| Similarity.cosine([row, variables]) }
      else
        [] # unknown variable type: no similarities can be computed
      end

    neighbor_idx = similarities.each_index.select { |i| similarities[i] > @similarity_thresholds[1] }
    if neighbor_idx.size < 2 # lower similarity threshold
      neighbor_idx = similarities.each_index.select { |i| similarities[i] > @similarity_thresholds[0] }
    end
    neighbor_idx.reject! { |i| @smiles[i] == smiles } # remove identical compounds

    known_idx = @smiles.index(smiles)
    experimental = @dependent_variables[known_idx] if known_idx
    return [smiles, experimental, nil, nil, nil, similarities.max, neighbor_idx.size] if neighbor_idx.size < 2

    neighbor_dependent_variables = neighbor_idx.collect { |i| @dependent_variables[i] }
    neighbor_similarities = neighbor_idx.collect { |i| similarities[i] }
    probabilities = weighted_majority_vote(neighbor_dependent_variables, neighbor_similarities)
    classification = probabilities[1] > probabilities[0] ? 1 : 0

    [smiles, experimental, classification] + probabilities + [neighbor_similarities.max, neighbor_idx.size]
  end

  # Weighted majority vote
  #
  # Each class receives its share of the total weight, scaled by the largest
  # weight. When the weights are empty or sum to zero there is nothing to
  # distribute and [0.0, 0.0] is returned instead of NaN.
  #
  # @param [Array<0,1>] dependent_variables
  # @param [Array<Float>] weights
  # @return [Array] probabilities for class 0 and class 1, in that order
  def weighted_majority_vote(dependent_variables, weights)
    weights_sum = weights.sum.to_f
    return [0.0, 0.0] if weights_sum.zero?

    weights_max = weights.max.to_f
    [0, 1].collect do |label|
      label_weights = weights.each_index.select { |i| dependent_variables[i] == label }.collect { |i| weights[i] }
      weights_max * label_weights.sum / weights_sum
    end
  end
end