lib/model.rb


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129

# Base class for models whose data lives in plain-text files inside one directory.
#
# Files read from the model directory:
# * dependent_variables       - one value per line
# * dependent_variable_type   - "binary" (values become Integers) or "numeric" (values become Floats)
# * independent_variable_type - "numeric" converts every feature to Float; any other value keeps the raw strings
# * independent_variables     - one comma-separated line per instance; the first field is stored in @smiles
# * similarity_thresholds     - one Float per line
#
# Subclasses have to provide +predict_file+, which #crossvalidation calls on the per-fold models.
class Model

  # Loads all model data from +dir+.
  #
  # @param dir [String] directory containing the files listed in the class comment
  # Aborts the process if the type is "binary" and the values are not exactly the set {"0","1"}.
  def initialize(dir)
    @dir = dir
    @dependent_variables = File.readlines(File.join(@dir, "dependent_variables")).map(&:chomp)
    @dependent_variable_type = File.read(File.join(@dir, "dependent_variable_type")).chomp
    case @dependent_variable_type
    when "binary"
      classes = @dependent_variables.uniq.sort
      abort "Incorrect dependent variable values '#{classes.join(",")}' for #{@dependent_variable_type} values" unless classes == ["0", "1"]
      @dependent_variables = @dependent_variables.map(&:to_i)
    when "numeric"
      # TODO check for floats
      @dependent_variables = @dependent_variables.map(&:to_f)
    end
    @independent_variable_type = File.read(File.join(@dir, "independent_variable_type")).chomp
    @independent_variables = []
    @smiles = []
    File.readlines(File.join(@dir, "independent_variables")).each do |line|
      items = line.chomp.split(",")
      @smiles << items.shift # first column identifies the instance
      items.map!(&:to_f) if @independent_variable_type == "numeric"
      @independent_variables << items
    end
    @similarity_thresholds = File.readlines(File.join(@dir, "similarity_thresholds")).map { |v| v.chomp.to_f }
  end

  # Runs a crossvalidation over randomly shuffled instances.
  #
  # For every fold the test and training data are written to
  # <dir>/crossvalidation/<fold>/{test,train}; a new model of the receiver's class
  # is then loaded from the train directory and its +predict_file+ is called with
  # the test fold's "independent_variables" file. Timings are printed to stdout.
  #
  # @param folds [Integer] number of folds, must be >= 1
  # @raise [ArgumentError] if +folds+ is not a positive Integer
  def crossvalidation(folds = 10)
    unless folds.is_a?(Integer) && folds.positive?
      raise ArgumentError, "folds must be a positive integer, got #{folds.inspect}"
    end

    started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
    nr_instances = @independent_variables.size
    indices = (0...nr_instances).to_a.shuffle
    # The first +remainder+ folds receive one extra instance so that all instances are used.
    fold_size, remainder = nr_instances.divmod(folds)
    start = 0
    folds.times do |fold|
      fold_started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
      print "Fold #{fold}: "
      # split train data
      size = fold < remainder ? fold_size + 1 : fold_size
      test_idxs = indices[start, size] || []
      splits = {
        :test => test_idxs,
        :train => indices - test_idxs
      }
      start += size
      # write training/test data
      cv_dir = File.join(@dir, "crossvalidation", fold.to_s)
      dirs = {}
      splits.each do |split, split_idxs|
        split_dir = File.join(cv_dir, split.to_s)
        dirs[split] = split_dir
        FileUtils.mkdir_p split_dir
        File.open(File.join(split_dir, "independent_variables"), "w+") do |f|
          split_idxs.each do |idx|
            f.print "#{@smiles[idx]},"
            f.puts @independent_variables[idx].join(",")
          end
        end
        File.open(File.join(split_dir, "dependent_variables"), "w+") do |f|
          f.puts split_idxs.map { |idx| @dependent_variables[idx] }.join("\n")
        end
        # Only the train directory is loaded as a model, so only it gets the metadata files.
        next unless split == :train

        File.open(File.join(split_dir, "dependent_variable_type"), "w+") { |f| f.puts @dependent_variable_type }
        File.open(File.join(split_dir, "independent_variable_type"), "w+") { |f| f.puts @independent_variable_type }
        File.open(File.join(split_dir, "similarity_thresholds"), "w+") { |f| f.puts @similarity_thresholds.join("\n") }
      end
      # predict
      train_model = self.class.new dirs[:train]
      train_model.predict_file File.join(dirs[:test], "independent_variables")
      puts Process.clock_gettime(Process::CLOCK_MONOTONIC) - fold_started
    end
    puts "Total: #{Process.clock_gettime(Process::CLOCK_MONOTONIC) - started}"
  end
end

# Similarity-based binary classifier: a query is classified by a
# similarity-weighted vote of the training instances that exceed a similarity threshold.
class ClassificationModel < Model

  # Predicts every line of +independent_variable_file+ and writes the results,
  # one comma-separated prediction per line, to a file named "classification"
  # in the same directory as the input file.
  #
  # @param independent_variable_file [String] path to a file with one
  #   "smiles,var1,var2,..." line per query
  def predict_file(independent_variable_file)
    pred_dir = File.dirname independent_variable_file
    predictions = File.readlines(independent_variable_file).map do |line|
      variables = line.chomp.split(",")
      smiles = variables.shift
      variables = variables.map(&:to_f) if @independent_variable_type == "numeric"
      predict(smiles, variables)
    end
    File.open(File.join(pred_dir, "classification"), "w+") do |f|
      predictions.each { |prediction| f.puts prediction.join(",") }
    end
  end

  # TODO: with neighbors
  def predict_smiles(smiles)
  end

  # Predicts the class of a single query.
  #
  # @param smiles [String] identifier of the query; training instances with the same identifier are not used as neighbors
  # @param variables [Array] independent variables of the query
  # @return [Array] [smiles, classification, probability of 0, probability of 1,
  #   maximum similarity, number of neighbors]; classification and both
  #   probabilities are nil when fewer than two neighbors remain
  def predict(smiles, variables)
    # The variable type is the same for every row, so choose the measure once.
    similarities =
      case @independent_variable_type
      when "binary"  then @independent_variables.map { |row| Similarity.tanimoto([row, variables]) }
      when "numeric" then @independent_variables.map { |row| Similarity.cosine([row, variables]) }
      else []
      end

    above = ->(threshold) { similarities.each_index.select { |i| similarities[i] > threshold } }
    neighbor_idx = above.call(@similarity_thresholds[1])
    neighbor_idx = above.call(@similarity_thresholds[0]) if neighbor_idx.size < 2 # lower similarity threshold
    # NOTE(review): identical compounds are removed after the threshold fallback, so a query whose
    # only high-threshold neighbors are identical compounds gets no prediction - confirm this is intended.
    neighbor_idx.select! { |i| @smiles[i] != smiles } # remove identical compounds
    return [smiles, nil, nil, nil, similarities.max, neighbor_idx.size] if neighbor_idx.size < 2

    neighbor_dependent_variables = neighbor_idx.map { |i| @dependent_variables[i] }
    neighbor_weights = neighbor_idx.map { |i| similarities[i] }
    probabilities = weighted_majority_vote(neighbor_dependent_variables, neighbor_weights)
    classification = probabilities[1] > probabilities[0] ? 1 : 0 # ties are classified as 0

    [smiles, classification] + probabilities + [similarities.max, neighbor_idx.size]
  end

  # Weighted majority vote
  #
  # Each class receives the share of the total weight held by its members,
  # scaled by the largest weight.
  #
  # @param [Array<0,1>] dependent_variables
  # @param [Array<Float>] weights
  # @return [Array] probabilities for class 0 and class 1; [0.0, 0.0] when the
  #   weights are empty or sum to zero (instead of NaN from a division by zero)
  def weighted_majority_vote(dependent_variables, weights)
    weights_sum = weights.sum.to_f
    return [0.0, 0.0] if weights_sum.zero?

    weights_max = weights.max.to_f
    [0, 1].map do |label|
      class_weights = weights.each_index.select { |i| dependent_variables[i] == label }.map { |i| weights[i] }
      weights_max * class_weights.sum / weights_sum
    end
  end
end