#!/usr/bin/env ruby
# bin/lazar
require 'fileutils'
require 'optparse'
require_relative '../lib/lazar'

# Called without any arguments: behave as if help had been requested.
ARGV.push('-h') if ARGV.empty?

# Defaults that the command line switches may override.
# NOTE(review): :thresholds is not read anywhere in the visible script.
options = { folds: 10, thresholds: [0.5, 0.2] }

# Command line definition. Every handler stores its value in the `options`
# hash; `parse!` removes the recognised switches from ARGV.
OptionParser.new do |opts|
  opts.banner = "Usage: lazar -t TRAIN -x|-p descriptors [options]"
  opts.on('-h', '--help', 'Display this screen') do
    puts opts
    exit
  end
  # The long form needs two dashes: written as '-train' it is read as another
  # spelling of the short switch -t, and --train is rejected as unknown.
  opts.on('-t TRAIN', '--train TRAIN', "Training data in csv format (required). Type 'lazar -f' for format specifications.") do |train|
    options[:train] = train
  end
  opts.on('-p descriptors', '--predict descriptors', "Prediction data in csv format. Type 'lazar -f' for format specifications.") do |descriptors|
    options[:predict] = descriptors
  end
  opts.on('-x', '--crossvalidation', "Run crossvalidation.") do
    options[:cv] = true
  end
  # Folds use -n. The short switch -f belongs to --formats (the help texts
  # above point users to 'lazar -f'); when a short switch is registered twice
  # only the later definition answers, so '-f 5' never reached this handler.
  opts.on('-n folds', '--folds folds', Integer, "Change crossvalidation folds (default: #{options[:folds]}).") do |folds|
    options[:folds] = folds
  end
  opts.on('-f', '--formats', "Describe input and output formats") do
    raise OptionParser::InvalidArgument, "Format description not yet implemented."
  end
#  opts.on( '-d', '--daemon', "Run as daemon in background" ) do |f|
#    raise OptionParser::InvalidArgument, "Daemon mode not yet implemented"
#  end
end.parse!

# Validate the parsed options before any model is built. Every failure is
# reported as an OptionParser error that points the user to 'lazar -h'.
# File.exist? is used because File.exists? no longer exists in Ruby 3.2+.
raise OptionParser::MissingArgument, "Training data is required. Type 'lazar -h' for help." if options[:train].nil?
raise OptionParser::InvalidArgument, "Training data file #{options[:train]} does not exist. Type 'lazar -h' for help." unless File.exist?(options[:train])
raise OptionParser::InvalidOption, "Choose either --predict or --crossvalidation. Type 'lazar -h' for help." if options[:predict] && options[:cv]
raise OptionParser::InvalidOption, "One of the --predict or --crossvalidation options is required. Type 'lazar -h' for help." unless options[:predict] || options[:cv]
raise OptionParser::InvalidArgument, "Prediction descriptor file #{options[:predict]} does not exist. Type 'lazar -h' for help." if options[:predict] && !File.exist?(options[:predict])
# The fold count is used as a divisor when the training data is split, so a
# value of zero would otherwise surface later as a ZeroDivisionError.
raise OptionParser::InvalidArgument, "Number of crossvalidation folds must be at least 1. Type 'lazar -h' for help." if options[:cv] && options[:folds] < 1

# Writes the 2x2 confusion matrix held in +counts+ (a Hash with the keys
# "TP", "TN", "FP" and "FN") to +path+: first row "TP,FP", second row "FN,TN".
def write_confusion_matrix(path, counts)
  File.open(path, "w+") do |f|
    f.puts "#{counts['TP']},#{counts['FP']}\n#{counts['FN']},#{counts['TN']}"
  end
end

# Writes accuracy, true positive/negative rate and positive/negative
# predictive value for +counts+ (same Hash layout as above) to +path+, one
# "name,value" line each. The divisions are floating point, so an empty
# denominator is written as NaN instead of raising.
def write_summary(path, counts)
  tp, tn, fp, fn = counts.values_at("TP", "TN", "FP", "FN")
  File.open(path, "w+") do |f|
    f.puts "accuracy,#{(tp + tn) / (tp + fp + tn + fn).to_f}"
    f.puts "true_positive_rate,#{tp / (tp + fn).to_f}"
    f.puts "true_negative_rate,#{tn / (tn + fp).to_f}"
    f.puts "positive_predictive_value,#{tp / (tp + fp).to_f}"
    f.puts "negative_predictive_value,#{tn / (tn + fn).to_f}"
  end
end

model = Model.new options[:train]

if options[:predict] # batch predictions
  model.predict options[:predict]

elsif options[:cv] # crossvalidation

  # create folds: one directory per fold below <training data dir>/crossvalidation
  cv_dir = File.join(File.dirname(options[:train]), "crossvalidation")
  nr_folds = options[:folds]
  folds = (0...nr_folds).map { |nr| File.join(cv_dir, nr.to_s) }
  nr_instances = model.train.size
  indices = (0...nr_instances).to_a.shuffle
  # Every fold receives base_size test instances; the first `remainder`
  # folds receive one more, so that all instances are used exactly once.
  base_size = nr_instances / nr_folds
  remainder = nr_instances % nr_folds
  start = 0

  folds.each_with_index do |fold, nr|

    # split train data
    puts "Creating fold #{nr}"
    size = nr < remainder ? base_size + 1 : base_size
    test_idxs = indices[start, size] || []
    train_idxs = indices - test_idxs
    start += size

    # write training/test data
    # FileUtils instead of a shell `mkdir -p`: paths containing spaces or
    # shell metacharacters are created as given.
    FileUtils.mkdir_p fold

    File.open(File.join(fold, "train.csv"), "w+") do |f|
      f.puts((["Canonical SMILES", model.dependent_variable_name] + model.independent_variable_names).join(","))
      train_idxs.each { |idx| f.puts model.train[idx].join(",") }
    end

    File.open(File.join(fold, "test.csv"), "w+") do |f|
      f.puts((["Canonical SMILES"] + model.independent_variable_names).join(","))
      test_idxs.each do |idx|
        row = model.train[idx].clone # keep model.train intact
        row.delete_at(1) # drop the second column (the dependent variable in the train.csv header)
        f.puts row.join(",")
      end
    end
  end

  # crossvalidation predictions, one child process per fold
  started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  # Flush before forking: otherwise lines still sitting in the stdout buffer
  # are inherited by every child and can be printed again when it exits.
  $stdout.flush
  folds.each do |fold|
    fork do
      puts "Crossvalidation #{fold} started"
      fold_model = Model.new File.join(fold, "train.csv")
      fold_model.predict File.join(fold, "test.csv")
    end
  end
  # A child that died must not go unnoticed: its prediction file would be
  # missing, or left over from an earlier run.
  failed = Process.waitall.reject { |_pid, status| status.success? }
  abort "#{failed.size} crossvalidation fold(s) failed, no summaries written." unless failed.empty?
  puts "Crossvalidation: #{(Process.clock_gettime(Process::CLOCK_MONOTONIC) - started) / 60} min"

  # crossvalidation summaries

  # [predicted, measured] => label
  outcome_labels = {
    [1, 1] => "TP",
    [0, 0] => "TN",
    [1, 0] => "FP",
    [0, 1] => "FN"
  }
  all = { "TP" => 0, "TN" => 0, "FP" => 0, "FN" => 0 }
  high_confidence = { "TP" => 0, "TN" => 0, "FP" => 0, "FN" => 0 }
  # A prediction is counted as high confidence when its similarity value
  # exceeds the largest entry of model.minsim.
  hc_threshold = model.minsim.max

  File.open(File.join(cv_dir, "predictions.csv"), "w+") do |f|
    folds.each do |fold|
      rows = File.readlines(File.join(fold, "test-prediction.csv")).map { |line| line.chomp.split(",") }
      rows.shift # skip the first row (presumably a header - confirm against Model#predict)
      rows.each do |prediction|
        # NOTE(review): column meanings (0 = SMILES, 2 = predicted class,
        # 5 = maximum similarity) are taken from how the values are used
        # here; confirm against the file written by Model#predict.
        smi = prediction[0]
        measured = model.train.select { |row| row[0] == smi }.map { |row| row[1].to_i }
        maxsim = prediction[5].to_f
        label = "NA"
        # split(",") drops trailing empty fields, so the prediction column
        # may be missing altogether (nil) instead of being an empty string.
        predicted_field = prediction[2].to_s
        unless predicted_field.empty? || measured.empty?
          predicted = predicted_field.to_i
          # one count per measurement of this compound in the training data
          measured.each do |value|
            outcome = outcome_labels[[predicted, value]]
            next unless outcome
            label = outcome
            all[outcome] += 1
            high_confidence[outcome] += 1 if maxsim > hc_threshold
          end
        end
        f.puts([smi, label, maxsim].join(","))
      end
    end
  end

  write_confusion_matrix File.join(cv_dir, "confusion-matrix-all.csv"), all
  write_confusion_matrix File.join(cv_dir, "confusion-matrix-high-confidence.csv"), high_confidence
  write_summary File.join(cv_dir, "summary-all.csv"), all
  write_summary File.join(cv_dir, "summary-high-confidence.csv"), high_confidence

end