require "fileutils"

# A dataset loaded from a directory of plain-text files, plus a generic
# crossvalidation driver. Subclasses must provide +predict_file+.
#
# Files read from the directory:
# * +dependent_variables+        - one value per line
# * +dependent_variable_type+    - "binary" or "numeric"
# * +independent_variables+      - comma-separated rows; the first field is
#   stored in +@smiles+, the remaining fields are the variables of that row
# * +independent_variable_type+  - "numeric" rows are converted to floats
# * +similarity_thresholds+      - one float per line
class Model
  # Loads all model files from +dir+.
  #
  # Aborts the process when the dependent variable type is "binary" and the
  # distinct values found are not exactly "0" and "1".
  #
  # @param [String] dir directory containing the model files
  def initialize(dir)
    @dir = dir
    @dependent_variables = File.readlines(File.join(@dir, "dependent_variables")).map(&:chomp)
    @dependent_variable_type = File.read(File.join(@dir, "dependent_variable_type")).chomp

    case @dependent_variable_type
    when "binary"
      observed = @dependent_variables.uniq.sort
      abort "Incorrect dependent variable values '#{observed.join(",")}' for #{@dependent_variable_type} values" unless observed == ["0", "1"]
      @dependent_variables = @dependent_variables.map(&:to_i)
    when "numeric"
      # TODO check for floats
      @dependent_variables = @dependent_variables.map(&:to_f)
    end

    @independent_variable_type = File.read(File.join(@dir, "independent_variable_type")).chomp
    @independent_variables = []
    @smiles = []
    File.readlines(File.join(@dir, "independent_variables")).each do |line|
      items = line.chomp.split(",")
      @smiles << items.shift
      items.map!(&:to_f) if @independent_variable_type == "numeric"
      @independent_variables << items
    end

    @similarity_thresholds = File.readlines(File.join(@dir, "similarity_thresholds")).map { |v| v.chomp.to_f }
  end

  # Runs a crossvalidation over randomly shuffled instances.
  #
  # For every fold a +crossvalidation/<fold>/train+ and
  # +crossvalidation/<fold>/test+ directory is written below the model
  # directory, a new model of the same class is built from the training
  # directory and its +predict_file+ is called on the test split.
  # Progress and timings are printed to standard output.
  #
  # @param [Integer] folds number of folds
  def crossvalidation(folds = 10)
    start_time = Time.now
    nr_instances = @independent_variables.size
    indices = (0...nr_instances).to_a.shuffle
    fold_size = nr_instances / folds
    remainder = nr_instances % folds
    start = 0

    folds.times do |fold|
      fold_start_time = Time.now
      print "Fold #{fold}: "

      # split train data: the first `remainder` folds get one extra instance
      last = start + fold_size
      last -= 1 unless remainder > fold
      test_idxs = indices[start..last] || []
      splits = {
        :test => test_idxs,
        :train => indices - test_idxs
      }
      start = last + 1

      # write training/test data
      cv_dir = File.join(@dir, "crossvalidation", fold.to_s)
      dirs = {}
      splits.each do |split, split_idxs|
        dirs[split] = File.join(cv_dir, split.to_s)
        write_cv_split(dirs[split], split, split_idxs)
      end

      # predict
      train_model = self.class.new(dirs[:train])
      train_model.predict_file File.join(dirs[:test], "independent_variables")
      puts Time.now - fold_start_time
    end

    puts "Total: #{Time.now - start_time}"
  end

  private

  # Writes the files of one crossvalidation split into +dir+.
  #
  # Variables are always written; the type and threshold files are only
  # written for the :train split, because only that directory is loaded as a
  # model again.
  #
  # @param [String] dir target directory (created if missing)
  # @param [Symbol] split :train or :test
  # @param [Array<Integer>] split_idxs instance indices belonging to the split
  def write_cv_split(dir, split, split_idxs)
    FileUtils.mkdir_p dir

    File.open(File.join(dir, "independent_variables"), "w+") do |f|
      split_idxs.each do |n|
        f.puts "#{@smiles[n]},#{@independent_variables[n].join(",")}"
      end
    end
    File.open(File.join(dir, "dependent_variables"), "w+") do |f|
      f.puts split_idxs.map { |n| @dependent_variables[n] }.join("\n")
    end

    return unless split == :train

    File.open(File.join(dir, "dependent_variable_type"), "w+") { |f| f.puts @dependent_variable_type }
    File.open(File.join(dir, "independent_variable_type"), "w+") { |f| f.puts @independent_variable_type }
    File.open(File.join(dir, "similarity_thresholds"), "w+") { |f| f.puts @similarity_thresholds.join("\n") }
  end
end

# Similarity-weighted nearest-neighbour classifier for 0/1 dependent variables.
class ClassificationModel < Model
  # Predicts every row of +independent_variable_file+ and writes the results,
  # one comma-separated line per row, to a file named "classification" in the
  # same directory as the input file.
  #
  # @param [String] independent_variable_file rows in the same format as the
  #   model's "independent_variables" file
  def predict_file(independent_variable_file)
    pred_dir = File.dirname(independent_variable_file)
    predictions = File.readlines(independent_variable_file).map do |line|
      variables = line.chomp.split(",")
      smiles = variables.shift
      variables = variables.map(&:to_f) if @independent_variable_type == "numeric"
      predict(smiles, variables)
    end

    File.open(File.join(pred_dir, "classification"), "w+") do |f|
      predictions.each { |prediction| f.puts prediction.join(",") }
    end
  end

  # TODO: with neighbors
  def predict_smiles(smiles)
  end

  # Predicts a single instance from its neighbours in the training data.
  #
  # Neighbours are the training rows whose similarity exceeds
  # +@similarity_thresholds[1]+; if fewer than two are found,
  # +@similarity_thresholds[0]+ is used instead. Training rows with the same
  # smiles as the query are then dropped.
  #
  # @param [String] smiles identifier of the query instance
  # @param [Array] variables independent variables of the query instance
  # @return [Array] [smiles, classification, probability of 0,
  #   probability of 1, maximum similarity, number of neighbours];
  #   classification and both probabilities are collapsed into three nils
  #   when fewer than two neighbours remain
  def predict(smiles, variables)
    similarities = []
    @independent_variables.each do |row|
      case @independent_variable_type
      when "binary"
        similarities << Similarity.tanimoto([row, variables])
      when "numeric"
        similarities << Similarity.cosine([row, variables])
      end
    end

    neighbor_idx = similarities.each_index.select { |i| similarities[i] > @similarity_thresholds[1] }
    if neighbor_idx.size < 2 # lower similarity threshold
      neighbor_idx = similarities.each_index.select { |i| similarities[i] > @similarity_thresholds[0] }
    end
    # NOTE(review): identical compounds are removed only after the threshold
    # fallback has been decided, so a query with exactly two high-threshold
    # neighbours, one of them identical, is not retried with the lower
    # threshold - confirm this is intended.
    neighbor_idx.reject! { |i| @smiles[i] == smiles } # remove identical compounds
    return [smiles, nil, nil, nil, similarities.max, neighbor_idx.size] if neighbor_idx.size < 2

    neighbor_dependent_variables = neighbor_idx.map { |i| @dependent_variables[i] }
    neighbor_weights = neighbor_idx.map { |i| similarities[i] }
    probabilities = weighted_majority_vote(neighbor_dependent_variables, neighbor_weights)
    classification = probabilities[1] > probabilities[0] ? 1 : 0

    [smiles, classification] + probabilities + [similarities.max, neighbor_idx.size]
  end

  # Weighted majority vote
  #
  # Each class receives the share of the total weight held by its members,
  # scaled by the largest weight. When the weights are empty or sum to zero
  # there is nothing to vote with and both probabilities are 0.0 (instead of
  # the NaN a division by zero would produce).
  #
  # @param [Array<0,1>] dependent_variables
  # @param [Array] weights
  # @return [Array] probabilities
  def weighted_majority_vote(dependent_variables, weights)
    weights_sum = weights.sum.to_f
    return [0.0, 0.0] if weights_sum.zero?

    weights_max = weights.max.to_f
    [0, 1].map do |label|
      label_weights = weights.each_index.select { |i| dependent_variables[i] == label }.map { |i| weights[i] }
      weights_max * label_weights.sum / weights_sum
    end
  end
end