# Rserve is only referenced by the disabled correlation analysis at the bottom
# of this file, so a missing gem must not prevent the prediction code from
# loading. When the gem is installed it is loaded exactly as before.
begin
  require 'rserve'
rescue LoadError
  nil
end
require 'json'
require 'yaml'
require 'csv'

# Name of the toxicity endpoint that is predicted; also the key used in the
# "tox" section of data.json.
ENDPOINT = "Cell.association (Net cell association [mL/ug(Mg)])".freeze

# Returns the parsed content of ./relevant-features.json.
#
# @return [Hash] feature name => feature description (as stored in the file)
def query_features
  JSON.parse(File.read("./relevant-features.json"))
end

# Predicts ENDPOINT for a query from similar entries in ./data.json.
#
# The derived features listed in ./diff-features.json are added to +params+
# (the hash is modified in place and returned as :query). Every entry of
# ./data.json is then compared with the query by weighted cosine similarity
# over the features of ./relevant-features.json, using the absolute value of
# each feature's "r" entry as weight:
#
# * similarity > 0.9999: the entry is reported as :match and does not
#   contribute to the prediction
# * similarity > 0.95: the entry becomes a neighbor
#
# The prediction is 10 to the power of the similarity weighted mean of the
# neighbors' log10 endpoint values, or nil if there are no neighbors.
#
# @param params [Hash] feature name => numeric value of the query
# @return [Hash] with keys :query, :match, :prediction and :neighbors
#   (neighbors are sorted by descending similarity)
def predict(params)
  # calculate difference parameters
  diff_features = JSON.parse(File.read("./diff-features.json"))
  diff_features.each do |feature, originals|
    params[feature] = params[originals[1]] - params[originals[0]] # causes rounding errors!
  end

  relevant_features = JSON.parse(File.read("./relevant-features.json"))
  feature_names = relevant_features.keys
  weights = relevant_features.values.collect { |v| v["r"] }
  # the query vector is the same for every candidate, build it only once
  query_values = feature_names.collect { |f| params[f] }

  neighbors = []
  sim_sum = 0
  weighted_sum = 0
  match = nil

  JSON.parse(File.read("./data.json")).each do |id, categories|
    neighbor_values = feature_names.collect { |f| categories["physchem"][f] }
    sim = weighted_cosine_similarity(query_values, neighbor_values, weights)
    if sim > 0.9999 # no exact match because of rounding errors
      match = { id => categories }
    elsif sim > 0.95
      neighbor = categories
      neighbor["similarity"] = sim
      neighbor["sim"] = cosine_similarity(query_values, neighbor_values)
      neighbor["id"] = id
      sim_sum += sim
      weighted_sum += sim * Math.log10(categories["tox"][ENDPOINT])
      #weighted_sum += sim*categories["tox"][ENDPOINT]
      neighbors << neighbor
    end
  end

  neighbors.sort! { |x, y| y["similarity"] <=> x["similarity"] }
  prediction = sim_sum.zero? ? nil : 10**(weighted_sum / sim_sum)

  {
    :query => params,
    :match => match,
    :prediction => { ENDPOINT => prediction },
    :neighbors => neighbors
  }
end

class Object
  # Tells whether Kernel#Float can convert the receiver.
  #
  # @return [Boolean] true if Float(self) succeeds, false if it raises
  def numeric?
    Float(self)
    true
  rescue StandardError
    false
  end
end

# Euclidean distance between two vectors of equal length.
#
# @param a [Array<Numeric>]
# @param b [Array<Numeric>]
# @return [Float]
def euclidean_distance(a, b)
  squared_diffs = a.zip(b).map { |x, y| (x - y)**2 }
  Math.sqrt(squared_diffs.inject(0) { |total, value| total + value })
end

# Sum of the element-wise products of two vectors.
#
# @param a [Array<Numeric>]
# @param b [Array<Numeric>]
# @return [Numeric]
def dot_product(a, b)
  products = a.zip(b).map { |x, y| x * y }
  products.inject(0) { |total, value| total + value }
end

# Euclidean norm of a vector.
#
# @param point [Array<Numeric>]
# @return [Float]
def magnitude(point)
  squares = point.map { |x| x**2 }
  Math.sqrt(squares.inject(0) { |total, value| total + value })
end

# Unweighted cosine similarity of two vectors.
# http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity
#
# @param a [Array<Numeric>]
# @param b [Array<Numeric>]
# @return [Float] NaN if one of the vectors has magnitude 0
def cosine_similarity(a, b)
  dot_product(a, b) / (magnitude(a) * magnitude(b))
end

# Cosine similarity in which every dimension is scaled by the absolute value
# of its weight.
#
# @param a [Array<Numeric>]
# @param b [Array<Numeric>]
# @param w [Array<Numeric>] one weight per dimension, the sign is ignored
# @return [Float] NaN if one of the weighted magnitudes is 0
def weighted_cosine_similarity(a, b, w)
  dot = 0
  magnitude_a = 0
  magnitude_b = 0
  (0..a.size - 1).each do |i|
    weight = w[i].abs
    dot += weight * a[i] * b[i]
    magnitude_a += weight * a[i]**2
    magnitude_b += weight * b[i]**2
  end
  dot / Math.sqrt(magnitude_a * magnitude_b)
end

#@endpoint = @data.collect{|r| r[5]}

# Placeholder without implementation, always returns nil.
#
# @param query [Object] ignored
# @return [nil]
def neighbors(query)
end

# Builds the 37 column names used by csv2json from the header rows of the
# parsed spreadsheet (rows 0, 6, 8, 10 and 11 are combined per column).
#
# @param csv [Array<Array>] rows of the spreadsheet
# @return [Array<String>] names indexed by column number
def csv_feature_names(csv)
  names = [
    "ID",
    csv[0][1],
    csv[0][2],
    csv[0][3],
    csv[6][4],
    "#{csv[0][5]} (#{csv[6][5]} [#{csv[11][5]}])", # endpoint
    "#{csv[0][6]} (#{csv[6][6]})", # endpoint
    "#{csv[6][7]} [#{csv[11][7]}]",
    "#{csv[6][8]} [#{csv[11][8]}]",
    "#{csv[6][9]} [#{csv[11][9]}]",
  ]
  # columns 10..27 come in groups of three with different header layouts
  (10..10 + 5 * 3).step(3) do |i|
    names += [
      "#{csv[6][i]} [#{csv[11][i]}]",
      "#{csv[6][i + 1]} #{csv[8][i + 1]} [#{csv[11][i + 1]}]",
      "#{csv[6][i + 2]} #{csv[8][i + 2]}",
    ]
  end
  names += [
    "#{csv[6][28]}",
    "#{csv[6][29]} #{csv[8][29]}",
    "#{csv[6][30]} #{csv[8][30]}",
  ]
  (31..34).each do |i|
    names << "#{csv[6][i]} #{csv[8][i]} [#{csv[11][i]}]"
  end
  (35..36).each do |i|
    names << "#{csv[6][i]} #{csv[8][i]} #{csv[10][i]} [#{csv[11][i]}]"
  end
  names
end

# Converts data/MergedSheets_edit.csv to data.json.
#
# Only the first 37 columns are used and cells that Float() accepts are
# converted with to_f. The first 12 rows are header rows. Of the remaining
# rows only those whose ID starts with "G" are exported; rows without an ID
# or with a numeric ID are skipped as well. Per exported row columns 1-4 go
# to :composition, column 5 to :tox and columns 7-36 to :physchem (column 6
# is not exported).
#
# @return [Hash] ID => {:composition, :tox, :physchem} hashes, the same
#   structure that is written to data.json
def csv2json
  csv = CSV.read("data/MergedSheets_edit.csv")
  csv.collect! { |row| row[0..36].collect { |c| c.numeric? ? c.to_f : c } }
  feature_names = csv_feature_names(csv)

  data = {}
  csv.drop(12).each do |row|
    id = row.first
    # to_s: blank lines have no ID (nil) and numeric IDs were converted to
    # Float above, neither of them responds to match
    next unless id.to_s =~ /^G/ # skip Ag, too many missing values

    entry = data[id] = {}
    row.each_with_index do |col, i|
      if i == 0
        entry[:composition] = {}
      elsif i < 5
        entry[:composition][feature_names[i]] = col
      elsif i == 5
        entry[:tox] ||= {}
        entry[:tox][feature_names[i]] = col
      elsif i > 6
        entry[:physchem] ||= {}
        entry[:physchem][feature_names[i]] = col
      end
    end
  end

  File.open("data.json", "w+") { |f| f.puts data.to_json }
  data
end

#puts data.to_yaml
=begin
R.assign "endpoint", endpoint
(0..data[0].size).each do |c|
  if data.collect{|r| r[c]}.uniq.size > 1
    begin
      R.assign "feature", data.collect{|r| r[c]}
      R.eval "r <- cor(-log(endpoint),-log(feature),use='complete')"
      r = R.eval("r").to_ruby
      p "#{c}: #{r}" if r > 0.3 or r < -0.3
    rescue
    end
  end
end
csv[0..13].each do |row|
  row.each_with_index do |col,i|
    features[i] = features[i].to_s+", "+col.to_s
  end
end
puts features.select{|f| f.match(/Mean/)}.to_yaml
#n+=1
#p n,row.first unless row.first.match /^[G|S]/
=end