# nanoparticles.rb


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170

require 'rserve'
require 'json'
require 'yaml'
require 'csv'

# Name of the toxicity endpoint: the key predict looks up in each data
# entry's "tox" hash and the key under which it reports its prediction.
# Frozen so the shared constant cannot be mutated in place.
ENDPOINT = "Cell.association (Net cell association [mL/ug(Mg)])".freeze

# Predicts ENDPOINT for a query by similarity-weighted read-across over the
# entries stored in ./data.json.
#
# params - Hash mapping feature names to the query's values (presumably
#          numeric physchem measurements; confirm against the caller).
#
# Every entry of ./data.json is reduced to the "physchem" values of the
# features named in params, taken in the order params lists them, so that
# query and entry values are always compared feature by feature.
#   * An entry whose values equal the query values is reported as the exact
#     match (the last such entry wins) and is not used as a neighbor.
#   * Any other entry whose weighted cosine similarity to the query exceeds
#     0.95 becomes a neighbor and is annotated with "similarity" (weighted),
#     "sim" (unweighted cosine similarity) and "id".
#
# The prediction is 10 raised to the similarity-weighted mean of the
# neighbors' log10 endpoint values, or nil when there are no neighbors.
#
# Returns a Hash with :query, :match, :prediction ({ENDPOINT => value}) and
# :neighbors (sorted by descending "similarity").
def predict(params)
  relevant_features = JSON.parse(File.read("./relevant-features.json"))
  # NOTE(review): weights are taken positionally, in the order features appear
  # in relevant-features.json; this assumes params lists the same features in
  # that same order - confirm against the caller.
  weights = relevant_features.values.collect { |v| v["r"] }
  features = params.keys
  query_values = params.values

  neighbors = []
  sim_sum = 0
  weighted_sum = 0
  match = nil
  JSON.parse(File.read("./data.json")).each do |id, categories|
    physchem = categories["physchem"]
    # Align the entry's values with the query's feature order instead of
    # relying on the key order of the stored "physchem" hash.
    neighbor_values = features.select { |f| physchem.key?(f) }.collect { |f| physchem[f] }
    if query_values == neighbor_values
      match = {id => categories}
      next
    end

    sim = weighted_cosine_similarity(query_values, neighbor_values, weights)
    next unless sim > 0.95

    neighbor = categories
    neighbor["similarity"] = sim
    neighbor["sim"] = cosine_similarity(query_values, neighbor_values)
    neighbor["id"] = id
    sim_sum += sim
    # Average in log10 space; the result is transformed back below.
    weighted_sum += sim * Math.log10(categories["tox"][ENDPOINT])
    neighbors << neighbor
  end
  neighbors.sort! { |a, b| b["similarity"] <=> a["similarity"] }
  prediction = sim_sum == 0 ? nil : 10**(weighted_sum / sim_sum)
  {
    :query => params,
    :match => match,
    :prediction => {ENDPOINT => prediction},
    :neighbors => neighbors
  }
end

class Object
  # True when Kernel#Float accepts the receiver (e.g. "1.5", "1e3", 42);
  # false when the conversion raises (e.g. "abc", "", nil).
  def numeric?
    Float(self)
    true
  rescue StandardError
    false
  end
end

# Euclidean distance between two equally long numeric arrays: the square
# root of the summed squared element-wise differences.
def euclidean_distance(a, b)
  squared_total = a.zip(b).inject(0) { |total, (x, y)| total + (x - y)**2 }
  Math.sqrt(squared_total)
end

# Sum of the element-wise products of two equally long numeric arrays.
def dot_product(a, b)
  a.zip(b).inject(0) { |total, (x, y)| total + x * y }
end

# Length of a numeric vector: square root of the sum of its squared elements.
def magnitude(point)
  Math.sqrt(point.inject(0) { |total, x| total + x**2 })
end

# Cosine similarity of two numeric arrays: their dot product divided by the
# product of their magnitudes. The result is NaN when either magnitude is 0.
# http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity
def cosine_similarity(a, b)
  numerator = dot_product(a, b)
  denominator = magnitude(a) * magnitude(b)
  numerator / denominator
end

# Cosine similarity of a and b in which every dimension i contributes with
# the absolute value of its weight w[i]. Iterates over the indices of a, so
# b and w must provide at least as many elements.
def weighted_cosine_similarity(a, b, w)
  numerator = 0
  norm_a = 0
  norm_b = 0
  a.each_index do |i|
    weight = w[i].abs
    numerator += weight * a[i] * b[i]
    norm_a += weight * a[i]**2
    norm_b += weight * b[i]**2
  end
  numerator / Math.sqrt(norm_a * norm_b)
end

#@endpoint = @data.collect{|r| r[5]}

# Unimplemented placeholder: ignores query and returns nil.
def neighbors(query)
  nil
end

# Converts data/MergedSheets_edit.csv into a nested Hash, writes it as JSON
# to data.json in the current directory and returns it.
#
# Only the first 37 columns of every row are used and cells accepted by
# Object#numeric? are converted to Floats. Feature names are assembled from
# header rows 0, 6, 8, 10 and 11; data rows start at row 12. Only rows whose
# ID starts with "G" are kept. Each kept row becomes
#   id => { :composition => {...},   # columns 1-4
#           :tox         => {...},   # column 5
#           :physchem    => {...} }  # columns 7-36
# Column 6 (the second endpoint) is not stored.
#
# The returned Hash has Symbol category keys; in data.json they are strings.
def csv2json
  csv = CSV.read("data/MergedSheets_edit.csv")
  csv.collect! { |row| row[0..36].collect { |c| c.numeric? ? c.to_f : c } }
  feature_names = [
    "ID",
     csv[0][1],
     csv[0][2],
     csv[0][3],
     csv[6][4],
     "#{csv[0][5]} (#{csv[6][5]} [#{csv[11][5]}])", # endpoint
     "#{csv[0][6]} (#{csv[6][6]})", # endpoint
     "#{csv[6][7]} [#{csv[11][7]}]",
     "#{csv[6][8]} [#{csv[11][8]}]",
     "#{csv[6][9]} [#{csv[11][9]}]",
  ]
  # Columns 10-27: six consecutive groups of three columns each.
  (10..10+5*3).step(3) do |i|
    feature_names += [
     "#{csv[6][i]} [#{csv[11][i]}]",
     "#{csv[6][i+1]} #{csv[8][i+1]} [#{csv[11][i+1]}]",
     "#{csv[6][i+2]} #{csv[8][i+2]}",
    ]
  end
  feature_names += [
   "#{csv[6][28]}",
   "#{csv[6][29]} #{csv[8][29]}",
   "#{csv[6][30]} #{csv[8][30]}",
  ]
  (31..34).each do |i|
    feature_names << "#{csv[6][i]} #{csv[8][i]} [#{csv[11][i]}]"
  end
  (35..36).each do |i|
    feature_names << "#{csv[6][i]} #{csv[8][i]} #{csv[10][i]} [#{csv[11][i]}]"
  end
  data = {}
  csv.drop(12).each do |row|
    id = row.first
    # skip Ag, too many missing values; to_s also lets rows without an ID
    # (e.g. blank lines, which CSV returns as empty rows) be skipped
    # instead of raising on nil.
    next unless id.to_s =~ /^G/

    data[id] = {}
    row.each_with_index do |col, i|
      if i == 0
        data[id][:composition] = {}
      elsif i < 5
        data[id][:composition][feature_names[i]] = col
      elsif i == 5
        data[id][:tox] ||= {}
        data[id][:tox][feature_names[i]] = col
      elsif i > 6
        data[id][:physchem] ||= {}
        data[id][:physchem][feature_names[i]] = col
      end
    end
  end
  File.open("data.json", "w+") { |f| f.puts data.to_json }
  data
end

#puts data.to_yaml
=begin
R.assign "endpoint", endpoint
(0..data[0].size).each do |c|
  if data.collect{|r| r[c]}.uniq.size > 1
    begin
    R.assign "feature", data.collect{|r| r[c]}
    R.eval "r <- cor(-log(endpoint),-log(feature),use='complete')"
    r = R.eval("r").to_ruby
    p "#{c}: #{r}" if r > 0.3 or r < -0.3
    rescue
    end
  end
end


csv[0..13].each do |row|
  row.each_with_index do |col,i|
    features[i] = features[i].to_s+", "+col.to_s
  end
end

puts features.select{|f| f.match(/Mean/)}.to_yaml

  #n+=1
  #p n,row.first unless row.first.match /^[G|S]/
=end