1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
|
require 'rserve'
require 'json'
require 'yaml'
require 'csv'
# Key of the endpoint value: `predict` reads it from each record's "tox" hash
# when weighting neighbors and uses it as the key of the returned prediction.
ENDPOINT = "Cell.association (Net cell association [mL/ug(Mg)])"
# Predicts the ENDPOINT value for a query from similar stored records.
#
# Reads "./relevant-features.json" and "./data.json" from the current working
# directory on every call. For each record in data.json the "physchem" values
# belonging to the query's feature names are compared with the query:
#
# * a record whose values equal the query's is reported as :match and is not
#   used as a neighbor (if several records match, the last one wins);
# * any other record whose weighted cosine similarity exceeds 0.95 becomes a
#   neighbor and contributes to the prediction.
#
# The prediction is 10 raised to the similarity-weighted mean of the
# neighbors' log10 endpoint values, or nil when there are no neighbors.
#
# @param params [Hash] feature name => numeric value describing the query
# @return [Hash] :query (params), :match ({id => record} or nil),
#   :prediction ({ENDPOINT => value or nil}) and :neighbors (records sorted by
#   descending "similarity"; each also carries "sim" and "id")
def predict(params)
  relevant_features = JSON.parse(File.read("./relevant-features.json"))
  # NOTE(review): weights are paired with the query values by position, so the
  # entries of relevant-features.json are presumably in the same order as the
  # keys of params -- confirm with the caller.
  weights = relevant_features.values.map { |feature| feature["r"] }

  query_keys = params.keys
  query_values = params.values

  neighbors = []
  sim_sum = 0
  weighted_sum = 0
  match = nil

  JSON.parse(File.read("./data.json")).each do |id, categories|
    physchem = categories["physchem"]
    # Look the values up by query key so they line up with query_values even
    # when the record stores its features in a different order.
    neighbor_values = query_keys.map { |key| physchem[key] }

    if query_values == neighbor_values
      match = { id => categories }
      next
    end

    sim = weighted_cosine_similarity(query_values, neighbor_values, weights)
    next unless sim > 0.95

    # The record hash itself is annotated and returned as the neighbor.
    categories["similarity"] = sim
    categories["sim"] = cosine_similarity(query_values, neighbor_values)
    categories["id"] = id
    sim_sum += sim
    # Average in log10 space; the result is transformed back below.
    weighted_sum += sim * Math.log10(categories["tox"][ENDPOINT])
    neighbors << categories
  end

  neighbors.sort! { |x, y| y["similarity"] <=> x["similarity"] }
  prediction = sim_sum == 0 ? nil : 10**(weighted_sum / sim_sum)

  {
    :query => params,
    :match => match,
    :prediction => { ENDPOINT => prediction },
    :neighbors => neighbors
  }
end
class Object
  # Reports whether Kernel#Float accepts the receiver.
  #
  # @return [Boolean] true when Float(self) succeeds, false when it raises
  def numeric?
    Float(self)
    true
  rescue StandardError
    false
  end
end
# Euclidean distance between two numeric vectors.
#
# Elements are paired with Array#zip, so the pairing follows +a+.
#
# @param a [Array<Numeric>]
# @param b [Array<Numeric>]
# @return [Float] square root of the summed squared differences
def euclidean_distance(a, b)
  squared_total = a.zip(b).inject(0) do |total, (left, right)|
    total + (left - right)**2
  end
  Math.sqrt(squared_total)
end
# Dot product of two numeric vectors.
#
# @param a [Array<Numeric>]
# @param b [Array<Numeric>]
# @return [Numeric] sum of the element-wise products (0 for empty input)
def dot_product(a, b)
  a.zip(b).inject(0) { |total, (left, right)| total + left * right }
end
# Euclidean length of a numeric vector.
#
# @param point [Array<Numeric>]
# @return [Float] square root of the sum of squared components
def magnitude(point)
  Math.sqrt(point.inject(0) { |total, component| total + component**2 })
end
# Cosine similarity of two numeric vectors: their dot product divided by the
# product of their magnitudes.
# http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity
#
# @param a [Array<Numeric>]
# @param b [Array<Numeric>]
# @return [Float]
def cosine_similarity(a, b)
  numerator = dot_product(a, b)
  denominator = magnitude(a) * magnitude(b)
  numerator / denominator
end
# Cosine similarity in which every component is scaled by the absolute value
# of its weight.
#
# @param a [Array<Numeric>] first vector; its length determines how many
#   components are visited
# @param b [Array<Numeric>] second vector, read by the same indices as +a+
# @param w [Array<Numeric>] per-component weights; only their absolute values
#   are used
# @return [Float]
def weighted_cosine_similarity(a, b, w)
  numerator = 0
  norm_a = 0
  norm_b = 0
  a.each_with_index do |left, i|
    weight = w[i].abs
    right = b[i]
    numerator += weight * left * right
    norm_a += weight * left**2
    norm_b += weight * right**2
  end
  numerator / Math.sqrt(norm_a * norm_b)
end
#@endpoint = @data.collect{|r| r[5]}
# Placeholder with no implementation: ignores +query+ and returns nil.
def neighbors(query)
  nil
end
# Converts "data/MergedSheets_edit.csv" into a nested hash, writes that hash as
# JSON to "data.json" in the current directory and returns it.
#
# Only the first 37 columns of every row are used, and each cell accepted by
# Object#numeric? is converted to a Float. Feature names are assembled from
# cells of rows 0, 6, 8, 10 and 11; data rows start at row index 12.
#
# Rows are kept only when their ID (column 0) is a String matching /^G/.
# Blank rows and rows whose ID was converted to a number are skipped instead
# of raising NoMethodError.
#
# Per kept row: columns 1-4 go to :composition, column 5 to :tox, columns 7 and
# up to :physchem. Column 6 is not exported.
#
# @return [Hash] id => { :composition => {...}, :tox => {...}, :physchem => {...} }
def csv2json
  csv = CSV.read("data/MergedSheets_edit.csv").map do |row|
    row[0..36].map { |cell| cell.numeric? ? cell.to_f : cell }
  end

  feature_names = [
    "ID",
    csv[0][1],
    csv[0][2],
    csv[0][3],
    csv[6][4],
    "#{csv[0][5]} (#{csv[6][5]} [#{csv[11][5]}])", # endpoint
    "#{csv[0][6]} (#{csv[6][6]})", # endpoint
    "#{csv[6][7]} [#{csv[11][7]}]",
    "#{csv[6][8]} [#{csv[11][8]}]",
    "#{csv[6][9]} [#{csv[11][9]}]",
  ]
  # Columns 10..27 come in six groups of three.
  (10..25).step(3) do |i|
    feature_names += [
      "#{csv[6][i]} [#{csv[11][i]}]",
      "#{csv[6][i+1]} #{csv[8][i+1]} [#{csv[11][i+1]}]",
      "#{csv[6][i+2]} #{csv[8][i+2]}",
    ]
  end
  feature_names += [
    "#{csv[6][28]}",
    "#{csv[6][29]} #{csv[8][29]}",
    "#{csv[6][30]} #{csv[8][30]}",
  ]
  (31..34).each do |i|
    feature_names << "#{csv[6][i]} #{csv[8][i]} [#{csv[11][i]}]"
  end
  (35..36).each do |i|
    feature_names << "#{csv[6][i]} #{csv[8][i]} #{csv[10][i]} [#{csv[11][i]}]"
  end

  data = {}
  csv.drop(12).each do |row|
    id = row.first
    # skip Ag, too many missing values; also skips rows without a String ID
    next unless id.is_a?(String) && id =~ /^G/

    record = { :composition => {} }
    row.each_with_index do |col, i|
      next if i == 0 || i == 6 # the ID column and the unexported column 6
      section =
        if i < 5
          :composition
        elsif i == 5
          :tox
        else
          :physchem
        end
      (record[section] ||= {})[feature_names[i]] = col
    end
    data[id] = record
  end

  File.open("data.json", "w") { |f| f.puts data.to_json }
  data
end
#puts data.to_yaml
=begin
R.assign "endpoint", endpoint
(0..data[0].size).each do |c|
if data.collect{|r| r[c]}.uniq.size > 1
begin
R.assign "feature", data.collect{|r| r[c]}
R.eval "r <- cor(-log(endpoint),-log(feature),use='complete')"
r = R.eval("r").to_ruby
p "#{c}: #{r}" if r > 0.3 or r < -0.3
rescue
end
end
end
csv[0..13].each do |row|
row.each_with_index do |col,i|
features[i] = features[i].to_s+", "+col.to_s
end
end
puts features.select{|f| f.match(/Mean/)}.to_yaml
#n+=1
#p n,row.first unless row.first.match /^[G|S]/
=end
|