#!/usr/bin/env ruby
# path: root/nch/06_compare_features.rb
# blob: 0dc4cf2647f6c2e03ac33e7ff56162146a9eac8c

require "csv"
require "uri"

require "./config.rb"

# Validate the command line: <dataset-name> <features|compounds> <num>.
#
# DATA is not defined in this file; it is presumably the list of dataset names
# provided by config.rb (TODO confirm). "LOAEL-mg" is excluded from the
# accepted names.
#
# The second parameter has to be exactly "features" or "compounds" (a value
# that merely contains one of the words is rejected), and the third parameter
# has to be a non-negative integer, because it is converted with to_i later
# and anything else would silently become 0.
datasets = DATA - ["LOAEL-mg"]
args_valid = ARGV.size == 3 &&
             datasets.include?(ARGV[0]) &&
             ARGV[1] =~ /\A(features|compounds)\z/ &&
             ARGV[2] =~ /\A\d+\z/
unless args_valid
  $stderr.puts "\nfirst-param: dataset-name from "+datasets.inspect
  $stderr.puts "second-param:  features|compounds"
  $stderr.puts "third-param: <num>"
  $stderr.puts ""
  $stderr.puts "second param specifies if features or compounds that differ should be printed"
  $stderr.puts "third param specifies num features/compounds that are printed\n\n"
  abort
end

# Pick up the three command-line parameters: dataset name, print mode and the
# number of entries to print.
d, mode_arg, num_arg = ARGV
# transposed is truthy (a match index) when features should be ranked,
# nil when compounds should be ranked.
transposed = (mode_arg =~ /features/)
num_print = num_arg.to_i

# The two feature tables that are compared against each other.
file_feat = "data/05/#{d}_new-features.csv"
file_orig = "data/02/#{d}_orig-features.csv"
puts "comparing #{file_feat} and #{file_orig}"

@inchis = []

# Read both CSV files completely into memory as arrays of rows.
feat = CSV.read(file_feat)
# Previously tried alternatives for the original features:
#   data/#{d}_orig-features-quotes.csv
#   data/#{d}_noarom-features.csv
orig = CSV.read(file_orig)

# Compare the new feature table (feat) with the original one (orig) cell by
# cell. Row 0 is treated as the header row and column 0 as the compound
# identifier (inchi); neither is compared numerically.
#
# Counters:
#   total    - number of compared cells
#   match    - cells that are equal or deviate by at most epsilon_ratio
#   new_nil  - value missing (nil or empty) only in the new features
#   orig_nil - value missing (nil or empty) only in the original features
#   new_0    - new value is 0 while the original value is not
#   orig_0   - original value is 0 while the new value is not
#   diff     - both values present and non-zero, deviation > epsilon_ratio
#
# dev collects one row per input row: the unchanged header row first, then per
# compound [inchi, code, code, ...] where code is
#   0        - match
#   10 / 20  - new-nil / orig-nil
#   11 / 21  - new-0 / orig-0
#   other    - relative deviation |f - o| / max(|f|, |o|); values in (0, 1]
#              for same-sign pairs, values in (1, 2] for opposite-sign pairs
match = 0
total = 0
orig_nil = 0
new_nil = 0
orig_0 = 0
new_0 = 0
diff = 0
epsilon_ratio = 0.1

dev = []

raise "num rows differ, #{feat.size} != #{orig.size}" if feat.size != orig.size

feat.each_index do |r|
  dev << feat[r] if r == 0

  raise "num columns in row #{r} differ, #{feat[r].size} != #{orig[r].size}" if feat[r].size != orig[r].size
  raise "inchis differ #{feat[r][0]} != #{orig[r][0]}" if feat[r][0] != orig[r][0]
  next if r == 0

  dev_per_compound = [feat[r][0]]
  (1..feat[r].size - 1).each do |c|
    total += 1
    # nil.to_s is "", so this covers both nil and empty cells
    f_missing = feat[r][c].to_s.empty?
    o_missing = orig[r][c].to_s.empty?

    if f_missing != o_missing
      if f_missing
        dev_ratio = 10
        new_nil += 1
      else
        dev_ratio = 20
        orig_nil += 1
      end
    else
      f = feat[r][c].to_f
      o = orig[r][c].to_f
      if f == o
        # identical values (this includes both missing and both zero)
        dev_ratio = 0
        match += 1
      elsif f == 0.0
        dev_ratio = 11
        new_0 += 1
      elsif o == 0.0
        dev_ratio = 21
        orig_0 += 1
      else
        magnitude_ratio = [f.abs, o.abs].min / [f.abs, o.abs].max
        # Same sign: |f - o| / max == 1 - min/max.
        # Opposite sign: |f - o| / max == 1 + min/max, so a flipped sign is
        # never mistaken for a match just because the magnitudes agree.
        dev_ratio = (f < 0) == (o < 0) ? 1 - magnitude_ratio : 1 + magnitude_ratio
        if dev_ratio > epsilon_ratio
          diff += 1
        else
          dev_ratio = 0
          match += 1
        end
      end
    end
    dev_per_compound << dev_ratio
  end
  dev << dev_per_compound
end

# Print the overall comparison statistics.
# With no compared cells (total == 0) the ratio would be NaN and rounding it
# raises FloatDomainError, so 0% is reported in that case.
match_percent = total.zero? ? 0 : (match/total.to_f*100.0).round
puts "matching values #{match}/#{total} #{match_percent}%"
puts "no matches:"
puts "#{orig_nil} orig-nil"
puts "#{new_nil} new-nil"
puts "#{orig_0} orig-0"
puts "#{new_0} new-0"
puts "#{diff} delta > #{epsilon_ratio*100}%"
puts ""

# Helpers for ranking rows of deviation values.
class Array
  # Sum of all elements where every element contributes at most 1, so large
  # marker values count the same as a deviation of 1.
  #
  # @return [Float] capped sum, 0.0 for an empty array
  def max1_sum
    reduce(0.0) { |acc, el| acc + [1, el].min }
  end

  # Mean of the capped elements (see max1_sum).
  #
  # An empty array yields 0.0 instead of NaN (0.0 / 0); NaN cannot be ordered
  # with <=> and would make sorting by this value fail.
  #
  # @return [Float] capped mean, 0.0 for an empty array
  def abs_mean
    return 0.0 if empty?

    max1_sum / size
  end
end

# Rank and print the rows with the highest mean deviation.
# In "features" mode the table is transposed first, so that every row holds
# the deviations of one feature instead of one compound.
dev = dev.transpose if transposed

puts ""
# drop the first row; it holds names, not deviation values
dev.shift
# descending by mean capped deviation; element 0 of each row is its name
dev.sort! { |b, a| a[1..-1].abs_mean <=> b[1..-1].abs_mean }
puts "top #{num_print} diffing #{transposed ? "features" : "compounds"} (numbers is diff between values in percent): "
# first(n) never yields more rows than exist, so asking for more entries than
# available prints all of them instead of failing on a nil row
dev.first([num_print, 0].max).each do |entry|
  printable = entry.collect do |x|
    if x.is_a?(Numeric)
      # marker codes are translated to labels, everything else is a ratio
      case x
      when 20 then "orig-nil"
      when 10 then "new-nil"
      when 21 then "orig-0"
      when 11 then "new-0"
      else (x*100).round
      end
    else
      x
    end
  end
  puts printable.inspect
end