summaryrefslogtreecommitdiff
path: root/nch/06_compare_features.rb
diff options
context:
space:
mode:
Diffstat (limited to 'nch/06_compare_features.rb')
-rwxr-xr-xnch/06_compare_features.rb150
1 files changed, 150 insertions, 0 deletions
diff --git a/nch/06_compare_features.rb b/nch/06_compare_features.rb
new file mode 100755
index 0000000..0dc4cf2
--- /dev/null
+++ b/nch/06_compare_features.rb
@@ -0,0 +1,150 @@
+#!/usr/bin/env ruby
+
+require "csv"
+require "uri"
+
+require "./config.rb"
+
+unless ARGV.size==3 and (DATA-["LOAEL-mg"]).include?(ARGV[0]) and ARGV[1]=~/features|compounds/
+ $stderr.puts "\nfirst-param: dataset-name from "+(DATA-["LOAEL-mg"]).inspect
+ $stderr.puts "second-param: features|compounds"
+ $stderr.puts "third-param: <num>"
+ $stderr.puts ""
+ $stderr.puts "second param specifies if features or compounds that differ should be printed"
+ $stderr.puts "third param specifies num features/compounds that are printed\n\n"
+ abort
+end
+
+d = ARGV[0]
+transposed = (ARGV[1]=~/features/)
+num_print = ARGV[2].to_i
+
+file_feat = "data/05/#{d}_new-features.csv"
+file_orig = "data/02/#{d}_orig-features.csv"
+puts "comparing #{file_feat} and #{file_orig}"
+
+@inchis = []
+
+feat = []
+CSV.foreach(file_feat) do |row|
+ feat << row
+end
+
+orig = []
+# CSV.foreach("data/#{d}_orig-features-quotes.csv") do |row|
+CSV.foreach(file_orig) do |row|
+#CSV.foreach("data/#{d}_noarom-features.csv") do |row|
+ orig << row
+end
+
+match = 0
+total = 0
+orig_nil = 0
+new_nil = 0
+orig_0 = 0
+new_0 = 0
+diff = 0
+epsilon_ratio = 0.1
+
+dev = []
+
+raise if feat.size!=orig.size
+(0..feat.size-1).each do |r|
+
+ dev << feat[r] if (r==0)
+
+ raise "num columns in row #{r} differ, #{feat[r].size} != #{orig[r].size}" if feat[r].size!=orig[r].size
+ raise "inchis differ #{feat[r][0]} != #{orig[r][0]}" if feat[r][0]!=orig[r][0]
+ if r>0
+ dev_per_compound = [feat[r][0]]
+ (1..feat[r].size-1).each do |c|
+ total += 1
+ f_missing = feat[r][c]==nil || feat[r][c].to_s.size==0
+ o_missing = orig[r][c]==nil || orig[r][c].to_s.size==0
+ if (f_missing != o_missing)
+ # $stderr.puts "one value for #{feat[0][c]}/#{orig[0][c]} is missing, new features: #{feat[r][c]} != orig features: #{orig[r][c]}, inchi: #{orig[r][0]}"
+ if f_missing
+ dev_ratio = 10
+ new_nil+=1
+ else
+ dev_ratio = 20
+ orig_nil+=1
+ end
+ else
+ f = feat[r][c].to_f
+ o = orig[r][c].to_f
+ if ((f==0.0 and o!=0.0) or (f!=0.0 and o==0.0))
+ if (f==0.0 and o!=0.0)
+ dev_ratio = 11
+ new_0+=1
+ else
+ dev_ratio = 21
+ orig_0+=1
+ end
+ else
+ dev_ratio = 1 - [f.abs,o.abs].min/[f.abs,o.abs].max
+ if (f!=o && dev_ratio > epsilon_ratio)
+ # $stderr.puts "deviation to high for #{feat[0][c]}/#{orig[0][c]}, new features: #{feat[r][c]} != orig features: #{orig[r][c]}, inchi: #{orig[r][0]}, deviation #{dev_ratio}"
+ diff += 1
+ else
+ dev_ratio = 0
+ match += 1
+ end
+ end
+ end
+ dev_per_compound << dev_ratio
+ end
+ # puts dev_per_compound.inspect
+ dev << dev_per_compound
+ end
+end
+
+puts "matching values #{match}/#{total} #{(match/total.to_f*100.0).round}%"
+puts "no matches:"
+puts "#{orig_nil} orig-nil"
+puts "#{new_nil} new-nil"
+puts "#{orig_0} orig-0"
+puts "#{new_0} new-0"
+puts "#{diff} delta > #{epsilon_ratio*100}%"
+puts ""
+
+class Array
+ def max1_sum
+ inject(0.0) { |result, el| result + [1,el].min }
+ end
+
+ def abs_mean
+ max1_sum / size
+ end
+end
+
+dev = dev.transpose if transposed
+
+puts ""
+dev.shift
+dev.sort!{ |b,a| a[1..-1].abs_mean <=> b[1..-1].abs_mean }
+puts "top #{num_print} diffing #{transposed ? "features" : "compounds"} (numbers is diff between values in percent): "
+num_print.times do |i|
+ d = dev[i].collect do |x|
+ unless x.is_a?(Numeric)
+ x
+ else
+ if x==20
+ "orig-nil"
+ elsif x==10
+ "new-nil"
+ elsif x==21
+ "orig-0"
+ elsif x==11
+ "new-0"
+ else
+ (x*100).round
+ end
+ end
+ end
+ puts d.inspect
+end
+
+
+
+