From 36ea2f865014a761be9fe74719f4c88dbaffeb81 Mon Sep 17 00:00:00 2001 From: mguetlein Date: Mon, 27 Oct 2014 15:48:01 +0100 Subject: inital commit more documentation will be added --- nch/06_compare_features.rb | 150 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100755 nch/06_compare_features.rb (limited to 'nch/06_compare_features.rb') diff --git a/nch/06_compare_features.rb b/nch/06_compare_features.rb new file mode 100755 index 0000000..0dc4cf2 --- /dev/null +++ b/nch/06_compare_features.rb @@ -0,0 +1,150 @@ +#!/usr/bin/env ruby + +require "csv" +require "uri" + +require "./config.rb" + +unless ARGV.size==3 and (DATA-["LOAEL-mg"]).include?(ARGV[0]) and ARGV[1]=~/features|compounds/ + $stderr.puts "\nfirst-param: dataset-name from "+(DATA-["LOAEL-mg"]).inspect + $stderr.puts "second-param: features|compounds" + $stderr.puts "third-param: " + $stderr.puts "" + $stderr.puts "second param specifies if features or compounds that differ should be printed" + $stderr.puts "third param specifies num features/compounds that are printed\n\n" + abort +end + +d = ARGV[0] +transposed = (ARGV[1]=~/features/) +num_print = ARGV[2].to_i + +file_feat = "data/05/#{d}_new-features.csv" +file_orig = "data/02/#{d}_orig-features.csv" +puts "comparing #{file_feat} and #{file_orig}" + +@inchis = [] + +feat = [] +CSV.foreach(file_feat) do |row| + feat << row +end + +orig = [] +# CSV.foreach("data/#{d}_orig-features-quotes.csv") do |row| +CSV.foreach(file_orig) do |row| +#CSV.foreach("data/#{d}_noarom-features.csv") do |row| + orig << row +end + +match = 0 +total = 0 +orig_nil = 0 +new_nil = 0 +orig_0 = 0 +new_0 = 0 +diff = 0 +epsilon_ratio = 0.1 + +dev = [] + +raise if feat.size!=orig.size +(0..feat.size-1).each do |r| + + dev << feat[r] if (r==0) + + raise "num columns in row #{r} differ, #{feat[r].size} != #{orig[r].size}" if feat[r].size!=orig[r].size + raise "inchis differ #{feat[r][0]} != #{orig[r][0]}" if feat[r][0]!=orig[r][0] + if r>0 + dev_per_compound = [feat[r][0]] + (1..feat[r].size-1).each do |c| + total += 1 + f_missing = feat[r][c]==nil || feat[r][c].to_s.size==0 + o_missing = orig[r][c]==nil || orig[r][c].to_s.size==0 + if (f_missing != o_missing) + # $stderr.puts "one value for #{feat[0][c]}/#{orig[0][c]} is missing, new features: #{feat[r][c]} != orig features: #{orig[r][c]}, inchi: #{orig[r][0]}" + if f_missing + dev_ratio = 10 + new_nil+=1 + else + dev_ratio = 20 + orig_nil+=1 + end + else + f = feat[r][c].to_f + o = orig[r][c].to_f + if ((f==0.0 and o!=0.0) or (f!=0.0 and o==0.0)) + if (f==0.0 and o!=0.0) + dev_ratio = 11 + new_0+=1 + else + dev_ratio = 21 + orig_0+=1 + end + else + dev_ratio = 1 - [f.abs,o.abs].min/[f.abs,o.abs].max + if (f!=o && dev_ratio > epsilon_ratio) + # $stderr.puts "deviation to high for #{feat[0][c]}/#{orig[0][c]}, new features: #{feat[r][c]} != orig features: #{orig[r][c]}, inchi: #{orig[r][0]}, deviation #{dev_ratio}" + diff += 1 + else + dev_ratio = 0 + match += 1 + end + end + end + dev_per_compound << dev_ratio + end + # puts dev_per_compound.inspect + dev << dev_per_compound + end +end + +puts "matching values #{match}/#{total} #{(match/total.to_f*100.0).round}%" +puts "no matches:" +puts "#{orig_nil} orig-nil" +puts "#{new_nil} new-nil" +puts "#{orig_0} orig-0" +puts "#{new_0} new-0" +puts "#{diff} delta > #{epsilon_ratio*100}%" +puts "" + +class Array + def max1_sum + inject(0.0) { |result, el| result + [1,el].min } + end + + def abs_mean + max1_sum / size + end +end + +dev = dev.transpose if transposed + +puts "" +dev.shift +dev.sort!{ |b,a| a[1..-1].abs_mean <=> b[1..-1].abs_mean } +puts "top #{num_print} diffing #{transposed ? "features" : "compounds"} (numbers is diff between values in percent): " +num_print.times do |i| + d = dev[i].collect do |x| + unless x.is_a?(Numeric) + x + else + if x==20 + "orig-nil" + elsif x==10 + "new-nil" + elsif x==21 + "orig-0" + elsif x==11 + "new-0" + else + (x*100).round + end + end + end + puts d.inspect +end + + + + -- cgit v1.2.3