# TODO: missing data for protein corona silver particles
require 'json'
require 'yaml'
require 'csv'
require_relative "lib/nano-lazar.rb"
include OpenTox

# Builds a human-readable label for the feature stored under +uri+ in @features.
#
# The label starts with the feature title. The distinct "p: o" annotation pairs
# are appended as " (p: o, ...)" when there are any, and the units as " [units]"
# when they are set and not empty.
#
# uri - key of the feature in @features
#
# Returns a new String; the title stored in @features is not modified.
def feature_name uri
  f = @features[uri]
  # work on a copy: appending to f['title'] itself would permanently grow the
  # stored title, so every further call would add annotations/units once more
  name = f['title'].dup
  annotations = f['annotation'].collect{|a| "#{a['p']}: #{a['o']}"}.uniq.join ", "
  name << " (#{annotations})" unless annotations.empty?
  name << " [#{f['units']}]" if f['units'] and !f['units'].empty?
  name
end

nanomaterials = []
feature_names = {}
@features = {}

# Import the bundles from data/: the "feature" section of each bundle is merged
# into @features and every "dataEntry" substance becomes one Nanoparticle.
["nanowiki.json",  "protein-corona.json", "marina.json"].each do |f|
  bundle = JSON.parse(File.read(File.join("data",f)))
  @features.merge! bundle["feature"]
  bundle["dataEntry"].each do |substance|
    nm = Nanoparticle.new
    nm.uri = substance["compound"]["URI"]
    nm.name = substance["values"]["https://apps.ideaconsult.net/enanomapper/identifier/name"] if substance["values"]

    # core and coating names are taken from the composition section, if present
    if substance["composition"]
      nr_cores = substance["composition"].count{|c| c["relation"] == "HAS_CORE"}
      puts "#{substance["compound"]["URI"]} has #{nr_cores} cores" if nr_cores != 1
      substance["composition"].each do |composition|
        component = composition["component"]
        next unless component
        name = component["values"]["https://apps.ideaconsult.net/enanomapper/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"]
        case composition["relation"]
        when "HAS_CORE"
          nm.core = name
        when "HAS_COATING"
          nm.coating ||= []
          nm.coating << name
        end
      end
    end

    # A substance without a "values" section is still imported, it just has no
    # measurements (iterating nil here would raise NoMethodError).
    (substance["values"] || {}).each do |k,v|
      # keys containing TOX go to nm.tox, keys containing P-CHEM to nm.p_chem,
      # everything else is ignored (TOX wins if a key contains both)
      if k.match(/TOX/)
        nm.tox ||= []
        property = "tox"
      elsif k.match(/P-CHEM/)
        nm.p_chem ||= []
        property = "p_chem"
      else
        next
      end
      # appends {feature key => value} to the list selected above
      store = lambda do |value|
        list = property == "tox" ? nm.tox : nm.p_chem
        list << {k => value}
      end
      v.each do |val|
        keys = val.keys # NOTE: the comparisons below depend on the key order of the parsed JSON
        if keys == ["loValue"]
          store.call val["loValue"]
        elsif keys == ["loQualifier", "loValue"] and val["loQualifier"] == "mean"
          store.call val["loValue"]
        elsif keys == ["loQualifier", "loValue", "upQualifier", "upValue" ]
          # range: store the midpoint; dividing by 2.0 keeps the fraction for Integer bounds
          store.call((val["loValue"]+val["upValue"])/2.0)
        elsif keys == ["loQualifier", "loValue"] and val["loQualifier"] == ">="
          # lower bound only: skipped without a message
        else
          # unexpected value layout: print it for inspection
          p val
        end
      end
    end
    nm.tox.uniq! if nm.tox
    nm.p_chem.uniq! if nm.p_chem
    nanomaterials << nm
  end
end

# Summary of the import: how many nanomaterials have p_chem and/or tox data.
puts "Total imported: #{nanomaterials.size}"
puts "With nanoparticle characterisation: #{nanomaterials.count{|n| n.p_chem}}"
modelling_data = nanomaterials.select{|n| n.tox and n.p_chem}
puts "With TOX data: #{nanomaterials.count{|n| n.tox}}"
puts "With TOX data and particle characterisation: #{modelling_data.size}"
# all distinct tox feature keys of nanomaterials that have both tox and p_chem data
endpoints = modelling_data.flat_map{|n| n.tox.flat_map{|t| t.keys}}.compact.uniq
puts
puts "Endpoints: #{endpoints.size}"

single_value_endpoints = []
endpoint_values = {}

# Count the measurements per endpoint; endpoints where all measurements share
# one value are set aside in single_value_endpoints.
endpoints.each do |e|
  values = modelling_data.flat_map{|n| n.tox.collect{|t| t[e]}}.select{|value| value}
  if values.uniq.size == 1
    single_value_endpoints << e
  else
    endpoint_values[e] = values.size
  end
end

endpoints -= single_value_endpoints
puts "Endpoints with more than one measurement value: #{endpoints.size}"
endpoint_values.select!{|k,v| v > 10}
puts "Endpoints with more than 10 measurements: #{endpoint_values.size}"
endpoints = endpoint_values.keys
puts
# Endpoints ordered by descending measurement count. feature_names is still
# empty at this point, so the label is resolved (and cached) here; a bare
# lookup would print an empty name for every endpoint.
ranking = endpoint_values.sort{|a,b| b[1] <=> a[1]}.collect do |e,v|
  feature_names[e] ||= feature_name e
  "#{feature_names[e]}: #{v}"
end
puts ranking.join("\n")

# Keep only the nanomaterials that have at least one measurement for this endpoint.
endpoint = "https://apps.ideaconsult.net/enanomapper/property/TOX/UNKNOWN_TOXICITY_SECTION/Log2+transformed/94D664CFE4929A0F400A5AD8CA733B52E049A688/E/3ed642f9-1b42-387a-9966-dea5b91e5f8a"
nanomaterials.select!{|nm| nm.tox and nm.tox.any?{|t| t.key? endpoint}}
p nanomaterials.size

# Collect, per feature key, all non-String values of the remaining nanomaterials
# and resolve a readable label for every feature key encountered.
feature_values = {}
nanomaterials.each do |nm|
  # the filter above only guarantees tox data, p_chem may be missing
  ((nm.p_chem || []) + nm.tox).each do |f|
    # every entry is expected to be a single {feature key => value} pair
    p f unless f.size == 1
    k = f.keys.first
    # feature_name expects the feature key, not the whole entry; the label is
    # cached per key so feature_name runs at most once per feature from here
    feature_names[k] ||= feature_name k
    unless f[k].is_a? String
      feature_values[k] ||= []
      feature_values[k] << f[k]
    end
  end
end

# keep only features with more than two distinct (non-String) values
feature_values.select!{|f,vals| vals.uniq.size > 2}
tox_descriptors = feature_values.keys.select{|f| f.match 'TOX'}
p_chem_descriptors = feature_values.keys.select{|f| f.match 'P-CHEM'}

#puts @features.to_yaml

column_names = ["Nanoparticle"] + p_chem_descriptors.collect{|d| feature_names[d]} + tox_descriptors.collect{|d| feature_names[d]}
# first non-nil value stored for +descriptor+ in +measurements+
# (nil when there is none or when the measurement list itself is missing)
first_value = lambda do |measurements, descriptor|
  (measurements || []).collect{|m| m[descriptor]}.compact.first
end
# One row per nanomaterial with data for the endpoint: name, p_chem columns, tox columns.
CSV.open(File.join(File.dirname(__FILE__),"data","protein_corona_extract.csv"),"w+") do |csv|
  csv << column_names
  nanomaterials.each do |nm|
    next unless nm.tox and nm.tox.any?{|t| t.key? endpoint}
    p_chem_columns = p_chem_descriptors.collect{|d| first_value.call(nm.p_chem, d)}
    # tox columns must be read from nm.tox; reading them from nm.p_chem leaves them all empty
    tox_columns = tox_descriptors.collect{|d| first_value.call(nm.tox, d)}
    csv << [nm.name] + p_chem_columns + tox_columns
  end
end