From 9661a67983ffc93ee02bc12b20b9afb38e199d79 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Mon, 29 Oct 2012 13:19:20 +0100 Subject: pubchem pathway prediction implemented --- application.rb | 9 +- pubchem-test.rb | 92 ++++++++++++++++ pubchem.rb | 320 +++++++++++++++++++++++++++++++++++++------------------ test.rb | 51 +++++++++ views/index.haml | 54 +++------- 5 files changed, 379 insertions(+), 147 deletions(-) create mode 100644 pubchem-test.rb create mode 100644 test.rb diff --git a/application.rb b/application.rb index 09f3188..2a77013 100644 --- a/application.rb +++ b/application.rb @@ -6,16 +6,17 @@ require "./pubchem.rb" also_reload './pubchem.rb' get '/?' do -=begin - @compound = PubChem::Compound.new + #@neighbors = OpenTox::PubChemNeighbors.new smiles = "OC(=O)C1=C(C=CC=C1)OC(=O)C" #smiles = "c1cc(CC)ccc1" #smiles = "CC(=O)Nc1ccc(O)cc1" smiles = "C1=CC(=C(C=C1Cl)Cl)OCC(=O)O" #@compound.from_smiles smiles - @compound.get_neighbors smiles + #@neighbors.from_smiles smiles +=begin File.open("compound.yaml","w+"){|f| f.puts @compound.to_yaml} -=end @compound = YAML.load_file "compound.yaml" +=end + @neighbors = YAML.load_file "search.yaml" haml :index end diff --git a/pubchem-test.rb b/pubchem-test.rb new file mode 100644 index 0000000..273698e --- /dev/null +++ b/pubchem-test.rb @@ -0,0 +1,92 @@ +require 'test/unit' +require '../opentox-client/lib/opentox-client' +require './pubchem.rb' +require 'yaml' + +class AOPTest < Test::Unit::TestCase + + def setup + @pug_uri = "http://pubchem.ncbi.nlm.nih.gov/rest/pug/" + @compound = OpenTox::PubChemCompound.new + @compound.cid = 1983 + #@compound.from_name "2,4-D" + end + + def test_initialize + print @compound.cid + print " " + puts @compound.to_smiles + puts "measured targets" + puts @compound.targets.collect{|t| t["Target Name"]}.to_yaml + puts "predicted targets" + puts @compound.predicted_targets.select{|t| t[:prediction] == "active"}.to_yaml + + puts "predicted non_targets" + puts @compound.predicted_targets.select{|t| t[:prediction] == "inactive"}.size + #puts @compound.predicted_non_targets.values.inspect + measured_target_gis = @compound.targets.collect{|t| t["Target GI"]}.compact.uniq + measured_nontarget_gis = @compound.non_targets.collect{|t| t["Target GI"]}.compact.uniq + predicted_target_gis = @compound.predicted_targets.collect{|t| t[:target_gi] if t[:prediction] == "active"}.compact.uniq + predicted_nontarget_gis = @compound.predicted_targets.collect{|t| t[:target_gi] if t[:prediction] == "inactive"}.compact.uniq + print "correct predicted targets: " + puts (measured_target_gis & predicted_target_gis).size + print "new predicted targets: " + puts (predicted_target_gis - measured_target_gis).size + print "correct predicted non-targets: " + puts (measured_nontarget_gis & predicted_nontarget_gis).size + print "new predicted non-targets: " + puts (predicted_nontarget_gis - measured_nontarget_gis).size + print "incorrect predicted targets: " + puts (measured_nontarget_gis & predicted_target_gis).size + print "incorrect predicted non-targets: " + puts (measured_target_gis & predicted_nontarget_gis).size +=begin + @compound.neighbors.each do |n| + #print n.cid + #print " " + print n.to_smiles + print " " + print n.similarity + print " " + puts n.p + puts n.targets.sort.inspect + #puts n.non_targets.inspect + end +=end + #File.open("Acetaminophen.yaml","w+"){|f| f.puts @compound.to_yaml} + #puts @compound.neighbors.size + #assert_equal "7500", @p.cid + #assert_equal true, @p.aids[:active].include?(1188) + #assert_equal true, @p.aids[:inactive].include?(435) + end +=begin + def test_similarity_search + #puts @p.to_smiles + @p.neighbors.each do |n| + #puts n.to_smiles + puts @p.target_similarity(n) + end + #puts @p.neighbors.inspect + #assert_equal 100, @p.neighbor_cids.size + end + + def test_assay_description + puts @p.assay_description.to_yaml + end + + def test_assay_genes + puts @p.assay_genes.to_yaml + end + + def test_assay_similarity + @p2 = PubChem::Compound.new "OC(=O)C1=C(C=CC=C1)OC(=O)C" + puts @p.assay_similarity(@p2) + end + + def test_target_similarity + @p2 = PubChem::Compound.new "OC(=O)C1=C(C=CC=C1)OC(=O)C" + puts @p.target_similarity(@p2) + end +=end + +end diff --git a/pubchem.rb b/pubchem.rb index e2574d1..891110c 100644 --- a/pubchem.rb +++ b/pubchem.rb @@ -1,68 +1,198 @@ require '../opentox-client/lib/opentox-client.rb' require 'json' -# get assay from endpoint -# search in endpoint ontology -# -# get measurements -# search for compound and assay -# -# get affected pathways -# search for compound and genes -# identify affected pathways -# identify relations between affected genes/pathways and endpoint -# -# get related assays -# search for assays in ontology tree -# search for compound and related assays -# -# get similar compounds -# search for similar compounds +def Math.gauss(x, sigma = 0.3) + d = 1.0 - x.to_f + Math.exp(-(d*d)/(2*sigma*sigma)) +end module PubChem - def pubchem_search url - puts url - #json = RestClient.get url, :accept => "application/json", :timeout => 90000000 - json = `curl "#{url}"`#, :accept => "application/json", :timeout => 90000000 - @result = JSON.parse json - end + attr_accessor :result - class Assay - attr_accessor :aid + def initialize + @pug_uri = "http://pubchem.ncbi.nlm.nih.gov/rest/pug/" end - class Result - attr_accessor :aid, :cid, :sid + def pubchem_search url + json = RestClient.get url#, :accept => "application/json", :timeout => 90000000 + @result = JSON.parse json + rescue + puts url + puts $!.message + @result = nil end - class Substance - end +end - class Compound - # doc @ http://pubchem.ncbi.nlm.nih.gov/pug_rest/ - include OpenTox +module OpenTox + + class PubChemCompound < Compound include PubChem - attr_accessor :result, :cid, :neighbors, :tanimoto + # doc @ http://pubchem.ncbi.nlm.nih.gov/pug_rest/ + attr_writer :cid + attr_accessor :similarity, :p, :assays def initialize - @uri = "http://pubchem.ncbi.nlm.nih.gov/rest/pug/" - @similarity_threshold = 95 + super @summary = [] + @similarity_threshold = 75 @neighbors = [] + @predicted_targets = [] + end + + def from_name name + @inchi = RestClientWrapper.get File.join(CACTUS_URI,URI.escape(name),"stdinchi") + end + + def neighbors + if @neighbors.empty? + pubchem_search File.join(@pug_uri, "compound", "similarity", "cid", cid.to_s, "JSON")+"?Threshold=#{@similarity_threshold}&MaxRecords=100" + listkey = @result["Waiting"]["ListKey"] + while @result["Waiting"] do + sleep 1 + pubchem_search File.join(@pug_uri, "compound", "listkey", listkey, "assaysummary", "JSON") + end + columns = @result["Table"]["Columns"]["Column"] + table = @result["Table"]["Row"].collect{|cell| cell.values.flatten} + cid_idx = columns.index("CID") + cids = table.collect{|r| r[cid_idx]}.uniq + cids.each do |cid| + unless cid.to_s == @cid.to_s + tab = table.collect{|r| r if r[cid_idx] == cid}.compact + c = PubChemCompound.new + c.extract_result columns, tab + c.similarity = tanimoto c + @neighbors << c unless (c.targets + c.non_targets).empty? + end + end + @neighbors.sort!{|a,b| b.similarity <=> a.similarity} + end + @neighbors + end + + def summary + if @summary.empty? + pubchem_search File.join(@pug_uri, "compound", "cid", cid.to_s, "assaysummary", "JSON") + extract_result @result["Table"]["Columns"]["Column"], @result["Table"]["Row"].collect{|cell| cell.values.flatten} + end + @summary + end + + def active_assays + summary.select{|a| a["Activity Outcome"] == "active"} + end + + def inactive_assays + summary.select{|a| a["Activity Outcome"] == "inactive"} + end + + def targets + active_assays.select{|a| a["Target GI"]} + end + + def non_targets + inactive_assays.select{|a| a["Target GI"]} + end + + def predicted_targets + if @predicted_targets.empty? + target_gis = neighbors.collect{|n| n.summary.collect{|a| a["Target GI"]}}.flatten.compact.uniq + target_gis.each do |gid| + target = {:target_gi => gid} + neighbors.each do |neighbor| + if neighbor.similarity > 0.5 # avoid downweighting + search = neighbor.summary.select{|a| a["Target GI"] == gid} + unless search.empty? or search.size == 1 + print "+++ (" + print search.size + puts ")" + puts search.inspect + end + search.each do |assay| + target[:aid] ||= assay["AID"] + target[:name] ||= assay["Target Name"] + target[:assay_name] ||= assay["Assay Name"] + target[:active_similarities] ||= [] + target[:inactive_similarities] ||= [] + + if assay["Activity Outcome"] == "active" + target[:p_active] ? target[:p_active] = target[:p_active]*neighbor.similarity : target[:p_active] = neighbor.similarity + target[:p_inactive] ? target[:p_inactive] = target[:p_inactive]*(1-neighbor.similarity) : target[:p_inactive] = 1-neighbor.similarity + target[:active_similarities] << neighbor.similarity + elsif assay["Activity Outcome"] == "inactive" + target[:p_active] ? target[:p_active] = target[:p_active]*(1-neighbor.similarity) : target[:p_active] = 1-neighbor.similarity + target[:p_inactive] ? target[:p_inactive] = target[:p_inactive]*neighbor.similarity : target[:p_inactive] = neighbor.similarity + target[:inactive_similarities] << neighbor.similarity + end + end + end + end + if target[:p_active] and target[:p_inactive] and target[:p_active] + target[:p_inactive] != 0 + target[:p_active] = target[:p_active]/(target[:p_active]+target[:p_inactive]) + target[:p_inactive] = target[:p_inactive]/(target[:p_active]+target[:p_inactive]) + if target[:p_active] > target[:p_inactive] + target[:prediction] = "active" + elsif target[:p_active] < target[:p_inactive] + target[:prediction] = "inactive" + end + @predicted_targets << target + end + end + @predicted_targets.sort{|a,b| b[:p_active] <=> a[:p_active]} + end + @predicted_targets end def to_smiles - RestClient.get(File.join(@uri, "compound", "cid", @cid, "property", "CanonicalSMILES", "TXT")).strip + RestClient.get(File.join(@pug_uri, "compound", "cid", cid.to_s, "property", "CanonicalSMILES", "TXT")).strip end - def name - RestClient.get(File.join(@uri, "compound", "cid", @cid, "property", "IUPACName", "TXT")).strip + def tanimoto compound + f1 = File.open(File.join(".","tmp",SecureRandom.uuid+".smi"),"w+") + f1.puts to_smiles + f1.close + f2 = File.open(File.join(".","tmp",SecureRandom.uuid+".smi"),"w+") + f2.puts compound.to_smiles + f2.close + sim = `babel #{f1.path} #{f2.path} -ofpt 2>/dev/null| grep Tanimoto|cut -d "=" -f2`.strip.to_f + File.delete(f1.path) + File.delete(f2.path) + sim end - def image - File.join @uri, "compound", "cid", @cid, "PNG?record_type=3d&image_size=small" + def extract_result columns, table + table.each do |row| + @summary << {} + row.each_with_index do |cell,i| + if columns[i] == "CID" + @cid = cell if @cid.nil? + else + cell.blank? ? @summary.last[columns[i]] = nil : @summary.last[columns[i]] = cell + end + end + end + end + +=begin + def assay_summary assay + if assay["Target GI"] and !@assays[assay["AID"]] + @assays[assay["AID"]] = {"nr_active" => 0, "nr_inactive" => 0} + pubchem_search File.join(@pug_uri, "assay", "aid", assay["AID"].to_s, "cids", "JSON?cids_type=active") + @assays[assay["AID"]]["nr_active"] = @result["InformationList"]["Information"].first["CID"].size if @result + pubchem_search File.join(@pug_uri, "assay", "aid", assay["AID"].to_s, "cids", "JSON?cids_type=inactive") + @assays[assay["AID"]]["nr_inactive"] = @result["InformationList"]["Information"].first["CID"].size if @result + print "getting (in)actives for aid " + puts assay["AID"] + print @assays[assay["AID"]]["nr_active"] + print " " + puts @assays[assay["AID"]]["nr_inactive"] + File.open("assays.json","w+"){|f| f.puts @assays.to_json} + end end +=end + +=begin def properties properties = [ @@ -99,104 +229,82 @@ module PubChem "EffectiveRotorCount3D", "ConformerCount3D", ] - pubchem_search File.join(@uri, "compound", "cid", @cid, "property", properties.join(","), "JSON") + pubchem_search File.join(@pug_uri, "compound", "cid", @cid, "property", properties.join(","), "JSON") @result["PropertyTable"]["Properties"].first end def from_smiles smiles - pubchem_search File.join(@uri, "compound", "smiles", smiles, "assaysummary", "JSON") - from_summary @result["Table"]["Columns"]["Column"], @result["Table"]["Row"].collect{|cell| cell.values.flatten} + pubchem_search File.join(@pug_uri, "compound", "smiles", smiles, "assaysummary", "JSON") + extract_result @result["Table"]["Columns"]["Column"], @result["Table"]["Row"].collect{|cell| cell.values.flatten} + end + def property_similarity compound + svd = OpenTox::SVD.new(GSL::Matrix [[properties, compound.properties]]) + OpenTox::Algorithm::Similarity.cosine svd.data_transformed_matrix.first, svd.data_transformed_matrix.last end - def from_summary columns, table - table.each do |row| - @summary << {} - row.each_with_index do |cell,i| - if columns[i] == "CID" - @cid = cell if @cid.nil? - else - cell.blank? ? @summary.last[columns[i]] = nil : @summary.last[columns[i]] = cell - end - end - end + def assay_similarity compound + tanimoto [[active_assays,inactive_assays],[compound.active_assays,compound.inactive_assays]] end - def active_assays - @summary.collect{|a| a if a["Activity Outcome"] == "active"}.compact + def target_similarity compound + tanimoto [[targets,non_targets],[compound.targets,compound.non_targets]] end - def inactive_assays - @summary.collect{|a| a if a["Activity Outcome"] == "inactive"}.compact + def tanimoto features + common = features.first.flatten & features.last.flatten + same_outcome = (features.first.first & features.last.first) + (features.first.last & features.last.last) + same_outcome.size.to_f/common.size end - def targets - active_assays.collect{|a| a["Target Name"]}.compact + def euclid features end - def non_targets - inactive_assays.collect{|a| a["Target Name"]}.compact + def to_name + RestClient.get(File.join(@pug_uri, "compound", "cid", @cid, "property", "IUPACName", "TXT")).strip end - def assay_similarity compound - a1 = active_assays.collect{|a| a["Assay Name"]} - a2 = compound.active_assays.collect{|a| a["Assay Name"]} - i1 = inactive_assays.collect{|a| a["Assay Name"]} - i2 = compound.inactive_assays.collect{|a| a["Assay Name"]} - self_assays = a1 + i1 - compound_assays = a2 + i2 - common_assays = self_assays & compound_assays - same_outcome = (a1 & a2) + (i1 & i2) - same_outcome.size.to_f/common_assays.size + def to_image_uri + File.join @pug_uri, "compound", "cid", @cid, "PNG?record_type=3d&image_size=small" end +=end - def target_similarity compound - self_assays = targets + non_targets - compound_assays = compound.targets + compound.non_targets - common_assays = self_assays & compound_assays - same_outcome = (targets & compound.targets) + (non_targets & compound.non_targets) - same_outcome.size.to_f/common_assays.size - end - - def assay_genes - active = [] - @aids[:active].each do |aid| - begin - pubchem_search File.join(@uri, "assay", "aid", aid.to_s, "genes", "JSON") - active << @result["InformationList"]["Information"].collect{|i| i["GeneID"]}.flatten - rescue; end - end - @aids[:inactive].each do |aid| - begin - pubchem_search File.join(@uri, "assay", "aid", aid.to_s, "genes", "JSON") - inactive << @result["InformationList"]["Information"].collect{|i| i["GeneID"]}.flatten - rescue; end - end - {:active => active, :inactive => inactive } + end + +=begin + class PubChemNeighbors < Dataset + include PubChem + + attr_accessor :query, :neighbors + + def initialize + @similarity_threshold = 95 + @neighbors = [] + @pug_uri = "http://pubchem.ncbi.nlm.nih.gov/rest/pug/" end - def get_neighbors smiles - pubchem_search File.join(@uri, "compound", "similarity", "smiles", smiles, "JSON")+"?Threshold=#{@similarity_threshold}&MaxRecords=250" + def from_smiles smiles + #@query = PubChemCompound.new.from_smiles smiles + pubchem_search File.join(@pug_uri, "compound", "similarity", "smiles", smiles, "JSON")+"?Threshold=#{@similarity_threshold}&MaxRecords=250" listkey = @result["Waiting"]["ListKey"] while @result["Waiting"] do sleep 1 - pubchem_search File.join(@uri, "compound", "listkey", listkey, "assaysummary", "JSON") + pubchem_search File.join(@pug_uri, "compound", "listkey", listkey, "assaysummary", "JSON") end - File.open("search.yaml","w+"){|s| s.puts @result.to_yaml} + #File.open("search.yaml","w+"){|s| s.puts @result.to_yaml} columns = @result["Table"]["Columns"]["Column"] table = @result["Table"]["Row"].collect{|cell| cell.values.flatten} cid_idx = columns.index("CID") cids = table.collect{|r| r[cid_idx]}.uniq cids.each do |cid| tab = table.collect{|r| r if r[cid_idx] == cid}.compact - c = PubChem::Compound.new - c.from_summary columns, tab + c = PubChemCompound.new + c.extract_result columns, tab @neighbors << c unless (c.targets + c.active_assays).flatten.compact.empty? end - File.open("smiles.smi","w+"){|f| f.puts @neighbors.collect{|n| n.to_smiles}.join("\n")} - `babel smiles.smi -ofpt 2>/dev/null| grep Tanimoto|cut -d "=" -f2`.split("\n").each_with_index do |t,i| - @neighbors[i].tanimoto = t.strip.to_f - end + @query = @neighbors.shift + File.open("search.yaml","w+"){|s| s.puts self.to_yaml} + #puts @neighbors.query.to_name end end +=end end - diff --git a/test.rb b/test.rb new file mode 100644 index 0000000..bd15dcd --- /dev/null +++ b/test.rb @@ -0,0 +1,51 @@ +require 'test/unit' +#require 'yaml' +require './pubchem.rb' + +class MRATest < Test::Unit::TestCase + + def setup + #@p = PubChem::Compound.new "c1cc(CC)ccc1" + @p = PubChem::Compound.new + @p.from_smiles "CC(=O)Nc1ccc(O)cc1" + end + + def test_initialize + puts @p.active_assays.size + puts @p.inactive_assays.size + puts @p.targets.size + puts @p.non_targets.size + assert_equal "7500", @p.cid + #assert_equal true, @p.aids[:active].include?(1188) + #assert_equal true, @p.aids[:inactive].include?(435) + end + + def test_similarity_search + #puts @p.to_smiles + @p.neighbors.each do |n| + #puts n.to_smiles + puts @p.target_similarity(n) + end + #puts @p.neighbors.inspect + #assert_equal 100, @p.neighbor_cids.size + end + + def test_assay_description + puts @p.assay_description.to_yaml + end + + def test_assay_genes + puts @p.assay_genes.to_yaml + end + + def test_assay_similarity + @p2 = PubChem::Compound.new "OC(=O)C1=C(C=CC=C1)OC(=O)C" + puts @p.assay_similarity(@p2) + end + + def test_target_similarity + @p2 = PubChem::Compound.new "OC(=O)C1=C(C=CC=C1)OC(=O)C" + puts @p.target_similarity(@p2) + end + +end diff --git a/views/index.haml b/views/index.haml index c8481c3..b75a9ab 100644 --- a/views/index.haml +++ b/views/index.haml @@ -1,47 +1,27 @@ !!! 5 -= @compound.neighbors.size -%table +%script{:type => "text/javascript", :src => "sorttable.js"} +%table{:class => "sortable"} %tr + %th + = @neighbors.query.to_name + %br + %img{:src => @neighbors.query.to_image_uri} %th Structure %th Properties %th Targets %th Assays - - @compound.neighbors.each do |neighbor| - %tr + - @neighbors.neighbors.each do |neighbor| + - sim = neighbor.structure_similarity @neighbors.query + %tr{:sorttable_customkey => sim} %td - = neighbor.name - %img{:src => neighbor.image} - %dl - %dt Similarity - %dd - = neighbor.tanimoto + = neighbor.to_name + %br + %img{:src => neighbor.to_image_uri} %td - %dl - - neighbor.properties.each do |p,v| - %dt - = p - %dd - = v + = sim %td - %dl - %dt Similarity - %dd - = neighbor.target_similarity @compound.neighbors.first - %dt Targets - %dd - = neighbor.targets.uniq - %dt Total - %dd - = neighbor.targets.size + neighbor.non_targets.size + = neighbor.property_similarity @neighbors.query %td - %dl - %dt Similarity - %dd - = neighbor.assay_similarity @compound.neighbors.first - %dt Active - %dd - = neighbor.active_assays.collect{|a| a["Assay Name"]} - %dt Total - %dd - = neighbor.active_assays.size + neighbor.inactive_assays.size - =# neighbor.inactive_assays.collect{|a| a["Assay Name"]} + = neighbor.target_similarity @neighbors.query + %td + = neighbor.assay_similarity @neighbors.query -- cgit v1.2.3