author    helma@in-silico.ch <helma@in-silico.ch>  2018-10-31 14:50:42 +0100
committer helma@in-silico.ch <helma@in-silico.ch>  2018-10-31 14:50:42 +0100
commit    5b08a8c6d8e5567d253bec92d5bf5d18fd040cdc (patch)
tree      9cd4bf4a79ff09771e51bafbc828a088d975bf66
parent    2d4ce39cb1b489e26b0d6d96026054566a4f77b9 (diff)
pubchem import for openrisknet
-rw-r--r--  lib/dataset.rb     41
-rw-r--r--  test/use_cases.rb  50
2 files changed, 63 insertions(+), 28 deletions(-)
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 46a83d7..d02a302 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -207,21 +207,40 @@ module OpenTox
# @param [Integer] PubChem AssayID (AID)
# @return [OpenTox::Dataset]
def self.from_pubchem_aid aid
- url = File.join PUBCHEM_URI, "assay/aid/#{aid}/CSV"
- assay_metadata = JSON.parse(RestClientWrapper.get(File.join PUBCHEM_URI,"assay/aid/#{aid}/description/JSON").to_s)["PC_AssayContainer"][0]["assay"]["descr"]
+ # TODO get regression data
+ aid_url = File.join PUBCHEM_URI, "assay/aid/#{aid}"
+ assay_metadata = JSON.parse(RestClientWrapper.get(File.join aid_url,"description/JSON").to_s)["PC_AssayContainer"][0]["assay"]["descr"]
name = assay_metadata["name"].gsub(/\s+/,"_")
- csv = CSV.parse(RestClientWrapper.get(url))
- csv.select!{|r| r[0].match /^\d/} # discard header rows
+ dataset = self.new(:source => aid_url, :name => name)
+ # Get assay data in chunks
+ # Assay record retrieval is limited to 10000 SIDs
+ # https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest-tutorial#_Toc458584435
+ list = JSON.parse(RestClientWrapper.get(File.join aid_url, "sids/JSON?list_return=listkey").to_s)["IdentifierList"]
+ listkey = list["ListKey"]
+ size = list["Size"]
+ start = 0
+ csv = []
+ while start < size
+ url = File.join aid_url, "CSV?sid=listkey&listkey=#{listkey}&listkey_start=#{start}&listkey_count=10000"
+ csv += CSV.parse(RestClientWrapper.get(url).to_s).select{|r| r[0].match /^\d/} # discard header rows
+ start += 10000
+ end
table = [["SID","SMILES",name]]
csv.each_slice(100) do |slice| # get SMILES in chunks
- sids = slice.collect{|s| s[1]}
- smiles = RestClientWrapper.get(File.join(PUBCHEM_URI,"compound/cid/#{sids.join(",")}/property/CanonicalSMILES/TXT")).split("\n").collect{|s| s.to_s}
- abort("Could not get SMILES for all SIDs from PubChem") unless sids.size == smiles.size
- smiles.each_with_index do |smi,i|
- table << [slice[i][1].to_s,smi.chomp,slice[i][3].to_s]
+ cids = slice.collect{|s| s[2]}
+ pubchem_cids = []
+ JSON.parse(RestClientWrapper.get(File.join(PUBCHEM_URI,"compound/cid/#{cids.join(",")}/property/CanonicalSMILES/JSON")).to_s)["PropertyTable"]["Properties"].each do |prop|
+ i = cids.index(prop["CID"].to_s)
+ value = slice[i][3]
+ if value == "Active" or value == "Inactive"
+ table << [slice[i][1].to_s,prop["CanonicalSMILES"],slice[i][3].to_s]
+ pubchem_cids << prop["CID"].to_s
+ else
+ dataset.warnings << "Ignoring CID #{prop["CID"]} / SMILES #{prop["CanonicalSMILES"]}, because PubChem activity is #{value}."
+ end
end
+ (cids-pubchem_cids).each { |cid| dataset.warnings << "Could not retrieve SMILES for CID #{cid}, all entries are ignored." }
end
- dataset = self.new(:source => url, :name => name)
dataset.parse_table table
dataset
end
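
Note on the hunk above: PubChem's PUG REST API caps assay record retrieval at 10000 SIDs per request, so the new code first requests a listkey, pages through the assay CSV with listkey_start/listkey_count, and then resolves SMILES for the CID column in chunks of 100. Below is a minimal standalone sketch of the same scheme. It assumes the base URL matches lazar's PUBCHEM_URI and substitutes Net::HTTP for the library's RestClientWrapper; the AID is the Tox21 assay used in test_tox21 further down.

require "net/http"
require "json"
require "csv"

pug = "https://pubchem.ncbi.nlm.nih.gov/rest/pug" # assumed to match PUBCHEM_URI
aid = 743122                                      # illustrative Tox21 AID
get = ->(url) { Net::HTTP.get(URI(url)) }

# Step 1: ask for a listkey instead of the full SID list.
list = JSON.parse(get.("#{pug}/assay/aid/#{aid}/sids/JSON?list_return=listkey"))["IdentifierList"]
listkey, size = list["ListKey"], list["Size"]

# Step 2: page through the assay table, 10000 SIDs at a time.
rows = []
start = 0
while start < size
  url = "#{pug}/assay/aid/#{aid}/CSV?sid=listkey&listkey=#{listkey}&listkey_start=#{start}&listkey_count=10000"
  rows += CSV.parse(get.(url)).select { |r| r[0].to_s =~ /^\d/ } # drop repeated header rows
  start += 10000
end

# Step 3: resolve SMILES for the CID column (index 2) in chunks of 100,
# matching results back by CID because PubChem may omit entries.
# Only the first chunk is fetched here, to keep the sketch quick.
rows.first(100).each_slice(100) do |slice|
  cids = slice.collect { |r| r[2] }
  props = JSON.parse(get.("#{pug}/compound/cid/#{cids.join(",")}/property/CanonicalSMILES/JSON"))["PropertyTable"]["Properties"]
  found = props.collect { |p| p["CID"].to_s }
  props.each { |p| puts "#{p["CID"]}\t#{p["CanonicalSMILES"]}" }
  (cids - found).each { |cid| warn "no SMILES for CID #{cid}" }
end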
@@ -315,7 +334,7 @@ module OpenTox
positions = []
all_substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.smiles and c.smiles == substance.smiles}
all_substances.select{|s| s.smiles == substance.smiles}.each do |s|
- add s, warnings_feature, "Duplicate compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
+ add s, warnings_feature, "Duplicated compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
end
end
save
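
The second hunk in dataset.rb only rewords the duplicate warning; the surrounding context shows how positions is built: the 1-based row indices of every substance sharing the same SMILES. A plain-array sketch of that bookkeeping, with illustrative SMILES:

smiles_list = ["CCO", "c1ccccc1", "CCO", "CCN", "CCO"] # illustrative
target = "CCO"
positions = []
smiles_list.each_with_index { |smi, i| positions << i + 1 if smi == target }
puts "Duplicated compound #{target} at rows #{positions.join(', ')}."
# => Duplicated compound CCO at rows 1, 3, 5.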
diff --git a/test/use_cases.rb b/test/use_cases.rb
index 4959f16..4e072d8 100644
--- a/test/use_cases.rb
+++ b/test/use_cases.rb
@@ -3,27 +3,43 @@ require_relative "setup.rb"
class UseCasesTest < MiniTest::Test
def test_PA
- kazius = Dataset.from_sdf_file "#{DATA_DIR}/cas_4337.sdf"
- hansen = Dataset.from_csv_file "#{DATA_DIR}/hansen.csv"
- efsa = Dataset.from_csv_file "#{DATA_DIR}/efsa.csv"
- datasets = [kazius,hansen,efsa]
- map = {"1" => "mutagen", "0" => "nonmutagen"}
- p "merging"
- training_dataset = Dataset.merge datasets: datasets, features: datasets.collect{|d| d.bioactivity_features.first}, value_maps: [nil,map,map], keep_original_features: false, remove_duplicates: true
- assert_equal 8281, training_dataset.compounds.size
- p training_dataset.features.size
- p training_dataset.id
- training_dataset = Dataset.find('5bd8ac8fca62695d767fca6b')
+ #kazius = Dataset.from_sdf_file "#{DATA_DIR}/cas_4337.sdf"
+ #hansen = Dataset.from_csv_file "#{DATA_DIR}/hansen.csv"
+ #efsa = Dataset.from_csv_file "#{DATA_DIR}/efsa.csv"
+ #datasets = [kazius,hansen,efsa]
+ #map = {"1" => "mutagen", "0" => "nonmutagen"}
+ #p "merging"
+ #training_dataset = Dataset.merge datasets: datasets, features: datasets.collect{|d| d.bioactivity_features.first}, value_maps: [nil,map,map], keep_original_features: false, remove_duplicates: true
+ #assert_equal 8281, training_dataset.compounds.size
+ #p training_dataset.features.size
+ #p training_dataset.id
+ #training_dataset = Dataset.find('5bd8ac8fca62695d767fca6b')
+ #training_dataset = Dataset.find('5bd8bbadca62695f69e7a33b')
+ #puts training_dataset.to_csv
p "create model_validation"
- model_validation = Model::Validation.from_dataset training_dataset: training_dataset, prediction_feature: training_dataset.merged_features.first, species: "Salmonella typhimurium", endpoint: "Mutagenicity"
- p model_validation.id
- p "predict"
- pa = Dataset.from_sdf_file "#{DATA_DIR}/PA.sdf"
- prediction_dataset = model_dataset.predict pa
- p prediction_dataset.id
+ #model_validation = Model::Validation.from_dataset training_dataset: training_dataset, prediction_feature: training_dataset.merged_features.first, species: "Salmonella typhimurium", endpoint: "Mutagenicity"
+ #p model_validation.id
+ #model_validation = Model::Validation.find '5bd8df47ca6269604590ab38'
+ #p "predict"
+ #pa = Dataset.from_sdf_file "#{DATA_DIR}/PA.sdf"
+ #prediction_dataset = model_validation.predict pa
+ #p prediction_dataset.id
+ prediction_dataset = Dataset.find('5bd98b88ca6269609aab79f4')
puts prediction_dataset.to_csv
end
+ def test_tox21
+ training_dataset = Dataset.from_pubchem_aid 743122
+ p training_dataset.id
+ #'5bd9a1dbca626969d97fb421'
+ File.open("AID743122.csv","w+"){|f| f.puts training_dataset.to_csv}
+ model = Model::Lazar.create training_dataset: training_dataset
+ p model.id
+ #p Model::Lazar.find('5bd9a70bca626969d97fc9df')
+ model_validation = Model::Validation.from_dataset training_dataset: training_dataset, prediction_feature: training_dataset.bioactivity_features.first, species: "Human HG2L7.5c1 cell line", endpoint: "aryl hydrocarbon receptor (AhR) signaling pathway activation"
+ p model_validation.id
+ end
+
def test_public_models
skip
=begin
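
For reference, the live path through test_tox21 (plus the prediction step that test_PA keeps commented out), condensed into one sketch. It assumes a working lazar test environment (MongoDB backend, DATA_DIR) and uses only calls that appear in the diff above.

# Build a training dataset from a PubChem assay and validate a model on it.
training_dataset = Dataset.from_pubchem_aid 743122
File.open("AID743122.csv", "w+") { |f| f.puts training_dataset.to_csv }
model_validation = Model::Validation.from_dataset(
  training_dataset: training_dataset,
  prediction_feature: training_dataset.bioactivity_features.first,
  species: "Human HG2L7.5c1 cell line",
  endpoint: "aryl hydrocarbon receptor (AhR) signaling pathway activation"
)
# Predict an external SDF file, mirroring the commented-out lines in test_PA.
pa = Dataset.from_sdf_file "#{DATA_DIR}/PA.sdf"
prediction_dataset = model_validation.predict pa
puts prediction_dataset.to_csv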