summary | refs | log | tree | commit | diff
path: root/lib
diff options
context:
space:
mode:
author: helma@in-silico.ch <helma@in-silico.ch> 2018-11-13 14:32:09 +0100
committer: helma@in-silico.ch <helma@in-silico.ch> 2018-11-13 14:32:09 +0100
commit: 6e23be652ad90c747aaccf15258bdaa4458185a4 (patch)
tree: bcd9fe38d492217d00f983bab119bbb44588a837 /lib
parent: 8649795b3d5d63f227eed030286270b91ec39c68 (diff)
public dataset download
Diffstat (limited to 'lib')
-rw-r--r-- lib/download.rb 201
-rw-r--r-- lib/import.rb 13
-rw-r--r-- lib/lazar.rb 4
-rw-r--r-- lib/model.rb 3
4 files changed, 216 insertions, 5 deletions
diff --git a/lib/download.rb b/lib/download.rb
index 2f6b4f1..99d8842 100644
--- a/lib/download.rb
+++ b/lib/download.rb
@@ -22,6 +22,7 @@ module OpenTox
end
warnings = []
name = endpoint.gsub(" ","_")+"-"+species.gsub(" ","_")
+ $logger.debug name
table = [["SID","SMILES",name]]
csv.each_slice(100) do |slice| # get SMILES in chunks, size limit is 100
cids = slice.collect{|s| s[2]}
@@ -45,7 +46,58 @@ module OpenTox
meta = {
:species => species,
:endpoint => endpoint,
- :source => aid_url,
+ :source => "https://pubchem.ncbi.nlm.nih.gov/bioassay/#{aid}",
+ :qmrf => qmrf,
+ :warnings => warnings
+ }
+ File.open(File.join(File.dirname(__FILE__),"..","data",name+".json"),"w+"){|f| f.puts meta.to_json}
+ end
+
+ def self.pubchem_regression aid: , species: , endpoint:, qmrf: nil
+ aid_url = File.join PUBCHEM_URI, "assay/aid/#{aid}"
+
+ # Get assay data in chunks
+ # Assay record retrieval is limited to 10000 SIDs
+ # https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest-tutorial$_Toc458584435
+ list = JSON.parse(RestClientWrapper.get(File.join aid_url, "sids/JSON?list_return=listkey").to_s)["IdentifierList"]
+ listkey = list["ListKey"]
+ size = list["Size"]
+ start = 0
+ csv = []
+ unit = nil
+ while start < size
+ url = File.join aid_url, "CSV?sid=listkey&listkey=#{listkey}&listkey_start=#{start}&listkey_count=10000"
+ # get unit
+ unit ||= CSV.parse(RestClientWrapper.get(url).to_s).select{|r| r[0] == "RESULT_UNIT"}[0][8]
+ csv += CSV.parse(RestClientWrapper.get(url).to_s).select{|r| r[0].match /^\d/} # discard header rows
+ start += 10000
+ end
+ warnings = []
+ name = endpoint.gsub(" ","_")+"-"+species.gsub(" ","_")
+ $logger.debug name
+ table = [["SID","SMILES","-log10(#{name} [#{unit}])"]]
+ csv.each_slice(100) do |slice| # get SMILES in chunks, size limit is 100
+ cids = slice.collect{|s| s[2]}
+ pubchem_cids = []
+ JSON.parse(RestClientWrapper.get(File.join(PUBCHEM_URI,"compound/cid/#{cids.join(",")}/property/CanonicalSMILES/JSON")).to_s)["PropertyTable"]["Properties"].each do |prop|
+ i = cids.index(prop["CID"].to_s)
+ value = slice[i][8]
+ if value
+ value = -Math.log10(value.to_f)
+ table << [slice[i][1].to_s,prop["CanonicalSMILES"],value]
+ pubchem_cids << prop["CID"].to_s
+ else
+ warnings << "Ignoring CID #{prop["CID"]}/ SMILES #{prop["CanonicalSMILES"]}, because PubChem activity is '#{value}'."
+ end
+ end
+ (cids-pubchem_cids).each { |cid| warnings << "Could not retrieve SMILES for CID '#{cid}', all entries are ignored." }
+ end
+ File.open(File.join(File.dirname(__FILE__),"..","data",name+".csv"),"w+"){|f| f.puts table.collect{|row| row.join(",")}.join("\n")}
+ meta = {
+ :species => species,
+ :endpoint => endpoint,
+ :source => "https://pubchem.ncbi.nlm.nih.gov/bioassay/#{aid}",
+ :unit => unit,
:qmrf => qmrf,
:warnings => warnings
}
@@ -53,6 +105,7 @@ module OpenTox
end
def self.mutagenicity
+ $logger.debug "Mutagenicity"
# TODO add download/conversion programs to lazar dependencies
hansen_url = "http://doc.ml.tu-berlin.de/toxbenchmark/Mutagenicity_N6512.csv"
kazius_url = "http://cheminformatics.org/datasets/bursi/cas_4337.zip"
@@ -68,6 +121,7 @@ module OpenTox
# convert hansen
hansen = CSV.read File.join(parts,"hansen-original.csv")
hansen.shift
+
map = {"0" => "non-mutagenic","1" => "mutagenic"}
File.open(File.join(parts,"hansen.csv"),"w+") do |f|
f.puts "ID,SMILES,Mutagenicity"
@@ -122,12 +176,155 @@ module OpenTox
:source => [kazius_url,hansen_url,efsa_url].join(", "),
:qmrf => { "group": "QMRF 4.10. Mutagenicity", "name": "OECD 471 Bacterial Reverse Mutation Test"},
}
- File.open(File.join(File.dirname(__FILE__),"..","data","Mutagenicity-Salmonella_typhimurium.json"),"w+"){|f| f.puts meta.to_json}
+ File.open(File.join(DATA,"Mutagenicity-Salmonella_typhimurium.json"),"w+"){|f| f.puts meta.to_json}
# cleanup
datasets << dataset
datasets.each{|d| d.delete }
end
+ def self.blood_brain_barrier
+ url = "http://cheminformatics.org/datasets/li/bbp2.smi"
+ name = "Blood_Brain_Barrier_Penetration-Human"
+ $logger.debug name
+ map = {"n" => "non-penetrating", "p" => "penetrating"}
+ table = CSV.parse RestClientWrapper.get(url).to_s, :col_sep => "\t"
+ File.open(File.join(DATA,name+".csv"),"w+") do |f|
+ f.puts "ID,SMILES,#{name}"
+ table.each do |row|
+ f.puts [row[1],row[0],map[row[3]]].join(",")
+ end
+ end
+ meta = {
+ :species => "Human",
+ :endpoint => "Blood Brain Barrier Penetration",
+ :source => url,
+ :qmrf => {"name": "QMRF 5.4. Toxicokinetics.Blood-brain barrier penetration", "group": "QMRF 5. Toxicokinetics"},
+ }
+ File.open(File.join(DATA,name+".json"),"w+"){|f| f.puts meta.to_json}
+ end
+
+ def self.loael
+ # TODO: fix url??
+ url = "https://raw.githubusercontent.com/opentox/loael-paper/revision/data/training_log10.csv"
+ name = "Lowest_observed_adverse_effect_level-Rats"
+ $logger.debug name
+ File.open(File.join(DATA,name+".csv"),"w+") do |f|
+ f.puts RestClientWrapper.get(url).to_s
+ end
+ meta = {
+ :species => "Rat",
+ :endpoint => "Lowest observed adverse effect level",
+ :source => url,
+ :unit => "mmol/kg_bw/day",
+ :qmrf => {
+ "name": "QMRF 4.14. Repeated dose toxicity",
+ "group": "QMRF 4.Human Health Effects"
+ }
+ }
+ File.open(File.join(DATA,name+".json"),"w+"){|f| f.puts meta.to_json}
+ end
+
+ def self.daphnia
+ # download of original file requires email request, this is a temporary solution
+ url = "https://raw.githubusercontent.com/opentox/lazar-public-data/master/regression/daphnia_magna_mmol_log10.csv"
+ name = "Acute_toxicity-Daphnia_magna"
+ $logger.debug name
+ File.open(File.join(DATA,name+".csv"),"w+") do |f|
+ f.puts RestClientWrapper.get(url).to_s
+ end
+ meta = { "species": "Daphnia magna",
+ "endpoint": "Acute toxicity",
+ "source": "http://www.michem.unimib.it/download/data/acute-aquatic-toxicity-to-daphnia-magna/",
+ "unit": "mmol/L",
+ "qmrf": {
+ "group": "QMRF 3.1. Short-term toxicity to Daphnia (immobilisation)",
+ "name": "EC C. 2. Daphnia sp Acute Immobilisation Test"
+ }
+ }
+ File.open(File.join(DATA,name+".json"),"w+"){|f| f.puts meta.to_json}
+ end
+
+ def self.public_data
+
+ # Classification
+ [
+ {
+ :aid => 1205,
+ :species => "Rodents",
+ :endpoint => "Carcinogenicity",
+ :qmrf => {:group => "QMRF 4.12. Carcinogenicity", :name => "OECD 451 Carcinogenicity Studies"}
+ },{
+ :aid => 1208,
+ :species => "Rat",
+ :endpoint => "Carcinogenicity",
+ :qmrf => {:group => "QMRF 4.12. Carcinogenicity", :name => "OECD 451 Carcinogenicity Studies"}
+ },{
+ :aid => 1199,
+ :species => "Mouse",
+ :endpoint => "Carcinogenicity",
+ :qmrf => {:group => "QMRF 4.12. Carcinogenicity", :name => "OECD 451 Carcinogenicity Studies"}
+ }
+ ].each do |assay|
+ Download.pubchem_classification aid: assay[:aid], species: assay[:species], endpoint: assay[:endpoint], active: "carcinogen", inactive: "non-carcinogen", qmrf: assay[:qmrf]
+ end
+ Download.mutagenicity
+ Download.blood_brain_barrier
+
+ # Regression
+ [
+ {
+ :aid => 1195,
+ :species => "Human",
+ :endpoint => "Maximum Recommended Daily Dose",
+ :qmrf => {
+ "group": "QMRF 4.14. Repeated dose toxicity",
+ "name": "OECD 452 Chronic Toxicity Studies"
+ },
+ },{
+ :aid => 1208,
+ :species => "Rat (TD50)",
+ :endpoint => "Carcinogenicity",
+ :qmrf => {
+ :group => "QMRF 4.12. Carcinogenicity",
+ :name => "OECD 451 Carcinogenicity Studies"
+ }
+ },{
+ :aid => 1199,
+ :species => "Mouse (TD50)",
+ :endpoint => "Carcinogenicity",
+ :qmrf => {
+ :group => "QMRF 4.12. Carcinogenicity",
+ :name => "OECD 451 Carcinogenicity Studies"
+ }
+ },{
+ :aid => 1188,
+ :species => "Fathead minnow",
+ :endpoint => "Acute toxicity",
+ :qmrf => {
+ "group": "QMRF 3.3. Acute toxicity to fish (lethality)",
+ "name": "EC C. 1. Acute Toxicity for Fish"
+ }
+ }
+ ].each do |assay|
+ Download.pubchem_regression aid: assay[:aid], species: assay[:species], endpoint: assay[:endpoint], qmrf: assay[:qmrf]
+ end
+
+ Download.loael
+ Download.daphnia
+
+=begin
+ # 1204 estrogen receptor
+ # 1259408, # GENE-TOX
+ # 1159563 HepG2 cytotoxicity assay
+ # 588209 hepatotoxicity
+ # 1259333 cytotoxicity
+ # 1159569 HepG2 cytotoxicity counterscreen Measured in Cell-Based System Using Plate Reader - 2153-03_Inhibitor_Dose_DryPowder_Activity
+ # 2122 HTS Counterscreen for Detection of Compound Cytotoxicity in MIN6 Cells
+ # 116724 Acute toxicity determined after intravenal administration in mice
+ # 1148549 Toxicity in po dosed mouse assessed as mortality after 7 days
+=end
+ end
+
end
end
diff --git a/lib/import.rb b/lib/import.rb
new file mode 100644
index 0000000..831efcb
--- /dev/null
+++ b/lib/import.rb
@@ -0,0 +1,13 @@
+module OpenTox
+
+ class Import
+
+ def self.public_data
+ # TODO clear database?
+ Dir[File.join(File.dirname(__FILE__),"..","data/*csv")].each do |f|
+ $logger.debug f
+ Model::Validation.from_csv_file f
+ end
+ end
+ end
+end
diff --git a/lib/lazar.rb b/lib/lazar.rb
index 6f14f67..c3bbbf3 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -97,6 +97,6 @@ CLASSES = ["Feature","Substance","Dataset","CrossValidation","LeaveOneOutValidat
"train-test-validation.rb",
"leave-one-out-validation.rb",
"crossvalidation.rb",
- "download.rb"
- #"import.rb",
+ "download.rb",
+ "import.rb",
].each{ |f| require_relative f }
diff --git a/lib/model.rb b/lib/model.rb
index 966460b..70ae43c 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -422,6 +422,7 @@ module OpenTox
field :species, type: String
field :source, type: String
field :unit, type: String
+ field :warnings, type: Array
field :model_id, type: BSON::ObjectId
field :repeated_crossvalidation_id, type: BSON::ObjectId
@@ -494,7 +495,7 @@ module OpenTox
# Create and validate a lazar model from a csv file with training data and a json file with metadata
# @param [File] CSV file with two columns. The first line should contain either SMILES or InChI (first column) and the endpoint (second column). The first column should contain either the SMILES or InChI of the training compounds, the second column the training compounds toxic activities (qualitative or quantitative). Use -log10 transformed values for regression datasets. Add metadata to a JSON file with the same basename containing the fields "species", "endpoint", "source" and "unit" (regression only). You can find example training data at https://github.com/opentox/lazar-public-data.
- # @return [OpenTox::Model::Validation] lazar model with three independent 10-fold crossvalidations
+ # @return [OpenTox::Model::Validation] lazar model with five independent 10-fold crossvalidations
def self.from_csv_file file
metadata_file = file.sub(/csv$/,"json")
bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file