From a0d9f7bf7481b257acbd7d7629558b3251e653f8 Mon Sep 17 00:00:00 2001 From: mguetlein Date: Tue, 28 Oct 2014 08:56:48 +0100 Subject: add fetch, change URI storing in config.rb, add documentation in README --- nch/01_fetch.sh | 31 ++++++++++++++++++++++ nch/03_validate_compounds.rb | 4 +-- nch/README.md | 10 ++++++-- nch/config.rb | 61 ++++++++++++++++++++++++-------------------- 4 files changed, 75 insertions(+), 31 deletions(-) create mode 100755 nch/01_fetch.sh diff --git a/nch/01_fetch.sh b/nch/01_fetch.sh new file mode 100755 index 0000000..4af8b00 --- /dev/null +++ b/nch/01_fetch.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +cd data/01 + +rm *LOAEL*mol* +wget https://raw.githubusercontent.com/opentox/test/TD50/LOAEL/LOAEL_mol.csv +mv LOAEL_mol.csv LOAEL-mol_endpoint_enc.csv + +wget https://raw.githubusercontent.com/opentox/test/TD50/LOAEL/LOAEL_topological_mg_mol.csv +mv LOAEL_topological_mg_mol.csv LOAEL-mol_orig-features_enc.csv +echo "" >> LOAEL-mol_orig-features_enc.csv + +rm *LOAEL*mg* +wget https://raw.githubusercontent.com/opentox/test/TD50/LOAEL/LOAEL_mg.csv +mv LOAEL_mg.csv LOAEL-mg_endpoint_enc.csv + +rm *MOU* +wget https://raw.githubusercontent.com/opentox/test/TD50/TD50/MOU_training.csv +mv MOU_training.csv MOU_endpoint_enc.csv +wget https://raw.githubusercontent.com/opentox/test/TD50/TD50/MOU_test.csv +mv MOU_test.csv MOU_test_enc.csv +echo "" >> MOU_test_enc.csv +cp MOU_endpoint_enc.csv MOU_complete_enc.csv +tail -n40 MOU_test_enc.csv >> MOU_complete_enc.csv + +wget https://raw.githubusercontent.com/opentox/test/TD50/TD50/MOU_constitutional_06e4762c88_report.csv +mv MOU_constitutional_06e4762c88_report.csv MOU_orig-features_enc.csv +echo "" >> MOU_orig-features_enc.csv + +cd - + diff --git a/nch/03_validate_compounds.rb b/nch/03_validate_compounds.rb index 1ddb719..304add2 100755 --- a/nch/03_validate_compounds.rb +++ b/nch/03_validate_compounds.rb @@ -37,8 +37,8 @@ DATA.each do |d| File.open("data/03/#{d}_uniq.csv","w").puts "\""+(["InChI"]+@inchis).join("\"\n\"")+"\"" puts @inchis.size.to_s+" uniq compounds in complete-set (written to data/03/#{d}_uniq.csv)" else - raise "complete-set inchis not found in features-set #{@inchis.inspect}" unless @inchis.size==0 - puts "inchis in features-set uniq and all included in complete-set" + raise "complete-set inchis not found in orig-features-set #{@inchis.inspect}" unless @inchis.size==0 + puts "inchis in orig-features-set uniq and all included in complete-set" end end diff --git a/nch/README.md b/nch/README.md index c64bbcb..193ca31 100644 --- a/nch/README.md +++ b/nch/README.md @@ -3,6 +3,12 @@ scripts ------------------------------------- * results are stored in the data folder in the corresponding sub-folders: 01, 02, ... -* config.rb defines which datasets to employ +* config.rb defines which datasets to employ and stores URIs of already uploaded files -01_fetch - copies data from old repository \ No newline at end of file +01_fetch - copies data from old repository and converts to a consistent naming scheme +02_decode_inchi.rb - decodes inchis and renames SMILES column to InChI +03_validate_compounds.rb - checks if all compounds are included in the feature set, stores uniq compounds without duplicates +04_get_feature_names.rb - extracts new features names for features from orig files +05_compute_features.rb - computes new features +06_compare_features.rb - compares orig features and new features +07_validate.rb - starts crossvalidation/test set validation with old / new features \ No newline at end of file diff --git a/nch/config.rb b/nch/config.rb index 3151c5a..293cdc8 100644 --- a/nch/config.rb +++ b/nch/config.rb @@ -1,5 +1,28 @@ -DATA = ["LOAEL-mol", "LOAEL-mg","MOU"] +#DATA = ["LOAEL-mol", "LOAEL-mg","MOU"] +DATA = ["MOU"] + +URIS = { + "LOAEL-mol"=>{ + # :dataset_uri=>"http://localhost:8083/dataset/3da90c55-0388-42a0-8ada-978abe4a515c", + # :prediction_feature=>"http://localhost:8084/feature/2a74d78d-5b3d-438c-a1e5-6cfb16bd9354", + # :new_feature_dataset_uri=>"http://localhost:8083/dataset/7de04de4-41ce-4528-97c2-fd92fbb4d0b8", + # :orig_feature_dataset_uri=>"http://localhost:8083/dataset/e9016641-dddb-434f-bb05-63d80a37679a", + }, + "LOAEL-mg"=>{ + # :dataset_uri=>"http://localhost:8083/dataset/4f3b9de4-0494-4339-8ebd-e6c6c1984a23", + # :prediction_feature=>"http://localhost:8084/feature/ba5b0f78-36bc-4ac3-8020-9d8b2ca3bd13", + # :new_feature_dataset_uri=>"http://localhost:8083/dataset/7de04de4-41ce-4528-97c2-fd92fbb4d0b8", + # :orig_feature_dataset_uri=>"http://localhost:8083/dataset/e9016641-dddb-434f-bb05-63d80a37679a", + }, + "MOU"=>{ + # :dataset_uri=>"http://localhost:8083/dataset/f0af478a-51e6-41a5-adb2-d1a9bedf8981", + # :prediction_feature=>"http://localhost:8084/feature/432f18c5-ff8f-4ff2-a1cc-cbda1c43cff9", + # :test_dataset_uri=>"http://localhost:8083/dataset/a5c39a5d-8747-495a-8d30-6ee9abdd5f3b", + # :new_feature_dataset_uri=>"http://localhost:8083/dataset/8d324c7d-e6fe-4807-b2f8-e851750b959d", + # :orig_feature_dataset_uri=>"http://localhost:8083/dataset/cc651943-886c-4290-b346-41d1c951476a", + }, +} def info(d) puts d.uri @@ -16,13 +39,8 @@ def plz_add(msg) end def dataset_uri(d) - case d - when "LOAEL-mol" - "http://localhost:8083/dataset/3da90c55-0388-42a0-8ada-978abe4a515c" - when "LOAEL-mg" - "http://localhost:8083/dataset/4f3b9de4-0494-4339-8ebd-e6c6c1984a23" - when "MOU" - "http://localhost:8083/dataset/f0af478a-51e6-41a5-adb2-d1a9bedf8981" + if URIS[d] and URIS[d][:dataset_uri] + URIS[d][:dataset_uri] else dataset = OpenTox::Dataset.new dataset.upload File.join("data/02/#{d}_endpoint.csv") @@ -32,21 +50,16 @@ def dataset_uri(d) end def prediction_feature(d) - case d - when "LOAEL-mol" - "http://localhost:8084/feature/2a74d78d-5b3d-438c-a1e5-6cfb16bd9354" - when "LOAEL-mg" - "http://localhost:8084/feature/ba5b0f78-36bc-4ac3-8020-9d8b2ca3bd13" - when "MOU" - "http://localhost:8084/feature/432f18c5-ff8f-4ff2-a1cc-cbda1c43cff9" + if URIS[d] and URIS[d][:prediction_feature] + URIS[d][:prediction_feature] else plz_add "prediction_feature by uploading dataset" end end def test_dataset_uri(d) - if d=="MOU" - "http://localhost:8083/dataset/a5c39a5d-8747-495a-8d30-6ee9abdd5f3b" + if URIS[d] and URIS[d][:test_dataset_uri] + URIS[d][:test_dataset_uri] else pred_feat = prediction_feature(d) dataset = OpenTox::Dataset.new @@ -58,11 +71,8 @@ def test_dataset_uri(d) end def new_feature_dataset_uri(d) - case d - when /LOAEL-mol|LOAEL-mg/ - "http://localhost:8083/dataset/7de04de4-41ce-4528-97c2-fd92fbb4d0b8" - when "MOU" - "http://localhost:8083/dataset/8d324c7d-e6fe-4807-b2f8-e851750b959d" + if URIS[d] and URIS[d][:new_feature_dataset_uri] + URIS[d][:new_feature_dataset_uri] else u_dataset = OpenTox::Dataset.new u_dataset.upload File.join("data/03/#{d}_uniq.csv") @@ -78,11 +88,8 @@ def new_feature_dataset_uri(d) end def orig_feature_dataset_uri(d) - case d - when /LOAEL-mol|LOAEL-mg/ - "http://localhost:8083/dataset/e9016641-dddb-434f-bb05-63d80a37679a" - when "MOU" - "http://localhost:8083/dataset/cc651943-886c-4290-b346-41d1c951476a" + if URIS[d] and URIS[d][:orig_feature_dataset_uri] + URIS[d][:orig_feature_dataset_uri] else f = OpenTox::Dataset.new f.upload File.join("data/02/#{d}_orig-features.csv") -- cgit v1.2.3