summary refs log tree commit diff
diff options
context:
space:
mode:
author mguetlein <martin.guetlein@gmail.com> 2014-10-28 08:56:48 +0100
committer mguetlein <martin.guetlein@gmail.com> 2014-10-28 08:56:48 +0100
commit a0d9f7bf7481b257acbd7d7629558b3251e653f8 (patch)
tree 84089ef60768b93160df86dd46993a44d49ccc95
parent 36ea2f865014a761be9fe74719f4c88dbaffeb81 (diff)
add fetch, change URI storing in config.rb, add documentation in README
-rwxr-xr-x nch/01_fetch.sh 31
-rwxr-xr-x nch/03_validate_compounds.rb 4
-rw-r--r-- nch/README.md 10
-rw-r--r-- nch/config.rb 61
4 files changed, 75 insertions, 31 deletions
diff --git a/nch/01_fetch.sh b/nch/01_fetch.sh
new file mode 100755
index 0000000..4af8b00
--- /dev/null
+++ b/nch/01_fetch.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+cd data/01
+
+rm *LOAEL*mol*
+wget https://raw.githubusercontent.com/opentox/test/TD50/LOAEL/LOAEL_mol.csv
+mv LOAEL_mol.csv LOAEL-mol_endpoint_enc.csv
+
+wget https://raw.githubusercontent.com/opentox/test/TD50/LOAEL/LOAEL_topological_mg_mol.csv
+mv LOAEL_topological_mg_mol.csv LOAEL-mol_orig-features_enc.csv
+echo "" >> LOAEL-mol_orig-features_enc.csv
+
+rm *LOAEL*mg*
+wget https://raw.githubusercontent.com/opentox/test/TD50/LOAEL/LOAEL_mg.csv
+mv LOAEL_mg.csv LOAEL-mg_endpoint_enc.csv
+
+rm *MOU*
+wget https://raw.githubusercontent.com/opentox/test/TD50/TD50/MOU_training.csv
+mv MOU_training.csv MOU_endpoint_enc.csv
+wget https://raw.githubusercontent.com/opentox/test/TD50/TD50/MOU_test.csv
+mv MOU_test.csv MOU_test_enc.csv
+echo "" >> MOU_test_enc.csv
+cp MOU_endpoint_enc.csv MOU_complete_enc.csv
+tail -n40 MOU_test_enc.csv >> MOU_complete_enc.csv
+
+wget https://raw.githubusercontent.com/opentox/test/TD50/TD50/MOU_constitutional_06e4762c88_report.csv
+mv MOU_constitutional_06e4762c88_report.csv MOU_orig-features_enc.csv
+echo "" >> MOU_orig-features_enc.csv
+
+cd -
+
diff --git a/nch/03_validate_compounds.rb b/nch/03_validate_compounds.rb
index 1ddb719..304add2 100755
--- a/nch/03_validate_compounds.rb
+++ b/nch/03_validate_compounds.rb
@@ -37,8 +37,8 @@ DATA.each do |d|
File.open("data/03/#{d}_uniq.csv","w").puts "\""+(["InChI"]+@inchis).join("\"\n\"")+"\""
puts @inchis.size.to_s+" uniq compounds in complete-set (written to data/03/#{d}_uniq.csv)"
else
- raise "complete-set inchis not found in features-set #{@inchis.inspect}" unless @inchis.size==0
- puts "inchis in features-set uniq and all included in complete-set"
+ raise "complete-set inchis not found in orig-features-set #{@inchis.inspect}" unless @inchis.size==0
+ puts "inchis in orig-features-set uniq and all included in complete-set"
end
end
diff --git a/nch/README.md b/nch/README.md
index c64bbcb..193ca31 100644
--- a/nch/README.md
+++ b/nch/README.md
@@ -3,6 +3,12 @@
scripts
-------------------------------------
* results are stored in the data folder in the corresponding sub-folders: 01, 02, ...
-* config.rb defines which datasets to employ
+* config.rb defines which datasets to employ and stores URIs of already uploaded files
-01_fetch - copies data from old repository
\ No newline at end of file
+01_fetch - copies data from old repository and converts to a consistent naming scheme
+02_decode_inchi.rb - decodes inchis and renames SMILES column to InChI
+03_validate_compounds.rb - checks if all compounds are included in the feature set, stores uniq compounds without duplicates
+04_get_feature_names.rb - extracts new feature names for features from orig files
+05_compute_features.rb - computes new features
+06_compare_features.rb - compares orig features and new features
+07_validate.rb - starts crossvalidation/test set validation with old / new features
\ No newline at end of file
diff --git a/nch/config.rb b/nch/config.rb
index 3151c5a..293cdc8 100644
--- a/nch/config.rb
+++ b/nch/config.rb
@@ -1,5 +1,28 @@
-DATA = ["LOAEL-mol", "LOAEL-mg","MOU"]
+#DATA = ["LOAEL-mol", "LOAEL-mg","MOU"]
+DATA = ["MOU"]
+
+URIS = {
+ "LOAEL-mol"=>{
+ # :dataset_uri=>"http://localhost:8083/dataset/3da90c55-0388-42a0-8ada-978abe4a515c",
+ # :prediction_feature=>"http://localhost:8084/feature/2a74d78d-5b3d-438c-a1e5-6cfb16bd9354",
+ # :new_feature_dataset_uri=>"http://localhost:8083/dataset/7de04de4-41ce-4528-97c2-fd92fbb4d0b8",
+ # :orig_feature_dataset_uri=>"http://localhost:8083/dataset/e9016641-dddb-434f-bb05-63d80a37679a",
+ },
+ "LOAEL-mg"=>{
+ # :dataset_uri=>"http://localhost:8083/dataset/4f3b9de4-0494-4339-8ebd-e6c6c1984a23",
+ # :prediction_feature=>"http://localhost:8084/feature/ba5b0f78-36bc-4ac3-8020-9d8b2ca3bd13",
+ # :new_feature_dataset_uri=>"http://localhost:8083/dataset/7de04de4-41ce-4528-97c2-fd92fbb4d0b8",
+ # :orig_feature_dataset_uri=>"http://localhost:8083/dataset/e9016641-dddb-434f-bb05-63d80a37679a",
+ },
+ "MOU"=>{
+ # :dataset_uri=>"http://localhost:8083/dataset/f0af478a-51e6-41a5-adb2-d1a9bedf8981",
+ # :prediction_feature=>"http://localhost:8084/feature/432f18c5-ff8f-4ff2-a1cc-cbda1c43cff9",
+ # :test_dataset_uri=>"http://localhost:8083/dataset/a5c39a5d-8747-495a-8d30-6ee9abdd5f3b",
+ # :new_feature_dataset_uri=>"http://localhost:8083/dataset/8d324c7d-e6fe-4807-b2f8-e851750b959d",
+ # :orig_feature_dataset_uri=>"http://localhost:8083/dataset/cc651943-886c-4290-b346-41d1c951476a",
+ },
+}
def info(d)
puts d.uri
@@ -16,13 +39,8 @@ def plz_add(msg)
end
def dataset_uri(d)
- case d
- when "LOAEL-mol"
- "http://localhost:8083/dataset/3da90c55-0388-42a0-8ada-978abe4a515c"
- when "LOAEL-mg"
- "http://localhost:8083/dataset/4f3b9de4-0494-4339-8ebd-e6c6c1984a23"
- when "MOU"
- "http://localhost:8083/dataset/f0af478a-51e6-41a5-adb2-d1a9bedf8981"
+ if URIS[d] and URIS[d][:dataset_uri]
+ URIS[d][:dataset_uri]
else
dataset = OpenTox::Dataset.new
dataset.upload File.join("data/02/#{d}_endpoint.csv")
@@ -32,21 +50,16 @@ def dataset_uri(d)
end
def prediction_feature(d)
- case d
- when "LOAEL-mol"
- "http://localhost:8084/feature/2a74d78d-5b3d-438c-a1e5-6cfb16bd9354"
- when "LOAEL-mg"
- "http://localhost:8084/feature/ba5b0f78-36bc-4ac3-8020-9d8b2ca3bd13"
- when "MOU"
- "http://localhost:8084/feature/432f18c5-ff8f-4ff2-a1cc-cbda1c43cff9"
+ if URIS[d] and URIS[d][:prediction_feature]
+ URIS[d][:prediction_feature]
else
plz_add "prediction_feature by uploading dataset"
end
end
def test_dataset_uri(d)
- if d=="MOU"
- "http://localhost:8083/dataset/a5c39a5d-8747-495a-8d30-6ee9abdd5f3b"
+ if URIS[d] and URIS[d][:test_dataset_uri]
+ URIS[d][:test_dataset_uri]
else
pred_feat = prediction_feature(d)
dataset = OpenTox::Dataset.new
@@ -58,11 +71,8 @@ def test_dataset_uri(d)
end
def new_feature_dataset_uri(d)
- case d
- when /LOAEL-mol|LOAEL-mg/
- "http://localhost:8083/dataset/7de04de4-41ce-4528-97c2-fd92fbb4d0b8"
- when "MOU"
- "http://localhost:8083/dataset/8d324c7d-e6fe-4807-b2f8-e851750b959d"
+ if URIS[d] and URIS[d][:new_feature_dataset_uri]
+ URIS[d][:new_feature_dataset_uri]
else
u_dataset = OpenTox::Dataset.new
u_dataset.upload File.join("data/03/#{d}_uniq.csv")
@@ -78,11 +88,8 @@ def new_feature_dataset_uri(d)
end
def orig_feature_dataset_uri(d)
- case d
- when /LOAEL-mol|LOAEL-mg/
- "http://localhost:8083/dataset/e9016641-dddb-434f-bb05-63d80a37679a"
- when "MOU"
- "http://localhost:8083/dataset/cc651943-886c-4290-b346-41d1c951476a"
+ if URIS[d] and URIS[d][:orig_feature_dataset_uri]
+ URIS[d][:orig_feature_dataset_uri]
else
f = OpenTox::Dataset.new
f.upload File.join("data/02/#{d}_orig-features.csv")