summaryrefslogtreecommitdiff
path: root/paper/create-training-test-sets.rb
diff options
context:
space:
mode:
Diffstat (limited to 'paper/create-training-test-sets.rb')
-rw-r--r--paper/create-training-test-sets.rb42
1 files changed, 42 insertions, 0 deletions
diff --git a/paper/create-training-test-sets.rb b/paper/create-training-test-sets.rb
new file mode 100644
index 0000000..1079341
--- /dev/null
+++ b/paper/create-training-test-sets.rb
@@ -0,0 +1,42 @@
+require_relative '../../lazar/lib/lazar' # load lazar via a path relative to this file; NOTE(review): CSV is never required here, presumably lazar loads it - confirm
+include OpenTox # mix OpenTox into the top level; presumably this is where Compound (used below) comes from - confirm
+dirpath = File.join(File.dirname(__FILE__),"..","regression") # ../regression relative to this script; every CSV below is read from or written to this directory
+old = CSV.read File.join(dirpath,"LOAEL_mg_corrected_smiles_mmol.csv") # first dataset, read as an array of row arrays
+old.shift # discard the first row (presumably the header line)
+new = CSV.read File.join(dirpath,"swissRat_chron_LOAEL_mmol.csv") # second dataset, same treatment; `new` is an ordinary local variable here
+new.shift # discard the first row (presumably the header line)
+p old.size # print row counts of both datasets before any processing
+p new.size
+# Reduce every row to [Compound.from_smiles(first column).smiles, last column]; presumably .smiles yields a canonical SMILES string - confirm against lazar
+old.collect!{|r| [Compound.from_smiles(r.first).smiles, r.last]} # in-place: rows of old become two-element pairs
+new.collect!{|r| [Compound.from_smiles(r.first).smiles, r.last]} # in-place: rows of new become two-element pairs
+old_compounds = old.collect{|r| r.first}.uniq # distinct SMILES strings occurring in old
+new_compounds = new.collect{|r| r.first}.uniq # distinct SMILES strings occurring in new
+p old_compounds.size # print number of distinct compounds per dataset
+p new_compounds.size
+common_compounds = (old_compounds & new_compounds).uniq # SMILES present in both datasets (Array#& already returns unique elements)
+p common_compounds.size # print number of shared compounds
+common = [] # accumulates the rows of shared compounds taken from both datasets
+# TODO: canonical smiles?? NOTE(review): rows were already passed through Compound#smiles above - confirm whether this TODO is still open
+common_compounds.each do |smi| # move every row of a shared compound out of old/new and into common
+ old_rows = old.select{|r| r.first == smi} # all rows for this compound in old
+ new_rows = new.select{|r| r.first == smi} # all rows for this compound in new
+ common += old_rows + new_rows # shared-compound rows from both sources go to the test data
+ old -= old_rows # Array#- drops every row equal to one of old_rows, leaving only non-shared compounds
+ new -= new_rows # same for new
+end
+header = ["SMILES","LOAEL"] # header row written at the top of every output file
+p old.size # print row counts after the split: old-only, new-only, shared
+p new.size
+p common.size
+{ # output file name => rows to write; .uniq removes exact duplicate rows
+ "mazzatorta-loael-training.csv" => old.uniq, # rows of old whose compound does not occur in new
+ "swiss-loael-training.csv" => new.uniq, # rows of new whose compound does not occur in old
+ "combined-training.csv" => (old+new).uniq, # both of the above concatenated
+ "common-test.csv" => common.uniq, # rows of compounds that occur in both datasets
+}.each do |file,data| # write each row set to its file inside dirpath
+ CSV.open(File.join(dirpath,file),"w+") do |csv| # "w+" creates the file or truncates an existing one
+ csv << header # header row first
+ data.each{|row| csv << row} # then one CSV line per [smiles, value] pair
+ end
+end