diff options
Diffstat (limited to 'Rakefile')
-rw-r--r-- | Rakefile | 165 |
1 files changed, 165 insertions, 0 deletions
#!/usr/bin/env ruby
# frozen_string_literal: true

# Rake pipeline: download/import toxicity datasets, build lazar models,
# validate them and predict the PA test set.
#
# Every "*.id" target is a one-line text file holding the database id of the
# object (dataset, model, validation, prediction) the rule created. Rules
# further down the chain read those ids back to look the objects up again.

require 'csv'
require 'fileutils'
require_relative '../lazar/lib/lazar.rb'
include OpenTox

#task :default => ["predictions/PA_mutagenicity.id","validations/mutagenicity-merged.id"]
task :default => ["predictions/PA_mutagenicity.id"]

# summaries
# predictions

file "predictions/PA_mutagenicity.id" => ["models/mutagenicity-merged.id", "data/PA.id"] do |t|
  predict t
end

file "predictions/PA_carcinogenicity.id" => ["models/carcinogenicity.id", "data/PA.id"] do |t|
  predict t
end

# validations

file "validations/mutagenicity-merged.id" => "models/mutagenicity-merged.id" do |t|
  validate_model t
end

file "validations/carcinogenicity.id" => "models/carcinogenicity.id" do |t|
  validate_model t
end

# models

file "models/mutagenicity-merged.id" => "data/mutagenicity-merged.id" do |t|
  create_model t
end

file "models/carcinogenicity.id" => "data/carcinogenicity.id" do |t|
  create_model t
end

# test data

file "data/PA.id" => "data/PA.sdf" do |t|
  import_sdf t
end

# training data

file "data/mutagenicity-merged.id" => ["data/hansen.id", "data/kazius.id", "data/efsa.id"] do |t|
  input = t.prerequisites.map { |id_file| Dataset.find(read_id(id_file)) }
  source_feature = Feature.where(:name => "Ames test categorisation").first # Kazius
  target_feature = Feature.where(:name => "Mutagenicity").first
  # Map the source feature onto the target feature and recode its 1/0 values
  # to the "mutagen"/"nonmutagen" labels.
  merged = Dataset.merge(input, { source_feature => target_feature }, { 1 => "mutagen", 0 => "nonmutagen" })
  write_id t, merged.id
end

file "data/carcinogenicity.id" do |t|
  # NOTE(review): 1205 is presumably a PubChem assay id - confirm.
  write_id t, Dataset.from_pubchem(1205).id
end

# kazius

file "data/kazius.id" => "data/cas_4337.sdf" do |t|
  import_sdf t
end

file "data/cas_4337.sdf" => "data/cas_4337.zip" do |t|
  # -o: overwrite without prompting so the rule never blocks on stdin.
  shell_build t.name, %(unzip -o #{t.prerequisites.first} -d data)
end

file "data/cas_4337.zip" do |t|
  shell_build t.name, %(wget "http://cheminformatics.org/datasets/bursi/cas_4337.zip" -O #{t.name})
end

# efsa

file "data/efsa.id" => "data/efsa.csv" do |t|
  import_csv t
end

file "data/efsa.csv" => "data/GENOTOX_data_and_dictionary.tsv" do |t|
  efsa2csv t
end

file "data/GENOTOX_data_and_dictionary.tsv" => "data/GENOTOX_data_and_dictionary.xls" do |t|
  # Read the prerequisite (.xls) and write the target (.tsv). The field
  # separator is a tab because efsa2csv parses this file with col_sep "\t".
  shell_build t.name, %(xls2csv -s cp1252 -d utf-8 -x -c "\t" #{t.prerequisites.first} > #{t.name})
end

file "data/GENOTOX_data_and_dictionary.xls" do |t|
  # -O names the downloaded document (-o would only name wget's log file).
  # The remote file name contains spaces, so the local name is set explicitly.
  shell_build t.name, %(wget "https://data.europa.eu/euodp/data/storage/f/2017-07-19T142131/GENOTOX%20data%20and%20dictionary.xls" -O #{t.name})
end

# hansen

file "data/hansen.csv" => "data/Mutagenicity_N6512.csv" do |t|
  lines = ["ID,SMILES,Mutagenicity"]
  CSV.foreach(t.prerequisites.first).with_index do |row, i|
    next if i.zero? # the source header row is replaced by the header above

    smiles = OpenTox::Compound.from_smiles(row[5]).smiles
    lines << ["hansen_#{row[0]}", smiles, row[2]].join(",")
  end
  write_lines t.name, lines
end

file "data/Mutagenicity_N6512.csv" do |t|
  shell_build t.name, %(wget "http://doc.ml.tu-berlin.de/toxbenchmark/Mutagenicity_N6512.csv" -O #{t.name})
end

# cleanup

# Best effort: nothing to delete is not an error.
task :clean do
  FileUtils.rm_f Dir.glob("data/*id")
end

task :cleanall do
  FileUtils.rm_f Dir.glob("data/*")
end

# functions

# Return the id stored in an "*.id" file (its content without the newline).
def read_id(path)
  File.read(path).chomp
end

# Write +lines+ to +path+, one per line, creating the directory if needed.
# Callers build all content first, so a failing rule leaves no half-written
# target that Rake would treat as up to date.
def write_lines(path, lines)
  FileUtils.mkdir_p File.dirname(path)
  File.write(path, "#{lines.join("\n")}\n")
end

# Store +id+ as the sole line of the task's target file.
def write_id(t, id)
  write_lines t.name, [id]
end

# Run +command+ through Rake's `sh`, which raises when the command fails.
# On failure the possibly partial +target+ is removed before re-raising.
def shell_build(target, command)
  sh command
rescue StandardError
  FileUtils.rm_f target
  raise
end

# Import the task's first prerequisite (a CSV file) as a dataset and record its id.
def import_csv(t)
  write_id t, Dataset.from_csv_file(t.prerequisites.first).id
end

# Import the task's first prerequisite (an SDF file) as a dataset and record its id.
def import_sdf(t)
  write_id t, Dataset.from_sdf_file(t.prerequisites.first).id
end

# True when +row+ has a structure (column 11) and a result (column 33), and
# column 24 mentions Salmonella with "TA 98" or "TA 100" in column 25.
# Blank cells are treated as non-matching instead of raising.
def efsa_ames_row?(row)
  return false if row[11].nil? || row[11].empty? || row[33].nil?
  return false unless row[24].to_s.match?(/Salmonella/i)

  strain = row[25].to_s
  strain.include?("TA 98") || strain.include?("TA 100")
end

# SMILES for +row+ as returned by OpenTox::Compound, parsed from the SMILES
# column; some SMILES contain non-parseable characters, so fall back to the
# InChI column.
def efsa_smiles(row)
  OpenTox::Compound.from_smiles(row[11].delete('"')).smiles
rescue StandardError
  OpenTox::Compound.from_inchi(row[12]).smiles
end

# Condense the EFSA genotoxicity TSV (first prerequisite) into the CSV target
# with columns ID,SMILES,Mutagenicity: one line per distinct structure.
# A structure is 1 as soon as any matching row is positive, 0 if it only has
# negative rows, and blank if no row matched Positiv/Negativ.
def efsa2csv(t)
  db = {}
  options = { :encoding => "UTF-8", :col_sep => "\t", :liberal_parsing => true }
  CSV.foreach(t.prerequisites.first, **options).with_index do |row, i|
    next if i.zero? # header row
    next unless efsa_ames_row?(row)

    entry = (db[efsa_smiles(row)] ||= {})
    entry[:id] ||= "efsa_#{row[2]}" # keep the id of the first row seen
    case row[33]
    when /Positiv/i then entry[:value] = 1 # at least one positive result in TA 98 or TA 100
    when /Negativ/i then entry[:value] ||= 0 # never overrides an earlier positive
    end
  end

  lines = ["ID,SMILES,Mutagenicity"]
  db.each { |smiles, entry| lines << [entry[:id], smiles, entry[:value]].join(",") }
  write_lines t.name, lines
end

# Build a lazar classification model from the dataset whose id is stored in
# the first prerequisite, and record the model id.
def create_model(t)
  training_dataset = Dataset.find(read_id(t.prerequisites.first))
  model = Model::LazarClassification.create(training_dataset: training_dataset)
  write_id t, model.id
end

# Run a repeated cross-validation for the model whose id is stored in the
# first prerequisite, and record the validation id.
def validate_model(t)
  model = Model.find(read_id(t.prerequisites.first))
  rcv = Validation::RepeatedCrossValidation.create(model)
  write_id t, rcv.id
end

# Predict the dataset (second prerequisite) with the model (first
# prerequisite) and record the prediction id. Both prerequisites are "*.id"
# files, so the ids are read from them rather than passing the paths.
def predict(t)
  model = Model.find(read_id(t.prerequisites[0]))
  dataset = Dataset.find(read_id(t.prerequisites[1]))
  prediction = model.predict dataset
  write_id t, prediction.id
end