From 19eb655f4af1a4631692989a30a59b7b78e6669b Mon Sep 17 00:00:00 2001 From: gebele Date: Fri, 20 Apr 2018 10:45:11 +0000 Subject: batch download with original identifiers --- application.rb | 59 +++++++++++++++++++++++++---------------- batch.rb | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ helper.rb | 9 +++---- views/batch.haml | 1 - 4 files changed, 120 insertions(+), 30 deletions(-) create mode 100644 batch.rb diff --git a/application.rb b/application.rb index 9c6f63e..b567147 100644 --- a/application.rb +++ b/application.rb @@ -1,5 +1,6 @@ require_relative 'task.rb' require_relative 'prediction.rb' +require_relative 'batch.rb' require_relative 'helper.rb' include OpenTox @@ -8,6 +9,7 @@ configure :production, :development do enable :reloader also_reload './helper.rb' also_reload './prediction.rb' + also_reload './batch.rb' end before do @@ -46,12 +48,11 @@ get '/task/?' do task = Task.find(params[:turi].to_s) return JSON.pretty_generate(:percent => task.percent) elsif params[:predictions] - task = Task.find(params[:predictions].to_s) - predictions = task.predictions[params[:model]] + task = Task.find(params[:predictions]) pageSize = params[:pageSize].to_i - 1 pageNumber= params[:pageNumber].to_i - 1 if params[:model] == "Cramer" - prediction = predictions + prediction = task.predictions[params[:model]] compound = Compound.find prediction["compounds"][pageNumber] image = compound.svg smiles = compound.smiles @@ -63,6 +64,7 @@ get '/task/?' 
do string += "" html += "#{string}" else + predictions = task.predictions[params[:model]].collect{|hash| hash.values[0]} prediction_object = Prediction.find predictions[pageNumber] prediction = prediction_object.prediction compound = Compound.find prediction_object.compound @@ -147,7 +149,7 @@ get '/download/dataset/:id' do end get '/delete/dataset/:id' do - dataset = Dataset.find params[:id] + dataset = Batch.find params[:id] dataset.delete File.delete File.join("tmp/"+dataset.name+".csv") redirect to("/") @@ -164,11 +166,13 @@ get '/predict/csv/:task/:model/:filename/?' do else header = task.csv lines = [] - task.predictions[params[:model]].each_with_index do |p,idx| - if p.is_a? BSON::ObjectId - lines << "#{idx+1},#{Prediction.find(p).csv}" + task.predictions[params[:model]].each_with_index do |hash,idx| + identifier = hash.keys[0] + prediction_id = hash.values[0] + if prediction_id.is_a? BSON::ObjectId + lines << "#{idx+1},#{identifier},#{Prediction.find(prediction_id).csv}" else - lines << "#{idx+1},#{p}\n" + lines << "#{idx+1},#{identifier},#{prediction_id}\n" end end csv = header + lines.join("") @@ -182,8 +186,9 @@ post '/predict/?' do # process batch prediction if !params[:fileselect].blank? || !params[:existing].blank? if !params[:existing].blank? - @dataset = Dataset.find params[:existing].keys[0] + @dataset = Batch.find params[:existing].keys[0] @compounds = @dataset.compounds + @identifiers = @dataset.identifiers @filename = @dataset.name end if !params[:fileselect].blank? @@ -192,19 +197,21 @@ post '/predict/?' do end @filename = params[:fileselect][:filename] begin - @dataset = Dataset.find_by(:name => params[:fileselect][:filename].sub(/\.csv$/,"")) + @dataset = Batch.find_by(:name => params[:fileselect][:filename].sub(/\.csv$/,"")) if @dataset $logger.debug "Take file from database."
@compounds = @dataset.compounds + @identifiers = @dataset.identifiers else File.open('tmp/' + params[:fileselect][:filename], "w") do |f| f.write(params[:fileselect][:tempfile].read) end - input = Dataset.from_csv_file File.join("tmp", params[:fileselect][:filename]), true + input = Batch.from_csv_file File.join("tmp", params[:fileselect][:filename]) $logger.debug "Processing '#{params[:fileselect][:filename]}'" - if input.class == OpenTox::Dataset + if input.class == OpenTox::Batch @dataset = input - @compounds = input.compounds + @compounds = @dataset.compounds + @identifiers = @dataset.identifiers else File.delete File.join("tmp", params[:fileselect][:filename]) bad_request_error "Could not serialize file '#{@filename}'." @@ -216,7 +223,7 @@ post '/predict/?' do end if @compounds.size == 0 - message = dataset[:warnings] + message = @dataset[:warnings] @dataset.delete bad_request_error message end @@ -237,7 +244,7 @@ post '/predict/?' do if type == "Regression" unit = (type == "Regression") ? "(#{m.unit})" : "" converted_unit = (type == "Regression") ? "#{m.unit =~ /\b(mmol\/L)\b/ ? "(mg/L)" : "(mg/kg_bw/day)"}" : "" - header = "ID,Endpoint,Unique SMILES,inTrainingSet,Measurements #{unit},Prediction #{unit},Prediction #{converted_unit},"\ + header = "ID,Input,Endpoint,Unique SMILES,inTrainingSet,Measurements #{unit},Prediction #{unit},Prediction #{converted_unit},"\ "Prediction Interval Low #{unit},Prediction Interval High #{unit},"\ "Prediction Interval Low #{converted_unit},Prediction Interval High #{converted_unit},"\ "inApplicabilityDomain,Note\n" @@ -245,7 +252,7 @@ post '/predict/?' 
do # add header for classification if type == "Classification" av = m.prediction_feature.accept_values - header = "ID,Endpoint,Unique SMILES,inTrainingSet,Measurements,Consensus Prediction,Consensus Confidence,"\ + header = "ID,Input,Endpoint,Unique SMILES,inTrainingSet,Measurements,Consensus Prediction,Consensus Confidence,"\ "Structural alerts for mutagenicity,Lazar Prediction,"\ "Lazar predProbability #{av[0]},Lazar predProbability #{av[1]},inApplicabilityDomain,Note\n" end @@ -253,7 +260,8 @@ post '/predict/?' do p = 100.0/@compounds.size counter = 1 predictions = [] - @compounds.each_with_index do |compound,idx| + @compounds.each_with_index do |cid,idx| + compound = Compound.find cid if Prediction.where(compound: compound.id, model: m.id).exists? prediction_object = Prediction.find_by(compound: compound.id, model: m.id) prediction = prediction_object.prediction @@ -263,6 +271,8 @@ post '/predict/?' do prediction_object[:csv] = prediction_to_csv(m,compound,prediction) prediction_object.save end + # identifier + identifier = @identifiers[idx] else prediction = m.predict(compound) # save prediction object @@ -312,9 +322,12 @@ post '/predict/?' do prediction_object[:prediction] = prediction prediction_object[:csv] = prediction_to_csv(m,compound,prediction) prediction_object.save + + # identifier + identifier = @identifiers[idx] end - # collect prediction_object ids - predictions << prediction_id + # collect prediction_object ids with identifier + predictions << {identifier => prediction_id} t.update_percent((counter*p).ceil > 100 ? 100 : (counter*p).ceil) counter += 1 end @@ -323,24 +336,24 @@ post '/predict/?' 
do # write predictions @predictions["#{model}"] = predictions else # Cramer model - compounds = @compounds.collect{|c| c.smiles} + compounds = @compounds.collect{|cid| c = Compound.find cid; c.smiles} prediction = [Toxtree.predict(compounds, "Cramer rules"), Toxtree.predict(compounds, "Cramer rules with extensions")] output = {} output["model_name"] = "Oral toxicity (Cramer rules)" output["cramer_rules"] = prediction.collect{|array| array.collect{|hash| hash["Cramer rules"]}}.flatten.compact output["cramer_rules_extensions"] = prediction.collect{|array| array.collect{|hash| hash["Cramer rules, with extensions"]}}.flatten.compact # header - csv = "ID,Endpoint,Unique SMILES,Cramer rules,Cramer rules with extensions\n" + csv = "ID,Input,Endpoint,Unique SMILES,Cramer rules,Cramer rules with extensions\n" # content compounds.each_with_index do |smiles, idx| - csv << "#{idx+1},#{output["model_name"]},#{smiles},"\ + csv << "#{idx+1},#{@identifiers[idx]},#{output["model_name"]},#{smiles},"\ "#{output["cramer_rules"][idx] != "nil" ? output["cramer_rules"][idx] : "none" },"\ "#{output["cramer_rules_extensions"][idx] != "nil" ? output["cramer_rules_extensions"][idx] : "none"}\n" end predictions = {} predictions["Cramer rules"] = output["cramer_rules"].collect{|rule| rule != "nil" ? rule : "none"} predictions["Cramer rules, with extensions"] = output["cramer_rules_extensions"].collect{|rule| rule != "nil" ? 
rule : "none"} - predictions["compounds"] = @compounds.collect{|c| c.id} + predictions["compounds"] = @compounds if @dataset.warnings @dataset.warnings.each do |warning| diff --git a/batch.rb b/batch.rb new file mode 100644 index 0000000..2dd9359 --- /dev/null +++ b/batch.rb @@ -0,0 +1,81 @@ +require 'csv' +require 'tempfile' + +module OpenTox + + class Batch + + include OpenTox + include Mongoid::Document + include Mongoid::Timestamps + store_in collection: "batch" + field :name, type: String + field :source, type: String + field :identifiers, type: Array + field :compounds, type: Array + field :warnings, type: Array, default: [] + + def self.from_csv_file file + source = file + name = File.basename(file,".*") + batch = self.find_by(:source => source, :name => name) + if batch + $logger.debug "Skipping import of #{file}, it is already in the database (id: #{batch.id})." + else + $logger.debug "Parsing #{file}." + table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8' + batch = self.new(:source => source, :name => name, :identifiers => [], :compounds => []) + + # features + feature_names = table.shift.collect{|f| f.strip} + batch.warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size + compound_format = feature_names.shift.strip + bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i + numeric = [] + features = [] + # guess feature types + feature_names.each_with_index do |f,i| + metadata = {:name => f} + values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact + types = values.collect{|v| v.numeric? ?
true : false}.uniq + feature = nil + if values.size == 0 # empty feature + elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes + numeric[i] = true + feature = NumericFeature.find_or_create_by(metadata) + else + metadata["accept_values"] = values + numeric[i] = false + feature = NominalFeature.find_or_create_by(metadata) + end + features << feature if feature + end + + table.each_with_index do |vals,i| + identifier = vals.shift.strip + batch.identifiers << identifier + begin + case compound_format + when /SMILES/i + compound = OpenTox::Compound.from_smiles(identifier) + when /InChI/i + compound = OpenTox::Compound.from_inchi(identifier) + end + rescue + compound = nil + end + if compound.nil? # compound parsers may return nil + #warn "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}, all entries are ignored." + batch.compounds << "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}." + next + end + batch.compounds << compound.id + end + batch.save + end + batch + end + + end + +end diff --git a/helper.rb b/helper.rb index dc4b695..ca20def 100644 --- a/helper.rb +++ b/helper.rb @@ -16,17 +16,14 @@ helpers do end def prediction_to_csv(m,c,p) - #model = Model::Validation.find(m.to_s) model = m model_name = "#{model.endpoint.gsub('_', ' ')} (#{model.species})" model_unit = model.regression? ? "(#{model.unit})" : "" converted_model_unit = model.regression? ? "#{model.unit =~ /\b(mmol\/L)\b/ ? 
"(mg/L)" : "(mg/kg_bw/day)"}" : "" - #predictions = predictions_ids.collect{|prediction_id| Prediction.find prediction_id} csv = "" - compound = c#Compound.find prediction_object.compound - prediction = p#prediction_object.prediction - #prediction.delete_if{|k,v| k =~ /neighbors|prediction_feature_id/} + compound = c + prediction = p output = {} line = "" output["model_name"] = model_name @@ -123,7 +120,7 @@ helpers do end def dataset_storage - all = Dataset.where(:source => /^tmp/) + all = Batch.where(:source => /^tmp/) out = Hash.new all.reverse.each{|d| out[d.id] = [d.name, d.created_at]} out diff --git a/views/batch.haml b/views/batch.haml index c71d056..1aa6779 100644 --- a/views/batch.haml +++ b/views/batch.haml @@ -102,7 +102,6 @@ - @models.each_with_index do |model,idx| - m = Model::Validation.find model unless model == "Cramer" - task = @tasks[idx].id - - predictions = @tasks[idx].predictions["#{model}"] #result.panel{:id=>idx} %div.row %div.col-md-6 -- cgit v1.2.3