From 292ffcd5eccb05b2bea1aab64504134f5cdd0834 Mon Sep 17 00:00:00 2001 From: gebele Date: Mon, 31 Jul 2017 15:18:22 +0000 Subject: introduce batch predictions and QMRF for public service;layout refinements for better readability --- application.rb | 339 +++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 296 insertions(+), 43 deletions(-) (limited to 'application.rb') diff --git a/application.rb b/application.rb index aac1c39..1694b6e 100644 --- a/application.rb +++ b/application.rb @@ -73,43 +73,10 @@ get '/predict/dataset/:name' do csv end -get '/predict/?:csv?' do +get '/predict/:tmppath/:filename/?' do response['Content-Type'] = "text/csv" - @csv = "\"Compound\",\"Endpoint\",\"Type\",\"Prediction\",\"95% Prediction interval\"\n" - @@batch.each do |key, values| - compound = key - smiles = compound.smiles - values.each do |array| - model = array[0] - type = model.model.class.to_s.match("Classification") ? "Classification" : "Regression" - prediction = array[1] - endpoint = "#{model.endpoint.gsub('_', ' ')} (#{model.species})" - if prediction[:confidence] == "measured" - if prediction[:value].is_a?(Array) - prediction[:value].each do |value| - pred = value.numeric? ? "#{value} (#{model.unit}), #{compound.mmol_to_mg(value.delog10)} #{(model.unit =~ /\b(mol\/L)\b/) ? "(mg/L)" : "(mg/kg_bw/day)"}" : value - int = (prediction[:prediction_interval].nil? ? nil : prediction[:prediction_interval]) - interval = (int.nil? ? "--" : "#{int[1].delog10} - #{int[0].delog10} (#{model.unit})") - @csv += "\"#{smiles}\",\"#{endpoint}\",\"#{type}\",\"#{pred}\",\"#{interval}\"\n" - end - else - pred = prediction[:value].numeric? ? "#{prediction[:value]} (#{model.unit}), #{compound.mmol_to_mg(prediction[:value].delog10)} #{(model.unit =~ /\b(mol\/L)\b/) ? "(mg/L)" : "(mg/kg_bw/day)"}" : prediction[:value] - confidence = "measured activity" - end - elsif prediction[:neighbors].size > 0 - type = model.model.class.to_s.match("Classification") ? 
"Classification" : "Regression" - pred = prediction[:value].numeric? ? "#{prediction[:value].delog10} (#{model.unit}), #{compound.mmol_to_mg(prediction[:value].delog10)} #{(model.unit =~ /\b(mol\/L)\b/) ? "(mg/L)" : "(mg/kg_bw/day)"}" : prediction[:value] - int = (prediction[:prediction_interval].nil? ? nil : prediction[:prediction_interval]) - interval = (int.nil? ? "--" : "#{int[1].delog10} - #{int[0].delog10} (#{model.unit})") - else - type = "" - pred = "Not enough similar compounds in training dataset." - interval = "" - end - @csv += "\"#{smiles}\",\"#{endpoint}\",\"#{type}\",\"#{pred}\",\"#{interval}\"\n" unless prediction[:value].is_a?(Array) - end - end - @csv + path = "/tmp/#{params[:tmppath]}" + send_file path, :filename => "lazar_batch_prediction_#{params[:filename]}", :type => "text/csv", :disposition => "attachment" end post '/predict/?' do @@ -142,24 +109,119 @@ post '/predict/?' do dataset.delete return haml :error end + + # for csv export @batch = {} - @compounds.each do |compound| - @batch[compound] = [] - params[:selection].keys.each do |model_id| - model = OpenTox::Model::Validation.find model_id + # for haml table + @view = {} + + @compounds.each{|c| @view[c] = []} + params[:selection].keys.each do |model_id| + model = OpenTox::Model::Validation.find model_id + @batch[model] = [] + @compounds.each_with_index do |compound,idx| prediction = model.predict(compound) - @batch[compound] << [model, prediction] + @batch[model] << [compound, prediction] + @view[compound] << [model,prediction] end end - @@batch = @batch + + @csvhash = {} @warnings = dataset[:warnings] + dupEntries = {} + delEntries = "" + + # split duplicates and deleted entries + @warnings.each do |w| + substring = w.match(/line .* of/) + unless substring.nil? + delEntries += "\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"#{w.sub(/\b(tmp\/)\b/,"")}\"\n" + end + substring = w.match(/rows .* Entries/) + unless substring.nil? 
+ lines = [] + substring[0].split(",").each{|s| lines << s[/\d+/]} + lines.shift + lines.each{|l| dupEntries[l.to_i] = w.split(".").first} + end + end + + @batch.each_with_index do |hash, idx| + @csvhash[idx] = "" + model = hash[0] + values = hash[1] + dupEntries.keys.each{|k| values.insert(k-1, dupEntries[k])}.compact! + values.each_with_index do |array, id| + unless array.kind_of? String + compound = array[0] + prediction = array[1] + smiles = compound.smiles + type = model.model.class.to_s.match("Classification") ? "Classification" : "Regression" + endpoint = "#{model.endpoint.gsub('_', ' ')} (#{model.species})" + pred = propA = propB = interval = inApp = inT = note = "" + if prediction[:neighbors] + if prediction[:value] + pred = prediction[:value].numeric? ? "#{prediction[:value].delog10.signif(3)} (#{model.unit}), #{compound.mmol_to_mg(prediction[:value].delog10.signif(3))} #{(model.unit =~ /\b(mol\/L)\b/) ? "(mg/L)" : "(mg/kg_bw/day)"}" : prediction[:value] + int = (prediction[:prediction_interval].nil? ? nil : prediction[:prediction_interval]) + interval = (int.nil? ? "" : "#{int[1].delog10.signif(3)} - #{int[0].delog10.signif(3)} (#{model.unit})") + inApp = "yes" + inT = prediction[:info] =~ /\b(identical)\b/i ? "yes" : "no" + note = prediction[:warnings].join("\n") + ( prediction[:info] ? prediction[:info].sub(/\'.*\'/,"") : "\n" ) + unless prediction[:probabilities].nil? + if id == 0 + probFirst = probLast = "" + probFirst = prediction[:probabilities].keys.first.capitalize + prediction[:probabilities].keys.last.split("-").each{|s| probLast += s.capitalize} + @csvhash[idx] = "\"ID\",\"Endpoint\",\"Type\",\"Unique SMILES\",\"Prediction\",\"predProbability#{probFirst}\",\"predProbability#{probLast}\",\"95% Prediction interval\",\"inApplicabilityDomain\",\"inTrainningSet\",\"Note\"\n" + unless delEntries.blank? 
and id == 0 + @csvhash[idx] += delEntries + end + end + propA = "#{prediction[:probabilities].values_at(prediction[:probabilities].keys.first)[0].to_f.signif(3)}" + propB = "#{prediction[:probabilities].values_at(prediction[:probabilities].keys.last)[0].to_f.signif(3)}" + else + @csvhash[idx] = "\"ID\",\"Endpoint\",\"Type\",\"Unique SMILES\",\"Prediction\",\"predProbability\",\"predProbability\",\"95% Prediction interval\",\"inApplicabilityDomain\",\"inTrainningSet\",\"Note\"\n" + unless delEntries.blank? and id == 0 + @csvhash[idx] += delEntries + end + end + # only one neighbor + else + inApp = "no" + inT = prediction[:info] =~ /\b(identical)\b/i ? "yes" : "no" + note = prediction[:warnings].join("\n") + ( prediction[:info] ? prediction[:info].sub(/\'.*\'/,"") : "\n" ) + end + else # no prediction value + inApp = "no" + inT = prediction[:info] =~ /\b(identical)\b/i ? "yes" : "no" + note = prediction[:warnings].join("\n") + ( prediction[:info] ? prediction[:info].sub(/\'.*\'/,"") : "\n" ) + end + if @warnings + @warnings.each do |w| + note += (w.split(".").first + ".") if /\b(#{Regexp.escape(smiles)})\b/ === w + end + end + else + endpoint = type = smiles = pred = propA = propB = interval = inApp = inT = "" + note = array + end + @csvhash[idx] += "\"#{id+1}\",\"#{endpoint}\",\"#{type}\",\"#{smiles}\",\"#{pred}\",\"#{propA}\",\"#{propB}\",\"#{interval}\",\"#{inApp}\",\"#{inT}\",\"#{note.chomp}\"\n" + end + end + t = Tempfile.new + @csvhash.each do |model, csv| + t.write(csv) + t.write("\n") + end + t.rewind + @tmppath = t.path.split("/").last + dataset.delete File.delete File.join("tmp", params[:fileselect][:filename]) return haml :batch end # validate identifier input - # transfered input if !params[:identifier].blank? @identifier = params[:identifier] $logger.debug "input:#{@identifier}" @@ -181,6 +243,197 @@ post '/predict/?' do end end +get "/report/:id/?" 
do + lazarpath = `gem path lazar` + lazarpath = File.dirname lazarpath + lazarpath = File.dirname lazarpath + qmrfpath = `gem path qsar-report` + qmrfpath = File.dirname qmrfpath + qmrfpath = File.dirname qmrfpath + prediction_model = Model::Validation.find params[:id] + model = prediction_model.model + validation_template = "./views/model_details.haml" + + if File.directory?(lazarpath) + lazar_commit = `cd #{lazarpath}; git rev-parse HEAD`.strip + lazar_commit = "https://github.com/opentox/lazar/tree/#{lazar_commit}" + else + lazar_commit = "https://github.com/opentox/lazar/releases/tag/v#{Gem.loaded_specs["lazar"].version}" + end + + report = OpenTox::QMRFReport.new + + # QSAR Identifier Title 1.1 + report.value "QSAR_title", "Lazar model for #{prediction_model.species} #{prediction_model.endpoint}" + + # Software coding the model 1.3 + report.change_catalog :software_catalog, :firstsoftware, {:name => "lazar", :description => "lazar Lazy Structure- Activity Relationships", :number => "1", :url => "https://lazar.in-silico.ch", :contact => "info@in-silico.ch"} + report.ref_catalog :QSAR_software, :software_catalog, :firstsoftware + + # Date of QMRF 2.1 + report.value "qmrf_date", "#{Time.now.strftime('%d %B %Y')}" + + # QMRF author(s) and contact details 2.1 + report.change_catalog :authors_catalog, :firstauthor, {:name => "Christoph Helma", :affiliation => "in silico toxicology gmbh", :contact => "Rastatterstr. 41, CH-4057 Basel", :email => "info@in-silico.ch", :number => "1", :url => "www.in-silico.ch"} + report.ref_catalog :qmrf_authors, :authors_catalog, :firstauthor + + # Model developer(s) and contact details 2.5 + report.change_catalog :authors_catalog, :modelauthor, {:name => "Christoph Helma", :affiliation => "in silico toxicology gmbh", :contact => "Rastatterstr. 
41, CH-4057 Basel", :email => "info@in-silico.ch", :number => "1", :url => "www.in-silico.ch"} + report.ref_catalog :model_authors, :authors_catalog, :modelauthor + + # Date of model development and/or publication 2.6 + report.value "model_date", "#{Time.parse(model.created_at.to_s).strftime('%Y')}" + + # Reference(s) to main scientific papers and/or software package 2.7 + report.change_catalog :publications_catalog, :publications_catalog_1, {:title => "Maunz, Guetlein, Rautenberg, Vorgrimmler, Gebele and Helma (2013), lazar: a modular predictive toxicology framework ", :url => "http://dx.doi.org/10.3389/fphar.2013.00038"} + report.ref_catalog :references, :publications_catalog, :publications_catalog_1 + + # Reference(s) to main scientific papers and/or software package 2.7 + report.change_catalog :publications_catalog, :publications_catalog_2, {:title => "Maunz A and Helma C (2008) Prediction of chemical toxicity with local support vector regression and activity-specific kernels. SAR & QSAR in Environmental Research 19 (5-6), 413-431", :url => "http://dx.doi.org/10.1080/10629360802358430"} + report.ref_catalog :references, :publications_catalog, :publications_catalog_2 + + # Species 3.1 + report.value "model_species", prediction_model.species + + # Endpoint 3.2 + report.change_catalog :endpoints_catalog, :endpoints_catalog_1, {:name => prediction_model.endpoint, :group => ""} + report.ref_catalog :model_endpoint, :endpoints_catalog, :endpoints_catalog_1 + + # Endpoint Units 3.4 + report.value "endpoint_units", "#{prediction_model.unit}" + + model_type = model.class.to_s.gsub('OpenTox::Model::Lazar','') + + # Type of model 4.1 + report.value "algorithm_type", "#{model_type}" + + # Explicit algorithm 4.2 + report.change_catalog :algorithms_catalog, :algorithms_catalog_1, {:definition => "see Helma 2016 and lazar.in-silico.ch, submitted version: #{lazar_commit}", :description => "Neighbor algorithm: #{model.algorithms["similarity"]["method"].gsub('_',' 
').titleize}#{(model.algorithms["similarity"][:min] ? ' with similarity > ' + model.algorithms["similarity"][:min].to_s : '')}"} + report.ref_catalog :algorithm_explicit, :algorithms_catalog, :algorithms_catalog_1 + report.change_catalog :algorithms_catalog, :algorithms_catalog_3, {:definition => "see Helma 2016 and lazar.in-silico.ch, submitted version: #{lazar_commit}", :description => "modified k-nearest neighbor #{model_type}"} + report.ref_catalog :algorithm_explicit, :algorithms_catalog, :algorithms_catalog_3 + if model.algorithms["prediction"] + pred_algorithm_params = (model.algorithms["prediction"][:method] == "rf" ? "random forest" : model.algorithms["prediction"][:method]) + end + report.change_catalog :algorithms_catalog, :algorithms_catalog_2, {:definition => "see Helma 2016 and lazar.in-silico.ch, submitted version: #{lazar_commit}", :description => "Prediction algorithm: #{model.algorithms["prediction"].to_s.gsub('OpenTox::Algorithm::','').gsub('_',' ').gsub('.', ' with ')} #{(pred_algorithm_params ? 
pred_algorithm_params : '')}"} + report.ref_catalog :algorithm_explicit, :algorithms_catalog, :algorithms_catalog_2 + + # Descriptors in the model 4.3 + if model.algorithms["descriptors"][:type] + report.change_catalog :descriptors_catalog, :descriptors_catalog_1, {:description => "", :name => "#{model.algorithms["descriptors"][:type]}", :publication_ref => "", :units => ""} + report.ref_catalog :algorithms_descriptors, :descriptors_catalog, :descriptors_catalog_1 + end + + # Descriptor selection 4.4 + report.value "descriptors_selection", "#{model.algorithms["feature_selection"].gsub('_',' ')} #{model.algorithms["feature_selection"].collect{|k,v| k.to_s + ': ' + v.to_s}.join(', ')}" if model.algorithms["feature_selection"] + + # Algorithm and descriptor generation 4.5 + report.value "descriptors_generation", "exhaustive breadth first search for paths in chemical graphs (simplified MolFea algorithm)" + + # Software name and version for descriptor generation 4.6 + report.change_catalog :software_catalog, :software_catalog_2, {:name => "lazar, submitted version: #{lazar_commit}", :description => "simplified MolFea algorithm", :number => "2", :url => "https://lazar.in-silico.ch", :contact => "info@in-silico.ch"} + report.ref_catalog :descriptors_generation_software, :software_catalog, :software_catalog_2 + + # Chemicals/Descriptors ratio 4.7 + report.value "descriptors_chemicals_ratio", "not applicable (classification based on activities of neighbors, descriptors are used for similarity calculation)" + + # Description of the applicability domain of the model 5.1 + report.value "app_domain_description", " +

+ The applicability domain (AD) of the training set is characterized by + the confidence index of a prediction (high confidence index: close to + the applicability domain of the training set/reliable prediction, low + confidence: far from the applicability domain of the + training set/unreliable prediction). The confidence index considers (i) + the similarity and number of neighbors and (ii) contradictory examples + within the neighbors. A formal definition can be found in Helma 2006. +

+

+ The reliability of predictions decreases gradually with increasing + distance from the applicability domain (i.e. decreasing confidence index) +

+ + " + + # Method used to assess the applicability domain 5.2 + report.value "app_domain_method", "see Helma 2006 and Maunz 2008" + + # Software name and version for applicability domain assessment 5.3 + report.change_catalog :software_catalog, :software_catalog_3, {:name => "lazar, submitted version: #{lazar_commit}", :description => "integrated into main lazar algorithm", :number => "3", :url => "https://lazar.in-silico.ch", :contact => "info@in-silico.ch"} + report.ref_catalog :app_domain_software, :software_catalog, :software_catalog_3 + + # Limits of applicability 5.4 + report.value "applicability_limits", "Predictions with low confidence index, unknown substructures and neighbors that might act by different mechanisms" + + # Availability of the training set 6.1 + report.change_attributes "training_set_availability", {:answer => "Yes"} + + # Available information for the training set 6.2 + report.change_attributes "training_set_data", {:cas => "Yes", :chemname => "Yes", :formula => "Yes", :inchi => "Yes", :mol => "Yes", :smiles => "Yes"} + + # Data for each descriptor variable for the training set 6.3 + report.change_attributes "training_set_descriptors", {:answer => "No"} + + # Data for the dependent variable for the training set 6.4 + report.change_attributes "dependent_var_availability", {:answer => "All"} + + # Other information about the training set 6.5 + report.value "other_info", "#{prediction_model.source}" + + # Pre-processing of data before modelling 6.6 + report.value "preprocessing", (model.class == OpenTox::Model::LazarRegression ? 
"-log10 transformation" : "none") + + # Robustness - Statistics obtained by leave-many-out cross-validation 6.9 + if prediction_model.repeated_crossvalidation + $logger.error "#####################{prediction_model}" + crossvalidations = prediction_model.crossvalidations + out = haml File.read(validation_template), :layout=> false, :locals => {:model => prediction_model, :crossvalidations => crossvalidations} + report.value "lmo", out + end + + # Mechanistic basis of the model 8.1 + report.value "mechanistic_basis"," +

+ Compounds with similar structures (neighbors) are assumed to have + similar activities as the query compound. For the determination of + activity specific similarities only statistically relevant substructures + (paths) are used. For this reason there is a priori no bias towards + specific mechanistic hypotheses. +

+ +" + + # A priori or a posteriori mechanistic interpretation 8.2 + report.value "mechanistic_basis_comments","a posteriori for individual predictions" + + # Other information about the mechanistic interpretation 8.3 + report.value "mechanistic_basis_info","

Hypotheses about biochemical mechanisms can be derived from individual + predictions by inspecting neighbors and relevant fragments.

+

Neighbors are compounds that are similar with respect to a certain + endpoint and it is likely that compounds with high similarity act by + similar mechanisms as the query compound. Links at the webinterface + provide easy access to additional experimental data and literature + citations for the neighbors and the query structure.

+

Activating and deactivating parts of the query compound are highlighted + in red and green on the webinterface. Fragments that are unknown (or too + infrequent for statistical evaluation) are marked in yellow and + additional statistical information about the individual fragments can be + retrieved. Please note that lazar predictions are based on neighbors and + not on fragments. Fragments and their statistical significance are used + for the calculation of activity specific similarities.

" + + # Bibliography 9.2 + report.ref_catalog :bibliography, :publications_catalog, :publications_catalog_1 + report.ref_catalog :bibliography, :publications_catalog, :publications_catalog_2 + report.change_catalog :publications_catalog, :publications_catalog_3, {:title => "Helma (2006), Lazy structure-activity relationships (lazar) for the prediction of rodent carcinogenicity and Salmonella mutagenicity.", :url => "http://dx.doi.org/10.1007/s11030-005-9001-5"} + report.ref_catalog :bibliography, :publications_catalog, :publications_catalog_3 + + # output + t = Tempfile.new + t << report.to_xml + send_file t.path, :filename => "QMRF_report_#{model.name}.xml", :type => "application/xml", :disposition => "attachment" +end + get '/license' do @license = RDiscount.new(File.read("LICENSE.md")).to_html haml :license, :layout => false -- cgit v1.2.3