From 455da06aa6459da0d25b286ca6cb866ff64c4c34 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 20 Jun 2019 22:01:50 +0200 Subject: separate csv serialisations for batch predictions and training data, repeated measurements in mutagenicity dataset fixed, daphnia import fixed, CENTRAL_MONGO_IP removed --- lib/dataset.rb | 75 ++++++++++++++++++++++++++++++-------------- lib/download.rb | 3 +- lib/feature.rb | 11 ++++--- lib/lazar.rb | 9 ++++-- lib/model.rb | 8 ++--- lib/validation-statistics.rb | 4 +-- 6 files changed, 71 insertions(+), 39 deletions(-) (limited to 'lib') diff --git a/lib/dataset.rb b/lib/dataset.rb index df17569..596c53c 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -96,8 +96,14 @@ module OpenTox # Get nominal and numeric prediction features # @return [Array] - def prediction_features - features.select{|f| f._type.match("Prediction")} + def prediction_feature + features.select{|f| f._type.match(/Prediction$/)}.first + end + + # Get supporting nominal and numeric prediction features (class probabilities, prediction interval) + # @return [Array] + def prediction_supporting_features + features.select{|f| f.is_a?(LazarPredictionProbability) or f.is_a?(LazarPredictionInterval)} end # Get nominal and numeric merged features @@ -259,7 +265,7 @@ module OpenTox feature_names = table.shift.collect{|f| f.strip} raise ArgumentError, "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size - if feature_names[0] =~ /ID/i # check ID column + if feature_names[0] !~ /SMILES|InChI/i # check ID column original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => feature_names.shift) else original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => "LineID") @@ -343,30 +349,52 @@ module OpenTox # Serialisation - # Convert dataset to csv format + # Convert lazar prediction dataset to csv format # @return [String] - def to_csv #inchi=false - CSV.generate() do |csv| - - compound = substances.first.is_a? Compound - f = features - original_id_features - original_smiles_features - warnings_features - header = original_id_features.collect{|f| "ID "+Dataset.find(f.dataset_id).name} - header += original_smiles_features.collect{|f| "SMILES "+Dataset.find(f.dataset_id).name} if compound - compound ? header << "Canonical SMILES" : header << "Name" - header += f.collect{|f| f.name} - header += warnings_features.collect{|f| "Warnings "+Dataset.find(f.dataset_id).name} - csv << header - - substances.each do |substance| - row = original_id_features.collect{|f| values(substance,f).join(" ")} - row += original_smiles_features.collect{|f| values(substance,f).join(" ")} if compound - compound ? row << substance.smiles : row << substance.name - row += f.collect{|f| values(substance,f).join(" ")} - row += warnings_features.collect{|f| values(substance,f).uniq.join(" ")} + def to_prediction_csv + + compound = substances.first.is_a? Compound + header = ["ID"] + header << "Original SMILES" if compound + compound ? header << "Canonical SMILES" : header << "Name" + header << "Prediction" if prediction_feature + header << "Confidence" if confidence_feature + header += prediction_supporting_features.collect{|f| f.name} + header << "Measurements" + csv = [header] + + substances.each do |substance| + row = original_id_features.collect{|f| values(substance,f).join(" ")} + row += original_smiles_features.collect{|f| values(substance,f).join(" ")} if compound + compound ? row << substance.smiles : row << substance.name + row << values(substance,prediction_feature).join(" ") + row << values(substance,confidence_feature).join(" ") + row += prediction_supporting_features.collect{|f| values(substance,f).join(" ")} + row << values(substance,bioactivity_features[0]).join(" ") + csv << row + end + csv.collect{|r| r.join(",")}.join("\n") + end + + # Convert dataset into csv formatted training data + # @return [String] + def to_training_csv + + p features + p bioactivity_features + header = ["Canonical SMILES"] + header << bioactivity_features[0].name + csv = [header] + + substances.each do |substance| + nr_activities = values(substance,bioactivity_features.first).size + (0..nr_activities-1).each do |n| # new row for each value + row = [substance.smiles] + row << values(substance,bioactivity_features[0])[n] csv << row end - end + csv.collect{|r| r.join(",")}.join("\n") end # Convert dataset to SDF format @@ -396,7 +424,6 @@ module OpenTox predictions = {} substances.each do |s| predictions[s] ||= {} - prediction_feature = prediction_features.first predictions[s][:value] = values(s,prediction_feature).first #predictions[s][:warnings] = [] #warnings_features.each { |w| predictions[s][:warnings] += values(s,w) } diff --git a/lib/download.rb b/lib/download.rb index f17d060..2546dc4 100644 --- a/lib/download.rb +++ b/lib/download.rb @@ -122,7 +122,6 @@ module OpenTox # Combine mutagenicity data from Kazius, Hansen and EFSA and download into the data folder def self.mutagenicity $logger.debug "Mutagenicity" - # TODO add download/conversion programs to lazar dependencies hansen_url = "http://doc.ml.tu-berlin.de/toxbenchmark/Mutagenicity_N6512.csv" kazius_url = "http://cheminformatics.org/datasets/bursi/cas_4337.zip" efsa_url = "https://data.europa.eu/euodp/data/storage/f/2017-07-19T142131/GENOTOX data and dictionary.xls" @@ -185,7 +184,7 @@ module OpenTox map = {"mutagen" => "mutagenic", "nonmutagen" => "non-mutagenic"} dataset = Dataset.merge datasets: datasets, features: datasets.collect{|d| d.bioactivity_features.first}, value_maps: [nil,nil,map], keep_original_features: false, remove_duplicates: true dataset.merged_features.first.name = "Mutagenicity" - File.open(File.join(DATA,"Mutagenicity-Salmonella_typhimurium.csv"),"w+"){|f| f.puts dataset.to_csv} + File.open(File.join(DATA,"Mutagenicity-Salmonella_typhimurium.csv"),"w+"){|f| f.puts dataset.to_training_csv} meta = { :species => "Salmonella typhimurium", :endpoint => "Mutagenicity", diff --git a/lib/feature.rb b/lib/feature.rb index 72c26d7..296a174 100644 --- a/lib/feature.rb +++ b/lib/feature.rb @@ -18,6 +18,9 @@ module OpenTox # Confidence class Confidence < Feature field :dataset_id, type: BSON::ObjectId + def name + "Confidence" + end end # Categorical variables @@ -66,13 +69,13 @@ module OpenTox field :model_id, type: BSON::ObjectId field :training_feature_id, type: BSON::ObjectId def name - "#{self[:name]} Prediction" + "Prediction: #{self[:name]}" end end class LazarPredictionProbability < NominalLazarPrediction def name - "probability(#{self[:name]})" + "Probability: #{self[:name]}" end end @@ -81,13 +84,13 @@ module OpenTox field :model_id, type: BSON::ObjectId field :training_feature_id, type: BSON::ObjectId def name - "#{self[:name]} Prediction" + "Prediction: #{self[:name]}" end end class LazarPredictionInterval < NumericLazarPrediction def name - "prediction_interval_#{self[:name]}" + "#{self[:name].capitalize} prediction interval" end end diff --git a/lib/lazar.rb b/lib/lazar.rb index 2a3f749..e77de9d 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -17,19 +17,22 @@ raise "Incorrect lazar environment variable LAZAR_ENV '#{ENV["LAZAR_ENV"]}', ple ENV["MONGOID_ENV"] = ENV["LAZAR_ENV"] ENV["RACK_ENV"] = ENV["LAZAR_ENV"] # should set sinatra environment +# CH: this interferes with /etc/hosts on my machine # search for a central mongo database in use # http://opentox.github.io/installation/2017/03/07/use-central-mongodb-in-docker-environment -CENTRAL_MONGO_IP = `grep -oP '^\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}(?=.*mongodb)' /etc/hosts`.chomp +# CENTRAL_MONGO_IP = `grep -oP '^\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}(?=.*mongodb)' /etc/hosts`.chomp Mongoid.load_configuration({ :clients => { :default => { :database => ENV["LAZAR_ENV"], - :hosts => (CENTRAL_MONGO_IP.blank? ? ["localhost:27017"] : ["#{CENTRAL_MONGO_IP}:27017"]), + #:hosts => (CENTRAL_MONGO_IP.blank? ? ["localhost:27017"] : ["#{CENTRAL_MONGO_IP}:27017"]), + :hosts => ["localhost:27017"] } } }) Mongoid.raise_not_found_error = false # return nil if no document is found -$mongo = Mongo::Client.new("mongodb://#{(CENTRAL_MONGO_IP.blank? ? "127.0.0.1" : CENTRAL_MONGO_IP)}:27017/#{ENV['LAZAR_ENV']}") +#$mongo = Mongo::Client.new("mongodb://#{(CENTRAL_MONGO_IP.blank? ? "127.0.0.1" : CENTRAL_MONGO_IP)}:27017/#{ENV['LAZAR_ENV']}") +$mongo = Mongo::Client.new("mongodb://127.0.0.1:27017/#{ENV['LAZAR_ENV']}") $gridfs = $mongo.database.fs # Logger setup diff --git a/lib/model.rb b/lib/model.rb index cbfefe3..05cd113 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -286,14 +286,14 @@ module OpenTox end if threshold == algorithms[:similarity][:min].first if prediction[:warnings].empty? - prediction[:confidence] = "High (close to bioassay results)" + prediction[:confidence] = "Similar to bioassay results" return prediction else # try again with a lower threshold prediction[:warnings] << "Lowering similarity threshold to #{algorithms[:similarity][:min].last}." predict_substance substance, algorithms[:similarity][:min].last, prediction end elsif threshold < algorithms[:similarity][:min].first - prediction[:confidence] = "Low (lower than bioassay results)" + prediction[:confidence] = "Lower than bioassay results" return prediction end end @@ -348,9 +348,9 @@ module OpenTox end elsif prediction_feature.is_a? NumericBioActivity f = NumericLazarPrediction.find_or_create_by(:name => prediction_feature.name, :unit => prediction_feature.unit, :model_id => self.id, :training_feature_id => prediction_feature.id) - prediction_interval = {} + prediction_interval = [] ["lower","upper"].each do |v| - prediction_interval[v] = LazarPredictionInterval.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id) + prediction_interval << LazarPredictionInterval.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id) end end diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index 8a8970e..d603294 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -18,7 +18,7 @@ module OpenTox if pred[:value] == v confusion_matrix[:all][i][i] += 1 self.nr_predictions[:all] += 1 - if pred[:confidence].match(/High/i) + if pred[:confidence].match(/Similar/i) confusion_matrix[:confidence_high][i][i] += 1 self.nr_predictions[:confidence_high] += 1 elsif pred[:confidence].match(/Low/i) @@ -32,7 +32,7 @@ module OpenTox if pred[:value] == v confusion_matrix[:all][i][(i+1)%2] += 1 self.nr_predictions[:all] += 1 - if pred[:confidence].match(/High/i) + if pred[:confidence].match(/Similar/i) confusion_matrix[:confidence_high][i][(i+1)%2] += 1 self.nr_predictions[:confidence_high] += 1 elsif pred[:confidence].match(/Low/i) -- cgit v1.2.3