summary | refs | log | tree | commit | diff
path: root/lib
diff options
context:
space:
mode:
author Christoph Helma <helma@in-silico.ch> 2019-06-20 22:01:50 +0200
committer Christoph Helma <helma@in-silico.ch> 2019-06-20 22:01:50 +0200
commit 455da06aa6459da0d25b286ca6cb866ff64c4c34 (patch)
tree d0ed8fcf720a02742da781669251f379b8fd07f0 /lib
parent 1b44e0cd76f2ead93b8b3fa0f970c85ef32a4b14 (diff)
separate csv serialisations for batch predictions and training data, repeated measurements in mutagenicity dataset fixed, daphnia import fixed, CENTRAL_MONGO_IP removed
Diffstat (limited to 'lib')
-rw-r--r-- lib/dataset.rb 75
-rw-r--r-- lib/download.rb 3
-rw-r--r-- lib/feature.rb 11
-rw-r--r-- lib/lazar.rb 9
-rw-r--r-- lib/model.rb 8
-rw-r--r-- lib/validation-statistics.rb 4
6 files changed, 71 insertions, 39 deletions
diff --git a/lib/dataset.rb b/lib/dataset.rb
index df17569..596c53c 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -96,8 +96,14 @@ module OpenTox
# Get nominal and numeric prediction features
# @return [Array<OpenTox::NominalLazarPrediction,OpenTox::NumericLazarPrediction>]
- def prediction_features
- features.select{|f| f._type.match("Prediction")}
+ def prediction_feature
+ features.select{|f| f._type.match(/Prediction$/)}.first
+ end
+
+ # Get supporting nominal and numeric prediction features (class probabilities, prediction interval)
+ # @return [Array<OpenTox::LazarPredictionProbability,OpenTox::LazarPredictionInterval>]
+ def prediction_supporting_features
+ features.select{|f| f.is_a?(LazarPredictionProbability) or f.is_a?(LazarPredictionInterval)}
end
# Get nominal and numeric merged features
@@ -259,7 +265,7 @@ module OpenTox
feature_names = table.shift.collect{|f| f.strip}
raise ArgumentError, "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size
- if feature_names[0] =~ /ID/i # check ID column
+ if feature_names[0] !~ /SMILES|InChI/i # check ID column
original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => feature_names.shift)
else
original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => "LineID")
@@ -343,30 +349,52 @@ module OpenTox
# Serialisation
- # Convert dataset to csv format
+ # Convert lazar prediction dataset to csv format
# @return [String]
- def to_csv #inchi=false
- CSV.generate() do |csv|
-
- compound = substances.first.is_a? Compound
- f = features - original_id_features - original_smiles_features - warnings_features
- header = original_id_features.collect{|f| "ID "+Dataset.find(f.dataset_id).name}
- header += original_smiles_features.collect{|f| "SMILES "+Dataset.find(f.dataset_id).name} if compound
- compound ? header << "Canonical SMILES" : header << "Name"
- header += f.collect{|f| f.name}
- header += warnings_features.collect{|f| "Warnings "+Dataset.find(f.dataset_id).name}
- csv << header
-
- substances.each do |substance|
- row = original_id_features.collect{|f| values(substance,f).join(" ")}
- row += original_smiles_features.collect{|f| values(substance,f).join(" ")} if compound
- compound ? row << substance.smiles : row << substance.name
- row += f.collect{|f| values(substance,f).join(" ")}
- row += warnings_features.collect{|f| values(substance,f).uniq.join(" ")}
+ def to_prediction_csv
+
+ compound = substances.first.is_a? Compound
+ header = ["ID"]
+ header << "Original SMILES" if compound
+ compound ? header << "Canonical SMILES" : header << "Name"
+ header << "Prediction" if prediction_feature
+ header << "Confidence" if confidence_feature
+ header += prediction_supporting_features.collect{|f| f.name}
+ header << "Measurements"
+ csv = [header]
+
+ substances.each do |substance|
+ row = original_id_features.collect{|f| values(substance,f).join(" ")}
+ row += original_smiles_features.collect{|f| values(substance,f).join(" ")} if compound
+ compound ? row << substance.smiles : row << substance.name
+ row << values(substance,prediction_feature).join(" ")
+ row << values(substance,confidence_feature).join(" ")
+ row += prediction_supporting_features.collect{|f| values(substance,f).join(" ")}
+ row << values(substance,bioactivity_features[0]).join(" ")
+ csv << row
+ end
+ csv.collect{|r| r.join(",")}.join("\n")
+ end
+
+ # Convert dataset into csv formatted training data
+ # @return [String]
+ def to_training_csv
+
+ p features
+ p bioactivity_features
+ header = ["Canonical SMILES"]
+ header << bioactivity_features[0].name
+ csv = [header]
+
+ substances.each do |substance|
+ nr_activities = values(substance,bioactivity_features.first).size
+ (0..nr_activities-1).each do |n| # new row for each value
+ row = [substance.smiles]
+ row << values(substance,bioactivity_features[0])[n]
csv << row
end
-
end
+ csv.collect{|r| r.join(",")}.join("\n")
end
# Convert dataset to SDF format
@@ -396,7 +424,6 @@ module OpenTox
predictions = {}
substances.each do |s|
predictions[s] ||= {}
- prediction_feature = prediction_features.first
predictions[s][:value] = values(s,prediction_feature).first
#predictions[s][:warnings] = []
#warnings_features.each { |w| predictions[s][:warnings] += values(s,w) }
diff --git a/lib/download.rb b/lib/download.rb
index f17d060..2546dc4 100644
--- a/lib/download.rb
+++ b/lib/download.rb
@@ -122,7 +122,6 @@ module OpenTox
# Combine mutagenicity data from Kazius, Hansen and EFSA and download into the data folder
def self.mutagenicity
$logger.debug "Mutagenicity"
- # TODO add download/conversion programs to lazar dependencies
hansen_url = "http://doc.ml.tu-berlin.de/toxbenchmark/Mutagenicity_N6512.csv"
kazius_url = "http://cheminformatics.org/datasets/bursi/cas_4337.zip"
efsa_url = "https://data.europa.eu/euodp/data/storage/f/2017-07-19T142131/GENOTOX data and dictionary.xls"
@@ -185,7 +184,7 @@ module OpenTox
map = {"mutagen" => "mutagenic", "nonmutagen" => "non-mutagenic"}
dataset = Dataset.merge datasets: datasets, features: datasets.collect{|d| d.bioactivity_features.first}, value_maps: [nil,nil,map], keep_original_features: false, remove_duplicates: true
dataset.merged_features.first.name = "Mutagenicity"
- File.open(File.join(DATA,"Mutagenicity-Salmonella_typhimurium.csv"),"w+"){|f| f.puts dataset.to_csv}
+ File.open(File.join(DATA,"Mutagenicity-Salmonella_typhimurium.csv"),"w+"){|f| f.puts dataset.to_training_csv}
meta = {
:species => "Salmonella typhimurium",
:endpoint => "Mutagenicity",
diff --git a/lib/feature.rb b/lib/feature.rb
index 72c26d7..296a174 100644
--- a/lib/feature.rb
+++ b/lib/feature.rb
@@ -18,6 +18,9 @@ module OpenTox
# Confidence
class Confidence < Feature
field :dataset_id, type: BSON::ObjectId
+ def name
+ "Confidence"
+ end
end
# Categorical variables
@@ -66,13 +69,13 @@ module OpenTox
field :model_id, type: BSON::ObjectId
field :training_feature_id, type: BSON::ObjectId
def name
- "#{self[:name]} Prediction"
+ "Prediction: #{self[:name]}"
end
end
class LazarPredictionProbability < NominalLazarPrediction
def name
- "probability(#{self[:name]})"
+ "Probability: #{self[:name]}"
end
end
@@ -81,13 +84,13 @@ module OpenTox
field :model_id, type: BSON::ObjectId
field :training_feature_id, type: BSON::ObjectId
def name
- "#{self[:name]} Prediction"
+ "Prediction: #{self[:name]}"
end
end
class LazarPredictionInterval < NumericLazarPrediction
def name
- "prediction_interval_#{self[:name]}"
+ "#{self[:name].capitalize} prediction interval"
end
end
diff --git a/lib/lazar.rb b/lib/lazar.rb
index 2a3f749..e77de9d 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -17,19 +17,22 @@ raise "Incorrect lazar environment variable LAZAR_ENV '#{ENV["LAZAR_ENV"]}', ple
ENV["MONGOID_ENV"] = ENV["LAZAR_ENV"]
ENV["RACK_ENV"] = ENV["LAZAR_ENV"] # should set sinatra environment
+# CH: this interferes with /etc/hosts on my machine
# search for a central mongo database in use
# http://opentox.github.io/installation/2017/03/07/use-central-mongodb-in-docker-environment
-CENTRAL_MONGO_IP = `grep -oP '^\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}(?=.*mongodb)' /etc/hosts`.chomp
+# CENTRAL_MONGO_IP = `grep -oP '^\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}(?=.*mongodb)' /etc/hosts`.chomp
Mongoid.load_configuration({
:clients => {
:default => {
:database => ENV["LAZAR_ENV"],
- :hosts => (CENTRAL_MONGO_IP.blank? ? ["localhost:27017"] : ["#{CENTRAL_MONGO_IP}:27017"]),
+ #:hosts => (CENTRAL_MONGO_IP.blank? ? ["localhost:27017"] : ["#{CENTRAL_MONGO_IP}:27017"]),
+ :hosts => ["localhost:27017"]
}
}
})
Mongoid.raise_not_found_error = false # return nil if no document is found
-$mongo = Mongo::Client.new("mongodb://#{(CENTRAL_MONGO_IP.blank? ? "127.0.0.1" : CENTRAL_MONGO_IP)}:27017/#{ENV['LAZAR_ENV']}")
+#$mongo = Mongo::Client.new("mongodb://#{(CENTRAL_MONGO_IP.blank? ? "127.0.0.1" : CENTRAL_MONGO_IP)}:27017/#{ENV['LAZAR_ENV']}")
+$mongo = Mongo::Client.new("mongodb://127.0.0.1:27017/#{ENV['LAZAR_ENV']}")
$gridfs = $mongo.database.fs
# Logger setup
diff --git a/lib/model.rb b/lib/model.rb
index cbfefe3..05cd113 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -286,14 +286,14 @@ module OpenTox
end
if threshold == algorithms[:similarity][:min].first
if prediction[:warnings].empty?
- prediction[:confidence] = "High (close to bioassay results)"
+ prediction[:confidence] = "Similar to bioassay results"
return prediction
else # try again with a lower threshold
prediction[:warnings] << "Lowering similarity threshold to #{algorithms[:similarity][:min].last}."
predict_substance substance, algorithms[:similarity][:min].last, prediction
end
elsif threshold < algorithms[:similarity][:min].first
- prediction[:confidence] = "Low (lower than bioassay results)"
+ prediction[:confidence] = "Lower than bioassay results"
return prediction
end
end
@@ -348,9 +348,9 @@ module OpenTox
end
elsif prediction_feature.is_a? NumericBioActivity
f = NumericLazarPrediction.find_or_create_by(:name => prediction_feature.name, :unit => prediction_feature.unit, :model_id => self.id, :training_feature_id => prediction_feature.id)
- prediction_interval = {}
+ prediction_interval = []
["lower","upper"].each do |v|
- prediction_interval[v] = LazarPredictionInterval.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id)
+ prediction_interval << LazarPredictionInterval.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id)
end
end
diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
index 8a8970e..d603294 100644
--- a/lib/validation-statistics.rb
+++ b/lib/validation-statistics.rb
@@ -18,7 +18,7 @@ module OpenTox
if pred[:value] == v
confusion_matrix[:all][i][i] += 1
self.nr_predictions[:all] += 1
- if pred[:confidence].match(/High/i)
+ if pred[:confidence].match(/Similar/i)
confusion_matrix[:confidence_high][i][i] += 1
self.nr_predictions[:confidence_high] += 1
elsif pred[:confidence].match(/Low/i)
@@ -32,7 +32,7 @@ module OpenTox
if pred[:value] == v
confusion_matrix[:all][i][(i+1)%2] += 1
self.nr_predictions[:all] += 1
- if pred[:confidence].match(/High/i)
+ if pred[:confidence].match(/Similar/i)
confusion_matrix[:confidence_high][i][(i+1)%2] += 1
self.nr_predictions[:confidence_high] += 1
elsif pred[:confidence].match(/Low/i)