Diffstat (limited to 'lib')
-rw-r--r--  lib/caret-classification.rb      107
-rw-r--r--  lib/caret.rb                      11
-rw-r--r--  lib/classification.rb              5
-rw-r--r--  lib/compound.rb                   80
-rw-r--r--  lib/crossvalidation.rb            42
-rw-r--r--  lib/dataset.rb                   537
-rw-r--r--  lib/download.rb                  358
-rw-r--r--  lib/enm-import.rb                125
-rw-r--r--  lib/error.rb                      66
-rw-r--r--  lib/feature.rb                   113
-rw-r--r--  lib/import.rb                    132
-rw-r--r--  lib/lazar.rb                      17
-rw-r--r--  lib/leave-one-out-validation.rb   41
-rw-r--r--  lib/model.rb                     123
-rw-r--r--  lib/opentox.rb                     7
-rw-r--r--  lib/overwrite.rb                   2
-rw-r--r--  lib/physchem.rb                    8
-rw-r--r--  lib/regression.rb                  2
-rw-r--r--  lib/rest-client-wrapper.rb        10
-rw-r--r--  lib/substance.rb                   1
-rw-r--r--  lib/train-test-validation.rb      13
-rw-r--r--  lib/validation-statistics.rb     176
-rw-r--r--  lib/validation.rb                  2
23 files changed, 1356 insertions, 622 deletions
diff --git a/lib/caret-classification.rb b/lib/caret-classification.rb
new file mode 100644
index 0000000..fefe6b6
--- /dev/null
+++ b/lib/caret-classification.rb
@@ -0,0 +1,107 @@
+module OpenTox
+ module Algorithm
+
+ # Ruby interface for the R caret package
+ # Caret model list: https://topepo.github.io/caret/modelList.html
+ class Caret
+
+ # Create a local R caret model and make a prediction
+ # @param [Array<Float,Bool>] dependent_variables
+ # @param [Array<Array<Float,Bool>>] independent_variables
+ # @param [Array<Float>] weights
+ # @param [String] method caret method name (see model list above)
+ # @param [Array<Float,Bool>] query_variables
+ # @return [Hash] prediction (:value, :probabilities, :warnings)
+ def self.create_model_and_predict dependent_variables:, independent_variables:, weights:, method:, query_variables:
+ remove = []
+ # remove independent_variables with single values
+ independent_variables.each_with_index { |values,i| remove << i if values.uniq.size == 1}
+ remove.sort.reverse.each do |i|
+ independent_variables.delete_at i
+ query_variables.delete_at i
+ end
+ if independent_variables.flatten.uniq == ["NA"] or independent_variables.flatten.uniq == []
+ prediction = Algorithm::Classification::weighted_majority_vote dependent_variables:dependent_variables, weights:weights
+ prediction[:warnings] << "No variables for classification model. Using weighted average of similar substances."
+ elsif dependent_variables.uniq.size == 1
+ prediction = Algorithm::Classification::weighted_majority_vote dependent_variables:dependent_variables, weights:weights
+ prediction[:warnings] << "All neighbors have the same measured activity. Cannot create random forest model, using weighted average of similar substances."
+ elsif dependent_variables.size < 3
+ prediction = Algorithm::Classification::weighted_majority_vote dependent_variables:dependent_variables, weights:weights
+ prediction[:warnings] << "Insufficient number of neighbors (#{dependent_variables.size}) for classification model. Using weighted average of similar substances."
+ else
+ dependent_variables.collect!{|v| to_r(v)}
+ independent_variables.each_with_index do |c,i|
+ c.each_with_index do |v,j|
+ independent_variables[i][j] = to_r(v)
+ end
+ end
+# query_variables.collect!{|v| to_r(v)}
+ begin
+ R.assign "weights", weights
+ #r_data_frame = "data.frame(#{([dependent_variables.collect{|v| to_r(v)}]+independent_variables).collect{|r| "c(#{r.collect{|v| to_r(v)}.join(',')})"}.join(', ')})"
+ r_data_frame = "data.frame(#{([dependent_variables]+independent_variables).collect{|r| "c(#{r.join(',')})"}.join(', ')})"
+ #p r_data_frame
+ R.eval "data <- #{r_data_frame}"
+ R.assign "features", (0..independent_variables.size-1).to_a
+ R.eval "names(data) <- append(c('activities'),features)" #
+ p "train"
+ R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass, allowParallel=TRUE)"
+ p "done"
+ rescue => e
+ $logger.debug "R caret model creation error for: #{e.message}"
+ $logger.debug dependent_variables
+ $logger.debug independent_variables
+ prediction = Algorithm::Classification::weighted_majority_vote dependent_variables:dependent_variables, weights:weights
+ prediction[:warnings] << "R caret model creation error. Using weighted average of similar substances."
+ return prediction
+ end
+ begin
+ R.eval "query <- data.frame(rbind(c(#{query_variables.collect{|v| to_r(v)}.join ','})))"
+ R.eval "names(query) <- features"
+ R.eval "prediction <- predict(model,query, type=\"prob\")"
+ names = R.eval("names(prediction)").to_ruby
+ probs = R.eval("prediction").to_ruby
+ probabilities = {}
+ names.each_with_index { |n,i| probabilities[n] = probs[i] }
+ value = probabilities.sort_by{|n,p| -p }[0][0]
+ prediction = {
+ :value => value,
+ :probabilities => probabilities,
+ :warnings => [],
+ }
+ rescue => e
+ $logger.debug "R caret prediction error for: #{e.inspect}"
+ $logger.debug self.inspect
+ prediction = Algorithm::Classification::weighted_majority_vote dependent_variables:dependent_variables, weights:weights
+ prediction[:warnings] << "R caret prediction error. Using weighted average of similar substances"
+ return prediction
+ end
+ if prediction.nil? or prediction[:value].nil?
+ prediction = Algorithm::Classification::weighted_majority_vote dependent_variables:dependent_variables, weights:weights
+ prediction[:warnings] << "Empty R caret prediction. Using weighted average of similar substances."
+ end
+ end
+ prediction
+
+ end
+
+ # Call caret methods dynamically, e.g. Caret.pls
+ def self.method_missing(sym, *args, &block)
+ args.first[:method] = sym.to_s
+ self.create_model_and_predict args.first
+ end
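+ # A minimal usage sketch with hypothetical neighbor data; "rf" is one of
+ # the methods from the caret model list above, columns in
+ # independent_variables are per-feature value vectors across neighbors:
+ # Caret.rf dependent_variables: ["mutagenic","non-mutagenic","mutagenic"],
+ # independent_variables: [[1.2,0.8,1.1],[0.3,0.5,0.2]],
+ # weights: [0.9,0.5,0.7], query_variables: [1.0,0.4]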
+
+ # Convert Ruby values to R values
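+ # Example conversions (illustrative):
+ # to_r(true) #=> "T"
+ # to_r(Float::NAN) #=> nil
+ # to_r("mutagenic") #=> "\"mutagenic\""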
+ def self.to_r v
+ return "F" if v == false
+ return "T" if v == true
+ return nil if v.is_a? Float and v.nan?
+ return "\"#{v}\"" if v.is_a? String
+ v
+ end
+
+ end
+ end
+end
+
diff --git a/lib/caret.rb b/lib/caret.rb
index 8bccf74..2e5f1bc 100644
--- a/lib/caret.rb
+++ b/lib/caret.rb
@@ -22,11 +22,11 @@ module OpenTox
end
if independent_variables.flatten.uniq == ["NA"] or independent_variables.flatten.uniq == []
prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
- prediction[:warnings] << "No variables for regression model. Using weighted average of similar substances."
+ prediction[:warnings] = ["No variables for regression model, using weighted average of similar substances (no prediction interval available)."]
elsif
dependent_variables.size < 3
prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
- prediction[:warnings] << "Insufficient number of neighbors (#{dependent_variables.size}) for regression model. Using weighted average of similar substances."
+ prediction[:warnings] = ["Insufficient number of neighbors (#{dependent_variables.size}) for regression model, using weighted average of similar substances (no prediction interval available)."]
else
dependent_variables.each_with_index do |v,i|
dependent_variables[i] = to_r(v)
@@ -51,7 +51,8 @@ module OpenTox
$logger.debug dependent_variables
$logger.debug independent_variables
prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
- prediction[:warnings] << "R caret model creation error. Using weighted average of similar substances."
+ prediction[:warnings] ||= []
+ prediction[:warnings] << "R caret model creation error, using weighted average of similar substances (no prediction interval available)."
return prediction
end
begin
@@ -72,12 +73,12 @@ module OpenTox
$logger.debug "R caret prediction error for:"
$logger.debug self.inspect
prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
- prediction[:warnings] << "R caret prediction error. Using weighted average of similar substances"
+ prediction[:warnings] << "R caret prediction error, using weighted average of similar substances (no prediction interval available)."
return prediction
end
if prediction.nil? or prediction[:value].nil?
prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights
- prediction[:warnings] << "Empty R caret prediction. Using weighted average of similar substances."
+ prediction[:warnings] << "Empty R caret prediction, using weighted average of similar substances (no prediction interval available)."
end
end
prediction
diff --git a/lib/classification.rb b/lib/classification.rb
index a875903..638492b 100644
--- a/lib/classification.rb
+++ b/lib/classification.rb
@@ -18,11 +18,6 @@ module OpenTox
class_weights.each do |a,w|
probabilities[a] = w.sum/weights.sum
end
- # DG: hack to ensure always two probability values
- if probabilities.keys.uniq.size == 1
- missing_key = probabilities.keys.uniq[0].match(/^non/) ? probabilities.keys.uniq[0].sub(/non-/,"") : "non-"+probabilities.keys.uniq[0]
- probabilities[missing_key] = 0.0
- end
probabilities = probabilities.collect{|a,p| [a,weights.max*p]}.to_h
p_max = probabilities.collect{|a,p| p}.max
prediction = probabilities.key(p_max)
diff --git a/lib/compound.rb b/lib/compound.rb
index bfe69e3..6d0e075 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -1,5 +1,3 @@
-CACTUS_URI="https://cactus.nci.nih.gov/chemical/structure/"
-
module OpenTox
# Small molecules with defined chemical structures
@@ -12,7 +10,6 @@ module OpenTox
field :inchikey, type: String
field :names, type: Array
field :cid, type: String
- field :chemblid, type: String
field :png_id, type: BSON::ObjectId
field :svg_id, type: BSON::ObjectId
field :sdf_id, type: BSON::ObjectId
@@ -35,13 +32,11 @@ module OpenTox
def fingerprint type=DEFAULT_FINGERPRINT
unless fingerprints[type]
return [] unless self.smiles
- #http://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format
- if type == "MP2D"
+ if type == "MP2D" # http://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format
fp = obconversion(smiles,"smi","mpd").strip.split("\t")
name = fp.shift # remove Title
fingerprints[type] = fp.uniq # no fingerprint counts
- #http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html
- elsif type== "MNA"
+ elsif type== "MNA" # http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html
level = 2 # TODO: level as parameter, evaluate level 1, see paper
fp = obconversion(smiles,"smi","mna","xL\"#{level}\"").split("\n")
fp.shift # remove Title
@@ -130,30 +125,17 @@ module OpenTox
# @param [String] smiles
# @return [OpenTox::Compound]
def self.from_smiles smiles
- if smiles.match(/\s/) # spaces seem to confuse obconversion and may lead to invalid smiles
- $logger.warn "SMILES parsing failed for '#{smiles}'', SMILES string contains whitespaces."
- return nil
- end
+ return nil if smiles.match(/\s/) # spaces seem to confuse obconversion and may lead to invalid smiles
smiles = obconversion(smiles,"smi","can") # test if SMILES is correct and return canonical smiles (for compound comparisons)
- if smiles.empty?
- $logger.warn "SMILES parsing failed for '#{smiles}'', this may be caused by an incorrect SMILES string."
- return nil
- else
- Compound.find_or_create_by :smiles => smiles
- end
+ smiles.empty? ? nil : Compound.find_or_create_by(:smiles => smiles)
end
# Create a compound from InChI string
# @param [String] InChI
# @return [OpenTox::Compound]
def self.from_inchi inchi
- #smiles = `echo "#{inchi}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -iinchi - -ocan`.chomp.strip
smiles = obconversion(inchi,"inchi","can")
- if smiles.empty?
- Compound.find_or_create_by(:warnings => ["InChi parsing failed for #{inchi}, this may be caused by an incorrect InChi string or a bug in OpenBabel libraries."])
- else
- Compound.find_or_create_by(:smiles => smiles, :inchi => inchi)
- end
+ smiles.empty? ? nil : Compound.find_or_create_by(:smiles => smiles)
end
# Create a compound from SDF
@@ -170,7 +152,7 @@ module OpenTox
# @param [String] name, can be also an InChI/InChiKey, CAS number, etc
# @return [OpenTox::Compound]
def self.from_name name
- Compound.from_smiles RestClientWrapper.get(File.join(CACTUS_URI,URI.escape(name),"smiles"))
+ Compound.from_smiles RestClientWrapper.get(File.join(PUBCHEM_URI,"compound","name",URI.escape(name),"property","CanonicalSMILES","TXT")).chomp
end
# Get InChI
@@ -238,56 +220,16 @@ module OpenTox
# names = compound.names
# @return [Array<String>]
def names
- update(:names => RestClientWrapper.get("#{CACTUS_URI}#{inchi}/names").split("\n")) unless self["names"]
+ update(:names => RestClientWrapper.get(File.join(PUBCHEM_URI,"compound","smiles",URI.escape(smiles),"synonyms","TXT")).split("\n")) #unless self["names"]
self["names"]
end
# Get PubChem Compound Identifier (CID), obtained via REST call to PubChem
# @return [String]
def cid
- pug_uri = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/"
- update(:cid => RestClientWrapper.post(File.join(pug_uri, "compound", "inchi", "cids", "TXT"),{:inchi => inchi}).strip) unless self["cid"]
+ update(:cid => RestClientWrapper.post(File.join(PUBCHEM_URI, "compound", "inchi", "cids", "TXT"),{:inchi => inchi}).strip) unless self["cid"]
self["cid"]
end
-
- # Get ChEMBL database compound id, obtained via REST call to ChEMBL
- # @return [String]
- def chemblid
- # https://www.ebi.ac.uk/chembldb/ws#individualCompoundByInChiKey
- uri = "https://www.ebi.ac.uk/chemblws/compounds/smiles/#{smiles}.json"
- update(:chemblid => JSON.parse(RestClientWrapper.get(uri))["compounds"].first["chemblId"]) unless self["chemblid"]
- self["chemblid"]
- end
-
- def db_neighbors min_sim: 0.1, dataset_id:
- #p fingerprints[DEFAULT_FINGERPRINT]
- # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb
-
- #qn = default_fingerprint_size
- #qmin = qn * threshold
- #qmax = qn / threshold
- #not sure if it is worth the effort of keeping feature counts up to date (compound deletions, additions, ...)
- #reqbits = [count['_id'] for count in db.mfp_counts.find({'_id': {'$in': qfp}}).sort('count', 1).limit(qn - qmin + 1)]
- aggregate = [
- #{'$match': {'mfp.count': {'$gte': qmin, '$lte': qmax}, 'mfp.bits': {'$in': reqbits}}},
- #{'$match' => {'_id' => {'$ne' => self.id}}}, # remove self
- {'$project' => {
- 'similarity' => {'$let' => {
- 'vars' => {'common' => {'$size' => {'$setIntersection' => ["$fingerprints.#{DEFAULT_FINGERPRINT}", fingerprints[DEFAULT_FINGERPRINT]]}}},
- 'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [default_fingerprint_size, '$default_fingerprint_size']}, '$$common']}]}
- }},
- '_id' => 1,
- #'measurements' => 1,
- 'dataset_ids' => 1
- }},
- {'$match' => {'similarity' => {'$gte' => min_sim}}},
- {'$sort' => {'similarity' => -1}}
- ]
-
- # TODO move into aggregate pipeline, see http://stackoverflow.com/questions/30537317/mongodb-aggregation-match-if-value-in-array
- $mongo["substances"].aggregate(aggregate).select{|r| r["dataset_ids"].include? dataset_id}
-
- end
# Convert mmol to mg
# @return [Float] value in mg
@@ -319,7 +261,7 @@ module OpenTox
obconversion.read_string obmol, identifier
case output_format
when /smi|can|inchi/
- obconversion.write_string(obmol).gsub(/\s/,'').chomp
+ obconversion.write_string(obmol).split(/\s/).first
when /sdf/
# TODO: find disconnected structures
# strip_salts
@@ -332,11 +274,11 @@ module OpenTox
print sdf
if sdf.match(/.nan/)
- $logger.warn "3D generation failed for compound #{identifier}, trying to calculate 2D structure"
+ #warn "3D generation failed for compound #{identifier}, trying to calculate 2D structure"
obconversion.set_options("gen2D", OpenBabel::OBConversion::GENOPTIONS)
sdf = obconversion.write_string(obmol)
if sdf.match(/.nan/)
- $logger.warn "2D generation failed for compound #{identifier}, rendering without coordinates."
+ #warn "2D generation failed for compound #{identifier}, rendering without coordinates."
obconversion.remove_option("gen2D", OpenBabel::OBConversion::GENOPTIONS)
sdf = obconversion.write_string(obmol)
end
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index 06a1e2a..e1761bc 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -15,7 +15,7 @@ module OpenTox
$logger.debug model.algorithms
klass = ClassificationCrossValidation if model.is_a? Model::LazarClassification
klass = RegressionCrossValidation if model.is_a? Model::LazarRegression
- bad_request_error "Unknown model class #{model.class}." unless klass
+ raise ArgumentError, "Unknown model class #{model.class}." unless klass
cv = klass.new(
name: model.name,
@@ -24,24 +24,16 @@ module OpenTox
)
cv.save # set created_at
- nr_instances = 0
- nr_unpredicted = 0
training_dataset = model.training_dataset
training_dataset.folds(n).each_with_index do |fold,fold_nr|
#fork do # parallel execution of validations can lead to Rserve and memory problems
- $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started"
- t = Time.now
- validation = TrainTest.create(model, fold[0], fold[1])
- cv.validation_ids << validation.id
- cv.nr_instances += validation.nr_instances
- cv.nr_unpredicted += validation.nr_unpredicted
- #cv.predictions.merge! validation.predictions
- $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds"
- #end
+ $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started"
+ t = Time.now
+ validation = TrainTest.create(model, fold[0], fold[1])
+ cv.validation_ids << validation.id
+ $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds"
end
- #Process.waitall
cv.save
- $logger.debug "Nr unpredicted: #{nr_unpredicted}"
cv.statistics
cv.update_attributes(finished_at: Time.now)
cv
@@ -72,25 +64,25 @@ module OpenTox
class ClassificationCrossValidation < CrossValidation
include ClassificationStatistics
field :accept_values, type: Array
- field :confusion_matrix, type: Array
- field :weighted_confusion_matrix, type: Array
- field :accuracy, type: Float
- field :weighted_accuracy, type: Float
+ field :confusion_matrix, type: Hash
+ field :accuracy, type: Hash
field :true_rate, type: Hash
field :predictivity, type: Hash
+ field :nr_predictions, type: Hash
field :probability_plot_id, type: BSON::ObjectId
end
# Crossvalidation of regression models
class RegressionCrossValidation < CrossValidation
include RegressionStatistics
- field :rmse, type: Float, default:0
- field :mae, type: Float, default:0
- field :r_squared, type: Float
- field :within_prediction_interval, type: Integer, default:0
- field :out_of_prediction_interval, type: Integer, default:0
- field :correlation_plot_id, type: BSON::ObjectId
+ field :rmse, type: Hash
+ field :mae, type: Hash
+ field :r_squared, type: Hash
+ field :within_prediction_interval, type: Hash
+ field :out_of_prediction_interval, type: Hash
+ field :nr_predictions, type: Hash
field :warnings, type: Array
+ field :correlation_plot_id, type: BSON::ObjectId
end
# Independent repeated crossvalidations
@@ -103,7 +95,7 @@ module OpenTox
# @param [Fixnum] number of folds
# @param [Fixnum] number of repeats
# @return [OpenTox::Validation::RepeatedCrossValidation]
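+ # @example (illustrative; runs 5 independent 10-fold crossvalidations)
+ # repeated_cv = Validation::RepeatedCrossValidation.create model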
- def self.create model, folds=10, repeats=3
+ def self.create model, folds=10, repeats=5
repeated_cross_validation = self.new
repeats.times do |n|
$logger.debug "Crossvalidation #{n+1} for #{model.name}"
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 6e7d67f..fb1afd2 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -1,12 +1,16 @@
require 'csv'
require 'tempfile'
+require 'digest/md5'
module OpenTox
# Collection of substances and features
class Dataset
- field :data_entries, type: Hash, default: {}
+ field :data_entries, type: Array, default: [] #substance,feature,value
+ field :warnings, type: Array, default: []
+ field :source, type: String
+ field :md5, type: String
# Readers
@@ -25,29 +29,87 @@ module OpenTox
# Get all substances
# @return [Array<OpenTox::Substance>]
def substances
- @substances ||= data_entries.keys.collect{|id| OpenTox::Substance.find id}.uniq
+ @substances ||= data_entries.collect{|row| OpenTox::Substance.find row[0]}.uniq
@substances
end
# Get all features
# @return [Array<OpenTox::Feature>]
def features
- @features ||= data_entries.collect{|sid,data| data.keys.collect{|id| OpenTox::Feature.find(id)}}.flatten.uniq
+ @features ||= data_entries.collect{|row| OpenTox::Feature.find(row[1])}.uniq
@features
end
# Get all values for a given substance and feature
# @param [OpenTox::Substance,BSON::ObjectId,String] substance or substance id
# @param [OpenTox::Feature,BSON::ObjectId,String] feature or feature id
- # @return [TrueClass,FalseClass,Float]
+ # @return [Array<TrueClass,FalseClass,Float>] values
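+ # @example (illustrative; duplicated measurements yield multiple values)
+ # dataset.values(compound,feature) #=> ["mutagenic","non-mutagenic"]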
def values substance,feature
substance = substance.id if substance.is_a? Substance
feature = feature.id if feature.is_a? Feature
- if data_entries[substance.to_s] and data_entries[substance.to_s][feature.to_s]
- data_entries[substance.to_s][feature.to_s]
- else
- [nil]
- end
+ substance = BSON::ObjectId.from_string(substance) if substance.is_a? String
+ feature = BSON::ObjectId.from_string(feature) if feature.is_a? String
+ data_entries.select{|row| row[0] == substance and row[1] == feature}.collect{|row| row[2]}
+ end
+
+ # Get OriginalId features
+ # @return [Array<OpenTox::OriginalId>] original ID features (merged datasets may have multiple original IDs)
+ def original_id_features
+ features.select{|f| f.is_a?(OriginalId)}
+ end
+
+ # Get OriginalSmiles features
+ # @return [Array<OpenTox::OriginalSmiles>] original smiles features (merged datasets may have multiple original smiles)
+ def original_smiles_features
+ features.select{|f| f.is_a?(OriginalSmiles)}
+ end
+
+ # Get Warnings features
+ # @return [Array<OpenTox::Warnings>] warnings features (merged datasets may have multiple warnings)
+ def warnings_features
+ features.select{|f| f.is_a?(Warnings)}
+ end
+
+ # Get Confidence feature
+ # @return [OpenTox::Confidence] confidence feature
+ def confidence_feature
+ features.select{|f| f.is_a?(Confidence)}.first
+ end
+
+ # Get nominal and numeric bioactivity features
+ # @return [Array<OpenTox::NominalBioActivity,OpenTox::NumericBioActivity>]
+ def bioactivity_features
+ features.select{|f| f._type.match(/BioActivity/)}
+ end
+
+ # Get transformed nominal and numeric bioactivity features
+ # @return [Array<OpenTox::NominalBioActivity,OpenTox::NumericBioActivity>]
+ def transformed_bioactivity_features
+ features.select{|f| f._type.match(/Transformed.*BioActivity/)}
+ end
+
+ # Get nominal and numeric substance property features
+ # @return [Array<OpenTox::NominalSubstanceProperty,OpenTox::NumericSubstanceProperty>]
+ def substance_property_features
+ features.select{|f| f._type.match("SubstanceProperty")}
+ end
+
+ # Get the nominal or numeric prediction feature
+ # @return [OpenTox::NominalLazarPrediction,OpenTox::NumericLazarPrediction]
+ def prediction_feature
+ features.select{|f| f._type.match(/Prediction$/)}.first
+ end
+
+ # Get supporting nominal and numeric prediction features (class probabilities, prediction interval)
+ # @return [Array<OpenTox::LazarPredictionProbability,OpenTox::LazarPredictionInterval>]
+ def prediction_supporting_features
+ features.select{|f| f.is_a?(LazarPredictionProbability) or f.is_a?(LazarPredictionInterval)}
+ end
+
+ # Get nominal and numeric merged features
+ # @return [Array<OpenTox::MergedNominalBioActivity,OpenTox::MergedNumericBioActivity>]
+ def merged_features
+ features.select{|f| f._type.match("Merged")}
end
# Writers
@@ -59,137 +121,178 @@ module OpenTox
def add(substance,feature,value)
substance = substance.id if substance.is_a? Substance
feature = feature.id if feature.is_a? Feature
- data_entries[substance.to_s] ||= {}
- data_entries[substance.to_s][feature.to_s] ||= []
- data_entries[substance.to_s][feature.to_s] << value
- #data_entries[substance.to_s][feature.to_s].uniq! if value.numeric? # assuming that identical values come from the same source
+ data_entries << [substance,feature,value] if substance and feature and value
end
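+ # Each data entry is a [substance_id, feature_id, value] triple,
+ # e.g. (illustrative): dataset.add compound, feature, "mutagenic"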
- # Dataset operations
-
- # Split a dataset into n folds
- # @param [Integer] number of folds
- # @return [Array] Array with folds [training_dataset,test_dataset]
- def folds n
- len = self.substances.size
- indices = (0..len-1).to_a.shuffle
- mid = (len/n)
- chunks = []
- start = 0
- 1.upto(n) do |i|
- last = start+mid
- last = last-1 unless len%n >= i
- test_idxs = indices[start..last] || []
- test_substances = test_idxs.collect{|i| substances[i]}
- training_idxs = indices-test_idxs
- training_substances = training_idxs.collect{|i| substances[i]}
- chunk = [training_substances,test_substances].collect do |substances|
- dataset = self.class.create(:name => "#{self.name} (Fold #{i-1})",:source => self.id )
- substances.each do |substance|
- substance.dataset_ids << dataset.id
- substance.dataset_ids.uniq!
- substance.save
- dataset.data_entries[substance.id.to_s] = data_entries[substance.id.to_s] ||= {}
+ # Parsers
+
+ # Create a dataset from CSV file
+ # @param [File] Input file with the following format:
+ # - ID column (optional): header containing "ID" string, arbitrary ID values
+ # - SMILES/InChI column: header indicating "SMILES" or "InChI", Smiles or InChI strings
+ # - one or more properties column(s): header with property name(s), property values
+ # files with a single property column are read as BioActivities (i.e. dependent variable)
+ # files with multiple property columns are read as SubstanceProperties (i.e. independent variables)
+ # @return [OpenTox::Dataset]
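+ # @example Minimal single-property file (hypothetical values), parsed as a
+ # NominalBioActivity:
+ # ID,SMILES,Mutagenicity
+ # 1,CC(=O)Nc1ccc(O)cc1,non-mutagenic
+ # 2,O=[N+]([O-])c1ccccc1,mutagenic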
+ def self.from_csv_file file
+ md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files
+ dataset = self.find_by(:md5 => md5)
+ if dataset
+ $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import."
+ else
+ $logger.debug "Parsing #{file}."
+ table = nil
+ sep = ","
+ ["\t",";"].each do |s| # guess alternative CSV separator
+ if File.readlines(file).first.match(/#{s}/)
+ sep = s
+ break
end
- dataset.save
- dataset
end
- start = last+1
- chunks << chunk
+ table = CSV.read file, :col_sep => sep, :skip_blanks => true, :encoding => 'windows-1251:utf-8'
+ if table
+ dataset = self.new(:source => file, :name => File.basename(file,".*"), :md5 => md5)
+ dataset.parse_table table
+ else
+ raise ArgumentError, "#{file} is not a valid CSV/TSV file. Could not find "," ";" or TAB as column separator."
+ end
end
- chunks
+ dataset
end
- # Serialisation
-
- # Convert dataset to csv format including compound smiles as first column, other column headers are feature names
- # @return [String]
- def to_csv(inchi=false)
- CSV.generate() do |csv|
- compound = substances.first.is_a? Compound
- if compound
- csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name}
- else
- csv << ["Name"] + features.collect{|f| f.name}
- end
- substances.each do |substance|
- if compound
- name = (inchi ? substance.inchi : substance.smiles)
- else
- name = substance.name
- end
- nr_measurements = features.collect{|f| data_entries[substance.id.to_s][f.id.to_s].size if data_entries[substance.id.to_s][f.id.to_s]}.compact.uniq
+ # Create a dataset from SDF file
+ # files with a single data field are read as BioActivities (i.e. dependent variable)
+ # files with multiple data fields are read as SubstanceProperties (i.e. independent variables)
+ # @param [File]
+ # @return [OpenTox::Dataset]
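+ # @example Record layout expected by the parser (hypothetical record):
+ # <molfile lines>
+ # > <Mutagenicity>
+ # mutagenic
+ # $$$$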
+ def self.from_sdf_file file
+ md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files
+ dataset = self.find_by(:md5 => md5)
+ if dataset
+ $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import."
+ else
+ $logger.debug "Parsing #{file}."
+
+ dataset = self.new(:source => file, :name => File.basename(file,".*"), :md5 => md5)
+ original_id = OriginalId.find_or_create_by(:dataset_id => dataset.id,:name => dataset.name+".ID")
- if nr_measurements.size > 1
- warn "Unequal number of measurements (#{nr_measurements}) for '#{name}'. Skipping entries."
+ read_result = false
+ sdf = ""
+ feature_name = ""
+ compound = nil
+ features = {}
+ table = [["ID","SMILES"]]
+
+ File.readlines(file).each do |line|
+ if line.match %r{\$\$\$\$}
+ sdf << line
+ id = sdf.split("\n").first.chomp
+ compound = Compound.from_sdf sdf
+ row = [id,compound.smiles]
+ features.each do |f,v|
+ table[0] << f unless table[0].include? f
+ row[table[0].index(f)] = v
+ end
+ table << row
+ sdf = ""
+ features = {}
+ elsif line.match /^>\s+</
+ feature_name = line.match(/^>\s+<(.*)>/)[1]
+ read_result = true
else
- (0..nr_measurements.first-1).each do |i|
- row = [name]
- features.each do |f|
- values(substance,f) ? row << values(substance,f)[i] : row << ""
- end
- csv << row
+ if read_result
+ value = line.chomp
+ features[feature_name] = value
+ read_result = false
+ else
+ sdf << line
end
end
end
+ dataset.parse_table table
end
+ dataset
end
- # Parsers
-
- # Create a dataset from file (csv,sdf,...)
- # @param filename [String]
- # @return [String] dataset uri
- # TODO
- #def self.from_sdf_file
- #end
-
- # Create a dataset from CSV file
- # @param [File]
- # @param [TrueClass,FalseClass] accept or reject empty values
+ # Create a dataset from PubChem Assay
+ # @param [Integer] PubChem AssayID (AID)
# @return [OpenTox::Dataset]
- def self.from_csv_file file, accept_empty_values=false
- source = file
- name = File.basename(file,".*")
- dataset = self.find_by(:source => source, :name => name)
- if dataset
- $logger.debug "Skipping import of #{file}, it is already in the database (id: #{dataset.id})."
- else
- $logger.debug "Parsing #{file}."
- table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8'
- dataset = self.new(:source => source, :name => name)
- dataset.parse_table table, accept_empty_values
+ def self.from_pubchem_aid aid
+ # TODO get regression data
+ aid_url = File.join PUBCHEM_URI, "assay/aid/#{aid}"
+ assay_metadata = JSON.parse(RestClientWrapper.get(File.join aid_url,"description/JSON").to_s)["PC_AssayContainer"][0]["assay"]["descr"]
+ name = assay_metadata["name"].gsub(/\s+/,"_")
+ dataset = self.new(:source => aid_url, :name => name)
+ # Get assay data in chunks
+ # Assay record retrieval is limited to 10000 SIDs
+ # https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest-tutorial$_Toc458584435
+ list = JSON.parse(RestClientWrapper.get(File.join aid_url, "sids/JSON?list_return=listkey").to_s)["IdentifierList"]
+ listkey = list["ListKey"]
+ size = list["Size"]
+ start = 0
+ csv = []
+ while start < size
+ url = File.join aid_url, "CSV?sid=listkey&listkey=#{listkey}&listkey_start=#{start}&listkey_count=10000"
+ csv += CSV.parse(RestClientWrapper.get(url).to_s).select{|r| r[0].match /^\d/} # discard header rows
+ start += 10000
+ end
+ table = [["SID","SMILES",name]]
+ csv.each_slice(100) do |slice| # get SMILES in chunks
+ cids = slice.collect{|s| s[2]}
+ pubchem_cids = []
+ JSON.parse(RestClientWrapper.get(File.join(PUBCHEM_URI,"compound/cid/#{cids.join(",")}/property/CanonicalSMILES/JSON")).to_s)["PropertyTable"]["Properties"].each do |prop|
+ i = cids.index(prop["CID"].to_s)
+ value = slice[i][3]
+ if value == "Active" or value == "Inactive"
+ table << [slice[i][1].to_s,prop["CanonicalSMILES"],slice[i][3].to_s]
+ pubchem_cids << prop["CID"].to_s
+ else
+ dataset.warnings << "Ignoring CID #{prop["CID"]}/ SMILES #{prop["CanonicalSMILES"]}, because PubChem activity is #{value}."
+ end
+ end
+ (cids-pubchem_cids).each { |cid| dataset.warnings << "Could not retrieve SMILES for CID #{cid}, all entries are ignored." }
end
+ dataset.parse_table table
dataset
end
# Parse data in tabular format (e.g. from csv)
# does a lot of guesswork in order to determine feature types
# @param [Array<Array>]
- # @param [TrueClass,FalseClass] accept or reject empty values
- def parse_table table, accept_empty_values
+ def parse_table table
# features
feature_names = table.shift.collect{|f| f.strip}
- warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size
- compound_format = feature_names.shift.strip
- bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i
+ raise ArgumentError, "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size
+
+ if feature_names[0] !~ /SMILES|InChI/i # check ID column
+ original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => feature_names.shift)
+ else
+ original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => "LineID")
+ end
+
+ compound_format = feature_names.shift
+ raise ArgumentError, "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i
+ original_smiles = OriginalSmiles.find_or_create_by(:dataset_id => self.id) if compound_format.match(/SMILES/i)
+
numeric = []
features = []
+
# guess feature types
+ bioactivity = true if feature_names.size == 1
+
feature_names.each_with_index do |f,i|
- metadata = {:name => f}
- values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact
+ original_id.name.match(/LineID$/) ? j = i+1 : j = i+2
+ values = table.collect{|row| val=row[j].to_s.strip; val.blank? ? nil : val }.uniq.compact
types = values.collect{|v| v.numeric? ? true : false}.uniq
feature = nil
if values.size == 0 # empty feature
elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes
numeric[i] = true
- feature = NumericFeature.find_or_create_by(metadata)
+ bioactivity ? feature = NumericBioActivity.find_or_create_by(:name => f) : feature = NumericSubstanceProperty.find_or_create_by(:name => f)
else
- metadata["accept_values"] = values
numeric[i] = false
- feature = NominalFeature.find_or_create_by(metadata)
+ bioactivity ? feature = NominalBioActivity.find_or_create_by(:name => f, :accept_values => values.sort) : feature = NominalSubstanceProperty.find_or_create_by(:name => f, :accept_values => values.sort)
end
features << feature if feature
end
@@ -198,35 +301,31 @@ module OpenTox
all_substances = []
table.each_with_index do |vals,i|
+ original_id.name.match(/LineID$/) ? original_id_value = i+1 : original_id_value = vals.shift.to_s.strip
identifier = vals.shift.strip
- warn "No feature values for compound at line #{i+2} of #{source}." if vals.compact.empty? and !accept_empty_values
begin
case compound_format
when /SMILES/i
- substance = OpenTox::Compound.from_smiles(identifier)
+ substance = Compound.from_smiles(identifier)
+ add substance, original_smiles, identifier
when /InChI/i
- substance = OpenTox::Compound.from_inchi(identifier)
+ substance = Compound.from_inchi(identifier)
end
rescue
substance = nil
end
+
if substance.nil? # compound parsers may return nil
- warn "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}, all entries are ignored."
+ warnings << "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}, all entries are ignored."
next
end
+
all_substances << substance
- substance.dataset_ids << self.id
- substance.dataset_ids.uniq!
- substance.save
-
- unless vals.size == features.size
- warn "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored."
- next
- end
+ add substance, original_id, original_id_value
vals.each_with_index do |v,j|
if v.blank?
- warn "Empty value for compound '#{identifier}' and feature '#{feature_names[i]}'."
+ warnings << "Empty value for compound '#{identifier}' (#{original_id_value}) and feature '#{feature_names[j]}'."
next
elsif numeric[j]
v = v.to_f
@@ -235,46 +334,202 @@ module OpenTox
end
add substance, features[j], v
end
- data_entries[substance.id.to_s] = {} if vals.empty? and accept_empty_values
end
+
+ warnings_feature = Warnings.find_or_create_by(:dataset_id => id)
all_substances.duplicates.each do |substance|
positions = []
- all_substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == substance.inchi}
- warn "Duplicate compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
+ all_substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.smiles and c.smiles == substance.smiles}
+ all_substances.select{|s| s.smiles == substance.smiles}.each do |s|
+ add s, warnings_feature, "Duplicated compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
+ end
end
save
end
- # Delete dataset
- def delete
- compounds.each{|c| c.dataset_ids.delete id.to_s}
- super
+ # Serialisation
+
+ # Convert lazar prediction dataset to csv format
+ # @return [String]
+ def to_prediction_csv
+
+ compound = substances.first.is_a? Compound
+ header = ["ID"]
+ header << "Original SMILES" if compound
+ compound ? header << "Canonical SMILES" : header << "Name"
+ header << "Prediction" if prediction_feature
+ header << "Confidence" if confidence_feature
+ header += prediction_supporting_features.collect{|f| f.name}
+ header << "Measurements"
+ csv = [header]
+
+ substances.each do |substance|
+ row = original_id_features.collect{|f| values(substance,f).join(" ")}
+ row += original_smiles_features.collect{|f| values(substance,f).join(" ")} if compound
+ compound ? row << substance.smiles : row << substance.name
+ row << values(substance,prediction_feature).join(" ")
+ row << values(substance,confidence_feature).join(" ")
+ row += prediction_supporting_features.collect{|f| values(substance,f).join(" ")}
+ row << values(substance,bioactivity_features[0]).join(" ")
+ csv << row
+ end
+ csv.collect{|r| r.join(",")}.join("\n")
end
+
+ # Convert dataset into csv formatted training data
+ # @return [String]
+ def to_training_csv
+
+ export_features = merged_features
+ export_features = transformed_bioactivity_features if export_features.empty?
+ export_features = bioactivity_features if export_features.empty?
+ export_feature = export_features.first
- end
+ header = ["Canonical SMILES"]
+ header << bioactivity_features.first.name # use original bioactivity name instead of long merged name
+ csv = [header]
- # Dataset for lazar predictions
- class LazarPrediction #< Dataset
- field :creator, type: String
- field :prediction_feature_id, type: BSON::ObjectId
- field :predictions, type: Hash, default: {}
+ substances.each do |substance|
+ nr_activities = values(substance,bioactivity_features.first).size
+ (0..nr_activities-1).each do |n| # new row for each value
+ row = [substance.smiles]
+ row << values(substance,export_feature)[n]
+ csv << row
+ end
+ end
+ csv.collect{|r| r.join(",")}.join("\n")
+ end
- # Get prediction feature
- # @return [OpenTox::Feature]
- def prediction_feature
- Feature.find prediction_feature_id
+ # Convert dataset to SDF format
+ # @return [String] SDF string
+ def to_sdf
+ sdf = ""
+ compounds.each do |compound|
+ sdf_lines = compound.sdf.sub(/\$\$\$\$\n/,"").split("\n")
+ sdf_lines[0] = compound.smiles
+ sdf += sdf_lines.join("\n")
+ bioactivity_features.each do |f|
+ v = values(compound,f)
+ unless v.empty?
+ sdf += "\n> <#{f.name}>\n"
+ sdf += v.uniq.join ","
+ sdf += "\n"
+ end
+ end
+ sdf += "\n$$$$\n"
+ end
+ sdf
end
- # Get all compounds
- # @return [Array<OpenTox::Compound>]
- def compounds
- substances.select{|s| s.is_a? Compound}
+ # Get lazar predictions from a dataset
+ # @return [Hash] predictions
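+ # @example Return value layout (illustrative):
+ # {compound => {:value => "mutagenic", :confidence => 0.3,
+ # :probabilities => {"mutagenic" => 0.8, "non-mutagenic" => 0.2}}}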
+ def predictions
+ predictions = {}
+ substances.each do |s|
+ predictions[s] ||= {}
+ predictions[s][:value] = values(s,prediction_feature).first
+ #predictions[s][:warnings] = []
+ #warnings_features.each { |w| predictions[s][:warnings] += values(s,w) }
+ predictions[s][:confidence] = values(s,confidence_feature).first
+ if predictions[s][:value] and prediction_feature.is_a? NominalLazarPrediction
+ prediction_feature.accept_values.each do |v|
+ f = LazarPredictionProbability.find_by(:name => v, :model_id => prediction_feature.model_id, :training_feature_id => prediction_feature.training_feature_id)
+ predictions[s][:probabilities] ||= {}
+ predictions[s][:probabilities][v] = values(s,f).first
+ end
+ end
+ end
+ predictions
end
- # Get all substances
- # @return [Array<OpenTox::Substance>]
- def substances
- predictions.keys.collect{|id| Substance.find id}
+ # Dataset operations
+
+ # Copy a dataset
+ # @return OpenTox::Dataset dataset copy
+ def copy
+ dataset = Dataset.new
+ dataset.data_entries = data_entries
+ dataset.warnings = warnings
+ dataset.name = name
+ dataset.source = id.to_s
+ dataset.save
+ dataset
+ end
+
+ # Split a dataset into n folds
+ # @param [Integer] number of folds
+ # @return [Array] Array with folds [training_dataset,test_dataset]
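+ # @example (illustrative)
+ # training_dataset, test_dataset = dataset.folds(10).first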
+ def folds n
+ $logger.debug "Creating #{n} folds for #{name}."
+ len = self.substances.size
+ indices = (0..len-1).to_a.shuffle
+ mid = (len/n)
+ chunks = []
+ start = 0
+ 1.upto(n) do |i|
+ last = start+mid
+ last = last-1 unless len%n >= i
+ test_idxs = indices[start..last] || []
+ test_substances = test_idxs.collect{|i| substances[i].id}
+ training_idxs = indices-test_idxs
+ training_substances = training_idxs.collect{|i| substances[i].id}
+ chunk = [training_substances,test_substances].collect do |substances|
+ self.class.create(
+ :name => "#{self.name} (Fold #{i-1})",
+ :source => self.id,
+ :data_entries => data_entries.select{|row| substances.include? row[0]}
+ )
+ end
+ start = last+1
+ chunks << chunk
+ end
+ chunks
+ end
+
+ # Merge an array of datasets
+ # @param [Array<OpenTox::Dataset>] datasets Datasets to be merged
+ # @param [Array<OpenTox::Feature>] features Features to be merged (same size as datasets)
+ # @param [Array<Hash>] value_maps Value transformations (use nil to keep original values, same size as datasets)
+ # @param [Bool] keep_original_features Copy original features/values to the merged dataset
+ # @param [Bool] remove_duplicates Delete duplicated values (assuming they come from the same experiment)
+ # @return [OpenTox::Dataset] merged dataset
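+ # @example (illustrative; mirrors the mutagenicity merge in download.rb)
+ # map = {"mutagen" => "mutagenic", "nonmutagen" => "non-mutagenic"}
+ # Dataset.merge datasets: [hansen,efsa,kazius],
+ # features: [hansen,efsa,kazius].collect{|d| d.bioactivity_features.first},
+ # value_maps: [nil,nil,map], keep_original_features: false, remove_duplicates: true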
+ def self.merge datasets: , features: , value_maps: , keep_original_features: , remove_duplicates:
+ dataset = self.create(:source => datasets.collect{|d| d.id.to_s}.join(", "), :name => datasets.collect{|d| d.name}.uniq.join(", ")+" merged")
+
+ datasets.each do |d|
+ dataset.data_entries += d.data_entries
+ dataset.warnings += d.warnings
+ end if keep_original_features
+
+ feature_classes = features.collect{|f| f.class}.uniq
+ merged_feature = nil
+ if feature_classes.size == 1
+ if features.first.kind_of? NominalFeature
+ merged_feature = MergedNominalBioActivity.find_or_create_by(:name => features.collect{|f| f.name}.uniq.join(" and ") + " merged", :original_feature_ids => features.collect{|f| f.id}, :transformations => value_maps)
+ else
+ merged_feature = MergedNumericBioActivity.find_or_create_by(:name => features.collect{|f| f.name}.uniq.join(" and ") + " merged", :original_feature_ids => features.collect{|f| f.id}) # TODO: regression transformations
+ end
+ else
+ raise ArgumentError, "Cannot merge features of different types (#{feature_classes})."
+ end
+
+ accept_values = []
+ features.each_with_index do |f,i|
+ dataset.data_entries += datasets[i].data_entries.select{|de| de[1] == f.id}.collect do |de|
+ value_maps[i] ? v = value_maps[i][de[2]] : v = de[2]
+ accept_values << v
+ [de[0],merged_feature.id,v]
+ end
+ end
+
+ if merged_feature.is_a? MergedNominalBioActivity
+ merged_feature.accept_values = accept_values.uniq.sort
+ merged_feature.save
+ end
+
+ dataset.data_entries.uniq! if remove_duplicates
+ dataset.save
+ dataset
end
end
diff --git a/lib/download.rb b/lib/download.rb
new file mode 100644
index 0000000..2546dc4
--- /dev/null
+++ b/lib/download.rb
@@ -0,0 +1,358 @@
+module OpenTox
+
+ class Download
+
+ DATA = File.join(File.dirname(__FILE__),"..","data")
+
+ # Download classification dataset from PubChem into the data folder
+ # @param [Integer] aid PubChem Assay ID
+ # @param [String] active Name for the "Active" class
+ # @param [String] inactive Name for the "Inactive" class
+ # @param [String] species Species name
+ # @param [String] endpoint Endpoint name
+ # @param [Hash] qmrf Name and group for QMRF reports (optional)
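+ # @example (mirrors the rodent carcinogenicity import in public_data below)
+ # Download.pubchem_classification aid: 1205, species: "Rodents",
+ # endpoint: "Carcinogenicity", active: "carcinogenic", inactive: "non-carcinogenic",
+ # qmrf: {:group => "QMRF 4.12. Carcinogenicity", :name => "OECD 451 Carcinogenicity Studies"}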
+ def self.pubchem_classification aid: , active: , inactive: , species: , endpoint:, qmrf: nil
+ aid_url = File.join PUBCHEM_URI, "assay/aid/#{aid}"
+
+ # Get assay data in chunks
+ # Assay record retrieval is limited to 10000 SIDs
+ # https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest-tutorial$_Toc458584435
+ list = JSON.parse(RestClientWrapper.get(File.join aid_url, "sids/JSON?list_return=listkey").to_s)["IdentifierList"]
+ listkey = list["ListKey"]
+ size = list["Size"]
+ start = 0
+ csv = []
+ while start < size
+ url = File.join aid_url, "CSV?sid=listkey&listkey=#{listkey}&listkey_start=#{start}&listkey_count=10000"
+ csv += CSV.parse(RestClientWrapper.get(url).to_s).select{|r| r[0].match /^\d/} # discard header rows
+ start += 10000
+ end
+ warnings = []
+ name = endpoint.gsub(" ","_")+"-"+species.gsub(" ","_")
+ $logger.debug name
+ table = [["SID","SMILES",name]]
+ csv.each_slice(100) do |slice| # get SMILES in chunks, size limit is 100
+ cids = slice.collect{|s| s[2]}
+ pubchem_cids = []
+ JSON.parse(RestClientWrapper.get(File.join(PUBCHEM_URI,"compound/cid/#{cids.join(",")}/property/CanonicalSMILES/JSON")).to_s)["PropertyTable"]["Properties"].each do |prop|
+ i = cids.index(prop["CID"].to_s)
+ value = slice[i][3]
+ if value == "Active"
+ table << [slice[i][1].to_s,prop["CanonicalSMILES"],active]
+ pubchem_cids << prop["CID"].to_s
+ elsif value == "Inactive"
+ table << [slice[i][1].to_s,prop["CanonicalSMILES"],inactive]
+ pubchem_cids << prop["CID"].to_s
+ else
+ warnings << "Ignoring CID #{prop["CID"]}/ SMILES #{prop["CanonicalSMILES"]}, because PubChem activity is '#{value}'."
+ end
+ end
+ (cids-pubchem_cids).each { |cid| warnings << "Could not retrieve SMILES for CID '#{cid}', all entries are ignored." }
+ end
+ File.open(File.join(DATA,name+".csv"),"w+"){|f| f.puts table.collect{|row| row.join(",")}.join("\n")}
+ meta = {
+ :species => species,
+ :endpoint => endpoint,
+ :source => "https://pubchem.ncbi.nlm.nih.gov/bioassay/#{aid}",
+ :qmrf => qmrf,
+ :warnings => warnings
+ }
+ File.open(File.join(DATA,name+".json"),"w+"){|f| f.puts meta.to_json}
+ File.join(DATA,name+".csv")
+ end
+
+ # Download regression dataset from PubChem into the data folder
+ # Uses -log10 transformed experimental data in mmol units
+ # @param [String] aid PubChem Assay ID
+ # @param [String] species Species name
+ # @param [String] endpoint Endpoint name
+ # @param [Hash] qmrf Name and group for QMRF reports (optional)
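+ # @example (mirrors the fathead minnow import in public_data below)
+ # Download.pubchem_regression aid: 1188, species: "Fathead minnow",
+ # endpoint: "Acute toxicity",
+ # qmrf: {"group": "QMRF 3.3. Acute toxicity to fish (lethality)", "name": "EC C. 1. Acute Toxicity for Fish"}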
+ def self.pubchem_regression aid: , species: , endpoint:, qmrf: nil
+ aid_url = File.join PUBCHEM_URI, "assay/aid/#{aid}"
+
+ # Get assay data in chunks
+ # Assay record retrieval is limited to 10000 SIDs
+ # https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest-tutorial$_Toc458584435
+ list = JSON.parse(RestClientWrapper.get(File.join aid_url, "sids/JSON?list_return=listkey").to_s)["IdentifierList"]
+ listkey = list["ListKey"]
+ size = list["Size"]
+ start = 0
+ csv = []
+ unit = nil
+ while start < size
+ url = File.join aid_url, "CSV?sid=listkey&listkey=#{listkey}&listkey_start=#{start}&listkey_count=10000"
+ response = RestClientWrapper.get(url).to_s # fetch once, reuse for unit and data rows
+ unit ||= CSV.parse(response).select{|r| r[0] == "RESULT_UNIT"}[0][8] # get unit
+ csv += CSV.parse(response).select{|r| r[0].match /^\d/} # discard header rows
+ start += 10000
+ end
+ warnings = []
+ name = endpoint.gsub(" ","_")+"-"+species.gsub(" ","_")
+ $logger.debug name
+ table = [["SID","SMILES","-log10(#{name} [#{unit}])"]]
+ csv.each_slice(100) do |slice| # get SMILES in chunks, size limit is 100
+ cids = slice.collect{|s| s[2]}
+ pubchem_cids = []
+ JSON.parse(RestClientWrapper.get(File.join(PUBCHEM_URI,"compound/cid/#{cids.join(",")}/property/CanonicalSMILES/JSON")).to_s)["PropertyTable"]["Properties"].each do |prop|
+ i = cids.index(prop["CID"].to_s)
+ value = slice[i][8]
+ if value
+ value = -Math.log10(value.to_f)
+ table << [slice[i][1].to_s,prop["CanonicalSMILES"],value]
+ pubchem_cids << prop["CID"].to_s
+ else
+ warnings << "Ignoring CID #{prop["CID"]}/ SMILES #{prop["CanonicalSMILES"]}, because PubChem activity is '#{value}'."
+ end
+ end
+ (cids-pubchem_cids).each { |cid| warnings << "Could not retrieve SMILES for CID '#{cid}', all entries are ignored." }
+ end
+ File.open(File.join(DATA,name+".csv"),"w+"){|f| f.puts table.collect{|row| row.join(",")}.join("\n")}
+ meta = {
+ :species => species,
+ :endpoint => endpoint,
+ :source => "https://pubchem.ncbi.nlm.nih.gov/bioassay/#{aid}",
+ :unit => unit,
+ :qmrf => qmrf,
+ :warnings => warnings
+ }
+ File.open(File.join(DATA,name+".json"),"w+"){|f| f.puts meta.to_json}
+ File.join(DATA,name+".csv")
+ end
+
+ # Combine mutagenicity data from Kazius, Hansen and EFSA and download into the data folder
+ def self.mutagenicity
+ $logger.debug "Mutagenicity"
+ hansen_url = "http://doc.ml.tu-berlin.de/toxbenchmark/Mutagenicity_N6512.csv"
+ kazius_url = "http://cheminformatics.org/datasets/bursi/cas_4337.zip"
+ efsa_url = "https://data.europa.eu/euodp/data/storage/f/2017-07-19T142131/GENOTOX data and dictionary.xls"
+
+ parts = File.join(DATA, "parts")
+ FileUtils.mkdir_p parts
+ Dir[File.join(parts,"hansen.*")].each{|f| FileUtils.rm f }
+ Dir[File.join(parts,"cas_4337.*")].each{|f| FileUtils.rm f }
+ Dir[File.join(parts,"efsa.*")].each{|f| FileUtils.rm f }
+ File.open(File.join(parts,"hansen-original.csv"),"w+"){|f| f.puts RestClientWrapper.get(hansen_url).to_s }
+
+ # convert hansen
+ hansen = CSV.read File.join(parts,"hansen-original.csv")
+ hansen.shift
+
+ map = {"0" => "non-mutagenic","1" => "mutagenic"}
+ File.open(File.join(parts,"hansen.csv"),"w+") do |f|
+ f.puts "ID,SMILES,Mutagenicity"
+ hansen.each do |row|
+ f.puts [row[0],row[5],map[row[2]]].join ","
+ end
+ end
+ File.open(File.join(parts,"cas_4337.zip"),"w+"){|f| f.puts RestClientWrapper.get(kazius_url).to_s }
+ `cd #{parts} && unzip cas_4337.zip`
+ `cd #{parts} && wget #{URI.escape efsa_url} -O efsa.xls`
+ `cd #{parts} && xls2csv -s cp1252 -d utf-8 -x -c " " efsa.xls > efsa.tsv`
+
+ # convert EFSA data to mutagenicity classifications
+ i = 0
+ db = {}
+ CSV.foreach(File.join(parts,"efsa.tsv"), :encoding => "UTF-8", :col_sep => "\t", :liberal_parsing => true) do |row|
+ if i > 0 and row[11] and !row[11].empty? and row[24].match(/Salmonella/i) and ( row[25].match("TA 98") or row[25].match("TA 100") ) and row[33]
+ begin
+ c = OpenTox::Compound.from_smiles(row[11].gsub('"','')).smiles
+ rescue
+ c = OpenTox::Compound.from_inchi(row[12]).smiles # some smiles (row[11]) contain non-parseable characters
+ end
+ db[c] ||= {}
+ db[c][:id] ||= row[2]
+ if row[33].match(/Positiv/i)
+ db[c][:value] = "mutagenic" # at least one positive result in TA 98 or TA 100
+ elsif row[33].match(/Negativ/i)
+ db[c][:value] ||= "non-mutagenic"
+ end
+ end
+ i += 1
+ end
+ File.open(File.join(parts,"efsa.csv"),"w+") do |f|
+ f.puts "ID,SMILES,Mutagenicity"
+ db.each do |s,v|
+ f.puts [v[:id],s,v[:value]].join ","
+ end
+ end
+
+ # merge datasets
+ hansen = Dataset.from_csv_file File.join(parts,"hansen.csv")
+ efsa = Dataset.from_csv_file File.join(parts,"efsa.csv")
+ kazius = Dataset.from_sdf_file File.join(parts,"cas_4337.sdf")
+ datasets = [hansen,efsa,kazius]
+ map = {"mutagen" => "mutagenic", "nonmutagen" => "non-mutagenic"}
+ dataset = Dataset.merge datasets: datasets, features: datasets.collect{|d| d.bioactivity_features.first}, value_maps: [nil,nil,map], keep_original_features: false, remove_duplicates: true
+ dataset.merged_features.first.name = "Mutagenicity"
+ File.open(File.join(DATA,"Mutagenicity-Salmonella_typhimurium.csv"),"w+"){|f| f.puts dataset.to_training_csv}
+ meta = {
+ :species => "Salmonella typhimurium",
+ :endpoint => "Mutagenicity",
+ :source => [kazius_url,hansen_url,efsa_url].join(", "),
+ :qmrf => { "group": "QMRF 4.10. Mutagenicity", "name": "OECD 471 Bacterial Reverse Mutation Test"},
+ }
+ File.open(File.join(DATA,"Mutagenicity-Salmonella_typhimurium.json"),"w+"){|f| f.puts meta.to_json}
+
+ # cleanup
+ datasets << dataset
+ datasets.each{|d| d.delete }
+ File.join(DATA,"Mutagenicity-Salmonella_typhimurium.csv")
+ end
+
+ # Download Blood Brain Barrier Penetration dataset into the data folder
+ def self.blood_brain_barrier
+ url = "http://cheminformatics.org/datasets/li/bbp2.smi"
+ name = "Blood_Brain_Barrier_Penetration-Human"
+ $logger.debug name
+ map = {"n" => "non-penetrating", "p" => "penetrating"}
+ table = CSV.parse RestClientWrapper.get(url).to_s, :col_sep => "\t"
+ File.open(File.join(DATA,name+".csv"),"w+") do |f|
+ f.puts "ID,SMILES,#{name}"
+ table.each do |row|
+ f.puts [row[1],row[0],map[row[3]]].join(",")
+ end
+ end
+ meta = {
+ :species => "Human",
+ :endpoint => "Blood Brain Barrier Penetration",
+ :source => url,
+ :qmrf => {"name": "QMRF 5.4. Toxicokinetics.Blood-brain barrier penetration", "group": "QMRF 5. Toxicokinetics"},
+ }
+ File.open(File.join(DATA,name+".json"),"w+"){|f| f.puts meta.to_json}
+ end
+
+ # Download the combined LOAEL dataset from Helma et al 2018 into the data folder
+ def self.loael
+ # TODO: fix url??
+ url = "https://raw.githubusercontent.com/opentox/loael-paper/revision/data/training_log10.csv"
+ name = "Lowest_observed_adverse_effect_level-Rats"
+ $logger.debug name
+ File.open(File.join(DATA,name+".csv"),"w+") do |f|
+ CSV.parse(RestClientWrapper.get(url).to_s) do |row|
+ f.puts [row[0],row[1]].join ","
+ end
+ end
+ meta = {
+ :species => "Rat",
+ :endpoint => "Lowest observed adverse effect level",
+ :source => url,
+ :unit => "mmol/kg_bw/day",
+ :qmrf => {
+ "name": "QMRF 4.14. Repeated dose toxicity",
+ "group": "QMRF 4.Human Health Effects"
+ }
+ }
+ File.open(File.join(DATA,name+".json"),"w+"){|f| f.puts meta.to_json}
+ end
+
+ # Download Daphnia dataset from http://www.michem.unimib.it/download/data/acute-aquatic-toxicity-to-daphnia-magna/ into the data folder
+ # The original file requires an email request; this is a temporary workaround
+ def self.daphnia
+ #url = "https://raw.githubusercontent.com/opentox/lazar-public-data/master/regression/daphnia_magna_mmol_log10.csv"
+ src = File.join(DATA,"parts","toxicity_data.xlsx")
+ name = "Acute_toxicity-Daphnia_magna"
+ $logger.debug name
+ File.open(File.join(DATA,name+".csv"),"w+") do |f|
+ i = 0
+ CSV.parse(`xlsx2csv #{src}`) do |row|
+ i == 0 ? v = "-log[LC50_mmol/L]" : v = -Math.log10(10**-row[3].to_f*1000)
+ f.puts [row[0],row[1],v].join(",")
+ i += 1
+ end
+ end
+ meta = { "species": "Daphnia magna",
+ "endpoint": "Acute toxicity",
+ "source": "http://www.michem.unimib.it/download/data/acute-aquatic-toxicity-to-daphnia-magna/",
+ "unit": "mmol/L",
+ "qmrf": {
+ "group": "QMRF 3.1. Short-term toxicity to Daphnia (immobilisation)",
+ "name": "EC C. 2. Daphnia sp Acute Immobilisation Test"
+ }
+ }
+ File.open(File.join(DATA,name+".json"),"w+"){|f| f.puts meta.to_json}
+ end
+
+ # Download all public lazar datasets into the data folder
+ def self.public_data
+
+ # Classification
+ [
+ {
+ :aid => 1205,
+ :species => "Rodents",
+ :endpoint => "Carcinogenicity",
+ :qmrf => {:group => "QMRF 4.12. Carcinogenicity", :name => "OECD 451 Carcinogenicity Studies"}
+ },{
+ :aid => 1208,
+ :species => "Rat",
+ :endpoint => "Carcinogenicity",
+ :qmrf => {:group => "QMRF 4.12. Carcinogenicity", :name => "OECD 451 Carcinogenicity Studies"}
+ },{
+ :aid => 1199,
+ :species => "Mouse",
+ :endpoint => "Carcinogenicity",
+ :qmrf => {:group => "QMRF 4.12. Carcinogenicity", :name => "OECD 451 Carcinogenicity Studies"}
+ }
+ ].each do |assay|
+ Download.pubchem_classification aid: assay[:aid], species: assay[:species], endpoint: assay[:endpoint], active: "carcinogenic", inactive: "non-carcinogenic", qmrf: assay[:qmrf]
+ end
+ Download.mutagenicity
+ Download.blood_brain_barrier
+
+ # Regression
+ [
+ {
+ :aid => 1195,
+ :species => "Human",
+ :endpoint => "Maximum Recommended Daily Dose",
+ :qmrf => {
+ "group": "QMRF 4.14. Repeated dose toxicity",
+ "name": "OECD 452 Chronic Toxicity Studies"
+ },
+ },{
+ :aid => 1208,
+ :species => "Rat (TD50)",
+ :endpoint => "Carcinogenicity",
+ :qmrf => {
+ :group => "QMRF 4.12. Carcinogenicity",
+ :name => "OECD 451 Carcinogenicity Studies"
+ }
+ },{
+ :aid => 1199,
+ :species => "Mouse (TD50)",
+ :endpoint => "Carcinogenicity",
+ :qmrf => {
+ :group => "QMRF 4.12. Carcinogenicity",
+ :name => "OECD 451 Carcinogenicity Studies"
+ }
+ },{
+ :aid => 1188,
+ :species => "Fathead minnow",
+ :endpoint => "Acute toxicity",
+ :qmrf => {
+ "group": "QMRF 3.3. Acute toxicity to fish (lethality)",
+ "name": "EC C. 1. Acute Toxicity for Fish"
+ }
+ }
+ ].each do |assay|
+ Download.pubchem_regression aid: assay[:aid], species: assay[:species], endpoint: assay[:endpoint], qmrf: assay[:qmrf]
+ end
+
+ Download.loael
+ Download.daphnia
+
+=begin
+ # 1204 estrogen receptor
+ # 1259408, # GENE-TOX
+ # 1159563 HepG2 cytotoxicity assay
+ # 588209 hepatotoxicity
+ # 1259333 cytotoxicity
+ # 1159569 HepG2 cytotoxicity counterscreen Measured in Cell-Based System Using Plate Reader - 2153-03_Inhibitor_Dose_DryPowder_Activity
+ # 2122 HTS Counterscreen for Detection of Compound Cytotoxicity in MIN6 Cells
+ # 116724 Acute toxicity determined after intravenal administration in mice
+ # 1148549 Toxicity in po dosed mouse assessed as mortality after 7 days
+=end
+ end
+
+ end
+end
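Taken together, the download methods write one <name>.csv/<name>.json pair per endpoint into the DATA folder. A minimal sketch of the entry point, assuming Download lives in the OpenTox namespace like the other lib classes:

require_relative "lib/lazar"
OpenTox::Download.public_data   # fetches all classification and regression datasets into DATA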
diff --git a/lib/enm-import.rb b/lib/enm-import.rb
new file mode 100644
index 0000000..cf1a26f
--- /dev/null
+++ b/lib/enm-import.rb
@@ -0,0 +1,125 @@
+module OpenTox
+
+ # Import data from external databases
+ module Import
+
+ class Enanomapper
+ include OpenTox
+
+ # Import from eNanoMapper
+ def self.import
+ # time critical step: JSON parsing (>99%), Oj brings only minor speed gains (~1%)
+ datasets = {}
+ bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle', {}, {accept: :json}))["dataset"]
+ bundles.each do |bundle|
+ datasets[bundle["URI"]] = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"].strip)
+ $logger.debug bundle["title"].strip
+ nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"], {}, {accept: :json}))["dataEntry"]
+ nanoparticles.each_with_index do |np,n|
+ core_id = nil
+ coating_ids = []
+ np["composition"].each do |c|
+ uri = c["component"]["compound"]["URI"]
+ data = JSON.parse(RestClientWrapper.get("https://data.enanomapper.net/query/compound/url/all?search=#{uri}", {}, {accept: :json}))
+ source = data["dataEntry"][0]["compound"]["URI"]
+ smiles = data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23SMILESDefault"]
+ names = []
+ names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"]
+ names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23IUPACNameDefault"]
+ if smiles
+ compound = Compound.find_or_create_by(:smiles => smiles)
+ compound.name = names.first
+ compound.names = names.compact
+ else
+ compound = Compound.find_or_create_by(:name => names.first,:names => names.compact)
+ end
+ compound.source = source
+ compound.save
+ if c["relation"] == "HAS_CORE"
+ core_id = compound.id.to_s
+ elsif c["relation"] == "HAS_COATING"
+ coating_ids << compound.id.to_s
+ end
+ end if np["composition"]
+ nanoparticle = Nanoparticle.find_or_create_by(
+ :name => np["values"]["https://data.enanomapper.net/identifier/name"],
+ :source => np["compound"]["URI"],
+ :core_id => core_id,
+ :coating_ids => coating_ids
+ )
+ #np["bundles"].keys.each do |bundle_uri|
+ #nanoparticle.dataset_ids << datasets[bundle_uri].id
+ #end
+
+ studies = JSON.parse(RestClientWrapper.get(File.join(np["compound"]["URI"],"study"), {}, {accept: :json}))["study"]
+ studies.each do |study|
+ dataset = datasets[np["bundles"].keys.first]
+ proteomics_features = {}
+ category = study["protocol"]["topcategory"]
+ source = study["protocol"]["category"]["term"]
+ study["effects"].each do |effect|
+
+ effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature
+ effect["conditions"].delete_if { |k, v| v.nil? }
+
+ if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data
+
+ JSON.parse(effect["result"]["textValue"]).each do |identifier, value| # time critical step
+ proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics", :unit => "Spectral counts", :source => source,:measured => true)
+ nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset
+ end
+ else
+ name = effect["endpoint"]
+ unit = effect["result"]["unit"]
+ warnings = []
+ case name
+ when "Log2 transformed" # use a sensible name
+ name = "log2(Net cell association)"
+ warnings = ["Original name was 'Log2 transformed'"]
+ unit = "log2(mL/ug(Mg))"
+ when "Total protein (BCA assay)"
+ category = "P-CHEM"
+ warnings = ["Category changed from TOX to P-CHEM"]
+ end
+ feature = klass.find_or_create_by(
+ :name => name,
+ :unit => unit,
+ :category => category,
+ :conditions => effect["conditions"],
+ :source => study["protocol"]["category"]["term"],
+ :measured => true,
+ :warnings => warnings
+ )
+ nanoparticle.parse_ambit_value feature, effect["result"], dataset
+ end
+ end
+ end
+ nanoparticle.save
+ print "#{n}, "
+ end
+ puts
+ end
+ datasets.each { |u,d| d.save }
+ end
+
+=begin
+ def self.import_ld # defunct, AMBIT JSON_LD does not have substance entries
+ #get list of bundle URIs
+ bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"]
+ datasets = []
+ bundles.each do |bundle|
+ uri = bundle["URI"]
+ study = JSON.parse(`curl -H 'Accept:application/ld+json' '#{uri}/substance'`)
+ study["@graph"].each do |i|
+ puts i.to_yaml if i.keys.include? "sio:has-value"
+ end
+ end
+ datasets.collect{|d| d.id}
+ end
+=end
+
+ end
+
+ end
+
+end
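The importer walks bundle -> nanoparticle -> composition/study and persists compounds, nanoparticles, features and datasets in MongoDB. A minimal usage sketch, assuming a running local MongoDB and network access to data.enanomapper.net:

OpenTox::Import::Enanomapper.import         # prints one index per imported nanoparticle as progress
OpenTox::Dataset.all.collect{|d| d.name}    # titles of the imported bundles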
diff --git a/lib/error.rb b/lib/error.rb
deleted file mode 100644
index 39b3c76..0000000
--- a/lib/error.rb
+++ /dev/null
@@ -1,66 +0,0 @@
-module OpenToxError
- attr_accessor :http_code, :message, :cause
- def initialize message=nil
- message = message.to_s.gsub(/\A"|"\Z/, '') if message # remove quotes
- super message
- @http_code ||= 500
- @message = message.to_s
- @cause = cut_backtrace(caller)
- $logger.error("\n"+JSON.pretty_generate({
- :http_code => @http_code,
- :message => @message,
- :cause => @cause
- }))
- end
-
- def cut_backtrace(trace)
- if trace.is_a?(Array)
- cut_index = trace.find_index{|line| line.match(/sinatra|minitest/)}
- cut_index ||= trace.size
- cut_index -= 1
- cut_index = trace.size-1 if cut_index < 0
- trace[0..cut_index]
- else
- trace
- end
- end
-
-end
-
-class RuntimeError
- include OpenToxError
-end
-
-# clutters log file with library errors
-#class NoMethodError
- #include OpenToxError
-#end
-
-module OpenTox
-
- class Error < RuntimeError
- include OpenToxError
-
- def initialize(code, message=nil)
- @http_code = code
- super message
- end
- end
-
- # OpenTox errors
- RestClientWrapper.known_errors.each do |error|
- # create error classes
- c = Class.new Error do
- define_method :initialize do |message=nil|
- super error[:code], message
- end
- end
- OpenTox.const_set error[:class],c
-
- # define global methods for raising errors, eg. bad_request_error
- Object.send(:define_method, error[:method]) do |message|
- raise c.new(message)
- end
- end
-
-end
diff --git a/lib/feature.rb b/lib/feature.rb
index f811aef..296a174 100644
--- a/lib/feature.rb
+++ b/lib/feature.rb
@@ -1,37 +1,112 @@
module OpenTox
- # Basic feature class
- class Feature
- field :measured, type: Boolean
- field :calculated, type: Boolean
- field :category, type: String
- field :unit, type: String
- field :conditions, type: Hash
+ # Original ID (e.g. from CSV input)
+ class OriginalId < Feature
+ field :dataset_id, type: BSON::ObjectId
+ end
- # Is it a nominal feature
- # @return [TrueClass,FalseClass]
- def nominal?
- self.class == NominalFeature
- end
+ # Original SMILES (e.g. from CSV input)
+ class OriginalSmiles < Feature
+ field :dataset_id, type: BSON::ObjectId
+ end
- # Is it a numeric feature
- # @return [TrueClass,FalseClass]
- def numeric?
- self.class == NumericFeature
+ # Warnings
+ class Warnings < Feature
+ field :dataset_id, type: BSON::ObjectId
+ end
+
+ # Confidence
+ class Confidence < Feature
+ field :dataset_id, type: BSON::ObjectId
+ def name
+ "Confidence"
end
end
- # Feature for categorical variables
+ # Categorical variables
class NominalFeature < Feature
field :accept_values, type: Array
end
- # Feature for quantitative variables
+ # Quantitative variables
class NumericFeature < Feature
+ field :unit, type: String
+ end
+
+ # Nominal biological activity
+ class NominalBioActivity < NominalFeature
+ end
+
+ # Numeric biological activity
+ class NumericBioActivity < NumericFeature
+ end
+
+ # Merged nominal biological activity
+ class MergedNominalBioActivity < NominalBioActivity
+ field :original_feature_ids, type: Array
+ field :transformations, type: Array
+ end
+
+ # Merged numeric biological activity
+ class MergedNumericBioActivity < NumericBioActivity
+ field :original_feature_ids, type: Array
+ end
+
+ # Transformed nominal biological activity
+ class TransformedNominalBioActivity < NominalFeature
+ field :original_feature_id, type: BSON::ObjectId
+ field :transformation, type: Hash
+ end
+
+ # Transformed numeric biological activity
+ class TransformedNumericBioActivity < NumericFeature
+ field :original_feature_id, type: BSON::ObjectId
+ field :transformation, type: String
+ end
+
+ # Nominal lazar prediction
+ class NominalLazarPrediction < NominalFeature
+ field :model_id, type: BSON::ObjectId
+ field :training_feature_id, type: BSON::ObjectId
+ def name
+ "Prediction: #{self[:name]}"
+ end
+ end
+
+ class LazarPredictionProbability < NominalLazarPrediction
+ def name
+ "Probability: #{self[:name]}"
+ end
+ end
+
+ # Numeric lazar prediction
+ class NumericLazarPrediction < NumericFeature
+ field :model_id, type: BSON::ObjectId
+ field :training_feature_id, type: BSON::ObjectId
+ def name
+ "Prediction: #{self[:name]}"
+ end
+ end
+
+ class LazarPredictionInterval < NumericLazarPrediction
+ def name
+ "#{self[:name].capitalize} prediction interval"
+ end
+ end
+
+ class NominalSubstanceProperty < NominalFeature
+ end
+
+ class NumericSubstanceProperty < NumericFeature
+ end
+
+ class NanoParticleProperty < NumericSubstanceProperty
+ field :category, type: String
+ field :conditions, type: Hash
end
# Feature for SMARTS fragments
- class Smarts < NominalFeature
+ class Smarts < Feature
field :smarts, type: String
index "smarts" => 1
# Create feature from SMARTS string
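The refactoring replaces the generic measured/calculated flags with a class hierarchy: dataset-scoped bookkeeping features (OriginalId, OriginalSmiles, Warnings, Confidence), bio-activity features, and lazar prediction features that override `name` for display. A minimal sketch of the intended naming behavior (feature names are hypothetical):

f = OpenTox::NominalLazarPrediction.new(:name => "Mutagenicity")
f.name                                                               # => "Prediction: Mutagenicity"
OpenTox::LazarPredictionProbability.new(:name => "mutagenic").name   # => "Probability: mutagenic"
OpenTox::LazarPredictionInterval.new(:name => "lower").name          # => "Lower prediction interval"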
diff --git a/lib/import.rb b/lib/import.rb
index 0857717..cdf96e3 100644
--- a/lib/import.rb
+++ b/lib/import.rb
@@ -1,125 +1,21 @@
module OpenTox
- # Import data from external databases
- module Import
-
- class Enanomapper
- include OpenTox
-
- # Import from eNanoMapper
- def self.import
- # time critical step: JSON parsing (>99%), Oj brings only minor speed gains (~1%)
- datasets = {}
- bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle', {}, {accept: :json}))["dataset"]
- bundles.each do |bundle|
- datasets[bundle["URI"]] = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"].strip)
- $logger.debug bundle["title"].strip
- nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"], {}, {accept: :json}))["dataEntry"]
- nanoparticles.each_with_index do |np,n|
- core_id = nil
- coating_ids = []
- np["composition"].each do |c|
- uri = c["component"]["compound"]["URI"]
- data = JSON.parse(RestClientWrapper.get("https://data.enanomapper.net/query/compound/url/all?search=#{uri}", {}, {accept: :json}))
- source = data["dataEntry"][0]["compound"]["URI"]
- smiles = data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23SMILESDefault"]
- names = []
- names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"]
- names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23IUPACNameDefault"]
- if smiles
- compound = Compound.find_or_create_by(:smiles => smiles)
- compound.name = names.first
- compound.names = names.compact
- else
- compound = Compound.find_or_create_by(:name => names.first,:names => names.compact)
- end
- compound.source = source
- compound.save
- if c["relation"] == "HAS_CORE"
- core_id = compound.id.to_s
- elsif c["relation"] == "HAS_COATING"
- coating_ids << compound.id.to_s
- end
- end if np["composition"]
- nanoparticle = Nanoparticle.find_or_create_by(
- :name => np["values"]["https://data.enanomapper.net/identifier/name"],
- :source => np["compound"]["URI"],
- :core_id => core_id,
- :coating_ids => coating_ids
- )
- np["bundles"].keys.each do |bundle_uri|
- nanoparticle.dataset_ids << datasets[bundle_uri].id
- end
-
- studies = JSON.parse(RestClientWrapper.get(File.join(np["compound"]["URI"],"study"), {}, {accept: :json}))["study"]
- studies.each do |study|
- dataset = datasets[np["bundles"].keys.first]
- proteomics_features = {}
- category = study["protocol"]["topcategory"]
- source = study["protocol"]["category"]["term"]
- study["effects"].each do |effect|
-
- effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature
- effect["conditions"].delete_if { |k, v| v.nil? }
-
- if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data
-
- JSON.parse(effect["result"]["textValue"]).each do |identifier, value| # time critical step
- proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics", :unit => "Spectral counts", :source => source,:measured => true)
- nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset
- end
- else
- name = effect["endpoint"]
- unit = effect["result"]["unit"]
- warnings = []
- case name
- when "Log2 transformed" # use a sensible name
- name = "log2(Net cell association)"
- warnings = ["Original name was 'Log2 transformed'"]
- unit = "log2(mL/ug(Mg))"
- when "Total protein (BCA assay)"
- category = "P-CHEM"
- warnings = ["Category changed from TOX to P-CHEM"]
- end
- feature = klass.find_or_create_by(
- :name => name,
- :unit => unit,
- :category => category,
- :conditions => effect["conditions"],
- :source => study["protocol"]["category"]["term"],
- :measured => true,
- :warnings => warnings
- )
- nanoparticle.parse_ambit_value feature, effect["result"], dataset
- end
- end
- end
- nanoparticle.save
- print "#{n}, "
- end
- puts
+ class Import
+
+ # Import datasets from the data folder, create and validate models
+ # @return [Array<OpenTox::Model::Validation>] Validated models
+ def self.public_data
+ models = []
+ Dir[File.join(File.dirname(__FILE__),"..","data/*csv")].each do |f|
+ $logger.debug f
+ m = Model::Validation.from_csv_file f
+ $logger.debug "#{f} ID: #{m.id.to_s}"
+ m.crossvalidations.each do |cv|
+ $logger.debug cv.statistics
end
- datasets.each { |u,d| d.save }
+ models << m
end
-
-=begin
- def self.import_ld # defunct, AMBIT JSON_LD does not have substance entries
- #get list of bundle URIs
- bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"]
- datasets = []
- bundles.each do |bundle|
- uri = bundle["URI"]
- study = JSON.parse(`curl -H 'Accept:application/ld+json' '#{uri}/substance'`)
- study["@graph"].each do |i|
- puts i.to_yaml if i.keys.include? "sio:has-value"
- end
- end
- datasets.collect{|d| d.id}
- end
-=end
-
+ models
end
-
end
-
end
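Import.public_data is the counterpart of Download.public_data: it turns every CSV/JSON pair in the data folder into a crossvalidated model. A minimal usage sketch:

models = OpenTox::Import.public_data   # one validated model per data/*.csv
models.collect{|m| m.id}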
diff --git a/lib/lazar.rb b/lib/lazar.rb
index 32f0317..e77de9d 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -17,19 +17,22 @@ raise "Incorrect lazar environment variable LAZAR_ENV '#{ENV["LAZAR_ENV"]}', ple
ENV["MONGOID_ENV"] = ENV["LAZAR_ENV"]
ENV["RACK_ENV"] = ENV["LAZAR_ENV"] # should set sinatra environment
+# CH: this interferes with /etc/hosts on my machine
# search for a central mongo database in use
# http://opentox.github.io/installation/2017/03/07/use-central-mongodb-in-docker-environment
-CENTRAL_MONGO_IP = `grep -oP '^\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}(?=.*mongodb)' /etc/hosts`.chomp
+# CENTRAL_MONGO_IP = `grep -oP '^\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}(?=.*mongodb)' /etc/hosts`.chomp
Mongoid.load_configuration({
:clients => {
:default => {
:database => ENV["LAZAR_ENV"],
- :hosts => (CENTRAL_MONGO_IP.blank? ? ["localhost:27017"] : ["#{CENTRAL_MONGO_IP}:27017"]),
+ #:hosts => (CENTRAL_MONGO_IP.blank? ? ["localhost:27017"] : ["#{CENTRAL_MONGO_IP}:27017"]),
+ :hosts => ["localhost:27017"]
}
}
})
Mongoid.raise_not_found_error = false # return nil if no document is found
-$mongo = Mongo::Client.new("mongodb://#{(CENTRAL_MONGO_IP.blank? ? "127.0.0.1" : CENTRAL_MONGO_IP)}:27017/#{ENV['LAZAR_ENV']}")
+#$mongo = Mongo::Client.new("mongodb://#{(CENTRAL_MONGO_IP.blank? ? "127.0.0.1" : CENTRAL_MONGO_IP)}:27017/#{ENV['LAZAR_ENV']}")
+$mongo = Mongo::Client.new("mongodb://127.0.0.1:27017/#{ENV['LAZAR_ENV']}")
$gridfs = $mongo.database.fs
# Logger setup
@@ -68,13 +71,15 @@ suppressPackageStartupMessages({
})
"
+PUBCHEM_URI = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/"
+CHEMBL_URI = "https://www.ebi.ac.uk/chembl/api/data/molecule/"
+
# OpenTox classes and includes
-CLASSES = ["Feature","Substance","Dataset","LazarPrediction","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules
+CLASSES = ["Feature","Substance","Dataset","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation"]# Algorithm and Models are modules
[ # be aware of the require sequence as it affects class/method overwrites
"overwrite.rb",
"rest-client-wrapper.rb",
- "error.rb",
"opentox.rb",
"feature.rb",
"physchem.rb",
@@ -94,6 +99,6 @@ CLASSES = ["Feature","Substance","Dataset","LazarPrediction","CrossValidation","
"train-test-validation.rb",
"leave-one-out-validation.rb",
"crossvalidation.rb",
- #"experiment.rb",
+ "download.rb",
"import.rb",
].each{ |f| require_relative f }
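PUBCHEM_URI and CHEMBL_URI are the base URIs for the new downloaders. A hedged sketch of composing a PUG REST request from the constant (the path below is an assumption about the PubChem API, not taken from this commit):

# hypothetical lookup of PubChem CIDs for a SMILES string
uri = File.join(PUBCHEM_URI, "compound", "smiles", "CCO", "cids", "TXT")
OpenTox::RestClientWrapper.get uri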
diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb
index c33c92b..7d73b89 100644
--- a/lib/leave-one-out-validation.rb
+++ b/lib/leave-one-out-validation.rb
@@ -9,25 +9,18 @@ module OpenTox
# @param [OpenTox::Model::Lazar]
# @return [OpenTox::Validation::LeaveOneOut]
def self.create model
- bad_request_error "Cannot create leave one out validation for models with supervised feature selection. Please use crossvalidation instead." if model.algorithms[:feature_selection]
+ raise ArgumentError, "Cannot create leave one out validation for models with supervised feature selection. Please use crossvalidation instead." if model.algorithms[:feature_selection]
$logger.debug "#{model.name}: LOO validation started"
t = Time.now
- model.training_dataset.features.first.nominal? ? klass = ClassificationLeaveOneOut : klass = RegressionLeaveOneOut
+ klass = model.training_dataset.features.collect{|f| f.class}.include?(NominalBioActivity) ? ClassificationLeaveOneOut : RegressionLeaveOneOut
loo = klass.new :model_id => model.id
predictions = model.predict model.training_dataset.substances
predictions.each{|cid,p| p.delete(:neighbors)}
- nr_unpredicted = 0
predictions.each do |cid,prediction|
- if prediction[:value]
- prediction[:measurements] = model.training_dataset.values(cid, prediction[:prediction_feature_id])
- else
- nr_unpredicted += 1
- end
+ prediction[:measurements] = model.training_dataset.values(cid, prediction[:prediction_feature_id]) if prediction[:value]
predictions.delete(cid) unless prediction[:value] and prediction[:measurements]
end
predictions.select!{|cid,p| p[:value] and p[:measurements]}
- loo.nr_instances = predictions.size
- loo.nr_unpredicted = nr_unpredicted
loo.predictions = predictions
loo.statistics
$logger.debug "#{model.name}, LOO validation: #{Time.now-t} seconds"
@@ -40,25 +33,27 @@ module OpenTox
class ClassificationLeaveOneOut < LeaveOneOut
include ClassificationStatistics
field :accept_values, type: Array
- field :confusion_matrix, type: Array, default: []
- field :weighted_confusion_matrix, type: Array, default: []
- field :accuracy, type: Float
- field :weighted_accuracy, type: Float
- field :true_rate, type: Hash, default: {}
- field :predictivity, type: Hash, default: {}
- field :confidence_plot_id, type: BSON::ObjectId
+ field :confusion_matrix, type: Hash
+ field :weighted_confusion_matrix, type: Hash
+ field :accuracy, type: Hash
+ field :weighted_accuracy, type: Hash
+ field :true_rate, type: Hash
+ field :predictivity, type: Hash
+ field :nr_predictions, type: Hash
+ field :probability_plot_id, type: BSON::ObjectId
end
# Leave one out validation for regression models
class RegressionLeaveOneOut < LeaveOneOut
include RegressionStatistics
- field :rmse, type: Float, default: 0
- field :mae, type: Float, default: 0
- field :r_squared, type: Float
- field :within_prediction_interval, type: Integer, default:0
- field :out_of_prediction_interval, type: Integer, default:0
- field :correlation_plot_id, type: BSON::ObjectId
+ field :rmse, type: Hash
+ field :mae, type: Hash
+ field :r_squared, type: Hash
+ field :within_prediction_interval, type: Hash
+ field :out_of_prediction_interval, type: Hash
+ field :nr_predictions, type: Hash
field :warnings, type: Array
+ field :correlation_plot_id, type: BSON::ObjectId
end
end
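All classification and regression metrics are now Hash-valued, keyed by prediction subset (:all, :confidence_high, :confidence_low) instead of single floats. A minimal sketch of a LOO run, assuming the Validation namespace used elsewhere in lazar:

loo = OpenTox::Validation::LeaveOneOut.create model   # model without feature selection
loo.accuracy   # e.g. {:all => 0.84, :confidence_high => 0.91, :confidence_low => 0.70} (illustrative numbers)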
diff --git a/lib/model.rb b/lib/model.rb
index dce53a9..d7b2df6 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -32,20 +32,20 @@ module OpenTox
# @param [OpenTox::Feature, nil] prediction_feature
# By default the first feature of the training dataset will be predicted, specify a prediction_feature if you want to predict another feature
# @param [Hash, nil] algorithms
- # Default algorithms will be used, if no algorithms parameter is provided. The algorithms hash has the following keys: :descriptors (specifies the descriptors to be used for similarity calculations and local QSAR models), :similarity (similarity algorithm and threshold), :feature_selection (feature selection algorithm), :prediction (local QSAR algorithm). Default parameters are used for unspecified keys.
+ # Default algorithms will be used, if no algorithms parameter is provided. The algorithms hash has the following keys: :descriptors (specifies the descriptors to be used for similarity calculations and local QSAR models), :similarity (similarity algorithm and thresholds for predictions with high and low confidence), :feature_selection (feature selection algorithm), :prediction (local QSAR algorithm). Default parameters are used for unspecified keys.
#
# @return [OpenTox::Model::Lazar]
def self.create prediction_feature:nil, training_dataset:, algorithms:{}
- bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset
- prediction_feature = training_dataset.features.first unless prediction_feature
- # TODO: prediction_feature without training_dataset: use all available data
+ raise ArgumentError, "Please provide a training_dataset and a optional prediction_feature." unless prediction_feature or training_dataset
+ prediction_feature ||= training_dataset.features.select{|f| f.is_a? NumericBioActivity or f.is_a? NominalBioActivity}.first
# guess model type
- prediction_feature.numeric? ? model = LazarRegression.new : model = LazarClassification.new
+ model = prediction_feature.is_a?(NumericBioActivity) ? LazarRegression.new : LazarClassification.new
model.prediction_feature_id = prediction_feature.id
model.training_dataset_id = training_dataset.id
- model.name = "#{prediction_feature.name} (#{training_dataset.name})"
+ model.name = training_dataset.name
+
# git or gem versioning
dir = File.dirname(__FILE__)
path = File.expand_path("../", File.expand_path(dir))
@@ -62,7 +62,7 @@ module OpenTox
# set defaults#
substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq
- bad_request_error "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1
+ raise ArgumentError, "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1
if substance_classes.first == "OpenTox::Compound"
@@ -80,7 +80,7 @@ module OpenTox
}
model.algorithms[:similarity] = {
:method => "Algorithm::Similarity.tanimoto",
- :min => 0.1,
+ :min => [0.5,0.2],
}
elsif model.class == LazarRegression
model.algorithms[:prediction] = {
@@ -88,7 +88,7 @@ module OpenTox
}
model.algorithms[:similarity] = {
:method => "Algorithm::Similarity.tanimoto",
- :min => 0.5,
+ :min => [0.5,0.2],
}
end
@@ -100,7 +100,7 @@ module OpenTox
},
:similarity => {
:method => "Algorithm::Similarity.weighted_cosine",
- :min => 0.5,
+ :min => [0.5,0.2],
},
:prediction => {
:method => "Algorithm::Caret.rf",
@@ -110,7 +110,7 @@ module OpenTox
},
}
else
- bad_request_error "Cannot create models for #{substance_classes.first}."
+ raise ArgumentError, "Cannot create models for #{substance_classes.first}."
end
# overwrite defaults with explicit parameters
@@ -175,7 +175,7 @@ module OpenTox
model.descriptor_ids = feature_ids & property_ids
model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}}
else
- bad_request_error "Descriptor method '#{descriptor_method}' not implemented."
+ raise ArgumentError, "Descriptor method '#{descriptor_method}' not implemented."
end
if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method]
@@ -197,7 +197,7 @@ module OpenTox
# Predict a substance (compound or nanoparticle)
# @param [OpenTox::Substance]
# @return [Hash]
- def predict_substance substance, threshold = self.algorithms[:similarity][:min]
+ def predict_substance substance, threshold = self.algorithms[:similarity][:min].first, prediction = nil
@independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data
case algorithms[:similarity][:method]
@@ -224,11 +224,11 @@ module OpenTox
end
end
else
- bad_request_error "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'."
+ raise ArgumentError, "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'."
end
- prediction = {:warnings => [], :measurements => []}
- prediction[:warnings] << "Similarity threshold #{threshold} < #{algorithms[:similarity][:min]}, prediction may be out of applicability domain." if threshold < algorithms[:similarity][:min]
+ prediction ||= {:warnings => [], :measurements => []}
+ prediction[:warnings] << "Similarity threshold #{threshold} < #{algorithms[:similarity][:min].first}, prediction may be out of applicability domain." if threshold < algorithms[:similarity][:min].first
neighbor_ids = []
neighbor_similarities = []
neighbor_dependent_variables = []
@@ -238,7 +238,7 @@ module OpenTox
substance_ids.each_with_index do |s,i|
# handle query substance
if substance.id.to_s == s
- prediction[:measurements] << dependent_variables[i]
+ prediction[:measurements] << dependent_variables[i] unless threshold < algorithms[:similarity][:min].first # add measurements only once, on the first pass
prediction[:info] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance."
else
if fingerprints?
@@ -264,25 +264,37 @@ module OpenTox
if neighbor_similarities.empty?
prediction[:value] = nil
- prediction[:warnings] << "Could not find similar substances with experimental data in the training dataset."
+ prediction[:warnings] << "Could not find similar substances for threshold #{threshold} with experimental data in the training dataset."
+ if threshold == algorithms[:similarity][:min].last
+ prediction[:confidence] = "Out of applicability domain: Could not find similar substances with experimental data in the training dataset (Threshold: #{algorithms[:similarity][:min].last})."
+ return prediction
+ end
elsif neighbor_similarities.size == 1
prediction[:value] = nil
- prediction[:warnings] << "Cannot create prediction: Only one similar compound in the training set."
- prediction[:neighbors] = [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}]
+ prediction[:warnings] << "Cannot create prediction: Only one similar compound for threshold #{threshold} in the training set (Threshold: #{algorithms[:similarity][:min].last})."
+ prediction[:neighbors] = [{:id => neighbor_ids.first, :measurement => neighbor_dependent_variables[0], :similarity => neighbor_similarities.first}]
+ if threshold == algorithms[:similarity][:min].last
+ prediction[:confidence] = "Out of applicability domain: Only one similar compound in the training set."
+ return prediction
+ end
else
query_descriptors.collect!{|d| d ? 1 : 0} if algorithms[:feature_selection] and algorithms[:descriptors][:method] == "fingerprint"
# call prediction algorithm
result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors
prediction.merge! result
prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}}
- #if neighbor_similarities.max < algorithms[:similarity][:warn_min]
- #prediction[:warnings] << "Closest neighbor has similarity < #{algorithms[:similarity][:warn_min]}. Prediction may be out of applicability domain."
- #end
end
- if prediction[:warnings].empty? or threshold < algorithms[:similarity][:min] or threshold <= 0.2
- prediction
- else # try again with a lower threshold
- predict_substance substance, 0.2
+ if threshold == algorithms[:similarity][:min].first
+ if prediction[:warnings].empty?
+ prediction[:confidence] = "Similar to bioassay results"
+ return prediction
+ else # try again with a lower threshold
+ prediction[:warnings] << "Lowering similarity threshold to #{algorithms[:similarity][:min].last}."
+ predict_substance substance, algorithms[:similarity][:min].last, prediction
+ end
+ elsif threshold < algorithms[:similarity][:min].first
+ prediction[:confidence] = "Lower than bioassay results"
+ return prediction
end
end
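In effect algorithms[:similarity][:min] = [high, low] implements a two-pass confidence cascade. A condensed restatement of the flow above, as a commented sketch:

high, low = [0.5, 0.2]   # algorithms[:similarity][:min]
# pass 1: predict_substance(substance, high)
#   no warnings -> prediction[:confidence] = "Similar to bioassay results", return
#   warnings    -> warn about lowering the threshold, recurse with threshold = low
# pass 2 (threshold < high):
#   prediction[:confidence] = "Lower than bioassay results", return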
@@ -302,13 +314,18 @@ module OpenTox
elsif object.is_a? Dataset
substances = object.substances
else
- bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Substances or an OpenTox::Dataset as parameter."
+ raise ArgumentError, "Please provide a OpenTox::Compound an Array of OpenTox::Substances or an OpenTox::Dataset as parameter."
end
# make predictions
predictions = {}
substances.each do |c|
predictions[c.id.to_s] = predict_substance c
+ if prediction_feature.is_a? NominalBioActivity and predictions[c.id.to_s][:value]
+ prediction_feature.accept_values.each do |v|
+ predictions[c.id.to_s][:probabilities][v] ||= 0.0 # use 0 instead of empty probabilities (happens if all neighbors have the same activity)
+ end
+ end
predictions[c.id.to_s][:prediction_feature_id] = prediction_feature_id
end
@@ -320,17 +337,35 @@ module OpenTox
elsif object.is_a? Array
return predictions
elsif object.is_a? Dataset
- # prepare prediction dataset
- measurement_feature = Feature.find prediction_feature_id
-
- prediction_feature = NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" )
- prediction_dataset = LazarPrediction.create(
- :name => "Lazar prediction for #{prediction_feature.name}",
- :creator => __FILE__,
- :prediction_feature_id => prediction_feature.id,
- :predictions => predictions
- )
- return prediction_dataset
+ d = object.copy
+ #warning_feature = Warnings.find_or_create_by(:dataset_id => d.id)
+ confidence_feature = Confidence.find_or_create_by(:dataset_id => d.id)
+ if prediction_feature.is_a? NominalBioActivity
+ f = NominalLazarPrediction.find_or_create_by(:name => prediction_feature.name, :accept_values => prediction_feature.accept_values, :model_id => self.id, :training_feature_id => prediction_feature.id)
+ probability_features = {}
+ prediction_feature.accept_values.each do |v|
+ probability_features[v] = LazarPredictionProbability.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id)
+ end
+ elsif prediction_feature.is_a? NumericBioActivity
+ f = NumericLazarPrediction.find_or_create_by(:name => prediction_feature.name, :unit => prediction_feature.unit, :model_id => self.id, :training_feature_id => prediction_feature.id)
+ prediction_interval = []
+ ["lower","upper"].each do |v|
+ prediction_interval << LazarPredictionInterval.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id)
+ end
+ end
+
+ # add predictions to dataset
+ predictions.each do |substance_id,p|
+ substance_id = BSON::ObjectId.from_string(substance_id)
+ d.add substance_id,confidence_feature,p[:confidence]
+ unless p[:value].nil?
+ d.add substance_id,f,p[:value]
+ p[:probabilities].each {|name,p| d.add substance_id,probability_features[name],p} if p[:probabilities]
+ p[:prediction_interval].each_with_index {|v,i| d.add substance_id, prediction_interval[i], v } if p[:prediction_interval]
+ end
+ end
+ d.save
+ return d
end
end
@@ -402,6 +437,7 @@ module OpenTox
field :species, type: String
field :source, type: String
field :unit, type: String
+ field :warnings, type: Array
field :model_id, type: BSON::ObjectId
field :repeated_crossvalidation_id, type: BSON::ObjectId
@@ -461,11 +497,11 @@ module OpenTox
end
# Create and validate a lazar model from a csv file with training data and a json file with metadata
- # @param [File] CSV file with two columns. The first line should contain either SMILES or InChI (first column) and the endpoint (second column). The first column should contain either the SMILES or InChI of the training compounds, the second column the training compounds toxic activities (qualitative or quantitative). Use -log10 transformed values for regression datasets. Add metadata to a JSON file with the same basename containing the fields "species", "endpoint", "source" and "unit" (regression only). You can find example training data at https://github.com/opentox/lazar-public-data.
- # @return [OpenTox::Model::Validation] lazar model with three independent 10-fold crossvalidations
+ # @param [File] CSV file with two or three columns. The first column is optional and may contain an arbitrary substance ID. The next column should contain the SMILES or InChIs of the training compounds, followed by their toxic activities (qualitative or quantitative) in the last column. Use -log10 transformed values for regression datasets. The first line should contain "ID" (optional), either "SMILES" or "InChI", and the endpoint name (last column). Add metadata to a JSON file with the same basename containing the fields "species", "endpoint", "source", "qmrf" (optional) and "unit" (regression only). You can find example training data in the data folder of lazar.
+ # @return [OpenTox::Model::Validation] lazar model with five independent 10-fold crossvalidations
def self.from_csv_file file
metadata_file = file.sub(/csv$/,"json")
- bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file
+ raise ArgumentError, "No metadata file #{metadata_file}" unless File.exist? metadata_file
model_validation = self.new JSON.parse(File.read(metadata_file))
training_dataset = Dataset.from_csv_file file
model = Lazar.create training_dataset: training_dataset
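A minimal example of the input pair described in the doc string (file names and contents are illustrative):

# data/example.csv
#   ID,SMILES,Acute toxicity
#   1,CCO,2.34
#   2,c1ccccc1,3.10
# data/example.json
#   {"species": "Fathead minnow", "endpoint": "Acute toxicity", "source": "https://example.org", "unit": "mmol/L"}
model = OpenTox::Model::Validation.from_csv_file "data/example.csv"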
@@ -477,6 +513,7 @@ module OpenTox
# Create and validate a nano-lazar model, import data from eNanoMapper if necessary
# nano-lazar methods are described in detail in https://github.com/enanomapper/nano-lazar-paper/blob/master/nano-lazar.pdf
+ # *eNanoMapper import is currently broken because APIs and data formats are constantly changing and we have no resources to track these changes permanently!*
# @param [OpenTox::Dataset, nil] training_dataset
# @param [OpenTox::Feature, nil] prediction_feature
# @param [Hash, nil] algorithms
@@ -488,7 +525,7 @@ module OpenTox
unless training_dataset # try to import
Import::Enanomapper.import
training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
- bad_request_error "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset
+ raise ArgumentError, "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset
end
prediction_feature ||= Feature.where(name: "log2(Net cell association)", category: "TOX").first
diff --git a/lib/opentox.rb b/lib/opentox.rb
index 5c300cf..fb2a579 100644
--- a/lib/opentox.rb
+++ b/lib/opentox.rb
@@ -11,13 +11,6 @@ module OpenTox
include Mongoid::Timestamps
store_in collection: klass.downcase.pluralize
field :name, type: String
- field :source, type: String
- field :warnings, type: Array, default: []
-
- def warn warning
- $logger.warn warning
- warnings << warning
- end
end
OpenTox.const_set klass,c
end
diff --git a/lib/overwrite.rb b/lib/overwrite.rb
index 0dd1c8a..d482902 100644
--- a/lib/overwrite.rb
+++ b/lib/overwrite.rb
@@ -84,7 +84,7 @@ class String
def to_boolean
return true if self == true || self =~ (/(true|t|yes|y|1)$/i)
return false if self == false || self.nil? || self =~ (/(false|f|no|n|0)$/i)
- bad_request_error "invalid value for Boolean: \"#{self}\""
+ raise ArgumentError, "invalid value for Boolean: \"#{self}\""
end
end
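The behavior of String#to_boolean is unchanged apart from the error class; for illustration:

"yes".to_boolean    # => true
"F".to_boolean      # => false
"maybe".to_boolean  # raises ArgumentError: invalid value for Boolean: "maybe"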
diff --git a/lib/physchem.rb b/lib/physchem.rb
index 07df867..2af043b 100644
--- a/lib/physchem.rb
+++ b/lib/physchem.rb
@@ -1,7 +1,7 @@
module OpenTox
# Feature for physico-chemical descriptors
- class PhysChem < NumericFeature
+ class PhysChem < NumericSubstanceProperty
field :library, type: String
field :descriptor, type: String
@@ -45,7 +45,7 @@ module OpenTox
def self.descriptors desc=DESCRIPTORS
desc.collect do |name,description|
lib,desc = name.split('.',2)
- self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true)
+ self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description)
end
end
@@ -59,11 +59,11 @@ module OpenTox
CDK_DESCRIPTIONS.select{|d| desc == d[:java_class].split('.').last.sub('Descriptor','') }.first[:names].each do |n|
dname = "#{name}.#{n}"
description = DESCRIPTORS[dname]
- udesc << self.find_or_create_by(:name => dname, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true)
+ udesc << self.find_or_create_by(:name => dname, :library => lib, :descriptor => desc, :description => description)
end
else
description = DESCRIPTORS[name]
- udesc << self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true)
+ udesc << self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description)
end
end
udesc
diff --git a/lib/regression.rb b/lib/regression.rb
index 25c0732..fd2855f 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -17,7 +17,7 @@ module OpenTox
sim_sum += weights[i]
end if dependent_variables
sim_sum == 0 ? prediction = nil : prediction = weighted_sum/sim_sum
- {:value => prediction, :warnings => ["Weighted average prediction, no prediction interval available."]}
+ {:value => prediction}
end
end
diff --git a/lib/rest-client-wrapper.rb b/lib/rest-client-wrapper.rb
index c9fd40f..db23e66 100644
--- a/lib/rest-client-wrapper.rb
+++ b/lib/rest-client-wrapper.rb
@@ -28,14 +28,14 @@ module OpenTox
uri = Addressable::URI.encode(uri)
# check input
- bad_request_error "Headers are not a hash: #{headers.inspect} for #{uri}." unless headers==nil or headers.is_a?(Hash)
+ raise ArgumentError, "Headers are not a hash: #{headers.inspect} for #{uri}." unless headers==nil or headers.is_a?(Hash)
headers[:subjectid] ||= @@subjectid
- bad_request_error "Invalid URI: '#{uri}'" unless URI.valid? uri
+ raise ArgumentError, "Invalid URI: '#{uri}'" unless URI.valid? uri
# make sure that no header parameters are set in the payload
[:accept,:content_type,:subjectid].each do |header|
if defined? $aa || URI(uri).host == URI($aa[:uri]).host
else
- bad_request_error "#{header} should be submitted in the headers of URI: #{uri}" if payload and payload.is_a?(Hash) and payload[header]
+ raise ArgumentError, "#{header} should be submitted in the headers of URI: #{uri}" if payload and payload.is_a?(Hash) and payload[header]
end
end
@@ -56,6 +56,7 @@ module OpenTox
@response = @request.execute do |response, request, result|
if [301, 302, 307].include? response.code and request.method == :get
response.follow_redirection(request, result)
+=begin
elsif response.code >= 400 and !URI.task?(uri)
error = known_errors.collect{|e| e if e[:code] == response.code}.compact.first
begin # errors are returned as error reports in json, try to parse
@@ -68,6 +69,7 @@ module OpenTox
cause = nil
end
Object.method(error[:method]).call "#{msg}, #{uri}, #{cause}" # call error method
+=end
else
response
end
@@ -75,6 +77,7 @@ module OpenTox
end
end
+=begin
#@return [Array] of hashes with error code, method and class
def self.known_errors
errors = []
@@ -88,6 +91,7 @@ module OpenTox
end
errors
end
+=end
end
end
diff --git a/lib/substance.rb b/lib/substance.rb
index ef49659..5c486d8 100644
--- a/lib/substance.rb
+++ b/lib/substance.rb
@@ -3,7 +3,6 @@ module OpenTox
# Base class for substances (e.g. compunds, nanoparticles)
class Substance
field :properties, type: Hash, default: {}
- field :dataset_ids, type: Array, default: []
end
end
diff --git a/lib/train-test-validation.rb b/lib/train-test-validation.rb
index 9a5532d..d034cd1 100644
--- a/lib/train-test-validation.rb
+++ b/lib/train-test-validation.rb
@@ -18,22 +18,15 @@ module OpenTox
validation_model = model.class.create prediction_feature: model.prediction_feature, training_dataset: training_set, algorithms: model.algorithms
validation_model.save
predictions = validation_model.predict test_set.substances
- nr_unpredicted = 0
predictions.each do |cid,prediction|
- if prediction[:value]
- prediction[:measurements] = test_set.values(cid, prediction[:prediction_feature_id])
- else
- nr_unpredicted += 1
- end
+ prediction[:measurements] = test_set.values(cid, prediction[:prediction_feature_id]) if prediction[:value]
end
predictions.select!{|cid,p| p[:value] and p[:measurements]}
- # hack to avoid mongos file size limit error on large datasets
- #predictions.each{|cid,p| p[:neighbors] = []} if model.training_dataset.name.match(/mutagenicity/i)
+ # remove neighbors to avoid mongos file size limit error on large datasets
+ predictions.each{|cid,p| p.delete(:neighbors)} #if model.training_dataset.name.match(/mutagenicity/i)
validation = self.new(
:model_id => validation_model.id,
:test_dataset_id => test_set.id,
- :nr_instances => test_set.substances.size,
- :nr_unpredicted => nr_unpredicted,
:predictions => predictions
)
validation.save
diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
index 69e7992..5fd9985 100644
--- a/lib/validation-statistics.rb
+++ b/lib/validation-statistics.rb
@@ -7,62 +7,66 @@ module OpenTox
# @return [Hash]
def statistics
self.accept_values = model.prediction_feature.accept_values
- self.confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
- self.weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)}
- nr_instances = 0
+ self.confusion_matrix = {:all => Array.new(accept_values.size){Array.new(accept_values.size,0)}, :confidence_high => Array.new(accept_values.size){Array.new(accept_values.size,0)}, :confidence_low => Array.new(accept_values.size){Array.new(accept_values.size,0)}}
+ self.nr_predictions = {:all => 0,:confidence_high => 0,:confidence_low => 0}
predictions.each do |cid,pred|
- # TODO
- # use predictions without probabilities (single neighbor)??
- # use measured majority class??
+ # TODO: use measured majority class or all measurements??
if pred[:measurements].uniq.size == 1 and pred[:probabilities]
m = pred[:measurements].first
if pred[:value] == m
- if pred[:value] == accept_values[0]
- confusion_matrix[0][0] += 1
- weighted_confusion_matrix[0][0] += pred[:probabilities][pred[:value]]
- nr_instances += 1
- elsif pred[:value] == accept_values[1]
- confusion_matrix[1][1] += 1
- weighted_confusion_matrix[1][1] += pred[:probabilities][pred[:value]]
- nr_instances += 1
+ accept_values.each_with_index do |v,i|
+ if pred[:value] == v
+ confusion_matrix[:all][i][i] += 1
+ self.nr_predictions[:all] += 1
+ if pred[:confidence].match(/Similar/i)
+ confusion_matrix[:confidence_high][i][i] += 1
+ self.nr_predictions[:confidence_high] += 1
+ elsif pred[:confidence].match(/Low/i)
+ confusion_matrix[:confidence_low][i][i] += 1
+ self.nr_predictions[:confidence_low] += 1
+ end
+ end
end
elsif pred[:value] != m
- if pred[:value] == accept_values[0]
- confusion_matrix[0][1] += 1
- weighted_confusion_matrix[0][1] += pred[:probabilities][pred[:value]]
- nr_instances += 1
- elsif pred[:value] == accept_values[1]
- confusion_matrix[1][0] += 1
- weighted_confusion_matrix[1][0] += pred[:probabilities][pred[:value]]
- nr_instances += 1
+ accept_values.each_with_index do |v,i|
+ if pred[:value] == v
+ confusion_matrix[:all][i][(i+1)%2] += 1
+ self.nr_predictions[:all] += 1
+ if pred[:confidence].match(/Similar/i)
+ confusion_matrix[:confidence_high][i][(i+1)%2] += 1
+ self.nr_predictions[:confidence_high] += 1
+ elsif pred[:confidence].match(/Low/i)
+ confusion_matrix[:confidence_low][i][(i+1)%2] += 1
+ self.nr_predictions[:confidence_low] += 1
+ end
+ end
end
end
end
end
- self.true_rate = {}
- self.predictivity = {}
+
+ self.true_rate = {:all => {}, :confidence_high => {}, :confidence_low => {}}
+ self.predictivity = {:all => {}, :confidence_high => {}, :confidence_low => {}}
accept_values.each_with_index do |v,i|
- self.true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
- self.predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
- end
- confidence_sum = 0
- weighted_confusion_matrix.each do |r|
- r.each do |c|
- confidence_sum += c
+ [:all,:confidence_high,:confidence_low].each do |a|
+ self.true_rate[a][v] = confusion_matrix[a][i][i]/confusion_matrix[a][i].reduce(:+).to_f
+ self.predictivity[a][v] = confusion_matrix[a][i][i]/confusion_matrix[a].collect{|n| n[i]}.reduce(:+).to_f
end
end
- self.accuracy = (confusion_matrix[0][0]+confusion_matrix[1][1])/nr_instances.to_f
- self.weighted_accuracy = (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f
+ self.accuracy = {}
+ [:all,:confidence_high,:confidence_low].each do |a|
+ self.accuracy[a] = (confusion_matrix[a][0][0]+confusion_matrix[a][1][1])/nr_predictions[a].to_f
+ end
$logger.debug "Accuracy #{accuracy}"
+ $logger.debug "Nr Predictions #{nr_predictions}"
save
{
:accept_values => accept_values,
:confusion_matrix => confusion_matrix,
- :weighted_confusion_matrix => weighted_confusion_matrix,
:accuracy => accuracy,
- :weighted_accuracy => weighted_accuracy,
:true_rate => self.true_rate,
:predictivity => self.predictivity,
+ :nr_predictions => nr_predictions,
}
end
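With Hash-valued fields a single classification validation now carries separate confusion matrices and accuracies per subset. Illustrative shape of the returned hash for a binary endpoint (numbers invented but internally consistent):

{
  :accept_values => ["mutagenic", "non-mutagenic"],
  :confusion_matrix => {:all => [[120,15],[20,95]], :confidence_high => [[90,5],[8,70]], :confidence_low => [[30,10],[12,25]]},
  :accuracy => {:all => 0.86, :confidence_high => 0.92, :confidence_low => 0.71},
  :nr_predictions => {:all => 250, :confidence_high => 173, :confidence_low => 77}
}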
@@ -97,7 +101,7 @@ module OpenTox
R.assign "probability", probabilities
R.eval "image = qplot(probability,accuracy)+ylab('Accumulated accuracy')+xlab('Prediction probability')+ylim(c(0,1))+scale_x_reverse()+geom_line()"
R.eval "ggsave(file='#{tmpfile}', plot=image)"
- file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_probability_plot.svg")
+ file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_probability_plot.#{format}")
plot_id = $gridfs.insert_one(file)
update(:probability_plot_id => plot_id)
#end
@@ -108,29 +112,27 @@ module OpenTox
# Statistical evaluation of regression validations
module RegressionStatistics
+ attr_accessor :x, :y
+
# Get statistics
# @return [Hash]
def statistics
self.warnings = []
- self.rmse = 0
- self.mae = 0
- self.within_prediction_interval = 0
- self.out_of_prediction_interval = 0
- x = []
- y = []
+ self.rmse = {:all =>0,:confidence_high => 0,:confidence_low => 0}
+ self.r_squared = {:all =>0,:confidence_high => 0,:confidence_low => 0}
+ self.mae = {:all =>0,:confidence_high => 0,:confidence_low => 0}
+ self.within_prediction_interval = {:all =>0,:confidence_high => 0,:confidence_low => 0}
+ self.out_of_prediction_interval = {:all =>0,:confidence_high => 0,:confidence_low => 0}
+ @x = {:all => [],:confidence_high => [],:confidence_low => []}
+ @y = {:all => [],:confidence_high => [],:confidence_low => []}
+ self.nr_predictions = {:all =>0,:confidence_high => 0,:confidence_low => 0}
predictions.each do |cid,pred|
- if pred[:value] and pred[:measurements]
- x << pred[:measurements].median
- y << pred[:value]
- error = pred[:value]-pred[:measurements].median
- self.rmse += error**2
- self.mae += error.abs
- if pred[:prediction_interval]
- if pred[:measurements].median >= pred[:prediction_interval][0] and pred[:measurements].median <= pred[:prediction_interval][1]
- self.within_prediction_interval += 1
- else
- self.out_of_prediction_interval += 1
- end
+ if pred[:value] and pred[:measurements] and !pred[:measurements].empty?
+ insert_prediction pred, :all
+ if pred[:confidence].match(/Similar/i)
+ insert_prediction pred, :confidence_high
+ elsif pred[:confidence].match(/Low/i)
+ insert_prediction pred, :confidence_low
end
else
trd_id = model.training_dataset_id
@@ -139,39 +141,49 @@ module OpenTox
$logger.debug "No training activities for #{smiles} in training dataset #{trd_id}."
end
end
- R.assign "measurement", x
- R.assign "prediction", y
- R.eval "r <- cor(measurement,prediction,use='pairwise')"
- self.r_squared = R.eval("r").to_ruby**2
- self.mae = self.mae/predictions.size
- self.rmse = Math.sqrt(self.rmse/predictions.size)
+ [:all,:confidence_high,:confidence_low].each do |a|
+ if @x[a].size > 2
+ R.assign "measurement", @x[a]
+ R.assign "prediction", @y[a]
+ R.eval "r <- cor(measurement,prediction,use='pairwise')"
+ self.r_squared[a] = R.eval("r").to_ruby**2
+ else
+ self.r_squared[a] = 0
+ end
+ if self.nr_predictions[a] > 0
+ self.mae[a] = self.mae[a]/self.nr_predictions[a]
+ self.rmse[a] = Math.sqrt(self.rmse[a]/self.nr_predictions[a])
+ else
+ self.mae[a] = nil
+ self.rmse[a] = nil
+ end
+ end
$logger.debug "R^2 #{r_squared}"
$logger.debug "RMSE #{rmse}"
$logger.debug "MAE #{mae}"
- $logger.debug "#{percent_within_prediction_interval.round(2)}% of measurements within prediction interval"
- $logger.debug "#{warnings}"
+ $logger.debug "Nr predictions #{nr_predictions}"
+ $logger.debug "#{within_prediction_interval} measurements within prediction interval"
save
{
:mae => mae,
:rmse => rmse,
:r_squared => r_squared,
- :within_prediction_interval => within_prediction_interval,
+ :within_prediction_interval => self.within_prediction_interval,
:out_of_prediction_interval => out_of_prediction_interval,
+ :nr_predictions => nr_predictions,
}
end
- # Get percentage of measurements within the prediction interval
- # @return [Float]
- def percent_within_prediction_interval
- 100*within_prediction_interval.to_f/(within_prediction_interval+out_of_prediction_interval)
- end
-
# Plot predicted vs measured values
# @param [String,nil] format
# @return [Blob]
def correlation_plot format: "png"
- unless correlation_plot_id
- tmpfile = "/tmp/#{id.to_s}_correlation.#{format}"
+ #unless correlation_plot_id
+ #tmpfile = "/tmp/#{id.to_s}_correlation.#{format}"
+ tmpdir = "/tmp"
+ #p tmpdir
+ FileUtils.mkdir_p tmpdir
+ tmpfile = File.join(tmpdir,"#{id.to_s}_correlation.#{format}")
x = []
y = []
feature = Feature.find(predictions.first.last["prediction_feature_id"])
@@ -187,7 +199,7 @@ module OpenTox
title = "log2(Net cell association [mL/ug(Mg)])"
else
title = feature.name
- title += " [#{feature.unit}]" if feature.unit and !feature.unit.blank?
+ title += "-log10(#{feature.unit})" if feature.unit and !feature.unit.blank?
end
R.eval "image = qplot(prediction,measurement,main='#{title}',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)"
R.eval "image = image + geom_abline(intercept=0, slope=1)"
@@ -195,7 +207,7 @@ module OpenTox
file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.#{format}")
plot_id = $gridfs.insert_one(file)
update(:correlation_plot_id => plot_id)
- end
+ #end
$gridfs.find_one(_id: correlation_plot_id).data
end
@@ -215,6 +227,24 @@ module OpenTox
end
worst_predictions.sort_by{|sid,p| p["distance_prediction_interval"] }.to_h
end
+
+ private
+
+ def insert_prediction prediction, type
+ self.nr_predictions[type] +=1
+ @x[type] << prediction[:measurements].median
+ @y[type] << prediction[:value]
+ error = prediction[:value]-prediction[:measurements].median
+ self.rmse[type] += error**2
+ self.mae[type] += error.abs
+ if prediction[:prediction_interval]
+ if prediction[:measurements].median >= prediction[:prediction_interval][0] and prediction[:measurements].median <= prediction[:prediction_interval][1]
+ self.within_prediction_interval[type] += 1
+ else
+ self.out_of_prediction_interval[type] += 1
+ end
+ end
+ end
end
end
end
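insert_prediction accumulates the squared and absolute errors per subset; statistics then normalizes, i.e. MAE = sum(|e|)/n and RMSE = sqrt(sum(e^2)/n), with each error taken against the median of the measured values. A worked miniature (error values hypothetical):

errors = [0.2, -0.5, 0.1]   # pred[:value] - pred[:measurements].median per prediction
mae  = errors.inject(0){|s,e| s + e.abs} / errors.size.to_f           # => 0.267
rmse = Math.sqrt(errors.inject(0){|s,e| s + e**2} / errors.size.to_f) # => 0.316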
diff --git a/lib/validation.rb b/lib/validation.rb
index c9954b6..9402361 100644
--- a/lib/validation.rb
+++ b/lib/validation.rb
@@ -10,8 +10,6 @@ module OpenTox
store_in collection: "validations"
field :name, type: String
field :model_id, type: BSON::ObjectId
- field :nr_instances, type: Integer, default: 0
- field :nr_unpredicted, type: Integer, default: 0
field :predictions, type: Hash, default: {}
field :finished_at, type: Time