summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorhelma@in-silico.ch <helma@in-silico.ch>2018-11-14 13:35:17 +0100
committerhelma@in-silico.ch <helma@in-silico.ch>2018-11-14 13:35:17 +0100
commitae78e8216909ebfa708b8da3c55248a68abc291c (patch)
treec956dcd8d9d6ef48ccace8ab922bd5eb793002c8 /lib
parent6e23be652ad90c747aaccf15258bdaa4458185a4 (diff)
public model validation, updated documentation
Diffstat (limited to 'lib')
-rw-r--r--lib/dataset.rb36
-rw-r--r--lib/download.rb35
-rw-r--r--lib/import.rb12
-rw-r--r--lib/model.rb25
4 files changed, 49 insertions, 59 deletions
diff --git a/lib/dataset.rb b/lib/dataset.rb
index b09d7bf..90b4993 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -384,6 +384,8 @@ module OpenTox
sdf
end
+ # Get lazar predictions from a dataset
+ # @return [Hash] predictions
def predictions
predictions = {}
substances.each do |s|
@@ -448,7 +450,11 @@ module OpenTox
end
# Merge an array of datasets
- # @param [Array<OpenTox::Dataset>] datasets to be merged
+ # @param [Array<OpenTox::Dataset>] datasets Datasets to be merged
+ # @param [Array<OpenTox::Feature>] features Features to be merged (same size as datasets)
+ # @param [Array<Hash>] value_maps Value transformations (use nil for keeping original values, same size as datasets)
+ # @param [Bool] keep_original_features Copy original features/values to the merged dataset
+ # @param [Bool] remove_duplicates Delete duplicated values (assuming they come from the same experiment)
# @return [OpenTox::Dataset] merged dataset
def self.merge datasets: , features: , value_maps: , keep_original_features: , remove_duplicates:
dataset = self.create(:source => datasets.collect{|d| d.id.to_s}.join(", "), :name => datasets.collect{|d| d.name}.uniq.join(", ")+" merged")
@@ -489,34 +495,6 @@ module OpenTox
dataset
end
- # Change nominal feature values
- # @param [NominalFeature] Original feature
- # @param [Hash] how to change feature values
- def map feature, map
- dataset = self.copy
- new_feature = TransformedNominalBioActivity.find_or_create_by(:name => feature.name + " (transformed)", :original_feature_id => feature.id, :transformation => map, :accept_values => map.values.sort)
- compounds.each do |c|
- values(c,feature).each { |v| dataset.add c, new_feature, map[v] }
- end
- dataset.save
- dataset
- end
-
- def merge_nominal_features nominal_features, maps=[]
- dataset = self.copy
- new_feature = MergedNominalBioActivity.find_or_create_by(:name => nominal_features.collect{|f| f.name}.join("/") + " (transformed)", :original_feature_id => feature.id, :transformation => map, :accept_values => map.values.sort)
-
- compounds.each do |c|
- if map
- values(c,feature).each { |v| dataset.add c, new_feature, map[v] }
- else
- end
- end
- end
-
- def transform # TODO
- end
-
end
end
diff --git a/lib/download.rb b/lib/download.rb
index 99d8842..5467167 100644
--- a/lib/download.rb
+++ b/lib/download.rb
@@ -4,6 +4,13 @@ module OpenTox
DATA = File.join(File.dirname(__FILE__),"..","data")
+ # Download classification dataset from PubChem into the data folder
+ # @param [Integer] aid PubChem Assay ID
+ # @param [String] active Name for the "Active" class
+ # @param [String] inactive Name for the "Inactive" class
+ # @param [String] species Species name
+ # @param [String] endpoint Endpoint name
+ # @param [Hash] qmrf Name and group for QMRF reports (optional)
def self.pubchem_classification aid: , active: , inactive: , species: , endpoint:, qmrf: nil
aid_url = File.join PUBCHEM_URI, "assay/aid/#{aid}"
@@ -42,7 +49,7 @@ module OpenTox
end
(cids-pubchem_cids).each { |cid| warnings << "Could not retrieve SMILES for CID '#{cid}', all entries are ignored." }
end
- File.open(File.join(File.dirname(__FILE__),"..","data",name+".csv"),"w+"){|f| f.puts table.collect{|row| row.join(",")}.join("\n")}
+ File.open(File.join(DATA,name+".csv"),"w+"){|f| f.puts table.collect{|row| row.join(",")}.join("\n")}
meta = {
:species => species,
:endpoint => endpoint,
@@ -50,9 +57,16 @@ module OpenTox
:qmrf => qmrf,
:warnings => warnings
}
- File.open(File.join(File.dirname(__FILE__),"..","data",name+".json"),"w+"){|f| f.puts meta.to_json}
+ File.open(File.join(DATA,name+".json"),"w+"){|f| f.puts meta.to_json}
+ File.join(DATA,name+".csv")
end
+ # Download regression dataset from PubChem into the data folder
+ # Uses -log10 transformed experimental data in mmol units
+ # @param [String] aid PubChem Assay ID
+ # @param [String] species Species name
+ # @param [String] endpoint Endpoint name
+ # @param [Hash] qmrf Name and group for QMRF reports (optional)
def self.pubchem_regression aid: , species: , endpoint:, qmrf: nil
aid_url = File.join PUBCHEM_URI, "assay/aid/#{aid}"
@@ -92,7 +106,7 @@ module OpenTox
end
(cids-pubchem_cids).each { |cid| warnings << "Could not retrieve SMILES for CID '#{cid}', all entries are ignored." }
end
- File.open(File.join(File.dirname(__FILE__),"..","data",name+".csv"),"w+"){|f| f.puts table.collect{|row| row.join(",")}.join("\n")}
+ File.open(File.join(DATA,name+".csv"),"w+"){|f| f.puts table.collect{|row| row.join(",")}.join("\n")}
meta = {
:species => species,
:endpoint => endpoint,
@@ -101,9 +115,11 @@ module OpenTox
:qmrf => qmrf,
:warnings => warnings
}
- File.open(File.join(File.dirname(__FILE__),"..","data",name+".json"),"w+"){|f| f.puts meta.to_json}
+ File.open(File.join(DATA,name+".json"),"w+"){|f| f.puts meta.to_json}
+ File.join(DATA,name+".csv")
end
+ # Combine mutagenicity data from Kazius, Hansen and EFSA and download into the data folder
def self.mutagenicity
$logger.debug "Mutagenicity"
# TODO add download/conversion programs to lazar dependencies
@@ -181,8 +197,10 @@ module OpenTox
# cleanup
datasets << dataset
datasets.each{|d| d.delete }
+ File.join(DATA,"Mutagenicity-Salmonella_typhimurium.csv")
end
+ # Download Blood Brain Barrier Penetration dataset into the data folder
def self.blood_brain_barrier
url = "http://cheminformatics.org/datasets/li/bbp2.smi"
name = "Blood_Brain_Barrier_Penetration-Human"
@@ -204,13 +222,16 @@ module OpenTox
File.open(File.join(DATA,name+".json"),"w+"){|f| f.puts meta.to_json}
end
+ # Download the combined LOAEL dataset from Helma et al 2018 into the data folder
def self.loael
# TODO: fix url??
url = "https://raw.githubusercontent.com/opentox/loael-paper/revision/data/training_log10.csv"
name = "Lowest_observed_adverse_effect_level-Rats"
$logger.debug name
File.open(File.join(DATA,name+".csv"),"w+") do |f|
- f.puts RestClientWrapper.get(url).to_s
+ CSV.parse(RestClientWrapper.get(url).to_s) do |row|
+ f.puts [row[0],row[1]].join ","
+ end
end
meta = {
:species => "Rat",
@@ -225,8 +246,9 @@ module OpenTox
File.open(File.join(DATA,name+".json"),"w+"){|f| f.puts meta.to_json}
end
+ # Download Daphnia dataset from http://www.michem.unimib.it/download/data/acute-aquatic-toxicity-to-daphnia-magna/ into the data folder
+ # The original file requires an email request, this is a temporary workaround
def self.daphnia
- # download of original file requires email request, this is a temporary solution
url = "https://raw.githubusercontent.com/opentox/lazar-public-data/master/regression/daphnia_magna_mmol_log10.csv"
name = "Acute_toxicity-Daphnia_magna"
$logger.debug name
@@ -245,6 +267,7 @@ module OpenTox
File.open(File.join(DATA,name+".json"),"w+"){|f| f.puts meta.to_json}
end
+ # Download all public lazar datasets into the data folder
def self.public_data
# Classification
diff --git a/lib/import.rb b/lib/import.rb
index 831efcb..cdf96e3 100644
--- a/lib/import.rb
+++ b/lib/import.rb
@@ -2,12 +2,20 @@ module OpenTox
class Import
+ # Import datasets from the data folder, create and validate models
+ # @return [Array<OpenTox::Model::Validation>] Validated models
def self.public_data
- # TODO clear database?
+ models = []
Dir[File.join(File.dirname(__FILE__),"..","data/*csv")].each do |f|
$logger.debug f
- Model::Validation.from_csv_file f
+ m = Model::Validation.from_csv_file f
+ $logger.debug "#{f} ID: #{m.id.to_s}"
+ m.crossvalidations.each do |cv|
+ $logger.debug cv.statistics
+ end
+ models << m
end
+ models
end
end
end
diff --git a/lib/model.rb b/lib/model.rb
index 70ae43c..db69120 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -44,7 +44,7 @@ module OpenTox
model.prediction_feature_id = prediction_feature.id
model.training_dataset_id = training_dataset.id
- model.name = "#{prediction_feature.name} (#{training_dataset.name})"
+ model.name = training_dataset.name
# git or gem versioning
dir = File.dirname(__FILE__)
@@ -481,20 +481,8 @@ module OpenTox
model.is_a? LazarClassification
end
- # TODO from_pubchem_aid
- def self.from_dataset training_dataset: , prediction_feature: , species: , endpoint: , folds: 10, repeats: 5
- model_validation = Model::Validation.create species: species, endpoint: endpoint
- #p "create model"
- model = Lazar.create training_dataset: training_dataset, prediction_feature: prediction_feature
- model_validation[:model_id] = model.id
- #p "create_crossvalidations"
- model_validation[:repeated_crossvalidation_id] = OpenTox::Validation::RepeatedCrossValidation.create(model,folds,repeats).id # full class name required
- model_validation.save
- model_validation
- end
-
# Create and validate a lazar model from a csv file with training data and a json file with metadata
- # @param [File] CSV file with two columns. The first line should contain either SMILES or InChI (first column) and the endpoint (second column). The first column should contain either the SMILES or InChI of the training compounds, the second column the training compounds toxic activities (qualitative or quantitative). Use -log10 transformed values for regression datasets. Add metadata to a JSON file with the same basename containing the fields "species", "endpoint", "source" and "unit" (regression only). You can find example training data at https://github.com/opentox/lazar-public-data.
+ # @param [File] CSV file with two or three columns. The first column is optional and may contain an arbitrary substance ID. The next column should contain either SMILES or InChIs of the training compounds, followed by toxic activities (qualitative or quantitative) in the last column. Use -log10 transformed values for regression datasets. The first line should contain "ID" (optional), either SMILES or InChI and the endpoint name (last column). Add metadata to a JSON file with the same basename containing the fields "species", "endpoint", "source", "qmrf" (optional) and "unit" (regression only). You can find example training data in the data folder of lazar.
# @return [OpenTox::Model::Validation] lazar model with five independent 10-fold crossvalidations
def self.from_csv_file file
metadata_file = file.sub(/csv$/,"json")
@@ -510,6 +498,7 @@ module OpenTox
# Create and validate a nano-lazar model, import data from eNanoMapper if necessary
# nano-lazar methods are described in detail in https://github.com/enanomapper/nano-lazar-paper/blob/master/nano-lazar.pdf
+ # *eNanoMapper import is currently broken, because APIs and data formats are constantly changing and we have no resources to track these changes permanently!*
# @param [OpenTox::Dataset, nil] training_dataset
# @param [OpenTox::Feature, nil] prediction_feature
# @param [Hash, nil] algorithms
@@ -541,14 +530,6 @@ module OpenTox
end
- # TODO
- def to_json
- "{\n metadata:#{super},\n model:#{model.to_json}, repeated_crossvalidations:#{repeated_crossvalidations.to_json}\n}"
- end
-
- def from_json_file
- end
-
end
end