Diffstat (limited to 'lib')
-rw-r--r-- | lib/caret-classification.rb | 107
-rw-r--r-- | lib/caret.rb | 11
-rw-r--r-- | lib/classification.rb | 5
-rw-r--r-- | lib/compound.rb | 80
-rw-r--r-- | lib/crossvalidation.rb | 42
-rw-r--r-- | lib/dataset.rb | 537
-rw-r--r-- | lib/download.rb | 358
-rw-r--r-- | lib/enm-import.rb | 125
-rw-r--r-- | lib/error.rb | 66
-rw-r--r-- | lib/feature.rb | 113
-rw-r--r-- | lib/import.rb | 132
-rw-r--r-- | lib/lazar.rb | 17
-rw-r--r-- | lib/leave-one-out-validation.rb | 41
-rw-r--r-- | lib/model.rb | 123
-rw-r--r-- | lib/opentox.rb | 7
-rw-r--r-- | lib/overwrite.rb | 2
-rw-r--r-- | lib/physchem.rb | 8
-rw-r--r-- | lib/regression.rb | 2
-rw-r--r-- | lib/rest-client-wrapper.rb | 10
-rw-r--r-- | lib/substance.rb | 1
-rw-r--r-- | lib/train-test-validation.rb | 13
-rw-r--r-- | lib/validation-statistics.rb | 176
-rw-r--r-- | lib/validation.rb | 2 |
23 files changed, 1356 insertions, 622 deletions
diff --git a/lib/caret-classification.rb b/lib/caret-classification.rb new file mode 100644 index 0000000..fefe6b6 --- /dev/null +++ b/lib/caret-classification.rb @@ -0,0 +1,107 @@ +module OpenTox + module Algorithm + + # Ruby interface for the R caret package + # Caret model list: https://topepo.github.io/caret/modelList.html + class Caret + + # Create a local R caret model and make a prediction + # @param [Array<Float,Bool>] dependent_variables + # @param [Array<Array<Float,Bool>>] independent_variables + # @param [Array<Float>] weights + # @param [String] Caret method + # @param [Array<Float,Bool>] query_variables + # @return [Hash] + def self.create_model_and_predict dependent_variables:, independent_variables:, weights:, method:, query_variables: + remove = [] + # remove independent_variables with single values + independent_variables.each_with_index { |values,i| remove << i if values.uniq.size == 1} + remove.sort.reverse.each do |i| + independent_variables.delete_at i + query_variables.delete_at i + end + if independent_variables.flatten.uniq == ["NA"] or independent_variables.flatten.uniq == [] + prediction = Algorithm::Classification::weighted_majority_vote dependent_variables:dependent_variables, weights:weights + prediction[:warnings] << "No variables for classification model. Using weighted average of similar substances." + elsif dependent_variables.uniq.size == 1 + prediction = Algorithm::Classification::weighted_majority_vote dependent_variables:dependent_variables, weights:weights + prediction[:warnings] << "All neighbors have the same measured activity. Cannot create random forest model, using weighted average of similar substances." + elsif dependent_variables.size < 3 + prediction = Algorithm::Classification::weighted_majority_vote dependent_variables:dependent_variables, weights:weights + prediction[:warnings] << "Insufficient number of neighbors (#{dependent_variables.size}) for classification model. Using weighted average of similar substances." + else + dependent_variables.collect!{|v| to_r(v)} + independent_variables.each_with_index do |c,i| + c.each_with_index do |v,j| + independent_variables[i][j] = to_r(v) + end + end +# query_variables.collect!{|v| to_r(v)} + begin + R.assign "weights", weights + #r_data_frame = "data.frame(#{([dependent_variables.collect{|v| to_r(v)}]+independent_variables).collect{|r| "c(#{r.collect{|v| to_r(v)}.join(',')})"}.join(', ')})" + r_data_frame = "data.frame(#{([dependent_variables]+independent_variables).collect{|r| "c(#{r.join(',')})"}.join(', ')})" + #p r_data_frame + R.eval "data <- #{r_data_frame}" + R.assign "features", (0..independent_variables.size-1).to_a + R.eval "names(data) <- append(c('activities'),features)" # + p "train" + R.eval "model <- train(activities ~ ., data = data, method = '#{method}', na.action = na.pass, allowParallel=TRUE)" + p "done" + rescue => e + $logger.debug "R caret model creation error for: #{e.message}" + $logger.debug dependent_variables + $logger.debug independent_variables + prediction = Algorithm::Classification::weighted_majority_vote dependent_variables:dependent_variables, weights:weights + prediction[:warnings] << "R caret model creation error. Using weighted average of similar substances." 
+ return prediction + end + begin + R.eval "query <- data.frame(rbind(c(#{query_variables.collect{|v| to_r(v)}.join ','})))" + R.eval "names(query) <- features" + R.eval "prediction <- predict(model,query, type=\"prob\")" + names = R.eval("names(prediction)").to_ruby + probs = R.eval("prediction").to_ruby + probabilities = {} + names.each_with_index { |n,i| probabilities[n] = probs[i] } + value = probabilities.sort_by{|n,p| -p }[0][0] + prediction = { + :value => value, + :probabilities => probabilities, + :warnings => [], + } + rescue => e + $logger.debug "R caret prediction error for: #{e.inspect}" + $logger.debug self.inspect + prediction = Algorithm::Classification::weighted_majority_vote dependent_variables:dependent_variables, weights:weights + prediction[:warnings] << "R caret prediction error. Using weighted average of similar substances" + return prediction + end + if prediction.nil? or prediction[:value].nil? + prediction = Algorithm::Classification::weighted_majority_vote dependent_variables:dependent_variables, weights:weights + prediction[:warnings] << "Empty R caret prediction. Using weighted average of similar substances." + end + end + prediction + + end + + # Call caret methods dynamically, e.g. Caret.pls + def self.method_missing(sym, *args, &block) + args.first[:method] = sym.to_s + self.create_model_and_predict args.first + end + + # Convert Ruby values to R values + def self.to_r v + return "F" if v == false + return "T" if v == true + return nil if v.is_a? Float and v.nan? + return "\"#{v}\"" if v.is_a? String + v + end + + end + end +end + diff --git a/lib/caret.rb b/lib/caret.rb index 8bccf74..2e5f1bc 100644 --- a/lib/caret.rb +++ b/lib/caret.rb @@ -22,11 +22,11 @@ module OpenTox end if independent_variables.flatten.uniq == ["NA"] or independent_variables.flatten.uniq == [] prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights - prediction[:warnings] << "No variables for regression model. Using weighted average of similar substances." + prediction[:warnings] = ["No variables for regression model, using weighted average of similar substances (no prediction interval available)."] elsif dependent_variables.size < 3 prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights - prediction[:warnings] << "Insufficient number of neighbors (#{dependent_variables.size}) for regression model. Using weighted average of similar substances." + prediction[:warnings] = ["Insufficient number of neighbors (#{dependent_variables.size}) for regression model, using weighted average of similar substances (no prediction interval available)."] else dependent_variables.each_with_index do |v,i| dependent_variables[i] = to_r(v) @@ -51,7 +51,8 @@ module OpenTox $logger.debug dependent_variables $logger.debug independent_variables prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights - prediction[:warnings] << "R caret model creation error. Using weighted average of similar substances." + prediction[:warnings] ||= [] + prediction[:warnings] << "R caret model creation error, using weighted average of similar substances (no prediction interval available)." 
return prediction end begin @@ -72,12 +73,12 @@ module OpenTox $logger.debug "R caret prediction error for:" $logger.debug self.inspect prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights - prediction[:warnings] << "R caret prediction error. Using weighted average of similar substances" + prediction[:warnings] << "R caret prediction error, using weighted average of similar substances (no prediction interval available)." return prediction end if prediction.nil? or prediction[:value].nil? prediction = Algorithm::Regression::weighted_average dependent_variables:dependent_variables, weights:weights - prediction[:warnings] << "Empty R caret prediction. Using weighted average of similar substances." + prediction[:warnings] << "Empty R caret prediction, using weighted average of similar substances (no prediction interval available)." end end prediction diff --git a/lib/classification.rb b/lib/classification.rb index a875903..638492b 100644 --- a/lib/classification.rb +++ b/lib/classification.rb @@ -18,11 +18,6 @@ module OpenTox class_weights.each do |a,w| probabilities[a] = w.sum/weights.sum end - # DG: hack to ensure always two probability values - if probabilities.keys.uniq.size == 1 - missing_key = probabilities.keys.uniq[0].match(/^non/) ? probabilities.keys.uniq[0].sub(/non-/,"") : "non-"+probabilities.keys.uniq[0] - probabilities[missing_key] = 0.0 - end probabilities = probabilities.collect{|a,p| [a,weights.max*p]}.to_h p_max = probabilities.collect{|a,p| p}.max prediction = probabilities.key(p_max) diff --git a/lib/compound.rb b/lib/compound.rb index bfe69e3..6d0e075 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -1,5 +1,3 @@ -CACTUS_URI="https://cactus.nci.nih.gov/chemical/structure/" - module OpenTox # Small molecules with defined chemical structures @@ -12,7 +10,6 @@ module OpenTox field :inchikey, type: String field :names, type: Array field :cid, type: String - field :chemblid, type: String field :png_id, type: BSON::ObjectId field :svg_id, type: BSON::ObjectId field :sdf_id, type: BSON::ObjectId @@ -35,13 +32,11 @@ module OpenTox def fingerprint type=DEFAULT_FINGERPRINT unless fingerprints[type] return [] unless self.smiles - #http://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format - if type == "MP2D" + if type == "MP2D" # http://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format fp = obconversion(smiles,"smi","mpd").strip.split("\t") name = fp.shift # remove Title fingerprints[type] = fp.uniq # no fingerprint counts - #http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html - elsif type== "MNA" + elsif type== "MNA" # http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html level = 2 # TODO: level as parameter, evaluate level 1, see paper fp = obconversion(smiles,"smi","mna","xL\"#{level}\"").split("\n") fp.shift # remove Title @@ -130,30 +125,17 @@ module OpenTox # @param [String] smiles # @return [OpenTox::Compound] def self.from_smiles smiles - if smiles.match(/\s/) # spaces seem to confuse obconversion and may lead to invalid smiles - $logger.warn "SMILES parsing failed for '#{smiles}'', SMILES string contains whitespaces." - return nil - end + return nil if smiles.match(/\s/) # spaces seem to confuse obconversion and may lead to invalid smiles smiles = obconversion(smiles,"smi","can") # test if SMILES is correct and return canonical smiles (for compound comparisons) - if smiles.empty? 
- $logger.warn "SMILES parsing failed for '#{smiles}'', this may be caused by an incorrect SMILES string." - return nil - else - Compound.find_or_create_by :smiles => smiles - end + smiles.empty? ? nil : Compound.find_or_create_by(:smiles => smiles) end # Create a compound from InChI string # @param [String] InChI # @return [OpenTox::Compound] def self.from_inchi inchi - #smiles = `echo "#{inchi}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -iinchi - -ocan`.chomp.strip smiles = obconversion(inchi,"inchi","can") - if smiles.empty? - Compound.find_or_create_by(:warnings => ["InChi parsing failed for #{inchi}, this may be caused by an incorrect InChi string or a bug in OpenBabel libraries."]) - else - Compound.find_or_create_by(:smiles => smiles, :inchi => inchi) - end + smiles.empty? ? nil : Compound.find_or_create_by(:smiles => smiles) end # Create a compound from SDF @@ -170,7 +152,7 @@ module OpenTox # @param [String] name, can be also an InChI/InChiKey, CAS number, etc # @return [OpenTox::Compound] def self.from_name name - Compound.from_smiles RestClientWrapper.get(File.join(CACTUS_URI,URI.escape(name),"smiles")) + Compound.from_smiles RestClientWrapper.get(File.join(PUBCHEM_URI,"compound","name",URI.escape(name),"property","CanonicalSMILES","TXT")).chomp end # Get InChI @@ -238,56 +220,16 @@ module OpenTox # names = compound.names # @return [Array<String>] def names - update(:names => RestClientWrapper.get("#{CACTUS_URI}#{inchi}/names").split("\n")) unless self["names"] + update(:names => RestClientWrapper.get(File.join(PUBCHEM_URI,"compound","smiles",URI.escape(smiles),"synonyms","TXT")).split("\n")) #unless self["names"] self["names"] end # Get PubChem Compound Identifier (CID), obtained via REST call to PubChem # @return [String] def cid - pug_uri = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/" - update(:cid => RestClientWrapper.post(File.join(pug_uri, "compound", "inchi", "cids", "TXT"),{:inchi => inchi}).strip) unless self["cid"] + update(:cid => RestClientWrapper.post(File.join(PUBCHEM_URI, "compound", "inchi", "cids", "TXT"),{:inchi => inchi}).strip) unless self["cid"] self["cid"] end - - # Get ChEMBL database compound id, obtained via REST call to ChEMBL - # @return [String] - def chemblid - # https://www.ebi.ac.uk/chembldb/ws#individualCompoundByInChiKey - uri = "https://www.ebi.ac.uk/chemblws/compounds/smiles/#{smiles}.json" - update(:chemblid => JSON.parse(RestClientWrapper.get(uri))["compounds"].first["chemblId"]) unless self["chemblid"] - self["chemblid"] - end - - def db_neighbors min_sim: 0.1, dataset_id: - #p fingerprints[DEFAULT_FINGERPRINT] - # from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb - - #qn = default_fingerprint_size - #qmin = qn * threshold - #qmax = qn / threshold - #not sure if it is worth the effort of keeping feature counts up to date (compound deletions, additions, ...) 
- #reqbits = [count['_id'] for count in db.mfp_counts.find({'_id': {'$in': qfp}}).sort('count', 1).limit(qn - qmin + 1)] - aggregate = [ - #{'$match': {'mfp.count': {'$gte': qmin, '$lte': qmax}, 'mfp.bits': {'$in': reqbits}}}, - #{'$match' => {'_id' => {'$ne' => self.id}}}, # remove self - {'$project' => { - 'similarity' => {'$let' => { - 'vars' => {'common' => {'$size' => {'$setIntersection' => ["$fingerprints.#{DEFAULT_FINGERPRINT}", fingerprints[DEFAULT_FINGERPRINT]]}}}, - 'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [default_fingerprint_size, '$default_fingerprint_size']}, '$$common']}]} - }}, - '_id' => 1, - #'measurements' => 1, - 'dataset_ids' => 1 - }}, - {'$match' => {'similarity' => {'$gte' => min_sim}}}, - {'$sort' => {'similarity' => -1}} - ] - - # TODO move into aggregate pipeline, see http://stackoverflow.com/questions/30537317/mongodb-aggregation-match-if-value-in-array - $mongo["substances"].aggregate(aggregate).select{|r| r["dataset_ids"].include? dataset_id} - - end # Convert mmol to mg # @return [Float] value in mg @@ -319,7 +261,7 @@ module OpenTox obconversion.read_string obmol, identifier case output_format when /smi|can|inchi/ - obconversion.write_string(obmol).gsub(/\s/,'').chomp + obconversion.write_string(obmol).split(/\s/).first when /sdf/ # TODO: find disconnected structures # strip_salts @@ -332,11 +274,11 @@ module OpenTox print sdf if sdf.match(/.nan/) - $logger.warn "3D generation failed for compound #{identifier}, trying to calculate 2D structure" + #warn "3D generation failed for compound #{identifier}, trying to calculate 2D structure" obconversion.set_options("gen2D", OpenBabel::OBConversion::GENOPTIONS) sdf = obconversion.write_string(obmol) if sdf.match(/.nan/) - $logger.warn "2D generation failed for compound #{identifier}, rendering without coordinates." + #warn "2D generation failed for compound #{identifier}, rendering without coordinates." obconversion.remove_option("gen2D", OpenBabel::OBConversion::GENOPTIONS) sdf = obconversion.write_string(obmol) end diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb index 06a1e2a..e1761bc 100644 --- a/lib/crossvalidation.rb +++ b/lib/crossvalidation.rb @@ -15,7 +15,7 @@ module OpenTox $logger.debug model.algorithms klass = ClassificationCrossValidation if model.is_a? Model::LazarClassification klass = RegressionCrossValidation if model.is_a? Model::LazarRegression - bad_request_error "Unknown model class #{model.class}." unless klass + raise ArgumentError, "Unknown model class #{model.class}." unless klass cv = klass.new( name: model.name, @@ -24,24 +24,16 @@ module OpenTox ) cv.save # set created_at - nr_instances = 0 - nr_unpredicted = 0 training_dataset = model.training_dataset training_dataset.folds(n).each_with_index do |fold,fold_nr| #fork do # parallel execution of validations can lead to Rserve and memory problems - $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started" - t = Time.now - validation = TrainTest.create(model, fold[0], fold[1]) - cv.validation_ids << validation.id - cv.nr_instances += validation.nr_instances - cv.nr_unpredicted += validation.nr_unpredicted - #cv.predictions.merge! 
validation.predictions - $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds" - #end + $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started" + t = Time.now + validation = TrainTest.create(model, fold[0], fold[1]) + cv.validation_ids << validation.id + $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds" end - #Process.waitall cv.save - $logger.debug "Nr unpredicted: #{nr_unpredicted}" cv.statistics cv.update_attributes(finished_at: Time.now) cv @@ -72,25 +64,25 @@ module OpenTox class ClassificationCrossValidation < CrossValidation include ClassificationStatistics field :accept_values, type: Array - field :confusion_matrix, type: Array - field :weighted_confusion_matrix, type: Array - field :accuracy, type: Float - field :weighted_accuracy, type: Float + field :confusion_matrix, type: Hash + field :accuracy, type: Hash field :true_rate, type: Hash field :predictivity, type: Hash + field :nr_predictions, type: Hash field :probability_plot_id, type: BSON::ObjectId end # Crossvalidation of regression models class RegressionCrossValidation < CrossValidation include RegressionStatistics - field :rmse, type: Float, default:0 - field :mae, type: Float, default:0 - field :r_squared, type: Float - field :within_prediction_interval, type: Integer, default:0 - field :out_of_prediction_interval, type: Integer, default:0 - field :correlation_plot_id, type: BSON::ObjectId + field :rmse, type: Hash + field :mae, type: Hash + field :r_squared, type: Hash + field :within_prediction_interval, type: Hash + field :out_of_prediction_interval, type: Hash + field :nr_predictions, type: Hash field :warnings, type: Array + field :correlation_plot_id, type: BSON::ObjectId end # Independent repeated crossvalidations @@ -103,7 +95,7 @@ module OpenTox # @param [Fixnum] number of folds # @param [Fixnum] number of repeats # @return [OpenTox::Validation::RepeatedCrossValidation] - def self.create model, folds=10, repeats=3 + def self.create model, folds=10, repeats=5 repeated_cross_validation = self.new repeats.times do |n| $logger.debug "Crossvalidation #{n+1} for #{model.name}" diff --git a/lib/dataset.rb b/lib/dataset.rb index 6e7d67f..fb1afd2 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -1,12 +1,16 @@ require 'csv' require 'tempfile' +require 'digest/md5' module OpenTox # Collection of substances and features class Dataset - field :data_entries, type: Hash, default: {} + field :data_entries, type: Array, default: [] #substance,feature,value + field :warnings, type: Array, default: [] + field :source, type: String + field :md5, type: String # Readers @@ -25,29 +29,87 @@ module OpenTox # Get all substances # @return [Array<OpenTox::Substance>] def substances - @substances ||= data_entries.keys.collect{|id| OpenTox::Substance.find id}.uniq + @substances ||= data_entries.collect{|row| OpenTox::Substance.find row[0]}.uniq @substances end # Get all features # @return [Array<OpenTox::Feature>] def features - @features ||= data_entries.collect{|sid,data| data.keys.collect{|id| OpenTox::Feature.find(id)}}.flatten.uniq + @features ||= data_entries.collect{|row| OpenTox::Feature.find(row[1])}.uniq @features end # Get all values for a given substance and feature # @param [OpenTox::Substance,BSON::ObjectId,String] substance or substance id # @param [OpenTox::Feature,BSON::ObjectId,String] feature or feature id - # @return [TrueClass,FalseClass,Float] + # @return [Array<TrueClass,FalseClass,Float>] values def values substance,feature 
substance = substance.id if substance.is_a? Substance feature = feature.id if feature.is_a? Feature - if data_entries[substance.to_s] and data_entries[substance.to_s][feature.to_s] - data_entries[substance.to_s][feature.to_s] - else - [nil] - end + substance = BSON::ObjectId.from_string(substance) if substance.is_a? String + feature = BSON::ObjectId.from_string(feature) if feature.is_a? String + data_entries.select{|row| row[0] == substance and row[1] == feature}.collect{|row| row[2]} + end + + # Get OriginalId features + # @return [Array<OpenTox::OriginalId>] original ID features (merged datasets may have multiple original IDs) + def original_id_features + features.select{|f| f.is_a?(OriginalId)} + end + + # Get OriginalSmiles features + # @return [Array<OpenTox::OriginalSmiles>] original smiles features (merged datasets may have multiple original smiles) + def original_smiles_features + features.select{|f| f.is_a?(OriginalSmiles)} + end + + # Get Warnings features + # @return [Array<OpenTox::Warnings>] warnings features (merged datasets may have multiple warnings) + def warnings_features + features.select{|f| f.is_a?(Warnings)} + end + + # Get Confidence feature + # @return [OpenTox::Confidence] confidence feature + def confidence_feature + features.select{|f| f.is_a?(Confidence)}.first + end + + # Get nominal and numeric bioactivity features + # @return [Array<OpenTox::NominalBioActivity,OpenTox::NumericBioActivity>] + def bioactivity_features + features.select{|f| f._type.match(/BioActivity/)} + end + + # Get nominal and numeric bioactivity features + # @return [Array<OpenTox::NominalBioActivity,OpenTox::NumericBioActivity>] + def transformed_bioactivity_features + features.select{|f| f._type.match(/Transformed.*BioActivity/)} + end + + # Get nominal and numeric substance property features + # @return [Array<OpenTox::NominalSubstanceProperty,OpenTox::NumericSubstanceProperty>] + def substance_property_features + features.select{|f| f._type.match("SubstanceProperty")} + end + + # Get nominal and numeric prediction features + # @return [Array<OpenTox::NominalLazarPrediction,OpenTox::NumericLazarPrediction>] + def prediction_feature + features.select{|f| f._type.match(/Prediction$/)}.first + end + + # Get supporting nominal and numeric prediction features (class probabilities, prediction interval) + # @return [Array<OpenTox::LazarPredictionProbability,OpenTox::LazarPredictionInterval>] + def prediction_supporting_features + features.select{|f| f.is_a?(LazarPredictionProbability) or f.is_a?(LazarPredictionInterval)} + end + + # Get nominal and numeric merged features + # @return [Array<OpenTox::MergedNominalBioActivity,OpenTox::MergedNumericBioActivity>] + def merged_features + features.select{|f| f._type.match("Merged")} end # Writers @@ -59,137 +121,178 @@ module OpenTox def add(substance,feature,value) substance = substance.id if substance.is_a? Substance feature = feature.id if feature.is_a? Feature - data_entries[substance.to_s] ||= {} - data_entries[substance.to_s][feature.to_s] ||= [] - data_entries[substance.to_s][feature.to_s] << value - #data_entries[substance.to_s][feature.to_s].uniq! if value.numeric? 
# assuming that identical values come from the same source + data_entries << [substance,feature,value] if substance and feature and value end - # Dataset operations - - # Split a dataset into n folds - # @param [Integer] number of folds - # @return [Array] Array with folds [training_dataset,test_dataset] - def folds n - len = self.substances.size - indices = (0..len-1).to_a.shuffle - mid = (len/n) - chunks = [] - start = 0 - 1.upto(n) do |i| - last = start+mid - last = last-1 unless len%n >= i - test_idxs = indices[start..last] || [] - test_substances = test_idxs.collect{|i| substances[i]} - training_idxs = indices-test_idxs - training_substances = training_idxs.collect{|i| substances[i]} - chunk = [training_substances,test_substances].collect do |substances| - dataset = self.class.create(:name => "#{self.name} (Fold #{i-1})",:source => self.id ) - substances.each do |substance| - substance.dataset_ids << dataset.id - substance.dataset_ids.uniq! - substance.save - dataset.data_entries[substance.id.to_s] = data_entries[substance.id.to_s] ||= {} + # Parsers + + # Create a dataset from CSV file + # @param [File] Input file with the following format: + # - ID column (optional): header containing "ID" string, arbitrary ID values + # - SMILES/InChI column: header indicating "SMILES" or "InChI", Smiles or InChI strings + # - one or more properties column(s): header with property name(s), property values + # files with a single property column are read as BioActivities (i.e. dependent variable) + # files with multiple property columns are read as SubstanceProperties (i.e. independent variables) + # @return [OpenTox::Dataset] + def self.from_csv_file file + md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files + dataset = self.find_by(:md5 => md5) + if dataset + $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import." + else + $logger.debug "Parsing #{file}." + table = nil + sep = "," + ["\t",";"].each do |s| # guess alternative CSV separator + if File.readlines(file).first.match(/#{s}/) + sep = s + break end - dataset.save - dataset end - start = last+1 - chunks << chunk + table = CSV.read file, :col_sep => sep, :skip_blanks => true, :encoding => 'windows-1251:utf-8' + if table + dataset = self.new(:source => file, :name => File.basename(file,".*"), :md5 => md5) + dataset.parse_table table + else + raise ArgumentError, "#{file} is not a valid CSV/TSV file. Could not find "," ";" or TAB as column separator." + end end - chunks + dataset end - # Serialisation - - # Convert dataset to csv format including compound smiles as first column, other column headers are feature names - # @return [String] - def to_csv(inchi=false) - CSV.generate() do |csv| - compound = substances.first.is_a? Compound - if compound - csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name} - else - csv << ["Name"] + features.collect{|f| f.name} - end - substances.each do |substance| - if compound - name = (inchi ? substance.inchi : substance.smiles) - else - name = substance.name - end - nr_measurements = features.collect{|f| data_entries[substance.id.to_s][f.id.to_s].size if data_entries[substance.id.to_s][f.id.to_s]}.compact.uniq + # Create a dataset from SDF file + # files with a single data field are read as BioActivities (i.e. dependent variable) + # files with multiple data fields are read as SubstanceProperties (i.e. 
independent variable) + # @param [File] + # @return [OpenTox::Dataset] + def self.from_sdf_file file + md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files + dataset = self.find_by(:md5 => md5) + if dataset + $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import." + else + $logger.debug "Parsing #{file}." + + dataset = self.new(:source => file, :name => File.basename(file,".*"), :md5 => md5) + original_id = OriginalId.find_or_create_by(:dataset_id => dataset.id,:name => dataset.name+".ID") - if nr_measurements.size > 1 - warn "Unequal number of measurements (#{nr_measurements}) for '#{name}'. Skipping entries." + read_result = false + sdf = "" + feature_name = "" + compound = nil + features = {} + table = [["ID","SMILES"]] + + File.readlines(file).each do |line| + if line.match %r{\$\$\$\$} + sdf << line + id = sdf.split("\n").first.chomp + compound = Compound.from_sdf sdf + row = [id,compound.smiles] + features.each do |f,v| + table[0] << f unless table[0].include? f + row[table[0].index(f)] = v + end + table << row + sdf = "" + features = {} + elsif line.match /^>\s+</ + feature_name = line.match(/^>\s+<(.*)>/)[1] + read_result = true else - (0..nr_measurements.first-1).each do |i| - row = [name] - features.each do |f| - values(substance,f) ? row << values(substance,f)[i] : row << "" - end - csv << row + if read_result + value = line.chomp + features[feature_name] = value + read_result = false + else + sdf << line end end end + dataset.parse_table table end + dataset end - # Parsers - - # Create a dataset from file (csv,sdf,...) - # @param filename [String] - # @return [String] dataset uri - # TODO - #def self.from_sdf_file - #end - - # Create a dataset from CSV file - # @param [File] - # @param [TrueClass,FalseClass] accept or reject empty values + # Create a dataset from PubChem Assay + # @param [Integer] PubChem AssayID (AID) # @return [OpenTox::Dataset] - def self.from_csv_file file, accept_empty_values=false - source = file - name = File.basename(file,".*") - dataset = self.find_by(:source => source, :name => name) - if dataset - $logger.debug "Skipping import of #{file}, it is already in the database (id: #{dataset.id})." - else - $logger.debug "Parsing #{file}." 
- table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8' - dataset = self.new(:source => source, :name => name) - dataset.parse_table table, accept_empty_values + def self.from_pubchem_aid aid + # TODO get regression data + aid_url = File.join PUBCHEM_URI, "assay/aid/#{aid}" + assay_metadata = JSON.parse(RestClientWrapper.get(File.join aid_url,"description/JSON").to_s)["PC_AssayContainer"][0]["assay"]["descr"] + name = assay_metadata["name"].gsub(/\s+/,"_") + dataset = self.new(:source => aid_url, :name => name) + # Get assay data in chunks + # Assay record retrieval is limited to 10000 SIDs + # https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest-tutorial$_Toc458584435 + list = JSON.parse(RestClientWrapper.get(File.join aid_url, "sids/JSON?list_return=listkey").to_s)["IdentifierList"] + listkey = list["ListKey"] + size = list["Size"] + start = 0 + csv = [] + while start < size + url = File.join aid_url, "CSV?sid=listkey&listkey=#{listkey}&listkey_start=#{start}&listkey_count=10000" + csv += CSV.parse(RestClientWrapper.get(url).to_s).select{|r| r[0].match /^\d/} # discard header rows + start += 10000 + end + table = [["SID","SMILES",name]] + csv.each_slice(100) do |slice| # get SMILES in chunks + cids = slice.collect{|s| s[2]} + pubchem_cids = [] + JSON.parse(RestClientWrapper.get(File.join(PUBCHEM_URI,"compound/cid/#{cids.join(",")}/property/CanonicalSMILES/JSON")).to_s)["PropertyTable"]["Properties"].each do |prop| + i = cids.index(prop["CID"].to_s) + value = slice[i][3] + if value == "Active" or value == "Inactive" + table << [slice[i][1].to_s,prop["CanonicalSMILES"],slice[i][3].to_s] + pubchem_cids << prop["CID"].to_s + else + dataset.warnings << "Ignoring CID #{prop["CID"]}/ SMILES #{prop["CanonicalSMILES"]}, because PubChem activity is #{value}." + end + end + (cids-pubchem_cids).each { |cid| dataset.warnings << "Could not retrieve SMILES for CID #{cid}, all entries are ignored." } end + dataset.parse_table table dataset end # Parse data in tabular format (e.g. from csv) # does a lot of guesswork in order to determine feature types # @param [Array<Array>] - # @param [TrueClass,FalseClass] accept or reject empty values - def parse_table table, accept_empty_values + def parse_table table # features feature_names = table.shift.collect{|f| f.strip} - warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size - compound_format = feature_names.shift.strip - bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i + raise ArgumentError, "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size + + if feature_names[0] !~ /SMILES|InChI/i # check ID column + original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => feature_names.shift) + else + original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => "LineID") + end + + compound_format = feature_names.shift + raise ArgumentError, "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i + original_smiles = OriginalSmiles.find_or_create_by(:dataset_id => self.id) if compound_format.match(/SMILES/i) + numeric = [] features = [] + # guess feature types + bioactivity = true if feature_names.size == 1 + feature_names.each_with_index do |f,i| - metadata = {:name => f} - values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? 
nil : val }.uniq.compact + original_id.name.match(/LineID$/) ? j = i+1 : j = i+2 + values = table.collect{|row| val=row[j].to_s.strip; val.blank? ? nil : val }.uniq.compact types = values.collect{|v| v.numeric? ? true : false}.uniq feature = nil if values.size == 0 # empty feature elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes numeric[i] = true - feature = NumericFeature.find_or_create_by(metadata) + bioactivity ? feature = NumericBioActivity.find_or_create_by(:name => f) : feature = NumericSubstanceProperty.find_or_create_by(:name => f) else - metadata["accept_values"] = values numeric[i] = false - feature = NominalFeature.find_or_create_by(metadata) + bioactivity ? feature = NominalBioActivity.find_or_create_by(:name => f, :accept_values => values.sort) : feature = NominalSubstanceProperty.find_or_create_by(:name => f, :accept_values => values.sort) end features << feature if feature end @@ -198,35 +301,31 @@ module OpenTox all_substances = [] table.each_with_index do |vals,i| + original_id.name.match(/LineID$/) ? original_id_value = i+1 : original_id_value = vals.shift.to_s.strip identifier = vals.shift.strip - warn "No feature values for compound at line #{i+2} of #{source}." if vals.compact.empty? and !accept_empty_values begin case compound_format when /SMILES/i - substance = OpenTox::Compound.from_smiles(identifier) + substance = Compound.from_smiles(identifier) + add substance, original_smiles, identifier when /InChI/i - substance = OpenTox::Compound.from_inchi(identifier) + substance = Compound.from_inchi(identifier) end rescue substance = nil end + if substance.nil? # compound parsers may return nil - warn "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}, all entries are ignored." + warnings << "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}, all entries are ignored." next end + all_substances << substance - substance.dataset_ids << self.id - substance.dataset_ids.uniq! - substance.save - - unless vals.size == features.size - warn "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored." - next - end + add substance, original_id, original_id_value vals.each_with_index do |v,j| if v.blank? - warn "Empty value for compound '#{identifier}' and feature '#{feature_names[i]}'." + warnings << "Empty value for compound '#{identifier}' (#{original_id_value}) and feature '#{feature_names[j]}'." next elsif numeric[j] v = v.to_f @@ -235,46 +334,202 @@ module OpenTox end add substance, features[j], v end - data_entries[substance.id.to_s] = {} if vals.empty? and accept_empty_values end + + warnings_feature = Warnings.find_or_create_by(:dataset_id => id) all_substances.duplicates.each do |substance| positions = [] - all_substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == substance.inchi} - warn "Duplicate compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." + all_substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.smiles and c.smiles == substance.smiles} + all_substances.select{|s| s.smiles == substance.smiles}.each do |s| + add s, warnings_feature, "Duplicated compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." 
+ end end save end - # Delete dataset - def delete - compounds.each{|c| c.dataset_ids.delete id.to_s} - super + # Serialisation + + # Convert lazar prediction dataset to csv format + # @return [String] + def to_prediction_csv + + compound = substances.first.is_a? Compound + header = ["ID"] + header << "Original SMILES" if compound + compound ? header << "Canonical SMILES" : header << "Name" + header << "Prediction" if prediction_feature + header << "Confidence" if confidence_feature + header += prediction_supporting_features.collect{|f| f.name} + header << "Measurements" + csv = [header] + + substances.each do |substance| + row = original_id_features.collect{|f| values(substance,f).join(" ")} + row += original_smiles_features.collect{|f| values(substance,f).join(" ")} if compound + compound ? row << substance.smiles : row << substance.name + row << values(substance,prediction_feature).join(" ") + row << values(substance,confidence_feature).join(" ") + row += prediction_supporting_features.collect{|f| values(substance,f).join(" ")} + row << values(substance,bioactivity_features[0]).join(" ") + csv << row + end + csv.collect{|r| r.join(",")}.join("\n") end + + # Convert dataset into csv formatted training data + # @return [String] + def to_training_csv + + export_features = merged_features + export_features = transformed_bioactivity_features if export_features.empty? + export_features = bioactivity_features if export_features.empty? + export_feature = export_features.first - end + header = ["Canonical SMILES"] + header << bioactivity_features.first.name # use original bioactivity name instead of long merged name + csv = [header] - # Dataset for lazar predictions - class LazarPrediction #< Dataset - field :creator, type: String - field :prediction_feature_id, type: BSON::ObjectId - field :predictions, type: Hash, default: {} + substances.each do |substance| + nr_activities = values(substance,bioactivity_features.first).size + (0..nr_activities-1).each do |n| # new row for each value + row = [substance.smiles] + row << values(substance,export_feature)[n] + csv << row + end + end + csv.collect{|r| r.join(",")}.join("\n") + end - # Get prediction feature - # @return [OpenTox::Feature] - def prediction_feature - Feature.find prediction_feature_id + # Convert dataset to SDF format + # @return [String] SDF string + def to_sdf + sdf = "" + compounds.each do |compound| + sdf_lines = compound.sdf.sub(/\$\$\$\$\n/,"").split("\n") + sdf_lines[0] = compound.smiles + sdf += sdf_lines.join("\n") + bioactivity_features.each do |f| + v = values(compound,f) + unless v.empty? + sdf += "\n> <#{f.name}>\n" + sdf += v.uniq.join "," + sdf += "\n" + end + end + sdf += "\n$$$$\n" + end + sdf end - # Get all compounds - # @return [Array<OpenTox::Compound>] - def compounds - substances.select{|s| s.is_a? Compound} + # Get lazar predictions from a dataset + # @return [Hash] predictions + def predictions + predictions = {} + substances.each do |s| + predictions[s] ||= {} + predictions[s][:value] = values(s,prediction_feature).first + #predictions[s][:warnings] = [] + #warnings_features.each { |w| predictions[s][:warnings] += values(s,w) } + predictions[s][:confidence] = values(s,confidence_feature).first + if predictions[s][:value] and prediction_feature.is_a? 
NominalLazarPrediction + prediction_feature.accept_values.each do |v| + f = LazarPredictionProbability.find_by(:name => v, :model_id => prediction_feature.model_id, :training_feature_id => prediction_feature.training_feature_id) + predictions[s][:probabilities] ||= {} + predictions[s][:probabilities][v] = values(s,f).first + end + end + end + predictions end - # Get all substances - # @return [Array<OpenTox::Substance>] - def substances - predictions.keys.collect{|id| Substance.find id} + # Dataset operations + + # Copy a dataset + # @return OpenTox::Dataset dataset copy + def copy + dataset = Dataset.new + dataset.data_entries = data_entries + dataset.warnings = warnings + dataset.name = name + dataset.source = id.to_s + dataset.save + dataset + end + + # Split a dataset into n folds + # @param [Integer] number of folds + # @return [Array] Array with folds [training_dataset,test_dataset] + def folds n + $logger.debug "Creating #{n} folds for #{name}." + len = self.substances.size + indices = (0..len-1).to_a.shuffle + mid = (len/n) + chunks = [] + start = 0 + 1.upto(n) do |i| + last = start+mid + last = last-1 unless len%n >= i + test_idxs = indices[start..last] || [] + test_substances = test_idxs.collect{|i| substances[i].id} + training_idxs = indices-test_idxs + training_substances = training_idxs.collect{|i| substances[i].id} + chunk = [training_substances,test_substances].collect do |substances| + self.class.create( + :name => "#{self.name} (Fold #{i-1})", + :source => self.id, + :data_entries => data_entries.select{|row| substances.include? row[0]} + ) + end + start = last+1 + chunks << chunk + end + chunks + end + + # Merge an array of datasets + # @param [Array<OpenTox::Dataset>] datasets Datasets to be merged + # @param [Array<OpenTox::Feature>] features Features to be merged (same size as datasets) + # @param [Array<Hash>] value_maps Value transfomations (use nil for keeping original values, same size as dataset) + # @param [Bool] keep_original_features Copy original features/values to the merged dataset + # @param [Bool] remove_duplicates Delete duplicated values (assuming they come from the same experiment) + # @return [OpenTox::Dataset] merged dataset + def self.merge datasets: , features: , value_maps: , keep_original_features: , remove_duplicates: + dataset = self.create(:source => datasets.collect{|d| d.id.to_s}.join(", "), :name => datasets.collect{|d| d.name}.uniq.join(", ")+" merged") + + datasets.each do |d| + dataset.data_entries += d.data_entries + dataset.warnings += d.warnings + end if keep_original_features + + feature_classes = features.collect{|f| f.class}.uniq + merged_feature = nil + if feature_classes.size == 1 + if features.first.kind_of? NominalFeature + merged_feature = MergedNominalBioActivity.find_or_create_by(:name => features.collect{|f| f.name}.uniq.join(" and ") + " merged", :original_feature_ids => features.collect{|f| f.id}, :transformations => value_maps) + else + merged_feature = MergedNumericBioActivity.find_or_create_by(:name => features.collect{|f| f.name} + " merged", :original_feature_ids => features.collect{|f| f.id}) # TODO: regression transformations + end + else + raise ArgumentError, "Cannot merge features of different types (#{feature_classes})." + end + + accept_values = [] + features.each_with_index do |f,i| + dataset.data_entries += datasets[i].data_entries.select{|de| de[1] == f.id}.collect do |de| + value_maps[i] ? 
v = value_maps[i][de[2]] : v = de[2] + accept_values << v + [de[0],merged_feature.id,v] + end + end + + if merged_feature.is_a? MergedNominalBioActivity + merged_feature.accept_values = accept_values.uniq.sort + merged_feature.save + end + + dataset.data_entries.uniq! if remove_duplicates + dataset.save + dataset end end diff --git a/lib/download.rb b/lib/download.rb new file mode 100644 index 0000000..2546dc4 --- /dev/null +++ b/lib/download.rb @@ -0,0 +1,358 @@ +module OpenTox + + class Download + + DATA = File.join(File.dirname(__FILE__),"..","data") + + # Download classification dataset from PubChem into the data folder + # @param [Integer] aid PubChem Assay ID + # @param [String] active Name for the "Active" class + # @param [String] inactive Name for the "Inactive" class + # @param [String] species Species name + # @param [String] endpoint Endpoint name + # @param [Hash] qmrf Name and group for QMRF reports (optional) + def self.pubchem_classification aid: , active: , inactive: , species: , endpoint:, qmrf: nil + aid_url = File.join PUBCHEM_URI, "assay/aid/#{aid}" + + # Get assay data in chunks + # Assay record retrieval is limited to 10000 SIDs + # https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest-tutorial$_Toc458584435 + list = JSON.parse(RestClientWrapper.get(File.join aid_url, "sids/JSON?list_return=listkey").to_s)["IdentifierList"] + listkey = list["ListKey"] + size = list["Size"] + start = 0 + csv = [] + while start < size + url = File.join aid_url, "CSV?sid=listkey&listkey=#{listkey}&listkey_start=#{start}&listkey_count=10000" + csv += CSV.parse(RestClientWrapper.get(url).to_s).select{|r| r[0].match /^\d/} # discard header rows + start += 10000 + end + warnings = [] + name = endpoint.gsub(" ","_")+"-"+species.gsub(" ","_") + $logger.debug name + table = [["SID","SMILES",name]] + csv.each_slice(100) do |slice| # get SMILES in chunks, size limit is 100 + cids = slice.collect{|s| s[2]} + pubchem_cids = [] + JSON.parse(RestClientWrapper.get(File.join(PUBCHEM_URI,"compound/cid/#{cids.join(",")}/property/CanonicalSMILES/JSON")).to_s)["PropertyTable"]["Properties"].each do |prop| + i = cids.index(prop["CID"].to_s) + value = slice[i][3] + if value == "Active" + table << [slice[i][1].to_s,prop["CanonicalSMILES"],active] + pubchem_cids << prop["CID"].to_s + elsif value == "Inactive" + table << [slice[i][1].to_s,prop["CanonicalSMILES"],inactive] + pubchem_cids << prop["CID"].to_s + else + warnings << "Ignoring CID #{prop["CID"]}/ SMILES #{prop["CanonicalSMILES"]}, because PubChem activity is '#{value}'." + end + end + (cids-pubchem_cids).each { |cid| warnings << "Could not retrieve SMILES for CID '#{cid}', all entries are ignored." 
} + end + File.open(File.join(DATA,name+".csv"),"w+"){|f| f.puts table.collect{|row| row.join(",")}.join("\n")} + meta = { + :species => species, + :endpoint => endpoint, + :source => "https://pubchem.ncbi.nlm.nih.gov/bioassay/#{aid}", + :qmrf => qmrf, + :warnings => warnings + } + File.open(File.join(DATA,name+".json"),"w+"){|f| f.puts meta.to_json} + File.join(DATA,name+".csv") + end + + # Download regression dataset from PubChem into the data folder + # Uses -log10 transformed experimental data in mmol units + # @param [String] aid PubChem Assay ID + # @param [String] species Species name + # @param [String] endpoint Endpoint name + # @param [Hash] qmrf Name and group for QMRF reports (optional) + def self.pubchem_regression aid: , species: , endpoint:, qmrf: nil + aid_url = File.join PUBCHEM_URI, "assay/aid/#{aid}" + + # Get assay data in chunks + # Assay record retrieval is limited to 10000 SIDs + # https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest-tutorial$_Toc458584435 + list = JSON.parse(RestClientWrapper.get(File.join aid_url, "sids/JSON?list_return=listkey").to_s)["IdentifierList"] + listkey = list["ListKey"] + size = list["Size"] + start = 0 + csv = [] + unit = nil + while start < size + url = File.join aid_url, "CSV?sid=listkey&listkey=#{listkey}&listkey_start=#{start}&listkey_count=10000" + # get unit + unit ||= CSV.parse(RestClientWrapper.get(url).to_s).select{|r| r[0] == "RESULT_UNIT"}[0][8] + csv += CSV.parse(RestClientWrapper.get(url).to_s).select{|r| r[0].match /^\d/} # discard header rows + start += 10000 + end + warnings = [] + name = endpoint.gsub(" ","_")+"-"+species.gsub(" ","_") + $logger.debug name + table = [["SID","SMILES","-log10(#{name} [#{unit}])"]] + csv.each_slice(100) do |slice| # get SMILES in chunks, size limit is 100 + cids = slice.collect{|s| s[2]} + pubchem_cids = [] + JSON.parse(RestClientWrapper.get(File.join(PUBCHEM_URI,"compound/cid/#{cids.join(",")}/property/CanonicalSMILES/JSON")).to_s)["PropertyTable"]["Properties"].each do |prop| + i = cids.index(prop["CID"].to_s) + value = slice[i][8] + if value + value = -Math.log10(value.to_f) + table << [slice[i][1].to_s,prop["CanonicalSMILES"],value] + pubchem_cids << prop["CID"].to_s + else + warnings << "Ignoring CID #{prop["CID"]}/ SMILES #{prop["CanonicalSMILES"]}, because PubChem activity is '#{value}'." + end + end + (cids-pubchem_cids).each { |cid| warnings << "Could not retrieve SMILES for CID '#{cid}', all entries are ignored." 
} + end + File.open(File.join(DATA,name+".csv"),"w+"){|f| f.puts table.collect{|row| row.join(",")}.join("\n")} + meta = { + :species => species, + :endpoint => endpoint, + :source => "https://pubchem.ncbi.nlm.nih.gov/bioassay/#{aid}", + :unit => unit, + :qmrf => qmrf, + :warnings => warnings + } + File.open(File.join(DATA,name+".json"),"w+"){|f| f.puts meta.to_json} + File.join(DATA,name+".csv") + end + + # Combine mutagenicity data from Kazius, Hansen and EFSA and download into the data folder + def self.mutagenicity + $logger.debug "Mutagenicity" + hansen_url = "http://doc.ml.tu-berlin.de/toxbenchmark/Mutagenicity_N6512.csv" + kazius_url = "http://cheminformatics.org/datasets/bursi/cas_4337.zip" + efsa_url = "https://data.europa.eu/euodp/data/storage/f/2017-07-19T142131/GENOTOX data and dictionary.xls" + + parts = File.join(DATA, "parts") + FileUtils.mkdir_p parts + Dir[File.join(parts,"hansen.*")].each{|f| FileUtils.rm f } + Dir[File.join(parts,"cas_4337.*")].each{|f| FileUtils.rm f } + Dir[File.join(parts,"efsa.*")].each{|f| FileUtils.rm f } + File.open(File.join(parts,"hansen-original.csv"),"w+"){|f| f.puts RestClientWrapper.get(hansen_url).to_s } + + # convert hansen + hansen = CSV.read File.join(parts,"hansen-original.csv") + hansen.shift + + map = {"0" => "non-mutagenic","1" => "mutagenic"} + File.open(File.join(parts,"hansen.csv"),"w+") do |f| + f.puts "ID,SMILES,Mutagenicity" + hansen.each do |row| + f.puts [row[0],row[5],map[row[2]]].join "," + end + end + File.open(File.join(parts,"cas_4337.zip"),"w+"){|f| f.puts RestClientWrapper.get(kazius_url).to_s } + `cd #{parts} && unzip cas_4337.zip` + `cd #{parts} && wget #{URI.escape efsa_url} -O efsa.xls` + `cd #{parts} && xls2csv -s cp1252 -d utf-8 -x -c " " efsa.xls > efsa.tsv` + + # convert EFSA data to mutagenicity classifications + i = 0 + db = {} + CSV.foreach(File.join(parts,"efsa.tsv"), :encoding => "UTF-8", :col_sep => "\t", :liberal_parsing => true) do |row| + if i > 0 and row[11] and !row[11].empty? and row[24].match(/Salmonella/i) and ( row[25].match("TA 98") or row[25].match("TA 100") ) and row[33] + begin + c = OpenTox::Compound.from_smiles(row[11].gsub('"','')).smiles + rescue + c = OpenTox::Compound.from_inchi(row[12]).smiles # some smiles (row[11]) contain non-parseable characters + end + db[c] ||= {} + db[c][:id] ||= row[2] + if row[33].match(/Positiv/i) + db[c][:value] = "mutagenic" # at least one positive result in TA 98 or TA 100 + elsif row[33].match(/Negativ/i) + db[c][:value] ||= "non-mutagenic" + end + end + i += 1 + end + File.open(File.join(parts,"efsa.csv"),"w+") do |f| + f.puts "ID,SMILES,Mutagenicity" + db.each do |s,v| + f.puts [v[:id],s,v[:value]].join "," + end + end + + # merge datasets + hansen = Dataset.from_csv_file File.join(parts,"hansen.csv") + efsa = Dataset.from_csv_file File.join(parts,"efsa.csv") + kazius = Dataset.from_sdf_file File.join(parts,"cas_4337.sdf") + datasets = [hansen,efsa,kazius] + map = {"mutagen" => "mutagenic", "nonmutagen" => "non-mutagenic"} + dataset = Dataset.merge datasets: datasets, features: datasets.collect{|d| d.bioactivity_features.first}, value_maps: [nil,nil,map], keep_original_features: false, remove_duplicates: true + dataset.merged_features.first.name = "Mutagenicity" + File.open(File.join(DATA,"Mutagenicity-Salmonella_typhimurium.csv"),"w+"){|f| f.puts dataset.to_training_csv} + meta = { + :species => "Salmonella typhimurium", + :endpoint => "Mutagenicity", + :source => [kazius_url,hansen_url,efsa_url].join(", "), + :qmrf => { "group": "QMRF 4.10. 
Mutagenicity", "name": "OECD 471 Bacterial Reverse Mutation Test"}, + } + File.open(File.join(DATA,"Mutagenicity-Salmonella_typhimurium.json"),"w+"){|f| f.puts meta.to_json} + + # cleanup + datasets << dataset + datasets.each{|d| d.delete } + File.join(DATA,"Mutagenicity-Salmonella_typhimurium.csv") + end + + # Download Blood Brain Barrier Penetration dataset into the data folder + def self.blood_brain_barrier + url = "http://cheminformatics.org/datasets/li/bbp2.smi" + name = "Blood_Brain_Barrier_Penetration-Human" + $logger.debug name + map = {"n" => "non-penetrating", "p" => "penetrating"} + table = CSV.parse RestClientWrapper.get(url).to_s, :col_sep => "\t" + File.open(File.join(DATA,name+".csv"),"w+") do |f| + f.puts "ID,SMILES,#{name}" + table.each do |row| + f.puts [row[1],row[0],map[row[3]]].join(",") + end + end + meta = { + :species => "Human", + :endpoint => "Blood Brain Barrier Penetration", + :source => url, + :qmrf => {"name": "QMRF 5.4. Toxicokinetics.Blood-brain barrier penetration", "group": "QMRF 5. Toxicokinetics"}, + } + File.open(File.join(DATA,name+".json"),"w+"){|f| f.puts meta.to_json} + end + + # Download the combined LOAEL dataset from Helma et al 2018 into the data folder + def self.loael + # TODO: fix url?? + url = "https://raw.githubusercontent.com/opentox/loael-paper/revision/data/training_log10.csv" + name = "Lowest_observed_adverse_effect_level-Rats" + $logger.debug name + File.open(File.join(DATA,name+".csv"),"w+") do |f| + CSV.parse(RestClientWrapper.get(url).to_s) do |row| + f.puts [row[0],row[1]].join "," + end + end + meta = { + :species => "Rat", + :endpoint => "Lowest observed adverse effect level", + :source => url, + :unit => "mmol/kg_bw/day", + :qmrf => { + "name": "QMRF 4.14. Repeated dose toxicity", + "group": "QMRF 4.Human Health Effects" + } + } + File.open(File.join(DATA,name+".json"),"w+"){|f| f.puts meta.to_json} + end + + # Download Daphnia dataset from http://www.michem.unimib.it/download/data/acute-aquatic-toxicity-to-daphnia-magna/ into the public folder + # The original file requires an email request, this is a temporary workaround + def self.daphnia + #url = "https://raw.githubusercontent.com/opentox/lazar-public-data/master/regression/daphnia_magna_mmol_log10.csv" + src = File.join(DATA,"parts","toxicity_data.xlsx") + name = "Acute_toxicity-Daphnia_magna" + $logger.debug name + File.open(File.join(DATA,name+".csv"),"w+") do |f| + i = 0 + CSV.parse(`xlsx2csv #{src}`) do |row| + i == 0 ? v = "-log[LC50_mmol/L]" : v = -Math.log10(10**-row[3].to_f*1000) + f.puts [row[0],row[1],v].join(",") + i += 1 + end + end + meta = { "species": "Daphnia magna", + "endpoint": "Acute toxicity", + "source": "http://www.michem.unimib.it/download/data/acute-aquatic-toxicity-to-daphnia-magna/", + "unit": "mmol/L", + "qmrf": { + "group": "QMRF 3.1. Short-term toxicity to Daphnia (immobilisation)", + "name": "EC C. 2. Daphnia sp Acute Immobilisation Test" + } + } + File.open(File.join(DATA,name+".json"),"w+"){|f| f.puts meta.to_json} + end + + # Download all public lazar datasets into the data folder + def self.public_data + + # Classification + [ + { + :aid => 1205, + :species => "Rodents", + :endpoint => "Carcinogenicity", + :qmrf => {:group => "QMRF 4.12. Carcinogenicity", :name => "OECD 451 Carcinogenicity Studies"} + },{ + :aid => 1208, + :species => "Rat", + :endpoint => "Carcinogenicity", + :qmrf => {:group => "QMRF 4.12. 
Carcinogenicity", :name => "OECD 451 Carcinogenicity Studies"} + },{ + :aid => 1199, + :species => "Mouse", + :endpoint => "Carcinogenicity", + :qmrf => {:group => "QMRF 4.12. Carcinogenicity", :name => "OECD 451 Carcinogenicity Studies"} + } + ].each do |assay| + Download.pubchem_classification aid: assay[:aid], species: assay[:species], endpoint: assay[:endpoint], active: "carcinogenic", inactive: "non-carcinogenic", qmrf: assay[:qmrf] + end + Download.mutagenicity + Download.blood_brain_barrier + + # Regression + [ + { + :aid => 1195, + :species => "Human", + :endpoint => "Maximum Recommended Daily Dose", + :qmrf => { + "group": "QMRF 4.14. Repeated dose toxicity", + "name": "OECD 452 Chronic Toxicity Studies" + }, + },{ + :aid => 1208, + :species => "Rat (TD50)", + :endpoint => "Carcinogenicity", + :qmrf => { + :group => "QMRF 4.12. Carcinogenicity", + :name => "OECD 451 Carcinogenicity Studies" + } + },{ + :aid => 1199, + :species => "Mouse (TD50)", + :endpoint => "Carcinogenicity", + :qmrf => { + :group => "QMRF 4.12. Carcinogenicity", + :name => "OECD 451 Carcinogenicity Studies" + } + },{ + :aid => 1188, + :species => "Fathead minnow", + :endpoint => "Acute toxicity", + :qmrf => { + "group": "QMRF 3.3. Acute toxicity to fish (lethality)", + "name": "EC C. 1. Acute Toxicity for Fish" + } + } + ].each do |assay| + Download.pubchem_regression aid: assay[:aid], species: assay[:species], endpoint: assay[:endpoint], qmrf: assay[:qmrf] + end + + Download.loael + Download.daphnia + +=begin + # 1204 estrogen receptor + # 1259408, # GENE-TOX + # 1159563 HepG2 cytotoxicity assay + # 588209 hepatotoxicity + # 1259333 cytotoxicity + # 1159569 HepG2 cytotoxicity counterscreen Measured in Cell-Based System Using Plate Reader - 2153-03_Inhibitor_Dose_DryPowder_Activity + # 2122 HTS Counterscreen for Detection of Compound Cytotoxicity in MIN6 Cells + # 116724 Acute toxicity determined after intravenal administration in mice + # 1148549 Toxicity in po dosed mouse assessed as mortality after 7 days +=end + end + + end +end diff --git a/lib/enm-import.rb b/lib/enm-import.rb new file mode 100644 index 0000000..cf1a26f --- /dev/null +++ b/lib/enm-import.rb @@ -0,0 +1,125 @@ +module OpenTox + + # Import data from external databases + module Import + + class Enanomapper + include OpenTox + + # Import from eNanoMapper + def self.import + # time critical step: JSON parsing (>99%), Oj brings only minor speed gains (~1%) + datasets = {} + bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle', {}, {accept: :json}))["dataset"] + bundles.each do |bundle| + datasets[bundle["URI"]] = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"].strip) + $logger.debug bundle["title"].strip + nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"], {}, {accept: :json}))["dataEntry"] + nanoparticles.each_with_index do |np,n| + core_id = nil + coating_ids = [] + np["composition"].each do |c| + uri = c["component"]["compound"]["URI"] + data = JSON.parse(RestClientWrapper.get("https://data.enanomapper.net/query/compound/url/all?search=#{uri}", {}, {accept: :json})) + source = data["dataEntry"][0]["compound"]["URI"] + smiles = data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23SMILESDefault"] + names = [] + names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"] + names << 
data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23IUPACNameDefault"] + if smiles + compound = Compound.find_or_create_by(:smiles => smiles) + compound.name = names.first + compound.names = names.compact + else + compound = Compound.find_or_create_by(:name => names.first,:names => names.compact) + end + compound.source = source + compound.save + if c["relation"] == "HAS_CORE" + core_id = compound.id.to_s + elsif c["relation"] == "HAS_COATING" + coating_ids << compound.id.to_s + end + end if np["composition"] + nanoparticle = Nanoparticle.find_or_create_by( + :name => np["values"]["https://data.enanomapper.net/identifier/name"], + :source => np["compound"]["URI"], + :core_id => core_id, + :coating_ids => coating_ids + ) + #np["bundles"].keys.each do |bundle_uri| + #nanoparticle.dataset_ids << datasets[bundle_uri].id + #end + + studies = JSON.parse(RestClientWrapper.get(File.join(np["compound"]["URI"],"study"), {}, {accept: :json}))["study"] + studies.each do |study| + dataset = datasets[np["bundles"].keys.first] + proteomics_features = {} + category = study["protocol"]["topcategory"] + source = study["protocol"]["category"]["term"] + study["effects"].each do |effect| + + effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature + effect["conditions"].delete_if { |k, v| v.nil? } + + if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data + + JSON.parse(effect["result"]["textValue"]).each do |identifier, value| # time critical step + proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics", :unit => "Spectral counts", :source => source,:measured => true) + nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset + end + else + name = effect["endpoint"] + unit = effect["result"]["unit"] + warnings = [] + case name + when "Log2 transformed" # use a sensible name + name = "log2(Net cell association)" + warnings = ["Original name was 'Log2 transformed'"] + unit = "log2(mL/ug(Mg))" + when "Total protein (BCA assay)" + category = "P-CHEM" + warnings = ["Category changed from TOX to P-CHEM"] + end + feature = klass.find_or_create_by( + :name => name, + :unit => unit, + :category => category, + :conditions => effect["conditions"], + :source => study["protocol"]["category"]["term"], + :measured => true, + :warnings => warnings + ) + nanoparticle.parse_ambit_value feature, effect["result"], dataset + end + end + end + nanoparticle.save + print "#{n}, " + end + puts + end + datasets.each { |u,d| d.save } + end + +=begin + def self.import_ld # defunct, AMBIT JSON_LD does not have substance entries + #get list of bundle URIs + bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"] + datasets = [] + bundles.each do |bundle| + uri = bundle["URI"] + study = JSON.parse(`curl -H 'Accept:application/ld+json' '#{uri}/substance'`) + study["@graph"].each do |i| + puts i.to_yaml if i.keys.include? 
"sio:has-value" + end + end + datasets.collect{|d| d.id} + end +=end + + end + + end + +end diff --git a/lib/error.rb b/lib/error.rb deleted file mode 100644 index 39b3c76..0000000 --- a/lib/error.rb +++ /dev/null @@ -1,66 +0,0 @@ -module OpenToxError - attr_accessor :http_code, :message, :cause - def initialize message=nil - message = message.to_s.gsub(/\A"|"\Z/, '') if message # remove quotes - super message - @http_code ||= 500 - @message = message.to_s - @cause = cut_backtrace(caller) - $logger.error("\n"+JSON.pretty_generate({ - :http_code => @http_code, - :message => @message, - :cause => @cause - })) - end - - def cut_backtrace(trace) - if trace.is_a?(Array) - cut_index = trace.find_index{|line| line.match(/sinatra|minitest/)} - cut_index ||= trace.size - cut_index -= 1 - cut_index = trace.size-1 if cut_index < 0 - trace[0..cut_index] - else - trace - end - end - -end - -class RuntimeError - include OpenToxError -end - -# clutters log file with library errors -#class NoMethodError - #include OpenToxError -#end - -module OpenTox - - class Error < RuntimeError - include OpenToxError - - def initialize(code, message=nil) - @http_code = code - super message - end - end - - # OpenTox errors - RestClientWrapper.known_errors.each do |error| - # create error classes - c = Class.new Error do - define_method :initialize do |message=nil| - super error[:code], message - end - end - OpenTox.const_set error[:class],c - - # define global methods for raising errors, eg. bad_request_error - Object.send(:define_method, error[:method]) do |message| - raise c.new(message) - end - end - -end diff --git a/lib/feature.rb b/lib/feature.rb index f811aef..296a174 100644 --- a/lib/feature.rb +++ b/lib/feature.rb @@ -1,37 +1,112 @@ module OpenTox - # Basic feature class - class Feature - field :measured, type: Boolean - field :calculated, type: Boolean - field :category, type: String - field :unit, type: String - field :conditions, type: Hash + # Original ID (e.g. from CSV input) + class OriginalId < Feature + field :dataset_id, type: BSON::ObjectId + end - # Is it a nominal feature - # @return [TrueClass,FalseClass] - def nominal? - self.class == NominalFeature - end + # Original SMILES (e.g. from CSV input) + class OriginalSmiles < Feature + field :dataset_id, type: BSON::ObjectId + end - # Is it a numeric feature - # @return [TrueClass,FalseClass] - def numeric? 
- self.class == NumericFeature + # Warnings + class Warnings < Feature + field :dataset_id, type: BSON::ObjectId + end + + # Confidence + class Confidence < Feature + field :dataset_id, type: BSON::ObjectId + def name + "Confidence" end end - # Feature for categorical variables + # Categorical variables class NominalFeature < Feature field :accept_values, type: Array end - # Feature for quantitative variables + # Quantitative variables class NumericFeature < Feature + field :unit, type: String + end + + # Nominal biological activity + class NominalBioActivity < NominalFeature + end + + # Numeric biological activity + class NumericBioActivity < NumericFeature + end + + # Merged nominal biological activity + class MergedNominalBioActivity < NominalBioActivity + field :original_feature_ids, type: Array + field :transformations, type: Array + end + + # Merged numeric biological activity + class MergedNumericBioActivity < NumericBioActivity + field :original_feature_ids, type: Array + end + + # Transformed nominal biological activity + class TransformedNominalBioActivity < NominalFeature + field :original_feature_id, type: BSON::ObjectId + field :transformation, type: Hash + end + + # Transformed numeric biological activity + class TransformedNumericBioActivity < NumericFeature + field :original_feature_id, type: BSON::ObjectId + field :transformation, type: String + end + + # Nominal lazar prediction + class NominalLazarPrediction < NominalFeature + field :model_id, type: BSON::ObjectId + field :training_feature_id, type: BSON::ObjectId + def name + "Prediction: #{self[:name]}" + end + end + + class LazarPredictionProbability < NominalLazarPrediction + def name + "Probability: #{self[:name]}" + end + end + + # Numeric lazar prediction + class NumericLazarPrediction < NumericFeature + field :model_id, type: BSON::ObjectId + field :training_feature_id, type: BSON::ObjectId + def name + "Prediction: #{self[:name]}" + end + end + + class LazarPredictionInterval < NumericLazarPrediction + def name + "#{self[:name].capitalize} prediction interval" + end + end + + class NominalSubstanceProperty < NominalFeature + end + + class NumericSubstanceProperty < NumericFeature + end + + class NanoParticleProperty < NumericSubstanceProperty + field :category, type: String + field :conditions, type: Hash end # Feature for SMARTS fragments - class Smarts < NominalFeature + class Smarts < Feature field :smarts, type: String index "smarts" => 1 # Create feature from SMARTS string diff --git a/lib/import.rb b/lib/import.rb index 0857717..cdf96e3 100644 --- a/lib/import.rb +++ b/lib/import.rb @@ -1,125 +1,21 @@ module OpenTox - # Import data from external databases - module Import - - class Enanomapper - include OpenTox - - # Import from eNanoMapper - def self.import - # time critical step: JSON parsing (>99%), Oj brings only minor speed gains (~1%) - datasets = {} - bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle', {}, {accept: :json}))["dataset"] - bundles.each do |bundle| - datasets[bundle["URI"]] = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"].strip) - $logger.debug bundle["title"].strip - nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"], {}, {accept: :json}))["dataEntry"] - nanoparticles.each_with_index do |np,n| - core_id = nil - coating_ids = [] - np["composition"].each do |c| - uri = c["component"]["compound"]["URI"] - data = JSON.parse(RestClientWrapper.get("https://data.enanomapper.net/query/compound/url/all?search=#{uri}", {}, 
{accept: :json})) - source = data["dataEntry"][0]["compound"]["URI"] - smiles = data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23SMILESDefault"] - names = [] - names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23ChemicalNameDefault"] - names << data["dataEntry"][0]["values"]["https://data.enanomapper.net/feature/http%3A%2F%2Fwww.opentox.org%2Fapi%2F1.1%23IUPACNameDefault"] - if smiles - compound = Compound.find_or_create_by(:smiles => smiles) - compound.name = names.first - compound.names = names.compact - else - compound = Compound.find_or_create_by(:name => names.first,:names => names.compact) - end - compound.source = source - compound.save - if c["relation"] == "HAS_CORE" - core_id = compound.id.to_s - elsif c["relation"] == "HAS_COATING" - coating_ids << compound.id.to_s - end - end if np["composition"] - nanoparticle = Nanoparticle.find_or_create_by( - :name => np["values"]["https://data.enanomapper.net/identifier/name"], - :source => np["compound"]["URI"], - :core_id => core_id, - :coating_ids => coating_ids - ) - np["bundles"].keys.each do |bundle_uri| - nanoparticle.dataset_ids << datasets[bundle_uri].id - end - - studies = JSON.parse(RestClientWrapper.get(File.join(np["compound"]["URI"],"study"), {}, {accept: :json}))["study"] - studies.each do |study| - dataset = datasets[np["bundles"].keys.first] - proteomics_features = {} - category = study["protocol"]["topcategory"] - source = study["protocol"]["category"]["term"] - study["effects"].each do |effect| - - effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature - effect["conditions"].delete_if { |k, v| v.nil? } - - if study["protocol"]["category"]["title"].match(/Proteomics/) and effect["result"]["textValue"] and effect["result"]["textValue"].length > 50 # parse proteomics data - - JSON.parse(effect["result"]["textValue"]).each do |identifier, value| # time critical step - proteomics_features[identifier] ||= NumericFeature.find_or_create_by(:name => identifier, :category => "Proteomics", :unit => "Spectral counts", :source => source,:measured => true) - nanoparticle.parse_ambit_value proteomics_features[identifier], value, dataset - end - else - name = effect["endpoint"] - unit = effect["result"]["unit"] - warnings = [] - case name - when "Log2 transformed" # use a sensible name - name = "log2(Net cell association)" - warnings = ["Original name was 'Log2 transformed'"] - unit = "log2(mL/ug(Mg))" - when "Total protein (BCA assay)" - category = "P-CHEM" - warnings = ["Category changed from TOX to P-CHEM"] - end - feature = klass.find_or_create_by( - :name => name, - :unit => unit, - :category => category, - :conditions => effect["conditions"], - :source => study["protocol"]["category"]["term"], - :measured => true, - :warnings => warnings - ) - nanoparticle.parse_ambit_value feature, effect["result"], dataset - end - end - end - nanoparticle.save - print "#{n}, " - end - puts + class Import + + # Import datasets from the data folder, create and validate models + # @return [Array<OpenTox::Model::Validation>] Validated models + def self.public_data + models = [] + Dir[File.join(File.dirname(__FILE__),"..","data/*csv")].each do |f| + $logger.debug f + m = Model::Validation.from_csv_file f + $logger.debug "#{f} ID: #{m.id.to_s}" + m.crossvalidations.each do |cv| + $logger.debug cv.statistics end - datasets.each { |u,d| d.save } + models << m end - -=begin - def self.import_ld # 
defunct, AMBIT JSON_LD does not have substance entries - #get list of bundle URIs - bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"] - datasets = [] - bundles.each do |bundle| - uri = bundle["URI"] - study = JSON.parse(`curl -H 'Accept:application/ld+json' '#{uri}/substance'`) - study["@graph"].each do |i| - puts i.to_yaml if i.keys.include? "sio:has-value" - end - end - datasets.collect{|d| d.id} - end -=end - + models end - end - end diff --git a/lib/lazar.rb b/lib/lazar.rb index 32f0317..e77de9d 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -17,19 +17,22 @@ raise "Incorrect lazar environment variable LAZAR_ENV '#{ENV["LAZAR_ENV"]}', ple ENV["MONGOID_ENV"] = ENV["LAZAR_ENV"] ENV["RACK_ENV"] = ENV["LAZAR_ENV"] # should set sinatra environment +# CH: this interferes with /etc/hosts on my machine # search for a central mongo database in use # http://opentox.github.io/installation/2017/03/07/use-central-mongodb-in-docker-environment -CENTRAL_MONGO_IP = `grep -oP '^\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}(?=.*mongodb)' /etc/hosts`.chomp +# CENTRAL_MONGO_IP = `grep -oP '^\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}(?=.*mongodb)' /etc/hosts`.chomp Mongoid.load_configuration({ :clients => { :default => { :database => ENV["LAZAR_ENV"], - :hosts => (CENTRAL_MONGO_IP.blank? ? ["localhost:27017"] : ["#{CENTRAL_MONGO_IP}:27017"]), + #:hosts => (CENTRAL_MONGO_IP.blank? ? ["localhost:27017"] : ["#{CENTRAL_MONGO_IP}:27017"]), + :hosts => ["localhost:27017"] } } }) Mongoid.raise_not_found_error = false # return nil if no document is found -$mongo = Mongo::Client.new("mongodb://#{(CENTRAL_MONGO_IP.blank? ? "127.0.0.1" : CENTRAL_MONGO_IP)}:27017/#{ENV['LAZAR_ENV']}") +#$mongo = Mongo::Client.new("mongodb://#{(CENTRAL_MONGO_IP.blank? ? "127.0.0.1" : CENTRAL_MONGO_IP)}:27017/#{ENV['LAZAR_ENV']}") +$mongo = Mongo::Client.new("mongodb://127.0.0.1:27017/#{ENV['LAZAR_ENV']}") $gridfs = $mongo.database.fs # Logger setup @@ -68,13 +71,15 @@ suppressPackageStartupMessages({ }) " +PUBCHEM_URI = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/" +CHEMBL_URI = "https://www.ebi.ac.uk/chembl/api/data/molecule/" + # OpenTox classes and includes -CLASSES = ["Feature","Substance","Dataset","LazarPrediction","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules +CLASSES = ["Feature","Substance","Dataset","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation"]# Algorithm and Models are modules [ # be aware of the require sequence as it affects class/method overwrites "overwrite.rb", "rest-client-wrapper.rb", - "error.rb", "opentox.rb", "feature.rb", "physchem.rb", @@ -94,6 +99,6 @@ CLASSES = ["Feature","Substance","Dataset","LazarPrediction","CrossValidation"," "train-test-validation.rb", "leave-one-out-validation.rb", "crossvalidation.rb", - #"experiment.rb", + "download.rb", "import.rb", ].each{ |f| require_relative f } diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb index c33c92b..7d73b89 100644 --- a/lib/leave-one-out-validation.rb +++ b/lib/leave-one-out-validation.rb @@ -9,25 +9,18 @@ module OpenTox # @param [OpenTox::Model::Lazar] # @return [OpenTox::Validation::LeaveOneOut] def self.create model - bad_request_error "Cannot create leave one out validation for models with supervised feature selection. Please use crossvalidation instead." 
if model.algorithms[:feature_selection] + raise ArgumentError, "Cannot create leave one out validation for models with supervised feature selection. Please use crossvalidation instead." if model.algorithms[:feature_selection] $logger.debug "#{model.name}: LOO validation started" t = Time.now - model.training_dataset.features.first.nominal? ? klass = ClassificationLeaveOneOut : klass = RegressionLeaveOneOut + model.training_dataset.features.collect{|f| f.class}.include?(NominalBioActivity) ? klass = ClassificationLeaveOneOut : klass = RegressionLeaveOneOut loo = klass.new :model_id => model.id predictions = model.predict model.training_dataset.substances predictions.each{|cid,p| p.delete(:neighbors)} - nr_unpredicted = 0 predictions.each do |cid,prediction| - if prediction[:value] - prediction[:measurements] = model.training_dataset.values(cid, prediction[:prediction_feature_id]) - else - nr_unpredicted += 1 - end + prediction[:measurements] = model.training_dataset.values(cid, prediction[:prediction_feature_id]) if prediction[:value] predictions.delete(cid) unless prediction[:value] and prediction[:measurements] end predictions.select!{|cid,p| p[:value] and p[:measurements]} - loo.nr_instances = predictions.size - loo.nr_unpredicted = nr_unpredicted loo.predictions = predictions loo.statistics $logger.debug "#{model.name}, LOO validation: #{Time.now-t} seconds" @@ -40,25 +33,27 @@ module OpenTox class ClassificationLeaveOneOut < LeaveOneOut include ClassificationStatistics field :accept_values, type: Array - field :confusion_matrix, type: Array, default: [] - field :weighted_confusion_matrix, type: Array, default: [] - field :accuracy, type: Float - field :weighted_accuracy, type: Float - field :true_rate, type: Hash, default: {} - field :predictivity, type: Hash, default: {} - field :confidence_plot_id, type: BSON::ObjectId + field :confusion_matrix, type: Hash + field :weighted_confusion_matrix, type: Hash + field :accuracy, type: Hash + field :weighted_accuracy, type: Hash + field :true_rate, type: Hash + field :predictivity, type: Hash + field :nr_predictions, type: Hash + field :probability_plot_id, type: BSON::ObjectId end # Leave one out validation for regression models class RegressionLeaveOneOut < LeaveOneOut include RegressionStatistics - field :rmse, type: Float, default: 0 - field :mae, type: Float, default: 0 - field :r_squared, type: Float - field :within_prediction_interval, type: Integer, default:0 - field :out_of_prediction_interval, type: Integer, default:0 - field :correlation_plot_id, type: BSON::ObjectId + field :rmse, type: Hash + field :mae, type: Hash + field :r_squared, type: Hash + field :within_prediction_interval, type: Hash + field :out_of_prediction_interval, type: Hash + field :nr_predictions, type: Hash field :warnings, type: Array + field :correlation_plot_id, type: BSON::ObjectId end end diff --git a/lib/model.rb b/lib/model.rb index dce53a9..d7b2df6 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -32,20 +32,20 @@ module OpenTox # @param [OpenTox::Feature, nil] prediction_feature # By default the first feature of the training dataset will be predicted, specify a prediction_feature if you want to predict another feature # @param [Hash, nil] algorithms - # Default algorithms will be used, if no algorithms parameter is provided. 
The algorithms hash has the following keys: :descriptors (specifies the descriptors to be used for similarity calculations and local QSAR models), :similarity (similarity algorithm and threshold), :feature_selection (feature selection algorithm), :prediction (local QSAR algorithm). Default parameters are used for unspecified keys. + # Default algorithms will be used, if no algorithms parameter is provided. The algorithms hash has the following keys: :descriptors (specifies the descriptors to be used for similarity calculations and local QSAR models), :similarity (similarity algorithm and thresholds for predictions with high and low confidence), :feature_selection (feature selection algorithm), :prediction (local QSAR algorithm). Default parameters are used for unspecified keys. # # @return [OpenTox::Model::Lazar] def self.create prediction_feature:nil, training_dataset:, algorithms:{} - bad_request_error "Please provide a prediction_feature and/or a training_dataset." unless prediction_feature or training_dataset - prediction_feature = training_dataset.features.first unless prediction_feature - # TODO: prediction_feature without training_dataset: use all available data + raise ArgumentError, "Please provide a training_dataset and a optional prediction_feature." unless prediction_feature or training_dataset + prediction_feature ||= training_dataset.features.select{|f| f.is_a? NumericBioActivity or f.is_a? NominalBioActivity}.first unless prediction_feature # guess model type - prediction_feature.numeric? ? model = LazarRegression.new : model = LazarClassification.new + prediction_feature.is_a?(NumericBioActivity) ? model = LazarRegression.new : model = LazarClassification.new model.prediction_feature_id = prediction_feature.id model.training_dataset_id = training_dataset.id - model.name = "#{prediction_feature.name} (#{training_dataset.name})" + model.name = training_dataset.name + # git or gem versioning dir = File.dirname(__FILE__) path = File.expand_path("../", File.expand_path(dir)) @@ -62,7 +62,7 @@ module OpenTox # set defaults# substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq - bad_request_error "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1 + raise ArgumentError, "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." unless substance_classes.size == 1 if substance_classes.first == "OpenTox::Compound" @@ -80,7 +80,7 @@ module OpenTox } model.algorithms[:similarity] = { :method => "Algorithm::Similarity.tanimoto", - :min => 0.1, + :min => [0.5,0.2], } elsif model.class == LazarRegression model.algorithms[:prediction] = { @@ -88,7 +88,7 @@ module OpenTox } model.algorithms[:similarity] = { :method => "Algorithm::Similarity.tanimoto", - :min => 0.5, + :min => [0.5,0.2], } end @@ -100,7 +100,7 @@ module OpenTox }, :similarity => { :method => "Algorithm::Similarity.weighted_cosine", - :min => 0.5, + :min => [0.5,0.2], }, :prediction => { :method => "Algorithm::Caret.rf", @@ -110,7 +110,7 @@ module OpenTox }, } else - bad_request_error "Cannot create models for #{substance_classes.first}." + raise ArgumentError, "Cannot create models for #{substance_classes.first}." end # overwrite defaults with explicit parameters @@ -175,7 +175,7 @@ module OpenTox model.descriptor_ids = feature_ids & property_ids model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? 
p[i].median : nil}} else - bad_request_error "Descriptor method '#{descriptor_method}' not implemented." + raise ArgumentError, "Descriptor method '#{descriptor_method}' not implemented." end if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method] @@ -197,7 +197,7 @@ module OpenTox # Predict a substance (compound or nanoparticle) # @param [OpenTox::Substance] # @return [Hash] - def predict_substance substance, threshold = self.algorithms[:similarity][:min] + def predict_substance substance, threshold = self.algorithms[:similarity][:min].first, prediction = nil @independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data case algorithms[:similarity][:method] @@ -224,11 +224,11 @@ module OpenTox end end else - bad_request_error "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'." + raise ArgumentError, "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'." end - prediction = {:warnings => [], :measurements => []} - prediction[:warnings] << "Similarity threshold #{threshold} < #{algorithms[:similarity][:min]}, prediction may be out of applicability domain." if threshold < algorithms[:similarity][:min] + prediction ||= {:warnings => [], :measurements => []} + prediction[:warnings] << "Similarity threshold #{threshold} < #{algorithms[:similarity][:min].first}, prediction may be out of applicability domain." if threshold < algorithms[:similarity][:min].first neighbor_ids = [] neighbor_similarities = [] neighbor_dependent_variables = [] @@ -238,7 +238,7 @@ module OpenTox substance_ids.each_with_index do |s,i| # handle query substance if substance.id.to_s == s - prediction[:measurements] << dependent_variables[i] + prediction[:measurements] << dependent_variables[i] unless threshold < algorithms[:similarity][:min].first # add measurements only once at first pass prediction[:info] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance." else if fingerprints? @@ -264,25 +264,37 @@ module OpenTox if neighbor_similarities.empty? prediction[:value] = nil - prediction[:warnings] << "Could not find similar substances with experimental data in the training dataset." + prediction[:warnings] << "Could not find similar substances for threshold #{threshold} with experimental data in the training dataset." + if threshold == algorithms[:similarity][:min].last + prediction[:confidence] = "Out of applicability domain: Could not find similar substances with experimental data in the training dataset (Threshold: #{algorithms[:similarity][:min].last})." + return prediction + end elsif neighbor_similarities.size == 1 prediction[:value] = nil - prediction[:warnings] << "Cannot create prediction: Only one similar compound in the training set." - prediction[:neighbors] = [{:id => neighbor_ids.first, :similarity => neighbor_similarities.first}] + prediction[:warnings] << "Cannot create prediction: Only one similar compound for threshold #{threshold} in the training set (Threshold: #{algorithms[:similarity][:min].last})." + prediction[:neighbors] = [{:id => neighbor_ids.first, :measurement => neighbor_dependent_variables[0], :similarity => neighbor_similarities.first}] + if threshold == algorithms[:similarity][:min].last + prediction[:confidence] = "Out of applicability domain: Only one similar compound in the training set." + return prediction + end else query_descriptors.collect!{|d| d ? 
1 : 0} if algorithms[:feature_selection] and algorithms[:descriptors][:method] == "fingerprint" # call prediction algorithm result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors prediction.merge! result prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}} - #if neighbor_similarities.max < algorithms[:similarity][:warn_min] - #prediction[:warnings] << "Closest neighbor has similarity < #{algorithms[:similarity][:warn_min]}. Prediction may be out of applicability domain." - #end end - if prediction[:warnings].empty? or threshold < algorithms[:similarity][:min] or threshold <= 0.2 - prediction - else # try again with a lower threshold - predict_substance substance, 0.2 + if threshold == algorithms[:similarity][:min].first + if prediction[:warnings].empty? + prediction[:confidence] = "Similar to bioassay results" + return prediction + else # try again with a lower threshold + prediction[:warnings] << "Lowering similarity threshold to #{algorithms[:similarity][:min].last}." + predict_substance substance, algorithms[:similarity][:min].last, prediction + end + elsif threshold < algorithms[:similarity][:min].first + prediction[:confidence] = "Lower than bioassay results" + return prediction end end @@ -302,13 +314,18 @@ module OpenTox elsif object.is_a? Dataset substances = object.substances else - bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Substances or an OpenTox::Dataset as parameter." + raise ArgumentError, "Please provide a OpenTox::Compound an Array of OpenTox::Substances or an OpenTox::Dataset as parameter." end # make predictions predictions = {} substances.each do |c| predictions[c.id.to_s] = predict_substance c + if prediction_feature.is_a? NominalBioActivity and predictions[c.id.to_s][:value] + prediction_feature.accept_values.each do |v| + predictions[c.id.to_s][:probabilities][v] ||= 0.0 # use 0 instead of empty probabilities (happens if all neighbors have the same activity) + end + end predictions[c.id.to_s][:prediction_feature_id] = prediction_feature_id end @@ -320,17 +337,35 @@ module OpenTox elsif object.is_a? Array return predictions elsif object.is_a? Dataset - # prepare prediction dataset - measurement_feature = Feature.find prediction_feature_id - - prediction_feature = NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" ) - prediction_dataset = LazarPrediction.create( - :name => "Lazar prediction for #{prediction_feature.name}", - :creator => __FILE__, - :prediction_feature_id => prediction_feature.id, - :predictions => predictions - ) - return prediction_dataset + d = object.copy + #warning_feature = Warnings.find_or_create_by(:dataset_id => d.id) + confidence_feature = Confidence.find_or_create_by(:dataset_id => d.id) + if prediction_feature.is_a? NominalBioActivity + f = NominalLazarPrediction.find_or_create_by(:name => prediction_feature.name, :accept_values => prediction_feature.accept_values, :model_id => self.id, :training_feature_id => prediction_feature.id) + probability_features = {} + prediction_feature.accept_values.each do |v| + probability_features[v] = LazarPredictionProbability.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id) + end + elsif prediction_feature.is_a? 
NumericBioActivity + f = NumericLazarPrediction.find_or_create_by(:name => prediction_feature.name, :unit => prediction_feature.unit, :model_id => self.id, :training_feature_id => prediction_feature.id) + prediction_interval = [] + ["lower","upper"].each do |v| + prediction_interval << LazarPredictionInterval.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id) + end + end + + # add predictions to dataset + predictions.each do |substance_id,p| + substance_id = BSON::ObjectId.from_string(substance_id) + d.add substance_id,confidence_feature,p[:confidence] + unless p[:value].nil? + d.add substance_id,f,p[:value] + p[:probabilities].each {|name,p| d.add substance_id,probability_features[name],p} if p[:probabilities] + p[:prediction_interval].each_with_index {|v,i| d.add substance_id, prediction_interval[i], v } if p[:prediction_interval] + end + end + d.save + return d end end @@ -402,6 +437,7 @@ module OpenTox field :species, type: String field :source, type: String field :unit, type: String + field :warnings, type: Array field :model_id, type: BSON::ObjectId field :repeated_crossvalidation_id, type: BSON::ObjectId @@ -461,11 +497,11 @@ module OpenTox end # Create and validate a lazar model from a csv file with training data and a json file with metadata - # @param [File] CSV file with two columns. The first line should contain either SMILES or InChI (first column) and the endpoint (second column). The first column should contain either the SMILES or InChI of the training compounds, the second column the training compounds toxic activities (qualitative or quantitative). Use -log10 transformed values for regression datasets. Add metadata to a JSON file with the same basename containing the fields "species", "endpoint", "source" and "unit" (regression only). You can find example training data at https://github.com/opentox/lazar-public-data. - # @return [OpenTox::Model::Validation] lazar model with three independent 10-fold crossvalidations + # @param [File] CSV file with two or three columns. The first column is optional and may contain an arbitrary substance ID. The next column should contain either SMILES or InChIs of the training compounds, followed by toxic activities (qualitative or quantitative) in the last column. Use -log10 transformed values for regression datasets. The first line should contain "ID" (optional), either SMILES or InChI and the endpoint name (last column). Add metadata to a JSON file with the same basename containing the fields "species", "endpoint", "source", "qmrf" (optional) and "unit" (regression only). You can find example training data in the data folder of lazar. + # @return [OpenTox::Model::Validation] lazar model with five independent 10-fold crossvalidations def self.from_csv_file file metadata_file = file.sub(/csv$/,"json") - bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file + raise ArgumentError, "No metadata file #{metadata_file}" unless File.exist? 
metadata_file model_validation = self.new JSON.parse(File.read(metadata_file)) training_dataset = Dataset.from_csv_file file model = Lazar.create training_dataset: training_dataset @@ -477,6 +513,7 @@ module OpenTox # Create and validate a nano-lazar model, import data from eNanoMapper if necessary # nano-lazar methods are described in detail in https://github.com/enanomapper/nano-lazar-paper/blob/master/nano-lazar.pdf + # *eNanoMapper import is currently broken, because APIs and data formats are constantly changing and we have no resources to track this changes permanently!* # @param [OpenTox::Dataset, nil] training_dataset # @param [OpenTox::Feature, nil] prediction_feature # @param [Hash, nil] algorithms @@ -488,7 +525,7 @@ module OpenTox unless training_dataset # try to import Import::Enanomapper.import training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first - bad_request_error "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset + raise ArgumentError, "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset end prediction_feature ||= Feature.where(name: "log2(Net cell association)", category: "TOX").first diff --git a/lib/opentox.rb b/lib/opentox.rb index 5c300cf..fb2a579 100644 --- a/lib/opentox.rb +++ b/lib/opentox.rb @@ -11,13 +11,6 @@ module OpenTox include Mongoid::Timestamps store_in collection: klass.downcase.pluralize field :name, type: String - field :source, type: String - field :warnings, type: Array, default: [] - - def warn warning - $logger.warn warning - warnings << warning - end end OpenTox.const_set klass,c end diff --git a/lib/overwrite.rb b/lib/overwrite.rb index 0dd1c8a..d482902 100644 --- a/lib/overwrite.rb +++ b/lib/overwrite.rb @@ -84,7 +84,7 @@ class String def to_boolean return true if self == true || self =~ (/(true|t|yes|y|1)$/i) return false if self == false || self.nil? 
|| self =~ (/(false|f|no|n|0)$/i) - bad_request_error "invalid value for Boolean: \"#{self}\"" + raise ArgumentError, "invalid value for Boolean: \"#{self}\"" end end diff --git a/lib/physchem.rb b/lib/physchem.rb index 07df867..2af043b 100644 --- a/lib/physchem.rb +++ b/lib/physchem.rb @@ -1,7 +1,7 @@ module OpenTox # Feature for physico-chemical descriptors - class PhysChem < NumericFeature + class PhysChem < NumericSubstanceProperty field :library, type: String field :descriptor, type: String @@ -45,7 +45,7 @@ module OpenTox def self.descriptors desc=DESCRIPTORS desc.collect do |name,description| lib,desc = name.split('.',2) - self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true) + self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description) end end @@ -59,11 +59,11 @@ module OpenTox CDK_DESCRIPTIONS.select{|d| desc == d[:java_class].split('.').last.sub('Descriptor','') }.first[:names].each do |n| dname = "#{name}.#{n}" description = DESCRIPTORS[dname] - udesc << self.find_or_create_by(:name => dname, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true) + udesc << self.find_or_create_by(:name => dname, :library => lib, :descriptor => desc, :description => description) end else description = DESCRIPTORS[name] - udesc << self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description, :measured => false, :calculated => true) + udesc << self.find_or_create_by(:name => name, :library => lib, :descriptor => desc, :description => description) end end udesc diff --git a/lib/regression.rb b/lib/regression.rb index 25c0732..fd2855f 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -17,7 +17,7 @@ module OpenTox sim_sum += weights[i] end if dependent_variables sim_sum == 0 ? prediction = nil : prediction = weighted_sum/sim_sum - {:value => prediction, :warnings => ["Weighted average prediction, no prediction interval available."]} + {:value => prediction} end end diff --git a/lib/rest-client-wrapper.rb b/lib/rest-client-wrapper.rb index c9fd40f..db23e66 100644 --- a/lib/rest-client-wrapper.rb +++ b/lib/rest-client-wrapper.rb @@ -28,14 +28,14 @@ module OpenTox uri = Addressable::URI.encode(uri) # check input - bad_request_error "Headers are not a hash: #{headers.inspect} for #{uri}." unless headers==nil or headers.is_a?(Hash) + raise ArgumentError, "Headers are not a hash: #{headers.inspect} for #{uri}." unless headers==nil or headers.is_a?(Hash) headers[:subjectid] ||= @@subjectid - bad_request_error "Invalid URI: '#{uri}'" unless URI.valid? uri + raise ArgumentError, "Invalid URI: '#{uri}'" unless URI.valid? uri # make sure that no header parameters are set in the payload [:accept,:content_type,:subjectid].each do |header| if defined? $aa || URI(uri).host == URI($aa[:uri]).host else - bad_request_error "#{header} should be submitted in the headers of URI: #{uri}" if payload and payload.is_a?(Hash) and payload[header] + raise ArgumentError, "#{header} should be submitted in the headers of URI: #{uri}" if payload and payload.is_a?(Hash) and payload[header] end end @@ -56,6 +56,7 @@ module OpenTox @response = @request.execute do |response, request, result| if [301, 302, 307].include? 
response.code and request.method == :get response.follow_redirection(request, result) +=begin elsif response.code >= 400 and !URI.task?(uri) error = known_errors.collect{|e| e if e[:code] == response.code}.compact.first begin # errors are returned as error reports in json, try to parse @@ -68,6 +69,7 @@ module OpenTox cause = nil end Object.method(error[:method]).call "#{msg}, #{uri}, #{cause}" # call error method +=end else response end @@ -75,6 +77,7 @@ module OpenTox end end +=begin #@return [Array] of hashes with error code, method and class def self.known_errors errors = [] @@ -88,6 +91,7 @@ module OpenTox end errors end +=end end end diff --git a/lib/substance.rb b/lib/substance.rb index ef49659..5c486d8 100644 --- a/lib/substance.rb +++ b/lib/substance.rb @@ -3,7 +3,6 @@ module OpenTox # Base class for substances (e.g. compunds, nanoparticles) class Substance field :properties, type: Hash, default: {} - field :dataset_ids, type: Array, default: [] end end diff --git a/lib/train-test-validation.rb b/lib/train-test-validation.rb index 9a5532d..d034cd1 100644 --- a/lib/train-test-validation.rb +++ b/lib/train-test-validation.rb @@ -18,22 +18,15 @@ module OpenTox validation_model = model.class.create prediction_feature: model.prediction_feature, training_dataset: training_set, algorithms: model.algorithms validation_model.save predictions = validation_model.predict test_set.substances - nr_unpredicted = 0 predictions.each do |cid,prediction| - if prediction[:value] - prediction[:measurements] = test_set.values(cid, prediction[:prediction_feature_id]) - else - nr_unpredicted += 1 - end + prediction[:measurements] = test_set.values(cid, prediction[:prediction_feature_id]) if prediction[:value] end predictions.select!{|cid,p| p[:value] and p[:measurements]} - # hack to avoid mongos file size limit error on large datasets - #predictions.each{|cid,p| p[:neighbors] = []} if model.training_dataset.name.match(/mutagenicity/i) + # remove neighbors to avoid mongos file size limit error on large datasets + predictions.each{|cid,p| p.delete(:neighbors)} #if model.training_dataset.name.match(/mutagenicity/i) validation = self.new( :model_id => validation_model.id, :test_dataset_id => test_set.id, - :nr_instances => test_set.substances.size, - :nr_unpredicted => nr_unpredicted, :predictions => predictions ) validation.save diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb index 69e7992..5fd9985 100644 --- a/lib/validation-statistics.rb +++ b/lib/validation-statistics.rb @@ -7,62 +7,66 @@ module OpenTox # @return [Hash] def statistics self.accept_values = model.prediction_feature.accept_values - self.confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)} - self.weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)} - nr_instances = 0 + self.confusion_matrix = {:all => Array.new(accept_values.size){Array.new(accept_values.size,0)}, :confidence_high => Array.new(accept_values.size){Array.new(accept_values.size,0)}, :confidence_low => Array.new(accept_values.size){Array.new(accept_values.size,0)}} + self.nr_predictions = {:all => 0,:confidence_high => 0,:confidence_low => 0} predictions.each do |cid,pred| - # TODO - # use predictions without probabilities (single neighbor)?? - # use measured majority class?? + # TODO: use measured majority class or all measurements?? 
if pred[:measurements].uniq.size == 1 and pred[:probabilities] m = pred[:measurements].first if pred[:value] == m - if pred[:value] == accept_values[0] - confusion_matrix[0][0] += 1 - weighted_confusion_matrix[0][0] += pred[:probabilities][pred[:value]] - nr_instances += 1 - elsif pred[:value] == accept_values[1] - confusion_matrix[1][1] += 1 - weighted_confusion_matrix[1][1] += pred[:probabilities][pred[:value]] - nr_instances += 1 + accept_values.each_with_index do |v,i| + if pred[:value] == v + confusion_matrix[:all][i][i] += 1 + self.nr_predictions[:all] += 1 + if pred[:confidence].match(/Similar/i) + confusion_matrix[:confidence_high][i][i] += 1 + self.nr_predictions[:confidence_high] += 1 + elsif pred[:confidence].match(/Low/i) + confusion_matrix[:confidence_low][i][i] += 1 + self.nr_predictions[:confidence_low] += 1 + end + end end elsif pred[:value] != m - if pred[:value] == accept_values[0] - confusion_matrix[0][1] += 1 - weighted_confusion_matrix[0][1] += pred[:probabilities][pred[:value]] - nr_instances += 1 - elsif pred[:value] == accept_values[1] - confusion_matrix[1][0] += 1 - weighted_confusion_matrix[1][0] += pred[:probabilities][pred[:value]] - nr_instances += 1 + accept_values.each_with_index do |v,i| + if pred[:value] == v + confusion_matrix[:all][i][(i+1)%2] += 1 + self.nr_predictions[:all] += 1 + if pred[:confidence].match(/Similar/i) + confusion_matrix[:confidence_high][i][(i+1)%2] += 1 + self.nr_predictions[:confidence_high] += 1 + elsif pred[:confidence].match(/Low/i) + confusion_matrix[:confidence_low][i][(i+1)%2] += 1 + self.nr_predictions[:confidence_low] += 1 + end + end end end end end - self.true_rate = {} - self.predictivity = {} + + self.true_rate = {:all => {}, :confidence_high => {}, :confidence_low => {}} + self.predictivity = {:all => {}, :confidence_high => {}, :confidence_low => {}} accept_values.each_with_index do |v,i| - self.true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f - self.predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f - end - confidence_sum = 0 - weighted_confusion_matrix.each do |r| - r.each do |c| - confidence_sum += c + [:all,:confidence_high,:confidence_low].each do |a| + self.true_rate[a][v] = confusion_matrix[a][i][i]/confusion_matrix[a][i].reduce(:+).to_f + self.predictivity[a][v] = confusion_matrix[a][i][i]/confusion_matrix[a].collect{|n| n[i]}.reduce(:+).to_f end end - self.accuracy = (confusion_matrix[0][0]+confusion_matrix[1][1])/nr_instances.to_f - self.weighted_accuracy = (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f + self.accuracy = {} + [:all,:confidence_high,:confidence_low].each do |a| + self.accuracy[a] = (confusion_matrix[a][0][0]+confusion_matrix[a][1][1])/nr_predictions[a].to_f + end $logger.debug "Accuracy #{accuracy}" + $logger.debug "Nr Predictions #{nr_predictions}" save { :accept_values => accept_values, :confusion_matrix => confusion_matrix, - :weighted_confusion_matrix => weighted_confusion_matrix, :accuracy => accuracy, - :weighted_accuracy => weighted_accuracy, :true_rate => self.true_rate, :predictivity => self.predictivity, + :nr_predictions => nr_predictions, } end @@ -97,7 +101,7 @@ module OpenTox R.assign "probability", probabilities R.eval "image = qplot(probability,accuracy)+ylab('Accumulated accuracy')+xlab('Prediction probability')+ylim(c(0,1))+scale_x_reverse()+geom_line()" R.eval "ggsave(file='#{tmpfile}', plot=image)" - file = Mongo::Grid::File.new(File.read(tmpfile), :filename => 
"#{self.id.to_s}_probability_plot.svg") + file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{self.id.to_s}_probability_plot.#{format}") plot_id = $gridfs.insert_one(file) update(:probability_plot_id => plot_id) #end @@ -108,29 +112,27 @@ module OpenTox # Statistical evaluation of regression validations module RegressionStatistics + attr_accessor :x, :y + # Get statistics # @return [Hash] def statistics self.warnings = [] - self.rmse = 0 - self.mae = 0 - self.within_prediction_interval = 0 - self.out_of_prediction_interval = 0 - x = [] - y = [] + self.rmse = {:all =>0,:confidence_high => 0,:confidence_low => 0} + self.r_squared = {:all =>0,:confidence_high => 0,:confidence_low => 0} + self.mae = {:all =>0,:confidence_high => 0,:confidence_low => 0} + self.within_prediction_interval = {:all =>0,:confidence_high => 0,:confidence_low => 0} + self.out_of_prediction_interval = {:all =>0,:confidence_high => 0,:confidence_low => 0} + @x = {:all => [],:confidence_high => [],:confidence_low => []} + @y = {:all => [],:confidence_high => [],:confidence_low => []} + self.nr_predictions = {:all =>0,:confidence_high => 0,:confidence_low => 0} predictions.each do |cid,pred| - if pred[:value] and pred[:measurements] - x << pred[:measurements].median - y << pred[:value] - error = pred[:value]-pred[:measurements].median - self.rmse += error**2 - self.mae += error.abs - if pred[:prediction_interval] - if pred[:measurements].median >= pred[:prediction_interval][0] and pred[:measurements].median <= pred[:prediction_interval][1] - self.within_prediction_interval += 1 - else - self.out_of_prediction_interval += 1 - end + !if pred[:value] and pred[:measurements] and !pred[:measurements].empty? + insert_prediction pred, :all + if pred[:confidence].match(/Similar/i) + insert_prediction pred, :confidence_high + elsif pred[:confidence].match(/Low/i) + insert_prediction pred, :confidence_low end else trd_id = model.training_dataset_id @@ -139,39 +141,49 @@ module OpenTox $logger.debug "No training activities for #{smiles} in training dataset #{trd_id}." 
end end - R.assign "measurement", x - R.assign "prediction", y - R.eval "r <- cor(measurement,prediction,use='pairwise')" - self.r_squared = R.eval("r").to_ruby**2 - self.mae = self.mae/predictions.size - self.rmse = Math.sqrt(self.rmse/predictions.size) + [:all,:confidence_high,:confidence_low].each do |a| + if @x[a].size > 2 + R.assign "measurement", @x[a] + R.assign "prediction", @y[a] + R.eval "r <- cor(measurement,prediction,use='pairwise')" + self.r_squared[a] = R.eval("r").to_ruby**2 + else + self.r_squared[a] = 0 + end + if self.nr_predictions[a] > 0 + self.mae[a] = self.mae[a]/self.nr_predictions[a] + self.rmse[a] = Math.sqrt(self.rmse[a]/self.nr_predictions[a]) + else + self.mae[a] = nil + self.rmse[a] = nil + end + end $logger.debug "R^2 #{r_squared}" $logger.debug "RMSE #{rmse}" $logger.debug "MAE #{mae}" - $logger.debug "#{percent_within_prediction_interval.round(2)}% of measurements within prediction interval" - $logger.debug "#{warnings}" + $logger.debug "Nr predictions #{nr_predictions}" + $logger.debug "#{within_prediction_interval} measurements within prediction interval" save { :mae => mae, :rmse => rmse, :r_squared => r_squared, - :within_prediction_interval => within_prediction_interval, + :within_prediction_interval => self.within_prediction_interval, :out_of_prediction_interval => out_of_prediction_interval, + :nr_predictions => nr_predictions, } end - # Get percentage of measurements within the prediction interval - # @return [Float] - def percent_within_prediction_interval - 100*within_prediction_interval.to_f/(within_prediction_interval+out_of_prediction_interval) - end - # Plot predicted vs measured values # @param [String,nil] format # @return [Blob] def correlation_plot format: "png" - unless correlation_plot_id - tmpfile = "/tmp/#{id.to_s}_correlation.#{format}" + #unless correlation_plot_id + #tmpfile = "/tmp/#{id.to_s}_correlation.#{format}" + tmpdir = "/tmp" + #p tmpdir + FileUtils.mkdir_p tmpdir + tmpfile = File.join(tmpdir,"#{id.to_s}_correlation.#{format}") x = [] y = [] feature = Feature.find(predictions.first.last["prediction_feature_id"]) @@ -187,7 +199,7 @@ module OpenTox title = "log2(Net cell association [mL/ug(Mg)])" else title = feature.name - title += " [#{feature.unit}]" if feature.unit and !feature.unit.blank? + title += "-log10(#{feature.unit})" if feature.unit and !feature.unit.blank? 
end R.eval "image = qplot(prediction,measurement,main='#{title}',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)" R.eval "image = image + geom_abline(intercept=0, slope=1)" @@ -195,7 +207,7 @@ module OpenTox file = Mongo::Grid::File.new(File.read(tmpfile), :filename => "#{id.to_s}_correlation_plot.#{format}") plot_id = $gridfs.insert_one(file) update(:correlation_plot_id => plot_id) - end + #end $gridfs.find_one(_id: correlation_plot_id).data end @@ -215,6 +227,24 @@ module OpenTox end worst_predictions.sort_by{|sid,p| p["distance_prediction_interval"] }.to_h end + + private + + def insert_prediction prediction, type + self.nr_predictions[type] +=1 + @x[type] << prediction[:measurements].median + @y[type] << prediction[:value] + error = prediction[:value]-prediction[:measurements].median + self.rmse[type] += error**2 + self.mae[type] += error.abs + if prediction[:prediction_interval] + if prediction[:measurements].median >= prediction[:prediction_interval][0] and prediction[:measurements].median <= prediction[:prediction_interval][1] + self.within_prediction_interval[type] += 1 + else + self.out_of_prediction_interval[type] += 1 + end + end + end end end end diff --git a/lib/validation.rb b/lib/validation.rb index c9954b6..9402361 100644 --- a/lib/validation.rb +++ b/lib/validation.rb @@ -10,8 +10,6 @@ module OpenTox store_in collection: "validations" field :name, type: String field :model_id, type: BSON::ObjectId - field :nr_instances, type: Integer, default: 0 - field :nr_unpredicted, type: Integer, default: 0 field :predictions, type: Hash, default: {} field :finished_at, type: Time |