-rw-r--r--  .gitignore                       |   6
-rw-r--r--  data/enm-dump.rb                 |  16
-rw-r--r--  data/enm-import.rb               |  47
-rw-r--r--  lib/classification.rb            |   3
-rw-r--r--  lib/compound.rb                  |  20
-rw-r--r--  lib/crossvalidation.rb           | 122
-rw-r--r--  lib/dataset.rb                   | 199
-rw-r--r--  lib/feature.rb                   |  11
-rw-r--r--  lib/import.rb                    |  73
-rw-r--r--  lib/lazar.rb                     |   7
-rw-r--r--  lib/leave-one-out-validation.rb  | 115
-rw-r--r--  lib/model.rb                     | 127
-rw-r--r--  lib/nanoparticle.rb              |  69
-rw-r--r--  lib/opentox.rb                   |   6
-rw-r--r--  lib/regression.rb                |  35
-rw-r--r--  lib/substance.rb                 |  10
-rw-r--r--  lib/validation-statistics.rb     | 101
-rw-r--r--  lib/validation.rb                |  63
-rw-r--r--  test/classification.rb           |  12
-rw-r--r--  test/dataset.rb                  |  50
-rw-r--r--  test/nanoparticles.rb            |  34
-rw-r--r--  test/prediction_models.rb        |   1
-rw-r--r--  test/validation.rb               |  22
23 files changed, 633 insertions(+), 516 deletions(-)
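Editor's orientation sketch (not part of the commit): the central refactoring in this patch moves measured values out of the Dataset-wide data_entries matrix and onto each substance. Substance#toxicities maps a feature id to the array of measurements for that endpoint, and predictions become Hashes keyed by substance id. The access pattern after the patch looks roughly like this; the CSV file name is an assumption (it appears in the tests below) and a running MongoDB is required:

    require_relative 'lib/lazar.rb'
    include OpenTox

    # parse_table now stores each value in compound.toxicities[feature_id]
    # instead of appending rows to dataset.data_entries
    training_dataset = Dataset.from_csv_file "EPAFHM.mini.csv" # assumed local copy
    compound = training_dataset.compounds.first
    feature  = training_dataset.features.first

    # repeated measurements of the same endpoint stay together in one array
    measurements = compound.toxicities[feature.id.to_s]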
@@ -1,8 +1,5 @@
-last-utils
-libfminer
+R
 openbabel
-fminer_debug.txt
-test/fminer_debug.txt
 Gemfile.lock
 *.gem
 .bundle
@@ -11,3 +8,4 @@ pkg/*
 .yardoc/
 doc/
 lazar.log
+data
diff --git a/data/enm-dump.rb b/data/enm-dump.rb
new file mode 100644
index 0000000..c1c25e7
--- /dev/null
+++ b/data/enm-dump.rb
@@ -0,0 +1,16 @@
+require 'json'
+
+#get list of bundle URIs
+`wget 'https://data.enanomapper.net/bundle?media=application%2Fjson' -O bundles.json`
+json = JSON.parse File.read('./bundles.json')
+json["dataset"].each do |dataset|
+  uri = dataset["URI"]
+  id = uri.split("/").last
+  `wget --header='accept:application/json' '#{uri}' -O 'bundle#{id}'`
+  `wget --header='accept:application/json' '#{dataset["summary"]}' -O 'summary#{id}.json'`
+  `wget --header='accept:application/json' '#{dataset["compound"]}' -O 'compound#{id}.json'`
+  `wget --header='accept:application/json' '#{dataset["substance"]}' -O 'substance#{id}.json'`
+  `wget --header='accept:application/json' '#{dataset["property"]}' -O 'property#{id}.json'`
+  `wget --header='accept:application/json' '#{dataset["dataset"]}' -O 'dataset#{id}.json'`
+  `wget --header='accept:application/json' '#{dataset["matrix"]}' -O 'matrix#{id}.json'`
+end
diff --git a/data/enm-import.rb b/data/enm-import.rb
new file mode 100644
index 0000000..37bc22b
--- /dev/null
+++ b/data/enm-import.rb
@@ -0,0 +1,47 @@
+require_relative '../lib/lazar.rb'
+include OpenTox
+$mongo.database.drop
+$gridfs = $mongo.database.fs
+
+#get list of bundle URIs
+bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"]
+bundles.each do |bundle|
+  uri = bundle["URI"]
+  nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"]+"?media=application%2Fjson"))["dataEntry"]
+  features = JSON.parse(RestClientWrapper.get(bundle["property"]+"?media=application%2Fjson"))["feature"]
+  nanoparticles.each do |np|
+    nanoparticle = Nanoparticle.find_or_create_by(
+      :name => np["values"]["https://data.enanomapper.net/identifier/name"],
+      :source => np["compound"]["URI"],
+    )
+    nanoparticle.bundles << uri
+    nanoparticle.bundles.uniq!
+    np["composition"].each do |comp|
+      case comp["relation"]
+      when "HAS_CORE"
+        nanoparticle.core = comp["component"]["compound"]["URI"]
+      when "HAS_COATING"
+        nanoparticle.coating << comp["component"]["compound"]["URI"]
+      end
+    end if np["composition"]
+    np["values"].each do |u,v|
+      if u.match(/property/)
+        name, unit, source = nil
+        features.each do |uri,feat|
+          if u.match(/#{uri}/)
+            name = feat["title"]
+            unit = feat["units"]
+            source = uri
+          end
+        end
+        feature = Feature.find_or_create_by(
+          :name => name,
+          :unit => unit,
+          :source => source
+        )
+      end
+      v.each{|value| nanoparticle.parse_ambit_value feature, value} if v.is_a? Array
+    end
+    nanoparticle.save!
+  end
+end
diff --git a/lib/classification.rb b/lib/classification.rb
index b9b66f0..93b4f0f 100644
--- a/lib/classification.rb
+++ b/lib/classification.rb
@@ -9,10 +9,9 @@ module OpenTox
       sims = {}
       neighbors.each do |n|
         sim = n["tanimoto"]
-        n["features"][feature_id].each do |act|
+        n["toxicities"][feature_id].each do |act|
           sims[act] ||= []
           sims[act] << sim
-          #sims[act] << 0.5*sim+0.5 # scale to 1-0.5
         end
       end
       sim_all = sims.collect{|a,s| s}.flatten
diff --git a/lib/compound.rb b/lib/compound.rb
index 2a79fd6..049d77b 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -2,10 +2,8 @@ CACTUS_URI="http://cactus.nci.nih.gov/chemical/structure/"
 
 module OpenTox
 
-  class Compound
+  class Compound < Substance
    require_relative "unique_descriptors.rb"
-    include OpenTox
-
    DEFAULT_FINGERPRINT = "MP2D"
 
    field :inchi, type: String
@@ -19,9 +17,6 @@ module OpenTox
    field :sdf_id, type: BSON::ObjectId
    field :fingerprints, type: Hash, default: {}
    field :default_fingerprint_size, type: Integer
-    field :physchem_descriptors, type: Hash, default: {}
-    field :dataset_ids, type: Array, default: []
-    field :features, type: Hash, default: {}
 
    index({smiles: 1}, {unique: true})
@@ -293,8 +288,7 @@ module OpenTox
      training_dataset.compounds.each do |compound|
        candidate_fingerprint = compound.fingerprint params[:type]
        sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f
-        feature_values = training_dataset.values(compound,prediction_feature)
-        neighbors << {"_id" => compound.id, "features" => {prediction_feature.id.to_s => feature_values}, "tanimoto" => sim} if sim >= params[:min_sim]
+        neighbors << {"_id" => compound.id, "toxicities" => {prediction_feature.id.to_s => compound.toxicities[prediction_feature.id.to_s]}, "tanimoto" => sim} if sim >= params[:min_sim]
      end
      neighbors.sort!{|a,b| b["tanimoto"] <=> a["tanimoto"]}
    end
@@ -335,25 +329,25 @@ module OpenTox
            'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [default_fingerprint_size, '$default_fingerprint_size']}, '$$common']}]}
          }},
          '_id' => 1,
-          'features' => 1,
+          'toxicities' => 1,
          'dataset_ids' => 1
        }},
        {'$match' => {'tanimoto' => {'$gte' => params[:min_sim]}}},
        {'$sort' => {'tanimoto' => -1}}
      ]
-      $mongo["compounds"].aggregate(aggregate).select{|r| r["dataset_ids"].include? params[:training_dataset_id]}
+      $mongo["substances"].aggregate(aggregate).select{|r| r["dataset_ids"].include? params[:training_dataset_id]}
    end
 
-    # Convert mg to mmol
+    # Convert mmol to mg
    # @return [Float] value in mg
    def mmol_to_mg mmol
      mmol.to_f*molecular_weight
    end
 
-    # Convert mmol to mg
-    # @return [Float] value in mg
+    # Convert mg to mmol
+    # @return [Float] value in mmol
    def mg_to_mmol mg
      mg.to_f/molecular_weight
    end
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index 6ffeb25..50afb6f 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -6,7 +6,7 @@ module OpenTox
    field :folds, type: Integer
    field :nr_instances, type: Integer
    field :nr_unpredicted, type: Integer
-    field :predictions, type: Array, default: []
+    field :predictions, type: Hash, default: {}
    field :finished_at, type: Time
 
    def time
@@ -22,8 +22,10 @@ module OpenTox
    end
 
    def self.create model, n=10
-      model.training_dataset.features.first.nominal? ? klass = ClassificationCrossValidation : klass = RegressionCrossValidation
-      bad_request_error "#{dataset.features.first} is neither nominal nor numeric." unless klass
+      klass = ClassificationCrossValidation if model.is_a? Model::LazarClassification
+      klass = RegressionCrossValidation if model.is_a? Model::LazarRegression
+      bad_request_error "Unknown model class #{model.class}." unless klass
+
      cv = klass.new(
        name: model.name,
        model_id: model.id,
@@ -32,22 +34,22 @@ module OpenTox
      cv.save # set created_at
      nr_instances = 0
      nr_unpredicted = 0
-      predictions = []
+      predictions = {}
      training_dataset = Dataset.find model.training_dataset_id
      training_dataset.folds(n).each_with_index do |fold,fold_nr|
-        #fork do # parallel execution of validations
+        #fork do # parallel execution of validations can lead to Rserve and memory problems
          $logger.debug "Dataset #{training_dataset.name}: Fold #{fold_nr} started"
          t = Time.now
          validation = Validation.create(model, fold[0], fold[1],cv)
          $logger.debug "Dataset #{training_dataset.name}, Fold #{fold_nr}: #{Time.now-t} seconds"
        #end
      end
-      #Process.waitall
+      Process.waitall
      cv.validation_ids = Validation.where(:crossvalidation_id => cv.id).distinct(:_id)
      cv.validations.each do |validation|
        nr_instances += validation.nr_instances
        nr_unpredicted += validation.nr_unpredicted
-        predictions += validation.predictions
+        predictions.merge! validation.predictions
      end
      cv.update_attributes(
        nr_instances: nr_instances,
@@ -73,61 +75,8 @@ module OpenTox
    # TODO auc, f-measure (usability??)
 
    def statistics
-      accept_values = Feature.find(model.prediction_feature_id).accept_values
-      confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
-      weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
-      true_rate = {}
-      predictivity = {}
-      predictions.each do |pred|
-        compound_id,activities,prediction,confidence = pred
-        if activities and prediction #and confidence.numeric?
-          if activities.uniq.size == 1
-            activity = activities.uniq.first
-            if prediction == activity
-              if prediction == accept_values[0]
-                confusion_matrix[0][0] += 1
-                #weighted_confusion_matrix[0][0] += confidence
-              elsif prediction == accept_values[1]
-                confusion_matrix[1][1] += 1
-                #weighted_confusion_matrix[1][1] += confidence
-              end
-            elsif prediction != activity
-              if prediction == accept_values[0]
-                confusion_matrix[0][1] += 1
-                #weighted_confusion_matrix[0][1] += confidence
-              elsif prediction == accept_values[1]
-                confusion_matrix[1][0] += 1
-                #weighted_confusion_matrix[1][0] += confidence
-              end
-            end
-          end
-        else
-          nr_unpredicted += 1 if prediction.nil?
-        end
-      end
-      true_rate = {}
-      predictivity = {}
-      accept_values.each_with_index do |v,i|
-        true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
-        predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
-      end
-      confidence_sum = 0
-      #weighted_confusion_matrix.each do |r|
-        #r.each do |c|
-          #confidence_sum += c
-        #end
-      #end
-      update_attributes(
-        accept_values: accept_values,
-        confusion_matrix: confusion_matrix,
-        #weighted_confusion_matrix: weighted_confusion_matrix,
-        accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
-        #weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
-        true_rate: true_rate,
-        predictivity: predictivity,
-        finished_at: Time.now
-      )
-      $logger.debug "Accuracy #{accuracy}"
+      stat = ValidationStatistics.classification(predictions, Feature.find(model.prediction_feature_id).accept_values)
+      update_attributes(stat)
    end
 
    def confidence_plot
@@ -169,52 +118,11 @@ module OpenTox
    field :correlation_plot_id, type: BSON::ObjectId
 
    def statistics
-      rmse = 0
-      mae = 0
-      x = []
-      y = []
-      predictions.each do |pred|
-        compound_id,activity,prediction,confidence = pred
-        if activity and prediction
-          unless activity == [nil]
-            x << -Math.log10(activity.median)
-            y << -Math.log10(prediction)
-            error = Math.log10(prediction)-Math.log10(activity.median)
-            rmse += error**2
-            #weighted_rmse += confidence*error**2
-            mae += error.abs
-            #weighted_mae += confidence*error.abs
-            #confidence_sum += confidence
-          end
-        else
-          warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
-          $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
-        end
-      end
-      R.assign "measurement", x
-      R.assign "prediction", y
-      R.eval "r <- cor(measurement,prediction,use='complete')"
-      r = R.eval("r").to_ruby
-
-      mae = mae/predictions.size
-      #weighted_mae = weighted_mae/confidence_sum
-      rmse = Math.sqrt(rmse/predictions.size)
-      #weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
-      update_attributes(
-        mae: mae,
-        rmse: rmse,
-        #weighted_mae: weighted_mae,
-        #weighted_rmse: weighted_rmse,
-        r_squared: r**2,
-        finished_at: Time.now
-      )
-      $logger.debug "R^2 #{r**2}"
-      $logger.debug "RMSE #{rmse}"
-      $logger.debug "MAE #{mae}"
+      stat = ValidationStatistics.regression predictions
+      update_attributes(stat)
    end
 
    def misclassifications n=nil
-      #n = predictions.size unless n
      n ||= 10
      model = Model::Lazar.find(self.model_id)
      training_dataset = Dataset.find(model.training_dataset_id)
@@ -225,8 +133,7 @@ module OpenTox
        neighbors = compound.send(model.neighbor_algorithm,model.neighbor_algorithm_parameters)
        neighbors.collect! do |n|
          neighbor = Compound.find(n[0])
-          values = training_dataset.values(neighbor,prediction_feature)
-          { :smiles => neighbor.smiles, :similarity => n[1], :measurements => values}
+          { :smiles => neighbor.smiles, :similarity => n[1], :measurements => neighbor.toxicities[prediction_feature.id.to_s]}
        end
        {
          :smiles => compound.smiles,
@@ -297,5 +204,4 @@ module OpenTox
    end
  end
 
-
 end
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 5d8aeaf..b51d74b 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -5,23 +5,28 @@ module OpenTox
 
  class Dataset
 
-    # associations like has_many, belongs_to deteriorate performance
+    field :substance_ids, type: Array, default: []
    field :feature_ids, type: Array, default: []
-    field :compound_ids, type: Array, default: []
-    field :data_entries, type: Array, default: []
-    field :source, type: String
 
    # Readers
 
-    # Get all compounds
    def compounds
-      @compounds ||= self.compound_ids.collect{|id| OpenTox::Compound.find id}
-      @compounds
+      substances.select{|s| s.is_a? Compound}
+    end
+
+    def nanoparticles
+      substances.select{|s| s.is_a? Nanoparticle}
+    end
+
+    # Get all substances
+    def substances
+      @substances ||= substance_ids.collect{|id| OpenTox::Substance.find id}
+      @substances
    end
 
    # Get all features
    def features
-      @features ||= self.feature_ids.collect{|id| OpenTox::Feature.find(id)}
+      @features ||= feature_ids.collect{|id| OpenTox::Feature.find(id)}
      @features
    end
 
@@ -29,17 +34,15 @@ module OpenTox
    # @param compound [OpenTox::Compound] OpenTox Compound object
    # @param feature [OpenTox::Feature] OpenTox Feature object
    # @return [Array] Data entry values
-    def values(compound, feature)
-      rows = compound_ids.each_index.select{|r| compound_ids[r] == compound.id }
-      col = feature_ids.index feature.id
-      rows.collect{|row| data_entries[row][col]}
-    end
+    #def values(compound, feature)
+      #data_entries[compound.id.to_s][feature.id.to_s]
+    #end
 
    # Writers
 
    # Set compounds
    def compounds=(compounds)
-      self.compound_ids = compounds.collect{|c| c.id}
+      self.substance_ids = compounds.collect{|c| c.id}
    end
 
    # Set features
@@ -53,13 +56,7 @@ module OpenTox
    # @param [Integer] number of folds
    # @return [Array] Array with folds [training_dataset,test_dataset]
    def folds n
-      unique_compound_data = {}
-      compound_ids.each_with_index do |cid,i|
-        unique_compound_data[cid] ||= []
-        unique_compound_data[cid] << data_entries[i]
-      end
-      unique_compound_ids = unique_compound_data.keys
-      len = unique_compound_ids.size
+      len = self.substance_ids.size
      indices = (0..len-1).to_a.shuffle
      mid = (len/n)
      chunks = []
@@ -68,24 +65,15 @@ module OpenTox
        last = start+mid
        last = last-1 unless len%n >= i
        test_idxs = indices[start..last] || []
-        test_cids = test_idxs.collect{|i| unique_compound_ids[i]}
+        test_cids = test_idxs.collect{|i| substance_ids[i]}
        training_idxs = indices-test_idxs
-        training_cids = training_idxs.collect{|i| unique_compound_ids[i]}
-        chunk = [training_cids,test_cids].collect do |unique_cids|
-          cids = []
-          data_entries = []
-          unique_cids.each do |cid|
-            unique_compound_data[cid].each do |de|
-              cids << cid
-              data_entries << de
-            end
-          end
-          dataset = self.class.new(:compound_ids => cids, :feature_ids => self.feature_ids, :data_entries => data_entries, :source => self.id )
+        training_cids = training_idxs.collect{|i| substance_ids[i]}
+        chunk = [training_cids,test_cids].collect do |cids|
+          dataset = self.class.create(:substance_ids => cids, :feature_ids => feature_ids, :source => self.id )
          dataset.compounds.each do |compound|
            compound.dataset_ids << dataset.id
            compound.save
          end
-          dataset.save
          dataset
        end
        start = last+1
@@ -94,41 +82,28 @@ module OpenTox
      chunks
    end
 
-    # Diagnostics
-
-    def duplicates feature=self.features.first
-      col = feature_ids.index feature.id
-      dups = {}
-      compound_ids.each_with_index do |cid,i|
-        rows = compound_ids.each_index.select{|r| compound_ids[r] == cid }
-        values = rows.collect{|row| data_entries[row][col]}
-        dups[cid] = values if values.size > 1
-      end
-      dups
-    end
-
-    def correlation_plot training_dataset
-      # TODO: create/store svg
-      R.assign "features", data_entries
-      R.assign "activities", training_dataset.data_entries.collect{|de| de.first}
-      R.eval "featurePlot(features,activities)"
-    end
-
-    def density_plot
-      # TODO: create/store svg
-      R.assign "acts", data_entries.collect{|r| r.first }#.compact
-      R.eval "plot(density(-log(acts),na.rm= TRUE), main='-log(#{features.first.name})')"
-    end
-
    # Serialisation
 
    # converts dataset to csv format including compound smiles as first column, other column headers are feature names
    # @return [String]
    def to_csv(inchi=false)
-      CSV.generate() do |csv| #{:force_quotes=>true}
-        csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name}
-        compounds.each_with_index do |c,i|
-          csv << [inchi ? c.inchi : c.smiles] + data_entries[i]
+      CSV.generate() do |csv|
+        compound = Substance.find(substance_ids.first).is_a? Compound
+        if compound
+          csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name}
+        else
+          csv << ["Name"] + features.collect{|f| f.name}
+        end
+        substances.each do |substance|
+          features.each do |f|
+            substance.toxicities[f.id.to_s].each do |v|
+              if compound
+                csv << [inchi ? substance.inchi : substance.smiles , v]
+              else
+                csv << [substance.name , v]
+              end
+            end if substance.toxicities[f.id.to_s]
+          end
        end
      end
    end
@@ -144,7 +119,7 @@ module OpenTox
 
    # Create a dataset from CSV file
    # TODO: document structure
-    def self.from_csv_file file, source=nil, bioassay=true#, layout={}
+    def self.from_csv_file file, source=nil
      source ||= file
      name = File.basename(file,".*")
      dataset = self.find_by(:source => source, :name => name)
@@ -154,51 +129,40 @@ module OpenTox
        $logger.debug "Parsing #{file}."
        table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8'
        dataset = self.new(:source => source, :name => name)
-        dataset.parse_table table, bioassay#, layout
+        dataset.parse_table table
      end
      dataset
    end
 
    # parse data in tabular format (e.g. from csv)
    # does a lot of guesswork in order to determine feature types
-    def parse_table table, bioassay=true
+    def parse_table table
 
      time = Time.now
 
      # features
      feature_names = table.shift.collect{|f| f.strip}
-      warnings << "Duplicate features in table header." unless feature_names.size == feature_names.uniq.size
+      warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size
      compound_format = feature_names.shift.strip
+      # TODO nanoparticles
      bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i
-
      numeric = []
      # guess feature types
      feature_names.each_with_index do |f,i|
        metadata = {:name => f}
        values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact
        types = values.collect{|v| v.numeric? ? true : false}.uniq
+        feature = nil
        if values.size == 0 # empty feature
        elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes
          metadata["numeric"] = true
          numeric[i] = true
+          feature = NumericFeature.find_or_create_by(metadata)
        else
          metadata["nominal"] = true
          metadata["accept_values"] = values
          numeric[i] = false
-        end
-        if bioassay
-          if metadata["numeric"]
-            feature = NumericBioAssay.find_or_create_by(metadata)
-          elsif metadata["nominal"]
-            feature = NominalBioAssay.find_or_create_by(metadata)
-          end
-        else
-          metadata.merge({:measured => false, :calculated => true})
-          if metadata["numeric"]
-            feature = NumericFeature.find_or_create_by(metadata)
-          elsif metadata["nominal"]
-            feature = NominalFeature.find_or_create_by(metadata)
-          end
+          feature = NominalFeature.find_or_create_by(metadata)
        end
        feature_ids << feature.id if feature
      end
@@ -211,59 +175,54 @@ module OpenTox
      value_time = 0
 
      # compounds and values
-      self.data_entries = []
 
      table.each_with_index do |vals,i|
        ct = Time.now
        identifier = vals.shift.strip
-        warnings << "No feature values for compound at position #{i+2}." if vals.compact.empty?
+        warn "No feature values for compound at position #{i+2}." if vals.compact.empty?
        begin
          case compound_format
          when /SMILES/i
            compound = OpenTox::Compound.from_smiles(identifier)
          when /InChI/i
            compound = OpenTox::Compound.from_inchi(identifier)
+          # TODO nanoparticle
          end
        rescue
          compound = nil
        end
-        if compound.nil?
-          # compound parsers may return nil
-          warnings << "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored."
+        if compound.nil? # compound parsers may return nil
+          warn "Cannot parse #{compound_format} compound '#{identifier}' at position #{i+2}, all entries are ignored."
          next
        end
+        substance_ids << compound.id
        compound.dataset_ids << self.id unless compound.dataset_ids.include? self.id
        compound_time += Time.now-ct
 
        r += 1
-        unless vals.size == feature_ids.size # way cheaper than accessing features
-          warnings << "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored."
+        unless vals.size == feature_ids.size
+          warn "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored."
          next
        end
-        compound_ids << compound.id
-        table.first.size == 0 ? self.data_entries << Array.new(0) : self.data_entries << Array.new(table.first.size-1)
-
        vals.each_with_index do |v,j|
          if v.blank?
-            warnings << "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})."
+            warn "Empty value for compound '#{identifier}' (row #{r+2}) and feature '#{feature_names[j]}' (column #{j+2})."
            next
          elsif numeric[j]
            v = v.to_f
          else
            v = v.strip
          end
-          self.data_entries.last[j] = v
-          #i = compound.feature_ids.index feature_ids[j]
-          compound.features[feature_ids[j].to_s] ||= []
-          compound.features[feature_ids[j].to_s] << v
+          compound.toxicities[feature_ids[j].to_s] ||= []
+          compound.toxicities[feature_ids[j].to_s] << v
          compound.save
        end
      end
      compounds.duplicates.each do |compound|
        positions = []
        compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == compound.inchi}
-        warnings << "Duplicate compound #{compound.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
+        warn "Duplicate compound #{compound.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
      end
 
      $logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})"
@@ -273,52 +232,26 @@ module OpenTox
 
    end
 
-    # Fill unset data entries
-    # @param any value
-    def fill_nil_with n
-      (0 .. compound_ids.size-1).each do |i|
-        data_entries[i] ||= []
-        (0 .. feature_ids.size-1).each do |j|
-          data_entries[i][j] ||= n
-        end
-      end
-    end
-
  end
 
  # Dataset for lazar predictions
-  class LazarPrediction < Dataset
+  class LazarPrediction #< Dataset
    field :creator, type: String
-    field :prediction_feature_id, type: String
+    field :prediction_feature_id, type: BSON::ObjectId
+    field :predictions, type: Hash, default: {}
 
    def prediction_feature
      Feature.find prediction_feature_id
    end
-  end
-
-  # Dataset for descriptors (physchem)
-  class DescriptorDataset < Dataset
-    field :feature_calculation_algorithm, type: String
-
-  end
-
-  class ScaledDataset < DescriptorDataset
-
-    field :centers, type: Array, default: []
-    field :scales, type: Array, default: []
+    def compounds
+      substances.select{|s| s.is_a? Compound}
+    end
 
-    def original_value value, i
-      value * scales[i] + centers[i]
+    def substances
+      predictions.keys.collect{|id| Substance.find id}
    end
-  end
 
-  # Dataset for fminer descriptors
-  class FminerDataset < DescriptorDataset
-    field :training_algorithm, type: String
-    field :training_dataset_id, type: BSON::ObjectId
-    field :training_feature_id, type: BSON::ObjectId
-    field :training_parameters, type: Hash
  end
 
 end
diff --git a/lib/feature.rb b/lib/feature.rb
index b58946b..c6fb68a 100644
--- a/lib/feature.rb
+++ b/lib/feature.rb
@@ -6,6 +6,9 @@ module OpenTox
    field :numeric, type: Boolean
    field :measured, type: Boolean
    field :calculated, type: Boolean
+    field :category, type: String
+    field :unit, type: String
+    field :conditions, type: Hash
  end
 
  # Feature for categorical variables
@@ -34,12 +37,4 @@ module OpenTox
    end
  end
 
-  # Feature for categorical bioassay results
-  class NominalBioAssay < NominalFeature
-  end
-
-  # Feature for quantitative bioassay results
-  class NumericBioAssay < NumericFeature
-  end
-
 end
diff --git a/lib/import.rb b/lib/import.rb
new file mode 100644
index 0000000..9091207
--- /dev/null
+++ b/lib/import.rb
@@ -0,0 +1,73 @@
+module OpenTox
+
+  module Import
+
+    class Enanomapper
+      include OpenTox
+
+      def self.import
+        #get list of bundle URIs
+        bundles = JSON.parse(RestClientWrapper.get('https://data.enanomapper.net/bundle?media=application%2Fjson'))["dataset"]
+        datasets = []
+        bundles.each do |bundle|
+          uri = bundle["URI"]
+          dataset = Dataset.find_or_create_by(:source => bundle["URI"],:name => bundle["title"])
+          nanoparticles = JSON.parse(RestClientWrapper.get(bundle["dataset"]+"?media=application%2Fjson"))["dataEntry"]
+          features = JSON.parse(RestClientWrapper.get(bundle["property"]+"?media=application%2Fjson"))["feature"]
+          nanoparticles.each do |np|
+            nanoparticle = Nanoparticle.find_or_create_by(
+              :name => np["values"]["https://data.enanomapper.net/identifier/name"],
+              :source => np["compound"]["URI"],
+            )
+            dataset.substance_ids << nanoparticle.id
+            dataset.substance_ids.uniq!
+            studies = JSON.parse(RestClientWrapper.get(File.join(np["compound"]["URI"],"study")))["study"]
+            studies.each do |study|
+              study["effects"].each do |effect|
+                effect["result"]["textValue"] ? klass = NominalFeature : klass = NumericFeature
+                # TODO parse core/coating
+                # TODO parse proteomics, they come as a large textValue
+                $logger.debug File.join(np["compound"]["URI"],"study")
+                effect["conditions"].delete_if { |k, v| v.nil? }
+                feature = klass.find_or_create_by(
+                  :source => File.join(np["compound"]["URI"],"study"),
+                  :name => "#{study["protocol"]["category"]["title"]} #{study["protocol"]["endpoint"]}",
+                  :unit => effect["result"]["unit"],
+                  :category => study["protocol"]["topcategory"],
+                  :conditions => effect["conditions"]
+                )
+                nanoparticle.parse_ambit_value feature, effect["result"]
+                dataset.feature_ids << feature.id
+                dataset.feature_ids.uniq!
+              end
+            end
+          end
+          dataset.save
+          datasets << dataset
+        end
+        datasets.collect{|d| d.id}
+      end
+
+      def self.dump
+        #get list of bundle URIs
+        `wget 'https://data.enanomapper.net/bundle?media=application%2Fjson' -O bundles.json`
+        json = JSON.parse File.read('./bundles.json')
+        json["dataset"].each do |dataset|
+          uri = dataset["URI"]
+          id = uri.split("/").last
+          `wget --header='accept:application/json' '#{uri}' -O 'bundle#{id}'`
+          `wget --header='accept:application/json' '#{dataset["summary"]}' -O 'summary#{id}.json'`
+          `wget --header='accept:application/json' '#{dataset["compound"]}' -O 'compound#{id}.json'`
+          `wget --header='accept:application/json' '#{dataset["substance"]}' -O 'substance#{id}.json'`
+          `wget --header='accept:application/json' '#{dataset["property"]}' -O 'property#{id}.json'`
+          `wget --header='accept:application/json' '#{dataset["dataset"]}' -O 'dataset#{id}.json'`
+          `wget --header='accept:application/json' '#{dataset["matrix"]}' -O 'matrix#{id}.json'`
+        end
+      end
+
+    end
+
+  end
+
+end
diff --git a/lib/lazar.rb b/lib/lazar.rb
index a28ba3a..8eb46e0 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -61,7 +61,8 @@ suppressPackageStartupMessages({
 "
 
 # OpenTox classes and includes
-CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules
+#CLASSES = ["Feature","Substance::Compound","Substance::Nanoparticle","Dataset","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules
+CLASSES = ["Feature","Substance","Dataset","LazarPrediction","Validation","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules
 
 [ # be aware of the require sequence as it affects class/method overwrites
  "overwrite.rb",
@@ -70,7 +71,9 @@ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","LeaveO
  "opentox.rb",
  "feature.rb",
  "physchem.rb",
+  "substance.rb",
  "compound.rb",
+  "nanoparticle.rb",
  "dataset.rb",
  "algorithm.rb",
  "model.rb",
@@ -79,6 +82,8 @@ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation","LeaveO
  "validation.rb",
  "crossvalidation.rb",
  "leave-one-out-validation.rb",
+  "validation-statistics.rb",
  "experiment.rb",
+  "import.rb",
 ].each{ |f| require_relative f }
 OpenTox::PhysChem.descriptors # load descriptor features
diff --git a/lib/leave-one-out-validation.rb b/lib/leave-one-out-validation.rb
index 0a131a4..ed917eb 100644
--- a/lib/leave-one-out-validation.rb
+++ b/lib/leave-one-out-validation.rb
@@ -6,22 +6,31 @@ module OpenTox
    field :dataset_id, type: BSON::ObjectId
    field :nr_instances, type: Integer
    field :nr_unpredicted, type: Integer
-    field :predictions, type: Array
+    field :predictions, type: Hash
    field :finished_at, type: Time
 
    def self.create model
+      $logger.debug "#{model.name}: LOO validation started"
+      t = Time.now
      model.training_dataset.features.first.nominal? ? klass = ClassificationLeaveOneOutValidation : klass = RegressionLeaveOneOutValidation
      loo = klass.new :model_id => model.id, :dataset_id => model.training_dataset_id
-      compound_ids = model.training_dataset.compound_ids
      predictions = model.predict model.training_dataset.compounds
-      predictions = predictions.each_with_index {|p,i| p[:compound_id] = compound_ids[i]}
-      predictions.select!{|p| p[:database_activities] and !p[:database_activities].empty?}
+      predictions.each{|cid,p| p.delete(:neighbors)}
+      nr_unpredicted = 0
+      predictions.each do |cid,prediction|
+        if prediction[:value]
+          prediction[:measured] = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s]
+        else
+          nr_unpredicted += 1
+        end
+        predictions.delete(cid) unless prediction[:value] and prediction[:measured]
+      end
      loo.nr_instances = predictions.size
-      predictions.select!{|p| p[:value]} # remove unpredicted
-      loo.predictions = predictions#.sort{|a,b| b[:confidence] <=> a[:confidence]}
-      loo.nr_unpredicted = loo.nr_instances - loo.predictions.size
+      loo.nr_unpredicted = nr_unpredicted
+      loo.predictions = predictions
      loo.statistics
      loo.save
+      $logger.debug "#{model.name}, LOO validation: #{Time.now-t} seconds"
      loo
    end
 
@@ -42,53 +51,8 @@ module OpenTox
    field :confidence_plot_id, type: BSON::ObjectId
 
    def statistics
-      accept_values = Feature.find(model.prediction_feature_id).accept_values
-      confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
-      weighted_confusion_matrix = Array.new(accept_values.size,0){Array.new(accept_values.size,0)}
-      predictions.each do |pred|
-        pred[:database_activities].each do |db_act|
-          if pred[:value]
-            if pred[:value] == db_act
-              if pred[:value] == accept_values[0]
-                confusion_matrix[0][0] += 1
-                #weighted_confusion_matrix[0][0] += pred[:confidence]
-              elsif pred[:value] == accept_values[1]
-                confusion_matrix[1][1] += 1
-                #weighted_confusion_matrix[1][1] += pred[:confidence]
-              end
-            else
-              if pred[:value] == accept_values[0]
-                confusion_matrix[0][1] += 1
-                #weighted_confusion_matrix[0][1] += pred[:confidence]
-              elsif pred[:value] == accept_values[1]
-                confusion_matrix[1][0] += 1
-                #weighted_confusion_matrix[1][0] += pred[:confidence]
-              end
-            end
-          end
-        end
-      end
-      accept_values.each_with_index do |v,i|
-        true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f
-        predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f
-      end
-      confidence_sum = 0
-#      weighted_confusion_matrix.each do |r|
-#        r.each do |c|
-#          confidence_sum += c
-#        end
-#      end
-      update_attributes(
-        accept_values: accept_values,
-        confusion_matrix: confusion_matrix,
-#        weighted_confusion_matrix: weighted_confusion_matrix,
-        accuracy: (confusion_matrix[0][0]+confusion_matrix[1][1])/(nr_instances-nr_unpredicted).to_f,
-#        weighted_accuracy: (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f,
-        true_rate: true_rate,
-        predictivity: predictivity,
-        finished_at: Time.now
-      )
-      $logger.debug "Accuracy #{accuracy}"
+      stat = ValidationStatistics.classification(predictions, Feature.find(model.prediction_feature_id).accept_values)
+      update_attributes(stat)
    end
 
    def confidence_plot
@@ -123,52 +87,15 @@ module OpenTox
 
  class RegressionLeaveOneOutValidation < LeaveOneOutValidation
-
-    field :rmse, type: Float, default: 0.0
+    field :rmse, type: Float, default: 0
    field :mae, type: Float, default: 0
-    #field :weighted_rmse, type: Float, default: 0
-    #field :weighted_mae, type: Float, default: 0
    field :r_squared, type: Float
    field :correlation_plot_id, type: BSON::ObjectId
    field :confidence_plot_id, type: BSON::ObjectId
 
    def statistics
-      confidence_sum = 0
-      predicted_values = []
-      measured_values = []
-      predictions.each do |pred|
-        pred[:database_activities].each do |activity|
-          if pred[:value]
-            predicted_values << pred[:value]
-            measured_values << activity
-            error = Math.log10(pred[:value])-Math.log10(activity)
-            self.rmse += error**2
-            #self.weighted_rmse += pred[:confidence]*error**2
-            self.mae += error.abs
-            #self.weighted_mae += pred[:confidence]*error.abs
-            #confidence_sum += pred[:confidence]
-          end
-        end
-        if pred[:database_activities].empty?
-          warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
-          $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
-        end
-      end
-      R.assign "measurement", measured_values
-      R.assign "prediction", predicted_values
-      R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
-      r = R.eval("r").to_ruby
-
-      self.mae = self.mae/predictions.size
-      #self.weighted_mae = self.weighted_mae/confidence_sum
-      self.rmse = Math.sqrt(self.rmse/predictions.size)
-      #self.weighted_rmse = Math.sqrt(self.weighted_rmse/confidence_sum)
-      self.r_squared = r**2
-      self.finished_at = Time.now
-      save
-      $logger.debug "R^2 #{r**2}"
-      $logger.debug "RMSE #{rmse}"
-      $logger.debug "MAE #{mae}"
+      stat = ValidationStatistics.regression predictions
+      update_attributes(stat)
    end
 
    def correlation_plot
diff --git a/lib/model.rb b/lib/model.rb
index 8e657b8..b82f098 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -20,6 +20,10 @@ module OpenTox
      def training_dataset
        Dataset.find(training_dataset_id)
      end
+
+      def prediction_feature
+        Feature.find(prediction_feature_id)
+      end
    end
 
    class Lazar < Model
@@ -31,12 +35,10 @@ module OpenTox
      # Create a lazar model from a training_dataset and a feature_dataset
      # @param [OpenTox::Dataset] training_dataset
      # @return [OpenTox::Model::Lazar] Regression or classification model
-      def initialize training_dataset, params={}
+      def initialize prediction_feature, training_dataset, params={}
        super params
 
-        # TODO document convention
-        prediction_feature = training_dataset.features.first
        # set defaults for empty parameters
        self.prediction_feature_id ||= prediction_feature.id
        self.training_dataset_id ||= training_dataset.id
@@ -48,7 +50,6 @@ module OpenTox
      end
 
      def predict_compound compound
-        prediction_feature = Feature.find prediction_feature_id
        neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters)
        # remove neighbors without prediction_feature
        # check for database activities (neighbors may include query compound)
@@ -56,12 +57,13 @@ module OpenTox
        prediction = {}
        if neighbors.collect{|n| n["_id"]}.include? compound.id
 
-          database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["features"][prediction_feature.id.to_s].uniq
+          #TODO restrict to dataset features
+          database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["toxicities"][prediction_feature.id.to_s].uniq
          prediction[:database_activities] = database_activities
          prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound."
          neighbors.delete_if{|n| n["_id"] == compound.id}
        end
-        neighbors.delete_if{|n| n['features'].empty? or n['features'][prediction_feature.id.to_s] == [nil] }
+        neighbors.delete_if{|n| n['toxicities'].empty? or n['toxicities'][prediction_feature.id.to_s] == [nil] }
        if neighbors.empty?
          prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset.",:neighbors => []})
        else
@@ -78,62 +80,55 @@ module OpenTox
 
        # parse data
        compounds = []
-        case object.class.to_s
-        when "OpenTox::Compound"
+        if object.is_a? Substance
          compounds = [object]
-        when "Array"
+        elsif object.is_a? Array
          compounds = object
-        when "OpenTox::Dataset"
+        elsif object.is_a? Dataset
          compounds = object.compounds
        else
          bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter."
        end
 
        # make predictions
-        predictions = []
-        predictions = compounds.collect{|c| predict_compound c}
+        predictions = {}
+        compounds.each do |c|
+          predictions[c.id.to_s] = predict_compound c
+          predictions[c.id.to_s][:prediction_feature_id] = prediction_feature_id
+        end
 
        # serialize result
-        case object.class.to_s
-        when "OpenTox::Compound"
-          prediction = predictions.first
+        if object.is_a? Substance
+          prediction = predictions[compounds.first.id.to_s]
          prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} # sort according to similarity
          return prediction
-        when "Array"
+        elsif object.is_a? Array
          return predictions
-        when "OpenTox::Dataset"
+        elsif object.is_a? Dataset
+          predictions.each{|cid,p| p.delete(:neighbors)}
          # prepare prediction dataset
          measurement_feature = Feature.find prediction_feature_id
-          prediction_feature = OpenTox::NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" )
-          prediction_dataset = LazarPrediction.new(
+          prediction_feature = NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" )
+          prediction_dataset = LazarPrediction.create(
            :name => "Lazar prediction for #{prediction_feature.name}",
            :creator =>  __FILE__,
-            :prediction_feature_id => prediction_feature.id
-
+            :prediction_feature_id => prediction_feature.id,
+            :predictions => predictions
          )
-          confidence_feature = OpenTox::NumericFeature.find_or_create_by( "name" => "Model RMSE" )
-          warning_feature = OpenTox::NominalFeature.find_or_create_by("name" => "Warnings")
-          prediction_dataset.features = [ prediction_feature, confidence_feature, measurement_feature, warning_feature ]
-          prediction_dataset.compounds = compounds
-          prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:rmse] , p[:dataset_activities].to_s, p[:warning]]}
-          prediction_dataset.save
+
+          #prediction_dataset.save
          return prediction_dataset
        end
 
      end
-
-      def training_activities
-        i = training_dataset.feature_ids.index prediction_feature_id
-        training_dataset.data_entries.collect{|de| de[i]}
-      end
-
    end
 
    class LazarClassification < Lazar
 
-      def self.create training_dataset, params={}
-        model = self.new training_dataset, params
+      def self.create prediction_feature, training_dataset, params={}
+        model = self.new prediction_feature, training_dataset, params
        model.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" unless model.prediction_algorithm
        model.neighbor_algorithm ||= "fingerprint_neighbors"
        model.neighbor_algorithm_parameters ||= {}
@@ -151,8 +146,8 @@ module OpenTox
 
    class LazarRegression < Lazar
 
-      def self.create training_dataset, params={}
-        model = self.new training_dataset, params
+      def self.create prediction_feature, training_dataset, params={}
+        model = self.new prediction_feature, training_dataset, params
        model.neighbor_algorithm ||= "fingerprint_neighbors"
        model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.local_fingerprint_regression"
        model.neighbor_algorithm_parameters ||= {}
@@ -173,13 +168,13 @@ module OpenTox
      include Mongoid::Document
      include Mongoid::Timestamps
 
-      # TODO field Validations
      field :endpoint, type: String
      field :species, type: String
      field :source, type: String
      field :unit, type: String
      field :model_id, type: BSON::ObjectId
      field :repeated_crossvalidation_id, type: BSON::ObjectId
+      field :leave_one_out_validation_id, type: BSON::ObjectId
 
      def predict object
        Lazar.find(model_id).predict object
@@ -201,12 +196,16 @@ module OpenTox
        repeated_crossvalidation.crossvalidations
      end
 
+      def leave_one_out_validation
+        LeaveOneOutValidation.find leave_one_out_validation_id
+      end
+
      def regression?
-        training_dataset.features.first.numeric?
+        model.is_a? LazarRegression
      end
 
      def classification?
-        training_dataset.features.first.nominal?
+        model.is_a? LazarClassification
      end
 
      def self.from_csv_file file
@@ -214,19 +213,61 @@ module OpenTox
        bad_request_error "No metadata file #{metadata_file}" unless File.exist? metadata_file
        prediction_model = self.new JSON.parse(File.read(metadata_file))
        training_dataset = Dataset.from_csv_file file
+        prediction_feature = training_dataset.features.first
        model = nil
-        if training_dataset.features.first.nominal?
-          model = LazarClassification.create training_dataset
-        elsif training_dataset.features.first.numeric?
-          model = LazarRegression.create training_dataset
+        if prediction_feature.nominal?
+          model = LazarClassification.create prediction_feature, training_dataset
+        elsif prediction_feature.numeric?
+          model = LazarRegression.create prediction_feature, training_dataset
        end
        prediction_model[:model_id] = model.id
+        prediction_model[:prediction_feature_id] = prediction_feature.id
        prediction_model[:repeated_crossvalidation_id] = RepeatedCrossValidation.create(model).id
+        prediction_model[:leave_one_out_validation_id] = LeaveOneOutValidation.create(model).id
        prediction_model.save
        prediction_model
      end
 
    end
 
+    class NanoLazar
+      include OpenTox
+      include Mongoid::Document
+      include Mongoid::Timestamps
+      store_in collection: "models"
+
+      field :name, type: String
+      field :creator, type: String, default: __FILE__
+      # datasets
+      field :training_dataset_id, type: BSON::ObjectId
+      # algorithms
+      field :prediction_algorithm, type: String
+      # prediction feature
+      field :prediction_feature_id, type: BSON::ObjectId
+      field :training_particle_ids, type: Array
+
+      def self.create_all
+        nanoparticles = Nanoparticle.all
+        toxfeatures = Nanoparticle.all.collect{|np| np.toxicities.keys}.flatten.uniq.collect{|id| Feature.find id}
+        tox = {}
+        toxfeatures.each do |t|
+          tox[t] = nanoparticles.select{|np| np.toxicities.keys.include? t.id.to_s}
+        end
+        tox.select!{|t,nps| nps.size > 50}
+        tox.collect do |t,nps|
+          find_or_create_by(:prediction_feature_id => t.id, :training_particle_ids => nps.collect{|np| np.id})
+        end
+      end
+
+      def predict nanoparticle
+        training = training_particle_ids.collect{|id| Nanoparticle.find id}
+        training_features = training.collect{|t| t.physchem_descriptors.keys}.flatten.uniq
+        query_features = nanoparticle.physchem_descriptors.keys
+        common_features = (training_features & query_features)
+        #p common_features
+      end
+
+    end
+
  end
 end
diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb
new file mode 100644
index 0000000..b934bb3
--- /dev/null
+++ b/lib/nanoparticle.rb
@@ -0,0 +1,69 @@
+module OpenTox
+
+  class Nanoparticle < Substance
+    include OpenTox
+
+    field :core, type: String
+    field :coating, type: Array, default: []
+    field :bundles, type: Array, default: []
+
+    def nanoparticle_neighbors params
+      Dataset.find(params[:training_dataset_id]).nanoparticles
+    end
+
+    def add_feature feature, value
+      case feature.category
+      when "P-CHEM"
+        physchem_descriptors[feature.id.to_s] ||= []
+        physchem_descriptors[feature.id.to_s] << value
+      when "TOX"
+        toxicities[feature.id.to_s] ||= []
+        toxicities[feature.id.to_s] << value
+      else
+        warn "Unknown feature type '#{feature.category}'. Value '#{value}' not inserted."
+      end
+      save
+    end
+
+    def parse_ambit_value feature, v
+      v.delete "unit"
+      # TODO: mmol/log10 conversion
+      if v.keys == ["textValue"]
+        add_feature feature, v["textValue"]
+      elsif v.keys == ["loValue"]
+        add_feature feature, v["loValue"]
+      elsif v.keys.size == 2 and v["errorValue"]
+        add_feature feature, v["loValue"]
+        warn "Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'."
+      elsif v.keys.size == 2 and v["loQualifier"] == "mean"
+        add_feature feature, v["loValue"]
+        warn "'#{feature.name}' is a mean value. Original data is not available."
+      elsif v.keys.size == 2 and v["loQualifier"] #== ">="
+        warn "Only min value available for '#{feature.name}', entry ignored"
+      elsif v.keys.size == 2 and v["upQualifier"] #== ">="
+        warn "Only max value available for '#{feature.name}', entry ignored"
+      elsif v.keys.size == 3 and v["loValue"] and v["loQualifier"].nil? and v["upQualifier"].nil?
+        add_feature feature, v["loValue"]
+        warn "loQualifier and upQualifier are empty."
+      elsif v.keys.size == 3 and v["loValue"] and v["loQualifier"] == "" and v["upQualifier"] == ""
+        add_feature feature, v["loValue"]
+        warn "loQualifier and upQualifier are empty."
+      elsif v.keys.size == 4 and v["loValue"] and v["loQualifier"].nil? and v["upQualifier"].nil?
+        add_feature feature, v["loValue"]
+        warn "loQualifier and upQualifier are empty."
+      elsif v.size == 4 and v["loQualifier"] and v["upQualifier"] and v["loValue"] and v["upValue"]
+        add_feature feature, [v["loValue"],v["upValue"]].mean
+        warn "Using mean value of range #{v["loValue"]} - #{v["upValue"]} for '#{feature.name}'. Original data is not available."
+      elsif v.size == 4 and v["loQualifier"] == "mean" and v["errorValue"]
+        warn "'#{feature.name}' is a mean value. Original data is not available. Ignoring errorValue '#{v["errorValue"]}' for '#{feature.name}'."
+        add_feature feature, v["loValue"]
+      elsif v == {} # do nothing
+      else
+        warn "Cannot parse Ambit eNanoMapper value '#{v}' for feature '#{feature.name}'."
+ end + end + + end +end + + diff --git a/lib/opentox.rb b/lib/opentox.rb index 186c87a..7d8a8a2 100644 --- a/lib/opentox.rb +++ b/lib/opentox.rb @@ -13,7 +13,13 @@ module OpenTox include Mongoid::Timestamps store_in collection: klass.downcase.pluralize field :name, type: String + field :source, type: String field :warnings, type: Array, default: [] + + def warn warning + $logger.warn warning + warnings << warning + end end OpenTox.const_set klass,c end diff --git a/lib/regression.rb b/lib/regression.rb index 5021fb3..cb17f25 100644 --- a/lib/regression.rb +++ b/lib/regression.rb @@ -9,8 +9,8 @@ module OpenTox neighbors = params[:neighbors] neighbors.each do |row| sim = row["tanimoto"] - if row["features"][params[:prediction_feature_id].to_s] - row["features"][params[:prediction_feature_id].to_s].each do |act| + if row["toxicities"][params[:prediction_feature_id].to_s] + row["toxicities"][params[:prediction_feature_id].to_s].each do |act| weighted_sum += sim*Math.log10(act) sim_sum += sim end @@ -32,8 +32,8 @@ module OpenTox neighbors.each_with_index do |row,i| neighbor = Compound.find row["_id"] fingerprint = neighbor.fingerprint - if row["features"][params[:prediction_feature_id].to_s] - row["features"][params[:prediction_feature_id].to_s].each do |act| + if row["toxicities"][params[:prediction_feature_id].to_s] + row["toxicities"][params[:prediction_feature_id].to_s].each do |act| activities << Math.log10(act) weights << row["tanimoto"] fingerprint_ids.each_with_index do |id,j| @@ -79,21 +79,24 @@ module OpenTox neighbors = params[:neighbors] return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0 - return {:value => neighbors.first["features"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1 + return {:value => neighbors.first["toxicities"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1 activities = [] weights = [] physchem = {} - neighbors.each_with_index do |row,i| - neighbor = Compound.find row["_id"] - if row["features"][params[:prediction_feature_id].to_s] - row["features"][params[:prediction_feature_id].to_s].each do |act| - activities << Math.log10(act) - weights << row["tanimoto"] # TODO cosine ? - neighbor.physchem.each do |pid,v| # insert physchem only if there is an activity + neighbors.each_with_index do |n,i| + if n["toxicities"][params[:prediction_feature_id].to_s] + n["toxicities"][params[:prediction_feature_id].to_s].each do |act| + # TODO fix!!!! + activities << -Math.log10(act) + #if act.numeric? + #activities << act + n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ? + neighbor = Substance.find(n["_id"]) + neighbor.physchem_descriptors.each do |pid,v| # insert physchem only if there is an activity physchem[pid] ||= [] - physchem[pid] << v + physchem[pid] += v end end end @@ -110,8 +113,8 @@ module OpenTox return result else - data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid] } - prediction = r_model_prediction method, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem[pid]} + data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid].collect{|v| "\"#{v.sub('[','').sub(']','')}\"" if v.is_a? 
String }} + prediction = r_model_prediction method, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem_descriptors[pid]} if prediction.nil? prediction = local_weighted_average(compound, params) prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds." @@ -127,6 +130,8 @@ module OpenTox def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values R.assign "weights", training_weights r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})" + #p r_data_frame + File.open("tmp.R","w+"){|f| f.puts "data <- #{r_data_frame}\n"} R.eval "data <- #{r_data_frame}" R.assign "features", training_features R.eval "names(data) <- append(c('activities'),features)" # diff --git a/lib/substance.rb b/lib/substance.rb new file mode 100644 index 0000000..82ca65d --- /dev/null +++ b/lib/substance.rb @@ -0,0 +1,10 @@ +module OpenTox + + class Substance + field :physchem_descriptors, type: Hash, default: {} + field :toxicities, type: Hash, default: {} + field :dataset_ids, type: Array, default: [] + end + +end + diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb new file mode 100644 index 0000000..c6b2a07 --- /dev/null +++ b/lib/validation-statistics.rb @@ -0,0 +1,101 @@ +module OpenTox + class ValidationStatistics + include OpenTox + def self.classification predictions, accept_values + confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)} + weighted_confusion_matrix = Array.new(accept_values.size){Array.new(accept_values.size,0)} + true_rate = {} + predictivity = {} + nr_instances = 0 + predictions.each do |cid,pred| + # TODO use measured majority class + if pred[:measured].uniq.size == 1 + m = pred[:measured].first + #pred[:measured].each do |m| + if pred[:value] == m + if pred[:value] == accept_values[0] + confusion_matrix[0][0] += 1 + weighted_confusion_matrix[0][0] += pred[:probabilities][pred[:value]] + nr_instances += 1 + elsif pred[:value] == accept_values[1] + confusion_matrix[1][1] += 1 + weighted_confusion_matrix[1][1] += pred[:probabilities][pred[:value]] + nr_instances += 1 + end + elsif pred[:value] != m + if pred[:value] == accept_values[0] + confusion_matrix[0][1] += 1 + weighted_confusion_matrix[0][1] += pred[:probabilities][pred[:value]] + nr_instances += 1 + elsif pred[:value] == accept_values[1] + confusion_matrix[1][0] += 1 + weighted_confusion_matrix[1][0] += pred[:probabilities][pred[:value]] + nr_instances += 1 + end + end + end + end + true_rate = {} + predictivity = {} + accept_values.each_with_index do |v,i| + true_rate[v] = confusion_matrix[i][i]/confusion_matrix[i].reduce(:+).to_f + predictivity[v] = confusion_matrix[i][i]/confusion_matrix.collect{|n| n[i]}.reduce(:+).to_f + end + confidence_sum = 0 + weighted_confusion_matrix.each do |r| + r.each do |c| + confidence_sum += c + end + end + accuracy = (confusion_matrix[0][0]+confusion_matrix[1][1])/nr_instances.to_f + weighted_accuracy = (weighted_confusion_matrix[0][0]+weighted_confusion_matrix[1][1])/confidence_sum.to_f + $logger.debug "Accuracy #{accuracy}" + { + :accept_values => accept_values, + :confusion_matrix => confusion_matrix, + :weighted_confusion_matrix => weighted_confusion_matrix, + :accuracy => accuracy, + :weighted_accuracy => weighted_accuracy, + :true_rate => true_rate, + :predictivity => predictivity, + :finished_at => Time.now + } + end + + def self.regression predictions + # TODO: prediction intervals + rmse = 
0 + mae = 0 + x = [] + y = [] + predictions.each do |cid,pred| + if pred[:value] and pred[:measured] #and pred[:measured] != [nil] + x << -Math.log10(pred[:measured].median) + y << -Math.log10(pred[:value]) + error = Math.log10(pred[:value])-Math.log10(pred[:measured].median) + rmse += error**2 + mae += error.abs + else + warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." + $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}." + end + end + R.assign "measurement", x + R.assign "prediction", y + R.eval "r <- cor(measurement,prediction,use='complete')" + r = R.eval("r").to_ruby + + mae = mae/predictions.size + rmse = Math.sqrt(rmse/predictions.size) + $logger.debug "R^2 #{r**2}" + $logger.debug "RMSE #{rmse}" + $logger.debug "MAE #{mae}" + { + :mae => mae, + :rmse => rmse, + :r_squared => r**2, + :finished_at => Time.now + } + end + end +end diff --git a/lib/validation.rb b/lib/validation.rb index b72d273..6b515e4 100644 --- a/lib/validation.rb +++ b/lib/validation.rb @@ -8,7 +8,7 @@ module OpenTox field :test_dataset_id, type: BSON::ObjectId field :nr_instances, type: Integer field :nr_unpredicted, type: Integer - field :predictions, type: Array + field :predictions, type: Hash def prediction_dataset Dataset.find prediction_dataset_id @@ -27,32 +27,23 @@ module OpenTox atts = model.attributes.dup # do not modify attributes from original model atts["_id"] = BSON::ObjectId.new atts[:training_dataset_id] = training_set.id - validation_model = model.class.create training_set, atts + validation_model = model.class.create model.prediction_feature, training_set, atts validation_model.save - cids = test_set.compound_ids - - test_set_without_activities = Dataset.new(:compound_ids => cids.uniq) # remove duplicates and make sure that activities cannot be used - prediction_dataset = validation_model.predict test_set_without_activities - predictions = [] + predictions = validation_model.predict test_set.compounds + predictions.each{|cid,p| p.delete(:neighbors)} nr_unpredicted = 0 - activities = test_set.data_entries.collect{|de| de.first} - prediction_dataset.data_entries.each_with_index do |de,i| - if de[0] #and de[1] - cid = prediction_dataset.compound_ids[i] - rows = cids.each_index.select{|r| cids[r] == cid } - activities = rows.collect{|r| test_set.data_entries[r][0]} - prediction = de.first - confidence = de[1] - predictions << [prediction_dataset.compound_ids[i], activities, prediction, de[1]] + predictions.each do |cid,prediction| + if prediction[:value] + prediction[:measured] = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s] else nr_unpredicted += 1 end + predictions.delete(cid) unless prediction[:value] and prediction[:measured] end validation = self.new( :model_id => validation_model.id, - :prediction_dataset_id => prediction_dataset.id, :test_dataset_id => test_set.id, - :nr_instances => test_set.compound_ids.size, + :nr_instances => test_set.compounds.size, :nr_unpredicted => nr_unpredicted, :predictions => predictions#.sort{|a,b| p a; b[3] <=> a[3]} # sort according to confidence ) @@ -67,42 +58,6 @@ module OpenTox end class RegressionValidation < Validation - - def statistics - rmse = 0 - weighted_rmse = 0 - rse = 0 - weighted_rse = 0 - mae = 0 - weighted_mae = 0 - confidence_sum = 0 - predictions.each do |pred| - compound_id,activity,prediction,confidence = pred - if activity and prediction - error = 
diff --git a/lib/validation.rb b/lib/validation.rb
index b72d273..6b515e4 100644
--- a/lib/validation.rb
+++ b/lib/validation.rb
@@ -8,7 +8,7 @@ module OpenTox
     field :test_dataset_id, type: BSON::ObjectId
     field :nr_instances, type: Integer
     field :nr_unpredicted, type: Integer
-    field :predictions, type: Array
+    field :predictions, type: Hash

     def prediction_dataset
       Dataset.find prediction_dataset_id
@@ -27,32 +27,23 @@ module OpenTox
       atts = model.attributes.dup # do not modify attributes from original model
       atts["_id"] = BSON::ObjectId.new
       atts[:training_dataset_id] = training_set.id
-      validation_model = model.class.create training_set, atts
+      validation_model = model.class.create model.prediction_feature, training_set, atts
       validation_model.save
-      cids = test_set.compound_ids
-
-      test_set_without_activities = Dataset.new(:compound_ids => cids.uniq) # remove duplicates and make sure that activities cannot be used
-      prediction_dataset = validation_model.predict test_set_without_activities
-      predictions = []
+      predictions = validation_model.predict test_set.compounds
+      predictions.each{|cid,p| p.delete(:neighbors)}
       nr_unpredicted = 0
-      activities = test_set.data_entries.collect{|de| de.first}
-      prediction_dataset.data_entries.each_with_index do |de,i|
-        if de[0] #and de[1]
-          cid = prediction_dataset.compound_ids[i]
-          rows = cids.each_index.select{|r| cids[r] == cid }
-          activities = rows.collect{|r| test_set.data_entries[r][0]}
-          prediction = de.first
-          confidence = de[1]
-          predictions << [prediction_dataset.compound_ids[i], activities, prediction, de[1]]
+      predictions.each do |cid,prediction|
+        if prediction[:value]
+          prediction[:measured] = Substance.find(cid).toxicities[prediction[:prediction_feature_id].to_s]
         else
           nr_unpredicted += 1
         end
+        predictions.delete(cid) unless prediction[:value] and prediction[:measured]
       end
       validation = self.new(
         :model_id => validation_model.id,
-        :prediction_dataset_id => prediction_dataset.id,
         :test_dataset_id => test_set.id,
-        :nr_instances => test_set.compound_ids.size,
+        :nr_instances => test_set.compounds.size,
         :nr_unpredicted => nr_unpredicted,
-        :predictions => predictions#.sort{|a,b| p a; b[3] <=> a[3]} # sort according to confidence
+        :predictions => predictions
       )
@@ -67,42 +58,6 @@ module OpenTox
   end

   class RegressionValidation < Validation
-
-    def statistics
-      rmse = 0
-      weighted_rmse = 0
-      rse = 0
-      weighted_rse = 0
-      mae = 0
-      weighted_mae = 0
-      confidence_sum = 0
-      predictions.each do |pred|
-        compound_id,activity,prediction,confidence = pred
-        if activity and prediction
-          error = Math.log10(prediction)-Math.log10(activity.median)
-          rmse += error**2
-          weighted_rmse += confidence*error**2
-          mae += error.abs
-          weighted_mae += confidence*error.abs
-          confidence_sum += confidence
-        else
-          warnings << "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
-          $logger.debug "No training activities for #{Compound.find(compound_id).smiles} in training dataset #{model.training_dataset_id}."
-        end
-      end
-      x = predictions.collect{|p| p[1].median}
-      y = predictions.collect{|p| p[2]}
-      R.assign "measurement", x
-      R.assign "prediction", y
-      R.eval "r <- cor(-log(measurement),-log(prediction),use='complete')"
-      r = R.eval("r").to_ruby
-
-      mae = mae/predictions.size
-      weighted_mae = weighted_mae/confidence_sum
-      rmse = Math.sqrt(rmse/predictions.size)
-      weighted_rmse = Math.sqrt(weighted_rmse/confidence_sum)
-      { "R^2" => r**2, "RMSE" => rmse, "MAE" => mae }
-    end
   end
 end
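Validation.create now passes the prediction feature explicitly to model.class.create and persists predictions as a Hash keyed by substance id, with measured values merged in from the test substances' toxicities. A sketch of one stored entry, inferred from the code above (ids and numbers invented):

    validation.predictions
    # => { "substance-id" => {
    #        :value => 0.42,                          # model prediction
    #        :measured => [0.39, 0.45],               # test substance toxicities for the endpoint
    #        :prediction_feature_id => "feature-id"   # endpoint the model predicts
    #      }, ... }

Entries lacking either a value or a measurement are removed before saving, so only fully comparable pairs reach the statistics.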
diff --git a/test/classification.rb b/test/classification.rb
index bedbe14..7412714 100644
--- a/test/classification.rb
+++ b/test/classification.rb
@@ -30,12 +30,14 @@ class LazarClassificationTest < MiniTest::Test

     # make a dataset prediction
     compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini.csv")
-    prediction = model.predict compound_dataset
-    assert_equal compound_dataset.compounds, prediction.compounds
+    prediction_dataset = model.predict compound_dataset
+    assert_equal compound_dataset.compounds, prediction_dataset.compounds

-    assert_equal "Could not find similar compounds with experimental data in the training dataset.", prediction.data_entries[7][3]
-    assert_equal "1 compounds have been removed from neighbors, because they have the same structure as the query compound.", prediction.data_entries[14][3]
+    cid = prediction_dataset.compounds[7].id.to_s
+    assert_equal "Could not find similar compounds with experimental data in the training dataset.", prediction_dataset.predictions[cid][:warning]
+    cid = prediction_dataset.compounds[9].id.to_s
+    assert_equal "1 compounds have been removed from neighbors, because they have the same structure as the query compound.", prediction_dataset.predictions[cid][:warning]

     # cleanup
-    [training_dataset,model,compound_dataset].each{|o| o.delete}
+    [training_dataset,model,compound_dataset,prediction_dataset].each{|o| o.delete}
   end
 end
diff --git a/test/dataset.rb b/test/dataset.rb
index 297251e..a7b8769 100644
--- a/test/dataset.rb
+++ b/test/dataset.rb
@@ -36,38 +36,34 @@ class DatasetTest < MiniTest::Test
     assert_equal Dataset, d.class
     d.name = "Create dataset test"

-    # features not set
-    # << operator was removed for efficiency reasons (CH)
-    #assert_raises BadRequestError do
-    #  d << [Compound.from_smiles("c1ccccc1NN"), 1,2]
-    #end
-
     # add data entries
-    d.features = ["test1", "test2"].collect do |title|
+    features = ["test1", "test2"].collect do |title|
       f = Feature.new
       f.name = title
       f.numeric = true
       f.save
       f
     end
-
-    # wrong feature size
-    # << operator was removed for efficiency reasons (CH)
-    #assert_raises BadRequestError do
-    #  d << [Compound.from_smiles("c1ccccc1NN"), 1,2,3]
-    #end

     # manual low-level insertions without consistency checks for runtime efficiency
+    compounds = ["c1ccccc1NN", "CC(C)N", "C1C(C)CCCC1"].collect do |smi|
+      Compound.from_smiles smi
+    end
     data_entries = []
-    d.compound_ids << Compound.from_smiles("c1ccccc1NN").id
     data_entries << [1,2]
-    d.compound_ids << Compound.from_smiles("CC(C)N").id
     data_entries << [4,5]
-    d.compound_ids << Compound.from_smiles("C1C(C)CCCC1").id
     data_entries << [6,7]
-    d.data_entries = data_entries
+    compounds.each_with_index do |c,i|
+      features.each_with_index do |f,j|
+        d.data_entries[c.id.to_s] ||= {}
+        d.data_entries[c.id.to_s][f.id.to_s] ||= []
+        d.data_entries[c.id.to_s][f.id.to_s] << data_entries[i][j]
+      end
+    end
+
     assert_equal 3, d.compounds.size
     assert_equal 2, d.features.size
-    assert_equal [[1,2],[4,5],[6,7]], d.data_entries
+    assert_equal [[1,2],[4,5],[6,7]], compounds.collect{|c| d.data_entries[c.id.to_s].values.flatten}
     d.save
     # check if dataset has been saved correctly
@@ -89,8 +85,14 @@ class DatasetTest < MiniTest::Test
     assert_equal "multicolumn", new_dataset.name
     # get features
     assert_equal 6, new_dataset.features.size
-    assert_equal 7, new_dataset.compounds.size
-    assert_equal ["1", nil, "false", nil, nil, 1.0], new_dataset.data_entries.last
+    assert_equal 5, new_dataset.compounds.size
+    de = new_dataset.data_entries[new_dataset.compounds.last.id.to_s]
+    fid = new_dataset.features.first.id.to_s
+    assert_equal ["1"], de[fid]
+    fid = new_dataset.features.last.id.to_s
+    assert_equal [1.0], de[fid]
+    fid = new_dataset.features[2].id.to_s
+    assert_equal ["false"], de[fid]
     d.delete
   end
@@ -117,7 +119,7 @@ class DatasetTest < MiniTest::Test
     assert d.warnings.grep(/Duplicate compound/)
     assert d.warnings.grep(/3, 5/)
     assert_equal 6, d.features.size
-    assert_equal 7, d.compounds.size
+    assert_equal 5, d.compounds.size
     assert_equal 5, d.compounds.collect{|c| c.inchi}.uniq.size
     assert_equal [["1", "1", "true", "true", "test", 1.1], ["1", "2", "false", "7.5", "test", 0.24], ["1", "3", "true", "5", "test", 3578.239], ["0", "4", "false", "false", "test", -2.35], ["1", "2", "true", "4", "test_2", 1], ["1", "2", "false", "false", "test", -1.5], ["1", nil, "false", nil, nil, 1.0]], d.data_entries
     assert_equal "c1ccc[nH]1,1,,false,,,1.0", d.to_csv.split("\n")[7]
@@ -195,7 +197,7 @@ class DatasetTest < MiniTest::Test
     assert_match "EPAFHM.mini.csv", d.source
     assert_equal 1, d.features.size
     feature = d.features.first
-    assert_kind_of NumericBioAssay, feature
+    assert_kind_of NumericFeature, feature
     assert_equal 0.0113, d.data_entries[0][0]
     assert_equal 0.00323, d.data_entries[5][0]
     d2 = Dataset.find d.id
@@ -207,10 +209,10 @@ class DatasetTest < MiniTest::Test
     dataset = Dataset.from_csv_file File.join(DATA_DIR,"loael.csv")
     dataset.folds(10).each do |fold|
       fold.each do |d|
-        assert_equal d.data_entries.size, d.compound_ids.size
-        assert_operator d.compound_ids.size, :>=, d.compound_ids.uniq.size
+        assert_equal d.data_entries.size, d.compounds.size
+        assert_operator d.compounds.size, :>=, d.compounds.uniq.size
       end
-      assert_operator fold[0].compound_ids.uniq.size, :>=, fold[1].compound_ids.uniq.size
+      assert_operator fold[0].compounds.size, :>=, fold[1].compounds.size
     end
     #puts dataset.folds 10
   end
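The dataset test above exercises the new nested layout of Dataset#data_entries: compound id => feature id => Array of values, which replaces the old row-per-compound Array and allows several measurements per cell. A minimal access sketch, assuming that layout:

    d = OpenTox::Dataset.new
    c = OpenTox::Compound.from_smiles "CC(C)N"
    f = OpenTox::Feature.new
    f.name = "test1"; f.numeric = true; f.save
    d.data_entries[c.id.to_s] ||= {}
    d.data_entries[c.id.to_s][f.id.to_s] ||= []
    d.data_entries[c.id.to_s][f.id.to_s] << 4.2   # repeated measurements append here
    d.data_entries[c.id.to_s][f.id.to_s]          # => [4.2]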
diff --git a/test/nanoparticles.rb b/test/nanoparticles.rb
new file mode 100644
index 0000000..46073a9
--- /dev/null
+++ b/test/nanoparticles.rb
@@ -0,0 +1,34 @@
+require_relative "setup.rb"
+
+class NanoparticleTest < MiniTest::Test
+
+  def test_import
+    dataset_ids = Import::Enanomapper.import
+    assert_operator Nanoparticle.count, :>, 570, "Only #{Nanoparticle.count} nanoparticles imported"
+    assert_operator dataset_ids.size, :>, 8, "Only #{dataset_ids.size} bundles imported"
+    assert dataset_ids.collect{|d| Dataset.find(d).name}.include?("NanoWiki")
+    assert dataset_ids.collect{|d| Dataset.find(d).name}.include?("Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles")
+    p dataset_ids.collect{|d| {d => Dataset.find(d).name}}
+    dataset_ids.collect do |d|
+      d = Dataset.find(d)
+      p d.name
+      puts d.to_csv
+    end
+  end
+
+  def test_export
+    Dataset.all.each do |d|
+      puts d.to_csv
+    end
+  end
+
+  def test_create_model
+    training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles")
+    model = Model::LazarRegression.create(training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", :neighbor_algorithm => "nanoparticle_neighbors")
+    nanoparticle = training_dataset.nanoparticles[-34]
+    prediction = model.predict nanoparticle
+    p prediction
+    refute_nil prediction[:value]
+  end
+
+end
diff --git a/test/prediction_models.rb b/test/prediction_models.rb
index a2e5fe2..49a2472 100644
--- a/test/prediction_models.rb
+++ b/test/prediction_models.rb
@@ -10,7 +10,6 @@ class PredictionModelTest < MiniTest::Test
     assert pm.classification?
     refute pm.regression?
     pm.crossvalidations.each do |cv|
-      p cv
       assert cv.accuracy > 0.74, "Crossvalidation accuracy (#{cv.accuracy}) should be larger than 0.74. This may happen due to an unfavorable training/test set split."
     end
     prediction = pm.predict Compound.from_smiles("CCCC(NN)C")
diff --git a/test/validation.rb b/test/validation.rb
index d8eea59..baee2d1 100644
--- a/test/validation.rb
+++ b/test/validation.rb
@@ -6,17 +6,17 @@ class ValidationTest < MiniTest::Test

   def test_default_classification_crossvalidation
     dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
-    model = Model::LazarClassification.create dataset
+    model = Model::LazarClassification.create dataset.features.first, dataset
     cv = ClassificationCrossValidation.create model
-    assert cv.accuracy > 0.7, "Accuracy (#{cv.accuracy}) should be larger than 0.7"
+    assert cv.accuracy > 0.7, "Accuracy (#{cv.accuracy}) should be larger than 0.7, this may occur due to an unfavorable training/test set split"
   end

   def test_default_regression_crossvalidation
     dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
-    model = Model::LazarRegression.create dataset
+    model = Model::LazarRegression.create dataset.features.first, dataset
     cv = RegressionCrossValidation.create model
-    assert cv.rmse < 1.5, "RMSE > 1.5"
-    assert cv.mae < 1
+    assert cv.rmse < 1.5, "RMSE #{cv.rmse} should be smaller than 1.5, this may occur due to an unfavorable training/test set split"
+    assert cv.mae < 1, "MAE #{cv.mae} should be smaller than 1, this may occur due to an unfavorable training/test set split"
   end

   # parameters
@@ -30,7 +30,7 @@ class ValidationTest < MiniTest::Test
         :type => "FP3"
       }
     }
-    model = Model::LazarClassification.create dataset, params
+    model = Model::LazarClassification.create dataset.features.first, dataset, params
     model.save
     cv = ClassificationCrossValidation.create model
     params = model.neighbor_algorithm_parameters
@@ -54,7 +54,7 @@ class ValidationTest < MiniTest::Test
         :min_sim => 0.7,
       }
     }
-    model = Model::LazarRegression.create dataset, params
+    model = Model::LazarRegression.create dataset.features.first, dataset, params
     cv = RegressionCrossValidation.create model
     cv.validation_ids.each do |vid|
       model = Model::Lazar.find(Validation.find(vid).model_id)
@@ -70,7 +70,7 @@ class ValidationTest < MiniTest::Test

   def test_physchem_regression_crossvalidation
     training_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi.csv")
-    model = Model::LazarRegression.create(training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression")
+    model = Model::LazarRegression.create(training_dataset.features.first, training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression")
     cv = RegressionCrossValidation.create model
     refute_nil cv.rmse
     refute_nil cv.mae
@@ -80,7 +80,7 @@ class ValidationTest < MiniTest::Test

   def test_classification_loo_validation
     dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
-    model = Model::LazarClassification.create dataset
+    model = Model::LazarClassification.create dataset.features.first, dataset
     loo = ClassificationLeaveOneOutValidation.create model
     assert_equal 14, loo.nr_unpredicted
     refute_empty loo.confusion_matrix
@@ -89,7 +89,7 @@ class ValidationTest < MiniTest::Test

   def test_regression_loo_validation
     dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi.csv")
-    model = Model::LazarRegression.create dataset
+    model = Model::LazarRegression.create dataset.features.first, dataset
     loo = RegressionLeaveOneOutValidation.create model
     assert loo.r_squared > 0.34
   end
@@ -98,7 +98,7 @@ class ValidationTest < MiniTest::Test

   def test_repeated_crossvalidation
     dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
-    model = Model::LazarClassification.create dataset
+    model = Model::LazarClassification.create dataset.features.first, dataset
     repeated_cv = RepeatedCrossValidation.create model
     repeated_cv.crossvalidations.each do |cv|
       assert_operator cv.accuracy, :>, 0.7, "model accuracy < 0.7, this may happen by chance due to an unfavorable training/test set split"
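All model constructors in the updated tests take the prediction feature as their first argument. For reference, a condensed sketch of the updated crossvalidation workflow; the csv path follows the tests, and the cv accessors are assumed to expose the statistics keys computed in validation-statistics.rb:

    require_relative "lib/lazar.rb"
    include OpenTox

    dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
    # new signature: prediction feature first, then the training dataset, then optional params
    model = Model::LazarRegression.create dataset.features.first, dataset
    cv = RegressionCrossValidation.create model
    puts "RMSE #{cv.rmse} MAE #{cv.mae} R^2 #{cv.r_squared}"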