From 455da06aa6459da0d25b286ca6cb866ff64c4c34 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 20 Jun 2019 22:01:50 +0200 Subject: separate csv serialisations for batch predictions and training data, repeated measurements in mutagenicity dataset fixed, daphnia import fixed, CENTRAL_MONGO_IP removed --- lib/dataset.rb | 75 +++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 51 insertions(+), 24 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index df17569..596c53c 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -96,8 +96,14 @@ module OpenTox # Get nominal and numeric prediction features # @return [Array] - def prediction_features - features.select{|f| f._type.match("Prediction")} + def prediction_feature + features.select{|f| f._type.match(/Prediction$/)}.first + end + + # Get supporting nominal and numeric prediction features (class probabilities, prediction interval) + # @return [Array] + def prediction_supporting_features + features.select{|f| f.is_a?(LazarPredictionProbability) or f.is_a?(LazarPredictionInterval)} end # Get nominal and numeric merged features @@ -259,7 +265,7 @@ module OpenTox feature_names = table.shift.collect{|f| f.strip} raise ArgumentError, "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size - if feature_names[0] =~ /ID/i # check ID column + if feature_names[0] !~ /SMILES|InChI/i # check ID column original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => feature_names.shift) else original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => "LineID") @@ -343,30 +349,52 @@ module OpenTox # Serialisation - # Convert dataset to csv format + # Convert lazar prediction dataset to csv format # @return [String] - def to_csv #inchi=false - CSV.generate() do |csv| - - compound = substances.first.is_a? Compound - f = features - original_id_features - original_smiles_features - warnings_features - header = original_id_features.collect{|f| "ID "+Dataset.find(f.dataset_id).name} - header += original_smiles_features.collect{|f| "SMILES "+Dataset.find(f.dataset_id).name} if compound - compound ? header << "Canonical SMILES" : header << "Name" - header += f.collect{|f| f.name} - header += warnings_features.collect{|f| "Warnings "+Dataset.find(f.dataset_id).name} - csv << header - - substances.each do |substance| - row = original_id_features.collect{|f| values(substance,f).join(" ")} - row += original_smiles_features.collect{|f| values(substance,f).join(" ")} if compound - compound ? row << substance.smiles : row << substance.name - row += f.collect{|f| values(substance,f).join(" ")} - row += warnings_features.collect{|f| values(substance,f).uniq.join(" ")} + def to_prediction_csv + + compound = substances.first.is_a? Compound + header = ["ID"] + header << "Original SMILES" if compound + compound ? header << "Canonical SMILES" : header << "Name" + header << "Prediction" if prediction_feature + header << "Confidence" if confidence_feature + header += prediction_supporting_features.collect{|f| f.name} + header << "Measurements" + csv = [header] + + substances.each do |substance| + row = original_id_features.collect{|f| values(substance,f).join(" ")} + row += original_smiles_features.collect{|f| values(substance,f).join(" ")} if compound + compound ? row << substance.smiles : row << substance.name + row << values(substance,prediction_feature).join(" ") + row << values(substance,confidence_feature).join(" ") + row += prediction_supporting_features.collect{|f| values(substance,f).join(" ")} + row << values(substance,bioactivity_features[0]).join(" ") + csv << row + end + csv.collect{|r| r.join(",")}.join("\n") + end + + # Convert dataset into csv formatted training data + # @return [String] + def to_training_csv + + p features + p bioactivity_features + header = ["Canonical SMILES"] + header << bioactivity_features[0].name + csv = [header] + + substances.each do |substance| + nr_activities = values(substance,bioactivity_features.first).size + (0..nr_activities-1).each do |n| # new row for each value + row = [substance.smiles] + row << values(substance,bioactivity_features[0])[n] csv << row end - end + csv.collect{|r| r.join(",")}.join("\n") end # Convert dataset to SDF format @@ -396,7 +424,6 @@ module OpenTox predictions = {} substances.each do |s| predictions[s] ||= {} - prediction_feature = prediction_features.first predictions[s][:value] = values(s,prediction_feature).first #predictions[s][:warnings] = [] #warnings_features.each { |w| predictions[s][:warnings] += values(s,w) } -- cgit v1.2.3