From 6d6be53a110e71d0d56ae5ea9a2675f76f7c84ec Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Thu, 6 Sep 2018 17:24:25 +0200 Subject: adjusted classification similarities, dataset sdf export --- lib/dataset.rb | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index 6e7d67f..b32d526 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -135,6 +135,19 @@ module OpenTox end end + # Convert dataset to SDF file + # @return [String] + def to_sdf + substances.each do |substance| + puts substance.sdf.sub(/\$\$\$\$\n/,"") + features.each do |f| + puts "> <#{f.name}>" + puts values(substance,f).uniq.join "," + puts "\n$$$$" + end + end + end + # Parsers # Create a dataset from file (csv,sdf,...) -- cgit v1.2.3 From ea0864ae89d57839177c850e3b473f0aa5987474 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Mon, 17 Sep 2018 16:54:23 +0200 Subject: smiles as identifier for sdf export --- lib/dataset.rb | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index b32d526..4e504de 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -139,7 +139,9 @@ module OpenTox # @return [String] def to_sdf substances.each do |substance| - puts substance.sdf.sub(/\$\$\$\$\n/,"") + sdf_lines = substance.sdf.sub(/\$\$\$\$\n/,"").split("\n") + sdf_lines[0] = substance.smiles + puts sdf_lines.join("\n") features.each do |f| puts "> <#{f.name}>" puts values(substance,f).uniq.join "," -- cgit v1.2.3 From e718cf76f32fb29d6c7c3732ec82f35b0da49122 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Fri, 5 Oct 2018 17:06:46 +0200 Subject: sdf import, csv files with id column --- lib/dataset.rb | 237 +++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 224 insertions(+), 13 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index 4e504de..17c30d5 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -1,5 +1,6 @@ require 'csv' require 'tempfile' +require 'digest/md5' module OpenTox @@ -7,6 +8,7 @@ module OpenTox class Dataset field :data_entries, type: Hash, default: {} + field :md5, type: String # Readers @@ -104,6 +106,7 @@ module OpenTox # Convert dataset to csv format including compound smiles as first column, other column headers are feature names # @return [String] + # TODO original_id def to_csv(inchi=false) CSV.generate() do |csv| compound = substances.first.is_a? Compound @@ -152,28 +155,120 @@ module OpenTox # Parsers - # Create a dataset from file (csv,sdf,...) - # @param filename [String] - # @return [String] dataset uri - # TODO - #def self.from_sdf_file - #end + # Create a dataset from PubChem Assay + # @param [File] + # @return [OpenTox::Dataset] + def self.from_pubchem aid + csv = RestClientWrapper.get "https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/#{aid}/CSV" + table = CSV.read csv + puts table +=begin + dataset = self.new(:source => file, :name => name, :md5 => md5) + dataset.parse_table table, accept_empty_values + else + puts csv +i = 0 +activities = [] +File.readlines(ARGV[0]).each do |line| + if i > 2 + tokens = line.split "," + p line if tokens[1].empty? + activities << [tokens[1],tokens[3]] + end + i += 1 +end + +puts "SMILES,Activity" +activities.each_slice(100) do |slice| # get SMILES in chunks + sids = slice.collect{|e| e[0]} + smiles = `curl https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/#{sids.join(",")}/property/CanonicalSMILES/TXT`.split("\n") + abort("Could not get SMILES for all SIDs from PubChem") unless sids.size == smiles.size + smiles.each_with_index do |smi,i| + act = slice[i] + puts [smi.chomp,act[1]].join(",") + end +end +=end + end + + # Create a dataset from SDF file + # @param [File] + # @return [OpenTox::Dataset] + def self.from_sdf_file file + md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files + dataset = self.find_by(:md5 => md5) + if dataset + $logger.debug "Skipping import of #{file}, it is already in the database (id: #{dataset.id})." + else + $logger.debug "Parsing #{file}." + table = nil + read_result = false + sdf = "" + dataset = self.new(:source => file, :name => name, :md5 => md5) + original_id = NominalFeature.find_or_create_by(:name => "original_id") + + feature_name = "" + compound = nil + features = {} + + File.readlines(file).each do |line| + if line.match %r{\$\$\$\$} + sdf << line + id = sdf.split("\n").first.chomp + compound = Compound.from_sdf sdf + dataset.add compound, original_id, id + features.each { |f,v| dataset.add compound, f, v } + sdf = "" + features = {} + elsif line.match /^>\s+\s+<(.*)>/)[1] + read_result = true + else + if read_result + value = line.chomp + if value.numeric? + feature = NumericFeature.find_or_create_by(:name => feature_name) + value = value.to_f + else + feature = NominalFeature.find_or_create_by(:name => feature_name) + end + features[feature] = value + #p compound.smiles, feature.name, value + read_result = false + else + sdf << line + end + end + end + end + dataset + + end # Create a dataset from CSV file # @param [File] # @param [TrueClass,FalseClass] accept or reject empty values # @return [OpenTox::Dataset] def self.from_csv_file file, accept_empty_values=false - source = file - name = File.basename(file,".*") - dataset = self.find_by(:source => source, :name => name) + md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files + dataset = self.find_by(:md5 => md5) if dataset $logger.debug "Skipping import of #{file}, it is already in the database (id: #{dataset.id})." else $logger.debug "Parsing #{file}." - table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8' - dataset = self.new(:source => source, :name => name) - dataset.parse_table table, accept_empty_values + table = nil + [",","\t",";"].each do |sep| # guess CSV separator + if File.readlines(file).first.match(/#{sep}/) + table = CSV.read file, :col_sep => sep, :skip_blanks => true, :encoding => 'windows-1251:utf-8' + break + end + end + if table + dataset = self.new(:source => file, :name => name, :md5 => md5) + dataset.parse_table table, accept_empty_values + else + bad_request_error "#{file} is not a valid CSV/TSV file. Could not find "," ";" or TAB as column separator." + end end dataset end @@ -187,10 +282,18 @@ module OpenTox # features feature_names = table.shift.collect{|f| f.strip} warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size - compound_format = feature_names.shift.strip + + original_id = nil + if feature_names[0] =~ /ID/i # check ID column + feature_names.shift + original_id = NominalFeature.find_or_create_by(:name => "original_id") + end + + compound_format = feature_names.shift bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i numeric = [] features = [] + # guess feature types feature_names.each_with_index do |f,i| metadata = {:name => f} @@ -213,6 +316,7 @@ module OpenTox all_substances = [] table.each_with_index do |vals,i| + original_id_value = vals.shift.strip if original_id identifier = vals.shift.strip warn "No feature values for compound at line #{i+2} of #{source}." if vals.compact.empty? and !accept_empty_values begin @@ -239,6 +343,8 @@ module OpenTox next end + add substance, original_id, original_id_value if original_id + vals.each_with_index do |v,j| if v.blank? warn "Empty value for compound '#{identifier}' and feature '#{feature_names[i]}'." @@ -294,4 +400,109 @@ module OpenTox end + class Batch + + include OpenTox + include Mongoid::Document + include Mongoid::Timestamps + store_in collection: "batch" + field :name, type: String + field :source, type: String + field :identifiers, type: Array + field :ids, type: Array + field :compounds, type: Array + field :warnings, type: Array, default: [] + + def self.from_csv_file file + source = file + name = File.basename(file,".*") + batch = self.find_by(:source => source, :name => name) + if batch + $logger.debug "Skipping import of #{file}, it is already in the database (id: #{batch.id})." + else + $logger.debug "Parsing #{file}." + # check delimiter + line = File.readlines(file).first + if line.match(/\t/) + table = CSV.read file, :col_sep => "\t", :skip_blanks => true, :encoding => 'windows-1251:utf-8' + else + table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8' + end + batch = self.new(:source => source, :name => name, :identifiers => [], :ids => [], :compounds => []) + + # original IDs + if table[0][0] =~ /ID/i + @original_ids = table.collect{|row| row.shift} + @original_ids.shift + end + + # features + feature_names = table.shift.collect{|f| f.strip} + warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size + compound_format = feature_names.shift.strip + unless compound_format =~ /SMILES|InChI/i + File.delete file + bad_request_error "'#{compound_format}' is not a supported compound format in the header. " \ + "Accepted formats: SMILES, InChI. Please take a look on the help page." + end + numeric = [] + features = [] + # guess feature types + feature_names.each_with_index do |f,i| + metadata = {:name => f} + values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact + types = values.collect{|v| v.numeric? ? true : false}.uniq + feature = nil + if values.size == 0 # empty feature + elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes + numeric[i] = true + feature = NumericFeature.find_or_create_by(metadata) + else + metadata["accept_values"] = values + numeric[i] = false + feature = NominalFeature.find_or_create_by(metadata) + end + features << feature if feature + end + + table.each_with_index do |vals,i| + identifier = vals.shift.strip.gsub(/^'|'$/,"") + begin + case compound_format + when /SMILES/i + compound = OpenTox::Compound.from_smiles(identifier) + when /InChI/i + compound = OpenTox::Compound.from_inchi(identifier) + end + rescue + compound = nil + end + # collect only for present compounds + unless compound.nil? + batch.identifiers << identifier + batch.compounds << compound.id + batch.ids << @original_ids[i] if @original_ids + else + batch.warnings << "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}." + end + end + batch.compounds.duplicates.each do |duplicate| + $logger.debug "Duplicates found in #{name}." + dup = Compound.find duplicate + positions = [] + batch.compounds.each_with_index do |co,i| + c = Compound.find co + if !c.blank? and c.inchi and c.inchi == dup.inchi + positions << i+1 + end + end + batch.warnings << "Duplicate compound at ID #{positions.join(' and ')}." + end + batch.save + end + batch + end + + end + end -- cgit v1.2.3 From 47a49508a736549006418ac9a9607ec0f5083a55 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Fri, 5 Oct 2018 19:31:48 +0200 Subject: partial pubchem classification import --- lib/dataset.rb | 51 ++++++++++++++++++--------------------------------- 1 file changed, 18 insertions(+), 33 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index 17c30d5..1cc388c 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -156,39 +156,24 @@ module OpenTox # Parsers # Create a dataset from PubChem Assay - # @param [File] + # @param [Integer] PubChem AssayID (AID) # @return [OpenTox::Dataset] def self.from_pubchem aid - csv = RestClientWrapper.get "https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/#{aid}/CSV" - table = CSV.read csv - puts table -=begin - dataset = self.new(:source => file, :name => name, :md5 => md5) - dataset.parse_table table, accept_empty_values - else - puts csv -i = 0 -activities = [] -File.readlines(ARGV[0]).each do |line| - if i > 2 - tokens = line.split "," - p line if tokens[1].empty? - activities << [tokens[1],tokens[3]] - end - i += 1 -end - -puts "SMILES,Activity" -activities.each_slice(100) do |slice| # get SMILES in chunks - sids = slice.collect{|e| e[0]} - smiles = `curl https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/#{sids.join(",")}/property/CanonicalSMILES/TXT`.split("\n") - abort("Could not get SMILES for all SIDs from PubChem") unless sids.size == smiles.size - smiles.each_with_index do |smi,i| - act = slice[i] - puts [smi.chomp,act[1]].join(",") - end -end -=end + url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/#{aid}/CSV" + csv = CSV.parse(RestClientWrapper.get(url)) + csv.select!{|r| r[0].match /^\d/} # discard header rows + table = [["SID","SMILES","Activity"]] + csv.each_slice(100) do |slice| + sids = slice.collect{|s| s[1]} + smiles = RestClientWrapper.get("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/#{sids.join(",")}/property/CanonicalSMILES/TXT").split("\n") + abort("Could not get SMILES for all SIDs from PubChem") unless sids.size == smiles.size + smiles.each_with_index do |smi,i| + table << [slice[i][1],smi.chomp,slice[i][3]] + end + end + dataset = self.new(:source => url) # TODO name + dataset.parse_table table, false + dataset end # Create a dataset from SDF file @@ -233,7 +218,6 @@ end feature = NominalFeature.find_or_create_by(:name => feature_name) end features[feature] = value - #p compound.smiles, feature.name, value read_result = false else sdf << line @@ -297,7 +281,8 @@ end # guess feature types feature_names.each_with_index do |f,i| metadata = {:name => f} - values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact + original_id ? j = i+2 : j = i+1 + values = table.collect{|row| val=row[j].to_s.strip; val.blank? ? nil : val }.uniq.compact types = values.collect{|v| v.numeric? ? true : false}.uniq feature = nil if values.size == 0 # empty feature -- cgit v1.2.3 From 0a8da103e020b4a584a28a52b4ba12e1f3f90fd3 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Sun, 7 Oct 2018 18:12:39 +0200 Subject: dataset merge with feature/value maps --- lib/dataset.rb | 63 ++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 21 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index 1cc388c..b7d9d4e 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -69,6 +69,26 @@ module OpenTox # Dataset operations + # Merge an array of datasets + # @param [Array] OpenTox::Dataset Array to be merged + # @return [OpenTox::Dataset] merged dataset + def self.merge datasets, feature_map=nil, value_map=nil + dataset = self.new(:source => datasets.collect{|d| d.source}.join(", "), :name => datasets.collect{|d| d.name}.uniq.join(", ")) + datasets.each do |d| + d.substances.each do |s| + d.features.each do |f| + d.values(s,f).each do |v| + f = feature_map[f] if feature_map and feature_map[f] + v = value_map[v] if value_map and value_map[v] + dataset.add s,f,v #unless dataset.values(s,f).include? v + end + end + end + end + dataset.save + dataset + end + # Split a dataset into n folds # @param [Integer] number of folds # @return [Array] Array with folds [training_dataset,test_dataset] @@ -138,19 +158,21 @@ module OpenTox end end - # Convert dataset to SDF file - # @return [String] + # Convert dataset to SDF format + # @return [String] SDF string def to_sdf + sdf = "" substances.each do |substance| sdf_lines = substance.sdf.sub(/\$\$\$\$\n/,"").split("\n") sdf_lines[0] = substance.smiles - puts sdf_lines.join("\n") + sdf += sdf_lines.join("\n") features.each do |f| - puts "> <#{f.name}>" - puts values(substance,f).uniq.join "," - puts "\n$$$$" + sdf += "\n> <#{f.name}>\n" + sdf += values(substance,f).uniq.join "," end + sdf += "\n$$$$\n" end + sdf end # Parsers @@ -163,7 +185,7 @@ module OpenTox csv = CSV.parse(RestClientWrapper.get(url)) csv.select!{|r| r[0].match /^\d/} # discard header rows table = [["SID","SMILES","Activity"]] - csv.each_slice(100) do |slice| + csv.each_slice(100) do |slice| # get SMILES in chunks sids = slice.collect{|s| s[1]} smiles = RestClientWrapper.get("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/#{sids.join(",")}/property/CanonicalSMILES/TXT").split("\n") abort("Could not get SMILES for all SIDs from PubChem") unless sids.size == smiles.size @@ -179,7 +201,7 @@ module OpenTox # Create a dataset from SDF file # @param [File] # @return [OpenTox::Dataset] - def self.from_sdf_file file + def self.from_sdf_file file, map=nil md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files dataset = self.find_by(:md5 => md5) if dataset @@ -189,8 +211,8 @@ module OpenTox table = nil read_result = false sdf = "" - dataset = self.new(:source => file, :name => name, :md5 => md5) - original_id = NominalFeature.find_or_create_by(:name => "original_id") + dataset = self.new(:source => file, :name => File.basename(file), :md5 => md5) + original_id = OriginalId.find_or_create_by(:dataset_id => dataset.id,:name => dataset.name+".ID") feature_name = "" compound = nil @@ -225,15 +247,15 @@ module OpenTox end end end + dataset.save dataset - end # Create a dataset from CSV file # @param [File] # @param [TrueClass,FalseClass] accept or reject empty values # @return [OpenTox::Dataset] - def self.from_csv_file file, accept_empty_values=false + def self.from_csv_file file md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files dataset = self.find_by(:md5 => md5) if dataset @@ -248,8 +270,8 @@ module OpenTox end end if table - dataset = self.new(:source => file, :name => name, :md5 => md5) - dataset.parse_table table, accept_empty_values + dataset = self.new(:source => file, :name => File.basename(file), :md5 => md5) + dataset.parse_table table else bad_request_error "#{file} is not a valid CSV/TSV file. Could not find "," ";" or TAB as column separator." end @@ -260,8 +282,7 @@ module OpenTox # Parse data in tabular format (e.g. from csv) # does a lot of guesswork in order to determine feature types # @param [Array] - # @param [TrueClass,FalseClass] accept or reject empty values - def parse_table table, accept_empty_values + def parse_table table # features feature_names = table.shift.collect{|f| f.strip} @@ -270,7 +291,7 @@ module OpenTox original_id = nil if feature_names[0] =~ /ID/i # check ID column feature_names.shift - original_id = NominalFeature.find_or_create_by(:name => "original_id") + original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => self.name+".ID") end compound_format = feature_names.shift @@ -290,7 +311,7 @@ module OpenTox numeric[i] = true feature = NumericFeature.find_or_create_by(metadata) else - metadata["accept_values"] = values + metadata["accept_values"] = values.sort numeric[i] = false feature = NominalFeature.find_or_create_by(metadata) end @@ -303,7 +324,7 @@ module OpenTox table.each_with_index do |vals,i| original_id_value = vals.shift.strip if original_id identifier = vals.shift.strip - warn "No feature values for compound at line #{i+2} of #{source}." if vals.compact.empty? and !accept_empty_values + #warn "No feature values for compound at line #{i+2} of #{source}." if vals.compact.empty? #and !accept_empty_values begin case compound_format when /SMILES/i @@ -341,8 +362,8 @@ module OpenTox end add substance, features[j], v end - data_entries[substance.id.to_s] = {} if vals.empty? and accept_empty_values end + all_substances.duplicates.each do |substance| positions = [] all_substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == substance.inchi} @@ -443,7 +464,7 @@ module OpenTox numeric[i] = true feature = NumericFeature.find_or_create_by(metadata) else - metadata["accept_values"] = values + metadata["accept_values"] = values.sort numeric[i] = false feature = NominalFeature.find_or_create_by(metadata) end -- cgit v1.2.3 From bdc6b5b40437896384561d74a510560e9e592364 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Tue, 9 Oct 2018 18:20:27 +0200 Subject: tentative random forest classification: hangs unpredictably during caret model generation/optimization for some (inorganic?) compounds. --- lib/dataset.rb | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index b7d9d4e..6ad3215 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -71,6 +71,8 @@ module OpenTox # Merge an array of datasets # @param [Array] OpenTox::Dataset Array to be merged + # @param [Hash] feature modifications + # @param [Hash] value modifications # @return [OpenTox::Dataset] merged dataset def self.merge datasets, feature_map=nil, value_map=nil dataset = self.new(:source => datasets.collect{|d| d.source}.join(", "), :name => datasets.collect{|d| d.name}.uniq.join(", ")) @@ -205,7 +207,7 @@ module OpenTox md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files dataset = self.find_by(:md5 => md5) if dataset - $logger.debug "Skipping import of #{file}, it is already in the database (id: #{dataset.id})." + $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import." else $logger.debug "Parsing #{file}." table = nil @@ -234,10 +236,10 @@ module OpenTox if read_result value = line.chomp if value.numeric? - feature = NumericFeature.find_or_create_by(:name => feature_name) + feature = NumericFeature.find_or_create_by(:name => feature_name, :measured => true) value = value.to_f else - feature = NominalFeature.find_or_create_by(:name => feature_name) + feature = NominalFeature.find_or_create_by(:name => feature_name, :measured => true) end features[feature] = value read_result = false @@ -259,7 +261,7 @@ module OpenTox md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files dataset = self.find_by(:md5 => md5) if dataset - $logger.debug "Skipping import of #{file}, it is already in the database (id: #{dataset.id})." + $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import." else $logger.debug "Parsing #{file}." table = nil @@ -301,7 +303,7 @@ module OpenTox # guess feature types feature_names.each_with_index do |f,i| - metadata = {:name => f} + metadata = {:name => f, :measured => true} original_id ? j = i+2 : j = i+1 values = table.collect{|row| val=row[j].to_s.strip; val.blank? ? nil : val }.uniq.compact types = values.collect{|v| v.numeric? ? true : false}.uniq @@ -424,7 +426,7 @@ module OpenTox name = File.basename(file,".*") batch = self.find_by(:source => source, :name => name) if batch - $logger.debug "Skipping import of #{file}, it is already in the database (id: #{batch.id})." + $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import." else $logger.debug "Parsing #{file}." # check delimiter -- cgit v1.2.3 From 8b31acab67e22f30a87c995a94f1ee1e2a3d510f Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Wed, 10 Oct 2018 21:39:11 +0200 Subject: dataset tests fixed --- lib/dataset.rb | 369 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 190 insertions(+), 179 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index 6ad3215..b6c6173 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -67,166 +67,76 @@ module OpenTox #data_entries[substance.to_s][feature.to_s].uniq! if value.numeric? # assuming that identical values come from the same source end - # Dataset operations - - # Merge an array of datasets - # @param [Array] OpenTox::Dataset Array to be merged - # @param [Hash] feature modifications - # @param [Hash] value modifications - # @return [OpenTox::Dataset] merged dataset - def self.merge datasets, feature_map=nil, value_map=nil - dataset = self.new(:source => datasets.collect{|d| d.source}.join(", "), :name => datasets.collect{|d| d.name}.uniq.join(", ")) - datasets.each do |d| - d.substances.each do |s| - d.features.each do |f| - d.values(s,f).each do |v| - f = feature_map[f] if feature_map and feature_map[f] - v = value_map[v] if value_map and value_map[v] - dataset.add s,f,v #unless dataset.values(s,f).include? v - end - end - end - end - dataset.save - dataset - end - - # Split a dataset into n folds - # @param [Integer] number of folds - # @return [Array] Array with folds [training_dataset,test_dataset] - def folds n - len = self.substances.size - indices = (0..len-1).to_a.shuffle - mid = (len/n) - chunks = [] - start = 0 - 1.upto(n) do |i| - last = start+mid - last = last-1 unless len%n >= i - test_idxs = indices[start..last] || [] - test_substances = test_idxs.collect{|i| substances[i]} - training_idxs = indices-test_idxs - training_substances = training_idxs.collect{|i| substances[i]} - chunk = [training_substances,test_substances].collect do |substances| - dataset = self.class.create(:name => "#{self.name} (Fold #{i-1})",:source => self.id ) - substances.each do |substance| - substance.dataset_ids << dataset.id - substance.dataset_ids.uniq! - substance.save - dataset.data_entries[substance.id.to_s] = data_entries[substance.id.to_s] ||= {} - end - dataset.save - dataset - end - start = last+1 - chunks << chunk - end - chunks - end - - # Serialisation + # Parsers - # Convert dataset to csv format including compound smiles as first column, other column headers are feature names - # @return [String] - # TODO original_id - def to_csv(inchi=false) - CSV.generate() do |csv| - compound = substances.first.is_a? Compound - if compound - csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name} - else - csv << ["Name"] + features.collect{|f| f.name} - end - substances.each do |substance| - if compound - name = (inchi ? substance.inchi : substance.smiles) - else - name = substance.name - end - nr_measurements = features.collect{|f| data_entries[substance.id.to_s][f.id.to_s].size if data_entries[substance.id.to_s][f.id.to_s]}.compact.uniq - - if nr_measurements.size > 1 - warn "Unequal number of measurements (#{nr_measurements}) for '#{name}'. Skipping entries." - else - (0..nr_measurements.first-1).each do |i| - row = [name] - features.each do |f| - values(substance,f) ? row << values(substance,f)[i] : row << "" - end - csv << row - end + # Create a dataset from CSV file + # @param [File] Input file with the following format: + # - ID column (optional): header containing "ID" string, arbitrary ID values + # - SMILES/InChI column: header indicating "SMILES" or "InChI", Smiles or InChI strings + # - one or more properties column(s): header with property name(s), property values + # files with a single property column are read as BioActivities (i.e. dependent variable) + # files with multiple property columns are read as SubstanceProperties (i.e. independent variables) + # @return [OpenTox::Dataset] + def self.from_csv_file file + md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files + dataset = self.find_by(:md5 => md5) + if dataset + $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import." + else + $logger.debug "Parsing #{file}." + table = nil + sep = "," + ["\t",";"].each do |s| # guess alternative CSV separator + if File.readlines(file).first.match(/#{s}/) + sep = s + break end end - end - end - - # Convert dataset to SDF format - # @return [String] SDF string - def to_sdf - sdf = "" - substances.each do |substance| - sdf_lines = substance.sdf.sub(/\$\$\$\$\n/,"").split("\n") - sdf_lines[0] = substance.smiles - sdf += sdf_lines.join("\n") - features.each do |f| - sdf += "\n> <#{f.name}>\n" - sdf += values(substance,f).uniq.join "," - end - sdf += "\n$$$$\n" - end - sdf - end - - # Parsers - - # Create a dataset from PubChem Assay - # @param [Integer] PubChem AssayID (AID) - # @return [OpenTox::Dataset] - def self.from_pubchem aid - url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/#{aid}/CSV" - csv = CSV.parse(RestClientWrapper.get(url)) - csv.select!{|r| r[0].match /^\d/} # discard header rows - table = [["SID","SMILES","Activity"]] - csv.each_slice(100) do |slice| # get SMILES in chunks - sids = slice.collect{|s| s[1]} - smiles = RestClientWrapper.get("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/#{sids.join(",")}/property/CanonicalSMILES/TXT").split("\n") - abort("Could not get SMILES for all SIDs from PubChem") unless sids.size == smiles.size - smiles.each_with_index do |smi,i| - table << [slice[i][1],smi.chomp,slice[i][3]] + table = CSV.read file, :col_sep => sep, :skip_blanks => true, :encoding => 'windows-1251:utf-8' + if table + dataset = self.new(:source => file, :name => File.basename(file,".*"), :md5 => md5) + dataset.parse_table table + else + bad_request_error "#{file} is not a valid CSV/TSV file. Could not find "," ";" or TAB as column separator." end end - dataset = self.new(:source => url) # TODO name - dataset.parse_table table, false dataset end # Create a dataset from SDF file + # files with a single data field are read as BioActivities (i.e. dependent variable) + # files with multiple data fields are read as SubstanceProperties (i.e. independent variable) # @param [File] # @return [OpenTox::Dataset] - def self.from_sdf_file file, map=nil + def self.from_sdf_file file md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files dataset = self.find_by(:md5 => md5) if dataset $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import." else $logger.debug "Parsing #{file}." - table = nil - read_result = false - sdf = "" - dataset = self.new(:source => file, :name => File.basename(file), :md5 => md5) + + dataset = self.new(:source => file, :name => File.basename(file,".*"), :md5 => md5) original_id = OriginalId.find_or_create_by(:dataset_id => dataset.id,:name => dataset.name+".ID") + read_result = false + sdf = "" feature_name = "" compound = nil features = {} + table = [["ID","SMILES"]] File.readlines(file).each do |line| if line.match %r{\$\$\$\$} sdf << line id = sdf.split("\n").first.chomp compound = Compound.from_sdf sdf - dataset.add compound, original_id, id - features.each { |f,v| dataset.add compound, f, v } + row = [id,compound.smiles] + features.each do |f,v| + table[0] << f unless table[0].include? f + row[table[0].index(f)] = v + end + table << row sdf = "" features = {} elsif line.match /^>\s+ feature_name, :measured => true) - value = value.to_f - else - feature = NominalFeature.find_or_create_by(:name => feature_name, :measured => true) - end - features[feature] = value + features[feature_name] = value read_result = false else sdf << line end end end + dataset.parse_table table end dataset.save dataset end - - # Create a dataset from CSV file - # @param [File] - # @param [TrueClass,FalseClass] accept or reject empty values + + # Create a dataset from PubChem Assay + # @param [Integer] PubChem AssayID (AID) # @return [OpenTox::Dataset] - def self.from_csv_file file - md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files - dataset = self.find_by(:md5 => md5) - if dataset - $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import." - else - $logger.debug "Parsing #{file}." - table = nil - [",","\t",";"].each do |sep| # guess CSV separator - if File.readlines(file).first.match(/#{sep}/) - table = CSV.read file, :col_sep => sep, :skip_blanks => true, :encoding => 'windows-1251:utf-8' - break - end - end - if table - dataset = self.new(:source => file, :name => File.basename(file), :md5 => md5) - dataset.parse_table table - else - bad_request_error "#{file} is not a valid CSV/TSV file. Could not find "," ";" or TAB as column separator." + def self.from_pubchem_aid aid + url = File.join PUBCHEM_URI, "assay/aid/#{aid}/CSV" + assay_metadata = JSON.parse(RestClientWrapper.get(File.join PUBCHEM_URI,"assay/aid/#{aid}/description/JSON").to_s)["PC_AssayContainer"][0]["assay"]["descr"] + name = assay_metadata["name"].gsub(/\s+/,"_") + csv = CSV.parse(RestClientWrapper.get(url)) + csv.select!{|r| r[0].match /^\d/} # discard header rows + table = [["SID","SMILES",name]] + csv.each_slice(100) do |slice| # get SMILES in chunks + sids = slice.collect{|s| s[1]} + smiles = RestClientWrapper.get(File.join(PUBCHEM_URI,"compound/cid/#{sids.join(",")}/property/CanonicalSMILES/TXT")).split("\n").collect{|s| s.to_s} + abort("Could not get SMILES for all SIDs from PubChem") unless sids.size == smiles.size + smiles.each_with_index do |smi,i| + table << [slice[i][1].to_s,smi.chomp,slice[i][3].to_s] end end + dataset = self.new(:source => url, :name => name) + dataset.parse_table table dataset end @@ -302,8 +202,8 @@ module OpenTox features = [] # guess feature types + bioactivity = true if feature_names.size == 1 feature_names.each_with_index do |f,i| - metadata = {:name => f, :measured => true} original_id ? j = i+2 : j = i+1 values = table.collect{|row| val=row[j].to_s.strip; val.blank? ? nil : val }.uniq.compact types = values.collect{|v| v.numeric? ? true : false}.uniq @@ -311,11 +211,18 @@ module OpenTox if values.size == 0 # empty feature elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes numeric[i] = true - feature = NumericFeature.find_or_create_by(metadata) + if bioactivity + feature = NumericBioActivity.find_or_create_by(:name => f) + else + feature = NumericSubstanceProperty.find_or_create_by(:name => f) + end else - metadata["accept_values"] = values.sort numeric[i] = false - feature = NominalFeature.find_or_create_by(metadata) + if bioactivity + feature = NominalBioActivity.find_or_create_by(:name => f, :accept_values => values.sort) + else + feature = NominalSubstanceProperty.find_or_create_by(:name => f, :accept_values => values.sort) + end end features << feature if feature end @@ -326,13 +233,12 @@ module OpenTox table.each_with_index do |vals,i| original_id_value = vals.shift.strip if original_id identifier = vals.shift.strip - #warn "No feature values for compound at line #{i+2} of #{source}." if vals.compact.empty? #and !accept_empty_values begin case compound_format when /SMILES/i - substance = OpenTox::Compound.from_smiles(identifier) + substance = Compound.from_smiles(identifier) when /InChI/i - substance = OpenTox::Compound.from_inchi(identifier) + substance = Compound.from_inchi(identifier) end rescue substance = nil @@ -345,18 +251,13 @@ module OpenTox substance.dataset_ids << self.id substance.dataset_ids.uniq! substance.save - - unless vals.size == features.size - warn "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored." - next - end add substance, original_id, original_id_value if original_id vals.each_with_index do |v,j| if v.blank? - warn "Empty value for compound '#{identifier}' and feature '#{feature_names[i]}'." - next + warn "Empty value for compound '#{identifier}' (#{original_id_value}) and feature '#{feature_names[j]}'." + v = nil elsif numeric[j] v = v.to_f else @@ -364,6 +265,7 @@ module OpenTox end add substance, features[j], v end + data_entries[substance.id.to_s] ||= nil #if vals.empty? # no features, eg batch predictions end all_substances.duplicates.each do |substance| @@ -374,6 +276,115 @@ module OpenTox save end + # Serialisation + + # Convert dataset to csv format including compound smiles as first column, other column headers are feature names + # @return [String] + def to_csv(inchi=false) + CSV.generate() do |csv| + compound = substances.first.is_a? Compound + if compound + csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name} + else + csv << ["Name"] + features.collect{|f| f.name} + end + substances.each do |substance| + if compound + name = (inchi ? substance.inchi : substance.smiles) + else + name = substance.name + end + nr_measurements = features.collect{|f| data_entries[substance.id.to_s][f.id.to_s].size if data_entries[substance.id.to_s][f.id.to_s]}.compact.uniq + + if nr_measurements.size > 1 + warn "Unequal number of measurements (#{nr_measurements}) for '#{name}'. Skipping entries." + else + (0..nr_measurements.first-1).each do |i| + row = [name] + features.each do |f| + values(substance,f) ? row << values(substance,f)[i] : row << "" + end + csv << row + end + end + end + end + end + + # Convert dataset to SDF format + # @return [String] SDF string + def to_sdf + sdf = "" + substances.each do |substance| + sdf_lines = substance.sdf.sub(/\$\$\$\$\n/,"").split("\n") + sdf_lines[0] = substance.smiles + sdf += sdf_lines.join("\n") + features.each do |f| + sdf += "\n> <#{f.name}>\n" + sdf += values(substance,f).uniq.join "," + end + sdf += "\n$$$$\n" + end + sdf + end + + # Dataset operations + + # Merge an array of datasets + # @param [Array] OpenTox::Dataset Array to be merged + # @param [Hash] feature modifications + # @param [Hash] value modifications + # @return [OpenTox::Dataset] merged dataset + def self.merge datasets, feature_map=nil, value_map=nil + dataset = self.new(:source => datasets.collect{|d| d.source}.join(", "), :name => datasets.collect{|d| d.name}.uniq.join(", ")) + datasets.each do |d| + d.substances.each do |s| + d.features.each do |f| + d.values(s,f).each do |v| + f = feature_map[f] if feature_map and feature_map[f] + v = value_map[v] if value_map and value_map[v] + dataset.add s,f,v #unless dataset.values(s,f).include? v + end + end + end + end + dataset.save + dataset + end + + # Split a dataset into n folds + # @param [Integer] number of folds + # @return [Array] Array with folds [training_dataset,test_dataset] + def folds n + len = self.substances.size + indices = (0..len-1).to_a.shuffle + mid = (len/n) + chunks = [] + start = 0 + 1.upto(n) do |i| + last = start+mid + last = last-1 unless len%n >= i + test_idxs = indices[start..last] || [] + test_substances = test_idxs.collect{|i| substances[i]} + training_idxs = indices-test_idxs + training_substances = training_idxs.collect{|i| substances[i]} + chunk = [training_substances,test_substances].collect do |substances| + dataset = self.class.create(:name => "#{self.name} (Fold #{i-1})",:source => self.id ) + substances.each do |substance| + substance.dataset_ids << dataset.id + substance.dataset_ids.uniq! + substance.save + dataset.data_entries[substance.id.to_s] = data_entries[substance.id.to_s] ||= {} + end + dataset.save + dataset + end + start = last+1 + chunks << chunk + end + chunks + end + # Delete dataset def delete compounds.each{|c| c.dataset_ids.delete id.to_s} @@ -453,7 +464,7 @@ module OpenTox bad_request_error "'#{compound_format}' is not a supported compound format in the header. " \ "Accepted formats: SMILES, InChI. Please take a look on the help page." end - numeric = [] + #numeric = [] features = [] # guess feature types feature_names.each_with_index do |f,i| @@ -463,11 +474,11 @@ module OpenTox feature = nil if values.size == 0 # empty feature elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes - numeric[i] = true + #numeric[i] = true feature = NumericFeature.find_or_create_by(metadata) else metadata["accept_values"] = values.sort - numeric[i] = false + #numeric[i] = false feature = NominalFeature.find_or_create_by(metadata) end features << feature if feature -- cgit v1.2.3 From 9d17895ab9e8cd31e0f32e8e622e13612ea5ff77 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Fri, 12 Oct 2018 21:58:36 +0200 Subject: validation statistic fixes --- lib/dataset.rb | 108 ++------------------------------------------------------- 1 file changed, 3 insertions(+), 105 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index b6c6173..bbb20be 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -384,6 +384,9 @@ module OpenTox end chunks end + + def transform # TODO + end # Delete dataset def delete @@ -419,109 +422,4 @@ module OpenTox end - class Batch - - include OpenTox - include Mongoid::Document - include Mongoid::Timestamps - store_in collection: "batch" - field :name, type: String - field :source, type: String - field :identifiers, type: Array - field :ids, type: Array - field :compounds, type: Array - field :warnings, type: Array, default: [] - - def self.from_csv_file file - source = file - name = File.basename(file,".*") - batch = self.find_by(:source => source, :name => name) - if batch - $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import." - else - $logger.debug "Parsing #{file}." - # check delimiter - line = File.readlines(file).first - if line.match(/\t/) - table = CSV.read file, :col_sep => "\t", :skip_blanks => true, :encoding => 'windows-1251:utf-8' - else - table = CSV.read file, :skip_blanks => true, :encoding => 'windows-1251:utf-8' - end - batch = self.new(:source => source, :name => name, :identifiers => [], :ids => [], :compounds => []) - - # original IDs - if table[0][0] =~ /ID/i - @original_ids = table.collect{|row| row.shift} - @original_ids.shift - end - - # features - feature_names = table.shift.collect{|f| f.strip} - warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size - compound_format = feature_names.shift.strip - unless compound_format =~ /SMILES|InChI/i - File.delete file - bad_request_error "'#{compound_format}' is not a supported compound format in the header. " \ - "Accepted formats: SMILES, InChI. Please take a look on the help page." - end - #numeric = [] - features = [] - # guess feature types - feature_names.each_with_index do |f,i| - metadata = {:name => f} - values = table.collect{|row| val=row[i+1].to_s.strip; val.blank? ? nil : val }.uniq.compact - types = values.collect{|v| v.numeric? ? true : false}.uniq - feature = nil - if values.size == 0 # empty feature - elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes - #numeric[i] = true - feature = NumericFeature.find_or_create_by(metadata) - else - metadata["accept_values"] = values.sort - #numeric[i] = false - feature = NominalFeature.find_or_create_by(metadata) - end - features << feature if feature - end - - table.each_with_index do |vals,i| - identifier = vals.shift.strip.gsub(/^'|'$/,"") - begin - case compound_format - when /SMILES/i - compound = OpenTox::Compound.from_smiles(identifier) - when /InChI/i - compound = OpenTox::Compound.from_inchi(identifier) - end - rescue - compound = nil - end - # collect only for present compounds - unless compound.nil? - batch.identifiers << identifier - batch.compounds << compound.id - batch.ids << @original_ids[i] if @original_ids - else - batch.warnings << "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}." - end - end - batch.compounds.duplicates.each do |duplicate| - $logger.debug "Duplicates found in #{name}." - dup = Compound.find duplicate - positions = [] - batch.compounds.each_with_index do |co,i| - c = Compound.find co - if !c.blank? and c.inchi and c.inchi == dup.inchi - positions << i+1 - end - end - batch.warnings << "Duplicate compound at ID #{positions.join(' and ')}." - end - batch.save - end - batch - end - - end - end -- cgit v1.2.3 From 1652fd5df948da7ace622c73d158010add656b9f Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Wed, 24 Oct 2018 18:21:34 +0200 Subject: dataset map --- lib/dataset.rb | 178 +++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 122 insertions(+), 56 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index bbb20be..aa66c9f 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -8,6 +8,7 @@ module OpenTox class Dataset field :data_entries, type: Hash, default: {} + field :source, type: String field :md5, type: String # Readers @@ -52,6 +53,44 @@ module OpenTox end end + # Get OriginalId feature + # @return [OpenTox::OriginalId] + def original_id_feature + features.select{|f| f.is_a?(OriginalId)}.first + end + + # Get original id + # @param [OpenTox::Substance] substance + # @return [String] original id + def original_id substance + values(substance,original_id_feature).first + end + + # Get OriginalSmiles feature + # @return [OpenTox::OriginalSmiles] + def original_smiles_feature + features.select{|f| f.is_a?(OriginalSmiles)}.first + end + + # Get original SMILES + # @param [OpenTox::Substance] substance + # @return [String] original SMILES + def original_smiles substance + values(substance,original_smiles_feature).first + end + + # Get nominal and numeric bioactivity features + # @return [Array] + def bioactivity_features + features.select{|f| f.class.to_s.match("BioActivity")} + end + + # Get nominal and numeric bioactivity features + # @return [Array] + def transformed_bioactivity_features + features.select{|f| f.class.to_s.match(/Transformed.*BioActivity/)} + end + # Writers # Add a value for a given substance and feature @@ -188,41 +227,38 @@ module OpenTox # features feature_names = table.shift.collect{|f| f.strip} - warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size + bad_request_error "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size - original_id = nil if feature_names[0] =~ /ID/i # check ID column - feature_names.shift - original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => self.name+".ID") + original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => feature_names.shift) + else + original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => "LineID") end + warnings = Warnings.find_or_create_by(:dataset_id => self.id) + compound_format = feature_names.shift bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i + original_smiles = OriginalSmiles.create if compound_format.match(/SMILES/i) + numeric = [] features = [] # guess feature types bioactivity = true if feature_names.size == 1 + feature_names.each_with_index do |f,i| - original_id ? j = i+2 : j = i+1 + original_id.name.match(/LineID$/) ? j = i+1 : j = i+2 values = table.collect{|row| val=row[j].to_s.strip; val.blank? ? nil : val }.uniq.compact types = values.collect{|v| v.numeric? ? true : false}.uniq feature = nil if values.size == 0 # empty feature elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes numeric[i] = true - if bioactivity - feature = NumericBioActivity.find_or_create_by(:name => f) - else - feature = NumericSubstanceProperty.find_or_create_by(:name => f) - end + bioactivity ? feature = NumericBioActivity.find_or_create_by(:name => f) : feature = NumericSubstanceProperty.find_or_create_by(:name => f) else numeric[i] = false - if bioactivity - feature = NominalBioActivity.find_or_create_by(:name => f, :accept_values => values.sort) - else - feature = NominalSubstanceProperty.find_or_create_by(:name => f, :accept_values => values.sort) - end + bioactivity ? feature = NominalBioActivity.find_or_create_by(:name => f, :accept_values => values.sort) : feature = NominalSubstanceProperty.find_or_create_by(:name => f, :accept_values => values.sort) end features << feature if feature end @@ -231,32 +267,37 @@ module OpenTox all_substances = [] table.each_with_index do |vals,i| - original_id_value = vals.shift.strip if original_id + original_id.name.match(/LineID$/) ? original_id_value = i+1 : original_id_value = vals.shift.strip identifier = vals.shift.strip begin case compound_format when /SMILES/i substance = Compound.from_smiles(identifier) + add substance, original_smiles, identifier when /InChI/i substance = Compound.from_inchi(identifier) end rescue substance = nil end + if substance.nil? # compound parsers may return nil - warn "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}, all entries are ignored." + add substance, original_id, original_id_value + add substance, original_smiles, identifier + add substance, warnings, "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}, all entries are ignored." next end + all_substances << substance substance.dataset_ids << self.id substance.dataset_ids.uniq! substance.save - add substance, original_id, original_id_value if original_id + add substance, original_id, original_id_value vals.each_with_index do |v,j| if v.blank? - warn "Empty value for compound '#{identifier}' (#{original_id_value}) and feature '#{feature_names[j]}'." + add substance, warnings, "Empty value for compound '#{identifier}' (#{original_id_value}) and feature '#{feature_names[j]}'." v = nil elsif numeric[j] v = v.to_f @@ -265,13 +306,15 @@ module OpenTox end add substance, features[j], v end - data_entries[substance.id.to_s] ||= nil #if vals.empty? # no features, eg batch predictions + #data_entries[substance.id.to_s] ||= nil #if vals.empty? # no features, eg batch predictions end all_substances.duplicates.each do |substance| positions = [] - all_substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == substance.inchi} - warn "Duplicate compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." + all_substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.smiles and c.smiles == substance.smiles} + all_substances.select{|s| s.smiles == substance.smiles}.each do |s| + add s, warnings, "Duplicate compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." + end end save end @@ -280,13 +323,20 @@ module OpenTox # Convert dataset to csv format including compound smiles as first column, other column headers are feature names # @return [String] - def to_csv(inchi=false) + def to_csv inchi=false CSV.generate() do |csv| compound = substances.first.is_a? Compound + id = features.select{|f| f.is_a? OriginalId}.first + features.delete(id) + original_smiles = features.select{|f| f.is_a? OriginalSmiles}.first + features.delete(original_smiles) + warning = features.select{|f| f.is_a? Warnings}.first + features.delete(warning) + if compound - csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name} + csv << [id.name, inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name} + ["OriginalSmiles", "Warnings"] else - csv << ["Name"] + features.collect{|f| f.name} + csv << [id.name, "Name"] + features.collect{|f| f.name} end substances.each do |substance| if compound @@ -294,19 +344,10 @@ module OpenTox else name = substance.name end - nr_measurements = features.collect{|f| data_entries[substance.id.to_s][f.id.to_s].size if data_entries[substance.id.to_s][f.id.to_s]}.compact.uniq - - if nr_measurements.size > 1 - warn "Unequal number of measurements (#{nr_measurements}) for '#{name}'. Skipping entries." - else - (0..nr_measurements.first-1).each do |i| - row = [name] - features.each do |f| - values(substance,f) ? row << values(substance,f)[i] : row << "" - end - csv << row - end - end + row = [values(substance,id).first,name] + features.collect{|f| values(substance,f).join(" ")} + row << values(substance,original_smiles).join(" ") + row << values(substance,warning).join(" ") + csv << row end end end @@ -332,18 +373,19 @@ module OpenTox # Merge an array of datasets # @param [Array] OpenTox::Dataset Array to be merged - # @param [Hash] feature modifications - # @param [Hash] value modifications + # @param [Array] OpenTox::Feature Array to be merged # @return [OpenTox::Dataset] merged dataset - def self.merge datasets, feature_map=nil, value_map=nil - dataset = self.new(:source => datasets.collect{|d| d.source}.join(", "), :name => datasets.collect{|d| d.name}.uniq.join(", ")) + def self.merge datasets, features + # TODO warnings + features.uniq! + dataset = self.create(:source => datasets.collect{|d| d.id.to_s}.join(", "), :name => datasets.collect{|d| d.name}.uniq.join(", ")) datasets.each do |d| d.substances.each do |s| - d.features.each do |f| + dataset.add s,d.original_id_feature,d.original_id(s) + dataset.add s,d.original_smiles_feature,d.original_smiles(s) + features.each do |f| d.values(s,f).each do |v| - f = feature_map[f] if feature_map and feature_map[f] - v = value_map[v] if value_map and value_map[v] - dataset.add s,f,v #unless dataset.values(s,f).include? v + dataset.add s,features.first,v #unless dataset.values(s,f).include? v end end end @@ -352,6 +394,17 @@ module OpenTox dataset end + # Copy a dataset + # @return OpenTox::Dataset dataset copy + def copy + dataset = Dataset.new + dataset.data_entries = data_entries + dataset.name = name + dataset.source = id.to_s + dataset.save + dataset + end + # Split a dataset into n folds # @param [Integer] number of folds # @return [Array] Array with folds [training_dataset,test_dataset] @@ -384,6 +437,19 @@ module OpenTox end chunks end + + # Change nominal feature values + # @param [NominalFeature] Original feature + # @param [Hash] how to change feature values + def map feature, map + dataset = self.copy + new_feature = TransformedNominalBioActivity.find_or_create_by(:name => feature.name + " (transformed)", :original_feature_id => feature.id, :transformation => map, :accept_values => map.values.sort) + compounds.each do |c| + values(c,feature).each { |v| dataset.add c, new_feature, map[v] } + end + dataset.save + dataset + end def transform # TODO end @@ -397,9 +463,9 @@ module OpenTox end # Dataset for lazar predictions - class LazarPrediction #< Dataset + class LazarPrediction < Dataset field :creator, type: String - field :prediction_feature_id, type: BSON::ObjectId + #field :prediction_feature_id, type: BSON::ObjectId field :predictions, type: Hash, default: {} # Get prediction feature @@ -408,16 +474,16 @@ module OpenTox Feature.find prediction_feature_id end - # Get all compounds - # @return [Array] - def compounds - substances.select{|s| s.is_a? Compound} + def prediction compound end - # Get all substances - # @return [Array] - def substances - predictions.keys.collect{|id| Substance.find id} + def probability klass + end + + def prediction_interval + end + + def predictions end end -- cgit v1.2.3 From 24e5f9cc16ba164f860620184dc39b024bc3d384 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Wed, 24 Oct 2018 23:51:32 +0200 Subject: dataset tests fixed --- lib/dataset.rb | 73 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 37 insertions(+), 36 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index aa66c9f..c652b25 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -7,7 +7,8 @@ module OpenTox # Collection of substances and features class Dataset - field :data_entries, type: Hash, default: {} + field :data_entries, type: Array, default: [] #substance,feature,value + field :warnings, type: Array, default: [] field :source, type: String field :md5, type: String @@ -28,29 +29,25 @@ module OpenTox # Get all substances # @return [Array] def substances - @substances ||= data_entries.keys.collect{|id| OpenTox::Substance.find id}.uniq + @substances ||= data_entries.collect{|row| OpenTox::Substance.find row[0] if row[0]}.compact.uniq @substances end # Get all features # @return [Array] def features - @features ||= data_entries.collect{|sid,data| data.keys.collect{|id| OpenTox::Feature.find(id)}}.flatten.uniq + @features ||= data_entries.collect{|row| OpenTox::Feature.find(row[1])}.uniq @features end # Get all values for a given substance and feature - # @param [OpenTox::Substance,BSON::ObjectId,String] substance or substance id - # @param [OpenTox::Feature,BSON::ObjectId,String] feature or feature id + # @param [OpenTox::Substance,BSON::ObjectId] substance or substance id + # @param [OpenTox::Feature,BSON::ObjectId] feature or feature id # @return [TrueClass,FalseClass,Float] def values substance,feature substance = substance.id if substance.is_a? Substance feature = feature.id if feature.is_a? Feature - if data_entries[substance.to_s] and data_entries[substance.to_s][feature.to_s] - data_entries[substance.to_s][feature.to_s] - else - [nil] - end + data_entries.select{|row| row[0] == substance and row[1] == feature}.collect{|row| row[2]} end # Get OriginalId feature @@ -79,10 +76,18 @@ module OpenTox values(substance,original_smiles_feature).first end + def warnings_feature + features.select{|f| f.is_a?(Warnings)}.first + end + + #def warnings + #data_entries.select{|row| row[1] == warnings_feature}.collect{|row| row[2]}.compact + #end + # Get nominal and numeric bioactivity features # @return [Array] def bioactivity_features - features.select{|f| f.class.to_s.match("BioActivity")} + features.select{|f| f._type.match(/BioActivity/)} end # Get nominal and numeric bioactivity features @@ -91,6 +96,12 @@ module OpenTox features.select{|f| f.class.to_s.match(/Transformed.*BioActivity/)} end + # Get nominal and numeric substance property features + # @return [Array] + def substance_property_features + features.select{|f| f.class.to_s.match("SubstanceProperty")} + end + # Writers # Add a value for a given substance and feature @@ -100,10 +111,7 @@ module OpenTox def add(substance,feature,value) substance = substance.id if substance.is_a? Substance feature = feature.id if feature.is_a? Feature - data_entries[substance.to_s] ||= {} - data_entries[substance.to_s][feature.to_s] ||= [] - data_entries[substance.to_s][feature.to_s] << value - #data_entries[substance.to_s][feature.to_s].uniq! if value.numeric? # assuming that identical values come from the same source + data_entries << [substance,feature,value] if substance and feature and value end # Parsers @@ -235,8 +243,6 @@ module OpenTox original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => "LineID") end - warnings = Warnings.find_or_create_by(:dataset_id => self.id) - compound_format = feature_names.shift bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i original_smiles = OriginalSmiles.create if compound_format.match(/SMILES/i) @@ -282,9 +288,7 @@ module OpenTox end if substance.nil? # compound parsers may return nil - add substance, original_id, original_id_value - add substance, original_smiles, identifier - add substance, warnings, "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}, all entries are ignored." + warnings << "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}, all entries are ignored." next end @@ -297,8 +301,8 @@ module OpenTox vals.each_with_index do |v,j| if v.blank? - add substance, warnings, "Empty value for compound '#{identifier}' (#{original_id_value}) and feature '#{feature_names[j]}'." - v = nil + warnings << "Empty value for compound '#{identifier}' (#{original_id_value}) and feature '#{feature_names[j]}'." + next elsif numeric[j] v = v.to_f else @@ -306,14 +310,14 @@ module OpenTox end add substance, features[j], v end - #data_entries[substance.id.to_s] ||= nil #if vals.empty? # no features, eg batch predictions end + warnings_feature = Warnings.find_or_create_by(:dataset_id => id) all_substances.duplicates.each do |substance| positions = [] all_substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.smiles and c.smiles == substance.smiles} all_substances.select{|s| s.smiles == substance.smiles}.each do |s| - add s, warnings, "Duplicate compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." + add s, warnings_feature, "Duplicate compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." end end save @@ -325,28 +329,25 @@ module OpenTox # @return [String] def to_csv inchi=false CSV.generate() do |csv| + # TODO support multiple original id|smiles compound = substances.first.is_a? Compound - id = features.select{|f| f.is_a? OriginalId}.first - features.delete(id) - original_smiles = features.select{|f| f.is_a? OriginalSmiles}.first - features.delete(original_smiles) - warning = features.select{|f| f.is_a? Warnings}.first - features.delete(warning) + f = features - [original_id_feature,original_smiles_feature,warnings_feature] if compound - csv << [id.name, inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name} + ["OriginalSmiles", "Warnings"] + csv << ["Original ID", inchi ? "InChI" : "SMILES", "Original SMILES"] + f.collect{|f| f.name} + ["Warnings"] else - csv << [id.name, "Name"] + features.collect{|f| f.name} + csv << ["Original ID", "Name"] + f.collect{|f| f.name} + ["Warnings"] end + substances.each do |substance| if compound name = (inchi ? substance.inchi : substance.smiles) else name = substance.name end - row = [values(substance,id).first,name] + features.collect{|f| values(substance,f).join(" ")} - row << values(substance,original_smiles).join(" ") - row << values(substance,warning).join(" ") + row = [values(substance,original_id_feature).first,name,values(substance,original_smiles_feature).first] + row += f.collect{|f| values(substance,f).join(" ")} + row << values(substance,warnings_feature).join(" ") csv << row end end @@ -427,7 +428,7 @@ module OpenTox substance.dataset_ids << dataset.id substance.dataset_ids.uniq! substance.save - dataset.data_entries[substance.id.to_s] = data_entries[substance.id.to_s] ||= {} + dataset.data_entries << data_entries.select{|row| row[0] == substance.id} end dataset.save dataset -- cgit v1.2.3 From 15f4ad23eb918a91d52779887ccfb51bc6547f1b Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Thu, 25 Oct 2018 18:58:19 +0200 Subject: dataset merge --- lib/dataset.rb | 111 ++++++++++++++++++++++++--------------------------------- 1 file changed, 47 insertions(+), 64 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index c652b25..9611fff 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -29,7 +29,7 @@ module OpenTox # Get all substances # @return [Array] def substances - @substances ||= data_entries.collect{|row| OpenTox::Substance.find row[0] if row[0]}.compact.uniq + @substances ||= data_entries.collect{|row| OpenTox::Substance.find row[0]}.uniq @substances end @@ -43,47 +43,31 @@ module OpenTox # Get all values for a given substance and feature # @param [OpenTox::Substance,BSON::ObjectId] substance or substance id # @param [OpenTox::Feature,BSON::ObjectId] feature or feature id - # @return [TrueClass,FalseClass,Float] + # @return [Array] values def values substance,feature substance = substance.id if substance.is_a? Substance feature = feature.id if feature.is_a? Feature data_entries.select{|row| row[0] == substance and row[1] == feature}.collect{|row| row[2]} end - # Get OriginalId feature - # @return [OpenTox::OriginalId] - def original_id_feature - features.select{|f| f.is_a?(OriginalId)}.first + # Get OriginalId features + # @return [Array] original ID features (merged datasets may have multiple original IDs) + def original_id_features + features.select{|f| f.is_a?(OriginalId)} end - # Get original id - # @param [OpenTox::Substance] substance - # @return [String] original id - def original_id substance - values(substance,original_id_feature).first + # Get OriginalSmiles features + # @return [Array] original smiles features (merged datasets may have multiple original smiles) + def original_smiles_features + features.select{|f| f.is_a?(OriginalSmiles)} end - # Get OriginalSmiles feature - # @return [OpenTox::OriginalSmiles] - def original_smiles_feature - features.select{|f| f.is_a?(OriginalSmiles)}.first + # Get Warnings features + # @return [Array] warnings features (merged datasets may have multiple warnings) + def warnings_features + features.select{|f| f.is_a?(Warnings)} end - # Get original SMILES - # @param [OpenTox::Substance] substance - # @return [String] original SMILES - def original_smiles substance - values(substance,original_smiles_feature).first - end - - def warnings_feature - features.select{|f| f.is_a?(Warnings)}.first - end - - #def warnings - #data_entries.select{|row| row[1] == warnings_feature}.collect{|row| row[2]}.compact - #end - # Get nominal and numeric bioactivity features # @return [Array] def bioactivity_features @@ -93,13 +77,13 @@ module OpenTox # Get nominal and numeric bioactivity features # @return [Array] def transformed_bioactivity_features - features.select{|f| f.class.to_s.match(/Transformed.*BioActivity/)} + features.select{|f| f._type.match(/Transformed.*BioActivity/)} end # Get nominal and numeric substance property features # @return [Array] def substance_property_features - features.select{|f| f.class.to_s.match("SubstanceProperty")} + features.select{|f| f._type.match("SubstanceProperty")} end # Writers @@ -245,7 +229,7 @@ module OpenTox compound_format = feature_names.shift bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i - original_smiles = OriginalSmiles.create if compound_format.match(/SMILES/i) + original_smiles = OriginalSmiles.find_or_create_by(:dataset_id => self.id) if compound_format.match(/SMILES/i) numeric = [] features = [] @@ -325,31 +309,29 @@ module OpenTox # Serialisation - # Convert dataset to csv format including compound smiles as first column, other column headers are feature names + # Convert dataset to csv format # @return [String] - def to_csv inchi=false + def to_csv #inchi=false CSV.generate() do |csv| - # TODO support multiple original id|smiles + compound = substances.first.is_a? Compound - f = features - [original_id_feature,original_smiles_feature,warnings_feature] - - if compound - csv << ["Original ID", inchi ? "InChI" : "SMILES", "Original SMILES"] + f.collect{|f| f.name} + ["Warnings"] - else - csv << ["Original ID", "Name"] + f.collect{|f| f.name} + ["Warnings"] - end + f = features - original_id_features - original_smiles_features - warnings_features + header = original_id_features.collect{|f| "ID "+Dataset.find(f.dataset_id).name} + header += original_smiles_features.collect{|f| "SMILES "+Dataset.find(f.dataset_id).name} if compound + compound ? header << "Canonical SMILES" : header << "Name" + header += f.collect{|f| f.name} + header += warnings_features.collect{|f| "Warnings "+Dataset.find(f.dataset_id).name} + csv << header substances.each do |substance| - if compound - name = (inchi ? substance.inchi : substance.smiles) - else - name = substance.name - end - row = [values(substance,original_id_feature).first,name,values(substance,original_smiles_feature).first] + row = original_id_features.collect{|f| values(substance,f).join(" ")} + row += original_smiles_features.collect{|f| values(substance,f).join(" ")} if compound + compound ? row << substance.smiles : row << substance.name row += f.collect{|f| values(substance,f).join(" ")} - row << values(substance,warnings_feature).join(" ") + row += warnings_features.collect{|f| values(substance,f).uniq.join(" ")} csv << row end + end end @@ -373,23 +355,13 @@ module OpenTox # Dataset operations # Merge an array of datasets - # @param [Array] OpenTox::Dataset Array to be merged - # @param [Array] OpenTox::Feature Array to be merged + # @param [Array] datasets to be merged # @return [OpenTox::Dataset] merged dataset - def self.merge datasets, features - # TODO warnings - features.uniq! + def self.merge datasets dataset = self.create(:source => datasets.collect{|d| d.id.to_s}.join(", "), :name => datasets.collect{|d| d.name}.uniq.join(", ")) datasets.each do |d| - d.substances.each do |s| - dataset.add s,d.original_id_feature,d.original_id(s) - dataset.add s,d.original_smiles_feature,d.original_smiles(s) - features.each do |f| - d.values(s,f).each do |v| - dataset.add s,features.first,v #unless dataset.values(s,f).include? v - end - end - end + dataset.data_entries += d.data_entries + dataset.warnings += d.warnings end dataset.save dataset @@ -400,6 +372,7 @@ module OpenTox def copy dataset = Dataset.new dataset.data_entries = data_entries + dataset.warnings = warnings dataset.name = name dataset.source = id.to_s dataset.save @@ -451,6 +424,16 @@ module OpenTox dataset.save dataset end + + def merge_nominal_features nominal_features, maps=[] + dataset = self.copy + new_feature = MergedNominalBioActivity.find_or_create_by(:name => nominal_features.collect{|f| f.name}.join("/") + " (transformed)", :original_feature_id => feature.id, :transformation => map, :accept_values => map.values.sort) + + compounds.each do |c| + if map + values(c,feature).each { |v| dataset.add c, new_feature, map[v] } + else + end def transform # TODO end -- cgit v1.2.3 From 5e9a08c0b534fa96179fb5c81a9b4193e7b0aad8 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Mon, 29 Oct 2018 17:58:09 +0100 Subject: dataset folds fixed --- lib/dataset.rb | 30 +++--------------------------- 1 file changed, 3 insertions(+), 27 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index 9611fff..41d7b5c 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -401,7 +401,7 @@ module OpenTox substance.dataset_ids << dataset.id substance.dataset_ids.uniq! substance.save - dataset.data_entries << data_entries.select{|row| row[0] == substance.id} + dataset.data_entries += data_entries.select{|row| row[0] == substance.id} end dataset.save dataset @@ -433,6 +433,8 @@ module OpenTox if map values(c,feature).each { |v| dataset.add c, new_feature, map[v] } else + end + end end def transform # TODO @@ -446,30 +448,4 @@ module OpenTox end - # Dataset for lazar predictions - class LazarPrediction < Dataset - field :creator, type: String - #field :prediction_feature_id, type: BSON::ObjectId - field :predictions, type: Hash, default: {} - - # Get prediction feature - # @return [OpenTox::Feature] - def prediction_feature - Feature.find prediction_feature_id - end - - def prediction compound - end - - def probability klass - end - - def prediction_interval - end - - def predictions - end - - end - end -- cgit v1.2.3 From d9c9d78e49d886ea91386adbbd2b523347df226e Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Mon, 29 Oct 2018 20:34:39 +0100 Subject: dataset predictions fixed --- lib/dataset.rb | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index 41d7b5c..78f5633 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -86,6 +86,10 @@ module OpenTox features.select{|f| f._type.match("SubstanceProperty")} end + def prediction_features + features.select{|f| f._type.match("Prediction")} + end + # Writers # Add a value for a given substance and feature @@ -352,6 +356,25 @@ module OpenTox sdf end + def predictions + predictions = {} + substances.each do |s| + predictions[s] ||= {} + prediction_feature = prediction_features.first + predictions[s][:value] = values(s,prediction_feature).first + predictions[s][:warnings] = [] + warnings_features.each { |w| predictions[s][:warnings] += values(s,w) } + if predictions[s][:value] and prediction_feature.is_a? NominalLazarPrediction + prediction_feature.accept_values.each do |v| + f = LazarPredictionProbability.find_by(:name => v, :model_id => prediction_feature.model_id, :training_feature_id => prediction_feature.training_feature_id) + predictions[s][:probabilities] ||= {} + predictions[s][:probabilities][v] = values(s,f).first + end + end + end + predictions + end + # Dataset operations # Merge an array of datasets -- cgit v1.2.3 From d61f78093f4ddf03c27a2c8ae0bab9c1f10c80f5 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Tue, 30 Oct 2018 17:26:59 +0100 Subject: tests fixed --- lib/dataset.rb | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index 78f5633..4543e42 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -41,12 +41,14 @@ module OpenTox end # Get all values for a given substance and feature - # @param [OpenTox::Substance,BSON::ObjectId] substance or substance id - # @param [OpenTox::Feature,BSON::ObjectId] feature or feature id + # @param [OpenTox::Substance,BSON::ObjectId,String] substance or substance id + # @param [OpenTox::Feature,BSON::ObjectId,String] feature or feature id # @return [Array] values def values substance,feature substance = substance.id if substance.is_a? Substance feature = feature.id if feature.is_a? Feature + substance = BSON::ObjectId.from_string(substance) if substance.is_a? String + feature = BSON::ObjectId.from_string(feature) if feature.is_a? String data_entries.select{|row| row[0] == substance and row[1] == feature}.collect{|row| row[2]} end @@ -86,6 +88,8 @@ module OpenTox features.select{|f| f._type.match("SubstanceProperty")} end + # Get nominal and numeric prediction features + # @return [Array] def prediction_features features.select{|f| f._type.match("Prediction")} end @@ -377,19 +381,6 @@ module OpenTox # Dataset operations - # Merge an array of datasets - # @param [Array] datasets to be merged - # @return [OpenTox::Dataset] merged dataset - def self.merge datasets - dataset = self.create(:source => datasets.collect{|d| d.id.to_s}.join(", "), :name => datasets.collect{|d| d.name}.uniq.join(", ")) - datasets.each do |d| - dataset.data_entries += d.data_entries - dataset.warnings += d.warnings - end - dataset.save - dataset - end - # Copy a dataset # @return OpenTox::Dataset dataset copy def copy @@ -434,6 +425,27 @@ module OpenTox end chunks end +=begin + # Merge an array of datasets + # @param [Array] datasets to be merged + # @return [OpenTox::Dataset] merged dataset + def self.merge datasets: datasets, features: features, value_maps: value_maps, keep_original_features: keep_original_features, remove_duplicates: remove_duplicates + dataset = self.create(:source => datasets.collect{|d| d.id.to_s}.join(", "), :name => datasets.collect{|d| d.name}.uniq.join(", ")+" merged") + datasets.each_with_index do |d,i| + dataset.data_entries += d.data_entries + dataset.warnings += d.warnings + end + feature_classes = features.collect{|f| f.class}.uniq + if feature_classes.size == 1 + if features.first.nominal? + merged_feature = MergedNominalBioActivity.find_or_create_by(:name => features.collect{|f| f.name} + " (merged)", :original_feature_id => feature.id, :transformation => map, :accept_values => map.values.sort) + compounds.each do |c| + values(c,feature).each { |v| dataset.add c, new_feature, map[v] } + end + dataset.save + dataset + end +=end # Change nominal feature values # @param [NominalFeature] Original feature -- cgit v1.2.3 From 2d4ce39cb1b489e26b0d6d96026054566a4f77b9 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Tue, 30 Oct 2018 21:11:04 +0100 Subject: dataset merge --- lib/dataset.rb | 45 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 9 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index 4543e42..46a83d7 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -94,6 +94,12 @@ module OpenTox features.select{|f| f._type.match("Prediction")} end + # Get nominal and numeric merged features + # @return [Array] + def merged_features + features.select{|f| f._type.match("Merged")} + end + # Writers # Add a value for a given substance and feature @@ -425,27 +431,48 @@ module OpenTox end chunks end -=begin + # Merge an array of datasets # @param [Array] datasets to be merged # @return [OpenTox::Dataset] merged dataset - def self.merge datasets: datasets, features: features, value_maps: value_maps, keep_original_features: keep_original_features, remove_duplicates: remove_duplicates + def self.merge datasets: , features: , value_maps: , keep_original_features: , remove_duplicates: dataset = self.create(:source => datasets.collect{|d| d.id.to_s}.join(", "), :name => datasets.collect{|d| d.name}.uniq.join(", ")+" merged") - datasets.each_with_index do |d,i| + + datasets.each do |d| dataset.data_entries += d.data_entries dataset.warnings += d.warnings - end + end if keep_original_features + feature_classes = features.collect{|f| f.class}.uniq + merged_feature = nil if feature_classes.size == 1 - if features.first.nominal? - merged_feature = MergedNominalBioActivity.find_or_create_by(:name => features.collect{|f| f.name} + " (merged)", :original_feature_id => feature.id, :transformation => map, :accept_values => map.values.sort) - compounds.each do |c| - values(c,feature).each { |v| dataset.add c, new_feature, map[v] } + if features.first.kind_of? NominalFeature + merged_feature = MergedNominalBioActivity.find_or_create_by(:name => features.collect{|f| f.name}.uniq.join(", ") + " merged", :original_feature_ids => features.collect{|f| f.id}, :transformations => value_maps) + else + merged_feature = MergedNumericBioActivity.find_or_create_by(:name => features.collect{|f| f.name} + " merged", :original_feature_ids => features.collect{|f| f.id}) # TODO, :transformations + end + else + bad_request_error "Cannot merge features of different types (#{feature_classes})." end + + accept_values = [] + features.each_with_index do |f,i| + dataset.data_entries += datasets[i].data_entries.select{|de| de[1] == f.id}.collect do |de| + value_maps[i] ? v = value_maps[i][de[2]] : v = de[2] + accept_values << v + [de[0],merged_feature.id,v] + end + end + + if merged_feature.is_a? MergedNominalBioActivity + merged_feature.accept_values = accept_values.uniq.sort + merged_feature.save + end + + dataset.data_entries.uniq! if remove_duplicates dataset.save dataset end -=end # Change nominal feature values # @param [NominalFeature] Original feature -- cgit v1.2.3 From 5b08a8c6d8e5567d253bec92d5bf5d18fd040cdc Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Wed, 31 Oct 2018 14:50:42 +0100 Subject: pubchem import for openrisknet --- lib/dataset.rb | 41 ++++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 11 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index 46a83d7..d02a302 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -207,21 +207,40 @@ module OpenTox # @param [Integer] PubChem AssayID (AID) # @return [OpenTox::Dataset] def self.from_pubchem_aid aid - url = File.join PUBCHEM_URI, "assay/aid/#{aid}/CSV" - assay_metadata = JSON.parse(RestClientWrapper.get(File.join PUBCHEM_URI,"assay/aid/#{aid}/description/JSON").to_s)["PC_AssayContainer"][0]["assay"]["descr"] + # TODO get regression data + aid_url = File.join PUBCHEM_URI, "assay/aid/#{aid}" + assay_metadata = JSON.parse(RestClientWrapper.get(File.join aid_url,"description/JSON").to_s)["PC_AssayContainer"][0]["assay"]["descr"] name = assay_metadata["name"].gsub(/\s+/,"_") - csv = CSV.parse(RestClientWrapper.get(url)) - csv.select!{|r| r[0].match /^\d/} # discard header rows + dataset = self.new(:source => aid_url, :name => name) + # Get assay data in chunks + # Assay record retrieval is limited to 10000 SIDs + # https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest-tutorial$_Toc458584435 + list = JSON.parse(RestClientWrapper.get(File.join aid_url, "sids/JSON?list_return=listkey").to_s)["IdentifierList"] + listkey = list["ListKey"] + size = list["Size"] + start = 0 + csv = [] + while start < size + url = File.join aid_url, "CSV?sid=listkey&listkey=#{listkey}&listkey_start=#{start}&listkey_count=10000" + csv += CSV.parse(RestClientWrapper.get(url).to_s).select{|r| r[0].match /^\d/} # discard header rows + start += 10000 + end table = [["SID","SMILES",name]] csv.each_slice(100) do |slice| # get SMILES in chunks - sids = slice.collect{|s| s[1]} - smiles = RestClientWrapper.get(File.join(PUBCHEM_URI,"compound/cid/#{sids.join(",")}/property/CanonicalSMILES/TXT")).split("\n").collect{|s| s.to_s} - abort("Could not get SMILES for all SIDs from PubChem") unless sids.size == smiles.size - smiles.each_with_index do |smi,i| - table << [slice[i][1].to_s,smi.chomp,slice[i][3].to_s] + cids = slice.collect{|s| s[2]} + pubchem_cids = [] + JSON.parse(RestClientWrapper.get(File.join(PUBCHEM_URI,"compound/cid/#{cids.join(",")}/property/CanonicalSMILES/JSON")).to_s)["PropertyTable"]["Properties"].each do |prop| + i = cids.index(prop["CID"].to_s) + value = slice[i][3] + if value == "Active" or value == "Inactive" + table << [slice[i][1].to_s,prop["CanonicalSMILES"],slice[i][3].to_s] + pubchem_cids << prop["CID"].to_s + else + dataset.warnings << "Ignoring CID #{prop["CID"]}/ SMILES #{prop["CanonicalSMILES"]}, because PubChem activity is #{value}." + end end + (cids-pubchem_cids).each { |cid| dataset.warnings << "Could not retrieve SMILES for CID #{cid}, all entries are ignored." } end - dataset = self.new(:source => url, :name => name) dataset.parse_table table dataset end @@ -315,7 +334,7 @@ module OpenTox positions = [] all_substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.smiles and c.smiles == substance.smiles} all_substances.select{|s| s.smiles == substance.smiles}.each do |s| - add s, warnings_feature, "Duplicate compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." + add s, warnings_feature, "Duplicated compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." end end save -- cgit v1.2.3 From 3a9c9332b660d35720ad4fa1f55ee0883e53aecd Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Fri, 2 Nov 2018 20:34:44 +0100 Subject: warnings fixed, cleanup --- lib/dataset.rb | 31 +++++++++---------------------- 1 file changed, 9 insertions(+), 22 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index d02a302..42733e4 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -310,10 +310,6 @@ module OpenTox end all_substances << substance - substance.dataset_ids << self.id - substance.dataset_ids.uniq! - substance.save - add substance, original_id, original_id_value vals.each_with_index do |v,j| @@ -422,6 +418,7 @@ module OpenTox # @param [Integer] number of folds # @return [Array] Array with folds [training_dataset,test_dataset] def folds n + $logger.debug "Creating #{n} folds for #{name}." len = self.substances.size indices = (0..len-1).to_a.shuffle mid = (len/n) @@ -431,19 +428,15 @@ module OpenTox last = start+mid last = last-1 unless len%n >= i test_idxs = indices[start..last] || [] - test_substances = test_idxs.collect{|i| substances[i]} + test_substances = test_idxs.collect{|i| substances[i].id} training_idxs = indices-test_idxs - training_substances = training_idxs.collect{|i| substances[i]} + training_substances = training_idxs.collect{|i| substances[i].id} chunk = [training_substances,test_substances].collect do |substances| - dataset = self.class.create(:name => "#{self.name} (Fold #{i-1})",:source => self.id ) - substances.each do |substance| - substance.dataset_ids << dataset.id - substance.dataset_ids.uniq! - substance.save - dataset.data_entries += data_entries.select{|row| row[0] == substance.id} - end - dataset.save - dataset + self.class.create( + :name => "#{self.name} (Fold #{i-1})", + :source => self.id, + :data_entries => data_entries.select{|row| substances.include? row[0]} + ) end start = last+1 chunks << chunk @@ -468,7 +461,7 @@ module OpenTox if features.first.kind_of? NominalFeature merged_feature = MergedNominalBioActivity.find_or_create_by(:name => features.collect{|f| f.name}.uniq.join(", ") + " merged", :original_feature_ids => features.collect{|f| f.id}, :transformations => value_maps) else - merged_feature = MergedNumericBioActivity.find_or_create_by(:name => features.collect{|f| f.name} + " merged", :original_feature_ids => features.collect{|f| f.id}) # TODO, :transformations + merged_feature = MergedNumericBioActivity.find_or_create_by(:name => features.collect{|f| f.name} + " merged", :original_feature_ids => features.collect{|f| f.id}) # TODO: regression transformations end else bad_request_error "Cannot merge features of different types (#{feature_classes})." @@ -521,12 +514,6 @@ module OpenTox def transform # TODO end - # Delete dataset - def delete - compounds.each{|c| c.dataset_ids.delete id.to_s} - super - end - end end -- cgit v1.2.3 From cf80ed17102a0368df8d65037d113b521cdf6f0c Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Tue, 6 Nov 2018 19:01:58 +0100 Subject: sdf export fixed --- lib/dataset.rb | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index 42733e4..b09d7bf 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -199,7 +199,6 @@ module OpenTox end dataset.parse_table table end - dataset.save dataset end @@ -290,7 +289,7 @@ module OpenTox all_substances = [] table.each_with_index do |vals,i| - original_id.name.match(/LineID$/) ? original_id_value = i+1 : original_id_value = vals.shift.strip + original_id.name.match(/LineID$/) ? original_id_value = i+1 : original_id_value = vals.shift.to_s.strip identifier = vals.shift.strip begin case compound_format @@ -368,13 +367,17 @@ module OpenTox # @return [String] SDF string def to_sdf sdf = "" - substances.each do |substance| - sdf_lines = substance.sdf.sub(/\$\$\$\$\n/,"").split("\n") - sdf_lines[0] = substance.smiles + compounds.each do |compound| + sdf_lines = compound.sdf.sub(/\$\$\$\$\n/,"").split("\n") + sdf_lines[0] = compound.smiles sdf += sdf_lines.join("\n") - features.each do |f| - sdf += "\n> <#{f.name}>\n" - sdf += values(substance,f).uniq.join "," + bioactivity_features.each do |f| + v = values(compound,f) + unless v.empty? + sdf += "\n> <#{f.name}>\n" + sdf += v.uniq.join "," + sdf += "\n" + end end sdf += "\n$$$$\n" end -- cgit v1.2.3 From ae78e8216909ebfa708b8da3c55248a68abc291c Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Wed, 14 Nov 2018 13:35:17 +0100 Subject: public model validation, updated documentation --- lib/dataset.rb | 36 +++++++----------------------------- 1 file changed, 7 insertions(+), 29 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index b09d7bf..90b4993 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -384,6 +384,8 @@ module OpenTox sdf end + # Get lazar predictions from a dataset + # @return [Hash] predictions def predictions predictions = {} substances.each do |s| @@ -448,7 +450,11 @@ module OpenTox end # Merge an array of datasets - # @param [Array] datasets to be merged + # @param [Array] datasets Datasets to be merged + # @param [Array] features Features to be merged (same size as datasets) + # @param [Array] value_maps Value transfomations (use nil for keeping original values, same size as dataset) + # @param [Bool] keep_original_features Copy original features/values to the merged dataset + # @param [Bool] remove_duplicates Delete duplicated values (assuming they come from the same experiment) # @return [OpenTox::Dataset] merged dataset def self.merge datasets: , features: , value_maps: , keep_original_features: , remove_duplicates: dataset = self.create(:source => datasets.collect{|d| d.id.to_s}.join(", "), :name => datasets.collect{|d| d.name}.uniq.join(", ")+" merged") @@ -489,34 +495,6 @@ module OpenTox dataset end - # Change nominal feature values - # @param [NominalFeature] Original feature - # @param [Hash] how to change feature values - def map feature, map - dataset = self.copy - new_feature = TransformedNominalBioActivity.find_or_create_by(:name => feature.name + " (transformed)", :original_feature_id => feature.id, :transformation => map, :accept_values => map.values.sort) - compounds.each do |c| - values(c,feature).each { |v| dataset.add c, new_feature, map[v] } - end - dataset.save - dataset - end - - def merge_nominal_features nominal_features, maps=[] - dataset = self.copy - new_feature = MergedNominalBioActivity.find_or_create_by(:name => nominal_features.collect{|f| f.name}.join("/") + " (transformed)", :original_feature_id => feature.id, :transformation => map, :accept_values => map.values.sort) - - compounds.each do |c| - if map - values(c,feature).each { |v| dataset.add c, new_feature, map[v] } - else - end - end - end - - def transform # TODO - end - end end -- cgit v1.2.3 From 7e547fd4a296f497615a7805d565b378cb1bd7cd Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Wed, 14 Nov 2018 17:33:44 +0100 Subject: bad_request_error substituted with ArgumentError --- lib/dataset.rb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index 90b4993..3979105 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -142,7 +142,7 @@ module OpenTox dataset = self.new(:source => file, :name => File.basename(file,".*"), :md5 => md5) dataset.parse_table table else - bad_request_error "#{file} is not a valid CSV/TSV file. Could not find "," ";" or TAB as column separator." + raise ArgumentError, "#{file} is not a valid CSV/TSV file. Could not find "," ";" or TAB as column separator." end end dataset @@ -251,7 +251,7 @@ module OpenTox # features feature_names = table.shift.collect{|f| f.strip} - bad_request_error "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size + raise ArgumentError, "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size if feature_names[0] =~ /ID/i # check ID column original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => feature_names.shift) @@ -260,7 +260,7 @@ module OpenTox end compound_format = feature_names.shift - bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i + raise ArgumentError, "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i original_smiles = OriginalSmiles.find_or_create_by(:dataset_id => self.id) if compound_format.match(/SMILES/i) numeric = [] @@ -473,7 +473,7 @@ module OpenTox merged_feature = MergedNumericBioActivity.find_or_create_by(:name => features.collect{|f| f.name} + " merged", :original_feature_ids => features.collect{|f| f.id}) # TODO: regression transformations end else - bad_request_error "Cannot merge features of different types (#{feature_classes})." + raise ArgumentError, "Cannot merge features of different types (#{feature_classes})." end accept_values = [] -- cgit v1.2.3 From 1b44e0cd76f2ead93b8b3fa0f970c85ef32a4b14 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Fri, 16 Nov 2018 22:45:17 +0100 Subject: confidence for prediction datasets --- lib/dataset.rb | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index 3979105..df17569 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -70,6 +70,12 @@ module OpenTox features.select{|f| f.is_a?(Warnings)} end + # Get Confidence feature + # @return [OpenTox::Confidence] confidence feature + def confidence_feature + features.select{|f| f.is_a?(Confidence)}.first + end + # Get nominal and numeric bioactivity features # @return [Array] def bioactivity_features @@ -392,8 +398,9 @@ module OpenTox predictions[s] ||= {} prediction_feature = prediction_features.first predictions[s][:value] = values(s,prediction_feature).first - predictions[s][:warnings] = [] - warnings_features.each { |w| predictions[s][:warnings] += values(s,w) } + #predictions[s][:warnings] = [] + #warnings_features.each { |w| predictions[s][:warnings] += values(s,w) } + predictions[s][:confidence] = values(s,confidence_feature).first if predictions[s][:value] and prediction_feature.is_a? NominalLazarPrediction prediction_feature.accept_values.each do |v| f = LazarPredictionProbability.find_by(:name => v, :model_id => prediction_feature.model_id, :training_feature_id => prediction_feature.training_feature_id) -- cgit v1.2.3 From 455da06aa6459da0d25b286ca6cb866ff64c4c34 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 20 Jun 2019 22:01:50 +0200 Subject: separate csv serialisations for batch predictions and training data, repeated measurements in mutagenicity dataset fixed, daphnia import fixed, CENTRAL_MONGO_IP removed --- lib/dataset.rb | 75 +++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 51 insertions(+), 24 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index df17569..596c53c 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -96,8 +96,14 @@ module OpenTox # Get nominal and numeric prediction features # @return [Array] - def prediction_features - features.select{|f| f._type.match("Prediction")} + def prediction_feature + features.select{|f| f._type.match(/Prediction$/)}.first + end + + # Get supporting nominal and numeric prediction features (class probabilities, prediction interval) + # @return [Array] + def prediction_supporting_features + features.select{|f| f.is_a?(LazarPredictionProbability) or f.is_a?(LazarPredictionInterval)} end # Get nominal and numeric merged features @@ -259,7 +265,7 @@ module OpenTox feature_names = table.shift.collect{|f| f.strip} raise ArgumentError, "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size - if feature_names[0] =~ /ID/i # check ID column + if feature_names[0] !~ /SMILES|InChI/i # check ID column original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => feature_names.shift) else original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => "LineID") @@ -343,30 +349,52 @@ module OpenTox # Serialisation - # Convert dataset to csv format + # Convert lazar prediction dataset to csv format # @return [String] - def to_csv #inchi=false - CSV.generate() do |csv| - - compound = substances.first.is_a? Compound - f = features - original_id_features - original_smiles_features - warnings_features - header = original_id_features.collect{|f| "ID "+Dataset.find(f.dataset_id).name} - header += original_smiles_features.collect{|f| "SMILES "+Dataset.find(f.dataset_id).name} if compound - compound ? header << "Canonical SMILES" : header << "Name" - header += f.collect{|f| f.name} - header += warnings_features.collect{|f| "Warnings "+Dataset.find(f.dataset_id).name} - csv << header - - substances.each do |substance| - row = original_id_features.collect{|f| values(substance,f).join(" ")} - row += original_smiles_features.collect{|f| values(substance,f).join(" ")} if compound - compound ? row << substance.smiles : row << substance.name - row += f.collect{|f| values(substance,f).join(" ")} - row += warnings_features.collect{|f| values(substance,f).uniq.join(" ")} + def to_prediction_csv + + compound = substances.first.is_a? Compound + header = ["ID"] + header << "Original SMILES" if compound + compound ? header << "Canonical SMILES" : header << "Name" + header << "Prediction" if prediction_feature + header << "Confidence" if confidence_feature + header += prediction_supporting_features.collect{|f| f.name} + header << "Measurements" + csv = [header] + + substances.each do |substance| + row = original_id_features.collect{|f| values(substance,f).join(" ")} + row += original_smiles_features.collect{|f| values(substance,f).join(" ")} if compound + compound ? row << substance.smiles : row << substance.name + row << values(substance,prediction_feature).join(" ") + row << values(substance,confidence_feature).join(" ") + row += prediction_supporting_features.collect{|f| values(substance,f).join(" ")} + row << values(substance,bioactivity_features[0]).join(" ") + csv << row + end + csv.collect{|r| r.join(",")}.join("\n") + end + + # Convert dataset into csv formatted training data + # @return [String] + def to_training_csv + + p features + p bioactivity_features + header = ["Canonical SMILES"] + header << bioactivity_features[0].name + csv = [header] + + substances.each do |substance| + nr_activities = values(substance,bioactivity_features.first).size + (0..nr_activities-1).each do |n| # new row for each value + row = [substance.smiles] + row << values(substance,bioactivity_features[0])[n] csv << row end - end + csv.collect{|r| r.join(",")}.join("\n") end # Convert dataset to SDF format @@ -396,7 +424,6 @@ module OpenTox predictions = {} substances.each do |s| predictions[s] ||= {} - prediction_feature = prediction_features.first predictions[s][:value] = values(s,prediction_feature).first #predictions[s][:warnings] = [] #warnings_features.each { |w| predictions[s][:warnings] += values(s,w) } -- cgit v1.2.3 From 7aac1c36369b41501edfc261e4f7ad77dec6b2a1 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Fri, 21 Jun 2019 10:45:59 +0200 Subject: test_from_csv2 fixed, prefer merged_feature and transformed_feature in to_training_csv --- lib/dataset.rb | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index 596c53c..fb1afd2 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -107,7 +107,7 @@ module OpenTox end # Get nominal and numeric merged features - # @return [Array] + # @return [Array] def merged_features features.select{|f| f._type.match("Merged")} end @@ -380,17 +380,20 @@ module OpenTox # @return [String] def to_training_csv - p features - p bioactivity_features + export_features = merged_features + export_features = transformed_bioactivity_features if export_features.empty? + export_features = bioactivity_features if export_features.empty? + export_feature = export_features.first + header = ["Canonical SMILES"] - header << bioactivity_features[0].name + header << bioactivity_features.first.name # use original bioactivity name instead of long merged name csv = [header] substances.each do |substance| nr_activities = values(substance,bioactivity_features.first).size (0..nr_activities-1).each do |n| # new row for each value row = [substance.smiles] - row << values(substance,bioactivity_features[0])[n] + row << values(substance,export_feature)[n] csv << row end end @@ -502,7 +505,7 @@ module OpenTox merged_feature = nil if feature_classes.size == 1 if features.first.kind_of? NominalFeature - merged_feature = MergedNominalBioActivity.find_or_create_by(:name => features.collect{|f| f.name}.uniq.join(", ") + " merged", :original_feature_ids => features.collect{|f| f.id}, :transformations => value_maps) + merged_feature = MergedNominalBioActivity.find_or_create_by(:name => features.collect{|f| f.name}.uniq.join(" and ") + " merged", :original_feature_ids => features.collect{|f| f.id}, :transformations => value_maps) else merged_feature = MergedNumericBioActivity.find_or_create_by(:name => features.collect{|f| f.name} + " merged", :original_feature_ids => features.collect{|f| f.id}) # TODO: regression transformations end -- cgit v1.2.3