require 'matrix'

class Dataset

  # Read a CSV file and the metadata files (dependent_variable_type,
  # dependent_variable_values, independent_variable_type) expected in the
  # same directory.
  def initialize file
    @dir = File.dirname file
    @dependent_variable_type = File.read(File.join(@dir,"dependent_variable_type")).chomp
    if @dependent_variable_type == "binary"
      @dependent_variable_values = {}
      File.readlines(File.join(@dir,"dependent_variable_values")).each_with_index{|v,i| @dependent_variable_values[v.chomp] = i}
    end
    @independent_variable_type = File.read(File.join(@dir,"independent_variable_type")).chomp
    @lines = File.readlines(file)
    @header = @lines.shift.split(",")
    @has_id = @header.first.match(/ID|SMILES/i) ? true : false
    @dependent_variable_name = @header.pop
    @ids = []
    @dependent_variables = []
    @independent_variables = []
    @independent_variable_names = []
  end

  # Write ids, variable names and variable values to separate files in @dir.
  def print_variables
    File.open(File.join(@dir,"ids"),"w+") { |f| f.puts @ids.join("\n") }
    File.open(File.join(@dir,"dependent_variable_name"),"w+") { |f| f.puts @dependent_variable_name }
    File.open(File.join(@dir,"dependent_variables"),"w+") { |f| f.puts @dependent_variables.join("\n") }
    File.open(File.join(@dir,"independent_variable_names"),"w+") { |f| f.puts @independent_variable_names.join(",") }
    File.open(File.join(@dir,"independent_variables"),"w+") { |f| @independent_variables.each{|row| f.puts row.join(",")} }
  end

  # Center and scale numeric independent variables (z-scores), dropping
  # constant columns. Array#mean and Array#standard_deviation are assumed to
  # be provided by the surrounding library (see the sketch after the class).
  def scale_independent_variables file
    @header.shift if @has_id
    @independent_variable_names = @header
    @lines.each_with_index do |line,i|
      items = line.chomp.split(",")
      # Use the ID column when present, the line index otherwise
      # (same convention as fingerprint_independent_variables below).
      @has_id ? @ids << items.shift : @ids << i
      if @dependent_variable_type == "binary"
        @dependent_variables << @dependent_variable_values[items.pop]
      elsif @dependent_variable_type == "numeric"
        @dependent_variables << items.pop.to_f
      end
      @independent_variables << items.collect{|v| v.to_f}
    end
    @independent_variables = Matrix[ *@independent_variables ]
    columns = @independent_variables.column_vectors
    stdev = columns.collect{|c| c.to_a.standard_deviation}
    # Remove constant columns; iterate backwards so pending indices stay valid.
    stdev.each_index.reverse_each do |i|
      if stdev[i] == 0
        @independent_variable_names.delete_at(i)
        columns.delete_at(i)
      end
    end
    @independent_variable_means = columns.collect{|c| c.to_a.mean}
    @independent_variable_standard_deviations = columns.collect{|c| c.to_a.standard_deviation}
    scaled_columns = []
    columns.each_with_index{|col,i| scaled_columns << col.collect{|v| v ? (v-@independent_variable_means[i])/@independent_variable_standard_deviations[i] : nil}}
    @independent_variables = Matrix.columns(scaled_columns).to_a
    print_variables
    File.open(File.join(@dir,"means"),"w+") { |f| f.puts @independent_variable_means.join(",") }
    File.open(File.join(@dir,"standard_deviations"),"w+") { |f| f.puts @independent_variable_standard_deviations.join(",") }
  end
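  # Usage sketch (not part of the original file): scaling a numeric dataset.
  # The CSV path and directory layout are hypothetical; the directory must
  # contain the dependent_variable_type, dependent_variable_values and
  # independent_variable_type files read in #initialize.
  #
  #   dataset = Dataset.new("data/activities.csv")
  #   dataset.scale_independent_variables("data/activities.csv")
  #   # => writes ids, dependent_variables, independent_variables,
  #   #    means and standard_deviations into data/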
  # Replace independent variables with molecular fingerprints. Compound and
  # its #fingerprint method are assumed to be provided by the surrounding
  # library.
  def fingerprint_independent_variables file, fingerprint_type="MP2D"
    fingerprints = []
    @lines.each_with_index do |line,i|
      items = line.chomp.split(",")
      @has_id ? @ids << items.shift : @ids << i
      if @dependent_variable_type == "binary"
        @dependent_variables << @dependent_variable_values[items.pop]
      elsif @dependent_variable_type == "numeric"
        @dependent_variables << items.pop.to_f
      end
      # Collect each fingerprint so that feature names can be derived below.
      fingerprint = Compound.new(items[0]).fingerprint(fingerprint_type)
      fingerprints << fingerprint
      @independent_variables << [items[0]] + fingerprint
    end
    @independent_variable_names = ["Canonical Smiles"] + fingerprints.flatten.sort.uniq
    print_variables
  end

end
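# Dataset#scale_independent_variables calls Array#mean and
# Array#standard_deviation, which plain Ruby does not define; the surrounding
# library is assumed to provide them as core extensions. A minimal sketch of
# those extensions (sample standard deviation, n-1 denominator; the original
# library may use a different convention), guarded so it never overrides an
# existing implementation:
unless Array.method_defined?(:mean)
  class Array
    # Arithmetic mean of the elements.
    def mean
      sum.to_f / size
    end
  end
end

unless Array.method_defined?(:standard_deviation)
  class Array
    # Sample standard deviation of the elements.
    def standard_deviation
      m = mean
      Math.sqrt(collect{|x| (x - m)**2}.sum / (size - 1).to_f)
    end
  end
end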
=begin
  # Create a dataset from a PubChem assay
  # @param [Integer] aid PubChem AssayID (AID)
  # @return [OpenTox::Dataset]
  def self.from_pubchem_aid aid
    # TODO get regression data
    aid_url = File.join PUBCHEM_URI, "assay/aid/#{aid}"
    assay_metadata = JSON.parse(RestClientWrapper.get(File.join aid_url,"description/JSON").to_s)["PC_AssayContainer"][0]["assay"]["descr"]
    name = assay_metadata["name"].gsub(/\s+/,"_")
    dataset = self.new(:source => aid_url, :name => name)
    # Get assay data in chunks
    # Assay record retrieval is limited to 10000 SIDs
    # https://pubchemdocs.ncbi.nlm.nih.gov/pug-rest-tutorial$_Toc458584435
    list = JSON.parse(RestClientWrapper.get(File.join aid_url, "sids/JSON?list_return=listkey").to_s)["IdentifierList"]
    listkey = list["ListKey"]
    size = list["Size"]
    start = 0
    csv = []
    while start < size
      url = File.join aid_url, "CSV?sid=listkey&listkey=#{listkey}&listkey_start=#{start}&listkey_count=10000"
      csv += CSV.parse(RestClientWrapper.get(url).to_s).select{|r| r[0].match /^\d/} # discard header rows
      start += 10000
    end
    table = [["SID","SMILES",name]]
    csv.each_slice(100) do |slice| # get SMILES in chunks
      cids = slice.collect{|s| s[2]}
      pubchem_cids = []
      JSON.parse(RestClientWrapper.get(File.join(PUBCHEM_URI,"compound/cid/#{cids.join(",")}/property/CanonicalSMILES/JSON")).to_s)["PropertyTable"]["Properties"].each do |prop|
        i = cids.index(prop["CID"].to_s)
        value = slice[i][3]
        if value == "Active" or value == "Inactive"
          table << [slice[i][1].to_s,prop["CanonicalSMILES"],slice[i][3].to_s]
          pubchem_cids << prop["CID"].to_s
        else
          dataset.warnings << "Ignoring CID #{prop["CID"]}/ SMILES #{prop["CanonicalSMILES"]}, because PubChem activity is #{value}."
        end
      end
      (cids-pubchem_cids).each { |cid| dataset.warnings << "Could not retrieve SMILES for CID #{cid}, all entries are ignored." }
    end
    dataset.parse_table table
    dataset
  end

  # Merge an array of datasets
  # @param [Array] datasets Datasets to be merged
  # @param [Array] features Features to be merged (same size as datasets)
  # @param [Array] value_maps Value transformations (use nil to keep original values, same size as datasets)
  # @param [Bool] keep_original_features Copy original features/values to the merged dataset
  # @param [Bool] remove_duplicates Delete duplicated values (assuming they come from the same experiment)
  # @return [OpenTox::Dataset] merged dataset
  def self.merge datasets: , features: , value_maps: , keep_original_features: , remove_duplicates:
    dataset = self.create(:source => datasets.collect{|d| d.id.to_s}.join(", "), :name => datasets.collect{|d| d.name}.uniq.join(", ")+" merged")
    datasets.each do |d|
      dataset.data_entries += d.data_entries
      dataset.warnings += d.warnings
    end if keep_original_features
    feature_classes = features.collect{|f| f.class}.uniq
    merged_feature = nil
    if feature_classes.size == 1
      if features.first.kind_of? NominalFeature
        merged_feature = MergedNominalBioActivity.find_or_create_by(:name => features.collect{|f| f.name}.uniq.join(" and ") + " merged", :original_feature_ids => features.collect{|f| f.id}, :transformations => value_maps)
      else
        merged_feature = MergedNumericBioActivity.find_or_create_by(:name => features.collect{|f| f.name} + " merged", :original_feature_ids => features.collect{|f| f.id}) # TODO: regression transformations
      end
    else
      raise ArgumentError, "Cannot merge features of different types (#{feature_classes})."
    end
    accept_values = []
    features.each_with_index do |f,i|
      dataset.data_entries += datasets[i].data_entries.select{|de| de[1] == f.id}.collect do |de|
        value_maps[i] ? v = value_maps[i][de[2]] : v = de[2]
        accept_values << v
        [de[0],merged_feature.id,v]
      end
    end
    if merged_feature.is_a? MergedNominalBioActivity
      merged_feature.accept_values = accept_values.uniq.sort
      merged_feature.save
    end
    dataset.data_entries.uniq! if remove_duplicates
    dataset.save
    dataset
  end
=end
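# Usage sketch (not part of the original file): building a fingerprint matrix
# instead of scaled numeric descriptors. Compound#fingerprint is assumed to
# come from the surrounding library; the path is hypothetical.
#
#   dataset = Dataset.new("data/mutagenicity.csv")
#   dataset.fingerprint_independent_variables("data/mutagenicity.csv", "MP2D")
#   # => writes ids, dependent_variables and one row per compound (canonical
#   #    SMILES followed by its fingerprint) into data/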