From 1f789133d961c29d3babfaf69cdde3d675288537 Mon Sep 17 00:00:00 2001
From: Christoph Helma
Date: Sat, 24 Aug 2019 14:44:52 +0200
Subject: initial refactored version for mutagenicity paper

---
 lib/array.rb          |  87 +++++++
 lib/classification.rb |  43 ++--
 lib/compound.rb       | 475 ++++++++++++++++-------------------
 lib/dataset.rb        | 529 +++++----------------------------------
 lib/lazar.rb          |  96 ++------
 lib/model.rb          | 669 +++++++++-----------------------------------------
 lib/similarity.rb     | 128 ++++------
 lib/statistics.rb     |  71 ++++++
 8 files changed, 647 insertions(+), 1451 deletions(-)
 create mode 100644 lib/array.rb
 create mode 100644 lib/statistics.rb
(limited to 'lib')

diff --git a/lib/array.rb b/lib/array.rb
new file mode 100644
index 0000000..55186bd
--- /dev/null
+++ b/lib/array.rb
@@ -0,0 +1,87 @@
+class Array
+
+  # Sum the size of single arrays in an array of arrays
+  # @param [Array] Array of arrays
+  # @return [Integer] Sum of size of array elements
+  def sum_size
+    self.inject(0) { |s,a|
+      if a.respond_to?('size')
+        s+=a.size
+      else
+        internal_server_error "No size available: #{a.inspect}"
+      end
+    }
+  end
+
+  # Check if the array has just one unique value.
+  # @param [Array] Array to test.
+  # @return [TrueClass,FalseClass]
+  def zero_variance?
+    return self.uniq.size == 1
+  end
+
+  # Get the median of an array
+  # @return [Numeric]
+  def median
+    sorted = self.sort
+    len = sorted.length
+    (sorted[(len - 1) / 2] + sorted[len / 2]) / 2.0
+  end
+
+  # Get the mean of an array
+  # @return [Numeric]
+  def mean
+    self.compact.inject{ |sum, el| sum + el }.to_f / self.compact.size
+  end
+
+  # Get the variance of an array
+  # @return [Numeric]
+  def sample_variance
+    m = self.mean
+    sum = self.compact.inject(0){|accum, i| accum +(i-m)**2 }
+    sum/(self.compact.length - 1).to_f
+  end
+
+  # Get the standard deviation of an array
+  # @return [Numeric]
+  def standard_deviation
+    Math.sqrt(self.sample_variance)
+  end
+
+  # Calculate dot product
+  # @param [Array]
+  # @return [Numeric]
+  def dot_product(a)
+    products = self.zip(a).map{|a, b| a * b}
+    products.inject(0) {|s,p| s + p}
+  end
+
+  # Calculate magnitude
+  # @return [Numeric]
+  def magnitude
+    squares = self.map{|x| x ** 2}
+    Math.sqrt(squares.inject(0) {|s, c| s + c})
+  end
+
+  # Convert array values for R
+  # @return [Array]
+  def for_R
+    if self.first.is_a?(String)
+      #"\"#{self.collect{|v| v.sub('[','').sub(']','')}.join(" ")}\"" # quote and remove square brackets
+      "NA"
+    else
+      self.median
+    end
+  end
+
+  # Collect array with index
+  # in analogy to each_with_index
+  def collect_with_index
+    result = []
+    self.each_with_index do |elt, idx|
+      result << yield(elt, idx)
+    end
+    result
+  end
+end
+

diff --git a/lib/classification.rb b/lib/classification.rb
index 638492b..5796ce2 100644
--- a/lib/classification.rb
+++ b/lib/classification.rb
@@ -1,29 +1,22 @@
-module OpenTox
-  module Algorithm
+module Lazar
 
-    # Classification algorithms
-    class Classification
-
-      # Weighted majority vote
-      # @param [Array] dependent_variables
-      # @param [Array] weights
-      # @return [Hash]
-      def self.weighted_majority_vote dependent_variables:, independent_variables:nil, weights:, query_variables:nil
-        class_weights = {}
-        dependent_variables.each_with_index do |v,i|
-          class_weights[v] ||= []
-          class_weights[v] << weights[i] unless v.nil?
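# A worked example of the weighted majority vote (illustrative only, not
# part of this commit): class probabilities are the per-class weight sums,
# normalised by the total weight and scaled by the maximum weight. This
# holds for both the old and the refactored implementation. Assume three
# neighbors with activities [0, 1, 1]:
dependent_variables = [0, 1, 1]
weights = [0.5, 0.4, 0.3]
probabilities = [0, 1].collect do |c|
  w = weights.each_index.select { |i| dependent_variables[i] == c }.sum { |i| weights[i] }
  weights.max * w / weights.sum
end
# probabilities => [0.208, 0.292] (rounded), i.e. class 1 wins the vote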
- end - probabilities = {} - class_weights.each do |a,w| - probabilities[a] = w.sum/weights.sum - end - probabilities = probabilities.collect{|a,p| [a,weights.max*p]}.to_h - p_max = probabilities.collect{|a,p| p}.max - prediction = probabilities.key(p_max) - {:value => prediction,:probabilities => probabilities} - end - + # Classification algorithms + class Classification + + # Weighted majority vote + # @param [Array<0,1>] dependent_variables + # @param [Array] weights + # @return [Hash] + def self.weighted_majority_vote dependent_variables:, weights: + w = [] + w[0] = weights.each_index.select{|i| dependent_variables[i] == 0}.collect{|i| weights[i]} + w[1] = weights.each_index.select{|i| dependent_variables[i] == 1}.collect{|i| weights[i]} + weights_sum = weights.sum.to_f + weights_max = weights.max.to_f + probabilities = [] + probabilities[0] = weights_max*w[0].sum/weights_sum + probabilities[1] = weights_max*w[1].sum/weights_sum + probabilities end end diff --git a/lib/compound.rb b/lib/compound.rb index 6d0e075..615ea6e 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -1,296 +1,247 @@ -module OpenTox +require 'openbabel' - # Small molecules with defined chemical structures - class Compound < Substance - require_relative "unique_descriptors.rb" - DEFAULT_FINGERPRINT = "MP2D" +# Small molecules with defined chemical structures +class Compound + DEFAULT_FINGERPRINT = "MP2D" - field :inchi, type: String - field :smiles, type: String - field :inchikey, type: String - field :names, type: Array - field :cid, type: String - field :png_id, type: BSON::ObjectId - field :svg_id, type: BSON::ObjectId - field :sdf_id, type: BSON::ObjectId - field :fingerprints, type: Hash, default: {} - field :default_fingerprint_size, type: Integer - - index({smiles: 1}, {unique: true}) - - # Overwrites standard Mongoid method to create fingerprints before database insertion - def self.find_or_create_by params - compound = self.find_or_initialize_by params - compound.default_fingerprint_size = compound.fingerprint(DEFAULT_FINGERPRINT).size - compound.save - compound - end + def initialize smiles + @smiles = smiles + @fingerprints = {} + end - # Create chemical fingerprint - # @param [String] fingerprint type - # @return [Array] - def fingerprint type=DEFAULT_FINGERPRINT - unless fingerprints[type] - return [] unless self.smiles - if type == "MP2D" # http://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format - fp = obconversion(smiles,"smi","mpd").strip.split("\t") - name = fp.shift # remove Title - fingerprints[type] = fp.uniq # no fingerprint counts - elsif type== "MNA" # http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html - level = 2 # TODO: level as parameter, evaluate level 1, see paper - fp = obconversion(smiles,"smi","mna","xL\"#{level}\"").split("\n") - fp.shift # remove Title - fingerprints[type] = fp - else # standard fingerprints - fp = OpenBabel::OBFingerprint.find_fingerprint(type) - obmol = OpenBabel::OBMol.new - obconversion = OpenBabel::OBConversion.new - obconversion.set_in_format "smi" - obconversion.read_string obmol, self.smiles - result = OpenBabel::VectorUnsignedInt.new - fp.get_fingerprint(obmol,result) - # TODO: %ignore *::DescribeBits @ line 163 openbabel/scripts/openbabel-ruby.i - #p OpenBabel::OBFingerprint.describe_bits(result) - # convert result to a list of the bits that are set - # from openbabel/scripts/python/pybel.py line 830 - # see also http://openbabel.org/docs/dev/UseTheLibrary/Python_Pybel.html#fingerprints - 
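# Worked example of the bit unpacking below (illustrative only, not part of
# this commit): with 32-bit words, an OpenBabel fingerprint vector [3, 0, 1]
# unpacks to the set-bit positions [1, 2, 65] -- 3 = 0b11 sets bits 1 and 2
# of the first word, and the 1 in the third word sets bit 2*32 + 1 = 65.
words = [3, 0, 1] # e.g. result.to_a, assuming OBFingerprint.getbitsperint == 32
bits_set = []
words.each_with_index do |x, w|
  i = w * 32 + 1
  while x > 0
    bits_set << i if x.odd?
    x >>= 1
    i += 1
  end
end
# bits_set == [1, 2, 65]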
result = result.to_a - bitsperint = OpenBabel::OBFingerprint.getbitsperint() - bits_set = [] - start = 1 - result.each do |x| - i = start - while x > 0 do - bits_set << i if (x % 2) == 1 - x >>= 1 - i += 1 - end - start += bitsperint + # Create chemical fingerprint + # @param [String] fingerprint type + # @return [Array] + def fingerprint type=DEFAULT_FINGERPRINT + unless @fingerprints[type] + if type == "MP2D" # http://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format + fp = obconversion(@smiles,"smi","mpd").strip.split("\t") + fp.shift # remove Title + @fingerprints[type] = fp.uniq # no fingerprint counts + elsif type== "MNA" # http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html + level = 2 # TODO: level as parameter, evaluate level 1, see paper + fp = obconversion(@smiles,"smi","mna","xL\"#{level}\"").split("\n") + fp.shift # remove Title + @fingerprints[type] = fp + else # standard fingerprints + fp = OpenBabel::OBFingerprint.find_fingerprint(type) + obmol = OpenBabel::OBMol.new + obconversion = OpenBabel::OBConversion.new + obconversion.set_in_format "smi" + obconversion.read_string obmol, @smiles + result = OpenBabel::VectorUnsignedInt.new + fp.get_fingerprint(obmol,result) + # TODO: %ignore *::DescribeBits @ line 163 openbabel/scripts/openbabel-ruby.i + #p OpenBabel::OBFingerprint.describe_bits(result) + # convert result to a list of the bits that are set + # from openbabel/scripts/python/pybel.py line 830 + # see also http://openbabel.org/docs/dev/UseTheLibrary/Python_Pybel.html#fingerprints + result = result.to_a + bitsperint = OpenBabel::OBFingerprint.getbitsperint() + bits_set = [] + start = 1 + result.each do |x| + i = start + while x > 0 do + bits_set << i if (x % 2) == 1 + x >>= 1 + i += 1 end - fingerprints[type] = bits_set + start += bitsperint end - save + @fingerprints[type] = bits_set end - fingerprints[type] end + @fingerprints[type] + end - # Calculate physchem properties - # @param [Array] list of descriptors - # @return [Array] - def calculate_properties descriptors=PhysChem::OPENBABEL - calculated_ids = properties.keys - # BSON::ObjectId instances are not allowed as keys in a BSON document. - new_ids = descriptors.collect{|d| d.id.to_s} - calculated_ids - descs = {} - algos = {} - new_ids.each do |id| - descriptor = PhysChem.find id - descs[[descriptor.library, descriptor.descriptor]] = descriptor - algos[descriptor.name] = descriptor - end - # avoid recalculating Cdk features with multiple values - descs.keys.uniq.each do |k| - descs[k].send(k[0].downcase,k[1],self).each do |n,v| - properties[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document. - end +=begin + # Calculate physchem properties + # @param [Array] list of descriptors + # @return [Array] + def calculate_properties descriptors=PhysChem::OPENBABEL + calculated_ids = properties.keys + # BSON::ObjectId instances are not allowed as keys in a BSON document. + new_ids = descriptors.collect{|d| d.id.to_s} - calculated_ids + descs = {} + algos = {} + new_ids.each do |id| + descriptor = PhysChem.find id + descs[[descriptor.library, descriptor.descriptor]] = descriptor + algos[descriptor.name] = descriptor + end + # avoid recalculating Cdk features with multiple values + descs.keys.uniq.each do |k| + descs[k].send(k[0].downcase,k[1],self).each do |n,v| + properties[algos[n].id.to_s] = v # BSON::ObjectId instances are not allowed as keys in a BSON document. 
end - save - descriptors.collect{|d| properties[d.id.to_s]} end - - # Match a SMARTS substructure - # @param [String] smarts - # @param [TrueClass,FalseClass] count matches or return true/false - # @return [TrueClass,FalseClass,Fixnum] - def smarts_match smarts, count=false - obconversion = OpenBabel::OBConversion.new - obmol = OpenBabel::OBMol.new - obconversion.set_in_format('smi') - obconversion.read_string(obmol,self.smiles) - smarts_pattern = OpenBabel::OBSmartsPattern.new - smarts.collect do |sma| - smarts_pattern.init(sma.smarts) - if smarts_pattern.match(obmol) - count ? value = smarts_pattern.get_map_list.to_a.size : value = 1 - else - value = 0 - end - value + save + descriptors.collect{|d| properties[d.id.to_s]} + end +=end + + # Match a SMARTS substructure + # @param [String] smarts + # @param [TrueClass,FalseClass] count matches or return true/false + # @return [TrueClass,FalseClass,Fixnum] + def smarts_match smarts, count=false + obconversion = OpenBabel::OBConversion.new + obmol = OpenBabel::OBMol.new + obconversion.set_in_format('smi') + obconversion.read_string(obmol,@smiles) + smarts_pattern = OpenBabel::OBSmartsPattern.new + smarts.collect do |sma| + smarts_pattern.init(sma.smarts) + if smarts_pattern.match(obmol) + count ? value = smarts_pattern.get_map_list.to_a.size : value = 1 + else + value = 0 end + value end + end - # Create a compound from smiles string - # @example - # compound = OpenTox::Compound.from_smiles("c1ccccc1") - # @param [String] smiles - # @return [OpenTox::Compound] - def self.from_smiles smiles - return nil if smiles.match(/\s/) # spaces seem to confuse obconversion and may lead to invalid smiles - smiles = obconversion(smiles,"smi","can") # test if SMILES is correct and return canonical smiles (for compound comparisons) - smiles.empty? ? nil : Compound.find_or_create_by(:smiles => smiles) - end - - # Create a compound from InChI string - # @param [String] InChI - # @return [OpenTox::Compound] - def self.from_inchi inchi - smiles = obconversion(inchi,"inchi","can") - smiles.empty? ? nil : Compound.find_or_create_by(:smiles => smiles) - end - - # Create a compound from SDF - # @param [String] SDF - # @return [OpenTox::Compound] - def self.from_sdf sdf - # do not store sdf because it might be 2D - Compound.from_smiles obconversion(sdf,"sdf","can") - end - - # Create a compound from name. Relies on an external service for name lookups. - # @example - # compound = OpenTox::Compound.from_name("Benzene") - # @param [String] name, can be also an InChI/InChiKey, CAS number, etc - # @return [OpenTox::Compound] - def self.from_name name - Compound.from_smiles RestClientWrapper.get(File.join(PUBCHEM_URI,"compound","name",URI.escape(name),"property","CanonicalSMILES","TXT")).chomp - end + # Create a compound from smiles string + # @example + # compound = Lazar::Compound.from_smiles("c1ccccc1") + # @param [String] smiles + # @return [Lazar::Compound] + def self.from_smiles smiles + return nil if smiles.match(/\s/) # spaces seem to confuse obconversion and may lead to invalid smiles + @smiles = obconversion(smiles,"smi","can") # test if SMILES is correct and return canonical smiles (for compound comparisons) + @smiles.empty? ? nil : @smiles + end - # Get InChI - # @return [String] - def inchi - unless self["inchi"] - result = obconversion(smiles,"smi","inchi") - update(:inchi => result.chomp) if result and !result.empty? 
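# Usage sketch for the refactored, stateless accessors in this hunk
# (illustrative only; the return values are the expected OpenBabel output
# for ethanol, not taken from the source):
#   compound = Compound.new("CCO")
#   compound.inchi    # => "InChI=1S/C2H6O/c1-2-3/h3H,1-2H3"
#   compound.inchikey # => "LFQSCWFLJHTTHZ-UHFFFAOYSA-N"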
- end - self["inchi"] - end + # Create a compound from InChI string + # @param [String] InChI + # @return [OpenTox::Compound] + def self.from_inchi inchi + @smiles = obconversion(inchi,"inchi","can") + @smiles.empty? ? nil : @smiles + end - # Get InChIKey - # @return [String] - def inchikey - update(:inchikey => obconversion(smiles,"smi","inchikey")) unless self["inchikey"] - self["inchikey"] - end + # Create a compound from SDF + # @param [String] SDF + # @return [OpenTox::Compound] + def self.from_sdf sdf + # do not store sdf because it might be 2D + Compound.from_smiles obconversion(sdf,"sdf","can") + end - # Get (canonical) smiles - # @return [String] - def smiles - update(:smiles => obconversion(self["smiles"],"smi","can")) unless self["smiles"] - self["smiles"] - end + # Create a compound from name. Relies on an external service for name lookups. + # @example + # compound = OpenTox::Compound.from_name("Benzene") + # @param [String] name, can be also an InChI/InChiKey, CAS number, etc + # @return [OpenTox::Compound] + def self.from_name name + Compound.from_smiles RestClientWrapper.get(File.join(PUBCHEM_URI,"compound","name",URI.escape(name),"property","CanonicalSMILES","TXT")).chomp + end - # Get SDF - # @return [String] - def sdf - if self.sdf_id.nil? - sdf = obconversion(smiles,"smi","sdf") - file = Mongo::Grid::File.new(sdf, :filename => "#{id}.sdf",:content_type => "chemical/x-mdl-sdfile") - sdf_id = $gridfs.insert_one file - update :sdf_id => sdf_id - end - $gridfs.find_one(_id: self.sdf_id).data - end + # Get InChI + # @return [String] + def inchi + obconversion(@smiles,"smi","inchi") + end - # Get SVG image - # @return [image/svg] Image data - def svg - if self.svg_id.nil? - svg = obconversion(smiles,"smi","svg") - file = Mongo::Grid::File.new(svg, :filename => "#{id}.svg", :content_type => "image/svg") - update(:svg_id => $gridfs.insert_one(file)) - end - $gridfs.find_one(_id: self.svg_id).data - end + # Get InChIKey + # @return [String] + def inchikey + obconversion(@smiles,"smi","inchikey") + end - # Get png image - # @example - # image = compound.png - # @return [image/png] Image data - def png - if self.png_id.nil? - png = obconversion(smiles,"smi","_png2") - file = Mongo::Grid::File.new(Base64.encode64(png), :filename => "#{id}.png", :content_type => "image/png") - update(:png_id => $gridfs.insert_one(file)) - end - Base64.decode64($gridfs.find_one(_id: self.png_id).data) - end + # Get SDF + # @return [String] + def sdf + obconversion(smiles,"smi","sdf") + end - # Get all known compound names. Relies on an external service for name lookups. 
- # @example - # names = compound.names - # @return [Array] - def names - update(:names => RestClientWrapper.get(File.join(PUBCHEM_URI,"compound","smiles",URI.escape(smiles),"synonyms","TXT")).split("\n")) #unless self["names"] - self["names"] - end + # Get SVG image + # @return [image/svg] Image data + def svg + obconversion(smiles,"smi","svg") + end - # Get PubChem Compound Identifier (CID), obtained via REST call to PubChem - # @return [String] - def cid - update(:cid => RestClientWrapper.post(File.join(PUBCHEM_URI, "compound", "inchi", "cids", "TXT"),{:inchi => inchi}).strip) unless self["cid"] - self["cid"] - end - - # Convert mmol to mg - # @return [Float] value in mg - def mmol_to_mg mmol - mmol.to_f*molecular_weight - end + # Get png image + # @example + # image = compound.png + # @return [image/png] Image data + def png + obconversion(smiles,"smi","_png2") + end - # Convert mg to mmol - # @return [Float] value in mmol - def mg_to_mmol mg - mg.to_f/molecular_weight - end - - # Calculate molecular weight of Compound with OB and store it in compound object - # @return [Float] molecular weight - def molecular_weight - mw_feature = PhysChem.find_or_create_by(:name => "Openbabel.MW") - calculate_properties([mw_feature]).first - end + # Get all known compound names. Relies on an external service for name lookups. + # @example + # names = compound.names + # @return [Array] + def names + RestClientWrapper.get(File.join(PUBCHEM_URI,"compound","smiles",URI.escape(smiles),"synonyms","TXT")).split("\n") + end - private + # Get PubChem Compound Identifier (CID), obtained via REST call to PubChem + # @return [String] + def cid + RestClientWrapper.post(File.join(PUBCHEM_URI, "compound", "inchi", "cids", "TXT"),{:inchi => inchi}).strip + end + + # Convert mmol to mg + # @return [Float] value in mg + def mmol_to_mg mmol + mmol.to_f*molecular_weight + end - def self.obconversion(identifier,input_format,output_format,option=nil) - obconversion = OpenBabel::OBConversion.new - obconversion.set_options(option, OpenBabel::OBConversion::OUTOPTIONS) if option - obmol = OpenBabel::OBMol.new - obconversion.set_in_and_out_formats input_format, output_format - return nil if identifier.nil? - obconversion.read_string obmol, identifier - case output_format - when /smi|can|inchi/ - obconversion.write_string(obmol).split(/\s/).first - when /sdf/ - # TODO: find disconnected structures - # strip_salts - # separate - obmol.add_hydrogens - builder = OpenBabel::OBBuilder.new - builder.build(obmol) + # Convert mg to mmol + # @return [Float] value in mmol + def mg_to_mmol mg + mg.to_f/molecular_weight + end + + # Calculate molecular weight of Compound with OB and store it in compound object + # @return [Float] molecular weight + def molecular_weight + mw_feature = PhysChem.find_or_create_by(:name => "Openbabel.MW") + calculate_properties([mw_feature]).first + end - sdf = obconversion.write_string(obmol) + def self.obconversion(identifier,input_format,output_format,option=nil) + obconversion = OpenBabel::OBConversion.new + obconversion.set_options(option, OpenBabel::OBConversion::OUTOPTIONS) if option + obmol = OpenBabel::OBMol.new + obconversion.set_in_and_out_formats input_format, output_format + return nil if identifier.nil? 
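# Call pattern for the obconversion helper (illustrative only, assuming
# working OpenBabel Ruby bindings; benzene used as an example input):
#   Compound.obconversion("c1ccccc1", "smi", "can")   # => "c1ccccc1" (canonical SMILES)
#   Compound.obconversion("c1ccccc1", "smi", "inchi") # => "InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"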
+ obconversion.read_string obmol, identifier + case output_format + when /smi|can|inchi/ + obconversion.write_string(obmol).split(/\s/).first + when /sdf/ + # TODO: find disconnected structures + # strip_salts + # separate + obmol.add_hydrogens + builder = OpenBabel::OBBuilder.new + builder.build(obmol) + + sdf = obconversion.write_string(obmol) print sdf + if sdf.match(/.nan/) + + #warn "3D generation failed for compound #{identifier}, trying to calculate 2D structure" + obconversion.set_options("gen2D", OpenBabel::OBConversion::GENOPTIONS) + sdf = obconversion.write_string(obmol) if sdf.match(/.nan/) - - #warn "3D generation failed for compound #{identifier}, trying to calculate 2D structure" - obconversion.set_options("gen2D", OpenBabel::OBConversion::GENOPTIONS) + #warn "2D generation failed for compound #{identifier}, rendering without coordinates." + obconversion.remove_option("gen2D", OpenBabel::OBConversion::GENOPTIONS) sdf = obconversion.write_string(obmol) - if sdf.match(/.nan/) - #warn "2D generation failed for compound #{identifier}, rendering without coordinates." - obconversion.remove_option("gen2D", OpenBabel::OBConversion::GENOPTIONS) - sdf = obconversion.write_string(obmol) - end end - sdf - else - obconversion.write_string(obmol) end + sdf + else + obconversion.write_string(obmol) end + end - def obconversion(identifier,input_format,output_format,option=nil) - self.class.obconversion(identifier,input_format,output_format,option) - end + def obconversion(identifier,input_format,output_format,option=nil) + self.class.obconversion(identifier,input_format,output_format,option) end + end diff --git a/lib/dataset.rb b/lib/dataset.rb index 1d6b56c..8cb343f 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -1,234 +1,76 @@ -require 'csv' -require 'tempfile' -require 'digest/md5' - -module OpenTox - - # Collection of substances and features - class Dataset - - field :data_entries, type: Array, default: [] #substance,feature,value - field :warnings, type: Array, default: [] - field :source, type: String - field :md5, type: String - - # Readers - - # Get all compounds - # @return [Array] - def compounds - substances.select{|s| s.is_a? Compound} - end - - # Get all nanoparticles - # @return [Array] - def nanoparticles - substances.select{|s| s.is_a? Nanoparticle} - end - - # Get all substances - # @return [Array] - def substances - @substances ||= data_entries.collect{|row| OpenTox::Substance.find row[0]}.uniq - @substances - end - - # Get all features - # @return [Array] - def features - @features ||= data_entries.collect{|row| OpenTox::Feature.find(row[1])}.uniq - @features - end - - # Get all values for a given substance and feature - # @param [OpenTox::Substance,BSON::ObjectId,String] substance or substance id - # @param [OpenTox::Feature,BSON::ObjectId,String] feature or feature id - # @return [Array] values - def values substance,feature - substance = substance.id if substance.is_a? Substance - feature = feature.id if feature.is_a? Feature - substance = BSON::ObjectId.from_string(substance) if substance.is_a? String - feature = BSON::ObjectId.from_string(feature) if feature.is_a? 
String - data_entries.select{|row| row[0] == substance and row[1] == feature}.collect{|row| row[2]} - end - - # Get OriginalId features - # @return [Array] original ID features (merged datasets may have multiple original IDs) - def original_id_features - features.select{|f| f.is_a?(OriginalId)} - end - - # Get OriginalSmiles features - # @return [Array] original smiles features (merged datasets may have multiple original smiles) - def original_smiles_features - features.select{|f| f.is_a?(OriginalSmiles)} - end - - # Get Warnings features - # @return [Array] warnings features (merged datasets may have multiple warnings) - def warnings_features - features.select{|f| f.is_a?(Warnings)} - end - - # Get Confidence feature - # @return [OpenTox::Confidence] confidence feature - def confidence_feature - features.select{|f| f.is_a?(Confidence)}.first - end - - # Get nominal and numeric bioactivity features - # @return [Array] - def bioactivity_features - features.select{|f| f._type.match(/BioActivity/)} - end - - # Get nominal and numeric bioactivity features - # @return [Array] - def transformed_bioactivity_features - features.select{|f| f._type.match(/Transformed.*BioActivity/)} - end - - # Get nominal and numeric substance property features - # @return [Array] - def substance_property_features - features.select{|f| f._type.match("SubstanceProperty")} - end - - # Get nominal and numeric prediction features - # @return [Array] - def prediction_feature - features.select{|f| f._type.match(/Prediction$/)}.first - end - - # Get supporting nominal and numeric prediction features (class probabilities, prediction interval) - # @return [Array] - def prediction_supporting_features - features.select{|f| f.is_a?(LazarPredictionProbability) or f.is_a?(LazarPredictionInterval)} - end - - # Get nominal and numeric merged features - # @return [Array] - def merged_features - features.select{|f| f._type.match("Merged")} - end - - # Writers - - # Add a value for a given substance and feature - # @param [OpenTox::Substance,BSON::ObjectId,String] substance or substance id - # @param [OpenTox::Feature,BSON::ObjectId,String] feature or feature id - # @param [TrueClass,FalseClass,Float] - def add(substance,feature,value) - substance = substance.id if substance.is_a? Substance - feature = feature.id if feature.is_a? Feature - data_entries << [substance,feature,value] if substance and feature and value - end - - # Parsers - - # Create a dataset from CSV file - # @param [File] Input file with the following format: - # - ID column (optional): header containing "ID" string, arbitrary ID values - # - SMILES/InChI column: header indicating "SMILES" or "InChI", Smiles or InChI strings - # - one or more properties column(s): header with property name(s), property values - # files with a single property column are read as BioActivities (i.e. dependent variable) - # files with multiple property columns are read as SubstanceProperties (i.e. independent variables) - # @return [OpenTox::Dataset] - def self.from_csv_file file - md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files - dataset = self.find_by(:md5 => md5) - if dataset - $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import." - else - $logger.debug "Parsing #{file}." 
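# The separator guessing below checks the header line for a tab or
# semicolon before falling back to ",". Illustrative examples (not from
# the source):
#   "ID\tSMILES\tLC50" -> sep = "\t"
#   "ID;SMILES;LC50"   -> sep = ";"
#   "ID,SMILES,LC50"   -> sep = ","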
- table = nil - sep = "," - ["\t",";"].each do |s| # guess alternative CSV separator - if File.readlines(file).first.match(/#{s}/) - sep = s - break - end - end - table = CSV.read file, :col_sep => sep, :skip_blanks => true, :encoding => 'windows-1251:utf-8' - if table - dataset = self.new(:source => file, :name => File.basename(file,".*"), :md5 => md5) - dataset.parse_table table - else - raise ArgumentError, "#{file} is not a valid CSV/TSV file. Could not find "," ";" or TAB as column separator." - end - end - dataset - end - - # Create a dataset from CSV with descriptor values file - # @param [File] Input file with the following format: - # - ID column: header containing arbitrary string, arbitrary ID values - # - properties columns: header with property names, property values (i.e. independent variables) - # - bioactivity column (last column): header with bioactivity name, bioactivity values (i.e. dependent variable) - # @param [String] Descriptor type - # @return [OpenTox::Dataset] - def self.from_descriptor_csv_file file, category - md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files - dataset = self.find_by(:md5 => md5) - #dataset = nil - if dataset - $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import." - else - $logger.debug "Parsing #{file}." - p "Parsing #{file}." - table = nil - sep = "," - ["\t",";"].each do |s| # guess alternative CSV separator - if File.readlines(file).first.match(/#{s}/) - sep = s - break - end - end - table = CSV.read file, :col_sep => sep, :skip_blanks => true, :encoding => 'windows-1251:utf-8' - raise ArgumentError, "#{file} is not a valid CSV/TSV file. Could not find ',' ';' or TAB as column separator." unless table - dataset = self.new(:source => file, :name => File.basename(file,".*"), :md5 => md5) - - # features - feature_names = table.shift.collect{|f| f.strip} - raise ArgumentError, "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size - - original_id = OriginalId.find_or_create_by(:dataset_id => dataset.id,:name => feature_names.shift) - - bioactivity_feature_name = feature_names.pop - values = table.collect{|row| val=row.last.to_s.strip; val.blank? ? nil : val }.uniq.compact - types = values.collect{|v| v.numeric? ? true : false}.uniq - if values.size > 5 and types.size == 1 and types.first == true # 5 max classes - bioactivity_feature = NumericBioActivity.find_or_create_by(:name => bioactivity_feature_name) - else - bioactivity_feature = NominalBioActivity.find_or_create_by(:name => bioactivity_feature_name, :accept_values => values.sort) - end - - # substances and values +require 'matrix' + +class Dataset + + def initialize file + @dir = File.dirname file + @dependent_variable_type = File.read(File.join(@dir,"dependent_variable_type")).chomp + if @dependent_variable_type == "binary" + @dependent_variable_values = {} + File.readlines(File.join(@dir,"dependent_variable_values")).each_with_index{|v,i| @dependent_variable_values[v.chomp] = i} + end + @independent_variable_type = File.read(File.join(@dir,"independent_variable_type")).chomp + @lines = File.readlines(file) + @header = @lines.shift.split(",") + @header.first.match(/ID/i) ? 
@has_id = true : @has_id = false + @dependent_variable_name = @header.pop + @ids = [] + @dependent_variables = [] + @independent_variables = [] + @independent_variable_names = [] + end - table.each_with_index do |vals,i| + def print_variables + File.open(File.join(@dir,"ids"),"w+") { |f| f.puts @ids.join("\n") } + File.open(File.join(@dir,"dependent_variable_name"),"w+") { |f| f.puts @dependent_variable_name } + File.open(File.join(@dir,"dependent_variables"),"w+") { |f| f.puts @dependent_variables.join("\n") } + File.open(File.join(@dir,"independent_variable_names"),"w+") { |f| f.puts @independent_variable_names.join(",") } + File.open(File.join(@dir,"independent_variables"),"w+") { |f| @independent_variables.each{|row| f.puts row.join(",")} } + end - original_id_value = vals.shift.to_s.strip - bioactivity_value = vals.pop.to_s.strip - substance = Substance.new - dataset.add substance, original_id, original_id_value + def scale_independent_variables file + @header.shift if @has_id + @independent_variable_names = @header + @lines.each_with_index do |line,i| + items = line.chomp.split(",") + @ids << items.shift + if @dependent_variable_type == "binary" + @dependent_variables << @dependent_variable_values[items.pop] + elsif @dependent_variable_type == "numeric" + @dependent_variables << items.pop.to_f + end + @independent_variables << items.collect{|i| i.to_f} + end + @independent_variables = Matrix[ *@independent_variables ] + columns = @independent_variables.column_vectors + @independent_variable_means = columns.collect{|c| c.to_a.mean} + @independent_variable_standard_deviations = columns.collect{|c| c.to_a.standard_deviation} + scaled_columns = [] + columns.each_with_index{|col,i| scaled_columns << col.collect{|v| v ? (v-@independent_variable_means[i])/@independent_variable_standard_deviations[i] : nil}} + @independent_variables = Matrix.columns(scaled_columns).to_a + print_variables + File.open(File.join(@dir,"means"),"w+") { |f| f.puts @independent_variable_means.join(",") } + File.open(File.join(@dir,"standard_deviations"),"w+") { |f| f.puts @independent_variable_standard_deviations.join(",") } + end - vals.each_with_index do |v,j| - if v.blank? - warnings << "Empty value for compound '#{identifier}' (#{original_id_value}) and feature '#{feature_names[j]}'." - next - else - property = NumericSubstanceProperty.find_or_create_by(:name => feature_names[j],:category => category) - substance.properties[property.id.to_s] = v.to_f - end - end - substance.save - dataset.add substance, bioactivity_feature, bioactivity_value - end - dataset.save + def fingerprint_independent_variables file, fingerprint_type="MP2D" + fingerprints = [] + @lines.each_with_index do |line,i| + items = line.chomp.split(",") + @has_id ? @ids << items.shift : @ids << i + if @dependent_variable_type == "binary" + @dependent_variables << @dependent_variable_values[items.pop] + elsif @dependent_variable_type == "numeric" + @dependent_variables << items.pop.to_f end - dataset + @independent_variables << [items[0]] + Compound.new(items[0]).fingerprint(fingerprint_type) end + @independent_variable_names = ["Canonical Smiles"] + fingerprints.flatten.sort.uniq + print_variables + end +end +=begin # Create a dataset from SDF file # files with a single data field are read as BioActivities (i.e. dependent variable) # files with multiple data fields are read as SubstanceProperties (i.e. independent variable) @@ -325,178 +167,6 @@ module OpenTox dataset end - # Parse data in tabular format (e.g. 
from csv) - # does a lot of guesswork in order to determine feature types - # @param [Array] - def parse_table table - - # features - feature_names = table.shift.collect{|f| f.strip} - raise ArgumentError, "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size - - if feature_names[0] !~ /SMILES|InChI/i # check ID column - original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => feature_names.shift) - else - original_id = OriginalId.find_or_create_by(:dataset_id => self.id,:name => "LineID") - end - - compound_format = feature_names.shift - raise ArgumentError, "#{compound_format} is not a supported compound format. Accepted formats: SMILES, InChI." unless compound_format =~ /SMILES|InChI/i - original_smiles = OriginalSmiles.find_or_create_by(:dataset_id => self.id) if compound_format.match(/SMILES/i) - - numeric = [] - features = [] - - # guess feature types - bioactivity = true if feature_names.size == 1 - - feature_names.each_with_index do |f,i| - original_id.name.match(/LineID$/) ? j = i+1 : j = i+2 - values = table.collect{|row| val=row[j].to_s.strip; val.blank? ? nil : val }.uniq.compact - types = values.collect{|v| v.numeric? ? true : false}.uniq - feature = nil - if values.size == 0 # empty feature - elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes - numeric[i] = true - bioactivity ? feature = NumericBioActivity.find_or_create_by(:name => f) : feature = NumericSubstanceProperty.find_or_create_by(:name => f) - else - numeric[i] = false - bioactivity ? feature = NominalBioActivity.find_or_create_by(:name => f, :accept_values => values.sort) : feature = NominalSubstanceProperty.find_or_create_by(:name => f, :accept_values => values.sort) - end - features << feature if feature - end - - # substances and values - - all_substances = [] - table.each_with_index do |vals,i| - original_id.name.match(/LineID$/) ? original_id_value = i+1 : original_id_value = vals.shift.to_s.strip - identifier = vals.shift.strip - begin - case compound_format - when /SMILES/i - substance = Compound.from_smiles(identifier) - add substance, original_smiles, identifier - when /InChI/i - substance = Compound.from_inchi(identifier) - end - rescue - substance = nil - end - - if substance.nil? # compound parsers may return nil - warnings << "Cannot parse #{compound_format} compound '#{identifier}' at line #{i+2} of #{source}, all entries are ignored." - next - end - - all_substances << substance - add substance, original_id, original_id_value - - vals.each_with_index do |v,j| - if v.blank? - warnings << "Empty value for compound '#{identifier}' (#{original_id_value}) and feature '#{feature_names[j]}'." - next - elsif numeric[j] - v = v.to_f - else - v = v.strip - end - add substance, features[j], v - end - end - - warnings_feature = Warnings.find_or_create_by(:dataset_id => id) - all_substances.duplicates.each do |substance| - positions = [] - all_substances.each_with_index{|c,i| positions << i+1 if !c.blank? and c.smiles and c.smiles == substance.smiles} - all_substances.select{|s| s.smiles == substance.smiles}.each do |s| - add s, warnings_feature, "Duplicated compound #{substance.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments." 
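# Illustrative warning produced by the duplicate check above, assuming the
# same canonical SMILES appears at rows 3 and 7 of the input table:
#   "Duplicated compound c1ccccc1 at rows 3, 7. Entries are accepted,
#   assuming that measurements come from independent experiments."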
- end - end - save - end - - # Serialisation - - # Convert dataset into csv formatted training data - # @return [String] - def to_training_csv - - export_features = merged_features - export_features = transformed_bioactivity_features if export_features.empty? - export_features = bioactivity_features if export_features.empty? - export_feature = export_features.first - - header = ["Canonical SMILES"] - header << bioactivity_features.first.name # use original bioactivity name instead of long merged name - csv = [header] - - substances.each do |substance| - nr_activities = values(substance,bioactivity_features.first).size - (0..nr_activities-1).each do |n| # new row for each value - row = [substance.smiles] - row << values(substance,export_feature)[n] - csv << row - end - end - csv.collect{|r| r.join(",")}.join("\n") - end - - # Convert lazar prediction dataset to csv format - # @return [String] - def to_prediction_csv - - compound = substances.first.is_a? Compound - header = ["ID"] - header << "Original SMILES" if compound - compound ? header << "Canonical SMILES" : header << "Name" - header << "Prediction" if prediction_feature - header << "Confidence" if confidence_feature - header += prediction_supporting_features.collect{|f| f.name} - header << "Measurements" - csv = [header] - - substances.each do |substance| - row = original_id_features.collect{|f| values(substance,f).join(" ")} - row += original_smiles_features.collect{|f| values(substance,f).join(" ")} if compound - compound ? row << substance.smiles : row << substance.name - row << values(substance,prediction_feature).join(" ") - row << values(substance,confidence_feature).join(" ") - row += prediction_supporting_features.collect{|f| values(substance,f).join(" ")} - row << values(substance,bioactivity_features[0]).join(" ") - csv << row - end - csv.collect{|r| r.join(",")}.join("\n") - end - - # Export fingerprints in csv format - # @return [String] - def to_fingerprint_csv type=Compound::DEFAULT_FINGERPRINT - - fingerprints = substances.collect{|s| s.fingerprints[type]}.flatten.sort.uniq - export_features = merged_features - export_features = transformed_bioactivity_features if export_features.empty? - export_features = bioactivity_features if export_features.empty? - export_feature = export_features.first - - header = ["Canonical SMILES"] - header += fingerprints - header << bioactivity_features.first.name # use original bioactivity name instead of long merged name - csv = [header] - - substances.each do |substance| - nr_activities = values(substance,bioactivity_features.first).size - (0..nr_activities-1).each do |n| # new row for each value - row = [substance.smiles] - fingerprints.each do |f| - substance.fingerprints[type].include?(f) ? row << 1 : row << 0 - end - row << values(substance,export_feature)[n] - csv << row - end - end - csv.collect{|r| r.join(",")}.join("\n") - end # Convert dataset to SDF format # @return [String] SDF string @@ -520,70 +190,6 @@ module OpenTox sdf end - # Get lazar predictions from a dataset - # @return [Hash] predictions - def predictions - predictions = {} - substances.each do |s| - predictions[s] ||= {} - predictions[s][:value] = values(s,prediction_feature).first - #predictions[s][:warnings] = [] - #warnings_features.each { |w| predictions[s][:warnings] += values(s,w) } - predictions[s][:confidence] = values(s,confidence_feature).first - if predictions[s][:value] and prediction_feature.is_a? 
NominalLazarPrediction - prediction_feature.accept_values.each do |v| - f = LazarPredictionProbability.find_by(:name => v, :model_id => prediction_feature.model_id, :training_feature_id => prediction_feature.training_feature_id) - predictions[s][:probabilities] ||= {} - predictions[s][:probabilities][v] = values(s,f).first - end - end - end - predictions - end - - # Dataset operations - - # Copy a dataset - # @return OpenTox::Dataset dataset copy - def copy - dataset = Dataset.new - dataset.data_entries = data_entries - dataset.warnings = warnings - dataset.name = name - dataset.source = id.to_s - dataset.save - dataset - end - - # Split a dataset into n folds - # @param [Integer] number of folds - # @return [Array] Array with folds [training_dataset,test_dataset] - def folds n - $logger.debug "Creating #{n} folds for #{name}." - len = self.substances.size - indices = (0..len-1).to_a.shuffle - mid = (len/n) - chunks = [] - start = 0 - 1.upto(n) do |i| - last = start+mid - last = last-1 unless len%n >= i - test_idxs = indices[start..last] || [] - test_substances = test_idxs.collect{|i| substances[i].id} - training_idxs = indices-test_idxs - training_substances = training_idxs.collect{|i| substances[i].id} - chunk = [training_substances,test_substances].collect do |substances| - self.class.create( - :name => "#{self.name} (Fold #{i-1})", - :source => self.id, - :data_entries => data_entries.select{|row| substances.include? row[0]} - ) - end - start = last+1 - chunks << chunk - end - chunks - end # Merge an array of datasets # @param [Array] datasets Datasets to be merged @@ -634,3 +240,4 @@ module OpenTox end end +=end diff --git a/lib/lazar.rb b/lib/lazar.rb index e77de9d..2cc1321 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -1,52 +1,5 @@ -require 'rubygems' -require "bundler/setup" -require "rest-client" -require 'addressable' -require 'yaml' -require 'json' -require 'logger' -require 'mongoid' -require 'rserve' -require "nokogiri" -require "base64" -require 'openbabel' - -# Environment setup -ENV["LAZAR_ENV"] ||= "production" -raise "Incorrect lazar environment variable LAZAR_ENV '#{ENV["LAZAR_ENV"]}', please set it to 'production' or 'development'." unless ENV["LAZAR_ENV"].match(/production|development/) - -ENV["MONGOID_ENV"] = ENV["LAZAR_ENV"] -ENV["RACK_ENV"] = ENV["LAZAR_ENV"] # should set sinatra environment -# CH: this interferes with /etc/hosts on my machine -# search for a central mongo database in use -# http://opentox.github.io/installation/2017/03/07/use-central-mongodb-in-docker-environment -# CENTRAL_MONGO_IP = `grep -oP '^\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}(?=.*mongodb)' /etc/hosts`.chomp -Mongoid.load_configuration({ - :clients => { - :default => { - :database => ENV["LAZAR_ENV"], - #:hosts => (CENTRAL_MONGO_IP.blank? ? ["localhost:27017"] : ["#{CENTRAL_MONGO_IP}:27017"]), - :hosts => ["localhost:27017"] - } - } -}) -Mongoid.raise_not_found_error = false # return nil if no document is found -#$mongo = Mongo::Client.new("mongodb://#{(CENTRAL_MONGO_IP.blank? ? 
"127.0.0.1" : CENTRAL_MONGO_IP)}:27017/#{ENV['LAZAR_ENV']}") -$mongo = Mongo::Client.new("mongodb://127.0.0.1:27017/#{ENV['LAZAR_ENV']}") -$gridfs = $mongo.database.fs - -# Logger setup -STDOUT.sync = true # for redirection, etc see http://stackoverflow.com/questions/8549443/why-doesnt-logger-output-to-stdout-get-redirected-to-files -$logger = Logger.new STDOUT # STDERR did not work on my development machine (CH) -case ENV["LAZAR_ENV"] -when "production" - $logger.level = Logger::WARN - Mongo::Logger.level = Logger::WARN -when "development" - $logger.level = Logger::DEBUG - Mongo::Logger.level = Logger::WARN -end - +require 'fileutils' +=begin # R setup rlib = File.expand_path(File.join(File.dirname(__FILE__),"..","R")) # should work on POSIX including os x @@ -72,33 +25,32 @@ suppressPackageStartupMessages({ " PUBCHEM_URI = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/" -CHEMBL_URI = "https://www.ebi.ac.uk/chembl/api/data/molecule/" - -# OpenTox classes and includes -CLASSES = ["Feature","Substance","Dataset","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation"]# Algorithm and Models are modules +=end [ # be aware of the require sequence as it affects class/method overwrites - "overwrite.rb", - "rest-client-wrapper.rb", - "opentox.rb", - "feature.rb", - "physchem.rb", - "substance.rb", + "array.rb", +# "overwrite.rb", +# "rest-client-wrapper.rb", +# "opentox.rb", +# "feature.rb", +# "physchem.rb", +# "substance.rb", "compound.rb", - "nanoparticle.rb", +# "nanoparticle.rb", "dataset.rb", - "algorithm.rb", +# "algorithm.rb", "similarity.rb", - "feature_selection.rb", +# "feature_selection.rb", "model.rb", - "classification.rb", - "regression.rb", - "caret.rb", - "validation-statistics.rb", - "validation.rb", - "train-test-validation.rb", - "leave-one-out-validation.rb", - "crossvalidation.rb", - "download.rb", - "import.rb", + "statistics.rb", +# "classification.rb", +# "regression.rb", +# "caret.rb", +# "validation-statistics.rb", +# "validation.rb", +# "train-test-validation.rb", +# "leave-one-out-validation.rb", +# "crossvalidation.rb", +# "download.rb", +# "import.rb", ].each{ |f| require_relative f } diff --git a/lib/model.rb b/lib/model.rb index 07759c5..44e0e50 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -1,570 +1,129 @@ -module OpenTox - - module Model - - class Lazar - - include OpenTox - include Mongoid::Document - include Mongoid::Timestamps - store_in collection: "models" - - attr_writer :independent_variables # store in GridFS to avoid Mongo database size limit problems - - field :name, type: String - field :creator, type: String, default: __FILE__ - field :algorithms, type: Hash, default:{} - field :training_dataset_id, type: BSON::ObjectId - field :substance_ids, type: Array, default:[] - field :prediction_feature_id, type: BSON::ObjectId - field :dependent_variables, type: Array, default:[] - field :descriptor_ids, type:Array, default:[] - field :independent_variables_id, type: BSON::ObjectId - field :fingerprints, type: Array, default:[] - field :descriptor_weights, type: Array, default:[] - field :descriptor_means, type: Array, default:[] - field :descriptor_sds, type: Array, default:[] - field :scaled_variables, type: Array, default:[] - field :version, type: Hash, default:{} - - # Create a lazar model - # @param [OpenTox::Dataset] training_dataset - # @param [OpenTox::Feature, nil] prediction_feature - # By default the first feature of the training dataset will be predicted, specify a prediction_feature if you want to predict another feature - # @param 
[Hash, nil] algorithms - # Default algorithms will be used, if no algorithms parameter is provided. The algorithms hash has the following keys: :descriptors (specifies the descriptors to be used for similarity calculations and local QSAR models), :similarity (similarity algorithm and thresholds for predictions with high and low confidence), :feature_selection (feature selection algorithm), :prediction (local QSAR algorithm). Default parameters are used for unspecified keys. - # - # @return [OpenTox::Model::Lazar] - def self.create prediction_feature:nil, training_dataset:, algorithms:{} - raise ArgumentError, "Please provide a training_dataset and a optional prediction_feature." unless prediction_feature or training_dataset - prediction_feature ||= training_dataset.features.select{|f| f.is_a? NumericBioActivity or f.is_a? NominalBioActivity}.first unless prediction_feature - - # guess model type - prediction_feature.is_a?(NumericBioActivity) ? model = LazarRegression.new : model = LazarClassification.new - - model.prediction_feature_id = prediction_feature.id - model.training_dataset_id = training_dataset.id - model.name = training_dataset.name - - # git or gem versioning - dir = File.dirname(__FILE__) - path = File.expand_path("../", File.expand_path(dir)) - if Dir.exists?(dir+"/.git") - commit = `git rev-parse HEAD`.chomp - branch = `git rev-parse --abbrev-ref HEAD`.chomp - url = `git config --get remote.origin.url`.chomp - model.version = {:url => url, :branch => branch, :commit => commit} - else - version = File.open(path+"/VERSION", &:gets).chomp - url = "https://rubygems.org/gems/lazar/versions/"+version - model.version = {:url => url, :branch => "gem", :commit => version} - end - - # set defaults# - substance_classes = training_dataset.substances.collect{|s| s.class.to_s}.uniq - raise ArgumentError, "Cannot create models for mixed substance classes '#{substance_classes.join ', '}'." 
unless substance_classes.size == 1 - - if substance_classes.first == "OpenTox::Compound" - - model.algorithms = { - :descriptors => { - :method => "fingerprint", - :type => "MP2D", - }, - :feature_selection => nil - } - - if model.class == LazarClassification - model.algorithms[:prediction] = { - :method => "Algorithm::Classification.weighted_majority_vote", - } - model.algorithms[:similarity] = { - :method => "Algorithm::Similarity.tanimoto", - :min => [0.5,0.2], - } - elsif model.class == LazarRegression - model.algorithms[:prediction] = { - :method => "Algorithm::Caret.rf", - } - model.algorithms[:similarity] = { - :method => "Algorithm::Similarity.tanimoto", - :min => [0.5,0.2], - } - end - - elsif substance_classes.first == "OpenTox::Nanoparticle" - model.algorithms = { - :descriptors => { - :method => "properties", - :categories => ["P-CHEM"], - }, - :similarity => { - :method => "Algorithm::Similarity.weighted_cosine", - :min => [0.5,0.2], - }, - :prediction => { - :method => "Algorithm::Caret.rf", - }, - :feature_selection => { - :method => "Algorithm::FeatureSelection.correlation_filter", - }, - } - elsif substance_classes.first == "OpenTox::Substance" and algorithms[:descriptors][:method] == "properties" and algorithms[:descriptors][:categories] - model.algorithms = { - :feature_selection => nil, - :similarity => { # similarity algorithm - :method => "Algorithm::Similarity.weighted_cosine", - :min => [0.5,0.2] - }, - } - if model.class == LazarClassification - model.algorithms[:prediction] = { - :method => "Algorithm::Classification.weighted_majority_vote", - } - elsif model.class == LazarRegression - model.algorithms[:prediction] = { - :method => "Algorithm::Caret.rf", - } - end - else - raise ArgumentError, "Cannot create models for #{substance_classes.first} #{algorithms.to_json}." - end - - # overwrite defaults with explicit parameters - algorithms.each do |type,parameters| - if parameters and parameters.is_a? Hash - parameters.each do |p,v| - model.algorithms[type] ||= {} - model.algorithms[type][p] = v - model.algorithms[:descriptors].delete :categories if type == :descriptors and p == :type - end - else - model.algorithms[type] = parameters - end - end if algorithms - - # parse dependent_variables from training dataset - training_dataset.substances.each do |substance| - values = training_dataset.values(substance,model.prediction_feature_id) - values.each do |v| - model.substance_ids << substance.id.to_s - model.dependent_variables << v - end if values - end - - descriptor_method = model.algorithms[:descriptors][:method] - model.independent_variables = [] - case descriptor_method - # parse fingerprints - when "fingerprint" - type = model.algorithms[:descriptors][:type] - model.substances.each_with_index do |s,i| - model.fingerprints[i] ||= [] - model.fingerprints[i] += s.fingerprint(type) - model.fingerprints[i].uniq! - end - model.descriptor_ids = model.fingerprints.flatten.uniq - model.descriptor_ids.each do |d| - model.independent_variables << model.substance_ids.collect_with_index{|s,i| model.fingerprints[i].include? 
d} if model.algorithms[:prediction][:method].match /Caret/ - end - # calculate physchem properties - when "calculate_properties" - features = model.algorithms[:descriptors][:features] - model.descriptor_ids = features.collect{|f| f.id.to_s} - model.algorithms[:descriptors].delete(:features) - model.algorithms[:descriptors].delete(:type) - model.substances.each_with_index do |s,i| - props = s.calculate_properties(features) - props.each_with_index do |v,j| - model.independent_variables[j] ||= [] - model.independent_variables[j][i] = v - end if props and !props.empty? - end - # parse independent_variables - when "properties" - feature_ids = [] - model.algorithms[:descriptors][:categories].each do |category| - p category - Feature.where(category:category).each{|f| feature_ids << f.id.to_s} - end - p feature_ids - property_ids = model.substances.collect { |s| s.properties.keys }.flatten.uniq - p property_ids - model.descriptor_ids = feature_ids & property_ids - p model.descriptor_ids - exit - model.independent_variables = model.descriptor_ids.collect{|i| properties.collect{|p| p[i] ? p[i].median : nil}} - else - raise ArgumentError, "Descriptor method '#{descriptor_method}' not implemented." - end - - if model.algorithms[:feature_selection] and model.algorithms[:feature_selection][:method] - model = Algorithm.run model.algorithms[:feature_selection][:method], model - end - - # scale independent_variables - unless model.fingerprints? - model.independent_variables.each_with_index do |var,i| - model.descriptor_means[i] = var.mean - model.descriptor_sds[i] = var.standard_deviation - model.scaled_variables << var.collect{|v| v ? (v-model.descriptor_means[i])/model.descriptor_sds[i] : nil} - end - end - model.save - model - end - - # Predict a substance (compound or nanoparticle) - # @param [OpenTox::Substance] - # @return [Hash] - def predict_substance substance, threshold = self.algorithms[:similarity][:min].first, prediction = nil - - @independent_variables = Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data - case algorithms[:similarity][:method] - when /tanimoto/ # binary features - similarity_descriptors = substance.fingerprint algorithms[:descriptors][:type] - # TODO this excludes descriptors only present in the query substance - # use for applicability domain? - query_descriptors = descriptor_ids.collect{|id| similarity_descriptors.include? id} - when /euclid|cosine/ # quantitative features - if algorithms[:descriptors][:method] == "calculate_properties" # calculate descriptors - features = descriptor_ids.collect{|id| Feature.find(id)} - query_descriptors = substance.calculate_properties(features) - similarity_descriptors = query_descriptors.collect_with_index{|v,i| (v-descriptor_means[i])/descriptor_sds[i]} - else - similarity_descriptors = [] - query_descriptors = [] - descriptor_ids.each_with_index do |id,i| - prop = substance.properties[id] - prop = prop.median if prop.is_a? Array # measured - if prop - similarity_descriptors[i] = (prop-descriptor_means[i])/descriptor_sds[i] - query_descriptors[i] = prop - end - end - end - else - raise ArgumentError, "Unknown descriptor type '#{descriptors}' for similarity method '#{similarity[:method]}'." - end - - prediction ||= {:warnings => [], :measurements => []} - prediction[:warnings] << "Similarity threshold #{threshold} < #{algorithms[:similarity][:min].first}, prediction may be out of applicability domain." 
if threshold < algorithms[:similarity][:min].first - neighbor_ids = [] - neighbor_similarities = [] - neighbor_dependent_variables = [] - neighbor_independent_variables = [] - - # find neighbors - substance_ids.each_with_index do |s,i| - # handle query substance - if substance.id.to_s == s - prediction[:measurements] << dependent_variables[i] unless threshold < algorithms[:similarity][:min].first # add measurements only once at first pass - prediction[:info] = "Substance '#{substance.name}, id:#{substance.id}' has been excluded from neighbors, because it is identical with the query substance." - else - if fingerprints? - neighbor_descriptors = fingerprints[i] - else - next if substance.is_a? Nanoparticle and substance.core != Nanoparticle.find(s).core # necessary for nanoparticle properties predictions - neighbor_descriptors = scaled_variables.collect{|v| v[i]} - end - sim = Algorithm.run algorithms[:similarity][:method], [similarity_descriptors, neighbor_descriptors, descriptor_weights] - if sim >= threshold - neighbor_ids << s - neighbor_similarities << sim - neighbor_dependent_variables << dependent_variables[i] - independent_variables.each_with_index do |c,j| - neighbor_independent_variables[j] ||= [] - neighbor_independent_variables[j] << @independent_variables[j][i] - end - end - end - end +class Model + + def initialize dir + @dir = dir + @dependent_variables = File.readlines(File.join(@dir,"dependent_variables")).collect{|v| v.chomp} + @dependent_variable_type = File.read(File.join(@dir, "dependent_variable_type")).chomp + if @dependent_variable_type == "binary" + abort "Incorrect dependent variable values '#{@dependent_variables.uniq.sort.join(",")}' for #{@dependent_variable_type} values" unless @dependent_variables.uniq.sort == ["0","1"] + @dependent_variables = @dependent_variables.collect{|v| v.to_i} + elsif @dependent_variable_type == "numeric" + # TODO check for floats + @dependent_variables = @dependent_variables.collect{|v| v.to_f} + end + @independent_variable_type = File.read(File.join(@dir, "independent_variable_type")).chomp + @independent_variables = [] + @smiles = [] + File.readlines(File.join(@dir,"independent_variables")).each do |line| + items = line.chomp.split(",") + @smiles << items.shift + items.collect!{|v| v.to_f} if @independent_variable_type == "numeric" + @independent_variables << items + end + @similarity_thresholds = File.readlines(File.join(@dir,"similarity_thresholds")).collect{|v| v.chomp.to_f} + end - measurements = nil - - if neighbor_similarities.empty? - prediction[:value] = nil - prediction[:warnings] << "Could not find similar substances for threshold #{threshold} with experimental data in the training dataset." - if threshold == algorithms[:similarity][:min].last - prediction[:confidence] = "Out of applicability domain: Could not find similar substances with experimental data in the training dataset (Threshold: #{algorithms[:similarity][:min].last})." 
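# Sketch of the two-stage similarity threshold fallback implemented here
# (illustrative, using the default :similarity setting :min => [0.5, 0.2]):
# 1. predict with threshold 0.5; on success the result is labelled
#    "Similar to bioassay results"
# 2. otherwise retry with 0.2, append a warning and label the result
#    "Lower than bioassay results"; if that also fails, the prediction is
#    reported as out of the applicability domain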
- return prediction + def crossvalidation folds=10 + start_time = Time.now + nr_instances = @independent_variables.size + indices = (0..nr_instances-1).to_a.shuffle + mid = (nr_instances/folds) + start = 0 + 0.upto(folds-1) do |i| + t = Time.now + print "Fold #{i}: " + # split train data + last = start+mid + last = last-1 unless nr_instances%folds > i + test_idxs = indices[start..last] || [] + idxs = { + :test => test_idxs, + :train => indices-test_idxs + } + start = last+1 + # write training/test data + cv_dir = File.join(@dir,"crossvalidation",i.to_s) + dirs = {} + idxs.each do |t,idx| + d = File.join cv_dir,t.to_s + dirs[t] = d + FileUtils.mkdir_p d + File.open(File.join(d,"independent_variables"),"w+") do |f| + idx.each do |i| + f.print "#{@smiles[i]}," + f.puts @independent_variables[i].join(",") end - elsif neighbor_similarities.size == 1 - prediction[:value] = nil - prediction[:warnings] << "Cannot create prediction: Only one similar compound for threshold #{threshold} in the training set (Threshold: #{algorithms[:similarity][:min].last})." - prediction[:neighbors] = [{:id => neighbor_ids.first, :measurement => neighbor_dependent_variables[0], :similarity => neighbor_similarities.first}] - if threshold == algorithms[:similarity][:min].last - prediction[:confidence] = "Out of applicability domain: Only one similar compound in the training set." - return prediction - end - else - query_descriptors.collect!{|d| d ? 1 : 0} if algorithms[:feature_selection] and algorithms[:descriptors][:method] == "fingerprint" - # call prediction algorithm - result = Algorithm.run algorithms[:prediction][:method], dependent_variables:neighbor_dependent_variables,independent_variables:neighbor_independent_variables ,weights:neighbor_similarities, query_variables:query_descriptors - prediction.merge! result - prediction[:neighbors] = neighbor_ids.collect_with_index{|id,i| {:id => id, :measurement => neighbor_dependent_variables[i], :similarity => neighbor_similarities[i]}} end - if threshold == algorithms[:similarity][:min].first - if prediction[:warnings].empty? - prediction[:confidence] = "Similar to bioassay results" - return prediction - else # try again with a lower threshold - prediction[:warnings] << "Lowering similarity threshold to #{algorithms[:similarity][:min].last}." - predict_substance substance, algorithms[:similarity][:min].last, prediction - end - elsif threshold < algorithms[:similarity][:min].first - prediction[:confidence] = "Lower than bioassay results" - return prediction + File.open(File.join(d,"dependent_variables"),"w+"){ |f| f.puts idx.collect{|i| @dependent_variables[i]}.join("\n")} + if t == :train + File.open(File.join(d,"dependent_variable_type"),"w+"){ |f| f.puts @dependent_variable_type } + File.open(File.join(d,"independent_variable_type"),"w+"){ |f| f.puts @independent_variable_type } + File.open(File.join(d,"similarity_thresholds"),"w+"){ |f| f.puts @similarity_thresholds.join("\n") } end end - - # Predict a substance (compound or nanoparticle), an array of substances or a dataset - # @param [OpenTox::Compound, OpenTox::Nanoparticle, Array, OpenTox::Dataset] - # @return [Hash, Array, OpenTox::Dataset] - def predict object - - training_dataset = Dataset.find training_dataset_id - - # parse data - substances = [] - if object.is_a? Substance - substances = [object] - elsif object.is_a? Array - substances = object - elsif object.is_a? 
Dataset - substances = object.substances - else - raise ArgumentError, "Please provide a OpenTox::Compound an Array of OpenTox::Substances or an OpenTox::Dataset as parameter." - end - - # make predictions - predictions = {} - substances.each do |c| - predictions[c.id.to_s] = predict_substance c - if prediction_feature.is_a? NominalBioActivity and predictions[c.id.to_s][:value] - prediction_feature.accept_values.each do |v| - predictions[c.id.to_s][:probabilities][v] ||= 0.0 # use 0 instead of empty probabilities (happens if all neighbors have the same activity) - end - end - predictions[c.id.to_s][:prediction_feature_id] = prediction_feature_id - end - - # serialize result - if object.is_a? Substance - prediction = predictions[substances.first.id.to_s] - prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} if prediction[:neighbors]# sort according to similarity - return prediction - elsif object.is_a? Array - return predictions - elsif object.is_a? Dataset - d = object.copy - #warning_feature = Warnings.find_or_create_by(:dataset_id => d.id) - confidence_feature = Confidence.find_or_create_by(:dataset_id => d.id) - if prediction_feature.is_a? NominalBioActivity - f = NominalLazarPrediction.find_or_create_by(:name => prediction_feature.name, :accept_values => prediction_feature.accept_values, :model_id => self.id, :training_feature_id => prediction_feature.id) - probability_features = {} - prediction_feature.accept_values.each do |v| - probability_features[v] = LazarPredictionProbability.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id) - end - elsif prediction_feature.is_a? NumericBioActivity - f = NumericLazarPrediction.find_or_create_by(:name => prediction_feature.name, :unit => prediction_feature.unit, :model_id => self.id, :training_feature_id => prediction_feature.id) - prediction_interval = [] - ["lower","upper"].each do |v| - prediction_interval << LazarPredictionInterval.find_or_create_by(:name => v, :model_id => self.id, :training_feature_id => prediction_feature.id) - end - end - - # add predictions to dataset - predictions.each do |substance_id,p| - substance_id = BSON::ObjectId.from_string(substance_id) - d.add substance_id,confidence_feature,p[:confidence] - unless p[:value].nil? 
- d.add substance_id,f,p[:value] - p[:probabilities].each {|name,p| d.add substance_id,probability_features[name],p} if p[:probabilities] - p[:prediction_interval].each_with_index {|v,i| d.add substance_id, prediction_interval[i], v } if p[:prediction_interval] - end - end - d.save - return d - end - - end - - # Save the model - # Stores independent_variables in GridFS to avoid Mongo database size limit problems - def save - file = Mongo::Grid::File.new(Marshal.dump(@independent_variables), :filename => "#{id}.independent_variables") - self.independent_variables_id = $gridfs.insert_one(file) - super - end - - # Get independent variables - # @return [Array] - def independent_variables - @independent_variables ||= Marshal.load $gridfs.find_one(_id: self.independent_variables_id).data - @independent_variables - end - - # Get training dataset - # @return [OpenTox::Dataset] - def training_dataset - Dataset.find(training_dataset_id) - end - - # Get prediction feature - # @return [OpenTox::Feature] - def prediction_feature - Feature.find(prediction_feature_id) - end - - # Get training descriptors - # @return [Array] - def descriptors - descriptor_ids.collect{|id| Feature.find(id)} - end - - # Get training substances - # @return [Array] - def substances - substance_ids.collect{|id| Substance.find(id)} - end - - # Are fingerprints used as descriptors - # @return [TrueClass, FalseClass] - def fingerprints? - algorithms[:descriptors][:method] == "fingerprint" ? true : false - end - + # predict + train_model = self.class.new dirs[:train] + train_model.predict_file File.join(dirs[:test],"independent_variables") + puts Time.now-t end + puts "Total: #{Time.now-start_time}" + end +end - # Classification model - class LazarClassification < Lazar - end +class ClassificationModel < Model - # Regression model - class LazarRegression < Lazar + def predict_file independent_variable_file + pred_dir = File.dirname independent_variable_file + predictions = [] + File.readlines(independent_variable_file).each do |line| + variables = line.chomp.split(",") + smiles = variables.shift + variables = variables.collect{|v| v.to_f} if @independent_variable_type == "numeric" + predictions << predict(smiles,variables) end + File.open(File.join(pred_dir,"classification"),"w+") { |f| predictions.each {|p| f.puts p.join(",")} } + end - # Convenience class for generating and validating lazar models in a single step and predicting substances (compounds and nanoparticles), arrays of substances and datasets - class Validation - - include OpenTox - include Mongoid::Document - include Mongoid::Timestamps - - field :endpoint, type: String - field :qmrf, type: Hash - field :species, type: String - field :source, type: String - field :unit, type: String - field :warnings, type: Array - field :model_id, type: BSON::ObjectId - field :repeated_crossvalidation_id, type: BSON::ObjectId - - # Predict a substance (compound or nanoparticle), an array of substances or a dataset - # @param [OpenTox::Compound, OpenTox::Nanoparticle, Array, OpenTox::Dataset] - # @return [Hash, Array, OpenTox::Dataset] - def predict object - model.predict object - end - - # Get training dataset - # @return [OpenTox::Dataset] - def training_dataset - model.training_dataset - end - - # Get lazar model - # @return [OpenTox::Model::Lazar] - def model - Lazar.find model_id - end - - # Get algorithms - # @return [Hash] - def algorithms - model.algorithms - end - - # Get prediction feature - # @return [OpenTox::Feature] - def prediction_feature - model.prediction_feature - 
-    end
-
-    # Get repeated crossvalidations
-    # @return [OpenTox::Validation::RepeatedCrossValidation]
-    def repeated_crossvalidation
-      OpenTox::Validation::RepeatedCrossValidation.find repeated_crossvalidation_id # full class name required
-    end
-
-    # Get crossvalidations
-    # @return [Array<OpenTox::Validation::CrossValidation>]
-    def crossvalidations
-      repeated_crossvalidation.crossvalidations
-    end
-
-    # Create and validate a nano-lazar model, import data from eNanoMapper if necessary
-    # @return [OpenTox::Model::Validation]
-    def self.from_enanomapper training_dataset: nil, prediction_feature: nil, algorithms: nil
-      training_dataset ||= Dataset.where(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
-      unless training_dataset # try to import
-        Import::Enanomapper.import
-        training_dataset = Dataset.where(name: "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles").first
-        raise ArgumentError, "Cannot import 'Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles' dataset" unless training_dataset
-      end
-      prediction_feature ||= Feature.where(name: "log2(Net cell association)", category: "TOX").first
-
-      model_validation = self.new(
-        :endpoint => prediction_feature.name,
-        :source => prediction_feature.source,
-        :species => "A549 human lung epithelial carcinoma cells",
-        :unit => prediction_feature.unit
-      )
-      model = LazarRegression.create prediction_feature: prediction_feature, training_dataset: training_dataset, algorithms: algorithms
-      model_validation[:model_id] = model.id
-      repeated_cv = OpenTox::Validation::RepeatedCrossValidation.create model, 10, 5
-      model_validation[:repeated_crossvalidation_id] = repeated_cv.id
-      model_validation.save
-      model_validation
-    end
-  end
-end
+  # TODO: with neighbors
+  def predict_smiles smiles
+  end
+
+  # Predict a single compound
+  # @param [String] smiles
+  # @param [Array] variables fingerprint (binary) or scaled descriptor values (numeric)
+  # @return [Array] smiles, classification, probabilities for 0 and 1, maximum similarity, number of neighbors
+  def predict smiles, variables
+    similarities = []
+    @independent_variables.each do |row|
+      if @independent_variable_type == "binary"
+        similarities << Similarity.tanimoto([row, variables])
+      elsif @independent_variable_type == "numeric"
+        similarities << Similarity.cosine([row, variables])
+      end
+    end
+    neighbor_idx = similarities.each_index.select{|i| similarities[i] > @similarity_thresholds[1]}
+    neighbor_idx = similarities.each_index.select{|i| similarities[i] > @similarity_thresholds[0]} if neighbor_idx.size < 2 # lower the similarity threshold
+    neighbor_idx.select!{|i| @smiles[i] != smiles} # remove compounds identical with the query
+    return [smiles,nil,nil,nil,similarities.max,neighbor_idx.size] if neighbor_idx.size < 2
+
+    neighbor_dependent_variables = neighbor_idx.collect{|i| @dependent_variables[i]}
+    neighbor_weights = neighbor_idx.collect{|i| similarities[i]}
+    probabilities = weighted_majority_vote(neighbor_dependent_variables, neighbor_weights)
+    classification = probabilities[1] > probabilities[0] ? 1 : 0
+
+    [ smiles, classification ] + probabilities + [ similarities.max, neighbor_idx.size ]
+  end
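A worked example may help here. The following editorial sketch (not part of the patch, all numbers invented) traces the weighted majority vote for three hypothetical neighbors:

# three hypothetical neighbors: measured classes 1, 0, 1 and
# similarities 0.9, 0.8, 0.6 to the query compound
weights = [0.9, 0.8, 0.6]
w0 = 0.8                             # summed weights of inactive (0) neighbors
w1 = 0.9 + 0.6                       # summed weights of active (1) neighbors
p0 = weights.max * w0 / weights.sum  # => 0.9 * 0.8 / 2.3 ≈ 0.313
p1 = weights.max * w1 / weights.sum  # => 0.9 * 1.5 / 2.3 ≈ 0.587
# p1 > p0, so predict returns classification 1 for the query compound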
+
+  # Weighted majority vote
+  # @param [Array<0,1>] dependent_variables
+  # @param [Array] weights
+  # @return [Array] probabilities
+  def weighted_majority_vote dependent_variables, weights
+    w = []
+    w[0] = weights.each_index.select{|i| dependent_variables[i] == 0}.collect{|i| weights[i]}
+    w[1] = weights.each_index.select{|i| dependent_variables[i] == 1}.collect{|i| weights[i]}
+    weights_sum = weights.sum.to_f
+    weights_max = weights.max.to_f
+    probabilities = []
+    probabilities[0] = weights_max*w[0].sum/weights_sum
+    probabilities[1] = weights_max*w[1].sum/weights_sum
+    probabilities
+  end
+end
diff --git a/lib/similarity.rb b/lib/similarity.rb
index ccbc9d6..deeb81f 100644
--- a/lib/similarity.rb
+++ b/lib/similarity.rb
@@ -1,85 +1,61 @@
-module OpenTox
-  module Algorithm
+class Similarity
 
-    class Vector
-      # Get dot product
-      # @param [Vector]
-      # @param [Vector]
-      # @return [Numeric]
-      def self.dot_product(a, b)
-        products = a.zip(b).map{|a, b| a * b}
-        products.inject(0) {|s,p| s + p}
-      end
-
-      def self.magnitude(point)
-        squares = point.map{|x| x ** 2}
-        Math.sqrt(squares.inject(0) {|s, c| s + c})
-      end
-    end
-
-    class Similarity
-
-      # Get Tanimoto similarity
-      # @param [Array<Array<String>>]
-      # @return [Float]
-      def self.tanimoto fingerprints
-        ( fingerprints[0] & fingerprints[1]).size/(fingerprints[0]|fingerprints[1]).size.to_f
-      end
-
-      #def self.weighted_tanimoto fingerprints
-      #( fingerprints[0] & fingerprints[1]).size/(fingerprints[0]|fingerprints[1]).size.to_f
-      #end
+  # Get Tanimoto similarity
+  # @param [Array<Array<String>>] fingerprints
+  # @return [Float]
+  def self.tanimoto fingerprints
+    ( fingerprints[0] & fingerprints[1] ).size/( fingerprints[0] | fingerprints[1] ).size.to_f
+  end
 
-      # Get Euclidean distance
-      # @param [Array<Array<Float>>]
-      # @return [Float]
-      def self.euclid scaled_properties
-        sq = scaled_properties[0].zip(scaled_properties[1]).map{|a,b| (a - b) ** 2}
-        Math.sqrt(sq.inject(0) {|s,c| s + c})
-      end
+  # Get Euclidean distance
+  # @param [Array<Array<Float>>] variables
+  # @return [Float]
+  def self.euclid variables
+    sq = variables[0].zip(variables[1]).map{|a,b| (a - b) ** 2}
+    Math.sqrt(sq.inject(0) {|s,c| s + c})
+  end
 
-      # Get cosine similarity
-      # http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity
-      # @param [Array<Array<Float>>]
-      # @return [Float]
-      def self.cosine scaled_properties
-        scaled_properties = remove_nils scaled_properties
-        Algorithm::Vector.dot_product(scaled_properties[0], scaled_properties[1]) / (Algorithm::Vector.magnitude(scaled_properties[0]) * Algorithm::Vector.magnitude(scaled_properties[1]))
-      end
+  # Get cosine similarity
+  # http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity
+  # @param [Array<Array<Float>>] variables
+  # @return [Float]
+  def self.cosine variables
+    variables[0].dot_product(variables[1]) / (variables[0].magnitude * variables[1].magnitude)
+  end
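A brief usage sketch for the new class (editorial, not part of the patch; the fragment strings and descriptor values are invented). Both methods take a two-element array, and cosine relies on the Array#dot_product and Array#magnitude extensions from lib/array.rb:

fp_a = ["C-C-O", "C-N"]            # hypothetical MP2D fragment strings
fp_b = ["C-C-O", "C=O"]
Similarity.tanimoto([fp_a, fp_b])  # => 0.33 (1 shared of 3 distinct fragments)
Similarity.cosine([[0.5, 1.2, -0.3], [0.4, 1.0, -0.2]]) # scaled numeric descriptors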
-      # Get weighted cosine similarity
-      # http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity
-      # @param [Array<Array<Float>>] [a,b,weights]
-      # @return [Float]
-      def self.weighted_cosine scaled_properties
-        a,b,w = remove_nils scaled_properties
-        return cosine(scaled_properties) if w.uniq.size == 1
-        dot_product = 0
-        magnitude_a = 0
-        magnitude_b = 0
-        (0..a.size-1).each do |i|
-          dot_product += w[i].abs*a[i]*b[i]
-          magnitude_a += w[i].abs*a[i]**2
-          magnitude_b += w[i].abs*b[i]**2
-        end
-        dot_product/(Math.sqrt(magnitude_a)*Math.sqrt(magnitude_b))
-      end
+
+=begin
+  # Get weighted cosine similarity
+  # http://stackoverflow.com/questions/1838806/euclidean-distance-vs-pearson-correlation-vs-cosine-similarity
+  # @param [Array<Array<Float>>] [a,b,weights]
+  # @return [Float]
+  def self.weighted_cosine scaled_properties
+    a,b,w = remove_nils scaled_properties
+    return cosine(scaled_properties) if w.uniq.size == 1
+    dot_product = 0
+    magnitude_a = 0
+    magnitude_b = 0
+    (0..a.size-1).each do |i|
+      dot_product += w[i].abs*a[i]*b[i]
+      magnitude_a += w[i].abs*a[i]**2
+      magnitude_b += w[i].abs*b[i]**2
+    end
+    dot_product/(Math.sqrt(magnitude_a)*Math.sqrt(magnitude_b))
+  end
 
-      # Remove nil values
-      # @param [Array<Array<Float>>] [a,b,weights]
-      # @return [Array<Array<Float>>] [a,b,weights]
-      def self.remove_nils scaled_properties
-        a = []; b = []; w = []
-        (0..scaled_properties.first.size-1).each do |i|
-          if scaled_properties[0][i] and scaled_properties[1][i] and !scaled_properties[0][i].nan? and !scaled_properties[1][i].nan?
-            a << scaled_properties[0][i]
-            b << scaled_properties[1][i]
-            w << scaled_properties[2][i]
-          end
-        end
-        [a,b,w]
-      end
-    end
-  end
-end
+  # Remove nil values
+  # @param [Array<Array<Float>>] [a,b,weights]
+  # @return [Array<Array<Float>>] [a,b,weights]
+  def self.remove_nils scaled_properties
+    a = []; b = []; w = []
+    (0..scaled_properties.first.size-1).each do |i|
+      if scaled_properties[0][i] and scaled_properties[1][i] and !scaled_properties[0][i].nan? and !scaled_properties[1][i].nan?
+        a << scaled_properties[0][i]
+        b << scaled_properties[1][i]
+        w << scaled_properties[2][i]
+      end
+    end
+    [a,b,w]
+  end
+=end
+
+end
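Taken together with the Model class above, a crossvalidation run over the new file-based layout would look roughly like this (editorial sketch, not part of the patch; "mutagenicity" stands for a hypothetical model directory holding the dependent_variables, independent_variables, dependent_variable_type, independent_variable_type and similarity_thresholds files read by Model#initialize):

model = ClassificationModel.new "mutagenicity"
model.crossvalidation 10  # writes mutagenicity/crossvalidation/0..9/{train,test}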
diff --git a/lib/statistics.rb b/lib/statistics.rb
new file mode 100644
index 0000000..15ea416
--- /dev/null
+++ b/lib/statistics.rb
@@ -0,0 +1,71 @@
+class ClassificationStatistics
+
+  def initialize dir
+    @dir = dir
+    @folds = Dir[File.join(@dir,"[0-9]*")]
+    @confusion_matrix_dir = File.join(@dir,"confusion_matrices")
+    @summaries_dir = File.join(@dir,"summaries")
+  end
+
+  def confusion_matrix
+
+    confusion_matrices = {
+      :all => {:tp => 0, :fp => 0, :tn => 0, :fn => 0},
+      :high_confidence => {:tp => 0, :fp => 0, :tn => 0, :fn => 0},
+      :low_confidence => {:tp => 0, :fp => 0, :tn => 0, :fn => 0},
+    }
+
+    @folds.each do |dir|
+      test_dir = File.join(dir,"test")
+      classifications = File.readlines(File.join(test_dir,"classification")).collect{|row| row.chomp.split(",")}
+      measurements = File.readlines(File.join(test_dir,"dependent_variables")).collect{|v| v.to_i}
+      similarity_thresholds = File.readlines(File.join(dir,"train","similarity_thresholds")).collect{|v| v.chomp.to_f}
+      classifications.each_with_index do |c,i|
+        prediction = c[1]
+        max_sim = c[4].to_f
+        unless prediction.empty?
+          prediction = prediction.to_i
+          if prediction == 1 and measurements[i] == 1
+            confusion_matrices[:all][:tp] +=1
+            max_sim > similarity_thresholds[1] ? confusion_matrices[:high_confidence][:tp] +=1 : confusion_matrices[:low_confidence][:tp] +=1
+          elsif prediction == 0 and measurements[i] == 0
+            confusion_matrices[:all][:tn] +=1
+            max_sim > similarity_thresholds[1] ? confusion_matrices[:high_confidence][:tn] +=1 : confusion_matrices[:low_confidence][:tn] +=1
+          elsif prediction == 1 and measurements[i] == 0
+            confusion_matrices[:all][:fp] +=1
+            max_sim > similarity_thresholds[1] ? confusion_matrices[:high_confidence][:fp] +=1 : confusion_matrices[:low_confidence][:fp] +=1
+          elsif prediction == 0 and measurements[i] == 1
+            confusion_matrices[:all][:fn] +=1
+            max_sim > similarity_thresholds[1] ? confusion_matrices[:high_confidence][:fn] +=1 : confusion_matrices[:low_confidence][:fn] +=1
+          end
+        end
+      end
+    end
+
+    # write cumulative matrices once, after all folds have been counted
+    FileUtils.mkdir_p @confusion_matrix_dir
+    confusion_matrices.each do |t,m|
+      File.open(File.join(@confusion_matrix_dir,t.to_s),"w+"){ |f| f.puts "#{m[:tp]},#{m[:fp]}\n#{m[:fn]},#{m[:tn]}" }
+    end
+
+  end
+
+  def summary
+    [:all,:high_confidence,:low_confidence].each do |cat|
+      confusion_matrix_file = File.join(@confusion_matrix_dir,cat.to_s)
+      confusion_matrix unless File.exist? confusion_matrix_file
+      matrix = File.readlines(confusion_matrix_file).collect{|row| row.chomp.split(",").collect{|v| v.to_f}}
+      tp = matrix[0][0]
+      fp = matrix[0][1]
+      fn = matrix[1][0]
+      tn = matrix[1][1]
+      FileUtils.mkdir_p @summaries_dir
+      File.open(File.join(@summaries_dir,cat.to_s),"w+") do |f|
+        f.puts "accuracy,#{(tp+tn)/(tp+fp+tn+fn)}"
+        f.puts "true_positive_rate,#{tp/(tp+fn)}"
+        f.puts "true_negative_rate,#{tn/(tn+fp)}"
+        f.puts "positive_predictive_value,#{tp/(tp+fp)}"
+        f.puts "negative_predictive_value,#{tn/(tn+fn)}"
+      end
+    end
+  end
+end
-- 
cgit v1.2.3
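The fold statistics would then be collected as follows (editorial sketch, not part of the patch; the path is hypothetical and matches the crossvalidation directory written by the Model sketch above):

require "fileutils"
require_relative "lib/statistics"
stats = ClassificationStatistics.new "mutagenicity/crossvalidation"
stats.confusion_matrix  # writes confusion_matrices/{all,high_confidence,low_confidence}
stats.summary           # derives accuracy, true positive/negative rates and predictive values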