From 8b31acab67e22f30a87c995a94f1ee1e2a3d510f Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Wed, 10 Oct 2018 21:39:11 +0200 Subject: dataset tests fixed --- lib/compound.rb | 3 - lib/dataset.rb | 369 +++++++++++++++++++++++++++-------------------------- lib/lazar.rb | 3 + test/dataset.rb | 27 ++-- test/experiment.rb | 301 ------------------------------------------- 5 files changed, 207 insertions(+), 496 deletions(-) delete mode 100644 test/experiment.rb diff --git a/lib/compound.rb b/lib/compound.rb index b53cba1..22c8575 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -1,6 +1,3 @@ -PUBCHEM_URI = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/" -CHEMBL_URI = "https://www.ebi.ac.uk/chembl/api/data/molecule/" - module OpenTox # Small molecules with defined chemical structures diff --git a/lib/dataset.rb b/lib/dataset.rb index 6ad3215..b6c6173 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -67,166 +67,76 @@ module OpenTox #data_entries[substance.to_s][feature.to_s].uniq! if value.numeric? # assuming that identical values come from the same source end - # Dataset operations - - # Merge an array of datasets - # @param [Array] OpenTox::Dataset Array to be merged - # @param [Hash] feature modifications - # @param [Hash] value modifications - # @return [OpenTox::Dataset] merged dataset - def self.merge datasets, feature_map=nil, value_map=nil - dataset = self.new(:source => datasets.collect{|d| d.source}.join(", "), :name => datasets.collect{|d| d.name}.uniq.join(", ")) - datasets.each do |d| - d.substances.each do |s| - d.features.each do |f| - d.values(s,f).each do |v| - f = feature_map[f] if feature_map and feature_map[f] - v = value_map[v] if value_map and value_map[v] - dataset.add s,f,v #unless dataset.values(s,f).include? v - end - end - end - end - dataset.save - dataset - end - - # Split a dataset into n folds - # @param [Integer] number of folds - # @return [Array] Array with folds [training_dataset,test_dataset] - def folds n - len = self.substances.size - indices = (0..len-1).to_a.shuffle - mid = (len/n) - chunks = [] - start = 0 - 1.upto(n) do |i| - last = start+mid - last = last-1 unless len%n >= i - test_idxs = indices[start..last] || [] - test_substances = test_idxs.collect{|i| substances[i]} - training_idxs = indices-test_idxs - training_substances = training_idxs.collect{|i| substances[i]} - chunk = [training_substances,test_substances].collect do |substances| - dataset = self.class.create(:name => "#{self.name} (Fold #{i-1})",:source => self.id ) - substances.each do |substance| - substance.dataset_ids << dataset.id - substance.dataset_ids.uniq! - substance.save - dataset.data_entries[substance.id.to_s] = data_entries[substance.id.to_s] ||= {} - end - dataset.save - dataset - end - start = last+1 - chunks << chunk - end - chunks - end - - # Serialisation + # Parsers - # Convert dataset to csv format including compound smiles as first column, other column headers are feature names - # @return [String] - # TODO original_id - def to_csv(inchi=false) - CSV.generate() do |csv| - compound = substances.first.is_a? Compound - if compound - csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name} - else - csv << ["Name"] + features.collect{|f| f.name} - end - substances.each do |substance| - if compound - name = (inchi ? substance.inchi : substance.smiles) - else - name = substance.name - end - nr_measurements = features.collect{|f| data_entries[substance.id.to_s][f.id.to_s].size if data_entries[substance.id.to_s][f.id.to_s]}.compact.uniq - - if nr_measurements.size > 1 - warn "Unequal number of measurements (#{nr_measurements}) for '#{name}'. Skipping entries." - else - (0..nr_measurements.first-1).each do |i| - row = [name] - features.each do |f| - values(substance,f) ? row << values(substance,f)[i] : row << "" - end - csv << row - end + # Create a dataset from CSV file + # @param [File] Input file with the following format: + # - ID column (optional): header containing "ID" string, arbitrary ID values + # - SMILES/InChI column: header indicating "SMILES" or "InChI", Smiles or InChI strings + # - one or more properties column(s): header with property name(s), property values + # files with a single property column are read as BioActivities (i.e. dependent variable) + # files with multiple property columns are read as SubstanceProperties (i.e. independent variables) + # @return [OpenTox::Dataset] + def self.from_csv_file file + md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files + dataset = self.find_by(:md5 => md5) + if dataset + $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import." + else + $logger.debug "Parsing #{file}." + table = nil + sep = "," + ["\t",";"].each do |s| # guess alternative CSV separator + if File.readlines(file).first.match(/#{s}/) + sep = s + break end end - end - end - - # Convert dataset to SDF format - # @return [String] SDF string - def to_sdf - sdf = "" - substances.each do |substance| - sdf_lines = substance.sdf.sub(/\$\$\$\$\n/,"").split("\n") - sdf_lines[0] = substance.smiles - sdf += sdf_lines.join("\n") - features.each do |f| - sdf += "\n> <#{f.name}>\n" - sdf += values(substance,f).uniq.join "," - end - sdf += "\n$$$$\n" - end - sdf - end - - # Parsers - - # Create a dataset from PubChem Assay - # @param [Integer] PubChem AssayID (AID) - # @return [OpenTox::Dataset] - def self.from_pubchem aid - url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/#{aid}/CSV" - csv = CSV.parse(RestClientWrapper.get(url)) - csv.select!{|r| r[0].match /^\d/} # discard header rows - table = [["SID","SMILES","Activity"]] - csv.each_slice(100) do |slice| # get SMILES in chunks - sids = slice.collect{|s| s[1]} - smiles = RestClientWrapper.get("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/#{sids.join(",")}/property/CanonicalSMILES/TXT").split("\n") - abort("Could not get SMILES for all SIDs from PubChem") unless sids.size == smiles.size - smiles.each_with_index do |smi,i| - table << [slice[i][1],smi.chomp,slice[i][3]] + table = CSV.read file, :col_sep => sep, :skip_blanks => true, :encoding => 'windows-1251:utf-8' + if table + dataset = self.new(:source => file, :name => File.basename(file,".*"), :md5 => md5) + dataset.parse_table table + else + bad_request_error "#{file} is not a valid CSV/TSV file. Could not find "," ";" or TAB as column separator." end end - dataset = self.new(:source => url) # TODO name - dataset.parse_table table, false dataset end # Create a dataset from SDF file + # files with a single data field are read as BioActivities (i.e. dependent variable) + # files with multiple data fields are read as SubstanceProperties (i.e. independent variable) # @param [File] # @return [OpenTox::Dataset] - def self.from_sdf_file file, map=nil + def self.from_sdf_file file md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files dataset = self.find_by(:md5 => md5) if dataset $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import." else $logger.debug "Parsing #{file}." - table = nil - read_result = false - sdf = "" - dataset = self.new(:source => file, :name => File.basename(file), :md5 => md5) + + dataset = self.new(:source => file, :name => File.basename(file,".*"), :md5 => md5) original_id = OriginalId.find_or_create_by(:dataset_id => dataset.id,:name => dataset.name+".ID") + read_result = false + sdf = "" feature_name = "" compound = nil features = {} + table = [["ID","SMILES"]] File.readlines(file).each do |line| if line.match %r{\$\$\$\$} sdf << line id = sdf.split("\n").first.chomp compound = Compound.from_sdf sdf - dataset.add compound, original_id, id - features.each { |f,v| dataset.add compound, f, v } + row = [id,compound.smiles] + features.each do |f,v| + table[0] << f unless table[0].include? f + row[table[0].index(f)] = v + end + table << row sdf = "" features = {} elsif line.match /^>\s+ feature_name, :measured => true) - value = value.to_f - else - feature = NominalFeature.find_or_create_by(:name => feature_name, :measured => true) - end - features[feature] = value + features[feature_name] = value read_result = false else sdf << line end end end + dataset.parse_table table end dataset.save dataset end - - # Create a dataset from CSV file - # @param [File] - # @param [TrueClass,FalseClass] accept or reject empty values + + # Create a dataset from PubChem Assay + # @param [Integer] PubChem AssayID (AID) # @return [OpenTox::Dataset] - def self.from_csv_file file - md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files - dataset = self.find_by(:md5 => md5) - if dataset - $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import." - else - $logger.debug "Parsing #{file}." - table = nil - [",","\t",";"].each do |sep| # guess CSV separator - if File.readlines(file).first.match(/#{sep}/) - table = CSV.read file, :col_sep => sep, :skip_blanks => true, :encoding => 'windows-1251:utf-8' - break - end - end - if table - dataset = self.new(:source => file, :name => File.basename(file), :md5 => md5) - dataset.parse_table table - else - bad_request_error "#{file} is not a valid CSV/TSV file. Could not find "," ";" or TAB as column separator." + def self.from_pubchem_aid aid + url = File.join PUBCHEM_URI, "assay/aid/#{aid}/CSV" + assay_metadata = JSON.parse(RestClientWrapper.get(File.join PUBCHEM_URI,"assay/aid/#{aid}/description/JSON").to_s)["PC_AssayContainer"][0]["assay"]["descr"] + name = assay_metadata["name"].gsub(/\s+/,"_") + csv = CSV.parse(RestClientWrapper.get(url)) + csv.select!{|r| r[0].match /^\d/} # discard header rows + table = [["SID","SMILES",name]] + csv.each_slice(100) do |slice| # get SMILES in chunks + sids = slice.collect{|s| s[1]} + smiles = RestClientWrapper.get(File.join(PUBCHEM_URI,"compound/cid/#{sids.join(",")}/property/CanonicalSMILES/TXT")).split("\n").collect{|s| s.to_s} + abort("Could not get SMILES for all SIDs from PubChem") unless sids.size == smiles.size + smiles.each_with_index do |smi,i| + table << [slice[i][1].to_s,smi.chomp,slice[i][3].to_s] end end + dataset = self.new(:source => url, :name => name) + dataset.parse_table table dataset end @@ -302,8 +202,8 @@ module OpenTox features = [] # guess feature types + bioactivity = true if feature_names.size == 1 feature_names.each_with_index do |f,i| - metadata = {:name => f, :measured => true} original_id ? j = i+2 : j = i+1 values = table.collect{|row| val=row[j].to_s.strip; val.blank? ? nil : val }.uniq.compact types = values.collect{|v| v.numeric? ? true : false}.uniq @@ -311,11 +211,18 @@ module OpenTox if values.size == 0 # empty feature elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes numeric[i] = true - feature = NumericFeature.find_or_create_by(metadata) + if bioactivity + feature = NumericBioActivity.find_or_create_by(:name => f) + else + feature = NumericSubstanceProperty.find_or_create_by(:name => f) + end else - metadata["accept_values"] = values.sort numeric[i] = false - feature = NominalFeature.find_or_create_by(metadata) + if bioactivity + feature = NominalBioActivity.find_or_create_by(:name => f, :accept_values => values.sort) + else + feature = NominalSubstanceProperty.find_or_create_by(:name => f, :accept_values => values.sort) + end end features << feature if feature end @@ -326,13 +233,12 @@ module OpenTox table.each_with_index do |vals,i| original_id_value = vals.shift.strip if original_id identifier = vals.shift.strip - #warn "No feature values for compound at line #{i+2} of #{source}." if vals.compact.empty? #and !accept_empty_values begin case compound_format when /SMILES/i - substance = OpenTox::Compound.from_smiles(identifier) + substance = Compound.from_smiles(identifier) when /InChI/i - substance = OpenTox::Compound.from_inchi(identifier) + substance = Compound.from_inchi(identifier) end rescue substance = nil @@ -345,18 +251,13 @@ module OpenTox substance.dataset_ids << self.id substance.dataset_ids.uniq! substance.save - - unless vals.size == features.size - warn "Number of values at position #{i+2} is different than header size (#{vals.size} vs. #{features.size}), all entries are ignored." - next - end add substance, original_id, original_id_value if original_id vals.each_with_index do |v,j| if v.blank? - warn "Empty value for compound '#{identifier}' and feature '#{feature_names[i]}'." - next + warn "Empty value for compound '#{identifier}' (#{original_id_value}) and feature '#{feature_names[j]}'." + v = nil elsif numeric[j] v = v.to_f else @@ -364,6 +265,7 @@ module OpenTox end add substance, features[j], v end + data_entries[substance.id.to_s] ||= nil #if vals.empty? # no features, eg batch predictions end all_substances.duplicates.each do |substance| @@ -374,6 +276,115 @@ module OpenTox save end + # Serialisation + + # Convert dataset to csv format including compound smiles as first column, other column headers are feature names + # @return [String] + def to_csv(inchi=false) + CSV.generate() do |csv| + compound = substances.first.is_a? Compound + if compound + csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name} + else + csv << ["Name"] + features.collect{|f| f.name} + end + substances.each do |substance| + if compound + name = (inchi ? substance.inchi : substance.smiles) + else + name = substance.name + end + nr_measurements = features.collect{|f| data_entries[substance.id.to_s][f.id.to_s].size if data_entries[substance.id.to_s][f.id.to_s]}.compact.uniq + + if nr_measurements.size > 1 + warn "Unequal number of measurements (#{nr_measurements}) for '#{name}'. Skipping entries." + else + (0..nr_measurements.first-1).each do |i| + row = [name] + features.each do |f| + values(substance,f) ? row << values(substance,f)[i] : row << "" + end + csv << row + end + end + end + end + end + + # Convert dataset to SDF format + # @return [String] SDF string + def to_sdf + sdf = "" + substances.each do |substance| + sdf_lines = substance.sdf.sub(/\$\$\$\$\n/,"").split("\n") + sdf_lines[0] = substance.smiles + sdf += sdf_lines.join("\n") + features.each do |f| + sdf += "\n> <#{f.name}>\n" + sdf += values(substance,f).uniq.join "," + end + sdf += "\n$$$$\n" + end + sdf + end + + # Dataset operations + + # Merge an array of datasets + # @param [Array] OpenTox::Dataset Array to be merged + # @param [Hash] feature modifications + # @param [Hash] value modifications + # @return [OpenTox::Dataset] merged dataset + def self.merge datasets, feature_map=nil, value_map=nil + dataset = self.new(:source => datasets.collect{|d| d.source}.join(", "), :name => datasets.collect{|d| d.name}.uniq.join(", ")) + datasets.each do |d| + d.substances.each do |s| + d.features.each do |f| + d.values(s,f).each do |v| + f = feature_map[f] if feature_map and feature_map[f] + v = value_map[v] if value_map and value_map[v] + dataset.add s,f,v #unless dataset.values(s,f).include? v + end + end + end + end + dataset.save + dataset + end + + # Split a dataset into n folds + # @param [Integer] number of folds + # @return [Array] Array with folds [training_dataset,test_dataset] + def folds n + len = self.substances.size + indices = (0..len-1).to_a.shuffle + mid = (len/n) + chunks = [] + start = 0 + 1.upto(n) do |i| + last = start+mid + last = last-1 unless len%n >= i + test_idxs = indices[start..last] || [] + test_substances = test_idxs.collect{|i| substances[i]} + training_idxs = indices-test_idxs + training_substances = training_idxs.collect{|i| substances[i]} + chunk = [training_substances,test_substances].collect do |substances| + dataset = self.class.create(:name => "#{self.name} (Fold #{i-1})",:source => self.id ) + substances.each do |substance| + substance.dataset_ids << dataset.id + substance.dataset_ids.uniq! + substance.save + dataset.data_entries[substance.id.to_s] = data_entries[substance.id.to_s] ||= {} + end + dataset.save + dataset + end + start = last+1 + chunks << chunk + end + chunks + end + # Delete dataset def delete compounds.each{|c| c.dataset_ids.delete id.to_s} @@ -453,7 +464,7 @@ module OpenTox bad_request_error "'#{compound_format}' is not a supported compound format in the header. " \ "Accepted formats: SMILES, InChI. Please take a look on the help page." end - numeric = [] + #numeric = [] features = [] # guess feature types feature_names.each_with_index do |f,i| @@ -463,11 +474,11 @@ module OpenTox feature = nil if values.size == 0 # empty feature elsif values.size > 5 and types.size == 1 and types.first == true # 5 max classes - numeric[i] = true + #numeric[i] = true feature = NumericFeature.find_or_create_by(metadata) else metadata["accept_values"] = values.sort - numeric[i] = false + #numeric[i] = false feature = NominalFeature.find_or_create_by(metadata) end features << feature if feature diff --git a/lib/lazar.rb b/lib/lazar.rb index d032282..13ad1f8 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -68,6 +68,9 @@ suppressPackageStartupMessages({ }) " +PUBCHEM_URI = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/" +CHEMBL_URI = "https://www.ebi.ac.uk/chembl/api/data/molecule/" + # OpenTox classes and includes CLASSES = ["Feature","Substance","Dataset","LazarPrediction","CrossValidation","LeaveOneOutValidation","RepeatedCrossValidation","Experiment"]# Algorithm and Models are modules diff --git a/test/dataset.rb b/test/dataset.rb index 4196fd8..2b439bb 100644 --- a/test/dataset.rb +++ b/test/dataset.rb @@ -24,10 +24,10 @@ class DatasetTest < MiniTest::Test # real datasets def test_import_pubchem - d = Dataset.from_pubchem 1191 + d = Dataset.from_pubchem_aid 1191 assert_equal 87, d.compounds.size assert_equal 2, d.features.size - assert_equal "Active", d.values(d.compounds[10],d.features[1]) + assert_equal ["Active"], d.values(d.compounds[10],d.features[1]) # TODO endpoint name # TODO regression import end @@ -37,7 +37,7 @@ class DatasetTest < MiniTest::Test assert_equal 53, d.compounds.size assert_equal 1, d.features.size f = d.features[0] - assert_equal "input_53.csv.ID", f.name + assert_equal "input_53.ID", f.name assert_equal OriginalId, f.class assert_equal ["123-30-8"], d.values(d.compounds.first,f) end @@ -47,18 +47,18 @@ class DatasetTest < MiniTest::Test assert_equal 53, d.compounds.size assert_equal 1, d.features.size f = d.features[0] - assert_equal "input_53.tsv.ID", f.name + assert_equal "input_53.ID", f.name assert_equal OriginalId, f.class assert_equal ["123-30-8"], d.values(d.compounds.first,f) end def test_import_sdf - #d = Dataset.from_sdf_file "#{DATA_DIR}/cas_4337.sdf" d = Dataset.from_sdf_file "#{DATA_DIR}/PA.sdf" - assert_equal Compound.from_smiles("C[C@H]1C(=O)O[C@@H]2CCN3[C@@H]2C(=CC3)COC(=O)[C@]([C@]1(C)O)(C)O").smiles, d.compounds.first.smiles - f = Feature.find_by(:name => "original_id") assert_equal 35, d.features.size - assert_equal ["9415"], d.values(d.compounds.first,f) + assert_kind_of NumericSubstanceProperty, d.features[1] + assert_equal NominalSubstanceProperty, d.features.last.class + assert_equal 602, d.compounds.size + assert_match "PUBCHEM_XLOGP3_AA", d.warnings.last end def test_import_hamster @@ -66,7 +66,7 @@ class DatasetTest < MiniTest::Test assert_equal Dataset, d.class assert_equal 1, d.features.size assert_equal 85, d.compounds.size - assert_equal true, d.features.first.measured + assert_equal NominalBioActivity, d.features.first.class csv = CSV.read("#{DATA_DIR}/hamster_carcinogenicity.csv") csv.shift csv.each do |row| @@ -104,7 +104,7 @@ class DatasetTest < MiniTest::Test f = File.join DATA_DIR, "multi_cell_call.csv" d = OpenTox::Dataset.from_csv_file f csv = CSV.read f - assert_equal true, d.features.first.nominal? + assert_equal NominalBioActivity, d.features.first.class assert_equal 1056, d.compounds.size assert_equal csv.first.size-1, d.features.size errors.each do |smi| @@ -157,7 +157,7 @@ class DatasetTest < MiniTest::Test def test_create_without_features_smiles_and_inchi ["smiles", "inchi"].each do |type| - d = Dataset.from_csv_file File.join(DATA_DIR,"batch_prediction_#{type}_small.csv"), true + d = Dataset.from_csv_file File.join(DATA_DIR,"batch_prediction_#{type}_small.csv") assert_equal Dataset, d.class refute_nil d.id dataset = Dataset.find d.id @@ -169,6 +169,7 @@ class DatasetTest < MiniTest::Test # dataset operations def test_merge + skip # TODO use new Features source_feature = Feature.where(:name => "Ames test categorisation").first target_feature = Feature.where(:name => "Mutagenicity").first kazius = Dataset.from_sdf_file "#{DATA_DIR}/cas_4337.sdf" @@ -177,10 +178,11 @@ class DatasetTest < MiniTest::Test d = Dataset.merge [kazius,hansen,efsa], {source_feature => target_feature}, {1 => "mutagen", 0 => "nonmutagen"} #File.open("tmp.csv","w+"){|f| f.puts d.to_csv} assert_equal 8281, d.compounds.size - assert_equal 4, d.features.size c = Compound.from_smiles("C/C=C/C=O") assert_equal ["mutagen"], d.values(c,target_feature) assert_equal "/home/ist/lazar/test/data/cas_4337.sdf, /home/ist/lazar/test/data/hansen.csv, /home/ist/lazar/test/data/efsa.csv", d.source + p d.features + assert_equal 4, d.features.size end def test_folds @@ -219,7 +221,6 @@ class DatasetTest < MiniTest::Test c = Compound.from_smiles row.shift serialized[c.inchi] = row end - #puts serialized.to_yaml original.each do |inchi,row| row.each_with_index do |v,i| if v.numeric? diff --git a/test/experiment.rb b/test/experiment.rb deleted file mode 100644 index 418f7fe..0000000 --- a/test/experiment.rb +++ /dev/null @@ -1,301 +0,0 @@ -require_relative "setup.rb" - -class ExperimentTest < MiniTest::Test - - def test_regression_experiment - skip - datasets = [ - "EPAFHM.medi_log10.csv", - #"EPAFHM.csv", - #"FDA_v3b_Maximum_Recommended_Daily_Dose_mmol.csv", - "LOAEL_mmol_corrected_smiles.csv" - ] - experiment = Experiment.create( - :name => "Default regression for datasets #{datasets}.", - :dataset_ids => datasets.collect{|d| Dataset.from_csv_file(File.join(DATA_DIR, d)).id}, - :model_settings => [ - { - :algorithm => "OpenTox::Model::LazarRegression", - } - ] - ) - #experiment.run - puts experiment.report.to_yaml - assert_equal datasets.size, experiment.results.size - experiment.results.each do |dataset_id, result| - assert_equal 1, result.size - result.each do |r| - assert_kind_of BSON::ObjectId, r[:model_id] - assert_kind_of BSON::ObjectId, r[:repeated_crossvalidation_id] - end - end - end - - def test_classification_experiment - - skip - datasets = [ "hamster_carcinogenicity.csv" ] - experiment = Experiment.create( - :name => "Fminer vs fingerprint classification for datasets #{datasets}.", - :dataset_ids => datasets.collect{|d| Dataset.from_csv_file(File.join(DATA_DIR, d)).id}, - :model_settings => [ - { - :algorithm => "OpenTox::Model::LazarClassification", - },{ - :algorithm => "OpenTox::Model::LazarClassification", - :neighbor_algorithm_parameter => {:min_sim => 0.3} - }, - #{ - #:algorithm => "OpenTox::Model::LazarFminerClassification", - #} - ] - ) - #experiment.run -=begin - experiment = Experiment.find "55f944a22b72ed7de2000000" -=end - puts experiment.report.to_yaml - experiment.results.each do |dataset_id, result| - assert_equal 2, result.size - result.each do |r| - assert_kind_of BSON::ObjectId, r[:model_id] - assert_kind_of BSON::ObjectId, r[:repeated_crossvalidation_id] - end - end - end - - def test_regression_fingerprints - skip -#=begin - datasets = [ - "EPAFHM.medi_log10.csv", - #"LOAEL_mmol_corrected_smiles.csv" - ] - min_sims = [0.3,0.7] - #min_sims = [0.7] - #types = ["FP2","FP3","FP4","MACCS","MP2D"] - types = ["MP2D","FP3"] - experiment = Experiment.create( - :name => "Fingerprint regression with different types for datasets #{datasets}.", - :dataset_ids => datasets.collect{|d| Dataset.from_csv_file(File.join(DATA_DIR, d)).id}, - ) - types.each do |type| - min_sims.each do |min_sim| - experiment.model_settings << { - :model_algorithm => "OpenTox::Model::LazarRegression", - :prediction_algorithm => "OpenTox::Algorithm::Regression.weighted_average", - :neighbor_algorithm => "fingerprint_neighbors", - :neighbor_algorithm_parameters => { - :type => type, - :min_sim => min_sim, - } - } - end - end - experiment.run -#=end -=begin - experiment = Experiment.find '56029cb92b72ed673d000000' -=end - p experiment.id - experiment.results.each do |dataset,result| - result.each do |r| - params = Model::Lazar.find(r["model_id"])[:neighbor_algorithm_parameters] - RepeatedCrossValidation.find(r["repeated_crossvalidation_id"]).crossvalidations.each do |cv| - cv.validation_ids.each do |vid| - model_params = Model::Lazar.find(Validation.find(vid).model_id)[:neighbor_algorithm_parameters] - assert_equal params[:type], model_params[:type] - assert_equal params[:min_sim], model_params[:min_sim] - refute_equal params[:training_dataset_id], model_params[:training_dataset_id] - end - end - end - end - puts experiment.report.to_yaml - p experiment.summary - end - - def test_mpd_fingerprints - skip - datasets = [ - "EPAFHM.medi_log10.csv", - ] - types = ["FP2","MP2D"] - experiment = Experiment.create( - :name => "FP2 vs MP2D fingerprint regression for datasets #{datasets}.", - :dataset_ids => datasets.collect{|d| Dataset.from_csv_file(File.join(DATA_DIR, d)).id}, - ) - types.each do |type| - experiment.model_settings << { - :algorithm => "OpenTox::Model::LazarRegression", - :neighbor_algorithm => "fingerprint_neighbors", - :neighbor_algorithm_parameter => { - :type => type, - :min_sim => 0.7, - } - } - end - experiment.run - p experiment.id -=begin -=end - #experiment = Experiment.find '55ffd0c02b72ed123c000000' - p experiment - puts experiment.report.to_yaml - end - - def test_multiple_datasets - skip - datasets = [ - "EPAFHM.medi_log10.csv", - "LOAEL_mmol_corrected_smiles.csv" - ] - min_sims = [0.3] - types = ["FP2"] - experiment = Experiment.create( - :name => "Fingerprint regression with mutiple datasets #{datasets}.", - :dataset_ids => datasets.collect{|d| Dataset.from_csv_file(File.join(DATA_DIR, d)).id}, - ) - types.each do |type| - min_sims.each do |min_sim| - experiment.model_settings << { - :model_algorithm => "OpenTox::Model::LazarRegression", - :prediction_algorithm => "OpenTox::Algorithm::Regression.weighted_average", - :neighbor_algorithm => "fingerprint_neighbors", - :neighbor_algorithm_parameters => { - :type => type, - :min_sim => min_sim, - } - } - end - end - experiment.run - p experiment.id - experiment.results.each do |dataset,result| - result.each do |r| - params = Model::Lazar.find(r["model_id"])[:neighbor_algorithm_parameters] - RepeatedCrossValidation.find(r["repeated_crossvalidation_id"]).crossvalidations.each do |cv| - cv.validation_ids.each do |vid| - model_params = Model::Lazar.find(Validation.find(vid).model_id)[:neighbor_algorithm_parameters] - assert_equal params[:type], model_params[:type] - assert_equal params[:min_sim], model_params[:min_sim] - refute_equal params[:training_dataset_id], model_params[:training_dataset_id] - end - end - end - end - puts experiment.report.to_yaml - p experiment.summary - end - - def test_mpd_mna_regression_fingerprints - skip - datasets = [ - "EPAFHM.medi.csv", - #"hamster_carcinogenicity.csv" - ] - min_sims = [0.0,0.3] - types = ["MP2D","MNA"] - neighbor_algos = [ - "fingerprint_neighbors", - "fingerprint_count_neighbors", - ] - experiment = Experiment.create( - :name => "MNA vs MPD descriptors", - :dataset_ids => datasets.collect{|d| Dataset.from_csv_file(File.join(DATA_DIR, d)).id}, - ) - types.each do |type| - min_sims.each do |min_sim| - neighbor_algos.each do |neighbor_algo| - experiment.model_settings << { - :model_algorithm => "OpenTox::Model::LazarRegression", - :prediction_algorithm => "OpenTox::Algorithm::Regression.weighted_average", - :neighbor_algorithm => neighbor_algo, - :neighbor_algorithm_parameters => { - :type => type, - :min_sim => min_sim, - } - } - end - end - end - experiment.run -#=end -=begin - experiment = Experiment.find '56029cb92b72ed673d000000' -=end - p experiment.id - puts experiment.report.to_yaml - #p experiment.summary - experiment.results.each do |dataset,result| - result.each do |r| - p r - # TODO fix r["model_id"] - params = Model::Lazar.find(r["model_id"])[:neighbor_algorithm_parameters] - RepeatedCrossValidation.find(r["repeated_crossvalidation_id"]).crossvalidations.each do |cv| - cv.validation_ids.each do |vid| - model_params = Model::Lazar.find(Validation.find(vid).model_id)[:neighbor_algorithm_parameters] - assert_equal params[:type], model_params[:type] - assert_equal params[:min_sim], model_params[:min_sim] - refute_equal params[:training_dataset_id], model_params[:training_dataset_id] - end - end - end - end - end - - def test_mpd_mna_classification_fingerprints - skip - datasets = [ - #"EPAFHM.medi.csv", - "hamster_carcinogenicity.csv" - ] - min_sims = [0.0,0.3] - types = ["MP2D","MNA"] - neighbor_algos = [ - "fingerprint_count_neighbors", - "fingerprint_neighbors", - ] - experiment = Experiment.create( - :name => "MNA vs MPD descriptors", - :dataset_ids => datasets.collect{|d| Dataset.from_csv_file(File.join(DATA_DIR, d)).id}, - ) - types.each do |type| - min_sims.each do |min_sim| - neighbor_algos.each do |neighbor_algo| - experiment.model_settings << { - :model_algorithm => "OpenTox::Model::LazarClassification", - :prediction_algorithm => "OpenTox::Algorithm::Classification.weighted_majority_vote", - :neighbor_algorithm => neighbor_algo, - :neighbor_algorithm_parameters => { - :type => type, - :min_sim => min_sim, - } - } - end - end - end - experiment.run -#=end -=begin - experiment = Experiment.find '56029cb92b72ed673d000000' -=end - p experiment.id - puts experiment.report.to_yaml - #p experiment.summary - experiment.results.each do |dataset,result| - result.each do |r| - # TODO fix r["model_id"] - params = Model::Lazar.find(r["model_id"])[:neighbor_algorithm_parameters] - RepeatedCrossValidation.find(r["repeated_crossvalidation_id"]).crossvalidations.each do |cv| - cv.validation_ids.each do |vid| - model_params = Model::Lazar.find(Validation.find(vid).model_id)[:neighbor_algorithm_parameters] - assert_equal params[:type], model_params[:type] - assert_equal params[:min_sim], model_params[:min_sim] - refute_equal params[:training_dataset_id], model_params[:training_dataset_id] - end - end - end - end - end -end -- cgit v1.2.3