From 6ab86c253ba0eb79b9e6a20effa2d18626accf2b Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 13 Aug 2015 11:56:40 +0200 Subject: OpenBabel can (canonical smiles) instead of inchi as internal identifier to avoid OpenBabel InChi bug. --- lazar.gemspec | 2 +- lib/compound.rb | 54 +++++++--- lib/descriptor.rb | 8 +- lib/lazar-model.rb | 287 +++++++++++++++++++++++++++++++++++++++++++++++++++ lib/lazar.rb | 2 +- lib/neighbor.rb | 2 +- lib/overwrite.rb | 6 ++ test/compound.rb | 18 +++- test/lazar-fminer.rb | 51 +++++++++ test/validation.rb | 41 ++++++++ 10 files changed, 445 insertions(+), 26 deletions(-) create mode 100644 lib/lazar-model.rb create mode 100644 test/lazar-fminer.rb create mode 100644 test/validation.rb diff --git a/lazar.gemspec b/lazar.gemspec index 3a9a1af..7a90080 100644 --- a/lazar.gemspec +++ b/lazar.gemspec @@ -2,7 +2,7 @@ $:.push File.expand_path("../lib", __FILE__) Gem::Specification.new do |s| - s.name = "opentox-client" + s.name = "lazar" s.version = File.read("./VERSION").strip s.authors = ["Christoph Helma, Martin Guetlein, Andreas Maunz, Micha Rautenberg, David Vorgrimmler, Denis Gebele"] s.email = ["helma@in-silico.ch"] diff --git a/lib/compound.rb b/lib/compound.rb index 3418fcc..5343aa0 100644 --- a/lib/compound.rb +++ b/lib/compound.rb @@ -10,13 +10,13 @@ module OpenTox include OpenTox field :inchi, type: String - attr_readonly :inchi field :smiles, type: String field :inchikey, type: String field :names, type: Array field :cid, type: String field :chemblid, type: String - field :image_id, type: BSON::ObjectId + field :png_id, type: BSON::ObjectId + field :svg_id, type: BSON::ObjectId field :sdf_id, type: BSON::ObjectId field :fp4, type: Array field :fp4_size, type: Integer @@ -46,14 +46,18 @@ module OpenTox # @return [OpenTox::Compound] Compound def self.from_smiles smiles # do not store smiles because it might be noncanonical - Compound.find_or_create_by :inchi => obconversion(smiles,"smi","inchi") + Compound.find_or_create_by :smiles => obconversion(smiles,"smi","can") end # Create a compound from inchi string # @param inchi [String] smiles InChI string # @return [OpenTox::Compound] Compound def self.from_inchi inchi - Compound.find_or_create_by :inchi => inchi + # Temporary workaround for OpenBabels Inchi bug + # http://sourceforge.net/p/openbabel/bugs/957/ + # bug has not been fixed in latest git/development version + smiles = `echo "#{inchi}" | babel -iinchi - -ocan`.chomp.strip + smiles.empty? ? nil : Compound.find_or_create_by(:smiles => smiles, :inchi => inchi) end # Create a compound from sdf string @@ -61,7 +65,7 @@ module OpenTox # @return [OpenTox::Compound] Compound def self.from_sdf sdf # do not store sdf because it might be 2D - Compound.find_or_create_by :inchi => obconversion(sdf,"sdf","inchi") + Compound.find_or_create_by :smiles => obconversion(sdf,"sdf","can") end # Create a compound from name. Relies on an external service for name lookups. @@ -70,20 +74,30 @@ module OpenTox # @param name [String] can be also an InChI/InChiKey, CAS number, etc # @return [OpenTox::Compound] Compound def self.from_name name - Compound.find_or_create_by :inchi => RestClientWrapper.get(File.join(CACTUS_URI,URI.escape(name),"stdinchi")) + Compound.find_or_create_by :smiles => RestClientWrapper.get(File.join(CACTUS_URI,URI.escape(name),"smiles")) end - # Get InChIKey + # Get InChI # @return [String] InChI string + def inchi + unless self["inchi"] + result = `echo "#{self.smiles}" | babel -ismi - -oinchi`.chomp + update(:inchi => result.chomp) unless result.empty? + end + self["inchi"] + end + + # Get InChIKey + # @return [String] InChIKey string def inchikey - update(:inchikey => obconversion(inchi,"inchi","inchikey")) unless self["inchikey"] + update(:inchikey => obconversion(smiles,"smi","inchikey")) unless self["inchikey"] self["inchikey"] end # Get (canonical) smiles # @return [String] Smiles string def smiles - update(:smiles => obconversion(inchi,"inchi","smi")) unless self["smiles"] # should give canonical smiles, "can" seems to give incorrect results + update(:smiles => obconversion(self["smiles"],"smi","can")) #unless self["smiles"] # should give canonical smiles, "can" seems to give incorrect results self["smiles"] end @@ -91,7 +105,7 @@ module OpenTox # @return [String] SDF string def sdf if self.sdf_id.nil? - sdf = obconversion(inchi,"inchi","sdf") + sdf = obconversion(smiles,"smi","sdf") file = Mongo::Grid::File.new(sdf, :filename => "#{id}.sdf",:content_type => "chemical/x-mdl-sdfile") sdf_id = $gridfs.insert_one file update :sdf_id => sdf_id @@ -99,17 +113,29 @@ module OpenTox $gridfs.find_one(_id: self.sdf_id).data end + # Get SVG image + # @return [image/svg] Image data + def svg + if self.svg_id.nil? + svg = obconversion(smiles,"smi","svg") + file = Mongo::Grid::File.new(svg, :filename => "#{id}.svg", :content_type => "image/svg") + update(:image_id => $gridfs.insert_one(file)) + end + $gridfs.find_one(_id: self.svg_id).data + + end + # Get png image # @example # image = compound.png # @return [image/png] Image data def png - if self.image_id.nil? - png = obconversion(inchi,"inchi","_png2") + if self.png_id.nil? + png = obconversion(smiles,"smi","_png2") file = Mongo::Grid::File.new(Base64.encode64(png), :filename => "#{id}.png", :content_type => "image/png") - update(:image_id => $gridfs.insert_one(file)) + update(:png_id => $gridfs.insert_one(file)) end - Base64.decode64($gridfs.find_one(_id: self.image_id).data) + Base64.decode64($gridfs.find_one(_id: self.png_id).data) end diff --git a/lib/descriptor.rb b/lib/descriptor.rb index 335f3dc..f0492a2 100644 --- a/lib/descriptor.rb +++ b/lib/descriptor.rb @@ -64,7 +64,7 @@ module OpenTox @count = count obconversion = OpenBabel::OBConversion.new obmol = OpenBabel::OBMol.new - obconversion.set_in_format('inchi') + obconversion.set_in_format('smi') smarts_pattern = OpenBabel::OBSmartsPattern.new smarts_features = [smarts_features] if smarts_features.is_a?(Feature) @smarts = smarts_features.collect{|f| f.smarts} @@ -77,7 +77,7 @@ module OpenTox # which worked with opentox-client # (but no smarts_match) #p "'#{compound.inchi}'" - obconversion.read_string(obmol,compound.inchi) + obconversion.read_string(obmol,compound.smiles) @smarts.each_with_index do |smart,s| smarts_pattern.init(smart) if smarts_pattern.match(obmol) @@ -123,10 +123,10 @@ module OpenTox obdescriptors = descriptors.collect{|d| OpenBabel::OBDescriptor.find_type d} obmol = OpenBabel::OBMol.new obconversion = OpenBabel::OBConversion.new - obconversion.set_in_format 'inchi' + obconversion.set_in_format 'smi' last_feature_idx = @physchem_descriptors.size @compounds.each_with_index do |compound,c| - obconversion.read_string obmol, compound.inchi + obconversion.read_string obmol, compound.smiles obdescriptors.each_with_index do |descriptor,d| @data_entries[c][d+last_feature_idx] = fix_value(descriptor.predict(obmol)) end diff --git a/lib/lazar-model.rb b/lib/lazar-model.rb new file mode 100644 index 0000000..4ca3403 --- /dev/null +++ b/lib/lazar-model.rb @@ -0,0 +1,287 @@ +module OpenTox + + module Model + + class Lazar + include OpenTox + include Mongoid::Document + include Mongoid::Timestamps + store_in collection: "models" + + field :title, type: String + field :endpoint, type: String + field :creator, type: String, default: __FILE__ + # datasets + field :training_dataset_id, type: BSON::ObjectId + # algorithms + field :prediction_algorithm, type: String + field :neighbor_algorithm, type: String + field :neighbor_algorithm_parameters, type: Hash + # prediction feature + field :prediction_feature_id, type: BSON::ObjectId + + attr_accessor :prediction_dataset + attr_accessor :training_dataset + + # Create a lazar model from a training_dataset and a feature_dataset + # @param [OpenTox::Dataset] training_dataset + # @return [OpenTox::Model::Lazar] Regression or classification model + def self.create training_dataset + + bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1 + + # TODO document convention + prediction_feature = training_dataset.features.first + prediction_feature.nominal ? lazar = OpenTox::Model::LazarClassification.new : lazar = OpenTox::Model::LazarRegression.new + lazar.training_dataset_id = training_dataset.id + lazar.prediction_feature_id = prediction_feature.id + lazar.title = prediction_feature.title + + lazar.save + lazar + end + + def predict object + + t = Time.now + at = Time.now + + training_dataset = Dataset.find training_dataset_id + prediction_feature = Feature.find prediction_feature_id + + # parse data + compounds = [] + case object.class.to_s + when "OpenTox::Compound" + compounds = [object] + when "Array" + compounds = object + when "OpenTox::Dataset" + compounds = object.compounds + else + bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter." + end + + # make predictions + predictions = [] + compounds.each_with_index do |compound,c| + t = Time.new + neighbors = Algorithm.run(neighbor_algorithm, compound, neighbor_algorithm_parameters) + # add activities + # TODO: improve efficiency, takes 3 times longer than previous version + # TODO database activity?? + neighbors.collect! do |n| + rows = training_dataset.compound_ids.each_index.select{|i| training_dataset.compound_ids[i] == n.first} + acts = rows.collect{|row| training_dataset.data_entries[row][0]}.compact + acts.empty? ? nil : n << acts + end + neighbors.compact! # remove neighbors without training activities + predictions << Algorithm.run(prediction_algorithm, neighbors) + end + + # serialize result + case object.class.to_s + when "OpenTox::Compound" + return predictions.first + when "Array" + return predictions + when "OpenTox::Dataset" + # prepare prediction dataset + prediction_dataset = LazarPrediction.new( + :title => "Lazar prediction for #{prediction_feature.title}", + :creator => __FILE__, + :prediction_feature_id => prediction_feature.id + + ) + confidence_feature = OpenTox::NumericFeature.find_or_create_by( "title" => "Prediction confidence" ) + # TODO move into warnings field + warning_feature = OpenTox::NominalFeature.find_or_create_by("title" => "Warnings") + prediction_dataset.features = [ prediction_feature, confidence_feature, warning_feature ] + prediction_dataset.compounds = compounds + prediction_dataset.data_entries = predictions + prediction_dataset.save_all + return prediction_dataset + end + + end + + def training_activities + i = training_dataset.feature_ids.index prediction_feature_id + training_dataset.data_entries.collect{|de| de[i]} + end + + end + + class LazarClassification < Lazar + def initialize + super + self.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" + self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity" + self.neighbor_algorithm_parameters = {:min_sim => 0.7} + end + end + + class LazarFminerClassification < LazarClassification + #field :feature_dataset_id, type: BSON::ObjectId + #field :feature_calculation_algorithm, type: String + + def self.create training_dataset + model = super(training_dataset) + model.update "_type" => self.to_s # adjust class + model = self.find model.id # adjust class + model.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fminer_similarity" + model.neighbor_algorithm_parameters = { + :feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.smarts_match", + :feature_dataset_id => Algorithm::Fminer.bbrc(training_dataset).id, + :min_sim => 0.3 + } + model.save + model + end + +=begin + def predict object + + t = Time.now + at = Time.now + + @training_dataset = OpenTox::Dataset.find(training_dataset_id) + @feature_dataset = OpenTox::Dataset.find(feature_dataset_id) + + compounds = [] + case object.class.to_s + when "OpenTox::Compound" + compounds = [object] + when "Array" + compounds = object + when "OpenTox::Dataset" + compounds = object.compounds + else + bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter." + end + + $logger.debug "Setup: #{Time.now-t}" + t = Time.now + + @query_fingerprint = Algorithm.run(feature_calculation_algorithm, compounds, @feature_dataset.features.collect{|f| f.name} ) + + $logger.debug "Query fingerprint calculation: #{Time.now-t}" + t = Time.now + + predictions = [] + prediction_feature = OpenTox::Feature.find prediction_feature_id + tt = 0 + pt = 0 + nt = 0 + st = 0 + nit = 0 + @training_fingerprints ||= @feature_dataset.data_entries + compounds.each_with_index do |compound,c| + t = Time.new + + $logger.debug "predict compound #{c+1}/#{compounds.size} #{compound.inchi}" + + database_activities = @training_dataset.values(compound,prediction_feature) + if database_activities and !database_activities.empty? + database_activities = database_activities.first if database_activities.size == 1 + $logger.debug "Compound #{compound.inchi} occurs in training dataset with activity #{database_activities}" + predictions << {:compound => compound, :value => database_activities, :confidence => "measured"} + next + else + + #training_fingerprints = @feature_dataset.data_entries + query_fingerprint = @query_fingerprint[c] + neighbors = [] + tt += Time.now-t + t = Time.new + + + # find neighbors + @training_fingerprints.each_with_index do |fingerprint, i| + ts = Time.new + sim = Algorithm.run(similarity_algorithm,fingerprint, query_fingerprint) + st += Time.now-ts + ts = Time.new + if sim > self.min_sim + if prediction_algorithm =~ /Regression/ + neighbors << [@feature_dataset.compound_ids[i],sim,training_activities[i], fingerprint] + else + neighbors << [@feature_dataset.compound_ids[i],sim,training_activities[i]] # use compound_ids, instantiation of Compounds is too time consuming + end + end + nit += Time.now-ts + end + + if neighbors.empty? + predictions << {:compound => compound, :value => nil, :confidence => nil, :warning => "No neighbors with similarity > #{min_sim} in dataset #{training_dataset.id}"} + next + end + nt += Time.now-t + t = Time.new + + if prediction_algorithm =~ /Regression/ + prediction = Algorithm.run(prediction_algorithm, neighbors, :min_train_performance => self.min_train_performance) + else + prediction = Algorithm.run(prediction_algorithm, neighbors) + end + prediction[:compound] = compound + prediction[:neighbors] = neighbors.sort{|a,b| b[1] <=> a[1]} # sort with ascending similarities + + + # AM: transform to original space (TODO) + #confidence_value = ((confidence_value+1.0)/2.0).abs if prediction.first and similarity_algorithm =~ /cosine/ + + + $logger.debug "predicted value: #{prediction[:value]}, confidence: #{prediction[:confidence]}" + predictions << prediction + pt += Time.now-t + end + + end + $logger.debug "Transform time: #{tt}" + $logger.debug "Neighbor search time: #{nt} (Similarity calculation: #{st}, Neighbor insert: #{nit})" + $logger.debug "Prediction time: #{pt}" + $logger.debug "Total prediction time: #{Time.now-at}" + + # serialize result + case object.class.to_s + when "OpenTox::Compound" + return predictions.first + when "Array" + return predictions + when "OpenTox::Dataset" + # prepare prediction dataset + prediction_dataset = LazarPrediction.new( + :title => "Lazar prediction for #{prediction_feature.title}", + :creator => __FILE__, + :prediction_feature_id => prediction_feature.id + + ) + confidence_feature = OpenTox::NumericFeature.find_or_create_by( "title" => "Prediction confidence" ) + warning_feature = OpenTox::NominalFeature.find_or_create_by("title" => "Warnings") + prediction_dataset.features = [ prediction_feature, confidence_feature, warning_feature ] + prediction_dataset.compounds = compounds + prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:confidence],p[:warning]]} + prediction_dataset.save_all + return prediction_dataset + end + + end +=end + end + + class LazarRegression < Lazar + + def initialize + super + self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity" + self.prediction_algorithm = "OpenTox::Algorithm::Regression.weighted_average" + self.neighbor_algorithm_parameters = {:min_sim => 0.7} + end + + end + + end + +end + diff --git a/lib/lazar.rb b/lib/lazar.rb index 2e7e7c2..0c5e18b 100644 --- a/lib/lazar.rb +++ b/lib/lazar.rb @@ -58,7 +58,7 @@ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation"]# Algor "algorithm.rb", "descriptor.rb", "bbrc.rb", - "lazar.rb", + "lazar-model.rb", "similarity.rb", "neighbor.rb", "classification.rb", diff --git a/lib/neighbor.rb b/lib/neighbor.rb index a2c28d4..d849cbf 100644 --- a/lib/neighbor.rb +++ b/lib/neighbor.rb @@ -8,7 +8,7 @@ module OpenTox def self.fminer_similarity compound, params feature_dataset = Dataset.find params[:feature_dataset_id] - query_fingerprint = Algorithm::Descriptor.smarts_match(compound, feature_dataset.features.collect{|f| f.smarts} ) + query_fingerprint = Algorithm::Descriptor.smarts_match(compound, feature_dataset.features) neighbors = [] # find neighbors diff --git a/lib/overwrite.rb b/lib/overwrite.rb index 2eb0b39..a27d685 100644 --- a/lib/overwrite.rb +++ b/lib/overwrite.rb @@ -11,6 +11,12 @@ class Object end end +class Numeric + def percent_of(n) + self.to_f / n.to_f * 100.0 + end +end + module Enumerable # @return [Array] only the duplicates of an enumerable def duplicates diff --git a/test/compound.rb b/test/compound.rb index 7bbba58..b45e3d0 100644 --- a/test/compound.rb +++ b/test/compound.rb @@ -4,20 +4,20 @@ class CompoundTest < MiniTest::Test def test_0_compound_from_smiles c = OpenTox::Compound.from_smiles "F[B-](F)(F)F.[Na+]" - assert_equal "InChI=1S/BF4.Na/c2-1(3,4)5;/q-1;+1", c.inchi - assert_equal "[B-](F)(F)(F)F.[Na+]", c.smiles, "A failure here might be caused by a compound webservice running on 64bit architectures using an outdated version of OpenBabel. Please install OpenBabel version 2.3.2 or higher." # seems to be fixed in 2.3.2 + assert_equal "InChI=1S/BF4.Na/c2-1(3,4)5;/q-1;+1", c.inchi.chomp + assert_equal "F[B-](F)(F)F.[Na+]", c.smiles, "A failure here might be caused by a compound webservice running on 64bit architectures using an outdated version of OpenBabel. Please install OpenBabel version 2.3.2 or higher." # seems to be fixed in 2.3.2 end def test_1_compound_from_smiles c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N" assert_equal "InChI=1S/C6H9NO/c1-5(4-7)3-6(2)8/h5H,3H2,1-2H3", c.inchi - assert_equal "CC(CC(=O)C)C#N", c.smiles + assert_equal "CC(C#N)CC(=O)C", c.smiles end def test_2_compound_from_smiles c = OpenTox::Compound.from_smiles "N#[N+]C1=CC=CC=C1.F[B-](F)(F)F" assert_equal "InChI=1S/C6H5N2.BF4/c7-8-6-4-2-1-3-5-6;2-1(3,4)5/h1-5H;/q+1;-1", c.inchi - assert_equal "c1ccc(cc1)[N+]#N.[B-](F)(F)(F)F", c.smiles + assert_equal "F[B-](F)(F)F.N#[N+]c1ccccc1", c.smiles end def test_compound_from_name @@ -54,6 +54,7 @@ class CompoundTest < MiniTest::Test # OpenBabel segfaults randomly during inchikey calculation def test_inchikey c = OpenTox::Compound.from_inchi "InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H" + p c assert_equal "UHOVQNZJYSORNB-UHFFFAOYSA-N", c.inchikey end @@ -87,7 +88,14 @@ class CompoundTest < MiniTest::Test refute_nil c.fp4 end c = d.compounds[371] - assert_equal 19, c.neighbors.size + assert c.neighbors.size >= 19 end + def test_openbabel_segfault + inchi = "InChI=1S/C19H27NO7/c1-11-9-19(12(2)27-19)17(23)26-14-6-8-20(4)7-5-13(15(14)21)10-25-16(22)18(11,3)24/h5,11-12,14,24H,6-10H2,1-4H3/b13-5-/t11-,12-,14-,18-,19?/m1/s1" + + #r = `echo "#{inchi}" | babel -iinchi - -oinchi` + c = Compound.from_inchi(inchi) + assert_nil c + end end diff --git a/test/lazar-fminer.rb b/test/lazar-fminer.rb new file mode 100644 index 0000000..fbfa3d2 --- /dev/null +++ b/test/lazar-fminer.rb @@ -0,0 +1,51 @@ +require_relative "setup.rb" + +class LazarFminerTest < MiniTest::Test + + def test_lazar_fminer + training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv") + model = Model::LazarFminerClassification.create training_dataset#, feature_dataset + feature_dataset = Dataset.find model.neighbor_algorithm_parameters[:feature_dataset_id] + assert_equal training_dataset.compounds.size, feature_dataset.compounds.size + p feature_dataset.features.size + #assert_equal 54, feature_dataset.features.size + feature_dataset.data_entries.each do |e| + assert_equal e.size, feature_dataset.features.size + end + #assert_equal 'C-C-C=C', feature_dataset.features.first.smarts + + [ { + :compound => OpenTox::Compound.from_inchi("InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"), + :prediction => "false", + :confidence => 0.25281385281385277, + :nr_neighbors => 11 + },{ + :compound => OpenTox::Compound.from_smiles("c1ccccc1NN"), + :prediction => "false", + :confidence => 0.3639589577089577, + :nr_neighbors => 14 + }, { + :compound => Compound.from_smiles('OCCCCCCCC\C=C/CCCCCCCC'), + :prediction => "false", + :confidence => 0.5555555555555556, + :nr_neighbors => 1 + }].each do |example| + prediction = model.predict example[:compound] + + p prediction + #assert_equal example[:prediction], prediction[:value] + #assert_equal example[:confidence], prediction[:confidence] + #assert_equal example[:nr_neighbors], prediction[:neighbors].size + end + + # make a dataset prediction + compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini.csv") + prediction = model.predict compound_dataset + assert_equal compound_dataset.compounds, prediction.compounds + + assert_match /No neighbors/, prediction.data_entries[7][2] + assert_equal "measured", prediction.data_entries[14][1] + # cleanup + [training_dataset,model,feature_dataset,compound_dataset].each{|o| o.delete} + end +end diff --git a/test/validation.rb b/test/validation.rb new file mode 100644 index 0000000..d98feb5 --- /dev/null +++ b/test/validation.rb @@ -0,0 +1,41 @@ +require_relative "setup.rb" + +class ValidationTest < MiniTest::Test + + def test_fminer_crossvalidation + dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" + model = Model::LazarFminerClassification.create dataset#, features + cv = ClassificationCrossValidation.create model + p cv.accuracy + p cv.weighted_accuracy + assert cv.accuracy > 0.8 + assert cv.weighted_accuracy > cv.accuracy, "Weighted accuracy (#{cv.weighted_accuracy}) larger than unweighted accuracy(#{cv.accuracy}) " + end + + def test_classification_crossvalidation + dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv" + model = Model::LazarClassification.create dataset#, features + cv = ClassificationCrossValidation.create model + p cv.accuracy + p cv.weighted_accuracy + assert cv.accuracy > 0.7 + assert cv.weighted_accuracy > cv.accuracy, "Weighted accuracy should be larger than unweighted accuracy." + end + + def test_regression_crossvalidation + dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv" + #dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.csv" + model = Model::LazarRegression.create dataset + cv = RegressionCrossValidation.create model + p cv.rmse + p cv.weighted_rmse + p cv.mae + p cv.weighted_mae + `inkview #{cv.plot}` + assert cv.rmse < 30, "RMSE > 30" + assert cv.weighted_rmse < cv.rmse, "Weighted RMSE (#{cv.weighted_rmse}) larger than unweighted RMSE(#{cv.rmse}) " + assert cv.mae < 12 + assert cv.weighted_mae < cv.mae + end + +end -- cgit v1.2.3