From 6ab86c253ba0eb79b9e6a20effa2d18626accf2b Mon Sep 17 00:00:00 2001
From: Christoph Helma <helma@in-silico.ch>
Date: Thu, 13 Aug 2015 11:56:40 +0200
Subject: OpenBabel can (canonical smiles) instead of inchi as internal
 identifier to avoid OpenBabel InChi bug.

---
 lazar.gemspec        |   2 +-
 lib/compound.rb      |  54 +++++++---
 lib/descriptor.rb    |   8 +-
 lib/lazar-model.rb   | 287 +++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/lazar.rb         |   2 +-
 lib/neighbor.rb      |   2 +-
 lib/overwrite.rb     |   6 ++
 test/compound.rb     |  18 +++-
 test/lazar-fminer.rb |  51 +++++++++
 test/validation.rb   |  41 ++++++++
 10 files changed, 445 insertions(+), 26 deletions(-)
 create mode 100644 lib/lazar-model.rb
 create mode 100644 test/lazar-fminer.rb
 create mode 100644 test/validation.rb

diff --git a/lazar.gemspec b/lazar.gemspec
index 3a9a1af..7a90080 100644
--- a/lazar.gemspec
+++ b/lazar.gemspec
@@ -2,7 +2,7 @@
 $:.push File.expand_path("../lib", __FILE__)
 
 Gem::Specification.new do |s|
-  s.name        = "opentox-client"
+  s.name        = "lazar"
   s.version     = File.read("./VERSION").strip
   s.authors     = ["Christoph Helma, Martin Guetlein, Andreas Maunz, Micha Rautenberg, David Vorgrimmler, Denis Gebele"]
   s.email       = ["helma@in-silico.ch"]
diff --git a/lib/compound.rb b/lib/compound.rb
index 3418fcc..5343aa0 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -10,13 +10,13 @@ module OpenTox
     include OpenTox
 
     field :inchi, type: String
-    attr_readonly :inchi
     field :smiles, type: String
     field :inchikey, type: String
     field :names, type: Array
     field :cid, type: String
     field :chemblid, type: String
-    field :image_id, type: BSON::ObjectId
+    field :png_id, type: BSON::ObjectId
+    field :svg_id, type: BSON::ObjectId
     field :sdf_id, type: BSON::ObjectId
     field :fp4, type: Array
     field :fp4_size, type: Integer
@@ -46,14 +46,18 @@ module OpenTox
     # @return [OpenTox::Compound] Compound
     def self.from_smiles smiles
       # do not store smiles because it might be noncanonical
-      Compound.find_or_create_by :inchi => obconversion(smiles,"smi","inchi")
+      Compound.find_or_create_by :smiles => obconversion(smiles,"smi","can")
     end
 
     # Create a compound from inchi string
     # @param inchi [String] smiles InChI string
     # @return [OpenTox::Compound] Compound
     def self.from_inchi inchi
-      Compound.find_or_create_by :inchi => inchi
+      # Temporary workaround for OpenBabels Inchi bug
+      # http://sourceforge.net/p/openbabel/bugs/957/
+      # bug has not been fixed in latest git/development version
+      smiles = `echo "#{inchi}" | babel -iinchi - -ocan`.chomp.strip
+      smiles.empty? ? nil : Compound.find_or_create_by(:smiles => smiles, :inchi => inchi)
     end
 
     # Create a compound from sdf string
@@ -61,7 +65,7 @@ module OpenTox
     # @return [OpenTox::Compound] Compound
     def self.from_sdf sdf
       # do not store sdf because it might be 2D
-      Compound.find_or_create_by :inchi => obconversion(sdf,"sdf","inchi")
+      Compound.find_or_create_by :smiles => obconversion(sdf,"sdf","can")
     end
 
     # Create a compound from name. Relies on an external service for name lookups.
@@ -70,20 +74,30 @@ module OpenTox
     # @param name [String] can be also an InChI/InChiKey, CAS number, etc
     # @return [OpenTox::Compound] Compound
     def self.from_name name
-      Compound.find_or_create_by :inchi => RestClientWrapper.get(File.join(CACTUS_URI,URI.escape(name),"stdinchi"))
+      Compound.find_or_create_by :smiles => RestClientWrapper.get(File.join(CACTUS_URI,URI.escape(name),"smiles"))
     end
 
-    # Get InChIKey
+    # Get InChI
     # @return [String] InChI string
+    def inchi
+      unless self["inchi"]
+        result = `echo "#{self.smiles}" | babel -ismi - -oinchi`.chomp
+        update(:inchi => result.chomp) unless result.empty?
+      end
+      self["inchi"]
+    end
+
+    # Get InChIKey
+    # @return [String] InChIKey string
     def inchikey
-      update(:inchikey => obconversion(inchi,"inchi","inchikey")) unless self["inchikey"]
+      update(:inchikey => obconversion(smiles,"smi","inchikey")) unless self["inchikey"]
       self["inchikey"]
     end
 
     # Get (canonical) smiles
     # @return [String] Smiles string
     def smiles
-      update(:smiles => obconversion(inchi,"inchi","smi")) unless self["smiles"] # should give canonical smiles, "can" seems to give incorrect results
+      update(:smiles => obconversion(self["smiles"],"smi","can")) #unless self["smiles"] # should give canonical smiles, "can" seems to give incorrect results
       self["smiles"]
     end
 
@@ -91,7 +105,7 @@ module OpenTox
     # @return [String] SDF string
     def sdf
       if self.sdf_id.nil? 
-        sdf = obconversion(inchi,"inchi","sdf")
+        sdf = obconversion(smiles,"smi","sdf")
         file = Mongo::Grid::File.new(sdf, :filename => "#{id}.sdf",:content_type => "chemical/x-mdl-sdfile")
         sdf_id = $gridfs.insert_one file
         update :sdf_id => sdf_id
@@ -99,17 +113,29 @@ module OpenTox
       $gridfs.find_one(_id: self.sdf_id).data
     end
 
+    # Get SVG image
+    # @return [image/svg] Image data
+    def svg
+      if self.svg_id.nil?
+       svg = obconversion(smiles,"smi","svg")
+       file = Mongo::Grid::File.new(svg, :filename => "#{id}.svg", :content_type => "image/svg")
+       update(:image_id => $gridfs.insert_one(file))
+      end
+      $gridfs.find_one(_id: self.svg_id).data
+
+    end
+
     # Get png image
     # @example
     #   image = compound.png
     # @return [image/png] Image data
     def png
-      if self.image_id.nil?
-       png = obconversion(inchi,"inchi","_png2")
+      if self.png_id.nil?
+       png = obconversion(smiles,"smi","_png2")
        file = Mongo::Grid::File.new(Base64.encode64(png), :filename => "#{id}.png", :content_type => "image/png")
-       update(:image_id => $gridfs.insert_one(file))
+       update(:png_id => $gridfs.insert_one(file))
       end
-      Base64.decode64($gridfs.find_one(_id: self.image_id).data)
+      Base64.decode64($gridfs.find_one(_id: self.png_id).data)
 
     end
 
diff --git a/lib/descriptor.rb b/lib/descriptor.rb
index 335f3dc..f0492a2 100644
--- a/lib/descriptor.rb
+++ b/lib/descriptor.rb
@@ -64,7 +64,7 @@ module OpenTox
         @count = count
         obconversion = OpenBabel::OBConversion.new
         obmol = OpenBabel::OBMol.new
-        obconversion.set_in_format('inchi')
+        obconversion.set_in_format('smi')
         smarts_pattern = OpenBabel::OBSmartsPattern.new
         smarts_features = [smarts_features] if smarts_features.is_a?(Feature)
         @smarts = smarts_features.collect{|f| f.smarts}
@@ -77,7 +77,7 @@ module OpenTox
           # which worked with opentox-client
           # (but no smarts_match)
           #p "'#{compound.inchi}'"
-          obconversion.read_string(obmol,compound.inchi)
+          obconversion.read_string(obmol,compound.smiles)
           @smarts.each_with_index do |smart,s|
             smarts_pattern.init(smart)
             if smarts_pattern.match(obmol)
@@ -123,10 +123,10 @@ module OpenTox
         obdescriptors = descriptors.collect{|d| OpenBabel::OBDescriptor.find_type d}
         obmol = OpenBabel::OBMol.new
         obconversion = OpenBabel::OBConversion.new
-        obconversion.set_in_format 'inchi'
+        obconversion.set_in_format 'smi'
         last_feature_idx = @physchem_descriptors.size
         @compounds.each_with_index do |compound,c|
-          obconversion.read_string obmol, compound.inchi
+          obconversion.read_string obmol, compound.smiles
           obdescriptors.each_with_index do |descriptor,d|
             @data_entries[c][d+last_feature_idx] = fix_value(descriptor.predict(obmol))
           end
diff --git a/lib/lazar-model.rb b/lib/lazar-model.rb
new file mode 100644
index 0000000..4ca3403
--- /dev/null
+++ b/lib/lazar-model.rb
@@ -0,0 +1,287 @@
+module OpenTox
+
+  module Model
+
+    class Lazar 
+      include OpenTox
+      include Mongoid::Document
+      include Mongoid::Timestamps
+      store_in collection: "models"
+
+      field :title, type: String
+      field :endpoint, type: String
+      field :creator, type: String, default: __FILE__
+      # datasets
+      field :training_dataset_id, type: BSON::ObjectId
+      # algorithms
+      field :prediction_algorithm, type: String
+      field :neighbor_algorithm, type: String
+      field :neighbor_algorithm_parameters, type: Hash
+      # prediction feature
+      field :prediction_feature_id, type: BSON::ObjectId
+
+      attr_accessor :prediction_dataset
+      attr_accessor :training_dataset
+
+      # Create a lazar model from a training_dataset and a feature_dataset
+      # @param [OpenTox::Dataset] training_dataset
+      # @return [OpenTox::Model::Lazar] Regression or classification model
+      def self.create training_dataset
+
+        bad_request_error "More than one prediction feature found in training_dataset #{training_dataset.id}" unless training_dataset.features.size == 1
+
+        # TODO document convention
+        prediction_feature = training_dataset.features.first
+        prediction_feature.nominal ?  lazar = OpenTox::Model::LazarClassification.new : lazar = OpenTox::Model::LazarRegression.new
+        lazar.training_dataset_id = training_dataset.id
+        lazar.prediction_feature_id = prediction_feature.id
+        lazar.title = prediction_feature.title 
+
+        lazar.save
+        lazar
+      end
+
+      def predict object
+
+        t = Time.now
+        at = Time.now
+
+        training_dataset = Dataset.find training_dataset_id
+        prediction_feature = Feature.find prediction_feature_id
+
+        # parse data
+        compounds = []
+        case object.class.to_s
+        when "OpenTox::Compound"
+          compounds = [object] 
+        when "Array"
+          compounds = object
+        when "OpenTox::Dataset"
+          compounds = object.compounds
+        else 
+          bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter."
+        end
+
+        # make predictions
+        predictions = []
+        compounds.each_with_index do |compound,c|
+          t = Time.new
+          neighbors = Algorithm.run(neighbor_algorithm, compound, neighbor_algorithm_parameters)
+          # add activities
+          # TODO: improve efficiency, takes 3 times longer than previous version
+          # TODO database activity??
+          neighbors.collect! do |n|
+            rows = training_dataset.compound_ids.each_index.select{|i| training_dataset.compound_ids[i] == n.first}
+            acts = rows.collect{|row| training_dataset.data_entries[row][0]}.compact
+            acts.empty? ? nil : n << acts
+          end
+          neighbors.compact! # remove neighbors without training activities
+          predictions << Algorithm.run(prediction_algorithm, neighbors)
+        end 
+
+        # serialize result
+        case object.class.to_s
+        when "OpenTox::Compound"
+          return predictions.first
+        when "Array"
+          return predictions
+        when "OpenTox::Dataset"
+          # prepare prediction dataset
+          prediction_dataset = LazarPrediction.new(
+            :title => "Lazar prediction for #{prediction_feature.title}",
+            :creator =>  __FILE__,
+            :prediction_feature_id => prediction_feature.id
+
+          )
+          confidence_feature = OpenTox::NumericFeature.find_or_create_by( "title" => "Prediction confidence" )
+          # TODO move into warnings field
+          warning_feature = OpenTox::NominalFeature.find_or_create_by("title" => "Warnings")
+          prediction_dataset.features = [ prediction_feature, confidence_feature, warning_feature ]
+          prediction_dataset.compounds = compounds
+          prediction_dataset.data_entries = predictions
+          prediction_dataset.save_all
+          return prediction_dataset
+        end
+
+      end
+      
+      def training_activities
+        i = training_dataset.feature_ids.index prediction_feature_id
+        training_dataset.data_entries.collect{|de| de[i]}
+      end
+
+    end
+
+    class LazarClassification < Lazar
+      def initialize
+        super
+        self.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote"
+        self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity"
+        self.neighbor_algorithm_parameters = {:min_sim => 0.7}
+      end
+    end
+
+    class LazarFminerClassification < LazarClassification
+      #field :feature_dataset_id, type: BSON::ObjectId
+      #field :feature_calculation_algorithm, type: String
+
+      def self.create training_dataset
+        model = super(training_dataset)
+        model.update "_type" => self.to_s # adjust class
+        model = self.find model.id # adjust class
+        model.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fminer_similarity"
+        model.neighbor_algorithm_parameters = {
+          :feature_calculation_algorithm => "OpenTox::Algorithm::Descriptor.smarts_match",
+          :feature_dataset_id => Algorithm::Fminer.bbrc(training_dataset).id,
+          :min_sim => 0.3
+        }
+        model.save
+        model
+      end
+
+=begin
+      def predict object
+
+        t = Time.now
+        at = Time.now
+
+        @training_dataset = OpenTox::Dataset.find(training_dataset_id)
+        @feature_dataset = OpenTox::Dataset.find(feature_dataset_id)
+
+        compounds = []
+        case object.class.to_s
+        when "OpenTox::Compound"
+          compounds = [object] 
+        when "Array"
+          compounds = object
+        when "OpenTox::Dataset"
+          compounds = object.compounds
+        else 
+          bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter."
+        end
+
+        $logger.debug "Setup: #{Time.now-t}"
+        t = Time.now
+
+        @query_fingerprint = Algorithm.run(feature_calculation_algorithm, compounds, @feature_dataset.features.collect{|f| f.name} )
+
+        $logger.debug "Query fingerprint calculation: #{Time.now-t}"
+        t = Time.now
+
+        predictions = []
+        prediction_feature = OpenTox::Feature.find prediction_feature_id
+        tt = 0
+        pt = 0
+        nt = 0
+        st = 0
+        nit = 0
+        @training_fingerprints ||= @feature_dataset.data_entries
+        compounds.each_with_index do |compound,c|
+          t = Time.new
+
+          $logger.debug "predict compound #{c+1}/#{compounds.size} #{compound.inchi}"
+
+          database_activities = @training_dataset.values(compound,prediction_feature)
+          if database_activities and !database_activities.empty?
+            database_activities = database_activities.first if database_activities.size == 1
+            $logger.debug "Compound #{compound.inchi} occurs in training dataset with activity #{database_activities}"
+            predictions << {:compound => compound, :value => database_activities, :confidence => "measured"}
+            next
+          else
+
+            #training_fingerprints = @feature_dataset.data_entries
+            query_fingerprint = @query_fingerprint[c]
+            neighbors = []
+            tt += Time.now-t
+            t = Time.new
+            
+
+            # find neighbors
+            @training_fingerprints.each_with_index do |fingerprint, i|
+              ts = Time.new
+              sim = Algorithm.run(similarity_algorithm,fingerprint, query_fingerprint)
+              st += Time.now-ts
+              ts = Time.new
+              if sim > self.min_sim
+                if prediction_algorithm =~ /Regression/
+                  neighbors << [@feature_dataset.compound_ids[i],sim,training_activities[i], fingerprint]
+                else
+                  neighbors << [@feature_dataset.compound_ids[i],sim,training_activities[i]] # use compound_ids, instantiation of Compounds is too time consuming
+                end
+              end
+              nit += Time.now-ts
+            end
+
+            if neighbors.empty?
+              predictions << {:compound => compound, :value => nil, :confidence => nil, :warning => "No neighbors with similarity > #{min_sim} in dataset #{training_dataset.id}"}
+              next
+            end
+            nt += Time.now-t
+            t = Time.new
+
+            if prediction_algorithm =~ /Regression/
+              prediction = Algorithm.run(prediction_algorithm, neighbors, :min_train_performance => self.min_train_performance)
+            else
+              prediction = Algorithm.run(prediction_algorithm, neighbors)
+            end
+            prediction[:compound] = compound
+            prediction[:neighbors] = neighbors.sort{|a,b| b[1] <=> a[1]} # sort with ascending similarities
+
+
+            # AM: transform to original space (TODO)
+            #confidence_value = ((confidence_value+1.0)/2.0).abs if prediction.first and similarity_algorithm =~ /cosine/
+
+
+            $logger.debug "predicted value: #{prediction[:value]}, confidence: #{prediction[:confidence]}"
+            predictions << prediction
+            pt += Time.now-t
+          end
+
+        end 
+        $logger.debug "Transform time: #{tt}"
+        $logger.debug "Neighbor search time: #{nt} (Similarity calculation: #{st}, Neighbor insert: #{nit})"
+        $logger.debug "Prediction time: #{pt}"
+        $logger.debug "Total prediction time: #{Time.now-at}"
+
+        # serialize result
+        case object.class.to_s
+        when "OpenTox::Compound"
+          return predictions.first
+        when "Array"
+          return predictions
+        when "OpenTox::Dataset"
+          # prepare prediction dataset
+          prediction_dataset = LazarPrediction.new(
+            :title => "Lazar prediction for #{prediction_feature.title}",
+            :creator =>  __FILE__,
+            :prediction_feature_id => prediction_feature.id
+
+          )
+          confidence_feature = OpenTox::NumericFeature.find_or_create_by( "title" => "Prediction confidence" )
+          warning_feature = OpenTox::NominalFeature.find_or_create_by("title" => "Warnings")
+          prediction_dataset.features = [ prediction_feature, confidence_feature, warning_feature ]
+          prediction_dataset.compounds = compounds
+          prediction_dataset.data_entries = predictions.collect{|p| [p[:value], p[:confidence],p[:warning]]}
+          prediction_dataset.save_all
+          return prediction_dataset
+        end
+
+      end
+=end
+    end
+
+    class LazarRegression < Lazar
+
+      def initialize
+        super
+        self.neighbor_algorithm = "OpenTox::Algorithm::Neighbor.fingerprint_similarity"
+        self.prediction_algorithm = "OpenTox::Algorithm::Regression.weighted_average" 
+        self.neighbor_algorithm_parameters = {:min_sim => 0.7}
+      end
+
+    end
+
+  end
+
+end
+
diff --git a/lib/lazar.rb b/lib/lazar.rb
index 2e7e7c2..0c5e18b 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -58,7 +58,7 @@ CLASSES = ["Feature","Compound","Dataset","Validation","CrossValidation"]# Algor
   "algorithm.rb",
   "descriptor.rb",
   "bbrc.rb",
-  "lazar.rb",
+  "lazar-model.rb",
   "similarity.rb",
   "neighbor.rb",
   "classification.rb",
diff --git a/lib/neighbor.rb b/lib/neighbor.rb
index a2c28d4..d849cbf 100644
--- a/lib/neighbor.rb
+++ b/lib/neighbor.rb
@@ -8,7 +8,7 @@ module OpenTox
 
       def self.fminer_similarity compound, params
         feature_dataset = Dataset.find params[:feature_dataset_id]
-        query_fingerprint = Algorithm::Descriptor.smarts_match(compound, feature_dataset.features.collect{|f| f.smarts} )
+        query_fingerprint = Algorithm::Descriptor.smarts_match(compound, feature_dataset.features)
         neighbors = []
 
         # find neighbors
diff --git a/lib/overwrite.rb b/lib/overwrite.rb
index 2eb0b39..a27d685 100644
--- a/lib/overwrite.rb
+++ b/lib/overwrite.rb
@@ -11,6 +11,12 @@ class Object
   end
 end
 
+class Numeric
+  def percent_of(n)
+    self.to_f / n.to_f * 100.0
+  end
+end
+
 module Enumerable
   # @return [Array] only the duplicates of an enumerable
   def duplicates
diff --git a/test/compound.rb b/test/compound.rb
index 7bbba58..b45e3d0 100644
--- a/test/compound.rb
+++ b/test/compound.rb
@@ -4,20 +4,20 @@ class CompoundTest < MiniTest::Test
 
   def test_0_compound_from_smiles
     c = OpenTox::Compound.from_smiles "F[B-](F)(F)F.[Na+]"
-    assert_equal "InChI=1S/BF4.Na/c2-1(3,4)5;/q-1;+1", c.inchi
-    assert_equal "[B-](F)(F)(F)F.[Na+]", c.smiles, "A failure here might be caused by a compound webservice running on 64bit architectures using an outdated version of OpenBabel. Please install OpenBabel version 2.3.2 or higher." # seems to be fixed in 2.3.2
+    assert_equal "InChI=1S/BF4.Na/c2-1(3,4)5;/q-1;+1", c.inchi.chomp
+    assert_equal "F[B-](F)(F)F.[Na+]", c.smiles, "A failure here might be caused by a compound webservice running on 64bit architectures using an outdated version of OpenBabel. Please install OpenBabel version 2.3.2 or higher." # seems to be fixed in 2.3.2
   end
 
   def test_1_compound_from_smiles
     c = OpenTox::Compound.from_smiles "CC(=O)CC(C)C#N"
     assert_equal "InChI=1S/C6H9NO/c1-5(4-7)3-6(2)8/h5H,3H2,1-2H3", c.inchi
-    assert_equal "CC(CC(=O)C)C#N", c.smiles
+    assert_equal "CC(C#N)CC(=O)C", c.smiles
   end
 
   def test_2_compound_from_smiles
     c = OpenTox::Compound.from_smiles "N#[N+]C1=CC=CC=C1.F[B-](F)(F)F"
     assert_equal "InChI=1S/C6H5N2.BF4/c7-8-6-4-2-1-3-5-6;2-1(3,4)5/h1-5H;/q+1;-1", c.inchi
-    assert_equal "c1ccc(cc1)[N+]#N.[B-](F)(F)(F)F", c.smiles
+    assert_equal "F[B-](F)(F)F.N#[N+]c1ccccc1", c.smiles
   end
 
   def test_compound_from_name
@@ -54,6 +54,7 @@ class CompoundTest < MiniTest::Test
   # OpenBabel segfaults randomly during inchikey calculation
   def test_inchikey
     c = OpenTox::Compound.from_inchi "InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"
+    p c
     assert_equal "UHOVQNZJYSORNB-UHFFFAOYSA-N", c.inchikey
   end
 
@@ -87,7 +88,14 @@ class CompoundTest < MiniTest::Test
       refute_nil c.fp4
     end
     c = d.compounds[371]
-    assert_equal 19, c.neighbors.size
+    assert c.neighbors.size >= 19
   end
 
+  def test_openbabel_segfault
+    inchi = "InChI=1S/C19H27NO7/c1-11-9-19(12(2)27-19)17(23)26-14-6-8-20(4)7-5-13(15(14)21)10-25-16(22)18(11,3)24/h5,11-12,14,24H,6-10H2,1-4H3/b13-5-/t11-,12-,14-,18-,19?/m1/s1"
+
+    #r = `echo "#{inchi}" | babel -iinchi - -oinchi`
+    c = Compound.from_inchi(inchi)
+    assert_nil c
+  end
 end
diff --git a/test/lazar-fminer.rb b/test/lazar-fminer.rb
new file mode 100644
index 0000000..fbfa3d2
--- /dev/null
+++ b/test/lazar-fminer.rb
@@ -0,0 +1,51 @@
+require_relative "setup.rb"
+
+class LazarFminerTest < MiniTest::Test
+
+  def test_lazar_fminer
+    training_dataset = Dataset.from_csv_file File.join(DATA_DIR,"hamster_carcinogenicity.csv")
+    model = Model::LazarFminerClassification.create training_dataset#, feature_dataset
+    feature_dataset = Dataset.find model.neighbor_algorithm_parameters[:feature_dataset_id]
+    assert_equal training_dataset.compounds.size, feature_dataset.compounds.size
+    p feature_dataset.features.size
+    #assert_equal 54, feature_dataset.features.size
+    feature_dataset.data_entries.each do |e|
+      assert_equal e.size, feature_dataset.features.size
+    end
+    #assert_equal 'C-C-C=C', feature_dataset.features.first.smarts
+
+    [ {
+      :compound => OpenTox::Compound.from_inchi("InChI=1S/C6H6/c1-2-4-6-5-3-1/h1-6H"),
+      :prediction => "false",
+      :confidence => 0.25281385281385277,
+      :nr_neighbors => 11
+    },{
+      :compound => OpenTox::Compound.from_smiles("c1ccccc1NN"),
+      :prediction => "false",
+      :confidence => 0.3639589577089577,
+      :nr_neighbors => 14
+    }, {
+      :compound => Compound.from_smiles('OCCCCCCCC\C=C/CCCCCCCC'),
+      :prediction => "false",
+      :confidence => 0.5555555555555556,
+      :nr_neighbors => 1
+    }].each do |example|
+      prediction = model.predict example[:compound]
+
+      p prediction
+      #assert_equal example[:prediction], prediction[:value]
+      #assert_equal example[:confidence], prediction[:confidence]
+      #assert_equal example[:nr_neighbors], prediction[:neighbors].size
+    end
+
+    # make a dataset prediction
+    compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini.csv")
+    prediction = model.predict compound_dataset
+    assert_equal compound_dataset.compounds, prediction.compounds
+
+    assert_match /No neighbors/, prediction.data_entries[7][2]
+    assert_equal "measured", prediction.data_entries[14][1]
+    # cleanup
+    [training_dataset,model,feature_dataset,compound_dataset].each{|o| o.delete}
+  end
+end
diff --git a/test/validation.rb b/test/validation.rb
new file mode 100644
index 0000000..d98feb5
--- /dev/null
+++ b/test/validation.rb
@@ -0,0 +1,41 @@
+require_relative "setup.rb"
+
+class ValidationTest < MiniTest::Test
+
+  def test_fminer_crossvalidation
+    dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
+    model = Model::LazarFminerClassification.create dataset#, features
+    cv = ClassificationCrossValidation.create model
+    p cv.accuracy
+    p cv.weighted_accuracy
+    assert cv.accuracy > 0.8
+    assert cv.weighted_accuracy > cv.accuracy, "Weighted accuracy (#{cv.weighted_accuracy}) larger than unweighted accuracy(#{cv.accuracy}) "
+  end
+
+  def test_classification_crossvalidation
+    dataset = Dataset.from_csv_file "#{DATA_DIR}/hamster_carcinogenicity.csv"
+    model = Model::LazarClassification.create dataset#, features
+    cv = ClassificationCrossValidation.create model
+    p cv.accuracy
+    p cv.weighted_accuracy
+    assert cv.accuracy > 0.7
+    assert cv.weighted_accuracy > cv.accuracy, "Weighted accuracy should be larger than unweighted accuracy."
+  end
+
+  def test_regression_crossvalidation
+    dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.medi.csv"
+    #dataset = Dataset.from_csv_file "#{DATA_DIR}/EPAFHM.csv"
+    model = Model::LazarRegression.create dataset
+    cv = RegressionCrossValidation.create model
+    p cv.rmse 
+    p cv.weighted_rmse
+    p cv.mae
+    p cv.weighted_mae
+    `inkview #{cv.plot}`
+    assert cv.rmse < 30, "RMSE > 30"
+    assert cv.weighted_rmse < cv.rmse, "Weighted RMSE (#{cv.weighted_rmse}) larger than unweighted RMSE(#{cv.rmse}) "
+    assert cv.mae < 12
+    assert cv.weighted_mae < cv.mae
+  end
+
+end
-- 
cgit v1.2.3