Fminer and Lazar tests running

author: Andreas Maunz <andreas@maunz.de> 2012-10-05 15:47:48 +0200
committer: Andreas Maunz <andreas@maunz.de> 2012-10-05 15:47:48 +0200
commit: bc6118dae334171cbaff4a61d4ae124052f74a59 (patch)
tree: 5907bd86b43b5f918599eeab18f9a9b939de3c4a
parent: 7123507b86139a002c0425357e515975c4796394 (diff)
6 files changed, 127 insertions, 130 deletions
diff --git a/lib/algorithm.rb b/lib/algorithm.rb
index 78fc447..72a87cf 100644
--- a/lib/algorithm.rb
+++ b/lib/algorithm.rb
@@ -103,22 +103,7 @@ module OpenTox
 
       def add_fminer_data(fminer_instance, value_map)
 
-
-        # detect nr duplicates per compound
-        compound_sizes = {}
-        @training_dataset.compounds.each do |compound|
-          entries=@training_dataset.data_entries[compound]
-          entries.each do |feature, values|
-            compound_sizes[compound] || compound_sizes[compound] = []
-            compound_sizes[compound] << values.size unless values.size == 0
-          end
-          compound_sizes[compound].uniq!
-          raise "Inappropriate data for fminer" if compound_sizes[compound].size > 1
-          compound_sizes[compound] = compound_sizes[compound][0] # integer instead of array
-        end
-
         id = 1 # fminer start id is not 0
-
         @training_dataset.compounds.each do |compound|
           entry=@training_dataset.data_entries[compound]
           begin
@@ -132,17 +117,17 @@ module OpenTox
             next
           end
 
-          entry.each do |feature,values|
+          entry && entry.each do |feature,values|
             if feature == @prediction_feature.uri
-              (0...compound_sizes[compound]).each { |i|
-                if values[i].nil? 
+              values.each do |value|
+                if value.nil? 
                   LOGGER.warn "No #{feature} activity for #{compound.to_s}."
                 else
                   if @prediction_feature.feature_type == "classification"
-                    activity= value_map.invert[values[i]].to_i # activities are mapped to 1..n
+                    activity= value_map.invert[value].to_i # activities are mapped to 1..n
                     @db_class_sizes[activity-1].nil? ? @db_class_sizes[activity-1]=1 : @db_class_sizes[activity-1]+=1 # AM effect
                   elsif @prediction_feature.feature_type == "regression"
-                    activity= values[i].to_f 
+                    activity= value.to_f 
                   end
                   begin
                     fminer_instance.AddCompound(smiles,id) if fminer_instance
@@ -152,11 +137,11 @@ module OpenTox
                     @smi[id] = smiles
                     id += 1
                   rescue Exception => e
-                    LOGGER.warn "Could not add " + smiles + "\t" + values[i].to_s + " to fminer"
+                    LOGGER.warn "Could not add " + smiles + "\t" + value.to_s + " to fminer"
                     LOGGER.warn e.backtrace
                   end
                 end
-              }
+              end
             end
           end
         end
diff --git a/lib/dataset.rb b/lib/dataset.rb
index c916722..3c5fa7f 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -316,13 +316,24 @@ module OpenTox
     # @param [String] feature Compound URI
     # @param [Boolean,Float] value Feature value
     def add (compound,feature,value)
-      @compounds << compound unless @compounds.include? compound
-      @features[feature] = {}  unless @features[feature]
-      @data_entries[compound] = {} unless @data_entries[compound]
-      @data_entries[compound][feature] = [] unless @data_entries[compound][feature]
-      @data_entries[compound][feature] << value if value!=nil
+      self.add_compound(compound)
+      self.add_data_entry(compound,feature,value)
     end
 
+    # Insert a data entry; ignored unless the compound was added to the dataset beforehand
+    # @param [String] compound Compound URI
+    # @param [String] feature Feature URI
+    # @param [Boolean,Float] value Feature value
+    def add_data_entry (compound,feature,value)
+      if @compounds.include? compound
+        @features[feature] = {}  unless @features[feature]
+        @data_entries[compound] = {} unless @data_entries[compound]
+        @data_entries[compound][feature] = [] unless @data_entries[compound][feature]
+        @data_entries[compound][feature] << value
+      end
+    end
+
+
     # Add/modify metadata, existing entries will be overwritten
     # @example
     #   dataset.add_metadata({DC.title => "any_title", DC.creator => "my_email"})
@@ -361,7 +372,8 @@ module OpenTox
     # Add a new compound
     # @param [String] compound Compound URI
     def add_compound (compound)
-      @compounds << compound unless @compounds.include? compound
+      @compounds << compound
+      #@compounds << compound unless @compounds.include? compound
     end
     
     # Creates a new dataset, by splitting the current dataset, i.e. using only a subset of compounds and features
@@ -443,8 +455,6 @@ module OpenTox
     # - overwrites dataset if uri exists
     # @return [String] Dataset URI
     def save(subjectid=nil)
-      # TODO: rewrite feature URI's ??
-      @compounds.uniq!
       if @uri
         if (CONFIG[:json_hosts].include?(URI.parse(@uri).host))
           #LOGGER.debug self.to_json
diff --git a/lib/model.rb b/lib/model.rb
index 77b0274..065b227 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -261,7 +261,7 @@ module OpenTox
                                     :lib => self.parameter(\"lib\"),
                                     :subjectid => subjectid
                                     })")
-          
+
           # Adding fingerprint of query compound with features and values(p_value*nr_hits)
           @compound_fingerprints = {}
           @compound_features.each do |feature, value| # value is nil if "Substructure.match"
@@ -279,12 +279,14 @@ module OpenTox
           mtf.transform
 
           # Make a prediction
-          prediction = eval("#{@prediction_algorithm}( { :props => mtf.props,
-                                                          :acts => mtf.acts,
-                                                          :sims => mtf.sims,
-                                                          :value_map => @value_map,
-                                                          :min_train_performance => self.parameter(\"min_train_performance\")
-                                                        } ) ")
+          modul, algorthm = @prediction_algorithm.split('.')
+          pred_params = { :props => mtf.props,
+                          :acts => mtf.acts,
+                          :sims => mtf.sims,
+                          :value_map => @value_map,
+                          :min_train_performance => self.parameter("min_train_performance")
+                         }
+          prediction = eval(modul).send(algorthm, pred_params) 
 
           value_feature_uri = File.join( @uri, "predicted", "value")
           confidence_feature_uri = File.join( @uri, "predicted", "confidence")
@@ -292,16 +294,17 @@ module OpenTox
           @prediction_dataset.metadata[OT.dependentVariables] = @metadata[OT.dependentVariables] unless @prediction_dataset.metadata[OT.dependentVariables] 
           @prediction_dataset.metadata[OT.predictedVariables] = [value_feature_uri, confidence_feature_uri] unless @prediction_dataset.metadata[OT.predictedVariables] 
 
+          @prediction_dataset.add_compound @compound.uri
           if OpenTox::Feature.find(metadata[OT.dependentVariables], subjectid).feature_type == "classification"
-            @prediction_dataset.add @compound.uri, value_feature_uri, @value_map[prediction[:prediction].to_s]
+            @prediction_dataset.add_data_entry @compound.uri, value_feature_uri, @value_map[prediction[:prediction].to_s]
           else
-            @prediction_dataset.add @compound.uri, value_feature_uri, prediction[:prediction]
+            @prediction_dataset.add_data_entry @compound.uri, value_feature_uri, prediction[:prediction]
           end
           confidence=prediction[:confidence]
           if @similarity_algorithm.to_s =~ /cosine/
             confidence=((confidence+1.0)/2.0).abs
           end
-          @prediction_dataset.add @compound.uri, confidence_feature_uri, confidence
+          @prediction_dataset.add_data_entry @compound.uri, confidence_feature_uri, confidence
 
           @prediction_dataset.features[value_feature_uri][DC.title] = @prediction_dataset.metadata[DC.title]
           @prediction_dataset.features[confidence_feature_uri][DC.title] = "Confidence"
@@ -318,7 +321,7 @@ module OpenTox
                   OT.pValue => @p_values[feature],
                   OT.effect => @effects[feature]
                 })
-                @prediction_dataset.add @compound.uri, feature_uri, true
+                @prediction_dataset.add_data_entry @compound.uri, feature_uri, true
                 f+=1
               end
             elsif @feature_calculation_algorithm == "Substructure.lookup"
@@ -328,13 +331,13 @@ module OpenTox
                 @prediction_dataset.add_feature(feature, {
                   RDF.type => [OT.NumericFeature]
                 })
-                @prediction_dataset.add @compound.uri, feature, value
+                @prediction_dataset.add_data_entry @compound.uri, feature, value
                 f+=1
               end
             else
               @compound_features.each do |feature|
                 features[feature] = feature
-                @prediction_dataset.add @compound.uri, feature, true
+                @prediction_dataset.add_data_entry @compound.uri, feature, true
               end
             end
             n = 0
@@ -346,8 +349,10 @@ module OpenTox
                 OT.measuredActivity => neighbor[:activity],
                 RDF.type => [OT.Neighbor]
               })
-              @prediction_dataset.add @compound.uri, neighbor_uri, true
+              @prediction_dataset.add_data_entry @compound.uri, neighbor_uri, true
               f = 0 unless f
+
+              @prediction_dataset.add_compound neighbor[:compound]
               neighbor[:features].each do |feature|
                 if @feature_calculation_algorithm == "Substructure.match"
                   feature_uri = File.join( @prediction_dataset.uri, "feature", "descriptor", f.to_s) unless feature_uri = features[feature]
@@ -355,9 +360,9 @@ module OpenTox
                   feature_uri = feature
                 end
                 if @feature_calculation_algorithm == "Substructure.lookup"
-                  @prediction_dataset.add neighbor[:compound], feature_uri, @fingerprints[neighbor[:compound]][feature_uri]
+                  @prediction_dataset.add_data_entry neighbor[:compound], feature_uri, @fingerprints[neighbor[:compound]][feature_uri]
                 else
-                  @prediction_dataset.add neighbor[:compound], feature_uri, true
+                  @prediction_dataset.add_data_entry neighbor[:compound], feature_uri, true
                 end
 
                 unless features.has_key? feature
@@ -390,10 +395,11 @@ module OpenTox
       # @return [Boolean] true if compound has databasse activities, false if not
       def database_activity(subjectid)
         if @activities[@compound.uri]
+          @prediction_dataset.add_compound @compound.uri
           if OpenTox::Feature.find(metadata[OT.dependentVariables], subjectid).feature_type == "classification"
-            @activities[@compound.uri].each { |act| @prediction_dataset.add @compound.uri, @metadata[OT.dependentVariables], @value_map[act.to_s] }
+            @activities[@compound.uri].each { |act| @prediction_dataset.add_data_entry @compound.uri, @metadata[OT.dependentVariables], @value_map[act.to_s] }
           else
-            @activities[@compound.uri].each { |act| @prediction_dataset.add @compound.uri, @metadata[OT.dependentVariables], act }
+            @activities[@compound.uri].each { |act| @prediction_dataset.add_data_entry @compound.uri, @metadata[OT.dependentVariables], act }
           end
           @prediction_dataset.add_metadata(OT.hasSource => @metadata[OT.trainingDataset])
           @prediction_dataset.save(subjectid)
diff --git a/lib/parser.rb b/lib/parser.rb
index 90f0570..aa1f669 100644
--- a/lib/parser.rb
+++ b/lib/parser.rb
@@ -294,7 +294,7 @@ module OpenTox
         @id_errors = []
         @activity_errors = []
         @duplicates = {}
-        @max_class_values = 3
+        @max_class_values = 5
       end
 
       def detect_new_values(row, value_maps)
@@ -475,6 +475,7 @@ module OpenTox
         end
         @duplicates[compound.inchi] = [] unless @duplicates[compound.inchi]
         @duplicates[compound.inchi] << id+", "+row.join(", ")
+        @dataset.add_compound(compound.uri)
 
         feature_idx = 0
         row.each_index do |i|
@@ -502,12 +503,10 @@ module OpenTox
 
             feature_idx += 1
   
-            if val != nil 
-              @dataset.add(compound.uri, feature, val)
-              if @feature_types[feature].include? OT.NominalFeature
-                @dataset.features[feature][OT.acceptValue] = [] unless @dataset.features[feature][OT.acceptValue]
-                @dataset.features[feature][OT.acceptValue] << val unless @dataset.features[feature][OT.acceptValue].include?(val)
-              end
+            @dataset.add_data_entry(compound.uri, feature, val)
+            if @feature_types[feature].include? OT.NominalFeature
+              @dataset.features[feature][OT.acceptValue] = [] unless @dataset.features[feature][OT.acceptValue]
+              @dataset.features[feature][OT.acceptValue] << val unless (@dataset.features[feature][OT.acceptValue].include?(val) or val.nil?)
             end
 
           end
@@ -539,7 +538,7 @@ module OpenTox
       def initialize
         @data = {}
         @activity_errors = []
-        @max_class_values = 3
+        @max_class_values = 5
       end
 
       def feature_values(feature)
diff --git a/lib/serializer.rb b/lib/serializer.rb
index c0bb60d..8d41f05 100644
--- a/lib/serializer.rb
+++ b/lib/serializer.rb
@@ -462,17 +462,6 @@ module OpenTox
 
         features = dataset.features.keys
 
-        # remove missing features
-        delete_features = []
-        features.each{ |id|
-          dataset.features[id][RDF.type].each { |typestr|
-            if typestr.include? "MissingFeature"
-              delete_features << id 
-            end
-          }
-        }
-        features = features - delete_features
-
         # sort features
         features.sort!
 
@@ -486,60 +475,56 @@ module OpenTox
         compound_sizes = {}
         dataset.compounds.each do |compound|
           entries=dataset.data_entries[compound]
-          if entries
-            entries.each do |feature, values|
-              compound_sizes[compound] || compound_sizes[compound] = []
-              compound_sizes[compound] << values.size
-            end
-            compound_sizes[compound].uniq!
-            raise "Inappropriate data for CSV export for compound #{compound}" if compound_sizes[compound].size > 1
-            compound_sizes[compound] = compound_sizes[compound][0] # integer instead of array
+          entries && entries.each do |feature, values|
+            compound_sizes[compound] || compound_sizes[compound] = values.size
+            raise "Inappropriate data for CSV export" if compound_sizes[compound] != values.size
           end
         end
+        dataset.compounds.uniq.each do |compound| # Must handle compounds with no data entries
+          compound_sizes.has_key?(compound) || compound_sizes[compound] = dataset.compounds.count(compound)
+        end
+
 
         # get headers
         features_smarts && @rows.first << features_smarts || @rows.first << features
         @rows.first.flatten!
 
+        which_row = dataset.compounds.inject({}) { |h,id| h[id]=0; h }
+
         # feature positions pre-calculated
-        feature_positions = features.inject({}) { |h,f| 
-          h.merge!({f => features.index(f)+1}) # +1 due to ID
-          h
+        feature_positions = {}
+        features.each_with_index { |f,idx| 
+          feature_positions[f] = idx+1 # +1 due to ID
         }
 
-        # serialize to csv
-        dataset.compounds.each do |compound|
-          entries=dataset.data_entries[compound]
-          inchi = URI.encode_www_form_component(Compound.new(compound).to_inchi)
-
-          if entries
-            # allocate container
-            row_container = Array.new(compound_sizes[compound])
-            (0...row_container.size).each do |i|
-              row_container[i] = Array.new(@rows.first.size)
-              row_container[i][0] = inchi
-            end
+        # feature_types pre-calculated (for quoting)
+        feature_quoting = {}
+        features.each_with_index { |f,idx|
+          feature_quoting[f] = false
+          if dataset.features[f][RDF.type].size == 1 && dataset.features[f][RDF.type][0] == OT.NominalFeature
+            feature_quoting[f] = true
+          end
+        }
 
-            # fill entries
-            entries.each { |feature, values|
-              (0...compound_sizes[compound]).each { |i|
-                row_container[i][feature_positions[feature]] = values[i]
-              }
-            }
-
-            # fill zeroes for subgraphs
-            if (features_smarts)
-              row_container.collect! { |row|
-                row.collect! { |x| x ? x : 0 } 
-              }
-            end
-            row_container.each { |row| @rows << row }
+        @rows += dataset.compounds.collect do |compound| # assumes compounds list with duplicates
+          inchi_unenc = Compound.new(compound).to_inchi
+          inchi = URI.encode_www_form_component(inchi_unenc)
 
-          else
-            row = Array.new(@rows.first.size)
-            row[0] = inchi
-            @rows << row
-          end
+          i = which_row[compound] # select appropriate feature value
+
+          # allocate row
+          row = Array.new(@rows.first.size)
+          row[0] = inchi
+
+          # fill entries
+          entries=dataset.data_entries[compound]
+          entries && entries.each { |feature, values|
+            row[feature_positions[feature]] = feature_quoting[feature] ? "\""+values[i].to_s+"\"" : values[i].to_s
+          }
+          
+          which_row[compound] = i + 1
+
+          row
         end
       end
 
@@ -549,8 +534,7 @@ module OpenTox
         rows = @rows.collect
         result = ""
         result << rows.shift.collect { |f| f.split('/').last }.join(",") << "\n" # only feature name
-        result << rows.collect{ |r| r.join(",") }.join("\n")
-        result << "\n"
+        result << rows.collect{ |r| r.join(",") }.join("\n") + "\n"
       end
 
       # Convert to spreadsheet workbook
diff --git a/lib/transform.rb b/lib/transform.rb
index 67a9ec7..fb053ee 100644
--- a/lib/transform.rb
+++ b/lib/transform.rb
@@ -409,29 +409,35 @@ module OpenTox
         # Find neighbors and store them as object variable, access all compounds for that.
         def neighbors
           @model.neighbors = []
+          which_row=@cmpds.inject({}) {|h,c| h[c]=0; h}
           if @similarity_algorithm.to_s =~ /tanimoto/
-            @cmpds.each_with_index { |cmpd, idx| add_neighbor @model.fingerprints[cmpd], idx }
+            @cmpds.each_with_index { |cmpd, idx| 
+              fp={}; @model.fingerprints[cmpd].each { |f,vals| fp[f] = vals[which_row[cmpd]] }
+              add_neighbor fp, idx, which_row[cmpd] # Pass hash, not array
+              which_row[cmpd] += 1
+            }
           else
-            @n_prop.each_with_index { |fp, idx| add_neighbor fp, idx } # AM: access all compounds
+            @cmpds.each_with_index{ |cmpd, idx| 
+              add_neighbor @n_prop[idx], idx, which_row[cmpd] # AM: access all compounds
+              which_row[cmpd] += 1
+            }
           end
         end
 
         # Adds a neighbor to @neighbors if it passes the similarity threshold
         # adjusts @ids to signal the
-        def add_neighbor(training_props, idx)
+        def add_neighbor(training_props, idx, which_row)
           sim = similarity(training_props)
           if sim > @model.parameter("min_sim")
             if @model.activities[@cmpds[idx]]
-              @model.activities[@cmpds[idx]].each do |act|
-                @model.neighbors << {
-                  :compound => @cmpds[idx],
-                  :similarity => sim,
-                  :features => @fps[idx].keys,
-                  :activity => act
-                }
-                @sims << sim
-                @ids << idx
-              end
+              @model.neighbors << {
+                :compound => @cmpds[idx],
+                :similarity => sim,
+                :features => @fps[idx].keys,
+                :activity => @model.activities[@cmpds[idx]][which_row]
+              }
+              @sims << sim
+              @ids << idx
             end
           end
        end
@@ -497,20 +503,27 @@ module OpenTox
           
           # Major BUG! Must loop over @model.compounds, hash is unordered!
           # @model.fingerprints.each 
+          which_row=@model.compounds.inject({}) {|h,c| h[c]=0; h}
           @model.compounds.each { |cmpd|
             fp = @model.fingerprints[cmpd]
             if @model.activities[cmpd] # row good
-              acts = @model.activities[cmpd]; @acts += acts
+              acts = @model.activities[cmpd]; @acts << acts[which_row[cmpd]]
               LOGGER.debug "#{acts.size} activities for '#{cmpd}'" if acts.size > 1
-              row = []; @model.features.each { |f| row << fp[f] } # nils for non-existent f's
-              acts.size.times { # multiple additions for multiple activities
-                @n_prop << row.collect
-                @cmpds << cmpd
-                @fps << Marshal.load(Marshal.dump(fp))
+              row = []; @model.features.each { |f| 
+                if fp[f].nil?
+                  row << nil
+                else
+                  row << fp[f][which_row[cmpd]]
+                end
+                #row << fp[f].nil? ? nil : fp[f][which_row[cmpd]] # nils for non-existent f's
               } 
+              @n_prop << row.collect
+              @cmpds << cmpd
+              @fps << Marshal.load(Marshal.dump(fp))
             else
               LOGGER.warn "No activity found for compound '#{cmpd}' in model '#{@model.uri}'"
             end
+            which_row[cmpd] += 1
           }
 
           @model.features.each { |f| @q_prop << @model.compound_fingerprints[f] } # query structure
author	Andreas Maunz <andreas@maunz.de>	2012-10-05 15:47:48 +0200
committer	Andreas Maunz <andreas@maunz.de>	2012-10-05 15:47:48 +0200
commit	bc6118dae334171cbaff4a61d4ae124052f74a59 (patch)
tree	5907bd86b43b5f918599eeab18f9a9b939de3c4a
parent	7123507b86139a002c0425357e515975c4796394 (diff)