Merge branch 'feature/sdf-import' into development

author: Christoph Helma <helma@in-silico.ch> 2011-07-28 17:35:49 +0000
committer: Christoph Helma <helma@in-silico.ch> 2011-07-28 17:35:49 +0000
commit: 8e55e3c5fea5e8bc39b18716f3e0c6a01e2f581f (patch)
tree: 8d5829a3ad5fdaab0b68f6997832f52a4ff1f50d
parent: 1dcc4402c15001a89b10e1e8bbdc4ab6b0a30a4b (diff)
parent: 1148087a71ac023a6758c74325ad364d7cda7dbe (diff)
2 files changed, 164 insertions, 1 deletion
diff --git a/lib/dataset.rb b/lib/dataset.rb
index d7a8e47..05335dd 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -102,6 +102,13 @@ module OpenTox
       copy parser.load_uri(subjectid)
     end
 
+    def load_sdf(sdf,subjectid=nil)
+      save(subjectid) unless @uri # get a uri for creating features
+      parser = Parser::Sdf.new
+      parser.dataset = self
+      parser.load_sdf(sdf)
+    end
+
     # Load CSV string (format specification: http://toxcreate.org/help)
     # - loads data_entries, compounds, features
     # - sets metadata (warnings) for parser errors
diff --git a/lib/parser.rb b/lib/parser.rb
index 07bee67..8fa5847 100644
--- a/lib/parser.rb
+++ b/lib/parser.rb
@@ -350,7 +350,6 @@ module OpenTox
         @dataset
       end
 
-
       private
 
       def warnings
@@ -454,5 +453,162 @@ module OpenTox
       end
 
     end
+
+    class Table
+
+      attr_accessor :data, :features, :compounds
+
+      def initialize
+        @data = {}
+        @activity_errors = []
+      end
+
+      def feature_values(feature)
+        @data.collect{|c, row| row[feature]}.uniq.compact
+      end
+
+      def feature_types(feature)
+        @data.collect{|c, row| feature_type(row[feature])}.uniq.compact
+      end
+
+      def features
+        @data.collect{|c,row| row.keys}.flatten.uniq
+      end
+
+      def clean_features
+        ignored_features = []
+        features.each do |feature|
+          if feature_values(feature).size > 5
+            if feature_types(feature).size == 1 and feature_types(feature).first == OT.NumericFeature
+              # REGRESSION
+            elsif feature_types(feature).include? OT.NumericFeature
+              @data.each{|c,row| row[feature] = nil unless numeric?(row[feature]) } # delete nominal features
+              @activity_errors << "Nominal feature values of #{feature} ignored (using numeric features for regression models)."
+            else
+              @activity_errors << "Feature #{feature} ignored (more than 5 nominal feature values and no numeric values)."
+              ignored_features << feature
+              next
+            end
+          elsif feature_values(feature).size <= 1
+              @activity_errors << "Feature #{feature} ignored (less than 2 feature values)."
+              ignored_features << feature
+          else
+            # CLASSIFICATION
+          end
+        end
+        ignored_features.each do |feature|
+          @data.each{ |c,row| row.delete feature }
+        end
+        @activity_errors
+      end
+
+      def add_to_dataset(dataset)
+        features.each do |feature_name|
+          feature_uri = File.join(dataset.uri,"feature",URI.encode(feature_name))
+          dataset.add_feature(feature_uri,{DC.title => feature_name})
+        end
+
+        @data.each do |compound,row|
+          unless row.empty?
+            row.each do |feature,value|
+              if numeric?(value)
+                value = value.to_f
+              elsif value.nil? or value.empty?
+                value = nil
+              else
+                value = value.to_s
+              end
+              feature_uri = File.join(dataset.uri,"feature",URI.encode(feature))
+              dataset.add(compound, feature_uri, value)
+              #dataset.features[feature_uri][RDF.type] = feature_types(feature)
+              #dataset.features[feature_uri][OT.acceptValue] = feature_values(feature)
+              if feature_types(feature).include? OT.NumericFeature
+                dataset.features[feature_uri][RDF.type] = [OT.NumericFeature]
+              else
+                dataset.features[feature_uri][RDF.type] = [OT.NominalFeature]
+                dataset.features[feature_uri][OT.acceptValue] = feature_values(feature) 
+              end
+            end
+          end
+        end
+      end
+
+      private
+      def numeric?(value)
+        true if Float(value) rescue false
+      end
+
+      def feature_type(value)
+        if numeric? value
+          return OT.NumericFeature
+        else
+          return OT.NominalFeature
+        end
+      end
+    end
+
+    # Quick hack to enable SDF import via CSV.
+    # Should be refactored.
+    class Sdf
+
+      attr_accessor :dataset
+
+      def initialize
+        @data = {}
+
+        @compound_errors = []
+        @activity_errors = []
+        @duplicates = {}
+      end
+
+      def load_sdf(sdf)
+
+        obconversion = OpenBabel::OBConversion.new
+        obmol = OpenBabel::OBMol.new
+        obconversion.set_in_and_out_formats "sdf", "inchi"
+
+        table = Table.new
+
+        properties = []
+        sdf.each_line { |l| properties << l.to_s if l.match(/</) }
+        properties.uniq!
+        properties.sort!
+        properties.collect!{ |p| p.gsub(/<|>/,'').strip.chomp }
+
+        rec = 0
+        sdf.split(/\$\$\$\$\r*\n/).each do |s|
+          rec += 1
+          obconversion.read_string obmol, s
+          begin
+            inchi = obconversion.write_string(obmol).gsub(/\s/,'').chomp 
+            @duplicates[inchi] = [] unless @duplicates[inchi]
+            @duplicates[inchi] << rec #inchi#+", "+row.join(", ")
+            compound = Compound.from_inchi inchi
+          rescue
+            @compound_errors << "Could not convert structure to InChI, all entries for this compound (record #{rec} have been ignored! \n#{s}"
+            next
+          end
+          row = {}
+          obmol.get_data.each { |d| row[d.get_attribute] = d.get_value if properties.include?(d.get_attribute) }
+          table.data[compound.uri] = row
+        end
+
+        # find and remove ignored_features
+        @activity_errors = table.clean_features
+        table.add_to_dataset @dataset
+
+        warnings = ''
+        warnings += "<p>Incorrect Smiles structures (ignored):</p>" + @compound_errors.join("<br/>") unless @compound_errors.empty?
+        warnings += "<p>Irregular activities (ignored):</p>" + @activity_errors.join("<br/>") unless @activity_errors.empty?
+        duplicate_warnings = ''
+        @duplicates.each {|inchi,lines| duplicate_warnings << "<p>#{lines.join('<br/>')}</p>" if lines.size > 1 }
+        warnings += "<p>Duplicated structures (all structures/activities used for model building, please  make sure, that the results were obtained from <em>independent</em> experiments):</p>" + duplicate_warnings unless duplicate_warnings.empty?
+
+        @dataset.metadata[OT.Warnings] = warnings 
+        @dataset
+
+      end
+
+    end
   end
 end
author	Christoph Helma <helma@in-silico.ch>	2011-07-28 17:35:49 +0000
committer	Christoph Helma <helma@in-silico.ch>	2011-07-28 17:35:49 +0000
commit	8e55e3c5fea5e8bc39b18716f3e0c6a01e2f581f (patch)
tree	8d5829a3ad5fdaab0b68f6997832f52a4ff1f50d
parent	1dcc4402c15001a89b10e1e8bbdc4ab6b0a30a4b (diff)
parent	1148087a71ac023a6758c74325ad364d7cda7dbe (diff)