summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Christoph Helma <helma@in-silico.ch> 2011-07-28 17:35:49 +0000
committer Christoph Helma <helma@in-silico.ch> 2011-07-28 17:35:49 +0000
commit 8e55e3c5fea5e8bc39b18716f3e0c6a01e2f581f (patch)
tree 8d5829a3ad5fdaab0b68f6997832f52a4ff1f50d
parent 1dcc4402c15001a89b10e1e8bbdc4ab6b0a30a4b (diff)
parent 1148087a71ac023a6758c74325ad364d7cda7dbe (diff)
Merge branch 'feature/sdf-import' into development
-rw-r--r-- lib/dataset.rb 7
-rw-r--r-- lib/parser.rb 158
2 files changed, 164 insertions, 1 deletions
diff --git a/lib/dataset.rb b/lib/dataset.rb
index d7a8e47..05335dd 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -102,6 +102,13 @@ module OpenTox
copy parser.load_uri(subjectid)
end
+ def load_sdf(sdf,subjectid=nil)
+ save(subjectid) unless @uri # get a uri for creating features
+ parser = Parser::Sdf.new
+ parser.dataset = self
+ parser.load_sdf(sdf)
+ end
+
# Load CSV string (format specification: http://toxcreate.org/help)
# - loads data_entries, compounds, features
# - sets metadata (warnings) for parser errors
diff --git a/lib/parser.rb b/lib/parser.rb
index 07bee67..8fa5847 100644
--- a/lib/parser.rb
+++ b/lib/parser.rb
@@ -350,7 +350,6 @@ module OpenTox
@dataset
end
-
private
def warnings
@@ -454,5 +453,162 @@ module OpenTox
end
end
+
+ class Table
+
+ attr_accessor :data, :features, :compounds
+
+ def initialize
+ @data = {}
+ @activity_errors = []
+ end
+
+ def feature_values(feature)
+ @data.collect{|c, row| row[feature]}.uniq.compact
+ end
+
+ def feature_types(feature)
+ @data.collect{|c, row| feature_type(row[feature])}.uniq.compact
+ end
+
+ def features
+ @data.collect{|c,row| row.keys}.flatten.uniq
+ end
+
+ def clean_features
+ ignored_features = []
+ features.each do |feature|
+ if feature_values(feature).size > 5
+ if feature_types(feature).size == 1 and feature_types(feature).first == OT.NumericFeature
+ # REGRESSION
+ elsif feature_types(feature).include? OT.NumericFeature
+ @data.each{|c,row| row[feature] = nil unless numeric?(row[feature]) } # delete nominal features
+ @activity_errors << "Nominal feature values of #{feature} ignored (using numeric features for regression models)."
+ else
+ @activity_errors << "Feature #{feature} ignored (more than 5 nominal feature values and no numeric values)."
+ ignored_features << feature
+ next
+ end
+ elsif feature_values(feature).size <= 1
+ @activity_errors << "Feature #{feature} ignored (less than 2 feature values)."
+ ignored_features << feature
+ else
+ # CLASSIFICATION
+ end
+ end
+ ignored_features.each do |feature|
+ @data.each{ |c,row| row.delete feature }
+ end
+ @activity_errors
+ end
+
+ def add_to_dataset(dataset)
+ features.each do |feature_name|
+ feature_uri = File.join(dataset.uri,"feature",URI.encode(feature_name))
+ dataset.add_feature(feature_uri,{DC.title => feature_name})
+ end
+
+ @data.each do |compound,row|
+ unless row.empty?
+ row.each do |feature,value|
+ if numeric?(value)
+ value = value.to_f
+ elsif value.nil? or value.empty?
+ value = nil
+ else
+ value = value.to_s
+ end
+ feature_uri = File.join(dataset.uri,"feature",URI.encode(feature))
+ dataset.add(compound, feature_uri, value)
+ #dataset.features[feature_uri][RDF.type] = feature_types(feature)
+ #dataset.features[feature_uri][OT.acceptValue] = feature_values(feature)
+ if feature_types(feature).include? OT.NumericFeature
+ dataset.features[feature_uri][RDF.type] = [OT.NumericFeature]
+ else
+ dataset.features[feature_uri][RDF.type] = [OT.NominalFeature]
+ dataset.features[feature_uri][OT.acceptValue] = feature_values(feature)
+ end
+ end
+ end
+ end
+ end
+
+ private
+ def numeric?(value)
+ true if Float(value) rescue false
+ end
+
+ def feature_type(value)
+ if numeric? value
+ return OT.NumericFeature
+ else
+ return OT.NominalFeature
+ end
+ end
+ end
+
+ # quick hack to enable sdf import via csv
+ # should be refactored
+ class Sdf
+
+ attr_accessor :dataset
+
+ def initialize
+ @data = {}
+
+ @compound_errors = []
+ @activity_errors = []
+ @duplicates = {}
+ end
+
+ def load_sdf(sdf)
+
+ obconversion = OpenBabel::OBConversion.new
+ obmol = OpenBabel::OBMol.new
+ obconversion.set_in_and_out_formats "sdf", "inchi"
+
+ table = Table.new
+
+ properties = []
+ sdf.each_line { |l| properties << l.to_s if l.match(/</) }
+ properties.uniq!
+ properties.sort!
+ properties.collect!{ |p| p.gsub(/<|>/,'').strip.chomp }
+
+ rec = 0
+ sdf.split(/\$\$\$\$\r*\n/).each do |s|
+ rec += 1
+ obconversion.read_string obmol, s
+ begin
+ inchi = obconversion.write_string(obmol).gsub(/\s/,'').chomp
+ @duplicates[inchi] = [] unless @duplicates[inchi]
+ @duplicates[inchi] << rec #inchi#+", "+row.join(", ")
+ compound = Compound.from_inchi inchi
+ rescue
+ @compound_errors << "Could not convert structure to InChI, all entries for this compound (record #{rec} have been ignored! \n#{s}"
+ next
+ end
+ row = {}
+ obmol.get_data.each { |d| row[d.get_attribute] = d.get_value if properties.include?(d.get_attribute) }
+ table.data[compound.uri] = row
+ end
+
+ # finda and remove ignored_features
+ @activity_errors = table.clean_features
+ table.add_to_dataset @dataset
+
+ warnings = ''
+ warnings += "<p>Incorrect Smiles structures (ignored):</p>" + @compound_errors.join("<br/>") unless @compound_errors.empty?
+ warnings += "<p>Irregular activities (ignored):</p>" + @activity_errors.join("<br/>") unless @activity_errors.empty?
+ duplicate_warnings = ''
+ @duplicates.each {|inchi,lines| duplicate_warnings << "<p>#{lines.join('<br/>')}</p>" if lines.size > 1 }
+ warnings += "<p>Duplicated structures (all structures/activities used for model building, please make sure, that the results were obtained from <em>independent</em> experiments):</p>" + duplicate_warnings unless duplicate_warnings.empty?
+
+ @dataset.metadata[OT.Warnings] = warnings
+ @dataset
+
+ end
+
+ end
end
end