diff options
author | Christoph Helma <helma@in-silico.ch> | 2011-07-28 17:35:49 +0000 |
---|---|---|
committer | Christoph Helma <helma@in-silico.ch> | 2011-07-28 17:35:49 +0000 |
commit | 8e55e3c5fea5e8bc39b18716f3e0c6a01e2f581f (patch) | |
tree | 8d5829a3ad5fdaab0b68f6997832f52a4ff1f50d | |
parent | 1dcc4402c15001a89b10e1e8bbdc4ab6b0a30a4b (diff) | |
parent | 1148087a71ac023a6758c74325ad364d7cda7dbe (diff) |
Merge branch 'feature/sdf-import' into development
-rw-r--r-- | lib/dataset.rb | 7 | ||||
-rw-r--r-- | lib/parser.rb | 158 |
2 files changed, 164 insertions, 1 deletions
# Code added by merge commit 8e55e3c ('feature/sdf-import' into development),
# reconstructed from the collapsed diff.
#
# diff --git a/lib/dataset.rb b/lib/dataset.rb (index d7a8e47..05335dd 100644)
#   @@ -102,6 +102,13 @@ module OpenTox
#   Adds #load_sdf after the loader that ends in `parser.load_uri(subjectid)`
#   and before the "Load CSV string" loader.
module OpenTox
  # NOTE(review): the hunk only shows `module OpenTox` as context; the enclosing
  # class is presumably Dataset (file lib/dataset.rb) -- confirm.
  class Dataset
    # Load an SDF string into this dataset.
    #
    # sdf       - String with one or more SDF records.
    # subjectid - optional token handed to #save when the dataset has no URI yet.
    #
    # Returns whatever Parser::Sdf#load_sdf returns (the dataset it was given).
    def load_sdf(sdf, subjectid = nil)
      save(subjectid) unless @uri # get a uri for creating features
      parser = Parser::Sdf.new
      parser.dataset = self
      parser.load_sdf(sdf)
    end
  end
end

# diff --git a/lib/parser.rb b/lib/parser.rb (index 07bee67..8fa5847 100644)
#   @@ -350,7 +350,6 @@ removes one blank line before `private` / `def warnings`.
#   @@ -454,5 +453,162 @@ appends the Table and Sdf classes below.
module OpenTox
  # NOTE(review): Parser is assumed to be a module (it is only used as a
  # namespace in the visible code) -- confirm against lib/parser.rb.
  module Parser

    # In-memory table of feature values, keyed by compound.
    #
    # data maps a compound identifier to a row Hash ({feature name => raw value}).
    class Table

      attr_accessor :data, :compounds
      # The reader is computed from the rows (see #features); only the writer
      # is generated so the original accessor interface stays available.
      attr_writer :features

      def initialize
        @data = {}
        @activity_errors = []
      end

      # Distinct non-nil raw values of +feature+ over all rows.
      def feature_values(feature)
        @data.collect { |_compound, row| row[feature] }.uniq.compact
      end

      # Distinct feature types (OT.NumericFeature / OT.NominalFeature) of the
      # values present for +feature+. Missing values (nil or empty) are not
      # classified: a gap in a numeric column must not make it look nominal.
      def feature_types(feature)
        present = @data.collect { |_compound, row| row[feature] }.reject { |value| missing?(value) }
        present.collect { |value| feature_type(value) }.uniq
      end

      # All feature names that occur in at least one row.
      def features
        @data.collect { |_compound, row| row.keys }.flatten.uniq
      end

      # Decide per feature whether it is usable and drop the unusable ones.
      #
      # - more than 5 distinct values, all numeric: kept (regression)
      # - more than 5 distinct values, partly numeric: non-numeric values are
      #   set to nil and a warning is recorded
      # - more than 5 distinct values, none numeric: feature removed
      # - fewer than 2 distinct values: feature removed
      # - otherwise: kept (classification)
      #
      # Returns the accumulated Array of warning strings.
      def clean_features
        ignored_features = []
        features.each do |feature|
          values = feature_values(feature)
          types = feature_types(feature)
          if values.size > 5
            if types == [OT.NumericFeature]
              # REGRESSION
            elsif types.include?(OT.NumericFeature)
              # Blank out nominal values; rows that never had this feature are
              # left alone so no nil placeholder entries are created for them.
              @data.each do |_compound, row|
                row[feature] = nil if row.key?(feature) && !numeric?(row[feature])
              end
              @activity_errors << "Nominal feature values of #{feature} ignored (using numeric features for regression models)."
            else
              @activity_errors << "Feature #{feature} ignored (more than 5 nominal feature values and no numeric values)."
              ignored_features << feature
            end
          elsif values.size <= 1
            @activity_errors << "Feature #{feature} ignored (less than 2 feature values)."
            ignored_features << feature
          else
            # CLASSIFICATION
          end
        end
        ignored_features.each do |feature|
          @data.each { |_compound, row| row.delete feature }
        end
        @activity_errors
      end

      # Register every feature with +dataset+, add all values and set the
      # feature metadata (RDF.type, and OT.acceptValue for nominal features).
      #
      # dataset - object responding to uri, add_feature, add and features.
      #
      # Returns the data Hash (same value the method returned before).
      def add_to_dataset(dataset)
        feature_uris = {}
        features.each do |feature_name|
          feature_uri = File.join(dataset.uri, "feature", encode_uri(feature_name))
          feature_uris[feature_name] = feature_uri
          dataset.add_feature(feature_uri, { DC.title => feature_name })
        end

        @data.each do |compound, row|
          row.each do |feature, value|
            dataset.add(compound, feature_uris[feature], normalize(value))
          end
        end

        # Type metadata depends only on the feature, so it is computed once per
        # feature instead of once per value.
        feature_uris.each do |feature_name, feature_uri|
          if feature_types(feature_name).include?(OT.NumericFeature)
            dataset.features[feature_uri][RDF.type] = [OT.NumericFeature]
          else
            dataset.features[feature_uri][RDF.type] = [OT.NominalFeature]
            dataset.features[feature_uri][OT.acceptValue] = feature_values(feature_name)
          end
        end
        @data
      end

      private

      # True when Kernel#Float can parse +value+.
      def numeric?(value)
        Float(value)
        true
      rescue ArgumentError, TypeError
        false
      end

      # nil and empty values count as "no value".
      def missing?(value)
        value.nil? || (value.respond_to?(:empty?) && value.empty?)
      end

      def feature_type(value)
        numeric?(value) ? OT.NumericFeature : OT.NominalFeature
      end

      # Convert a raw value for storage. Float() is used (not to_f) so the
      # conversion agrees with numeric?: e.g. "0x1A" is numeric and must
      # become 26.0, whereas "0x1A".to_f is 0.0.
      def normalize(value)
        if numeric?(value)
          Float(value)
        elsif missing?(value)
          nil
        else
          value.to_s
        end
      end

      # URI.encode was removed in Ruby 3.0; fall back to the RFC 2396 parser,
      # which implements the same escaping.
      def encode_uri(name)
        return URI.encode(name) if URI.respond_to?(:encode)
        parser = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER
        parser.escape(name)
      end
    end

    # quick hack to enable sdf import via csv
    # should be refactored
    class Sdf

      attr_accessor :dataset

      def initialize
        @data = {}

        @compound_errors = []
        @activity_errors = []
        @duplicates = {}
      end

      # Parse +sdf+ (String of SDF records separated by "$$$$" lines), add the
      # compounds and their data items to the assigned dataset and store
      # HTML-formatted warnings in dataset.metadata[OT.Warnings].
      #
      # Returns the dataset.
      def load_sdf(sdf)
        # start from a clean state so a reused parser does not repeat old warnings
        @compound_errors = []
        @activity_errors = []
        @duplicates = {}

        obconversion = OpenBabel::OBConversion.new
        obmol = OpenBabel::OBMol.new
        obconversion.set_in_and_out_formats "sdf", "inchi"

        table = Table.new
        properties = property_names(sdf)

        rec = 0
        sdf.split(/\$\$\$\$\r*\n/).each do |s|
          next if s.strip.empty? # e.g. blank lines after the last record
          rec += 1
          begin
            obconversion.read_string obmol, s
            inchi = obconversion.write_string(obmol).gsub(/\s/, '').chomp
            raise "empty InChI" if inchi.empty?
            compound = Compound.from_inchi inchi
          rescue StandardError
            @compound_errors << "Could not convert structure to InChI, all entries for this compound (record #{rec}) have been ignored! \n#{s}"
            next
          end
          # only records that were actually imported can be duplicates
          @duplicates[inchi] ||= []
          @duplicates[inchi] << rec

          row = {}
          obmol.get_data.each do |d|
            row[d.get_attribute] = d.get_value if properties.include?(d.get_attribute)
          end
          # NOTE(review): rows are keyed by compound URI, so a later record with
          # the same structure replaces the earlier row, although the duplicate
          # warning below says all entries are used.
          table.data[compound.uri] = row
        end

        # find and remove ignored_features
        @activity_errors = table.clean_features
        table.add_to_dataset @dataset

        @dataset.metadata[OT.Warnings] = warnings_html
        @dataset
      end

      private

      # Names of the data items declared in +sdf+, sorted and without
      # duplicates. A data header is a line starting with ">" that carries the
      # name in angle brackets; text around the brackets is not part of the name.
      def property_names(sdf)
        names = []
        sdf.each_line do |line|
          name = line[/\A>.*?<([^<>]+)>/, 1]
          names << name.strip if name
        end
        names.uniq.sort
      end

      # HTML summary of conversion errors, feature warnings and duplicates.
      def warnings_html
        parts = []
        unless @compound_errors.empty?
          parts << "<p>Incorrect structures (ignored):</p>" + @compound_errors.join("<br/>")
        end
        unless @activity_errors.empty?
          parts << "<p>Irregular activities (ignored):</p>" + @activity_errors.join("<br/>")
        end
        duplicate_warnings = ''
        @duplicates.each do |_inchi, records|
          duplicate_warnings += "<p>#{records.join('<br/>')}</p>" if records.size > 1
        end
        unless duplicate_warnings.empty?
          parts << "<p>Duplicated structures (all structures/activities used for model building, please make sure, that the results were obtained from <em>independent</em> experiments):</p>" + duplicate_warnings
        end
        parts.join
      end

    end
  end
end