summaryrefslogtreecommitdiff
path: root/lib/parser.rb
diff options
context:
space:
mode:
Diffstat (limited to 'lib/parser.rb')
-rw-r--r--lib/parser.rb208
1 files changed, 138 insertions, 70 deletions
diff --git a/lib/parser.rb b/lib/parser.rb
index e623bf5..8c173f9 100644
--- a/lib/parser.rb
+++ b/lib/parser.rb
@@ -1,5 +1,14 @@
require 'spreadsheet'
require 'roo'
+
+class String
+
+ def to_triple
+ self.chomp.split(' ',3).collect{|i| i.sub(/\s+.$/,'').gsub(/[<>"]/,'')}
+ end
+
+end
+
module OpenTox
module Parser
@@ -12,19 +21,28 @@ module OpenTox
end
def metadata
- # TODO: load parameters
+
if @dataset
uri = File.join(@uri,"metadata")
else
uri = @uri
end
+
statements = []
- `rapper -i rdfxml -o ntriples #{uri}`.each_line do |line|
- triple = line.chomp.split('> ')
- statements << triple.collect{|i| i.sub(/\s+.$/,'').gsub(/[<>"]/,'')}
- end
- statements.each do |triple|
+ parameter_ids = []
+ `rapper -i rdfxml -o ntriples #{uri} 2>/dev/null`.each_line do |line|
+ triple = line.to_triple
@metadata[triple[1]] = triple[2].split('^^').first if triple[0] == @uri and triple[1] != RDF['type']
+ statements << triple
+ parameter_ids << triple[2] if triple[1] == OT.parameters
+ end
+ unless parameter_ids.empty?
+ @metadata[OT.parameters] = []
+ parameter_ids.each do |p|
+ parameter = {}
+ statements.each{ |t| parameter[t[1]] = t[2] if t[0] == p and t[1] != RDF['type']}
+ @metadata[OT.parameters] << parameter
+ end
end
@metadata
end
@@ -37,6 +55,8 @@ module OpenTox
include Owl
+ attr_writer :uri
+
def initialize(uri)
super uri
@dataset = ::OpenTox::Dataset.new(@uri)
@@ -47,11 +67,10 @@ module OpenTox
feature_values = {}
feature = {}
other_statements = {}
- ntriples = `rapper -i rdfxml -o ntriples #{@uri}`
- ntriples.each_line do |line|
+ `rapper -i rdfxml -o ntriples #{@uri} 2>/dev/null`.each_line do |line|
triple = line.chomp.split(' ',3)
triple = triple[0..2].collect{|i| i.sub(/\s+.$/,'').gsub(/[<>"]/,'')}
- case triple[1] # Ambit namespaces are case insensitive
+ case triple[1]
when /#{OT.values}/i
data[triple[0]] = {:compound => "", :values => []} unless data[triple[0]]
data[triple[0]][:values] << triple[2]
@@ -77,76 +96,84 @@ module OpenTox
end
def load_features
- @dataset.features.keys.each do |feature|
- @dataset.features[feature] = Parser::Owl::Generic.new(feature).metadata
+ uri = File.join(@uri,"features")
+ statements = []
+ features = Set.new
+ `rapper -i rdfxml -o ntriples #{uri} 2>/dev/null`.each_line do |line|
+ triple = line.chomp.split('> ').collect{|i| i.sub(/\s+.$/,'').gsub(/[<>"]/,'')}[0..2]
+ statements << triple
+ features << triple[0] if triple[1] == RDF['type'] and triple[2] == OT.Feature
+ end
+ statements.each do |triple|
+ if features.include? triple[0]
+ @dataset.features[triple[0]] = {} unless @dataset.features[triple[0]]
+ @dataset.features[triple[0]][triple[1]] = triple[2].split('^^').first
+ end
end
+ @dataset.features
end
+
end
end
- class Spreadsheet
+ class Spreadsheets
+ # TODO: expand for multiple columns
+
+ attr_accessor :dataset
+ def initialize
+
+ # TODO: fix 2 datasets created
+ #@dataset = Dataset.create
+ #@dataset.save # get uri
+
+ @data = []
+ @features = []
+ @feature_types = {}
- def initialize(dataset)
- @dataset = dataset
@format_errors = ""
@smiles_errors = []
@activity_errors = []
@duplicates = {}
- @nr_compounds = 0
- @data = []
- @activities = []
- @type = "classification"
end
def load_excel(book)
book.default_sheet = 0
- 1.upto(book.last_row) do |row|
- if row == 1
- @feature = File.join(@dataset.uri,"feature",book.cell(row,2))
- else
- add( book.cell(row,1), book.cell(row,2), row ) # smiles, activity
- end
- end
- parse
+ add_features book.row(1)
+ 2.upto(book.last_row) { |i| add_values book.row(i) }
+ warnings
+ @dataset
end
def load_csv(csv)
row = 0
- csv.each_line do |line|
- row += 1
- raise "Invalid CSV format at line #{row}: #{line.chomp}" unless line.chomp.match(/^.+[,;].*$/) # check CSV format
- items = line.chomp.gsub(/["']/,'').split(/\s*[,;]\s*/) # remove quotes
- if row == 1
- @feature = File.join(@dataset.uri,"feature",items[1])
- else
- add(items[0], items[1], row)
- end
- end
- parse
+ input = csv.split("\n")
+ add_features split_row(input.shift)
+ input.each { |row| add_values split_row(row) }
+ warnings
+ @dataset
end
- def parse
+ private
- # create dataset
- @data.each do |items|
- case @type
- when "classification"
- case items[1].to_s
- when TRUE_REGEXP
- @dataset.add(items[0], @feature, true )
- when FALSE_REGEXP
- @dataset.add(items[0], @feature, false)
- end
- when "regression"
- if items[1].to_f == 0
- @activity_errors << "Row #{items[2]}: Zero values not allowed for regression datasets - entry ignored."
- else
- @dataset.add items[0], @feature, items[1].to_f
- end
+ def warnings
+
+ info = ''
+ @feature_types.each do |feature,types|
+ if types.uniq.size > 1
+ type = OT.NumericFeature
+ else
+ type = types.first
end
+ @dataset.add_feature_metadata(feature,{OT.isA => type})
+ info += "\"#{@dataset.feature_name(feature)}\" detected as #{type.split('#').last}."
+
+ # TODO: rewrite feature values
+ # TODO if value.to_f == 0 @activity_errors << "#{smiles} Zero values not allowed for regression datasets - entry ignored."
end
+ @dataset.metadata[OT.Info] = info
+
warnings = ''
warnings += "<p>Incorrect Smiles structures (ignored):</p>" + @smiles_errors.join("<br/>") unless @smiles_errors.empty?
warnings += "<p>Irregular activities (ignored):</p>" + @activity_errors.join("<br/>") unless @activity_errors.empty?
@@ -156,34 +183,75 @@ module OpenTox
@dataset.metadata[OT.Warnings] = warnings
- @dataset
+ end
+ def add_features(row)
+ row.shift # get rid of smiles entry
+ row.each do |feature_name|
+ feature_uri = File.join(@dataset.uri,"feature",URI.encode(feature_name))
+ @feature_types[feature_uri] = []
+ @features << feature_uri
+ @dataset.add_feature(feature_uri,{DC.title => feature_name})
+ end
end
- def add(smiles, act, row)
+ def add_values(row)
+
+ smiles = row.shift
compound = Compound.from_smiles(smiles)
if compound.nil? or compound.inchi.nil? or compound.inchi == ""
- @smiles_errors << "Row #{row}: " + [smiles,act].join(", ")
- return false
- end
- unless numeric?(act) or classification?(act)
- @activity_errors << "Row #{row}: " + [smiles,act].join(", ")
+ @smiles_errors << smiles+", "+row.join(", ")
return false
end
@duplicates[compound.inchi] = [] unless @duplicates[compound.inchi]
- @duplicates[compound.inchi] << "Row #{row}: " + [smiles, act].join(", ")
- @type = "regression" unless classification?(act)
- # TODO: set OT.NumericalFeature, ...
- @nr_compounds += 1
- @data << [ compound.uri, act , row ]
+ @duplicates[compound.inchi] << smiles+", "+row.join(", ")
+
+ row.each_index do |i|
+ value = row[i]
+ feature = @features[i]
+ type = feature_type(value)
+
+ @feature_types[feature] << type
+
+ case type
+ when OT.NominalFeature
+ case value.to_s
+ when TRUE_REGEXP
+ @dataset.add(compound.uri, feature, true )
+ when FALSE_REGEXP
+ @dataset.add(compound.uri, feature, false )
+ end
+ when OT.NumericFeature
+ @dataset.add compound.uri, feature, value.to_f
+ when OT.StringFeature
+ # TODO: insert ??
+ @dataset.add compound.uri, feature, value.to_s
+ @activity_errors << smiles+", "+row.join(", ")
+ #return false
+ end
+ end
+ end
+
+ def numeric?(value)
+ true if Float(value) rescue false
end
- def numeric?(object)
- true if Float(object) rescue false
+ def classification?(value)
+ !value.to_s.strip.match(TRUE_REGEXP).nil? or !value.to_s.strip.match(FALSE_REGEXP).nil?
+ end
+
+ def feature_type(value)
+ if classification? value
+ return OT.NominalFeature
+ elsif numeric? value
+ return OT.NumericFeature
+ else
+ return OT.StringFeature
+ end
end
- def classification?(object)
- !object.to_s.strip.match(TRUE_REGEXP).nil? or !object.to_s.strip.match(FALSE_REGEXP).nil?
+ def split_row(row)
+ row.chomp.gsub(/["']/,'').split(/\s*[,;]\s*/) # remove quotes
end
end