Diffstat (limited to 'lib/parser.rb')
-rw-r--r--  lib/parser.rb  381
1 file changed, 381 insertions, 0 deletions
diff --git a/lib/parser.rb b/lib/parser.rb
new file mode 100644
index 0000000..f33017d
--- /dev/null
+++ b/lib/parser.rb
@@ -0,0 +1,381 @@
+require 'spreadsheet'
+require 'roo'
+
+class String
+
+  # Split an RDF statement (N-Triples line) into its subject, predicate and object components
+  # @return [Array] Array with [subject, predicate, object]
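+  # @example Illustrative: splitting an N-Triples line (URIs are placeholders)
+  #   '<http://example.org/s> <http://example.org/p> "some value" .'.to_triple
+  #   #=> ["http://example.org/s", "http://example.org/p", "some value"]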
+ def to_triple
+ self.chomp.split(' ',3).collect{|i| i.sub(/\s+.$/,'').gsub(/[<>"]/,'')}
+ end
+
+end
+
+module OpenTox
+
+ # Parser for various input formats
+ module Parser
+
+ # OWL-DL parser
+ module Owl
+
+ # Create a new OWL-DL parser
+      # @param [String] uri URI of the OpenTox object
+ # @return [OpenTox::Parser::Owl] OWL-DL parser
+ def initialize(uri)
+ @uri = uri
+ @metadata = {}
+ end
+
+      # Read metadata from an OpenTox service
+ # @return [Hash] Object metadata
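+      # @example Illustrative usage (the URI is a placeholder)
+      #   parser = OpenTox::Parser::Owl::Generic.new "http://example.org/model/1"
+      #   metadata = parser.load_metadata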
+ def load_metadata(subjectid=nil)
+        # do not let rapper fetch the URI directly, for two reasons:
+        # * HTTP errors would not be noticed
+        # * subjectid cannot be sent as a header
+ ##uri += "?subjectid=#{CGI.escape(subjectid)}" if subjectid
+ ## `rapper -i rdfxml -o ntriples #{uri} 2>/dev/null`.each_line do |line|
+ if File.exist?(@uri)
+ file = File.new(@uri)
+ else
+ file = Tempfile.new("ot-rdfxml")
+ if @dataset
+            # do not concatenate "/metadata" to the URI string; that would not work for URIs like dataset/R401577?max=3
+ uri = URI::parse(@uri)
+ uri.path = File.join(uri.path,"metadata")
+ uri = uri.to_s
+ else
+ uri = @uri
+ end
+ file.puts OpenTox::RestClientWrapper.get uri,{:subjectid => subjectid,:accept => "application/rdf+xml"},nil,false
+ file.close
+ to_delete = file.path
+ end
+ statements = []
+ parameter_ids = []
+ `rapper -i rdfxml -o ntriples #{file.path} 2>/dev/null`.each_line do |line|
+ triple = line.to_triple
+ @metadata[triple[1]] = triple[2].split('^^').first if triple[0] == @uri and triple[1] != RDF['type']
+ statements << triple
+ parameter_ids << triple[2] if triple[1] == OT.parameters
+ end
+ File.delete(to_delete) if to_delete
+ unless parameter_ids.empty?
+ @metadata[OT.parameters] = []
+ parameter_ids.each do |p|
+ parameter = {}
+ statements.each{ |t| parameter[t[1]] = t[2] if t[0] == p and t[1] != RDF['type']}
+ @metadata[OT.parameters] << parameter
+ end
+ end
+ @metadata
+ end
+
+      # Creates an OWL object from RDF data
+      # @param [String] rdf RDF/XML string
+      # @param [String] type Type of the object (e.g. OT.Task, OT.ErrorReport), needed to determine the subject URI
+ # @return [Owl] with uri and metadata set
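+      # @example Illustrative usage (rdf_string is assumed to hold the RDF/XML of a task)
+      #   task_owl = OpenTox::Parser::Owl.from_rdf(rdf_string, OT.Task)
+      #   task_owl.uri      # subject URI found via its rdf:type
+      #   task_owl.metadata # metadata hash for that subject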
+ def self.from_rdf( rdf, type )
+        # write to a temporary file and convert with rapper into triples
+ file = Tempfile.new("ot-rdfxml")
+ file.puts rdf
+ file.close
+ #puts "cmd: rapper -i rdfxml -o ntriples #{file} 2>/dev/null"
+ triples = `rapper -i rdfxml -o ntriples #{file.path} 2>/dev/null`
+
+        # determine the subject URI via its rdf:type
+ uri = nil
+ triples.each_line do |line|
+ triple = line.to_triple
+ if triple[1] == RDF['type'] and triple[2]==type
+ raise "uri already set, two uris found with type: "+type.to_s if uri
+ uri = triple[0]
+ end
+ end
+ File.delete(file.path)
+ # load metadata
+ metadata = {}
+ triples.each_line do |line|
+ triple = line.to_triple
+ metadata[triple[1]] = triple[2].split('^^').first if triple[0] == uri and triple[1] != RDF['type']
+ end
+ owl = Owl::Generic.new(uri)
+ owl.metadata = metadata
+ owl
+ end
+
+ # Generic parser for all OpenTox classes
+ class Generic
+ include Owl
+
+ attr_accessor :uri, :metadata
+ end
+
+ # OWL-DL parser for datasets
+ class Dataset
+
+ include Owl
+
+ attr_writer :uri
+
+ # Create a new OWL-DL dataset parser
+        # @param [String] uri Dataset URI
+ # @return [OpenTox::Parser::Owl::Dataset] OWL-DL parser
+ def initialize(uri, subjectid=nil)
+ super uri
+ @dataset = ::OpenTox::Dataset.new(@uri, subjectid)
+ end
+
+        # Read data from a dataset service. Files can be parsed by setting #uri to a filename (after initialization with a real URI)
+ # @example Read data from an external service
+        #   parser = OpenTox::Parser::Owl::Dataset.new "http://webservices.in-silico.ch/dataset/1"
+ # dataset = parser.load_uri
+ # @example Create dataset from RDF/XML file
+ # dataset = OpenTox::Dataset.create
+        #   parser = OpenTox::Parser::Owl::Dataset.new dataset.uri
+ # parser.uri = "dataset.rdfxml" # insert your input file
+ # dataset = parser.load_uri
+ # dataset.save
+ # @return [Hash] Internal dataset representation
+ def load_uri(subjectid=nil)
+
+          # do not let rapper fetch the URI directly, for two reasons:
+          # * HTTP errors would not be noticed
+          # * subjectid cannot be sent as a header
+ ##uri += "?subjectid=#{CGI.escape(subjectid)}" if subjectid
+ ##`rapper -i rdfxml -o ntriples #{file} 2>/dev/null`.each_line do |line|
+ if File.exist?(@uri)
+ file = File.new(@uri)
+ else
+ file = Tempfile.new("ot-rdfxml")
+ file.puts OpenTox::RestClientWrapper.get @uri,{:subjectid => subjectid,:accept => "application/rdf+xml"},nil,false
+ file.close
+ to_delete = file.path
+ end
+
+ data = {}
+ feature_values = {}
+ feature = {}
+ other_statements = {}
+ `rapper -i rdfxml -o ntriples #{file.path} 2>/dev/null`.each_line do |line|
+ triple = line.chomp.split(' ',3)
+ triple = triple[0..2].collect{|i| i.sub(/\s+.$/,'').gsub(/[<>"]/,'')}
+ case triple[1]
+ when /#{OT.values}/i
+ data[triple[0]] = {:compound => "", :values => []} unless data[triple[0]]
+ data[triple[0]][:values] << triple[2]
+ when /#{OT.value}/i
+ feature_values[triple[0]] = triple[2]
+ when /#{OT.compound}/i
+ data[triple[0]] = {:compound => "", :values => []} unless data[triple[0]]
+ data[triple[0]][:compound] = triple[2]
+ when /#{OT.feature}/i
+ feature[triple[0]] = triple[2]
+ when /#{RDF.type}/i
+ if triple[2]=~/#{OT.Compound}/i and !data[triple[0]]
+ data[triple[0]] = {:compound => triple[0], :values => []}
+ end
+ else
+ end
+ end
+ File.delete(to_delete) if to_delete
+ data.each do |id,entry|
+ if entry[:values].size==0
+              # no feature values, add the plain compound
+ @dataset.add_compound(entry[:compound])
+ else
+ entry[:values].each do |value_id|
+ split = feature_values[value_id].split(/\^\^/)
+ case split[-1]
+ when XSD.double, XSD.float
+ value = split.first.to_f
+ when XSD.boolean
+ value = split.first=~/(?i)true/ ? true : false
+ else
+ value = split.first
+ end
+ @dataset.add entry[:compound],feature[value_id],value
+ end
+ end
+ end
+ load_features subjectid
+ @dataset.metadata = load_metadata(subjectid)
+ @dataset
+ end
+
+ # Read only features from a dataset service.
+ # @return [Hash] Internal features representation
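+        # @example Illustrative usage (assumes parser was initialized with a dataset URI as above)
+        #   features = parser.load_features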
+ def load_features(subjectid=nil)
+ if File.exist?(@uri)
+ file = File.new(@uri)
+ else
+ file = Tempfile.new("ot-rdfxml")
+            # do not concatenate "/features" to the URI string; that would not work for URIs like dataset/R401577?max=3
+ uri = URI::parse(@uri)
+ uri.path = File.join(uri.path,"features")
+ uri = uri.to_s
+ file.puts OpenTox::RestClientWrapper.get uri,{:subjectid => subjectid,:accept => "application/rdf+xml"},nil,false
+ file.close
+ to_delete = file.path
+ end
+ statements = []
+ features = Set.new
+ `rapper -i rdfxml -o ntriples #{file.path} 2>/dev/null`.each_line do |line|
+ triple = line.chomp.split('> ').collect{|i| i.sub(/\s+.$/,'').gsub(/[<>"]/,'')}[0..2]
+ statements << triple
+ features << triple[0] if triple[1] == RDF['type'] and (triple[2] == OT.Feature || triple[2] == OT.NumericFeature)
+ end
+ File.delete(to_delete) if to_delete
+ statements.each do |triple|
+ if features.include? triple[0]
+ @dataset.features[triple[0]] = {} unless @dataset.features[triple[0]]
+ @dataset.features[triple[0]][triple[1]] = triple[2].split('^^').first
+ end
+ end
+ @dataset.features
+ end
+
+ end
+
+ end
+
+ # Parser for getting spreadsheet data into a dataset
+ class Spreadsheets
+
+ attr_accessor :dataset
+
+ def initialize
+ @data = []
+ @features = []
+ @feature_types = {}
+
+ @format_errors = ""
+ @smiles_errors = []
+ @activity_errors = []
+ @duplicates = {}
+ end
+
+    # Load a spreadsheet workbook (created with the roo gem, http://roo.rubyforge.org/; Excel format specification: http://toxcreate.org/help)
+    # @param [Excel] book Excel workbook object (created with the roo gem)
+ # @return [OpenTox::Dataset] Dataset object with Excel data
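+    # @example Illustrative usage (assumes an .xls file opened with roo's Excel class)
+    #   parser = OpenTox::Parser::Spreadsheets.new
+    #   parser.dataset = OpenTox::Dataset.create
+    #   dataset = parser.load_spreadsheet Excel.new("input.xls")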
+ def load_spreadsheet(book)
+ book.default_sheet = 0
+ add_features book.row(1)
+ 2.upto(book.last_row) { |i| add_values book.row(i) }
+ warnings
+ @dataset
+ end
+
+ # Load CSV string (format specification: http://toxcreate.org/help)
+ # @param [String] csv CSV representation of the dataset
+ # @return [OpenTox::Dataset] Dataset object with CSV data
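+    # @example Illustrative usage ("input.csv" is a placeholder)
+    #   parser = OpenTox::Parser::Spreadsheets.new
+    #   parser.dataset = OpenTox::Dataset.create
+    #   dataset = parser.load_csv File.read("input.csv")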
+ def load_csv(csv)
+ row = 0
+ input = csv.split("\n")
+ add_features split_row(input.shift)
+ input.each { |row| add_values split_row(row) }
+ warnings
+ @dataset
+ end
+
+ private
+
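+    # Determine the type of each feature from the collected per-value types and
+    # store info/warning messages (parse errors, duplicates) in the dataset metadata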
+ def warnings
+
+ info = ''
+ @feature_types.each do |feature,types|
+ if types.uniq.size > 1
+ type = OT.NumericFeature
+ else
+ type = types.first
+ end
+ @dataset.add_feature_metadata(feature,{OT.isA => type})
+ info += "\"#{@dataset.feature_name(feature)}\" detected as #{type.split('#').last}."
+
+ # TODO: rewrite feature values
+ # TODO if value.to_f == 0 @activity_errors << "#{smiles} Zero values not allowed for regression datasets - entry ignored."
+ end
+
+ @dataset.metadata[OT.Info] = info
+
+ warnings = ''
+      warnings += "<p>Incorrect SMILES structures (ignored):</p>" + @smiles_errors.join("<br/>") unless @smiles_errors.empty?
+ warnings += "<p>Irregular activities (ignored):</p>" + @activity_errors.join("<br/>") unless @activity_errors.empty?
+ duplicate_warnings = ''
+ @duplicates.each {|inchi,lines| duplicate_warnings << "<p>#{lines.join('<br/>')}</p>" if lines.size > 1 }
+      warnings += "<p>Duplicated structures (all structures/activities used for model building, please make sure that the results were obtained from <em>independent</em> experiments):</p>" + duplicate_warnings unless duplicate_warnings.empty?
+
+ @dataset.metadata[OT.Warnings] = warnings
+
+ end
+
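+    # Register a feature for every column header except the first (SMILES) column;
+    # feature URIs are derived from the dataset URI and the column name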
+ def add_features(row)
+      row.shift # get rid of the SMILES entry
+ row.each do |feature_name|
+ feature_uri = File.join(@dataset.uri,"feature",URI.encode(feature_name))
+ @feature_types[feature_uri] = []
+ @features << feature_uri
+ @dataset.add_feature(feature_uri,{DC.title => feature_name})
+ end
+ end
+
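+    # Parse one data row: the first cell is a SMILES string, the remaining cells are
+    # feature values; invalid SMILES and irregular values are collected as errors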
+ def add_values(row)
+
+ smiles = row.shift
+ compound = Compound.from_smiles(smiles)
+ if compound.nil? or compound.inchi.nil? or compound.inchi == ""
+ @smiles_errors << smiles+", "+row.join(", ")
+ return false
+ end
+ @duplicates[compound.inchi] = [] unless @duplicates[compound.inchi]
+ @duplicates[compound.inchi] << smiles+", "+row.join(", ")
+
+ row.each_index do |i|
+ value = row[i]
+ feature = @features[i]
+ type = feature_type(value)
+
+ @feature_types[feature] << type
+
+ case type
+ when OT.NominalFeature
+ case value.to_s
+ when TRUE_REGEXP
+ @dataset.add(compound.uri, feature, true )
+ when FALSE_REGEXP
+ @dataset.add(compound.uri, feature, false )
+ end
+ when OT.NumericFeature
+ @dataset.add compound.uri, feature, value.to_f
+ when OT.StringFeature
+ @dataset.add compound.uri, feature, value.to_s
+ @activity_errors << smiles+", "+row.join(", ")
+ end
+ end
+ end
+
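+    # true if the value can be converted to a Float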
+ def numeric?(value)
+ true if Float(value) rescue false
+ end
+
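+    # true if the value matches the true/false patterns of a classification feature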
+ def classification?(value)
+ !value.to_s.strip.match(TRUE_REGEXP).nil? or !value.to_s.strip.match(FALSE_REGEXP).nil?
+ end
+
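+    # Guess the feature type (nominal, numeric or string) of a single value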
+ def feature_type(value)
+ if classification? value
+ return OT.NominalFeature
+ elsif numeric? value
+ return OT.NumericFeature
+ else
+ return OT.StringFeature
+ end
+ end
+
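+    # Split a CSV row at commas or semicolons and strip quotes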
+ def split_row(row)
+ row.chomp.gsub(/["']/,'').split(/\s*[,;]\s*/) # remove quotes
+ end
+
+ end
+ end
+end