summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristoph Helma <helma@in-silico.ch>2012-06-28 12:19:54 +0000
committerChristoph Helma <helma@in-silico.ch>2012-06-28 12:19:54 +0000
commit00d70503797a4a9aa64a2c92643c07f261e3a5b0 (patch)
tree8bedaa0f4bf0a99d04629d048ded58b688cb82f4
parentea4f6e8984ee6043913d2ae0cdd16449cc2b582f (diff)
csv parser added
-rw-r--r--application.rb268
-rw-r--r--config.ru3
2 files changed, 232 insertions, 39 deletions
diff --git a/application.rb b/application.rb
index b3e3843..e6d89b3 100644
--- a/application.rb
+++ b/application.rb
@@ -1,26 +1,245 @@
+#require "./parser.rb"
module OpenTox
class Application < Service
- # Get metadata of the dataset
- # @return [application/rdf+xml] Metadata OWL-DL
- get '/:id/metadata' do
+ @warnings = [] # NOTE(review): assigned in the class body; parse_table assigns @warnings again before using it — presumably redundant, confirm
+
+ helpers do
+ def parse_csv(csv) # csv: CSV text; rows from CSV.parse are handed to parse_table, whose result is returned
+ parse_table CSV.parse(csv)
+ end
+
+ def parse_sdf(sdf)
+ # Converts SDF text into a table (header row + one row per record) and returns parse_table's result for it.
+ obconversion = OpenBabel::OBConversion.new
+ obmol = OpenBabel::OBMol.new
+ obconversion.set_in_and_out_formats "sdf", "inchi"
+ # Row 0 is the header: "InChI" followed by the property names collected from lines containing "<".
+ table = []
+ # Names are normalised before sort/uniq so the same tag written with different spacing becomes one column.
+ properties = []
+ sdf.each_line { |l| properties << l.to_s if l.match(/</) }
+ properties.collect!{ |p| p.gsub(/<|>/,'').strip.chomp }
+ properties.sort!
+ properties.uniq!
+ properties.insert 0, "InChI"
+ table[0] = properties
+ # Records are separated by "$$$$" lines; rec is the 1-based record number used in the error text.
+ rec = 0
+ sdf.split(/\$\$\$\$\r*\n/).each do |s|
+ rec += 1
+ table << Array.new(table.first.size) # full-width row, so a record lacking trailing properties still matches the header size
+ begin
+ # TODO: use compound service
+ obconversion.read_string obmol, s
+ table.last[0] = obconversion.write_string(obmol).gsub(/\s/,'').chomp
+ rescue
+ # TODO: Fix, will lead to follow up errors
+ table.last[0] = "Could not convert structure at record #{rec}, record has been ignored! \n#{s}"
+ end
+ obmol.get_data.each { |d| i = table.first.index(d.get_attribute); table.last[i] = d.get_value if i } # data items without a header column are skipped
+ end
+ parse_table table
+ end
+
+ def parse_table table
+ # Builds the N-Triples text for a new dataset from a table (row 0 = header, column 0 = compound identifier).
+ @warnings = []
+ dataset_uri = File.join(uri("/dataset"), SecureRandom.uuid)
+ # table is modified in place (header shifted off, cells stripped); returns the triples joined with "\n".
+ ntriples = ["<#{dataset_uri}> <#{RDF.type}> <#{RDF::OT.Dataset}>."]
+ nt_escape = lambda { |s| s.to_s.gsub(/[\\"\n\r\t]/, "\\" => "\\\\", "\"" => "\\\"", "\n" => "\\n", "\r" => "\\r", "\t" => "\\t") } # N-Triples literal escapes
+ # features
+ feature_names = table.shift.collect{|f| f.strip}
+ @warnings << "Duplicated features in table header." unless feature_names.size == feature_names.uniq.size
+ compound_format = feature_names.shift.strip
+ bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: URI, SMILES, InChI." unless compound_format =~ /URI|URL|SMILES|InChI/i
+ features = []
+ ignored_feature_indices = []
+ feature_names.each_with_index do |f,i|
+ # TODO search for existing features
+ feature = OpenTox::Feature.new File.join($feature[:uri], SecureRandom.uuid)
+ feature[RDF.type] = RDF::OT.Feature
+ feature[RDF::DC.title] = f
+ features << feature
+ values = table.collect{|row| row[i+1].strip unless row[i+1].nil?}.uniq # skip compound column
+ if values.size <= 3 # max classes
+ feature[RDF.type] = RDF::OT.NominalFeature
+ feature[RDF.type] = RDF::OT.StringFeature # NOTE(review): if []= replaces rather than appends, this overwrites the NominalFeature type set on the previous line — confirm
+ feature[RDF::OT.acceptValue] = values
+ else
+ types = values.collect{|v| feature_type(v)}
+ if types.include?(RDF::OT.NominalFeature)
+ @warnings << "Feature '#{f}' contains nominal and numeric values."
+ #ignored_feature_indices << i
+ #next
+ else
+ feature[RDF.type] = RDF::OT.NumericFeature
+ end
+ end
+ feature.save
+ case feature[RDF.type].class.to_s
+ when "Array"
+ feature[RDF.type].each{ |t| ntriples << "<#{feature.uri}> <#{RDF.type}> <#{t}>." }
+ when "String"
+ ntriples << "<#{feature.uri}> <#{RDF.type}> <#{feature[RDF.type]}>."
+ end
+ end
+
+ # remove invalid features from table
+# puts ignored_feature_indices.inspect
+# ignored_feature_indices.each do |i|
+# features.delete_at(i)
+# table.each{|row| row.delete_at(i)}
+# end
+
+ # compounds and values
+ compound_uris = []
+ data_entry_idx = 0
+ table.each_with_index do |values,j|
+ values.collect!{|v| v.strip unless v.nil?}
+ compound = values.shift
+ begin
+ case compound_format
+ when /URI|URL/i
+ compound_uri = compound
+ when /SMILES/i
+ compound_uri = OpenTox::Compound.from_smiles($compound[:uri], compound).uri
+ when /InChI/i
+ compound_uri = OpenTox::Compound.from_inchi($compound[:uri], URI.decode_www_form_component(compound)).uri
+ end
+ @warnings << "Duplicated compound #{compound} at position #{j+2}, entries are accepted, assuming that measurements come from independent experiments." if compound_uris.include? compound_uri
+ rescue
+ @warnings << "Cannot parse compound #{compound} at position #{j+2}, all entries are ignored."
+ next
+ end
+ unless values.size == features.size
+ @warnings << "Number of values at position #{j+2} (#{values.size}) is different than header size (#{features.size}), all entries are ignored."
+ next
+ end
+ ntriples << "<#{compound_uri}> <#{RDF.type}> <#{RDF::OT.Compound}>."
+ compound_uris << compound_uri # remember accepted compounds so the duplicate check above can fire
+ values.each_with_index do |v,i|
+ @warnings << "Empty value for compound '#{compound}' (row #{j+2}) and feature '#{feature_names[i]}' (column #{i+2})." if v.blank?
+
+ # TODO multiple values, use data_entry/value uris for sorted datasets
+ # data_entry_uri = File.join dataset_uri, "dataentry", data_entry_idx
+ ntriples << "<#{dataset_uri}> <#{RDF::OT.dataEntry}> _:dataentry#{data_entry_idx} ."
+ ntriples << "_:dataentry#{data_entry_idx} <#{RDF.type}> <#{RDF::OT.DataEntry}> ."
+ ntriples << "_:dataentry#{data_entry_idx} <#{RDF::OT.compound}> <#{compound_uri}> ."
+ ntriples << "_:dataentry#{data_entry_idx} <#{RDF::OT.values}> _:values#{data_entry_idx} ."
+ ntriples << "_:values#{data_entry_idx} <#{RDF::OT.feature}> <#{features[i].uri}> ."
+ ntriples << "_:values#{data_entry_idx} <#{RDF::OT.value}> \"#{nt_escape.call(v)}\" ."
+
+ data_entry_idx += 1
+
+ end
+
+ end
+
+ ntriples << "<#{dataset_uri}> <#{RDF::OT.Warnings}> \"#{@warnings.collect{ |w| nt_escape.call(w) }.join('\n')}\" ."
+ ntriples.join("\n")
+ end
+
+ def feature_type(value)
+ # Classifies a single cell value by the feature type it implies.
+ # Blank values yield nil.
+ return nil if value.blank?
+ # Values answering true to numeric? map to NumericFeature, everything else to NominalFeature.
+ return RDF::OT.NumericFeature if value.numeric?
+
+ RDF::OT.NominalFeature
+ end
+
+ end
+
+ # Create a new resource: convert the request body to N-Triples where needed, store it, and return the new URI
+ post "/dataset/?" do
+ #begin
+ case @content_type
+ when "text/plain", "text/turtle", "application/rdf+xml" # no conversion needed
+ when "text/csv"
+ @body = parse_csv @body
+ @content_type = "text/plain"
+ when "application/vnd.ms-excel"
+ xls = params[:file][:tempfile].path + ".xls"
+ File.rename params[:file][:tempfile].path, xls # roo needs these endings
+ @body = parse_csv Excel.new(xls).to_csv
+ @content_type = "text/plain"
+ when "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+ xlsx = params[:file][:tempfile].path + ".xlsx"
+ File.rename params[:file][:tempfile].path, xlsx # roo needs these endings
+ @body = parse_csv Excelx.new(xlsx).to_csv
+ @content_type = "text/plain"
+ when "application/vnd.oasis.opendocument.spreadsheet"
+ ods = params[:file][:tempfile].path + ".ods"
+ File.rename params[:file][:tempfile].path, ods # roo needs these endings
+ @body = parse_csv Openoffice.new(ods).to_csv # .ods files need roo's OpenDocument reader, not the xlsx one
+ @content_type = "text/plain"
+ when "chemical/x-mdl-sdfile"
+ @body = parse_sdf @body
+ @content_type = "text/plain"
+ else
+ bad_request_error "#{@content_type} is not a supported content type."
+ end
+ uri = uri("/#{SERVICE}/#{SecureRandom.uuid}") # NOTE(review): parse_table generates its own UUID for the dataset subject, so it differs from this graph URI — confirm intended
+ FourStore.put(uri, @body, @content_type)
+ if params[:file]
+ nt = "<#{uri}> <#{RDF::DC.title}> \"#{params[:file][:filename]}\".\n<#{uri}> <#{RDF::OT.hasSource}> \"#{params[:file][:filename]}\"."
+ FourStore.post(uri, nt, "text/plain")
+ end
+ #rescue
+ #bad_request_error $!.message
+ #end
+
+ #dataset.add_metadata({
+ #DC.title => File.basename(params[:file][:filename],".csv"),
+ #OT.hasSource => File.basename(params[:file][:filename])
+ #})
+ response['Content-Type'] = "text/uri-list"
+ uri
end
- # Get a dataset feature
- # @param [Header] Accept one of `application/rdf+xml or application-x-yaml` (default application/rdf+xml)
- # @return [application/rdf+xml,application/x-yaml] Feature metadata
- get %r{/(\d+)/feature/(.*)$} do |id,feature|
+ # Create or update a resource: stores the request body under the dataset URI built from :id
+ put "/dataset/:id/?" do
+ FourStore.put uri("/#{SERVICE}/#{params[:id]}"), @body, @content_type
+ end
+ # Get metadata of the dataset
+ # @return [application/rdf+xml] Metadata OWL-DL
+ get '/dataset/:id/metadata' do # NOTE(review): the route body is empty in this revision
end
# Get a list of all features
- # @param [Header] Accept one of `application/rdf+xml, application-x-yaml, text/uri-list` (default application/rdf+xml)
- # @return [application/rdf+xml, application-x-yaml, text/uri-list] Feature list
- get '/:id/features' do
+ # @param [Header] Accept one of `application/rdf+xml, text/turtle, text/plain, text/uri-list` (default application/rdf+xml)
+ # @return [application/rdf+xml, text/turtle, text/plain, text/uri-list] Feature list
+ get '/dataset/:id/features' do
+ accept = request.env['HTTP_ACCEPT'] # raw Accept header value; compared verbatim against the strings below
+ uri = uri "/dataset/#{params[:id]}" # graph URI used in the FROM clause of both queries
+ case accept
+ when "application/rdf+xml", "text/turtle", "text/plain"
+ sparql = "CONSTRUCT {?s ?p ?o.} FROM <#{uri}> WHERE {?s <#{RDF.type}> <#{RDF::OT.Feature}>; ?p ?o. }"
+ when "text/uri-list"
+ sparql = "SELECT DISTINCT ?s FROM <#{uri}> WHERE {?s <#{RDF.type}> <#{RDF::OT.Feature}>. }"
+ else
+ bad_request_error "'#{accept}' is not a supported content type." # NOTE(review): also reached when the header is absent, although the route comment advertises a default — confirm
+ end
+ FourStore.query sparql, accept
end
# Get a list of all compounds
# @return [text/uri-list] Feature list
- get '/:id/compounds' do
+ get '/dataset/:id/compounds' do
+ accept = request.env['HTTP_ACCEPT'] # raw Accept header value; compared verbatim against the strings below
+ uri = uri "/dataset/#{params[:id]}" # graph URI used in the FROM clause of both queries
+ case accept
+ when "application/rdf+xml", "text/turtle", "text/plain"
+ sparql = "CONSTRUCT {?s ?p ?o.} FROM <#{uri}> WHERE {?s <#{RDF.type}> <#{RDF::OT.Compound}>; ?p ?o. }"
+ when "text/uri-list"
+ sparql = "SELECT DISTINCT ?s FROM <#{uri}> WHERE {?s <#{RDF.type}> <#{RDF::OT.Compound}>. }"
+ else
+ bad_request_error "'#{accept}' is not a supported content type." # also reached when no Accept header is sent (accept is nil)
+ end
+ FourStore.query sparql, accept
end
end
end
@@ -102,33 +321,6 @@ helpers do
when "application/rdf+xml"
dataset.load_rdfxml_file(params[:file][:tempfile], @subjectid)
- when "text/csv"
- dataset.load_csv(params[:file][:tempfile].read, @subjectid)
- dataset.add_metadata({
- DC.title => File.basename(params[:file][:filename],".csv"),
- OT.hasSource => File.basename(params[:file][:filename])
- })
-
- when /ms-excel/
- extension = File.extname(params[:file][:filename])
- case extension
- when ".xls"
- xls = params[:file][:tempfile].path + ".xls"
- File.rename params[:file][:tempfile].path, xls # roo needs these endings
- book = Excel.new xls
- when ".xlsx"
- xlsx = params[:file][:tempfile].path + ".xlsx"
- File.rename params[:file][:tempfile].path, xlsx # roo needs these endings
- book = Excel.new xlsx
- else
- raise "#{params[:file][:filename]} is not a valid Excel input file."
- end
- dataset.load_spreadsheet(book, @subjectid)
- dataset.add_metadata({
- DC.title => File.basename(params[:file][:filename],extension),
- OT.hasSource => File.basename(params[:file][:filename])
- })
-
else
raise "MIME type \"#{params[:file][:type]}\" not supported."
end
diff --git a/config.ru b/config.ru
index 39d9385..a8d4293 100644
--- a/config.ru
+++ b/config.ru
@@ -1,4 +1,5 @@
SERVICE = "dataset"
require 'bundler'
Bundler.require
-run OpenTox::Service
+require './application.rb'
+run OpenTox::Application