From 00d70503797a4a9aa64a2c92643c07f261e3a5b0 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 28 Jun 2012 12:19:54 +0000 Subject: csv parser added --- application.rb | 268 +++++++++++++++++++++++++++++++++++++++++++++++++-------- config.ru | 3 +- 2 files changed, 232 insertions(+), 39 deletions(-) diff --git a/application.rb b/application.rb index b3e3843..e6d89b3 100644 --- a/application.rb +++ b/application.rb @@ -1,26 +1,245 @@ +#require "./parser.rb" module OpenTox class Application < Service - # Get metadata of the dataset - # @return [application/rdf+xml] Metadata OWL-DL - get '/:id/metadata' do + @warnings = [] + + helpers do + def parse_csv(csv) + parse_table CSV.parse(csv) + end + + def parse_sdf(sdf) + + obconversion = OpenBabel::OBConversion.new + obmol = OpenBabel::OBMol.new + obconversion.set_in_and_out_formats "sdf", "inchi" + + table = [] + + properties = [] + sdf.each_line { |l| properties << l.to_s if l.match(//,'').strip.chomp } + properties.insert 0, "InChI" + table[0] = properties + + rec = 0 + sdf.split(/\$\$\$\$\r*\n/).each do |s| + rec += 1 + table << [] + begin + # TODO: use compound service + obconversion.read_string obmol, s + table.last << obconversion.write_string(obmol).gsub(/\s/,'').chomp + rescue + # TODO: Fix, will lead to follow up errors + table.last << "Could not convert structure at record #{rec}) have been ignored! \n#{s}" + end + obmol.get_data.each { |d| table.last[table.first.index(d.get_attribute)] = d.get_value } + end + parse_table table + end + + def parse_table table + + @warnings = [] + dataset_uri = File.join(uri("/dataset"), SecureRandom.uuid) + #ntriples = [] + ntriples = ["<#{dataset_uri}> <#{RDF.type}> <#{RDF::OT.Dataset}>."] + + # features + feature_names = table.shift.collect{|f| f.strip} + @warnings << "Duplicated features in table header." 
unless feature_names.size == feature_names.uniq.size + compound_format = feature_names.shift.strip + bad_request_error "#{compound_format} is not a supported compound format. Accepted formats: URI, SMILES, InChI." unless compound_format =~ /URI|URL|SMILES|InChI/i + features = [] + ignored_feature_indices = [] + feature_names.each_with_index do |f,i| + # TODO search for existing features + feature = OpenTox::Feature.new File.join($feature[:uri], SecureRandom.uuid) + feature[RDF.type] = RDF::OT.Feature + feature[RDF::DC.title] = f + features << feature + values = table.collect{|row| row[i+1].strip unless row[i+1].nil?}.uniq # skip compound column + if values.size <= 3 # max classes + feature[RDF.type] = RDF::OT.NominalFeature + feature[RDF.type] = RDF::OT.StringFeature + feature[RDF::OT.acceptValue] = values + else + types = values.collect{|v| feature_type(v)} + if types.include?(RDF::OT.NominalFeature) + @warnings << "Feature '#{f}' contains nominal and numeric values." + #ignored_feature_indices << i + #next + else + feature[RDF.type] = RDF::OT.NumericFeature + end + end + feature.save + case feature[RDF.type].class.to_s + when "Array" + feature[RDF.type].each{ |t| ntriples << "<#{feature.uri}> <#{RDF.type}> <#{t}>." } + when "String" + ntriples << "<#{feature.uri}> <#{RDF.type}> <#{feature[RDF.type]}>." 
+ end + end + + # remove invalid features from table +# puts ignored_feature_indices.inspect +# ignored_feature_indices.each do |i| +# features.delete_at(i) +# table.each{|row| row.delete_at(i)} +# end + + # compounds and values + compound_uris = [] + data_entry_idx = 0 + table.each_with_index do |values,j| + values.collect!{|v| v.strip unless v.nil?} + compound = values.shift + begin + case compound_format + when /URI|URL/i + compound_uri = compound + when /SMILES/i + compound_uri = OpenTox::Compound.from_smiles($compound[:uri], compound).uri + when /InChI/i + compound_uri = OpenTox::Compound.from_inchi($compound[:uri], URI.decode_www_form_component(compound)).uri + end + @warnings << "Duplicated compound #{compound} at position #{j+2}, entries are accepted, assuming that measurements come from independent experiments." if compound_uris.include? compound_uri + rescue + @warnings << "Cannot parse compound #{compound} at position #{j+2}, all entries are ignored." + next + end + unless values.size == features.size + @warnings << "Number of values at position #{j+2} (#{values.size}) is different than header size (#{features.size}), all entries are ignored." + next + end + ntriples << "<#{compound_uri}> <#{RDF.type}> <#{RDF::OT.Compound}>." + + values.each_with_index do |v,i| + @warnings << "Empty value for compound '#{compound}' (row #{j+2}) and feature '#{feature_names[i]}' (column #{i+2})." if v.blank? + + # TODO multiple values, use data_entry/value uris for sorted datasets + # data_entry_uri = File.join dataset_uri, "dataentry", data_entry_idx + ntriples << "<#{dataset_uri}> <#{RDF::OT.dataEntry}> _:dataentry#{data_entry_idx} ." + ntriples << "_:dataentry#{data_entry_idx} <#{RDF.type}> <#{RDF::OT.DataEntry}> ." + ntriples << "_:dataentry#{data_entry_idx} <#{RDF::OT.compound}> <#{compound_uri}> ." + ntriples << "_:dataentry#{data_entry_idx} <#{RDF::OT.values}> _:values#{data_entry_idx} ." 
+ ntriples << "_:values#{data_entry_idx} <#{RDF::OT.feature}> <#{features[i].uri}> ." + ntriples << "_:values#{data_entry_idx} <#{RDF::OT.value}> \"#{v}\" ." + + data_entry_idx += 1 + + end + + end + + ntriples << "<#{dataset_uri}> <#{RDF::OT.Warnings}> \"#{@warnings.join('\n')}\" ." + ntriples.join("\n") + end + + def feature_type(value) + if value.blank? + nil + elsif value.numeric? + RDF::OT.NumericFeature + else + RDF::OT.NominalFeature + end + end + + end + + # Create a new resource + post "/dataset/?" do + #begin + case @content_type + when "text/plain", "text/turtle", "application/rdf+xml" # no conversion needed + when "text/csv" + @body = parse_csv @body + @content_type = "text/plain" + when "application/vnd.ms-excel" + xls = params[:file][:tempfile].path + ".xls" + File.rename params[:file][:tempfile].path, xls # roo needs these endings + @body = parse_csv Excel.new(xls).to_csv + @content_type = "text/plain" + when "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + xlsx = params[:file][:tempfile].path + ".xlsx" + File.rename params[:file][:tempfile].path, xlsx # roo needs these endings + @body = parse_csv Excelx.new(xlsx).to_csv + @content_type = "text/plain" + when "application/vnd.oasis.opendocument.spreadsheet" + ods = params[:file][:tempfile].path + ".ods" + File.rename params[:file][:tempfile].path, ods # roo needs these endings + @body = parse_csv Excelx.new(ods).to_csv + @content_type = "text/plain" + when "chemical/x-mdl-sdfile" + @body = parse_sdf @body + @content_type = "text/plain" + else + bad_request_error "#{@content_type} is not a supported content type." + end + uri = uri("/#{SERVICE}/#{SecureRandom.uuid}") + FourStore.put(uri, @body, @content_type) + if params[:file] + nt = "<#{uri}> <#{RDF::DC.title}> \"#{params[:file][:filename]}\".\n<#{uri}> <#{RDF::OT.hasSource}> \"#{params[:file][:filename]}\"." 
+ FourStore.post(uri, nt, "text/plain") + end + #rescue + #bad_request_error $!.message + #end + + #dataset.add_metadata({ + #DC.title => File.basename(params[:file][:filename],".csv"), + #OT.hasSource => File.basename(params[:file][:filename]) + #}) + response['Content-Type'] = "text/uri-list" + uri end - # Get a dataset feature - # @param [Header] Accept one of `application/rdf+xml or application-x-yaml` (default application/rdf+xml) - # @return [application/rdf+xml,application/x-yaml] Feature metadata - get %r{/(\d+)/feature/(.*)$} do |id,feature| + # Create or update a resource + put "/dataset/:id/?" do + FourStore.put uri("/#{SERVICE}/#{params[:id]}"), @body, @content_type + end + # Get metadata of the dataset + # @return [application/rdf+xml] Metadata OWL-DL + get '/dataset/:id/metadata' do end # Get a list of all features - # @param [Header] Accept one of `application/rdf+xml, application-x-yaml, text/uri-list` (default application/rdf+xml) - # @return [application/rdf+xml, application-x-yaml, text/uri-list] Feature list - get '/:id/features' do + # @param [Header] Accept one of `application/rdf+xml, text/turtle, text/plain, text/uri-list` (default application/rdf+xml) + # @return [application/rdf+xml, text/turtle, text/plain, text/uri-list] Feature list + get '/dataset/:id/features' do + accept = request.env['HTTP_ACCEPT'] + uri = uri "/dataset/#{params[:id]}" + case accept + when "application/rdf+xml", "text/turtle", "text/plain" + sparql = "CONSTRUCT {?s ?p ?o.} FROM <#{uri}> WHERE {?s <#{RDF.type}> <#{RDF::OT.Feature}>; ?p ?o. }" + when "text/uri-list" + sparql = "SELECT DISTINCT ?s FROM <#{uri}> WHERE {?s <#{RDF.type}> <#{RDF::OT.Feature}>. }" + else + bad_request_error "'#{accept}' is not a supported content type." 
+ end + FourStore.query sparql, accept end # Get a list of all compounds # @return [text/uri-list] Feature list - get '/:id/compounds' do + get '/dataset/:id/compounds' do + accept = request.env['HTTP_ACCEPT'] + uri = uri "/dataset/#{params[:id]}" + case accept + when "application/rdf+xml", "text/turtle", "text/plain" + sparql = "CONSTRUCT {?s ?p ?o.} FROM <#{uri}> WHERE {?s <#{RDF.type}> <#{RDF::OT.Compound}>; ?p ?o. }" + when "text/uri-list" + sparql = "SELECT DISTINCT ?s FROM <#{uri}> WHERE {?s <#{RDF.type}> <#{RDF::OT.Compound}>. }" + else + bad_request_error "'#{accept}' is not a supported content type." + end + FourStore.query sparql, accept end end end @@ -102,33 +321,6 @@ helpers do when "application/rdf+xml" dataset.load_rdfxml_file(params[:file][:tempfile], @subjectid) - when "text/csv" - dataset.load_csv(params[:file][:tempfile].read, @subjectid) - dataset.add_metadata({ - DC.title => File.basename(params[:file][:filename],".csv"), - OT.hasSource => File.basename(params[:file][:filename]) - }) - - when /ms-excel/ - extension = File.extname(params[:file][:filename]) - case extension - when ".xls" - xls = params[:file][:tempfile].path + ".xls" - File.rename params[:file][:tempfile].path, xls # roo needs these endings - book = Excel.new xls - when ".xlsx" - xlsx = params[:file][:tempfile].path + ".xlsx" - File.rename params[:file][:tempfile].path, xlsx # roo needs these endings - book = Excel.new xlsx - else - raise "#{params[:file][:filename]} is not a valid Excel input file." - end - dataset.load_spreadsheet(book, @subjectid) - dataset.add_metadata({ - DC.title => File.basename(params[:file][:filename],extension), - OT.hasSource => File.basename(params[:file][:filename]) - }) - else raise "MIME type \"#{params[:file][:type]}\" not supported." 
end diff --git a/config.ru b/config.ru index 39d9385..a8d4293 100644 --- a/config.ru +++ b/config.ru @@ -1,4 +1,5 @@ SERVICE = "dataset" require 'bundler' Bundler.require -run OpenTox::Service +require './application.rb' +run OpenTox::Application -- cgit v1.2.3