lib/dataset.rb


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140

# Get all datasets
get "/dataset/?" do
  datasets = Dataset.all
  case @accept
  when "text/uri-list"
    uri_list = datasets.collect{|dataset| uri("/dataset/#{dataset.id}")}
    return uri_list.join("\n") + "\n"
  when "application/json"
    datasets = JSON.parse datasets.to_json
    list = []
    datasets.each{|d| list << uri("/dataset/#{d["_id"]["$oid"]}")}
    return list.to_json
  else
    bad_request_error "Mime type #{@accept} is not supported."
  end
end

# Get a dataset
get "/dataset/:id/?" do
  if Task.where(id: params[:id]).exists?
    task = Task.find params[:id]
    halt 404, "Dataset with id: #{params[:id]} not found." unless task.percent == 100
    $logger.debug task.inspect
    response['Content-Type'] = "text/csv"
    m = Model::Validation.find task.model_id
    dataset = Batch.find task.dataset_id
    @ids = dataset.ids
    warnings = dataset.warnings.blank? ? nil : dataset.warnings.join("\n")
    unless warnings.nil?
      @parse = []
      warnings.split("\n").each do |warning|
        if warning =~ /^Cannot/
          smi = warning.split("SMILES compound").last.split("at").first
          line = warning.split("SMILES compound").last.split("at line").last.split("of").first.strip.to_i
          @parse << "Cannot parse SMILES compound#{smi}at line #{line} of #{dataset.source.split("/").last}\n"
        end
      end
      keys_array = []
      warnings.split("\n").each do |warning|
        if warning =~ /^Duplicate/
          text = warning.split("ID").first
          numbers = warning.split("ID").last.split("and")
          keys_array << numbers.collect{|n| n.strip.to_i}
        end
      end
      @dups = {}
      keys_array.each do |keys|
        keys.each do |key|
          @dups[key] = "Duplicate compound at ID #{keys.join(" and ")}\n"
        end
      end
    end
    $logger.debug "dups: #{@dups}"
    endpoint = "#{m.endpoint}_(#{m.species})"
    tempfile = Tempfile.new
    header = task.csv
    lines = []
    $logger.debug task.predictions
    task.predictions[m.id.to_s].each_with_index do |hash,idx|
      identifier = hash.keys[0]
      prediction_id = hash.values[0]
      # add duplicate warning at the end of a line if ID matches
      if @dups[idx+1]
        if prediction_id.is_a? BSON::ObjectId
          if @ids.blank?
            lines << "#{idx+1},#{identifier},#{Prediction.find(prediction_id).csv.tr("\n","")},#{@dups[idx+1]}"
          else
            lines << "#{idx+1},#{@ids[idx]},#{identifier},#{Prediction.find(prediction_id).csv.tr("\n","")},#{@dups[idx+1]}"
          end
        else
          if @ids.blank?
            lines << "#{idx+1},#{identifier},\n"
          else
            lines << "#{idx+1},#{@ids[idx]}#{identifier},\n"
          end
        end
      else
        if prediction_id.is_a? BSON::ObjectId
          if @ids.blank?
            lines << "#{idx+1},#{identifier},#{Prediction.find(prediction_id).csv}"
          else
            lines << "#{idx+1},#{@ids[idx]},#{identifier},#{Prediction.find(prediction_id).csv}"
          end
        else
          if @ids.blank?
            lines << "#{idx+1},#{identifier},\n"
          else
            lines << "#{idx+1},#{@ids[idx]}#{identifier},\n"
          end
        end
      end
    end
    (@parse && !@parse.blank?) ? tempfile.write(header+lines.join("")+"\n"+@parse.join("\n")) : tempfile.write(header+lines.join(""))
    #tempfile.write(header+lines.join(""))
    tempfile.rewind
    ########################
=begin
    header = task.csv
    lines = []
    task.predictions.each_with_index do |result,idx|
      identifier = result[0]
      prediction_id = result[1]
      prediction = Prediction.find prediction_id
      lines << "#{idx+1},#{identifier},#{prediction.csv.tr("\n","")}"
    end
    return header+lines.join("\n")
=end
    return tempfile.read
  else
    dataset = Dataset.find :id => params[:id]
    halt 400, "Dataset with id: #{params[:id]} not found." unless dataset
    case @accept
    when "application/json"
      dataset.data_entries.each do |k, v|
        dataset.data_entries[k][:URI] = uri("/substance/#{k}")
      end
      dataset[:URI] = uri("/dataset/#{dataset.id}")
      dataset[:substances] = uri("/dataset/#{dataset.id}/substances")
      dataset[:features] = uri("/dataset/#{dataset.id}/features")
      return dataset.to_json
    when "text/csv", "application/csv"
      return dataset.to_csv
    else
      bad_request_error "Mime type #{@accept} is not supported."
    end
  end
end

# Get a dataset attribute. One of compounds, nanoparticles, substances, features 
get "/dataset/:id/:attribute/?" do
  if Task.where(id: params[:id]).exists?
    halt 400, "No attributes selection available for dataset with id: #{params[:id]}.".to_json
  end
  dataset = Dataset.find :id => params[:id]
  halt 400,  "Dataset with id: #{params[:id]} not found." unless dataset
  attribs = ["compounds", "nanoparticles", "substances", "features"]
  return "Attribute '#{params[:attribute]}' is not available. Choose one of #{attribs.join(', ')}." unless attribs.include? params[:attribute]
  out = dataset.send("#{params[:attribute]}")
  return out.to_json
end