diff options
author | mguetlein <martin.guetlein@gmail.com> | 2012-11-20 12:07:38 +0100 |
---|---|---|
committer | mguetlein <martin.guetlein@gmail.com> | 2012-11-20 12:07:38 +0100 |
commit | 1a4c9bbc80bf66a70cd43540304c126f5a144436 (patch) | |
tree | 8c8a000da12a8be39893be2fd098cc36533c6abc | |
parent | d743957d03b3d4eabcbc365316bbda651118375f (diff) |
dataset: adjust split, remove merge, add convenience method for working with indices; model: fix usage of predicted variable; r-util: adjust dataset usage
-rw-r--r-- | lib/dataset.rb | 157 | ||||
-rw-r--r-- | lib/model.rb | 33 | ||||
-rw-r--r-- | lib/r-util.rb | 66 |
3 files changed, 137 insertions, 119 deletions
diff --git a/lib/dataset.rb b/lib/dataset.rb index 3c5fa7f..8c33526 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -173,7 +173,7 @@ module OpenTox RestClientWrapper.get(u,{:accept=> "text/uri-list", :subjectid => subjectid}).to_s.each_line do |compound_uri| @compounds << compound_uri.chomp end - @compounds.uniq! + @compounds end # Load and return only features from the dataset service @@ -193,7 +193,10 @@ module OpenTox # @return [Array] return array with strings, nil if value is not set (e.g. when feature is numeric) def accept_values(feature) accept_values = features[feature][OT.acceptValue] - accept_values.sort if accept_values + if accept_values + accept_values = accept_values.collect{|av| av.to_s} + accept_values.sort! + end accept_values end @@ -316,6 +319,7 @@ module OpenTox # @param [String] feature Compound URI # @param [Boolean,Float] value Feature value def add (compound,feature,value) + LOGGER.warn("dataset.add is deprecated and should not be used any longer") self.add_compound(compound) self.add_data_entry(compound,feature,value) end @@ -376,80 +380,115 @@ module OpenTox #@compounds << compound unless @compounds.include? compound end + protected + # returns array with (row-)indices of a compound (in the compounds arrays) + def compound_indices( compound ) + unless defined?(@cmp_indices) and @cmp_indices.has_key?(compound) + @cmp_indices = {} + @compounds.size.times do |i| + c = @compounds[i] + if @cmp_indices[c]==nil + @cmp_indices[c] = [i] + else + @cmp_indices[c] = @cmp_indices[c]+[i] + end + end + end + @cmp_indices[compound] + end + + # returns index in data_entries-array for the compound-index + def entry_index( compound_index ) + unless defined?(@entry_indices) and @entry_indices.has_key?(compound_index) + @entry_indices = {} + @compounds.size.times do |i| + @entry_indices[i] = compound_indices(@compounds[i]).index(i) + end + end + @entry_indices[compound_index] + end + public + + # maps a compound-index from another dataset to a compound-index from this dataset + # mapping works as follows: + # (compound c is the compound identified by the compound-index of the other dataset) + # * c occurs only once in this dataset? map compound-index of other dataset to index in this dataset + # * c occurs >1 in this dataset? + # ** number of occurences is equal in both datasets? assume order is preserved(!) and map accordingly + # ** number of occurences is not equal in both datasets? cannot map, raise error + # @param [OpenTox::Dataset] dataset that should be mapped to this dataset (fully loaded) + # @param [Fixnum] compound_index, corresponding to dataset + def compound_index( dataset, compound_index ) + unless defined?(@index_map) and @index_map[dataset.uri] + map = {} + dataset.compounds.uniq.each do |compound| + self_indices = compound_indices(compound) + next unless self_indices + dataset_indices = dataset.compound_indices(compound) + if self_indices.size==1 + dataset_indices.size.times do |i| + map[dataset_indices[i]] = self_indices[0] + end + elsif self_indices.size==dataset_indices.size + # we do assume that the order is preseverd! + dataset_indices.size.times do |i| + map[dataset_indices[i]] = self_indices[i] + end + else + raise "cannot map compound #{compound} from dataset #{dataset.uri} to dataset #{uri}, "+ + "compound occurs #{dataset_indices.size} times and #{self_indices.size} times" + end + end + @index_map = {} unless defined?(@index_map) + @index_map[dataset.uri] = map + end + @index_map[dataset.uri][compound_index] + end + + # returns data entry value for a compound-index and given feature + # @param [Fixnum] compound_index + # @param [OpenTox::Feature] feature + def data_entry_value( compound_index, feature ) + raise "please give compound index instead of '#{compound_index}'" unless compound_index.is_a?(Fixnum) + c = @compounds[compound_index] + if @data_entries[c]==nil + nil + elsif @data_entries[c][feature]==nil + nil + else + @data_entries[c][feature][entry_index(compound_index)] + end + end + # Creates a new dataset, by splitting the current dataset, i.e. using only a subset of compounds and features - # @param [Array] compounds List of compound URIs + # @param [Array] compound_indices List of compound indices # @param [Array] features List of feature URIs # @param [Hash] metadata Hash containing the metadata for the new dataset # @param [String] subjectid # @return [OpenTox::Dataset] newly created dataset, already saved - def split( compounds, features, metadata, subjectid=nil) - LOGGER.debug "split dataset using "+compounds.size.to_s+"/"+@compounds.size.to_s+" compounds" - raise "no new compounds selected" unless compounds and compounds.size>0 + def split( compound_indices, features, metadata, subjectid=nil) + raise "Dataset.split : pls give compounds as indices" if compound_indices.size==nil or !compound_indices[0].is_a?(Fixnum) + LOGGER.debug "split dataset using "+compound_indices.size.to_s+"/"+@compounds.size.to_s+" compounds" dataset = OpenTox::Dataset.create(CONFIG[:services]["opentox-dataset"],subjectid) - if features.size==0 - compounds.each{ |c| dataset.add_compound(c) } - else - compounds.each do |c| + features.each{|f| dataset.add_feature(f,@features[f])} + compound_indices.each do |c_idx| + c = @compounds[c_idx] + dataset.add_compound(c) + if @data_entries[c] features.each do |f| - if @data_entries[c]==nil or @data_entries[c][f]==nil - dataset.add(c,f,nil) + if @data_entries[c][f] + dataset.add_data_entry c,f,@data_entries[c][f][entry_index(c_idx)] else - @data_entries[c][f].each do |v| - dataset.add(c,f,v) - end + dataset.add_data_entry c,f,nil end end end end - # set feature metadata in new dataset accordingly (including accept values) - features.each do |f| - self.features[f].each do |k,v| - dataset.features[f][k] = v - end - end dataset.add_metadata(metadata) dataset.save(subjectid) dataset end - # merges two dataset into a new dataset (by default uses all compounds and features) - # precondition: both datasets are fully loaded - # @param [OpenTox::Dataset] dataset1 to merge - # @param [OpenTox::Dataset] dataset2 to merge - # @param [Hash] metadata - # @param [optional,String] subjectid - # @param [optional,Array] features1, if specified only this features of dataset1 are used - # @param [optional,Array] features2, if specified only this features of dataset2 are used - # @param [optional,Array] compounds1, if specified only this compounds of dataset1 are used - # @param [optional,Array] compounds2, if specified only this compounds of dataset2 are used - # example: if you want no features from dataset2, give empty array as features2 - def self.merge( dataset1, dataset2, metadata, subjectid=nil, features1=nil, features2=nil, compounds1=nil, compounds2=nil ) - features1 = dataset1.features.keys unless features1 - features2 = dataset2.features.keys unless features2 - compounds1 = dataset1.compounds unless compounds1 - compounds2 = dataset2.compounds unless compounds2 - data_combined = OpenTox::Dataset.create(CONFIG[:services]["opentox-dataset"],subjectid) - LOGGER.debug("merging datasets #{dataset1.uri} and #{dataset2.uri} to #{data_combined.uri}") - [[dataset1, features1, compounds1], [dataset2, features2, compounds2]].each do |dataset,features,compounds| - compounds.each{|c| data_combined.add_compound(c)} - features.each do |f| - m = dataset.features[f] - m[OT.hasSource] = dataset.uri unless m[OT.hasSource] - data_combined.add_feature(f,m) - compounds.each do |c| - dataset.data_entries[c][f].each do |v| - data_combined.add(c,f,v) - end if dataset.data_entries[c] and dataset.data_entries[c][f] - end - end - end - metadata = {} unless metadata - metadata[OT.hasSource] = "Merge from #{dataset1.uri} and #{dataset2.uri}" unless metadata[OT.hasSource] - data_combined.add_metadata(metadata) - data_combined.save(subjectid) - data_combined - end - # Save dataset at the dataset service # - creates a new dataset if uri is not set # - overwrites dataset if uri exists diff --git a/lib/model.rb b/lib/model.rb index 065b227..908cf03 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -197,6 +197,16 @@ module OpenTox @metadata[OT.parameters].collect{|p| p[OT.paramValue] if p[DC.title] == param}.compact.first end + private + def value_feature_uri + File.join( @uri, "predicted", "value") + end + + def confidence_feature_uri + File.join( @uri, "predicted", "confidence") + end + public + # Predict a dataset # @param [String] dataset_uri Dataset URI # @param [optional,subjectid] @@ -209,8 +219,13 @@ module OpenTox OT.hasSource => @uri, DC.creator => @uri, DC.title => URI.decode(File.basename( @metadata[OT.dependentVariables] )), - OT.parameters => [{DC.title => "dataset_uri", OT.paramValue => dataset_uri}] + OT.parameters => [{DC.title => "dataset_uri", OT.paramValue => dataset_uri}], + OT.dependentVariables => @metadata[OT.dependentVariables], + OT.predictedVariables => @prediction_dataset.metadata[OT.predictedVariables] }) + @prediction_dataset.add_feature(value_feature_uri, {DC.title => "Model prediction for #{@metadata[OT.dependentVariables]}"}) + @prediction_dataset.add_feature(confidence_feature_uri, {DC.title => "Confidence"}) + d = Dataset.new(dataset_uri,subjectid) d.load_compounds(subjectid) count = 0 @@ -288,12 +303,6 @@ module OpenTox } prediction = eval(modul).send(algorthm, pred_params) - value_feature_uri = File.join( @uri, "predicted", "value") - confidence_feature_uri = File.join( @uri, "predicted", "confidence") - - @prediction_dataset.metadata[OT.dependentVariables] = @metadata[OT.dependentVariables] unless @prediction_dataset.metadata[OT.dependentVariables] - @prediction_dataset.metadata[OT.predictedVariables] = [value_feature_uri, confidence_feature_uri] unless @prediction_dataset.metadata[OT.predictedVariables] - @prediction_dataset.add_compound @compound.uri if OpenTox::Feature.find(metadata[OT.dependentVariables], subjectid).feature_type == "classification" @prediction_dataset.add_data_entry @compound.uri, value_feature_uri, @value_map[prediction[:prediction].to_s] @@ -305,10 +314,7 @@ module OpenTox confidence=((confidence+1.0)/2.0).abs end @prediction_dataset.add_data_entry @compound.uri, confidence_feature_uri, confidence - - @prediction_dataset.features[value_feature_uri][DC.title] = @prediction_dataset.metadata[DC.title] - @prediction_dataset.features[confidence_feature_uri][DC.title] = "Confidence" - + if verbose if @feature_calculation_algorithm == "Substructure.match" f = 0 @@ -397,10 +403,11 @@ module OpenTox if @activities[@compound.uri] @prediction_dataset.add_compound @compound.uri if OpenTox::Feature.find(metadata[OT.dependentVariables], subjectid).feature_type == "classification" - @activities[@compound.uri].each { |act| @prediction_dataset.add_data_entry @compound.uri, @metadata[OT.dependentVariables], @value_map[act.to_s] } + @activities[@compound.uri].each { |act| @prediction_dataset.add_data_entry @compound.uri, value_feature_uri, @value_map[act.to_s] } else - @activities[@compound.uri].each { |act| @prediction_dataset.add_data_entry @compound.uri, @metadata[OT.dependentVariables], act } + @activities[@compound.uri].each { |act| @prediction_dataset.add_data_entry @compound.uri, value_feature_uri, act } end + @activities[@compound.uri].each { |act| @prediction_dataset.add_data_entry @compound.uri, confidence_feature_uri, 1 } @prediction_dataset.add_metadata(OT.hasSource => @metadata[OT.trainingDataset]) @prediction_dataset.save(subjectid) true diff --git a/lib/r-util.rb b/lib/r-util.rb index cc70696..632268a 100644 --- a/lib/r-util.rb +++ b/lib/r-util.rb @@ -199,7 +199,7 @@ module OpenTox else raise "pct is not a numeric: #{pct}" unless pct.is_a?(Numeric) end - raise "not a loaded ot-dataset" unless dataset.is_a?(OpenTox::Dataset) and dataset.compounds.size>0 and dataset.features.size>0 + raise "not a loaded ot-dataset (#{dataset.class})" unless dataset.is_a?(OpenTox::Dataset) and dataset.compounds.size>0 and dataset.features.size>0 raise "missing_values=#{missing_values}" unless missing_values.is_a?(String) or missing_values==0 raise "subjectid=#{subjectid}" unless subjectid==nil or subjectid.is_a?(String) LOGGER.debug("r-util> apply stratified split to #{dataset.uri}") @@ -246,57 +246,27 @@ module OpenTox def dataset_to_dataframe( dataset, missing_value="NA", subjectid=nil, features=nil ) LOGGER.debug "r-util> convert dataset to dataframe #{dataset.uri}" - # count duplicates - num_compounds = {} - dataset.features.keys.each do |f| - dataset.compounds.each do |c| - if dataset.data_entries[c] - val = dataset.data_entries[c][f] - size = val==nil ? 1 : val.size - num_compounds[c] = num_compounds[c]==nil ? size : [num_compounds[c],size].max - else - num_compounds[c] = 1 - end - end - end - # use either all, or the provided features, sorting is important as col-index := features if features features.sort! else features = dataset.features.keys.sort end - compounds = [] - compound_names = [] - dataset.compounds.each do |c| - count = 0 - num_compounds[c].times do |i| - compounds << c - compound_names << "#{c}$#{count}" - count+=1 - end - end # values into 2D array, then to dataframe d_values = [] - dataset.compounds.each do |c| - num_compounds[c].times do |i| - c_values = [] - features.each do |f| - if dataset.data_entries[c] - val = dataset.data_entries[c][f] - v = val==nil ? "" : val[i].to_s - else - raise "wtf" if i>0 - v = "" - end - v = missing_value if v.size()==0 - c_values << v - end - d_values << c_values + dataset.compounds.size.times do |c_idx| + c_values = [] + features.each do |f| + v = dataset.data_entry_value(c_idx,f) + v = missing_value if v==nil + c_values << v end + d_values << c_values end df_name = "df_#{dataset.uri.split("/")[-1].split("?")[0]}" + + compound_names = dataset.compounds.size.times.collect{|idx| dataset.compounds[idx]+"$"+idx.to_s} assign_dataframe(df_name,d_values,compound_names,features) # set dataframe column types accordingly @@ -336,15 +306,17 @@ module OpenTox dataset = OpenTox::Dataset.create(CONFIG[:services]["opentox-dataset"],subjectid) dataset.add_metadata(metadata) LOGGER.debug "r-util> convert dataframe to dataset #{dataset.uri}" - compounds.size.times{|i| dataset.add_compound(compounds[i]) if compound_indices==nil or compound_indices.include?(i)} features.each{|f| dataset.add_feature(f,@@feats[df][f])} - features.size.times do |c| + feature_numeric = features.size.times.collect do |c| feat = OpenTox::Feature.find(features[c],subjectid) - numeric = feat.metadata[RDF.type].to_a.flatten.include?(OT.NumericFeature) - compounds.size.times do |r| - if compound_indices==nil or compound_indices.include?(r) - dataset.add(compounds[r],features[c],numeric ? values[r][c].to_f : values[r][c]) if values[r][c]!="NA" - end + feat.metadata[RDF.type].to_a.flatten.include?(OT.NumericFeature) + end + compounds.size.times do |r| + if compound_indices==nil or compound_indices.include?(r) + dataset.add_compound(compounds[r]) + features.size.times do |c| + dataset.add_data_entry(compounds[r],features[c],feature_numeric[c] ? values[r][c].to_f : values[r][c]) if values[r][c]!="NA" + end end end dataset.save(subjectid) |