diff options
author | mguetlein <martin.guetlein@gmail.com> | 2012-06-15 14:01:44 +0200 |
---|---|---|
committer | mguetlein <martin.guetlein@gmail.com> | 2012-06-15 14:01:44 +0200 |
commit | bbb27b29c92bdd2b2c709281468b28e71ef7cd3c (patch) | |
tree | e8a4b9fc942247de1f2048a12a10b544d0b9873f | |
parent | af9354a8a10c1fda8e11006fca0574b0eb5e51e6 (diff) |
dataset add() speedup, handle missing values in dataframe-to-dataset conversion, modified debug messages
-rw-r--r-- | lib/dataset.rb | 30 | ||||
-rw-r--r-- | lib/r-util.rb | 27 | ||||
-rw-r--r-- | lib/task.rb | 4 |
3 files changed, 33 insertions, 28 deletions
diff --git a/lib/dataset.rb b/lib/dataset.rb index 2c8c73a..7a398c3 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -315,9 +315,11 @@ module OpenTox # @param [String] compound Compound URI # @param [String] feature Compound URI # @param [Boolean,Float] value Feature value - def add (compound,feature,value) - @compounds << compound unless @compounds.include? compound - @features[feature] = {} unless @features[feature] + def add (compound,feature,value,skip_compound_feature_check=false) + unless skip_compound_feature_check + @compounds << compound unless @compounds.include? compound + @features[feature] = {} unless @features[feature] + end @data_entries[compound] = {} unless @data_entries[compound] @data_entries[compound][feature] = [] unless @data_entries[compound][feature] @data_entries[compound][feature] << value if value!=nil @@ -374,23 +376,21 @@ module OpenTox LOGGER.debug "split dataset using "+compounds.size.to_s+"/"+@compounds.size.to_s+" compounds" raise "no new compounds selected" unless compounds and compounds.size>0 dataset = OpenTox::Dataset.create(CONFIG[:services]["opentox-dataset"],subjectid) - if features.size==0 - compounds.each{ |c| dataset.add_compound(c) } - else - compounds.each do |c| - features.each do |f| - if @data_entries[c]==nil or @data_entries[c][f]==nil - dataset.add(c,f,nil) - else - @data_entries[c][f].each do |v| - dataset.add(c,f,v) - end + compounds.each{ |c| dataset.add_compound(c) } + compounds.each do |c| + features.each do |f| + if @data_entries[c]==nil or @data_entries[c][f]==nil + dataset.add(c,f,nil,true) + else + @data_entries[c][f].each do |v| + dataset.add(c,f,v,true) end end end end # set feature metadata in new dataset accordingly (including accept values) features.each do |f| + dataset.features[f] = {} self.features[f].each do |k,v| dataset.features[f][k] = v end @@ -461,7 +461,7 @@ module OpenTox data_combined.add_feature(f,m) compounds.each do |c| dataset.data_entries[c][f].each do |v| - data_combined.add(c,f,v) 
+ data_combined.add(c,f,v,true) end if dataset.data_entries[c] and dataset.data_entries[c][f] end end diff --git a/lib/r-util.rb b/lib/r-util.rb index f9591d5..8e99724 100644 --- a/lib/r-util.rb +++ b/lib/r-util.rb @@ -282,7 +282,7 @@ module OpenTox metadata[DC.title] = "Training dataset split of "+dataset.uri train = split_to_dataset( df, split, metadata, subjectid ){ |i| i==1 } metadata[DC.title] = "Test dataset split of "+dataset.uri - test = split_to_dataset( df, split, metadata, subjectid ){ |i| i==0 } + test = split_to_dataset( df, split, metadata, subjectid, missing_values ){ |i| i==0 } return train, test end end @@ -292,7 +292,7 @@ module OpenTox # takes duplicates into account # replaces missing values with param <missing_value> # returns dataframe-variable-name in R - def dataset_to_dataframe( dataset, missing_value="NA", subjectid=nil, features=nil ) + def dataset_to_dataframe( dataset, missing_values="NA", subjectid=nil, features=nil ) LOGGER.debug "r-util> convert dataset to dataframe #{dataset.uri}" # count duplicates @@ -341,7 +341,7 @@ module OpenTox raise "wtf" if i>0 v = "" end - v = missing_value if v.size()==0 + v = missing_values if v.size()==0 c_values << v end d_values << c_values @@ -360,7 +360,7 @@ module OpenTox features.each do |f| type = dataset.features[f][RDF.type] unless type - LOGGER.debug "derive feature type by rest-call" + LOGGER.debug "r-util> derive feature type by rest-call" feat = OpenTox::Feature.find(f,subjectid) type = feat.metadata[RDF.type] end @@ -384,13 +384,15 @@ module OpenTox # converts a dataframe into a dataset (a new dataset is created at the dataset webservice) # this is only possible if a superset of the dataframe was created by dataset_to_dataframe (metadata and URIs!) 
- def dataframe_to_dataset( df, metadata={}, subjectid=nil ) - dataframe_to_dataset_indices( df, metadata, subjectid, nil) + def dataframe_to_dataset( df, metadata={}, subjectid=nil, missing_values="NA" ) + dataframe_to_dataset_indices( df, metadata, subjectid, nil, missing_values ) end private - def dataframe_to_dataset_indices( df, metadata={}, subjectid=nil, compound_indices=nil ) + def dataframe_to_dataset_indices( df, metadata={}, subjectid=nil, compound_indices=nil, missing_values="NA" ) raise unless @@feats[df].size>0 + + missing_value_regexp = Regexp.new("^#{missing_values.to_s=="0" ? "(0.0|0)" : missing_values.to_s}$") values, compound_names, features = pull_dataframe(df) compounds = compound_names.collect{|c| c.split("$")[0]} features.each{|f| raise unless @@feats[df][f]} @@ -400,16 +402,19 @@ module OpenTox compounds.size.times{|i| dataset.add_compound(compounds[i]) if compound_indices==nil or compound_indices.include?(i)} features.each{|f| dataset.add_feature(f,@@feats[df][f])} features.size.times do |c| + LOGGER.debug "r-util> dataframe to dataset - feature #{c+1} / #{features.size}" if + c%25==0 && (features.size*compounds.size)>100000 type = @@feats[df][features[c]][RDF.type] unless type - LOGGER.debug "derive feature type by rest-call" + LOGGER.debug "r-util> derive feature type by rest-call" feat = OpenTox::Feature.find(features[c],subjectid) type = feat.metadata[RDF.type] end nominal = type.to_a.flatten.include?(OT.NominalFeature) compounds.size.times do |r| if compound_indices==nil or compound_indices.include?(r) - dataset.add(compounds[r],features[c],nominal ? values[r][c] : values[r][c].to_f) if values[r][c]!="NA" + dataset.add(compounds[r],features[c],nominal ? 
values[r][c] : values[r][c].to_f, true) if + values[r][c]!="NA" and !(values[r][c] =~ missing_value_regexp) end end end @@ -417,10 +422,10 @@ module OpenTox dataset end - def split_to_dataset( df, split, metadata={}, subjectid=nil ) + def split_to_dataset( df, split, metadata={}, subjectid=nil, missing_values="NA" ) indices = [] split.size.times{|i| indices<<i if yield(split[i]) } - dataset = dataframe_to_dataset_indices( df, metadata, subjectid, indices ) + dataset = dataframe_to_dataset_indices( df, metadata, subjectid, indices, missing_values ) LOGGER.debug("r-util> split into #{dataset.uri}, c:#{dataset.compounds.size}, f:#{dataset.features.size}") dataset end diff --git a/lib/task.rb b/lib/task.rb index 86d3d5f..c235a68 100644 --- a/lib/task.rb +++ b/lib/task.rb @@ -63,7 +63,7 @@ module OpenTox LOGGER.debug "Task #{task.uri} started #{Time.now}" begin result = yield task - LOGGER.debug "Task #{task.uri} done #{Time.now} -> "+result.to_s + LOGGER.debug "Task #{task.uri} done #{Time.now} -> #{result.to_s.gsub("\n"," \\n ")}" task.completed(result) rescue => error LOGGER.error "task failed: "+error.class.to_s+": "+error.message @@ -265,7 +265,7 @@ module OpenTox end end waiting_task.waiting_for(nil) if waiting_task - LOGGER.debug "Task '"+@metadata[OT.hasStatus].to_s+"': "+@uri.to_s.chomp+", Result: "+@metadata[OT.resultURI].to_s + LOGGER.debug "Task '"+@metadata[OT.hasStatus].to_s+"': "+@uri.to_s.chomp+", Result: #{@metadata[OT.resultURI].to_s.gsub("\n"," \\n ")}" end # updates percentageCompleted value (can only be increased) |