summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author mguetlein <martin.guetlein@gmail.com> 2012-06-15 14:01:44 +0200
committer mguetlein <martin.guetlein@gmail.com> 2012-06-15 14:01:44 +0200
commit bbb27b29c92bdd2b2c709281468b28e71ef7cd3c (patch)
tree e8a4b9fc942247de1f2048a12a10b544d0b9873f
parent af9354a8a10c1fda8e11006fca0574b0eb5e51e6 (diff)
dataset add() speedup, handle missing values in dataframe to dataset, modified debug messages
-rw-r--r-- lib/dataset.rb 30
-rw-r--r-- lib/r-util.rb 27
-rw-r--r-- lib/task.rb 4
3 files changed, 33 insertions, 28 deletions
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 2c8c73a..7a398c3 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -315,9 +315,11 @@ module OpenTox
# @param [String] compound Compound URI
# @param [String] feature Feature URI
# @param [Boolean,Float] value Feature value
- def add (compound,feature,value)
- @compounds << compound unless @compounds.include? compound
- @features[feature] = {} unless @features[feature]
+ def add (compound,feature,value,skip_compound_feature_check=false)
+ unless skip_compound_feature_check
+ @compounds << compound unless @compounds.include? compound
+ @features[feature] = {} unless @features[feature]
+ end
@data_entries[compound] = {} unless @data_entries[compound]
@data_entries[compound][feature] = [] unless @data_entries[compound][feature]
@data_entries[compound][feature] << value if value!=nil
@@ -374,23 +376,21 @@ module OpenTox
LOGGER.debug "split dataset using "+compounds.size.to_s+"/"+@compounds.size.to_s+" compounds"
raise "no new compounds selected" unless compounds and compounds.size>0
dataset = OpenTox::Dataset.create(CONFIG[:services]["opentox-dataset"],subjectid)
- if features.size==0
- compounds.each{ |c| dataset.add_compound(c) }
- else
- compounds.each do |c|
- features.each do |f|
- if @data_entries[c]==nil or @data_entries[c][f]==nil
- dataset.add(c,f,nil)
- else
- @data_entries[c][f].each do |v|
- dataset.add(c,f,v)
- end
+ compounds.each{ |c| dataset.add_compound(c) }
+ compounds.each do |c|
+ features.each do |f|
+ if @data_entries[c]==nil or @data_entries[c][f]==nil
+ dataset.add(c,f,nil,true)
+ else
+ @data_entries[c][f].each do |v|
+ dataset.add(c,f,v,true)
end
end
end
end
# set feature metadata in new dataset accordingly (including accept values)
features.each do |f|
+ dataset.features[f] = {}
self.features[f].each do |k,v|
dataset.features[f][k] = v
end
@@ -461,7 +461,7 @@ module OpenTox
data_combined.add_feature(f,m)
compounds.each do |c|
dataset.data_entries[c][f].each do |v|
- data_combined.add(c,f,v)
+ data_combined.add(c,f,v,true)
end if dataset.data_entries[c] and dataset.data_entries[c][f]
end
end
diff --git a/lib/r-util.rb b/lib/r-util.rb
index f9591d5..8e99724 100644
--- a/lib/r-util.rb
+++ b/lib/r-util.rb
@@ -282,7 +282,7 @@ module OpenTox
metadata[DC.title] = "Training dataset split of "+dataset.uri
train = split_to_dataset( df, split, metadata, subjectid ){ |i| i==1 }
metadata[DC.title] = "Test dataset split of "+dataset.uri
- test = split_to_dataset( df, split, metadata, subjectid ){ |i| i==0 }
+ test = split_to_dataset( df, split, metadata, subjectid, missing_values ){ |i| i==0 }
return train, test
end
end
@@ -292,7 +292,7 @@ module OpenTox
# takes duplicates into account
# replaces missing values with param <missing_values>
# returns dataframe-variable-name in R
- def dataset_to_dataframe( dataset, missing_value="NA", subjectid=nil, features=nil )
+ def dataset_to_dataframe( dataset, missing_values="NA", subjectid=nil, features=nil )
LOGGER.debug "r-util> convert dataset to dataframe #{dataset.uri}"
# count duplicates
@@ -341,7 +341,7 @@ module OpenTox
raise "wtf" if i>0
v = ""
end
- v = missing_value if v.size()==0
+ v = missing_values if v.size()==0
c_values << v
end
d_values << c_values
@@ -360,7 +360,7 @@ module OpenTox
features.each do |f|
type = dataset.features[f][RDF.type]
unless type
- LOGGER.debug "derive feature type by rest-call"
+ LOGGER.debug "r-util> derive feature type by rest-call"
feat = OpenTox::Feature.find(f,subjectid)
type = feat.metadata[RDF.type]
end
@@ -384,13 +384,15 @@ module OpenTox
# converts a dataframe into a dataset (a new dataset is created at the dataset webservice)
# this is only possible if a superset of the dataframe was created by dataset_to_dataframe (metadata and URIs!)
- def dataframe_to_dataset( df, metadata={}, subjectid=nil )
- dataframe_to_dataset_indices( df, metadata, subjectid, nil)
+ def dataframe_to_dataset( df, metadata={}, subjectid=nil, missing_values="NA" )
+ dataframe_to_dataset_indices( df, metadata, subjectid, nil, missing_values )
end
private
- def dataframe_to_dataset_indices( df, metadata={}, subjectid=nil, compound_indices=nil )
+ def dataframe_to_dataset_indices( df, metadata={}, subjectid=nil, compound_indices=nil, missing_values="NA" )
raise unless @@feats[df].size>0
+
+ missing_value_regexp = Regexp.new("^#{missing_values.to_s=="0" ? "(0.0|0)" : missing_values.to_s}$")
values, compound_names, features = pull_dataframe(df)
compounds = compound_names.collect{|c| c.split("$")[0]}
features.each{|f| raise unless @@feats[df][f]}
@@ -400,16 +402,19 @@ module OpenTox
compounds.size.times{|i| dataset.add_compound(compounds[i]) if compound_indices==nil or compound_indices.include?(i)}
features.each{|f| dataset.add_feature(f,@@feats[df][f])}
features.size.times do |c|
+ LOGGER.debug "r-util> dataframe to dataset - feature #{c+1} / #{features.size}" if
+ c%25==0 && (features.size*compounds.size)>100000
type = @@feats[df][features[c]][RDF.type]
unless type
- LOGGER.debug "derive feature type by rest-call"
+ LOGGER.debug "r-util> derive feature type by rest-call"
feat = OpenTox::Feature.find(features[c],subjectid)
type = feat.metadata[RDF.type]
end
nominal = type.to_a.flatten.include?(OT.NominalFeature)
compounds.size.times do |r|
if compound_indices==nil or compound_indices.include?(r)
- dataset.add(compounds[r],features[c],nominal ? values[r][c] : values[r][c].to_f) if values[r][c]!="NA"
+ dataset.add(compounds[r],features[c],nominal ? values[r][c] : values[r][c].to_f, true) if
+ values[r][c]!="NA" and !(values[r][c] =~ missing_value_regexp)
end
end
end
@@ -417,10 +422,10 @@ module OpenTox
dataset
end
- def split_to_dataset( df, split, metadata={}, subjectid=nil )
+ def split_to_dataset( df, split, metadata={}, subjectid=nil, missing_values="NA" )
indices = []
split.size.times{|i| indices<<i if yield(split[i]) }
- dataset = dataframe_to_dataset_indices( df, metadata, subjectid, indices )
+ dataset = dataframe_to_dataset_indices( df, metadata, subjectid, indices, missing_values )
LOGGER.debug("r-util> split into #{dataset.uri}, c:#{dataset.compounds.size}, f:#{dataset.features.size}")
dataset
end
diff --git a/lib/task.rb b/lib/task.rb
index 86d3d5f..c235a68 100644
--- a/lib/task.rb
+++ b/lib/task.rb
@@ -63,7 +63,7 @@ module OpenTox
LOGGER.debug "Task #{task.uri} started #{Time.now}"
begin
result = yield task
- LOGGER.debug "Task #{task.uri} done #{Time.now} -> "+result.to_s
+ LOGGER.debug "Task #{task.uri} done #{Time.now} -> #{result.to_s.gsub("\n"," \\n ")}"
task.completed(result)
rescue => error
LOGGER.error "task failed: "+error.class.to_s+": "+error.message
@@ -265,7 +265,7 @@ module OpenTox
end
end
waiting_task.waiting_for(nil) if waiting_task
- LOGGER.debug "Task '"+@metadata[OT.hasStatus].to_s+"': "+@uri.to_s.chomp+", Result: "+@metadata[OT.resultURI].to_s
+ LOGGER.debug "Task '"+@metadata[OT.hasStatus].to_s+"': "+@uri.to_s.chomp+", Result: #{@metadata[OT.resultURI].to_s.gsub("\n"," \\n ")}"
end
# updates percentageCompleted value (can only be increased)