diff options
Diffstat (limited to 'lib')
-rw-r--r-- | lib/merge.rb | 10 | ||||
-rw-r--r-- | lib/r-util.rb | 82 | ||||
-rw-r--r-- | lib/stratification.R | 123 |
3 files changed, 5 insertions, 210 deletions
diff --git a/lib/merge.rb b/lib/merge.rb index f30a3c1..bc6e1a7 100644 --- a/lib/merge.rb +++ b/lib/merge.rb @@ -31,6 +31,11 @@ module Lib return merge_count(object)>1 end + def self.merge_count( object ) + @@merge_count[object] = 1 if @@merge_count[object]==nil + return @@merge_count[object] + end + def self.merge_objects( object1, object2 ) raise "classes not equal : "+object1.class.to_s+" != "+object2.class.to_s if object1.class != object2.class object_class = object1.class @@ -137,11 +142,6 @@ module Lib {:value => value, :variance => variance } end - def self.merge_count( object ) - @@merge_count[object] = 1 if @@merge_count[object]==nil - return @@merge_count[object] - end - def self.set_merge_count(object, merge_count) @@merge_count[object] = merge_count end diff --git a/lib/r-util.rb b/lib/r-util.rb deleted file mode 100644 index 0d58389..0000000 --- a/lib/r-util.rb +++ /dev/null @@ -1,82 +0,0 @@ -# pending: package dir hack --------- -# CONFIG[:base_dir] = "/home/<user>/opentox-ruby/www" -# PACKAGE_DIR = "/home/<user>/opentox-ruby/r-packages" -package_dir = CONFIG[:base_dir].split("/") -package_dir[-1] = "r-packages" -package_dir = package_dir.join("/") -PACKAGE_DIR = package_dir - - - -module Lib - - module RUtil - - def self.dataset_to_dataframe( dataset ) - LOGGER.debug "convert dataset to dataframe #{dataset.uri}" - all_features = [] - dataset.features.each do |f| - feat_name = "feature_#{f[0].split("/")[-1]}" - LOGGER.debug "- adding feature: #{feat_name}" - feat = OpenTox::Feature.find(f[0]) - nominal = feat.metadata[RDF.type].to_a.flatten.include?(OT.NominalFeature) - values = [] - dataset.compounds.each do |c| - val = dataset.data_entries[c][f[0]] - raise "not yet implemented" if val!=nil && val.size>1 - v = val==nil ? "" : val[0].to_s - v = "NA" if v.size()==0 - values << v - end - all_features << feat_name - @@r.assign feat_name,values - @@r.eval "#{feat_name} <- as.numeric(#{feat_name})" unless nominal - end - df_name = "df_#{dataset.uri.split("/")[-1].split("?")[0]}" - cmd = "#{df_name} <- data.frame(#{all_features.join(",")})" - @@r.eval cmd - #@@r.eval "head(#{df_name})" - df_name - end - - def self.stratified_split( dataframe, pct=0.3, seed=42 ) - @@r.eval "set.seed(#{seed})" - @@r.eval "split <- stratified_split(#{dataframe}, ratio=#{pct})" - split = @@r.pull 'split' - split.collect{|s| s.to_i} - end - - def self.package_installed?( package ) - @@r.eval ".libPaths(\"#{PACKAGE_DIR}\")" - p = @@r.pull "installed.packages()[,1]" - p.include?(package) - end - - def self.install_packages( package ) - unless package_installed? package - @@r.eval "install.packages(\"#{package}\", repos=\"http://cran.r-project.org\", dependencies=T, lib=\"#{PACKAGE_DIR}\")" - end - end - - def self.library( package ) - install_packages( package ) - @@r.eval "library(\"#{package}\")" - end - - def self.init_r - @@r = RinRuby.new(true,false) unless defined?(@@r) and @@r - library("sampling") - library("gam") - @@r.eval "source(\"#{PACKAGE_DIR}/stratification.R\")" - end - - def self.quit_r - begin - @@r.quit - @@r = nil - rescue - end - end - - end -end diff --git a/lib/stratification.R b/lib/stratification.R deleted file mode 100644 index 9aa8d1f..0000000 --- a/lib/stratification.R +++ /dev/null @@ -1,123 +0,0 @@ -library("sampling") -library("gam") - -nominal_to_binary <- function( orig_data ) -{ - data = as.data.frame( orig_data ) - result = NULL - for (i in 1:ncol(data)) - { - #print(i) - if (is.numeric( data[,i] ) ) - { - if (is.null(result)) - result = data.frame(data[,i]) - else - result = data.frame(result, data[,i]) - colnames(result)[ncol(result)] <- colnames(data)[i] - } - else - { - vals = unique(data[,i]) - for (j in 1:length(vals)) - { - #print(j) - bins = c() - for (k in 1:nrow(data)) - { - if(data[,i][k] == vals[j]) - bins = c(bins,1) - else - bins = c(bins,0) - } - #print(bins) - if (is.null(result)) - result = data.frame(bins) - else - result = data.frame(result, bins) - colnames(result)[ncol(result)] <- paste(colnames(data)[i],"is",vals[j]) - if (length(vals)==2) break - } - } - } - result -} - -process_data <- function( data ) -{ - if (!is.numeric(data)) - data.num = nominal_to_binary(data) - else - data.num = data - if(any(is.na(data.num))) - data.repl = na.gam.replace(data.num) - else - data.repl = data.num - data.repl -} - -stratified_split <- function( data, ratio=0.3 ) -{ - data.processed = as.matrix(process_data( data )) - pik = rep(ratio,times=nrow(data.processed)) - data.strat = cbind(pik,data.processed) - samplecube(data.strat,pik,order=2,comment=F) -} - -stratified_k_fold_split <- function( data, num_folds=10 ) -{ - print(paste(num_folds,"-fold-split, data-size",nrow(data))) - data.processed = as.matrix(process_data( data )) - folds = rep(0, times=nrow(data)) - for (i in 1:(num_folds-1)) - { - prop = 1/(num_folds-(i-1)) - print(paste("fold",i,"/",num_folds," prop",prop)) - pik = rep(prop,times=nrow(data)) - for (j in 1:nrow(data)) - if(folds[j]!=0) - pik[j]=0 - data.strat = cbind(pik,data.processed) - s<-samplecube(data.strat,pik,order=2,comment=F) - print(paste("fold size: ",sum(s))) - for (j in 1:nrow(data)) - if (s[j] == 1) - folds[j]=i - } - for (j in 1:nrow(data)) - if (folds[j] == 0) - folds[j]=num_folds - folds -} - -plot_split <- function( data, split ) -{ - data.processed = process_data( data ) - data.pca <- prcomp(data.processed, scale=TRUE) - data.2d =as.data.frame(data.pca$x)[1:2] - plot( NULL, - xlim = extendrange(data.2d[,1]), ylim = extendrange(data.2d[,2]), - xlab = "pc 1", ylab = "pc 2") - for (j in 0:max(split)) - { - set = c() - for (i in 1:nrow(data)) - if (split[i] == j) - set = c(set,i) - points(data.2d[set,], pch = 2, col=(j+1)) - } -} - -#a<-matrix(rnorm(100, mean=50, sd=4), ncol=5) -#b<-matrix(rnorm(5000, mean=0, sd=10), ncol=5) -#data<-rbind(a,b) -#c<-matrix(rnorm(50, mean=-50, sd=2), ncol=5) -#data<-rbind(data,c) -#data=iris -#split = stratified_k_fold_split(data, num_folds=3) -#split = stratified_split(data, ratio=0.3) -#plot_split(data,split) - - - - |