summary | refs | log | tree | commit | diff
path: root/lib
diff options
context:
space:
mode:
author mguetlein <martin.guetlein@gmail.com> 2012-01-31 12:37:36 +0100
committer mguetlein <martin.guetlein@gmail.com> 2012-01-31 12:37:36 +0100
commit 63320057e2a2b2121c5c405c31e2e7b709fa9e44 (patch)
tree 189eaa7dcb29b95853f2f00f1f3291c211edbba7 /lib
parent eda4b9687ca4b093b7f194b6d0b2e58ce7eed3b2 (diff)
adapt validation to r-util (split), add new r plots
Diffstat (limited to 'lib')
-rw-r--r-- lib/merge.rb 10
-rw-r--r-- lib/r-util.rb 82
-rw-r--r-- lib/stratification.R 123
3 files changed, 5 insertions, 210 deletions
diff --git a/lib/merge.rb b/lib/merge.rb
index f30a3c1..bc6e1a7 100644
--- a/lib/merge.rb
+++ b/lib/merge.rb
@@ -31,6 +31,11 @@ module Lib
return merge_count(object)>1
end
+ def self.merge_count( object )
+ @@merge_count[object] = 1 if @@merge_count[object]==nil
+ return @@merge_count[object]
+ end
+
def self.merge_objects( object1, object2 )
raise "classes not equal : "+object1.class.to_s+" != "+object2.class.to_s if object1.class != object2.class
object_class = object1.class
@@ -137,11 +142,6 @@ module Lib
{:value => value, :variance => variance }
end
- def self.merge_count( object )
- @@merge_count[object] = 1 if @@merge_count[object]==nil
- return @@merge_count[object]
- end
-
def self.set_merge_count(object, merge_count)
@@merge_count[object] = merge_count
end
diff --git a/lib/r-util.rb b/lib/r-util.rb
deleted file mode 100644
index 0d58389..0000000
--- a/lib/r-util.rb
+++ /dev/null
@@ -1,82 +0,0 @@
-# pending: package dir hack ---------
-# CONFIG[:base_dir] = "/home/<user>/opentox-ruby/www"
-# PACKAGE_DIR = "/home/<user>/opentox-ruby/r-packages"
-package_dir = CONFIG[:base_dir].split("/")
-package_dir[-1] = "r-packages"
-package_dir = package_dir.join("/")
-PACKAGE_DIR = package_dir
-
-
-
-module Lib
-
- module RUtil
-
- def self.dataset_to_dataframe( dataset )
- LOGGER.debug "convert dataset to dataframe #{dataset.uri}"
- all_features = []
- dataset.features.each do |f|
- feat_name = "feature_#{f[0].split("/")[-1]}"
- LOGGER.debug "- adding feature: #{feat_name}"
- feat = OpenTox::Feature.find(f[0])
- nominal = feat.metadata[RDF.type].to_a.flatten.include?(OT.NominalFeature)
- values = []
- dataset.compounds.each do |c|
- val = dataset.data_entries[c][f[0]]
- raise "not yet implemented" if val!=nil && val.size>1
- v = val==nil ? "" : val[0].to_s
- v = "NA" if v.size()==0
- values << v
- end
- all_features << feat_name
- @@r.assign feat_name,values
- @@r.eval "#{feat_name} <- as.numeric(#{feat_name})" unless nominal
- end
- df_name = "df_#{dataset.uri.split("/")[-1].split("?")[0]}"
- cmd = "#{df_name} <- data.frame(#{all_features.join(",")})"
- @@r.eval cmd
- #@@r.eval "head(#{df_name})"
- df_name
- end
-
- def self.stratified_split( dataframe, pct=0.3, seed=42 )
- @@r.eval "set.seed(#{seed})"
- @@r.eval "split <- stratified_split(#{dataframe}, ratio=#{pct})"
- split = @@r.pull 'split'
- split.collect{|s| s.to_i}
- end
-
- def self.package_installed?( package )
- @@r.eval ".libPaths(\"#{PACKAGE_DIR}\")"
- p = @@r.pull "installed.packages()[,1]"
- p.include?(package)
- end
-
- def self.install_packages( package )
- unless package_installed? package
- @@r.eval "install.packages(\"#{package}\", repos=\"http://cran.r-project.org\", dependencies=T, lib=\"#{PACKAGE_DIR}\")"
- end
- end
-
- def self.library( package )
- install_packages( package )
- @@r.eval "library(\"#{package}\")"
- end
-
- def self.init_r
- @@r = RinRuby.new(true,false) unless defined?(@@r) and @@r
- library("sampling")
- library("gam")
- @@r.eval "source(\"#{PACKAGE_DIR}/stratification.R\")"
- end
-
- def self.quit_r
- begin
- @@r.quit
- @@r = nil
- rescue
- end
- end
-
- end
-end
diff --git a/lib/stratification.R b/lib/stratification.R
deleted file mode 100644
index 9aa8d1f..0000000
--- a/lib/stratification.R
+++ /dev/null
@@ -1,123 +0,0 @@
-library("sampling")
-library("gam")
-
-nominal_to_binary <- function( orig_data )
-{
- data = as.data.frame( orig_data )
- result = NULL
- for (i in 1:ncol(data))
- {
- #print(i)
- if (is.numeric( data[,i] ) )
- {
- if (is.null(result))
- result = data.frame(data[,i])
- else
- result = data.frame(result, data[,i])
- colnames(result)[ncol(result)] <- colnames(data)[i]
- }
- else
- {
- vals = unique(data[,i])
- for (j in 1:length(vals))
- {
- #print(j)
- bins = c()
- for (k in 1:nrow(data))
- {
- if(data[,i][k] == vals[j])
- bins = c(bins,1)
- else
- bins = c(bins,0)
- }
- #print(bins)
- if (is.null(result))
- result = data.frame(bins)
- else
- result = data.frame(result, bins)
- colnames(result)[ncol(result)] <- paste(colnames(data)[i],"is",vals[j])
- if (length(vals)==2) break
- }
- }
- }
- result
-}
-
-process_data <- function( data )
-{
- if (!is.numeric(data))
- data.num = nominal_to_binary(data)
- else
- data.num = data
- if(any(is.na(data.num)))
- data.repl = na.gam.replace(data.num)
- else
- data.repl = data.num
- data.repl
-}
-
-stratified_split <- function( data, ratio=0.3 )
-{
- data.processed = as.matrix(process_data( data ))
- pik = rep(ratio,times=nrow(data.processed))
- data.strat = cbind(pik,data.processed)
- samplecube(data.strat,pik,order=2,comment=F)
-}
-
-stratified_k_fold_split <- function( data, num_folds=10 )
-{
- print(paste(num_folds,"-fold-split, data-size",nrow(data)))
- data.processed = as.matrix(process_data( data ))
- folds = rep(0, times=nrow(data))
- for (i in 1:(num_folds-1))
- {
- prop = 1/(num_folds-(i-1))
- print(paste("fold",i,"/",num_folds," prop",prop))
- pik = rep(prop,times=nrow(data))
- for (j in 1:nrow(data))
- if(folds[j]!=0)
- pik[j]=0
- data.strat = cbind(pik,data.processed)
- s<-samplecube(data.strat,pik,order=2,comment=F)
- print(paste("fold size: ",sum(s)))
- for (j in 1:nrow(data))
- if (s[j] == 1)
- folds[j]=i
- }
- for (j in 1:nrow(data))
- if (folds[j] == 0)
- folds[j]=num_folds
- folds
-}
-
-plot_split <- function( data, split )
-{
- data.processed = process_data( data )
- data.pca <- prcomp(data.processed, scale=TRUE)
- data.2d =as.data.frame(data.pca$x)[1:2]
- plot( NULL,
- xlim = extendrange(data.2d[,1]), ylim = extendrange(data.2d[,2]),
- xlab = "pc 1", ylab = "pc 2")
- for (j in 0:max(split))
- {
- set = c()
- for (i in 1:nrow(data))
- if (split[i] == j)
- set = c(set,i)
- points(data.2d[set,], pch = 2, col=(j+1))
- }
-}
-
-#a<-matrix(rnorm(100, mean=50, sd=4), ncol=5)
-#b<-matrix(rnorm(5000, mean=0, sd=10), ncol=5)
-#data<-rbind(a,b)
-#c<-matrix(rnorm(50, mean=-50, sd=2), ncol=5)
-#data<-rbind(data,c)
-#data=iris
-#split = stratified_k_fold_split(data, num_folds=3)
-#split = stratified_split(data, ratio=0.3)
-#plot_split(data,split)
-
-
-
-