author    rautenberg <rautenberg@in-silico.ch>  2012-07-13 09:45:46 +0200
committer rautenberg <rautenberg@in-silico.ch>  2012-07-13 09:45:46 +0200
commit    ca2903692658ca7badcda425153ed12eb19a2ced
tree      c921872907dd6ce0edaea7c6251c804bc11373f0
parent    6e016d24bf6d0272d235c466e0dab3f196f0c0d4
parent    20ea5f9b62966eecb283033b04f6aea98b23d1f8
Merge branch 'release/v3.2.0' (tag: v4.0.0)
-rw-r--r--  .yardopts            |   1
-rw-r--r--  ChangeLog            |  10
-rw-r--r--  README.markdown      |   2
-rw-r--r--  Rakefile             |   3
-rw-r--r--  VERSION              |   2
-rw-r--r--  lib/algorithm.rb     | 180
-rw-r--r--  lib/compound.rb      |  60
-rw-r--r--  lib/dataset.rb       |  41
-rw-r--r--  lib/model.rb         |  49
-rw-r--r--  lib/parser.rb        |  53
-rw-r--r--  lib/r-util.rb        | 127
-rw-r--r--  lib/serializer.rb    |  92
-rw-r--r--  lib/stratification.R |  78
-rw-r--r--  lib/transform.rb     |   8
-rw-r--r--  lib/utils.rb         | 453
-rw-r--r--  lib/validation.rb    |  10
-rw-r--r--  opentox-ruby.gemspec | 200
17 files changed, 1017 insertions(+), 352 deletions(-)
diff --git a/.yardopts b/.yardopts
new file mode 100644
index 0000000..1217a60
--- /dev/null
+++ b/.yardopts
@@ -0,0 +1 @@
+yardoc - README.markdown ChangeLog LICENSE
diff --git a/ChangeLog b/ChangeLog
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,13 @@
+v4.0.0 2012-07-12
+   * fminer addition of compounds fixed
+   * improved performance for CSV download
+   * switch to opentox-ruby version 4.0.0
+
+2012-04-20
+   * Support for joelib and openbabel descriptors in a completely unified interface with CDK (Ambit)
+   * Features can have multiple types (nominal and numeric), PC descriptors have detailed meta data
+   * Myriads of bugfixes to CSV download code (e.g. missing descriptors, handling of duplicates)
+
 v3.1.0 2012-02-24
    * utils.rb: added for special routines (e.g. descriptor calculation)
    * task.rb: Polling with increasing interval
diff --git a/README.markdown b/README.markdown
index 79bdab2..d69b28f 100644
--- a/README.markdown
+++ b/README.markdown
@@ -38,4 +38,4 @@ This example shows how to create a lazar model and predict a compound, it assume
 Copyright
 ---------
-Copyright (c) 2009-2011 Christoph Helma, Martin Guetlein, Micha Rautenberg, Andreas Maunz, David Vorgrimmler, Denis Gebele. See LICENSE for details.
+Copyright (c) 2009-2012 Christoph Helma, Martin Guetlein, Micha Rautenberg, Andreas Maunz, David Vorgrimmler, Denis Gebele. See LICENSE for details.
diff --git a/Rakefile b/Rakefile
--- a/Rakefile
+++ b/Rakefile
@@ -42,9 +42,10 @@ begin
     gem.add_dependency "dm-migrations", "=1.1.0"
     gem.add_dependency "dm-validations", "=1.1.0"
     gem.add_dependency "dm-sqlite-adapter", "=1.1.0"
-    gem.add_dependency "ruby-plot", "=0.6.0"
+    gem.add_dependency "ruby-plot", "=0.6.1"
     gem.add_dependency "gsl", "=1.14.7"
     gem.add_dependency "statsample", "=1.1.0"
+    gem.add_dependency "redis", "=2.2.2"
     gem.add_development_dependency 'jeweler'
     gem.files = FileList["[A-Z]*", "{bin,generators,lib,test}/**/*", 'lib/jeweler/templates/.gitignore']
diff --git a/VERSION b/VERSION
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-3.1.0
\ No newline at end of file
+4.0.0
diff --git a/lib/algorithm.rb b/lib/algorithm.rb
index c026c56..78fc447 100644
--- a/lib/algorithm.rb
+++ b/lib/algorithm.rb
@@ -56,25 +56,73 @@ module OpenTox
     def check_params(params,per_mil,subjectid=nil)
       raise OpenTox::NotFoundError.new "Please submit a dataset_uri." unless params[:dataset_uri] and !params[:dataset_uri].nil?
-      raise OpenTox::NotFoundError.new "Please submit a prediction_feature." unless params[:prediction_feature] and !params[:prediction_feature].nil?
-      @prediction_feature = OpenTox::Feature.find params[:prediction_feature], subjectid
       @training_dataset = OpenTox::Dataset.find "#{params[:dataset_uri]}", subjectid
+
+      unless params[:prediction_feature] # try to read prediction_feature from dataset
+        raise OpenTox::NotFoundError.new "Please provide a prediction_feature parameter" unless @training_dataset.features.size == 1
+        prediction_feature = OpenTox::Feature.find(@training_dataset.features.keys.first,@subjectid)
+        params[:prediction_feature] = prediction_feature.uri
+      end
+      @prediction_feature = OpenTox::Feature.find params[:prediction_feature], subjectid
+
       raise OpenTox::NotFoundError.new "No feature #{params[:prediction_feature]} in dataset #{params[:dataset_uri]}" unless @training_dataset.features and @training_dataset.features.include?(params[:prediction_feature])
       unless params[:min_frequency].nil?
-        @minfreq=params[:min_frequency].to_i
-        raise "Minimum frequency must be a number >0!" unless @minfreq>0
-      else
-        @minfreq=OpenTox::Algorithm.min_frequency(@training_dataset,per_mil) # AM sugg. 8-10 per mil for BBRC, 50 per mil for LAST
+        # check for percentage
+        if params[:min_frequency].include? "pc"
+          per_mil=params[:min_frequency].gsub(/pc/,"")
+          if OpenTox::Algorithm.numeric? per_mil
+            per_mil = per_mil.to_i * 10
+          else
+            bad_request=true
+          end
+        # check for per-mil
+        elsif params[:min_frequency].include? "pm"
+          per_mil=params[:min_frequency].gsub(/pm/,"")
+          if OpenTox::Algorithm.numeric? per_mil
+            per_mil = per_mil.to_i
+          else
+            bad_request=true
+          end
+        # set minfreq directly
+        else
+          if OpenTox::Algorithm.numeric? params[:min_frequency]
+            @minfreq=params[:min_frequency].to_i
+            LOGGER.debug "min_frequency #{@minfreq}"
+          else
+            bad_request=true
+          end
+        end
+        raise OpenTox::BadRequestError.new "Minimum frequency must be integer [n], or a percentage [n]pc, or a per-mil [n]pm, with n greater than 0" if bad_request
+      end
+      if @minfreq.nil?
+        @minfreq=OpenTox::Algorithm.min_frequency(@training_dataset,per_mil)
+        LOGGER.debug "min_frequency #{@minfreq} (input was #{per_mil} per-mil)"
       end
     end

-    def add_fminer_data(fminer_instance, params, value_map)
+    def add_fminer_data(fminer_instance, value_map)
+
+      # detect nr duplicates per compound
+      compound_sizes = {}
+      @training_dataset.compounds.each do |compound|
+        entries=@training_dataset.data_entries[compound]
+        entries.each do |feature, values|
+          compound_sizes[compound] || compound_sizes[compound] = []
+          compound_sizes[compound] << values.size unless values.size == 0
+        end
+        compound_sizes[compound].uniq!
+        raise "Inappropriate data for fminer" if compound_sizes[compound].size > 1
+        compound_sizes[compound] = compound_sizes[compound][0] # integer instead of array
+      end

       id = 1 # fminer start id is not 0
-      @training_dataset.data_entries.each do |compound,entry|
+
+      @training_dataset.compounds.each do |compound|
+        entry=@training_dataset.data_entries[compound]
         begin
-          smiles = OpenTox::Compound.smiles(compound.to_s)
+          smiles = OpenTox::Compound.new(compound).to_smiles
         rescue
           LOGGER.warn "No resource for #{compound.to_s}"
           next
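The reworked check_params above accepts min_frequency as an absolute integer, a percentage ("[n]pc", stored internally as per-mil, n*10), or a per-mil value ("[n]pm"). A minimal standalone sketch of that parsing rule (hypothetical helper, not part of the patch; plain regexes stand in for OpenTox::Algorithm.numeric?):

    # Hedged sketch of the min_frequency branching introduced above.
    def parse_min_frequency(raw)
      case raw.to_s
      when /\A(\d+)pc\z/ then { :per_mil => $1.to_i * 10 } # percentage -> per-mil
      when /\A(\d+)pm\z/ then { :per_mil => $1.to_i }      # per-mil as given
      when /\A(\d+)\z/   then { :minfreq => $1.to_i }      # absolute minimum frequency
      else raise "Minimum frequency must be integer [n], or a percentage [n]pc, or a per-mil [n]pm, with n greater than 0"
      end
    end

    parse_min_frequency("5pc")  # => { :per_mil => 50 }
    parse_min_frequency("8pm")  # => { :per_mil => 8 }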
+ raise "Inappropriate data for fminer" if compound_sizes[compound].size > 1 + compound_sizes[compound] = compound_sizes[compound][0] # integer instead of array + end id = 1 # fminer start id is not 0 - @training_dataset.data_entries.each do |compound,entry| + + @training_dataset.compounds.each do |compound| + entry=@training_dataset.data_entries[compound] begin - smiles = OpenTox::Compound.smiles(compound.to_s) + smiles = OpenTox::Compound.new(compound).to_smiles rescue LOGGER.warn "No resource for #{compound.to_s}" next @@ -84,32 +132,31 @@ module OpenTox next end - value_map=params[:value_map] unless params[:value_map].nil? entry.each do |feature,values| if feature == @prediction_feature.uri - values.each do |value| - if value.nil? + (0...compound_sizes[compound]).each { |i| + if values[i].nil? LOGGER.warn "No #{feature} activity for #{compound.to_s}." else if @prediction_feature.feature_type == "classification" - activity= value_map.invert[value.to_s].to_i # activities are mapped to 1..n + activity= value_map.invert[values[i]].to_i # activities are mapped to 1..n @db_class_sizes[activity-1].nil? ? @db_class_sizes[activity-1]=1 : @db_class_sizes[activity-1]+=1 # AM effect elsif @prediction_feature.feature_type == "regression" - activity= value.to_f + activity= values[i].to_f end begin - fminer_instance.AddCompound(smiles,id) - fminer_instance.AddActivity(activity, id) + fminer_instance.AddCompound(smiles,id) if fminer_instance + fminer_instance.AddActivity(activity, id) if fminer_instance @all_activities[id]=activity # DV: insert global information @compounds[id] = compound @smi[id] = smiles id += 1 rescue Exception => e - LOGGER.warn "Could not add " + smiles + "\t" + value.to_s + " to fminer" + LOGGER.warn "Could not add " + smiles + "\t" + values[i].to_s + " to fminer" LOGGER.warn e.backtrace end end - end + } end end end @@ -380,11 +427,11 @@ module OpenTox prediction = acts[0] else #LOGGER.debug gram_matrix.to_yaml - @r = RinRuby.new(false,false) # global R instance leads to Socket errors after a large number of requests - @r.eval "set.seed(1)" + @r = RinRuby.new(true,false) # global R instance leads to Socket errors after a large number of requests @r.eval "suppressPackageStartupMessages(library('caret'))" # requires R packages "caret" and "kernlab" @r.eval "suppressPackageStartupMessages(library('doMC'))" # requires R packages "multicore" @r.eval "registerDoMC()" # switch on parallel processing + @r.eval "set.seed(1)" begin # set data @@ -400,7 +447,14 @@ module OpenTox # prepare data LOGGER.debug "Preparing R data ..." - @r.eval "if (class(y) == 'character') { y = factor(y); suppressPackageStartupMessages(library('class')) }" # For classification + @r.eval <<-EOR + weights=NULL + if (class(y) == 'character') { + y = factor(y) + suppressPackageStartupMessages(library('class')) + #weights=unlist(as.list(prop.table(table(y)))) + } + EOR @r.eval <<-EOR rem = nearZeroVar(prop_matrix) @@ -417,8 +471,18 @@ module OpenTox # model + support vectors LOGGER.debug "Creating R SVM model ..." - @r.eval <<-EOR - model = train(prop_matrix,y,method="svmradial",tuneLength=8,trControl=trainControl(method="LGOCV",number=10),preProcess=c("center", "scale")) + train_success = @r.eval <<-EOR + # AM: TODO: evaluate class weight effect by altering: + # AM: comment in 'weights' above run and class.weights=weights vs. 
+            # AM: vs
+            # AM: comment out 'weights' above (status quo), thereby disabling weights
+            model = train(prop_matrix,y,
+                          method="svmradial",
+                          preProcess=c("center", "scale"),
+                          class.weights=weights,
+                          trControl=trainControl(method="LGOCV",number=10),
+                          tuneLength=8
+                         )
             perf = ifelse ( class(y)!='numeric', max(model$results$Accuracy), model$results[which.min(model$results$RMSE),]$Rsquared )
           EOR
@@ -431,6 +495,7 @@

           # censoring
           prediction = nil if ( @r.perf.nan? || @r.perf < min_train_performance )
+          prediction = nil unless train_success
           LOGGER.debug "Performance: #{sprintf("%.2f", @r.perf)}"
         rescue Exception => e
           LOGGER.debug "#{e.class}: #{e.message}"
@@ -456,30 +521,42 @@
       @r.del_missing = params[:del_missing] == true ? 1 : 0
       r_result_file = params[:fds_csv_file].sub("rfe_", "rfe_R_")
       @r.f_fds_r = r_result_file.to_s
-
+
       # need packs 'randomForest', 'RANN'
       @r.eval <<-EOR
-        set.seed(1)
         suppressPackageStartupMessages(library('caret'))
         suppressPackageStartupMessages(library('randomForest'))
         suppressPackageStartupMessages(library('RANN'))
         suppressPackageStartupMessages(library('doMC'))
         registerDoMC()
-
+        set.seed(1)
+
         acts = read.csv(ds_csv_file, check.names=F)
         feats = read.csv(fds_csv_file, check.names=F)
         ds = merge(acts, feats, by="SMILES") # duplicates features for duplicate SMILES :-)
-
+
         features = ds[,(dim(acts)[2]+1):(dim(ds)[2])]
         y = ds[,which(names(ds) == prediction_feature)]
-
+
         # assumes a data matrix 'features' and a vector 'y' of target values
         row.names(features)=NULL
-
+
+        # features with all values missing removed
+        na_col = names ( which ( apply ( features, 2, function(x) all ( is.na ( x ) ) ) ) )
+        features = features[,!names(features) %in% na_col]
+
+        # features with infinite values removed
+        inf_col = names ( which ( apply ( features, 2, function(x) any ( is.infinite ( x ) ) ) ) )
+        features = features[,!names(features) %in% inf_col]
+
+        # features with zero variance removed
+        zero_var = names ( which ( apply ( features, 2, function(x) var(x, na.rm=T) ) == 0 ) )
+        features = features[,!names(features) %in% zero_var]
+
        pp = NULL
        if (del_missing) {
          # needed if rows should be removed
-         na_ids = apply(features,1,function(x)any(is.na(x)))
+         na_ids = apply ( features,1,function(x) any ( is.na ( x ) ) )
          features = features[!na_ids,]
          y = y[!na_ids]
          pp = preProcess(features, method=c("scale", "center"))
@@ -488,17 +565,23 @@
          pp = preProcess(features, method=c("scale", "center", "knnImpute"))
        }
        features = predict(pp, features)
-
+
+        # features with nan values removed (sometimes preProcess return NaN values)
+        nan_col = names ( which ( apply ( features, 2, function(x) any ( is.nan ( x ) ) ) ) )
+        features = features[,!names(features) %in% nan_col]
+
        # determine subsets
-       subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
-       subsets = c(2,3,4,5,7,10,subsets)
+       subsets = dim(features)[2]*c(0.3, 0.32, 0.34, 0.36, 0.38, 0.4, 0.42, 0.44, 0.46, 0.48, 0.5, 0.52, 0.54, 0.56, 0.58, 0.6, 0.62, 0.64, 0.66, 0.68, 0.7)
+       #subsets = dim(features)[2]*c(0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
+       #subsets = c(2,3,4,5,7,10,subsets)
+       #subsets = c(2,3,4,5,7,10,13,16,19,22,25,28,30)
        subsets = unique(sort(round(subsets)))
        subsets = subsets[subsets<=dim(features)[2]]
        subsets = subsets[subsets>1]
-
+
        # Recursive feature elimination
-       rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=50), sizes=subsets)
+       rfProfile = rfe( x=features, y=y, rfeControl=rfeControl(functions=rfFuncs, number=150), sizes=subsets)
+
       # read existing dataset and select most useful features
       csv=feats[,c("SMILES", rfProfile$optVariables)]
       write.csv(x=csv,file=f_fds_r, row.names=F, quote=F, na='')
@@ -527,7 +610,7 @@
     # @param [Hash] required keys: compound, features, feature_dataset_uri, pc_type
     # @return [Hash] Hash with matching Smarts and number of hits
     def self.lookup(params)
-      params[:compound].lookup(params[:features], params[:feature_dataset_uri],params[:pc_type],params[:subjectid])
+      params[:compound].lookup(params[:features], params[:feature_dataset_uri], params[:pc_type], params[:lib], params[:subjectid])
     end
   end
@@ -539,3 +622,26 @@
     end
   end
 end
+
+class Array
+  # collect method extended for parallel processing.
+  # Note: assign return value as: ans = arr.pcollect(n) { |obj| ... }
+  # @param n the number of processes to spawn (default: unlimited)
+  def pcollect(n = nil)
+    nproc = 0
+    result = collect do |*a|
+      r, w = IO.pipe
+      fork do
+        r.close
+        w.write( Marshal.dump( yield(*a) ) )
+      end
+      if n and (nproc+=1) >= n
+        Process.wait ; nproc -= 1
+      end
+      [ w.close, r ].last
+    end
+    Process.waitall
+    result.collect{|r| Marshal.load [ r.read, r.close ].first}
+  end
+end
+
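The Array#pcollect extension added above forks one child process per element (capped at n) and ships each result back through a pipe with Marshal, so block return values must be marshallable. A hedged usage sketch (heavy_descriptor_calc is a placeholder, not a function from this library):

    smiles = ["CCO", "c1ccccc1", "CC(=O)O"]
    # Compute in up to 4 child processes; results come back in input order.
    results = smiles.pcollect(4) { |smi| heavy_descriptor_calc(smi) }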
diff --git a/lib/compound.rb b/lib/compound.rb
index 16d266c..e493278 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -3,6 +3,7 @@
 module OpenTox

+  require "rexml/document"
   # Ruby wrapper for OpenTox Compound Webservices (http://opentox.org/dev/apis/api-1.2/structure).
   class Compound
@@ -134,6 +135,47 @@ module OpenTox
         "not available"
       end
     end
+
+
+    # Get all known compound names sorted by classification. Relies on an external service for name lookups.
+    # @example
+    #   names = compound.to_names_hash
+    # @return [Hash] Classification => Name Array
+    def to_names_hash
+      begin
+        xml = RestClientWrapper.get("#{@@cactus_uri}#{@inchi}/names/xml")
+        xmldoc = REXML::Document.new(xml)
+        data = {}
+
+        xmldoc.root.elements[1].elements.each{|e|
+          if data.has_key?(e.attribute("classification").value) == false
+            data[e.attribute("classification").value] = [e.text]
+          else
+            data[e.attribute("classification").value].push(e.text)
+          end
+        }
+        data
+      rescue
+        "not available"
+      end
+    end
+
+    # Get all known compound names sorted by classification. Relies on an external service (Ambit) for name lookups.
+    # @example
+    #   names = compound.to_ambit_names_hash
+    # @return [String] URI of an Ambit dataset holding the name information
+    def to_ambit_names_hash
+      begin
+        ds = OpenTox::Dataset.new
+        ds.save
+        ds.load_rdfxml(RestClientWrapper.get("http://apps.ideaconsult.net:8080/ambit2/query/compound/search/names?type=smiles&property=&search=#{@inchi}"))
+        ds.save
+        ds.uri
+      rescue
+        "not available"
+      end
+    end
+

     # Match a smarts string
     # @example
@@ -201,25 +243,28 @@ module OpenTox
     # Lookup numerical values, returns hash with feature name as key and value as value
     # @param [Array] Array of feature names
     # @param [String] Feature dataset uri
+    # @param [String] Comma separated pc types
+    # @param [String] Comma separated lib
     # @return [Hash] Hash with feature name as key and value as value
-    def lookup(feature_array,feature_dataset_uri,pc_type,subjectid=nil)
+    def lookup(feature_array,feature_dataset_uri,pc_type,lib,subjectid=nil)
       ds = OpenTox::Dataset.find(feature_dataset_uri,subjectid)
       #entry = ds.data_entries[self.uri]
       entry = nil
-      ds.data_entries.each { |c_uri, values|
-        if c_uri.split('/compound/').last == self.to_inchi
+      ds.data_entries.each { |c_uri, values|
+        compound = OpenTox::Compound.new(c_uri)
+        if compound.to_inchi == self.to_inchi # Compare compounds by InChI
           entry = ds.data_entries[c_uri]
           break
         end
       }
       LOGGER.debug "#{entry.size} entries in feature ds for query." unless entry.nil?
-
       if entry.nil?
-        uri, smiles_to_inchi = OpenTox::Algorithm.get_pc_descriptors({:compounds => [self.uri], :pc_type => pc_type})
-        uri = OpenTox::Algorithm.load_ds_csv(uri, smiles_to_inchi, subjectid)
-        ds = OpenTox::Dataset.find(uri,subjectid)
+        temp_ds = OpenTox::Dataset.create(CONFIG[:services]["opentox-dataset"],subjectid); temp_ds.add_compound(self.uri); temp_uri = temp_ds.save(subjectid)
+        uri = RestClientWrapper.post(File.join(CONFIG[:services]["opentox-algorithm"], "/pc/AllDescriptors"), {:dataset_uri => temp_uri, :pc_type => pc_type, :lib => lib, :subjectid => subjectid})
+        ds = OpenTox::Dataset.find(uri, subjectid)
         entry = ds.data_entries[self.uri]
         ds.delete(subjectid)
+        temp_ds.delete(subjectid)
       end
       features = entry.keys
       features.each { |feature|
@@ -228,7 +273,6 @@ module OpenTox
         entry.delete(feature) unless feature == new_feature # e.g. when loading from ambit
       }
       #res = feature_array.collect {|v| entry[v]}
-      #LOGGER.debug "----- am #{entry.to_yaml}"
       entry
     end
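A hedged usage sketch of the new to_names_hash (assuming a Compound can be constructed from SMILES as elsewhere in this library; the classification keys shown are typical Cactus output, not guaranteed):

    compound = OpenTox::Compound.from_smiles("CCO") # assumption: constructor available in this version
    names = compound.to_names_hash
    # e.g. { "IUPAC Name" => ["ethanol"], "Synonym" => ["ethyl alcohol", "..."] }
    # On any lookup failure the method returns the string "not available".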
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 95c1918..c916722 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -197,7 +197,12 @@ module OpenTox
       accept_values
     end

-    # Detect feature type(s) in the dataset
+    # Detect feature type (reduced to one across all features)
+    # Classification takes precedence over regression
+    # DEPRECATED --
+    # MAKES NO SENSE FOR DATASETS WITH MORE THAN 1 FEATURE
+    # FEATURES CAN HAVE MULTIPLE TYPES
+    # Replacement: see feature_types()
     # @return [String] "classification", "regression", "mixed" or "unknown"
     def feature_type(subjectid=nil)
       load_features(subjectid)
@@ -210,6 +215,24 @@ module OpenTox
         "unknown"
       end
     end
+
+
+    # Detect feature types. A feature can have multiple types.
+    # Returns types hashed by feature URI, with missing features omitted.
+    # Example (YAML):
+    #   http://toxcreate3.in-silico.ch:8082/dataset/152/feature/nHal:
+    #   - http://www.opentox.org/api/1.1#NumericFeature
+    #   - http://www.opentox.org/api/1.1#NominalFeature
+    #   ...
+    #
+    # @return [Hash] Keys: feature URIs, Values: Array of types
+    def feature_types(subjectid=nil)
+      load_features(subjectid)
+      @features.inject({}){ |h,(f,metadata)|
+        h[f]=metadata[RDF.type] unless metadata[RDF.type][0].include? "MissingFeature"
+        h
+      }
+    end
=begin
=end
@@ -316,11 +339,14 @@ module OpenTox
     end

     # Complete feature values by adding zeroes
-    def complete_data_entries
+    # @param [Hash] key: compound, value: duplicate sizes
+    def complete_data_entries(compound_sizes)
       all_features = @features.keys
       @data_entries.each { |c, e|
         (Set.new(all_features.collect)).subtract(Set.new e.keys).to_a.each { |f|
-          self.add(c,f,0)
+          compound_sizes[c].times {
+            self.add(c,f,0)
+          }
         }
       }
     end
@@ -454,6 +480,14 @@ module OpenTox
       end
     end

+    def value_map(prediction_feature_uri)
+      training_classes = accept_values(prediction_feature_uri).sort
+      value_map=Hash.new
+      training_classes.each_with_index { |c,i| value_map[i+1] = c }
+      value_map
+    end
+
+
     private
     # Copy a dataset (rewrites URI)
     def copy(dataset)
@@ -504,6 +538,7 @@ module OpenTox
       @data_entries[compound.uri].collect{|f,v| @features[f] if f.match(/neighbor/)}.compact if @data_entries[compound.uri]
     end

+    #
     # def errors(compound)
     #   features = @data_entries[compound.uri].keys
     #   features.collect{|f| @features[f][OT.error]}.join(" ") if features
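The new Dataset#value_map encodes the sorted accept values of a classification feature as 1-based integers, the mapping that add_fminer_data above consumes via value_map.invert. A sketch with illustrative values:

    # Assuming accept_values(prediction_feature_uri) returns ["active", "inactive"]:
    value_map = dataset.value_map(prediction_feature_uri)
    # => { 1 => "active", 2 => "inactive" }
    value_map.invert["inactive"]  # => 2, the class id handed to fminer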
diff --git a/lib/model.rb b/lib/model.rb
index a858a0f..c9d367e 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -103,7 +103,7 @@ module OpenTox

     include Model

-    attr_accessor :compound, :prediction_dataset, :features, :effects, :activities, :p_values, :fingerprints, :feature_calculation_algorithm, :similarity_algorithm, :prediction_algorithm, :subjectid, :value_map, :compound_fingerprints, :feature_calculation_algorithm, :neighbors
+    attr_accessor :compound, :prediction_dataset, :features, :effects, :activities, :p_values, :fingerprints, :feature_calculation_algorithm, :similarity_algorithm, :prediction_algorithm, :subjectid, :value_map, :compound_fingerprints, :feature_calculation_algorithm, :neighbors, :compounds

     def initialize(uri=nil)
       if uri
@@ -169,12 +169,13 @@ module OpenTox
       lazar.prediction_algorithm = hash["prediction_algorithm"] if hash["prediction_algorithm"]
       lazar.subjectid = hash["subjectid"] if hash["subjectid"]
       lazar.value_map = hash["value_map"] if hash["value_map"]
+      lazar.compounds = hash["compounds"] if hash["compounds"]
       lazar
     end

     def to_json
-      Yajl::Encoder.encode({:uri => @uri,:metadata => @metadata, :compound => @compound, :prediction_dataset => @prediction_dataset, :features => @features, :effects => @effects, :activities => @activities, :p_values => @p_values, :fingerprints => @fingerprints, :feature_calculation_algorithm => @feature_calculation_algorithm, :similarity_algorithm => @similarity_algorithm, :prediction_algorithm => @prediction_algorithm, :subjectid => @subjectid, :value_map => @value_map})
+      Yajl::Encoder.encode({:uri => @uri,:metadata => @metadata, :compound => @compound, :prediction_dataset => @prediction_dataset, :features => @features, :effects => @effects, :activities => @activities, :p_values => @p_values, :fingerprints => @fingerprints, :feature_calculation_algorithm => @feature_calculation_algorithm, :similarity_algorithm => @similarity_algorithm, :prediction_algorithm => @prediction_algorithm, :subjectid => @subjectid, :value_map => @value_map, :compounds => @compounds})
     end

     def run( params, accept_header=nil, waiting_task=nil )
@@ -237,6 +238,7 @@ module OpenTox
       @compound = Compound.new compound_uri
       features = {}
+      #LOGGER.debug self.to_yaml
       unless @prediction_dataset
         @prediction_dataset = Dataset.create(CONFIG[:services]["opentox-dataset"], subjectid)
@@ -247,19 +249,19 @@ module OpenTox
           OT.parameters => [{DC.title => "compound_uri", OT.paramValue => compound_uri}]
         } )
       end
-      if OpenTox::Feature.find(metadata[OT.dependentVariables], subjectid).feature_type == "regression"
-        all_activities = []
-        all_activities = @activities.values.flatten.collect! { |i| i.to_f }
-      end
+
       unless database_activity(subjectid) # adds database activity to @prediction_dataset
+
         # Calculation of needed values for query compound
         @compound_features = eval("#{@feature_calculation_algorithm}({
           :compound => @compound,
           :features => @features,
           :feature_dataset_uri => @metadata[OT.featureDataset],
           :pc_type => self.parameter(\"pc_type\"),
+          :lib => self.parameter(\"lib\"),
           :subjectid => subjectid
         })")
+
         # Adding fingerprint of query compound with features and values(p_value*nr_hits)
         @compound_fingerprints = {}
         @compound_features.each do |feature, value| # value is nil if "Substructure.match"
@@ -314,6 +316,16 @@ module OpenTox
             @prediction_dataset.add @compound.uri, feature_uri, true
             f+=1
           end
+        elsif @feature_calculation_algorithm == "Substructure.lookup"
+          f = 0
+          @compound_features.each do |feature, value|
+            features[feature] = feature
+            @prediction_dataset.add_feature(feature, {
+              RDF.type => [OT.NumericFeature]
+            })
+            @prediction_dataset.add @compound.uri, feature, value
+            f+=1
+          end
         else
           @compound_features.each do |feature|
             features[feature] = feature
@@ -337,15 +349,26 @@ module OpenTox
             else
               feature_uri = feature
             end
-            @prediction_dataset.add neighbor[:compound], feature_uri, true
+            if @feature_calculation_algorithm == "Substructure.lookup"
+              @prediction_dataset.add neighbor[:compound], feature_uri, @fingerprints[neighbor[:compound]][feature_uri]
+            else
+              @prediction_dataset.add neighbor[:compound], feature_uri, true
+            end
+
             unless features.has_key? feature
               features[feature] = feature_uri
-              @prediction_dataset.add_feature(feature_uri, {
-                RDF.type => [OT.Substructure],
-                OT.smarts => feature,
-                OT.pValue => @p_values[feature],
-                OT.effect => @effects[feature]
-              })
+              if @feature_calculation_algorithm == "Substructure.lookup"
+                @prediction_dataset.add_feature(feature_uri, {
+                  RDF.type => [OT.NumericFeature]
+                })
+              else
+                @prediction_dataset.add_feature(feature_uri, {
+                  RDF.type => [OT.Substructure],
+                  OT.smarts => feature,
+                  OT.pValue => @p_values[feature],
+                  OT.effect => @effects[feature]
+                })
+              end
               f+=1
             end
           end
diff --git a/lib/parser.rb b/lib/parser.rb
index 56e4fed..257d250 100644
--- a/lib/parser.rb
+++ b/lib/parser.rb
@@ -349,11 +349,15 @@ module OpenTox

     # Load CSV string (format specification: http://toxcreate.org/help)
     # @param [String] csv CSV representation of the dataset
+    # @param [Boolean] drop_missing Whether completely missing rows should be dropped
+    # @param [Boolean] all_numeric Whether all features should be treated as numeric
+    # @param [Boolean] del_nominal All nominal features will be removed
     # @return [OpenTox::Dataset] Dataset object with CSV data
-    def load_csv(csv, drop_missing=false)
+    def load_csv(csv, drop_missing=false, all_numeric=false)
       row = 0
       input = csv.split("\n")
       headers = split_row(input.shift)
+      headers.collect! {|header| header.to_s.gsub(/[\/.\\\(\)\{\}\[\]]/,"_")}
       add_features(headers)
       value_maps = Array.new
       regression_features=Array.new
@@ -362,7 +366,7 @@ module OpenTox
         row = split_row(row)
         value_maps = detect_new_values(row, value_maps)
         value_maps.each_with_index { |vm,j|
-          if vm.size > @max_class_values # max @max_class_values classes.
+          if (vm.size > @max_class_values) || all_numeric # max @max_class_values classes.
             regression_features[j]=true
           else
             regression_features[j]=false
@@ -392,22 +396,30 @@ module OpenTox

     def warnings
-      info = ''
+      info = '<br>'
       @feature_types.each do |feature,types|
+        @dataset.add_feature_metadata(feature,{RDF.type => []})
         if types.uniq.size == 0
-          type = "helper#MissingFeature"
-        elsif types.uniq.size > 1
-          type = OT.NumericFeature
+          @dataset.add_feature_metadata(
+            feature, {RDF.type => ( @dataset.features[feature][RDF.type] << "helper#MissingFeature" ) } # TODO: Fit to OT ontology!
+          )
+          info += "'#{@dataset.feature_name(feature)}' detected as 'MissingFeature'<br>"
         else
-          type = types.first
+          info += "'#{@dataset.feature_name(feature)}' detected as "
+          types_arr = []
+          types.uniq.each { |t|
+            types_arr << t
+            info += "'#{t.split('#').last}', "
+          }
+
+          @dataset.add_feature_metadata(
+            feature, {RDF.type => types_arr.sort} # nominal should be first for downward compatibility
+          )
+
+          info.chop!.chop!
+          info += "<br>"
         end
-        @dataset.add_feature_metadata(feature,{RDF.type => [type]})
-        info += "\"#{@dataset.feature_name(feature)}\" detected as #{type.split('#').last}." if type
-
-        # TODO: rewrite feature values
-        # TODO if value.to_f == 0 @activity_errors << "#{id} Zero values not allowed for regression datasets - entry ignored." end
-
       end

       @dataset.metadata[OT.Info] = info

       warnings = ''
@@ -469,28 +481,31 @@ module OpenTox
         unless @duplicate_feature_indices.include? i

           value = row[i]
-          #LOGGER.warn "Missing values for #{id}" if value.size == 0 # String is empty
           feature = @features[feature_idx]

           type = feature_type(value) # May be NIL
-          type = OT.NominalFeature unless (type.nil? || regression_features[i])
           @feature_types[feature] << type if type
+          # Add nominal type if #distinct values le @max_class_values
+          if type == OT.NumericFeature
+            @feature_types[feature] << OT.NominalFeature unless regression_features[i]
+          end

           val = nil
           case type
           when OT.NumericFeature
             val = value.to_f
+            val = nil if val.infinite?
           when OT.NominalFeature
             val = value.to_s
           end

           feature_idx += 1

-          if val != nil
+          if val != nil
             @dataset.add(compound.uri, feature, val)
-            if type != OT.NumericFeature
+            if @feature_types[feature].include? OT.NominalFeature
              @dataset.features[feature][OT.acceptValue] = [] unless @dataset.features[feature][OT.acceptValue]
-              @dataset.features[feature][OT.acceptValue] << val.to_s unless @dataset.features[feature][OT.acceptValue].include?(val.to_s)
+              @dataset.features[feature][OT.acceptValue] << val unless @dataset.features[feature][OT.acceptValue].include?(val)
             end
           end
@@ -654,7 +669,7 @@ module OpenTox
           obmol.get_data.each { |d| row[d.get_attribute] = d.get_value if properties.include?(d.get_attribute) }
           table.data[compound.uri] = row
         end
-
+
         # find and remove ignored_features
         @activity_errors = table.clean_features
         table.add_to_dataset @dataset
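The parser now tags a numeric column with OT.NominalFeature as well whenever it has few enough distinct values, instead of collapsing each feature to a single type. A hedged illustration of the rule with plain locals in place of parser state:

    # Illustration only: a numeric feature also counts as nominal when the
    # number of distinct values stays within the class-value limit.
    values           = ["1", "0", "1", "1", "0"]
    max_class_values = 5
    types            = ["http://www.opentox.org/api/1.1#NumericFeature"]
    types << "http://www.opentox.org/api/1.1#NominalFeature" if values.uniq.size <= max_class_values
    # => the feature carries both types, matching the new feature_types() output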
"pca" : "smacof")}')" + #@r.eval "save.image(\"/tmp/image.R\")" + @r.eval "df.2d <- plot_pre_process(df, method='sammon')" waiting_task.progress(75) if waiting_task - if fast_plot - info = "main='PCA-Embedding of #{features.size} features',xlab='PC1',ylab='PC2'" - else - info = "main='SMACOF-Embedding of #{features.size} features',xlab='x',ylab='y'" - end LOGGER.debug("r-util> - plot data") plot_to_files(files) do |file| - @r.eval "plot_split( df.2d, split, names, #{info})" + @r.eval "plot_split( df.2d, split, names, main='Sammon embedding of #{features.size} features',xlab='x',ylab='y')" end end @@ -170,19 +176,68 @@ module OpenTox end end - # stratified splits a dataset into two dataset the feature values + # stratified splits a dataset into two dataset according to the feature values + # all features are taken into account unless <split_features> is given + # returns two datases + def stratified_split( dataset, metadata={}, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil ) + stratified_split_internal( dataset, metadata, missing_values, nil, pct, subjectid, seed, split_features ) + end + + # stratified splits a dataset into k datasets according the feature values # all features are taken into account unless <split_features> is given - def stratified_split( dataset, missing_values="NA", pct=0.3, subjectid=nil, seed=42, split_features=nil ) + # returns two arrays of datasets + def stratified_k_fold_split( dataset, metadata={}, missing_values="NA", num_folds=10, subjectid=nil, seed=42, split_features=nil ) + stratified_split_internal( dataset, metadata, missing_values, num_folds, nil, subjectid, seed, split_features ) + end + + private + def stratified_split_internal( dataset, metadata={}, missing_values="NA", num_folds=nil, pct=nil, subjectid=nil, seed=42, split_features=nil ) + raise "internal error" if num_folds!=nil and pct!=nil + k_fold_split = num_folds!=nil + if k_fold_split + raise "num_folds not a fixnum: #{num_folds}" unless num_folds.is_a?(Fixnum) + else + raise "pct is not a numeric: #{pct}" unless pct.is_a?(Numeric) + end raise "not a loaded ot-dataset" unless dataset.is_a?(OpenTox::Dataset) and dataset.compounds.size>0 and dataset.features.size>0 + raise "missing_values=#{missing_values}" unless missing_values.is_a?(String) or missing_values==0 + raise "subjectid=#{subjectid}" unless subjectid==nil or subjectid.is_a?(String) LOGGER.debug("r-util> apply stratified split to #{dataset.uri}") - df = dataset_to_dataframe( dataset, missing_values, subjectid, split_features ) + df = dataset_to_dataframe( dataset, missing_values, subjectid) @r.eval "set.seed(#{seed})" - @r.eval "split <- stratified_split(#{df}, ratio=#{pct})" - split = @r.pull 'split' - split = split.collect{|s| 1-s.to_i} # reverse 1s and 0s, as 1 means selected, but 0 will be first set - split_to_datasets( df, split, subjectid ) + str_split_features = "" + if split_features + @r.split_features = split_features if split_features + str_split_features = "colnames=split_features" + end + #@r.eval "save.image(\"/tmp/image.R\")" + + if k_fold_split + @r.eval "split <- stratified_k_fold_split(#{df}, num_folds=#{num_folds}, #{str_split_features})" + split = @r.pull 'split' + train = [] + test = [] + num_folds.times do |f| + datasetname = 'dataset fold '+(f+1).to_s+' of '+num_folds.to_s + metadata[DC.title] = "training "+datasetname + train << split_to_dataset( df, split, metadata, subjectid ){ |i| i!=(f+1) } + metadata[DC.title] = "test "+datasetname + test << split_to_dataset( df, split, metadata, 
+        end
+        return train, test
+      else
+        puts "split <- stratified_split(#{df}, ratio=#{pct}, #{str_split_features})"
+        @r.eval "split <- stratified_split(#{df}, ratio=#{pct}, #{str_split_features})"
+        split = @r.pull 'split'
+        metadata[DC.title] = "Training dataset split of "+dataset.uri
+        train = split_to_dataset( df, split, metadata, subjectid ){ |i| i==1 }
+        metadata[DC.title] = "Test dataset split of "+dataset.uri
+        test = split_to_dataset( df, split, metadata, subjectid ){ |i| i==0 }
+        return train, test
+      end
     end
+    public

     # dataset should be loaded completely (use Dataset.find)
     # takes duplicates into account
@@ -212,9 +267,13 @@ module OpenTox
         features = dataset.features.keys.sort
       end
       compounds = []
+      compound_names = []
       dataset.compounds.each do |c|
+        count = 0
         num_compounds[c].times do |i|
           compounds << c
+          compound_names << "#{c}$#{count}"
+          count+=1
         end
       end
@@ -238,7 +297,7 @@ module OpenTox
         end
       end
       df_name = "df_#{dataset.uri.split("/")[-1].split("?")[0]}"
-      assign_dataframe(df_name,d_values,compounds,features)
+      assign_dataframe(df_name,d_values,compound_names,features)

       # set dataframe column types accordingly
       f_count = 1 #R starts at 1
@@ -264,25 +323,27 @@ module OpenTox

     # converts a dataframe into a dataset (a new dataset is created at the dataset webservice)
     # this is only possible if a superset of the dataframe was created by dataset_to_dataframe (metadata and URIs!)
-    def dataframe_to_dataset( df, subjectid=nil )
-      dataframe_to_dataset_indices( df, subjectid, nil)
+    def dataframe_to_dataset( df, metadata={}, subjectid=nil )
+      dataframe_to_dataset_indices( df, metadata, subjectid, nil)
     end

     private
-    def dataframe_to_dataset_indices( df, subjectid=nil, compound_indices=nil )
+    def dataframe_to_dataset_indices( df, metadata={}, subjectid=nil, compound_indices=nil )
       raise unless @@feats[df].size>0
-      values, compounds, features = pull_dataframe(df)
+      values, compound_names, features = pull_dataframe(df)
+      compounds = compound_names.collect{|c| c.split("$")[0]}
       features.each{|f| raise unless @@feats[df][f]}
       dataset = OpenTox::Dataset.create(CONFIG[:services]["opentox-dataset"],subjectid)
+      dataset.add_metadata(metadata)
       LOGGER.debug "r-util> convert dataframe to dataset #{dataset.uri}"
       compounds.size.times{|i| dataset.add_compound(compounds[i]) if compound_indices==nil or compound_indices.include?(i)}
       features.each{|f| dataset.add_feature(f,@@feats[df][f])}
       features.size.times do |c|
         feat = OpenTox::Feature.find(features[c],subjectid)
-        nominal = feat.metadata[RDF.type].to_a.flatten.include?(OT.NominalFeature)
+        numeric = feat.metadata[RDF.type].to_a.flatten.include?(OT.NumericFeature)
         compounds.size.times do |r|
           if compound_indices==nil or compound_indices.include?(r)
-            dataset.add(compounds[r],features[c],nominal ? values[r][c] : values[r][c].to_f) if values[r][c]!="NA"
+            dataset.add(compounds[r],features[c],numeric ? values[r][c].to_f : values[r][c]) if values[r][c]!="NA"
           end
         end
       end
@@ -290,16 +351,12 @@ module OpenTox
       dataset
     end

-    def split_to_datasets( df, split, subjectid=nil )
-      sets = []
-      (split.min.to_i .. split.max.to_i).each do |i|
-        indices = []
-        split.size.times{|j| indices<<j if split[j]==i}
-        dataset = dataframe_to_dataset_indices( df, subjectid, indices )
-        LOGGER.debug("r-util> split into #{dataset.uri}, c:#{dataset.compounds.size}, f:#{dataset.features.size}")
-        sets << dataset
-      end
-      sets
+    def split_to_dataset( df, split, metadata={}, subjectid=nil )
+      indices = []
+      split.size.times{|i| indices<<i if yield(split[i]) }
+      dataset = dataframe_to_dataset_indices( df, metadata, subjectid, indices )
+      LOGGER.debug("r-util> split into #{dataset.uri}, c:#{dataset.compounds.size}, f:#{dataset.features.size}")
+      dataset
     end

     def pull_dataframe(df)
@@ -323,6 +380,8 @@ module OpenTox
     end

     def assign_dataframe(df,input,rownames,colnames)
+      rownames.check_uniq if rownames
+      colnames.check_uniq if colnames
       tmp = File.join(Dir.tmpdir,Time.new.to_f.to_s+"_"+rand(10000).to_s+".csv")
       file = File.new(tmp, 'w')
       input.each{|i| file.puts(i.collect{|e| "\"#{e}\""}.join("#")+"\n")}
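Hedged usage of the reworked split API above (argument order as defined; assumes an RUtil instance and a fully loaded dataset):

    rutil = OpenTox::RUtil.new
    # 70/30 split; dataset titles are set inside stratified_split via the metadata hash
    train, test = rutil.stratified_split(dataset, {}, "NA", 0.3, subjectid)
    # 10-fold stratified split; returns two arrays of datasets (training and test folds)
    trains, tests = rutil.stratified_k_fold_split(dataset, {}, "NA", 10, subjectid)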
diff --git a/lib/serializer.rb b/lib/serializer.rb
index 30cb2ba..03ca285 100644
--- a/lib/serializer.rb
+++ b/lib/serializer.rb
@@ -459,32 +459,80 @@ module OpenTox
     def initialize(dataset)
       @rows = []
       @rows << ["SMILES"]
+
       features = dataset.features.keys
-      @rows.first << features
+
+      # prepare for subgraphs
+      have_substructures = features.collect{ |id| dataset.features[id][RDF.type].include? OT.Substructure}.compact.uniq
+      if have_substructures.size == 1 && have_substructures[0]
+        features_smarts = features.collect{ |id| "'" + dataset.features[id][OT.smarts] + "'" }
+      end
+
+      # gather missing features
+      delete_features = []
+      features.each{ |id|
+        dataset.features[id][RDF.type].each { |typestr|
+          if typestr.include? "MissingFeature"
+            delete_features << id
+          end
+        }
+      }
+      features = features - delete_features
+
+      # detect nr duplicates per compound
+      compound_sizes = {}
+      dataset.compounds.each do |compound|
+        entries=dataset.data_entries[compound]
+        if entries
+          entries.each do |feature, values|
+            compound_sizes[compound] || compound_sizes[compound] = []
+            compound_sizes[compound] << values.size
+          end
+          compound_sizes[compound].uniq!
+          raise "Inappropriate data for CSV export for compound #{compound}" if compound_sizes[compound].size > 1
+          compound_sizes[compound] = compound_sizes[compound][0] # integer instead of array
+        end
+      end
+
+      # get headers
+      features_smarts && @rows.first << features_smarts || @rows.first << features
       @rows.first.flatten!
-      dataset.data_entries.each do |compound,entries|
-        cmpd = Compound.new(compound)
-        smiles = cmpd.to_smiles
-        inchi = URI.encode_www_form_component(cmpd.to_inchi)
-        row_container = Array.new
-        row = Array.new(@rows.first.size)
-        row_container << row
-        #row[0] = smiles
-        row[0] = inchi
-        entries.each do |feature, values|
-          i = features.index(feature)+1
-          values.each do |value|
-            if row_container[0][i]
-              #LOGGER.debug "Feature '#{feature}' (nr '#{i}'): '#{value}'"
-              row_container << row_container.last.collect
-              row_container.last[i] = value
-              #LOGGER.debug "RC: #{row_container.to_yaml}"
-            else
-              row_container.each { |r| r[i] = value }
-            end
+
+      # feature positions pre-calculated
+      feature_positions = features.inject({}) { |h,f|
+        h.merge!({f => features.index(f)+1}) # +1 due to ID
+        h
+      }
+
+      # serialize to csv
+      dataset.compounds.each do |compound|
+        entries=dataset.data_entries[compound]
+        if entries
+          inchi = URI.encode_www_form_component(Compound.new(compound).to_inchi)
+
+          # allocate container
+          row_container = Array.new(compound_sizes[compound])
+          (0...row_container.size).each do |i|
+            row_container[i] = Array.new(@rows.first.size)
+            row_container[i][0] = inchi
+          end
+
+          # fill entries
+          entries.each { |feature, values|
+            (0...compound_sizes[compound]).each { |i|
+              row_container[i][feature_positions[feature]] = values[i]
+            }
+          }
+
+          # fill zeroes for subgraphs
+          if (features_smarts)
+            row_container.collect! { |row|
+              row.collect! { |x| x ? x : 0 }
+            }
           end
+          row_container.each { |row| @rows << row }
+
         end
-        row_container.each { |r| @rows << r }
       end
     end
"samplecube") { folds = rep(0, times=nrow(data)) @@ -133,7 +148,7 @@ stratified_k_fold_split <- function( data, num_folds=10, method="cluster" ) { require("TunePareto") cl = cluster(data.processed) - res = generateCVRuns(cl,ntimes=1,nfold=3) + res = generateCVRuns(cl,ntimes=1,nfold=num_folds) folds = rep(0, times=nrow(data)) for (i in 1:num_folds) for(j in 1:length(res[[1]][[i]])) @@ -144,6 +159,50 @@ stratified_k_fold_split <- function( data, num_folds=10, method="cluster" ) stop("unknown method") } +duplicate_indices <- function( data ) { + indices = 1:nrow(data) + z = data + duplicate_index = anyDuplicated(z) + while(duplicate_index) { + duplicate_to_index = anyDuplicated(z[1:duplicate_index,],fromLast=T) + #print(paste(duplicate_index,'is dupl to',duplicate_to_index)) + indices[duplicate_index] <- duplicate_to_index + z[duplicate_index,] <- paste('123$ยง%',duplicate_index) + duplicate_index = anyDuplicated(z) + } + indices +} + +add_duplicates <- function( data, dup_indices ) { + result = data[1,] + for(i in 2:length(dup_indices)) { + row = data[rownames(data)==dup_indices[i],] + if(length(row)==0) + stop(paste('index ',i,' dup-index ',dup_indices[i],'not found in data')) + result = rbind(result, row) + } + rownames(result)<-NULL + result +} + +sammon_duplicates <- function( data, ... ) { + di <- duplicate_indices(data) + print(di) + u <- unique(data) + print(paste('unique data points',nrow(u),'of',nrow(data))) + if(nrow(u) <= 4) stop("number of unqiue datapoints <= 4") + points_unique <- sammon(dist(u), ...)$points + if (nrow(u)<nrow(data)) + { + points <- add_duplicates(points_unique, di) + points + } + else + { + points_unique + } +} + plot_pre_process <- function( data, method="pca" ) { data.processed = process_data( data ) @@ -158,6 +217,11 @@ plot_pre_process <- function( data, method="pca" ) data.emb <- smacofSym(dist(data.processed, method = "euclidean"), ndim=2, verbose=T) data.emb$conf } + else if (method == "sammon") + { + require("MASS") + sammon_duplicates(data.processed, k=2) + } else stop("unknown method") } diff --git a/lib/transform.rb b/lib/transform.rb index 8fe1093..8632f6c 100644 --- a/lib/transform.rb +++ b/lib/transform.rb @@ -396,7 +396,7 @@ module OpenTox @q_prop = gsl_q_prop_orig.row(0).to_a end - LOGGER.debug "F: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}" + LOGGER.debug "F: #{@n_prop.size}x#{@n_prop[0].size}; R: #{@q_prop.size}" if (@n_prop && @n_prop[0] && @q_prop) LOGGER.debug "Sims: #{@sims.size}, Acts: #{@acts.size}" @sims = [ gram_matrix, @sims ] @@ -490,8 +490,10 @@ module OpenTox @cmpds = []; @fps = []; @acts = []; @n_prop = []; @q_prop = [] - @model.fingerprints.each { |fp| - cmpd = fp[0]; fp = fp[1] + # Major BUG! Must loop over @model.compounds, hash is unordered! + # @model.fingerprints.each + @model.compounds.each { |cmpd| + fp = @model.fingerprints[cmpd] if @model.activities[cmpd] # row good acts = @model.activities[cmpd]; @acts += acts LOGGER.debug "#{acts.size} activities for '#{cmpd}'" if acts.size > 1 diff --git a/lib/utils.rb b/lib/utils.rb index d9d7b4b..149208b 100644 --- a/lib/utils.rb +++ b/lib/utils.rb @@ -1,155 +1,414 @@ require 'csv' +require 'tempfile' module OpenTox module Algorithm + @ambit_descriptor_algorithm_uri = "http://apps.ideaconsult.net:8080/ambit2/algorithm/org.openscience.cdk.qsar.descriptors.molecular." 
diff --git a/lib/utils.rb b/lib/utils.rb
index d9d7b4b..149208b 100644
--- a/lib/utils.rb
+++ b/lib/utils.rb
@@ -1,155 +1,414 @@
 require 'csv'
+require 'tempfile'

 module OpenTox

   module Algorithm

+    @ambit_descriptor_algorithm_uri = "http://apps.ideaconsult.net:8080/ambit2/algorithm/org.openscience.cdk.qsar.descriptors.molecular."
+    @ambit_ds_service_uri = "http://apps.ideaconsult.net:8080/ambit2/dataset/"
+    @ambit_mopac_model_uri = "http://apps.ideaconsult.net:8080/ambit2/model/69632"
+    @keysfile = File.join(ENV['HOME'], ".opentox", "config", "pc_descriptors.yaml")
+
     include OpenTox

     # Calculate physico-chemical descriptors.
-    # @param[Hash] Required keys: :dataset_uri, :pc_type
+    # @param[Hash] required: :dataset_uri, :pc_type, :rjb, :task, :add_uri, optional: :descriptor, :lib, :subjectid
     # @return[String] dataset uri
     def self.pc_descriptors(params)
+      ds = OpenTox::Dataset.find(params[:dataset_uri],params[:subjectid])
+      compounds = ds.compounds.collect
+      task_weights = {"joelib"=> 20, "openbabel"=> 1, "cdk"=> 50 }
+      task_weights.keys.each { |step| task_weights.delete(step) if (params[:lib] && (!params[:lib].split(",").include?(step)))}
+      task_weights["load"] = 10
+      task_sum = Float task_weights.values.sum
+      task_weights.keys.each { |step| task_weights[step] /= task_sum }
+      task_weights.keys.each { |step| task_weights[step] = (task_weights[step]*100).floor }
+
+      jl_master=nil
+      cdk_master=nil
+      ob_master=nil
+
+
+      # # # openbabel (via ruby bindings)
+      if !params[:lib] || params[:lib].split(",").include?("openbabel")
+        ob_master, ob_ids = get_ob_descriptors( { :compounds => compounds, :pc_type => params[:pc_type], :descriptor => params[:descriptor] } )
+        params[:task].progress(params[:task].metadata[OT.percentageCompleted] + task_weights["openbabel"]) if params[:task]
+      end
+
+
+      # # # joelib (via rjb)
+      if !params[:lib] || params[:lib].split(",").include?("joelib")
+        jl_master, jl_ids = get_jl_descriptors( { :compounds => compounds, :rjb => params[:rjb], :pc_type => params[:pc_type], :descriptor => params[:descriptor] } )
+        params[:task].progress(params[:task].metadata[OT.percentageCompleted] + task_weights["joelib"]) if params[:task]
+      end
+
+
+      # # # cdk (via REST)
+      if !params[:lib] || params[:lib].split(",").include?("cdk")
+        ambit_result_uri, smiles_to_inchi, cdk_ids = get_cdk_descriptors( { :compounds => compounds, :pc_type => params[:pc_type], :task => params[:task], :step => task_weights["cdk"], :descriptor => params[:descriptor] } )
+        #LOGGER.debug "Ambit result uri for #{params.inspect}: '#{ambit_result_uri.to_yaml}'"
+        cdk_master, cdk_ids, ambit_ids = load_ds_csv(ambit_result_uri, smiles_to_inchi, cdk_ids )
+        params[:task].progress(params[:task].metadata[OT.percentageCompleted] + task_weights["load"]) if params[:task]
+      end
+
+      # # # fuse CSVs ("master" structures)
+      if jl_master && cdk_master
+        nr_cols = (jl_master[0].size)-1
+        LOGGER.debug "Merging #{nr_cols} new columns"
+        cdk_master.each {|row| nr_cols.times { row.push(nil) } }
+        jl_master.each do |row|
+          temp = cdk_master.assoc(row[0]) # Finds the appropriate line in master
+          ((-1*nr_cols)..-1).collect.each { |idx|
+            temp[idx] = row[nr_cols+idx+1] if temp # Updates columns if line is found
+          }
+        end
+        master = cdk_master
+      else # either jl_master or cdk_master nil
+        master = jl_master || cdk_master
+      end
+
+      if ob_master && master
+        nr_cols = (ob_master[0].size)-1
+        LOGGER.debug "Merging #{nr_cols} new columns"
+        master.each {|row| nr_cols.times { row.push(nil) } } # Adds empty columns to all rows
+        ob_master.each do |row|
+          temp = master.assoc(row[0]) # Finds the appropriate line in master
+          ((-1*nr_cols)..-1).collect.each { |idx|
+            temp[idx] = row[nr_cols+idx+1] if temp # Updates columns if line is found
+          }
+        end
+      else # either ob_master or master nil
+        master = ob_master || master
+      end
+
+      if master
+
+        ds = OpenTox::Dataset.find(
+          OpenTox::RestClientWrapper.post(
+            File.join(CONFIG[:services]["opentox-dataset"]), master.collect { |row| row.join(",") }.join("\n"), {:content_type => "text/csv", :subjectid => params[:subjectid]}
+          ),params[:subjectid]
+        )
+
+        # # # add feature metadata
+        pc_descriptors = YAML::load_file(@keysfile)
+        ambit_ids && ambit_ids.each_with_index { |id,idx|
+          raise "Feature not found" if ! ds.features[File.join(ds.uri, "feature", id.to_s)]
+          ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{DC.description => "#{pc_descriptors[cdk_ids[idx]][:name]} [#{pc_descriptors[cdk_ids[idx]][:pc_type]}, #{pc_descriptors[cdk_ids[idx]][:lib]}]"})
+          ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{DC.creator => @ambit_descriptor_algorithm_uri + cdk_ids[idx]})
+          ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{OT.hasSource => params[:dataset_uri]})
+        }
+        ob_ids && ob_ids.each { |id|
+          raise "Feature not found" if ! ds.features[File.join(ds.uri, "feature", id.to_s)]
+          ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{DC.description => "#{pc_descriptors[id][:name]} [#{pc_descriptors[id][:pc_type]}, #{pc_descriptors[id][:lib]}]"})
+          creator_uri = ds.uri.gsub(/\/dataset\/.*/, "/algorithm/pc")
+          creator_uri += "/#{id}" if params[:add_uri]
+          ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{DC.creator => creator_uri})
+          ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{OT.hasSource => params[:dataset_uri]})
+        }
+        jl_ids && jl_ids.each { |id|
+          raise "Feature not found" if ! ds.features[File.join(ds.uri, "feature", id.to_s)]
+          ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{DC.description => "#{pc_descriptors[id][:name]} [#{pc_descriptors[id][:pc_type]}, #{pc_descriptors[id][:lib]}]"})
+          creator_uri = ds.uri.gsub(/\/dataset\/.*/, "/algorithm/pc")
+          creator_uri += "/#{id}" if params[:add_uri]
+          ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{DC.creator => creator_uri})
+          ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{OT.hasSource => params[:dataset_uri]})
+        }
+
+        ds.save(params[:subjectid])
+      else
+        raise OpenTox::BadRequestError.new "No descriptors matching your criteria found."
+      end
+
+    end
+
+
+    # Calculate OpenBabel physico-chemical descriptors.
+    # @param[Hash] required: :compounds, :pc_type, :task, optional: :descriptor
+    # @return[Array] CSV, array of field ids, array of field descriptions
+    def self.get_ob_descriptors(params)
+
+      master = nil

       begin
-        ds = OpenTox::Dataset.find(params[:dataset_uri])
-        compounds = ds.compounds.collect
-        ambit_result_uri, smiles_to_inchi = get_pc_descriptors( { :compounds => compounds, :pc_type => params[:pc_type] } )
-        #ambit_result_uri = ["http://apps.ideaconsult.net:8080/ambit2/dataset/987103?" ,"feature_uris[]=http%3A%2F%2Fapps.ideaconsult.net%3A8080%2Fambit2%2Ffeature%2F4276789&", "feature_uris[]=http%3A%2F%2Fapps.ideaconsult.net%3A8080%2Fambit2%2Fmodel%2F16%2Fpredicted"] # for testing
,"feature_uris[]=http%3A%2F%2Fapps.ideaconsult.net%3A8080%2Fambit2%2Ffeature%2F4276789&", "feature_uris[]=http%3A%2F%2Fapps.ideaconsult.net%3A8080%2Fambit2%2Fmodel%2F16%2Fpredicted"] # for testing - LOGGER.debug "Ambit result uri for #{params.inspect}: '#{ambit_result_uri.to_yaml}'" - load_ds_csv(ambit_result_uri, smiles_to_inchi) + csvfile = Tempfile.open(['ob_descriptors-','.csv']) + + pc_descriptors = YAML::load_file(@keysfile) + ids = pc_descriptors.collect{ |id, info| + id if info[:lib] == "openbabel" && params[:pc_type].split(",").include?(info[:pc_type]) && (!params[:descriptor] || id == params[:descriptor]) + }.compact + + if ids.length > 0 + csvfile.puts((["SMILES"] + ids).join(",")) + + # remember inchis + inchis = params[:compounds].collect { |c_uri| + URI.encode_www_form_component(OpenTox::Compound.new(c_uri).to_inchi) + } + + # Process compounds + obmol = OpenBabel::OBMol.new + obconversion = OpenBabel::OBConversion.new + obconversion.set_in_and_out_formats 'inchi', 'can' + + inchis.each_with_index { |inchi, c_idx| + row = [inchis[c_idx]] + obconversion.read_string(obmol, URI.decode_www_form_component(inchi)) + ids.each { |name| + if obmol.respond_to?(name.underscore) + val = eval("obmol.#{name.underscore}") if obmol.respond_to?(name.underscore) + else + if name != "nF" && name != "spinMult" && name != "nHal" && name != "logP" + val = OpenBabel::OBDescriptor.find_type(name.underscore).predict(obmol) + elsif name == "nF" + val = OpenBabel::OBDescriptor.find_type("nf").predict(obmol) + elsif name == "spinMult" || name == "nHal" || name == "logP" + val = OpenBabel::OBDescriptor.find_type(name).predict(obmol) + end + end + if OpenTox::Algorithm.numeric?(val) + val = Float(val) + val = nil if val.nan? + val = nil if (val && val.infinite?) + end + row << val + } + LOGGER.debug "Compound #{c_idx+1} (#{inchis.size}), #{row.size} entries" + csvfile.puts(row.join(",")) + csvfile.flush + } + master = CSV::parse(File.open(csvfile.path, "rb").read) + end + rescue Exception => e LOGGER.debug "#{e.class}: #{e.message}" LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" + ensure + csvfile.close! end + [ master, ids ] + end - - # Calculates PC descriptors via Ambit -- DO NOT OVERLOAD Ambit. - # @param[Hash] Required keys: :compounds, :pc_type - # @return[Array] Ambit result uri, piecewise (1st: base, 2nd: SMILES, 3rd+: features - def self.get_pc_descriptors(params) + + + # Calculate Joelib2 physico-chemical descriptors. + # @param[Hash] required: :compounds, :pc_type, :task, optional: :descriptor + # @return[Array] CSV, array of field ids, array of field descriptions + def self.get_jl_descriptors(params) + + master = nil + s = params[:rjb]; raise "No Java environment" unless s + + # Load keys, enter CSV headers begin + csvfile = Tempfile.open(['jl_descriptors-','.csv']) - ambit_ds_service_uri = "http://apps.ideaconsult.net:8080/ambit2/dataset/" - ambit_mopac_model_uri = "http://apps.ideaconsult.net:8080/ambit2/model/69632" - descs = YAML::load_file( File.join(ENV['HOME'], ".opentox", "config", "ambit_descriptors.yaml") ) - descs_uris = [] - params[:pc_type] = "electronic,cpsa" if params[:pc_type].nil? # rescue missing pc_type - types = params[:pc_type].split(",") - descs.each { |uri, cat_name| - if types.include? cat_name[:category] - descs_uris << uri - end - } - if descs_uris.size == 0 - raise "Error! Empty set of descriptors. Did you supply one of [geometrical, topological, electronic, constitutional, hybrid, cpsa] ?" 
+        pc_descriptors = YAML::load_file(@keysfile)
+        ids = pc_descriptors.collect{ |id, info|
+          id if info[:lib] == "joelib" && params[:pc_type].split(",").include?(info[:pc_type]) && (!params[:descriptor] || id == params[:descriptor])
+        }.compact
+
+
+        if ids.length > 0
+          csvfile.puts((["SMILES"] + ids).join(","))
+
+          # remember inchis
+          inchis = params[:compounds].collect { |c_uri|
+            cmpd = OpenTox::Compound.new(c_uri)
+            URI.encode_www_form_component(cmpd.to_inchi)
+          }
+
+          # Process compounds
+          params[:compounds].each_with_index { |c_uri, c_idx|
+            cmpd = OpenTox::Compound.new(c_uri)
+            inchi = cmpd.to_inchi
+            sdf_data = cmpd.to_sdf
+
+            infile = Tempfile.open(['jl_descriptors-in-','.sdf'])
+            outfile_path = infile.path.gsub(/jl_descriptors-in/,"jl_descriptors-out")
+
+            begin
+              infile.puts sdf_data
+              infile.flush
+              s.new(infile.path, outfile_path) # runs joelib
+
+              row = [inchis[c_idx]]
+              ids.each_with_index do |k,i| # Fill row
+                re = Regexp.new(k)
+                open(outfile_path) do |f|
+                  f.each do |line|
+                    if @prev == k
+                      entry = line.chomp
+                      val = nil
+                      if OpenTox::Algorithm.numeric?(entry)
+                        val = Float(entry)
+                        val = nil if val.nan?
+                        val = nil if (val && val.infinite?)
+                      end
+                      row << val
+                      break
+                    end
+                    @prev = line.gsub(/^.*types./,"").gsub(/count./,"").gsub(/>/,"").chomp if line =~ re
+                  end
+                end
+              end
+              LOGGER.debug "Compound #{c_idx+1} (#{inchis.size}), #{row.size} entries"
+              csvfile.puts(row.join(","))
+              csvfile.flush
+
+            rescue Exception => e
+              LOGGER.debug "#{e.class}: #{e.message}"
+              LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+            ensure
+              File.delete(infile.path.gsub(/\.sdf/,".numeric.sdf"))
+              File.delete(outfile_path)
+              infile.close!
+            end
+          }
+          master = CSV::parse(File.open(csvfile.path, "rb").read)
+        end

-        #LOGGER.debug "Ambit descriptor URIs: #{descs_uris.join(", ")}"
+      rescue Exception => e
+        LOGGER.debug "#{e.class}: #{e.message}"
+        LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}"
+      ensure
+        [ csvfile].each { |f| f.close! }
+      end
+
+      [ master, ids ]
+
+    end
+
+    # Calculate CDK physico-chemical descriptors via Ambit -- DO NOT OVERLOAD Ambit.
+    # @param[Hash] required: :compounds, :pc_type, :task, :step optional: :descriptor
+    # @return[Array] array of Ambit result uri, piecewise (1st: base, 2nd: SMILES, 3rd+: features), hash smiles to inchi, array of field descriptions
+    def self.get_cdk_descriptors(params)
+
+      ambit_result_uri = [] # 1st pos: base uri, then features
+      smiles_to_inchi = {}
+      task_weights = {"electronic"=> 4, "topological"=> 19, "constitutional"=> 12, "geometrical"=> 3, "hybrid"=> 2, "cpsa"=> 1 }
+      task_weights.keys.each { |pc_type| task_weights.delete(pc_type) if (params[:pc_type] && (!params[:pc_type].split(",").include?(pc_type)))}
+      task_sum = Float task_weights.values.sum
+      task_weights.keys.each { |pc_type| task_weights[pc_type] /= task_sum }
+      task_weights.keys.each { |pc_type| task_weights[pc_type] *= params[:step] }
+
+
+      # extract wanted descriptors from config file and parameters
+      pc_descriptors = YAML::load_file(@keysfile)
+
+      ids = pc_descriptors.collect { |id, info|
+        "#{info[:pc_type]}:::#{id}" if info[:lib] == "cdk" && params[:pc_type].split(",").include?(info[:pc_type]) && (!params[:descriptor] || id == params[:descriptor])
+      }.compact
+
+      if ids.size > 0
+        ids.sort!
+        ids.collect! { |id| id.split(":::").last }
{ |id| id.split(":::").last } + + # create dataset at Ambit begin - # Create SMI - smiles_array = []; smiles_to_inchi = {} params[:compounds].each do |n| cmpd = OpenTox::Compound.new(n) smiles_string = cmpd.to_smiles smiles_to_inchi[smiles_string] = URI.encode_www_form_component(cmpd.to_inchi) - smiles_array << smiles_string end - smi_file = Tempfile.open(['pc_ambit', '.csv']) - pc_descriptors = nil - - # Create Ambit dataset - smi_file.puts( "SMILES\n" ) - smi_file.puts( smiles_array.join("\n") ) - smi_file.flush - ambit_ds_uri = OpenTox::RestClientWrapper.post(ambit_ds_service_uri, {:file => File.new(smi_file.path)}, {:content_type => "multipart/form-data", :accept => "text/uri-list"} ) + smi_file = Tempfile.open(['pc_ambit', '.csv']) ; smi_file.puts( "SMILES\n" + smiles_to_inchi.keys.join("\n") ) ; smi_file.flush + ambit_ds_uri = OpenTox::RestClientWrapper.post(@ambit_ds_service_uri, {:file => File.new(smi_file.path)}, {:content_type => "multipart/form-data", :accept => "text/uri-list"} ) + ambit_result_uri = [ ambit_ds_uri + "?" ] # 1st pos: base uri, then features rescue Exception => e LOGGER.debug "#{e.class}: #{e.message}" LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" ensure smi_file.close! if smi_file end - ambit_smiles_uri = OpenTox::RestClientWrapper.get(ambit_ds_uri + "/features", {:accept=> "text/uri-list"} ).chomp - - # Calculate 3D for CPSA - if types.include? "cpsa" - ambit_ds_mopac_uri = OpenTox::RestClientWrapper.post(ambit_mopac_model_uri, {:dataset_uri => ambit_ds_uri}, {:accept => "text/uri-list"} ) - LOGGER.debug "MOPAC dataset: #{ambit_ds_mopac_uri }" - end - - # Get Ambit results - ambit_result_uri = [] # 1st pos: base uri, then features - ambit_result_uri << ambit_ds_uri + "?" + # get SMILES feature URI + ambit_smiles_uri = OpenTox::RestClientWrapper.get( + ambit_ds_uri + "/features", + {:accept=> "text/uri-list"} + ).chomp ambit_result_uri << ("feature_uris[]=" + URI.encode_www_form_component(ambit_smiles_uri) + "&") - descs_uris.each_with_index do |uri, i| - algorithm = Algorithm::Generic.new(uri) + # always calculate 3D (http://goo.gl/Tk81j), then get results + OpenTox::RestClientWrapper.post( + @ambit_mopac_model_uri, + {:dataset_uri => ambit_ds_uri}, + {:accept => "text/uri-list"} + ) + current_cat = "" + ids.each_with_index do |id, i| + old_cat = current_cat; current_cat = pc_descriptors[id][:pc_type] + params[:task].progress(params[:task].metadata[OT.percentageCompleted] + task_weights[old_cat]) if params[:task] && old_cat != current_cat && old_cat != "" + algorithm = Algorithm::Generic.new(@ambit_descriptor_algorithm_uri+id) result_uri = algorithm.run({:dataset_uri => ambit_ds_uri}) ambit_result_uri << result_uri.split("?")[1] + "&" - LOGGER.debug "Ambit (#{descs_uris.size}): #{i+1}" + LOGGER.debug "Ambit (#{ids.size}): #{i+1}" end + params[:task].progress(params[:task].metadata[OT.percentageCompleted] + task_weights[current_cat]) if params[:task] #LOGGER.debug "Ambit result: #{ambit_result_uri.join('')}" - [ ambit_result_uri, smiles_to_inchi ] - - rescue Exception => e - LOGGER.debug "#{e.class}: #{e.message}" - LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" end + + [ ambit_result_uri, smiles_to_inchi, ids ] + end # Load dataset via CSV # @param[Array] Ambit result uri, piecewise (1st: base, 2nd: SMILES, 3rd+: features - # @return[String] dataset uri - def self.load_ds_csv(ambit_result_uri, smiles_to_inchi, subjectid=nil) + # @param[Hash] keys: SMILES, values: InChIs + # @param[Array] field descriptions, one for each feature + # 
@return[Array] CSV, array of field ids, array of sanitized Ambit field names
+  def self.load_ds_csv(ambit_result_uri, smiles_to_inchi, single_ids, subjectid=nil)
    master=nil
-    (1...ambit_result_uri.size).collect { |idx|
-      curr_uri = ambit_result_uri[0] + ambit_result_uri[idx]
-      LOGGER.debug "Requesting #{curr_uri}"
-      csv_data = CSV.parse( OpenTox::RestClientWrapper.get(curr_uri, {:accept => "text/csv", :subjectid => subjectid}) )
-      if csv_data[0] && csv_data[0].size>1
-        if master.nil? # This is the smiles entry
-          (1...csv_data.size).each{ |idx| csv_data[idx][1] = smiles_to_inchi[csv_data[idx][1]] }
-          master = csv_data
-          next
-        else
-          index_uri = csv_data[0].index("SMILES")
-          csv_data.map {|i| i.delete_at(index_uri)} if index_uri #Removes additional SMILES information
-
-          nr_cols = (csv_data[0].size)-1
-          LOGGER.debug "Merging #{nr_cols} new columns"
-          master.each {|row| nr_cols.times { row.push(nil) } } # Adds empty columns to all rows
-          csv_data.each do |row|
-            temp = master.assoc(row[0]) # Finds the appropriate line in master
-            ((-1*nr_cols)..-1).collect.each { |idx|
-              temp[idx] = row[nr_cols+idx+1] if temp # Updates columns if line is found
-            }
+    ids=[]
+    ambit_ids=[]
+
+    if ambit_result_uri.size > 0
+      (1...ambit_result_uri.size).collect { |idx|
+        curr_uri = ambit_result_uri[0] + ambit_result_uri[idx]
+        #LOGGER.debug "Requesting #{curr_uri}"
+        csv_data = CSV.parse( OpenTox::RestClientWrapper.get(curr_uri, {:accept => "text/csv", :subjectid => subjectid}) )
+        if csv_data[0] && csv_data[0].size>1
+          if master.nil? # This is the smiles entry
+            (1...csv_data.size).each{ |idx| csv_data[idx][1] = smiles_to_inchi[csv_data[idx][1]] }
+            master = csv_data
+            next
+          else
+            index_uri = csv_data[0].index("SMILES")
+            csv_data.map {|i| i.delete_at(index_uri)} if index_uri #Removes additional SMILES information
+
+            nr_cols = (csv_data[0].size)-1
+            LOGGER.debug "Merging #{nr_cols} new columns"
+            ids += Array.new(nr_cols, single_ids[idx-2]) # idx==1 is the SMILES feature, so feature idx maps to single_ids[idx-2]
+            master.each {|row| nr_cols.times { row.push(nil) } } # Adds empty columns to all rows
+            csv_data.each do |row|
+              temp = master.assoc(row[0]) # Finds the appropriate line in master
+              ((-1*nr_cols)..-1).each { |idx|
+                temp[idx] = row[nr_cols+idx+1] if temp # Updates columns if line is found
+              }
+            end
          end
        end
-    }
+      }
-    index_uri = master[0].index("Compound")
-    master.map {|i| i.delete_at(index_uri)}
-    master[0].each {|cell| cell.chomp!(" ")}
-    master[0][0] = "Compound" #"SMILES"
-    index_smi = master[0].index("SMILES")
-    master.map {|i| i.delete_at(index_smi)} if index_smi
-    #master[0][0] = "SMILES"
+      index_uri = master[0].index("Compound")
+      master.map {|i| i.delete_at(index_uri)}
+      master[0].each {|cell| cell.chomp!(" ")}
+      master[0][0] = "Compound" #"SMILES"
+      index_smi = master[0].index("SMILES")
+      master.map {|i| i.delete_at(index_smi)} if index_smi
+      master[0][0] = "SMILES"
+      ambit_ids=master[0].collect {|header| header.to_s.gsub(/[\/.\\\(\)\{\}\[\]]/,"_")}
+      ambit_ids.shift
+    end
    #LOGGER.debug "-------- AM: Writing to dumpfile"
    #File.open("/tmp/test.csv", 'w') {|f| f.write( master.collect {|r| r.join(",")}.join("\n") ) }
-    parser = OpenTox::Parser::Spreadsheets.new
-    ds = OpenTox::Dataset.new(nil,subjectid)
-    ds.save(subjectid)
-    parser.dataset = ds
-    ds = parser.load_csv(master.collect{|r| r.join(",")}.join("\n"))
-    ds.save(subjectid)
+    [ master, ids, ambit_ids ]
+
  end
@@ -208,8 +467,8 @@ module OpenTox
    end
-  # Effect calculation for classification
-  # @param [Array] Array of occurrences per class in the form of Enumerables.
+  # Effect calculation for classification. It is assumed that the elements of the arrays match each other pairwise
+  # @param [Array] Array of occurrences per class (in the form of Enumerables).
  # @param [Array] Array of database instance counts per class.
  def self.effect(occurrences, db_instances)
    max=0
diff --git a/lib/validation.rb b/lib/validation.rb
index 85004c7..a373e56 100644
--- a/lib/validation.rb
+++ b/lib/validation.rb
@@ -19,8 +19,9 @@ module OpenTox
    def self.list( params={} )
      filter_string = ""
      params.each do |k,v|
-        filter_string = "?" if filter_string.length==0
-        filter_string += k.to_s+"="+v
+        filter_string += (filter_string.length==0 ? "?" : "&")
+        v = v.to_s.gsub(/;/, "%3b") if v.to_s =~ /;/
+        filter_string += k.to_s+"="+v.to_s
      end
      (OpenTox::RestClientWrapper.get(CONFIG[:services]["opentox-validation"]+filter_string).split("\n"))
    end
@@ -156,8 +157,9 @@ module OpenTox
    def self.list( params={} )
      filter_string = ""
      params.each do |k,v|
-        filter_string = "?" if filter_string.length==0
-        filter_string += k.to_s+"="+v
+        filter_string += (filter_string.length==0 ? "?" : "&")
+        v = v.to_s.gsub(/;/, "%3b") if v.to_s =~ /;/
+        filter_string += k.to_s+"="+v.to_s
      end
      (OpenTox::RestClientWrapper.get(File.join(CONFIG[:services]["opentox-validation"],"crossvalidation")+filter_string).split("\n"))
    end
diff --git a/opentox-ruby.gemspec b/opentox-ruby.gemspec
index ca2d397..d3ae2d7 100644
--- a/opentox-ruby.gemspec
+++ b/opentox-ruby.gemspec
@@ -5,24 +5,25 @@ Gem::Specification.new do |s|
  s.name = %q{opentox-ruby}
-  s.version = "3.1.0"
+  s.version = "2.0.1"
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["Christoph Helma, Martin Guetlein, Andreas Maunz, Micha Rautenberg, David Vorgrimmler"]
-  s.date = %q{2012-03-26}
+  s.date = %q{2011-06-15}
  s.description = %q{Ruby wrapper for the OpenTox REST API (http://www.opentox.org)}
  s.email = %q{helma@in-silico.ch}
+  s.executables = ["opentox-install-debian.sh", "opentox-install-ubuntu.sh"]
  s.extra_rdoc_files = [
-    "ChangeLog",
    "LICENSE",
    "README.markdown"
  ]
  s.files = [
-    "ChangeLog",
    "LICENSE",
    "README.markdown",
    "Rakefile",
    "VERSION",
+    "bin/opentox-install-debian.sh",
+    "bin/opentox-install-ubuntu.sh",
    "lib/algorithm.rb",
    "lib/authorization.rb",
    "lib/compound.rb",
@@ -40,126 +41,121 @@ Gem::Specification.new do |s|
    "lib/overwrite.rb",
    "lib/parser.rb",
    "lib/policy.rb",
-    "lib/r-util.rb",
    "lib/rest_client_wrapper.rb",
    "lib/serializer.rb",
    "lib/spork.rb",
-    "lib/stratification.R",
    "lib/task.rb",
    "lib/templates/default_guest_policy.xml",
    "lib/templates/default_policy.xml",
    "lib/to-html.rb",
    "lib/transform.rb",
    "lib/utils.rb",
    "lib/validation.rb"
  ]
-  s.homepage = %q{http://github.com/opentox/opentox-ruby}
+  s.homepage = %q{http://github.com/helma/opentox-ruby}
  s.require_paths = ["lib"]
-  s.rubygems_version = %q{1.5.3}
+  s.rubygems_version = %q{1.5.2}
  s.summary = %q{Ruby wrapper for the OpenTox REST API}

  if s.respond_to?
:specification_version then s.specification_version = 3 if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then - s.add_runtime_dependency(%q<sinatra>, ["= 1.2.6"]) - s.add_runtime_dependency(%q<emk-sinatra-url-for>, ["= 0.2.1"]) - s.add_runtime_dependency(%q<sinatra-respond_to>, ["= 0.7.0"]) - s.add_runtime_dependency(%q<sinatra-static-assets>, ["= 0.5.0"]) - s.add_runtime_dependency(%q<rest-client>, ["= 1.6.1"]) - s.add_runtime_dependency(%q<rack>, ["= 1.3.5"]) - s.add_runtime_dependency(%q<rack-contrib>, ["= 1.1.0"]) - s.add_runtime_dependency(%q<rack-flash>, ["= 0.1.1"]) - s.add_runtime_dependency(%q<nokogiri>, ["= 1.4.4"]) - s.add_runtime_dependency(%q<rubyzip>, ["= 0.9.4"]) - s.add_runtime_dependency(%q<roo>, ["= 1.9.3"]) - s.add_runtime_dependency(%q<spreadsheet>, ["= 0.6.5.4"]) - s.add_runtime_dependency(%q<google-spreadsheet-ruby>, ["= 0.1.5"]) - s.add_runtime_dependency(%q<yajl-ruby>, ["= 0.8.2"]) - s.add_runtime_dependency(%q<rinruby>, ["= 2.0.2"]) - s.add_runtime_dependency(%q<ohm>, ["= 0.1.3"]) - s.add_runtime_dependency(%q<ohm-contrib>, ["= 0.1.1"]) - s.add_runtime_dependency(%q<SystemTimer>, ["= 1.2.3"]) - s.add_runtime_dependency(%q<rjb>, ["= 1.3.4"]) - s.add_runtime_dependency(%q<haml>, ["= 3.1.1"]) - s.add_runtime_dependency(%q<akephalos>, ["= 0.2.5"]) - s.add_runtime_dependency(%q<dm-core>, ["= 1.1.0"]) - s.add_runtime_dependency(%q<dm-serializer>, ["= 1.1.0"]) - s.add_runtime_dependency(%q<dm-timestamps>, ["= 1.1.0"]) - s.add_runtime_dependency(%q<dm-types>, ["= 1.1.0"]) - s.add_runtime_dependency(%q<dm-migrations>, ["= 1.1.0"]) - s.add_runtime_dependency(%q<dm-validations>, ["= 1.1.0"]) - s.add_runtime_dependency(%q<dm-sqlite-adapter>, ["= 1.1.0"]) - s.add_runtime_dependency(%q<ruby-plot>, ["= 0.6.0"]) - s.add_runtime_dependency(%q<gsl>, ["= 1.14.7"]) - s.add_runtime_dependency(%q<statsample>, ["= 1.1.0"]) + s.add_runtime_dependency(%q<sinatra>, [">= 0"]) + s.add_runtime_dependency(%q<emk-sinatra-url-for>, [">= 0"]) + s.add_runtime_dependency(%q<sinatra-respond_to>, [">= 0"]) + s.add_runtime_dependency(%q<sinatra-static-assets>, [">= 0"]) + s.add_runtime_dependency(%q<rest-client>, [">= 0"]) + s.add_runtime_dependency(%q<rack>, [">= 0"]) + s.add_runtime_dependency(%q<rack-contrib>, [">= 0"]) + s.add_runtime_dependency(%q<rack-flash>, [">= 0"]) + s.add_runtime_dependency(%q<nokogiri>, [">= 0"]) + s.add_runtime_dependency(%q<rubyzip>, [">= 0"]) + s.add_runtime_dependency(%q<roo>, [">= 0"]) + s.add_runtime_dependency(%q<spreadsheet>, [">= 0"]) + s.add_runtime_dependency(%q<google-spreadsheet-ruby>, [">= 0"]) + s.add_runtime_dependency(%q<yajl-ruby>, [">= 0"]) + s.add_runtime_dependency(%q<tmail>, [">= 0"]) + s.add_runtime_dependency(%q<rinruby>, [">= 0"]) + s.add_runtime_dependency(%q<ohm>, [">= 0"]) + s.add_runtime_dependency(%q<ohm-contrib>, [">= 0"]) + s.add_runtime_dependency(%q<SystemTimer>, [">= 0"]) + s.add_runtime_dependency(%q<rjb>, [">= 0"]) + s.add_runtime_dependency(%q<dm-core>, [">= 0"]) + s.add_runtime_dependency(%q<dm-serializer>, [">= 0"]) + s.add_runtime_dependency(%q<dm-timestamps>, [">= 0"]) + s.add_runtime_dependency(%q<dm-types>, [">= 0"]) + s.add_runtime_dependency(%q<dm-migrations>, [">= 0"]) + s.add_runtime_dependency(%q<dm-validations>, [">= 0"]) + s.add_runtime_dependency(%q<dm-sqlite-adapter>, [">= 0"]) + s.add_runtime_dependency(%q<haml>, [">= 3"]) + s.add_runtime_dependency(%q<ruby-plot>, ["~> 0.4.0"]) + s.add_runtime_dependency(%q<statsample>, [">= 0"]) s.add_development_dependency(%q<jeweler>, [">= 0"]) else - 
s.add_dependency(%q<sinatra>, ["= 1.2.6"]) - s.add_dependency(%q<emk-sinatra-url-for>, ["= 0.2.1"]) - s.add_dependency(%q<sinatra-respond_to>, ["= 0.7.0"]) - s.add_dependency(%q<sinatra-static-assets>, ["= 0.5.0"]) - s.add_dependency(%q<rest-client>, ["= 1.6.1"]) - s.add_dependency(%q<rack>, ["= 1.3.5"]) - s.add_dependency(%q<rack-contrib>, ["= 1.1.0"]) - s.add_dependency(%q<rack-flash>, ["= 0.1.1"]) - s.add_dependency(%q<nokogiri>, ["= 1.4.4"]) - s.add_dependency(%q<rubyzip>, ["= 0.9.4"]) - s.add_dependency(%q<roo>, ["= 1.9.3"]) - s.add_dependency(%q<spreadsheet>, ["= 0.6.5.4"]) - s.add_dependency(%q<google-spreadsheet-ruby>, ["= 0.1.5"]) - s.add_dependency(%q<yajl-ruby>, ["= 0.8.2"]) - s.add_dependency(%q<rinruby>, ["= 2.0.2"]) - s.add_dependency(%q<ohm>, ["= 0.1.3"]) - s.add_dependency(%q<ohm-contrib>, ["= 0.1.1"]) - s.add_dependency(%q<SystemTimer>, ["= 1.2.3"]) - s.add_dependency(%q<rjb>, ["= 1.3.4"]) - s.add_dependency(%q<haml>, ["= 3.1.1"]) - s.add_dependency(%q<akephalos>, ["= 0.2.5"]) - s.add_dependency(%q<dm-core>, ["= 1.1.0"]) - s.add_dependency(%q<dm-serializer>, ["= 1.1.0"]) - s.add_dependency(%q<dm-timestamps>, ["= 1.1.0"]) - s.add_dependency(%q<dm-types>, ["= 1.1.0"]) - s.add_dependency(%q<dm-migrations>, ["= 1.1.0"]) - s.add_dependency(%q<dm-validations>, ["= 1.1.0"]) - s.add_dependency(%q<dm-sqlite-adapter>, ["= 1.1.0"]) - s.add_dependency(%q<ruby-plot>, ["= 0.6.0"]) - s.add_dependency(%q<gsl>, ["= 1.14.7"]) - s.add_dependency(%q<statsample>, ["= 1.1.0"]) + s.add_dependency(%q<sinatra>, [">= 0"]) + s.add_dependency(%q<emk-sinatra-url-for>, [">= 0"]) + s.add_dependency(%q<sinatra-respond_to>, [">= 0"]) + s.add_dependency(%q<sinatra-static-assets>, [">= 0"]) + s.add_dependency(%q<rest-client>, [">= 0"]) + s.add_dependency(%q<rack>, [">= 0"]) + s.add_dependency(%q<rack-contrib>, [">= 0"]) + s.add_dependency(%q<rack-flash>, [">= 0"]) + s.add_dependency(%q<nokogiri>, [">= 0"]) + s.add_dependency(%q<rubyzip>, [">= 0"]) + s.add_dependency(%q<roo>, [">= 0"]) + s.add_dependency(%q<spreadsheet>, [">= 0"]) + s.add_dependency(%q<google-spreadsheet-ruby>, [">= 0"]) + s.add_dependency(%q<yajl-ruby>, [">= 0"]) + s.add_dependency(%q<tmail>, [">= 0"]) + s.add_dependency(%q<rinruby>, [">= 0"]) + s.add_dependency(%q<ohm>, [">= 0"]) + s.add_dependency(%q<ohm-contrib>, [">= 0"]) + s.add_dependency(%q<SystemTimer>, [">= 0"]) + s.add_dependency(%q<rjb>, [">= 0"]) + s.add_dependency(%q<dm-core>, [">= 0"]) + s.add_dependency(%q<dm-serializer>, [">= 0"]) + s.add_dependency(%q<dm-timestamps>, [">= 0"]) + s.add_dependency(%q<dm-types>, [">= 0"]) + s.add_dependency(%q<dm-migrations>, [">= 0"]) + s.add_dependency(%q<dm-validations>, [">= 0"]) + s.add_dependency(%q<dm-sqlite-adapter>, [">= 0"]) + s.add_dependency(%q<haml>, [">= 3"]) + s.add_dependency(%q<ruby-plot>, ["~> 0.4.0"]) + s.add_dependency(%q<statsample>, [">= 0"]) s.add_dependency(%q<jeweler>, [">= 0"]) end else - s.add_dependency(%q<sinatra>, ["= 1.2.6"]) - s.add_dependency(%q<emk-sinatra-url-for>, ["= 0.2.1"]) - s.add_dependency(%q<sinatra-respond_to>, ["= 0.7.0"]) - s.add_dependency(%q<sinatra-static-assets>, ["= 0.5.0"]) - s.add_dependency(%q<rest-client>, ["= 1.6.1"]) - s.add_dependency(%q<rack>, ["= 1.3.5"]) - s.add_dependency(%q<rack-contrib>, ["= 1.1.0"]) - s.add_dependency(%q<rack-flash>, ["= 0.1.1"]) - s.add_dependency(%q<nokogiri>, ["= 1.4.4"]) - s.add_dependency(%q<rubyzip>, ["= 0.9.4"]) - s.add_dependency(%q<roo>, ["= 1.9.3"]) - s.add_dependency(%q<spreadsheet>, ["= 0.6.5.4"]) - s.add_dependency(%q<google-spreadsheet-ruby>, ["= 
0.1.5"]) - s.add_dependency(%q<yajl-ruby>, ["= 0.8.2"]) - s.add_dependency(%q<rinruby>, ["= 2.0.2"]) - s.add_dependency(%q<ohm>, ["= 0.1.3"]) - s.add_dependency(%q<ohm-contrib>, ["= 0.1.1"]) - s.add_dependency(%q<SystemTimer>, ["= 1.2.3"]) - s.add_dependency(%q<rjb>, ["= 1.3.4"]) - s.add_dependency(%q<haml>, ["= 3.1.1"]) - s.add_dependency(%q<akephalos>, ["= 0.2.5"]) - s.add_dependency(%q<dm-core>, ["= 1.1.0"]) - s.add_dependency(%q<dm-serializer>, ["= 1.1.0"]) - s.add_dependency(%q<dm-timestamps>, ["= 1.1.0"]) - s.add_dependency(%q<dm-types>, ["= 1.1.0"]) - s.add_dependency(%q<dm-migrations>, ["= 1.1.0"]) - s.add_dependency(%q<dm-validations>, ["= 1.1.0"]) - s.add_dependency(%q<dm-sqlite-adapter>, ["= 1.1.0"]) - s.add_dependency(%q<ruby-plot>, ["= 0.6.0"]) - s.add_dependency(%q<gsl>, ["= 1.14.7"]) - s.add_dependency(%q<statsample>, ["= 1.1.0"]) + s.add_dependency(%q<sinatra>, [">= 0"]) + s.add_dependency(%q<emk-sinatra-url-for>, [">= 0"]) + s.add_dependency(%q<sinatra-respond_to>, [">= 0"]) + s.add_dependency(%q<sinatra-static-assets>, [">= 0"]) + s.add_dependency(%q<rest-client>, [">= 0"]) + s.add_dependency(%q<rack>, [">= 0"]) + s.add_dependency(%q<rack-contrib>, [">= 0"]) + s.add_dependency(%q<rack-flash>, [">= 0"]) + s.add_dependency(%q<nokogiri>, [">= 0"]) + s.add_dependency(%q<rubyzip>, [">= 0"]) + s.add_dependency(%q<roo>, [">= 0"]) + s.add_dependency(%q<spreadsheet>, [">= 0"]) + s.add_dependency(%q<google-spreadsheet-ruby>, [">= 0"]) + s.add_dependency(%q<yajl-ruby>, [">= 0"]) + s.add_dependency(%q<tmail>, [">= 0"]) + s.add_dependency(%q<rinruby>, [">= 0"]) + s.add_dependency(%q<ohm>, [">= 0"]) + s.add_dependency(%q<ohm-contrib>, [">= 0"]) + s.add_dependency(%q<SystemTimer>, [">= 0"]) + s.add_dependency(%q<rjb>, [">= 0"]) + s.add_dependency(%q<dm-core>, [">= 0"]) + s.add_dependency(%q<dm-serializer>, [">= 0"]) + s.add_dependency(%q<dm-timestamps>, [">= 0"]) + s.add_dependency(%q<dm-types>, [">= 0"]) + s.add_dependency(%q<dm-migrations>, [">= 0"]) + s.add_dependency(%q<dm-validations>, [">= 0"]) + s.add_dependency(%q<dm-sqlite-adapter>, [">= 0"]) + s.add_dependency(%q<haml>, [">= 3"]) + s.add_dependency(%q<ruby-plot>, ["~> 0.4.0"]) + s.add_dependency(%q<statsample>, [">= 0"]) s.add_dependency(%q<jeweler>, [">= 0"]) end end |