diff options
author | dv <dv@dv.de> | 2011-08-01 14:31:35 +0200 |
---|---|---|
committer | dv <dv@dv.de> | 2011-08-01 14:31:35 +0200 |
commit | 671c2cd22be19d340beec52a2b310d81d2389faf (patch) | |
tree | a0d959350f2a06cce53e50b062350ca4c22566cd /lib | |
parent | 84fde83d9fe568a2bb10ccf302ef833766e464f2 (diff) | |
parent | 96da910e30e04af320c39bcfc64761c91e5770e1 (diff) |
Merge branch 'development' into reg_min_max
Diffstat (limited to 'lib')
-rw-r--r-- | lib/algorithm.rb | 108 | ||||
-rw-r--r-- | lib/dataset.rb | 15 | ||||
-rw-r--r-- | lib/model.rb | 35 | ||||
-rw-r--r-- | lib/parser.rb | 192 |
4 files changed, 321 insertions(+), 29 deletions(-)
diff --git a/lib/algorithm.rb b/lib/algorithm.rb index e2397f0..50ce359 100644 --- a/lib/algorithm.rb +++ b/lib/algorithm.rb @@ -4,6 +4,7 @@ R = nil require "rinruby" require "statsample" +require 'uri' module OpenTox @@ -210,6 +211,106 @@ module OpenTox end end + # Structural Graph Clustering by TU Munich + # Finds clusters similar to a query structure in a given training dataset + # May be queried for cluster membership of an unknown compound + class StructuralClustering + attr_accessor :training_dataset_uri, :training_threshold, :query_dataset_uri, :query_threshold, :target_clusters_array + + # @params[String] Training dataset_uri + # @params[Float] Similarity threshold for training (optional) + # @params[String] Cluster service uri (no AA) + def initialize training_dataset_uri, training_threshold=0.8, cluster_service_uri = "http://opentox-dev.informatik.tu-muenchen.de:8080/OpenTox/algorithm/StructuralClustering" + + if (training_dataset_uri =~ URI::regexp).nil? || (cluster_service_uri =~ URI::regexp).nil? + raise "Invalid URI." + end + @training_dataset_uri = training_dataset_uri + if !OpenTox::Algorithm.numeric? training_threshold || training_threshold <0 || training_threshold >1 + raise "Training threshold out of bounds." 
+ end + @training_threshold = training_threshold.to_f + + # Train a cluster model + params = {:dataset_uri => @training_dataset_uri, :threshold => @training_threshold } + @cluster_model_uri = OpenTox::RestClientWrapper.post cluster_service_uri, params + cluster_model_rdf = OpenTox::RestClientWrapper.get @cluster_model_uri + @datasets = OpenTox::Parser::Owl.from_rdf cluster_model_rdf, OT.Dataset, true # must extract OT.Datasets from model + + # Process parsed OWL objects + @clusterid_dataset_map = Hash.new + @datasets.each { |d| + begin + d.metadata[OT.hasSource]["Structural Clustering cluster "] = "" # must parse in metadata for string (not elegant) + @clusterid_dataset_map[d.metadata[OT.hasSource].to_i] = d.uri + rescue Exception => e + # ignore other entries! + end + } + end + + # Whether a model has been trained + def trained? + !@cluster_model_uri.nil? + end + + # Instance query: clusters for a compound + # @params[String] Query compound + # @params[Float] Similarity threshold for query to clusters (optional) + def get_clusters query_compound_uri, query_threshold = 0.5 + + if !OpenTox::Algorithm.numeric? query_threshold || query_threshold <0 || query_threshold >1 + raise "Query threshold out of bounds." 
+ end + @query_threshold = query_threshold.to_f + + + # Preparing a query dataset + query_dataset = OpenTox::Dataset.new + @query_dataset_uri = query_dataset.save + query_dataset = OpenTox::Dataset.find @query_dataset_uri + query_dataset.add_compound query_compound_uri + @query_dataset_uri = query_dataset.save + + # Obtaining a clustering for query compound + params = { :dataset_uri => @query_dataset_uri, :threshold => @query_threshold } + cluster_query_dataset_uri = OpenTox::RestClientWrapper.post @cluster_model_uri, params + cluster_query_dataset = OpenTox::Dataset.new cluster_query_dataset_uri + cluster_query_dataset.load_all + + # Reading cluster ids for features from metadata + feature_clusterid_map = Hash.new + pattern="Prediction feature for cluster assignment " # must parse for string in metadata (not elegant) + cluster_query_dataset.features.each { |feature_uri,metadata| + metadata[DC.title][pattern]="" + feature_clusterid_map[feature_uri] = metadata[DC.title].to_i + } + + # Integrity check + unless cluster_query_dataset.compounds.size == 1 + raise "Number of predicted compounds is != 1." + end + + # Process data entry + query_compound_uri = cluster_query_dataset.compounds[0] + @target_clusters_array = Array.new + cluster_query_dataset.features.keys.each { |cluster_membership_feature| + + # Getting dataset URI for cluster + target_cluster = feature_clusterid_map[cluster_membership_feature] + dataset = @clusterid_dataset_map[target_cluster] + + # Finally look up presence + data_entry = cluster_query_dataset.data_entries[query_compound_uri] + present = data_entry[cluster_membership_feature][0] + + # Store result + @target_clusters_array << dataset if present > 0.5 # 0.0 for absence, 1.0 for presence + } + end + + end + module Neighbors # Local multi-linear regression (MLR) prediction from neighbors. 
@@ -815,6 +916,13 @@ module OpenTox (nr_zeroes == 0) # also remove feature present everywhere end + # Numeric value test + # @param[Object] value + # @return [Boolean] Whether value is a number + def self.numeric?(value) + true if Float(value) rescue false + end + # For symbolic features # @param [Array] Array to test, must indicate non-occurrence with 0. # @return [Boolean] Whether the feature has variance zero. diff --git a/lib/dataset.rb b/lib/dataset.rb index f701699..2147a4d 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -102,6 +102,13 @@ module OpenTox copy parser.load_uri(subjectid) end + def load_sdf(sdf,subjectid=nil) + save(subjectid) unless @uri # get a uri for creating features + parser = Parser::Sdf.new + parser.dataset = self + parser.load_sdf(sdf) + end + # Load CSV string (format specification: http://toxcreate.org/help) # - loads data_entries, compounds, features # - sets metadata (warnings) for parser errors @@ -236,7 +243,13 @@ module OpenTox sum="" @compounds.each{ |c| sum << OpenTox::Compound.new(c).to_inchi - sum << OpenTox::Compound.new(c).to_sdf + sum << OpenTox::Compound.new(c).to_sdf.sub(/\n\$\$\$\$/,'') + @data_entries[c].each{ |f,v| + sum << "> <\"#{f}\">\n" + sum << v.join(", ") + sum << "\n\n" + } + sum << "$$$$\n" } sum end diff --git a/lib/model.rb b/lib/model.rb index d5d54b6..e6fbe2f 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -50,38 +50,49 @@ module OpenTox @predicted_variable end + def predicted_variables( subjectid ) + load_predicted_variables( subjectid, false ) unless @predicted_variables + @predicted_variables + end + def predicted_confidence( subjectid ) load_predicted_variables( subjectid ) unless @predicted_confidence @predicted_confidence end private - def load_predicted_variables( subjectid=nil ) + def load_predicted_variables( subjectid=nil, use_confidence=true ) load_metadata(subjectid) if @metadata==nil or @metadata.size==0 or (@metadata.size==1 && @metadata.values[0]==@uri) if 
@metadata[OT.predictedVariables] predictedVariables = @metadata[OT.predictedVariables] if predictedVariables.is_a?(Array) if (predictedVariables.size==1) @predicted_variable = predictedVariables[0] - elsif (predictedVariables.size==2) + elsif (predictedVariables.size>=2) # PENDING identify confidence - conf_index = -1 - predictedVariables.size.times do |i| - f = OpenTox::Feature.find(predictedVariables[i]) - conf_index = i if f.metadata[DC.title]=~/(?i)confidence/ + if use_confidence + conf_index = -1 + predictedVariables.size.times do |i| + f = OpenTox::Feature.find(predictedVariables[i]) + conf_index = i if f.metadata[DC.title]=~/(?i)confidence/ + end + raise "could not estimate predicted variable from model: '"+uri.to_s+ + "', number of predicted-variables==2, but no confidence found" if conf_index==-1 + end + if (predictedVariables.size==2) && use_confidence + @predicted_variable = predictedVariables[1-conf_index] + @predicted_confidence = predictedVariables[conf_index] + else + @predicted_variables = predictedVariables end - raise "could not estimate predicted variable from model: '"+uri.to_s+ - "', number of predicted-variables==2, but no confidence found" if conf_index==-1 - @predicted_variable = predictedVariables[1-conf_index] - @predicted_confidence = predictedVariables[conf_index] else - raise "could not estimate predicted variable from model: '"+uri.to_s+"', number of predicted-variables > 2" + raise "could not estimate predicted variable from model: '"+uri.to_s+"', number of predicted-variables == 0" end else raise "could not estimate predicted variable from model: '"+uri.to_s+"', predicted-variables is no array" end end - raise "could not estimate predicted variable from model: '"+uri.to_s+"'" unless @predicted_variable + raise "could not estimate predicted variable from model: '"+uri.to_s+"'" unless (@predicted_variable || @predicted_variables) end end diff --git a/lib/parser.rb b/lib/parser.rb index 07bee67..d0975af 100644 --- a/lib/parser.rb +++ 
b/lib/parser.rb @@ -86,7 +86,11 @@ module OpenTox # @param [String] rdf # @param [String] type of the info (e.g. OT.Task, OT.ErrorReport) needed to get the subject-uri # @return [Owl] with uri and metadata set - def self.from_rdf( rdf, type ) + def self.from_rdf( rdf, type, allow_multiple = false ) + + uris = Array.new + owls = Array.new + # write to file and read convert with rapper into tripples file = Tempfile.new("ot-rdfxml") file.puts rdf @@ -99,20 +103,27 @@ module OpenTox triples.each_line do |line| triple = line.to_triple if triple[1] == RDF['type'] and triple[2]==type - raise "uri already set, two uris found with type: "+type.to_s if uri + if !allow_multiple + raise "uri already set, two uris found with type: "+type.to_s if uri + end uri = triple[0] + uris << uri end end File.delete(file.path) + # load metadata - metadata = {} - triples.each_line do |line| - triple = line.to_triple - metadata[triple[1]] = triple[2].split('^^').first if triple[0] == uri and triple[1] != RDF['type'] - end - owl = Owl::Generic.new(uri) - owl.metadata = metadata - owl + uris.each { |uri| + metadata = {} + triples.each_line do |line| + triple = line.to_triple + metadata[triple[1]] = triple[2].split('^^').first if triple[0] == uri and triple[1] != RDF['type'] + end + owl = Owl::Generic.new(uri) + owl.metadata = metadata + owls << owl + } + allow_multiple ? owls : owls[0] end # Generic parser for all OpenTox classes @@ -350,7 +361,6 @@ module OpenTox @dataset end - private def warnings @@ -437,12 +447,8 @@ module OpenTox end end - def numeric?(value) - true if Float(value) rescue false - end - def feature_type(value) - if numeric? value + if OpenTox::Algorithm::numeric? 
value return OT.NumericFeature else return OT.NominalFeature @@ -454,5 +460,159 @@ module OpenTox end end + + class Table + + attr_accessor :data, :features, :compounds + + def initialize + @data = {} + @activity_errors = [] + end + + def feature_values(feature) + @data.collect{|c, row| row[feature]}.uniq.compact + end + + def feature_types(feature) + @data.collect{|c, row| feature_type(row[feature])}.uniq.compact + end + + def features + @data.collect{|c,row| row.keys}.flatten.uniq + end + + def clean_features + ignored_features = [] + features.each do |feature| + if feature_values(feature).size > 5 + if feature_types(feature).size == 1 and feature_types(feature).first == OT.NumericFeature + # REGRESSION + elsif feature_types(feature).include? OT.NumericFeature + @data.each{|c,row| row[feature] = nil unless OpenTox::Algorithm::numeric?(row[feature]) } # delete nominal features + @activity_errors << "Nominal feature values of #{feature} ignored (using numeric features for regression models)." + else + @activity_errors << "Feature #{feature} ignored (more than 5 nominal feature values and no numeric values)." + ignored_features << feature + next + end + elsif feature_values(feature).size <= 1 + @activity_errors << "Feature #{feature} ignored (less than 2 feature values)." + ignored_features << feature + else + # CLASSIFICATION + end + end + ignored_features.each do |feature| + @data.each{ |c,row| row.delete feature } + end + @activity_errors + end + + def add_to_dataset(dataset) + features.each do |feature_name| + feature_uri = File.join(dataset.uri,"feature",URI.encode(feature_name)) + dataset.add_feature(feature_uri,{DC.title => feature_name}) + end + + @data.each do |compound,row| + unless row.empty? + row.each do |feature,value| + if OpenTox::Algorithm::numeric?(value) + value = value.to_f + elsif value.nil? or value.empty? 
+ value = nil + else + value = value.to_s + end + feature_uri = File.join(dataset.uri,"feature",URI.encode(feature)) + dataset.add(compound, feature_uri, value) + #dataset.features[feature_uri][RDF.type] = feature_types(feature) + #dataset.features[feature_uri][OT.acceptValue] = feature_values(feature) + if feature_types(feature).include? OT.NumericFeature + dataset.features[feature_uri][RDF.type] = [OT.NumericFeature] + else + dataset.features[feature_uri][RDF.type] = [OT.NominalFeature] + dataset.features[feature_uri][OT.acceptValue] = feature_values(feature) + end + end + end + end + end + + private + + def feature_type(value) + if OpenTox::Algorithm::numeric? value + return OT.NumericFeature + else + return OT.NominalFeature + end + end + end + + # quick hack to enable sdf import via csv + # should be refactored + class Sdf + + attr_accessor :dataset + + def initialize + @data = {} + + @compound_errors = [] + @activity_errors = [] + @duplicates = {} + end + + def load_sdf(sdf) + + obconversion = OpenBabel::OBConversion.new + obmol = OpenBabel::OBMol.new + obconversion.set_in_and_out_formats "sdf", "inchi" + + table = Table.new + + properties = [] + sdf.each_line { |l| properties << l.to_s if l.match(/</) } + properties.uniq! + properties.sort! + properties.collect!{ |p| p.gsub(/<|>/,'').strip.chomp } + + rec = 0 + sdf.split(/\$\$\$\$\r*\n/).each do |s| + rec += 1 + obconversion.read_string obmol, s + begin + inchi = obconversion.write_string(obmol).gsub(/\s/,'').chomp + @duplicates[inchi] = [] unless @duplicates[inchi] + @duplicates[inchi] << rec #inchi#+", "+row.join(", ") + compound = Compound.from_inchi inchi + rescue + @compound_errors << "Could not convert structure to InChI, all entries for this compound (record #{rec} have been ignored! 
\n#{s}" + next + end + row = {} + obmol.get_data.each { |d| row[d.get_attribute] = d.get_value if properties.include?(d.get_attribute) } + table.data[compound.uri] = row + end + + # finda and remove ignored_features + @activity_errors = table.clean_features + table.add_to_dataset @dataset + + warnings = '' + warnings += "<p>Incorrect Smiles structures (ignored):</p>" + @compound_errors.join("<br/>") unless @compound_errors.empty? + warnings += "<p>Irregular activities (ignored):</p>" + @activity_errors.join("<br/>") unless @activity_errors.empty? + duplicate_warnings = '' + @duplicates.each {|inchi,lines| duplicate_warnings << "<p>#{lines.join('<br/>')}</p>" if lines.size > 1 } + warnings += "<p>Duplicated structures (all structures/activities used for model building, please make sure, that the results were obtained from <em>independent</em> experiments):</p>" + duplicate_warnings unless duplicate_warnings.empty? + + @dataset.metadata[OT.Warnings] = warnings + @dataset + + end + + end end end |