summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristoph Helma <helma@in-silico.ch>2015-07-19 21:25:36 +0200
committerChristoph Helma <helma@in-silico.ch>2015-07-19 21:25:36 +0200
commit23d0106f985206c898c8b30f1859b619a3970398 (patch)
treea09a9266fe8da335852e532b92f6bddac6f72fc1
parent686328edef0d15de0b8be06b2c3198e01810bc70 (diff)
lazar predictions working in principle
-rw-r--r--lib/bbrc.rb54
-rw-r--r--lib/descriptor.rb13
-rw-r--r--lib/fminer.rb25
-rw-r--r--lib/last.rb127
-rw-r--r--lib/lazar.rb306
-rw-r--r--lib/opentox-algorithm.rb11
-rw-r--r--lib/transform.rb6
7 files changed, 366 insertions, 176 deletions
diff --git a/lib/bbrc.rb b/lib/bbrc.rb
index f7d29f9..40de186 100644
--- a/lib/bbrc.rb
+++ b/lib/bbrc.rb
@@ -1,12 +1,7 @@
-ENV['FMINER_SMARTS'] = 'true'
-ENV['FMINER_NO_AROMATIC'] = 'true'
-ENV['FMINER_PVALUES'] = 'true'
-ENV['FMINER_SILENT'] = 'true'
-ENV['FMINER_NR_HITS'] = 'true'
-
module OpenTox
module Algorithm
class Fminer
+ #
# Run bbrc algorithm on dataset
#
# @param [String] dataset_uri URI of the training dataset
@@ -24,6 +19,7 @@ module OpenTox
@fminer=OpenTox::Algorithm::Fminer.new
@fminer.check_params(params,5)
+ # TODO introduce task again
#task = OpenTox::Task.run("Mining BBRC features", __FILE__ ) do |task|
time = Time.now
@@ -50,7 +46,7 @@ module OpenTox
feature_dataset.creator = __FILE__
feature_dataset.parameters = [
{ "title" => "dataset_id", "paramValue" => params[:dataset].id },
- { "title" => "prediction_feature", "paramValue" => params[:prediction_feature].id },
+ { "title" => "prediction_feature_id", "paramValue" => params[:prediction_feature].id },
{ "title" => "min_frequency", "paramValue" => @fminer.minfreq },
{ "title" => "nr_hits", "paramValue" => (params[:nr_hits] == "true" ? "true" : "false") },
{ "title" => "backbone", "paramValue" => (params[:backbone] == "false" ? "false" : "true") }
@@ -67,8 +63,9 @@ module OpenTox
#task.progress 10
step_width = 80 / @bbrc.GetNoRootNodes().to_f
+ #features_smarts = Set.new
features = []
- data_entries = [[]]
+ data_entries = Array.new(params[:dataset].compounds.size) {[]}
puts "Setup: #{Time.now-time}"
time = Time.now
@@ -76,7 +73,6 @@ module OpenTox
# run @bbrc
- # prepare to receive results as hash { c => [ [f,v], ... ] }
fminer_results = {}
(0 .. @bbrc.GetNoRootNodes()-1).each do |j|
@@ -114,37 +110,50 @@ module OpenTox
"substructure" => true,
"smarts" => smarts.dup,
"pValue" => p_value.to_f.abs.round(5),
- "effect" => effect
+ "effect" => effect,
+ "parameters" => [
+ { "title" => "dataset_id", "paramValue" => params[:dataset].id },
+ { "title" => "prediction_feature_id", "paramValue" => params[:prediction_feature].id }
+ ]
})
features << feature
- features.uniq!
ftime += Time.now - ft
id_arrs.each { |id_count_hash|
id=id_count_hash.keys[0].to_i
count=id_count_hash.values[0].to_i
- compound_idx = params[:dataset].compounds.index @fminer.compounds[id]
+ fminer_results[@fminer.compounds[id]] || fminer_results[@fminer.compounds[id]] = {}
+ compound_idx = params[:dataset].compounds.index @fminer.compounds[id]
feature_idx = features.index feature
data_entries[compound_idx] ||= []
if params[:nr_hits] == "true"
+ fminer_results[@fminer.compounds[id]][feature] = count
data_entries[compound_idx][feature_idx] = count
else
+ fminer_results[@fminer.compounds[id]][feature] = 1
data_entries[compound_idx][feature_idx] = 1
end
}
end # end of
end # feature parsing
- #p features
- p data_entries
- #p params[:dataset].compounds
- #p @fminer.compounds
-
puts "Fminer: #{Time.now-time} (find/create Features: #{ftime})"
time = Time.now
- #puts JSON.pretty_generate(fminer_results)
+
+ # convert nil entries to 0
+ data_entries.collect! do |r|
+ if r.empty?
+ Array.new(features.size,0)
+ else
+ r[features.size-1] = 0 if r.size < features.size # grow array to match feature size
+ r.collect!{|c| c.nil? ? 0 : c} # remove nils
+ end
+ end
+
=begin
+ # This part increases runtime by a factor of ~65
+ # TODO: check if any information is lost due to simplification
fminer_compounds = @fminer.training_dataset.compounds
prediction_feature_idx = @fminer.training_dataset.features.index @fminer.prediction_feature
prediction_feature_all_acts = fminer_compounds.each_with_index.collect { |c,idx|
@@ -178,15 +187,10 @@ module OpenTox
feature_dataset.save
puts "Save: #{Time.now-time}"
- p feature_dataset
feature_dataset
-
- end
- #end
+ #end
+ end
end
end
end
-
-
-
diff --git a/lib/descriptor.rb b/lib/descriptor.rb
index 9a93b32..d862a41 100644
--- a/lib/descriptor.rb
+++ b/lib/descriptor.rb
@@ -56,25 +56,26 @@ module OpenTox
end
def self.smarts_match compounds, smarts, count=false
+ bad_request_error "Compounds for smarts_match are empty" unless compounds
+ bad_request_error "Smarts for smarts_match are empty" unless smarts
compounds = parse compounds
obconversion = OpenBabel::OBConversion.new
obmol = OpenBabel::OBMol.new
obconversion.set_in_format('inchi')
smarts_pattern = OpenBabel::OBSmartsPattern.new
- fingerprint = {}
- compounds = [compounds] unless compounds.is_a? Array
+ #fingerprint = {}
smarts = [smarts] unless smarts.is_a? Array
- compounds.each do |compound|
+ fingerprint = Array.new(compounds.size){Array.new(smarts.size,false)}
+ compounds.each_with_index do |compound,c|
obconversion.read_string(obmol,compound.inchi)
- fingerprint[compound] = {}
- smarts.each do |smart|
+ smarts.each_with_index do |smart,s|
smarts_pattern.init(smart)
if smarts_pattern.match(obmol)
count ? value = smarts_pattern.get_map_list.to_a.size : value = 1
else
value = 0
end
- fingerprint[compound][smart] = value
+ fingerprint[c][s] = value
end
end
fingerprint
diff --git a/lib/fminer.rb b/lib/fminer.rb
index 3333517..59ee224 100644
--- a/lib/fminer.rb
+++ b/lib/fminer.rb
@@ -1,4 +1,5 @@
require_relative 'bbrc'
+require_relative 'last'
=begin
* Name: fminer.rb
* Description: Fminer library
@@ -61,8 +62,7 @@ module OpenTox
end
if @minfreq.nil?
@minfreq=min_frequency(@training_dataset,@prediction_feature,per_mil)
- p "min_frequency #{@minfreq} (input was #{per_mil} per-mil)"
- #$logger.debug "min_frequency #{@minfreq} (input was #{per_mil} per-mil)"
+ $logger.debug "min_frequency #{@minfreq} (input was #{per_mil} per-mil)"
end
end
@@ -164,17 +164,18 @@ module OpenTox
end
metadata = {
- RDF.type => [RDF::OT.Feature, RDF::OT.Substructure, RDF::OT.NumericFeature],
- RDF::OT.smarts => smarts.dup,
- RDF::OT.pValue => p_value.abs.round(5),
- RDF::OT.effect => effect
+ "title" => smarts.dup,
+ "substructure" => true,
+ "numeric" => true,
+ "smarts" => smarts.dup,
+ "pValue" => p_value.abs.round(5),
+ "effect" => effect,
+ "parameters" => [
+ { "title" => "dataset_id", "paramValue" => params[:dataset].id },
+ { "title" => "prediction_feature_id", "paramValue" => params[:prediction_feature].id }
+ ]
}
- parameters = [
- { RDF::DC.title => "dataset_uri", RDF::OT.paramValue => params[:dataset_uri] },
- { RDF::DC.title => "prediction_feature", RDF::OT.paramValue => params[:prediction_feature] }
- ]
- metadata[RDF::OT.hasSource]=feature_dataset_uri if feature_dataset_uri
- [ metadata, parameters ]
+ metadata
end
# Minimum Frequency
diff --git a/lib/last.rb b/lib/last.rb
new file mode 100644
index 0000000..944d95e
--- /dev/null
+++ b/lib/last.rb
@@ -0,0 +1,127 @@
+module OpenTox
+ module Algorithm
+ class Fminer
+
+ # Run last algorithm on a dataset
+ #
+ # @param [String] dataset_uri URI of the training dataset
+ # @param [String] prediction_feature URI of the prediction feature (i.e. dependent variable)
+ # @param [optional] parameters LAST parameters, accepted parameters are
+ # - min_frequency freq Minimum frequency (default 5)
+ # - feature_type Feature type, can be 'paths' or 'trees' (default "trees")
+ # - nr_hits Set to "true" to get hit count instead of presence
+ # - get_target Set to "true" to obtain target variable as feature
+ # @return [text/uri-list] Task URI
+ def self.last params
+
+ @fminer=OpenTox::Algorithm::Fminer.new
+ @fminer.check_params(params,80)
+
+ # TODO introduce task again
+ #task = OpenTox::Task.run("Mining LAST features", uri('/fminer/last')) do |task|
+
+ @last = Last::Last.new
+ @last.Reset
+ if @fminer.prediction_feature.feature_type == "regression"
+ @last.SetRegression(true) # AM: DO NOT MOVE DOWN! Must happen before the other Set... operations!
+ else
+ bad_request_error "No accept values for "\
+ "dataset '#{fminer.training_dataset.id}' and "\
+ "feature '#{fminer.prediction_feature.id}'" unless
+ @fminer.prediction_feature.accept_values
+ value_map=@fminer.prediction_feature.value_map
+ end
+ @last.SetMinfreq(@fminer.minfreq)
+ @last.SetType(1) if params[:feature_type] == "paths"
+ @last.SetConsoleOut(false)
+
+
+ feature_dataset = OpenTox::Dataset.new
+ feature_dataset["title"] = "LAST representatives for #{@fminer.training_dataset.title}",
+ feature_dataset.creator = __FILE__
+ feature_dataset.parameters = [
+ { "title" => "dataset_id", "paramValue" => params[:dataset].id },
+ { "title" => "prediction_feature_id", "paramValue" => params[:prediction_feature].id },
+ { "title" => "min_frequency", "paramValue" => @fminer.minfreq },
+ { "title" => "nr_hits", "paramValue" => (params[:nr_hits] == "true" ? "true" : "false") }
+ ]
+
+ @fminer.compounds = []
+ @fminer.db_class_sizes = Array.new # AM: effect
+ @fminer.all_activities = Hash.new # DV: for effect calculation (class and regr)
+ @fminer.smi = [] # needed for matching the patterns back
+
+ # Add data to fminer
+ @fminer.add_fminer_data(@last, value_map)
+ #task.progress 10
+ #step_width = 80 / @bbrc.GetNoRootNodes().to_f
+ # run @last
+ xml = ""
+ (0 .. @last.GetNoRootNodes()-1).each do |j|
+ results = @last.MineRoot(j)
+ #task.progress 10+step_width*(j+1)
+ results.each do |result|
+ xml << result
+ end
+ end
+
+ lu = LU.new # uses last-utils here
+ dom=lu.read(xml) # parse GraphML
+ smarts=lu.smarts_rb(dom,'nls') # converts patterns to LAST-SMARTS using msa variant (see last-pm.maunz.de)
+ params[:nr_hits] == "true" ? hit_count=true : hit_count=false
+ matches, counts = lu.match_rb(@fminer.smi,smarts,hit_count,true) # creates instantiations
+
+ features = []
+ # create table with correct size
+ data_entries = Array.new(params[:dataset].compounds.size) {Array.new(matches.size,0)}
+ matches.each do |smarts, ids|
+ metadata = @fminer.calc_metadata(smarts, ids, counts[smarts], @last, nil, value_map, params)
+ feature = OpenTox::Feature.find_or_create_by(metadata)
+ features << feature
+ ids.each_with_index do |id,idx|
+ compound_idx = params[:dataset].compounds.index @fminer.compounds[id]
+ feature_idx = features.index feature
+ data_entries[compound_idx] ||= []
+ data_entries[compound_idx][feature_idx] = counts[smarts][idx]
+ end
+ end
+ feature_dataset.compounds = @fminer.training_dataset.compounds
+ feature_dataset.features = features
+ feature_dataset.data_entries = data_entries
+
+=begin
+ # TODO check if this code is necessary, I don't understand what it does
+ fminer_compounds = @fminer.training_dataset.compounds
+ prediction_feature_idx = @fminer.training_dataset.features.index @fminer.prediction_feature
+ prediction_feature_all_acts = fminer_compounds.each_with_index.collect { |c,idx|
+ @fminer.training_dataset.data_entries[idx][prediction_feature_idx]
+ }
+ fminer_noact_compounds = fminer_compounds - @fminer.compounds
+
+ if (params[:get_target] == "true")
+ feature_dataset.features = [ @fminer.prediction_feature ] + feature_dataset.features
+ end
+ fminer_compounds.each_with_index { |c,idx|
+ # TODO: fix value insertion
+ row = [ c ]
+ if (params[:get_target] == "true")
+ row = row + [ prediction_feature_all_acts[idx] ]
+ end
+ features.each { |f|
+ row << (fminer_results[c] ? fminer_results[c][f] : nil)
+ }
+ row.collect! { |v| v ? v : 0 } unless fminer_noact_compounds.include? c
+ feature_dataset << row
+ }
+=end
+
+ feature_dataset.save
+ feature_dataset
+
+ # end
+ end
+
+ end
+ end
+end
+
diff --git a/lib/lazar.rb b/lib/lazar.rb
index 1957c24..d0d2b76 100644
--- a/lib/lazar.rb
+++ b/lib/lazar.rb
@@ -11,6 +11,30 @@ module OpenTox
class Lazar
include OpenTox
+ include Mongoid::Document
+ include Mongoid::Timestamps
+ store_in collection: "model"
+
+ field :title, type: String
+ field :description, type: String
+ #field :parameters, type: Array, default: []
+ field :creator, type: String, default: __FILE__
+ # datasets
+ field :training_dataset_id, type: BSON::ObjectId
+ field :feature_dataset_id, type: BSON::ObjectId
+ # algorithms
+ field :feature_generation, type: String
+ field :feature_calculation_algorithm, type: String
+ field :prediction_algorithm, type: Symbol
+ field :similarity_algorithm, type: Symbol
+ # prediction features
+ field :prediction_feature_id, type: BSON::ObjectId
+ field :predicted_value_id, type: BSON::ObjectId
+ field :predicted_variables, type: Array
+ # parameters
+ field :min_sim, type: Float
+ field :propositionalized, type:Boolean
+ field :min_train_performance, type: Float
attr_accessor :prediction_dataset
@@ -18,131 +42,127 @@ module OpenTox
# Prepare lazar object (includes graph mining)
# @param[Array] lazar parameters as strings
# @param[Hash] REST parameters, as input by user
- def self.create params
+ def self.create feature_dataset, prediction_feature=nil, params={}
- lazar = OpenTox::Model::Lazar.new(File.join($model[:uri],SecureRandom.uuid))
+ lazar = OpenTox::Model::Lazar.new
- training_dataset = OpenTox::Dataset.new(params[:dataset_uri])
- lazar.parameters << {RDF::DC.title => "training_dataset_uri", RDF::OT.paramValue => training_dataset.uri}
+ bad_request_error "No features found in feature dataset #{feature_dataset.id}." if feature_dataset.features.empty?
+ lazar.feature_dataset_id = feature_dataset.id
+ training_dataset = OpenTox::Dataset.find(feature_dataset.parameters.select{|p| p["title"] == "dataset_id"}.first["paramValue"])
+ bad_request_error "Training dataset compounds do not match feature dataset compounds. Please ensure that they are in the same order." unless training_dataset.compounds == feature_dataset.compounds
+ lazar.training_dataset_id = training_dataset.id
- if params[:prediction_feature]
- resource_not_found_error "No feature '#{params[:prediction_feature]}' in dataset '#{params[:dataset_uri]}'" unless training_dataset.find_feature_uri( params[:prediction_feature] )
+ if prediction_feature
+ resource_not_found_error "No feature '#{params[:prediction_feature]}' in dataset '#{training_dataset.id}'" unless training_dataset.features.include?( params[:prediction_feature] )
else # try to read prediction_feature from dataset
resource_not_found_error "Please provide a prediction_feature parameter" unless training_dataset.features.size == 1
- params[:prediction_feature] = training_dataset.features.first.uri
+ prediction_feature = training_dataset.features.first
end
- lazar[RDF::OT.trainingDataset] = training_dataset.uri
- prediction_feature = OpenTox::Feature.new(params[:prediction_feature])
- predicted_variable = OpenTox::Feature.find_or_create({RDF::DC.title => "#{prediction_feature.title} prediction", RDF.type => [RDF::OT.Feature, prediction_feature[RDF.type]]})
- lazar[RDF::DC.title] = prediction_feature.title
- lazar.parameters << {RDF::DC.title => "prediction_feature_uri", RDF::OT.paramValue => prediction_feature.uri}
- lazar[RDF::OT.dependentVariables] = prediction_feature.uri
-
- bad_request_error "Unknown prediction_algorithm #{params[:prediction_algorithm]}" if params[:prediction_algorithm] and !OpenTox::Algorithm::Neighbors.respond_to?(params[:prediction_algorithm])
- lazar.parameters << {RDF::DC.title => "prediction_algorithm", RDF::OT.paramValue => params[:prediction_algorithm]} if params[:prediction_algorithm]
-
- confidence_feature = OpenTox::Feature.find_or_create({RDF::DC.title => "predicted_confidence", RDF.type => [RDF::OT.Feature, RDF::OT.NumericFeature]})
- lazar[RDF::OT.predictedVariables] = [ predicted_variable.uri, confidence_feature.uri ]
- case prediction_feature.feature_type
- when "classification"
- lazar.parameters << {RDF::DC.title => "prediction_algorithm", RDF::OT.paramValue => "weighted_majority_vote"} unless lazar.parameter_value "prediction_algorithm"
- lazar[RDF.type] = [RDF::OT.Model, RDF::OTA.ClassificationLazySingleTarget]
- when "regression"
- lazar.parameters << {RDF::DC.title => "prediction_algorithm", RDF::OT.paramValue => "local_svm_regression"} unless lazar.parameter_value "prediction_algorithm"
- lazar[RDF.type] = [RDF::OT.Model, RDF::OTA.RegressionLazySingleTarget]
+
+ lazar.prediction_feature_id = prediction_feature.id
+ lazar.title = prediction_feature.title
+
+ if params and params[:prediction_algorithm]
+ bad_request_error "Unknown prediction_algorithm #{params[:prediction_algorithm]}" unless OpenTox::Algorithm::Neighbors.respond_to?(params[:prediction_algorithm])
+ lazar.prediction_algorithm = params[:prediction_algorithm]
end
- lazar.parameter_value("prediction_algorithm") =~ /majority_vote/ ? lazar.parameters << {RDF::DC.title => "propositionalized", RDF::OT.paramValue => false} : lazar.parameters << {RDF::DC.title => "propositionalized", RDF::OT.paramValue => true}
- lazar.parameters << {RDF::DC.title => "min_sim", RDF::OT.paramValue => params[:min_sim].to_f} if params[:min_sim] and params[:min_sim].numeric?
- lazar.parameters << {RDF::DC.title => "feature_generation_uri", RDF::OT.paramValue => params[:feature_generation_uri]}
- #lazar.parameters["nr_hits"] = params[:nr_hits]
+ confidence_feature = OpenTox::Feature.find_or_create_by({
+ "title" => "Prediction confidence",
+ "numeric" => true
+ })
- if params["feature_generation_uri"]=~/fminer/
- if (params[:nr_hits] == "true")
- lazar.parameters << {RDF::DC.title => "feature_calculation_algorithm", RDF::OT.paramValue => "smarts_count"}
+ unless lazar.prediction_algorithm
+ lazar.prediction_algorithm = :weighted_majority_vote if prediction_feature.nominal
+ lazar.prediction_algorithm = :local_svm_regression if prediction_feature.numeric
+ end
+ lazar.prediction_algorithm =~ /majority_vote/ ? lazar.propositionalized = false : lazar.propositionalized = true
+
+ lazar.min_sim = params[:min_sim].to_f if params[:min_sim] and params[:min_sim].numeric?
+ lazar.nr_hits = params[:nr_hits] if params[:nr_hits]
+ lazar.feature_generation = feature_dataset.creator
+ #lazar.parameters << {"title" => "feature_generation_uri", "paramValue" => params[:feature_generation_uri]}
+ # TODO insert algorithm into feature dataset
+ # TODO store algorithms in mongodb?
+ if lazar.feature_generation =~ /fminer|bbrc|last/
+ if (lazar[:nr_hits] == "true")
+ lazar.feature_calculation_algorithm = "smarts_count"
else
- lazar.parameters << {RDF::DC.title => "feature_calculation_algorithm", RDF::OT.paramValue => "smarts_match"}
- end
- lazar.parameters << {RDF::DC.title => "similarity_algorithm", RDF::OT.paramValue => "tanimoto"}
- lazar.parameters << {RDF::DC.title => "min_sim", RDF::OT.paramValue => 0.3} unless lazar.parameter_value("min_sim")
- elsif params["feature_generation_uri"]=~/descriptor/ or params["feature_generation_uri"]==nil
- if params["feature_generation_uri"]
- method = params["feature_generation_uri"].split(%r{/}).last.chomp
- lazar.parameters << {RDF::DC.title => "feature_calculation_algorithm", RDF::OT.paramValue => method}
+ lazar.feature_calculation_algorithm = "smarts_match"
end
+ lazar.similarity_algorithm = "tanimoto"
+ lazar.min_sim = 0.3 unless lazar.min_sim
+ elsif lazar.feature_generation =~/descriptor/ or lazar.feature_generation.nil?
# cosine similarity is default (e.g. used when no feature_generation_uri is given and a feature_dataset_uri is provided instead)
- lazar.parameters << {RDF::DC.title => "similarity_algorithm", RDF::OT.paramValue => "cosine"}
- lazar.parameters << {RDF::DC.title => "min_sim", RDF::OT.paramValue => 0.7} unless lazar.parameter_value("min_sim")
+ lazar.similarity_algorithm = "cosine"
+ lazar.min_sim = 0.7 unless lazar.min_sim
else
- bad_request_error "unnkown feature generation method #{params["feature_generation_uri"]}"
+ bad_request_error "unkown feature generation method #{lazar.feature_generation}"
end
bad_request_error "Parameter min_train_performance is not numeric." if params[:min_train_performance] and !params[:min_train_performance].numeric?
- lazar.parameters << {RDF::DC.title => "min_train_performance", RDF::OT.paramValue => params[:min_train_performance].to_f} if params[:min_train_performance] and params[:min_train_performance].numeric?
- lazar.parameters << {RDF::DC.title => "min_train_performance", RDF::OT.paramValue => 0.1} unless lazar.parameter_value("min_train_performance")
+ lazar.min_train_performance = params[:min_train_performance].to_f if params[:min_train_performance] and params[:min_train_performance].numeric?
+ lazar.min_train_performance = 0.1 unless lazar.min_train_performance
+=begin
if params[:feature_dataset_uri]
bad_request_error "Feature dataset #{params[:feature_dataset_uri]} does not exist." unless URI.accessible? params[:feature_dataset_uri]
- lazar.parameters << {RDF::DC.title => "feature_dataset_uri", RDF::OT.paramValue => params[:feature_dataset_uri]}
+ lazar.parameters << {"title" => "feature_dataset_uri", "paramValue" => params[:feature_dataset_uri]}
lazar[RDF::OT.featureDataset] = params["feature_dataset_uri"]
else
# run feature generation algorithm
feature_dataset_uri = OpenTox::Algorithm::Generic.new(params[:feature_generation_uri]).run(params)
- lazar.parameters << {RDF::DC.title => "feature_dataset_uri", RDF::OT.paramValue => feature_dataset_uri}
+ lazar.parameters << {"title" => "feature_dataset_uri", "paramValue" => feature_dataset_uri}
lazar[RDF::OT.featureDataset] = feature_dataset_uri
end
- lazar.put
- lazar.uri
+=end
+ lazar.save
+ lazar
end
- def predict(params)
- @prediction_dataset = OpenTox::Dataset.new
- # set instance variables and prediction dataset parameters from parameters
- params.each {|k,v|
- self.class.class_eval { attr_accessor k.to_sym }
- instance_variable_set "@#{k}", v
- @prediction_dataset.parameters << {RDF::DC.title => k, RDF::OT.paramValue => v}
- }
- #["training_compounds", "fingerprints", "training_activities", "training_fingerprints", "query_fingerprint", "neighbors"].each {|k|
- ["training_compounds", "training_activities", "training_fingerprints", "query_fingerprint", "neighbors"].each {|k|
- self.class.class_eval { attr_accessor k.to_sym }
- instance_variable_set("@#{k}", [])
- }
-
- @prediction_feature = OpenTox::Feature.new @prediction_feature_uri
- @predicted_variable = OpenTox::Feature.new @predicted_variable_uri
- @predicted_confidence = OpenTox::Feature.new @predicted_confidence_uri
- @prediction_dataset.metadata = {
- RDF::DC.title => "Lazar prediction for #{@prediction_feature.title}",
- RDF::DC.creator => @model_uri,
- RDF::OT.hasSource => @model_uri,
- RDF::OT.dependentVariables => @prediction_feature_uri,
- RDF::OT.predictedVariables => [@predicted_variable_uri,@predicted_confidence_uri]
- }
-
- @training_dataset = OpenTox::Dataset.new(@training_dataset_uri)
-
- @feature_dataset = OpenTox::Dataset.new(@feature_dataset_uri)
- bad_request_error "No features found in feature dataset #{@feature_dataset.uri}." if @feature_dataset.features.empty?
-
- @similarity_feature = OpenTox::Feature.find_or_create({RDF::DC.title => "#{@similarity_algorithm.capitalize} similarity", RDF.type => [RDF::OT.Feature, RDF::OT.NumericFeature]})
-
- @prediction_dataset.features = [ @predicted_variable, @predicted_confidence, @prediction_feature, @similarity_feature ]
+ def predict params
+
+ # tailored for performance
+ # all consistency checks should be done during model creation
+
+ time = Time.now
+
+ # prepare prediction dataset
+ prediction_dataset = OpenTox::Dataset.new
+ prediction_feature = OpenTox::Feature.find prediction_feature_id
+ prediction_feature = OpenTox::Feature.find prediction_feature_id
+ prediction_dataset.title = "Lazar prediction for #{prediction_feature.title}",
+ prediction_dataset.creator = __FILE__,
- prediction_feature_pos = @training_dataset.features.collect{|f| f.uri}.index @prediction_feature.uri
+ similarity_feature = OpenTox::Feature.find_or_create_by({
+ "title" => "#{similarity_algorithm.capitalize} similarity",
+ "numeric" => true
+ })
+
+ #prediction_dataset.features = [ predicted_confidence, prediction_feature, similarity_feature ]
- if @dataset_uri
- compounds = OpenTox::Dataset.new(@dataset_uri).compounds
+ # TODO set instance variables and prediction dataset parameters from parameters (see development branch)
+
+
+ training_dataset = OpenTox::Dataset.find(training_dataset_id)
+
+ feature_dataset = OpenTox::Dataset.find(feature_dataset_id)
+
+ if params[:compound]
+ compounds = [ params[:compound]]
else
- compounds = [ OpenTox::Compound.new(@compound_uri) ]
+ compounds = params[:dataset].compounds
end
- # @training_fingerprints = @feature_dataset.data_entries
+ puts "Setup: #{Time.now-time}"
+ time = Time.now
+
+ # TODO: this seems to be very time consuming
+ # uses > 11" on development machine
# select training fingerprints from feature dataset (do NOT use entire feature dataset)
- feature_compound_uris = @feature_dataset.compounds.collect{|c| c.uri}
- @training_fingerprints = []
+=begin
@training_dataset.compounds.each do |c|
- idx = feature_compound_uris.index(c.uri)
+ idx = @feature_dataset.compounds.index(c)
bad_request_error "training dataset compound not found in feature dataset" if idx==nil
@training_fingerprints << @feature_dataset.data_entries[idx][0..-1]
end
@@ -151,61 +171,85 @@ module OpenTox
values << nil while (values.size < @feature_dataset.features.size)
values
end
- @training_compounds = @training_dataset.compounds
- internal_server_error "sth went wrong #{@training_compounds.size} != #{@training_fingerprints.size}" if @training_compounds.size != @training_fingerprints.size
-
- feature_names = @feature_dataset.features.collect{ |f| f[RDF::DC.title] }
- query_fingerprints = {}
- # first lookup in feature dataset, than apply feature_generation_uri
- compounds.each do |c|
- idx = feature_compound_uris.index(c.uri) # just use first index, features should be equal for duplicates
- if idx!=nil
- fingerprint = {}
- @feature_dataset.features.each do |f|
- fingerprint[f[RDF::DC.title]] = @feature_dataset.data_entry_value(idx,f.uri)
- end
- query_fingerprints[c] = fingerprint
- end
- end
- # if lookup failed, try computing!
- if query_fingerprints.size!=compounds.size
- bad_request_error "no feature_generation_uri provided in model AND cannot lookup all test compounds in existing feature dataset" unless @feature_calculation_algorithm
- query_fingerprints = OpenTox::Algorithm::Descriptor.send( @feature_calculation_algorithm, compounds, feature_names )#.collect{|row| row.collect{|val| val ? val.to_f : 0.0 } }
- end
+=end
+ # replacement code (sequence has been preserved in bbrc and last)
+ # uses ~0.025" on development machine
+ #@training_fingerprints = @feature_dataset.data_entries
+ #@training_compounds = @training_dataset.compounds
+
+ #feature_names = @feature_dataset.features.collect{ |f| f[:title] }
+
+ puts "Fingerprint: #{Time.now-time}"
+ time = Time.now
+ query_fingerprint = OpenTox::Algorithm::Descriptor.send( feature_calculation_algorithm, compounds, feature_dataset.features.collect{|f| f["title"]} )
+
+ puts "Fingerprint calculation: #{Time.now-time}"
+ time = Time.now
# AM: transform to cosine space
- @min_sim = (@min_sim.to_f*2.0-1.0).to_s if @similarity_algorithm =~ /cosine/
+ min_sim = (min_sim.to_f*2.0-1.0).to_s if similarity_algorithm =~ /cosine/
- compounds.each_with_index do |compound,c_count|
- $logger.debug "predict compound #{c_count+1}/#{compounds.size} #{compound.uri}"
+ neighbors = []
+ compounds.each_with_index do |compound,c|
+ $logger.debug "predict compound #{c+1}/#{compounds.size} #{compound.inchi}"
- database_activities = @training_dataset.values(compound,@prediction_feature)
+ database_activities = training_dataset.values(compound,prediction_feature)
if database_activities and !database_activities.empty?
database_activities.each do |database_activity|
$logger.debug "do not predict compound, it occurs in dataset with activity #{database_activity}"
- @prediction_dataset << [compound, nil, nil, database_activity, nil]
+ prediction_dataset << [compound, nil, nil, database_activity, nil]
end
next
- elsif @prediction_dataset.compound_indices(compound.uri)
- $logger.debug "compound already predicted (copy old prediction)"
- predicted_value = @prediction_dataset.data_entry_value(@prediction_dataset.compound_indices(compound.uri).first,@predicted_variable.uri)
- confidence_value = @prediction_dataset.data_entry_value(@prediction_dataset.compound_indices(compound.uri).first,@predicted_confidence.uri)
else
+=begin
@training_activities = @training_dataset.data_entries.collect{|entry|
act = entry[prediction_feature_pos] if entry
@prediction_feature.feature_type=="classification" ? @prediction_feature.value_map.invert[act] : act
}
+=end
+
+ #@query_fingerprint = @feature_dataset.features.collect { |f|
+ #val = query_fingerprints[compound][f.title]
+ #bad_request_error "Can not parse value '#{val}' to numeric" if val and !val.numeric?
+ #val ? val.to_f : 0.0
+ #} # query structure
+
+ # TODO reintroduce for regression
+ #mtf = OpenTox::Algorithm::Transform::ModelTransformer.new(self)
+ #mtf.transform
+ #
- @query_fingerprint = @feature_dataset.features.collect { |f|
- val = query_fingerprints[compound][f.title]
- bad_request_error "Can not parse value '#{val}' to numeric" if val and !val.numeric?
- val ? val.to_f : 0.0
- } # query structure
+ feature_dataset.data_entries.each_with_index do |fingerprint, i|
+
+ sim = OpenTox::Algorithm::Similarity.send(similarity_algorithm,fingerprint, query_fingerprint[c])
+ # TODO fix for multi feature datasets
+ neighbors << [feature_dataset.compounds[i],training_dataset.data_entries[i].first,sim] if sim > self.min_sim
+ end
+ similarity_sum = 0.0
+ confidence_sum = 0.0
+ prediction = nil
+ activities = training_dataset.data_entries.flatten.uniq.sort
+ neighbors.each do |n|
+ similarity_sum += n.last
+ if activities.index(n[1]) == 0
+ confidence_sum += n.last
+ elsif activities.index(n[1]) == 1
+ confidence_sum -= n.last
+ end
+ end
+
+ if confidence_sum > 0.0
+ prediction = activities[0]
+ else
+ prediction = activities[1]
+ end
+
+ p prediction, confidence_sum/similarity_sum
+
- mtf = OpenTox::Algorithm::Transform::ModelTransformer.new(self)
- mtf.transform
- prediction = OpenTox::Algorithm::Neighbors.send(@prediction_algorithm,
+=begin
+ prediction = OpenTox::Algorithm::Neighbors.send(prediction_algorithm,
{ :props => mtf.props,
:activities => mtf.activities,
:sims => mtf.sims,
@@ -220,8 +264,10 @@ module OpenTox
confidence_value = ((confidence_value+1.0)/2.0).abs if @similarity_algorithm =~ /cosine/
predicted_value = @prediction_feature.value_map[prediction[:prediction].to_i] if @prediction_feature.feature_type == "classification"
$logger.debug "predicted value: #{predicted_value}, confidence: #{confidence_value}"
+=end
end
+=begin
@prediction_dataset << [ compound, predicted_value, confidence_value, nil, nil ]
if @compound_uri # add neighbors only for compound predictions
@@ -231,9 +277,9 @@ module OpenTox
@prediction_dataset << [ n, nil, nil, a, neighbor[:similarity] ]
end
end
+=end
end # iteration over compounds
- @prediction_dataset.put
@prediction_dataset
end
diff --git a/lib/opentox-algorithm.rb b/lib/opentox-algorithm.rb
index 4aaad9c..03236ee 100644
--- a/lib/opentox-algorithm.rb
+++ b/lib/opentox-algorithm.rb
@@ -1,5 +1,12 @@
require 'statsample'
+ENV['FMINER_SMARTS'] = 'true'
+ENV['FMINER_NO_AROMATIC'] = 'true'
+ENV['FMINER_PVALUES'] = 'true'
+ENV['FMINER_SILENT'] = 'true'
+ENV['FMINER_NR_HITS'] = 'true'
+
+
# Require sub-Repositories
require_relative '../libfminer/libbbrc/bbrc' # include before openbabel
require_relative '../libfminer/liblast/last' #
@@ -8,3 +15,7 @@ require_relative '../last-utils/lu.rb'
#Dir[File.join(File.dirname(__FILE__),"*.rb")].each{ |f| require_relative f}
require_relative "descriptor.rb"
require_relative "fminer.rb"
+require_relative "lazar.rb"
+require_relative "transform.rb"
+require_relative "similarity.rb"
+require_relative "neighbors.rb"
diff --git a/lib/transform.rb b/lib/transform.rb
index 8b124f9..cbfa915 100644
--- a/lib/transform.rb
+++ b/lib/transform.rb
@@ -236,7 +236,6 @@ module OpenTox
# @params[OpenTox::Model] model Model to transform
def initialize model
@model = model
- @similarity_algorithm = @model.similarity_algorithm
end
# Transforms the model
@@ -282,6 +281,7 @@ module OpenTox
# neighbor calculation
@ids = [] # surviving compounds become neighbors
@sims = [] # calculated by neighbor routine
+
neighbors
n_prop_tmp = []; @ids.each { |idx| n_prop_tmp << @n_prop[idx] }; @n_prop = n_prop_tmp # select neighbors from matrix
acts_tmp = []; @ids.each { |idx| acts_tmp << @activities[idx] }; @activities = acts_tmp
@@ -294,7 +294,7 @@ module OpenTox
gram_matrix[i] = [] unless gram_matrix[i]
@n_prop.each_index do |j|
if (j>i)
- sim = eval("OpenTox::Algorithm::Similarity::#{@similarity_algorithm}(@n_prop[i], @n_prop[j])")
+ sim = OpenTox::Algorithm::Similarity.send(@similarity_algorithm.to_sym, @n_prop[i], @n_prop[j])
gram_matrix[i][j] = sim
gram_matrix[j] = [] unless gram_matrix[j]
gram_matrix[j][i] = gram_matrix[i][j]
@@ -393,7 +393,7 @@ module OpenTox
# @param[Array] A propositionalized data entry
# @return[Float] Similarity to query structure
def similarity(training_props)
- eval("OpenTox::Algorithm::Similarity").send(@model.similarity_algorithm,training_props, @q_prop)
+ OpenTox::Algorithm::Similarity.send(@model.similarity_algorithm,training_props, @q_prop)
end