summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Christoph Helma <helma@in-silico.ch> 2010-11-04 11:15:59 +0100
committer Christoph Helma <helma@in-silico.ch> 2010-11-04 11:15:59 +0100
commit e72bba4cdaa6fd68d62b567e21be730a49963207 (patch)
tree 22c088db8e266b420bd7055edad778bd2b8ce375
parent 19dd7247be22e637419d79406041a4548b169c2c (diff)
fminer with annotations, commit before merging andreas new version
-rw-r--r-- fminer.rb 194
-rw-r--r-- lazar.rb 143
m--------- libfminer 0
-rw-r--r-- smarts.rb 2
4 files changed, 147 insertions, 192 deletions
diff --git a/fminer.rb b/fminer.rb
index 30d0d9a..b5956c5 100644
--- a/fminer.rb
+++ b/fminer.rb
@@ -2,159 +2,157 @@ ENV['FMINER_SMARTS'] = 'true'
ENV['FMINER_NO_AROMATIC'] = 'true'
ENV['FMINER_PVALUES'] = 'true'
@@fminer = Bbrc::Bbrc.new
+@@fminer.SetMinfreq(5)
get '/fminer/?' do
- owl = OpenTox::OwlSerializer.create 'Algorithm', url_for('/fminer',:full)
- owl.annotate 'title',"fminer"
- owl.annotate 'creator',"http://github.com/amaunz/fminer2"
-# owl.set_data( {
-# "parameters" => [
-# { "title" => "Dataset URI", "paramScope" => "mandatory", "paramValue" => "dataset_uri" },
-# { "title" => "Feature URI for dependent variable", "paramScope" => "mandatory", "paramValue" => "feature_uri" }
-# ]
-# } )
-
-# owl.parameters = {
-# "Dataset URI" => { :scope => "mandatory", :value => "dataset_uri" },
-# "Feature URI for dependent variable" => { :scope => "mandatory", :value => "feature_uri" }
-# }
- rdf = owl.rdf
- #File.open('public/fminer.owl', 'w') {|f| f.print rdf}
+
+ metadata = {
+ DC.title => 'fminer',
+ DC.identifier => url_for("",:full),
+ DC.creator => "andreas@maunz.de, helma@in-silico.ch",
+ DC.contributor => "vorgrimmlerdavid@gmx.de",
+ OT.isA => OTA.PatternMiningSupervised
+ }
+
+ parameters = [
+ { DC.description => "Dataset URI", OT.paramScope => "mandatory", OT.title => "dataset_uri" },
+ { DC.description => "Feature URI for dependent variable", OT.paramScope => "mandatory", OT.title => "prediction_feature" }
+ ]
+
+ s = OpenTox::Serializer::Owl.new
+ s.add_algorithm(url_for('/fminer',:full),metadata,parameters)
response['Content-Type'] = 'application/rdf+xml'
- rdf
+ s.to_rdfxml
+
end
post '/fminer/?' do
halt 404, "Please submit a dataset_uri." unless params[:dataset_uri] and !params[:dataset_uri].nil?
- halt 404, "Please submit a feature_uri." unless params[:feature_uri] and !params[:feature_uri].nil?
- LOGGER.debug "Dataset: " + params[:dataset_uri]
- LOGGER.debug "Endpoint: " + params[:feature_uri]
- feature_uri = params[:feature_uri]
- begin
- LOGGER.debug "Retrieving #{params[:dataset_uri]}"
- training_dataset = OpenTox::Dataset.find "#{params[:dataset_uri]}"
- rescue
- LOGGER.error "Dataset #{params[:dataset_uri]} not found"
- halt 404, "Dataset #{params[:dataset_uri]} not found." if training_dataset.nil?
- end
- halt 404, "No feature #{params[:feature_uri]} in dataset #{params[:dataset_uri]}" unless training_dataset.features and training_dataset.features.include?(params[:feature_uri])
+ halt 404, "Please submit a prediction_feature." unless params[:prediction_feature] and !params[:prediction_feature].nil?
+ prediction_feature = params[:prediction_feature]
- task_uri = OpenTox::Task.as_task("Mine features", url_for('/fminer',:full)) do
+ training_dataset = OpenTox::Dataset.new "#{params[:dataset_uri]}"
+ training_dataset.load_all
+ halt 404, "No feature #{params[:prediction_feature]} in dataset #{params[:dataset_uri]}" unless training_dataset.features and training_dataset.features.include?(params[:prediction_feature])
+
+ task_uri = OpenTox::Task.as_task("Mining BBRC features", url_for('/fminer',:full)) do
feature_dataset = OpenTox::Dataset.new
- title = "BBRC representatives for " + training_dataset.title
- feature_dataset.title = title
- feature_dataset.creator = url_for('/fminer',:full)
- bbrc_uri = url_for("/fminer#BBRC_representative",:full)
- feature_dataset.features << bbrc_uri
+ feature_dataset.add_metadata({
+ DC.title => "BBRC representatives for " + training_dataset.metadata[DC.title],
+ DC.creator => url_for('/fminer',:full),
+ OT.hasSource => url_for('/fminer', :full),
+ })
+ feature_dataset.add_parameters({
+ "dataset_uri" => params[:dataset_uri],
+ "prediction_feature" => params[:prediction_feature]
+ })
+ feature_dataset.save
id = 1 # fminer start id is not 0
compounds = []
-
+ nr_active=0
+ nr_inactive=0
g_hash = Hash.new# DV: for effect calculation in regression part
+
@@fminer.Reset
- #@@fminer.SetChisqSig(0.99)
- LOGGER.debug "Fminer: initialising ..."
- training_dataset.data.each do |c,features|
+ training_dataset.data_entries.each do |compound,entry|
begin
- smiles = OpenTox::Compound.new(:uri => c.to_s).smiles
+ smiles = OpenTox::Compound.new(compound.to_s).smiles
rescue
- LOGGER.warn "No resource for #{c.to_s}"
+ LOGGER.warn "No resource for #{compound.to_s}"
next
end
if smiles == '' or smiles.nil?
- LOGGER.warn "Cannot find smiles for #{c.to_s}."
- else
- feature_dataset.compounds << c.to_s
- features.each do |feature|
- act = feature[feature_uri]
- if act.nil?
- LOGGER.warn "No #{feature_uri} activiity for #{c.to_s}."
+ LOGGER.warn "Cannot find smiles for #{compound.to_s}."
+ next
+ end
+ entry.each do |feature,values|
+ values.each do |value|
+ if value.nil?
+ LOGGER.warn "No #{feature} activiity for #{compound.to_s}."
else
- case act.to_s
+ case value.to_s
when "true"
- #LOGGER.debug id.to_s + ' "' + smiles +'"' + "\t" + true.to_s
+ nr_active += 1
activity = 1
when "false"
- #LOGGER.debug id.to_s + ' "' + smiles +'"' + "\t" + false.to_s
+ nr_inactive += 1
activity = 0
else
- # AM: add quantitative activity
- activity = act.to_f
+ activity = value.to_f
@@fminer.SetRegression(true)
end
- compounds[id] = c.to_s
begin
@@fminer.AddCompound(smiles,id)
@@fminer.AddActivity(activity, id)
g_hash[id]=activity # DV: insert global information
+ compounds[id] = compound
+ id += 1
rescue
- LOGGER.warn "Could not add " + smiles + "\t" + act.to_s + " to fminer"
+ LOGGER.warn "Could not add " + smiles + "\t" + value.to_s + " to fminer"
end
- end
- end
- id += 1
- end
- end
- g_array=g_hash.values # DV: calculation of global median for effect calculation
- g_median=OpenTox::Utils.median(g_array)
- minfreq = (0.02*id).round
- @@fminer.SetMinfreq(minfreq)
- LOGGER.debug "Fminer: initialised with #{id} compounds, minimum frequency #{minfreq}"
+ end
+ end
+ end
+ end
- raise "no compounds" if compounds.size==0
+ g_array=g_hash.values # DV: calculation of global median for effect calculation
+ g_median=OpenTox::Algorithm.median(g_array)
+
+ # TODO read from params
+ raise "No compounds in dataset #{training_dataset.uri}" if compounds.size==0
- values = {}
+ features = Set.new
# run @@fminer
- LOGGER.debug "Fminer: mining ..."
(0 .. @@fminer.GetNoRootNodes()-1).each do |j|
+
results = @@fminer.MineRoot(j)
results.each do |result|
f = YAML.load(result)[0]
smarts = f[0]
p_value = f[1]
- # AM: f[3] missing on regression
+
if (!@@fminer.GetRegression)
ids = f[2] + f[3]
- if f[2].size > f[3].size
+ if f[2].size.to_f/ids.size > nr_active.to_f/(nr_active+nr_inactive)
effect = 'activating'
else
effect = 'deactivating'
end
else #regression part
ids = f[2]
- # DV: effect calculation
- f_arr=Array.new
- f[2].each do |id|
- f_arr.push(g_hash[id])
- end
- f_median=OpenTox::Utils.median(f_arr)
- if g_median >= f_median
- effect = 'activating'
- else
- effect = 'deactivating'
- end
- end
-
- tuple = {
- url_for('/fminer#smarts',:full) => smarts,
- url_for('/fminer#p_value',:full) => p_value.to_f,
- url_for('/fminer#effect',:full) => effect
- }
- #LOGGER.debug "#{f[0]}\t#{f[1]}\t#{effect}"
- ids.each do |id|
- feature_dataset.data[compounds[id]] = [] unless feature_dataset.data[compounds[id]]
- feature_dataset.data[compounds[id]] << {bbrc_uri => tuple}
- end
+ # DV: effect calculation
+ f_arr=Array.new
+ f[2].each do |id|
+ f_arr.push(g_hash[id])
+ end
+ f_median=OpenTox::Algorithm.median(f_arr)
+ if g_median >= f_median
+ effect = 'activating'
+ else
+ effect = 'deactivating'
+ end
+ end
+
+ feature_uri = File.join feature_dataset.uri,"feature","bbrc", features.size.to_s
+ unless features.include? smarts
+ features << smarts
+ # TODO insert correct ontology entries
+ metadata = {
+ OT.hasSource => feature_dataset.uri,
+ OT.smarts => smarts,
+ OT.p_value => p_value.to_f,
+ OT.effect => effect }
+ feature_dataset.add_feature feature_uri, metadata
+ end
+ ids.each { |id| feature_dataset.add(compounds[id], feature_uri, true)}
end
end
-
- uri = feature_dataset.save
- LOGGER.debug "Fminer finished, dataset #{uri} created."
- uri
+ feature_dataset.save
+ feature_dataset.uri
end
- LOGGER.debug "Fimer task started: "+task_uri.to_s
response['Content-Type'] = 'text/uri-list'
halt 202,task_uri.to_s+"\n"
end
diff --git a/lazar.rb b/lazar.rb
index b9163f4..98e0aa7 100644
--- a/lazar.rb
+++ b/lazar.rb
@@ -1,55 +1,35 @@
get '/lazar/?' do
- uri = url_for('/lazar',:full)
- owl = OpenTox::OwlSerializer.create 'Algorithm', uri
- owl.annotation_property uri, DC.creator, "helma@in-silico.ch", XSD.string
- owl.annotation_property uri, DC.contributor, "andreas@maunz.de", XSD.string
- owl.annotation_property uri, DC.title, "lazar", XSD.string
- owl.annotation_property uri, DC.source, "http://github.com/helma/opentox-algorithm", XSD.anyUri
- owl.object_property uri, OT.parameters, File.join(uri,"dataset_uri"), XSD.anyUri
- owl.object_property uri, OT.parameters, File.join(uri,"prediction_feature"), XSD.anyUri
- owl.object_property uri, OT.parameters, File.join(uri,"feature_generation_uri"), XSD.anyUri
- response['Content-Type'] = 'application/rdf+xml'
- owl.rdf
-end
-
-get '/lazar/prediction_feature?' do
- uri = url_for('/lazar/prediction_feature',:full)
- owl = OpenTox::OwlSerializer.create 'Parameter', uri
- owl.annotation_property uri, DC.description, "URI of the feature to be predicted", XSD.string
- owl.annotation_property uri, OT.paramScope, "mandatory", XSD.string
- response['Content-Type'] = 'application/rdf+xml'
- owl.rdf
-end
-get '/lazar/feature_generation_uri?' do
- uri = url_for('/lazar/feature_generation_uri',:full)
- owl = OpenTox::OwlSerializer.create 'Parameter', uri
- owl.annotation_property uri, DC.description, "URI of the feature_generation_algorithm", XSD.string
- owl.annotation_property uri, OT.paramScope, "mandatory", XSD.string
+ metadata = {
+ DC.title => 'lazar',
+ DC.identifier => url_for("",:full),
+ DC.creator => "helma@in-silico.ch, andreas@maunz.de",
+ DC.contributor => "vorgrimmlerdavid@gmx.de",
+ OT.isA => OTA.ClassificationLazySingleTarget
+ }
+
+ parameters = [
+ { DC.description => "Dataset URI", OT.paramScope => "mandatory", OT.title => "dataset_uri" },
+ { DC.description => "Feature URI for dependent variable", OT.paramScope => "mandatory", OT.title => "prediction_feature" },
+ { DC.description => "URI of feature genration service", OT.paramScope => "mandatory", OT.title => "feature_generation_uri" }
+ ]
+
+ s = OpenTox::Serializer::Owl.new
+ s.add_algorithm(url_for('/lazar',:full),metadata,parameters)
response['Content-Type'] = 'application/rdf+xml'
- owl.rdf
-end
+ s.to_rdfxml
-get '/lazar/dataset_uri?' do
- uri = url_for('/lazar/dataset_uri',:full)
- owl = OpenTox::OwlSerializer.create 'Parameter', uri
- owl.annotation_property uri, DC.description, "URI of the training dataset", XSD.string
- owl.annotation_property uri, OT.paramScope, "mandatory", XSD.string
- response['Content-Type'] = 'application/rdf+xml'
- owl.rdf
end
post '/lazar/?' do # create a model
- LOGGER.debug "Dataset: '" + params[:dataset_uri].to_s + "'"
- LOGGER.debug "Endpoint: '" + params[:prediction_feature].to_s + "'"
- LOGGER.debug "Feature generation: '" + params[:feature_generation_uri].to_s + "'"
dataset_uri = "#{params[:dataset_uri]}"
begin
- training_activities = OpenTox::Dataset.find(dataset_uri)
- rescue
- halt 404, "Dataset #{dataset_uri} not found"
+ training_activities = OpenTox::Dataset.new(dataset_uri)
+ training_activities.load_all
+ rescue => e
+ halt 404, "Dataset #{dataset_uri} not found (#{e.inspect})."
end
halt 404, "No prediction_feature parameter." unless params[:prediction_feature]
@@ -61,69 +41,46 @@ post '/lazar/?' do # create a model
task_uri = OpenTox::Task.as_task("Create lazar model",url_for('/lazar',:full)) do |task|
# create features
- LOGGER.debug "Starting fminer"
- params[:feature_uri] = params[:prediction_feature]
- fminer_task_uri = OpenTox::Algorithm::Fminer.create_feature_dataset(params)
- fminer_task = OpenTox::Task.find(fminer_task_uri)
- fminer_task.wait_for_completion
- raise "fminer failed" unless fminer_task.completed?
-
- LOGGER.debug "Fminer finished #{Time.now}"
- feature_dataset_uri = fminer_task.resultURI.to_s
- training_features = OpenTox::Dataset.find(feature_dataset_uri)
+ feature_dataset_uri = OpenTox::Algorithm::Fminer.new.run(params).to_s
+
+ training_features = OpenTox::Dataset.new(feature_dataset_uri)
+ training_features.load_all
halt 404, "Dataset #{feature_dataset_uri} not found." if training_features.nil?
+
lazar = OpenTox::Model::Lazar.new
- lazar.trainingDataset = dataset_uri
- lazar.feature_dataset_uri = feature_dataset_uri
- halt 404, "More than one descriptor type" unless training_features.features.size == 1
- bbrc = training_features.features.first
- training_features.data.each do |compound,features|
- lazar.fingerprints[compound] = [] unless lazar.fingerprints[compound]
- features.each do |feature|
- tuple = feature[bbrc]
- if tuple
- smarts =nil; p_value = nil; effect = nil
- tuple.each do |k,v|
- case k
- when /fminer#smarts/
- smarts = v
- lazar.features << smarts
- lazar.fingerprints[compound] << smarts
- when /fminer#p_value/
- p_value = v
- when /fminer#effect/
- effect = v
- end
- end
- lazar.p_values[smarts] = p_value
- lazar.effects[smarts] = effect
- end
- end
- end
- activities = {}
- classification = true
- training_activities.data.each do |compound,features|
+ # TODO: dataset method for iterating over data entries
+ training_features.data_entries.each do |compound,entry|
+ lazar.fingerprints[compound] = [] unless lazar.fingerprints[compound]
+ entry.keys.each do |feature|
+ # TODO fix URI
+ fminer_uri = File.join CONFIG[:services]["opentox-algorithm"], "fminer"
+ smarts = training_features.features[feature]["#{fminer_uri}#smarts"]
+ lazar.fingerprints[compound] << smarts
+ unless lazar.features.include? smarts
+ lazar.features << smarts
+ lazar.p_values[smarts] = training_features.features[feature]["#{fminer_uri}#p_value"]
+ lazar.effects[smarts] = training_features.features[feature]["#{fminer_uri}#effect"]
+ end
+ end
+
lazar.activities[compound] = [] unless lazar.activities[compound]
- features.each do |feature|
- case feature[params[:prediction_feature]].to_s
+ training_activities.data_entries[compound][params[:prediction_feature]].each do |value|
+ case value.to_s
when "true"
lazar.activities[compound] << true
when "false"
lazar.activities[compound] << false
- # AM: handle quantitative activity values of features
else
- lazar.activities[compound] << feature[params[:prediction_feature]].to_f
- classification = false
+ lazar.activities[compound] << value.to_f
+ lazar.type = "regression"
end
end
- end
- # TODO: insert regression
- if classification
- lazar.dependentVariables = params[:prediction_feature]+"_lazar_classification"
- else
- lazar.dependentVariables = params[:prediction_feature]+"_lazar_regression"
- end
+ end
+
+ lazar.metadata[OT.dependentVariables] = params[:prediction_feature]
+ lazar.metadata[OT.trainingDataset] = dataset_uri
+ lazar.metadata[OT.featureDataset] = feature_dataset_uri
model_uri = lazar.save
LOGGER.info model_uri + " created #{Time.now}"
diff --git a/libfminer b/libfminer
-Subproject 5a97d006e0ccfc48e53d5f24842a898ec9e912e
+Subproject e955cc6b24d577d7187e5660716ee69d12174a8
diff --git a/smarts.rb b/smarts.rb
index 2ea54d2..4ae6949 100644
--- a/smarts.rb
+++ b/smarts.rb
@@ -1,3 +1,3 @@
get '/match/compound/*/smarts/*/?' do
- "#{OpenTox::Compound.new(:inchi => params[:splat][0]).match?(params[:splat][1])}"
+ "#{OpenTox::Compound.from_inchi(params[:splat][0]).match?(params[:splat][1])}"
end