summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Andreas Maunz <andreas@maunz.de> 2012-10-05 15:49:20 +0200
committer: Andreas Maunz <andreas@maunz.de> 2012-10-05 15:49:20 +0200
commit: dc7ab074a06837544ae9fdc94d61640095d26271 (patch)
tree: 646976fd4e6d3ed0f35910217901187071991e8e
parent: c3563cb82bfa7fd8047d901c33bd959a89bb7d59 (diff)
Fminer and Lazar tests running
m--------- bbrc-sample 0
-rw-r--r-- fminer.rb 115
-rw-r--r-- lazar.rb 49
m--------- libfminer 0
4 files changed, 53 insertions, 111 deletions
diff --git a/bbrc-sample b/bbrc-sample
-Subproject 1552da28ebbcbfb2128e5561cd8aff2ef534219
+Subproject f2eacee266e954f538e51107bb7bebd80b307ec
diff --git a/fminer.rb b/fminer.rb
index 2f4cb67..190c8c1 100644
--- a/fminer.rb
+++ b/fminer.rb
@@ -302,56 +302,30 @@ post '/fminer/bbrc/?' do
]
}
feature_dataset.add_feature feature_uri, metadata
- #feature_dataset.add_feature_parameters feature_uri, feature_dataset.parameters
end
- id_arrs.each { |id_count_hash|
- id=id_count_hash.keys[0].to_i
- count=id_count_hash.values[0].to_i
- fminer_results[@@fminer.compounds[id]] || fminer_results[@@fminer.compounds[id]] = {}
- fminer_results[@@fminer.compounds[id]][feature_uri] || fminer_results[@@fminer.compounds[id]][feature_uri] = []
- if params[:nr_hits] == "true"
- fminer_results[@@fminer.compounds[id]][feature_uri] << count
- else
- fminer_results[@@fminer.compounds[id]][feature_uri] << 1
+
+ # re-format to one big hash id_arr
+ id_arr = {}; id_arrs.each { |id_count_hash| id_arr[id_count_hash.keys[0]] = id_count_hash.values[0] }
+ @@fminer.compounds.collect.each_with_index { |cmpd,id| # This collects all cmpds that have an activity
+ val = id_arr[id] ? ( params[:nr_hits] == "true" ? id_arr[id].to_i : 1 ) : 0
+ if (val != 0 or params[:complete_entries] == "true")
+ fminer_results[cmpd] || fminer_results[cmpd] = {}
+ fminer_results[cmpd][feature_uri] || fminer_results[cmpd][feature_uri] = []
+ fminer_results[cmpd][feature_uri] << val
end
}
-
- end # end of
+
+ end # end of
end # feature parsing
- # Collect compounds, in order with duplicates (owd)
- fminer_compounds_owd = @@fminer.compounds.collect
- fminer_compounds_owd.shift if fminer_compounds_owd[0].nil?
-
- # Complete fminer_results, s.t. it contains entries for any training compound
- if (params[:complete_entries] == "true")
- myhelpfeature = File.join(feature_dataset.uri,"feature","bbrc", (features.size-1).to_s)
- (fminer_compounds_owd - fminer_results.keys).each { |cmpd| # add possibly multiple entries
- fminer_results[cmpd] || fminer_results[cmpd] = {}
- fminer_results[cmpd][myhelpfeature] || fminer_results[cmpd][myhelpfeature] = []
- fminer_results[cmpd][myhelpfeature] << 0
- }
- end
-
- # Add fminer results to feature dataset along owd
- all_compounds_owd = @@fminer.training_dataset.compounds.collect { |v|
- times_in_fminer_compounds_owd = fminer_compounds_owd.count(v)
- times_in_fminer_compounds_owd == 0 ? v : Array.new(times_in_fminer_compounds_owd,v)
- }.flatten
- if (params[:complete_entries] == "true")
- all_compounds_owd.each { |compound|
- feature_dataset.add_compound(compound) # add compounds *in order*
- }
- end
-
which_row = @@fminer.training_dataset.compounds.inject({}) { |h,id| h[id]=0; h }
- unused_compounds = fminer_compounds_owd - fminer_results.keys
- (fminer_compounds_owd - unused_compounds).each { |compound|
- feature_dataset.add_compound(compound) unless (params[:complete_entries] == "true")
- fminer_results[compound].each { |feature, values|
- feature_dataset.add( compound, feature, values[which_row[compound]] )
+ @@fminer.training_dataset.compounds.each { |cmpd|
+ feature_dataset.add_compound(cmpd) # *unconditionally* add compounds *in order*
+ i = which_row[cmpd]
+ fminer_results[cmpd] && fminer_results[cmpd].each { |feature, values|
+ feature_dataset.add_data_entry( cmpd, feature, values[i] )
}
- which_row[compound] += 1
+ which_row[cmpd] += 1
}
# AM: add feature values for non-present features
# feature_dataset.complete_data_entries
@@ -515,8 +489,7 @@ post '/fminer/bbrc/sample/?' do
params[:nr_hits] == "true" ? hit_count=true: hit_count=false
matches, counts, used_compounds = lu.match_rb(@@fminer.smi,smarts,hit_count,true) # last arg: always create complete entries for sampling
- # Collect compounds, in order with duplicates (owd) and put in dataset
- used_compounds.each { |idx| feature_dataset.add_compound(@@fminer.compounds[idx]) }
+ @@fminer.training_dataset.compounds.each { |cmpd| feature_dataset.add_compound(cmpd) }
feature_dataset.add_metadata({
OT.parameters => [
@@ -539,12 +512,9 @@ post '/fminer/bbrc/sample/?' do
metadata = calc_metadata (smarts, ids, counts[smarts], nil, nil , value_map, params, smarts_p_values[smarts])
feature_uri = File.join feature_dataset.uri,"feature","last", feature_dataset.features.size.to_s
feature_dataset.add_feature feature_uri, metadata
- ids.each_with_index { |id,idx| feature_dataset.add(@@fminer.compounds[id], feature_uri, counts[smarts][idx])}
+ ids.each_with_index { |id,idx| feature_dataset.add_data_entry(@@fminer.compounds[id], feature_uri, counts[smarts][idx])}
end
- # AM: add feature values for non-present features
- # feature_dataset.complete_data_entries
-
feature_dataset.save(@subjectid)
feature_dataset.uri
end
@@ -624,30 +594,19 @@ post '/fminer/last/?' do
smarts=lu.smarts_rb(dom,'nls') # converts patterns to LAST-SMARTS using msa variant (see last-pm.maunz.de)
params[:nr_hits] == "true" ? hit_count=true : hit_count=false
params[:complete_entries] == "true" ? complete_entries=true : complete_entries=false
- matches, counts, used_compounds = lu.match_rb(@@fminer.smi,smarts,hit_count,complete_entries) # creates instantiations
-
- # Collect compounds, in order with duplicates (owd) and put in dataset
- fminer_compounds_owd = used_compounds.collect { |idx| @@fminer.compounds[idx] }
- if (complete_entries)
- all_compounds_owd = @@fminer.training_dataset.compounds.collect { |v|
- times_in_fminer_compounds_owd = fminer_compounds_owd.count(v)
- times_in_fminer_compounds_owd == 0 ? v : Array.new(times_in_fminer_compounds_owd,v)
- }.flatten
- all_compounds_owd.each { |compound| feature_dataset.add_compound(compound) }
- else
- fminer_compounds_owd.each { |compound| feature_dataset.add_compound(compound) }
- end
+ matches, counts = lu.match_rb(@@fminer.smi,smarts,hit_count,complete_entries) # creates instantiations
+ @@fminer.training_dataset.compounds.each { |cmpd| feature_dataset.add_compound(cmpd) }
matches.each do |smarts, ids|
metadata = calc_metadata (smarts, ids, counts[smarts], @@last, nil, value_map, params)
feature_uri = File.join feature_dataset.uri,"feature","last", feature_dataset.features.size.to_s
feature_dataset.add_feature feature_uri, metadata
- ids.each_with_index { |id,idx| feature_dataset.add(@@fminer.compounds[id], feature_uri, counts[smarts][idx])}
+ @@fminer.compounds.collect.each_with_index { |cmpd,id| # This collects all cmpds that have an activity
+ count_idx = matches[smarts].index(id)
+ feature_dataset.add_data_entry(cmpd, feature_uri, counts[smarts][count_idx]) if count_idx
+ }
end
- # AM: add feature values for non-present features
- # feature_dataset.complete_data_entries
-
feature_dataset.save(@subjectid)
feature_dataset.uri
end
@@ -703,25 +662,17 @@ post '/fminer/:method/match?' do
smarts = f_dataset.features.collect { |f,m| m[OT.smarts] }
params[:nr_hits] == "true" ? hit_count=true : hit_count=false
params[:complete_entries] == "true" ? complete_entries=true : complete_entries=false
- matches, counts, used_compounds = LU.new.match_rb(@@fminer.smi, smarts, hit_count, complete_entries) if smarts.size>0
-
- # Collect compounds, in order with duplicates (owd) and put in dataset
- fminer_compounds_owd = used_compounds.collect { |idx| @@fminer.compounds[idx] }
- if (complete_entries)
- all_compounds_owd = @@fminer.training_dataset.compounds.collect { |v|
- times_in_fminer_compounds_owd = fminer_compounds_owd.count(v)
- times_in_fminer_compounds_owd == 0 ? v : Array.new(times_in_fminer_compounds_owd,v)
- }.flatten
- all_compounds_owd.each { |compound| feature_dataset.add_compound(compound) }
- else
- fminer_compounds_owd.each { |compound| feature_dataset.add_compound(compound) }
- end
+ matches, counts = LU.new.match_rb(@@fminer.smi, smarts, hit_count, complete_entries) if smarts.size>0
+ @@fminer.training_dataset.compounds.each { |cmpd| feature_dataset.add_compound(cmpd) }
matches.each do |smarts, ids|
- metadata = calc_metadata (smarts, ids, counts[smarts], @@last, feature_dataset.uri, value_map, params)
- feature_uri = File.join feature_dataset.uri,"feature","bbrc","match", feature_dataset.features.size.to_s
+ metadata = calc_metadata (smarts, ids, counts[smarts], @@last, nil, value_map, params)
+ feature_uri = File.join feature_dataset.uri,"feature",params[:method], feature_dataset.features.size.to_s
feature_dataset.add_feature feature_uri, metadata
- ids.each_with_index { |id,idx| feature_dataset.add(@@fminer.compounds[id], feature_uri, counts[smarts][idx]) }
+ @@fminer.compounds.collect.each_with_index { |cmpd,id| # This collects all cmpds that have an activity
+ count_idx = matches[smarts].index(id)
+ feature_dataset.add_data_entry(cmpd, feature_uri, counts[smarts][count_idx]) if count_idx
+ }
end
feature_dataset.save @subjectid
diff --git a/lazar.rb b/lazar.rb
index 0e0eb40..11643b3 100644
--- a/lazar.rb
+++ b/lazar.rb
@@ -42,6 +42,8 @@ post '/lazar/?' do
raise OpenTox::NotFoundError.new "No dataset_uri parameter." unless params[:dataset_uri]
dataset_uri = params[:dataset_uri]
+ LOGGER.debug "Creating lazar model with params #{params.inspect}"
+
task = OpenTox::Task.create("Create lazar model",url_for('/lazar',:full)) do |task|
@@ -181,15 +183,18 @@ post '/lazar/?' do
# Creating InChi/URI Hash from trainig_feature for comparison with training_dataset to avoid missmatches caused by different URI authorities
feature_compounds = {}
+ which_row={}
training_features.compounds.each {|f_c_uri|
f_compound = OpenTox::Compound.new(f_c_uri)
feature_compounds[f_compound.to_inchi] = f_c_uri
+ which_row[f_compound.to_inchi] = 0
}
training_dataset.compounds.each do |t_c_uri|
t_compound = OpenTox::Compound.new(t_c_uri)
entry = training_features.data_entries[feature_compounds[t_compound.to_inchi]]
+ row_idx = which_row[t_compound.to_inchi]
if entry.nil? # Training compound not found in feature dataset
del_master_compounds << t_c_uri # Delete if training compound not found in feature dataset
@@ -201,12 +206,8 @@ post '/lazar/?' do
if (lazar.feature_calculation_algorithm == "Substructure.match") || (lazar.feature_calculation_algorithm == "Substructure.match_hits")
if training_features.features[feature]
smarts = training_features.features[feature][OT.smarts]
- #lazar.fingerprints[compound] << smarts
- if lazar.feature_calculation_algorithm == "Substructure.match_hits"
- lazar.fingerprints[t_c_uri][smarts] = entry[feature].flatten.first * training_features.features[feature][OT.pValue]
- else
- lazar.fingerprints[t_c_uri][smarts] = 1 * training_features.features[feature][OT.pValue]
- end
+ lazar.fingerprints[t_c_uri][smarts] = [] unless lazar.fingerprints[t_c_uri][smarts]
+ lazar.fingerprints[t_c_uri][smarts] << entry[feature][row_idx] * training_features.features[feature][OT.pValue]
unless lazar.features.include? smarts
lazar.features << smarts
lazar.p_values[smarts] = training_features.features[feature][OT.pValue]
@@ -216,17 +217,18 @@ post '/lazar/?' do
# CASE 2: Others
elsif entry[feature].flatten.size == 1
- lazar.fingerprints[t_c_uri][feature] = entry[feature].flatten.first
+ lazar.fingerprints[t_c_uri][feature] = [] unless lazar.fingerprints[t_c_uri][feature]
+ lazar.fingerprints[t_c_uri][feature] << entry[feature][row_idx]
lazar.features << feature unless lazar.features.include? feature
- else
- LOGGER.warn "More than one entry (#{entry[feature].inspect}) for compound #{t_c_uri}, feature #{feature}"
end
+
end
end
+ which_row[t_compound.to_inchi] += 1
end
task.progress 80
-
+
# Show compounds without feature information
if del_master_compounds.size>0
del_master_compounds.each{|compound| LOGGER.info "Compound: '#{compound.to_s}' not found in feature dataset and will be removed from compound list."}
@@ -235,26 +237,15 @@ post '/lazar/?' do
lazar.compounds=training_dataset.compounds.collect - del_master_compounds # Add only compounds with fingerprints
# # # Activities
- if prediction_feature.feature_type == "regression"
- lazar.compounds.each do |compound|
- entry = training_dataset.data_entries[compound]
- lazar.activities[compound] = [] unless lazar.activities[compound]
- unless entry[prediction_feature.uri].empty?
- entry[prediction_feature.uri].each do |value|
- lazar.activities[compound] << value
- end
- end
- end
- elsif prediction_feature.feature_type == "classification"
- lazar.compounds.each do |compound|
- entry = training_dataset.data_entries[compound]
- lazar.activities[compound] = [] unless lazar.activities[compound]
- unless entry[prediction_feature.uri].empty?
- entry[prediction_feature.uri].each do |value|
- lazar.activities[compound] << lazar.value_map.invert[value] # insert mapped values, not originals
- end
- end
+ which_row=lazar.compounds.inject({}) {|h,c| h[c]=0; h}
+ lazar.compounds.each do |compound|
+ entry = training_dataset.data_entries[compound]
+ lazar.activities[compound] = [] unless lazar.activities[compound]
+ if entry[prediction_feature.uri]
+ value = entry[prediction_feature.uri][which_row[compound]]
+ lazar.activities[compound] << ( prediction_feature.feature_type == "classification" ? lazar.value_map.invert[value] : value )
end
+ which_row[compound] += 1
end
task.progress 90
diff --git a/libfminer b/libfminer
-Subproject a86af9e55225c5c04403efd0209f7aca800a982
+Subproject 4327230e9f517a9e6624e6b8e018aa3cbcbb802