From dc7ab074a06837544ae9fdc94d61640095d26271 Mon Sep 17 00:00:00 2001 From: Andreas Maunz Date: Fri, 5 Oct 2012 15:49:20 +0200 Subject: Fminer and Lazar tests running --- bbrc-sample | 2 +- fminer.rb | 115 +++++++++++++++++------------------------------------------- lazar.rb | 49 +++++++++++--------------- libfminer | 2 +- 4 files changed, 55 insertions(+), 113 deletions(-) diff --git a/bbrc-sample b/bbrc-sample index 1552da2..f2eacee 160000 --- a/bbrc-sample +++ b/bbrc-sample @@ -1 +1 @@ -Subproject commit 1552da28ebbcbfb2128e5561cd8aff2ef534219e +Subproject commit f2eacee266e954f538e51107bb7bebd80b307ecb diff --git a/fminer.rb b/fminer.rb index 2f4cb67..190c8c1 100644 --- a/fminer.rb +++ b/fminer.rb @@ -302,56 +302,30 @@ post '/fminer/bbrc/?' do ] } feature_dataset.add_feature feature_uri, metadata - #feature_dataset.add_feature_parameters feature_uri, feature_dataset.parameters end - id_arrs.each { |id_count_hash| - id=id_count_hash.keys[0].to_i - count=id_count_hash.values[0].to_i - fminer_results[@@fminer.compounds[id]] || fminer_results[@@fminer.compounds[id]] = {} - fminer_results[@@fminer.compounds[id]][feature_uri] || fminer_results[@@fminer.compounds[id]][feature_uri] = [] - if params[:nr_hits] == "true" - fminer_results[@@fminer.compounds[id]][feature_uri] << count - else - fminer_results[@@fminer.compounds[id]][feature_uri] << 1 + + # re-format to one big hash id_arr + id_arr = {}; id_arrs.each { |id_count_hash| id_arr[id_count_hash.keys[0]] = id_count_hash.values[0] } + @@fminer.compounds.collect.each_with_index { |cmpd,id| # This collects all cmpds that have an activity + val = id_arr[id] ? ( params[:nr_hits] == "true" ? id_arr[id].to_i : 1 ) : 0 + if (val != 0 or params[:complete_entries] == "true") + fminer_results[cmpd] || fminer_results[cmpd] = {} + fminer_results[cmpd][feature_uri] || fminer_results[cmpd][feature_uri] = [] + fminer_results[cmpd][feature_uri] << val end } - - end # end of + + end # end of end # feature parsing - # Collect compounds, in order with duplicates (owd) - fminer_compounds_owd = @@fminer.compounds.collect - fminer_compounds_owd.shift if fminer_compounds_owd[0].nil? - - # Complete fminer_results, s.t. it contains entries for any training compound - if (params[:complete_entries] == "true") - myhelpfeature = File.join(feature_dataset.uri,"feature","bbrc", (features.size-1).to_s) - (fminer_compounds_owd - fminer_results.keys).each { |cmpd| # add possibly multiple entries - fminer_results[cmpd] || fminer_results[cmpd] = {} - fminer_results[cmpd][myhelpfeature] || fminer_results[cmpd][myhelpfeature] = [] - fminer_results[cmpd][myhelpfeature] << 0 - } - end - - # Add fminer results to feature dataset along owd - all_compounds_owd = @@fminer.training_dataset.compounds.collect { |v| - times_in_fminer_compounds_owd = fminer_compounds_owd.count(v) - times_in_fminer_compounds_owd == 0 ? v : Array.new(times_in_fminer_compounds_owd,v) - }.flatten - if (params[:complete_entries] == "true") - all_compounds_owd.each { |compound| - feature_dataset.add_compound(compound) # add compounds *in order* - } - end - which_row = @@fminer.training_dataset.compounds.inject({}) { |h,id| h[id]=0; h } - unused_compounds = fminer_compounds_owd - fminer_results.keys - (fminer_compounds_owd - unused_compounds).each { |compound| - feature_dataset.add_compound(compound) unless (params[:complete_entries] == "true") - fminer_results[compound].each { |feature, values| - feature_dataset.add( compound, feature, values[which_row[compound]] ) + @@fminer.training_dataset.compounds.each { |cmpd| + feature_dataset.add_compound(cmpd) # *unconditionally* add compounds *in order* + i = which_row[cmpd] + fminer_results[cmpd] && fminer_results[cmpd].each { |feature, values| + feature_dataset.add_data_entry( cmpd, feature, values[i] ) } - which_row[compound] += 1 + which_row[cmpd] += 1 } # AM: add feature values for non-present features # feature_dataset.complete_data_entries @@ -515,8 +489,7 @@ post '/fminer/bbrc/sample/?' do params[:nr_hits] == "true" ? hit_count=true: hit_count=false matches, counts, used_compounds = lu.match_rb(@@fminer.smi,smarts,hit_count,true) # last arg: always create complete entries for sampling - # Collect compounds, in order with duplicates (owd) and put in dataset - used_compounds.each { |idx| feature_dataset.add_compound(@@fminer.compounds[idx]) } + @@fminer.training_dataset.compounds.each { |cmpd| feature_dataset.add_compound(cmpd) } feature_dataset.add_metadata({ OT.parameters => [ @@ -539,12 +512,9 @@ post '/fminer/bbrc/sample/?' do metadata = calc_metadata (smarts, ids, counts[smarts], nil, nil , value_map, params, smarts_p_values[smarts]) feature_uri = File.join feature_dataset.uri,"feature","last", feature_dataset.features.size.to_s feature_dataset.add_feature feature_uri, metadata - ids.each_with_index { |id,idx| feature_dataset.add(@@fminer.compounds[id], feature_uri, counts[smarts][idx])} + ids.each_with_index { |id,idx| feature_dataset.add_data_entry(@@fminer.compounds[id], feature_uri, counts[smarts][idx])} end - # AM: add feature values for non-present features - # feature_dataset.complete_data_entries - feature_dataset.save(@subjectid) feature_dataset.uri end @@ -624,30 +594,19 @@ post '/fminer/last/?' do smarts=lu.smarts_rb(dom,'nls') # converts patterns to LAST-SMARTS using msa variant (see last-pm.maunz.de) params[:nr_hits] == "true" ? hit_count=true : hit_count=false params[:complete_entries] == "true" ? complete_entries=true : complete_entries=false - matches, counts, used_compounds = lu.match_rb(@@fminer.smi,smarts,hit_count,complete_entries) # creates instantiations - - # Collect compounds, in order with duplicates (owd) and put in dataset - fminer_compounds_owd = used_compounds.collect { |idx| @@fminer.compounds[idx] } - if (complete_entries) - all_compounds_owd = @@fminer.training_dataset.compounds.collect { |v| - times_in_fminer_compounds_owd = fminer_compounds_owd.count(v) - times_in_fminer_compounds_owd == 0 ? v : Array.new(times_in_fminer_compounds_owd,v) - }.flatten - all_compounds_owd.each { |compound| feature_dataset.add_compound(compound) } - else - fminer_compounds_owd.each { |compound| feature_dataset.add_compound(compound) } - end + matches, counts = lu.match_rb(@@fminer.smi,smarts,hit_count,complete_entries) # creates instantiations + @@fminer.training_dataset.compounds.each { |cmpd| feature_dataset.add_compound(cmpd) } matches.each do |smarts, ids| metadata = calc_metadata (smarts, ids, counts[smarts], @@last, nil, value_map, params) feature_uri = File.join feature_dataset.uri,"feature","last", feature_dataset.features.size.to_s feature_dataset.add_feature feature_uri, metadata - ids.each_with_index { |id,idx| feature_dataset.add(@@fminer.compounds[id], feature_uri, counts[smarts][idx])} + @@fminer.compounds.collect.each_with_index { |cmpd,id| # This collects all cmpds that have an activity + count_idx = matches[smarts].index(id) + feature_dataset.add_data_entry(cmpd, feature_uri, counts[smarts][count_idx]) if count_idx + } end - # AM: add feature values for non-present features - # feature_dataset.complete_data_entries - feature_dataset.save(@subjectid) feature_dataset.uri end @@ -703,25 +662,17 @@ post '/fminer/:method/match?' do smarts = f_dataset.features.collect { |f,m| m[OT.smarts] } params[:nr_hits] == "true" ? hit_count=true : hit_count=false params[:complete_entries] == "true" ? complete_entries=true : complete_entries=false - matches, counts, used_compounds = LU.new.match_rb(@@fminer.smi, smarts, hit_count, complete_entries) if smarts.size>0 - - # Collect compounds, in order with duplicates (owd) and put in dataset - fminer_compounds_owd = used_compounds.collect { |idx| @@fminer.compounds[idx] } - if (complete_entries) - all_compounds_owd = @@fminer.training_dataset.compounds.collect { |v| - times_in_fminer_compounds_owd = fminer_compounds_owd.count(v) - times_in_fminer_compounds_owd == 0 ? v : Array.new(times_in_fminer_compounds_owd,v) - }.flatten - all_compounds_owd.each { |compound| feature_dataset.add_compound(compound) } - else - fminer_compounds_owd.each { |compound| feature_dataset.add_compound(compound) } - end + matches, counts = LU.new.match_rb(@@fminer.smi, smarts, hit_count, complete_entries) if smarts.size>0 + @@fminer.training_dataset.compounds.each { |cmpd| feature_dataset.add_compound(cmpd) } matches.each do |smarts, ids| - metadata = calc_metadata (smarts, ids, counts[smarts], @@last, feature_dataset.uri, value_map, params) - feature_uri = File.join feature_dataset.uri,"feature","bbrc","match", feature_dataset.features.size.to_s + metadata = calc_metadata (smarts, ids, counts[smarts], @@last, nil, value_map, params) + feature_uri = File.join feature_dataset.uri,"feature",params[:method], feature_dataset.features.size.to_s feature_dataset.add_feature feature_uri, metadata - ids.each_with_index { |id,idx| feature_dataset.add(@@fminer.compounds[id], feature_uri, counts[smarts][idx]) } + @@fminer.compounds.collect.each_with_index { |cmpd,id| # This collects all cmpds that have an activity + count_idx = matches[smarts].index(id) + feature_dataset.add_data_entry(cmpd, feature_uri, counts[smarts][count_idx]) if count_idx + } end feature_dataset.save @subjectid diff --git a/lazar.rb b/lazar.rb index 0e0eb40..11643b3 100644 --- a/lazar.rb +++ b/lazar.rb @@ -42,6 +42,8 @@ post '/lazar/?' do raise OpenTox::NotFoundError.new "No dataset_uri parameter." unless params[:dataset_uri] dataset_uri = params[:dataset_uri] + LOGGER.debug "Creating lazar model with params #{params.inspect}" + task = OpenTox::Task.create("Create lazar model",url_for('/lazar',:full)) do |task| @@ -181,15 +183,18 @@ post '/lazar/?' do # Creating InChi/URI Hash from trainig_feature for comparison with training_dataset to avoid missmatches caused by different URI authorities feature_compounds = {} + which_row={} training_features.compounds.each {|f_c_uri| f_compound = OpenTox::Compound.new(f_c_uri) feature_compounds[f_compound.to_inchi] = f_c_uri + which_row[f_compound.to_inchi] = 0 } training_dataset.compounds.each do |t_c_uri| t_compound = OpenTox::Compound.new(t_c_uri) entry = training_features.data_entries[feature_compounds[t_compound.to_inchi]] + row_idx = which_row[t_compound.to_inchi] if entry.nil? # Training compound not found in feature dataset del_master_compounds << t_c_uri # Delete if training compound not found in feature dataset @@ -201,12 +206,8 @@ post '/lazar/?' do if (lazar.feature_calculation_algorithm == "Substructure.match") || (lazar.feature_calculation_algorithm == "Substructure.match_hits") if training_features.features[feature] smarts = training_features.features[feature][OT.smarts] - #lazar.fingerprints[compound] << smarts - if lazar.feature_calculation_algorithm == "Substructure.match_hits" - lazar.fingerprints[t_c_uri][smarts] = entry[feature].flatten.first * training_features.features[feature][OT.pValue] - else - lazar.fingerprints[t_c_uri][smarts] = 1 * training_features.features[feature][OT.pValue] - end + lazar.fingerprints[t_c_uri][smarts] = [] unless lazar.fingerprints[t_c_uri][smarts] + lazar.fingerprints[t_c_uri][smarts] << entry[feature][row_idx] * training_features.features[feature][OT.pValue] unless lazar.features.include? smarts lazar.features << smarts lazar.p_values[smarts] = training_features.features[feature][OT.pValue] @@ -216,17 +217,18 @@ post '/lazar/?' do # CASE 2: Others elsif entry[feature].flatten.size == 1 - lazar.fingerprints[t_c_uri][feature] = entry[feature].flatten.first + lazar.fingerprints[t_c_uri][feature] = [] unless lazar.fingerprints[t_c_uri][feature] + lazar.fingerprints[t_c_uri][feature] << entry[feature][row_idx] lazar.features << feature unless lazar.features.include? feature - else - LOGGER.warn "More than one entry (#{entry[feature].inspect}) for compound #{t_c_uri}, feature #{feature}" end + end end + which_row[t_compound.to_inchi] += 1 end task.progress 80 - + # Show compounds without feature information if del_master_compounds.size>0 del_master_compounds.each{|compound| LOGGER.info "Compound: '#{compound.to_s}' not found in feature dataset and will be removed from compound list."} @@ -235,26 +237,15 @@ post '/lazar/?' do lazar.compounds=training_dataset.compounds.collect - del_master_compounds # Add only compounds with fingerprints # # # Activities - if prediction_feature.feature_type == "regression" - lazar.compounds.each do |compound| - entry = training_dataset.data_entries[compound] - lazar.activities[compound] = [] unless lazar.activities[compound] - unless entry[prediction_feature.uri].empty? - entry[prediction_feature.uri].each do |value| - lazar.activities[compound] << value - end - end - end - elsif prediction_feature.feature_type == "classification" - lazar.compounds.each do |compound| - entry = training_dataset.data_entries[compound] - lazar.activities[compound] = [] unless lazar.activities[compound] - unless entry[prediction_feature.uri].empty? - entry[prediction_feature.uri].each do |value| - lazar.activities[compound] << lazar.value_map.invert[value] # insert mapped values, not originals - end - end + which_row=lazar.compounds.inject({}) {|h,c| h[c]=0; h} + lazar.compounds.each do |compound| + entry = training_dataset.data_entries[compound] + lazar.activities[compound] = [] unless lazar.activities[compound] + if entry[prediction_feature.uri] + value = entry[prediction_feature.uri][which_row[compound]] + lazar.activities[compound] << ( prediction_feature.feature_type == "classification" ? lazar.value_map.invert[value] : value ) end + which_row[compound] += 1 end task.progress 90 diff --git a/libfminer b/libfminer index a86af9e..4327230 160000 --- a/libfminer +++ b/libfminer @@ -1 +1 @@ -Subproject commit a86af9e55225c5c04403efd0209f7aca800a9827 +Subproject commit 4327230e9f517a9e6624e6b8e018aa3cbcbb8022 -- cgit v1.2.3