diff options
author | Andreas Maunz <andreas@maunz.de> | 2012-10-05 15:47:48 +0200 |
---|---|---|
committer | Andreas Maunz <andreas@maunz.de> | 2012-10-05 15:47:48 +0200 |
commit | bc6118dae334171cbaff4a61d4ae124052f74a59 (patch) | |
tree | 5907bd86b43b5f918599eeab18f9a9b939de3c4a | |
parent | 7123507b86139a002c0425357e515975c4796394 (diff) |
Fminer and Lazar tests running
-rw-r--r-- | lib/algorithm.rb | 29 | ||||
-rw-r--r-- | lib/dataset.rb | 26 | ||||
-rw-r--r-- | lib/model.rb | 42 | ||||
-rw-r--r-- | lib/parser.rb | 15 | ||||
-rw-r--r-- | lib/serializer.rb | 94 | ||||
-rw-r--r-- | lib/transform.rb | 51 |
6 files changed, 127 insertions, 130 deletions
diff --git a/lib/algorithm.rb b/lib/algorithm.rb index 78fc447..72a87cf 100644 --- a/lib/algorithm.rb +++ b/lib/algorithm.rb @@ -103,22 +103,7 @@ module OpenTox def add_fminer_data(fminer_instance, value_map) - - # detect nr duplicates per compound - compound_sizes = {} - @training_dataset.compounds.each do |compound| - entries=@training_dataset.data_entries[compound] - entries.each do |feature, values| - compound_sizes[compound] || compound_sizes[compound] = [] - compound_sizes[compound] << values.size unless values.size == 0 - end - compound_sizes[compound].uniq! - raise "Inappropriate data for fminer" if compound_sizes[compound].size > 1 - compound_sizes[compound] = compound_sizes[compound][0] # integer instead of array - end - id = 1 # fminer start id is not 0 - @training_dataset.compounds.each do |compound| entry=@training_dataset.data_entries[compound] begin @@ -132,17 +117,17 @@ module OpenTox next end - entry.each do |feature,values| + entry && entry.each do |feature,values| if feature == @prediction_feature.uri - (0...compound_sizes[compound]).each { |i| - if values[i].nil? + values.each do |value| + if value.nil? LOGGER.warn "No #{feature} activity for #{compound.to_s}." else if @prediction_feature.feature_type == "classification" - activity= value_map.invert[values[i]].to_i # activities are mapped to 1..n + activity= value_map.invert[value].to_i # activities are mapped to 1..n @db_class_sizes[activity-1].nil? ? @db_class_sizes[activity-1]=1 : @db_class_sizes[activity-1]+=1 # AM effect elsif @prediction_feature.feature_type == "regression" - activity= values[i].to_f + activity= value.to_f end begin fminer_instance.AddCompound(smiles,id) if fminer_instance @@ -152,11 +137,11 @@ module OpenTox @smi[id] = smiles id += 1 rescue Exception => e - LOGGER.warn "Could not add " + smiles + "\t" + values[i].to_s + " to fminer" + LOGGER.warn "Could not add " + smiles + "\t" + value.to_s + " to fminer" LOGGER.warn e.backtrace end end - } + end end end end diff --git a/lib/dataset.rb b/lib/dataset.rb index c916722..3c5fa7f 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -316,13 +316,24 @@ module OpenTox # @param [String] feature Compound URI # @param [Boolean,Float] value Feature value def add (compound,feature,value) - @compounds << compound unless @compounds.include? compound - @features[feature] = {} unless @features[feature] - @data_entries[compound] = {} unless @data_entries[compound] - @data_entries[compound][feature] = [] unless @data_entries[compound][feature] - @data_entries[compound][feature] << value if value!=nil + self.add_compound(compound) + self.add_data_entry(compound,feature,value) end + # Insert a data entry + # @param [String] compound Compound URI + # @param [String] feature Compound URI + # @param [Boolean,Float] value Feature value + def add_data_entry (compound,feature,value) + if @compounds.include? compound + @features[feature] = {} unless @features[feature] + @data_entries[compound] = {} unless @data_entries[compound] + @data_entries[compound][feature] = [] unless @data_entries[compound][feature] + @data_entries[compound][feature] << value + end + end + + # Add/modify metadata, existing entries will be overwritten # @example # dataset.add_metadata({DC.title => "any_title", DC.creator => "my_email"}) @@ -361,7 +372,8 @@ module OpenTox # Add a new compound # @param [String] compound Compound URI def add_compound (compound) - @compounds << compound unless @compounds.include? compound + @compounds << compound + #@compounds << compound unless @compounds.include? compound end # Creates a new dataset, by splitting the current dataset, i.e. using only a subset of compounds and features @@ -443,8 +455,6 @@ module OpenTox # - overwrites dataset if uri exists # @return [String] Dataset URI def save(subjectid=nil) - # TODO: rewrite feature URI's ?? - @compounds.uniq! if @uri if (CONFIG[:json_hosts].include?(URI.parse(@uri).host)) #LOGGER.debug self.to_json diff --git a/lib/model.rb b/lib/model.rb index 77b0274..065b227 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -261,7 +261,7 @@ module OpenTox :lib => self.parameter(\"lib\"), :subjectid => subjectid })") - + # Adding fingerprint of query compound with features and values(p_value*nr_hits) @compound_fingerprints = {} @compound_features.each do |feature, value| # value is nil if "Substructure.match" @@ -279,12 +279,14 @@ module OpenTox mtf.transform # Make a prediction - prediction = eval("#{@prediction_algorithm}( { :props => mtf.props, - :acts => mtf.acts, - :sims => mtf.sims, - :value_map => @value_map, - :min_train_performance => self.parameter(\"min_train_performance\") - } ) ") + modul, algorthm = @prediction_algorithm.split('.') + pred_params = { :props => mtf.props, + :acts => mtf.acts, + :sims => mtf.sims, + :value_map => @value_map, + :min_train_performance => self.parameter("min_train_performance") + } + prediction = eval(modul).send(algorthm, pred_params) value_feature_uri = File.join( @uri, "predicted", "value") confidence_feature_uri = File.join( @uri, "predicted", "confidence") @@ -292,16 +294,17 @@ module OpenTox @prediction_dataset.metadata[OT.dependentVariables] = @metadata[OT.dependentVariables] unless @prediction_dataset.metadata[OT.dependentVariables] @prediction_dataset.metadata[OT.predictedVariables] = [value_feature_uri, confidence_feature_uri] unless @prediction_dataset.metadata[OT.predictedVariables] + @prediction_dataset.add_compound @compound.uri if OpenTox::Feature.find(metadata[OT.dependentVariables], subjectid).feature_type == "classification" - @prediction_dataset.add @compound.uri, value_feature_uri, @value_map[prediction[:prediction].to_s] + @prediction_dataset.add_data_entry @compound.uri, value_feature_uri, @value_map[prediction[:prediction].to_s] else - @prediction_dataset.add @compound.uri, value_feature_uri, prediction[:prediction] + @prediction_dataset.add_data_entry @compound.uri, value_feature_uri, prediction[:prediction] end confidence=prediction[:confidence] if @similarity_algorithm.to_s =~ /cosine/ confidence=((confidence+1.0)/2.0).abs end - @prediction_dataset.add @compound.uri, confidence_feature_uri, confidence + @prediction_dataset.add_data_entry @compound.uri, confidence_feature_uri, confidence @prediction_dataset.features[value_feature_uri][DC.title] = @prediction_dataset.metadata[DC.title] @prediction_dataset.features[confidence_feature_uri][DC.title] = "Confidence" @@ -318,7 +321,7 @@ module OpenTox OT.pValue => @p_values[feature], OT.effect => @effects[feature] }) - @prediction_dataset.add @compound.uri, feature_uri, true + @prediction_dataset.add_data_entry @compound.uri, feature_uri, true f+=1 end elsif @feature_calculation_algorithm == "Substructure.lookup" @@ -328,13 +331,13 @@ module OpenTox @prediction_dataset.add_feature(feature, { RDF.type => [OT.NumericFeature] }) - @prediction_dataset.add @compound.uri, feature, value + @prediction_dataset.add_data_entry @compound.uri, feature, value f+=1 end else @compound_features.each do |feature| features[feature] = feature - @prediction_dataset.add @compound.uri, feature, true + @prediction_dataset.add_data_entry @compound.uri, feature, true end end n = 0 @@ -346,8 +349,10 @@ module OpenTox OT.measuredActivity => neighbor[:activity], RDF.type => [OT.Neighbor] }) - @prediction_dataset.add @compound.uri, neighbor_uri, true + @prediction_dataset.add_data_entry @compound.uri, neighbor_uri, true f = 0 unless f + + @prediction_dataset.add_compound neighbor[:compound] neighbor[:features].each do |feature| if @feature_calculation_algorithm == "Substructure.match" feature_uri = File.join( @prediction_dataset.uri, "feature", "descriptor", f.to_s) unless feature_uri = features[feature] @@ -355,9 +360,9 @@ module OpenTox feature_uri = feature end if @feature_calculation_algorithm == "Substructure.lookup" - @prediction_dataset.add neighbor[:compound], feature_uri, @fingerprints[neighbor[:compound]][feature_uri] + @prediction_dataset.add_data_entry neighbor[:compound], feature_uri, @fingerprints[neighbor[:compound]][feature_uri] else - @prediction_dataset.add neighbor[:compound], feature_uri, true + @prediction_dataset.add_data_entry neighbor[:compound], feature_uri, true end unless features.has_key? feature @@ -390,10 +395,11 @@ module OpenTox # @return [Boolean] true if compound has databasse activities, false if not def database_activity(subjectid) if @activities[@compound.uri] + @prediction_dataset.add_compound @compound.uri if OpenTox::Feature.find(metadata[OT.dependentVariables], subjectid).feature_type == "classification" - @activities[@compound.uri].each { |act| @prediction_dataset.add @compound.uri, @metadata[OT.dependentVariables], @value_map[act.to_s] } + @activities[@compound.uri].each { |act| @prediction_dataset.add_data_entry @compound.uri, @metadata[OT.dependentVariables], @value_map[act.to_s] } else - @activities[@compound.uri].each { |act| @prediction_dataset.add @compound.uri, @metadata[OT.dependentVariables], act } + @activities[@compound.uri].each { |act| @prediction_dataset.add_data_entry @compound.uri, @metadata[OT.dependentVariables], act } end @prediction_dataset.add_metadata(OT.hasSource => @metadata[OT.trainingDataset]) @prediction_dataset.save(subjectid) diff --git a/lib/parser.rb b/lib/parser.rb index 90f0570..aa1f669 100644 --- a/lib/parser.rb +++ b/lib/parser.rb @@ -294,7 +294,7 @@ module OpenTox @id_errors = [] @activity_errors = [] @duplicates = {} - @max_class_values = 3 + @max_class_values = 5 end def detect_new_values(row, value_maps) @@ -475,6 +475,7 @@ module OpenTox end @duplicates[compound.inchi] = [] unless @duplicates[compound.inchi] @duplicates[compound.inchi] << id+", "+row.join(", ") + @dataset.add_compound(compound.uri) feature_idx = 0 row.each_index do |i| @@ -502,12 +503,10 @@ module OpenTox feature_idx += 1 - if val != nil - @dataset.add(compound.uri, feature, val) - if @feature_types[feature].include? OT.NominalFeature - @dataset.features[feature][OT.acceptValue] = [] unless @dataset.features[feature][OT.acceptValue] - @dataset.features[feature][OT.acceptValue] << val unless @dataset.features[feature][OT.acceptValue].include?(val) - end + @dataset.add_data_entry(compound.uri, feature, val) + if @feature_types[feature].include? OT.NominalFeature + @dataset.features[feature][OT.acceptValue] = [] unless @dataset.features[feature][OT.acceptValue] + @dataset.features[feature][OT.acceptValue] << val unless (@dataset.features[feature][OT.acceptValue].include?(val) or val.nil?) end end @@ -539,7 +538,7 @@ module OpenTox def initialize @data = {} @activity_errors = [] - @max_class_values = 3 + @max_class_values = 5 end def feature_values(feature) diff --git a/lib/serializer.rb b/lib/serializer.rb index c0bb60d..8d41f05 100644 --- a/lib/serializer.rb +++ b/lib/serializer.rb @@ -462,17 +462,6 @@ module OpenTox features = dataset.features.keys - # remove missing features - delete_features = [] - features.each{ |id| - dataset.features[id][RDF.type].each { |typestr| - if typestr.include? "MissingFeature" - delete_features << id - end - } - } - features = features - delete_features - # sort features features.sort! @@ -486,60 +475,56 @@ module OpenTox compound_sizes = {} dataset.compounds.each do |compound| entries=dataset.data_entries[compound] - if entries - entries.each do |feature, values| - compound_sizes[compound] || compound_sizes[compound] = [] - compound_sizes[compound] << values.size - end - compound_sizes[compound].uniq! - raise "Inappropriate data for CSV export for compound #{compound}" if compound_sizes[compound].size > 1 - compound_sizes[compound] = compound_sizes[compound][0] # integer instead of array + entries && entries.each do |feature, values| + compound_sizes[compound] || compound_sizes[compound] = values.size + raise "Inappropriate data for CSV export" if compound_sizes[compound] != values.size end end + dataset.compounds.uniq.each do |compound| # Must handle compounds with no data entries + compound_sizes.has_key?(compound) || compound_sizes[compound] = dataset.compounds.count(compound) + end + # get headers features_smarts && @rows.first << features_smarts || @rows.first << features @rows.first.flatten! + which_row = dataset.compounds.inject({}) { |h,id| h[id]=0; h } + # feature positions pre-calculated - feature_positions = features.inject({}) { |h,f| - h.merge!({f => features.index(f)+1}) # +1 due to ID - h + feature_positions = {} + features.each_with_index { |f,idx| + feature_positions[f] = idx+1 # +1 due to ID } - # serialize to csv - dataset.compounds.each do |compound| - entries=dataset.data_entries[compound] - inchi = URI.encode_www_form_component(Compound.new(compound).to_inchi) - - if entries - # allocate container - row_container = Array.new(compound_sizes[compound]) - (0...row_container.size).each do |i| - row_container[i] = Array.new(@rows.first.size) - row_container[i][0] = inchi - end + # feature_types pre-calculated (for quoting) + feature_quoting = {} + features.each_with_index { |f,idx| + feature_quoting[f] = false + if dataset.features[f][RDF.type].size == 1 && dataset.features[f][RDF.type][0] == OT.NominalFeature + feature_quoting[f] = true + end + } - # fill entries - entries.each { |feature, values| - (0...compound_sizes[compound]).each { |i| - row_container[i][feature_positions[feature]] = values[i] - } - } - - # fill zeroes for subgraphs - if (features_smarts) - row_container.collect! { |row| - row.collect! { |x| x ? x : 0 } - } - end - row_container.each { |row| @rows << row } + @rows += dataset.compounds.collect do |compound| # assumes compounds list with duplicates + inchi_unenc = Compound.new(compound).to_inchi + inchi = URI.encode_www_form_component(inchi_unenc) - else - row = Array.new(@rows.first.size) - row[0] = inchi - @rows << row - end + i = which_row[compound] # select appropriate feature value + + # allocate row + row = Array.new(@rows.first.size) + row[0] = inchi + + # fill entries + entries=dataset.data_entries[compound] + entries && entries.each { |feature, values| + row[feature_positions[feature]] = feature_quoting[feature] ? "\""+values[i].to_s+"\"" : values[i].to_s + } + + which_row[compound] = i + 1 + + row end end @@ -549,8 +534,7 @@ module OpenTox rows = @rows.collect result = "" result << rows.shift.collect { |f| f.split('/').last }.join(",") << "\n" # only feature name - result << rows.collect{ |r| r.join(",") }.join("\n") - result << "\n" + result << rows.collect{ |r| r.join(",") }.join("\n") + "\n" end # Convert to spreadsheet workbook diff --git a/lib/transform.rb b/lib/transform.rb index 67a9ec7..fb053ee 100644 --- a/lib/transform.rb +++ b/lib/transform.rb @@ -409,29 +409,35 @@ module OpenTox # Find neighbors and store them as object variable, access all compounds for that. def neighbors @model.neighbors = [] + which_row=@cmpds.inject({}) {|h,c| h[c]=0; h} if @similarity_algorithm.to_s =~ /tanimoto/ - @cmpds.each_with_index { |cmpd, idx| add_neighbor @model.fingerprints[cmpd], idx } + @cmpds.each_with_index { |cmpd, idx| + fp={}; @model.fingerprints[cmpd].each { |f,vals| fp[f] = vals[which_row[cmpd]] } + add_neighbor fp, idx, which_row[cmpd] # Pass hash, not array + which_row[cmpd] += 1 + } else - @n_prop.each_with_index { |fp, idx| add_neighbor fp, idx } # AM: access all compounds + @cmpds.each_with_index{ |cmpd, idx| + add_neighbor @n_prop[idx], idx, which_row[cmpd] # AM: access all compounds + which_row[cmpd] += 1 + } end end # Adds a neighbor to @neighbors if it passes the similarity threshold # adjusts @ids to signal the - def add_neighbor(training_props, idx) + def add_neighbor(training_props, idx, which_row) sim = similarity(training_props) if sim > @model.parameter("min_sim") if @model.activities[@cmpds[idx]] - @model.activities[@cmpds[idx]].each do |act| - @model.neighbors << { - :compound => @cmpds[idx], - :similarity => sim, - :features => @fps[idx].keys, - :activity => act - } - @sims << sim - @ids << idx - end + @model.neighbors << { + :compound => @cmpds[idx], + :similarity => sim, + :features => @fps[idx].keys, + :activity => @model.activities[@cmpds[idx]][which_row] + } + @sims << sim + @ids << idx end end end @@ -497,20 +503,27 @@ module OpenTox # Major BUG! Must loop over @model.compounds, hash is unordered! # @model.fingerprints.each + which_row=@model.compounds.inject({}) {|h,c| h[c]=0; h} @model.compounds.each { |cmpd| fp = @model.fingerprints[cmpd] if @model.activities[cmpd] # row good - acts = @model.activities[cmpd]; @acts += acts + acts = @model.activities[cmpd]; @acts << acts[which_row[cmpd]] LOGGER.debug "#{acts.size} activities for '#{cmpd}'" if acts.size > 1 - row = []; @model.features.each { |f| row << fp[f] } # nils for non-existent f's - acts.size.times { # multiple additions for multiple activities - @n_prop << row.collect - @cmpds << cmpd - @fps << Marshal.load(Marshal.dump(fp)) + row = []; @model.features.each { |f| + if fp[f].nil? + row << nil + else + row << fp[f][which_row[cmpd]] + end + #row << fp[f].nil? ? nil : fp[f][which_row[cmpd]] # nils for non-existent f's } + @n_prop << row.collect + @cmpds << cmpd + @fps << Marshal.load(Marshal.dump(fp)) else LOGGER.warn "No activity found for compound '#{cmpd}' in model '#{@model.uri}'" end + which_row[cmpd] += 1 } @model.features.each { |f| @q_prop << @model.compound_fingerprints[f] } # query structure |