summary refs log tree commit diff
diff options
context:
space:
mode:
author Andreas Maunz <andreas@maunz.de> 2012-10-05 15:47:48 +0200
committer Andreas Maunz <andreas@maunz.de> 2012-10-05 15:47:48 +0200
commit bc6118dae334171cbaff4a61d4ae124052f74a59 (patch)
tree 5907bd86b43b5f918599eeab18f9a9b939de3c4a
parent 7123507b86139a002c0425357e515975c4796394 (diff)
Fminer and Lazar tests running
-rw-r--r-- lib/algorithm.rb 29
-rw-r--r-- lib/dataset.rb 26
-rw-r--r-- lib/model.rb 42
-rw-r--r-- lib/parser.rb 15
-rw-r--r-- lib/serializer.rb 94
-rw-r--r-- lib/transform.rb 51
6 files changed, 127 insertions, 130 deletions
diff --git a/lib/algorithm.rb b/lib/algorithm.rb
index 78fc447..72a87cf 100644
--- a/lib/algorithm.rb
+++ b/lib/algorithm.rb
@@ -103,22 +103,7 @@ module OpenTox
def add_fminer_data(fminer_instance, value_map)
-
- # detect nr duplicates per compound
- compound_sizes = {}
- @training_dataset.compounds.each do |compound|
- entries=@training_dataset.data_entries[compound]
- entries.each do |feature, values|
- compound_sizes[compound] || compound_sizes[compound] = []
- compound_sizes[compound] << values.size unless values.size == 0
- end
- compound_sizes[compound].uniq!
- raise "Inappropriate data for fminer" if compound_sizes[compound].size > 1
- compound_sizes[compound] = compound_sizes[compound][0] # integer instead of array
- end
-
id = 1 # fminer start id is not 0
-
@training_dataset.compounds.each do |compound|
entry=@training_dataset.data_entries[compound]
begin
@@ -132,17 +117,17 @@ module OpenTox
next
end
- entry.each do |feature,values|
+ entry && entry.each do |feature,values|
if feature == @prediction_feature.uri
- (0...compound_sizes[compound]).each { |i|
- if values[i].nil?
+ values.each do |value|
+ if value.nil?
LOGGER.warn "No #{feature} activity for #{compound.to_s}."
else
if @prediction_feature.feature_type == "classification"
- activity= value_map.invert[values[i]].to_i # activities are mapped to 1..n
+ activity= value_map.invert[value].to_i # activities are mapped to 1..n
@db_class_sizes[activity-1].nil? ? @db_class_sizes[activity-1]=1 : @db_class_sizes[activity-1]+=1 # AM effect
elsif @prediction_feature.feature_type == "regression"
- activity= values[i].to_f
+ activity= value.to_f
end
begin
fminer_instance.AddCompound(smiles,id) if fminer_instance
@@ -152,11 +137,11 @@ module OpenTox
@smi[id] = smiles
id += 1
rescue Exception => e
- LOGGER.warn "Could not add " + smiles + "\t" + values[i].to_s + " to fminer"
+ LOGGER.warn "Could not add " + smiles + "\t" + value.to_s + " to fminer"
LOGGER.warn e.backtrace
end
end
- }
+ end
end
end
end
diff --git a/lib/dataset.rb b/lib/dataset.rb
index c916722..3c5fa7f 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -316,13 +316,24 @@ module OpenTox
# @param [String] feature Compound URI
# @param [Boolean,Float] value Feature value
def add (compound,feature,value)
- @compounds << compound unless @compounds.include? compound
- @features[feature] = {} unless @features[feature]
- @data_entries[compound] = {} unless @data_entries[compound]
- @data_entries[compound][feature] = [] unless @data_entries[compound][feature]
- @data_entries[compound][feature] << value if value!=nil
+ self.add_compound(compound)
+ self.add_data_entry(compound,feature,value)
end
+ # Insert a data entry
+ # @param [String] compound Compound URI
+ # @param [String] feature Compound URI
+ # @param [Boolean,Float] value Feature value
+ def add_data_entry (compound,feature,value)
+ if @compounds.include? compound
+ @features[feature] = {} unless @features[feature]
+ @data_entries[compound] = {} unless @data_entries[compound]
+ @data_entries[compound][feature] = [] unless @data_entries[compound][feature]
+ @data_entries[compound][feature] << value
+ end
+ end
+
+
# Add/modify metadata, existing entries will be overwritten
# @example
# dataset.add_metadata({DC.title => "any_title", DC.creator => "my_email"})
@@ -361,7 +372,8 @@ module OpenTox
# Add a new compound
# @param [String] compound Compound URI
def add_compound (compound)
- @compounds << compound unless @compounds.include? compound
+ @compounds << compound
+ #@compounds << compound unless @compounds.include? compound
end
# Creates a new dataset, by splitting the current dataset, i.e. using only a subset of compounds and features
@@ -443,8 +455,6 @@ module OpenTox
# - overwrites dataset if uri exists
# @return [String] Dataset URI
def save(subjectid=nil)
- # TODO: rewrite feature URI's ??
- @compounds.uniq!
if @uri
if (CONFIG[:json_hosts].include?(URI.parse(@uri).host))
#LOGGER.debug self.to_json
diff --git a/lib/model.rb b/lib/model.rb
index 77b0274..065b227 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -261,7 +261,7 @@ module OpenTox
:lib => self.parameter(\"lib\"),
:subjectid => subjectid
})")
-
+
# Adding fingerprint of query compound with features and values(p_value*nr_hits)
@compound_fingerprints = {}
@compound_features.each do |feature, value| # value is nil if "Substructure.match"
@@ -279,12 +279,14 @@ module OpenTox
mtf.transform
# Make a prediction
- prediction = eval("#{@prediction_algorithm}( { :props => mtf.props,
- :acts => mtf.acts,
- :sims => mtf.sims,
- :value_map => @value_map,
- :min_train_performance => self.parameter(\"min_train_performance\")
- } ) ")
+ modul, algorthm = @prediction_algorithm.split('.')
+ pred_params = { :props => mtf.props,
+ :acts => mtf.acts,
+ :sims => mtf.sims,
+ :value_map => @value_map,
+ :min_train_performance => self.parameter("min_train_performance")
+ }
+ prediction = eval(modul).send(algorthm, pred_params)
value_feature_uri = File.join( @uri, "predicted", "value")
confidence_feature_uri = File.join( @uri, "predicted", "confidence")
@@ -292,16 +294,17 @@ module OpenTox
@prediction_dataset.metadata[OT.dependentVariables] = @metadata[OT.dependentVariables] unless @prediction_dataset.metadata[OT.dependentVariables]
@prediction_dataset.metadata[OT.predictedVariables] = [value_feature_uri, confidence_feature_uri] unless @prediction_dataset.metadata[OT.predictedVariables]
+ @prediction_dataset.add_compound @compound.uri
if OpenTox::Feature.find(metadata[OT.dependentVariables], subjectid).feature_type == "classification"
- @prediction_dataset.add @compound.uri, value_feature_uri, @value_map[prediction[:prediction].to_s]
+ @prediction_dataset.add_data_entry @compound.uri, value_feature_uri, @value_map[prediction[:prediction].to_s]
else
- @prediction_dataset.add @compound.uri, value_feature_uri, prediction[:prediction]
+ @prediction_dataset.add_data_entry @compound.uri, value_feature_uri, prediction[:prediction]
end
confidence=prediction[:confidence]
if @similarity_algorithm.to_s =~ /cosine/
confidence=((confidence+1.0)/2.0).abs
end
- @prediction_dataset.add @compound.uri, confidence_feature_uri, confidence
+ @prediction_dataset.add_data_entry @compound.uri, confidence_feature_uri, confidence
@prediction_dataset.features[value_feature_uri][DC.title] = @prediction_dataset.metadata[DC.title]
@prediction_dataset.features[confidence_feature_uri][DC.title] = "Confidence"
@@ -318,7 +321,7 @@ module OpenTox
OT.pValue => @p_values[feature],
OT.effect => @effects[feature]
})
- @prediction_dataset.add @compound.uri, feature_uri, true
+ @prediction_dataset.add_data_entry @compound.uri, feature_uri, true
f+=1
end
elsif @feature_calculation_algorithm == "Substructure.lookup"
@@ -328,13 +331,13 @@ module OpenTox
@prediction_dataset.add_feature(feature, {
RDF.type => [OT.NumericFeature]
})
- @prediction_dataset.add @compound.uri, feature, value
+ @prediction_dataset.add_data_entry @compound.uri, feature, value
f+=1
end
else
@compound_features.each do |feature|
features[feature] = feature
- @prediction_dataset.add @compound.uri, feature, true
+ @prediction_dataset.add_data_entry @compound.uri, feature, true
end
end
n = 0
@@ -346,8 +349,10 @@ module OpenTox
OT.measuredActivity => neighbor[:activity],
RDF.type => [OT.Neighbor]
})
- @prediction_dataset.add @compound.uri, neighbor_uri, true
+ @prediction_dataset.add_data_entry @compound.uri, neighbor_uri, true
f = 0 unless f
+
+ @prediction_dataset.add_compound neighbor[:compound]
neighbor[:features].each do |feature|
if @feature_calculation_algorithm == "Substructure.match"
feature_uri = File.join( @prediction_dataset.uri, "feature", "descriptor", f.to_s) unless feature_uri = features[feature]
@@ -355,9 +360,9 @@ module OpenTox
feature_uri = feature
end
if @feature_calculation_algorithm == "Substructure.lookup"
- @prediction_dataset.add neighbor[:compound], feature_uri, @fingerprints[neighbor[:compound]][feature_uri]
+ @prediction_dataset.add_data_entry neighbor[:compound], feature_uri, @fingerprints[neighbor[:compound]][feature_uri]
else
- @prediction_dataset.add neighbor[:compound], feature_uri, true
+ @prediction_dataset.add_data_entry neighbor[:compound], feature_uri, true
end
unless features.has_key? feature
@@ -390,10 +395,11 @@ module OpenTox
# @return [Boolean] true if compound has databasse activities, false if not
def database_activity(subjectid)
if @activities[@compound.uri]
+ @prediction_dataset.add_compound @compound.uri
if OpenTox::Feature.find(metadata[OT.dependentVariables], subjectid).feature_type == "classification"
- @activities[@compound.uri].each { |act| @prediction_dataset.add @compound.uri, @metadata[OT.dependentVariables], @value_map[act.to_s] }
+ @activities[@compound.uri].each { |act| @prediction_dataset.add_data_entry @compound.uri, @metadata[OT.dependentVariables], @value_map[act.to_s] }
else
- @activities[@compound.uri].each { |act| @prediction_dataset.add @compound.uri, @metadata[OT.dependentVariables], act }
+ @activities[@compound.uri].each { |act| @prediction_dataset.add_data_entry @compound.uri, @metadata[OT.dependentVariables], act }
end
@prediction_dataset.add_metadata(OT.hasSource => @metadata[OT.trainingDataset])
@prediction_dataset.save(subjectid)
diff --git a/lib/parser.rb b/lib/parser.rb
index 90f0570..aa1f669 100644
--- a/lib/parser.rb
+++ b/lib/parser.rb
@@ -294,7 +294,7 @@ module OpenTox
@id_errors = []
@activity_errors = []
@duplicates = {}
- @max_class_values = 3
+ @max_class_values = 5
end
def detect_new_values(row, value_maps)
@@ -475,6 +475,7 @@ module OpenTox
end
@duplicates[compound.inchi] = [] unless @duplicates[compound.inchi]
@duplicates[compound.inchi] << id+", "+row.join(", ")
+ @dataset.add_compound(compound.uri)
feature_idx = 0
row.each_index do |i|
@@ -502,12 +503,10 @@ module OpenTox
feature_idx += 1
- if val != nil
- @dataset.add(compound.uri, feature, val)
- if @feature_types[feature].include? OT.NominalFeature
- @dataset.features[feature][OT.acceptValue] = [] unless @dataset.features[feature][OT.acceptValue]
- @dataset.features[feature][OT.acceptValue] << val unless @dataset.features[feature][OT.acceptValue].include?(val)
- end
+ @dataset.add_data_entry(compound.uri, feature, val)
+ if @feature_types[feature].include? OT.NominalFeature
+ @dataset.features[feature][OT.acceptValue] = [] unless @dataset.features[feature][OT.acceptValue]
+ @dataset.features[feature][OT.acceptValue] << val unless (@dataset.features[feature][OT.acceptValue].include?(val) or val.nil?)
end
end
@@ -539,7 +538,7 @@ module OpenTox
def initialize
@data = {}
@activity_errors = []
- @max_class_values = 3
+ @max_class_values = 5
end
def feature_values(feature)
diff --git a/lib/serializer.rb b/lib/serializer.rb
index c0bb60d..8d41f05 100644
--- a/lib/serializer.rb
+++ b/lib/serializer.rb
@@ -462,17 +462,6 @@ module OpenTox
features = dataset.features.keys
- # remove missing features
- delete_features = []
- features.each{ |id|
- dataset.features[id][RDF.type].each { |typestr|
- if typestr.include? "MissingFeature"
- delete_features << id
- end
- }
- }
- features = features - delete_features
-
# sort features
features.sort!
@@ -486,60 +475,56 @@ module OpenTox
compound_sizes = {}
dataset.compounds.each do |compound|
entries=dataset.data_entries[compound]
- if entries
- entries.each do |feature, values|
- compound_sizes[compound] || compound_sizes[compound] = []
- compound_sizes[compound] << values.size
- end
- compound_sizes[compound].uniq!
- raise "Inappropriate data for CSV export for compound #{compound}" if compound_sizes[compound].size > 1
- compound_sizes[compound] = compound_sizes[compound][0] # integer instead of array
+ entries && entries.each do |feature, values|
+ compound_sizes[compound] || compound_sizes[compound] = values.size
+ raise "Inappropriate data for CSV export" if compound_sizes[compound] != values.size
end
end
+ dataset.compounds.uniq.each do |compound| # Must handle compounds with no data entries
+ compound_sizes.has_key?(compound) || compound_sizes[compound] = dataset.compounds.count(compound)
+ end
+
# get headers
features_smarts && @rows.first << features_smarts || @rows.first << features
@rows.first.flatten!
+ which_row = dataset.compounds.inject({}) { |h,id| h[id]=0; h }
+
# feature positions pre-calculated
- feature_positions = features.inject({}) { |h,f|
- h.merge!({f => features.index(f)+1}) # +1 due to ID
- h
+ feature_positions = {}
+ features.each_with_index { |f,idx|
+ feature_positions[f] = idx+1 # +1 due to ID
}
- # serialize to csv
- dataset.compounds.each do |compound|
- entries=dataset.data_entries[compound]
- inchi = URI.encode_www_form_component(Compound.new(compound).to_inchi)
-
- if entries
- # allocate container
- row_container = Array.new(compound_sizes[compound])
- (0...row_container.size).each do |i|
- row_container[i] = Array.new(@rows.first.size)
- row_container[i][0] = inchi
- end
+ # feature_types pre-calculated (for quoting)
+ feature_quoting = {}
+ features.each_with_index { |f,idx|
+ feature_quoting[f] = false
+ if dataset.features[f][RDF.type].size == 1 && dataset.features[f][RDF.type][0] == OT.NominalFeature
+ feature_quoting[f] = true
+ end
+ }
- # fill entries
- entries.each { |feature, values|
- (0...compound_sizes[compound]).each { |i|
- row_container[i][feature_positions[feature]] = values[i]
- }
- }
-
- # fill zeroes for subgraphs
- if (features_smarts)
- row_container.collect! { |row|
- row.collect! { |x| x ? x : 0 }
- }
- end
- row_container.each { |row| @rows << row }
+ @rows += dataset.compounds.collect do |compound| # assumes compounds list with duplicates
+ inchi_unenc = Compound.new(compound).to_inchi
+ inchi = URI.encode_www_form_component(inchi_unenc)
- else
- row = Array.new(@rows.first.size)
- row[0] = inchi
- @rows << row
- end
+ i = which_row[compound] # select appropriate feature value
+
+ # allocate row
+ row = Array.new(@rows.first.size)
+ row[0] = inchi
+
+ # fill entries
+ entries=dataset.data_entries[compound]
+ entries && entries.each { |feature, values|
+ row[feature_positions[feature]] = feature_quoting[feature] ? "\""+values[i].to_s+"\"" : values[i].to_s
+ }
+
+ which_row[compound] = i + 1
+
+ row
end
end
@@ -549,8 +534,7 @@ module OpenTox
rows = @rows.collect
result = ""
result << rows.shift.collect { |f| f.split('/').last }.join(",") << "\n" # only feature name
- result << rows.collect{ |r| r.join(",") }.join("\n")
- result << "\n"
+ result << rows.collect{ |r| r.join(",") }.join("\n") + "\n"
end
# Convert to spreadsheet workbook
diff --git a/lib/transform.rb b/lib/transform.rb
index 67a9ec7..fb053ee 100644
--- a/lib/transform.rb
+++ b/lib/transform.rb
@@ -409,29 +409,35 @@ module OpenTox
# Find neighbors and store them as object variable, access all compounds for that.
def neighbors
@model.neighbors = []
+ which_row=@cmpds.inject({}) {|h,c| h[c]=0; h}
if @similarity_algorithm.to_s =~ /tanimoto/
- @cmpds.each_with_index { |cmpd, idx| add_neighbor @model.fingerprints[cmpd], idx }
+ @cmpds.each_with_index { |cmpd, idx|
+ fp={}; @model.fingerprints[cmpd].each { |f,vals| fp[f] = vals[which_row[cmpd]] }
+ add_neighbor fp, idx, which_row[cmpd] # Pass hash, not array
+ which_row[cmpd] += 1
+ }
else
- @n_prop.each_with_index { |fp, idx| add_neighbor fp, idx } # AM: access all compounds
+ @cmpds.each_with_index{ |cmpd, idx|
+ add_neighbor @n_prop[idx], idx, which_row[cmpd] # AM: access all compounds
+ which_row[cmpd] += 1
+ }
end
end
# Adds a neighbor to @neighbors if it passes the similarity threshold
# adjusts @ids to signal the
- def add_neighbor(training_props, idx)
+ def add_neighbor(training_props, idx, which_row)
sim = similarity(training_props)
if sim > @model.parameter("min_sim")
if @model.activities[@cmpds[idx]]
- @model.activities[@cmpds[idx]].each do |act|
- @model.neighbors << {
- :compound => @cmpds[idx],
- :similarity => sim,
- :features => @fps[idx].keys,
- :activity => act
- }
- @sims << sim
- @ids << idx
- end
+ @model.neighbors << {
+ :compound => @cmpds[idx],
+ :similarity => sim,
+ :features => @fps[idx].keys,
+ :activity => @model.activities[@cmpds[idx]][which_row]
+ }
+ @sims << sim
+ @ids << idx
end
end
end
@@ -497,20 +503,27 @@ module OpenTox
# Major BUG! Must loop over @model.compounds, hash is unordered!
# @model.fingerprints.each
+ which_row=@model.compounds.inject({}) {|h,c| h[c]=0; h}
@model.compounds.each { |cmpd|
fp = @model.fingerprints[cmpd]
if @model.activities[cmpd] # row good
- acts = @model.activities[cmpd]; @acts += acts
+ acts = @model.activities[cmpd]; @acts << acts[which_row[cmpd]]
LOGGER.debug "#{acts.size} activities for '#{cmpd}'" if acts.size > 1
- row = []; @model.features.each { |f| row << fp[f] } # nils for non-existent f's
- acts.size.times { # multiple additions for multiple activities
- @n_prop << row.collect
- @cmpds << cmpd
- @fps << Marshal.load(Marshal.dump(fp))
+ row = []; @model.features.each { |f|
+ if fp[f].nil?
+ row << nil
+ else
+ row << fp[f][which_row[cmpd]]
+ end
+ #row << fp[f].nil? ? nil : fp[f][which_row[cmpd]] # nils for non-existent f's
}
+ @n_prop << row.collect
+ @cmpds << cmpd
+ @fps << Marshal.load(Marshal.dump(fp))
else
LOGGER.warn "No activity found for compound '#{cmpd}' in model '#{@model.uri}'"
end
+ which_row[cmpd] += 1
}
@model.features.each { |f| @q_prop << @model.compound_fingerprints[f] } # query structure