summary refs log tree commit diff
path: root/lib
diff options
context:
space:
mode:
author Christoph Helma <helma@in-silico.ch> 2015-10-07 12:34:02 +0200
committer Christoph Helma <helma@in-silico.ch> 2015-10-07 12:34:02 +0200
commit 8d2f1c8a0f6cc9f7a481d1117bf8b3351130b1ea (patch)
tree e02208fee7d7548270e23aa37c7505691f3c5cde /lib
parent be95000d7c14174286ddc8f1717c4b6c46e0c1cc (diff)
generalised fingerprints
Diffstat (limited to 'lib')
-rw-r--r-- lib/compound.rb 171
-rw-r--r-- lib/crossvalidation.rb 5
-rw-r--r-- lib/dataset.rb 4
-rw-r--r-- lib/experiment.rb 5
-rw-r--r-- lib/feature.rb 2
-rw-r--r-- lib/model.rb 42
6 files changed, 132 insertions, 97 deletions
diff --git a/lib/compound.rb b/lib/compound.rb
index d3df125..7a3dc5c 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -9,6 +9,8 @@ module OpenTox
class Compound
include OpenTox
+ DEFAULT_FINGERPRINT = "MP2D"
+
field :inchi, type: String
field :smiles, type: String
field :inchikey, type: String
@@ -19,77 +21,64 @@ module OpenTox
field :png_id, type: BSON::ObjectId
field :svg_id, type: BSON::ObjectId
field :sdf_id, type: BSON::ObjectId
- field :fp2, type: Array
- field :fp3, type: Array
- field :fp4, type: Array
- field :fp4_size, type: Integer
- field :maccs, type: Array
+ field :fingerprints, type: Hash, default: {}
+ field :default_fingerprint_size, type: Integer
index({smiles: 1}, {unique: true})
# Overwrites standard Mongoid method to create fingerprints before database insertion
def self.find_or_create_by params
compound = self.find_or_initialize_by params
- unless compound.fp4 and !compound.fp4.empty?
- compound.fp4_size = 0
- compound.fp4 = []
- fingerprint = FingerprintSmarts.fingerprint
- Algorithm::Descriptor.smarts_match(compound, fingerprint).each_with_index do |m,i|
- if m > 0
- compound.fp4 << fingerprint[i].id
- compound.fp4_size += 1
- end
- end
- end
+ compound.default_fingerprint_size = compound.fingerprint(DEFAULT_FINGERPRINT)
compound.save
compound
end
-
- #http://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format
- def mpd
- smarts = obconversion(smiles,"smi","mpd").strip.split("\t")
- smarts.shift # remove Title
- smarts
-
- end
-
- #http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html
- def mna level=2
- smarts = obconversion(smiles,"smi","mna","xL\"#{level}\"").split("\n")
- smarts.shift # remove Title
- smarts
- end
- def openbabel_fingerprint type="FP2"
- unless self.send(type.downcase.to_sym) # stored fingerprint
- fp = OpenBabel::OBFingerprint.find_fingerprint(type)
- obmol = OpenBabel::OBMol.new
- obconversion = OpenBabel::OBConversion.new
- obconversion.set_in_format "smi"
- obconversion.read_string obmol, smiles
- result = OpenBabel::VectorUnsignedInt.new
- fp.get_fingerprint(obmol,result)
- # TODO: %ignore *::DescribeBits @ line 163 openbabel/scripts/openbabel-ruby.i
- #p OpenBabel::OBFingerprint.describe_bits(result)
- # convert result to a list of the bits that are set
- # from openbabel/scripts/python/pybel.py line 830
- # see also http://openbabel.org/docs/dev/UseTheLibrary/Python_Pybel.html#fingerprints
- result = result.to_a
- bitsperint = OpenBabel::OBFingerprint.getbitsperint()
- bits_set = []
- start = 1
- result.each do |x|
- i = start
- while x > 0 do
- bits_set << i if (x % 2) == 1
- x >>= 1
- i += 1
+ def fingerprint type="MP2D"
+ unless fingerprints[type]
+ return [] unless self.smiles
+ #http://openbabel.org/docs/dev/FileFormats/MolPrint2D_format.html#molprint2d-format
+ if type == "MP2D"
+ fp = obconversion(smiles,"smi","mpd").strip.split("\t")
+ name = fp.shift # remove Title
+ fingerprints[type] = fp
+ #http://openbabel.org/docs/dev/FileFormats/Multilevel_Neighborhoods_of_Atoms_(MNA).html
+ elsif type== "MNA"
+ level = 2 # TODO: level as parameter, evaluate level 1, see paper
+ fp = obconversion(smiles,"smi","mna","xL\"#{level}\"").split("\n")
+ fp.shift # remove Title
+ fingerprints[type] = fp
+ else # standard fingerprints
+ fp = OpenBabel::OBFingerprint.find_fingerprint(type)
+ obmol = OpenBabel::OBMol.new
+ obconversion = OpenBabel::OBConversion.new
+ obconversion.set_in_format "smi"
+ obconversion.read_string obmol, self.smiles
+ result = OpenBabel::VectorUnsignedInt.new
+ fp.get_fingerprint(obmol,result)
+ # TODO: %ignore *::DescribeBits @ line 163 openbabel/scripts/openbabel-ruby.i
+ #p OpenBabel::OBFingerprint.describe_bits(result)
+ # convert result to a list of the bits that are set
+ # from openbabel/scripts/python/pybel.py line 830
+ # see also http://openbabel.org/docs/dev/UseTheLibrary/Python_Pybel.html#fingerprints
+ result = result.to_a
+ bitsperint = OpenBabel::OBFingerprint.getbitsperint()
+ bits_set = []
+ start = 1
+ result.each do |x|
+ i = start
+ while x > 0 do
+ bits_set << i if (x % 2) == 1
+ x >>= 1
+ i += 1
+ end
+ start += bitsperint
end
- start += bitsperint
+ fingerprints[type] = bits_set
end
- update_attribute type.downcase.to_sym, bits_set
+ save
end
- self.send(type.downcase.to_sym)
+ fingerprints[type]
end
# Create a compound from smiles string
@@ -100,7 +89,8 @@ module OpenTox
def self.from_smiles smiles
smiles = obconversion(smiles,"smi","can")
if smiles.empty?
- Compound.find_or_create_by(:warning => "SMILES parsing failed for '#{smiles}', this may be caused by an incorrect SMILES string.")
+ return nil
+ #Compound.find_or_create_by(:warning => "SMILES parsing failed for '#{smiles}', this may be caused by an incorrect SMILES string.")
else
Compound.find_or_create_by :smiles => obconversion(smiles,"smi","can")
end
@@ -146,7 +136,7 @@ module OpenTox
result = obconversion(smiles,"smi","inchi")
#result = `echo "#{self.smiles}" | "#{File.join(File.dirname(__FILE__),"..","openbabel","bin","babel")}" -ismi - -oinchi`.chomp
- update(:inchi => result.chomp) unless result.empty?
+ update(:inchi => result.chomp) if result and !result.empty?
end
self["inchi"]
end
@@ -227,20 +217,47 @@ module OpenTox
self["chemblid"]
end
- def fingerprint_neighbors params
- bad_request_error "Incorrect parameters '#{params}' for Compound#fingerprint_neighbors. Please provide :type, :training_dataset_id, :min_sim." unless params[:type] and params[:training_dataset_id] and params[:min_sim]
+ def fingerprint_count_neighbors params
+ # TODO fix
neighbors = []
- query_fingerprint = self.openbabel_fingerprint params[:type]
+ query_fingerprint = self.fingerprint params[:type]
training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound|
unless self == compound
- fingerprint = compound.openbabel_fingerprint params[:type]
- sim = (query_fingerprint & fingerprint).size/(query_fingerprint | fingerprint).size.to_f
- neighbors << [compound.id, sim] if sim >= params[:min_sim]
+ candidate_fingerprint = compound.fingerprint params[:type]
+ features = (query_fingerprint + candidate_fingerprint).uniq
+ min_sum = 0
+ max_sum = 0
+ features.each do |f|
+ min,max = [query_fingerprint.count(f),candidate_fingerprint.count(f)].minmax
+ min_sum += min
+ max_sum += max
+ end
+ max_sum == 0 ? sim = 0 : sim = min_sum/max_sum.to_f
+ neighbors << [compound.id, sim] if sim and sim >= params[:min_sim]
end
end
neighbors.sort{|a,b| b.last <=> a.last}
end
+ def fingerprint_neighbors params
+ bad_request_error "Incorrect parameters '#{params}' for Compound#fingerprint_neighbors. Please provide :type, :training_dataset_id, :min_sim." unless params[:type] and params[:training_dataset_id] and params[:min_sim]
+ neighbors = []
+ #if params[:type] == DEFAULT_FINGERPRINT
+ #neighbors = db_neighbors params
+ #p neighbors
+ #else
+ query_fingerprint = self.fingerprint params[:type]
+ training_dataset = Dataset.find(params[:training_dataset_id]).compounds.each do |compound|
+ unless self == compound
+ candidate_fingerprint = compound.fingerprint params[:type]
+ sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f
+ neighbors << [compound.id, sim] if sim >= params[:min_sim]
+ end
+ end
+ #end
+ neighbors.sort{|a,b| b.last <=> a.last}
+ end
+
def fminer_neighbors params
bad_request_error "Incorrect parameters for Compound#fminer_neighbors. Please provide :feature_dataset_id, :min_sim." unless params[:feature_dataset_id] and params[:min_sim]
feature_dataset = Dataset.find params[:feature_dataset_id]
@@ -248,8 +265,8 @@ module OpenTox
neighbors = []
# find neighbors
- feature_dataset.data_entries.each_with_index do |fingerprint, i|
- sim = Algorithm::Similarity.tanimoto fingerprint, query_fingerprint
+ feature_dataset.data_entries.each_with_index do |candidate_fingerprint, i|
+ sim = Algorithm::Similarity.tanimoto candidate_fingerprint, query_fingerprint
if sim >= params[:min_sim]
neighbors << [feature_dataset.compound_ids[i],sim] # use compound_ids, instantiation of Compounds is too time consuming
end
@@ -261,10 +278,10 @@ module OpenTox
feature_dataset = Dataset.find params[:feature_dataset_id]
query_fingerprint = Algorithm.run params[:feature_calculation_algorithm], self, params[:descriptors]
neighbors = []
- feature_dataset.data_entries.each_with_index do |fingerprint, i|
+ feature_dataset.data_entries.each_with_index do |candidate_fingerprint, i|
# TODO implement pearson and cosine similarity separatly
R.assign "x", query_fingerprint
- R.assign "y", fingerprint
+ R.assign "y", candidate_fingerprint
# pearson r
#sim = R.eval("cor(x,y,use='complete.obs',method='pearson')").to_ruby
#p "pearson"
@@ -279,10 +296,12 @@ module OpenTox
neighbors
end
- def neighbors threshold=0.7
+ def db_neighbors params
+ p "DB NEIGHBORS"
+ p params
# TODO restrict to dataset
# from http://blog.matt-swain.com/post/87093745652/chemical-similarity-search-in-mongodb
- qn = fp4.size
+ qn = fingerprint(params[:type]).size
#qmin = qn * threshold
#qmax = qn / threshold
#not sure if it is worth the effort of keeping feature counts up to date (compound deletions, additions, ...)
@@ -292,12 +311,12 @@ module OpenTox
{'$match' => {'_id' => {'$ne' => self.id}}}, # remove self
{'$project' => {
'tanimoto' => {'$let' => {
- 'vars' => {'common' => {'$size' => {'$setIntersection' => ['$fp4', fp4]}}},
- 'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [qn, '$fp4_size']}, '$$common']}]}
+ 'vars' => {'common' => {'$size' => {'$setIntersection' => ["'$#{DEFAULT_FINGERPRINT}'", DEFAULT_FINGERPRINT]}}},
+ 'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [qn, '$default_fingerprint_size']}, '$$common']}]}
}},
'_id' => 1
}},
- {'$match' => {'tanimoto' => {'$gte' => threshold}}},
+ {'$match' => {'tanimoto' => {'$gte' => params[:min_sim]}}},
{'$sort' => {'tanimoto' => -1}}
]
@@ -312,12 +331,12 @@ module OpenTox
obconversion.set_options(option, OpenBabel::OBConversion::OUTOPTIONS) if option
obmol = OpenBabel::OBMol.new
obconversion.set_in_and_out_formats input_format, output_format
+ return nil if identifier.nil?
obconversion.read_string obmol, identifier
case output_format
when /smi|can|inchi/
obconversion.write_string(obmol).gsub(/\s/,'').chomp
when /sdf/
-p "SDF conversion"
# TODO: find disconnected structures
# strip_salts
# separate
diff --git a/lib/crossvalidation.rb b/lib/crossvalidation.rb
index 4c80344..6dc8d7f 100644
--- a/lib/crossvalidation.rb
+++ b/lib/crossvalidation.rb
@@ -54,6 +54,7 @@ module OpenTox
nr_unpredicted: nr_unpredicted,
predictions: predictions
)
+ $logger.debug "Nr unpredicted: #{nr_unpredicted}"
cv.statistics
cv
end
@@ -122,6 +123,7 @@ module OpenTox
predictivity: predictivity,
finished_at: Time.now
)
+ $logger.debug "Accuracy #{accuracy}"
end
#Average area under roc 0.646
@@ -192,6 +194,9 @@ module OpenTox
r_squared: r**2,
finished_at: Time.now
)
+ $logger.debug "R^2 #{r**2}"
+ $logger.debug "RMSE #{rmse}"
+ $logger.debug "MAE #{mae}"
end
def misclassifications n=nil
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 7c8ab44..60f3bb5 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -266,8 +266,8 @@ module OpenTox
end
compounds.duplicates.each do |compound|
positions = []
- compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi == compound.inchi}
- warnings << "Duplicate compound #{compound.inchi} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
+ compounds.each_with_index{|c,i| positions << i+1 if !c.blank? and c.inchi and c.inchi == compound.inchi}
+ warnings << "Duplicate compound #{compound.smiles} at rows #{positions.join(', ')}. Entries are accepted, assuming that measurements come from independent experiments."
end
$logger.debug "Value parsing: #{Time.now-time} (Compound creation: #{compound_time})"
diff --git a/lib/experiment.rb b/lib/experiment.rb
index 6910139..0dfdf86 100644
--- a/lib/experiment.rb
+++ b/lib/experiment.rb
@@ -34,6 +34,7 @@ module OpenTox
report[:results][dataset_name] = {}
report[:results][dataset_name][:anova] = {}
report[:results][dataset_name][:data] = []
+ # TODO results[dataset_id.to_s] does not exist
results[dataset_id.to_s].each do |result|
model = Model::Lazar.find(result[:model_id])
repeated_cv = RepeatedCrossValidation.find(result[:repeated_crossvalidation_id])
@@ -67,6 +68,7 @@ module OpenTox
outcome << p
end
end
+ begin
R.assign "experiment_nr",experiments.collect{|i| "Experiment #{i}"}
R.eval "experiment_nr = factor(experiment_nr)"
R.assign "outcome", outcome
@@ -78,6 +80,9 @@ module OpenTox
# aequivalent
# sum = R.eval("summary(fit)")
#p_value = sum.to_ruby.first.last.first
+ rescue
+ p_value = nil
+ end
report[:results][dataset][:anova][param] = p_value
=begin
=end
diff --git a/lib/feature.rb b/lib/feature.rb
index 6fc2c06..13fa6d1 100644
--- a/lib/feature.rb
+++ b/lib/feature.rb
@@ -47,6 +47,7 @@ module OpenTox
class FingerprintSmarts < Smarts
field :count, type: Integer
def self.fingerprint
+=begin
@@fp4 ||= OpenTox::FingerprintSmarts.all
unless @@fp4.size == 306
@@fp4 = []
@@ -72,6 +73,7 @@ module OpenTox
end
end
@@fp4
+=end
end
end
diff --git a/lib/model.rb b/lib/model.rb
index 817a61e..cd88e0c 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -82,7 +82,6 @@ module OpenTox
end
neighbors = compound.send(neighbor_algorithm, neighbor_algorithm_parameters)
- #neighbors = Algorithm.run(neighbor_algorithm, compound, neighbor_algorithm_parameters)
# add activities
# TODO: improve efficiency, takes 3 times longer than previous version
neighbors.collect! do |n|
@@ -145,12 +144,12 @@ module OpenTox
def self.create training_dataset, params={}
model = self.new training_dataset, params
model.prediction_algorithm = "OpenTox::Algorithm::Classification.weighted_majority_vote" unless model.prediction_algorithm
- model.neighbor_algorithm |= "fingerprint_neighbors"
+ model.neighbor_algorithm ||= "fingerprint_neighbors"
model.neighbor_algorithm_parameters ||= {}
{
- :type => "FP4",
+ :type => "MP2D",
:training_dataset_id => training_dataset.id,
- :min_sim => 0.7
+ :min_sim => 0.1
}.each do |key,value|
model.neighbor_algorithm_parameters[key] ||= value
end
@@ -163,16 +162,19 @@ module OpenTox
def self.create training_dataset, params={}
model = self.new training_dataset, params
- #model.neighbor_algorithm ||= "fingerprint_neighbors"
- #model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.weighted_average"
- #model.neighbor_algorithm_parameters ||= {}
- #{
+ model.neighbor_algorithm ||= "fingerprint_neighbors"
+ model.prediction_algorithm ||= "OpenTox::Algorithm::Regression.weighted_average"
+ model.neighbor_algorithm_parameters ||= {}
+ {
+ :type => "MP2D",
+ :training_dataset_id => training_dataset.id,
+ :min_sim => 0.1
#:type => "FP4",
#:training_dataset_id => training_dataset.id,
#:min_sim => 0.7
- #}.each do |key,value|
- #model.neighbor_algorithm_parameters[key] ||= value
- #end
+ }.each do |key,value|
+ model.neighbor_algorithm_parameters[key] ||= value
+ end
model.save
model
end
@@ -209,7 +211,7 @@ module OpenTox
field :source, type: String
field :unit, type: String
field :model_id, type: BSON::ObjectId
- field :crossvalidation_id, type: BSON::ObjectId
+ field :repeated_crossvalidation_id, type: BSON::ObjectId
def predict object
Lazar.find(model_id).predict object
@@ -223,8 +225,12 @@ module OpenTox
Lazar.find model_id
end
- def crossvalidation
- CrossValidation.find crossvalidation_id
+ def repeated_crossvalidation
+ RepeatedCrossValidation.find repeated_crossvalidation_id
+ end
+
+ def crossvalidations
+ repeated_crossvalidation.crossvalidations
end
def regression?
@@ -241,16 +247,14 @@ module OpenTox
prediction_model = self.new JSON.parse(File.read(metadata_file))
training_dataset = Dataset.from_csv_file file
model = nil
- cv = nil
if training_dataset.features.first.nominal?
- model = LazarFminerClassification.create training_dataset
- cv = ClassificationCrossValidation.create model
+ #model = LazarFminerClassification.create training_dataset
+ model = LazarClassification.create training_dataset
elsif training_dataset.features.first.numeric?
model = LazarRegression.create training_dataset
- cv = RegressionCrossValidation.create model
end
prediction_model[:model_id] = model.id
- prediction_model[:crossvalidation_id] = cv.id
+ prediction_model[:repeated_crossvalidation_id] = RepeatedCrossValidation.create(model).id
prediction_model.save
prediction_model
end