summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- .gitignore 1
-rw-r--r-- lib/classification.rb 2
-rw-r--r-- lib/compound.rb 6
-rw-r--r-- lib/dataset.rb 29
-rw-r--r-- lib/model.rb 35
-rw-r--r-- lib/nanoparticle.rb 30
-rw-r--r-- lib/opentox.rb 5
-rw-r--r-- lib/regression.rb 35
-rw-r--r-- lib/substance.rb 1
-rw-r--r-- test/classification.rb 14
-rw-r--r-- test/nanoparticles.rb 23
-rw-r--r-- test/setup.rb 4
12 files changed, 113 insertions, 72 deletions
diff --git a/.gitignore b/.gitignore
index fb51df7..6e0f374 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+R
openbabel
Gemfile.lock
*.gem
diff --git a/lib/classification.rb b/lib/classification.rb
index 0202940..4a17546 100644
--- a/lib/classification.rb
+++ b/lib/classification.rb
@@ -10,7 +10,7 @@ module OpenTox
confidence = 0.0
neighbors.each do |row|
sim = row["tanimoto"]
- row["features"][params[:prediction_feature_id].to_s].each do |act|
+ row["toxicities"][params[:prediction_feature_id].to_s].each do |act|
weighted_sum[act] ||= 0
weighted_sum[act] += sim
end
diff --git a/lib/compound.rb b/lib/compound.rb
index 7895619..55cd482 100644
--- a/lib/compound.rb
+++ b/lib/compound.rb
@@ -17,8 +17,6 @@ module OpenTox
field :sdf_id, type: BSON::ObjectId
field :fingerprints, type: Hash, default: {}
field :default_fingerprint_size, type: Integer
- # TODO separate between physchem, bio and tox
- field :features, type: Hash, default: {}
index({smiles: 1}, {unique: true})
@@ -291,7 +289,7 @@ module OpenTox
candidate_fingerprint = compound.fingerprint params[:type]
sim = (query_fingerprint & candidate_fingerprint).size/(query_fingerprint | candidate_fingerprint).size.to_f
feature_values = training_dataset.values(compound,prediction_feature)
- neighbors << {"_id" => compound.id, "features" => {prediction_feature.id.to_s => feature_values}, "tanimoto" => sim} if sim >= params[:min_sim]
+ neighbors << {"_id" => compound.id, "toxicities" => {prediction_feature.id.to_s => feature_values}, "tanimoto" => sim} if sim >= params[:min_sim]
end
neighbors.sort!{|a,b| b["tanimoto"] <=> a["tanimoto"]}
end
@@ -332,7 +330,7 @@ module OpenTox
'in' => {'$divide' => ['$$common', {'$subtract' => [{'$add' => [default_fingerprint_size, '$default_fingerprint_size']}, '$$common']}]}
}},
'_id' => 1,
- 'features' => 1,
+ 'toxicities' => 1,
'dataset_ids' => 1
}},
{'$match' => {'tanimoto' => {'$gte' => params[:min_sim]}}},
diff --git a/lib/dataset.rb b/lib/dataset.rb
index 25307c9..274c475 100644
--- a/lib/dataset.rb
+++ b/lib/dataset.rb
@@ -13,6 +13,10 @@ module OpenTox
substances.select{|s| s.is_a? Compound}
end
+ def nanoparticles
+ substances.select{|s| s.is_a? Nanoparticle}
+ end
+
# Get all substances
def substances
@substances ||= data_entries.keys.collect{|id| OpenTox::Substance.find id}
@@ -21,7 +25,7 @@ module OpenTox
# Get all features
def features
- @features ||= data_entries.collect{|cid,f| f.first}.flatten.uniq.collect{|id| OpenTox::Feature.find(id)}
+ @features ||= data_entries.collect{|cid,f| f.first}.flatten.uniq.compact.collect{|id| OpenTox::Feature.find(id)}.compact
@features
end
@@ -98,13 +102,22 @@ module OpenTox
# @return [String]
def to_csv(inchi=false)
CSV.generate() do |csv|
- csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name}
+ compound = Substance.find(data_entries.first.first).is_a? Compound
+ if compound
+ csv << [inchi ? "InChI" : "SMILES"] + features.collect{|f| f.name}
+ else
+ csv << ["Name"] + features.collect{|f| f.name}
+ end
data_entries.each do |sid,f|
- substance = Substance.find cid
+ substance = Substance.find sid
features.each do |feature|
- f[feature.id].each do |v|
- csv << [inchi ? substance.inchi : substance.smiles , v]
- end
+ f[feature.id.to_s].each do |v|
+ if compound
+ csv << [inchi ? substance.inchi : substance.smiles , v]
+ else
+ csv << [substance.name , v]
+ end
+ end if f[feature.id.to_s]
end
end
end
@@ -221,8 +234,8 @@ module OpenTox
self.data_entries[compound.id.to_s] ||= {}
self.data_entries[compound.id.to_s][@features[j].id.to_s] ||= []
self.data_entries[compound.id.to_s][@features[j].id.to_s] << v
- compound.features[@features[j].id.to_s] ||= []
- compound.features[@features[j].id.to_s] << v
+ compound.toxicities[@features[j].id.to_s] ||= []
+ compound.toxicities[@features[j].id.to_s] << v
compound.save
end
end
diff --git a/lib/model.rb b/lib/model.rb
index 5140d5a..1960c10 100644
--- a/lib/model.rb
+++ b/lib/model.rb
@@ -36,6 +36,7 @@ module OpenTox
super params
# TODO document convention
+ #p training_dataset.features
prediction_feature = training_dataset.features.first
# set defaults for empty parameters
self.prediction_feature_id ||= prediction_feature.id
@@ -56,12 +57,13 @@ module OpenTox
prediction = {}
if neighbors.collect{|n| n["_id"]}.include? compound.id
- database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["features"][prediction_feature.id.to_s].uniq
+ #TODO restrict to dataset features
+ database_activities = neighbors.select{|n| n["_id"] == compound.id}.first["toxicities"][prediction_feature.id.to_s].uniq
prediction[:database_activities] = database_activities
prediction[:warning] = "#{database_activities.size} compounds have been removed from neighbors, because they have the same structure as the query compound."
neighbors.delete_if{|n| n["_id"] == compound.id}
end
- neighbors.delete_if{|n| n['features'].empty? or n['features'][prediction_feature.id.to_s] == [nil] }
+ neighbors.delete_if{|n| n['toxicities'].empty? or n['toxicities'][prediction_feature.id.to_s] == [nil] }
if neighbors.empty?
prediction.merge!({:value => nil,:confidence => nil,:warning => "Could not find similar compounds with experimental data in the training dataset.",:neighbors => []})
else
@@ -78,12 +80,11 @@ module OpenTox
# parse data
compounds = []
- case object.class.to_s
- when "OpenTox::Compound"
+ if object.is_a? Substance
compounds = [object]
- when "Array"
+ elsif object.is_a? Array
compounds = object
- when "OpenTox::Dataset"
+ elsif object.is_a? Dataset
compounds = object.compounds
else
bad_request_error "Please provide a OpenTox::Compound an Array of OpenTox::Compounds or an OpenTox::Dataset as parameter."
@@ -97,30 +98,26 @@ module OpenTox
end
# serialize result
- case object.class.to_s
- when "OpenTox::Compound"
+ if object.is_a? Substance
prediction = predictions[compounds.first.id.to_s]
prediction[:neighbors].sort!{|a,b| b[1] <=> a[1]} # sort according to similarity
+ return prediction
+ elsif object.is_a? Array
return predictions
- when "Array"
- return predictions
- when "OpenTox::Dataset"
+ elsif object.is_a? Dataset
predictions.each{|cid,p| p.delete(:neighbors)}
# prepare prediction dataset
measurement_feature = Feature.find prediction_feature_id
prediction_feature = NumericFeature.find_or_create_by( "name" => measurement_feature.name + " (Prediction)" )
- prediction_dataset = LazarPrediction.new(
+ prediction_dataset = LazarPrediction.create(
:name => "Lazar prediction for #{prediction_feature.name}",
:creator => __FILE__,
- :prediction_feature_id => prediction_feature.id
-
+ :prediction_feature_id => prediction_feature.id,
+ :predictions => predictions
)
- compounds.each_with_index do |c,i|
- prediction_dataset.predictions[c.id.to_s] = predictions[i]
- end
- prediction_dataset.save
+ #prediction_dataset.save
return prediction_dataset
end
@@ -264,7 +261,7 @@ module OpenTox
training_features = training.collect{|t| t.physchem_descriptors.keys}.flatten.uniq
query_features = nanoparticle.physchem_descriptors.keys
common_features = (training_features & query_features)
- p common_features
+ #p common_features
end
end
diff --git a/lib/nanoparticle.rb b/lib/nanoparticle.rb
index 6e9b0ea..0350363 100644
--- a/lib/nanoparticle.rb
+++ b/lib/nanoparticle.rb
@@ -5,12 +5,10 @@ module OpenTox
field :core, type: String
field :coating, type: Array, default: []
-
- field :toxicities, type: Hash, default: {}
- #field :features, type: Hash, default: {}
field :bundles, type: Array, default: []
- def predict
+ def nanoparticle_neighbors params
+ Dataset.find(params[:training_dataset_id]).nanoparticles
end
def add_feature feature, value
@@ -21,22 +19,32 @@ module OpenTox
toxicities[feature.id.to_s] ||= []
toxicities[feature.id.to_s] << value
else
- $logger.warn "Unknown feature type '#{feature.source}'. Value '#{value}' not inserted."
- warnings << "Unknown feature type '#{feature.source}'. Value '#{value}' not inserted."
+ warn "Unknown feature type '#{feature.source}'. Value '#{value}' not inserted."
end
end
def parse_ambit_value feature, v
+ # TODO: units, mmol/log10 conversion
if v.keys == ["loValue"]
- add_feature feature, v["loValue"]
+ #if v["loValue"].numeric?
+ add_feature feature, v["loValue"]
+ #else
+ #warn "'#{v["loValue"]}' is not a numeric value, entry ignored."
+ #end
elsif v.keys.size == 2 and v["loQualifier"] == "mean"
- add_feature feature, {:mean => v["loValue"]}
+ #add_feature feature, {:mean => v["loValue"]}
+ add_feature feature, v["loValue"]
+ warn "'#{feature.name}' is a mean value. Original data is not available."
elsif v.keys.size == 2 and v["loQualifier"] #== ">="
- add_feature feature, {:min => v["loValue"],:max => Float::INFINITY}
+ #add_feature feature, {:min => v["loValue"],:max => Float::INFINITY}
+ warn "Only min value available for '#{feature.name}', entry ignored"
elsif v.keys.size == 2 and v["upQualifier"] #== ">="
- add_feature feature, {:max => v["upValue"],:min => -Float::INFINITY}
+ #add_feature feature, {:max => v["upValue"],:min => -Float::INFINITY}
+ warn "Only max value available for '#{feature.name}', entry ignored"
elsif v.size == 4 and v["loQualifier"] and v["upQualifier"]
- add_feature feature, {:min => v["loValue"],:max => v["upValue"]}
+ #add_feature feature, {:min => v["loValue"],:max => v["upValue"]}
+ add_feature feature, [v["loValue"],v["upValue"]].mean
+ warn "Using mean value of range #{v["loValue"]} - #{v["upValue"]} for '#{feature.name}'. Original data is not available."
elsif v == {} # do nothing
else
$logger.warn "Cannot parse Ambit eNanoMapper value '#{v}' for feature '#{feature.name}'."
diff --git a/lib/opentox.rb b/lib/opentox.rb
index cc18cc6..7d8a8a2 100644
--- a/lib/opentox.rb
+++ b/lib/opentox.rb
@@ -15,6 +15,11 @@ module OpenTox
field :name, type: String
field :source, type: String
field :warnings, type: Array, default: []
+
+ def warn warning
+ $logger.warn warning
+ warnings << warning
+ end
end
OpenTox.const_set klass,c
end
diff --git a/lib/regression.rb b/lib/regression.rb
index 5021fb3..cb17f25 100644
--- a/lib/regression.rb
+++ b/lib/regression.rb
@@ -9,8 +9,8 @@ module OpenTox
neighbors = params[:neighbors]
neighbors.each do |row|
sim = row["tanimoto"]
- if row["features"][params[:prediction_feature_id].to_s]
- row["features"][params[:prediction_feature_id].to_s].each do |act|
+ if row["toxicities"][params[:prediction_feature_id].to_s]
+ row["toxicities"][params[:prediction_feature_id].to_s].each do |act|
weighted_sum += sim*Math.log10(act)
sim_sum += sim
end
@@ -32,8 +32,8 @@ module OpenTox
neighbors.each_with_index do |row,i|
neighbor = Compound.find row["_id"]
fingerprint = neighbor.fingerprint
- if row["features"][params[:prediction_feature_id].to_s]
- row["features"][params[:prediction_feature_id].to_s].each do |act|
+ if row["toxicities"][params[:prediction_feature_id].to_s]
+ row["toxicities"][params[:prediction_feature_id].to_s].each do |act|
activities << Math.log10(act)
weights << row["tanimoto"]
fingerprint_ids.each_with_index do |id,j|
@@ -79,21 +79,24 @@ module OpenTox
neighbors = params[:neighbors]
return {:value => nil, :confidence => nil, :warning => "No similar compounds in the training data"} unless neighbors.size > 0
- return {:value => neighbors.first["features"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1
+ return {:value => neighbors.first["toxicities"][params[:prediction_feature_id]], :confidence => nil, :warning => "Only one similar compound in the training set"} unless neighbors.size > 1
activities = []
weights = []
physchem = {}
- neighbors.each_with_index do |row,i|
- neighbor = Compound.find row["_id"]
- if row["features"][params[:prediction_feature_id].to_s]
- row["features"][params[:prediction_feature_id].to_s].each do |act|
- activities << Math.log10(act)
- weights << row["tanimoto"] # TODO cosine ?
- neighbor.physchem.each do |pid,v| # insert physchem only if there is an activity
+ neighbors.each_with_index do |n,i|
+ if n["toxicities"][params[:prediction_feature_id].to_s]
+ n["toxicities"][params[:prediction_feature_id].to_s].each do |act|
+ # TODO fix!!!!
+ activities << -Math.log10(act)
+ #if act.numeric?
+ #activities << act
+ n["tanimoto"] ? weights << n["tanimoto"] : weights << 1.0 # TODO cosine ?
+ neighbor = Substance.find(n["_id"])
+ neighbor.physchem_descriptors.each do |pid,v| # insert physchem only if there is an activity
physchem[pid] ||= []
- physchem[pid] << v
+ physchem[pid] += v
end
end
end
@@ -110,8 +113,8 @@ module OpenTox
return result
else
- data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid] }
- prediction = r_model_prediction method, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem[pid]}
+ data_frame = [activities] + physchem.keys.collect { |pid| physchem[pid].collect{|v| "\"#{v.sub('[','').sub(']','')}\"" if v.is_a? String }}
+ prediction = r_model_prediction method, data_frame, physchem.keys, weights, physchem.keys.collect{|pid| compound.physchem_descriptors[pid]}
if prediction.nil?
prediction = local_weighted_average(compound, params)
prediction[:warning] = "Could not create local PLS model. Using weighted average of similar compounds."
@@ -127,6 +130,8 @@ module OpenTox
def self.r_model_prediction method, training_data, training_features, training_weights, query_feature_values
R.assign "weights", training_weights
r_data_frame = "data.frame(#{training_data.collect{|r| "c(#{r.join(',')})"}.join(', ')})"
+ #p r_data_frame
+ File.open("tmp.R","w+"){|f| f.puts "data <- #{r_data_frame}\n"}
R.eval "data <- #{r_data_frame}"
R.assign "features", training_features
R.eval "names(data) <- append(c('activities'),features)" #
diff --git a/lib/substance.rb b/lib/substance.rb
index 6768ce7..82ca65d 100644
--- a/lib/substance.rb
+++ b/lib/substance.rb
@@ -2,6 +2,7 @@ module OpenTox
class Substance
field :physchem_descriptors, type: Hash, default: {}
+ field :toxicities, type: Hash, default: {}
field :dataset_ids, type: Array, default: []
end
diff --git a/test/classification.rb b/test/classification.rb
index af23db6..7412714 100644
--- a/test/classification.rb
+++ b/test/classification.rb
@@ -30,14 +30,14 @@ class LazarClassificationTest < MiniTest::Test
# make a dataset prediction
compound_dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.mini.csv")
- prediction = model.predict compound_dataset
- assert_equal compound_dataset.compounds, prediction.compounds
+ prediction_dataset = model.predict compound_dataset
+ assert_equal compound_dataset.compounds, prediction_dataset.compounds
- cid = prediction.compounds[7].id.to_s
- assert_equal "Could not find similar compounds with experimental data in the training dataset.", prediction.predictions[cid][:warning]
- cid = prediction.compounds[9].id.to_s
- assert_equal "1 compounds have been removed from neighbors, because they have the same structure as the query compound.", prediction.predictions[cid][:warning]
+ cid = prediction_dataset.compounds[7].id.to_s
+ assert_equal "Could not find similar compounds with experimental data in the training dataset.", prediction_dataset.predictions[cid][:warning]
+ cid = prediction_dataset.compounds[9].id.to_s
+ assert_equal "1 compounds have been removed from neighbors, because they have the same structure as the query compound.", prediction_dataset.predictions[cid][:warning]
# cleanup
- [training_dataset,model,compound_dataset].each{|o| o.delete}
+ [training_dataset,model,compound_dataset,prediction_dataset].each{|o| o.delete}
end
end
diff --git a/test/nanoparticles.rb b/test/nanoparticles.rb
index 6f241ec..46073a9 100644
--- a/test/nanoparticles.rb
+++ b/test/nanoparticles.rb
@@ -6,16 +6,29 @@ class NanoparticleTest < MiniTest::Test
dataset_ids = Import::Enanomapper.import
assert_operator Nanoparticle.count , :>, 570, "Only #{Nanoparticle.count} nanoparticles imported"
assert_operator dataset_ids.size, :>, 8, "Only #{dataset_ids.size} bundles imported"
- p dataset_ids.collect{|d| Dataset.find(d).name}
assert dataset_ids.collect{|d| Dataset.find(d).name}.include? ("NanoWiki")
assert dataset_ids.collect{|d| Dataset.find(d).name}.include? ("Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles")
+ p dataset_ids.collect{|d| {d => Dataset.find(d).name}}
+ dataset_ids.collect do |d|
+ d = Dataset.find(d)
+ p d.name
+ puts d.to_csv
+ end
end
- def test_create_model
- Model::NanoLazar.create_all.each do |model|
- np = Nanoparticle.find(model.training_particle_ids.sample)
- model.predict np
+ def test_export
+ Dataset.all.each do |d|
+ puts d.to_csv
end
end
+ def test_create_model
+ training_dataset = Dataset.find_or_create_by(:name => "Protein Corona Fingerprinting Predicts the Cellular Interaction of Gold and Silver Nanoparticles")
+ model = Model::LazarRegression.create(training_dataset, :prediction_algorithm => "OpenTox::Algorithm::Regression.local_physchem_regression", :neighbor_algorithm => "nanoparticle_neighbors")
+ nanoparticle = training_dataset.nanoparticles[-34]
+ prediction = model.predict nanoparticle
+ p prediction
+ refute_nil prediction[:value]
+ end
+
end
diff --git a/test/setup.rb b/test/setup.rb
index e7c32b4..6c97282 100644
--- a/test/setup.rb
+++ b/test/setup.rb
@@ -5,5 +5,5 @@ require_relative '../lib/lazar.rb'
include OpenTox
TEST_DIR ||= File.expand_path(File.dirname(__FILE__))
DATA_DIR ||= File.join(TEST_DIR,"data")
-$mongo.database.drop
-$gridfs = $mongo.database.fs
+#$mongo.database.drop
+#$gridfs = $mongo.database.fs