summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--README.md100
-rw-r--r--lib/validation-statistics.rb62
2 files changed, 116 insertions, 46 deletions
diff --git a/README.md b/README.md
index 28ed18f..1f62c36 100644
--- a/README.md
+++ b/README.md
@@ -59,7 +59,75 @@ Execute the following commands either from an interactive Ruby shell or a Ruby s
#### Experiment with other algorithms
- You can pass algorithms parameters to the `Model::Validation.create_from_csv_file` command. The [API documentation](http://rdoc.info/gems/lazar) provides detailed instructions.
+ You can pass algorithm specifications as parameters to the `Model::Validation.create_from_csv_file` and `Model::Lazar.create` commands. Algorithms for descriptors, similarity calculations, feature_selection and local models are specified in the `algorithm` parameter. Unspecified algorithms and parameters are substituted by default values. The example below selects
+
+ - MP2D fingerprint descriptors
+ - Tanimoto similarity with a threshold of 0.1
+ - no feature selection
+ - weighted majority vote predictions
+
+ ```
+algorithms = {
+ :descriptors => { # descriptor algorithm
+ :method => "fingerprint", # fingerprint descriptors
+ :type => "MP2D" # fingerprint type, e.g. FP4, MACCS
+ },
+ :similarity => { # similarity algorithm
+ :method => "Algorithm::Similarity.tanimoto",
+ :min => 0.1 # similarity threshold for neighbors
+ },
+ :feature_selection => nil, # no feature selection
+ :prediction => { # local modelling algorithm
+ :method => "Algorithm::Classification.weighted_majority_vote",
+ },
+}
+
+training_dataset = Dataset.from_csv_file "hamster_carcinogenicity.csv"
+model = Model::Lazar.create training_dataset: training_dataset, algorithms: algorithms
+ ```
+
+ The next example creates a regression model with
+
+ - calculated descriptors from OpenBabel libraries
+ - weighted cosine similarity with a threshold of 0.5
+ - descriptors that are correlated with the endpoint
+ - local partial least squares models from the R caret package
+
+ ```
+algorithms = {
+ :descriptors => { # descriptor algorithm
+ :method => "calculate_properties",
+ :features => PhysChem.openbabel_descriptors,
+ },
+ :similarity => { # similarity algorithm
+ :method => "Algorithm::Similarity.weighted_cosine",
+ :min => 0.5
+ },
+ :feature_selection => { # feature selection algorithm
+ :method => "Algorithm::FeatureSelection.correlation_filter",
+ },
+ :prediction => { # local modelling algorithm
+ :method => "Algorithm::Caret.pls",
+ },
+}
+training_dataset = Dataset.from_csv_file "EPAFHM_log10.csv"
+model = Model::Lazar.create(training_dataset:training_dataset, algorithms:algorithms)
+ ```
+
+Please consult the [API documentation](http://rdoc.info/gems/lazar) and [source code](https://github.com/opentox/lazar) for up-to-date information about implemented algorithms:
+
+- Descriptor algorithms
+ - [Compounds](http://www.rubydoc.info/gems/lazar/OpenTox/Compound)
+ - [Nanoparticles](http://www.rubydoc.info/gems/lazar/OpenTox/Nanoparticle)
+- [Similarity algorithms](http://www.rubydoc.info/gems/lazar/OpenTox/Algorithm/Similarity)
+- [Feature selection algorithms](http://www.rubydoc.info/gems/lazar/OpenTox/Algorithm/FeatureSelection)
+- Local models
+ - [Classification](http://www.rubydoc.info/gems/lazar/OpenTox/Algorithm/Classification)
+ - [Regression](http://www.rubydoc.info/gems/lazar/OpenTox/Algorithm/Regression)
+ - [R caret](http://www.rubydoc.info/gems/lazar/OpenTox/Algorithm/Caret)
+
+
+You can find more working examples in the `lazar` `model-*.rb` and `validation-*.rb` [tests](https://github.com/opentox/lazar/tree/master/test).
### Create and use `lazar` nanoparticle models
@@ -87,7 +155,35 @@ Execute the following commands either from an interactive Ruby shell or a Ruby s
#### Experiment with other datasets, endpoints and algorithms
- You can pass training_dataset, prediction_feature and algorithms parameters to the `Model::Validation.create_from_enanomapper` command. The [API documentation](http://rdoc.info/gems/lazar) provides detailed instructions. Detailed documentation and validation results can be found in this [publication](https://github.com/enanomapper/nano-lazar-paper/blob/master/nano-lazar.pdf).
+ You can pass training_dataset, prediction_feature and algorithms parameters to the `Model::Validation.create_from_enanomapper` command. Procedure and options are the same as for compounds. The following commands create and validate a `nano-lazar` model with
+
+ - measured P-CHEM properties as descriptors
+ - descriptors selected with correlation filter
+ - weighted cosine similarity with a threshold of 0.5
+ - Caret random forests
+
+```
+algorithms = {
+ :descriptors => {
+ :method => "properties",
+ :categories => ["P-CHEM"],
+ },
+ :similarity => {
+ :method => "Algorithm::Similarity.weighted_cosine",
+ :min => 0.5
+ },
+ :feature_selection => {
+ :method => "Algorithm::FeatureSelection.correlation_filter",
+ },
+ :prediction => {
+ :method => "Algorithm::Caret.rf",
+ },
+}
+validation_model = Model::Validation.create_from_enanomapper algorithms: algorithms
+```
+
+
+ Detailed documentation and validation results for nanoparticle models can be found in this [publication](https://github.com/enanomapper/nano-lazar-paper/blob/master/nano-lazar.pdf).
Documentation
-------------
diff --git a/lib/validation-statistics.rb b/lib/validation-statistics.rb
index 553e6ac..2d522ae 100644
--- a/lib/validation-statistics.rb
+++ b/lib/validation-statistics.rb
@@ -179,8 +179,12 @@ module OpenTox
R.assign "prediction", y
R.eval "all = c(measurement,prediction)"
R.eval "range = c(min(all), max(all))"
- title = feature.name
- title += "[#{feature.unit}]" if feature.unit and !feature.unit.blank?
+ if feature.name.match /Net cell association/ # ad hoc fix for awkward units
+ title = "log2(Net cell association [mL/ug(Mg)])"
+ else
+ title = feature.name
+ title += " [#{feature.unit}]" if feature.unit and !feature.unit.blank?
+ end
R.eval "image = qplot(prediction,measurement,main='#{title}',xlab='Prediction',ylab='Measurement',asp=1,xlim=range, ylim=range)"
R.eval "image = image + geom_abline(intercept=0, slope=1)"
R.eval "ggsave(file='#{tmpfile}', plot=image)"
@@ -191,51 +195,21 @@ module OpenTox
$gridfs.find_one(_id: correlation_plot_id).data
end
- # Get predictions with the largest difference between predicted and measured values
- # @params [Fixnum] number of predictions
- # @params [TrueClass,FalseClass,nil] include neighbors
- # @params [TrueClass,FalseClass,nil] show common descriptors
+ # Get predictions with measurements outside of the prediction interval
# @return [Hash]
- def worst_predictions n: 5, show_neigbors: true, show_common_descriptors: false
- worst_predictions = predictions.sort_by{|sid,p| -(p["value"] - p["measurements"].median).abs}[0,n]
- worst_predictions.collect do |p|
- substance = Substance.find(p.first)
- prediction = p[1]
- if show_neigbors
- neighbors = prediction["neighbors"].collect do |n|
- common_descriptors = []
- if show_common_descriptors
- common_descriptors = n["common_descriptors"].collect do |d|
- f=Feature.find(d)
- {
- :id => f.id.to_s,
- :name => "#{f.name} (#{f.conditions})",
- :p_value => d[:p_value],
- :r_squared => d[:r_squared],
- }
- end
- else
- common_descriptors = n["common_descriptors"].size
- end
- {
- :name => Substance.find(n["_id"]).name,
- :id => n["_id"].to_s,
- :common_descriptors => common_descriptors
- }
- end
- else
- neighbors = prediction["neighbors"].size
+ def worst_predictions
+ worst_predictions = predictions.select do |sid,p|
+ p["prediction_interval"] and p["value"] and (p["measurements"].max < p["prediction_interval"][0] or p["measurements"].min > p["prediction_interval"][1])
+ end.compact.to_h
+ worst_predictions.each do |sid,p|
+ p["error"] = (p["value"] - p["measurements"].median).abs
+ if p["measurements"].max < p["prediction_interval"][0]
+ p["distance_prediction_interval"] = (p["measurements"].max - p["prediction_interval"][0]).abs
+ elsif p["measurements"].min > p["prediction_interval"][1]
+ p["distance_prediction_interval"] = (p["measurements"].min - p["prediction_interval"][1]).abs
end
- {
- :id => substance.id.to_s,
- :name => substance.name,
- :feature => Feature.find(prediction["prediction_feature_id"]).name,
- :error => (prediction["value"] - prediction["measurements"].median).abs,
- :prediction => prediction["value"],
- :measurements => prediction["measurements"],
- :neighbors => neighbors
- }
end
+ worst_predictions.sort_by{|sid,p| p["distance_prediction_interval"] }.to_h
end
end
end