summaryrefslogtreecommitdiff
path: root/test/lazar-physchem-long.rb
blob: 280184d13adaab0bda86996343c539d56a5ee1c4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
require_relative "setup.rb"

# Long-running test for lazar models built on physico-chemical descriptors.
# It computes a descriptor feature dataset for the training data, builds a
# lazar model from it and checks the prediction for a single query compound.
class LazarPhyschemDescriptorTest < MiniTest::Test

  # Selects the descriptors under test, loads the training dataset, runs
  # #build_model_and_predict twice and compares the two predictions.
  def test_lazar_pc_descriptors

    # check available descriptors
    @descriptors = OpenTox::Algorithm::Descriptor::DESCRIPTORS.keys
    assert_equal 111,@descriptors.size,"wrong number of physchem descriptors"
    @descriptor_values = OpenTox::Algorithm::Descriptor::DESCRIPTOR_VALUES

    # select descriptors for test
    @num_features_offset = 0
    @descriptors.keep_if{|x| x=~/^Openbabel\./}
    @descriptors.delete("Openbabel.L5") # TODO Openbabel.L5 does not work, investigate!!!
    unless defined?($short_tests)
      # the actual descriptor calculation is rather fast, computing 3D structures takes time
      # A CDK descriptor can calculate several values, e.g., ALOGP produces ALOGP.ALogP, ALOGP.ALogp2, ALOGP.AMR
      # both forms are accepted (and tested here): Cdk.ALOGP (produces 3 features), or ALOGP.AMR (produces only 1 feature)
      @descriptors += ["Cdk.ALOGP.AMR", "Cdk.WienerNumbers", "Joelib.LogP", "Joelib.count.HeteroCycles"]
      @num_features_offset = 1 # Cdk.WienerNumbers produces 2 (instead of 1) features
    end
    puts "Descriptors: #{@descriptors}"

    # UPLOAD DATA
    @dataset = OpenTox::Dataset.from_csv_file File.join(DATA_DIR,"EPAFHM.medi.csv")
    puts "Dataset: "+@dataset.id

    @compound_smiles = "CC(C)(C)CN"
    @compound_inchi = "InChI=1S/C5H13N/c1-5(2,3)4-6/h4,6H2,1-3H3"

    # NOTE: the flag is currently ignored by build_model_and_predict (the
    # branch that used it is commented out), so both calls take the same path.
    prediction_a = build_model_and_predict(true)
    prediction_b = build_model_and_predict(false)

    p prediction_a.data_entries
    p prediction_b.data_entries

    assert_equal prediction_a,prediction_b,"predicted value differs depending on calculation method"
    puts "Predicted value: #{prediction_a}"
    # the actual value (from the complete EPAFHM dataset) is 5.45, but it is predicted higher when tested
    # do not expect a fixed value, this might vary with, e.g., the calculated 3d structure by OB
    assert prediction_a > 5,"predicted values should be above 5, is #{prediction_a}"
    assert prediction_a < 15,"predicted values should be below 15, is #{prediction_a}"
  end

  # Computes the physchem feature dataset for @dataset, builds a lazar model
  # from it, verifies the feature dataset and predicts @compound_inchi.
  #
  # Reads @dataset, @descriptors, @num_features_offset and @compound_inchi,
  # which are set by the calling test.
  #
  # precompute_feature_dataset:: currently ignored; the code that distinguished
  #                              precomputed features from on-the-fly feature
  #                              generation is commented out below.
  #
  # Returns whatever OpenTox::Model::Lazar#predict returns for the compound.
  def build_model_and_predict(precompute_feature_dataset=true)

    model_params = {:dataset => @dataset}
    #feat_gen_uri = File.join($algorithm[:uri],"descriptor","physchem")

=begin
    if precompute_feature_dataset
      # PRECOMPUTE FEATURES
      p = "/tmp/mergedfile.csv"
      f = File.open(p,"w")
      f.puts File.read(File.join(DATA_DIR,"EPAFHM.medi.csv"))
      f.puts "\"#{@compound_smiles}\","
      f.close
      d = OpenTox::Dataset.from_csv_file p
      descriptors = OpenTox::Algorithm::Descriptor.physchem(d.compounds, @descriptors)
      #model_params[:feature_dataset_uri] = OpenTox::Algorithm::Generic.new(feat_gen_uri).run({:dataset_uri => d.uri, :descriptors => @descriptors})
    else
      model_params[:feature_generation_uri] = feat_gen_uri
      model_params[:descriptors] = @descriptors
    end
=end

    # BUILD MODEL

    #p descriptors
    feature_dataset = OpenTox::Algorithm::Descriptor.physchem(@dataset, @descriptors)
    #feature_dataset = DescriptorDataset.new
    #feature_dataset.compounds = @dataset.compounds
    #feature_dataset.data_entries = descriptors
    #feature_dataset.features = @descriptors.collect{|d| OpenTox::Feature.find_or_create_by(:title => d)}
    feature_dataset.compounds.each do |compound|
      assert_kind_of Compound, compound
    end
    feature_dataset.feature_ids.each do |id|
      assert_kind_of BSON::ObjectId, id
    end
    feature_dataset.data_entries.each do |entry|
      assert_kind_of Array, entry
      #entry.each do |e|
        #p e
      #  assert_kind_of Float, e
      #end
    end
    feature_dataset.save
    model = OpenTox::Model::Lazar.create @dataset, feature_dataset
    #model = OpenTox::Model::Lazar.new model_uri
    #assert_equal model_uri.uri?, true
    #puts "Predicted variable: "+model.predicted_variable

    # CHECK FEATURE DATASET
    #feature_dataset_uri = model.metadata[RDF::OT.featureDataset].first
    #puts "Feature dataset: #{feature_dataset_uri}"
    #feature_dataset = OpenTox::Dataset.new(feature_dataset_uri)
    assert_equal @dataset.compounds.size,feature_dataset.compounds.size,"Incorrect number of compounds in feature dataset"
    features = feature_dataset.features
    feature_titles = features.collect{|f| f.name}
    @descriptors.each do |d|
      not_found = "feature not found #{d} in feature dataset #{feature_titles.inspect}"
      if d =~ /^Cdk\./ && d.count(".") == 1
        # CDK descriptors (e.g. Cdk.ALOGP) are included as Cdk.ALOGP.ALogP, Cdk.ALOGP.ALogp2 ..,
        # so look for the descriptor name inside the feature titles. The name
        # itself has to be matched here: a literal /d/ pattern would accept any
        # title that merely contains the letter "d".
        assert(feature_titles.any? { |title| title.include?(d) }, not_found)
      else
        assert feature_titles.include?(d), not_found
      end
    end
    # Cdk.WienerNumbers returns 2 features
    assert_equal(@descriptors.size + @num_features_offset, features.size, "wrong num features in feature dataset")

    # predict compound; the prediction is the return value of this method
    compound = OpenTox::Compound.from_inchi @compound_inchi
    model.predict compound
    #prediction = OpenTox::Dataset.new prediction_uri
    #assert_equal prediction.uri.uri?, true
    #puts "Prediction "+prediction.uri

    # TODO check prediction
    #assert prediction.features.collect{|f| f.uri}.include?(model.predicted_variable),"prediction feature #{model.predicted_variable} not included prediction dataset #{prediction.features.collect{|f| f.uri}}"
    #assert prediction.compounds.collect{|c| c.uri}.include?(compound_uri),"compound #{compound_uri} not included in prediction dataset #{prediction.compounds.collect{|c| c.uri}}"
    #assert_equal 1,prediction.compound_indices(compound_uri).size,"compound should only be once in the dataset"
    #prediction.data_entry_value(prediction.compound_indices(compound_uri).first,model.predicted_variable)
  end

end