Java Main function
[algorithm] / lazar.rb
# Default feature generation service URI, used when a request supplies neither
# a feature_generation_uri nor a feature_dataset_uri: the "fminer/bbrc" path
# below the configured "opentox-algorithm" service.
bbrc_path_segments = [CONFIG[:services]["opentox-algorithm"], "fminer", "bbrc"]
@@feature_generation_default = File.join(*bbrc_path_segments)
2
# Get a representation of the lazar algorithm
# The format is chosen from the Accept header: HTML or YAML when requested,
# RDF/XML in every other case.
# @return [application/rdf+xml] OWL-DL representation of the lazar algorithm
get '/lazar/?' do
  # Descriptions of the parameters accepted when creating a model
  parameter_descriptions = [
    { DC.description => "Dataset URI with the dependent variable", OT.paramScope => "mandatory", DC.title => "dataset_uri" },
    { DC.description => "Feature URI for dependent variable. Optional for datasets with only a single feature.", OT.paramScope => "optional", DC.title => "prediction_feature" },
    { DC.description => "URI of feature generation service. Default: #{@@feature_generation_default}", OT.paramScope => "optional", DC.title => "feature_generation_uri" },
    { DC.description => "URI of feature dataset. If this parameter is set no feature generation algorithm will be called", OT.paramScope => "optional", DC.title => "feature_dataset_uri" },
    { DC.description => "Further parameters for the feature generation service", OT.paramScope => "optional" }
  ]

  lazar_algorithm = OpenTox::Algorithm::Generic.new(url_for('/lazar',:full))
  lazar_algorithm.metadata = {
    DC.title => 'lazar',
    DC.creator => "helma@in-silico.ch, andreas@maunz.de",
    DC.contributor => "vorgrimmlerdavid@gmx.de",
#    BO.instanceOf => "http://opentox.org/ontology/ist-algorithms.owl#lazar",
    OT.parameters => parameter_descriptions
  }

  # Content negotiation on the raw Accept header
  accepted = request.env['HTTP_ACCEPT']
  case accepted
  when %r{text/html}
    content_type "text/html"
    OpenTox.text_to_html lazar_algorithm.to_yaml
  when %r{application/x-yaml}
    content_type "application/x-yaml"
    lazar_algorithm.to_yaml
  else
    response['Content-Type'] = 'application/rdf+xml'
    lazar_algorithm.to_rdfxml
  end
end
32
# Create a lazar prediction model
# The model is built inside an OpenTox task; the request returns at once with the task URI.
# @param [String] dataset_uri Training dataset URI
# @param [optional,String] prediction_feature URI of the feature to be predicted (may be omitted if the dataset has exactly one feature)
# @param [optional,String] feature_generation_uri URI of the feature generation algorithm
# @param [optional,String] feature_dataset_uri URI of an existing feature dataset; if set, no feature generation algorithm is called
# @param [optional,String] prediction_algorithm Algorithm name, stored as "Neighbors.<name>"
# @param [optional,String] nr_hits "true" to use Substructure.match_hits for feature calculation
# @param [optional,String] min_sim Minimum similarity (default 0.3; 0.7 when a feature dataset is given)
# @param [optional,String] min_train_performance Minimum training performance (default 0.1)
# @param [optional,String] pc_type Recorded in the model parameters; auto-detected from the feature dataset if omitted
# @param [optional,String] lib Recorded in the model parameters; auto-detected from the feature dataset if omitted
# @param [optional,String] - further parameters for the feature generation service
# @return [text/uri-list] Task URI
post '/lazar/?' do

  params[:subjectid] = @subjectid
  raise OpenTox::NotFoundError.new("No dataset_uri parameter.") unless params[:dataset_uri]
  dataset_uri = params[:dataset_uri]

  task = OpenTox::Task.create("Create lazar model",url_for('/lazar',:full)) do |task|

    # # # BASIC SETTINGS

    training_dataset = OpenTox::Dataset.new(dataset_uri)
    raise OpenTox::NotFoundError.new("Dataset #{dataset_uri} not found.") unless training_dataset
    training_dataset.load_all(@subjectid)

    # Prediction Feature
    if params[:prediction_feature]
      prediction_feature = OpenTox::Feature.find(params[:prediction_feature], @subjectid)
    else
      # No feature given: only unambiguous if the dataset has exactly one feature
      unless training_dataset.features.size == 1
        raise OpenTox::NotFoundError.new("#{training_dataset.features.size} features in dataset #{dataset_uri}. Please provide a prediction_feature parameter.")
      end
      prediction_feature = OpenTox::Feature.find(training_dataset.features.keys.first, @subjectid)
      params[:prediction_feature] = prediction_feature.uri # pass to feature mining service
    end
    unless training_dataset.features && training_dataset.features.include?(prediction_feature.uri)
      raise OpenTox::NotFoundError.new("No feature #{prediction_feature.uri} in dataset #{params[:dataset_uri]}. (features: #{training_dataset.features.inspect})")
    end

    # Feature Generation URI
    # Falls back to the default service only if no feature dataset was supplied either
    feature_generation_uri = params[:feature_generation_uri]
    feature_generation_uri ||= @@feature_generation_default unless params[:feature_dataset_uri]

    # Create instance
    lazar = OpenTox::Model::Lazar.new

    case prediction_feature.feature_type
    when "classification"
      # Classification: Weighted Majority, Substructure.match
      lazar.value_map = training_dataset.value_map(params[:prediction_feature])
    when "regression"
      # Regression: SVM, Substructure.match_hits
      lazar.feature_calculation_algorithm = "Substructure.match_hits"
      lazar.prediction_algorithm = "Neighbors.local_svm_regression"
    end


    # # # USER VALUES

    # Min Sim
    min_sim = params[:min_sim] ? params[:min_sim].to_f : 0.3

    # Algorithm
    lazar.prediction_algorithm = "Neighbors.#{params[:prediction_algorithm]}" if params[:prediction_algorithm]

    # Nr Hits
    nr_hits = false
    if params[:nr_hits] == "true" || lazar.prediction_algorithm.include?("local_svm")
      lazar.feature_calculation_algorithm = "Substructure.match_hits"
      nr_hits = true
    end
    params[:nr_hits] = "true" if lazar.feature_calculation_algorithm == "Substructure.match_hits" #not sure if this line in needed

    # Propositionalization
    propositionalized = (lazar.prediction_algorithm != "Neighbors.weighted_majority_vote")

    # PC type
    pc_type = params[:pc_type]
    lib = params[:lib]

    # Min train performance
    min_train_performance = params[:min_train_performance] ? params[:min_train_performance].to_f : 0.1


    task.progress 10


    # # # Features

    # Read Features, currently only OT.NumericFeatures
    if params[:feature_dataset_uri]
      lazar.feature_calculation_algorithm = "Substructure.lookup"
      feature_dataset_uri = params[:feature_dataset_uri]
      training_features = OpenTox::Dataset.new(feature_dataset_uri)
      training_feature_types = training_features.feature_types(@subjectid)

      unless training_feature_types.all? { |id, info| info.include?(OT.NumericFeature) } # <- extend this
        raise OpenTox::NotFoundError.new("Found a non-numeric feature in feature dataset")
      end
      if training_dataset.compounds.size < training_feature_types.size
        raise OpenTox::BadRequestError.new("Number of training compounds (#{training_dataset.compounds.size}) smaller than number of non-missing features (#{training_feature_types.size})")
      end

      lazar.similarity_algorithm = "Similarity.cosine"
      min_sim = 0.7 unless params[:min_sim]
      min_sim = (min_sim * 2.0 - 1.0) # transform to cosine range [-1,1]

      # Split each feature description after its last "[" into comma-separated tokens;
      # the first token is read as pc_type and the second as lib.
      training_features_tl = training_features.features.collect do |f, info|
        if info[DC.description]
          info[DC.description].gsub(/.*\[/,"").chop.split(", ")
        else
          [nil, nil]
        end
      end
      training_features_pc_types = training_features_tl.collect { |info| info[0] }.flatten.uniq.compact
      training_features_lib = training_features_tl.collect { |info| info[1] }.flatten.uniq.compact

      if !params[:pc_type] && training_features_pc_types.size > 0
        pc_type = training_features_pc_types.join(',')
        LOGGER.info "pc_type '#{pc_type}' auto-detected from feature dataset"
      end

      if !params[:lib] && training_features_lib.size > 0
        lib = training_features_lib.join(',')
        LOGGER.info "lib '#{lib}' auto-detected from feature dataset"
      end

      raise OpenTox::NotFoundError.new("No pc_type parameter given, and autodetection from feature dataset failed") unless pc_type
      raise OpenTox::NotFoundError.new("No lib parameter given, and autodetection from feature dataset failed") unless lib

    # Create Features
    else
      # params[:subjectid] and prediction_feature are already set above, so they are not recomputed here
      params[:feature_generation_uri] = feature_generation_uri
      if prediction_feature.feature_type == "regression" && feature_generation_uri.match(/fminer/)
        params[:feature_type] ||= "paths"
      end
      feature_dataset_uri = OpenTox::Algorithm::Generic.new(feature_generation_uri).run(params, OpenTox::SubTask.new(task,10,70)).to_s
      training_features = OpenTox::Dataset.new(feature_dataset_uri)
    end


    # # # Write fingerprints

    # Check before use: the nil test must precede load_all to have any effect
    raise OpenTox::NotFoundError.new("Dataset #{feature_dataset_uri} not found.") if training_features.nil?
    training_features.load_all(@subjectid)

    del_master_compounds = []

    # Creating InChi/URI Hash from training_features for comparison with training_dataset to avoid mismatches caused by different URI authorities
    feature_compounds = {}
    training_features.compounds.each do |f_c_uri|
      feature_compounds[OpenTox::Compound.new(f_c_uri).to_inchi] = f_c_uri
    end

    # The calculation algorithm does not change while fingerprints are written
    substructure_features = ["Substructure.match", "Substructure.match_hits"].include?(lazar.feature_calculation_algorithm)
    count_hits = (lazar.feature_calculation_algorithm == "Substructure.match_hits")

    training_dataset.compounds.each do |t_c_uri|

      t_compound = OpenTox::Compound.new(t_c_uri)
      entry = training_features.data_entries[feature_compounds[t_compound.to_inchi]]

      if entry.nil? # Training compound not found in feature dataset
        del_master_compounds << t_c_uri # Delete if training compound not found in feature dataset
      else
        lazar.fingerprints[t_c_uri] ||= {}
        entry.keys.each do |feature|

          # CASE 1: Substructure
          if substructure_features
            feature_info = training_features.features[feature]
            if feature_info
              smarts = feature_info[OT.smarts]
              # Weight by the feature's p-value; with match_hits also by the hit count
              weight = count_hits ? entry[feature].flatten.first : 1
              lazar.fingerprints[t_c_uri][smarts] = weight * feature_info[OT.pValue]
              unless lazar.features.include? smarts
                lazar.features << smarts
                lazar.p_values[smarts] = feature_info[OT.pValue]
                lazar.effects[smarts] = feature_info[OT.effect]
              end
            end

          # CASE 2: Others
          elsif entry[feature].flatten.size == 1
            lazar.fingerprints[t_c_uri][feature] = entry[feature].flatten.first
            lazar.features << feature unless lazar.features.include? feature
          else
            LOGGER.warn "More than one entry (#{entry[feature].inspect}) for compound #{t_c_uri}, feature #{feature}"
          end
        end
      end
    end

    task.progress 80

    # Show compounds without feature information
    del_master_compounds.each do |compound|
      LOGGER.info "Compound: '#{compound.to_s}' not found in feature dataset and will be removed from compound list."
    end

    # # # Compounds
    # Add only compounds with fingerprints. Plain array difference: a block-less
    # collect returns an Enumerator on Ruby >= 1.9, which has no "-".
    lazar.compounds = training_dataset.compounds - del_master_compounds

    # # # Activities
    # Regression stores the raw values, classification the mapped values (not the originals).
    # Any other feature type leaves the activities untouched.
    activity_for = nil
    case prediction_feature.feature_type
    when "regression"
      activity_for = lambda { |value| value }
    when "classification"
      inverted_value_map = lazar.value_map.invert
      activity_for = lambda { |value| inverted_value_map[value] }
    end
    if activity_for
      lazar.compounds.each do |compound|
        entry = training_dataset.data_entries[compound]
        lazar.activities[compound] ||= []
        entry[prediction_feature.uri].each do |value|
          lazar.activities[compound] << activity_for.call(value)
        end
      end
    end
    task.progress 90


    # # # Metadata
    # NOTE(review): URI.decode no longer exists in Ruby 3.0+; this line needs attention before upgrading.
    lazar.metadata[DC.title] = "lazar model for #{URI.decode(File.basename(prediction_feature.uri))}"
    lazar.metadata[OT.dependentVariables] = prediction_feature.uri
    lazar.metadata[OT.trainingDataset] = dataset_uri
    lazar.metadata[OT.featureDataset] = feature_dataset_uri
    case prediction_feature.feature_type
    when "classification"
      lazar.metadata[RDF.type] = [OT.Model, OTA.ClassificationLazySingleTarget]
    when "regression"
      lazar.metadata[RDF.type] = [OT.Model, OTA.RegressionLazySingleTarget]
    end

    lazar.metadata[OT.parameters] = [
      {DC.title => "dataset_uri", OT.paramValue => dataset_uri},
      {DC.title => "prediction_feature", OT.paramValue => prediction_feature.uri},
      {DC.title => "feature_generation_uri", OT.paramValue => feature_generation_uri},
      {DC.title => "propositionalized", OT.paramValue => propositionalized},
      {DC.title => "pc_type", OT.paramValue => pc_type},
      {DC.title => "lib", OT.paramValue => lib},
      {DC.title => "nr_hits", OT.paramValue => nr_hits},
      {DC.title => "min_sim", OT.paramValue => min_sim},
      {DC.title => "min_train_performance", OT.paramValue => min_train_performance}
    ]

    model_uri = lazar.save(@subjectid)
    LOGGER.info model_uri + " created #{Time.now}"
    model_uri

  end
  response['Content-Type'] = 'text/uri-list'
  # Was "ServiceUnavailableError.newtask.uri", which raised NoMethodError instead of the intended error
  raise OpenTox::ServiceUnavailableError.new(task.uri + "\n") if task.status == "Cancelled"
  halt 202,task.uri
end
296