From 5470c03f091ac9c02bcb33d74ecfe9b672721899 Mon Sep 17 00:00:00 2001 From: Andreas Maunz Date: Wed, 19 Sep 2012 15:58:07 +0200 Subject: Switched CDK from Ambit to rjb --- lib/model.rb | 4 +- lib/utils.rb | 145 ++++++++++++++++++++++++++++++++--------------------------- 2 files changed, 80 insertions(+), 69 deletions(-) diff --git a/lib/model.rb b/lib/model.rb index 285ab3d..77b0274 100644 --- a/lib/model.rb +++ b/lib/model.rb @@ -221,8 +221,8 @@ module OpenTox waiting_task.progress( count/d.compounds.size.to_f*100.0 ) if waiting_task rescue => e LOGGER.warn "prediction for compound "+compound_uri.to_s+" failed: "+e.message+" subjectid: #{subjectid}" - #LOGGER.debug "#{e.class}: #{e.message}" - #LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" + LOGGER.debug "#{e.class}: #{e.message}" + LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" end end diff --git a/lib/utils.rb b/lib/utils.rb index 149208b..4a04c58 100644 --- a/lib/utils.rb +++ b/lib/utils.rb @@ -41,17 +41,25 @@ module OpenTox # # # joelib (via rjb) if !params[:lib] || params[:lib].split(",").include?("joelib") - jl_master, jl_ids = get_jl_descriptors( { :compounds => compounds, :rjb => params[:rjb], :pc_type => params[:pc_type], :descriptor => params[:descriptor] } ) + jl_master, jl_ids = get_jl_descriptors( { :compounds => compounds, :rjb => params[:rjb][:jl], :pc_type => params[:pc_type], :descriptor => params[:descriptor] } ) params[:task].progress(params[:task].metadata[OT.percentageCompleted] + task_weights["joelib"]) if params[:task] end - # # # cdk (via REST) + # # # cdk (via rjb) if !params[:lib] || params[:lib].split(",").include?("cdk") - ambit_result_uri, smiles_to_inchi, cdk_ids = get_cdk_descriptors( { :compounds => compounds, :pc_type => params[:pc_type], :task => params[:task], :step => task_weights["cdk"], :descriptor => params[:descriptor] } ) - #LOGGER.debug "Ambit result uri for #{params.inspect}: '#{ambit_result_uri.to_yaml}'" - cdk_master, cdk_ids, ambit_ids = load_ds_csv(ambit_result_uri, smiles_to_inchi, cdk_ids ) - params[:task].progress(params[:task].metadata[OT.percentageCompleted] + task_weights["load"]) if params[:task] + cdk_master, cdk_ids = get_cdk_descriptors( { :compounds => compounds, :rjb => params[:rjb][:cdk], :pc_type => params[:pc_type], :descriptor => params[:descriptor] } ) + cdk_ids = cdk_master[0].collect { |id| id.to_s.split("-").first } # get column headers + cdk_single_ids = cdk_master[0].collect { |id| id.to_s.sub(/[^-]*-/,"").gsub(/[\/.\\\(\)\{\}\[\]]/,"_") } # get column headers w/ nice '_' + cdk_master[0] = cdk_single_ids.collect{|x| x} # Single IDs as features in result ds + cdk_ids.shift # remove SMILES + cdk_single_ids.shift # remove SMILES + params[:task].progress(params[:task].metadata[OT.percentageCompleted] + task_weights["cdk"]) if params[:task] + + #ambit_result_uri, smiles_to_inchi, cdk_ids = get_cdk_descriptors( { :compounds => compounds, :pc_type => params[:pc_type], :task => params[:task], :step => task_weights["cdk"], :descriptor => params[:descriptor] } ) + ##LOGGER.debug "Ambit result uri for #{params.inspect}: '#{ambit_result_uri.to_yaml}'" + #cdk_master, cdk_ids, ambit_ids = load_ds_csv(ambit_result_uri, smiles_to_inchi, cdk_ids ) + #params[:task].progress(params[:task].metadata[OT.percentageCompleted] + task_weights["load"]) if params[:task] end # # # fuse CSVs ("master" structures) @@ -94,10 +102,12 @@ module OpenTox # # # add feature metadata pc_descriptors = YAML::load_file(@keysfile) - ambit_ids && ambit_ids.each_with_index { |id,idx| + cdk_single_ids && cdk_single_ids.each_with_index { |id,idx| raise "Feature not found" if ! ds.features[File.join(ds.uri, "feature", id.to_s)] ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{DC.description => "#{pc_descriptors[cdk_ids[idx]][:name]} [#{pc_descriptors[cdk_ids[idx]][:pc_type]}, #{pc_descriptors[cdk_ids[idx]][:lib]}]"}) - ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{DC.creator => @ambit_descriptor_algorithm_uri + cdk_ids[idx]}) + creator_uri = ds.uri.gsub(/\/dataset\/.*/, "/algorithm/pc") + creator_uri += "/#{id}" if params[:add_uri] + ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{DC.creator => creator_uri}) ds.add_feature_metadata(File.join(ds.uri, "feature", id.to_s),{OT.hasSource => params[:dataset_uri]}) } ob_ids && ob_ids.each { |id| @@ -283,73 +293,74 @@ module OpenTox end - # Calculate CDK physico-chemical descriptors via Ambit -- DO NOT OVERLOAD Ambit. - # @param[Hash] required: :compounds, :pc_type, :task, :step optional: :descriptor - # @return[Array] array of Ambit result uri, piecewise (1st: base, 2nd: SMILES, 3rd+: features, hash smiles to inchi, array of field descriptions + def self.get_cdk_descriptors(params) - ambit_result_uri = [] # 1st pos: base uri, then features - smiles_to_inchi = {} - task_weights = {"electronic"=> 4, "topological"=> 19, "constitutional"=> 12, "geometrical"=> 3, "hybrid"=> 2, "cpsa"=> 1 } - task_weights.keys.each { |pc_type| task_weights.delete(pc_type) if (params[:pc_type] && (!params[:pc_type].split(",").include?(pc_type)))} - task_sum = Float task_weights.values.sum - task_weights.keys.each { |pc_type| task_weights[pc_type] /= task_sum } - task_weights.keys.each { |pc_type| task_weights[pc_type] *= params[:step] } - - - # extract wanted descriptors from config file and parameters - pc_descriptors = YAML::load_file(@keysfile) - - ids = pc_descriptors.collect { |id, info| - "#{info[:pc_type]}:::#{id}" if info[:lib] == "cdk" && params[:pc_type].split(",").include?(info[:pc_type]) && (!params[:descriptor] || id == params[:descriptor]) - }.compact + master = nil + t = params[:rjb]; raise "No Java environment" unless t - if ids.size > 0 - ids.sort! - ids.collect! { |id| id.split(":::").last } + # Load keys, enter CSV headers + begin + pc_descriptors = YAML::load_file(@keysfile) + ids = pc_descriptors.collect{ |id, info| + id if info[:lib] == "cdk" && params[:pc_type].split(",").include?(info[:pc_type]) && (!params[:descriptor] || id == params[:descriptor]) + }.compact - # create dataset at Ambit - begin - params[:compounds].each do |n| - cmpd = OpenTox::Compound.new(n) - smiles_string = cmpd.to_smiles - smiles_to_inchi[smiles_string] = URI.encode_www_form_component(cmpd.to_inchi) + if ids.length > 0 + # remember inchis + inchis = params[:compounds].collect { |c_uri| + cmpd = OpenTox::Compound.new(c_uri) + URI.encode_www_form_component(cmpd.to_inchi) + } + begin + # Process compounds + sdf_data = [] + params[:compounds].each_with_index { |c_uri, c_idx| + cmpd = OpenTox::Compound.new(c_uri) + LOGGER.debug "3D for #{cmpd.to_smiles}" + obconv = OpenBabel::OBConversion.new + obmol = OpenBabel::OBMol.new + obconv.set_in_format("smi") + obconv.read_string(obmol, cmpd.to_smiles) + obconv.set_out_format("sdf") + sdf_string = obconv.write_string(obmol) + gen3d = OpenBabel::OBOp.find_type("Gen3D") + gen3d.do(obmol) + sdf_string_3d = obconv.write_string(obmol) + if sdf_string_3d.index(/.nan/).nil? + sdf_data << sdf_string_3d + else + sdf_data << sdf_string + LOGGER.debug "3D failed (using 2D)" + end + } + infile = Tempfile.open(['jl_descriptors-in-','.sdf']) + csvfile = infile.path.gsub(/jl_descriptors-in/,"jl_descriptors-out").gsub(/\.sdf/,".csv") + infile.puts sdf_data.join("") + infile.flush + t.new(infile.path, csvfile, ids.join(',')) # runs cdk + rescue Exception => e + LOGGER.debug "#{e.class}: #{e.message}" + LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" + ensure + infile.close! end - smi_file = Tempfile.open(['pc_ambit', '.csv']) ; smi_file.puts( "SMILES\n" + smiles_to_inchi.keys.join("\n") ) ; smi_file.flush - ambit_ds_uri = OpenTox::RestClientWrapper.post(@ambit_ds_service_uri, {:file => File.new(smi_file.path)}, {:content_type => "multipart/form-data", :accept => "text/uri-list"} ) - ambit_result_uri = [ ambit_ds_uri + "?" ] # 1st pos: base uri, then features - rescue Exception => e - LOGGER.debug "#{e.class}: #{e.message}" - LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" - ensure - smi_file.close! if smi_file - end - # get SMILES feature URI - ambit_smiles_uri = OpenTox::RestClientWrapper.get( - ambit_ds_uri + "/features", - {:accept=> "text/uri-list"} - ).chomp - ambit_result_uri << ("feature_uris[]=" + URI.encode_www_form_component(ambit_smiles_uri) + "&") - # always calculate 3D (http://goo.gl/Tk81j), then get results - OpenTox::RestClientWrapper.post( - @ambit_mopac_model_uri, - {:dataset_uri => ambit_ds_uri}, - {:accept => "text/uri-list"} - ) - current_cat = "" - ids.each_with_index do |id, i| - old_cat = current_cat; current_cat = pc_descriptors[id][:pc_type] - params[:task].progress(params[:task].metadata[OT.percentageCompleted] + task_weights[old_cat]) if params[:task] && old_cat != current_cat && old_cat != "" - algorithm = Algorithm::Generic.new(@ambit_descriptor_algorithm_uri+id) - result_uri = algorithm.run({:dataset_uri => ambit_ds_uri}) - ambit_result_uri << result_uri.split("?")[1] + "&" - LOGGER.debug "Ambit (#{ids.size}): #{i+1}" + master = CSV::parse(File.open(csvfile, "rb").read) + master.each_with_index { |row, idx| + if idx != 0 # not alter headers + row[0] = inchis[idx-1] + row.collect! { |x| x.to_s == "null" ? nil : x } + end + } end - params[:task].progress(params[:task].metadata[OT.percentageCompleted] + task_weights[current_cat]) if params[:task] - #LOGGER.debug "Ambit result: #{ambit_result_uri.join('')}" + rescue Exception => e + LOGGER.debug "#{e.class}: #{e.message}" + LOGGER.debug "Backtrace:\n\t#{e.backtrace.join("\n\t")}" + ensure + [ csvfile ].each { |f| File.delete(f) } end - [ ambit_result_uri, smiles_to_inchi, ids ] + [ master, ids ] end -- cgit v1.2.3