From bdc6b5b40437896384561d74a510560e9e592364 Mon Sep 17 00:00:00 2001 From: "helma@in-silico.ch" Date: Tue, 9 Oct 2018 18:20:27 +0200 Subject: tentative random forest classification: hangs unpredictably during caret model generation/optimization for some (inorganic?) compounds. --- lib/dataset.rb | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'lib/dataset.rb') diff --git a/lib/dataset.rb b/lib/dataset.rb index b7d9d4e..6ad3215 100644 --- a/lib/dataset.rb +++ b/lib/dataset.rb @@ -71,6 +71,8 @@ module OpenTox # Merge an array of datasets # @param [Array] OpenTox::Dataset Array to be merged + # @param [Hash] feature_map feature modifications + # @param [Hash] value_map value modifications # @return [OpenTox::Dataset] merged dataset def self.merge datasets, feature_map=nil, value_map=nil dataset = self.new(:source => datasets.collect{|d| d.source}.join(", "), :name => datasets.collect{|d| d.name}.uniq.join(", ")) @@ -205,7 +207,7 @@ module OpenTox md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files dataset = self.find_by(:md5 => md5) if dataset - $logger.debug "Skipping import of #{file}, it is already in the database (id: #{dataset.id})." + $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import." else $logger.debug "Parsing #{file}." table = nil @@ -234,10 +236,10 @@ module OpenTox if read_result value = line.chomp if value.numeric? 
- feature = NumericFeature.find_or_create_by(:name => feature_name) + feature = NumericFeature.find_or_create_by(:name => feature_name, :measured => true) value = value.to_f else - feature = NominalFeature.find_or_create_by(:name => feature_name) + feature = NominalFeature.find_or_create_by(:name => feature_name, :measured => true) end features[feature] = value read_result = false @@ -259,7 +261,7 @@ module OpenTox md5 = Digest::MD5.hexdigest(File.read(file)) # use hash to identify identical files dataset = self.find_by(:md5 => md5) if dataset - $logger.debug "Skipping import of #{file}, it is already in the database (id: #{dataset.id})." + $logger.debug "Found #{file} in the database (id: #{dataset.id}, md5: #{dataset.md5}), skipping import." else $logger.debug "Parsing #{file}." table = nil @@ -301,7 +303,7 @@ module OpenTox # guess feature types feature_names.each_with_index do |f,i| - metadata = {:name => f} + metadata = {:name => f, :measured => true} original_id ? j = i+2 : j = i+1 values = table.collect{|row| val=row[j].to_s.strip; val.blank? ? nil : val }.uniq.compact types = values.collect{|v| v.numeric? ? true : false}.uniq @@ -424,7 +426,7 @@ module OpenTox name = File.basename(file,".*") batch = self.find_by(:source => source, :name => name) if batch - $logger.debug "Skipping import of #{file}, it is already in the database (id: #{batch.id})." + $logger.debug "Found #{file} in the database (id: #{batch.id}), skipping import." else $logger.debug "Parsing #{file}." # check delimiter -- cgit v1.2.3