From 87eb7cc1e079821c2f7c5e101e7e392e9bd10f00 Mon Sep 17 00:00:00 2001 From: davor Date: Tue, 24 May 2011 09:35:11 +0200 Subject: Fixing regression detection --- lib/parser.rb | 66 +++++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 51 insertions(+), 15 deletions(-) (limited to 'lib/parser.rb') diff --git a/lib/parser.rb b/lib/parser.rb index 7bdee95..8deaa91 100644 --- a/lib/parser.rb +++ b/lib/parser.rb @@ -277,7 +277,23 @@ module OpenTox def load_spreadsheet(book) book.default_sheet = 0 add_features book.row(1) - 2.upto(book.last_row) { |i| add_values book.row(i) } + + # AM: fix mixed read in + regression_features=false + 2.upto(book.last_row) { |i| + row = book.row(i) + smiles = row.shift + row.each_index do |i| + value = row[i] + type = feature_type(value) + if type == OT.NumericFeature + regression_features=true + break + end + end + } + + 2.upto(book.last_row) { |i| add_values book.row(i),regression_features } warnings @dataset end @@ -289,7 +305,23 @@ module OpenTox row = 0 input = csv.split("\n") add_features split_row(input.shift) - input.each { |row| add_values split_row(row) } + + + # AM: fix mixed read in + regression_features=false + input.each { |row| + row = split_row(row) + smiles = row.shift + row.each_index do |i| + value = row[i] + type = feature_type(value) + if type == OT.NumericFeature + regression_features=true + break + end + end + } + input.each { |row| add_values split_row(row),regression_features } warnings @dataset end @@ -335,7 +367,7 @@ module OpenTox end end - def add_values(row) + def add_values(row, regression_features=false) smiles = row.shift compound = Compound.from_smiles(smiles) @@ -353,19 +385,23 @@ module OpenTox @feature_types[feature] << type - case type - when OT.NominalFeature - case value.to_s - when TRUE_REGEXP - val = true - when FALSE_REGEXP - val = false - end - when OT.NumericFeature + if (regression_features) val = value.to_f - when OT.StringFeature - val = value.to_s - @activity_errors << smiles+", "+row.join(", ") + else + case type + when OT.NominalFeature + case value.to_s + when TRUE_REGEXP + val = true + when FALSE_REGEXP + val = false + end + when OT.NumericFeature + val = value.to_f + when OT.StringFeature + val = value.to_s + @activity_errors << smiles+", "+row.join(", ") + end end if val!=nil @dataset.add(compound.uri, feature, val) -- cgit v1.2.3 From 4a7ba2adb0743cd225ad5c2cf9f71c896d87b157 Mon Sep 17 00:00:00 2001 From: davor Date: Tue, 24 May 2011 10:45:53 +0200 Subject: Created dedicated function for value sweeping --- lib/parser.rb | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) (limited to 'lib/parser.rb') diff --git a/lib/parser.rb b/lib/parser.rb index 8deaa91..4984292 100644 --- a/lib/parser.rb +++ b/lib/parser.rb @@ -282,15 +282,8 @@ module OpenTox regression_features=false 2.upto(book.last_row) { |i| row = book.row(i) - smiles = row.shift - row.each_index do |i| - value = row[i] - type = feature_type(value) - if type == OT.NumericFeature - regression_features=true - break - end - end + regression_features = detect_regression_features row + break if regression_features=true } 2.upto(book.last_row) { |i| add_values book.row(i),regression_features } @@ -311,21 +304,15 @@ module OpenTox regression_features=false input.each { |row| row = split_row(row) - smiles = row.shift - row.each_index do |i| - value = row[i] - type = feature_type(value) - if type == OT.NumericFeature - regression_features=true - break - end - end + regression_features = detect_regression_features row + break if regression_features=true } input.each { |row| add_values split_row(row),regression_features } warnings @dataset end + private def warnings @@ -367,6 +354,18 @@ module OpenTox end end + def detect_regression_features row + regression_features=false + row.each_index do |i| + value = row[i] + type = feature_type(value) + if type == OT.NumericFeature + regression_features=true + end + end + regression_features + end + def add_values(row, regression_features=false) smiles = row.shift -- cgit v1.2.3 From 8a20cf940c346fd04649d3c3c8f7ad4c1fcb20cb Mon Sep 17 00:00:00 2001 From: Andreas Maunz Date: Tue, 24 May 2011 14:00:16 +0200 Subject: Fix: break was too early --- lib/parser.rb | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'lib/parser.rb') diff --git a/lib/parser.rb b/lib/parser.rb index 4984292..5f847c3 100644 --- a/lib/parser.rb +++ b/lib/parser.rb @@ -283,7 +283,7 @@ module OpenTox 2.upto(book.last_row) { |i| row = book.row(i) regression_features = detect_regression_features row - break if regression_features=true + break if regression_features==true } 2.upto(book.last_row) { |i| add_values book.row(i),regression_features } @@ -305,7 +305,7 @@ module OpenTox input.each { |row| row = split_row(row) regression_features = detect_regression_features row - break if regression_features=true + break if regression_features==true } input.each { |row| add_values split_row(row),regression_features } warnings @@ -355,6 +355,7 @@ module OpenTox end def detect_regression_features row + row.shift regression_features=false row.each_index do |i| value = row[i] -- cgit v1.2.3 From fe85fafc4b24cc8275ad67536d25d660249bb792 Mon Sep 17 00:00:00 2001 From: mguetlein Date: Tue, 24 May 2011 16:10:10 +0200 Subject: adjust dataset-parser: predictedVariables may be array, do not request id/features from ambit services as not supported --- lib/parser.rb | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'lib/parser.rb') diff --git a/lib/parser.rb b/lib/parser.rb index 5f847c3..a6878a2 100644 --- a/lib/parser.rb +++ b/lib/parser.rb @@ -56,7 +56,7 @@ module OpenTox `rapper -i rdfxml -o ntriples #{file.path} 2>/dev/null`.each_line do |line| triple = line.to_triple if triple[0] == @uri - if triple[1] == RDF.type # allow multiple types + if triple[1] == RDF.type || triple[1]==OT.predictedVariables # allow multiple types @metadata[triple[1]] = [] unless @metadata[triple[1]] @metadata[triple[1]] << triple[2].split('^^').first else @@ -228,7 +228,11 @@ module OpenTox file = Tempfile.new("ot-rdfxml") # do not concat /features to uri string, this would not work for dataset/R401577?max=3 uri = URI::parse(@uri) - uri.path = File.join(uri.path,"features") + # PENDING + # ambit models return http://host/dataset/id?feature_uris[]=sth but + # amibt dataset services does not support http://host/dataset/id/features?feature_uris[]=sth + # -> load features from complete dataset + uri.path = File.join(uri.path,"features") unless @uri=~/\?feature_uris\[\]/ uri = uri.to_s file.puts OpenTox::RestClientWrapper.get uri,{:subjectid => subjectid,:accept => "application/rdf+xml"},nil,false file.close -- cgit v1.2.3 From 7b28e192fdec6eaccd0e2c528df76c54ca1b1cdd Mon Sep 17 00:00:00 2001 From: mguetlein Date: Tue, 31 May 2011 23:14:22 +0200 Subject: fix: handle uri params on ambit datasets, like dataset//?(max=5|page=0) --- lib/parser.rb | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'lib/parser.rb') diff --git a/lib/parser.rb b/lib/parser.rb index a6878a2..9eacf4b 100644 --- a/lib/parser.rb +++ b/lib/parser.rb @@ -40,8 +40,9 @@ module OpenTox else file = Tempfile.new("ot-rdfxml") if @dataset - # do not concat /metadata to uri string, this would not work for dataset/R401577?max=3 uri = URI::parse(@uri) + #remove params like dataset/?max=3 from uri, not needed for metadata + uri.query = nil uri.path = File.join(uri.path,"metadata") uri = uri.to_s else @@ -230,9 +231,10 @@ module OpenTox uri = URI::parse(@uri) # PENDING # ambit models return http://host/dataset/id?feature_uris[]=sth but - # amibt dataset services does not support http://host/dataset/id/features?feature_uris[]=sth + # amibt dataset services does not support http://host/dataset/id/features?feature_uris[]=sth + # and features are not inlcuded in http://host/dataset/id/features # -> load features from complete dataset - uri.path = File.join(uri.path,"features") unless @uri=~/\?feature_uris\[\]/ + uri.path = File.join(uri.path,"features") unless @uri=~/\?(feature_uris|page|pagesize)/ uri = uri.to_s file.puts OpenTox::RestClientWrapper.get uri,{:subjectid => subjectid,:accept => "application/rdf+xml"},nil,false file.close -- cgit v1.2.3 From eb2b0d29d506f47ad793a3763768d306c760c632 Mon Sep 17 00:00:00 2001 From: mguetlein Date: Mon, 6 Jun 2011 19:35:57 +0200 Subject: compound hack and feature type bugfix to work with ambit datasets --- lib/parser.rb | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'lib/parser.rb') diff --git a/lib/parser.rb b/lib/parser.rb index 9eacf4b..90a997b 100644 --- a/lib/parser.rb +++ b/lib/parser.rb @@ -250,8 +250,13 @@ module OpenTox File.delete(to_delete) if to_delete statements.each do |triple| if features.include? triple[0] - @dataset.features[triple[0]] = {} unless @dataset.features[triple[0]] - @dataset.features[triple[0]][triple[1]] = triple[2].split('^^').first + @dataset.features[triple[0]] = {} unless @dataset.features[triple[0]] + if triple[1] == RDF.type + @dataset.features[triple[0]][triple[1]] = [] unless @dataset.features[triple[0]][triple[1]] + @dataset.features[triple[0]][triple[1]] << triple[2].split('^^').first + else + @dataset.features[triple[0]][triple[1]] = triple[2].split('^^').first + end end end @dataset.features -- cgit v1.2.3 From efd57ff4ca8445ac77435a2bdc18207fb0a94d8f Mon Sep 17 00:00:00 2001 From: Andreas Maunz Date: Mon, 20 Jun 2011 16:26:18 +0200 Subject: Removed classification feature --- lib/parser.rb | 71 +++++++++++++++++++++++------------------------------------ 1 file changed, 28 insertions(+), 43 deletions(-) (limited to 'lib/parser.rb') diff --git a/lib/parser.rb b/lib/parser.rb index 90a997b..ffa9ea5 100644 --- a/lib/parser.rb +++ b/lib/parser.rb @@ -293,11 +293,18 @@ module OpenTox regression_features=false 2.upto(book.last_row) { |i| row = book.row(i) - regression_features = detect_regression_features row - break if regression_features==true + row.shift + row.each_index do |i| + value = row[i] + value_maps[value].nil? ? value_maps[value]=0 : value_maps[value] += 1 + if value_maps.size > 5 + regression_features=true + break + end + end } - 2.upto(book.last_row) { |i| add_values book.row(i),regression_features } + 2.upto(book.last_row) { |i| add_values book.row(i) } warnings @dataset end @@ -313,12 +320,20 @@ module OpenTox # AM: fix mixed read in regression_features=false + value_maps= {0} input.each { |row| row = split_row(row) - regression_features = detect_regression_features row - break if regression_features==true + row.shift + row.each_index do |i| + value = row[i] + value_maps[value].nil? ? value_maps[value]=0 : value_maps[value] += 1 + if value_maps.size > 5 + regression_features=true + break + end + end } - input.each { |row| add_values split_row(row),regression_features } + input.each { |row| add_values split_row(row) } warnings @dataset end @@ -365,20 +380,7 @@ module OpenTox end end - def detect_regression_features row - row.shift - regression_features=false - row.each_index do |i| - value = row[i] - type = feature_type(value) - if type == OT.NumericFeature - regression_features=true - end - end - regression_features - end - - def add_values(row, regression_features=false) + def add_values(row) smiles = row.shift compound = Compound.from_smiles(smiles) @@ -396,23 +398,12 @@ module OpenTox @feature_types[feature] << type - if (regression_features) + case type + when OT.NumericFeature val = value.to_f - else - case type - when OT.NominalFeature - case value.to_s - when TRUE_REGEXP - val = true - when FALSE_REGEXP - val = false - end - when OT.NumericFeature - val = value.to_f - when OT.StringFeature - val = value.to_s - @activity_errors << smiles+", "+row.join(", ") - end + when OT.StringFeature + val = value.to_s + @activity_errors << smiles+", "+row.join(", ") end if val!=nil @dataset.add(compound.uri, feature, val) @@ -428,14 +419,8 @@ module OpenTox true if Float(value) rescue false end - def classification?(value) - !value.to_s.strip.match(TRUE_REGEXP).nil? or !value.to_s.strip.match(FALSE_REGEXP).nil? - end - def feature_type(value) - if classification? value - return OT.NominalFeature - elsif numeric? value + if numeric? value return OT.NumericFeature else return OT.StringFeature -- cgit v1.2.3 From 47973d0a325b699ca90407da99ebebc1c6928cb7 Mon Sep 17 00:00:00 2001 From: Andreas Maunz Date: Tue, 21 Jun 2011 09:05:42 +0200 Subject: Allowing TAB as separator --- lib/parser.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/parser.rb') diff --git a/lib/parser.rb b/lib/parser.rb index 90a997b..5a3767a 100644 --- a/lib/parser.rb +++ b/lib/parser.rb @@ -443,7 +443,7 @@ module OpenTox end def split_row(row) - row.chomp.gsub(/["']/,'').split(/\s*[,;]\s*/) # remove quotes + row.chomp.gsub(/["']/,'').split(/\s*[,;\t]\s*/) # remove quotes end end -- cgit v1.2.3 From ff5aa4a57aa3fa0a77609933f5c23d8bdcaf6430 Mon Sep 17 00:00:00 2001 From: Andreas Maunz Date: Tue, 21 Jun 2011 11:19:35 +0200 Subject: Using Nominal Feature --- lib/parser.rb | 67 +++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 38 insertions(+), 29 deletions(-) (limited to 'lib/parser.rb') diff --git a/lib/parser.rb b/lib/parser.rb index ffa9ea5..5625f60 100644 --- a/lib/parser.rb +++ b/lib/parser.rb @@ -282,6 +282,15 @@ module OpenTox @duplicates = {} end + def detect_new_values(row, value_maps) + row.shift + row.each_index do |i| + value = row[i] + value_maps[value].nil? ? value_maps[value]=0 : value_maps[value] += 1 + end + value_maps + end + # Load Spreadsheet book (created with roo gem http://roo.rubyforge.org/, excel format specification: http://toxcreate.org/help) # @param [Excel] book Excel workbook object (created with roo gem) # @return [OpenTox::Dataset] Dataset object with Excel data @@ -289,22 +298,19 @@ module OpenTox book.default_sheet = 0 add_features book.row(1) - # AM: fix mixed read in regression_features=false + value_maps= {} 2.upto(book.last_row) { |i| row = book.row(i) - row.shift - row.each_index do |i| - value = row[i] - value_maps[value].nil? ? value_maps[value]=0 : value_maps[value] += 1 - if value_maps.size > 5 - regression_features=true - break - end + value_maps=detect_new_values(row, value_maps) + if value_maps.size > 5 # 5 is the maximum nr of classes supported by Fminer. + regression_features=true + break end } - - 2.upto(book.last_row) { |i| add_values book.row(i) } + 2.upto(book.last_row) { |i| + add_values book.row(i), regression_features + } warnings @dataset end @@ -317,23 +323,19 @@ module OpenTox input = csv.split("\n") add_features split_row(input.shift) - - # AM: fix mixed read in regression_features=false - value_maps= {0} + value_maps= {} input.each { |row| row = split_row(row) - row.shift - row.each_index do |i| - value = row[i] - value_maps[value].nil? ? value_maps[value]=0 : value_maps[value] += 1 - if value_maps.size > 5 - regression_features=true - break - end + value_maps=detect_new_values(row, value_maps) + if value_maps.size > 5 # 5 is the maximum nr of classes supported by Fminer. + regression_features=true + break end } - input.each { |row| add_values split_row(row) } + input.each { |row| + add_values split_row(row), regression_features + } warnings @dataset end @@ -380,7 +382,7 @@ module OpenTox end end - def add_values(row) + def add_values(row, regression_features) smiles = row.shift compound = Compound.from_smiles(smiles) @@ -394,16 +396,23 @@ module OpenTox row.each_index do |i| value = row[i] feature = @features[i] - type = feature_type(value) + type = nil + if (regression_features) + type = feature_type(value) + if type != OT.NumericFeature + raise "Error! Expected numeric values." + end + else + type = OT.NominalFeature + end @feature_types[feature] << type case type when OT.NumericFeature val = value.to_f - when OT.StringFeature + when OT.NominalFeature val = value.to_s - @activity_errors << smiles+", "+row.join(", ") end if val!=nil @dataset.add(compound.uri, feature, val) @@ -423,12 +432,12 @@ module OpenTox if numeric? value return OT.NumericFeature else - return OT.StringFeature + return OT.NominalFeature end end def split_row(row) - row.chomp.gsub(/["']/,'').split(/\s*[,;]\s*/) # remove quotes + row.chomp.gsub(/["']/,'').split(/\s*[,;\t]\s*/) # remove quotes end end -- cgit v1.2.3 From 5d5db79f2b1833e77b9cb5ded5b74835bc99f9c7 Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Wed, 22 Jun 2011 13:02:49 +0000 Subject: attempt fo fix load_metadata --- lib/parser.rb | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'lib/parser.rb') diff --git a/lib/parser.rb b/lib/parser.rb index 90a997b..79c2017 100644 --- a/lib/parser.rb +++ b/lib/parser.rb @@ -57,12 +57,12 @@ module OpenTox `rapper -i rdfxml -o ntriples #{file.path} 2>/dev/null`.each_line do |line| triple = line.to_triple if triple[0] == @uri - if triple[1] == RDF.type || triple[1]==OT.predictedVariables # allow multiple types + #if triple[1] == RDF.type || triple[1]==OT.predictedVariables # allow multiple types @metadata[triple[1]] = [] unless @metadata[triple[1]] @metadata[triple[1]] << triple[2].split('^^').first - else - @metadata[triple[1]] = triple[2].split('^^').first - end + #else + #@metadata[triple[1]] = triple[2].split('^^').first + #end end statements << triple parameter_ids << triple[2] if triple[1] == OT.parameters @@ -76,6 +76,9 @@ module OpenTox @metadata[OT.parameters] << parameter end end + @metadata.each do |k,v| + v = v.first if v.size == 1 + end @metadata end -- cgit v1.2.3 From 1d3d27cb689db3091c4ac6e429f2b0f5a198dcdf Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 23 Jun 2011 13:16:21 +0000 Subject: lazar predictions fixed --- lib/parser.rb | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'lib/parser.rb') diff --git a/lib/parser.rb b/lib/parser.rb index 12ab7f3..2ce9467 100644 --- a/lib/parser.rb +++ b/lib/parser.rb @@ -57,12 +57,12 @@ module OpenTox `rapper -i rdfxml -o ntriples #{file.path} 2>/dev/null`.each_line do |line| triple = line.to_triple if triple[0] == @uri - #if triple[1] == RDF.type || triple[1]==OT.predictedVariables # allow multiple types + if triple[1] == RDF.type || triple[1]==OT.predictedVariables # allow multiple types @metadata[triple[1]] = [] unless @metadata[triple[1]] @metadata[triple[1]] << triple[2].split('^^').first - #else - #@metadata[triple[1]] = triple[2].split('^^').first - #end + else + @metadata[triple[1]] = triple[2].split('^^').first + end end statements << triple parameter_ids << triple[2] if triple[1] == OT.parameters @@ -76,9 +76,9 @@ module OpenTox @metadata[OT.parameters] << parameter end end - @metadata.each do |k,v| - v = v.first if v.size == 1 - end + #@metadata.each do |k,v| + #v = v.first if v and v.size == 1 + #end @metadata end -- cgit v1.2.3 From 310500f4d61f92de713577e7a09e9536ff6e7c42 Mon Sep 17 00:00:00 2001 From: am Date: Fri, 24 Jun 2011 13:29:18 +0200 Subject: Restored compatibility behavior: guessing true/false in wmv, fixed regression detection for multicolumn CSVs. Discuss: what about other labels: - remove guessing? - then, how to guarantee bw compat if ordering is lost? exploit already existing alphanum ordering on REGEX patterns? --- lib/parser.rb | 44 +++++++++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 17 deletions(-) (limited to 'lib/parser.rb') diff --git a/lib/parser.rb b/lib/parser.rb index 89fcb71..07bee67 100644 --- a/lib/parser.rb +++ b/lib/parser.rb @@ -289,7 +289,8 @@ module OpenTox row.shift row.each_index do |i| value = row[i] - value_maps[value].nil? ? value_maps[value]=0 : value_maps[value] += 1 + value_maps[i] = Hash.new if value_maps[i].nil? + value_maps[i][value].nil? ? value_maps[i][value]=0 : value_maps[i][value] += 1 end value_maps end @@ -300,19 +301,22 @@ module OpenTox def load_spreadsheet(book) book.default_sheet = 0 add_features book.row(1) + value_maps = Array.new + regression_features=Array.new - regression_features=false - value_maps= {} 2.upto(book.last_row) { |i| row = book.row(i) - value_maps=detect_new_values(row, value_maps) - if value_maps.size > 5 # 5 is the maximum nr of classes supported by Fminer. - regression_features=true - break - end + value_maps = detect_new_values(row, value_maps) + value_maps.each_with_index { |vm,j| + if vm.size > 5 # 5 is the maximum nr of classes supported by Fminer. + regression_features[j]=true + else + regression_features[j]=false + end + } } 2.upto(book.last_row) { |i| - add_values book.row(i), regression_features + add_values book.row(i), regression_features } warnings @dataset @@ -325,16 +329,19 @@ module OpenTox row = 0 input = csv.split("\n") add_features split_row(input.shift) + value_maps = Array.new + regression_features=Array.new - regression_features=false - value_maps= {} input.each { |row| row = split_row(row) - value_maps=detect_new_values(row, value_maps) - if value_maps.size > 5 # 5 is the maximum nr of classes supported by Fminer. - regression_features=true - break - end + value_maps = detect_new_values(row, value_maps) + value_maps.each_with_index { |vm,j| + if vm.size > 5 # 5 is the maximum nr of classes supported by Fminer. + regression_features[j]=true + else + regression_features[j]=false + end + } } input.each { |row| add_values split_row(row), regression_features @@ -385,6 +392,9 @@ module OpenTox end end + # Adds a row to a dataset + # @param Array A row split up as an array + # @param Array Indicator for regression for each field def add_values(row, regression_features) smiles = row.shift @@ -401,7 +411,7 @@ module OpenTox feature = @features[i] type = nil - if (regression_features) + if (regression_features[i]) type = feature_type(value) if type != OT.NumericFeature raise "Error! Expected numeric values." -- cgit v1.2.3 From 9bc59a8715e5b12bb989ba3ed2856630f0436b2b Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 28 Jul 2011 12:12:51 +0000 Subject: initial sdf parser --- lib/parser.rb | 168 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 167 insertions(+), 1 deletion(-) (limited to 'lib/parser.rb') diff --git a/lib/parser.rb b/lib/parser.rb index 07bee67..c9de1ed 100644 --- a/lib/parser.rb +++ b/lib/parser.rb @@ -350,7 +350,6 @@ module OpenTox @dataset end - private def warnings @@ -454,5 +453,172 @@ module OpenTox end end + + class Table + + attr_accessor :data, :features, :compounds + + def initialize + @data = {} + @activity_errors = [] + end + + def feature_values(feature) + @data.collect{|c, row| row[feature]}.uniq.compact + end + + def feature_types(feature) + @data.collect{|c, row| feature_type(row[feature])}.uniq.compact + end + + def features + @data.collect{|c,row| row.keys}.flatten.uniq + end + + def clean_features + ignored_features = [] + features.each do |feature| + if feature_values(feature).size > 5 + if feature_types(feature).size == 1 and feature_types(feature).first == OT.NumericFeature + # REGRESSION + elsif feature_types(feature).include? OT.NumericFeature + @data.each{|c,row| row[feature] = nil unless numeric?(row[feature]) } # delete nominal features + @activity_errors << "Nominal feature values of #{feature} ignored (using numeric features for regression models)." + else + @activity_errors << "Feature #{feature} ignored (more than 5 nominal feature values and no numeric values)." + ignored_features << feature + next + end + elsif feature_values(feature).size <= 1 + @activity_errors << "Feature #{feature} ignored (less than 2 feature values)." + ignored_features << feature + else + # CLASSIFICATION + end + end + ignored_features.each do |feature| + @data.each{ |c,row| row.delete feature } + end + @activity_errors + end + + def add_to_dataset(dataset) + features.each do |feature_name| + feature_uri = File.join(dataset.uri,"feature",URI.encode(feature_name)) + dataset.add_feature(feature_uri,{DC.title => feature_name}) + end + + @data.each do |compound,row| + row.each do |feature,value| + if numeric?(value) + value = value.to_f + elsif value.nil? or value.empty? + value = nil + else + value = value.to_s + end + feature_uri = File.join(dataset.uri,"feature",URI.encode(feature)) + dataset.add(compound, feature_uri, value) + if feature_types(feature).include? OT.NumericFeature + dataset.features[feature_uri][RDF.type] = OT.NumericFeature + else + dataset.features[feature_uri][RDF.type] = OT.NominalFeature + dataset.features[feature_uri][OT.acceptValue] = feature_values(feature) + end + end + end + end + + private + def numeric?(value) + true if Float(value) rescue false + end + + def feature_type(value) + if numeric? value + return OT.NumericFeature + else + return OT.NominalFeature + end + end + end + + # quick hack to enable sdf import via csv + # should be refactored + class Sdf + + attr_accessor :dataset + + def initialize + @data = {} + + #@format_errors = "" + @compound_errors = [] + @activity_errors = [] + @duplicates = {} + end + + def load_sdf(sdf) + + obconversion = OpenBabel::OBConversion.new + obmol = OpenBabel::OBMol.new + obconversion.set_in_and_out_formats "sdf", "inchi" + + table = Table.new + + properties = [] + sdf.each_line { |l| properties << l.to_s if l.match(//,'').strip.chomp } + + LOGGER.debug "SDF import" + rec = 0 + sdf.split(/\$\$\$\$\r*\n/).each do |s| + rec += 1 + obconversion.read_string obmol, s + begin + inchi = obconversion.write_string(obmol).gsub(/\s/,'').chomp + @duplicates[inchi] = [] unless @duplicates[inchi] + @duplicates[inchi] << rec #inchi#+", "+row.join(", ") + compound = Compound.from_inchi inchi + rescue + @compound_errors << "Could not convert structure to InChI, all entries for this compound (record #{rec} have been ignored! \n#{s}" + next + end + row = {} + obmol.get_data.each { |d| row[d.get_attribute] = d.get_value if properties.include?(d.get_attribute) } + table.data[compound.uri] = row + end + + LOGGER.debug "Clean table" + #File.open("/home/ch/tmp_all.yaml","w+"){|f| f.puts table.to_yaml} + # REOVE ignored_features + @activity_errors = table.clean_features + #File.open("/home/ch/tmp.yaml","w+"){|f| f.puts table.to_yaml} + LOGGER.debug "Dataset insert" + table.add_to_dataset @dataset + + warnings + @dataset + + end + + private + + def warnings + + warnings = '' + warnings += "

Incorrect Smiles structures (ignored):

" + @compound_errors.join("
") unless @compound_errors.empty? + warnings += "

Irregular activities (ignored):

" + @activity_errors.join("
") unless @activity_errors.empty? + duplicate_warnings = '' + @duplicates.each {|inchi,lines| duplicate_warnings << "

#{lines.join('
')}

" if lines.size > 1 } + warnings += "

Duplicated structures (all structures/activities used for model building, please make sure, that the results were obtained from independent experiments):

" + duplicate_warnings unless duplicate_warnings.empty? + + @dataset.metadata[OT.Warnings] = warnings + + end + + end end end -- cgit v1.2.3 From 49943ba879a40f2039eae710cb9e0ad8c2ffb04a Mon Sep 17 00:00:00 2001 From: Andreas Maunz Date: Thu, 28 Jul 2011 15:44:55 +0200 Subject: Generalized routines 'predicted_variables' and 'from_rdf' --- lib/parser.rb | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) (limited to 'lib/parser.rb') diff --git a/lib/parser.rb b/lib/parser.rb index 07bee67..a1678ea 100644 --- a/lib/parser.rb +++ b/lib/parser.rb @@ -86,7 +86,11 @@ module OpenTox # @param [String] rdf # @param [String] type of the info (e.g. OT.Task, OT.ErrorReport) needed to get the subject-uri # @return [Owl] with uri and metadata set - def self.from_rdf( rdf, type ) + def self.from_rdf( rdf, type, allow_multiple = false ) + + uris = Array.new + owls = Array.new + # write to file and read convert with rapper into tripples file = Tempfile.new("ot-rdfxml") file.puts rdf @@ -99,20 +103,27 @@ module OpenTox triples.each_line do |line| triple = line.to_triple if triple[1] == RDF['type'] and triple[2]==type - raise "uri already set, two uris found with type: "+type.to_s if uri + if !allow_multiple + raise "uri already set, two uris found with type: "+type.to_s if uri + end uri = triple[0] + uris << uri end end File.delete(file.path) + # load metadata - metadata = {} - triples.each_line do |line| - triple = line.to_triple - metadata[triple[1]] = triple[2].split('^^').first if triple[0] == uri and triple[1] != RDF['type'] - end - owl = Owl::Generic.new(uri) - owl.metadata = metadata - owl + uris.each { |uri| + metadata = {} + triples.each_line do |line| + triple = line.to_triple + metadata[triple[1]] = triple[2].split('^^').first if triple[0] == uri and triple[1] != RDF['type'] + end + owl = Owl::Generic.new(uri) + owl.metadata = metadata + owls << owl + } + allow_multiple ? owls : owls[0] end # Generic parser for all OpenTox classes -- cgit v1.2.3 From 1148087a71ac023a6758c74325ad364d7cda7dbe Mon Sep 17 00:00:00 2001 From: Christoph Helma Date: Thu, 28 Jul 2011 15:46:39 +0000 Subject: sdf acceptValues fixed --- lib/parser.rb | 52 +++++++++++++++++++++------------------------------- 1 file changed, 21 insertions(+), 31 deletions(-) (limited to 'lib/parser.rb') diff --git a/lib/parser.rb b/lib/parser.rb index c9de1ed..8fa5847 100644 --- a/lib/parser.rb +++ b/lib/parser.rb @@ -509,21 +509,25 @@ module OpenTox end @data.each do |compound,row| - row.each do |feature,value| - if numeric?(value) - value = value.to_f - elsif value.nil? or value.empty? - value = nil - else - value = value.to_s - end - feature_uri = File.join(dataset.uri,"feature",URI.encode(feature)) - dataset.add(compound, feature_uri, value) - if feature_types(feature).include? OT.NumericFeature - dataset.features[feature_uri][RDF.type] = OT.NumericFeature - else - dataset.features[feature_uri][RDF.type] = OT.NominalFeature - dataset.features[feature_uri][OT.acceptValue] = feature_values(feature) + unless row.empty? + row.each do |feature,value| + if numeric?(value) + value = value.to_f + elsif value.nil? or value.empty? + value = nil + else + value = value.to_s + end + feature_uri = File.join(dataset.uri,"feature",URI.encode(feature)) + dataset.add(compound, feature_uri, value) + #dataset.features[feature_uri][RDF.type] = feature_types(feature) + #dataset.features[feature_uri][OT.acceptValue] = feature_values(feature) + if feature_types(feature).include? OT.NumericFeature + dataset.features[feature_uri][RDF.type] = [OT.NumericFeature] + else + dataset.features[feature_uri][RDF.type] = [OT.NominalFeature] + dataset.features[feature_uri][OT.acceptValue] = feature_values(feature) + end end end end @@ -552,7 +556,6 @@ module OpenTox def initialize @data = {} - #@format_errors = "" @compound_errors = [] @activity_errors = [] @duplicates = {} @@ -572,7 +575,6 @@ module OpenTox properties.sort! properties.collect!{ |p| p.gsub(/<|>/,'').strip.chomp } - LOGGER.debug "SDF import" rec = 0 sdf.split(/\$\$\$\$\r*\n/).each do |s| rec += 1 @@ -591,23 +593,10 @@ module OpenTox table.data[compound.uri] = row end - LOGGER.debug "Clean table" - #File.open("/home/ch/tmp_all.yaml","w+"){|f| f.puts table.to_yaml} - # REOVE ignored_features + # finda and remove ignored_features @activity_errors = table.clean_features - #File.open("/home/ch/tmp.yaml","w+"){|f| f.puts table.to_yaml} - LOGGER.debug "Dataset insert" table.add_to_dataset @dataset - warnings - @dataset - - end - - private - - def warnings - warnings = '' warnings += "

Incorrect Smiles structures (ignored):

" + @compound_errors.join("
") unless @compound_errors.empty? warnings += "

Irregular activities (ignored):

" + @activity_errors.join("
") unless @activity_errors.empty? @@ -616,6 +605,7 @@ module OpenTox warnings += "

Duplicated structures (all structures/activities used for model building, please make sure, that the results were obtained from independent experiments):

" + duplicate_warnings unless duplicate_warnings.empty? @dataset.metadata[OT.Warnings] = warnings + @dataset end -- cgit v1.2.3 From fa37ab0876faaaa2acf37b147924f025a0d8cd9a Mon Sep 17 00:00:00 2001 From: Andreas Maunz Date: Fri, 29 Jul 2011 11:47:17 +0200 Subject: Added TUM clustering --- lib/parser.rb | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'lib/parser.rb') diff --git a/lib/parser.rb b/lib/parser.rb index 4ee4a22..d0975af 100644 --- a/lib/parser.rb +++ b/lib/parser.rb @@ -447,12 +447,8 @@ module OpenTox end end - def numeric?(value) - true if Float(value) rescue false - end - def feature_type(value) - if numeric? value + if OpenTox::Algorithm::numeric? value return OT.NumericFeature else return OT.NominalFeature @@ -493,7 +489,7 @@ module OpenTox if feature_types(feature).size == 1 and feature_types(feature).first == OT.NumericFeature # REGRESSION elsif feature_types(feature).include? OT.NumericFeature - @data.each{|c,row| row[feature] = nil unless numeric?(row[feature]) } # delete nominal features + @data.each{|c,row| row[feature] = nil unless OpenTox::Algorithm::numeric?(row[feature]) } # delete nominal features @activity_errors << "Nominal feature values of #{feature} ignored (using numeric features for regression models)." else @activity_errors << "Feature #{feature} ignored (more than 5 nominal feature values and no numeric values)." @@ -522,7 +518,7 @@ module OpenTox @data.each do |compound,row| unless row.empty? row.each do |feature,value| - if numeric?(value) + if OpenTox::Algorithm::numeric?(value) value = value.to_f elsif value.nil? or value.empty? value = nil @@ -545,12 +541,9 @@ module OpenTox end private - def numeric?(value) - true if Float(value) rescue false - end def feature_type(value) - if numeric? value + if OpenTox::Algorithm::numeric? value return OT.NumericFeature else return OT.NominalFeature -- cgit v1.2.3