From 21b19e9805876eb3101103383de608922e4924b9 Mon Sep 17 00:00:00 2001 From: Brian Underwood Date: Wed, 5 Apr 2017 15:47:01 -0400 Subject: [PATCH 1/4] Speed improvements for discrete --- lib/decisiontree/id3_tree.rb | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/lib/decisiontree/id3_tree.rb b/lib/decisiontree/id3_tree.rb index c14f238..1d3658a 100755 --- a/lib/decisiontree/id3_tree.rb +++ b/lib/decisiontree/id3_tree.rb @@ -3,6 +3,8 @@ ### Copyright (c) 2007 Ilya Grigorik ### Modifed at 2007 by José Ignacio Fernández +require 'set' + module DecisionTree Node = Struct.new(:attribute, :threshold, :gain) @@ -28,7 +30,7 @@ module DecisionTree end data2 = data2.map do |key, val| - key + [val.sort_by { |_k, v| v }.last.first] + key + [val.sort_by { |_, v| v }.last.first] end @tree = id3_train(data2, attributes, default) @@ -41,9 +43,9 @@ module DecisionTree def fitness_for(attribute) case type(attribute) when :discrete - proc { |a, b, c| id3_discrete(a, b, c) } + proc { |*args| id3_discrete(*args) } when :continuous - proc { |a, b, c| id3_continuous(a, b, c) } + proc { |*args| id3_continuous(*args) } end end @@ -66,14 +68,13 @@ module DecisionTree @used.has_key?(best.attribute) ? @used[best.attribute] += [best.threshold] : @used[best.attribute] = [best.threshold] tree, l = {best => {}}, ['>=', '<'] - fitness = fitness_for(best.attribute) case type(best.attribute) when :continuous partitioned_data = data.partition do |d| d[attributes.index(best.attribute)] >= best.threshold end partitioned_data.each_with_index do |examples, i| - tree[best][String.new(l[i])] = id3_train(examples, attributes, (data.classification.mode rescue 0), &fitness) + tree[best][String.new(l[i])] = id3_train(examples, attributes, (data.classification.mode rescue 0)) end when :discrete values = data.collect { |d| d[attributes.index(best.attribute)] }.uniq.sort @@ -83,7 +84,7 @@ module DecisionTree end end partitions.each_with_index do |examples, i| - tree[best][values[i]] = id3_train(examples, attributes - [values[i]], (data.classification.mode rescue 0), &fitness) + tree[best][values[i]] = id3_train(examples, attributes - [values[i]], (data.classification.mode rescue 0)) end end @@ -116,11 +117,14 @@ module DecisionTree # ID3 for discrete label cases def id3_discrete(data, attributes, attribute) - values = data.collect { |d| d[attributes.index(attribute)] }.uniq.sort - partitions = values.collect { |val| data.select { |d| d[attributes.index(attribute)] == val } } + index = attributes.index(attribute) + + values = Set.new + data.each { |d| values << d[index] } + partitions = values.to_a.sort.collect { |val| data.select { |d| d[index] == val } } remainder = partitions.collect { |p| (p.size.to_f / data.size) * p.classification.entropy }.inject(0) { |a, e| e += a } - [data.classification.entropy - remainder, attributes.index(attribute)] + [data.classification.entropy - remainder, index] end def predict(test) From ff298a8fb98965713293adb8760454a24afb02db Mon Sep 17 00:00:00 2001 From: Brian Underwood Date: Fri, 7 Apr 2017 10:33:40 -0400 Subject: [PATCH 2/4] Changes for performance as well as some general refactoring --- lib/core_extensions/array.rb | 38 +++++++++++++----------------------- lib/decisiontree.rb | 2 +- lib/decisiontree/id3_tree.rb | 27 +++++++++++++++++-------- 3 files changed, 34 insertions(+), 33 deletions(-) diff --git a/lib/core_extensions/array.rb b/lib/core_extensions/array.rb index 5a70756..c726d5f 100644 --- a/lib/core_extensions/array.rb +++ b/lib/core_extensions/array.rb @@ -1,29 +1,19 @@ class Array - def classification - collect(&:last) - end - - # calculate information entropy def entropy - return 0 if empty? - - info = {} - each do |i| - info[i] = !info[i] ? 1 : (info[i] + 1) + each_with_object(Hash.new(0)) do |i, result| + result[i] += 1 + end.values.inject(0) do |sum, count| + percentage = count.to_f / length + sum + -percentage * Math.log2(percentage) end - - result(info, length) - end - - private - - def result(info, total) - final = 0 - info.each do |_symbol, count| - next unless count > 0 - percentage = count.to_f / total - final += -percentage * Math.log(percentage) / Math.log(2.0) - end - final end end + +module ArrayClassification + refine Array do + def classification + collect(&:last) + end + end +end + diff --git a/lib/decisiontree.rb b/lib/decisiontree.rb index 3da0b47..190fc0b 100644 --- a/lib/decisiontree.rb +++ b/lib/decisiontree.rb @@ -1,3 +1,3 @@ -require File.dirname(__FILE__) + '/decisiontree/id3_tree.rb' require 'core_extensions/object' require 'core_extensions/array' +require File.dirname(__FILE__) + '/decisiontree/id3_tree.rb' diff --git a/lib/decisiontree/id3_tree.rb b/lib/decisiontree/id3_tree.rb index 1d3658a..78d8493 100755 --- a/lib/decisiontree/id3_tree.rb +++ b/lib/decisiontree/id3_tree.rb @@ -8,6 +8,8 @@ require 'set' module DecisionTree Node = Struct.new(:attribute, :threshold, :gain) + using ArrayClassification + class ID3Tree def initialize(attributes, data, default, type) @used = {} @@ -119,10 +121,14 @@ module DecisionTree def id3_discrete(data, attributes, attribute) index = attributes.index(attribute) - values = Set.new - data.each { |d| values << d[index] } - partitions = values.to_a.sort.collect { |val| data.select { |d| d[index] == val } } - remainder = partitions.collect { |p| (p.size.to_f / data.size) * p.classification.entropy }.inject(0) { |a, e| e += a } + values = data.map { |row| row[index] }.uniq + remainder = values.sort.inject(0) do |sum, val| + classification = data.each_with_object([]) do |row, result| + result << row.last if row[index] == val + end + + sum + ((classification.size.to_f / data.size) * classification.entropy) + end [data.classification.entropy - remainder, index] end @@ -324,6 +330,7 @@ module DecisionTree class Bagging attr_accessor :classifiers + def initialize(attributes, data, default, type) @classifiers = [] @type = type @@ -333,10 +340,13 @@ module DecisionTree end def train(data = @data, attributes = @attributes, default = @default) - @classifiers = [] - 10.times { @classifiers << Ruleset.new(attributes, data, default, @type) } - @classifiers.each do |c| - c.train(data, attributes, default) + @classifiers = 5.times.map do |i| + Ruleset.new(attributes, data, default, @type) + end + + @classifiers.each_with_index do |classifier, index| + puts "Processing classifier ##{index + 1}" + classifier.train(data, attributes, default) end end @@ -352,3 +362,4 @@ module DecisionTree end end end + From 868ed916723f4dbf85b0e7cd6c884e3d3947b1f2 Mon Sep 17 00:00:00 2001 From: Brian Underwood Date: Tue, 11 Apr 2017 13:05:13 -0400 Subject: [PATCH 3/4] Put back to 10 classifiers --- lib/decisiontree/id3_tree.rb | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lib/decisiontree/id3_tree.rb b/lib/decisiontree/id3_tree.rb index 78d8493..9434299 100755 --- a/lib/decisiontree/id3_tree.rb +++ b/lib/decisiontree/id3_tree.rb @@ -3,8 +3,6 @@ ### Copyright (c) 2007 Ilya Grigorik ### Modifed at 2007 by José Ignacio Fernández -require 'set' - module DecisionTree Node = Struct.new(:attribute, :threshold, :gain) @@ -340,7 +338,7 @@ module DecisionTree end def train(data = @data, attributes = @attributes, default = @default) - @classifiers = 5.times.map do |i| + @classifiers = 10.times.map do |i| Ruleset.new(attributes, data, default, @type) end From 13aed0b2ae57e994e3366fedf15530d86c47bccd Mon Sep 17 00:00:00 2001 From: Brian Underwood Date: Tue, 11 Apr 2017 14:57:32 -0400 Subject: [PATCH 4/4] Simplify with #sum --- lib/.DS_Store | Bin 0 -> 6148 bytes lib/core_extensions/array.rb | 5 +++-- lib/decisiontree/id3_tree.rb | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) create mode 100644 lib/.DS_Store diff --git a/lib/.DS_Store b/lib/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..707d6541ef5e81699a5a691e25bfebba6aed5681 GIT binary patch literal 6148 zcmeHK&5qMB5FU36B-%;@iNhjsu*88&4_%73*Qo5?kU*qj5B!v5W6_|YCJ0Ujz5E>`l zkdF5d3Oh#)1sKrKK(!Igj%{Rs_O1sxC^*5r`+NTssBBT6W~Iq6=2v!F7IiZi{1Ba; z?v<<8f-v0eg}v*&7s<>lk|u5D^*F7bQSWiu%(AqaMz@n9S7;pC))e7_-h_=P@z^%m6d6