Merge branch 'dvisockas-master'

2026-02-12 11:14:53 +00:00 · 2015-11-22 07:46:25 -08:00
parent eaa9a2f5d7 2155771683
commit 15394f85a3
7 changed files with 229 additions and 132 deletions
--- a/examples/continuous-id3.rb
+++ b/examples/continuous-id3.rb
@@ -2,15 +2,25 @@ require 'rubygems'
 require 'decisiontree'
 include DecisionTree

-# ---Continuous-----------------------------------------------------------------------------------------
+# ---Continuous---

 # Read in the training data
-training, attributes = [], nil
-File.open('data/continuous-training.txt','r').each_line { |line| 
+training, attributes = [], nil
+File.open('data/continuous-training.txt', 'r').each_line do |line|
  data = line.strip.chomp('.').split(',')
  attributes ||= data
-  training.push(data.collect {|v| (v == 'healthy') || (v == 'colic') ? (v == 'healthy' ? 1 : 0) : v.to_f})
-}
+  training_data = data.collect do |v|
+    case v
+    when 'healthy'
+      1
+    when 'colic'
+      0
+    else
+      v.to_f
+    end
+  end
+  training.push(training_data)
+end

 # Remove the attribute row from the training data
 training.shift
@@ -19,15 +29,25 @@ training.shift
 dec_tree = ID3Tree.new(attributes, training, 1, :continuous)
 dec_tree.train

-#---- Test the tree....
+# ---Test the tree---

 # Read in the test cases
 # Note: omit the attribute line (first line), we know the labels from the training data
 test = []
-File.open('data/continuous-test.txt','r').each_line { |line| 
+File.open('data/continuous-test.txt', 'r').each_line do |line|
  data = line.strip.chomp('.').split(',')
-  test.push(data.collect {|v| (v == 'healthy') || (v == 'colic') ? (v == 'healthy' ? 1 : 0) : v.to_f})
-}
+  test_data = data.collect do |v|
+    if v == 'healthy' || v == 'colic'
+      v == 'healthy' ? 1 : 0
+    else
+      v.to_f
+    end
+  end
+  test.push(test_data)
+end

 # Let the tree predict the output and compare it to the true specified value
-test.each { |t| predict = dec_tree.predict(t);  puts "Predict: #{predict} ... True: #{t.last}"}
+test.each do |t|
+  predict = dec_tree.predict(t)
+  puts "Predict: #{predict} ... True: #{t.last}"
+end
--- a/examples/discrete-id3.rb
+++ b/examples/discrete-id3.rb
@@ -1,15 +1,25 @@
 require 'rubygems'
 require 'decisiontree'

-# ---Discrete-----------------------------------------------------------------------------------------
+# ---Discrete---

 # Read in the training data
-training, attributes = [], nil
-File.open('data/discrete-training.txt','r').each_line { |line| 
+training, attributes = [], nil
+File.open('data/discrete-training.txt', 'r').each_line do |line|
  data = line.strip.split(',')
  attributes ||= data
-  training.push(data.collect {|v| (v == 'will buy') || (v == "won't buy") ? (v == 'will buy' ? 1 : 0) : v})
-}
+  training_data = data.collect do |v|
+    case v
+    when 'will buy'
+      1
+    when "won't buy"
+      0
+    else
+      v
+    end
+  end
+  training.push(training_data)
+end

 # Remove the attribute row from the training data
 training.shift
@@ -18,17 +28,31 @@ training.shift
 dec_tree = DecisionTree::ID3Tree.new(attributes, training, 1, :discrete)
 dec_tree.train

-#---- Test the tree....
+# ---Test the tree---

 # Read in the test cases
 # Note: omit the attribute line (first line), we know the labels from the training data
 test = []
-File.open('data/discrete-test.txt','r').each_line { |line| data = line.strip.split(',') 
-  test.push(data.collect {|v| (v == 'will buy') || (v == "won't buy") ? (v == 'will buy' ? 1 : 0) : v})
-}
+File.open('data/discrete-test.txt', 'r').each_line do |line|
+  data = line.strip.split(',')
+  test_data = data.collect do |v|
+    case v
+    when 'will buy'
+      1
+    when "won't buy"
+      0
+    else
+      v
+    end
+  end
+  test.push(test_data)
+end

 # Let the tree predict the output and compare it to the true specified value
-test.each { |t|   predict = dec_tree.predict(t); puts "Predict: #{predict} ... True: #{t.last}"; }
+test.each do |t|
+  predict = dec_tree.predict(t)
+  puts "Predict: #{predict} ... True: #{t.last}"
+end

 # Graph the tree, save to 'discrete.png'
-dec_tree.graph("discrete")
+dec_tree.graph('discrete')
--- a/examples/simple.rb
+++ b/examples/simple.rb
@@ -10,7 +10,7 @@ training = [
  [38, 'sick'],
  [36.7, 'healthy'],
  [40, 'sick'],
-  [50, 'really sick'],
+  [50, 'really sick']
 ]

 # Instantiate the tree, and train it based on the data (set default to '1')
@@ -20,9 +20,7 @@ dec_tree.train
 test = [37, 'sick']

 decision = dec_tree.predict(test)
-puts "Predicted: #{decision} ... True decision: #{test.last}";
+puts "Predicted: #{decision} ... True decision: #{test.last}"

 # Graph the tree, save to 'tree.png'
-dec_tree.graph("tree")
-
-
+dec_tree.graph('tree')
--- a/lib/core_extensions/array.rb
+++ b/lib/core_extensions/array.rb
@@ -0,0 +1,29 @@
+class Array
+  def classification
+    collect(&:last)
+  end
+
+  # calculate information entropy
+  def entropy
+    return 0 if empty?
+
+    info = {}
+    each do |i|
+      info[i] = !info[i] ? 1 : (info[i] + 1)
+    end
+
+    result(info, length)
+  end
+
+  private
+
+  def result(info, total)
+    final = 0
+    info.each do |_symbol, count|
+      next unless count > 0
+      percentage = count.to_f / total
+      final += -percentage * Math.log(percentage) / Math.log(2.0)
+    end
+    final
+  end
+end
--- a/lib/core_extensions/object.rb
+++ b/lib/core_extensions/object.rb
@@ -0,0 +1,9 @@
+class Object
+  def save_to_file(filename)
+    File.open(filename, 'wb') { |f| f << Marshal.dump(self) } # binary mode: Marshal output is not text
+  end
+
+  def self.load_from_file(filename)
+    Marshal.load(File.binread(filename)) # NOTE: Marshal.load is unsafe on untrusted files
+  end
+end
--- a/lib/decisiontree.rb
+++ b/lib/decisiontree.rb
@@ -1 +1,3 @@
 require File.dirname(__FILE__) + '/decisiontree/id3_tree.rb'
+require File.dirname(__FILE__) + '/core_extensions/object'
+require File.dirname(__FILE__) + '/core_extensions/array'
--- a/lib/decisiontree/id3_tree.rb
+++ b/lib/decisiontree/id3_tree.rb
@@ -3,50 +3,33 @@
 ### Copyright (c) 2007 Ilya Grigorik <ilya AT igvita DOT com>
 ### Modifed at 2007 by José Ignacio Fernández <joseignacio.fernandez AT gmail DOT com>

-class Object
-  def save_to_file(filename)
-    File.open(filename, 'w+' ) { |f| f << Marshal.dump(self) }
-  end
-
-  def self.load_from_file(filename)
-    Marshal.load( File.read( filename ) )
-  end
-end
-
-class Array
-  def classification; collect { |v| v.last }; end
-
-  # calculate information entropy
-  def entropy
-    return 0 if empty?
-
-    info = {}
-    total = 0
-    each {|i| info[i] = !info[i] ? 1 : (info[i] + 1); total += 1}
-
-    result = 0
-    info.each do |symbol, count|
-      result += -count.to_f/total*Math.log(count.to_f/total)/Math.log(2.0) if (count > 0)
-    end
-    result
-  end
-end
-
 module DecisionTree
  Node = Struct.new(:attribute, :threshold, :gain)

  class ID3Tree
    def initialize(attributes, data, default, type)
-      @used, @tree, @type = {}, {}, type
-      @data, @attributes, @default = data, attributes, default
+      @used = {}
+      @tree = {}
+      @type = type
+      @data = data
+      @attributes = attributes
+      @default = default
    end

    def train(data = @data, attributes = @attributes, default = @default)
-      attributes = attributes.map {|e| e.to_s}
+      attributes = attributes.map(&:to_s)
      initialize(attributes, data, default, @type)

      # Remove samples with same attributes leaving most common classification
-      data2 = data.inject({}) {|hash, d| hash[d.slice(0..-2)] ||= Hash.new(0); hash[d.slice(0..-2)][d.last] += 1; hash }.map{|key,val| key + [val.sort_by{ |k, v| v }.last.first]}
+      data2 = data.inject({}) do |hash, d|
+        hash[d.slice(0..-2)] ||= Hash.new(0)
+        hash[d.slice(0..-2)][d.last] += 1
+        hash
+      end
+
+      data2 = data2.map do |key, val|
+        key + [val.sort_by { |_k, v| v }.last.first]
+      end

      @tree = id3_train(data2, attributes, default)
    end
@@ -57,12 +40,14 @@ module DecisionTree

    def fitness_for(attribute)
      case type(attribute)
-        when :discrete; fitness = proc{|a,b,c| id3_discrete(a,b,c)}
-        when :continuous; fitness = proc{|a,b,c| id3_continuous(a,b,c)}
+      when :discrete
+        proc { |a, b, c| id3_discrete(a, b, c) }
+      when :continuous
+        proc { |a, b, c| id3_continuous(a, b, c) }
      end
    end

-    def id3_train(data, attributes, default, used={})
+    def id3_train(data, attributes, default, _used={})
      return default if data.empty?

      # return classification if all examples have the same classification
@@ -75,7 +60,7 @@ module DecisionTree
      performance = attributes.collect { |attribute| fitness_for(attribute).call(data, attributes, attribute) }
      max = performance.max { |a,b| a[0] <=> b[0] }
      min = performance.min { |a,b| a[0] <=> b[0] }
-      max = performance.shuffle.first if max[0] == min[0]
+      max = performance.sample if max[0] == min[0]
      best = Node.new(attributes[performance.index(max)], max[1], max[0])
      best.threshold = nil if @type == :discrete
      @used.has_key?(best.attribute) ? @used[best.attribute] += [best.threshold] : @used[best.attribute] = [best.threshold]
@@ -84,15 +69,22 @@ module DecisionTree
      fitness = fitness_for(best.attribute)
      case type(best.attribute)
      when :continuous
-          data.partition { |d| d[attributes.index(best.attribute)] >= best.threshold }.each_with_index  { |examples, i|
+        partitioned_data = data.partition do |d|
+          d[attributes.index(best.attribute)] >= best.threshold
+        end
+        partitioned_data.each_with_index do |examples, i|
          tree[best][String.new(l[i])] = id3_train(examples, attributes, (data.classification.mode rescue 0), &fitness)
-          }
+        end
      when :discrete
        values = data.collect { |d| d[attributes.index(best.attribute)] }.uniq.sort
-          partitions = values.collect { |val| data.select { |d| d[attributes.index(best.attribute)] == val } }
-          partitions.each_with_index  { |examples, i|
+        partitions = values.collect do |val|
+          data.select do |d|
+            d[attributes.index(best.attribute)] == val
+          end
+        end
+        partitions.each_with_index do |examples, i|
          tree[best][values[i]] = id3_train(examples, attributes - [values[i]], (data.classification.mode rescue 0), &fitness)
-          }
+        end
      end

      tree
@@ -100,19 +92,23 @@ module DecisionTree

    # ID3 for binary classification of continuous variables (e.g. healthy / sick based on temperature thresholds)
    def id3_continuous(data, attributes, attribute)
-      values, thresholds = data.collect { |d| d[attributes.index(attribute)] }.uniq.sort, []
+      values = data.collect { |d| d[attributes.index(attribute)] }.uniq.sort
+      thresholds = []
      return [-1, -1] if values.size == 1
-      values.each_index { |i| thresholds.push((values[i]+(values[i+1].nil? ? values[i] : values[i+1])).to_f / 2) }
+      values.each_index do |i|
+        thresholds.push((values[i] + (values[i + 1].nil? ? values[i] : values[i + 1])).to_f / 2)
+      end
      thresholds.pop
      #thresholds -= used[attribute] if used.has_key? attribute

-      gain = thresholds.collect { |threshold|
+      gain = thresholds.collect do |threshold|
        sp = data.partition { |d| d[attributes.index(attribute)] >= threshold }
        pos = (sp[0].size).to_f / data.size
        neg = (sp[1].size).to_f / data.size

        [data.classification.entropy - pos * sp[0].classification.entropy - neg * sp[1].classification.entropy, threshold]
-      }.max { |a,b| a[0] <=> b[0] }
+      end
+      gain = gain.max { |a, b| a[0] <=> b[0] }

      return [-1, -1] if gain.size == 0
      gain
@@ -122,7 +118,7 @@ module DecisionTree
    def id3_discrete(data, attributes, attribute)
      values = data.collect { |d| d[attributes.index(attribute)] }.uniq.sort
      partitions = values.collect { |val| data.select { |d| d[attributes.index(attribute)] == val } }
-      remainder = partitions.collect {|p| (p.size.to_f / data.size) * p.classification.entropy}.inject(0) {|i,s| s+=i }
+      remainder = partitions.collect { |p| (p.size.to_f / data.size) * p.classification.entropy }.inject(0) { |a, e| a + e }

      [data.classification.entropy - remainder, attributes.index(attribute)]
    end
@@ -131,7 +127,7 @@ module DecisionTree
      descend(@tree, test)
    end

-    def graph(filename, file_type = "png")
+    def graph(filename, file_type = 'png')
      require 'graphr'
      dgp = DotGraphPrinter.new(build_tree)
      dgp.write_to_file("#{filename}.#{file_type}", file_type)
@@ -148,7 +144,7 @@ module DecisionTree
      cases = attr[1].to_a
      rules = []
      cases.each do |c, child|
-        if child.is_a?(Hash) then
+        if child.is_a?(Hash)
          build_rules(child).each do |r|
            r2 = r.clone
            r2.premises.unshift([attr.first, c])
@@ -162,12 +158,13 @@ module DecisionTree
    end

    private
+
    def descend(tree, test)
      attr = tree.to_a.first
-      return @default if !attr
+      return @default unless attr
      if type(attr.first.attribute) == :continuous
-        return attr[1]['>='] if !attr[1]['>='].is_a?(Hash) and test[@attributes.index(attr.first.attribute)] >= attr.first.threshold
-        return attr[1]['<'] if !attr[1]['<'].is_a?(Hash) and test[@attributes.index(attr.first.attribute)] < attr.first.threshold
+        return attr[1]['>='] if !attr[1]['>='].is_a?(Hash) && test[@attributes.index(attr.first.attribute)] >= attr.first.threshold
+        return attr[1]['<'] if !attr[1]['<'].is_a?(Hash) && test[@attributes.index(attr.first.attribute)] < attr.first.threshold
        return descend(attr[1]['>='], test) if test[@attributes.index(attr.first.attribute)] >= attr.first.threshold
        return descend(attr[1]['<'], test) if test[@attributes.index(attr.first.attribute)] < attr.first.threshold
      else
@@ -178,26 +175,29 @@ module DecisionTree

    def build_tree(tree = @tree)
      return [] unless tree.is_a?(Hash)
-      return [["Always", @default]] if tree.empty?
+      return [['Always', @default]] if tree.empty?

      attr = tree.to_a.first

      links = attr[1].keys.collect do |key|
        parent_text = "#{attr[0].attribute}\n(#{attr[0].object_id})"
-        if attr[1][key].is_a?(Hash) then
+        if attr[1][key].is_a?(Hash)
          child = attr[1][key].to_a.first[0]
          child_text = "#{child.attribute}\n(#{child.object_id})"
        else
          child = attr[1][key]
          child_text = "#{child}\n(#{child.to_s.clone.object_id})"
        end
-        label_text = "#{key} #{type(attr[0].attribute) == :continuous ? attr[0].threshold : ""}"
+        label_text = "#{key} "
+        if type(attr[0].attribute) == :continuous
+          label_text += attr[0].threshold.to_s
+        end

        [parent_text, child_text, label_text]
      end
      attr[1].keys.each { |key| links += build_tree(attr[1][key]) }

-      return links
+      links
    end
  end

@@ -207,42 +207,50 @@ module DecisionTree
    attr_accessor :attributes

    def initialize(attributes, premises = [], conclusion = nil)
-      @attributes, @premises, @conclusion = attributes, premises, conclusion
+      @attributes = attributes
+      @premises = premises
+      @conclusion = conclusion
    end

    def to_s
      str = ''
      @premises.each do |p|
-        str += "#{p.first.attribute} #{p.last} #{p.first.threshold}" if p.first.threshold
-        str += "#{p.first.attribute} = #{p.last}" if !p.first.threshold
+        if p.first.threshold
+          str += "#{p.first.attribute} #{p.last} #{p.first.threshold}"
+        else
+          str += "#{p.first.attribute} = #{p.last}"
+        end
        str += "\n"
      end
      str += "=> #{@conclusion} (#{accuracy})"
    end

    def predict(test)
-      verifies = true;
+      verifies = true
      @premises.each do |p|
-        if p.first.threshold then # Continuous
-          if !(p.last == '>=' && test[@attributes.index(p.first.attribute)] >= p.first.threshold) && !(p.last == '<' && test[@attributes.index(p.first.attribute)] < p.first.threshold) then
-            verifies = false; break
+        if p.first.threshold # Continuous
+          if !(p.last == '>=' && test[@attributes.index(p.first.attribute)] >= p.first.threshold) && !(p.last == '<' && test[@attributes.index(p.first.attribute)] < p.first.threshold)
+            verifies = false
+            break
          end
        else # Discrete
-          if test[@attributes.index(p.first.attribute)] != p.last then
-            verifies = false; break
+          if test[@attributes.index(p.first.attribute)] != p.last
+            verifies = false
+            break
          end
        end
      end
      return @conclusion if verifies
-      return nil
+      nil
    end

    def get_accuracy(data)
-      correct = 0; total = 0
+      correct = 0
+      total = 0
      data.each do |d|
        prediction = predict(d)
        correct += 1 if d.last == prediction
-        total += 1 if !prediction.nil?
+        total += 1 unless prediction.nil?
      end
      (correct.to_f + 1) / (total.to_f + 2)
    end
@@ -256,7 +264,9 @@ module DecisionTree
    attr_accessor :rules

    def initialize(attributes, data, default, type)
-      @attributes, @default, @type = attributes, default, type
+      @attributes = attributes
+      @default = default
+      @type = type
      mixed_data = data.sort_by { rand }
      cut = (mixed_data.size.to_f * 0.67).to_i
      @train_data = mixed_data.slice(0..cut - 1)
@@ -276,8 +286,9 @@ module DecisionTree
        (1..r.premises.size).each do
          acc1 = r.accuracy(data)
          p = r.premises.pop
-          if acc1 > r.get_accuracy(data) then
-            r.premises.push(p); break
+          if acc1 > r.get_accuracy(data)
+            r.premises.push(p)
+            break
          end
        end
      end
@@ -285,7 +296,8 @@ module DecisionTree
    end

    def to_s
-      str = ''; @rules.each { |rule| str += "#{rule}\n\n" }
+      str = ''
+      @rules.each { |rule| str += "#{rule}\n\n" }
      str
    end

@@ -294,15 +306,18 @@ module DecisionTree
        prediction = r.predict(test)
        return prediction, r.accuracy unless prediction.nil?
      end
-      return @default, 0.0
+      [@default, 0.0]
    end
  end

  class Bagging
    attr_accessor :classifiers
    def initialize(attributes, data, default, type)
-      @classifiers, @type = [], type
-      @data, @attributes, @default = data, attributes, default
+      @classifiers = []
+      @type = type
+      @data = data
+      @attributes = attributes
+      @default = default
    end

    def train(data = @data, attributes = @attributes, default = @default)
@@ -320,8 +335,8 @@ module DecisionTree
        predictions[p] += accuracy unless p.nil?
      end
      return @default, 0.0 if predictions.empty?
-      winner = predictions.sort_by {|k,v| -v}.first
-      return winner[0], winner[1].to_f / @classifiers.size.to_f
+      winner = predictions.sort_by { |_k, v| -v }.first
+      [winner[0], winner[1].to_f / @classifiers.size.to_f]
    end
  end
 end