Huffman Encoder (#123)

回答

# Ruby Quiz Huffman Encoder #123

module Enumerable
   def stable_sort_by(&block)
      i = 0
      self.sort_by {|v| [block.call(v), i+=1]}
   end
end

class HuffmanEncoder
   def initialize(data)
      freq = Hash.new
      data.split(//).each do |c|
         freq[c] = 0 unless freq[c]
         freq[c] += 1
      end
      @text2code = make_mapping(make_tree(freq.to_a))
      @code2text = Hash[*(@text2code.to_a.map{|pair| [pair[1], pair[0]]}.flatten)]
      @code_re = Regexp.union(*@code2text.keys)
   end
   attr_reader :mapping

   def encode(text)
      text.split(//).inject("") do |r, t|
         r << @text2code[t]
      end
   end
   # return text and rest code
   def decode(code)
      text = ""
      loop do
         c = code.slice!(@code_re)
         break if not c
         text << @code2text[c]
      end
      [text, code]
   end
   private
   def make_tree(freq)
      tree = freq.stable_sort_by {|f| f[0] }
      loop do
         break unless tree[1]
         tree.stable_sort_by {|f| f[1] }
         min1, min2 = tree.slice! 0,2
         tree << [min1[0] + min2[0], min1[1] + min2[1], min1, min2]
      end
      tree[0]
   end

   def make_mapping(tree)
      map = Hash.new
      tree[0].split(//).each {|c| map[c] = "" }
      make_mapping1(tree, map)
   end
   def make_mapping1(tree, map)
      if tree.length == 2
         map
      else
         tree[2][0].split(//).each {|c| map[c] << "0" }
         map = make_mapping1(tree[2], map)
         tree[3][0].split(//).each {|c| map[c] << "1" }
         map = make_mapping1(tree[3], map)
      end
      map
   end
end

def show_code(code)
   puts "Encoded:"
   formated_code = code.scan(/\d{0,40}/).map do |line|
      line.scan(/\d{0,8}/).join(" ")
   end.join("\n")
   puts formated_code
   puts "Dncoded Bytes:"
   d,m = code.length.divmod(8)
   d += 1if m != 0
   puts d
end

example = HuffmanEncoder.new(ARGV.join(" "))
code = example.encode(ARGV.join(" "))
show_code code
puts example.decode(code)

仕様のあいまいさ

この問題、文字列と出現頻度のペアからハフマン木を作成するときに、どの順番で木を作るかによって符号が変わってくる。
僕の実装では、

  • 同じ出現頻度の文字列は辞書順
  • 統合した枝の方は同じ文字数でも常に早生まれが軽い

で作った。