Files
  • main.py
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
""" This program* creates a Huffman code for the input text.  Note that if you're using an example from a book, the tree might not be the same if there are ties in ranking/frequency, and in some cases the branches (i.e. 0s and 1s) will be reversed, but the total number of bits used will be the same.  
[* which I wrote from first principles without any reference to other algorithms, so there may be better ways to do it!]"""

# space out tree so that children of node n
# are at positions 2n and 2n+1
def treeify(n):
  # insertion point (i.e. index) for any padding
  global insert
  # while the list isn't long enough for the
  # children to be in the right place...
  while (n*2 + 1) >= len(tree):
    # ...then pad it with blanks
    tree.insert(insert,"")
  # if the two child nodes concatenated don't give
  # you the parent node then they're not in the
  # right place, so...
  while tree[n] != tree[n*2] + tree[n*2 + 1] and tree[n] != tree[n*2 + 1] + tree[n*2]:
    # ...pad out the list with blanks
    tree.insert(insert,"")
  # ensure the next insertion point is after
  # nodes that are now in the right place
  if insert <= n*2 + 1:
    insert = n*2 + 2

# return the code for a character
def codify(char):
  # start at the root node
  pos = 1
  code = ""
  # while we haven't found the letter
  while tree[pos] != char:
    # is the letter in the left child?
    if char in tree[pos*2]:
      # add a 1 to the code
      code += "1"
      # go to the left child
      pos = pos * 2
    else:
      # add a zero to the code
      code += "0"
      # go to the right child
      pos = pos * 2 + 1
  return code

text = ""
while text == "":
  text = input("\nText to encode: ")
# use a dictionary to count the letter frequencies
alphabet = {}
for char in text:
  # if the character has been found before...
  if char in alphabet:
    # ...increment the count
    alphabet[char] = alphabet[char] + 1
  else:
    # ...else add it to the dictionary
    alphabet[char] = 1
# take a copy of the characters to use later
letters = (list(alphabet))

# tree is the list representation of the tree
# tree[1] is the root, so tree[0] is padding
tree = ["",""]
# the items in the dictionary are combined until
# there is only one left, which is the root
if len(alphabet) == 1:
  tree.append(min(alphabet))
while len(alphabet) > 1:
  # find the item with the lowest frequency...
  key = min(alphabet, key=alphabet.get)
  # ...store it and its frequency...
  value = alphabet[key]
  # ...remove it from the dictionary...
  alphabet.pop(key)
  # ...and add it to the tree below the root
  tree.insert(2,key)
  # find the item with the next lowest freq...
  key2 = min(alphabet, key=alphabet.get)
  # ...concatenate it with the previous one...
  key += key2
  # ...and add their frequecies...
  value += alphabet[key2]
  # ...also removing it from the dictionary
  alphabet.pop(key2)
  # ...and adding the new combined item and freq.
  alphabet[key] = value
  # add this next least frequent item to the tree
  tree.insert(2,key2)

# turn list into a binary tree
# we know the root is in the right place, so
# start at the first left child (node 2)...
current = 2
# ...padding at index 4 onwards
insert = 4
# loop through the values in the list making
# sure their children are in the right place
while current < len(tree):
  if len(tree[current]) > 1:
    treeify(current)
  current += 1
  
# display the results
print("\nList representation of the tree:")
print(tree)
# display the codes
print("\nCodes:")
# loop through the unique characters in the text
for char in letters:
  print("-",char,"is",codify(char))
# use the tree to code the text
print("\n"+text,"would be represented as:")
# string to store the output
huffman = ""
for char in text:
  huffman += codify(char)
print(huffman)
saving = str(100 - (int(len(huffman)/(len(text)*7)*100))) + "%."
print("\nThat's",len(huffman),"bits using Huffman coding, compared with",len(text),"x 7 =",len(text)*7,"bits using ASCII - a saving of",saving)