The exercise taken from programmingpraxis is to generate all anagrams from a phrase, e.g “hey man” -> “may hen” etc. The words must come from a dictionary. My solution has some optimizations in place. The main one: multiple words that are anagrams are treated as one and trivially expanded after the main part of the algorithm completes.

import re
from collections import defaultdict
import functools
import time
import itertools
PAT_SMALL = re.compile(r"[a-z]+$")

def get_letter_count_dict(word):
    d = defaultdict(int)
    for ch in word:
        d[ch] += 1
    return d

def filt(test_word, against):
    t = get_letter_count_dict(test_word)
    for ch in t:
        if ch not in against or t[ch] > against[ch]:
            return False
    return True

def filtered_read(fname, letter_count, length_min):
    f = open(fname)
    words = [ w.rstrip() for w in f.readlines() ]
    words = [ w for w in words if len(w) >= length_min and PAT_SMALL.match(w) and filt(w, letter_count) ]

    print("Using filtered dictionary with %d lowercase words" % len(words))
    return words

def gen_anagrams(word_letter_count, n, words_and_dicts, start_index):
    retv= []
    k = start_index
    n_words = len(words_and_dicts)
    while k < n_words:
        w1 = words_and_dicts[k][0]
        if len(w1) > n:
            k += 1
        new_let_count = words_and_dicts[k][1]
        diff_let_count = word_letter_count.copy()
        good = True
        left = n
        for ch in new_let_count:
            if ch in diff_let_count:
                letters_taken = new_let_count[ch]
                diff_let_count[ch] -= letters_taken
                if diff_let_count[ch] < 0:
                    good = False
                left -= letters_taken
                good = False
        if good:
            if left == 0:
                retv = retv + [ [w1,] ]
                leftover_sol = gen_anagrams(diff_let_count, left, words_and_dicts, k)
                for t in leftover_sol:
                    retv.append([ w1 ] + t)
        k += 1

    return retv

def expand(solutions, sorted2word):
    i = 1
    for sol in solutions:
        for mapped in itertools.product(*(sorted2word[anagram] for anagram in sol)):
            print(i, mapped)
            i += 1

def find_anagrams(word, min_length = 3, do_expand = False):
    ss = "".join(sorted([c for c in word if 'a' <= c <= 'z' ]))
    ss_d = get_letter_count_dict(ss)
    n = len(ss)

    words = filtered_read("wordlists/mwords/74550com.mon", ss_d, min_length)

    # operate on equivalence classes of all anagrams
    # create structures for reverse expansion
    sorted2word = defaultdict(list)
    wordclass = set()
    for w in words:
        sortedletters = "".join(sorted(list(w)))
        sorted2word[sortedletters] += [ w ]

    # order by length ascending, this is used in gen_anagrams
    class_dicts = sorted([ (len(w), w, get_letter_count_dict(w)) for w in wordclass if len(w) <= n  ])
    class_dicts = [ (trip[1], trip[2]) for trip in class_dicts ]

    start_time = time.clock()
    r = gen_anagrams(ss_d, n, class_dicts, 0)
    stop_time = time.clock()
    print("%d unique solutions found in %5.2f seconds" % (len(r), stop_time - start_time))
    if do_expand:
        expand(r, sorted2word)

if __name__ == "__main__":
    find_anagrams("programming praxis", do_expand = True)

About these ads