Tuesday, June 30, 2009

Anagram Code

Finally, here is the anagram code. It worked on my tests, but failed to build the anagram trie with the enable word list (which I no longer find a direct link to online). I'm getting a stack overflow error which implies there is a space leak and a bit of investigation turns up that it is somewhere in building the trie. I've tried different ways to force strictness, but to no (so far) avail. But if I run it with a smaller dictionary, or with +RTS -K16M, it works.

Notice that to keep from getting duplicates, the function "anagramsFor" only will produce a new word if it follows (or is equal to) "starter", which is set to empty to start, then to the last word generated. This should eliminate duplicates where anagramsFor generates a list like ["aaa", "bbb", "ccc"] and also one like ["aaa", "ccc", "bbb"].

It runs my (simple) test cases (using small dictionaries).

Here's the code :


{-# LANGUAGE BangPatterns #-}

{- run with +RTS -K16M for enable wordlist -}

import Char
import Data.List
import System
import qualified Data.Map as M
import Control.Monad.State
import Control.Monad.List

data Trie a b = Empty
| Node { entries :: (M.Map a (Trie a b)), ender :: Maybe b }
deriving (Eq, Show)

type AnagramTrie = Trie Char String
data AnagramHelper = Ah { trie :: AnagramTrie, strings :: M.Map String [String]}
deriving Show

type AnagramTree = Trie String [String]

emptyHelper = Ah { trie = Empty, strings = M.empty }

trieInsert :: (Ord a, Show a) => [a] -> b -> Trie a b -> Trie a b

trieInsert [] w Empty = Node {entries=M.empty, ender=Just w}
trieInsert [] w n = n -- should never happen

trieInsert !l1@(x:xs) !l Empty = Node {entries=M.singleton x (trieInsert xs l Empty ), ender=Nothing}

trieInsert !l1@(x:xs) !al !Node{entries=es, ender=en}
| M.member x es = let !st = trieInsert xs al in Node{entries=M.adjust st x es, ender=en}
| otherwise = let !st = trieInsert xs al Empty in Node{entries=M.insert x st es, ender=en}

findInTrie Empty _ = Nothing
findInTrie Node{entries = es, ender=en} [] = en
findInTrie Node{entries=es, ender=en} (c:cs)
| M.member c es = findInTrie (fromJust $ M.lookup c es) cs
| otherwise = Nothing

toString :: (Show a, Show b) => Trie a b -> String -> [String]
toString Empty indent = []
toString Node{entries=es, ender=en} indent = enderLine:subtrees
where
enderLine = case en of
Nothing -> []
Just l -> (indent ++ "-" ++ show l)
doEntry k t = (indent ++ " " ++ show [k]):(toString t (" "++indent))
subtrees = concat $ M.elems $ M.mapWithKey doEntry es

-- extract the paths in a trie (ignoring the entries, if any)
pathsThrough :: Trie a b -> [[a]]
pathsThrough Empty = [[]]
pathsThrough Node{entries=es,ender=en} = l
where
l = concatMap (\(x,y) -> (map (x:)) (pathsThrough y)) $ M.assocs es

allWhitespace s = and $ map isSpace s

makeAnagramHelperFromList l = foldl' ahInsert emptyHelper l

ahInsert !a@Ah{trie=t,strings= s} !word
| M.member sw s = Ah{trie=t, strings = M.insertWith' insertUnique sw [word] s}
| otherwise = Ah{trie=trieInsert sw sw t, strings=M.insert sw [word] s}
where
sw = sort word
insertUnique [w] l = if w `elem` l then l else (w:l)

clean x = filter (not.null) (nub x) -- profiling shows this is a bit faster

-- These are some common lists of words - one per line used here
-- these (obviously) live in my filesystem - you might use /usr/dict/share/words if
-- you're on a unix system

-- wordfile = "words/american-english-large-no-quotes"
-- wordfile = "words/american-english-smaller"
-- wordfile = "words/shorter-wordlist"
-- wordfile = "words/random-words"
-- wordfile = "words/seethe"

strip s = reverse $ dropWhile isSpace $ reverse $ dropWhile isSpace s

allchars w = (not.or) $ map (\x -> x < 'a' || x > 'z') w

-- most dictionaries include all the letters on their own, so
-- filter out all one letter words, but toss "a" and "i" back in
-- also strip the strings as there are some cases where they have trailing spaces
-- sometimes there seem to be duplicates (probably because there are caps in some versions)
-- in this version duplicates are handled by the builder

getWordsFromList l = ("a" :) $ ("i" :) $ filter (\x->(length x) > 1) $ filter allchars $ map strip $ (lines $ map toLower l)

makeAnagramHelperFromFile fn = do
rawWords <- readFile fn
let wordList = getWordsFromList rawWords
return $! makeAnagramHelperFromList wordList

getStringsFromTopWithString :: AnagramTrie -> String -> [String]

getStringsFromTopWithString Empty _ = []
getStringsFromTopWithString _ [] = []
getStringsFromTopWithString topTrie@Node{entries=es, ender=en} s@(c:cs) =
let maybeSubTree = M.lookup c es
in case maybeSubTree of
Nothing -> []
Just thisSubTree -> maybePrepend en $ findAllSubsetStringsInSubTree thisSubTree cs

findAllSubsetStringsInSubTree Empty _ = []
findAllSubsetStringsInSubTree Node{ender=en} [] = maybePrepend en []
findAllSubsetStringsInSubTree t@Node{entries=es, ender=en} (c:cs) =
let next = M.lookup c es -- does this first character in this node have a subtree ?
subs = findAllSubsetStringsInSubTree t $ dropWhile (==c) cs -- no, skip all of those and try the rest
in case next of
Nothing -> subs
Just t' -> maybePrepend en $ findAllSubsetStringsInSubTree t' cs ++ subs

maybePrepend Nothing l = l
maybePrepend (Just x) l = x:l

fromJust (Just l) = l

testWords = ["at", "tear", "rat", "ate", "sea", "teas", "tea", "erase", "tar", "art", "eat", "tears", "stare", "reset", "stares", "star" , "tartar", "seatea", "erasetar", "tee", "see"]

getTestTrie = trie getTestAH

getTestAH = makeAnagramHelperFromList testWords

anagramsFor starter ah@Ah{trie=t, strings=m} [] = Just Empty
anagramsFor starter ah@Ah{trie=t, strings=m} s = anatree
where
s' = sort s
slist = getStringsFromTopWithString t s'
subtreeskeys = filter (\(x,y) -> y /=Nothing) $ map (\x -> (x,anagramsFor x ah (s' \\ x))) slist
subtreeskeys1 = filter (\(x,y) -> x>=starter) subtreeskeys
mkAnaTrieEntries m (x, Nothing) = m
mkAnaTrieEntries m (x, Just y) = M.insert x y m

anatree = if subtreeskeys1 == []
then Nothing
else Just Node{entries=foldl mkAnaTrieEntries M.empty subtreeskeys1, ender=Just s}

simpleTest s = do
let t = anagramsFor [] getTestAH s
t' = fromJust t
mapM_ putStrLn $ toString t' ""
mapM_ putStrLn $ map show $ pathsThrough t'

main = do
(wl, strings) <- getCommandLineArgs
putStrLn $ "Using dictionary : " ++ wl
ah <- makeAnagramHelperFromFile wl
mapM_ (getAnagrams ah) strings

getCommandLineArgs = do
al <- getArgs
if length al < 1
then error "Args : [-dictionary] strings "
else let (s:xs) = al !! 0
in if s == '-'
then return (xs,drop 1 al)
else return ("words/enable.list", al)

listVariants :: AnagramHelper -> [String] -> [[String]]
listVariants Ah{strings=m} sl = map getWords sl
where
getWords s = fromJust $ M.lookup s m

getAnagrams ah s = do
putStrLn $ "getting anagrams for " ++ s
let at = anagramsFor [] ah s
case at of
Nothing -> putStrLn $ "No anagrams for: " ++ s
Just t -> do
mapM_ putStrLn $ toString t ""
let p = pathsThrough t
mapM_ putStrLn $ map show p
let v = map (listVariants ah) p
mapM_ putStrLn $ map show v

No comments: