Notice that to keep from getting duplicates, the function "anagramsFor" only will produce a new word if it follows (or is equal to) "starter", which is set to empty to start, then to the last word generated. This should eliminate duplicates where anagramsFor generates a list like ["aaa", "bbb", "ccc"] and also one like ["aaa", "ccc", "bbb"].
It runs my (simple) test cases (using small dictionaries).
Here's the code :
{-# LANGUAGE BangPatterns #-}
{- run with +RTS -K16M for enable wordlist -}
import Char
import Data.List
import System
import qualified Data.Map as M
import Control.Monad.State
import Control.Monad.List
data Trie a b = Empty
| Node { entries :: (M.Map a (Trie a b)), ender :: Maybe b }
deriving (Eq, Show)
type AnagramTrie = Trie Char String
data AnagramHelper = Ah { trie :: AnagramTrie, strings :: M.Map String [String]}
deriving Show
type AnagramTree = Trie String [String]
emptyHelper = Ah { trie = Empty, strings = M.empty }
trieInsert :: (Ord a, Show a) => [a] -> b -> Trie a b -> Trie a b
trieInsert [] w Empty = Node {entries=M.empty, ender=Just w}
trieInsert [] w n = n -- should never happen
trieInsert !l1@(x:xs) !l Empty = Node {entries=M.singleton x (trieInsert xs l Empty ), ender=Nothing}
trieInsert !l1@(x:xs) !al !Node{entries=es, ender=en}
| M.member x es = let !st = trieInsert xs al in Node{entries=M.adjust st x es, ender=en}
| otherwise = let !st = trieInsert xs al Empty in Node{entries=M.insert x st es, ender=en}
findInTrie Empty _ = Nothing
findInTrie Node{entries = es, ender=en} [] = en
findInTrie Node{entries=es, ender=en} (c:cs)
| M.member c es = findInTrie (fromJust $ M.lookup c es) cs
| otherwise = Nothing
toString :: (Show a, Show b) => Trie a b -> String -> [String]
toString Empty indent = []
toString Node{entries=es, ender=en} indent = enderLine:subtrees
where
enderLine = case en of
Nothing -> []
Just l -> (indent ++ "-" ++ show l)
doEntry k t = (indent ++ " " ++ show [k]):(toString t (" "++indent))
subtrees = concat $ M.elems $ M.mapWithKey doEntry es
-- extract the paths in a trie (ignoring the entries, if any)
pathsThrough :: Trie a b -> [[a]]
pathsThrough Empty = [[]]
pathsThrough Node{entries=es,ender=en} = l
where
l = concatMap (\(x,y) -> (map (x:)) (pathsThrough y)) $ M.assocs es
allWhitespace s = and $ map isSpace s
makeAnagramHelperFromList l = foldl' ahInsert emptyHelper l
ahInsert !a@Ah{trie=t,strings= s} !word
| M.member sw s = Ah{trie=t, strings = M.insertWith' insertUnique sw [word] s}
| otherwise = Ah{trie=trieInsert sw sw t, strings=M.insert sw [word] s}
where
sw = sort word
insertUnique [w] l = if w `elem` l then l else (w:l)
clean x = filter (not.null) (nub x) -- profiling shows this is a bit faster
-- These are some common lists of words - one per line used here
-- these (obviously) live in my filesystem - you might use /usr/dict/share/words if
-- you're on a unix system
-- wordfile = "words/american-english-large-no-quotes"
-- wordfile = "words/american-english-smaller"
-- wordfile = "words/shorter-wordlist"
-- wordfile = "words/random-words"
-- wordfile = "words/seethe"
strip s = reverse $ dropWhile isSpace $ reverse $ dropWhile isSpace s
allchars w = (not.or) $ map (\x -> x < 'a' || x > 'z') w
-- most dictionaries include all the letters on their own, so
-- filter out all one letter words, but toss "a" and "i" back in
-- also strip the strings as there are some cases where they have trailing spaces
-- sometimes there seem to be duplicates (probably because there are caps in some versions)
-- in this version duplicates are handled by the builder
getWordsFromList l = ("a" :) $ ("i" :) $ filter (\x->(length x) > 1) $ filter allchars $ map strip $ (lines $ map toLower l)
makeAnagramHelperFromFile fn = do
rawWords <- readFile fn
let wordList = getWordsFromList rawWords
return $! makeAnagramHelperFromList wordList
getStringsFromTopWithString :: AnagramTrie -> String -> [String]
getStringsFromTopWithString Empty _ = []
getStringsFromTopWithString _ [] = []
getStringsFromTopWithString topTrie@Node{entries=es, ender=en} s@(c:cs) =
let maybeSubTree = M.lookup c es
in case maybeSubTree of
Nothing -> []
Just thisSubTree -> maybePrepend en $ findAllSubsetStringsInSubTree thisSubTree cs
findAllSubsetStringsInSubTree Empty _ = []
findAllSubsetStringsInSubTree Node{ender=en} [] = maybePrepend en []
findAllSubsetStringsInSubTree t@Node{entries=es, ender=en} (c:cs) =
let next = M.lookup c es -- does this first character in this node have a subtree ?
subs = findAllSubsetStringsInSubTree t $ dropWhile (==c) cs -- no, skip all of those and try the rest
in case next of
Nothing -> subs
Just t' -> maybePrepend en $ findAllSubsetStringsInSubTree t' cs ++ subs
maybePrepend Nothing l = l
maybePrepend (Just x) l = x:l
fromJust (Just l) = l
testWords = ["at", "tear", "rat", "ate", "sea", "teas", "tea", "erase", "tar", "art", "eat", "tears", "stare", "reset", "stares", "star" , "tartar", "seatea", "erasetar", "tee", "see"]
getTestTrie = trie getTestAH
getTestAH = makeAnagramHelperFromList testWords
anagramsFor starter ah@Ah{trie=t, strings=m} [] = Just Empty
anagramsFor starter ah@Ah{trie=t, strings=m} s = anatree
where
s' = sort s
slist = getStringsFromTopWithString t s'
subtreeskeys = filter (\(x,y) -> y /=Nothing) $ map (\x -> (x,anagramsFor x ah (s' \\ x))) slist
subtreeskeys1 = filter (\(x,y) -> x>=starter) subtreeskeys
mkAnaTrieEntries m (x, Nothing) = m
mkAnaTrieEntries m (x, Just y) = M.insert x y m
anatree = if subtreeskeys1 == []
then Nothing
else Just Node{entries=foldl mkAnaTrieEntries M.empty subtreeskeys1, ender=Just s}
simpleTest s = do
let t = anagramsFor [] getTestAH s
t' = fromJust t
mapM_ putStrLn $ toString t' ""
mapM_ putStrLn $ map show $ pathsThrough t'
main = do
(wl, strings) <- getCommandLineArgs
putStrLn $ "Using dictionary : " ++ wl
ah <- makeAnagramHelperFromFile wl
mapM_ (getAnagrams ah) strings
getCommandLineArgs = do
al <- getArgs
if length al < 1
then error "Args : [-dictionary] strings "
else let (s:xs) = al !! 0
in if s == '-'
then return (xs,drop 1 al)
else return ("words/enable.list", al)
listVariants :: AnagramHelper -> [String] -> [[String]]
listVariants Ah{strings=m} sl = map getWords sl
where
getWords s = fromJust $ M.lookup s m
getAnagrams ah s = do
putStrLn $ "getting anagrams for " ++ s
let at = anagramsFor [] ah s
case at of
Nothing -> putStrLn $ "No anagrams for: " ++ s
Just t -> do
mapM_ putStrLn $ toString t ""
let p = pathsThrough t
mapM_ putStrLn $ map show p
let v = map (listVariants ah) p
mapM_ putStrLn $ map show v