[Haskell-cafe] Parse text difficulty

Thu Dec 9 04:18:56 EST 2004

Hi Everyone

My first post to the mailing list is a cry for help.  Apologies for that.
I've seen an example of how this is done in the archives but I'm
afraid I'm a bit more behind than the person who seemed to understand
the answer so if someone could help me??

The problem is this:

I've show(n) a particular data type and it shows up as:
[([2,6],"British"),([1],"Charles"),([1,8],"Clarke"),([2,6],"Council"),([2],"Edinburgh"),([1],"Education"),([4],"Increasingly")]

What I want to do is format that nicely into a table.
The best way of doing (I thought) was to:
Remove the first "[(" and final ")]"
Then replace "),(" with a newline(\n)

Which would give: 
[2,6],"British"
[1],"Charles"
[1,8],"Clarke"
[2,6],"Council"
......etc

I get the impression I may find it easier adding newlines earlier on
in my program but I thought this may be the easiest way.  I'll include
all the code for the whole program in case it helps to see where I'm
coming from.  It takes an input file of text and outputs an index to
an output file.  My soul question and drive is to lay out the index in
a nicely formatted fashion.  Any help would be very much appreciated.

module TextProc where    
-- import Prelude hiding (Word)
import IO
import List
type Word = String			-- define types
type Line = String
type Doc = String

start :: IO ()
start
 = do
   putStrLn "******** Enter Choice *********"
   putStrLn "1. Enter Input and Output files"
   putStrLn "2. Exit"
   putStrLn "*******************************"
   choice <- getLine
   if (choice == "1")
       then
       ( do
         putStrLn "Type input file name:"
         fileNameI <- getLine
         text <- readFile fileNameI
         putStrLn "Type output file name:"
         fileNameO <- getLine
         writeFile fileNameO (makeIndex text)
       )
       else
       ( do
         return()
       )

makeIndex :: Doc -> Doc  -- changed so output can be written to file
makeIndex 
 = show .
   shorten .    -- [([Int], Word)] -> [([Int], Word)] 
   amalgamate . -- [([Int], Word)] -> [([Int], Word)]
   makeLists .  -- [(Int, Word)]   -> [([Int], Word)]
   sortLs .     -- [(Int, Word)]   -> [(Int, Word)]
   allNumWords . -- [(Int, Line)]   -> [(Int, Word)]
   numLines .   -- [Line]          -> [(Int, Line)]
   splitUp     -- Doc             -> [Line]

splitUp :: Doc -> [Line]
splitUp [] = [] 
splitUp ls 
 = takeWhile (/='\n') ls :           -- first line
   (splitUp .                        -- split up other line
    dropWhile (=='\n') .             -- delete 1st newLine(s)
    dropWhile (/='\n')) ls           -- other lines

numLines :: [Line] -> [(Int, Line)]
numLines lines                       -- list of pairs of                
 = zip [1 .. length lines] lines     -- line no. & line

splitWords :: Line -> [Word]         -- split up lines into words
splitWords [] = [] 
splitWords line
 = takeWhile isLetter line :         -- first word in line
        (splitWords .                -- split other words  
         dropWhile (not.isLetter) .   -- delete separators
         dropWhile isLetter) line    -- other words

   where 
    isLetter ch
        =    ('a' <= ch) && (ch <= 'z')
          || ('A' <= ch) && (ch <= 'Z')
          || ('-' == ch)

numWords :: (Int, Line) -> [(Int, Word)] -- attach line no. to each word
numWords (number, line) 
 = map addLineNum (splitWords line)  -- all line pairs
   where 
    addLineNum word = (number, word) -- a pair

allNumWords :: [(Int, Line)] -> [(Int, Word)]
allNumWords = concat . map numWords -- doc pairs

sortLs :: [(Int, Word)] -> [(Int, Word)]
sortLs [ ] = [ ]
sortLs (a:x)
 = sortLs [b | b <- x, compare b a] -- sort 1st half
 ++ [a] ++                          -- 1st in middle 
 sortLs [b | b <- x, compare a b]   -- sort 2nd half
  where 
   compare (n1, w1) (n2, w2) 
    = (w1 < w2)                     -- 1st word less 
      || (w1 == w2 && n1 < n2)      -- check no.

makeLists :: [(Int, Word)] -> [([Int], Word)]
makeLists 
 = map mk                           -- all pairs 
   where mk (num, word) = ([num], word)
                                    -- list of single no.

amalgamate :: [([Int], Word)] -> [([Int], Word)]
amalgamate [ ] = [ ]
amalgamate [a] = [a]
amalgamate ((n1, w1) : (n2, w2) : rest) -- pairs of pairs 
 | w1 /= w2   = (n1, w1) : amalgamate ((n2, w2) : rest) 
 | otherwise = amalgamate ((n1 ++ n2, w1) : rest) 
                                        -- if words are same grow list
of numbers

shorten :: [([Int], Word)] -> [([Int], Word)]
shorten 
 = filter long                          -- keep pairs >4 
   where 
   long (num, word) = length word > 4    -- check word >4