[commit: haddock] master: Create basic token classification method. (6fb8d5a)

git at git.haskell.org git at git.haskell.org
Wed Jul 8 08:39:38 UTC 2015


Repository : ssh://git@git.haskell.org/haddock

On branch  : master
Link       : http://git.haskell.org/haddock.git/commitdiff/6fb8d5abbcc92f5155fdc9596ca1c87fe87f6187

>---------------------------------------------------------------

commit 6fb8d5abbcc92f5155fdc9596ca1c87fe87f6187
Author: Łukasz Hanuszczak <lukasz.hanuszczak at gmail.com>
Date:   Thu Jun 4 23:21:17 2015 +0200

    Create basic token classification method.


>---------------------------------------------------------------

6fb8d5abbcc92f5155fdc9596ca1c87fe87f6187
 .../src/Haddock/Backends/Hyperlinker/Parser.hs     | 104 +++++++++++++++++++--
 1 file changed, 98 insertions(+), 6 deletions(-)

diff --git a/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs b/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs
index 4e0d738..be6b7ce 100644
--- a/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs
+++ b/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs
@@ -20,11 +20,18 @@ data Span = Span
     }
 
 data TokenType
-    = Identifier
-    | Comment
-    | Whitespace
-    | Operator
-    | Symbol
+    = TkIdentifier
+    | TkKeyword
+    | TkString
+    | TkChar
+    | TkNumber
+    | TkOperator
+    | TkGlyph
+    | TkSpecial
+    | TkSpace
+    | TkComment
+    | TkCpp
+    | TkUnknown
 
 parse :: String -> [Token]
 parse = tokenize . tag . chunk
@@ -66,4 +73,89 @@ tag =
         in (pos', (Span pos pos', c):cs)
 
 tokenize :: [(Span, String)] -> [Token]
-tokenize = undefined
+tokenize =
+    map aux
+  where
+    aux (sp, str) = Token
+        { tkType = classify str
+        , tkValue = str
+        , tkSpan = sp
+        }
+
+classify :: String -> TokenType -- token class derived from a chunk's text
+classify str@(c:_) -- first: decisions that need only the leading characters
+    | "{-" `isPrefixOf` str = TkComment -- before 'special', which contains '{'
+    | isSpace c = TkSpace
+    | isDigit c = TkNumber
+    | c `elem` special = TkSpecial
+    | c == '#' = TkCpp
+    | c == '"' = TkString
+    | c == '\'' = TkChar
+classify str -- reached when no guard above matched, or for the empty string
+    | str `elem` keywords = TkKeyword
+    | str `elem` glyphs = TkGlyph
+    | all (`elem` symbols) str = TkOperator
+    | "--" `isPrefixOf` str = TkComment
+    | isIdentifier str = TkIdentifier
+    | otherwise = TkUnknown
+
+keywords :: [String] -- whole-token strings that 'classify' maps to 'TkKeyword'
+keywords =
+    [ "as"
+    , "case"
+    , "class"
+    , "data"
+    , "default"
+    , "deriving"
+    , "do"
+    , "else"
+    , "hiding"
+    , "if"
+    , "import"
+    , "in"
+    , "infix"
+    , "infixl"
+    , "infixr"
+    , "instance"
+    , "let"
+    , "module"
+    , "newtype"
+    , "of"
+    , "qualified"
+    , "then"
+    , "type"
+    , "where"
+    , "forall"
+    , "mdo"
+    ]
+
+glyphs :: [String] -- whole-token strings that 'classify' maps to 'TkGlyph'
+glyphs = -- 'classify' tests these before its generic operator check
+    [ ".."
+    , ":"
+    , "::"
+    , "="
+    , "\\"
+    , "|"
+    , "<-"
+    , "->"
+    , "@"
+    , "~"
+    , "~#"
+    , "=>"
+    , "-"
+    , "!"
+    ]
+
+special :: [Char] -- a token whose first character is listed here is 'TkSpecial'
+special = "()[]{},;`"
+
+-- TODO: Add support for any Unicode symbol or punctuation.
+-- source: http://stackoverflow.com/questions/10548170/what-characters-are-permitted-for-haskell-operators
+symbols :: [Char] -- a token made up only of these characters is 'TkOperator'
+symbols = "!#$%&*+./<=>?@\\^|-~:"
+
+isIdentifier :: String -> Bool -- True for a letter followed by alphanumerics or primes
+isIdentifier (c:str) -- the prime test must inspect each tail character c', not the head c
+    | isLetter c = all (\c' -> isAlphaNum c' || c' == '\'') str
+isIdentifier _ = False



More information about the ghc-commits mailing list