[commit: haddock] master: Add some documentation for parser module of source hyperlinker. (416c384)
git at git.haskell.org
Wed Jul 8 08:41:08 UTC 2015
Repository : ssh://git@git.haskell.org/haddock
On branch : master
Link : http://git.haskell.org/haddock.git/commitdiff/416c384a981593005c9c6bf87ac27b7c2f9b8695
>---------------------------------------------------------------
commit 416c384a981593005c9c6bf87ac27b7c2f9b8695
Author: Łukasz Hanuszczak <lukasz.hanuszczak at gmail.com>
Date: Sun Jun 21 23:48:03 2015 +0200
Add some documentation for parser module of source hyperlinker.
>---------------------------------------------------------------
416c384a981593005c9c6bf87ac27b7c2f9b8695
.../src/Haddock/Backends/Hyperlinker/Parser.hs | 39 ++++++++++++++++++++++
1 file changed, 39 insertions(+)
diff --git a/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs b/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs
index 7f40816..6e195db 100644
--- a/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs
+++ b/haddock-api/src/Haddock/Backends/Hyperlinker/Parser.hs
@@ -40,9 +40,20 @@ data TokenType
| TkUnknown
deriving (Eq)
+-- | Turn a source code string into a stream of more descriptive tokens.
+--
+-- The result should retain the original file layout (including comments,
+-- whitespace, etc.), i.e. the following "law" should hold:
+--
+-- @concat . map 'tkValue' . 'parse' = id@
parse :: String -> [Token]
parse = tokenize . tag . chunk
+-- | Split a raw source string into more meaningful chunks.
+--
+-- This is the initial stage of the tokenization process. Each chunk is either
+-- a comment (including its delimiters), a whitespace string, a preprocessor
+-- macro (up to the end of its line), or a valid Haskell lexeme.
chunk :: String -> [String]
chunk [] = []
chunk str@(c:_)
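The layout-preserving law documented above lends itself to a quick mechanical
check. A minimal sketch, assuming 'parse', 'Token' and 'tkValue' are exported
from the Parser module; the property name and the QuickCheck harness are
illustrative only, not part of this patch:

    import Haddock.Backends.Hyperlinker.Parser (Token (..), parse)
    import Test.QuickCheck (quickCheck)

    -- Re-concatenating the token values must reproduce the input exactly,
    -- i.e. concat . map tkValue . parse = id.
    prop_layoutPreserved :: String -> Bool
    prop_layoutPreserved src = concatMap tkValue (parse src) == src

    main :: IO ()
    main = quickCheck prop_layoutPreserved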
@@ -56,6 +67,11 @@ chunk str
where
chunk' (c, rest) = c:(chunk rest)
+-- | Split input to "first line" string and the rest of it.
+--
+-- Ideally, this should be done simply with @'break' (== '\n')@. However,
+-- Haskell also allows line-unbreaking (or whatever it is called) so things
+-- are not as simple and this function deals with that.
spanToNewline :: String -> (String, String)
spanToNewline [] = ([], [])
spanToNewline ('\\':'\n':str) =
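To make the line-continuation case concrete, a hypothetical GHCi session; the
outputs are assumptions based on the description above, since the clause
handling a plain newline is not visible in this hunk:

    ghci> spanToNewline "#define FOO \\\n  1\nbar"
    ("#define FOO \\\n  1\n","bar")
    ghci> break (== '\n') "#define FOO \\\n  1\nbar"
    ("#define FOO \\","\n  1\nbar")

The plain 'break' stops at the escaped newline and would cut the continued
macro in half, while spanToNewline carries the backslash-newline pair along
and only stops at a real end of line.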
@@ -66,6 +82,16 @@ spanToNewline (c:str) =
let (str', rest) = spanToNewline str
in (c:str', rest)
+-- | Split the input into a whitespace string, an (optional) preprocessor
+-- directive, and the rest.
+--
+-- Again, something like @'span' 'isSpace'@ would be a nice way to chunk off
+-- the whitespace. The problem is the /#/ symbol: if it appears at the very
+-- beginning of a line, it should be recognized as a preprocessor macro; in
+-- any other position it is an ordinary Haskell symbol and can be used to
+-- declare operators. Hence, while consuming whitespace we also check whether
+-- a /#/ symbol appears directly after a newline character; if so, we treat
+-- the whole line as a preprocessor macro.
spanSpaceOrCpp :: String -> (String, Maybe String, String)
spanSpaceOrCpp ('\n':'#':str) =
let (str', rest) = spanToNewline str
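Two hypothetical inputs make the /#/ rule above concrete; the results are
described informally in comments, since the body of the preprocessor branch is
partly elided between these hunks:

    -- Input "\n#define FOO 1\nx": the '#' follows a newline, so the whole
    -- "#define FOO 1" line ends up in the Just preprocessor component.
    --
    -- Input "infixl 5 #": the '#' is not at the start of a line, so the
    -- preprocessor component is Nothing and the '#' stays in the remainder,
    -- to be lexed later as an ordinary symbol.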
@@ -76,6 +102,10 @@ spanSpaceOrCpp (c:str')
in (c:space, mcpp, rest)
spanSpaceOrCpp str = ("", Nothing, str)
+-- | Split the input into comment content (including delimiters) and the rest.
+--
+-- Again, a bit more logic than a simple 'span' is required because Haskell
+-- allows block comments to nest.
chunkComment :: Int -> String -> (String, String)
chunkComment _ [] = ("", "")
chunkComment depth ('{':'-':str) =
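A hypothetical GHCi call shows why the nesting counter matters; the output is
an assumption based on the description, as the closing-delimiter clause and
the initial depth convention are not visible in this hunk:

    ghci> chunkComment 0 "{- outer {- inner -} still outer -} rest"
    ("{- outer {- inner -} still outer -}"," rest")

A naive scan for the first "-}" would close the comment right after "inner"
and leave the tail of the outer comment in the remainder.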
@@ -90,6 +120,7 @@ chunkComment depth (e:str) =
let (c, rest) = chunkComment depth str
in (e:c, rest)
+-- | Assign a source location to each chunk in the given stream.
tag :: [String] -> [(Span, String)]
tag =
reverse . snd . foldl aux (Position 1 1, [])
@@ -100,6 +131,7 @@ tag =
move pos '\n' = pos { posRow = posRow pos + 1, posCol = 1 }
move pos _ = pos { posCol = posCol pos + 1 }
+-- | Turn the unclassified chunk stream into a more descriptive token stream.
tokenize :: [(Span, String)] -> [Token]
tokenize =
map aux
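To illustrate the position bookkeeping in 'tag' (and hence the spans that
'tokenize' attaches), a hypothetical walk-through; the spans are described
informally because the Span constructor itself is not visible in these hunks:

    -- tag ["foo", " = ", "1\n", "bar"] pairs each chunk with a span computed
    -- from the running position (illustrative values):
    --
    --   "foo"  starts at row 1, column 1
    --   " = "  starts at row 1, column 4
    --   "1\n"  starts at row 1, column 7
    --   "bar"  starts at row 2, column 1  -- '\n' bumped the row, reset the column
    --
    -- tokenize then wraps each (span, value) pair in a Token record, presumably
    -- classifying the value with 'classify' in the part of aux elided here.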
@@ -110,6 +142,13 @@ tokenize =
, tkSpan = sp
}
+-- | Classify the given string as the appropriate Haskell token.
+--
+-- This function is based on the lexical structure description in the
+-- Haskell 98 Report: https://www.haskell.org/onlinereport/lexemes.html
+--
+-- However, it is probably far from perfect and most likely does not handle
+-- all corner cases correctly.
classify :: String -> TokenType
classify str
| "--" `isPrefixOf` str = TkComment