[commit: ghc] master: Improve StringBuffer and FastString docs (21dde81)

Sun Dec 18 01:02:15 UTC 2016

Repository : ssh://git@git.haskell.org/ghc

On branch  : master
Link       : http://ghc.haskell.org/trac/ghc/changeset/21dde8126d615a082648c916a3e20d9878f22517/ghc

>---------------------------------------------------------------

commit 21dde8126d615a082648c916a3e20d9878f22517
Author: Phil Ruffwind <rf at rufflewind.com>
Date:   Sat Dec 17 18:09:06 2016 -0500

    Improve StringBuffer and FastString docs
    
    This area of code contains a lot of unsafe functionality, so it might be
    worth documenting to reduce the risk of misuse.
    
    Test Plan: inspection
    
    Reviewers: austin, bgamari
    
    Reviewed By: bgamari
    
    Subscribers: thomie
    
    Differential Revision: https://phabricator.haskell.org/D2872


>---------------------------------------------------------------

21dde8126d615a082648c916a3e20d9878f22517
 compiler/utils/FastString.hs   | 13 +++++++++++++
 compiler/utils/StringBuffer.hs | 42 ++++++++++++++++++++++++++++++++++++++----
 2 files changed, 51 insertions(+), 4 deletions(-)

diff --git a/compiler/utils/FastString.hs b/compiler/utils/FastString.hs
index 8f76584..8d1bbb5 100644
--- a/compiler/utils/FastString.hs
+++ b/compiler/utils/FastString.hs
@@ -23,6 +23,10 @@
 --   * Outputing them is fast.
 --   * Generated by 'sLit'.
 --   * Turn into 'Outputable.SDoc' with 'Outputable.ptext'
+--   * Requires manual memory management.
+--     Improper use may lead to memory leaks or dangling pointers.
+--   * It assumes Latin-1 as the encoding, therefore it cannot represent
+--     arbitrary Unicode strings.
 --
 -- Use 'LitString' unless you want the facilities of 'FastString'.
 module FastString
@@ -560,14 +564,19 @@ hPutFS handle fs = BS.hPut handle $ fastStringToByteString fs
 -- -----------------------------------------------------------------------------
 -- LitStrings, here for convenience only.
 
+-- | A 'LitString' is a pointer to some null-terminated array of bytes.
 type LitString = Ptr Word8
 --Why do we recalculate length every time it's requested?
 --If it's commonly needed, we should perhaps have
 --data LitString = LitString {-#UNPACK#-}!Addr# {-#UNPACK#-}!Int#
 
+-- | Wrap an unboxed address into a 'LitString'.
 mkLitString# :: Addr# -> LitString
 mkLitString# a# = Ptr a#
 
+-- | Encode a 'String' into a newly allocated 'LitString' using Latin-1
+-- encoding.  The original string must not contain non-Latin-1 characters
+-- (above codepoint @0xff@).
 {-# INLINE mkLitString #-}
 mkLitString :: String -> LitString
 mkLitString s =
@@ -583,9 +592,13 @@ mkLitString s =
    return p
  )
 
+-- | Decode a 'LitString' back into a 'String' using Latin-1 encoding.
+-- This does not free the memory associated with 'LitString'.
 unpackLitString :: LitString -> String
 unpackLitString (Ptr p) = unpackCString# p
 
+-- | Compute the length of a 'LitString', which must necessarily be
+-- null-terminated.
 lengthLS :: LitString -> Int
 lengthLS = ptrStrLength
 
diff --git a/compiler/utils/StringBuffer.hs b/compiler/utils/StringBuffer.hs
index 7da9f6c..bac752a 100644
--- a/compiler/utils/StringBuffer.hs
+++ b/compiler/utils/StringBuffer.hs
@@ -89,6 +89,8 @@ instance Show StringBuffer where
 -- -----------------------------------------------------------------------------
 -- Creation / Destruction
 
+-- | Read a file into a 'StringBuffer'.  The resulting buffer is automatically
+-- managed by the garbage collector.
 hGetStringBuffer :: FilePath -> IO StringBuffer
 hGetStringBuffer fname = do
    h <- openBinaryFile fname ReadMode
@@ -161,6 +163,8 @@ appendStringBuffers sb1 sb2
           calcLen sb = len sb - cur sb
           size =  sb1_len + sb2_len
 
+-- | Encode a 'String' into a 'StringBuffer' as UTF-8.  The resulting buffer
+-- is automatically managed by the garbage collector.
 stringToStringBuffer :: String -> StringBuffer
 stringToStringBuffer str =
  unsafePerformIO $ do
@@ -175,10 +179,15 @@ stringToStringBuffer str =
 -- -----------------------------------------------------------------------------
 -- Grab a character
 
--- Getting our fingers dirty a little here, but this is performance-critical
+-- | Return the first UTF-8 character of a nonempty 'StringBuffer' and as well
+-- the remaining portion (analogous to 'Data.List.uncons').  __Warning:__ The
+-- behavior is undefined if the 'StringBuffer' is empty.  The result shares
+-- the same buffer as the original.  Similar to 'utf8DecodeChar', if the
+-- character cannot be decoded as UTF-8, '\0' is returned.
 {-# INLINE nextChar #-}
 nextChar :: StringBuffer -> (Char,StringBuffer)
 nextChar (StringBuffer buf len (I# cur#)) =
+  -- Getting our fingers dirty a little here, but this is performance-critical
   inlinePerformIO $ do
     withForeignPtr buf $ \(Ptr a#) -> do
         case utf8DecodeChar# (a# `plusAddr#` cur#) of
@@ -186,6 +195,10 @@ nextChar (StringBuffer buf len (I# cur#)) =
              let cur' = I# (cur# +# nBytes#) in
              return (C# c#, StringBuffer buf len cur')
 
+-- | Return the first UTF-8 character of a nonempty 'StringBuffer' (analogous
+-- to 'Data.List.head').  __Warning:__ The behavior is undefined if the
+-- 'StringBuffer' is empty.  Similar to 'utf8DecodeChar', if the character
+-- cannot be decoded as UTF-8, '\0' is returned.
 currentChar :: StringBuffer -> Char
 currentChar = fst . nextChar
 
@@ -200,29 +213,50 @@ prevChar (StringBuffer buf _   cur) _     =
 -- -----------------------------------------------------------------------------
 -- Moving
 
+-- | Return a 'StringBuffer' with the first UTF-8 character removed (analogous
+-- to 'Data.List.tail').  __Warning:__ The behavior is undefined if the
+-- 'StringBuffer' is empty.  The result shares the same buffer as the
+-- original.
 stepOn :: StringBuffer -> StringBuffer
 stepOn s = snd (nextChar s)
 
-offsetBytes :: Int -> StringBuffer -> StringBuffer
+-- | Return a 'StringBuffer' with the first @n@ bytes removed.  __Warning:__
+-- If there aren't enough characters, the returned 'StringBuffer' will be
+-- invalid and any use of it may lead to undefined behavior.  The result
+-- shares the same buffer as the original.
+offsetBytes :: Int                      -- ^ @n@, the number of bytes
+            -> StringBuffer
+            -> StringBuffer
 offsetBytes i s = s { cur = cur s + i }
 
+-- | Compute the difference in offset between two 'StringBuffer's that share
+-- the same buffer.  __Warning:__ The behavior is undefined if the
+-- 'StringBuffer's use separate buffers.
 byteDiff :: StringBuffer -> StringBuffer -> Int
 byteDiff s1 s2 = cur s2 - cur s1
 
+-- | Check whether a 'StringBuffer' is empty (analogous to 'Data.List.null').
 atEnd :: StringBuffer -> Bool
 atEnd (StringBuffer _ l c) = l == c
 
 -- -----------------------------------------------------------------------------
 -- Conversion
 
-lexemeToString :: StringBuffer -> Int {-bytes-} -> String
+-- | Decode the first @n@ bytes of a 'StringBuffer' as UTF-8 into a 'String'.
+-- Similar to 'utf8DecodeChar', if the character cannot be decoded as UTF-8,
+-- they will be replaced with '\0'.
+lexemeToString :: StringBuffer
+               -> Int                   -- ^ @n@, the number of bytes
+               -> String
 lexemeToString _ 0 = ""
 lexemeToString (StringBuffer buf _ cur) bytes =
   inlinePerformIO $
     withForeignPtr buf $ \ptr ->
       utf8DecodeString (ptr `plusPtr` cur) bytes
 
-lexemeToFastString :: StringBuffer -> Int {-bytes-} -> FastString
+lexemeToFastString :: StringBuffer
+                   -> Int               -- ^ @n@, the number of bytes
+                   -> FastString
 lexemeToFastString _ 0 = nilFS
 lexemeToFastString (StringBuffer buf _ cur) len =
    inlinePerformIO $