[Git][ghc/ghc][wip/angerman/aarch64-ncg] 13 commits: [aarch64] Fix spill/reload

Tue Sep 8 16:29:38 UTC 2020

Moritz Angermann pushed to branch wip/angerman/aarch64-ncg at Glasgow Haskell Compiler / GHC


Commits:
342e6e9a by Moritz Angermann at 2020-08-26T06:50:19+00:00
[aarch64] Fix spill/reload

- - - - -
29bfd65e by Moritz Angermann at 2020-09-07T04:35:02+00:00
Try to get PIC right.

- - - - -
cbd07d60 by Moritz Angermann at 2020-09-07T04:36:28+00:00
Spill/Reload only need a smaller window

- - - - -
499350e0 by Moritz Angermann at 2020-09-07T04:37:32+00:00
Drop bad/useless optimisation

This was due to not handling PIC symbols correctly and injecting CmmLoad
as we do on other platforms, but this doesn't translate to aarch64's got lookups.

- - - - -
8a91cda3 by Moritz Angermann at 2020-09-07T04:37:53+00:00
B is b

- - - - -
f5f17010 by Moritz Angermann at 2020-09-07T04:39:01+00:00
Fix CCall

|Now mark used registers properly for the Register Allocator.

- - - - -
54bf82b5 by Moritz Angermann at 2020-09-07T16:13:59+00:00
:sob:

- - - - -
7aebacf9 by Moritz Angermann at 2020-09-07T16:14:29+00:00
:sob: :sob:

- - - - -
4cc23070 by Moritz Angermann at 2020-09-08T07:36:55+00:00
:sob: Segfault no 3. This showed up in T4114

- - - - -
873aff9f by Moritz Angermann at 2020-09-08T15:20:01+00:00
Add mkComment to `Instruction`

- - - - -
a2e7c94b by Moritz Angermann at 2020-09-08T15:22:23+00:00
Use mkComment for debugging

- - - - -
0e4a574b by Moritz Angermann at 2020-09-08T15:26:32+00:00
Fix T4114 crashes

T4114 causes this codepath to show up.

- - - - -
d088f7d1 by Moritz Angermann at 2020-09-08T15:27:13+00:00
Cleanup some compiler warnings

- - - - -


12 changed files:

- compiler/GHC/CmmToAsm/AArch64/CodeGen.hs
- compiler/GHC/CmmToAsm/AArch64/Instr.hs
- compiler/GHC/CmmToAsm/AArch64/Ppr.hs
- compiler/GHC/CmmToAsm/Instr.hs
- compiler/GHC/CmmToAsm/PIC.hs
- compiler/GHC/CmmToAsm/PPC/Instr.hs
- compiler/GHC/CmmToAsm/Reg/Linear.hs
- compiler/GHC/CmmToAsm/Reg/Linear/JoinToTargets.hs
- compiler/GHC/CmmToAsm/Reg/Linear/State.hs
- compiler/GHC/CmmToAsm/Reg/Liveness.hs
- compiler/GHC/CmmToAsm/SPARC/Instr.hs
- compiler/GHC/CmmToAsm/X86/Instr.hs


Changes:

=====================================
compiler/GHC/CmmToAsm/AArch64/CodeGen.hs
=====================================
@@ -104,8 +104,8 @@ cmmTopCodeGen
     -> NatM [NatCmmDecl RawCmmStatics Instr]
 
 -- Thus we'll have to deal with either CmmProc ...
-cmmTopCodeGen cmm@(CmmProc info lab live graph) = do
-  config <- getConfig
+cmmTopCodeGen _cmm@(CmmProc info lab live graph) = do
+  -- config <- getConfig
   -- do
   --   traceM $ "-- -------------------------- cmmTopGen (CmmProc) -------------------------- --\n"
   --         ++ showSDocUnsafe (ppr cmm)
@@ -120,15 +120,12 @@ cmmTopCodeGen cmm@(CmmProc info lab live graph) = do
       os   = platformOS platform
 
   case picBaseMb of
-      Just picBase -> do
-          -- XXX: PIC not yet implemented
-          panic "AArch64.cmmTopCodeGen: PIC not implemented"
-          return tops
+      Just _picBase -> panic "AArch64.cmmTopCodeGen: picBase not implemented"
       Nothing -> return tops
 
 -- ... or CmmData. Do we want to align this?
-cmmTopCodeGen cmm@(CmmData sec dat) = do
-  config <- getConfig
+cmmTopCodeGen _cmm@(CmmData sec dat) = do
+  -- config <- getConfig
   -- do
   --   traceM $ "-- -------------------------- cmmTopGen (CmmData) -------------------------- --\n"
   --         ++ showSDocUnsafe (ppr cmm)
@@ -141,7 +138,7 @@ basicBlockCodeGen
                 , [NatCmmDecl RawCmmStatics Instr])
 
 basicBlockCodeGen block = do
-  config <- getConfig
+  -- config <- getConfig
   -- do
   --   traceM $ "-- --------------------------- basicBlockCodeGen --------------------------- --\n"
   --         ++ showSDocUnsafe (ppr block)
@@ -385,7 +382,7 @@ getFloatReg expr = do
     Any II64 code -> do
       tmp <- getNewRegNat FF64
       return (tmp, FF64, code tmp)
-    Any w _code -> pprPanic "can't do getFloatReg on" (ppr expr)
+    Any _w _code -> pprPanic "can't do getFloatReg on" (ppr expr)
     -- can't do much for fixed.
     Fixed rep reg code ->
       return (reg, rep, code)
@@ -481,7 +478,7 @@ getRegister' config plat expr
                                                     , MOVK (OpReg W64 dst) (OpImmShift (ImmInt half2) SLSL 32)
                                                     , MOVK (OpReg W64 dst) (OpImmShift (ImmInt half3) SLSL 48)
                                                     ]))
-        CmmInt i rep -> do
+        CmmInt _i rep -> do
           (op, imm_code) <- litToImm' lit
           return (Any (intFormat rep) (\dst -> imm_code `snocOL` ANN (text $ show expr) (MOV (OpReg rep dst) op)))
 
@@ -490,8 +487,8 @@ getRegister' config plat expr
           (op, imm_code) <- litToImm' lit
           return (Any (floatFormat w) (\dst -> imm_code `snocOL` ANN (text $ show expr) (MOV (OpReg w dst) op)))
 
-        CmmFloat f W8  -> pprPanic "getRegister' (CmmLit:CmmFloat), no support for bytes" (ppr expr)
-        CmmFloat f W16 -> pprPanic "getRegister' (CmmLit:CmmFloat), no support for halfs" (ppr expr)
+        CmmFloat _f W8  -> pprPanic "getRegister' (CmmLit:CmmFloat), no support for bytes" (ppr expr)
+        CmmFloat _f W16 -> pprPanic "getRegister' (CmmLit:CmmFloat), no support for halfs" (ppr expr)
         CmmFloat f W32 -> do
           let word = castFloatToWord32 (fromRational f) :: Word32
               half0 = fromIntegral (fromIntegral word :: Word16)
@@ -516,19 +513,19 @@ getRegister' config plat expr
                                                       , MOVK (OpReg W64 tmp) (OpImmShift (ImmInt half3) SLSL 48)
                                                       , MOV (OpReg W64 dst) (OpReg W64 tmp)
                                                       ]))
-        CmmFloat f w -> pprPanic "getRegister' (CmmLit:CmmFloat), unsupported float lit" (ppr expr)
+        CmmFloat _f _w -> pprPanic "getRegister' (CmmLit:CmmFloat), unsupported float lit" (ppr expr)
         CmmVec _ -> pprPanic "getRegister' (CmmLit:CmmVec): " (ppr expr)
-        CmmLabel lbl -> do
+        CmmLabel _lbl -> do
           (op, imm_code) <- litToImm' lit
           let rep = cmmLitType plat lit
               format = cmmTypeFormat rep
           return (Any format (\dst -> imm_code `snocOL` (ANN (text $ show expr) $ LDR format (OpReg (formatToWidth format) dst) op)))
 
-        CmmLabelOff lbl off | is12bit (fromIntegral off) -> do
+        CmmLabelOff _lbl off | is12bit (fromIntegral off) -> do
           (op, imm_code) <- litToImm' lit
           let rep = cmmLitType plat lit
               format = cmmTypeFormat rep
-              width = typeWidth rep
+              -- width = typeWidth rep
           return (Any format (\dst -> imm_code `snocOL` LDR format (OpReg (formatToWidth format) dst) op))
 
         CmmLabelOff lbl off -> do
@@ -586,13 +583,16 @@ getRegister' config plat expr
         MO_FF_Conv from to -> return $ Any (floatFormat to) (\dst -> code `snocOL` FCVT (OpReg to dst) (OpReg from reg))
 
         -- Conversions
-        MO_XX_Conv from to -> swizzleRegisterRep (intFormat to) <$> getRegister e
+        MO_XX_Conv _from to -> swizzleRegisterRep (intFormat to) <$> getRegister e
 
         _ -> pprPanic "getRegister' (monadic CmmMachOp):" (ppr expr)
       where toImm W8 =  (OpImm (ImmInt 7))
             toImm W16 = (OpImm (ImmInt 15))
             toImm W32 = (OpImm (ImmInt 31))
             toImm W64 = (OpImm (ImmInt 63))
+            toImm W128 = (OpImm (ImmInt 127))
+            toImm W256 = (OpImm (ImmInt 255))
+            toImm W512 = (OpImm (ImmInt 511))
     -- Dyadic machops:
     --
     -- The general idea is:
@@ -604,8 +604,8 @@ getRegister' config plat expr
     --      fallthrough to alert us if things go wrong!
     -- OPTIMIZATION WARNING: Dyadic CmmMachOp destructuring
     -- 0. XXX This should not exist! Rewrite: Reg +- 0 -> Reg
-    CmmMachOp (MO_Add _) [expr'@(CmmReg (CmmGlobal r)), CmmLit (CmmInt 0 _)] -> getRegister' config plat expr'
-    CmmMachOp (MO_Sub _) [expr'@(CmmReg (CmmGlobal r)), CmmLit (CmmInt 0 _)] -> getRegister' config plat expr'
+    CmmMachOp (MO_Add _) [expr'@(CmmReg (CmmGlobal _r)), CmmLit (CmmInt 0 _)] -> getRegister' config plat expr'
+    CmmMachOp (MO_Sub _) [expr'@(CmmReg (CmmGlobal _r)), CmmLit (CmmInt 0 _)] -> getRegister' config plat expr'
     -- 1. Compute Reg +/- n directly.
     --    For Add/Sub we can directly encode 12bits, or 12bits lsl #12.
     CmmMachOp (MO_Add w) [(CmmReg reg), CmmLit (CmmInt n _)]
@@ -656,7 +656,7 @@ getRegister' config plat expr
             return $ Any format_x (\dst -> code_x `appOL` code_y `appOL` op (OpReg w dst) (OpReg w reg_x) (OpReg w reg_y))
 
           withTempIntReg w op = OpReg w <$> getNewRegNat (intFormat w) >>= op
-          withTempFloatReg w op = OpReg w <$> getNewRegNat (floatFormat w) >>= op
+          -- withTempFloatReg w op = OpReg w <$> getNewRegNat (floatFormat w) >>= op
 
           intOp w op = do
             -- compute x<m> <- x
@@ -782,7 +782,7 @@ getRegister' config plat expr
         -- XXX
 
         op -> pprPanic "getRegister' (unhandled dyadic CmmMachOp): " $ (pprMachOp op) <+> text "in" <+> (ppr expr)
-    CmmMachOp op xs
+    CmmMachOp _op _xs
       -> pprPanic "getRegister' (variadic CmmMachOp): " (ppr expr)
 
   where
@@ -841,18 +841,18 @@ getAmode platform (CmmRegOff reg off)
 -- CmmStore (CmmMachOp (MO_Add w) [CmmLoad expr, CmmLit (CmmInt n w')]) (expr2)
 -- E.g. a CmmStoreOff really. This can be translated to `str $expr2, [$expr, #n ]
 -- for `n` in range.
-getAmode platform (CmmMachOp (MO_Add _w) [expr, CmmLit (CmmInt off _w')])
+getAmode _platform (CmmMachOp (MO_Add _w) [expr, CmmLit (CmmInt off _w')])
   | -256 <= off, off <= 255
   = do (reg, _format, code) <- getSomeReg expr
        return $ Amode (AddrRegImm reg (ImmInteger off)) code
 
-getAmode platform (CmmMachOp (MO_Sub _w) [expr, CmmLit (CmmInt off _w')])
+getAmode _platform (CmmMachOp (MO_Sub _w) [expr, CmmLit (CmmInt off _w')])
   | -256 <= -off, -off <= 255
   = do (reg, _format, code) <- getSomeReg expr
        return $ Amode (AddrRegImm reg (ImmInteger (-off))) code
 
 -- Generic case
-getAmode _plat expr
+getAmode _platform expr
   = do (reg, _format, code) <- getSomeReg expr
        return $ Amode (AddrReg reg) code
 
@@ -904,7 +904,7 @@ assignReg_FltCode = assignReg_IntCode
 -- -----------------------------------------------------------------------------
 -- Jumps
 genJump :: CmmExpr{-the branch target-} -> [Reg] -> NatM InstrBlock
-genJump expr@(CmmLit (CmmLabel lbl)) regs
+genJump expr@(CmmLit (CmmLabel lbl)) _regs
   = return $ unitOL (ANN (text $ show expr) (J (TLabel lbl)))
   -- = return (toOL [ PUSH_STACK_FRAME
   --               , DELTA (-16)
@@ -912,12 +912,7 @@ genJump expr@(CmmLit (CmmLabel lbl)) regs
   --               , POP_STACK_FRAME
   --               , DELTA 0] )

--- no reason to load label into register just to
--- do a register jump.
-genJump expr@(CmmLoad (CmmLit (CmmLabel lbl)) _rep) regs
-  = return $ unitOL (ANN (text $ show expr) (J (TLabel lbl)))
-
-genJump expr regs = do
+genJump expr _regs = do
     (target, _format, code) <- getSomeReg expr
     return (code `appOL` unitOL (ANN (text $ show expr) (J (TReg target)))
                         --  toOL [ PUSH_STACK_FRAME
@@ -1107,9 +1102,10 @@ genCCall target dest_regs arg_regs bid = do
       -- this will give us the format information to match on.
       arg_regs' <- mapM getSomeReg arg_regs
 
-      (stackArgs, passArgumentsCode) <- passArguments allGpArgRegs allFpArgRegs arg_regs' 0 nilOL
+      (stackArgs, passRegs, passArgumentsCode) <- passArguments allGpArgRegs allFpArgRegs arg_regs' 0 [] nilOL
+
+      (returnRegs, readResultsCode)   <- readResults allGpArgRegs allFpArgRegs dest_regs [] nilOL
 
-      readResultsCode   <- readResults allGpArgRegs allFpArgRegs dest_regs nilOL
       let moveStackDown 0 = toOL [ PUSH_STACK_FRAME
                                  , DELTA (-16) ]
           moveStackDown i | odd i = moveStackDown (i + 1)
@@ -1126,7 +1122,7 @@ genCCall target dest_regs arg_regs bid = do
       let code =    call_target_code          -- compute the label (possibly into a register)
             `appOL` moveStackDown stackArgs
             `appOL` passArgumentsCode         -- put the arguments into x0, ...
-            `appOL` (unitOL $ BL call_target) -- branch and link.
+            `appOL` (unitOL $ BL call_target passRegs returnRegs) -- branch and link.
             `appOL` readResultsCode           -- parse the results into registers
             `appOL` moveStackUp stackArgs
       return (code, Nothing)
@@ -1253,8 +1249,8 @@ genCCall target dest_regs arg_regs bid = do
       genCCall (ForeignTarget target cconv) dest_regs arg_regs bid
 
     -- XXX: Optimize using paired load LDP
-    passArguments :: [Reg] -> [Reg] -> [(Reg, Format, InstrBlock)] -> Int -> InstrBlock -> NatM (Int, InstrBlock)
-    passArguments _ _ [] stackArgs accumCode = return (stackArgs, accumCode)
+    passArguments :: [Reg] -> [Reg] -> [(Reg, Format, InstrBlock)] -> Int -> [Reg] -> InstrBlock -> NatM (Int, [Reg], InstrBlock)
+    passArguments _ _ [] stackArgs accumRegs accumCode = return (stackArgs, accumRegs, accumCode)
     -- passArguments _ _ [] accumCode stackArgs | isEven stackArgs = return $ SUM (OpReg W64 x31) (OpReg W64 x31) OpImm (ImmInt (-8 * stackArgs))
     -- passArguments _ _ [] accumCode stackArgs = return $ SUM (OpReg W64 x31) (OpReg W64 x31) OpImm (ImmInt (-8 * (stackArgs + 1)))
     -- passArguments [] fpRegs (arg0:arg1:args) stack accumCode = do
@@ -1292,40 +1288,40 @@ genCCall target dest_regs arg_regs bid = do
       -- For AArch64 specificies see: https://developer.arm.com/docs/ihi0055/latest/procedure-call-standard-for-the-arm-64-bit-architecture
       --
     -- Still have GP regs, and we want to pass an GP argument.
-    passArguments (gpReg:gpRegs) fpRegs ((r, format, code_r):args) stackArgs accumCode | isIntFormat format = do
+    passArguments (gpReg:gpRegs) fpRegs ((r, format, code_r):args) stackArgs accumRegs accumCode | isIntFormat format = do
       let w = formatToWidth format
-      passArguments gpRegs fpRegs args stackArgs (accumCode `appOL` code_r `snocOL` MOV (OpReg w gpReg) (OpReg w r))
+      passArguments gpRegs fpRegs args stackArgs (gpReg:accumRegs) (accumCode `appOL` code_r `snocOL` (ANN (text $ "Pass gp argument: " ++ show r) $ MOV (OpReg w gpReg) (OpReg w r)))
 
     -- Still have FP regs, and we want to pass an FP argument.
-    passArguments gpRegs (fpReg:fpRegs) ((r, format, code_r):args) stackArgs accumCode | isFloatFormat format = do
+    passArguments gpRegs (fpReg:fpRegs) ((r, format, code_r):args) stackArgs accumRegs accumCode | isFloatFormat format = do
       let w = formatToWidth format
-      passArguments gpRegs fpRegs args stackArgs (accumCode `appOL` code_r `snocOL` MOV (OpReg w fpReg) (OpReg w r))
+      passArguments gpRegs fpRegs args stackArgs (fpReg:accumRegs) (accumCode `appOL` code_r `snocOL` (ANN (text $ "Pass fp argument: " ++ show r) $ MOV (OpReg w fpReg) (OpReg w r)))
 
     -- No mor regs left to pass. Must pass on stack.
-    passArguments [] [] ((r, format, code_r):args) stackArgs accumCode = do
+    passArguments [] [] ((r, format, code_r):args) stackArgs accumRegs accumCode = do
       let w = formatToWidth format
-          stackCode = code_r `snocOL` STR format (OpReg w r) (OpAddr (AddrRegImm (regSingle 31) (ImmInt (stackArgs * 8))))
-      passArguments [] [] args (stackArgs+1) (stackCode `appOL` accumCode)
+          stackCode = code_r `snocOL` (ANN (text $ "Pass argument: " ++ show r) $ STR format (OpReg w r) (OpAddr (AddrRegImm (regSingle 31) (ImmInt (stackArgs * 8)))))
+      passArguments [] [] args (stackArgs+1) accumRegs (stackCode `appOL` accumCode)
 
     -- Still have fpRegs left, but want to pass a GP argument. Must be passed on the stack then.
-    passArguments [] fpRegs ((r, format, code_r):args) stackArgs accumCode | isIntFormat format = do
+    passArguments [] fpRegs ((r, format, code_r):args) stackArgs accumRegs accumCode | isIntFormat format = do
       let w = formatToWidth format
-          stackCode = code_r `snocOL` STR format (OpReg w r) (OpAddr (AddrRegImm (regSingle 31) (ImmInt (stackArgs * 8))))
-      passArguments [] fpRegs args (stackArgs+1) (stackCode `appOL` accumCode)
+          stackCode = code_r `snocOL` (ANN (text $ "Pass argument: " ++ show r) $ STR format (OpReg w r) (OpAddr (AddrRegImm (regSingle 31) (ImmInt (stackArgs * 8)))))
+      passArguments [] fpRegs args (stackArgs+1) accumRegs (stackCode `appOL` accumCode)
 
     -- Still have gpRegs left, but want to pass a FP argument. Must be passed on the stack then.
-    passArguments gpRegs [] ((r, format, code_r):args) stackArgs accumCode | isFloatFormat format = do
+    passArguments gpRegs [] ((r, format, code_r):args) stackArgs accumRegs accumCode | isFloatFormat format = do
       let w = formatToWidth format
-          stackCode = code_r `snocOL` STR format (OpReg w r) (OpAddr (AddrRegImm (regSingle 31) (ImmInt (stackArgs * 8))))
-      passArguments gpRegs [] args (stackArgs+1) (stackCode `appOL` accumCode)
+          stackCode = code_r `snocOL` (ANN (text $ "Pass argument: " ++ show r) $ STR format (OpReg w r) (OpAddr (AddrRegImm (regSingle 31) (ImmInt (stackArgs * 8)))))
+      passArguments gpRegs [] args (stackArgs+1) accumRegs (stackCode `appOL` accumCode)
 
-    passArguments _ _ _ _ _ = pprPanic "passArguments" (text "invalid state")
+    passArguments _ _ _ _ _ _ = pprPanic "passArguments" (text "invalid state")
 
-    readResults :: [Reg] -> [Reg] -> [LocalReg] -> InstrBlock -> NatM InstrBlock
-    readResults _ _ [] accumCode = return accumCode
-    readResults [] _ _ _ = pprPanic "genCCall, out of gp registers when reading results" (ppr target)
-    readResults _ [] _ _ = pprPanic "genCCall, out of gp registers when reading results" (ppr target)
-    readResults (gpReg:gpRegs) (fpReg:fpRegs) (dst:dsts) accumCode = do
+    readResults :: [Reg] -> [Reg] -> [LocalReg] -> [Reg]-> InstrBlock -> NatM ([Reg], InstrBlock)
+    readResults _ _ [] accumRegs accumCode = return (accumRegs, accumCode)
+    readResults [] _ _ _ _ = pprPanic "genCCall, out of gp registers when reading results" (ppr target)
+    readResults _ [] _ _ _ = pprPanic "genCCall, out of fp registers when reading results" (ppr target)
+    readResults (gpReg:gpRegs) (fpReg:fpRegs) (dst:dsts) accumRegs accumCode = do
       -- gp/fp reg -> dst
       platform <- getPlatform
       let rep = cmmRegType platform (CmmLocal dst)
@@ -1333,8 +1329,8 @@ genCCall target dest_regs arg_regs bid = do
           w   = cmmRegWidth platform (CmmLocal dst)
           r_dst = getRegisterReg platform (CmmLocal dst)
       if isFloatFormat format
-        then readResults (gpReg:gpRegs) fpRegs dsts (accumCode `snocOL` MOV (OpReg w r_dst) (OpReg w fpReg))
-        else readResults gpRegs (fpReg:fpRegs) dsts (accumCode `snocOL` MOV (OpReg w r_dst) (OpReg w gpReg))
+        then readResults (gpReg:gpRegs) fpRegs dsts (fpReg:accumRegs) (accumCode `snocOL` MOV (OpReg w r_dst) (OpReg w fpReg))
+        else readResults gpRegs (fpReg:fpRegs) dsts (gpReg:accumRegs) (accumCode `snocOL` MOV (OpReg w r_dst) (OpReg w gpReg))
 
 
 


=====================================
compiler/GHC/CmmToAsm/AArch64/Instr.hs
=====================================
@@ -35,6 +35,9 @@ import Control.Monad (replicateM)
 import Data.Maybe (fromMaybe)
 
 import Debug.Trace
+import GHC.Stack
+
+import Data.Bits ((.&.), complement)
 
 -- | XXX: verify this!
 stackFrameHeaderSize :: Platform -> Int
@@ -78,6 +81,7 @@ instance Instruction Instr where
         mkJumpInstr             = aarch64_mkJumpInstr
         mkStackAllocInstr       = aarch64_mkStackAllocInstr
         mkStackDeallocInstr     = aarch64_mkStackDeallocInstr
+        mkComment               = pure . COMMENT
 
 -- | Get the registers that are being used by this instruction.
 -- regUsage doesn't need to do any trickery for jumps and such.
@@ -129,7 +133,7 @@ aarch64_regUsageOfInstr platform instr = case instr of
   J t                      -> usage (regTarget t, [])
   B t                      -> usage (regTarget t, [])
   BCOND _ t                -> usage (regTarget t, [])
-  BL t                     -> usage (regTarget t, callerSavedRegisters)
+  BL t ps _rs              -> usage (regTarget t ++ ps, callerSavedRegisters)
 
   -- 5. Atomic Instructions ----------------------------------------------------
   -- 6. Conditional Instructions -----------------------------------------------
@@ -254,7 +258,7 @@ aarch64_patchRegsOfInstr instr env = case instr of
     -- 4. Branch Instructions --------------------------------------------------
     J t            -> J (patchTarget t)
     B t            -> B (patchTarget t)
-    BL t           -> BL (patchTarget t)
+    BL t rs ts     -> BL (patchTarget t) rs ts
     BCOND c t      -> BCOND c (patchTarget t)
 
     -- 5. Atomic Instructions --------------------------------------------------
@@ -314,7 +318,7 @@ aarch64_jumpDestsOfInstr (CBZ _ t) = [ id | TBlock id <- [t]]
 aarch64_jumpDestsOfInstr (CBNZ _ t) = [ id | TBlock id <- [t]]
 aarch64_jumpDestsOfInstr (J t) = [id | TBlock id <- [t]]
 aarch64_jumpDestsOfInstr (B t) = [id | TBlock id <- [t]]
-aarch64_jumpDestsOfInstr (BL t) = [ id | TBlock id <- [t]]
+aarch64_jumpDestsOfInstr (BL t _ _) = [ id | TBlock id <- [t]]
 aarch64_jumpDestsOfInstr (BCOND _ t) = [ id | TBlock id <- [t]]
 aarch64_jumpDestsOfInstr _ = []
 
@@ -329,7 +333,7 @@ aarch64_patchJumpInstr instr patchF
         CBNZ r (TBlock bid) -> CBNZ r (TBlock (patchF bid))
         J (TBlock bid) -> J (TBlock (patchF bid))
         B (TBlock bid) -> B (TBlock (patchF bid))
-        BL (TBlock bid) -> BL (TBlock (patchF bid))
+        BL (TBlock bid) ps rs -> BL (TBlock (patchF bid)) ps rs
         BCOND c (TBlock bid) -> BCOND c (TBlock (patchF bid))
         _ -> pprPanic "patchJumpInstr" (text $ show instr)
 
@@ -337,100 +341,98 @@ aarch64_patchJumpInstr instr patchF
 
 -- | An instruction to spill a register into a spill slot.
 aarch64_mkSpillInstr
-   :: NCGConfig
+   :: HasCallStack
+   => NCGConfig
    -> Reg       -- register to spill
    -> Int       -- current stack delta
    -> Int       -- spill slot to use
-   -> Instr
--- XXX this is stupid. We essentially do sp <- sp - 4095; str xN, [sp - ...] ; sp <- sp + 4095
-{-
-aarch64_mkSpillInstr config reg delta slot | (spillSlotToOffset config slot) - delta > 4095
-    = let (!d, !isns) = traceShow msg $ aarch64_mkSpillInstr config reg (delta + delta') slot
-      in traceShow "Spill(1) ret" (d, subIsn : isns ++ [addIsn])
-      where delta' = 4095
-            !addIsn = ADD sp sp (OpImm (ImmInt delta'))
-            !subIsn = SUB sp sp (OpImm (ImmInt delta'))
-            msg = "Spill(1): " ++ show (spillSlotToOffset config slot) ++ "; Delta: " ++ show delta ++ " -> " ++ show (delta + delta')
-
-aarch64_mkSpillInstr config reg delta slot | (spillSlotToOffset config slot) - delta > 255
-    = let (!d, !isns) = traceShow msg $ aarch64_mkSpillInstr config reg (delta + delta') slot
-      in traceShow "Spill(2) ret" (d, subIsn : isns ++ [addIsn])
-      where delta' = (spillSlotToOffset config slot) - delta
-            !addIsn = ADD sp sp (OpImm (ImmInt delta'))
-            !subIsn = SUB sp sp (OpImm (ImmInt delta'))
-            msg = "Spill(2): " ++ show (spillSlotToOffset config slot) ++ "; Delta: " ++ show delta ++ " -> " ++ show (delta + delta')
-
-aarch64_mkSpillInstr config reg delta slot | (spillSlotToOffset config slot) - delta < -4095
-    = let (!d, !isns) = traceShow msg $ aarch64_mkSpillInstr config reg (delta - delta') slot
-      in traceShow "Spill(3) ret" (d, addIsn : isns ++ [subIsn])
-      where delta' = 4095
-            !addIsn = ADD sp sp (OpImm (ImmInt delta'))
-            !subIsn = SUB sp sp (OpImm (ImmInt delta'))
-            msg = "Spill(3): " ++ show (spillSlotToOffset config slot) ++ "; Delta: " ++ show delta ++ " -> " ++ show (delta + delta')
-
-aarch64_mkSpillInstr config reg delta slot | (spillSlotToOffset config slot) - delta < -256
-    = let (!d, !isns) = traceShow msg $ aarch64_mkSpillInstr config reg (delta + delta') slot
-      in traceShow "Spill(4) ret" (d, subIsn : isns ++ [addIsn])
-      where delta' = (spillSlotToOffset config slot) - delta
-            !addIsn = ADD sp sp (OpImm (ImmInt delta'))
-            !subIsn = SUB sp sp (OpImm (ImmInt delta'))
-            msg = "Spill(4): " ++ show (spillSlotToOffset config slot) ++ "; Delta: " ++ show delta ++ " -> " ++ show (delta + delta')
--}
-aarch64_mkSpillInstr config reg delta slot
-    = --[
-      -- ANN (text "Spill") $
-      -- traceShow ("Spill: " ++ show (off - delta)) $
-      STR fmt (OpReg W64 reg) (OpAddr (AddrRegImm (regSingle 31) (ImmInt $ off - delta)))
-      --]
+   -> [Instr]
+
+-- Alright, so here's the plan.  On aarch64, we can't spill into arbitrary locations,
+-- the range is essentially [-256,255], or [0, 0xFFF]*8 = [0, 32760] for 64bits. For
+-- other ranges we need to adjust SP first; we should strive to keep it 16byte aligned.
+--
+-- To adjust for other range, we can use ADD/SUB, with a positive immediate of [0, 0xFFF],
+-- or [0, 0xFFF] << 12.
+
+aarch64_mkSpillInstr config reg delta slot =
+  case (spillSlotToOffset config slot) - delta of
+    imm | -256 <= imm && imm <= 255                               -> [ mkStr imm ]
+    imm | imm > 0 && imm .&. 0x7 == 0x0 && imm <= 0xfff           -> [ mkStr imm ]
+    imm | imm > 0xfff && imm <= 0xffffff && imm .&. 0x7 == 0x0    -> [ mkAdd (imm .&~. 0xfff)
+                                                                     , mkStr (imm .&.  0xfff)
+                                                                     , mkSub (imm .&~. 0xfff) ]
+    -- imm | imm > 0xfff && imm <= 0xffffff                          -> [ mkAdd (imm .&~. 0xfff)
+    --                                                                  , mkAdd (imm .&.  0xff0)
+    --                                                                  , mkStr (imm .&.  0x00f)
+    --                                                                  , mkSub (imm .&.  0xff0)
+    --                                                                  , mkSub (imm .&~. 0xfff) ]
+    -- if we have a negative offset, well subtract another 0x1000 from it, and then
+    -- use the positive
+    -- imm | -imm > 0xfff && -imm <= 0xffefff && imm .&. 0x7 == 0x0  -> [ mkSub (-imm .&~. 0xfff + 0x1000)
+    --                                                                  , mkStr (0x1000 - (-imm .&. 0xfff))
+    --                                                                  , mkAdd (-imm .&~. 0xfff + 0x1000) ]
+    -- imm | -imm > 0xfff && -imm <= 0xffffff                        -> [ mkSub (-imm .&~. 0xfff)
+    --                                                                  , mkSub (-imm .&.  0xff0)
+    --                                                                  , mkStr (-(-imm .&. 0x00f))
+    --                                                                  , mkAdd (-imm .&.  0xff0)
+    --                                                                  , mkAdd (-imm .&~. 0xfff) ]
+    imm -> pprPanic "aarch64_mkSpillInstr" (text "Unable to spill into" <+> int imm)
     where
+        a .&~. b = a .&. (complement b)
+
         fmt = case reg of
             RegReal (RealRegSingle n) | n < 32 -> II64
             _                                  -> FF64
+
+        mkStr imm = ANN (text "Spill@" <> int (off - delta)) $ STR fmt (OpReg W64 reg) (OpAddr (AddrRegImm (regSingle 31) (ImmInt imm)))
+        mkAdd imm = ANN (text "Spill Add") $ ADD sp sp (OpImm (ImmInt imm))
+        mkSub imm = ANN (text "Spill Sub") $ SUB sp sp (OpImm (ImmInt imm))
+
         off = spillSlotToOffset config slot
 
+-- fails in compiler/stage2/build/GHC/Driver/Pipeline.o
 aarch64_mkLoadInstr
    :: NCGConfig
    -> Reg       -- register to load
    -> Int       -- current stack delta
    -> Int       -- spill slot to use
-   -> Instr
--- XXX this is stupid. We essentially do sp <- sp - 4095; str xN, [sp - ...] ; sp <- sp + 4095
-{-
-aarch64_mkLoadInstr config reg delta slot | (spillSlotToOffset config slot) - delta > 4095
-    = let (!d, !isns) =  traceShow msg $ aarch64_mkLoadInstr config reg (delta + delta') slot
-      in traceShow "Reload(1) ret" (d, SUB sp sp (OpImm (ImmInt 4095)) : isns ++ [ADD sp sp (OpImm (ImmInt 4095))])
-      where delta' = 4095
-            msg = "Reload(1): " ++ show (spillSlotToOffset config slot) ++ "; Delta: " ++ show delta ++ " -> " ++ show (delta + delta')
-
-aarch64_mkLoadInstr config reg delta slot | (spillSlotToOffset config slot) - delta > 255
-    = let (!d, !isns) = traceShow msg $ aarch64_mkLoadInstr config reg (delta + delta') slot
-      in traceShow "Reload(2) ret" (d, SUB sp sp (OpImm (ImmInt delta')) : isns ++ [ADD sp sp (OpImm (ImmInt delta'))])
-      where delta' = (spillSlotToOffset config slot) - delta
-            msg = "Reload(2): " ++ show (spillSlotToOffset config slot) ++ "; Delta: " ++ show delta ++ " -> " ++ show (delta + delta')
-
-aarch64_mkLoadInstr config reg delta slot | (spillSlotToOffset config slot) - delta < -4095
-    = let (!d, !isns) =  traceShow msg $ aarch64_mkLoadInstr config reg (delta + delta') slot
-      in traceShow "Reload(3) ret" (d, ADD sp sp (OpImm (ImmInt 4095)) : isns ++ [SUB sp sp (OpImm (ImmInt 4095))])
-      where delta' = -4095
-            msg = "Reload(3): " ++ show (spillSlotToOffset config slot) ++ "; Delta: " ++ show delta ++ " -> " ++ show (delta + delta')
-
-aarch64_mkLoadInstr config reg delta slot | (spillSlotToOffset config slot) - delta < -256
-    = let (!d, !isns) = traceShow msg $ aarch64_mkLoadInstr config reg (delta + delta') slot
-      in traceShow "Reload(4) ret" (d, SUB sp sp (OpImm (ImmInt delta')) : isns ++ [ADD sp sp (OpImm (ImmInt delta'))])
-      where delta' = (spillSlotToOffset config slot) - delta
-            msg = "Reload(4): " ++ show (spillSlotToOffset config slot) ++ "; Delta: " ++ show delta ++ " -> " ++ show (delta + delta')
-
--}
-aarch64_mkLoadInstr config reg delta slot
-    = --[
-      -- ANN (text "Reload") $
-      -- traceShow ("Reload: " ++ show (off - delta)) $
-      LDR fmt (OpReg W64 reg) (OpAddr (AddrRegImm (regSingle 31) (ImmInt $ off - delta)))
-      -- ]
+   -> [Instr]
+
+aarch64_mkLoadInstr config reg delta slot =
+  case (spillSlotToOffset config slot) - delta of
+    imm | -256 <= imm && imm <= 255                               -> [ mkLdr imm ]
+    imm | imm > 0 && imm .&. 0x7 == 0x0 && imm <= 0xfff           -> [ mkLdr imm ]
+    imm | imm > 0xfff && imm <= 0xffffff && imm .&. 0x7 == 0x0    -> [ mkAdd (imm .&~. 0xfff)
+                                                                     , mkLdr (imm .&.  0xfff)
+                                                                     , mkSub (imm .&~. 0xfff) ]
+    -- imm | imm > 0xfff && imm <= 0xffffff                          -> [ mkAdd (imm .&~. 0xfff)
+    --                                                                  , mkAdd (imm .&.  0xff0)
+    --                                                                  , mkLdr (imm .&.  0x00f)
+    --                                                                  , mkSub (imm .&.  0xff0)
+    --                                                                  , mkSub (imm .&~. 0xfff) ]
+    -- if we have a negative offset, well subtract another 0x1000 from it, and then
+    -- use the positive
+    -- imm | -imm > 0xfff && -imm <= 0xffefff && imm .&. 0x7 == 0x0  -> [ mkSub (-imm .&~. 0xfff + 0x1000)
+    --                                                                  , mkLdr (0x1000 - (-imm .&. 0xfff))
+    --                                                                  , mkAdd (-imm .&~. 0xfff + 0x1000) ]
+    -- imm | -imm > 0xfff && -imm <= 0xffffff                        -> [ mkSub (-imm .&~. 0xfff)
+    --                                                                  , mkSub (-imm .&.  0xff0)
+    --                                                                  , mkLdr (-(-imm .&. 0x00f))
+    --                                                                  , mkAdd (-imm .&.  0xff0)
+    --                                                                  , mkAdd (-imm .&~. 0xfff) ]
+    imm -> pprPanic "aarch64_mkSpillInstr" (text "Unable to spill into" <+> int imm)
     where
+        a .&~. b = a .&. (complement b)
+
         fmt = case reg of
             RegReal (RealRegSingle n) | n < 32 -> II64
             _                                  -> FF64
+
+        mkLdr imm = ANN (text "Reload@" <> int (off - delta)) $ LDR fmt (OpReg W64 reg) (OpAddr (AddrRegImm (regSingle 31) (ImmInt imm)))
+        mkAdd imm = ANN (text "Reload Add") $ ADD sp sp (OpImm (ImmInt imm))
+        mkSub imm = ANN (text "Reload Sub") $ SUB sp sp (OpImm (ImmInt imm))
+
         off = spillSlotToOffset config slot
 
 --------------------------------------------------------------------------------
@@ -458,30 +460,12 @@ aarch64_isMetaInstr instr
 -- | Copy the value in a register to another one.
 -- Must work for all register classes.
 aarch64_mkRegRegMoveInstr :: Reg -> Reg -> Instr
-aarch64_mkRegRegMoveInstr src dst = MOV (OpReg W64 dst) (OpReg W64 src)
+aarch64_mkRegRegMoveInstr src dst = ANN (text $ "Reg->Reg Move: " ++ show src ++ " -> " ++ show dst) $ MOV (OpReg W64 dst) (OpReg W64 src)
 
 -- | Take the source and destination from this reg -> reg move instruction
 -- or Nothing if it's not one
 aarch64_takeRegRegMoveInstr :: Instr -> Maybe (Reg,Reg)
--- XXX: *if* we enable this here, we run into the elimination shortcut in
---      Linear.hs:441, which does not ensure that the register is free'd
---      before it's reused.  This seesm to be an issue if we have two function
---      calls in the same block.
---
---      y = f a b
---
---      will generate:
---      mov x0 a
---      mov x1 b
---      bl f
---      mov y x0
---
---      We'll elimitate the last mov y x0 call, and alias x0 = y.
---
---      No we'll need to reuse x0 for the next function, and when we
---      try to allocate it, it's already allocated.
---
---aarch64_takeRegRegMoveInstr (MOV (OpReg dst) (OpReg src)) = Just (src, dst)
+--aarch64_takeRegRegMoveInstr (MOV (OpReg fmt dst) (OpReg fmt' src)) | fmt == fmt' = Just (src, dst)
 aarch64_takeRegRegMoveInstr _ = Nothing
 
 -- | Make an unconditional jump instruction.
@@ -491,15 +475,15 @@ aarch64_mkJumpInstr id = [B (TBlock id)]
 aarch64_mkStackAllocInstr :: Platform -> Int -> [Instr]
 aarch64_mkStackAllocInstr platform n
     | n == 0 = []
-    | n > 0 && n < 4096 = [ SUB sp sp (OpImm (ImmInt n)) ]
-    | n > 0 = SUB sp sp (OpImm (ImmInt 4095)) : aarch64_mkStackAllocInstr platform (n - 4095)
+    | n > 0 && n < 4096 = [ ANN (text "Alloc More Stack") $ SUB sp sp (OpImm (ImmInt n)) ]
+    | n > 0 =  ANN (text "Alloc More Stack") (SUB sp sp (OpImm (ImmInt 4095))) : aarch64_mkStackAllocInstr platform (n - 4095)
 aarch64_mkStackAllocInstr platform n = pprPanic "aarch64_mkStackAllocInstr" (int n)
 
 aarch64_mkStackDeallocInstr :: Platform -> Int -> [Instr]
 aarch64_mkStackDeallocInstr platform n
     | n == 0 = []
-    | n > 0 && n < 4096 = [ ADD sp sp (OpImm (ImmInt n)) ]
-    | n > 0 = ADD sp sp (OpImm (ImmInt 4095)) : aarch64_mkStackDeallocInstr platform (n + 4095)
+    | n > 0 && n < 4096 = [ ANN (text "Dealloc More Stack") $ ADD sp sp (OpImm (ImmInt n)) ]
+    | n > 0 =  ANN (text "Dealloc More Stack") (ADD sp sp (OpImm (ImmInt 4095))) : aarch64_mkStackDeallocInstr platform (n - 4095)
 aarch64_mkStackDeallocInstr platform n = pprPanic "aarch64_mkStackDeallocInstr" (int n)
 
 --
@@ -540,6 +524,7 @@ allocMoreStack platform slots proc@(CmmProc info lbl live (ListGraph code)) = do
 
       insert_dealloc insn r = case insn of
         J _ -> dealloc ++ (insn : r)
+        ANN _ (J _) -> dealloc ++ (insn : r)
         _other | aarch64_jumpDestsOfInstr insn /= []
             -> aarch64_patchJumpInstr insn retarget : r
         _other -> insn : r
@@ -669,7 +654,7 @@ data Instr
     -- Branching.
     | J Target            -- like B, but only generated from genJump. Used to distinguish genJumps from others.
     | B Target            -- unconditional branching b/br. (To a blockid, label or register)
-    | BL Target           -- branch and link (e.g. set x30 to next pc, and branch)
+    | BL Target [Reg] [Reg] -- branch and link (e.g. set x30 to next pc, and branch)
     | BCOND Cond Target   -- branch with condition. b.<cond>
 
     -- 8. Synchronization Instructions -----------------------------------------


=====================================
compiler/GHC/CmmToAsm/AArch64/Ppr.hs
=====================================
@@ -449,13 +449,13 @@ pprInstr platform instr = case instr of
 
   -- 4. Branch Instructions ----------------------------------------------------
   J t            -> pprInstr platform (B t)
-  B (TBlock bid) -> text "\tbl" <+> ppr (mkLocalBlockLabel (getUnique bid))
-  B (TLabel lbl) -> text "\tbl" <+> ppr lbl
-  B (TReg r)     -> text "\tblr" <+> pprReg W64 r
+  B (TBlock bid) -> text "\tb" <+> ppr (mkLocalBlockLabel (getUnique bid))
+  B (TLabel lbl) -> text "\tb" <+> ppr lbl
+  B (TReg r)     -> text "\tbr" <+> pprReg W64 r
 
-  BL (TBlock bid) -> text "\tbl" <+> ppr (mkLocalBlockLabel (getUnique bid))
-  BL (TLabel lbl) -> text "\tbl" <+> ppr lbl
-  BL (TReg r)     -> text "\tblr" <+> pprReg W64 r
+  BL (TBlock bid) _ _ -> text "\tbl" <+> ppr (mkLocalBlockLabel (getUnique bid))
+  BL (TLabel lbl) _ _ -> text "\tbl" <+> ppr lbl
+  BL (TReg r)     _ _ -> text "\tblr" <+> pprReg W64 r
 
   BCOND c (TBlock bid) -> text "\t" <> pprBcond c <+> ppr (mkLocalBlockLabel (getUnique bid))
   BCOND c (TLabel lbl) -> text "\t" <> pprBcond c <+> ppr lbl
@@ -477,38 +477,38 @@ pprInstr platform instr = case instr of
   -- NOTE: GHC may do whacky things where it only load the lower part of an
   --       address. Not observing the correct size when loading will lead
   --       inevitably to crashes.
-  STR f o1@(OpReg W8 (RegReal (RealRegSingle i))) o2 | i < 32 ->
+  STR _f o1@(OpReg W8 (RegReal (RealRegSingle i))) o2 | i < 32 ->
     text "\tstrb" <+> pprOp o1 <> comma <+> pprOp o2
-  STR f o1@(OpReg W16 (RegReal (RealRegSingle i))) o2 | i < 32 ->
+  STR _f o1@(OpReg W16 (RegReal (RealRegSingle i))) o2 | i < 32 ->
     text "\tstrh" <+> pprOp o1 <> comma <+> pprOp o2
-  STR f o1 o2 -> text "\tstr" <+> pprOp o1 <> comma <+> pprOp o2
+  STR _f o1 o2 -> text "\tstr" <+> pprOp o1 <> comma <+> pprOp o2
 
-  LDR f o1 (OpImm (ImmIndex lbl' off)) | Just (_info, lbl) <- dynamicLinkerLabelInfo lbl' ->
+  LDR _f o1 (OpImm (ImmIndex lbl' off)) | Just (_info, lbl) <- dynamicLinkerLabelInfo lbl' ->
     text "\tadrp" <+> pprOp o1 <> comma <+> text ":got:" <> ppr lbl $$
     text "\tldr" <+> pprOp o1 <> comma <+> text "[" <> pprOp o1 <> comma <+> text ":got_lo12:" <> ppr lbl <> text "]" $$
     text "\tadd" <+> pprOp o1 <> comma <+> pprOp o1 <> comma <+> char '#' <> int off -- XXX: check that off is in 12bits.
 
-  LDR f o1 (OpImm (ImmIndex lbl off)) ->
+  LDR _f o1 (OpImm (ImmIndex lbl off)) ->
     text "\tadrp" <+> pprOp o1 <> comma <+> ppr lbl $$
     text "\tadd" <+> pprOp o1 <> comma <+> pprOp o1 <> comma <+> text ":lo12:" <> ppr lbl $$
     text "\tadd" <+> pprOp o1 <> comma <+> pprOp o1 <> comma <+> char '#' <> int off -- XXX: check that off is in 12bits.
 
-  LDR f o1 (OpImm (ImmCLbl lbl')) | Just (_info, lbl) <- dynamicLinkerLabelInfo lbl' ->
+  LDR _f o1 (OpImm (ImmCLbl lbl')) | Just (_info, lbl) <- dynamicLinkerLabelInfo lbl' ->
     text "\tadrp" <+> pprOp o1 <> comma <+> text ":got:" <> ppr lbl $$
     text "\tldr" <+> pprOp o1 <> comma <+> text "[" <> pprOp o1 <> comma <+> text ":got_lo12:" <> ppr lbl <> text "]"
 
-  LDR f o1 (OpImm (ImmCLbl lbl)) ->
+  LDR _f o1 (OpImm (ImmCLbl lbl)) ->
     text "\tadrp" <+> pprOp o1 <> comma <+> ppr lbl $$
     text "\tadd" <+> pprOp o1 <> comma <+> pprOp o1 <> comma <+> text ":lo12:" <> ppr lbl
 
-  LDR f o1@(OpReg W8 (RegReal (RealRegSingle i))) o2 | i < 32 ->
+  LDR _f o1@(OpReg W8 (RegReal (RealRegSingle i))) o2 | i < 32 ->
     text "\tldrsb" <+> pprOp o1 <> comma <+> pprOp o2
-  LDR f o1@(OpReg W16 (RegReal (RealRegSingle i))) o2 | i < 32 ->
+  LDR _f o1@(OpReg W16 (RegReal (RealRegSingle i))) o2 | i < 32 ->
     text "\tldrsh" <+> pprOp o1 <> comma <+> pprOp o2
-  LDR f o1 o2 -> text "\tldr" <+> pprOp o1 <> comma <+> pprOp o2
+  LDR _f o1 o2 -> text "\tldr" <+> pprOp o1 <> comma <+> pprOp o2
 
-  STP f o1 o2 o3 -> text "\tstp" <+> pprOp o1 <> comma <+> pprOp o2 <> comma <+> pprOp o3
-  LDP f o1 o2 o3 -> text "\tldp" <+> pprOp o1 <> comma <+> pprOp o2 <> comma <+> pprOp o3
+  STP _f o1 o2 o3 -> text "\tstp" <+> pprOp o1 <> comma <+> pprOp o2 <> comma <+> pprOp o3
+  LDP _f o1 o2 o3 -> text "\tldp" <+> pprOp o1 <> comma <+> pprOp o2 <> comma <+> pprOp o3
 
   -- 8. Synchronization Instructions -------------------------------------------
   DMBSY -> text "\tdmb sy"


=====================================
compiler/GHC/CmmToAsm/Instr.hs
=====================================
@@ -26,6 +26,10 @@ import GHC.Cmm hiding (topInfoTable)
 
 import GHC.CmmToAsm.Config
 
+import GHC.Utils.Outputable (SDoc)
+
+import GHC.Stack
+
 -- | Holds a list of source and destination registers used by a
 --      particular instruction.
 --
@@ -137,20 +141,22 @@ class   Instruction instr where
 
         -- | An instruction to spill a register into a spill slot.
         mkSpillInstr
-                :: NCGConfig
+                :: HasCallStack
+                => NCGConfig
                 -> Reg          -- ^ the reg to spill
                 -> Int          -- ^ the current stack delta
                 -> Int          -- ^ spill slot to use
-                -> instr        -- ^ instructions
+                -> [instr]        -- ^ instructions
 
 
         -- | An instruction to reload a register from a spill slot.
         mkLoadInstr
-                :: NCGConfig
+                :: HasCallStack
+                => NCGConfig
                 -> Reg          -- ^ the reg to reload.
                 -> Int          -- ^ the current stack delta
                 -> Int          -- ^ the spill slot to use
-                -> instr        -- ^ instructions
+                -> [instr]        -- ^ instructions
 
         -- | See if this instruction is telling us the current C stack delta
         takeDeltaInstr
@@ -205,3 +211,6 @@ class   Instruction instr where
                 :: Platform
                 -> Int
                 -> [instr]
+
+        -- Create a comment instruction
+        mkComment :: SDoc -> [instr]


=====================================
compiler/GHC/CmmToAsm/PIC.hs
=====================================
@@ -268,11 +268,15 @@ howToAccessLabel config _ OSMinGW32 this_mod _ lbl
 -- is enough for ~64MB of range. Anything else will need to go through a veneer,
 -- which is the job of the linker to build.  We might only want to lookup
 -- Data References through the GOT.
--- howToAccessLabel _config ArchAArch64 _os _this_mod kind _lbl
---         = case kind of
---             DataReference -> AccessDirectly -- AccessViaSymbolPtr
---             CallReference -> AccessDirectly
---             JumpReference -> AccessDirectly
+howToAccessLabel config ArchAArch64 _os this_mod kind lbl
+        | not (ncgExternalDynamicRefs config)
+        = AccessDirectly
+
+        | labelDynamic config this_mod lbl
+        = AccessViaSymbolPtr
+
+        | otherwise
+        = AccessDirectly
 
 
 -- Mach-O (Darwin, Mac OS X)


=====================================
compiler/GHC/CmmToAsm/PPC/Instr.hs
=====================================
@@ -76,6 +76,7 @@ instance Instruction Instr where
         mkJumpInstr             = ppc_mkJumpInstr
         mkStackAllocInstr       = ppc_mkStackAllocInstr
         mkStackDeallocInstr     = ppc_mkStackDeallocInstr
+        mkComment               = const []
 
 
 ppc_mkStackAllocInstr :: Platform -> Int -> [Instr]
@@ -541,7 +542,7 @@ ppc_mkSpillInstr
    -> Reg       -- register to spill
    -> Int       -- current stack delta
    -> Int       -- spill slot to use
-   -> Instr
+   -> [Instr]
 
 ppc_mkSpillInstr config reg delta slot
   = let platform = ncgPlatform config
@@ -558,7 +559,7 @@ ppc_mkSpillInstr config reg delta slot
                 Just _  -> ST
                 Nothing -> STFAR -- pseudo instruction: 32 bit offsets
 
-    in instr fmt reg (AddrRegImm sp (ImmInt (off-delta)))
+    in [instr fmt reg (AddrRegImm sp (ImmInt (off-delta)))]
 
 
 ppc_mkLoadInstr
@@ -566,7 +567,7 @@ ppc_mkLoadInstr
    -> Reg       -- register to load
    -> Int       -- current stack delta
    -> Int       -- spill slot to use
-   -> Instr
+   -> [Instr]
 
 ppc_mkLoadInstr config reg delta slot
   = let platform = ncgPlatform config
@@ -583,7 +584,7 @@ ppc_mkLoadInstr config reg delta slot
                 Just _  -> LD
                 Nothing -> LDFAR -- pseudo instruction: 32 bit offsets
 
-    in instr fmt reg (AddrRegImm sp (ImmInt (off-delta)))
+    in [instr fmt reg (AddrRegImm sp (ImmInt (off-delta)))]
 
 
 -- | The size of a minimal stackframe header including minimal


=====================================
compiler/GHC/CmmToAsm/Reg/Linear.hs
=====================================
@@ -256,7 +256,7 @@ linearRegAlloc' config initFreeRegs entry_ids block_live sccs
         return  (blocks, stats, getStackUse stack)
 
 
-linearRA_SCCs :: OutputableRegConstraint freeRegs instr
+linearRA_SCCs :: (HasCallStack, OutputableRegConstraint freeRegs instr)
               => [BlockId]
               -> BlockMap RegSet
               -> [NatBasicBlock instr]
@@ -291,7 +291,7 @@ linearRA_SCCs entry_ids block_live blocksAcc (CyclicSCC blocks : sccs)
    more sanity checking to guard against this eventuality.
 -}
 
-process :: OutputableRegConstraint freeRegs instr
+process :: (HasCallStack, OutputableRegConstraint freeRegs instr)
         => [BlockId]
         -> BlockMap RegSet
         -> [GenBasicBlock (LiveInstr instr)]
@@ -335,7 +335,7 @@ process entry_ids block_live (b@(BasicBlock id _) : blocks)
 -- | Do register allocation on this basic block
 --
 processBlock
-        :: OutputableRegConstraint freeRegs instr
+        :: (HasCallStack, OutputableRegConstraint freeRegs instr)
         => BlockMap RegSet              -- ^ live regs on entry to each basic block
         -> LiveBasicBlock instr         -- ^ block to do register allocation on
         -> RegM freeRegs [NatBasicBlock instr]   -- ^ block with registers allocated
@@ -382,7 +382,7 @@ initBlock id block_live
 
 -- | Do allocation for a sequence of instructions.
 linearRA
-        :: OutputableRegConstraint freeRegs instr
+        :: (HasCallStack, OutputableRegConstraint freeRegs instr)
         => BlockMap RegSet                      -- ^ map of what vregs are live on entry to each block.
         -> [instr]                              -- ^ accumulator for instructions already processed.
         -> [NatBasicBlock instr]                -- ^ accumulator for blocks of fixup code.
@@ -409,7 +409,7 @@ linearRA block_live accInstr accFixups id (instr:instrs)
 
 -- | Do allocation for a single instruction.
 raInsn
-        :: OutputableRegConstraint freeRegs instr
+        :: (HasCallStack, OutputableRegConstraint freeRegs instr)
         => BlockMap RegSet                      -- ^ map of what vregs are love on entry to each block.
         -> [instr]                              -- ^ accumulator for instructions already processed.
         -> BlockId                              -- ^ the id of the current block, for debugging
@@ -489,7 +489,7 @@ isInReg src assig | Just (InReg _) <- lookupUFM assig src = True
                   | otherwise = False
 
 
-genRaInsn :: (OutputableRegConstraint freeRegs instr)
+genRaInsn :: (HasCallStack, OutputableRegConstraint freeRegs instr)
           => BlockMap RegSet
           -> [instr]
           -> BlockId
@@ -499,7 +499,7 @@ genRaInsn :: (OutputableRegConstraint freeRegs instr)
           -> RegM freeRegs ([instr], [NatBasicBlock instr])
 
 genRaInsn block_live new_instrs block_id instr r_dying w_dying = do
---   pprTraceM "genRaInsn" $ ppr (block_id, instr)
+--  pprTraceM "genRaInsn" $ ppr (block_id, instr)
   platform <- getPlatform
   case regUsageOfInstr platform instr of { RU read written ->
     do
@@ -509,10 +509,10 @@ genRaInsn block_live new_instrs block_id instr r_dying w_dying = do
     -- we don't need to do anything with real registers that are
     -- only read by this instr.  (the list is typically ~2 elements,
     -- so using nub isn't a problem).
-    let real_read       = nub [ rr      | (RegReal rr) <- read]
     let virt_read       = nub [ vr      | (RegVirtual vr) <- read ]
 
 --     do
+--         let real_read       = nub [ rr      | (RegReal rr) <- read]
 --         freeregs <- getFreeRegsR
 --         assig    <- getAssigR
 
@@ -589,8 +589,27 @@ genRaInsn block_live new_instrs block_id instr r_dying w_dying = do
                                  | src == dst   -> []
                                 _               -> [patched_instr]
 
-    let code = squashed_instr ++ w_spills ++ reverse r_spills
-                ++ clobber_saves ++ new_instrs
+    -- On the use of @reverse@ below.
+    -- Since we can now have spills and reloads produce multiple instructions
+    -- we need to ensure they are emitted in the correct order.  Previously
+    -- we did not, as mkSpill/mkReload/mkRegRegMove produced single instructions
+    -- only and as such order didn't matter. Now it does.  And reversing the
+    -- spills (clobber will also spill), will ensure they are emitted in the
+    -- right order.
+
+    -- u <- getUniqueR
+    let code = -- mkComment (text "<genRaInsn(" <> ppr u <> text ")>")
+               -- ++ mkComment (text "<genRaInsn(" <> ppr u <> text "):squashed>")] ++
+               squashed_instr
+        --     ++ mkComment (text "<genRaInsn(" <> ppr u <> text "):w_spills>")
+            ++ reverse w_spills
+        --     ++ mkComment (text "<genRaInsn(" <> ppr u <> text "):r_spills>")
+            ++ reverse r_spills
+        --     ++ mkComment (text "<genRaInsn(" <> ppr u <> text "):clobber_saves>")
+            ++ reverse clobber_saves
+        --     ++ mkComment (text "<genRaInsn(" <> ppr u <> text "):new_instrs>")
+            ++ new_instrs
+        --     ++ mkComment (text "</genRaInsn(" <> ppr u <> text ")>")
 
 --    pprTrace "patched-code" ((vcat $ map (docToSDoc . pprInstr) code)) $ do
 --    pprTrace "pached-fixup" ((ppr fixup_blocks)) $ do
@@ -642,7 +661,7 @@ releaseRegs regs = do
 --
 
 saveClobberedTemps
-        :: (Instruction instr, FR freeRegs)
+        :: (HasCallStack, Instruction instr, FR freeRegs)
         => [RealReg]            -- real registers clobbered by this instruction
         -> [Reg]                -- registers which are no longer live after this insn
         -> RegM freeRegs [instr]         -- return: instructions to spill any temps that will
@@ -665,9 +684,11 @@ saveClobberedTemps clobbered dying
 
         (instrs,assig') <- clobber assig [] to_spill
         setAssigR assig'
-        return instrs
-
+        return $ -- mkComment (text "<saveClobberedTemps>") ++
+                 instrs
+--              ++ mkComment (text "</saveClobberedTemps>")
    where
+--     clobber :: UniqFM Loc -> [instr] -> [(Unique, RealReg)] -> RegM freeRegs ([instr], UniqFM Loc)
      clobber assig instrs []
             = return (instrs, assig)
 
@@ -701,7 +722,7 @@ saveClobberedTemps clobbered dying
 
                   let new_assign  = addToUFM assig temp (InBoth reg slot)
 
-                  clobber new_assign (spill : instrs) rest
+                  clobber new_assign (spill ++ instrs) rest
 
 
 
@@ -716,8 +737,6 @@ clobberRegs clobbered
  = do   platform <- getPlatform
         freeregs <- getFreeRegsR
 
-        config <- getConfig
-
         let gpRegs  = frGetFreeRegs platform RcInteger freeregs :: [RealReg]
             fltRegs = frGetFreeRegs platform RcFloat   freeregs :: [RealReg]
             dblRegs = frGetFreeRegs platform RcDouble  freeregs :: [RealReg]
@@ -773,7 +792,7 @@ data SpillLoc = ReadMem StackSlot  -- reading from register only in memory
 --   the list of free registers and free stack slots.
 
 allocateRegsAndSpill
-        :: forall freeRegs instr. (FR freeRegs, Outputable instr, Instruction instr)
+        :: forall freeRegs instr. (HasCallStack, FR freeRegs, Outputable instr, Instruction instr)
         => Bool                 -- True <=> reading (load up spilled regs)
         -> [VirtualReg]         -- don't push these out
         -> [instr]              -- spill insns
@@ -839,7 +858,7 @@ findPrefRealReg vreg = do
 
 -- reading is redundant with reason, but we keep it around because it's
 -- convenient and it maintains the recursive structure of the allocator. -- EZY
-allocRegsAndSpill_spill :: (FR freeRegs, Instruction instr, Outputable instr)
+allocRegsAndSpill_spill :: (HasCallStack, FR freeRegs, Instruction instr, Outputable instr)
                         => Bool
                         -> [VirtualReg]
                         -> [instr]
@@ -919,10 +938,7 @@ allocRegsAndSpill_spill reading keep spills alloc r rs assig spill_loc
                         | (temp_to_push_out, (my_reg :: RealReg)) : _
                                         <- candidates_inReg
                         = do
-                                (spill_insn, slot) <- spillR (RegReal my_reg) temp_to_push_out
-                                let spill_store  = (if reading then id else reverse)
-                                                        -- COMMENT (fsLit "spill alloc"):
-                                                           [spill_insn]
+                                (spill_store, slot) <- spillR (RegReal my_reg) temp_to_push_out
 
                                 -- record that this temp was spilled
                                 recordSpill (SpillAlloc temp_to_push_out)
@@ -972,7 +988,7 @@ loadTemp vreg (ReadMem slot) hreg spills
  = do
         insn <- loadR (RegReal hreg) slot
         recordSpill (SpillLoad $ getUnique vreg)
-        return  $  {- COMMENT (fsLit "spill load") : -} insn : spills
+        return  $  {- mkComment (text "spill load") : -} insn ++ spills
 
 loadTemp _ _ _ spills =
    return spills


=====================================
compiler/GHC/CmmToAsm/Reg/Linear/JoinToTargets.hs
=====================================
@@ -24,10 +24,13 @@ import GHC.Cmm.BlockId
 import GHC.Cmm.Dataflow.Collections
 import GHC.Data.Graph.Directed
 import GHC.Utils.Outputable
+import GHC.Utils.Monad (concatMapM)
 import GHC.Types.Unique
 import GHC.Types.Unique.FM
 import GHC.Types.Unique.Set
 
+import GHC.Stack
+
 -- | For a jump instruction at the end of a block, generate fixup code so its
 --      vregs are in the correct regs for its destination.
 --
@@ -295,7 +298,7 @@ expandNode vreg src dst
 --      cycles in expandNode above.
 --
 handleComponent
-        :: Instruction instr
+        :: (HasCallStack, Instruction instr)
         => Int -> instr -> SCC (Node Loc Unique)
         -> RegM freeRegs [instr]
 
@@ -304,7 +307,7 @@ handleComponent
 --      go via a spill slot.
 --
 handleComponent delta _  (AcyclicSCC (DigraphNode vreg src dsts))
-        = mapM (makeMove delta vreg src) dsts
+        = concatMapM (makeMove delta vreg src) dsts
 
 
 -- Handle some cyclic moves.
@@ -338,7 +341,7 @@ handleComponent delta instr
 
         -- make sure to do all the reloads after all the spills,
         --      so we don't end up clobbering the source values.
-        return ([instrSpill] ++ concat remainingFixUps ++ [instrLoad])
+        return (instrSpill ++ concat remainingFixUps ++ instrLoad)
 
 handleComponent _ _ (CyclicSCC _)
  = panic "Register Allocator: handleComponent cyclic"
@@ -347,12 +350,12 @@ handleComponent _ _ (CyclicSCC _)
 -- | Move a vreg between these two locations.
 --
 makeMove
-    :: Instruction instr
+    :: (HasCallStack, Instruction instr)
     => Int      -- ^ current C stack delta.
     -> Unique   -- ^ unique of the vreg that we're moving.
     -> Loc      -- ^ source location.
     -> Loc      -- ^ destination location.
-    -> RegM freeRegs instr  -- ^ move instruction.
+    -> RegM freeRegs [instr]  -- ^ move instruction.
 
 makeMove delta vreg src dst
  = do config <- getConfig
@@ -361,7 +364,7 @@ makeMove delta vreg src dst
       case (src, dst) of
           (InReg s, InReg d) ->
               do recordSpill (SpillJoinRR vreg)
-                 return $ mkRegRegMoveInstr platform (RegReal s) (RegReal d)
+                 return $ [mkRegRegMoveInstr platform (RegReal s) (RegReal d)]
           (InMem s, InReg d) ->
               do recordSpill (SpillJoinRM vreg)
                  return $ mkLoadInstr config (RegReal d) delta s


=====================================
compiler/GHC/CmmToAsm/Reg/Linear/State.hs
=====================================
@@ -55,6 +55,7 @@ import GHC.Types.Unique
 import GHC.Types.Unique.Supply
 
 import Control.Monad (ap)
+import GHC.Stack
 
 -- Avoids using unboxed tuples when loading into GHCi
 #if !defined(GHC_LOADED_INTO_GHCI)
@@ -126,8 +127,8 @@ makeRAStats state
         , ra_fixupList          = ra_fixups state }
 
 
-spillR :: Instruction instr
-       => Reg -> Unique -> RegM freeRegs (instr, Int)
+spillR :: (HasCallStack, Instruction instr)
+       => Reg -> Unique -> RegM freeRegs ([instr], Int)
 
 spillR reg temp = RegM $ \s ->
   let (stack1,slot) = getStackSlotFor (ra_stack s) temp
@@ -137,7 +138,7 @@ spillR reg temp = RegM $ \s ->
 
 
 loadR :: Instruction instr
-      => Reg -> Int -> RegM freeRegs instr
+      => Reg -> Int -> RegM freeRegs [instr]
 
 loadR reg slot = RegM $ \s ->
   RA_Result s (mkLoadInstr (ra_config s) reg (ra_delta s) slot)


=====================================
compiler/GHC/CmmToAsm/Reg/Liveness.hs
=====================================
@@ -518,7 +518,7 @@ stripLiveBlock
         -> LiveBasicBlock instr
         -> NatBasicBlock instr
 
-stripLiveBlock config (BasicBlock i lis)
+stripLiveBlock _config (BasicBlock i lis)
  =      BasicBlock i instrs'
 
  where  (instrs', _)
@@ -527,13 +527,15 @@ stripLiveBlock config (BasicBlock i lis)
         spillNat acc []
          =      return (reverse acc)
 
-        spillNat acc (LiveInstr (SPILL reg slot) _ : instrs)
-         = do   delta   <- get
-                spillNat (mkSpillInstr config reg delta slot : acc) instrs
+        spillNat _acc (LiveInstr (SPILL _reg _slot) _ : _instrs)
+         = error "dead code: spill"
+        --  do   delta   <- get
+        --         spillNat (mkSpillInstr config reg delta slot : acc) instrs
 
-        spillNat acc (LiveInstr (RELOAD slot reg) _ : instrs)
-         = do   delta   <- get
-                spillNat (mkLoadInstr config reg delta slot : acc) instrs
+        spillNat _acc (LiveInstr (RELOAD _slot _reg) _ : _instrs)
+         = error "dead code: reload:"
+        --  do   delta   <- get
+        --         spillNat (mkLoadInstr config reg delta slot : acc) instrs
 
         spillNat acc (LiveInstr (Instr instr) _ : instrs)
          | Just i <- takeDeltaInstr instr


=====================================
compiler/GHC/CmmToAsm/SPARC/Instr.hs
=====================================
@@ -104,6 +104,7 @@ instance Instruction Instr where
         mkJumpInstr             = sparc_mkJumpInstr
         mkStackAllocInstr       = panic "no sparc_mkStackAllocInstr"
         mkStackDeallocInstr     = panic "no sparc_mkStackDeallocInstr"
+        mkComment               = const []
 
 
 -- | SPARC instruction set.
@@ -373,7 +374,7 @@ sparc_mkSpillInstr
     -> Reg      -- ^ register to spill
     -> Int      -- ^ current stack delta
     -> Int      -- ^ spill slot to use
-    -> Instr
+    -> [Instr]
 
 sparc_mkSpillInstr config reg delta slot
  = let  platform = ncgPlatform config
@@ -384,7 +385,7 @@ sparc_mkSpillInstr config reg delta slot
                         RcFloat   -> FF32
                         RcDouble  -> FF64
 
-    in ST fmt reg (fpRel (negate off_w))
+    in [ST fmt reg (fpRel (negate off_w))]
 
 
 -- | Make a spill reload instruction.
@@ -393,7 +394,7 @@ sparc_mkLoadInstr
     -> Reg      -- ^ register to load into
     -> Int      -- ^ current stack delta
     -> Int      -- ^ spill slot to use
-    -> Instr
+    -> [Instr]
 
 sparc_mkLoadInstr config reg delta slot
   = let platform = ncgPlatform config
@@ -404,7 +405,7 @@ sparc_mkLoadInstr config reg delta slot
                         RcFloat   -> FF32
                         RcDouble  -> FF64
 
-        in LD fmt (fpRel (- off_w)) reg
+        in [LD fmt (fpRel (- off_w)) reg]
 
 
 --------------------------------------------------------------------------------


=====================================
compiler/GHC/CmmToAsm/X86/Instr.hs
=====================================
@@ -71,6 +71,7 @@ instance Instruction Instr where
         mkJumpInstr             = x86_mkJumpInstr
         mkStackAllocInstr       = x86_mkStackAllocInstr
         mkStackDeallocInstr     = x86_mkStackDeallocInstr
+        mkComment               = const []
 
 
 -- -----------------------------------------------------------------------------
@@ -668,15 +669,15 @@ x86_mkSpillInstr
     -> Reg      -- register to spill
     -> Int      -- current stack delta
     -> Int      -- spill slot to use
-    -> Instr
+    -> [Instr]
 
 x86_mkSpillInstr config reg delta slot
   = let off     = spillSlotToOffset platform slot - delta
     in
     case targetClassOfReg platform reg of
-           RcInteger   -> MOV (archWordFormat is32Bit)
-                                   (OpReg reg) (OpAddr (spRel platform off))
-           RcDouble    -> MOV FF64 (OpReg reg) (OpAddr (spRel platform off))
+           RcInteger   -> [MOV (archWordFormat is32Bit)
+                                   (OpReg reg) (OpAddr (spRel platform off))]
+           RcDouble    -> [MOV FF64 (OpReg reg) (OpAddr (spRel platform off))]
            _         -> panic "X86.mkSpillInstr: no match"
     where platform = ncgPlatform config
           is32Bit = target32Bit platform
@@ -687,15 +688,15 @@ x86_mkLoadInstr
     -> Reg      -- register to load
     -> Int      -- current stack delta
     -> Int      -- spill slot to use
-    -> Instr
+    -> [Instr]
 
 x86_mkLoadInstr config reg delta slot
   = let off     = spillSlotToOffset platform slot - delta
     in
         case targetClassOfReg platform reg of
-              RcInteger -> MOV (archWordFormat is32Bit)
-                                    (OpAddr (spRel platform off)) (OpReg reg)
-              RcDouble  -> MOV FF64 (OpAddr (spRel platform off)) (OpReg reg)
+              RcInteger -> [MOV (archWordFormat is32Bit)
+                                    (OpAddr (spRel platform off)) (OpReg reg)]
+              RcDouble  -> [MOV FF64 (OpAddr (spRel platform off)) (OpReg reg)]
               _           -> panic "X86.x86_mkLoadInstr"
     where platform = ncgPlatform config
           is32Bit = target32Bit platform



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/3dbc6ac67a0efac9eaabb5ea4623247fa717d189...d088f7d1af476d5bdc7cc750ce91e8f48ba222d8

-- 
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/compare/3dbc6ac67a0efac9eaabb5ea4623247fa717d189...d088f7d1af476d5bdc7cc750ce91e8f48ba222d8
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20200908/3cdfd0fc/attachment-0001.html>