[Git][ghc/ghc][wip/andreask/arm_immediates] Aarch64 NCG: Use encoded immediates for literals.
Andreas Klebinger (@AndreasK)
gitlab at gitlab.haskell.org
Tue Jul 25 12:07:30 UTC 2023
Andreas Klebinger pushed to branch wip/andreask/arm_immediates at Glasgow Haskell Compiler / GHC
Commits:
be3d2837 by Andreas Klebinger at 2023-07-25T13:57:37+02:00
Aarch64 NCG: Use encoded immediates for literals.
Try to generate
instr x2, <imm>
instead of
mov x1, lit
instr x2, x1
When possible. This gets rid of quite a few redundant
mov instructions.
I believe this causes a metric decrease for LargeRecords as
we reduce register pressure.
-------------------------
Metric Decrease:
LargeRecord
-------------------------
- - - - -
4 changed files:
- compiler/GHC/CmmToAsm/AArch64/CodeGen.hs
- compiler/GHC/CmmToAsm/AArch64/Instr.hs
- compiler/GHC/CmmToAsm/AArch64/Ppr.hs
- compiler/GHC/CmmToAsm/AArch64/Regs.hs
Changes:
=====================================
compiler/GHC/CmmToAsm/AArch64/CodeGen.hs
=====================================
@@ -372,6 +372,97 @@ getSomeReg expr = do
Fixed rep reg code ->
return (reg, rep, code)
+{- Note [Aarch64 immediates]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Aarch64 with its fixed-width instruction encoding uses leftover space for
+immediates.
+If you want the full rundown consult the arch reference document:
+"ArmĀ® Architecture Reference Manual" - "C3.4 Data processing - immediate"
+
+The gist of it is that different instructions allow for different immediate encodings.
+The ones we care about for better code generation are:
+
+* Simple but potentially repeated bit-patterns for logic instructions.
+* 16bit numbers shifted by multiples of 16.
+* 12 bit numbers optionally shifted by 12 bits.
+
+It might seem like the ISA allows for 64bit immediates but this isn't the case.
+Rather there are some instruction aliases which allow for large unencoded immediates
+which will then be transalted to one of the immediate encodings implicitly.
+
+For example mov x1, #0x10000 is allowed but will be assembled to movz x1, #0x1, lsl #16
+-}
+
+-- | Move (wide immediate)
+-- Allows for 16bit immediate which can be shifted by 0/16/32/48 bits.
+-- Used with MOVZ,MOVN, MOVK
+-- See Note [Aarch64 immediates]
+getMovWideImm :: Integer -> Width -> Maybe Operand
+getMovWideImm n w
+ -- TODO: Handle sign extension/negatives
+ | n <= 0
+ = Nothing
+ -- Fits in 16 bits
+ | sized_n < 2^(16 :: Int)
+ = Just $ OpImm (ImmInteger truncated)
+
+ -- 0x0000 0000 xxxx 0000
+ | trailing_zeros >= 16 && sized_n < 2^(32 :: Int)
+ = Just $ OpImmShift (ImmInteger $ truncated `shiftR` 16) SLSL 16
+
+ -- 0x 0000 xxxx 0000 0000
+ | trailing_zeros >= 32 && sized_n < 2^(48 :: Int)
+ = Just $ OpImmShift (ImmInteger $ truncated `shiftR` 32) SLSL 32
+
+ -- 0x xxxx 0000 0000 0000
+ | trailing_zeros >= 48
+ = Just $ OpImmShift (ImmInteger $ truncated `shiftR` 48) SLSL 48
+
+ | otherwise
+ = Nothing
+ where
+ truncated = narrowU w n
+ sized_n = fromIntegral truncated :: Word64
+ trailing_zeros = countTrailingZeros sized_n
+
+-- | Arithmetic(immediate)
+-- Allows for 12bit immediates which can be shifted by 0 or 12 bits.
+-- Used with ADD, ADDS, SUB, SUBS, CMP, CMN
+-- See Note [Aarch64 immediates]
+getArithImm :: Integer -> Width -> Maybe Operand
+getArithImm n w
+ -- TODO: Handle sign extension
+ | n <= 0
+ = Nothing
+ -- Fits in 12 bits
+ | sized_n < 2^(12::Int)
+ = Just $ OpImm (ImmInteger truncated)
+
+ -- 12 bits shifted by 12 places.
+ | trailing_zeros >= 12 && sized_n < 2^(24::Int)
+ = Just $ OpImmShift (ImmInteger $ truncated `shiftR` 12) SLSL 12
+
+ | otherwise
+ = Nothing
+ where
+ sized_n = fromIntegral truncated :: Word64
+ truncated = narrowU w n
+ trailing_zeros = countTrailingZeros sized_n
+
+-- | Logical (immediate)
+-- Allows encoding of some repeated bitpatterns
+-- Used with AND, ANDS, EOR, ORR, TST
+-- and their aliases which includes at least MOV (bitmask immediate)
+-- See Note [Aarch64 immediates]
+getBitmaskImm :: Integer -> Width -> Maybe Operand
+getBitmaskImm n w
+ | isAArch64Bitmask truncated = Just $ OpImm (ImmInteger truncated)
+ | otherwise = Nothing
+ where
+ truncated = narrowU w n
+
+
-- TODO OPT: we might be able give getRegister
-- a hint, what kind of register we want.
getFloatReg :: HasCallStack => CmmExpr -> NatM (Reg, Format, InstrBlock)
@@ -494,8 +585,14 @@ getRegister' config plat expr
CmmLit lit
-> case lit of
- -- TODO handle CmmInt 0 specially, use wzr or xzr.
-
+ -- Use wzr xzr for CmmInt 0 if the width matches up, otherwise do a move.
+ -- TODO: Reenable after https://gitlab.haskell.org/ghc/ghc/-/issues/23632 is fixed.
+ -- CmmInt 0 W32 -> do
+ -- let format = intFormat W32
+ -- return (Fixed format reg_zero (unitOL $ (COMMENT ((text . show $ expr))) ))
+ -- CmmInt 0 W64 -> do
+ -- let format = intFormat W64
+ -- return (Fixed format reg_zero (unitOL $ (COMMENT ((text . show $ expr))) ))
CmmInt i W8 | i >= 0 -> do
return (Any (intFormat W8) (\dst -> unitOL $ annExpr expr (MOV (OpReg W8 dst) (OpImm (ImmInteger (narrowU W8 i))))))
CmmInt i W16 | i >= 0 -> do
@@ -510,8 +607,13 @@ getRegister' config plat expr
-- Those need the upper bits set. We'd either have to explicitly sign
-- or figure out something smarter. Lowered to
-- `MOV dst XZR`
+ CmmInt i w | i >= 0
+ , Just imm_op <- getMovWideImm i w -> do
+ return (Any (intFormat w) (\dst -> unitOL $ annExpr expr (MOVZ (OpReg w dst) imm_op)))
+
CmmInt i w | isNbitEncodeable 16 i, i >= 0 -> do
return (Any (intFormat w) (\dst -> unitOL $ annExpr expr (MOV (OpReg W16 dst) (OpImm (ImmInteger i)))))
+
CmmInt i w | isNbitEncodeable 32 i, i >= 0 -> do
let half0 = fromIntegral (fromIntegral i :: Word16)
half1 = fromIntegral (fromIntegral (i `shiftR` 16) :: Word16)
@@ -586,7 +688,6 @@ getRegister' config plat expr
(op, imm_code) <- litToImm' lit
let rep = cmmLitType plat lit
format = cmmTypeFormat rep
- -- width = typeWidth rep
return (Any format (\dst -> imm_code `snocOL` LDR format (OpReg (formatToWidth format) dst) op))
CmmLabelOff lbl off -> do
@@ -791,17 +892,51 @@ getRegister' config plat expr
-- withTempFloatReg w op = OpReg w <$> getNewRegNat (floatFormat w) >>= op
-- A "plain" operation.
- bitOp w op = do
+ bitOpImm w op encode_imm = do
-- compute x<m> <- x
-- compute x<o> <- y
-- <OP> x<n>, x<m>, x<o>
(reg_x, format_x, code_x) <- getSomeReg x
- (reg_y, format_y, code_y) <- getSomeReg y
- massertPpr (isIntFormat format_x == isIntFormat format_y) $ text "bitOp: incompatible"
+ (op_y, format_y, code_y) <- case y of
+ CmmLit (CmmInt n w)
+ | Just imm_operand_y <- encode_imm n w
+ -> return (imm_operand_y, intFormat w, nilOL)
+ _ -> do
+ (reg_y, format_y, code_y) <- getSomeReg y
+ return (OpReg w reg_y, format_y, code_y)
+ massertPpr (isIntFormat format_x == isIntFormat format_y) $ text "bitOpImm: incompatible"
return $ Any (intFormat w) (\dst ->
code_x `appOL`
code_y `appOL`
- op (OpReg w dst) (OpReg w reg_x) (OpReg w reg_y))
+ op (OpReg w dst) (OpReg w reg_x) op_y)
+
+ -- A (potentially signed) integer operation.
+ -- In the case of 8- and 16-bit signed arithmetic we must first
+ -- sign-extend both arguments to 32-bits.
+ -- See Note [Signed arithmetic on AArch64].
+ intOpImm :: Bool -> Width -> (Operand -> Operand -> Operand -> OrdList Instr) -> (Integer -> Width -> Maybe Operand) -> NatM (Register)
+ intOpImm {- is signed -} True w op _encode_imm = intOp True w op
+ intOpImm False w op encode_imm = do
+ -- compute x<m> <- x
+ -- compute x<o> <- y
+ -- <OP> x<n>, x<m>, x<o>
+ (reg_x, format_x, code_x) <- getSomeReg x
+ (op_y, format_y, code_y) <- case y of
+ CmmLit (CmmInt n w)
+ | Just imm_operand_y <- encode_imm n w
+ -> return (imm_operand_y, intFormat w, nilOL)
+ _ -> do
+ (reg_y, format_y, code_y) <- getSomeReg y
+ return (OpReg w reg_y, format_y, code_y)
+ massertPpr (isIntFormat format_x && isIntFormat format_y) $ text "intOp: non-int"
+ -- This is the width of the registers on which the operation
+ -- should be performed.
+ let w' = opRegWidth w
+ return $ Any (intFormat w) $ \dst ->
+ code_x `appOL`
+ code_y `appOL`
+ op (OpReg w' dst) (OpReg w' reg_x) (op_y) `appOL`
+ truncateReg w' w dst -- truncate back to the operand's original width
-- A (potentially signed) integer operation.
-- In the case of 8- and 16-bit signed arithmetic we must first
@@ -847,9 +982,9 @@ getRegister' config plat expr
case op of
-- Integer operations
-- Add/Sub should only be Integer Options.
- MO_Add w -> intOp False w (\d x y -> unitOL $ annExpr expr (ADD d x y))
+ MO_Add w -> intOpImm False w (\d x y -> unitOL $ annExpr expr (ADD d x y)) getArithImm
-- TODO: Handle sub-word case
- MO_Sub w -> intOp False w (\d x y -> unitOL $ annExpr expr (SUB d x y))
+ MO_Sub w -> intOpImm False w (\d x y -> unitOL $ annExpr expr (SUB d x y)) getArithImm
-- Note [CSET]
-- ~~~~~~~~~~~
@@ -891,8 +1026,8 @@ getRegister' config plat expr
-- N.B. We needn't sign-extend sub-word size (in)equality comparisons
-- since we don't care about ordering.
- MO_Eq w -> bitOp w (\d x y -> toOL [ CMP x y, CSET d EQ ])
- MO_Ne w -> bitOp w (\d x y -> toOL [ CMP x y, CSET d NE ])
+ MO_Eq w -> bitOpImm w (\d x y -> toOL [ CMP x y, CSET d EQ ]) getArithImm
+ MO_Ne w -> bitOpImm w (\d x y -> toOL [ CMP x y, CSET d NE ]) getArithImm
-- Signed multiply/divide
MO_Mul w -> intOp True w (\d x y -> unitOL $ MUL d x y)
@@ -921,10 +1056,10 @@ getRegister' config plat expr
MO_S_Lt w -> intOp True w (\d x y -> toOL [ CMP x y, CSET d SLT ])
-- Unsigned comparisons
- MO_U_Ge w -> intOp False w (\d x y -> toOL [ CMP x y, CSET d UGE ])
- MO_U_Le w -> intOp False w (\d x y -> toOL [ CMP x y, CSET d ULE ])
- MO_U_Gt w -> intOp False w (\d x y -> toOL [ CMP x y, CSET d UGT ])
- MO_U_Lt w -> intOp False w (\d x y -> toOL [ CMP x y, CSET d ULT ])
+ MO_U_Ge w -> intOpImm False w (\d x y -> toOL [ CMP x y, CSET d UGE ]) getArithImm
+ MO_U_Le w -> intOpImm False w (\d x y -> toOL [ CMP x y, CSET d ULE ]) getArithImm
+ MO_U_Gt w -> intOpImm False w (\d x y -> toOL [ CMP x y, CSET d UGT ]) getArithImm
+ MO_U_Lt w -> intOpImm False w (\d x y -> toOL [ CMP x y, CSET d ULT ]) getArithImm
-- Floating point arithmetic
MO_F_Add w -> floatOp w (\d x y -> unitOL $ ADD d x y)
@@ -947,9 +1082,9 @@ getRegister' config plat expr
MO_F_Lt w -> floatCond w (\d x y -> toOL [ CMP x y, CSET d OLT ]) -- x < y <=> y >= x
-- Bitwise operations
- MO_And w -> bitOp w (\d x y -> unitOL $ AND d x y)
- MO_Or w -> bitOp w (\d x y -> unitOL $ ORR d x y)
- MO_Xor w -> bitOp w (\d x y -> unitOL $ EOR d x y)
+ MO_And w -> bitOpImm w (\d x y -> unitOL $ AND d x y) getBitmaskImm
+ MO_Or w -> bitOpImm w (\d x y -> unitOL $ ORR d x y) getBitmaskImm
+ MO_Xor w -> bitOpImm w (\d x y -> unitOL $ EOR d x y) getBitmaskImm
MO_Shl w -> intOp False w (\d x y -> unitOL $ LSL d x y)
MO_U_Shr w -> intOp False w (\d x y -> unitOL $ LSR d x y)
MO_S_Shr w -> intOp True w (\d x y -> unitOL $ ASR d x y)
@@ -999,7 +1134,7 @@ getRegister' config plat expr
where
isNbitEncodeable :: Int -> Integer -> Bool
- isNbitEncodeable n i = let shift = n - 1 in (-1 `shiftL` shift) <= i && i < (1 `shiftL` shift)
+ isNbitEncodeable n_bits i = let shift = n_bits - 1 in (-1 `shiftL` shift) <= i && i < (1 `shiftL` shift)
-- N.B. MUL does not set the overflow flag.
do_mul_may_oflo :: Width -> CmmExpr -> CmmExpr -> NatM Register
=====================================
compiler/GHC/CmmToAsm/AArch64/Instr.hs
=====================================
@@ -110,6 +110,7 @@ regUsageOfInstr platform instr = case instr of
LSR dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst)
MOV dst src -> usage (regOp src, regOp dst)
MOVK dst src -> usage (regOp src, regOp dst)
+ MOVZ dst src -> usage (regOp src, regOp dst)
MVN dst src -> usage (regOp src, regOp dst)
ORR dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst)
ROR dst src1 src2 -> usage (regOp src1 ++ regOp src2, regOp dst)
@@ -251,6 +252,7 @@ patchRegsOfInstr instr env = case instr of
LSR o1 o2 o3 -> LSR (patchOp o1) (patchOp o2) (patchOp o3)
MOV o1 o2 -> MOV (patchOp o1) (patchOp o2)
MOVK o1 o2 -> MOVK (patchOp o1) (patchOp o2)
+ MOVZ o1 o2 -> MOVZ (patchOp o1) (patchOp o2)
MVN o1 o2 -> MVN (patchOp o1) (patchOp o2)
ORR o1 o2 o3 -> ORR (patchOp o1) (patchOp o2) (patchOp o3)
ROR o1 o2 o3 -> ROR (patchOp o1) (patchOp o2) (patchOp o3)
@@ -381,9 +383,8 @@ mkSpillInstr config reg delta slot =
where
a .&~. b = a .&. (complement b)
- fmt = case reg of
- RegReal (RealRegSingle n) | n < 32 -> II64
- _ -> FF64
+ fmt = fmtOfRealReg (case reg of { RegReal r -> r; _ -> panic "Expected real reg"})
+
mkIp0SpillAddr imm = ANN (text "Spill: IP0 <- SP + " <> int imm) $ ADD ip0 sp (OpImm (ImmInt imm))
mkStrSp imm = ANN (text "Spill@" <> int (off - delta)) $ STR fmt (OpReg W64 reg) (OpAddr (AddrRegImm (regSingle 31) (ImmInt imm)))
mkStrIp0 imm = ANN (text "Spill@" <> int (off - delta)) $ STR fmt (OpReg W64 reg) (OpAddr (AddrRegImm (regSingle 16) (ImmInt imm)))
@@ -408,9 +409,7 @@ mkLoadInstr config reg delta slot =
where
a .&~. b = a .&. (complement b)
- fmt = case reg of
- RegReal (RealRegSingle n) | n < 32 -> II64
- _ -> FF64
+ fmt = fmtOfRealReg (case reg of { RegReal r -> r; _ -> panic "Expected real reg"})
mkIp0SpillAddr imm = ANN (text "Reload: IP0 <- SP + " <> int imm) $ ADD ip0 sp (OpImm (ImmInt imm))
mkLdrSp imm = ANN (text "Reload@" <> int (off - delta)) $ LDR fmt (OpReg W64 reg) (OpAddr (AddrRegImm (regSingle 31) (ImmInt imm)))
@@ -619,7 +618,7 @@ data Instr
| MOV Operand Operand -- rd = rn or rd = #i
| MOVK Operand Operand
-- | MOVN Operand Operand
- -- | MOVZ Operand Operand
+ | MOVZ Operand Operand
| MVN Operand Operand -- rd = ~rn
| ORN Operand Operand Operand -- rd = rn | ~op2
| ORR Operand Operand Operand -- rd = rn | op2
@@ -708,6 +707,7 @@ instrCon i =
LSR{} -> "LSR"
MOV{} -> "MOV"
MOVK{} -> "MOVK"
+ MOVZ{} -> "MOVZ"
MVN{} -> "MVN"
ORN{} -> "ORN"
ORR{} -> "ORR"
@@ -782,6 +782,9 @@ wzr = OpReg W32 (RegReal (RealRegSingle (-1)))
sp = OpReg W64 (RegReal (RealRegSingle 31))
ip0 = OpReg W64 (RegReal (RealRegSingle 16))
+reg_zero :: Reg
+reg_zero = RegReal (RealRegSingle (-1))
+
_x :: Int -> Operand
_x i = OpReg W64 (RegReal (RealRegSingle i))
x0, x1, x2, x3, x4, x5, x6, x7 :: Operand
=====================================
compiler/GHC/CmmToAsm/AArch64/Ppr.hs
=====================================
@@ -417,6 +417,7 @@ pprInstr platform instr = case instr of
| isFloatOp o1 || isFloatOp o2 -> op2 (text "\tfmov") o1 o2
| otherwise -> op2 (text "\tmov") o1 o2
MOVK o1 o2 -> op2 (text "\tmovk") o1 o2
+ MOVZ o1 o2 -> op2 (text "\tmovz") o1 o2
MVN o1 o2 -> op2 (text "\tmvn") o1 o2
ORN o1 o2 o3 -> op3 (text "\torn") o1 o2 o3
ORR o1 o2 o3 -> op3 (text "\torr") o1 o2 o3
=====================================
compiler/GHC/CmmToAsm/AArch64/Regs.hs
=====================================
@@ -77,6 +77,8 @@ litToImm (CmmInt i w) = ImmInteger (narrowS w i)
-- narrow to the width: a CmmInt might be out of
-- range, but we assume that ImmInteger only contains
-- in-range values. A signed value should be fine here.
+ -- AK: We do call this with out of range values, however
+ -- it just truncates as we would expect.
litToImm (CmmFloat f W32) = ImmFloat f
litToImm (CmmFloat f W64) = ImmDouble f
litToImm (CmmLabel l) = ImmCLbl l
@@ -147,6 +149,13 @@ classOfRealReg (RealRegSingle i)
| i < 32 = RcInteger
| otherwise = RcDouble
+fmtOfRealReg :: RealReg -> Format
+fmtOfRealReg real_reg =
+ case classOfRealReg real_reg of
+ RcInteger -> II64
+ RcDouble -> FF64
+ RcFloat -> panic "No float regs on arm"
+
regDotColor :: RealReg -> SDoc
regDotColor reg
= case classOfRealReg reg of
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/be3d2837eaba0b089a10700e7e75adac437cc15b
--
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/be3d2837eaba0b089a10700e7e75adac437cc15b
You're receiving this email because of your account on gitlab.haskell.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20230725/1c664a23/attachment-0001.html>
More information about the ghc-commits
mailing list