[Git][ghc/ghc][master] ncg(aarch64): Add fsqrt instruction, byteSwap primitives [#24956]

Fri Jul 12 15:43:27 UTC 2024


Marge Bot pushed to branch master at Glasgow Haskell Compiler / GHC


Commits:
dee035bf by Alex Mason at 2024-07-12T11:42:41-04:00
ncg(aarch64): Add fsqrt instruction, byteSwap primitives [#24956]

Implements the FSQRT machop using native assembly rather than a C call.

Implements MO_BSwap by producing assembly to do the byte swapping
instead of producing a foreign call a C function.

In `tar`, the hot loop for `deserialise` got almost 4x faster by
avoiding the foreign call which caused spilling live variables to the
stack -- this means the loop did 4x more memory read/writing than
necessary in that particular case!

- - - - -


3 changed files:

- compiler/GHC/CmmToAsm/AArch64/CodeGen.hs
- compiler/GHC/CmmToAsm/AArch64/Instr.hs
- compiler/GHC/CmmToAsm/AArch64/Ppr.hs


Changes:

=====================================
compiler/GHC/CmmToAsm/AArch64/CodeGen.hs
=====================================
@@ -1607,9 +1607,20 @@ genCCall target dest_regs arg_regs = do
     PrimTarget MO_F32_Fabs
       | [arg_reg] <- arg_regs, [dest_reg] <- dest_regs ->
         unaryFloatOp W32 (\d x -> unitOL $ FABS d x) arg_reg dest_reg
+      | otherwise -> panic "mal-formed MO_F32_Fabs"
     PrimTarget MO_F64_Fabs
       | [arg_reg] <- arg_regs, [dest_reg] <- dest_regs ->
         unaryFloatOp W64 (\d x -> unitOL $ FABS d x) arg_reg dest_reg
+      | otherwise -> panic "mal-formed MO_F64_Fabs"
+    PrimTarget MO_F32_Sqrt
+      | [arg_reg] <- arg_regs, [dest_reg] <- dest_regs ->
+        unaryFloatOp W32 (\d x -> unitOL $ FSQRT d x) arg_reg dest_reg
+      | otherwise -> panic "mal-formed MO_F32_Sqrt"
+    PrimTarget MO_F64_Sqrt
+      | [arg_reg] <- arg_regs, [dest_reg] <- dest_regs ->
+        unaryFloatOp W64 (\d x -> unitOL $ FSQRT d x) arg_reg dest_reg
+      | otherwise -> panic "mal-formed MO_F64_Sqrt"
+
 
     PrimTarget (MO_S_Mul2 w)
           -- Life is easier when we're working with word sized operands,
@@ -1864,7 +1875,25 @@ genCCall target dest_regs arg_regs = do
                     , RBIT (r dst') (r dst')
                     ]
             | otherwise -> unsupported (MO_BRev  w)
-
+    PrimTarget (MO_BSwap  w)
+          | w == W64 || w == W32
+          , [src] <- arg_regs
+          , [dst] <- dest_regs
+          -> do
+              (reg_a, _format_x, code_x) <- getSomeReg src
+              let dst_reg = getRegisterReg platform (CmmLocal dst)
+              return $ code_x `snocOL` REV (OpReg w dst_reg) (OpReg w reg_a)
+          | w == W16
+          , [src] <- arg_regs
+          , [dst] <- dest_regs
+          -> do
+              (reg_a, _format_x, code_x) <- getSomeReg src
+              let dst' = getRegisterReg platform (CmmLocal dst)
+                  r n = OpReg W32 n
+              -- Swaps the bytes in each 16bit word
+              -- TODO: Expose the 32 & 64 bit version of this?
+              return $ code_x `snocOL` REV16 (r dst') (r reg_a)
+          | otherwise -> unsupported (MO_BSwap w)
 
     -- or a possibly side-effecting machine operation
     -- mop :: CallishMachOp (see GHC.Cmm.MachOp)
@@ -1894,8 +1923,6 @@ genCCall target dest_regs arg_regs = do
         MO_F64_Log1P -> mkCCall "log1p"
         MO_F64_Exp   -> mkCCall "exp"
         MO_F64_ExpM1 -> mkCCall "expm1"
-        MO_F64_Fabs  -> mkCCall "fabs"
-        MO_F64_Sqrt  -> mkCCall "sqrt"
 
         -- 32 bit float ops
         MO_F32_Pwr   -> mkCCall "powf"
@@ -1916,8 +1943,6 @@ genCCall target dest_regs arg_regs = do
         MO_F32_Log1P -> mkCCall "log1pf"
         MO_F32_Exp   -> mkCCall "expf"
         MO_F32_ExpM1 -> mkCCall "expm1f"
-        MO_F32_Fabs  -> mkCCall "fabsf"
-        MO_F32_Sqrt  -> mkCCall "sqrtf"
 
         -- 64-bit primops
         MO_I64_ToI   -> mkCCall "hs_int64ToInt"
@@ -1990,7 +2015,6 @@ genCCall target dest_regs arg_regs = do
         MO_PopCnt w         -> mkCCall (popCntLabel w)
         MO_Pdep w           -> mkCCall (pdepLabel w)
         MO_Pext w           -> mkCCall (pextLabel w)
-        MO_BSwap w          -> mkCCall (bSwapLabel w)
 
         -- -- Atomic read-modify-write.
         MO_AtomicRead w ord


=====================================
compiler/GHC/CmmToAsm/AArch64/Instr.hs
=====================================
@@ -102,6 +102,9 @@ regUsageOfInstr platform instr = case instr of
   UXTH dst src             -> usage (regOp src, regOp dst)
   CLZ  dst src             -> usage (regOp src, regOp dst)
   RBIT dst src             -> usage (regOp src, regOp dst)
+  REV   dst src            -> usage (regOp src, regOp dst)
+  -- REV32 dst src            -> usage (regOp src, regOp dst)
+  REV16 dst src            -> usage (regOp src, regOp dst)
   -- 3. Logical and Move Instructions ------------------------------------------
   AND dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
   ASR dst src1 src2        -> usage (regOp src1 ++ regOp src2, regOp dst)
@@ -139,6 +142,7 @@ regUsageOfInstr platform instr = case instr of
   SCVTF dst src            -> usage (regOp src, regOp dst)
   FCVTZS dst src           -> usage (regOp src, regOp dst)
   FABS dst src             -> usage (regOp src, regOp dst)
+  FSQRT dst src            -> usage (regOp src, regOp dst)
   FMA _ dst src1 src2 src3 ->
     usage (regOp src1 ++ regOp src2 ++ regOp src3, regOp dst)
 
@@ -237,7 +241,11 @@ patchRegsOfInstr instr env = case instr of
     SXTH o1 o2       -> SXTH (patchOp o1) (patchOp o2)
     UXTH o1 o2       -> UXTH (patchOp o1) (patchOp o2)
     CLZ o1 o2        -> CLZ  (patchOp o1) (patchOp o2)
-    RBIT o1 o2       -> RBIT  (patchOp o1) (patchOp o2)
+    RBIT o1 o2       -> RBIT (patchOp o1) (patchOp o2)
+    REV   o1 o2      -> REV  (patchOp o1) (patchOp o2)
+    -- REV32 o1 o2      -> REV32 (patchOp o1) (patchOp o2)
+    REV16 o1 o2      -> REV16 (patchOp o1) (patchOp o2)
+
 
     -- 3. Logical and Move Instructions ----------------------------------------
     AND o1 o2 o3   -> AND  (patchOp o1) (patchOp o2) (patchOp o3)
@@ -277,6 +285,7 @@ patchRegsOfInstr instr env = case instr of
     SCVTF o1 o2    -> SCVTF (patchOp o1) (patchOp o2)
     FCVTZS o1 o2   -> FCVTZS (patchOp o1) (patchOp o2)
     FABS o1 o2     -> FABS (patchOp o1) (patchOp o2)
+    FSQRT o1 o2    -> FSQRT (patchOp o1) (patchOp o2)
     FMA s o1 o2 o3 o4 ->
       FMA s (patchOp o1) (patchOp o2) (patchOp o3) (patchOp o4)
 
@@ -598,6 +607,12 @@ data Instr
     | UBFX Operand Operand Operand Operand -- rd = rn[i,j]
     | CLZ  Operand Operand -- rd = countLeadingZeros(rn)
     | RBIT Operand Operand -- rd = reverseBits(rn)
+    | REV Operand Operand   -- rd = reverseBytes(rn): (for 32 & 64 bit operands)
+                            -- 0xAABBCCDD -> 0xDDCCBBAA
+    | REV16 Operand Operand -- rd = reverseBytes16(rn)
+                            -- 0xAABB_CCDD -> xBBAA_DDCC
+    -- | REV32 Operand Operand -- rd = reverseBytes32(rn) - 64bit operands only!
+    --                         -- 0xAABBCCDD_EEFFGGHH -> 0XDDCCBBAA_HHGGFFEE
 
     -- 3. Logical and Move Instructions ----------------------------------------
     | AND Operand Operand Operand -- rd = rn & op2
@@ -642,6 +657,8 @@ data Instr
     | FCVTZS Operand Operand
     -- Float ABSolute value
     | FABS Operand Operand
+    -- Float SQuare RooT
+    | FSQRT Operand Operand
 
     -- | Floating-point fused multiply-add instructions
     --
@@ -685,6 +702,9 @@ instrCon i =
       UBFX{} -> "UBFX"
       CLZ{} -> "CLZ"
       RBIT{} -> "RBIT"
+      REV{} -> "REV"
+      REV16{} -> "REV16"
+      -- REV32{} -> "REV32"
       AND{} -> "AND"
       ASR{} -> "ASR"
       EOR{} -> "EOR"
@@ -712,6 +732,7 @@ instrCon i =
       SCVTF{} -> "SCVTF"
       FCVTZS{} -> "FCVTZS"
       FABS{} -> "FABS"
+      FSQRT{} -> "FSQRT"
       FMA variant _ _ _ _ ->
         case variant of
           FMAdd  -> "FMADD"


=====================================
compiler/GHC/CmmToAsm/AArch64/Ppr.hs
=====================================
@@ -398,6 +398,9 @@ pprInstr platform instr = case instr of
   UBFM o1 o2 o3 o4 -> op4 (text "\tubfm") o1 o2 o3 o4
   CLZ  o1 o2       -> op2 (text "\tclz")  o1 o2
   RBIT  o1 o2      -> op2 (text "\trbit")  o1 o2
+  REV o1 o2        -> op2 (text "\trev")  o1 o2
+  REV16 o1 o2      -> op2 (text "\trev16")  o1 o2
+  -- REV32 o1 o2      -> op2 (text "\trev32")  o1 o2
   -- signed and unsigned bitfield extract
   SBFX o1 o2 o3 o4 -> op4 (text "\tsbfx") o1 o2 o3 o4
   UBFX o1 o2 o3 o4 -> op4 (text "\tubfx") o1 o2 o3 o4
@@ -531,6 +534,7 @@ pprInstr platform instr = case instr of
   SCVTF o1 o2 -> op2 (text "\tscvtf") o1 o2
   FCVTZS o1 o2 -> op2 (text "\tfcvtzs") o1 o2
   FABS o1 o2 -> op2 (text "\tfabs") o1 o2
+  FSQRT o1 o2 -> op2 (text "\tfsqrt") o1 o2
   FMA variant d r1 r2 r3 ->
     let fma = case variant of
                 FMAdd  -> text "\tfmadd"



View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/dee035bf618d75a18fe72dd3977434c0749a2156

-- 
This project does not include diff previews in email notifications.
View it on GitLab: https://gitlab.haskell.org/ghc/ghc/-/commit/dee035bf618d75a18fe72dd3977434c0749a2156
You're receiving this email because of your account on gitlab.haskell.org.


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mail.haskell.org/pipermail/ghc-commits/attachments/20240712/3a8a15bd/attachment-0001.html>