|
|
Message-ID: <20251208174940.949856-15-bill.roberts@arm.com>
Date: Mon, 8 Dec 2025 11:44:57 -0600
From: Bill Roberts <bill.roberts@....com>
To: musl@...ts.openwall.com
Cc: Bill Roberts <bill.roberts@....com>
Subject: [RFC 14/14] aarch64: rewrite memset routine in C using inline asm

Rewrite the AArch64 memset routine from a standalone assembly file into a C
implementation that uses inline assembly.

This change eliminates the need for handwritten function prologues and
epilogues in memset.S, which simplifies maintenance and allows the compiler
to emit architecture-specific instructions such as BTI landing pads and
pointer authentication (PAC) sequences automatically where applicable.
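
For illustration only (not part of this patch): when the tree is built with a
branch-protection option such as -mbranch-protection=standard, the compiler
typically emits the protection instructions at the boundary of a C function
on its own, roughly:

  memset:
          bti     c       // BTI landing pad for indirect calls
          ...             // function body
          ret

and inserts paciasp/autiasp pairs in functions that save the return address.
With a handwritten memset.S, those instructions would have to be added and
kept up to date by hand.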

Moving to C also enables the compiler to manage register allocation, stack
usage, and ABI compliance automatically, while keeping the low-level behavior
(bitmasks and register accesses) explicit and verifiable.

No functional changes intended.
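
One way to sanity-check this (illustrative only; assumes an AArch64 cross
toolchain such as aarch64-linux-gnu-gcc is available) is to inspect the
generated code before and after:

  aarch64-linux-gnu-gcc -c memset.S -o old.o       # pre-patch assembly file
  aarch64-linux-gnu-gcc -O2 -c memset.c -o new.o   # this patch
  aarch64-linux-gnu-objdump -d old.o
  aarch64-linux-gnu-objdump -d new.o

Label names and alignment padding may differ, but the instruction sequences
should line up.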
Signed-off-by: Bill Roberts <bill.roberts@....com>
---
src/string/aarch64/memset.S | 115 ---------------------------------
src/string/aarch64/memset.c | 122 ++++++++++++++++++++++++++++++++++++
2 files changed, 122 insertions(+), 115 deletions(-)
delete mode 100644 src/string/aarch64/memset.S
create mode 100644 src/string/aarch64/memset.c
diff --git a/src/string/aarch64/memset.S b/src/string/aarch64/memset.S
deleted file mode 100644
index f0d29b7f..00000000
--- a/src/string/aarch64/memset.S
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * memset - fill memory with a constant byte
- *
- * Copyright (c) 2012-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
- *
- */
-
-#define dstin x0
-#define val x1
-#define valw w1
-#define count x2
-#define dst x3
-#define dstend x4
-#define zva_val x5
-
-.global memset
-.type memset,%function
-memset:
-
- dup v0.16B, valw
- add dstend, dstin, count
-
- cmp count, 96
- b.hi .Lset_long
- cmp count, 16
- b.hs .Lset_medium
- mov val, v0.D[0]
-
- /* Set 0..15 bytes. */
- tbz count, 3, 1f
- str val, [dstin]
- str val, [dstend, -8]
- ret
- nop
-1: tbz count, 2, 2f
- str valw, [dstin]
- str valw, [dstend, -4]
- ret
-2: cbz count, 3f
- strb valw, [dstin]
- tbz count, 1, 3f
- strh valw, [dstend, -2]
-3: ret
-
- /* Set 17..96 bytes. */
-.Lset_medium:
- str q0, [dstin]
- tbnz count, 6, .Lset96
- str q0, [dstend, -16]
- tbz count, 5, 1f
- str q0, [dstin, 16]
- str q0, [dstend, -32]
-1: ret
-
- .p2align 4
- /* Set 64..96 bytes. Write 64 bytes from the start and
- 32 bytes from the end. */
-.Lset96:
- str q0, [dstin, 16]
- stp q0, q0, [dstin, 32]
- stp q0, q0, [dstend, -32]
- ret
-
- .p2align 4
-.Lset_long:
- and valw, valw, 255
- bic dst, dstin, 15
- str q0, [dstin]
- cmp count, 160
- ccmp valw, 0, 0, hs
- b.ne .Lno_zva
-
-#ifndef SKIP_ZVA_CHECK
- mrs zva_val, dczid_el0
- and zva_val, zva_val, 31
- cmp zva_val, 4 /* ZVA size is 64 bytes. */
- b.ne .Lno_zva
-#endif
- str q0, [dst, 16]
- stp q0, q0, [dst, 32]
- bic dst, dst, 63
- sub count, dstend, dst /* Count is now 64 too large. */
- sub count, count, 128 /* Adjust count and bias for loop. */
-
- .p2align 4
-.Lzva_loop:
- add dst, dst, 64
- dc zva, dst
- subs count, count, 64
- b.hi .Lzva_loop
- stp q0, q0, [dstend, -64]
- stp q0, q0, [dstend, -32]
- ret
-
-.Lno_zva:
- sub count, dstend, dst /* Count is 16 too large. */
- sub dst, dst, 16 /* Dst is biased by -32. */
- sub count, count, 64 + 16 /* Adjust count and bias for loop. */
-.Lno_zva_loop:
- stp q0, q0, [dst, 32]
- stp q0, q0, [dst, 64]!
- subs count, count, 64
- b.hi .Lno_zva_loop
- stp q0, q0, [dstend, -64]
- stp q0, q0, [dstend, -32]
- ret
-
-.size memset,.-memset
-
diff --git a/src/string/aarch64/memset.c b/src/string/aarch64/memset.c
new file mode 100644
index 00000000..dfc820c6
--- /dev/null
+++ b/src/string/aarch64/memset.c
@@ -0,0 +1,122 @@
+/*
+ * memset - fill memory with a constant byte
+ *
+ * Copyright (c) 2012-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ *
+ */
+
+#include <stddef.h>
+
+void *memset(void *dstin, int c, size_t count)
+{
+ __asm__ volatile(
+ "dup v0.16B, w1\n\t"
+ "add x4, x0, x2\n\t" // dstend = dstin + count
+
+ "cmp x2, 96\n\t"
+ "b.hi .Lset_long\n\t"
+ "cmp x2, 16\n\t"
+ "b.hs .Lset_medium\n\t"
+ "mov x1, v0.D[0]\n\t"
+
+ /* Set 0..15 bytes. */
+ "tbz x2, 3, 1f\n\t"
+ "str x1, [x0]\n\t"
+ "str x1, [x4, -8]\n\t"
+ "ret\n\t"
+ "nop\n"
+
+ "1:\n\t"
+ "tbz x2, 2, 2f\n\t"
+ "str w1, [x0]\n\t"
+ "str w1, [x4, -4]\n\t"
+ "ret\n"
+
+ "2:\n\t"
+ "cbz x2, 3f\n\t"
+ "strb w1, [x0]\n\t"
+ "tbz x2, 1, 3f\n\t"
+ "strh w1, [x4, -2]\n"
+ "3:\n\t"
+ "ret\n"
+
+ /* Set 17..96 bytes. */
+ ".Lset_medium:\n\t"
+ "str q0, [x0]\n\t"
+ "tbnz x2, 6, .Lset96\n\t"
+ "str q0, [x4, -16]\n\t"
+ "tbz x2, 5, 1f\n\t"
+ "str q0, [x0, 16]\n\t"
+ "str q0, [x4, -32]\n"
+ "1:\n\t"
+ "ret\n\t"
+
+ ".p2align 4\n"
+ /* Set 64..96 bytes. Write 64 bytes from the start and
+ 32 bytes from the end. */
+ ".Lset96:\n\t"
+ "str q0, [x0, 16]\n\t"
+ "stp q0, q0, [x0, 32]\n\t"
+ "stp q0, q0, [x4, -32]\n\t"
+ "ret\n\t"
+
+ ".p2align 4\n"
+ ".Lset_long:\n\t"
+ "and w1, w1, 255\n\t"
+ "bic x3, x0, 15\n\t"
+ "str q0, [x0]\n\t"
+ "cmp x2, 160\n\t"
+ "ccmp w1, 0, 0, hs\n\t"
+ "b.ne .Lno_zva\n\t"
+
+#ifndef SKIP_ZVA_CHECK
+ "mrs x5, dczid_el0\n\t"
+ "and x5, x5, 31\n\t"
+ "cmp x5, 4\n\t" /* ZVA size is 64 bytes. */
+ "b.ne .Lno_zva\n\t"
+#endif
+ "str q0, [x3, 16]\n\t"
+ "stp q0, q0, [x3, 32]\n\t"
+ "bic x3, x3, 63\n\t"
+ "sub x2, x4, x3\n\t" /* Count is now 64 too large. */
+ "sub x2, x2, 128\n\t" /* Adjust count and bias for loop. */
+
+ ".p2align 4\n"
+ ".Lzva_loop:\n\t"
+ "add x3, x3, 64\n\t"
+ "dc zva, x3\n\t"
+ "subs x2, x2, 64\n\t"
+ "b.hi .Lzva_loop\n\t"
+ "stp q0, q0, [x4, -64]\n\t"
+ "stp q0, q0, [x4, -32]\n\t"
+ "ret\n"
+
+ ".Lno_zva:\n\t"
+ "sub x2, x4, x3\n\t" /* Count is 16 too large. */
+ "sub x3, x3, 16\n\t" /* Dst is biased by -32. */
+ "sub x2, x2, 64 + 16\n" /* Adjust count and bias for loop. */
+
+ ".Lno_zva_loop:\n\t"
+ "stp q0, q0, [x3, 32]\n\t"
+ "stp q0, q0, [x3, 64]!\n\t"
+ "subs x2, x2, 64\n\t"
+ "b.hi .Lno_zva_loop\n\t"
+ "stp q0, q0, [x4, -64]\n\t"
+ "stp q0, q0, [x4, -32]\n\t"
+ "ret\n\t"
+ :
+ :
+ : "x3", "x4", "x5", // dst, dstend, zva_val
+ "v0", // SIMD register used for pattern
+ "cc", "memory"
+ );
+
+ __builtin_unreachable(); // all returns are via the asm 'ret' paths
+}
+
--
2.51.0