Message-ID: <20251208174940.949856-15-bill.roberts@arm.com>
Date: Mon,  8 Dec 2025 11:44:57 -0600
From: Bill Roberts <bill.roberts@....com>
To: musl@...ts.openwall.com
Cc: Bill Roberts <bill.roberts@....com>
Subject: [RFC 14/14] aarch64: rewrite memset routine in C using inline asm

Rewrite the AArch64 memset routine from a standalone assembly file into
a C implementation that uses inline assembly.

This change eliminates the need for handwritten function prologues and
epilogues in memset.S, which simplifies maintenance and allows the compiler
to automatically insert architecture features such as BTI landing pads and
pointer authentication (PAC) sequences where applicable.
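
For illustration only (not part of this patch): when the file is built
with -mbranch-protection=standard, the compiler is expected to place
the required instructions at the function boundary itself, roughly:

    memset:
        bti     c       // landing pad for indirectly called functions;
                        // a paciasp emitted for return-address signing
                        // would serve as a landing pad as well
        ...             // function body

The exact sequence depends on the compiler and on whether the function
saves the link register, so the above is only a sketch of the idea.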

Moving to C also enables the compiler to manage register allocation,
stack usage, and ABI compliance automatically while keeping the low-level
behavior (bitmasks and register accesses) explicit and verifiable.
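
As a minimal sketch of the inline-asm idiom (a hypothetical helper, not
part of this patch), the clobber list is what tells the compiler which
registers and state the asm body touches, so it can do register
allocation around it:

    /* Hypothetical example: fill 16 bytes at p with zero.  */
    static inline void zero16(void *p)
    {
        __asm__ volatile(
            "movi    v0.16b, #0\n\t"
            "str     q0, [%0]"
            :                       /* no outputs */
            : "r"(p)                /* input: pointer in a GPR */
            : "v0", "memory");      /* clobbers the compiler must honor */
    }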

No functional changes intended.

Signed-off-by: Bill Roberts <bill.roberts@....com>
---
 src/string/aarch64/memset.S | 115 ---------------------------------
 src/string/aarch64/memset.c | 122 ++++++++++++++++++++++++++++++++++++
 2 files changed, 122 insertions(+), 115 deletions(-)
 delete mode 100644 src/string/aarch64/memset.S
 create mode 100644 src/string/aarch64/memset.c

diff --git a/src/string/aarch64/memset.S b/src/string/aarch64/memset.S
deleted file mode 100644
index f0d29b7f..00000000
--- a/src/string/aarch64/memset.S
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * memset - fill memory with a constant byte
- *
- * Copyright (c) 2012-2020, Arm Limited.
- * SPDX-License-Identifier: MIT
- */
-
-/* Assumptions:
- *
- * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
- *
- */
-
-#define dstin   x0
-#define val     x1
-#define valw    w1
-#define count   x2
-#define dst     x3
-#define dstend  x4
-#define zva_val x5
-
-.global memset
-.type memset,%function
-memset:
-
-	dup     v0.16B, valw
-	add     dstend, dstin, count
-
-	cmp     count, 96
-	b.hi    .Lset_long
-	cmp     count, 16
-	b.hs    .Lset_medium
-	mov     val, v0.D[0]
-
-	/* Set 0..15 bytes.  */
-	tbz     count, 3, 1f
-	str     val, [dstin]
-	str     val, [dstend, -8]
-	ret
-	nop
-1:      tbz     count, 2, 2f
-	str     valw, [dstin]
-	str     valw, [dstend, -4]
-	ret
-2:      cbz     count, 3f
-	strb    valw, [dstin]
-	tbz     count, 1, 3f
-	strh    valw, [dstend, -2]
-3:      ret
-
-	/* Set 17..96 bytes.  */
-.Lset_medium:
-	str     q0, [dstin]
-	tbnz    count, 6, .Lset96
-	str     q0, [dstend, -16]
-	tbz     count, 5, 1f
-	str     q0, [dstin, 16]
-	str     q0, [dstend, -32]
-1:      ret
-
-	.p2align 4
-	/* Set 64..96 bytes.  Write 64 bytes from the start and
-	   32 bytes from the end.  */
-.Lset96:
-	str     q0, [dstin, 16]
-	stp     q0, q0, [dstin, 32]
-	stp     q0, q0, [dstend, -32]
-	ret
-
-	.p2align 4
-.Lset_long:
-	and     valw, valw, 255
-	bic     dst, dstin, 15
-	str     q0, [dstin]
-	cmp     count, 160
-	ccmp    valw, 0, 0, hs
-	b.ne    .Lno_zva
-
-#ifndef SKIP_ZVA_CHECK
-	mrs     zva_val, dczid_el0
-	and     zva_val, zva_val, 31
-	cmp     zva_val, 4              /* ZVA size is 64 bytes.  */
-	b.ne    .Lno_zva
-#endif
-	str     q0, [dst, 16]
-	stp     q0, q0, [dst, 32]
-	bic     dst, dst, 63
-	sub     count, dstend, dst      /* Count is now 64 too large.  */
-	sub     count, count, 128       /* Adjust count and bias for loop.  */
-
-	.p2align 4
-.Lzva_loop:
-	add     dst, dst, 64
-	dc      zva, dst
-	subs    count, count, 64
-	b.hi    .Lzva_loop
-	stp     q0, q0, [dstend, -64]
-	stp     q0, q0, [dstend, -32]
-	ret
-
-.Lno_zva:
-	sub     count, dstend, dst      /* Count is 16 too large.  */
-	sub     dst, dst, 16            /* Dst is biased by -32.  */
-	sub     count, count, 64 + 16   /* Adjust count and bias for loop.  */
-.Lno_zva_loop:
-	stp     q0, q0, [dst, 32]
-	stp     q0, q0, [dst, 64]!
-	subs    count, count, 64
-	b.hi    .Lno_zva_loop
-	stp     q0, q0, [dstend, -64]
-	stp     q0, q0, [dstend, -32]
-	ret
-
-.size memset,.-memset
-
diff --git a/src/string/aarch64/memset.c b/src/string/aarch64/memset.c
new file mode 100644
index 00000000..dfc820c6
--- /dev/null
+++ b/src/string/aarch64/memset.c
@@ -0,0 +1,122 @@
+/*
+ * memset - fill memory with a constant byte
+ *
+ * Copyright (c) 2012-2020, Arm Limited.
+ * SPDX-License-Identifier: MIT
+ */
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
+ *
+ */
+
+#include <stddef.h>
+
+void *memset(void *dstin, int c, size_t count)
+{
+	__asm__ volatile(
+		"dup     v0.16B, w1\n\t"
+		"add     x4, x0, x2\n\t"          // dstend = dstin + count
+
+		"cmp     x2, 96\n\t"
+		"b.hi    .Lset_long\n\t"
+		"cmp     x2, 16\n\t"
+		"b.hs    .Lset_medium\n\t"
+		"mov     x1, v0.D[0]\n\t"
+
+		/* Set 0..15 bytes.  */
+		"tbz     x2, 3, 1f\n\t"
+		"str     x1, [x0]\n\t"
+		"str     x1, [x4, -8]\n\t"
+		"ret\n\t"
+		"nop\n"
+
+	"1:\n\t"
+		"tbz     x2, 2, 2f\n\t"
+		"str     w1, [x0]\n\t"
+		"str     w1, [x4, -4]\n\t"
+		"ret\n"
+
+	"2:\n\t"
+		"cbz     x2, 3f\n\t"
+		"strb    w1, [x0]\n\t"
+		"tbz     x2, 1, 3f\n\t"
+		"strh    w1, [x4, -2]\n"
+	"3:\n\t"
+		"ret\n"
+
+		/* Set 17..96 bytes.  */
+	".Lset_medium:\n\t"
+		"str     q0, [x0]\n\t"
+		"tbnz    x2, 6, .Lset96\n\t"
+		"str     q0, [x4, -16]\n\t"
+		"tbz     x2, 5, 1f\n\t"
+		"str     q0, [x0, 16]\n\t"
+		"str     q0, [x4, -32]\n"
+	"1:\n\t"
+		"ret\n\t"
+
+		".p2align 4\n"
+		/* Set 64..96 bytes.  Write 64 bytes from the start and
+		   32 bytes from the end.  */
+	".Lset96:\n\t"
+		"str     q0, [x0, 16]\n\t"
+		"stp     q0, q0, [x0, 32]\n\t"
+		"stp     q0, q0, [x4, -32]\n\t"
+		"ret\n\t"
+
+		".p2align 4\n"
+	".Lset_long:\n\t"
+		"and     w1, w1, 255\n\t"
+		"bic     x3, x0, 15\n\t"
+		"str     q0, [x0]\n\t"
+		"cmp     x2, 160\n\t"
+		"ccmp    w1, 0, 0, hs\n\t"
+		"b.ne    .Lno_zva\n\t"
+
+#ifndef SKIP_ZVA_CHECK
+		"mrs     x5, dczid_el0\n\t"
+		"and     x5, x5, 31\n\t"
+		"cmp     x5, 4\n\t"              /* ZVA size is 64 bytes.  */
+		"b.ne    .Lno_zva\n\t"
+#endif
+		"str     q0, [x3, 16]\n\t"
+		"stp     q0, q0, [x3, 32]\n\t"
+		"bic     x3, x3, 63\n\t"
+		"sub     x2, x4, x3\n\t"         /* Count is now 64 too large.  */
+		"sub     x2, x2, 128\n\t"        /* Adjust count and bias for loop.  */
+
+		".p2align 4\n"
+	".Lzva_loop:\n\t"
+		"add     x3, x3, 64\n\t"
+		"dc      zva, x3\n\t"
+		"subs    x2, x2, 64\n\t"
+		"b.hi    .Lzva_loop\n\t"
+		"stp     q0, q0, [x4, -64]\n\t"
+		"stp     q0, q0, [x4, -32]\n\t"
+		"ret\n"
+
+	".Lno_zva:\n\t"
+		"sub     x2, x4, x3\n\t"     /* Count is 16 too large.  */
+		"sub     x3, x3, 16\n\t"     /* Dst is biased by -32.  */
+		"sub     x2, x2, 64 + 16\n"  /* Adjust count and bias for loop.  */
+
+	".Lno_zva_loop:\n\t"
+		"stp     q0, q0, [x3, 32]\n\t"
+		"stp     q0, q0, [x3, 64]!\n\t"
+		"subs    x2, x2, 64\n\t"
+		"b.hi    .Lno_zva_loop\n\t"
+		"stp     q0, q0, [x4, -64]\n\t"
+		"stp     q0, q0, [x4, -32]\n\t"
+		"ret\n\t"
+		:
+		:
+		: "x3", "x4", "x5",   // dst, dstend, zva_val
+		  "v0",               // SIMD register used for pattern
+		  "cc", "memory"
+	);
+
+	__builtin_unreachable();  // all returns are via the asm 'ret' paths
+}
+
-- 
2.51.0
