|
|
Message-Id: <20251023154721.130468-2-pincheng.plct@isrc.iscas.ac.cn>
Date: Thu, 23 Oct 2025 23:47:21 +0800
From: Pincheng Wang <pincheng.plct@...c.iscas.ac.cn>
To: musl@...ts.openwall.com
Cc: pincheng.plct@...c.iscas.ac.cn
Subject: [PATCH v2 1/1] riscv64: add runtime-detected vector optimized memset
Add a RISC-V vector extension optimized memset implementation with
runtime CPU capability detection via HW_CAP.
The implementation provides both vector and scalar variants in a single
binary. At process startup, __init_riscv_string_optimizations() queries
AT_HWCAP to detect RVV support and selects the appropriate
implementation via function pointer dispatch. This allows the same libc
to run correctly on both vector-capable and non-vector RISC-V CPUs.
The vector implementation uses vsetvli for dynamic vector length and
employs a head-tail filling strategy for small sizes to minimize
overhead.
To prevent illegal instruction error, arch.mak disables compiler
auto-vectorization globally except for memset.S, ensuring only the
runtime-detected code uses vector instructions.
Signed-off-by: Pincheng Wang <pincheng.plct@...c.iscas.ac.cn>
---
arch/riscv64/arch.mak | 12 +++
src/env/__libc_start_main.c | 3 +
src/internal/libc.h | 1 +
src/string/riscv64/memset.S | 134 +++++++++++++++++++++++++++
src/string/riscv64/memset_dispatch.c | 32 +++++++
5 files changed, 182 insertions(+)
create mode 100644 arch/riscv64/arch.mak
create mode 100644 src/string/riscv64/memset.S
create mode 100644 src/string/riscv64/memset_dispatch.c
diff --git a/arch/riscv64/arch.mak b/arch/riscv64/arch.mak
new file mode 100644
index 00000000..5978eb0a
--- /dev/null
+++ b/arch/riscv64/arch.mak
@@ -0,0 +1,12 @@
+# Disable tree vectorization for all files except memset.S
+
+# Reason: We have hand-optimized vector memset.S that uses runtime detection
+# to switch between scalar and vector implementations based on CPU capability.
+# However, GCC may auto-vectorize other functions (like memcpy, strcpy, etc.)
+# which would cause illegal instruction errors on CPUs without vector extensions.
+
+# Therefore, we disable auto-vectorization for all files except memset.S,
+# ensuring only our runtime-detected vector code uses vector instructions.
+
+# Add -fno-tree-vectorize to all object files except memset.S
+$(filter-out obj/src/string/riscv64/memset.o obj/src/string/riscv64/memset.lo, $(ALL_OBJS) $(LOBJS)): CFLAGS_ALL += -fno-tree-vectorize
diff --git a/src/env/__libc_start_main.c b/src/env/__libc_start_main.c
index c5b277bd..c23e63f4 100644
--- a/src/env/__libc_start_main.c
+++ b/src/env/__libc_start_main.c
@@ -38,6 +38,9 @@ void __init_libc(char **envp, char *pn)
__init_tls(aux);
__init_ssp((void *)aux[AT_RANDOM]);
+#ifdef __riscv
+ __init_riscv_string_optimizations();
+#endif
if (aux[AT_UID]==aux[AT_EUID] && aux[AT_GID]==aux[AT_EGID]
&& !aux[AT_SECURE]) return;
diff --git a/src/internal/libc.h b/src/internal/libc.h
index 619bba86..28e893a1 100644
--- a/src/internal/libc.h
+++ b/src/internal/libc.h
@@ -40,6 +40,7 @@ extern hidden struct __libc __libc;
hidden void __init_libc(char **, char *);
hidden void __init_tls(size_t *);
hidden void __init_ssp(void *);
+hidden void __init_riscv_string_optimizations(void);
hidden void __libc_start_init(void);
hidden void __funcs_on_exit(void);
hidden void __funcs_on_quick_exit(void);
diff --git a/src/string/riscv64/memset.S b/src/string/riscv64/memset.S
new file mode 100644
index 00000000..8568f32b
--- /dev/null
+++ b/src/string/riscv64/memset.S
@@ -0,0 +1,134 @@
+#ifdef __riscv_vector
+
+ .text
+ .global memset_vect
+/* void *memset_vect(void *s, int c, size_t n)
+ * a0 = s (dest), a1 = c (fill byte), a2 = n (size)
+ * Returns a0.
+ */
+memset_vect:
+ mv t0, a0 /* running dst; keep a0 as return */
+ beqz a2, .Ldone_vect /* n == 0 then return */
+
+ li t3, 8
+ bltu a2, t3, .Lsmall_vect /* small-size fast path */
+
+ /* Broadcast fill byte once. */
+ vsetvli t1, zero, e8, m8, ta, ma
+ vmv.v.x v0, a1
+
+.Lbulk_vect:
+ vsetvli t1, a2, e8, m8, ta, ma /* t1 = vl (bytes) */
+ vse8.v v0, (t0)
+ add t0, t0, t1
+ sub a2, a2, t1
+ bnez a2, .Lbulk_vect
+ j .Ldone_vect
+
+/* Small-size fast path (< 8).
+ * Head-tail fills to minimize branches and avoid vsetvli overhead.
+ */
+.Lsmall_vect:
+ /* Fill s[0], s[n-1] */
+ sb a1, 0(t0)
+ add t2, t0, a2
+ sb a1, -1(t2)
+ li t3, 2
+ bleu a2, t3, .Ldone_vect
+
+ /* Fill s[1], s[2], s[n-2], s[n-3] */
+ sb a1, 1(t0)
+ sb a1, 2(t0)
+ sb a1, -2(t2)
+ sb a1, -3(t2)
+ li t3, 6
+ bleu a2, t3, .Ldone_vect
+
+ /* Fill s[3], s[n-4] */
+ sb a1, 3(t0)
+ sb a1, -4(t2)
+ /* fallthrough for n <= 8 */
+
+.Ldone_vect:
+ ret
+.size memset_vect, .-memset_vect
+
+ .text
+ .global memset_scalar
+memset_scalar:
+ mv t0, a0
+ beqz a2, .Ldone_scalar
+
+ andi a1, a1, 0xff
+
+ sb a1, 0(t0)
+ add t2, t0, a2
+ sb a1, -1(t2)
+ li t3, 2
+ bleu a2, t3, .Ldone_scalar
+
+ sb a1, 1(t0)
+ sb a1, 2(t0)
+ sb a1, -2(t2)
+ sb a1, -3(t2)
+ li t3, 6
+ bleu a2, t3, .Ldone_scalar
+
+ sb a1, 3(t0)
+ sb a1, -4(t2)
+ li t3, 8
+ bleu a2, t3, .Ldone_scalar
+
+ addi t4, t0, 4
+ addi t5, t2, -4
+.Lloop_scalar:
+ bgeu t4, t5, .Ldone_scalar
+ sb a1, 0(t4)
+ addi t4, t4, 1
+ j .Lloop_scalar
+
+.Ldone_scalar:
+ ret
+.size memset_scalar, .-memset_scalar
+
+#else
+
+ .text
+ .global memset_scalar
+memset_scalar:
+ mv t0, a0
+ beqz a2, .Ldone_scalar
+
+ andi a1, a1, 0xff
+
+ sb a1, 0(t0)
+ add t2, t0, a2
+ sb a1, -1(t2)
+ li t3, 2
+ bleu a2, t3, .Ldone_scalar
+
+ sb a1, 1(t0)
+ sb a1, 2(t0)
+ sb a1, -2(t2)
+ sb a1, -3(t2)
+ li t3, 6
+ bleu a2, t3, .Ldone_scalar
+
+ sb a1, 3(t0)
+ sb a1, -4(t2)
+ li t3, 8
+ bleu a2, t3, .Ldone_scalar
+
+ addi t4, t0, 4
+ addi t5, t2, -4
+.Lloop_scalar:
+ bgeu t4, t5, .Ldone_scalar
+ sb a1, 0(t4)
+ addi t4, t4, 1
+ j .Lloop_scalar
+
+.Ldone_scalar:
+ ret
+.size memset_scalar, .-memset_scalar
+
+#endif
diff --git a/src/string/riscv64/memset_dispatch.c b/src/string/riscv64/memset_dispatch.c
new file mode 100644
index 00000000..c8648177
--- /dev/null
+++ b/src/string/riscv64/memset_dispatch.c
@@ -0,0 +1,32 @@
+#include "libc.h"
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/auxv.h>
+
+void *memset_scalar(void *s, int c, size_t n);
+void *memset_vect(void *s, int c, size_t n);
+
+/* Use scalar implementation by default */
+__attribute__((visibility("hidden")))
+void *(*__memset_ptr)(void *, int, size_t) = memset_scalar;
+
+void *memset(void *s, int c, size_t n)
+{
+ return __memset_ptr(s, c, n);
+}
+
+static int __has_rvv_via_hwcap(void)
+{
+ const unsigned long V_bit = (1ul << ('V' - 'A'));
+ unsigned long hwcap = getauxval(AT_HWCAP);
+ return (hwcap & V_bit) != 0;
+}
+
+__attribute__((visibility("hidden")))
+void __init_riscv_string_optimizations(void)
+{
+ if (__has_rvv_via_hwcap())
+ __memset_ptr = memset_vect;
+ else
+ __memset_ptr = memset_scalar;
+}
--
2.39.5
Powered by blists - more mailing lists
Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.