Message-Id: <20251113161518.57357-2-pincheng.plct@isrc.iscas.ac.cn>
Date: Fri, 14 Nov 2025 00:15:18 +0800
From: Pincheng Wang <pincheng.plct@...c.iscas.ac.cn>
To: musl@...ts.openwall.com
Cc: pincheng.plct@...c.iscas.ac.cn
Subject: [PATCH 1/1] riscv64: add optimized memset, memcpy and memmove

Add RISC-V vector extension (RVV) optimized memset, memcpy and memmove
implementations with runtime CPU capability detection via AT_HWCAP.

The implementations provide both vector and scalar variants in a single
binary. At process startup, __init_riscv_string_optimizations() queries
AT_HWCAP to detect RVV support and selects the appropriate
implementations via function pointer dispatch. This allows the same libc
to run correctly on both vector-capable and non-vector RISC-V CPUs.
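
In outline (condensed from the string_dispatch.c hunk below; memset and
memmove are handled the same way), the mechanism is a hidden function
pointer per routine, initialized to the scalar variant and redirected at
startup:

	void *(*__memcpy_ptr)(void *, const void *, size_t) = __memcpy_scalar;

	void *memcpy(void *restrict dest, const void *restrict src, size_t n)
	{
		return __memcpy_ptr(dest, src, n);
	}

	void __init_riscv_string_optimizations(void)
	{
		/* 'V' bit in AT_HWCAP advertises the vector extension */
		if (__getauxval(AT_HWCAP) & (1ul << ('V' - 'A')))
			__memcpy_ptr = __memcpy_vect;
	}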

The vector implementation uses m8 (LMUL=8) register grouping and
strip-mines the data in chunks of the hardware-selected vector length,
providing significant performance improvements on RVV-capable hardware
while maintaining compatibility with non-vector RISC-V systems.
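
For reference, the same strip-mining pattern rendered with the RVV C
intrinsics (riscv_vector.h) looks roughly like the sketch below; it is
illustrative only, not part of the patch, and the helper name is made up:

	#include <stddef.h>
	#include <riscv_vector.h>

	/* e8/m8 strip-mined copy: each pass handles the vl bytes that
	 * vsetvl grants for the remaining count, so no scalar tail loop
	 * is needed. */
	static void copy_e8m8(unsigned char *d, const unsigned char *s, size_t n)
	{
		while (n) {
			size_t vl = __riscv_vsetvl_e8m8(n);
			vuint8m8_t v = __riscv_vle8_v_u8m8(s, vl);
			__riscv_vse8_v_u8m8(d, v, vl);
			d += vl;
			s += vl;
			n -= vl;
		}
	}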

Signed-off-by: Pincheng Wang <pincheng.plct@...c.iscas.ac.cn>
---
 src/env/__libc_start_main.c          |  3 ++
 src/internal/libc.h                  |  3 ++
 src/string/riscv64/memcpy.c          |  4 ++
 src/string/riscv64/memcpy_vector.S   | 27 ++++++++++++++
 src/string/riscv64/memmove.c         |  4 ++
 src/string/riscv64/memmove_vector.S  | 51 ++++++++++++++++++++++++++
 src/string/riscv64/memset.c          |  4 ++
 src/string/riscv64/memset_dispatch.c | 36 ++++++++++++++++++
 src/string/riscv64/memset_vector.S   | 28 ++++++++++++++
 src/string/riscv64/string_dispatch.c | 55 ++++++++++++++++++++++++++++
 10 files changed, 215 insertions(+)
 create mode 100644 src/string/riscv64/memcpy.c
 create mode 100644 src/string/riscv64/memcpy_vector.S
 create mode 100644 src/string/riscv64/memmove.c
 create mode 100644 src/string/riscv64/memmove_vector.S
 create mode 100644 src/string/riscv64/memset.c
 create mode 100644 src/string/riscv64/memset_dispatch.c
 create mode 100644 src/string/riscv64/memset_vector.S
 create mode 100644 src/string/riscv64/string_dispatch.c

diff --git a/src/env/__libc_start_main.c b/src/env/__libc_start_main.c
index c5b277bd..c23e63f4 100644
--- a/src/env/__libc_start_main.c
+++ b/src/env/__libc_start_main.c
@@ -38,6 +38,9 @@ void __init_libc(char **envp, char *pn)
 
 	__init_tls(aux);
 	__init_ssp((void *)aux[AT_RANDOM]);
+#ifdef __riscv
+	__init_riscv_string_optimizations();
+#endif
 
 	if (aux[AT_UID]==aux[AT_EUID] && aux[AT_GID]==aux[AT_EGID]
 		&& !aux[AT_SECURE]) return;
diff --git a/src/internal/libc.h b/src/internal/libc.h
index 619bba86..28860b06 100644
--- a/src/internal/libc.h
+++ b/src/internal/libc.h
@@ -40,6 +40,9 @@ extern hidden struct __libc __libc;
 hidden void __init_libc(char **, char *);
 hidden void __init_tls(size_t *);
 hidden void __init_ssp(void *);
+#ifdef __riscv
+hidden void __init_riscv_string_optimizations(void);
+#endif
 hidden void __libc_start_init(void);
 hidden void __funcs_on_exit(void);
 hidden void __funcs_on_quick_exit(void);
diff --git a/src/string/riscv64/memcpy.c b/src/string/riscv64/memcpy.c
new file mode 100644
index 00000000..01892e69
--- /dev/null
+++ b/src/string/riscv64/memcpy.c
@@ -0,0 +1,4 @@
+/* Rename the generic memcpy to __memcpy_scalar and include it */
+#define memcpy __memcpy_scalar
+#include "../memcpy.c"
+#undef memcpy
diff --git a/src/string/riscv64/memcpy_vector.S b/src/string/riscv64/memcpy_vector.S
new file mode 100644
index 00000000..6a045832
--- /dev/null
+++ b/src/string/riscv64/memcpy_vector.S
@@ -0,0 +1,27 @@
+    .text
+    .global __memcpy_vect
+    .option push
+    .option arch, +v
+/* void *__memcpy_vect(void *dest, const void *src, size_t n)
+ * a0 = dest, a1 = src, a2 = n
+ * Returns a0.
+ */
+__memcpy_vect:
+    mv      t0, a0                    /* running dst */
+    mv      t1, a1                    /* running src */
+    beqz    a2, .Ldone_copy           /* n == 0 then return */
+
+.Lbulk_copy:
+    vsetvli t2, a2, e8, m8, ta, ma    /* t2 = vl (bytes) */
+    vle8.v  v0, (t1)
+    vse8.v  v0, (t0)
+    add     t0, t0, t2
+    add     t1, t1, t2
+    sub     a2, a2, t2
+    bnez    a2, .Lbulk_copy
+    /* fallthrough */
+
+.Ldone_copy:
+    ret
+.size __memcpy_vect, .-__memcpy_vect
+.option pop
diff --git a/src/string/riscv64/memmove.c b/src/string/riscv64/memmove.c
new file mode 100644
index 00000000..915d6ba9
--- /dev/null
+++ b/src/string/riscv64/memmove.c
@@ -0,0 +1,4 @@
+/* Rename the generic memmove to __memmove_scalar and include it */
+#define memmove __memmove_scalar
+#include "../memmove.c"
+#undef memmove
diff --git a/src/string/riscv64/memmove_vector.S b/src/string/riscv64/memmove_vector.S
new file mode 100644
index 00000000..aec87a52
--- /dev/null
+++ b/src/string/riscv64/memmove_vector.S
@@ -0,0 +1,51 @@
+    .text
+    .global __memmove_vect
+    .option push
+    .option arch, +v
+/* void *__memmove_vect(void *dest, const void *src, size_t n)
+ * a0 = dest, a1 = src, a2 = n
+ * Returns a0.
+ */
+__memmove_vect:
+    beqz    a2, .Ldone_move           /* n == 0 */
+    beq     a0, a1, .Ldone_move       /* dst == src */
+
+    /* overlap check */
+    bgeu    a1, a0, .Lforward_move    /* src >= dst then forward move */
+
+    sub     t2, a0, a1                /* t2 = dst - src */
+    bgeu    t2, a2, .Lforward_move    /* no overlap then forward move */
+
+    /* backward move */
+    add     t0, a0, a2                /* running dst_end */
+    add     t1, a1, a2                /* running src_end */
+
+.Lbackward_loop:
+    vsetvli t3, a2, e8, m8, ta, ma    /* t3 = vl (bytes) */
+    sub     t0, t0, t3
+    sub     t1, t1, t3
+    vle8.v  v0, (t1)
+    vse8.v  v0, (t0)
+    sub     a2, a2, t3
+    bnez    a2, .Lbackward_loop
+    j       .Ldone_move
+
+    /* forward move, same as __memcpy_vect */
+.Lforward_move:
+    mv      t0, a0                    /* running dst */
+    mv      t1, a1                    /* running src */
+
+.Lforward_loop:
+    vsetvli t3, a2, e8, m8, ta, ma
+    vle8.v  v0, (t1)
+    vse8.v  v0, (t0)
+    add     t0, t0, t3
+    add     t1, t1, t3
+    sub     a2, a2, t3
+    bnez    a2, .Lforward_loop
+    /* fallthrough */
+
+.Ldone_move:
+    ret
+.size __memmove_vect, .-__memmove_vect
+.option pop
diff --git a/src/string/riscv64/memset.c b/src/string/riscv64/memset.c
new file mode 100644
index 00000000..11fa3032
--- /dev/null
+++ b/src/string/riscv64/memset.c
@@ -0,0 +1,4 @@
+/* Rename the generic memset to __memset_scalar and include it */
+#define memset __memset_scalar
+#include "../memset.c"
+#undef memset
diff --git a/src/string/riscv64/memset_dispatch.c b/src/string/riscv64/memset_dispatch.c
new file mode 100644
index 00000000..8e36d0f9
--- /dev/null
+++ b/src/string/riscv64/memset_dispatch.c
@@ -0,0 +1,36 @@
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/auxv.h>
+#include "libc.h"
+
+void *__memset_scalar(void *s, int c, size_t n);
+void *__memset_vect(void *s, int c, size_t n);
+
+/* memset function pointer, runtime-dispatched based on RVV support */
+__attribute__((visibility("hidden")))
+#ifndef __riscv_vector
+void *(*__memset_ptr)(void *, int, size_t) = __memset_scalar;
+#else
+void *(*__memset_ptr)(void *, int, size_t) = __memset_vect;
+#endif
+
+void *memset(void *s, int c, size_t n)
+{
+	return __memset_ptr(s, c, n);
+}
+
+static __inline int __has_rvv_via_hwcap(void)
+{
+	const unsigned long V_bit = (1ul << ('V' - 'A'));
+	unsigned long hwcap = __getauxval(AT_HWCAP);
+	return (hwcap & V_bit) != 0;
+}
+
+__attribute__((visibility("hidden")))
+void __init_riscv_string_optimizations(void)
+{
+	if (__has_rvv_via_hwcap())
+		__memset_ptr = __memset_vect;
+	else
+		__memset_ptr = __memset_scalar;
+}
diff --git a/src/string/riscv64/memset_vector.S b/src/string/riscv64/memset_vector.S
new file mode 100644
index 00000000..513645d3
--- /dev/null
+++ b/src/string/riscv64/memset_vector.S
@@ -0,0 +1,28 @@
+    .text
+    .global __memset_vect
+    .option push
+    .option arch, +v
+/* void *__memset_vect(void *s, int c, size_t n)
+ * a0 = s (dest), a1 = c (fill byte), a2 = n (size)
+ * Returns a0.
+ */
+__memset_vect:
+    mv      t0, a0                    /* running dst; keep a0 as return */
+    beqz    a2, .Ldone_vect           /* n == 0 then return */
+
+    /* Broadcast fill byte once. */
+    vsetvli t1, zero, e8, m8, ta, ma
+    vmv.v.x v0, a1
+
+.Lbulk_vect:
+    vsetvli t1, a2, e8, m8, ta, ma    /* t1 = vl (bytes) */
+    vse8.v  v0, (t0)
+    add     t0, t0, t1
+    sub     a2, a2, t1
+    bnez    a2, .Lbulk_vect
+    /* fallthrough */
+
+.Ldone_vect:
+    ret
+.size __memset_vect, .-__memset_vect
+.option pop
diff --git a/src/string/riscv64/string_dispatch.c b/src/string/riscv64/string_dispatch.c
new file mode 100644
index 00000000..2844d906
--- /dev/null
+++ b/src/string/riscv64/string_dispatch.c
@@ -0,0 +1,55 @@
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/auxv.h>
+#include "libc.h"
+
+void *__memset_scalar(void *s, int c, size_t n);
+void *__memset_vect(void *s, int c, size_t n);
+void *__memcpy_scalar(void *restrict dest, const void *restrict src, size_t n);
+void *__memcpy_vect(void *restrict dest, const void *restrict src, size_t n);
+void *__memmove_scalar(void *restrict dest, const void *restrict src, size_t n);
+void *__memmove_vect(void *restrict dest, const void *restrict src, size_t n);
+
+/* string function pointer, runtime-dispatched based on RVV support */
+__attribute__((visibility("hidden")))
+#ifndef __riscv_vector
+void *(*__memset_ptr)(void *, int, size_t) = __memset_scalar;
+void *(*__memcpy_ptr)(void *, const void *, size_t) = __memcpy_scalar;
+void *(*__memmove_ptr)(void *, const void *, size_t) = __memmove_scalar;
+#else
+void *(*__memset_ptr)(void *, int, size_t) = __memset_vect;
+void *(*__memcpy_ptr)(void *, const void *, size_t) = __memcpy_vect;
+void *(*__memmove_ptr)(void *, const void *, size_t) = __memmove_vect;
+#endif
+
+void *memset(void *s, int c, size_t n)
+{
+	return __memset_ptr(s, c, n);
+}
+
+void *memcpy(void *restrict dest, const void *restrict src, size_t n)
+{
+	return __memcpy_ptr(dest, src, n);
+}
+
+void *memmove(void *dest, const void *src, size_t n)
+{
+	return __memmove_ptr(dest, src, n);
+}
+
+static __inline int __has_rvv_via_hwcap(void)
+{
+	const unsigned long V_bit = (1ul << ('V' - 'A'));
+	unsigned long hwcap = __getauxval(AT_HWCAP);
+	return (hwcap & V_bit) != 0;
+}
+
+__attribute__((visibility("hidden")))
+void __init_riscv_string_optimizations(void)
+{
+	if (__has_rvv_via_hwcap()) {
+		__memset_ptr = __memset_vect;
+		__memcpy_ptr = __memcpy_vect;
+		__memmove_ptr = __memmove_vect;
+	}
+}
-- 
2.39.5
