Message-Id: <8c823ef1-4f99-4548-a60c-a7abd0d083d4@app.fastmail.com>
Date: Thu, 23 Oct 2025 16:56:36 -0400
From: "Stefan O'Rear" <sorear@...tmail.com>
To: musl@...ts.openwall.com
Subject: Re: [PATCH v2 resend 1/1] riscv64: add runtime-detected vector
 optimized memset

On Thu, Oct 23, 2025, at 12:06 PM, Pincheng Wang wrote:
> Add a RISC-V vector extension optimized memset implementation with
> runtime CPU capability detection via AT_HWCAP.
>
> The implementation provides both vector and scalar variants in a single
> binary. At process startup, __init_riscv_string_optimizations() queries
> AT_HWCAP to detect RVV support and selects the appropriate
> implementation via function pointer dispatch. This allows the same libc
> to run correctly on both vector-capable and non-vector RISC-V CPUs.
>
> The vector implementation uses vsetvli for dynamic vector length and
> employs a head-tail filling strategy for small sizes to minimize
> overhead.
>
> To prevent illegal instruction errors, arch.mak disables compiler
> auto-vectorization globally except for memset.S, ensuring only the
> runtime-detected code uses vector instructions.
>
> Signed-off-by: Pincheng Wang <pincheng.plct@...c.iscas.ac.cn>
> ---
>  arch/riscv64/arch.mak                |  12 +++
>  src/env/__libc_start_main.c          |   3 +
>  src/internal/libc.h                  |   1 +
>  src/string/riscv64/memset.S          | 134 +++++++++++++++++++++++++++
>  src/string/riscv64/memset_dispatch.c |  38 ++++++++
>  5 files changed, 188 insertions(+)
>  create mode 100644 arch/riscv64/arch.mak
>  create mode 100644 src/string/riscv64/memset.S
>  create mode 100644 src/string/riscv64/memset_dispatch.c
>
> diff --git a/arch/riscv64/arch.mak b/arch/riscv64/arch.mak
> new file mode 100644
> index 00000000..5978eb0a
> --- /dev/null
> +++ b/arch/riscv64/arch.mak
> @@ -0,0 +1,12 @@
> +# Disable tree vectorization for all files except memset.S
> +
> +# Reason: We have hand-optimized vector memset.S that uses runtime detection
> +# to switch between scalar and vector implementations based on CPU capability.
> +# However, GCC may auto-vectorize other functions (like memcpy, strcpy, etc.)
> +# which would cause illegal instruction errors on CPUs without vector extensions.
> +
> +# Therefore, we disable auto-vectorization for all files except memset.S,
> +# ensuring only our runtime-detected vector code uses vector instructions.
> +
> +# Add -fno-tree-vectorize to all object files except memset.S
> +$(filter-out obj/src/string/riscv64/memset.o obj/src/string/riscv64/memset.lo, $(ALL_OBJS) $(LOBJS)): CFLAGS_ALL += -fno-tree-vectorize

This isn't sufficient to prevent gcc from generating vector instructions
for e.g. struct zeroing idioms.
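
For example (illustrative; any aggregate zeroing will do):

	struct S { long a[4]; };
	void clear(struct S *p) { *p = (struct S){0}; }

When the target arch string includes V, gcc may expand this zeroing
inline with vector stores through its block-set expansion, which
-fno-tree-vectorize does not control (-ftree-slp-vectorize and the
setmem/cpymem expanders are separate knobs).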

It's also the wrong approach because it prevents people who _do_ know at
compile time that their target hardware has V from using it pervasively.

To be consistent with anything else being built with the same options,
-march in CFLAGS should be the minimum set of extensions that are known
at compile time to be available, not including any that must be detected
at runtime.  Then runtime-detected extensions can be made available within
contexts guarded by the runtime detection; this is fairly easy for assembly.
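
Roughly, on the assembly side (a sketch; assumes binutils that accepts
".option arch"):

	    .option push
	    .option arch, +v                  /* V legal only in this region */
	memset_vect:
	    vsetvli t1, a2, e8, m8, ta, ma
	    ...                               /* body as in the patch */
	    ret
	    .option pop

while CFLAGS carries only the compile-time baseline (e.g. -march=rv64gc),
and memset_vect is only reachable through the hwcap check.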

> diff --git a/src/env/__libc_start_main.c b/src/env/__libc_start_main.c
> index c5b277bd..c23e63f4 100644
> --- a/src/env/__libc_start_main.c
> +++ b/src/env/__libc_start_main.c
> @@ -38,6 +38,9 @@ void __init_libc(char **envp, char *pn)
> 
>  	__init_tls(aux);
>  	__init_ssp((void *)aux[AT_RANDOM]);
> +#ifdef __riscv
> +	__init_riscv_string_optimizations();
> +#endif
> 
>  	if (aux[AT_UID]==aux[AT_EUID] && aux[AT_GID]==aux[AT_EGID]
>  		&& !aux[AT_SECURE]) return;
> diff --git a/src/internal/libc.h b/src/internal/libc.h
> index 619bba86..28e893a1 100644
> --- a/src/internal/libc.h
> +++ b/src/internal/libc.h
> @@ -40,6 +40,7 @@ extern hidden struct __libc __libc;
>  hidden void __init_libc(char **, char *);
>  hidden void __init_tls(size_t *);
>  hidden void __init_ssp(void *);
> +hidden void __init_riscv_string_optimizations(void);
>  hidden void __libc_start_init(void);
>  hidden void __funcs_on_exit(void);
>  hidden void __funcs_on_quick_exit(void);
> diff --git a/src/string/riscv64/memset.S b/src/string/riscv64/memset.S
> new file mode 100644
> index 00000000..8568f32b
> --- /dev/null
> +++ b/src/string/riscv64/memset.S
> @@ -0,0 +1,134 @@
> +#ifdef __riscv_vector

This should be unconditional: build both variants regardless of the
compile-time -march, since selection happens at runtime.

> +
> +    .text
> +    .global memset_vect
> +/* void *memset_vect(void *s, int c, size_t n)
> + * a0 = s (dest), a1 = c (fill byte), a2 = n (size)
> + * Returns a0.
> + */

.option push
.option arch,+v

> +memset_vect:
> +    mv      t0, a0                    /* running dst; keep a0 as return */
> +    beqz    a2, .Ldone_vect           /* n == 0 then return */
> +
> +    li      t3, 8
> +    bltu    a2, t3, .Lsmall_vect      /* small-size fast path */
> +
> +    /* Broadcast fill byte once. */
> +    vsetvli t1, zero, e8, m8, ta, ma
> +    vmv.v.x v0, a1
> +
> +.Lbulk_vect:
> +    vsetvli t1, a2, e8, m8, ta, ma    /* t1 = vl (bytes) */
> +    vse8.v  v0, (t0)
> +    add     t0, t0, t1
> +    sub     a2, a2, t1
> +    bnez    a2, .Lbulk_vect
> +    j       .Ldone_vect
> +
> +/* Small-size fast path (< 8).
> + * Head-tail fills to minimize branches and avoid vsetvli overhead.
> + */
> +.Lsmall_vect:

Compilers will generate inline code for memset(s,c,n) where n is a small
constant.  The only reason for memset to actually be called with small n
is if n is variable. I suspect that on real code it will typically be
faster to use vector instructions for small memsets because of the
avoided branch mispredictions.
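
For variable n, the bulk loop in this patch already collapses to a single
iteration once n <= vlmax, e.g. (same code path, no extra work needed):

	    vsetvli t1, a2, e8, m8, ta, ma    /* vl = n when n <= vlmax */
	    vse8.v  v0, (t0)                  /* one store covers the buffer */

so the scalar head/tail unrolling is likely not buying anything here.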

> +    /* Fill s[0], s[n-1] */
> +    sb      a1, 0(t0)
> +    add     t2, t0, a2
> +    sb      a1, -1(t2)
> +    li      t3, 2
> +    bleu    a2, t3, .Ldone_vect
> +
> +    /* Fill s[1], s[2], s[n-2], s[n-3] */
> +    sb      a1, 1(t0)
> +    sb      a1, 2(t0)
> +    sb      a1, -2(t2)
> +    sb      a1, -3(t2)
> +    li      t3, 6
> +    bleu    a2, t3, .Ldone_vect
> +
> +    /* Fill s[3], s[n-4] */
> +    sb      a1, 3(t0)
> +    sb      a1, -4(t2)
> +    /* fallthrough for n < 8 */
> +
> +.Ldone_vect:
> +    ret
> +.size memset_vect, .-memset_vect

.option pop

> +    .text
> +    .global memset_scalar
> +memset_scalar:
> +    mv      t0, a0                    
> +    beqz    a2, .Ldone_scalar
> +
> +    andi    a1, a1, 0xff              
> +
> +    sb      a1, 0(t0)                 
> +    add     t2, t0, a2
> +    sb      a1, -1(t2)                
> +    li      t3, 2
> +    bleu    a2, t3, .Ldone_scalar
> +
> +    sb      a1, 1(t0)
> +    sb      a1, 2(t0)
> +    sb      a1, -2(t2)
> +    sb      a1, -3(t2)
> +    li      t3, 6
> +    bleu    a2, t3, .Ldone_scalar
> +
> +    sb      a1, 3(t0)
> +    sb      a1, -4(t2)
> +    li      t3, 8
> +    bleu    a2, t3, .Ldone_scalar
> +
> +    addi    t4, t0, 4
> +    addi    t5, t2, -4
> +.Lloop_scalar:
> +    bgeu    t4, t5, .Ldone_scalar
> +    sb      a1, 0(t4)
> +    addi    t4, t4, 1
> +    j       .Lloop_scalar
> +
> +.Ldone_scalar:
> +    ret
> +.size memset_scalar, .-memset_scalar
> +
> +#else
> +
> +    .text
> +    .global memset_scalar
> +memset_scalar:
> +    mv      t0, a0                    
> +    beqz    a2, .Ldone_scalar
> +
> +    andi    a1, a1, 0xff              
> +
> +    sb      a1, 0(t0)                 
> +    add     t2, t0, a2
> +    sb      a1, -1(t2)                
> +    li      t3, 2
> +    bleu    a2, t3, .Ldone_scalar
> +
> +    sb      a1, 1(t0)
> +    sb      a1, 2(t0)
> +    sb      a1, -2(t2)
> +    sb      a1, -3(t2)
> +    li      t3, 6
> +    bleu    a2, t3, .Ldone_scalar
> +
> +    sb      a1, 3(t0)
> +    sb      a1, -4(t2)
> +    li      t3, 8
> +    bleu    a2, t3, .Ldone_scalar
> +
> +    addi    t4, t0, 4
> +    addi    t5, t2, -4
> +.Lloop_scalar:
> +    bgeu    t4, t5, .Ldone_scalar
> +    sb      a1, 0(t4)
> +    addi    t4, t4, 1
> +    j       .Lloop_scalar
> +
> +.Ldone_scalar:
> +    ret
> +.size memset_scalar, .-memset_scalar
> +
> +#endif
> diff --git a/src/string/riscv64/memset_dispatch.c b/src/string/riscv64/memset_dispatch.c
> new file mode 100644
> index 00000000..aadf19fb
> --- /dev/null
> +++ b/src/string/riscv64/memset_dispatch.c
> @@ -0,0 +1,38 @@
> +#include "libc.h"
> +#include <stddef.h>
> +#include <stdint.h>
> +#include <sys/auxv.h>
> +
> +void *memset_scalar(void *s, int c, size_t n);
> +#ifdef __riscv_vector
> +void *memset_vect(void *s, int c, size_t n);
> +#endif
> +
> +/* Use scalar implementation by default */

Not really a default since it's always set by __init_libc.  It does
control the memset implementation used in dynlink.c prior to _start.

> +__attribute__((visibility("hidden")))
> +void *(*__memset_ptr)(void *, int, size_t) = memset_scalar;
> +
> +void *memset(void *s, int c, size_t n)
> +{
> +	return __memset_ptr(s, c, n);
> +}
> +
> +static int __has_rvv_via_hwcap(void)
> +{
> +	const unsigned long V_bit = (1ul << ('V' - 'A'));
> +	unsigned long hwcap = getauxval(AT_HWCAP);

getauxval is not a reserved identifier in C and it can be overridden by a
symbol from the main program with a different meaning. Use __getauxval.

You might want to rename memset_scalar and memset_vect to __-prefixed
names for the same reason, but mem* are "potentially reserved
identifiers" so this isn't strictly required.
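
i.e. something like (sketch; uses the internal, non-interposable entry
point):

	static int has_rvv(void)
	{
		return (__getauxval(AT_HWCAP) >> ('V' - 'A')) & 1;
	}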

> +	return (hwcap & V_bit) != 0;
> +}
> +
> +__attribute__((visibility("hidden")))
> +void __init_riscv_string_optimizations(void)
> +{
> +#ifdef __riscv_vector
> +	if (__has_rvv_via_hwcap())
> +		__memset_ptr = memset_vect;
> +	else
> +		__memset_ptr = memset_scalar;
> +#else
> +	__memset_ptr = memset_scalar;
> +#endif
> +}
> -- 
> 2.39.5

-s
