Follow @Openwall on Twitter for new release announcements and other news
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20250925131557.8907-2-pincheng.plct@isrc.iscas.ac.cn>
Date: Thu, 25 Sep 2025 21:15:57 +0800
From: Pincheng Wang <pincheng.plct@...c.iscas.ac.cn>
To: musl@...ts.openwall.com
Cc: pincheng.plct@...c.iscas.ac.cn
Subject: [PATCH 1/1] riscv64: optimize memset implementation with vector extension

Use a head-tail filling strategy for small sizes and a dynamic vsetvli
loop for larger ones to reduce branch overhead. Add conditional
compilation to fall back to a scalar implementation when __riscv_vector
is not defined.

Signed-off-by: Pincheng Wang <pincheng.plct@...c.iscas.ac.cn>
---
 src/string/riscv64/memset.S | 101 ++++++++++++++++++++++++++++++++++++
 1 file changed, 101 insertions(+)
 create mode 100644 src/string/riscv64/memset.S

diff --git a/src/string/riscv64/memset.S b/src/string/riscv64/memset.S
new file mode 100644
index 00000000..5fc6ee14
--- /dev/null
+++ b/src/string/riscv64/memset.S
@@ -0,0 +1,101 @@
+#ifdef __riscv_vector
+
+    .text
+    .global memset
+    .type   memset, %function
+/* void *memset(void *s, int c, size_t n)
+ * In:  a0 = s, a1 = c (only its low byte is stored), a2 = n.
+ * Out: a0 = s, untouched.  Clobbers a2, t0-t3, v0-v7, vl, vtype.
+ */
+memset:
+    mv      t0, a0                    /* t0 = write cursor; a0 is the return value */
+    beqz    a2, .Ldone                /* n == 0: .Lsmall would still store to s[0] */
+
+    li      t3, 8
+    bltu    a2, t3, .Lsmall           /* n < 8: scalar head-tail path */
+
+    /* Splat the fill byte into the v0-v7 group (LMUL=8) once, at VLMAX. */
+    vsetvli t1, zero, e8, m8, ta, ma
+    vmv.v.x v0, a1
+
+.Lbulk:                               /* here a2 = bytes still to fill, > 0 */
+    vsetvli t1, a2, e8, m8, ta, ma    /* t1 = vl = bytes stored this pass */
+    vse8.v  v0, (t0)
+    add     t0, t0, t1
+    sub     a2, a2, t1
+    bnez    a2, .Lbulk
+    ret                               /* return directly, no hop via .Ldone */
+
+/* 1 <= n <= 7: paired stores from both ends overlap in the middle, so
+ * every byte is covered without a per-byte loop or any vsetvli.
+ */
+.Lsmall:
+    /* s[0], s[n-1] */
+    sb      a1, 0(t0)
+    add     t2, t0, a2                /* t2 = s + n */
+    sb      a1, -1(t2)
+    li      t3, 2
+    bleu    a2, t3, .Ldone            /* n <= 2 is complete */
+
+    /* s[1], s[2], s[n-2], s[n-3] */
+    sb      a1, 1(t0)
+    sb      a1, 2(t0)
+    sb      a1, -2(t2)
+    sb      a1, -3(t2)
+    li      t3, 6
+    bleu    a2, t3, .Ldone            /* n <= 6 is complete */
+
+    /* Only n == 7 gets here; s[3] and s[n-4] are the same byte. */
+    sb      a1, 3(t0)
+    sb      a1, -4(t2)
+
+.Ldone:
+    ret
+.size memset, .-memset
+
+#else  /* !__riscv_vector */
+
+    .text
+    .global memset
+    .type   memset, %function
+/* Scalar fallback, built when __riscv_vector is not defined.
+ * void *memset(void *s, int c, size_t n)
+ * In:  a0 = s, a1 = c (sb stores only its low byte), a2 = n.
+ * Out: a0 = s, untouched.  Clobbers t0, t2-t5.
+ */
+memset:
+    mv      t0, a0                    /* t0 = s; a0 is the return value */
+    beqz    a2, .Ldone                /* n == 0: must not store anything */
+
+    /* Head-tail: paired stores from both ends; short n finishes early. */
+    sb      a1, 0(t0)                 /* s[0] */
+    add     t2, t0, a2                /* t2 = s + n */
+    sb      a1, -1(t2)                /* s[n-1] */
+    li      t3, 2
+    bleu    a2, t3, .Ldone
+
+    sb      a1, 1(t0)
+    sb      a1, 2(t0)
+    sb      a1, -2(t2)
+    sb      a1, -3(t2)
+    li      t3, 6
+    bleu    a2, t3, .Ldone
+
+    sb      a1, 3(t0)
+    sb      a1, -4(t2)
+    li      t3, 8
+    bleu    a2, t3, .Ldone
+
+    /* n > 8, so [s+4, s+n-4) is non-empty: test at the loop bottom. */
+    addi    t4, t0, 4
+    addi    t5, t2, -4
+.Lloop:
+    sb      a1, 0(t4)
+    addi    t4, t4, 1
+    bltu    t4, t5, .Lloop
+
+.Ldone:
+    ret
+.size memset, .-memset
+
+#endif
-- 
2.39.5

Powered by blists - more mailing lists

Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.