Date: Wed,  7 Jun 2023 18:07:08 +0800
From: zhangfei <zhang_fei_0403@....com>
To: dalias@...c.org,
	musl@...ts.openwall.com
Cc: zhangfei <zhangfei@...iscas.ac.cn>
Subject: [PATCH 1/3] RISC-V: Optimize memset

From: zhangfei <zhangfei@...iscas.ac.cn>

This code is based on linux/arch/riscv/lib/memset.S, with the kernel's
macro layer replaced by local definitions and the code adapted for
RISCV64. When the fill size is less than 16 bytes, or when bytes remain
after the unrolled word loop, the function falls back to byte stores.
That path follows musl/src/string/memset.c, filling the head and tail
with minimal branching.
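
For reference, the overall approach corresponds roughly to the C sketch
below. This is an illustration only, not the exact assembly: the
function name is invented, and the real code enters the unrolled store
block through a computed jump instead of a plain word loop.

	#include <stddef.h>
	#include <stdint.h>

	void *memset_sketch(void *dest, int c, size_t n)
	{
		unsigned char *s = dest;
		uint64_t w;

		if (n < 16) goto tail;	/* small sizes: bytes only */

		/* Align to an 8-byte boundary with byte stores */
		while ((uintptr_t)s % 8) { *s++ = c; n--; }

		/* Broadcast the fill byte across a 64-bit word */
		w = (unsigned char)c;
		w |= w << 8; w |= w << 16; w |= w << 32;

		/* Word stores; the asm unrolls this 32x and jumps
		 * into the middle for the partial block (the cast
		 * stands in for the aligned sd stores) */
		for (; n >= 8; n -= 8, s += 8) *(uint64_t *)s = w;

	tail:
		/* Head/tail byte fill with minimal branching, in the
		 * style of musl/src/string/memset.c; each test proves
		 * the offsets that follow are inside the buffer */
		if (!n) return dest;
		s[0] = c; s[n-1] = c;
		if (n <= 2) return dest;
		s[1] = s[2] = c; s[n-2] = s[n-3] = c;
		if (n <= 6) return dest;
		s[3] = c; s[n-4] = c;
		if (n <= 8) return dest;
		s[4] = s[5] = c; s[n-5] = c;
		if (n <= 11) return dest;
		s[6] = c; s[n-6] = s[n-7] = c;
		if (n <= 14) return dest;
		s[7] = c;
		return dest;
	}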

Signed-off-by: Zhang Fei <zhangfei@...iscas.ac.cn>
---
 src/string/riscv64/memset.S | 136 ++++++++++++++++++++++++++++++++++++
 1 file changed, 136 insertions(+)
 create mode 100644 src/string/riscv64/memset.S

diff --git a/src/string/riscv64/memset.S b/src/string/riscv64/memset.S
new file mode 100644
index 0000000..f8663d7
--- /dev/null
+++ b/src/string/riscv64/memset.S
@@ -0,0 +1,136 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2013 Regents of the University of California
+ */
+
+#define SZREG 8
+#define REG_S sd
+
+.global memset
+.type memset,@function
+memset:
+	move 	t0, a0  /* Preserve return value */
+
+	/* Defer to byte-oriented fill for small sizes */
+	sltiu 	a3, a2, 16
+	bnez 	a3, 4f
+
+	/*
+	 * Round to nearest XLEN-aligned address
+	 * greater than or equal to start address
+	 */
+	addi 	a3, t0, SZREG-1
+	andi 	a3, a3, ~(SZREG-1)
+	beq 	a3, t0, 2f  /* Skip if already aligned */
+	/* Handle initial misalignment */
+	sub 	a4, a3, t0
+1:
+	sb 	a1, 0(t0)
+	addi 	t0, t0, 1
+	bltu 	t0, a3, 1b
+	sub 	a2, a2, a4  /* Update count */
+
+2:	/* Broadcast the fill byte across all 8 bytes of a1 */
+	andi 	a1, a1, 0xff
+	slli 	a3, a1, 8
+	or 	a1, a3, a1
+	slli 	a3, a1, 16
+	or 	a1, a3, a1
+	slli 	a3, a1, 32
+	or 	a1, a3, a1
+
+	/* Calculate end address */
+	andi 	a4, a2, ~(SZREG-1)
+	add 	a3, t0, a4
+
+	andi 	a4, a4, 31*SZREG  /* Remainder modulo one 32-store block */
+	beqz 	a4, 3f            /* Skip if a whole number of blocks */
+	neg 	a4, a4
+	addi 	a4, a4, 32*SZREG  /* Bytes covered by the stores to skip */
+
+	/* Adjust start address with offset */
+	sub 	t0, t0, a4
+
+	/* Jump into loop body; assumes each sd below is a 4-byte
+	 * (uncompressed) instruction, hence the offset is a4/2 */
+	la 	a5, 3f
+	srli 	a4, a4, 1
+	add 	a5, a5, a4
+	jr 	a5
+3:
+	REG_S 	a1,        0(t0)
+	REG_S 	a1,    SZREG(t0)
+	REG_S 	a1,  2*SZREG(t0)
+	REG_S 	a1,  3*SZREG(t0)
+	REG_S 	a1,  4*SZREG(t0)
+	REG_S 	a1,  5*SZREG(t0)
+	REG_S 	a1,  6*SZREG(t0)
+	REG_S 	a1,  7*SZREG(t0)
+	REG_S 	a1,  8*SZREG(t0)
+	REG_S 	a1,  9*SZREG(t0)
+	REG_S 	a1, 10*SZREG(t0)
+	REG_S 	a1, 11*SZREG(t0)
+	REG_S 	a1, 12*SZREG(t0)
+	REG_S 	a1, 13*SZREG(t0)
+	REG_S 	a1, 14*SZREG(t0)
+	REG_S 	a1, 15*SZREG(t0)
+	REG_S 	a1, 16*SZREG(t0)
+	REG_S 	a1, 17*SZREG(t0)
+	REG_S 	a1, 18*SZREG(t0)
+	REG_S 	a1, 19*SZREG(t0)
+	REG_S 	a1, 20*SZREG(t0)
+	REG_S 	a1, 21*SZREG(t0)
+	REG_S 	a1, 22*SZREG(t0)
+	REG_S 	a1, 23*SZREG(t0)
+	REG_S 	a1, 24*SZREG(t0)
+	REG_S 	a1, 25*SZREG(t0)
+	REG_S 	a1, 26*SZREG(t0)
+	REG_S 	a1, 27*SZREG(t0)
+	REG_S 	a1, 28*SZREG(t0)
+	REG_S 	a1, 29*SZREG(t0)
+	REG_S 	a1, 30*SZREG(t0)
+	REG_S 	a1, 31*SZREG(t0)
+	addi 	t0, t0, 32*SZREG
+	bltu 	t0, a3, 3b
+	andi 	a2, a2, SZREG-1  /* Count of leftover tail bytes */
+
+4:
+	/* Handle trailing misalignment */
+	beqz 	a2, 6f
+	add 	a3, t0, a2
+5:
+	/* Fill head and tail with minimal branching. Each
+	 * conditional ensures that all the subsequently used
+	 * offsets are well-defined and in the dest region. */
+	sb 	a1, 0(t0)
+	sb 	a1, -1(a3)
+	li 	a4, 2
+	bgeu 	a4, a2, 6f
+
+	sb 	a1, 1(t0)
+	sb 	a1, 2(t0)
+	sb 	a1, -2(a3)
+	sb 	a1, -3(a3)
+	li 	a4, 6
+	bgeu 	a4, a2, 6f
+
+	sb 	a1, 3(t0)
+	sb 	a1, -4(a3)
+	li 	a4, 8
+	bgeu 	a4, a2, 6f
+
+	sb 	a1, 4(t0)
+	sb 	a1, 5(t0)
+	sb 	a1, -5(a3)
+	li 	a4, 11
+	bgeu 	a4, a2, 6f
+
+	sb 	a1, 6(t0)
+	sb 	a1, -6(a3)
+	sb 	a1, -7(a3)
+	li 	a4, 14
+	bgeu 	a4, a2, 6f
+
+	sb 	a1, 7(t0)
+6:
+	ret
-- 
2.34.1
