Date: Wed,  7 Jun 2023 18:07:09 +0800
From: zhangfei <zhang_fei_0403@....com>
To: dalias@...c.org,
	musl@...ts.openwall.com
Cc: zhangfei <zhangfei@...iscas.ac.cn>
Subject: [PATCH 2/3] RISC-V: Optimize memcpy 

From: zhangfei <zhangfei@...iscas.ac.cn>

This code is based on linux/arch/riscv/lib/memcpy.S. The macro definitions
have been removed so that the code targets RISCV64.
The original implementation in the kernel falls back to byte-wise copy when
src and dst are not co-aligned, which is not efficient enough. Therefore,
the patch linked below has been used to rework that path.

https://lore.kernel.org/all/20210216225555.4976-1-gary@garyguo.net/ 

The patch linked above optimizes memcpy for the misaligned case: if src and
dst are not co-aligned, it loads two adjacent words from src and uses shifts
to assemble a full machine word before storing it to dst.
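
For illustration only (not part of the patch), the shift-and-combine step
can be sketched in C roughly as below. The function name and parameters are
hypothetical; the authoritative version is the assembly in the diff.

  #include <stddef.h>
  #include <stdint.h>

  /*
   * Illustrative sketch: assemble each destination word from two adjacent
   * source words using shifts. Assumes a little-endian 64-bit target (as on
   * RISCV64) and that src really is misaligned (off != 0), so neither shift
   * amount is 64. Like the assembly, it reads whole aligned words from src.
   */
  static void copy_misaligned_words(uint64_t *dst, const unsigned char *src,
                                    size_t nwords)
  {
      size_t off = (uintptr_t)src % sizeof(uint64_t);    /* src misalignment */
      const uint64_t *s = (const uint64_t *)(src - off); /* src aligned down */
      unsigned shr = 8 * off;   /* bits to drop from the previous word */
      unsigned shl = 64 - shr;  /* bits to take from the next word */
      uint64_t prev = s[0];
      for (size_t i = 0; i < nwords; i++) {
          uint64_t next = s[i + 1];
          dst[i] = (prev >> shr) | (next << shl);
          prev = next;
      }
  }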

Signed-off-by: Zhang Fei <zhangfei@...iscas.ac.cn>
---
 src/string/riscv64/memcpy.S | 159 ++++++++++++++++++++++++++++++++++++
 1 file changed, 159 insertions(+)
 create mode 100644 src/string/riscv64/memcpy.S

diff --git a/src/string/riscv64/memcpy.S b/src/string/riscv64/memcpy.S
new file mode 100644
index 0000000..ee59924
--- /dev/null
+++ b/src/string/riscv64/memcpy.S
@@ -0,0 +1,159 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2013 Regents of the University of California
+ */
+
+#define SZREG 8
+#define REG_S sd
+#define REG_L ld
+
+.global memcpy
+.type memcpy,@function
+memcpy:
+        /* Save for return value */
+        mv      t6, a0
+
+        /*
+         * Register allocation for code below:
+         * a0 - start of uncopied dst
+         * a1 - start of uncopied src
+         * t0 - end of uncopied dst
+         */
+        add     t0, a0, a2
+
+        /*
+         * Use bytewise copy if too small.
+         *
+         * This threshold must be at least 2*SZREG to ensure at least one
+         * wordwise copy is performed. It is chosen to be 16 because it will
+         * save at least 7 iterations of bytewise copy, which pays off the
+         * fixed overhead.
+         */
+        li      a3, 16
+        bltu    a2, a3, .Lbyte_copy_tail
+
+        /*
+         * Bytewise copy first to align a0 to word boundary.
+         */
+        addi    a2, a0, SZREG-1
+        andi    a2, a2, ~(SZREG-1)
+        beq     a0, a2, 2f
+1:
+        lb      a5, 0(a1)
+        addi    a1, a1, 1
+        sb      a5, 0(a0)
+        addi    a0, a0, 1
+        bne     a0, a2, 1b
+2:
+
+        /*
+         * Now a0 is word-aligned. If a1 is also word aligned, we could perform
+         * aligned word-wise copy. Otherwise we need to perform misaligned
+         * word-wise copy.
+         */
+        andi    a3, a1, SZREG-1
+        bnez    a3, .Lmisaligned_word_copy
+
+        /* Unrolled wordwise copy */
+        addi    t0, t0, -(16*SZREG-1)
+        bgeu    a0, t0, 2f
+1:
+        REG_L   a2,        0(a1)
+        REG_L   a3,    SZREG(a1)
+        REG_L   a4,  2*SZREG(a1)
+        REG_L   a5,  3*SZREG(a1)
+        REG_L   a6,  4*SZREG(a1)
+        REG_L   a7,  5*SZREG(a1)
+        REG_L   t1,  6*SZREG(a1)
+        REG_L   t2,  7*SZREG(a1)
+        REG_L   t3,  8*SZREG(a1)
+        REG_L   t4,  9*SZREG(a1)
+        REG_L   t5, 10*SZREG(a1)
+        REG_S   a2,        0(a0)
+        REG_S   a3,    SZREG(a0)
+        REG_S   a4,  2*SZREG(a0)
+        REG_S   a5,  3*SZREG(a0)
+        REG_S   a6,  4*SZREG(a0)
+        REG_S   a7,  5*SZREG(a0)
+        REG_S   t1,  6*SZREG(a0)
+        REG_S   t2,  7*SZREG(a0)
+        REG_S   t3,  8*SZREG(a0)
+        REG_S   t4,  9*SZREG(a0)
+        REG_S   t5, 10*SZREG(a0)
+        REG_L   a2, 11*SZREG(a1)
+        REG_L   a3, 12*SZREG(a1)
+        REG_L   a4, 13*SZREG(a1)
+        REG_L   a5, 14*SZREG(a1)
+        REG_L   a6, 15*SZREG(a1)
+        addi    a1, a1, 16*SZREG
+        REG_S   a2, 11*SZREG(a0)
+        REG_S   a3, 12*SZREG(a0)
+        REG_S   a4, 13*SZREG(a0)
+        REG_S   a5, 14*SZREG(a0)
+        REG_S   a6, 15*SZREG(a0)
+        addi    a0, a0, 16*SZREG
+        bltu    a0, t0, 1b
+2:
+        /* Post-loop increment by 16*SZREG-1 and pre-loop decrement by SZREG-1 */
+        addi    t0, t0, 15*SZREG
+
+        /* Wordwise copy */
+        bgeu    a0, t0, 2f
+1:
+        REG_L   a5, 0(a1)
+        addi    a1, a1, SZREG
+        REG_S   a5, 0(a0)
+        addi    a0, a0, SZREG
+        bltu    a0, t0, 1b
+2:
+        addi    t0, t0, SZREG-1
+
+.Lbyte_copy_tail:
+        /*
+         * Bytewise copy anything left.
+         */
+        beq     a0, t0, 2f
+1:
+        lb      a5, 0(a1)
+        addi    a1, a1, 1
+        sb      a5, 0(a0)
+        addi    a0, a0, 1
+        bne     a0, t0, 1b
+2:
+
+        mv      a0, t6
+        ret
+
+.Lmisaligned_word_copy:
+        /*
+         * Misaligned word-wise copy.
+         * For misaligned copy we still perform word-wise copy, but we need to
+         * use the value fetched from the previous iteration and do some shifts.
+         * This is safe because we wouldn't access more words than necessary.
+         */
+
+        /* Calculate shifts */
+        slli    t3, a3, 3
+        sub     t4, x0, t3 /* negate is okay as shift will only look at LSBs */
+
+        /* Load the initial value and align a1 */
+        andi    a1, a1, ~(SZREG-1)
+        REG_L   a5, 0(a1)
+
+        addi    t0, t0, -(SZREG-1)
+        /* At least one iteration will be executed here, no check */
+1:
+        srl     a4, a5, t3
+        REG_L   a5, SZREG(a1)
+        addi    a1, a1, SZREG
+        sll     a2, a5, t4
+        or      a2, a2, a4
+        REG_S   a2, 0(a0)
+        addi    a0, a0, SZREG
+        bltu    a0, t0, 1b
+
+        /* Update pointers to correct value */
+        addi    t0, t0, SZREG-1
+        add     a1, a1, a3
+
+        j       .Lbyte_copy_tail
-- 
2.34.1
