musl - Re: [PATCH] math: add LoongArch support for common APIs with inline assembly.

Follow @Openwall on Twitter for new release announcements and other news
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <9216362a-951e-4c56-b7a3-b865fde49aed@loongson.cn>
Date: Sat, 11 May 2024 09:06:34 +0800
From: lixing <lixing@...ngson.cn>
To: ticat_fp <fanpeng@...ngson.cn>, musl@...ts.openwall.com
Cc: huajingyun@...ngson.cn, wanghongliang@...ngson.cn
Subject: Re: [PATCH] math: add LoongArch support for common APIs with inline
 assembly.

Hi, Rich

Could you take some time to review these loongarch64 optimizations of the
math functions?

We've verified this patch on an Alpine system.

Thanks.

在 2024/4/23 上午10:26, ticat_fp 写道:
> Including: ceil, copysign, fabs, floor, fma, fmax, fmin, llrint,
> lrint, rint, sqrt and their f versions.
>
> ---
>   src/math/loongarch64/ceil.c      | 25 +++++++++++++++++++++++++
>   src/math/loongarch64/ceilf.c     | 25 +++++++++++++++++++++++++
>   src/math/loongarch64/copysign.c  |  7 +++++++
>   src/math/loongarch64/copysignf.c |  7 +++++++
>   src/math/loongarch64/fabs.c      |  7 +++++++
>   src/math/loongarch64/fabsf.c     |  7 +++++++
>   src/math/loongarch64/floor.c     | 22 ++++++++++++++++++++++
>   src/math/loongarch64/floorf.c    | 22 ++++++++++++++++++++++
>   src/math/loongarch64/fma.c       |  7 +++++++
>   src/math/loongarch64/fmaf.c      |  7 +++++++
>   src/math/loongarch64/fmax.c      |  7 +++++++
>   src/math/loongarch64/fmaxf.c     |  7 +++++++
>   src/math/loongarch64/fmin.c      |  7 +++++++
>   src/math/loongarch64/fminf.c     |  7 +++++++
>   src/math/loongarch64/llrint.c    | 17 +++++++++++++++++
>   src/math/loongarch64/llrintf.c   | 17 +++++++++++++++++
>   src/math/loongarch64/lrint.c     | 17 +++++++++++++++++
>   src/math/loongarch64/lrintf.c    | 17 +++++++++++++++++
>   src/math/loongarch64/rint.c      |  7 +++++++
>   src/math/loongarch64/rintf.c     |  7 +++++++
>   src/math/loongarch64/sqrt.c      |  7 +++++++
>   src/math/loongarch64/sqrtf.c     |  7 +++++++
>   22 files changed, 260 insertions(+)
>   create mode 100644 src/math/loongarch64/ceil.c
>   create mode 100644 src/math/loongarch64/ceilf.c
>   create mode 100644 src/math/loongarch64/copysign.c
>   create mode 100644 src/math/loongarch64/copysignf.c
>   create mode 100644 src/math/loongarch64/fabs.c
>   create mode 100644 src/math/loongarch64/fabsf.c
>   create mode 100644 src/math/loongarch64/floor.c
>   create mode 100644 src/math/loongarch64/floorf.c
>   create mode 100644 src/math/loongarch64/fma.c
>   create mode 100644 src/math/loongarch64/fmaf.c
>   create mode 100644 src/math/loongarch64/fmax.c
>   create mode 100644 src/math/loongarch64/fmaxf.c
>   create mode 100644 src/math/loongarch64/fmin.c
>   create mode 100644 src/math/loongarch64/fminf.c
>   create mode 100644 src/math/loongarch64/llrint.c
>   create mode 100644 src/math/loongarch64/llrintf.c
>   create mode 100644 src/math/loongarch64/lrint.c
>   create mode 100644 src/math/loongarch64/lrintf.c
>   create mode 100644 src/math/loongarch64/rint.c
>   create mode 100644 src/math/loongarch64/rintf.c
>   create mode 100644 src/math/loongarch64/sqrt.c
>   create mode 100644 src/math/loongarch64/sqrtf.c
>
> diff --git a/src/math/loongarch64/ceil.c b/src/math/loongarch64/ceil.c
> new file mode 100644
> index 00000000..95781f4b
> --- /dev/null
> +++ b/src/math/loongarch64/ceil.c
> @@ -0,0 +1,25 @@
> +#include <math.h>
> +#include <stdint.h>
> +
> +double ceil(double x)
> +{
> +    /* Round x up to an integral value: temporarily force FCSR0.RM (bits 9:8)
> +     * to 0b10 so frint.d rounds toward +infinity, then restore FCSR0. */
> +    int32_t old;   /* FCSR0 as found on entry */
> +    int32_t new;   /* FCSR0 with the rounding-mode field replaced */
> +    int32_t tmp1;  /* holds the 0x200 set-mask */
> +    int32_t tmp2;  /* holds the 0xfffffeff clear-mask */
> +    __asm__ __volatile__(
> +    "movfcsr2gr %[old],  $r0                    \n\t"
> +    "li.d       %[tmp1], 0x200                  \n\t"
> +    "or         %[new],  %[old], %[tmp1]        \n\t"
> +    "li.d       %[tmp2], 0xfffffeff             \n\t"
> +    "and        %[new],  %[new], %[tmp2]        \n\t"
> +    "movgr2fcsr $r0,     %[new]                 \n\t"
> +    "frint.d    %[result],       %[orig_x]      \n\t"
> +    "movgr2fcsr $r0,     %[old]                 \n\t"
> +    : [result] "=f"(x), [old] "=&r"(old), [new] "=&r"(new), [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2)
> +    : [orig_x] "f"(x)); /* scratches are write-only: nothing uninitialized is read */
> +
> +    return x;
> +}
> diff --git a/src/math/loongarch64/ceilf.c b/src/math/loongarch64/ceilf.c
> new file mode 100644
> index 00000000..03a2d933
> --- /dev/null
> +++ b/src/math/loongarch64/ceilf.c
> @@ -0,0 +1,25 @@
> +#include <math.h>
> +#include <stdint.h>
> +
> +float ceilf(float x)
> +{
> +    /* Round x up to an integral value: temporarily force FCSR0.RM (bits 9:8)
> +     * to 0b10 so frint.s rounds toward +infinity, then restore FCSR0. */
> +    int32_t old;   /* FCSR0 as found on entry */
> +    int32_t new;   /* FCSR0 with the rounding-mode field replaced */
> +    int32_t tmp1;  /* holds the 0x200 set-mask */
> +    int32_t tmp2;  /* holds the 0xfffffeff clear-mask */
> +    __asm__ __volatile__(
> +    "movfcsr2gr %[old],  $r0                    \n\t"
> +    "li.d       %[tmp1], 0x200                  \n\t"
> +    "or         %[new],  %[old], %[tmp1]        \n\t"
> +    "li.d       %[tmp2], 0xfffffeff             \n\t"
> +    "and        %[new],  %[new], %[tmp2]        \n\t"
> +    "movgr2fcsr $r0,     %[new]                 \n\t"
> +    "frint.s    %[result],       %[orig_x]      \n\t"
> +    "movgr2fcsr $r0,     %[old]                 \n\t"
> +    : [result] "=f"(x), [old] "=&r"(old), [new] "=&r"(new), [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2)
> +    : [orig_x] "f"(x)); /* scratches are write-only: nothing uninitialized is read */
> +
> +    return x;
> +}
> diff --git a/src/math/loongarch64/copysign.c b/src/math/loongarch64/copysign.c
> new file mode 100644
> index 00000000..9e3b8de3
> --- /dev/null
> +++ b/src/math/loongarch64/copysign.c
> @@ -0,0 +1,7 @@
> +#include <math.h>
> +
> +double copysign(double x, double y)
> +{
> +	__asm__ __volatile__("fcopysign.d %[res], %[mag], %[sgn]" : [res] "=f"(x) : [mag] "f"(x), [sgn] "f"(y)); /* magnitude from x, sign from y */
> +	return x;
> +}
> diff --git a/src/math/loongarch64/copysignf.c b/src/math/loongarch64/copysignf.c
> new file mode 100644
> index 00000000..98df4254
> --- /dev/null
> +++ b/src/math/loongarch64/copysignf.c
> @@ -0,0 +1,7 @@
> +#include <math.h>
> +
> +float copysignf(float x, float y)
> +{
> +	__asm__ __volatile__("fcopysign.s %[res], %[mag], %[sgn]" : [res] "=f"(x) : [mag] "f"(x), [sgn] "f"(y)); /* magnitude from x, sign from y */
> +	return x;
> +}
> diff --git a/src/math/loongarch64/fabs.c b/src/math/loongarch64/fabs.c
> new file mode 100644
> index 00000000..3db57fb5
> --- /dev/null
> +++ b/src/math/loongarch64/fabs.c
> @@ -0,0 +1,7 @@
> +#include <math.h>
> +
> +double fabs(double x)
> +{
> +	__asm__ __volatile__("fabs.d %[res], %[val]" : [res] "=f"(x) : [val] "f"(x)); /* result overwrites x */
> +	return x;
> +}
> diff --git a/src/math/loongarch64/fabsf.c b/src/math/loongarch64/fabsf.c
> new file mode 100644
> index 00000000..e24201c5
> --- /dev/null
> +++ b/src/math/loongarch64/fabsf.c
> @@ -0,0 +1,7 @@
> +#include <math.h>
> +
> +float fabsf(float x)
> +{
> +	__asm__ __volatile__("fabs.s %[res], %[val]" : [res] "=f"(x) : [val] "f"(x)); /* result overwrites x */
> +	return x;
> +}
> diff --git a/src/math/loongarch64/floor.c b/src/math/loongarch64/floor.c
> new file mode 100644
> index 00000000..7aead2a3
> --- /dev/null
> +++ b/src/math/loongarch64/floor.c
> @@ -0,0 +1,22 @@
> +#include <math.h>
> +#include <stdint.h>
> +
> +double floor(double x)
> +{
> +    /* Round x down to an integral value: set FCSR0.RM (bits 9:8) to 0b11
> +     * so frint.d rounds toward -infinity, then restore the saved FCSR0. */
> +    int32_t old;   /* FCSR0 as found on entry */
> +    int32_t new;   /* FCSR0 with both rounding-mode bits set */
> +    int32_t tmp1;  /* holds the 0x300 mask */
> +    __asm__ __volatile__(
> +    "movfcsr2gr %[old],  $r0                \n\t"
> +    "li.d       %[tmp1], 0x300              \n\t"
> +    "or         %[new],  %[old], %[tmp1]    \n\t"
> +    "movgr2fcsr $r0,    %[new]              \n\t"
> +    "frint.d    %[result],       %[orig_x]  \n\t"
> +    "movgr2fcsr $r0, %[old]                 \n\t"
> +    : [result] "=f"(x), [old] "=&r"(old), [tmp1] "=&r"(tmp1), [new] "=&r"(new)
> +    : [orig_x] "f"(x)); /* scratches are write-only: nothing uninitialized is read */
> +
> +    return x;
> +}
> diff --git a/src/math/loongarch64/floorf.c b/src/math/loongarch64/floorf.c
> new file mode 100644
> index 00000000..772d15eb
> --- /dev/null
> +++ b/src/math/loongarch64/floorf.c
> @@ -0,0 +1,22 @@
> +#include <math.h>
> +#include <stdint.h>
> +
> +float floorf(float x)
> +{
> +    /* Round x down to an integral value: set FCSR0.RM (bits 9:8) to 0b11
> +     * so frint.s rounds toward -infinity, then restore the saved FCSR0. */
> +    int32_t old;   /* FCSR0 as found on entry */
> +    int32_t new;   /* FCSR0 with both rounding-mode bits set */
> +    int32_t tmp1;  /* holds the 0x300 mask */
> +    __asm__ __volatile__(
> +    "movfcsr2gr %[old],  $r0                \n\t"
> +    "li.d       %[tmp1], 0x300              \n\t"
> +    "or         %[new],  %[old], %[tmp1]    \n\t"
> +    "movgr2fcsr $r0,    %[new]              \n\t"
> +    "frint.s    %[result],       %[orig_x]  \n\t"
> +    "movgr2fcsr $r0, %[old]                 \n\t"
> +    : [result] "=f"(x), [old] "=&r"(old), [tmp1] "=&r"(tmp1), [new] "=&r"(new)
> +    : [orig_x] "f"(x)); /* scratches are write-only: nothing uninitialized is read */
> +
> +    return x;
> +}
> diff --git a/src/math/loongarch64/fma.c b/src/math/loongarch64/fma.c
> new file mode 100644
> index 00000000..0b6a3f23
> --- /dev/null
> +++ b/src/math/loongarch64/fma.c
> @@ -0,0 +1,7 @@
> +#include <math.h>
> +
> +double fma(double x, double y, double z)
> +{
> +	__asm__ __volatile__("fmadd.d %[res], %[a], %[b], %[c]" : [res] "=f"(x) : [a] "f"(x), [b] "f"(y), [c] "f"(z)); /* x*y + z via fmadd.d */
> +	return x;
> +}
> diff --git a/src/math/loongarch64/fmaf.c b/src/math/loongarch64/fmaf.c
> new file mode 100644
> index 00000000..77a8363b
> --- /dev/null
> +++ b/src/math/loongarch64/fmaf.c
> @@ -0,0 +1,7 @@
> +#include <math.h>
> +
> +float fmaf(float x, float y, float z)
> +{
> +	__asm__ __volatile__("fmadd.s %[res], %[a], %[b], %[c]" : [res] "=f"(x) : [a] "f"(x), [b] "f"(y), [c] "f"(z)); /* x*y + z via fmadd.s */
> +	return x;
> +}
> diff --git a/src/math/loongarch64/fmax.c b/src/math/loongarch64/fmax.c
> new file mode 100644
> index 00000000..2d091877
> --- /dev/null
> +++ b/src/math/loongarch64/fmax.c
> @@ -0,0 +1,7 @@
> +#include <math.h>
> +
> +double fmax(double x, double y)
> +{
> +	__asm__ __volatile__("fmax.d %[res], %[a], %[b]" : [res] "=f"(x) : [a] "f"(x), [b] "f"(y)); /* result overwrites x */
> +	return x;
> +}
> diff --git a/src/math/loongarch64/fmaxf.c b/src/math/loongarch64/fmaxf.c
> new file mode 100644
> index 00000000..1106d47c
> --- /dev/null
> +++ b/src/math/loongarch64/fmaxf.c
> @@ -0,0 +1,7 @@
> +#include <math.h>
> +
> +float fmaxf(float x, float y)
> +{
> +	__asm__ __volatile__("fmax.s %[res], %[a], %[b]" : [res] "=f"(x) : [a] "f"(x), [b] "f"(y)); /* result overwrites x */
> +	return x;
> +}
> diff --git a/src/math/loongarch64/fmin.c b/src/math/loongarch64/fmin.c
> new file mode 100644
> index 00000000..9c44ce87
> --- /dev/null
> +++ b/src/math/loongarch64/fmin.c
> @@ -0,0 +1,7 @@
> +#include <math.h>
> +
> +double fmin(double x, double y)
> +{
> +	__asm__ __volatile__("fmin.d %[res], %[a], %[b]" : [res] "=f"(x) : [a] "f"(x), [b] "f"(y)); /* result overwrites x */
> +	return x;
> +}
> diff --git a/src/math/loongarch64/fminf.c b/src/math/loongarch64/fminf.c
> new file mode 100644
> index 00000000..94a0fa45
> --- /dev/null
> +++ b/src/math/loongarch64/fminf.c
> @@ -0,0 +1,7 @@
> +#include <math.h>
> +
> +float fminf(float x, float y)
> +{
> +	__asm__ __volatile__("fmin.s %[res], %[a], %[b]" : [res] "=f"(x) : [a] "f"(x), [b] "f"(y)); /* result overwrites x */
> +	return x;
> +}
> diff --git a/src/math/loongarch64/llrint.c b/src/math/loongarch64/llrint.c
> new file mode 100644
> index 00000000..766222d3
> --- /dev/null
> +++ b/src/math/loongarch64/llrint.c
> @@ -0,0 +1,17 @@
> +#include <math.h>
> +#include <stdint.h>
> +
> +long long llrint(double x)
> +{
> +    long long r;
> +    double t;   /* scratch FP register for the intermediate values */
> +    /* Round to integral in the current mode, convert to int64, move to a GPR. */
> +    __asm__ __volatile__(
> +    "frint.d %[t], %[x]                  \n\t"
> +    "ftintrz.l.d %[t], %[t]              \n\t"
> +    "movfr2gr.d %[result], %[t]          \n\t"
> +    : [result]"=r"(r), [t]"=f"(t)
> +    : [x]"f"(x)); /* r and t are write-only: nothing uninitialized is read */
> +
> +    return r;
> +}
> diff --git a/src/math/loongarch64/llrintf.c b/src/math/loongarch64/llrintf.c
> new file mode 100644
> index 00000000..f5b9dd9f
> --- /dev/null
> +++ b/src/math/loongarch64/llrintf.c
> @@ -0,0 +1,17 @@
> +#include <math.h>
> +#include <stdint.h>
> +
> +long long llrintf(float x)
> +{
> +    long long r;
> +    double t;   /* scratch FP register; ends up holding a 64-bit integer */
> +    /* Round in the current mode, then convert to int64 (.l, not .w) so the full long long range is kept. */
> +    __asm__ __volatile__(
> +    "frint.s %[t], %[x]                 \n\t"
> +    "ftintrz.l.s %[t], %[t]             \n\t"
> +    "movfr2gr.d %[result], %[t]         \n\t"
> +    : [result]"=r"(r), [t]"=f"(t)
> +    : [x]"f"(x)); /* r and t are write-only: nothing uninitialized is read */
> +
> +    return r;
> +}
> diff --git a/src/math/loongarch64/lrint.c b/src/math/loongarch64/lrint.c
> new file mode 100644
> index 00000000..d82239d1
> --- /dev/null
> +++ b/src/math/loongarch64/lrint.c
> @@ -0,0 +1,17 @@
> +#include <math.h>
> +#include <stdint.h>
> +
> +long lrint(double x)
> +{
> +    long r;
> +    double t;   /* scratch FP register for the intermediate values */
> +    /* Round to integral in the current mode, convert to int64, move to a GPR. */
> +    __asm__ __volatile__(
> +    "frint.d %[t], %[x]                  \n\t"
> +    "ftintrz.l.d %[t], %[t]              \n\t"
> +    "movfr2gr.d %[result], %[t]          \n\t"
> +    : [result]"=r"(r), [t]"=f"(t)
> +    : [x]"f"(x)); /* r and t are write-only: nothing uninitialized is read */
> +
> +    return r;
> +}
> diff --git a/src/math/loongarch64/lrintf.c b/src/math/loongarch64/lrintf.c
> new file mode 100644
> index 00000000..b30872e9
> --- /dev/null
> +++ b/src/math/loongarch64/lrintf.c
> @@ -0,0 +1,17 @@
> +#include <math.h>
> +#include <stdint.h>
> +
> +long lrintf(float x)
> +{
> +    long r;
> +    double t;   /* scratch FP register; ends up holding a 64-bit integer */
> +    /* Round in the current mode, convert to int64, and move all 64 bits (.d, not .s) to the GPR. */
> +    __asm__ __volatile__(
> +    "frint.s %[t], %[x]                  \n\t"
> +    "ftintrz.l.s %[t], %[t]              \n\t"
> +    "movfr2gr.d %[result], %[t]          \n\t"
> +    : [result]"=r"(r), [t]"=f"(t)
> +    : [x]"f"(x)); /* r and t are write-only: nothing uninitialized is read */
> +
> +    return r;
> +}
> diff --git a/src/math/loongarch64/rint.c b/src/math/loongarch64/rint.c
> new file mode 100644
> index 00000000..862cea8c
> --- /dev/null
> +++ b/src/math/loongarch64/rint.c
> @@ -0,0 +1,7 @@
> +#include <math.h>
> +
> +double rint(double x)
> +{
> +    __asm__ __volatile__("frint.d %[res], %[val]" : [res] "=f"(x) : [val] "f"(x)); /* result overwrites x */
> +    return x;
> +}
> diff --git a/src/math/loongarch64/rintf.c b/src/math/loongarch64/rintf.c
> new file mode 100644
> index 00000000..79ac216b
> --- /dev/null
> +++ b/src/math/loongarch64/rintf.c
> @@ -0,0 +1,7 @@
> +#include <math.h>
> +
> +float rintf(float x)
> +{
> +    __asm__ __volatile__("frint.s %[res], %[val]" : [res] "=f"(x) : [val] "f"(x)); /* result overwrites x */
> +    return x;
> +}
> diff --git a/src/math/loongarch64/sqrt.c b/src/math/loongarch64/sqrt.c
> new file mode 100644
> index 00000000..a70e20e9
> --- /dev/null
> +++ b/src/math/loongarch64/sqrt.c
> @@ -0,0 +1,7 @@
> +#include <math.h>
> +
> +double sqrt(double x)
> +{
> +	__asm__ __volatile__("fsqrt.d %0, %1" : "=f"(x) : "f"(x)); /* destination first: %0 = result, %1 = operand */
> +	return x;
> +}
> diff --git a/src/math/loongarch64/sqrtf.c b/src/math/loongarch64/sqrtf.c
> new file mode 100644
> index 00000000..796609b0
> --- /dev/null
> +++ b/src/math/loongarch64/sqrtf.c
> @@ -0,0 +1,7 @@
> +#include <math.h>
> +
> +float sqrtf(float x)
> +{
> +	__asm__ __volatile__("fsqrt.s %0, %1" : "=f"(x) : "f"(x)); /* destination first: %0 = result, %1 = operand */
> +	return x;
> +}
Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.