musl - [C23 128 bit 1/4] add an emulation for 128 bit arithmetic as needed for C library support

Follow @Openwall on Twitter for new release announcements and other news
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <04d1def74188d4783389363c4a3a0e1d942d7e36.1685536608.git.Jens.Gustedt@inria.fr>
Date: Wed, 31 May 2023 16:15:47 +0200
From: Jens Gustedt <Jens.Gustedt@...ia.fr>
To: musl@...ts.openwall.com
Subject: [C23 128 bit 1/4] add an emulation for 128 bit arithmetic as needed for C library support

With C23 it will be possible for compilers to add extended integer
types that are wider than `intmax_t`, in particular for the
`[u]int128_t` types. This patch enables the minimal support for these
types that is needed to implement the necessary bits in `printf` and
`scanf` like interfaces. This support is added unconditionally, but
only compilers that effectively implement the types will be able to
profit from this new setting.

The usage of these types is then even independent of whether or not a
given compiler has full support for `[u]int128_t`; just the types
indicated by gcc's `__int128` are needed for this to be useful.

We provide two implementations for most of the functions, both of
which have the same ABI: one that uses gcc's `__int128` types underneath
and another one that hand-codes minimal operations by sticking them
together from different sub-words of the 128-bit entity.

The hand-coded version is not complete; it lacks an important bit in
terms of a function

      uwide128 __uwide128_pop(va_list *ap);

that is used by `vfprintf` and similar to pop a 128 bit value from the
variable argument list. (`uwide128` is an internal structure type that
is used for the emulation)

This version *must* be completed for each architecture that we support
by a .s file. For architectures that have a compiler that implements
`__int128`, it is relatively simple to generate such a file: just
compile the file uwide128.c with the -S option and extract this one and
only function into a platform-specific subdirectory.
---
 src/internal/uwide128.c            | 213 +++++++++++++++++++++++++++++
 src/internal/uwide128.h            |  57 ++++++++
 src/internal/x86_64/uwide128_pop.s |  27 ++++
 3 files changed, 297 insertions(+)
 create mode 100644 src/internal/uwide128.c
 create mode 100644 src/internal/uwide128.h
 create mode 100644 src/internal/x86_64/uwide128_pop.s

diff --git a/src/internal/uwide128.c b/src/internal/uwide128.c
new file mode 100644
index 00000000..6e4b08bf
--- /dev/null
+++ b/src/internal/uwide128.c
@@ -0,0 +1,213 @@
+#include <uwide128.h>
+#include <stdarg.h>
+
+#if __SIZEOF_INT128__
+
+union u { unsigned __int128 x; uwide128 s; }; /* overlay: native 128-bit integer x and emulation type s share the same storage */
+
+__attribute__((__weak__))
+uwide128 __uwide128_neg(uwide128 a) { /* returns -a, computed on the native 128-bit view */
+	union u both = { .s = a, };
+	both.x = -both.x; /* x is unsigned, so the negation wraps modulo 2^128 */
+	return both.s;
+}
+
+__attribute__((__weak__))
+uwide128 __uwide128_add(uwide128 a, uint8_t b) { /* returns a + b; unsigned arithmetic, wraps modulo 2^128 */
+	union u both = { .s = a, };
+	both.x += b;
+	return both.s;
+}
+
+__attribute__((__weak__))
+uwide128 __uwide128_sub(uwide128 a, uint8_t b) { /* returns a - b; unsigned arithmetic, wraps modulo 2^128 */
+	union u both = { .s = a, };
+	both.x -= b;
+	return both.s;
+}
+
+__attribute__((__weak__))
+uwide128 __uwide128_mul(uwide128 a, uint8_t b) { /* returns a * b; unsigned arithmetic, wraps modulo 2^128 */
+	union u both = { .s = a, };
+	both.x *= b;
+	return both.s;
+}
+
+__attribute__((__weak__))
+uint8_t __uwide128_div10(uwide128* a) { /* divides *a by 10 in place and returns the remainder (0..9) */
+	union u both = { .s = *a, };
+	uint8_t ret = both.x % 10; /* remainder is taken before the quotient overwrites the value */
+	both.x /= 10;
+	*a = both.s;
+	return ret;
+}
+
+__attribute__((__weak__))
+uint8_t __uwide128_div2(uwide128* a) { /* divides *a by 2 in place and returns the remainder (0..1) */
+	union u both = { .s = *a, };
+	uint8_t ret = both.x % 2; /* remainder is taken before the quotient overwrites the value */
+	both.x /= 2;
+	*a = both.s;
+	return ret;
+}
+
+__attribute__((__weak__))
+uint8_t __uwide128_div8(uwide128* a) { /* divides *a by 8 in place and returns the remainder (0..7) */
+	union u both = { .s = *a, };
+	uint8_t ret = both.x % 8; /* remainder is taken before the quotient overwrites the value */
+	both.x /= 8;
+	*a = both.s;
+	return ret;
+}
+
+__attribute__((__weak__))
+uint8_t __uwide128_div16(uwide128* a) { /* divides *a by 16 in place and returns the remainder (0..15) */
+	union u both = { .s = *a, };
+	uint8_t ret = both.x % 16; /* remainder is taken before the quotient overwrites the value */
+	both.x /= 16;
+	*a = both.s;
+	return ret;
+}
+
+__attribute__((__weak__))
+_Bool  __uwide128_le(uwide128 a, uwide128 b) { /* true iff a <= b, compared as unsigned 128-bit values */
+	union u botha = { .s = a, };
+	union u bothb = { .s = b, };
+	return botha.x <= bothb.x;
+}
+
+__attribute__((__weak__))
+_Bool  __uwide128_iszero(uwide128 a) { /* true iff a is zero */
+	union u both = { .s = a, };
+	return !both.x;
+}
+
+uwide128 __uwide128_pop(va_list *ap) /* fetches the next variadic argument from *ap as an __int128 and returns its bits as a uwide128 */
+{
+	return (union u){ .x = va_arg(*ap, __int128) }.s; /* signed value is converted to the unsigned member x, then read back through s */
+}
+
+#else
+
+__attribute__((__weak__))
+_Bool __uwide128_le(uwide128 a, uwide128 b) { /* true iff a <= b, compared as unsigned 128-bit values */
+	return (a.v64[hi64] > b.v64[hi64]) /* the high halves decide unless they are equal */
+	  ? false
+	  : ((a.v64[hi64] < b.v64[hi64])
+	     ? true
+	     : (a.v64[lo64] <= b.v64[lo64])); /* tie on the high halves: the low halves decide */
+}
+
+__attribute__((__weak__))
+_Bool __uwide128_iszero(uwide128 a) /* true iff both 64-bit halves of a are zero */
+{
+	return !a.v64[0] && !a.v64[1]; /* plain indices suffice: the test is symmetric in the two halves */
+}
+
+__attribute__((__weak__))
+uwide128 __uwide128_neg(uwide128 a) /* returns -a modulo 2^128, i.e. the two's complement ~a + 1 */
+{
+	uwide128 ret = { .v64 = { [0] = ~a.v64[0], [1] = ~a.v64[1], }, }; /* bitwise complement of both halves */
+	if (!a.v64[lo64]) ret.v64[hi64]++; /* ~lo is all ones exactly when lo == 0, so only then does the +1 carry into hi */
+	ret.v64[lo64]++;
+	return ret;
+}
+
+uwide128 __uwide128_add(uwide128 a, uint8_t b) /* returns a + b modulo 2^128, adding 32-bit words from least to most significant */
+{
+	uwide128 ret;
+	uint64_t carry = a.v32[wo32_0]; /* 64-bit accumulator: low 32 bits are the result word, upper bits the carry */
+	carry += b;
+	ret.v32[wo32_0] = carry; /* assignment to a 32-bit word keeps only the low 32 bits */
+	carry >>= 32;
+	carry += a.v32[wo32_1];
+	ret.v32[wo32_1] = carry;
+	carry >>= 32;
+	carry += a.v32[wo32_2];
+	ret.v32[wo32_2] = carry;
+	carry >>= 32;
+	carry += a.v32[wo32_3];
+	ret.v32[wo32_3] = carry; /* a carry out of the top word is discarded */
+	return ret;
+}
+
+uwide128 __uwide128_sub(uwide128 a, uint8_t b) /* returns a - b modulo 2^128, subtracting through 32-bit words from least to most significant */
+{
+	uwide128 ret;
+	int64_t carry = a.v32[wo32_0]; /* signed accumulator: goes negative when the current word underflows */
+	carry -= b;
+	ret.v32[wo32_0] = carry; /* conversion to a 32-bit unsigned word reduces modulo 2^32 */
+	carry = -(carry < 0); /* borrow into the next word: -1 on underflow, else 0 (a division by 2^32 would lose or invert it) */
+	carry += a.v32[wo32_1];
+	ret.v32[wo32_1] = carry;
+	carry = -(carry < 0);
+	carry += a.v32[wo32_2];
+	ret.v32[wo32_2] = carry;
+	carry = -(carry < 0);
+	carry += a.v32[wo32_3];
+	ret.v32[wo32_3] = carry; /* a borrow out of the top word is discarded */
+	return ret;
+}
+
+uwide128 __uwide128_mul(uwide128 a, uint8_t b) /* returns a * b modulo 2^128, multiplying 32-bit words from least to most significant */
+{
+	uwide128 ret;
+	uint64_t carry; /* running sum: low 32 bits are the result word, upper bits carry into the next word */
+	uint64_t prod = a.v32[wo32_0]; /* widened to 64 bits so the 32x8-bit product cannot overflow */
+	prod *= b;
+	carry = prod;
+	ret.v32[wo32_0] = carry; /* assignment to a 32-bit word keeps only the low 32 bits */
+	carry >>= 32;
+	prod = a.v32[wo32_1];
+	prod *= b;
+	carry += prod;
+	ret.v32[wo32_1] = carry;
+	carry >>= 32;
+	prod = a.v32[wo32_2];
+	prod *= b;
+	carry += prod;
+	ret.v32[wo32_2] = carry;
+	carry >>= 32;
+	prod = a.v32[wo32_3];
+	prod *= b;
+	carry += prod;
+	ret.v32[wo32_3] = carry; /* overflow beyond 128 bits is discarded */
+	return ret;
+}
+
+static uint8_t __uwide128_div(uwide128* a, uint8_t b) /* long division: *a /= b in place, returns *a % b; b must be nonzero */
+{
+	uint64_t rest = a->v64[hi64] % b; /* the high 64 bits are divided in one step */
+	a->v64[hi64] /= b;
+	rest <<= 32; /* rest < b <= 255, so the shifted value fits easily in 64 bits */
+	rest |= a->v32[wo32_1]; /* bring down the next 32-bit word */
+	a->v32[wo32_1] = rest / b; /* quotient fits in 32 bits because the previous remainder was < b */
+	rest %= b;
+	rest <<= 32;
+	rest |= a->v32[wo32_0];
+	a->v32[wo32_0] = rest / b;
+	rest %= b;
+	return rest; /* final remainder, < b */
+}
+
+uint8_t __uwide128_div10(uwide128* a) /* divides *a by 10 in place and returns the remainder */
+{
+	return __uwide128_div(a, 10);
+}
+
+uint8_t __uwide128_div2(uwide128* a) /* divides *a by 2 in place and returns the remainder */
+{
+	return __uwide128_div(a, 2);
+}
+
+uint8_t __uwide128_div8(uwide128* a) /* divides *a by 8 in place and returns the remainder */
+{
+	return __uwide128_div(a, 8);
+}
+
+uint8_t __uwide128_div16(uwide128* a) /* divides *a by 16 in place and returns the remainder */
+{
+	return __uwide128_div(a, 16);
+}
+
+#endif
diff --git a/src/internal/uwide128.h b/src/internal/uwide128.h
new file mode 100644
index 00000000..f6a02934
--- /dev/null
+++ b/src/internal/uwide128.h
@@ -0,0 +1,57 @@
+#include <features.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdarg.h>
+
+typedef union uwide128 uwide128; /* 128-bit quantity, accessible as 64-bit or 32-bit words */
+union uwide128 {
+	uint64_t v64[2]; /* two 64-bit halves; index them with lo64/hi64 */
+	uint32_t v32[4]; /* four 32-bit words; index them with wo32_0 (least significant) .. wo32_3 */
+};
+
+#define word64(X) (__BYTE_ORDER == __LITTLE_ENDIAN ? (X) : (1-(X))) /* v64 index of the X-th least significant half; mirrored within 0..1 on big endian */
+#define lo64 word64(0) /* index of the least significant 64-bit half */
+#define hi64 word64(1) /* index of the most significant 64-bit half */
+
+#define word32(X) (__BYTE_ORDER == __LITTLE_ENDIAN ? (X) : (3-(X))) /* v32 index of the X-th least significant word; mirrored within 0..3 on big endian */
+#define wo32_0 word32(0) /* least significant 32-bit word */
+#define wo32_1 word32(1)
+#define wo32_2 word32(2)
+#define wo32_3 word32(3) /* most significant 32-bit word */
+
+
+#define comp2(LO, HI) { { [lo64] = (LO), [hi64] = (HI), }, } /* brace initializer for a uwide128 (its v64 member) from low and high 64-bit halves */
+
+#if __STDC_VERSION__ >= 202311L
+constexpr uwide128 __uwide128_max = { UINT64_MAX, UINT64_MAX, }; /* constexpr initializers must be exactly representable in uint64_t, so -1 is not allowed here */
+#define UWIDE128_MAX __uwide128_max /* largest unsigned 128-bit value: all bits set */
+#else
+#define UWIDE128_MAX ((void)0, (uwide128)comp2(-1, -1)) /* -1 converts to all-ones in each uint64_t half */
+#endif
+
+static __inline uwide128 __uwide128_i64(int64_t a)  __unsequenced; /* declaration carrying the __unsequenced annotation; defined below */
+static __inline uwide128 __uwide128_u64(uint64_t a) __unsequenced; /* declaration carrying the __unsequenced annotation; defined below */
+
+static __inline uwide128 __uwide128_i64(int64_t a) /* sign-extends a to 128 bits */
+{
+	return (uwide128)comp2(a, (a < 0) ? UINT64_MAX : 0); /* high half is all ones for negative a, zero otherwise */
+}
+
+static __inline uwide128 __uwide128_u64(uint64_t a) /* zero-extends a to 128 bits */
+{
+	return (uwide128)comp2(a, 0);
+}
+
+_Bool __uwide128_le(uwide128, uwide128)    __unsequenced; // unsigned a <= b (intscan)
+_Bool __uwide128_iszero(uwide128)          __unsequenced; // true iff the value is zero (vfprintf)
+uwide128 __uwide128_neg(uwide128)          __unsequenced; // negation modulo 2^128 (intscan)
+uwide128 __uwide128_add(uwide128, uint8_t) __unsequenced; // a + b modulo 2^128 (intscan)
+uwide128 __uwide128_sub(uwide128, uint8_t) __unsequenced; // a - b modulo 2^128 (intscan)
+uwide128 __uwide128_mul(uwide128, uint8_t) __unsequenced; // a * b modulo 2^128 (intscan)
+uint8_t __uwide128_div10(uwide128*);           // divide in place by 10, return remainder (vfprintf)
+uint8_t __uwide128_div2(uwide128*);            // divide in place by 2, return remainder (vfprintf)
+uint8_t __uwide128_div8(uwide128*);            // divide in place by 8, return remainder (vfprintf)
+uint8_t __uwide128_div16(uwide128*);           // divide in place by 16, return remainder (vfprintf)
+uwide128 __uwide128_pop(va_list *ap);          // fetch a 128-bit argument from *ap (vfprintf)
+uwide128 __uwide128_i64(int64_t);              // sign-extend to 128 bits (vfprintf)
+uwide128 __uwide128_u64(uint64_t);             // zero-extend to 128 bits (vfprintf)
diff --git a/src/internal/x86_64/uwide128_pop.s b/src/internal/x86_64/uwide128_pop.s
new file mode 100644
index 00000000..f9f84348
--- /dev/null
+++ b/src/internal/x86_64/uwide128_pop.s
@@ -0,0 +1,27 @@
+.text
+.global __uwide128_pop
+.weak	__uwide128_pop
+.type	__uwide128_pop, @function
+__uwide128_pop:			# uwide128 __uwide128_pop(va_list *ap): %rdi = ap, result returned in %rdx:%rax
+	endbr64
+	movl	(%rdi), %edx		# 32-bit offset field at the start of *ap (gp_offset in the SysV va_list layout -- TODO confirm)
+	cmpl	$39, %edx
+	ja	1f			# offset above 39: use the other path at local label 1 below ("1f" = next label 1; a bare "1" is address 1)
+	movl	%edx, %eax
+	addl	$16, %edx		# consume 16 bytes
+	addq	16(%rdi), %rax		# add the base pointer stored at offset 16 of *ap
+	movl	%edx, (%rdi)		# store the advanced offset back
+	movq	8(%rax), %rdx		# upper 8 bytes of the value
+	movq	(%rax), %rax		# lower 8 bytes of the value
+	ret
+	.p2align 4,,10
+	.p2align 3
+1:
+	movq	8(%rdi), %rax		# pointer stored at offset 8 of *ap
+	addq	$15, %rax
+	andq	$-16, %rax		# round it up to a 16-byte boundary
+	leaq	16(%rax), %rdx
+	movq	%rdx, 8(%rdi)		# store the pointer advanced past the 16-byte value
+	movq	8(%rax), %rdx		# upper 8 bytes of the value
+	movq	(%rax), %rax		# lower 8 bytes of the value
+	ret
-- 
2.34.1
Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.