Follow @Openwall on Twitter for new release announcements and other news
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20260419023802.GI1827@brightrain.aerifal.cx>
Date: Sat, 18 Apr 2026 22:38:02 -0400
From: "dalias@...ifal.cx" <dalias@...ifal.cx>
To: David Sparks <sparks05@...ton.me>
Cc: "musl@...ts.openwall.com" <musl@...ts.openwall.com>,
	"mailto.luca.kellermann@...il.com" <mailto.luca.kellermann@...il.com>
Subject: Re: Some additional qsort patches

On Wed, Apr 15, 2026 at 01:05:45AM +0000, David Sparks wrote:
> On Tuesday, April 14th, 2026 at 16:10, dalias@...ifal.cx <dalias@...ifal.cx> wrote:
> > On Tue, Apr 14, 2026 at 03:24:32AM +0000, David Sparks wrote:
> >> The recent qsort patches caused conflicts with some local changes
> >> I've had sitting around for a long time.  Since the code is fresh
> >
> > This is exactly why we don't introduce gratuitous churn in the form of
> > whitespace changes, minor refactorings, etc. If we'd applied these
> > patches, the CVE fixes would not have applied cleanly to previous
> > versions, and we would have had to backport them manually. On top of
> > that, churn puts a further load on anyone who is actually reviewing
> > all changes between versions to make sure they trust them, for anyone
> > maintaining their own out-of-tree functional patchsets, etc.
> 
> Thank you!  This would be a wonderful addition to the coding-style
> wiki page, which I read very carefully before sending anything.

Yeah. There are a lot of things like this that were written down in
various places over the years but never consolidated in a good place
for new folks to find.
> 
> > I know the compare has come up before, and just never been acted on.
> > We should probably go ahead and do it.
> >
> > I don't see how the refactoring of pntz/shr is an improvement. It
> > makes some call points "prettier", others "uglier", but it's entirely
> > a cosmetic change.
> 
> In addition to being prettier, it gives the compiler more scope for
> optimization.

All of this code is static, so I think the entire scope is available
for IPA optimization already.

> Because of the 2-word size, pntz() branches on p[0] == 1, and then
> shr() branches on trail >= 64.  This double if() seems pointless when
> they're both actually the same condition.  (I just don't know what to
> call the operation.  If it were a left shift, it'd be "normalize".
> "Strip trailing zeros"?  "pop_root"?  But naming is a bikeshed issue,
> not an efficiency one.)
> 
> Now, a modern optimizing compiler can in fact see the equivalence and
> merge the two branches, but it's not as good.  Here's a standalone
> test program:
> 
> #include <stdlib.h>
> #define WORDBITS (int)(8 * sizeof(size_t))
> 
> static int ctz(size_t x)
> {
> 	return __builtin_ctzg(x);
> }
> 
> static int pctz(size_t p[2])
> {
> 	size_t p0 = p[0] & -(size_t)2;
> 	if (p0)
> 		return ctz(p0);
> 	else
> 		return ctz(p[1]) + WORDBITS;
> }
> 
> static void shr(size_t p[2], int sh)
> {
> 	if (sh >= WORDBITS) {
> 		p[0] = p[1];
> 		p[1] = 0;
> 		sh -= WORDBITS;
> 		if (!sh)
> 			return;
> 	}
> 	p[0] = p[0] >> sh | p[1] << (WORDBITS - sh);
> 	p[1] >>= sh;
> }
> 
> int
> normalize1(size_t p[2])
> {
> 	int sh = pctz(p);
> 	shr(p, sh);
> 	return sh;
> }
> 
> int
> normalize2(size_t p[2])
> {
> 	size_t p0 = p[0] & -(size_t)2;
> 	int sh;
> 	if (p0) {
> 		sh = ctz(p0);
> 		p[0] = p0 >> sh | p[1] << (WORDBITS - sh);
> 		p[1] >>= sh;
> 	} else {
> 		sh = ctz(p[1]);
> 		p[0] = p[1] >> sh;
> 		p[1] = 0;
> 		sh += WORDBITS;
> 	}
> 	return sh;
> }
> 
> int
> normalize3(size_t p[2])
> {
> 	unsigned __int128 x = (unsigned __int128)p[1] << WORDBITS | (p[0] & -(size_t)2);
> 	int sh = __builtin_ctzg(x);
> 	x >>= sh;
> 	p[0] = x;
> 	p[1] = x >> sh;
> 	return sh;
> }
> 
> Which compiles (gcc 15.2 -O2 -m64 generic) to
> 
> 	.file	"shr.c"
> 	.text
> 	.p2align 4
> 	.globl	normalize1
> normalize1:
> 	pushq	%rbx
> 	movq	(%rcx), %r8
> 	movq	8(%rcx), %r9
> 	movq	%r8, %r10
> 	movq	%rcx, %rdx
> 	andq	$-2, %r10
> 	je	.L2
> 	xorl	%eax, %eax
> 	movq	%r9, %rbx
> 	rep bsfq	%r10, %rax
> 	movq	%r9, %r10
> 	movl	%eax, %ecx
> 	movl	%eax, %r11d
> 	negl	%ecx
> 	salq	%cl, %r10
> 	movl	%eax, %ecx
> 	shrq	%cl, %rbx
> .L3:
> 	movl	%r11d, %ecx
> 	movq	%rbx, 8(%rdx)
> 	shrq	%cl, %r8
> 	orq	%r10, %r8
> 	movq	%r8, (%rdx)
> .L1:
> 	popq	%rbx
> 	ret
> 	.p2align 4,,10
> 	.p2align 3
> .L2:
> 	xorl	%ecx, %ecx
> 	movq	%r9, (%rdx)
> 	movl	$64, %eax
> 	rep bsfq	%r9, %rcx
> 	movq	$0, 8(%rdx)
> 	movl	%ecx, %r11d
> 	testl	%ecx, %ecx
> 	je	.L1
> 	leal	64(%rcx), %eax
> 	xorl	%ebx, %ebx
> 	movq	%r9, %r8
> 	jmp	.L3
> 
> 	.p2align 4
> 	.globl	normalize2
> normalize2:
> 	movq	(%rcx), %rdx
> 	movq	8(%rcx), %r10
> 	movq	%rcx, %r8
> 	andq	$-2, %rdx
> 	je	.L8
> 	xorl	%r11d, %r11d
> 	movq	%r10, %r9
> 	rep bsfq	%rdx, %r11
> 	movl	%r11d, %ecx
> 	movl	%r11d, %eax
> 	negl	%ecx
> 	salq	%cl, %r9
> 	movl	%r11d, %ecx
> 	shrq	%cl, %rdx
> 	orq	%rdx, %r9
> 	movq	%r10, %rdx
> 	shrq	%cl, %rdx
> 	movq	%r9, (%r8)
> 	movq	%rdx, 8(%r8)
> 	ret
> 	.p2align 4,,10
> 	.p2align 3
> .L8:
> 	xorl	%ecx, %ecx
> 	movq	%rdx, 8(%r8)
> 	rep bsfq	%r10, %rcx
> 	shrq	%cl, %r10
> 	leal	64(%rcx), %eax
> 	movq	%r10, %r9
> 	movq	%r9, (%r8)
> 	ret
> 
> 	.p2align 4
> 	.globl	normalize3
> normalize3:
> 	xorl	%r10d, %r10d
> 	movq	8(%rcx), %rdx
> 	movq	(%rcx), %rax
> 	andq	$-2, %rax
> 	rep bsfq	%rax, %r10
> 	movq	%rcx, %r8
> 	xorl	%ecx, %ecx
> 	rep bsfq	%rdx, %rcx
> 	addl	$64, %ecx
> 	testq	%rax, %rax
> 	cmovne	%r10d, %ecx
> 	xorl	%r9d, %r9d
> 	shrdq	%cl, %rdx, %rax
> 	shrq	%cl, %rdx
> 	testb	$64, %cl
> 	cmovne	%rdx, %rax
> 	cmovne	%r9, %rdx
> 	movq	%rax, (%r8)
> 	shrdq	%cl, %rdx, %rax
> 	shrq	%cl, %rdx
> 	testb	$64, %cl
> 	cmovne	%rdx, %rax
> 	movq	%rax, 8(%r8)
> 	movl	%ecx, %eax
> 	ret
> 
> normalize1 is 0x70 bytes (including 6 bytes of padding after the ret)
> with two internal branches, normalize2 is 0x58 bytes (3 bytes internal
> padding) with one internal branch, and normalize3 is 0x54 bytes with
> no internal branches.

Before we even worry about whether this warrants manual optimization
we should probably know how much time is spent on it to begin with. I
suspect it's vanishingly small.

> >> TODO: Clean out redundant header files.  Neither stdint.h nor stdlib.h
> >> are actually needed.  (Noticed as I was composing this mail.)
> >
> > stdlib.h is needed because it's the public declaration of qsort. We
> > always include the public declaration of the interface being
> > implemented. This ensures the compiler checks that the signatures
> > match.
> 
> Doh!  An excellent reason and I apologize for not realizing it.
> Which is also the reason for #define _BSD_SOURCE.
> 
> I will prepare a smaller revised patch set.
> 
> There are two additional changes I could prepare if you're
> interested in seeing them, but I held off for fear of the
> magnitude of the surgery:
> 
> 1) Using __uint128_t (or uint64_t on 32-bit) for the tree-size
>    bitmap.  This would require a chunk of ugly preprocessor
>    code to find the appropriate data type, plus the current
>    code as a fallback.

Yeah, that's not something I'd want to be doing. musl is written in
C99 to the maximum extent possible, using a minimal set of extensions,
and builds fine on pcc, gcc 3, firm/cparser, etc.

On most archs using __[u]int128_t would be calling out to libgcc
anyway, and the branch would just be moved there.

> 2) Saving a compare in trinkle().  There seem to be two ways
>    to go about this:
>    2a) Move the top level of sift() into trinkle(), and have
>        the call to sift() take the appropriate subtree, or

What do you mean by the "top level"? The first iteration of the loop?

>    2b) Implement a do_sift() helper which takes the root compare
>        as an argument and can be used by both sift() and
>        trinkle().  (This has the potential to take an ar[] array
>        as well and merge the cycle() operations.)

This maybe sounds better, but I'd probably have to see it to
understand what you mean.

Rich

Powered by blists - more mailing lists

Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.