Follow @Openwall on Twitter for new release announcements and other news
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <20260522022955.2999203-1-jtunney@gmail.com>
Date: Thu, 21 May 2026 19:29:36 -0700
From: Justine Tunney <jtunney@...il.com>
To: musl@...ts.openwall.com
Cc: Justine Tunney <jtunney@...il.com>
Subject: [PATCH] Make qsort 50% faster

This change makes musl qsort go 50% faster on common sizes by avoiding
memcpy calls. On x86-64 with cc -Os it can be as much as 2x faster. No
changes have been made to the smoothsort algorithm itself. This simple
patch brings musl much closer to other C libraries in terms of sorting
perf, while maintaining smoothsort's highly desirable properties, e.g.
small size, stack safety, signal safety, no malloc dependency, etc.

The cost is +670 bytes of .text (on x86-64) to specialize cycle() for
widths 4, 8, 12, 16, 20, 24, 28, and 32 (gcc, without stack protector).
But if you want to be like OpenBSD's qsort and only specialize 4 and 8,
then you're looking at a +140 byte increase. I chose the upper bound of
what's practical; after 32, gcc stops inlining memcpy cleanly.

Further analysis and benchmark numbers are available in my repo:
https://github.com/jart/musl-qsort-speedup

---
 src/stdlib/qsort.c | 83 ++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 70 insertions(+), 13 deletions(-)

diff --git a/src/stdlib/qsort.c b/src/stdlib/qsort.c
index 28607450..8af8b09d 100644
--- a/src/stdlib/qsort.c
+++ b/src/stdlib/qsort.c
@@ -20,6 +20,7 @@
  */
 
 /* Minor changes by Rich Felker for integration in musl, 2011-04-27. */
+/* Some optimizations added by Justine Tunney for musl, 2026-05-21. */
 
 /* Smoothsort, an adaptive variant of Heapsort.  Memory usage: O(1).
    Run time: Worst case O(n log n), close to O(n) in the mostly-sorted case. */
@@ -33,6 +34,7 @@
 #define ntz(x) a_ctz_l((x))
 
 typedef int (*cmpfun)(const void *, const void *, void *);
+typedef void (*cyclefun)(size_t, unsigned char **, long);
 
 /* returns index of first bit set, excluding the low bit assumed to always
  * be set, starting from low bit of p[0] up through high bit of p[1] */
@@ -42,11 +44,34 @@ static inline int pntz(size_t p[2]) {
 	return 0;
 }
 
-static void cycle(size_t width, unsigned char* ar[], int n)
+/* specializes cycle() for constant width */
+#define DECLARE_CYCLE(NAME, WIDTH)					\
+	static void NAME(size_t width, unsigned char* ar[], long n)	\
+	{								\
+		long i;							\
+		char t[WIDTH];						\
+		if(n < 2)						\
+			return;						\
+		memcpy(t, ar[0], WIDTH);				\
+		for (i = 0; i < n - 1; i++)				\
+			memcpy(ar[i], ar[i + 1], WIDTH);		\
+		memcpy(ar[n - 1], t, WIDTH);				\
+	}
+
+DECLARE_CYCLE(cycle4, 4)
+DECLARE_CYCLE(cycle8, 8)
+DECLARE_CYCLE(cycle12, 12)
+DECLARE_CYCLE(cycle16, 16)
+DECLARE_CYCLE(cycle20, 20)
+DECLARE_CYCLE(cycle24, 24)
+DECLARE_CYCLE(cycle28, 28)
+DECLARE_CYCLE(cycle32, 32)
+
+static void cycle(size_t width, unsigned char* ar[], long n)
 {
 	unsigned char tmp[256];
 	size_t l;
-	int i;
+	long i;
 
 	if(n < 2) {
 		return;
@@ -97,7 +122,7 @@ static inline void shr(size_t p[2], int n)
 #define AR_LEN  (16 * sizeof(size_t))
 #define AR_MASK (AR_LEN - 1)
 
-static void sift(unsigned char *head, size_t width, cmpfun cmp, void *arg, int pshift, size_t lp[])
+static void sift(unsigned char *head, size_t width, cmpfun cmp, void *arg, int pshift, size_t lp[], cyclefun cycler)
 {
 	unsigned char *rt, *lf;
 	unsigned char *ar[AR_LEN];
@@ -121,10 +146,10 @@ static void sift(unsigned char *head, size_t width, cmpfun cmp, void *arg, int p
 			pshift -= 2;
 		}
 	}
-	cycle(width, ar, i & AR_MASK);
+	cycler(width, ar, i & AR_MASK);
 }
 
-static void trinkle(unsigned char *head, size_t width, cmpfun cmp, void *arg, size_t pp[2], int pshift, int trusty, size_t lp[])
+static void trinkle(unsigned char *head, size_t width, cmpfun cmp, void *arg, size_t pp[2], int pshift, int trusty, size_t lp[], cyclefun cycler)
 {
 	unsigned char *stepson,
 	              *rt, *lf;
@@ -158,8 +183,8 @@ static void trinkle(unsigned char *head, size_t width, cmpfun cmp, void *arg, si
 		trusty = 0;
 	}
 	if(!trusty) {
-		cycle(width, ar, i & AR_MASK);
-		sift(head, width, cmp, arg, pshift, lp);
+		cycler(width, ar, i & AR_MASK);
+		sift(head, width, cmp, arg, pshift, lp, cycler);
 	}
 }
 
@@ -169,11 +194,43 @@ void __qsort_r(void *base, size_t nel, size_t width, cmpfun cmp, void *arg)
 	size_t i, size = width * nel;
 	unsigned char *head, *high;
 	size_t p[2] = {1, 0};
+	cyclefun cycler;
 	int pshift = 1;
 	int trail;
 
 	if (!size) return;
 
+	/* choose fastest cycle() function implementation */
+	switch(width) {
+	case 4:
+		cycler = cycle4;
+		break;
+	case 8:
+		cycler = cycle8;
+		break;
+	case 12:
+		cycler = cycle12;
+		break;
+	case 16:
+		cycler = cycle16;
+		break;
+	case 20:
+		cycler = cycle20;
+		break;
+	case 24:
+		cycler = cycle24;
+		break;
+	case 28:
+		cycler = cycle28;
+		break;
+	case 32:
+		cycler = cycle32;
+		break;
+	default:
+		cycler = cycle;
+		break;
+	}
+
 	head = base;
 	high = head + size - width;
 
@@ -182,14 +239,14 @@ void __qsort_r(void *base, size_t nel, size_t width, cmpfun cmp, void *arg)
 
 	while(head < high) {
 		if((p[0] & 3) == 3) {
-			sift(head, width, cmp, arg, pshift, lp);
+			sift(head, width, cmp, arg, pshift, lp, cycler);
 			shr(p, 2);
 			pshift += 2;
 		} else {
 			if(lp[pshift - 1] >= high - head) {
-				trinkle(head, width, cmp, arg, p, pshift, 0, lp);
+				trinkle(head, width, cmp, arg, p, pshift, 0, lp, cycler);
 			} else {
-				sift(head, width, cmp, arg, pshift, lp);
+				sift(head, width, cmp, arg, pshift, lp, cycler);
 			}
 
 			if(pshift == 1) {
@@ -205,7 +262,7 @@ void __qsort_r(void *base, size_t nel, size_t width, cmpfun cmp, void *arg)
 		head += width;
 	}
 
-	trinkle(head, width, cmp, arg, p, pshift, 0, lp);
+	trinkle(head, width, cmp, arg, p, pshift, 0, lp, cycler);
 
 	while(pshift != 1 || p[0] != 1 || p[1] != 0) {
 		if(pshift <= 1) {
@@ -217,10 +274,10 @@ void __qsort_r(void *base, size_t nel, size_t width, cmpfun cmp, void *arg)
 			pshift -= 2;
 			p[0] ^= 7;
 			shr(p, 1);
-			trinkle(head - lp[pshift] - width, width, cmp, arg, p, pshift + 1, 1, lp);
+			trinkle(head - lp[pshift] - width, width, cmp, arg, p, pshift + 1, 1, lp, cycler);
 			shl(p, 1);
 			p[0] |= 1;
-			trinkle(head - width, width, cmp, arg, p, pshift, 1, lp);
+			trinkle(head - width, width, cmp, arg, p, pshift, 1, lp, cycler);
 		}
 		head -= width;
 	}
-- 
2.43.0

Powered by blists - more mailing lists

Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.