|
|
Message-Id: <DEPUQFJLZZVW.14ANQUQF9NVLU@tum.de>
Date: Fri, 05 Dec 2025 01:21:11 +0100
From: "Fabian Rast" <fabian.rast@....de>
To: <musl@...ts.openwall.com>
Subject: [PATCH] ldso: skip gnu hash calculation if precomputed
calculating the hashes of symbol names takes a significant amount of
time while applying relocations. However, for symbol lookups for
names that are themselves part of the gnu hashtable in the object
being relocated, the hash value is already present in that hashtable
and can be efficiently retrieved using the symbol index.
the gnu hash table only stores the upper 31 bits of the 32 bit wide
hashes. the lsb is used to indicate the end of a chain.
this missing bit can be guessed for the cost of traversing two chains
in the worst case.
---
ldso/dynlink.c | 23 +++++++++++++++++------
1 file changed, 17 insertions(+), 6 deletions(-)
diff --git a/ldso/dynlink.c b/ldso/dynlink.c
index 715948f4..fe716b9e 100644
--- a/ldso/dynlink.c
+++ b/ldso/dynlink.c
@@ -311,16 +311,27 @@ static Sym *gnu_lookup_filtered(uint32_t h1, uint32_t *hashtab, struct dso *dso,
#if defined(__GNUC__)
__attribute__((always_inline))
#endif
-static inline struct symdef find_sym2(struct dso *dso, const char *s, int need_def, int use_deps)
+static inline struct symdef find_sym2(struct dso *dso, const char *s, int need_def, int use_deps, struct dso *dso2, int sym_index)
{
- uint32_t h = 0, gh = gnu_hash(s), gho = gh / (8*sizeof(size_t)), *ght;
- size_t ghm = 1ul << gh % (8*sizeof(size_t));
+ uint32_t h = 0, gh, gho, *ght, gh_precomp;
+ size_t ghm;
+
+ gh_precomp = dso2 && (ght = dso2->ghashtab) && sym_index >= ght[1];
+ gh = gh_precomp
+ ? ght[4 + ght[2] * 2 + ght[0] + sym_index - ght[1]] & ~1u
+ : gnu_hash(s);
+
+ gho = gh / (8*sizeof(size_t));
+ ghm = 1ul << gh % (8*sizeof(size_t));
+
struct symdef def = {0};
struct dso **deps = use_deps ? dso->deps : 0;
for (; dso; dso=use_deps ? *deps++ : dso->syms_next) {
Sym *sym;
if ((ght = dso->ghashtab)) {
sym = gnu_lookup_filtered(gh, ght, dso, s, gho, ghm);
+ if (!sym && gh_precomp)
+ sym = gnu_lookup_filtered(gh+1, ght, dso, s, gho, (ghm<<1)?(ghm<<1):1);
} else {
if (!h) h = sysv_hash(s);
sym = sysv_lookup(s, h, dso);
@@ -344,7 +355,7 @@ static inline struct symdef find_sym2(struct dso *dso, const char *s, int need_d
static struct symdef find_sym(struct dso *dso, const char *s, int need_def)
{
- return find_sym2(dso, s, need_def, 0);
+ return find_sym2(dso, s, need_def, 0, NULL, STN_UNDEF);
}
static struct symdef get_lfs64(const char *name)
@@ -428,7 +439,7 @@ static void do_relocs(struct dso *dso, size_t *rel, size_t rel_size, size_t stri
ctx = type==REL_COPY ? head->syms_next : head;
def = (sym->st_info>>4) == STB_LOCAL
? (struct symdef){ .dso = dso, .sym = sym }
- : find_sym(ctx, name, type==REL_PLT);
+ : find_sym2(ctx, name, type==REL_PLT, 0, dso, sym_index);
if (!def.sym) def = get_lfs64(name);
if (!def.sym && (sym->st_shndx != SHN_UNDEF
|| sym->st_info>>4 != STB_WEAK)) {
@@ -2277,7 +2288,7 @@ static void *do_dlsym(struct dso *p, const char *s, void *ra)
return 0;
} else
use_deps = 1;
- struct symdef def = find_sym2(p, s, 0, use_deps);
+ struct symdef def = find_sym2(p, s, 0, use_deps, NULL, STN_UNDEF);
if (!def.sym) {
error("Symbol not found: %s", s);
return 0;
--
2.52.0
Hi,
I would like to propose an optimization to dynamic symbol lookup that
improves load times of executables using big libraries:
During relocation symbol lookups are commonly using the gnu hashtable.
I found that a significant portion of time is spent calculating hashes
of symbol names as part of the gnu hashtable lookup.
It is unfortunate that this is done at load-time, because
the hashes are already known at link time.
Because hashes are stored almost in full in the gnu hashtable
it is possible to use these precomputed values from the hashtable
in some cases.
If a symbol is being looked up to apply a relocation and that
symbol is also defined in the dso being relocated (and thus part
of the hashtable) the upper 31 bits of the hash can be efficiently
loaded from the hashtable using the symbol index.
The missing lsb can be guessed for the cost of possibly looking
in two chains for the two possible hash values.
Please let me know what you think, I am happy to hear your feedback.
Best Regards,
Fabian Rast
= Benchmarks
I tested this using clang 22 linking libclang-cpp and libLLVM. I inserted
timestamps into the loader startup code to measure startup time until the
program's entry point is called.
On my laptop (AMD Ryzen AI 9 365) averaged over 200 measurements:
Without optimization: 5441.205 us
With optimization: 4714.1 us
This suggests a roughly 10% improvement in startup time in this scenario.
I also measured running musl in ldd mode and without the timing measurement
modifications:
```
/home/fr/src/poop/zig-out/bin/poop -d 10000 \
'env LD_LIBRARY_PATH=/home/fr/src/musl/master-install/lib/ /home/fr/src/musl/master-install/lib/ld-musl-x86_64.so.1 --list /home/fr/src/llvm-project/build/bin/clang-22' \
'env LD_LIBRARY_PATH=/home/fr/src/musl/precomp-install/lib/ /home/fr/src/musl/precomp-install/lib/ld-musl-x86_64.so.1 --list /home/fr/src/llvm-project/build/bin/clang-22'
```
Benchmark 1 (1297 runs): env LD_LIBRARY_PATH=/home/fr/src/musl/master-install/lib/ /home/fr/src/musl/master-install/lib/ld-musl-x86_64.so.1 --list /home/fr/src/llvm-project/build/bin/clang-22
measurement mean ± σ min … max outliers delta
wall_time 7.66ms ± 863us 5.08ms … 11.3ms 38 ( 3%) 0%
peak_rss 12.6MB ± 81.2KB 12.0MB … 12.7MB 185 (14%) 0%
cpu_cycles 11.0M ± 1.12M 9.08M … 15.6M 65 ( 5%) 0%
instructions 19.3M ± 897 19.3M … 19.3M 48 ( 4%) 0%
cache_references 463K ± 2.52K 453K … 481K 11 ( 1%) 0%
cache_misses 84.2K ± 1.45K 81.0K … 91.2K 26 ( 2%) 0%
branch_misses 79.6K ± 814 78.0K … 83.3K 3 ( 0%) 0%
Benchmark 2 (1445 runs): env LD_LIBRARY_PATH=/home/fr/src/musl/precomp-install/lib/ /home/fr/src/musl/precomp-install/lib/ld-musl-x86_64.so.1 --list /home/fr/src/llvm-project/build/bin/clang-22
measurement mean ± σ min … max outliers delta
wall_time 6.89ms ± 615us 4.70ms … 9.00ms 19 ( 1%) ⚡- 10.1% ± 0.7%
peak_rss 12.6MB ± 82.3KB 12.1MB … 12.7MB 89 ( 6%) + 0.0% ± 0.0%
cpu_cycles 9.35M ± 807K 7.72M … 12.5M 91 ( 6%) ⚡- 15.3% ± 0.7%
instructions 15.4M ± 865 15.4M … 15.4M 42 ( 3%) ⚡- 19.9% ± 0.0%
cache_references 466K ± 4.28K 454K … 504K 30 ( 2%) + 0.8% ± 0.1%
cache_misses 85.6K ± 1.50K 82.7K … 92.3K 54 ( 4%) 💩+ 1.7% ± 0.1%
branch_misses 83.4K ± 686 81.8K … 85.6K 1 ( 0%) 💩+ 4.7% ± 0.1%
To reproduce without compiling clang with a musl cross toolchain, an
Alpine Docker container can be used:
- Install the two different versions into `./master-install` and `./precomp-install`
- sudo docker run --rm -it -v./master-install:/tmp/master-install -v./precomp-install:/tmp/precomp-install alpine:latest sh
in the container:
- apk add clang hyperfine
- hyperfine -N -w 100 'env LD_LIBRARY_PATH=/tmp/master-install/lib /tmp/master-install/lib/libc.so --list /usr/bin/clang' 'env LD_LIBRARY_PATH=/tmp/precomp-install/lib /tmp/precomp-install/lib/libc.so --list /usr/bin/clang'
```
Benchmark 1: env LD_LIBRARY_PATH=/tmp/master-install/lib /tmp/master-install/lib/libc.so --list /usr/bin/clang
Time (mean ± σ): 12.6 ms ± 1.8 ms [User: 9.7 ms, System: 2.8 ms]
Range (min … max): 9.3 ms … 16.0 ms 191 runs
Benchmark 2: env LD_LIBRARY_PATH=/tmp/precomp-install/lib /tmp/precomp-install/lib/libc.so --list /usr/bin/clang
Time (mean ± σ): 10.5 ms ± 1.4 ms [User: 7.7 ms, System: 2.7 ms]
Range (min … max): 7.5 ms … 13.5 ms 298 runs
Summary
env LD_LIBRARY_PATH=/tmp/precomp-install/lib /tmp/precomp-install/lib/libc.so --list /usr/bin/clang ran
1.20 ± 0.23 times faster than env LD_LIBRARY_PATH=/tmp/master-install/lib /tmp/master-install/lib/libc.so --list /usr/bin/clang
```
Download attachment "signature.asc" of type "application/pgp-signature" (229 bytes)
Powered by blists - more mailing lists
Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.