|
|
Message-ID: <20251205154341.GM3520958@port70.net>
Date: Fri, 5 Dec 2025 16:43:41 +0100
From: Szabolcs Nagy <nsz@...t70.net>
To: Fabian Rast <fabian.rast@....de>
Cc: musl@...ts.openwall.com
Subject: Re: [PATCH] ldso: skip gnu hash calculation if precomputed
* Fabian Rast <fabian.rast@....de> [2025-12-05 01:21:11 +0100]:
> calculating the hashes of symbol names takes a significant amount of
> time while applying relocations. however, for lookups of symbol
> names that are themselves part of the gnu hash table in the object
> being relocated, the hash value is already present in that hash table
> and can be retrieved efficiently using the symbol index.
>
> the gnu hash table only stores the upper 31 bits of the 32-bit
> hashes; the lsb is used to indicate the end of a chain.
> this missing bit can be guessed at the cost of traversing two chains
> in the worst case.
you mean two chains per loaded dso?
then i'd expect the optimization to be enabled only
if dso-index-in-symbol-lookup-order < N.
> ---
> ldso/dynlink.c | 23 +++++++++++++++++------
> 1 file changed, 17 insertions(+), 6 deletions(-)
>
> diff --git a/ldso/dynlink.c b/ldso/dynlink.c
> index 715948f4..fe716b9e 100644
> --- a/ldso/dynlink.c
> +++ b/ldso/dynlink.c
> @@ -311,16 +311,27 @@ static Sym *gnu_lookup_filtered(uint32_t h1, uint32_t *hashtab, struct dso *dso,
> #if defined(__GNUC__)
> __attribute__((always_inline))
> #endif
> -static inline struct symdef find_sym2(struct dso *dso, const char *s, int need_def, int use_deps)
> +static inline struct symdef find_sym2(struct dso *dso, const char *s, int need_def, int use_deps, struct dso *dso2, int sym_index)
> {
> - uint32_t h = 0, gh = gnu_hash(s), gho = gh / (8*sizeof(size_t)), *ght;
> - size_t ghm = 1ul << gh % (8*sizeof(size_t));
> + uint32_t h = 0, gh, gho, *ght, gh_precomp;
> + size_t ghm;
> +
> + gh_precomp = dso2 && (ght = dso2->ghashtab) && sym_index >= ght[1];
> + gh = gh_precomp
> + ? ght[4 + ght[2] * 2 + ght[0] + sym_index - ght[1]] & ~1u
> + : gnu_hash(s);
> +
> + gho = gh / (8*sizeof(size_t));
> + ghm = 1ul << gh % (8*sizeof(size_t));
> +
> struct symdef def = {0};
> struct dso **deps = use_deps ? dso->deps : 0;
> for (; dso; dso=use_deps ? *deps++ : dso->syms_next) {
> Sym *sym;
> if ((ght = dso->ghashtab)) {
> sym = gnu_lookup_filtered(gh, ght, dso, s, gho, ghm);
> + if (!sym && gh_precomp)
> + sym = gnu_lookup_filtered(gh+1, ght, dso, s, gho, (ghm<<1)?(ghm<<1):1);
how can ghm<<1 == 0 happen?
> } else {
> if (!h) h = sysv_hash(s);
> sym = sysv_lookup(s, h, dso);
> @@ -344,7 +355,7 @@ static inline struct symdef find_sym2(struct dso *dso, const char *s, int need_d
>
> static struct symdef find_sym(struct dso *dso, const char *s, int need_def)
> {
> - return find_sym2(dso, s, need_def, 0);
> + return find_sym2(dso, s, need_def, 0, NULL, STN_UNDEF);
> }
>
> static struct symdef get_lfs64(const char *name)
> @@ -428,7 +439,7 @@ static void do_relocs(struct dso *dso, size_t *rel, size_t rel_size, size_t stri
> ctx = type==REL_COPY ? head->syms_next : head;
> def = (sym->st_info>>4) == STB_LOCAL
> ? (struct symdef){ .dso = dso, .sym = sym }
> - : find_sym(ctx, name, type==REL_PLT);
> + : find_sym2(ctx, name, type==REL_PLT, 0, dso, sym_index);
> if (!def.sym) def = get_lfs64(name);
> if (!def.sym && (sym->st_shndx != SHN_UNDEF
> || sym->st_info>>4 != STB_WEAK)) {
> @@ -2277,7 +2288,7 @@ static void *do_dlsym(struct dso *p, const char *s, void *ra)
> return 0;
> } else
> use_deps = 1;
> - struct symdef def = find_sym2(p, s, 0, use_deps);
> + struct symdef def = find_sym2(p, s, 0, use_deps, NULL, STN_UNDEF);
> if (!def.sym) {
> error("Symbol not found: %s", s);
> return 0;
> --
> 2.52.0
...
> I tested this using clang 22 linking libclang-cpp and libLLVM. I inserted
> timestamps into the loader startup code to measure startup time until the
> program's entry point is called.
> On my laptop (AMD Ryzen AI 9 365), averaged over 200 measurements:
> Without optimization: 5441.205 us
> With optimization: 4714.1 us
> This suggests a roughly 10% improvement in startup time in this scenario.
i don't quite understand the scenario (is this just clang startup time?
why is it different from the numbers below?) but it would be good to know
how many dsos (and relocs etc) are involved.
>
>
> I also measured running musl in ldd mode, without the timing-measurement
> modifications:
> ```
> /home/fr/src/poop/zig-out/bin/poop -d 10000 \
> 'env LD_LIBRARY_PATH=/home/fr/src/musl/master-install/lib/ /home/fr/src/musl/master-install/lib/ld-musl-x86_64.so.1 --list /home/fr/src/llvm-project/build/bin/clang-22' \
> 'env LD_LIBRARY_PATH=/home/fr/src/musl/precomp-install/lib/ /home/fr/src/musl/precomp-install/lib/ld-musl-x86_64.so.1 --list /home/fr/src/llvm-project/build/bin/clang-22'
> ```
>
> Benchmark 1 (1297 runs): env LD_LIBRARY_PATH=/home/fr/src/musl/master-install/lib/ /home/fr/src/musl/master-install/lib/ld-musl-x86_64.so.1 --list /home/fr/src/llvm-project/build/bin/clang-22
> measurement mean ± σ min … max outliers delta
> wall_time 7.66ms ± 863us 5.08ms … 11.3ms 38 ( 3%) 0%
> peak_rss 12.6MB ± 81.2KB 12.0MB … 12.7MB 185 (14%) 0%
> cpu_cycles 11.0M ± 1.12M 9.08M … 15.6M 65 ( 5%) 0%
> instructions 19.3M ± 897 19.3M … 19.3M 48 ( 4%) 0%
> cache_references 463K ± 2.52K 453K … 481K 11 ( 1%) 0%
> cache_misses 84.2K ± 1.45K 81.0K … 91.2K 26 ( 2%) 0%
> branch_misses 79.6K ± 814 78.0K … 83.3K 3 ( 0%) 0%
> Benchmark 2 (1445 runs): env LD_LIBRARY_PATH=/home/fr/src/musl/precomp-install/lib/ /home/fr/src/musl/precomp-install/lib/ld-musl-x86_64.so.1 --list /home/fr/src/llvm-project/build/bin/clang-22
> measurement mean ± σ min … max outliers delta
> wall_time 6.89ms ± 615us 4.70ms … 9.00ms 19 ( 1%) ⚡- 10.1% ± 0.7%
> peak_rss 12.6MB ± 82.3KB 12.1MB … 12.7MB 89 ( 6%) + 0.0% ± 0.0%
> cpu_cycles 9.35M ± 807K 7.72M … 12.5M 91 ( 6%) ⚡- 15.3% ± 0.7%
> instructions 15.4M ± 865 15.4M … 15.4M 42 ( 3%) ⚡- 19.9% ± 0.0%
> cache_references 466K ± 4.28K 454K … 504K 30 ( 2%) + 0.8% ± 0.1%
> cache_misses 85.6K ± 1.50K 82.7K … 92.3K 54 ( 4%) 💩+ 1.7% ± 0.1%
> branch_misses 83.4K ± 686 81.8K … 85.6K 1 ( 0%) 💩+ 4.7% ± 0.1%
these results look good.
thanks.
>
> To reproduce without compiling clang with a musl cross toolchain, an
> alpine docker container can be used:
> - Install the two different versions into `./master-install` and `./precomp-install`
> - sudo docker run --rm -it -v./master-install:/tmp/master-install -v./precomp-install:/tmp/precomp-install alpine:latest sh
> in the container:
> - apk add clang hyperfine
> - hyperfine -N -w 100 'env LD_LIBRARY_PATH=/tmp/master-install/lib /tmp/master-install/lib/libc.so --list /usr/bin/clang' 'env LD_LIBRARY_PATH=/tmp/precomp-install/lib /tmp/precomp-install/lib/libc.so --list /usr/bin/clang'
>
> ```
> Benchmark 1: env LD_LIBRARY_PATH=/tmp/master-install/lib /tmp/master-install/lib/libc.so --list /usr/bin/clang
> Time (mean ± σ): 12.6 ms ± 1.8 ms [User: 9.7 ms, System: 2.8 ms]
> Range (min … max): 9.3 ms … 16.0 ms 191 runs
>
> Benchmark 2: env LD_LIBRARY_PATH=/tmp/precomp-install/lib /tmp/precomp-install/lib/libc.so --list /usr/bin/clang
> Time (mean ± σ): 10.5 ms ± 1.4 ms [User: 7.7 ms, System: 2.7 ms]
> Range (min … max): 7.5 ms … 13.5 ms 298 runs
>
> Summary
> env LD_LIBRARY_PATH=/tmp/precomp-install/lib /tmp/precomp-install/lib/libc.so --list /usr/bin/clang ran
> 1.20 ± 0.23 times faster than env LD_LIBRARY_PATH=/tmp/master-install/lib /tmp/master-install/lib/libc.so --list /usr/bin/clang
> ```
>