|
|
Message-Id: <DEPUQFJLZZVW.14ANQUQF9NVLU@tum.de>
Date: Fri, 05 Dec 2025 01:21:11 +0100
From: "Fabian Rast" <fabian.rast@....de>
To: <musl@...ts.openwall.com>
Subject: [PATCH] ldso: skip gnu hash calculation if precomputed
calculating the hashes of symbol names takes a significant amount of
time while applying relocations. However, for symbol lookups for
names that are themselves part of the gnu hashtable in the object
being relocated, the hash value is already present in that hashtable
and can be efficiently retrieved using the symbol index.
the gnu hash table only stores the upper 31 bits of the 32 bit wide
hashes. the lsb is used to indicate the end of a chain.
this missing bit can be guessed for the cost of traversing two chains
in the worst case.
---
ldso/dynlink.c | 23 +++++++++++++++++------
1 file changed, 17 insertions(+), 6 deletions(-)
diff --git a/ldso/dynlink.c b/ldso/dynlink.c
index 715948f4..fe716b9e 100644
--- a/ldso/dynlink.c
+++ b/ldso/dynlink.c
@@ -311,16 +311,27 @@ static Sym *gnu_lookup_filtered(uint32_t h1, uint32_t *hashtab, struct dso *dso,
#if defined(__GNUC__)
__attribute__((always_inline))
#endif
-static inline struct symdef find_sym2(struct dso *dso, const char *s, int need_def, int use_deps)
+static inline struct symdef find_sym2(struct dso *dso, const char *s, int need_def, int use_deps, struct dso *dso2, int sym_index)
{
- uint32_t h = 0, gh = gnu_hash(s), gho = gh / (8*sizeof(size_t)), *ght;
- size_t ghm = 1ul << gh % (8*sizeof(size_t));
+ uint32_t h = 0, gh, gho, *ght, gh_precomp;
+ size_t ghm;
+
+ gh_precomp = dso2 && (ght = dso2->ghashtab) && sym_index >= ght[1];
+ gh = gh_precomp
+ ? ght[4 + ght[2] * 2 + ght[0] + sym_index - ght[1]] & ~1u
+ : gnu_hash(s);
+
+ gho = gh / (8*sizeof(size_t));
+ ghm = 1ul << gh % (8*sizeof(size_t));
+
struct symdef def = {0};
struct dso **deps = use_deps ? dso->deps : 0;
for (; dso; dso=use_deps ? *deps++ : dso->syms_next) {
Sym *sym;
if ((ght = dso->ghashtab)) {
sym = gnu_lookup_filtered(gh, ght, dso, s, gho, ghm);
+ if (!sym && gh_precomp)
+ sym = gnu_lookup_filtered(gh+1, ght, dso, s, gho, (ghm<<1)?(ghm<<1):1);
} else {
if (!h) h = sysv_hash(s);
sym = sysv_lookup(s, h, dso);
@@ -344,7 +355,7 @@ static inline struct symdef find_sym2(struct dso *dso, const char *s, int need_d
static struct symdef find_sym(struct dso *dso, const char *s, int need_def)
{
- return find_sym2(dso, s, need_def, 0);
+ return find_sym2(dso, s, need_def, 0, NULL, STN_UNDEF);
}
static struct symdef get_lfs64(const char *name)
@@ -428,7 +439,7 @@ static void do_relocs(struct dso *dso, size_t *rel, size_t rel_size, size_t stri
ctx = type==REL_COPY ? head->syms_next : head;
def = (sym->st_info>>4) == STB_LOCAL
? (struct symdef){ .dso = dso, .sym = sym }
- : find_sym(ctx, name, type==REL_PLT);
+ : find_sym2(ctx, name, type==REL_PLT, 0, dso, sym_index);
if (!def.sym) def = get_lfs64(name);
if (!def.sym && (sym->st_shndx != SHN_UNDEF
|| sym->st_info>>4 != STB_WEAK)) {
@@ -2277,7 +2288,7 @@ static void *do_dlsym(struct dso *p, const char *s, void *ra)
return 0;
} else
use_deps = 1;
- struct symdef def = find_sym2(p, s, 0, use_deps);
+ struct symdef def = find_sym2(p, s, 0, use_deps, NULL, STN_UNDEF);
if (!def.sym) {
error("Symbol not found: %s", s);
return 0;
--
2.52.0
Hi,
I would like to propose an optimization to dynamic symbol lookup that
improves load times of executables using big libraries:
During relocation symbol lookups are commonly using the gnu hashtable.
I found that a significant portion of time is spent calculating hashes
of symbol names as part of the gnu hashtable lookup.
It is unfortunate that this is done at load-time, because
the hashes are already known at link time.
Because hashes are stored almost in full in the gnu hashtable
it is possible to use these precomputed values from the hashtable
in some cases.
If a symbol is being looked up to apply a relocation and that
symbol is also defined in the dso being relocated (and thus part
of the hashtable) the upper 31 bits of the hash can be efficiently
loaded from the hashtable using the symbol index.
The missing lsb can be guessed for the cost of possibly looking
in two chains for the two possible hash values.
Please let me know what you think, I am happy to hear your feedback.
Best Regards,
Fabian Rast
= Benchmarks
I tested this using clang 22 linking libclang-cpp and libLLVM. I inserted
timestamps into the loader startup code to measure startup time until the
program's entry point is called.
On my laptop (AMD Ryzen AI 9 365) averaged over 200 measurements:
Without optimization: 5441.205 us
With optimization: 4714.1 us
This suggests a roughly 10% improvement in startup time in this scenario.
I also measured running musl in ldd mode and without the timing measurement
modifications:
```
/home/fr/src/poop/zig-out/bin/poop -d 10000 \
'env LD_LIBRARY_PATH=/home/fr/src/musl/master-install/lib/ /home/fr/src/musl/master-install/lib/ld-musl-x86_64.so.1 --list /home/fr/src/llvm-project/build/bin/clang-22' \
'env LD_LIBRARY_PATH=/home/fr/src/musl/precomp-install/lib/ /home/fr/src/musl/precomp-install/lib/ld-musl-x86_64.so.1 --list /home/fr/src/llvm-project/build/bin/clang-22'
```
Benchmark 1 (1297 runs): env LD_LIBRARY_PATH=/home/fr/src/musl/master-install/lib/ /home/fr/src/musl/master-install/lib/ld-musl-x86_64.so.1 --list /home/fr/src/llvm-project/build/bin/clang-22
measurement mean ± σ min … max outliers delta
wall_time 7.66ms ± 863us 5.08ms … 11.3ms 38 ( 3%) 0%
peak_rss 12.6MB ± 81.2KB 12.0MB … 12.7MB 185 (14%) 0%
cpu_cycles 11.0M ± 1.12M 9.08M … 15.6M 65 ( 5%) 0%
instructions 19.3M ± 897 19.3M … 19.3M 48 ( 4%) 0%
cache_references 463K ± 2.52K 453K … 481K 11 ( 1%) 0%
cache_misses 84.2K ± 1.45K 81.0K … 91.2K 26 ( 2%) 0%
branch_misses 79.6K ± 814 78.0K … 83.3K 3 ( 0%) 0%
Benchmark 2 (1445 runs): env LD_LIBRARY_PATH=/home/fr/src/musl/precomp-install/lib/ /home/fr/src/musl/precomp-install/lib/ld-musl-x86_64.so.1 --list /home/fr/src/llvm-project/build/bin/clang-22
measurement mean ± σ min … max outliers delta
wall_time 6.89ms ± 615us 4.70ms … 9.00ms 19 ( 1%) ⚡- 10.1% ± 0.7%
peak_rss 12.6MB ± 82.3KB 12.1MB … 12.7MB 89 ( 6%) + 0.0% ± 0.0%
cpu_cycles 9.35M ± 807K 7.72M … 12.5M 91 ( 6%) ⚡- 15.3% ± 0.7%
instructions 15.4M ± 865 15.4M … 15.4M 42 ( 3%) ⚡- 19.9% ± 0.0%
cache_references 466K ± 4.28K 454K … 504K 30 ( 2%) + 0.8% ± 0.1%
cache_misses 85.6K ± 1.50K 82.7K … 92.3K 54 ( 4%) 💩+ 1.7% ± 0.1%
branch_misses 83.4K ± 686 81.8K … 85.6K 1 ( 0%) 💩+ 4.7% ± 0.1%
To reproduce without compiling clang with a musl cross toolchain, an
Alpine Docker container can be used:
- Install the two different versions into `./master-install` and `./precomp-install`
- sudo docker run --rm -it -v./master-install:/tmp/master-install -v./precomp-install:/tmp/precomp-install alpine:latest sh
in the container:
- apk add clang hyperfine
- hyperfine -N -w 100 'env LD_LIBRARY_PATH=/tmp/master-install/lib /tmp/master-install/lib/libc.so --list /usr/bin/clang' 'env LD_LIBRARY_PATH=/tmp/precomp-install/lib /tmp/precomp-install/lib/libc.so --list /usr/bin/clang'
```
Benchmark 1: env LD_LIBRARY_PATH=/tmp/master-install/lib /tmp/master-install/lib/libc.so --list /usr/bin/clang
Time (mean ± σ): 12.6 ms ± 1.8 ms [User: 9.7 ms, System: 2.8 ms]
Range (min … max): 9.3 ms … 16.0 ms 191 runs
Benchmark 2: env LD_LIBRARY_PATH=/tmp/precomp-install/lib /tmp/precomp-install/lib/libc.so --list /usr/bin/clang
Time (mean ± σ): 10.5 ms ± 1.4 ms [User: 7.7 ms, System: 2.7 ms]
Range (min … max): 7.5 ms … 13.5 ms 298 runs
Summary
env LD_LIBRARY_PATH=/tmp/precomp-install/lib /tmp/precomp-install/lib/libc.so --list /usr/bin/clang ran
1.20 ± 0.23 times faster than env LD_LIBRARY_PATH=/tmp/master-install/lib /tmp/master-install/lib/libc.so --list /usr/bin/clang
```
Download attachment "signature.asc" of type "application/pgp-signature" (229 bytes)
Powered by blists - more mailing lists
Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.