|
|
Message-Id: <DERB9JCURGJJ.33K6YRKGATQ2W@tum.de>
Date: Sat, 06 Dec 2025 18:31:06 +0100
From: "Fabian Rast" <fabian.rast@....de>
To: <musl@...ts.openwall.com>
Subject: [PATCH v2] ldso: skip gnu hash calculation if precomputed
calculating the hashes of symbol names can take a significant amount of
time while applying relocations. however, symbol lookups for
symbols that are themselves part of the gnu hashtable in the object
being relocated, the hashvalue is already present in that hashtable
and can be efficiently retrieved using the symbol index.
the gnu hash table only stores the upper 31 bits of the 32 bit wide
hashes. the lsb is used to indicate the end of a chain.
this missing bit can be inferred from the position of the symbol index
relative to the two possible chains.
---
ldso/dynlink.c | 25 +++++++++++++++++++------
1 file changed, 19 insertions(+), 6 deletions(-)
diff --git a/ldso/dynlink.c b/ldso/dynlink.c
index 44d9b181..6dee7a97 100644
--- a/ldso/dynlink.c
+++ b/ldso/dynlink.c
@@ -320,10 +320,23 @@ static Sym *gnu_lookup_filtered(uint32_t h1, uint32_t *hashtab, struct dso *dso,
#if defined(__GNUC__)
__attribute__((always_inline))
#endif
-static inline struct symdef find_sym2(struct dso *dso, const char *s, int need_def, int use_deps)
+static inline struct symdef find_sym2(struct dso *dso, const char *s, int need_def, int use_deps, struct dso *dso2, int sym_index)
{
- uint32_t h = 0, gh = gnu_hash(s), gho = gh / (8*sizeof(size_t)), *ght;
- size_t ghm = 1ul << gh % (8*sizeof(size_t));
+ uint32_t h = 0, gh, gho, *ght;
+ size_t ghm;
+
+ if (dso2 && (ght = dso2->ghashtab) && sym_index >= ght[1] && ght[0] > 1) {
+ uint32_t nbuckets = ght[0], *buckets = ght + 4 + ght[2] * 2;
+ uint32_t h0 = buckets[nbuckets + sym_index - ght[1]] & ~1u, h1 = h0|1;
+ uint32_t i0 = buckets[h0 % nbuckets], i1 = buckets[h1 % nbuckets];
+ gh = i0 < i1 && sym_index < i1
+ || i0 > i1 && sym_index >= i0
+ || !i1 ? h0 : h1;
+ } else gh = gnu_hash(s);
+
+ gho = gh / (8*sizeof(size_t));
+ ghm = 1ul << gh % (8*sizeof(size_t));
+
struct symdef def = {0};
struct dso **deps = use_deps ? dso->deps : 0;
for (; dso; dso=use_deps ? *deps++ : dso->syms_next) {
@@ -353,7 +366,7 @@ static inline struct symdef find_sym2(struct dso *dso, const char *s, int need_d
static struct symdef find_sym(struct dso *dso, const char *s, int need_def)
{
- return find_sym2(dso, s, need_def, 0);
+ return find_sym2(dso, s, need_def, 0, NULL, STN_UNDEF);
}
static struct symdef get_lfs64(const char *name)
@@ -437,7 +450,7 @@ static void do_relocs(struct dso *dso, size_t *rel, size_t rel_size, size_t stri
ctx = type==REL_COPY ? head->syms_next : head;
def = (sym->st_info>>4) == STB_LOCAL
? (struct symdef){ .dso = dso, .sym = sym }
- : find_sym(ctx, name, type==REL_PLT);
+ : find_sym2(ctx, name, type==REL_PLT, 0, dso, sym_index);
if (!def.sym) def = get_lfs64(name);
if (!def.sym && (sym->st_shndx != SHN_UNDEF
|| sym->st_info>>4 != STB_WEAK)) {
@@ -2345,7 +2358,7 @@ static void *do_dlsym(struct dso *p, const char *s, void *ra)
return 0;
} else
use_deps = 1;
- struct symdef def = find_sym2(p, s, 0, use_deps);
+ struct symdef def = find_sym2(p, s, 0, use_deps, NULL, STN_UNDEF);
if (!def.sym) {
error("Symbol not found: %s", s);
return 0;
--
2.52.0
Hello,
thank you for your feedback on the previous patch.
I have integrated the idea of using the position of the
symbol index relative to the start of the two possible chains
to infer the missing bit in a single calculation that is independent
of the search path length.
I decided to do this instead of using the bloom filter because it seems
more reliable, e.g. the bloom filter might accept both versions of the hash.
There some edgecases that i considered:
the gnu hashtable might only have a single bucket, in which case
this approach cant work. (the bloom filter idea could still work here though)
In this case the hash is just calculated normally.
The hashtable might contain empty chains (buckets[h % nbuckets] == 0)
It is impossible for both considered buckets to be empty. If one bucket is empty
the correct hash must be the one indexing the other bucket.
The relative position of the two chains (which one is first) is undefined.
The chain indexed by h1 can be before the chain indexed by h0.
Even if the buckets array is sorted, it can be that
h0 % nbuckets == nbuckets - 1, and h1 % nbuckets == 0.
> uint32_t i0 = buckets[h0 % nbuckets], i1 = buckets[h1 % nbuckets];
I used two modulo operations because it was slightly faster than
branching on (h0 % nbuckets)+1 == nbuckets. I can't confidently explain this...
= Benchmarks
My interpretation of the benchmarks:
All benchmarks have a lot of variance sadly. Clang seems to be an extreme
example, where this patch helps a lot more than in other scenarios. This
is probably dependend on the average cost of a symbol hash, that grows
with longer symbol names.
For example:
avglen.sh
```
eu-readelf --dyn-syms $1 | awk '{ total_len += length($8); total += 1 } END { print total_len/total }'
```
sh avglen.sh /usr/lib/libLLVM.so.21.1
84.3108
sh avglen.sh /usr/lib/libclang-cpp.so.21.1
88.9683
sh avglen.sh /usr/lib/firefox/libxul.so
24.2836
sh avglen.sh /usr/lib/libwebkit2gtk-4.1.so.0.17.6
37.898
To me it looks like the very long symbol names in libclang and libLLVM are
the cause for the speedups in the clang example, while there are only barely
measurable/no speedups with other libraries.
/home/fr/src/poop/zig-out/bin/poop -d 10000 \
'env LD_LIBRARY_PATH=/home/fr/src/musl/master-install/lib/ /home/fr/src/musl/master-install/lib/ld-musl-x86_64.so.1 --list /home/fr/src/llvm-project/build/bin/clang-22' \
'env LD_LIBRARY_PATH=/home/fr/src/musl/precomp-install/lib/ /home/fr/src/musl/precomp-install/lib/ld-musl-x86_64.so.1 --list /home/fr/src/llvm-project/build/bin/clang-22'
Benchmark 1 (1437 runs): env LD_LIBRARY_PATH=/home/fr/src/musl/master-install/lib/ /home/fr/src/musl/master-install/lib/ld-musl-x86_64.so.1 --list /home/fr/src/llvm-project/build/bin/clang-22
measurement mean ± σ min … max outliers delta
wall_time 6.94ms ± 1.12ms 3.93ms … 9.19ms 0 ( 0%) 0%
peak_rss 12.7MB ± 100KB 12.3MB … 12.9MB 6 ( 0%) 0%
cpu_cycles 10.8M ± 671K 9.34M … 14.4M 26 ( 2%) 0%
instructions 19.3M ± 880 19.3M … 19.3M 49 ( 3%) 0%
cache_references 466K ± 3.17K 457K … 485K 21 ( 1%) 0%
cache_misses 84.7K ± 1.33K 82.1K … 90.6K 37 ( 3%) 0%
branch_misses 79.5K ± 731 77.9K … 82.8K 4 ( 0%) 0%
Benchmark 2 (1599 runs): env LD_LIBRARY_PATH=/home/fr/src/musl/precomp-install/lib/ /home/fr/src/musl/precomp-install/lib/ld-musl-x86_64.so.1 --list /home/fr/src/llvm-project/build/bin/clang-22
measurement mean ± σ min … max outliers delta
wall_time 6.24ms ± 795us 3.59ms … 8.19ms 69 ( 4%) ⚡- 10.0% ± 1.0%
peak_rss 12.7MB ± 94.4KB 12.3MB … 12.8MB 9 ( 1%) + 0.1% ± 0.1%
cpu_cycles 8.79M ± 486K 7.49M … 11.6M 89 ( 6%) ⚡- 18.9% ± 0.4%
instructions 14.7M ± 859 14.7M … 14.7M 49 ( 3%) ⚡- 23.6% ± 0.0%
cache_references 465K ± 4.66K 453K … 487K 17 ( 1%) - 0.1% ± 0.1%
cache_misses 84.3K ± 1.38K 81.0K … 90.9K 108 ( 7%) - 0.4% ± 0.1%
branch_misses 77.0K ± 485 75.9K … 78.8K 12 ( 1%) ⚡- 3.2% ± 0.1%
The speedup is similar to the first version of the patch. My explanation
for this is, that, as Szabolcs Nagy pointed out, running clang does not
involve many dsos, so the overhead of the second bucket search per dso
had only a tiny impact.
The other benchmarks were executed in an alpine docker container:
sudo docker run --rm -it -v./master-install:/tmp/master-install -v./precomp-install:/tmp/precomp-install alpine:latest sh
apk add hyperfine clang ghostscript-gtk libreoffice-common webkit2gtk-4.1 firefox mpv ffmpeg
bm.sh:
```
hyperfine -N -w 500 -r 1000 "env LD_LIBRARY_PATH=/tmp/master-install/lib /tmp/master-install/lib/libc.so --list $1" "env LD_LIBRARY_PATH=/tmp/precomp-install/lib /tmp/precomp-install/lib/libc.so --list $1"
```
= clang
/ # sh bm.sh /usr/bin/clang
Benchmark 1: env LD_LIBRARY_PATH=/tmp/master-install/lib /tmp/master-install/lib/libc.so --list /usr/bin/clang
Time (mean ± σ): 11.7 ms ± 2.8 ms [User: 8.8 ms, System: 2.8 ms]
Range (min … max): 6.7 ms … 19.6 ms 1000 runs
Benchmark 2: env LD_LIBRARY_PATH=/tmp/precomp-install/lib /tmp/precomp-install/lib/libc.so --list /usr/bin/clang
Time (mean ± σ): 9.5 ms ± 1.7 ms [User: 6.7 ms, System: 2.7 ms]
Range (min … max): 5.6 ms … 13.5 ms 1000 runs
Summary
env LD_LIBRARY_PATH=/tmp/precomp-install/lib /tmp/precomp-install/lib/libc.so --list /usr/bin/clang ran
1.24 ± 0.38 times faster than env LD_LIBRARY_PATH=/tmp/master-install/lib /tmp/master-install/lib/libc.so --list /usr/bin/clang
= gsx
/ # sh bm.sh /usr/bin/gsx
Benchmark 1: env LD_LIBRARY_PATH=/tmp/master-install/lib /tmp/master-install/lib/libc.so --list /usr/bin/gsx
Time (mean ± σ): 15.3 ms ± 3.4 ms [User: 10.6 ms, System: 4.6 ms]
Range (min … max): 10.3 ms … 23.4 ms 1000 runs
Benchmark 2: env LD_LIBRARY_PATH=/tmp/precomp-install/lib /tmp/precomp-install/lib/libc.so --list /usr/bin/gsx
Time (mean ± σ): 14.9 ms ± 3.2 ms [User: 10.2 ms, System: 4.6 ms]
Range (min … max): 9.9 ms … 21.5 ms 1000 runs
Summary
env LD_LIBRARY_PATH=/tmp/precomp-install/lib /tmp/precomp-install/lib/libc.so --list /usr/bin/gsx ran
1.03 ± 0.32 times faster than env LD_LIBRARY_PATH=/tmp/master-install/lib /tmp/master-install/lib/libc.so --list /usr/bin/gsx
= ffmpeg
/ # sh bm.sh /usr/bin/ffmpeg
Benchmark 1: env LD_LIBRARY_PATH=/tmp/master-install/lib /tmp/master-install/lib/libc.so --list /usr/bin/ffmpeg
Time (mean ± σ): 20.5 ms ± 4.1 ms [User: 15.1 ms, System: 5.1 ms]
Range (min … max): 14.6 ms … 32.7 ms 1000 runs
Benchmark 2: env LD_LIBRARY_PATH=/tmp/precomp-install/lib /tmp/precomp-install/lib/libc.so --list /usr/bin/ffmpeg
Time (mean ± σ): 19.6 ms ± 3.7 ms [User: 14.3 ms, System: 5.1 ms]
Range (min … max): 14.0 ms … 31.4 ms 1000 runs
Summary
env LD_LIBRARY_PATH=/tmp/precomp-install/lib /tmp/precomp-install/lib/libc.so --list /usr/bin/ffmpeg ran
1.05 ± 0.29 times faster than env LD_LIBRARY_PATH=/tmp/master-install/lib /tmp/master-install/lib/libc.so --list /usr/bin/ffmpeg
= mpv
/ # sh bm.sh /usr/bin/mpv
Benchmark 1: env LD_LIBRARY_PATH=/tmp/master-install/lib /tmp/master-install/lib/libc.so --list /usr/bin/mpv
Time (mean ± σ): 52.9 ms ± 4.4 ms [User: 44.7 ms, System: 7.9 ms]
Range (min … max): 46.3 ms … 91.3 ms 1000 runs
Benchmark 2: env LD_LIBRARY_PATH=/tmp/precomp-install/lib /tmp/precomp-install/lib/libc.so --list /usr/bin/mpv
Time (mean ± σ): 51.4 ms ± 4.5 ms [User: 43.2 ms, System: 7.9 ms]
Range (min … max): 44.7 ms … 91.7 ms 1000 runs
Summary
env LD_LIBRARY_PATH=/tmp/precomp-install/lib /tmp/precomp-install/lib/libc.so --list /usr/bin/mpv ran
1.03 ± 0.12 times faster than env LD_LIBRARY_PATH=/tmp/master-install/lib /tmp/master-install/lib/libc.so --list /usr/bin/mpv
= libreoffice
/ # sh bm.sh /usr/lib/libreoffice/program/soffice.bin
Benchmark 1: env LD_LIBRARY_PATH=/tmp/master-install/lib /tmp/master-install/lib/libc.so --list /usr/lib/libreoffice/program/soffice.bin
Time (mean ± σ): 45.8 ms ± 5.2 ms [User: 38.4 ms, System: 7.2 ms]
Range (min … max): 38.0 ms … 79.1 ms 1000 runs
Benchmark 2: env LD_LIBRARY_PATH=/tmp/precomp-install/lib /tmp/precomp-install/lib/libc.so --list /usr/lib/libreoffice/program/soffice.bin
Time (mean ± σ): 45.0 ms ± 4.9 ms [User: 37.6 ms, System: 7.2 ms]
Range (min … max): 36.6 ms … 69.6 ms 1000 runs
Summary
env LD_LIBRARY_PATH=/tmp/precomp-install/lib /tmp/precomp-install/lib/libc.so --list /usr/lib/libreoffice/program/soffice.bin ran
1.02 ± 0.16 times faster than env LD_LIBRARY_PATH=/tmp/master-install/lib /tmp/master-install/lib/libc.so --list /usr/lib/libreoffice/program/soffice.bin
= /usr/lib/libwebkit2gtk-*
/ # sh bm.sh /usr/lib/libwebkit2gtk-4.1.so.0
Benchmark 1: env LD_LIBRARY_PATH=/tmp/master-install/lib /tmp/master-install/lib/libc.so --list /usr/lib/libwebkit2gtk-4.1.so.0
Time (mean ± σ): 57.6 ms ± 5.1 ms [User: 49.2 ms, System: 8.1 ms]
Range (min … max): 49.1 ms … 93.5 ms 1000 runs
Benchmark 2: env LD_LIBRARY_PATH=/tmp/precomp-install/lib /tmp/precomp-install/lib/libc.so --list /usr/lib/libwebkit2gtk-4.1.so.0
Time (mean ± σ): 54.9 ms ± 4.9 ms [User: 46.4 ms, System: 8.3 ms]
Range (min … max): 47.6 ms … 95.2 ms 1000 runs
Summary
env LD_LIBRARY_PATH=/tmp/precomp-install/lib /tmp/precomp-install/lib/libc.so --list /usr/lib/libwebkit2gtk-4.1.so.0 ran
1.05 ± 0.13 times faster than env LD_LIBRARY_PATH=/tmp/master-install/lib /tmp/master-install/lib/libc.so --list /usr/lib/libwebkit2gtk-4.1.so.0
= firefox
/ # sh bm.sh /usr/lib/firefox/libxul.so
Benchmark 1: env LD_LIBRARY_PATH=/tmp/master-install/lib /tmp/master-install/lib/libc.so --list /usr/lib/firefox/libxul.so
Time (mean ± σ): 18.9 ms ± 3.6 ms [User: 13.7 ms, System: 5.0 ms]
Range (min … max): 12.6 ms … 28.2 ms 1000 runs
Benchmark 2: env LD_LIBRARY_PATH=/tmp/precomp-install/lib /tmp/precomp-install/lib/libc.so --list /usr/lib/firefox/libxul.so
Time (mean ± σ): 18.3 ms ± 3.7 ms [User: 13.3 ms, System: 4.9 ms]
Range (min … max): 12.8 ms … 28.4 ms 1000 runs
Summary
env LD_LIBRARY_PATH=/tmp/precomp-install/lib /tmp/precomp-install/lib/libc.so --list /usr/lib/firefox/libxul.so ran
1.03 ± 0.29 times faster than env LD_LIBRARY_PATH=/tmp/master-install/lib /tmp/master-install/lib/libc.so --list /usr/lib/firefox/libxul.so
Download attachment "signature.asc" of type "application/pgp-signature" (229 bytes)
Powered by blists - more mailing lists
Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.