>From 54d5caf36cdce4e5008aecfcc2b02580fb52d0cb Mon Sep 17 00:00:00 2001
From: Joakim Sindholt
Date: Wed, 29 Mar 2017 11:51:02 +0200
Subject: [PATCH] add IDNA support to name lookups

---
Review notes (text between the "---" line above and the diff is not part
of the commit message):

 * idnaenc: an all-ASCII label following a dot stored at dst[254] was
   checked against 254 - olen with olen == 255, which wraps around for
   size_t; the label was then copied past the end of the 256-byte
   buffer.  olen is now tested explicitly.
 * name_from_hosts: the canonical name was copied into canon without its
   terminating NUL, leaving the tail of the previous contents in place
   when the new name is shorter.  The copy now includes the NUL.
 * Comments were added to the new helpers; hunk headers and the diffstat
   below are adjusted for the added lines.  The blob ids on the "index"
   line are carried over unchanged.
 * NOTE(review): the header names of the four angle-bracket #include
   lines in the first hunk are missing from this copy of the patch and
   are left as found; restore them from the upstream file before
   applying.

 src/network/lookup_name.c | 223 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 214 insertions(+), 9 deletions(-)

diff --git a/src/network/lookup_name.c b/src/network/lookup_name.c
index fb7303a..fd4275c 100644
--- a/src/network/lookup_name.c
+++ b/src/network/lookup_name.c
@@ -10,9 +10,22 @@
 #include
 #include
 #include
+#include
 #include "lookup.h"
 #include "stdio_impl.h"
 #include "syscall.h"
+#include "locale_impl.h"
+
+/* Punycode parameters (RFC 3492, section 5) */
+enum {
+	base = 36,
+	tmin = 1,
+	tmax = 26,
+	skew = 38,
+	damp = 700,
+	initial_bias = 72,
+	initial_n = 128,
+};
 
 static int is_valid_hostname(const char *host)
 {
@@ -22,6 +35,182 @@ static int is_valid_hostname(const char *host)
 	return !*s;
 }
 
+/* Bias adaptation (RFC 3492, section 6.1): scale delta down (by damp when
+ * firsttime is set, otherwise by 2), add its per-code-point share, and
+ * derive the bias to use for the next delta. */
+static unsigned int adapt(unsigned int delta, unsigned int numpoints, int firsttime)
+{
+	unsigned int k = 0;
+	delta /= firsttime ? damp : 2;
+	delta += delta / numpoints;
+	while (delta > ((base - tmin) * tmax) / 2) {
+		delta /= base - tmin;
+		k += base;
+	}
+	return k + ((base - tmin + 1) * delta) / (delta + skew);
+}
+
+/* Punycode-encode the len-byte label at src into dst, writing at most max
+ * bytes and no terminating NUL. Non-ASCII input is decoded with mbtowc,
+ * i.e. according to the current locale. Returns the number of bytes
+ * written, or -1 if the output would exceed max or delta would overflow.
+ * The return value of mbtowc is not checked here, so the caller must
+ * already have verified that src decodes cleanly. */
+static ssize_t punyenc(char *dst, const char *src, size_t len, size_t max)
+{
+	static const char *const tbl = "abcdefghijklmnopqrstuvwxyz0123456789";
+	const unsigned char *usrc = (void *)src;
+	unsigned int codepoints = 0;
+	unsigned int dlen = 0;
+	unsigned int si, mi;
+	unsigned int n = initial_n;
+	unsigned int delta = 0;
+	unsigned int bias = initial_bias;
+	unsigned int h, b;
+	for (si = 0; si < len; ++si) {
+		if (usrc[si] < 0x80) {
+			if (dlen == max)
+				return -1;
+			dst[dlen++] = src[si];
+		} else if ((usrc[si] & 0xC0) == 0xC0) {
+			++codepoints;
+		}
+	}
+	codepoints += dlen;
+	h = b = dlen;
+	if (dlen) {
+		if (dlen == max)
+			return -1;
+		dst[dlen++] = '-';
+	}
+	while (h < codepoints) {
+		unsigned int m = (unsigned int)-1;
+		unsigned int c;
+		wchar_t wc;
+		for (mi = 0; mi < len; ) {
+			mi += mbtowc(&wc, src + mi, len - mi);
+			c = (unsigned int)wc;
+			if (c >= n && c < m)
+				m = c;
+		}
+		if (((unsigned int)-1 - delta) / (h + 1) < m - n)
+			return -1;
+		delta += (m - n) * (h + 1);
+		n = m;
+
+		for (mi = 0; mi < len; ) {
+			mi += mbtowc(&wc, src + mi, len - mi);
+			c = (unsigned int)wc;
+			if (c < n /* || c < 0x80 not necessary*/)
+				if (++delta == 0)
+					return -1;
+			if (c == n) {
+				unsigned int q = delta;
+				unsigned int k;
+				for (k = base; ; k += base) {
+					unsigned int t;
+					if (k <= bias + tmin) {
+						t = tmin;
+					} else if (k >= bias + tmax) {
+						t = tmax;
+					} else {
+						t = k - bias;
+					}
+					if (q < t)
+						break;
+					if (dlen == max)
+						return -1;
+					dst[dlen++] = tbl[t + ((q - t) % (base - t))];
+					q = (q - t) / (base - t);
+				}
+				if (dlen == max)
+					return -1;
+				dst[dlen++] = tbl[q];
+				bias = adapt(delta, h + 1, h == b);
+				delta = 0;
+				++h;
+			}
+		}
+		++delta;
+		++n;
+	}
+	return dlen;
+}
+
+/* Convert the dot-separated name src to its ASCII form in dst. All-ASCII
+ * labels are copied lowercased; any other label becomes "xn--" followed
+ * by its punycode encoding. A label may contain ASCII alphanumerics, '-'
+ * (not as its first byte) and non-ASCII characters whose decoded value is
+ * outside 0x80..0x9F and 0xD800..0xDFFF. Returns the length of the
+ * NUL-terminated result, or -1 if the name is empty, contains an empty
+ * label or a rejected character, or exceeds the length limits. */
+static ssize_t idnaenc(char dst[static 256], const char *src)
+{
+	size_t left = strlen(src);
+	size_t olen = 0;
+
+	while (left) {
+		const char *dot;
+		size_t len, i;
+		int basic = 1;
+
+		dot = memchr(src, '.', left);
+		if (!dot) { dot = src + left; }
+		len = dot - src;
+		if (len == 0) { return -1; }
+		left -= len + !!*dot;
+
+		for (i = 0; i < len; ) {
+			unsigned int c;
+			wchar_t wc;
+			int n = mbtowc(&wc, src + i, len - i);
+			c = (n <= 0) ? 0 : (unsigned int)wc;
+			if (c < 0x80) {
+				if (!isalnum(c) && !(i > 0 && c == '-'))
+					return -1;
+			} else {
+				if ((c >= 0x7F && c <= 0x9F) ||
+				    (c >= 0xD800 && c <= 0xDFFF))
+					return -1;
+				basic = 0;
+			}
+			i += n;
+		}
+		if (basic) {
+			/* olen is 255 once a dot has been stored at dst[254];
+			 * 254 - olen would then wrap around, so test olen
+			 * first to keep the copy below inside dst. */
+			if (len > 63 || olen > 254 || len > 254 - olen)
+				return -1;
+			for (i = 0; i < len; ++i)
+				dst[olen + i] = tolower(src[i]);
+			olen += len;
+		} else {
+			ssize_t r;
+			size_t max;
+			if (olen >= 254 - 4)
+				return -1;
+			max = 254 - 4 - olen;
+			if (max > 63 - 4)
+				max = 63 - 4;
+			memcpy(dst + olen, "xn--", 4);
+			r = punyenc(dst + olen + 4, src, len, max);
+			if (r <= 0)
+				return -1;
+			olen += r + 4;
+		}
+		if (olen == 255 || (!*dot && olen == 254))
+			return -1;
+		if (*dot)
+			dst[olen++] = *dot;
+		src = dot + !!*dot;
+	}
+	if (olen == 0)
+		return -1;
+	dst[olen] = 0;
+	return olen;
+}
+
 static int name_from_null(struct address buf[static 2], const char *name, int family, int flags)
 {
 	int cnt = 0;
@@ -61,12 +250,25 @@ static int name_from_hosts(struct address buf[static MAXADDRS], char canon[stati
 		return EAI_SYSTEM;
 	}
 	while (fgets(line, sizeof line, f) && cnt < MAXADDRS) {
-		char *p, *z;
+		char idna[256];
+		ssize_t r;
+		char *p, *z, c;
 
 		if ((p=strchr(line, '#'))) *p++='\n', *p=0;
-		for(p=line+1; (p=strstr(p, name)) &&
-			(!isspace(p[-1]) || !isspace(p[l])); p++);
-		if (!p) continue;
+		/* skip ip address and canonicalize names */
+		for (p=line; *p && !isspace(*p); p++);
+		while (*p) {
+			for (; *p && isspace(*p); p++);
+			for (z=p; *z && !isspace(*z); z++);
+			c = *z;
+			*z = 0;
+			r = idnaenc(idna, p);
+			*z = c;
+			if (r == l && memcmp(idna, name, l) == 0)
+				break;
+			p = z;
+		}
+		if (!*p) continue;
 
 		/* Isolate IP address to parse */
 		for (p=line; *p && !isspace(*p); p++);
@@ -86,7 +288,8 @@ static int name_from_hosts(struct address buf[static MAXADDRS], char canon[stati
 		for (; *p && isspace(*p); p++);
 		for (z=p; *z && !isspace(*z); z++);
 		*z = 0;
-		if (is_valid_hostname(p)) memcpy(canon, p, z-p+1);
+		/* r does not count the terminating NUL; copy it as well */
+		if ((r = idnaenc(idna, p)) > 0) memcpy(canon, idna, r+1);
 	}
 	__fclose_ca(f);
 	return cnt ? cnt : badfam;
@@ -285,15 +488,17 @@ static int addrcmp(const void *_a, const void *_b)
 
 int __lookup_name(struct address buf[static MAXADDRS], char canon[static 256], const char *name, int family, int flags)
 {
+	char _name[256];
 	int cnt = 0, i, j;
 
 	*canon = 0;
 	if (name) {
-		/* reject empty name and check len so it fits into temp bufs */
-		size_t l = strnlen(name, 255);
-		if (l-1 >= 254)
+		/* convert unicode name to RFC3492 punycode */
+		ssize_t l;
+		if ((l = idnaenc(_name, name)) <= 0)
 			return EAI_NONAME;
-		memcpy(canon, name, l+1);
+		memcpy(canon, _name, l+1);
+		name = _name;
 	}
 
 	/* Procedurally, a request for v6 addresses with the v4-mapped
-- 
2.10.2