musl - Re: [PATCH v2] IDNA support in name lookups

Follow @Openwall on Twitter for new release announcements and other news
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20170423010100.GM17319@brightrain.aerifal.cx>
Date: Sat, 22 Apr 2017 21:01:00 -0400
From: Rich Felker <dalias@...c.org>
To: musl@...ts.openwall.com
Subject: Re: [PATCH v2] IDNA support in name lookups

On Sun, Apr 02, 2017 at 09:30:26AM +0200, Joakim Sindholt wrote:
> Changes since v1:
> * Reject UTF-16 surrogate range runes
> * Remove locale override
> 
> This is from some discussion on IRC and while I agree that it's more
> "correct" in POSIX terms, I'm not particularly happy about having to
> explicitly enable UTF-8 support with setlocale.

Yes, I'm not really happy about the decision on the C locale either,
but I understand the reasons various parties wanted it that way and I
think trying to follow the closest-to-working consensus process we
have is better than just following it when we agree with the outcomes.

> There might still be bugs and character ranges that need to be rejected.

As far as I can tell, no normalization is done. This might be
problematic for strings where the natural way users would type them
does not match the normalized form required in IDNs, but it would also
be expensive to handle. I think it's okay to punt on this until it
proves to actually be a problem.

> From 54d5caf36cdce4e5008aecfcc2b02580fb52d0cb Mon Sep 17 00:00:00 2001
> From: Joakim Sindholt <opensource@...sha.com>
> Date: Wed, 29 Mar 2017 11:51:02 +0200
> Subject: [PATCH] add IDNA support to name lookups
> 
> ---
>  src/network/lookup_name.c | 202 +++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 193 insertions(+), 9 deletions(-)
> 
> diff --git a/src/network/lookup_name.c b/src/network/lookup_name.c
> index fb7303a..fd4275c 100644
> --- a/src/network/lookup_name.c
> +++ b/src/network/lookup_name.c

I think I'd rather put these functions in their own source file, just
because they're logically distinct, even if this is the only caller.

> @@ -10,9 +10,21 @@
>  #include <unistd.h>
>  #include <pthread.h>
>  #include <errno.h>
> +#include <wchar.h>
>  #include "lookup.h"
>  #include "stdio_impl.h"
>  #include "syscall.h"
> +#include "locale_impl.h"

Is locale_impl.h actually used now?

> +
> +enum {
> +	base         = 36,
> +	tmin         = 1,
> +	tmax         = 26,
> +	skew         = 38,
> +	damp         = 700,
> +	initial_bias = 72,
> +	initial_n    = 128,
> +};

Especially because introducing these names at file scope sounds
error-prone in a large existing file.

>  static int is_valid_hostname(const char *host)
>  {
> @@ -22,6 +34,163 @@ static int is_valid_hostname(const char *host)
>  	return !*s;
>  }
>  
> +static unsigned int adapt(unsigned int delta, unsigned int numpoints, int firsttime)
> +{
> +	unsigned int k = 0;
> +	delta /= firsttime ? damp : 2;
> +	delta += delta / numpoints;
> +	while (delta > ((base - tmin) * tmax) / 2) {
> +		delta /= base - tmin;
> +		k += base;
> +	}
> +	return k + ((base - tmin + 1) * delta) / (delta + skew);
> +}
> +
> +static ssize_t punyenc(char *dst, const char *src, size_t len, size_t max)
> +{
> +	static const char *const tbl = "abcdefghijklmnopqrstuvwxyz0123456789";
> +	const unsigned char *usrc = (void *)src;
> +	unsigned int codepoints = 0;
> +	unsigned int dlen = 0;
> +	unsigned int si, mi;
> +	unsigned int n = initial_n;
> +	unsigned int delta = 0;
> +	unsigned int bias = initial_bias;
> +	unsigned int h, b;
> +	for (si = 0; si < len; ++si) {
> +		if (usrc[si] < 0x80) {
> +			if (dlen == max)
> +				return -1;
> +			dst[dlen++] = src[si];
> +		} else if ((usrc[si] & 0xC0) == 0xC0) {
> +			++codepoints;
> +		}
> +	}
> +	codepoints += dlen;
> +	h = b = dlen;
> +	if (dlen) {
> +		if (dlen == max)
> +			return -1;
> +		dst[dlen++] = '-';
> +	}
> +	while (h < codepoints) {
> +		unsigned int m = (unsigned int)-1;
> +		unsigned int c;
> +		wchar_t wc;
> +		for (mi = 0; mi < len; ) {
> +			mi += mbtowc(&wc, src + mi, len - mi);
> +			c = (unsigned int)wc;
> +			if (c >= n && c < m)
> +				m = c;
> +		}
> +		if (((unsigned int)-1 - delta) / (h + 1) < m - n)
> +			return -1;
> +		delta += (m - n) * (h + 1);
> +		n = m;
> +
> +		for (mi = 0; mi < len; ) {
> +			mi += mbtowc(&wc, src + mi, len - mi);
> +			c = (unsigned int)wc;
> +			if (c < n /* || c < 0x80 not necessary*/)
> +				if (++delta == 0)
> +					return -1;
> +			if (c == n) {
> +				unsigned int q = delta;
> +				unsigned int k;
> +				for (k = base; ; k += base) {
> +					unsigned int t;
> +					if (k <= bias + tmin) {
> +						t = tmin;
> +					} else if (k >= bias + tmax) {
> +						t = tmax;
> +					} else {
> +						t = k - bias;
> +					}
> +					if (q < t)
> +						break;
> +					if (dlen == max)
> +						return -1;
> +					dst[dlen++] = tbl[t + ((q - t) % (base - t))];
> +					q = (q - t) / (base - t);
> +				}
> +				if (dlen == max)
> +					return -1;
> +				dst[dlen++] = tbl[q];
> +				bias = adapt(delta, h + 1, h == b);
> +				delta = 0;
> +				++h;
> +			}
> +		}
> +		++delta;
> +		++n;
> +	}
> +	return dlen;
> +}
> +
> +static ssize_t idnaenc(char dst[static 256], const char *src)
> +{
> +	size_t left = strlen(src);
> +	size_t olen = 0;
> +
> +	while (left) {
> +		const char *dot;
> +		size_t len, i;
> +		int basic = 1;
> +
> +		dot = memchr(src, '.', left);
> +		if (!dot) { dot = src + left; }
> +		len = dot - src;
> +		if (len == 0) { return -1; }
> +		left -= len + !!*dot;
> +
> +		for (i = 0; i < len; ) {
> +			unsigned int c;
> +			wchar_t wc;
> +			int n = mbtowc(&wc, src + i, len - i);
> +			c = (n <= 0) ? 0 : (unsigned int)wc;
> +			if (c < 0x80) {
> +				if (!isalnum(c) && !(i > 0 && c == '-'))
> +					return -1;
> +			} else {
> +				if ((c >=   0x7F && c <=   0x9F) ||
> +				    (c >= 0xD800 && c <= 0xDFFF))
> +					return -1;
> +				basic = 0;
> +			}
> +			i += n;
> +		}
> +		if (basic) {
> +			if (len > 63 || len > 254 - olen)
> +				return -1;
> +			for (i = 0; i < len; ++i)
> +				dst[olen + i] = tolower(src[i]);
> +			olen += len;
> +		} else {
> +			ssize_t r;
> +			size_t max;
> +			if (olen >= 254 - 4)
> +				return -1;
> +			max = 254 - 4 - olen;
> +			if (max > 63 - 4)
> +				max = 63 - 4;
> +			memcpy(dst + olen, "xn--", 4);
> +			r = punyenc(dst + olen + 4, src, len, max);
> +			if (r <= 0)
> +				return -1;
> +			olen += r + 4;
> +		}
> +		if (olen == 255 || (!*dot && olen == 254))
> +			return -1;
> +		if (*dot)
> +			dst[olen++] = *dot;
> +		src = dot + !!*dot;
> +	}
> +	if (olen == 0)
> +		return -1;
> +	dst[olen] = 0;
> +	return olen;
> +}
> +
>  static int name_from_null(struct address buf[static 2], const char *name, int family, int flags)
>  {
>  	int cnt = 0;
> @@ -61,12 +230,25 @@ static int name_from_hosts(struct address buf[static MAXADDRS], char canon[stati
>  		return EAI_SYSTEM;
>  	}
>  	while (fgets(line, sizeof line, f) && cnt < MAXADDRS) {
> -		char *p, *z;
> +		char idna[256];
> +		ssize_t r;
> +		char *p, *z, c;
>  
>  		if ((p=strchr(line, '#'))) *p++='\n', *p=0;
> -		for(p=line+1; (p=strstr(p, name)) &&
> -			(!isspace(p[-1]) || !isspace(p[l])); p++);
> -		if (!p) continue;
> +		/* skip ip address and canonicalize names */
> +		for (p=line; *p && !isspace(*p); p++);
> +		while (*p) {
> +			for (; *p && isspace(*p); p++);
> +			for (z=p; *z && !isspace(*z); z++);
> +			c = *z;
> +			*z = 0;
> +			r = idnaenc(idna, p);
> +			*z = c;
> +			if (r == l && memcmp(idna, name, l) == 0)
> +				break;
> +			p = z;
> +		}
> +		if (!*p) continue;
>  
>  		/* Isolate IP address to parse */
>  		for (p=line; *p && !isspace(*p); p++);
> @@ -86,7 +268,7 @@ static int name_from_hosts(struct address buf[static MAXADDRS], char canon[stati
>  		for (; *p && isspace(*p); p++);
>  		for (z=p; *z && !isspace(*z); z++);
>  		*z = 0;
> -		if (is_valid_hostname(p)) memcpy(canon, p, z-p+1);
> +		if ((r = idnaenc(idna, p)) > 0) memcpy(canon, idna, r);
>  	}
>  	__fclose_ca(f);
>  	return cnt ? cnt : badfam;
> @@ -285,15 +467,17 @@ static int addrcmp(const void *_a, const void *_b)

Is there any reason this needs to be done, or should be done, for
lookups from the hosts file? IDN/punycode is a hack for transporting
Unicode names on top of the DNS protocol. For the hosts file you can
just put the proper Unicode strings directly in the file.

>  int __lookup_name(struct address buf[static MAXADDRS], char canon[static 256], const char *name, int family, int flags)
>  {
> +	char _name[256];
>  	int cnt = 0, i, j;
>  
>  	*canon = 0;
>  	if (name) {
> -		/* reject empty name and check len so it fits into temp bufs */
> -		size_t l = strnlen(name, 255);
> -		if (l-1 >= 254)
> +		/* convert unicode name to RFC3492 punycode */
> +		ssize_t l;
> +		if ((l = idnaenc(_name, name)) <= 0)
>  			return EAI_NONAME;
> -		memcpy(canon, name, l+1);
> +		memcpy(canon, _name, l+1);
> +		name = _name;
>  	}

If it's not needed for the hosts backend, this code probably belongs
localized to the DNS lookup, rather than at the top of __lookup_name.

BTW there's perhaps also a need for the opposite-direction
translation, both for ai_canonname (when a CNAME points to IDN) and
for getnameinfo reverse lookups. But that can be added as a second
patch I think.

Rich
Confused about mailing lists and their use? Read about mailing lists on Wikipedia and check out these guidelines on proper formatting of your messages.