/*
 * Review notes for this patch.  They are placed ahead of the first
 * "diff --git" header so that no hunk content is altered.
 *
 * Purpose: the multibyte / wide-character functions stop assuming UTF-8
 * unconditionally and instead test MB_CUR_MAX at run time.
 *
 * What the hunks do:
 * - include/stdlib.h: MB_CUR_MAX changes from the constant ((size_t)+4)
 *   to a call to __ctype_get_mb_cur_max().
 * - __ctype_get_mb_cur_max() now returns MB_CUR_MAX after including
 *   "locale_impl.h".
 * - When MB_CUR_MAX == 1, btowc, mbrtowc, mbtowc and mbsrtowcs map a byte
 *   b >= 0x80 to the wide value b + 0x10f000 (0x10f080..0x10f0ff); bytes
 *   below 0x80 map to themselves.  wcrtomb, wctob and __wctoraw8 perform
 *   the reverse mapping and reject any other value >= 0x80 (EILSEQ and -1,
 *   or EOF for wctob).
 * - __utf8rtowc and __wctoutf8 are new converters that do not test
 *   MB_CUR_MAX; iconv's mbrtowc_utf8 / wctomb_utf8 macros now name them.
 * - struct _IO_FILE: "short dummy3" becomes two unsigned chars, dummy3
 *   and utf8.  fwide() sets utf8 = (MB_CUR_MAX > 1) at the moment it
 *   first assigns a non-zero mode to the stream.
 * - fgetwc, fputwc and ungetwc call fwide(f, 1) when f->utf8 is 0 and, if
 *   it is still 0 afterwards, transfer a single raw byte using the same
 *   0x10f000 offset.  vfwprintf and vfwscanf replace the direct f->mode
 *   update with fwide(f, 1).
 * - __nl_langinfo_l(CODESET) returns "UTF-8" or "ASCII+8BIT" depending on
 *   loc->ctype_utf8.
 *
 * NOTE(review): points to confirm before merging
 * - stdlib.h defines MB_CUR_MAX as a call to __ctype_get_mb_cur_max(),
 *   and that function returns MB_CUR_MAX.  Presumably locale_impl.h
 *   redefines the macro; otherwise the function would call itself.
 * - fputwc byte path: c is overwritten with the raw byte and the function
 *   returns putc_unlocked(c, f), so on success the caller appears to get
 *   the byte value rather than the wide character it passed in.  The
 *   "c<0" test is applied to a wchar_t and can never be true if wchar_t
 *   is an unsigned type on the target.
 * - ungetwc byte path: the result of ungetc() is returned, so the same
 *   byte-versus-wide-character question applies; a failed __wctoraw8()
 *   result (-1) is passed straight to ungetc().
 * - ungetwc UTF-8 path: l is assigned only when c >= 128 but is read in
 *   "f->rpos < f->buf - UNGET + l"; its declaration is outside the hunk,
 *   so check that it has an initializer.
 * - ungetwc, vfwprintf and vfwscanf call fwide(), which itself takes
 *   FLOCK(f), while FLOCK(f) is already held; verify that the lock may
 *   be taken recursively.
 * - __utf8rtowc: "if (!wc) wc = (void *)&wc;" makes later stores through
 *   wc land in the pointer variable itself; this assumes a wchar_t fits
 *   in, and may alias, that storage.
 * - In this copy the header names after several "#include" directives
 *   are missing, so it cannot be confirmed here that the new files
 *   include the header that declares errno / EILSEQ.
 */
diff --git a/include/stdlib.h b/include/stdlib.h index f034c6e..fbe7a21 100644 --- a/include/stdlib.h +++ b/include/stdlib.h @@ -76,7 +76,8 @@ size_t wcstombs (char *__restrict, const wchar_t *__restrict, size_t); #define EXIT_FAILURE 1 #define EXIT_SUCCESS 0 -#define MB_CUR_MAX ((size_t)+4) +size_t __ctype_get_mb_cur_max(void); +#define MB_CUR_MAX (__ctype_get_mb_cur_max()) #define RAND_MAX (0x7fffffff) diff --git a/src/ctype/__ctype_get_mb_cur_max.c b/src/ctype/__ctype_get_mb_cur_max.c index d235f4d..94b0bd4 100644 --- a/src/ctype/__ctype_get_mb_cur_max.c +++ b/src/ctype/__ctype_get_mb_cur_max.c @@ -1,6 +1,7 @@ #include +#include "locale_impl.h" size_t __ctype_get_mb_cur_max() { - return 4; + return MB_CUR_MAX; } diff --git a/src/internal/stdio_impl.h b/src/internal/stdio_impl.h index 79be9fd..aafdc08 100644 --- a/src/internal/stdio_impl.h +++ b/src/internal/stdio_impl.h @@ -35,7 +35,8 @@ struct _IO_FILE { int fd; int pipe_pid; long lockcount; - short dummy3; + unsigned char dummy3; + unsigned char utf8; signed char mode; signed char lbf; int lock; diff --git a/src/locale/iconv.c b/src/locale/iconv.c index a0b0232..138d596 100644 --- a/src/locale/iconv.c +++ b/src/locale/iconv.c @@ -152,8 +152,10 @@ static void put_32(unsigned char *s, unsigned c, int e) } /* Adapt as needed */ -#define mbrtowc_utf8 mbrtowc -#define wctomb_utf8 wctomb +size_t __utf8rtowc(wchar_t *, const char *, size_t, mbstate_t *); +size_t __wctoutf8(char *, wchar_t); +#define mbrtowc_utf8 __utf8rtowc +#define wctomb_utf8 __wctoutf8 size_t iconv(iconv_t cd0, char **restrict in, size_t *restrict inb, char **restrict out, size_t *restrict outb) { diff --git a/src/locale/langinfo.c b/src/locale/langinfo.c index 13abf45..0b130e7 100644 --- a/src/locale/langinfo.c +++ b/src/locale/langinfo.c @@ -32,7 +32,7 @@ char *__nl_langinfo_l(nl_item item, locale_t loc) int idx = item & 65535; const char *str; - if (item == CODESET) return "UTF-8"; + if (item == CODESET) return loc->ctype_utf8 ? 
"UTF-8" : "ASCII+8BIT"; switch (cat) { case LC_NUMERIC: diff --git a/src/multibyte/btowc.c b/src/multibyte/btowc.c index 9d2c3b1..695964c 100644 --- a/src/multibyte/btowc.c +++ b/src/multibyte/btowc.c @@ -1,7 +1,10 @@ #include #include +#include "locale_impl.h" wint_t btowc(int c) { - return c<128U ? c : EOF; + if (c+1U <= 128) return c; + if (MB_CUR_MAX == 1) return (unsigned char)c + 0x10f000; + return WEOF; } diff --git a/src/multibyte/mbrtowc.c b/src/multibyte/mbrtowc.c index e7b3654..c96d6e7 100644 --- a/src/multibyte/mbrtowc.c +++ b/src/multibyte/mbrtowc.c @@ -6,6 +6,7 @@ #include #include +#include "locale_impl.h" #include "internal.h" size_t mbrtowc(wchar_t *restrict wc, const char *restrict src, size_t n, mbstate_t *restrict st) @@ -27,6 +28,7 @@ size_t mbrtowc(wchar_t *restrict wc, const char *restrict src, size_t n, mbstate if (!n) return -2; if (!c) { if (*s < 0x80) return !!(*wc = *s); + if (MB_CUR_MAX==1) return (*wc = *s + 0x10f000), 1; if (*s-SA > SB-SA) goto ilseq; c = bittab[*s++-SA]; n--; } diff --git a/src/multibyte/mbsrtowcs.c b/src/multibyte/mbsrtowcs.c index 3c1343a..f4e46f3 100644 --- a/src/multibyte/mbsrtowcs.c +++ b/src/multibyte/mbsrtowcs.c @@ -7,6 +7,8 @@ #include #include #include +#include +#include "locale_impl.h" #include "internal.h" size_t mbsrtowcs(wchar_t *restrict ws, const char **restrict src, size_t wn, mbstate_t *restrict st) @@ -24,6 +26,23 @@ size_t mbsrtowcs(wchar_t *restrict ws, const char **restrict src, size_t wn, mbs } } + if (MB_CUR_MAX==1) { + if (!ws) return strlen((const char *)s); + for (;;) { + if (!wn) { + *src = (const void *)s; + return wn0; + } + if (!*s) break; + c = *s++; + *ws++ = c + (0x10f000 & -(c>>7)); + wn--; + } + *ws = 0; + *src = 0; + return wn0-wn; + } + if (!ws) for (;;) { if (*s-1u < 0x7f && (uintptr_t)s%4 == 0) { while (!(( *(uint32_t*)s | *(uint32_t*)s-0x01010101) & 0x80808080)) { diff --git a/src/multibyte/mbtowc.c b/src/multibyte/mbtowc.c index 803d221..6a2e3f9 100644 --- 
a/src/multibyte/mbtowc.c +++ b/src/multibyte/mbtowc.c @@ -6,6 +6,7 @@ #include #include +#include "locale_impl.h" #include "internal.h" int mbtowc(wchar_t *restrict wc, const char *restrict src, size_t n) @@ -19,6 +20,7 @@ int mbtowc(wchar_t *restrict wc, const char *restrict src, size_t n) if (!wc) wc = &dummy; if (*s < 0x80) return !!(*wc = *s); + if (MB_CUR_MAX==1) return (*wc = *s + 0x10f000), 1; if (*s-SA > SB-SA) goto ilseq; c = bittab[*s++-SA]; diff --git a/src/multibyte/utf8rtowc.c b/src/multibyte/utf8rtowc.c new file mode 100644 index 0000000..6bb5220 --- /dev/null +++ b/src/multibyte/utf8rtowc.c @@ -0,0 +1,48 @@ +/* + * This code was written by Rich Felker in 2010; no copyright is claimed. + * This code is in the public domain. Attribution is appreciated but + * unnecessary. + */ + +#include +#include +#include "internal.h" + +size_t __utf8rtowc(wchar_t *restrict wc, const char *restrict src, size_t n, mbstate_t *restrict st) +{ + unsigned c; + const unsigned char *s = (const void *)src; + const unsigned N = n; + + c = *(unsigned *)st; + if (!wc) wc = (void *)&wc; + + if (!n) return -2; + if (!c) { + if (*s < 0x80) return !!(*wc = *s); + if (*s-SA > SB-SA) goto ilseq; + c = bittab[*s++-SA]; n--; + } + + if (n) { + if (OOB(c,*s)) goto ilseq; +loop: + c = c<<6 | *s++-0x80; n--; + if (!(c&(1U<<31))) { + *(unsigned *)st = 0; + *wc = c; + return N-n; + } + if (n) { + if (*s-0x80u >= 0x40) goto ilseq; + goto loop; + } + } + + *(unsigned *)st = c; + return -2; +ilseq: + *(unsigned *)st = 0; + errno = EILSEQ; + return -1; +} diff --git a/src/multibyte/wcrtomb.c b/src/multibyte/wcrtomb.c index 59f733d..0c7d0f0 100644 --- a/src/multibyte/wcrtomb.c +++ b/src/multibyte/wcrtomb.c @@ -6,6 +6,7 @@ #include #include +#include "locale_impl.h" size_t wcrtomb(char *restrict s, wchar_t wc, mbstate_t *restrict st) { @@ -13,6 +14,13 @@ size_t wcrtomb(char *restrict s, wchar_t wc, mbstate_t *restrict st) if ((unsigned)wc < 0x80) { *s = wc; return 1; + } else if (MB_CUR_MAX == 
1) { + if ((unsigned)wc - 0x10f080 >= 0x80) { + errno = EILSEQ; + return -1; + } + *s = wc; + return 1; } else if ((unsigned)wc < 0x800) { *s++ = 0xc0 | (wc>>6); *s = 0x80 | (wc&0x3f); diff --git a/src/multibyte/wctob.c b/src/multibyte/wctob.c index d6353ee..c365016 100644 --- a/src/multibyte/wctob.c +++ b/src/multibyte/wctob.c @@ -1,8 +1,10 @@ #include #include +#include "locale_impl.h" int wctob(wint_t c) { if (c < 128U) return c; + if (MB_CUR_MAX == 1 && c-0x10f080U < 128) return (unsigned char)c; return EOF; } diff --git a/src/multibyte/wctoraw8.c b/src/multibyte/wctoraw8.c new file mode 100644 index 0000000..24c4d8c --- /dev/null +++ b/src/multibyte/wctoraw8.c @@ -0,0 +1,13 @@ +#include +#include + +int __wctoraw8(wchar_t wc) +{ + if ((unsigned)wc < 0x80) { + return wc; + } else if ((unsigned)wc - 0x10f080 < 0x80) { + return wc - 0x10f000; + } + errno = EILSEQ; + return -1; +} diff --git a/src/multibyte/wctoutf8.c b/src/multibyte/wctoutf8.c new file mode 100644 index 0000000..bf05783 --- /dev/null +++ b/src/multibyte/wctoutf8.c @@ -0,0 +1,33 @@ +/* + * This code was written by Rich Felker in 2010; no copyright is claimed. + * This code is in the public domain. Attribution is appreciated but + * unnecessary. 
+ */ + +#include +#include + +size_t __wctoutf8(char * s, wchar_t wc) +{ + if ((unsigned)wc < 0x80) { + *s = wc; + return 1; + } else if ((unsigned)wc < 0x800) { + *s++ = 0xc0 | (wc>>6); + *s = 0x80 | (wc&0x3f); + return 2; + } else if ((unsigned)wc < 0xd800 || (unsigned)wc-0xe000 < 0x2000) { + *s++ = 0xe0 | (wc>>12); + *s++ = 0x80 | ((wc>>6)&0x3f); + *s = 0x80 | (wc&0x3f); + return 3; + } else if ((unsigned)wc-0x10000 < 0x100000) { + *s++ = 0xf0 | (wc>>18); + *s++ = 0x80 | ((wc>>12)&0x3f); + *s++ = 0x80 | ((wc>>6)&0x3f); + *s = 0x80 | (wc&0x3f); + return 4; + } + errno = EILSEQ; + return -1; +} diff --git a/src/stdio/fgetwc.c b/src/stdio/fgetwc.c index 8626d54..b3f10f9 100644 --- a/src/stdio/fgetwc.c +++ b/src/stdio/fgetwc.c @@ -2,6 +2,8 @@ #include #include +size_t __utf8rtowc(wchar_t *, const char *, size_t, mbstate_t *); + wint_t __fgetwc_unlocked(FILE *f) { mbstate_t st = { 0 }; @@ -10,7 +12,14 @@ wint_t __fgetwc_unlocked(FILE *f) unsigned char b; size_t l; - f->mode |= f->mode+1; + if (!f->utf8) { + fwide(f, 1); + if (!f->utf8) { + c = getc_unlocked(f); + if (c >= 128) c += 0x10f000; + return c; + } + } /* Convert character from buffer if possible */ if (f->rpos < f->rend) { diff --git a/src/stdio/fputwc.c b/src/stdio/fputwc.c index 7b621dd..dfb9b8d 100644 --- a/src/stdio/fputwc.c +++ b/src/stdio/fputwc.c @@ -3,14 +3,22 @@ #include #include +int __wctoraw8(wchar_t); + wint_t __fputwc_unlocked(wchar_t c, FILE *f) { char mbc[MB_LEN_MAX]; int l; - f->mode |= f->mode+1; + if (!f->utf8) { + fwide(f, 1); + if (!f->utf8) { + c = __wctoraw8(c); + return c<0 ? 
WEOF : putc_unlocked(c, f); + } + } - if (isascii(c)) { + if (c < 128U) { c = putc_unlocked(c, f); } else if (f->wpos + MB_LEN_MAX < f->wend) { l = wctomb((void *)f->wpos, c); diff --git a/src/stdio/fwide.c b/src/stdio/fwide.c index 8088e7a..0ebaff4 100644 --- a/src/stdio/fwide.c +++ b/src/stdio/fwide.c @@ -1,5 +1,6 @@ #include #include "stdio_impl.h" +#include "locale_impl.h" #define SH (8*sizeof(int)-1) #define NORMALIZE(x) ((x)>>SH | -((-(x))>>SH)) @@ -7,7 +8,10 @@ int fwide(FILE *f, int mode) { FLOCK(f); - if (!f->mode) f->mode = NORMALIZE(mode); + if (!f->mode && mode) { + f->mode = NORMALIZE(mode); + f->utf8 = (MB_CUR_MAX > 1); + } mode = f->mode; FUNLOCK(f); return mode; diff --git a/src/stdio/ungetwc.c b/src/stdio/ungetwc.c index 8cc85a6..2b6137e 100644 --- a/src/stdio/ungetwc.c +++ b/src/stdio/ungetwc.c @@ -4,6 +4,8 @@ #include #include +int __wctoraw8(wchar_t); + wint_t ungetwc(wint_t c, FILE *f) { unsigned char mbc[MB_LEN_MAX]; @@ -11,20 +13,25 @@ wint_t ungetwc(wint_t c, FILE *f) if (c == WEOF) return c; - /* Try conversion early so we can fail without locking if invalid */ - if (!isascii(c) && (l = wctomb((void *)mbc, c)) < 0) - return WEOF; - FLOCK(f); - f->mode |= f->mode+1; + if (!f->utf8) { + fwide(f, 1); + if (!f->utf8) { + c = __wctoraw8(c); + c = ungetc(c, f); + FUNLOCK(f); + return c; + } + } - if ((!f->rend && __toread(f)) || f->rpos < f->buf - UNGET + l) { + if ((c>=128U && (l = wctomb((void *)mbc, c)) < 0) || + (!f->rend && __toread(f)) || f->rpos < f->buf - UNGET + l) { FUNLOCK(f); return EOF; } - if (isascii(c)) *--f->rpos = c; + if (c<128U) *--f->rpos = c; else memcpy(f->rpos -= l, mbc, l); f->flags &= ~F_EOF; diff --git a/src/stdio/vfwprintf.c b/src/stdio/vfwprintf.c index c640059..ec0565d 100644 --- a/src/stdio/vfwprintf.c +++ b/src/stdio/vfwprintf.c @@ -355,7 +355,7 @@ int vfwprintf(FILE *restrict f, const wchar_t *restrict fmt, va_list ap) } FLOCK(f); - f->mode |= f->mode+1; + fwide(f, 1); ret = wprintf_core(f, fmt, &ap2, nl_arg, 
nl_type); FUNLOCK(f); va_end(ap2); diff --git a/src/stdio/vfwscanf.c b/src/stdio/vfwscanf.c index ac5c2c2..223aad4 100644 --- a/src/stdio/vfwscanf.c +++ b/src/stdio/vfwscanf.c @@ -104,7 +104,7 @@ int vfwscanf(FILE *restrict f, const wchar_t *restrict fmt, va_list ap) FLOCK(f); - f->mode |= f->mode+1; + fwide(f, 1); for (p=fmt; *p; p++) {