/*
 * Copyright 2001-2004 Unicode, Inc.
 *
 * Disclaimer
 *
 * This source code is provided as is by Unicode, Inc. No claims are
 * made as to fitness for any particular purpose. No warranties of any
 * kind are expressed or implied. The recipient agrees to determine
 * applicability of information provided. If this file has been
 * purchased on magnetic or optical media from Unicode, Inc., the
 * sole remedy for any claim will be exchange of defective media
 * within 90 days of receipt.
 *
 * Limitations on Rights to Redistribute This Code
 *
 * Unicode, Inc. hereby grants the right to freely use the information
 * supplied in this file in the creation of products supporting the
 * Unicode Standard, and to make copies of this file in any form
 * for internal or external distribution as long as this notice
 * remains attached.
 */

/* ---------------------------------------------------------------------

    Conversion from UTF-8 to UTF-16.  Source code file.
    Stripped and modified for John the Ripper ; see ConvertUTF.c.original
    for the original content. Magnum, 2009

    Author: Mark E. Davis, 1994.
    Rev History: Rick McGowan, fixes & updates May 2001.
    Sept 2001: fixed const & error conditions per
        mods suggested by S. Parent & A. Lillich.
    June 2002: Tim Dodd added detection and handling of incomplete
        source sequences, enhanced error detection, added casts
        to eliminate compiler warnings.
    July 2003: slight mods to back out aggressive FFFE detection.
    Jan 2004: updated switches in from-UTF8 conversions.
    Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.

    See the header file "ConvertUTF.h" for complete documentation.

------------------------------------------------------------------------ */

#include "ConvertUTF.h"

/*
 * Fallback definitions of uint16/int16 for builds where nothing else
 * provided them.  When short is not a 16-bit type the macro expands to
 * an undeclared identifier so that any use fails to compile.
 */
#if !defined(uint16) && !defined(HAVE_UINT16_FROM_RPC_RPC_H)
#if (SIZEOF_SHORT == 4)
#define uint16 __ERROR___CANNOT_DETERMINE_TYPE_FOR_INT16;
#else /* SIZEOF_SHORT != 4 */
#define uint16 unsigned short
#endif /* SIZEOF_SHORT != 4 */
#endif

#if !defined(int16) && !defined(HAVE_INT16_FROM_RPC_RPC_H)
#if (SIZEOF_SHORT == 4)
#define int16 __ERROR___CANNOT_DETERMINE_TYPE_FOR_INT16;
#else /* SIZEOF_SHORT != 4 */
#define int16 short
#endif /* SIZEOF_SHORT != 4 */
#endif

#include <string.h>	/* strlen() */
#include "arch.h"
#include "byteorder.h"

#if !defined (NOT_JOHN)
#include "options.h"
#else
/* Stand-alone build: minimal stand-in for the options object. */
struct opts {
	int flags;
};
#define FLG_UTF8 1
struct opts options;
#endif

static const int halfShift = 10; /* used for shifting by 10 bits */

static const UTF32 halfBase = 0x0010000UL;
static const UTF32 halfMask = 0x3FFUL;

#define UNI_SUR_HIGH_START (UTF32)0xD800
#define UNI_SUR_LOW_START (UTF32)0xDC00
#define UNI_SUR_LOW_END (UTF32)0xDFFF

/*
 * Index into the table below with the low six bits of the first byte of
 * a multi-byte UTF-8 sequence (a lead byte >= 0xC0, i.e. "lead & 0x3F")
 * to get the number of trailing bytes that are supposed to follow it.
 * Note that *legal* UTF-8 values can't have 4 or 5 trailing bytes. The
 * table is left as-is for anyone who may want to do such conversion,
 * which was allowed in earlier algorithms.
 */
static const char trailingBytesForUTF8[64] = {
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};

/*
 * Magic values subtracted from a buffer value during UTF8 conversion.
 * Indexed by the number of trailing bytes in the sequence.  Only 0..3
 * trailing bytes have an entry; the 4 and 5 values that
 * trailingBytesForUTF8 can yield must never be used as an index here.
 */
static const UTF32 offsetsFromUTF8[4] = {
	0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL
};

// Convert a string into an NT UNICODE string. Note that regardless of processor type
// this must be in intel (little-endian) format.
// // v3 has 'size checking, and other error checking' int utf8towcs2(UTF16 *target, int len, const UTF8 *source, int sourceLen) { const UTF16 *targetStart, *bufferEnd; const UTF8 *sourceStart, *sourceEnd; unsigned extraBytesToRead; UTF32 ch; int success; if (!sourceLen) { *target = 0; return 0; } targetStart = target; bufferEnd = target + len; sourceStart = source; sourceEnd = source + sourceLen; success = 1; while (success) { if (*source < 0xC0) { #if (ARCH_LITTLE_ENDIAN==1) *target++ = (UTF16)*source++; #else UTF8 val = *src++; SSVAL(target,0,val); ++target; #endif if (*source == 0) break; if (target >= bufferEnd) { success = 0; break; } continue; } #if (0) ch = 0; extraBytesToRead = trailingBytesForUTF8[*source & 0x3F]; if (&source[extraBytesToRead] >= sourceEnd) { // for john, simply null terminate the uft16 buffer, and return; // the number of utf8 bytes PRIOR to this. This is a 'broken' // character. We throw it away, and proceed with what we have. success = 0; break; } // The cases fall through. switch (extraBytesToRead) { case 3: ch += *source++; ch <<= 6; case 2: ch += *source++; ch <<= 6; case 1: ch += *source++; ch <<= 6; case 0: ch += *source++; break; default: success = 0; break; } #else // this is a touch faster. ch = *source; extraBytesToRead = trailingBytesForUTF8[ch & 0x3F]; if (&source[extraBytesToRead] >= sourceEnd) { // for john, simply null terminate the uft16 buffer, and return; // the number of utf8 bytes PRIOR to this. This is a 'broken' // character. We throw it away, and proceed with what we have. success = 0; break; } // The cases fall through. 
switch (extraBytesToRead) { case 3: ch <<= 6; ch += *++source; case 2: ch <<= 6; ch += *++source; case 1: ch <<= 6; ch += *++source; case 0: ++source; break; default: break; } #endif ch -= offsetsFromUTF8[extraBytesToRead]; #if (ARCH_LITTLE_ENDIAN==1) *target++ = (UTF16)ch; /* normal case */ #else SSVAL(target,0,ch); ++target; #endif if (*source == 0) break; if (target >= bufferEnd) { success = 0; break; } } *target = 0; // Null-terminate if (!success) return -1*(source-sourceStart); return (target-targetStart); } // v1 It does not have source length, so simply calls v2 with strlen to get source length int utf8towcs (UTF16 *target, const UTF8 *source, int len) { return utf8towcs2(target, len, source, strlen(source)); } // Convert a string into an NT UNICODE string. Note that regardless of processor type // this must be in intel (little-endian) format. // // This version converts from UTF-8 if the --utf8 option was given to John // and from ISO-8859-1 otherwise. The latter is a little faster. // int plaintowcs(UTF16 *dst, UTF8 *src, int len) { if (!(options.flags & FLG_UTF8)) { // Convert from ISO-8859-1 int i; for(i = 0; i < len; i++) { #if (ARCH_LITTLE_ENDIAN==1) *dst++ = (UTF16)*src++; #else UTF8 val = *src++; SSVAL(dst,0,val); ++dst; #endif } dst[i] = 0; return i; } else { // Convert from UTF-8 return utf8towcs2(dst, len, src, strlen(src)); } }