/*
 * Copyright 2001-2004 Unicode, Inc.
 * 
 * Disclaimer
 * 
 * This source code is provided as is by Unicode, Inc. No claims are
 * made as to fitness for any particular purpose. No warranties of any
 * kind are expressed or implied. The recipient agrees to determine
 * applicability of information provided. If this file has been
 * purchased on magnetic or optical media from Unicode, Inc., the
 * sole remedy for any claim will be exchange of defective media
 * within 90 days of receipt.
 * 
 * Limitations on Rights to Redistribute This Code
 * 
 * Unicode, Inc. hereby grants the right to freely use the information
 * supplied in this file in the creation of products supporting the
 * Unicode Standard, and to make copies of this file in any form
 * for internal or external distribution as long as this notice
 * remains attached.
 */

/* ---------------------------------------------------------------------

    Conversion from UTF-8 to UTF-16.  Source code file.

	Stripped and modified for John the Ripper ; see ConvertUTF.c.original
	for the original content. Magnum, 2009

    Author: Mark E. Davis, 1994.
    Rev History: Rick McGowan, fixes & updates May 2001.
    Sept 2001: fixed const & error conditions per
	mods suggested by S. Parent & A. Lillich.
    June 2002: Tim Dodd added detection and handling of incomplete
	source sequences, enhanced error detection, added casts
	to eliminate compiler warnings.
    July 2003: slight mods to back out aggressive FFFE detection.
    Jan 2004: updated switches in from-UTF8 conversions.
    Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.

    See the header file "ConvertUTF.h" for complete documentation.


------------------------------------------------------------------------ */

#include "ConvertUTF.h"
/*
 * Fallback names for 16-bit integer types.
 *
 * Each name is only defined here when no macro of that name exists yet and
 * the matching HAVE_*_FROM_RPC_RPC_H guard is not set.
 * NOTE(review): presumably these names are consumed by the macros in
 * byteorder.h, which is included further down -- confirm.
 *
 * If SIZEOF_SHORT is 4, the name expands to an undeclared identifier so that
 * any use of the type fails to compile. An undefined SIZEOF_SHORT evaluates
 * to 0 inside #if, so in that case the short-based definition is selected.
 */
#if !defined(uint16) && !defined(HAVE_UINT16_FROM_RPC_RPC_H)
#if (SIZEOF_SHORT == 4)
/* The identifier says INT16 although this is the unsigned variant. */
#define uint16 __ERROR___CANNOT_DETERMINE_TYPE_FOR_INT16;
#else /* SIZEOF_SHORT != 4 */
#define uint16 unsigned short
#endif /* SIZEOF_SHORT != 4 */
#endif

/* Signed counterpart; same selection logic as for uint16 above. */
#if !defined(int16) && !defined(HAVE_INT16_FROM_RPC_RPC_H)
#if (SIZEOF_SHORT == 4)
#define int16 __ERROR___CANNOT_DETERMINE_TYPE_FOR_INT16;
#else /* SIZEOF_SHORT != 4 */
#define int16 short
#endif /* SIZEOF_SHORT != 4 */
#endif

#include <string.h>
#include "arch.h"
#include "byteorder.h"
/*
 * plaintowcs() tests options.flags against FLG_UTF8.
 * In a normal build options.h is included (presumably the source of both
 * names -- confirm). When NOT_JOHN is defined, a minimal stand-in is
 * provided instead so this file can be built on its own: a struct with only
 * the flags member, FLG_UTF8 as bit 0, and a file-scope `options` object
 * (zero-initialized, so FLG_UTF8 is clear unless the user sets it).
 */
#if !defined (NOT_JOHN)
#include "options.h"
#else
struct opts { int flags; };
#define FLG_UTF8 1
struct opts options;
#endif

/*
 * Constants for UTF-16 surrogate-pair arithmetic: a code point above U+FFFF
 * has halfBase subtracted, and the result is split into two 10-bit halves.
 */
static const int halfShift  = 10; /* used for shifting by 10 bits */

static const UTF32 halfBase = 0x0010000UL; /* 0x10000: first code point above the 16-bit range */
static const UTF32 halfMask = 0x3FFUL;     /* selects the low 10 bits */

/* Boundaries of the UTF-16 surrogate ranges (high: D800.., low: DC00..DFFF). */
#define UNI_SUR_HIGH_START  (UTF32)0xD800
#define UNI_SUR_LOW_START   (UTF32)0xDC00
#define UNI_SUR_LOW_END     (UTF32)0xDFFF

/*
 * Number of trailing bytes that are supposed to follow a UTF-8 lead byte.
 *
 * The table only covers lead bytes 0xC0..0xFF and is indexed with
 * (lead & 0x3F); bytes below 0xC0 must be handled by the caller:
 *   0xC0..0xDF -> 1,  0xE0..0xEF -> 2,  0xF0..0xF7 -> 3,
 *   0xF8..0xFB -> 4,  0xFC..0xFF -> 5
 *
 * Note that *legal* UTF-8 sequences cannot have 4 or 5 trailing bytes. Those
 * entries are left as-is for anyone who may want to do such conversion,
 * which was allowed in earlier algorithms.
 */
static const char trailingBytesForUTF8[64] = {
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};

/*
 * Magic values subtracted from a buffer value during UTF8 conversion.
 *
 * Entry n is the sum of the marker bits that end up in the accumulator when
 * a lead byte and n trailing bytes are combined with 6-bit shifts, e.g.
 * entry 1 is (0xC0 << 6) + 0x80 and entry 2 is
 * (0xE0 << 12) + (0x80 << 6) + 0x80.
 *
 * Only indices 0..3 exist. trailingBytesForUTF8 can yield 4 or 5, so a
 * trailing-byte count must be checked to be <= 3 before indexing this table.
 */
static const UTF32 offsetsFromUTF8[4] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL };


// Convert a string into an NT UNICODE string. Note that regardless of processor type 
// this must be in intel (little-endian) format.
//
// v3 has 'size checking, and other error checking'
int utf8towcs2(UTF16 *target, int len, const UTF8 *source, int sourceLen) {
	const UTF16 *targetStart, *bufferEnd;
	const UTF8  *sourceStart, *sourceEnd;
	unsigned extraBytesToRead;
	UTF32 ch;
	int success;

	if (!sourceLen) {
		*target = 0;
		return 0;
	}

	targetStart = target;
	bufferEnd = target + len;
	sourceStart = source;
	sourceEnd = source + sourceLen;
	success = 1;

	while (success) {
		if (*source < 0xC0) {
#if (ARCH_LITTLE_ENDIAN==1)
			*target++ = (UTF16)*source++;
#else
			UTF8 val = *src++;
			SSVAL(target,0,val);
			++target;
#endif
			if (*source == 0)
				break;
			if (target >= bufferEnd) {
				success = 0;
				break;
			}
			continue;
		}
#if (0)
		ch = 0;
		extraBytesToRead = trailingBytesForUTF8[*source & 0x3F];
		if (&source[extraBytesToRead] >= sourceEnd) {
			// for john, simply null terminate the uft16 buffer, and return;
			// the number of utf8 bytes PRIOR to this.  This is a 'broken'
			// character. We throw it away, and proceed with what we have.
			success = 0;
			break;
		}
		// The cases fall through.
		switch (extraBytesToRead) {
			case 3: ch += *source++; ch <<= 6;
			case 2: ch += *source++; ch <<= 6;
			case 1: ch += *source++; ch <<= 6;
			case 0: ch += *source++; break;
			default: success = 0; break;
		}
#else
		// this is a touch faster.
		ch = *source;
		extraBytesToRead = trailingBytesForUTF8[ch & 0x3F];
		if (&source[extraBytesToRead] >= sourceEnd) {
			// for john, simply null terminate the uft16 buffer, and return;
			// the number of utf8 bytes PRIOR to this.  This is a 'broken'
			// character. We throw it away, and proceed with what we have.
			success = 0;
			break;
		}
		// The cases fall through.
		switch (extraBytesToRead) {
			case 3: ch <<= 6; ch += *++source;
			case 2: ch <<= 6; ch += *++source;
			case 1: ch <<= 6; ch += *++source;
			case 0: ++source; break;
			default: break;
		}
#endif
		ch -= offsetsFromUTF8[extraBytesToRead];

#if (ARCH_LITTLE_ENDIAN==1)
		*target++ = (UTF16)ch; /* normal case */
#else
		SSVAL(target,0,ch);
		++target;
#endif
		if (*source == 0)
			break;
		if (target >= bufferEnd) {
			success = 0;
			break;
		}
	}
	*target = 0; // Null-terminate
	if (!success)
		return -1*(source-sourceStart);
	return (target-targetStart);
}

// v1: takes no source length, so it simply calls utf8towcs2() with the
// strlen() of the (NUL-terminated) source. len is the maximum number of
// UTF-16 code units to store; the return value is that of utf8towcs2().
int utf8towcs (UTF16 *target, const UTF8 *source, int len) {
	/* strlen() takes const char * and returns size_t; make both
	 * conversions explicit instead of relying on implicit ones. */
	return utf8towcs2(target, len, source, (int)strlen((const char *)source));
}


// Convert a string into an NT UNICODE string. Note that regardless of processor type
// this must be in intel (little-endian) format.
//
// This version converts from UTF-8 if the --utf8 option was given to John
// and from ISO-8859-1 otherwise. The latter is a little faster.
//
// ISO-8859-1: exactly len bytes of src are widened to 16 bits, a terminating
// 0 is appended and len is returned (0 if len is not positive). dst needs
// room for len + 1 code units.
// UTF-8: src must be NUL-terminated; len is handed to utf8towcs2() as the
// maximum number of code units to store and its return value is passed on.
//
int plaintowcs(UTF16 *dst, UTF8 *src, int len)
{
	int i;

	if (options.flags & FLG_UTF8) // Convert from UTF-8
		return utf8towcs2(dst, len, src, (int)strlen((const char *)src));

	// Convert from ISO-8859-1
	for (i = 0; i < len; i++) {
#if (ARCH_LITTLE_ENDIAN==1)
		*dst++ = (UTF16)*src++;
#else
		UTF8 val = *src++;
		SSVAL(dst,0,val);
		++dst;
#endif
	}
	// dst already points just past the last stored unit, so terminate
	// right here rather than at an offset from it.
	*dst = 0;
	return i;
}