/*
								+----------------------------------+
								|                                  |
								|   ***  Unicode conversion  ***   |
								|                                  |
								|   Copyright  -tHE SWINe- 2008   |
								|                                  |
								|           UniConv.cpp            |
								|                                  |
								+----------------------------------+
*/

/*
 *	2009-07-09
 *
 *	fixed unicode mappings url (http://www.unicode.org/Public/MAPPINGS/)
 *
 *	added more complete list of conversions between 8-bit, UTF-8, UTF-16
 *	(LE or BE) and UTF-32 to CUnicodeConversion
 *
 *	added alias CUniConv for CUnicodeConversion
 *
 *	2009-09-13
 *
 *	changed CUnicodeMapping::TCharacterMapping::n_character to unsigned (signed
 *	caused most encodings to fail working with character codes above 128,
 *	CUnicodeMapping::n_FromUnicode() and CUnicodeMapping::FromUnicode() functions
 *	were affected by this change)
 *
 *	2009-10-11
 *
 *	changed type of input data from const uint8_t* to const void* in some of
 *	CUniConv routines (convenience, do not have to type-cast anymore). functionality
 *	remains unchanged.
 *
 *	2009-10-20
 *
 *	fixed some warnings when compiling under VC 2005, implemented "Security
 *	Enhancements in the CRT " for VC 2008. compare against MyProjects_2009-10-19_
 *
 */

#include "NewFix.h"
#include "CallStack.h"
#include <stdio.h>
#include <string>
#include <algorithm>
#include "Integer.h"
#include "StlUtils.h"
#include "MinMax.h"
#include "UniConv.h"

/*
 *								=== CUniConv ===
 */

/*
 *	static int CUniConv::n_Decode_UTF16(const void *p_data,
 *		int n_size, std::string &r_s_string, bool b_allow_bom,
 *		bool b_expect_little_endian = true)
 *		- converts buffer p_data containing n_size bytes of UTF-16 encoded string
 *		  to 8-bit characters (codes 0 to 255 are stored as they are, codes above
 *		  255 are replaced by '?') and puts result to r_s_string
 *		- if n_size is odd, the trailing byte is ignored
 *		- if b_allow_bom is set, a leading BOM (byte-order-mark) is consumed and
 *		  selects the byte order (it is counted as read, but is not output);
 *		  a buffer containing nothing but the BOM yields an empty string
 *		- in case BOM is disabled or not present, it depends on value of
 *		  b_expect_little_endian whether little or big endian is used
 *		- note two consecutive null bytes (a null code unit) are considered end
 *		  of the string (they are counted as read, but are not output to r_s_string)
 *		- returns number of bytes read from the buffer or -1 on error (space for
 *		  the output could not be reserved, lonely or truncated surrogate)
 */
int CUniConv::n_Decode_UTF16(const void *__p_data, int n_size,
	std::string &r_s_string, bool b_allow_bom, bool b_expect_little_endian)
{
	const uint8_t *p_data = (const uint8_t*)__p_data;

	n_size &= ~1;
	// only whole 16-bit code units are processed

	r_s_string.erase();
	if(!stl_ut::Reserve_N(r_s_string, n_size / 2))
		return -1;
	// one output character per code unit is the upper bound

	int n_read = 0;
	// number of bytes consumed so far

	bool b_little_endian = b_expect_little_endian;
	// default endianness, may be overriden by BOM

	if(b_allow_bom && n_size >= 2) {
		const bool b_bom_le = p_data[0] == 0xff && p_data[1] == 0xfe;
		const bool b_bom_be = p_data[0] == 0xfe && p_data[1] == 0xff;
		if(b_bom_le || b_bom_be) {
			b_little_endian = b_bom_le;
			p_data += 2;
			n_size -= 2;
			n_read = 2;
		}
	}
	// consume BOM (if present and allowed); the test is ">= 2" so that a buffer
	// holding just the BOM decodes to an empty string and not to a stray '?'

	const int n_hi = (b_little_endian)? 1 : 0;
	const int n_lo = 1 - n_hi;
	// positions of the high and low byte inside a code unit

	const uint8_t *p_end = p_data + n_size;
	for(const uint8_t *p_char = p_data; p_char < p_end;) {
		if(!p_char[0] && !p_char[1]) {
			n_read += 2;
			break;
		}
		// explicit terminator: counted as read, not output

		int n_code = (p_char[n_hi] << 8) | p_char[n_lo];
		p_char += 2;
		n_read += 2;
		// read code unit

		const int n_class = n_code >> 10;
		if(n_class == 0x37)
			return -1; // lonely low surrogate
		if(n_class == 0x36) {
			if(p_char >= p_end)
				return -1; // high surrogate at the end of data
			const int n_code2 = (p_char[n_hi] << 8) | p_char[n_lo];
			if((n_code2 >> 10) != 0x37)
				return -1; // high surrogate not followed by low one
			p_char += 2;
			n_read += 2;

			n_code = (((n_code & 0x3ff) << 10) | (n_code2 & 0x3ff)) + 0x10000;
		}
		// combine surrogate pairs; from here n_code is either a non-surrogate
		// 16-bit value or lies in 0x10000 - 0x10ffff, no further check is needed

		r_s_string += char((n_code > 0xff)? '?' : n_code);
		// characters that do not fit in 8 bits are replaced
	}

	return n_read;
}

/*
 *	static int CUniConv::n_Decode_UTF16(const void *p_data, int n_size,
 *		std::string &r_s_string, const CUnicodeMapping &r_map, bool b_allow_bom,
 *		bool b_expect_little_endian = true)
 *		- converts buffer p_data containing n_size bytes of UTF-16 encoded string
 *		  to 8-bit charset, given by mapping r_map and puts result to r_s_string
 *		- every decoded character is translated by r_map.n_FromUnicode(); the function
 *		  fails if that returns a negative value (according to the original notes
 *		  this happens for characters without mapping, unless a substitute character
 *		  is set in r_map)
 *		- if n_size is odd, the trailing byte is ignored
 *		- if b_allow_bom is set, a leading BOM (byte-order-mark) is consumed and
 *		  selects the byte order (it is counted as read, but is not output);
 *		  a buffer containing nothing but the BOM yields an empty string
 *		- in case BOM is disabled or not present, it depends on value of
 *		  b_expect_little_endian whether little or big endian is used
 *		- note two consecutive null bytes (a null code unit) are considered end
 *		  of the string (they are counted as read, but are not output to r_s_string)
 *		- returns number of bytes read from the buffer or -1 on error
 */
int CUniConv::n_Decode_UTF16(const void *__p_data, int n_size, std::string &r_s_string,
	const CUnicodeMapping &r_map, bool b_allow_bom, bool b_expect_little_endian)
{
	const uint8_t *p_data = (const uint8_t*)__p_data;

	n_size &= ~1;
	// only whole 16-bit code units are processed

	r_s_string.erase();
	if(!stl_ut::Reserve_N(r_s_string, n_size / 2))
		return -1;
	// one output character per code unit is the upper bound

	int n_read = 0;
	// number of bytes consumed so far

	bool b_little_endian = b_expect_little_endian;
	// default endianness, may be overriden by BOM

	if(b_allow_bom && n_size >= 2) {
		const bool b_bom_le = p_data[0] == 0xff && p_data[1] == 0xfe;
		const bool b_bom_be = p_data[0] == 0xfe && p_data[1] == 0xff;
		if(b_bom_le || b_bom_be) {
			b_little_endian = b_bom_le;
			p_data += 2;
			n_size -= 2;
			n_read = 2;
		}
	}
	// consume BOM (if present and allowed); the test is ">= 2" so that a buffer
	// holding just the BOM decodes to an empty string instead of being
	// passed to the mapping as character 0xfeff

	const int n_hi = (b_little_endian)? 1 : 0;
	const int n_lo = 1 - n_hi;
	// positions of the high and low byte inside a code unit

	const uint8_t *p_end = p_data + n_size;
	for(const uint8_t *p_char = p_data; p_char < p_end;) {
		if(!p_char[0] && !p_char[1]) {
			n_read += 2;
			break;
		}
		// explicit terminator: counted as read, not output

		int n_code = (p_char[n_hi] << 8) | p_char[n_lo];
		p_char += 2;
		n_read += 2;
		// read code unit

		const int n_class = n_code >> 10;
		if(n_class == 0x37)
			return -1; // lonely low surrogate
		if(n_class == 0x36) {
			if(p_char >= p_end)
				return -1; // high surrogate at the end of data
			const int n_code2 = (p_char[n_hi] << 8) | p_char[n_lo];
			if((n_code2 >> 10) != 0x37)
				return -1; // high surrogate not followed by low one
			p_char += 2;
			n_read += 2;

			n_code = (((n_code & 0x3ff) << 10) | (n_code2 & 0x3ff)) + 0x10000;
		}
		// combine surrogate pairs; from here n_code is either a non-surrogate
		// 16-bit value or lies in 0x10000 - 0x10ffff, no further check is needed

		n_code = r_map.n_FromUnicode(n_code);
		if(n_code < 0)
			return -1;
		// translate to 8-bit charset

		r_s_string += char(n_code);
	}

	return n_read;
}

/*
 *	static int CUniConv::n_Decode_UTF8(const void *p_data,
 *		int n_size, std::string &r_s_string, bool b_allow_bom)
 *		- converts buffer p_data containing n_size bytes of UTF-8 encoded string
 *		  to 8-bit characters (codes 0 to 255 are stored as they are, codes above
 *		  255 are replaced by '?') and puts result to r_s_string
 *		- if b_allow_bom is set, a leading UTF-8 BOM is skipped (it is counted
 *		  as read, but is not output)
 *		- note null character is considered explicit end of the string
 *		  (it is counted as read, but is not part of r_s_string)
 *		- returns number of bytes read from the buffer or -1 on error (space for
 *		  the output could not be reserved, invalid lead or continuation byte,
 *		  truncated sequence, overlong encoding, encoded UTF-16 surrogate
 *		  or code above 0x10ffff)
 */
int CUniConv::n_Decode_UTF8(const void *__p_data,
	int n_size, std::string &r_s_string, bool b_allow_bom)
{
	const uint8_t *p_data = (const uint8_t*)__p_data;

	r_s_string.erase();
	if(!stl_ut::Reserve_N(r_s_string, n_size))
		return -1;
	// one output character per input byte is the upper bound

	int n_read = 0;
	// number of bytes consumed so far

	if(b_allow_bom && n_size >= 3 && p_data[0] == 0xef &&
	   p_data[1] == 0xbb && p_data[2] == 0xbf) {
		p_data += 3;
		n_size -= 3;
		n_read = 3;
	}
	// skip UTF-8 BOM

	const uint8_t *p_end = p_data + n_size;
	for(const uint8_t *p_char = p_data; p_char < p_end;) {
		const uint8_t n_byte0 = *p_char;
		if(!n_byte0) {
			++ n_read;
			break;
		}
		// explicit terminator: counted as read, not output

		int n_code = n_byte0;
		int n_char_size = 1;
		// single-byte character unless the high bit says otherwise

		if(n_byte0 & 0x80) {
			int n_min_code;
			if((n_byte0 & 0xe0) == 0xc0) {
				n_char_size = 2;
				n_min_code = 0x80;
			} else if((n_byte0 & 0xf0) == 0xe0) {
				n_char_size = 3;
				n_min_code = 0x800;
			} else if((n_byte0 & 0xf8) == 0xf0) {
				n_char_size = 4;
				n_min_code = 0x10000;
			} else
				return -1;
			// size of the sequence and the smallest code that needs that size

			if(p_end - p_char < n_char_size)
				return -1;
			// have enough data?

			n_code = n_byte0 & (0x7f >> n_char_size);
			for(int i = 1; i < n_char_size; ++ i) {
				const uint8_t n_byte = p_char[i];
				if((n_byte & 0xc0) != 0x80)
					return -1;
				n_code = (n_code << 6) | (n_byte & 0x3f);
			}
			// payload bits of the lead byte followed by 6 bits per continuation byte

			if(n_code < n_min_code)
				return -1;
			// overlong encodings are not valid UTF-8 (they would e.g. allow
			// smuggling a null character past the terminator check above)

			if((n_code >= 0xd800 && n_code <= 0xdfff) || n_code > 0x10ffff)
				return -1;
			// check if code is valid character (UTF-16 surrogates not allowed)
		}
		// decode utf-8 char

		p_char += n_char_size;
		n_read += n_char_size;

		r_s_string += char((n_code > 0xff)? '?' : n_code);
		// characters that do not fit in 8 bits are replaced
	}

	return n_read;
}

/*
 *	static int CUniConv::n_Decode_UTF8(const void *p_data, int n_size,
 *		std::string &r_s_string, const CUnicodeMapping &r_map, bool b_allow_bom)
 *		- converts buffer p_data containing n_size bytes of UTF-8 encoded string
 *		  to 8-bit charset, given by mapping r_map and puts result to r_s_string
 *		- every decoded character is translated by r_map.n_FromUnicode(); the function
 *		  fails if that returns a negative value (according to the original notes
 *		  this happens for characters without mapping, unless a substitute character
 *		  is set in r_map)
 *		- if b_allow_bom is set, a leading UTF-8 BOM is skipped (it is counted
 *		  as read, but is not output)
 *		- note null character is considered explicit end of the string
 *		  (it is counted as read, but is not part of r_s_string)
 *		- returns number of bytes read from the buffer or -1 on error (space for
 *		  the output could not be reserved, invalid lead or continuation byte,
 *		  truncated sequence, overlong encoding, encoded UTF-16 surrogate,
 *		  code above 0x10ffff or failed translation)
 */
int CUniConv::n_Decode_UTF8(const void *__p_data, int n_size,
	std::string &r_s_string, const CUnicodeMapping &r_map, bool b_allow_bom)
{
	const uint8_t *p_data = (const uint8_t*)__p_data;

	r_s_string.erase();
	if(!stl_ut::Reserve_N(r_s_string, n_size))
		return -1;
	// one output character per input byte is the upper bound

	int n_read = 0;
	// number of bytes consumed so far

	if(b_allow_bom && n_size >= 3 && p_data[0] == 0xef &&
	   p_data[1] == 0xbb && p_data[2] == 0xbf) {
		p_data += 3;
		n_size -= 3;
		n_read = 3;
	}
	// skip UTF-8 BOM

	const uint8_t *p_end = p_data + n_size;
	for(const uint8_t *p_char = p_data; p_char < p_end;) {
		const uint8_t n_byte0 = *p_char;
		if(!n_byte0) {
			++ n_read;
			break;
		}
		// explicit terminator: counted as read, not output

		int n_code = n_byte0;
		int n_char_size = 1;
		// single-byte character unless the high bit says otherwise

		if(n_byte0 & 0x80) {
			int n_min_code;
			if((n_byte0 & 0xe0) == 0xc0) {
				n_char_size = 2;
				n_min_code = 0x80;
			} else if((n_byte0 & 0xf0) == 0xe0) {
				n_char_size = 3;
				n_min_code = 0x800;
			} else if((n_byte0 & 0xf8) == 0xf0) {
				n_char_size = 4;
				n_min_code = 0x10000;
			} else
				return -1;
			// size of the sequence and the smallest code that needs that size

			if(p_end - p_char < n_char_size)
				return -1;
			// have enough data?

			n_code = n_byte0 & (0x7f >> n_char_size);
			for(int i = 1; i < n_char_size; ++ i) {
				const uint8_t n_byte = p_char[i];
				if((n_byte & 0xc0) != 0x80)
					return -1;
				n_code = (n_code << 6) | (n_byte & 0x3f);
			}
			// payload bits of the lead byte followed by 6 bits per continuation byte

			if(n_code < n_min_code)
				return -1;
			// overlong encodings are not valid UTF-8 (they would e.g. allow
			// smuggling a null character past the terminator check above)

			if((n_code >= 0xd800 && n_code <= 0xdfff) || n_code > 0x10ffff)
				return -1;
			// check if code is valid character (UTF-16 surrogates not allowed)
		}
		// decode utf-8 char

		p_char += n_char_size;
		n_read += n_char_size;

		n_code = r_map.n_FromUnicode(n_code);
		if(n_code < 0)
			return -1;
		// translate to 8-bit charset

		r_s_string += char(n_code);
	}

	return n_read;
}

/*
 *	static int CUniConv::n_UTF16_LE_Char_Size(uint8_t n_first_byte,
 *		uint8_t n_second_byte)
 *		- returns size of UTF-16 (little endian) character based on its first
 *		  two bytes n_first_byte, n_second_byte. result is in bytes and is either 2
 *		  or 4 (surrogate pair)
 *		- returns -1 on failure (low surrogate)
 *		- note actually only the second byte (the high byte of the code unit)
 *		  is required, n_first_byte is only used in the debug assertion
 */
int CUniConv::n_UTF16_LE_Char_Size(uint8_t n_first_byte, uint8_t n_second_byte)
{
	const int n_class = n_second_byte >> 2;
	// top six bits of the code unit; equals (code >> 10), because
	// the low byte contributes nothing above bit 7

	_ASSERTE((((n_second_byte << 8) | n_first_byte) >> 10) == n_class);

	if(n_class == 0x36) // high surrogate (0xd800 - 0xdbff), low one follows
		return 4;
	if(n_class == 0x37) // lonely low surrogate (0xdc00 - 0xdfff)
		return -1;
	return 2;
}

/*
 *	static int CUniConv::n_UTF16_BE_Char_Size(uint8_t n_first_byte,
 *		uint8_t n_second_byte)
 *		- returns size of UTF-16 (big endian) character based on its first
 *		  two bytes n_first_byte, n_second_byte. result is in bytes and is either 2
 *		  or 4 (surrogate pair)
 *		- returns -1 on failure (low surrogate)
 *		- note actually only the first byte (the high byte of the code unit)
 *		  is required, n_second_byte is only used in the debug assertion
 */
int CUniConv::n_UTF16_BE_Char_Size(uint8_t n_first_byte, uint8_t n_second_byte)
{
	const int n_class = n_first_byte >> 2;
	// top six bits of the code unit; equals (code >> 10), because
	// the low byte contributes nothing above bit 7

	_ASSERTE((((n_first_byte << 8) | n_second_byte) >> 10) == n_class);

	if(n_class == 0x36) // high surrogate (0xd800 - 0xdbff), low one follows
		return 4;
	if(n_class == 0x37) // lonely low surrogate (0xdc00 - 0xdfff)
		return -1;
	return 2;
}

/*
 *	static int CUniConv::n_UTF16_LE_Code(const void *p_data,
 *		int n_size, int &r_n_read)
 *		- decodes one character from UTF-16 (little endian) data
 *		- p_data points to n_size bytes of UTF-16 data
 *		- r_n_read receives the number of input bytes that were read
 *		  (0, 2 or 4), it is written on failure as well
 *		- returns the character code (UTF-32) on success, -1 on failure
 *		  (less than one code unit of data, lonely low surrogate, high surrogate
 *		  with missing or wrong second code unit)
 *		- note BOM is not treated specially, it is decoded as an ordinary character
 */
int CUniConv::n_UTF16_LE_Code(const void *__p_data, int n_size, int &r_n_read)
{
	const uint8_t *p_unit = (const uint8_t*)__p_data;

	r_n_read = 0;
	if(n_size < 2)
		return -1;
	// not even a single code unit

	const int n_lead = p_unit[0] | (p_unit[1] << 8);
	r_n_read = 2;
	// the first code unit is always consumed

	int n_result;
	switch(n_lead >> 10) {
	case 0x37:
		return -1;
		// low surrogate with no high surrogate before it

	case 0x36:
		{
			if(n_size < 4)
				return -1;
			// the second half of the pair is missing

			const int n_trail = p_unit[2] | (p_unit[3] << 8);
			if((n_trail >> 10) != 0x37)
				return -1;
			// high surrogate must be followed by low surrogate

			r_n_read = 4;
			n_result = 0x10000 + (((n_lead & 0x3ff) << 10) | (n_trail & 0x3ff));
			// assemble the pair
		}
		break;

	default:
		n_result = n_lead;
		// plain 16-bit character
		break;
	}

	if(n_result > 0x10ffff || (n_result >= 0xd800 && n_result <= 0xdfff))
		return -1;
	// final sanity check of the code

	return n_result;
}

/*
 *	static int CUniConv::n_UTF16_BE_Code(const void *p_data,
 *		int n_size, int &r_n_read)
 *		- decodes one character from UTF-16 (big endian) data
 *		- p_data points to n_size bytes of UTF-16 data
 *		- r_n_read receives the number of input bytes that were read
 *		  (0, 2 or 4), it is written on failure as well
 *		- returns the character code (UTF-32) on success, -1 on failure
 *		  (less than one code unit of data, lonely low surrogate, high surrogate
 *		  with missing or wrong second code unit)
 *		- note BOM is not treated specially, it is decoded as an ordinary character
 */
int CUniConv::n_UTF16_BE_Code(const void *__p_data, int n_size, int &r_n_read)
{
	const uint8_t *p_unit = (const uint8_t*)__p_data;

	r_n_read = 0;
	if(n_size < 2)
		return -1;
	// not even a single code unit

	const int n_lead = (p_unit[0] << 8) | p_unit[1];
	r_n_read = 2;
	// the first code unit is always consumed

	int n_result;
	switch(n_lead >> 10) {
	case 0x37:
		return -1;
		// low surrogate with no high surrogate before it

	case 0x36:
		{
			if(n_size < 4)
				return -1;
			// the second half of the pair is missing

			const int n_trail = (p_unit[2] << 8) | p_unit[3];
			if((n_trail >> 10) != 0x37)
				return -1;
			// high surrogate must be followed by low surrogate

			r_n_read = 4;
			n_result = 0x10000 + (((n_lead & 0x3ff) << 10) | (n_trail & 0x3ff));
			// assemble the pair
		}
		break;

	default:
		n_result = n_lead;
		// plain 16-bit character
		break;
	}

	if(n_result > 0x10ffff || (n_result >= 0xd800 && n_result <= 0xdfff))
		return -1;
	// final sanity check of the code

	return n_result;
}

/*
 *	static int CUniConv::n_UTF16_to_UTF32(const void *p_data,
 *		int n_size, std::basic_string<int> &r_s_string, bool b_allow_bom,
 *		bool b_expect_little_endian = true)
 *		- converts buffer p_data containing n_size bytes of UTF-16 encoded string
 *		  to plain character codes (UTF-32) and puts result to r_s_string
 *		- if n_size is odd, the trailing byte is ignored
 *		- if b_allow_bom is set, a leading BOM (byte-order-mark) is consumed and
 *		  selects the byte order (it is counted as read, but is not output);
 *		  a buffer containing nothing but the BOM yields an empty string
 *		- in case BOM is disabled or not present, it depends on value of
 *		  b_expect_little_endian whether little or big endian is used
 *		- note two consecutive null bytes (a null code unit) are considered end
 *		  of the string (they are counted as read, but are not output to r_s_string)
 *		- returns number of bytes read from the buffer or -1 on error (space for
 *		  the output could not be reserved, lonely or truncated surrogate)
 */
int CUniConv::n_UTF16_to_UTF32(const void *__p_data, int n_size,
	std::basic_string<int> &r_s_string, bool b_allow_bom, bool b_expect_little_endian)
{
	const uint8_t *p_data = (const uint8_t*)__p_data;

	n_size &= ~1;
	// only whole 16-bit code units are processed

	r_s_string.erase();
	if(!stl_ut::Reserve_N(r_s_string, n_size / 2))
		return -1;
	// one output character per code unit is the upper bound

	int n_read = 0;
	// number of bytes consumed so far

	bool b_little_endian = b_expect_little_endian;
	// default endianness, may be overriden by BOM

	if(b_allow_bom && n_size >= 2) {
		const bool b_bom_le = p_data[0] == 0xff && p_data[1] == 0xfe;
		const bool b_bom_be = p_data[0] == 0xfe && p_data[1] == 0xff;
		if(b_bom_le || b_bom_be) {
			b_little_endian = b_bom_le;
			p_data += 2;
			n_size -= 2;
			n_read = 2;
		}
	}
	// consume BOM (if present and allowed); the test is ">= 2" so that a buffer
	// holding just the BOM decodes to an empty string and not to character 0xfeff

	const int n_hi = (b_little_endian)? 1 : 0;
	const int n_lo = 1 - n_hi;
	// positions of the high and low byte inside a code unit

	const uint8_t *p_end = p_data + n_size;
	for(const uint8_t *p_char = p_data; p_char < p_end;) {
		if(!p_char[0] && !p_char[1]) {
			n_read += 2;
			break;
		}
		// explicit terminator: counted as read, not output

		int n_code = (p_char[n_hi] << 8) | p_char[n_lo];
		p_char += 2;
		n_read += 2;
		// read code unit

		const int n_class = n_code >> 10;
		if(n_class == 0x37)
			return -1; // lonely low surrogate
		if(n_class == 0x36) {
			if(p_char >= p_end)
				return -1; // high surrogate at the end of data
			const int n_code2 = (p_char[n_hi] << 8) | p_char[n_lo];
			if((n_code2 >> 10) != 0x37)
				return -1; // high surrogate not followed by low one
			p_char += 2;
			n_read += 2;

			n_code = (((n_code & 0x3ff) << 10) | (n_code2 & 0x3ff)) + 0x10000;
		}
		// combine surrogate pairs; from here n_code is either a non-surrogate
		// 16-bit value or lies in 0x10000 - 0x10ffff, no further check is needed

		r_s_string += n_code;
	}

	return n_read;
}

/*
 *	static int CUniConv::n_UTF8_Char_Size(uint8_t n_first_byte)
 *		- tells how many bytes a UTF-8 character occupies, judging by its
 *		  lead byte n_first_byte; the lead byte itself is included in the
 *		  result (values range 1 to 4)
 *		- returns -1 if n_first_byte can not start a character (it is
 *		  a continuation byte, or a lead byte of a sequence longer than 4 bytes)
 */
int CUniConv::n_UTF8_Char_Size(uint8_t n_first_byte)
{
	if(!(n_first_byte & 0x80))
		return 1; // 0xxxxxxx
	if((n_first_byte & 0xe0) == 0xc0)
		return 2; // 110xxxxx
	if((n_first_byte & 0xf0) == 0xe0)
		return 3; // 1110xxxx
	if((n_first_byte & 0xf8) == 0xf0)
		return 4; // 11110xxx
	return -1;
}

/*
 *	static int CUniConv::n_UTF8_Code(const void *p_data,
 *		int n_size, int &r_n_read)
 *		- decodes a single UTF-8 character
 *		- p_data is buffer with n_size bytes of UTF-8 data
 *		- r_n_read will contain number of bytes read from input
 *		  buffer upon function return (also on failure; 0 if n_size is not
 *		  positive, 1 for bad lead byte or truncated sequence, position of the
 *		  offending byte plus one for bad continuation byte)
 *		- returns character code (UTF-32) on success, -1 on failure
 *		- note this doesn't allow UTF-16 surrogates (character range 0xd800 to 0xdfff),
 *		  characters above 0x10ffff or overlong encodings (returns -1 instead)
 */
int CUniConv::n_UTF8_Code(const void *__p_data, int n_size, int &r_n_read)
{
	const uint8_t *p_data = (const uint8_t*)__p_data;

	if(n_size <= 0) {
		r_n_read = 0;
		return -1;
	}
	// make sure there's at least a single byte (negative size
	// is handled the same way as an empty buffer)

	const uint8_t n_byte0 = *p_data;
	// get first byte

	if(!(n_byte0 & 0x80)) {
		r_n_read = 1;
		return n_byte0;
	}
	// single-byte character

	int n_char_size, n_min_code;
	if((n_byte0 & 0xe0) == 0xc0) {
		n_char_size = 2;
		n_min_code = 0x80;
	} else if((n_byte0 & 0xf0) == 0xe0) {
		n_char_size = 3;
		n_min_code = 0x800;
	} else if((n_byte0 & 0xf8) == 0xf0) {
		n_char_size = 4;
		n_min_code = 0x10000;
	} else {
		r_n_read = 1;
		return -1;
	}
	// multi-byte character: determine size of the sequence
	// and the smallest code that needs that size

	if(n_size < n_char_size) {
		r_n_read = 1;
		return -1;
	}
	// have enough data?

	int n_code = n_byte0 & (0x7f >> n_char_size);
	for(int i = 1; i < n_char_size; ++ i) {
		const uint8_t n_byte = p_data[i];
		if((n_byte & 0xc0) != 0x80) {
			r_n_read = i + 1; // the offending byte counts as read
			return -1;
		}
		n_code = (n_code << 6) | (n_byte & 0x3f);
	}
	r_n_read = n_char_size;
	// payload bits of the lead byte followed by 6 bits per continuation byte

	if(n_code < n_min_code)
		return -1;
	// overlong encodings are not valid UTF-8

	if((n_code >= 0xd800 && n_code <= 0xdfff) || n_code > 0x10ffff)
		return -1;
	// check if code is valid character (UTF-16 surrogates not allowed)

	return n_code;
}

/*
 *	static int CUniConv::n_UTF8_to_UTF32(const void *p_data,
 *		int n_size, std::basic_string<int> &r_s_string, bool b_allow_bom)
 *		- converts buffer p_data containing n_size bytes of UTF-8 encoded string
 *		  to plain character codes (UTF-32) and puts result to r_s_string
 *		- if b_allow_bom is set, a leading UTF-8 BOM is skipped (it is counted
 *		  as read, but is not output)
 *		- note null character is considered explicit end of the string
 *		  (it is counted as read, but is not part of r_s_string)
 *		- returns number of bytes read from the buffer or -1 on error (space for
 *		  the output could not be reserved, invalid lead or continuation byte,
 *		  truncated sequence, overlong encoding, encoded UTF-16 surrogate
 *		  or code above 0x10ffff)
 */
int CUniConv::n_UTF8_to_UTF32(const void *__p_data,
	int n_size, std::basic_string<int> &r_s_string, bool b_allow_bom)
{
	const uint8_t *p_data = (const uint8_t*)__p_data;

	r_s_string.erase();
	if(!stl_ut::Reserve_N(r_s_string, n_size))
		return -1;
	// one output character per input byte is the upper bound

	int n_read = 0;
	// number of bytes consumed so far

	if(b_allow_bom && n_size >= 3 && p_data[0] == 0xef &&
	   p_data[1] == 0xbb && p_data[2] == 0xbf) {
		p_data += 3;
		n_size -= 3;
		n_read = 3;
	}
	// skip UTF-8 BOM

	const uint8_t *p_end = p_data + n_size;
	for(const uint8_t *p_char = p_data; p_char < p_end;) {
		const uint8_t n_byte0 = *p_char;
		if(!n_byte0) {
			++ n_read;
			break;
		}
		// explicit terminator: counted as read, not output

		int n_code = n_byte0;
		int n_char_size = 1;
		// single-byte character unless the high bit says otherwise

		if(n_byte0 & 0x80) {
			int n_min_code;
			if((n_byte0 & 0xe0) == 0xc0) {
				n_char_size = 2;
				n_min_code = 0x80;
			} else if((n_byte0 & 0xf0) == 0xe0) {
				n_char_size = 3;
				n_min_code = 0x800;
			} else if((n_byte0 & 0xf8) == 0xf0) {
				n_char_size = 4;
				n_min_code = 0x10000;
			} else
				return -1;
			// size of the sequence and the smallest code that needs that size

			if(p_end - p_char < n_char_size)
				return -1;
			// have enough data?

			n_code = n_byte0 & (0x7f >> n_char_size);
			for(int i = 1; i < n_char_size; ++ i) {
				const uint8_t n_byte = p_char[i];
				if((n_byte & 0xc0) != 0x80)
					return -1;
				n_code = (n_code << 6) | (n_byte & 0x3f);
			}
			// payload bits of the lead byte followed by 6 bits per continuation byte

			if(n_code < n_min_code)
				return -1;
			// overlong encodings are not valid UTF-8 (they would e.g. allow
			// smuggling a null character past the terminator check above)

			if((n_code >= 0xd800 && n_code <= 0xdfff) || n_code > 0x10ffff)
				return -1;
			// check if code is valid character (UTF-16 surrogates not allowed)
		}
		// decode utf-8 char

		p_char += n_char_size;
		n_read += n_char_size;

		r_s_string += n_code;
	}

	return n_read;
}

/*
 *	static bool CUniConv::UTF32_to_UTF8(const int *p_data,
 *		int n_length, std::string &r_s_string, bool b_use_bom)
 *		- encodes plain unicode characters (UTF-32) in p_data as UTF-8
 *		- n_length is length of string, contained in p_data (may be null-terminated)
 *		- output is returned in r_s_string
 *		- setting b_use_bom causes UTF-8 BOM being present in output
 *		- returns true on success, false on failure (space for the output could
 *		  not be reserved, or invalid chars: negative codes, UTF-16 surrogates
 *		  0xd800 to 0xdfff and codes above 0x10ffff)
 *		- note it's possible to call with n_length = -1 in case p_data contains
 *		  null-terminated string (the end pointer is then never reached
 *		  and the loop stops at the zero character)
 */
bool CUniConv::UTF32_to_UTF8(const int *p_data,
	int n_length, std::string &r_s_string, bool b_use_bom)
{
	r_s_string.erase();
	if(!stl_ut::Reserve_N(r_s_string, (b_use_bom)? max(0, n_length) + 3 : max(0, n_length)))
		return false;
	// reserve some space in the string (may not be enough,
	// this is lower-bound; should be enough for us-english though)

	if(b_use_bom) {
		r_s_string += char(0xef);
		r_s_string += char(0xbb);
		r_s_string += char(0xbf);
	}
	// begin with UTF-8 BOM

	for(const int *p_end = p_data + n_length; p_data != p_end; ++ p_data) {
		const int n_code = *p_data;
		// read character

		if(!n_code)
			break;
		// may as well be null-terminated

		if(n_code < 0 || n_code > 0x10ffff || (n_code >= 0xd800 && n_code <= 0xdfff))
			return false;
		// reject codes that have no UTF-8 form: negative values (these would
		// otherwise slip through as a single raw byte), values above the unicode
		// range and utf-16 surrogates (prohibited in utf-8 specs)

		char p_utf8[4];
		int n_byte_num;
		if(n_code <= 0x7f) {
			p_utf8[0] = char(n_code);
			n_byte_num = 1;
			// single value
		} else if(n_code <= 0x7ff) {
			p_utf8[0] = char(0xc0 | (n_code >> 6));
			p_utf8[1] = char(0x80 | (n_code & 0x3f));
			n_byte_num = 2;
			// pair of values
		} else if(n_code <= 0xffff) {
			p_utf8[0] = char(0xe0 | (n_code >> 12));
			p_utf8[1] = char(0x80 | ((n_code >> 6) & 0x3f));
			p_utf8[2] = char(0x80 | (n_code & 0x3f));
			n_byte_num = 3;
			// trinity of values
		} else {
			p_utf8[0] = char(0xf0 | (n_code >> 18));
			p_utf8[1] = char(0x80 | ((n_code >> 12) & 0x3f));
			p_utf8[2] = char(0x80 | ((n_code >> 6) & 0x3f));
			p_utf8[3] = char(0x80 | (n_code & 0x3f));
			n_byte_num = 4;
			// quadruple of values
		}
		// encode the character to a local buffer

		if(!stl_ut::Reserve_NMore(r_s_string, n_byte_num))
			return false;
		// make sure there's enough space

		r_s_string.append(p_utf8, n_byte_num);
	}

	return true;
}

/*
 *	static bool CUniConv::Encode_UTF8(const char *p_data, int n_size,
 *		const int *p_mapping_table, std::string &r_s_string, bool b_use_bom)
 *		- encodes generic 8-bit encoded characters in p_data as UTF-8
 *		- n_size is length of string, contained in p_data (may be null-terminated)
 *		- p_mapping_table is table with 256 entries for each 8-bit code, containing
 *		  corresponding UTF-32 character, or negative number for undefined characters
 *		  (note entry with index 0 is always ignored, 8-bit char 0 is terminating zero)
 *		- setting b_use_bom causes UTF-8 BOM being present in output
 *		- output is returned in r_s_string
 *		- returns true on success, false on failure (space for the output could
 *		  not be reserved, or invalid chars: undefined in the table, or mapped
 *		  to UTF-16 surrogates 0xd800 to 0xdfff or to codes above 0x10ffff)
 *		- note it's possible to call with n_size = -1 in case p_data contains
 *		  null-terminated string (the end pointer is then never reached
 *		  and the loop stops at the zero character)
 */
bool CUniConv::Encode_UTF8(const char *p_data, int n_size,
	const int *p_mapping_table, std::string &r_s_string, bool b_use_bom)
{
	r_s_string.erase();
	if(!stl_ut::Reserve_N(r_s_string, (b_use_bom)? max(0, n_size) + 3 : max(0, n_size)))
		return false;
	// reserve some space in the string (may not be enough,
	// this is lower-bound; should be enough for us-english though)

	if(b_use_bom) {
		r_s_string += char(0xef);
		r_s_string += char(0xbb);
		r_s_string += char(0xbf);
	}
	// begin with UTF-8 BOM

	for(const char *p_end = p_data + n_size; p_data != p_end; ++ p_data) {
		const int n_char = static_cast<unsigned char>(*p_data);
		if(!n_char) // may as well be null-terminated
			break;
		// read character as unsigned, so that codes above 127 index the table
		// correctly (static_cast replaces the "unsigned char(x)" functional
		// cast, which only compiles as a vendor extension)

		const int n_code = p_mapping_table[n_char];
		if(n_code < 0)
			return false;
		// translate to UTF-32 using the table

		if(n_code > 0x10ffff || (n_code >= 0xd800 && n_code <= 0xdfff))
			return false;
		// too high character to encode, or utf-16 surrogate
		// (can't encode those, it's prohibited in utf-8 specs)

		char p_utf8[4];
		int n_byte_num;
		if(n_code <= 0x7f) {
			p_utf8[0] = char(n_code);
			n_byte_num = 1;
			// single value
		} else if(n_code <= 0x7ff) {
			p_utf8[0] = char(0xc0 | (n_code >> 6));
			p_utf8[1] = char(0x80 | (n_code & 0x3f));
			n_byte_num = 2;
			// pair of values
		} else if(n_code <= 0xffff) {
			p_utf8[0] = char(0xe0 | (n_code >> 12));
			p_utf8[1] = char(0x80 | ((n_code >> 6) & 0x3f));
			p_utf8[2] = char(0x80 | (n_code & 0x3f));
			n_byte_num = 3;
			// trinity of values
		} else {
			p_utf8[0] = char(0xf0 | (n_code >> 18));
			p_utf8[1] = char(0x80 | ((n_code >> 12) & 0x3f));
			p_utf8[2] = char(0x80 | ((n_code >> 6) & 0x3f));
			p_utf8[3] = char(0x80 | (n_code & 0x3f));
			n_byte_num = 4;
			// quadruple of values
		}
		// encode the character to a local buffer

		if(!stl_ut::Reserve_NMore(r_s_string, n_byte_num))
			return false;
		// make sure there's enough space

		r_s_string.append(p_utf8, n_byte_num);
	}

	return true;
}

/*
 *	static bool CUniConv::UTF32_to_UTF16(const int *p_data,
 *		int n_length, std::basic_string<unsigned short> &r_s_string,
 *		bool b_use_bom = false, bool b_little_endian = true)
 *		- encodes plain unicode characters (UTF-32) in p_data as UTF-16
 *		- n_length is length of string, contained in p_data (may be null-terminated)
 *		- output is returned in r_s_string
 *		- b_use_bom decides whether to include byte-order mark in the output
 *		- b_little_endian decides whether to encode as UTF-16 LE (true), or BE (false);
 *		  for LE the code units are stored as they are, for BE they are passed
 *		  through n_HiLoSwap()
 *		- returns true on success, false on failure (space for the output could
 *		  not be reserved, or invalid chars: negative codes, surrogates 0xd800
 *		  to 0xdfff and codes above 0x10ffff)
 *		- note it's possible to call with n_length = -1 in case p_data contains
 *		  null-terminated string (the end pointer is then never reached
 *		  and the loop stops at the zero character)
 */
bool CUniConv::UTF32_to_UTF16(const int *p_data, int n_length,
	std::basic_string<unsigned short> &r_s_string, bool b_use_bom, bool b_little_endian)
{
	r_s_string.erase();
	if(!stl_ut::Reserve_N(r_s_string, (b_use_bom)? max(0, n_length) + 1 : max(0, n_length)))
		return false;
	// reserve some space in the string (may not be enough this is lower-bound)

	if(b_use_bom)
		r_s_string += static_cast<unsigned short>((b_little_endian)? 0xfeff : 0xfffe);
	// include BOM

	for(const int *p_end = p_data + n_length; p_data != p_end; ++ p_data) {
		int n_code = *p_data;
		// read character

		if(!n_code)
			break;
		// may as well be null-terminated

		if(n_code < 0 || n_code > 0x10ffff || (n_code >= 0xd800 && n_code <= 0xdfff))
			return false;
		// reject codes that have no UTF-16 form: negative values (these would
		// otherwise be truncated to an arbitrary code unit), values above
		// the unicode range and the surrogate range itself

		if(n_code < 0x10000) {
			if(!stl_ut::Reserve_1More(r_s_string))
				return false;
			// make sure there's enough space

			r_s_string += (b_little_endian)? static_cast<unsigned short>(n_code) : n_HiLoSwap(n_code);
			// save as a single value
		} else {
			if(!stl_ut::Reserve_NMore(r_s_string, 2))
				return false;
			// make sure there's enough space

			unsigned short n_head = static_cast<unsigned short>((0xd800 - (0x10000 >> 10)) + (n_code >> 10));
			unsigned short n_tail = static_cast<unsigned short>(0xdc00 + (n_code & 0x3ff));
			// the 0x10000 offset is folded into the high surrogate base

			r_s_string += (b_little_endian)? n_head : n_HiLoSwap(n_head);
			r_s_string += (b_little_endian)? n_tail : n_HiLoSwap(n_tail);
			// save as surrogate pair
		}
	}

	return true;
}

/*
 *	static bool CUniConv::Encode_UTF16(const char *p_data, int n_size,
 *		const int *p_mapping_table, std::basic_string<unsigned short> &r_s_string,
 *		bool b_use_bom = false, bool b_little_endian = true)
 *		- encodes generic 8-bit encoded characters in p_data as UTF-16
 *		- n_size is length of string, contained in p_data (may be null-terminated)
 *		- p_mapping_table is table with 256 entries for each 8-bit code, containing
 *		  corresponding UTF-32 character, or negative number for undefined characters
 *		  (note entry with index 0 is always ignored, 8-bit char 0 is terminating zero)
 *		- output is returned in r_s_string
 *		- b_use_bom decides wheter to include byte-order mark in the output
 *		- b_little_endian decides wheter to encode as UTF-16 LE (true), or BE (false)
 *		- returns true on success, false on failure (not enough memory, or invalid chars)
 *		- note it's possible to call with size = -1 in case p_data contains
 *		  null-terminated string (the loop will break after decrementing size down
 *		  to zero (2^32 - 1 chars for -1), or when encountered zero character)
 */
bool CUniConv::Encode_UTF16(const char *p_data, int n_size,
	const int *p_mapping_table, std::basic_string<unsigned short> &r_s_string,
	bool b_use_bom, bool b_little_endian)
{
	// translates 8-bit characters to UTF-32 using p_mapping_table and stores them
	// in r_s_string as UTF-16 code units; returns false on characters without
	// mapping, on surrogate code points, on codes above 0x10ffff and when
	// the string fails to allocate

	r_s_string.erase();
	const int n_min_length = max(0, n_size) + ((b_use_bom)? 1 : 0);
	if(!stl_ut::Reserve_N(r_s_string, n_min_length))
		return false;
	// start with an empty string, reserve one code unit per input character
	// (this is a lower bound only, more is reserved later if needed)

	if(b_use_bom) {
		const unsigned short n_bom = (b_little_endian)? 0xfeff : 0xfffe;
		r_s_string += n_bom;
	}
	// include BOM

	const char *p_src = p_data;
	const char *const p_src_end = p_data + n_size;
	while(p_src != p_src_end) {
		const int n_char = static_cast<unsigned char>(*p_src);
		++ p_src;
		if(n_char == 0)
			break;
		// read character, zero terminates the input (the pointers are compared
		// for inequality, so that n_size = -1 runs up to the terminating zero)

		const int n_code = p_mapping_table[n_char];
		const bool b_surrogate = n_code >= 0xd800 && n_code <= 0xdfff;
		if(n_code < 0 || b_surrogate || n_code > 0x10ffff)
			return false;
		// translate to UTF-32 using the table; undefined characters, surrogate
		// code points and too high codes can not be encoded

		if(n_code < 0x10000) {
			if(!stl_ut::Reserve_1More(r_s_string))
				return false;
			// make sure there's enough space

			if(b_little_endian)
				r_s_string += static_cast<unsigned short>(n_code);
			else
				r_s_string += n_HiLoSwap(n_code);
			// save as a single value
		} else {
			if(!stl_ut::Reserve_NMore(r_s_string, 2))
				return false;
			// make sure there's enough space

			const int n_offset = n_code - 0x10000;
			const unsigned short n_head = static_cast<unsigned short>(0xd800 + (n_offset >> 10));
			const unsigned short n_tail = static_cast<unsigned short>(0xdc00 + (n_offset & 0x3ff));
			// upper 10 bits go to the head, lower 10 bits go to the tail

			if(b_little_endian) {
				r_s_string += n_head;
				r_s_string += n_tail;
			} else {
				r_s_string += n_HiLoSwap(n_head);
				r_s_string += n_HiLoSwap(n_tail);
			}
			// save as surrogate pair
		}
	}

	return true;
}

/*
 *	static int CUniConv::n_UTF16_to_UTF8(const void *p_data, int n_size,
 *		std::string &r_s_string, bool b_use_utf8_bom, bool b_allow_utf16_bom,
 *		bool b_expect_utf16_little_endian = true)
 *		- converts buffer p_data containing n_size bytes of UTF-16 encoded string
 *		  to UTF-8 and puts result to r_s_string
 *		- setting b_use_utf8_bom causes UTF-8 BOM being present in output
 *		- if b_allow_utf16_bom is set, BOM (byte-order-mark) is expected
 *		- in case BOM is disabled or not present, it depends on value of
 *		  b_expect_utf16_little_endian whether little or big endian is used
 *		- note two consecutive null characters are considered end of the string
 *		  (they are counted as read, but are not output to r_s_string)
 *		- returns number of bytes read from the buffer or -1 on error
 */
int CUniConv::n_UTF16_to_UTF8(const void *__p_data, int n_size,
	std::string &r_s_string, bool b_use_utf8_bom, bool b_allow_utf16_bom,
	bool b_expect_utf16_little_endian)
{
	// converts UTF-16 code units in __p_data to UTF-8 in r_s_string, returns
	// number of bytes consumed (including BOM and the terminating null,
	// if present) or -1 on malformed input or allocation failure

	const uint8_t *p_data = (const uint8_t*)__p_data;

	n_size &= ~1;
	// size must be even (the lowest bit is cleared, odd trailing byte is not read)

	r_s_string.erase();
	if(!stl_ut::Reserve_N(r_s_string, (b_use_utf8_bom)? n_size / 2 + 3 : n_size / 2))
		return -1;
	// reserve space in the string (one byte per code unit, more is reserved
	// below for characters that need it)

	if(b_use_utf8_bom) {
		r_s_string += char(0xef);
		r_s_string += char(0xbb);
		r_s_string += char(0xbf);
	}
	// begin with utf-8 BOM

	int n_read = 0;
	// number of bytes read

	bool b_little_endian = b_expect_utf16_little_endian;
	// use default endianness

	if(n_size >= 2 && b_allow_utf16_bom) {
		if(p_data[0] == 0xff && p_data[1] == 0xfe) {
			b_little_endian = true;
			n_size -= 2;
			p_data += 2;
			n_read += 2;
		} else if(p_data[0] == 0xfe && p_data[1] == 0xff) {
			b_little_endian = false;
			n_size -= 2;
			p_data += 2;
			n_read += 2;
		}
		// skip BOM
	}
	// try to read BOM (if present and allowed); BOM takes a single code unit,
	// so a buffer of exactly two bytes can hold it as well

	const int n_i0 = (b_little_endian)? 1 : 0;
	const int n_i1 = (b_little_endian)? 0 : 1;
	// byte indexing (endianness); n_i0 is index of the high byte

	for(const uint8_t *p_char = p_data,
	   *p_end = p_data + n_size; p_char < p_end;
	   p_char += 2, n_read += 2) {
		if(p_char[0] == 0 && p_char[1] == 0) {
			n_read += 2;
			break;
		}
		// can be null-terminated (the terminator is counted as read)

		int n_code = (p_char[n_i0] << 8) | p_char[n_i1];
		// read code

		if((n_code >> 10) == 0x36) { // high surrogate (0xd800 - 0xdbff)
			p_char += 2;
			n_read += 2;
			if(p_char >= p_end)
				return -1; // not enough data
			int n_code2 = (p_char[n_i0] << 8) | p_char[n_i1];
			if((n_code2 >> 10) != 0x37)
				return -1;
			// read low surrogate (0xdc00 - 0xdfff)

			n_code = (((n_code & 0x3ff) << 10) | (n_code2 & 0x3ff)) + 0x10000;
			// have surrogate pair
		} else if((n_code >> 10) == 0x37)
			return -1; // lonely low surrogate
		// read surrogate pairs

		if((n_code >= 0xd800 && n_code <= 0xdfff) || n_code > 0x10ffff)
			return -1;
		// check if code is valid character

		if(n_code <= 0x7f) {
			if(!stl_ut::Reserve_1More(r_s_string))
				return -1;
			// make sure there's enough space

			r_s_string += char(n_code);
			// save as a single value
		} else if(n_code <= 0x7ff) {
			if(!stl_ut::Reserve_NMore(r_s_string, 2))
				return -1;
			// make sure there's enough space

			r_s_string += char(0xc0 | (n_code >> 6));
			r_s_string += char(0x80 | (n_code & 0x3f));
			// save as pair of values
		} else if(n_code <= 0xffff) {
			if(!stl_ut::Reserve_NMore(r_s_string, 3))
				return -1;
			// make sure there's enough space (surrogates were rejected above,
			// those must not be encoded in utf-8)

			r_s_string += char(0xe0 | (n_code >> 12));
			r_s_string += char(0x80 | ((n_code >> 6) & 0x3f));
			r_s_string += char(0x80 | (n_code & 0x3f));
			// save as trinity of values
		} else {
			if(!stl_ut::Reserve_NMore(r_s_string, 4))
				return -1;
			// make sure there's enough space (codes above 0x10ffff were rejected above)

			r_s_string += char(0xf0 | (n_code >> 18));
			r_s_string += char(0x80 | ((n_code >> 12) & 0x3f));
			r_s_string += char(0x80 | ((n_code >> 6) & 0x3f));
			r_s_string += char(0x80 | (n_code & 0x3f));
			// save as quadruple of values
		}
		// convert to UTF-8
	}

	return n_read;
}

/*
 *	static int CUniConv::n_UTF8_to_UTF16(const void *p_data,
 *		int n_size, std::basic_string<unsigned short> &r_s_string,
 *		bool b_allow_utf8_bom, bool b_include_utf16_bom = false,
 *		bool b_utf16_little_endian = true)
 *		- converts buffer p_data containing n_size bytes of UTF-8 encoded string
 *		  to UTF-16 and puts result to r_s_string
 *		- note null character is considered explicit end of the string
 *		  (it is counted as read, but is not part of r_s_string)
 *		- b_allow_utf8_bom decides whether to accept UTF-8 BOM
 *		- b_include_utf16_bom decides whether to include byte-order mark in the output
 *		- b_utf16_little_endian decides whether to encode as UTF-16 LE (true), or BE (false)
 *		- returns number of bytes read from the buffer or -1 on error
 */
int CUniConv::n_UTF8_to_UTF16(const void *__p_data, int n_size,
	std::basic_string<unsigned short> &r_s_string, bool b_allow_utf8_bom,
	bool b_include_utf16_bom, bool b_utf16_little_endian)
{
	// converts UTF-8 bytes in __p_data to UTF-16 code units in r_s_string,
	// returns number of bytes consumed (including BOM and the terminating null,
	// if present) or -1 on malformed input or allocation failure

	const uint8_t *p_data = (const uint8_t*)__p_data;

	r_s_string.erase();
	if(!stl_ut::Reserve_N(r_s_string, (b_include_utf16_bom)? n_size + 1 : n_size))
		return -1;
	// reserve enough space in the string

	int n_read = 0;
	// number of bytes read

	if(b_allow_utf8_bom && n_size >= 3 && p_data[0] == 0xef &&
	   p_data[1] == 0xbb && p_data[2] == 0xbf) {
		p_data += 3;
		n_size -= 3;
		n_read = 3;
	}
	// skip UTF-8 BOM

	if(b_include_utf16_bom)
		r_s_string += (b_utf16_little_endian)? 0xfeff : 0xfffe;
	// include BOM

	for(const uint8_t *p_char = p_data,
	   *p_end = p_data + n_size; p_char < p_end; ++ p_char, ++ n_read) {
		if(!*p_char) {
			++ n_read;
			break;
		}
		// can be null-terminated (the terminator is counted as read)

		uint8_t n_byte0 = *p_char;
		// get byte

		int n_code;
		if((n_byte0 & 0x80) != 0) {
			int n_byte_num, n_min_code;
			if((n_byte0 & 0xe0) == 0xc0) {
				n_byte_num = 1;
				n_min_code = 0x80;
			} else if((n_byte0 & 0xf0) == 0xe0) {
				n_byte_num = 2;
				n_min_code = 0x800;
			} else if((n_byte0 & 0xf8) == 0xf0) {
				n_byte_num = 3;
				n_min_code = 0x10000;
			} else
				return -1;
			n_code = n_byte0 & (0x3f >> n_byte_num);
			// multi-byte character - apply mask, determine number of continuation
			// bytes and the smallest code that needs a sequence this long

			if(p_char + n_byte_num >= p_end)
				return -1;
			// have enough data?

			for(; n_byte_num; -- n_byte_num) {
				++ p_char;
				++ n_read;
				uint8_t n_byte = *p_char;
				if((n_byte & 0xc0) != 0x80)
					return -1;
				n_code <<= 6;
				n_code |= n_byte & 0x3f;
			}
			// add more bytes (each must be a continuation byte 10xxxxxx)

			if(n_code < n_min_code)
				return -1;
			// reject overlong forms (a code stored in more bytes than necessary),
			// those are ill-formed and could be used to smuggle in e.g. a null character

			if((n_code >= 0xd800 && n_code <= 0xdfff) || n_code > 0x10ffff)
				return -1;
			// check if code is valid character (UTF-16 surrogates not allowed)
		} else
			n_code = n_byte0; // single-byte
		// decode utf-8 char

		if(n_code < 0x10000) {
			if(!stl_ut::Reserve_1More(r_s_string))
				return -1;
			// make sure there's enough space

			r_s_string += (b_utf16_little_endian)?
				static_cast<unsigned short>(n_code) : n_HiLoSwap(n_code);
			// save as a single value
		} else {
			if(!stl_ut::Reserve_NMore(r_s_string, 2))
				return -1;
			// make sure there's enough space (codes above 0x10ffff were rejected above)

			unsigned short n_head = static_cast<unsigned short>((0xd800 -
				(0x10000 >> 10)) + (n_code >> 10));
			r_s_string += (b_utf16_little_endian)? n_head : n_HiLoSwap(n_head);
			unsigned short n_tail = static_cast<unsigned short>(0xdc00 + (n_code & 0x3ff));
			r_s_string += (b_utf16_little_endian)? n_tail : n_HiLoSwap(n_tail);
			// save as surrogate pair
		}
		// encode utf-16
	}

	return n_read;
}

/*
 *								=== ~CUniConv ===
 */

/*
 *								=== CUnicodeMapping ===
 */

/*
 *	CUnicodeMapping::CUnicodeMapping(const char *p_s_filename, bool b_avoid_accents = false)
 *		- default constructor; loads 8-bit charset mapping table from file p_s_filename
 *		  (files from http://www.unicode.org/Public/MAPPINGS/, Table format: Format A)
 *		- if b_avoid_accents is set, latin accent characters are replaced by
 *		  ordinary ones, relies on comments in the file, such as:
 *			0xC1	0x00C1	# LATIN CAPITAL LETTER A WITH ACUTE
 *		  (then the unicode character 0x00C1 will be replaced with 'A'),
 *		  note this only affects conversion of 8-bit strings from unicode, not to unicode
 *		- it's recommended to call b_Status() to see if constructor succeeded
 */
CUnicodeMapping::CUnicodeMapping(const char *p_s_filename, bool b_avoid_accents)
	:m_n_inverse_map_size(0), m_n_subst_char(-1)
{
	// loads lines in format "0xXX <whitespace> 0xXXXX [# comment]" from
	// p_s_filename; on any failure m_n_inverse_map_size is left zero

	for(int i = 0; i < 256; ++ i)
		m_p_mapping[i] = -1;
	// mark all 8-bit codes as undefined in the forward (8-bit to unicode) table

	FILE *p_fr;
#if defined(_MSC_VER) && !defined(__MWERKS__) && _MSC_VER >= 1400
	if(fopen_s(&p_fr, p_s_filename, "r")) {
#else //_MSC_VER && !__MWERKS__ && _MSC_VER >= 1400
	if(!(p_fr = fopen(p_s_filename, "r"))) {
#endif //_MSC_VER && !__MWERKS__ && _MSC_VER >= 1400
		m_n_inverse_map_size = 0; // to mark error
		return;
	}
	// open file

	std::string s_line;
	while(GetLine(s_line, p_fr)) {
		size_t b = 0, e = s_line.length();

		if(s_line.find("0x") != 0)
			continue;
		b += 2;
		// skip 0x (lines which do not begin with it are ignored)

		char n_char = 0;
		while(b < e && isxdigit(static_cast<unsigned char>(s_line[b]))) {
			const unsigned char n_hex = static_cast<unsigned char>(s_line[b]);
			int n_digit = isdigit(n_hex)? n_hex - '0' :
				tolower(n_hex) - 'a' + 10;
			n_char <<= 4;
			n_char |= n_digit;
			++ b;
		}
		// convert hexadecimal char code (characters are cast to unsigned char,
		// passing negative values to <ctype> functions is undefined behavior)

		if(b == e || !isspace(static_cast<unsigned char>(s_line[b])))
			continue;
		while(b < e && isspace(static_cast<unsigned char>(s_line[b])))
			++ b;
		// skip whitespace

		if(s_line.find("0x", b) != b)
			continue;
		b += 2;
		// skip 0x (lines without unicode value are ignored)

		int n_unicode = 0;
		while(b < e && isxdigit(static_cast<unsigned char>(s_line[b]))) {
			const unsigned char n_hex = static_cast<unsigned char>(s_line[b]);
			int n_digit = isdigit(n_hex)? n_hex - '0' :
				tolower(n_hex) - 'a' + 10;
			n_unicode <<= 4;
			n_unicode |= n_digit;
			++ b;
		}
		// convert hexadecimal unicode

		while(b < e && isspace(static_cast<unsigned char>(s_line[b])))
			++ b;
		// skip whitespace

		m_p_mapping[static_cast<unsigned char>(n_char)] = n_unicode;
		// store forward mapping (always the original character, accents included)

		if(b_avoid_accents) {
			char n_char_name;
			bool b_capital;
			if(Parse_LatinCharacterName(s_line, n_char_name, b_capital)) {
				if(b_capital)
					n_char = toupper(static_cast<unsigned char>(n_char_name));
				else
					n_char = tolower(static_cast<unsigned char>(n_char_name));
				// replace character (possibly with accent) by a simple character
			}
		}
		// try to avoid accents (affects the inverse mapping only)

		if(m_n_inverse_map_size == 256) {
			m_n_inverse_map_size = 0; // to mark error
			fclose(p_fr);
			return;
		}
		// too many characters in there

		m_p_inverse_map[m_n_inverse_map_size].n_character = n_char;
		m_p_inverse_map[m_n_inverse_map_size].n_unicode = n_unicode;
		++ m_n_inverse_map_size;
		// add to the list
	}
	// read lines

	std::sort(m_p_inverse_map, m_p_inverse_map + m_n_inverse_map_size, b_SmallerUnicode);
	// sort by unicode (the inverse map is searched using binary search)

	if(ferror(p_fr))
		m_n_inverse_map_size = 0; // to mark error
	fclose(p_fr);
	// close file
}

/*
 *	bool CUnicodeMapping::b_Status() const
 *		- returns true if constructor succeeded, otherwise returns false
 *		- note functions below are designed to work, even if constructor
 *		  failed (will not cause access violation / etc.)
 */
bool CUnicodeMapping::b_Status() const
{
	const bool b_have_mappings = m_n_inverse_map_size > 0;
	// the constructor leaves the inverse map size zero whenever it fails

	return b_have_mappings;
}

/*
 *	int CUnicodeMapping::n_FromUnicode(int n_unicode) const
 *		- translates unicode character n_unicode (UTF-32) to 8-bit charset
 *		- in case given character cannot be represented, substitute character
 *		  is used instead (default -1, can be set using n_Set_SubsituteChar())
 *		- returns 8-bit representation of (UTF-32) unicode character n_unicode
 */
int CUnicodeMapping::n_FromUnicode(int n_unicode) const
{
	const TCharacterMapping *p_mapping = std::lower_bound(m_p_inverse_map,
		m_p_inverse_map + m_n_inverse_map_size, n_unicode);
	if(p_mapping != m_p_inverse_map + m_n_inverse_map_size)
		return p_mapping->n_character;
	return m_n_subst_char;
}

/*
 *	int CUnicodeMapping::n_FromUnicode(int n_unicode, int n_substitute) const
 *		- translates unicode character n_unicode (UTF-32) to 8-bit charset
 *		- in case given character cannot be represented, n_substitute is used instead
 *		- returns 8-bit representation of (UTF-32) unicode character n_unicode
 */
int CUnicodeMapping::n_FromUnicode(int n_unicode, int n_substitute) const
{
	const TCharacterMapping *p_mapping = std::lower_bound(m_p_inverse_map,
		m_p_inverse_map + m_n_inverse_map_size, n_unicode);
	if(p_mapping != m_p_inverse_map + m_n_inverse_map_size)
		return p_mapping->n_character;
	return n_substitute;
}

/*
 *	int CUnicodeMapping::n_Set_SubsituteChar(int n_substitute)
 *		- sets substitute character for conversion from unicode
 *		  to 8-bit charset to n_substitute
 *		- returns the former substitute character
 *		- note setting -1 as substitute character causes conversion routines to fail
 *		  when there's no conversion for a particular character (default)
 *		- note setting '?' as substitute character makes conversion routines
 *		  never fail, they just return strings with question marks, instead of
 *		  characters which can't be represented in a given 8-bit charset
 */
int CUnicodeMapping::n_Set_SubsituteChar(int n_substitute)
{
	const int n_former = m_n_subst_char;
	// remember the substitute character used so far

	m_n_subst_char = n_substitute;
	// set the new one

	return n_former;
}

/*
 *	int CUnicodeMapping::n_SubsituteChar() const
 *		- returns substitute character
 */
int CUnicodeMapping::n_SubsituteChar() const
{
	return m_n_subst_char;
}

/*
 *	bool CUnicodeMapping::FromUnicode(std::string &r_s_dest,
 *		const std::basic_string<int> &r_s_string, char n_substitute = '?') const
 *		- translates unicode (UTF-32) string r_s_string to 8-bit charset string r_s_dest
 *		- in case given character cannot be represented, n_substitute is used instead
 *		- r_s_dest is filled with translated string in 8-bit charset
 *		- returns true on success, false on failure (not enough memory)
 */
bool CUnicodeMapping::FromUnicode(std::string &r_s_dest,
	const std::basic_string<int> &r_s_string, char n_substitute) const
{
	const size_t n_length = r_s_string.length();
	if(!stl_ut::Resize_To_N(r_s_dest, n_length))
		return false;
	// the destination has one 8-bit character per UTF-32 character

	std::string::iterator p_dest_it = r_s_dest.begin();
	for(std::basic_string<int>::const_iterator p_src_it = r_s_string.begin(),
	   p_end_it = r_s_string.end(); p_src_it != p_end_it; ++ p_src_it, ++ p_dest_it)
		*p_dest_it = n_FromUnicode(*p_src_it, n_substitute);
	// translate one character at a time, characters
	// without mapping become n_substitute

	return true;
}

/*
 *	bool CUnicodeMapping::ToUnicode(std::basic_string<int> &r_s_dest, std::string &r_s_string)
 *		- translates 8-bit charset string r_s_string to unicode (UTF-32) string r_s_dest
 *		- in case given character cannot be represented, function fails
 *		- r_s_dest is filled with translated string in UTF-32 character set
 *		- returns true on success, false on failure
 */
bool CUnicodeMapping::ToUnicode(std::basic_string<int> &r_s_dest, std::string &r_s_string)
{
	const size_t n_length = r_s_string.length();
	if(!stl_ut::Resize_To_N(r_s_dest, n_length))
		return false;
	// the destination has one UTF-32 character per 8-bit character

	for(size_t n_pos = 0; n_pos != n_length; ++ n_pos) {
		const int n_unicode = n_ToUnicode(r_s_string[n_pos]);
		if(n_unicode == -1)
			return false;
		// -1 means there is no unicode for this character, the conversion fails
		// (r_s_dest is left resized, with only the leading part translated)

		r_s_dest[n_pos] = n_unicode;
	}
	// translate

	return true;
}

inline bool CUnicodeMapping::b_HigherUnicode(const TCharacterMapping &r_t_a,
	int n_unicode)
{
	// compares mapping to a bare unicode value; true if n_unicode
	// is greater than unicode of the mapping r_t_a
	return n_unicode > r_t_a.n_unicode;
}

inline bool CUnicodeMapping::b_SmallerUnicode(const TCharacterMapping &r_t_a,
	const TCharacterMapping &r_t_b)
{
	// ordering predicate (the constructor passes it to std::sort());
	// true if unicode of r_t_a comes before unicode of r_t_b
	return r_t_b.n_unicode > r_t_a.n_unicode;
}

bool CUnicodeMapping::GetLine(std::string &r_s_line, FILE *p_fr)
{
	// reads the next non-empty line from p_fr to r_s_line, with leading and
	// trailing whitespace removed; lines beginning with '#' are skipped
	// returns true if a line was read, false on end of file, on read
	// error or when there is not enough memory for the line

	while(!feof(p_fr) && !ferror(p_fr)) {
		// the error indicator needs to be tested as well; after a read error
		// fgetc() keeps returning EOF while feof() remains false,
		// and the loop would never finish

		r_s_line.erase();
		for(int c = fgetc(p_fr); c != '\n' && c != EOF; c = fgetc(p_fr)) {
			if(!stl_ut::Reserve_1More(r_s_line))
				return false;
			r_s_line += char(c);
		}
		// read line

		if(!r_s_line.empty() && r_s_line[0] == '#')
			r_s_line.erase();
		// throw away full-line comment (only if '#' is the very first
		// character; comments following data are kept)

		size_t b = 0, e = r_s_line.length();
		while(e > 0 && isspace(static_cast<unsigned char>(r_s_line[e - 1])))
			-- e;
		while(b < e && isspace(static_cast<unsigned char>(r_s_line[b])))
			++ b;
		r_s_line.erase(e);
		r_s_line.erase(0, b);
		// throw away begin / end whitespace (characters are cast to unsigned char,
		// passing negative values to isspace() is undefined behavior)

		if(r_s_line.empty())
			continue;
		// skip empty lines

		return true;
	}

	return false;
}

/*
 *	static bool b_Skip_Keyword_NoCase(const std::string &r_s_line,
 *		size_t &r_n_pos, const char *p_s_keyword)
 *		- matches lowercase p_s_keyword in r_s_line at r_n_pos, ignoring case
 *		- r_n_pos is moved behind the matched characters
 *		- returns true if the whole keyword was matched, otherwise returns false
 */
static bool b_Skip_Keyword_NoCase(const std::string &r_s_line,
	size_t &r_n_pos, const char *p_s_keyword)
{
	const size_t n_end = r_s_line.length();
	while(*p_s_keyword && r_n_pos < n_end &&
	   tolower(static_cast<unsigned char>(r_s_line[r_n_pos])) == *p_s_keyword) {
		++ p_s_keyword;
		++ r_n_pos;
	}
	// stop at the end of the keyword; a null character in the line
	// must not be matched against the terminating zero

	return !*p_s_keyword;
}

/*
 *	static bool b_Skip_Whitespace_Required(const std::string &r_s_line, size_t &r_n_pos)
 *		- skips run of whitespace in r_s_line, beginning at r_n_pos
 *		- returns true if there was at least one whitespace character, otherwise false
 */
static bool b_Skip_Whitespace_Required(const std::string &r_s_line, size_t &r_n_pos)
{
	const size_t n_end = r_s_line.length();
	if(r_n_pos == n_end || !isspace(static_cast<unsigned char>(r_s_line[r_n_pos])))
		return false;
	while(r_n_pos < n_end && isspace(static_cast<unsigned char>(r_s_line[r_n_pos])))
		++ r_n_pos;
	return true;
}

bool CUnicodeMapping::Parse_LatinCharacterName(const std::string &r_s_line,
	char &r_n_char_name, bool &r_b_capital)
{
	// parses comment in form "# LATIN (CAPITAL|SMALL) LETTER x [...]" (case
	// insensitive), where x is a single letter; on success writes the letter
	// to r_n_char_name, its case to r_b_capital and returns true; otherwise
	// returns false and leaves both outputs unchanged

	size_t b = r_s_line.find('#');
	const size_t e = r_s_line.length();
	if(b == std::string::npos)
		return false;
	// find comment

	++ b;
	// skip #

	while(b < e && isspace(static_cast<unsigned char>(r_s_line[b])))
		++ b;
	// skip whitespace (optional here)

	if(!b_Skip_Keyword_NoCase(r_s_line, b, "latin") ||
	   !b_Skip_Whitespace_Required(r_s_line, b))
		return false;
	// skip "latin" and whitespace

	const bool b_capital = b < e &&
		tolower(static_cast<unsigned char>(r_s_line[b])) == 'c';
	if(!b_Skip_Keyword_NoCase(r_s_line, b, (b_capital)? "capital" : "small") ||
	   !b_Skip_Whitespace_Required(r_s_line, b))
		return false;
	// skip "capital" or "small" (decided by the first letter) and whitespace

	if(!b_Skip_Keyword_NoCase(r_s_line, b, "letter") ||
	   !b_Skip_Whitespace_Required(r_s_line, b))
		return false;
	// skip "letter" and whitespace

	if(b < e && isalpha(static_cast<unsigned char>(r_s_line[b])) &&
	   (b + 1 == e || isspace(static_cast<unsigned char>(r_s_line[b + 1])))) {
		r_n_char_name = r_s_line[b];
		r_b_capital = b_capital;
		return true;
	}
	// in case there's a single letter (names of more letters, such as "AE", are refused)

	return false;
}

/*
 *								=== ~CUnicodeMapping ===
 */
