/*
								+----------------------------------+
								|                                  |
								|  *** Basic compression algs ***  |
								|                                  |
								|   Copyright  -tHE SWINe- 2008   |
								|                                  |
								|           Compress.cpp           |
								|                                  |
								+----------------------------------+
*/

/**
 *	@file Compress.cpp
 *	@author -tHE SWINe-
 *	@date 2008
 *	@brief Simple experimental data compression framework, focused on Burrows-Wheeler methods.
 *
 *	@date 2007-02-25
 *
 *	this is the first beta version of the file. t_odo - rewrite TBuffer so it can realloc
 *	itself in a way std::vector can. todo - try to avoid any (re)allocations while (un)packing
 *
 *	@date 2008-03-13
 *
 *	TBuffer was rewritten as requested, fixed some signed / unsigned mismatches for gcc
 *
 *	@date 2008-11-20
 *
 *	TBuffer unit is no longer unsigned char, but uint8_t instead, this should avoid
 *	incompatibility with some extended character encoding in newer versions of visual studio
 *
 *	@date 2009-05-23
 *
 *	removed all instances of std::vector::reserve and replaced them by stl_ut::Reserve_*
 *
 *	@date 2009-10-08
 *
 *	slightly improved CHuffmanCodec, fixed a bug in canonical Huffman code generation for
 *	trees where there are no symbols of length n, but there are both shorter and longer
 *	symbols (codes got shifted too much, got too long, had to be regenerated). this was
 *	hurting compression and so it had to be fixed, but the bug was also in the decompression
 *	code, so this sadly breaks backward compatibility.
 *
 *	@date 2009-10-11
 *
 *	replaced stl container ::resize() by stl_ut::Resize_*() to avoid unhandled
 *	std::bad_alloc
 *
 *	optimized CBurrowsWheelerTransform::CWrapMemCmp() wrap-around memory comparator by
 *	calculating lengths of blocks that do not wrap and comparing them in shorter loops
 *
 *	added __BWT_ENABLE_THREADED_ENCODE macro
 *
 *	@date 2009-10-20
 *
 *	fixed some warnings when compiling under VC 2005, implemented "Security
 *	Enhancements in the CRT" for VC 2008. compare against MyProjects_2009-10-19_
 *
 */

#include "NewFix.h"
#include "CallStack.h"
#include <vector>
#include <map>
#include <set>
#include <algorithm>
#include <string.h>
#include <stdio.h> // for debugging only
#include "MinMax.h"
#include "StlUtils.h"
#include "Buffer.h"
#include "Compress.h"
#include "Crc.h"

#ifdef __BWT_ENABLE_THREADED_ENCODE
#include "Thread.h"
#endif // __BWT_ENABLE_THREADED_ENCODE

#if defined(_MSC_VER) && !defined(__MWERKS__) && !defined(for) && _MSC_VER <= 1200
#define for if(0) {} else for
#endif // _MSC_VER && !__MWERKS__ && !for && _MSC_VER <= 1200
// msvc 'for' scoping hack

/*
 *								=== CBurrowsWheelerTransform ===
 */

/*
 *	CBurrowsWheelerTransform::CWrapMemCmp
 *		- function object implementing less-than ordering
 *		  for indices, pointing to round buffer
 *		- each index is interpreted as the starting offset of a rotation of the
 *		  buffer; two rotations are compared byte by byte (as unsigned bytes)
 *		  over the whole buffer length, wrapping from the end to the beginning
 *		- the object only stores pointers to the data, it does not copy it
 */
class CBurrowsWheelerTransform::CWrapMemCmp {
protected:
	const uint8_t *m_p_org, *m_p_end; // first byte of the buffer / one past its last byte
	size_t m_n_size; // size of the buffer, in bytes

public:
	/*
	 *	inline CWrapMemCmp::CWrapMemCmp(const TBuffer &r_t_data_buffer)
	 *		- default constructor
	 *		- we're going to sort indices pointing to r_t_data_buffer
	 *		- only pointers to r_t_data_buffer data are kept
	 */
	inline CWrapMemCmp(const TBuffer &r_t_data_buffer)
		:m_p_org(r_t_data_buffer.p_Data()),
		m_p_end(r_t_data_buffer.p_Data() + r_t_data_buffer.n_Size()),
		m_n_size(r_t_data_buffer.n_Size())
	{}

#ifdef _MSC_VER // MSVC assembly syntax
	/*
	 *	static int _fast_memcmp(const uint8_t *a, const uint8_t *b, int n_length)
	 *		- compares n_length bytes at a and b (32-bit x86 MSVC inline assembly)
	 *		- the assembly part handles n_length / 16 blocks of 16 bytes, each as four
	 *		  dwords which are byte-swapped so that the unsigned dword comparison
	 *		  orders them the same way a byte-wise comparison would
	 *		- the remaining n_length % 16 bytes are compared by the plain loop below
	 *		- returns negative value if a is below b, positive value if a is above b
	 *		  (-1 / 1 from the dword path, byte difference from the byte loop),
	 *		  or 0 if the ranges are equal
	 *		- note n_length is int while the callers pass size_t values
	 *		- NOTE(review): `prefetch` is presumably the AMD 3DNow! mnemonic - confirm
	 *		  it is acceptable on all target processors
	 */
	static int _fast_memcmp(const uint8_t *a, const uint8_t *b, int n_length)
	{
		{
			__asm {
				mov ecx, dword ptr n_length
				shr ecx, 4
				jz _dontHaveResult
				mov esi, dword ptr a
				mov edx, dword ptr b

			_longLoop:
				prefetch [esi+16]
				prefetch [edx+16]
				prefetch [esi+32]
				prefetch [edx+32]
				prefetch [esi+48]
				prefetch [edx+48]
				prefetch [esi+64]
				prefetch [edx+64] // call for more data

				mov eax, [esi] // todo - use MMX / SSE for this
				bswap eax
				mov ebx, [edx]
				bswap ebx
				cmp eax, ebx
				jb _isBelow
				ja _isAbove
				add esi, 4
				add edx, 4

				mov eax, [esi]
				bswap eax
				mov ebx, [edx]
				bswap ebx
				cmp eax, ebx
				jb _isBelow
				ja _isAbove
				add esi, 4
				add edx, 4

				mov eax, [esi]
				bswap eax
				mov ebx, [edx]
				bswap ebx
				cmp eax, ebx
				jb _isBelow
				ja _isAbove
				add esi, 4
				add edx, 4

				mov eax, [esi]
				bswap eax
				mov ebx, [edx]
				bswap ebx
				cmp eax, ebx
				jb _isBelow
				ja _isAbove
				add esi, 4
				add edx, 4

				loop _longLoop
				mov dword ptr a, esi
				mov dword ptr b, edx // !!!
				jmp _dontHaveResult

			_isBelow:
			}
			return -1;

			__asm { _isAbove: }
			return 1;

			__asm {
			_dontHaveResult:
			}
		}
		// compare using longs (endian-dependable, uses bswap, prepared for MMX/SSE)
		// when all the 16-byte blocks are equal, a and b are written back so that
		// they point to the first byte which was not compared yet

		/*{
			__asm {
				mov ecx, dword ptr n_length
				shr ecx, 2
				jz _dontHaveResult
				mov esi, dword ptr a
				mov edx, dword ptr b

			_longLoop:
				mov eax, [esi]
				bswap eax
				mov ebx, [edx]
				bswap ebx
				cmp eax, ebx
				jb _isBelow
				ja _isAbove
				add esi, 4
				add edx, 4
				loop _longLoop
				jmp _dontHaveResult

			_isBelow:
			}
			return -1;

			__asm { _isAbove: }
			return 1;

			__asm {
			_dontHaveResult:
				mov dword ptr a, esi
				mov dword ptr b, edx // !!!
			}
		}*/
		// compare using longs (endian-dependable, uses bswap)

		n_length %= 4 * sizeof(uint32_t);
		while(n_length --) {
			int cmp;
			if(cmp = *a ++ - *b ++)
				return cmp;
		}
		// compare the remaining bytes (n_length modulo 16, i.e. up to 15 bytes,
		// as the assembly above consumes 16 bytes per iteration)
		// this compiles quite well

		return 0;
	}

	/*static int _fast_memcmp_check(const uint8_t *a, const uint8_t *b, int n_length)
	{
		int r1 = _fast_memcmp(a, b, n_length);
		if(r1 > 0)
			r1 = 1;
		else if(r1 < 0)
			r1 = -1;
		int r2 = memcmp(a, b, n_length);
		if(r2 > 0)
			r2 = 1;
		else if(r2 < 0)
			r2 = -1;
		_ASSERTE(r1 == r2);
		return r1;
	}*/
#endif // _MSC_VER

	/*
	 *	inline bool CWrapMemCmp::operator ()(uint32_t n_off_a,
	 *		uint32_t n_off_b) const
	 *		- less-than ordering operator
	 *		- n_off_a and n_off_b are offsets of the first bytes of the two
	 *		  rotations to compare (expected to be less than the buffer size)
	 *		- returns true if the rotation starting at n_off_a is lexicographically
	 *		  below the rotation starting at n_off_b, otherwise returns false
	 *		  (including the case when both rotations are equal)
	 */
	inline bool operator ()(uint32_t n_off_a, uint32_t n_off_b) const
	{
		if(n_off_a == n_off_b)
			return false;
		// they are equal then

		const uint8_t *p_org = m_p_org, *p_end = m_p_end;
		// local copies of the members (antialiasing)

		const uint8_t *p_byte_a = p_org + n_off_a;
		const uint8_t *p_byte_b = p_org + n_off_b;
		// get pointers

#ifdef _MSC_VER // MSVC assembly syntax
		// the comparison is split to three linear stages, so that no wrap-around
		// check is needed inside of the comparison loops; the lengths of the stages
		// are (size - max offset), (max offset - min offset) and (min offset)

		size_t n_stage1_length = m_n_size - max(n_off_a, n_off_b);
		// until first one wraps around

		{
			int n_result;
			if((n_result = _fast_memcmp(p_byte_a, p_byte_b, n_stage1_length * sizeof(uint8_t))) < 0)
				return true;
			else if(n_result > 0)
				return false;
		}
		// compare first stage (until one of arrays wraps around)

		p_byte_a += n_stage1_length;
		p_byte_b += n_stage1_length;
		if(p_byte_a == p_end)
			p_byte_a = p_org;
		else /*if(p_byte_b == p_end)*/ { // n_off_a != n_off_b
			_ASSERTE(p_byte_b == p_end); // it must have
			p_byte_b = p_org;
		}
		// shift, wrap around (only the one with the greater offset reached the end)

		size_t n_stage2_length = m_n_size - min(n_off_a, n_off_b) - n_stage1_length;
		// until second one wraps around

		{
			int n_result;
			if((n_result = _fast_memcmp(p_byte_a, p_byte_b, n_stage2_length * sizeof(uint8_t))) < 0)
				return true;
			else if(n_result > 0)
				return false;
		}
		// compare second stage (until the other array wraps around)

		p_byte_a += n_stage2_length;
		p_byte_b += n_stage2_length;
		if(p_byte_a == p_end)
			p_byte_a = p_org;
		else /*if(p_byte_b == p_end)*/ { // n_off_a != n_off_b
			_ASSERTE(p_byte_b == p_end); // it must have
			p_byte_b = p_org;
		}
		// shift, wrap around (now the one with the smaller offset reached the end)

		size_t n_stage3_length = m_n_size - n_stage1_length - n_stage2_length;
		// the rest of comparison

		{
			int n_result;
			if((n_result = _fast_memcmp(p_byte_a, p_byte_b, n_stage3_length * sizeof(uint8_t))) < 0)
				return true;
			else if(n_result > 0)
				return false;
		}
		// compare third stage
#else
		for(size_t i = m_n_size; i; -- i) {
			if(*p_byte_a < *p_byte_b)
				return true;
			else if(*p_byte_a > *p_byte_b)
				return false;
			// compare

			++ p_byte_a, ++ p_byte_b;
			// increment

			if(p_byte_a == p_end)
				p_byte_a = p_org;
			else if(p_byte_b == p_end) // n_off_a != n_off_b
				p_byte_b = p_org;
			// wrap (the offsets differ, so both pointers never reach the end at once)
		}
		// simple, naive code (compares exactly m_n_size bytes)
#endif

		return false;
		// all the bytes were equal, neither rotation is below the other one
	}
};

/*
 *	CBurrowsWheelerTransform::CIota
 *		- function object for creating ascending integer sequences
 *		- the counter is unsigned 32-bit, so that sequences longer than INT_MAX
 *		  elements (Encode() accepts buffers of almost UINT32_MAX bytes) do not
 *		  cause signed integer overflow, which would be undefined behavior
 */
class CBurrowsWheelerTransform::CIota {
protected:
	uint32_t m_n_counter; // the value returned by the next call to operator ()

public:
	/*
	 *	inline CIota::CIota(int n_value)
	 *		- default constructor
	 *		- n_value is the first value of the generated sequence
	 */
	inline CIota(int n_value)
		:m_n_counter(uint32_t(n_value))
	{}

	/*
	 *	inline uint32_t CIota::operator ()()
	 *		- returns the current value of the counter and increments it
	 *		  (the counter wraps around modulo 2^32)
	 */
	inline uint32_t operator ()()
	{
		return m_n_counter ++;
	}
};

/*
 *	static bool CBurrowsWheelerTransform::Decode(const TBuffer &r_t_in_buffer,
 *		TBuffer &r_t_out_buffer)
 *		- decodes data from r_t_in_buffer, outputs to r_t_out_buffer (can be empty)
 *		- the input consists of 32-bit primary index (stored in the native byte
 *		  order, the same way Encode() writes it), followed by the transformed data
 *		- returns true on success, false on failure (input shorter than the
 *		  primary index, primary index out of range - which includes input with
 *		  no data after the index, not enough memory or symbol counter overflow)
 *		- r_t_in_buffer and r_t_out_buffer must not be the same buffer
 */
bool CBurrowsWheelerTransform::Decode(const TBuffer &r_t_src, TBuffer &r_t_dest)
{
	_ASSERTE(&r_t_src != &r_t_dest);
	_ASSERTE(r_t_src.p_Data());
	// make some assumptions about source buffer

	if(r_t_src.n_Size() < sizeof(uint32_t))
		return false;
	uint32_t n_primary_index;
	memcpy(&n_primary_index, r_t_src.p_Data(), sizeof(uint32_t));
	// get primary index (memcpy instead of a pointer cast, the buffer
	// holds bytes and is not guaranteed to be aligned for uint32_t)

	const uint8_t *p_src_data = r_t_src.p_Data() + sizeof(uint32_t);
	const size_t n_src_size = r_t_src.n_Size() - sizeof(uint32_t);
	// get pointer to and size of the real data

	if(n_primary_index >= n_src_size)
		return false;
	// see if primary index is valid (also rejects empty data)

	if(!r_t_dest.Resize(n_src_size, false))
		return false;
	// alloc output buffer

	const int n_max_word_value = 1 << (8 * sizeof(uint8_t));
	// number of combinations in a single word

	uint32_t p_buckets[n_max_word_value] = {0};
	// symbol occurence counters, later turned to offsets of the symbol ranges

	std::vector<uint32_t> indices_list;
	if(!stl_ut::Reserve_N(indices_list, n_src_size))
		return false;
	// allocate list for indices

	for(size_t i = 0; i < n_src_size; ++ i) {			// for i := 0 to N-1 do
		uint32_t &r_n_count = p_buckets[p_src_data[i]];
		indices_list.push_back(r_n_count);				//     P[i] := C[L[i]];
		if(!(++ r_n_count))								//     C[L[i]] := C[L[i]] + 1
			return false; // overflow
	}
	// generate indices (P[i] is the number of occurences
	// of the symbol L[i] in front of the position i)

	{
		uint32_t n_sum = 0;								// sum := 0;
		for(int i = 0; i < n_max_word_value; ++ i) {	// for ch := FIRST(alphabet) to LAST(alphabet) do
			uint32_t n_value = p_buckets[i];			//     temp := C[ch]
			p_buckets[i] = n_sum;						//     C[ch] := sum
			if(n_sum > UINT32_MAX - n_value)
				return false; // overflow
			n_sum += n_value;							//     sum := sum + temp
		}
	}
	// integrate buckets (C[ch] is now the number of symbols smaller than ch)

	{
		uint8_t *p_dest = r_t_dest.p_Data();
		uint32_t i = n_primary_index;					// i:=I;
		for(size_t j = n_src_size; j > 0;) {			// for j := N-1 downto 0 do
			-- j;
			const uint8_t n_byte = p_src_data[i];
			p_dest[j] = n_byte;							//     S[j] := L[i];
			i = indices_list[i] + p_buckets[n_byte];	//     i := P[i] + C[L[i]]
			// P[i] is below the count of n_byte and C[] is the start of its
			// range, so the new i always stays below n_src_size
		}
	}
	// fill output buffer (backwards; counted loop is used
	// so that no pointer in front of the buffer needs to be formed)

	return true;
}

#ifdef __BWT_ENABLE_THREADED_ENCODE

typedef std::vector<uint32_t>::iterator TULongIter; // iterator to the list of rotation indices

/*
 *	CBurrowsWheelerTransform::CSorter
 *		- runable task, sorting an interval of rotation indices
 *		  using CWrapMemCmp ordering over the source buffer
 *		- default-constructed object has no source buffer and its Run() does nothing
 */
class CBurrowsWheelerTransform::CSorter : public CRunable {
protected:
	TULongIter m_p_begin, m_p_end; // the interval of indices to be sorted
	const TBuffer *m_p_src; // the buffer the indices point to (0 if not set)

public:
	/*
	 *	CSorter::CSorter()
	 *		- default constructor; sets no work (the source
	 *		  buffer pointer is cleared instead of being left uninitialized)
	 */
	CSorter()
		:m_p_src(0)
	{}

	/*
	 *	CSorter::CSorter(TULongIter p_begin, TULongIter p_end, const TBuffer &r_t_src)
	 *		- constructor; p_begin and p_end delimit the indices to sort,
	 *		  r_t_src is the buffer the indices point to (only pointer is kept)
	 */
	CSorter(TULongIter p_begin, TULongIter p_end, const TBuffer &r_t_src)
		:m_p_begin(p_begin), m_p_end(p_end), m_p_src(&r_t_src)
	{}

	/*
	 *	virtual void CSorter::Run()
	 *		- sorts the interval of indices
	 */
	virtual void Run()
	{
		if(!m_p_src)
			return;
		// nothing to sort with

		std::sort(m_p_begin, m_p_end, CWrapMemCmp(*m_p_src));
		// sort the interval
	}
};

/*
 *	CBurrowsWheelerTransform::CMerger
 *		- runable task, merging two adjacent sorted intervals of rotation
 *		  indices to a separate output, using CWrapMemCmp ordering
 *		- default-constructed object has no source buffer and its Run() does nothing
 */
class CBurrowsWheelerTransform::CMerger : public CRunable {
protected:
	TULongIter m_p_out, m_p_begin, m_p_middle, m_p_end; // output / [begin, middle) and [middle, end) inputs
	std::vector<uint32_t> m_vec; // not used by the code in this class
	const TBuffer *m_p_src; // the buffer the indices point to (0 if not set)

public:
	/*
	 *	CMerger::CMerger()
	 *		- default constructor; sets no work (the source
	 *		  buffer pointer is cleared instead of being left uninitialized)
	 */
	CMerger()
		:m_p_src(0)
	{}

	/*
	 *	CMerger::CMerger(TULongIter p_out, TULongIter p_begin,
	 *		TULongIter p_middle, TULongIter p_end, const TBuffer &r_t_src)
	 *		- constructor; the sorted intervals [p_begin, p_middle) and
	 *		  [p_middle, p_end) will be merged to p_out (which must have space
	 *		  for all the elements and must not overlap the inputs)
	 *		- r_t_src is the buffer the indices point to (only pointer is kept)
	 */
	CMerger(TULongIter p_out, TULongIter p_begin,
		TULongIter p_middle, TULongIter p_end, const TBuffer &r_t_src)
		:m_p_out(p_out), m_p_begin(p_begin), m_p_middle(p_middle), m_p_end(p_end), m_p_src(&r_t_src)
	{}

	/*
	 *	virtual void CMerger::Run()
	 *		- merges the two intervals
	 */
	virtual void Run()
	{
		if(!m_p_src)
			return;
		// nothing to merge with

		std::merge(m_p_begin, m_p_middle, m_p_middle, m_p_end, m_p_out, CWrapMemCmp(*m_p_src));
		// merge somewhere else. my implementation of inplace_merge is not thread safe

		//std::inplace_merge(m_p_begin, m_p_middle, m_p_end, CWrapMemCmp(*m_p_src));
		// sort the interval
	}
};

/*
 *	static bool CBurrowsWheelerTransform::ThreadedEncode(const TBuffer &r_t_in_buffer,
 *		TBuffer &r_t_out_buffer, int n_thread_num)
 *		- encodes data from r_t_in_buffer, outputs to r_t_out_buffer (can be empty)
 *		- works in parallel, n_thread_num must be power of two
 *		- returns true on success, false on failure
 *		- note this doesn't work with empty input buffer
 *		- note this only gets compiled if __BWT_ENABLE_THREADED_ENCODE macro
 *		  is defined (not by default)
 */
bool CBurrowsWheelerTransform::ThreadedEncode(const TBuffer &r_t_src,
	TBuffer &r_t_dest, int n_thread_num)
{
	_ASSERTE(&r_t_src != &r_t_dest);
	_ASSERTE(r_t_src.p_Data());
	// make some assumptions about source buffer

	if(r_t_src.n_Size() > UINT32_MAX - sizeof(uint32_t))
		return false; // overflow
	if(!r_t_dest.Resize(r_t_src.n_Size() + sizeof(uint32_t), false))
		return false;
	// allocate output buffer

	std::vector<uint32_t> indices_list(r_t_src.n_Size());
	std::vector<uint32_t> sorted_list(r_t_src.n_Size());
	if(indices_list.size() < r_t_src.n_Size() ||
	   sorted_list.size() < r_t_src.n_Size())
		return false;
	// allocate list for indices

	std::generate(indices_list.begin(), indices_list.end(), CIota(0));
	// generate sequence of indices

	const int n_max_thread_num = 64;
	// max number of worker threads

	if(n_thread_num > n_max_thread_num)
		return false;

	{
		const size_t n_size = r_t_src.n_Size();
		const size_t n_part_size = n_size / n_thread_num;
		const size_t n_rest_size = n_size % n_thread_num;
		// size of one part for sorting thread and of the rest

		CSorter p_sorter[n_max_thread_num];
		CThread p_thread[n_max_thread_num];

		{
			size_t n_begin = 0;
			size_t n_cur_size = n_part_size + n_rest_size;
			for(int i = 0; i < n_thread_num; ++ i,
			   n_begin += n_cur_size, n_cur_size = n_part_size) {
				p_sorter[i] = CSorter(indices_list.begin() + n_begin,
					indices_list.begin() + n_begin + n_cur_size, r_t_src);
				p_thread[i].AttachRunable(p_sorter[i]);
				if(!p_thread[i].Start())
					return false;
				// start sorting in threads
			}

			for(int i = 0; i < n_thread_num; ++ i)
				p_thread[i].Stop();
			// wait for threads to finish
		}
		// partially sort list, in n_thread_num threads

		CMerger p_merger[n_max_thread_num / 2];

		for(int n = n_thread_num / 2, m = 1; n != 0; n /= 2, m *= 2) {
			_ASSERTE(!(n % 2) || n == 1); // must be power of two

			size_t n_begin = 0;
			size_t n_cur_half = n_part_size * m + n_rest_size;
			size_t n_cur_size = n_cur_half + n_part_size * m;
			for(int i = 0; i < n; ++ i, n_begin += n_cur_size,
			   n_cur_half = n_part_size * m, n_cur_size = n_part_size * 2 * m) {
				p_merger[i] = CMerger(sorted_list.begin() + n_begin,
					indices_list.begin() + n_begin,
					indices_list.begin() + n_begin + n_cur_half,
					indices_list.begin() + n_begin + n_cur_size, r_t_src);
				p_thread[i].AttachRunable(p_merger[i]);
				if(!p_thread[i].Start())
					return false;
				// start merging in threads
			}
			// merge sub-lists, all merges in one iteration can be executed in parallel

			for(int i = 0; i < n; ++ i)
				p_thread[i].Stop();
			// wait for threads to finish

			indices_list.swap(sorted_list);
		}
		// merge to create sorted list, need n_thread_num / 2 threads
	}
	// sort indices (bottleneck)

	const uint8_t *p_src = r_t_src.p_Data();
	uint8_t *p_dest = r_t_dest.p_Data() + sizeof(uint32_t);
	for(size_t i = 0, n = r_t_src.n_Size(); i < n; ++ i, ++ p_dest)
		*p_dest = p_src[(indices_list[i] + n - 1) % n];
	// fill output buffer

	size_t n_primary_index = std::find(indices_list.begin(),
		indices_list.end(), 0U) - indices_list.begin();
	_ASSERTE(n_primary_index < r_t_src.n_Size());
	// find primary index

	*(uint32_t*)r_t_dest.p_Data() = uint32_t(n_primary_index);
	// write it to front of the buffer

	return true;
}

#endif // __BWT_ENABLE_THREADED_ENCODE

/*
 *	static bool CBurrowsWheelerTransform::Encode(const TBuffer &r_t_in_buffer,
 *		TBuffer &r_t_out_buffer)
 *		- encodes data from r_t_in_buffer, outputs to r_t_out_buffer (can be empty)
 *		- returns true on success, false on failure
 */
bool CBurrowsWheelerTransform::Encode(const TBuffer &r_t_src, TBuffer &r_t_dest)
{
	_ASSERTE(&r_t_src != &r_t_dest);
	_ASSERTE(r_t_src.p_Data());
	// make some assumptions about source buffer

	if(r_t_src.n_Size() > UINT32_MAX - sizeof(uint32_t))
		return false; // overflow
	if(!r_t_dest.Resize(r_t_src.n_Size() + sizeof(uint32_t), false))
		return false;
	// allocate output buffer

	std::vector<uint32_t> indices_list(r_t_src.n_Size());
	if(indices_list.size() < r_t_src.n_Size())
		return false;
	// allocate list for indices

	std::generate(indices_list.begin(), indices_list.end(), CIota(0));
	// generate sequence of indices

	std::sort(indices_list.begin(), indices_list.end(), CWrapMemCmp(r_t_src));
	// sort indices

	const uint8_t *p_src = r_t_src.p_Data();
	uint8_t *p_dest = r_t_dest.p_Data() + sizeof(uint32_t);
	for(size_t i = 0, n = r_t_src.n_Size(); i < n; ++ i, ++ p_dest)
		*p_dest = p_src[(indices_list[i] + n - 1) % n];
	// fill output buffer

	size_t n_primary_index = std::find(indices_list.begin(),
		indices_list.end(), 0U) - indices_list.begin();
	_ASSERTE(n_primary_index < r_t_src.n_Size());
	// find primary index

	*(uint32_t*)r_t_dest.p_Data() = uint32_t(n_primary_index);
	// write it to front of the buffer

	return true;
}

/*
 *								=== ~CBurrowsWheelerTransform ===
 */

/*
 *								=== CMoveToFrontTransform ===
 */

/*
 *	static void CMoveToFrontTransform::Decode(TBuffer &r_t_buffer, int n_algorithm)
 *		- decodes data in r_t_buffer (operates in-situ, the decoded
 *		  data overwrite the encoded data, the size does not change)
 *		- n_algorithm is MTF algorithm, one of algo_MTF, algo_MTF_1
 *		  (needs to be the same algorithm the data were encoded with)
 */
void CMoveToFrontTransform::Decode(TBuffer &r_t_buffer, int n_algorithm)
{
	_Decode(r_t_buffer, r_t_buffer, n_algorithm);
	// decode can work in-situ (the same buffer is both source and destination)
}

/*
 *	static void CMoveToFrontTransform::Encode(TBuffer &r_t_buffer, int n_algorithm)
 *		- encodes data in r_t_buffer (operates in-situ, the encoded
 *		  data overwrite the original data, the size does not change)
 *		- n_algorithm is MTF algorithm, one of algo_MTF, algo_MTF_1
 */
void CMoveToFrontTransform::Encode(TBuffer &r_t_buffer, int n_algorithm)
{
	_Encode(r_t_buffer, r_t_buffer, n_algorithm);
	// encode can work in-situ (the same buffer is both source and destination)
}

/*
 *	static bool CMoveToFrontTransform::Decode(const TBuffer &r_t_in_buffer,
 *		TBuffer &r_t_out_buffer, int n_algorithm)
 *		- MTF-decodes r_t_in_buffer to r_t_out_buffer; the output buffer
 *		  is resized to the size of the input (its contents are overwritten)
 *		- n_algorithm is MTF algorithm, one of algo_MTF, algo_MTF_1
 *		- returns true on success, false if the output could not be resized
 */
bool CMoveToFrontTransform::Decode(const TBuffer &r_t_src, TBuffer &r_t_dest, int n_algorithm)
{
	if(r_t_dest.Resize(r_t_src.n_Size(), false)) {
		_Decode(r_t_src, r_t_dest, n_algorithm);
		// the output has the right size, decode to it

		return true;
	}
	// try to allocate the output buffer

	return false;
	// not enough memory
}

/*
 *	static bool CMoveToFrontTransform::Encode(const TBuffer &r_t_in_buffer,
 *		TBuffer &r_t_out_buffer, int n_algorithm)
 *		- MTF-encodes r_t_in_buffer to r_t_out_buffer; the output buffer
 *		  is resized to the size of the input (its contents are overwritten)
 *		- n_algorithm is MTF algorithm, one of algo_MTF, algo_MTF_1
 *		- returns true on success, false if the output could not be resized
 */
bool CMoveToFrontTransform::Encode(const TBuffer &r_t_src, TBuffer &r_t_dest, int n_algorithm)
{
	if(r_t_dest.Resize(r_t_src.n_Size(), false)) {
		_Encode(r_t_src, r_t_dest, n_algorithm);
		// the output has the right size, encode to it

		return true;
	}
	// try to allocate the output buffer

	return false;
	// not enough memory
}

/*
 *	static void CMoveToFrontTransform::_Decode(const TBuffer &r_t_src,
 *		TBuffer &r_t_dest, int n_algorithm)
 *		- turns MTF ranks from r_t_src back to symbols, written to r_t_dest
 *		  (which must have the same size as r_t_src and may be the same buffer)
 *		- n_algorithm is MTF algorithm; algo_MTF moves the decoded symbol to the
 *		  front of the table, any other value is handled as algo_MTF_1 (symbols
 *		  from rank 2 and up only move to the second place, rank 1 moves to the front)
 */
void CMoveToFrontTransform::_Decode(const TBuffer &r_t_src, TBuffer &r_t_dest, int n_algorithm)
{
	_ASSERTE(r_t_src.n_Size() == r_t_dest.n_Size());
	_ASSERTE(r_t_src.p_Data() && r_t_dest.p_Data());
	// the buffers must be allocated, and of the same size

	const int n_symbol_num = 1 << (8 * sizeof(uint8_t));
	// size of the alphabet

	uint8_t p_table[n_symbol_num];
	for(int n_symbol = 0; n_symbol < n_symbol_num; ++ n_symbol)
		p_table[n_symbol] = (uint8_t)n_symbol;
	// the table of symbols ordered by recency, starts as identity

	const bool b_always_to_front = n_algorithm == algo_MTF;
	// otherwise use the MTF-1 rules

	const uint8_t *p_rank = r_t_src.p_Data();
	uint8_t *p_symbol = r_t_dest.p_Data();
	for(size_t n_remaining = r_t_src.n_Size(); n_remaining; -- n_remaining, ++ p_rank, ++ p_symbol) {
		const size_t n_rank = *p_rank;
		const uint8_t n_decoded = p_table[n_rank];
		// read the rank first (the buffers may be the same), look the symbol up

		*p_symbol = n_decoded;
		// output the symbol

		if(n_rank) {
			const size_t n_new_pos = (b_always_to_front || n_rank == 1)? 0 : 1;
			// where does the symbol move to

			memmove(p_table + n_new_pos + 1, p_table + n_new_pos,
				(n_rank - n_new_pos) * sizeof(uint8_t));
			// shift the symbols between the new and the old position by one (simulate encoder)

			p_table[n_new_pos] = n_decoded;
			// put the symbol to its new position
		}
		// rank zero leaves the table as it is
	}
	// decode
}

/*
 *	static void CMoveToFrontTransform::_Encode(const TBuffer &r_t_src,
 *		TBuffer &r_t_dest, int n_algorithm)
 *		- replaces symbols from r_t_src by their MTF ranks, written to r_t_dest
 *		  (which must have the same size as r_t_src and may be the same buffer)
 *		- n_algorithm is MTF algorithm, one of algo_MTF (the encoded symbol moves
 *		  to the front of the table) or algo_MTF_1 (symbols from rank 2 and up
 *		  only move to the second place, rank 1 moves to the front)
 */
void CMoveToFrontTransform::_Encode(const TBuffer &r_t_src, TBuffer &r_t_dest, int n_algorithm)
{
	_ASSERTE(r_t_src.n_Size() == r_t_dest.n_Size());
	_ASSERTE(r_t_src.p_Data() && r_t_dest.p_Data());
	// the buffers must be allocated, and of the same size

	_ASSERTE(n_algorithm == algo_MTF || n_algorithm == algo_MTF_1);
	// anything but algo_MTF is handled as algo_MTF_1

	const int n_symbol_num = 1 << (8 * sizeof(uint8_t));
	// size of the alphabet

	uint8_t p_table[n_symbol_num];
	for(int n_symbol = 0; n_symbol < n_symbol_num; ++ n_symbol)
		p_table[n_symbol] = (uint8_t)n_symbol;
	// the table of symbols ordered by recency, starts as identity

	const bool b_always_to_front = n_algorithm == algo_MTF;
	// otherwise use the MTF-1 rules

	const uint8_t *p_symbol = r_t_src.p_Data();
	uint8_t *p_rank = r_t_dest.p_Data();
	for(size_t n_remaining = r_t_src.n_Size(); n_remaining; -- n_remaining, ++ p_symbol, ++ p_rank) {
		const uint8_t n_encoded = *p_symbol;
		// read the symbol first (the buffers may be the same)

		const size_t n_rank = std::find(p_table, p_table + n_symbol_num, n_encoded) - p_table;
		_ASSERTE(n_rank < size_t(n_symbol_num));
		// find it in the table (the table contains every symbol exactly once)

		*p_rank = (uint8_t)n_rank;
		// output the position where the symbol has been found (we wish for a lot of zeros)

		if(n_rank) {
			const size_t n_new_pos = (b_always_to_front || n_rank == 1)? 0 : 1;
			// where does the symbol move to

			memmove(p_table + n_new_pos + 1, p_table + n_new_pos,
				(n_rank - n_new_pos) * sizeof(uint8_t));
			// shift the symbols between the new and the old position by one

			p_table[n_new_pos] = n_encoded;
			// put the symbol to its new position
		}
		// rank zero leaves the table as it is
	}
	// encode
}

/*
 *								=== ~CMoveToFrontTransform ===
 */

/*
 *								=== CRunLengthCodec ===
 */

/*
 *	static bool CRunLengthCodec::Decode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer)
 *		- decodes data from r_t_in_buffer, outputs to r_t_out_buffer (can be empty)
 *		- returns true on success, false on failure
 */
bool CRunLengthCodec::Decode(const TBuffer &r_t_src, TBuffer &r_t_dest)
{
	uint8_t *p_output = r_t_dest.p_Data(),
		*p_end = r_t_dest.p_Data() + r_t_dest.n_Size();
	// output buffer (size will change dynamically)

	for(const uint8_t *p_src = r_t_src.p_Data(),
	   *p_end2 = r_t_src.p_Data() + r_t_src.n_Size(); p_src != p_end2;) {
		uint8_t n_code = *p_src;
		bool b_compressed = n_code & 1;
		int n_run_length = (n_code >> 1) + 1;
		// determine compression and run length

		if(p_src + ((b_compressed)? 2 : 1 + n_run_length) > p_end2)
			return false;
		// buffer overrun, invalid input data

		++ p_src;
		// skip code

		if(p_output + n_run_length > p_end) {
			uint32_t n_off = uint32_t(p_output - r_t_dest.p_Data());
			if(!r_t_dest.Grow(n_run_length))
				return false;
			p_output = r_t_dest.p_Data() + n_off;
			p_end = r_t_dest.p_Data() + r_t_dest.n_Size();
		}
		// make sure there's enough space

		if(b_compressed) {
			uint8_t n_data = *p_src ++;
			for(const uint8_t *p_end2 = p_output + n_run_length;
			   p_output != p_end2; ++ p_output)
				*p_output = n_data;
			// replicate the same byte n_run_length times
		} else {
			for(const uint8_t *p_end2 = p_output + n_run_length;
			   p_output != p_end2; ++ p_output, ++ p_src)
				*p_output = *p_src;
			// copy n_run_length bytes
		}
		// read data

		_ASSERTE(p_output <= p_end);
		_ASSERTE(p_end == r_t_dest.p_Data() + r_t_dest.n_Size());
		// make sure we don't cross buffer boundaries
	}
	// IRLE loop

	r_t_dest.Resize(uint32_t(p_output - r_t_dest.p_Data()));
	// shrink the buffer to it's final length

	return true;
}

/*
 *	static bool CRunLengthCodec::Encode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer)
 *		- encodes data from r_t_in_buffer, outputs to r_t_out_buffer (can be empty)
 *		- returns true on success, false on failure
 */
bool CRunLengthCodec::Encode(const TBuffer &r_t_src, TBuffer &r_t_dest)
{
	const int n_max_repeats = 0x80;
	// compressor config

	r_t_dest.Resize(r_t_dest.n_Capacity());
	uint8_t *p_output = r_t_dest.p_Data(),
		*p_out_end = r_t_dest.p_Data() + r_t_dest.n_Size();
	// output buffer (size will change dynamically)

	for(const uint8_t *p_src = r_t_src.p_Data(),
	   *p_end = r_t_src.p_Data() + r_t_src.n_Size(); p_src != p_end;) {
		int n_uncompressed_size = p_end - p_src, n_compressed_size = 0;
		if(p_src + min_RunLength < p_end) {
			for(const uint8_t *p_src2 = p_src, *p_end2 = p_end - min_RunLength;
			   p_src2 != p_end2; ++ p_src2) {
				if(*p_src2 == p_src2[1]) {
					uint8_t n_byte = *p_src2;
					int n_run_length = 1;
					while(p_src2 + n_run_length != p_end && p_src2[n_run_length] == n_byte)
						++ n_run_length;
					if(n_run_length >= min_RunLength) {
						n_compressed_size = n_run_length;
						n_uncompressed_size = p_src2 - p_src;
						break;
					}
				}
			}
		}
		// get size of uncompressed data, preceeding compressed data,
		// get size of compressed data as well

		int n_add = n_uncompressed_size + (n_uncompressed_size +
			n_max_repeats - 1) / n_max_repeats;
		if(n_compressed_size)
			n_add += 2 * (n_compressed_size + n_max_repeats - 1) / n_max_repeats;
		if(p_output + n_add > p_out_end) {
			size_t n_off = p_output - r_t_dest.p_Data();
			if(!r_t_dest.Grow(n_add) || !r_t_dest.Resize(r_t_dest.n_Capacity()))
				return false;
			p_output = r_t_dest.p_Data() + n_off;
			p_out_end = r_t_dest.p_Data() + r_t_dest.n_Size();
			_ASSERTE(p_output + n_add <= p_out_end);
		}
		// make sure there's enough space

		while(n_uncompressed_size) {
			int n_write = min(n_uncompressed_size, n_max_repeats);
			_ASSERTE(!((n_write - 1) & 0x80));

			*p_output ++ = (n_write - 1) << 1;
			memcpy(p_output, p_src, n_write * sizeof(uint8_t));

			p_output += n_write;
			n_uncompressed_size -= n_write;
			p_src += n_write;

			_ASSERTE(p_output <= p_out_end);
		}
		// write uncompressed part

		while(n_compressed_size) {
			int n_write = min(n_compressed_size, n_max_repeats);
			_ASSERTE(!((n_write - 1) & 0x80));

			*p_output ++ = ((n_write - 1) << 1) | 1;
			*p_output ++ = *p_src;

			n_compressed_size -= n_write;
			p_src += n_write;

			_ASSERTE(p_output <= p_out_end);
		}
		// write compressed part
	}
	// RLE loop

	_ASSERTE(p_output <= p_out_end);
	r_t_dest.Resize(uint32_t(p_output - r_t_dest.p_Data()));
	// shrink the buffer to it's final length

	return true;
}

/*
 *								=== ~CRunLengthCodec ===
 */

/*
 *								=== CModifiedRLECodec ===
 */

/**
 *	@def __MODIFIED_RLE_USE_RLE_EXP
 *	@brief if defined, uses the exponential RLE. otherwise uses naive RLE
 */
#define __MODIFIED_RLE_USE_RLE_EXP

/**
 *	@brief exponential encoding for RLE
 *
 *	The run lengths are stored using a variable number of granules. A table
 *	of thresholds determines how many granules a given run length needs;
 *	the threshold is subtracted from the run length before it is stored,
 *	so that each number of granules covers its own range of run lengths.
 *
 *	@tparam _n_min_run_length is minimal run length to encode, in symbols (e.g. 3)
 *	@tparam _n_bit_granularity is granularity of encoded lenghts,
 *		in bits (e.g. 1 is bit encoding, 8 is encoding by byte)
 *	@tparam _n_min_width is minimal run length encode size, in granules (2 seems
 *		to give the best compression)
 */
template <const int _n_min_run_length, const int _n_bit_granularity,
	const int _n_min_width>
struct CModifiedRLECodec::TExpEncoding {
	/**
	 *	@brief configuration stored as enum
	 */
	enum {
		n_min_run_length = _n_min_run_length, /**< @brief minimal encoded run length, in symbols */
		n_min_width = _n_min_width, /**< @brief minimal encoded size, in granules, not in bits! */
		n_bit_granularity = _n_bit_granularity, /**< @brief size of a granule, in bits */
		n_thresh_num = 32 / n_bit_granularity /**< @brief number of thresholds (granules in 32 bits) */
	};

	size_t p_run_thresh[n_thresh_num]; /**< @brief run encode thresholds */
	// compare to p_run_thresh to know how many bytes to use, subtract p_run_offset before encoding

	/**
	 *	@brief default constructor; fills the table of thresholds
	 *
	 *	The first n_min_width thresholds equal n_min_run_length, each of the
	 *	following ones adds the number of values that fit in i granules.
	 */
	TExpEncoding()
	{
		_ASSERTE(n_min_width > 0);
		_ASSERTE(n_min_width <= n_thresh_num);
		_ASSERTE(n_bit_granularity < 32);
		// these should be static asserts, but they are not meant to be overrided by the user

		for(int i = 0; i < n_min_width; ++ i)
			p_run_thresh[i] = n_min_run_length;
		for(int i = n_min_width; i < n_thresh_num; ++ i) {
			int n_byte_num = i;
			size_t n_max_run_length = n_Mask(size_t(n_bit_granularity * n_byte_num));
			n_max_run_length += p_run_thresh[i - 1];
			p_run_thresh[i] = n_max_run_length + 1;
			// the greatest run length that fits in i granules, plus one
		}
	}

	/**
	 *	@brief determines the number of granules needed to encode a run length
	 *	@param[in] n_compressed_size is run length (not adjusted)
	 *	@return Returns the index of the first threshold (starting at n_min_width)
	 *		which is greater than n_compressed_size, or n_thresh_num if there is none.
	 */
	int n_Encode_Byte_Num(size_t n_compressed_size) const
	{
		if(n_thresh_num > 4) { // compile-time constant
			const size_t *p_it = std::upper_bound(p_run_thresh + n_min_width,
				p_run_thresh + n_thresh_num, n_compressed_size); // binary search for big tables
			if(p_it < p_run_thresh + n_thresh_num && n_compressed_size < *p_it)
				return int(p_it - p_run_thresh);
		} else {
			for(int i = n_min_width; i < n_thresh_num; ++ i) { // linear search for small tables
				if(n_compressed_size < p_run_thresh[i])
					return i;
			}
		}
		return n_thresh_num;
	}

	/**
	 *	@brief subtracts the threshold from a run length, before encoding it
	 *	@param[in] n_compressed_size is run length
	 *	@param[in] n_compressed_run_byte_num is the result of n_Encode_Byte_Num() for this run length
	 *	@return Returns the value to be stored.
	 */
	size_t n_Adjust_Size(size_t n_compressed_size, int n_compressed_run_byte_num) const
	{
		_ASSERTE(n_compressed_size >= p_run_thresh[n_compressed_run_byte_num - 1]);
		return n_compressed_size - p_run_thresh[n_compressed_run_byte_num - 1];
	}

	/**
	 *	@brief adds the threshold back to a decoded value (inverse of n_Adjust_Size())
	 *	@param[in] n_compressed_size is the stored value
	 *	@param[in] n_compressed_run_byte_num is number of granules the value was stored in
	 *	@return Returns the run length.
	 */
	size_t n_UnAdjust_Size(size_t n_compressed_size, int n_compressed_run_byte_num) const
	{
		_ASSERTE(n_compressed_size < SIZE_MAX - p_run_thresh[n_compressed_run_byte_num - 1]);
		return n_compressed_size + p_run_thresh[n_compressed_run_byte_num - 1];
	}

	/**
	 *	@brief checks consistency of the table of thresholds using debug assertions
	 *	@note This is only effective in builds where _ASSERTE is enabled.
	 */
	void UnitTest() const
	{
		for(int j = n_min_width; j < n_thresh_num; ++ j) {
			size_t n_compressed_size = p_run_thresh[j] - 1;
			int n_expected_byte_num = j;

			int n_compressed_run_byte_num = n_Encode_Byte_Num(n_compressed_size);
			n_compressed_size = n_Adjust_Size(n_compressed_size, n_compressed_run_byte_num);
			// use table to determine number of bytes

			_ASSERTE(n_compressed_run_byte_num == n_expected_byte_num); // number of bytes must match
			_ASSERTE(n_compressed_size == n_Mask(size_t(n_compressed_run_byte_num * n_bit_granularity))); // must be all ones
		}
		// simple cases - upper bounds

		for(int j = 0; j < n_thresh_num; ++ j) {
			size_t n_compressed_size = p_run_thresh[j];
			int n_expected_byte_num = j + 1;

			int n_compressed_run_byte_num = n_Encode_Byte_Num(n_compressed_size);
			n_compressed_size = n_Adjust_Size(n_compressed_size, n_compressed_run_byte_num);
			// use table to determine number of bytes

			_ASSERTE(n_compressed_run_byte_num == max(n_min_width, n_expected_byte_num)); // number of bytes must match
			_ASSERTE(n_compressed_size == 0); // must be zero
		}
		// simple cases - lower bounds

		const size_t n_exhaustive_end = p_run_thresh[n_thresh_num - 1];
		// run lengths from the last threshold up need n_thresh_num granules and
		// are not checked below, so there is no use in iterating over them (going
		// all the way to SIZE_MAX would never finish with 64-bit size_t)

		for(size_t _j = n_min_run_length; _j < n_exhaustive_end; ++ _j) {
			size_t n_compressed_size = _j;
			int n_compressed_run_byte_num = n_Encode_Byte_Num(n_compressed_size);
			n_compressed_size = n_Adjust_Size(n_compressed_size, n_compressed_run_byte_num);
			// use table to determine number of bytes

			if(n_compressed_run_byte_num < n_thresh_num) {
				n_compressed_size >>= n_bit_granularity * n_compressed_run_byte_num;
				_ASSERTE(!n_compressed_size);
				// make sure it fits in the given number of bytes
			}
		}
		// exhaustive test of all the run lengths below the last threshold

		// 2 - 257 encoded using 1 byte
		// 258 - (65537 + 258) encoded using 2 bytes, as we can subtract 258 from the value
		// this depends on value of n_max_run_length, cannot be stored as const
	}
};

/*
 *	static bool CModifiedRLECodec::Decode(const TBuffer &r_t_src,
 *		const TBuffer &r_t_src_runs, TBuffer &r_t_dest)
 *		- decodes data from r_t_src / r_t_src_runs, outputs to r_t_dest (can be empty)
 *		- r_t_src contains literal bytes and run markers (repeated bytes),
 *		  r_t_src_runs contains the lengths of the marked runs
 *		- returns true on success, false on failure (malformed run marker,
 *		  run length data missing or left over, output buffer could not grow)
 */
bool CModifiedRLECodec::Decode(const TBuffer &r_t_src, const TBuffer &r_t_src_runs, TBuffer &r_t_dest)
{
	r_t_dest.Resize(r_t_dest.n_Capacity());
	uint8_t *p_output = r_t_dest.p_Data(),
		*p_out_end = r_t_dest.p_Data() + r_t_dest.n_Size();
	// output buffer (size will change dynamically)

	const uint8_t *p_src = r_t_src.p_Data(), *p_end = p_src + r_t_src.n_Size();
	// source buffers

	//FILE *p_fw = fopen("rle_dec.txt", "w"); // debug

#ifdef __MODIFIED_RLE_USE_RLE_EXP
	TExpEncoding<> exp_decode; 
	CBitDecoder decoder(r_t_src_runs);
#else // __MODIFIED_RLE_USE_RLE_EXP
	const uint8_t *p_rl_src = r_t_src_runs.p_Data(), *p_rl_end = p_rl_src + r_t_src_runs.n_Size();
#endif // __MODIFIED_RLE_USE_RLE_EXP

	while(p_src != p_end) {
		int n_uncompressed_size = int(p_end - p_src), n_compressed_num = 0;
		if(min_RunLength <= n_uncompressed_size) {
			for(const uint8_t *p_src2 = p_src, *p_end2 = p_end - min_RunLength + 1;
			   p_src2 != p_end2; ++ p_src2) {
				if(*p_src2 == p_src2[1]) {
					uint8_t n_byte = *p_src2;
					int n_run_length = 2;
					while(p_src2 + n_run_length != p_end && p_src2[n_run_length] == n_byte)
						++ n_run_length;
					if(n_run_length >= min_RunLength) {
						n_compressed_num = n_run_length;
						n_uncompressed_size = int(p_src2 - p_src);
						break;
					}
				}
			}
		}
		// get size of uncompressed data, preceeding compressed data,
		// get size of the run marker as well (zero if there is no run
		// in the rest of the source buffer)

#ifdef __MODIFIED_RLE_USE_RLE_EXP
		int n_compressed_size = 0;
		if(n_compressed_num) {
			if(n_compressed_num < min_RunLength)
				return false;
			// must be either 0 or more than min_RunLength

			int n_rle_byte_num = n_compressed_num - (min_RunLength - TExpEncoding<>::n_min_width);
			int n_rle_bit_num = TExpEncoding<>::n_bit_granularity * n_rle_byte_num;
			if(!decoder.Decode_Symbol(n_compressed_size, n_rle_bit_num))
				return false;
			n_compressed_size = exp_decode.n_UnAdjust_Size(n_compressed_size, n_rle_byte_num);
			// calculate length of compressed data
		}
#else // __MODIFIED_RLE_USE_RLE_EXP
		if(n_compressed_num % min_RunLength)
			return false;
		n_compressed_num /= min_RunLength;
		// must be multiplies of min_RunLength

		int n_compressed_size = 0;
		if(n_compressed_num > p_rl_end - p_rl_src)
			return false;
		// compare lengths rather than pointers, p_rl_src + n_compressed_num
		// could point far behind the end of the run length buffer
		for(int i = 0; i < n_compressed_num; ++ i, ++ p_rl_src)
			n_compressed_size += int(*p_rl_src) + min_RunLength;
		// calculate length of compressed data
#endif // __MODIFIED_RLE_USE_RLE_EXP

		if(n_compressed_size < 0)
			return false;
		// a negative run length can only be a result of damaged data

		/*if(n_uncompressed_size)
		{	if(n_uncompressed_size == 2) {
				_ASSERTE(min_RunLength != 2 || p_src[0] != p_src[1]); // would be a run, no?
				fprintf(p_fw, "uncompressed %d [0x%02x, 0x%02x]\n", n_uncompressed_size, p_src[0], p_src[1]);
			} else
				fprintf(p_fw, "uncompressed %d\n", n_uncompressed_size);
		}
		if(n_compressed_size)
			fprintf(p_fw, "compressed %d [0x%02x]\n", n_compressed_size, p_src[n_uncompressed_size]);*/
		// debug

		int n_add = n_uncompressed_size + n_compressed_size;
		if(n_add > p_out_end - p_output) {
			size_t n_off = p_output - r_t_dest.p_Data();
			if(!r_t_dest.Grow(n_add) || !r_t_dest.Resize(r_t_dest.n_Capacity()))
				return false;
			p_output = r_t_dest.p_Data() + n_off;
			p_out_end = r_t_dest.p_Data() + r_t_dest.n_Size();
			_ASSERTE(p_output + n_add <= p_out_end);
		}
		// make sure there's enough space (the buffer may have moved,
		// the output pointers need to be recalculated)

		memcpy(p_output, p_src, n_uncompressed_size);
		p_output += n_uncompressed_size;
		p_src += n_uncompressed_size;
		// copy uncompressed data

		if(n_compressed_num) {
			_ASSERTE(p_src < p_end); // the run marker is still ahead
			memset(p_output, *p_src, n_compressed_size);
			p_output += n_compressed_size;
#ifdef __MODIFIED_RLE_USE_RLE_EXP
			p_src += n_compressed_num;
#else // __MODIFIED_RLE_USE_RLE_EXP
			p_src += min_RunLength * n_compressed_num;
#endif // __MODIFIED_RLE_USE_RLE_EXP
		}
		// fill compressed data; p_src is only dereferenced if there is a run,
		// otherwise it already equals p_end and must not be read

		_ASSERTE(p_output <= p_out_end);
		_ASSERTE(p_src <= p_end);
	}
	// decompress modified RLE

	//fclose(p_fw); // debug

	_ASSERTE(p_src == p_end);
	//_ASSERTE(decoder.b_Finished()); // t_odo - fix this! (save compressed data, compare output, etc.)
	// make sure we've read both input buffers

	_ASSERTE(p_output <= p_out_end);
	r_t_dest.Resize(uint32_t(p_output - r_t_dest.p_Data()));
	// shrink the buffer to it's final length

#ifdef __MODIFIED_RLE_USE_RLE_EXP
	return decoder.b_Finished(); // may be a problem in the data
#else // __MODIFIED_RLE_USE_RLE_EXP
	return p_rl_src == p_rl_end; // may be a problem in the data
#endif // __MODIFIED_RLE_USE_RLE_EXP
}

/*
 *	static bool CModifiedRLECodec::Encode(const TBuffer &r_t_src,
 *		TBuffer &r_t_dest, TBuffer &r_t_dest_runs)
 *		- encodes data from r_t_src, outputs to r_t_dest / r_t_dest_runs (can be empty)
 *		- returns true on success, false on failure
 */
bool CModifiedRLECodec::Encode(const TBuffer &r_t_src, TBuffer &r_t_dest, TBuffer &r_t_dest_runs)
{
	r_t_dest.Resize(r_t_dest.n_Capacity());
	uint8_t *p_output = r_t_dest.p_Data(),
		*p_out_end = r_t_dest.p_Data() + r_t_dest.n_Size();
	// output buffer (size will change dynamically)

#ifdef __MODIFIED_RLE_USE_RLE_EXP
	TExpEncoding<> exp_encode;

	r_t_dest_runs.Resize(0);
	CBitEncoder bit_coder(r_t_dest_runs);
#else // __MODIFIED_RLE_USE_RLE_EXP
	r_t_dest_runs.Resize(r_t_dest_runs.n_Capacity());
	uint8_t *p_rl_out = r_t_dest_runs.p_Data(),
		*p_rl_out_end = r_t_dest_runs.p_Data() + r_t_dest_runs.n_Size();
	// output buffer (size will change dynamically)
#endif // __MODIFIED_RLE_USE_RLE_EXP

	for(const uint8_t *p_src = r_t_src.p_Data(),
	   *p_end = r_t_src.p_Data() + r_t_src.n_Size(); p_src != p_end;) {
		int n_uncompressed_size = p_end - p_src, n_compressed_size = 0;
		if(p_src + min_RunLength <= p_end) {
			for(const uint8_t *p_src2 = p_src, *p_end2 = p_end - min_RunLength + 1;
			   p_src2 != p_end2; ++ p_src2) {
				if(*p_src2 == p_src2[1]) {
					uint8_t n_byte = *p_src2;
					int n_run_length = 1;
					while(p_src2 + n_run_length != p_end && p_src2[n_run_length] == n_byte)
						++ n_run_length;
					if(n_run_length >= min_RunLength) {
						n_compressed_size = n_run_length;
						n_uncompressed_size = p_src2 - p_src;
						break;
					}
				}
			}
		}
		// get size of uncompressed data, preceeding compressed data,
		// get size of compressed data as well

		int n_compressed_run_byte_num;
		{
			int n_add_rl = 0;
			int n_add = n_uncompressed_size;
			if(n_compressed_size) {
#ifdef __MODIFIED_RLE_USE_RLE_EXP
				_ASSERTE(n_compressed_size >= min_RunLength);

				n_compressed_run_byte_num = exp_encode.n_Encode_Byte_Num(n_compressed_size);
				// use table to determine number of bytes

				n_add += min_RunLength + n_compressed_run_byte_num - TExpEncoding<>::n_min_width;
				n_add_rl = n_compressed_run_byte_num;
#else // __MODIFIED_RLE_USE_RLE_EXP
				n_compressed_run_byte_num = ((n_compressed_size + n_max_repeats - 1) / n_max_repeats);
				n_add += min_RunLength * n_compressed_run_byte_num;
				n_add_rl = n_compressed_run_byte_num;
#endif // __MODIFIED_RLE_USE_RLE_EXP
			}
			if(p_output + n_add > p_out_end) {
				size_t n_off = p_output - r_t_dest.p_Data();
				if(!r_t_dest.Grow(n_add) || !r_t_dest.Resize(r_t_dest.n_Capacity()))
					return false;
				p_output = r_t_dest.p_Data() + n_off;
				p_out_end = r_t_dest.p_Data() + r_t_dest.n_Size();
				_ASSERTE(p_output + n_add <= p_out_end);
			}
#ifndef __MODIFIED_RLE_USE_RLE_EXP
			if(p_rl_out + n_add_rl > p_rl_out_end) { // allocates when encoding
				size_t n_off = p_rl_out - r_t_dest_runs.p_Data();
				if(!r_t_dest_runs.Grow(n_add_rl) || !r_t_dest_runs.Resize(r_t_dest_runs.n_Capacity()))
					return false;
				p_rl_out = r_t_dest_runs.p_Data() + n_off;
				p_rl_out_end = r_t_dest_runs.p_Data() + r_t_dest_runs.n_Size();
				_ASSERTE(p_rl_out + n_add_rl <= p_rl_out_end);
			}
#endif // !__MODIFIED_RLE_USE_RLE_EXP
		}
		// make sure there's enough space

		memcpy(p_output, p_src, n_uncompressed_size * sizeof(uint8_t));
		p_output += n_uncompressed_size;
		p_src += n_uncompressed_size;
		_ASSERTE(p_output <= p_out_end);
		// write uncompressed part

#ifdef __MODIFIED_RLE_USE_RLE_EXP
		if(n_compressed_size) {
			for(int i = 0; i < min_RunLength + n_compressed_run_byte_num -
			   TExpEncoding<>::n_min_width; ++ i, ++ p_output)
				*p_output = *p_src;
			p_src += n_compressed_size;
			_ASSERTE(p_output <= p_out_end); // make sure we didn't overwrite
			// write output data

			n_compressed_size = exp_encode.n_Adjust_Size(n_compressed_size, n_compressed_run_byte_num);
			// subtract minimal n_compressed_size for a given n_compressed_run_byte_num

			// t_odo - implement true RLE exp (or check if this gives the same results)
			// note probably not, the number of n_compressed_run_byte_num written to p_rl_out
			// may be too high (although the bytes are nulls)

			// note that the bytes written to the buffer do not compress all that well,
			// maybe writing packed bits results in a smaller run length stream which
			// also does not compress well, but is smaller

			int n_rle_bit_num = TExpEncoding<>::n_bit_granularity * n_compressed_run_byte_num;
			if(!bit_coder.Encode_Symbol(n_compressed_size, n_rle_bit_num)) // use a nice function
				return false;
			// write run length
		}
#else // __MODIFIED_RLE_USE_RLE_EXP
		while(n_compressed_size) {
			int n_write = min(n_compressed_size, n_max_repeats);

			if(n_compressed_size > n_write &&
			   n_compressed_size - min_RunLength < n_write)
				n_write = n_compressed_size - min_RunLength;
			_ASSERTE(n_compressed_size == n_write ||
				n_compressed_size - n_write >= min_RunLength);
			// in case the next run would be encoded as negative, shrink this one

			_ASSERTE(n_write >= min_RunLength); // make sure it won't underflow
			_ASSERTE((n_write - min_RunLength) < 256); // make sure it won't overflow

			*p_rl_out ++ = n_write - min_RunLength;

			for(int i = 0; i < min_RunLength; ++ i)
				*p_output ++ = *p_src;
			// write run value, min_RunLength times

			n_compressed_size -= n_write;
			p_src += n_write;

			_ASSERTE(p_output <= p_out_end);
			_ASSERTE(p_rl_out <= p_rl_out_end);
		}
#endif // __MODIFIED_RLE_USE_RLE_EXP
		// write compressed part
	}
	// RLE loop

#ifdef __MODIFIED_RLE_USE_RLE_EXP
	if(!bit_coder.Flush())
		return false;
	// write the last byte, if not empty
#else // __MODIFIED_RLE_USE_RLE_EXP
	_ASSERTE(p_rl_out <= p_rl_out_end);
	r_t_dest_runs.Resize(uint32_t(p_rl_out - r_t_dest_runs.p_Data()));
#endif // __MODIFIED_RLE_USE_RLE_EXP

	_ASSERTE(p_output <= p_out_end);
	r_t_dest.Resize(uint32_t(p_output - r_t_dest.p_Data()));
	// shrink the buffers to their final lengths

	return true;
}

/*
 *								=== ~CModifiedRLECodec ===
 */

/*
 *								=== CHuffmanCodec ===
 */

// utility Huffman bitstream function (reused by another classes below as well);
// appends n_bit_num bits of n_value to the bit stream, the state of which is kept
// in r_n_byte / r_n_bit_num, the output goes to r_t_out_buffer via r_p_output /
// r_p_out_end; returns the result of CBitCoder<int>::Encode_Symbol()
static inline bool Encode_Symbol(int n_value, int n_bit_num,
	uint8_t &r_n_byte, int &r_n_bit_num, uint8_t *&r_p_output,
	uint8_t *&r_p_out_end, TBuffer &r_t_out_buffer)
{
	int n_free_bit_num = 8 - r_n_bit_num;
	// the other function has the opposite bit counter,
	// convert the counter before the call

	const bool b_encoded = CBitCoder<int>::Encode_Symbol(n_value, n_bit_num,
		r_n_byte, n_free_bit_num, r_p_output, r_p_out_end, r_t_out_buffer);
	// the former bit-by-bit loop was replaced by the shared implementation

	r_n_bit_num = 8 - n_free_bit_num;
	// convert the counter back (also in case the encoding failed)

	return b_encoded;
}

// t_odo - separate Huffman tree from Huffman codec (template it?)
// t_odo - create second version of Huffman codec for encoding runs of zeroes
// (builds tree for symbols and tree for encoding length of runs in case symbol is a zero)

/*
 *	static bool CHuffmanCodec::Decode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer)
 *		- decodes data from r_t_in_buffer, outputs to r_t_out_buffer (can be empty)
 *		- returns true on success, false on failure
 */
bool CHuffmanCodec::Decode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer)
{
	const uint8_t *p_end = CHuffmanUtil<CHuff8::_TySymbol,
		max_CodeBitNum>::p_Decode(r_t_in_buffer, 0, r_t_out_buffer);
	return p_end && p_end == r_t_in_buffer.p_Data() + r_t_in_buffer.n_Size();
	// reuse the templated code, it is binary compatible with the code below
}

/*
 *	static bool CHuffmanCodec::Encode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer)
 *		- encodes data from r_t_in_buffer, outputs to r_t_out_buffer (can be empty)
 *		- returns true on success, false on failure
 */
bool CHuffmanCodec::Encode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer)
{
	return CHuffmanUtil<CHuff8::_TySymbol, max_CodeBitNum>::Encode(r_t_in_buffer, r_t_out_buffer);
	// reuse the templated code, it is binary compatible with the old code
}

/*
 *								=== ~CHuffmanCodec ===
 */

/*
 *								=== CRLE0_HuffmanCodec ===
 */

/*
 *	static bool CRLE0_HuffmanCodec::Decode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer)
 *		- decodes data from r_t_in_buffer, outputs to r_t_out_buffer
 *		- the input starts with uint32_t size of the decoded data, followed
 *		  by the Huffman table of the symbols, the Huffman table of the zero
 *		  run lengths and by the bit stream
 *		- returns true on success, false on failure (input too short, bad
 *		  tables, damaged bit stream or a zero run that does not fit the output)
 */
bool CRLE0_HuffmanCodec::Decode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer)
{
	if(r_t_in_buffer.n_Size() < sizeof(uint32_t))
		return false;
	// least possible size of input

	const uint8_t *p_input = r_t_in_buffer.p_Data();
	const uint8_t *p_src_end = r_t_in_buffer.p_Data() + r_t_in_buffer.n_Size();
	// input buffer pointer

	uint32_t n_unpack_length;
	memcpy(&n_unpack_length, p_input, sizeof(uint32_t));
	p_input += sizeof(uint32_t);
	// get size of uncompressed data (copy the bytes rather than
	// casting away const and reading through uint32_t pointer)

	CHuffmanUtil<CHuff8::_TySymbol, CHuff8::max_CodeBitNum>::CDecodeTable huff_table(p_input, p_src_end);
	if(!huff_table.Initialize())
		return false;
	p_input = huff_table.p_Pointer();
	CHuffmanUtil<CHuff16::_TySymbol, CHuff16::max_CodeBitNum>::CDecodeTable huff_rl_table(p_input, p_src_end);
	if(!huff_rl_table.Initialize())
		return false;
	p_input = huff_rl_table.p_Pointer();
	// read the symbol table and the run length table

	if(!r_t_out_buffer.Resize(n_unpack_length, false))
		return false;
	// alloc output buffer

	uint8_t n_byte = 0;
	int n_bit_num = 0;
	const uint8_t *p_dest_end = r_t_out_buffer.p_Data() + r_t_out_buffer.n_Size();
	for(uint8_t *p_dest = r_t_out_buffer.p_Data(); p_dest != p_dest_end;) {
		if(!huff_table.Decode_Symbol(*p_dest, n_byte, n_bit_num, p_input, p_src_end))
			return false;
		// decode a single symbol using one of the tables

		if(!*p_dest) {
			uint16_t n_run_length;
			if(!huff_rl_table.Decode_Symbol(n_run_length, n_byte, n_bit_num, p_input, p_src_end))
				return false;
			// decode run length from the second Huffman tree

			if(!n_run_length || n_run_length > size_t(p_dest_end - p_dest))
				return false;
			// the encoder never writes an empty run and the run must fit in the
			// remaining output; otherwise the data are damaged and memset() below
			// would write behind the end of the output buffer

			memset(p_dest, 0, n_run_length * sizeof(uint8_t)); // decompress zero run
			p_dest += n_run_length;
		} else
			++ p_dest;
	}
	// decode data

	_ASSERTE(p_input == p_src_end);
	// make sure we've read the whole input buffer

	return true;
}

bool CRLE0_HuffmanCodec::Encode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer)
{
	std::vector<CHuff8::TFrequency> symbol_freq;
	std::vector<CHuff16::TFrequency> run_length_freq;

	{
		if(!stl_ut::Resize_To_N(symbol_freq, 1 << (8 * sizeof(uint8_t)),
		   CHuff8::TFrequency(0)))
			return false;
		for(size_t i = 0, n = symbol_freq.size(); i < n; ++ i)
			symbol_freq[i].n_symbol = i;
		// alloc symbol frequencies

		std::map<uint16_t, size_t> run_length_set;

		const uint8_t *p_src = r_t_in_buffer.p_Data(),
			*p_end = r_t_in_buffer.p_Data() + r_t_in_buffer.n_Size();
		for(; p_src != p_end; ++ p_src) {
			++ symbol_freq[*p_src].n_frequency;
			_ASSERTE(symbol_freq[*p_src].n_frequency > 0);
			// increment symbol frequency

			if(!*p_src) {
				const uint8_t *p_last_zero = p_src + 1;
				while(p_last_zero != p_end && !*(p_last_zero))
					++ p_last_zero;
				_ASSERTE(!*(p_last_zero - 1) && (p_last_zero == p_end || *p_last_zero));
				// find the last zero

				size_t n_zero_run_length = p_last_zero - p_src;
				// calculate number of zeros

				try {
					while(n_zero_run_length) {
						uint16_t n_run_length = uint16_t(min(n_zero_run_length, size_t(UINT16_MAX)));
						n_zero_run_length -= n_run_length;
						// decompose the zero run to up to 65536 repeats

						std::map<uint16_t, size_t>::iterator p_freq_it = run_length_set.find(n_run_length);
						if(p_freq_it != run_length_set.end())
							++ (*p_freq_it).second;
						else
							run_length_set[n_run_length] = 1;
						// increment run length frequency

						if(n_zero_run_length) {
							++ symbol_freq[0].n_frequency;
							_ASSERTE(symbol_freq[0].n_frequency > 0);
						}
						// in case the run of zeroes is saved as multiple chunks,
						// increase frequency of the zero symbol as well
					}
				} catch(std::bad_alloc&) {
					return false;
				}
				// accumulate zero run length frequencies

				p_src = p_last_zero - 1;
				// shift behind the zero run
			}
		}
		// calculate frequencies

		if(!stl_ut::Resize_To_N(run_length_freq, run_length_set.size(),
		   CHuff16::TFrequency(0)))
			return false;
		std::vector<CHuff16::TFrequency>::iterator p_freq_it =
			run_length_freq.begin();
		for(std::map<uint16_t, size_t>::const_iterator p_rlf_it = run_length_set.begin(),
		   p_end_it = run_length_set.end(); p_rlf_it != p_end_it; ++ p_rlf_it, ++ p_freq_it) {
			uint16_t n_run_length = (*p_rlf_it).first;
			size_t n_frequency = (*p_rlf_it).second;
			*p_freq_it = CHuff16::TFrequency(n_run_length, n_frequency);
		}
		// copy set of frequencies to the list of frequecies
	}
	// calculate symbol frequencies

	CHuff8 huff_tree;
	CHuff16 huff_rl_tree;
	huff_tree.Use_SymbolFrequencies(symbol_freq, true);
	huff_rl_tree.Use_SymbolFrequencies(run_length_freq, true);
	// use the calculated frequencies, note that both are sorted

	if(!huff_tree.Assign_CodeWords() || !huff_rl_tree.Assign_CodeWords())
		return false;
	// assign codewords (sorts symbol by frequency)

	CHuffmanUtil<CHuff8::_TySymbol, CHuff8::max_CodeBitNum>::CEncodeTable huff_table(huff_tree);
	CHuffmanUtil<CHuff16::_TySymbol, CHuff16::max_CodeBitNum>::CEncodeTable huff_rl_table(huff_rl_tree);
	if(!r_t_out_buffer.Resize(sizeof(uint32_t) +
	   huff_table.n_Table_Size() + huff_rl_table.n_Table_Size(), false) || // prealloc size of all the tables
	   !r_t_out_buffer.Resize(sizeof(uint32_t), false) || // resize to a single uint32_t (decoded buffer size)
	   !huff_table.Write_Table(r_t_out_buffer) || // append the first table
	   !huff_rl_table.Write_Table(r_t_out_buffer)) // append the second table
		return false;
	*(uint32_t*)r_t_out_buffer.p_Data() = uint32_t(r_t_in_buffer.n_Size());
	// write code tables

	huff_tree.SortFrequencies_BySymbol();
	huff_rl_tree.SortFrequencies_BySymbol();
	// prepare for encoding

	{
		CBitEncoder coder(r_t_out_buffer); // append

		const uint8_t *p_src = r_t_in_buffer.p_Data(),
			*p_src_end = r_t_in_buffer.p_Data() + r_t_in_buffer.n_Size();
		for(; p_src != p_src_end; ++ p_src) {
			_ASSERTE(p_src < p_src_end);
			const CHuff8::TFrequency &r_sym = huff_tree.r_LookupSymbol(*p_src);
			_ASSERTE(r_sym.n_symbol == *p_src);
			// find symbol in Huffman tree

			if(!coder.Encode_Symbol(r_sym.n_code_word, r_sym.n_code_length))
				return false;
			// write symbol as series of bits

			if(!*p_src) {
				const uint8_t *p_last_zero = p_src + 1;
				while(p_last_zero != p_src_end && !*(p_last_zero))
					++ p_last_zero;
				_ASSERTE(!*(p_last_zero - 1) && (p_last_zero == p_src_end || *p_last_zero));
				// find the last zero

				size_t n_zero_run_length = p_last_zero - p_src;
				// calculate number of zeros

				for(;;) {
					uint16_t n_run_length = uint16_t(min(n_zero_run_length, size_t(UINT16_MAX)));
					n_zero_run_length -= n_run_length;
					// decompose the zero run to up to 65536 repeats

					const CHuff16::TFrequency &r_freq0 = huff_rl_tree.r_LookupSymbol(n_run_length);
					_ASSERTE(r_freq0.n_symbol == n_run_length);
					// find run length in the second Huffman tree

					if(!coder.Encode_Symbol(r_freq0.n_code_word, r_freq0.n_code_length))
						return false;
					// write symbol as series of bits

					if(n_run_length == UINT16_MAX) {
						if(!coder.Encode_Symbol(r_sym.n_code_word, r_sym.n_code_length))
							return false;
					} else {
						_ASSERTE(!n_zero_run_length);
						break; // finished
					}
					// in case the zero run length is decomposed,
					// we need to write another zero symbol first
				}

				p_src = p_last_zero - 1;
			}
			_ASSERTE(p_src < p_src_end);
			// in case the symbol was zero, encode run length and skip the occurences of the symbol
		}

		if(!coder.Flush())
			return false;
	}
	// compress

	return true;
}

/*
 *								=== ~CRLE0_HuffmanCodec ===
 */

/*
 *								=== CRLE0_HuffmanCodec_1 ===
 */

/*
 *	static bool CRLE0_HuffmanCodec_1::Decode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer)
 *		- decodes data from r_t_in_buffer, outputs to r_t_out_buffer
 *		- the input starts with uint32_t size of the decoded data, followed
 *		  by the Huffman tables (symbols, zero run lengths, symbols following
 *		  a zero run and, if run_Chaining is 3, the chained run lengths)
 *		  and by the bit stream
 *		- returns true on success, false on failure (input too short, bad
 *		  tables, damaged bit stream or a zero run that does not fit the output)
 */
bool CRLE0_HuffmanCodec_1::Decode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer)
{
	if(r_t_in_buffer.n_Size() < sizeof(uint32_t))
		return false;
	// least possible size of input

	const uint8_t *p_input = r_t_in_buffer.p_Data();
	const uint8_t *p_src_end = r_t_in_buffer.p_Data() + r_t_in_buffer.n_Size();
	// input buffer pointer

	uint32_t n_unpack_length;
	memcpy(&n_unpack_length, p_input, sizeof(uint32_t));
	p_input += sizeof(uint32_t);
	// get size of uncompressed data (copy the bytes rather than
	// casting away const and reading through uint32_t pointer)

	CHuffmanUtil<CHuff8::_TySymbol, CHuff8::max_CodeBitNum>::CDecodeTable huff_table(p_input, p_src_end);
	if(!huff_table.Initialize())
		return false;
	p_input = huff_table.p_Pointer();
	CHuffmanUtil<CHuffRL::_TySymbol, CHuffRL::max_CodeBitNum>::CDecodeTable huff_rl_table(p_input, p_src_end);
	if(!huff_rl_table.Initialize())
		return false;
	p_input = huff_rl_table.p_Pointer();
	CHuffmanUtil<CHuff8::_TySymbol, CHuff8::max_CodeBitNum>::CDecodeTable huff_0_table(p_input, p_src_end);
	if(!huff_0_table.Initialize())
		return false;
	p_input = huff_0_table.p_Pointer();
	CHuffmanUtil<CHuffRL::_TySymbol, CHuffRL::max_CodeBitNum>::CDecodeTable huff_rl_table2(p_input, p_src_end);
	if(run_Chaining == 3) {
		if(!huff_rl_table2.Initialize())
			return false;
		p_input = huff_rl_table2.p_Pointer();
	}
	// read the tables, the fourth one is only present with run_Chaining 3

	if(!r_t_out_buffer.Resize(n_unpack_length, false))
		return false;
	// alloc output buffer

	uint8_t n_byte = 0;
	int n_bit_num = 0;
	bool b_had_zero_run = false;
	const uint8_t *p_dest_end = r_t_out_buffer.p_Data() + r_t_out_buffer.n_Size();
	for(uint8_t *p_dest = r_t_out_buffer.p_Data(); p_dest != p_dest_end;) {
		if(!((b_had_zero_run)? huff_0_table : huff_table).Decode_Symbol(*p_dest,
		   n_byte, n_bit_num, p_input, p_src_end))
			return false;
		b_had_zero_run = false;
		// decode a single symbol using one of the tables

		if(!*p_dest) {
			b_had_zero_run = true;
			bool b_first = true;
			for(int n_accum_bit_num = 0;;) {
				CHuffRL::_TySymbol n_run_length;
				if(!((run_Chaining == 3 && !b_first)? huff_rl_table2 :
				   huff_rl_table).Decode_Symbol(n_run_length, n_byte, n_bit_num, p_input, p_src_end))
					return false;
				b_first = false;
				_ASSERTE(run_Chaining != 0 || n_run_length >= 1);
				// decode run length from one of the run length Huffman trees

				size_t n_to_fill;
				if(run_Chaining < 2 || !n_accum_bit_num)
					n_to_fill = size_t(n_run_length);
				else {
					_ASSERTE(run_Chaining == 2 || run_Chaining == 3); // compile-time constant
					if(n_accum_bit_num >= int(8 * sizeof(size_t)))
						return false;
					// the encoder never chains more run lengths than fit in size_t,
					// shifting by that many bits would be undefined
					n_to_fill = size_t(n_run_length) << n_accum_bit_num; // this is exponentially chained run length
				}
				// calculate the number of zeros to write

				if(n_to_fill > size_t(p_dest_end - p_dest))
					return false;
				// the run must fit in the remaining output, otherwise the data are damaged
				// and memset() below would write behind the end of the output buffer

				memset(p_dest, 0, n_to_fill * sizeof(uint8_t)); // decompress zero run
				p_dest += n_to_fill;

				if(run_Chaining == 0) // compile-time constant
					break; // no implicit chaining, there will be another null symbol encoded
				else if(run_Chaining == 1) { // compile-time constant
					if(n_run_length < CHuffRL::_TySymbol(-1))
						break;
					// simple additive chaining, if the run is of the maximum
					// representable length, a next run is expected
				} else {
					_ASSERTE(run_Chaining == 2 || run_Chaining == 3); // compile-time constant
					// exponential run chaining, if the following symbol is null,
					// another zero run is the high word of the real run length

					if(p_dest == p_dest_end)
						break;
					// no more runs

					uint8_t n_symbol;
					if(!huff_0_table.Decode_Symbol(n_symbol, n_byte, n_bit_num, p_input, p_src_end))
						return false;
					// decode another symbol

					if(n_symbol != 0) {
						*p_dest = n_symbol;
						++ p_dest;
						b_had_zero_run = false; // just decoded using huff_0_table, clear the flag
						break;
					}
					// handle "just the next symbol"

					n_accum_bit_num += 8 * sizeof(CHuffRL::_TySymbol);
					// there will be at least one more run length
				}
			}
		} else
			++ p_dest;
	}
	// decode data

	_ASSERTE(p_input == p_src_end);
	// make sure we've read the whole input buffer

	return true;
}

bool CRLE0_HuffmanCodec_1::Encode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer)
{
	std::vector<CHuff8::TFrequency> symbol_freq;
	std::vector<CHuffRL::TFrequency> run_length_freq;
	std::vector<CHuffRL::TFrequency> run_length_freq2;
	std::vector<CHuff8::TFrequency> symbol_after_zero_run_freq;

	{
		if(!stl_ut::Resize_To_N(symbol_freq, 1 << (8 * sizeof(uint8_t)),
		   CHuff8::TFrequency(0)) ||
		   !stl_ut::Resize_To_N(symbol_after_zero_run_freq, 1 << (8 * sizeof(uint8_t)),
		   CHuff8::TFrequency(0)))
			return false;
		for(size_t i = 0, n = symbol_freq.size(); i < n; ++ i) {
			symbol_freq[i].n_symbol = i;
			symbol_after_zero_run_freq[i].n_symbol = i;
		}
		// alloc symbol frequencies

		std::map<CHuffRL::_TySymbol, size_t> run_length_set, run_length_set2;

		const uint8_t *p_src = r_t_in_buffer.p_Data(),
			*p_end = r_t_in_buffer.p_Data() + r_t_in_buffer.n_Size();
		for(bool b_had_zero_run = false; p_src != p_end; ++ p_src) {
			if(!b_had_zero_run) {
				++ symbol_freq[*p_src].n_frequency;
				_ASSERTE(symbol_freq[*p_src].n_frequency > 0);
				// encodes all symbols, including the nulls
			} else {
				_ASSERTE(*p_src != 0); // there shouldn't be a 0 after a zero run
				b_had_zero_run = false; // clear the flag
				++ symbol_after_zero_run_freq[*p_src].n_frequency;
				_ASSERTE(symbol_after_zero_run_freq[*p_src].n_frequency > 0);
				// encodes symbols occuring after zero runs
			}
			// increment symbol frequency

			if(!*p_src) {
				const uint8_t *p_last_zero = p_src + 1;
				while(p_last_zero != p_end && !*(p_last_zero))
					++ p_last_zero;
				_ASSERTE(!*(p_last_zero - 1) && (p_last_zero == p_end || *p_last_zero));
				// find the last zero

				size_t n_zero_run_length = p_last_zero - p_src;
				// calculate number of zeros

				try {
					if(run_Chaining == 0 || run_Chaining == 1) { // compile-time constant
						while(n_zero_run_length) {
							CHuffRL::_TySymbol n_run_length = CHuffRL::_TySymbol(min(n_zero_run_length,
								size_t(CHuffRL::_TySymbol(-1))));
							n_zero_run_length -= n_run_length;
							// decompose the zero run to up to 65536 repeats

							_ASSERTE(run_length_set[n_run_length] < SIZE_MAX);
							++ run_length_set[n_run_length];
							// increment run length frequency

							if(run_Chaining == 0) { // compile-time constant
								if(n_zero_run_length) {
									++ symbol_after_zero_run_freq[0].n_frequency;
									_ASSERTE(symbol_after_zero_run_freq[0].n_frequency > 0);
								}
								// in case the run of zeroes is saved as multiple chunks,
								// increase frequency of the zero symbol as well
							} else {
								if(n_run_length == CHuffRL::_TySymbol(-1) && !n_zero_run_length) {
									_ASSERTE(run_length_set[0] < SIZE_MAX);
									++ run_length_set[0];
								}
								// in case the run was divided and the last run was UINT16_MAX,
								// write a dummy zero-length run for the decoder
							}
						}
					} else {
						_ASSERTE(run_Chaining == 2 || run_Chaining == 3); // compile-time constant
						bool b_first = true;
						while(n_zero_run_length) {
							CHuffRL::_TySymbol n_run_length = CHuffRL::_TySymbol(n_zero_run_length);
							n_zero_run_length >>= 8 * sizeof(CHuffRL::_TySymbol);
							// decompose the zero runs exponentially

							if(run_Chaining == 3 && !b_first) {
								_ASSERTE(run_length_set2[n_run_length] < SIZE_MAX);
								++ run_length_set2[n_run_length];
							} else {
								_ASSERTE(run_length_set[n_run_length] < SIZE_MAX);
								++ run_length_set[n_run_length];
							}
							// increment run length frequency

							if(n_zero_run_length) {
								++ symbol_after_zero_run_freq[0].n_frequency;
								_ASSERTE(symbol_after_zero_run_freq[0].n_frequency > 0);
							}
							// in case the run of zeroes is saved as multiple chunks,
							// increase frequency of the zero symbol as well
							
							b_first = false;
						}
					}
				} catch(std::bad_alloc&) {
					return false;
				}
				// accumulate zero run length frequencies

				p_src = p_last_zero - 1;
				// shift behind the zero run

				b_had_zero_run = true;
				// set the context flag
			}
		}
		// calculate frequencies

		if(!stl_ut::Resize_To_N(run_length_freq, run_length_set.size(),
		   CHuffRL::TFrequency(0)))
			return false;
		std::vector<CHuffRL::TFrequency>::iterator p_freq_it =
			run_length_freq.begin();
		for(std::map<CHuffRL::_TySymbol, size_t>::const_iterator p_rlf_it = run_length_set.begin(),
		   p_end_it = run_length_set.end(); p_rlf_it != p_end_it; ++ p_rlf_it, ++ p_freq_it) {
			CHuffRL::_TySymbol n_run_length = (*p_rlf_it).first;
			size_t n_frequency = (*p_rlf_it).second;
			*p_freq_it = CHuffRL::TFrequency(n_run_length, n_frequency);
		}
		// copy set of frequencies to the list of frequecies

		if(run_Chaining == 3) {
			if(!stl_ut::Resize_To_N(run_length_freq2, run_length_set2.size(),
			   CHuffRL::TFrequency(0)))
				return false;
			std::vector<CHuffRL::TFrequency>::iterator p_freq_it =
				run_length_freq2.begin();
			for(std::map<CHuffRL::_TySymbol, size_t>::const_iterator p_rlf_it = run_length_set2.begin(),
			   p_end_it = run_length_set2.end(); p_rlf_it != p_end_it; ++ p_rlf_it, ++ p_freq_it) {
				CHuffRL::_TySymbol n_run_length = (*p_rlf_it).first;
				size_t n_frequency = (*p_rlf_it).second;
				*p_freq_it = CHuffRL::TFrequency(n_run_length, n_frequency);
			}
		}
		// copy the second set of frequencies to the second list of frequecies
	}
	// calculate symbol frequencies

	CHuff8 huff_tree, huff_0_tree;
	CHuffRL huff_rl_tree, huff_rl_tree2;
	huff_tree.Use_SymbolFrequencies(symbol_freq, true);
	huff_0_tree.Use_SymbolFrequencies(symbol_after_zero_run_freq, true);
	huff_rl_tree.Use_SymbolFrequencies(run_length_freq, true);
	if(run_Chaining == 3)
		huff_rl_tree2.Use_SymbolFrequencies(run_length_freq2, true);
	// use the calculated frequencies, note that both are sorted

	if(!huff_tree.Assign_CodeWords() || !huff_rl_tree.Assign_CodeWords() ||
	   !huff_0_tree.Assign_CodeWords() || (run_Chaining == 3 &&
	   !huff_rl_tree2.Assign_CodeWords()))
		return false;
	// assign codewords (sorts symbol by frequency)

	CHuffmanUtil<CHuff8::_TySymbol, CHuff8::max_CodeBitNum>::CEncodeTable huff_table(huff_tree);
	CHuffmanUtil<CHuffRL::_TySymbol, CHuffRL::max_CodeBitNum>::CEncodeTable huff_rl_table(huff_rl_tree);
	CHuffmanUtil<CHuff8::_TySymbol, CHuff8::max_CodeBitNum>::CEncodeTable huff_0_table(huff_0_tree);
	CHuffmanUtil<CHuffRL::_TySymbol, CHuffRL::max_CodeBitNum>::CEncodeTable huff_rl_table2(huff_rl_tree2);
	if(run_Chaining == 3) {
		if(!r_t_out_buffer.Resize(sizeof(uint32_t) + huff_table.n_Table_Size() +
		   huff_rl_table.n_Table_Size() + huff_0_table.n_Table_Size() +
		   huff_rl_table2.n_Table_Size(), false) || // prealloc size of all the tables
		   !r_t_out_buffer.Resize(sizeof(uint32_t), false) || // resize to a single uint32_t (decoded buffer size)
		   !huff_table.Write_Table(r_t_out_buffer) || // append the first table
		   !huff_rl_table.Write_Table(r_t_out_buffer) || // append the second table
		   !huff_0_table.Write_Table(r_t_out_buffer) || // append the third table
		   !huff_rl_table2.Write_Table(r_t_out_buffer)) // append the fourth table
			return false;
	} else {
		if(!r_t_out_buffer.Resize(sizeof(uint32_t) + huff_table.n_Table_Size() +
		   huff_rl_table.n_Table_Size() + huff_0_table.n_Table_Size(), false) || // prealloc size of all the tables
		   !r_t_out_buffer.Resize(sizeof(uint32_t), false) || // resize to a single uint32_t (decoded buffer size)
		   !huff_table.Write_Table(r_t_out_buffer) || // append the first table
		   !huff_rl_table.Write_Table(r_t_out_buffer) || // append the second table
		   !huff_0_table.Write_Table(r_t_out_buffer)) // append the third table
			return false;
	}
	*(uint32_t*)r_t_out_buffer.p_Data() = uint32_t(r_t_in_buffer.n_Size());
	// write code tables

	huff_tree.SortFrequencies_BySymbol();
	huff_0_tree.SortFrequencies_BySymbol();
	huff_rl_tree.SortFrequencies_BySymbol();
	if(run_Chaining == 3)
		huff_rl_tree2.SortFrequencies_BySymbol();
	// prepare for encoding

	{
		CBitEncoder coder(r_t_out_buffer); // append

		const uint8_t *p_src = r_t_in_buffer.p_Data(),
			*p_src_end = r_t_in_buffer.p_Data() + r_t_in_buffer.n_Size();
		for(bool b_had_zero_run = false; p_src != p_src_end; ++ p_src) {
			_ASSERTE(p_src < p_src_end);
			const CHuff8::TFrequency &r_sym = ((b_had_zero_run)?
				huff_0_tree : huff_tree).r_LookupSymbol(*p_src);
			_ASSERTE(r_sym.n_symbol == *p_src);
			// find symbol in Huffman tree

			if(!coder.Encode_Symbol(r_sym.n_code_word, r_sym.n_code_length))
				return false;
			// write symbol as series of bits

			if((b_had_zero_run = !*p_src)) {
				const uint8_t *p_last_zero = p_src + 1;
				while(p_last_zero != p_src_end && !*(p_last_zero))
					++ p_last_zero;
				_ASSERTE(!*(p_last_zero - 1) && (p_last_zero == p_src_end || *p_last_zero));
				// find the last zero

				size_t n_zero_run_length = p_last_zero - p_src;
				// calculate number of zeros

				if(run_Chaining == 0 || run_Chaining == 1) { // compile-time constant
					while(n_zero_run_length) {
						CHuffRL::_TySymbol n_run_length = CHuffRL::_TySymbol(min(n_zero_run_length,
							size_t(CHuffRL::_TySymbol(-1))));
						n_zero_run_length -= n_run_length;
						// decompose the zero run to up to 65536 repeats

						const CHuffRL::TFrequency &r_freq0 = huff_rl_tree.r_LookupSymbol(n_run_length);
						_ASSERTE(r_freq0.n_symbol == n_run_length);
						// find run length in the second Huffman tree

						if(!coder.Encode_Symbol(r_freq0.n_code_word, r_freq0.n_code_length))
							return false;
						// write symbol as series of bits

						if(run_Chaining == 0) { // compile-time constant
							if(n_zero_run_length) {
								const CHuff8::TFrequency &r_sym0 = huff_0_tree.r_LookupSymbol(0);
								_ASSERTE(r_sym0.n_symbol == 0);
								if(!coder.Encode_Symbol(r_sym0.n_code_word, r_sym0.n_code_length))
									return false;
							}
							// in case the run of zeroes is saved as multiple chunks,
							// increase frequency of the zero symbol as well
						} else {
							if(n_run_length == CHuffRL::_TySymbol(-1) && !n_zero_run_length) {
								const CHuffRL::TFrequency &r_freq0 = huff_rl_tree.r_LookupSymbol(0);
								_ASSERTE(r_freq0.n_symbol == 0);
								// find run length in the second Huffman tree

								if(!coder.Encode_Symbol(r_freq0.n_code_word, r_freq0.n_code_length))
									return false;
								// write symbol as series of bits
							}
							// in case the run was divided and the last run was UINT16_MAX,
							// write a dummy zero-length run for the decoder
						}
					}
				} else {
					_ASSERTE(run_Chaining == 2 || run_Chaining == 3); // compile-time constant
					bool b_first = true;
					while(n_zero_run_length) {
						CHuffRL::_TySymbol n_run_length = CHuffRL::_TySymbol(n_zero_run_length);
						n_zero_run_length >>= 8 * sizeof(CHuffRL::_TySymbol);
						// decompose the zero runs exponentially

						const CHuffRL::TFrequency &r_freq0 = ((run_Chaining == 3 && !b_first)?
							huff_rl_tree2 : huff_rl_tree).r_LookupSymbol(n_run_length);
						_ASSERTE(r_freq0.n_symbol == n_run_length);
						// find run length in the second Huffman tree

						if(!coder.Encode_Symbol(r_freq0.n_code_word, r_freq0.n_code_length))
							return false;
						// write symbol as series of bits

						if(n_zero_run_length) {
							const CHuff8::TFrequency &r_sym0 = huff_0_tree.r_LookupSymbol(0);
							_ASSERTE(r_sym0.n_symbol == 0);
							if(!coder.Encode_Symbol(r_sym0.n_code_word, r_sym0.n_code_length))
								return false;
						}
						// in case the run of zeroes is saved as multiple chunks,
						// increase frequency of the zero symbol as well
						
						b_first = false;
					}
				}

				p_src = p_last_zero - 1;
			}
			_ASSERTE(p_src < p_src_end);
			// in case the symbol was zero, encode run length and skip the occurences of the symbol
		}

		if(!coder.Flush())
			return false;
	}
	// compress

	return true;
}

/*
 *								=== ~CRLE0_HuffmanCodec_1 ===
 */

/*
 *								=== CRLE0_HuffmanCodec_2 ===
 */

/**
 *	@brief decodes data in the format written by CRLE0_HuffmanCodec_2::Encode()
 *
 *	The input starts with a 32-bit size of the decoded data, followed by the Huffman table
 *	for ordinary symbols, the Huffman table for symbols following a zero run and (only if
 *	use_LengthCoding is set) the Huffman table for run length bit widths. The bit stream
 *	follows the tables.
 *
 *	@param[in] r_t_in_buffer is the encoded input
 *	@param[out] r_t_out_buffer is the output buffer (resized to the decoded size)
 *
 *	@return Returns true on success, false on failure (input too short, a table or a symbol
 *		could not be decoded, a zero run would not fit in the output, or the output
 *		buffer could not be resized).
 */
bool CRLE0_HuffmanCodec_2::Decode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer)
{
	if(r_t_in_buffer.n_Size() < sizeof(uint32_t))
		return false;
	// least possible size of input

	const uint8_t *p_input = r_t_in_buffer.p_Data();
	const uint8_t *p_src_end = r_t_in_buffer.p_Data() + r_t_in_buffer.n_Size();
	// input buffer pointer

	uint32_t n_unpack_length = *(const uint32_t*)p_input;
	p_input += sizeof(uint32_t);
	// get size of uncompressed data

	CHuffmanUtil<CHuff8::_TySymbol, CHuff8::max_CodeBitNum>::CDecodeTable huff_table(p_input, p_src_end);
	if(!huff_table.Initialize())
		return false;
	p_input = huff_table.p_Pointer();
	CHuffmanUtil<CHuff8::_TySymbol, CHuff8::max_CodeBitNum>::CDecodeTable huff_0_table(p_input, p_src_end);
	if(!huff_0_table.Initialize())
		return false;
	p_input = huff_0_table.p_Pointer();
	CHuffmanUtil<CHuff8::_TySymbol, CHuff8::max_CodeBitNum>::CDecodeTable huff_len_table(p_input, p_src_end);
	if(use_LengthCoding) {
		if(!huff_len_table.Initialize())
			return false;
		p_input = huff_len_table.p_Pointer();
	}
	// read the tables, in the same order the encoder writes them

	if(!r_t_out_buffer.Resize(n_unpack_length, false))
		return false;
	// alloc output buffer

	uint8_t n_byte = 0;
	int n_bit_num = 0;
	bool b_had_zero_run = false;
	const uint8_t *p_dest_end = r_t_out_buffer.p_Data() + r_t_out_buffer.n_Size();
	for(uint8_t *p_dest = r_t_out_buffer.p_Data(); p_dest != p_dest_end;) {
		if(!((b_had_zero_run)? huff_0_table : huff_table).Decode_Symbol(*p_dest,
		   n_byte, n_bit_num, p_input, p_src_end))
			return false;
		b_had_zero_run = false;
		// decode a single symbol using one of the tables

		if(!*p_dest) {
			const size_t n_max_run_length = size_t(p_dest_end - p_dest);
			// the zero run must fit in the remaining output space (the loop condition
			// guarantees that there is at least one byte left)

			size_t n_run_length = 0;
			if(use_LengthCoding) {
				for(;;) {
					uint8_t n_rl_bit_num;
					if(!huff_len_table.Decode_Symbol(n_rl_bit_num, n_byte, n_bit_num, p_input, p_src_end))
						return false;
					if(n_rl_bit_num >= 8 * sizeof(uint32_t))
						return false;
					// the symbol is bit width minus one, widths above 32 bits are
					// not valid (and 255 would wrap around to zero below)
					++ n_rl_bit_num;
					// decode number of bits

					uint32_t n_run_length_imm = 0;
					if(!CBitCoder<uint32_t>::Decode_Symbol(n_run_length_imm, n_rl_bit_num,
					   n_byte, n_bit_num, p_input, p_src_end))
						return false;
					// decode run length with that number of bits

					if(n_run_length_imm > n_max_run_length - n_run_length)
						return false;
					// corrupt input; the run would overflow the output buffer
					// (this also keeps the accumulator from overflowing)

					n_run_length += n_run_length_imm;
					if(n_run_length_imm < UINT32_MAX)
						break;
					// accumulate until run of less than UINT32_MAX
					// zeroes (used for chaining longer runs)
				}
			} else {
				uint16_t n_run_length_imm = 0;
				if(!CBitCoder<uint16_t>::Decode_Symbol(n_run_length_imm, 16,
				   n_byte, n_bit_num, p_input, p_src_end))
					return false;
				if(n_run_length_imm > n_max_run_length)
					return false;
				// corrupt input; the run would overflow the output buffer

				n_run_length = n_run_length_imm;
				// no chaining, if the run is decomposed, there will be another 0 symbol
			}
			_ASSERTE(n_run_length >= 1);
			_ASSERTE(n_run_length <= n_max_run_length);
			memset(p_dest, 0, n_run_length * sizeof(uint8_t)); // decompress zero run
			p_dest += n_run_length;
			// fill run length with zeroes

			b_had_zero_run = true;
			// the next symbol is decoded using the "after zero run" table
		} else
			++ p_dest;
		// decode run length in case the symbol was zero, otherwise just advance
	}
	// decode data

	_ASSERTE(p_input == p_src_end);
	// make sure we've read the whole input buffer

	return true;
}

bool CRLE0_HuffmanCodec_2::Encode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer)
{
	std::vector<CHuff8::TFrequency> symbol_freq,
		symbol_after_zero_run_freq, symbol_rlesize_freq;

	{
		if(!stl_ut::Resize_To_N(symbol_freq, 1 << (8 * sizeof(uint8_t)),
		   CHuff8::TFrequency(0)) ||
		   !stl_ut::Resize_To_N(symbol_after_zero_run_freq, 1 << (8 * sizeof(uint8_t)),
		   CHuff8::TFrequency(0)))
			return false;
		for(size_t i = 0, n = symbol_freq.size(); i < n; ++ i) {
			symbol_freq[i].n_symbol = i;
			symbol_after_zero_run_freq[i].n_symbol = i;
		}
		if(use_LengthCoding) {
			if(!stl_ut::Resize_To_N(symbol_rlesize_freq, 32, CHuff8::TFrequency(0)))
				return false;
			for(size_t i = 0; i < 32; ++ i)
				symbol_rlesize_freq[i].n_symbol = i;
		}
		// alloc symbol frequencies

		const uint8_t *p_src = r_t_in_buffer.p_Data(),
			*p_end = r_t_in_buffer.p_Data() + r_t_in_buffer.n_Size();
		for(bool b_had_zero_run = false; p_src != p_end; ++ p_src) {
			if(!b_had_zero_run) {
				++ symbol_freq[*p_src].n_frequency;
				_ASSERTE(symbol_freq[*p_src].n_frequency < SIZE_MAX);
				// encodes all symbols, including the nulls
			} else {
				_ASSERTE(*p_src != 0); // there shouldn't be a 0 after a zero run
				b_had_zero_run = false; // clear the flag
				++ symbol_after_zero_run_freq[*p_src].n_frequency;
				_ASSERTE(symbol_after_zero_run_freq[*p_src].n_frequency < SIZE_MAX);
				// encodes symbols occuring after zero runs
			}
			// increment symbol frequency

			if(!*p_src) {
				const uint8_t *p_last_zero = p_src + 1;
				while(p_last_zero != p_end && !*(p_last_zero))
					++ p_last_zero;
				_ASSERTE(!*(p_last_zero - 1) && (p_last_zero == p_end || *p_last_zero));
				// find the last zero

				size_t n_zero_run_length = p_last_zero - p_src;
				// calculate number of zeros

				if(use_LengthCoding) {
					while(n_zero_run_length) {
						uint32_t n_run_length = (n_zero_run_length <= UINT32_MAX)?
							uint32_t(n_zero_run_length) : UINT32_MAX;
						n_zero_run_length -= n_run_length;
						// decompose the zero run to up to 2^32 - 1 repeats

						int n_bit_num = n_Bit_Width(n_run_length);
						_ASSERTE(n_bit_num > 0);
						_ASSERTE(symbol_rlesize_freq[n_bit_num - 1].n_frequency < SIZE_MAX);
						++ symbol_rlesize_freq[n_bit_num - 1].n_frequency;

						if(n_run_length == UINT32_MAX && !n_zero_run_length) {
							_ASSERTE(symbol_rlesize_freq[0].n_frequency < SIZE_MAX);
							++ symbol_rlesize_freq[0].n_frequency;
							// will write 1 bit
						}
						// in case the last run length was maximum, we need to write a dummy zero size run
					}
				} else {
					while(n_zero_run_length) {
						uint16_t n_run_length = uint16_t(min(n_zero_run_length, size_t(UINT16_MAX)));
						n_zero_run_length -= n_run_length;
						// decompose the zero run to up to 65536 repeats

						// now it is saved as raw 16 bits, no frequency is estimated here

						if(n_zero_run_length) {
							_ASSERTE(symbol_after_zero_run_freq[0].n_frequency < SIZE_MAX);
							++ symbol_after_zero_run_freq[0].n_frequency;
						}
						// in case the run of zeroes is saved as multiple chunks,
						// increase frequency of the zero symbol as well
					}
				}
				// accumulate zero run length frequencies

				p_src = p_last_zero - 1;
				// shift behind the zero run

				b_had_zero_run = true;
				// set the context flag
			}
		}
		// calculate frequencies
	}
	// calculate symbol frequencies

	CHuff8 huff_tree, huff_0_tree, huff_length_tree;
	huff_tree.Use_SymbolFrequencies(symbol_freq, true);
	huff_0_tree.Use_SymbolFrequencies(symbol_after_zero_run_freq, true);
	if(use_LengthCoding)
		huff_length_tree.Use_SymbolFrequencies(symbol_rlesize_freq, true);
	// use the calculated frequencies, note that both are sorted

	if(!huff_tree.Assign_CodeWords() || !huff_0_tree.Assign_CodeWords() ||
	   (use_LengthCoding && !huff_length_tree.Assign_CodeWords()))
		return false;
	// assign codewords (sorts symbol by frequency)

	CHuffmanUtil<CHuff8::_TySymbol, CHuff8::max_CodeBitNum>::CEncodeTable huff_table(huff_tree);
	CHuffmanUtil<CHuff8::_TySymbol, CHuff8::max_CodeBitNum>::CEncodeTable huff_0_table(huff_0_tree);
	if(use_LengthCoding) {
		CHuffmanUtil<CHuff8::_TySymbol, CHuff8::max_CodeBitNum>::CEncodeTable huff_len_table(huff_length_tree);
		if(!r_t_out_buffer.Resize(sizeof(uint32_t) + huff_table.n_Table_Size() +
		   huff_0_table.n_Table_Size() + huff_len_table.n_Table_Size(), false) || // prealloc size of all the tables
		   !r_t_out_buffer.Resize(sizeof(uint32_t), false) || // resize to a single uint32_t (decoded buffer size)
		   !huff_table.Write_Table(r_t_out_buffer) || // append the first table
		   !huff_0_table.Write_Table(r_t_out_buffer) || // append the second table
		   !huff_len_table.Write_Table(r_t_out_buffer)) // append the third table
			return false;
	} else {
		if(!r_t_out_buffer.Resize(sizeof(uint32_t) + huff_table.n_Table_Size() +
		   huff_0_table.n_Table_Size(), false) || // prealloc size of all the tables
		   !r_t_out_buffer.Resize(sizeof(uint32_t), false) || // resize to a single uint32_t (decoded buffer size)
		   !huff_table.Write_Table(r_t_out_buffer) || // append the first table
		   !huff_0_table.Write_Table(r_t_out_buffer)) // append the second table
			return false;
	}
	*(uint32_t*)r_t_out_buffer.p_Data() = uint32_t(r_t_in_buffer.n_Size());
	// write code tables

	huff_tree.SortFrequencies_BySymbol();
	huff_0_tree.SortFrequencies_BySymbol();
	if(use_LengthCoding)
		huff_length_tree.SortFrequencies_BySymbol();
	// prepare for encoding

	{
		CBitEncoder coder(r_t_out_buffer); // append

		const uint8_t *p_src = r_t_in_buffer.p_Data(),
			*p_src_end = r_t_in_buffer.p_Data() + r_t_in_buffer.n_Size();
		for(bool b_had_zero_run = false; p_src != p_src_end; ++ p_src) {
			_ASSERTE(p_src < p_src_end);
			const CHuff8::TFrequency &r_sym = ((b_had_zero_run)?
				huff_0_tree : huff_tree).r_LookupSymbol(*p_src);
			_ASSERTE(r_sym.n_symbol == *p_src);
			// find symbol in Huffman tree

			if(!coder.Encode_Symbol(r_sym.n_code_word, r_sym.n_code_length))
				return false;
			// write symbol as series of bits

			if((b_had_zero_run = !*p_src)) {
				const uint8_t *p_last_zero = p_src + 1;
				while(p_last_zero != p_src_end && !*(p_last_zero))
					++ p_last_zero;
				_ASSERTE(!*(p_last_zero - 1) && (p_last_zero == p_src_end || *p_last_zero));
				// find the last zero

				size_t n_zero_run_length = p_last_zero - p_src;
				// calculate number of zeros

				if(use_LengthCoding) {
					do {
						uint32_t n_run_length = (n_zero_run_length <= UINT32_MAX)?
							uint32_t(n_zero_run_length) : UINT32_MAX;
						n_zero_run_length -= n_run_length;
						// decompose the zero run to up to 2^32 - 1 repeats

						int n_width = n_Bit_Width(n_run_length);
						_ASSERTE(n_width > 0);
						const CHuff8::TFrequency &r_lsym =
							huff_length_tree.r_LookupSymbol(n_width - 1);
						_ASSERTE(r_lsym.n_symbol == n_width - 1);
						if(!coder.Encode_Symbol(r_lsym.n_code_word, r_lsym.n_code_length) ||
						   !coder.Encode_Symbol(n_run_length, n_width))
							return false;
						// write the run length as Huffman-encoded length, followed by raw bits

						if(!n_zero_run_length && n_run_length == UINT32_MAX) {
							const CHuff8::TFrequency &r_lsym =
								huff_length_tree.r_LookupSymbol(0); // width 1
							_ASSERTE(r_lsym.n_symbol == 0);
							if(!coder.Encode_Symbol(r_lsym.n_code_word, r_lsym.n_code_length) ||
							   !coder.Encode_Symbol(0, 1))
								return false;
							// write a single zero
						}
						// in case the last run length was maximum, we need to write a dummy zero size run
					} while(n_zero_run_length);
				} else {
					do {
						uint16_t n_run_length = uint16_t(min(n_zero_run_length, size_t(UINT16_MAX)));
						n_zero_run_length -= n_run_length;
						// decompose the zero run to up to 65536 repeats

						if(!coder.Encode_Symbol(n_run_length, 16))
							return false;
						// write the run length directly

						if(n_zero_run_length) {
							if(!coder.Encode_Symbol(r_sym.n_code_word, r_sym.n_code_length))
								return false;
						}
						// in case the zero run length is decomposed,
						// we need to write another zero symbol first
					} while(n_zero_run_length);
				}

				p_src = p_last_zero - 1;
			}
			_ASSERTE(p_src < p_src_end);
			// in case the symbol was zero, encode run length and skip the occurences of the symbol
		}

		if(!coder.Flush())
			return false;
	}
	// compress

	return true;
}

/*
 *								=== ~CRLE0_HuffmanCodec_2 ===
 */

/*
 *								=== CRLE0_HuffmanCodec_3 ===
 */

/**
 *	@brief decodes data where literals and zero run lengths share a single 16-bit Huffman tree
 *
 *	The input starts with a 32-bit size of the decoded data, followed by a 16-bit symbol
 *	Huffman table (symbols below 256 are literal bytes, symbols 256 and above are zero
 *	runs of length symbol - 256), an 8-bit symbol Huffman table for the symbol following
 *	a zero run, and the bit stream.
 *
 *	@param[in] r_t_in_buffer is the encoded input
 *	@param[out] r_t_out_buffer is the output buffer (resized to the decoded size)
 *
 *	@return Returns true on success, false on failure (input too short, a table or a symbol
 *		could not be decoded, a zero run would not fit in the output, or the output
 *		buffer could not be resized).
 */
bool CRLE0_HuffmanCodec_3::Decode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer)
{
	if(r_t_in_buffer.n_Size() < sizeof(uint32_t))
		return false;
	// least possible size of input

	const uint8_t *p_input = r_t_in_buffer.p_Data();
	const uint8_t *p_src_end = r_t_in_buffer.p_Data() + r_t_in_buffer.n_Size();
	// input buffer pointer

	uint32_t n_unpack_length = *(const uint32_t*)p_input;
	p_input += sizeof(uint32_t);
	// get size of uncompressed data

	CHuffmanUtil<CHuff16::_TySymbol, CHuff16::max_CodeBitNum>::CDecodeTable huff_table(p_input, p_src_end);
	if(!huff_table.Initialize())
		return false;
	p_input = huff_table.p_Pointer();
	CHuffmanUtil<CHuff8::_TySymbol, CHuff8::max_CodeBitNum>::CDecodeTable huff_0_table(p_input, p_src_end);
	if(!huff_0_table.Initialize())
		return false;
	p_input = huff_0_table.p_Pointer();
	// read the tables

	if(!r_t_out_buffer.Resize(n_unpack_length, false))
		return false;
	// alloc output buffer

	uint8_t n_byte = 0;
	int n_bit_num = 0;
	bool b_had_zero_run = false;
	const uint8_t *p_dest_end = r_t_out_buffer.p_Data() + r_t_out_buffer.n_Size();
	for(uint8_t *p_dest = r_t_out_buffer.p_Data(); p_dest != p_dest_end;) {
		if(!b_had_zero_run) {
			uint16_t n_sym;
			if(!huff_table.Decode_Symbol(n_sym,
			   n_byte, n_bit_num, p_input, p_src_end))
				return false;
			if(n_sym < 256) {
				*p_dest = uint8_t(n_sym);
				++ p_dest;
				if(!n_sym)
					b_had_zero_run = true; // just a single zero, but ...
			} else {
				uint16_t n_run_length = n_sym - 256;
				if(n_run_length > size_t(p_dest_end - p_dest))
					return false;
				// corrupt input; the run would overflow the output buffer
				// (and the loop would never reach p_dest_end)

				memset(p_dest, 0, n_run_length * sizeof(uint8_t)); // decompress zero run
				p_dest += n_run_length;
				if(n_run_length < UINT16_MAX - 256) { // !!
					b_had_zero_run = true;
					// note - a run of exactly UINT16_MAX - 256 zeros does not set the flag,
					// the decoder expects at least one more zero / run symbol to follow
					// (possibly a run of zero length, which is symbol 256), and only
					// that one switches to the "after zero run" table
				}
			}
		} else {
			if(!huff_0_table.Decode_Symbol(*p_dest,
			   n_byte, n_bit_num, p_input, p_src_end))
				return false;
			b_had_zero_run = false;
			++ p_dest;
		}
		// decode a single symbol using one of the tables
	}
	// decode data

	_ASSERTE(p_input == p_src_end);
	// make sure we've read the whole input buffer

	return true;
}

bool CRLE0_HuffmanCodec_3::Encode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer)
{
	std::vector<CHuff16::TFrequency> symbol_freq;
	std::vector<CHuff8::TFrequency> symbol_after_zero_run_freq;

	{
		if(!stl_ut::Resize_To_N(symbol_freq,
		   1 << (8 * sizeof(uint8_t)), CHuff16::TFrequency(0)) ||
		   !stl_ut::Resize_To_N(symbol_after_zero_run_freq,
		   1 << (8 * sizeof(uint8_t)), CHuff8::TFrequency(0)))
			return false;
		for(size_t i = 0, n = symbol_freq.size(); i < n; ++ i) {
			symbol_freq[i].n_symbol = i;
			symbol_after_zero_run_freq[i].n_symbol = i;
		}
		// alloc symbol frequencies

		std::map<uint16_t, size_t> run_length_set;

		const uint8_t *p_src = r_t_in_buffer.p_Data(),
			*p_end = r_t_in_buffer.p_Data() + r_t_in_buffer.n_Size();
		for(bool b_had_zero_run = false; p_src != p_end; ++ p_src) {
			if(b_had_zero_run) {
				_ASSERTE(*p_src != 0); // there shouldn't be a 0 after a zero run
				b_had_zero_run = false; // clear the flag
				_ASSERTE(symbol_after_zero_run_freq[*p_src].n_frequency < SIZE_MAX);
				++ symbol_after_zero_run_freq[*p_src].n_frequency;
				// encodes symbols occuring after zero runs
			} else if(!*p_src) {
				const uint8_t *p_last_zero = p_src + 1;
				while(p_last_zero != p_end && !*(p_last_zero))
					++ p_last_zero;
				_ASSERTE(!*(p_last_zero - 1) && (p_last_zero == p_end || *p_last_zero));
				// find the last zero

				size_t n_zero_run_length = p_last_zero - p_src;
				// calculate number of zeros

				if(n_zero_run_length == 1)
					b_had_zero_run = true; // only a single zero, but ...
				else
					b_had_zero_run = true;
				// set the context flag

				try {
					while(n_zero_run_length) {
						uint16_t n_run_length = uint16_t(min(n_zero_run_length,
							size_t(UINT16_MAX) - 256)); // !!
						n_zero_run_length -= n_run_length;
						// decompose the zero run to up to 65536 repeats

						if(n_run_length > 1) {
							_ASSERTE(run_length_set[n_run_length] < SIZE_MAX);
							++ run_length_set[n_run_length];
							// increment run length frequency

							if(n_run_length == UINT16_MAX - 256 && !n_zero_run_length) {
								n_run_length = 0;

								_ASSERTE(run_length_set[n_run_length] < SIZE_MAX);
								++ run_length_set[n_run_length];
								// increment run length frequency zero
							}
							// in case the last run length was 65536 - 256,
							// force one more zero-length run
						} else {
							_ASSERTE(symbol_freq[0].n_frequency < SIZE_MAX);
							++ symbol_freq[0].n_frequency;
							// the first tree encodes all symbols, including the nulls
						}
					}
				} catch(std::bad_alloc&) {
					return false;
				}
				// accumulate zero run length frequencies

				p_src = p_last_zero - 1;
				// shift behind the zero run
			} else {
				_ASSERTE(symbol_freq[*p_src].n_frequency < SIZE_MAX);
				++ symbol_freq[*p_src].n_frequency;
				// encodes all symbols, including the nulls
			}
		}
		// calculate frequencies

		_ASSERTE(symbol_freq.size() == 256);
		if(!stl_ut::Resize_To_N(symbol_freq, 256 + run_length_set.size(),
		   CHuff16::TFrequency(0)))
			return false;
		std::vector<CHuff16::TFrequency>::iterator p_freq_it =
			symbol_freq.begin() + 256;
		for(std::map<uint16_t, size_t>::const_iterator p_rlf_it = run_length_set.begin(),
		   p_end_it = run_length_set.end(); p_rlf_it != p_end_it; ++ p_rlf_it, ++ p_freq_it) {
			uint16_t n_run_length = (*p_rlf_it).first;
			_ASSERTE(n_run_length > 1); // runs of a single zero are encoded differently
			size_t n_frequency = (*p_rlf_it).second;
			*p_freq_it = CHuff16::TFrequency(256 + n_run_length, n_frequency);
		}
		// copy set of frequencies to the list of frequecies
	}
	// calculate symbol frequencies

	CHuff16 huff_tree;
	CHuff8 huff_0_tree;
	huff_tree.Use_SymbolFrequencies(symbol_freq, true);
	huff_0_tree.Use_SymbolFrequencies(symbol_after_zero_run_freq, true);
	// use the calculated frequencies, note that both are sorted

	if(!huff_tree.Assign_CodeWords() || !huff_0_tree.Assign_CodeWords())
		return false;
	// assign codewords (sorts symbol by frequency)

	CHuffmanUtil<CHuff16::_TySymbol, CHuff16::max_CodeBitNum>::CEncodeTable huff_table(huff_tree);
	CHuffmanUtil<CHuff8::_TySymbol, CHuff8::max_CodeBitNum>::CEncodeTable huff_0_table(huff_0_tree);
	if(!r_t_out_buffer.Resize(sizeof(uint32_t) + huff_table.n_Table_Size() +
	   huff_0_table.n_Table_Size(), false) || // prealloc size of all the tables
	   !r_t_out_buffer.Resize(sizeof(uint32_t), false) || // resize to a single uint32_t (decoded buffer size)
	   !huff_table.Write_Table(r_t_out_buffer) || // append the first table
	   !huff_0_table.Write_Table(r_t_out_buffer)) // append the second table
		return false;
	*(uint32_t*)r_t_out_buffer.p_Data() = uint32_t(r_t_in_buffer.n_Size());
	// write code tables

	huff_tree.SortFrequencies_BySymbol();
	huff_0_tree.SortFrequencies_BySymbol();
	// prepare for encoding

	{
		CBitEncoder coder(r_t_out_buffer); // append

		const uint8_t *p_src = r_t_in_buffer.p_Data(),
			*p_src_end = r_t_in_buffer.p_Data() + r_t_in_buffer.n_Size();
		for(bool b_had_zero_run = false; p_src != p_src_end; ++ p_src) {
			_ASSERTE(p_src < p_src_end);
			if(*p_src) {
				if(!b_had_zero_run) {
					const CHuff16::TFrequency &r_sym = huff_tree.r_LookupSymbol(*p_src);
					_ASSERTE(r_sym.n_symbol == *p_src);
					// find symbol in Huffman tree

					if(!coder.Encode_Symbol(r_sym.n_code_word, r_sym.n_code_length))
						return false;
					// write symbol as series of bits
				} else {
					const CHuff8::TFrequency &r_sym = huff_0_tree.r_LookupSymbol(*p_src);
					_ASSERTE(r_sym.n_symbol == *p_src);
					// find symbol in Huffman tree

					if(!coder.Encode_Symbol(r_sym.n_code_word, r_sym.n_code_length))
						return false;
					// write symbol as series of bits

					b_had_zero_run = false; // !!
				}
			} else {
				_ASSERTE(!b_had_zero_run);
				// otherwise would include this null as well

				const uint8_t *p_last_zero = p_src + 1;
				while(p_last_zero != p_src_end && !*(p_last_zero))
					++ p_last_zero;
				_ASSERTE(!*(p_last_zero - 1) && (p_last_zero == p_src_end || *p_last_zero));
				// find the last zero

				size_t n_zero_run_length = p_last_zero - p_src;
				// calculate number of zeros

				if(n_zero_run_length == 1)
					b_had_zero_run = true; // just a single zero, but still ...
				else
					b_had_zero_run = true;
				// had a zero run

				do {
					uint16_t n_run_length = uint16_t(min(n_zero_run_length,
						size_t(UINT16_MAX) - 256)); // !!
					n_zero_run_length -= n_run_length;
					// decompose the zero run to up to 65536 - 256 repeats

					uint16_t n_encode = (n_run_length > 1)? n_run_length + 256 : 0;
					const CHuff16::TFrequency &r_sym0 = huff_tree.r_LookupSymbol(n_encode);
					_ASSERTE(r_sym0.n_symbol == n_encode);
					// find symbol in Huffman tree

					if(!coder.Encode_Symbol(r_sym0.n_code_word, r_sym0.n_code_length))
						return false;
					// write symbol as series of bits

					if(n_run_length == UINT16_MAX - 256 && !n_zero_run_length) {
						n_run_length = 0;

						uint16_t n_encode = (n_run_length > 1)? n_run_length + 256 : 0;
						const CHuff16::TFrequency &r_sym0 = huff_tree.r_LookupSymbol(n_encode);
						_ASSERTE(r_sym0.n_symbol == n_encode);
						// find symbol in Huffman tree

						if(!coder.Encode_Symbol(r_sym0.n_code_word, r_sym0.n_code_length))
							return false;
						// write symbol as series of bits
					}
					// in case the last run length was 65536 - 256,
					// force one more zero-length run
				} while(n_zero_run_length);

				p_src = p_last_zero - 1;
			}
			_ASSERTE(p_src < p_src_end);
		}

		if(!coder.Flush())
			return false;
	}
	// compress

	return true;
}

/*
 *								=== ~CRLE0_HuffmanCodec_3 ===
 */
/*
 *								=== CInversionFrequenciesCodec::CSortAsc ===
 */

/**
 *	@brief comparison function object; orders symbols by ascending value
 *		of their entry in a frequency list
 */
class CInversionFrequenciesCodec::CSortAsc {
protected:
	const uint32_t *m_p_freq_list; /**< @brief frequency list, indexed by symbol (not owned) */

public:
	/**
	 *	@brief default constructor
	 *	@param[in] p_freq_list is pointer to the frequency list; it is indexed by
	 *		uint8_t symbols and only the pointer is stored
	 */
	CSortAsc(const uint32_t *p_freq_list)
		:m_p_freq_list(p_freq_list)
	{}

	/**
	 *	@brief less-than comparison of symbol frequencies
	 *
	 *	@param[in] n_sym_a is the first symbol
	 *	@param[in] n_sym_b is the second symbol
	 *
	 *	@return Returns true if frequency of n_sym_a is lower than that of n_sym_b.
	 */
	inline bool operator ()(uint8_t n_sym_a, uint8_t n_sym_b) const
	{
		const uint32_t n_freq_a = m_p_freq_list[n_sym_a];
		const uint32_t n_freq_b = m_p_freq_list[n_sym_b];
		return n_freq_a < n_freq_b;
	}
};

/*
 *								=== ~CInversionFrequenciesCodec::CSortAsc ===
 */

/*
 *								=== CInversionFrequenciesCodec::CSortDesc ===
 */

/**
 *	@brief comparison function object; orders symbols by descending value
 *		of their entry in a frequency list
 */
class CInversionFrequenciesCodec::CSortDesc {
protected:
	const uint32_t *m_p_freq_list; /**< @brief frequency list, indexed by symbol (not owned) */

public:
	/**
	 *	@brief default constructor
	 *	@param[in] p_freq_list is pointer to the frequency list; it is indexed by
	 *		uint8_t symbols and only the pointer is stored
	 */
	CSortDesc(const uint32_t *p_freq_list)
		:m_p_freq_list(p_freq_list)
	{}

	/**
	 *	@brief greater-than comparison of symbol frequencies
	 *
	 *	@param[in] n_sym_a is the first symbol
	 *	@param[in] n_sym_b is the second symbol
	 *
	 *	@return Returns true if frequency of n_sym_a is higher than that of n_sym_b.
	 */
	inline bool operator ()(uint8_t n_sym_a, uint8_t n_sym_b) const
	{
		const uint32_t n_freq_a = m_p_freq_list[n_sym_a];
		const uint32_t n_freq_b = m_p_freq_list[n_sym_b];
		return n_freq_a > n_freq_b;
	}
};

/*
 *								=== ~CInversionFrequenciesCodec::CSortDesc ===
 */

/*
 *								=== CInversionFrequenciesCodec ===
 */


// compile-time configuration of the inversion frequencies codec follows
// (the ratio notes are the original author's observations)

//#define __INVERSION_FREQUENCIES_ENCODE_ZERO_RUNS
//#define __INVERSION_FREQUENCIES_ZERO_RUN_SYMBOL 0
// much worse ratio (disabled; presumably enables encoding runs of zeros
// using the specified symbol - the code using it is not in this part of the file)

#define __INVERSION_FREQUENCIES_BIT_ENCODED_RARE_SYMBOLS
// much better ratio (if defined, ModifiedDecode() reads the escaped symbols using
// a bit decoder with variable number of bits, rather than as fixed-size values)

#define __INVERSION_FREQUENCIES_DELTA_ENCODE_FREQUENCY
// slightly better ratio (NOTE(review): presumably enables delta coding
// of the frequencies - confirm against the code that tests this macro)

/*
 *	static bool CInversionFrequenciesCodec::Decode(const TBuffer &r_t_in_buffer,
 *		TBuffer &r_t_out_buffer)
 *		- decodes data from r_t_in_buffer, outputs to r_t_out_buffer (can be empty)
 *		- returns true on success, false on failure
 */
bool CInversionFrequenciesCodec::Decode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer)
{
	if(r_t_in_buffer.n_Size() < sizeof(uint32_t))
		return false;
	// minimal size of input

	const uint8_t *p_src = r_t_in_buffer.p_Data();
	const uint8_t *p_end = p_src + r_t_in_buffer.n_Size();

	uint32_t n_output_size = *(uint32_t*)p_src;
	p_src += sizeof(uint32_t);
	// read output size

	CDecodeVarLength decode(p_src, p_end);
	return Decode_IF(n_output_size, decode, r_t_out_buffer) &&
		Decode_PermTable(decode.p_Pointer(), p_end, r_t_out_buffer);
}

/**
 *	@brief encodes data using inversion frequencies
 *
 *	The output starts with a 32-bit size of the input, followed by the data written by
 *	Encode_IF() (through CEmitVarLength) and by Encode_PermTable().
 *
 *	@param[in] r_t_in_buffer is the data to be encoded
 *	@param[out] r_t_out_buffer is the output buffer (can be empty,
 *		the original contents are overwritten)
 *	@param[in] n_permutation_type is the permutation type, passed on to Encode_IF()
 *		and Encode_PermTable() (the declaration defaults it to sort_NoSort)
 *
 *	@return Returns true on success, false on failure (that includes input
 *		larger than what the 32-bit size header can hold).
 */
bool CInversionFrequenciesCodec::Encode(const TBuffer &r_t_in_buffer,
	TBuffer &r_t_out_buffer, int n_permutation_type)
{
	if(r_t_in_buffer.n_Size() > UINT32_MAX)
		return false;
	// the size of the input is stored as uint32_t, it would be silently truncated

	r_t_out_buffer.Resize(r_t_in_buffer.n_Size(), false);
	// preallocate some space in output (the result is deliberately
	// ignored, the resize below reports the failure if there is no memory)

	if(!r_t_out_buffer.Resize(sizeof(uint32_t), false))
		return false;
	*(uint32_t*)r_t_out_buffer.p_Data() = uint32_t(r_t_in_buffer.n_Size());
	// write size of uncompressed data for easier decoding

	uint32_t p_frequency[256];
	uint8_t p_perm_table[256];
	// passed to Encode_IF() and then to Encode_PermTable()

	return Encode_IF(r_t_in_buffer, n_permutation_type,
		p_frequency, p_perm_table, CEmitVarLength(r_t_out_buffer)) &&
		Encode_PermTable(n_permutation_type, p_perm_table,
		p_frequency, r_t_out_buffer); // appends to the output buffer
}

//static std::vector<uint32_t> encoded_buffer; // debug

bool CInversionFrequenciesCodec::ModifiedDecode(const TBuffer &r_t_in_buffer,
	const TBuffer &r_t_table_buffer, TBuffer &r_t_out_buffer,
	TBuffer &r_t_temp_buffer)
{
	_ASSERTE(&r_t_in_buffer != &r_t_table_buffer);
	_ASSERTE(&r_t_out_buffer != &r_t_in_buffer);
	_ASSERTE(&r_t_out_buffer != &r_t_table_buffer);
	_ASSERTE(&r_t_temp_buffer != &r_t_in_buffer);
	_ASSERTE(&r_t_temp_buffer != &r_t_out_buffer);
	_ASSERTE(&r_t_temp_buffer != &r_t_table_buffer);
	// the buffers need to be unique, no reusing

	if(r_t_table_buffer.n_Size() < sizeof(uint32_t) + sizeof(uint8_t)/* * 2*/)
		return false;
	// minimal size of input

	const uint8_t *p_src2 = r_t_table_buffer.p_Data();
	const uint8_t *p_end2 = p_src2 + r_t_table_buffer.n_Size();

	uint32_t n_output_size = *(uint32_t*)p_src2;
	p_src2 += sizeof(uint32_t);
	int n_symbol_size = *p_src2;
	++ p_src2;
	/*unsigned int n_log_symbol_num = *p_src2; // makes compression ratio worse
	++ p_src2;*/
	// read output size and symbol parameters

	{
		TBuffer &t_dehuff = r_t_temp_buffer; // t_odo - large allocations here, cache it
		const uint8_t *p_after_huff;
		uint32_t n_max_sym; // will be used to escape rare symbols
		switch(n_symbol_size) {
		case 32:
			if(!(p_after_huff = CHuffmanUtil<uint32_t, max_CodeBitNum32>::p_Decode(r_t_in_buffer,
			   &n_max_sym, t_dehuff/*, n_log_symbol_num*/)))
				return false;
			break;
		case 16:
			{
				uint16_t n_max_sym16;
				if(!(p_after_huff = CHuffmanUtil<uint16_t, max_CodeBitNum16>::p_Decode(r_t_in_buffer,
				   &n_max_sym16, t_dehuff/*, n_log_symbol_num*/)))
					return false;
				n_max_sym = n_max_sym16;
			}
			break;
		case 8:
			{
				uint8_t n_max_sym8;
				if(!(p_after_huff = CHuffmanUtil<uint8_t, max_CodeBitNum8>::p_Decode(r_t_in_buffer,
				   &n_max_sym8, t_dehuff/*, n_log_symbol_num*/)))
					return false;
				n_max_sym = n_max_sym8;
			}
			break;
		default:
			return false;
		};
		// decodes Huffman-encoded frequencies

		size_t n_old_size = t_dehuff.n_Size();
		if(n_symbol_size != 32 && !t_dehuff.Resize(n_old_size *
		   (32 / n_symbol_size), true)) // scale up
			return false;
		if(n_symbol_size == 8) {
			const uint8_t *p_src = t_dehuff.p_Data() + n_old_size;
			for(uint32_t *p_rbegin = (uint32_t*)(t_dehuff.p_Data() + t_dehuff.n_Size()),
			   *p_rend = (uint32_t*)t_dehuff.p_Data(); p_rbegin != p_rend;)
				*(-- p_rbegin) = *(-- p_src);
			_ASSERTE(p_src == t_dehuff.p_Data());
		} else if(n_symbol_size == 16) {
			const uint16_t *p_src = (const uint16_t*)(t_dehuff.p_Data() + n_old_size);
			for(uint32_t *p_rbegin = (uint32_t*)(t_dehuff.p_Data() + t_dehuff.n_Size()),
			   *p_rend = (uint32_t*)t_dehuff.p_Data(); p_rbegin != p_rend;)
				*(-- p_rbegin) = *(-- p_src);
			_ASSERTE(p_src == (const uint16_t*)t_dehuff.p_Data());
		} else if(n_symbol_size != 32)
			return false;
		// convert the bufer to uint32_t

		{
			const uint8_t *p_end = r_t_in_buffer.p_Data() + r_t_in_buffer.n_Size();
#ifdef __INVERSION_FREQUENCIES_BIT_ENCODED_RARE_SYMBOLS
			uint32_t n_escape_symbol = *(uint32_t*)(p_end - 2 * sizeof(uint32_t));
			int n_escaped_symbol_bit_num = *(uint32_t*)(p_end - sizeof(uint32_t));
#else // __INVERSION_FREQUENCIES_BIT_ENCODED_RARE_SYMBOLS
			int n_escaped_symbol_bit_num = *(uint32_t*)(p_end - sizeof(uint32_t));
			uint32_t n_escaped_symbols_size = *(uint32_t*)(p_end - 2 * sizeof(uint32_t));
			if(p_after_huff + n_escaped_symbols_size + 2 * sizeof(uint32_t) != p_end)
				return false; // make sure the whole buffer is used
			uint32_t n_escape_symbol = n_max_sym;
#endif // __INVERSION_FREQUENCIES_BIT_ENCODED_RARE_SYMBOLS

			p_end -= 2 * sizeof(uint32_t);
#ifdef __INVERSION_FREQUENCIES_BIT_ENCODED_RARE_SYMBOLS
			{
				CBitDecoder bit_decoder(p_after_huff, p_end);
				for(uint32_t *p_src = (uint32_t*)t_dehuff.p_Data(),
				   *p_end2 = (uint32_t*)(t_dehuff.p_Data() + t_dehuff.n_Size());
				   p_src != p_end2; ++ p_src) {
					if(*p_src >= n_escape_symbol) {
						int n_encoded_len = n_escaped_symbol_bit_num - (*p_src - n_escape_symbol);
						_ASSERTE(n_encoded_len > 0);
						if(!bit_decoder.Decode_Symbol(*p_src, n_encoded_len))
							return false;
						/*_ASSERTE(!encoded_buffer.empty() && encoded_buffer.front() == n_symbol);
						encoded_buffer.erase(encoded_buffer.begin());*/ // debug
					}
				}
				if(!bit_decoder.b_Finished())
					return false;
				// the symbols are bit-encoded
			}
#else // __INVERSION_FREQUENCIES_BIT_ENCODED_RARE_SYMBOLS
			switch(n_escaped_symbol_bit_num) {
			case 0:
				if(p_after_huff != p_end)
					return false;
				// no escaped symbols
				break;
			case 8:
				{
					const uint8_t *p_sym = p_after_huff;
					for(uint32_t *p_src = (uint32_t*)t_dehuff.p_Data(),
					   *p_end2 = (uint32_t*)(t_dehuff.p_Data() + t_dehuff.n_Size());
					   p_src != p_end2; ++ p_src) {
						if(*p_src == n_escape_symbol) {
							if(p_sym >= p_end)
								return false;
							*p_src = *p_sym;
							++ p_sym;
						}
					}
					if(p_sym != p_end)
						return false;
				}
				break;
			case 16:
				{
					const uint16_t *p_sym = (const uint16_t*)p_after_huff;
					for(uint32_t *p_src = (uint32_t*)t_dehuff.p_Data(),
					   *p_end2 = (uint32_t*)(t_dehuff.p_Data() + t_dehuff.n_Size());
					   p_src != p_end2; ++ p_src) {
						if(*p_src == n_escape_symbol) {
							if(p_sym >= (const uint16_t*)p_end)
								return false;
							*p_src = *p_sym;
							++ p_sym;
						}
					}
					if(p_sym != (const uint16_t*)p_end)
						return false;
				}
				break;
			case 32:
				{
					const uint32_t *p_sym = (const uint32_t*)p_after_huff;
					for(uint32_t *p_src = (uint32_t*)t_dehuff.p_Data(),
					   *p_end2 = (uint32_t*)(t_dehuff.p_Data() + t_dehuff.n_Size());
					   p_src != p_end2; ++ p_src) {
						if(*p_src == n_escape_symbol) {
							if(p_sym >= (const uint32_t*)p_end)
								return false;
							*p_src = *p_sym;
							++ p_sym;
						}
					}
					if(p_sym != (const uint32_t*)p_end)
						return false;
				}
				break;
			default:
				//printf(" %d ", n_escaped_symbol_bit_num); // debug
				return false;
			};
#endif // __INVERSION_FREQUENCIES_BIT_ENCODED_RARE_SYMBOLS
		}
		// replace the escaped symbols by the symbols in the second stream

		const uint8_t *p_src = t_dehuff.p_Data();
		const uint8_t *p_end = p_src + t_dehuff.n_Size();
		CDecodeInt<uint32_t> decode(p_src, p_end);
		if(!Decode_IF(n_output_size, decode, r_t_out_buffer) || decode.p_Pointer() != p_end)
			return false;
		// decodes inversion frequencies
	}
	// decodes inversion frequencies from the first buffer

	return Decode_PermTable(p_src2, p_end2, r_t_out_buffer);
	// decodes permutation table from the second buffer
}

/**
 *	@brief encodes a buffer using inversion frequencies, with the rare values escaped
 *		to a side stream and the rest Huffman-coded
 *
 *	The inversion frequencies are first emitted as uint32_t values to the temp buffer.
 *	Values that occur less than symbol_FreqThresh times are replaced by an escape symbol
 *	and stored separately. The remaining stream is narrowed to 8 or 16 bits per symbol
 *	if all the values fit, and is Huffman-coded to the output buffer.
 *
 *	Layout of r_t_out_buffer: Huffman stream, escaped symbols, two uint32_t values
 *	(escape symbol and the largest escaped symbol bit width if
 *	__INVERSION_FREQUENCIES_BIT_ENCODED_RARE_SYMBOLS is defined, otherwise size
 *	of the escaped symbols in bytes and their storage width in bits: 0, 8, 16 or 32).
 *
 *	Layout of r_t_table_buffer: uint32_t input size, uint8_t Huffman symbol width in bits
 *	(8, 16 or 32), followed by whatever Encode_PermTable() appends.
 *
 *	@param[in] r_t_in_buffer is the input data
 *	@param[out] r_t_out_buffer is filled with the encoded frequencies
 *	@param[out] r_t_table_buffer is filled with the decoding parameters and the permutation table
 *	@param[in,out] r_t_temp_buffer is a scratch buffer (may be the same object as r_t_table_buffer)
 *	@param[in] n_permutation_type is the symbol ordering, passed on to Encode_IF()
 *		and Encode_PermTable()
 *
 *	@return Returns true on success, false on failure (allocation failure, a failure
 *		of one of the encoding stages or input that can not be represented).
 */
bool CInversionFrequenciesCodec::ModifiedEncode(const TBuffer &r_t_in_buffer,
	TBuffer &r_t_out_buffer, TBuffer &r_t_table_buffer,
	TBuffer &r_t_temp_buffer, int n_permutation_type)
{
	_ASSERTE(&r_t_in_buffer != &r_t_table_buffer);
	_ASSERTE(&r_t_out_buffer != &r_t_in_buffer);
	_ASSERTE(&r_t_out_buffer != &r_t_table_buffer);
	_ASSERTE(&r_t_temp_buffer != &r_t_in_buffer);
	_ASSERTE(&r_t_temp_buffer != &r_t_out_buffer);
	// the buffers need to be unique, no reusing, except that r_t_table_buffer can also be used as r_t_temp_buffer

	if(r_t_in_buffer.n_Size() > UINT32_MAX)
		return false;
	// the input size is written as uint32_t below, refuse to silently truncate it

	r_t_temp_buffer.Resize(r_t_in_buffer.n_Size(), false); // ignore allocation failures here, this is only to avoid reallocating later
	r_t_temp_buffer.Resize(0, false); // contract!
	// preallocate some space in the temp buffer

	uint32_t p_frequency[256];
	uint8_t p_perm_table[256];
	if(!Encode_IF(r_t_in_buffer, n_permutation_type,
	   p_frequency, p_perm_table, CEmitInt<uint32_t>(r_t_temp_buffer)))
		return false;
	_ASSERTE(r_t_temp_buffer.n_Size() % sizeof(uint32_t) == 0);
	// encodes only the frequencies, as uint32_t values, to the temp buffer

	TBuffer escaped_symbols; // should be fairly small, no need to cache it
	uint32_t n_max_symbol, n_escape_symbol, n_max_escaped_symbol; // amounts to symbol representation
	int n_escaped_symbol_bit_num;
	try {
		std::map<uint32_t, size_t> encoded_length_freqs;
		for(uint32_t *p_begin = (uint32_t*)r_t_temp_buffer.p_Data(), *p_end =
		   (uint32_t*)(r_t_temp_buffer.p_Data() + r_t_temp_buffer.n_Size());
		   p_begin != p_end; ++ p_begin) {
			_ASSERTE(encoded_length_freqs[*p_begin] < SIZE_MAX);
			++ encoded_length_freqs[*p_begin];
		}
		// count how many times each value occurs in the stream

		std::set<uint32_t> symbols_to_escape;
		for(std::map<uint32_t, size_t>::iterator p_sym_it = encoded_length_freqs.begin(),
		   p_end_it = encoded_length_freqs.end(); p_sym_it != p_end_it;) {
			if((*p_sym_it).second < symbol_FreqThresh) {
				symbols_to_escape.insert((*p_sym_it).first);

				std::map<uint32_t, size_t>::iterator p_next_it = p_sym_it;
				++ p_next_it; // step to the next element before erasing this one

				encoded_length_freqs.erase(p_sym_it);
				// only the iterators to the erased element are invalidated,
				// p_next_it and p_end_it remain valid

				p_sym_it = p_next_it;
			} else
				++ p_sym_it;
		}
		// move the rare values from the map to the set of symbols to escape

		n_max_escaped_symbol = (symbols_to_escape.empty())? 0 : *(-- symbols_to_escape.end());
		n_escaped_symbol_bit_num = n_Bit_Width(n_max_escaped_symbol);
		uint32_t n_min_escaped_symbol = (symbols_to_escape.empty())? 0 : *symbols_to_escape.begin();
		int n_min_escaped_symbol_bit_num = n_Bit_Width(n_min_escaped_symbol);
		_ASSERTE((n_max_escaped_symbol & n_Mask(n_escaped_symbol_bit_num)) == n_max_escaped_symbol);
		_ASSERTE((n_max_escaped_symbol & ~n_Mask(n_escaped_symbol_bit_num)) == 0);
		n_max_symbol = (encoded_length_freqs.empty())? 0 : (*(-- encoded_length_freqs.end())).first;
		if(n_max_symbol >= UINT32_MAX - n_escaped_symbol_bit_num)
			return false;
		// make sure that there is space for more symbols; up to n_escaped_symbol_bit_num + 1
		// is added to n_max_symbol below, hence ">=" (with ">" the sum could wrap around to
		// zero and the stream would then be wrongly narrowed to 8 bits)

		n_escape_symbol = n_max_symbol + 1; // use a symbol with the highest value to escape the less frequent symbols
		n_max_symbol += n_escaped_symbol_bit_num - n_min_escaped_symbol_bit_num + 1; // it depends on the lowest value of the escaped symbol
		// stats, assign a symbol to denote the escape

		// t_odo - add threshold to minimum number of bits the escape symbols are stored with
		// to reduce maximum symbol value (threshold of 2 helps a bit)

#ifdef __INVERSION_FREQUENCIES_BIT_ENCODED_RARE_SYMBOLS
		_ASSERTE(escaped_symbols.b_Empty());
		CBitEncoder encoder(escaped_symbols);

		for(uint32_t *p_begin = (uint32_t*)r_t_temp_buffer.p_Data(), *p_end =
		   (uint32_t*)(r_t_temp_buffer.p_Data() + r_t_temp_buffer.n_Size());
		   p_begin != p_end; ++ p_begin) {
			uint32_t n_symbol = *p_begin;
			if(symbols_to_escape.count(n_symbol)) {
				int n_bit_num = n_Bit_Width(n_symbol);
				_ASSERTE(n_bit_num >= n_min_escaped_symbol_bit_num && n_bit_num <= n_escaped_symbol_bit_num);
				if(!encoder.Encode_Symbol(n_symbol, n_bit_num))
					return false;
				*p_begin = n_escape_symbol + n_escaped_symbol_bit_num - n_bit_num; // use the n-th last escape symbol to encode length of n bits
				_ASSERTE(*p_begin >= n_escape_symbol && *p_begin <= n_max_symbol); // make sure it fits
			}
		}
		if(!encoder.Flush())
			return false;
		// bit-encode the escaped symbols, the escape symbol value carries the number of bits
#else // __INVERSION_FREQUENCIES_BIT_ENCODED_RARE_SYMBOLS
		CEmitInt<uint32_t> emit_escaped(escaped_symbols);
		for(uint32_t *p_begin = (uint32_t*)r_t_temp_buffer.p_Data(), *p_end =
		   (uint32_t*)(r_t_temp_buffer.p_Data() + r_t_temp_buffer.n_Size());
		   p_begin != p_end; ++ p_begin) {
			if(symbols_to_escape.count(*p_begin)) {
				if(!emit_escaped(*p_begin))
					return false;
				*p_begin = n_escape_symbol;
			}
		}
		// store the escaped symbols as uint32_t, a single escape symbol replaces all of them
#endif // __INVERSION_FREQUENCIES_BIT_ENCODED_RARE_SYMBOLS
		// create a buffer with escaped symbols
	} catch(std::bad_alloc&) {
		return false;
	}
	// determine how to store the symbols

	if(n_max_symbol <= UINT8_MAX) {
		n_max_symbol = 8; // from now on, n_max_symbol holds the symbol width in bits
		uint8_t *p_dest = (uint8_t*)r_t_temp_buffer.p_Data();
		for(uint32_t *p_begin = (uint32_t*)r_t_temp_buffer.p_Data(), *p_end =
		   (uint32_t*)(r_t_temp_buffer.p_Data() + r_t_temp_buffer.n_Size());
		   p_begin != p_end; ++ p_begin, ++ p_dest)
			*p_dest = (uint8_t)*p_begin;
		r_t_temp_buffer.Resize(r_t_temp_buffer.n_Size() / 4, true); // scale down
	} else if(n_max_symbol <= UINT16_MAX) {
		n_max_symbol = 16;
		uint16_t *p_dest = (uint16_t*)r_t_temp_buffer.p_Data();
		for(uint32_t *p_begin = (uint32_t*)r_t_temp_buffer.p_Data(), *p_end =
		   (uint32_t*)(r_t_temp_buffer.p_Data() + r_t_temp_buffer.n_Size());
		   p_begin != p_end; ++ p_begin, ++ p_dest)
			*p_dest = (uint16_t)*p_begin;
		r_t_temp_buffer.Resize(r_t_temp_buffer.n_Size() / 2, true); // scale down
	} else
		n_max_symbol = 32;
	// narrow the buffer in place (the write pointer never overtakes the read pointer)

#ifndef __INVERSION_FREQUENCIES_BIT_ENCODED_RARE_SYMBOLS
	if(!escaped_symbols.n_Size()) {
		n_max_escaped_symbol = 0;
		// nothing was escaped; store zero width so that the decoder does not go looking
		// for an escape symbol (the decoder in this file handles width 0 as "no escaped
		// symbols", while for the other widths it substitutes every occurrence of the
		// value reported by the Huffman decoder - presumably the largest symbol present,
		// which would be an ordinary symbol here - and fails as the escaped stream is empty)
	} else if(n_max_escaped_symbol <= UINT8_MAX) {
		n_max_escaped_symbol = 8; // from now on, n_max_escaped_symbol holds the storage width in bits
		uint8_t *p_dest = (uint8_t*)escaped_symbols.p_Data();
		for(uint32_t *p_begin = (uint32_t*)escaped_symbols.p_Data(), *p_end =
		   (uint32_t*)(escaped_symbols.p_Data() + escaped_symbols.n_Size());
		   p_begin != p_end; ++ p_begin, ++ p_dest)
			*p_dest = (uint8_t)*p_begin;
		escaped_symbols.Resize(escaped_symbols.n_Size() / 4, true); // scale down
	} else if(n_max_escaped_symbol <= UINT16_MAX) {
		n_max_escaped_symbol = 16;
		uint16_t *p_dest = (uint16_t*)escaped_symbols.p_Data();
		for(uint32_t *p_begin = (uint32_t*)escaped_symbols.p_Data(), *p_end =
		   (uint32_t*)(escaped_symbols.p_Data() + escaped_symbols.n_Size());
		   p_begin != p_end; ++ p_begin, ++ p_dest)
			*p_dest = (uint16_t)*p_begin;
		escaped_symbols.Resize(escaped_symbols.n_Size() / 2, true); // scale down
	} else
		n_max_escaped_symbol = 32;
#endif // !__INVERSION_FREQUENCIES_BIT_ENCODED_RARE_SYMBOLS
	// convert the escaped symbol buffer as well

	switch(n_max_symbol) {
	case 8:
		if(!CHuffmanUtil<uint8_t, max_CodeBitNum8>::Encode(r_t_temp_buffer, r_t_out_buffer))
			return false;
		break;
	case 16:
		if(!CHuffmanUtil<uint16_t, max_CodeBitNum16>::Encode(r_t_temp_buffer, r_t_out_buffer))
			return false;
		break;
	case 32:
		if(!CHuffmanUtil<uint32_t, max_CodeBitNum32>::Encode(r_t_temp_buffer, r_t_out_buffer))
			return false;
		break;
	}
	// encode the frequencies separately from the other data

	if(!r_t_out_buffer.Grow(escaped_symbols.n_Size() + 2 * sizeof(uint32_t)))
		return false;
	memcpy(r_t_out_buffer.p_Data() + (r_t_out_buffer.n_Size() -
		(escaped_symbols.n_Size() + 2 * sizeof(uint32_t))),
		escaped_symbols.p_Data(), escaped_symbols.n_Size());
#ifdef __INVERSION_FREQUENCIES_BIT_ENCODED_RARE_SYMBOLS
	*(uint32_t*)(r_t_out_buffer.p_Data() + (r_t_out_buffer.n_Size() - 2 * sizeof(uint32_t))) = n_escape_symbol;
	*(uint32_t*)(r_t_out_buffer.p_Data() + (r_t_out_buffer.n_Size() - sizeof(uint32_t))) = n_escaped_symbol_bit_num; // this now needs to be encoded
#else // __INVERSION_FREQUENCIES_BIT_ENCODED_RARE_SYMBOLS
	*(uint32_t*)(r_t_out_buffer.p_Data() + (r_t_out_buffer.n_Size() - 2 * sizeof(uint32_t))) = escaped_symbols.n_Size();
	*(uint32_t*)(r_t_out_buffer.p_Data() + (r_t_out_buffer.n_Size() - sizeof(uint32_t))) = n_max_escaped_symbol; // todo - store as uint8_t
#endif // __INVERSION_FREQUENCIES_BIT_ENCODED_RARE_SYMBOLS
	// append the escaped symbols and the two trailing parameters to the end of the Huffman-encoded stream

	if(!r_t_table_buffer.Resize(sizeof(uint32_t) + sizeof(uint8_t), false))
		return false;
	*(uint32_t*)r_t_table_buffer.p_Data() = r_t_in_buffer.n_Size();
	*(uint8_t*)(r_t_table_buffer.p_Data() + sizeof(uint32_t)) = n_max_symbol; // the symbol width: 8, 16 or 32
	// write size of uncompressed data and symbol parameters for easier decoding
	// (only now, as r_t_table_buffer may be the same object as r_t_temp_buffer)

	if(!Encode_PermTable(n_permutation_type, p_perm_table,
	   p_frequency, r_t_table_buffer)) // appends to the second output buffer
		return false;
	// encodes permutation table to the second buffer

	return true;
}

/**
 *	@brief reconstructs a buffer of symbols from a stream of inversion frequencies
 *
 *	The output is first filled with the value 255. Then, for each symbol 0 to 254
 *	(in this order), the number of its occurrences is read, followed by the distances
 *	between the occurrences. A distance counts only the positions that hold a symbol
 *	greater than or equal to the one being placed, the positions that were filled
 *	by the lower symbols are skipped over without counting.
 *
 *	@tparam CDecodeObject is type of the decoder; it is called as decode(n) with
 *		an uint32_t variable to fill, and the result is tested for failure
 *
 *	@param[in] n_output_size is size of the decoded data, in bytes
 *	@param[in,out] decode is the source of the encoded values
 *	@param[out] r_t_out_buffer is resized to n_output_size and filled with the decoded symbols
 *
 *	@return Returns true on success, false on failure (failed to allocate the output,
 *		the decoder failed or the positions ran past the end of the output).
 *
 *	@note This does not check whether the decoder consumed all of its input.
 */
template <class CDecodeObject>
bool CInversionFrequenciesCodec::Decode_IF(uint32_t n_output_size,
	CDecodeObject &decode, TBuffer &r_t_out_buffer)
{
	if(!r_t_out_buffer.Resize(n_output_size, false))
		return false;
	// allocate output buffer

	memset(r_t_out_buffer.p_Data(), 0xff, r_t_out_buffer.n_Size());
	// set output to contain highest symbols (255 compares greater than or equal
	// to any symbol being placed, so it also marks the positions that are still free)

#ifdef __INVERSION_FREQUENCIES_DELTA_ENCODE_FREQUENCY
	uint32_t n_prev_count; // n_count gets overwritten inside the loop
#endif // __INVERSION_FREQUENCIES_DELTA_ENCODE_FREQUENCY
	for(int i = 0; i < 255; ++ i) { // symbol 255 is already filled in output buffer by memset() above
		uint32_t n_count;
#ifdef __INVERSION_FREQUENCIES_DELTA_ENCODE_FREQUENCY
		uint32_t n_dcount;
		if(!decode(n_dcount))
			return false;
		if(i) { // delta encoded; the least significant bit is the sign, the rest is the magnitude
			if(n_dcount & 1)
				n_count = n_prev_count - (n_dcount >> 1);
			else
				n_count = n_prev_count + (n_dcount >> 1);
		} else
			n_count = n_dcount; // the first count is stored directly
		n_prev_count = n_count;
#else // __INVERSION_FREQUENCIES_DELTA_ENCODE_FREQUENCY
		if(!decode(n_count))
			return false;
#endif // __INVERSION_FREQUENCIES_DELTA_ENCODE_FREQUENCY
		// read the number of occurrences of symbol i

		if(!n_count)
			continue;
		// no occurrences of this symbol

		uint8_t *p_dest = r_t_out_buffer.p_Data();
		uint8_t *p_dest_end = p_dest + r_t_out_buffer.n_Size();
		// get destination pointers (each symbol starts at the beginning of the output)

		uint8_t n_decoded_char = i;
		do {
			uint32_t n_distance;
			if(!decode(n_distance))
				return false;
#ifdef __INVERSION_FREQUENCIES_ENCODE_ZERO_RUNS
			if(n_distance == __INVERSION_FREQUENCIES_ZERO_RUN_SYMBOL) {
				if(!decode(n_distance))
					return false;
				// the run symbol is followed by the run length, zero means "until the last occurrence"

				if(n_distance) {
					do {
						if(p_dest == p_dest_end)
							return false;
						while(*p_dest < n_decoded_char) {
							if(++ p_dest == p_dest_end)
								return false;
						}
						_ASSERTE(p_dest != p_dest_end);
						// skip to next destination, do not count already-filled values

						*p_dest ++ = n_decoded_char;
						// write character to its position

						-- n_count;
					} while(-- n_distance);
					// limited length zero run

					++ n_count; // decremented one more time upon continue
					continue; // in a do-while loop, this jumps to the while(-- n_count) condition below
				} else {
					do {
						if(p_dest == p_dest_end)
							return false;
						while(*p_dest < n_decoded_char) {
							if(++ p_dest == p_dest_end)
								return false;
						}
						_ASSERTE(p_dest != p_dest_end);
						// skip to next destination, do not count already-filled values

						*p_dest ++ = n_decoded_char;
						// write character to its position
					} while(-- n_count);
					break;
					// zero run until the end (uses up all the remaining occurrences)
				}
			} else {
				if(n_distance > __INVERSION_FREQUENCIES_ZERO_RUN_SYMBOL)
					-- n_distance;
				// the values above the run symbol are stored incremented by one to make room for it
			}
#endif // __INVERSION_FREQUENCIES_ENCODE_ZERO_RUNS
			// read distance

			_ASSERTE(p_dest <= p_dest_end);
			while(n_distance && p_dest != p_dest_end) {
				if(*p_dest >= n_decoded_char)
					-- n_distance;
				++ p_dest;
			}
			// skip n_distance positions, counting only those not filled by the lower symbols

			if(p_dest == p_dest_end)
				return false;
			while(*p_dest < n_decoded_char) {
				if(++ p_dest == p_dest_end)
					return false;
			}
			_ASSERTE(p_dest != p_dest_end);
			// skip to next destination, do not count already-filled values

			*p_dest ++ = n_decoded_char;
			// write character to its position
		} while(-- n_count);
		// fill-in character occurrences (rather slow, but memory-efficient)
	}
	// read inversion frequencies, reconstruct positions

	// note that whether the decoder reached the end of its input is not checked here

	return true;
}

/**
 *	@brief reads a (cropped) permutation table and applies it to the decoded buffer, in place
 *
 *	The table is stored as a 16-bit number of entries, followed (only if there are
 *	any entries) by a single byte with the offset of the first entry and by the entries
 *	themselves, one byte each. The table needs to end exactly at p_end.
 *
 *	@param[in] p_src points to the beginning of the stored table
 *	@param[in] p_end points one past the last byte of the stored table
 *	@param[in,out] r_t_out_buffer contains the symbols to be remapped
 *
 *	@return Returns true on success, false if the table is malformed
 *		or if the buffer contains a symbol that the table does not cover.
 */
bool CInversionFrequenciesCodec::Decode_PermTable(const uint8_t *p_src,
	const uint8_t *p_end, TBuffer &r_t_out_buffer)
{
	const uint8_t *p_cursor = p_src;

	if(p_cursor + sizeof(uint16_t) > p_end)
		return false;
	const int n_entry_num = *(int16_t*)p_cursor;
	p_cursor += sizeof(uint16_t);
	// the number of the stored entries

	if(!n_entry_num)
		return p_cursor == p_end;
	// an empty table means identity, there must be nothing stored behind it

	if(p_cursor == p_end)
		return false;
	const int n_first_symbol = *p_cursor;
	++ p_cursor;
	// the offset of the first stored entry

	if(p_cursor + n_entry_num != p_end)
		return false;
	const uint8_t *p_table = p_cursor;
	// the entries must take up exactly the rest of the data

	uint8_t *p_symbol = r_t_out_buffer.p_Data();
	for(size_t i = 0, n = r_t_out_buffer.n_Size(); i < n; ++ i) {
		const int n_slot = int(p_symbol[i]) - n_first_symbol;
		if(n_slot < 0 || n_slot >= n_entry_num)
			return false; // this symbol lies in the cropped-away part of the table
		p_symbol[i] = p_table[n_slot];
	}
	// remap the symbols

	return true;
}

/**
 *	@brief encodes a buffer as a stream of inversion frequencies
 *
 *	Counts the occurrences of each byte value, builds the permutation table (identity,
 *	or sorted by frequency) and then, for each of the first 255 symbols in the permuted
 *	order, emits the number of its occurrences followed by the distances between them.
 *	A distance counts only the symbols that come later in the permuted order. The last
 *	symbol is not encoded.
 *
 *	@tparam CEmitObject is type of the output functor; it is called as emit(n)
 *		with the value to write, and the result is tested for failure
 *
 *	@param[in] r_t_in_buffer is the input data
 *	@param[in] n_permutation_type is the symbol ordering (sort_NoSort, sort_FreqAscending,
 *		anything else is handled as sort_FreqDescending)
 *	@param[out] p_frequency is filled with the numbers of occurrences of each byte value
 *	@param[out] p_perm_table is filled with the permutation table (the byte values in the
 *		order in which they are encoded)
 *	@param[in] emit is the output functor (passed by value)
 *
 *	@return Returns true on success, false if emit failed.
 */
template <class CEmitObject>
bool CInversionFrequenciesCodec::Encode_IF(const TBuffer &r_t_in_buffer, int n_permutation_type,
	uint32_t p_frequency[256], uint8_t p_perm_table[256], CEmitObject emit)
{
	//_ASSERTE(sizeof(p_frequency) == 256 * sizeof(uint32_t)); // decay? yes.
	memset(p_frequency, 0, (size_t(UINT8_MAX) + 1) * sizeof(p_frequency[0]));
	{
		const uint8_t *p_src = r_t_in_buffer.p_Data();
		const uint8_t *p_end = p_src + r_t_in_buffer.n_Size();
		for(; p_src != p_end; ++ p_src) {
			++ p_frequency[*p_src];
			_ASSERTE(p_frequency[*p_src] > 0); // watch out for overflow
		}
	}
	// count occurrences of all symbols in input sequence

	for(int i = 0; i < 256; ++ i)
		p_perm_table[i] = i;
	// start with identity

	if(n_permutation_type != sort_NoSort) {
		if(n_permutation_type == sort_FreqAscending) {
			std::sort(p_perm_table, p_perm_table + 256, CSortAsc(p_frequency));
			for(int i = 1; i < 256; ++ i)
				_ASSERTE(p_frequency[p_perm_table[i - 1]] <= p_frequency[p_perm_table[i]]);
		} else /*if(n_permutation_type == sort_FreqDescending)*/ {
			_ASSERTE(n_permutation_type == sort_FreqDescending);
			std::sort(p_perm_table, p_perm_table + 256, CSortDesc(p_frequency));
			for(int i = 1; i < 256; ++ i)
				_ASSERTE(p_frequency[p_perm_table[i - 1]] >= p_frequency[p_perm_table[i]]);
		}
		// create permutation table
	}
	uint8_t p_inv_perm_table[256];
	for(int i = 0; i < 256; ++ i)
		p_inv_perm_table[p_perm_table[i]] = i;
	//for(int i = 0; i < 256; ++ i)
	//	std::swap(p_inv_perm_table[i], p_perm_table[i]); // no, the permutation is correct
	// build inverse permutation table (maps a byte value to its rank in the encoding order)

	for(int i = 0; i < 255; ++ i) { // don't have to code the last symbol
		uint8_t n_encoded_char = i;
		uint32_t n_count = p_frequency[p_perm_table[i]];
#ifdef __INVERSION_FREQUENCIES_DELTA_ENCODE_FREQUENCY
		uint32_t n_dcount;
		if(i) {
			uint32_t n_prev_count = p_frequency[p_perm_table[i - 1]];
			if(n_prev_count > n_count) {
				_ASSERTE(!((n_prev_count - n_count) & (uint32_t(1) << (sizeof(uint32_t) * 8 - 1))));
				// make sure it wont overflow (note - this is a runtime error on >= 4GB buffers, it could actually happen)

				n_dcount = 1 | ((n_prev_count - n_count) << 1);
			} else {
				_ASSERTE(!((n_count - n_prev_count) & (uint32_t(1) << (sizeof(uint32_t) * 8 - 1))));
				// make sure it wont overflow (note - this is a runtime error on >= 4GB buffers, it could actually happen)

				n_dcount = (n_count - n_prev_count) << 1;
			}
			// delta-encode, use the least significant bit for sign to avoid very long numbers
		} else
			n_dcount = n_count; // the first count is stored directly
		if(!emit(n_dcount))
			return false;
#else // __INVERSION_FREQUENCIES_DELTA_ENCODE_FREQUENCY
		if(!emit(n_count))
			return false;
#endif // __INVERSION_FREQUENCIES_DELTA_ENCODE_FREQUENCY
		// write frequency of the symbol

		if(!n_count)
			continue;
		// are there occurrences of this symbol?

		uint32_t n_distance = 0;
#ifdef __INVERSION_FREQUENCIES_ENCODE_ZERO_RUNS
		uint32_t n_zero_run = 0; // number of zero distances that were seen but not emitted yet
		const uint32_t n_zero_run_symbol = __INVERSION_FREQUENCIES_ZERO_RUN_SYMBOL;
#endif // __INVERSION_FREQUENCIES_ENCODE_ZERO_RUNS
		for(const uint8_t *p_src = r_t_in_buffer.p_Data();; ++ p_src) { // no end condition, terminates once n_count occurrences were found
			_ASSERTE(p_src < r_t_in_buffer.p_Data() + r_t_in_buffer.n_Size());
			uint8_t n_char = p_inv_perm_table[*p_src]; // apply inverse permutation here, we need to process symbols in ordered fashion
			if(n_char == n_encoded_char) {
#ifdef __INVERSION_FREQUENCIES_ENCODE_ZERO_RUNS
				if(!n_distance)
					++ n_zero_run; // do not emit zero distances right away, only count them
				else {
					if(n_zero_run) {
						if(n_zero_run < 3) { // do not explicitly encode short runs
							for(size_t i = 0; i < n_zero_run; ++ i) { // note that this i hides the symbol index i
								if(!emit((0 >= n_zero_run_symbol)? 0 + 1 : 0)) // zero distance, shifted by one if the run symbol is zero
									return false;
							}
							n_zero_run = 0;
						} else {
							if(!emit(n_zero_run_symbol) || !emit(n_zero_run)) // denotes run length of n_zero_run
								return false;
						}
					}
					// flush the pending zero distances first

					if(!emit((n_distance >= n_zero_run_symbol)? n_distance + 1 : n_distance)) // the values from the run symbol up are shifted by one to make room for it
						return false;
					n_zero_run = 0;
				}
#else // __INVERSION_FREQUENCIES_ENCODE_ZERO_RUNS
				if(!emit(n_distance))
					return false;
#endif // __INVERSION_FREQUENCIES_ENCODE_ZERO_RUNS
				// emit distance

				n_distance = 0;
				// reset distance

				if(!(-- n_count)) {
#ifdef __INVERSION_FREQUENCIES_ENCODE_ZERO_RUNS
					if(n_zero_run) { // zero run until the end
						if(!emit(n_zero_run_symbol) || !emit(0)) // denotes run length until the end
							return false;
					}
#endif // __INVERSION_FREQUENCIES_ENCODE_ZERO_RUNS
#ifdef _DEBUG
					++ p_src; // skip this one
					for(const uint8_t *p_end = r_t_in_buffer.p_Data() +
					   r_t_in_buffer.n_Size(); p_src != p_end; ++ p_src)
						_ASSERTE(p_inv_perm_table[*p_src] != n_char);
					// make sure there are really no more occurrences of this symbol
#endif // _DEBUG
					break;
				}
				// are there more occurrences?
			} else if(n_char > n_encoded_char) // we need to ensure symbols are compared correct (inverse permutation table)
				++ n_distance; // only the symbols that are encoded later count towards the distance
		}
		_ASSERTE(!n_count);
		// write distances between the occurrences of this symbol
	}
	// write inversion frequencies of symbols

	return true;
}

/**
 *	@brief appends the permutation table to the end of a buffer
 *
 *	The table is stored as a 16-bit number of entries, followed (only if there are
 *	any entries) by a single byte with the offset of the first stored entry and by the
 *	entries themselves. The entries of symbols with zero frequency at the beginning
 *	and at the end of the table are not stored. If no permutation is used, only
 *	the zero number of entries is written.
 *
 *	@param[in] n_permutation_type is the symbol ordering (sort_NoSort means no table)
 *	@param[in] p_perm_table is the permutation table
 *	@param[in] p_frequency is the table of symbol frequencies, indexed by the symbol value
 *	@param[in,out] r_t_out_buffer is the buffer to append to
 *
 *	@return Returns true on success, false if the buffer failed to grow.
 */
bool CInversionFrequenciesCodec::Encode_PermTable(int n_permutation_type,
	const uint8_t p_perm_table[256], const uint32_t p_frequency[256], TBuffer &r_t_out_buffer)
{
	if(n_permutation_type == sort_NoSort) {
		if(!r_t_out_buffer.Grow(sizeof(uint16_t)))
			return false;
		uint8_t *p_size_field = r_t_out_buffer.p_Data() + r_t_out_buffer.n_Size() - sizeof(uint16_t);
		*(uint16_t*)p_size_field = 0;
		// identity, write only the zero table size

		return true;
	}

	// t_odo - this doesn't work. think about it

	int n_first = 0, n_last = 256;
	for(; n_first < n_last && !p_frequency[p_perm_table[n_first]]; ++ n_first)
		;
	for(; n_last - 1 >= n_first && !p_frequency[p_perm_table[n_last - 1]]; -- n_last)
		;
	const int n_entry_num = n_last - n_first; // even 0
	// crop the unused symbols on both ends, to not store the whole table

	int n_record_size = sizeof(uint16_t);
	if(n_entry_num)
		n_record_size += n_entry_num + sizeof(uint8_t);
	// size field, plus the offset byte and the entries if there are any

	if(!r_t_out_buffer.Grow(n_record_size))
		return false;
	uint8_t *p_record_end = r_t_out_buffer.p_Data() + r_t_out_buffer.n_Size();
	uint8_t *p_write = p_record_end - n_record_size;
	// make space at the end of the buffer

	*(uint16_t*)p_write = n_entry_num;
	p_write += sizeof(uint16_t);
	// write size (can unfortunately get slightly bigger than UINT8_MAX)

	if(n_entry_num) {
		*p_write = n_first;
		++ p_write;
		memcpy(p_write, p_perm_table + n_first, n_entry_num * sizeof(uint8_t));
	}
	// write offset + contents

	_ASSERTE(p_write + n_entry_num == p_record_end);

	return true;
}

/*
 *								=== ~CInversionFrequenciesCodec ===
 */

/*
 *								=== CCRC32CheckCodec ===
 */

/**
 *	@brief appends CRC32 of the data to the end of the data
 *
 *	@param[in] r_t_in_buffer is the input data
 *	@param[out] r_t_out_buffer is filled with a copy of the input, followed by its CRC32
 *		(may be the same object as r_t_in_buffer, the CRC32 is then appended in place)
 *
 *	@return Returns true on success, false if the output buffer failed to allocate.
 */
bool CCRC32CheckCodec::Encode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer)
{
	const size_t n_payload_size = r_t_in_buffer.n_Size();
	const uint32_t n_checksum = CCrc_32::n_Crc(n_payload_size, r_t_in_buffer.p_Data());
	// checksum of the input (needs to be calculated before the buffer is touched)

	if(&r_t_out_buffer != &r_t_in_buffer) {
		if(!r_t_out_buffer.Resize(n_payload_size + sizeof(uint32_t), false))
			return false;
		memcpy(r_t_out_buffer.p_Data(), r_t_in_buffer.p_Data(), n_payload_size);
		// separate output, copy the payload
	} else if(!r_t_out_buffer.Grow(sizeof(uint32_t)))
		return false;
	// make space for the checksum (also handle inplace)

	uint8_t *p_tail = r_t_out_buffer.p_Data() + (r_t_out_buffer.n_Size() - sizeof(uint32_t));
	*(uint32_t*)p_tail = n_checksum;
	// the checksum takes up the last four bytes

	return true;
}

/**
 *	@brief verifies CRC32 stored at the end of the data and removes it
 *
 *	@param[in] r_t_in_buffer is the input data, with CRC32 in the last four bytes
 *	@param[out] r_t_out_buffer is filled with the input data without the CRC32
 *		(may be the same object as r_t_in_buffer, the CRC32 is then removed in place)
 *
 *	@return Returns true on success, false on failure (the input is shorter than the CRC32,
 *		the CRC32 does not match or the output buffer failed to allocate).
 */
bool CCRC32CheckCodec::Decode(const TBuffer &r_t_in_buffer, TBuffer &r_t_out_buffer)
{
	if(r_t_in_buffer.n_Size() < sizeof(uint32_t))
		return false;
	const size_t n_payload_size = r_t_in_buffer.n_Size() - sizeof(uint32_t);
	// the input must contain at least the CRC

	uint32_t n_crc32 = CCrc_32::n_Crc(n_payload_size, r_t_in_buffer.p_Data());
	// calculate CRC of the input

	uint32_t n_stored_crc32;
	memcpy(&n_stored_crc32, r_t_in_buffer.p_Data() + n_payload_size, sizeof(uint32_t));
	if(n_stored_crc32 != n_crc32)
		return false;
	// compare to the stored CRC (read using memcpy(), the payload size
	// is arbitrary so the CRC may not be aligned)

	if(&r_t_out_buffer == &r_t_in_buffer) {
		r_t_out_buffer.Resize(n_payload_size, true);
		// remove the CRC from the buffer (will not fail)
	} else {
		if(!r_t_out_buffer.Resize(n_payload_size, false))
			return false;
		memcpy(r_t_out_buffer.p_Data(), r_t_in_buffer.p_Data(), n_payload_size);
		// copy only the payload to the new buffer (copying the whole input
		// would write four bytes past the end of the output)
	}
	// remove CRC from the buffer (also handle inplace)

	return true;
}

/*
 *								=== ~CCRC32CheckCodec ===
 */
