/*
								+---------------------------------+
								|                                 |
								|  ***   Tiny jpeg encoder   ***  |
								|                                 |
								|  Copyright   -tHE SWINe- 2006  |
								|                                 |
								|           JpegEnc.cpp           |
								|                                 |
								+---------------------------------+
*/

/**
 *	@file JpegEnc.cpp
 *	@author -tHE SWINe-
 *	@date 2006
 *	@brief simple jpeg encoder
 *
 *	@date 2007-12-24
 *
 *	improved linux compatibility by adding posix integer types
 *
 *	@date 2008-03-04
 *
 *	using Integer.h header, using CallStack.h instead of crtdbg.h
 *	changed size of zigZag table back to 64 and its type to int (speed / obfuscation purposes)
 *
 *	@date 2009-05-04
 *
 *	fixed mixed windows / linux line endings
 *
 */

#include "NewFix.h"

#include "CallStack.h"
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <vector>
#include <math.h>
#include "Huffman.h"
#include "JpegEnc.h"
#include "DCT.h"

/*
 *								=== CGrayscaleConversion ===
 */

/*
 *	class CGrayscaleConversion
 *		- converts a packed RGB pixel to luminance, Y = 0.299 R + 0.587 G + 0.114 B
 *		- the three weights are stored as lookup tables scaled by 256, so a conversion
 *		  costs three table reads, two additions and one shift
 *		- red is read from bits 16 - 23, green from bits 8 - 15 and blue from bits 0 - 7
 *		  (red and blue are swapped if __JPEG_ENC_BGR is defined)
 */
class CGrayscaleConversion {
protected:
	uint16_t m_p_RGB_to_Y[256][3]; // [channel value][0 = R, 1 = G, 2 = B], weight * value * 256

public:
	/*
	 *	CGrayscaleConversion()
	 *		- default constructor; fills the lookup tables
	 */
	CGrayscaleConversion()
	{
		for(int i = 0; i < 256; i ++) {
			m_p_RGB_to_Y[i][0] = (uint16_t)(.299f * (i * 256));
			m_p_RGB_to_Y[i][1] = (uint16_t)(.587f * (i * 256));
			m_p_RGB_to_Y[i][2] = (uint16_t)(.114f * (i * 256));
		}
		// Y = 0.299 R + 0.587 G + 0.114 B (entries are truncated, the largest
		// possible sum of three entries is 65278 so it can't overflow below)
	}

	/*
	 *	inline int16_t n_Color(uint32_t n_rgb) const
	 *		- returns luminance of pixel n_rgb in range 0 - 255
	 *		- the fixed-point sum is rounded to nearest (not truncated), so neutral
	 *		  grays are preserved exactly (white stays 255, mid-gray 128 stays 128)
	 */
	inline int16_t n_Color(uint32_t n_rgb) const
	{
#ifndef __JPEG_ENC_BGR
		const int n_y_fixed = m_p_RGB_to_Y[((n_rgb >> 16) & 0xff)][0] +
							  m_p_RGB_to_Y[((n_rgb >> 8)  & 0xff)][1] +
							  m_p_RGB_to_Y[ (n_rgb        & 0xff)][2];
#else
		const int n_y_fixed = m_p_RGB_to_Y[ (n_rgb        & 0xff)][0] +
							  m_p_RGB_to_Y[((n_rgb >> 8)  & 0xff)][1] +
							  m_p_RGB_to_Y[((n_rgb >> 16) & 0xff)][2];
#endif
		// luminance times 256 (the three truncated table entries are up to 3 units too low)

		return (int16_t)((n_y_fixed + 0x80) >> 8);
		// add half of the divisor before dropping the fraction so truncation
		// error of the tables can't pull the result one level down
	}
};

/*
 *								=== ~CGrayscaleConversion ===
 */

/*
 *								=== CChrominance_Cb_Conversion ===
 */

/*
 *	class CChrominance_Cb_Conversion
 *		- converts a packed RGB pixel to blue chrominance,
 *		  Cb = - 0.1687 R - 0.3313 G + 0.5 B + 128
 *		- red and green weights come from lookup tables scaled by 128,
 *		  the blue term is formed directly from the pixel bits
 */
class CChrominance_Cb_Conversion {
protected:
	int p_RGB_to_Cb[256][2]; // [channel value][0 = R, 1 = G], weight * value * 128 (negative)

public:
	/*
	 *	CChrominance_Cb_Conversion()
	 *		- default constructor; fills the lookup tables
	 */
	CChrominance_Cb_Conversion()
	{
		int n_level = 0;
		while(n_level < 256) {
			const int n_scaled = n_level * 128;
			p_RGB_to_Cb[n_level][0] = (int)(-.1687f * n_scaled);
			p_RGB_to_Cb[n_level][1] = (int)(-.3313f * n_scaled);
			++ n_level;
		}
		// casts truncate towards zero
	}

	/*
	 *	inline int16_t n_Color(uint32_t n_rgb) const
	 *		- returns Cb of pixel n_rgb
	 *		- red and blue positions are swapped if __JPEG_ENC_BGR is defined
	 */
	inline int16_t n_Color(uint32_t n_rgb) const
	{
		const uint32_t n_green = (n_rgb >> 8) & 0xff;
#ifndef __JPEG_ENC_BGR
		const uint32_t n_red = (n_rgb >> 16) & 0xff;
		const uint32_t n_half_blue = (n_rgb & 0xfe) << 6;
#else
		const uint32_t n_red = n_rgb & 0xff;
		const uint32_t n_half_blue = (n_rgb & 0xfe0000) >> 10;
#endif
		// blue with the lowest bit cleared, times 64 (that is 0.5 B scaled by 128)

		const uint32_t n_sum = p_RGB_to_Cb[n_red][0] + p_RGB_to_Cb[n_green][1] + n_half_blue;
		// the sum is evaluated in unsigned 32-bit arithmetic, negative table
		// entries wrap around modulo 2^32

		return (n_sum >> 7) + 128;
		// the wrapped high bits are discarded by the narrowing conversion to int16_t
	}
};

/*
 *								=== ~CChrominance_Cb_Conversion ===
 */

/*
 *								=== CChrominance_Cr_Conversion ===
 */

/*
 *	class CChrominance_Cr_Conversion
 *		- converts a packed RGB pixel to red chrominance,
 *		  Cr = 0.5 R - 0.4187 G - 0.0813 B + 128
 *		- green and blue weights come from lookup tables scaled by 128,
 *		  the red term is formed directly from the pixel bits
 */
class CChrominance_Cr_Conversion {
protected:
	int p_RGB_to_Cr[256][2]; // [channel value][0 = G, 1 = B], weight * value * 128 (negative)

public:
	/*
	 *	CChrominance_Cr_Conversion()
	 *		- default constructor; fills the lookup tables
	 */
	CChrominance_Cr_Conversion()
	{
		int n_level = 0;
		while(n_level < 256) {
			const int n_scaled = n_level * 128;
			p_RGB_to_Cr[n_level][0] = (int)(-.4187f * n_scaled);
			p_RGB_to_Cr[n_level][1] = (int)(-.0813f * n_scaled);
			++ n_level;
		}
		// casts truncate towards zero
	}

	/*
	 *	inline int16_t n_Color(uint32_t n_rgb) const
	 *		- returns Cr of pixel n_rgb
	 *		- red and blue positions are swapped if __JPEG_ENC_BGR is defined
	 */
	inline int16_t n_Color(uint32_t n_rgb) const
	{
		const uint32_t n_green = (n_rgb >> 8) & 0xff;
#ifndef __JPEG_ENC_BGR
		const uint32_t n_half_red = (n_rgb & 0xfe0000) >> 10;
		const uint32_t n_blue = n_rgb & 0xff;
#else
		const uint32_t n_half_red = (n_rgb & 0xfe) << 6;
		const uint32_t n_blue = (n_rgb >> 16) & 0xff;
#endif
		// red with the lowest bit cleared, times 64 (that is 0.5 R scaled by 128)

		const uint32_t n_sum = n_half_red + p_RGB_to_Cr[n_green][0] + p_RGB_to_Cr[n_blue][1];
		// the sum is evaluated in unsigned 32-bit arithmetic, negative table
		// entries wrap around modulo 2^32

		return (n_sum >> 7) + 128;
		// the wrapped high bits are discarded by the narrowing conversion to int16_t
	}
};

/*
 *								=== ~CChrominance_Cr_Conversion ===
 */

/*
 *								=== CComponent_R_Conversion ===
 */

/*
 *	class CComponent_R_Conversion
 *		- extracts the byte stored in bits 16 - 23 of a packed pixel
 */
class CComponent_R_Conversion {
public:
	/*
	 *	static inline int16_t n_Color(uint32_t n_rgb)
	 *		- returns bits 16 - 23 of n_rgb as a value in range 0 - 255
	 */
	static inline int16_t n_Color(uint32_t n_rgb) /*const*/
	{
		const uint32_t n_channel = (n_rgb & 0xff0000) >> 16;
		return (int16_t)n_channel;
	}
};

/*
 *								=== ~CComponent_R_Conversion ===
 */

/*
 *								=== CComponent_G_Conversion ===
 */

/*
 *	class CComponent_G_Conversion
 *		- extracts the byte stored in bits 8 - 15 of a packed pixel
 */
class CComponent_G_Conversion {
public:
	/*
	 *	static inline int16_t n_Color(uint32_t n_rgb)
	 *		- returns bits 8 - 15 of n_rgb as a value in range 0 - 255
	 */
	static inline int16_t n_Color(uint32_t n_rgb) /*const*/
	{
		const uint32_t n_channel = (n_rgb & 0xff00) >> 8;
		return (int16_t)n_channel;
	}
};

/*
 *								=== ~CComponent_G_Conversion ===
 */

// n_Min3(a,b,c) / n_Max3(a,b,c) expand to the smallest / largest of the three arguments.
// these are function-like macros: each argument appears several times in the expansion
// and may be evaluated more than once, so only pass expressions without side effects
#define n_Min3(a,b,c) (((a) > (b))? (((b) > (c))? (c) : (b)) : (((a) > (c))? (c) : (a)))
#define n_Max3(a,b,c) (((a) > (b))? (((a) > (c))? (a) : (c)) : (((b) > (c))? (b) : (c)))

#if defined(__JPEG_ENC_ENABLE_CMYK) || defined(__JPEG_ENC_ENABLE_YCCK)

/*
 *								=== CCMYK_C_Conversion ===
 */

/*
 *	class CCMYK_C_Conversion
 *		- calculates cyan component of a packed RGB pixel
 *		- with n_max = max(R, G, B) (so that K = 255 - n_max),
 *		  C = (n_max - R) * 255 / n_max, or 0 for black pixel
 */
class CCMYK_C_Conversion {
public:
	/*
	 *	static inline int16_t n_Color(uint32_t n_rgb)
	 *		- returns cyan of pixel n_rgb in range 0 - 255
	 *		- red is read from bits 16 - 23 (from bits 0 - 7 if __JPEG_ENC_BGR is defined)
	 *		- the ratio is scaled by 255 (not by 256), so fully saturated channel
	 *		  yields 255 and the result always fits in 8 bits
	 */
	static inline int16_t n_Color(uint32_t n_rgb) /*const*/
	{
		const int n_r = (n_rgb >> 16) & 0xff;
		const int n_g = (n_rgb >> 8) & 0xff;
		const int n_b = n_rgb & 0xff;

		int n_max = (n_r > n_g)? n_r : n_g;
		if(n_b > n_max)
			n_max = n_b;
		// n_max equals 255 - K

		if(!n_max)
			return 0;
		// black pixel (K is 255); avoid division by zero

#ifndef __JPEG_ENC_BGR
		return (int16_t)(((n_max - n_r) * 0xff) / n_max);
#else
		return (int16_t)(((n_max - n_b) * 0xff) / n_max);
#endif
	}
};

/*
 *								=== ~CCMYK_C_Conversion ===
 */

/*
 *								=== CCMYK_M_Conversion ===
 */

/*
 *	class CCMYK_M_Conversion
 *		- calculates magenta component of a packed RGB pixel
 *		- with n_max = max(R, G, B) (so that K = 255 - n_max),
 *		  M = (n_max - G) * 255 / n_max, or 0 for black pixel
 */
class CCMYK_M_Conversion {
public:
	/*
	 *	static inline int16_t n_Color(uint32_t n_rgb)
	 *		- returns magenta of pixel n_rgb in range 0 - 255
	 *		- green is read from bits 8 - 15
	 *		- the ratio is scaled by 255 (not by 256), so fully saturated channel
	 *		  yields 255 and the result always fits in 8 bits
	 */
	static inline int16_t n_Color(uint32_t n_rgb) /*const*/
	{
		const int n_r = (n_rgb >> 16) & 0xff;
		const int n_g = (n_rgb >> 8) & 0xff;
		const int n_b = n_rgb & 0xff;

		int n_max = (n_r > n_g)? n_r : n_g;
		if(n_b > n_max)
			n_max = n_b;
		// n_max equals 255 - K

		if(!n_max)
			return 0;
		// black pixel (K is 255); avoid division by zero

		return (int16_t)(((n_max - n_g) * 0xff) / n_max);
	}
};

/*
 *								=== ~CCMYK_M_Conversion ===
 */

/*
 *								=== CCMYK_Y_Conversion ===
 */

/*
 *	class CCMYK_Y_Conversion
 *		- calculates yellow component of a packed RGB pixel
 *		- with n_max = max(R, G, B) (so that K = 255 - n_max),
 *		  Y = (n_max - B) * 255 / n_max, or 0 for black pixel
 */
class CCMYK_Y_Conversion {
public:
	/*
	 *	static inline int16_t n_Color(uint32_t n_rgb)
	 *		- returns yellow of pixel n_rgb in range 0 - 255
	 *		- blue is read from bits 0 - 7 (from bits 16 - 23 if __JPEG_ENC_BGR is defined)
	 *		- the ratio is scaled by 255 (not by 256), so fully saturated channel
	 *		  yields 255 and the result always fits in 8 bits
	 */
	static inline int16_t n_Color(uint32_t n_rgb) /*const*/
	{
		const int n_r = (n_rgb >> 16) & 0xff;
		const int n_g = (n_rgb >> 8) & 0xff;
		const int n_b = n_rgb & 0xff;

		int n_max = (n_r > n_g)? n_r : n_g;
		if(n_b > n_max)
			n_max = n_b;
		// n_max equals 255 - K

		if(!n_max)
			return 0;
		// black pixel (K is 255); avoid division by zero

#ifndef __JPEG_ENC_BGR
		return (int16_t)(((n_max - n_b) * 0xff) / n_max);
#else
		return (int16_t)(((n_max - n_r) * 0xff) / n_max);
#endif
	}
};

/*
 *								=== ~CCMYK_Y_Conversion ===
 */

/*
 *								=== CCMYK_K_Conversion ===
 */

/*
 *	class CCMYK_K_Conversion
 *		- calculates the value stored in the fourth (K) plane for a packed RGB pixel
 */
class CCMYK_K_Conversion {
public:
	/*
	 *	static inline int16_t n_Color(uint32_t n_rgb)
	 *		- returns the largest of the three low bytes of n_rgb (max(R, G, B)),
	 *		  range 0 - 255
	 */
	static inline int16_t n_Color(uint32_t n_rgb) /*const*/
	{
		const uint8_t n_byte2 = (uint8_t)((n_rgb >> 16) & 0xff);
		const uint8_t n_byte1 = (uint8_t)((n_rgb >> 8) & 0xff);
		const uint8_t n_byte0 = (uint8_t)(n_rgb & 0xff);
		// the three color channels (order doesn't matter for maximum)

		const int16_t n_largest = n_Max3(n_byte2, n_byte1, n_byte0);
		return n_largest;
	}
};

/*
 *								=== ~CCMYK_K_Conversion ===
 */

#endif // defined(__JPEG_ENC_ENABLE_CMYK) || defined(__JPEG_ENC_ENABLE_YCCK)

#ifdef __JPEG_ENC_ENABLE_YCCK

/*
 *								=== CYCCK_C_Conversion ===
 */

/*
 *	class CYCCK_C_Conversion
 *		- first plane of YCCK: luminance calculated from CMY values of the pixel
 */
class CYCCK_C_Conversion {
protected:
	CGrayscaleConversion m_gray_conv; // luminance conversion applied to packed CMY

public:
	/*
	 *	inline int16_t n_Color(uint32_t n_rgb) const
	 *		- converts n_rgb to C, M and Y (each truncated to 8 bits), packs them
	 *		  as (Y << 16) | (M << 8) | C and returns luminance of the packed value
	 */
	inline int16_t n_Color(uint32_t n_rgb) const
	{
		const uint32_t n_cyan = (uint8_t)CCMYK_C_Conversion::n_Color(n_rgb);
		const uint32_t n_magenta = (uint8_t)CCMYK_M_Conversion::n_Color(n_rgb);
		const uint32_t n_yellow = (uint8_t)CCMYK_Y_Conversion::n_Color(n_rgb);
		// CMY components of the pixel

		const uint32_t n_packed_cmy = (n_yellow << 16) | (n_magenta << 8) | n_cyan;
		// yellow goes to bits 16 - 23, magenta to bits 8 - 15, cyan to bits 0 - 7

		return m_gray_conv.n_Color(n_packed_cmy);
	}
};

/*
 *								=== ~CYCCK_C_Conversion ===
 */

/*
 *								=== CYCCK_M_Conversion ===
 */

/*
 *	class CYCCK_M_Conversion
 *		- second plane of YCCK: Cb chrominance calculated from CMY values of the pixel
 */
class CYCCK_M_Conversion {
protected:
	CChrominance_Cb_Conversion m_cb_conv; // Cb conversion applied to packed CMY

public:
	/*
	 *	inline int16_t n_Color(uint32_t n_rgb) const
	 *		- converts n_rgb to C, M and Y (each truncated to 8 bits), packs them
	 *		  as (Y << 16) | (M << 8) | C and returns Cb of the packed value
	 */
	inline int16_t n_Color(uint32_t n_rgb) const
	{
		const uint32_t n_cyan = (uint8_t)CCMYK_C_Conversion::n_Color(n_rgb);
		const uint32_t n_magenta = (uint8_t)CCMYK_M_Conversion::n_Color(n_rgb);
		const uint32_t n_yellow = (uint8_t)CCMYK_Y_Conversion::n_Color(n_rgb);
		// CMY components of the pixel

		const uint32_t n_packed_cmy = (n_yellow << 16) | (n_magenta << 8) | n_cyan;
		// yellow goes to bits 16 - 23, magenta to bits 8 - 15, cyan to bits 0 - 7

		return m_cb_conv.n_Color(n_packed_cmy);
	}
};

/*
 *								=== ~CYCCK_M_Conversion ===
 */

/*
 *								=== CYCCK_Y_Conversion ===
 */

/*
 *	class CYCCK_Y_Conversion
 *		- third plane of YCCK: Cr chrominance calculated from CMY values of the pixel
 */
class CYCCK_Y_Conversion {
protected:
	CChrominance_Cr_Conversion m_cr_conv; // Cr conversion applied to packed CMY

public:
	/*
	 *	inline int16_t n_Color(uint32_t n_rgb) const
	 *		- converts n_rgb to C, M and Y (each truncated to 8 bits), packs them
	 *		  as (Y << 16) | (M << 8) | C and returns Cr of the packed value
	 */
	inline int16_t n_Color(uint32_t n_rgb) const
	{
		const uint32_t n_cyan = (uint8_t)CCMYK_C_Conversion::n_Color(n_rgb);
		const uint32_t n_magenta = (uint8_t)CCMYK_M_Conversion::n_Color(n_rgb);
		const uint32_t n_yellow = (uint8_t)CCMYK_Y_Conversion::n_Color(n_rgb);
		// CMY components of the pixel

		const uint32_t n_packed_cmy = (n_yellow << 16) | (n_magenta << 8) | n_cyan;
		// yellow goes to bits 16 - 23, magenta to bits 8 - 15, cyan to bits 0 - 7

		return m_cr_conv.n_Color(n_packed_cmy);
	}
};

/*
 *								=== ~CYCCK_Y_Conversion ===
 */

#endif // __JPEG_ENC_ENABLE_YCCK

/*
 *								=== CPredictor ===
 */

#ifdef __JPEG_ENC_ENABLE_LOSSLESS

template <class TSrcPixelType, const int n_src_channel_bit_num>
class CPredictor {
protected:
	/*
	 *	static inline int16_t n_Sample_BitOff(TSrcPixelType n_src_pixel,
	 *		int n_offset, int n_bit_shift)
	 *		- return n_src_channel_bit_num bits wide field of n_src_pixel, starting at
	 *		  bit n_offset, shifted n_bit_shift bits right (or -n_bit_shift bits left
	 *		  in case n_bit_shift is negative)
	 */
	static inline int16_t n_Sample_BitOff(TSrcPixelType n_src_pixel, int n_offset, int n_bit_shift)
	{
		if(n_bit_shift < 0) {
			return (int16_t)(((n_src_pixel >> n_offset) &
				((1 << n_src_channel_bit_num) - 1)) << -n_bit_shift);
		}
		// negative shift adds precision

		return (int16_t)(((n_src_pixel >> n_offset) &
			((1 << n_src_channel_bit_num) - 1)) >> n_bit_shift);
	}

public:
	/*
	 *	static inline int16_t n_Sample(TSrcPixelType n_src_pixel,
	 *		int n_channel, int n_bit_shift)
	 *		- return sample of channel with index n_channel of pixel n_src_pixel,
	 *		  shifted n_bit_shift bits right (negative bit shift is allowed here)
	 *		- exposed to make RLE compression cleaner
	 */
	static inline int16_t n_Sample(TSrcPixelType n_src_pixel,
		int n_channel, int n_bit_shift)
	{
		const int n_bit_offset = n_channel * n_src_channel_bit_num;
		return n_Sample_BitOff(n_src_pixel, n_bit_offset, n_bit_shift);
	}

	enum {
		pred_A = 1, // a
		pred_B, // b
		pred_C, // c
		pred_A_plus_B_minus_C, // a + b - c
		pred_A_plus_B_minus_C_half, // a + (b - c) / 2
		pred_B_plus_A_minus_C_half, // b + (a - c) / 2
		pred_A_plus_B_half // (a + b) / 2
	};

	/*	samples:
	 *		c | b
	 *		--+--
	 *		a | cur_pixel
	 */

	/*
	 *	static int16_t *p_DifferenceImage(const TSrcPixelType *p_src_buffer,
	 *		int n_width, int n_height, int n_channel, int n_predictor,
	 *		int n_bit_shift)
	 *		- return difference image (not jpeg differential image, but difference between
	 *		  prediction and source; each element is prediction minus sample)
	 *		- p_src_buffer is source pixel buffer (n_width * n_height pixels, scanline
	 *		  after scanline)
	 *		- n_width and n_height are image dimensions
	 *		- n_channel is channel index
	 *		- n_predictor is used predictor (values below pred_A are handled as pred_A,
	 *		  values above pred_A_plus_B_half as pred_A_plus_B_half)
	 *		- n_bit_shift is number of bits to shift right (for example for 8bpp input
	 *		  and required 6bpp output, n_bit_shift is 2; negative bit shift is allowed here)
	 *		- returns pointer to buffer of 16-bit differences (allocated using new[],
	 *		  caller is supposed to delete[] it) or 0 in case image dimensions are not
	 *		  positive or there was not enough memory to alloc buffer
	 */
	static int16_t *p_DifferenceImage(const TSrcPixelType *p_src_buffer,
		int n_width, int n_height, int n_channel, int n_predictor,
		int n_bit_shift)
	{
		if(n_width <= 0 || n_height <= 0)
			return 0;
		// reject empty images

		int16_t *p_dest_buffer = new(std::nothrow) int16_t[n_width * n_height];
		if(!p_dest_buffer)
			return 0;
		// alloc dest buffer

		const int n_bit_offset = n_channel * n_src_channel_bit_num;
		// offset of the channel in bits

		int16_t *p_out = p_dest_buffer;
		// dest pixel pointer

		{
			int16_t n_left = 1 << (n_src_channel_bit_num - n_bit_shift - 1); // 2 ^ (P - 1)
			for(int n_x = 0; n_x < n_width; ++ n_x) {
				const int16_t n_cur = n_Sample_BitOff(p_src_buffer[n_x],
					n_bit_offset, n_bit_shift);
				*p_out ++ = (int16_t)(n_left - n_cur);
				n_left = n_cur;
			}
		}
		// first scanline: the very first pixel is predicted by half of the sample range,
		// all the others by their left neighbor (predictor pred_A)

		const TSrcPixelType *p_above = p_src_buffer;
		for(int n_y = 1; n_y < n_height; ++ n_y) {
			const TSrcPixelType *p_row = p_above + n_width;
			// current scanline and the one above it

			int16_t n_left = n_Sample_BitOff(p_row[0], n_bit_offset, n_bit_shift);
			*p_out ++ = (int16_t)(n_Sample_BitOff(p_above[0],
				n_bit_offset, n_bit_shift) - n_left);
			// first pixel of the scanline is predicted by the pixel above (predictor pred_B)

			for(int n_x = 1; n_x < n_width; ++ n_x) {
				int16_t n_prediction;
				switch(n_predictor) {
				case pred_B:
					n_prediction = n_Sample_BitOff(p_above[n_x], n_bit_offset, n_bit_shift);
					break;
				case pred_C:
					n_prediction = n_Sample_BitOff(p_above[n_x - 1],
						n_bit_offset, n_bit_shift);
					break;
				case pred_A_plus_B_minus_C:
					n_prediction = n_left +
						n_Sample_BitOff(p_above[n_x], n_bit_offset, n_bit_shift) -
						n_Sample_BitOff(p_above[n_x - 1], n_bit_offset, n_bit_shift);
					break;
				case pred_A_plus_B_minus_C_half:
					n_prediction = n_left + ((n_Sample_BitOff(p_above[n_x],
						n_bit_offset, n_bit_shift) - n_Sample_BitOff(p_above[n_x - 1],
						n_bit_offset, n_bit_shift)) >> 1);
					break;
				case pred_B_plus_A_minus_C_half:
					n_prediction = n_Sample_BitOff(p_above[n_x], n_bit_offset,
						n_bit_shift) + ((n_left - n_Sample_BitOff(
						p_above[n_x - 1], n_bit_offset, n_bit_shift)) >> 1);
					break;
				default:
					if(n_predictor > pred_B_plus_A_minus_C_half) { // pred_A_plus_B_half (or above)
						n_prediction = ((n_left +
							n_Sample_BitOff(p_above[n_x], n_bit_offset, n_bit_shift)) >> 1);
					} else // pred_A (or below)
						n_prediction = n_left;
					break;
				}
				// calc prediction from left (a), upper (b) and upper-left (c) neighbors

				const int16_t n_cur = n_Sample_BitOff(p_row[n_x], n_bit_offset, n_bit_shift);
				*p_out ++ = n_prediction - n_cur;
				n_left = n_cur;
			}
			// predict all other pixels (using selected predictor)

			p_above = p_row;
		}

		return p_dest_buffer;
	}
};

#endif // __JPEG_ENC_ENABLE_LOSSLESS

/*
 *								=== ~CPredictor ===
 */

/*
 *								=== CJpegEncoder ===
 */

// two-byte marker codes (every code has 0xff in the upper byte);
// the names are grouped by purpose, see the comment below each group
enum {
	marker_SOF0_HuffBaseline_DCT = 0xffc0,
	marker_SOF1_HuffExtendedSequential_DCT = 0xffc1,
	marker_SOF2_HuffProgressive_DCT = 0xffc2,
	marker_SOF3_HuffLossless = 0xffc3,
	// start of frame markers, non-differential, huffman coding

	marker_SOF5_HuffDiffSequential_DCT = 0xffc5,
	marker_SOF6_HuffDiffProgressive_DCT = 0xffc6,
	marker_SOF7_HuffDiffLossless = 0xffc7,
	// start of frame markers, differential, huffman coding

	marker_SOF8_ArithReserved = 0xffc8,
	marker_SOF9_ArithExtendedSequential_DCT = 0xffc9,
	marker_SOF10_ArithProgressive_DCT = 0xffca,
	marker_SOF11_ArithLossless = 0xffcb,
	// start of frame markers, non-differential, arithmetic coding

	marker_SOF13_ArithDiffSequential_DCT = 0xffcd,
	marker_SOF14_ArithDiffProgressive_DCT = 0xffce,
	marker_SOF15_ArithDiffLossless = 0xffcf,
	// start of frame markers, differential, arithmetic coding

	marker_HuffmanTable = 0xffc4,
	// huffman table specification

	marker_ArithCodingCondBlock = 0xffcc,
	// arithmetic coding conditioning specification

	marker_RestartMod0 = 0xffd0,
	marker_RestartMod1 = 0xffd1,
	marker_RestartMod2 = 0xffd2,
	marker_RestartMod3 = 0xffd3,
	marker_RestartMod4 = 0xffd4,
	marker_RestartMod5 = 0xffd5,
	marker_RestartMod6 = 0xffd6,
	marker_RestartMod7 = 0xffd7,
	// restart interval termination

	marker_StartImage = 0xffd8,
	marker_EndImage = 0xffd9,
	marker_StartScan = 0xffda,
	marker_DefineQuantTables = 0xffdb,
	marker_DefineNumberLines = 0xffdc,
	marker_DefineRestartInterval = 0xffdd,
	marker_DefineHierarchicalProgression = 0xffde,
	marker_ExpandRefComps = 0xffdf,
	// other markers

	marker_ReservedAppSeg_0 = 0xffe0, // JFIF (jpeg file interchange format - the used format)
	marker_ReservedAppSeg_1 = 0xffe1,
	marker_ReservedAppSeg_2 = 0xffe2,
	marker_ReservedAppSeg_3 = 0xffe3,
	marker_ReservedAppSeg_4 = 0xffe4,
	marker_ReservedAppSeg_5 = 0xffe5,
	marker_ReservedAppSeg_6 = 0xffe6,
	marker_ReservedAppSeg_7 = 0xffe7,
	marker_ReservedAppSeg_8 = 0xffe8,
	marker_ReservedAppSeg_9 = 0xffe9,
	marker_ReservedAppSeg_a = 0xffea,
	marker_ReservedAppSeg_b = 0xffeb,
	marker_ReservedAppSeg_c = 0xffec,
	marker_ReservedAppSeg_d = 0xffed,
	marker_ReservedAppSeg_e = 0xffee,
	marker_ReservedAppSeg_f = 0xffef,
	// application segments

	marker_JPEG_ReservedExt_0 = 0xfff0,
	marker_JPEG_ReservedExt_1 = 0xfff1,
	marker_JPEG_ReservedExt_2 = 0xfff2,
	marker_JPEG_ReservedExt_3 = 0xfff3,
	marker_JPEG_ReservedExt_4 = 0xfff4,
	marker_JPEG_ReservedExt_5 = 0xfff5,
	marker_JPEG_ReservedExt_6 = 0xfff6,
	marker_JPEG_ReservedExt_7 = 0xfff7,
	marker_JPEG_ReservedExt_8 = 0xfff8,
	marker_JPEG_ReservedExt_9 = 0xfff9,
	marker_JPEG_ReservedExt_a = 0xfffa,
	marker_JPEG_ReservedExt_b = 0xfffb,
	marker_JPEG_ReservedExt_c = 0xfffc,
	marker_JPEG_ReservedExt_d = 0xfffd,
	// reserved for jpeg extensions

	marker_Comment = 0xfffe,

	marker_Temp = 0xff01,

	marker_Reserved_min = 0xff02,
	marker_Reserved_max = 0xffbf
	// first and last code of the reserved range
};
// marker codes

// zig-zag scan order of 8x8 block: element k holds natural (row-major, row * 8 + column)
// index of k-th coefficient when walking the block along its anti-diagonals,
// beginning in the upper-left corner
const int CJpegEncoder::m_p_zig_indices[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
	17, 24, 32, 25, 18, 11,  4,  5,
	12, 19, 26, 33, 40, 48, 41, 34,
	27, 20, 13,  6,  7, 14, 21, 28,
	35, 42, 49, 56, 57, 50, 43, 36,
	29, 22, 15, 23, 30, 37, 44, 51,
	58, 59, 52, 45, 38, 31, 39, 46,
	53, 60, 61, 54, 47, 55, 62, 63
};

/*
 *	CJpegEncoder()
 *		- default constructor
 */
CJpegEncoder::CJpegEncoder()
{
	int n_table = 0;
	do {
		const bool b_first_table = (n_table == 0);
		// the flag passed to the factory is set for table 0 and cleared for table 1

		uint8_t p_coeffs[64];
		CQuantTableFactory::Calc_QuantTable(p_coeffs, b_first_table, 1.0f);
		// have the factory fill in the coefficients (third argument is 1.0f)

		Set_QuantTable(n_table, p_coeffs);
		Write_QuantTable(n_table, true);
		// store the table and mark it to be written
	} while(++ n_table < 2);
	// both quantization tables get their default contents
}

/*
 *	void Write_QuantTable(int n_index, bool b_write)
 *		- set write flag of quantization table with index n_index (n_index is 0 or 1)
 *		- by default, both quantization tables are written (default for RGB images)
 */
void CJpegEncoder::Write_QuantTable(int n_index, bool b_write)
{
	m_p_quant_table[n_index].b_write = b_write;
}

/*
 *	void Set_QuantTable(int n_index, const uint8_t p_quant_table[64])
 *		- set data of quantization table with index n_index (n_index is 0 or 1)
 *		- p_quant_table is pointer to 64 element array, containing quantization coeffs
 *		  (coming in natural left-to-right, top-to-bottom order, no zig-zag)
 */
void CJpegEncoder::Set_QuantTable(int n_index, const uint8_t p_quant_table[64])
{
	for(int n_coeff = 0; n_coeff < 64; ++ n_coeff) {
		m_p_quant_table[n_index].p_store_value[n_coeff] = p_quant_table[n_coeff];
		// keep the raw coefficient as well

#ifdef __JPEG_ENC_INTEGER_QUANT
		m_p_quant_table[n_index].p_value[n_coeff] =
			(int)(0x10000 * (p_quant_table[n_coeff] *
			(CFastDCT8_2D::p_PrescaleTable()[n_coeff / 8] *
			CFastDCT8_2D::p_PrescaleTable()[n_coeff % 8])));
		// int coeffs are stored direct (scaled by 0x10000 and by row / column prescale
		// factors of the DCT), inverted form isn't precise enough (really ugly artifacts)
#else
		m_p_quant_table[n_index].p_value[n_coeff] =
			1.0f / (p_quant_table[n_coeff] *
			(CFastDCT8_2D::p_PrescaleTable()[n_coeff / 8] *
			CFastDCT8_2D::p_PrescaleTable()[n_coeff % 8]));
		// float coeffs are stored inverted (including row / column prescale factors
		// of the DCT) so it's possible to use fmul instead of fdiv
#endif
	}
}

/*
 *	void CJpegEncoder::Get_ComponentInfo(int n_encode_color, int n_color_sampling_horiz,
 *		int n_color_sampling_vert, TComponent *p_component_info, int &r_n_component_num,
 *		int &r_n_huff_coder_num)
 *		- translate n_encode_color (one of color_Gray, color_RG, color_RGB, color_CMYK or
 *		  color_YCCK) and n_color_sampling_horiz, n_color_sampling_vert to p_component_info
 *		- (value of elements of p_component_info is undefined in case n_encode_color
 *		  has other value than one of above)
 */
void CJpegEncoder::Get_ComponentInfo(int n_encode_color, int n_color_sampling_horiz,
	int n_color_sampling_vert, TComponent *p_component_info, int &r_n_component_num,
	int &r_n_huff_coder_num)
{
	const struct {
		int n_encode_color;
		uint8_t n_component_num;
		uint8_t n_huff_coder_num;
		bool b_dense_samples[4]; // high quality
		uint8_t n_table_index[4];
	} p_comp_table[] = {
		{color_Gray, 1, 1, {true}, {0}},
		{color_RG, 2, 1, {true, false}, {0, 0}},
		{color_RGB, 3, 2, {true, false, false}, {0, 1, 1}}
#ifdef __JPEG_ENC_ENABLE_CMYK
		, {color_CMYK, 4, 2, {true, false, false, true}, {0, 1, 1, 0}}
#endif
#ifdef __JPEG_ENC_ENABLE_YCCK
		, {color_YCCK, 4, 2, {true, false, false, true}, {0, 1, 1, 0}}
#endif
	};

	for(unsigned int i = 0; i < sizeof(p_comp_table) / sizeof(p_comp_table[0]); i ++) {
		if(p_comp_table[i].n_encode_color == n_encode_color) {
			r_n_component_num = p_comp_table[i].n_component_num;
			r_n_huff_coder_num = p_comp_table[i].n_huff_coder_num;
			for(int j = 0; j < p_comp_table[i].n_component_num; j ++) {
				p_component_info[j].n_component_id = j + 1;
				p_component_info[j].n_ac_huff_coder_index = p_comp_table[i].n_table_index[j];
				p_component_info[j].n_dc_huff_coder_index = p_comp_table[i].n_table_index[j];
				p_component_info[j].n_quant_table_index = p_comp_table[i].n_table_index[j];
				p_component_info[j].n_sampling_horiz = (p_comp_table[i].b_dense_samples[j])?
					n_color_sampling_horiz : 1;
				p_component_info[j].n_sampling_vert = (p_comp_table[i].b_dense_samples[j])?
					n_color_sampling_vert : 1;
				p_component_info[j].n_scale_horiz = (p_comp_table[i].b_dense_samples[j])?
					1 : n_color_sampling_horiz;
				p_component_info[j].n_scale_vert = (p_comp_table[i].b_dense_samples[j])?
					1 : n_color_sampling_vert;
			}
			return;
		}
	}

	_ASSERTE(0); // gotta to find it
}

/*
 *	void CJpegEncoder::Get_BlockInfo(const TComponent *p_component, int n_component_num,
 *		CJpegHuffCoder *p_huff_coder_base, int *p_huff_code_table_base,
 *		TBlockInfo *p_block_info, int &r_n_block_num)
 *		- translate component info (gained by Get_ComponentInfo()) to block info
 *		  (array of TBlockInfo with one element for each 8x8 data block in macro block
 *		  used to speed up block encoding)
 */
void CJpegEncoder::Get_BlockInfo(const TComponent *p_component, int n_component_num,
	CJpegHuffCoder *p_huff_coder_base, int *p_huff_code_table_base,
	TBlockInfo *p_block_info, int &r_n_block_num)
{
	r_n_block_num = 0;
	for(const TComponent *p_end = p_component + n_component_num; p_component < p_end;
	   p_component ++) {
		r_n_block_num += p_component->n_sampling_horiz * p_component->n_sampling_vert;
		for(int i = 0; i < p_component->n_sampling_horiz * p_component->n_sampling_vert;
		   i ++, p_block_info ++) {
			p_block_info->n_offset_x = (i % p_component->n_sampling_horiz) * 8;
			p_block_info->n_offset_y = (i / p_component->n_sampling_horiz) * 8;
			p_block_info->p_ac_huff_coder =
				&p_huff_coder_base[p_component->n_ac_huff_coder_index * 2 + 1];
			p_block_info->p_dc_huff_coder =
				&p_huff_coder_base[p_component->n_dc_huff_coder_index * 2];
			p_block_info->p_ac_huff_code_table =
				&p_huff_code_table_base[(p_component->n_ac_huff_coder_index * 2 + 1) * 256];
			p_block_info->p_dc_huff_code_table =
				&p_huff_code_table_base[(p_component->n_dc_huff_coder_index * 2) * 256];
			p_block_info->p_component = p_component;
			p_block_info->p_quant_table = &m_p_quant_table[p_component->n_quant_table_index];
		}
	}
	_ASSERTE(r_n_block_num <= 64);
	// todo - jpeg specs limits block number to certain maximum - find it
}

#ifdef __JPEG_ENC_ENABLE_LOSSLESS

/*
 *	bool CJpegEncoder::Encode_LosslessJpeg(FILE *p_fw, const TBmp *p_bitmap,
 *		int n_sample_precission = 8, bool b_separate_entropy = false, int n_channel_num = 3,
 *		 int n_channel0_index = 0, int n_channel1_index = 1, int n_channel2_index = 2,
 *		 int n_channel3_index = 3)
 *		- encode lossless jpeg image from p_bitmap (contains RGB(A) 8bpp image) to p_fw
 *		  (p_fw has to be opened for binary writing; fwrite() only is used)
 *		- bitmap resolution in jpeg is written as unsigned short so it's limited to 65535 ^ 2
 *		- if b_separate_entropy is true, every component has it's own entropy table. otherwise
 *		  color components (if present) share one table and alpha channel (if present)
 *		  is compressed using another one
 *		- n_sample_precission is destination data precission and can be 2 - 12 (but source
 *		  data are 8 bpp so reasonable range is 2 - 8 bits per sample)
 *		- n_channel_num is number of channels (can be 1 to 4)
 *		- n_channel0_index through n_channel3_index are indices of channels in source RGBA quad
 *		  (formula to get channel from RGBA quad is: (n_rgba >> (8 * n_channel_index)) & 0xff
 *		  therefore channel index can be 0 (red), 1 (green), 2 (blue) or 3 (alpha)
 *		  in case __JPEG_ENC_BGR is defined, index with value 0 is set to 2 and vice-versa)
 *		  if n_channel_num is 1, n_channel0_index is used only, the rest is ignored
 *		  if n_channel_num is 2, n_channel0_index and n_channel1_index are used, etc ...
 */
bool CJpegEncoder::Encode_LosslessJpeg(FILE *p_fw, const TBmp *p_bitmap,
	int n_sample_precission, bool b_separate_entropy, int n_channel_num, int n_channel0_index,
	int n_channel1_index, int n_channel2_index, int n_channel3_index) // t_odo - add option to use single huff table per component or single huff table per rgb and other one for alpha (if present); add RLE compression option; rearrange code so when choosing the best predictor, best predicted image planes are remembered so it will elliminate the second not-so-quite-different loop so the compression time is going to get shorter.
																	  // todo - maybe think about some pre-mature exit in case first component plane bloats image size too much so there's (from pure propabilitistic point of view) no chance the compression ratio with a given predictor would be good (and then process lots of images to see which predictor is best and arrange predictor tests in new order so the compression is as quick as possible)
{
	if(n_sample_precission < 2 || n_sample_precission > 12 ||
	   n_channel_num < 1 || n_channel_num > 4)
		return false;
	if((n_channel_num >= 1 && (n_channel0_index < 0 || n_channel0_index > 3)) ||
	   (n_channel_num >= 2 && (n_channel1_index < 0 || n_channel1_index > 3)) ||
	   (n_channel_num >= 3 && (n_channel2_index < 0 || n_channel2_index > 3)) ||
	   (n_channel_num == 4 && (n_channel3_index < 0 || n_channel3_index > 3)))
		return false;

	int p_channel_index[4] = {n_channel0_index,
		n_channel1_index, n_channel2_index, n_channel3_index};
#ifdef __JPEG_ENC_BGR
	for(int i = 0; i < 4; i ++) {
		p_channel_index[i] = (p_channel_index[i] == 0)? 2 :
			((p_channel_index[i] == 2)? 0 : p_channel_index[i]);
	}
	// swap indices 0 and 2 for RGBA <-> BGRA
#endif

	TComponent p_component_info[4] = {
		{1, 1, 1, 1, 1, 0, (b_separate_entropy)? 0 : ((n_channel0_index == 3)? 1 : 0), 0},
		{2, 1, 1, 1, 1, 0, (b_separate_entropy)? 1 : ((n_channel1_index == 3)? 1 : 0), 0},
		{3, 1, 1, 1, 1, 0, (b_separate_entropy)? 2 : ((n_channel2_index == 3)? 1 : 0), 0},
		{4, 1, 1, 1, 1, 0, (b_separate_entropy)? 3 : ((n_channel3_index == 3)? 1 : 0), 0}
	};
	// component info (component id's are indices, 1 x 1 sampling, huff dc table indices
	// are component order, quant + ac table indices are 0)

#ifdef __JPEG_ENC_VERBOSE
	printf("selecting the best predictor ...\n");
	// debug
#endif

	CJpegHuffCoder p_huff_coder[4];
	int n_huff_coder_num = (b_separate_entropy)? n_channel_num : ((n_channel0_index == 3 ||
		(n_channel1_index == 3 && n_channel_num >= 2) ||
		(n_channel2_index == 3 && n_channel_num >= 3) ||
		(n_channel3_index == 3 && n_channel_num == 4))? 2 : 1);
	// huff coder list and number of used coders

	if(n_channel_num == 1 && n_channel0_index == 3 && !b_separate_entropy) {
		n_huff_coder_num = 1;
		p_component_info[0].n_dc_huff_coder_index = 0;
	}
	// a little correction for alpha-only images

	THuffmanTable p_huffman_table[8]; // 8 = 4 DC + 4 AC
#ifdef __JPEG_ENC_LOOKUP_CHUFFCODER
	int p_huff_code_table[4 * 256] = {0};
#endif
	// lo-word contains code word, hi-word contains code word length.
	// index is (n_huff_table_index * 256) + symbol

	int16_t *p_difference_image[4] = {0};
	// frame-buffers for image predictions in required component planes

	uint8_t n_used_predictor;
	unsigned int n_best_image_size = 0x7fffffff;
	for(uint8_t n_predictor = CPredictor<uint32_t, 8>::pred_A; n_predictor <=
	   CPredictor<uint32_t, 8>::pred_A_plus_B_half; n_predictor ++) {
		unsigned int n_image_size = 0;

		CJpegHuffCoder p_huff_coder2[4];
		int16_t *p_difference_image2[4];
		for(int i = 0; i < n_channel_num; i ++) {
			if(!(p_difference_image2[i] = CPredictor<uint32_t, 8>::p_DifferenceImage(
			   p_bitmap->p_buffer, p_bitmap->n_width, p_bitmap->n_height, p_channel_index[i],
			   n_predictor, 8 - n_sample_precission)))
				return false;
			// create image of differences between predicted and real samples

			for(int16_t n_dc = 0, *p_diff_image = p_difference_image2[i], *p_end =
			   p_difference_image2[i] + p_bitmap->n_width * p_bitmap->n_height;
			   p_diff_image < p_end;) {
				int16_t n_sample = *p_diff_image ++;
				TRLEBlockData::TRLEPair t_symbol(n_sample - n_dc);
				n_dc = n_sample;
				// here encoding is performed. the code has to be the same as in the loop below
				// t_odo - write RLE

				n_image_size += t_symbol.n_code_word; // contains length of dc value to be stored

#ifdef __JPEG_ENC_USE_LOSSLESS_NONSTD_RLE
				uint32_t *p_src_pixel = &p_bitmap->p_buffer[p_diff_image -
					p_difference_image2[i] - 1];
				int16_t n_image_sample = CPredictor<uint32_t, 8>::n_Sample(
					*p_src_pixel ++, p_channel_index[i], 8 - n_sample_precission/*12*/);
				// take source sample (gotta sample with maximal precision, otherwise rounding
				// errors along with errors, distributed by RLE will make image values drift away)

				uint8_t n_sample_num = 0; // add 1 when decoding
				while(p_diff_image < p_end && n_sample_num < 15 &&
				   n_image_sample == CPredictor<uint32_t, 8>::n_Sample(
				   *p_src_pixel ++, p_channel_index[i], 8 - n_sample_precission/*12*/)) {
					n_sample_num ++;
					p_diff_image ++;
				}
				// calc number of samples with the equal value
				
				t_symbol.n_code_word |= n_sample_num << 4;
#endif

				if(!p_huff_coder2[p_component_info[i].n_dc_huff_coder_index].Insert_Symbols(
				   &t_symbol.n_code_word, 1)) {
					for(int j = 0; j < i; j ++)
						delete[] p_difference_image2[j];
					for(int j = 0; j < n_channel_num; j ++)
						delete[] p_difference_image[j];
					return false;
				}
			}
		}

		for(int i = 0; i < n_huff_coder_num; i ++) {
			if(!p_huff_coder2[i].Build_HuffmanTree())
				return false;
			// create huffman tree, calc image size

			for(int j = 0; j < p_huff_coder2[i].n_Code_Num(); j ++) {
				n_image_size += p_huff_coder2[i].n_Code_Length(j) *
					p_huff_coder2[i].n_Code_Symbol_Frequency(j);
			}
			// calc image size
		}

#ifdef __JPEG_ENC_VERBOSE
		const char *p_s_predictor_name[] = {"a", "b", "c",
			"a + b - c", "a + (b - c) / 2", "b + (a - c) / 2", "(a + b) / 2"};
		printf("\tpredictor \'%s\' yields image with size %u bits\n",
			p_s_predictor_name[n_predictor - 1], n_image_size);
		// debug
#endif

		if(n_best_image_size > n_image_size || n_predictor ==
		   CPredictor<uint32_t, 8>::pred_A) {
			n_used_predictor = n_predictor;
			n_best_image_size = n_image_size;
			for(int j = 0; j < n_channel_num; j ++) {
				delete[] p_difference_image[j];
				p_difference_image[j] = p_difference_image2[j];
			}
			for(int j = 0; j < n_huff_coder_num; j ++) {
				if(!(p_huff_coder[j] = p_huff_coder2[j])) {
					for(int j = 0; j < n_channel_num; j ++)
						delete[] p_difference_image[j];
					return false;
				}
			}
		} else {
			for(int j = 0; j < n_channel_num; j ++)
				delete[] p_difference_image2[j];
		}
	}
	// choose the best predictor, keep the best difference image so we don't have to recalc

#ifdef __JPEG_ENC_VERBOSE
	const char *p_s_predictor_name[] = {"a", "b", "c",
		"a + b - c", "a + (b - c) / 2", "b + (a - c) / 2", "(a + b) / 2"};
	printf("\n\tbest predictor is \'%s\' and yields image with size %u bits\n",
		p_s_predictor_name[n_used_predictor - 1], n_best_image_size);
	// debug
#endif

#ifdef __JPEG_ENC_VERBOSE
	printf("calculating huffman tables ...\n");
	// debug
#endif

	for(int i = 0; i < n_huff_coder_num; i ++) {
#ifndef __JPEG_ENC_LOOKUP_CHUFFCODER
		p_huffman_table[2 * i] = t_HuffmanTable(p_huff_coder[i]); // 2 * i ~ DC, AC are unused
		// sorted-tree symbols variant (didn't properly work at the beginning, slower anyway)
#else
		p_huffman_table[2 * i] = t_HuffmanTable(p_huff_coder[i], &p_huff_code_table[i * 256]);
		// look-up huffman code table variant (uses a bit more memory for the table)
#endif
		// create huffman table
	}
	// build huffman tables

#ifdef __JPEG_ENC_VERBOSE
	printf("writing image to disk ...\n");
	// debug
#endif

	COutputFile out_file(p_fw);

	if(!out_file.Write_Short((int16_t)marker_StartImage) ||
	   !Write_JFIFHeader(&out_file) ||
	   !Write_FrameHeader(&out_file, marker_SOF3_HuffLossless, p_bitmap->n_width,
	   p_bitmap->n_height, n_sample_precission, n_channel_num, p_component_info) ||
	   !Write_HuffmanTables(&out_file, p_huffman_table, n_huff_coder_num, true) ||
	   !Write_StartScan(&out_file, n_channel_num, p_component_info, n_used_predictor, 0)) {
		for(int i = 0; i < n_channel_num; i ++)
			delete[] p_difference_image[i];
		return false;
	}
	// write headers !!DC huffman tables only!!

	CBitWriter bit_writer;

	{int16_t n_dc[4] = {0};
	int16_t *p_difference_image_ptr[4] = {p_difference_image[0],
	   p_difference_image[1], p_difference_image[2], p_difference_image[3]};
	const int16_t *p_end[4] = {p_difference_image[0] + p_bitmap->n_width * p_bitmap->n_height,
	   p_difference_image[1] + (n_channel_num > 1) * p_bitmap->n_width * p_bitmap->n_height,
	   p_difference_image[2] + (n_channel_num > 2) * p_bitmap->n_width * p_bitmap->n_height,
	   p_difference_image[3] + (n_channel_num > 3) * p_bitmap->n_width * p_bitmap->n_height};
	for(;p_difference_image_ptr[0] < p_end[0] || p_difference_image_ptr[1] < p_end[1] ||
	   p_difference_image_ptr[2] < p_end[2] || p_difference_image_ptr[3] < p_end[3];) {
		for(int i = 0; i < n_channel_num; i ++) {
#ifdef __JPEG_ENC_USE_LOSSLESS_NONSTD_RLE
			if(p_difference_image_ptr[i] >= p_end[i])
			    continue;
			// this component plane was finished earlier due to RLE encoding
#endif

			int16_t n_sample = *p_difference_image_ptr[i] ++;
			TRLEBlockData::TRLEPair t_symbol(n_sample - n_dc[i]);
			n_dc[i] = n_sample;
			// here encoding is performed. the code has to be the same as in the loop above

#ifdef __JPEG_ENC_USE_LOSSLESS_NONSTD_RLE // t_odo - debug encoder and decoder and investigate where two-points artifact originates on wire_fence_d.tga
			uint32_t *p_src_pixel = &p_bitmap->p_buffer[p_difference_image_ptr[i] -
				p_difference_image[i] - 1];
			int16_t n_image_sample = CPredictor<uint32_t, 8>::n_Sample(
				*p_src_pixel ++, p_channel_index[i], 8 - n_sample_precission/*12*/);
			// take source sample

			uint8_t n_sample_num = 0; // add 1 when decoding
			for(int16_t *p_end2 = p_difference_image[i] + p_bitmap->n_width *
			   p_bitmap->n_height; p_difference_image_ptr[i] < p_end2 && n_sample_num < 15 &&
			   n_image_sample == CPredictor<uint32_t, 8>::n_Sample(
			   *p_src_pixel ++, p_channel_index[i], 8 - n_sample_precission/*12*/); ) {
				n_sample_num ++;
				p_difference_image_ptr[i] ++;
			}
			// calc number of samples with the equal value
			
			t_symbol.n_code_word |= n_sample_num << 4;
#endif

			int n_code_word, n_bit_num;
#ifndef __JPEG_ENC_LOOKUP_CHUFFCODER
			if(!p_huff_coder[p_component_info[i].n_dc_huff_coder_index].TranslateSymbol(
			   t_symbol.n_code_word, n_code_word, n_bit_num)) {
				for(int j = 0; j < n_channel_num; j ++)
					delete[] p_difference_image[j];
				return false;
			}
#else
			n_bit_num = p_huff_code_table[p_component_info[i].n_dc_huff_coder_index * 256 +
				t_symbol.n_code_word] >> 16;
			n_code_word = p_huff_code_table[p_component_info[i].n_dc_huff_coder_index * 256 +
				t_symbol.n_code_word] & 0xffff;
#endif
			// encode differential dc value

			_ASSERTE(n_code_word >= 0 && n_code_word < 65536);
			_ASSERTE(n_bit_num >= 1 && n_bit_num <= 16);
			if(!bit_writer.WriteNumber(n_bit_num, n_code_word, &out_file)) {
				for(int j = 0; j < n_channel_num; j ++)
					delete[] p_difference_image[j];
				return false;
			}
			// write code

			if((t_symbol.n_code_word & 0x0f) && /*(t_symbol.n_code_word & 0x0f) < 16 &&*/
			   !bit_writer.WriteNumber(t_symbol.n_code_word & 0xf, t_symbol.n_coeff, &out_file)) {
				for(int j = 0; j < n_channel_num; j ++)
					delete[] p_difference_image[j];
				return false;
			}
			// write number (in case it's length is greater than 0 (value != 0) and less than 16)
		}
		// write differential encoded sample for each component
	}}
	// write huffman encoded data

	for(int i = 0; i < n_channel_num; i ++)
		delete[] p_difference_image[i];
	// don't need this any more

	if(!bit_writer.PaddByte(&out_file))
		return false;
	// padd last byte with 1-s

	if(!out_file.Write_Short((int16_t)marker_EndImage))
		return false;
	// write end-of-image marker

	return true;
}

#endif // __JPEG_ENC_ENABLE_LOSSLESS

/*
 *	bool CJpegEncoder::Encode_Jpeg(FILE *p_fw, const TBmp *p_bitmap, int n_encode_color,
 *		int n_color_sampling_horiz, int n_color_sampling_vert)
 *		- encode jpeg image from p_bitmap (contains RGB(A) 8bpp image) to p_fw
 *		  (p_fw has to be opened for binary writing; fwrite() only is used)
 *		- bitmap resolution in jpeg is written as unsigned short so it's limited to 65535 ^ 2
 *		- n_encode_color (one of color_Gray, color_RG, color_RGB, color_CMYK or color_YCCK)
 *		  determines way of storing color in jpeg file (source color is RGB (A is never used))
 *		- n_color_sampling_horiz and n_color_sampling_vert is color sampling (has to be 1, 2 or 4)
 *		- for color_Gray, color sampling is forced to 1 x 1 (recommended in jpeg specification)
 *		  and quant table 0 is used only
 *		- for color_RG, R has dense 1 x 1 samples, G has samples of size n_color_sampling_horiz x
 *		  n_color_sampling_vert, quant table 0 is used only
 *		- color_RGB sets storing color as YCbCr, Y has dense samples and uses quant table 0,
 *		  Cb and Cr has normal samples and both uses quant table 1
 *		- color_CMYK (valid only if __JPEG_ENC_ENABLE_CMYK is defined) sets storing color
 *		  information as CMYK, C and K has dense samples and uses quant table 0, M and Y has
 *		  normal samples and uses quant table 1
 *		- color_YCCK is used for standard 4-component jpegs (YCCK means YCbCrK) C and K has dense
 *		  samples and uses quant table 0, M and Y has normal samples and uses quant table 1
 *		- quant tables are by default set to be written, but their contents are unspecified
 *		- returns true on success and false on failure (not enough memory for temporal info
 *		  or not enough space on dest drive)
 */
bool CJpegEncoder::Encode_Jpeg(FILE *p_fw, const TBmp *p_bitmap, int n_encode_color,
	int n_color_sampling_horiz, int n_color_sampling_vert)
{
	if((n_color_sampling_vert != 1 && n_color_sampling_vert != 2 && n_color_sampling_vert != 4) ||
	   (n_color_sampling_horiz != 1 && n_color_sampling_horiz != 2 && n_color_sampling_horiz != 4))
		return false;
	// sampling must be valid

	if(n_encode_color != color_Gray && n_encode_color != color_RG &&
	   n_encode_color != color_RGB
#ifdef __JPEG_ENC_ENABLE_CMYK
	   && n_encode_color != color_CMYK
#endif
#ifdef __JPEG_ENC_ENABLE_YCCK
	   && n_encode_color != color_YCCK
#endif
	   )
		return false;
	// color conversion must be valid

	if(n_encode_color == color_Gray) {
		n_color_sampling_vert = 1;
		n_color_sampling_horiz = 1;
	}
	// force 1x1 coplor sampling for single-component jpegs

	int n_macro_block_x_num = (p_bitmap->n_width + (n_color_sampling_horiz * 8 - 1)) /
		(n_color_sampling_horiz * 8);
	int n_macro_block_y_num = (p_bitmap->n_height + (n_color_sampling_vert * 8 - 1)) /
		(n_color_sampling_vert * 8);
	int n_macro_block_num = n_macro_block_x_num * n_macro_block_y_num;
	// number of x and y macro-blocks

	CJpegHuffCoder p_huff_coder[4];
	int n_huff_coder_num;
	// huffman coders

#ifdef __JPEG_ENC_LOOKUP_CHUFFCODER
	int p_huff_code_table[4 * 256] = {0};
#endif
	// lo-word contains code word, hi-word contains code word length.
	// index is (n_huff_table_index * 256) + symbol

	TComponent p_component_data[4];
	int n_component_num;
	// component data

	Get_ComponentInfo(n_encode_color, n_color_sampling_horiz,
		n_color_sampling_vert, p_component_data, n_component_num, n_huff_coder_num);

	TBlockInfo p_block_info[64];
	int n_block_macro_block_num;
	// block data

#ifndef __JPEG_ENC_LOOKUP_CHUFFCODER
	Get_BlockInfo(p_component_data, n_component_num, p_huff_coder, 0,
		p_block_info, n_block_macro_block_num);
#else
	Get_BlockInfo(p_component_data, n_component_num, p_huff_coder, p_huff_code_table,
		p_block_info, n_block_macro_block_num);
#endif
	// determine number of blocks per macro-block, number of blocks per component
	// and number of components, assign quant-table pointers and huffman coder pointers
	// (per-block to reduce number of processing loops and branches)

	TRLEBlockData *p_block_data;
	TMacroBlock *p_macro_block;
	if(!(p_macro_block = new(std::nothrow) TMacroBlock[n_macro_block_num]))
		return false;
	if(!(p_block_data = new(std::nothrow) TRLEBlockData[n_macro_block_num * n_block_macro_block_num])) {
		delete[] p_macro_block;
		return false;
	}
	TRLEBlockData *p_block_data_ptr = p_block_data;
	for(TMacroBlock *p_block = p_macro_block, *p_end = p_macro_block + (n_macro_block_x_num *
	   n_macro_block_y_num); p_block < p_end; p_block ++) {
		p_block->p_block = p_block_data_ptr;
		p_block_data_ptr += n_block_macro_block_num;
	}
	// alloc macro-block buffers, we're going to forward-DCT and quantize the whole
	// image first, then create huffman tables and write image into the file

	int n_macro_block_width = 8 * n_color_sampling_horiz;
	int n_macro_block_height = 8 * n_color_sampling_vert;

#ifdef __JPEG_ENC_VERBOSE
	printf("separate color planes, FDCT, quantize and RLE compress\n");
#endif

	int16_t p_dc_value[4] = {0}; // dc accumulators
	const uint32_t *p_scanline = p_bitmap->p_buffer;
	TRLEBlockData *p_dest_block = p_block_data;
	// write directly to block data array, don't bother with TMacroBlock-s
	for(int n_residual_height = p_bitmap->n_height; n_residual_height > p_bitmap->n_height -
	   n_macro_block_height * n_macro_block_y_num; n_residual_height -= n_macro_block_height,
	   p_scanline += n_macro_block_height * p_bitmap->n_width) {
		const uint32_t *p_block_image = p_scanline;
		for(int n_residual_width = p_bitmap->n_width; n_residual_width > p_bitmap->n_width -
		   n_macro_block_width * n_macro_block_x_num; n_residual_width -= n_macro_block_width,
		   p_block_image += n_macro_block_width) {
#ifdef __JPEG_ENC_ENCODE_VERBOSE
		printf("\n\t\t\t=== rle-encoding macro-block %d ===\n",
			(p_dest_block - p_block_data) / n_block_macro_block_num + 1);
		_ASSERTE(!((p_dest_block - p_block_data) % n_block_macro_block_num));
#endif
			for(const TBlockInfo *p_cur_block_info = p_block_info, *p_end = p_block_info +
			   n_block_macro_block_num; p_cur_block_info < p_end; p_cur_block_info ++) {
#ifdef __JPEG_ENC_INTEGER_FDCT
				int16_t p_channel_block[64];
#else
				float p_channel_block[64];
#endif
				// block with image data in a given component plane

				if(n_encode_color == color_RGB || n_encode_color == color_Gray) {
					if((p_cur_block_info - p_block_info) % n_block_macro_block_num <
					   n_color_sampling_horiz * n_color_sampling_vert) {
						CGrayscaleConversion color_converter;
						Get_ImageBlock(p_channel_block, p_block_image + p_cur_block_info->n_offset_x +
							p_cur_block_info->n_offset_y * p_bitmap->n_width, p_bitmap->n_width,
							n_residual_width - p_cur_block_info->n_offset_x, n_residual_height -
							p_cur_block_info->n_offset_y, p_cur_block_info->p_component->n_scale_horiz,
							p_cur_block_info->p_component->n_scale_vert, color_converter);
					} else if((p_cur_block_info - p_block_info) % n_block_macro_block_num ==
					   n_color_sampling_horiz * n_color_sampling_vert) {
						CChrominance_Cb_Conversion color_converter;
						Get_ImageBlock(p_channel_block, p_block_image + p_cur_block_info->n_offset_x +
							p_cur_block_info->n_offset_y * p_bitmap->n_width, p_bitmap->n_width,
							n_residual_width - p_cur_block_info->n_offset_x, n_residual_height -
							p_cur_block_info->n_offset_y, p_cur_block_info->p_component->n_scale_horiz,
							p_cur_block_info->p_component->n_scale_vert, color_converter);
					} else /*if((p_cur_block_info - p_block_info) % n_block_macro_block_num ==
					   n_color_sampling_horiz * n_color_sampling_vert + 1)*/ {
						CChrominance_Cr_Conversion color_converter;
						Get_ImageBlock(p_channel_block, p_block_image + p_cur_block_info->n_offset_x +
							p_cur_block_info->n_offset_y * p_bitmap->n_width, p_bitmap->n_width,
							n_residual_width - p_cur_block_info->n_offset_x, n_residual_height -
							p_cur_block_info->n_offset_y, p_cur_block_info->p_component->n_scale_horiz,
							p_cur_block_info->p_component->n_scale_vert, color_converter);
					}
#ifdef __JPEG_ENC_ENABLE_CMYK
				} else if(n_encode_color == color_CMYK) {
					if((p_cur_block_info - p_block_info) % n_block_macro_block_num <
					   n_color_sampling_horiz * n_color_sampling_vert) {
						CCMYK_C_Conversion color_converter;
						Get_ImageBlock(p_channel_block, p_block_image + p_cur_block_info->n_offset_x +
							p_cur_block_info->n_offset_y * p_bitmap->n_width, p_bitmap->n_width,
							n_residual_width - p_cur_block_info->n_offset_x, n_residual_height -
							p_cur_block_info->n_offset_y, p_cur_block_info->p_component->n_scale_horiz,
							p_cur_block_info->p_component->n_scale_vert, color_converter);
					} else if((p_cur_block_info - p_block_info) % n_block_macro_block_num ==
					   n_color_sampling_horiz * n_color_sampling_vert) {
						CCMYK_M_Conversion color_converter;
						Get_ImageBlock(p_channel_block, p_block_image + p_cur_block_info->n_offset_x +
							p_cur_block_info->n_offset_y * p_bitmap->n_width, p_bitmap->n_width,
							n_residual_width - p_cur_block_info->n_offset_x, n_residual_height -
							p_cur_block_info->n_offset_y, p_cur_block_info->p_component->n_scale_horiz,
							p_cur_block_info->p_component->n_scale_vert, color_converter);
					} else if((p_cur_block_info - p_block_info) % n_block_macro_block_num ==
					   n_color_sampling_horiz * n_color_sampling_vert + 1) {
						CCMYK_Y_Conversion color_converter;
						Get_ImageBlock(p_channel_block, p_block_image + p_cur_block_info->n_offset_x +
							p_cur_block_info->n_offset_y * p_bitmap->n_width, p_bitmap->n_width,
							n_residual_width - p_cur_block_info->n_offset_x, n_residual_height -
							p_cur_block_info->n_offset_y, p_cur_block_info->p_component->n_scale_horiz,
							p_cur_block_info->p_component->n_scale_vert, color_converter);
					} else /*if((p_cur_block_info - p_block_info) % n_block_macro_block_num >
					   n_color_sampling_horiz * n_color_sampling_vert + 1)*/ {
						CCMYK_K_Conversion color_converter;
						Get_ImageBlock(p_channel_block, p_block_image + p_cur_block_info->n_offset_x +
							p_cur_block_info->n_offset_y * p_bitmap->n_width, p_bitmap->n_width,
							n_residual_width - p_cur_block_info->n_offset_x, n_residual_height -
							p_cur_block_info->n_offset_y, p_cur_block_info->p_component->n_scale_horiz,
							p_cur_block_info->p_component->n_scale_vert, color_converter);
					}
#endif
#ifdef __JPEG_ENC_ENABLE_YCCK
				} else if(n_encode_color == color_YCCK) {
					if((p_cur_block_info - p_block_info) % n_block_macro_block_num <
					   n_color_sampling_horiz * n_color_sampling_vert) {
						CYCCK_C_Conversion color_converter;
						Get_ImageBlock(p_channel_block, p_block_image + p_cur_block_info->n_offset_x +
							p_cur_block_info->n_offset_y * p_bitmap->n_width, p_bitmap->n_width,
							n_residual_width - p_cur_block_info->n_offset_x, n_residual_height -
							p_cur_block_info->n_offset_y, p_cur_block_info->p_component->n_scale_horiz,
							p_cur_block_info->p_component->n_scale_vert, color_converter);
					} else if((p_cur_block_info - p_block_info) % n_block_macro_block_num ==
					   n_color_sampling_horiz * n_color_sampling_vert) {
						CYCCK_M_Conversion color_converter;
						Get_ImageBlock(p_channel_block, p_block_image + p_cur_block_info->n_offset_x +
							p_cur_block_info->n_offset_y * p_bitmap->n_width, p_bitmap->n_width,
							n_residual_width - p_cur_block_info->n_offset_x, n_residual_height -
							p_cur_block_info->n_offset_y, p_cur_block_info->p_component->n_scale_horiz,
							p_cur_block_info->p_component->n_scale_vert, color_converter);
					} else if((p_cur_block_info - p_block_info) % n_block_macro_block_num ==
					   n_color_sampling_horiz * n_color_sampling_vert + 1) {
						CYCCK_Y_Conversion color_converter;
						Get_ImageBlock(p_channel_block, p_block_image + p_cur_block_info->n_offset_x +
							p_cur_block_info->n_offset_y * p_bitmap->n_width, p_bitmap->n_width,
							n_residual_width - p_cur_block_info->n_offset_x, n_residual_height -
							p_cur_block_info->n_offset_y, p_cur_block_info->p_component->n_scale_horiz,
							p_cur_block_info->p_component->n_scale_vert, color_converter);
					} else /*if((p_cur_block_info - p_block_info) % n_block_macro_block_num >
					   n_color_sampling_horiz * n_color_sampling_vert + 1)*/ {
						CCMYK_K_Conversion color_converter; // K passes as-is
						Get_ImageBlock(p_channel_block, p_block_image + p_cur_block_info->n_offset_x +
							p_cur_block_info->n_offset_y * p_bitmap->n_width, p_bitmap->n_width,
							n_residual_width - p_cur_block_info->n_offset_x, n_residual_height -
							p_cur_block_info->n_offset_y, p_cur_block_info->p_component->n_scale_horiz,
							p_cur_block_info->p_component->n_scale_vert, color_converter);
					}
#endif
				} else if(n_encode_color == color_RG) {
					if((p_cur_block_info - p_block_info) % n_block_macro_block_num <
					   n_color_sampling_horiz * n_color_sampling_vert) {
						CComponent_R_Conversion color_converter;
						Get_ImageBlock(p_channel_block, p_block_image + p_cur_block_info->n_offset_x +
							p_cur_block_info->n_offset_y * p_bitmap->n_width, p_bitmap->n_width,
							n_residual_width - p_cur_block_info->n_offset_x, n_residual_height -
							p_cur_block_info->n_offset_y, p_cur_block_info->p_component->n_scale_horiz,
							p_cur_block_info->p_component->n_scale_vert, color_converter);
					} else /*if((p_cur_block_info - p_block_info) % n_block_macro_block_num ==
					   n_color_sampling_horiz * n_color_sampling_vert)*/ {
						CComponent_G_Conversion color_converter;
						Get_ImageBlock(p_channel_block, p_block_image + p_cur_block_info->n_offset_x +
							p_cur_block_info->n_offset_y * p_bitmap->n_width, p_bitmap->n_width,
							n_residual_width - p_cur_block_info->n_offset_x, n_residual_height -
							p_cur_block_info->n_offset_y, p_cur_block_info->p_component->n_scale_horiz,
							p_cur_block_info->p_component->n_scale_vert, color_converter);
					}
				}
				// read block part of the image, padd border if necessary and perform color
				// conversion

				int16_t p_quant_block[64];

				FDCT_Quantize_Block(p_quant_block, p_channel_block, p_cur_block_info->p_quant_table);

				int16_t n_prev_dc = p_dc_value[p_cur_block_info->p_component->n_component_id - 1];
				p_dc_value[p_cur_block_info->p_component->n_component_id - 1] = p_quant_block[0];
				p_quant_block[0] -= n_prev_dc;
				// manage DC value differential coding

				RLE_Encode_Block(*p_dest_block ++, p_quant_block);
			}
			// cut-out data of all macro-blocks, RLE-compress it and store it in the array
		}
	}
	// FDCT and RLE-compress all macro-blocks (the main bottleneck; asm-enhanced version would
	// come in handy)

#ifdef __JPEG_ENC_VERBOSE
	printf("gather symbols\n");
#endif

	for(const TMacroBlock *p_block = p_macro_block, *p_end = p_macro_block + n_macro_block_num;
	   p_block < p_end; p_block ++) {
		const TBlockInfo *p_cur_block_info = p_block_info;
		for(const TRLEBlockData *p_data_block = p_block->p_block, *p_end = p_block->p_block +
		   n_block_macro_block_num; p_data_block < p_end; p_data_block ++, p_cur_block_info ++) {
			if(!p_cur_block_info->p_dc_huff_coder->Insert_Symbols(
			   &p_data_block->p_pair->n_code_word, 1)) {
				delete[] p_macro_block;
				delete[] p_block_data;
				return false;
			}
			// first is DC

			for(const TRLEBlockData::TRLEPair *p_pair = p_data_block->p_pair + 1,
			   *p_end_pair = p_data_block->p_pair + p_data_block->n_pair_num;
			   p_pair < p_end_pair; p_pair ++) {
				if(!p_cur_block_info->p_ac_huff_coder->Insert_Symbols(&p_pair->n_code_word, 1)) {
					delete[] p_macro_block;
					delete[] p_block_data;
					return false;
				}
			}
			// rest are AC
		}
	}
	// gather symbols, count frequencies

#ifdef __JPEG_ENC_VERBOSE
	printf("fill huffman tables\n");
#endif

	THuffmanTable p_huffman_table[4]; // [2 * i + 1] = AC, [2 * i + 0] = DC

	for(int i = 0; i < n_huff_coder_num; i ++) { // t_odo - see what is being stored, fix algorithm
		for(int n = 0; n < 2; n ++) {
			THuffmanTable *p_table = &p_huffman_table[i * 2 + n];

			if(!p_huff_coder[i * 2 + n].Build_HuffmanTree()) {
				delete[] p_macro_block;
				delete[] p_block_data;
				return false;
			}

#ifndef __JPEG_ENC_LOOKUP_CHUFFCODER
			*p_table = t_HuffmanTable(p_huff_coder[i * 2 + n]);
			// sorted-tree symbols variant (didn't properly work at the beginning, slower anyway)
#else
			*p_table = t_HuffmanTable(p_huff_coder[i * 2 + n],
				&p_huff_code_table[(i * 2 + n) * 256]);
			// look-up huffman code table variant (uses a bit more memory for the table)
#endif
		}
	}
	// copy symbols to jpeg huffman tables, assign codes to symbols (create huffman trees)

	COutputFile out_file(p_fw);

	bool b_write_qt1 = m_p_quant_table[1].b_write;
	if(n_encode_color == color_Gray || n_encode_color == color_RG)
		m_p_quant_table[1].b_write = false;
	// grayscale / RG images do not need chrominance quant table

	if(!out_file.Write_Short((int16_t)marker_StartImage) ||
	   !Write_JFIFHeader(&out_file) ||
	   !Write_QuantTables(&out_file) ||
	   !Write_FrameHeader(&out_file, marker_SOF0_HuffBaseline_DCT, p_bitmap->n_width,
	   p_bitmap->n_height, 8, n_component_num, p_component_data)) {
		m_p_quant_table[1].b_write = b_write_qt1;
	}

	m_p_quant_table[1].b_write = b_write_qt1;
	// restore

#ifdef __JPEG_ENC_VERBOSE
	printf("\tn_macro_block_x_num=%d n_macro_block_y_num=%d (%d blocks)\n", n_macro_block_x_num,
		n_macro_block_y_num, n_macro_block_num);
	printf("\tn_max_sampling_horiz=%d n_max_sampling_vert=%d\n", n_color_sampling_horiz,
		n_color_sampling_vert);
#endif
	if(!Write_HuffmanTables(&out_file, p_huffman_table, n_huff_coder_num) ||
	   !Write_StartScan(&out_file, n_component_num, p_component_data)) {
		delete[] p_macro_block;
		delete[] p_block_data;
		return false;
	}
	// write headers


	CBitWriter bit_writer;

	for(const TMacroBlock *p_block = p_macro_block, *p_end = p_macro_block + n_macro_block_num;
	   p_block < p_end; p_block ++) {
#ifdef __JPEG_ENC_HUFF_ENCODE_VERBOSE
		printf("\n\t\t\t=== huff-encoding macro-block %d ===\n", (p_block - p_macro_block) + 1);
#endif
		const TBlockInfo *p_cur_block_info = p_block_info;
		for(const TRLEBlockData *p_data_block = p_block->p_block, *p_end = p_block->p_block +
		   n_block_macro_block_num; p_data_block < p_end; p_data_block ++, p_cur_block_info ++) {
#ifndef __JPEG_ENC_LOOKUP_CHUFFCODER
			if(!HuffCompress_Write_BlockData(*p_data_block, p_cur_block_info->p_dc_huff_coder,
			   p_cur_block_info->p_ac_huff_coder, bit_writer, &out_file)) {
#else
			if(!HuffCompress_Write_BlockData(*p_data_block, p_cur_block_info->p_dc_huff_code_table,
			   p_cur_block_info->p_ac_huff_code_table, bit_writer, &out_file)) {
#endif
				delete[] p_macro_block;
				delete[] p_block_data;
				return false;
			}
		}
	}
	// huffman-compress and simultaneously write all blocks to file

	delete[] p_macro_block;
	delete[] p_block_data;
	// don't need block data any more

	if(!bit_writer.PaddByte(&out_file))
		return false;
	// padd last byte with 1-s

	if(!out_file.Write_Short((int16_t)marker_EndImage))
		return false;
	// write end-of-image marker

	return true;
}

#ifndef __JPEG_ENC_LOOKUP_CHUFFCODER

/*
 *	template <class CJpegHuffCoderClass>
 *	CJpegEncoder::THuffmanTable CJpegEncoder::t_HuffmanTable(CJpegHuffCoderClass &r_huff_coder)
 *		- create THuffmanTable from r_huff_coder
 *		- r_huff_coder can't be const CJpegHuffCoder because some functions
 *		  of CJpegHuffCoder needs sorted object data so they may need to sort them
 *		  and therefore modify object data
 */
template <class CJpegHuffCoderClass>
CJpegEncoder::THuffmanTable CJpegEncoder::t_HuffmanTable(CJpegHuffCoderClass &r_huff_coder)
{
	THuffmanTable t_result;

	uint8_t *p_next_symbol = t_result.p_code_table;
	// write cursor into the flat list of symbols (grouped by code length)

	for(int n_length = 1; n_length <= 16; n_length ++) {
		const int n_slot = n_length - 1;
		// code lengths are 1-based, table slots are 0-based

		t_result.p_code_num[n_slot] = r_huff_coder.n_Code_Num(n_length);
		// number of codes having this length

		t_result.p_code[n_slot] = p_next_symbol;
		// symbols of this length begin here

		for(uint8_t n_index = 0; n_index < t_result.p_code_num[n_slot]; n_index ++) {
			*p_next_symbol = *r_huff_coder.p_Code_Symbol(n_index, n_length);
			p_next_symbol ++;
		}
		// copy symbols of this length, in the order the coder reports them
	}
	// for each of the 16 code lengths, query the coder for the code count and symbols
	// (this replaces an older variant which searched the codes by value)

	return t_result;
}

/*
 *	bool CJpegEncoder::HuffCompress_Write_BlockData(const TRLEBlockData &r_t_block,
 *		CJpegHuffCoder *p_cur_dc_huff_coder, CJpegHuffCoder *p_cur_ac_huff_coder,
 *		CBitWriter &r_bit_writer, COutputFile *p_out_file)
 *		- take data block r_t_block, encode it using p_cur_dc_huff_coder and
 *		  p_cur_ac_huff_coder (didn't properly work at the beginning, slower anyway)
 *		- r_bit_writer is bit writer, p_out_file is output file
 *		- return value is true on success or false on write error (not enough free
 *		  space on dest drive)
 */
bool CJpegEncoder::HuffCompress_Write_BlockData(const TRLEBlockData &r_t_block,
	CJpegHuffCoder *p_cur_dc_huff_coder, CJpegHuffCoder *p_cur_ac_huff_coder,
	CBitWriter &r_bit_writer, COutputFile *p_out_file)
{
	CJpegHuffCoder *p_cur_huff_coder = p_cur_dc_huff_coder;
	for(const TRLEBlockData::TRLEPair *p_pair = r_t_block.p_pair,
	   *p_end = r_t_block.p_pair + r_t_block.n_pair_num; p_pair < p_end; p_pair ++) {
		int n_code_word, n_bit_num;
		if(!p_cur_huff_coder->TranslateSymbol(p_pair->n_code_word, n_code_word, n_bit_num))
			return false;
		// translate current code word (result is index into huffman table and it's length)

		p_cur_huff_coder = p_cur_ac_huff_coder;
		// first coeff only is encoded using dc coder,
		// all following codes are encoded using ac coder

#ifdef __JPEG_ENC_HUFF_ENCODE_VERBOSE
		if(p_pair->n_code_word == 0) {
			printf("(EOB) = %d(%d) ", n_bit_num, n_code_word);
		} else if(p_pair->n_code_word == 0xf0) {
			printf("(16x0) = %d(%d) ", n_bit_num, n_code_word);
		} else {
			printf("(%d:%d, %d) = %d(%d) ", p_pair->n_code_word >> 4,
				p_pair->n_code_word & 0xf, p_pair->n_coeff, n_bit_num, n_code_word);
		}
		// debug
#endif

		_ASSERTE(n_code_word >= 0 && n_code_word < 65536);
		_ASSERTE(n_bit_num >= 1 && n_bit_num <= 16);
		if(!r_bit_writer.WriteNumber(n_bit_num, n_code_word, p_out_file))
			return false;
		// write code

		if((p_pair->n_code_word & 0x0f) && !r_bit_writer.WriteNumber(p_pair->n_code_word &
		   0xf, p_pair->n_coeff, p_out_file))
			return false;
		// write number (in case it's length is greater than 0 (value != 0))
	}

#ifdef __JPEG_ENC_HUFF_ENCODE_VERBOSE
	printf("\n");
	// debug
#endif

	return true;
}
// sorted-tree symbols variant (sometimes doesn't work, don't know why yet)

#else // __JPEG_ENC_LOOKUP_CHUFFCODER

/*
 *	template <class CJpegHuffCoderClass>
 *	CJpegEncoder::THuffmanTable CJpegEncoder::t_HuffmanTable(CJpegHuffCoderClass &r_huff_coder,
 *		int *p_huff_code_table)
 *		- create THuffmanTable from r_huff_coder
 *		- fill p_huff_code_table (table of 256 integers, hi-word is code length,
 *		  lo-word is code-word, index is symbol being encoded)
 *		- r_huff_coder can't be const CJpegHuffCoder because some functions
 *		  of CJpegHuffCoder needs sorted object data so they may need to sort them
 *		  and therefore modify object data
 */
template <class CJpegHuffCoderClass>
CJpegEncoder::THuffmanTable CJpegEncoder::t_HuffmanTable(CJpegHuffCoderClass &r_huff_coder,
	int *p_huff_code_table)
{
	THuffmanTable t_result;

	uint8_t *p_next_symbol = t_result.p_code_table;
	// write cursor into the flat list of symbols (grouped by code length)

	for(int n_length = 1; n_length <= 16; n_length ++) {
		const int n_slot = n_length - 1;
		// code lengths are 1-based, table slots are 0-based

		t_result.p_code_num[n_slot] = r_huff_coder.n_Code_Num(n_length);
		// number of codes having this length

		t_result.p_code[n_slot] = p_next_symbol;
		// symbols of this length begin here

		for(uint8_t n_index = 0; n_index < t_result.p_code_num[n_slot]; n_index ++) {
			const uint8_t n_symbol = *r_huff_coder.p_Code_Symbol(n_index, n_length);

			*p_next_symbol = n_symbol;
			p_next_symbol ++;
			// store the symbol in the jpeg table

			p_huff_code_table[n_symbol] = (n_length << 16) |
				r_huff_coder.n_Code(n_index, n_length);
			// look-up entry, indexed by symbol: hi-word is code length, lo-word is code word
		}
	}
	// for each of the 16 code lengths, query the coder for the code count, symbols
	// and code words (this replaces an older variant which assigned the code values here)

	return t_result;
}

/*
 *	bool CJpegEncoder::HuffCompress_Write_BlockData(const TRLEBlockData &r_t_block,
 *		const int *p_cur_dc_huff_code_table, const int *p_cur_ac_huff_code_table,
 *		CBitWriter &r_bit_writer, COutputFile *p_out_file)
 *		- take data block r_t_block, encode it using p_cur_dc_huff_code_table and
 *		  p_cur_ac_huff_code_table (array of 256 integers, symbol is index,
 *		  hi-word is code length, lo-word is code word; used for huffman encoding)
 *		- r_bit_writer is bit writer, p_out_file is output file
 *		- return value is true on success or false on write error (not enough free
 *		  space on dest drive)
 */
bool CJpegEncoder::HuffCompress_Write_BlockData(const TRLEBlockData &r_t_block,
	const int *p_cur_dc_huff_code_table, const int *p_cur_ac_huff_code_table,
	CBitWriter &r_bit_writer, COutputFile *p_out_file)
{
	// writes all the RLE pairs of r_t_block to p_out_file through r_bit_writer; codes
	// are looked up in the tables (index is symbol, hi-word of the entry is code length
	// in bits, lo-word is the code word). returns false as soon as a write fails

	const int *p_cur_huff_code_table = p_cur_dc_huff_code_table;
	for(const TRLEBlockData::TRLEPair *p_pair = r_t_block.p_pair,
	   *p_end = r_t_block.p_pair + r_t_block.n_pair_num; p_pair < p_end; p_pair ++) {
		int n_code_word, n_bit_num;
		n_bit_num = p_cur_huff_code_table[p_pair->n_code_word] >> 16;
		n_code_word = p_cur_huff_code_table[p_pair->n_code_word] & 0xffff;
		// translate current symbol (result is the huffman code word and its length in bits)

		p_cur_huff_code_table = p_cur_ac_huff_code_table;
		// first coeff only is encoded using dc table,
		// all following codes are encoded using ac table

#ifdef __JPEG_ENC_HUFF_ENCODE_VERBOSE
		if(p_pair->n_code_word == 0) {
			printf("(EOB) = %d(%d) ", n_bit_num, n_code_word);
		} else if(p_pair->n_code_word == 0xf0) {
			printf("(16x0) = %d(%d) ", n_bit_num, n_code_word);
		} else {
			printf("(%d:%d, %d) = %d(%d) ", p_pair->n_code_word >> 4,
				p_pair->n_code_word & 0xf, p_pair->n_coeff, n_bit_num, n_code_word);
		}
		// debug
#endif

		_ASSERTE(n_code_word >= 0 && n_code_word < 65536);
		// the code word was masked by 0xffff above; the bound is the same
		// as in the huffman coder variant of this function
		_ASSERTE(n_bit_num >= 1 && n_bit_num <= 16);
		// a length of zero would mean the symbol has no entry in the table
		if(!r_bit_writer.WriteNumber(n_bit_num, n_code_word, p_out_file))
			return false;
		// write code

		if((p_pair->n_code_word & 0x0f) && !r_bit_writer.WriteNumber(p_pair->n_code_word &
		   0xf, p_pair->n_coeff, p_out_file))
			return false;
		// write number (in case its length, stored in the low nibble
		// of the symbol, is greater than 0 (value != 0))
	}

#ifdef __JPEG_ENC_HUFF_ENCODE_VERBOSE
	printf("\n");
	// debug
#endif

	return true;
}

#endif // __JPEG_ENC_LOOKUP_CHUFFCODER

/*
 *	template <class TPixelType, class CColorConvertor>
 *	inline void CJpegEncoder::Get_ImageBlock(TPixelType *p_dest_block, const uint32_t *p_image,
 *		int n_scanline_width, int n_residual_width, int n_residual_height, int n_sample_width,
 *		int n_sample_height, const CColorConvertor &r_conv)
 *		- fill the 8x8 (64 element) block p_dest_block with one colour component taken
 *		  from the pixels starting at p_image; 128 is subtracted from every stored sample
 *		- TPixelType is destination image block pixel type (should be either float or int / short)
 *		- n_scanline_width is the distance (in pixels) between two consecutive source rows
 *		- n_residual_width and n_residual_height is width and height of rectangle given by
 *		  pixel p_image points to and bottom-right corner; the block is zero-filled
 *		  if either of them is not positive
 *		- n_sample_width and n_sample_height is width and height of the sampling window
 *		  which is reduced to a single block element (if __JPEG_ENC_FAST_CHROMINANCE_SAMPLING
 *		  is defined, only the top-left pixel of the window is used, otherwise the window
 *		  pixels inside the residual rectangle are averaged)
 *		- r_conv is color convertor object used to separate desired component (it has to have
 *		  member function int16_t n_Color(uint32_t))
 *		- block columns / rows which have no source pixels are filled by repeating the
 *		  last filled column / row
 */
template <class TPixelType, class CColorConvertor>
inline void CJpegEncoder::Get_ImageBlock(TPixelType *p_dest_block, const uint32_t *p_image,
	int n_scanline_width, int n_residual_width, int n_residual_height, int n_sample_width,
	int n_sample_height, const CColorConvertor &r_conv)
{
	if(n_residual_width <= 0 || n_residual_height <= 0) {
		memset(p_dest_block, 0, 64 * sizeof(TPixelType));
		return;
	}
	// nothing of the image is left for this block, output zeros

	uint8_t n_block_height;
	// number of source image rows covered by the block (set in both branches below)
	if(n_sample_width == 1 && n_sample_height == 1) {
		// no subsampling: one source pixel per block element

		uint8_t n_block_width = (n_residual_width > 8)? 8 : n_residual_width;
		n_block_height = (n_residual_height > 8)? 8 : n_residual_height;
		// size of 8x8 block, intersected with source image

		const uint32_t *p_src_scanline = p_image;
		for(TPixelType *p_dest_scanline = p_dest_block, *p_dest_sc_end = p_dest_block +
		   8 * n_block_height; p_dest_scanline < p_dest_sc_end; p_dest_scanline += 8) {
			const uint32_t *p_src = p_src_scanline; 
			for(TPixelType *p_dest = p_dest_scanline, *p_end = p_dest_scanline +
			   n_block_width; p_dest < p_end;)
				*p_dest ++ = (TPixelType)(r_conv.n_Color(*p_src ++) - 128);
			p_src_scanline += n_scanline_width;
			// convert one row of source pixels, then step to the next source row

			for(TPixelType *p_dest = p_dest_scanline + n_block_width,
			   f_src = *(p_dest_scanline + n_block_width - 1),
			   *p_end = p_dest_scanline + 8; p_dest < p_end;)
				*p_dest ++ = f_src;
			// extend last filled row right so the 8x8 block will not contain empty space
			// (that should reduce high frequencies in DCT spectrum and hopefully improve
			// compression ratio for border blocks)
		}
	} else {
		// subsampling: every block element is made of a window of source pixels

		uint8_t n_block_width = (n_residual_width > 8 * n_sample_width)?
			8 * n_sample_width : n_residual_width;
		n_block_height = (n_residual_height > 8 * n_sample_height)?
			8 * n_sample_height : n_residual_height;
		// size of 8x8 block (measured in source pixels), intersected with source image

		TPixelType *p_dest_scanline = p_dest_block;
		for(const uint32_t *p_src_scanline = p_image, *p_src_scanline_end = p_image +
		   n_block_height * n_scanline_width; p_src_scanline < p_src_scanline_end;
		   p_src_scanline += n_scanline_width * n_sample_height) {
			TPixelType *p_dest_scanline_end = p_dest_scanline + 8;
			// one pass of this loop produces one row of the destination block

			for(const uint32_t *p_src = p_src_scanline, *p_end = p_src_scanline +
			   n_block_width; p_src < p_end; p_src += n_sample_width) {
				// p_src points to the top-left pixel of the current sampling window
#ifdef __JPEG_ENC_FAST_CHROMINANCE_SAMPLING
				*p_dest_scanline ++ = r_conv.n_Color(*p_src) - 128;
				// take just one sample from upper left window corner
#else
				int n_sample_num = 0;
				TPixelType f_sum = 0;
				// window rows: stop at the window bottom or at the first row
				// below the block, whichever comes first
				for(const uint32_t *p_sam_scan = p_src, *p_sam_end = ((p_src +
				   n_sample_height * n_scanline_width > p_src_scanline_end)? p_src_scanline_end :
				   p_src + n_sample_height * n_scanline_width);
				   p_sam_scan < p_sam_end; p_sam_scan += n_scanline_width) {
					// window columns: stop at the window right edge or after the last
					// pixel of the residual width, whichever comes first
					for(const uint32_t *p_sam_pixel = p_sam_scan, *p_sam_pixel_end =
					   (((n_residual_width - (p_src - p_src_scanline)) <= n_sample_width)?
					   p_sam_pixel + (n_residual_width - (p_src - p_src_scanline)) :
					   p_sam_scan + n_sample_width); p_sam_pixel < p_sam_pixel_end;) {
						f_sum += r_conv.n_Color(*p_sam_pixel ++);
						n_sample_num ++;
					}
				}
				if(n_sample_num)
					*p_dest_scanline ++ = f_sum / n_sample_num - 128;
				else
					*p_dest_scanline ++ = -128;
				// average the whole window (an empty window yields -128)
#endif
			}

			for(TPixelType f_src = p_dest_scanline[-1]; p_dest_scanline < p_dest_scanline_end;)
				*p_dest_scanline ++ = f_src;
			// extend last filled row right so the 8x8 block will not contain empty space
			// (that should reduce high frequencies in DCT spectrum and hopefully improve
			// compression ratio for border blocks)
		}
	}
	// copy source image to 8x8 block

	n_block_height += n_sample_height - 1;
	n_block_height /= n_sample_height;
	// convert block height from image pixels to block (scaled down) pixels, rounding up

	for(TPixelType *p_src_scanline = p_dest_block + 8 * (n_block_height - 1),
	   *p_dest_sc_end = p_dest_block + 64, *p_dest_scanline = p_dest_block + 8 *
	   n_block_height; p_dest_scanline < p_dest_sc_end; p_dest_scanline += 8)
		memcpy(p_dest_scanline, p_src_scanline, 8 * sizeof(TPixelType));
	// extend last filled line down so the 8x8 block will not contain empty space
	// (that should reduce high frequencies in DCT spectrum and hopefully improve
	// compression ratio for border blocks)
}

/*
 *	template <class TPixelType>
 *	void CJpegEncoder::FDCT_Quantize_Block(int16_t *p_quantized_block,
 *		const TPixelType *p_image_block, const TQuantTable *p_quant_table)
 *		- calculate forward DCT of the 64 samples in p_image_block, quantize the result
 *		  using p_quant_table and store the 64 quantized values in p_quantized_block
 *		- with __JPEG_ENC_USE_REFERENCE_DCT the coefficients are divided by
 *		  p_quant_table->p_store_value and truncated; otherwise the fast DCT is used
 *		  and p_quant_table->p_value is applied with rounding away from zero
 *		  (division of a 16.16 fixed-point value if __JPEG_ENC_INTEGER_QUANT is
 *		  defined, floating-point multiplication otherwise)
 *		- used FDCT algorithm is of AA&N family, it requires quantization table to be pre-scaled
 *		  (handled in Set_QuantTable())
 */
template <class TPixelType>
void CJpegEncoder::FDCT_Quantize_Block(int16_t *p_quantized_block,
	const TPixelType *p_image_block, const TQuantTable *p_quant_table)
{
#ifdef __JPEG_ENC_ENCODE_VERBOSE
	printf("\nimage block:\n");
	for(int i = 0; i < 64; i ++) {
		if(!(i % 8))
			printf("\t");
		printf("%5d%c", (int)p_image_block[i], ((i % 8) == 7)? '\n' : ' ');
	}
	// debug
#endif

	TPixelType p_dct_block[64];
	const TPixelType *p_dct = p_dct_block;
	// frequency-domain copy of the block and a read cursor into it

#ifdef __JPEG_ENC_USE_REFERENCE_DCT
	CReferenceDCT8_2D::Forward(p_dct_block, p_image_block);
#else
	CFastDCT8_2D::Forward(p_dct_block, p_image_block);
#endif
	// calc forward DCT

#ifdef __JPEG_ENC_ENCODE_VERBOSE
	printf("\nFDCT block:\n");
	for(int i = 0; i < 64; i ++) {
		if(!(i % 8))
			printf("\t");
		printf("%7.2f%c", (float)p_dct_block[i], ((i % 8) == 7)? '\n' : ' ');
	}
	// debug
#endif

#ifdef __JPEG_ENC_USE_REFERENCE_DCT
	const uint8_t *p_quant_coeff = p_quant_table->p_store_value;
	for(int16_t *p_dest = p_quantized_block, *p_end = p_quantized_block + 64; p_dest < p_end;)
		*p_dest ++ = (int16_t)(*p_dct ++ / (float)*p_quant_coeff ++);
	// plain division by the stored table values
#else
#ifdef __JPEG_ENC_INTEGER_QUANT
	const int *p_quant_coeff = p_quant_table->p_value;
	for(int16_t *p_dest = p_quantized_block, *p_end = p_quantized_block + 64; p_dest < p_end; p_dct ++)
		*p_dest ++ = (((int)*p_dct * 0x10000) + ((*p_dct >= 0)? 0x8000 : -0x8000)) / *p_quant_coeff ++;
	// the coefficient is scaled to 16.16 fixed point by multiplication; shifting
	// a negative value left is undefined before C++20, the product is the same number
#else
	const float *p_quant_coeff = p_quant_table->p_value;
	for(int16_t *p_dest = p_quantized_block, *p_end = p_quantized_block + 64; p_dest < p_end; p_dct ++)
		*p_dest ++ = (int16_t)(*p_dct * *p_quant_coeff ++ + ((*p_dct >= 0)? .5f : -.5f));
#endif
	// extra rounding (rounding direction is important! simple rounding introduces ugly
	// compression artefacts)
#endif
	// quantize
}

/*
 *	void CJpegEncoder::RLE_Encode_Block(TRLEBlockData &r_block, const int16_t *p_quantized_block)
 *		- convert the 64 quantized coefficients in p_quantized_block to (zero run,
 *		  coefficient) pairs, written to r_block.p_pair; number of pairs is
 *		  stored in r_block.n_pair_num
 *		- the first pair holds coefficient 0 (dc), the others are visited in the
 *		  order given by m_p_zig_indices
 *		- every 16 zeros preceding a nonzero coefficient are written as code 0xf0,
 *		  zeros at the end of the block are replaced by a single code 0x00 (EOB)
 */
void CJpegEncoder::RLE_Encode_Block(TRLEBlockData &r_block, const int16_t *p_quantized_block)
{
#ifdef __JPEG_ENC_ENCODE_VERBOSE
	printf("\nFDCT, quantized, diff DC coded block:\n");
	for(int n_elem = 0; n_elem < 64; ++ n_elem) {
		if(n_elem % 8 == 0)
			printf("\t");
		printf("%5d%c", (int)p_quantized_block[n_elem], (n_elem % 8 == 7)? '\n' : ' ');
	}
	// debug
#endif

	TRLEBlockData::TRLEPair *p_out = r_block.p_pair;
	// output cursor

	*p_out = TRLEBlockData::TRLEPair(p_quantized_block[0]);
	++ p_out;
	// dc goes first, it never carries a zero run

#ifdef __JPEG_ENC_ENCODE_VERBOSE
	printf("DC(%d, %d) ", p_out[-1].n_code_word, p_out[-1].n_coeff);
	// debug
#endif

	uint8_t n_run = 0;
	// zeros seen since the last nonzero coefficient
	for(int n_zig = 1; n_zig < 64; ++ n_zig) {
		const int16_t n_value = p_quantized_block[m_p_zig_indices[n_zig]];
		if(n_value == 0) {
			++ n_run;
			continue;
		}
		// zeros are only counted

		for(; n_run >= 16; n_run -= 16) {
			p_out->n_code_word = 0xf0; // (16x0)
			++ p_out;

#ifdef __JPEG_ENC_ENCODE_VERBOSE
			printf("(16x0) ");
			// debug
#endif
		}
		// the run nibble only holds 0 - 15, longer runs need escape codes

		*p_out = TRLEBlockData::TRLEPair(n_value);
		p_out->n_code_word |= n_run << 4;
		++ p_out;
		// store the coefficient with the remaining run in the upper nibble

#ifdef __JPEG_ENC_ENCODE_VERBOSE
		printf("AC(%d:%d, %d) ", p_out[-1].n_code_word >> 4,
			p_out[-1].n_code_word & 0xf, p_out[-1].n_coeff);
		// debug
#endif

		n_run = 0;
	}
	// ac coefficients

	if(n_run != 0) {
		p_out->n_code_word = 0x00; // (EOB)
		++ p_out;

#ifdef __JPEG_ENC_ENCODE_VERBOSE
		printf("(EOB)");
		// debug
#endif
	}
	// the block ends with zeros, say so with end-of-block code

	int n_coeff_num = 0;
	for(const TRLEBlockData::TRLEPair *p_check = r_block.p_pair; p_check < p_out; ++ p_check)
		n_coeff_num += (p_check->n_code_word >> 4) + 1;
	_ASSERTE((p_out[-1].n_code_word == 0x00 && n_coeff_num < 64) || n_coeff_num == 64);
	// sanity check: the pairs account for all 64 coefficients, or end with EOB

	r_block.n_pair_num = p_out - r_block.p_pair;
	// number of pairs written

#ifdef __JPEG_ENC_ENCODE_VERBOSE
	printf("\n");
#endif
	// debug
}

/*
 *	bool CJpegEncoder::Write_JFIFHeader(COutputFile *p_output_file)
 *		- write application segment 0 with JFIF identification: version 1.1,
 *		  density 72 x 72 with unit byte 1, thumbnail size 0 x 0
 *		- return true on success, false as soon as any write to p_output_file fails
 */
bool CJpegEncoder::Write_JFIFHeader(COutputFile *p_output_file)
{
#ifdef __JPEG_ENC_VERBOSE
	printf("write header [p_s_signature=\'%s\', v_version=%d.%d]\n",
		"JFIF", 1, 1);
	// debug
#endif

	if(!p_output_file->Write_Short((int16_t)marker_ReservedAppSeg_0) || // marker
	   !p_output_file->Write_Short(16)) // length=16
		return false;
	// segment header

	static const uint8_t p_ident[] = {
		'J', 'F', 'I', 'F', 0, // JFIF\0
		1, 1, // ver. 1.1
		1 // unit == inches
	};
	for(size_t i = 0; i < sizeof(p_ident) / sizeof(p_ident[0]); ++ i) {
		if(!p_output_file->Write_Byte(p_ident[i]))
			return false;
	}
	// identifier, version and density unit

	if(!p_output_file->Write_Short(72) ||
	   !p_output_file->Write_Short(72)) // 72 DPI
		return false;
	// horizontal and vertical density

	if(!p_output_file->Write_Byte(0))
		return false;
	return p_output_file->Write_Byte(0);
	// thumbnail size
}

/*
 *	bool CJpegEncoder::Write_QuantTables(COutputFile *p_output_file)
 *		- write those of the two tables in m_p_quant_table which have b_write set
 *		- by default all of them share a single marker + length header; in case
 *		  __JPEG_ENC_WRITE_SINGLE_QUANT_TABLE_PER_DQT_BLOCK is defined, every table
 *		  gets a header of its own
 *		- a table is written as one properties byte (table index) followed by the
 *		  64 bytes of p_store_value, reordered using m_p_zig_indices
 *		- return true on success, false as soon as any write to p_output_file fails
 */
bool CJpegEncoder::Write_QuantTables(COutputFile *p_output_file)
{
#ifndef __JPEG_ENC_WRITE_SINGLE_QUANT_TABLE_PER_DQT_BLOCK
	int n_written_table_num = 0;
	for(int n_table = 0; n_table < 2; ++ n_table) {
		if(m_p_quant_table[n_table].b_write)
			++ n_written_table_num;
	}
	// the common header needs to know how many tables follow

	if(!p_output_file->Write_Short((int16_t)marker_DefineQuantTables))
		return false;
	if(!p_output_file->Write_Short(2 + 65 * n_written_table_num))
		return false;
#endif

	for(int n_table = 0; n_table < 2; ++ n_table) {
		const TQuantTable &r_table = m_p_quant_table[n_table];
		if(!r_table.b_write)
			continue;

#ifdef __JPEG_ENC_WRITE_SINGLE_QUANT_TABLE_PER_DQT_BLOCK
		if(!p_output_file->Write_Short((int16_t)marker_DefineQuantTables))
			return false;
		if(!p_output_file->Write_Short(2 + 65))
			return false;
#endif

		const uint8_t n_table_properties = n_table; // table index
		// the upper nibble stays zero (8-bit, '1 << 4' would mean 16)

		if(!p_output_file->Write_Byte(n_table_properties))
			return false;
		for(int n_zig = 0; n_zig < 64; ++ n_zig) {
			if(!p_output_file->Write_Byte(r_table.p_store_value[(int)m_p_zig_indices[n_zig]]))
				return false;
		}
		// values go to the file in zig-zag order

#ifdef __JPEG_ENC_VERBOSE
		printf("write quant table [n_index=%d, n_precision=%d]\n", n_table_properties & 0x0f,
			n_table_properties >> 4);
		for(int n_elem = 0; n_elem < 64; ++ n_elem) {
			if(n_elem % 8 == 0)
				printf("\t");
			printf("%5d%c", (int)(r_table.p_store_value[n_elem]), (n_elem % 8 == 7)? '\n' : ' ');
		}
		// debug
#endif
	}
	// write quant tables

	return true;
}

/*
 *	bool CJpegEncoder::Write_HuffmanTables(COutputFile *p_output_file,
 *		const THuffmanTable *p_huffman_table, int n_table_num, bool b_write_dc_only = false)
 *		- write huffman tables; by default all tables share a single marker + length
 *		  header, in case __JPEG_ENC_WRITE_SINGLE_HUFF_TABLE_PER_DQT_BLOCK is defined,
 *		  every table gets a header of its own (4 more bytes per additional table)
 *		- p_huffman_table is pointer to array of 2 * n_table_num huffman tables (dc + ac)
 *		  every (i * 2) th element is dc, every (i * 2 + 1) th element is ac
 *		- if b_write_dc_only is set, only the even (dc) elements are written
 *		- a table is written as one properties byte (index in the low nibble, dc / ac
 *		  flag in bit 4), 16 bytes with number of codes of each length and the
 *		  symbols from p_code_table
 *		- nothing is written and true is returned if n_table_num is zero
 *		- return true on success, false on write error (not enough dest drive space)
 */
bool CJpegEncoder::Write_HuffmanTables(COutputFile *p_output_file,
	const THuffmanTable *p_huffman_table, int n_table_num, bool b_write_dc_only)
{
	if(!n_table_num)
		return true;
	// no tables, no segment

#ifdef __JPEG_ENC_WRITE_SINGLE_HUFF_TABLE_PER_DQT_BLOCK
	// NOTE(review): this branch reads THuffmanTable::n_code_num while the #else branch
	// reads p_code_num; the struct declaration is not visible here - presumably only
	// one of the names exists, confirm this branch still compiles
	for(int i = 0; i < n_table_num * 2; i += (b_write_dc_only)? 2 : 1) {
		int n_table_element_num = 0;
		for(int j = 0; j < 16; j ++)
			n_table_element_num += p_huffman_table[i].n_code_num[j];
		// total number of symbols in this table
		if(!p_output_file->Write_Short((int16_t)marker_HuffmanTable) ||
		   !p_output_file->Write_Short(2 + 17 + n_table_element_num))
			return false;
		// length: 2 length bytes + 17 (1 property byte + 16 code number bytes) + symbols

		uint8_t n_table_properties;
		n_table_properties = i / 2; // index
		n_table_properties |= (i & 1) << 4; // DC / AC
		if(!p_output_file->Write_Byte(n_table_properties))
			return false;
		for(int j = 0; j < 16; j ++) {
			if(!p_output_file->Write_Byte(p_huffman_table[i].n_code_num[j]))
				return false;
		}
		// number of codes for each of the 16 code lengths
		for(const uint8_t *p_code_byte = p_huffman_table[i].p_code_table,
		   *p_cb_end = p_huffman_table[i].p_code_table + n_table_element_num;
		   p_code_byte < p_cb_end; p_code_byte ++) {
			if(!p_output_file->Write_Byte(*p_code_byte))
				return false;
		}
		// the symbols themselves

#ifdef __JPEG_ENC_VERBOSE
		printf("write huffman table [n_index=%d, b_AC=%d] with %d components\n\t",
			n_table_properties & 0xf, n_table_properties >> 4, n_table_element_num);
		for(int j = 0; j < 16; j ++)
			printf("%d%c", p_huffman_table[i].n_code_num[j], (j == 15)? '\n' : ' ');
		printf("it means this table defines those symbols:\n");
		int n_code = 0;
		// first code of the current length; grows by the number of codes
		// and doubles with every length
		for(int j = 0; j < 16; j ++) {
			if(p_huffman_table[i].n_code_num[j]) {
				printf("\t%d bit codes %d-%d == <%d,%d): ", j + 1, n_code,
					n_code + p_huffman_table[i].n_code_num[j] - 1, n_code,
					n_code + p_huffman_table[i].n_code_num[j]);
				for(int k = n_code; k < n_code + p_huffman_table[i].n_code_num[j]; k ++) {
					for(int n = 0; n <= j; n ++)
						printf("%d", (k >> (j - n)) & 1);
					printf(" ");
				}
				printf("\n");
			}
			n_code += p_huffman_table[i].n_code_num[j];
			n_code *= 2;
		}
		// debug
#endif
	}
#else
	int n_tot_table_element_num = 0;
	for(int i = 0; i < n_table_num * 2; i += (b_write_dc_only)? 2 : 1) {
		for(int j = 0; j < 16; j ++)
			n_tot_table_element_num += p_huffman_table[i].p_code_num[j];
	}
	// total number of symbols in all the tables to be written

	if(!p_output_file->Write_Short((int16_t)marker_HuffmanTable) ||
	   !p_output_file->Write_Short(2 + 17 * n_table_num * ((b_write_dc_only)? 1 : 2) +
	   n_tot_table_element_num))
		return false;
	// length: 2 length bytes + 17 per table (1 property byte + 16 code number bytes) + symbols

	for(int i = 0; i < n_table_num * 2; i += (b_write_dc_only)? 2 : 1) {
		int n_table_element_num = 0;
		for(int j = 0; j < 16; j ++)
			n_table_element_num += p_huffman_table[i].p_code_num[j];
		// number of symbols in this table

		uint8_t n_table_properties;
		n_table_properties = i / 2; // index
		n_table_properties |= (i & 1) << 4; // DC / AC
		if(!p_output_file->Write_Byte(n_table_properties))
			return false;
		for(int j = 0; j < 16; j ++) {
			if(!p_output_file->Write_Byte(p_huffman_table[i].p_code_num[j]))
				return false;
		}
		// number of codes for each of the 16 code lengths
		for(const uint8_t *p_code_byte = p_huffman_table[i].p_code_table,
		   *p_cb_end = p_huffman_table[i].p_code_table + n_table_element_num;
		   p_code_byte < p_cb_end; p_code_byte ++) {
			if(!p_output_file->Write_Byte(*p_code_byte))
				return false;
		}
		// the symbols themselves

#ifdef __JPEG_ENC_VERBOSE
		printf("write huffman table [n_index=%d, b_AC=%d] with %d components\n\t",
			n_table_properties & 0xf, n_table_properties >> 4, n_table_element_num);
		for(int j = 0; j < 16; j ++)
			printf("%d%c", p_huffman_table[i].p_code_num[j], (j == 15)? '\n' : ' ');
		printf("it means this table defines those symbols:\n");
		int n_code = 0;
		// first code of the current length; grows by the number of codes
		// and doubles with every length
		for(int j = 0; j < 16; j ++) {
			if(p_huffman_table[i].p_code_num[j]) {
				printf("\t%d bit codes %d-%d == <%d,%d): ", j + 1, n_code,
					n_code + p_huffman_table[i].p_code_num[j] - 1, n_code,
					n_code + p_huffman_table[i].p_code_num[j]);
				for(int k = n_code; k < n_code + p_huffman_table[i].p_code_num[j]; k ++) {
					for(int n = 0; n <= j; n ++)
						printf("%d", (k >> (j - n)) & 1);
					printf(" ");
				}
				printf("\n");
			}
			n_code += p_huffman_table[i].p_code_num[j];
			n_code *= 2;
		}
		// debug
#endif
	}
#endif
	// write huffman tables

	return true;
}

/*
 *	bool CJpegEncoder::Write_FrameHeader(COutputFile *p_output_file, uint16_t n_marker,
 *		int n_width, int n_height, int n_sample_precission, int n_component_num,
 *		const TComponent *p_component)
 *		- write frame header segment: n_marker, segment length, n_sample_precission,
 *		  n_height, n_width and n_component_num, followed by three bytes for each of
 *		  n_component_num elements of p_component (component id, horizontal sampling
 *		  in the upper and vertical sampling in the lower nibble, quant table index)
 *		- return true on success, false as soon as any write to p_output_file fails
 */
bool CJpegEncoder::Write_FrameHeader(COutputFile *p_output_file, uint16_t n_marker,
	int n_width, int n_height, int n_sample_precission, int n_component_num,
	const TComponent *p_component)
{
#ifdef __JPEG_ENC_VERBOSE
	printf("write frame header [n_bpp=%d, n_width=%d, n_height=%d, n_component_num=%d]\n",
		n_sample_precission, n_width, n_height, n_component_num);
	// debug
#endif

	const bool b_header_written =
		p_output_file->Write_Short(n_marker) &&
		p_output_file->Write_Short(8 + 3 * n_component_num) && // length
		p_output_file->Write_Byte(n_sample_precission) && // <n_sample_precission> bpp
		p_output_file->Write_Short(n_height) && // Y
		p_output_file->Write_Short(n_width) && // X
		p_output_file->Write_Byte(n_component_num); // component num
	if(!b_header_written)
		return false;
	// fixed part of the header

	for(int n_comp = 0; n_comp < n_component_num; ++ n_comp) {
		const TComponent &r_comp = p_component[n_comp];

		if(!p_output_file->Write_Byte(r_comp.n_component_id))
			return false;
		if(!p_output_file->Write_Byte((r_comp.n_sampling_horiz << 4) |
		   r_comp.n_sampling_vert))
			return false;
		if(!p_output_file->Write_Byte(r_comp.n_quant_table_index))
			return false;
	}
	// three bytes per component

	return true;
}

/*
 *	bool CJpegEncoder::Write_StartScan(COutputFile *p_output_file, int n_component_num,
 *		const TComponent *p_component, uint8_t n_start_of_spectrum_predictor,
 *		uint8_t n_end_of_spectrum)
 *		- write start-scan segment: marker, segment length and n_component_num, two
 *		  bytes for each of n_component_num elements of p_component (component id,
 *		  dc huffman coder index in the upper and ac huffman coder index in the
 *		  lower nibble) and finally n_start_of_spectrum_predictor, n_end_of_spectrum
 *		  and a zero byte
 *		- n_start_of_spectrum_predictor is start of spectrum (0) in DCT mode or predictor
 *		  in lossless mode
 *		- n_end_of_spectrum is end of spectrum (63) in DCT mode or zero in lossless mode
 *		- return true on success, false as soon as any write to p_output_file fails
 */
bool CJpegEncoder::Write_StartScan(COutputFile *p_output_file, int n_component_num,
	const TComponent *p_component, uint8_t n_start_of_spectrum_predictor,
	uint8_t n_end_of_spectrum)
{
#ifdef __JPEG_ENC_VERBOSE
	printf("start of scan [n_component_num=%d]\n", n_component_num);
	// debug
#endif

	const bool b_header_written =
		p_output_file->Write_Short((int16_t)marker_StartScan) &&
		p_output_file->Write_Short(6 + 2 * n_component_num) && // length
		p_output_file->Write_Byte(n_component_num);
	if(!b_header_written)
		return false;
	// fixed part of the header

	for(int n_comp = 0; n_comp < n_component_num; ++ n_comp) {
		const TComponent &r_comp = p_component[n_comp];

#ifdef __JPEG_ENC_VERBOSE
		printf("\tcomponent [n_component_id=%d, n_DC_entropy_table=%d, n_AC_entropy_table=%d]\n",
			r_comp.n_component_id, r_comp.n_dc_huff_coder_index,
			r_comp.n_ac_huff_coder_index);
		// debug
#endif

		if(!p_output_file->Write_Byte(r_comp.n_component_id))
			return false;
		if(!p_output_file->Write_Byte((r_comp.n_dc_huff_coder_index << 4) |
		   r_comp.n_ac_huff_coder_index))
			return false;
	}
	// two bytes per component

	return p_output_file->Write_Byte(n_start_of_spectrum_predictor) && // min coeff
		   p_output_file->Write_Byte(n_end_of_spectrum) && // max coeff
		   p_output_file->Write_Byte(0); // successive approximation
}

/*
 *								=== ~CJpegEncoder ===
 */

/*
 *								=== CJpegEncoder::TRLEBlockData::TRLEPair ===
 */

/*
 *	CJpegEncoder::TRLEBlockData::TRLEPair::TRLEPair(int16_t n_x)
 *		- non-default constructor
 *		- n_code_word is set to the number of significant bits of absolute value of n_x
 *		  (preceding zero num has yet to be shifted << 4 and or-ed by the caller)
 *		- n_coeff is set to n_x if it is not negative, or to n_x + 2^bits - 1 if it is
 */
CJpegEncoder::TRLEBlockData::TRLEPair::TRLEPair(int16_t n_x)
{
	const bool b_negative = n_x < 0;

	const uint8_t n_size = n_SignificantBit_Num((b_negative)? -n_x : n_x);
	// bit length is taken from the magnitude

	if(b_negative)
		n_x += (1 << n_size) - 1;
	// negative values are offset by 2^n_size - 1, which leaves their top
	// (n_size-th) bit clear

	n_code_word = n_size;
	n_coeff = n_x;
}

/*
 *	uint8_t CJpegEncoder::TRLEBlockData::TRLEPair::n_SignificantBit_Num(int16_t n_x)
 *		- return number of significant bits in n_x, that is 1-based position of the
 *		  highest bit set in its 16-bit pattern (n_x as-is, no sign-dependent ops
 *		  involved); returns 0 for zero and 16 for any negative value
 */
uint8_t CJpegEncoder::TRLEBlockData::TRLEPair::n_SignificantBit_Num(int16_t n_x)
{
	uint16_t n_bits = (uint16_t)n_x;
	// work on the unsigned bit pattern, so shifting right always brings zeros in
	// (no dependence on how the compiler shifts negative numbers)

	uint8_t n_result = 0;
	for(; n_bits; n_bits >>= 1)
		++ n_result;
	// count the shifts needed to get rid of the highest 1-bit

	return n_result;
}

/*
 *								=== ~CJpegEncoder::TRLEBlockData::TRLEPair ===
 */

/*
 *								=== CJpegEncoder::COutputFile ===
 */

/*
 *	CJpegEncoder::COutputFile::COutputFile(FILE *p_fw)
 *		- default constructor
 *		- p_fw is the stream which is stored in m_p_fw; Write_Byte() and Write_Short()
 *		  output data to it using fwrite()
 */
CJpegEncoder::COutputFile::COutputFile(FILE *p_fw)
	:m_p_fw(p_fw)
{
	// the pointer is only remembered here
}

/*
 *	bool CJpegEncoder::COutputFile::Write_Byte(uint8_t n_value)
 *		- write n_value to the output stream as a single byte
 *		- return true if fwrite() reports the byte as written, otherwise false
 */
bool CJpegEncoder::COutputFile::Write_Byte(uint8_t n_value)
{
	const size_t n_written = fwrite(&n_value, sizeof(uint8_t), 1, m_p_fw);
	return n_written == 1;
}

/*
 *	bool CJpegEncoder::COutputFile::Write_Short(int16_t n_value)
 *		- write int16_t as two bytes in hi-lo order (need hi byte first to be able
 *		  to recognize markers when reading)
 *		- the bytes are extracted by shifting, the order in the file therefore
 *		  does not depend on byte order of the machine running the encoder
 *		- return true on success, false on io error (not enough space or invalid stream)
 */
bool CJpegEncoder::COutputFile::Write_Short(int16_t n_value)
{
	const uint16_t n_bits = (uint16_t)n_value;
	// the same 16 bits, unsigned, so that the shift below is free of sign issues

	const uint8_t n_hi = (uint8_t)(n_bits >> 8);
	const uint8_t n_lo = (uint8_t)(n_bits & 0xff);

	return fwrite(&n_hi, sizeof(uint8_t), 1, m_p_fw) == 1 &&
		   fwrite(&n_lo, sizeof(uint8_t), 1, m_p_fw) == 1;
}

/*
 *								=== ~CJpegEncoder::COutputFile ===
 */

/*
 *								=== CJpegEncoder::CBitWriter ===
 */

/*
 *	CJpegEncoder::CBitWriter::CBitWriter()
 *		- default constructor
 *		- starts with no accumulated bits; the byte accumulator is cleared as well,
 *		  because WriteBit() shifts it before it assigns to it
 */
CJpegEncoder::CBitWriter::CBitWriter()
	:m_n_bit_num(0)
{
	m_n_byte = 0;
	// do not leave the accumulator indeterminate
}

/*
 *	bool CJpegEncoder::CBitWriter::WriteBit(bool b_bit, COutputFile *p_output_file)
 *		- append a single bit b_bit to the accumulated byte; once eight bits are
 *		  gathered, the byte is written to p_output_file
 *		- in case the written byte is 0xff, extra byte 0x00 is written right after it
 *		  (it will be skipped by decoder, purpose is to prevent marker-like codes
 *		  to occur in data segments)
 *		- return true on success (including the case nothing had to be written),
 *		  false on io error
 */
bool CJpegEncoder::CBitWriter::WriteBit(bool b_bit, COutputFile *p_output_file)
{
	m_n_byte <<= 1;
	if(b_bit)
		m_n_byte |= 1;
	// the new bit comes in at the bottom

	if(m_n_bit_num != 7) {
		++ m_n_bit_num;
		return true;
	}
	// the byte is not complete yet

	m_n_bit_num = 0;
	if(!p_output_file->Write_Byte(m_n_byte))
		return false;
	// flush the complete byte

	if(m_n_byte == 0xff) // !! byte after 0xff must be 0x00, decoder will discard it
		return p_output_file->Write_Byte(0x00);
	return true;
}

/*
 *	bool CJpegEncoder::CBitWriter::WriteNumber(int n_bit_num, int n_value,
 *		COutputFile *p_output_file)
 *		- write n_bit_num bit long number with value n_value (any higher bits are ignored)
 *		  to output file p_output_file, the most significant bit goes first
 *		- negative n_value is offset by 2^n_bit_num - 1 before writing
 *		- nothing is written and true is returned if n_bit_num is zero or negative
 *		- return true on success, false on io error (not enough space or invalid stream)
 */
bool CJpegEncoder::CBitWriter::WriteNumber(int n_bit_num, int n_value, COutputFile *p_output_file)
{
	if(n_bit_num <= 0)
		return true;
	// no bits to write; also keeps the shift by (n_bit_num - 1) below defined

	if(n_value < 0)
		n_value -= (int16_t)((0xffff ^ ((1 << n_bit_num) - 1)) | 1);
	// make two's complement

	uint16_t n_mask = 1 << (n_bit_num - 1);
	// selects the bit to be written, starts at the top one
	for(int i = 0; i < n_bit_num; i ++, n_mask >>= 1) {
		if(!WriteBit((n_value & n_mask) != 0, p_output_file))
			return false;
	}
	// very simple, shall be slow

	return true;
}

/*
 *	bool CJpegEncoder::CBitWriter::PaddByte(COutputFile *p_output_file)
 *		- in case there are some (< 8) unwritten bits, it's necessary to padd byte with ones
 *		  and write it to the file
 *		- in case the padded byte is 0xff, extra byte 0x00 is written right after it,
 *		  the same way WriteBit() does (it will be skipped by decoder, purpose is
 *		  to prevent marker-like codes to occur in data segments)
 *		- return true on success, false on io error (not enough space or invalid stream)
 */
bool CJpegEncoder::CBitWriter::PaddByte(COutputFile *p_output_file)
{
	if(!m_n_bit_num)
		return true;
	// no need for padding

	m_n_byte <<= 8 - m_n_bit_num;
	m_n_byte |= (1 << (8 - m_n_bit_num)) - 1;
	// padd with 1-s

	m_n_bit_num = 0;
	if(!p_output_file->Write_Byte(m_n_byte))
		return false;
	// write the last byte

	if(m_n_byte == 0xff) // all the pending bits were ones, the byte needs stuffing as well
		return p_output_file->Write_Byte(0x00);
	return true;
}

/*
 *								=== ~CJpegEncoder::CBitWriter ===
 */

/*
 *								=== CQuantTableFactory ===
 */

/*
 *	void CQuantTableFactory::Calc_QuantTable(uint8_t *p_output,
 *		bool b_luminance, float f_quality)
 *		- calculate quantization table, based on simple formula: coefficients grow
 *		  exponentially with distance from the top-left (dc) element, and are
 *		  limited by a cutoff which gets lower with the distance
 *		- p_output is pointer to array of 64 unsigned chars where table coefficients
 *		  are to be output (in natural left-to-right top-to-bottom order, no zig-zag)
 *		- b_luminance selects between two sets of exponent / scale constants (the
 *		  luminance set has the lower exponent and the greater scale)
 *		- f_quality is in range 0 - 1.1, values above 1 select a separate set of
 *		  constants; it can be negative up to approx -1.97647, below that
 *		  2 + 85 * (1 - f_quality) does not fit 255 anymore
 */
void CQuantTableFactory::Calc_QuantTable(uint8_t *p_output,
	bool b_luminance, float f_quality)
{
	const bool b_top_quality = f_quality > 1;

	int n_offset = (int)((b_top_quality)? 1 : 2 + 85 * (1 - f_quality));
	int n_cutoff = (int)((b_top_quality)? 3 : 4 + 30 * (1 - f_quality));
	if(n_cutoff < n_offset) {
		const int n_lower = n_cutoff;
		n_cutoff = n_offset;
		n_offset = n_lower;
	}
	// base value and upper limit; make sure the limit is the greater one

	float f_exponent, f_scale;
	if(b_luminance) {
		if(b_top_quality) {
			f_exponent = 1.325f;
			f_scale = .25f;
		} else {
			f_exponent = 1.75f;
			f_scale = .15f;
		}
	} else {
		if(b_top_quality) {
			f_exponent = 1.5f;
			f_scale = .2f;
		} else {
			f_exponent = 2.25f;
			f_scale = .1f;
		}
	}
	// growth constants

	for(int i = 0; i < 64; ++ i) {
		const int n_row = i / 8, n_col = i % 8;
		float f_dist = float(sqrt(float(n_row * n_row + n_col * n_col)));
		// distance from the top-left element

		float f_value = n_offset + (float)pow(f_exponent, f_dist) * f_scale;
		float f_limit = n_cutoff / (f_dist * .5f * (1.1f - f_quality) + 1);
		if(f_value > f_limit)
			p_output[i] = (uint8_t)f_limit;
		else {
			_ASSERTE((int)f_value >= 1 && (int)f_value <= 255);
			p_output[i] = (uint8_t)f_value;
		}
		// the lower of the two goes to the table
	}
	// don't want to scale std tables like every other one
}

/*
 *								=== ~CQuantTableFactory ===
 */
