/*
								+---------------------------------+
								|                                 |
								|  ***   Tiny jpeg decoder   ***  |
								|                                 |
								|  Copyright   -tHE SWINe- 2006  |
								|                                 |
								|            Jpeg.cpp             |
								|                                 |
								+---------------------------------+
*/

/**
 *	@file Jpeg.cpp
 *	@author -tHE SWINe-
 *	@date 2006
 *	@brief tiny jpeg decoder (compiles to cca 3kB with MSVC)
 *
 *	@date 2006-08-30
 *
 *	passed code revision
 *
 *	write code for __JPEG_DEC_ALLOW_SUBSAMPLED_IMAGES code path (not really necessary)
 *	write code to support 12-bit extended DCT images
 *	refine code for lossless compression to eliminate rounding error when decompressing > 8bit
 *	lossless images (alloc just two int16 scanlines to remember predictor values)
 *
 *	@date 2007-09-27
 *
 *	fixed error with restart interval being set to 0 (setting it to 0 means disable it
 *	which is rather strange, it would be preferable not to mention it at all)
 *
 *	@date 2007-11-12
 *
 *	reformat (added line breaks where lines were longer than 100 characters)
 *
 *	@date 2007-12-24
 *
 *	improved linux compatibility by adding posix integer types
 *
 *	@date 2008-02-25
 *
 *	added function for loading jpegs from memory
 *
 *	@date 2008-03-04
 *
 *	using Integer.h header, using CallStack.h instead of crtdbg.h
 *	renamed CTinyJpegDecoder::CFileBuffer to CTinyJpegDecoder::CBufferredFile
 *	changed size of zigZag table back to 64 and it's type to int (speed / obfuscation purposes)
 *
 *	@date 2008-03-11
 *
 *	fixed error on line 1281 where macroblock index overrun check was "less than 63",
 *	it should've been 64 (error carried-in on 2008-03-04 update)
 *
 *	@date 2008-07-25
 *
 *	added ability to load image "thumbnails" using DC values only. while this is very
 *	fast, it works for lossy compression only. loading thumbnail of lossless image would
 *	result in access violation right now. to be fixed.
 *
 *	@date 2008-09-11
 *
 *	added ability to get image information without having to decode it, through the overloaded
 *	function CTinyJpegDecoder::t_JPEG_Info().
 *
 *	@date 2009-05-04
 *
 *	fixed mixed windows / linux line endings
 *
 */

#include "NewFix.h"

#include "CallStack.h"
#include <vector>
#include <string.h>
#include <stdio.h>
#include <math.h>
#include "Jpeg.h"
#include "DCT.h"
#include "MinMax.h"
#include "Unused.h"

#if defined(_MSC_VER) && !defined(__MWERKS__) && !defined(for) && _MSC_VER <= 1200
#define for if(0) {} else for
#endif // _MSC_VER && !__MWERKS__ && !for && _MSC_VER <= 1200
// msvc 'for' scoping hack

/*
 *								=== CTinyJpegDecoder::CImageRenderer ===
 */

/*
 *	template <const int n_component_num, class TColorConversion>
 *	class CTinyJpegDecoder::CImageRenderer
 *		- template class for rendering / scaling packed 32-bit images from
 *		  separate component planes
 *		- n_component_num is number of component planes to be combined
 *		- TColorConversion is default-constructible function object with
 *		  uint32_t operator ()(const uint8_t *p_component) const, which turns
 *		  n_component_num samples of a single pixel to the final pixel value
 */
template <const int n_component_num, class TColorConversion>
class CTinyJpegDecoder::CImageRenderer {
public:
	/*
	 *	static inline void RenderImage(TBmp *p_bitmap,
	 *		const CTinyJpegDecoder::TComponentInfo *p_component,
	 *		const CTinyJpegDecoder::TImageData &r_t_image_data)
	 *		- fills p_bitmap->p_buffer (n_width * n_height pixels, must be allocated)
	 *		- p_component points to array of n_component_num component descriptors,
	 *		  the planes are read from r_t_image_data.p_framebuffer, row pitch
	 *		  is r_t_image_data.p_framebuffer_width
	 *		- every source sample is repeated n_scale_horiz times in a row
	 *		  and every source row is repeated n_scale_vert times
	 *		- all components but the last one are just stored to byte
	 *		  (n_component_id - 1) of the pixel, the pass for the last component
	 *		  reads them back and calls color conversion
	 *		- note the last pass reads bytes 0 to n_component_num - 2, it presumably
	 *		  expects component ids 1 to n_component_num in ascending order (to be confirmed)
	 */
	static inline void RenderImage(TBmp *p_bitmap, const CTinyJpegDecoder::TComponentInfo *p_component,
		const CTinyJpegDecoder::TImageData &r_t_image_data)
	{
		TColorConversion t_color_conv;
		// conversion object (shall contain conversion tables)

#ifdef __JPEG_DEC_VERBOSE
		const char *p_colorspace_name_list[] = {"grayscale", "RG", "RGB", "YCCK"};
		printf("rendering %s image ...\n", p_colorspace_name_list[n_component_num - 1]);
		// debug
#endif

		uint32_t *const p_image_begin = p_bitmap->p_buffer;
		uint32_t *const p_image_end = p_image_begin + p_bitmap->n_width * p_bitmap->n_height;
		// destination pixel range

		memset(p_image_begin, 0, p_bitmap->n_width * p_bitmap->n_height *
			sizeof(uint32_t));
		// clear the image, components are or-ed to it

		const TComponentInfo *const p_last_component = p_component + (n_component_num - 1);
		for(; p_component <= p_last_component; ++ p_component) {
			const int n_plane = p_component->n_component_id - 1;
			// index of source plane, it is byte position in the pixel as well

#ifdef __JPEG_DEC_VERBOSE
			if(n_component_num != 2) {
				const char *p_component_name_list[] = {"Y", "Cb", "Cr", "K"};
				printf("\tcomponent %s scaling %d x %d\n", p_component_name_list[n_plane],
					p_component->n_scale_horiz, p_component->n_scale_vert);
			} else {
				const char *p_component_name_list[] = {"R", "G"};
				printf("\tcomponent %s scaling %d x %d\n", p_component_name_list[n_plane],
					p_component->n_scale_horiz, p_component->n_scale_vert);
			}
			// debug
#endif

			const uint8_t *p_src_row = r_t_image_data.p_framebuffer[n_plane];
			for(uint32_t *p_band = p_image_begin; p_band < p_image_end;
			   p_band += p_bitmap->n_width * p_component->n_scale_vert) {
				uint32_t *const p_band_end = p_band +
					p_bitmap->n_width * p_component->n_scale_vert;
				// band of destination rows, all of them read the same source row

				for(uint32_t *p_row = p_band; p_row < p_band_end && p_row < p_image_end;
				   p_row += p_bitmap->n_width) {
					uint32_t *const p_row_end = p_row + p_bitmap->n_width;
					uint32_t *p_pixel = p_row;
					const uint8_t *p_sample = p_src_row;
					// each destination row starts reading the source row over

					if(p_component < p_last_component) {
						while(p_pixel < p_row_end) {
							const uint32_t n_value =
								((uint32_t)((*p_sample ++ + 128) & 0xff)) << (n_plane * 8);
							// bias the sample, shift it to it's byte

							uint32_t *p_run_end = p_pixel + p_component->n_scale_horiz;
							if(p_run_end > p_row_end)
								p_run_end = p_row_end;
							for(; p_pixel < p_run_end; ++ p_pixel)
								*p_pixel |= n_value;
							// repeat the sample horizontally
						}
					} else {
						uint8_t p_pixel_sample[n_component_num];
						// samples of all components of a single pixel

						while(p_pixel < p_row_end) {
							uint32_t *p_run_end = p_pixel + p_component->n_scale_horiz;
							if(p_run_end > p_row_end)
								p_run_end = p_row_end;
							p_pixel_sample[n_component_num - 1] = *p_sample ++ + 128;
							// sample of the last component is read from it's plane

							for(; p_pixel < p_run_end; ++ p_pixel) {
								const uint32_t n_partial = *p_pixel;
								switch(n_component_num) {
								case 4:
									p_pixel_sample[2] = (uint8_t)(n_partial >> 16);
									// fall through
								case 3:
									p_pixel_sample[1] = (uint8_t)(n_partial >> 8);
									// fall through
								case 2:
									p_pixel_sample[0] = (uint8_t)n_partial;
								}
								// samples of preceding components are read back from the pixel

								*p_pixel = t_color_conv(p_pixel_sample);
							}
							// convert pixels, repeat the sample horizontally
						}
					}
				}
				p_src_row += r_t_image_data.p_framebuffer_width[n_plane];
			}
			// semi-fast fill algorithm (could write several ones for each sub-sampling value
			// combination, but the aim of this decoder was to minimize code size)
		}
	}
};

/*
 *								=== ~CTinyJpegDecoder::CImageRenderer ===
 */

/*
 *								=== CGrayConversion ===
 */

#ifdef __JPEG_DEC_ALLOW_SUBSAMPLED_GRAYSCALE

class CGrayConversion {
public:
	/*
	 *	inline uint32_t operator ()(const uint8_t *p_component) const
	 *		- p_component points to a single gray sample
	 *		- returns pixel with the sample copied to the three low bytes
	 *		  and with the most significant byte set to 0xff
	 */
	inline uint32_t operator ()(const uint8_t *p_component) const
	{
		const uint32_t n_gray = p_component[0];
		return 0xff000000 | (n_gray * 0x010101);
		// multiplication copies the (8-bit) sample to bytes 0, 1 and 2
	}
};

#endif

/*
 *								=== ~CGrayConversion ===
 */

/*
 *								=== CRGConversion ===
 */

#ifdef __JPEG_DEC_SUPPORT_RG

class CRGConversion {
public:
	/*
	 *	inline uint32_t operator ()(const uint8_t *p_component) const
	 *		- p_component points to two samples
	 *		- the second sample goes to bits 8 - 15, the first one goes to bits 16 - 23
	 *		  if __JPEG_DEC_RGB is defined, otherwise to bits 0 - 7
	 *		- the remaining color byte is zero, the most significant byte is 0xff
	 */
	inline uint32_t operator ()(const uint8_t *p_component) const
	{
#ifdef __JPEG_DEC_RGB
		const int n_first_shift = 16;
#else
		const int n_first_shift = 0;
#endif
		// position of the first component depends on byte order of the output

		const uint32_t n_first = (uint32_t)p_component[0] << n_first_shift;
		const uint32_t n_second = (uint32_t)p_component[1] << 8;
		return 0xff000000 | n_first | n_second;
	}
};

#endif

/*
 *								=== ~CRGConversion ===
 */

/*
 *								=== CPackConversion ===
 */

#ifdef __JPEG_DEC_SUPPORT_LOSSLESS

// would do it as template, but MSVC fails with internal compiler error (C1001)
// i should be getting smaller code this way anyway
class CPackConversion1 {
public:
	/*
	 *	inline uint32_t operator ()(const uint8_t *p_component) const
	 *		- p_component points to a single sample, 128 is subtracted from it
	 *		  (modulo 256; CImageRenderer adds 128 to every sample it passes)
	 *		- the result goes to bits 16 - 23 if __JPEG_DEC_RGB is defined,
	 *		  otherwise to bits 0 - 7; the most significant byte is 0xff
	 *		- note the sample is not copied to the other color bytes
	 */
	inline uint32_t operator ()(const uint8_t *p_component) const
	{
		const uint32_t n_sample = (uint8_t)(p_component[0] ^ 0x80);
		// toggling the top bit is the same as subtracting 128 modulo 256

#ifdef __JPEG_DEC_RGB
		const int n_shift = 16;
#else
		const int n_shift = 0;
#endif
		return 0xff000000 | (n_sample << n_shift);
	}
};

class CPackConversion2 {
public:
	/*
	 *	inline uint32_t operator ()(const uint8_t *p_component) const
	 *		- p_component points to two samples, 128 is subtracted from
	 *		  each of them (modulo 256)
	 *		- the second sample goes to bits 8 - 15, the first one goes to bits 16 - 23
	 *		  if __JPEG_DEC_RGB is defined, otherwise to bits 0 - 7
	 *		- the most significant byte is 0xff
	 */
	inline uint32_t operator ()(const uint8_t *p_component) const
	{
		const uint32_t n_first = (uint8_t)(p_component[0] ^ 0x80);
		const uint32_t n_second = (uint8_t)(p_component[1] ^ 0x80);
		// toggling the top bit is the same as subtracting 128 modulo 256

#ifdef __JPEG_DEC_RGB
		const int n_first_shift = 16;
#else
		const int n_first_shift = 0;
#endif
		return 0xff000000 | (n_first << n_first_shift) | (n_second << 8);
	}
};

class CPackConversion3 {
public:
	/*
	 *	inline uint32_t operator ()(const uint8_t *p_component) const
	 *		- p_component points to three samples, 128 is subtracted from
	 *		  each of them (modulo 256)
	 *		- the second sample goes to bits 8 - 15; if __JPEG_DEC_RGB is defined,
	 *		  the first sample goes to bits 16 - 23 and the third one to bits 0 - 7,
	 *		  otherwise those two are swapped
	 *		- the most significant byte is 0xff
	 */
	inline uint32_t operator ()(const uint8_t *p_component) const
	{
#ifdef __JPEG_DEC_RGB
		enum { n_high_index = 0, n_low_index = 2 };
#else
		enum { n_high_index = 2, n_low_index = 0 };
#endif
		// which component goes to bits 16 - 23 and which one to bits 0 - 7

		const uint32_t n_high = (uint8_t)(p_component[n_high_index] ^ 0x80);
		const uint32_t n_middle = (uint8_t)(p_component[1] ^ 0x80);
		const uint32_t n_low = (uint8_t)(p_component[n_low_index] ^ 0x80);
		// toggling the top bit is the same as subtracting 128 modulo 256

		return 0xff000000 | (n_high << 16) | (n_middle << 8) | n_low;
	}
};

class CPackConversion4 {
public:
	/*
	 *	inline uint32_t operator ()(const uint8_t *p_component) const
	 *		- p_component points to four samples, 128 is subtracted from
	 *		  each of them (modulo 256)
	 *		- the fourth sample goes to bits 24 - 31 (no constant 0xff is used here),
	 *		  the second one to bits 8 - 15; if __JPEG_DEC_RGB is defined, the first
	 *		  sample goes to bits 16 - 23 and the third one to bits 0 - 7, otherwise
	 *		  those two are swapped
	 */
	inline uint32_t operator ()(const uint8_t *p_component) const
	{
#ifdef __JPEG_DEC_RGB
		enum { n_high_index = 0, n_low_index = 2 };
#else
		enum { n_high_index = 2, n_low_index = 0 };
#endif
		// which component goes to bits 16 - 23 and which one to bits 0 - 7

		const uint32_t n_top = (uint8_t)(p_component[3] ^ 0x80);
		const uint32_t n_high = (uint8_t)(p_component[n_high_index] ^ 0x80);
		const uint32_t n_middle = (uint8_t)(p_component[1] ^ 0x80);
		const uint32_t n_low = (uint8_t)(p_component[n_low_index] ^ 0x80);
		// toggling the top bit is the same as subtracting 128 modulo 256

		return (n_top << 24) | (n_high << 16) | (n_middle << 8) | n_low;
	}
};

#endif // __JPEG_DEC_SUPPORT_LOSSLESS

/*
 *								=== ~CPackConversion ===
 */

/*
 *								=== CRGBConversion ===
 */

class CRGBConversion {
protected:
	int16_t m_p_YCbCr_to_RGB[4][256]; // 2k table with premultiplied coeffs

	/*
	 *	static inline uint32_t n_Clamp_Channel(int32_t n_value)
	 *		- returns n_value clamped to range <0, 255>
	 */
	static inline uint32_t n_Clamp_Channel(int32_t n_value)
	{
		if(n_value < 0)
			return 0;
		if(n_value > 0xff)
			return 0xff;
		return (uint32_t)n_value;
	}

public:
	/*
	 *	CRGBConversion()
	 *		- fills the table; row 0 is 1.402 * x, row 1 is -0.34414 * x,
	 *		  row 2 is -0.71414 * x and row 3 is 1.772 * x, where x is
	 *		  (table index - 128), products are truncated towards zero
	 */
	CRGBConversion()
	{
		const float p_coeff[4] = {1.402f, -.34414f, -.71414f, 1.772f};
		for(int n_row = 0; n_row < 4; ++ n_row) {
			for(int n_sample = 0; n_sample < 256; ++ n_sample) {
				m_p_YCbCr_to_RGB[n_row][n_sample] =
					(int16_t)(p_coeff[n_row] * (n_sample - 128));
			}
		}
	}

	/*
	 *	inline uint32_t operator ()(const uint8_t *p_component) const
	 *		- p_component points to three samples (Y, Cb, Cr), chroma samples
	 *		  are biased by 128
	 *		- all the channels are clamped to range <0, 255>
	 *		- bits 8 - 15 contain Y - 0.34414 Cb - 0.71414 Cr
	 *		- if __JPEG_DEC_RGB is defined, bits 16 - 23 contain Y + 1.402 Cr
	 *		  and bits 0 - 7 contain Y + 1.772 Cb, otherwise those two are swapped
	 *		- the most significant byte is 0xff
	 */
	inline uint32_t operator ()(const uint8_t *p_component) const
	{
		const int32_t n_luma = p_component[0];
		const int32_t n_with_Cr = n_luma + m_p_YCbCr_to_RGB[0][p_component[2]];
		const int32_t n_with_Cb = n_luma + m_p_YCbCr_to_RGB[3][p_component[1]];
		const int32_t n_with_both = n_luma + m_p_YCbCr_to_RGB[1][p_component[1]] +
			m_p_YCbCr_to_RGB[2][p_component[2]];
		// add chroma contributions to luminance

#ifdef __JPEG_DEC_RGB
		const int32_t n_high = n_with_Cr, n_low = n_with_Cb;
#else
		const int32_t n_high = n_with_Cb, n_low = n_with_Cr;
#endif
		// choose byte order of the output

		return 0xff000000 | (n_Clamp_Channel(n_high) << 16) |
			(n_Clamp_Channel(n_with_both) << 8) | n_Clamp_Channel(n_low);
	}
};

/*
 *								=== ~CRGBConversion ===
 */

/*
 *								=== CCMYKConversion ===
 */

#if defined(__JPEG_DEC_SUPPORT_YCCK) || defined(__JPEG_DEC_SUPPORT_CMYK)

class CCMYKConversion {
private:
	/*
	 *	static inline uint32_t n_Apply_Key(uint8_t n_ink, uint8_t n_key)
	 *		- returns n_key - ((n_ink * n_key) >> 8), clamped to range <0, 255>
	 */
	static inline uint32_t n_Apply_Key(uint8_t n_ink, uint8_t n_key)
	{
		const int32_t n_level = n_key - ((n_ink * n_key) >> 8);
		if(n_level < 0)
			return 0;
		if(n_level > 0xff)
			return 0xff;
		return (uint32_t)n_level;
	}

public:
	/*
	 *	inline uint32_t operator ()(const uint8_t *p_component) const
	 *		- p_component points to four samples, the fourth one is the key (K)
	 *		  which is used directly, without inversion
	 *		- every channel is K - ((sample * K) >> 8); the third sample goes
	 *		  to bits 16 - 23, the second one to bits 8 - 15 and the first one
	 *		  to bits 0 - 7
	 *		- the most significant byte is 0xff
	 *		- byte order doesn't depend on __JPEG_DEC_RGB here (that would require
	 *		  a change in CYCCKConversion code as well, it is kept as is since
	 *		  no one encodes jpegs using CMYK)
	 */
	inline uint32_t operator ()(const uint8_t *p_component) const
	{
		const uint8_t n_key = p_component[3];
		const uint32_t n_high = n_Apply_Key(p_component[2], n_key);
		const uint32_t n_middle = n_Apply_Key(p_component[1], n_key);
		const uint32_t n_low = n_Apply_Key(p_component[0], n_key);
		return 0xff000000 | (n_high << 16) | (n_middle << 8) | n_low;
	}
};

#endif

/*
 *								=== ~CCMYKConversion ===
 */

/*
 *								=== CYCCKConversion ===
 */

#ifdef __JPEG_DEC_SUPPORT_YCCK

class CYCCKConversion : public CRGBConversion, public CCMYKConversion {
public:
	inline uint32_t operator ()(const uint8_t *p_component) const
	{
		uint32_t n_cmyk = (*(CRGBConversion*)this)(p_component);
		
		/*
		uint8_t p_cmyk[4] = {(uint8_t)n_cmyk, (uint8_t)(n_cmyk >> 8),
			(uint8_t)(n_cmyk >> 16), p_component[3]};
		return CCMYKConversion::operator ()(p_cmyk);
		*/ // a bit cleaner code doing the same thing

		((uint8_t*)&n_cmyk)[3] = p_component[3];
		return (*(CCMYKConversion*)this)((uint8_t*)&n_cmyk);
		// a bit faster code, i guess it won't work on big endian machines
	}
};

#endif

/*
 *								=== ~CYCCKConversion ===
 */

/*
 *								=== CTinyJpegDecoder ===
 */

int CTinyJpegDecoder::p_zig_indices[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
	17, 24, 32, 25, 18, 11,  4,  5,
	12, 19, 26, 33, 40, 48, 41, 34,
	27, 20, 13,  6,  7, 14, 21, 28,
	35, 42, 49, 56, 57, 50, 43, 36,
	29, 22, 15, 23, 30, 37, 44, 51,
	58, 59, 52, 45, 38, 31, 39, 46,
	53, 60, 61, 54, 47, 55, 62, 63
};
// reverse zig-zag indices; i-th entry is the position (row * 8 + column) in 8x8 block
// of i-th coefficient in zig-zag order (the table is a permutation of 0 to 63)

/*
 *	jpeg marker codes, each of them is 16-bit value with high byte 0xff
 *	(compared to values read by CBufferredFile::Read_Marker() in Read_Blocks(),
 *	and used as keys in m_p_action_table)
 */
enum {
	marker_SOF0_HuffBaseline_DCT = 0xffc0,
	marker_SOF1_HuffExtendedSequential_DCT = 0xffc1,
	marker_SOF2_HuffProgressive_DCT = 0xffc2,
	marker_SOF3_HuffLossless = 0xffc3,
	// start of frame markers, non-differential, huffman coding

	marker_SOF5_HuffDiffSequential_DCT = 0xffc5,
	marker_SOF6_HuffDiffProgressive_DCT = 0xffc6,
	marker_SOF7_HuffDiffLossless = 0xffc7,
	// start of frame markers, differential, huffman coding

	marker_SOF8_ArithReserved = 0xffc8,
	marker_SOF9_ArithExtendedSequential_DCT = 0xffc9,
	marker_SOF10_ArithProgressive_DCT = 0xffca,
	marker_SOF11_ArithLossless = 0xffcb,
	// start of frame markers, non-differential, arithmetic coding

	marker_SOF13_ArithDiffSequential_DCT = 0xffcd,
	marker_SOF14_ArithDiffProgressive_DCT = 0xffce,
	marker_SOF15_ArithDiffLossless = 0xffcf,
	// start of frame markers, differential, arithmetic coding

	marker_HuffmanTable = 0xffc4,
	// huffman table specification (note it lies in between the SOF codes)

	marker_ArithCodingCondBlock = 0xffcc,
	// arithmetic coding conditioning specification (lies in between the SOF codes as well)

	marker_RestartMod0 = 0xffd0,
	marker_RestartMod1 = 0xffd1,
	marker_RestartMod2 = 0xffd2,
	marker_RestartMod3 = 0xffd3,
	marker_RestartMod4 = 0xffd4,
	marker_RestartMod5 = 0xffd5,
	marker_RestartMod6 = 0xffd6,
	marker_RestartMod7 = 0xffd7,
	// restart interval termination (those have no entry in the action table)

	marker_StartImage = 0xffd8,
	marker_EndImage = 0xffd9,
	marker_StartScan = 0xffda,
	marker_DefineQuantTables = 0xffdb,
	marker_DefineNumberLines = 0xffdc,
	marker_DefineRestartInterval = 0xffdd,
	marker_DefineHierarchicalProgression = 0xffde,
	marker_ExpandRefComps = 0xffdf,
	// other markers

	marker_ReservedAppSeg_0 = 0xffe0, // JFIF (jpeg file interchange format - the used format)
	marker_ReservedAppSeg_1 = 0xffe1,
	marker_ReservedAppSeg_2 = 0xffe2,
	marker_ReservedAppSeg_3 = 0xffe3,
	marker_ReservedAppSeg_4 = 0xffe4,
	marker_ReservedAppSeg_5 = 0xffe5,
	marker_ReservedAppSeg_6 = 0xffe6,
	marker_ReservedAppSeg_7 = 0xffe7,
	marker_ReservedAppSeg_8 = 0xffe8,
	marker_ReservedAppSeg_9 = 0xffe9,
	marker_ReservedAppSeg_a = 0xffea,
	marker_ReservedAppSeg_b = 0xffeb,
	marker_ReservedAppSeg_c = 0xffec,
	marker_ReservedAppSeg_d = 0xffed,
	marker_ReservedAppSeg_e = 0xffee,
	marker_ReservedAppSeg_f = 0xffef,
	// application segments

	marker_JPEG_ReservedExt_0 = 0xfff0,
	marker_JPEG_ReservedExt_1 = 0xfff1,
	marker_JPEG_ReservedExt_2 = 0xfff2,
	marker_JPEG_ReservedExt_3 = 0xfff3,
	marker_JPEG_ReservedExt_4 = 0xfff4,
	marker_JPEG_ReservedExt_5 = 0xfff5,
	marker_JPEG_ReservedExt_6 = 0xfff6,
	marker_JPEG_ReservedExt_7 = 0xfff7,
	marker_JPEG_ReservedExt_8 = 0xfff8,
	marker_JPEG_ReservedExt_9 = 0xfff9,
	marker_JPEG_ReservedExt_a = 0xfffa,
	marker_JPEG_ReservedExt_b = 0xfffb,
	marker_JPEG_ReservedExt_c = 0xfffc,
	marker_JPEG_ReservedExt_d = 0xfffd,
	// codes reserved for jpeg extensions

	marker_Comment = 0xfffe,

	marker_Temp = 0xff01,

	marker_Reserved_min = 0xff02,
	marker_Reserved_max = 0xffbf
	// the first and the last code of the reserved range
};
// marker codes (could be in common header, but i want to have least files possible)

/*
 *	table assigning handler functions to marker codes; Read_Blocks() searches it
 *	linearly for the marker it has just read and calls the handler as
 *	p_function(n_marker_code, this, &file_buffer), handler result evaluating
 *	to false aborts decoding
 */
const CTinyJpegDecoder::TBlockAction CTinyJpegDecoder::m_p_action_table[] = {
	{marker_SOF0_HuffBaseline_DCT, Read_FrameHeader}, // -------------
	{marker_SOF1_HuffExtendedSequential_DCT, Read_FrameHeader},
	// fixme - according to the specs, this is only extended by maximal allowed number
	// of quant / huff tables to 4, and possibility of 12-bit samples (which requires
	// longer huff tables) ... anyway, everything is written in such a fashion,
	// it should be handled
	{marker_SOF2_HuffProgressive_DCT, Read_Unsupported},
#ifdef __JPEG_DEC_SUPPORT_LOSSLESS
	{marker_SOF3_HuffLossless, Read_FrameHeader}, // t_odo
#else
	{marker_SOF3_HuffLossless, Read_Unsupported},
#endif
	{marker_SOF5_HuffDiffSequential_DCT, Read_Unsupported},
	{marker_SOF6_HuffDiffProgressive_DCT, Read_Unsupported},
	{marker_SOF7_HuffDiffLossless, Read_Unsupported},
	{marker_SOF8_ArithReserved, Read_Unsupported},
	{marker_SOF9_ArithExtendedSequential_DCT, Read_Unsupported},
	{marker_SOF10_ArithProgressive_DCT, Read_Unsupported},
	{marker_SOF11_ArithLossless, Read_Unsupported},
	{marker_SOF13_ArithDiffSequential_DCT, Read_Unsupported},
	{marker_SOF14_ArithDiffProgressive_DCT, Read_Unsupported},
	{marker_SOF15_ArithDiffLossless, Read_Unsupported},
	{marker_HuffmanTable, Read_HuffmanTables},
	{marker_ArithCodingCondBlock, Read_Unsupported},
	/*{marker_RestartMod0, Read_SkipMarker},
	{marker_RestartMod1, Read_SkipMarker},
	{marker_RestartMod2, Read_SkipMarker},
	{marker_RestartMod3, Read_SkipMarker},
	{marker_RestartMod4, Read_SkipMarker},
	{marker_RestartMod5, Read_SkipMarker},
	{marker_RestartMod6, Read_SkipMarker},
	{marker_RestartMod7, Read_SkipMarker},*/ // read directly in macroblock decoding loop
	{marker_StartImage, Read_SkipMarker},
	{marker_EndImage, Read_SkipMarker}, // Read_Blocks() stops at this marker before the table is searched
	{marker_StartScan, Read_StartScan},
	{marker_DefineQuantTables, Read_QuantTables},
	{marker_DefineNumberLines, Read_Unsupported},
	{marker_DefineRestartInterval, Read_RestartInterval},
	{marker_DefineHierarchicalProgression, Read_Unsupported},
	{marker_ExpandRefComps, Read_Skip},
	{marker_ReservedAppSeg_0, Read_JFIF_Header},
	{marker_ReservedAppSeg_1, Read_Skip},
	{marker_ReservedAppSeg_2, Read_Skip},
	{marker_ReservedAppSeg_3, Read_Skip},
	{marker_ReservedAppSeg_4, Read_Skip},
	{marker_ReservedAppSeg_5, Read_Skip},
	{marker_ReservedAppSeg_6, Read_Skip},
	{marker_ReservedAppSeg_7, Read_Skip},
	{marker_ReservedAppSeg_8, Read_Skip},
	{marker_ReservedAppSeg_9, Read_Skip},
	{marker_ReservedAppSeg_a, Read_Skip},
	{marker_ReservedAppSeg_b, Read_Skip},
	{marker_ReservedAppSeg_c, Read_Skip},
	{marker_ReservedAppSeg_d, Read_Skip},
	{marker_ReservedAppSeg_e, Read_Skip},
	{marker_ReservedAppSeg_f, Read_Skip},
	{marker_JPEG_ReservedExt_0, Read_Skip},
	{marker_JPEG_ReservedExt_1, Read_Skip},
	{marker_JPEG_ReservedExt_2, Read_Skip},
	{marker_JPEG_ReservedExt_3, Read_Skip},
	{marker_JPEG_ReservedExt_4, Read_Skip},
	{marker_JPEG_ReservedExt_5, Read_Skip},
	{marker_JPEG_ReservedExt_6, Read_Skip},
	{marker_JPEG_ReservedExt_7, Read_Skip},
	{marker_JPEG_ReservedExt_8, Read_Skip},
	{marker_JPEG_ReservedExt_9, Read_Skip},
	{marker_JPEG_ReservedExt_a, Read_Skip},
	{marker_JPEG_ReservedExt_b, Read_Skip},
	{marker_JPEG_ReservedExt_c, Read_Skip},
	{marker_JPEG_ReservedExt_d, Read_Skip}, // will be treated as unknown blocks
	{marker_Comment, Read_Skip},
	{marker_Temp, Read_Skip},
	{marker_Reserved_min, Read_Skip},
	{marker_Reserved_max, Read_Skip},
	// NOTE(review): only the two boundary codes of the reserved range are listed,
	// the lookup compares for equality so the codes in between have no entry
};
// decoder action table

/*
 *	TBmp *CTinyJpegDecoder::p_Decode_JPEG(FILE *p_fr, bool b_thumbnail = false)
 *		- decodes jpeg image, read from file p_fr, to bitmap with 32-bit pixels
 *		  (8 bits per channel)
 *		- setting b_thumbnail to true enables faster jpeg decoding using DC values only
 *		  (resulting image has 8 times lower resolution)
 *		- the bitmap is marked as having no alpha channel (jpeg files don't store it)
 *		- returns pointer to the new bitmap (result of _p_Decode_JPEG()) or 0 on failure
 *		  (not enough memory / unsupported format / damaged file)
 */
TBmp *CTinyJpegDecoder::p_Decode_JPEG(FILE *p_fr, bool b_thumbnail)
{
	CBufferredFile input_stream(p_fr);
	// needs to be a named object, _p_Decode_JPEG() takes it by non-const reference
	// (g++ hates a temporary as an arg)

	TBmp *p_result = _p_Decode_JPEG(input_stream, b_thumbnail);
	return p_result;
}

/*
 *	TBmp *CTinyJpegDecoder::p_Decode_JPEG(const void *p_data,
 *		unsigned int n_size, bool b_thumbnail = false)
 *		- decodes jpeg image, stored in n_size bytes of memory at p_data,
 *		  to bitmap with 32-bit pixels (8 bits per channel)
 *		- setting b_thumbnail to true enables faster jpeg decoding using DC values only
 *		  (resulting image has 8 times lower resolution)
 *		- the bitmap is marked as having no alpha channel (jpeg files don't store it)
 *		- returns pointer to the new bitmap (result of _p_Decode_JPEG()) or 0 on failure
 *		  (not enough memory / unsupported format / damaged file)
 */
TBmp *CTinyJpegDecoder::p_Decode_JPEG(const void *p_data,
	unsigned int n_size, bool b_thumbnail)
{
	const unsigned char *p_bytes = static_cast<const unsigned char*>(p_data);
	CBufferredFile input_stream(p_bytes, n_size);
	// needs to be a named object, _p_Decode_JPEG() takes it by non-const reference
	// (g++ hates a temporary as an arg)

	TBmp *p_result = _p_Decode_JPEG(input_stream, b_thumbnail);
	return p_result;
}

/*
 *	TImageInfo CTinyJpegDecoder::t_JPEG_Info(FILE *p_fr, bool b_thumbnail = false)
 *		- returns information about jpeg image, read from file p_fr
 *		  (the work is done in _t_JPEG_Info())
 *		- b_thumbnail is passed on to _t_JPEG_Info()
 *		  (refer to p_Decode_JPEG() documentation for it's meaning)
 *		- returns TImageInfo with all fields set to 0 on error
 */
CTinyJpegDecoder::TImageInfo CTinyJpegDecoder::t_JPEG_Info(FILE *p_fr, bool b_thumbnail)
{
	CBufferredFile input_stream(p_fr);
	// needs to be a named object, _t_JPEG_Info() takes it by non-const reference

	TImageInfo t_info = _t_JPEG_Info(input_stream, b_thumbnail);
	return t_info;
}

/*
 *	TImageInfo CTinyJpegDecoder::t_JPEG_Info(const void *p_data,
 *		unsigned int n_size, bool b_thumbnail = false)
 *		- returns information about jpeg image, stored in n_size bytes
 *		  of memory at p_data (the work is done in _t_JPEG_Info())
 *		- b_thumbnail is passed on to _t_JPEG_Info()
 *		  (refer to p_Decode_JPEG() documentation for it's meaning)
 *		- returns TImageInfo with all fields set to 0 on error
 */
CTinyJpegDecoder::TImageInfo CTinyJpegDecoder::t_JPEG_Info(const void *p_data,
	unsigned int n_size, bool b_thumbnail)
{
	const unsigned char *p_bytes = static_cast<const unsigned char*>(p_data);
	CBufferredFile input_stream(p_bytes, n_size);
	// needs to be a named object, _t_JPEG_Info() takes it by non-const reference

	TImageInfo t_info = _t_JPEG_Info(input_stream, b_thumbnail);
	return t_info;
}

/*
 *	TImageInfo _t_JPEG_Info(CBufferredFile &file_buffer, bool b_thumbnail)
 *		- returns image file information; reads the file using Read_Blocks()
 *		  with image data not requested, so reading ends right after
 *		  the image header was read
 *		- if thumbnail is requested, image size is downscaled the same way
 *		  as _p_Decode_JPEG() does (1 / 8 of the size, at least 1 pixel)
 *		- n_component_num and n_bit_width are copied from the image header
 *		- returns TImageInfo with all fields set to 0 on error
 */
CTinyJpegDecoder::TImageInfo CTinyJpegDecoder::_t_JPEG_Info(CBufferredFile &file_buffer, bool b_thumbnail)
{
	TImageInfo t_result = {0, 0, 0, 0};

	if(!Read_Blocks(file_buffer, b_thumbnail, false))
		return t_result; // contains all 0's

	if(m_t_image_data.b_want_thumbnail) {
		t_result.n_width = max(1, m_t_image_header.n_width / 8);
		t_result.n_height = max(1, m_t_image_header.n_height / 8);
		// the same flag and the same expressions _p_Decode_JPEG() uses
		// to calculate size of the thumbnail bitmap
	} else {
		t_result.n_width = m_t_image_header.n_width;
		t_result.n_height = m_t_image_header.n_height;
	}
	t_result.n_component_num = m_t_image_header.n_component_num;
	t_result.n_bit_width = m_t_image_header.n_sample_precision;
	// copy image data

	return t_result;
}

/*
 *	bool CTinyJpegDecoder::Read_Blocks(CBufferredFile &file_buffer, bool b_thumbnail, bool b_image)
 *		- resets decoder state and reads markers of jpeg file, each marker is
 *		  passed to it's handler, found in m_p_action_table
 *		- everything preceding start-of-image marker is ignored,
 *		  reading stops at end-of-image marker
 *		- b_thumbnail and b_image are stored in m_t_image_data (b_want_thumbnail
 *		  and b_want_image); if b_image is false, the function returns as soon
 *		  as the image header was read
 *		- doesn't perform color conversion
 *		- returns true on success, false on failure (marker could not be read,
 *		  marker is not in the action table, handler failed or there was no image
 *		  header; all but the handler check are compiled-out
 *		  if __JPEG_DEC_STRIP_ERROR_CHECKS is defined)
 */
bool CTinyJpegDecoder::Read_Blocks(CBufferredFile &file_buffer, bool b_thumbnail, bool b_image)
{
	m_t_image_data.b_want_thumbnail = b_thumbnail; // want thumbnail
	m_t_image_data.b_want_image = b_image; // want image data

	m_t_image_data.b_read_image_header = false;
	m_t_restart.b_enabled = false;
	for(int n_slot = 0; n_slot < __JPEG_DEC_MAX_COMPONENT_NUM; ++ n_slot) {
		m_t_image_data.p_framebuffer[n_slot] = 0;
		m_t_image_data.p_dc_value[n_slot] = 0;
		m_p_huffman_table[n_slot][0].b_valid = false;
		m_p_huffman_table[n_slot][1].b_valid = false;
	}
	// clear data that needs to be cleared

	const TBlockAction *const p_action_end = m_p_action_table +
		(sizeof(m_p_action_table) / sizeof(m_p_action_table[0]));
	// end of the action table

	bool b_inside_image = false;
	for(;;) {
		uint16_t n_marker_code;
#ifdef __JPEG_DEC_STRIP_ERROR_CHECKS
		file_buffer.Read_Marker(n_marker_code);
#else
		if(!file_buffer.Read_Marker(n_marker_code))
			return false;
#endif
		// read the next marker

		if(n_marker_code == marker_StartImage) {
			b_inside_image = true;

#ifdef __JPEG_DEC_VERBOSE
			printf("encountered start-of-image marker\n");
			// debug
#endif
		}
		if(!b_inside_image)
			continue;
		// skip until reaches marker for start of image

		if(n_marker_code == marker_EndImage)
			break;
		// when encounters marker for end of image, we're done, time to create RGB output

		const TBlockAction *p_handler = m_p_action_table;
		while(p_handler != p_action_end && p_handler->n_id != n_marker_code)
			++ p_handler;
		// find action for the marker

		if(p_handler == p_action_end) {
#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
			return false;
			// there's no action for this marker
#endif
		} else if(!p_handler->p_function(n_marker_code, this, &file_buffer))
			return false; // can't strip error checks, those functions may fail if not enough memory
		// try to process current block

		if(!m_t_image_data.b_want_image && m_t_image_data.b_read_image_header)
			return true;
		// can end if image header was read and image data is not required
	}
	// decode image, result should be in m_t_image_data::p_framebuffer

#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
	if(!m_t_image_data.b_read_image_header)
		return false;
	// in case jpeg file did contain no images (or images were in block, assigned Read_Skip action)
#endif

	return true;
}

/*
 *	TBmp *CTinyJpegDecoder::_p_Decode_JPEG(CBufferredFile &file_buffer, bool b_thumbnail)
 *		- decode jpeg, convert it to bitmap with 32-bit pixels (8 bits per channel)
 *		- setting b_thumbnail to true requests thumbnail (bitmap size is 1 / 8
 *		  of the image size, at least 1 pixel)
 *		- the bitmap is marked as having no alpha channel (b_alpha is false)
 *		- the bitmap and it's pixel buffer are allocated using new and new[]
 *		- component framebuffers are always deleted before returning
 *		  and the pointers are set to null
 *		- return 0 on failure (not enough memory / unsupported format / damaged file /
 *		  number of components that is not supported in this build)
 */
TBmp *CTinyJpegDecoder::_p_Decode_JPEG(CBufferredFile &file_buffer, bool b_thumbnail)
{
	TBmp *p_bitmap = 0;
	// remains null on any failure (there is just a single cleanup at the end)

	if(Read_Blocks(file_buffer, b_thumbnail, true))
		p_bitmap = new(std::nothrow) TBmp;
	// read jpeg blocks, alloc bitmap structure

	if(p_bitmap) {
		p_bitmap->n_former_bpc = m_t_image_header.n_sample_precision;
		p_bitmap->b_grayscale = m_t_image_header.n_component_num == 1;
		p_bitmap->b_alpha = false; // jpeg files don't have alpha channel
		if(m_t_image_data.b_want_thumbnail) {
			p_bitmap->n_width = max(1, m_t_image_header.n_width / 8);
			p_bitmap->n_height = max(1, m_t_image_header.n_height / 8);
		} else {
			p_bitmap->n_width = m_t_image_header.n_width;
			p_bitmap->n_height = m_t_image_header.n_height;
		}
		// fill bitmap properties

		if(!(p_bitmap->p_buffer = new(std::nothrow) uint32_t[p_bitmap->n_width * p_bitmap->n_height])) {
			delete p_bitmap;
			p_bitmap = 0;
		}
		// alloc pixel buffer
	}

	if(p_bitmap) {
		if(m_t_image_header.n_component_num == 1) {
#ifdef __JPEG_DEC_SUPPORT_LOSSLESS
			if(m_t_image_data.b_lossless) {
				CImageRenderer<1, CPackConversion1>::RenderImage(p_bitmap,
					m_t_image_header.p_component_info, m_t_image_data);
			} else
#endif
			{
				TComponentInfo *p_Y_component = m_t_image_header.p_component_info;
				// there's just one, we don't have to search

#ifdef __JPEG_DEC_VERBOSE
				printf("rendering grayscale image\n");
#endif

#ifdef __JPEG_DEC_ALLOW_SUBSAMPLED_GRAYSCALE
				if(p_Y_component->n_scale_horiz == 1 && p_Y_component->n_scale_vert == 1) {
#endif
					uint8_t *p_src_buffer = m_t_image_data.p_framebuffer[p_Y_component->n_component_id - 1];
					for(uint32_t *p_dest_scanline = p_bitmap->p_buffer, *p_end_scanline =
					   p_bitmap->p_buffer + p_bitmap->n_width * p_bitmap->n_height;
					   p_dest_scanline < p_end_scanline; p_dest_scanline += p_bitmap->n_width) {
						for(uint32_t *p_dest_pixel = p_dest_scanline, *p_end = p_dest_scanline +
						   p_bitmap->n_width; p_dest_pixel < p_end;) {
							// the row ends at bitmap width (not at image header width, which
							// is 8 times greater for thumbnails and would overrun the buffer)

							uint8_t n_gray_ub = *p_src_buffer ++ + 128;
							uint32_t n_gray = ((uint32_t)n_gray_ub << 16) |
								((uint16_t)n_gray_ub << 8) | (n_gray_ub) | 0xff000000;
							// calc color

							*p_dest_pixel ++ = n_gray;
						}
						p_src_buffer += m_t_image_data.p_framebuffer_width[p_Y_component->n_component_id -
							1] - p_bitmap->n_width;
						// skip the rest of framebuffer row
					}
					// grayscale images should always have sampling 1x1
#ifdef __JPEG_DEC_ALLOW_SUBSAMPLED_GRAYSCALE
				} else {
					CImageRenderer<1, CGrayConversion>::RenderImage(p_bitmap,
						m_t_image_header.p_component_info, m_t_image_data);
				}
#endif
			}
			// semi-fast fill algorithm (could write several ones for each subscale,
			// but aim of this decoder is to minimize code size)
		} else if(m_t_image_header.n_component_num == 3) {
#ifdef __JPEG_DEC_SUPPORT_LOSSLESS
			if(m_t_image_data.b_lossless) {
				CImageRenderer<3, CPackConversion3>::RenderImage(p_bitmap,
					m_t_image_header.p_component_info, m_t_image_data);
			} else
#endif
			{
				CImageRenderer<3, CRGBConversion>::RenderImage(p_bitmap,
					m_t_image_header.p_component_info, m_t_image_data);
			}
#if defined(__JPEG_DEC_SUPPORT_YCCK)
		} else if(m_t_image_header.n_component_num == 4) {
#ifdef __JPEG_DEC_SUPPORT_LOSSLESS
			if(m_t_image_data.b_lossless) {
				CImageRenderer<4, CPackConversion4>::RenderImage(p_bitmap,
					m_t_image_header.p_component_info, m_t_image_data);
			} else
#endif
			{
				CImageRenderer<4, CYCCKConversion>::RenderImage(p_bitmap,
					m_t_image_header.p_component_info, m_t_image_data);
			}
#elif defined(__JPEG_DEC_SUPPORT_CMYK)
		} else if(m_t_image_header.n_component_num == 4) {
#ifdef __JPEG_DEC_SUPPORT_LOSSLESS
			if(m_t_image_data.b_lossless) {
				CImageRenderer<4, CPackConversion4>::RenderImage(p_bitmap,
					m_t_image_header.p_component_info, m_t_image_data);
			} else
#endif
			{
				CImageRenderer<4, CCMYKConversion>::RenderImage(p_bitmap,
					m_t_image_header.p_component_info, m_t_image_data);
			}
#elif defined(__JPEG_DEC_SUPPORT_LOSSLESS)
		} else if(m_t_image_header.n_component_num == 4 && m_t_image_data.b_lossless) {
			CImageRenderer<4, CPackConversion4>::RenderImage(p_bitmap,
				m_t_image_header.p_component_info, m_t_image_data);
#endif
#if defined(__JPEG_DEC_SUPPORT_RG)
		} else if(m_t_image_header.n_component_num == 2) {
#ifdef __JPEG_DEC_SUPPORT_LOSSLESS
			if(m_t_image_data.b_lossless) {
				CImageRenderer<2, CPackConversion2>::RenderImage(p_bitmap,
					m_t_image_header.p_component_info, m_t_image_data);
			} else
#endif
			{
				CImageRenderer<2, CRGConversion>::RenderImage(p_bitmap,
					m_t_image_header.p_component_info, m_t_image_data);
			}
#elif defined(__JPEG_DEC_SUPPORT_LOSSLESS)
		} else if(m_t_image_header.n_component_num == 2 && m_t_image_data.b_lossless) {
			CImageRenderer<2, CPackConversion2>::RenderImage(p_bitmap,
				m_t_image_header.p_component_info, m_t_image_data);
#endif
		} else {
			delete[] p_bitmap->p_buffer;
			delete p_bitmap;
			p_bitmap = 0;
			// this number of components is not supported
		}
	}
	// convert from Y or YCbCr to RGB // t_odo - write support for LUMINANCE_ALPHA (or RG) and CMYK jpegs

	for(int i = 0; i < __JPEG_DEC_MAX_COMPONENT_NUM; ++ i) {
		delete[] m_t_image_data.p_framebuffer[i]; // deleting null pointer is ok
		m_t_image_data.p_framebuffer[i] = 0;
	}
	// cleanup (done on success as well as on failure), don't leave dangling pointers

	return p_bitmap;
}

/*
 *	bool CTinyJpegDecoder::Supply_QuantTable(int n_index, const void *p_table, bool b_16_bit)
 *		- it's possible to supply quantization tables before decoding image
 *		- (image is decoded anyway, even if it doesn't contain quantization
 *		  tables and you can save some space if all your images use the same
 *		  quantization tables)
 *		- n_index has to be in range 0-3 (there are four tables in jpeg)
 *		- p_table points to table data (if b_16_bit == false, then it's array
 *		  of 64 unsigned chars, if b_16_bit == true then it's array of 64
 *		  unsigned shorts)
 *		- returns true in case n_index contains valid value, otherwise false
 */
bool CTinyJpegDecoder::Supply_QuantTable(int n_index, const void *p_table, bool b_16_bit)
{
	CTinyJpegDecoder *p_this = this;
	// dying for best compression ratio (ie. best resemblence to Read_QuantTables())

	uint8_t n_quant_table_properties;
	n_quant_table_properties = ((uint8_t)n_index) | ((b_16_bit)? 0x10 : 0);

#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
	if((n_quant_table_properties & 0xf) < 0 || (n_quant_table_properties & 0xf) > 3)
		return false;
	// index of quantization table must be <0, 3>

	if((n_quant_table_properties >> 4) != 0 && (n_quant_table_properties >> 4) != 1)
		return false;
	// quantization table precision must be 0 (8 bit) or 1 (16 bit)
#endif

	TQuantTable *p_cur_quant_table = &p_this->m_p_quant_table[n_quant_table_properties & 0x0f];
	p_cur_quant_table->b_16_bit = (n_quant_table_properties >> 4) == 1;

#ifdef __JPEG_DEC_INTEGER_IDCT
	for(int *p_value = p_cur_quant_table->p_value,
	   *p_end = p_cur_quant_table->p_value + 64; p_value < p_end; ++ p_value) {
		if(p_cur_quant_table->b_16_bit) {
			uint16_t n_tmp;
			n_tmp = *(const uint16_t*)p_table;
			p_table = ((const uint16_t*)p_table) + 1;
			*p_value = n_tmp; // todo - get file with such a quant_table and see what should be here
		} else {
			uint8_t n_tmp;
			n_tmp = *(const uint8_t*)p_table;
			p_table = ((const uint8_t*)p_table) + 1;
			*p_value = n_tmp;
		}
	}
#else // __JPEG_DEC_INTEGER_IDCT
	for(float *p_value = p_cur_quant_table->p_value,
	   *p_end = p_cur_quant_table->p_value + 64; p_value < p_end; ++ p_value) {
		if(p_cur_quant_table->b_16_bit) {
			uint16_t n_tmp;
			n_tmp = *(const uint16_t*)p_table;
			p_table = ((const uint16_t*)p_table) + 1;
			*p_value = (float)n_tmp; // todo - get file with such a quant_table and see what should be here
		} else {
			uint8_t n_tmp;
			n_tmp = *(const uint8_t*)p_table;
			p_table = ((const uint8_t*)p_table) + 1;
			*p_value = (float)n_tmp;
		}
	}
#endif // __JPEG_DEC_INTEGER_IDCT
	// read table data

	for(int i = 0; i < 64; ++ i) {
		int n_index = p_zig_indices[i];
#ifdef __JPEG_DEC_INTEGER_IDCT
		p_cur_quant_table->p_value[i] = (int)((p_cur_quant_table->p_value[i] *
			CFastDCT8_2D::p_PrescaleTable()[n_index / 8] *
			CFastDCT8_2D::p_PrescaleTable()[n_index % 8]) * 0x10000);
#else
		p_cur_quant_table->p_value[i] *=
			CFastDCT8_2D::p_PrescaleTable()[n_index / 8] *
			CFastDCT8_2D::p_PrescaleTable()[n_index % 8];
#endif
	}
	// hack! use scale factor here and i can use faster IDCT function

#ifdef __JPEG_DEC_VERBOSE
	printf("supplied quant table [n_index=%d, n_precision=%d]\n", n_quant_table_properties & 0x0f,
		n_quant_table_properties >> 4);
	{
		const char p_fwd_zig_indices[64] = {
			 0,  1,  5,  6, 14, 15, 27, 28,
			 2,  4,  7, 13, 16, 26, 29, 42,
			 3,  8, 12, 17, 25, 30, 41, 43,
			 9, 11, 18, 24, 31, 40, 44, 53,
			10, 19, 23, 32, 39, 45, 52, 54,
			20, 22, 33, 38, 46, 51, 55, 60,
			21, 34, 37, 47, 50, 56, 59, 61,
			35, 36, 48, 49, 57, 58, 62, 63
		};

		for(int i = 0; i < 64; ++ i) {
			printf("%5d%c", (int)p_cur_quant_table->p_value[p_fwd_zig_indices[i]],
				((i % 8) == 7)? '\n' : ' ');
		}
	}
	// debug
#endif

	return true;
}

/*
 *	bool CTinyJpegDecoder::Read_SkipMarker(uint16_t UNUSED(n_block_id),
 *		CTinyJpegDecoder *UNUSED(p_this), CBufferredFile *UNUSED(p_file_buffer))
 *		- marker handler which does not touch any of its arguments and reads
 *		  nothing from the file (it exists only to be put in the action table)
 *		- always returns true
 */
bool CTinyJpegDecoder::Read_SkipMarker(uint16_t UNUSED(n_block_id),
	CTinyJpegDecoder *UNUSED(p_this),
	CBufferredFile *UNUSED(p_file_buffer))
{
	const bool b_result = true;
	// there is nothing to do, hence nothing can fail

	return b_result;
}

/*
 *	bool CTinyJpegDecoder::Read_Unsupported(uint16_t UNUSED(n_block_id),
 *		CTinyJpegDecoder *UNUSED(p_this), CBufferredFile *UNUSED(p_file_buffer))
 *		- marker handler which does not touch any of its arguments and reads
 *		  nothing from the file (it exists only to be put in the action table)
 *		- always returns false, ie. it reports failure
 */
bool CTinyJpegDecoder::Read_Unsupported(uint16_t UNUSED(n_block_id),
	CTinyJpegDecoder *UNUSED(p_this),
	CBufferredFile *UNUSED(p_file_buffer))
{
	const bool b_result = false;
	// unconditional failure

	return b_result;
}

/*
 *	bool CTinyJpegDecoder::Read_Skip(uint16_t n_block_id,
 *		CTinyJpegDecoder *p_this, CBufferredFile *p_file_buffer)
 *		- read block length and skip the block
 *		  (for blocks we don't support - comments, exif data, etc ...)
 *		- the length stored in the file includes the two bytes of the length
 *		  field itself, so n_length - 2 bytes are skipped
 *		- return true on success, false on failure (the length could not be read,
 *		  the length is below 2 or the skip itself failed)
 */
bool CTinyJpegDecoder::Read_Skip(uint16_t UNUSED(n_block_id),
	CTinyJpegDecoder *UNUSED(p_this), CBufferredFile *p_file_buffer)
{
	uint16_t n_length;
#ifdef __JPEG_DEC_STRIP_ERROR_CHECKS
	p_file_buffer->Read_Short(n_length);
#else
	if(!p_file_buffer->Read_Short(n_length))
		return false;

	if(n_length < 2)
		return false;
	// length counts its own two bytes; a smaller value is corrupt
	// and would make the number of bytes to skip negative
#endif

#ifdef __JPEG_DEC_VERBOSE
	printf("\tskip %d bytes\n", n_length);
	// debug
#endif

	return p_file_buffer->Skip_Bytes(n_length - 2);
}

/*
 *	bool CTinyJpegDecoder::Read_JFIF_Header(uint16_t n_block_id,
 *		CTinyJpegDecoder *p_this, CBufferredFile *p_file_buffer)
 *		- read jpeg file interchange format header and verify it
 *		  (length must be at least 16, signature must be "JFIF" followed
 *		  by null character, version must be 1.0 to 1.2)
 *		- the rest of the block is skipped without being looked at
 *		- if __JPEG_DEC_STRIP_ERROR_CHECKS is defined, nothing is verified,
 *		  the fields are just skipped
 *		- return true on success, false on failure
 */
bool CTinyJpegDecoder::Read_JFIF_Header(uint16_t UNUSED(n_block_id),
	CTinyJpegDecoder *UNUSED(p_this), CBufferredFile *p_file_buffer)
{
	uint16_t n_length;

#ifdef __JPEG_DEC_STRIP_ERROR_CHECKS
	p_file_buffer->Read_Short(n_length);
	// block length

	p_file_buffer->Skip_Bytes(5);
	// signature

	p_file_buffer->Skip_Bytes(2);
	// version
#else // __JPEG_DEC_STRIP_ERROR_CHECKS
	if(!p_file_buffer->Read_Short(n_length))
		return false;
	if(n_length < 16)
		return false;
	// length must be >= 16

	char p_s_jfif[5];
	for(int n_char = 0; n_char < 5; ++ n_char) {
		uint8_t n_byte;
		if(!p_file_buffer->Read_Byte(n_byte))
			return false;
		p_s_jfif[n_char] = n_byte;
	}
	if(p_s_jfif[4] != 0)
		return false;
	if(strcmp(p_s_jfif, "JFIF") != 0)
		return false;
	// must contain null-terminated JFIF signature

	uint8_t n_version_hi, n_version_lo;
	if(!p_file_buffer->Read_Byte(n_version_hi))
		return false;
	if(!p_file_buffer->Read_Byte(n_version_lo))
		return false;
	if(n_version_hi != 1 || n_version_lo > 2)
		return false;
	// read version, should be 1.0 to 1.2

#ifdef __JPEG_DEC_VERBOSE
	printf("read header [p_s_signature=\'%s\', v_version=%d.%d]\n",
		p_s_jfif, n_version_hi, n_version_lo);
	// debug
#endif
#endif // __JPEG_DEC_STRIP_ERROR_CHECKS

	const uint16_t n_remaining = (uint16_t)(n_length - 9);
	// 9 = 2 bytes length + 5 bytes signature + 2 bytes version (all consumed above)

	if(p_file_buffer->Skip_Bytes(n_remaining))
		return true;
	// skip the rest (contains 1 byte units (0 = pixels, 1 = inches, 2 = cm),
	// 2 bytes x-density per unit, 2 bytes y-density per unit, 1 byte thumbnail width,
	// 1 byte thumbnail height, RGB 3 byte per pixel uncompressed thumbnail)

	return false;
}

/*
 *	bool CTinyJpegDecoder::Read_RestartInterval(uint16_t UNUSED(n_block_id),
 *		CTinyJpegDecoder *p_this, CBufferredFile *p_file_buffer)
 *		- read restart interval block, store the interval in p_this->m_t_restart
 *		  (restart is enabled only if the interval is greater than zero)
 *		- the block must be at least 4 bytes long, any bytes past the interval
 *		  are skipped (not if __JPEG_DEC_STRIP_ERROR_CHECKS is defined)
 *		- return true on success, false on failure
 */
bool CTinyJpegDecoder::Read_RestartInterval(uint16_t UNUSED(n_block_id),
	CTinyJpegDecoder *p_this, CBufferredFile *p_file_buffer)
{
	uint16_t n_length;

#ifdef __JPEG_DEC_STRIP_ERROR_CHECKS
	p_file_buffer->Read_Short(n_length);
	p_file_buffer->Read_Short(p_this->m_t_restart.n_interval);
#else
	if(!p_file_buffer->Read_Short(n_length))
		return false;
	if(n_length < 4)
		return false;
	// there must be space for the length itself and for the interval

	if(!p_file_buffer->Read_Short(p_this->m_t_restart.n_interval))
		return false;
#endif
	// read block length and the interval

	if(p_this->m_t_restart.n_interval > 0)
		p_this->m_t_restart.b_enabled = true;
	else
		p_this->m_t_restart.b_enabled = false;
	// interval of zero means restart is disabled

#ifdef __JPEG_DEC_VERBOSE
	printf("read restart interval definition [n_interval=%d]\n",
		p_this->m_t_restart.n_interval);
	// debug
#endif

#ifdef __JPEG_DEC_STRIP_ERROR_CHECKS
	return true; // that's right, n_length - 4 should be 0
#else
	return p_file_buffer->Skip_Bytes(n_length - 4);
#endif
}

/*
 *	bool CTinyJpegDecoder::Read_QuantTables(uint16_t n_block_id,
 *		CTinyJpegDecoder *p_this, CBufferredFile *p_file_buffer)
 *		- read quantization tables block, supports 8-bit and 16-bit quantization tables
 *		  (the block may contain several tables, each one is a properties byte
 *		  (precision in high nibble, table index 0-3 in low nibble) and 64 values)
 *		- hack! quantization tables are pre-scaled here so it's possible to use faster IDCT function
 *		- todo - find out how is 16-bit quantization table treated? is it bit-shifted or not?
 *		  (it's not bit-shifted now as this seems to be default for jpeg)
 *		- note bytes left in the block after the last complete table are only
 *		  asserted on, they are not skipped
 *		- return true on success, false on failure
 */
bool CTinyJpegDecoder::Read_QuantTables(uint16_t UNUSED(n_block_id),
	CTinyJpegDecoder *p_this, CBufferredFile *p_file_buffer)
{
	uint16_t n_length;
#ifdef __JPEG_DEC_STRIP_ERROR_CHECKS
	p_file_buffer->Read_Short(n_length);
#else
	if(!p_file_buffer->Read_Short(n_length))
		return false;

	if(n_length < 2)
		return false;
	// length counts its own two bytes; without this check the subtraction below
	// would wrap around and the loop would parse tables far beyond the block
#endif
	n_length -= 2;

	while(n_length >= 65) {
		uint8_t n_quant_table_properties;
#ifdef __JPEG_DEC_STRIP_ERROR_CHECKS
		p_file_buffer->Read_Byte(n_quant_table_properties);
#else
		if(!p_file_buffer->Read_Byte(n_quant_table_properties))
			return false;
#endif
		n_length --;

#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
		if((n_quant_table_properties & 0xf) > 3)
			return false;
		// index of quantization table must be <0, 3>

		if((n_quant_table_properties >> 4) != 0 && (n_quant_table_properties >> 4) != 1)
			return false;
		// quantization table precision must be 0 (8 bit) or 1 (16 bit)
#endif

		TQuantTable *p_cur_quant_table = &p_this->m_p_quant_table[n_quant_table_properties & 0x0f];
		p_cur_quant_table->b_16_bit = (n_quant_table_properties >> 4) == 1;

#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
		if(n_length < ((p_cur_quant_table->b_16_bit)? 128 : 64))
			return false;
#endif
		n_length -= (p_cur_quant_table->b_16_bit)? 128 : 64;
		// there must be enough space for table data

#ifdef __JPEG_DEC_INTEGER_IDCT
		for(int *p_value = p_cur_quant_table->p_value,
		   *p_end = p_cur_quant_table->p_value + 64; p_value < p_end; ++ p_value) {
			if(p_cur_quant_table->b_16_bit) {
				uint16_t n_tmp;
#ifdef __JPEG_DEC_STRIP_ERROR_CHECKS
				p_file_buffer->Read_Short(n_tmp);
#else
				if(!p_file_buffer->Read_Short(n_tmp))
					return false;
#endif
				*p_value = n_tmp; // todo - get file with such a quant_table and see what should be here
			} else {
				uint8_t n_tmp;
#ifdef __JPEG_DEC_STRIP_ERROR_CHECKS
				p_file_buffer->Read_Byte(n_tmp);
#else
				if(!p_file_buffer->Read_Byte(n_tmp))
					return false;
#endif
				*p_value = n_tmp;
			}
		}
#else // __JPEG_DEC_INTEGER_IDCT
		for(float *p_value = p_cur_quant_table->p_value,
		   *p_end = p_cur_quant_table->p_value + 64; p_value < p_end; ++ p_value) {
			if(p_cur_quant_table->b_16_bit) {
				uint16_t n_tmp;
#ifdef __JPEG_DEC_STRIP_ERROR_CHECKS
				p_file_buffer->Read_Short(n_tmp);
#else
				if(!p_file_buffer->Read_Short(n_tmp))
					return false;
#endif
				*p_value = (float)n_tmp; // todo - get file with such a quant_table and see what should be here
			} else {
				uint8_t n_tmp;
#ifdef __JPEG_DEC_STRIP_ERROR_CHECKS
				p_file_buffer->Read_Byte(n_tmp);
#else
				if(!p_file_buffer->Read_Byte(n_tmp))
					return false;
#endif
				*p_value = (float)n_tmp;
			}
		}
#endif // __JPEG_DEC_INTEGER_IDCT
		// read table data

#ifdef __JPEG_DEC_VERBOSE
		printf("read quant table [n_index=%d, n_precision=%d]\n", n_quant_table_properties & 0x0f,
			n_quant_table_properties >> 4);
		{
			const char p_fwd_zig_indices[64] = {
				 0,  1,  5,  6, 14, 15, 27, 28,
				 2,  4,  7, 13, 16, 26, 29, 42,
				 3,  8, 12, 17, 25, 30, 41, 43,
				 9, 11, 18, 24, 31, 40, 44, 53,
				10, 19, 23, 32, 39, 45, 52, 54,
				20, 22, 33, 38, 46, 51, 55, 60,
				21, 34, 37, 47, 50, 56, 59, 61,
				35, 36, 48, 49, 57, 58, 62, 63
			};

			for(int i = 0; i < 64; ++ i) {
				printf("%5d%c", (int)p_cur_quant_table->p_value[p_fwd_zig_indices[i]],
					((i % 8) == 7)? '\n' : ' ');
			}
		}
		// debug (the values are printed before pre-scaling)
#endif

		for(int i = 0; i < 64; ++ i) {
			int n_index = p_zig_indices[i]; // removed 1st element
#ifdef __JPEG_DEC_INTEGER_IDCT
			p_cur_quant_table->p_value[i] = (int)((p_cur_quant_table->p_value[i] *
				CFastDCT8_2D::p_PrescaleTable()[n_index / 8] *
				CFastDCT8_2D::p_PrescaleTable()[n_index % 8]) * 0x10000);
#else
			p_cur_quant_table->p_value[i] *=
				CFastDCT8_2D::p_PrescaleTable()[n_index / 8] *
				CFastDCT8_2D::p_PrescaleTable()[n_index % 8];
#endif
		}
		// hack! use scale factor here and i can use faster IDCT function
	}

	_ASSERTE(!n_length);
	// the block is supposed to contain complete tables only

	return true;
}

/*
 *	bool CTinyJpegDecoder::Read_HuffmanTables(uint16_t n_block_id,
 *		CTinyJpegDecoder *p_this, CBufferredFile *p_file_buffer)
 *		- read huffman tables (the block may contain several tables, each one is
 *		  a properties byte (table class in high nibble, table index 0-3 in low nibble),
 *		  16 bytes with numbers of codes of length 1 to 16 bits and the symbols themselves)
 *		- for every code length, pointer to its first symbol and its minimal
 *		  code value are stored in the table
 *		- table with more than 256 symbols is rejected
 *		- note bytes left in the block after the last table are only asserted on,
 *		  they are not skipped
 *		- return true on success, false on failure
 */
bool CTinyJpegDecoder::Read_HuffmanTables(uint16_t UNUSED(n_block_id),
	CTinyJpegDecoder *p_this, CBufferredFile *p_file_buffer) // read huffman table
{
	uint16_t n_length;
#ifdef __JPEG_DEC_STRIP_ERROR_CHECKS
	p_file_buffer->Read_Short(n_length);
#else
	if(!p_file_buffer->Read_Short(n_length))
		return false;

	if(n_length < 2)
		return false;
	// length counts its own two bytes; without this check the subtraction below
	// would wrap around and the loop would parse tables far beyond the block
#endif
	n_length -= 2;

	while(n_length >= 17) {
		uint8_t n_huff_table_properties;
#ifdef __JPEG_DEC_STRIP_ERROR_CHECKS
		p_file_buffer->Read_Byte(n_huff_table_properties);
#else
		if(!p_file_buffer->Read_Byte(n_huff_table_properties))
			return false;
#endif
		n_length --;

#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
		if((n_huff_table_properties & 0xf) > 3)
			return false;
		// index of huffman table must be <0, 3>

		if((n_huff_table_properties >> 4) != 0 && (n_huff_table_properties >> 4) != 1)
			return false;
		// table class must be 0 (DC) or 1 (AC)
#endif

		THuffmanTable *p_cur_huff_table = &p_this->m_p_huffman_table[n_huff_table_properties &
			0xf][n_huff_table_properties >> 4];
		//bool b_AC_table = (n_huff_table_properties >> 4) == 1; // unused

		int n_tot_component_num = 0;
		{int n_index = 0, n_code_value = 0;
		for(uint8_t *p_component_num = p_cur_huff_table->p_code_num,
		   *p_end = p_cur_huff_table->p_code_num + 16; p_component_num < p_end;
		   ++ p_component_num, ++ n_index) {
#ifdef __JPEG_DEC_STRIP_ERROR_CHECKS
			p_file_buffer->Read_Byte(*p_component_num);
#else
			if(!p_file_buffer->Read_Byte(*p_component_num))
				return false;
#endif
			p_cur_huff_table->p_code[n_index] = &p_cur_huff_table->p_code_table[n_tot_component_num];
			// set pointer to code list

			p_cur_huff_table->p_min_code_value[n_index] = n_code_value;
			n_code_value += *p_component_num;
			//p_cur_huff_table->n_max_code_value[n_index] = n_code_value; // it's easy to calculate, don't store it
			n_code_value <<= 1;
			// manage minimal and maximal code values

			n_tot_component_num += *p_component_num;

#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
			if(n_tot_component_num > 256)
				return false;
			// symbols are 8-bit so there can't be more than 256 of them; 16 count bytes
			// could otherwise add up to 4080 symbols being written to p_code_table
			// (presumably p_code_table holds 256 symbols - TODO confirm in THuffmanTable)
#endif
		}}
		n_length -= 16;
		// read total component numbers, assign pointers

#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
		if(n_length < n_tot_component_num)
			return false;
#endif
		n_length -= n_tot_component_num;
		// there must be enough space for table data

		for(uint8_t *p_component = p_cur_huff_table->p_code_table,
		   *p_end = p_cur_huff_table->p_code_table + n_tot_component_num;
		   p_component < p_end; ++ p_component) {
#ifdef __JPEG_DEC_STRIP_ERROR_CHECKS
			p_file_buffer->Read_Byte(*p_component);
#else
			if(!p_file_buffer->Read_Byte(*p_component))
				return false;
#endif
		}
		// read all table components

		p_cur_huff_table->b_valid = true;
		// we read it

#ifdef __JPEG_DEC_VERBOSE
		printf("read huffman table [n_index=%d, b_AC=%d] with %d components\n\t",
			n_huff_table_properties & 0xf, n_huff_table_properties >> 4, n_tot_component_num);
		for(int i = 0; i < 16; ++ i)
			printf("%d%c", p_cur_huff_table->p_code_num[i], (i == 15)? '\n' : ' ');
		printf("it means this table defines those symbols:\n");
		int n_code = 0;
		for(int i = 0; i < 16; ++ i) {
			if(p_cur_huff_table->p_code_num[i]) {
				_ASSERTE(p_cur_huff_table->p_min_code_value[i] == n_code /*&&
					p_cur_huff_table->n_max_code_value[i] == n_code + p_cur_huff_table->p_code_num[i]*/);
				printf("\t%d bit codes %d-%d == <%d,%d): ", i + 1, n_code, n_code +
					p_cur_huff_table->p_code_num[i] - 1, p_cur_huff_table->p_min_code_value[i],
					p_cur_huff_table->p_min_code_value[i] + p_cur_huff_table->p_code_num[i]);
				for(int j = n_code; j < n_code + p_cur_huff_table->p_code_num[i]; ++ j) {
					for(int n = 0; n <= i; ++ n)
						printf("%d", (j >> (i - n)) & 1);
					printf(" ");
				}
				printf("\n");
			}
			n_code += p_cur_huff_table->p_code_num[i];
			n_code *= 2;
		}
		// debug
#endif
	}

	_ASSERTE(!n_length);
	// the block is supposed to contain complete tables only

	return true;
}

/*
 *	bool CTinyJpegDecoder::Read_FrameHeader(uint16_t n_block_id,
 *		CTinyJpegDecoder *p_this, CBufferredFile *p_file_buffer)
 *		- read frame header and alloc temporary buffers for Y(CbCr) planes
 *		- return true on success, false on failure
 */
bool CTinyJpegDecoder::Read_FrameHeader(uint16_t UNUSED(n_block_id),
	CTinyJpegDecoder *p_this, CBufferredFile *p_file_buffer)
{
	uint16_t n_length;
#ifdef __JPEG_DEC_STRIP_ERROR_CHECKS
	p_file_buffer->Read_Short(n_length);
#else
	if(!p_file_buffer->Read_Short(n_length))
		return false;
#endif

#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
	if(n_length < 8)
		return false;
#endif

	TImageHeader *p_header = &p_this->m_t_image_header;

#ifdef __JPEG_DEC_STRIP_ERROR_CHECKS
	p_file_buffer->Read_Byte(p_header->n_sample_precision);
	p_file_buffer->Read_Short(p_header->n_height);
	p_file_buffer->Read_Short(p_header->n_width);
	p_file_buffer->Read_Byte(p_header->n_component_num);
#else // __JPEG_DEC_STRIP_ERROR_CHECKS
	if(!p_file_buffer->Read_Byte(p_header->n_sample_precision) ||
	   !p_file_buffer->Read_Short(p_header->n_height) ||
	   !p_file_buffer->Read_Short(p_header->n_width) ||
	   !p_file_buffer->Read_Byte(p_header->n_component_num) ||
	   p_header->n_height < 0 || p_header->n_width < 0 ||
	   p_header->n_component_num == 0 || p_header->n_component_num > __JPEG_DEC_MAX_COMPONENT_NUM)
		return false;

#ifdef __JPEG_DEC_SUPPORT_LOSSLESS
	if((n_block_id == marker_SOF3_HuffLossless && (p_header->n_sample_precision < 2 ||
	   p_header->n_sample_precision > 12)) || (n_block_id != marker_SOF3_HuffLossless &&
	   p_header->n_sample_precision != 8 && p_header->n_sample_precision != 12))
		return false;
#else // __JPEG_DEC_SUPPORT_LOSSLESS
	if(p_header->n_sample_precision != 8 && p_header->n_sample_precision != 12)
		return false;
#endif // __JPEG_DEC_SUPPORT_LOSSLESS
	// see if sample precision

#if defined(__JPEG_DEC_SUPPORT_CMYK) || defined(__JPEG_DEC_SUPPORT_YCCK)
		if(p_header->n_component_num > 4)
			return false;
#else
		if(p_header->n_component_num > 3)
			return false;
#endif
	// see if component num is valid

#ifndef __JPEG_DEC_SUPPORT_RG
		if(p_header->n_component_num == 2)
			return false;
#endif
	// see if component num is valid

#endif // __JPEG_DEC_STRIP_ERROR_CHECKS
	// read header
	
#ifdef __JPEG_DEC_VERBOSE
	printf("read frame header [n_bpp=%d, n_width=%d, n_height=%d, n_component_num=%d]\n",
		p_header->n_sample_precision, p_header->n_width, p_header->n_height, p_header->n_component_num);
	// debug
#endif

#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
	if(n_length < 8 + p_header->n_component_num * 3)
		return false;
#endif
	_ASSERTE(n_length == 8 + p_header->n_component_num * 3);
	// there should be enough space for component specification (or better exact as much space)

	uint8_t n_max_sampling_horiz = 0;
	uint8_t n_max_sampling_vert = 0;
	for(TComponentInfo *p_comp_info = p_header->p_component_info,
	   *p_end = p_header->p_component_info + p_header->n_component_num;
	   p_comp_info < p_end; ++ p_comp_info) {
		uint8_t n_sampling;
#ifdef __JPEG_DEC_STRIP_ERROR_CHECKS
		p_file_buffer->Read_Byte(p_comp_info->n_component_id);
		p_file_buffer->Read_Byte(n_sampling);
		p_file_buffer->Read_Byte(p_comp_info->n_quant_table_index);
#else
		if(!p_file_buffer->Read_Byte(p_comp_info->n_component_id) ||
		   !p_file_buffer->Read_Byte(n_sampling) ||
		   !p_file_buffer->Read_Byte(p_comp_info->n_quant_table_index) ||
		   p_comp_info->n_component_id < 1 || p_comp_info->n_component_id >
		   (p_header->n_component_num + 1) || p_comp_info->n_quant_table_index > 3)
			return false;

#endif
		p_comp_info->n_sampling_horiz = n_sampling >> 4;
		p_comp_info->n_sampling_vert = n_sampling & 0xf;
		if(n_max_sampling_horiz < p_comp_info->n_sampling_horiz)
			n_max_sampling_horiz = p_comp_info->n_sampling_horiz;
		if(n_max_sampling_vert < p_comp_info->n_sampling_vert)
			n_max_sampling_vert = p_comp_info->n_sampling_vert;
#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
		if((p_comp_info->n_sampling_horiz != 1 && p_comp_info->n_sampling_horiz != 2 &&
		   p_comp_info->n_sampling_horiz != 4) || (p_comp_info->n_sampling_vert != 1 &&
		   p_comp_info->n_sampling_vert != 2 && p_comp_info->n_sampling_vert != 4))
			return false;
#endif
	}
	// read component specifications

	p_header->n_macro_block_x_num = (p_header->n_width + (n_max_sampling_horiz * 8 - 1)) /
		(n_max_sampling_horiz * 8); // t_odo - here's an error
	p_header->n_macro_block_num = p_header->n_macro_block_x_num *
		((p_header->n_height + (n_max_sampling_vert * 8 - 1)) / (n_max_sampling_vert * 8));
	// number of macro-blocks (not 64-element blocks in case image has other sampling than 1x1)

#ifdef __JPEG_DEC_VERBOSE
	printf("\tn_macro_block_x_num=%d n_macro_block_y_num=%d (%d blocks)\n", p_header->n_macro_block_x_num,
		p_header->n_macro_block_num / p_header->n_macro_block_x_num, p_header->n_macro_block_num);
	printf("\tn_max_sampling_horiz=%d n_max_sampling_vert=%d\n", n_max_sampling_horiz, n_max_sampling_vert);
#endif

	for(TComponentInfo *p_comp_info = p_header->p_component_info,
	   *p_end = p_header->p_component_info + p_header->n_component_num;
	   p_comp_info < p_end; ++ p_comp_info) {
		p_comp_info->n_scale_horiz = n_max_sampling_horiz / p_comp_info->n_sampling_horiz;
		p_comp_info->n_scale_vert = n_max_sampling_vert / p_comp_info->n_sampling_vert;
#ifdef __JPEG_DEC_VERBOSE
		printf("\tcomponent [n_id=%d, v_sampling=%dx%d, v_scale=%dx%d, n_quant_table_index=%d]\n",
			p_comp_info->n_component_id, p_comp_info->n_sampling_horiz, p_comp_info->n_sampling_vert,
			p_comp_info->n_scale_horiz, p_comp_info->n_scale_vert, p_comp_info->n_quant_table_index);
		// debug
#endif
	}
	// determine component scaling

	p_this->m_t_image_data.b_read_image_header = true;
#ifdef __JPEG_DEC_SUPPORT_LOSSLESS
	p_this->m_t_image_data.b_lossless = n_block_id == marker_SOF3_HuffLossless;
#endif

	if(p_this->m_t_image_data.b_want_image) {
		for(TComponentInfo *p_comp_info = p_header->p_component_info,
		   *p_end = p_header->p_component_info + p_header->n_component_num;
		   p_comp_info < p_end; ++ p_comp_info) {
			int n = p_comp_info->n_component_id - 1;
			p_this->m_t_image_data.p_framebuffer_width[n] = (p_header->n_width +
				p_comp_info->n_scale_horiz - 1) / p_comp_info->n_scale_horiz;
			p_this->m_t_image_data.p_framebuffer_height[n] = (p_header->n_height +
				p_comp_info->n_scale_vert - 1) / p_comp_info->n_scale_vert;
			// calc component image dimensions, based on sampling

			if(p_this->m_t_image_data.b_want_thumbnail) {
				p_this->m_t_image_data.p_framebuffer_width[n] /= 8;
				p_this->m_t_image_data.p_framebuffer_height[n] /= 8;
				if(!p_this->m_t_image_data.p_framebuffer_width[n])
					p_this->m_t_image_data.p_framebuffer_width[n] = 1;
				if(!p_this->m_t_image_data.p_framebuffer_height[n])
					p_this->m_t_image_data.p_framebuffer_height[n] = 1;
			}
			// in case we're decoding thumbnails, we want image to be 8 times smaller

			if(p_this->m_t_image_data.p_framebuffer_width[n] & 0x7) {
				p_this->m_t_image_data.p_framebuffer_width[n] &= ~0x7;
				p_this->m_t_image_data.p_framebuffer_width[n] += 8;
			}
			if(p_this->m_t_image_data.p_framebuffer_height[n] & 0x7) {
				p_this->m_t_image_data.p_framebuffer_height[n] &= ~0x7;
				p_this->m_t_image_data.p_framebuffer_height[n] += 8;
			}
			// round to closest greater multiple of eight so in filling loop i can be sure there is
			// either 8 pixels or none at all, which enables loop unrolling, etc

			if(!(p_this->m_t_image_data.p_framebuffer[n] =
			   new(std::nothrow) uint8_t[p_this->m_t_image_data.p_framebuffer_width[n] *
			   p_this->m_t_image_data.p_framebuffer_height[n]]))
				return false;
		}
	}
	// alloc frame-buffers

	return true;
}

/*
 *	bool CTinyJpegDecoder::Read_StartScan(uint16_t n_block_id,
 *		CTinyJpegDecoder *p_this, CBufferredFile *p_file_buffer)
 *		- read start scan block and decode the image
 *		- return true on success, false on failure
 */
bool CTinyJpegDecoder::Read_StartScan(uint16_t UNUSED(n_block_id),
	CTinyJpegDecoder *p_this, CBufferredFile *p_file_buffer)
{
#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
	if(!p_this->m_t_image_data.b_read_image_header)
		return false;
	// must've read image header already
#endif

	uint16_t n_length;
#ifdef __JPEG_DEC_STRIP_ERROR_CHECKS
	p_file_buffer->Read_Short(n_length);
#else
	if(!p_file_buffer->Read_Short(n_length))
		return false;
	if(n_length < 6)
		return false;
#endif

	TScanHeader t_scan_head;

#ifdef __JPEG_DEC_STRIP_ERROR_CHECKS
	p_file_buffer->Read_Byte(t_scan_head.n_component_num);
#else
	if(!p_file_buffer->Read_Byte(t_scan_head.n_component_num) ||
	   t_scan_head.n_component_num != p_this->m_t_image_header.n_component_num)
		return false;
#endif
	_ASSERTE(t_scan_head.n_component_num == p_this->m_t_image_header.n_component_num);
	// read number of components

#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
	if(n_length < 6 + 2 * t_scan_head.n_component_num)
		return false;
#endif
	_ASSERTE(n_length == 6 + 2 * t_scan_head.n_component_num);

#ifdef __JPEG_DEC_VERBOSE
	printf("start of scan [n_component_num=%d]\n", t_scan_head.n_component_num);
	// debug
#endif

	for(TScanHeader::TComponentInfo2 *p_comp = t_scan_head.p_component_info,
	   *p_end = t_scan_head.p_component_info + t_scan_head.n_component_num;
	   p_comp < p_end; ++ p_comp) {
		uint8_t n_entropy_table;
#ifdef __JPEG_DEC_STRIP_ERROR_CHECKS
		p_file_buffer->Read_Byte(p_comp->n_component_id);
		p_file_buffer->Read_Byte(n_entropy_table);
#else
		if(!p_file_buffer->Read_Byte(p_comp->n_component_id) ||
		   !p_file_buffer->Read_Byte(n_entropy_table))
			return false;
#endif
		p_comp->n_DC_entropy_table = n_entropy_table >> 4;
		p_comp->n_AC_entropy_table = n_entropy_table & 0xf;
#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
		if(p_comp->n_DC_entropy_table > 3 || p_comp->n_AC_entropy_table > 3)
			return false;
#endif

		p_comp->p_component = 0;
		for(TComponentInfo *p_ih_comp = p_this->m_t_image_header.p_component_info,
		   *p_ih_end = p_this->m_t_image_header.p_component_info + t_scan_head.n_component_num;
		   p_ih_comp < p_ih_end; ++ p_ih_comp) {
			if(p_ih_comp->n_component_id == p_comp->n_component_id) {
				p_comp->p_component = p_ih_comp;
				break;
			}
		}
#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
		if(!p_comp->p_component)
			return false;
#endif
		// find the component in image header

		p_comp->p_DC_table = &p_this->m_p_huffman_table[p_comp->n_DC_entropy_table][0];
		p_comp->p_AC_table = &p_this->m_p_huffman_table[p_comp->n_AC_entropy_table][1];
		// find huffman tables

#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
#ifdef __JPEG_DEC_SUPPORT_LOSSLESS
		if(!p_comp->p_DC_table->b_valid || (!p_this->m_t_image_data.b_lossless && !p_comp->p_AC_table->b_valid))
			return false;
#else // __JPEG_DEC_SUPPORT_LOSSLESS
		if(!p_comp->p_DC_table->b_valid || !p_comp->p_AC_table->b_valid)
			return false;
#endif // __JPEG_DEC_SUPPORT_LOSSLESS
		// we didn't specify it!
#endif

#ifdef __JPEG_DEC_VERBOSE
		printf("\tcomponent [n_component_id=%d, n_DC_entropy_table=%d, n_AC_entropy_table=%d]\n",
			p_comp->n_component_id, p_comp->n_DC_entropy_table, p_comp->n_AC_entropy_table);
		// debug
#endif
	}
	// read components

#ifdef __JPEG_DEC_STRIP_ERROR_CHECKS
	p_file_buffer->Read_Byte(t_scan_head.n_spectrum_first);
	p_file_buffer->Read_Byte(t_scan_head.n_spectrum_last);
	p_file_buffer->Read_Byte(t_scan_head.n_sucessive_approximation);
#else
	if(!p_file_buffer->Read_Byte(t_scan_head.n_spectrum_first) ||
	   !p_file_buffer->Read_Byte(t_scan_head.n_spectrum_last) ||
	   !p_file_buffer->Read_Byte(t_scan_head.n_sucessive_approximation))
		return false;
#endif
#ifdef __JPEG_DEC_SUPPORT_LOSSLESS
	_ASSERTE((!p_this->m_t_image_data.b_lossless && t_scan_head.n_spectrum_first == 0 &&
		t_scan_head.n_spectrum_last == 63) || (p_this->m_t_image_data.b_lossless &&
		/*t_scan_head.n_spectrum_first >= 0 &&*/ t_scan_head.n_spectrum_first < 8 && // unsigned char is always >= 0
		t_scan_head.n_spectrum_last == 0));
#else
	_ASSERTE(t_scan_head.n_spectrum_first == 0 && t_scan_head.n_spectrum_last == 63);
#endif
	// read some more info (parts of it used for lossless mode, part for point transform,
	// part of it is for progressive DCT mode - progressive using saving just a part of
	// coefficients - image quality improves as more and more coefficients are read or
	// using successive approximation (saving component's integer most important bytes,
	// then lower and lower important bytes ...))

	CBitReader bit_reader; // blocks come in single bit-stream

#ifdef __JPEG_DEC_SUPPORT_LOSSLESS
	if(p_this->m_t_image_data.b_lossless) {
		if(!p_this->Decode_Lossless_Image(&t_scan_head, bit_reader, p_file_buffer))
			return false;
	} else
#endif
	{
		bool b_thumbnail = p_this->m_t_image_data.b_want_thumbnail;
		// request to decode thumbnail only?

		for(int i = 0; i < p_this->m_t_image_header.n_macro_block_num; ++ i) {
			if(!b_thumbnail) {
				if(!p_this->Decode_Baseline_MacroBlock(i, &t_scan_head, bit_reader, p_file_buffer))
					return false;
			} else {
				if(!p_this->Decode_Baseline_MacroBlock_DCOnly(i, &t_scan_head, bit_reader, p_file_buffer))
					return false;
			}
			if(p_this->m_t_restart.b_enabled && (i + 1) % p_this->m_t_restart.n_interval == 0 &&
			   i + 1 < p_this->m_t_image_header.n_macro_block_num) {
				for(int n = 0; n < p_this->m_t_image_header.n_component_num; ++ n)
					p_this->m_t_image_data.p_dc_value[n] = 0;
				// reset DC values

				uint16_t n_marker_code;
				if(!p_file_buffer->Read_Marker(n_marker_code))
					return false;
#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
				if(n_marker_code < marker_RestartMod0 ||
				   n_marker_code > marker_RestartMod7)
					return false;
#endif
				// there has to be an restart marker

#ifdef __JPEG_DEC_DECODE_VERBOSE
				printf("\tencountered restart marker %d\n", n_marker_code - marker_RestartMod0);
				// debug
#endif

				bit_reader.Restart();
				// restart bit reader (will fetch another byte)
			}
		}
		// DCT baseline / DCT extended
	}
	// now the file contains image data, it needs to be decoded now ...

	return true;
}

#ifdef __JPEG_DEC_SUPPORT_LOSSLESS

/*
 *	bool CTinyJpegDecoder::Decode_Lossless_Image(const TScanHeader *p_scan_head,
 *		CBitReader &r_bit_reader, CBufferredFile *p_file_buffer)
 *		- decode the whole image, encoded in lossless mode (can't work in macro-blocks,
 *		  since in lossless compression, image data block is pixel and decompression
 *		  would be slow because of function call overhead)
 *		- for every pixel and every scan component, a huffman code is read using the
 *		  component's DC table, the decoded value is combined with a prediction made
 *		  from already decoded neighbor samples and the result is written to the
 *		  component framebuffer as an 8-bit sample
 *		- p_scan_head is scan header; the list of scan components is taken from it and
 *		  its n_spectrum_first field is used as the predictor selection
 *		- r_bit_reader is class, reading data from file bit by bit (it has to be supplied
 *		  from outside as macro-blocks are not byte aligned, ie. bits from last byte, read
 *		  by one call to this function will be required for proper decoding in successive call)
 *		- p_file_buffer is buffered file reader class
 *		- return true on success, false on failure (a component with sampling other
 *		  than 1x1, bit reader error or no huffman code found within 16 bits; these
 *		  checks are compiled out if __JPEG_DEC_STRIP_ERROR_CHECKS is defined)
 *		- note if sample precision is below 8 bits, the decoded samples are re-scaled
 *		  at the end so the maximal sample value maps to 255
 */
bool CTinyJpegDecoder::Decode_Lossless_Image(const TScanHeader *p_scan_head,
	CBitReader &r_bit_reader, CBufferredFile *p_file_buffer)
{
#ifdef __JPEG_DEC_USE_LOSSLESS_NONSTD_RLE
	const uint8_t *p_framebuffer_end[__JPEG_DEC_MAX_COMPONENT_NUM];
#endif
	uint8_t *p_framebuffer_begin[__JPEG_DEC_MAX_COMPONENT_NUM];
	uint8_t *p_framebuffer[__JPEG_DEC_MAX_COMPONENT_NUM];
	uint16_t p_framebuffer_width[__JPEG_DEC_MAX_COMPONENT_NUM];
	const THuffmanTable *p_huffman_table[__JPEG_DEC_MAX_COMPONENT_NUM];
	int16_t *p_dc_value[__JPEG_DEC_MAX_COMPONENT_NUM];
	// sorted arrays of values, necessary for decoding (all here for fast access)
	// (indexed by position of the component in the scan, not by component id;
	// p_framebuffer holds the current write position, p_framebuffer_begin the
	// first sample of each component)

	{int n_index = 0;
	for(const TScanHeader::TComponentInfo2 *p_comp = p_scan_head->p_component_info,
	   *p_end = p_scan_head->p_component_info + p_scan_head->n_component_num;
	   p_comp < p_end; ++ p_comp, ++ n_index) {
#if !defined(__JPEG_DEC_STRIP_ERROR_CHECKS) && !defined(__JPEG_DEC_ALLOW_SUBSAMPLED_IMAGES)
		if(p_comp->p_component->n_sampling_horiz != 1 ||
		   p_comp->p_component->n_sampling_vert != 1)
			return false;
#endif
		// component ids are one-based indices to the arrays in m_t_image_data
		p_framebuffer[n_index] = m_t_image_data.p_framebuffer[p_comp->n_component_id - 1];
		p_framebuffer_width[n_index] =
			m_t_image_data.p_framebuffer_width[p_comp->n_component_id - 1];
		p_huffman_table[n_index] = p_comp->p_DC_table;
		p_dc_value[n_index] = &m_t_image_data.p_dc_value[p_comp->n_component_id - 1];
		p_framebuffer_begin[n_index] = p_framebuffer[n_index];
#ifdef __JPEG_DEC_USE_LOSSLESS_NONSTD_RLE
		p_framebuffer_end[n_index] = p_framebuffer[n_index] +
			p_framebuffer_width[n_index] * m_t_image_header.n_height;
#endif
	}}
	// assign framebuffers and make sure sampling of every component is 1x1

	char n_sample_bit_shift = (m_t_image_header.n_sample_precision > 8)?
		m_t_image_header.n_sample_precision - 8 : 8 - m_t_image_header.n_sample_precision;
	bool b_sample_shift_right = m_t_image_header.n_sample_precision > 8;
	// decoded values are converted to 8-bit precision; the shift is the absolute
	// difference between sample precision and 8, direction depends on which is greater
	uint8_t n_selected_predictor = p_scan_head->n_spectrum_first;
	// predictor selection comes in the n_spectrum_first field of the scan header

#ifdef __JPEG_DEC_ALLOW_SUBSAMPLED_IMAGES
#error "code path for arbitrary sampled lossless images was not written yet (next library version)"
#endif

	_ASSERTE(p_scan_head->n_component_num);
	for(int j = 0; j < m_t_image_header.n_height; ++ j) {
		uint8_t n_predictor = (j)? 2 : 0; // predictor for first scanline pixel; 0 means no prediction
		uint8_t n_predictor2 = (j)? n_selected_predictor : 1; // predictor for the rest of scanline
		// (on the first scanline there's no row above, so only the left neighbor is used)
		for(int i = 0; i < m_t_image_header.n_width; ++ i, n_predictor = n_predictor2) {
			const THuffmanTable **p_huffman_table_ptr = &p_huffman_table[0];
			const uint16_t *p_framebuffer_width_ptr = &p_framebuffer_width[0];
			int16_t **p_dc_value_ptr = &p_dc_value[0];
#ifdef __JPEG_DEC_USE_LOSSLESS_NONSTD_RLE
			const uint8_t **p_framebuffer_end_ptr = &p_framebuffer_end[0];
			uint8_t **p_framebuffer_begin_ptr = &p_framebuffer_begin[0];
#endif
			// the loop below walks all the per-component arrays in parallel; note that
			// p_framebuffer_ptr is not incremented in the loop header, it is advanced
			// inside the body (at the point where the sample is written or skipped)
			for(uint8_t **p_framebuffer_ptr = &p_framebuffer[0],
			   **p_end_framebuffer = &p_framebuffer[0] + p_scan_head->n_component_num;
			   p_framebuffer_ptr < p_end_framebuffer; ++ p_huffman_table_ptr,
			   ++ p_framebuffer_width_ptr, ++ p_dc_value_ptr
#ifdef __JPEG_DEC_USE_LOSSLESS_NONSTD_RLE
			   , ++ p_framebuffer_end_ptr, ++ p_framebuffer_begin_ptr
#endif
			   ) {
#ifdef __JPEG_DEC_USE_LOSSLESS_NONSTD_RLE
				if(*p_framebuffer_ptr >= *p_framebuffer_end_ptr) {
					++ p_framebuffer_ptr; // !! (next component layer)
					continue;
				}/* else if(!i) {
					i += (*p_framebuffer_ptr - p_framebuffer_begin[p_framebuffer_ptr - &p_framebuffer[0]]) %
						*p_framebuffer_width_ptr;
					// RLE may run over the end of the scanline ... watch out so i is always right

					if(i)
						n_predictor = n_predictor2;
					// in case we continue with some of the next pixels, we need the right predictor
				}*/ // must not tamper with i because of the other component layers, predictor selection is overridden for RLE
				// this component layer was decompressed earlier due to RLE compression
#endif

				int16_t n_pixel_value;
#ifdef __JPEG_DEC_USE_LOSSLESS_NONSTD_RLE
				int16_t n_run_length;
#endif
				{uint16_t n_number = 0;
				// k is the code length minus one; the loop has no condition, it is
				// left either by break (code found) or by one of the returns below
				for(int k = 0; /*k < 16*/; ++ k) {
					n_number <<= 1;
					n_number |= r_bit_reader.n_GetBit(p_file_buffer);
#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
					if(r_bit_reader.b_Error())
						return false;
#endif
					// get another bit to our number

					if(//n_number >= (*p_huffman_table_ptr)->p_min_code_value[k] &&
					   n_number < (*p_huffman_table_ptr)->p_min_code_value[k] +
					   (*p_huffman_table_ptr)->p_code_num[k]) {
						_ASSERTE(n_number >= (*p_huffman_table_ptr)->p_min_code_value[k]);
						// we have code in table

#ifdef __JPEG_DEC_DECODE_VERBOSE
						printf("DC %d, ", n_number);
						// debug
#endif

						uint8_t n_huff_data = (*p_huffman_table_ptr)->p_code[k][n_number -
							(*p_huffman_table_ptr)->p_min_code_value[k]];
						// read data from huffman table; low nibble is length of the
						// encoded value, in bits (high nibble is used for run-length
						// in the non-standard RLE mode only)

#ifdef __JPEG_DEC_USE_LOSSLESS_NONSTD_RLE
						n_run_length = (int16_t)(n_huff_data >> 4) + 1;
						// extract run-length
#endif

						if(n_huff_data & 0xf) {
							**p_dc_value_ptr += r_bit_reader.n_GetNumber(n_huff_data & 0xf, p_file_buffer);
#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
							if(r_bit_reader.b_Error())
								return false;
#endif
						}
						n_pixel_value = **p_dc_value_ptr;
						// NOTE(review): the decoded number is accumulated in the per-component
						// DC value and the accumulated value is later subtracted from the
						// prediction; presumably this matches the author's encoder - confirm
						// against ITU-T T.81 annex H before feeding it standard lossless streams

						//printf("%dx %d\n", n_run_length, n_pixel_value);
						// debug

						break;
					}
#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
					if(k == 15)
						return false;
					// no code matched within 16 bits
#endif
				}}
				// decode DC value

#ifdef __JPEG_DEC_USE_LOSSLESS_NONSTD_RLE
				int n_position = *p_framebuffer_ptr - *p_framebuffer_begin_ptr;
				if(n_position < *p_framebuffer_width_ptr) {
					if(n_position)
						n_predictor = 1;
					else
						n_predictor = 0;
				} else if(n_position % *p_framebuffer_width_ptr == 0)
					n_predictor = 2;
				else
					n_predictor = n_selected_predictor;
				// choose predictor for each component individually ... necessary for interleaved RLE samples
#endif

				int16_t n_division_mask = (m_t_image_header.n_sample_precision < 8)?
					((1 << m_t_image_header.n_sample_precision) - 1) <<
					(8 - m_t_image_header.n_sample_precision) : 0xffff;
				// in case predictor uses division by 2 and sample precision is less than 8,
				// division result has to be rounded as if it was calculated in source sample
				// precision (it's calculated in 8-bit precision)

				// in the expressions below, [-1] is the sample to the left (A),
				// [-width] is the sample above (B) and [-1 - width] is the sample
				// above-left (C) of the sample being decoded
				int16_t n_prediction;
				if(n_predictor > 3) { // 4, 5, 6, 7
					if(n_predictor >= 6) { // 6, 7
						if(n_predictor == 6) { // 6 (pred_B_plus_A_minus_C_half)
							n_prediction = (*p_framebuffer_ptr)[- *p_framebuffer_width_ptr] +
								((((*p_framebuffer_ptr)[-1] - (*p_framebuffer_ptr)[-1 -
								*p_framebuffer_width_ptr]) >> 1) & n_division_mask);
						} else { // 7 (pred_A_plus_B_half)
							n_prediction = (((*p_framebuffer_ptr)[-1] +
								(*p_framebuffer_ptr)[- *p_framebuffer_width_ptr]) >> 1) &
								n_division_mask;
						}
					} else { // 4, 5
						if(n_predictor == 4) { // 4 (pred_A_plus_B_minus_C)
							n_prediction = (*p_framebuffer_ptr)[-1] +
								(*p_framebuffer_ptr)[- *p_framebuffer_width_ptr] -
								(*p_framebuffer_ptr)[-1 - *p_framebuffer_width_ptr];
						} else { // 5 (pred_A_plus_B_minus_C_half)
							n_prediction = (*p_framebuffer_ptr)[-1] +
								((((*p_framebuffer_ptr)[- *p_framebuffer_width_ptr] -
								(*p_framebuffer_ptr)[-1 - *p_framebuffer_width_ptr]) >> 1) &
								n_division_mask);
						}
					}
				} else { // 1, 2, 3
					if(n_predictor >= 2) { // 2, 3
						if(n_predictor == 2) // 2 (pred_B)
							n_prediction = (*p_framebuffer_ptr)[- *p_framebuffer_width_ptr];
						else // 3 (pred_C)
							n_prediction = (*p_framebuffer_ptr)[-1 - *p_framebuffer_width_ptr];
					} else if(n_predictor) // 1 (pred_A)
						n_prediction = (*p_framebuffer_ptr)[-1];
					else // 0 (no prediction)
						n_prediction = 1 << (8/*m_t_image_header.n_sample_precision*/ - 1); // 2 ^ (P - 1);
				}
				// calc prediction (decision tree should be outside the loop, but i'm trying
				// to make the code as small as possible while keeping it fast. worst case here
				// is 2 cmp, 1 jg and 2 je / jge)
				// note any predictor value above 7 falls to the same branch as 7

#ifdef __JPEG_DEC_USE_LOSSLESS_NONSTD_RLE // t_odo - rewrite a bit so it can handle multi-component RLE images (there must be different predictor selection than now, because every component could use different predictor in the same sample - due to RLE, sample dest position isn't the same for the components)
				uint8_t n_final_pixel_value = n_prediction - ((b_sample_shift_right)?
					n_pixel_value >> n_sample_bit_shift : n_pixel_value << n_sample_bit_shift);

				int n_x = (*p_framebuffer_ptr - *p_framebuffer_begin_ptr) %
					*p_framebuffer_width_ptr;
				// horizontal position of the write pointer in the current scanline
				if(n_x + n_run_length >= m_t_image_header.n_width) {
					int n_scanline_num = (n_x + n_run_length) / m_t_image_header.n_width;
					int n_scanline_padding = *p_framebuffer_width_ptr - m_t_image_header.n_width;
					//j += (i + n_run_length) / m_t_image_header.n_width - 1; // can jump over multiple scanlines
					//i += n_run_length - 1;
					// would terminate decoding too soon ... some component layers could end up unfinished
					while(1) {
						for(; n_x < m_t_image_header.n_width && n_run_length > 0; -- n_run_length, ++ n_x)
							*(*p_framebuffer_ptr) ++ = n_final_pixel_value;
						if(n_run_length) {
							n_scanline_num --;
							(*p_framebuffer_ptr) += n_scanline_padding;
							// there is something more - we have to skip the padding at the end of the scanline
						} else {
							if(n_scanline_num)
								(*p_framebuffer_ptr) += n_scanline_padding;
							break; // otherwise that was the last part and we can quit
						}
						_ASSERTE(n_x == m_t_image_header.n_width);
						n_x = 0;
					}
				} else {
					//i += n_run_length - 1;
					// would terminate decoding too soon ... some component layers could end up unfinished
					for(; n_run_length > 0; -- n_run_length)
						*(*p_framebuffer_ptr) ++ = n_final_pixel_value;
				}

				p_framebuffer_ptr ++; // next component layer
#else
				// write the sample, advance write position of this component
				// and move on to the next component, all in a single statement
				*(*p_framebuffer_ptr ++) ++ = n_prediction - ((b_sample_shift_right)?
					n_pixel_value >> n_sample_bit_shift : n_pixel_value << n_sample_bit_shift);
#endif
				// shift the value (todo - see if there originates any precision issues (because
				// of rounding values to 8bpp, whereby prediction may require precision up to
				// 12bpp so right / bottom pixel values could drift away a bit (but is the error
				// ever going to be larger than a single 8-bit step?); but as long as data source
				// is 8bpp (my encoder), there's no harm done))
			}
			// component loop
		}

#ifndef __JPEG_DEC_USE_LOSSLESS_NONSTD_RLE
		// it's a bit problematic with RLE compression for multiple component images ...
		// (in the RLE mode, scanline padding is skipped when writing the runs instead)

		{int n_component_index = 0;
		for(uint8_t **p_framebuffer_ptr = &p_framebuffer[0],
		   **p_end_framebuffer = &p_framebuffer[0] + p_scan_head->n_component_num;
		   p_framebuffer_ptr < p_end_framebuffer; ++ p_framebuffer_ptr, ++ n_component_index) {
			(*p_framebuffer_ptr) += p_framebuffer_width[n_component_index] -
				m_t_image_header.n_width;
		}}
		// skip past 8-pixel align padding to the beginning of next scanline
#endif
	}
	// lossless - t_odo - decide whether to support sub-sampled lossless images or not
	// (guess do it via #ifdef) - support for arbitrary sub-sampled lossless images is
	// going to come in next library version

	if(m_t_image_header.n_sample_precision < 8) {
		int16_t n_max_sample_value = ((1 << m_t_image_header.n_sample_precision) - 1) <<
			n_sample_bit_shift;
		// obtain maximal sample value (for example for 2-bit samples, max. value is 192, not 255
		// so white isn't exactly white, but rather gray ...)
		// NOTE(review): this would be zero (division by zero below) for sample precision 0;
		// presumably the precision is validated when reading the image header - confirm

		const uint16_t *p_framebuffer_width_ptr = &p_framebuffer_width[0];
		for(uint8_t **p_framebuffer_ptr = &p_framebuffer_begin[0],
		   **p_end_framebuffer = &p_framebuffer_begin[0] + p_scan_head->n_component_num;
		   p_framebuffer_ptr < p_end_framebuffer; ++ p_framebuffer_ptr,
		   ++ p_framebuffer_width_ptr) {
			// go through n_height scanlines of the component, including the padding
			for(uint8_t *p_sample = *p_framebuffer_ptr, *p_end = (*p_framebuffer_ptr) +
			   *p_framebuffer_width_ptr * m_t_image_header.n_height; p_sample < p_end; ++ p_sample)
				*p_sample = (uint8_t)(((int)*p_sample * 255) / n_max_sample_value);
		}
		// re-scale samples to match white
	}

	return true;
}

#endif // __JPEG_DEC_SUPPORT_LOSSLESS

/*
 *	bool CTinyJpegDecoder::Decode_Baseline_MacroBlock(int n_macro_block_index,
 *		const TScanHeader *p_scan_head, CBitReader &r_bit_reader, CBufferredFile *p_file_buffer)
 *		- decode macro-block, compressed in baseline jpeg mode
 *		- for every scan component, n_sampling_horiz * n_sampling_vert blocks of 8x8
 *		  samples are decoded; each block consists of huffman-coded DC value (stored
 *		  as difference, accumulated in m_t_image_data.p_dc_value) and of up to 63
 *		  run-length / huffman coded AC values; the coefficients are de-quantized,
 *		  de-zig-zagged, transformed by inverse DCT and written to the framebuffer
 *		  of the component, clamped to range [-128, 127]
 *		- n_macro_block_index is index of macro-block so it's possible to calculate
 *		  destination position in pixels
 *		- p_scan_head is scan header, it contains some data, necessary for decoding process
 *		- r_bit_reader is class, reading data from file bit by bit (it has to be supplied
 *		  from outside as macro-blocks are not byte aligned, ie. bits from last byte, read
 *		  by one call to this function will be required for proper decoding in successive call)
 *		- p_file_buffer is buffered file reader class
 *		- return true on success, false on failure (bit reader error, no huffman code
 *		  found within 16 bits, run of coefficients exceeding the block or invalid
 *		  AC symbol; all but the last check are compiled out if
 *		  __JPEG_DEC_STRIP_ERROR_CHECKS is defined)
 */
bool CTinyJpegDecoder::Decode_Baseline_MacroBlock(int n_macro_block_index,
	const TScanHeader *p_scan_head, CBitReader &r_bit_reader, CBufferredFile *p_file_buffer)
{
#ifdef __JPEG_DEC_DECODE_VERBOSE
	printf("\n\t\t\t=== decode macro-block %d ===\n", n_macro_block_index + 1);
	// debug
#endif

	for(const TScanHeader::TComponentInfo2 *p_comp = p_scan_head->p_component_info,
	   *p_end = p_scan_head->p_component_info + p_scan_head->n_component_num;
	   p_comp < p_end; ++ p_comp) {
		const TQuantTable *p_quant_table = &m_p_quant_table[p_comp->p_component->n_quant_table_index];
		const THuffmanTable *p_DC_table = p_comp->p_DC_table;
		const THuffmanTable *p_AC_table = p_comp->p_AC_table;

		for(int n_block = 0, n_block_num = p_comp->p_component->n_sampling_horiz *
		   p_comp->p_component->n_sampling_vert; n_block < n_block_num; ++ n_block) {
			// sampling tells us how many blocks are contained within a single scan

#ifdef __JPEG_DEC_INTEGER_IDCT
#ifdef __JPEG_DEC_USE_MMX
			__declspec(align(64)) int16_t p_coeffs[64] = {0};
#else // __JPEG_DEC_USE_MMX
			int16_t p_coeffs[64] = {0};
#endif // __JPEG_DEC_USE_MMX
#else
			float p_coeffs[64] = {0};
#endif
			// decoded coefficients, stored de-zig-zagged; the array is zeroed here
			// so runs of zeroes do not need to be written later

			{uint16_t n_number = 0;
			// i is the code length minus one; the loop has no condition, it is
			// left either by break (code found) or by one of the returns below
			for(int i = 0; /*i < 16*/; ++ i) {
				n_number <<= 1;
				n_number |= r_bit_reader.n_GetBit(p_file_buffer);
#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
				if(r_bit_reader.b_Error())
					return false;
#endif
				// get another bit to our number

				if(//n_number >= p_DC_table->p_min_code_value[i] &&
				   n_number < p_DC_table->p_min_code_value[i] + p_DC_table->p_code_num[i]) {
					_ASSERTE(n_number >= p_DC_table->p_min_code_value[i]);
					// we have code in table

#ifdef __JPEG_DEC_DECODE_VERBOSE
					printf("DC %d, ", n_number);
					// debug
#endif

					uint8_t n_huff_data = p_DC_table->p_code[i][n_number -
						p_DC_table->p_min_code_value[i]];
					// read data from huffman table; for DC it's just the length
					// of the encoded value, in bits

					if(n_huff_data) {
						m_t_image_data.p_dc_value[p_comp->n_component_id - 1] +=
							r_bit_reader.n_GetNumber(n_huff_data, p_file_buffer);
#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
						if(r_bit_reader.b_Error())
							return false;
#endif
					}
					p_coeffs[0] = m_t_image_data.p_dc_value[p_comp->n_component_id - 1];
					// DC is stored as difference, accumulated per component

					break;
				}
#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
				if(i == 15)
					return false;
				// no code matched within 16 bits
#endif
			}}
			// decode DC value

			/*if(p_quant_table->b_16_bit)
				p_coeffs[0] = (float)((int)p_coeffs[0] * (p_quant_table->p_value[0] >> 8));
			else*/ // hack! - i pre-multiplied it and can use faster IDCT function, so this is no longer necessary
#ifdef __JPEG_DEC_INTEGER_IDCT
				p_coeffs[0] = (p_coeffs[0] * p_quant_table->p_value[0] + 0x8000) >> 16;
#else
				p_coeffs[0] *= p_quant_table->p_value[0];
#endif
			// de-quantize DC

			// p_index walks the zig-zag index table while n counts coefficients in
			// the stream order (used to index the quantization table); both are
			// always advanced by the same amount
			{int *p_index = p_zig_indices + 1;
			for(int n = 1; n < 64;) {
				uint16_t n_number = 0;
				for(int i = 0; /*i < 16*/; ++ i) {
					n_number <<= 1;
					n_number |= r_bit_reader.n_GetBit(p_file_buffer);
#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
					if(r_bit_reader.b_Error())
						return false;
#endif
					// get another bit to our number

					if(//n_number >= p_AC_table->p_min_code_value[i] &&
					   n_number < p_AC_table->p_min_code_value[i] + p_AC_table->p_code_num[i]) {
						_ASSERTE(n_number >= p_AC_table->p_min_code_value[i]);
						// we have code in table
#ifdef __JPEG_DEC_DECODE_VERBOSE
						printf("AC %d, ", n_number);
						// debug
#endif
						uint8_t n_huff_data = p_AC_table->p_code[i][n_number -
							p_AC_table->p_min_code_value[i]];
						// read data from huffman table, high nibble is number of preceding
						// zeroes, low nibble is length of actual encoded value, in bits

						uint8_t n_zero_num = n_huff_data >> 4;
						uint8_t n_number_bits = n_huff_data & 0xf;

						if(!n_number_bits) {
							if(!n_zero_num) {
								/*for(char *p_end = p_zig_indices + 64; p_index < p_end;)
									p_coeffs[*p_index ++] = 0;*/
								// found eob, the rest of coeffs stays zero
								p_index = p_zig_indices + 64;
#ifdef __JPEG_DEC_DECODE_VERBOSE
								printf("(EOB) ");
								// debug
#endif
								n = 64; // disrupt the outer loop
								break;
							} else if(n_zero_num == 15) {
								n_zero_num ++; // the encoded zero itself
#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
								if((p_index + n_zero_num) - p_zig_indices > 63)
									return false;
#endif
								/*for(char *p_end = p_index + n_zero_num; p_index < p_end;)
									p_coeffs[*p_index ++] = 0;*/
								p_index += n_zero_num;
								n += n_zero_num;
								// skip run of 16 zeroes

#ifdef __JPEG_DEC_DECODE_VERBOSE
								printf("(16x0) ");
								// debug
#endif
							} else
								return false;
							// zero-length value with run other than 0 or 15 is not a valid symbol
						} else {
#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
							if((p_index + n_zero_num + 1) - p_zig_indices > 64) // n_zero_num + actual coeff
								return false;
#endif
							/*for(char *p_end = p_index + n_zero_num; p_index < p_end;)
								p_coeffs[*p_index ++] = 0;*/
							p_index += n_zero_num;
							n += n_zero_num;
							// skip preceding zeroes

							/*if(p_quant_table->b_16_bit) {
								p_coeffs[*p_index ++] = (float)((int)r_bit_reader.n_GetNumber(
									n_number_bits, p_file_buffer) * (p_quant_table->p_value[n] >> 8));
							} else {*/ // hack! - i pre-multiplied it and can use faster IDCT function, so this is no longer necessary
#ifdef __JPEG_DEC_INTEGER_IDCT
								p_coeffs[*p_index ++] = (r_bit_reader.n_GetNumber(n_number_bits,
									p_file_buffer) * p_quant_table->p_value[n] + 0x8000) >> 16;
#else
								p_coeffs[*p_index ++] = (float)r_bit_reader.n_GetNumber(n_number_bits,
									p_file_buffer) * p_quant_table->p_value[n];
#endif
							//}
							// de-quantize

#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
							if(r_bit_reader.b_Error())
								return false;
#endif
							n ++;
							// read vli
						}

						break;
					}
#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
					if(i == 15)
						return false;
					// no code matched within 16 bits
#endif
				}
			}
			_ASSERTE(p_index == p_zig_indices + 64); // p_index and n advance in lockstep, both must end at 64
			}
			// decode 63 AC values, dequantize as well (saving lots of muls because i'm not
			// de-quantizing zeros)
			// t_odo - is it smaller code with array initialized to zeros or with those zero-ing
			// loops? code is both smaller and faster
			// todo - is code smaller when it's zig-zagged on the fly or when it's copied into
			// another array in a loop afterwards?

#ifdef __JPEG_DEC_DECODE_VERBOSE
			printf("\ndecoded block:\n");
			{
				const char p_fwd_zig_indices[64] = {
					 0,  1,  5,  6, 14, 15, 27, 28,
					 2,  4,  7, 13, 16, 26, 29, 42,
					 3,  8, 12, 17, 25, 30, 41, 43,
					 9, 11, 18, 24, 31, 40, 44, 53,
					10, 19, 23, 32, 39, 45, 52, 54,
					20, 22, 33, 38, 46, 51, 55, 60,
					21, 34, 37, 47, 50, 56, 59, 61,
					35, 36, 48, 49, 57, 58, 62, 63
				};
				for(int i = 0; i < 64; ++ i) {
					if(!(i % 8))
						printf("\t");
					printf("%5d%c", (int)(p_coeffs[i] / p_quant_table->p_value[p_fwd_zig_indices[i]]), ((i % 8) == 7)? '\n' : ' ');
				}
			}
			// debug
#endif

			int n_block_x = n_block % p_comp->p_component->n_sampling_horiz;
			int n_block_y = n_block / p_comp->p_component->n_sampling_horiz;
			int n_macro_block_x = n_macro_block_index % m_t_image_header.n_macro_block_x_num;
			int n_macro_block_y = n_macro_block_index / m_t_image_header.n_macro_block_x_num;
			n_block_x += n_macro_block_x * p_comp->p_component->n_sampling_horiz;
			n_block_y += n_macro_block_y * p_comp->p_component->n_sampling_vert;
			n_block_x *= 8;
			n_block_y *= 8;
			// calc position of block in image (in samples of this component)

			int n_raster_width = m_t_image_data.p_framebuffer_width[p_comp->n_component_id - 1];
			int n_raster_height = m_t_image_data.p_framebuffer_height[p_comp->n_component_id - 1];
			// raster size

			int n_copy_horiz = (n_raster_width - n_block_x > 8)? 8 : n_raster_width - n_block_x;
			int n_copy_vert = (n_raster_height - n_block_y > 8)? 8 : n_raster_height - n_block_y;
			// how much do we want? (must be int: the difference is negative for blocks
			// lying beyond the raster and plain char has implementation-defined signedness,
			// where char is unsigned the negative value would pass the test below and
			// the block would be written out of the framebuffer bounds)

#ifdef __JPEG_DEC_DECODE_VERBOSE
			printf("block destination [n_channel=%d, n_raster_x=%d, n_raster_y=%d, n_copy_w=%d, n_copy_h=%d]\n",
				p_comp->n_component_id, n_block_x, n_block_y, n_copy_horiz, n_copy_vert);
			// debug
#endif

			if(n_copy_horiz > 0 && n_copy_vert > 0) {
				_ASSERTE(n_copy_horiz == 8 && n_copy_vert == 8);
				// Read_ImageHeader() forced buffer size to multiples of eight to speedup here

				// there are cases when part of macroblock lies beyond the image border,
				// if this component is sampled more than 1x1, there's chance this block
				// will not be necessary to compute

#ifdef __JPEG_DEC_INTEGER_IDCT
#ifdef __JPEG_DEC_USE_MMX
				__declspec(align(64)) int16_t p_dest_coeffs[64];
#else // __JPEG_DEC_USE_MMX
				int16_t p_dest_coeffs[64];
#endif // __JPEG_DEC_USE_MMX
#else
				float p_dest_coeffs[64];
#endif

#ifdef __JPEG_DEC_USE_MMX
				CFastDCT8_2D::Inverse_MMX(p_dest_coeffs, p_coeffs);
#else
				CFastDCT8_2D::Inverse(p_dest_coeffs, p_coeffs);
#endif

#ifdef __JPEG_DEC_DECODE_VERBOSE
				printf("block IDCT:\n");
				for(int i = 0; i < 64; ++ i) {
					if(!(i % 8))
						printf("\t");
					printf("%5d%c", (int)p_dest_coeffs[i], ((i % 8) == 7)? '\n' : ' ');
				}
				// debug
				printf("\n");
#endif

				uint8_t *p_dest_ptr = m_t_image_data.p_framebuffer[
					p_comp->n_component_id - 1] + n_block_x + n_block_y * n_raster_width;
				// get address of dest framebuffer

#ifdef __JPEG_DEC_INTEGER_IDCT
				const int16_t *p_idct_ptr = p_dest_coeffs;
#else
				const float *p_idct_ptr = p_dest_coeffs;
#endif

				/*char p_final_coeffs[64];
				for(char *p_final_coeffs_ptr = p_final_coeffs, *p_final_coeffs_end = p_final_coeffs + 64; p_final_coeffs_ptr < p_final_coeffs_end;) {
					*p_final_coeffs_ptr ++ = (*p_idct_ptr < -128)? -128 :
						((*p_idct_ptr <= 127)? (unsigned char)*p_idct_ptr : 127);
					p_idct_ptr ++;
				}
				const char *p_idct_ptr2 = p_final_coeffs;
				for(unsigned char *p_dest_end = p_dest_ptr + n_raster_width * n_copy_vert;
				   p_dest_ptr < p_dest_end; p_idct_ptr2 += 8, p_dest_ptr += n_raster_width)
					memcpy(p_dest_ptr, p_idct_ptr2, n_copy_horiz * sizeof(char));*/
				// a little bit faster but ugly

				/*for(unsigned char *p_dest_end = p_dest_ptr + n_raster_width * n_copy_vert;
				   p_dest_ptr < p_dest_end; p_idct_ptr += 8 - n_copy_horiz,
				   p_dest_ptr += n_raster_width - n_copy_horiz) {
					for(unsigned char *p_dest_end2 = p_dest_ptr + n_copy_horiz; p_dest_ptr < p_dest_end2;) {
						*p_dest_ptr ++ = (*p_idct_ptr < -128)? -128 :
							((*p_idct_ptr <= 127)? (unsigned char)*p_idct_ptr : 127);
						p_idct_ptr ++;
					}
					// clamp
				}*/
				// older version with arbitrary n_copy_horiz and n_copy_vert

				for(uint8_t *p_dest_end = p_dest_ptr + n_raster_width * 8;
				   p_dest_ptr < p_dest_end; p_dest_ptr += n_raster_width) {
					for(uint8_t *p_dest_ptr2 = p_dest_ptr, *p_dest_end2 = p_dest_ptr + 8;
					   p_dest_ptr2 < p_dest_end2;) {
						*p_dest_ptr2 ++ = (*p_idct_ptr < -128)? -128 :
							((*p_idct_ptr <= 127)? (uint8_t)(int)*p_idct_ptr : 127);
						p_idct_ptr ++;
					}
					// clamp to [-128, 127]; the value is converted to int first, because
					// direct conversion of negative float to unsigned type is undefined
					// (negative samples are stored in two's complement)
				}
				// write required part of idct into image
				// a tiny bottleneck here, time spent here during decoding 2304x1478 pic is ~ 0.2 sec
			}
			// IDCT
		}
	}

	return true;
}

/*
 *	bool CTinyJpegDecoder::Decode_Baseline_MacroBlock_DCOnly(int n_macro_block_index,
 *		const TScanHeader *p_scan_head, CBitReader &r_bit_reader, CBufferredFile *p_file_buffer)
 *		- decode macro-block, compressed in baseline jpeg mode, uses DC value only (thumbnail rendering)
 *		- n_macro_block_index is index of macro-block so it's possible to calculate
 *		  destination position in pixels
 *		- p_scan_head is scan header, it contains some data, necessary for decoding process
 *		- r_bit_reader is class, reading data from file bit by bit (it has to be supplied
 *		  from outside as macro-blocks are not byte aligned, ie. bits from last byte, read
 *		  by one call to this function will be required for proper decoding in successive call)
 *		- p_file_buffer is buffered file reader class
 *		- return true on success, false on failure
 */
bool CTinyJpegDecoder::Decode_Baseline_MacroBlock_DCOnly(int n_macro_block_index,
	const TScanHeader *p_scan_head, CBitReader &r_bit_reader, CBufferredFile *p_file_buffer)
{
#ifdef __JPEG_DEC_DECODE_VERBOSE
	printf("\n\t\t\t=== decode macro-block %d ===\n", n_macro_block_index + 1);
	// debug
#endif

	for(const TScanHeader::TComponentInfo2 *p_comp = p_scan_head->p_component_info,
	   *p_end = p_scan_head->p_component_info + p_scan_head->n_component_num;
	   p_comp < p_end; ++ p_comp) {
		const TQuantTable *p_quant_table = &m_p_quant_table[p_comp->p_component->n_quant_table_index];
		const THuffmanTable *p_DC_table = p_comp->p_DC_table;
		const THuffmanTable *p_AC_table = p_comp->p_AC_table;

		for(int n_block = 0, n_block_num = p_comp->p_component->n_sampling_horiz *
		   p_comp->p_component->n_sampling_vert; n_block < n_block_num; ++ n_block) {
			// sampling tells us how much blocks is contained within a single scan

#ifdef __JPEG_DEC_INTEGER_IDCT
#ifdef __JPEG_DEC_USE_MMX
			__declspec(align(64)) int16_t p_coeffs[64] = {0};
#else // __JPEG_DEC_USE_MMX
			int16_t p_coeffs[64] = {0};
#endif // __JPEG_DEC_USE_MMX
#else
			float p_coeffs[64] = {0};
#endif
			// decoded coefficients; has to be de-zig-zaged

			{uint16_t n_number = 0;
			for(int i = 0; /*i < 16*/; ++ i) {
				n_number <<= 1;
				n_number |= r_bit_reader.n_GetBit(p_file_buffer);
#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
				if(r_bit_reader.b_Error())
					return false;
#endif
				// get another bit to our number

				if(//n_number >= p_DC_table->p_min_code_value[i] &&
				   n_number < p_DC_table->p_min_code_value[i] + p_DC_table->p_code_num[i]) {
					_ASSERTE(n_number >= p_DC_table->p_min_code_value[i]);
					// we have code in table

#ifdef __JPEG_DEC_DECODE_VERBOSE
					printf("DC %d, ", n_number);
					// debug
#endif

					uint8_t n_huff_data = p_DC_table->p_code[i][n_number -
						p_DC_table->p_min_code_value[i]];
					// read data from huffman table, one nibble is number of preceding zeroes,
					// the other one is length of actual encoded value, in bits
					// for DC it's actualy just number of bits

					if(n_huff_data) {
						m_t_image_data.p_dc_value[p_comp->n_component_id - 1] +=
							r_bit_reader.n_GetNumber(n_huff_data, p_file_buffer);
#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
						if(r_bit_reader.b_Error())
							return false;
#endif
					}
					p_coeffs[0] = m_t_image_data.p_dc_value[p_comp->n_component_id - 1];

					break;
				}
#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
				if(i == 15)
					return false;
#endif
			}}
			// decode DC value

			/*if(p_quant_table->b_16_bit)
				p_coeffs[0] = (float)((int)p_coeffs[0] * (p_quant_table->p_value[0] >> 8));
			else*/ // hack! - i pre-multiplied it and can use faster IDCT function, so this is no longer necessary
#ifdef __JPEG_DEC_INTEGER_IDCT
				p_coeffs[0] = (p_coeffs[0] * p_quant_table->p_value[0] + 0x8000) >> 16;
#else
				p_coeffs[0] *= p_quant_table->p_value[0];
#endif
			// de-quantize DC

			{int *p_index = p_zig_indices + 1;
			for(int n = 1; n < 64;) {
				uint16_t n_number = 0;
				for(int i = 0; /*i < 16*/; ++ i) {
					n_number <<= 1;
					n_number |= r_bit_reader.n_GetBit(p_file_buffer);
#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
					if(r_bit_reader.b_Error())
						return false;
#endif
					// get another bit to our number

					if(//n_number >= p_AC_table->p_min_code_value[i] &&
					   n_number < p_AC_table->p_min_code_value[i] + p_AC_table->p_code_num[i]) {
						_ASSERTE(n_number >= p_AC_table->p_min_code_value[i]);
						// we have code in table
#ifdef __JPEG_DEC_DECODE_VERBOSE
						printf("AC %d, ", n_number);
						// debug
#endif
						uint8_t n_huff_data = p_AC_table->p_code[i][n_number -
							p_AC_table->p_min_code_value[i]];
						// read data from huffman table, one nibble is number of preceding zeroes,
						// the other one is length of actual encoded value, in bits
						// for DC it's actualy just number of bits

						uint8_t n_zero_num = n_huff_data >> 4;
						uint8_t n_number_bits = n_huff_data & 0xf;

						if(!n_number_bits) {
							if(!n_zero_num) {
								/*for(char *p_end = p_zig_indices + 64; p_index < p_end;)
									p_coeffs[*p_index ++] = 0;*/
								// found eob, fill the rest of coeffs with zeroes
								p_index = p_zig_indices + 64;
#ifdef __JPEG_DEC_DECODE_VERBOSE
								printf("(EOB) ");
								// debug
#endif
								n = 64; // disrupt the outer loop
								break;
							} else if(n_zero_num == 15) {
								n_zero_num ++; // the encoded zero itself
#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
								if((p_index + n_zero_num) - p_zig_indices > 63)
									return false;
#endif
								/*for(char *p_end = p_index + n_zero_num; p_index < p_end;)
									p_coeffs[*p_index ++] = 0;*/
								p_index += n_zero_num;
								n += n_zero_num;

#ifdef __JPEG_DEC_DECODE_VERBOSE
								printf("(16x0) ");
								// debug
#endif
							} else
								return false;
						} else {
#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
							if((p_index + n_zero_num + 1) - p_zig_indices > 64) // n_zero_num + actual coeff
								return false;
#endif
							/*for(char *p_end = p_index + n_zero_num; p_index < p_end;)
								p_coeffs[*p_index ++] = 0;*/
							p_index += n_zero_num;
							n += n_zero_num;
							// fill preceding zeroes

							/*if(p_quant_table->b_16_bit) {
								p_coeffs[*p_index ++] = (float)((int)r_bit_reader.n_GetNumber(
									n_number_bits, p_file_buffer) * (p_quant_table->p_value[n] >> 8));
							} else {*/ // hack! - i pre-multiplied it and can use faster IDCT function, so this is no longer necessary
#ifdef __JPEG_DEC_INTEGER_IDCT
								p_coeffs[*p_index ++] = (r_bit_reader.n_GetNumber(n_number_bits,
									p_file_buffer) * p_quant_table->p_value[n] + 0x8000) >> 16;
#else
								p_coeffs[*p_index ++] = (float)r_bit_reader.n_GetNumber(n_number_bits,
									p_file_buffer) * p_quant_table->p_value[n];
#endif
							//}
							// de-quantize

#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
							if(r_bit_reader.b_Error())
								return false;
#endif
							n ++;
							// read vli
						}

						break;
					}
#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
					if(i == 15)
						return false;
#endif
				}
			}
			_ASSERTE(p_index == p_zig_indices + 64); // should never happen
			}
			// decode 63 AC values, dequantize as well (saving lots of muls because i'm not
			// de-quantizing zeros)
			// t_odo - is it smaller code with array initialized to zeros or with those zero-ing
			// loops? code is both smaller and faster
			// todo - is code smaller when it's zig-zagged on the fly or when it's copied into
			// another array in a loop afterwards?

#ifdef __JPEG_DEC_DECODE_VERBOSE
			printf("\ndecoded block:\n");
			{
				const char p_fwd_zig_indices[64] = {
					 0,  1,  5,  6, 14, 15, 27, 28,
					 2,  4,  7, 13, 16, 26, 29, 42,
					 3,  8, 12, 17, 25, 30, 41, 43,
					 9, 11, 18, 24, 31, 40, 44, 53,
					10, 19, 23, 32, 39, 45, 52, 54,
					20, 22, 33, 38, 46, 51, 55, 60,
					21, 34, 37, 47, 50, 56, 59, 61,
					35, 36, 48, 49, 57, 58, 62, 63
				};
				for(int i = 0; i < 64; ++ i) {
					if(!(i % 8))
						printf("\t");
					printf("%5d%c", (int)(p_coeffs[i] / p_quant_table->p_value[p_fwd_zig_indices[i]]), ((i % 8) == 7)? '\n' : ' ');
				}
			}
			// debug
#endif

			int n_block_x = n_block % p_comp->p_component->n_sampling_horiz;
			int n_block_y = n_block / p_comp->p_component->n_sampling_horiz;
			int n_macro_block_x = n_macro_block_index % m_t_image_header.n_macro_block_x_num;
			int n_macro_block_y = n_macro_block_index / m_t_image_header.n_macro_block_x_num;
			n_block_x += n_macro_block_x * p_comp->p_component->n_sampling_horiz;
			n_block_y += n_macro_block_y * p_comp->p_component->n_sampling_vert;
			// calc position of block in image

			int n_raster_width = m_t_image_data.p_framebuffer_width[p_comp->n_component_id - 1];
			int n_raster_height = m_t_image_data.p_framebuffer_height[p_comp->n_component_id - 1];
			// raster size

			char n_copy_horiz = (n_raster_width - n_block_x > 1)? 1 : n_raster_width - n_block_x;
			char n_copy_vert = (n_raster_height - n_block_y > 1)? 1 : n_raster_height - n_block_y;
			// how much do we want?

#ifdef __JPEG_DEC_DECODE_VERBOSE
			printf("block destination [n_channel=%d, n_raster_x=%d, n_raster_y=%d, n_copy_w=%d, n_copy_h=%d]\n",
				p_comp->n_component_id, n_block_x, n_block_y, (int)n_copy_horiz, (int)n_copy_vert);
			// debug
#endif

			if(n_copy_horiz > 0 && n_copy_vert > 0) {
				_ASSERTE(n_copy_horiz == 1 && n_copy_vert == 1);
				// Read_ImageHeader() forced buffer size o multiples of eight to speedup here

				// there are cases when part of macroblock lies beyond the image border,
				// if this component is sampled more than 1x1, there's chance this block
				// will not be necessary to compute

#ifdef __JPEG_DEC_INTEGER_IDCT
				int16_t f_dest_coeff = p_coeffs[0] >> 3;
#else
				float f_dest_coeff = p_coeffs[0] * .125f;
#endif

				uint8_t *p_dest_ptr = m_t_image_data.p_framebuffer[
					p_comp->n_component_id - 1] + n_block_x + n_block_y * n_raster_width;
				// get address of dest framebuffer

				*p_dest_ptr = (f_dest_coeff < -128)? -128 :
					((f_dest_coeff <= 127)? (uint8_t)f_dest_coeff : 127); // clamp
				// write DC part into image as a single pixel
			}
		}
	}

	return true;
}

/*
 *								=== ~CTinyJpegDecoder ===
 */

/*
 *								=== CTinyJpegDecoder::CBufferredFile ===
 */

/*
 *	const size_t CTinyJpegDecoder::CBufferredFile::m_n_buffer_size = 16384;
 *		- input file buffer size, in bytes (it is the maximum amount requested
 *		  from fread() each time the buffer is refilled)
 *		- experimentally determined to provide the best reading performance
 *		  (although my HDD configuration is not very common, jpeg files tend to be very small
 *		  anyway so this probably won't affect decoding time much; note HDD cluster size
 *		  has nothing to do with this, for example my clusters are 64kB)
 */
const size_t CTinyJpegDecoder::CBufferredFile::m_n_buffer_size = 16384;

/*
 *	CTinyJpegDecoder::CBufferredFile::CBufferredFile(FILE *p_fr)
 *		- constructor for reading from a file
 *		- p_fr is input file, must not be NULL
 *		- the buffer starts out empty (m_n_buffer_used is 0) and nothing is read
 *		  here; the first call to Read_Byte() fetches data from p_fr
 */
CTinyJpegDecoder::CBufferredFile::CBufferredFile(FILE *p_fr)
	:m_p_buffer(_m_p_buffer_), m_n_buffer_used(0), m_p_buffer_ptr(_m_p_buffer_), m_p_fr(p_fr)
{
}

/*
 *	CTinyJpegDecoder::CBufferredFile::CBufferredFile(const unsigned char *p_data, size_t n_size)
 *		- constructor for reading from memory
 *		- p_data is input buffer of size n_size (in bytes), must not be NULL
 *		- the data is not copied; p_data is read in place, so it has to stay valid
 *		  for as long as this object is being read from
 *		- no file is attached (m_p_fr is null); with error checks enabled, running
 *		  out of data is reported as a read failure, no refill is attempted
 *		- note constness of p_data is cast away to store it in m_p_buffer, and if
 *		  __JPEG_DEC_STRIP_ERROR_CHECKS is defined, Read_Byte() does not test m_p_fr
 *		  before calling fread() on that buffer once the data is exhausted
 */
CTinyJpegDecoder::CBufferredFile::CBufferredFile(const unsigned char *p_data, size_t n_size)
	:m_p_buffer((unsigned char*)p_data), m_n_buffer_used(n_size), m_p_buffer_ptr(p_data), m_p_fr(0)
{}

/*
 *	bool CTinyJpegDecoder::CBufferredFile::Read_Byte(uint8_t &r_n_byte)
 *		- read a single byte, store it in r_n_byte
 *		- if all buffered data was consumed, the buffer is refilled from the file
 *		  first (there is no file to refill from when reading from memory)
 *		- if byte was read, returns true. otherwise (EOF / IO error) false and
 *		  r_n_byte is left unchanged
 *		- after a failure the buffer is left empty, so calling this function again
 *		  fails again instead of returning stale data
 *		- note if __JPEG_DEC_STRIP_ERROR_CHECKS is defined, no failure is detected
 *		  and the function always returns true
 */
bool CTinyJpegDecoder::CBufferredFile::Read_Byte(uint8_t &r_n_byte)
{
	if(m_p_buffer_ptr == m_p_buffer + m_n_buffer_used) {
#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
		if(!m_p_fr)
			return false;
		// reading from memory (or no file), there is nothing to refill from
#endif
		m_n_buffer_used = fread(m_p_buffer, sizeof(char), m_n_buffer_size, m_p_fr);
		m_p_buffer_ptr = m_p_buffer;
		// refill the buffer and rewind the read pointer; the pointer needs to be
		// rewound even if nothing was read, otherwise it would keep pointing at
		// the end of the previous data while m_n_buffer_used is 0 and the next
		// call would skip the refill and read past the valid data

#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
		if(!m_n_buffer_used)
			return false;
		// EOF or IO error
#endif

#ifdef __JPEG_DEC_READ_VERBOSE
		printf("have to fetch more data from file while reading byte\n");
		// debug
#endif
	}
	r_n_byte = *m_p_buffer_ptr ++;
	return true;
}

/*
 *	bool CTinyJpegDecoder::CBufferredFile::Read_Short(int16_t &r_n_short)
 *		- read two bytes, store them in r_n_short; the first byte read becomes
 *		  the high byte, the second one the low byte (big-endian order in the file)
 *		- the value is assembled arithmetically, so the result does not depend
 *		  on byte order of the machine the decoder runs on
 *		- if reading was successful, returns true. otherwise (EOF / IO error) false
 *		  and r_n_short is left unchanged
 */
bool CTinyJpegDecoder::CBufferredFile::Read_Short(int16_t &r_n_short)
{
	uint8_t n_hi_byte, n_lo_byte;

#ifdef __JPEG_DEC_STRIP_ERROR_CHECKS
	Read_Byte(n_hi_byte);
	Read_Byte(n_lo_byte);
#else
	if(!Read_Byte(n_hi_byte) || !Read_Byte(n_lo_byte))
		return false;
#endif
	// read high byte first, then low byte

	r_n_short = (int16_t)(((uint16_t)n_hi_byte << 8) | n_lo_byte);
	// combine the bytes using shift rather than by aliasing them with a short
	// in a union (that only gives the right value on little-endian machines)

	return true;
}

/*
 *	bool CTinyJpegDecoder::CBufferredFile::Read_Marker(uint16_t &r_n_marker_code)
 *		- scan forward until a two-byte marker word is found, ie. 0xff followed
 *		  by a byte which is neither 0x00 nor 0xff; the whole word (0xffXX) is
 *		  stored in r_n_marker_code
 *		- the input is examined two bytes at a time; if the second byte of a rejected
 *		  pair is 0xff, it is kept as the potential first byte of the next pair
 *		- if a marker was found, returns true. otherwise (EOF / IO error) false
 *		- NOTE(review): n_code is uint16_t while Read_Short() defined in this file
 *		  takes int16_t&; presumably an overload is declared elsewhere - to be confirmed
 */
bool CTinyJpegDecoder::CBufferredFile::Read_Marker(uint16_t &r_n_marker_code)
{
	uint16_t n_code;
#ifdef __JPEG_DEC_STRIP_ERROR_CHECKS
	Read_Short(n_code);
#else
	if(!Read_Short(n_code))
		return false;
#endif
	// get the first candidate word

	for(;;) {
		const unsigned int n_hi_part = n_code & 0xff00;
		const unsigned int n_lo_part = n_code & 0xff;
		// split the candidate (the high part is left in place, not shifted down)

		if(n_hi_part == 0xff00 && n_lo_part != 0xff && n_lo_part != 0x00) {
#ifdef __JPEG_DEC_READ_VERBOSE
			printf("found marker word 0x%04x\n", (unsigned int)n_code & 0xffff);
			// debug
#endif

			r_n_marker_code = n_code;
			return true;
		}
		// 0xff followed by anything but 0x00 or 0xff is a marker, we're done

#ifdef __JPEG_DEC_READ_VERBOSE
		printf("failed with marker word 0x%04x, trying again\n", (unsigned int)n_code & 0xffff);
		// debug
#endif

		if(n_lo_part == 0xff) {
			uint8_t n_next_byte;
#ifdef __JPEG_DEC_STRIP_ERROR_CHECKS
			Read_Byte(n_next_byte);
#else
			if(!Read_Byte(n_next_byte))
				return false;
#endif
			_ASSERTE((n_code & 0xff) == 0xff);
			n_code = (uint16_t)(0xff00 | n_next_byte);
			// the low byte may begin a marker; make it the high byte
			// and pair it with a single freshly read byte
		} else {
			_ASSERTE((n_code & 0xff) != 0xff);
#ifdef __JPEG_DEC_STRIP_ERROR_CHECKS
			Read_Short(n_code);
#else
			if(!Read_Short(n_code))
				return false;
#endif
			// neither byte is of any use, get a whole new word
		}
	}
}

/*
 *	bool CTinyJpegDecoder::CBufferredFile::Skip_Bytes(size_t n_byte_num)
 *		- skip n_byte_num bytes forward in the input (may fetch more data from file;
 *		  the skipped data is read to the buffer and discarded, there is no seeking)
 *		- if skipping was successful, returns true. otherwise (EOF / IO error, or
 *		  running out of data when reading from memory) false
 *		- after a failed refill the buffer is left empty, so that following reads
 *		  fail as well instead of using stale data
 */
bool CTinyJpegDecoder::CBufferredFile::Skip_Bytes(size_t n_byte_num)
{
	for(;;) {
		const size_t n_available = m_n_buffer_used - (m_p_buffer_ptr - m_p_buffer);
		// number of bytes that are buffered but not consumed yet

		if(n_available >= n_byte_num) {
			m_p_buffer_ptr += n_byte_num;
			return true;
		}
		// do we have enough bytes in buffer?

#ifdef __JPEG_DEC_READ_VERBOSE
		printf("have to fetch more data from file while skipping %lu bytes\n",
			(unsigned long)n_byte_num);
		// debug (size_t must not be passed to "%d", hence the cast)
#endif

		if(!m_p_fr)
			return false;
		// reading from memory, there is no more data to get

		n_byte_num -= n_available;
		// all the buffered bytes are skipped

		m_n_buffer_used = fread(m_p_buffer, sizeof(char), m_n_buffer_size, m_p_fr);
		m_p_buffer_ptr = m_p_buffer;
		// read buffer; the read pointer is rewound before checking for failure
		// so it never points behind the (now empty) buffer contents

		if(!m_n_buffer_used)
			return false;
		// EOF or IO error
	}
	// skip in buffered file
}

/*
 *								=== ~CTinyJpegDecoder::CBufferredFile ===
 */

/*
 *								=== CTinyJpegDecoder::CBitReader ===
 */

/*
 *	CTinyJpegDecoder::CBitReader::CBitReader()
 *		- default constructor
 *		- starts with no buffered bits (m_n_bit_num is 0), so the first call
 *		  to n_GetBit() fetches a byte from the file buffer
 *		- the "previous byte was 0xff" flag and the error flag are both cleared
 */
CTinyJpegDecoder::CBitReader::CBitReader()
	:m_n_cur_byte(0), m_b_0xff_byte(false), m_n_bit_num(0), m_b_error_flag(false)
{
}

/*
 *	uint8_t CTinyJpegDecoder::CBitReader::n_GetBit(CBufferredFile *p_file_buffer)
 *		- returns the next bit (0 or 1) of the bit-coded stream in p_file_buffer
 *		- bits are served MSB first from a byte held inside this object; once all
 *		  eight bits of it were returned, the next byte is fetched
 *		- a 0x00 byte immediately following a 0xff byte is dropped and the byte
 *		  after it is used instead (byte stuffing)
 *		- in case reading was unsuccessful, raises inner error flag (returned
 *		  by b_Error()) and returns 0
 */
uint8_t CTinyJpegDecoder::CBitReader::n_GetBit(CBufferredFile *p_file_buffer)
{
	if(m_n_bit_num)
		-- m_n_bit_num; // still have bits of the current byte
	else {
		uint8_t n_fetched;
#ifdef __JPEG_DEC_STRIP_ERROR_CHECKS
		p_file_buffer->Read_Byte(n_fetched);
#else
		if(!p_file_buffer->Read_Byte(n_fetched)) {
			m_b_error_flag = true;
			return 0;
		}
#endif
		// current byte is used up, get the next one

		const bool b_stuffed_zero = m_b_0xff_byte && n_fetched == 0x00;
		if(!b_stuffed_zero)
			m_n_cur_byte = n_fetched;
		else {
#ifdef __JPEG_DEC_STRIP_ERROR_CHECKS
			p_file_buffer->Read_Byte(m_n_cur_byte);
#else
			if(!p_file_buffer->Read_Byte(m_n_cur_byte)) {
				m_b_error_flag = true;
				return 0;
			}
#endif
		}
		// 0xff is a valid data byte, the 0x00 inserted after it only prevents
		// random markers from occurring in the bit-coded stream; skip it

		m_b_0xff_byte = (m_n_cur_byte == 0xff);
		// remember whether a stuffed zero may follow

		m_n_bit_num = 7; // eight bits fetched, one is returned right now
	}

	const uint8_t n_msb = m_n_cur_byte >> 7;
	m_n_cur_byte <<= 1;
	// take the MSB, move the next bit in its place

	return n_msb;
}

/*
 *	int16_t CTinyJpegDecoder::CBitReader::n_GetNumber(int n_bit_num, CBufferredFile *p_file_buffer)
 *		- reads n_bit_num bits from p_file_buffer (MSB first) and returns them
 *		  as a signed number
 *		- if the first bit is 1, the number is positive and equals the bits read,
 *		  if it is 0, the number is negative and equals the bits read
 *		  minus (2^n_bit_num - 1)
 *		- n_bit_num must be larger than zero (and due to return type should be less
 *		  or equal 16); unless __JPEG_DEC_STRIP_ERROR_CHECKS is defined, 0 is returned
 *		  for n_bit_num <= 0 and no bits are read
 *		- in case reading was unsuccessful, raise inner error flag (returned by b_Error())
 */
int16_t CTinyJpegDecoder::CBitReader::n_GetNumber(int n_bit_num, CBufferredFile *p_file_buffer)
{
	_ASSERTE(n_bit_num);

#ifndef __JPEG_DEC_STRIP_ERROR_CHECKS
	if(n_bit_num <= 0)
		return 0;
	// a number of no bits is zero; without this check a bit would be consumed
	// and the loop below would count down from zero until the counter wraps
	// (the bit count comes from the decoded stream, it can't be trusted)
#endif

	uint8_t n_first_bit;
	int16_t n_number = (n_first_bit = n_GetBit(p_file_buffer));
	// read the first bit, it decides the sign; either 1 = positive or 0 = negative

	for(int n = n_bit_num; -- n;) {
		n_number <<= 1;
		n_number |= n_GetBit(p_file_buffer);
	}
	// read the remaining n_bit_num - 1 bits

	if(!n_first_bit)
		n_number += (int16_t)((0xffff ^ ((1 << n_bit_num) - 1)) | 1);
	// negative number; the constant is -(2^n_bit_num - 1) in 16 bits

	return n_number;
}

/*
 *	void CTinyJpegDecoder::CBitReader::Restart()
 *		- restarts the bit reader; any bits remaining in the current byte are
 *		  discarded and the next request for bits (n_GetBit or n_GetNumber)
 *		  begins with a new byte read from the supplied file
 *		- nothing is read at the time of the call
 */
void CTinyJpegDecoder::CBitReader::Restart()
{
	m_b_0xff_byte = false;
	// forget the last byte, so 0x00 coming next is not taken for a stuffed one

	m_n_bit_num = 0;
	// no bits left, n_GetBit() has to fetch a byte
}

/*
 *								=== ~CTinyJpegDecoder::CBitReader ===
 */
