/*
								+---------------------------------+
								|                                 |
								| *** Dense numerical methods *** |
								|                                 |
								|  Copyright   -tHE SWINe- 2013  |
								|                                 |
								|            Matrix.h             |
								|                                 |
								+---------------------------------+
*/

#pragma once
#ifndef __MATRIX_MATH_INCLUDED
#define __MATRIX_MATH_INCLUDED

/**
 *	@file Matrix.h
 *	@brief dense matrix and accompanying numerical linear algebra implementations
 *	@author -tHE SWINe-
 *	@date 2013
 *
 *	@todo Improve cooperation with structures, defined in Vector.h.
 */

#include "NewFix.h"
#include "CallStack.h"
#include <vector>
#include <stdio.h>
#include <math.h>
#include "Unused.h"
#include "MinMax.h"

/**
 *	@brief a simple dense (column-major) matrix template
 *
 *	The array
 *
 *		| 1 2 3 |
 *		| 4 5 6 |
 *
 *	if stored contiguously in linear memory with column-major order would look like the following:
 *
 *		1  4  2  5  3  6
 *
 *	(from http://en.wikipedia.org/wiki/Row-major_order#Column-major_order)
 *
 *	@tparam _Ty is scalar type
 *
 *	@note This matrix is column-major, all the relevant functions and
 *		operators expect arguments in form (column, row), not the opposite.
 *
 *	@todo Think how to unify this with the statically-sized matrices from ../UberLame_src/Vector.h.
 *	@todo Think how to unify vector and matrix storage (probably can't be done,
 *		but maybe it can exist as a specialization of CMatrixStorage<1, x> if MSVC60 can handle it)
 */
template <class _Ty>
class MatrixMN {
public:
	typedef _Ty _TyScalar; /**< @brief scalar type */

protected:
	int m_n_column_num; /**< @brief number of columns of the matrix */
	int m_n_row_num; /**< @brief number of rows of the matrix */
	_TyScalar *m_p_data; /**< @brief dense (column-major) data of the matrix */

public:
	/**
	 *	@brief constructor; allocates storage for a matrix of the given dimensions
	 *
	 *	@param[in] n_column_num is new number of columns
	 *	@param[in] n_row_num is new number of rows
	 *
	 *	@note This function throws std::bad_alloc.
	 */
	inline MatrixMN(int n_column_num, int n_row_num) // throw(std::bad_alloc)
		:m_n_column_num(n_column_num), m_n_row_num(n_row_num),
		m_p_data(new _TyScalar[n_column_num * n_row_num])
	{}

	/**
	 *	@brief copy-constructor
	 *	@param[in] r_t_mat is the matrix to copy from
	 *	@note This function throws std::bad_alloc.
	 */
	MatrixMN(const MatrixMN &r_t_mat) // throw(std::bad_alloc)
		:m_n_column_num(r_t_mat.m_n_column_num), m_n_row_num(r_t_mat.m_n_row_num),
		m_p_data(new _TyScalar[r_t_mat.m_n_column_num * r_t_mat.m_n_row_num])
	{
		memcpy(m_p_data, r_t_mat.m_p_data, r_t_mat.m_n_column_num * r_t_mat.m_n_row_num * sizeof(_TyScalar));
	}

#if 0
	// NOTE(review): this constructor is compiled out; presumably it was replaced by the
	// generic template constructor that follows - confirm before removing it
	/**
	 *	@brief conversion constructor; copies a statically-sized matrix
	 *
	 *	@tparam n_cols is number of columns of the source matrix
	 *	@tparam n_rows is number of rows of the source matrix
	 *	@tparam CDerived is the last template argument of CMatrixBase (declared outside this file)
	 *
	 *	@param[in] r_t_mat is the matrix to copy from
	 *
	 *	@note This function throws std::bad_alloc.
	 */
	template <const int n_cols, const int n_rows, class CDerived>
	MatrixMN(const CMatrixBase<_TyScalar, n_cols, n_rows, CDerived> &r_t_mat) // throw(std::bad_alloc)
		:m_n_column_num(n_cols), m_n_row_num(n_rows),
		m_p_data(new _TyScalar[n_cols * n_rows])
	{
		for(int i = 0; i < n_cols; ++ i) {
			for(int j = 0; j < n_rows; ++ j)
				(*this)[i][j] = r_t_mat[i][j];
		}
		// elementwise copy; both matrices are indexed as [column][row]
	}
#endif // 0
	/**
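	 *	@brief conversion constructor; copies a matrix type with compile-time dimensions
	 *		(CDerived::n_column_num and CDerived::n_row_num)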
	 *	@param[in] r_t_mat is the matrix to copy from
	 *	@note This function throws std::bad_alloc.
	 */
	template <class CDerived>
	MatrixMN(const CDerived &r_t_mat) // throw(std::bad_alloc)
		:m_n_column_num(CDerived::n_column_num), m_n_row_num(CDerived::n_row_num),
		m_p_data(new _TyScalar[CDerived::n_column_num * CDerived::n_row_num])
	{
		for(int i = 0; i < CDerived::n_column_num; ++ i) {
			for(int j = 0; j < CDerived::n_row_num; ++ j)
				(*this)[i][j] = r_t_mat[i][j];
		}
	}

	/**
	 *	@brief destructor; deletes matrix data
	 */
	inline ~MatrixMN()
	{
		delete[] m_p_data;
		// delete[] of a null pointer is a no-op, no explicit check is needed
	}

	/**
	 *	@brief copy-operator
	 *	@param[in] r_t_mat is the matrix to copy from
	 *	@return Returns reference to this.
	 *	@note This function throws std::bad_alloc.
	 */
#if 0
	inline MatrixMN &operator =(MatrixMN r_t_mat) // throw(std::bad_alloc)
	{
		Swap(r_t_mat);
		// copy and swap; very simple, but always reallocs, possibly lower
		// performance than we can achieve

		return *this;
	}
#else // 0
	inline MatrixMN &operator =(const MatrixMN &r_t_mat) // throw(std::bad_alloc)
	{
		if(this == &r_t_mat)
			return *this;
		// self-assignment, nothing to copy (this also avoids calling memcpy()
		// with overlapping source and destination, which is undefined)

		Resize(r_t_mat.m_n_column_num, r_t_mat.m_n_row_num, false);
		// resize without preserving the contents; keeps this matrix' storage
		// if it already holds the right number of elements

		memcpy(m_p_data, r_t_mat.m_p_data, r_t_mat.m_n_column_num *
			r_t_mat.m_n_row_num * sizeof(_TyScalar));
		// copy the data

		return *this;
	}
#endif // 0

	// conversion to fixed-size matrix
	template <class CDerived>
	inline operator CDerived() const
	{
		_ASSERTE(m_n_column_num == CDerived::n_column_num &&
			m_n_row_num == CDerived::n_row_num);
		// the run-time dimensions must match the static dimensions of the target type

		CDerived t_result;
		for(int n_col = 0; n_col < CDerived::n_column_num; ++ n_col) {
			const _TyScalar *p_src_column = (*this)[n_col];
			for(int n_row = 0; n_row < CDerived::n_row_num; ++ n_row)
				t_result[n_col][n_row] = p_src_column[n_row];
		}
		// elementwise copy, one column at a time

		return t_result;
	}

	/**
	 *	@brief resizes matrix to a new size
	 *
	 *	@param[in] n_column_num is new number of columns
	 *	@param[in] n_row_num is new number of rows
	 *	@param[in] b_data_important is data important flag (if set, (part of) the data is
	 *		copied to the new matrix, otherwise the contents of the new matrix are unspecified)
	 *
	 *	@note This function throws std::bad_alloc.
	 */
	void Resize(int n_column_num, int n_row_num,
		bool b_data_important = true) // throw(std::bad_alloc)
	{
		if(n_row_num == m_n_row_num && n_column_num == m_n_column_num)
			return;
		// simple case, no resize

		if(!b_data_important && n_row_num * n_column_num == m_n_row_num * m_n_column_num) {
			m_n_row_num = n_row_num;
			m_n_column_num = n_column_num;
			return;
		}
		// simple case, storage allocated to correct size, just reshape (will lose data)

		if(!b_data_important) {
			delete[] m_p_data;
			m_p_data = 0; // if it throws?
		}
		// if don't need to copy data, free it before allocating

		_TyScalar *p_new_data = new _TyScalar[n_row_num * n_column_num];
		// need to allocate a new storage

		if(b_data_important) {
			const int m = std::min(m_n_row_num, n_row_num),
				n = std::min(m_n_column_num, n_column_num);
			for(int i = 0; i < n; ++ i) {
				for(int j = 0; j < m; ++ j)
					p_new_data[i * n_row_num + j] = m_p_data[i * m_n_row_num + j];
				for(int j = m; j < n_row_num; ++ j)
					p_new_data[i * n_row_num + j] = 0; // padd with zeros
			}
			for(int i = n; i < n_column_num; ++ i) {
				for(int j = 0; j < n_row_num; ++ j)
					p_new_data[i * n_row_num + j] = 0; // padd with zeros
			}
			delete[] m_p_data;
		}
		// if data important, 

		m_n_row_num = n_row_num;
		m_n_column_num = n_column_num;
		m_p_data = p_new_data;
		// delete old data
	}

	/**
	 *	@brief swaps contents of two matrices
	 *	@param[in,out] r_other is the other matrix to swap with
	 */
	void Swap(MatrixMN &r_other)
	{
		std::swap(m_n_row_num, r_other.m_n_row_num);
		std::swap(m_n_column_num, r_other.m_n_column_num);
		// exchange the dimensions

		std::swap(m_p_data, r_other.m_p_data);
		// exchange the storage pointers; no matrix elements are copied
	}

	/**
	 *	@brief creates identity matrix
	 *	@note This also works on square matrices.
	 */
	void Identity()
	{
		for(int y = 0; y < m_n_row_num; ++ y) {
			for(int x = 0; x < m_n_column_num; ++ x)
				m_p_data[x + m_n_row_num * y] = _TyScalar((x == y)? 1 : 0);
		}
	}

	/**
	 *	@brief sets all matrix elements to constant value
	 *	@param[in] f_value is value to set all matrix elements to
	 */
	void SetConst(_TyScalar f_value)
	{
		_TyScalar *p_elem = m_p_data;
		_TyScalar *const p_end = m_p_data + m_n_column_num * m_n_row_num;
		for(; p_elem != p_end; ++ p_elem)
			*p_elem = f_value;
		// the storage is contiguous, all the elements can be filled in a single pass
	}

	/**
	 *	@brief sets all matrix elements to zero
	 */
	void SetZero()
	{
		const size_t n_size_bytes = m_n_column_num * m_n_row_num * sizeof(_TyScalar);
		// size of the whole storage, in bytes

		memset(m_p_data, 0, n_size_bytes);
		// clears the storage to zero bytes (this relies on the all-zero bit
		// pattern being the representation of zero in _TyScalar)
	}

	/**
	 *	@brief calculates L2 norm
	 *	@return Returns L2 norm of this matrix
	 */
	inline _TyScalar f_Norm() const
	{
		const _TyScalar f_squared_norm = f_SquaredNorm();
		// sum of squares of all the elements

		return (_TyScalar)sqrt(f_squared_norm);
	}

	/**
	 *	@brief calculates squared L2 norm
	 *	@return Returns squared L2 norm of this matrix
	 */
	_TyScalar f_SquaredNorm() const
	{
		_TyScalar f_sum_squares = 0;
		const _TyScalar *p_elem = m_p_data;
		const _TyScalar *const p_end = m_p_data + m_n_column_num * m_n_row_num;
		for(; p_elem != p_end; ++ p_elem)
			f_sum_squares += (*p_elem) * (*p_elem);
		// accumulate the squares of all the elements, in storage order

		return f_sum_squares;
	}

	/**
	 *	@brief gets number of columns
	 *	@return Returns number of columns
	 */
	inline int n_Column_Num() const
	{
		return m_n_column_num;
	}

	/**
	 *	@brief gets number of rows
	 *	@return Returns number of rows
	 */
	inline int n_Row_Num() const
	{
		return m_n_row_num;
	}

	/**
	 *	@brief accesses the columns of the matrix
	 *	@param[in] n_column is zero-based index of the column
	 *	@return Returns const pointer to the elements of the selected column.
	 */
	inline const _TyScalar *operator[](int n_column) const
	{
		_ASSERTE(n_column >= 0 && n_column < m_n_column_num);
		// the column index must be valid

		const int n_column_offset = n_column * m_n_row_num;
		// columns are stored one after another, each is m_n_row_num elements long

		return m_p_data + n_column_offset;
	}

	/**
	 *	@brief accesses the columns of the matrix
	 *	@param[in] n_column is zero-based index of the column
	 *	@return Returns pointer to the elements of the selected column.
	 */
	inline _TyScalar *operator[](int n_column)
	{
		_ASSERTE(n_column >= 0 && n_column < m_n_column_num);
		// the column index must be valid

		const int n_column_offset = n_column * m_n_row_num;
		// columns are stored one after another, each is m_n_row_num elements long

		return m_p_data + n_column_offset;
	}

	/**
	 *	@brief accesses the elements of the matrix, Eigen style
	 *
	 *	@param[in] n_column is zero-based index of the column
	 *	@param[in] n_row is zero-based index of the row
	 *
	 *	@return Returns const reference to the element at the selected column and row.
	 */
	const inline _TyScalar &operator ()(int n_column, int n_row) const
	{
		_ASSERTE(n_column >= 0 && n_column < m_n_column_num);
		_ASSERTE(n_row >= 0 && n_row < m_n_row_num);
		// both indices must be valid

		const _TyScalar *p_column = m_p_data + n_column * m_n_row_num;
		// beginning of the selected column

		return p_column[n_row];
	}

	/**
	 *	@brief accesses the elements of the matrix, Eigen style
	 *
	 *	@param[in] n_column is zero-based index of the column
	 *	@param[in] n_row is zero-based index of the row
	 *
	 *	@return Returns reference to the element at the selected column and row.
	 */
	inline _TyScalar &operator ()(int n_column, int n_row)
	{
		_ASSERTE(n_column >= 0 && n_column < m_n_column_num);
		_ASSERTE(n_row >= 0 && n_row < m_n_row_num);
		// both indices must be valid

		_TyScalar *p_column = m_p_data + n_column * m_n_row_num;
		// beginning of the selected column

		return p_column[n_row];
	}

	/**
	 *	@brief below diagonal element predicate
	 *
	 *	@param[in] n_column is zero-based index of a column
	 *	@param[in] n_row is zero-based index of a row
	 *
	 *	@return Returns true if the element at the selected column and row
	 *		is below the diagonal, otherwise returns false.
	 */
	static inline bool b_IsLowerElem(int n_column, int n_row)
	{
		return n_column < n_row; // column < row: strictly below the diagonal
	}

	/**
	 *	@brief lower-triangular element predicate
	 *
	 *	@param[in] n_column is zero-based index of a column
	 *	@param[in] n_row is zero-based index of a row
	 *
	 *	@return Returns true if the element at the selected column and row
	 *		is below or on the diagonal, otherwise returns false.
	 */
	static inline bool b_IsLowerDiagElem(int n_column, int n_row)
	{
		return n_column <= n_row; // column <= row: below or on the diagonal
	}

	/**
	 *	@brief diagonal element predicate
	 *
	 *	@param[in] n_column is zero-based index of a column
	 *	@param[in] n_row is zero-based index of a row
	 *
	 *	@return Returns true if the element at the selected column and row
	 *		is on the diagonal, otherwise returns false.
	 */
	static inline bool b_IsDiagElem(int n_column, int n_row)
	{
		return n_row == n_column; // column == row: on the diagonal
	}

	/**
	 *	@brief upper-triangular element predicate
	 *
	 *	@param[in] n_column is zero-based index of a column
	 *	@param[in] n_row is zero-based index of a row
	 *
	 *	@return Returns true if the element at the selected column and row
	 *		is above or on the diagonal, otherwise returns false.
	 */
	static inline bool b_IsUpperDiagElem(int n_column, int n_row)
	{
		return n_row <= n_column; // row <= column: above or on the diagonal
	}

	/**
	 *	@brief above-diagonal element predicate
	 *
	 *	@param[in] n_column is zero-based index of a column
	 *	@param[in] n_row is zero-based index of a row
	 *
	 *	@return Returns true if the element at the selected column and row
	 *		is above the diagonal, otherwise returns false.
	 */
	static inline bool b_IsUpperElem(int n_column, int n_row)
	{
		return n_row < n_column; // row < column: strictly above the diagonal
	}

	/**
	 *	@brief calculates transpose of this matrix
	 *	@return Returns transpose of this matrix.
	 *	@note This function throws std::bad_alloc.
	 */
	MatrixMN t_Transpose() const // throw(std::bad_alloc)
	{
		MatrixMN t_result(m_n_row_num, m_n_column_num);
		// the transpose has the numbers of columns and rows exchanged

		for(int n_row = 0; n_row < m_n_row_num; ++ n_row) {
			_TyScalar *p_dest_column = t_result[n_row];
			// a row of this matrix becomes a column of the result

			for(int n_col = 0; n_col < m_n_column_num; ++ n_col)
				p_dest_column[n_col] = (*this)[n_col][n_row];
		}
		return t_result;
	}

	/**
	 *	@brief calculates minor of this matrix
	 *
	 *	@param[in] n_skip_column is zero-based index of a column
	 *	@param[in] n_skip_row is zero-based index of a row
	 *
	 *	@return Returns a copy of this matrix, with the selected column and row omitted.
	 *
	 *	@note This function throws std::bad_alloc.
	 */
	MatrixMN t_Minor(int n_skip_column, int n_skip_row) const // throw(std::bad_alloc)
	{
		_ASSERTE(n_skip_column >= 0 && n_skip_column < m_n_column_num);
		_ASSERTE(n_skip_row >= 0 && n_skip_row < m_n_row_num);
		// the column and the row to be removed must exist

		MatrixMN t_matrix(m_n_column_num - 1, m_n_row_num - 1);
		// the result is one column and one row smaller

		for(int y = 0; y < m_n_row_num; ++ y) {
			if(y == n_skip_row)
				continue;
			const int n_dest_row = (y > n_skip_row)? y - 1 : y;
			// rows past the removed one move up by one

			for(int x = 0; x < m_n_column_num; ++ x) {
				if(x == n_skip_column)
					continue;
				const int n_dest_column = (x > n_skip_column)? x - 1 : x;
				// columns past the removed one move left by one

				t_matrix[n_dest_column][n_dest_row] = (*this)[x][y];
			}
		}
		return t_matrix;
	}

	/**
	 *	@brief calculates cofactor of a selected element
	 *
	 *	@param[in] n_column is zero-based index of a column
	 *	@param[in] n_row is zero-based index of a row
	 *
	 *	@return Returns cofactor of the element at the selected column and row.
	 *
	 *	@note This function throws std::bad_alloc.
	 */
	inline _TyScalar f_Cofactor(int n_column, int n_row) const // throw(std::bad_alloc)
	{
		_ASSERTE(n_column >= 0 && n_column < m_n_column_num);
		_ASSERTE(n_row >= 0 && n_row < m_n_row_num);
		// the element must exist

		const int n_sign = ((n_column + n_row) & 1)? -1 : 1;
		// the sign alternates in a checkerboard pattern: negative if column + row is odd

		return n_sign * t_Minor(n_column, n_row).f_Determinant();
		// signed determinant of the matrix with the selected column and row removed
	}

	/**
	 *	@brief calculates determinant of this matrix (must be square) using Sarrus' rule
	 *		for matrices up to 3x3 and Laplace expansion for bigger ones
	 *	@return Returns determinant of this matrix.
	 *	@note This function throws std::bad_alloc for matrices bigger than 3x3 elements.
	 *	@note This function is recursive and rather slow. Faster and possibly more precise
	 *		solutions can be obtained using LU decomposition.
	 */
	_TyScalar f_Determinant() const // throw(std::bad_alloc)
	{
		_ASSERTE(m_n_column_num == m_n_row_num);
		// the determinant is only defined for square matrices

		const MatrixMN &r_M = *this;
		// shorthand, the elements are addressed as r_M[column][row]

		if(m_n_column_num == m_n_row_num) {
			switch(m_n_column_num) {
			case 0:
				return 1; // an empty matrix
			case 1:
				return r_M[0][0];
			case 2:
				return r_M[0][0] * r_M[1][1] - r_M[0][1] * r_M[1][0];
			case 3:
				return r_M[0][0] * r_M[1][1] * r_M[2][2] +
					r_M[0][1] * r_M[1][2] * r_M[2][0] +
					r_M[0][2] * r_M[1][0] * r_M[2][1] -
					r_M[2][0] * r_M[1][1] * r_M[0][2] -
					r_M[2][1] * r_M[1][2] * r_M[0][0] -
					r_M[2][2] * r_M[1][0] * r_M[0][1];
			default:
				break;
			}
		}
		// small matrices have closed-form solutions (Sarrus' rule for 3x3)

		_TyScalar f_det = 0;
		for(int n_col = 0; n_col < m_n_column_num; ++ n_col)
			f_det += r_M[n_col][0] * f_Cofactor(n_col, 0);
		// bigger matrices use Laplace's formula (expansion along the first row);
		// this recurses through f_Cofactor() on matrices one smaller in each dimension

		return f_det;
	}

	/**
	 *	@brief calculates adjugate of this matrix (must be square)
	 *	@return Returns matrix, containing the adjugate.
	 *	@note This function throws std::bad_alloc.
	 */
	MatrixMN t_Adjugate() const // throw(std::bad_alloc)
	{
		_ASSERTE(m_n_column_num == m_n_row_num);
		// only defined for square matrices

		MatrixMN t_result(m_n_column_num, m_n_row_num);
		for(int n_row = 0; n_row < m_n_row_num; ++ n_row) {
			_TyScalar *p_dest_column = t_result[n_row];
			for(int n_col = 0; n_col < m_n_column_num; ++ n_col)
				p_dest_column[n_col] = f_Cofactor(n_col, n_row);
		}
		// the adjugate is the transpose of the cofactor matrix; the cofactor of the element
		// at (column, row) is therefore stored at (row, column)

		return t_result;
	}

	/**
	 *	@brief calculates inverse of this matrix (must be square)
	 *	@return Returns matrix, containing the inverse.
	 *	@note This function throws std::bad_alloc.
	 */
	MatrixMN t_Inverse() const // throw(std::bad_alloc)
	{
		_ASSERTE(m_n_column_num == m_n_row_num);
		// only defined for square matrices

		MatrixMN t_result(t_Adjugate());
		const _TyScalar f_inv_det = 1 / f_Determinant();
		// the inverse is the adjugate, divided by the determinant (the determinant
		// is not checked for zero here)

		for(int y = 0; y < m_n_row_num; ++ y) {
			_TyScalar *p_column = t_result[y];
			for(int x = 0; x < m_n_column_num; ++ x)
				p_column[x] *= f_inv_det;
		}
		// scale all the elements of the adjugate

		return t_result;
	}

	/**
	 *	@brief calculates elementwise sum of matrix elements and a scalar
	 *	@param[in] f_scalar is a scalar
	 *	@return Returns reference to this.
	 */
	MatrixMN &operator +=(_TyScalar f_scalar)
	{
		_TyScalar *p_elem = m_p_data;
		_TyScalar *const p_end = m_p_data + m_n_column_num * m_n_row_num;
		for(; p_elem != p_end; ++ p_elem)
			*p_elem += f_scalar;
		// add the scalar to every element, in storage order

		return *this;
	}

	/**
	 *	@brief calculates elementwise difference of matrix elements and a scalar
	 *	@param[in] f_scalar is a scalar
	 *	@return Returns reference to this.
	 */
	MatrixMN &operator -=(_TyScalar f_scalar)
	{
		_TyScalar *p_elem = m_p_data;
		_TyScalar *const p_end = m_p_data + m_n_column_num * m_n_row_num;
		for(; p_elem != p_end; ++ p_elem)
			*p_elem -= f_scalar;
		// subtract the scalar from every element, in storage order

		return *this;
	}

	/**
	 *	@brief calculates elementwise multiplication of matrix elements by a scalar
	 *	@param[in] f_scalar is a scalar
	 *	@return Returns reference to this.
	 */
	MatrixMN &operator *=(_TyScalar f_scalar)
	{
		_TyScalar *p_elem = m_p_data;
		_TyScalar *const p_end = m_p_data + m_n_column_num * m_n_row_num;
		for(; p_elem != p_end; ++ p_elem)
			*p_elem *= f_scalar;
		// multiply every element by the scalar, in storage order

		return *this;
	}

	/**
	 *	@brief calculates elementwise division of matrix elements by a scalar
	 *	@param[in] f_scalar is a scalar
	 *	@return Returns reference to this.
	 */
	MatrixMN &operator /=(_TyScalar f_scalar)
	{
		const _TyScalar f_inv_scalar = 1 / f_scalar;
		// the reciprocal is calculated once and the elements are multiplied by it
		// (multiplication is faster than division); note that this expects
		// a floating-point scalar type, the reciprocal truncates in integer types

		_TyScalar *p_elem = m_p_data;
		_TyScalar *const p_end = m_p_data + m_n_column_num * m_n_row_num;
		for(; p_elem != p_end; ++ p_elem)
			*p_elem *= f_inv_scalar;

		return *this;
	}

	/**
	 *	@brief calculates elementwise sum of matrix elements and a scalar
	 *	@param[in] f_scalar is a scalar
	 *	@return Returns matrix, containing the sums.
	 *	@note This function throws std::bad_alloc.
	 */
	MatrixMN operator +(_TyScalar f_scalar) const // throw(std::bad_alloc)
	{
		MatrixMN t_result(m_n_column_num, m_n_row_num);
		// the result has the same dimensions as this matrix

		const int n_elem_num = m_n_column_num * m_n_row_num;
		const _TyScalar *p_src = m_p_data;
		_TyScalar *p_dest = t_result.m_p_data;
		for(int i = 0; i < n_elem_num; ++ i, ++ p_src, ++ p_dest)
			*p_dest = *p_src + f_scalar;
		// both matrices have the same layout, a single linear pass suffices

		return t_result;
	}

	/**
	 *	@brief calculates elementwise difference of matrix elements and a scalar
	 *	@param[in] f_scalar is a scalar
	 *	@return Returns matrix, containing the differences.
	 *	@note This function throws std::bad_alloc.
	 */
	MatrixMN operator -(_TyScalar f_scalar) const // throw(std::bad_alloc)
	{
		MatrixMN t_result(m_n_column_num, m_n_row_num);
		// the result has the same dimensions as this matrix

		const int n_elem_num = m_n_column_num * m_n_row_num;
		const _TyScalar *p_src = m_p_data;
		_TyScalar *p_dest = t_result.m_p_data;
		for(int i = 0; i < n_elem_num; ++ i, ++ p_src, ++ p_dest)
			*p_dest = *p_src - f_scalar;
		// both matrices have the same layout, a single linear pass suffices

		return t_result;
	}

	/**
	 *	@brief calculates elementwise multiplication of matrix elements by a scalar
	 *	@param[in] f_scalar is a scalar
	 *	@return Returns matrix, containing the products.
	 *	@note This function throws std::bad_alloc.
	 */
	MatrixMN operator *(_TyScalar f_scalar) const // throw(std::bad_alloc)
	{
		MatrixMN t_result(m_n_column_num, m_n_row_num);
		// the result has the same dimensions as this matrix

		const int n_elem_num = m_n_column_num * m_n_row_num;
		const _TyScalar *p_src = m_p_data;
		_TyScalar *p_dest = t_result.m_p_data;
		for(int i = 0; i < n_elem_num; ++ i, ++ p_src, ++ p_dest)
			*p_dest = *p_src * f_scalar;
		// both matrices have the same layout, a single linear pass suffices

		return t_result;
	}

	/**
	 *	@brief calculates elementwise division of matrix elements by a scalar
	 *	@param[in] f_scalar is a scalar
	 *	@return Returns matrix, containing the quotients.
	 *	@note This function throws std::bad_alloc.
	 */
	MatrixMN operator /(_TyScalar f_scalar) const // throw(std::bad_alloc)
	{
		const _TyScalar f_inv_scalar = 1 / f_scalar;
		// the reciprocal is calculated once and the elements are multiplied by it
		// (multiplication is faster than division); note that this expects
		// a floating-point scalar type, the reciprocal truncates in integer types

		MatrixMN t_result(m_n_column_num, m_n_row_num);
		const int n_elem_num = m_n_column_num * m_n_row_num;
		const _TyScalar *p_src = m_p_data;
		_TyScalar *p_dest = t_result.m_p_data;
		for(int i = 0; i < n_elem_num; ++ i, ++ p_src, ++ p_dest)
			*p_dest = *p_src * f_inv_scalar;

		return t_result;
	}

	/**
	 *	@brief calculates elementwise sum of two matrices inplace
	 *	@param[in] r_t_mat is right-hand side addend (dimensions must match this matrix)
	 *	@return Returns reference to this.
	 */
	MatrixMN &operator +=(const MatrixMN &r_t_mat)
	{
		_ASSERTE(m_n_column_num == r_t_mat.m_n_column_num &&
			m_n_row_num == r_t_mat.m_n_row_num);
		// dimensions must agree

		const int n_elem_num = m_n_column_num * m_n_row_num;
		const _TyScalar *p_right = r_t_mat.m_p_data;
		_TyScalar *p_left = m_p_data;
		for(int i = 0; i < n_elem_num; ++ i, ++ p_left, ++ p_right)
			*p_left += *p_right;
		// same dimensions imply the same layout, the elements correspond one to one

		return *this;
	}

	/**
	 *	@brief calculates elementwise difference of two matrices inplace
	 *	@param[in] r_t_mat is right-hand side subtrahend (dimensions must match this matrix)
	 *	@return Returns reference to this.
	 */
	MatrixMN &operator -=(const MatrixMN &r_t_mat)
	{
		_ASSERTE(m_n_column_num == r_t_mat.m_n_column_num &&
			m_n_row_num == r_t_mat.m_n_row_num);
		// dimensions must agree

		const int n_elem_num = m_n_column_num * m_n_row_num;
		const _TyScalar *p_right = r_t_mat.m_p_data;
		_TyScalar *p_left = m_p_data;
		for(int i = 0; i < n_elem_num; ++ i, ++ p_left, ++ p_right)
			*p_left -= *p_right;
		// same dimensions imply the same layout, the elements correspond one to one

		return *this;
	}

	/**
	 *	@brief unary minus operator
	 *	@return Returns an elementwise negative of this matrix.
	 *	@note This function throws std::bad_alloc.
	 */
	MatrixMN operator -() const // throw(std::bad_alloc)
	{
		MatrixMN t_result(m_n_column_num, m_n_row_num);
		// the result has the same dimensions as this matrix

		const int n_elem_num = m_n_column_num * m_n_row_num;
		const _TyScalar *p_src = m_p_data;
		_TyScalar *p_dest = t_result.m_p_data;
		for(int i = 0; i < n_elem_num; ++ i, ++ p_src, ++ p_dest)
			*p_dest = -*p_src;
		// negate every element

		return t_result;
	}

	/**
	 *	@brief calculates elementwise sum of two matrices
	 *	@param[in] r_t_mat is right-hand side addend (dimensions must match this matrix)
	 *	@return Returns matrix, containing the sum.
	 *	@note This function throws std::bad_alloc.
	 */
	MatrixMN operator +(const MatrixMN &r_t_mat) const // throw(std::bad_alloc)
	{
		_ASSERTE(m_n_column_num == r_t_mat.m_n_column_num &&
			m_n_row_num == r_t_mat.m_n_row_num);
		// dimensions must agree

		MatrixMN t_result(m_n_column_num, m_n_row_num);
		const int n_elem_num = m_n_column_num * m_n_row_num;
		const _TyScalar *p_left = m_p_data, *p_right = r_t_mat.m_p_data;
		_TyScalar *p_dest = t_result.m_p_data;
		for(int i = 0; i < n_elem_num; ++ i, ++ p_left, ++ p_right, ++ p_dest)
			*p_dest = *p_left + *p_right;
		// all three matrices have the same layout, the elements correspond one to one

		return t_result;
	}

	/**
	 *	@brief calculates elementwise difference of two matrices
	 *	@param[in] r_t_mat is right-hand side subtrahend (dimensions must match this matrix)
	 *	@return Returns matrix, containing the difference.
	 *	@note This function throws std::bad_alloc.
	 */
	MatrixMN operator -(const MatrixMN &r_t_mat) const // throw(std::bad_alloc)
	{
		_ASSERTE(m_n_column_num == r_t_mat.m_n_column_num &&
			m_n_row_num == r_t_mat.m_n_row_num);
		// dimensions must agree

		MatrixMN t_result(m_n_column_num, m_n_row_num);
		const int n_elem_num = m_n_column_num * m_n_row_num;
		const _TyScalar *p_left = m_p_data, *p_right = r_t_mat.m_p_data;
		_TyScalar *p_dest = t_result.m_p_data;
		for(int i = 0; i < n_elem_num; ++ i, ++ p_left, ++ p_right, ++ p_dest)
			*p_dest = *p_left - *p_right;
		// all three matrices have the same layout, the elements correspond one to one

		return t_result;
	}

	/**
	 *	@brief calculates product of two matrices inplace
	 *	@param[in] r_t_mat is right-hand side multiplicand
	 *		(number of rows must match this matrix' number of columns)
	 *	@return Returns reference to this.
	 *	@note This function throws std::bad_alloc.
	 */
	inline MatrixMN &operator *=(const MatrixMN &r_t_mat) // throw(std::bad_alloc)
	{
		MatrixMN t_product((*this) * r_t_mat);
		// the product can not be calculated inplace, a temporary matrix is needed

		Swap(t_product);
		// take over the storage of the product; the old contents of this
		// matrix are deleted along with the temporary

		return *this;
	}

	/**
	 *	@brief calculates product of two matrices
	 *	@param[in] r_t_mat is right-hand side multiplicand
	 *		(number of rows must match this matrix' number of columns)
	 *	@return Returns matrix, containing the product.
	 *	@note This function throws std::bad_alloc.
	 */
	MatrixMN operator *(const MatrixMN &r_t_mat) const // throw(std::bad_alloc)
	{
		_ASSERTE(m_n_column_num == r_t_mat.m_n_row_num);
		// the inner dimensions must agree

		MatrixMN t_product(r_t_mat.m_n_column_num, m_n_row_num);
		// the product has as many columns as the right-hand side and as many rows
		// as this matrix (note that the constructor takes the number of columns first)

		for(int x = 0; x < r_t_mat.m_n_column_num; ++ x) {
			for(int y = 0; y < m_n_row_num; ++ y) {
				_TyScalar f_sum_prod = 0;
				for(int r = 0; r < m_n_column_num; ++ r)
					f_sum_prod += (*this)[r][y] * r_t_mat[x][r];
				// dot product of row y of this matrix and column x of the right-hand side

				t_product[x][y] = f_sum_prod;
			}
		}
		return t_product;
	}

	std::vector<_TyScalar> operator *(std::vector<_TyScalar> &r_t_vec) const // throw(std::bad_alloc)
	{
		_ASSERTE(m_n_column_num == r_t_vec.size());
		// must be the same dimensions

		std::vector<_TyScalar> t_product(m_n_row_num);
		for(int y = 0; y < m_n_row_num; ++ y) {
			_TyScalar f_sum_prod = 0;
			for(int r = 0; r < m_n_column_num; ++ r)
				f_sum_prod += (*this)[r][y] * r_t_vec[r];
			t_product[y] = f_sum_prod;
		}
		return t_product;
	}

	/**
	 *	@brief solves a system of linear equations Ax = b using Cramer's rule
	 *
	 *	@param[in] t_A is system matrix (must be square; no particular structure is required)
	 *	@param[in] t_rhs is right-hand-side vector (b; size t_A.n_Column_Num() x 1 elements)
	 *
	 *	@return Returns a column vector, containing the solution (x).
	 *
	 *	@note This function throws std::bad_alloc.
	 *	@note This function can not work inplace.
	 *	@note This function is too slow and numerically unstable to be practically used
	 *		(unless your matrix is very small). Faster solutions can be obtained using LU()
	 *		or Cholesky() decompositions.
	 */
	static MatrixMN t_SolveLinear(const MatrixMN &t_A, const MatrixMN &t_rhs) // throw(std::bad_alloc)
	{
		_ASSERTE(t_rhs.m_n_column_num == 1);
		// the right-hand side must be a column vector

		MatrixMN t_result(1, t_A.m_n_row_num);
		// the solution is a column vector as well (one column, as many rows as t_A)

		SolveLinear(t_result.m_p_data, t_result.m_n_row_num,
			t_A, t_rhs.m_p_data, t_rhs.m_n_row_num);
		// a single-column matrix is stored as a contiguous vector, the pointer
		// version can work directly with the matrix storage

		return t_result;
	}

	/**
	 *	@brief solves a system of linear equations Ax = b using Cramer's rule
	 *
	 *	@param[in] t_A is system matrix (must be square; no particular structure is required)
	 *	@param[in] p_rhs is right-hand-side vector (b; allocated to n_rhs_size elements)
	 *	@param[in] n_rhs_size is size of the right-hand-side vector (must equal t_A.n_Column_Num())
	 *
	 *	@return Returns a column vector, containing the solution (x).
	 *
	 *	@note This function throws std::bad_alloc.
	 *	@note This function can not work inplace.
	 *	@note This function is too slow and numerically unstable to be practically used
	 *		(unless your matrix is very small). Faster solutions can be obtained using LU()
	 *		or Cholesky() decompositions.
	 */
	static MatrixMN t_SolveLinear(const MatrixMN &t_A,
		const _TyScalar *p_rhs, int n_rhs_size) // throw(std::bad_alloc)
	{
		MatrixMN t_result(1, t_A.m_n_row_num);
		// the solution is a column vector (one column, as many rows as t_A)

		SolveLinear(t_result.m_p_data, t_result.m_n_row_num, t_A, p_rhs, n_rhs_size);
		// a single-column matrix is stored as a contiguous vector, the pointer
		// version can write directly to the matrix storage

		return t_result;
	}

	/**
	 *	@brief solves a system of linear equations Ax = b using Cramer's rule
	 *
	 *	@param[out] p_lhs is left-hand-side vector (x; allocated to n_lhs_size elements)
	 *	@param[in] n_lhs_size is size of the left-hand-side vector (must equal t_A.n_Column_Num())
	 *	@param[in] t_A is system matrix (must be square; no particular structure is required)
	 *	@param[in] p_rhs is right-hand-side vector (b; allocated to n_rhs_size elements)
	 *	@param[in] n_rhs_size is size of the right-hand-side vector (must equal t_A.n_Column_Num())
	 *
	 *	@note This function throws std::bad_alloc.
	 *	@note This function can not work inplace.
	 *	@note This function is too slow and numerically unstable to be practically used
	 *		(unless your matrix is very small). Faster solutions can be obtained using LU()
	 *		or Cholesky() decompositions.
	 */
	static void SolveLinear(_TyScalar *p_lhs, int UNUSED(n_lhs_size),
		const MatrixMN &t_A, const _TyScalar *p_rhs, int UNUSED(n_rhs_size)) // throw(std::bad_alloc)
	{
		_ASSERTE(p_lhs != p_rhs);
		_ASSERTE(t_A.m_n_column_num == n_lhs_size &&
			t_A.m_n_row_num == n_rhs_size &&
			t_A.m_n_column_num == t_A.m_n_row_num);
		// some preconditions: can't work inplace, the sizes of both vectors
		// must match the system matrix and the system matrix must be square

		_TyScalar f_A_inv_det = 1 / t_A.f_Determinant();
		// calculated only once (the determinant is not checked for zero here)

		MatrixMN t_tmp(t_A);
		for(int i = 0; i < t_A.m_n_row_num; ++ i) {
			for(int j = 0; j < t_A.m_n_row_num; ++ j)
				t_tmp[i][j] = p_rhs[j];
			// t_tmp is a copy of the coefficient matrix, with column i
			// replaced by the right-hand-side vector

			p_lhs[i] = t_tmp.f_Determinant() * f_A_inv_det;
			// calculate one variable at a time

			if(i + 1 < t_A.m_n_row_num) {
				for(int j = 0; j < t_A.m_n_row_num; ++ j)
					t_tmp[i][j] = t_A[i][j];
			}
			// in case we're not done yet, put the column back
		}
		// use Cramer's rule
	}

	/**
	 *	@brief calculates LU decomposition inplace with partial pivoting (assumes square matrix)
	 *
	 *	@param[out] p_row_perm is row permutation given by pivoting (allocated to n_row_perm_size)
	 *	@param[in] n_row_perm_size is size of row permutation (must equal n_Column_Num())
	 *
	 *	@return Returns true on success, false on failure (singular matrix).
	 *
	 *	@note This function throws std::bad_alloc (needs workspace for pivoting).
	 */
	inline bool LU(int *p_row_perm, size_t n_row_perm_size) // throw(std::bad_alloc)
	{
		return LU(p_row_perm, n_row_perm_size, *this, *this); // inplace LU
	}

	/**
	 *	@brief calculates LU decomposition with partial pivoting (assumes square matrix)
	 *
	 *	@param[out] p_row_perm is row permutation given by pivoting (allocated to n_row_perm_size)
	 *	@param[in] n_row_perm_size is size of row permutation (must equal n_Column_Num())
	 *	@param[out] r_LU is a matrix, containing packed LU decomposition
	 *		(L's unit diagonal is not stored)
	 *
	 *	@return Returns true on success, false on failure (singular matrix).
	 *
	 *	@note This function throws std::bad_alloc (needs workspace for pivoting).
	 *	@note This function can work inplace (r_LU can point to this).
	 *	@note If r_LU is already allocated, the storage is kept.
	 */
	inline bool LU(int *p_row_perm, size_t n_row_perm_size, MatrixMN &r_LU) const // throw(std::bad_alloc)
	{
		const bool b_success = LU(p_row_perm, n_row_perm_size, r_LU, r_LU);
		// the same matrix is passed as both the L and the U output (packed L and U)

		return b_success;
	}

	/**
	 *	@brief solves a system of linear equations LUx = b using previously calculated LU decomposition
	 *
	 *	@param[out] p_lhs is left-hand-side vector (x; allocated to n_lhs_size elements)
	 *	@param[in] n_lhs_size is size of the left-hand-side vector (must equal t_A.n_Column_Num())
	 *	@param[in] p_row_perm is row permutation given by pivoting (allocated to n_row_perm_size)
	 *	@param[in] n_row_perm_size is size of row permutation (must equal n_Column_Num())
	 *	@param[in] p_rhs is right-hand-side vector (b; allocated to n_rhs_size elements)
	 *	@param[in] n_rhs_size is size of the right-hand-side vector (must equal t_A.n_Column_Num())
	 *
	 *	@note This function throws std::bad_alloc.
	 *	@note This function can not work inplace.
	 */
	inline void LU_Solve(_TyScalar *p_lhs, int UNUSED(n_lhs_size),
		int *p_row_perm, size_t n_row_perm_size,
		const _TyScalar *p_rhs, int UNUSED(n_rhs_size)) const
	{
		const MatrixMN &r_packed_LU = *this;
		// this matrix is passed as both the L and the U factor

		LU_Solve(r_packed_LU, r_packed_LU, p_lhs, n_lhs_size,
			p_row_perm, n_row_perm_size, p_rhs, n_rhs_size);
		// NOTE(review): n_lhs_size and n_rhs_size are declared using UNUSED() but they are
		// forwarded here - confirm that the macro keeps the parameter names in all builds
	}

	/**
	 *	@brief solves a system of linear equations LUx = b using previously calculated LU decomposition
	 *
	 *	@param[in] L is the lower part of the LU decomposition (the diagonal is ignored)
	 *	@param[in] U is the upper part of the LU decomposition (can be the same matrix as L)
	 *	@param[out] p_lhs is left-hand-side vector (x; allocated to n_lhs_size elements)
	 *	@param[in] n_lhs_size is size of the left-hand-side vector (must equal t_A.n_Column_Num())
	 *	@param[in] p_row_perm is row permutation given by pivoting (allocated to n_row_perm_size)
	 *	@param[in] n_row_perm_size is size of row permutation (must equal n_Column_Num())
	 *	@param[in] p_rhs is right-hand-side vector (b; allocated to n_rhs_size elements)
	 *	@param[in] n_rhs_size is size of the right-hand-side vector (must equal t_A.n_Column_Num())
	 *
	 *	@note This function can not work inplace.
	 */
	static inline void LU_Solve(const MatrixMN &L, const MatrixMN &U,
		_TyScalar *p_lhs, int UNUSED(n_lhs_size),
		int *p_row_perm, size_t n_row_perm_size,
		const _TyScalar *p_rhs, int UNUSED(n_rhs_size))
	{
		_ASSERTE(L.m_n_column_num == L.m_n_row_num);
		_ASSERTE(U.m_n_column_num == U.m_n_row_num);
		_ASSERTE(U.m_n_column_num == L.m_n_column_num);
		// both factors must be square and of the same size

		_ASSERTE(p_row_perm && n_row_perm_size == L.m_n_column_num);
		_ASSERTE(n_lhs_size == n_rhs_size);
		_ASSERTE(n_lhs_size == n_row_perm_size);
		// the permutation and both vectors must match the size of the factors

		_ASSERTE(p_lhs != p_rhs); // can't work inplace

		for(size_t n_dest = 0; n_dest < n_row_perm_size; ++ n_dest) {
			const int n_src = p_row_perm[n_dest];
			p_lhs[n_dest] = p_rhs[n_src];
		}
		// gather the right-hand side in the order given by the row permutation
		// (this is the reason why the function can't work inplace)

		L.LSolve_UnitDiag(p_lhs, n_row_perm_size);
		U.USolve(p_lhs, n_row_perm_size);
		// both solves work inplace on p_lhs; presumably a forward substitution with L
		// followed by a back substitution with U (the functions are defined elsewhere)
	}

	/**
	 *	@brief calculates determinant of the matrix, using the U matrix of a LU decomposition
	 *
	 *	@param[in] p_row_perm is row permutation given by pivoting
	 *		(contains n_row_perm_size elements)
	 *	@param[in] n_row_perm_size is size of row permutation (must equal n_Column_Num())
	 *
	 *	@return Returns the value of the determinant of the original matrix A = LU.
	 *
	 *	@note This function throws std::bad_alloc (needs a small workspace to determine sign).
	 *	@note This matrix must be either the U matrix or the packed
	 *		LU matrix of the LU decomposition.
	 */
	_TyScalar f_Determinant(const int *p_row_perm, size_t n_row_perm_size) const // throw(std::bad_alloc)
	{
		_ASSERTE(m_n_column_num == m_n_row_num);
		_ASSERTE(p_row_perm && n_row_perm_size == m_n_column_num);

		const int n_dim = m_n_column_num;

		double f_product = 1;
		// product of the diagonal, accumulated in double regardless of the scalar type
		// (can still under / overflow with big matrices)

		bool b_odd_perm = false;
		std::vector<int> perm_copy(p_row_perm, p_row_perm + n_row_perm_size);
		// scratch copy of the permutation; it is sorted back to identity below
		// and the parity of the number of swaps needed gives the sign

		for(int n_pos = 0; n_pos < n_dim; ++ n_pos) {
			while(perm_copy[n_pos] != n_pos) {
				std::swap(perm_copy[perm_copy[n_pos]], perm_copy[n_pos]);
				// puts the value perm_copy[n_pos] to its final position

				b_odd_perm = !b_odd_perm;
				// every swap flips the sign of the determinant
			}
			// may take several swaps until the right value arrives at n_pos

			f_product *= (*this)[n_pos][n_pos];
			// only the diagonal is read (this is U or packed LU)
		}

		const int n_sign = (b_odd_perm)? -1 : 1;
		return _TyScalar(n_sign * f_product);
	}

	/**
	 *	@brief calculates LU decomposition with partial pivoting (assumes square matrix)
	 *
	 *	@param[out] p_row_perm is row permutation given by pivoting (allocated to n_row_perm_size)
	 *	@param[in] n_row_perm_size is size of row permutation (must equal n_Column_Num())
	 *	@param[out] r_L is the L matrix of LU decomposition
	 *	@param[out] r_U is the U matrix of LU decomposition
	 *
	 *	@return Returns true on success, false on failure (singular matrix).
	 *
	 *	@note This function throws std::bad_alloc (needs workspace for pivoting).
	 *	@note This function can work inplace (r_L or r_U can point to this), and r_L and r_U can
	 *		also point to the same matrix (even this; LU decomposition is then stored in packed
	 *		form where the unit diagonal of L is not represented).
	 *	@note If r_L or r_U are already allocated, the storage is kept.
	 */
	bool LU(int *p_row_perm, size_t UNUSED(n_row_perm_size),
		MatrixMN &r_L, MatrixMN &r_U) const // throw(std::bad_alloc)
	{
		// outline:
		//	1) calculate per-row scaling factors for pivoting (fails on an all-zero row)
		//	2) copy this matrix to r_L / r_U, unless working inplace
		//	3) eliminate column by column with scaled partial pivoting, recording the row
		//	   swaps in p_row_perm; fails if a pivot is exactly zero (singular matrix)
		// on failure in step 3, r_L and r_U are left partially factorized

		_ASSERTE(m_n_column_num == m_n_row_num);
		_ASSERTE(p_row_perm && n_row_perm_size == m_n_column_num);

		const int n = m_n_column_num;
		if(r_L.m_n_row_num != n || r_L.m_n_column_num != n)
			r_L.Resize(n, n, false);
		if(r_U.m_n_row_num != n || r_U.m_n_column_num != n)
			r_U.Resize(n, n);
		// alloc matrices, if not already allocated
		// NOTE(review): the two Resize() calls pass different arguments - confirm this is intended

		const MatrixMN &r_A = *this;

		std::vector<_TyScalar> piv_weights(n);
		for(int j = 0; j < n; ++ j) {
			_TyScalar f_max = 0;
			for(int i = 0; i < n; ++ i) {
				_TyScalar f_temp = fabs(r_A[i][j]); // column i, row j: scans row j
				if(f_temp > f_max)
					f_max = f_temp;
			}
			if(f_max == 0) {
				fprintf(stderr, "error: singular\n");
				return false;
			}
			piv_weights[j] = 1 / f_max;
		}
		// calculate (approximate) implicit scaling for pivoting
		// (one weight per row, the reciprocal of the largest magnitude in that row)

		if(&r_U != this || &r_L != this) {
			if(&r_U != &r_L) {
				if(&r_U != this && &r_L != this) {
					for(int j = 0; j < n; ++ j) { // for every column ...
						for(int i = 0; i <= j; ++ i) { // including the diagonal
							_ASSERTE(b_IsUpperDiagElem(j, i));
							r_U[j][i] = r_A[j][i]; // row < column: upper
						}
						for(int i = j + 1; i < n; ++ i) { // do not overwrite the diagonal in L
							_ASSERTE(b_IsLowerElem(j, i));
							r_L[j][i] = r_A[j][i]; // row > column: lower
						}
						// copy the column to L, U
					}
				} else if(&r_U != this) {
					for(int j = 0; j < n; ++ j) { // for every column ...
						for(int i = 0; i <= j; ++ i) { // including the diagonal
							_ASSERTE(b_IsUpperDiagElem(j, i));
							r_U[j][i] = r_A[j][i]; // row < column: upper
						}
						// copy the column to U, L is inplace
					}
				} else /*if(&r_L != this)*/ {
					for(int j = 0; j < n; ++ j) { // for every column ...
						for(int i = j + 1; i < n; ++ i) { // do not overwrite the diagonal in L
							_ASSERTE(b_IsLowerElem(j, i));
							r_L[j][i] = r_A[j][i]; // row > column: lower
						}
						// copy the column to L, U is inplace
					}
				}
				// two matrices, one can still be inplace

				for(int j = 0; j < n; ++ j) {
					for(int i = 0; i < j; ++ i) {
						r_U[i][j] = 0; // clear the part of U below the diagonal to zeros
						r_L[j][i] = 0; // clear the part of L above the diagonal to zeros
					}
					r_L[j][j] = 1; // lower part has unit diagonal, which is not stored in packed form
				}
				// in case we will not be storing the matrices as packed, we want
				// to clear them first (the algorithm will not write the zero entries)
				// do this *after* the copying above to avoid overwriting if working partially inplace
			} else {
				MatrixMN &r_LU = r_L;
				for(int j = 0; j < n; ++ j) {
					for(int i = 0; i < n; ++ i)
						r_LU[j][i] = r_A[j][i];
				}
				// packed
			}
		}
		// copy contents of A to U, L, if not working inplace

		for(int i = 0; i < n; ++ i)
			p_row_perm[i] = i;
		// begin with identity perm

		if(&r_U != &r_L) {
			for(int k = 0; k < n; ++ k) {
				int n_pivot = k;
				_ASSERTE(b_IsUpperDiagElem(k, k)); // ...
				_TyScalar f_max = piv_weights[k] * fabs(r_U[k][k]);
				for(int i = k + 1; i < n; ++ i) {
					_ASSERTE(b_IsLowerElem(k, i));
					_TyScalar f_temp = piv_weights[i] * fabs(r_L[k][i]);
					if(f_temp > f_max) {
						f_max = f_temp;
						n_pivot = i;
					}
				}
				// partial pivoting (choose the row with the largest scaled magnitude in column k)

				if(k != n_pivot) {
					_ASSERTE(n_pivot > k); // must be one of rows below the diagonal element

					for(int i = 0; i < k; ++ i) {
						_ASSERTE(b_IsLowerElem(i, k)); // row > column: lower
						_ASSERTE(b_IsLowerElem(i, n_pivot)); // row > column: lower
						std::swap(r_L[i][k], r_L[i][n_pivot]);
					}
					for(int i = k; i < n_pivot; ++ i) {
						_ASSERTE(b_IsUpperDiagElem(i, k)); // row < column: upper
						_ASSERTE(b_IsLowerElem(i, n_pivot)); // row > column: lower
						std::swap(r_U[i][k], r_L[i][n_pivot]);
					}
					for(int i = n_pivot; i < n; ++ i) {
						_ASSERTE(b_IsUpperDiagElem(i, k)); // row < column: upper
						_ASSERTE(b_IsUpperDiagElem(i, n_pivot)); // row < column: upper
						std::swap(r_U[i][k], r_U[i][n_pivot]);
					}
					// swap a row of LU (the two rows are split between L and U
					// differently, depending on the column)

					piv_weights[n_pivot] = piv_weights[k];
					// will not need piv_weights[k] anymore, no need to swap, just copy k below

					std::swap(p_row_perm[k], p_row_perm[n_pivot]);
					// modify the permutation, based on the pivoting
				}
				// swap rows n_pivot and k, if not the same row

#if 0
				_ASSERTE(b_IsUpperDiagElem(k, k));
				if(!r_U[k][k])
					r_U[k][k] = 1e-37f;
#endif // 0
				// can avoid INF and NaN in the solution by replacing zero pivot by a tiny nonzero number

				if(r_U[k][k] == 0) {
					fprintf(stderr, "error: singular\n");
					return false;
				}
				// the pivot is exactly zero: the matrix is singular and dividing by it below would
				// only fill the factors with NaN / INF (this also rejects a zero in the last pivot,
				// where no division takes place but U is singular all the same)

				for(int i = k + 1; i < n; ++ i) {
					_ASSERTE(b_IsUpperDiagElem(k, k));
					_ASSERTE(b_IsLowerElem(k, i));
					_TyScalar f_temp = (r_L[k][i] /= r_U[k][k]); // row > column: lower, row = column: upper
					// divide by pivot

					for(int j = k + 1; j < i; ++ j) {
						_ASSERTE(b_IsLowerElem(j, i));
						_ASSERTE(b_IsUpperDiagElem(j, k));
						r_L[j][i] -= f_temp * r_U[j][k]; // column > row: upper
					}
					for(int j = i; j < n; ++ j) {
						_ASSERTE(b_IsUpperDiagElem(j, i));
						_ASSERTE(b_IsUpperDiagElem(j, k));
						r_U[j][i] -= f_temp * r_U[j][k]; // column > row: upper
					}
					// reduce the rest of the matrix (the part of row i left of
					// the diagonal lives in L, the rest lives in U)
				}
			}
			// version for separate matrices L and U
		} else {
			_ASSERTE(&r_L == &r_U); // the same matrix
			MatrixMN &r_LU = r_L;
			// save some hassle by providing code for the &r_L == &r_U case

			for(int k = 0; k < n; ++ k) {
				int n_pivot = k;
				_TyScalar f_max = piv_weights[k] * fabs(r_LU[k][k]);
				for(int i = k + 1; i < n; ++ i) {
					_TyScalar f_temp = piv_weights[i] * fabs(r_LU[k][i]);
					if(f_temp > f_max) {
						f_max = f_temp;
						n_pivot = i;
					}
				}
				// partial pivoting (choose the row with the largest scaled magnitude in column k)

				if(k != n_pivot) {
					_ASSERTE(n_pivot > k); // must be one of rows below the diagonal element

					for(int i = 0; i < n; ++ i)
						std::swap(r_LU[i][k], r_LU[i][n_pivot]);
					// swap a row of LU

					piv_weights[n_pivot] = piv_weights[k];
					// will not need piv_weights[k] anymore, no need to swap, just copy k below

					std::swap(p_row_perm[k], p_row_perm[n_pivot]);
					// modify the permutation, based on the pivoting
				}
				// swap rows n_pivot and k, if not the same row

#if 0
				_ASSERTE(b_IsUpperDiagElem(k, k));
				if(!r_LU[k][k])
					r_LU[k][k] = 1e-37f;
#endif // 0
				// can avoid INF and NaN in the solution by replacing zero pivot by a tiny nonzero number

				if(r_LU[k][k] == 0) {
					fprintf(stderr, "error: singular\n");
					return false;
				}
				// the pivot is exactly zero: the matrix is singular and dividing by it below would
				// only fill the factors with NaN / INF (this also rejects a zero in the last pivot,
				// where no division takes place but U is singular all the same)

				for(int i = k + 1; i < n; ++ i) {
					_TyScalar f_temp = (r_LU[k][i] /= r_LU[k][k]); // solve using LU? it *is* on diagonal, maybe we can already regard it as a finished LU decomposition
					// divide by pivot

					for(int j = k + 1; j < n; ++ j)
						r_LU[j][i] -= f_temp * r_LU[j][k]; // causes scatter in sparse version
					// reduce the rest of the matrix
				}
			}
			// version for packed LU

			// note that these are essentially the same operations as for Cholesky, could be done by block,
			// only the inter-block pivoting could become a problem
		}

		return true;
	}

	/**
	 *	@brief calculates Cholesky decomposition U^TU (assumes square positive definite matrix)
	 *	@return Returns true on success, false on failure (not positive definite).
	 */
	bool Cholesky()
	{
		// factorizes this matrix inplace; on failure the matrix is left partially factorized

		_ASSERTE(m_n_column_num == m_n_row_num);

		MatrixMN &r_U = *this;
		const int n = m_n_column_num;
		for(int k = 0; k < n; ++ k) {
			if(!(r_U[k][k] > 0)) {
				fprintf(stderr, "error: not pos def\n");
				return false;
			}
			// the pivot must be strictly positive; a zero pivot would cause division by zero
			// below (the comparison is written so that a NaN pivot is rejected as well)

			_TyScalar l_kk = (_TyScalar)sqrt(r_U[k][k]); // accesses the diagonal
			r_U[k][k] = l_kk;
			for(int i = k + 1; i < n; ++ i)
				r_U[i][k] /= l_kk; // column i, row k: scales row k right of the diagonal, doesn't introduce any fill-in
			for(int j = k + 1; j < n; ++ j) {
				for(int i = j; i < n; ++ i)
					r_U[i][j] -= r_U[i][k] * r_U[j][k]; // column i >= row j: updates the remaining upper triangle, a lot of scatter :(
			}
		}
		for(int k = 0; k < n; ++ k) {
			for(int i = k + 1; i < n; ++ i)
				r_U[k][i] = 0; // column k, row i > k: have to manually clear the part below the diagonal
		}
		// the [c][r] operator is actually transposed so this calculates upper Cholesky

		return true;
	}

	/**
	 *	@brief forward-substitution, assumes square upper-triangular matrix
	 *
	 *	@param[in,out] p_rhs is right-hand-side vector, overwritten by the solution (allocated to n_rhs_size elements)
	 *	@param[in] n_rhs_size is size of the right-hand-side vector (must equal n_Column_Num())
	 */
	void UTSolve(_TyScalar *p_rhs, int UNUSED(n_rhs_size)) const
	{
		_ASSERTE(m_n_row_num == n_rhs_size && m_n_column_num == m_n_row_num);

		const int n_dim = m_n_column_num;
		for(int n_col = 0; n_col < n_dim; ++ n_col) {
			_TyScalar &r_x = p_rhs[n_col];
			// the unknown resolved in this step (the vector is solved inplace)

			for(int n_row = 0; n_row < n_col; ++ n_row)
				r_x -= (*this)[n_col][n_row] * p_rhs[n_row];
			// subtract the contributions of the already resolved unknowns; reads
			// the part of column n_col above the diagonal (that is a row of U^T)

			r_x /= (*this)[n_col][n_col];
			// divide by the diagonal entry
		}
	}

	/**
	 *	@brief back-substitution, assumes square upper-triangular matrix
	 *
	 *	@param[in,out] p_rhs is right-hand-side vector, overwritten by the solution (allocated to n_rhs_size elements)
	 *	@param[in] n_rhs_size is size of the right-hand-side vector (must equal n_Column_Num())
	 */
	void USolve(_TyScalar *p_rhs, int UNUSED(n_rhs_size)) const
	{
		_ASSERTE(m_n_row_num == n_rhs_size && m_n_column_num == m_n_row_num);

		for(int n_col = m_n_column_num - 1; n_col >= 0; -- n_col) {
			const _TyScalar f_x = (p_rhs[n_col] /= (*this)[n_col][n_col]);
			// resolve the last remaining unknown by dividing by the diagonal entry

			for(int n_row = 0; n_row < n_col; ++ n_row)
				p_rhs[n_row] -= (*this)[n_col][n_row] * f_x;
			// eliminate it from all the rows above, using the part
			// of column n_col above the diagonal
		}
		// the columns are processed from the last one to the first one
	}

	/**
	 *	@brief forward-substitution, assumes square lower-triangular matrix
	 *
	 *	@param[in,out] p_rhs is right-hand-side vector, overwritten by the solution (allocated to n_rhs_size elements)
	 *	@param[in] n_rhs_size is size of the right-hand-side vector (must equal n_Column_Num())
	 */
	void LSolve(_TyScalar *p_rhs, int UNUSED(n_rhs_size)) const
	{
		_ASSERTE(m_n_row_num == n_rhs_size && m_n_column_num == m_n_row_num);

		const int n_dim = m_n_column_num;
		for(int n_col = 0; n_col < n_dim; ++ n_col) {
			const _TyScalar f_x = (p_rhs[n_col] /= (*this)[n_col][n_col]);
			// resolve the first remaining unknown by dividing by the diagonal entry

			for(int n_row = n_col + 1; n_row < n_dim; ++ n_row)
				p_rhs[n_row] -= (*this)[n_col][n_row] * f_x;
			// eliminate it from all the rows below, using the part
			// of column n_col below the diagonal
		}
	}

	/**
	 *	@brief back-substitution, assumes square lower-triangular matrix
	 *
	 *	@param[in,out] p_rhs is right-hand-side vector, overwritten by the solution (allocated to n_rhs_size elements)
	 *	@param[in] n_rhs_size is size of the right-hand-side vector (must equal n_Column_Num())
	 *
	 *	@note In case the matrix has unit diagonal (such as L from LU decomposition),
	 *		LSolve_UnitDiag() is a slightly faster alternative to LSolve().
	 */
	void LTSolve(_TyScalar *p_rhs, int n_rhs_size) const
	{
		_ASSERTE(m_n_row_num == n_rhs_size && m_n_column_num == m_n_row_num);

		const int n_dim = m_n_column_num;
		for(int n_col = n_dim - 1; n_col >= 0; -- n_col) {
			_TyScalar &r_x = p_rhs[n_col];
			// the unknown resolved in this step (the vector is solved inplace)

			for(int n_row = n_col + 1; n_row < n_dim; ++ n_row)
				r_x -= (*this)[n_col][n_row] * p_rhs[n_row];
			// subtract the contributions of the already resolved unknowns; reads
			// the part of column n_col below the diagonal (that is a row of L^T)

			r_x /= (*this)[n_col][n_col];
			// divide by the diagonal entry
		}
		// the columns are processed from the last one to the first one
		// note this is untested
	}

	/**
	 *	@brief forward-substitution, assumes square lower-triangular matrix with unit diagonal
	 *
	 *	@param[in,out] p_rhs is right-hand-side vector, overwritten by the solution (allocated to n_rhs_size elements)
	 *	@param[in] n_rhs_size is size of the right-hand-side vector (must equal n_Column_Num())
	 *
	 *	@note This function does not access the diagonal, does not verify its unit-ness.
	 */
	void LSolve_UnitDiag(_TyScalar *p_rhs, int UNUSED(n_rhs_size)) const
	{
		_ASSERTE(m_n_row_num == n_rhs_size && m_n_column_num == m_n_row_num);

		const int n_dim = m_n_column_num;
		for(int n_col = 0; n_col < n_dim; ++ n_col) {
			const _TyScalar f_x = p_rhs[n_col];
			// the diagonal is assumed to be unit, there is no division and the stored diagonal
			// is never read (in packed LU, it holds the diagonal of U rather than ones)

			for(int n_row = n_col + 1; n_row < n_dim; ++ n_row) {
				_ASSERTE(b_IsLowerElem(n_col, n_row));
				p_rhs[n_row] -= (*this)[n_col][n_row] * f_x;
			}
			// eliminate the resolved unknown from all the rows below
		}
	}

	/**
	 *	@brief solves a system of linear equations U^TUx = b using Cholesky decomposition
	 *
	 *	@param[in] t_U is Cholesky decomposition (square, upper-triangular)
	 *	@param[in] t_rhs is right-hand-side vector (b; allocated to t_U.n_Column_Num() x 1)
	 *
	 *	@return Returns a column vector, containing the solution (x).
	 *
	 *	@note This function throws std::bad_alloc.
	 */
	static MatrixMN t_CholSolve(const MatrixMN &t_U, const MatrixMN &t_rhs) // throw(std::bad_alloc)
	{
		_ASSERTE(t_rhs.m_n_column_num == 1);
		// the right-hand side must be a column vector

		const int n_size = t_rhs.m_n_row_num;
		MatrixMN t_result(1, n_size);
		// allocate the result (one column); CholSolve() copies the right-hand side to it

		CholSolve(t_result.m_p_data, n_size, t_U, t_rhs.m_p_data, n_size);
		// CholSolve() takes the sizes of both the left-hand side and the right-hand
		// side; here the two are the same

		return t_result;
	}

	/**
	 *	@brief solves a system of linear equations U^TUx = b using Cholesky decomposition
	 *
	 *	@param[in] t_U is Cholesky decomposition (square, upper-triangular)
	 *	@param[in] p_rhs is right-hand-side vector (b; allocated to n_rhs_size elements)
	 *	@param[in] n_rhs_size is size of the right-hand-side vector (must equal t_U.n_Column_Num())
	 *
	 *	@return Returns a column vector, containing the solution (x).
	 *
	 *	@note This function throws std::bad_alloc.
	 */
	static MatrixMN t_CholSolve(const MatrixMN &t_U, const _TyScalar *p_rhs, int n_rhs_size) // throw(std::bad_alloc)
	{
		MatrixMN t_result(1, n_rhs_size);
		//memcpy(&t_result[0][0], p_rhs, n_rhs_size * sizeof(_TyScalar)); // CholSolve() does it
		_TyScalar *p_result = &t_result[0][0];
		// allocate the result (one column) and get pointer to its storage

		CholSolve(p_result, n_rhs_size, t_U, p_rhs, n_rhs_size);
		// CholSolve() takes the sizes of both the left-hand side and the right-hand
		// side; here the two are the same

		return t_result;
	}

	/**
	 *	@brief solves a system of linear equations U^TUx = b using Cholesky decomposition
	 *
	 *	@param[out] p_lhs is left-hand-side vector (x; allocated to n_lhs_size elements)
	 *	@param[in] n_lhs_size is size of the left-hand-side vector (must equal t_U.n_Column_Num())
	 *	@param[in] t_U is Cholesky decomposition (square, upper-triangular)
	 *	@param[in] p_rhs is right-hand-side vector (b; allocated to n_rhs_size elements)
	 *	@param[in] n_rhs_size is size of the right-hand-side vector (must equal t_U.n_Column_Num())
	 *
	 *	@note This function can work inplace, p_lhs and p_rhs can point to the same array.
	 */
	static void CholSolve(_TyScalar *p_lhs, int UNUSED(n_lhs_size),
		const MatrixMN &t_U, const _TyScalar *p_rhs, int n_rhs_size)
	{
		_ASSERTE(t_U.m_n_row_num == n_lhs_size &&
			t_U.m_n_row_num == n_rhs_size &&
			t_U.m_n_column_num == t_U.m_n_row_num);
		// the factor must be square and both vectors must match its size

		const bool b_inplace = (p_rhs == p_lhs);
		if(!b_inplace)
			memcpy(p_lhs, p_rhs, n_rhs_size * sizeof(_TyScalar));
		// both solves below work inplace on p_lhs, so it needs to start
		// as a copy of the right-hand side (unless it already is the same array)

		t_U.UTSolve(p_lhs, n_rhs_size);
		// first stage: UTSolve()

		t_U.USolve(p_lhs, n_rhs_size);
		// second stage: USolve(), p_lhs now holds the solution
	}

	/**
	 *	@brief prints the matrix to stdout
	 *	@param[in] p_s_label is name of the matrix (or null for no name)
	 */
	void Print(const char *p_s_label = 0) const
	{
		if(p_s_label)
			printf("%s =\n", p_s_label);
		const MatrixMN &r_A = *this;
		for(int j = 0; j < m_n_row_num; ++ j) {
			printf("    | ");
			for(int i = 0; i < m_n_column_num; ++ i)
				printf("\t%.2f" + !i, r_A[i][j]);
			printf(" |\n");
		}
	}
};

/**
 *	@brief instantiation of the matrix type for floats
 */
typedef MatrixMN<float> MatrixMNf;

/**
 *	@brief instantiation of the matrix type for doubles
 */
typedef MatrixMN<double> MatrixMNd;

/**
 *	@brief floating point traits
 *	@tparam _Ty is floating point type
 */
template <class _Ty>
class CFloatTraits {}; // the generic version is intentionally empty, only the specializations have members

/**
 *	@brief floating point traits (specialization for single-precision float)
 */
template <>
class CFloatTraits<float> {
public:
	typedef float _TyScalar; /**< @brief the floating-point type described */

	/**
	 *	@brief gets machine epsilon of single-precision float
	 *	@return Returns smallest number, such that 1 + f_Epsilon() != 1.
	 */
	static inline _TyScalar f_Epsilon()
	{
		const _TyScalar f_epsilon = 1.192092896e-07f;
		// taken from MSVC's float.h (opengroup.org says 1e-5f)

		return f_epsilon;
	}

	/**
	 *	@brief gets the largest single-precision value
	 *	@return Returns maximal value that can be represented using the given data type.
	 */
	static inline _TyScalar f_MaxValue()
	{
		const _TyScalar f_max_value = 3.402823466e+38f;
		// opengroup.org says 1e+37

		return f_max_value;
	}

	/**
	 *	@brief gets the smallest positive single-precision value
	 *	@return Returns minimal positive value that can be represented using the given data type.
	 */
	static inline _TyScalar f_MinValue()
	{
		const _TyScalar f_min_value = 1.175494351e-38f;
		// opengroup.org says 1e-37

		return f_min_value;
	}
};

/**
 *	@brief floating point traits (specialization for double-precision float)
 */
template <>
class CFloatTraits<double> {
public:
	typedef double _TyScalar; /**< @brief the floating-point type described */

	/**
	 *	@brief gets machine epsilon of double-precision float
	 *	@return Returns smallest number, such that 1 + f_Epsilon() != 1.
	 */
	static inline _TyScalar f_Epsilon()
	{
		const _TyScalar f_epsilon = 2.2204460492503131e-016;
		// taken from MSVC's float.h (opengroup.org says 1e-9)

		return f_epsilon;
	}

	/**
	 *	@brief gets the largest double-precision value
	 *	@return Returns maximal value that can be represented using the given data type.
	 */
	static inline _TyScalar f_MaxValue()
	{
		const _TyScalar f_max_value = 1.7976931348623158e+308;
		// opengroup.org says 1e+37

		return f_max_value;
	}

	/**
	 *	@brief gets the smallest positive double-precision value
	 *	@return Returns minimal positive value that can be represented using the given data type.
	 */
	static inline _TyScalar f_MinValue()
	{
		const _TyScalar f_min_value = 2.2250738585072014e-308;
		// opengroup.org says 1e-37

		return f_min_value;
	}
};

/**
 *	@brief a simple SVD implementation, utilising Householder bidiagonalization and QR iteration
 */
template <class _Ty>
class CSVD {
public:
	typedef _Ty _TyScalar; /**< @brief scalar type */
	typedef MatrixMN<_TyScalar> _TyMatrix; /**< @brief dense matrix type, used for A, U and V */

protected:
	const int m_n_row_num; /**< @brief number of rows of the original A matrix */
	const int m_n_column_num; /**< @brief number of columns of the original A matrix */
	const _TyMatrix &m_r_a; /**< @brief reference to the original A matrix (may be out of scope) */
	_TyMatrix m_t_u; /**< @brief the U matrix of A = UWV^T decomposition */
	_TyMatrix m_t_v; /**< @brief the V matrix of A = UWV^T decomposition */
	std::vector<_TyScalar> m_w; /**< @brief the diagonal values of the W matrix of A = UWV^T decomposition */
	const _TyScalar m_f_epsilon; /**< @brief machine epsilon for a given floating-point type */
	_TyScalar m_f_thresh; /**< @brief relative singular value threshold, given value of w[0] and epsilon */

public:
	/**
	 *	@brief default constructor; calculates the SVD
	 *
	 *	@param[in] r_a is the matrix to be decomposed
	 *	@param[in] n_max_pass_num is maximal number of QR iteration passes
	 *
	 *	@note This function throws std::bad_alloc or std::runtime_error
	 *		in case QR iteration fails to converge.
	 */
	CSVD(const _TyMatrix &r_a, int n_max_pass_num = 30) // throw(std::bad_alloc, std::runtime_error)
		:m_n_row_num(r_a.n_Row_Num()), m_n_column_num(r_a.n_Column_Num()),
		m_r_a(r_a), m_t_u(r_a), m_t_v(m_n_column_num, m_n_column_num),
		m_w(m_n_column_num), m_f_epsilon(CFloatTraits<_TyScalar>::f_Epsilon())
	{
		Decompose(n_max_pass_num);
		Reorder();
		m_f_thresh = .5f * _TyScalar(sqrt(m_n_row_num +
			m_n_column_num + 1.)) * m_w.front() * m_f_epsilon;
	}

	/**
	 *	@brief gets number of rows of the original A matrix
	 *	@return Returns the number of rows of the original A matrix.
	 *	@note The U matrix is n_Row_Num() by n_Column_Num().
	 */
	inline const int n_Row_Num() const
	{
		return m_n_row_num;
	}

	/**
	 *	@brief gets number of columns of the original A matrix
	 *	@return Returns the number of columns of the original A matrix.
	 *	@note The W and V matrices are both n_Column_Num() by n_Column_Num().
	 */
	inline const int n_Column_Num() const
	{
		return m_n_column_num;
	}

	/**
	 *	@brief gets reference to the original A matrix
	 *	@return Returns const reference to the original A matrix.
	 *	@note Note that the A matrix is not stored, and may be out of scope.
	 */
	inline const _TyMatrix &r_A() const
	{
		return this->m_r_a; // only a reference is kept, not a copy of the matrix
	}

	/**
	 *	@brief gets reference to the U matrix
	 *	@return Returns const reference to the U matrix of the UWV^T decomposition.
	 */
	inline const _TyMatrix &r_U() const
	{
		return this->m_t_u; // owned by this object
	}

	/**
	 *	@brief gets a singular value
	 *	@param[in] n_index is zero-based singular value index (0 to n_Column_Num() - 1)
	 *	@return Returns the selected singular value.
	 */
	inline _TyScalar f_W(size_t n_index) const
	{
		_ASSERTE(n_index < size_t(m_n_column_num));
		// the index is only checked in builds where _ASSERTE is active

		const _TyScalar f_singular_value = m_w[n_index];
		return f_singular_value;
	}

	/**
	 *	@brief gets a pointer to the diagonal elements of the W matrix
	 *	@return Returns pointer to the diagonal elements
	 *		of the W matrix of the UWV^T decomposition.
	 *	@note The size of the W matrix is n_Column_Num() by n_Column_Num().
	 */
	inline const _TyScalar *p_W() const
	{
		return &m_w.front(); // address of the first element of the singular value storage
	}

	/**
	 *	@brief gets a copy of the diagonal W matrix
	 *	@return Returns a copy of the (diagonal) W matrix of the UWV^T decomposition.
	 *	@note This function throws std::bad_alloc.
	 */
	_TyMatrix t_W() const // throw(std::bad_alloc)
	{
		const int n_size = m_w.size();
		_TyMatrix t_diag(n_size, n_size);
		t_diag.SetZero();
		// start with all zeros, only the diagonal is written below

		for(int n_idx = n_size; n_idx > 0;) {
			-- n_idx;
			t_diag[n_idx][n_idx] = m_w[n_idx];
		}
		// put the singular values on the diagonal

		return t_diag;
	}

	/**
	 *	@brief gets reference to the V matrix
	 *	@return Returns const reference to the V matrix of the UWV^T decomposition.
	 */
	inline const _TyMatrix &r_V() const
	{
		return this->m_t_v; // owned by this object
	}

	/**
	 *	@brief gets rank of the original A matrix
	 *	@param[in] f_thresh is singular value threshold
	 *		(if negative, default relative thresh is used)
	 *	@return Returns rank of the original A matrix.
	 */
	int n_Rank(_TyScalar f_thresh = -1) const
	{
		if(!(f_thresh >= 0))
			f_thresh = m_f_thresh;
		// fall back to the default relative threshold

		int n_rank = 0;
		while(n_rank < m_n_column_num && m_w[n_rank] > f_thresh)
			++ n_rank;
		// count the leading singular values above the threshold; stops at the first
		// one that is not (the values are expected to be sorted)

		return n_rank;
	}

	/**
	 *	@brief gets nullity of the original A matrix
	 *	@param[in] f_thresh is singular value threshold
	 *		(if negative, default relative thresh is used)
	 *	@return Returns nullity of the original A matrix.
	 */
	inline int n_Nullity(_TyScalar f_thresh = -1) const
	{
		const int n_rank = n_Rank(f_thresh);
		// the threshold is passed on unchanged, n_Rank() handles the default

		return m_n_column_num - n_rank;
		// by the rank-nullity theorem, the rank and the nullity
		// of any A add up to the number of its columns
	}

	/**
	 *	@brief gets the range matrix
	 *	@param[in] f_thresh is singular value threshold
	 *		(if negative, default relative thresh is used)
	 *	@return Returns the orthonormal basis of range of A.
	 *	@note This function throws std::bad_alloc.
	 */
	_TyMatrix t_Range(_TyScalar f_thresh = -1) const // throw(std::bad_alloc)
	{
		if(!(f_thresh >= 0))
			f_thresh = m_f_thresh;
		// fall back to the default relative threshold

		const int n_rows = m_n_row_num, n_cols = m_n_column_num;
		_TyMatrix t_basis(n_Rank(f_thresh), n_rows);
		// one column per singular value above the threshold

		int n_next_column = 0;
		for(int n_col = 0; n_col < n_cols; ++ n_col) {
			if(!(m_w[n_col] > f_thresh))
				continue;
			// skip the columns of U with singular value at or below the threshold

			for(int n_row = 0; n_row < n_rows; ++ n_row)
				t_basis[n_next_column][n_row] = m_t_u[n_col][n_row];
			++ n_next_column;
			// copy the column of U to the next free column of the result
		}

		return t_basis;
	}

	/**
	 *	@brief gets the nullspace matrix
	 *	@param[in] f_thresh is singular value threshold
	 *		(if negative, default relative thresh is used)
	 *	@return Returns the orthonormal basis of nullspace of A.
	 *	@note This function throws std::bad_alloc.
	 */
	_TyMatrix t_Nullspace(_TyScalar f_thresh = -1) const // throw(std::bad_alloc)
	{
		if(!(f_thresh >= 0))
			f_thresh = m_f_thresh;
		// fall back to the default relative threshold

		const int n_cols = m_n_column_num;
		_TyMatrix t_basis(n_Nullity(f_thresh), n_cols);
		// one column per singular value at or below the threshold

		int n_next_column = 0;
		for(int n_col = 0; n_col < n_cols; ++ n_col) {
			if(!(m_w[n_col] <= f_thresh))
				continue;
			// skip the columns of V with singular value above the threshold

			for(int n_row = 0; n_row < n_cols; ++ n_row)
				t_basis[n_next_column][n_row] = m_t_v[n_col][n_row];
			++ n_next_column;
			// copy the column of V to the next free column of the result
		}

		return t_basis;
	}

	/**
	 *	@brief solve Ax = b for a vector x using the pseudoinverse of A as obtained by SVD
	 *
	 *	@param[in] p_b is pointer to right-hand-side vector values (b)
	 *	@param[in] n_b_size is number of elements in the right-hand-side vector
	 *		(must match number of rows of the original matrix)
	 *	@param[out] p_x is pointer to left-hand-side vector values (x, the solution)
	 *	@param[in] n_x_size is number of elements in the left-hand-side vector
	 *		(must match number of columns of the original matrix)
	 *	@param[in] f_thresh is singular value threshold
	 *		(if negative, default relative thresh is used)
	 *
	 *	@note This function throws std::bad_alloc.
	 */
	void Solve(const _TyScalar *p_b, int UNUSED(n_b_size),
		_TyScalar *p_x, int UNUSED(n_x_size), _TyScalar f_thresh = -1) const // throw(std::bad_alloc)
	{
		const int m = m_n_row_num, n = m_n_column_num; // antialiass
		_ASSERTE(n_b_size == m && n_x_size == n);

		std::vector<_TyScalar> tmp(n);
		f_thresh = (f_thresh >= 0)? f_thresh : m_f_thresh;
		for(int j = 0; j < n; ++ j) { // Calculate UT B.
			_TyScalar s = 0;
			if(m_w[j] > f_thresh) { // Nonzero result only if wj is nonzero.
				for(int i = 0; i < m; ++ i)
					s += m_t_u[j][i] * p_b[i];
				s /= m_w[j]; // This is the divide by wj .
			}
			tmp[j] = s;
		}
		for(int j = 0; j < n; ++ j) { // Matrix multiply by V to get answer.
			_TyScalar s = 0;
			for(int i = 0; i < n; ++ i)
				s += m_t_v[i][j] * tmp[i];
			p_x[j] = s;
		}
	}

	/**
	 *	@brief checks the SVD decomposition and displays results in stdout
	 *	@return Returns true if all errors are below (ad-hoc) threshold,
	 *		otherwise returns false.
	 */
	bool Check()
	{
		_TyScalar f_max_col_err_u = 0;
		for(int i = 0; i < m_t_u.n_Column_Num(); ++ i) {
			_TyScalar f_accum = 0;
			for(int j = 0; j < m_t_u.n_Row_Num(); ++ j)
				f_accum += m_t_u[i][j] * m_t_u[i][j];
			f_accum = sqrt(f_accum);
			_TyScalar f_error = fabs(1 - f_accum);
			if(f_max_col_err_u < f_error)
				f_max_col_err_u = f_error;
		}
		_TyScalar f_max_col_err_v = 0;
		for(int i = 0; i < m_t_v.n_Column_Num(); ++ i) {
			_TyScalar f_accum = 0;
			for(int j = 0; j < m_t_v.n_Row_Num(); ++ j)
				f_accum += m_t_v[i][j] * m_t_v[i][j];
			f_accum = sqrt(f_accum);
			_TyScalar f_error = fabs(1 - f_accum);
			if(f_max_col_err_v < f_error)
				f_max_col_err_v = f_error;
		}
		// make sure that every column of u and v is normalized

		_TyScalar f_diag_error_u;
		{
			_TyMatrix utu = m_t_u.t_Transpose() * m_t_u;
			_TyMatrix diag_utu(utu.n_Column_Num(), utu.n_Row_Num());
			diag_utu.Identity();
			f_diag_error_u = (utu - diag_utu).f_Norm();
		}
		_TyScalar f_diag_error_v;
		{
			_TyMatrix vtv = m_t_v.t_Transpose() * m_t_v;
			_TyMatrix diag_vtv(vtv.n_Column_Num(), vtv.n_Row_Num());
			diag_vtv.Identity();
			f_diag_error_v = (vtv - diag_vtv).f_Norm();
		}
		// make sure that both u and v are orthonormal bases

		_TyMatrix W = t_W();
		// make w a matrix

		_TyMatrix A = m_t_u * W * m_t_v.t_Transpose();
		_TyScalar f_norm = (A - m_r_a).f_Norm();

		bool b_sorted = true;
		for(int i = 1; i < m_n_column_num; ++ i) {
			if(m_w[i - 1] < m_w[i]) {
				b_sorted = false;
				break;
			}
		}
		// t_odo - make sure it is sorted

		printf("SVD-check(u-col-norm: %f, v-col-norm:"
			" %f, u-ortho: %f, v-ortho: %f, factorization: %f, sort: %s)\n",
			f_max_col_err_u, f_max_col_err_v, f_diag_error_u, f_diag_error_v,
			f_norm, (b_sorted)? "ok" : "fail");

		return f_max_col_err_u < 1e-5f && f_max_col_err_v < 1e-5f &&
			f_diag_error_u < 2e-5f && f_diag_error_v < 2e-5f && f_norm < 8e-5f && b_sorted;
		// not too much precision with floats

		// t_odo - check SVD properties (make sure that the factorization holds, and that u and v are orthonormal)
	}

protected:
	/**
	 *	@brief calculates singular value decomposition of a given A (stored in m_u)
	 *
	 *	given the matrix A stored in u[0..m-1][0..n-1], this routine computes its singular value
	 *	decomposition, A = UWV^T and stores the results in the matrices u and v, and the vector w
	 *
	 *	@param[in] n_max_pass_num is maximal number of QR iteration passes
	 *	@note This function throws std::bad_alloc or std::runtime_error
	 *		in case QR iteration fails to converge.
	 */
	void Decompose(int n_max_pass_num = 30) // throw(std::bad_alloc, std::runtime_error)
	{
		const int m = m_n_row_num, n = m_n_column_num; // antialiass
		std::vector<_TyScalar> rv1(n);

		_TyScalar anorm = 0;
		{
			_TyScalar g = 0, scale = 0;
			// values of g and scale are reused from one iteration to the next

			for(int i = 0; i < n; ++ i) {
				int l = i + 2;
				rv1[i] = scale * g;
				scale = 0;
				if(i < m) {
					for(int k = i; k < m; ++ k)
						scale += fabs(m_t_u[i][k]);
					// accumulate scale of the rest of the column below the diagonal (for numerical stability?)

					if(scale != 0) {
						_TyScalar s = 0;
						for(int k = i; k < m; ++ k) {
							m_t_u[i][k] /= scale;
							s += m_t_u[i][k] * m_t_u[i][k];
						}
						// calculate dot of the reduced part of the column with itself

						_TyScalar f = m_t_u[i][i];
						// the current diagonal element

						g = (f >= 0)? -sqrt(s) : sqrt(s); // g = -SIGN(sqrt(s), f);
						// choose reflection distance so that it results in nonzero diagonal

						m_t_u[i][i] = f - g;
						// update the diagonal

						_TyScalar h = f * g - s;
						for(int j = l - 1; j < n; ++ j) {
							_TyScalar s = 0;
							for(int k = i; k < m; ++ k)
								s += m_t_u[i][k] * m_t_u[j][k];
							f = s / h;
							for(int k = i; k < m; ++ k)
								m_t_u[j][k] += f * m_t_u[i][k];
						}
						// update the rest of the submatrix

						for(int k = i; k < m; ++ k)
							m_t_u[i][k] *= scale;
						// scale the column back
					}
					// in case there are nonzeros to be reduced
				}
				m_w[i] = scale * g;
				// elimination of columns below diagonal

				scale = 0;
				if(i + 1 <= m && i + 1 != n) {
					for(int k = l - 1; k < n; ++ k)
						scale += fabs(m_t_u[k][i]);
					// accumulate scale of the rest of the row right from the diagonal (for numerical stability?)

					if(scale != 0) {
						_TyScalar s = 0;
						for(int k = l - 1; k < n; ++ k) {
							m_t_u[k][i] /= scale;
							s += m_t_u[k][i] * m_t_u[k][i];
						}
						// calculate dot of the reduced part of the row with itself

						_TyScalar f = m_t_u[l - 1][i];
						// the current pivot element

						g = (f >= 0)? -sqrt(s) : sqrt(s); // g = -SIGN(sqrt(s), f);
						// choose reflection distance so that it results in nonzero pivot

						m_t_u[l - 1][i] = f - g;
						// update the pivot

						_TyScalar h = f * g - s;
						for(int k = l - 1; k < n; ++ k)
							rv1[k] = m_t_u[k][i] / h; // the rest of rv1 reused as temp storage
						for(int j = l - 1; j < m; ++ j) {
							_TyScalar s = 0;
							for(int k = l - 1; k < n; ++ k)
								s += m_t_u[k][j] * m_t_u[k][i];
							for(int k = l - 1; k < n; ++ k)
								m_t_u[k][j] += s * rv1[k];
						}
						// update the rest of the submatrix

						for(int k = l - 1; k < n; ++ k)
							m_t_u[k][i] *= scale;
						// scale the row back
					}
					// in case there are nonzeros to be reduced
				}
				// elimination of rows to the right from the diagonal

				anorm = std::max(anorm, (fabs(m_w[i]) + fabs(rv1[i])));
			}
		}
		// householder reduction to bidiagonal form
		// note that the columns and rows reduced are kept intact in u
		// the reflection distances stored in w and rv1, respectively

		m_t_v[n - 1][n - 1] = 1;
		for(int i = n - 1; i > 0;) {
			int l = i;
			-- i; // here
			_TyScalar g = rv1[l];
			if(g != 0) {
				for(int j = l; j < n; ++ j)
					m_t_v[i][j] = (m_t_u[j][i] / m_t_u[l][i]) / g; // double division to avoid possible underflow
				for(int j = l; j < n; ++ j) {
					_TyScalar s = 0;
					for(int k = l; k < n; ++ k)
						s += m_t_u[k][i] * m_t_v[j][k];
					for(int k = l; k < n; ++ k)
						m_t_v[j][k] += s * m_t_v[i][k];
				}
			}
			for(int j = l; j < n; ++ j) {
				m_t_v[j][i] = 0;
				m_t_v[i][j] = 0;
			}
			m_t_v[i][i] = 1;
		}
		// accumulation of right-hand transformations

		for(int i = std::min(m, n); i > 0;) {
			int l = i;
			-- i; // here

			for(int j = l; j < n; ++ j)
				m_t_u[j][i] = 0;
			// clear a row of u

			_TyScalar g = m_w[i];
			if(g != 0) {
				g = 1 / g;
				for(int j = l; j < n; ++ j) {
					_TyScalar s = 0;
					for(int k = l; k < m; ++ k)
						s += m_t_u[i][k] * m_t_u[j][k];
					s = (s / m_t_u[i][i]) * g;
					for(int k = i; k < m; ++ k)
						m_t_u[j][k] += s * m_t_u[i][k];
				}
				for(int j = i; j < m; ++ j)
					m_t_u[i][j] *= g;
				// scale a column of u
			} else {
				for(int j = i; j < m; ++ j)
					m_t_u[i][j] = 0;
				// clear a column of u
			}
			m_t_u[i][i] += 1;
			// deposit one at the diagonal
		}
		// accumulation of left-hand transformations

		/*m_t_u.Print("u");
		m_t_v.Print("v");*/

		for(int k = n; k > 0;) {
			-- k;
			// here

			for(int n_pass = 0; n_pass < n_max_pass_num; ++ n_pass) {
				bool flag = true;
				int nm, l;
				for(l = k + 1; l > 0;) { // test for splitting
					-- l;
					// here

					nm = l - 1;
					if(l == 0 || fabs(rv1[l]) <= m_f_epsilon * anorm) {
						flag = false;
						break;
					}
					if(fabs(m_w[nm]) <= m_f_epsilon * anorm)
						break;
				}
				if(flag) {
					_TyScalar c = 0; // cancellation of rv1[l], if l > 0
					_TyScalar s = 1;
					for(int i = l; i < k + 1; ++ i) {
						_TyScalar f = s * rv1[i];
						rv1[i] = c * rv1[i];
						if(fabs(f) <= m_f_epsilon * anorm)
							break;
						_TyScalar g = m_w[i];
						_TyScalar h = f_Hypotenuse(f, g);
						m_w[i] = h;
						h = 1 / h;
						c = g * h;
						s = -f * h;
						for(int j = 0; j < m; ++ j) {
							_TyScalar y = m_t_u[nm][j];
							_TyScalar z = m_t_u[i][j];
							m_t_u[nm][j] = y * c + z * s;
							m_t_u[i][j] = z * c - y * s;
						}
					}
				}
				_TyScalar z = m_w[k];
				if(l == k) { // convergence
					if(z < 0) {
						m_w[k] = -z;
						for(int j = 0; j < n; ++ j)
							m_t_v[k][j] = -m_t_v[k][j];
						// singular value is made nonnegative
					}
					break;
				}
				if(n_pass == n_max_pass_num - 1)
					throw std::runtime_error("SVD does not converge");
				_TyScalar x = m_w[l]; // shift from bottom 2-by-2 minor
				nm = k - 1;
				_TyScalar y = m_w[nm];
				_TyScalar g = rv1[nm];
				_TyScalar h = rv1[k];
				_TyScalar f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2 * h * y);
				g = f_Hypotenuse(f, 1);
				_ASSERTE(g >= 0);
				_TyScalar f_tmp = (f >= 0)? g : -g; // SIGN(g, f)
				f = ((x - z) * (x + z) + h * ((y / (f + f_tmp)) - h)) / x;
				_TyScalar c = 1, s = 1;
				for(int j = l; j <= nm; ++ j) { // next QR transformation
					int i = j + 1;
					g = rv1[i];
					y = m_w[i];
					h = s * g;
					g = c * g;
					z = f_Hypotenuse(f, h);
					rv1[j] = z;
					c = f / z;
					s = h / z;
					f = x * c + g * s;
					g = g * c - x * s;
					h = y * s;
					y *= c;
					for(int jj = 0; jj < n; ++ jj) {
						_TyScalar x = m_t_v[j][jj];
						_TyScalar z = m_t_v[i][jj];
						m_t_v[j][jj] = x * c + z * s;
						m_t_v[i][jj] = z * c - x * s;
					}
					z = f_Hypotenuse(f, h);
					m_w[j] = z; // rotation can be arbitrary if z = 0
					if(z) {
						z = 1 / z;
						c = f * z;
						s = h * z;
					}
					f = c * g + s * y;
					x = c * y - s * g;
					for(int jj = 0; jj < m; ++ jj) {
						_TyScalar y = m_t_u[j][jj];
						_TyScalar z = m_t_u[i][jj];
						m_t_u[j][jj] = y * c + z * s;
						m_t_u[i][jj] = z * c - y * s;
					}
				}
				rv1[l] = 0;
				rv1[k] = f;
				m_w[k] = x;
				// hell ... this is a simple implementation of QR iteration
			}
		}
		// diagonalization of the bidiagonal form: loop over singular values, and over allowed iterations
	}

	/**
	 *	@brief reorders the decomposition and flips signs
	 *	
	 *	reorders the decomposition so that w[0] >= w[1] >= ... >= w[n - 1] >= 0 and
	 *	flips signs to make most of the numbers positive
	 */
	void Reorder() // throw(std::bad_alloc)
	{
		const int m = m_n_row_num, n = m_n_column_num; // antialiass
		std::vector<_TyScalar> su(m), sv(n);
		// temporary storage for shellsort

		int n_gap = 1;
		do {
			n_gap *= 3;
			++ n_gap;
		} while(n_gap <= n);
		// calculate big enough gap for shell-sort

		do {
			n_gap /= 3;
			for(int i = n_gap; i < n; ++ i) {
				_TyScalar sw = m_w[i];
				if(m_w[i - n_gap] >= sw)
					continue; // a sorted pair occured; avoid a lot of copying
				for(int k = 0; k < m; ++ k)
					su[k] = m_t_u[i][k];
				for(int k = 0; k < n; ++ k)
					sv[k] = m_t_v[i][k];
				int j = i;
				while(m_w[j - n_gap] < sw) {
					m_w[j] = m_w[j - n_gap];
					for(int k = 0; k < m; ++ k)
						m_t_u[j][k] = m_t_u[j - n_gap][k];
					for(int k = 0; k < n; ++ k)
						m_t_v[j][k] = m_t_v[j - n_gap][k];
					j -= n_gap;
					if(j < n_gap)
						break;
				}
				m_w[j] = sw;
				for(int k = 0; k < m; ++ k)
					m_t_u[j][k] = su[k];
				for(int k = 0; k < n; ++ k)
					m_t_v[j][k] = sv[k];
			}
		} while(n_gap > 1);
		// sort the values

		// note that less storage would be needed if only sorting w first,
		// and generating a minimal swap sequence to sort u and v later

		// also note that the shellsort is obscure and slow, but compared
		// to the rest of the svd, the complexity is negligible

		for(int k = 0; k < n; ++ k) {
			int n_negative_num = 0;
			for(int i = 0; i < m; ++ i) {
				if(m_t_u[k][i] < 0)
					++ n_negative_num;
			}
			for(int j = 0; j < n; ++ j) {
				if(m_t_v[k][j] < 0)
					++ n_negative_num;
			}
			// count negative values

			if(n_negative_num > (m + n) / 2) {
				for(int i = 0; i < m; ++ i)
					m_t_u[k][i] = -m_t_u[k][i];
				for(int j = 0; j < n; ++ j)
					m_t_v[k][j] = -m_t_v[k][j];
			}
			// too many negatives? flip
		}
		// flip signs to have as much positive values as possible
		// (to guarantee unique and repeatable solution)
	}

	/**
	 *	@brief calculates the length of the hypotenuse of a right-angle triangle
	 *
	 *	@param[in] a is length of the first side of the triangle
	 *	@param[in] b is length of the second side of the triangle
	 *
	 *	@return Returns the length of the hypotenuse of a right-angle triangle.
	 */
	static _TyScalar f_Hypotenuse(_TyScalar a, _TyScalar b)
	{
		const _TyScalar f_mag_a = fabs(a);
		const _TyScalar f_mag_b = fabs(b);
		// only the magnitudes of the sides matter

		if(f_mag_a > f_mag_b)
			return f_mag_a * sqrt(1 + f_Sqr(f_mag_b / f_mag_a));
		// a is the longer side: factor it out so that only a ratio gets squared

		if(f_mag_b == 0)
			return 0;
		// b is zero and a is not greater than it; avoid dividing by zero

		return f_mag_b * sqrt(1 + f_Sqr(f_mag_a / f_mag_b));
		// b is the longer (or equal) side: factor it out instead
	}

	/**
	 *	@brief calculates square of a value
	 *	@param[in] x is input value
	 *	@return Returns square of the given value.
	 */
	static inline _TyScalar f_Sqr(_TyScalar x)
	{
		const _TyScalar f_squared = x * x;
		// the value multiplied by itself
		return f_squared;
	}
};

/**
 *	@brief CSVD instantiated with float as the scalar type
 */
typedef CSVD<float> CSVDf;

/**
 *	@brief CSVD instantiated with double as the scalar type
 */
typedef CSVD<double> CSVDd;

#endif // !__MATRIX_MATH_INCLUDED
