// Copyright (C) 2015-2015 ChaosForge Ltd
// http://chaosforge.org/
//
// This file is part of Nova libraries. 
// For conditions of distribution and use, see copying.txt file in root folder.

#include "nv/image/png_loader.hh"

#include "nv/image/miniz.hh"

using namespace nv;

// Component-count constants carried over from stb_image. Each enumerator's
// value is the number of 8-bit channels per pixel that its name describes.
// NOTE(review): none of these names are referenced in the visible part of this
// file; the code below passes plain integers (0..4) for req_comp instead.
enum
{
	STBI_default = 0, // only used for req_comp

	STBI_grey = 1,
	STBI_grey_alpha = 2,
	STBI_rgb = 3,
	STBI_rgb_alpha = 4
};

// Table of I/O callbacks through which stbi__context pulls data from an
// arbitrary source. 'user' is an opaque pointer that is passed back unchanged
// to every callback.
typedef struct
{
	int( *read )  ( void *user, char *data, int size );   // fill 'data' with 'size' bytes.  return number of bytes actually read
	void( *skip )  ( void *user, int n );                 // skip the next 'n' bytes, or 'unget' the last -n bytes if negative
	int( *eof )   ( void *user );                       // returns nonzero if we are at end of file/data
} stbi_io_callbacks;

// Route all decoder allocations through the nv allocation functions, so that
// every buffer obtained with STBI_MALLOC/STBI_REALLOC is released with STBI_FREE.
#define STBI_MALLOC(sz)    nvmalloc(sz)
#define STBI_REALLOC(p,sz) nvrealloc(p,sz)
#define STBI_FREE(p)       nvfree(p)

// Function form of STBI_MALLOC; returns whatever the underlying allocator
// returns for 'size' bytes. Callers in this file check the result against NULL.
static void *stbi__malloc( size_t size )
{
	return STBI_MALLOC( size );
}


// Keeps only the low 8 bits of 'x' and returns them as a uchar8. Used by the
// filter code, where byte sums are computed in int and must wrap modulo 256.
template < typename T >
inline uchar8 byte_cast( T x )
{
	const auto low_bits = x & 255;
	return uchar8( low_bits );
}

// Error reporting is compiled out: stbi__err discards both of its arguments
// and always evaluates to 0, so "return stbi__err( ... )" simply returns 0.
#define stbi__err(x,y)  0
// Pointer-returning variant of stbi__err; always evaluates to a null
// unsigned char pointer.
#define stbi__errpuc(x,y)  ((unsigned char *)(size_t) (stbi__err(x,y)?NULL:NULL))


// Converts an RGB triple to a single grey value using the fixed-point weights
// 77/256, 150/256 and 29/256 for red, green and blue respectively.
static uchar8 stbi__compute_y( int r, int g, int b )
{
	const int weighted_sum = r * 77 + g * 150 + b * 29;
	return (uchar8)( weighted_sum >> 8 );
}

// Converts an x*y image with img_n interleaved 8-bit channels into one with
// req_comp channels.
//
// data     - source pixels; ownership passes to this function.
// img_n    - channel count of 'data'.
// req_comp - requested channel count (asserted to be 1..4).
// x, y     - image dimensions in pixels.
//
// Returns 'data' itself when no conversion is needed. Otherwise returns a
// newly allocated buffer and frees 'data'; if that allocation fails, 'data'
// is freed and a null pointer is returned.
static unsigned char *stbi__convert_format( unsigned char *data, int img_n, int req_comp, unsigned int x, unsigned int y )
{
	int i, j;
	unsigned char *good;

	if ( req_comp == img_n ) return data;
	NV_ASSERT( req_comp >= 1 && req_comp <= 4, "!" );

	good = (unsigned char *)stbi__malloc( req_comp * x * y );
	if ( good == NULL )
	{
		STBI_FREE( data );
		return stbi__errpuc( "outofmem", "Out of memory" );
	}

	for ( j = 0; j < (int)y; ++j )
	{
		// start of scanline j in the source and destination buffers
		unsigned char *src = data + j * x * img_n;
		unsigned char *dest = good + j * x * req_comp;

// COMBO packs the (source, destination) channel counts into one switch value.
// CASE expands to a case label plus a loop over the x pixels of the scanline;
// the statement following CASE(...) is the loop body.
// NOTE(review): only CASE is #undef'd below, COMBO stays defined for the rest of the file.
#define COMBO(a,b)  ((a)*8+(b))
#define CASE(a,b)   case COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
		// convert source image with img_n components to one with req_comp components;
		// avoid switch per pixel, so use switch per scanline and massive macros
		switch ( COMBO( img_n, req_comp ) )
		{
			CASE( 1, 2 ) dest[0] = src[0], dest[1] = 255; break;
			CASE( 1, 3 ) dest[0] = dest[1] = dest[2] = src[0]; break;
			CASE( 1, 4 ) dest[0] = dest[1] = dest[2] = src[0], dest[3] = 255; break;
			CASE( 2, 1 ) dest[0] = src[0]; break;
			CASE( 2, 3 ) dest[0] = dest[1] = dest[2] = src[0]; break;
			CASE( 2, 4 ) dest[0] = dest[1] = dest[2] = src[0], dest[3] = src[1]; break;
			CASE( 3, 4 ) dest[0] = src[0], dest[1] = src[1], dest[2] = src[2], dest[3] = 255; break;
			CASE( 3, 1 ) dest[0] = stbi__compute_y( src[0], src[1], src[2] ); break;
			CASE( 3, 2 ) dest[0] = stbi__compute_y( src[0], src[1], src[2] ), dest[1] = 255; break;
			CASE( 4, 1 ) dest[0] = stbi__compute_y( src[0], src[1], src[2] ); break;
			CASE( 4, 2 ) dest[0] = stbi__compute_y( src[0], src[1], src[2] ), dest[1] = src[3]; break;
			CASE( 4, 3 ) dest[0] = src[0], dest[1] = src[1], dest[2] = src[2]; break;
		default: NV_ASSERT( 0, "!" );
		}
#undef CASE
	}

	// the source buffer is no longer needed once every scanline is converted
	STBI_FREE( data );
	return good;
}


struct stbi__context
{
	uint32 img_x, img_y;
	int img_n, img_out_n;

	void rewind()
	{
		// conceptually rewind SHOULD rewind to the beginning of the stream,
		// but we just rewind to the beginning of the initial buffer, because
		// we only use it after doing 'test', which only ever looks at at most 92 bytes
		m_img_buffer     = m_img_buffer_original;
		m_img_buffer_end = m_img_buffer_original_end;
	}

	// initialize a memory-decode context
	stbi__context( const uchar8* buffer, int len )
	{
		m_io.read = NULL;
		m_read_from_callbacks = 0;
		m_img_buffer          = m_img_buffer_original     = (uchar8 *)buffer;
		m_img_buffer_end      = m_img_buffer_original_end = (uchar8 *)buffer + len;
	}

	// initialize a callback-based context
	stbi__context( stbi_io_callbacks *c, void *user )
	{
		m_io = *c;
		m_io_user_data = user;
		m_buflen = sizeof( m_buffer_start );
		m_read_from_callbacks = 1;
		m_img_buffer_original = m_buffer_start;
		refill_buffer();
		m_img_buffer_original_end = m_img_buffer_end;
	}

	void refill_buffer()
	{
		int n = ( m_io.read )( m_io_user_data, (char*)m_buffer_start, m_buflen );
		if ( n == 0 )
		{
			// at end of file, treat same as if from memory, but need to handle case
			// where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file
			m_read_from_callbacks = 0;
			m_img_buffer = m_buffer_start;
			m_img_buffer_end = m_buffer_start + 1;
			*m_img_buffer = 0;
		}
		else
		{
			m_img_buffer = m_buffer_start;
			m_img_buffer_end = m_buffer_start + n;
		}
	}

	uchar8 get8()
	{
		if ( m_img_buffer < m_img_buffer_end )
			return *m_img_buffer++;
		if ( m_read_from_callbacks )
		{
			refill_buffer();
			return *m_img_buffer++;
		}
		return 0;
	}

	inline int at_eof()
	{
		if ( m_io.read )
		{
			if ( !( m_io.eof )( m_io_user_data ) ) return 0;
			// if feof() is true, check if buffer = end
			// special case: we've only got the special 0 character at the end
			if ( m_read_from_callbacks == 0 ) return 1;
		}

		return m_img_buffer >= m_img_buffer_end;
	}

	void skip( int n )
	{
		if ( n < 0 )
		{
			m_img_buffer = m_img_buffer_end;
			return;
		}
		if ( m_io.read )
		{
			int blen = (int)( m_img_buffer_end - m_img_buffer );
			if ( blen < n )
			{
				m_img_buffer = m_img_buffer_end;
				( m_io.skip )( m_io_user_data, n - blen );
				return;
			}
		}
		m_img_buffer += n;
	}

	int getn( uchar8 *buffer, int n )
	{
		if ( m_io.read )
		{
			int blen = (int)( m_img_buffer_end - m_img_buffer );
			if ( blen < n )
			{
				int res, count;

				nvmemcpy( buffer, m_img_buffer, blen );

				count = ( m_io.read )( m_io_user_data, (char*)buffer + blen, n - blen );
				res = ( count == ( n - blen ) );
				m_img_buffer = m_img_buffer_end;
				return res;
			}
		}

		if ( m_img_buffer + n <= m_img_buffer_end )
		{
			nvmemcpy( buffer, m_img_buffer, n );
			m_img_buffer += n;
			return 1;
		}
		else
			return 0;
	}

	inline int get16be()
	{
		int z = get8();
		return ( z << 8 ) + get8();
	}

	inline uint32 get32be()
	{
		uint32 z = get16be();
		return ( z << 16 ) + get16be();
	}

	int remaining()
	{
		return m_img_buffer_end - m_img_buffer;
	}

private:
	stbi_io_callbacks m_io;
	void* m_io_user_data;

	int m_read_from_callbacks;
	int m_buflen;
	uchar8 m_buffer_start[128];

	uchar8 *m_img_buffer, *m_img_buffer_end;
	uchar8 *m_img_buffer_original, *m_img_buffer_original_end;


};


// How far stbi__parse_png_file should go:
//   STBI__SCAN_load   - decode the whole image,
//   STBI__SCAN_type   - only verify the PNG signature,
//   STBI__SCAN_header - parse far enough to know size and channel count.
enum
{
	STBI__SCAN_load = 0,
	STBI__SCAN_type,
	STBI__SCAN_header
};

// Forward declarations of the PNG entry points defined further down in this file.
static int      stbi__png_test( stbi__context *s );
static uchar8 *stbi__png_load( stbi__context *s, int *x, int *y, int *comp, int req_comp );
static int      stbi__png_info( stbi__context *s, int *x, int *y, int *comp );

// Header of one PNG chunk as read by stbi__get_chunk_header.
typedef struct
{
	uint32 length; // first big-endian 32-bit field of the chunk header
	uint32 type;   // second field; compared against STBI__PNG_TYPE(...) codes
} stbi__pngchunk;

// Reads the next chunk header from 's': the 32-bit big-endian length followed
// by the 32-bit big-endian type code.
static stbi__pngchunk stbi__get_chunk_header( stbi__context *s )
{
	const uint32 chunk_length = s->get32be();
	const uint32 chunk_type = s->get32be();

	stbi__pngchunk header;
	header.length = chunk_length;
	header.type = chunk_type;
	return header;
}

// Consumes bytes from 's' and compares them with the 8-byte PNG signature.
// Returns 1 when all eight bytes match, 0 at the first mismatch (reading stops there).
static int stbi__check_png_header( stbi__context *s )
{
	static uchar8 png_sig[8] = { 137,80,78,71,13,10,26,10 };
	for ( const uchar8 *expected = png_sig; expected != png_sig + 8; ++expected )
	{
		if ( s->get8() != *expected )
			return stbi__err( "bad png sig", "Not a PNG" );
	}
	return 1;
}

// Working state of one PNG decode.
typedef struct
{
	stbi__context *s;                 // input cursor and image dimensions
	// idata:    concatenated contents of the IDAT chunks
	// expanded: result of decompressing idata
	// out:      decoded pixel data
	uchar8 *idata, *expanded, *out;
} stbi__png;


// Scanline filter identifiers. Values 0..4 are the filter bytes that can appear
// in the file (anything above 4 is rejected by the scanline decoder); the last
// two are internal variants selected through first_row_filter.
enum
{
	STBI__F_none = 0,
	STBI__F_sub = 1,
	STBI__F_up = 2,
	STBI__F_avg = 3,
	STBI__F_paeth = 4,
	// synthetic filters used for first scanline to avoid needing a dummy row of 0s
	STBI__F_avg_first,
	STBI__F_paeth_first
};

// Maps a filter byte (0..4) to the filter actually applied on the first
// scanline, where no previous row exists: 'up' degenerates to 'none', while
// 'avg' and 'paeth' use variants that do not sample the previous row.
static uchar8 first_row_filter[5] =
{
	STBI__F_none,
	STBI__F_sub,
	STBI__F_none,
	STBI__F_avg_first,
	STBI__F_paeth_first
};

// Paeth predictor: returns whichever of a (left), b (above) and c (upper-left)
// is closest to the estimate a + b - c. Ties are resolved in the order a, b, c.
static int stbi__paeth( int a, int b, int c )
{
	const int estimate = a + b - c;
	const int dist_a = abs( estimate - a );
	const int dist_b = abs( estimate - b );
	const int dist_c = abs( estimate - c );

	// 'a' wins only if it is at least as close as both other candidates
	if ( dist_a > dist_b || dist_a > dist_c )
		return ( dist_b <= dist_c ) ? b : c;
	return a;
}

// Indexed by bit depth: the multiplier that stretches a sample of that depth
// to the 0..255 range (1-bit: 1*0xff, 2-bit: 3*0x55, 4-bit: 15*0x11, 8-bit: *1).
// Entries for depths that are not 1, 2, 4 or 8 are 0.
static uchar8 stbi__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 };

// Creates the png pixel data from post-deflated (decompressed) data: undoes the
// per-scanline filters and, for bit depths below 8, expands packed samples to
// one byte each.
//
// a       - decode state; a->out receives a newly allocated x*y*out_n buffer
//           (it stays assigned on failure; the caller releases it).
// raw     - decompressed scanlines, each prefixed with one filter byte.
// raw_len - number of bytes available at 'raw'.
// out_n   - channels per output pixel; either s->img_n or s->img_n + 1, in which
//           case an alpha channel of 255 is appended.
// x, y    - dimensions of this (sub)image in pixels.
// depth   - bits per sample (1, 2, 4 or 8).
// color   - PNG colour type; 0 (greyscale) enables scaling of low-depth samples.
//
// Returns 1 on success and 0 on allocation failure or corrupt input.
static int stbi__create_png_image_raw( stbi__png *a, uchar8 *raw, uint32 raw_len, int out_n, uint32 x, uint32 y, int depth, int color )
{
	stbi__context *s = a->s;
	uint32 i, j, stride = x*out_n;
	uint32 img_len, img_width_bytes;
	int k;
	int img_n = s->img_n; // copy it into a local for later

	NV_ASSERT( out_n == s->img_n || out_n == s->img_n + 1, "!" );
	a->out = (uchar8 *)stbi__malloc( x * y * out_n );
	if ( !a->out ) return stbi__err( "outofmem", "Out of memory" );

	// packed bytes per scanline (without the filter byte) and in total (with it)
	img_width_bytes = ( ( ( img_n * x * depth ) + 7 ) >> 3 );
	img_len = ( img_width_bytes + 1 ) * y;
	if ( s->img_x == x && s->img_y == y )
	{
		if ( raw_len != img_len ) return stbi__err( "not enough pixels", "Corrupt PNG" );
	}
	else
	{ // interlaced:
		if ( raw_len < img_len ) return stbi__err( "not enough pixels", "Corrupt PNG" );
	}

	for ( j = 0; j < y; ++j )
	{
		uchar8 *cur = a->out + stride*j;
		uchar8 *prior;
		int filter = *raw++;
		int filter_bytes = img_n;
		int width = x;
		if ( filter > 4 )
			return stbi__err( "invalid filter", "Corrupt PNG" );

		if ( depth < 8 )
		{
			NV_ASSERT( img_width_bytes <= x, "!" );
			cur += x*out_n - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place
			filter_bytes = 1;
			width = img_width_bytes;
		}
		// The previous row's bytes sit at the same offset one stride back, so
		// this has to be derived from 'cur' AFTER the depth < 8 adjustment above.
		prior = cur - stride;

		// if first row, use special filter that doesn't sample previous row
		if ( j == 0 ) filter = first_row_filter[filter];

		// handle first byte explicitly
		for ( k = 0; k < filter_bytes; ++k )
		{
			switch ( filter )
			{
			case STBI__F_none: cur[k] = raw[k]; break;
			case STBI__F_sub: cur[k] = raw[k]; break;
			case STBI__F_up: cur[k] = byte_cast( raw[k] + prior[k] ); break;
			case STBI__F_avg: cur[k] = byte_cast( raw[k] + ( prior[k] >> 1 ) ); break;
			case STBI__F_paeth: cur[k] = byte_cast( raw[k] + stbi__paeth( 0, prior[k], 0 ) ); break;
			case STBI__F_avg_first: cur[k] = raw[k]; break;
			case STBI__F_paeth_first: cur[k] = raw[k]; break;
			}
		}

		if ( depth == 8 )
		{
			if ( img_n != out_n )
				cur[img_n] = 255; // first pixel
			raw += img_n;
			cur += out_n;
			prior += out_n;
		}
		else
		{
			raw += 1;
			cur += 1;
			prior += 1;
		}

		// this is a little gross, so that we don't switch per-pixel or per-component
		if ( depth < 8 || img_n == out_n )
		{
			// Remaining bytes in this row. Counted in filter units so the loop
			// stays inside the row for depth < 8 (filter_bytes == 1); for
			// depth == 8 filter_bytes equals img_n, i.e. (x - 1) pixels.
			int nk = ( width - 1 )*filter_bytes;
#define CASE(f) \
             case f:     \
                for (k=0; k < nk; ++k)
			switch ( filter )
			{
				// "none" filter turns into a memcpy here; make that explicit.
			case STBI__F_none:         nvmemcpy( cur, raw, nk ); break;
				CASE( STBI__F_sub )          cur[k] = byte_cast( raw[k] + cur[k - filter_bytes] ); break;
				CASE( STBI__F_up )           cur[k] = byte_cast( raw[k] + prior[k] ); break;
				CASE( STBI__F_avg )          cur[k] = byte_cast( raw[k] + ( ( prior[k] + cur[k - filter_bytes] ) >> 1 ) ); break;
				CASE( STBI__F_paeth )        cur[k] = byte_cast( raw[k] + stbi__paeth( cur[k - filter_bytes], prior[k], prior[k - filter_bytes] ) ); break;
				CASE( STBI__F_avg_first )    cur[k] = byte_cast( raw[k] + ( cur[k - filter_bytes] >> 1 ) ); break;
				CASE( STBI__F_paeth_first )  cur[k] = byte_cast( raw[k] + stbi__paeth( cur[k - filter_bytes], 0, 0 ) ); break;
			}
#undef CASE
			raw += nk;
		}
		else
		{
			// 8-bit samples with an alpha channel to insert: walk pixel by pixel,
			// writing 255 into the extra output channel after each one
			NV_ASSERT( img_n + 1 == out_n, "!" );
#define CASE(f) \
             case f:     \
                for (i=x-1; i >= 1; --i, cur[img_n]=255,raw+=img_n,cur+=out_n,prior+=out_n) \
                   for (k=0; k < img_n; ++k)
			switch ( filter )
			{
				CASE( STBI__F_none )         cur[k] = raw[k]; break;
				CASE( STBI__F_sub )          cur[k] = byte_cast( raw[k] + cur[k - out_n] ); break;
				CASE( STBI__F_up )           cur[k] = byte_cast( raw[k] + prior[k] ); break;
				CASE( STBI__F_avg )          cur[k] = byte_cast( raw[k] + ( ( prior[k] + cur[k - out_n] ) >> 1 ) ); break;
				CASE( STBI__F_paeth )        cur[k] = byte_cast( raw[k] + stbi__paeth( cur[k - out_n], prior[k], prior[k - out_n] ) ); break;
				CASE( STBI__F_avg_first )    cur[k] = byte_cast( raw[k] + ( cur[k - out_n] >> 1 ) ); break;
				CASE( STBI__F_paeth_first )  cur[k] = byte_cast( raw[k] + stbi__paeth( cur[k - out_n], 0, 0 ) ); break;
			}
#undef CASE
		}
	}

	// we make a separate pass to expand bits to pixels; for performance,
	// this could run two scanlines behind the above code, so it won't
	// interfere with filtering but will still be in the cache.
	if ( depth < 8 )
	{
		for ( j = 0; j < y; ++j )
		{
			uchar8 *cur = a->out + stride*j;
			uchar8 *in = a->out + stride*j + x*out_n - img_width_bytes;
			// unpack 1/2/4-bit into a 8-bit buffer. allows us to keep the common 8-bit path optimal at minimal cost for 1/2/4-bit
			// png guarantees byte alignment, if width is not multiple of 8/4/2 we'll decode dummy trailing data that will be skipped in the later loop
			uchar8 scale = ( color == 0 ) ? stbi__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range

			// note that the final byte might overshoot and write more data than desired.
			// we can allocate enough data that this never writes out of memory, but it
			// could also overwrite the next scanline. can it overwrite non-empty data
			// on the next scanline? yes, consider 1-pixel-wide scanlines with 1-bit-per-pixel.
			// so we need to explicitly clamp the final ones

			if ( depth == 4 )
			{
				for ( k = x*img_n; k >= 2; k -= 2, ++in )
				{
					*cur++ = scale * ( ( *in >> 4 ) );
					*cur++ = scale * ( ( *in ) & 0x0f );
				}
				if ( k > 0 ) *cur++ = scale * ( ( *in >> 4 ) );
			}
			else if ( depth == 2 )
			{
				for ( k = x*img_n; k >= 4; k -= 4, ++in )
				{
					*cur++ = scale * ( ( *in >> 6 ) );
					*cur++ = scale * ( ( *in >> 4 ) & 0x03 );
					*cur++ = scale * ( ( *in >> 2 ) & 0x03 );
					*cur++ = scale * ( ( *in ) & 0x03 );
				}
				if ( k > 0 ) *cur++ = scale * ( ( *in >> 6 ) );
				if ( k > 1 ) *cur++ = scale * ( ( *in >> 4 ) & 0x03 );
				if ( k > 2 ) *cur++ = scale * ( ( *in >> 2 ) & 0x03 );
			}
			else if ( depth == 1 )
			{
				for ( k = x*img_n; k >= 8; k -= 8, ++in )
				{
					*cur++ = scale * ( ( *in >> 7 ) );
					*cur++ = scale * ( ( *in >> 6 ) & 0x01 );
					*cur++ = scale * ( ( *in >> 5 ) & 0x01 );
					*cur++ = scale * ( ( *in >> 4 ) & 0x01 );
					*cur++ = scale * ( ( *in >> 3 ) & 0x01 );
					*cur++ = scale * ( ( *in >> 2 ) & 0x01 );
					*cur++ = scale * ( ( *in >> 1 ) & 0x01 );
					*cur++ = scale * ( ( *in ) & 0x01 );
				}
				if ( k > 0 ) *cur++ = scale * ( ( *in >> 7 ) );
				if ( k > 1 ) *cur++ = scale * ( ( *in >> 6 ) & 0x01 );
				if ( k > 2 ) *cur++ = scale * ( ( *in >> 5 ) & 0x01 );
				if ( k > 3 ) *cur++ = scale * ( ( *in >> 4 ) & 0x01 );
				if ( k > 4 ) *cur++ = scale * ( ( *in >> 3 ) & 0x01 );
				if ( k > 5 ) *cur++ = scale * ( ( *in >> 2 ) & 0x01 );
				if ( k > 6 ) *cur++ = scale * ( ( *in >> 1 ) & 0x01 );
			}
			if ( img_n != out_n )
			{
				int q;
				// insert alpha = 255; walk backwards so that the in-place
				// widening never overwrites a pixel that is still to be read
				cur = a->out + stride*j;
				if ( img_n == 1 )
				{
					for ( q = x - 1; q >= 0; --q )
					{
						cur[q * 2 + 1] = 255;
						cur[q * 2 + 0] = cur[q];
					}
				}
				else
				{
					NV_ASSERT( img_n == 3, "!" );
					for ( q = x - 1; q >= 0; --q )
					{
						cur[q * 4 + 3] = 255;
						cur[q * 4 + 2] = cur[q * 3 + 2];
						cur[q * 4 + 1] = cur[q * 3 + 1];
						cur[q * 4 + 0] = cur[q * 3 + 0];
					}
				}
			}
		}
	}

	return 1;
}

// Builds the final pixel buffer in a->out from decompressed scanline data.
// Non-interlaced images are decoded directly. Interlaced images are decoded as
// seven sub-images (one per pass) whose pixels are scattered into a
// full-size buffer.
//
// image_data / image_data_len - decompressed data and its size in bytes.
// out_n, depth, color         - forwarded to stbi__create_png_image_raw.
// interlaced                  - nonzero when the image is stored in passes.
//
// Returns 1 on success, 0 on allocation failure or corrupt input.
static int stbi__create_png_image( stbi__png *a, uchar8 *image_data, uint32 image_data_len, int out_n, int depth, int color, int interlaced )
{
	uchar8 *final;
	int p;
	if ( !interlaced )
		return stbi__create_png_image_raw( a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color );

	// de-interlacing
	final = (uchar8 *)stbi__malloc( a->s->img_x * a->s->img_y * out_n );
	// without this check a failed allocation would be written to below
	if ( !final ) return 0;
	for ( p = 0; p < 7; ++p )
	{
		// origin and spacing of the pixels that belong to pass p
		int xorig[] = { 0,4,0,2,0,1,0 };
		int yorig[] = { 0,0,4,0,2,0,1 };
		int xspc[] = { 8,8,4,4,2,2,1 };
		int yspc[] = { 8,8,8,4,4,2,2 };
		int i, j, x, y;
		// pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1
		x = ( a->s->img_x - xorig[p] + xspc[p] - 1 ) / xspc[p];
		y = ( a->s->img_y - yorig[p] + yspc[p] - 1 ) / yspc[p];
		if ( x && y )
		{
			// bytes this pass occupies in image_data, including one filter byte per row
			uint32 img_len = ( ( ( ( a->s->img_n * x * depth ) + 7 ) >> 3 ) + 1 ) * y;
			if ( !stbi__create_png_image_raw( a, image_data, image_data_len, out_n, x, y, depth, color ) )
			{
				STBI_FREE( final );
				return 0;
			}
			// scatter the decoded sub-image into its positions in the full image
			for ( j = 0; j < y; ++j )
			{
				for ( i = 0; i < x; ++i )
				{
					int out_y = j*yspc[p] + yorig[p];
					int out_x = i*xspc[p] + xorig[p];
					nvmemcpy( final + out_y*a->s->img_x*out_n + out_x*out_n,
						a->out + ( j*x + i )*out_n, out_n );
				}
			}
			STBI_FREE( a->out );
			image_data += img_len;
			image_data_len -= img_len;
		}
	}
	a->out = final;

	return 1;
}

// Applies colour-key transparency to the decoded image in z->out, which must
// already carry an alpha channel (out_n is 2 or 4).
// With two channels every pixel's alpha becomes 0 when its grey value equals
// tc[0] and 255 otherwise; with four channels alpha is set to 0 for pixels whose
// RGB equals tc[0..2] and left untouched for all others. Always returns 1.
static int stbi__compute_transparency( stbi__png *z, uchar8 tc[3], int out_n )
{
	stbi__context *ctx = z->s;
	const uint32 total_pixels = ctx->img_x * ctx->img_y;
	uchar8 *px = z->out;

	// the output is expected to already hold 255 in its alpha channel
	NV_ASSERT( out_n == 2 || out_n == 4, "!" );

	if ( out_n != 2 )
	{
		for ( uint32 n = 0; n < total_pixels; ++n, px += 4 )
		{
			const bool matches_key = px[0] == tc[0] && px[1] == tc[1] && px[2] == tc[2];
			if ( matches_key )
				px[3] = 0;
		}
		return 1;
	}

	for ( uint32 n = 0; n < total_pixels; ++n, px += 2 )
		px[1] = ( px[0] == tc[0] ) ? 0 : 255;
	return 1;
}

// Replaces the one-byte-per-pixel palette indices in a->out with colour data.
// 'palette' holds four bytes per entry; pal_img_n == 3 writes the first three
// bytes of each entry per pixel, any other value writes all four. The old
// a->out buffer is freed and replaced by the expanded one.
// Returns 1 on success, 0 if the new buffer cannot be allocated. 'len' is unused.
static int stbi__expand_png_palette( stbi__png *a, uchar8 *palette, int len, int pal_img_n )
{
	const uint32 total_pixels = a->s->img_x * a->s->img_y;
	uchar8 *indices = a->out;

	uchar8 *expanded = (uchar8 *)stbi__malloc( total_pixels * pal_img_n );
	if ( expanded == NULL ) return stbi__err( "outofmem", "Out of memory" );

	// from here until the old buffer is freed below, an early exit would leak
	const int channels = ( pal_img_n == 3 ) ? 3 : 4;
	uchar8 *dst = expanded;
	for ( uint32 px = 0; px < total_pixels; ++px, dst += channels )
	{
		const uchar8 *entry = palette + indices[px] * 4;
		dst[0] = entry[0];
		dst[1] = entry[1];
		dst[2] = entry[2];
		if ( channels == 4 )
			dst[3] = entry[3];
	}

	STBI_FREE( a->out );
	a->out = expanded;

	NV_UNUSED( len );

	return 1;
}

// File-wide switches consulted when decoding PNGs that carry a CgBI chunk.
// Both default to off.
static int stbi__unpremultiply_on_load = 0; // divide colour by alpha during BGR->RGB conversion
static int stbi__de_iphone_flag = 0;        // perform the BGR->RGB conversion at all

// Stores the flag read by stbi__de_iphone to decide whether colour channels
// are divided by alpha.
static void stbi_set_unpremultiply_on_load( int flag_true_if_should_unpremultiply )
{
	stbi__unpremultiply_on_load = flag_true_if_should_unpremultiply;
}

// Stores the flag that enables the BGR->RGB conversion of CgBI images.
static void stbi_convert_iphone_png_to_rgb( int flag_true_if_should_convert )
{
	stbi__de_iphone_flag = flag_true_if_should_convert;
}

// Converts the decoded image in z->out from BGR(A) to RGB(A) channel order in
// place. For four-channel images, when stbi__unpremultiply_on_load is set,
// pixels with non-zero alpha additionally have each colour channel multiplied
// by 255 and divided by alpha; pixels with zero alpha are only swapped.
static void stbi__de_iphone( stbi__png *z )
{
	stbi__context *ctx = z->s;
	const uint32 total_pixels = ctx->img_x * ctx->img_y;
	uchar8 *px = z->out;

	if ( ctx->img_out_n == 3 )
	{
		// three channels: plain blue/red swap
		for ( uint32 n = 0; n < total_pixels; ++n, px += 3 )
		{
			const uchar8 first = px[0];
			px[0] = px[2];
			px[2] = first;
		}
		return;
	}

	NV_ASSERT( ctx->img_out_n == 4, "!" );
	const bool unpremultiply = stbi__unpremultiply_on_load != 0;
	for ( uint32 n = 0; n < total_pixels; ++n, px += 4 )
	{
		const uchar8 alpha = px[3];
		const uchar8 first = px[0];
		if ( unpremultiply && alpha )
		{
			// swap and undo the alpha premultiplication in one step
			px[0] = px[2] * 255 / alpha;
			px[1] = px[1] * 255 / alpha;
			px[2] = first * 255 / alpha;
		}
		else
		{
			px[0] = px[2];
			px[2] = first;
		}
	}
}

// Packs four characters into one 32-bit value, first character in the most
// significant byte, so that it can be compared with the chunk type returned
// by get32be().
#define STBI__PNG_TYPE(a,b,c,d)  (((a) << 24) + ((b) << 16) + ((c) << 8) + (d))

// Walks the chunks of a PNG stream and, depending on 'scan', stops after the
// signature, after the header information, or after decoding the whole image.
//
// z        - decode state; idata/expanded/out are reset to NULL on entry. On
//            failure they may still hold allocations that the caller must free.
// scan     - STBI__SCAN_type:   only check the signature.
//            STBI__SCAN_header: return once s->img_x, s->img_y and s->img_n are known.
//            STBI__SCAN_load:   decode the pixels into z->out.
// req_comp - requested channel count (0 = keep the file's); influences
//            whether an alpha channel is generated while decoding.
//
// Returns 1 on success and 0 on corrupt/unsupported input or allocation failure.
static int stbi__parse_png_file( stbi__png *z, int scan, int req_comp )
{
	uchar8 palette[1024], pal_img_n = 0;
	uchar8 has_trans = 0, tc[3];
	uint32 ioff = 0, idata_limit = 0, i, pal_len = 0;
	int first = 1, k, interlace = 0, color = 0, depth = 0, is_iphone = 0;
	stbi__context *s = z->s;

	z->expanded = NULL;
	z->idata = NULL;
	z->out = NULL;

	if ( !stbi__check_png_header( s ) ) return 0;

	if ( scan == STBI__SCAN_type ) return 1;

	for ( ;;)
	{
		stbi__pngchunk c = stbi__get_chunk_header( s );
		switch ( c.type )
		{
		case STBI__PNG_TYPE( 'C', 'g', 'B', 'I' ):
			// marks the file as the iPhone PNG variant; the chunk body is ignored
			is_iphone = 1;
			s->skip( c.length );
			break;
		case STBI__PNG_TYPE( 'I', 'H', 'D', 'R' ): {
			int comp, filter;
			if ( !first ) return stbi__err( "multiple IHDR", "Corrupt PNG" );
			first = 0;
			if ( c.length != 13 ) return stbi__err( "bad IHDR len", "Corrupt PNG" );
			s->img_x = s->get32be(); if ( s->img_x > ( 1 << 24 ) ) return stbi__err( "too large", "Very large image (corrupt?)" );
			s->img_y = s->get32be(); if ( s->img_y > ( 1 << 24 ) ) return stbi__err( "too large", "Very large image (corrupt?)" );
			depth = s->get8();  if ( depth != 1 && depth != 2 && depth != 4 && depth != 8 )  return stbi__err( "1/2/4/8-bit only", "PNG not supported: 1/2/4/8-bit only" );
			color = s->get8();  if ( color > 6 )         return stbi__err( "bad ctype", "Corrupt PNG" );
			if ( color == 3 ) pal_img_n = 3; else if ( color & 1 ) return stbi__err( "bad ctype", "Corrupt PNG" );
			// PNG only permits bit depths below 8 for greyscale (0) and paletted (3)
			// images; the scanline decoder relies on one channel per packed sample.
			if ( depth < 8 && color != 0 && color != 3 ) return stbi__err( "bad depth", "Corrupt PNG" );
			comp = s->get8();  if ( comp ) return stbi__err( "bad comp method", "Corrupt PNG" );
			filter = s->get8();  if ( filter ) return stbi__err( "bad filter method", "Corrupt PNG" );
			interlace = s->get8(); if ( interlace > 1 ) return stbi__err( "bad interlace method", "Corrupt PNG" );
			if ( !s->img_x || !s->img_y ) return stbi__err( "0-pixel image", "Corrupt PNG" );
			if ( !pal_img_n )
			{
				// colour bit 2 -> RGB instead of grey, bit 4 -> alpha channel present
				s->img_n = ( color & 2 ? 3 : 1 ) + ( color & 4 ? 1 : 0 );
				if ( ( 1 << 30 ) / s->img_x / s->img_n < s->img_y ) return stbi__err( "too large", "Image too large to decode" );
				if ( scan == STBI__SCAN_header ) return 1;
			}
			else
			{
				// if paletted, then pal_n is our final components, and
				// img_n is # components to decompress/filter.
				s->img_n = 1;
				if ( ( 1 << 30 ) / s->img_x / 4 < s->img_y ) return stbi__err( "too large", "Corrupt PNG" );
				// if SCAN_header, have to scan to see if we have a tRNS
			}
			break;
		}

		case STBI__PNG_TYPE( 'P', 'L', 'T', 'E' ): {
			if ( first ) return stbi__err( "first not IHDR", "Corrupt PNG" );
			if ( c.length > 256 * 3 ) return stbi__err( "invalid PLTE", "Corrupt PNG" );
			pal_len = c.length / 3;
			if ( pal_len * 3 != c.length ) return stbi__err( "invalid PLTE", "Corrupt PNG" );
			// stored as four bytes per entry; alpha defaults to opaque
			for ( i = 0; i < pal_len; ++i )
			{
				palette[i * 4 + 0] = s->get8();
				palette[i * 4 + 1] = s->get8();
				palette[i * 4 + 2] = s->get8();
				palette[i * 4 + 3] = 255;
			}
			break;
		}

		case STBI__PNG_TYPE( 't', 'R', 'N', 'S' ): {
			if ( first ) return stbi__err( "first not IHDR", "Corrupt PNG" );
			if ( z->idata ) return stbi__err( "tRNS after IDAT", "Corrupt PNG" );
			if ( pal_img_n )
			{
				// paletted: tRNS supplies per-entry alpha values
				if ( scan == STBI__SCAN_header ) { s->img_n = 4; return 1; }
				if ( pal_len == 0 ) return stbi__err( "tRNS before PLTE", "Corrupt PNG" );
				if ( c.length > pal_len ) return stbi__err( "bad tRNS len", "Corrupt PNG" );
				pal_img_n = 4;
				for ( i = 0; i < c.length; ++i )
					palette[i * 4 + 3] = s->get8();
			}
			else
			{
				// non-paletted: tRNS supplies a single colour key
				if ( !( s->img_n & 1 ) ) return stbi__err( "tRNS with alpha", "Corrupt PNG" );
				if ( c.length != (uint32)s->img_n * 2 ) return stbi__err( "bad tRNS len", "Corrupt PNG" );
				has_trans = 1;
				for ( k = 0; k < s->img_n; ++k )
					tc[k] = (uchar8)( s->get16be() & 255 ) * stbi__depth_scale_table[depth]; // non 8-bit images will be larger
			}
			break;
		}

		case STBI__PNG_TYPE( 'I', 'D', 'A', 'T' ): {
			if ( first ) return stbi__err( "first not IHDR", "Corrupt PNG" );
			if ( pal_img_n && !pal_len ) return stbi__err( "no PLTE", "Corrupt PNG" );
			if ( scan == STBI__SCAN_header ) { s->img_n = pal_img_n; return 1; }
			// reject lengths that would wrap the accumulated offset
			if ( (int)( ioff + c.length ) < (int)ioff ) return 0;
			if ( ioff + c.length > idata_limit )
			{
				// grow the accumulation buffer geometrically
				uchar8 *p;
				if ( idata_limit == 0 ) idata_limit = c.length > 4096 ? c.length : 4096;
				while ( ioff + c.length > idata_limit )
					idata_limit *= 2;
				p = (uchar8 *)STBI_REALLOC( z->idata, idata_limit ); if ( p == NULL ) return stbi__err( "outofmem", "Out of memory" );
				z->idata = p;
			}
			if ( !s->getn( z->idata + ioff, c.length ) ) return stbi__err( "outofdata", "Corrupt PNG" );
			ioff += c.length;
			break;
		}

		case STBI__PNG_TYPE( 'I', 'E', 'N', 'D' ): {
			uint32 raw_len, bpl;
			if ( first ) return stbi__err( "first not IHDR", "Corrupt PNG" );
			if ( scan != STBI__SCAN_load ) return 1;
			if ( z->idata == NULL ) return stbi__err( "no IDAT", "Corrupt PNG" );
			// initial guess for decoded data size to avoid unnecessary reallocs
			bpl = ( s->img_x * depth + 7 ) / 8; // bytes per line, per component
			raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */;
			size_t result_len = raw_len;
			z->expanded = (uchar8 *)nv::miniz_decompress( (char *)z->idata, ioff, &result_len, !is_iphone );
			raw_len = result_len;

			if ( z->expanded == NULL ) return 0; // zlib should set error
			STBI_FREE( z->idata ); z->idata = NULL;
			// add an alpha channel while decoding if the caller wants exactly one
			// more channel than the file has, or if a colour key must be applied
			if ( ( req_comp == s->img_n + 1 && req_comp != 3 && !pal_img_n ) || has_trans )
				s->img_out_n = s->img_n + 1;
			else
				s->img_out_n = s->img_n;
			if ( !stbi__create_png_image( z, z->expanded, raw_len, s->img_out_n, depth, color, interlace ) ) return 0;
			if ( has_trans )
				if ( !stbi__compute_transparency( z, tc, s->img_out_n ) ) return 0;
			if ( is_iphone && stbi__de_iphone_flag && s->img_out_n > 2 )
				stbi__de_iphone( z );
			if ( pal_img_n )
			{
				// pal_img_n == 3 or 4
				s->img_n = pal_img_n; // record the actual colors we had
				s->img_out_n = pal_img_n;
				if ( req_comp >= 3 ) s->img_out_n = req_comp;
				if ( !stbi__expand_png_palette( z, palette, pal_len, s->img_out_n ) )
					return 0;
			}
			STBI_FREE( z->expanded ); z->expanded = NULL;
			return 1;
		}

		default:
			// if critical, fail
			if ( first ) return stbi__err( "first not IHDR", "Corrupt PNG" );
			if ( ( c.type & ( 1 << 29 ) ) == 0 )
			{
				// string literal instead of an undeclared identifier, so this keeps
				// compiling even if stbi__err starts using its arguments
				return stbi__err( "invalid_chunk", "PNG not supported: unknown PNG chunk type" );
			}
			s->skip( c.length );
			break;
		}
		// end of PNG chunk, read and skip CRC
		s->get32be();
	}
}

// Decodes the PNG attached to 'p' and returns the pixel buffer, converted to
// req_comp channels when req_comp is non-zero and differs from the decoded
// channel count. *x and *y receive the dimensions and, if n is non-null, *n
// the channel count of the returned buffer. Returns a null pointer when
// req_comp is outside 0..4, parsing fails, or the conversion fails. Any
// intermediate buffers still held by 'p' are released before returning
// (except on the conversion-failure path, which returns immediately).
static unsigned char *stbi__do_png( stbi__png *p, int *x, int *y, int *n, int req_comp )
{
	if ( req_comp < 0 || req_comp > 4 ) return stbi__errpuc( "bad req_comp", "Internal error" );

	unsigned char *pixels = NULL;
	if ( stbi__parse_png_file( p, STBI__SCAN_load, req_comp ) )
	{
		stbi__context *ctx = p->s;

		// take ownership of the decoded buffer away from 'p'
		pixels = p->out;
		p->out = NULL;

		const bool needs_conversion = req_comp != 0 && req_comp != ctx->img_out_n;
		if ( needs_conversion )
		{
			pixels = stbi__convert_format( pixels, ctx->img_out_n, req_comp, ctx->img_x, ctx->img_y );
			ctx->img_out_n = req_comp;
			if ( pixels == NULL ) return pixels;
		}

		*x = ctx->img_x;
		*y = ctx->img_y;
		if ( n ) *n = ctx->img_out_n;
	}

	STBI_FREE( p->out );      p->out = NULL;
	STBI_FREE( p->expanded ); p->expanded = NULL;
	STBI_FREE( p->idata );    p->idata = NULL;

	return pixels;
}

// Decodes a PNG read through 's'; thin wrapper that sets up the decode state
// and forwards all arguments and the result to/from stbi__do_png.
static unsigned char *stbi__png_load( stbi__context *s, int *x, int *y, int *comp, int req_comp )
{
	stbi__png decoder;
	decoder.s = s;
	return stbi__do_png( &decoder, x, y, comp, req_comp );
}

// Reports whether the data at 's' starts with the PNG signature (1) or not (0)
// and rewinds the context's buffer afterwards.
static int stbi__png_test( stbi__context *s )
{
	const int is_png = stbi__check_png_header( s );
	s->rewind();
	return is_png;
}

// Parses just enough of the PNG to learn its size and channel count. Each of
// x, y and comp may be null; non-null ones receive the value on success.
// Returns 1 on success; on failure rewinds the context's buffer and returns 0.
static int stbi__png_info_raw( stbi__png *p, int *x, int *y, int *comp )
{
	if ( stbi__parse_png_file( p, STBI__SCAN_header, 0 ) )
	{
		stbi__context *ctx = p->s;
		if ( x ) *x = ctx->img_x;
		if ( y ) *y = ctx->img_y;
		if ( comp ) *comp = ctx->img_n;
		return 1;
	}

	p->s->rewind();
	return 0;
}

// Queries size and channel count of the PNG read through 's'; sets up the
// decode state and forwards to stbi__png_info_raw.
static int stbi__png_info( stbi__context *s, int *x, int *y, int *comp )
{
	stbi__png decoder;
	decoder.s = s;
	return stbi__png_info_raw( &decoder, x, y, comp );
}

static int stbi__stream_read( void *user, char *data, int size )
{
	return reinterpret_cast<stream*>( user )->read( data, 1, size );
}

static void stbi__stream_skip( void *user, int n )
{
	reinterpret_cast<stream*>( user )->seek( n, origin::CUR );
}

static int stbi__stream_eof( void *user )
{
	return reinterpret_cast<stream*>( user )->eof();
}

// Callback table that lets stbi__context read from an nv stream; the stream
// pointer is supplied as the 'user' argument when a context is constructed.
static stbi_io_callbacks stbi__callbacks =
{
	stbi__stream_read,
	stbi__stream_skip,
	stbi__stream_eof,
};

// Decodes a PNG from stream 'f'. On success returns the pixel buffer, fills
// *x, *y and *comp, and seeks the stream back by the number of bytes that were
// buffered but not consumed. Returns a null pointer on failure, in which case
// the stream position is not adjusted.
unsigned char * stbi_load( stream* f, int *x, int *y, int *comp, int req_comp )
{
	stbi__context ctx( &stbi__callbacks, (void *)f );
	unsigned char *pixels = stbi__png_load( &ctx, x, y, comp, req_comp );
	if ( !pixels )
		return pixels;

	// need to 'unget' all the characters in the IO buffer
	f->seek( -ctx.remaining(), origin::CUR );
	return pixels;
}


// Default constructor; performs no work of its own.
png_loader::png_loader() {}

// Probes 'str' for a PNG and reports its pixel format and dimensions without
// decoding the image. The stream position is restored to where it was on entry,
// whether or not the probe succeeds, so that the stream can be handed to
// load() or to another probe afterwards.
//
// format - receives type UBYTE and the format RED, RGB or RGBA.
// size   - receives the image dimensions in pixels.
//
// Returns true on success; false if the data is not a readable PNG or has a
// channel count other than 1, 3 or 4.
bool nv::png_loader::get_info( stream& str, image_format& format, ivec2& size )
{
	size_t pos = str.tell();
	stbi__context s( &stbi__callbacks, (void *)&str );
	int x = 0, y = 0;
	int comp = 0;
	const bool parsed = ( stbi__png_info( &s, &x, &y, &comp ) == 1 );

	// undo the read-ahead regardless of the outcome
	str.seek( pos, origin::SET );
	if ( !parsed ) return false;

	format.type = UBYTE;
	switch ( comp )
	{
	case 1: format.format = RED; break;
	case 3: format.format = RGB; break;
	case 4: format.format = RGBA; break;
	default: return false; // 0 or 2 channels have no matching format here
	}
	size = ivec2( x, y );
	return true;
}

// Decodes a PNG from 's' keeping the channel count stored in the file.
// On a successful decode the stream is seeked back by the number of bytes that
// were buffered but not consumed.
//
// Returns a newly allocated image_data for 1-, 3- or 4-channel images, or
// nullptr if decoding fails or the channel count has no matching format
// (e.g. grey + alpha).
// NOTE(review): image_data is presumably taking ownership of the pixel
// buffer passed to its constructor - confirm against image_data.
image_data* nv::png_loader::load( stream& s )
{
	int x, y;
	int comp;

	stbi__context ctx( &stbi__callbacks, (void *)&s );
	unsigned char *result;
	result = stbi__png_load( &ctx, &x, &y, &comp, 0 );
	if ( result )
	{
		// need to 'unget' all the characters in the IO buffer
		s.seek( -ctx.remaining(), origin::CUR );
		image_format format;
		ivec2 size;
		format.type = UBYTE;
		switch ( comp )
		{
		case 1: format.format = RED; break;
		case 3: format.format = RGB; break;
		case 4: format.format = RGBA; break;
		default:
			// unsupported channel count: release the decoded pixels instead of
			// leaking them, and report failure with a real null pointer
			STBI_FREE( result );
			return nullptr;
		}
		size = ivec2( x, y );
		return new image_data( format, size, result );
	}
	return nullptr;
}

// Decodes a PNG from 's' and converts it to the requested format. Only UBYTE
// data with format RED, RGB or RGBA can be requested; any other format trips
// an assertion and yields nullptr. On a successful decode the stream is seeked
// back by the number of bytes that were buffered but not consumed.
// Returns a newly allocated image_data, or nullptr if decoding fails.
image_data* nv::png_loader::load( stream& s, image_format format )
{
	NV_ASSERT( format.type == UBYTE, "!" );

	// translate the requested format into a channel count for the decoder
	int rcomp = 0;
	if ( format.format == RED )
		rcomp = 1;
	else if ( format.format == RGB )
		rcomp = 3;
	else if ( format.format == RGBA )
		rcomp = 4;
	else
	{
		NV_ASSERT( false, "bad format requested!" );
		return nullptr;
	}

	int width, height;
	int channels;
	stbi__context ctx( &stbi__callbacks, (void *)&s );
	unsigned char* pixels = stbi__png_load( &ctx, &width, &height, &channels, rcomp );
	if ( !pixels )
		return nullptr;

	// hand back the bytes that were buffered but never consumed
	s.seek( -ctx.remaining(), origin::CUR );

	image_format decoded;
	decoded.type = UBYTE;
	if ( channels == 1 )
		decoded.format = RED;
	else if ( channels == 3 )
		decoded.format = RGB;
	else if ( channels == 4 )
		decoded.format = RGBA;
	else
		NV_ASSERT( false, "UNKNOWN RESULT!" );

	ivec2 dimensions;
	dimensions = ivec2( width, height );
	return new image_data( decoded, dimensions, pixels );
}
