/*
    This file is part of Buld Then Snip.

    Buld Then Snip is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    Buld Then Snip is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with Buld Then Snip.  If not, see <http://www.gnu.org/licenses/>.
*/

#include "common.h"

// Depth at which the fog factor in render_vxl_rect() reaches zero.
// Also used as the clear value for the depth buffers and as the upper
// limit of the per-face trace loops.
// TODO: bump up to 127.5f
#define FOG_DISTANCE 40.0f

// One bit per axis direction (N = negative, P = positive).
// NOTE(review): none of the DF_* flags are referenced in the visible part
// of this file - presumably per-direction flags; confirm against other users.
#define DF_NX 0x01
#define DF_NY 0x02
#define DF_NZ 0x04
#define DF_PX 0x08
#define DF_PY 0x10
#define DF_PZ 0x20
// all six DF_* bits OR'd together
#define DF_SPREAD 0x3F

// Cubemap face indices; used to index cubemap_color[] and cubemap_depth[].
// CM_MAX is the number of faces.
enum
{
	CM_NX = 0,
	CM_NY,
	CM_NZ,
	CM_PX,
	CM_PY,
	CM_PZ,
	CM_MAX
};

// Per-face colour and depth buffers, cubemap_size*cubemap_size entries each.
// Allocated by render_init(), released by render_deinit().
uint32_t *cubemap_color[CM_MAX];
float *cubemap_depth[CM_MAX];
// side length of one cubemap face in pixels (set by render_init())
int cubemap_size;
// log2 of cubemap_size; used to turn a row number into a buffer offset
int cubemap_shift;

// Arguments of the current render call, stashed by render_cubemap() and
// render_vxl_redraw() so the face renderers can reach them without
// extra parameters.
uint32_t *rtmp_pixels;
int rtmp_width, rtmp_height, rtmp_pitch;
camera_t *rtmp_camera;
map_t *rtmp_map;

/*
 * REFERENCE IMPLEMENTATION
 * 
 */

/*
 * Fills an axis-aligned rectangle on one cubemap face with a fogged colour.
 *
 * ccolor, cdepth: colour / depth buffers of the face
 *                 (cubemap_size x cubemap_size entries each).
 * x1,y1,x2,y2:    rectangle corners in face pixels, in any order;
 *                 the upper bounds are exclusive. The rectangle is
 *                 clipped to the face.
 * color:          0xAARRGGBB source colour. R, G and B are scaled by the
 *                 fog factor; the top byte is passed through untouched.
 * depth:          distance of the surface; stored in cdepth and used to
 *                 derive the fog factor (1 at the eye, 0 at FOG_DISTANCE).
 *
 * Only pixels whose colour is still 0 are written, so whatever was drawn
 * first stays on top (the face renderers draw with increasing distance).
 */
void render_vxl_rect(uint32_t *ccolor, float *cdepth, int x1, int y1, int x2, int y2, uint32_t color, float depth)
{
	// TODO: one of these:
	// - a proper front-to-back renderer with the linked lists and stuff
	// - a back-to-front renderer
	//
	// because this is a tad slow for my liking.

	// split into channels
	int b = color&255;
	int g = (color>>8)&255;
	int r = (color>>16)&255;
	int a = (color>>24)&255;

	// linear fog, clamped to [0,1] so an out-of-range depth can never
	// produce negative or overflowing channel values
	float fog = (FOG_DISTANCE-(depth < 0.001f ? 0.001f : depth))/FOG_DISTANCE;
	if(fog > 1.0f)
		fog = 1.0f;
	if(fog < 0.0f)
		fog = 0.0f;

	r = (int)(r*fog+0.5f);
	g = (int)(g*fog+0.5f);
	b = (int)(b*fog+0.5f);

	// recombine in unsigned arithmetic: shifting a signed int with
	// a >= 128 into bit 31 would be undefined behaviour
	color = (uint32_t)b|((uint32_t)g<<8)|((uint32_t)r<<16)|((uint32_t)a<<24);

	// arrange *1 <= *2
	if(x1 > x2)
	{
		int tmp = x1;
		x1 = x2;
		x2 = tmp;
	}

	if(y1 > y2)
	{
		int tmp = y1;
		y1 = y2;
		y2 = tmp;
	}

	// clip
	if(x1 < 0)
		x1 = 0;
	if(y1 < 0)
		y1 = 0;
	if(x2 > cubemap_size)
		x2 = cubemap_size;
	if(y2 > cubemap_size)
		y2 = cubemap_size;

	// nothing left after clipping: bail out before forming any pointer
	// that would lie outside the buffers
	if(x1 >= x2 || y1 >= y2)
		return;

	// render
	uint32_t *cstart = &ccolor[(y1<<cubemap_shift)+x1];
	float *dstart = &cdepth[(y1<<cubemap_shift)+x1];
	for(int y = y1; y < y2; y++)
	{
		uint32_t *cptr = cstart;
		float *dptr = dstart;

		for(int x = x1; x < x2; x++)
		{
			// colour 0 marks a pixel nothing has been drawn to yet
			if(*cptr == 0)
			{
				*cptr = color;
				*dptr = depth;
			}
			cptr++;
			dptr++;
		}

		cstart += cubemap_size;
		dstart += cubemap_size;
	}
}

/*
 * Renders one cubemap face whose view direction runs along the Y axis.
 * (render_vxl_redraw() calls this for CM_NY with gy = -1 and for CM_PY
 * with gy = +1; gx and gz are 0 in both calls.)
 *
 * blkx, blky, blkz: integer block position of the camera.
 * subx, suby, subz: fractional position of the camera inside that block.
 * face:             CM_* index of the face buffers to draw into.
 * gx, gy, gz:       direction the face looks along.
 *
 * The face's colour and depth buffers are cleared first. The routine then
 * walks away from the camera one block layer at a time (coz += gy,
 * dist += 1) until dist reaches FOG_DISTANCE, and for every layer draws
 * one rectangle per solid block it finds via render_vxl_rect().
 *
 * Pillar bytes as used below (p = pillar pointer):
 *   p[0]        span length in 4-byte units; 0 marks the last span
 *   p[1]..p[2]  inclusive height range whose colours start at p[4]
 *   p[3]        height bound compared against coz
 *               NOTE(review): presumably the start of the air gap above
 *               the span, as in the VXL span header - confirm.
 * The map is indexed with "& (xlen-1)" / "& (zlen-1)", which only wraps
 * correctly if xlen and zlen are powers of two - TODO confirm.
 */
void render_vxl_face_vert(int blkx, int blky, int blkz,
	float subx, float suby, float subz,
	int face,
	int gx, int gy, int gz)
{
	// NOTE(review): sx and sy are never used in this function
	int sx,sy;
	int i;
	
	// projection scale/offset: half the face size maps [-1,1] to [0,size]
	float tracemul = cubemap_size/2;
	float traceadd = tracemul;
	
	// get cubemaps
	uint32_t *ccolor = cubemap_color[face];
	float *cdepth = cubemap_depth[face];
	
	// clear cubemap
	// (colour 0 = "nothing drawn yet", depth = far limit)
	for(i = 0; i < cubemap_size*cubemap_size; i++)
	{
		ccolor[i] = 0x00000000;
		cdepth[i] = FOG_DISTANCE;
	}
	
	// get X cube direction
	int xgx = gz+gy;
	int xgy = 0;
	int xgz = -gx;
	
	// get Y cube direction
	int ygx = 0;
	int ygy = gx+gz;
	int ygz = gy;
	
	// get cubemap offset
	// (sub-block camera position projected onto the face's X/Y directions)
	float cmoffsx = -(xgx*subx+xgy*suby+xgz*subz);
	float cmoffsy = -(ygx*subx+ygy*suby+ygz*subz);
	
	// get distance to wall
	float dist = -(subx*gx+suby*gy+subz*gz);
	if(dist < 0.0f)
		dist = 1.0f+dist;
	else {
		blky--;
		
		blkx--;
		blkz--;
	}
	// start one layer early; layers with dist <= 0.001 are skipped below
	dist -= 1.0f;
	
	// height of the layer currently being drawn
	int coz = blky;
	
	// now loop and follow through
	while(dist < FOG_DISTANCE)
	{
		// layer is outside the map's height range
		if(coz < 0 || coz >= rtmp_map->ylen)
		{
			coz += gy;
			dist += 1.0f;
			
			// moving further away from the valid range: nothing more to draw
			if(gy*coz > 0)
				break;
			
			continue;
		}
		
		// calculate frustum
		int frustum = (int)(dist*cubemap_size);
		
		// prep boundaries
		int bx1 = 0;
		int by1 = 0;
		int bx2 = frustum*2;
		int by2 = frustum*2;
		
		// clamp wrt pixel counts
		// TODO!
		
		// relocate
		bx1 -= frustum;
		by1 -= frustum;
		bx2 -= frustum;
		by2 -= frustum;
		
		// need to go towards 0, not -inf!
		// (can be done as shifts, just looks nicer this way)
		bx1 /= cubemap_size;
		by1 /= cubemap_size;
		bx2 /= cubemap_size;
		by2 /= cubemap_size;
		
		// widen the block range by a safety margin
		bx1--;by1--;
		bx2+=2;by2+=2;
		
		// go through loop
		int cox,coy;
		
		//printf("%.3f %i %i %i %i\n ", dist, bx1, by1, bx2, by2);
		if(dist > 0.001f)
		{
			// on-screen size of one block at this distance
			float boxsize = tracemul/dist;
			if(gy >= 0)
			{
				// bottom cubemap face
				for(cox = bx1; cox <= bx2; cox++)
				for(coy = by1; coy <= by2; coy++)
				{
					// NOTE(review): the +4 skips the first 4 bytes of the
					// pillar data - presumably a per-pillar header; confirm
					uint8_t *pillar = rtmp_map->pillars[
						((cox+blkx)&(rtmp_map->xlen-1))
						+(((coy+blkz)&(rtmp_map->zlen-1))*rtmp_map->xlen)]+4;
					
					//printf("%4i %4i %4i - %i %i %i %i\n",cox,coy,coz,
					//	pillar[0],pillar[1],pillar[2],pillar[3]);
					// get correct height
					// (walk the spans until one covers coz or we run out)
					for(;;)
					{
						if(coz == pillar[1])
						{
							// first coloured block of the span: colour at pillar[4]
							float px1 = (cox+cmoffsx)*boxsize+traceadd;
							float py1 = (coy+cmoffsy)*boxsize+traceadd;
							float px2 = px1+boxsize;
							float py2 = py1+boxsize;
							//printf("%i %i %i %i\n",(int)px1,(int)py1,(int)px2,(int)py2);
							
							render_vxl_rect(ccolor, cdepth,
								(int)px1, (int)py1, (int)px2, (int)py2,
								*(uint32_t *)(&pillar[4]), dist);
							break;
							// TODO: sides
						} else if(coz >= pillar[1] && coz <= pillar[2]) {
							// TODO: sides
							// inside the coloured run: pick the colour for this height
							float px1 = (cox+cmoffsx)*boxsize+traceadd;
							float py1 = (coy+cmoffsy)*boxsize+traceadd;
							float px2 = px1+boxsize;
							float py2 = py1+boxsize;
							
							render_vxl_rect(ccolor, cdepth,
								(int)px1, (int)py1, (int)px2, (int)py2,
								*(uint32_t *)(&pillar[4*(coz-pillar[1]+1)]), dist);
							break;
						} else if(pillar[0] == 0 || (coz < pillar[3])) {
							// last span, or coz lies before this span's pillar[3]: nothing to draw
							break;
						} else {
							// advance to the next span
							pillar += pillar[0]*4;
						}
					}
					
				}
			} else {
				// top cubemap face
				// ln: set when advancing past a span, to that span's length
				// minus its pillar[1]..pillar[2] run.
				// NOTE(review): ln is not reset per pillar, so a value from
				// the previous pillar carries over - confirm this is intended.
				int ln = 0;
				for(cox = bx1; cox <= bx2; cox++)
				for(coy = by1; coy <= by2; coy++)
				{
					uint8_t *pillar = rtmp_map->pillars[
						((cox+blkx)&(rtmp_map->xlen-1))
						+(((coy+blkz)&(rtmp_map->zlen-1))*rtmp_map->xlen)]+4;
					
					//if(pillar[0] == 0)
					//	continue;
					
					//printf("%4i %4i %4i - %i %i %i %i\n",cox,coy,coz,
					//	pillar[0],pillar[1],pillar[2],pillar[3]);
					// get correct height
					for(;;)
					{
						if(coz >= pillar[1] && coz <= pillar[2])
						{
							// note the mirrored block coordinates on this face
							float px1 = (-cox+cmoffsx)*boxsize+traceadd;
							float py1 = (-coy+cmoffsy)*boxsize+traceadd;
							float px2 = px1+boxsize;
							float py2 = py1+boxsize;
							
							render_vxl_rect(ccolor, cdepth,
								(int)px1, (int)py1, (int)px2, (int)py2,
								*(uint32_t *)(&pillar[4*(coz-pillar[1]+1)]), dist);
							// TODO: sides
							// wait, how the hell am i going to do these here?!
							break;
						} else if(ln != 0 && (coz < pillar[3] && coz > pillar[3]-ln)) {
							// coz falls in the run just before pillar[3]; its colours
							// sit in front of this span's header (negative index)
							float px1 = (-cox+cmoffsx)*boxsize+traceadd;
							float py1 = (-coy+cmoffsy)*boxsize+traceadd;
							float px2 = px1+boxsize;
							float py2 = py1+boxsize;
							//printf("%i %i %i %i\n",(int)px1,(int)py1,(int)px2,(int)py2);
							
							render_vxl_rect(ccolor, cdepth,
								(int)px1, (int)py1, (int)px2, (int)py2,
								*(uint32_t *)(&pillar[4*(coz-pillar[3])]), dist);
							// TODO: sides
							break;
						} else if(pillar[0] == 0 || (coz < pillar[1])) {
							break;
						} else {
							ln = pillar[0]-(pillar[2]-pillar[1]+1);
							pillar += pillar[0]*4;
						}
					}
					
				}
			}
		}
		
		// step one layer further away from the camera
		coz += gy;
		dist += 1.0f;
	}
}

/*
 * Renders one cubemap face whose view direction runs along the X or Z axis.
 * (render_vxl_redraw() calls this for CM_NX, CM_PX, CM_NZ and CM_PZ with
 * exactly one of gx / gz set to +-1 and gy = 0.)
 *
 * blkx, blky, blkz: integer block position of the camera.
 * subx, suby, subz: fractional position of the camera inside that block.
 * face:             CM_* index of the face buffers to draw into.
 * gx, gy, gz:       direction the face looks along.
 *
 * The face's colour and depth buffers are cleared first. The routine then
 * steps away from the camera one block column at a time (blkx += gx,
 * blkz += gz, dist += 1) until dist reaches FOG_DISTANCE. At each step it
 * scans the pillars across the view (index cox) and, for every coloured
 * block in range, draws its front rectangle plus extra rectangles scaled
 * for dist+0.5 that stand in for the block's sides.
 *
 * Pillar bytes as used below (p = pillar pointer):
 *   p[0]        span length in 4-byte units; 0 marks the last span
 *   p[1]..p[2]  inclusive height range whose colours start at p[4]
 *   p[3]        end (exclusive) of the colour run stored before the span
 *               NOTE(review): presumably the VXL span header - confirm.
 * The map is indexed with "& (xlen-1)" / "& (zlen-1)", which only wraps
 * correctly if xlen and zlen are powers of two - TODO confirm.
 */
void render_vxl_face_horiz(int blkx, int blky, int blkz,
	float subx, float suby, float subz,
	int face,
	int gx, int gy, int gz)
{
	// NOTE(review): sx and sy are never used in this function
	int sx,sy;
	int i;
	
	// projection scale/offset: half the face size maps [-1,1] to [0,size]
	float tracemul = cubemap_size/2;
	float traceadd = tracemul;
	
	// get cubemaps
	uint32_t *ccolor = cubemap_color[face];
	float *cdepth = cubemap_depth[face];
	
	// clear cubemap
	// (colour 0 = "nothing drawn yet", depth = far limit)
	for(i = 0; i < cubemap_size*cubemap_size; i++)
	{
		ccolor[i] = 0x00000000;
		cdepth[i] = FOG_DISTANCE;
	}
	
	// get X cube direction
	int xgx = gz+gy;
	int xgy = 0;
	int xgz = -gx;
	
	// get Y cube direction
	int ygx = 0;
	int ygy = gx+gz;
	int ygz = gy;
	
	// get cubemap offset
	// (sub-block camera position projected onto the face's X/Y directions,
	// then forced into the non-positive range)
	float cmoffsx = -(xgx*subx+xgy*suby+xgz*subz);
	float cmoffsy = -(ygx*subx+ygy*suby+ygz*subz);
	if(cmoffsy >= 0.0f)
		cmoffsy = -cmoffsy;
	if(cmoffsx >= 0.0f)
		cmoffsx -= 1.0f;
	//else
	//	blky--;
	
	
	// get distance to wall
	float dist = -(subx*gx+suby*gy+subz*gz);
	if(dist < 0.0f)
		dist = 1.0f+dist;
	// start one column early; steps with dist <= 0.001 are skipped below
	dist -= 1.0f;
	
	// coz is reused below as the height iterator
	int coz = blky;
	
	// now loop and follow through
	while(dist < FOG_DISTANCE)
	{
		// calculate frustum
		int frustum = (int)(dist*cubemap_size);
		
		// prep boundaries
		int bx1 = 0;
		int by1 = 0;
		int bx2 = frustum*2;
		int by2 = frustum*2;
		
		// clamp wrt pixel counts
		// TODO!
		
		// relocate
		bx1 -= frustum;
		by1 -= frustum;
		bx2 -= frustum;
		by2 -= frustum;
		
		// need to go towards 0, not -inf!
		// (can be done as shifts, just looks nicer this way)
		bx1 /= cubemap_size;
		by1 /= cubemap_size;
		bx2 /= cubemap_size;
		by2 /= cubemap_size;
		
		// widen the block range by a safety margin
		bx1-=2;by1--;
		bx2+=2;by2++;
		
		// go through loop
		int cox,coy;
		cox = 0;
		coy = 0;
		
		if(dist > 0.001f)
		{
			// on-screen block size at the front of the block (boxsize)
			// and half a block further away (nboxsize)
			float boxsize = tracemul/dist;
			float nboxsize = tracemul/(dist+0.5f);
			for(cox = bx1; cox <= bx2; cox++)
			{
				coz = 0;
				
				// cox moves sideways, perpendicular to (gx,gz).
				// NOTE(review): the +4 skips the first 4 bytes of the
				// pillar data - presumably a per-pillar header; confirm
				uint8_t *pillar = rtmp_map->pillars[
					((cox*gz+blkx)&(rtmp_map->xlen-1))
					+(((-cox*gx+blkz)&(rtmp_map->zlen-1))*rtmp_map->xlen)]+4;
				
				//printf("%4i %4i %4i - %i %i %i %i\n",cox,coy,coz,
				//	pillar[0],pillar[1],pillar[2],pillar[3]);
				
				// walk every span of this pillar
				for(;;)
				{
					// pcol walks the colours stored after the span header
					uint8_t *pcol = pillar+4;
					
					// render top
					// (the run pillar[1]..pillar[2], if any of it is in range)
					if(pillar[2]-blky >= by1 && pillar[1]-blky <= by2)
					for(coz = pillar[1]; coz <= pillar[2]; coz++)
					{
						if(coz-blky >= by1 && coz-blky <= by2)
						{
							// px1..py2: front rectangle; px3..py4: same block
							// projected at dist+0.5
							float px1 = (cox+cmoffsx)*boxsize+traceadd;
							float py1 = (coz+cmoffsy-blky)*boxsize+traceadd;
							float px2 = px1+boxsize;
							float py2 = py1+boxsize;
							float px3 = (cox+cmoffsx)*nboxsize+traceadd;
							float py3 = (coz+cmoffsy-blky)*nboxsize+traceadd;
							float px4 = px3+nboxsize;
							float py4 = py3+nboxsize;
							
							render_vxl_rect(ccolor, cdepth,
								(int)px1, (int)py1, (int)px2, (int)py2,
								*((uint32_t *)pcol), dist);
							
							// TODO: replace these with trapezium drawing routines
							// left or right side
							if(px3 < px1)
								render_vxl_rect(ccolor, cdepth,
									(int)px3, (int)py3, (int)px1, (int)py4,
									*((uint32_t *)pcol), dist);
							else if(px2 < px4)
								render_vxl_rect(ccolor, cdepth,
									(int)px2, (int)py3, (int)px4, (int)py4,
									*((uint32_t *)pcol), dist);
							
							// upper or lower side
							if(py3 < py1)
								render_vxl_rect(ccolor, cdepth,
									(int)px3, (int)py3, (int)px4, (int)py1,
									*((uint32_t *)pcol), dist);
							else if(py2 < py4)
								render_vxl_rect(ccolor, cdepth,
									(int)px3, (int)py2, (int)px4, (int)py4,
									*((uint32_t *)pcol), dist);
						}
						pcol+=4;
					}
					
					// advance where sensible
					// (stop once this span already reaches past the visible range)
					if(pillar[2]-blky > by2)
						break;
					
					// last span of the pillar
					if(pillar[0] == 0)
						break;
					
					pillar += pillar[0]*4;
					
					// render bottom
					// diff = colours left between pcol and the next span header;
					// they cover the heights ending just before pillar[3]
					int diff = (pillar-pcol)>>2;
					
					for(coz = pillar[3]-diff; coz < pillar[3]; coz++)
					{
						if(coz-blky >= by1 && coz-blky <= by2)
						{
							float px1 = (cox+cmoffsx)*boxsize+traceadd;
							float py1 = (coz+cmoffsy-blky)*boxsize+traceadd;
							float px2 = px1+boxsize;
							float py2 = py1+boxsize;
							float px3 = (cox+cmoffsx)*nboxsize+traceadd;
							float py3 = (coz+cmoffsy-blky)*nboxsize+traceadd;
							float px4 = px3+nboxsize;
							float py4 = py3+nboxsize;
							
							render_vxl_rect(ccolor, cdepth,
								(int)px1, (int)py1, (int)px2, (int)py2,
								*((uint32_t *)pcol), dist);
							
							// TODO: replace these with trapezium drawing routines
							if(px3 < px1)
								render_vxl_rect(ccolor, cdepth,
									(int)px3, (int)py3, (int)px1, (int)py4,
									*((uint32_t *)pcol), dist);
							else if(px2 < px4)
								render_vxl_rect(ccolor, cdepth,
									(int)px2, (int)py3, (int)px4, (int)py4,
									*((uint32_t *)pcol), dist);
							
							if(py3 < py1)
								render_vxl_rect(ccolor, cdepth,
									(int)px3, (int)py3, (int)px4, (int)py1,
									*((uint32_t *)pcol), dist);
							else if(py2 < py4)
								render_vxl_rect(ccolor, cdepth,
									(int)px3, (int)py2, (int)px4, (int)py4,
									*((uint32_t *)pcol), dist);
						}
						pcol+=4;
					}
				}
			}
		}
		
		// step one block further away from the camera
		dist += 1.0f;
		blkx += gx;
		blkz += gz;
	}
}

/*
 * Re-renders all six cubemap faces from the camera's current position.
 *
 * camera: supplies the world position (mpx, mpy, mpz).
 * map:    voxel map to render; xlen/zlen are used as wrap masks.
 */
void render_vxl_redraw(camera_t *camera, map_t *map)
{
	// the face renderers read the map through these globals
	rtmp_camera = camera;
	rtmp_map = map;

	// whole-block part of each coordinate
	double basex = floor(camera->mpx);
	double basey = floor(camera->mpy);
	double basez = floor(camera->mpz);

	// block position; X and Z wrap around the map, Y is left unwrapped
	int cellx = ((int)basex) & (map->xlen-1);
	int celly = ((int)basey);
	int cellz = ((int)basez) & (map->zlen-1);

	// position inside the block
	float fracx = (camera->mpx - basex);
	float fracy = (camera->mpy - basey);
	float fracz = (camera->mpz - basez);

	// negative-facing faces
	render_vxl_face_horiz(cellx, celly, cellz, fracx, fracy, fracz, CM_NX, -1,  0,  0);
	render_vxl_face_vert(cellx, celly, cellz, fracx, fracy, fracz, CM_NY,  0, -1,  0);
	render_vxl_face_horiz(cellx, celly, cellz, fracx, fracy, fracz, CM_NZ,  0,  0, -1);

	// positive-facing faces
	render_vxl_face_horiz(cellx, celly, cellz, fracx, fracy, fracz, CM_PX,  1,  0,  0);
	render_vxl_face_vert(cellx, celly, cellz, fracx, fracy, fracz, CM_PY,  0,  1,  0);
	render_vxl_face_horiz(cellx, celly, cellz, fracx, fracy, fracz, CM_PZ,  0,  0,  1);
}

/*
 * Draws the screen by sampling the six pre-rendered cubemap faces.
 *
 * pixels:        destination buffer, one uint32_t per pixel.
 * width, height: size of the visible area in pixels.
 * pitch:         distance between the starts of two rows, in pixels
 *                (the row pointer advances by pitch-width after each row).
 * camera:        supplies the view basis (mxx..mzz: X, Y and Z axis vectors).
 * map:           stashed in rtmp_map; not otherwise used here.
 *
 * A ray direction is interpolated across the screen between the four
 * corner rays. For each pixel the axis with the largest absolute
 * component selects the face, and the other two components, divided by
 * it, select the texel.
 */
void render_cubemap(uint32_t *pixels, int width, int height, int pitch, camera_t *camera, map_t *map)
{
	int x,y;
	
	// stash stuff in globals to prevent spamming the stack too much
	// (and in turn thrashing the cache)
	rtmp_pixels = pixels;
	rtmp_width = width;
	rtmp_height = height;
	rtmp_pitch = pitch;
	rtmp_camera = camera;
	rtmp_map = map;
	
	// get corner traces
	float tracemul = cubemap_size/2;
	float traceadd = tracemul;
	float ctrx1 = (camera->mzx+camera->mxx-camera->myx);
	float ctry1 = (camera->mzy+camera->mxy-camera->myy);
	float ctrz1 = (camera->mzz+camera->mxz-camera->myz);
	float ctrx2 = (camera->mzx-camera->mxx-camera->myx);
	float ctry2 = (camera->mzy-camera->mxy-camera->myy);
	float ctrz2 = (camera->mzz-camera->mxz-camera->myz);
	float ctrx3 = (camera->mzx+camera->mxx+camera->myx);
	float ctry3 = (camera->mzy+camera->mxy+camera->myy);
	float ctrz3 = (camera->mzz+camera->mxz+camera->myz);
	float ctrx4 = (camera->mzx-camera->mxx+camera->myx);
	float ctry4 = (camera->mzy-camera->mxy+camera->myy);
	float ctrz4 = (camera->mzz-camera->mxz+camera->myz);
	
	// calculate deltas
	float fbx = ctrx1, fby = ctry1, fbz = ctrz1; // base
	float fex = ctrx2, fey = ctry2, fez = ctrz2; // end
	float flx = ctrx3-fbx, fly = ctry3-fby, flz = ctrz3-fbz; // left side
	float frx = ctrx4-fex, fry = ctry4-fey, frz = ctrz4-fez; // right side
	// per-row steps use the width so pixels stay square
	flx /= (float)width; fly /= (float)width; flz /= (float)width;
	frx /= (float)width; fry /= (float)width; frz /= (float)width;
	
	// scale cubemap correctly
	// (skip the rows that fall outside a non-square screen)
	fbx += flx*((float)(width-height))/2.0f;
	fby += fly*((float)(width-height))/2.0f;
	fbz += flz*((float)(width-height))/2.0f;
	fex += frx*((float)(width-height))/2.0f;
	fey += fry*((float)(width-height))/2.0f;
	fez += frz*((float)(width-height))/2.0f;
	
	// raytrace it
	// TODO: find some faster method
	uint32_t *p = pixels;
	// iterate over exactly height rows of width pixels: counting from
	// -width/2 to width/2 dropped a column/row for odd sizes and made
	// the "pitch-width" row advance below drift
	for(y = 0; y < height; y++)
	{
		float fx = fbx;
		float fy = fby;
		float fz = fbz;
		
		float fdx = (fex-fbx)/(float)width;
		float fdy = (fey-fby)/(float)width;
		float fdz = (fez-fbz)/(float)width;
		
		for(x = 0; x < width; x++)
		{
			// get correct cube map and draw
			if(fabsf(fx) > fabsf(fy) && fabsf(fx) > fabsf(fz))
			{
				*p++ = cubemap_color[fx >= 0.0f ? CM_PX : CM_NX][
					((cubemap_size-1)&(int)(-fz*tracemul/fx+traceadd))
					|(((cubemap_size-1)&(int)(fy*tracemul/fabsf(fx)+traceadd))<<cubemap_shift)];
			} else if(fabsf(fz) > fabsf(fy) && fabsf(fz) >= fabsf(fx)) {
				// ">=" against fx: an exact |fx| == |fz| tie must land here,
				// otherwise it falls through to the Y face and divides by a
				// (possibly zero) fy that is not the dominant component
				*p++ = cubemap_color[fz >= 0.0f ? CM_PZ : CM_NZ][
					((cubemap_size-1)&(int)(fx*tracemul/fz+traceadd))
					|(((cubemap_size-1)&(int)(fy*tracemul/fabsf(fz)+traceadd))<<cubemap_shift)];
			} else {
				// only reached when |fy| >= |fx| and |fy| >= |fz|
				*p++ = cubemap_color[fy >= 0.0f ? CM_PY : CM_NY][
					((cubemap_size-1)&(int)(fx*tracemul/fy+traceadd))
					|(((cubemap_size-1)&(int)(fz*tracemul/fy+traceadd))<<cubemap_shift)];
			}
			
			fx += fdx;
			fy += fdy;
			fz += fdz;
		}
		
		// skip the padding at the end of the row
		p += pitch-width;
		
		fbx += flx;
		fby += fly;
		fbz += flz;
		
		fex += frx;
		fey += fry;
		fez += frz;
	}
}

/*
 * Allocates the six cubemap faces for a screen of the given size.
 *
 * The face size is the larger screen dimension rounded up to a power of
 * two and then halved (minimum 1). On success cubemap_size and
 * cubemap_shift (log2 of the size) are set.
 *
 * Returns 0 on success. Returns 1 if the screen size is not positive or
 * an allocation fails; on allocation failure the buffers allocated so
 * far are freed and their pointers reset to NULL.
 */
int render_init(int width, int height)
{
	int i;
	int size = (width > height ? width : height);
	
	// a non-positive size would otherwise collapse to cubemap_size 0 and
	// cubemap_shift -1, and the renderers would shift by a negative count
	if(size < 1)
	{
		fprintf(stderr, "render_init: invalid screen size %ix%i\n", width, height);
		return 1;
	}
	
	// get nearest power of 2
	size = (size-1);
	size |= size>>1;
	size |= size>>2;
	size |= size>>4;
	size |= size>>8;
	size |= size>>16;
	size++;
	
	// reduce quality a little bit
	// 800x600 -> 1024^2 -> 512^2 ends up as 1MB x 6 textures = 6MB
	
	size >>= 1;
	
	// a 1-pixel-wide screen would halve down to nothing
	if(size < 1)
		size = 1;
	
	// do the size arithmetic in size_t so large faces cannot overflow int
	size_t npixels = (size_t)size*(size_t)size;
	
	// allocate cubemaps
	for(i = 0; i < CM_MAX; i++)
	{
		cubemap_color[i] = malloc(npixels*sizeof *cubemap_color[i]);
		cubemap_depth[i] = malloc(npixels*sizeof *cubemap_depth[i]);
		if(cubemap_color[i] == NULL || cubemap_depth[i] == NULL)
		{
			// Can't allocate :. Can't continue
			// Clean up like a boss
			fprintf(stderr, "render_init: could not allocate cubemap %i\n", i);
			for(; i >= 0; i--)
			{
				// free(NULL) is a no-op, so no need to test first
				free(cubemap_color[i]);
				free(cubemap_depth[i]);
				cubemap_color[i] = NULL;
				cubemap_depth[i] = NULL;
			}
			
			return 1;
		}
	}
	
	// we might as well set this, too!
	cubemap_size = size;
	
	// calculate shift factor
	// (size is a power of two, so this yields log2(size))
	cubemap_shift = -1;
	while(size != 0)
	{
		cubemap_shift++;
		size >>= 1;
	}
	
	return 0;
}

/*
 * Releases the colour and depth buffers of every cubemap face and resets
 * the pointers to NULL, so calling this more than once is harmless.
 */
void render_deinit(void)
{
	// free(NULL) does nothing, so faces that were never allocated are fine
	for(int face = 0; face < CM_MAX; face++)
	{
		free(cubemap_color[face]);
		cubemap_color[face] = NULL;

		free(cubemap_depth[face]);
		cubemap_depth[face] = NULL;
	}
}