openspades/Sources/Draw/SWImageRenderer.cpp

/*
 Copyright (c) 2013 yvt

 This file is part of OpenSpades.

 OpenSpades is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 OpenSpades is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with OpenSpades.  If not, see <http://www.gnu.org/licenses/>.

 */

#include "SWImageRenderer.h"
#include <Core/Bitmap.h>
#include "SWImage.h"

namespace spades {
	namespace draw {
		/**
		 * Constructs a renderer for the given feature level.
		 * The image shader is selected initially; no other member is
		 * assigned here.
		 */
		SWImageRenderer::SWImageRenderer(SWFeatureLevel lvl)
		: shader(ShaderType::Image), featureLevel(lvl) {
			// nothing else to set up
		}

		/** Destructor; the body intentionally does nothing. */
		SWImageRenderer::~SWImageRenderer() {}

		/**
		 * Selects the bitmap that the polygon renderers draw into.
		 *
		 * For a non-null bitmap the cached vectors fbSize4 / fbCenter4 are
		 * recomputed from its dimensions. Passing null only clears the frame
		 * pointer and leaves the previously cached vectors untouched.
		 */
		void SWImageRenderer::SetFramebuffer(spades::Bitmap *bmp) {
			frame = bmp;
			if(!bmp)
				return;

			const float halfW = static_cast<float>(bmp->GetWidth()) * .5f;
			const float halfH = static_cast<float>(bmp->GetHeight()) * .5f;

			// the Y component of fbSize4 is the negated half height
			fbSize4 = MakeVector4(halfW, -halfH, 1.f, 1.f);
			fbCenter4 = MakeVector4(halfW, halfH, 0.f, 0.f);
		}

		void SWImageRenderer::SetDepthBuffer(float *f) {
			depthBuffer = f;
		}

		/** Remembers which shader type is to be used. */
		void SWImageRenderer::SetShaderType(ShaderType type) {
			this->shader = type;
		}

		// Stores the near value of the Z range. The second, unnamed argument
		// (presumably the far plane -- TODO confirm against the header) is
		// discarded.
		void SWImageRenderer::SetZRange(float zNear, float) {
			// currently zNear is ignored...
			// NOTE(review): the value is stored, but nothing in the visible part
			// of this file reads it back, which is presumably what "ignored"
			// refers to.
			this->zNear = zNear;
		}


		/**
		 * Integer linear interpolator that walks from `start` towards `end`
		 * in `numSteps` equal steps.
		 *
		 * Two back ends are available:
		 *  - mode == 0: 32.32 fixed point (the default),
		 *  - mode == 1 / -1: Bresenham-style error accumulation, only chosen
		 *    when the caller passes noBresenham = false and the distance does
		 *    not exceed the number of steps. The mode value doubles as the
		 *    direction of travel.
		 *
		 * The `fp` member is read directly by the SSE2 interpolator, so its
		 * layout and names are part of the interface.
		 */
		struct Interpolator {
			int mode; // 0:fixed point, 1/-1: bresenham

			// fixed point state (meaningful when mode == 0)
			struct {
				int64_t counter;
				int64_t step;
			} fp;

			// bresenham state (meaningful when mode != 0)
			struct {
				unsigned int divisor;
				unsigned int dividend;
				unsigned int step;
				int largePos;
			} b;

			static int abs(int v) {
				return v < 0 ? -v : v;
			}

			/**
			 * @param start        value returned before any step is taken
			 * @param end          value approached after numSteps steps
			 * @param numSteps     number of steps; 0 yields a constant value
			 * @param noBresenham  when true (default) fixed point is always used
			 */
			Interpolator(int start, int end, int numSteps, bool noBresenham = true) {
				// FIXME: same sub-pixel positioning as OpenGL?
				if(abs(end - start) <= numSteps && !noBresenham) {
					int distance = end - start;
					if(distance >= 0) {
						mode = 1;
					}else{
						mode = -1;
						distance = -distance;
					}
					if(numSteps == 0)
						numSteps = 1;
					b.divisor = static_cast<unsigned int>(numSteps);
					b.dividend = 0;
					b.largePos = start;
					b.step = static_cast<unsigned int>(distance);
				}else{
					mode = 0;
					// Scale by multiplication instead of `<< 32`: left-shifting
					// a negative signed value is undefined before C++20.
					const int64_t one = static_cast<int64_t>(1) << 32;
					if(numSteps == 0) {
						fp.step = 0;
					}else{
						fp.step = static_cast<int64_t>(end - start) * one;
						fp.step /= static_cast<int64_t>(numSteps);
					}
					fp.counter = static_cast<int64_t>(start) * one;
				}
			}

			/** Returns the integer value at the current position. */
			int GetCurrent() {
				if(mode != 0)
					return b.largePos;
				return static_cast<int>(fp.counter >> 32);
			}

			/** Advances by exactly one step. */
			void MoveNext() {
				if(mode == 0) {
					fp.counter += fp.step;
				}else{
					b.dividend += b.step;
					while(b.dividend >= b.divisor){
						b.dividend -= b.divisor;
						b.largePos += mode;
					}
				}
			}

			/**
			 * Advances by `numSteps` steps at once.
			 * In Bresenham mode a non-positive count is a no-op.
			 */
			void MoveNext(int numSteps) {
				if(mode == 0) {
					if(numSteps == 1)
						MoveNext();
					else if(numSteps == 0)
						return;
					else
						fp.counter += fp.step * static_cast<int64_t>(numSteps);
				}else{
					if(numSteps < 4){
						// counted loop so that a negative count cannot spin
						for(int i = 0; i < numSteps; i++)
							MoveNext();
					}else{
						// Widen BEFORE multiplying; the product of two 32-bit
						// values does not fit in 32 bits for long jumps.
						unsigned long long d = b.dividend;
						d += static_cast<unsigned long long>(b.step) *
							 static_cast<unsigned long long>(numSteps);
						const unsigned long long cnt = d / b.divisor;
						d -= cnt * b.divisor;
						b.dividend = static_cast<unsigned int>(d);
						b.largePos += mode * static_cast<int>(cnt);
					}
				}
			}
		};

		// Texture coordinates are carried as fixed-point integers with
		// texUVScaleBits fractional bits. texUVScaleInt is the integer that a
		// float coordinate of 1.0 maps to; the renderers mask with
		// (texUVScaleInt - 1) to make the texture repeat.
		enum {
			texUVScaleBits = 16,
			texUVScaleInt = 1 << texUVScaleBits
		};
		// Float form of texUVScaleInt, used when converting float UVs to
		// fixed point.
		static const float texUVScaleFloat = static_cast<float>(texUVScaleInt);

		/**
		 * Per-vertex / per-pixel texture coordinate pair in fixed point
		 * (scaled by texUVScaleFloat).
		 *
		 * With SSE2 enabled the same storage can also be accessed as one
		 * 128-bit register through uv_m128.
		 */
		struct SWImageVarying {
			union {
				struct {
					int u, v;
				};
#if ENABLE_SSE2
				__m128i uv_m128; // [u32, ?32, v32, ?32]
#endif
			};
#if __cplusplus >= 201103L
			SWImageVarying() = default; // POD
#else
			SWImageVarying() {} // non-POD
#endif
			/** Builds a varying from already scaled integer coordinates. */
			SWImageVarying(int u, int v) {
				this->u = u;
				this->v = v;
			}

			/**
			 * Converts the float texture coordinate of a vertex to fixed
			 * point, adding one half before the truncating cast.
			 */
			SWImageVarying(const SWImageRenderer::Vertex& v) {
				this->u = static_cast<int>(v.uv.x * texUVScaleFloat + .5f);
				this->v = static_cast<int>(v.uv.y * texUVScaleFloat + .5f);
			}
		};

		/**
		 * Interpolates a SWImageVarying (texture U and V) between two end
		 * points by running one Interpolator per component. The feature level
		 * parameter is not used by this generic version.
		 */
		template<SWFeatureLevel level>
		struct SWImageGouraudInterpolator {
			Interpolator u, v;

			/** Sets up both components to go from `start` to `end` in `numSteps` steps. */
			SWImageGouraudInterpolator(const SWImageVarying& start,
									   const SWImageVarying& end,
									   int numSteps)
			: u(start.u, end.u, numSteps)
			, v(start.v, end.v, numSteps) { }

			/** Returns the coordinate pair at the current position. */
			SWImageVarying GetCurrent() {
				const int curU = u.GetCurrent();
				const int curV = v.GetCurrent();
				return SWImageVarying(curU, curV);
			}

			/** Advances both components by one step. */
			void MoveNext() {
				u.MoveNext();
				v.MoveNext();
			}

			/** Advances both components by `s` steps. */
			void MoveNext(int s) {
				u.MoveNext(s);
				v.MoveNext(s);
			}
		};

#if ENABLE_SSE2
		/**
		 * SSE2 version of the UV interpolator.
		 *
		 * The two scalar Interpolators are only used to compute the 32.32
		 * fixed-point start value and step; afterwards both components are
		 * advanced together with one 64-bit SIMD add. The unions give scalar
		 * access to the two 64-bit lanes of each register.
		 */
		template<>
		struct SWImageGouraudInterpolator<SWFeatureLevel::SSE2> {
			Interpolator u, v;

			union{
				__m128i uv;
				struct {
					int64_t uvU, uvV;
				};
			};
			union {
				__m128i uvStep;
				struct {
					int64_t stepU, stepV;
				};
			};

			/** Sets up both components to go from `start` to `end` in `numSteps` steps. */
			inline SWImageGouraudInterpolator(const SWImageVarying& start,
									   const SWImageVarying& end,
									   int numSteps):
			u(start.u, end.u, numSteps, true), // fixed point is required below
			v(start.v, end.v, numSteps, true)
			{
				uvU = u.fp.counter;
				uvV = v.fp.counter;
				stepU = u.fp.step;
				stepV = v.fp.step;
			}

			/** Returns the coordinate pair at the current position. */
			inline SWImageVarying GetCurrent() {
				SWImageVarying varying;
				// 0x3d selects source lanes [1, 3, 3, 0]: the upper 32 bits
				// (integer parts) of U and V go to lanes 0 and 1, and V is
				// repeated in lane 2.
				varying.uv_m128 = _mm_shuffle_epi32(uv, 0x3d);
				return varying;
			}

			/**
			 * Advances by `s` steps. Small positive counts use repeated SIMD
			 * adds; everything else (including negative counts, which used
			 * to fall into the counted loop and spin) uses scalar multiplies.
			 */
			inline void MoveNext(int s) {
				if(s == 0) return;
				if(s > 0 && s < 4){
					auto v = uv, st = uvStep;
					while(s--) {
						v = _mm_add_epi64(v, st);
					}
					uv = v;
				}
				else {
					// no SSE2 support for 64bit multiply, but
					// this isn't a big problem because this case is rare
					uvU += stepU * s;
					uvV += stepV * s;
				}
			}

			/** Advances by one step. */
			inline void MoveNext() {
				uv = _mm_add_epi64(uv, uvStep);
			}
		};
#endif


#pragma mark - Polygon Renderer Main

		/**
		 * Scalar textured triangle rasterizer (primary template).
		 *
		 * Vertices must already be in framebuffer pixel coordinates; the
		 * static_asserts reject the transform / NDC variants. The triangle is
		 * textured with `img`, modulated by the color of the topmost vertex
		 * and blended over the framebuffer as premultiplied alpha. The depth
		 * of the topmost vertex is used for the whole polygon; depth
		 * interpolation and perspective correction are not implemented (see
		 * the FIXMEs). `solidFill` is not consulted by this implementation.
		 */
		template<
		SWFeatureLevel level,
		bool needTransform,
		bool ndc, // normalized device coordinate
		bool depthTest,
		bool solidFill,
		bool linearInterpolate
		>
		struct SWImageRenderer::PolygonRenderer {

			static_assert(!needTransform, "Transform pass was not selected");
			static_assert(!ndc, "Denormalize pass was not selected");

			/**
			 * Rasterizes one triangle whose vertices are already ordered by
			 * ascending Y (v1 topmost, v3 bottommost).
			 *
			 * @param img texture to sample; must not be null
			 * @param r   renderer providing the frame / depth buffers and
			 *            receiving the pixelsDrawn statistics
			 */
			static void DrawPolygonInternalInner(SWImage *img,
											const Vertex& v1,
											const Vertex& v2,
											const Vertex& v3,
											SWImageRenderer& r) {
				// TODO: support null image

				Bitmap *const fb = r.frame;
				SPAssert(fb != nullptr);

				if(v3.position.y <= 0.f) {
					// viewport cull
					return;
				}

				const int fbW = fb->GetWidth();
				const int fbH = fb->GetHeight();
				uint32_t *const bmp = fb->GetPixels();

				if(v1.position.y >= static_cast<float>(fbH)) {
					// viewport cull
					return;
				}

				float *const depthBuffer = r.depthBuffer;
				if(depthTest){
					SPAssert(depthBuffer != nullptr);
				}

				uint32_t *const tpixels = img->GetRawBitmap();
				const int tw = img->GetRawWidth();
				const int th = img->GetRawHeight();

				const int x1 = static_cast<int>(v1.position.x);
				const int y1 = static_cast<int>(v1.position.y);
				const int x2 = static_cast<int>(v2.position.x);
				const int y2 = static_cast<int>(v2.position.y);
				const int x3 = static_cast<int>(v3.position.x);
				const int y3 = static_cast<int>(v3.position.y);

				if(x1 == x2 && x2 == x3) return; // area cull
				if(y1 == y3) return; // area cull
				if(std::min(std::min(x1, x2), x3) >= fbW) return; // viewport cull
				if(std::max(std::max(x1, x2), x3) <= 0) return; // viewport cull

				// color multipliers in [0, 256] so that ">> 8" normalizes
				auto convertColor = [](float f) {
					int i = static_cast<int>(f * 256.f + .5f);
					return static_cast<unsigned short>(std::max(std::min(i, 256), 0));
				};
				// Note the x/z swap: "R" below names the lowest byte of a
				// pixel, which is multiplied by color.z.
				unsigned short mulR = convertColor(v1.color.z);
				unsigned short mulG = convertColor(v1.color.y);
				unsigned short mulB = convertColor(v1.color.x);
				unsigned short mulA = convertColor(v1.color.w);

				if(mulA == 0 && mulR == 0 && mulG == 0 && mulB == 0)
					return;

				// unreal-ish linear interpolation by using dither
				// (multiplication instead of "<< 1": shifting a negative value
				// left is undefined before C++20)
				int16_t ditherBaseX = static_cast<int16_t>((1 << (texUVScaleBits - 2)) * img->GetInvWidth());
				int16_t ditherBaseY = static_cast<int16_t>((1 << (texUVScaleBits - 2)) * img->GetInvHeight());
				int16_t ditherMap[] = {
					static_cast<int16_t>(-ditherBaseX), static_cast<int16_t>(-ditherBaseY * 2),
					0, static_cast<int16_t>(ditherBaseY),
					static_cast<int16_t>(ditherBaseX), 0,
					static_cast<int16_t>(-ditherBaseX * 2), static_cast<int16_t>(-ditherBaseY),
				};

				// Blends one texel over one framebuffer pixel. The depth is
				// passed as a pointer because it is null when depthTest is
				// off; it is only dereferenced when depthTest is on.
				auto drawPixel = [mulR, mulG, mulB, mulA](uint32_t& dest, const float *destDepth,
														  uint32_t texture, float inDepth) {
					if(depthTest) {
						if(inDepth > *destDepth) {
							return;
						}
					}

					if(texture == 0) return; // transparent

					unsigned int ta = static_cast<unsigned int>(texture >> 24);
					ta += (ta >> 7); // [0, 255] -> [0, 256]

					// already premultiplied. see SWImage.cpp
					unsigned int tr = static_cast<unsigned int>((texture >> 0) & 0xff);
					unsigned int tg = static_cast<unsigned int>((texture >> 8) & 0xff);
					unsigned int tb = static_cast<unsigned int>((texture >>16) & 0xff);
					tr = (tr * mulR) >> 8; tg = (tg * mulG) >> 8;
					tb = (tb * mulB) >> 8;

					if(ta == 256 && mulA == 256) {
						// opaque: no need to read the destination
						dest = tr | (tg << 8) | (tb << 16);
						return;
					}

					ta = (ta * mulA) >> 8;

					uint32_t destCol = dest;
					unsigned int dr = static_cast<unsigned int>((destCol >> 0) & 0xff);
					unsigned int dg = static_cast<unsigned int>((destCol >> 8) & 0xff);
					unsigned int db = static_cast<unsigned int>((destCol >>16) & 0xff);
					unsigned int invA = 256 - ta;
					dr = (dr * invA) >> 8; dg = (dg * invA) >> 8;
					db = (db * invA) >> 8;

					unsigned int outR = tr + dr, outG = tg + dg, outB = tb + db;
					outR = std::min(outR, 255U);
					outG = std::min(outG, 255U);
					outB = std::min(outB, 255U);

					dest = outR | (outG << 8) | (outB << 16);
				};

				// Draws the pixels [x1, x2) of row y, clipped to the
				// framebuffer width. z2 is accepted but not used yet.
				auto drawScanline = [tw, th, tpixels, bmp, fbW, depthBuffer, &drawPixel, &r,
									 &ditherMap]
				(int y, int x1, int x2,
				 const SWImageVarying& vary1,
				 const SWImageVarying& vary2,
				 float z1, float z2) {
					SPAssert(x1 < x2);
					const int minX = std::max(x1, 0);
					const int maxX = std::min(x2, fbW);
					if(minX >= maxX) {
						// span is entirely outside of the framebuffer
						return;
					}

					uint32_t *out = bmp + (y * fbW) + minX;
					float *depthOut = nullptr;
					if(depthTest) {
						depthOut = depthBuffer + (y * fbW) + minX;
					}

					SWImageGouraudInterpolator<level> vary(vary1, vary2, x2 - x1);
					vary.MoveNext(minX - x1);
					r.pixelsDrawn += maxX - minX;

					// four entries (two UV offsets) per row parity
					auto *ditherMap2 = ditherMap + ((y & 1) << 2);
					for(int x = minX; x < maxX; x++) {
						auto vr = vary.GetCurrent();
						unsigned int u = static_cast<unsigned int>(vr.u);
						unsigned int v = static_cast<unsigned int>(vr.v);
						if(linearInterpolate) {
							uint8_t idx = static_cast<uint8_t>(x & 1);
							u += ditherMap2[idx * 2];
							v += ditherMap2[idx * 2 + 1];
						}
						u &= texUVScaleInt-1; // repeat
						v &= texUVScaleInt-1;
						u = (u * tw) >> texUVScaleBits;
						v = (v * th) >> texUVScaleBits;
						uint32_t tex = tpixels[u + v * tw];
						// FIXME: Z interpolation
						// FIXME: perspective correction
						drawPixel(*out, depthOut, tex, z1);
						out++;
						if(depthTest) {
							depthOut ++;
						}
						vary.MoveNext();
					}
				};

				// FIXME: interpolated Z

				// The long edge v1 -> v3 is shared by both halves of the
				// triangle. longY tracks the row it is currently positioned
				// at so that it is never advanced twice over the same rows
				// (which used to happen when v2 was above the framebuffer).
				Interpolator longSpanX(x1, x3, y3 - y1);
				SWImageGouraudInterpolator<level> longSpan(v1, v3, y3 - y1);
				int longY = y1;

				// Draws the rows between the short edge top -> bottom and the
				// long edge.
				auto drawHalf = [&](const Vertex& top, const Vertex& bottom,
									int topX, int topY, int bottomX, int bottomY) {
					const int minY = std::max(0, topY);
					const int maxY = std::min(fbH, bottomY);
					if(minY >= maxY) {
						// no visible row; leave the long edge where it is
						return;
					}

					Interpolator shortSpanX(topX, bottomX, bottomY - topY);
					SWImageGouraudInterpolator<level> shortSpan(top, bottom, bottomY - topY);
					shortSpanX.MoveNext(minY - topY);
					shortSpan.MoveNext(minY - topY);
					longSpanX.MoveNext(minY - longY);
					longSpan.MoveNext(minY - longY);

					for(int y = minY; y < maxY; y++) {
						const int lineX1 = shortSpanX.GetCurrent();
						const auto line1 = shortSpan.GetCurrent();
						const int lineX2 = longSpanX.GetCurrent();
						const auto line2 = longSpan.GetCurrent();
						shortSpanX.MoveNext();
						shortSpan.MoveNext();
						longSpanX.MoveNext();
						longSpan.MoveNext();
						if(lineX1 == lineX2) continue;
						if(lineX1 < lineX2) {
							drawScanline(y, lineX1, lineX2, line1, line2,
										 v1.position.z, v1.position.z);
						}else{
							drawScanline(y, lineX2, lineX1, line2, line1,
										 v1.position.z, v1.position.z);
						}
					}
					longY = maxY;
				};

				drawHalf(v1, v2, x1, y1, x2, y2);
				drawHalf(v2, v3, x2, y2, x3, y3);
				// polygon, done!
			}

			/**
			 * Entry point: orders the three vertices by ascending Y and
			 * forwards them to DrawPolygonInternalInner.
			 */
			static void DrawPolygonInternal(SWImage *img,
											const Vertex& v1,
											const Vertex& v2,
											const Vertex& v3,
											SWImageRenderer& r) {
				if(v2.position.y < v1.position.y) {
					if(v3.position.y < v2.position.y) {
						DrawPolygonInternalInner(img, v3, v2, v1, r);
					}else if(v3.position.y < v1.position.y) {
						DrawPolygonInternalInner(img, v2, v3, v1, r);
					}else{
						DrawPolygonInternalInner(img, v2, v1, v3, r);
					}
				}else if(v3.position.y < v1.position.y){
					DrawPolygonInternalInner(img, v3, v1, v2, r);
				}else if(v3.position.y < v2.position.y){
					DrawPolygonInternalInner(img, v1, v3, v2, r);
				}else{
					DrawPolygonInternalInner(img, v1, v2, v3, r);
				}
			}
		};

		// TODO: Non-SSE2 renderer for solid polygons

#pragma mark - SSE2
#if ENABLE_SSE2

#pragma mark General
		/**
		 * SSE2 textured triangle rasterizer (specialization for
		 * solidFill == false with pre-transformed, denormalized vertices).
		 *
		 * Works like the scalar renderer but blends with SSE2 arithmetic and
		 * processes two horizontally adjacent pixels at a time wherever the
		 * span allows it. The depth of the topmost vertex is used for the
		 * whole polygon; depth interpolation and perspective correction are
		 * not implemented (see the FIXMEs).
		 */
		template<
		bool depthTest,
		bool linearInterpolate
		>
		struct SWImageRenderer::PolygonRenderer
		<SWFeatureLevel::SSE2, false, false, depthTest, false, linearInterpolate> {


			/**
			 * Rasterizes one triangle whose vertices are already ordered by
			 * ascending Y (v1 topmost, v3 bottommost).
			 *
			 * @param img texture to sample; must not be null
			 * @param r   renderer providing the frame / depth buffers and
			 *            receiving the pixelsDrawn statistics
			 */
			static void DrawPolygonInternalInner(SWImage *img,
												 const Vertex& v1,
												 const Vertex& v2,
												 const Vertex& v3,
												 SWImageRenderer& r) {


				Bitmap *const fb = r.frame;
				SPAssert(fb != nullptr);

				if(v3.position.y <= 0.f) {
					// viewport cull
					return;
				}

				const int fbW = fb->GetWidth();
				const int fbH = fb->GetHeight();
				uint32_t *const bmp = fb->GetPixels();

				if(v1.position.y >= static_cast<float>(fbH)) {
					// viewport cull
					return;
				}

				float *const depthBuffer = r.depthBuffer;
				if(depthTest){
					SPAssert(depthBuffer != nullptr);
				}

				uint32_t *const tpixels = img->GetRawBitmap();
				const int tw = img->GetRawWidth();
				const int th = img->GetRawHeight();

				const int x1 = static_cast<int>(v1.position.x);
				const int y1 = static_cast<int>(v1.position.y);
				const int x2 = static_cast<int>(v2.position.x);
				const int y2 = static_cast<int>(v2.position.y);
				const int x3 = static_cast<int>(v3.position.x);
				const int y3 = static_cast<int>(v3.position.y);

				if(x1 == x2 && x2 == x3) return; // area cull
				if(y1 == y3) return; // area cull
				if(std::min(std::min(x1, x2), x3) >= fbW) return; // viewport cull
				if(std::max(std::max(x1, x2), x3) <= 0) return; // viewport cull

				// color multipliers in [0, 256] so that ">> 8" normalizes
				auto convertColor = [](float f) {
					int i = static_cast<int>(f * 256.f + .5f);
					return static_cast<unsigned short>(std::max(std::min(i, 256), 0));
				};
				unsigned short mulR = convertColor(v1.color.x);
				unsigned short mulG = convertColor(v1.color.y);
				unsigned short mulB = convertColor(v1.color.z);
				unsigned short mulA = convertColor(v1.color.w);

				if(mulA == 0 && mulR == 0 && mulG == 0 && mulB == 0)
					return;

				// unreal-ish linear interpolation by using dither
				// (multiplication instead of "<< 1": shifting a negative value
				// left is undefined before C++20)
				int16_t ditherBaseX = static_cast<int16_t>((1 << (texUVScaleBits - 2)) * img->GetInvWidth());
				int16_t ditherBaseY = static_cast<int16_t>((1 << (texUVScaleBits - 2)) * img->GetInvHeight());
				int16_t ditherMap[] = {
					static_cast<int16_t>(-ditherBaseX), static_cast<int16_t>(-ditherBaseY * 2),
					0, static_cast<int16_t>(ditherBaseY),
					static_cast<int16_t>(ditherBaseX), 0,
					static_cast<int16_t>(-ditherBaseX * 2), static_cast<int16_t>(-ditherBaseY),
				};

				// same offsets packed per row parity as [u1, v1, u2, v2] for
				// the two-pixel path
				__m128i ditherMap2[] = {
					_mm_setr_epi32(ditherMap[0], ditherMap[1], ditherMap[2], ditherMap[3]),
					_mm_setr_epi32(ditherMap[4], ditherMap[5], ditherMap[6], ditherMap[7])
				};

				// per-channel multipliers, in the byte order of a pixel,
				// repeated for the second pixel of a pair
				__m128i mulCol = _mm_setr_epi16(mulB, mulG, mulR, mulA,
												mulB, mulG, mulR, mulA);

				// Blends one texel over one framebuffer pixel. The depth is
				// passed as a pointer because it is null when depthTest is
				// off; it is only dereferenced when depthTest is on.
				auto drawPixel = [mulCol, mulA]
				(uint32_t& dest, const float *destDepth,
				 uint32_t texture, float inDepth) {
					if(depthTest) {
						if(inDepth > *destDepth) {
							return;
						}
					}

					if(texture == 0)
						return; // transparent

					unsigned int ta = static_cast<unsigned int>(texture >> 24);
					ta += (ta >> 7); // [0, 255] -> [0, 256]

					// load [u8.0x4]
					__m128i tcol = _mm_setr_epi32(texture, 0,0,0);

					// convert to [u16.0x4], 8bit width
					tcol = _mm_unpacklo_epi8(tcol, _mm_setzero_si128());
					//tcol = _mm_shufflelo_epi16(tcol, 0b11000110); // swap BGR/RGB

					if(ta == 256 && mulA == 256) {
						// opaque
						tcol = _mm_mullo_epi16(tcol, mulCol);

						// pack.
						tcol = _mm_srli_epi16(tcol, 8);
						tcol = _mm_packus_epi16(tcol, tcol);

						// store.
						_mm_store_ss(reinterpret_cast<float *>(&dest),
									 _mm_castsi128_ps(tcol));

						return;
					}

					// tcol is already premultiplied. see SWImage.cpp

					// modulate by the constant color. now [u8.8x4], 8bit width
					tcol = _mm_mullo_epi16(tcol, mulCol);

					// broadcast the alpha of the tcol.
					__m128i tcolAlphaVec = _mm_shufflelo_epi16(tcol, 0xff);

					// make tcol [u8.8x4]
					//tcol = _mm_slli_epi16(tcol, 1);
					tcolAlphaVec = _mm_srli_epi16(tcolAlphaVec, 8); // make [u16x4]8bw
					tcolAlphaVec = _mm_add_epi16(tcolAlphaVec,
												 _mm_srli_epi16(tcolAlphaVec, 7)); // [0,255] -> [0,256]

					// inverse the alpha
					tcolAlphaVec = _mm_sub_epi16(_mm_set1_epi16(0x100),
												 tcolAlphaVec);

					// load [u8.0x4]
					__m128i dcol = _mm_setr_epi32(dest, 0,0,0);

					// convert to [u16.0x4], 8bit width
					dcol = _mm_unpacklo_epi8(dcol, _mm_setzero_si128());

					// modulate by inversed src alpha.
					// now [u8.8 x 4]
					dcol = _mm_mullo_epi16(dcol, tcolAlphaVec);

					// additive blending with saturation.
					dcol = _mm_adds_epu16(dcol, tcol);

					// pack.
					dcol = _mm_srli_epi16(dcol, 8);
					dcol = _mm_packus_epi16(dcol, dcol);

					// store.
					_mm_store_ss(reinterpret_cast<float *>(&dest),
								 _mm_castsi128_ps(dcol));
				};

				// Blends two texels over two adjacent framebuffer pixels.
				auto drawPixel2 = [mulCol, &drawPixel]
				(uint32_t *dest, float *destDepth,
				 uint32_t texture1, float inDepth1,
				 uint32_t texture2, float inDepth2) {
					if(depthTest) {
						if(inDepth1 > destDepth[0]) {
							// first pixel is hidden; only the second may be drawn
							drawPixel(dest[1], destDepth + 1,
									  texture2, inDepth2);
							return;
						}
						if(inDepth2 > destDepth[1]) {
							// second pixel is hidden; draw the FIRST one with
							// its own texel and depth
							drawPixel(dest[0], destDepth,
									  texture1, inDepth1);
							return;
						}
					}

					if(texture1 == 0 && texture2 == 0)
						return; // transparent

					// load [u8.0x4]
					__m128i tcol = _mm_setr_epi32(texture1, texture2, 0,0);

					// convert to [u16.0x4, u16.0x4], 8bit width
					tcol = _mm_unpacklo_epi8(tcol, _mm_setzero_si128());
					//tcol = _mm_shufflelo_epi16(tcol, 0b11000110); // swap BGR/RGB
					//tcol = _mm_shufflehi_epi16(tcol, 0b11000110); // swap BGR/RGB

					// FIXME: there is no opaque fast path for pixel pairs
					//        (drawPixel has one for single pixels)

					// tcol is already premultiplied. see SWImage.cpp

					// modulate by the constant color. now [u8.8x4, u8.8x4], 8bit width
					tcol = _mm_mullo_epi16(tcol, mulCol);

					// broadcast the alpha of the tcol.
					__m128i tcolAlphaVec = _mm_shufflelo_epi16(tcol, 0xff);
					tcolAlphaVec = _mm_shufflehi_epi16(tcolAlphaVec, 0xff);

					// make tcol [u8.8x4]
					//tcol = _mm_slli_epi16(tcol, 1);
					tcolAlphaVec = _mm_srli_epi16(tcolAlphaVec, 8); // make [u16x4,u16x4]8bw
					tcolAlphaVec = _mm_add_epi16(tcolAlphaVec,
												 _mm_srli_epi16(tcolAlphaVec, 7)); // [0,255] -> [0,256]

					// inverse the alpha
					tcolAlphaVec = _mm_sub_epi16(_mm_set1_epi16(0x100),
												 tcolAlphaVec);

					// load [u8.0 x 4 x 2]
					__m128i dcol = _mm_setr_epi32(dest[0], dest[1], 0,0);

					// convert to [u16.0 x 4 x 2], 8bit width
					dcol = _mm_unpacklo_epi8(dcol, _mm_setzero_si128());

					// modulate by inversed src alpha.
					// now [u8.8 x 4 x 2]
					dcol = _mm_mullo_epi16(dcol, tcolAlphaVec);

					// additive blending with saturation.
					dcol = _mm_adds_epu16(dcol, tcol);

					// pack.
					dcol = _mm_srli_epi16(dcol, 8);
					dcol = _mm_packus_epi16(dcol, dcol);

					// store.
					_mm_store_sd(reinterpret_cast<double *>(dest),
								 _mm_castsi128_pd(dcol));
				};

				// Draws the pixels [x1, x2) of row y, clipped to the
				// framebuffer width. z2 is accepted but not used yet.
				auto drawScanline = [tw, th, tpixels, bmp, fbW, depthBuffer, &drawPixel, &drawPixel2, &r,
									 &ditherMap, &ditherMap2]
				(int y, int x1, int x2,
				 const SWImageVarying& vary1,
				 const SWImageVarying& vary2,
				 float z1, float z2) {
					SPAssert(x1 < x2);
					int minX = std::max(x1, 0);
					int maxX = std::min(x2, fbW);
					if(minX >= maxX) {
						// Span is entirely outside of the framebuffer. Without
						// this check the trailing-pixel code below could
						// still write one pixel outside of the row.
						return;
					}

					uint32_t *out = bmp + (y * fbW) + minX;
					float *depthOut = nullptr;
					if(depthTest) {
						depthOut = depthBuffer + (y * fbW) + minX;
					}

					SWImageGouraudInterpolator<SWFeatureLevel::SSE2> vary(vary1, vary2, x2 - x1);
					r.pixelsDrawn += maxX - minX;
					vary.MoveNext(minX - x1);

					auto uvMask = _mm_set1_epi32(texUVScaleInt - 1);
					auto uvScale = _mm_setr_epi32(tw, tw, th, th);

					uint8_t ditherIndex = static_cast<uint8_t>(y & 1) << 1;

					// Draws the single pixel at `out` and advances by one.
					// Used for the odd leading pixel and the trailing pixel.
					auto unalignedPixel = [&]() {
						auto vr = vary.GetCurrent();
						union {
							__m128i uv;
							struct { unsigned int ui, dummy1, vi, dummy2; } iuv;
						};
						uv = vr.uv_m128;

						if(linearInterpolate) {
							uint8_t idx = static_cast<uint8_t>(minX & 1) | ditherIndex;
							auto m = _mm_setr_epi32(ditherMap[idx * 2], 0, ditherMap[idx * 2 + 1], 0);
							uv = _mm_add_epi32(uv, m);
						}

						uv = _mm_and_si128(uv, uvMask); // repeat
						uv = _mm_mul_epu32(uv, uvScale); // now [u*tw, v*th]
						uv = _mm_srli_epi64(uv, texUVScaleBits);

						uint32_t tex = tpixels[iuv.ui + iuv.vi * tw];
						// FIXME: Z interpolation
						// FIXME: perspective correction
						drawPixel(*out, depthOut, tex, z1);
						out++;
						if(depthTest) {
							depthOut ++;
						}
						vary.MoveNext();
					};
					while((minX & 1) && (minX < maxX)) {
						// non-aligned.
						unalignedPixel();
						minX++;
					}
					// an odd end leaves one pixel for after the paired loop
					int reminders = maxX & 1;
					maxX -= reminders;
					auto dither = ditherMap2[y & 1];
					for(int x = minX; x < maxX; x+=2) {
						auto vr1 = vary.GetCurrent();
						vary.MoveNext();
						auto vr2 = vary.GetCurrent();
						union {
							__m128i uv;
							struct { unsigned int ui, dummy1, vi, dummy2; } iuv;
						};
						//static_assert(texUVScaleBits == 16, "texUVScaleBits must be 16");
						uv = _mm_castps_si128
						(_mm_shuffle_ps(_mm_castsi128_ps(vr1.uv_m128),
											_mm_castsi128_ps(vr2.uv_m128), 0x88)); // [u1,v1,u2,v2]

						if(linearInterpolate) {
							uv =_mm_add_epi32(uv, dither);
						}

						uv = _mm_shuffle_epi32(uv, 0xd8); // [u1,u2,v1,v2]
						uv = _mm_and_si128(uv, uvMask); // repeat

						auto tm = uv;
						uv = _mm_mul_epu32(uv, uvScale);
						uv = _mm_srli_epi64(uv, texUVScaleBits);
						uint32_t tex1 = tpixels[iuv.ui + iuv.vi * tw];

						uv = _mm_shuffle_epi32(tm, 0xb1); // [u2,u1,v2,v1]
						uv = _mm_mul_epu32(uv, uvScale);
						uv = _mm_srli_epi64(uv, texUVScaleBits);
						uint32_t tex2 = tpixels[iuv.ui + iuv.vi * tw];
						// FIXME: Z interpolation
						// FIXME: perspective correction

						drawPixel2(out, depthOut, tex1, z1, tex2, z1);
						out += 2;
						if(depthTest) {
							depthOut += 2;
						}
						vary.MoveNext();
					}
					while(reminders--) {
						unalignedPixel();
					}
				};

				// FIXME: interpolated Z

				// The long edge v1 -> v3 is shared by both halves of the
				// triangle. longY tracks the row it is currently positioned
				// at so that it is never advanced twice over the same rows
				// (which used to happen when v2 was above the framebuffer).
				Interpolator longSpanX(x1, x3, y3 - y1);
				SWImageGouraudInterpolator<SWFeatureLevel::SSE2> longSpan(v1, v3, y3 - y1);
				int longY = y1;

				// Draws the rows between the short edge top -> bottom and the
				// long edge.
				auto drawHalf = [&](const Vertex& top, const Vertex& bottom,
									int topX, int topY, int bottomX, int bottomY) {
					const int minY = std::max(0, topY);
					const int maxY = std::min(fbH, bottomY);
					if(minY >= maxY) {
						// no visible row; leave the long edge where it is
						return;
					}

					Interpolator shortSpanX(topX, bottomX, bottomY - topY);
					SWImageGouraudInterpolator<SWFeatureLevel::SSE2> shortSpan(top, bottom, bottomY - topY);
					shortSpanX.MoveNext(minY - topY);
					shortSpan.MoveNext(minY - topY);
					longSpanX.MoveNext(minY - longY);
					longSpan.MoveNext(minY - longY);

					for(int y = minY; y < maxY; y++) {
						const int lineX1 = shortSpanX.GetCurrent();
						const auto line1 = shortSpan.GetCurrent();
						const int lineX2 = longSpanX.GetCurrent();
						const auto line2 = longSpan.GetCurrent();
						shortSpanX.MoveNext();
						shortSpan.MoveNext();
						longSpanX.MoveNext();
						longSpan.MoveNext();
						if(lineX1 == lineX2) continue;
						if(lineX1 < lineX2) {
							drawScanline(y, lineX1, lineX2, line1, line2,
										 v1.position.z, v1.position.z);
						}else{
							drawScanline(y, lineX2, lineX1, line2, line1,
										 v1.position.z, v1.position.z);
						}
					}
					longY = maxY;
				};

				drawHalf(v1, v2, x1, y1, x2, y2);
				drawHalf(v2, v3, x2, y2, x3, y3);
				// polygon, done!
			}

			/**
			 * Entry point: orders the three vertices by ascending Y and
			 * forwards them to DrawPolygonInternalInner.
			 */
			static void DrawPolygonInternal(SWImage *img,
											const Vertex& v1,
											const Vertex& v2,
											const Vertex& v3,
											SWImageRenderer& r) {
				if(v2.position.y < v1.position.y) {
					if(v3.position.y < v2.position.y) {
						DrawPolygonInternalInner(img, v3, v2, v1, r);
					}else if(v3.position.y < v1.position.y) {
						DrawPolygonInternalInner(img, v2, v3, v1, r);
					}else{
						DrawPolygonInternalInner(img, v2, v1, v3, r);
					}
				}else if(v3.position.y < v1.position.y){
					DrawPolygonInternalInner(img, v3, v1, v2, r);
				}else if(v3.position.y < v2.position.y){
					DrawPolygonInternalInner(img, v1, v3, v2, r);
				}else{
					DrawPolygonInternalInner(img, v1, v2, v3, r);
				}
			}
		};

#pragma mark Solid
		// SSE2 rasterizer for solid-filled triangles whose vertex positions are
		// already expressed in framebuffer pixels (no transform, not NDC).
		//
		// The whole triangle is filled with the color of the first vertex
		// handed to DrawPolygonInternalInner, and the depth test (when enabled)
		// uses that vertex's position.z for every pixel; neither color nor Z is
		// interpolated.
		template<
		bool depthTest,
		bool lerp
		>
		struct SWImageRenderer::PolygonRenderer
		<SWFeatureLevel::SSE2, false, false, depthTest, true, lerp> {


			// Rasterizes one triangle into r.frame.
			//
			// v1, v2, v3 must be ordered by ascending position.y (see
			// DrawPolygonInternal). `img` is not used by the solid fill.
			// When depthTest is enabled, r.depthBuffer is read but never
			// written.
			static void DrawPolygonInternalInner(SWImage *img,
												 const Vertex& v1,
												 const Vertex& v2,
												 const Vertex& v3,
												 SWImageRenderer& r) {


				Bitmap *const fb = r.frame;
				SPAssert(fb != nullptr);

				if(v3.position.y <= 0.f) {
					// viewport cull
					return;
				}

				const int fbW = fb->GetWidth();
				const int fbH = fb->GetHeight();
				uint32_t *const bmp = fb->GetPixels();

				if(v1.position.y >= static_cast<float>(fbH)) {
					// viewport cull
					return;
				}

				float *const depthBuffer = r.depthBuffer;
				if(depthTest){
					SPAssert(depthBuffer != nullptr);
				}

				const int x1 = static_cast<int>(v1.position.x);
				const int y1 = static_cast<int>(v1.position.y);
				const int x2 = static_cast<int>(v2.position.x);
				const int y2 = static_cast<int>(v2.position.y);
				const int x3 = static_cast<int>(v3.position.x);
				const int y3 = static_cast<int>(v3.position.y);

				if(x1 == x2 && x2 == x3) return; // area cull
				if(y1 == y3) return; // area cull
				if(std::min(std::min(x1, x2), x3) >= fbW) return; // viewport cull
				if(std::max(std::max(x1, x2), x3) <= 0) return; // viewport cull

				auto convertColor = [](float f) {
					// 255.f, not 256.f here because for solid rendering
					// this color is directly used
					int i = static_cast<int>(f * 255.f + .5f);
					return static_cast<unsigned short>(std::max(std::min(i, 255), 0));
				};
				unsigned short mulR = convertColor(v1.color.x);
				unsigned short mulG = convertColor(v1.color.y);
				unsigned short mulB = convertColor(v1.color.z);
				unsigned short mulA = convertColor(v1.color.w);

				// all-zero source leaves the destination unchanged
				if(mulA == 0 && mulR == 0 && mulG == 0 && mulB == 0)
					return;

				// source color as [u8.8 x 4 x 2], lanes ordered B, G, R, A
				__m128i mulCol = _mm_setr_epi16(mulB, mulG, mulR, mulA,
												mulB, mulG, mulR, mulA);
				// destination weight; (mulA >> 7) maps alpha 0..255 onto 0..256
				// so that alpha 255 yields a weight of exactly zero
				__m128i mulInv = _mm_set1_epi16(256 - (mulA + (mulA >> 7)));
				mulCol = _mm_slli_epi16(mulCol, 8);

				// Blends the source color into one pixel.
				// destDepth is only dereferenced when depthTest is enabled, so
				// it may be null otherwise.
				auto drawPixel = [mulCol, mulInv]
				(uint32_t& dest, const float *destDepth,
				 float inDepth) {
					if(depthTest) {
						if(inDepth > *destDepth) {
							return;
						}
					}

					// load [u8.8x4]8bw
					__m128i tcol = mulCol;

					// load [u8.0x4]
					__m128i dcol = _mm_setr_epi32(dest, 0,0,0);

					// convert to [u16.0x4], 8bit width
					dcol = _mm_unpacklo_epi8(dcol, _mm_setzero_si128());

					// modulate by inversed src alpha.
					// now [u8.8 x 4]
					dcol = _mm_mullo_epi16(dcol, mulInv);

					// additive blending with saturation.
					dcol = _mm_adds_epu16(dcol, tcol);

					// pack.
					dcol = _mm_srli_epi16(dcol, 8);
					dcol = _mm_packus_epi16(dcol, dcol);

					// store.
					_mm_store_ss(reinterpret_cast<float *>(&dest),
								 _mm_castsi128_ps(dcol));
				};

				// Blends the source color into two horizontally adjacent
				// pixels. When exactly one of them fails the depth test the
				// other one is drawn through drawPixel with its own depth.
				auto drawPixel2 = [mulCol, mulInv, &drawPixel]
				(uint32_t *dest, const float *destDepth,
				 float inDepth1,
				 float inDepth2) {
					if(depthTest) {
						if(inDepth1 > destDepth[0]) {
							drawPixel(dest[1], destDepth + 1,
									  inDepth2);
							return;
						}
						if(inDepth2 > destDepth[1]) {
							drawPixel(dest[0], destDepth,
									  inDepth1);
							return;
						}
					}

					// load [u8.8 x 4 x 2]
					__m128i tcol = mulCol;

					// load [u8.0 x 4 x 2]
					__m128i dcol = _mm_setr_epi32(dest[0], dest[1], 0,0);

					// convert to [u16.0 x 4 x 2], 8bit width
					dcol = _mm_unpacklo_epi8(dcol, _mm_setzero_si128());

					// modulate by inversed src alpha.
					// now [u8.8 x 4 x 2]
					dcol = _mm_mullo_epi16(dcol, mulInv);

					// additive blending with saturation.
					dcol = _mm_adds_epu16(dcol, tcol);

					// pack.
					dcol = _mm_srli_epi16(dcol, 8);
					dcol = _mm_packus_epi16(dcol, dcol);

					// store.
					_mm_store_sd(reinterpret_cast<double *>(dest),
								 _mm_castsi128_pd(dcol));
				};

				// Fills the pixels [x1, x2) of row y, clipped to the
				// framebuffer width, using the constant depth z.
				auto drawScanline = [bmp, fbW, depthBuffer, &drawPixel, &drawPixel2, &r]
				(int y, int x1, int x2, float z) {
					SPAssert(x1 < x2);

					int minX = std::max(x1, 0);
					const int maxX = std::min(x2, fbW);
					if(minX >= maxX) {
						// The span lies completely outside the viewport.
						// Without this check the odd-pixel handling below
						// could still touch a pixel outside of the span.
						return;
					}
					r.pixelsDrawn += maxX - minX;

					uint32_t *out = bmp + (y * fbW) + minX;
					float *depthOut = nullptr;
					if(depthTest) {
						depthOut = depthBuffer + (y * fbW) + minX;
					}

					auto singlePixel = [&]() {
						// FIXME: Z interpolation
						drawPixel(*out, depthOut, z);
						out++;
						if(depthTest) {
							depthOut++;
						}
					};

					// leading pixel at an odd X, so pairs start at an even X
					if(minX & 1) {
						singlePixel();
						minX++;
					}

					// pixel pairs
					const int pairEnd = maxX - (maxX & 1);
					for(int x = minX; x < pairEnd; x += 2) {
						// FIXME: Z interpolation
						drawPixel2(out, depthOut, z, z);
						out += 2;
						if(depthTest) {
							depthOut += 2;
						}
					}

					// trailing pixel when the span ends at an odd X
					if(maxX & 1) {
						singlePixel();
					}
				};

				// FIXME: interpolated Z
				const float flatZ = v1.position.z;

				// X along the edge v1-v3, which spans the triangle's full
				// height. longY is the row this interpolator currently points
				// at; it is tracked so that each half advances it by exactly
				// the number of rows still missing, even when the other half
				// was skipped because it is outside of the viewport.
				Interpolator longSpanX(x1, x3, y3 - y1);
				int longY = y1;

				// Fills the rows between the long edge and the short edge
				// running from (xTop, yTop) to (xBottom, yBottom).
				auto fillHalf = [&](int xTop, int yTop, int xBottom, int yBottom) {
					const int minY = std::max(0, yTop);
					const int maxY = std::min(fbH, yBottom);
					if(minY >= maxY) {
						// nothing visible in this half
						return;
					}

					Interpolator shortSpanX(xTop, xBottom, yBottom - yTop);
					shortSpanX.MoveNext(minY - yTop);
					longSpanX.MoveNext(minY - longY);

					for(int y = minY; y < maxY; y++) {
						const int lineX1 = shortSpanX.GetCurrent();
						const int lineX2 = longSpanX.GetCurrent();
						shortSpanX.MoveNext();
						longSpanX.MoveNext();
						if(lineX1 < lineX2) {
							drawScanline(y, lineX1, lineX2, flatZ);
						}else if(lineX2 < lineX1) {
							drawScanline(y, lineX2, lineX1, flatZ);
						}
						// lineX1 == lineX2: empty span
					}
					longY = maxY;
				};

				fillHalf(x1, y1, x2, y2);
				fillHalf(x2, y2, x3, y3);
				// polygon, done!
			}

			// Chooses the order of the three vertices by comparing their
			// position.y values (smallest y first) and rasterizes the
			// triangle with DrawPolygonInternalInner.
			static void DrawPolygonInternal(SWImage *img,
											const Vertex& v1,
											const Vertex& v2,
											const Vertex& v3,
											SWImageRenderer& r) {
				if(v2.position.y < v1.position.y) {
					if(v3.position.y < v2.position.y) {
						DrawPolygonInternalInner(img, v3, v2, v1, r);
					}else if(v3.position.y < v1.position.y) {
						DrawPolygonInternalInner(img, v2, v3, v1, r);
					}else{
						DrawPolygonInternalInner(img, v2, v1, v3, r);
					}
				}else if(v3.position.y < v1.position.y){
					DrawPolygonInternalInner(img, v3, v1, v2, r);
				}else if(v3.position.y < v2.position.y){
					DrawPolygonInternalInner(img, v1, v3, v2, r);
				}else{
					DrawPolygonInternalInner(img, v1, v2, v3, r);
				}
			}
		};

#endif

#pragma mark - Intermediates

		// Intermediate stage for untransformed vertices given in normalized
		// device coordinates: scales each position by r.fbSize4, offsets it by
		// r.fbCenter4 and hands the result to the rasterizer specialization
		// that works on framebuffer coordinates.
		template<
		SWFeatureLevel featureLvl,
		bool depthTest,
		bool solidFill,
		bool lerp
		>
		struct SWImageRenderer::PolygonRenderer<featureLvl, false, true, depthTest, solidFill, lerp> {
			static void DrawPolygonInternal(SWImage *img,
											const Vertex& v1,
											const Vertex& v2,
											const Vertex& v3,
											SWImageRenderer& r) {
				using Rasterizer =
				PolygonRenderer<featureLvl, false, false, depthTest, solidFill, lerp>;

				// denormalize a copy of the vertex
				auto toFramebuffer = [&r](Vertex vtx) {
					vtx.position = (vtx.position * r.fbSize4) + r.fbCenter4;
					return vtx;
				};

				const Vertex fbV1 = toFramebuffer(v1);
				const Vertex fbV2 = toFramebuffer(v2);
				const Vertex fbV3 = toFramebuffer(v3);

				Rasterizer::DrawPolygonInternal(img, fbV1, fbV2, fbV3, r);
			}
		};

		// Stage for vertices that still need to be transformed: multiplies the
		// positions by r.matrix, clips the triangle against the near plane,
		// performs the perspective divide and forwards the result to the NDC
		// stage.
		template<
		SWFeatureLevel featureLvl,
		bool ndc, // normalized device coordinate
		bool depthTest,
		bool solidFill,
		bool lerp
		>
		struct SWImageRenderer::PolygonRenderer<featureLvl, true, ndc, depthTest, solidFill, lerp> {
			// Clips the triangle (v1, v2, v3) against `plane` and calls
			// `continuation(a, b, c)` once for every resulting triangle
			// (never, once or twice).
			//
			// A vertex is kept when
			//   position.x * plane.x + position.y * plane.y
			//     + position.z * plane.z + plane.w >= 0
			// (position.w does not take part in the test).
			//
			// When exactly one vertex is kept, the other two of v1/v2/v3 are
			// overwritten in place with the intersection points before the
			// continuation is invoked.
			template<class F>
			static void Clip(Vertex& v1, Vertex& v2, Vertex& v3,
							 Vector4 plane, F continuation) {
				auto distance = [](const Vector4& v, const Vector4& plane) {
					return v.x * plane.x + v.y * plane.y + v.z * plane.z + plane.w;
				};
				// `out` may be the same object as `from`; every attribute is
				// computed from its own old value only, so this is safe.
				auto lerpVertex = [](const Vertex& from, const Vertex& to, Vertex& out,
									 float per) {
					out.position = from.position + (to.position - from.position) * per;
					out.color = from.color + (to.color - from.color) * per;
					out.uv = from.uv + (to.uv - from.uv) * per;
				};
				// Emits the quad (a, b, c, d) as the triangles (a, b, c) and
				// (a, c, d). Copies are passed because the continuation
				// receives non-const references.
				auto emitQuad = [&continuation](const Vertex& a, const Vertex& b,
												const Vertex& c, const Vertex& d) {
					Vertex p = a, q = b, s = c;
					continuation(p, q, s);
					p = a;
					q = c;
					s = d;
					continuation(p, q, s);
				};

				const float d1 = distance(v1.position, plane);
				const float d2 = distance(v2.position, plane);
				const float d3 = distance(v3.position, plane);
				const bool nc1 = d1 >= 0.f;
				const bool nc2 = d2 >= 0.f;
				const bool nc3 = d3 >= 0.f;
				// bit N is set when vertex N + 1 is kept
				const int bits = (nc1 ? 1 : 0) | (nc2 ? 2 : 0) | (nc3 ? 4 : 0);

				switch(bits){
					case 0:
						// culled
						return;
					case 7:
						// not clipped
						continuation(v1, v2, v3);
						return;
					case 1: {
						// only v1 is kept
						const float per1 = d2 / (d2 - d1); // == (0.f - d2) / (d1 - d2);
						const float per2 = d3 / (d3 - d1);
						lerpVertex(v2, v1, v2, per1);
						lerpVertex(v3, v1, v3, per2);
						continuation(v1, v2, v3);
						return;
					}
					case 2: {
						// only v2 is kept
						const float per1 = d1 / (d1 - d2);
						const float per2 = d3 / (d3 - d2);
						lerpVertex(v1, v2, v1, per1);
						lerpVertex(v3, v2, v3, per2);
						continuation(v1, v2, v3);
						return;
					}
					case 4: {
						// only v3 is kept
						const float per1 = d2 / (d2 - d3);
						const float per2 = d1 / (d1 - d3);
						lerpVertex(v2, v3, v2, per1);
						lerpVertex(v1, v3, v1, per2);
						continuation(v1, v2, v3);
						return;
					}
					case 3: {
						// v1 and v2 are kept, v3 is clipped away
						const float per1 = d2 / (d2 - d3);
						const float per2 = d1 / (d1 - d3);
						Vertex t1, t2;
						lerpVertex(v2, v3, t2, per1);
						lerpVertex(v1, v3, t1, per2);
						emitQuad(v1, v2, t2, t1);
						return;
					}
					case 5: {
						// v1 and v3 are kept, v2 is clipped away
						const float per1 = d3 / (d3 - d2);
						const float per2 = d1 / (d1 - d2);
						Vertex t1, t2;
						lerpVertex(v3, v2, t2, per1);
						lerpVertex(v1, v2, t1, per2);
						emitQuad(v1, v3, t2, t1);
						return;
					}
					case 6: {
						// v2 and v3 are kept, v1 is clipped away
						const float per1 = d3 / (d3 - d1);
						const float per2 = d2 / (d2 - d1);
						Vertex t1, t2;
						lerpVertex(v3, v1, t2, per1);
						lerpVertex(v2, v1, t1, per2);
						emitQuad(v2, v3, t2, t1);
						return;
					}
				}

			}

			// Transforms the triangle by r.matrix, clips it against the plane
			// z = r.zNear (keeping z >= r.zNear) and draws every resulting
			// triangle after dividing its positions by w. position.z keeps
			// its value from before the divide.
			//
			// NOTE(review): the far plane is not clipped. The original code
			// contained a second Clip call for it, but it was placed after an
			// unconditional return and therefore never ran.
			static void DrawPolygonInternal(SWImage *img,
											const Vertex& v1,
											const Vertex& v2,
											const Vertex& v3,
											SWImageRenderer& r) {
				// needs transform.
				auto vv1 = v1, vv2 = v2, vv3 = v3;
				const auto& mat = r.matrix;
				vv1.position = mat * vv1.position;
				vv2.position = mat * vv2.position;
				vv3.position = mat * vv3.position;

				Clip
				(vv1, vv2, vv3,
				 MakeVector4(0.f, 0.f, 1.f, -r.zNear),
				 [img,&r](Vertex& c1, Vertex& c2, Vertex& c3) {
					 Vertex p1 = c1;
					 Vertex p2 = c2;
					 Vertex p3 = c3;
					 // want to save Z
					 const float orig1 = p1.position.z;
					 const float orig2 = p2.position.z;
					 const float orig3 = p3.position.z;
#if ENABLE_SSE
					 // approximate reciprocals of the three W values,
					 // computed at once
					 float invW[4];
					 _mm_storeu_ps(invW,
								   _mm_rcp_ps(_mm_setr_ps(p1.position.w,
														  p2.position.w,
														  p3.position.w,
														  1.f)));
					 p1.position *= invW[0];
					 p2.position *= invW[1];
					 p3.position *= invW[2];
#else
					 p1.position /= p1.position.w;
					 p2.position /= p2.position.w;
					 p3.position /= p3.position.w;
#endif
					 p1.position.z = orig1;
					 p2.position.z = orig2;
					 p3.position.z = orig3;

					 PolygonRenderer<featureLvl, false, true, depthTest, solidFill, lerp>::DrawPolygonInternal
					 (img, p1, p2, p3, r);
				 });
			}
		};


		// Dispatch stage that selects between the solid-fill and the textured
		// renderer: the solid-fill one is used when no image is given or the
		// image reports IsWhiteImage().
		template<
		SWFeatureLevel level,
		bool needTransform,
		bool ndc, // normalized device coordinate
		bool depthTest,
		bool lerp
		>
		struct SWImageRenderer::PolygonRenderer3 {
			static void DrawPolygonInternal(SWImage *img,
											const Vertex& v1,
											const Vertex& v2,
											const Vertex& v3,
											SWImageRenderer& r) {
				using SolidRenderer =
				PolygonRenderer<level, needTransform, ndc, depthTest, true, lerp>;
				using TexturedRenderer =
				PolygonRenderer<level, needTransform, ndc, depthTest, false, lerp>;

				// img is only dereferenced when it is not null
				const bool useSolidFill = (img == nullptr) || img->IsWhiteImage();
				if(useSolidFill) {
					SolidRenderer::DrawPolygonInternal(img, v1, v2, v3, r);
				}else{
					TexturedRenderer::DrawPolygonInternal(img, v1, v2, v3, r);
				}
			}
		};


		// Dispatch stage that turns the run-time feature level into the
		// compile-time SWFeatureLevel template argument. The SSE2 renderer is
		// only considered when the build defines ENABLE_SSE2.
		template<
		bool needTransform,
		bool ndc, // normalized device coordinate
		bool depthTest,
		bool lerp
		>
		struct SWImageRenderer::PolygonRenderer2 {
			static void DrawPolygonInternal(SWImage *img,
											const Vertex& v1,
											const Vertex& v2,
											const Vertex& v3,
											SWImageRenderer& r,
											SWFeatureLevel lvl) {
#if ENABLE_SSE2
				const int requestedLevel = static_cast<int>(lvl);
				const int sse2Level = static_cast<int>(SWFeatureLevel::SSE2);
				if(requestedLevel >= sse2Level) {
					using SSE2Renderer =
					PolygonRenderer3<SWFeatureLevel::SSE2, needTransform, ndc, depthTest, lerp>;
					SSE2Renderer::DrawPolygonInternal(img, v1, v2, v3, r);
					return;
				}
#endif
				// generic fallback
				using GenericRenderer =
				PolygonRenderer3<SWFeatureLevel::None, needTransform, ndc, depthTest, lerp>;
				GenericRenderer::DrawPolygonInternal(img, v1, v2, v3, r);
			}
		};


		// Draws one triangle into the current framebuffer using the renderer
		// configuration that belongs to the active shader type.
		void SWImageRenderer::DrawPolygon(SWImage *img,
										  const Vertex& v1,
										  const Vertex& v2,
										  const Vertex& v3) {
			SPAssert(frame != nullptr);

			// template arguments: needs transform, in NDC, depth tested,
			// linear interpolation
			using SpriteRenderer = PolygonRenderer2<true, true, true, true>;
			// no transform, not NDC, no depth test, point sampling
			using ImageRenderer = PolygonRenderer2<false, false, false, false>;

			if(shader == ShaderType::Sprite) {
				SpriteRenderer::DrawPolygonInternal(img, v1, v2, v3, *this, featureLevel);
			}else if(shader == ShaderType::Image) {
				ImageRenderer::DrawPolygonInternal(img, v1, v2, v3, *this, featureLevel);
			}
		}
	}
}