openspades/Sources/Draw/SWImageRenderer.cpp

/*
 Copyright (c) 2013 yvt

 This file is part of OpenSpades.

 OpenSpades is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 OpenSpades is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with OpenSpades.  If not, see <http://www.gnu.org/licenses/>.

 */

#include "SWImageRenderer.h"
#include "SWImage.h"
#include <Core/Bitmap.h>

namespace spades {
	namespace draw {
		SWImageRenderer::SWImageRenderer(SWFeatureLevel lvl)
		    : shader(ShaderType::Image), featureLevel(lvl) {}

		// Nothing to release explicitly here.
		SWImageRenderer::~SWImageRenderer() = default;

		// Selects the bitmap that is drawn into. For a non-null bitmap the
		// cached half-size (Y negated) and center vectors are refreshed from
		// its dimensions; for a null bitmap they keep their previous values.
		void SWImageRenderer::SetFramebuffer(spades::Bitmap *bmp) {
			this->frame = bmp;
			if (!bmp) {
				return;
			}

			const float width = static_cast<float>(bmp->GetWidth());
			const float height = static_cast<float>(bmp->GetHeight());

			fbSize4 = MakeVector4(width * .5f, height * -.5f, 1.f, 1.f);
			fbCenter4 = MakeVector4(width * .5f, height * .5f, 0.f, 0.f);
		}

		void SWImageRenderer::SetDepthBuffer(float *f) { depthBuffer = f; }

		// Replaces the currently selected shader type.
		void SWImageRenderer::SetShaderType(ShaderType type) {
			this->shader = type;
		}

		// Records the near plane. The second argument (presumably the far
		// plane -- confirm against the header) is accepted but discarded.
		void SWImageRenderer::SetZRange(float zNear, float) {
			this->zNear = zNear;
		}

		/**
		 * Walks an integer linearly from `start` towards `end` over `numSteps`
		 * steps.
		 *
		 * Two stepping schemes are implemented:
		 *  - mode == 0: 32.32 fixed point, state kept in `fp` (the default).
		 *  - mode == 1 / -1: Bresenham-style error accumulation, state kept in
		 *    `b`, moving in the positive / negative direction. Only selected
		 *    when `noBresenham` is false and |end - start| <= numSteps.
		 *
		 * Only the state of the selected scheme is initialized.
		 */
		struct Interpolator {
			int mode; // 0:fixed point, 1/-1: bresenham

			// fixed point (valid when mode == 0)
			struct {
				int64_t counter; // current value, 32.32 fixed point
				int64_t step;    // increment per step, 32.32 fixed point
			} fp;

			// bresenham (valid when mode != 0)
			struct {
				unsigned int divisor;  // number of steps (at least 1)
				unsigned int dividend; // accumulated error, always < divisor
				unsigned int step;     // |end - start|, added to dividend per step
				int largePos;          // current value
			} b;

			static int abs(int v) { return v < 0 ? -v : v; }

			/**
			 * @param start       value returned before any step is taken
			 * @param end         value reached after `numSteps` steps
			 * @param numSteps    number of steps; 0 yields a constant value
			 * @param noBresenham when true (default) fixed point is always used
			 */
			Interpolator(int start, int end, int numSteps, bool noBresenham = true) {
				// FIXME: same sub-pixel positioning as OpenGL?
				if (abs(end - start) <= numSteps && !noBresenham) {
					int distance = end - start;
					if (distance >= 0) {
						mode = 1;
					} else {
						mode = -1;
						distance = -distance;
					}
					if (numSteps == 0)
						numSteps = 1;
					b.divisor = static_cast<unsigned int>(numSteps);
					b.dividend = 0;
					b.largePos = start;
					b.step = static_cast<unsigned int>(distance);
				} else {
					// Scale by multiplication instead of `<<`: left-shifting a
					// negative signed value is undefined before C++20.
					const int64_t one = static_cast<int64_t>(1) << 32;

					mode = 0;
					if (numSteps == 0) {
						fp.step = 0;
					} else {
						fp.step = static_cast<int64_t>(end - start) * one /
						          static_cast<int64_t>(numSteps);
					}
					fp.counter = static_cast<int64_t>(start) * one;
				}
			}

			/** Returns the value at the current step. */
			int GetCurrent() const {
				if (mode != 0)
					return b.largePos;
				return static_cast<int>(fp.counter >> 32);
			}

			/** Advances by one step. */
			void MoveNext() {
				if (mode == 0) {
					fp.counter += fp.step;
				} else {
					b.dividend += b.step;
					while (b.dividend >= b.divisor) {
						b.dividend -= b.divisor;
						b.largePos += mode;
					}
				}
			}

			/** Advances by `numSteps` steps at once. */
			void MoveNext(int numSteps) {
				if (mode == 0) {
					if (numSteps == 1)
						MoveNext();
					else if (numSteps == 0)
						return;
					else
						fp.counter += fp.step * static_cast<int64_t>(numSteps);
				} else {
					if (numSteps < 4) {
						while (numSteps--)
							MoveNext();
					} else {
						// Widen *before* multiplying; a 32-bit product wraps
						// around for long spans and corrupts the position.
						unsigned long long d = b.dividend;
						d += static_cast<unsigned long long>(b.step) *
						     static_cast<unsigned long long>(numSteps);
						unsigned long long cnt = d / b.divisor;
						d -= cnt * b.divisor;
						b.dividend = static_cast<unsigned int>(d);
						b.largePos += mode * static_cast<int>(cnt);
					}
				}
			}
		};

		// Fixed-point format of texture coordinates: texUVScaleBits fractional
		// bits, so texUVScaleInt corresponds to a coordinate of 1.0.
		enum { texUVScaleBits = 16, texUVScaleInt = 1 << texUVScaleBits };
		// texUVScaleInt as a float, used to convert float UVs to fixed point.
		static const float texUVScaleFloat = static_cast<float>(texUVScaleInt);

		struct SWImageVarying {
			union {
				struct {
					int u, v;
				};
#if ENABLE_SSE2
				__m128i uv_m128; // [u32, ?32, v32, ?32]
#endif
			};
#if __cplusplus >= 201103L
			SWImageVarying() = default; // POD
#else
			SWImageVarying() {} // non-POD
#endif
			SWImageVarying(int u, int v) : u(u), v(v) {}

			SWImageVarying(const SWImageRenderer::Vertex &v)
			    : u(static_cast<int>(v.uv.x * texUVScaleFloat + .5f)),
			      v(static_cast<int>(v.uv.y * texUVScaleFloat + .5f)) {}
		};

		/**
		 * Interpolates a SWImageVarying between two end points by running one
		 * Interpolator per texture coordinate component.
		 */
		template <SWFeatureLevel level> struct SWImageGouraudInterpolator {
			Interpolator u, v;

			SWImageGouraudInterpolator(const SWImageVarying &start, const SWImageVarying &end,
			                           int numSteps)
			    : u(start.u, end.u, numSteps), v(start.v, end.v, numSteps) {}

			/** Returns the varying at the current step. */
			SWImageVarying GetCurrent() {
				const int curU = u.GetCurrent();
				const int curV = v.GetCurrent();
				return SWImageVarying(curU, curV);
			}

			/** Advances both components by one step. */
			void MoveNext() {
				u.MoveNext();
				v.MoveNext();
			}

			/** Advances both components by `s` steps. */
			void MoveNext(int s) {
				u.MoveNext(s);
				v.MoveNext(s);
			}
		};

#if ENABLE_SSE2
		template <> struct SWImageGouraudInterpolator<SWFeatureLevel::SSE2> {
			Interpolator u, v;

			union {
				__m128i uv;
				struct {
					int64_t uvU, uvV;
				};
			};
			union {
				__m128i uvStep;
				struct {
					int64_t stepU, stepV;
				};
			};

			inline SWImageGouraudInterpolator(const SWImageVarying &start,
			                                  const SWImageVarying &end, int numSteps)
			    : u(start.u, end.u, numSteps, true), v(start.v, end.v, numSteps, true) {
				uvU = u.fp.counter;
				uvV = v.fp.counter;
				stepU = u.fp.step;
				stepV = v.fp.step;
			}

			inline SWImageVarying GetCurrent() {
				SWImageVarying varying;
				varying.uv_m128 = _mm_shuffle_epi32(uv, 0x3d);
				return varying;
			}

			inline void MoveNext(int s) {
				if (s == 0)
					return;
				else if (s < 4) {
					auto v = uv, st = uvStep;
					while (s--) {
						v = _mm_add_epi64(v, st);
					}
					uv = v;
				} else {
					// no SSE2 support for 64bit multiply, but
					// this isn't a big problem because this case is rare
					uvU += stepU * s;
					uvV += stepV * s;
				}
			}

			inline void MoveNext() { uv = _mm_add_epi64(uv, uvStep); }
		};
#endif

#pragma mark - Polygon Renderer Main

		/**
		 * Scalar textured triangle rasterizer (fallback for every feature
		 * level without a dedicated specialization).
		 *
		 * Vertex positions are expected in framebuffer pixel coordinates (the
		 * transform / denormalize passes must already have run, see the
		 * static_asserts). Texture coordinates are interpolated linearly in
		 * screen space and wrap around; the modulation color and the depth are
		 * taken from the first vertex only. The depth buffer is tested against
		 * but never written.
		 */
		template <SWFeatureLevel level, bool needTransform,
		          bool ndc, // normalized device coordinate
		          bool depthTest, bool solidFill, bool linearInterpolate>
		struct SWImageRenderer::PolygonRenderer {

			static_assert(!needTransform, "Transform pass was not selected");
			static_assert(!ndc, "Denormalize pass was not selected");

			/**
			 * Rasterizes one triangle whose vertices are already ordered by
			 * ascending position.y (v1 top, v3 bottom).
			 */
			static void DrawPolygonInternalInner(SWImage *img, const Vertex &v1, const Vertex &v2,
			                                     const Vertex &v3, SWImageRenderer &r) {
				// TODO: support null image

				Bitmap *const fb = r.frame;
				SPAssert(fb != nullptr);

				if (v3.position.y <= 0.f) {
					// viewport cull
					return;
				}

				const int fbW = fb->GetWidth();
				const int fbH = fb->GetHeight();
				uint32_t *const bmp = fb->GetPixels();

				if (v1.position.y >= static_cast<float>(fbH)) {
					// viewport cull
					return;
				}

				float *const depthBuffer = r.depthBuffer;
				if (depthTest) {
					SPAssert(depthBuffer != nullptr);
				}

				uint32_t *const tpixels = img->GetRawBitmap();
				const int tw = img->GetRawWidth();
				const int th = img->GetRawHeight();

				const int x1 = static_cast<int>(v1.position.x);
				const int y1 = static_cast<int>(v1.position.y);
				const int x2 = static_cast<int>(v2.position.x);
				const int y2 = static_cast<int>(v2.position.y);
				const int x3 = static_cast<int>(v3.position.x);
				const int y3 = static_cast<int>(v3.position.y);

				if (x1 == x2 && x2 == x3)
					return; // area cull
				if (y1 == y3)
					return; // area cull
				if (std::min(std::min(x1, x2), x3) >= fbW)
					return; // viewport cull
				if (std::max(std::max(x1, x2), x3) <= 0)
					return; // viewport cull

				// color component -> multiplier in [0, 256]
				auto convertColor = [](float f) {
					int i = static_cast<int>(f * 256.f + .5f);
					return static_cast<unsigned short>(std::max(std::min(i, 256), 0));
				};
				// Note: "R" names the lowest byte of a pixel here, which is
				// modulated by color.z ("B": bits 16-23, modulated by color.x).
				unsigned short mulR = convertColor(v1.color.z);
				unsigned short mulG = convertColor(v1.color.y);
				unsigned short mulB = convertColor(v1.color.x);
				unsigned short mulA = convertColor(v1.color.w);

				if (mulA == 0 && mulR == 0 && mulG == 0 && mulB == 0)
					return;

				// unreal-ish linear interpolation by using dither
				int16_t ditherBaseX =
				  static_cast<int16_t>((1 << (texUVScaleBits - 2)) * img->GetInvWidth());
				int16_t ditherBaseY =
				  static_cast<int16_t>((1 << (texUVScaleBits - 2)) * img->GetInvHeight());
				// (u, v) offsets, indexed by [(y & 1) * 4 + (x & 1) * 2 + component]
				int16_t ditherMap[] = {
				  static_cast<int16_t>(-ditherBaseX),
				  static_cast<int16_t>(-ditherBaseY << 1),
				  0,
				  static_cast<int16_t>(ditherBaseY),
				  static_cast<int16_t>(ditherBaseX),
				  0,
				  static_cast<int16_t>(-ditherBaseX << 1),
				  static_cast<int16_t>(-ditherBaseY),
				};

				// Blends one texel onto *dest. destDepth is only dereferenced
				// when depth testing is enabled, so it may be null otherwise.
				auto drawPixel = [mulR, mulG, mulB, mulA](uint32_t *dest, const float *destDepth,
				                                          uint32_t texture, float inDepth) {
					if (depthTest) {
						if (inDepth > *destDepth) {
							return;
						}
					}

					if (texture == 0)
						return; // transparent

					unsigned int ta = static_cast<unsigned int>(texture >> 24);
					ta += (ta >> 7); // [0, 255] -> [0, 256]

					// already premultiplied. see SWImage.cpp
					unsigned int tr = static_cast<unsigned int>((texture >> 0) & 0xff);
					unsigned int tg = static_cast<unsigned int>((texture >> 8) & 0xff);
					unsigned int tb = static_cast<unsigned int>((texture >> 16) & 0xff);
					tr = (tr * mulR) >> 8;
					tg = (tg * mulG) >> 8;
					tb = (tb * mulB) >> 8;

					if (ta == 256 && mulA == 256) {
						// opaque
						*dest = tr | (tg << 8) | (tb << 16);
						return;
					}

					ta = (ta * mulA) >> 8;

					const uint32_t destCol = *dest;
					unsigned int dr = static_cast<unsigned int>((destCol >> 0) & 0xff);
					unsigned int dg = static_cast<unsigned int>((destCol >> 8) & 0xff);
					unsigned int db = static_cast<unsigned int>((destCol >> 16) & 0xff);
					const unsigned int invA = 256 - ta;
					dr = (dr * invA) >> 8;
					dg = (dg * invA) >> 8;
					db = (db * invA) >> 8;

					const unsigned int outR = std::min(tr + dr, 255U);
					const unsigned int outG = std::min(tg + dg, 255U);
					const unsigned int outB = std::min(tb + db, 255U);

					*dest = outR | (outG << 8) | (outB << 16);
				};

				// Draws the pixels [x1, x2) of row y, clipped to the framebuffer.
				// vary1 / vary2 are the varyings at x1 / x2. z2 is currently unused.
				auto drawScanline = [tw, th, tpixels, bmp, fbW, depthBuffer, &drawPixel, &r,
				                     &ditherMap](int y, int x1, int x2, const SWImageVarying &vary1,
				                                 const SWImageVarying &vary2, float z1, float z2) {
					SPAssert(x1 < x2);
					const int minX = std::max(x1, 0);
					const int maxX = std::min(x2, fbW);
					if (minX >= maxX) {
						// Entirely outside of the framebuffer. Without this
						// check the pixel counter would be decremented.
						return;
					}

					uint32_t *out = bmp + (y * fbW) + minX;
					float *depthOut = nullptr;
					if (depthTest) {
						depthOut = depthBuffer + (y * fbW) + minX;
					}

					SWImageGouraudInterpolator<level> vary(vary1, vary2, x2 - x1);
					vary.MoveNext(minX - x1);
					r.pixelsDrawn += maxX - minX;

					const int16_t *ditherRow = ditherMap + ((y & 1) << 2);
					for (int x = minX; x < maxX; x++) {
						auto vr = vary.GetCurrent();
						unsigned int u = static_cast<unsigned int>(vr.u);
						unsigned int v = static_cast<unsigned int>(vr.v);
						if (linearInterpolate) {
							uint8_t idx = static_cast<uint8_t>(x & 1);
							u += ditherRow[idx * 2];
							v += ditherRow[idx * 2 + 1];
						}
						// repeat
						u &= texUVScaleInt - 1;
						v &= texUVScaleInt - 1;
						// fixed point -> texel index
						u = (u * tw) >> texUVScaleBits;
						v = (v * th) >> texUVScaleBits;
						uint32_t tex = tpixels[u + v * tw];
						// FIXME: Z interpolation
						// FIXME: perspective correction
						drawPixel(out, depthOut, tex, z1);
						out++;
						if (depthTest) {
							depthOut++;
						}
						vary.MoveNext();
					}
				};

				// FIXME: interpolated Z
				const float z = v1.position.z;

				// The long edge (v1 -> v3) is shared by both halves of the
				// triangle. longY is the row its interpolators currently sit on;
				// tracking it explicitly keeps the long edge aligned with the
				// short edge even when a half is clipped away completely.
				Interpolator longSpanX(x1, x3, y3 - y1);
				SWImageGouraudInterpolator<level> longSpan(v1, v3, y3 - y1);
				int longY = y1;

				// Rasterizes the rows [ya, yb) between the short edge
				// (xa, va) -> (xb, vb) and the long edge.
				auto drawHalf = [&](int xa, int xb, const Vertex &va, const Vertex &vb, int ya,
				                    int yb) {
					const int minY = std::max(0, ya);
					const int maxY = std::min(fbH, yb);
					if (minY >= maxY) {
						return;
					}

					Interpolator shortSpanX(xa, xb, yb - ya);
					SWImageGouraudInterpolator<level> shortSpan(va, vb, yb - ya);
					shortSpanX.MoveNext(minY - ya);
					shortSpan.MoveNext(minY - ya);
					longSpanX.MoveNext(minY - longY);
					longSpan.MoveNext(minY - longY);

					for (int y = minY; y < maxY; y++) {
						const int lineX1 = shortSpanX.GetCurrent();
						const SWImageVarying line1 = shortSpan.GetCurrent();
						const int lineX2 = longSpanX.GetCurrent();
						const SWImageVarying line2 = longSpan.GetCurrent();
						shortSpanX.MoveNext();
						shortSpan.MoveNext();
						longSpanX.MoveNext();
						longSpan.MoveNext();
						if (lineX1 < lineX2) {
							drawScanline(y, lineX1, lineX2, line1, line2, z, z);
						} else if (lineX2 < lineX1) {
							drawScanline(y, lineX2, lineX1, line2, line1, z, z);
						}
					}
					longY = maxY;
				};

				drawHalf(x1, x2, v1, v2, y1, y2);
				drawHalf(x2, x3, v2, v3, y2, y3);
				// polygon, done!
			}

			/**
			 * Orders the three vertices by ascending position.y and rasterizes
			 * the resulting triangle.
			 */
			static void DrawPolygonInternal(SWImage *img, const Vertex &v1, const Vertex &v2,
			                                const Vertex &v3, SWImageRenderer &r) {
				if (v2.position.y < v1.position.y) {
					if (v3.position.y < v2.position.y) {
						DrawPolygonInternalInner(img, v3, v2, v1, r);
					} else if (v3.position.y < v1.position.y) {
						DrawPolygonInternalInner(img, v2, v3, v1, r);
					} else {
						DrawPolygonInternalInner(img, v2, v1, v3, r);
					}
				} else if (v3.position.y < v1.position.y) {
					DrawPolygonInternalInner(img, v3, v1, v2, r);
				} else if (v3.position.y < v2.position.y) {
					DrawPolygonInternalInner(img, v1, v3, v2, r);
				} else {
					DrawPolygonInternalInner(img, v1, v2, v3, r);
				}
			}
		};

// TODO: Non-SSE2 renderer for solid polygons

#pragma mark - SSE2
#if ENABLE_SSE2

#pragma mark General
		/**
		 * SSE2 textured (non-solid) triangle rasterizer.
		 *
		 * Same contract as the scalar renderer: positions are framebuffer
		 * pixel coordinates, texture coordinates are interpolated linearly in
		 * screen space and wrap around, color and depth come from the first
		 * vertex only, and the depth buffer is tested against but not written.
		 * Pixels are processed in pairs starting at even X coordinates.
		 */
		template <bool depthTest, bool linearInterpolate>
		struct SWImageRenderer::PolygonRenderer<SWFeatureLevel::SSE2, false, false, depthTest,
		                                        false, linearInterpolate> {

			/**
			 * Rasterizes one triangle whose vertices are already ordered by
			 * ascending position.y (v1 top, v3 bottom).
			 */
			static void DrawPolygonInternalInner(SWImage *img, const Vertex &v1, const Vertex &v2,
			                                     const Vertex &v3, SWImageRenderer &r) {

				Bitmap *const fb = r.frame;
				SPAssert(fb != nullptr);

				if (v3.position.y <= 0.f) {
					// viewport cull
					return;
				}

				const int fbW = fb->GetWidth();
				const int fbH = fb->GetHeight();
				uint32_t *const bmp = fb->GetPixels();

				if (v1.position.y >= static_cast<float>(fbH)) {
					// viewport cull
					return;
				}

				float *const depthBuffer = r.depthBuffer;
				if (depthTest) {
					SPAssert(depthBuffer != nullptr);
				}

				uint32_t *const tpixels = img->GetRawBitmap();
				const int tw = img->GetRawWidth();
				const int th = img->GetRawHeight();

				const int x1 = static_cast<int>(v1.position.x);
				const int y1 = static_cast<int>(v1.position.y);
				const int x2 = static_cast<int>(v2.position.x);
				const int y2 = static_cast<int>(v2.position.y);
				const int x3 = static_cast<int>(v3.position.x);
				const int y3 = static_cast<int>(v3.position.y);

				if (x1 == x2 && x2 == x3)
					return; // area cull
				if (y1 == y3)
					return; // area cull
				if (std::min(std::min(x1, x2), x3) >= fbW)
					return; // viewport cull
				if (std::max(std::max(x1, x2), x3) <= 0)
					return; // viewport cull

				// color component -> multiplier in [0, 256]
				auto convertColor = [](float f) {
					int i = static_cast<int>(f * 256.f + .5f);
					return static_cast<unsigned short>(std::max(std::min(i, 256), 0));
				};
				unsigned short mulR = convertColor(v1.color.x);
				unsigned short mulG = convertColor(v1.color.y);
				unsigned short mulB = convertColor(v1.color.z);
				unsigned short mulA = convertColor(v1.color.w);

				if (mulA == 0 && mulR == 0 && mulG == 0 && mulB == 0)
					return;

				// unreal-ish linear interpolation by using dither
				int16_t ditherBaseX =
				  static_cast<int16_t>((1 << (texUVScaleBits - 2)) * img->GetInvWidth());
				int16_t ditherBaseY =
				  static_cast<int16_t>((1 << (texUVScaleBits - 2)) * img->GetInvHeight());
				// (u, v) offsets, indexed by [(y & 1) * 4 + (x & 1) * 2 + component]
				int16_t ditherMap[] = {
				  static_cast<int16_t>(-ditherBaseX),
				  static_cast<int16_t>(-ditherBaseY << 1),
				  0,
				  static_cast<int16_t>(ditherBaseY),
				  static_cast<int16_t>(ditherBaseX),
				  0,
				  static_cast<int16_t>(-ditherBaseX << 1),
				  static_cast<int16_t>(-ditherBaseY),
				};

				// the same offsets, one vector [du0, dv0, du1, dv1] per row parity
				__m128i ditherMap2[] = {
				  _mm_setr_epi32(ditherMap[0], ditherMap[1], ditherMap[2], ditherMap[3]),
				  _mm_setr_epi32(ditherMap[4], ditherMap[5], ditherMap[6], ditherMap[7])};

				// per-channel multipliers for two pixels; the lowest byte of a
				// pixel is modulated by color.z
				__m128i mulCol = _mm_setr_epi16(mulB, mulG, mulR, mulA, mulB, mulG, mulR, mulA);

				// Blends one texel onto *dest. destDepth is only dereferenced
				// when depth testing is enabled, so it may be null otherwise.
				auto drawPixel = [mulCol, mulA](uint32_t *dest, const float *destDepth,
				                                uint32_t texture, float inDepth) {
					if (depthTest) {
						if (inDepth > *destDepth) {
							return;
						}
					}

					if (texture == 0)
						return; // transparent

					unsigned int ta = static_cast<unsigned int>(texture >> 24);
					ta += (ta >> 7); // [0, 255] -> [0, 256]

					// load [u8.0x4]
					__m128i tcol = _mm_setr_epi32(texture, 0, 0, 0);

					// convert to [u16.0x4], 8bit width
					tcol = _mm_unpacklo_epi8(tcol, _mm_setzero_si128());

					// tcol is already premultiplied. see SWImage.cpp
					// modulate by the constant color. now [u8.8x4], 8bit width
					tcol = _mm_mullo_epi16(tcol, mulCol);

					if (ta == 256 && mulA == 256) {
						// opaque: pack and store.
						tcol = _mm_srli_epi16(tcol, 8);
						tcol = _mm_packus_epi16(tcol, tcol);
						_mm_store_ss(reinterpret_cast<float *>(dest), _mm_castsi128_ps(tcol));
						return;
					}

					// broadcast the alpha of the tcol.
					__m128i tcolAlphaVec = _mm_shufflelo_epi16(tcol, 0xff);

					tcolAlphaVec = _mm_srli_epi16(tcolAlphaVec, 8); // make [u16x4]8bw
					tcolAlphaVec = _mm_add_epi16(
					  tcolAlphaVec, _mm_srli_epi16(tcolAlphaVec, 7)); // [0,255] -> [0,256]

					// inverse the alpha
					tcolAlphaVec = _mm_sub_epi16(_mm_set1_epi16(0x100), tcolAlphaVec);

					// load [u8.0x4]
					__m128i dcol = _mm_setr_epi32(*dest, 0, 0, 0);

					// convert to [u16.0x4], 8bit width
					dcol = _mm_unpacklo_epi8(dcol, _mm_setzero_si128());

					// modulate by inversed src alpha.
					// now [u8.8 x 4]
					dcol = _mm_mullo_epi16(dcol, tcolAlphaVec);

					// additive blending with saturation.
					dcol = _mm_adds_epu16(dcol, tcol);

					// pack.
					dcol = _mm_srli_epi16(dcol, 8);
					dcol = _mm_packus_epi16(dcol, dcol);

					// store.
					_mm_store_ss(reinterpret_cast<float *>(dest), _mm_castsi128_ps(dcol));
				};

				// Blends two texels onto dest[0] and dest[1] at once.
				auto drawPixel2 = [mulCol, &drawPixel](uint32_t *dest, float *destDepth,
				                                       uint32_t texture1, float inDepth1,
				                                       uint32_t texture2, float inDepth2) {
					if (depthTest) {
						if (inDepth1 > destDepth[0]) {
							// first pixel is hidden; the second one may still pass
							drawPixel(dest + 1, destDepth + 1, texture2, inDepth2);
							return;
						}
						if (inDepth2 > destDepth[1]) {
							// only the first pixel is visible: draw it with its
							// own texel and depth
							drawPixel(dest, destDepth, texture1, inDepth1);
							return;
						}
					}

					if (texture1 == 0 && texture2 == 0)
						return; // transparent

					// load [u8.0x4]
					__m128i tcol = _mm_setr_epi32(texture1, texture2, 0, 0);

					// convert to [u16.0x4, u16.0x4], 8bit width
					tcol = _mm_unpacklo_epi8(tcol, _mm_setzero_si128());

					// FIXME: there is no opaque fast path for the pixel pair yet

					// tcol is already premultiplied. see SWImage.cpp

					// modulate by the constant color. now [u8.8x4, u8.8x4], 8bit width
					tcol = _mm_mullo_epi16(tcol, mulCol);

					// broadcast the alpha of the tcol.
					__m128i tcolAlphaVec = _mm_shufflelo_epi16(tcol, 0xff);
					tcolAlphaVec = _mm_shufflehi_epi16(tcolAlphaVec, 0xff);

					tcolAlphaVec = _mm_srli_epi16(tcolAlphaVec, 8); // make [u16x4,u16x4]8bw
					tcolAlphaVec = _mm_add_epi16(
					  tcolAlphaVec, _mm_srli_epi16(tcolAlphaVec, 7)); // [0,255] -> [0,256]

					// inverse the alpha
					tcolAlphaVec = _mm_sub_epi16(_mm_set1_epi16(0x100), tcolAlphaVec);

					// load [u8.0 x 4 x 2]
					__m128i dcol = _mm_setr_epi32(dest[0], dest[1], 0, 0);

					// convert to [u16.0 x 4 x 2], 8bit width
					dcol = _mm_unpacklo_epi8(dcol, _mm_setzero_si128());

					// modulate by inversed src alpha.
					// now [u8.8 x 4 x 2]
					dcol = _mm_mullo_epi16(dcol, tcolAlphaVec);

					// additive blending with saturation.
					dcol = _mm_adds_epu16(dcol, tcol);

					// pack.
					dcol = _mm_srli_epi16(dcol, 8);
					dcol = _mm_packus_epi16(dcol, dcol);

					// store.
					_mm_store_sd(reinterpret_cast<double *>(dest), _mm_castsi128_pd(dcol));
				};

				// Draws the pixels [x1, x2) of row y, clipped to the framebuffer.
				// vary1 / vary2 are the varyings at x1 / x2. z2 is currently unused.
				auto drawScanline =
				  [tw, th, tpixels, bmp, fbW, depthBuffer, &drawPixel, &drawPixel2, &r,
				   &ditherMap, &ditherMap2](int y, int x1, int x2, const SWImageVarying &vary1,
				                            const SWImageVarying &vary2, float z1, float z2) {
					  SPAssert(x1 < x2);
					  int minX = std::max(x1, 0);
					  int maxX = std::min(x2, fbW);
					  if (minX >= maxX) {
						  // Entirely outside of the framebuffer. Without this
						  // check the trailing-pixel handling below could still
						  // emit a pixel (maxX may be odd, even negative) and
						  // the pixel counter would be decremented.
						  return;
					  }

					  uint32_t *out = bmp + (y * fbW) + minX;
					  float *depthOut = nullptr;
					  if (depthTest) {
						  depthOut = depthBuffer + (y * fbW) + minX;
					  }

					  SWImageGouraudInterpolator<SWFeatureLevel::SSE2> vary(vary1, vary2, x2 - x1);
					  r.pixelsDrawn += maxX - minX;
					  vary.MoveNext(minX - x1);

					  auto uvMask = _mm_set1_epi32(texUVScaleInt - 1);
					  auto uvScale = _mm_setr_epi32(tw, tw, th, th);

					  uint8_t ditherIndex = static_cast<uint8_t>(y & 1) << 1;

					  // Draws the single pixel at the current position. It uses
					  // the parity of minX for dithering, so minX must have the
					  // same parity as the pixel being drawn when it is called.
					  auto unalignedPixel = [&]() {
						  auto vr = vary.GetCurrent();
						  union {
							  __m128i uv;
							  struct {
								  unsigned int ui, dummy1, vi, dummy2;
							  } iuv;
						  };
						  uv = vr.uv_m128;

						  if (linearInterpolate) {
							  uint8_t idx = static_cast<uint8_t>(minX & 1) | ditherIndex;
							  auto m =
							    _mm_setr_epi32(ditherMap[idx * 2], 0, ditherMap[idx * 2 + 1], 0);
							  uv = _mm_add_epi32(uv, m);
						  }

						  uv = _mm_and_si128(uv, uvMask);  // repeat
						  uv = _mm_mul_epu32(uv, uvScale); // now [u*tw, v*th]
						  uv = _mm_srli_epi64(uv, texUVScaleBits);

						  uint32_t tex = tpixels[iuv.ui + iuv.vi * tw];
						  // FIXME: Z interpolation
						  // FIXME: perspective correction
						  drawPixel(out, depthOut, tex, z1);
						  out++;
						  if (depthTest) {
							  depthOut++;
						  }
						  vary.MoveNext();
					  };

					  // leading pixel at an odd X
					  while ((minX & 1) && (minX < maxX)) {
						  // non-aligned.
						  unalignedPixel();
						  minX++;
					  }

					  // trailing pixel left over after the pairs
					  int remainder = maxX & 1;
					  maxX -= remainder;

					  auto dither = ditherMap2[y & 1];
					  for (int x = minX; x < maxX; x += 2) {
						  auto vr1 = vary.GetCurrent();
						  vary.MoveNext();
						  auto vr2 = vary.GetCurrent();
						  union {
							  __m128i uv;
							  struct {
								  unsigned int ui, dummy1, vi, dummy2;
							  } iuv;
						  };
						  // static_assert(texUVScaleBits == 16, "texUVScaleBits must be 16");
						  uv = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(vr1.uv_m128),
						                                       _mm_castsi128_ps(vr2.uv_m128),
						                                       0x88)); // [u1,v1,u2,v2]

						  if (linearInterpolate) {
							  uv = _mm_add_epi32(uv, dither);
						  }

						  uv = _mm_shuffle_epi32(uv, 0xd8); // [u1,u2,v1,v2]
						  uv = _mm_and_si128(uv, uvMask);   // repeat

						  auto tm = uv;
						  uv = _mm_mul_epu32(uv, uvScale);
						  uv = _mm_srli_epi64(uv, texUVScaleBits);
						  uint32_t tex1 = tpixels[iuv.ui + iuv.vi * tw];

						  uv = _mm_shuffle_epi32(tm, 0xb1); // [u2,u1,v2,v1]
						  uv = _mm_mul_epu32(uv, uvScale);
						  uv = _mm_srli_epi64(uv, texUVScaleBits);
						  uint32_t tex2 = tpixels[iuv.ui + iuv.vi * tw];
						  // FIXME: Z interpolation
						  // FIXME: perspective correction

						  drawPixel2(out, depthOut, tex1, z1, tex2, z1);
						  out += 2;
						  if (depthTest) {
							  depthOut += 2;
						  }
						  vary.MoveNext();
					  }
					  while (remainder--) {
						  unalignedPixel();
					  }
				  };

				// FIXME: interpolated Z
				const float z = v1.position.z;

				// The long edge (v1 -> v3) is shared by both halves of the
				// triangle. longY is the row its interpolators currently sit on;
				// tracking it explicitly keeps the long edge aligned with the
				// short edge even when a half is clipped away completely.
				Interpolator longSpanX(x1, x3, y3 - y1);
				SWImageGouraudInterpolator<SWFeatureLevel::SSE2> longSpan(v1, v3, y3 - y1);
				int longY = y1;

				// Rasterizes the rows [ya, yb) between the short edge
				// (xa, va) -> (xb, vb) and the long edge.
				auto drawHalf = [&](int xa, int xb, const Vertex &va, const Vertex &vb, int ya,
				                    int yb) {
					const int minY = std::max(0, ya);
					const int maxY = std::min(fbH, yb);
					if (minY >= maxY) {
						return;
					}

					Interpolator shortSpanX(xa, xb, yb - ya);
					SWImageGouraudInterpolator<SWFeatureLevel::SSE2> shortSpan(va, vb, yb - ya);
					shortSpanX.MoveNext(minY - ya);
					shortSpan.MoveNext(minY - ya);
					longSpanX.MoveNext(minY - longY);
					longSpan.MoveNext(minY - longY);

					for (int y = minY; y < maxY; y++) {
						const int lineX1 = shortSpanX.GetCurrent();
						const SWImageVarying line1 = shortSpan.GetCurrent();
						const int lineX2 = longSpanX.GetCurrent();
						const SWImageVarying line2 = longSpan.GetCurrent();
						shortSpanX.MoveNext();
						shortSpan.MoveNext();
						longSpanX.MoveNext();
						longSpan.MoveNext();
						if (lineX1 < lineX2) {
							drawScanline(y, lineX1, lineX2, line1, line2, z, z);
						} else if (lineX2 < lineX1) {
							drawScanline(y, lineX2, lineX1, line2, line1, z, z);
						}
					}
					longY = maxY;
				};

				drawHalf(x1, x2, v1, v2, y1, y2);
				drawHalf(x2, x3, v2, v3, y2, y3);
				// polygon, done!
			}

			/**
			 * Orders the three vertices by ascending position.y and rasterizes
			 * the resulting triangle.
			 */
			static void DrawPolygonInternal(SWImage *img, const Vertex &v1, const Vertex &v2,
			                                const Vertex &v3, SWImageRenderer &r) {
				if (v2.position.y < v1.position.y) {
					if (v3.position.y < v2.position.y) {
						DrawPolygonInternalInner(img, v3, v2, v1, r);
					} else if (v3.position.y < v1.position.y) {
						DrawPolygonInternalInner(img, v2, v3, v1, r);
					} else {
						DrawPolygonInternalInner(img, v2, v1, v3, r);
					}
				} else if (v3.position.y < v1.position.y) {
					DrawPolygonInternalInner(img, v3, v1, v2, r);
				} else if (v3.position.y < v2.position.y) {
					DrawPolygonInternalInner(img, v1, v3, v2, r);
				} else {
					DrawPolygonInternalInner(img, v1, v2, v3, r);
				}
			}
		};

#pragma mark Solid
		// Triangle rasterizer specialization for solid fills on the SSE2 path. Vertices are
		// expected in framebuffer (pixel) coordinates.
		//
		// The fill color and the depth value are taken from the first (smallest-Y) vertex
		// only; neither is interpolated across the triangle. Each covered pixel becomes
		//   dest * (256 - (A + (A >> 7))) / 256 + color   (saturating, per 8-bit channel)
		// i.e. the color is added as-is after the destination has been attenuated by the
		// inverse source alpha.
		template <bool depthTest, bool lerp>
		struct SWImageRenderer::PolygonRenderer<SWFeatureLevel::SSE2, false, false, depthTest, true,
		                                        lerp> {

			// Rasterizes one triangle. The caller must pass the vertices sorted by
			// ascending position.y (v1 smallest, v3 largest).
			//
			// img is unused by the solid fill. r supplies the framebuffer, the depth
			// buffer (required when depthTest is true) and the pixelsDrawn counter.
			static void DrawPolygonInternalInner(SWImage *img, const Vertex &v1, const Vertex &v2,
			                                     const Vertex &v3, SWImageRenderer &r) {
				(void)img; // not needed for solid fills

				Bitmap *const fb = r.frame;
				SPAssert(fb != nullptr);

				if (v3.position.y <= 0.f) {
					// viewport cull: the whole triangle is above the first row
					return;
				}

				const int fbW = fb->GetWidth();
				const int fbH = fb->GetHeight();
				uint32_t *const bmp = fb->GetPixels();

				if (v1.position.y >= static_cast<float>(fbH)) {
					// viewport cull: the whole triangle is below the last row
					return;
				}

				float *const depthBuffer = r.depthBuffer;
				if (depthTest) {
					SPAssert(depthBuffer != nullptr);
				}

				const int x1 = static_cast<int>(v1.position.x);
				const int y1 = static_cast<int>(v1.position.y);
				const int x2 = static_cast<int>(v2.position.x);
				const int y2 = static_cast<int>(v2.position.y);
				const int x3 = static_cast<int>(v3.position.x);
				const int y3 = static_cast<int>(v3.position.y);

				if (x1 == x2 && x2 == x3)
					return; // area cull
				if (y1 == y3)
					return; // area cull
				if (std::min(std::min(x1, x2), x3) >= fbW)
					return; // viewport cull
				if (std::max(std::max(x1, x2), x3) <= 0)
					return; // viewport cull

				auto convertColor = [](float f) {
					// 255.f, not 256.f here because for solid rendering
					// this color is directly used
					int i = static_cast<int>(f * 255.f + .5f);
					return static_cast<unsigned short>(std::max(std::min(i, 255), 0));
				};
				unsigned short mulR = convertColor(v1.color.x);
				unsigned short mulG = convertColor(v1.color.y);
				unsigned short mulB = convertColor(v1.color.z);
				unsigned short mulA = convertColor(v1.color.w);

				// Nothing would change in the framebuffer.
				if (mulA == 0 && mulR == 0 && mulG == 0 && mulB == 0)
					return;

				// Source color as u8.8 fixed point, duplicated for two pixels.
				__m128i mulCol = _mm_setr_epi16(mulB, mulG, mulR, mulA, mulB, mulG, mulR, mulA);
				// Inverse alpha in 0..256; the (mulA >> 7) term maps alpha 255 to exactly 256
				// so that a fully opaque color leaves no destination contribution.
				__m128i mulInv = _mm_set1_epi16(256 - (mulA + (mulA >> 7)));
				mulCol = _mm_slli_epi16(mulCol, 8);

				// Blends the fill color into one pixel. destDepth is only dereferenced when
				// depthTest is enabled, so it may be null otherwise.
				auto drawPixel = [mulCol, mulInv](uint32_t &dest, const float *destDepth,
				                                  float inDepth) {
					if (depthTest) {
						if (inDepth > *destDepth) {
							return;
						}
					}

					// load [u8.8x4]8bw
					__m128i tcol = mulCol;

					// load [u8.0x4]
					__m128i dcol = _mm_setr_epi32(dest, 0, 0, 0);

					// convert to [u16.0x4], 8bit width
					dcol = _mm_unpacklo_epi8(dcol, _mm_setzero_si128());

					// modulate by inversed src alpha.
					// now [u8.8 x 4]
					dcol = _mm_mullo_epi16(dcol, mulInv);

					// additive blending with saturation.
					dcol = _mm_adds_epu16(dcol, tcol);

					// pack.
					dcol = _mm_srli_epi16(dcol, 8);
					dcol = _mm_packus_epi16(dcol, dcol);

					// store.
					_mm_store_ss(reinterpret_cast<float *>(&dest), _mm_castsi128_ps(dcol));
				};

				// Blends the fill color into two horizontally adjacent pixels. If either
				// pixel fails the depth test, the other one is handled by drawPixel with
				// its own depth value.
				auto drawPixel2 = [mulCol, mulInv, &drawPixel](uint32_t *dest,
				                                               const float *destDepth,
				                                               float inDepth1, float inDepth2) {
					if (depthTest) {
						if (inDepth1 > destDepth[0]) {
							drawPixel(dest[1], destDepth + 1, inDepth2);
							return;
						}
						if (inDepth2 > destDepth[1]) {
							// Pixel 0 must be tested against its own depth (inDepth1).
							drawPixel(dest[0], destDepth, inDepth1);
							return;
						}
					}

					// load [u8.8 x 4 x 2]
					__m128i tcol = mulCol;

					// load [u8.0 x 4 x 2]
					__m128i dcol = _mm_setr_epi32(dest[0], dest[1], 0, 0);

					// convert to [u16.0 x 4 x 2], 8bit width
					dcol = _mm_unpacklo_epi8(dcol, _mm_setzero_si128());

					// modulate by inversed src alpha.
					// now [u8.8 x 4 x 2]
					dcol = _mm_mullo_epi16(dcol, mulInv);

					// additive blending with saturation.
					dcol = _mm_adds_epu16(dcol, tcol);

					// pack.
					dcol = _mm_srli_epi16(dcol, 8);
					dcol = _mm_packus_epi16(dcol, dcol);

					// store.
					_mm_store_sd(reinterpret_cast<double *>(dest), _mm_castsi128_pd(dcol));
				};

				// Fills the pixels [x1, x2) of row y, clamped to the framebuffer width.
				// vary1, vary2 and z2 are accepted but currently unused.
				auto drawScanline = [bmp, fbW, depthBuffer, &drawPixel, &drawPixel2,
				                     &r](int y, int x1, int x2, const SWImageVarying &vary1,
				                         const SWImageVarying &vary2, float z1, float z2) {
					(void)vary1;
					(void)vary2;
					(void)z2;
					SPAssert(x1 < x2);

					int minX = std::max(x1, 0);
					int maxX = std::min(x2, fbW);
					if (minX >= maxX) {
						// The span lies completely outside the framebuffer. Without this
						// check the trailing-pixel step below could still write one pixel
						// (possibly past the end of the row) and pixelsDrawn would be
						// decremented.
						return;
					}
					r.pixelsDrawn += maxX - minX;

					uint32_t *out = bmp + (y * fbW) + minX;
					float *depthOut = nullptr;
					if (depthTest) {
						depthOut = depthBuffer + (y * fbW) + minX;
					}

					auto singlePixel = [&]() {
						// FIXME: Z interpolation
						drawPixel(*out, depthOut, z1);
						out++;
						if (depthTest) {
							depthOut++;
						}
					};

					// Leading pixel at an odd X, so that the pair loop starts on an even X.
					if (minX & 1) {
						singlePixel();
						minX++;
					}

					// Trailing pixel left over when the remaining span has odd length.
					const int remainder = maxX & 1;
					maxX -= remainder;

					/* for(int x = minX; x < maxX; x+=2) */
					if (maxX > minX) {
						for (auto *endPtr = out + (maxX - minX); out != endPtr;) {
							// FIXME: Z interpolation
							drawPixel2(out, depthOut, z1, z1);
							out += 2;
							if (depthTest) {
								depthOut += 2;
							}
						}
					}
					if (remainder) {
						singlePixel();
					}
				};

				// FIXME: interpolated Z

				// The long edge (v1 -> v3) spans both halves of the triangle.
				Interpolator longSpanX(x1, x3, y3 - y1);
				SWImageGouraudInterpolator<SWFeatureLevel::SSE2> longSpan(v1, v3, y3 - y1);
				// Row the long-edge interpolators currently point at. Tracking it keeps the
				// long edge in step with the rows actually drawn even when a half is
				// clipped away entirely (e.g. both v1 and v2 above the framebuffer).
				// NOTE(review): assumes MoveNext(n) advances an interpolator by n rows.
				int longSpanRow = y1;

				// Draws the rows between one short edge (top -> bottom) and the long edge.
				auto drawHalf = [&](const Vertex &top, const Vertex &bottom, int topX, int bottomX,
				                    int topY, int bottomY) {
					const int minY = std::max(0, topY);
					const int maxY = std::min(fbH, bottomY);
					if (minY >= maxY) {
						// Nothing visible in this half; skip setting up the short edge.
						return;
					}

					Interpolator shortSpanX(topX, bottomX, bottomY - topY);
					SWImageGouraudInterpolator<SWFeatureLevel::SSE2> shortSpan(top, bottom,
					                                                           bottomY - topY);
					// Skip the rows clipped off above the framebuffer.
					shortSpanX.MoveNext(minY - topY);
					shortSpan.MoveNext(minY - topY);
					longSpanX.MoveNext(minY - longSpanRow);
					longSpan.MoveNext(minY - longSpanRow);

					for (int y = minY; y < maxY; y++) {
						const int lineX1 = shortSpanX.GetCurrent();
						const auto line1 = shortSpan.GetCurrent();
						const int lineX2 = longSpanX.GetCurrent();
						const auto line2 = longSpan.GetCurrent();
						shortSpanX.MoveNext();
						shortSpan.MoveNext();
						longSpanX.MoveNext();
						longSpan.MoveNext();
						if (lineX1 == lineX2)
							continue;
						if (lineX1 < lineX2) {
							drawScanline(y, lineX1, lineX2, line1, line2, v1.position.z,
							             v1.position.z);
						} else {
							drawScanline(y, lineX2, lineX1, line2, line1, v1.position.z,
							             v1.position.z);
						}
					}
					longSpanRow = maxY;
				};

				drawHalf(v1, v2, x1, x2, y1, y2);
				drawHalf(v2, v3, x2, x3, y2, y3);
				// polygon, done!
			}

			// Sorts the three vertices by ascending position.y and rasterizes the triangle.
			static void DrawPolygonInternal(SWImage *img, const Vertex &v1, const Vertex &v2,
			                                const Vertex &v3, SWImageRenderer &r) {
				if (v2.position.y < v1.position.y) {
					if (v3.position.y < v2.position.y) {
						DrawPolygonInternalInner(img, v3, v2, v1, r);
					} else if (v3.position.y < v1.position.y) {
						DrawPolygonInternalInner(img, v2, v3, v1, r);
					} else {
						DrawPolygonInternalInner(img, v2, v1, v3, r);
					}
				} else if (v3.position.y < v1.position.y) {
					DrawPolygonInternalInner(img, v3, v1, v2, r);
				} else if (v3.position.y < v2.position.y) {
					DrawPolygonInternalInner(img, v1, v3, v2, r);
				} else {
					DrawPolygonInternalInner(img, v1, v2, v3, r);
				}
			}
		};

#endif

#pragma mark - Intermediates

		// Intermediate stage for vertices given in normalized device coordinates:
		// rescales each position by the renderer's fbSize4 / fbCenter4 and forwards the
		// triangle to the non-NDC renderer with the same remaining options.
		template <SWFeatureLevel featureLvl, bool depthTest, bool solidFill, bool lerp>
		struct SWImageRenderer::PolygonRenderer<featureLvl, false, true, depthTest, solidFill,
		                                        lerp> {
			static void DrawPolygonInternal(SWImage *img, const Vertex &v1, const Vertex &v2,
			                                const Vertex &v3, SWImageRenderer &r) {
				using FramebufferSpaceRenderer =
				  PolygonRenderer<featureLvl, false, false, depthTest, solidFill, lerp>;

				// denormalize: copy the vertex and move its position out of NDC
				const auto denormalize = [&r](const Vertex &ndcVertex) {
					Vertex result = ndcVertex;
					result.position = (result.position * r.fbSize4) + r.fbCenter4;
					return result;
				};

				const Vertex p1 = denormalize(v1);
				const Vertex p2 = denormalize(v2);
				const Vertex p3 = denormalize(v3);
				FramebufferSpaceRenderer::DrawPolygonInternal(img, p1, p2, p3, r);
			}
		};

		// Front stage for vertices that still need the renderer's matrix applied.
		// Transforms the triangle, clips it against the near plane, performs the
		// perspective divide and forwards the result to the NDC renderer.
		// Note: the `ndc` parameter is not consulted; the output of this stage is always
		// passed on as NDC.
		template <SWFeatureLevel featureLvl,
		          bool ndc, // normalized device coordinate
		          bool depthTest, bool solidFill, bool lerp>
		struct SWImageRenderer::PolygonRenderer<featureLvl, true, ndc, depthTest, solidFill, lerp> {
			// Clips the triangle (v1, v2, v3) against the half-space
			//   position.x * plane.x + position.y * plane.y + position.z * plane.z + plane.w >= 0
			// and calls continuation(a, b, c) once per resulting triangle: not at all when
			// every vertex is outside, once when one or three vertices are inside, twice
			// when exactly two are inside.
			//
			// When exactly one vertex is inside, the two outside vertices are overwritten
			// in place with the intersection points, so the caller's vertices may change.
			template <class F>
			static void Clip(Vertex &v1, Vertex &v2, Vertex &v3, Vector4 plane, F continuation) {
				// Signed plane distance; the position's w component is not used.
				auto distance = [](const Vector4 &v, const Vector4 &plane) {
					return v.x * plane.x + v.y * plane.y + v.z * plane.z + plane.w;
				};
				// out = v1 + (v2 - v1) * per for position, color and uv.
				// out may alias v1: each field is fully computed before it is stored.
				auto lerpVertex = [](const Vertex &v1, const Vertex &v2, Vertex &out, float per) {
					out.position = v1.position + (v2.position - v1.position) * per;
					out.color = v1.color + (v2.color - v1.color) * per;
					out.uv = v1.uv + (v2.uv - v1.uv) * per;
				};
				float d1 = distance(v1.position, plane);
				float d2 = distance(v2.position, plane);
				float d3 = distance(v3.position, plane);
				// ncN: vertex N is "not clipped" (inside or on the plane).
				bool nc1 = d1 >= 0.f;
				bool nc2 = d2 >= 0.f;
				bool nc3 = d3 >= 0.f;
				// Bit 0/1/2 set <=> v1/v2/v3 is inside.
				int bits = (nc1 ? 1 : 0) | (nc2 ? 2 : 0) | (nc3 ? 4 : 0);
				float per1, per2;
				Vertex vv1, vv2, vv3;
				Vertex t1, t2;
				switch (bits) {
					case 0:
						// culled
						return;
					case 7:
						// not clipped
						continuation(v1, v2, v3);
						return;
					case 1:
						// only v1 inside: pull v2 and v3 onto the plane
						per1 = d2 / (d2 - d1); // == (0.f - d2) / (d1 - d2);
						per2 = d3 / (d3 - d1);
						lerpVertex(v2, v1, v2, per1);
						lerpVertex(v3, v1, v3, per2);
						continuation(v1, v2, v3);
						return;
					case 2:
						// only v2 inside: pull v1 and v3 onto the plane
						per1 = d1 / (d1 - d2);
						per2 = d3 / (d3 - d2);
						lerpVertex(v1, v2, v1, per1);
						lerpVertex(v3, v2, v3, per2);
						continuation(v1, v2, v3);
						return;
					case 4:
						// only v3 inside: pull v2 and v1 onto the plane
						per1 = d2 / (d2 - d3);
						per2 = d1 / (d1 - d3);
						lerpVertex(v2, v3, v2, per1);
						lerpVertex(v1, v3, v1, per2);
						continuation(v1, v2, v3);
						return;
					case 3:
						// v1 and v2 inside, v3 outside: the clipped quad is emitted as two
						// triangles. The continuation receives copies because it takes
						// mutable references.
						per1 = d2 / (d2 - d3);
						per2 = d1 / (d1 - d3);
						lerpVertex(v2, v3, t2, per1);
						lerpVertex(v1, v3, t1, per2);
						vv1 = v1;
						vv2 = v2;
						vv3 = t2;
						continuation(vv1, vv2, vv3);
						vv1 = v1;
						vv2 = t2;
						vv3 = t1;
						continuation(vv1, vv2, vv3);
						break;
					case 5:
						// v1 and v3 inside, v2 outside
						per1 = d3 / (d3 - d2);
						per2 = d1 / (d1 - d2);
						lerpVertex(v3, v2, t2, per1);
						lerpVertex(v1, v2, t1, per2);
						vv1 = v1;
						vv2 = v3;
						vv3 = t2;
						continuation(vv1, vv2, vv3);
						vv1 = v1;
						vv2 = t2;
						vv3 = t1;
						continuation(vv1, vv2, vv3);
						break;
					case 6:
						// v2 and v3 inside, v1 outside
						per1 = d3 / (d3 - d1);
						per2 = d2 / (d2 - d1);
						lerpVertex(v3, v1, t2, per1);
						lerpVertex(v2, v1, t1, per2);
						vv1 = v2;
						vv2 = v3;
						vv3 = t2;
						continuation(vv1, vv2, vv3);
						vv1 = v2;
						vv2 = t2;
						vv3 = t1;
						continuation(vv1, vv2, vv3);
						break;
				}
			}

			// Transforms the triangle by r.matrix, clips it against z >= r.zNear, divides
			// the positions by w (keeping the pre-divide z) and draws the result through
			// the NDC renderer. Only the near plane is clipped; no far-plane clipping is
			// performed.
			static void DrawPolygonInternal(SWImage *img, const Vertex &v1, const Vertex &v2,
			                                const Vertex &v3, SWImageRenderer &r) {
				// needs transform.
				auto vv1 = v1, vv2 = v2, vv3 = v3;
				const auto &mat = r.matrix;
				vv1.position = mat * vv1.position;
				vv2.position = mat * vv2.position;
				vv3.position = mat * vv3.position;

				// Plane (0, 0, 1, -zNear) gives the distance z - zNear.
				Clip(vv1, vv2, vv3, MakeVector4(0.f, 0.f, 1.f, -r.zNear),
				     [img, &r](Vertex &c1, Vertex &c2, Vertex &c3) {
					     Vertex p1 = c1;
					     Vertex p2 = c2;
					     Vertex p3 = c3;
					     // want to save Z
					     float orig1 = p1.position.z;
					     float orig2 = p2.position.z;
					     float orig3 = p3.position.z;
#if ENABLE_SSE
					     // Approximate reciprocals of the three w values in one go.
					     union {
						     __m128 m;
						     Vector4 v;
					     };
					     m = _mm_setr_ps(p1.position.w, p2.position.w, p3.position.w, 1.f);
					     m = _mm_rcp_ps(m);
					     p1.position *= v.x;
					     p2.position *= v.y;
					     p3.position *= v.z;
#else
					     p1.position /= p1.position.w;
					     p2.position /= p2.position.w;
					     p3.position /= p3.position.w;
#endif
					     // Restore the pre-divide Z values saved above.
					     p1.position.z = orig1;
					     p2.position.z = orig2;
					     p3.position.z = orig3;

					     PolygonRenderer<featureLvl, false, true, depthTest, solidFill,
					                     lerp>::DrawPolygonInternal(img, p1, p2, p3, r);
				     });
			}
		};

		// Run-time switch between the solid-fill and the textured polygon renderer:
		// a null image or one reporting IsWhiteImage() is drawn as a solid fill.
		template <SWFeatureLevel level, bool needTransform,
		          bool ndc, // normalized device coordinate
		          bool depthTest, bool lerp>
		struct SWImageRenderer::PolygonRenderer3 {
			static void DrawPolygonInternal(SWImage *img, const Vertex &v1, const Vertex &v2,
			                                const Vertex &v3, SWImageRenderer &r) {
				using SolidRenderer =
				  PolygonRenderer<level, needTransform, ndc, depthTest, true, lerp>;
				using TexturedRenderer =
				  PolygonRenderer<level, needTransform, ndc, depthTest, false, lerp>;

				// IsWhiteImage() is only queried for a non-null image.
				const bool needsTexture = (img != nullptr) && !img->IsWhiteImage();
				if (needsTexture) {
					TexturedRenderer::DrawPolygonInternal(img, v1, v2, v3, r);
				} else {
					SolidRenderer::DrawPolygonInternal(img, v1, v2, v3, r);
				}
			}
		};

		// Run-time switch on the CPU feature level: uses the SSE2 renderer when it is
		// compiled in (ENABLE_SSE2) and lvl is at least SSE2, the generic one otherwise.
		template <bool needTransform,
		          bool ndc, // normalized device coordinate
		          bool depthTest, bool lerp>
		struct SWImageRenderer::PolygonRenderer2 {
			static void DrawPolygonInternal(SWImage *img, const Vertex &v1, const Vertex &v2,
			                                const Vertex &v3, SWImageRenderer &r,
			                                SWFeatureLevel lvl) {
#if ENABLE_SSE2
				const bool sse2Allowed =
				  static_cast<int>(lvl) >= static_cast<int>(SWFeatureLevel::SSE2);
				if (sse2Allowed) {
					PolygonRenderer3<SWFeatureLevel::SSE2, needTransform, ndc, depthTest,
					                 lerp>::DrawPolygonInternal(img, v1, v2, v3, r);
				} else
#endif
				{
					// Generic fallback without SIMD requirements.
					PolygonRenderer3<SWFeatureLevel::None, needTransform, ndc, depthTest,
					                 lerp>::DrawPolygonInternal(img, v1, v2, v3, r);
				}
			}
		};

		// Draws one triangle with the pipeline configuration of the current shader type.
		// A framebuffer must have been set. Shader types other than Sprite and Image
		// draw nothing.
		void SWImageRenderer::DrawPolygon(SWImage *img, const Vertex &v1, const Vertex &v2,
		                                  const Vertex &v3) {
			SPAssert(frame != nullptr);

			// Sprite: needs transform, in NDC, depth tested, linear interpolation.
			using SpritePipeline = PolygonRenderer2<true, true, true, true>;
			// Image: no transform, not NDC, no depth test, point sampling.
			using ImagePipeline = PolygonRenderer2<false, false, false, false>;

			if (shader == ShaderType::Sprite) {
				SpritePipeline::DrawPolygonInternal(img, v1, v2, v3, *this, featureLevel);
			} else if (shader == ShaderType::Image) {
				ImagePipeline::DrawPolygonInternal(img, v1, v2, v3, *this, featureLevel);
			}
		}
	}
}