/*
 Copyright (c) 2013 yvt

 This file is part of OpenSpades.

 OpenSpades is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 OpenSpades is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with OpenSpades.  If not, see <http://www.gnu.org/licenses/>.

 */

// NOTE(review): this copy of the file went through a text extraction that
// dropped every non-empty "<...>" span: template parameter/argument lists,
// the target types of static_cast/reinterpret_cast, and the target of the
// third #include below. Those tokens are left exactly as found (they cannot
// be reconstructed reliably from this file alone) and must be restored from
// version control before this translation unit can compile.

#include "SWImageRenderer.h"
#include "SWImage.h"
#include

namespace spades {
namespace draw {

/** Creates a renderer that starts with the Image shader and the given feature level. */
SWImageRenderer::SWImageRenderer(SWFeatureLevel lvl)
    : shader(ShaderType::Image), featureLevel(lvl) {}

SWImageRenderer::~SWImageRenderer() {}

/**
 * Sets the bitmap that polygons are drawn into.
 *
 * When `bmp` is non-null, also recomputes `fbSize4` / `fbCenter4`, the scale
 * and offset the denormalize pass applies to vertex positions
 * (half width, negative half height; centre of the bitmap).
 */
void SWImageRenderer::SetFramebuffer(spades::Bitmap *bmp) {
	this->frame = bmp;
	if (bmp) {
		fbSize4 = MakeVector4(static_cast(bmp->GetWidth()) * .5f,
		                      static_cast(bmp->GetHeight()) * -.5f, 1.f, 1.f);
		fbCenter4 = MakeVector4(static_cast(bmp->GetWidth()) * .5f,
		                        static_cast(bmp->GetHeight()) * .5f, 0.f, 0.f);
	}
}

/** Sets the depth buffer. It is indexed as `y * framebufferWidth + x` by the rasterizers. */
void SWImageRenderer::SetDepthBuffer(float *f) { depthBuffer = f; }

/** Selects which shader `DrawPolygon` dispatches to. */
void SWImageRenderer::SetShaderType(ShaderType type) { shader = type; }

/**
 * Stores the near clipping distance used by the transform pass.
 * The second (unnamed, presumably far) value is currently ignored.
 */
void SWImageRenderer::SetZRange(float zNear, float) {
	this->zNear = zNear;
}

/**
 * Steps an integer linearly from `start` to `end` over `numSteps` steps.
 *
 * Two modes exist: 32.32 fixed point (mode 0) and an integer
 * error-accumulating ("bresenham") mode (mode 1 / -1 = direction).
 * The bresenham mode is only chosen when `noBresenham` is false AND
 * |end - start| <= numSteps; with the default argument it is never used.
 */
struct Interpolator {
	int mode; // 0:fixed point, 1/-1: bresenham

	// fixed point state (value is counter >> 32)
	struct {
		int64_t counter;
		int64_t step;
	} fp;

	// bresenham state: largePos moves by `mode` each time dividend
	// overflows divisor
	struct {
		unsigned int divisor;
		unsigned int dividend;
		unsigned int step;
		int largePos;
	} b;

	static int abs(int v) { return v < 0 ? -v : v; }

	Interpolator(int start, int end, int numSteps, bool noBresenham = true) {
		// FIXME: same sub-pixel positioning as OpenGL?
		if (abs(end - start) <= numSteps && !noBresenham) {
			int distance = end - start;
			if (distance >= 0) {
				mode = 1;
			} else {
				mode = -1;
				distance = -distance;
			}
			// avoid a zero divisor
			if (numSteps == 0)
				numSteps = 1;
			b.divisor = static_cast(numSteps);
			b.dividend = 0;
			b.largePos = start;
			b.step = static_cast(distance);
		} else {
			mode = 0;
			if (numSteps == 0) {
				// degenerate span: value stays at `start`
				fp.step = 0;
			} else {
				fp.step = static_cast(end - start) << 32;
				fp.step /= static_cast(numSteps);
			}
			fp.counter = static_cast(start) << 32;
		}
	}

	/** Returns the current interpolated integer value. */
	int GetCurrent() {
		if (mode != 0)
			return b.largePos;
		return static_cast(fp.counter >> 32);
	}

	/** Advances by one step. */
	void MoveNext() {
		if (mode == 0) {
			fp.counter += fp.step;
		} else {
			b.dividend += b.step;
			while (b.dividend >= b.divisor) {
				b.dividend -= b.divisor;
				b.largePos += mode;
			}
		}
	}

	/** Advances by `numSteps` steps; a non-positive count is a no-op in bresenham mode. */
	void MoveNext(int numSteps) {
		if (mode == 0) {
			if (numSteps == 1)
				MoveNext();
			else if (numSteps == 0)
				return;
			else
				fp.counter += fp.step * static_cast(numSteps);
		} else {
			if (numSteps < 4) {
				// "> 0" so that a negative count cannot spin the loop
				while (numSteps-- > 0)
					MoveNext();
			} else {
				// jump ahead with one division instead of stepping
				unsigned long long d = b.dividend;
				d += static_cast(b.step * static_cast(numSteps));
				unsigned long long cnt = d / b.divisor;
				d -= cnt * b.divisor;
				b.dividend = static_cast(d);
				b.largePos += mode * static_cast(cnt);
			}
		}
	}
};

// Texture coordinates are carried as integers scaled by 2^texUVScaleBits.
enum { texUVScaleBits = 16, texUVScaleInt = 1 << texUVScaleBits };
static const float texUVScaleFloat = static_cast(texUVScaleInt);

/** Per-vertex / per-pixel interpolated attributes: scaled integer texture coordinates. */
struct SWImageVarying {
	union {
		struct {
			int u, v;
		};
#if ENABLE_SSE2
		__m128i uv_m128; // [u32, ?32, v32, ?32]
#endif
	};
#if __cplusplus >= 201103L
	SWImageVarying() = default; // POD
#else
	SWImageVarying() {} // non-POD
#endif
	SWImageVarying(int u, int v) : u(u), v(v) {}
	/** Converts a vertex's float uv to scaled integers (rounded by adding .5). */
	SWImageVarying(const SWImageRenderer::Vertex &v)
	    : u(static_cast(v.uv.x * texUVScaleFloat + .5f)),
	      v(static_cast(v.uv.y * texUVScaleFloat + .5f)) {}
};

/** Interpolates an SWImageVarying between two endpoints using two scalar Interpolators. */
template struct SWImageGouraudInterpolator {
	Interpolator u, v;
	SWImageGouraudInterpolator(const SWImageVarying &start, const SWImageVarying &end,
	                           int numSteps)
	    : u(start.u, end.u, numSteps), v(start.v, end.v, numSteps) {}
	SWImageVarying GetCurrent() { return SWImageVarying(u.GetCurrent(), v.GetCurrent()); }
	void MoveNext(int s) {
		u.MoveNext(s);
		v.MoveNext(s);
	}
	void MoveNext() {
		u.MoveNext();
		v.MoveNext();
	}
};

#if ENABLE_SSE2
/**
 * SSE2 variant: keeps both 32.32 fixed-point counters (and both steps) in
 * one __m128i each so that a single step is one 64-bit vector add.
 * The scalar Interpolators are only used to compute the initial state and
 * are forced into fixed-point mode.
 */
template <> struct SWImageGouraudInterpolator {
	Interpolator u, v;
	union {
		__m128i uv;
		struct {
			int64_t uvU, uvV;
		};
	};
	union {
		__m128i uvStep;
		struct {
			int64_t stepU, stepV;
		};
	};
	inline SWImageGouraudInterpolator(const SWImageVarying &start, const SWImageVarying &end,
	                                  int numSteps)
	    : u(start.u, end.u, numSteps, true), v(start.v, end.v, numSteps, true) {
		uvU = u.fp.counter;
		uvV = v.fp.counter;
		stepU = u.fp.step;
		stepV = v.fp.step;
	}
	inline SWImageVarying GetCurrent() {
		SWImageVarying varying;
		// 0x3d selects 32-bit lanes [1, 3, 3, 0]: the integer parts of the
		// U and V counters land in lanes 0 and 1 (and V again in lane 2).
		varying.uv_m128 = _mm_shuffle_epi32(uv, 0x3d);
		return varying;
	}
	inline void MoveNext(int s) {
		if (s == 0)
			return;
		else if (s < 4) {
			auto v = uv, st = uvStep;
			// "> 0" so that a negative count cannot spin the loop
			while (s-- > 0) {
				v = _mm_add_epi64(v, st);
			}
			uv = v;
		} else {
			// no SSE2 support for 64bit multiply, but
			// this isn't a big problem because this case is rare
			uvU += stepU * s;
			uvV += stepV * s;
		}
	}
	inline void MoveNext() { uv = _mm_add_epi64(uv, uvStep); }
};
#endif

#pragma mark - Polygon Renderer Main

/**
 * Generic (non-SIMD) textured triangle rasterizer.
 *
 * Expects vertices already in framebuffer pixel coordinates. The triangle
 * is flat-shaded with v1's color and uses v1's z for every pixel (see the
 * FIXMEs); texture coordinates are interpolated linearly in screen space.
 */
template struct SWImageRenderer::PolygonRenderer {

	static_assert(!needTransform, "Transform pass was not selected");
	static_assert(!ndc, "Denormalize pass was not selected");

	/** Rasterizes one triangle. Requires v1.y <= v2.y <= v3.y (see DrawPolygonInternal). */
	static void DrawPolygonInternalInner(SWImage *img, const Vertex &v1, const Vertex &v2,
	                                     const Vertex &v3, SWImageRenderer &r) {
		// TODO: support null image
		Bitmap *const fb = r.frame;
		SPAssert(fb != nullptr);
		if (v3.position.y <= 0.f) {
			// viewport cull
			return;
		}
		const int fbW = fb->GetWidth();
		const int fbH = fb->GetHeight();
		uint32_t *const bmp = fb->GetPixels();
		if (v1.position.y >= static_cast(fbH)) {
			// viewport cull
			return;
		}

		float *const depthBuffer = r.depthBuffer;
		if (depthTest) {
			SPAssert(depthBuffer != nullptr);
		}

		uint32_t *const tpixels = img->GetRawBitmap();
		const int tw = img->GetRawWidth();
		const int th = img->GetRawHeight();

		const int x1 = static_cast(v1.position.x);
		const int y1 = static_cast(v1.position.y);
		const int x2 = static_cast(v2.position.x);
		const int y2 = static_cast(v2.position.y);
		const int x3 = static_cast(v3.position.x);
		const int y3 = static_cast(v3.position.y);

		if (x1 == x2 && x2 == x3)
			return; // area cull
		if (y1 == y3)
			return; // area cull
		if (std::min(std::min(x1, x2), x3) >= fbW)
			return; // viewport cull
		if (std::max(std::max(x1, x2), x3) <= 0)
			return; // viewport cull

		// float color -> multiplier in [0, 256]
		auto convertColor = [](float f) {
			int i = static_cast(f * 256.f + .5f);
			return static_cast(std::max(std::min(i, 256), 0));
		};
		// mulR multiplies texel byte 0 and mulB byte 2 (see drawPixel), so
		// color.z is applied to the lowest byte of the pixel.
		unsigned short mulR = convertColor(v1.color.z);
		unsigned short mulG = convertColor(v1.color.y);
		unsigned short mulB = convertColor(v1.color.x);
		unsigned short mulA = convertColor(v1.color.w);

		if (mulA == 0 && mulR == 0 && mulG == 0 && mulB == 0)
			return;

		// unreal-ish linear interpolation by using dither:
		// (u, v) offset pairs, 2 per row parity (row parity selects 4 entries)
		int16_t ditherBaseX = static_cast((1 << (texUVScaleBits - 2)) * img->GetInvWidth());
		int16_t ditherBaseY = static_cast((1 << (texUVScaleBits - 2)) * img->GetInvHeight());
		int16_t ditherMap[] = {
		  static_cast(-ditherBaseX), static_cast(-ditherBaseY << 1),
		  0,                         static_cast(ditherBaseY),
		  static_cast(ditherBaseX),  0,
		  static_cast(-ditherBaseX << 1), static_cast(-ditherBaseY),
		};

		// Blends one texel into `dest`. `destDepth` is only read when
		// depthTest is enabled; depth is never written here.
		auto drawPixel = [mulR, mulG, mulB, mulA](uint32_t &dest, float &destDepth,
		                                          uint32_t texture, float inDepth) {
			if (depthTest) {
				if (inDepth > destDepth) {
					return;
				}
			}
			if (texture == 0)
				return; // transparent

			unsigned int ta = static_cast(texture >> 24);
			ta += (ta >> 7); // [0, 255] -> [0, 256]

			// texel is already premultiplied. see SWImage.cpp
			unsigned int tr = static_cast((texture >> 0) & 0xff);
			unsigned int tg = static_cast((texture >> 8) & 0xff);
			unsigned int tb = static_cast((texture >> 16) & 0xff);
			tr = (tr * mulR) >> 8;
			tg = (tg * mulG) >> 8;
			tb = (tb * mulB) >> 8;

			if (ta == 256 && mulA == 256) {
				// opaque: overwrite without reading the destination
				dest = tr | (tg << 8) | (tb << 16);
				return;
			}

			ta = (ta * mulA) >> 8;

			uint32_t destCol = dest;
			unsigned int dr = static_cast((destCol >> 0) & 0xff);
			unsigned int dg = static_cast((destCol >> 8) & 0xff);
			unsigned int db = static_cast((destCol >> 16) & 0xff);

			// dest * (1 - srcAlpha) + src, saturated
			unsigned int invA = 256 - ta;
			dr = (dr * invA) >> 8;
			dg = (dg * invA) >> 8;
			db = (db * invA) >> 8;

			unsigned int outR = tr + dr, outG = tg + dg, outB = tb + db;
			outR = std::min(outR, 255U);
			outG = std::min(outG, 255U);
			outB = std::min(outB, 255U);

			dest = outR | (outG << 8) | (outB << 16);
		};

		// Draws pixels [x1, x2) of row y, clipped to [0, fbW).
		auto drawScanline = [tw, th, tpixels, bmp, fbW, fbH, depthBuffer, &drawPixel, &r,
		                     &ditherMap](int y, int x1, int x2, const SWImageVarying &vary1,
		                                 const SWImageVarying &vary2, float z1, float z2) {
			SPAssert(x1 < x2);
			int minX = std::max(x1, 0);
			int maxX = std::min(x2, fbW);
			// span entirely outside the framebuffer: nothing to draw, and
			// pixelsDrawn must not be decreased
			if (minX >= maxX)
				return;

			uint32_t *out = bmp + (y * fbW);
			// Without depth testing there is no depth row; point at a dummy
			// so that *depthOut below never dereferences a null pointer.
			float dummyDepth = 0.f;
			float *depthOut = &dummyDepth;
			if (depthTest) {
				depthOut = depthBuffer + (y * fbW);
			}

			int width = x2 - x1;
			SWImageGouraudInterpolator vary(vary1, vary2, width);
			vary.MoveNext(minX - x1);
			out += minX;
			if (depthTest) {
				depthOut += minX;
			}
			r.pixelsDrawn += maxX - minX;

			auto *ditherMap2 = ditherMap + ((y & 1) << 2);

			for (int x = minX; x < maxX; x++) {
				auto vr = vary.GetCurrent();
				unsigned int u = static_cast(vr.u);
				unsigned int v = static_cast(vr.v);
				if (linearInterpolate) {
					uint8_t idx = static_cast(x & 1);
					u += ditherMap2[idx * 2];
					v += ditherMap2[idx * 2 + 1];
				}
				// repeat (wrap) addressing, then scale to texel indices
				u &= texUVScaleInt - 1;
				v &= texUVScaleInt - 1;
				u = (u * tw) >> texUVScaleBits;
				v = (v * th) >> texUVScaleBits;

				uint32_t tex = tpixels[u + v * tw];

				// FIXME: Z interpolation
				// FIXME: perspective correction
				drawPixel(*out, *depthOut, tex, z1);

				out++;
				if (depthTest) {
					depthOut++;
				}
				vary.MoveNext();
			}
		};

		// FIXME: interpolated Z
		// "long" edge v1-v3 spans the whole triangle; the two "short" edges
		// v1-v2 and v2-v3 are walked one after the other against it.
		Interpolator longSpanX(x1, x3, y3 - y1);
		SWImageGouraudInterpolator longSpan(v1, v3, y3 - y1);
		{
			Interpolator shortSpanX(x1, x2, y2 - y1);
			SWImageGouraudInterpolator shortSpan(v1, v2, y2 - y1);
			int minY = std::max(0, y1);
			int maxY = std::min(fbH, y2);
			shortSpanX.MoveNext(minY - y1);
			shortSpan.MoveNext(minY - y1);
			longSpanX.MoveNext(minY - y1);
			longSpan.MoveNext(minY - y1);
			for (int y = minY; y < maxY; y++) {
				int lineX1 = shortSpanX.GetCurrent();
				auto line1 = shortSpan.GetCurrent();
				int lineX2 = longSpanX.GetCurrent();
				auto line2 = longSpan.GetCurrent();
				shortSpanX.MoveNext();
				shortSpan.MoveNext();
				longSpanX.MoveNext();
				longSpan.MoveNext();
				if (lineX1 == lineX2)
					continue;
				if (lineX1 < lineX2) {
					drawScanline(y, lineX1, lineX2, line1, line2, v1.position.z,
					             v1.position.z);
				} else {
					drawScanline(y, lineX2, lineX1, line2, line1, v1.position.z,
					             v1.position.z);
				}
			}
		}
		{
			Interpolator shortSpanX(x2, x3, y3 - y2);
			SWImageGouraudInterpolator shortSpan(v2, v3, y3 - y2);
			int minY = std::max(0, y2);
			int maxY = std::min(fbH, y3);
			shortSpanX.MoveNext(minY - y2);
			shortSpan.MoveNext(minY - y2);
			// The long edge is deliberately NOT advanced here. Whenever this
			// loop can run, the first half has already left it on row
			// max(0, y2); advancing it by (minY - y2) again skewed the long
			// edge whenever y2 was above the top of the framebuffer.
			for (int y = minY; y < maxY; y++) {
				int lineX1 = shortSpanX.GetCurrent();
				auto line1 = shortSpan.GetCurrent();
				int lineX2 = longSpanX.GetCurrent();
				auto line2 = longSpan.GetCurrent();
				shortSpanX.MoveNext();
				shortSpan.MoveNext();
				longSpanX.MoveNext();
				longSpan.MoveNext();
				if (lineX1 == lineX2)
					continue;
				if (lineX1 < lineX2) {
					drawScanline(y, lineX1, lineX2, line1, line2, v1.position.z,
					             v1.position.z);
				} else {
					drawScanline(y, lineX2, lineX1, line2, line1, v1.position.z,
					             v1.position.z);
				}
			}
		}

		// polygon, done!
	}

	/** Sorts the three vertices by ascending y and rasterizes the triangle. */
	static void DrawPolygonInternal(SWImage *img, const Vertex &v1, const Vertex &v2,
	                                const Vertex &v3, SWImageRenderer &r) {
		if (v2.position.y < v1.position.y) {
			if (v3.position.y < v2.position.y) {
				DrawPolygonInternalInner(img, v3, v2, v1, r);
			} else if (v3.position.y < v1.position.y) {
				DrawPolygonInternalInner(img, v2, v3, v1, r);
			} else {
				DrawPolygonInternalInner(img, v2, v1, v3, r);
			}
		} else if (v3.position.y < v1.position.y) {
			DrawPolygonInternalInner(img, v3, v1, v2, r);
		} else if (v3.position.y < v2.position.y) {
			DrawPolygonInternalInner(img, v1, v3, v2, r);
		} else {
			DrawPolygonInternalInner(img, v1, v2, v3, r);
		}
	}
};

// TODO: Non-SSE2 renderer for solid polygons

#pragma mark - SSE2

#if ENABLE_SSE2

#pragma mark General

/**
 * SSE2 textured triangle rasterizer. Same contract as the generic one;
 * pixels are blended with SSE2 integer ops and processed two at a time
 * where the span is aligned to an even x.
 */
template struct SWImageRenderer::PolygonRenderer {

	/** Rasterizes one triangle. Requires v1.y <= v2.y <= v3.y (see DrawPolygonInternal). */
	static void DrawPolygonInternalInner(SWImage *img, const Vertex &v1, const Vertex &v2,
	                                     const Vertex &v3, SWImageRenderer &r) {
		Bitmap *const fb = r.frame;
		SPAssert(fb != nullptr);
		if (v3.position.y <= 0.f) {
			// viewport cull
			return;
		}
		const int fbW = fb->GetWidth();
		const int fbH = fb->GetHeight();
		uint32_t *const bmp = fb->GetPixels();
		if (v1.position.y >= static_cast(fbH)) {
			// viewport cull
			return;
		}

		float *const depthBuffer = r.depthBuffer;
		if (depthTest) {
			SPAssert(depthBuffer != nullptr);
		}

		uint32_t *const tpixels = img->GetRawBitmap();
		const int tw = img->GetRawWidth();
		const int th = img->GetRawHeight();

		const int x1 = static_cast(v1.position.x);
		const int y1 = static_cast(v1.position.y);
		const int x2 = static_cast(v2.position.x);
		const int y2 = static_cast(v2.position.y);
		const int x3 = static_cast(v3.position.x);
		const int y3 = static_cast(v3.position.y);

		if (x1 == x2 && x2 == x3)
			return; // area cull
		if (y1 == y3)
			return; // area cull
		if (std::min(std::min(x1, x2), x3) >= fbW)
			return; // viewport cull
		if (std::max(std::max(x1, x2), x3) <= 0)
			return; // viewport cull

		// float color -> multiplier in [0, 256]
		auto convertColor = [](float f) {
			int i = static_cast(f * 256.f + .5f);
			return static_cast(std::max(std::min(i, 256), 0));
		};
		unsigned short mulR = convertColor(v1.color.x);
		unsigned short mulG = convertColor(v1.color.y);
		unsigned short mulB = convertColor(v1.color.z);
		unsigned short mulA = convertColor(v1.color.w);

		if (mulA == 0 && mulR == 0 && mulG == 0 && mulB == 0)
			return;

		// unreal-ish linear interpolation by using dither
		int16_t ditherBaseX = static_cast((1 << (texUVScaleBits - 2)) * img->GetInvWidth());
		int16_t ditherBaseY = static_cast((1 << (texUVScaleBits - 2)) * img->GetInvHeight());
		int16_t ditherMap[] = {
		  static_cast(-ditherBaseX), static_cast(-ditherBaseY << 1),
		  0,                         static_cast(ditherBaseY),
		  static_cast(ditherBaseX),  0,
		  static_cast(-ditherBaseX << 1), static_cast(-ditherBaseY),
		};
		// one vector per row parity: [du0, dv0, du1, dv1]
		__m128i ditherMap2[] = {
		  _mm_setr_epi32(ditherMap[0], ditherMap[1], ditherMap[2], ditherMap[3]),
		  _mm_setr_epi32(ditherMap[4], ditherMap[5], ditherMap[6], ditherMap[7])};

		// per-channel multipliers, repeated for two pixels
		__m128i mulCol = _mm_setr_epi16(mulB, mulG, mulR, mulA, mulB, mulG, mulR, mulA);

		// Blends one texel into `dest`. `destDepth` is only read when
		// depthTest is enabled; depth is never written here.
		auto drawPixel = [mulCol, mulA](uint32_t &dest, float &destDepth, uint32_t texture,
		                                float inDepth) {
			if (depthTest) {
				if (inDepth > destDepth) {
					return;
				}
			}
			if (texture == 0)
				return; // transparent

			unsigned int ta = static_cast(texture >> 24);
			ta += (ta >> 7); // [0, 255] -> [0, 256]

			// load [u8.0x4]
			__m128i tcol = _mm_setr_epi32(texture, 0, 0, 0);

			// convert to [u16.0x4], 8bit width
			tcol = _mm_unpacklo_epi8(tcol, _mm_setzero_si128());
			// tcol = _mm_shufflelo_epi16(tcol, 0b11000110); // swap BGR/RGB

			if (ta == 256 && mulA == 256) {
				// opaque
				tcol = _mm_mullo_epi16(tcol, mulCol);

				// pack.
				tcol = _mm_srli_epi16(tcol, 8);
				tcol = _mm_packus_epi16(tcol, tcol);

				// store.
				_mm_store_ss(reinterpret_cast(&dest), _mm_castsi128_ps(tcol));
				return;
			}

			// tcol is already premultiplied. see SWImage.cpp

			// modulate by the constant color. now [u8.8x4], 8bit width
			tcol = _mm_mullo_epi16(tcol, mulCol);

			// broadcast the alpha of the tcol.
			__m128i tcolAlphaVec = _mm_shufflelo_epi16(tcol, 0xff);

			// make tcol [u8.8x4]
			// tcol = _mm_slli_epi16(tcol, 1);
			tcolAlphaVec = _mm_srli_epi16(tcolAlphaVec, 8); // make [u16x4]8bw
			tcolAlphaVec = _mm_add_epi16(
			  tcolAlphaVec, _mm_srli_epi16(tcolAlphaVec, 7)); // [0,255] -> [0,256]

			// inverse the alpha
			tcolAlphaVec = _mm_sub_epi16(_mm_set1_epi16(0x100), tcolAlphaVec);

			// load [u8.0x4]
			__m128i dcol = _mm_setr_epi32(dest, 0, 0, 0);

			// convert to [u16.0x4], 8bit width
			dcol = _mm_unpacklo_epi8(dcol, _mm_setzero_si128());

			// modulate by inversed src alpha.
			// now [u8.8 x 4]
			dcol = _mm_mullo_epi16(dcol, tcolAlphaVec);

			// additive blending with saturation.
			dcol = _mm_adds_epu16(dcol, tcol);

			// pack.
			dcol = _mm_srli_epi16(dcol, 8);
			dcol = _mm_packus_epi16(dcol, dcol);

			// store.
			_mm_store_ss(reinterpret_cast(&dest), _mm_castsi128_ps(dcol));
		};

		// Blends two horizontally adjacent texels into dest[0] and dest[1].
		auto drawPixel2 = [mulCol, mulA, &drawPixel](uint32_t *dest, float *destDepth,
		                                             uint32_t texture1, float inDepth1,
		                                             uint32_t texture2, float inDepth2) {
			if (depthTest) {
				// If one pixel of the pair fails the depth test, fall back
				// to the single-pixel path for the other one.
				if (inDepth1 > destDepth[0]) {
					drawPixel(dest[1], destDepth[1], texture2, inDepth2);
					return;
				}
				if (inDepth2 > destDepth[1]) {
					// pixel 0 must be drawn with ITS texel and depth
					drawPixel(dest[0], destDepth[0], texture1, inDepth1);
					return;
				}
			}
			if (texture1 == 0 && texture2 == 0)
				return; // transparent

			// load [u8.0x4]
			__m128i tcol = _mm_setr_epi32(texture1, texture2, 0, 0);

			// convert to [u16.0x4, u16.0x4], 8bit width
			tcol = _mm_unpacklo_epi8(tcol, _mm_setzero_si128());
			// tcol = _mm_shufflelo_epi16(tcol, 0b11000110); // swap BGR/RGB
			// tcol = _mm_shufflehi_epi16(tcol, 0b11000110); // swap BGR/RGB

			// FIXME: no opaque fast path here (unlike drawPixel)

			// tcol is already premultiplied. see SWImage.cpp

			// modulate by the constant color. now [u8.8x4, u8.8x4], 8bit width
			tcol = _mm_mullo_epi16(tcol, mulCol);

			// broadcast the alpha of the tcol.
			__m128i tcolAlphaVec = _mm_shufflelo_epi16(tcol, 0xff);
			tcolAlphaVec = _mm_shufflehi_epi16(tcolAlphaVec, 0xff);

			// make tcol [u8.8x4]
			// tcol = _mm_slli_epi16(tcol, 1);
			tcolAlphaVec = _mm_srli_epi16(tcolAlphaVec, 8); // make [u16x4,u16x4]8bw
			tcolAlphaVec = _mm_add_epi16(
			  tcolAlphaVec, _mm_srli_epi16(tcolAlphaVec, 7)); // [0,255] -> [0,256]

			// inverse the alpha
			tcolAlphaVec = _mm_sub_epi16(_mm_set1_epi16(0x100), tcolAlphaVec);

			// load [u8.0 x 4 x 2]
			__m128i dcol = _mm_setr_epi32(dest[0], dest[1], 0, 0);

			// convert to [u16.0 x 4 x 2], 8bit width
			dcol = _mm_unpacklo_epi8(dcol, _mm_setzero_si128());

			// modulate by inversed src alpha.
			// now [u8.8 x 4 x 2]
			dcol = _mm_mullo_epi16(dcol, tcolAlphaVec);

			// additive blending with saturation.
			dcol = _mm_adds_epu16(dcol, tcol);

			// pack.
			dcol = _mm_srli_epi16(dcol, 8);
			dcol = _mm_packus_epi16(dcol, dcol);

			// store.
			_mm_store_sd(reinterpret_cast(dest), _mm_castsi128_pd(dcol));
		};

		// Draws pixels [x1, x2) of row y, clipped to [0, fbW).
		auto drawScanline = [tw, th, tpixels, bmp, fbW, fbH, depthBuffer, &drawPixel,
		                     &drawPixel2, &r, &ditherMap,
		                     &ditherMap2](int y, int x1, int x2, const SWImageVarying &vary1,
		                                  const SWImageVarying &vary2, float z1, float z2) {
			SPAssert(x1 < x2);
			int minX = std::max(x1, 0);
			int maxX = std::min(x2, fbW);
			// Span entirely outside the framebuffer. Without this check an
			// odd maxX made the "remainder" loop below emit one pixel
			// outside the span (possibly outside the row).
			if (minX >= maxX)
				return;

			uint32_t *out = bmp + (y * fbW);
			// Without depth testing there is no depth row; point at a dummy
			// so that *depthOut below never dereferences a null pointer.
			float dummyDepth = 0.f;
			float *depthOut = &dummyDepth;
			if (depthTest) {
				depthOut = depthBuffer + (y * fbW);
			}

			int width = x2 - x1;
			SWImageGouraudInterpolator vary(vary1, vary2, width);
			r.pixelsDrawn += maxX - minX;
			vary.MoveNext(minX - x1);
			out += minX;
			if (depthTest) {
				depthOut += minX;
			}

			auto uvMask = _mm_set1_epi32(texUVScaleInt - 1);
			auto uvScale = _mm_setr_epi32(tw, tw, th, th);
			uint8_t ditherIndex = static_cast(y & 1) << 1;

			// Draws the single pixel at `out` and advances by one. Uses the
			// parity of minX for dithering, which matches the pixel's x for
			// both the leading and the trailing odd pixel.
			auto unalignedPixel = [&]() {
				auto vr = vary.GetCurrent();
				union {
					__m128i uv;
					struct {
						unsigned int ui, dummy1, vi, dummy2;
					} iuv;
				};
				uv = vr.uv_m128;
				if (linearInterpolate) {
					uint8_t idx = static_cast(minX & 1) | ditherIndex;
					auto m = _mm_setr_epi32(ditherMap[idx * 2], 0, ditherMap[idx * 2 + 1], 0);
					uv = _mm_add_epi32(uv, m);
				}
				uv = _mm_and_si128(uv, uvMask); // repeat
				uv = _mm_mul_epu32(uv, uvScale); // now [u*tw, v*th]
				uv = _mm_srli_epi64(uv, texUVScaleBits);

				uint32_t tex = tpixels[iuv.ui + iuv.vi * tw];

				// FIXME: Z interpolation
				// FIXME: perspective correction
				drawPixel(*out, *depthOut, tex, z1);

				out++;
				if (depthTest) {
					depthOut++;
				}
				vary.MoveNext();
			};

			while ((minX & 1) && (minX < maxX)) {
				// non-aligned.
				unalignedPixel();
				minX++;
			}

			// an odd end leaves one pixel for after the paired loop
			int reminders = maxX & 1;
			maxX -= reminders;

			auto dither = ditherMap2[y & 1];

			for (int x = minX; x < maxX; x += 2) {
				auto vr1 = vary.GetCurrent();
				vary.MoveNext();
				auto vr2 = vary.GetCurrent();

				union {
					__m128i uv;
					struct {
						unsigned int ui, dummy1, vi, dummy2;
					} iuv;
				};
				// static_assert(texUVScaleBits == 16, "texUVScaleBits must be 16");
				uv = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(vr1.uv_m128),
				                                     _mm_castsi128_ps(vr2.uv_m128),
				                                     0x88)); // [u1,v1,u2,v2]
				if (linearInterpolate) {
					uv = _mm_add_epi32(uv, dither);
				}
				uv = _mm_shuffle_epi32(uv, 0xd8); // [u1,u2,v1,v2]
				uv = _mm_and_si128(uv, uvMask);   // repeat

				auto tm = uv;

				// _mm_mul_epu32 multiplies lanes 0 and 2 only, so pixel 1 is
				// looked up first and pixel 2 after swapping lanes
				uv = _mm_mul_epu32(uv, uvScale);
				uv = _mm_srli_epi64(uv, texUVScaleBits);
				uint32_t tex1 = tpixels[iuv.ui + iuv.vi * tw];

				uv = _mm_shuffle_epi32(tm, 0xb1); // [u2,u1,v2,v1]
				uv = _mm_mul_epu32(uv, uvScale);
				uv = _mm_srli_epi64(uv, texUVScaleBits);
				uint32_t tex2 = tpixels[iuv.ui + iuv.vi * tw];

				// FIXME: Z interpolation
				// FIXME: perspective correction
				// drawPixel(out[0], depthOut[0], tex1, z1);
				// drawPixel(out[1], depthOut[1], tex2, z1);
				drawPixel2(out, depthOut, tex1, z1, tex2, z1);

				out += 2;
				if (depthTest) {
					depthOut += 2;
				}
				vary.MoveNext();
			}

			while (reminders--) {
				unalignedPixel();
			}
		};

		// FIXME: interpolated Z
		// "long" edge v1-v3 spans the whole triangle; the two "short" edges
		// v1-v2 and v2-v3 are walked one after the other against it.
		Interpolator longSpanX(x1, x3, y3 - y1);
		SWImageGouraudInterpolator longSpan(v1, v3, y3 - y1);
		{
			Interpolator shortSpanX(x1, x2, y2 - y1);
			SWImageGouraudInterpolator shortSpan(v1, v2, y2 - y1);
			int minY = std::max(0, y1);
			int maxY = std::min(fbH, y2);
			shortSpanX.MoveNext(minY - y1);
			shortSpan.MoveNext(minY - y1);
			longSpanX.MoveNext(minY - y1);
			longSpan.MoveNext(minY - y1);
			for (int y = minY; y < maxY; y++) {
				int lineX1 = shortSpanX.GetCurrent();
				auto line1 = shortSpan.GetCurrent();
				int lineX2 = longSpanX.GetCurrent();
				auto line2 = longSpan.GetCurrent();
				shortSpanX.MoveNext();
				shortSpan.MoveNext();
				longSpanX.MoveNext();
				longSpan.MoveNext();
				if (lineX1 == lineX2)
					continue;
				if (lineX1 < lineX2) {
					drawScanline(y, lineX1, lineX2, line1, line2, v1.position.z,
					             v1.position.z);
				} else {
					drawScanline(y, lineX2, lineX1, line2, line1, v1.position.z,
					             v1.position.z);
				}
			}
		}
		{
			Interpolator shortSpanX(x2, x3, y3 - y2);
			SWImageGouraudInterpolator shortSpan(v2, v3, y3 - y2);
			int minY = std::max(0, y2);
			int maxY = std::min(fbH, y3);
			shortSpanX.MoveNext(minY - y2);
			shortSpan.MoveNext(minY - y2);
			// The long edge is deliberately NOT advanced here. Whenever this
			// loop can run, the first half has already left it on row
			// max(0, y2); advancing it by (minY - y2) again skewed the long
			// edge whenever y2 was above the top of the framebuffer.
			for (int y = minY; y < maxY; y++) {
				int lineX1 = shortSpanX.GetCurrent();
				auto line1 = shortSpan.GetCurrent();
				int lineX2 = longSpanX.GetCurrent();
				auto line2 = longSpan.GetCurrent();
				shortSpanX.MoveNext();
				shortSpan.MoveNext();
				longSpanX.MoveNext();
				longSpan.MoveNext();
				if (lineX1 == lineX2)
					continue;
				if (lineX1 < lineX2) {
					drawScanline(y, lineX1, lineX2, line1, line2, v1.position.z,
					             v1.position.z);
				} else {
					drawScanline(y, lineX2, lineX1, line2, line1, v1.position.z,
					             v1.position.z);
				}
			}
		}

		// polygon, done!
	}

	/** Sorts the three vertices by ascending y and rasterizes the triangle. */
	static void DrawPolygonInternal(SWImage *img, const Vertex &v1, const Vertex &v2,
	                                const Vertex &v3, SWImageRenderer &r) {
		if (v2.position.y < v1.position.y) {
			if (v3.position.y < v2.position.y) {
				DrawPolygonInternalInner(img, v3, v2, v1, r);
			} else if (v3.position.y < v1.position.y) {
				DrawPolygonInternalInner(img, v2, v3, v1, r);
			} else {
				DrawPolygonInternalInner(img, v2, v1, v3, r);
			}
		} else if (v3.position.y < v1.position.y) {
			DrawPolygonInternalInner(img, v3, v1, v2, r);
		} else if (v3.position.y < v2.position.y) {
			DrawPolygonInternalInner(img, v1, v3, v2, r);
		} else {
			DrawPolygonInternalInner(img, v1, v2, v3, r);
		}
	}
};

#pragma mark Solid

/**
 * SSE2 solid-color triangle rasterizer. `img` is not read; every pixel is
 * blended with the constant color of v1 (alpha taken from v1.color.w).
 */
template struct SWImageRenderer::PolygonRenderer {

	/** Rasterizes one triangle. Requires v1.y <= v2.y <= v3.y (see DrawPolygonInternal). */
	static void DrawPolygonInternalInner(SWImage *img, const Vertex &v1, const Vertex &v2,
	                                     const Vertex &v3, SWImageRenderer &r) {
		Bitmap *const fb = r.frame;
		SPAssert(fb != nullptr);
		if (v3.position.y <= 0.f) {
			// viewport cull
			return;
		}
		const int fbW = fb->GetWidth();
		const int fbH = fb->GetHeight();
		uint32_t *const bmp = fb->GetPixels();
		if (v1.position.y >= static_cast(fbH)) {
			// viewport cull
			return;
		}

		float *const depthBuffer = r.depthBuffer;
		if (depthTest) {
			SPAssert(depthBuffer != nullptr);
		}

		const int x1 = static_cast(v1.position.x);
		const int y1 = static_cast(v1.position.y);
		const int x2 = static_cast(v2.position.x);
		const int y2 = static_cast(v2.position.y);
		const int x3 = static_cast(v3.position.x);
		const int y3 = static_cast(v3.position.y);

		if (x1 == x2 && x2 == x3)
			return; // area cull
		if (y1 == y3)
			return; // area cull
		if (std::min(std::min(x1, x2), x3) >= fbW)
			return; // viewport cull
		if (std::max(std::max(x1, x2), x3) <= 0)
			return; // viewport cull

		auto convertColor = [](float f) {
			// 255.f, not 256.f here because for solid rendering
			// this color is directly used
			int i = static_cast(f * 255.f + .5f);
			return static_cast(std::max(std::min(i, 255), 0));
		};
		unsigned short mulR = convertColor(v1.color.x);
		unsigned short mulG = convertColor(v1.color.y);
		unsigned short mulB = convertColor(v1.color.z);
		unsigned short mulA = convertColor(v1.color.w);

		if (mulA == 0 && mulR == 0 && mulG == 0 && mulB == 0)
			return;

		__m128i mulCol = _mm_setr_epi16(mulB, mulG, mulR, mulA, mulB, mulG, mulR, mulA);
		// (256 - alpha), alpha mapped [0,255] -> [0,256]
		__m128i mulInv = _mm_set1_epi16(256 - (mulA + (mulA >> 7)));
		// color as [u8.8] so it can be added to dest * mulInv
		mulCol = _mm_slli_epi16(mulCol, 8);

		// Blends the constant color into `dest`. `destDepth` is only read
		// when depthTest is enabled; depth is never written here.
		auto drawPixel = [mulCol, mulInv](uint32_t &dest, float &destDepth, float inDepth) {
			if (depthTest) {
				if (inDepth > destDepth) {
					return;
				}
			}

			// load [u8.8x4]8bw
			__m128i tcol = mulCol;

			// load [u8.0x4]
			__m128i dcol = _mm_setr_epi32(dest, 0, 0, 0);

			// convert to [u16.0x4], 8bit width
			dcol = _mm_unpacklo_epi8(dcol, _mm_setzero_si128());

			// modulate by inversed src alpha.
			// now [u8.8 x 4]
			dcol = _mm_mullo_epi16(dcol, mulInv);

			// additive blending with saturation.
			dcol = _mm_adds_epu16(dcol, tcol);

			// pack.
			dcol = _mm_srli_epi16(dcol, 8);
			dcol = _mm_packus_epi16(dcol, dcol);

			// store.
			_mm_store_ss(reinterpret_cast(&dest), _mm_castsi128_ps(dcol));
		};

		// Blends the constant color into dest[0] and dest[1].
		auto drawPixel2 = [mulCol, mulInv, &drawPixel](uint32_t *dest, float *destDepth,
		                                               float inDepth1, float inDepth2) {
			if (depthTest) {
				// If one pixel of the pair fails the depth test, fall back
				// to the single-pixel path for the other one.
				if (inDepth1 > destDepth[0]) {
					drawPixel(dest[1], destDepth[1], inDepth2);
					return;
				}
				if (inDepth2 > destDepth[1]) {
					// pixel 0 must be tested against ITS depth
					drawPixel(dest[0], destDepth[0], inDepth1);
					return;
				}
			}

			// load [u8.8 x 4 x 2]
			__m128i tcol = mulCol;

			// load [u8.0 x 4 x 2]
			__m128i dcol = _mm_setr_epi32(dest[0], dest[1], 0, 0);

			// convert to [u16.0 x 4 x 2], 8bit width
			dcol = _mm_unpacklo_epi8(dcol, _mm_setzero_si128());

			// modulate by inversed src alpha.
			// now [u8.8 x 4 x 2]
			dcol = _mm_mullo_epi16(dcol, mulInv);

			// additive blending with saturation.
			dcol = _mm_adds_epu16(dcol, tcol);

			// pack.
			dcol = _mm_srli_epi16(dcol, 8);
			dcol = _mm_packus_epi16(dcol, dcol);

			// store.
			_mm_store_sd(reinterpret_cast(dest), _mm_castsi128_pd(dcol));
		};

		// Draws pixels [x1, x2) of row y, clipped to [0, fbW).
		// vary1 / vary2 are unused for solid fills.
		auto drawScanline = [bmp, fbW, fbH, depthBuffer, &drawPixel, &drawPixel2,
		                     &r](int y, int x1, int x2, const SWImageVarying &vary1,
		                         const SWImageVarying &vary2, float z1, float z2) {
			SPAssert(x1 < x2);
			int minX = std::max(x1, 0);
			int maxX = std::min(x2, fbW);
			// Span entirely outside the framebuffer. Without this check an
			// odd maxX made the "remainder" loop below emit one pixel
			// outside the span (possibly outside the row).
			if (minX >= maxX)
				return;

			uint32_t *out = bmp + (y * fbW);
			// Without depth testing there is no depth row; point at a dummy
			// so that *depthOut below never dereferences a null pointer.
			float dummyDepth = 0.f;
			float *depthOut = &dummyDepth;
			if (depthTest) {
				depthOut = depthBuffer + (y * fbW);
			}

			r.pixelsDrawn += maxX - minX;
			out += minX;
			if (depthTest) {
				depthOut += minX;
			}

			// Draws the single pixel at `out` and advances by one.
			auto unalignedPixel = [&]() {
				// FIXME: Z interpolation
				drawPixel(*out, *depthOut, z1);

				out++;
				if (depthTest) {
					depthOut++;
				}
			};

			while ((minX & 1) && (minX < maxX)) {
				// non-aligned.
				unalignedPixel();
				minX++;
			}

			// an odd end leaves one pixel for after the paired loop
			int reminders = maxX & 1;
			maxX -= reminders;

			/* for(int x = minX; x < maxX; x+=2) */
			if (maxX > minX)
				for (auto *endPtr = out + (maxX - minX); out != endPtr;) {
					// FIXME: Z interpolation
					drawPixel2(out, depthOut, z1, z1);

					out += 2;
					if (depthTest) {
						depthOut += 2;
					}
				}

			while (reminders--) {
				unalignedPixel();
			}
		};

		// FIXME: interpolated Z
		// "long" edge v1-v3 spans the whole triangle; the two "short" edges
		// v1-v2 and v2-v3 are walked one after the other against it.
		Interpolator longSpanX(x1, x3, y3 - y1);
		SWImageGouraudInterpolator longSpan(v1, v3, y3 - y1);
		{
			Interpolator shortSpanX(x1, x2, y2 - y1);
			SWImageGouraudInterpolator shortSpan(v1, v2, y2 - y1);
			int minY = std::max(0, y1);
			int maxY = std::min(fbH, y2);
			shortSpanX.MoveNext(minY - y1);
			shortSpan.MoveNext(minY - y1);
			longSpanX.MoveNext(minY - y1);
			longSpan.MoveNext(minY - y1);
			for (int y = minY; y < maxY; y++) {
				int lineX1 = shortSpanX.GetCurrent();
				auto line1 = shortSpan.GetCurrent();
				int lineX2 = longSpanX.GetCurrent();
				auto line2 = longSpan.GetCurrent();
				shortSpanX.MoveNext();
				shortSpan.MoveNext();
				longSpanX.MoveNext();
				longSpan.MoveNext();
				if (lineX1 == lineX2)
					continue;
				if (lineX1 < lineX2) {
					drawScanline(y, lineX1, lineX2, line1, line2, v1.position.z,
					             v1.position.z);
				} else {
					drawScanline(y, lineX2, lineX1, line2, line1, v1.position.z,
					             v1.position.z);
				}
			}
		}
		{
			Interpolator shortSpanX(x2, x3, y3 - y2);
			SWImageGouraudInterpolator shortSpan(v2, v3, y3 - y2);
			int minY = std::max(0, y2);
			int maxY = std::min(fbH, y3);
			shortSpanX.MoveNext(minY - y2);
			shortSpan.MoveNext(minY - y2);
			// The long edge is deliberately NOT advanced here. Whenever this
			// loop can run, the first half has already left it on row
			// max(0, y2); advancing it by (minY - y2) again skewed the long
			// edge whenever y2 was above the top of the framebuffer.
			for (int y = minY; y < maxY; y++) {
				int lineX1 = shortSpanX.GetCurrent();
				auto line1 = shortSpan.GetCurrent();
				int lineX2 = longSpanX.GetCurrent();
				auto line2 = longSpan.GetCurrent();
				shortSpanX.MoveNext();
				shortSpan.MoveNext();
				longSpanX.MoveNext();
				longSpan.MoveNext();
				if (lineX1 == lineX2)
					continue;
				if (lineX1 < lineX2) {
					drawScanline(y, lineX1, lineX2, line1, line2, v1.position.z,
					             v1.position.z);
				} else {
					drawScanline(y, lineX2, lineX1, line2, line1, v1.position.z,
					             v1.position.z);
				}
			}
		}

		// polygon, done!
	}

	/** Sorts the three vertices by ascending y and rasterizes the triangle. */
	static void DrawPolygonInternal(SWImage *img, const Vertex &v1, const Vertex &v2,
	                                const Vertex &v3, SWImageRenderer &r) {
		if (v2.position.y < v1.position.y) {
			if (v3.position.y < v2.position.y) {
				DrawPolygonInternalInner(img, v3, v2, v1, r);
			} else if (v3.position.y < v1.position.y) {
				DrawPolygonInternalInner(img, v2, v3, v1, r);
			} else {
				DrawPolygonInternalInner(img, v2, v1, v3, r);
			}
		} else if (v3.position.y < v1.position.y) {
			DrawPolygonInternalInner(img, v3, v1, v2, r);
		} else if (v3.position.y < v2.position.y) {
			DrawPolygonInternalInner(img, v1, v3, v2, r);
		} else {
			DrawPolygonInternalInner(img, v1, v2, v3, r);
		}
	}
};

#endif

#pragma mark - Intermediates

/**
 * Denormalize pass: maps vertex positions to framebuffer coordinates with
 * `position * fbSize4 + fbCenter4`, then forwards to the next pass.
 */
template struct SWImageRenderer::PolygonRenderer {
	static void DrawPolygonInternal(SWImage *img, const Vertex &v1, const Vertex &v2,
	                                const Vertex &v3, SWImageRenderer &r) {
		// denormalize
		auto vv1 = v1, vv2 = v2, vv3 = v3;
		vv1.position = (vv1.position * r.fbSize4) + r.fbCenter4;
		vv2.position = (vv2.position * r.fbSize4) + r.fbCenter4;
		vv3.position = (vv3.position * r.fbSize4) + r.fbCenter4;
		PolygonRenderer::DrawPolygonInternal(img, vv1, vv2, vv3, r);
	}
};

/**
 * Transform pass: multiplies positions by `r.matrix`, clips against the near
 * plane (z >= zNear), divides by w while keeping the pre-divide z, and
 * forwards the resulting triangle(s) to the next pass.
 */
template struct SWImageRenderer::PolygonRenderer {

	/**
	 * Clips the triangle against `plane` (a point is kept when
	 * x*plane.x + y*plane.y + z*plane.z + plane.w >= 0) and calls
	 * `continuation(a, b, c)` zero, one or two times with the surviving
	 * triangle(s). v1..v3 may be overwritten.
	 */
	template static void Clip(Vertex &v1, Vertex &v2, Vertex &v3, Vector4 plane,
	                          F continuation) {
		auto distance = [](const Vector4 &v, const Vector4 &plane) {
			return v.x * plane.x + v.y * plane.y + v.z * plane.z + plane.w;
		};
		// out = v1 + (v2 - v1) * per, for position, color and uv.
		// `out` may alias `v1`.
		auto lerpVertex = [](const Vertex &v1, const Vertex &v2, Vertex &out, float per) {
			out.position = v1.position + (v2.position - v1.position) * per;
			out.color = v1.color + (v2.color - v1.color) * per;
			out.uv = v1.uv + (v2.uv - v1.uv) * per;
		};

		float d1 = distance(v1.position, plane);
		float d2 = distance(v2.position, plane);
		float d3 = distance(v3.position, plane);
		bool nc1 = d1 >= 0.f;
		bool nc2 = d2 >= 0.f;
		bool nc3 = d3 >= 0.f;
		// bit i set <=> vertex i+1 is on the kept side
		int bits = (nc1 ? 1 : 0) | (nc2 ? 2 : 0) | (nc3 ? 4 : 0);

		float per1, per2;
		Vertex vv1, vv2, vv3;
		Vertex t1, t2;

		switch (bits) {
			case 0:
				// culled
				return;
			case 7:
				// not clipped
				continuation(v1, v2, v3);
				return;
			// one vertex kept: pull the other two onto the plane
			case 1:
				per1 = d2 / (d2 - d1); // == (0.f - d2) / (d1 - d2);
				per2 = d3 / (d3 - d1);
				lerpVertex(v2, v1, v2, per1);
				lerpVertex(v3, v1, v3, per2);
				continuation(v1, v2, v3);
				return;
			case 2:
				per1 = d1 / (d1 - d2);
				per2 = d3 / (d3 - d2);
				lerpVertex(v1, v2, v1, per1);
				lerpVertex(v3, v2, v3, per2);
				continuation(v1, v2, v3);
				return;
			case 4:
				per1 = d2 / (d2 - d3);
				per2 = d1 / (d1 - d3);
				lerpVertex(v2, v3, v2, per1);
				lerpVertex(v1, v3, v1, per2);
				continuation(v1, v2, v3);
				return;
			// two vertices kept: the remaining quad is emitted as two triangles
			case 3:
				per1 = d2 / (d2 - d3);
				per2 = d1 / (d1 - d3);
				lerpVertex(v2, v3, t2, per1);
				lerpVertex(v1, v3, t1, per2);
				vv1 = v1;
				vv2 = v2;
				vv3 = t2;
				continuation(vv1, vv2, vv3);
				vv1 = v1;
				vv2 = t2;
				vv3 = t1;
				continuation(vv1, vv2, vv3);
				break;
			case 5:
				per1 = d3 / (d3 - d2);
				per2 = d1 / (d1 - d2);
				lerpVertex(v3, v2, t2, per1);
				lerpVertex(v1, v2, t1, per2);
				vv1 = v1;
				vv2 = v3;
				vv3 = t2;
				continuation(vv1, vv2, vv3);
				vv1 = v1;
				vv2 = t2;
				vv3 = t1;
				continuation(vv1, vv2, vv3);
				break;
			case 6:
				per1 = d3 / (d3 - d1);
				per2 = d2 / (d2 - d1);
				lerpVertex(v3, v1, t2, per1);
				lerpVertex(v2, v1, t1, per2);
				vv1 = v2;
				vv2 = v3;
				vv3 = t2;
				continuation(vv1, vv2, vv3);
				vv1 = v2;
				vv2 = t2;
				vv3 = t1;
				continuation(vv1, vv2, vv3);
				break;
		}
	}

	static void DrawPolygonInternal(SWImage *img, const Vertex &v1, const Vertex &v2,
	                                const Vertex &v3, SWImageRenderer &r) {
		// needs transform.
		auto vv1 = v1, vv2 = v2, vv3 = v3;
		const auto &mat = r.matrix;
		vv1.position = mat * vv1.position;
		vv2.position = mat * vv2.position;
		vv3.position = mat * vv3.position;

		// near-plane clip: keep z - zNear >= 0
		Clip(vv1, vv2, vv3, MakeVector4(0.f, 0.f, 1.f, -r.zNear),
		     [img, &r](Vertex &v1, Vertex &v2, Vertex &v3) {
			     Vertex vv1 = v1;
			     Vertex vv2 = v2;
			     Vertex vv3 = v3;

			     // want to save Z
			     float orig1 = vv1.position.z;
			     float orig2 = vv2.position.z;
			     float orig3 = vv3.position.z;

			     // perspective divide
#if ENABLE_SSE
			     union {
				     __m128 m;
				     Vector4 v;
			     };
			     m = _mm_setr_ps(vv1.position.w, vv2.position.w, vv3.position.w, 1.f);
			     m = _mm_rcp_ps(m);
			     vv1.position *= v.x;
			     vv2.position *= v.y;
			     vv3.position *= v.z;
#else
			     vv1.position /= vv1.position.w;
			     vv2.position /= vv2.position.w;
			     vv3.position /= vv3.position.w;
#endif

			     vv1.position.z = orig1;
			     vv2.position.z = orig2;
			     vv3.position.z = orig3;

			     PolygonRenderer::DrawPolygonInternal(img, vv1, vv2, vv3, r);
			     // NOTE: a second Clip() against the plane (0, 0, -1, 1) used
			     // to follow an unconditional return here and therefore never
			     // ran; the far side is not clipped.
		     });
	}
};

/** Chooses the solid-fill renderer for a null or all-white image, the textured one otherwise. */
template struct SWImageRenderer::PolygonRenderer3 {
	static void DrawPolygonInternal(SWImage *img, const Vertex &v1, const Vertex &v2,
	                                const Vertex &v3, SWImageRenderer &r) {
		if (img == nullptr || img->IsWhiteImage()) {
			PolygonRenderer::DrawPolygonInternal(img, v1, v2, v3, r);
			return;
		}
		PolygonRenderer::DrawPolygonInternal(img, v1, v2, v3, r);
	}
};

/** Chooses the SSE2 code path when compiled in and `lvl` is at least SSE2. */
template struct SWImageRenderer::PolygonRenderer2 {
	static void DrawPolygonInternal(SWImage *img, const Vertex &v1, const Vertex &v2,
	                                const Vertex &v3, SWImageRenderer &r,
	                                SWFeatureLevel lvl) {
#if ENABLE_SSE2
		if (static_cast(lvl) >= static_cast(SWFeatureLevel::SSE2)) {
			PolygonRenderer3::DrawPolygonInternal(img, v1, v2, v3, r);
			return;
		}
#endif
		PolygonRenderer3::DrawPolygonInternal(img, v1, v2, v3, r);
	}
};

/**
 * Draws one triangle into the current framebuffer using the current shader
 * type and feature level. A framebuffer must have been set.
 */
void SWImageRenderer::DrawPolygon(SWImage *img, const Vertex &v1, const Vertex &v2,
                                  const Vertex &v3) {
	SPAssert(frame != nullptr);
	switch (shader) {
		case ShaderType::Sprite:
			PolygonRenderer2::DrawPolygonInternal(img, v1, v2, v3, *this, featureLevel);
			break;
		case ShaderType::Image:
			PolygonRenderer2::DrawPolygonInternal(img, v1, v2, v3, *this, featureLevel);
			break;
	}
}
}
}