/* Copyright (c) 2013 yvt This file is part of OpenSpades. OpenSpades is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. OpenSpades is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenSpades. If not, see . */ #include #include #include #include "SWMapRenderer.h" #include "SWRenderer.h" #include "SWUtils.h" #include #include #include #include #include #include using namespace std; DEFINE_SPADES_SETTING(r_swUndersampling, "0"); namespace spades { namespace draw { // special tan function whose value is finite. static inline float SpecialTan(float v) { static const float pi = M_PI; if (v <= -pi * 0.5f) { return -2.f; } else if (v < -pi * 0.25f) { v = -2.f - 1.f / tanf(v); } else if (v < pi * 0.25f) { v = tanf(v); } else if (v < pi * 0.5f) { v = 2.f - 1.f / tanf(v); } else { return v = 2.f; } return v; } // convert from tan value to special tan value. 
// NOTE(review): in this copy of the file every template argument list seems
// to be missing (`static_cast(x)`, `std::vector pixels`, `std::array zval`,
// `template void ...`). The code below is left token-for-token as found;
// restore the arguments from the project's own sources before building.

// Maps an ordinary tangent value onto the same scale SpecialTan() produces:
// values in [-1, 1] pass through unchanged, values below -1 become
// -2 - fastRcp(v) and values above 1 become 2 - fastRcp(v).
// fastRcp is defined outside this file (presumably a fast reciprocal).
static inline float ToSpecialTan(float v) {
	if (v < -1.f)
		return -2.f - fastRcp(v);
	else if (v > 1.f)
		return 2.f - fastRcp(v);
	else
		return v;
}

// Voxel face identifier. BuildLine() uses it both to select a per-face wall
// list of a column (values PosX..NegY index the column header) and to choose
// the shading applied to a pixel.
enum class Face : short { PosX, NegX, PosY, NegY, PosZ, NegZ };

// One sample of a Line. The three anonymous structs overlay the same storage:
// `combined`/`depth` for normal access, `color`/`filled` as bit-fields, and
// `allData` so that the whole pixel can be copied as one 64-bit value.
struct SWMapRenderer::LinePixel {
	union {
		struct {
			uint32_t combined;
			float depth;
		};
		struct {
			unsigned int color : 24;
			// Face face: 7;
			bool filled : 1;
		};
		struct {
			uint64_t allData;
		};
	};
	// using "operator =" makes this struct non-POD
	void Set(const LinePixel &p) { allData = p.allData; }
	// Marks the pixel as empty (combined == 0) and sets its depth to 10000.
	inline void Clear() {
		combined = 0;
		depth = 10000.f;
	}
	// A pixel is empty until something non-zero is stored in `combined`.
	inline bool IsEmpty() const { return combined == 0; }
};

// infinite length line from -z to +z
// `pixels` holds the samples along the line; `pitchTanMin`/`pitchScale` (and
// their 16.16 fixed-point copies `pitchTanMinI`/`pitchScaleI`) convert a
// special-tan pitch value into an index into `pixels`.
struct SWMapRenderer::Line {
	std::vector pixels;
	Vector3 horizonDir;
	float pitchTanMin;
	float pitchScale;
	int pitchTanMinI;
	int pitchScaleI;
};

// Builds the RLE data of every map column. Each column is encoded by
// BuildRle() into the scratch buffer rleBuf, copied into a block allocated
// from rleHeap, and the block handle and its byte length are recorded in
// rle[] / rleLen[] (indexed row by row, x fastest).
SWMapRenderer::SWMapRenderer(SWRenderer *r, client::GameMap *m, SWFeatureLevel level)
    : w(m->Width()),
      h(m->Height()),
      renderer(r),
      level(level),
      map(m),
      frameBuf(nullptr),
      depthBuf(nullptr),
      rleHeap(m->Width() * m->Height() * 64) {
	rle.resize(w * h);
	rleLen.resize(w * h);

	Stopwatch sw;
	sw.Reset();
	SPLog("Building RLE map...");
	int idx = 0;
	for (int y = 0; y < h; y++)
		for (int x = 0; x < w; x++) {
			BuildRle(x, y, rleBuf);

			auto ref = rleHeap.Alloc(rleBuf.size() * sizeof(RleData));
			short *ptr = rleHeap.Dereference(ref);
			std::memcpy(ptr, rleBuf.data(), rleBuf.size() * sizeof(RleData));
			rle[idx] = ref;
			rleLen[idx] = rleBuf.size() * sizeof(RleData);

			idx++;
		}
	SPLog("RLE map created in %.6f seconds", sw.GetTime());
}

SWMapRenderer::~SWMapRenderer() {}

// Encodes the column at (x, y) into `out`.
//
// Layout produced:
//  - ten zero entries reserved as a header; setHeader(idx, val) overwrites
//    header slot `idx` through a reinterpret_cast of out.data() (the cast's
//    target type is missing in this copy -- TODO confirm the slot width);
//  - list A: scanning z upwards, every solid z whose predecessor was not
//    solid, terminated by -1;
//  - list B (header slot 0 points here): scanning z downwards with `old`
//    starting as true, every solid z whose z + 1 neighbour is not solid,
//    terminated by -1;
//  - four lists (header slots 1..4), one per neighbour column in the order
//    x+1, x-1, y+1, y-1: every z that is solid here but not solid in that
//    neighbour, each terminated by -1;
//  - padding entries (value 42) until out.size() is a multiple of 4.
void SWMapRenderer::BuildRle(int x, int y, std::vector &out) {
	out.clear();

	out.push_back(0); // [0] = +Z face position address
	out.push_back(0);
	out.push_back(0); // [2] = +X face position address
	out.push_back(0);
	out.push_back(0); // [4] = -X face position address
	out.push_back(0);
	out.push_back(0); // [6] = +Y face position address
	out.push_back(0);
	out.push_back(0); // [8] = -Y face position address
	out.push_back(0);

	auto setHeader = [&](size_t idx, size_t val) {
		reinterpret_cast(out.data())[idx] = static_cast(val);
	};

	// bit z of these masks tells whether the voxel at height z is solid
	uint64_t smap = map->GetSolidMapWrapped(x, y);
	std::array adjs = {
	  map->GetSolidMapWrapped(x + 1, y), map->GetSolidMapWrapped(x - 1, y),
	  map->GetSolidMapWrapped(x, y + 1), map->GetSolidMapWrapped(x, y - 1)};

	// list A: first z of every solid run (ascending z)
	bool old = false;
	for (int z = 0; z < 64; z++) {
		bool b = (smap >> z) & 1;
		if (b && !old) {
			out.push_back(static_cast(z));
		}
		old = b;
	}
	out.push_back(-1);

	// list B: last z of every solid run (descending z); z == 63 is never
	// emitted because `old` starts out as true
	setHeader(0, out.size());
	old = true;
	for (int z = 63; z >= 0; z--) {
		bool b = (smap >> z) & 1;
		if (b && !old) {
			out.push_back(static_cast(z));
		}
		old = b;
	}
	out.push_back(-1);

	// side lists: voxels that are solid here and open in neighbour k
	for (int k = 0; k < 4; k++) {
		setHeader(k + 1, out.size());
		for (int z = 0; z < 64; z++) {
			if ((smap >> z) & 1) {
				if (!((adjs[k] >> z) & 1)) {
					out.push_back(static_cast(z));
				}
			}
		}
		out.push_back(-1);
	}

	// padding
	while (out.size() & 3) {
		out.push_back(42);
	}
}

// Re-encodes the column at (x, y): frees its current rleHeap block and stores
// a freshly built one, updating rle[] and rleLen[] for that column.
void SWMapRenderer::UpdateRle(int x, int y) {
	int idx = x + y * w;
	BuildRle(x, y, rleBuf);

	rleHeap.Free(rle[idx], rleLen[idx]);

	auto ref = rleHeap.Alloc(rleBuf.size() * sizeof(RleData));
	short *ptr = rleHeap.Dereference(ref);
	std::memcpy(ptr, rleBuf.data(), rleBuf.size() * sizeof(RleData));
	rle[idx] = ref;
	rleLen[idx] = rleBuf.size() * sizeof(RleData);
}

// Fills line.pixels for one yaw direction.
//
// A ray is marched from sceneDef.viewOrigin along line.horizonDir, one map
// column boundary at a time, until the travelled distance reaches fogDist
// (128) or the step counter cnt2 runs out. At every step the floor/ceiling
// spans of the column just left and the wall spans of the column just
// entered are projected onto the line (via `transform`) and written into
// every pixel that is still empty, so nearer surfaces win.
//
// It also stores in `line` the pitch mapping (pitchTanMin, pitchScale and
// their fixed-point copies) that RenderFinal() later uses to index pixels.
//
// NOTE(review): the `template` line has no parameter list in this copy;
// `flevel`, used in the body, is presumably its parameter -- confirm.
// NOTE(review): the minPitch / maxPitch arguments are overwritten in the
// pitch-culling block before they are ever read.
template void SWMapRenderer::BuildLine(Line &line, float minPitch, float maxPitch) {
	// hard code for further optimization
	enum { w = 512, h = 512 };
	SPAssert(map->Width() == 512);
	SPAssert(map->Height() == 512);

	const auto *rle = this->rle.data();
	auto &rleHeap = this->rleHeap;
	client::GameMap *map = this->map;

	// pitch culling
	{
		const auto &frustrum = renderer->frustrum;
		static const float pi = M_PI;
		const auto &horz = line.horizonDir;
		minPitch = -pi * 0.4999f;
		maxPitch = pi * 0.4999f;

		// makes the range empty (minPitch > maxPitch) so that the
		// "culled out" check further down returns early
		auto cull = [&minPitch, &maxPitch]() {
			minPitch = 2.f;
			maxPitch = -2.f;
		};
		// narrows [minPitch, maxPitch] using one frustum plane normal
		auto clip = [&minPitch, &maxPitch, &horz, &cull](Vector3 plane) {
			if (plane.x == 0.f && plane.y == 0.f) {
				// plane normal is vertical: keep only one half of the range
				if (plane.z > 0.f) {
					minPitch = std::max(minPitch, 0.f);
				} else {
					maxPitch = std::min(maxPitch, 0.f);
				}
			} else if (plane.z == 0.f) {
				// plane normal is horizontal: the line is either fully
				// kept or fully culled
				if (Vector3::Dot(plane, horz) < 0.f) {
					cull();
				}
			} else {
				Vector3 prj = plane;
				prj.z = 0.f;
				prj = prj.Normalize();

				float zv = fabsf(plane.z);
				float cs = Vector3::Dot(prj, horz);

				float ang = zv * zv * (1.f - cs * cs) / (cs * cs);
				ang = -cs * fastSqrt(1.f + ang);
				ang = zv / ang;
				// degenerate result: leave the range untouched
				if (isnan(ang) || isinf(ang) || ang == 0.f)
					return;

				// convert to tan
				ang = fastSqrt(1.f - ang * ang) / ang;

				// convert to angle
				ang = atanf(ang);

				if (isnan(ang) || isinf(ang))
					return;

				// 0.01 widens the kept range slightly
				if (plane.z > 0.f) {
					minPitch = std::max(minPitch, ang - 0.01f);
				} else {
					maxPitch = std::min(maxPitch, -ang + 0.01f);
				}
			}
		};

		clip(frustrum[2].n);
		clip(frustrum[3].n);
		clip(frustrum[4].n);
		clip(frustrum[5].n);
	}

	float minTan = SpecialTan(minPitch);
	float maxTan = SpecialTan(maxPitch);

	{
		float minDiff = lineResolution / 10000.f;
		if (maxTan < minTan + minDiff) {
			// too little difference; scale value might overflow.
			maxTan = minTan + minDiff;
		}
	}

	line.pitchTanMin = minTan;
	line.pitchScale = lineResolution / (maxTan - minTan);
	// 16.16 fixed-point copies consumed by RenderFinal()'s fast path
	line.pitchTanMinI = static_cast(minTan * 65536.f);
	line.pitchScaleI = static_cast(line.pitchScale * 65536.f);

	// TODO: pitch culling

	// ray direction
	float dirX = line.horizonDir.x;
	float dirY = line.horizonDir.y;
	// keep both components away from zero so the reciprocals stay finite
	if (fabsf(dirY) < 1.e-4f)
		dirY = 1.e-4f;
	if (fabsf(dirX) < 1.e-4f)
		dirX = 1.e-4f;
	float invDirX = 1.f / dirX;
	float invDirY = 1.f / dirY;
	std::int_fast8_t signX = dirX > 0.f ? 1 : -1;
	std::int_fast8_t signY = dirY > 0.f ? 1 : -1;
	// magnitudes in fixed point: reciprocals scaled by 256, direction by 512
	int invDirXI = static_cast(invDirX * 256.f);
	int invDirYI = static_cast(invDirY * 256.f);
	int dirXI = static_cast(dirX * 512.f);
	int dirYI = static_cast(dirY * 512.f);
	if (invDirXI < 0)
		invDirXI = -invDirXI;
	if (invDirYI < 0)
		invDirYI = -invDirYI;
	if (dirXI < 0)
		dirXI = -dirXI;
	if (dirYI < 0)
		dirYI = -dirYI;

	// camera position
	float cx = sceneDef.viewOrigin.x;
	float cy = sceneDef.viewOrigin.y;
	float cz = sceneDef.viewOrigin.z;

	int icz = static_cast(floorf(cz));

	// ray position
	// float rx = cx, ry = cy;
	// (fixed point, 512 units per map cell)
	int rx = static_cast(cx * 512.f);
	int ry = static_cast(cy * 512.f);

	// ray position in integer
	std::int_fast16_t irx = rx >> 9; // static_cast(floorf(rx));
	std::int_fast16_t iry = ry >> 9; // static_cast(floorf(ry));

	float fogDist = 128.f;
	float distance = 1.e-20f; // traveled path
	float invDist = 1.f / distance;

	// auto& pixels = line.pixels;

	line.pixels.resize(lineResolution);
	auto *pixels = line.pixels.data(); // std::vector feels slow...

	const float transScale = static_cast(lineResolution) / (maxTan - minTan);
	const float transOffset = -minTan * transScale;

	// reset every pixel of the line to the empty state
#if ENABLE_SSE
	if (lineResolution > 4) {
		static_assert(sizeof(LinePixel) == 8,
		              "size of LinePixel has changed; needs code modification");
		// two cleared pixels viewed as one 16-byte vector
		union {
			LinePixel pxs[2];
			__m128 m;
		};
		pxs[0].Clear();
		pxs[1].Clear();
		auto *ptr = pixels;
		// clear one by one until ptr is 16-byte aligned
		for (auto *e = pixels + lineResolution; (reinterpret_cast(ptr) & 0xf) && (ptr < e);
		     ptr++) {
			ptr->Clear();
		}
		// then two pixels per aligned store
		for (auto *e = pixels + lineResolution - 2; ptr < e; ptr += 2) {
			_mm_store_ps(reinterpret_cast(ptr), m);
		}
		// and whatever is left at the end
		for (auto *e = pixels + lineResolution; ptr < e; ptr++) {
			ptr->Clear();
		}
	} else
#endif
		for (size_t i = 0; i < lineResolution; i++)
			pixels[i].Clear();

	// if culled out, bail out now (pixels are filled)
	if (minPitch >= maxPitch)
		return;

	std::array zval; // precompute (z - cz) * some
	for (size_t i = 0; i < zval.size(); i++)
		zval[i] = (static_cast(i) - cz);

	float vmax = lineResolution + 0.5f;
	// Projects height z seen at inverse distance invDist to a position on
	// the line, clamped to [0, vmax].
	auto transform = [&zval, &transOffset, vmax, &transScale](float invDist, int z) {
		float p = ToSpecialTan(invDist * zval[z]) * transScale + transOffset;
		p = std::max(p, 0.f);
		p = std::min(p, vmax);
		return static_cast(p);
	};

	float zscale; // travel distance -> view Z value factor
	zscale = Vector3::Dot(line.horizonDir, sceneDef.viewAxis[2]);

	float heightScale; // Z value -> view Z value factor
	heightScale = sceneDef.viewAxis[2].z;

	std::array heightScaleVal; // precompute (heightScale * z)
	for (size_t i = 0; i < zval.size(); i++)
		heightScaleVal[i] = (static_cast(i) * heightScale);

	float depthBias;
	depthBias = -cz * heightScale;

	// RLE data of the column the ray currently stands in
	RleData *lastRle;
	{
		auto ref = rle[(irx & w - 1) + ((iry & h - 1) * w)];
		lastRle = rleHeap.Dereference(ref);
	}

	// `count` throttles the pitch-cull check at the end of the loop body;
	// `cnt2` is a hard upper bound on the number of steps
	std::uint_fast16_t count = 1;
	std::uint_fast16_t cnt2 = static_cast(fogDist * 8.f);

	while (distance < fogDist && (--cnt2) > 0) {
		std::int_fast16_t nextIRX, nextIRY;
		auto oirx = irx, oiry = iry;

		// DDE
		// Step to whichever cell boundary (x or y) is reached first. The
		// four branches differ only in the signs used; wallFace records the
		// face of the next column that the ray enters through.
		Face wallFace;

		if (signX > 0) {
			nextIRX = irx + 1;
			if (signY > 0) {
				nextIRY = iry + 1;

				unsigned int timeToNextX = (512 - (rx & 511)) * invDirXI;
				unsigned int timeToNextY = (512 - (ry & 511)) * invDirYI;

				if (timeToNextX < timeToNextY) {
					// go across x plane
					irx = nextIRX;
					rx = irx << 9;
					ry += (dirYI * timeToNextX) >> 17;
					distance += static_cast(timeToNextX) * (1.f / 512.f / 256.f);
					wallFace = Face::NegX;
				} else {
					// go across y plane
					iry = nextIRY;
					rx += (dirXI * timeToNextY) >> 17;
					ry = iry << 9;
					distance += static_cast(timeToNextY) * (1.f / 512.f / 256.f);
					wallFace = Face::NegY;
				}
			} else /* (signY < 0) */ {
				nextIRY = iry - 1;

				unsigned int timeToNextX = (512 - (rx & 511)) * invDirXI;
				unsigned int timeToNextY = (ry & 511) * invDirYI;

				if (timeToNextX < timeToNextY) {
					// go across x plane
					irx = nextIRX;
					rx = irx << 9;
					ry -= (dirYI * timeToNextX) >> 17;
					distance += static_cast(timeToNextX) * (1.f / 512.f / 256.f);
					wallFace = Face::NegX;
				} else {
					// go across y plane
					iry = nextIRY;
					rx += (dirXI * timeToNextY) >> 17;
					ry = (iry << 9) - 1;
					distance += static_cast(timeToNextY) * (1.f / 512.f / 256.f);
					wallFace = Face::PosY;
				}
			}
		} else /* signX < 0 */ {
			nextIRX = irx - 1;
			if (signY > 0) {
				nextIRY = iry + 1;

				unsigned int timeToNextX = (rx & 511) * invDirXI;
				unsigned int timeToNextY = (512 - (ry & 511)) * invDirYI;

				if (timeToNextX < timeToNextY) {
					// go across x plane
					irx = nextIRX;
					rx = (irx << 9) - 1;
					ry += (dirYI * timeToNextX) >> 17;
					distance += static_cast(timeToNextX) * (1.f / 512.f / 256.f);
					wallFace = Face::PosX;
				} else {
					// go across y plane
					iry = nextIRY;
					rx -= (dirXI * timeToNextY) >> 17;
					ry = iry << 9;
					distance += static_cast(timeToNextY) * (1.f / 512.f / 256.f);
					wallFace = Face::NegY;
				}
			} else /* (signY < 0) */ {
				nextIRY = iry - 1;

				unsigned int timeToNextX = (rx & 511) * invDirXI;
				unsigned int timeToNextY = (ry & 511) * invDirYI;

				if (timeToNextX < timeToNextY) {
					// go across x plane
					irx = nextIRX;
					rx = (irx << 9) - 1;
					ry -= (dirYI * timeToNextX) >> 17;
					distance += static_cast(timeToNextX) * (1.f / 512.f / 256.f);
					wallFace = Face::PosX;
				} else {
					// go across y plane
					iry = nextIRY;
					rx -= (dirXI * timeToNextY) >> 17;
					ry = (iry << 9) - 1;
					distance += static_cast(timeToNextY) * (1.f / 512.f / 256.f);
					wallFace = Face::PosY;
				}
			}
		}

		float oldInvDist = invDist;
		invDist = fastRcp(distance);

		float medDist = distance * zscale + depthBias; //(distance + oldDistance) * 0.5f;

		// check for new spans

		// Builds the pixel for voxel (x, y, z): depth `dist` plus the map
		// colour with its first and third bytes swapped and darkened
		// according to `face`.
		// NOTE(review): the two branches do not shade identically -- the
		// SSE2 one uses 1/2 for PosZ, 3/4 for PosX/PosY/NegX and halves
		// again when (col >> 24) < 100; the scalar one uses 1/4 and 1/2 and
		// ignores the top byte. Confirm whether that is intended.
		auto BuildLinePixel = [map](int x, int y, int z, Face face, float dist) {
			LinePixel px;
			px.depth = dist;
#if ENABLE_SSE
			if (flevel == SWFeatureLevel::SSE2) {
				__m128i m;
				uint32_t col = map->GetColorWrapped(x, y, z);
				m = _mm_setr_epi32(col, 0, 0, 0);
				m = _mm_unpacklo_epi8(m, _mm_setzero_si128());
				m = _mm_shufflelo_epi16(m, 0xc6);
				switch (face) {
					case Face::PosZ: m = _mm_srli_epi16(m, 1); break;
					case Face::PosX:
					case Face::PosY:
					case Face::NegX:
						m = _mm_adds_epi16(_mm_srli_epi16(m, 1), _mm_srli_epi16(m, 2));
						break;
					default: break;
				}
				if ((col >> 24) < 100) {
					m = _mm_srli_epi16(m, 1);
				}
				m = _mm_packus_epi16(m, m);
				_mm_store_ss(reinterpret_cast(&px.combined), _mm_castsi128_ps(m));
				px.filled = true;
			} else
#endif
			// non-optimized
			{
				uint32_t col;
				col = map->GetColorWrapped(x, y, z);
				col = (col & 0xff00) | ((col & 0xff) << 16) | ((col & 0xff0000) >> 16);
				switch (face) {
					case Face::PosZ: col = (col & 0xfcfcfc) >> 2; break;
					case Face::PosX:
					case Face::PosY:
					case Face::NegX: col = (col & 0xfefefe) >> 1; break;
					default: break;
				}
				px.combined = col;
				px.filled = true;
			}
			return px;
		};

		// floor/ceiling
		{
			// linear code
			// RLE scan
			// (still the column the ray has just left: oirx / oiry)
			RleData *rle = lastRle;
			{
				// list A starts right after the ten header entries
				RleData *ptr = rle + 10;
				while (*ptr != -1) {
					std::int_fast8_t z = *ptr;
					if (z > icz) {
						// span between the previous and the current distance
						std::uint_fast16_t p1 = transform(invDist, z);
						std::uint_fast16_t p2 = transform(oldInvDist, z);
						LinePixel pix = BuildLinePixel(oirx, oiry, z, Face::NegZ,
						                               medDist + heightScaleVal[z]);

						for (std::uint_fast16_t j = p1; j < p2; j++) {
							auto &p = pixels[j];
							if (!p.IsEmpty())
								continue;
							p.Set(pix);
						}
					}
					ptr++;
				}

				// skip list A's terminator; list B follows immediately
				ptr++;

				while (*ptr != -1) {
					std::int_fast8_t z = *ptr;
					if (z < icz) {
						std::uint_fast16_t p1 = transform(invDist, z + 1);
						std::uint_fast16_t p2 = transform(oldInvDist, z + 1);
						LinePixel pix = BuildLinePixel(oirx, oiry, z, Face::PosZ,
						                               medDist + heightScaleVal[z + 1]);

						for (std::uint_fast16_t j = p2; j < p1; j++) {
							auto &p = pixels[j];
							if (!p.IsEmpty())
								continue;
							p.Set(pix);
						}
					}
					ptr++;
				}
			}
		} // done: floor/ceiling

		// add walls
		{
			// by RLE map
			// (the column just entered; it becomes lastRle for the next step)
			auto ref = rle[static_cast(irx & w - 1) + static_cast(iry & h - 1) * w];
			RleData *rle = rleHeap.Dereference(ref);
			lastRle = rle;

			// jump to the side list of the face the ray came in through
			auto *ptr = rle;
			ptr += reinterpret_cast(rle)[1 + static_cast(wallFace)];

			// savedZ / savedP reuse the lower edge of the previous voxel
			// when the next one is directly adjacent
			std::uint_fast16_t savedP = 0;
			std::int_fast8_t savedZ = 127;

			while (*ptr != -1) {
				std::int_fast8_t z = *(ptr++);
				std::uint_fast16_t p1 = savedZ == z ? savedP : transform(invDist, z);
				std::uint_fast16_t p2 = transform(invDist, z + 1);
				savedZ = z + 1;
				savedP = p2;

				LinePixel pix =
				  BuildLinePixel(irx, iry, z, wallFace, medDist + heightScaleVal[z]);

				for (std::uint_fast16_t j = p1; j < p2; j++) {
					auto &p = pixels[j];
					if (!p.IsEmpty())
						continue;
					p.Set(pix);
				}
			}

		} // add wall - end

		// check pitch cull
		// (first step, then every fourth one): stop once heights 0 and 63
		// no longer project inside the line
		if ((--count) == 0) {
			if ((transform(invDist, 0) >= lineResolution - 1 && icz >= 0) ||
			    transform(invDist, 63) <= 0)
				break;
			count = 4;
		}

		// let's go to next voxel!
	}
}

// Lookup tables for fastATan(). Each table is filled for indices 0..4999:
//   sm[i]  = atan(i / 4096)              lg[i]  = atan(1 / ((i + .5) / 4096))
//   smN[i] = -atan(i / 4096)             lgN[i] = -atan(1 / ((i + .5) / 4096))
// all converted with ToFixed().
struct AtanTable {
	std::array sm;
	std::array lg;
	std::array smN;
	std::array lgN;

	// [0, 2pi] -> [0, 65536]
	// (the result is masked to 16 bits, so negative angles wrap around)
	static uint16_t ToFixed(float v) {
		v /= (M_PI * 2.f);
		v *= 65536.f;
		int i = static_cast(v);
		return static_cast(i & 65535);
	}

	AtanTable() {
		for (int i = 0; i < 5000; i++) {
			sm[i] = ToFixed(atanf(i / 4096.f));
			lg[i] = ToFixed(atanf(1.f / ((i + .5f) / 4096.f)));
			smN[i] = ToFixed(-atanf(i / 4096.f));
			lgN[i] = ToFixed(-atanf(1.f / ((i + .5f) / 4096.f)));
		}
	}
};

static AtanTable atanTable;

// Table-driven arctangent returning the angle in ToFixed() units.
// |v| < 1 indexes the sm/smN tables with |v| * 4096; otherwise the lg/lgN
// tables are indexed with 4096 / |v|. No interpolation is performed (the
// interpolating variants are commented out).
static inline uint16_t fastATan(float v) {
	if (v < 0.f) {
		if (v > -1.f) {
			v *= -4096.f;
			int idx = static_cast(v);
			// v -= idx;
			auto ret = atanTable.smN[idx];
			return ret;
		} else {
			v = fastDiv(-4096.f, v);
			int idx = static_cast(v);
			// v -= idx;
			auto ret = atanTable.lgN[idx];
			return ret;
		}
	} else {
		if (v < 1.f) {
			v *= 4096.f;
			int idx = static_cast(v);
			// v -= idx;
			auto ret = atanTable.sm[idx];
			return ret;
			// ret += (atanTable.sm[idx + 1] - ret) * v;
			// return ret;
		} else {
			v = fastDiv(4096.f, v);
			int idx = static_cast(v);
			// v -= idx;
			auto ret = atanTable.lg[idx];
			return ret;
			// ret += (atanTable.lg[idx + 1] - ret) * v;
			// return ret;
		}
	}
}

// Two-argument arctangent in the same 16-bit units (65536 per full turn).
// For x < 0 half a turn (32768) is added; the sum and the -16384 literal are
// both truncated to uint16_t on return, i.e. they wrap modulo 65536.
static inline uint16_t fastATan2(float y, float x) {
	if (x == 0.f) {
		return y > 0.f ? 16384 : -16384;
		// y > 0.f ? (pi * 0.5f) : (-pi * 0.5f);
	} else if (x > 0.f) {
		return fastATan(fastDiv(y, x));
	} else {
		return fastATan(fastDiv(y, x)) + 32768;
	}
}

// Writes colour and depth for this thread's share of the frame buffer.
//
// The columns [startX, endX) assigned to `threadId` are processed in blocks
// of blockSize (8) pixels. For every pixel the view direction is turned into
// a yaw, which selects a Line from `lines`, and a special-tan pitch, which
// selects a LinePixel in that line; its `combined` and `depth` are copied to
// `under` horizontally adjacent frame/depth buffer entries.
//
// Two code paths compute yaw and pitch:
//  - FastBlockPath evaluates them at the four block corners and interpolates
//    in fixed point;
//  - SlowBlockPath evaluates them per pixel and is taken when |v2.z| > 0.99.
//
// NOTE(review): the `template` line has no parameter list in this copy;
// `flevel` and `under` are presumably its parameters -- confirm.
// NOTE(review): both loops advance in whole blocks without a bounds check;
// startX/endX are rounded down to a multiple of 8 and fy steps by 8 up to fh,
// so this presumably relies on frame dimensions being multiples of 8 --
// verify against the caller.
template void SWMapRenderer::RenderFinal(float yawMin, float yawMax, unsigned int numLines,
                                         unsigned int threadId, unsigned int numThreads) {
	float fovX = tanf(sceneDef.fovX * 0.5f);
	float fovY = tanf(sceneDef.fovY * 0.5f);
	Vector3 front = sceneDef.viewAxis[2];
	Vector3 right = sceneDef.viewAxis[0];
	Vector3 down = sceneDef.viewAxis[1];

	unsigned int fw = frameBuf->GetWidth();
	unsigned int fh = frameBuf->GetHeight();
	uint32_t *fb = frameBuf->GetPixels();
	float *depthBuf = this->depthBuf;

	// v1 is the view direction of the first pixel; the deltas step it by one
	// pixel row (deltaDown) and by `under` pixel columns (deltaRight)
	Vector3 v1 = front - right * fovX + down * fovY;
	Vector3 deltaDown = -down * (fovY * 2.f / static_cast(fh));
	Vector3 deltaRight = right * (fovX * 2.f / static_cast(fw) * under);

	Vector2 screenPos = {-fovX, -fovY};
	float deltaScreenPosRight = fovX * 2.f / static_cast(fw);
	float deltaScreenPosDown = fovY * 2.f / static_cast(fh);

	static const float pi = M_PI;
	// yawScale: radians -> 16-bit angle units; yawScale2: 16.16 factor that
	// stretches [yawMin, yawMax] over the full 16-bit range
	float yawScale = 65536.f / (pi * 2.f);
	std::int32_t yawScale2 = static_cast(pi * 2.f / (yawMax - yawMin) * 65536.f);
	std::int32_t yawMin2 = static_cast(yawMin * yawScale);
	auto &lineList = this->lines;

	enum { blockSize = 8, hBlock = blockSize / under };

	Vector3 deltaDownLarge = deltaDown * blockSize;
	Vector3 deltaRightLarge = deltaRight * hBlock;

	// this thread's column range, aligned down to whole blocks
	unsigned int startX = threadId * fw / numThreads;
	unsigned int endX = (threadId + 1) * fw / numThreads;

	startX = (startX / blockSize) * blockSize;
	endX = (endX / blockSize) * blockSize;

	float deltaScreenPosRightSmall = deltaScreenPosRight * under;
	float deltaScreenPosDownSmall = deltaScreenPosDown;

	deltaScreenPosRight *= static_cast(blockSize);
	deltaScreenPosDown *= static_cast(blockSize);

	v1 += deltaRight * static_cast(startX / under);
	screenPos.x += deltaScreenPosRight * static_cast(startX / blockSize);

	for (unsigned int fx = startX; fx < endX; fx += blockSize) {
		Vector3 v2 = v1;
		screenPos.y = -fovY;
		for (unsigned int fy = 0; fy < fh; fy += blockSize) {
			// top-left pixel of the current block
			uint32_t *fb2 = fb + fx + fy * fw;
			float *db2 = depthBuf + fx + fy * fw;

			if (v2.z > 0.99f || v2.z < -0.99f) {
				// near to pole. cannot be approximated by piecewise
				goto SlowBlockPath;
			}

		FastBlockPath : {
			// Use bi-linear interpolation for faster yaw/pitch
			// computation.

			// yaw relative to yawMin in 16-bit units, shifted left by 8
			auto calcYawindex = [yawScale2, numLines, yawMin2](Vector3 v) {
				std::int32_t yawIndex;
				{
					float x = v.x, y = v.y;
					int yaw;
					yaw = fastATan2(y, x);
					yaw -= yawMin2;
					yawIndex = static_cast(yaw & 0xffff);
				}
				yawIndex <<= 8;
				return yawIndex;
			};
			// special-tan pitch scaled by 65536 * 8192
			auto calcPitch = [](Vector3 vv) {
				float pitch;
				pitch = vv.z * fastRSqrt(vv.x * vv.x + vv.y * vv.y);
				pitch = ToSpecialTan(pitch);
				return static_cast(pitch * (65536.f * 8192.f));
			};
			// corner samples: 1 = top-left, 2 = top-right,
			// 3 = bottom-left, 4 = bottom-right
			std::int32_t yawIndex1 = calcYawindex(v2);
			std::int32_t pitch1 = calcPitch(v2);
			std::int32_t yawIndex2 = calcYawindex(v2 + deltaRightLarge);
			std::int32_t pitch2 = calcPitch(v2 + deltaRightLarge);
			std::int32_t yawIndex3 = calcYawindex(v2 + deltaDownLarge);
			std::int32_t pitch3 = calcPitch(v2 + deltaDownLarge);
			std::int32_t yawIndex4 = calcYawindex(v2 + deltaRightLarge + deltaDownLarge);
			std::int32_t pitch4 = calcPitch(v2 + deltaRightLarge + deltaDownLarge);

			// note: `<<8>>8` is phase unwrapping
			// per-column increments along the top (1) and bottom (2) edges
			std::int32_t yawDiff1 = ((yawIndex2 - yawIndex1) << 8 >> 8) / hBlock;
			std::int32_t yawDiff2 = ((yawIndex4 - yawIndex3) << 8 >> 8) / hBlock;
			std::int32_t pitchDiff1 = (pitch2 - pitch1) / hBlock;
			std::int32_t pitchDiff2 = (pitch4 - pitch3) / hBlock;

			// A walks the top edge, B the bottom edge
			std::int32_t yawIndexA = yawIndex1;
			std::int32_t yawIndexB = yawIndex3;
			std::int32_t pitchA = pitch1;
			std::int32_t pitchB = pitch3;

			for (unsigned int x = 0; x < blockSize; x += under) {
				uint32_t *fb3 = fb2 + x;
				auto *db3 = db2 + x;

				// C interpolates from A to B down the column
				std::int32_t yawIndexC = yawIndexA;
				std::int32_t yawDelta = ((yawIndexB - yawIndexA) << 8 >> 8) / blockSize;
				std::int32_t pitchC = pitchA;
				std::int32_t pitchDelta = (pitchB - pitchA) / blockSize;

				for (unsigned int y = 0; y < blockSize; y++) {
					// back to 16 bits, then scale to a line index
					std::uint32_t yawIndex = static_cast(yawIndexC << 8 >> 16);
					yawIndex = (yawIndex * yawScale2) >> 16;
					yawIndex = (yawIndex * numLines) >> 16;
					auto &line = lineList[yawIndex];
					auto *pixels = line.pixels.data();

					// solve pitch
					std::int32_t pitchIndex;
					{
						pitchIndex = pitchC >> 13;
						pitchIndex -= line.pitchTanMinI;
						pitchIndex =
						  static_cast((static_cast(pitchIndex) *
						               static_cast(line.pitchScaleI)) >>
						              32);
						// pitch = (pitch - line.pitchTanMin) * line.pitchScale;
						// pitchIndex = static_cast(pitch);
						// wrap instead of clamping
						pitchIndex &= lineResolution - 1;
						// pitchIndex = std::max(pitchIndex, 0);
						// pitchIndex = std::min(pitchIndex, lineResolution - 1);
					}
					auto &pix = pixels[pitchIndex];

					// write color.
					// NOTE: combined contains both color and other information,
					// though this isn't a problem as long as the color comes
					// in the LSB's
#if ENABLE_SSE
					if (flevel == SWFeatureLevel::SSE2) {
						__m128i m;

						if (under == 1) {
							*fb3 = pix.combined;
							*db3 = pix.depth;
						} else if (under == 2) {
							// broadcast colour / depth to two entries each
							m = _mm_castpd_si128(
							  _mm_load_sd(reinterpret_cast(&pix)));
							_mm_store_sd(reinterpret_cast(fb3),
							             _mm_castsi128_pd(_mm_shuffle_epi32(m, 0x00)));
							_mm_store_sd(reinterpret_cast(db3),
							             _mm_castsi128_pd(_mm_shuffle_epi32(m, 0x55)));
						} else if (under == 4) {
							// broadcast to four entries with streaming stores
							m = _mm_castpd_si128(
							  _mm_load_sd(reinterpret_cast(&pix)));
							_mm_stream_si128(reinterpret_cast<__m128i *>(fb3),
							                 _mm_shuffle_epi32(m, 0x00));
							_mm_stream_si128(reinterpret_cast<__m128i *>(db3),
							                 _mm_shuffle_epi32(m, 0x55));
						}

					} else
#endif
					// non-optimized
					{
						uint32_t col = pix.combined;
						float d = pix.depth;

						for (int k = 0; k < under; k++) {
							fb3[k] = col;
							db3[k] = d;
						}
					}

					// next row
					fb3 += fw;
					db3 += fw;

					yawIndexC += yawDelta;
					pitchC += pitchDelta;
				}

				yawIndexA += yawDiff1;
				yawIndexB += yawDiff2;
				pitchA += pitchDiff1;
				pitchB += pitchDiff2;
			}
		}
			goto Converge;

		SlowBlockPath : {
			// per-pixel yaw/pitch evaluation
			Vector3 v3 = v2;
			Vector2 screenPos2 = screenPos;
			for (unsigned int x = 0; x < blockSize; x += under) {
				Vector3 v4 = v3;
				uint32_t *fb3 = fb2 + x;
				auto *db3 = db2 + x;
				screenPos2.y = screenPos.y;

				for (unsigned int y = 0; y < blockSize; y++) {
					Vector3 vv = v4;

					// solve yaw
					std::uint32_t yawIndex;
					{
						float x = vv.x, y = vv.y;
						int yaw;
						yaw = fastATan2(y, x);
						yaw -= yawMin2;
						yawIndex = static_cast(yaw & 0xffff);
					}
					yawIndex = (yawIndex * yawScale2) >> 16;
					yawIndex = (yawIndex * numLines) >> 16;
					auto &line = lineList[yawIndex];
					auto *pixels = line.pixels.data();

					// solve pitch
					std::int32_t pitchIndex;
					{
						float pitch;
						pitch = vv.z * fastRSqrt(vv.x * vv.x + vv.y * vv.y);
						pitch = ToSpecialTan(pitch);
						pitch = (pitch - line.pitchTanMin) * line.pitchScale;
						pitchIndex = static_cast(pitch);
						// wrap instead of clamping
						pitchIndex &= lineResolution - 1;
						// pitchIndex = std::max(pitchIndex, 0);
						// pitchIndex = std::min(pitchIndex, lineResolution - 1);
					}
					auto &pix = pixels[pitchIndex];

					// write color.
					// NOTE: combined contains both color and other information,
					// though this isn't a problem as long as the color comes
					// in the LSB's
#if ENABLE_SSE
					if (flevel == SWFeatureLevel::SSE2) {
						__m128i m;

						if (under == 1) {
							*fb3 = pix.combined;
							*db3 = pix.depth;
						} else if (under == 2) {
							m = _mm_castpd_si128(
							  _mm_load_sd(reinterpret_cast(&pix)));
							_mm_store_sd(reinterpret_cast(fb3),
							             _mm_castsi128_pd(_mm_shuffle_epi32(m, 0x00)));
							_mm_store_sd(reinterpret_cast(db3),
							             _mm_castsi128_pd(_mm_shuffle_epi32(m, 0x55)));
						} else if (under == 4) {
							m = _mm_castpd_si128(
							  _mm_load_sd(reinterpret_cast(&pix)));
							_mm_stream_si128(reinterpret_cast<__m128i *>(fb3),
							                 _mm_shuffle_epi32(m, 0x00));
							_mm_stream_si128(reinterpret_cast<__m128i *>(db3),
							                 _mm_shuffle_epi32(m, 0x55));
						}

					} else
#endif
					// non-optimized
					{
						uint32_t col = pix.combined;
						float d = pix.depth;

						for (int k = 0; k < under; k++) {
							fb3[k] = col;
							db3[k] = d;
						}
					}

					fb3 += fw;
					db3 += fw;

					v4 += deltaDown;
					screenPos2.y += deltaScreenPosDownSmall;
				} // y
				v3 += deltaRight;
				screenPos2.x += deltaScreenPosRightSmall;
			} // x

		} // end SlowBlockPath

		Converge:

			// both paths continue here: advance to the next block row
			v2 += deltaDownLarge;
			screenPos.y += deltaScreenPosDown;
		} // fy
		v1 += deltaRightLarge;
		screenPos.x += deltaScreenPosRight;
	} // fx
}

// Renders one frame: chooses the yaw range, line resolution and number of
// lines, assigns each line its horizontal direction, builds all lines in
// parallel (BuildLine) and finally resolves them into `frame` and
// `depthBuffer` in parallel (RenderFinal). frameBuf/depthBuf are only set
// for the duration of the call.
//
// NOTE(review): the `template` line and the BuildLine/RenderFinal calls have
// no template arguments in this copy, which is why the three RenderFinal
// calls at the end look identical -- restore them from the project sources.
template void SWMapRenderer::RenderInner(const client::SceneDefinition &def, Bitmap *frame,
                                         float *depthBuffer) {
	sceneDef = def;
	frameBuf = frame;
	depthBuf = depthBuffer;

	// calculate line density.
	float yawMin, yawMax;
	float pitchMin, pitchMax;
	size_t numLines;
	{
		float fovX = tanf(def.fovX * 0.5f);
		float fovY = tanf(def.fovY * 0.5f);
		float fovDiag = sqrtf(fovX * fovX + fovY * fovY);
		float fovDiagAng = atanf(fovDiag);
		float pitch = asinf(def.viewAxis[2].z);
		static const float pi = M_PI;

		// pitch = 0.f;

		if (fabsf(pitch) >= pi * 0.49f - fovDiagAng) {
			// pole is visible
			yawMin = 0.f;
			yawMax = pi * 2.f;
		} else {
			float yaw = atan2l(def.viewAxis[2].y, def.viewAxis[2].x);
			// TODO: incorrect!
			yawMin = yaw - pi * .5f; // fovDiagAng;
			yawMax = yaw + pi * .5f; // fovDiagAng;
		}

		pitchMin = pitch - fovDiagAng;
		pitchMax = pitch + fovDiagAng;
		// reflect the part of the range that extends past a pole
		if (pitchMin < -pi * 0.5f) {
			pitchMax = std::max(pitchMax, -pi - pitchMin);
			pitchMin = -pi * 0.5f;
		}
		if (pitchMax > pi * 0.5f) {
			pitchMin = std::min(pitchMin, pi - pitchMax);
			pitchMax = pi * 0.5f;
		}

		// pitch of PI/2 will make tan(x) infinite
		pitchMin = std::max(pitchMin, -pi * 0.4999f);
		pitchMax = std::min(pitchMax, pi * 0.4999f);

		float interval = static_cast(frame->GetHeight());
		interval = fovY * 2.f / interval;
		// NOTE(review): this first value is overwritten by the next statement
		lineResolution = static_cast((pitchMax - pitchMin) / interval * 1.5f);
		lineResolution = frame->GetHeight();
		// round down to a power of two (pixel indices are masked with
		// lineResolution - 1 elsewhere)
		for (int i = lineResolution, j = 1; j <= i; j <<= 1) {
			lineResolution = j;
		}
		if (pitchMin > 0.f) {
			// interval /= cosf(pitchMin);
		} else if (pitchMax < 0.f) {
			// interval /= cosf(pitchMax);
		}

		numLines = static_cast((yawMax - yawMin) / interval);

		// NOTE(review): `under` is clamped to [1, 4] here, while the
		// RenderFinal dispatch at the end re-reads the setting unclamped
		// and maps 3 to its last branch.
		int under = r_swUndersampling;
		under = std::max(std::min(under, 4), 1);
		numLines /= under;

		if (numLines < 8)
			numLines = 8;
		if (numLines > 65536) {
			numLines = 65536;
			// SPRaise("Too many lines emit: %d", static_cast(numLines));
		}
		// the line pool only ever grows
		lines.resize(std::max(numLines, lines.size()));
		/*
		 SPLog("numlines: %d, each %f deg, and %d res",
		 static_cast(numLines),
		 interval * 180.f / pi,
		 static_cast(lineResolution));*/
	}

	// calculate vector for each lines
	{
		float scl = (yawMax - yawMin) / numLines;
		Vector3 horiz = Vector3::Make(cosf(yawMin), sinf(yawMin), 0.f);
		float c = cosf(scl);
		float s = sinf(scl);
		for (size_t i = 0; i < numLines; i++) {
			Line &l = lines[i];
			l.horizonDir = horiz;

			// rotate by `scl` radians for the next line
			float x = horiz.x * c - horiz.y * s;
			float y = horiz.x * s + horiz.y * c;
			horiz.x = x;
			horiz.y = y;
		}
	}

	{
		unsigned int nlines = static_cast(numLines);
		// each worker builds a contiguous slice of the lines
		InvokeParallel2([&](unsigned int th, unsigned int numThreads) {
			unsigned int start = th * nlines / numThreads;
			unsigned int end = (th + 1) * nlines / numThreads;

			for (size_t i = start; i < end; i++) {
				BuildLine(lines[i], pitchMin, pitchMax);
			}
		});
	}

	int under = r_swUndersampling;

	InvokeParallel2([&](unsigned int th, unsigned int numThreads) {

		if (under <= 1) {
			RenderFinal(yawMin, yawMax, static_cast(numLines), th, numThreads);
		} else if (under <= 2) {
			RenderFinal(yawMin, yawMax, static_cast(numLines), th, numThreads);
		} else {
			RenderFinal(yawMin, yawMax, static_cast(numLines), th, numThreads);
		}
	});

	frameBuf = nullptr;
	depthBuf = nullptr;
}

// Public entry point. Reports null `frame` / `depthBuffer` through
// SPInvalidArgument, returns without drawing when the floored view origin
// lies inside a solid voxel, and otherwise forwards to RenderInner (under
// ENABLE_SSE2, to the branch taken when `level` is at least SSE2).
void SWMapRenderer::Render(const client::SceneDefinition &def, Bitmap *frame,
                           float *depthBuffer) {
	if (!frame)
		SPInvalidArgument("frame");
	if (!depthBuffer)
		SPInvalidArgument("depthBuffer");

	auto p = def.viewOrigin.Floor();
	if (map->IsSolidWrapped(p.x, p.y, p.z)) {
		return;
	}

#if ENABLE_SSE2
	if (static_cast(level) >= static_cast(SWFeatureLevel::SSE2)) {
		RenderInner(def, frame, depthBuffer);
		return;
	}
#endif

	RenderInner(def, frame, depthBuffer);
}
}
}