Spread out mesh destruction in an attempt to work around slow Vulkan buffer deallocation

Marc Gilleron 2021-12-28 18:19:08 +00:00
parent ea74c1f05f
commit 541af4a4ad
6 changed files with 159 additions and 3 deletions

server/progressive_task_runner.cpp

@@ -0,0 +1,55 @@
#include "progressive_task_runner.h"
namespace zylann {
ProgressiveTaskRunner::~ProgressiveTaskRunner() {
flush();
ERR_FAIL_COND_MSG(_tasks.size() > 0, "Tasks got created in destructors?");
}
void ProgressiveTaskRunner::push(IProgressiveTask *task) {
ERR_FAIL_COND(task == nullptr);
_tasks.push(task);
}
void ProgressiveTaskRunner::process() {
const int64_t now_msec = Time::get_singleton()->get_ticks_msec();
const int64_t delta_msec = now_msec - _last_process_time_msec;
_last_process_time_msec = now_msec;
ERR_FAIL_COND(delta_msec < 0);
// The goal is to dequeue everything in S seconds.
// So if we have N tasks and `process` is called F times per second, we must dequeue N / (S * F) tasks per call.
// Put another way, if `process` is called every D seconds, we must dequeue (D * N) / S tasks.
// We make sure a minimum amount is run so the count cannot get stuck at 0.
// As the number of pending tasks decreases, we keep running the highest count we calculated;
// the clamp below resets it to MIN_COUNT once the queue is empty.
_dequeue_count = unsigned(max(int64_t(_dequeue_count), (int64_t(_tasks.size()) * delta_msec) / COMPLETION_TIME_MSEC));
_dequeue_count = min(_dequeue_count, max(MIN_COUNT, unsigned(_tasks.size())));
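// Illustration: starting from _dequeue_count == MIN_COUNT, 200 pending tasks and a
// 16 ms frame give (200 * 16) / 500 = 6 tasks this call; with 50 pending tasks,
// (50 * 16) / 500 = 1, so the MIN_COUNT floor of 4 applies instead.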
unsigned int count = _dequeue_count;
while (_tasks.size() > 0 && count > 0) {
IProgressiveTask *task = _tasks.front();
_tasks.pop();
task->run();
// TODO Call recycling function instead?
memdelete(task);
--count;
}
}
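// Runs all remaining tasks immediately, ignoring the pacing logic.
// The destructor relies on this so queued tasks are never leaked.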
void ProgressiveTaskRunner::flush() {
while (!_tasks.empty()) {
IProgressiveTask *task = _tasks.front();
_tasks.pop();
task->run();
memdelete(task);
}
}
unsigned int ProgressiveTaskRunner::get_pending_count() const {
return _tasks.size();
}
} // namespace zylann

server/progressive_task_runner.h

@@ -0,0 +1,50 @@
#ifndef PROGRESSIVE_TASK_RUNNER_H
#define PROGRESSIVE_TASK_RUNNER_H
#include "../util/math/funcs.h"
#include <core/os/time.h>
#include <queue>
namespace zylann {
// TODO It would be really nice if Godot 4's Vulkan buffer deallocation were better optimized.
// This exists to work around the terribly slow Vulkan buffer deallocation in Godot 4.
// Deallocation happens on the main thread and causes deferred stutters when a terrain
// contains a lot of chunks and the camera moves fast.
// I hate this workaround because it feels like we are almost not in control of a stable framerate.
// "Make fewer meshes" is not enough if the cost can't be addressed dynamically.
class IProgressiveTask {
public:
virtual ~IProgressiveTask() {}
virtual void run() = 0;
};
// Runs a certain number of tasks per frame, such that all pending tasks should be completed in N seconds.
// This spreads the load over time and avoids CPU spikes.
// It can be used in place of a time-slicing runner when the duration of tasks cannot be used as a cost metric.
// That is the case for tasks that defer their workload to another system to run later. It is far from perfect
// though, and is a last-resort solution when optimization and threading are not possible.
// Such tasks should preferably not be latency-sensitive,
// because they will likely run a bit later than a time-sliced task would.
class ProgressiveTaskRunner {
public:
~ProgressiveTaskRunner();
void push(IProgressiveTask *task);
void process();
void flush();
unsigned int get_pending_count() const;
private:
static const unsigned int MIN_COUNT = 4;
static const unsigned int COMPLETION_TIME_MSEC = 500;
std::queue<IProgressiveTask *> _tasks;
unsigned int _dequeue_count = MIN_COUNT;
int64_t _last_process_time_msec = 0;
};
} // namespace zylann
#endif // PROGRESSIVE_TASK_RUNNER_H
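For illustration, a minimal sketch of how the runner is meant to be driven (not part of the commit; FreeSomethingTask and the calling code are hypothetical, only the ProgressiveTaskRunner API above comes from the source):
class FreeSomethingTask : public zylann::IProgressiveTask {
public:
void run() override {
// Hand back some expensive-to-free resource here; the runner
// memdeletes the task right after run() returns.
}
};
// In the owning system:
zylann::ProgressiveTaskRunner runner;
runner.push(memnew(FreeSomethingTask)); // ownership transfers to the runner
// Once per frame: dequeues a share of pending tasks so the queue drains
// in roughly COMPLETION_TIME_MSEC (500 ms), at least MIN_COUNT (4) per call.
runner.process();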

server/voxel_server.cpp

@@ -586,6 +586,10 @@ void VoxelServer::push_time_spread_task(IVoxelTimeSpreadTask *task) {
_time_spread_task_runner.push(task);
}
void VoxelServer::push_progressive_task(zylann::IProgressiveTask *task) {
_progressive_task_runner.push(task);
}
int VoxelServer::get_main_thread_time_budget_usec() const {
return _main_thread_time_budget_usec;
}
@@ -605,6 +609,8 @@ void VoxelServer::process() {
//VOXEL_PROFILE_MARK_FRAME();
VOXEL_PROFILE_SCOPE();
VOXEL_PROFILE_PLOT("Static memory usage", int64_t(OS::get_singleton()->get_static_memory_usage()));
VOXEL_PROFILE_PLOT("TimeSpread tasks", int64_t(_time_spread_task_runner.get_pending_count()));
VOXEL_PROFILE_PLOT("Progressive tasks", int64_t(_progressive_task_runner.get_pending_count()));
// Receive data updates
_streaming_thread_pool.dequeue_completed_tasks([](IVoxelTask *task) {
@@ -622,6 +628,8 @@ void VoxelServer::process() {
// which could in turn complete right away (we avoid 1-frame delays this way).
_time_spread_task_runner.process(_main_thread_time_budget_usec);
_progressive_task_runner.process();
// Update viewer dependencies
{
const size_t viewer_count = _world.viewers.count();
@@ -696,7 +704,7 @@ VoxelServer::Stats VoxelServer::get_stats() const {
s.generation_tasks = g_debug_generate_tasks_count;
s.meshing_tasks = g_debug_mesh_tasks_count;
s.streaming_tasks = g_debug_stream_tasks_count;
-s.main_thread_tasks = _time_spread_task_runner.get_pending_count();
+s.main_thread_tasks = _time_spread_task_runner.get_pending_count() + _progressive_task_runner.get_pending_count();
return s;
}

server/voxel_server.h

@@ -5,8 +5,10 @@
#include "../meshers/blocky/voxel_mesher_blocky.h"
#include "../streams/voxel_stream.h"
#include "../util/file_locker.h"
#include "progressive_task_runner.h"
#include "struct_db.h"
#include "voxel_thread_pool.h"
#include <scene/main/node.h>
#include <memory>
@@ -155,6 +157,8 @@ public:
void push_time_spread_task(IVoxelTimeSpreadTask *task);
int get_main_thread_time_budget_usec() const;
void push_progressive_task(zylann::IProgressiveTask *task);
void push_async_task(IVoxelTask *task);
void push_async_tasks(Span<IVoxelTask *> tasks);
@@ -384,6 +388,7 @@ private:
// For tasks that can only run on the main thread and be spread out over frames
VoxelTimeSpreadTaskRunner _time_spread_task_runner;
int _main_thread_time_budget_usec = 8000;
zylann::ProgressiveTaskRunner _progressive_task_runner;
VoxelFileLocker _file_locker;
};

voxel_mesh_block.cpp

@@ -1,5 +1,7 @@
#include "voxel_mesh_block.h"
#include "../constants/voxel_string_names.h"
#include "../server/progressive_task_runner.h"
#include "../server/voxel_server.h"
#include "../util/godot/funcs.h"
#include "../util/macros.h"
#include "../util/profiling.h"
@@ -34,7 +36,43 @@ VoxelMeshBlock *VoxelMeshBlock::create(Vector3i bpos, unsigned int size, unsigne
VoxelMeshBlock::VoxelMeshBlock() {}
-VoxelMeshBlock::~VoxelMeshBlock() {}
+VoxelMeshBlock::~VoxelMeshBlock() {
// Had to resort to this in Godot 4 because deleting meshes is particularly expensive,
// due to the Vulkan allocator used by the renderer.
class FreeMeshTask : public zylann::IProgressiveTask {
public:
static inline void try_add_and_destroy(DirectMeshInstance &mi) {
if (mi.get_mesh().is_valid()) {
add(mi.get_mesh());
}
mi.destroy();
}
static void add(Ref<Mesh> mesh) {
CRASH_COND(mesh.is_null());
FreeMeshTask *task = memnew(FreeMeshTask());
task->mesh = mesh;
VoxelServer::get_singleton()->push_progressive_task(task);
}
void run() override {
#ifdef DEBUG_ENABLED
if (mesh->get_reference_count() > 1) {
WARN_PRINT("Mesh has more than one ref left, task spreading will not be effective at smoothing "
"destruction cost");
}
#endif
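// Dropping what should be the last reference frees the mesh, and with it the
// renderer's Vulkan buffers, within this frame's share of tasks.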
mesh.unref();
}
Ref<Mesh> mesh;
};
FreeMeshTask::try_add_and_destroy(_mesh_instance);
for (unsigned int i = 0; i < _transition_mesh_instances.size(); ++i) {
FreeMeshTask::try_add_and_destroy(_transition_mesh_instances[i]);
}
}
void VoxelMeshBlock::set_world(Ref<World3D> p_world) {
if (_world != p_world) {

direct_mesh_instance.cpp

@@ -24,8 +24,8 @@ void DirectMeshInstance::destroy() {
RenderingServer &vs = *RenderingServer::get_singleton();
vs.free(_mesh_instance);
_mesh_instance = RID();
-_mesh.unref();
}
+_mesh.unref();
}
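// Unreferencing outside the `if` means destroy() always releases its mesh reference,
// even when no mesh instance RID was ever created, so a deferred FreeMeshTask can hold
// the last reference and actually free the mesh.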
void DirectMeshInstance::set_world(World3D *world) {