Spread out mesh destruction in an attempt to work around slow Vulkan buffer deallocation

Marc Gilleron 2021-12-28 18:19:08 +00:00
parent ea74c1f05f
commit 541af4a4ad
6 changed files with 159 additions and 3 deletions

server/progressive_task_runner.cpp

@@ -0,0 +1,55 @@
#include "progressive_task_runner.h"
namespace zylann {
ProgressiveTaskRunner::~ProgressiveTaskRunner() {
flush();
ERR_FAIL_COND_MSG(_tasks.size() > 0, "Tasks got created in destructors?");
}
void ProgressiveTaskRunner::push(IProgressiveTask *task) {
ERR_FAIL_COND(task == nullptr);
_tasks.push(task);
}
void ProgressiveTaskRunner::process() {
const int64_t now_msec = Time::get_singleton()->get_ticks_msec();
const int64_t delta_msec = now_msec - _last_process_time_msec;
_last_process_time_msec = now_msec;
ERR_FAIL_COND(delta_msec < 0);
// The goal is to dequeue everything in S seconds.
// So if we have N tasks and `process` is called F times per second, we must dequeue N / (S * F) tasks per call.
// Put another way, if `process` is called every D seconds, we must dequeue (D * N) / S tasks.
// We make sure a minimum amount is run so the count cannot get stuck at 0.
// As the number of pending tasks decreases, we keep running the highest count we calculated;
// the clamp below resets it to MIN_COUNT once the queue is empty.
_dequeue_count = unsigned(max(int64_t(_dequeue_count), (int64_t(_tasks.size()) * delta_msec) / COMPLETION_TIME_MSEC));
_dequeue_count = min(_dequeue_count, max(MIN_COUNT, unsigned(_tasks.size())));
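// Illustration: starting from _dequeue_count == MIN_COUNT, 200 pending tasks and a
// 16 ms frame give (200 * 16) / 500 = 6 tasks this call; with 50 pending tasks,
// (50 * 16) / 500 = 1, so the MIN_COUNT floor of 4 applies instead.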
unsigned int count = _dequeue_count;
while (_tasks.size() > 0 && count > 0) {
IProgressiveTask *task = _tasks.front();
_tasks.pop();
task->run();
// TODO Call recycling function instead?
memdelete(task);
--count;
}
}
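// Runs all remaining tasks immediately, ignoring the pacing logic.
// The destructor relies on this so queued tasks are never leaked.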
void ProgressiveTaskRunner::flush() {
while (!_tasks.empty()) {
IProgressiveTask *task = _tasks.front();
_tasks.pop();
task->run();
memdelete(task);
}
}
unsigned int ProgressiveTaskRunner::get_pending_count() const {
return _tasks.size();
}
} // namespace zylann

server/progressive_task_runner.h

@@ -0,0 +1,50 @@
#ifndef PROGRESSIVE_TASK_RUNNER_H
#define PROGRESSIVE_TASK_RUNNER_H
#include "../util/math/funcs.h"
#include <core/os/time.h>
#include <queue>
namespace zylann {
// TODO It would be really nice if Godot 4's Vulkan buffer deallocation were better optimized.
// This exists to work around the terribly slow Vulkan buffer deallocation in Godot 4.
// Deallocation happens on the main thread and causes deferred stutters when a terrain
// contains a lot of chunks and the camera moves fast.
// I hate this workaround because it feels like we are almost not in control of a stable framerate.
// "Make fewer meshes" is not enough if the cost can't be addressed dynamically.
class IProgressiveTask {
public:
virtual ~IProgressiveTask() {}
virtual void run() = 0;
};
// Runs a certain number of tasks per frame, such that all pending tasks should be completed in N seconds.
// This spreads the load over time and avoids CPU spikes.
// It can be used in place of a time-slicing runner when the duration of tasks cannot be used as a cost metric.
// That is the case for tasks that defer their workload to another system to run later. It is far from perfect
// though, and is a last-resort solution when optimization and threading are not possible.
// Such tasks should preferably not be latency-sensitive,
// because they will likely run a bit later than a time-sliced task would.
class ProgressiveTaskRunner {
public:
~ProgressiveTaskRunner();
void push(IProgressiveTask *task);
void process();
void flush();
unsigned int get_pending_count() const;
private:
static const unsigned int MIN_COUNT = 4;
static const unsigned int COMPLETION_TIME_MSEC = 500;
std::queue<IProgressiveTask *> _tasks;
unsigned int _dequeue_count = MIN_COUNT;
int64_t _last_process_time_msec = 0;
};
} // namespace zylann
#endif // PROGRESSIVE_TASK_RUNNER_H
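For illustration, a minimal sketch of how the runner is meant to be driven (not part of the commit; FreeSomethingTask and the calling code are hypothetical, only the ProgressiveTaskRunner API above comes from the source):
class FreeSomethingTask : public zylann::IProgressiveTask {
public:
void run() override {
// Hand back some expensive-to-free resource here; the runner
// memdeletes the task right after run() returns.
}
};
// In the owning system:
zylann::ProgressiveTaskRunner runner;
runner.push(memnew(FreeSomethingTask)); // ownership transfers to the runner
// Once per frame: dequeues a share of pending tasks so the queue drains
// in roughly COMPLETION_TIME_MSEC (500 ms), at least MIN_COUNT (4) per call.
runner.process();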

server/voxel_server.cpp

@@ -586,6 +586,10 @@ void VoxelServer::push_time_spread_task(IVoxelTimeSpreadTask *task) {
_time_spread_task_runner.push(task);
}
void VoxelServer::push_progressive_task(zylann::IProgressiveTask *task) {
_progressive_task_runner.push(task);
}
int VoxelServer::get_main_thread_time_budget_usec() const {
return _main_thread_time_budget_usec;
}
@@ -605,6 +609,8 @@ void VoxelServer::process() {
//VOXEL_PROFILE_MARK_FRAME();
VOXEL_PROFILE_SCOPE();
VOXEL_PROFILE_PLOT("Static memory usage", int64_t(OS::get_singleton()->get_static_memory_usage()));
VOXEL_PROFILE_PLOT("TimeSpread tasks", int64_t(_time_spread_task_runner.get_pending_count()));
VOXEL_PROFILE_PLOT("Progressive tasks", int64_t(_progressive_task_runner.get_pending_count()));
// Receive data updates
_streaming_thread_pool.dequeue_completed_tasks([](IVoxelTask *task) {
@@ -622,6 +628,8 @@ void VoxelServer::process() {
// which could in turn complete right away (we avoid 1-frame delays this way).
_time_spread_task_runner.process(_main_thread_time_budget_usec);
_progressive_task_runner.process();
// Update viewer dependencies
{
const size_t viewer_count = _world.viewers.count();
@@ -696,7 +704,7 @@ VoxelServer::Stats VoxelServer::get_stats() const {
s.generation_tasks = g_debug_generate_tasks_count;
s.meshing_tasks = g_debug_mesh_tasks_count;
s.streaming_tasks = g_debug_stream_tasks_count;
-s.main_thread_tasks = _time_spread_task_runner.get_pending_count();
+s.main_thread_tasks = _time_spread_task_runner.get_pending_count() + _progressive_task_runner.get_pending_count();
return s;
}

server/voxel_server.h

@@ -5,8 +5,10 @@
#include "../meshers/blocky/voxel_mesher_blocky.h"
#include "../streams/voxel_stream.h"
#include "../util/file_locker.h"
#include "progressive_task_runner.h"
#include "struct_db.h"
#include "voxel_thread_pool.h"
#include <scene/main/node.h>
#include <memory>
@@ -155,6 +157,8 @@ public:
void push_time_spread_task(IVoxelTimeSpreadTask *task);
int get_main_thread_time_budget_usec() const;
void push_progressive_task(zylann::IProgressiveTask *task);
void push_async_task(IVoxelTask *task);
void push_async_tasks(Span<IVoxelTask *> tasks);
@@ -384,6 +388,7 @@ private:
// For tasks that can only run on the main thread and be spread out over frames
VoxelTimeSpreadTaskRunner _time_spread_task_runner;
int _main_thread_time_budget_usec = 8000;
zylann::ProgressiveTaskRunner _progressive_task_runner;
VoxelFileLocker _file_locker;
};

voxel_mesh_block.cpp

@@ -1,5 +1,7 @@
#include "voxel_mesh_block.h"
#include "../constants/voxel_string_names.h"
#include "../server/progressive_task_runner.h"
#include "../server/voxel_server.h"
#include "../util/godot/funcs.h"
#include "../util/macros.h"
#include "../util/profiling.h"
@@ -34,7 +36,43 @@ VoxelMeshBlock *VoxelMeshBlock::create(Vector3i bpos, unsigned int size, unsigne
VoxelMeshBlock::VoxelMeshBlock() {}
-VoxelMeshBlock::~VoxelMeshBlock() {}
+VoxelMeshBlock::~VoxelMeshBlock() {
// Had to resort to this in Godot 4 because deleting meshes is particularly expensive,
// due to the Vulkan allocator used by the renderer.
class FreeMeshTask : public zylann::IProgressiveTask {
public:
static inline void try_add_and_destroy(DirectMeshInstance &mi) {
if (mi.get_mesh().is_valid()) {
add(mi.get_mesh());
}
mi.destroy();
}
static void add(Ref<Mesh> mesh) {
CRASH_COND(mesh.is_null());
FreeMeshTask *task = memnew(FreeMeshTask());
task->mesh = mesh;
VoxelServer::get_singleton()->push_progressive_task(task);
}
void run() override {
#ifdef DEBUG_ENABLED
if (mesh->get_reference_count() > 1) {
WARN_PRINT("Mesh has more than one ref left, task spreading will not be effective at smoothing "
"destruction cost");
}
#endif
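// Dropping what should be the last reference frees the mesh, and with it the
// renderer's Vulkan buffers, within this frame's share of tasks.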
mesh.unref();
}
Ref<Mesh> mesh;
};
FreeMeshTask::try_add_and_destroy(_mesh_instance);
for (unsigned int i = 0; i < _transition_mesh_instances.size(); ++i) {
FreeMeshTask::try_add_and_destroy(_transition_mesh_instances[i]);
}
}
void VoxelMeshBlock::set_world(Ref<World3D> p_world) {
if (_world != p_world) {

direct_mesh_instance.cpp

@@ -24,8 +24,8 @@ void DirectMeshInstance::destroy() {
RenderingServer &vs = *RenderingServer::get_singleton();
vs.free(_mesh_instance);
_mesh_instance = RID();
-_mesh.unref();
}
+_mesh.unref();
}
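// Unreferencing outside the `if` means destroy() always releases its mesh reference,
// even when no mesh instance RID was ever created, so a deferred FreeMeshTask can hold
// the last reference and actually free the mesh.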
void DirectMeshInstance::set_world(World3D *world) {