godot_voxel/terrain/block_thread_manager.h

611 lines
18 KiB
C++

#ifndef VOXEL_BLOCK_THREAD_MANAGER_H
#define VOXEL_BLOCK_THREAD_MANAGER_H
#include "../math/rect3i.h"
#include "../math/vector3i.h"
#include "../util/array_slice.h"
#include "../util/utility.h"
#include <core/os/os.h>
#include <core/os/semaphore.h>
#include <algorithm>
#include <functional>
#include <vector>
// Base structure for an asynchronous block processing manager using threads.
// It is the same for block loading and rendering, hence made a generic one.
// - Push requests and pop requests in batch
// - One or more threads can be used
// - Minimizes sync points
// - Orders blocks to process the closest ones first
// - Merges duplicate requests
// - Cancels requests that become out of range
// - Takes some stats
template <typename InputBlockData_T, typename OutputBlockData_T>
class VoxelBlockThreadManager {
public:
static const int MAX_LOD = 32; // Like VoxelLodTerrain
static const int MAX_JOBS = 8; // Arbitrary, should be enough
// Specialization must be copyable
struct InputBlock {
InputBlockData_T data;
Vector3i position; // In LOD0 block coordinates
uint8_t lod = 0;
bool can_be_discarded = true; // If false, will always be processed, even if the thread is told to exit
float sort_heuristic = 0;
};
// Specialization must be copyable
struct OutputBlock {
OutputBlockData_T data;
Vector3i position; // In LOD0 block coordinates
unsigned int lod = 0;
// True if the block was actually dropped.
// Ideally the requester will agree that it doesn't need that block anymore,
// but in cases it still does (bad case), it will have to query it again.
bool drop_hint = false;
};
struct Input {
std::vector<InputBlock> blocks;
Vector3i priority_position; // In LOD0 block coordinates
Vector3 priority_direction; // Where the viewer is looking at
int exclusive_region_extent = 0; // Region beyond which the processor is allowed to discard requests
bool use_exclusive_region = false;
int max_lod_index = 0;
bool is_empty() const {
return blocks.empty();
}
};
struct ProcessorStats {
int file_opens = 0;
};
struct Stats {
// Generic stats
bool first = true;
uint64_t min_time = 0;
uint64_t max_time = 0;
uint64_t sorting_time = 0;
uint32_t remaining_blocks[MAX_JOBS];
uint32_t thread_count = 0;
uint32_t dropped_count = 0;
// Processor-specific
ProcessorStats processor;
Stats() {
for (int i = 0; i < MAX_JOBS; ++i) {
remaining_blocks[i] = 0;
}
}
};
struct Output {
Vector<OutputBlock> blocks;
Stats stats;
};
typedef std::function<void(ArraySlice<InputBlock>, ArraySlice<OutputBlock>, ProcessorStats &)> BlockProcessingFunc;
// TODO Make job count dynamic, don't start threads in constructor
// Creates and starts jobs.
// Processors are given as array because you could decide to either re-use the same one,
// or have clones depending on them being stateless or not.
VoxelBlockThreadManager(
unsigned int job_count,
unsigned int sync_interval_ms,
BlockProcessingFunc *processors,
bool duplicate_rejection = true,
unsigned int batch_count = 1) {
CRASH_COND(job_count < 1);
CRASH_COND(job_count >= MAX_JOBS);
_job_count = job_count;
CRASH_COND(batch_count == 0);
_batch_count = batch_count;
for (unsigned int i = 0; i < MAX_JOBS; ++i) {
JobData &job = _jobs[i];
job.job_index = i;
job.duplicate_rejection = duplicate_rejection;
job.sync_interval_ms = sync_interval_ms;
job.batch_count = batch_count;
}
for (unsigned int i = 0; i < _job_count; ++i) {
JobData &job = _jobs[i];
CRASH_COND(job.thread != nullptr);
job.input_mutex = Mutex::create();
job.output_mutex = Mutex::create();
job.semaphore = Semaphore::create();
job.thread = Thread::create(_thread_func, &job);
job.needs_sort = true;
job.processor = processors[i];
}
}
~VoxelBlockThreadManager() {
for (unsigned int i = 0; i < _job_count; ++i) {
JobData &job = _jobs[i];
job.thread_exit = true;
job.semaphore->post();
}
for (unsigned int i = 0; i < _job_count; ++i) {
JobData &job = _jobs[i];
CRASH_COND(job.thread == nullptr);
Thread::wait_to_finish(job.thread);
memdelete(job.thread);
memdelete(job.semaphore);
memdelete(job.input_mutex);
memdelete(job.output_mutex);
}
}
void push(const Input &input) {
CRASH_COND(_job_count < 1);
unsigned int replaced_blocks = 0;
unsigned int highest_pending_count = 0;
unsigned int lowest_pending_count = 0;
// Lock all inputs and gather their pending work counts
for (unsigned int job_index = 0; job_index < _job_count; ++job_index) {
JobData &job = _jobs[job_index];
job.input_mutex->lock();
highest_pending_count = MAX(highest_pending_count, job.shared_input.blocks.size());
lowest_pending_count = MIN(lowest_pending_count, job.shared_input.blocks.size());
}
unsigned int i = 0;
// We don't use a "weakest team gets it" dispatch for speed,
// So prioritize only jobs under median workload count and not just the highest.
unsigned int median_pending_count = lowest_pending_count + (highest_pending_count - lowest_pending_count) / 2;
// Dispatch to jobs with least pending requests
for (unsigned int job_index = 0; job_index < _job_count && i < input.blocks.size(); ++job_index) {
JobData &job = _jobs[job_index];
unsigned int pending_count = job.shared_input.blocks.size();
unsigned int count = MIN(median_pending_count - pending_count, input.blocks.size());
if (count > 0) {
if (i + count > input.blocks.size()) {
count = input.blocks.size() - i;
}
replaced_blocks += push_block_requests(job, input.blocks, i, count);
i += count;
}
}
// Dispatch equal count of remaining requests.
// Remainder is dispatched too until consumed through the first jobs.
unsigned int base_count = (input.blocks.size() - i) / _job_count;
unsigned int remainder = (input.blocks.size() - i) % _job_count;
for (unsigned int job_index = 0; job_index < _job_count && i < input.blocks.size(); ++job_index) {
JobData &job = _jobs[job_index];
unsigned int count = base_count;
if (remainder > 0) {
++count;
--remainder;
}
if (i + count > input.blocks.size()) {
replaced_blocks += push_block_requests(job, input.blocks, i, input.blocks.size() - i);
} else {
replaced_blocks += push_block_requests(job, input.blocks, i, count);
i += count;
}
}
// Set remaining data on all jobs, unlock inputs and resume
for (unsigned int job_index = 0; job_index < _job_count; ++job_index) {
JobData &job = _jobs[job_index];
if (job.shared_input.priority_position != input.priority_position || input.blocks.size() > 0) {
job.needs_sort = true;
}
job.shared_input.priority_position = input.priority_position;
if (input.use_exclusive_region) {
job.shared_input.use_exclusive_region = true;
job.shared_input.exclusive_region_extent = input.exclusive_region_extent;
}
bool should_run = !job.shared_input.is_empty();
job.input_mutex->unlock();
if (should_run) {
job.semaphore->post();
}
}
if (replaced_blocks > 0) {
print_line(String("VoxelBlockProcessor: {1} blocks already in queue were replaced").format(varray(replaced_blocks)));
}
}
void pop(Output &output) {
output.stats = Stats();
output.stats.thread_count = _job_count;
// Harvest results from all jobs
for (unsigned int i = 0; i < _job_count; ++i) {
JobData &job = _jobs[i];
{
MutexLock lock(job.output_mutex);
output.blocks.append_array(job.shared_output.blocks);
merge_stats(output.stats, job.shared_output.stats, i);
job.shared_output.blocks.clear();
}
}
}
static Dictionary to_dictionary(const Stats &stats) {
Dictionary d;
d["min_time"] = stats.min_time;
d["max_time"] = stats.max_time;
d["sorting_time"] = stats.sorting_time;
d["dropped_count"] = stats.dropped_count;
Array remaining_blocks;
remaining_blocks.resize(stats.thread_count);
for (unsigned int i = 0; i < stats.thread_count; ++i) {
remaining_blocks[i] = stats.remaining_blocks[i];
}
d["remaining_blocks_per_thread"] = remaining_blocks;
d["file_opens"] = stats.processor.file_opens;
return d;
}
private:
struct JobData {
// Data accessed from other threads, so they need mutexes
Input shared_input;
Output shared_output;
Mutex *input_mutex = nullptr;
Mutex *output_mutex = nullptr;
// Indexes which blocks are present in shared_input,
// so if we push a duplicate request with the same coordinates, we can discard it without a linear search
HashMap<Vector3i, int, Vector3iHasher> block_indexes[MAX_LOD];
bool needs_sort = false;
// Only read by the thread
bool thread_exit = false;
Input input;
Output output;
Semaphore *semaphore = nullptr;
Thread *thread = nullptr;
uint32_t sync_interval_ms = 100;
uint32_t job_index = -1;
bool duplicate_rejection = false;
int batch_count = 0;
BlockProcessingFunc processor;
};
static void merge_stats(Stats &a, const Stats &b, int job_index) {
a.max_time = MAX(a.max_time, b.max_time);
a.min_time = MIN(a.min_time, b.min_time);
a.remaining_blocks[job_index] = b.remaining_blocks[job_index];
a.sorting_time += b.sorting_time;
a.dropped_count += b.dropped_count;
a.processor.file_opens += b.processor.file_opens;
}
unsigned int push_block_requests(JobData &job, const std::vector<InputBlock> &input_blocks, int begin, int count) {
// The job's input must have been locked first
unsigned int replaced_blocks = 0;
unsigned int end = begin + count;
CRASH_COND(end > input_blocks.size());
for (unsigned int i = begin; i < end; ++i) {
const InputBlock &block = input_blocks[i];
CRASH_COND(block.lod >= MAX_LOD)
if (job.duplicate_rejection) {
int *index = job.block_indexes[block.lod].getptr(block.position);
// TODO When using more than one thread, duplicate rejection is less effective... is it relevant to keep it at all?
if (index) {
// The block is already in the update queue, replace it
++replaced_blocks;
job.shared_input.blocks[*index] = block;
} else {
// Append new block request
int j = job.shared_input.blocks.size();
job.shared_input.blocks.push_back(block);
job.block_indexes[block.lod][block.position] = j;
}
} else {
job.shared_input.blocks.push_back(block);
}
}
return replaced_blocks;
}
static void _thread_func(void *p_data) {
JobData *data = reinterpret_cast<JobData *>(p_data);
CRASH_COND(data == nullptr);
thread_func(*data);
}
static void thread_func(JobData &data) {
while (!data.thread_exit) {
uint32_t sync_time = OS::get_singleton()->get_ticks_msec() + data.sync_interval_ms;
unsigned int queue_index = 0;
Stats stats;
thread_sync(data, queue_index, stats, stats.sorting_time, stats.dropped_count);
// Continue to run as long as there are queries to process
while (!data.input.blocks.empty()) {
if (!data.input.blocks.empty()) {
if (data.thread_exit) {
// Remove all queries except those that can't be discarded
std::remove_if(data.input.blocks.begin(), data.input.blocks.end(),
[](const InputBlock &b) {
return b.can_be_discarded;
});
}
int input_begin = queue_index;
int batch_count = data.batch_count;
if (input_begin + batch_count > data.input.blocks.size()) {
batch_count = data.input.blocks.size() - input_begin;
}
if (batch_count > 0) {
uint64_t time_before = OS::get_singleton()->get_ticks_usec();
int output_begin = data.output.blocks.size();
data.output.blocks.resize(data.output.blocks.size() + batch_count);
for (int i = 0; i < batch_count; ++i) {
InputBlock &ib = data.input.blocks[input_begin + i];
OutputBlock &ob = data.output.blocks.write[output_begin + i];
ob.position = ib.position;
ob.lod = ib.lod;
}
data.processor(
ArraySlice<InputBlock>(&data.input.blocks[0], input_begin, input_begin + batch_count),
ArraySlice<OutputBlock>(&data.output.blocks.write[0], output_begin, output_begin + batch_count),
stats.processor);
uint64_t time_taken = (OS::get_singleton()->get_ticks_usec() - time_before) / batch_count;
// Do some stats
if (stats.first) {
stats.first = false;
stats.min_time = time_taken;
stats.max_time = time_taken;
} else {
if (time_taken < stats.min_time) {
stats.min_time = time_taken;
}
if (time_taken > stats.max_time) {
stats.max_time = time_taken;
}
}
}
queue_index += batch_count;
if (queue_index >= data.input.blocks.size()) {
data.input.blocks.clear();
}
}
uint32_t time = OS::get_singleton()->get_ticks_msec();
if (time >= sync_time || data.input.blocks.empty()) {
uint64_t sort_time;
unsigned int dropped_count;
thread_sync(data, queue_index, stats, sort_time, dropped_count);
sync_time = OS::get_singleton()->get_ticks_msec() + data.sync_interval_ms;
queue_index = 0;
stats = Stats();
stats.sorting_time = sort_time;
stats.dropped_count = dropped_count;
}
}
if (data.thread_exit) {
break;
}
// Wait for future wake-up
data.semaphore->wait();
}
}
static inline float get_priority_heuristic(const InputBlock &a, const Vector3i &viewer_block_pos, const Vector3 &viewer_direction, int max_lod) {
int f = 1 << a.lod;
Vector3i p = a.position * f;
float d = Math::sqrt(p.distance_sq(viewer_block_pos) + 0.1f);
float dp = viewer_direction.dot(viewer_block_pos.to_vec3() / d);
// Higher lod indexes come first to allow the octree to subdivide.
// Then comes distance, which is modified by how much in view the block is
return (max_lod - a.lod) * 10000.f + d + (1.f - dp) * 4.f * f;
}
struct BlockUpdateComparator {
inline bool operator()(const InputBlock &a, const InputBlock &b) const {
return a.sort_heuristic < b.sort_heuristic;
}
};
static void thread_sync(JobData &data, unsigned int queue_index, Stats stats, uint64_t &out_sort_time, unsigned int &out_dropped_count) {
if (!data.input.blocks.empty()) {
// Cleanup input vector
if (queue_index >= data.input.blocks.size()) {
data.input.blocks.clear();
} else if (queue_index > 0) {
// Shift up remaining items since we use a Vector
shift_up(data.input.blocks, queue_index);
}
}
stats.remaining_blocks[data.job_index] = data.input.blocks.size();
bool needs_sort;
// Get input
{
MutexLock lock(data.input_mutex);
// Copy requests from shared to internal
append_array(data.input.blocks, data.shared_input.blocks);
data.input.priority_position = data.shared_input.priority_position;
if (data.shared_input.use_exclusive_region) {
data.input.use_exclusive_region = true;
data.input.exclusive_region_extent = data.shared_input.exclusive_region_extent;
}
data.shared_input.blocks.clear();
if (data.duplicate_rejection) {
for (unsigned int lod_index = 0; lod_index < MAX_LOD; ++lod_index) {
data.block_indexes[lod_index].clear();
}
}
needs_sort = data.needs_sort;
data.needs_sort = false;
}
if (!data.output.blocks.empty()) {
// print_line(String("VoxelMeshUpdater: posting {0} blocks, {1} remaining ; cost [{2}..{3}] usec")
// .format(varray(_output.blocks.size(), _input.blocks.size(), stats.min_time, stats.max_time)));
// Copy output to shared
MutexLock lock(data.output_mutex);
data.shared_output.blocks.append_array(data.output.blocks);
data.shared_output.stats = stats;
data.output.blocks.clear();
}
// Cancel blocks outside exclusive region.
// We do this early because if the player keeps moving forward,
// we would keep accumulating requests forever, and that means slower sorting and memory waste
int dropped_count = 0;
if (data.input.use_exclusive_region) {
for (unsigned int i = 0; i < data.input.blocks.size(); ++i) {
const InputBlock &ib = data.input.blocks[i];
if (!ib.can_be_discarded) {
continue;
}
Rect3i box = Rect3i::from_center_extents(data.input.priority_position >> ib.lod, Vector3i(data.input.exclusive_region_extent));
if (!box.contains(ib.position)) {
// Indicate the caller that we dropped that block.
// This can help troubleshoot bugs in some situations.
OutputBlock ob;
ob.position = ib.position;
ob.lod = ib.lod;
ob.drop_hint = true;
data.output.blocks.push_back(ob);
// We'll put that block in replacement of the dropped one and pop the last cell,
// so we don't need to shift every following blocks
const InputBlock &shifted_block = data.input.blocks.back();
if (data.duplicate_rejection) {
data.block_indexes[ib.lod].erase(ib.position);
data.block_indexes[shifted_block.lod][shifted_block.position] = i;
}
// Do this last because it invalidates `ib`
data.input.blocks[i] = shifted_block;
data.input.blocks.pop_back();
// Move back to redo this index, since we replaced the current block
--i;
++dropped_count;
}
}
}
if (dropped_count > 0) {
print_line(String("Dropped {0} blocks from thread").format(varray(dropped_count)));
out_dropped_count = dropped_count;
}
uint64_t time_before = OS::get_singleton()->get_ticks_usec();
if (!data.input.blocks.empty() && needs_sort) {
for (auto it = data.input.blocks.begin(); it != data.input.blocks.end(); ++it) {
InputBlock &ib = *it;
// Set or override previous heuristic based on new infos
ib.sort_heuristic = get_priority_heuristic(ib,
data.input.priority_position,
data.input.priority_direction,
data.input.max_lod_index);
}
// Re-sort priority
SortArray<InputBlock, BlockUpdateComparator> sorter;
sorter.sort(data.input.blocks.data(), data.input.blocks.size());
}
out_sort_time = OS::get_singleton()->get_ticks_usec() - time_before;
}
JobData _jobs[MAX_JOBS];
unsigned int _job_count = 0;
unsigned int _batch_count = 0;
};
#endif // VOXEL_BLOCK_THREAD_MANAGER_H