Align program memory of VoxelGraphRuntime

This commit is contained in:
Marc Gilleron 2021-10-28 19:32:38 +01:00
parent 3133834a17
commit 38ec2f43d1
3 changed files with 110 additions and 75 deletions

View File

@ -17,23 +17,6 @@
//#define VOXEL_DEBUG_GRAPH_PROG_SENTINEL uint16_t(12345) // 48, 57 (base 10)
//#endif
// Interprets the bytes of `mem` starting at offset `p` as a `T`, and moves `p` past them.
// Returns a reference into `mem` (no copy is made).
// In debug builds, crashes if the read would go past the end of `mem`.
template <typename T>
inline const T &read(const Span<const uint8_t> &mem, uint32_t &p) {
#ifdef DEBUG_ENABLED
	CRASH_COND(p + sizeof(T) > mem.size());
#endif
	const uint32_t start = p;
	p += sizeof(T);
	return *reinterpret_cast<const T *>(&mem[start]);
}
// Appends the raw bytes of `v` at the end of `mem`, growing it by `sizeof(T)`.
// `T` is expected to be a plain data type, since only its object representation is copied.
template <typename T>
inline void append(std::vector<uint8_t> &mem, const T &v) {
	// Take a copy first: `v` may refer to an element of `mem`, and growing the vector would invalidate it.
	const T value = v;
	// Copy byte by byte instead of storing a `T` through a casted pointer:
	// the end of a byte buffer is not guaranteed to be suitably aligned for `T`.
	const uint8_t *bytes = reinterpret_cast<const uint8_t *>(&value);
	mem.insert(mem.end(), bytes, bytes + sizeof(T));
}
// The Image lock() API prevents us from reading the same image in multiple threads.
// Compiling makes a read-only copy of all resources, so we can lock all images up-front if successful.
// This might no longer be needed in Godot 4.
@ -217,7 +200,7 @@ VoxelGraphRuntime::CompilationResult VoxelGraphRuntime::_compile(const ProgramGr
_program.y_input_address = mem.add_binding();
_program.z_input_address = mem.add_binding();
std::vector<uint8_t> &operations = _program.operations;
std::vector<uint16_t> &operations = _program.operations;
const VoxelGraphNodeDB &type_db = *VoxelGraphNodeDB::get_singleton();
// Run through each node in order, and turn them into program instructions
@ -284,7 +267,7 @@ VoxelGraphRuntime::CompilationResult VoxelGraphRuntime::_compile(const ProgramGr
}
_program.default_execution_map.operation_adresses.push_back(operations.size());
append(operations, static_cast<uint8_t>(node->type_id));
operations.push_back(node->type_id);
// Inputs and outputs use a convention so we can have generic code for them.
// Parameters are more specific, and may be affected by alignment so better just do them by hand
@ -314,7 +297,7 @@ VoxelGraphRuntime::CompilationResult VoxelGraphRuntime::_compile(const ProgramGr
++dg_node.end_dependency;
}
append(operations, a);
operations.push_back(a);
BufferSpec &bs = _program.buffer_specs[a];
++bs.users_count;
@ -328,12 +311,12 @@ VoxelGraphRuntime::CompilationResult VoxelGraphRuntime::_compile(const ProgramGr
const ProgramGraph::PortLocation op{ node_id, static_cast<uint32_t>(j) };
_program.output_port_addresses[op] = a;
append(operations, a);
operations.push_back(a);
}
// Add space for params size, default is no params so size is 0
size_t params_size_index = operations.size();
append<uint16_t>(operations, 0);
operations.push_back(0);
// Get params, copy resources when used, and hold a reference to them
std::vector<Variant> params_copy;
@ -364,7 +347,6 @@ VoxelGraphRuntime::CompilationResult VoxelGraphRuntime::_compile(const ProgramGr
}
if (type.compile_func != nullptr) {
const size_t size_before = operations.size();
CompileContext ctx(*node, operations, _program.heap_resources, params_copy);
type.compile_func(ctx);
if (ctx.has_error()) {
@ -374,9 +356,9 @@ VoxelGraphRuntime::CompilationResult VoxelGraphRuntime::_compile(const ProgramGr
result.node_id = node_id;
return result;
}
const size_t params_size = operations.size() - size_before;
const size_t params_size = ctx.get_params_size_in_words();
CRASH_COND(params_size > std::numeric_limits<uint16_t>::max());
*reinterpret_cast<uint16_t *>(&operations[params_size_index]) = params_size;
operations[params_size_index] = params_size;
}
if (type.category == VoxelGraphNodeDB::CATEGORY_OUTPUT) {
@ -414,7 +396,7 @@ VoxelGraphRuntime::CompilationResult VoxelGraphRuntime::_compile(const ProgramGr
PRINT_VERBOSE(String("Compiled voxel graph. Program size: {0}b, buffers: {1}")
.format(varray(
SIZE_T_TO_VARIANT(_program.operations.size() * sizeof(float)),
SIZE_T_TO_VARIANT(_program.operations.size() * sizeof(uint16_t)),
SIZE_T_TO_VARIANT(_program.buffer_count))));
_program.lock_images();
@ -425,15 +407,15 @@ VoxelGraphRuntime::CompilationResult VoxelGraphRuntime::_compile(const ProgramGr
}
// Returns the span of output addresses belonging to the operation that starts at `op_address`.
// The node type is looked up from the opid stored at `op_address`; outputs are located after
// the opid and all of the node's inputs.
// NOTE(review): this hunk shows two variants side by side: one indexing `operations` in bytes (uint8_t),
// the other indexing it in uint16 words.
static Span<const uint16_t> get_outputs_from_op_address(
Span<const uint8_t> operations, uint16_t op_address) {
const uint8_t opid = operations[op_address];
Span<const uint16_t> operations, uint16_t op_address) {
const uint16_t opid = operations[op_address];
const VoxelGraphNodeDB::NodeType &node_type = VoxelGraphNodeDB::get_singleton()->get_type(opid);
// Byte-indexed variant: sizes are expressed in bytes
const uint32_t inputs_size = node_type.inputs.size() * sizeof(uint16_t);
const uint32_t outputs_size = node_type.outputs.size() * sizeof(uint16_t);
// Word-indexed variant: one element per input/output
const uint32_t inputs_count = node_type.inputs.size();
const uint32_t outputs_count = node_type.outputs.size();
// The +1 is for `opid`
return operations.sub(op_address + 1 + inputs_size, outputs_size).reinterpret_cast_to<const uint16_t>();
return operations.sub(op_address + 1 + inputs_count, outputs_count);
}
bool VoxelGraphRuntime::is_operation_constant(const State &state, uint16_t op_address) const {
@ -548,7 +530,7 @@ void VoxelGraphRuntime::generate_optimized_execution_map(const State &state, Exe
}
}
Span<const uint8_t> operations(program.operations.data(), 0, program.operations.size());
Span<const uint16_t> operations(program.operations.data(), 0, program.operations.size());
bool xzy_start_not_assigned = true;
// Now we have to fill buffers with the local constants we may have found.
@ -730,6 +712,20 @@ void VoxelGraphRuntime::prepare_state(State &state, unsigned int buffer_size) co
}*/
}
// Decodes the parameters section of an operation.
// On entry `pc` must point at the word holding the parameters size (in words).
// On exit `pc` points just past the parameters, or just past the size word if there are none.
// Returns the parameters as a span of bytes, which is left default-constructed when the size is zero.
static inline Span<const uint8_t> read_params(Span<const uint16_t> operations, unsigned int &pc) {
	const uint16_t size_in_words = operations[pc++];

	Span<const uint8_t> params;
	if (size_in_words == 0) {
		return params;
	}

	// The word at `pc` is a header telling how far to advance to reach the aligned start of the params
	const size_t offset_in_words = operations[pc];
	pc += offset_in_words;

	params = operations.sub(pc, size_in_words).reinterpret_cast_to<const uint8_t>();
	pc += size_in_words;
	return params;
}
void VoxelGraphRuntime::generate_set(State &state,
Span<float> in_x, Span<float> in_y, Span<float> in_z, bool skip_xz,
const ExecutionMap *execution_map) const {
@ -789,7 +785,7 @@ void VoxelGraphRuntime::generate_set(State &state,
L::bind_buffer(buffers, _program.z_input_address, in_z);
}
const Span<const uint8_t> operations(_program.operations.data(), 0, _program.operations.size());
const Span<const uint16_t> operations(_program.operations.data(), 0, _program.operations.size());
Span<const uint16_t> op_adresses = execution_map != nullptr ?
to_span_const(execution_map->operation_adresses) :
@ -804,25 +800,18 @@ void VoxelGraphRuntime::generate_set(State &state,
for (unsigned int execution_map_index = 0; execution_map_index < op_adresses.size(); ++execution_map_index) {
unsigned int pc = op_adresses[execution_map_index];
const uint8_t opid = operations[pc++];
const uint16_t opid = operations[pc++];
const VoxelGraphNodeDB::NodeType &node_type = VoxelGraphNodeDB::get_singleton()->get_type(opid);
const uint32_t inputs_size = node_type.inputs.size() * sizeof(uint16_t);
const uint32_t outputs_size = node_type.outputs.size() * sizeof(uint16_t);
const uint32_t inputs_count = node_type.inputs.size();
const uint32_t outputs_count = node_type.outputs.size();
const Span<const uint16_t> inputs =
operations.sub(pc, inputs_size).reinterpret_cast_to<const uint16_t>();
pc += inputs_size;
const Span<const uint16_t> outputs =
operations.sub(pc, outputs_size).reinterpret_cast_to<const uint16_t>();
pc += outputs_size;
const Span<const uint16_t> inputs = operations.sub(pc, inputs_count);
pc += inputs_count;
const Span<const uint16_t> outputs = operations.sub(pc, outputs_count);
pc += outputs_count;
const uint16_t params_size = read<uint16_t>(operations, pc);
Span<const uint8_t> params;
if (params_size > 0) {
params = operations.sub(pc, params_size);
//pc += params_size;
}
Span<const uint8_t> params = read_params(operations, pc);
ERR_FAIL_COND(node_type.process_buffer_func == nullptr);
ProcessBufferContext ctx(inputs, outputs, params, buffers, execution_map != nullptr);
@ -863,31 +852,24 @@ void VoxelGraphRuntime::analyze_range(State &state, Vector3i min_pos, Vector3i m
ranges[_program.y_input_address] = Interval(min_pos.y, max_pos.y);
ranges[_program.z_input_address] = Interval(min_pos.z, max_pos.z);
const Span<const uint8_t> operations(_program.operations.data(), 0, _program.operations.size());
const Span<const uint16_t> operations(_program.operations.data(), 0, _program.operations.size());
// Here operations must all be analyzed, because we do this as a broad-phase.
// Only narrow-phase may skip some operations eventually.
uint32_t pc = 0;
while (pc < operations.size()) {
const uint8_t opid = operations[pc++];
const uint16_t opid = operations[pc++];
const VoxelGraphNodeDB::NodeType &node_type = VoxelGraphNodeDB::get_singleton()->get_type(opid);
const uint32_t inputs_size = node_type.inputs.size() * sizeof(uint16_t);
const uint32_t outputs_size = node_type.outputs.size() * sizeof(uint16_t);
const uint32_t inputs_count = node_type.inputs.size();
const uint32_t outputs_count = node_type.outputs.size();
const Span<const uint16_t> inputs =
operations.sub(pc, inputs_size).reinterpret_cast_to<const uint16_t>();
pc += inputs_size;
const Span<const uint16_t> outputs =
operations.sub(pc, outputs_size).reinterpret_cast_to<const uint16_t>();
pc += outputs_size;
const Span<const uint16_t> inputs = operations.sub(pc, inputs_count);
pc += inputs_count;
const Span<const uint16_t> outputs = operations.sub(pc, outputs_count);
pc += outputs_count;
const uint16_t params_size = read<uint16_t>(operations, pc);
Span<const uint8_t> params;
if (params_size > 0) {
params = operations.sub(pc, params_size);
pc += params_size;
}
Span<const uint8_t> params = read_params(operations, pc);
ERR_FAIL_COND(node_type.range_analysis_func == nullptr);
RangeAnalysisContext ctx(inputs, outputs, params, ranges, buffers);

View File

@ -67,7 +67,7 @@ public:
~State() {
clear();
}
inline const Buffer &get_buffer(uint16_t address) const {
// TODO Just for convenience because STL bound checks aren't working in Godot 3
CRASH_COND(address >= buffers.size());
@ -160,11 +160,10 @@ public:
// Functions usable by node implementations during the compilation stage
class CompileContext {
public:
CompileContext(const ProgramGraph::Node &node, std::vector<uint8_t> &program,
CompileContext(const ProgramGraph::Node &node, std::vector<uint16_t> &program,
std::vector<HeapResource> &heap_resources,
std::vector<Variant> &params) :
_node(node),
_offset(program.size()),
_program(program),
_heap_resources(heap_resources),
_params(params) {}
@ -178,10 +177,33 @@ public:
// Writes a copy of `params` into the program buffer and records its size in words.
// What gets appended, in uint16 words:
// - one header word: distance from the header to the first word of the struct
// - optional padding words so the struct starts at an aligned word index
// - the struct, rounded up to a whole number of words
// NOTE(review): alignment is computed on the byte offset within the vector; this presumably relies on the
// vector's storage itself being aligned at least as strictly as `T`. Confirm for over-aligned structs.
// NOTE(review): this hunk also shows the earlier byte-based lines (`_offset`, `resize(... + sizeof(T))`).
template <typename T>
void set_params(T params) {
// Can be called only once per node
CRASH_COND(_offset != _program.size());
_program.resize(_program.size() + sizeof(T));
T &p = *reinterpret_cast<T *>(&_program[_offset]);
CRASH_COND(_params_added);
// We will need to align memory, so the struct will not be immediately stored here.
// Instead we put a header that tells how much to advance in order to reach the beginning of the struct,
// which will be at an aligned position.
// We align to the maximum alignment between the struct,
// and the type of word we store inside the program buffer, which is uint16.
const size_t params_alignment = max(alignof(T), alignof(uint16_t));
const size_t params_offset_index = _program.size();
// Prepare space to store the offset (at least 1 since that header is one word)
_program.push_back(1);
// Align memory for the struct.
// Note, we index with words, not bytes.
// The size is converted to bytes for `alignup`, then the result is converted back to a word index.
const size_t struct_offset =
alignup(_program.size() * sizeof(uint16_t), params_alignment) / sizeof(uint16_t);
if (struct_offset > _program.size()) {
// Insert padding words up to the aligned position
_program.resize(struct_offset);
}
// Write offset in header
_program[params_offset_index] = struct_offset - params_offset_index;
// Allocate space for the struct. It is measured in words, so it can be up to 1 byte larger.
// (Rounds sizeof(T) up to the next multiple of the word size.)
_params_size_in_words = (sizeof(T) + sizeof(uint16_t) - 1) / sizeof(uint16_t);
_program.resize(_program.size() + _params_size_in_words);
// Write struct
T &p = *reinterpret_cast<T *>(&_program[struct_offset]);
p = params;
_params_added = true;
}
// In case the compilation step produces a resource to be deleted
@ -210,14 +232,19 @@ public:
return _error_message;
}
// Returns the size of the stored parameters struct, counted in uint16 words.
// Stays 0 (its initial value) as long as `set_params` has not been called.
size_t get_params_size_in_words() const {
return _params_size_in_words;
}
private:
const ProgramGraph::Node &_node;
const size_t _offset;
std::vector<uint8_t> &_program;
std::vector<uint16_t> &_program;
std::vector<HeapResource> &_heap_resources;
std::vector<Variant> &_params;
String _error_message;
size_t _params_size_in_words = 0;
bool _has_error = false;
bool _params_added = false;
};
class _ProcessContext {
@ -232,6 +259,9 @@ public:
// Gives access to the node's parameters, viewed as a `T`.
// No copy is made: the returned reference points into the params span.
// In debug builds, crashes if `T` is larger than the params available.
template <typename T>
inline const T &get_params() const {
#ifdef DEBUG_ENABLED
	CRASH_COND(sizeof(T) > _params.size());
#endif
	const T *typed_params = reinterpret_cast<const T *>(_params.data());
	return *typed_params;
}
@ -374,11 +404,21 @@ private:
// Precalculated program data.
// Remains constant and read-only after compilation.
struct Program {
// Serialized operations and arguments.
// They come up as series of <opid><inputs><outputs><parameters_size><parameters>.
// Serialized operations and arguments, aligned at minimum with uint16.
// They come up as series of:
//
// - uint16 opid
// - uint16 inputs[0..*]
// - uint16 outputs[0..*]
// - uint16 parameters_size
// - uint16 parameters_offset // how much to advance from here to reach the beginning of `parameters`
// - <optional padding>
// - T parameters, where T could be any struct
// - <optional padding to keep alignment with uint16>
//
// They should be laid out in the same order they will be run in, although it's not absolutely required.
// It's better to have it ordered because memory access will be more predictable.
std::vector<uint8_t> operations;
std::vector<uint16_t> operations;
// Describes dependencies between operations. It is generated at compile time.
// It is used to perform dynamic optimization in case some operations can be predicted as constant.

View File

@ -158,6 +158,19 @@ inline bool is_valid_size(const Vector3 &s) {
return s.x >= 0 && s.y >= 0 && s.z >= 0;
}
// Tells if `x` is a power of two. Zero is not considered one.
inline bool is_power_of_two(size_t x) {
	if (x == 0) {
		return false;
	}
	// Clearing the lowest set bit leaves nothing only when a single bit was set
	const size_t without_lowest_bit = x & (x - 1);
	return without_lowest_bit == 0;
}
// Rounds `a` up to the nearest multiple of `align`. If `a` is already a multiple, it is returned unchanged.
// `align` must be a power of two (checked in debug builds only).
inline size_t alignup(size_t a, size_t align) {
#ifdef DEBUG_ENABLED
	CRASH_COND(!is_power_of_two(align));
#endif
	// For a power of two, `align - 1` has all the bits below the alignment bit set
	const size_t low_bits_mask = align - 1;
	return (a + low_bits_mask) & ~low_bits_mask;
}
// inline bool is_power_of_two(int i) {
// return i & (i - 1);
// }