Align program memory of VoxelGraphRuntime

2021-10-28 19:32:38 +01:00 · 2021-10-28 19:32:38 +01:00 · 38ec2f43d1
commit 38ec2f43d1
parent 3133834a17
3 changed files with 110 additions and 75 deletions
--- a/generators/graph/voxel_graph_runtime.cpp
+++ b/generators/graph/voxel_graph_runtime.cpp
@ -17,23 +17,6 @@
 //#define VOXEL_DEBUG_GRAPH_PROG_SENTINEL uint16_t(12345) // 48, 57 (base 10)
 //#endif

-template <typename T>
-inline const T &read(const Span<const uint8_t> &mem, uint32_t &p) {
-#ifdef DEBUG_ENABLED
-	CRASH_COND(p + sizeof(T) > mem.size());
-#endif
-	const T *v = reinterpret_cast<const T *>(&mem[p]);
-	p += sizeof(T);
-	return *v;
-}
-
-template <typename T>
-inline void append(std::vector<uint8_t> &mem, const T &v) {
-	size_t p = mem.size();
-	mem.resize(p + sizeof(T));
-	*(T *)(&mem[p]) = v;
-}
-
 // The Image lock() API prevents us from reading the same image in multiple threads.
 // Compiling makes a read-only copy of all resources, so we can lock all images up-front if successful.
 // This might no longer needed in Godot 4.
@ -217,7 +200,7 @@ VoxelGraphRuntime::CompilationResult VoxelGraphRuntime::_compile(const ProgramGr
 	_program.y_input_address = mem.add_binding();
 	_program.z_input_address = mem.add_binding();

-	std::vector<uint8_t> &operations = _program.operations;
+	std::vector<uint16_t> &operations = _program.operations;
 	const VoxelGraphNodeDB &type_db = *VoxelGraphNodeDB::get_singleton();

 	// Run through each node in order, and turn them into program instructions
@ -284,7 +267,7 @@ VoxelGraphRuntime::CompilationResult VoxelGraphRuntime::_compile(const ProgramGr
 		}
 		_program.default_execution_map.operation_adresses.push_back(operations.size());

-		append(operations, static_cast<uint8_t>(node->type_id));
+		operations.push_back(node->type_id);

 		// Inputs and outputs use a convention so we can have generic code for them.
 		// Parameters are more specific, and may be affected by alignment so better just do them by hand
@ -314,7 +297,7 @@ VoxelGraphRuntime::CompilationResult VoxelGraphRuntime::_compile(const ProgramGr
 				++dg_node.end_dependency;
 			}

-			append(operations, a);
+			operations.push_back(a);

 			BufferSpec &bs = _program.buffer_specs[a];
 			++bs.users_count;
@ -328,12 +311,12 @@ VoxelGraphRuntime::CompilationResult VoxelGraphRuntime::_compile(const ProgramGr
 			const ProgramGraph::PortLocation op{ node_id, static_cast<uint32_t>(j) };
 			_program.output_port_addresses[op] = a;

-			append(operations, a);
+			operations.push_back(a);
 		}

 		// Add space for params size, default is no params so size is 0
 		size_t params_size_index = operations.size();
-		append<uint16_t>(operations, 0);
+		operations.push_back(0);

 		// Get params, copy resources when used, and hold a reference to them
 		std::vector<Variant> params_copy;
@ -364,7 +347,6 @@ VoxelGraphRuntime::CompilationResult VoxelGraphRuntime::_compile(const ProgramGr
 		}

 		if (type.compile_func != nullptr) {
-			const size_t size_before = operations.size();
 			CompileContext ctx(*node, operations, _program.heap_resources, params_copy);
 			type.compile_func(ctx);
 			if (ctx.has_error()) {
@ -374,9 +356,9 @@ VoxelGraphRuntime::CompilationResult VoxelGraphRuntime::_compile(const ProgramGr
 				result.node_id = node_id;
 				return result;
 			}
-			const size_t params_size = operations.size() - size_before;
+			const size_t params_size = ctx.get_params_size_in_words();
 			CRASH_COND(params_size > std::numeric_limits<uint16_t>::max());
-			*reinterpret_cast<uint16_t *>(&operations[params_size_index]) = params_size;
+			operations[params_size_index] = params_size;
 		}

 		if (type.category == VoxelGraphNodeDB::CATEGORY_OUTPUT) {
@ -414,7 +396,7 @@ VoxelGraphRuntime::CompilationResult VoxelGraphRuntime::_compile(const ProgramGr

 	PRINT_VERBOSE(String("Compiled voxel graph. Program size: {0}b, buffers: {1}")
 						  .format(varray(
-								  SIZE_T_TO_VARIANT(_program.operations.size() * sizeof(float)),
+								  SIZE_T_TO_VARIANT(_program.operations.size() * sizeof(uint16_t)),
 								  SIZE_T_TO_VARIANT(_program.buffer_count))));

 	_program.lock_images();
@ -425,15 +407,15 @@ VoxelGraphRuntime::CompilationResult VoxelGraphRuntime::_compile(const ProgramGr
 }

 static Span<const uint16_t> get_outputs_from_op_address(
-		Span<const uint8_t> operations, uint16_t op_address) {
-	const uint8_t opid = operations[op_address];
+		Span<const uint16_t> operations, uint16_t op_address) {
+	const uint16_t opid = operations[op_address];
 	const VoxelGraphNodeDB::NodeType &node_type = VoxelGraphNodeDB::get_singleton()->get_type(opid);

-	const uint32_t inputs_size = node_type.inputs.size() * sizeof(uint16_t);
-	const uint32_t outputs_size = node_type.outputs.size() * sizeof(uint16_t);
+	const uint32_t inputs_count = node_type.inputs.size();
+	const uint32_t outputs_count = node_type.outputs.size();

 	// The +1 is for `opid`
-	return operations.sub(op_address + 1 + inputs_size, outputs_size).reinterpret_cast_to<const uint16_t>();
+	return operations.sub(op_address + 1 + inputs_count, outputs_count);
 }

 bool VoxelGraphRuntime::is_operation_constant(const State &state, uint16_t op_address) const {
@ -548,7 +530,7 @@ void VoxelGraphRuntime::generate_optimized_execution_map(const State &state, Exe
 		}
 	}

-	Span<const uint8_t> operations(program.operations.data(), 0, program.operations.size());
+	Span<const uint16_t> operations(program.operations.data(), 0, program.operations.size());
 	bool xzy_start_not_assigned = true;

 	// Now we have to fill buffers with the local constants we may have found.
@ -730,6 +712,20 @@ void VoxelGraphRuntime::prepare_state(State &state, unsigned int buffer_size) co
 	}*/
 }

+static inline Span<const uint8_t> read_params(Span<const uint16_t> operations, unsigned int &pc) {
+	const uint16_t params_size_in_words = operations[pc];
+	++pc;
+	Span<const uint8_t> params;
+	if (params_size_in_words > 0) {
+		const size_t params_offset_in_words = operations[pc];
+		// Seek to aligned position where params start
+		pc += params_offset_in_words;
+		params = operations.sub(pc, params_size_in_words).reinterpret_cast_to<const uint8_t>();
+		pc += params_size_in_words;
+	}
+	return params;
+}
+
 void VoxelGraphRuntime::generate_set(State &state,
 		Span<float> in_x, Span<float> in_y, Span<float> in_z, bool skip_xz,
 		const ExecutionMap *execution_map) const {
@ -789,7 +785,7 @@ void VoxelGraphRuntime::generate_set(State &state,
 		L::bind_buffer(buffers, _program.z_input_address, in_z);
 	}

-	const Span<const uint8_t> operations(_program.operations.data(), 0, _program.operations.size());
+	const Span<const uint16_t> operations(_program.operations.data(), 0, _program.operations.size());

 	Span<const uint16_t> op_adresses = execution_map != nullptr ?
 											   to_span_const(execution_map->operation_adresses) :
@ -804,25 +800,18 @@ void VoxelGraphRuntime::generate_set(State &state,
 	for (unsigned int execution_map_index = 0; execution_map_index < op_adresses.size(); ++execution_map_index) {
 		unsigned int pc = op_adresses[execution_map_index];

-		const uint8_t opid = operations[pc++];
+		const uint16_t opid = operations[pc++];
 		const VoxelGraphNodeDB::NodeType &node_type = VoxelGraphNodeDB::get_singleton()->get_type(opid);

-		const uint32_t inputs_size = node_type.inputs.size() * sizeof(uint16_t);
-		const uint32_t outputs_size = node_type.outputs.size() * sizeof(uint16_t);
+		const uint32_t inputs_count = node_type.inputs.size();
+		const uint32_t outputs_count = node_type.outputs.size();

-		const Span<const uint16_t> inputs =
-				operations.sub(pc, inputs_size).reinterpret_cast_to<const uint16_t>();
-		pc += inputs_size;
-		const Span<const uint16_t> outputs =
-				operations.sub(pc, outputs_size).reinterpret_cast_to<const uint16_t>();
-		pc += outputs_size;
+		const Span<const uint16_t> inputs = operations.sub(pc, inputs_count);
+		pc += inputs_count;
+		const Span<const uint16_t> outputs = operations.sub(pc, outputs_count);
+		pc += outputs_count;

-		const uint16_t params_size = read<uint16_t>(operations, pc);
-		Span<const uint8_t> params;
-		if (params_size > 0) {
-			params = operations.sub(pc, params_size);
-			//pc += params_size;
-		}
+		Span<const uint8_t> params = read_params(operations, pc);

 		ERR_FAIL_COND(node_type.process_buffer_func == nullptr);
 		ProcessBufferContext ctx(inputs, outputs, params, buffers, execution_map != nullptr);
@ -863,31 +852,24 @@ void VoxelGraphRuntime::analyze_range(State &state, Vector3i min_pos, Vector3i m
 	ranges[_program.y_input_address] = Interval(min_pos.y, max_pos.y);
 	ranges[_program.z_input_address] = Interval(min_pos.z, max_pos.z);

-	const Span<const uint8_t> operations(_program.operations.data(), 0, _program.operations.size());
+	const Span<const uint16_t> operations(_program.operations.data(), 0, _program.operations.size());

 	// Here operations must all be analyzed, because we do this as a broad-phase.
 	// Only narrow-phase may skip some operations eventually.
 	uint32_t pc = 0;
 	while (pc < operations.size()) {
-		const uint8_t opid = operations[pc++];
+		const uint16_t opid = operations[pc++];
 		const VoxelGraphNodeDB::NodeType &node_type = VoxelGraphNodeDB::get_singleton()->get_type(opid);

-		const uint32_t inputs_size = node_type.inputs.size() * sizeof(uint16_t);
-		const uint32_t outputs_size = node_type.outputs.size() * sizeof(uint16_t);
+		const uint32_t inputs_count = node_type.inputs.size();
+		const uint32_t outputs_count = node_type.outputs.size();

-		const Span<const uint16_t> inputs =
-				operations.sub(pc, inputs_size).reinterpret_cast_to<const uint16_t>();
-		pc += inputs_size;
-		const Span<const uint16_t> outputs =
-				operations.sub(pc, outputs_size).reinterpret_cast_to<const uint16_t>();
-		pc += outputs_size;
+		const Span<const uint16_t> inputs = operations.sub(pc, inputs_count);
+		pc += inputs_count;
+		const Span<const uint16_t> outputs = operations.sub(pc, outputs_count);
+		pc += outputs_count;

-		const uint16_t params_size = read<uint16_t>(operations, pc);
-		Span<const uint8_t> params;
-		if (params_size > 0) {
-			params = operations.sub(pc, params_size);
-			pc += params_size;
-		}
+		Span<const uint8_t> params = read_params(operations, pc);

 		ERR_FAIL_COND(node_type.range_analysis_func == nullptr);
 		RangeAnalysisContext ctx(inputs, outputs, params, ranges, buffers);
--- a/generators/graph/voxel_graph_runtime.h
+++ b/generators/graph/voxel_graph_runtime.h
@ -67,7 +67,7 @@ public:
 		~State() {
 			clear();
 		}
-		
+
 		inline const Buffer &get_buffer(uint16_t address) const {
 			// TODO Just for convenience because STL bound checks aren't working in Godot 3
 			CRASH_COND(address >= buffers.size());
@ -160,11 +160,10 @@ public:
 	// Functions usable by node implementations during the compilation stage
 	class CompileContext {
 	public:
-		CompileContext(const ProgramGraph::Node &node, std::vector<uint8_t> &program,
+		CompileContext(const ProgramGraph::Node &node, std::vector<uint16_t> &program,
 				std::vector<HeapResource> &heap_resources,
 				std::vector<Variant> &params) :
 				_node(node),
-				_offset(program.size()),
 				_program(program),
 				_heap_resources(heap_resources),
 				_params(params) {}
@ -178,10 +177,33 @@ public:
 		template <typename T>
 		void set_params(T params) {
 			// Can be called only once per node
-			CRASH_COND(_offset != _program.size());
-			_program.resize(_program.size() + sizeof(T));
-			T &p = *reinterpret_cast<T *>(&_program[_offset]);
+			CRASH_COND(_params_added);
+			// We will need to align memory, so the struct will not be immediately stored here.
+			// Instead we put a header that tells how much to advance in order to reach the beginning of the struct,
+			// which will be at an aligned position.
+			// We align to whichever is larger: the alignment of the struct,
+			// or the alignment of the word type stored inside the program buffer, which is uint16.
+			const size_t params_alignment = max(alignof(T), alignof(uint16_t));
+			const size_t params_offset_index = _program.size();
+			// Prepare space to store the offset (at least 1 since that header is one word)
+			_program.push_back(1);
+			// Align memory for the struct.
+			// Note, we index with words, not bytes.
+			const size_t struct_offset =
+					alignup(_program.size() * sizeof(uint16_t), params_alignment) / sizeof(uint16_t);
+			if (struct_offset > _program.size()) {
+				_program.resize(struct_offset);
+			}
+			// Write offset in header
+			_program[params_offset_index] = struct_offset - params_offset_index;
+			// Allocate space for the struct. The size is rounded up to whole words, so it can be up to 1 byte larger than sizeof(T).
+			_params_size_in_words = (sizeof(T) + sizeof(uint16_t) - 1) / sizeof(uint16_t);
+			_program.resize(_program.size() + _params_size_in_words);
+			// Write struct
+			T &p = *reinterpret_cast<T *>(&_program[struct_offset]);
 			p = params;
+
+			_params_added = true;
 		}

 		// In case the compilation step produces a resource to be deleted
@ -210,14 +232,19 @@ public:
 			return _error_message;
 		}

+		size_t get_params_size_in_words() const {
+			return _params_size_in_words;
+		}
+
 	private:
 		const ProgramGraph::Node &_node;
-		const size_t _offset;
-		std::vector<uint8_t> &_program;
+		std::vector<uint16_t> &_program;
 		std::vector<HeapResource> &_heap_resources;
 		std::vector<Variant> &_params;
 		String _error_message;
+		size_t _params_size_in_words = 0;
 		bool _has_error = false;
+		bool _params_added = false;
 	};

 	class _ProcessContext {
@ -232,6 +259,9 @@ public:

 		template <typename T>
 		inline const T &get_params() const {
+#ifdef DEBUG_ENABLED
+			CRASH_COND(sizeof(T) > _params.size());
+#endif
 			return *reinterpret_cast<const T *>(_params.data());
 		}

@ -374,11 +404,21 @@ private:
 	// Precalculated program data.
 	// Remains constant and read-only after compilation.
 	struct Program {
-		// Serialized operations and arguments.
-		// They come up as series of <opid><inputs><outputs><parameters_size><parameters>.
+		// Serialized operations and arguments, aligned at minimum with uint16.
+		// They come up as series of:
+		//
+		// - uint16 opid
+		// - uint16 inputs[0..*]
+		// - uint16 outputs[0..*]
+		// - uint16 parameters_size // in uint16 words; when 0, none of the fields below are present
+		// - uint16 parameters_offset // how many words to advance from here to reach the beginning of `parameters`
+		// - <optional padding>
+		// - T parameters, where T could be any struct
+		// - <optional padding to keep alignment with uint16>
+		//
 		// They should be laid out in the same order they will be run in, although it's not absolutely required.
 		// It's better to have it ordered because memory access will be more predictable.
-		std::vector<uint8_t> operations;
+		std::vector<uint16_t> operations;

 		// Describes dependencies between operations. It is generated at compile time.
 		// It is used to perform dynamic optimization in case some operations can be predicted as constant.
--- a/util/math/funcs.h
+++ b/util/math/funcs.h
@ -158,6 +158,19 @@ inline bool is_valid_size(const Vector3 &s) {
 	return s.x >= 0 && s.y >= 0 && s.z >= 0;
 }

+inline bool is_power_of_two(size_t x) {
+	return x != 0 && (x & (x - 1)) == 0;
+}
+
+// Rounds `a` up to the next multiple of `align` bytes. If `a` is already aligned, it is returned unchanged.
+// `align` must be a power of two.
+inline size_t alignup(size_t a, size_t align) {
+#ifdef DEBUG_ENABLED
+	CRASH_COND(!is_power_of_two(align));
+#endif
+	return (a + align - 1) & ~(align - 1);
+}
+
 // inline bool is_power_of_two(int i) {
 // 	return i & (i - 1);
 // }