Optimize SDF slice copy

2021-12-31 21:14:38 +00:00 · 2021-12-31 21:14:38 +00:00 · c47050e29e
commit c47050e29e
parent ba100fdd97
1 changed files with 59 additions and 12 deletions
--- a/generators/graph/voxel_generator_graph.cpp
+++ b/generators/graph/voxel_generator_graph.cpp
@ -396,6 +396,60 @@ void VoxelGeneratorGraph::gather_indices_and_weights(Span<const WeightOutput> we
 	}
 }

+template <typename F, typename Data_T>
+void fill_zx_slice(Span<Data_T> channel_data, float sdf_scale, Vector3i rmin, Vector3i rmax, int ry, int x_stride,
+		const float *src_data, Vector3i buffer_size, F convert_func) {
+	unsigned int src_i = 0;
+	for (int rz = rmin.z; rz < rmax.z; ++rz) {
+		unsigned int dst_i = Vector3iUtil::get_zxy_index(Vector3i(rmin.x, ry, rz), buffer_size);
+		for (int rx = rmin.x; rx < rmax.x; ++rx) {
+			channel_data[dst_i] = convert_func(sdf_scale * src_data[src_i]);
+			++src_i;
+			dst_i += x_stride;
+		}
+	}
+}
+
+static void fill_zx_slice(const VoxelGraphRuntime::Buffer &sdf_buffer, VoxelBufferInternal &out_buffer,
+		unsigned int channel, VoxelBufferInternal::Depth channel_depth, float sdf_scale, Vector3i rmin, Vector3i rmax,
+		int ry) {
+	VOXEL_PROFILE_SCOPE_NAMED("Copy SDF to block");
+
+	if (out_buffer.get_channel_compression(channel) != VoxelBufferInternal::COMPRESSION_NONE) {
+		out_buffer.decompress_channel(channel);
+	}
+	Span<uint8_t> channel_bytes;
+	ERR_FAIL_COND(!out_buffer.get_channel_raw(channel, channel_bytes));
+	const Vector3i buffer_size = out_buffer.get_size();
+	const unsigned int x_stride = Vector3iUtil::get_zxy_index(Vector3i(1, 0, 0), buffer_size) -
+			Vector3iUtil::get_zxy_index(Vector3i(0, 0, 0), buffer_size);
+
+	switch (channel_depth) {
+		case VoxelBufferInternal::DEPTH_8_BIT:
+			fill_zx_slice(channel_bytes, sdf_scale, rmin, rmax, ry, x_stride, sdf_buffer.data, buffer_size, norm_to_u8);
+			break;
+
+		case VoxelBufferInternal::DEPTH_16_BIT:
+			fill_zx_slice(channel_bytes.reinterpret_cast_to<uint16_t>(), sdf_scale, rmin, rmax, ry, x_stride,
+					sdf_buffer.data, buffer_size, norm_to_u16);
+			break;
+
+		case VoxelBufferInternal::DEPTH_32_BIT:
+			fill_zx_slice(channel_bytes.reinterpret_cast_to<float>(), sdf_scale, rmin, rmax, ry, x_stride,
+					sdf_buffer.data, buffer_size, [](float v) { return v; });
+			break;
+
+		case VoxelBufferInternal::DEPTH_64_BIT:
+			fill_zx_slice(channel_bytes.reinterpret_cast_to<double>(), sdf_scale, rmin, rmax, ry, x_stride,
+					sdf_buffer.data, buffer_size, [](double v) { return v; });
+			break;
+
+		default:
+			ERR_PRINT(String("Unknown depth {0}").format(varray(channel_depth)));
+			break;
+	}
+}
+
 VoxelGenerator::Result VoxelGeneratorGraph::generate_block(VoxelBlockRequest &input) {
 	std::shared_ptr<Runtime> runtime_ptr;
 	{
@ -418,8 +472,8 @@ VoxelGenerator::Result VoxelGeneratorGraph::generate_block(VoxelBlockRequest &in
 	// TODO This may be shared across the module
 	// Storing voxels is lossy on some depth configurations. They use normalized SDF,
 	// so we must scale the values to make better use of the offered resolution
-	const float sdf_scale = VoxelBufferInternal::get_sdf_quantization_scale(
-			out_buffer.get_channel_depth(out_buffer.get_channel_depth(channel)));
+	const VoxelBufferInternal::Depth channel_depth = out_buffer.get_channel_depth(channel);
+	const float sdf_scale = VoxelBufferInternal::get_sdf_quantization_scale(channel_depth);

 	const int stride = 1 << input.lod;

@ -522,17 +576,9 @@ VoxelGenerator::Result VoxelGeneratorGraph::generate_block(VoxelBlockRequest &in
 								_use_optimized_execution_map ? &cache.optimized_execution_map : nullptr);

 						{
-							VOXEL_PROFILE_SCOPE_NAMED("Copy SDF to block");
-							unsigned int i = 0;
 							const VoxelGraphRuntime::Buffer &sdf_buffer =
 									cache.state.get_buffer(sdf_output_buffer_index);
-							for (int rz = rmin.z; rz < rmax.z; ++rz) {
-								for (int rx = rmin.x; rx < rmax.x; ++rx) {
-									// TODO Flatten this further, this may run checks we don't need
-									out_buffer.set_voxel_f(sdf_scale * sdf_buffer.data[i], rx, ry, rz, channel);
-									++i;
-								}
-							}
+							fill_zx_slice(sdf_buffer, out_buffer, channel, channel_depth, sdf_scale, rmin, rmax, ry);
 						}

 						if (runtime_ptr->weight_outputs_count > 0) {
@ -589,7 +635,8 @@ VoxelGenerator::Result VoxelGeneratorGraph::generate_block(VoxelBlockRequest &in
 		// they will find it's all default weights.
 		// Possible workarounds:
 		// - Only do it for air
-		// - Also take indices and weights into account, but may lead to way less compression, or none, for stuff that
+		// - Also take indices and weights into account, but may lead to way less compression, or none, for stuff
+		// that
 		//   essentially isnt showing up until dug out
 		// - Invoke generator to produce LOD0 blocks somehow, but main thread could stall
 		result.max_lod_hint = true;