Added mesh optimization option to Transvoxel, using MeshOptimizer

This commit is contained in:
Marc Gilleron 2021-07-11 16:19:49 +01:00
parent 8c17a1cf20
commit 5fcd7f7363
28 changed files with 9401 additions and 7 deletions

6
SCsub
View File

@ -47,7 +47,8 @@ voxel_files = [
"edition/*.cpp",
"thirdparty/lz4/*.c",
"thirdparty/sqlite/*.c"
"thirdparty/sqlite/*.c",
"thirdparty/meshoptimizer/*.cpp"
]
if env["tools"]:
@ -61,6 +62,9 @@ if env["tools"]:
]
voxel_files += voxel_editor_files
# See https://github.com/zeux/meshoptimizer/issues/311
env_voxel.Append(CPPDEFINES=["MESHOPTIMIZER_ZYLANN_NEVER_COLLAPSE_BORDERS"])
if RUN_TESTS:
voxel_files += [
"tests/*.cpp"

View File

@ -208,6 +208,29 @@ void VoxelTool::do_sphere(Vector3 center, float radius) {
_post_edit(box);
}
// Erases matter in every voxel where the provided buffer has matter.
void VoxelTool::sdf_stamp_erase(Ref<VoxelBuffer> stamp, Vector3i pos) {
	VOXEL_PROFILE_SCOPE();
	ERR_FAIL_COND_MSG(get_channel() != VoxelBuffer::CHANNEL_SDF, "This function only works when channel is set to SDF");

	// The affected region is the stamp's size placed at `pos` in volume space.
	const Box3i affected_box(pos, stamp->get_size());
	if (!is_area_editable(affected_box)) {
		PRINT_VERBOSE("Area not editable");
		return;
	}

	affected_box.for_each_cell_zxy([this, stamp, pos](Vector3i volume_pos) {
		// Corresponding cell inside the stamp buffer
		const Vector3i stamp_pos = volume_pos - pos;
		const float stamp_sdf =
				stamp->get_voxel_f(stamp_pos.x, stamp_pos.y, stamp_pos.z, VoxelBuffer::CHANNEL_SDF);
		// Zero or negative SDF in the stamp means matter: carve it out of the volume
		// by writing a fully-empty SDF value.
		if (stamp_sdf <= 0.f) {
			_set_voxel_f(volume_pos, 1.f);
		}
	});

	_post_edit(affected_box);
}
void VoxelTool::do_box(Vector3i begin, Vector3i end) {
VOXEL_PROFILE_SCOPE();
Vector3i::sort_min_max(begin, end);

View File

@ -63,6 +63,8 @@ public:
virtual void do_sphere(Vector3 center, float radius);
virtual void do_box(Vector3i begin, Vector3i end);
void sdf_stamp_erase(Ref<VoxelBuffer> stamp, Vector3i pos);
virtual void copy(Vector3i pos, Ref<VoxelBuffer> dst, uint8_t channels_mask);
virtual void paste(Vector3i pos, Ref<VoxelBuffer> p_voxels, uint8_t channels_mask, uint64_t mask_value);

View File

@ -1,9 +1,16 @@
#include "voxel_tool_lod_terrain.h"
#include "../constants/voxel_string_names.h"
#include "../terrain/voxel_lod_terrain.h"
#include "../util/funcs.h"
#include "../util/island_finder.h"
#include "../util/voxel_raycast.h"
#include "funcs.h"
#include <scene/3d/collision_shape.h>
#include <scene/3d/mesh_instance.h>
#include <scene/3d/physics_body.h>
#include <scene/main/timer.h>
VoxelToolLodTerrain::VoxelToolLodTerrain(VoxelLodTerrain *terrain, VoxelDataMap &map) :
_terrain(terrain), _map(&map) {
ERR_FAIL_COND(terrain == nullptr);
@ -226,10 +233,340 @@ void VoxelToolLodTerrain::set_raycast_binary_search_iterations(int iterations) {
_raycast_binary_search_iterations = clamp(iterations, 0, 16);
}
// Turns floating chunks of voxels into rigidbodies:
// Detects separate groups of connected voxels within a box. Each group fully contained in the box is removed from
// the source volume, and turned into a rigidbody.
// This is one way of doing it, I don't know if it's the best way (there is rarely a best way)
// so there are probably other approaches that could be explored in the future, if they have better performance
static Array separate_floating_chunks(VoxelTool &voxel_tool, Box3i world_box, Node *parent_node, Transform transform,
		Ref<VoxelMesher> mesher, Array materials) {
	VOXEL_PROFILE_SCOPE();

	// Checks
	ERR_FAIL_COND_V(mesher.is_null(), Array());
	ERR_FAIL_COND_V(parent_node == nullptr, Array());

	// Copy source data
	// TODO Do not assume channel, at the moment it's hardcoded for smooth terrain
	static const int channels_mask = (1 << VoxelBuffer::CHANNEL_SDF);
	static const int main_channel = VoxelBuffer::CHANNEL_SDF;

	Ref<VoxelBuffer> source_copy_buffer;
	{
		VOXEL_PROFILE_SCOPE_NAMED("Copy");
		source_copy_buffer.instance();
		source_copy_buffer->create(world_box.size);
		voxel_tool.copy(world_box.pos, source_copy_buffer, channels_mask);
	}

	// Label distinct voxel groups (connected-component labeling over the copied box).
	// `ccl_output` holds one label per cell in ZXY order; label 0 means "no matter".
	static thread_local std::vector<uint8_t> ccl_output;
	ccl_output.resize(world_box.size.volume());

	unsigned int label_count = 0;

	{
		VOXEL_PROFILE_SCOPE_NAMED("CCL scan");
		IslandFinder island_finder;
		island_finder.scan_3d(
				Box3i(Vector3i(), world_box.size), [&source_copy_buffer](Vector3i pos) {
					// TODO Can be optimized further with direct access
					// A cell contains matter when its SDF is negative
					return source_copy_buffer->get_voxel_f(pos.x, pos.y, pos.z, main_channel) < 0.f;
				},
				to_span(ccl_output), &label_count);
	}

	// Axis-aligned bounding box of one labeled group, in box-local coordinates
	struct Bounds {
		Vector3i min_pos;
		Vector3i max_pos; // inclusive
		bool valid = false;
	};

	// Compute bounds of each group
	std::vector<Bounds> bounds_per_label;
	{
		VOXEL_PROFILE_SCOPE_NAMED("Bounds calculation");

		// Adding 1 because label 0 is the index for "no label"
		bounds_per_label.resize(label_count + 1);

		// Iteration order must match the ZXY layout of `ccl_output`
		unsigned int ccl_index = 0;
		for (int z = 0; z < world_box.size.z; ++z) {
			for (int x = 0; x < world_box.size.x; ++x) {
				for (int y = 0; y < world_box.size.y; ++y) {
					CRASH_COND(ccl_index >= ccl_output.size());
					const uint8_t label = ccl_output[ccl_index];
					++ccl_index;
					if (label == 0) {
						continue;
					}
					CRASH_COND(label >= bounds_per_label.size());
					Bounds &bounds = bounds_per_label[label];
					if (bounds.valid == false) {
						// First cell seen for this label: initialize the bounds to it
						bounds.min_pos = Vector3i(x, y, z);
						bounds.max_pos = bounds.min_pos;
						bounds.valid = true;
					} else {
						// Expand bounds to include this cell
						if (x < bounds.min_pos.x) {
							bounds.min_pos.x = x;
						} else if (x > bounds.max_pos.x) {
							bounds.max_pos.x = x;
						}
						if (y < bounds.min_pos.y) {
							bounds.min_pos.y = y;
						} else if (y > bounds.max_pos.y) {
							bounds.max_pos.y = y;
						}
						if (z < bounds.min_pos.z) {
							bounds.min_pos.z = z;
						} else if (z > bounds.max_pos.z) {
							bounds.max_pos.z = z;
						}
					}
				}
			}
		}
	}

	// Eliminate groups that touch the box border,
	// because that means we can't tell if they are truly hanging in the air or attached to land further away
	const Vector3i lbmax = world_box.size - Vector3i(1);
	for (unsigned int label = 1; label < bounds_per_label.size(); ++label) {
		CRASH_COND(label >= bounds_per_label.size());
		Bounds &local_bounds = bounds_per_label[label];
		ERR_CONTINUE(!local_bounds.valid);

		if (
				local_bounds.min_pos.x == 0 ||
				local_bounds.min_pos.y == 0 ||
				local_bounds.min_pos.z == 0 ||
				local_bounds.max_pos.x == lbmax.x ||
				local_bounds.max_pos.y == lbmax.y ||
				local_bounds.max_pos.z == lbmax.z) {
			//
			local_bounds.valid = false;
		}
	}

	// Create voxel buffer for each group

	struct InstanceInfo {
		Ref<VoxelBuffer> voxels;
		Vector3i world_pos;
		unsigned int label;
	};
	std::vector<InstanceInfo> instances_info;

	// Padding around each extracted group so the mesher has enough neighborhood data
	const int min_padding = 2; //mesher->get_minimum_padding();
	const int max_padding = 2; //mesher->get_maximum_padding();

	{
		VOXEL_PROFILE_SCOPE_NAMED("Extraction");

		for (unsigned int label = 1; label < bounds_per_label.size(); ++label) {
			CRASH_COND(label >= bounds_per_label.size());
			const Bounds local_bounds = bounds_per_label[label];

			if (!local_bounds.valid) {
				continue;
			}

			const Vector3i world_pos = world_box.pos + local_bounds.min_pos - Vector3i(min_padding);
			const Vector3i size = local_bounds.max_pos - local_bounds.min_pos + Vector3i(1 + max_padding + min_padding);

			Ref<VoxelBuffer> buffer;
			buffer.instance();
			buffer->create(size);

			// Read voxels from the source volume
			voxel_tool.copy(world_pos, buffer, channels_mask);

			// Cleanup padding borders: fill everything outside the inner box with "empty" SDF
			const Box3i inner_box(Vector3i(min_padding), buffer->get_size() - Vector3i(min_padding + max_padding));
			Box3i(Vector3i(), buffer->get_size())
					.difference(inner_box, [&buffer](Box3i box) {
						buffer->fill_area_f(1.f, box.pos, box.pos + box.size, main_channel);
					});

			// Filter out voxels that don't belong to this label
			// (two distinct groups can have overlapping bounding boxes)
			for (int z = local_bounds.min_pos.z; z <= local_bounds.max_pos.z; ++z) {
				for (int x = local_bounds.min_pos.x; x <= local_bounds.max_pos.x; ++x) {
					for (int y = local_bounds.min_pos.y; y <= local_bounds.max_pos.y; ++y) {
						const unsigned int ccl_index = Vector3i(x, y, z).get_zxy_index(world_box.size);
						CRASH_COND(ccl_index >= ccl_output.size());
						const uint8_t label2 = ccl_output[ccl_index];

						if (label2 != 0 && label != label2) {
							buffer->set_voxel_f(1.f,
									min_padding + x - local_bounds.min_pos.x,
									min_padding + y - local_bounds.min_pos.y,
									min_padding + z - local_bounds.min_pos.z, main_channel);
						}
					}
				}
			}

			instances_info.push_back(InstanceInfo{ buffer, world_pos, label });
		}
	}

	// Erase voxels from source volume.
	// Must be done after we copied voxels from it.
	{
		VOXEL_PROFILE_SCOPE_NAMED("Erasing");
		voxel_tool.set_channel(main_channel);
		for (unsigned int instance_index = 0; instance_index < instances_info.size(); ++instance_index) {
			CRASH_COND(instance_index >= instances_info.size());
			const InstanceInfo info = instances_info[instance_index];
			ERR_CONTINUE(info.voxels.is_null());
			voxel_tool.sdf_stamp_erase(info.voxels, info.world_pos);
		}
	}

	// Create instances

	Array nodes;

	{
		VOXEL_PROFILE_SCOPE_NAMED("Remeshing and instancing");

		for (unsigned int instance_index = 0; instance_index < instances_info.size(); ++instance_index) {
			CRASH_COND(instance_index >= instances_info.size());
			const InstanceInfo info = instances_info[instance_index];

			ERR_CONTINUE(info.voxels.is_null());

			CRASH_COND(info.label >= bounds_per_label.size());
			const Bounds local_bounds = bounds_per_label[info.label];
			ERR_CONTINUE(!local_bounds.valid);

			// DEBUG
			// print_line(String("--- Instance {0}").format(varray(instance_index)));
			// for (int z = 0; z < info.voxels->get_size().z; ++z) {
			// 	for (int x = 0; x < info.voxels->get_size().x; ++x) {
			// 		String s;
			// 		for (int y = 0; y < info.voxels->get_size().y; ++y) {
			// 			float sdf = info.voxels->get_voxel_f(x, y, z, VoxelBuffer::CHANNEL_SDF);
			// 			if (sdf < -0.1f) {
			// 				s += "X ";
			// 			} else if (sdf < 0.f) {
			// 				s += "x ";
			// 			} else {
			// 				s += "- ";
			// 			}
			// 		}
			// 		print_line(s);
			// 	}
			// 	print_line("//");
			// }

			const Transform local_transform(Basis(), info.world_pos.to_vec3());

			for (int i = 0; i < materials.size(); ++i) {
				Ref<ShaderMaterial> sm = materials[i];
				if (sm.is_valid() &&
						sm->get_shader().is_valid() &&
						sm->get_shader()->has_param(VoxelStringNames::get_singleton()->u_block_local_transform)) {
					// That parameter should have a valid default value matching the local transform relative to the volume,
					// which is usually per-instance, but in Godot 3 we have no such feature, so we have to duplicate.
					sm = sm->duplicate(false);
					sm->set_shader_param(VoxelStringNames::get_singleton()->u_block_local_transform, local_transform);
					materials[i] = sm;
				}
			}

			Ref<Mesh> mesh = mesher->build_mesh(info.voxels, materials);
			// The mesh is not supposed to be null,
			// because we build these buffers from connected groups that had negative SDF.
			ERR_CONTINUE(mesh.is_null());

			// DEBUG
			// {
			// 	Ref<VoxelBlockSerializer> serializer;
			// 	serializer.instance();
			// 	Ref<StreamPeerBuffer> peer;
			// 	peer.instance();
			// 	serializer->serialize(peer, info.voxels, false);
			// 	String fpath = String("debug_data/split_dump_{0}.bin").format(varray(instance_index));
			// 	FileAccess *f = FileAccess::open(fpath, FileAccess::WRITE);
			// 	PoolByteArray bytes = peer->get_data_array();
			// 	PoolByteArray::Read bytes_read = bytes.read();
			// 	f->store_buffer(bytes_read.ptr(), bytes.size());
			// 	f->close();
			// 	memdelete(f);
			// }

			// TODO Option to make multiple convex shapes
			// TODO Use the fast way. This is slow because of the internal TriangleMesh thing.
			Ref<Shape> shape = mesh->create_convex_shape();
			ERR_CONTINUE(shape.is_null());
			CollisionShape *collision_shape = memnew(CollisionShape);
			collision_shape->set_shape(shape);
			// Center the shape somewhat, because Godot is confusing node origin with center of mass
			const Vector3i size = local_bounds.max_pos - local_bounds.min_pos + Vector3i(1 + max_padding + min_padding);
			const Vector3 offset = -size.to_vec3() * 0.5f;
			collision_shape->set_translation(offset);

			RigidBody *rigid_body = memnew(RigidBody);
			rigid_body->set_transform(transform * local_transform.translated(-offset));
			rigid_body->add_child(collision_shape);
			rigid_body->set_mode(RigidBody::MODE_KINEMATIC);

			// Switch to rigid after a short time to workaround clipping with terrain,
			// because colliders are updated asynchronously
			Timer *timer = memnew(Timer);
			timer->set_wait_time(0.2);
			timer->set_one_shot(true);
			timer->connect("timeout", rigid_body, "set_mode", varray(RigidBody::MODE_RIGID));
			// Cannot use start() here because it requires to be inside the SceneTree,
			// and we don't know if it will be after we add to the parent.
			timer->set_autostart(true);
			rigid_body->add_child(timer);

			MeshInstance *mesh_instance = memnew(MeshInstance);
			mesh_instance->set_mesh(mesh);
			mesh_instance->set_translation(offset);
			rigid_body->add_child(mesh_instance);

			parent_node->add_child(rigid_body);

			nodes.append(rigid_body);
		}
	}

	return nodes;
}
// Script-facing wrapper: gathers terrain context (mesher, material, global transform)
// and forwards to the static implementation working on an integer box.
Array VoxelToolLodTerrain::separate_floating_chunks(AABB world_box, Node *parent_node) {
	ERR_FAIL_COND_V(_terrain == nullptr, Array());

	// Convert the floating-point AABB to an integer voxel box.
	// NOTE(review): flooring the position while ceiling the size independently could
	// under-cover an AABB whose position has a fractional part — confirm intended.
	const Box3i voxel_box(world_box.position.floor(), world_box.size.ceil());

	Array materials;
	materials.append(_terrain->get_material());
	Ref<VoxelMesher> mesher = _terrain->get_mesher();
	const Transform terrain_transform = _terrain->get_global_transform();

	return ::separate_floating_chunks(*this, voxel_box, parent_node, terrain_transform, mesher, materials);
}
// Registers script-visible methods of VoxelToolLodTerrain with ClassDB.
void VoxelToolLodTerrain::_bind_methods() {
	ClassDB::bind_method(D_METHOD("set_raycast_binary_search_iterations", "iterations"),
			&VoxelToolLodTerrain::set_raycast_binary_search_iterations);
	ClassDB::bind_method(D_METHOD("get_raycast_binary_search_iterations"),
			&VoxelToolLodTerrain::get_raycast_binary_search_iterations);
	// Fixed: this method was previously bound twice, once without its argument name.
	// Keep the single binding that names the "position" argument.
	ClassDB::bind_method(D_METHOD("get_voxel_f_interpolated", "position"),
			&VoxelToolLodTerrain::get_voxel_f_interpolated);
	ClassDB::bind_method(D_METHOD("separate_floating_chunks", "box", "parent_node"),
			&VoxelToolLodTerrain::separate_floating_chunks);
}

View File

@ -25,6 +25,7 @@ public:
// Specialized API
float get_voxel_f_interpolated(Vector3 position) const;
Array separate_floating_chunks(AABB world_box, Node *parent_node);
protected:
uint64_t _get_voxel(Vector3i pos) const override;

View File

@ -369,6 +369,21 @@ void build_regular_mesh(
FixedArray<float, 8> cell_samples_sdf;
for (unsigned int i = 0; i < corner_data_indices.size(); ++i) {
cell_samples_sdf[i] = sdf_as_float(sdf_data[corner_data_indices[i]]);
// TODO Need to investigate if there is a better way to eliminate degenerate triangles.
//
// Presence of zeroes in samples occurs more often when precision is scarce
// (8-bit, scaled SDF, or slow gradients).
// This causes two symptoms:
// - Degenerate triangles. Potentially bad for systems using the mesh later (MeshOptimizer, physics)
// - Glitched triangles. Wrong vertices get re-used.
// Needs closer investigation to know why, maybe related to case selection
//
// See also https://github.com/zeux/meshoptimizer/issues/312
//
// This is a quick fix to avoid it.
if (cell_samples_sdf[i] == 0.f) {
cell_samples_sdf[i] = 0.0001f;
}
}
// Concatenate the sign of cell values to obtain the case code.
@ -636,10 +651,13 @@ void build_regular_mesh(
} // for each cell vertex
for (int t = 0; t < triangle_count; ++t) {
for (int i = 0; i < 3; ++i) {
const int index = cell_vertex_indices[regular_cell_data.get_vertex_index(t * 3 + i)];
output.indices.push_back(index);
}
const int t0 = t * 3;
const int i0 = cell_vertex_indices[regular_cell_data.get_vertex_index(t0)];
const int i1 = cell_vertex_indices[regular_cell_data.get_vertex_index(t0 + 1)];
const int i2 = cell_vertex_indices[regular_cell_data.get_vertex_index(t0 + 2)];
output.indices.push_back(i0);
output.indices.push_back(i1);
output.indices.push_back(i2);
}
} // x

View File

@ -1,5 +1,6 @@
#include "voxel_mesher_transvoxel.h"
#include "../../storage/voxel_buffer.h"
#include "../../thirdparty/meshoptimizer/meshoptimizer.h"
#include "../../util/funcs.h"
#include "../../util/profiling.h"
#include "transvoxel_tables.cpp"
@ -59,11 +60,77 @@ void VoxelMesherTransvoxel::fill_surface_arrays(Array &arrays, const Transvoxel:
arrays[Mesh::ARRAY_INDEX] = indices;
}
// Rebuilds one vertex attribute array through a MeshOptimizer remap table,
// keeping only `unique_vertex_count` deduplicated entries.
// `T` is the attribute type (position, normal, UV...).
// If the source attribute is absent (empty), the destination is cleared too.
template <typename T>
static void remap_vertex_array(const std::vector<T> &src_data, std::vector<T> &dst_data,
		const std::vector<unsigned int> &remap_indices, unsigned int unique_vertex_count) {
	if (src_data.empty()) {
		dst_data.clear();
		return;
	}
	dst_data.resize(unique_vertex_count);
	// Using data() instead of &v[0]: valid even if `unique_vertex_count` is 0
	meshopt_remapVertexBuffer(dst_data.data(), src_data.data(), src_data.size(), sizeof(T), remap_indices.data());
}
// Simplifies `src_mesh` into `dst_mesh` using MeshOptimizer.
// `p_target_ratio` is the desired index count relative to the source, in [0, 1].
// `p_error_threshold` is the tolerated simplification error, in [0, 1].
// Fails (leaving `dst_mesh` untouched) if parameters are out of range or the
// source mesh has fewer than 3 vertices or indices.
static void simplify(const Transvoxel::MeshArrays &src_mesh, Transvoxel::MeshArrays &dst_mesh,
		float p_target_ratio, float p_error_threshold) {
	VOXEL_PROFILE_SCOPE();

	// Gather and check input
	ERR_FAIL_COND(p_target_ratio < 0.f || p_target_ratio > 1.f);
	ERR_FAIL_COND(p_error_threshold < 0.f || p_error_threshold > 1.f);
	ERR_FAIL_COND(src_mesh.vertices.size() < 3);
	ERR_FAIL_COND(src_mesh.indices.size() < 3);

	const unsigned int target_index_count = p_target_ratio * src_mesh.indices.size();

	static thread_local std::vector<unsigned int> lod_indices;
	lod_indices.clear();
	lod_indices.resize(src_mesh.indices.size());

	float lod_error = 0.f;

	// Simplify
	{
		VOXEL_PROFILE_SCOPE_NAMED("meshopt_simplify");
		const unsigned int lod_index_count = meshopt_simplify(
				lod_indices.data(), reinterpret_cast<const unsigned int *>(src_mesh.indices.data()),
				src_mesh.indices.size(),
				&src_mesh.vertices[0].x, src_mesh.vertices.size(),
				sizeof(Vector3), target_index_count, p_error_threshold, &lod_error);
		// meshopt may return fewer indices than requested
		lod_indices.resize(lod_index_count);
	}

	// Produce output
	// (an unused `Array surface` local was removed here; it was never read)
	static thread_local std::vector<unsigned int> remap_indices;
	remap_indices.clear();
	remap_indices.resize(src_mesh.vertices.size());

	// Build a remap table dropping vertices no longer referenced by the simplified index buffer
	const unsigned int unique_vertex_count = meshopt_optimizeVertexFetchRemap(
			remap_indices.data(), lod_indices.data(), lod_indices.size(), src_mesh.vertices.size());

	remap_vertex_array(src_mesh.vertices, dst_mesh.vertices, remap_indices, unique_vertex_count);
	remap_vertex_array(src_mesh.normals, dst_mesh.normals, remap_indices, unique_vertex_count);
	remap_vertex_array(src_mesh.extra, dst_mesh.extra, remap_indices, unique_vertex_count);
	remap_vertex_array(src_mesh.uv, dst_mesh.uv, remap_indices, unique_vertex_count);

	dst_mesh.indices.resize(lod_indices.size());
	// TODO Not sure if arguments are correct
	meshopt_remapIndexBuffer(reinterpret_cast<unsigned int *>(dst_mesh.indices.data()),
			lod_indices.data(), lod_indices.size(), remap_indices.data());
}
void VoxelMesherTransvoxel::build(VoxelMesher::Output &output, const VoxelMesher::Input &input) {
VOXEL_PROFILE_SCOPE();
static thread_local Transvoxel::Cache s_cache;
static thread_local Transvoxel::MeshArrays s_mesh_arrays;
static thread_local Transvoxel::MeshArrays s_simplified_mesh_arrays;
const int sdf_channel = VoxelBuffer::CHANNEL_SDF;
@ -91,7 +158,20 @@ void VoxelMesherTransvoxel::build(VoxelMesher::Output &output, const VoxelMesher
}
Array regular_arrays;
fill_surface_arrays(regular_arrays, s_mesh_arrays);
if (_mesh_optimization_params.enabled) {
// TODO When voxel texturing is enabled, this will decrease quality a lot.
// There is no support yet for taking textures into account when simplifying.
// See https://github.com/zeux/meshoptimizer/issues/158
simplify(s_mesh_arrays, s_simplified_mesh_arrays,
_mesh_optimization_params.target_ratio, _mesh_optimization_params.error_threshold);
fill_surface_arrays(regular_arrays, s_simplified_mesh_arrays);
} else {
fill_surface_arrays(regular_arrays, s_mesh_arrays);
}
output.surfaces.push_back(regular_arrays);
for (int dir = 0; dir < Cube::SIDE_COUNT; ++dir) {
@ -159,6 +239,30 @@ VoxelMesherTransvoxel::TexturingMode VoxelMesherTransvoxel::get_texturing_mode()
return _texture_mode;
}
// Enables or disables MeshOptimizer-based simplification of the output mesh.
void VoxelMesherTransvoxel::set_mesh_optimization_enabled(bool enabled) {
	_mesh_optimization_params.enabled = enabled;
}
// Returns whether MeshOptimizer-based simplification is enabled.
bool VoxelMesherTransvoxel::is_mesh_optimization_enabled() const {
	return _mesh_optimization_params.enabled;
}
// Sets the tolerated simplification error. Value is clamped to [0, 1].
void VoxelMesherTransvoxel::set_mesh_optimization_error_threshold(float threshold) {
	_mesh_optimization_params.error_threshold = clamp(threshold, 0.f, 1.f);
}
// Returns the tolerated simplification error, in [0, 1].
float VoxelMesherTransvoxel::get_mesh_optimization_error_threshold() const {
	return _mesh_optimization_params.error_threshold;
}
// Sets the desired index count relative to the source mesh. Value is clamped to [0, 1].
void VoxelMesherTransvoxel::set_mesh_optimization_target_ratio(float ratio) {
	_mesh_optimization_params.target_ratio = clamp(ratio, 0.f, 1.f);
}
// Returns the desired index count ratio used when simplification is enabled, in [0, 1].
float VoxelMesherTransvoxel::get_mesh_optimization_target_ratio() const {
	return _mesh_optimization_params.target_ratio;
}
void VoxelMesherTransvoxel::_bind_methods() {
ClassDB::bind_method(D_METHOD("build_transition_mesh", "voxel_buffer", "direction"),
&VoxelMesherTransvoxel::build_transition_mesh);
@ -166,10 +270,34 @@ void VoxelMesherTransvoxel::_bind_methods() {
ClassDB::bind_method(D_METHOD("set_texturing_mode", "mode"), &VoxelMesherTransvoxel::set_texturing_mode);
ClassDB::bind_method(D_METHOD("get_texturing_mode"), &VoxelMesherTransvoxel::get_texturing_mode);
ClassDB::bind_method(D_METHOD("set_mesh_optimization_enabled", "enabled"),
&VoxelMesherTransvoxel::set_mesh_optimization_enabled);
ClassDB::bind_method(D_METHOD("is_mesh_optimization_enabled"),
&VoxelMesherTransvoxel::is_mesh_optimization_enabled);
ClassDB::bind_method(D_METHOD("set_mesh_optimization_error_threshold", "threshold"),
&VoxelMesherTransvoxel::set_mesh_optimization_error_threshold);
ClassDB::bind_method(D_METHOD("get_mesh_optimization_error_threshold"),
&VoxelMesherTransvoxel::get_mesh_optimization_error_threshold);
ClassDB::bind_method(D_METHOD("set_mesh_optimization_target_ratio", "ratio"),
&VoxelMesherTransvoxel::set_mesh_optimization_target_ratio);
ClassDB::bind_method(D_METHOD("get_mesh_optimization_target_ratio"),
&VoxelMesherTransvoxel::get_mesh_optimization_target_ratio);
ADD_PROPERTY(PropertyInfo(
Variant::INT, "texturing_mode", PROPERTY_HINT_ENUM, "None,4-blend over 16 textures (4 bits)"),
"set_texturing_mode", "get_texturing_mode");
ADD_PROPERTY(PropertyInfo(Variant::BOOL, "mesh_optimization_enabled"),
"set_mesh_optimization_enabled", "is_mesh_optimization_enabled");
ADD_PROPERTY(PropertyInfo(Variant::REAL, "mesh_optimization_error_threshold"),
"set_mesh_optimization_error_threshold", "get_mesh_optimization_error_threshold");
ADD_PROPERTY(PropertyInfo(Variant::REAL, "mesh_optimization_target_ratio"),
"set_mesh_optimization_target_ratio", "get_mesh_optimization_target_ratio");
BIND_ENUM_CONSTANT(TEXTURES_NONE);
BIND_ENUM_CONSTANT(TEXTURES_BLEND_4_OVER_16);
}

View File

@ -27,6 +27,15 @@ public:
void set_texturing_mode(TexturingMode mode);
TexturingMode get_texturing_mode() const;
void set_mesh_optimization_enabled(bool enabled);
bool is_mesh_optimization_enabled() const;
void set_mesh_optimization_error_threshold(float threshold);
float get_mesh_optimization_error_threshold() const;
void set_mesh_optimization_target_ratio(float ratio);
float get_mesh_optimization_target_ratio() const;
protected:
static void _bind_methods();
@ -34,6 +43,14 @@ private:
void fill_surface_arrays(Array &arrays, const Transvoxel::MeshArrays &src);
TexturingMode _texture_mode = TEXTURES_NONE;
// Parameters controlling optional MeshOptimizer-based mesh simplification.
struct MeshOptimizationParams {
	bool enabled = false;
	// Tolerated simplification error, clamped to [0, 1] by the setter
	float error_threshold = 0.005;
	// Desired index count relative to the source mesh, clamped to [0, 1] by the setter
	float target_ratio = 0.0;
};
MeshOptimizationParams _mesh_optimization_params;
};
VARIANT_ENUM_CAST(VoxelMesherTransvoxel::TexturingMode);

View File

@ -1,6 +1,7 @@
#include "tests.h"
#include "../generators/graph/voxel_generator_graph.h"
#include "../storage/voxel_data_map.h"
#include "../util/island_finder.h"
#include "../util/math/box3i.h"
#include <core/hash_map.h>
@ -476,6 +477,86 @@ void test_voxel_graph_generator_texturing() {
}
}
// Verifies IslandFinder labels the expected number of distinct connected groups
// in a hand-written 5x5x5 grid ('X' = matter, '-' = empty, one slab per Z layer).
void test_island_finder() {
	const char *cdata =
			"X X X - X "
			"X X X - - "
			"X X X - - "
			"X X X - - "
			"X X X - - "
			//
			"- - - - - "
			"X X - - - "
			"X X - - - "
			"X X X X X "
			"X X - - X "
			//
			"- - - - - "
			"- - - - - "
			"- - - - - "
			"- - - - - "
			"- - - - - "
			//
			"- - - - - "
			"- - - - - "
			"- - X - - "
			"- - X X - "
			"- - - - - "
			//
			"- - - - - "
			"- - - - - "
			"- - - - - "
			"- - - X - "
			"- - - - - "
			//
			;

	const Vector3i grid_size(5, 5, 5);
	// Each cell occupies 2 characters ("X " or "- "); cast avoids a signed/unsigned compare
	ERR_FAIL_COND(static_cast<size_t>(grid_size.volume()) != strlen(cdata) / 2);

	// Parse the ASCII grid into 0/1 cells
	std::vector<int> grid;
	grid.resize(grid_size.volume());

	for (unsigned int i = 0; i < grid.size(); ++i) {
		const char c = cdata[i * 2];
		if (c == 'X') {
			grid[i] = 1;
		} else if (c == '-') {
			grid[i] = 0;
		} else {
			ERR_FAIL();
		}
	}

	std::vector<uint8_t> output;
	output.resize(grid_size.volume());
	// Initialized in case scan_3d fails before writing it (was previously uninitialized)
	unsigned int label_count = 0;

	IslandFinder island_finder;
	island_finder.scan_3d(
			Box3i(Vector3i(), grid_size),
			[&grid, grid_size](Vector3i pos) {
				const unsigned int i = pos.get_zxy_index(grid_size);
				CRASH_COND(i >= grid.size());
				return grid[i] == 1;
			},
			to_span(output), &label_count);

	// unsigned int i = 0;
	// for (int z = 0; z < grid_size.z; ++z) {
	// 	for (int x = 0; x < grid_size.x; ++x) {
	// 		String s;
	// 		for (int y = 0; y < grid_size.y; ++y) {
	// 			s += String::num_int64(output[i++]);
	// 			s += " ";
	// 		}
	// 		print_line(s);
	// 	}
	// 	print_line("//");
	// }

	ERR_FAIL_COND(label_count != 3);
}
void test_unordered_remove_if() {
struct L {
static unsigned int count(const std::vector<int> &vec, int v) {
@ -594,6 +675,7 @@ void run_voxel_tests() {
VOXEL_TEST(test_copy_3d_region_zxy);
VOXEL_TEST(test_voxel_graph_generator_default_graph_compilation);
VOXEL_TEST(test_voxel_graph_generator_texturing);
VOXEL_TEST(test_island_finder);
VOXEL_TEST(test_unordered_remove_if);
print_line("------------ Voxel tests end -------------");

21
thirdparty/meshoptimizer/LICENSE.md vendored Normal file
View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2016-2021 Arseny Kapoulkine
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

1
thirdparty/meshoptimizer/VERSION.txt vendored Normal file
View File

@ -0,0 +1 @@
316167c3606c4bfd7647976ca0299afa31163ea7

View File

@ -0,0 +1,8 @@
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"
// Overrides the allocation callbacks stored in meshopt_Allocator::Storage,
// which the library uses for its internal allocations.
// Both callbacks must remain valid for as long as the library is used.
void meshopt_setAllocator(void* (*allocate)(size_t), void (*deallocate)(void*))
{
	meshopt_Allocator::Storage::allocate = allocate;
	meshopt_Allocator::Storage::deallocate = deallocate;
}

856
thirdparty/meshoptimizer/clusterizer.cpp vendored Normal file
View File

@ -0,0 +1,856 @@
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"
#include <assert.h>
#include <float.h>
#include <math.h>
#include <string.h>
// This work is based on:
// Graham Wihlidal. Optimizing the Graphics Pipeline with Compute. 2016
// Matthaeus Chajdas. GeometryFX 1.2 - Cluster Culling. 2016
// Jack Ritter. An Efficient Bounding Sphere. 1990
namespace meshopt
{
// This must be <= 255 since index 0xff is used internally to indice a vertex that doesn't belong to a meshlet
const size_t kMeshletMaxVertices = 255;
// A reasonable limit is around 2*max_vertices or less
const size_t kMeshletMaxTriangles = 512;
// Vertex-to-triangle adjacency in a packed (CSR-style) layout;
// filled by buildTriangleAdjacency below.
struct TriangleAdjacency2
{
	unsigned int* counts;  // per-vertex number of incident triangles
	unsigned int* offsets; // per-vertex start offset into `data`
	unsigned int* data;    // triangle indices, grouped by vertex
};
// Fills `adjacency` so that for every vertex we can enumerate the triangles using it.
// Counting-sort style: count incident triangles per vertex, prefix-sum into offsets,
// then scatter triangle indices; offsets are restored at the end.
static void buildTriangleAdjacency(TriangleAdjacency2& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
{
	size_t face_count = index_count / 3;

	// allocate arrays
	adjacency.counts = allocator.allocate<unsigned int>(vertex_count);
	adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);
	adjacency.data = allocator.allocate<unsigned int>(index_count);

	// fill triangle counts
	memset(adjacency.counts, 0, vertex_count * sizeof(unsigned int));

	for (size_t i = 0; i < index_count; ++i)
	{
		assert(indices[i] < vertex_count);

		adjacency.counts[indices[i]]++;
	}

	// fill offset table (exclusive prefix sum of counts)
	unsigned int offset = 0;

	for (size_t i = 0; i < vertex_count; ++i)
	{
		adjacency.offsets[i] = offset;
		offset += adjacency.counts[i];
	}

	assert(offset == index_count);

	// fill triangle data (each write advances the vertex's offset cursor)
	for (size_t i = 0; i < face_count; ++i)
	{
		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];

		adjacency.data[adjacency.offsets[a]++] = unsigned(i);
		adjacency.data[adjacency.offsets[b]++] = unsigned(i);
		adjacency.data[adjacency.offsets[c]++] = unsigned(i);
	}

	// fix offsets that have been disturbed by the previous pass
	for (size_t i = 0; i < vertex_count; ++i)
	{
		assert(adjacency.offsets[i] >= adjacency.counts[i]);

		adjacency.offsets[i] -= adjacency.counts[i];
	}
}
// Computes an approximate bounding sphere (Ritter 1990, per the file header):
// seed the sphere with the most distant axis-extremum pair, then grow it until
// every point fits. `result` receives {center.x, center.y, center.z, radius}.
static void computeBoundingSphere(float result[4], const float points[][3], size_t count)
{
	assert(count > 0);

	// find extremum points along all 3 axes; for each axis we get a pair of points with min/max coordinates
	size_t pmin[3] = {0, 0, 0};
	size_t pmax[3] = {0, 0, 0};

	for (size_t i = 0; i < count; ++i)
	{
		const float* p = points[i];

		for (int axis = 0; axis < 3; ++axis)
		{
			pmin[axis] = (p[axis] < points[pmin[axis]][axis]) ? i : pmin[axis];
			pmax[axis] = (p[axis] > points[pmax[axis]][axis]) ? i : pmax[axis];
		}
	}

	// find the pair of points with largest distance
	float paxisd2 = 0;
	int paxis = 0;

	for (int axis = 0; axis < 3; ++axis)
	{
		const float* p1 = points[pmin[axis]];
		const float* p2 = points[pmax[axis]];

		float d2 = (p2[0] - p1[0]) * (p2[0] - p1[0]) + (p2[1] - p1[1]) * (p2[1] - p1[1]) + (p2[2] - p1[2]) * (p2[2] - p1[2]);

		if (d2 > paxisd2)
		{
			paxisd2 = d2;
			paxis = axis;
		}
	}

	// use the longest segment as the initial sphere diameter
	const float* p1 = points[pmin[paxis]];
	const float* p2 = points[pmax[paxis]];

	float center[3] = {(p1[0] + p2[0]) / 2, (p1[1] + p2[1]) / 2, (p1[2] + p2[2]) / 2};
	float radius = sqrtf(paxisd2) / 2;

	// iteratively adjust the sphere up until all points fit
	for (size_t i = 0; i < count; ++i)
	{
		const float* p = points[i];
		float d2 = (p[0] - center[0]) * (p[0] - center[0]) + (p[1] - center[1]) * (p[1] - center[1]) + (p[2] - center[2]) * (p[2] - center[2]);

		if (d2 > radius * radius)
		{
			float d = sqrtf(d2);
			assert(d > 0);

			// move the center towards the outlier and enlarge the radius just
			// enough to cover both the old sphere and the new point
			float k = 0.5f + (radius / d) / 2;

			center[0] = center[0] * k + p[0] * (1 - k);
			center[1] = center[1] * k + p[1] * (1 - k);
			center[2] = center[2] * k + p[2] * (1 - k);
			radius = (radius + d) / 2;
		}
	}

	result[0] = center[0];
	result[1] = center[1];
	result[2] = center[2];
	result[3] = radius;
}
// Meshlet cone: position (px, py, pz) and axis direction (nx, ny, nz).
// Also used as an accumulator before averaging (see getMeshletCone).
struct Cone
{
	float px, py, pz;
	float nx, ny, nz;
};
// Scores a candidate triangle for meshlet growth: blends the triangle's distance
// from the meshlet center (normalized by the expected meshlet radius) with its
// normal agreement (`spread`), weighted by `cone_weight`.
static float getMeshletScore(float distance2, float spread, float cone_weight, float expected_radius)
{
	float cone_term = 1.f - spread * cone_weight;
	// Keep a tiny positive factor so the distance term can still differentiate candidates
	if (cone_term < 1e-3f)
		cone_term = 1e-3f;

	const float distance_term = 1 + sqrtf(distance2) / expected_radius * (1 - cone_weight);

	return distance_term * cone_term;
}
// Converts an accumulated cone into an average one: the position is divided by
// `triangle_count` and the axis is normalized. Both divisions are guarded
// against zero (empty meshlet / zero-length axis yield zeros).
static Cone getMeshletCone(const Cone& acc, unsigned int triangle_count)
{
	Cone result = acc;

	float center_scale = triangle_count == 0 ? 0.f : 1.f / float(triangle_count);

	result.px *= center_scale;
	result.py *= center_scale;
	result.pz *= center_scale;

	float axis_length = result.nx * result.nx + result.ny * result.ny + result.nz * result.nz;
	float axis_scale = axis_length == 0.f ? 0.f : 1.f / sqrtf(axis_length);

	result.nx *= axis_scale;
	result.ny *= axis_scale;
	result.nz *= axis_scale;

	return result;
}
// Fills `triangles` with one cone per input triangle: centroid in (px, py, pz)
// and unit normal (from the edge cross product) in (nx, ny, nz); degenerate
// triangles get a zero normal. Returns the summed cross-product magnitudes
// (proportional to the total mesh surface area).
static float computeTriangleCones(Cone* triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
	(void)vertex_count;

	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
	size_t face_count = index_count / 3;

	float mesh_area = 0;

	for (size_t i = 0; i < face_count; ++i)
	{
		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
		assert(a < vertex_count && b < vertex_count && c < vertex_count);

		const float* p0 = vertex_positions + vertex_stride_float * a;
		const float* p1 = vertex_positions + vertex_stride_float * b;
		const float* p2 = vertex_positions + vertex_stride_float * c;

		// triangle normal = cross(p1 - p0, p2 - p0)
		float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};
		float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};

		float normalx = p10[1] * p20[2] - p10[2] * p20[1];
		float normaly = p10[2] * p20[0] - p10[0] * p20[2];
		float normalz = p10[0] * p20[1] - p10[1] * p20[0];

		float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz);
		float invarea = (area == 0.f) ? 0.f : 1.f / area;

		triangles[i].px = (p0[0] + p1[0] + p2[0]) / 3.f;
		triangles[i].py = (p0[1] + p1[1] + p2[1]) / 3.f;
		triangles[i].pz = (p0[2] + p1[2] + p2[2]) / 3.f;

		triangles[i].nx = normalx * invarea;
		triangles[i].ny = normaly * invarea;
		triangles[i].nz = normalz * invarea;

		mesh_area += area;
	}

	return mesh_area;
}
// Pads the meshlet's triangle index data with zeroes up to a 4-byte boundary so the
// next meshlet's triangle data starts 4b-aligned (matches the offset advance in
// appendMeshlet).
static void finishMeshlet(meshopt_Meshlet& meshlet, unsigned char* meshlet_triangles)
{
	size_t offset = meshlet.triangle_offset + meshlet.triangle_count * 3;

	// zero-fill until the offset reaches 4b alignment
	for (; offset & 3; ++offset)
		meshlet_triangles[offset] = 0;
}
// Appends triangle (a, b, c) to the current meshlet. If the triangle doesn't fit
// (vertex or triangle limit reached), the current meshlet is first written out to
// meshlets[meshlet_offset] and reset; returns true in that case so the caller can
// advance meshlet_offset and reset any per-meshlet accumulators.
// 'used' maps a mesh vertex index to its slot inside the current meshlet (0xff = absent).
static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int b, unsigned int c, unsigned char* used, meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t meshlet_offset, size_t max_vertices, size_t max_triangles)
{
	unsigned char& av = used[a];
	unsigned char& bv = used[b];
	unsigned char& cv = used[c];

	bool result = false;

	// number of vertices this triangle would newly add to the meshlet
	unsigned int used_extra = (av == 0xff) + (bv == 0xff) + (cv == 0xff);

	if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles)
	{
		// flush the full meshlet to the output
		meshlets[meshlet_offset] = meshlet;

		// clear used[] entries belonging to the flushed meshlet so the table can be reused
		for (size_t j = 0; j < meshlet.vertex_count; ++j)
			used[meshlet_vertices[meshlet.vertex_offset + j]] = 0xff;

		finishMeshlet(meshlet, meshlet_triangles);

		meshlet.vertex_offset += meshlet.vertex_count;
		meshlet.triangle_offset += (meshlet.triangle_count * 3 + 3) & ~3; // 4b padding
		meshlet.vertex_count = 0;
		meshlet.triangle_count = 0;

		result = true;
	}

	// allocate meshlet-local slots for vertices that aren't in the meshlet yet
	if (av == 0xff)
	{
		av = (unsigned char)meshlet.vertex_count;
		meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = a;
	}

	if (bv == 0xff)
	{
		bv = (unsigned char)meshlet.vertex_count;
		meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = b;
	}

	if (cv == 0xff)
	{
		cv = (unsigned char)meshlet.vertex_count;
		meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = c;
	}

	// store the triangle using meshlet-local vertex indices
	meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 0] = av;
	meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 1] = bv;
	meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 2] = cv;
	meshlet.triangle_count++;

	return result;
}
// Node of a flat, implicitly laid out kd-tree over points (built by kdtreeBuild,
// queried by kdtreeNearest).
struct KDNode
{
	union
	{
		float split;        // branches: split coordinate along 'axis'
		unsigned int index; // leaves: index of the point stored in this node
	};

	// leaves: axis = 3, children = number of extra points after this one (0 if 'index' is the only point)
	// branches: axis != 3, left subtree = skip 1, right subtree = skip 1+children
	unsigned int axis : 2;
	unsigned int children : 30;
};
// Partitions indices[0..count) in place so that points whose 'axis' coordinate is
// < pivot come first; returns the number of such points (the split position).
static size_t kdtreePartition(unsigned int* indices, size_t count, const float* points, size_t stride, unsigned int axis, float pivot)
{
	// invariant: [0, boundary) holds values < pivot, [boundary, i) holds values >= pivot
	size_t boundary = 0;

	for (size_t i = 0; i < count; ++i)
	{
		float value = points[indices[i] * stride + axis];

		// swap the current element into the boundary slot unconditionally
		unsigned int tmp = indices[boundary];
		indices[boundary] = indices[i];
		indices[i] = tmp;

		// advance the boundary only when the value belongs to the left side;
		// otherwise i moves past it while boundary stays, keeping the invariant
		if (value < pivot)
			++boundary;
	}

	return boundary;
}
// Emits a leaf at nodes[offset] holding 'count' points; the extra points occupy the
// nodes immediately following the leaf. Returns the offset just past the leaf.
static size_t kdtreeBuildLeaf(size_t offset, KDNode* nodes, size_t node_count, unsigned int* indices, size_t count)
{
	assert(offset + count <= node_count);
	(void)node_count;

	KDNode& result = nodes[offset];

	result.index = indices[0];
	result.axis = 3; // axis == 3 marks a leaf
	result.children = unsigned(count - 1);

	// all remaining points are stored in nodes immediately following the leaf
	for (size_t i = 1; i < count; ++i)
	{
		KDNode& tail = nodes[offset + i];

		tail.index = indices[i];
		tail.axis = 3;
		tail.children = ~0u >> 2; // bogus value to prevent misuse
	}

	return offset + count;
}
// Recursively builds a kd-tree over the points referenced by indices[0..count),
// writing nodes starting at nodes[offset]; returns the offset just past the subtree.
// Splits along the axis of largest variance at the mean coordinate.
static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const float* points, size_t stride, unsigned int* indices, size_t count, size_t leaf_size)
{
	assert(count > 0);
	assert(offset < node_count);

	if (count <= leaf_size)
		return kdtreeBuildLeaf(offset, nodes, node_count, indices, count);

	float mean[3] = {};
	float vars[3] = {};
	float runc = 1, runs = 1;

	// gather statistics on the points in the subtree using Welford's algorithm
	for (size_t i = 0; i < count; ++i, runc += 1.f, runs = 1.f / runc)
	{
		const float* point = points + indices[i] * stride;

		for (int k = 0; k < 3; ++k)
		{
			float delta = point[k] - mean[k];
			mean[k] += delta * runs;
			vars[k] += delta * (point[k] - mean[k]);
		}
	}

	// split axis is one where the variance is largest
	unsigned int axis = vars[0] >= vars[1] && vars[0] >= vars[2] ? 0 : vars[1] >= vars[2] ? 1 : 2;

	float split = mean[axis];
	size_t middle = kdtreePartition(indices, count, points, stride, axis, split);

	// when the partition is degenerate simply consolidate the points into a single node
	if (middle <= leaf_size / 2 || middle >= count - leaf_size / 2)
		return kdtreeBuildLeaf(offset, nodes, node_count, indices, count);

	KDNode& result = nodes[offset];

	result.split = split;
	result.axis = axis;

	// left subtree is right after our node
	size_t next_offset = kdtreeBuild(offset + 1, nodes, node_count, points, stride, indices, middle, leaf_size);

	// distance to the right subtree is represented explicitly
	result.children = unsigned(next_offset - offset - 1);

	return kdtreeBuild(next_offset, nodes, node_count, points, stride, indices + middle, count - middle, leaf_size);
}
// Finds the point nearest to 'position' whose emitted flag is not set; on return
// 'result' holds its index and 'limit' its distance. Subtrees farther than the best
// distance so far are pruned via the split-plane distance.
static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, size_t stride, const unsigned char* emitted_flags, const float* position, unsigned int& result, float& limit)
{
	const KDNode& node = nodes[root];

	if (node.axis == 3)
	{
		// leaf: scan this node plus the 'children' tail nodes holding extra points
		for (unsigned int i = 0; i <= node.children; ++i)
		{
			unsigned int index = nodes[root + i].index;

			// skip points that were already consumed by the caller
			if (emitted_flags[index])
				continue;

			const float* point = points + index * stride;

			float distance2 =
			    (point[0] - position[0]) * (point[0] - position[0]) +
			    (point[1] - position[1]) * (point[1] - position[1]) +
			    (point[2] - position[2]) * (point[2] - position[2]);
			float distance = sqrtf(distance2);

			if (distance < limit)
			{
				result = index;
				limit = distance;
			}
		}
	}
	else
	{
		// branch; we order recursion to process the node that search position is in first
		float delta = position[node.axis] - node.split;
		unsigned int first = (delta <= 0) ? 0 : node.children;
		unsigned int second = first ^ node.children;

		kdtreeNearest(nodes, root + 1 + first, points, stride, emitted_flags, position, result, limit);

		// only process the other node if it can have a match based on closest distance so far
		if (fabsf(delta) <= limit)
			kdtreeNearest(nodes, root + 1 + second, points, stride, emitted_flags, position, result, limit);
	}
}
} // namespace meshopt
// Returns the worst-case number of meshlets that meshopt_buildMeshlets /
// meshopt_buildMeshletsScan can produce for the given limits, so the caller can
// size the output arrays.
size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles)
{
	using namespace meshopt;

	assert(index_count % 3 == 0);
	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
	assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
	assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned

	(void)kMeshletMaxVertices;
	(void)kMeshletMaxTriangles;

	// the worst case is an unindexed stream, which stresses both limits equally;
	// we conservatively assume 2 vertices per meshlet stay unpacked, since with
	// 3 free slots any triangle can be packed
	size_t usable_vertices = max_vertices - 2;

	size_t limit_by_vertices = (index_count + usable_vertices - 1) / usable_vertices;
	size_t limit_by_triangles = (index_count / 3 + max_triangles - 1) / max_triangles;

	// the binding constraint is whichever limit yields more meshlets
	return limit_by_vertices < limit_by_triangles ? limit_by_triangles : limit_by_vertices;
}
// Splits the index buffer into meshlets using a greedy clusterizer: triangles are
// added one at a time, preferring triangles adjacent to the current meshlet that are
// spatially close and normal-aligned (controlled by cone_weight); when no adjacent
// triangle exists, a kd-tree lookup seeds the next cluster from the nearest
// unprocessed triangle. Returns the number of meshlets written.
size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight)
{
	using namespace meshopt;

	assert(index_count % 3 == 0);
	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
	assert(vertex_positions_stride % sizeof(float) == 0);

	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
	assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
	assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned

	meshopt_Allocator allocator;

	TriangleAdjacency2 adjacency = {};
	buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);

	// live_triangles[v] = number of not-yet-emitted triangles referencing vertex v
	unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count);
	memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int));

	size_t face_count = index_count / 3;

	unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count);
	memset(emitted_flags, 0, face_count);

	// for each triangle, precompute centroid & normal to use for scoring
	Cone* triangles = allocator.allocate<Cone>(face_count);
	float mesh_area = computeTriangleCones(triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride);

	// assuming each meshlet is a square patch, expected radius is sqrt(expected area)
	float triangle_area_avg = face_count == 0 ? 0.f : mesh_area / float(face_count) * 0.5f;
	float meshlet_expected_radius = sqrtf(triangle_area_avg * max_triangles) * 0.5f;

	// build a kd-tree over triangle centroids for nearest neighbor lookup
	unsigned int* kdindices = allocator.allocate<unsigned int>(face_count);
	for (size_t i = 0; i < face_count; ++i)
		kdindices[i] = unsigned(i);

	KDNode* nodes = allocator.allocate<KDNode>(face_count * 2);
	kdtreeBuild(0, nodes, face_count * 2, &triangles[0].px, sizeof(Cone) / sizeof(float), kdindices, face_count, /* leaf_size= */ 8);

	// index of the vertex in the meshlet, 0xff if the vertex isn't used
	unsigned char* used = allocator.allocate<unsigned char>(vertex_count);
	memset(used, -1, vertex_count);

	meshopt_Meshlet meshlet = {};
	size_t meshlet_offset = 0;

	// component-wise sum of the cones of triangles in the current meshlet
	Cone meshlet_cone_acc = {};

	for (;;)
	{
		unsigned int best_triangle = ~0u;
		unsigned int best_extra = 5;
		float best_score = FLT_MAX;

		Cone meshlet_cone = getMeshletCone(meshlet_cone_acc, meshlet.triangle_count);

		// scan triangles adjacent to the current meshlet's vertices for the best candidate
		for (size_t i = 0; i < meshlet.vertex_count; ++i)
		{
			unsigned int index = meshlet_vertices[meshlet.vertex_offset + i];

			unsigned int* neighbours = &adjacency.data[0] + adjacency.offsets[index];
			size_t neighbours_size = adjacency.counts[index];

			for (size_t j = 0; j < neighbours_size; ++j)
			{
				unsigned int triangle = neighbours[j];
				assert(!emitted_flags[triangle]);

				unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];
				assert(a < vertex_count && b < vertex_count && c < vertex_count);

				// extra = number of new vertices the triangle would add to the meshlet
				unsigned int extra = (used[a] == 0xff) + (used[b] == 0xff) + (used[c] == 0xff);

				// triangles that don't add new vertices to meshlets are max. priority
				if (extra != 0)
				{
					// artificially increase the priority of dangling triangles as they're expensive to add to new meshlets
					if (live_triangles[a] == 1 || live_triangles[b] == 1 || live_triangles[c] == 1)
						extra = 0;

					extra++;
				}

				// since topology-based priority is always more important than the score, we can skip scoring in some cases
				if (extra > best_extra)
					continue;

				const Cone& tri_cone = triangles[triangle];

				float distance2 =
				    (tri_cone.px - meshlet_cone.px) * (tri_cone.px - meshlet_cone.px) +
				    (tri_cone.py - meshlet_cone.py) * (tri_cone.py - meshlet_cone.py) +
				    (tri_cone.pz - meshlet_cone.pz) * (tri_cone.pz - meshlet_cone.pz);
				float spread = tri_cone.nx * meshlet_cone.nx + tri_cone.ny * meshlet_cone.ny + tri_cone.nz * meshlet_cone.nz;

				float score = getMeshletScore(distance2, spread, cone_weight, meshlet_expected_radius);

				// note that topology-based priority is always more important than the score
				// this helps maintain reasonable effectiveness of meshlet data and reduces scoring cost
				if (extra < best_extra || score < best_score)
				{
					best_triangle = triangle;
					best_extra = extra;
					best_score = score;
				}
			}
		}

		// no adjacent candidate: seed a (new) cluster from the nearest unprocessed triangle
		if (best_triangle == ~0u)
		{
			float position[3] = {meshlet_cone.px, meshlet_cone.py, meshlet_cone.pz};
			unsigned int index = ~0u;
			float limit = FLT_MAX;

			kdtreeNearest(nodes, 0, &triangles[0].px, sizeof(Cone) / sizeof(float), emitted_flags, position, index, limit);

			best_triangle = index;
		}

		// every triangle has been emitted
		if (best_triangle == ~0u)
			break;

		unsigned int a = indices[best_triangle * 3 + 0], b = indices[best_triangle * 3 + 1], c = indices[best_triangle * 3 + 2];
		assert(a < vertex_count && b < vertex_count && c < vertex_count);

		// add meshlet to the output; when the current meshlet is full we reset the accumulated bounds
		if (appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles))
		{
			meshlet_offset++;
			memset(&meshlet_cone_acc, 0, sizeof(meshlet_cone_acc));
		}

		live_triangles[a]--;
		live_triangles[b]--;
		live_triangles[c]--;

		// remove emitted triangle from adjacency data
		// this makes sure that we spend less time traversing these lists on subsequent iterations
		for (size_t k = 0; k < 3; ++k)
		{
			unsigned int index = indices[best_triangle * 3 + k];

			unsigned int* neighbours = &adjacency.data[0] + adjacency.offsets[index];
			size_t neighbours_size = adjacency.counts[index];

			for (size_t i = 0; i < neighbours_size; ++i)
			{
				unsigned int tri = neighbours[i];

				if (tri == best_triangle)
				{
					// swap-remove to keep the list compact
					neighbours[i] = neighbours[neighbours_size - 1];
					adjacency.counts[index]--;
					break;
				}
			}
		}

		// update aggregated meshlet cone data for scoring subsequent triangles
		meshlet_cone_acc.px += triangles[best_triangle].px;
		meshlet_cone_acc.py += triangles[best_triangle].py;
		meshlet_cone_acc.pz += triangles[best_triangle].pz;
		meshlet_cone_acc.nx += triangles[best_triangle].nx;
		meshlet_cone_acc.ny += triangles[best_triangle].ny;
		meshlet_cone_acc.nz += triangles[best_triangle].nz;

		emitted_flags[best_triangle] = 1;
	}

	// flush the last, partially filled meshlet
	if (meshlet.triangle_count)
	{
		finishMeshlet(meshlet, meshlet_triangles);

		meshlets[meshlet_offset++] = meshlet;
	}

	assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles));
	return meshlet_offset;
}
// Splits the index buffer into meshlets by scanning triangles in input order;
// unlike meshopt_buildMeshlets, no spatial or topological optimization is performed.
// Returns the number of meshlets written.
size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
{
	using namespace meshopt;

	assert(index_count % 3 == 0);

	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
	assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
	assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned

	meshopt_Allocator allocator;

	// index of the vertex in the meshlet, 0xff if the vertex isn't used
	unsigned char* used = allocator.allocate<unsigned char>(vertex_count);
	memset(used, -1, vertex_count);

	meshopt_Meshlet meshlet = {};
	size_t meshlet_offset = 0;

	for (size_t i = 0; i < index_count; i += 3)
	{
		unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
		assert(a < vertex_count && b < vertex_count && c < vertex_count);

		// appends triangle to the meshlet and writes previous meshlet to the output if full
		meshlet_offset += appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles);
	}

	// flush the last, partially filled meshlet
	if (meshlet.triangle_count)
	{
		finishMeshlet(meshlet, meshlet_triangles);

		meshlets[meshlet_offset++] = meshlet;
	}

	assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles));
	return meshlet_offset;
}
// Computes a bounding sphere and a backface-culling normal cone for a cluster of
// triangles. Degenerate clusters return zeroed bounds (trivial reject); clusters
// whose normals span more than a hemisphere return cone_cutoff = 1 (trivial accept).
meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
	using namespace meshopt;

	assert(index_count % 3 == 0);
	assert(index_count / 3 <= kMeshletMaxTriangles);
	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
	assert(vertex_positions_stride % sizeof(float) == 0);

	(void)vertex_count;

	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);

	// compute triangle normals and gather triangle corners
	float normals[kMeshletMaxTriangles][3];
	float corners[kMeshletMaxTriangles][3][3];
	size_t triangles = 0;

	for (size_t i = 0; i < index_count; i += 3)
	{
		unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
		assert(a < vertex_count && b < vertex_count && c < vertex_count);

		const float* p0 = vertex_positions + vertex_stride_float * a;
		const float* p1 = vertex_positions + vertex_stride_float * b;
		const float* p2 = vertex_positions + vertex_stride_float * c;

		float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};
		float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};

		// cross(p10, p20); length is twice the triangle area
		float normalx = p10[1] * p20[2] - p10[2] * p20[1];
		float normaly = p10[2] * p20[0] - p10[0] * p20[2];
		float normalz = p10[0] * p20[1] - p10[1] * p20[0];

		float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz);

		// no need to include degenerate triangles - they will be invisible anyway
		if (area == 0.f)
			continue;

		// record triangle normals & corners for future use; normal and corner 0 define a plane equation
		normals[triangles][0] = normalx / area;
		normals[triangles][1] = normaly / area;
		normals[triangles][2] = normalz / area;
		memcpy(corners[triangles][0], p0, 3 * sizeof(float));
		memcpy(corners[triangles][1], p1, 3 * sizeof(float));
		memcpy(corners[triangles][2], p2, 3 * sizeof(float));
		triangles++;
	}

	meshopt_Bounds bounds = {};

	// degenerate cluster, no valid triangles => trivial reject (cone data is 0)
	if (triangles == 0)
		return bounds;

	// compute cluster bounding sphere; we'll use the center to determine normal cone apex as well
	float psphere[4] = {};
	computeBoundingSphere(psphere, corners[0], triangles * 3);

	float center[3] = {psphere[0], psphere[1], psphere[2]};

	// treating triangle normals as points, find the bounding sphere - the sphere center determines the optimal cone axis
	float nsphere[4] = {};
	computeBoundingSphere(nsphere, normals, triangles);

	float axis[3] = {nsphere[0], nsphere[1], nsphere[2]};
	float axislength = sqrtf(axis[0] * axis[0] + axis[1] * axis[1] + axis[2] * axis[2]);
	float invaxislength = axislength == 0.f ? 0.f : 1.f / axislength;

	axis[0] *= invaxislength;
	axis[1] *= invaxislength;
	axis[2] *= invaxislength;

	// compute a tight cone around all normals, mindp = cos(angle/2)
	float mindp = 1.f;

	for (size_t i = 0; i < triangles; ++i)
	{
		float dp = normals[i][0] * axis[0] + normals[i][1] * axis[1] + normals[i][2] * axis[2];

		mindp = (dp < mindp) ? dp : mindp;
	}

	// fill bounding sphere info; note that below we can return bounds without cone information for degenerate cones
	bounds.center[0] = center[0];
	bounds.center[1] = center[1];
	bounds.center[2] = center[2];
	bounds.radius = psphere[3];

	// degenerate cluster, normal cone is larger than a hemisphere => trivial accept
	// note that if mindp is positive but close to 0, the triangle intersection code below gets less stable
	// we arbitrarily decide that if a normal cone is ~168 degrees wide or more, the cone isn't useful
	if (mindp <= 0.1f)
	{
		bounds.cone_cutoff = 1;
		bounds.cone_cutoff_s8 = 127;
		return bounds;
	}

	float maxt = 0;

	// we need to find the point on center-t*axis ray that lies in negative half-space of all triangles
	for (size_t i = 0; i < triangles; ++i)
	{
		// dot(center-t*axis-corner, trinormal) = 0
		// dot(center-corner, trinormal) - t * dot(axis, trinormal) = 0
		float cx = center[0] - corners[i][0][0];
		float cy = center[1] - corners[i][0][1];
		float cz = center[2] - corners[i][0][2];

		float dc = cx * normals[i][0] + cy * normals[i][1] + cz * normals[i][2];
		float dn = axis[0] * normals[i][0] + axis[1] * normals[i][1] + axis[2] * normals[i][2];

		// dn should be larger than mindp cutoff above
		assert(dn > 0.f);
		float t = dc / dn;

		maxt = (t > maxt) ? t : maxt;
	}

	// cone apex should be in the negative half-space of all cluster triangles by construction
	bounds.cone_apex[0] = center[0] - axis[0] * maxt;
	bounds.cone_apex[1] = center[1] - axis[1] * maxt;
	bounds.cone_apex[2] = center[2] - axis[2] * maxt;

	// note: this axis is the axis of the normal cone, but our test for perspective camera effectively negates the axis
	bounds.cone_axis[0] = axis[0];
	bounds.cone_axis[1] = axis[1];
	bounds.cone_axis[2] = axis[2];

	// cos(a) for normal cone is mindp; we need to add 90 degrees on both sides and invert the cone
	// which gives us -cos(a+90) = -(-sin(a)) = sin(a) = sqrt(1 - cos^2(a))
	bounds.cone_cutoff = sqrtf(1 - mindp * mindp);

	// quantize axis & cutoff to 8-bit SNORM format
	bounds.cone_axis_s8[0] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[0], 8));
	bounds.cone_axis_s8[1] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[1], 8));
	bounds.cone_axis_s8[2] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[2], 8));

	// for the 8-bit test to be conservative, we need to adjust the cutoff by measuring the max. error
	float cone_axis_s8_e0 = fabsf(bounds.cone_axis_s8[0] / 127.f - bounds.cone_axis[0]);
	float cone_axis_s8_e1 = fabsf(bounds.cone_axis_s8[1] / 127.f - bounds.cone_axis[1]);
	float cone_axis_s8_e2 = fabsf(bounds.cone_axis_s8[2] / 127.f - bounds.cone_axis[2]);

	// note that we need to round this up instead of rounding to nearest, hence +1
	int cone_cutoff_s8 = int(127 * (bounds.cone_cutoff + cone_axis_s8_e0 + cone_axis_s8_e1 + cone_axis_s8_e2) + 1);

	bounds.cone_cutoff_s8 = (cone_cutoff_s8 > 127) ? 127 : (signed char)(cone_cutoff_s8);

	return bounds;
}
// Computes bounds for a meshlet stored in its indirect form (local vertex remap +
// byte-sized micro-indices) by flattening it into a global index list and delegating
// to meshopt_computeClusterBounds.
meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices, const unsigned char* meshlet_triangles, size_t triangle_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
	using namespace meshopt;

	assert(triangle_count <= kMeshletMaxTriangles);
	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
	assert(vertex_positions_stride % sizeof(float) == 0);

	unsigned int indices[kMeshletMaxTriangles * 3];

	// resolve each micro-index through the meshlet's vertex remap table
	for (size_t i = 0; i < triangle_count * 3; ++i)
	{
		unsigned int remapped = meshlet_vertices[meshlet_triangles[i]];
		assert(remapped < vertex_count);

		indices[i] = remapped;
	}

	return meshopt_computeClusterBounds(indices, triangle_count * 3, vertex_positions, vertex_count, vertex_positions_stride);
}

674
thirdparty/meshoptimizer/indexcodec.cpp vendored Normal file
View File

@ -0,0 +1,674 @@
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"
#include <assert.h>
#include <string.h>
// This work is based on:
// Fabian Giesen. Simple lossless index buffer compression & follow-up. 2013
// Conor Stokes. Vertex Cache Optimised Index Buffer Compression. 2014
namespace meshopt
{
// Stream type markers stored in the first byte; the low nibble carries the codec version
// (written as kIndexHeader | version by the encoder, masked back out by the decoder).
const unsigned char kIndexHeader = 0xe0;
const unsigned char kSequenceHeader = 0xd0;

// Version used for subsequent encodes; settable via meshopt_encodeIndexVersion.
static int gEncodeIndexVersion = 0;

// 16-entry circular caches of recently seen vertices and directed edges.
typedef unsigned int VertexFifo[16];
typedef unsigned int EdgeFifo[16][2];

// The three rotations of a triangle; used to start a triangle at a fifo-matched edge
// or at the 'next' vertex.
static const unsigned int kTriangleIndexOrder[3][3] = {
    {0, 1, 2},
    {1, 2, 0},
    {2, 0, 1},
};

// Static table mapping the 14 most frequent (feb, fec) nibble pairs to 4-bit codes;
// generated from symbol frequency on a training mesh set.
static const unsigned char kCodeAuxEncodingTable[16] = {
    0x00, 0x76, 0x87, 0x56, 0x67, 0x78, 0xa9, 0x86, 0x65, 0x89, 0x68, 0x98, 0x01, 0x69,
    0, 0, // last two entries aren't used for encoding
};
// Picks the rotation of triangle (a, b, c) that starts at vertex 'next':
// 1 if b == next, 2 if c == next, 0 otherwise (indexes into kTriangleIndexOrder).
static int rotateTriangle(unsigned int a, unsigned int b, unsigned int c, unsigned int next)
{
	(void)a;

	if (b == next)
		return 1;

	return (c == next) ? 2 : 0;
}
// Searches the edge fifo (newest entry first) for any of the three directed edges of
// triangle (a, b, c). Returns (fifo_distance << 2) | rotation_id on a hit, where the
// rotation id selects the matching entry of kTriangleIndexOrder; -1 otherwise.
static int getEdgeFifo(EdgeFifo fifo, unsigned int a, unsigned int b, unsigned int c, size_t offset)
{
	for (int i = 0; i < 16; ++i)
	{
		// walk backwards from the most recently pushed entry
		size_t index = (offset - 1 - i) & 15;

		unsigned int e0 = fifo[index][0];
		unsigned int e1 = fifo[index][1];

		if (e0 == a && e1 == b)
			return (i << 2) | 0;
		if (e0 == b && e1 == c)
			return (i << 2) | 1;
		if (e0 == c && e1 == a)
			return (i << 2) | 2;
	}

	return -1;
}
// Appends directed edge (a, b) to the circular 16-entry edge fifo and advances the
// write cursor.
static void pushEdgeFifo(EdgeFifo fifo, unsigned int a, unsigned int b, size_t& offset)
{
	unsigned int* slot = fifo[offset];

	slot[0] = a;
	slot[1] = b;

	offset = (offset + 1) & 15;
}
// Returns the distance (0-15, newest first) of vertex v in the vertex fifo,
// or -1 if it is not present.
static int getVertexFifo(VertexFifo fifo, unsigned int v, size_t offset)
{
	for (int i = 0; i < 16; ++i)
	{
		// walk backwards from the most recently pushed entry
		size_t index = (offset - 1 - i) & 15;

		if (fifo[index] == v)
			return i;
	}

	return -1;
}
// Pushes v into the circular vertex fifo. When cond is 0 the value is written but the
// cursor doesn't advance, so the entry gets overwritten by the next push - this lets
// callers push conditionally without branching.
static void pushVertexFifo(VertexFifo fifo, unsigned int v, size_t& offset, int cond = 1)
{
	fifo[offset] = v;
	offset = (offset + cond) & 15;
}
// Appends v to the stream as a little-endian base-128 varint (1-5 bytes); the high
// bit of each byte flags that more groups follow.
static void encodeVByte(unsigned char*& data, unsigned int v)
{
	for (;;)
	{
		unsigned char group = (unsigned char)(v & 127);
		v >>= 7;

		// set the continuation bit while any bits remain
		*data++ = v ? (unsigned char)(group | 128) : group;

		if (v == 0)
			break;
	}
}
// Reads a base-128 varint written by encodeVByte. The read is bounded at 5 bytes
// total, so malformed data cannot make the loop run away.
static unsigned int decodeVByte(const unsigned char*& data)
{
	unsigned char lead = *data++;

	// single-byte fast path: no continuation bit set
	if (lead < 128)
		return lead;

	unsigned int result = lead & 127;

	// up to 4 continuation groups of 7 bits each
	for (unsigned int shift = 7; shift <= 28; shift += 7)
	{
		unsigned char group = *data++;

		result |= unsigned(group & 127) << shift;

		if (group < 128)
			break;
	}

	return result;
}
// Writes index as a delta from 'last' so that small forward/backward jumps take few bytes.
static void encodeIndex(unsigned char*& data, unsigned int index, unsigned int last)
{
	unsigned int d = index - last;
	// zigzag encoding: maps signed deltas 0, -1, 1, -2, ... to 0, 1, 2, 3, ...
	unsigned int v = (d << 1) ^ (int(d) >> 31);

	encodeVByte(data, v);
}
// Reads an index written by encodeIndex, reconstructing it relative to 'last'.
static unsigned int decodeIndex(const unsigned char*& data, unsigned int last)
{
	unsigned int v = decodeVByte(data);
	// undo zigzag encoding to recover the (two's complement) signed delta
	unsigned int d = (v >> 1) ^ -int(v & 1);

	return last + d;
}
// Returns the position of byte v in the 16-entry codeaux table, or -1 if absent.
static int getCodeAuxIndex(unsigned char v, const unsigned char* table)
{
	for (int i = 0; i < 16; ++i)
	{
		if (table[i] == v)
			return i;
	}

	return -1;
}
// Stores triangle (a, b, c) into the destination index buffer at element 'offset',
// honoring the caller's index size (2 or 4 bytes per index).
static void writeTriangle(void* destination, size_t offset, size_t index_size, unsigned int a, unsigned int b, unsigned int c)
{
	if (index_size == 2)
	{
		unsigned short* dst = static_cast<unsigned short*>(destination) + offset;

		dst[0] = (unsigned short)(a);
		dst[1] = (unsigned short)(b);
		dst[2] = (unsigned short)(c);
	}
	else
	{
		unsigned int* dst = static_cast<unsigned int*>(destination) + offset;

		dst[0] = a;
		dst[1] = b;
		dst[2] = c;
	}
}
} // namespace meshopt
// Compresses a triangle index buffer into 'buffer' using edge/vertex fifo prediction
// plus zigzag-varint deltas for unpredicted indices. Returns the number of bytes
// written, or 0 if buffer_size is too small (use meshopt_encodeIndexBufferBound).
size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count)
{
	using namespace meshopt;

	assert(index_count % 3 == 0);

	// the minimum valid encoding is header, 1 byte per triangle and a 16-byte codeaux table
	if (buffer_size < 1 + index_count / 3 + 16)
		return 0;

	int version = gEncodeIndexVersion;

	buffer[0] = (unsigned char)(kIndexHeader | version);

	EdgeFifo edgefifo;
	memset(edgefifo, -1, sizeof(edgefifo));

	VertexFifo vertexfifo;
	memset(vertexfifo, -1, sizeof(vertexfifo));

	size_t edgefifooffset = 0;
	size_t vertexfifooffset = 0;

	// 'next' predicts sequentially assigned vertex indices; 'last' anchors delta encoding
	unsigned int next = 0;
	unsigned int last = 0;

	// the stream is split into a code byte per triangle followed by variable-size data
	unsigned char* code = buffer + 1;
	unsigned char* data = code + index_count / 3;
	unsigned char* data_safe_end = buffer + buffer_size - 16;

	// version 1 reserves fifo codes 13/14 for last-1/last+1 (strip-like sequences)
	int fecmax = version >= 1 ? 13 : 15;

	// use static encoding table; it's possible to pack the result and then build an optimal table and repack
	// for now we keep it simple and use the table that has been generated based on symbol frequency on a training mesh set
	const unsigned char* codeaux_table = kCodeAuxEncodingTable;

	for (size_t i = 0; i < index_count; i += 3)
	{
		// make sure we have enough space to write a triangle
		// each triangle writes at most 16 bytes: 1b for codeaux and 5b for each free index
		// after this we can be sure we can write without extra bounds checks
		if (data > data_safe_end)
			return 0;

		int fer = getEdgeFifo(edgefifo, indices[i + 0], indices[i + 1], indices[i + 2], edgefifooffset);

		if (fer >= 0 && (fer >> 2) < 15)
		{
			// rotate the triangle so it starts with the fifo-matched edge
			const unsigned int* order = kTriangleIndexOrder[fer & 3];

			unsigned int a = indices[i + order[0]], b = indices[i + order[1]], c = indices[i + order[2]];

			// encode edge index and vertex fifo index, next or free index
			int fe = fer >> 2;
			int fc = getVertexFifo(vertexfifo, c, vertexfifooffset);

			int fec = (fc >= 1 && fc < fecmax) ? fc : (c == next) ? (next++, 0) : 15;

			if (fec == 15 && version >= 1)
			{
				// encode last-1 and last+1 to optimize strip-like sequences
				if (c + 1 == last)
					fec = 13, last = c;
				if (c == last + 1)
					fec = 14, last = c;
			}

			*code++ = (unsigned char)((fe << 4) | fec);

			// note that we need to update the last index since free indices are delta-encoded
			if (fec == 15)
				encodeIndex(data, c, last), last = c;

			// we only need to push third vertex since first two are likely already in the vertex fifo
			if (fec == 0 || fec >= fecmax)
				pushVertexFifo(vertexfifo, c, vertexfifooffset);

			// we only need to push two new edges to edge fifo since the third one is already there
			pushEdgeFifo(edgefifo, c, b, edgefifooffset);
			pushEdgeFifo(edgefifo, a, c, edgefifooffset);
		}
		else
		{
			int rotation = rotateTriangle(indices[i + 0], indices[i + 1], indices[i + 2], next);
			const unsigned int* order = kTriangleIndexOrder[rotation];

			unsigned int a = indices[i + order[0]], b = indices[i + order[1]], c = indices[i + order[2]];

			// if a/b/c are 0/1/2, we emit a reset code
			bool reset = false;

			if (a == 0 && b == 1 && c == 2 && next > 0 && version >= 1)
			{
				reset = true;
				next = 0;

				// reset vertex fifo to make sure we don't accidentally reference vertices from that in the future
				// this makes sure next continues to get incremented instead of being stuck
				memset(vertexfifo, -1, sizeof(vertexfifo));
			}

			int fb = getVertexFifo(vertexfifo, b, vertexfifooffset);
			int fc = getVertexFifo(vertexfifo, c, vertexfifooffset);

			// after rotation, a is almost always equal to next, so we don't waste bits on FIFO encoding for a
			int fea = (a == next) ? (next++, 0) : 15;
			int feb = (fb >= 0 && fb < 14) ? (fb + 1) : (b == next) ? (next++, 0) : 15;
			int fec = (fc >= 0 && fc < 14) ? (fc + 1) : (c == next) ? (next++, 0) : 15;

			// we encode feb & fec in 4 bits using a table if possible, and as a full byte otherwise
			unsigned char codeaux = (unsigned char)((feb << 4) | fec);

			int codeauxindex = getCodeAuxIndex(codeaux, codeaux_table);

			// <14 encodes an index into codeaux table, 14 encodes fea=0, 15 encodes fea=15
			if (fea == 0 && codeauxindex >= 0 && codeauxindex < 14 && !reset)
			{
				*code++ = (unsigned char)((15 << 4) | codeauxindex);
			}
			else
			{
				*code++ = (unsigned char)((15 << 4) | 14 | fea);
				*data++ = codeaux;
			}

			// note that we need to update the last index since free indices are delta-encoded
			if (fea == 15)
				encodeIndex(data, a, last), last = a;

			if (feb == 15)
				encodeIndex(data, b, last), last = b;

			if (fec == 15)
				encodeIndex(data, c, last), last = c;

			// only push vertices that weren't already in fifo
			if (fea == 0 || fea == 15)
				pushVertexFifo(vertexfifo, a, vertexfifooffset);

			if (feb == 0 || feb == 15)
				pushVertexFifo(vertexfifo, b, vertexfifooffset);

			if (fec == 0 || fec == 15)
				pushVertexFifo(vertexfifo, c, vertexfifooffset);

			// all three edges aren't in the fifo; pushing all of them is important so that we can match them for later triangles
			pushEdgeFifo(edgefifo, b, a, edgefifooffset);
			pushEdgeFifo(edgefifo, c, b, edgefifooffset);
			pushEdgeFifo(edgefifo, a, c, edgefifooffset);
		}
	}

	// make sure we have enough space to write codeaux table
	if (data > data_safe_end)
		return 0;

	// add codeaux encoding table to the end of the stream; this is used for decoding codeaux *and* as padding
	// we need padding for decoding to be able to assume that each triangle is encoded as <= 16 bytes of extra data
	// this is enough space for aux byte + 5 bytes per varint index which is the absolute worst case for any input
	for (size_t i = 0; i < 16; ++i)
	{
		// decoder assumes that table entries never refer to separately encoded indices
		assert((codeaux_table[i] & 0xf) != 0xf && (codeaux_table[i] >> 4) != 0xf);

		*data++ = codeaux_table[i];
	}

	// since we encode restarts as codeaux without a table reference, we need to make sure 00 is encoded as a table reference
	assert(codeaux_table[0] == 0);

	assert(data >= buffer + index_count / 3 + 16);
	assert(data <= buffer + buffer_size);

	return data - buffer;
}
// Conservative upper bound, in bytes, for meshopt_encodeIndexBuffer output
// given a triangle list of index_count indices over vertex_count vertices.
size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count)
{
	assert(index_count % 3 == 0);

	// smallest bit width that can represent any index in [0, vertex_count)
	unsigned int index_bits = 1;
	while (index_bits < 32 && (size_t(1) << index_bits) < vertex_count)
		index_bits++;

	// a delta-encoded index takes one extra sign bit, packed into varint-7 groups
	unsigned int groups_per_index = (index_bits + 1 + 6) / 7;

	// worst case per triangle: 2 header bytes plus 3 fully-encoded indices;
	// +1 for the stream header and +16 for the trailing codeaux table
	return 1 + (index_count / 3) * (2 + 3 * groups_per_index) + 16;
}
// Selects the codec version used by subsequent meshopt_encodeIndexBuffer /
// meshopt_encodeIndexSequence calls; only versions 0 and 1 exist (decoders
// reject anything above 1).
void meshopt_encodeIndexVersion(int version)
{
	assert(unsigned(version) <= 1);

	meshopt::gEncodeIndexVersion = version;
}
// Decodes a buffer produced by meshopt_encodeIndexBuffer.
// destination receives index_count indices of index_size (2 or 4) bytes each.
// Returns 0 on success; -1 for a bad header/version, -2 when the buffer is too
// small to be valid, -3 when the data section has the wrong length.
int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size)
{
	using namespace meshopt;

	assert(index_count % 3 == 0);
	assert(index_size == 2 || index_size == 4);

	// the minimum valid encoding is header, 1 byte per triangle and a 16-byte codeaux table
	if (buffer_size < 1 + index_count / 3 + 16)
		return -2;

	if ((buffer[0] & 0xf0) != kIndexHeader)
		return -1;

	int version = buffer[0] & 0x0f;
	if (version > 1)
		return -1;

	// decoder state mirrors the encoder: fifos of recent edges/vertices plus a
	// running "new vertex" counter and the last explicitly-encoded index
	EdgeFifo edgefifo;
	memset(edgefifo, -1, sizeof(edgefifo));

	VertexFifo vertexfifo;
	memset(vertexfifo, -1, sizeof(vertexfifo));

	size_t edgefifooffset = 0;
	size_t vertexfifooffset = 0;

	unsigned int next = 0;
	unsigned int last = 0;

	// in version 1, fec values 13/14 encode "last index -1/+1" instead of fifo entries
	int fecmax = version >= 1 ? 13 : 15;

	// since we store 16-byte codeaux table at the end, triangle data has to begin before data_safe_end
	const unsigned char* code = buffer + 1;
	const unsigned char* data = code + index_count / 3;
	const unsigned char* data_safe_end = buffer + buffer_size - 16;

	const unsigned char* codeaux_table = data_safe_end;

	for (size_t i = 0; i < index_count; i += 3)
	{
		// make sure we have enough data to read for a triangle
		// each triangle reads at most 16 bytes of data: 1b for codeaux and 5b for each free index
		// after this we can be sure we can read without extra bounds checks
		if (data > data_safe_end)
			return -2;

		unsigned char codetri = *code++;

		// codes < 0xf0: this triangle shares an edge with a recent triangle
		if (codetri < 0xf0)
		{
			int fe = codetri >> 4;

			// fifo reads are wrapped around 16 entry buffer
			unsigned int a = edgefifo[(edgefifooffset - 1 - fe) & 15][0];
			unsigned int b = edgefifo[(edgefifooffset - 1 - fe) & 15][1];

			int fec = codetri & 15;

			// note: this is the most common path in the entire decoder
			// inside this if we try to stay branchless (by using cmov/etc.) since these aren't predictable
			if (fec < fecmax)
			{
				// fifo reads are wrapped around 16 entry buffer
				unsigned int cf = vertexfifo[(vertexfifooffset - 1 - fec) & 15];
				unsigned int c = (fec == 0) ? next : cf;

				int fec0 = fec == 0;
				next += fec0;

				// output triangle
				writeTriangle(destination, i, index_size, a, b, c);

				// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
				pushVertexFifo(vertexfifo, c, vertexfifooffset, fec0);

				pushEdgeFifo(edgefifo, c, b, edgefifooffset);
				pushEdgeFifo(edgefifo, a, c, edgefifooffset);
			}
			else
			{
				unsigned int c = 0;

				// fec - (fec ^ 3) decodes 13, 14 into -1, 1
				// note that we need to update the last index since free indices are delta-encoded
				last = c = (fec != 15) ? last + (fec - (fec ^ 3)) : decodeIndex(data, last);

				// output triangle
				writeTriangle(destination, i, index_size, a, b, c);

				// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
				pushVertexFifo(vertexfifo, c, vertexfifooffset);

				pushEdgeFifo(edgefifo, c, b, edgefifooffset);
				pushEdgeFifo(edgefifo, a, c, edgefifooffset);
			}
		}
		else
		{
			// codes >= 0xf0: no recent edge is reused, all three vertices are encoded
			// fast path: read codeaux from the table
			if (codetri < 0xfe)
			{
				unsigned char codeaux = codeaux_table[codetri & 15];

				// note: table can't contain feb/fec=15
				int feb = codeaux >> 4;
				int fec = codeaux & 15;

				// fifo reads are wrapped around 16 entry buffer
				// also note that we increment next for all three vertices before decoding indices - this matches encoder behavior
				unsigned int a = next++;

				unsigned int bf = vertexfifo[(vertexfifooffset - feb) & 15];
				unsigned int b = (feb == 0) ? next : bf;

				int feb0 = feb == 0;
				next += feb0;

				unsigned int cf = vertexfifo[(vertexfifooffset - fec) & 15];
				unsigned int c = (fec == 0) ? next : cf;

				int fec0 = fec == 0;
				next += fec0;

				// output triangle
				writeTriangle(destination, i, index_size, a, b, c);

				// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
				pushVertexFifo(vertexfifo, a, vertexfifooffset);
				pushVertexFifo(vertexfifo, b, vertexfifooffset, feb0);
				pushVertexFifo(vertexfifo, c, vertexfifooffset, fec0);

				pushEdgeFifo(edgefifo, b, a, edgefifooffset);
				pushEdgeFifo(edgefifo, c, b, edgefifooffset);
				pushEdgeFifo(edgefifo, a, c, edgefifooffset);
			}
			else
			{
				// slow path: read a full byte for codeaux instead of using a table lookup
				unsigned char codeaux = *data++;

				int fea = codetri == 0xfe ? 0 : 15;
				int feb = codeaux >> 4;
				int fec = codeaux & 15;

				// reset: codeaux is 0 but encoded as not-a-table
				if (codeaux == 0)
					next = 0;

				// fifo reads are wrapped around 16 entry buffer
				// also note that we increment next for all three vertices before decoding indices - this matches encoder behavior
				unsigned int a = (fea == 0) ? next++ : 0;
				unsigned int b = (feb == 0) ? next++ : vertexfifo[(vertexfifooffset - feb) & 15];
				unsigned int c = (fec == 0) ? next++ : vertexfifo[(vertexfifooffset - fec) & 15];

				// note that we need to update the last index since free indices are delta-encoded
				if (fea == 15)
					last = a = decodeIndex(data, last);

				if (feb == 15)
					last = b = decodeIndex(data, last);

				if (fec == 15)
					last = c = decodeIndex(data, last);

				// output triangle
				writeTriangle(destination, i, index_size, a, b, c);

				// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
				pushVertexFifo(vertexfifo, a, vertexfifooffset);
				pushVertexFifo(vertexfifo, b, vertexfifooffset, (feb == 0) | (feb == 15));
				pushVertexFifo(vertexfifo, c, vertexfifooffset, (fec == 0) | (fec == 15));

				pushEdgeFifo(edgefifo, b, a, edgefifooffset);
				pushEdgeFifo(edgefifo, c, b, edgefifooffset);
				pushEdgeFifo(edgefifo, a, c, edgefifooffset);
			}
		}
	}

	// we should've read all data bytes and stopped at the boundary between data and codeaux table
	if (data != data_safe_end)
		return -3;

	return 0;
}
// Encodes an arbitrary index sequence (no triangle structure assumed) as
// zigzag/varint deltas against one of two rotating baselines.
// Returns the number of bytes written, or 0 if buffer_size is insufficient.
size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count)
{
	using namespace meshopt;

	// the minimum valid encoding is header, 1 byte per index and a 4-byte tail
	if (buffer_size < 1 + index_count + 4)
		return 0;

	int version = gEncodeIndexVersion;

	buffer[0] = (unsigned char)(kSequenceHeader | version);

	// two delta baselines; `current` selects which one the next delta uses
	unsigned int last[2] = {};
	unsigned int current = 0;

	unsigned char* data = buffer + 1;
	unsigned char* data_safe_end = buffer + buffer_size - 4;

	for (size_t i = 0; i < index_count; ++i)
	{
		// make sure we have enough data to write
		// each index writes at most 5 bytes of data; there's a 4 byte tail after data_safe_end
		// after this we can be sure we can write without extra bounds checks
		if (data >= data_safe_end)
			return 0;

		unsigned int index = indices[i];

		// this is a heuristic that switches between baselines when the delta grows too large
		// we want the encoded delta to fit into one byte (7 bits), but 2 bits are used for sign and baseline index
		// for now we immediately switch the baseline when delta grows too large - this can be adjusted arbitrarily
		int cd = int(index - last[current]);
		current ^= ((cd < 0 ? -cd : cd) >= 30);

		// encode delta from the last index
		unsigned int d = index - last[current];
		unsigned int v = (d << 1) ^ (int(d) >> 31); // zigzag: small |delta| => small v

		// note: low bit encodes the index of the last baseline which will be used for reconstruction
		encodeVByte(data, (v << 1) | current);

		// update last for the next iteration that uses it
		last[current] = index;
	}

	// make sure we have enough space to write tail
	if (data > data_safe_end)
		return 0;

	// 4-byte zero tail lets the decoder read each index without bounds checks
	for (int k = 0; k < 4; ++k)
		*data++ = 0;

	return data - buffer;
}
// Conservative upper bound, in bytes, for meshopt_encodeIndexSequence output.
size_t meshopt_encodeIndexSequenceBound(size_t index_count, size_t vertex_count)
{
	// smallest bit width that can represent any index in [0, vertex_count)
	unsigned int index_bits = 1;
	while (index_bits < 32 && (size_t(1) << index_bits) < vertex_count)
		index_bits++;

	// each index is stored as a varint-7 value carrying the zigzagged delta
	// plus one sign bit and one baseline-select bit
	unsigned int groups_per_index = (index_bits + 1 + 1 + 6) / 7;

	// +1 for the stream header, +4 for the zero tail
	return 1 + index_count * groups_per_index + 4;
}
// Decodes a buffer produced by meshopt_encodeIndexSequence into index_count
// indices of index_size (2 or 4) bytes each.
// Returns 0 on success; -1 for a bad header/version, -2 when the buffer is too
// small, -3 when the data section has the wrong length.
int meshopt_decodeIndexSequence(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size)
{
	using namespace meshopt;

	// the minimum valid encoding is header, 1 byte per index and a 4-byte tail
	if (buffer_size < 1 + index_count + 4)
		return -2;

	if ((buffer[0] & 0xf0) != kSequenceHeader)
		return -1;

	int version = buffer[0] & 0x0f;
	if (version > 1)
		return -1;

	const unsigned char* data = buffer + 1;
	const unsigned char* data_safe_end = buffer + buffer_size - 4;

	// two delta baselines; must mirror the encoder's state exactly
	unsigned int last[2] = {};

	for (size_t i = 0; i < index_count; ++i)
	{
		// make sure we have enough data to read
		// each index reads at most 5 bytes of data; there's a 4 byte tail after data_safe_end
		// after this we can be sure we can read without extra bounds checks
		if (data >= data_safe_end)
			return -2;

		unsigned int v = decodeVByte(data);

		// decode the index of the last baseline
		unsigned int current = v & 1;
		v >>= 1;

		// reconstruct index as a delta
		unsigned int d = (v >> 1) ^ -int(v & 1); // inverse zigzag
		unsigned int index = last[current] + d;

		// update last for the next iteration that uses it
		last[current] = index;

		if (index_size == 2)
		{
			static_cast<unsigned short*>(destination)[i] = (unsigned short)(index);
		}
		else
		{
			static_cast<unsigned int*>(destination)[i] = index;
		}
	}

	// we should've read all data bytes and stopped at the boundary between data and tail
	if (data != data_safe_end)
		return -3;

	return 0;
}

View File

@ -0,0 +1,551 @@
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"
#include <assert.h>
#include <string.h>
// This work is based on:
// John McDonald, Mark Kilgard. Crack-Free Point-Normal Triangles using Adjacent Edge Normals. 2010
namespace meshopt
{
// Mixes `len` bytes of `key` into hash state `h` (MurmurHash2 core loop).
// Trailing bytes (len % 4) are ignored; this is acceptable here because the
// hash only buckets candidates and equality is always confirmed with memcmp.
static unsigned int hashUpdate4(unsigned int h, const unsigned char* key, size_t len)
{
	// MurmurHash2
	const unsigned int m = 0x5bd1e995;
	const int r = 24;

	while (len >= 4)
	{
		// load via memcpy instead of reinterpret_cast: key can be unaligned
		// (arbitrary vertex sizes/strides), and a type-punned load of
		// unaligned bytes is undefined behavior; memcpy compiles to the same
		// single load on mainstream compilers and yields the identical value
		unsigned int k;
		memcpy(&k, key, 4);

		k *= m;
		k ^= k >> r;
		k *= m;

		h *= m;
		h ^= k;

		key += 4;
		len -= 4;
	}

	return h;
}
// Hashes/compares vertices by their raw bytes: `vertex_size` bytes located at
// multiples of `vertex_stride`. hash() may ignore trailing len%4 bytes (see
// hashUpdate4); equal() compares all bytes, so lookups are exact.
struct VertexHasher
{
	const unsigned char* vertices;
	size_t vertex_size;   // number of bytes compared per vertex
	size_t vertex_stride; // distance in bytes between consecutive vertices

	size_t hash(unsigned int index) const
	{
		return hashUpdate4(0, vertices + index * vertex_stride, vertex_size);
	}

	bool equal(unsigned int lhs, unsigned int rhs) const
	{
		return memcmp(vertices + lhs * vertex_stride, vertices + rhs * vertex_stride, vertex_size) == 0;
	}
};
// Hashes/compares vertices that are split across multiple attribute streams:
// a vertex is the concatenation of s.size bytes at stride s.stride from each
// stream. Used by the *Multi entry points below.
struct VertexStreamHasher
{
	const meshopt_Stream* streams;
	size_t stream_count;

	size_t hash(unsigned int index) const
	{
		// chain the per-stream hashes so every attribute contributes
		unsigned int h = 0;

		for (size_t i = 0; i < stream_count; ++i)
		{
			const meshopt_Stream& s = streams[i];
			const unsigned char* data = static_cast<const unsigned char*>(s.data);

			h = hashUpdate4(h, data + index * s.stride, s.size);
		}

		return h;
	}

	bool equal(unsigned int lhs, unsigned int rhs) const
	{
		// vertices are equal only if their bytes match in every stream
		for (size_t i = 0; i < stream_count; ++i)
		{
			const meshopt_Stream& s = streams[i];
			const unsigned char* data = static_cast<const unsigned char*>(s.data);

			if (memcmp(data + lhs * s.stride, data + rhs * s.stride, s.size) != 0)
				return false;
		}

		return true;
	}
};
// Hashes/compares directed edges packed as (v0 << 32) | v1; both endpoints are
// canonicalized through the position remap table first, so edges between
// coincident (position-identical) vertices compare equal.
struct EdgeHasher
{
	const unsigned int* remap;

	size_t hash(unsigned long long edge) const
	{
		// canonicalize endpoints before mixing
		unsigned int h1 = remap[unsigned(edge >> 32)];
		unsigned int h2 = remap[unsigned(edge)];

		// MurmurHash64B finalizer
		const unsigned int m = 0x5bd1e995;

		h1 ^= h2 >> 18;
		h1 *= m;
		h2 ^= h1 >> 22;
		h2 *= m;
		h1 ^= h2 >> 17;
		h1 *= m;
		h2 ^= h1 >> 19;
		h2 *= m;

		return h2;
	}

	bool equal(unsigned long long lhs, unsigned long long rhs) const
	{
		// compare canonical endpoints pairwise, start vertex first
		if (remap[unsigned(lhs >> 32)] != remap[unsigned(rhs >> 32)])
			return false;

		return remap[unsigned(lhs)] == remap[unsigned(rhs)];
	}
};
// Returns a power-of-two bucket count sized for `count` entries with ~25%
// headroom (load factor <= 0.8) so open-addressing probe chains stay short.
static size_t hashBuckets(size_t count)
{
	const size_t target = count + count / 4;

	size_t result = 1;
	while (result < target)
		result += result;

	return result;
}
// Finds the slot for `key` in a power-of-two sized open-addressing table.
// Returns a pointer to either the matching entry or the first `empty` slot on
// key's probe chain; the caller distinguishes the two by inspecting the slot.
// Only returns null (after asserting) if the table is completely full.
template <typename T, typename Hash>
static T* hashLookup(T* table, size_t buckets, const Hash& hash, const T& key, const T& empty)
{
	assert(buckets > 0);
	assert((buckets & (buckets - 1)) == 0);

	const size_t mask = buckets - 1;
	size_t slot = hash.hash(key) & mask;

	// triangular-number probing visits every slot of a power-of-two table
	for (size_t step = 0; step <= mask; ++step)
	{
		T& candidate = table[slot];

		// an empty slot terminates the chain; otherwise check for a match
		if (candidate == empty || hash.equal(candidate, key))
			return &candidate;

		slot = (slot + step + 1) & mask;
	}

	assert(false && "Hash table is full"); // unreachable
	return 0;
}
static void buildPositionRemap(unsigned int* remap, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, meshopt_Allocator& allocator)
{
VertexHasher vertex_hasher = {reinterpret_cast<const unsigned char*>(vertex_positions), 3 * sizeof(float), vertex_positions_stride};
size_t vertex_table_size = hashBuckets(vertex_count);
unsigned int* vertex_table = allocator.allocate<unsigned int>(vertex_table_size);
memset(vertex_table, -1, vertex_table_size * sizeof(unsigned int));
for (size_t i = 0; i < vertex_count; ++i)
{
unsigned int index = unsigned(i);
unsigned int* entry = hashLookup(vertex_table, vertex_table_size, vertex_hasher, index, ~0u);
if (*entry == ~0u)
*entry = index;
remap[index] = *entry;
}
}
} // namespace meshopt
// Builds a remap table that folds byte-identical vertices together:
// destination[i] receives the compacted index for vertex i, or ~0u if vertex i
// is never referenced. Returns the number of unique vertices.
// indices can be NULL, in which case vertices are taken in order (identity).
size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
{
	using namespace meshopt;

	assert(indices || index_count == vertex_count);
	assert(index_count % 3 == 0);
	assert(vertex_size > 0 && vertex_size <= 256);

	meshopt_Allocator allocator;

	// ~0u marks vertices that haven't been assigned a compacted index yet
	memset(destination, -1, vertex_count * sizeof(unsigned int));

	// stride == size: vertex data is treated as tightly packed here
	VertexHasher hasher = {static_cast<const unsigned char*>(vertices), vertex_size, vertex_size};

	size_t table_size = hashBuckets(vertex_count);
	unsigned int* table = allocator.allocate<unsigned int>(table_size);
	memset(table, -1, table_size * sizeof(unsigned int));

	unsigned int next_vertex = 0;

	for (size_t i = 0; i < index_count; ++i)
	{
		unsigned int index = indices ? indices[i] : unsigned(i);
		assert(index < vertex_count);

		if (destination[index] == ~0u)
		{
			unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);

			if (*entry == ~0u)
			{
				// first vertex with this data: allocate the next compacted index
				*entry = index;

				destination[index] = next_vertex++;
			}
			else
			{
				// duplicate data: share the representative's compacted index
				assert(destination[*entry] != ~0u);

				destination[index] = destination[*entry];
			}
		}
	}

	assert(next_vertex <= vertex_count);

	return next_vertex;
}
// Multi-stream variant of meshopt_generateVertexRemap: vertices are folded
// together only when their bytes match in every attribute stream.
// destination[i] receives the compacted index for vertex i, or ~0u if it is
// never referenced. Returns the number of unique vertices.
size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count)
{
	using namespace meshopt;

	assert(indices || index_count == vertex_count);
	assert(index_count % 3 == 0);
	assert(stream_count > 0 && stream_count <= 16);

	for (size_t i = 0; i < stream_count; ++i)
	{
		assert(streams[i].size > 0 && streams[i].size <= 256);
		assert(streams[i].size <= streams[i].stride);
	}

	meshopt_Allocator allocator;

	// ~0u marks vertices that haven't been assigned a compacted index yet
	memset(destination, -1, vertex_count * sizeof(unsigned int));

	VertexStreamHasher hasher = {streams, stream_count};

	size_t table_size = hashBuckets(vertex_count);
	unsigned int* table = allocator.allocate<unsigned int>(table_size);
	memset(table, -1, table_size * sizeof(unsigned int));

	unsigned int next_vertex = 0;

	for (size_t i = 0; i < index_count; ++i)
	{
		unsigned int index = indices ? indices[i] : unsigned(i);
		assert(index < vertex_count);

		if (destination[index] == ~0u)
		{
			unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);

			if (*entry == ~0u)
			{
				// first vertex with this data: allocate the next compacted index
				*entry = index;

				destination[index] = next_vertex++;
			}
			else
			{
				// duplicate data: share the representative's compacted index
				assert(destination[*entry] != ~0u);

				destination[index] = destination[*entry];
			}
		}
	}

	assert(next_vertex <= vertex_count);

	return next_vertex;
}
// Scatters vertex data into remapped slots: vertex i is copied to slot
// remap[i]. Entries with remap[i] == ~0u are unused and skipped. Supports
// destination == vertices (in-place) by snapshotting the source first.
void meshopt_remapVertexBuffer(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap)
{
	assert(vertex_size > 0 && vertex_size <= 256);

	meshopt_Allocator allocator;

	// an in-place remap would overwrite vertices before they are read, so
	// work from a temporary copy of the source data
	if (destination == vertices)
	{
		unsigned char* copy = allocator.allocate<unsigned char>(vertex_count * vertex_size);
		memcpy(copy, vertices, vertex_count * vertex_size);
		vertices = copy;
	}

	unsigned char* dst = static_cast<unsigned char*>(destination);
	const unsigned char* src = static_cast<const unsigned char*>(vertices);

	for (size_t i = 0; i < vertex_count; ++i)
	{
		unsigned int target = remap[i];

		if (target == ~0u)
			continue;

		assert(target < vertex_count);
		memcpy(dst + target * vertex_size, src + i * vertex_size, vertex_size);
	}
}
// Rewrites an index buffer through `remap` (as produced by
// meshopt_generateVertexRemap): destination[i] = remap[indices[i]].
// indices can be NULL, in which case the identity sequence 0..index_count-1
// is used as input.
void meshopt_remapIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const unsigned int* remap)
{
	assert(index_count % 3 == 0);

	if (indices)
	{
		for (size_t i = 0; i < index_count; ++i)
		{
			// every referenced vertex must have been assigned a new index
			assert(remap[indices[i]] != ~0u);

			destination[i] = remap[indices[i]];
		}
	}
	else
	{
		for (size_t i = 0; i < index_count; ++i)
		{
			assert(remap[i] != ~0u);

			destination[i] = remap[i];
		}
	}
}
// Generates an index buffer in which indices referencing vertices with
// identical data (the first vertex_size bytes of each vertex_stride-sized
// element) are redirected to one representative vertex; the vertex buffer
// itself is left untouched.
void meshopt_generateShadowIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size, size_t vertex_stride)
{
	using namespace meshopt;

	assert(indices);
	assert(index_count % 3 == 0);
	assert(vertex_size > 0 && vertex_size <= 256);
	assert(vertex_size <= vertex_stride);

	meshopt_Allocator allocator;

	// remap[v] is the representative index for vertex v (~0u = not seen yet)
	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
	memset(remap, -1, vertex_count * sizeof(unsigned int));

	VertexHasher hasher = {static_cast<const unsigned char*>(vertices), vertex_size, vertex_stride};

	size_t table_size = hashBuckets(vertex_count);
	unsigned int* table = allocator.allocate<unsigned int>(table_size);
	memset(table, -1, table_size * sizeof(unsigned int));

	for (size_t i = 0; i < index_count; ++i)
	{
		unsigned int index = indices[i];
		assert(index < vertex_count);

		if (remap[index] == ~0u)
		{
			unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);

			// the first vertex with this data becomes the representative
			if (*entry == ~0u)
				*entry = index;

			remap[index] = *entry;
		}

		destination[i] = remap[index];
	}
}
// Multi-stream variant of meshopt_generateShadowIndexBuffer: indices are
// redirected to a representative vertex only when the vertex bytes match in
// every attribute stream. The vertex data itself is left untouched.
void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count)
{
	using namespace meshopt;

	assert(indices);
	assert(index_count % 3 == 0);
	assert(stream_count > 0 && stream_count <= 16);

	for (size_t i = 0; i < stream_count; ++i)
	{
		assert(streams[i].size > 0 && streams[i].size <= 256);
		assert(streams[i].size <= streams[i].stride);
	}

	meshopt_Allocator allocator;

	// remap[v] is the representative index for vertex v (~0u = not seen yet)
	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
	memset(remap, -1, vertex_count * sizeof(unsigned int));

	VertexStreamHasher hasher = {streams, stream_count};

	size_t table_size = hashBuckets(vertex_count);
	unsigned int* table = allocator.allocate<unsigned int>(table_size);
	memset(table, -1, table_size * sizeof(unsigned int));

	for (size_t i = 0; i < index_count; ++i)
	{
		unsigned int index = indices[i];
		assert(index < vertex_count);

		if (remap[index] == ~0u)
		{
			unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);

			// the first vertex with this data becomes the representative
			if (*entry == ~0u)
				*entry = index;

			remap[index] = *entry;
		}

		destination[i] = remap[index];
	}
}
// Builds an adjacency index buffer: 6 indices per input triangle, alternating
// (corner, opposite vertex) where "opposite vertex" is the third vertex of the
// triangle that shares the following edge. For border edges with no neighbor,
// the corner itself is repeated. Adjacency is determined by vertex positions
// (coincident vertices are merged through a position remap), not raw indices.
void meshopt_generateAdjacencyIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
	using namespace meshopt;

	assert(index_count % 3 == 0);
	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
	assert(vertex_positions_stride % sizeof(float) == 0);

	meshopt_Allocator allocator;

	// the extra trailing element allows next[e + 1] without a modulo
	static const int next[4] = {1, 2, 0, 1};

	// build position remap: for each vertex, which other (canonical) vertex does it map to?
	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
	buildPositionRemap(remap, vertex_positions, vertex_count, vertex_positions_stride, allocator);

	// build edge set; this stores all triangle edges but we can look these up by any other wedge
	EdgeHasher edge_hasher = {remap};

	size_t edge_table_size = hashBuckets(index_count);
	unsigned long long* edge_table = allocator.allocate<unsigned long long>(edge_table_size);
	unsigned int* edge_vertex_table = allocator.allocate<unsigned int>(edge_table_size);

	memset(edge_table, -1, edge_table_size * sizeof(unsigned long long));
	memset(edge_vertex_table, -1, edge_table_size * sizeof(unsigned int));

	for (size_t i = 0; i < index_count; i += 3)
	{
		for (int e = 0; e < 3; ++e)
		{
			unsigned int i0 = indices[i + e];
			unsigned int i1 = indices[i + next[e]];
			unsigned int i2 = indices[i + next[e + 1]];
			assert(i0 < vertex_count && i1 < vertex_count && i2 < vertex_count);

			unsigned long long edge = ((unsigned long long)i0 << 32) | i1;
			unsigned long long* entry = hashLookup(edge_table, edge_table_size, edge_hasher, edge, ~0ull);

			if (*entry == ~0ull)
			{
				*entry = edge;

				// store vertex opposite to the edge
				edge_vertex_table[entry - edge_table] = i2;
			}
		}
	}

	// build resulting index buffer: 6 indices for each input triangle
	for (size_t i = 0; i < index_count; i += 3)
	{
		unsigned int patch[6];

		for (int e = 0; e < 3; ++e)
		{
			unsigned int i0 = indices[i + e];
			unsigned int i1 = indices[i + next[e]];
			assert(i0 < vertex_count && i1 < vertex_count);

			// note: this refers to the opposite edge!
			unsigned long long edge = ((unsigned long long)i1 << 32) | i0;
			unsigned long long* oppe = hashLookup(edge_table, edge_table_size, edge_hasher, edge, ~0ull);

			// fall back to the corner itself when no opposite triangle exists
			patch[e * 2 + 0] = i0;
			patch[e * 2 + 1] = (*oppe == ~0ull) ? i0 : edge_vertex_table[oppe - edge_table];
		}

		memcpy(destination + i * 2, patch, sizeof(patch));
	}
}
// Builds a 12-indices-per-triangle patch buffer for crack-free tessellation
// (per the Crack-Free Point-Normal Triangles reference above): [0..2] triangle
// corners, [3..8] the matching edge in the adjacent triangle for each edge
// (the edge itself at borders), [9..11] the position-canonical (dominant)
// vertex for each corner.
void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
	using namespace meshopt;

	assert(index_count % 3 == 0);
	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
	assert(vertex_positions_stride % sizeof(float) == 0);

	meshopt_Allocator allocator;

	static const int next[3] = {1, 2, 0};

	// build position remap: for each vertex, which other (canonical) vertex does it map to?
	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
	buildPositionRemap(remap, vertex_positions, vertex_count, vertex_positions_stride, allocator);

	// build edge set; this stores all triangle edges but we can look these up by any other wedge
	EdgeHasher edge_hasher = {remap};

	size_t edge_table_size = hashBuckets(index_count);
	unsigned long long* edge_table = allocator.allocate<unsigned long long>(edge_table_size);
	memset(edge_table, -1, edge_table_size * sizeof(unsigned long long));

	for (size_t i = 0; i < index_count; i += 3)
	{
		for (int e = 0; e < 3; ++e)
		{
			unsigned int i0 = indices[i + e];
			unsigned int i1 = indices[i + next[e]];
			assert(i0 < vertex_count && i1 < vertex_count);

			unsigned long long edge = ((unsigned long long)i0 << 32) | i1;
			unsigned long long* entry = hashLookup(edge_table, edge_table_size, edge_hasher, edge, ~0ull);

			// only the first triangle claiming an edge is stored
			if (*entry == ~0ull)
				*entry = edge;
		}
	}

	// build resulting index buffer: 12 indices for each input triangle
	for (size_t i = 0; i < index_count; i += 3)
	{
		unsigned int patch[12];

		for (int e = 0; e < 3; ++e)
		{
			unsigned int i0 = indices[i + e];
			unsigned int i1 = indices[i + next[e]];
			assert(i0 < vertex_count && i1 < vertex_count);

			// note: this refers to the opposite edge!
			unsigned long long edge = ((unsigned long long)i1 << 32) | i0;
			unsigned long long oppe = *hashLookup(edge_table, edge_table_size, edge_hasher, edge, ~0ull);

			// use the same edge if opposite edge doesn't exist (border)
			oppe = (oppe == ~0ull) ? edge : oppe;

			// triangle index (0, 1, 2)
			patch[e] = i0;

			// opposite edge (3, 4; 5, 6; 7, 8)
			patch[3 + e * 2 + 0] = unsigned(oppe);
			patch[3 + e * 2 + 1] = unsigned(oppe >> 32);

			// dominant vertex (9, 10, 11)
			patch[9 + e] = remap[i0];
		}

		memcpy(destination + i * 4, patch, sizeof(patch));
	}
}

1029
thirdparty/meshoptimizer/meshoptimizer.h vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,230 @@
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"
#include <assert.h>
#include <float.h>
#include <string.h>
// This work is based on:
// Nicolas Capens. Advanced Rasterization. 2004
namespace meshopt
{
// All rasterization happens in a fixed 256x256 viewport.
const int kViewport = 256;

// Per-pixel depth and overdraw counters; the trailing [2] dimension keeps two
// layers, selected by triangle facing (see rasterize below).
struct OverdrawBuffer
{
	float z[kViewport][kViewport][2];
	unsigned int overdraw[kViewport][kViewport][2];
};

// guard against a platform header having already defined min/max as macros
#ifndef min
#define min(a, b) ((a) < (b) ? (a) : (b))
#endif

#ifndef max
#define max(a, b) ((a) > (b) ? (a) : (b))
#endif
// Computes the screen-space depth gradients (dz/dx, dz/dy) of the plane
// through the three vertices by solving the 2x2 system below with Cramer's
// rule. Returns the determinant (twice the signed triangle area); when it is
// zero the triangle is degenerate and both gradients are set to 0.
static float computeDepthGradients(float& dzdx, float& dzdy, float x1, float y1, float z1, float x2, float y2, float z2, float x3, float y3, float z3)
{
	// z2 = z1 + dzdx * (x2 - x1) + dzdy * (y2 - y1)
	// z3 = z1 + dzdx * (x3 - x1) + dzdy * (y3 - y1)
	// (x2-x1 y2-y1)(dzdx) = (z2-z1)
	// (x3-x1 y3-y1)(dzdy)   (z3-z1)
	// we'll solve it with Cramer's rule
	float det = (x2 - x1) * (y3 - y1) - (y2 - y1) * (x3 - x1);
	float invdet = (det == 0) ? 0 : 1 / det;

	// the whole Cramer numerator must be divided by det; previously only the
	// second product was scaled by invdet (operator precedence), producing
	// wrong gradients whenever det != 1
	dzdx = ((z2 - z1) * (y3 - y1) - (y2 - y1) * (z3 - z1)) * invdet;
	dzdy = ((x2 - x1) * (z3 - z1) - (z2 - z1) * (x3 - x1)) * invdet;

	return det;
}
// half-space fixed point triangle rasterizer
// Rasterizes one viewport-space triangle into `buffer` using a
// greater-or-equal depth test, bumping the overdraw counter on every pass.
// Triangle facing (the sign of its determinant) selects which of the two
// buffer layers is written.
static void rasterize(OverdrawBuffer* buffer, float v1x, float v1y, float v1z, float v2x, float v2y, float v2z, float v3x, float v3y, float v3z)
{
	// compute depth gradients
	float DZx, DZy;
	float det = computeDepthGradients(DZx, DZy, v1x, v1y, v1z, v2x, v2y, v2z, v3x, v3y, v3z);
	int sign = det > 0;

	// flip backfacing triangles to simplify rasterization logic
	if (sign)
	{
		// flipping v2 & v3 preserves depth gradients since they're based on v1
		float t;
		t = v2x, v2x = v3x, v3x = t;
		t = v2y, v2y = v3y, v3y = t;
		t = v2z, v2z = v3z, v3z = t;

		// flip depth since we rasterize backfacing triangles to second buffer with reverse Z; only v1z is used below
		v1z = kViewport - v1z;
		DZx = -DZx;
		DZy = -DZy;
	}

	// coordinates, 28.4 fixed point
	int X1 = int(16.0f * v1x + 0.5f);
	int X2 = int(16.0f * v2x + 0.5f);
	int X3 = int(16.0f * v3x + 0.5f);

	int Y1 = int(16.0f * v1y + 0.5f);
	int Y2 = int(16.0f * v2y + 0.5f);
	int Y3 = int(16.0f * v3y + 0.5f);

	// bounding rectangle, clipped against viewport
	// since we rasterize pixels with covered centers, min >0.5 should round up
	// as for max, due to top-left filling convention we will never rasterize right/bottom edges
	// so max >= 0.5 should round down
	int minx = max((min(X1, min(X2, X3)) + 7) >> 4, 0);
	int maxx = min((max(X1, max(X2, X3)) + 7) >> 4, kViewport);
	int miny = max((min(Y1, min(Y2, Y3)) + 7) >> 4, 0);
	int maxy = min((max(Y1, max(Y2, Y3)) + 7) >> 4, kViewport);

	// deltas, 28.4 fixed point
	int DX12 = X1 - X2;
	int DX23 = X2 - X3;
	int DX31 = X3 - X1;

	int DY12 = Y1 - Y2;
	int DY23 = Y2 - Y3;
	int DY31 = Y3 - Y1;

	// fill convention correction
	int TL1 = DY12 < 0 || (DY12 == 0 && DX12 > 0);
	int TL2 = DY23 < 0 || (DY23 == 0 && DX23 > 0);
	int TL3 = DY31 < 0 || (DY31 == 0 && DX31 > 0);

	// half edge equations, 24.8 fixed point
	// note that we offset minx/miny by half pixel since we want to rasterize pixels with covered centers
	int FX = (minx << 4) + 8;
	int FY = (miny << 4) + 8;
	int CY1 = DX12 * (FY - Y1) - DY12 * (FX - X1) + TL1 - 1;
	int CY2 = DX23 * (FY - Y2) - DY23 * (FX - X2) + TL2 - 1;
	int CY3 = DX31 * (FY - Y3) - DY31 * (FX - X3) + TL3 - 1;
	float ZY = v1z + (DZx * float(FX - X1) + DZy * float(FY - Y1)) * (1 / 16.f);

	for (int y = miny; y < maxy; y++)
	{
		// start-of-row values of the edge functions and interpolated depth
		int CX1 = CY1;
		int CX2 = CY2;
		int CX3 = CY3;
		float ZX = ZY;

		for (int x = minx; x < maxx; x++)
		{
			// check if all CXn are non-negative
			if ((CX1 | CX2 | CX3) >= 0)
			{
				// depth test with a greater-or-equal comparison
				if (ZX >= buffer->z[y][x][sign])
				{
					buffer->z[y][x][sign] = ZX;
					buffer->overdraw[y][x][sign]++;
				}
			}

			// signed left shift is UB for negative numbers so use unsigned-signed casts
			CX1 -= int(unsigned(DY12) << 4);
			CX2 -= int(unsigned(DY23) << 4);
			CX3 -= int(unsigned(DY31) << 4);
			ZX += DZx;
		}

		// signed left shift is UB for negative numbers so use unsigned-signed casts
		CY1 += int(unsigned(DX12) << 4);
		CY2 += int(unsigned(DX23) << 4);
		CY3 += int(unsigned(DX31) << 4);
		ZY += DZy;
	}
}
} // namespace meshopt
// Measures overdraw: the mesh is scaled uniformly so its longest extent spans
// the 256-pixel viewport, then rasterized orthographically along each of the
// 3 principal axes with a greater-or-equal depth test (front/back faces are
// tracked in separate layers).
// result.overdraw = pixels_shaded / pixels_covered, so 1.0 means no overdraw.
meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
	using namespace meshopt;

	assert(index_count % 3 == 0);
	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
	assert(vertex_positions_stride % sizeof(float) == 0);

	meshopt_Allocator allocator;

	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);

	meshopt_OverdrawStatistics result = {};

	// compute the axis-aligned bounds of the mesh
	float minv[3] = {FLT_MAX, FLT_MAX, FLT_MAX};
	float maxv[3] = {-FLT_MAX, -FLT_MAX, -FLT_MAX};

	for (size_t i = 0; i < vertex_count; ++i)
	{
		const float* v = vertex_positions + i * vertex_stride_float;

		for (int j = 0; j < 3; ++j)
		{
			minv[j] = min(minv[j], v[j]);
			maxv[j] = max(maxv[j], v[j]);
		}
	}

	// NOTE(review): extent == 0 (all vertices coincident) would make scale
	// infinite - presumably callers pass non-degenerate meshes; verify
	float extent = max(maxv[0] - minv[0], max(maxv[1] - minv[1], maxv[2] - minv[2]));
	float scale = kViewport / extent;

	// deindex and transform all referenced vertices into viewport space
	float* triangles = allocator.allocate<float>(index_count * 3);

	for (size_t i = 0; i < index_count; ++i)
	{
		unsigned int index = indices[i];
		assert(index < vertex_count);

		const float* v = vertex_positions + index * vertex_stride_float;

		triangles[i * 3 + 0] = (v[0] - minv[0]) * scale;
		triangles[i * 3 + 1] = (v[1] - minv[1]) * scale;
		triangles[i * 3 + 2] = (v[2] - minv[2]) * scale;
	}

	OverdrawBuffer* buffer = allocator.allocate<OverdrawBuffer>(1);

	for (int axis = 0; axis < 3; ++axis)
	{
		memset(buffer, 0, sizeof(OverdrawBuffer));

		for (size_t i = 0; i < index_count; i += 3)
		{
			const float* vn0 = &triangles[3 * (i + 0)];
			const float* vn1 = &triangles[3 * (i + 1)];
			const float* vn2 = &triangles[3 * (i + 2)];

			// swizzle coordinates so the current axis becomes depth
			switch (axis)
			{
			case 0:
				rasterize(buffer, vn0[2], vn0[1], vn0[0], vn1[2], vn1[1], vn1[0], vn2[2], vn2[1], vn2[0]);
				break;
			case 1:
				rasterize(buffer, vn0[0], vn0[2], vn0[1], vn1[0], vn1[2], vn1[1], vn2[0], vn2[2], vn2[1]);
				break;
			case 2:
				rasterize(buffer, vn0[1], vn0[0], vn0[2], vn1[1], vn1[0], vn1[2], vn2[1], vn2[0], vn2[2]);
				break;
			}
		}

		// accumulate covered (written at least once) and shaded (total writes) pixels
		for (int y = 0; y < kViewport; ++y)
			for (int x = 0; x < kViewport; ++x)
				for (int s = 0; s < 2; ++s)
				{
					unsigned int overdraw = buffer->overdraw[y][x][s];

					result.pixels_covered += overdraw > 0;
					result.pixels_shaded += overdraw;
				}
	}

	result.overdraw = result.pixels_covered ? float(result.pixels_shaded) / float(result.pixels_covered) : 0.f;

	return result;
}

View File

@ -0,0 +1,333 @@
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"
#include <assert.h>
#include <math.h>
#include <string.h>
// This work is based on:
// Pedro Sander, Diego Nehab and Joshua Barczak. Fast Triangle Reordering for Vertex Locality and Reduced Overdraw. 2007
namespace meshopt
{
// Computes one sort key per cluster: the projection of (cluster centroid -
// mesh centroid) onto the cluster's average normal. The caller sorts clusters
// by this key to order them for reduced overdraw (per the Sander/Nehab/Barczak
// reference above).
static void calculateSortData(float* sort_data, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_positions_stride, const unsigned int* clusters, size_t cluster_count)
{
	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);

	// mesh centroid: average of all referenced positions (vertices referenced
	// by several triangles are counted once per reference)
	float mesh_centroid[3] = {};

	for (size_t i = 0; i < index_count; ++i)
	{
		const float* p = vertex_positions + vertex_stride_float * indices[i];

		mesh_centroid[0] += p[0];
		mesh_centroid[1] += p[1];
		mesh_centroid[2] += p[2];
	}

	mesh_centroid[0] /= index_count;
	mesh_centroid[1] /= index_count;
	mesh_centroid[2] /= index_count;

	for (size_t cluster = 0; cluster < cluster_count; ++cluster)
	{
		// clusters[] stores triangle offsets: cluster c spans triangles
		// [clusters[c], clusters[c+1]), the last one running to the end
		size_t cluster_begin = clusters[cluster] * 3;
		size_t cluster_end = (cluster + 1 < cluster_count) ? clusters[cluster + 1] * 3 : index_count;
		assert(cluster_begin < cluster_end);

		float cluster_area = 0;
		float cluster_centroid[3] = {};
		float cluster_normal[3] = {};

		for (size_t i = cluster_begin; i < cluster_end; i += 3)
		{
			const float* p0 = vertex_positions + vertex_stride_float * indices[i + 0];
			const float* p1 = vertex_positions + vertex_stride_float * indices[i + 1];
			const float* p2 = vertex_positions + vertex_stride_float * indices[i + 2];

			float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};
			float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};

			// edge cross product; its length is twice the triangle area
			float normalx = p10[1] * p20[2] - p10[2] * p20[1];
			float normaly = p10[2] * p20[0] - p10[0] * p20[2];
			float normalz = p10[0] * p20[1] - p10[1] * p20[0];

			float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz);

			// area-weighted average of triangle centroids
			cluster_centroid[0] += (p0[0] + p1[0] + p2[0]) * (area / 3);
			cluster_centroid[1] += (p0[1] + p1[1] + p2[1]) * (area / 3);
			cluster_centroid[2] += (p0[2] + p1[2] + p2[2]) * (area / 3);
			cluster_normal[0] += normalx;
			cluster_normal[1] += normaly;
			cluster_normal[2] += normalz;
			cluster_area += area;
		}

		float inv_cluster_area = cluster_area == 0 ? 0 : 1 / cluster_area;

		cluster_centroid[0] *= inv_cluster_area;
		cluster_centroid[1] *= inv_cluster_area;
		cluster_centroid[2] *= inv_cluster_area;

		// normalize the accumulated (area-weighted) normal
		float cluster_normal_length = sqrtf(cluster_normal[0] * cluster_normal[0] + cluster_normal[1] * cluster_normal[1] + cluster_normal[2] * cluster_normal[2]);
		float inv_cluster_normal_length = cluster_normal_length == 0 ? 0 : 1 / cluster_normal_length;

		cluster_normal[0] *= inv_cluster_normal_length;
		cluster_normal[1] *= inv_cluster_normal_length;
		cluster_normal[2] *= inv_cluster_normal_length;

		float centroid_vector[3] = {cluster_centroid[0] - mesh_centroid[0], cluster_centroid[1] - mesh_centroid[1], cluster_centroid[2] - mesh_centroid[2]};

		sort_data[cluster] = centroid_vector[0] * cluster_normal[0] + centroid_vector[1] * cluster_normal[1] + centroid_vector[2] * cluster_normal[2];
	}
}
// Fills sort_order with cluster indices ordered by descending sort_data,
// using an 11-bit counting sort on quantized keys.
// sort_keys is caller-provided scratch of cluster_count entries holding the
// quantized keys between the histogram and scatter passes.
static void calculateSortOrderRadix(unsigned int* sort_order, const float* sort_data, unsigned short* sort_keys, size_t cluster_count)
{
	// compute sort data bounds and renormalize, using fixed point snorm
	float sort_data_max = 1e-3f; // lower bound prevents division by zero below
	for (size_t i = 0; i < cluster_count; ++i)
	{
		float dpa = fabsf(sort_data[i]);
		sort_data_max = (sort_data_max < dpa) ? dpa : sort_data_max;
	}
	const int sort_bits = 11;
	for (size_t i = 0; i < cluster_count; ++i)
	{
		// note that we flip distribution since high dot product should come first
		float sort_key = 0.5f - 0.5f * (sort_data[i] / sort_data_max);
		// meshopt_quantizeUnorm (declared in meshoptimizer.h) maps [0, 1] into sort_bits bits
		sort_keys[i] = meshopt_quantizeUnorm(sort_key, sort_bits) & ((1 << sort_bits) - 1);
	}
	// fill histogram for counting sort
	unsigned int histogram[1 << sort_bits];
	memset(histogram, 0, sizeof(histogram));
	for (size_t i = 0; i < cluster_count; ++i)
	{
		histogram[sort_keys[i]]++;
	}
	// compute offsets based on histogram data (exclusive prefix sum)
	size_t histogram_sum = 0;
	for (size_t i = 0; i < 1 << sort_bits; ++i)
	{
		size_t count = histogram[i];
		histogram[i] = unsigned(histogram_sum);
		histogram_sum += count;
	}
	assert(histogram_sum == cluster_count);
	// compute sort order based on offsets; the scatter is stable, so clusters
	// with equal keys keep their original relative order
	for (size_t i = 0; i < cluster_count; ++i)
	{
		sort_order[histogram[sort_keys[i]]++] = unsigned(i);
	}
}
// Simulates a timestamp-based vertex cache: a vertex counts as cached when its
// stored timestamp is within cache_size of the current time. Each miss stores
// the current time for that vertex and advances the shared timestamp.
// Returns the number of cache misses (0..3) incurred by triangle (a, b, c).
static unsigned int updateCache(unsigned int a, unsigned int b, unsigned int c, unsigned int cache_size, unsigned int* cache_timestamps, unsigned int& timestamp)
{
	unsigned int misses = 0;
	const unsigned int corners[3] = {a, b, c};

	// visit corners in order; a repeated vertex only misses once since the
	// first miss refreshes its timestamp
	for (int k = 0; k < 3; ++k)
	{
		unsigned int vertex = corners[k];

		if (timestamp - cache_timestamps[vertex] > cache_size)
		{
			cache_timestamps[vertex] = timestamp++;
			misses++;
		}
	}

	return misses;
}
// Splits the index stream into clusters at "hard" boundaries: triangles whose
// three vertices all miss the simulated vertex cache, which usually marks the
// start of a disconnected patch of the mesh.
// destination receives the first-triangle index of each cluster; returns the
// number of boundaries written (at most index_count / 3).
// cache_timestamps is caller-provided scratch of vertex_count entries.
static size_t generateHardBoundaries(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int* cache_timestamps)
{
	memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));
	// start past cache_size so the zero-initialized timestamps read as "not cached"
	unsigned int timestamp = cache_size + 1;
	size_t face_count = index_count / 3;
	size_t result = 0;
	for (size_t i = 0; i < face_count; ++i)
	{
		unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);
		// when all three vertices are not in the cache it's usually relatively safe to assume that this is a new patch in the mesh
		// that is disjoint from previous vertices; sometimes it might come back to reference existing vertices but that frequently
		// suggests an inefficiency in the vertex cache optimization algorithm
		// usually the first triangle has 3 misses unless it's degenerate - thus we make sure the first cluster always starts with 0
		if (i == 0 || m == 3)
		{
			destination[result++] = unsigned(i);
		}
	}
	assert(result <= index_count / 3);
	return result;
}
// Subdivides each hard cluster into "soft" clusters: within a hard cluster, a
// new boundary is emitted whenever the running cache miss ratio drops to
// threshold times the whole cluster's average miss ratio (ACMR).
// destination receives the first-triangle index of each soft cluster; returns
// the number of boundaries written (between cluster_count and index_count / 3).
// cache_timestamps is caller-provided scratch of vertex_count entries.
static size_t generateSoftBoundaries(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const unsigned int* clusters, size_t cluster_count, unsigned int cache_size, float threshold, unsigned int* cache_timestamps)
{
	memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));
	unsigned int timestamp = 0;
	size_t result = 0;
	for (size_t it = 0; it < cluster_count; ++it)
	{
		// hard cluster span in triangle units
		size_t start = clusters[it];
		size_t end = (it + 1 < cluster_count) ? clusters[it + 1] : index_count / 3;
		assert(start < end);
		// reset cache (advancing the clock past cache_size evicts everything)
		timestamp += cache_size + 1;
		// measure cluster ACMR
		unsigned int cluster_misses = 0;
		for (size_t i = start; i < end; ++i)
		{
			unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);
			cluster_misses += m;
		}
		float cluster_threshold = threshold * (float(cluster_misses) / float(end - start));
		// first cluster always starts from the hard cluster boundary
		destination[result++] = unsigned(start);
		// reset cache
		timestamp += cache_size + 1;
		unsigned int running_misses = 0;
		unsigned int running_faces = 0;
		for (size_t i = start; i < end; ++i)
		{
			unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);
			running_misses += m;
			running_faces += 1;
			if (float(running_misses) / float(running_faces) <= cluster_threshold)
			{
				// we have reached the target ACMR with the current triangle so we need to start a new cluster on the next one
				// note that this may mean that we add 'end` to destination for the last triangle, which will imply that the last
				// cluster is empty; however, the 'pop_back' after the loop will clean it up
				destination[result++] = unsigned(i + 1);
				// reset cache
				timestamp += cache_size + 1;
				running_misses = 0;
				running_faces = 0;
			}
		}
		// each time we reach the target ACMR we flush the cluster
		// this means that the last cluster is by definition not very good - there are frequent cases where we are left with a few triangles
		// in the last cluster, producing a very bad ACMR and significantly penalizing the overall results
		// thus we remove the last cluster boundary, merging the last complete cluster with the last incomplete one
		// there are sometimes cases when the last cluster is actually good enough - in which case the code above would have added 'end'
		// to the cluster boundary array which we need to remove anyway - this code will do that automatically
		if (destination[result - 1] != start)
		{
			result--;
		}
	}
	assert(result >= cluster_count);
	assert(result <= index_count / 3);
	return result;
}
} // namespace meshopt
// Reorders triangle clusters to reduce overdraw while keeping vertex cache
// locality inside each cluster; writes index_count indices to destination.
// destination may alias indices (in-place optimization is supported).
// threshold controls how much cache efficiency may be traded away when
// generating soft cluster boundaries (passed through to generateSoftBoundaries).
void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold)
{
	using namespace meshopt;
	assert(index_count % 3 == 0);
	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
	assert(vertex_positions_stride % sizeof(float) == 0);
	meshopt_Allocator allocator;
	// guard for empty meshes
	if (index_count == 0 || vertex_count == 0)
		return;
	// support in-place optimization
	if (destination == indices)
	{
		unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
		memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
		indices = indices_copy;
	}
	unsigned int cache_size = 16;
	unsigned int* cache_timestamps = allocator.allocate<unsigned int>(vertex_count);
	// generate hard boundaries from full-triangle cache misses
	unsigned int* hard_clusters = allocator.allocate<unsigned int>(index_count / 3);
	size_t hard_cluster_count = generateHardBoundaries(hard_clusters, indices, index_count, vertex_count, cache_size, cache_timestamps);
	// generate soft boundaries
	unsigned int* soft_clusters = allocator.allocate<unsigned int>(index_count / 3 + 1);
	size_t soft_cluster_count = generateSoftBoundaries(soft_clusters, indices, index_count, vertex_count, hard_clusters, hard_cluster_count, cache_size, threshold, cache_timestamps);
	const unsigned int* clusters = soft_clusters;
	size_t cluster_count = soft_cluster_count;
	// fill sort data (one dot-product key per cluster)
	float* sort_data = allocator.allocate<float>(cluster_count);
	calculateSortData(sort_data, indices, index_count, vertex_positions, vertex_positions_stride, clusters, cluster_count);
	// sort clusters using sort data
	unsigned short* sort_keys = allocator.allocate<unsigned short>(cluster_count);
	unsigned int* sort_order = allocator.allocate<unsigned int>(cluster_count);
	calculateSortOrderRadix(sort_order, sort_data, sort_keys, cluster_count);
	// fill output buffer by copying whole clusters in sorted order
	size_t offset = 0;
	for (size_t it = 0; it < cluster_count; ++it)
	{
		unsigned int cluster = sort_order[it];
		assert(cluster < cluster_count);
		size_t cluster_begin = clusters[cluster] * 3;
		size_t cluster_end = (cluster + 1 < cluster_count) ? clusters[cluster + 1] * 3 : index_count;
		assert(cluster_begin < cluster_end);
		memcpy(destination + offset, indices + cluster_begin, (cluster_end - cluster_begin) * sizeof(unsigned int));
		offset += cluster_end - cluster_begin;
	}
	assert(offset == index_count);
}

1669
thirdparty/meshoptimizer/simplifier.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,194 @@
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"
#include <assert.h>
#include <float.h>
#include <string.h>
// This work is based on:
// Fabian Giesen. Decoding Morton codes. 2009
namespace meshopt
{
// "Insert" two 0 bits after each of the 10 low bits of x
// "Insert" two 0 bits after each of the 10 low bits of x, i.e. spread bit k of
// the input to bit 3k of the output; bits above the low 10 are ignored.
inline unsigned int part1By2(unsigned int x)
{
	unsigned int spread = 0;

	for (int bit = 0; bit < 10; ++bit)
		spread |= ((x >> bit) & 1u) << (3 * bit);

	return spread;
}
// Computes a 30-bit Morton code per vertex by quantizing its position to a
// 10-bit grid over a cube that bounds the mesh; result holds one key per vertex.
static void computeOrder(unsigned int* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride)
{
	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
	// compute the axis-aligned bounding box of all vertices
	float minv[3] = {FLT_MAX, FLT_MAX, FLT_MAX};
	float maxv[3] = {-FLT_MAX, -FLT_MAX, -FLT_MAX};
	for (size_t i = 0; i < vertex_count; ++i)
	{
		const float* v = vertex_positions_data + i * vertex_stride_float;
		for (int j = 0; j < 3; ++j)
		{
			float vj = v[j];
			minv[j] = minv[j] > vj ? vj : minv[j];
			maxv[j] = maxv[j] < vj ? vj : maxv[j];
		}
	}
	// use the largest extent for all three axes so quantization is uniform
	float extent = 0.f;
	extent = (maxv[0] - minv[0]) < extent ? extent : (maxv[0] - minv[0]);
	extent = (maxv[1] - minv[1]) < extent ? extent : (maxv[1] - minv[1]);
	extent = (maxv[2] - minv[2]) < extent ? extent : (maxv[2] - minv[2]);
	// guard against a degenerate (single point) bounding box
	float scale = extent == 0 ? 0.f : 1.f / extent;
	// generate Morton order based on the position inside a unit cube
	for (size_t i = 0; i < vertex_count; ++i)
	{
		const float* v = vertex_positions_data + i * vertex_stride_float;
		// quantize each coordinate to 10 bits with round-to-nearest
		int x = int((v[0] - minv[0]) * scale * 1023.f + 0.5f);
		int y = int((v[1] - minv[1]) * scale * 1023.f + 0.5f);
		int z = int((v[2] - minv[2]) * scale * 1023.f + 0.5f);
		// interleave the three 10-bit coordinates into one 30-bit key
		result[i] = part1By2(x) | (part1By2(y) << 1) | (part1By2(z) << 2);
	}
}
// Builds the bucket offset table for a 3-pass 10-bit radix sort: counts each
// 10-bit digit of every key for all three passes, then converts the counts to
// exclusive prefix sums in place.
static void computeHistogram(unsigned int (&hist)[1024][3], const unsigned int* data, size_t count)
{
	memset(hist, 0, sizeof(hist));

	// count occurrences of each 10-bit digit, one column per radix pass
	for (size_t i = 0; i < count; ++i)
	{
		unsigned int key = data[i];

		for (int pass = 0; pass < 3; ++pass)
			hist[(key >> (10 * pass)) & 1023][pass]++;
	}

	// replace the counts with exclusive running totals, per pass
	unsigned int sums[3] = {};

	for (int i = 0; i < 1024; ++i)
	{
		for (int pass = 0; pass < 3; ++pass)
		{
			unsigned int bucket_count = hist[i][pass];

			hist[i][pass] = sums[pass];
			sums[pass] += bucket_count;
		}
	}

	assert(sums[0] == count && sums[1] == count && sums[2] == count);
}
// One scatter pass of the radix sort: moves elements of source into destination
// ordered by the pass-th 10-bit digit of their key, using (and advancing) the
// precomputed bucket offsets in hist. Advancing offsets in place keeps the
// pass stable: equal digits preserve their source order.
static void radixPass(unsigned int* destination, const unsigned int* source, const unsigned int* keys, size_t count, unsigned int (&hist)[1024][3], int pass)
{
	int shift = pass * 10;

	for (size_t i = 0; i < count; ++i)
	{
		unsigned int element = source[i];
		unsigned int digit = (keys[element] >> shift) & 1023;

		destination[hist[digit][pass]++] = element;
	}
}
} // namespace meshopt
// Computes a vertex remap table ordering vertices along a Morton (Z-order)
// curve; destination[i] receives the new index of vertex i (old-to-new map).
void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
	using namespace meshopt;
	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
	assert(vertex_positions_stride % sizeof(float) == 0);
	meshopt_Allocator allocator;
	// one 30-bit Morton key per vertex
	unsigned int* keys = allocator.allocate<unsigned int>(vertex_count);
	computeOrder(keys, vertex_positions, vertex_count, vertex_positions_stride);
	unsigned int hist[1024][3];
	computeHistogram(hist, keys, vertex_count);
	unsigned int* scratch = allocator.allocate<unsigned int>(vertex_count);
	// start from the identity permutation
	for (size_t i = 0; i < vertex_count; ++i)
		destination[i] = unsigned(i);
	// 3-pass radix sort computes the resulting order into scratch
	radixPass(scratch, destination, keys, vertex_count, hist, 0);
	radixPass(destination, scratch, keys, vertex_count, hist, 1);
	radixPass(scratch, destination, keys, vertex_count, hist, 2);
	// since our remap table is mapping old=>new, we need to reverse it
	for (size_t i = 0; i < vertex_count; ++i)
		destination[scratch[i]] = unsigned(i);
}
// Reorders triangles along a Morton curve of their centroids to improve
// spatial locality; writes index_count indices to destination.
// destination may alias indices (in-place operation is supported).
void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
{
	using namespace meshopt;
	assert(index_count % 3 == 0);
	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
	assert(vertex_positions_stride % sizeof(float) == 0);
	(void)vertex_count;
	size_t face_count = index_count / 3;
	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
	meshopt_Allocator allocator;
	// compute one centroid per triangle; the centroid array is then spatially
	// sorted as if it were a vertex buffer with a 3-float stride
	float* centroids = allocator.allocate<float>(face_count * 3);
	for (size_t i = 0; i < face_count; ++i)
	{
		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
		assert(a < vertex_count && b < vertex_count && c < vertex_count);
		const float* va = vertex_positions + a * vertex_stride_float;
		const float* vb = vertex_positions + b * vertex_stride_float;
		const float* vc = vertex_positions + c * vertex_stride_float;
		centroids[i * 3 + 0] = (va[0] + vb[0] + vc[0]) / 3.f;
		centroids[i * 3 + 1] = (va[1] + vb[1] + vc[1]) / 3.f;
		centroids[i * 3 + 2] = (va[2] + vb[2] + vc[2]) / 3.f;
	}
	unsigned int* remap = allocator.allocate<unsigned int>(face_count);
	meshopt_spatialSortRemap(remap, centroids, face_count, sizeof(float) * 3);
	// support in-place operation: copy the source indices before scattering
	if (destination == indices)
	{
		unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
		memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
		indices = indices_copy;
	}
	// scatter each triangle to its remapped position
	for (size_t i = 0; i < face_count; ++i)
	{
		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
		unsigned int r = remap[i];
		destination[r * 3 + 0] = a;
		destination[r * 3 + 1] = b;
		destination[r * 3 + 2] = c;
	}
}

295
thirdparty/meshoptimizer/stripifier.cpp vendored Normal file
View File

@ -0,0 +1,295 @@
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"
#include <assert.h>
#include <limits.h>
#include <string.h>
// This work is based on:
// Francine Evans, Steven Skiena and Amitabh Varshney. Optimizing Triangle Strips for Fast Rendering. 1996
namespace meshopt
{
// Picks the triangle in the buffer whose lowest-valence corner has the
// smallest valence; used as the starting triangle of a new strip. Ties are
// broken in favor of the earliest triangle. buffer_size must be > 0.
static unsigned int findStripFirst(const unsigned int buffer[][3], unsigned int buffer_size, const unsigned int* valence)
{
	unsigned int best_index = 0;
	unsigned int best_valence = ~0u;

	for (size_t i = 0; i < buffer_size; ++i)
	{
		// minimum valence over the triangle's three corners
		unsigned int v = valence[buffer[i][0]];

		if (valence[buffer[i][1]] < v)
			v = valence[buffer[i][1]];

		if (valence[buffer[i][2]] < v)
			v = valence[buffer[i][2]];

		if (v < best_valence)
		{
			best_valence = v;
			best_index = unsigned(i);
		}
	}

	return best_index;
}
// Finds the first triangle in the buffer containing the directed edge
// (e0, e1). Returns (triangle_index << 2) | opposite_vertex_slot, where the
// low 2 bits give the position of the vertex not on the edge, or -1 when no
// triangle matches.
static int findStripNext(const unsigned int buffer[][3], unsigned int buffer_size, unsigned int e0, unsigned int e1)
{
	for (size_t i = 0; i < buffer_size; ++i)
	{
		// edges are (slot, slot+1 mod 3); the remaining slot is (slot+2 mod 3)
		for (int e = 0; e < 3; ++e)
		{
			if (buffer[i][e] == e0 && buffer[i][(e + 1) % 3] == e1)
				return (int(i) << 2) | ((e + 2) % 3);
		}
	}

	return -1;
}
} // namespace meshopt
// Converts a triangle list into a triangle strip, greedily extending strips
// through a small look-ahead buffer of pending triangles. Strips are joined
// either with restart_index markers (when restart_index != 0) or with
// degenerate triangles (when restart_index == 0).
// Returns the number of indices written to destination; destination must hold
// at least meshopt_stripifyBound(index_count) entries and must not alias indices.
size_t meshopt_stripify(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int restart_index)
{
	assert(destination != indices);
	assert(index_count % 3 == 0);
	using namespace meshopt;
	meshopt_Allocator allocator;
	// look-ahead buffer of pending triangles the strip can continue into
	const size_t buffer_capacity = 8;
	unsigned int buffer[buffer_capacity][3] = {};
	unsigned int buffer_size = 0;
	size_t index_offset = 0;
	// strip[0..1] is the last emitted edge; parity tracks the winding flip
	unsigned int strip[2] = {};
	unsigned int parity = 0;
	size_t strip_size = 0;
	// compute vertex valence; this is used to prioritize starting triangle for strips
	unsigned int* valence = allocator.allocate<unsigned int>(vertex_count);
	memset(valence, 0, vertex_count * sizeof(unsigned int));
	for (size_t i = 0; i < index_count; ++i)
	{
		unsigned int index = indices[i];
		assert(index < vertex_count);
		valence[index]++;
	}
	// next encodes the buffered triangle that continues the strip, as returned
	// by findStripNext: (buffer index << 2) | opposite vertex slot, or -1
	int next = -1;
	while (buffer_size > 0 || index_offset < index_count)
	{
		assert(next < 0 || (size_t(next >> 2) < buffer_size && (next & 3) < 3));
		// fill triangle buffer
		while (buffer_size < buffer_capacity && index_offset < index_count)
		{
			buffer[buffer_size][0] = indices[index_offset + 0];
			buffer[buffer_size][1] = indices[index_offset + 1];
			buffer[buffer_size][2] = indices[index_offset + 2];
			buffer_size++;
			index_offset += 3;
		}
		assert(buffer_size > 0);
		if (next >= 0)
		{
			unsigned int i = next >> 2;
			unsigned int a = buffer[i][0], b = buffer[i][1], c = buffer[i][2];
			unsigned int v = buffer[i][next & 3];
			// ordered removal from the buffer
			memmove(buffer[i], buffer[i + 1], (buffer_size - i - 1) * sizeof(buffer[0]));
			buffer_size--;
			// update vertex valences for strip start heuristic
			valence[a]--;
			valence[b]--;
			valence[c]--;
			// find next triangle (note that edge order flips on every iteration)
			// in some cases we need to perform a swap to pick a different outgoing triangle edge
			// for [a b c], the default strip edge is [b c], but we might want to use [a c]
			int cont = findStripNext(buffer, buffer_size, parity ? strip[1] : v, parity ? v : strip[1]);
			int swap = cont < 0 ? findStripNext(buffer, buffer_size, parity ? v : strip[0], parity ? strip[0] : v) : -1;
			if (cont < 0 && swap >= 0)
			{
				// [a b c] => [a b a c]
				destination[strip_size++] = strip[0];
				destination[strip_size++] = v;
				// next strip has same winding
				// ? a b => b a v
				strip[1] = v;
				next = swap;
			}
			else
			{
				// emit the next vertex in the strip
				destination[strip_size++] = v;
				// next strip has flipped winding
				strip[0] = strip[1];
				strip[1] = v;
				parity ^= 1;
				next = cont;
			}
		}
		else
		{
			// if we didn't find anything, we need to find the next new triangle
			// we use a heuristic to maximize the strip length
			unsigned int i = findStripFirst(buffer, buffer_size, &valence[0]);
			unsigned int a = buffer[i][0], b = buffer[i][1], c = buffer[i][2];
			// ordered removal from the buffer
			memmove(buffer[i], buffer[i + 1], (buffer_size - i - 1) * sizeof(buffer[0]));
			buffer_size--;
			// update vertex valences for strip start heuristic
			valence[a]--;
			valence[b]--;
			valence[c]--;
			// we need to pre-rotate the triangle so that we will find a match in the existing buffer on the next iteration
			int ea = findStripNext(buffer, buffer_size, c, b);
			int eb = findStripNext(buffer, buffer_size, a, c);
			int ec = findStripNext(buffer, buffer_size, b, a);
			// in some cases we can have several matching edges; since we can pick any edge, we pick the one with the smallest
			// triangle index in the buffer. this reduces the effect of stripification on ACMR and additionally - for unclear
			// reasons - slightly improves the stripification efficiency
			int mine = INT_MAX;
			mine = (ea >= 0 && mine > ea) ? ea : mine;
			mine = (eb >= 0 && mine > eb) ? eb : mine;
			mine = (ec >= 0 && mine > ec) ? ec : mine;
			if (ea == mine)
			{
				// keep abc
				next = ea;
			}
			else if (eb == mine)
			{
				// abc -> bca
				unsigned int t = a;
				a = b, b = c, c = t;
				next = eb;
			}
			else if (ec == mine)
			{
				// abc -> cab
				unsigned int t = c;
				c = b, b = a, a = t;
				next = ec;
			}
			if (restart_index)
			{
				if (strip_size)
					destination[strip_size++] = restart_index;
				destination[strip_size++] = a;
				destination[strip_size++] = b;
				destination[strip_size++] = c;
				// new strip always starts with the same edge winding
				strip[0] = b;
				strip[1] = c;
				parity = 1;
			}
			else
			{
				if (strip_size)
				{
					// connect last strip using degenerate triangles
					destination[strip_size++] = strip[1];
					destination[strip_size++] = a;
				}
				// note that we may need to flip the emitted triangle based on parity
				// we always end up with outgoing edge "cb" in the end
				unsigned int e0 = parity ? c : b;
				unsigned int e1 = parity ? b : c;
				destination[strip_size++] = a;
				destination[strip_size++] = e0;
				destination[strip_size++] = e1;
				strip[0] = e0;
				strip[1] = e1;
				parity ^= 1;
			}
		}
	}
	return strip_size;
}
// Returns the worst-case number of indices meshopt_stripify can emit for a
// triangle list with index_count indices (index_count must be divisible by 3).
size_t meshopt_stripifyBound(size_t index_count)
{
	assert(index_count % 3 == 0);

	size_t triangle_count = index_count / 3;

	// each triangle needs at most 5 output indices:
	// without restarts, up to 2 degenerate indices plus 3 real ones;
	// with restarts, 1 restart index plus 3 real ones
	return triangle_count * 5;
}
size_t meshopt_unstripify(unsigned int* destination, const unsigned int* indices, size_t index_count, unsigned int restart_index)
{
assert(destination != indices);
size_t offset = 0;
size_t start = 0;
for (size_t i = 0; i < index_count; ++i)
{
if (restart_index && indices[i] == restart_index)
{
start = i + 1;
}
else if (i - start >= 2)
{
unsigned int a = indices[i - 2], b = indices[i - 1], c = indices[i];
// flip winding for odd triangles
if ((i - start) & 1)
{
unsigned int t = a;
a = b, b = t;
}
// although we use restart indices, strip swaps still produce degenerate triangles, so skip them
if (a != b && a != c && b != c)
{
destination[offset + 0] = a;
destination[offset + 1] = b;
destination[offset + 2] = c;
offset += 3;
}
}
}
return offset;
}
// Returns the worst-case number of indices meshopt_unstripify can emit for a
// strip of index_count indices (a non-empty strip has at least 3 indices).
size_t meshopt_unstripifyBound(size_t index_count)
{
	assert(index_count == 0 || index_count >= 3);

	if (index_count == 0)
		return 0;

	// every index past the first two can complete one triangle
	return (index_count - 2) * 3;
}

View File

@ -0,0 +1,73 @@
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"
#include <assert.h>
#include <string.h>
// Simulates a timestamp-based post-transform vertex cache over the index
// stream and returns statistics: vertices transformed, warps executed, ACMR
// (transformed vertices per triangle) and ATVR (transformed vertices per
// unique vertex).
// warp_size > 0 flushes the cache when a triangle's new vertices don't fit in
// the current warp; primgroup_size > 0 flushes after that many triangles.
meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size)
{
	assert(index_count % 3 == 0);
	assert(cache_size >= 3);
	assert(warp_size == 0 || warp_size >= 3);
	meshopt_Allocator allocator;
	meshopt_VertexCacheStatistics result = {};
	unsigned int warp_offset = 0;
	unsigned int primgroup_offset = 0;
	unsigned int* cache_timestamps = allocator.allocate<unsigned int>(vertex_count);
	memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));
	// start past cache_size so the zero-initialized timestamps read as "not cached"
	unsigned int timestamp = cache_size + 1;
	for (size_t i = 0; i < index_count; i += 3)
	{
		unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
		assert(a < vertex_count && b < vertex_count && c < vertex_count);
		// per-corner miss predicates for the current triangle
		bool ac = (timestamp - cache_timestamps[a]) > cache_size;
		bool bc = (timestamp - cache_timestamps[b]) > cache_size;
		bool cc = (timestamp - cache_timestamps[c]) > cache_size;
		// flush cache if triangle doesn't fit into warp or into the primitive buffer
		if ((primgroup_size && primgroup_offset == primgroup_size) || (warp_size && warp_offset + ac + bc + cc > warp_size))
		{
			result.warps_executed += warp_offset > 0;
			warp_offset = 0;
			primgroup_offset = 0;
			// reset cache (advancing the clock past cache_size evicts everything)
			timestamp += cache_size + 1;
		}
		// update cache and add vertices to warp
		for (int j = 0; j < 3; ++j)
		{
			unsigned int index = indices[i + j];
			if (timestamp - cache_timestamps[index] > cache_size)
			{
				cache_timestamps[index] = timestamp++;
				result.vertices_transformed++;
				warp_offset++;
			}
		}
		primgroup_offset++;
	}
	// vertices never touched by the index stream are excluded from ATVR
	size_t unique_vertex_count = 0;
	for (size_t i = 0; i < vertex_count; ++i)
		unique_vertex_count += cache_timestamps[i] > 0;
	result.warps_executed += warp_offset > 0;
	result.acmr = index_count == 0 ? 0 : float(result.vertices_transformed) / float(index_count / 3);
	result.atvr = unique_vertex_count == 0 ? 0 : float(result.vertices_transformed) / float(unique_vertex_count);
	return result;
}

View File

@ -0,0 +1,473 @@
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"
#include <assert.h>
#include <string.h>
// This work is based on:
// Tom Forsyth. Linear-Speed Vertex Cache Optimisation. 2006
// Pedro Sander, Diego Nehab and Joshua Barczak. Fast Triangle Reordering for Vertex Locality and Reduced Overdraw. 2007
namespace meshopt
{
// Maximum simulated cache size supported by the score tables below.
const size_t kCacheSizeMax = 16;
// Live triangle counts are clamped to this value when looking up live[] scores.
const size_t kValenceMax = 8;
// Score lookup tables used by vertexScore(): cache[] is indexed by cache
// position + 1 (slot 0 holds the score for vertices not in the cache) and
// live[] by the clamped number of not-yet-emitted triangles using the vertex.
struct VertexScoreTable
{
	float cache[1 + kCacheSizeMax];
	float live[1 + kValenceMax];
};
// Tuned to minimize the ACMR of a GPU that has a cache profile similar to NVidia and AMD
static const VertexScoreTable kVertexScoreTable = {
	{0.f, 0.779f, 0.791f, 0.789f, 0.981f, 0.843f, 0.726f, 0.847f, 0.882f, 0.867f, 0.799f, 0.642f, 0.613f, 0.600f, 0.568f, 0.372f, 0.234f},
	{0.f, 0.995f, 0.713f, 0.450f, 0.404f, 0.059f, 0.005f, 0.147f, 0.006f},
};
// Tuned to minimize the encoded index buffer size
static const VertexScoreTable kVertexScoreTableStrip = {
	{0.f, 1.000f, 1.000f, 1.000f, 0.453f, 0.561f, 0.490f, 0.459f, 0.179f, 0.526f, 0.000f, 0.227f, 0.184f, 0.490f, 0.112f, 0.050f, 0.131f},
	{0.f, 0.956f, 0.786f, 0.577f, 0.558f, 0.618f, 0.549f, 0.499f, 0.489f},
};
// Per-vertex triangle adjacency in a compact array layout (built by
// buildTriangleAdjacency): for vertex v, the indices of the triangles that
// reference it are data[offsets[v]] .. data[offsets[v] + counts[v] - 1].
struct TriangleAdjacency
{
	unsigned int* counts;  // number of adjacent triangles per vertex
	unsigned int* offsets; // start of each vertex's triangle list in data
	unsigned int* data;    // triangle indices, index_count entries in total
};
// Builds the per-vertex triangle adjacency table for the given index stream.
// All arrays are allocated from the provided allocator; counts/offsets have
// vertex_count entries and data has index_count entries.
static void buildTriangleAdjacency(TriangleAdjacency& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
{
	size_t face_count = index_count / 3;
	// allocate arrays
	adjacency.counts = allocator.allocate<unsigned int>(vertex_count);
	adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);
	adjacency.data = allocator.allocate<unsigned int>(index_count);
	// fill triangle counts
	memset(adjacency.counts, 0, vertex_count * sizeof(unsigned int));
	for (size_t i = 0; i < index_count; ++i)
	{
		assert(indices[i] < vertex_count);
		adjacency.counts[indices[i]]++;
	}
	// fill offset table (exclusive prefix sum of counts)
	unsigned int offset = 0;
	for (size_t i = 0; i < vertex_count; ++i)
	{
		adjacency.offsets[i] = offset;
		offset += adjacency.counts[i];
	}
	assert(offset == index_count);
	// fill triangle data; offsets are used as write cursors and advanced here
	for (size_t i = 0; i < face_count; ++i)
	{
		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
		adjacency.data[adjacency.offsets[a]++] = unsigned(i);
		adjacency.data[adjacency.offsets[b]++] = unsigned(i);
		adjacency.data[adjacency.offsets[c]++] = unsigned(i);
	}
	// fix offsets that have been disturbed by the previous pass: each cursor
	// advanced by exactly counts[i], so subtracting restores the start offsets
	for (size_t i = 0; i < vertex_count; ++i)
	{
		assert(adjacency.offsets[i] >= adjacency.counts[i]);
		adjacency.offsets[i] -= adjacency.counts[i];
	}
}
// Picks the next fanning vertex when the neighbour search fails: first the
// most recently pushed dead-end vertex that still has live triangles, then the
// first unfinished vertex in input order. Returns ~0u when none remain.
// Both dead_end_top and input_cursor are advanced in place.
static unsigned int getNextVertexDeadEnd(const unsigned int* dead_end, unsigned int& dead_end_top, unsigned int& input_cursor, const unsigned int* live_triangles, size_t vertex_count)
{
	// pop the dead-end stack until a vertex with remaining triangles is found
	while (dead_end_top > 0)
	{
		unsigned int candidate = dead_end[--dead_end_top];

		if (live_triangles[candidate] > 0)
			return candidate;
	}

	// fall back to scanning the vertices in input order
	for (; input_cursor < vertex_count; ++input_cursor)
	{
		if (live_triangles[input_cursor] > 0)
			return input_cursor;
	}

	return ~0u;
}
// Scans the candidate vertices and returns the one with the highest priority,
// or ~0u when no candidate has live triangles. Candidates that would still be
// in the cache after their remaining triangles are fanned out are ranked by
// their cache age; all others get priority 0. Ties keep the first candidate.
static unsigned int getNextVertexNeighbour(const unsigned int* next_candidates_begin, const unsigned int* next_candidates_end, const unsigned int* live_triangles, const unsigned int* cache_timestamps, unsigned int timestamp, unsigned int cache_size)
{
	unsigned int best_candidate = ~0u;
	int best_priority = -1;

	for (const unsigned int* it = next_candidates_begin; it != next_candidates_end; ++it)
	{
		unsigned int vertex = *it;

		// vertices with no remaining triangles cannot continue the fan
		if (live_triangles[vertex] == 0)
			continue;

		int priority = 0;

		// will it be in cache after fanning? each remaining triangle advances
		// the timestamp by at most 2 (the two vertices besides this one)
		if (2 * live_triangles[vertex] + timestamp - cache_timestamps[vertex] <= cache_size)
		{
			priority = timestamp - cache_timestamps[vertex]; // position in cache
		}

		if (priority > best_priority)
		{
			best_priority = priority;
			best_candidate = vertex;
		}
	}

	return best_candidate;
}
// Looks up the combined score of a vertex: cache[] is indexed by cache
// position + 1 (so -1, meaning "not in cache", maps to slot 0) and live[] by
// the vertex's remaining triangle count clamped to kValenceMax.
static float vertexScore(const VertexScoreTable* table, int cache_position, unsigned int live_triangles)
{
	assert(cache_position >= -1 && cache_position < int(kCacheSizeMax));

	unsigned int valence = live_triangles;

	if (valence > kValenceMax)
		valence = kValenceMax;

	return table->cache[cache_position + 1] + table->live[valence];
}
// Returns the first triangle at or after input_cursor that has not been
// emitted yet, or ~0u when all remaining triangles are emitted. The cursor is
// advanced in place so each triangle is scanned at most once overall.
static unsigned int getNextTriangleDeadEnd(unsigned int& input_cursor, const unsigned char* emitted_flags, size_t face_count)
{
	for (; input_cursor < face_count; ++input_cursor)
	{
		if (!emitted_flags[input_cursor])
			return input_cursor;
	}

	return ~0u;
}
} // namespace meshopt
// Greedy, score-driven vertex cache optimizer. Reorders the triangles of
// `indices` into `destination` so that vertex references are temporally close,
// improving post-transform cache reuse. `destination` may alias `indices`
// (in-place operation). `table` supplies the per-cache-position and per-valence
// scores that drive which triangle is emitted next.
void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const meshopt::VertexScoreTable* table)
{
using namespace meshopt;
assert(index_count % 3 == 0);
meshopt_Allocator allocator;
// guard for empty meshes
if (index_count == 0 || vertex_count == 0)
return;
// support in-place optimization
if (destination == indices)
{
unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
indices = indices_copy;
}
// size of the simulated post-transform cache used for scoring
unsigned int cache_size = 16;
assert(cache_size <= kCacheSizeMax);
size_t face_count = index_count / 3;
// build adjacency information
TriangleAdjacency adjacency = {};
buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);
// live triangle counts (number of not-yet-emitted triangles per vertex)
unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count);
memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int));
// emitted flags
unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count);
memset(emitted_flags, 0, face_count);
// compute initial vertex scores (cache position -1 == not in cache)
float* vertex_scores = allocator.allocate<float>(vertex_count);
for (size_t i = 0; i < vertex_count; ++i)
vertex_scores[i] = vertexScore(table, -1, live_triangles[i]);
// compute triangle scores (sum of the three vertex scores)
float* triangle_scores = allocator.allocate<float>(face_count);
for (size_t i = 0; i < face_count; ++i)
{
unsigned int a = indices[i * 3 + 0];
unsigned int b = indices[i * 3 + 1];
unsigned int c = indices[i * 3 + 2];
triangle_scores[i] = vertex_scores[a] + vertex_scores[b] + vertex_scores[c];
}
// double-buffered simulated cache; ping-pongs between `cache` and `cache_new`,
// with room for the 3 vertices of the freshly emitted triangle
unsigned int cache_holder[2 * (kCacheSizeMax + 3)];
unsigned int* cache = cache_holder;
unsigned int* cache_new = cache_holder + kCacheSizeMax + 3;
size_t cache_count = 0;
unsigned int current_triangle = 0;
// next input triangle to scan from when the greedy search dead-ends
// (starts at 1 because triangle 0 is emitted first)
unsigned int input_cursor = 1;
unsigned int output_triangle = 0;
while (current_triangle != ~0u)
{
assert(output_triangle < face_count);
unsigned int a = indices[current_triangle * 3 + 0];
unsigned int b = indices[current_triangle * 3 + 1];
unsigned int c = indices[current_triangle * 3 + 2];
// output indices
destination[output_triangle * 3 + 0] = a;
destination[output_triangle * 3 + 1] = b;
destination[output_triangle * 3 + 2] = c;
output_triangle++;
// update emitted flags
emitted_flags[current_triangle] = true;
triangle_scores[current_triangle] = 0;
// new triangle vertices go to the front of the simulated cache
size_t cache_write = 0;
cache_new[cache_write++] = a;
cache_new[cache_write++] = b;
cache_new[cache_write++] = c;
// old triangles (copy over previous cache entries, skipping duplicates of a/b/c)
for (size_t i = 0; i < cache_count; ++i)
{
unsigned int index = cache[i];
if (index != a && index != b && index != c)
{
cache_new[cache_write++] = index;
}
}
// swap the ping-pong buffers and clamp the cache to its nominal size
unsigned int* cache_temp = cache;
cache = cache_new, cache_new = cache_temp;
cache_count = cache_write > cache_size ? cache_size : cache_write;
// update live triangle counts
live_triangles[a]--;
live_triangles[b]--;
live_triangles[c]--;
// remove emitted triangle from adjacency data
// this makes sure that we spend less time traversing these lists on subsequent iterations
for (size_t k = 0; k < 3; ++k)
{
unsigned int index = indices[current_triangle * 3 + k];
unsigned int* neighbours = &adjacency.data[0] + adjacency.offsets[index];
size_t neighbours_size = adjacency.counts[index];
for (size_t i = 0; i < neighbours_size; ++i)
{
unsigned int tri = neighbours[i];
if (tri == current_triangle)
{
// unordered removal: swap with last entry and shrink
neighbours[i] = neighbours[neighbours_size - 1];
adjacency.counts[index]--;
break;
}
}
}
unsigned int best_triangle = ~0u;
float best_score = 0;
// update cache positions, vertex scores and triangle scores, and find next best triangle
// (only vertices touched by the cache update can have changed scores)
for (size_t i = 0; i < cache_write; ++i)
{
unsigned int index = cache[i];
// entries past cache_size have effectively been evicted
int cache_position = i >= cache_size ? -1 : int(i);
// update vertex score
float score = vertexScore(table, cache_position, live_triangles[index]);
float score_diff = score - vertex_scores[index];
vertex_scores[index] = score;
// update scores of vertex triangles incrementally via the score delta
const unsigned int* neighbours_begin = &adjacency.data[0] + adjacency.offsets[index];
const unsigned int* neighbours_end = neighbours_begin + adjacency.counts[index];
for (const unsigned int* it = neighbours_begin; it != neighbours_end; ++it)
{
unsigned int tri = *it;
assert(!emitted_flags[tri]);
float tri_score = triangle_scores[tri] + score_diff;
assert(tri_score > 0);
if (best_score < tri_score)
{
best_triangle = tri;
best_score = tri_score;
}
triangle_scores[tri] = tri_score;
}
}
// step through input triangles in order if we hit a dead-end
current_triangle = best_triangle;
if (current_triangle == ~0u)
{
current_triangle = getNextTriangleDeadEnd(input_cursor, &emitted_flags[0], face_count);
}
}
assert(input_cursor == face_count);
assert(output_triangle == face_count);
}
// Default vertex cache optimizer: runs the score-table optimizer with the
// standard score table (kVertexScoreTable).
void meshopt_optimizeVertexCache(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count)
{
meshopt_optimizeVertexCacheTable(destination, indices, index_count, vertex_count, &meshopt::kVertexScoreTable);
}
// Variant of the vertex cache optimizer using kVertexScoreTableStrip —
// presumably a score table tuned for subsequent triangle-strip conversion;
// see the table definition for the exact weights.
void meshopt_optimizeVertexCacheStrip(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count)
{
meshopt_optimizeVertexCacheTable(destination, indices, index_count, vertex_count, &meshopt::kVertexScoreTableStrip);
}
// FIFO-cache vertex cache optimizer: around a pivot vertex, emits every
// not-yet-emitted adjacent triangle, then picks the next pivot among the
// vertices just pushed to the dead-end stack (preferring ones still inside the
// simulated FIFO cache of `cache_size` entries), falling back to the dead-end
// stack and finally to a linear scan over vertices. `destination` may alias
// `indices` (in-place operation). Structure matches the "Tipsify" approach —
// see reference literature to confirm.
void meshopt_optimizeVertexCacheFifo(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size)
{
using namespace meshopt;
assert(index_count % 3 == 0);
assert(cache_size >= 3);
meshopt_Allocator allocator;
// guard for empty meshes
if (index_count == 0 || vertex_count == 0)
return;
// support in-place optimization
if (destination == indices)
{
unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
indices = indices_copy;
}
size_t face_count = index_count / 3;
// build adjacency information
TriangleAdjacency adjacency = {};
buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);
// live triangle counts (number of not-yet-emitted triangles per vertex)
unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count);
memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int));
// cache time stamps; a vertex is "in cache" iff timestamp - cache_timestamps[v] <= cache_size
unsigned int* cache_timestamps = allocator.allocate<unsigned int>(vertex_count);
memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));
// dead-end stack
unsigned int* dead_end = allocator.allocate<unsigned int>(index_count);
unsigned int dead_end_top = 0;
// emitted flags
unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count);
memset(emitted_flags, 0, face_count);
unsigned int current_vertex = 0;
// start past cache_size so that the initial zero timestamps read as "not in cache"
unsigned int timestamp = cache_size + 1;
unsigned int input_cursor = 1; // vertex to restart from in case of dead-end
unsigned int output_triangle = 0;
while (current_vertex != ~0u)
{
const unsigned int* next_candidates_begin = &dead_end[0] + dead_end_top;
// emit all vertex neighbours
const unsigned int* neighbours_begin = &adjacency.data[0] + adjacency.offsets[current_vertex];
const unsigned int* neighbours_end = neighbours_begin + adjacency.counts[current_vertex];
for (const unsigned int* it = neighbours_begin; it != neighbours_end; ++it)
{
unsigned int triangle = *it;
if (!emitted_flags[triangle])
{
unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];
// output indices
destination[output_triangle * 3 + 0] = a;
destination[output_triangle * 3 + 1] = b;
destination[output_triangle * 3 + 2] = c;
output_triangle++;
// update dead-end stack
dead_end[dead_end_top + 0] = a;
dead_end[dead_end_top + 1] = b;
dead_end[dead_end_top + 2] = c;
dead_end_top += 3;
// update live triangle counts
live_triangles[a]--;
live_triangles[b]--;
live_triangles[c]--;
// update cache info
// if vertex is not in cache, put it in cache
if (timestamp - cache_timestamps[a] > cache_size)
cache_timestamps[a] = timestamp++;
if (timestamp - cache_timestamps[b] > cache_size)
cache_timestamps[b] = timestamp++;
if (timestamp - cache_timestamps[c] > cache_size)
cache_timestamps[c] = timestamp++;
// update emitted flags
emitted_flags[triangle] = true;
}
}
// next candidates are the ones we pushed to dead-end stack just now
const unsigned int* next_candidates_end = &dead_end[0] + dead_end_top;
// get next vertex
current_vertex = getNextVertexNeighbour(next_candidates_begin, next_candidates_end, &live_triangles[0], &cache_timestamps[0], timestamp, cache_size);
if (current_vertex == ~0u)
{
// no suitable neighbour: fall back to dead-end stack, then linear vertex scan
current_vertex = getNextVertexDeadEnd(&dead_end[0], dead_end_top, input_cursor, &live_triangles[0], vertex_count);
}
}
assert(output_triangle == face_count);
}

1193
thirdparty/meshoptimizer/vertexcodec.cpp vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,843 @@
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"
#include <math.h>
#include <string.h>
// The block below auto-detects SIMD ISA that can be used on the target platform
#ifndef MESHOPTIMIZER_NO_SIMD
// The SIMD implementation requires SSE2, which can be enabled unconditionally through compiler settings
#if defined(__SSE2__)
#define SIMD_SSE
#endif
// MSVC supports compiling SSE2 code regardless of compile options; we assume all 32-bit CPUs support SSE2
#if !defined(SIMD_SSE) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
#define SIMD_SSE
#endif
// GCC/clang define these when NEON support is available
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#define SIMD_NEON
#endif
// On MSVC, we assume that ARM builds always target NEON-capable devices
#if !defined(SIMD_NEON) && defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
#define SIMD_NEON
#endif
// When targeting Wasm SIMD we can't use runtime cpuid checks so we unconditionally enable SIMD
#if defined(__wasm_simd128__)
#define SIMD_WASM
#endif
#endif // !MESHOPTIMIZER_NO_SIMD
#ifdef SIMD_SSE
#include <emmintrin.h>
#include <stdint.h>
#endif
#ifdef _MSC_VER
#include <intrin.h>
#endif
#ifdef SIMD_NEON
#if defined(_MSC_VER) && defined(_M_ARM64)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
#endif
#ifdef SIMD_WASM
#include <wasm_simd128.h>
#endif
#ifdef SIMD_WASM
#define wasmx_unpacklo_v16x8(a, b) wasm_v16x8_shuffle(a, b, 0, 8, 1, 9, 2, 10, 3, 11)
#define wasmx_unpackhi_v16x8(a, b) wasm_v16x8_shuffle(a, b, 4, 12, 5, 13, 6, 14, 7, 15)
#define wasmx_unziplo_v32x4(a, b) wasm_v32x4_shuffle(a, b, 0, 2, 4, 6)
#define wasmx_unziphi_v32x4(a, b) wasm_v32x4_shuffle(a, b, 1, 3, 5, 7)
#endif
namespace meshopt
{
#if !defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_WASM)
// Reconstructs unit normals stored with the octahedral filter. Each element is
// an [x y z w] quadruple of T (int8 or int16): x/y are the folded octahedral
// coordinates, z is rebuilt from them, the triple is renormalized back into the
// signed range of T, and w is left untouched.
template <typename T>
static void decodeFilterOct(T* data, size_t count)
{
	// largest representable positive value of T (127 for int8, 32767 for int16)
	const float unit = float((1 << (sizeof(T) * 8 - 1)) - 1);

	for (size_t i = 0; i < count; ++i)
	{
		T* n = data + i * 4;

		// reconstruct z; this assumes the stored z encodes 1.f at the same bit count as x/y
		float nx = float(n[0]);
		float ny = float(n[1]);
		float nz = float(n[2]) - fabsf(nx) - fabsf(ny);

		// lower-hemisphere fixup: fold x/y back when z < 0
		float fold = (nz >= 0.f) ? 0.f : nz;
		nx += (nx >= 0.f) ? fold : -fold;
		ny += (ny >= 0.f) ? fold : -fold;

		// rescale so the normal has unit length expressed in T's range
		float len = sqrtf(nx * nx + ny * ny + nz * nz);
		float scale = unit / len;

		// round-half-away-from-zero float->int conversion
		n[0] = T(int(nx * scale + (nx >= 0.f ? 0.5f : -0.5f)));
		n[1] = T(int(ny * scale + (ny >= 0.f ? 0.5f : -0.5f)));
		n[2] = T(int(nz * scale + (nz >= 0.f ? 0.5f : -0.5f)));
	}
}
// Reconstructs rotation quaternions stored with the quaternion filter. The
// three smallest components live in elements 0..2; the largest is rebuilt from
// the unit-length constraint. The low 2 bits of element 3 select which output
// slot receives each reconstructed component, and its remaining bits encode the
// quantization range used by the encoder.
static void decodeFilterQuat(short* data, size_t count)
{
	const float scale = 1.f / sqrtf(2.f);

	for (size_t i = 0; i < count; ++i)
	{
		short* q = data + i * 4;

		// forcing the low 2 bits to 1 recovers the exact range the encoder used
		int range = q[3] | 3;
		float ss = scale / float(range);

		// map the three stored components back to [-1/sqrt(2) .. 1/sqrt(2)]
		float x = float(q[0]) * ss;
		float y = float(q[1]) * ss;
		float z = float(q[2]) * ss;

		// rebuild the largest component; clamp to 0.f so rounding error can't produce NaN
		float ww = 1.f - x * x - y * y - z * z;
		float w = sqrtf(ww >= 0.f ? ww : 0.f);

		// round-half-away-from-zero float->int (w is always non-negative)
		int xf = int(x * 32767.f + (x >= 0.f ? 0.5f : -0.5f));
		int yf = int(y * 32767.f + (y >= 0.f ? 0.5f : -0.5f));
		int zf = int(z * 32767.f + (z >= 0.f ? 0.5f : -0.5f));
		int wf = int(w * 32767.f + 0.5f);

		// low 2 bits select the dropped component; rotate the output order to match
		int qc = q[3] & 3;
		q[(qc + 1) & 3] = short(xf);
		q[(qc + 2) & 3] = short(yf);
		q[(qc + 3) & 3] = short(zf);
		q[(qc + 0) & 3] = short(wf);
	}
}
// Reconstructs floats stored with the exponential filter. Each 32-bit value
// packs a signed 24-bit mantissa in the low bits and a signed 8-bit exponent in
// the high bits; the output is the IEEE-754 bit pattern of mantissa * 2^exponent.
static void decodeFilterExp(unsigned int* data, size_t count)
{
	for (size_t i = 0; i < count; ++i)
	{
		unsigned int v = data[i];

		// sign-extend the 24-bit mantissa and the 8-bit exponent
		int m = int(v << 8) >> 8;
		int e = int(v) >> 24;

		// build the bit pattern of 2^e directly from the biased exponent field;
		// multiplying by the mantissa is an optimized ldexp(float(m), e)
		unsigned int bits = unsigned(e + 127) << 23;

		float f;
		memcpy(&f, &bits, sizeof(f));
		f = f * float(m);
		memcpy(&data[i], &f, sizeof(f));
	}
}
#endif
#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
// Runs `process` over `count` items of `stride` components each, in a way that
// lets `process` assume it can always touch full groups of 4 items: the bulk is
// processed in place, and any 1..3-item remainder is staged through a
// zero-padded stack buffer before being copied back.
template <typename T>
static void dispatchSimd(void (*process)(T*, size_t), T* data, size_t count, size_t stride)
{
	assert(stride <= 4);

	// bulk portion: largest multiple of 4
	size_t aligned = count & ~size_t(3);
	process(data, aligned);

	size_t rest = count - aligned;
	if (rest == 0)
		return;

	// remainder: stage through a stack buffer sized for stride <= 4, count <= 4
	T tail[4 * 4] = {};
	size_t tail_bytes = rest * stride * sizeof(T);
	assert(tail_bytes <= sizeof(tail));

	memcpy(tail, data + aligned * stride, tail_bytes);
	process(tail, rest);
	memcpy(data + aligned * stride, tail, tail_bytes);
}
// 64-bit rotate-left with per-compiler fast paths: MSVC intrinsic, Clang
// builtin where available, and a portable shift/or fallback whose masked shift
// amounts avoid undefined behavior for x == 0 or x == 64.
inline uint64_t rotateleft64(uint64_t v, int x)
{
#if defined(_MSC_VER) && !defined(__clang__)
return _rotl64(v, x);
// Apple's Clang 8 is actually vanilla Clang 3.9, there we need to look for
// version 11 instead: https://en.wikipedia.org/wiki/Xcode#Toolchain_versions
#elif defined(__clang__) && ((!defined(__apple_build_version__) && __clang_major__ >= 8) || __clang_major__ >= 11)
return __builtin_rotateleft64(v, x);
#else
return (v << (x & 63)) | (v >> ((64 - x) & 63));
#endif
}
#endif
#ifdef SIMD_SSE
// SSE2 variant of decodeFilterOct for 8-bit normals; processes 4 [x y z w]
// int8 quadruples per iteration. Normalization uses the approximate
// _mm_rsqrt_ps (the 16-bit variant below uses exact sqrt/div instead).
// The loop rounds count up to a multiple of 4; dispatchSimd pads the tail.
static void decodeFilterOctSimd(signed char* data, size_t count)
{
const __m128 sign = _mm_set1_ps(-0.f);
for (size_t i = 0; i < count; i += 4)
{
__m128i n4 = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[i * 4]));
// sign-extends each of x,y in [x y ? ?] with arithmetic shifts
__m128i xf = _mm_srai_epi32(_mm_slli_epi32(n4, 24), 24);
__m128i yf = _mm_srai_epi32(_mm_slli_epi32(n4, 16), 24);
// unpack z; note that z is unsigned so we technically don't need to sign extend it
__m128i zf = _mm_srai_epi32(_mm_slli_epi32(n4, 8), 24);
// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
__m128 x = _mm_cvtepi32_ps(xf);
__m128 y = _mm_cvtepi32_ps(yf);
__m128 z = _mm_sub_ps(_mm_cvtepi32_ps(zf), _mm_add_ps(_mm_andnot_ps(sign, x), _mm_andnot_ps(sign, y)));
// fixup octahedral coordinates for z<0
__m128 t = _mm_min_ps(z, _mm_setzero_ps());
x = _mm_add_ps(x, _mm_xor_ps(t, _mm_and_ps(x, sign)));
y = _mm_add_ps(y, _mm_xor_ps(t, _mm_and_ps(y, sign)));
// compute normal length & scale
__m128 ll = _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z)));
__m128 s = _mm_mul_ps(_mm_set1_ps(127.f), _mm_rsqrt_ps(ll));
// rounded signed float->int
__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));
// combine xr/yr/zr into final value; the top byte (.w) is preserved from the input
__m128i res = _mm_and_si128(n4, _mm_set1_epi32(0xff000000));
res = _mm_or_si128(res, _mm_and_si128(xr, _mm_set1_epi32(0xff)));
res = _mm_or_si128(res, _mm_slli_epi32(_mm_and_si128(yr, _mm_set1_epi32(0xff)), 8));
res = _mm_or_si128(res, _mm_slli_epi32(_mm_and_si128(zr, _mm_set1_epi32(0xff)), 16));
_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[i * 4]), res);
}
}
// SSE2 variant of decodeFilterOct for 16-bit normals; processes 4 [x y z w]
// int16 quadruples (two 128-bit registers) per iteration. Uses exact
// sqrt + div since 16-bit output needs more precision than rsqrt provides.
// The loop rounds count up to a multiple of 4; dispatchSimd pads the tail.
static void decodeFilterOctSimd(short* data, size_t count)
{
const __m128 sign = _mm_set1_ps(-0.f);
for (size_t i = 0; i < count; i += 4)
{
__m128 n4_0 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 0) * 4]));
__m128 n4_1 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 2) * 4]));
// gather both x/y 16-bit pairs in each 32-bit lane
__m128i n4 = _mm_castps_si128(_mm_shuffle_ps(n4_0, n4_1, _MM_SHUFFLE(2, 0, 2, 0)));
// sign-extends each of x,y in [x y] with arithmetic shifts
__m128i xf = _mm_srai_epi32(_mm_slli_epi32(n4, 16), 16);
__m128i yf = _mm_srai_epi32(n4, 16);
// unpack z; note that z is unsigned so we don't need to sign extend it
__m128i z4 = _mm_castps_si128(_mm_shuffle_ps(n4_0, n4_1, _MM_SHUFFLE(3, 1, 3, 1)));
__m128i zf = _mm_and_si128(z4, _mm_set1_epi32(0x7fff));
// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
__m128 x = _mm_cvtepi32_ps(xf);
__m128 y = _mm_cvtepi32_ps(yf);
__m128 z = _mm_sub_ps(_mm_cvtepi32_ps(zf), _mm_add_ps(_mm_andnot_ps(sign, x), _mm_andnot_ps(sign, y)));
// fixup octahedral coordinates for z<0
__m128 t = _mm_min_ps(z, _mm_setzero_ps());
x = _mm_add_ps(x, _mm_xor_ps(t, _mm_and_ps(x, sign)));
y = _mm_add_ps(y, _mm_xor_ps(t, _mm_and_ps(y, sign)));
// compute normal length & scale
__m128 ll = _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z)));
__m128 s = _mm_div_ps(_mm_set1_ps(32767.f), _mm_sqrt_ps(ll));
// rounded signed float->int
__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));
// mix x/z and y/0 to make 16-bit unpack easier
__m128i xzr = _mm_or_si128(_mm_and_si128(xr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(zr, 16));
__m128i y0r = _mm_and_si128(yr, _mm_set1_epi32(0xffff));
// pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
__m128i res_0 = _mm_unpacklo_epi16(xzr, y0r);
__m128i res_1 = _mm_unpackhi_epi16(xzr, y0r);
// patch in .w (preserved from the input)
res_0 = _mm_or_si128(res_0, _mm_and_si128(_mm_castps_si128(n4_0), _mm_set1_epi64x(0xffff000000000000)));
res_1 = _mm_or_si128(res_1, _mm_and_si128(_mm_castps_si128(n4_1), _mm_set1_epi64x(0xffff000000000000)));
_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 0) * 4]), res_0);
_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 2) * 4]), res_1);
}
}
// SSE2 variant of decodeFilterQuat; processes 4 quaternions per iteration.
// Packs results in wxyz order and then applies the per-quaternion component
// rotation (selected by the low 2 bits of the 4th element) with scalar 64-bit
// rotates, since SSE2 has no per-lane variable rotate.
// The loop rounds count up to a multiple of 4; dispatchSimd pads the tail.
static void decodeFilterQuatSimd(short* data, size_t count)
{
const float scale = 1.f / sqrtf(2.f);
for (size_t i = 0; i < count; i += 4)
{
__m128 q4_0 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 0) * 4]));
__m128 q4_1 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 2) * 4]));
// gather both x/y 16-bit pairs in each 32-bit lane
__m128i q4_xy = _mm_castps_si128(_mm_shuffle_ps(q4_0, q4_1, _MM_SHUFFLE(2, 0, 2, 0)));
__m128i q4_zc = _mm_castps_si128(_mm_shuffle_ps(q4_0, q4_1, _MM_SHUFFLE(3, 1, 3, 1)));
// sign-extends each of x,y in [x y] with arithmetic shifts
__m128i xf = _mm_srai_epi32(_mm_slli_epi32(q4_xy, 16), 16);
__m128i yf = _mm_srai_epi32(q4_xy, 16);
__m128i zf = _mm_srai_epi32(_mm_slli_epi32(q4_zc, 16), 16);
__m128i cf = _mm_srai_epi32(q4_zc, 16);
// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
__m128i sf = _mm_or_si128(cf, _mm_set1_epi32(3));
__m128 ss = _mm_div_ps(_mm_set1_ps(scale), _mm_cvtepi32_ps(sf));
// convert x/y/z to [-1..1] (scaled...)
__m128 x = _mm_mul_ps(_mm_cvtepi32_ps(xf), ss);
__m128 y = _mm_mul_ps(_mm_cvtepi32_ps(yf), ss);
__m128 z = _mm_mul_ps(_mm_cvtepi32_ps(zf), ss);
// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
__m128 ww = _mm_sub_ps(_mm_set1_ps(1.f), _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z))));
__m128 w = _mm_sqrt_ps(_mm_max_ps(ww, _mm_setzero_ps()));
__m128 s = _mm_set1_ps(32767.f);
// rounded signed float->int
__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));
__m128i wr = _mm_cvtps_epi32(_mm_mul_ps(w, s));
// mix x/z and w/y to make 16-bit unpack easier
__m128i xzr = _mm_or_si128(_mm_and_si128(xr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(zr, 16));
__m128i wyr = _mm_or_si128(_mm_and_si128(wr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(yr, 16));
// pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
__m128i res_0 = _mm_unpacklo_epi16(wyr, xzr);
__m128i res_1 = _mm_unpackhi_epi16(wyr, xzr);
// store results to stack so that we can rotate using scalar instructions
uint64_t res[4];
_mm_storeu_si128(reinterpret_cast<__m128i*>(&res[0]), res_0);
_mm_storeu_si128(reinterpret_cast<__m128i*>(&res[2]), res_1);
// rotate and store; each quaternion is one 64-bit lane, rotated by 16*qc bits
uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]);
out[0] = rotateleft64(res[0], data[(i + 0) * 4 + 3] << 4);
out[1] = rotateleft64(res[1], data[(i + 1) * 4 + 3] << 4);
out[2] = rotateleft64(res[2], data[(i + 2) * 4 + 3] << 4);
out[3] = rotateleft64(res[3], data[(i + 3) * 4 + 3] << 4);
}
}
// SSE2 variant of decodeFilterExp; processes 4 values per iteration.
// Same scheme as the scalar version: build the bit pattern of 2^e from the
// exponent byte and multiply by the sign-extended 24-bit mantissa.
// The loop rounds count up to a multiple of 4; dispatchSimd pads the tail.
static void decodeFilterExpSimd(unsigned int* data, size_t count)
{
for (size_t i = 0; i < count; i += 4)
{
__m128i v = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[i]));
// decode exponent into 2^x directly
__m128i ef = _mm_srai_epi32(v, 24);
__m128i es = _mm_slli_epi32(_mm_add_epi32(ef, _mm_set1_epi32(127)), 23);
// decode 24-bit mantissa into floating-point value
__m128i mf = _mm_srai_epi32(_mm_slli_epi32(v, 8), 8);
__m128 m = _mm_cvtepi32_ps(mf);
__m128 r = _mm_mul_ps(_mm_castsi128_ps(es), m);
_mm_storeu_ps(reinterpret_cast<float*>(&data[i]), r);
}
}
#endif
#if defined(SIMD_NEON) && !defined(__aarch64__) && !defined(_M_ARM64)
// Polyfill for vsqrtq_f32 on 32-bit NEON (no vector sqrt instruction there):
// one Newton-Raphson refinement of the reciprocal-sqrt estimate, then
// multiply by x to obtain sqrt(x).
inline float32x4_t vsqrtq_f32(float32x4_t x)
{
float32x4_t r = vrsqrteq_f32(x);
r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(r, x), r)); // refine rsqrt estimate
return vmulq_f32(r, x);
}
// Polyfill for vdivq_f32 on 32-bit NEON (no vector divide instruction there):
// one Newton-Raphson refinement of the reciprocal estimate of y, then x * (1/y).
inline float32x4_t vdivq_f32(float32x4_t x, float32x4_t y)
{
float32x4_t r = vrecpeq_f32(y);
r = vmulq_f32(r, vrecpsq_f32(y, r)); // refine rcp estimate
return vmulq_f32(x, r);
}
#endif
#ifdef SIMD_NEON
// NEON variant of decodeFilterOct for 8-bit normals; processes 4 [x y z w]
// int8 quadruples per iteration. Uses an unrefined rsqrt estimate (adequate
// for 8-bit output; the 16-bit variant below refines it).
// The loop rounds count up to a multiple of 4; dispatchSimd pads the tail.
static void decodeFilterOctSimd(signed char* data, size_t count)
{
const int32x4_t sign = vdupq_n_s32(0x80000000);
for (size_t i = 0; i < count; i += 4)
{
int32x4_t n4 = vld1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]));
// sign-extends each of x,y in [x y ? ?] with arithmetic shifts
int32x4_t xf = vshrq_n_s32(vshlq_n_s32(n4, 24), 24);
int32x4_t yf = vshrq_n_s32(vshlq_n_s32(n4, 16), 24);
// unpack z; note that z is unsigned so we technically don't need to sign extend it
int32x4_t zf = vshrq_n_s32(vshlq_n_s32(n4, 8), 24);
// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
float32x4_t x = vcvtq_f32_s32(xf);
float32x4_t y = vcvtq_f32_s32(yf);
float32x4_t z = vsubq_f32(vcvtq_f32_s32(zf), vaddq_f32(vabsq_f32(x), vabsq_f32(y)));
// fixup octahedral coordinates for z<0
float32x4_t t = vminq_f32(z, vdupq_n_f32(0.f));
x = vaddq_f32(x, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(x), sign))));
y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));
// compute normal length & scale
float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
float32x4_t rl = vrsqrteq_f32(ll);
float32x4_t s = vmulq_f32(vdupq_n_f32(127.f), rl);
// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
const float32x4_t fsnap = vdupq_n_f32(3 << 22);
int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
// combine xr/yr/zr into final value; the top byte (.w) is preserved from the input
int32x4_t res = vandq_s32(n4, vdupq_n_s32(0xff000000));
res = vorrq_s32(res, vandq_s32(xr, vdupq_n_s32(0xff)));
res = vorrq_s32(res, vshlq_n_s32(vandq_s32(yr, vdupq_n_s32(0xff)), 8));
res = vorrq_s32(res, vshlq_n_s32(vandq_s32(zr, vdupq_n_s32(0xff)), 16));
vst1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]), res);
}
}
// NEON variant of decodeFilterOct for 16-bit normals; processes 4 [x y z w]
// int16 quadruples (two 128-bit registers) per iteration. Refines the rsqrt
// estimate with one Newton-Raphson step for 16-bit output precision.
// The loop rounds count up to a multiple of 4; dispatchSimd pads the tail.
static void decodeFilterOctSimd(short* data, size_t count)
{
const int32x4_t sign = vdupq_n_s32(0x80000000);
for (size_t i = 0; i < count; i += 4)
{
int32x4_t n4_0 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4]));
int32x4_t n4_1 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4]));
// gather both x/y 16-bit pairs in each 32-bit lane
int32x4_t n4 = vuzpq_s32(n4_0, n4_1).val[0];
// sign-extends each of x,y in [x y] with arithmetic shifts
int32x4_t xf = vshrq_n_s32(vshlq_n_s32(n4, 16), 16);
int32x4_t yf = vshrq_n_s32(n4, 16);
// unpack z; note that z is unsigned so we don't need to sign extend it
int32x4_t z4 = vuzpq_s32(n4_0, n4_1).val[1];
int32x4_t zf = vandq_s32(z4, vdupq_n_s32(0x7fff));
// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
float32x4_t x = vcvtq_f32_s32(xf);
float32x4_t y = vcvtq_f32_s32(yf);
float32x4_t z = vsubq_f32(vcvtq_f32_s32(zf), vaddq_f32(vabsq_f32(x), vabsq_f32(y)));
// fixup octahedral coordinates for z<0
float32x4_t t = vminq_f32(z, vdupq_n_f32(0.f));
x = vaddq_f32(x, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(x), sign))));
y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));
// compute normal length & scale
float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
float32x4_t rl = vrsqrteq_f32(ll);
rl = vmulq_f32(rl, vrsqrtsq_f32(vmulq_f32(rl, ll), rl)); // refine rsqrt estimate
float32x4_t s = vmulq_f32(vdupq_n_f32(32767.f), rl);
// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
const float32x4_t fsnap = vdupq_n_f32(3 << 22);
int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
// mix x/z and y/0 to make 16-bit unpack easier
int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
int32x4_t y0r = vandq_s32(yr, vdupq_n_s32(0xffff));
// pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(xzr), vreinterpretq_s16_s32(y0r)).val[0]);
int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(xzr), vreinterpretq_s16_s32(y0r)).val[1]);
// patch in .w (preserved from the input via bit-select)
res_0 = vbslq_s32(vreinterpretq_u32_u64(vdupq_n_u64(0xffff000000000000)), n4_0, res_0);
res_1 = vbslq_s32(vreinterpretq_u32_u64(vdupq_n_u64(0xffff000000000000)), n4_1, res_1);
vst1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4]), res_0);
vst1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4]), res_1);
}
}
// NEON variant of decodeFilterQuat; processes 4 quaternions per iteration.
// Packs results in wxyz order, then applies the per-quaternion component
// rotation (selected by the low 2 bits of the 4th element) with scalar 64-bit
// rotates on each extracted lane.
// The loop rounds count up to a multiple of 4; dispatchSimd pads the tail.
static void decodeFilterQuatSimd(short* data, size_t count)
{
const float scale = 1.f / sqrtf(2.f);
for (size_t i = 0; i < count; i += 4)
{
int32x4_t q4_0 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4]));
int32x4_t q4_1 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4]));
// gather both x/y 16-bit pairs in each 32-bit lane
int32x4_t q4_xy = vuzpq_s32(q4_0, q4_1).val[0];
int32x4_t q4_zc = vuzpq_s32(q4_0, q4_1).val[1];
// sign-extends each of x,y in [x y] with arithmetic shifts
int32x4_t xf = vshrq_n_s32(vshlq_n_s32(q4_xy, 16), 16);
int32x4_t yf = vshrq_n_s32(q4_xy, 16);
int32x4_t zf = vshrq_n_s32(vshlq_n_s32(q4_zc, 16), 16);
int32x4_t cf = vshrq_n_s32(q4_zc, 16);
// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
int32x4_t sf = vorrq_s32(cf, vdupq_n_s32(3));
float32x4_t ss = vdivq_f32(vdupq_n_f32(scale), vcvtq_f32_s32(sf));
// convert x/y/z to [-1..1] (scaled...)
float32x4_t x = vmulq_f32(vcvtq_f32_s32(xf), ss);
float32x4_t y = vmulq_f32(vcvtq_f32_s32(yf), ss);
float32x4_t z = vmulq_f32(vcvtq_f32_s32(zf), ss);
// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
float32x4_t ww = vsubq_f32(vdupq_n_f32(1.f), vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z))));
float32x4_t w = vsqrtq_f32(vmaxq_f32(ww, vdupq_n_f32(0.f)));
float32x4_t s = vdupq_n_f32(32767.f);
// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
const float32x4_t fsnap = vdupq_n_f32(3 << 22);
int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
int32x4_t wr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(w, s), fsnap));
// mix x/z and w/y to make 16-bit unpack easier
int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
int32x4_t wyr = vorrq_s32(vandq_s32(wr, vdupq_n_s32(0xffff)), vshlq_n_s32(yr, 16));
// pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[0]);
int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[1]);
// rotate and store; each quaternion is one 64-bit lane, rotated by 16*qc bits
uint64_t* out = (uint64_t*)&data[i * 4];
out[0] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 0), vgetq_lane_s32(cf, 0) << 4);
out[1] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 1), vgetq_lane_s32(cf, 1) << 4);
out[2] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 0), vgetq_lane_s32(cf, 2) << 4);
out[3] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 1), vgetq_lane_s32(cf, 3) << 4);
}
}
// NEON variant of decodeFilterExp; processes 4 values per iteration.
// Same scheme as the scalar version: build the bit pattern of 2^e from the
// exponent byte and multiply by the sign-extended 24-bit mantissa.
// The loop rounds count up to a multiple of 4; dispatchSimd pads the tail.
static void decodeFilterExpSimd(unsigned int* data, size_t count)
{
for (size_t i = 0; i < count; i += 4)
{
int32x4_t v = vld1q_s32(reinterpret_cast<int32_t*>(&data[i]));
// decode exponent into 2^x directly
int32x4_t ef = vshrq_n_s32(v, 24);
int32x4_t es = vshlq_n_s32(vaddq_s32(ef, vdupq_n_s32(127)), 23);
// decode 24-bit mantissa into floating-point value
int32x4_t mf = vshrq_n_s32(vshlq_n_s32(v, 8), 8);
float32x4_t m = vcvtq_f32_s32(mf);
float32x4_t r = vmulq_f32(vreinterpretq_f32_s32(es), m);
vst1q_f32(reinterpret_cast<float*>(&data[i]), r);
}
}
#ifdef SIMD_WASM
// Wasm SIMD variant of decodeFilterOct for 8-bit normals; processes 4 [x y z w]
// int8 quadruples per iteration.
// The loop rounds count up to a multiple of 4; dispatchSimd pads the tail.
static void decodeFilterOctSimd(signed char* data, size_t count)
{
const v128_t sign = wasm_f32x4_splat(-0.f);
for (size_t i = 0; i < count; i += 4)
{
v128_t n4 = wasm_v128_load(&data[i * 4]);
// sign-extends each of x,y in [x y ? ?] with arithmetic shifts
v128_t xf = wasm_i32x4_shr(wasm_i32x4_shl(n4, 24), 24);
v128_t yf = wasm_i32x4_shr(wasm_i32x4_shl(n4, 16), 24);
// unpack z; note that z is unsigned so we technically don't need to sign extend it
v128_t zf = wasm_i32x4_shr(wasm_i32x4_shl(n4, 8), 24);
// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
v128_t x = wasm_f32x4_convert_i32x4(xf);
v128_t y = wasm_f32x4_convert_i32x4(yf);
v128_t z = wasm_f32x4_sub(wasm_f32x4_convert_i32x4(zf), wasm_f32x4_add(wasm_f32x4_abs(x), wasm_f32x4_abs(y)));
// fixup octahedral coordinates for z<0
// note: i32x4_min with 0 is equvalent to f32x4_min
v128_t t = wasm_i32x4_min(z, wasm_i32x4_splat(0));
x = wasm_f32x4_add(x, wasm_v128_xor(t, wasm_v128_and(x, sign)));
y = wasm_f32x4_add(y, wasm_v128_xor(t, wasm_v128_and(y, sign)));
// compute normal length & scale
v128_t ll = wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z)));
v128_t s = wasm_f32x4_div(wasm_f32x4_splat(127.f), wasm_f32x4_sqrt(ll));
// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
// note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
const v128_t fsnap = wasm_f32x4_splat(3 << 22);
v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap);
v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap);
v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap);
// combine xr/yr/zr into final value; the top byte (.w) is preserved from the input
v128_t res = wasm_v128_and(n4, wasm_i32x4_splat(0xff000000));
res = wasm_v128_or(res, wasm_v128_and(xr, wasm_i32x4_splat(0xff)));
res = wasm_v128_or(res, wasm_i32x4_shl(wasm_v128_and(yr, wasm_i32x4_splat(0xff)), 8));
res = wasm_v128_or(res, wasm_i32x4_shl(wasm_v128_and(zr, wasm_i32x4_splat(0xff)), 16));
wasm_v128_store(&data[i * 4], res);
}
}
// Wasm SIMD variant of decodeFilterOct for 16-bit normals; processes 4 [x y z w]
// int16 quadruples (two 128-bit registers) per iteration.
// The loop rounds count up to a multiple of 4; dispatchSimd pads the tail.
static void decodeFilterOctSimd(short* data, size_t count)
{
const v128_t sign = wasm_f32x4_splat(-0.f);
const v128_t zmask = wasm_i32x4_splat(0x7fff);
for (size_t i = 0; i < count; i += 4)
{
v128_t n4_0 = wasm_v128_load(&data[(i + 0) * 4]);
v128_t n4_1 = wasm_v128_load(&data[(i + 2) * 4]);
// gather both x/y 16-bit pairs in each 32-bit lane
v128_t n4 = wasmx_unziplo_v32x4(n4_0, n4_1);
// sign-extends each of x,y in [x y] with arithmetic shifts
v128_t xf = wasm_i32x4_shr(wasm_i32x4_shl(n4, 16), 16);
v128_t yf = wasm_i32x4_shr(n4, 16);
// unpack z; note that z is unsigned so we don't need to sign extend it
v128_t z4 = wasmx_unziphi_v32x4(n4_0, n4_1);
v128_t zf = wasm_v128_and(z4, zmask);
// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
v128_t x = wasm_f32x4_convert_i32x4(xf);
v128_t y = wasm_f32x4_convert_i32x4(yf);
v128_t z = wasm_f32x4_sub(wasm_f32x4_convert_i32x4(zf), wasm_f32x4_add(wasm_f32x4_abs(x), wasm_f32x4_abs(y)));
// fixup octahedral coordinates for z<0
// note: i32x4_min with 0 is equvalent to f32x4_min
v128_t t = wasm_i32x4_min(z, wasm_i32x4_splat(0));
x = wasm_f32x4_add(x, wasm_v128_xor(t, wasm_v128_and(x, sign)));
y = wasm_f32x4_add(y, wasm_v128_xor(t, wasm_v128_and(y, sign)));
// compute normal length & scale
v128_t ll = wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z)));
v128_t s = wasm_f32x4_div(wasm_f32x4_splat(32767.f), wasm_f32x4_sqrt(ll));
// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
const v128_t fsnap = wasm_f32x4_splat(3 << 22);
v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap);
v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap);
v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap);
// mix x/z and y/0 to make 16-bit unpack easier
v128_t xzr = wasm_v128_or(wasm_v128_and(xr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(zr, 16));
v128_t y0r = wasm_v128_and(yr, wasm_i32x4_splat(0xffff));
// pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
v128_t res_0 = wasmx_unpacklo_v16x8(xzr, y0r);
v128_t res_1 = wasmx_unpackhi_v16x8(xzr, y0r);
// patch in .w (preserved from the input)
res_0 = wasm_v128_or(res_0, wasm_v128_and(n4_0, wasm_i64x2_splat(0xffff000000000000)));
res_1 = wasm_v128_or(res_1, wasm_v128_and(n4_1, wasm_i64x2_splat(0xffff000000000000)));
wasm_v128_store(&data[(i + 0) * 4], res_0);
wasm_v128_store(&data[(i + 2) * 4], res_1);
}
}
// Decodes filtered quaternions stored as 4 shorts per vertex, in place (WASM SIMD path).
// Three quantized components are stored explicitly; the fourth is reconstructed as
// sqrt(1 - x^2 - y^2 - z^2), and the low bits of the last short select which component
// was dropped (the output 64-bit value is rotated accordingly).
// Processes 4 quaternions per iteration; the dispatcher guarantees a multiple-of-4 count.
static void decodeFilterQuatSimd(short* data, size_t count)
{
	// components are stored scaled by sqrt(2) so that they fit [-0.707..0.707]
	const float scale = 1.f / sqrtf(2.f);

	for (size_t i = 0; i < count; i += 4)
	{
		v128_t q4_0 = wasm_v128_load(&data[(i + 0) * 4]);
		v128_t q4_1 = wasm_v128_load(&data[(i + 2) * 4]);

		// gather both x/y 16-bit pairs in each 32-bit lane
		v128_t q4_xy = wasmx_unziplo_v32x4(q4_0, q4_1);
		v128_t q4_zc = wasmx_unziphi_v32x4(q4_0, q4_1);

		// sign-extends each of x,y in [x y] with arithmetic shifts
		v128_t xf = wasm_i32x4_shr(wasm_i32x4_shl(q4_xy, 16), 16);
		v128_t yf = wasm_i32x4_shr(q4_xy, 16);
		v128_t zf = wasm_i32x4_shr(wasm_i32x4_shl(q4_zc, 16), 16);
		v128_t cf = wasm_i32x4_shr(q4_zc, 16); // control word: component index + range bits

		// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
		v128_t sf = wasm_v128_or(cf, wasm_i32x4_splat(3));
		v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(scale), wasm_f32x4_convert_i32x4(sf));

		// convert x/y/z to [-1..1] (scaled by 1/sqrt(2) via ss)
		v128_t x = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(xf), ss);
		v128_t y = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(yf), ss);
		v128_t z = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(zf), ss);

		// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
		// note: i32x4_max with 0 is equivalent to f32x4_max
		v128_t ww = wasm_f32x4_sub(wasm_f32x4_splat(1.f), wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z))));
		v128_t w = wasm_f32x4_sqrt(wasm_i32x4_max(ww, wasm_i32x4_splat(0)));

		v128_t s = wasm_f32x4_splat(32767.f);

		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
		const v128_t fsnap = wasm_f32x4_splat(3 << 22);

		v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap);
		v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap);
		v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap);
		v128_t wr = wasm_f32x4_add(wasm_f32x4_mul(w, s), fsnap);

		// mix x/z and w/y to make 16-bit unpack easier
		v128_t xzr = wasm_v128_or(wasm_v128_and(xr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(zr, 16));
		v128_t wyr = wasm_v128_or(wasm_v128_and(wr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(yr, 16));

		// pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
		v128_t res_0 = wasmx_unpacklo_v16x8(wyr, xzr);
		v128_t res_1 = wasmx_unpackhi_v16x8(wyr, xzr);

		// compute component index shifted left by 4 (and moved into i32x4 slot)
		// TODO: volatile here works around LLVM mis-optimizing code; https://github.com/emscripten-core/emscripten/issues/11449
		volatile v128_t cm = wasm_i32x4_shl(cf, 4);

		// rotate and store: the per-quaternion rotation moves the reconstructed
		// component into the position indicated by the stored component index
		uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]);

		out[0] = rotateleft64(wasm_i64x2_extract_lane(res_0, 0), wasm_i32x4_extract_lane(cm, 0));
		out[1] = rotateleft64(wasm_i64x2_extract_lane(res_0, 1), wasm_i32x4_extract_lane(cm, 1));
		out[2] = rotateleft64(wasm_i64x2_extract_lane(res_1, 0), wasm_i32x4_extract_lane(cm, 2));
		out[3] = rotateleft64(wasm_i64x2_extract_lane(res_1, 1), wasm_i32x4_extract_lane(cm, 3));
	}
}
// Decodes exponential-filtered values in place (WASM SIMD path).
// Each 32-bit word packs an 8-bit exponent in the top byte and a 24-bit signed
// mantissa below it; the decoded float is mantissa * 2^exponent.
// Processes 4 values per iteration; the dispatcher guarantees a multiple-of-4 count.
static void decodeFilterExpSimd(unsigned int* data, size_t count)
{
	for (size_t i = 0; i < count; i += 4)
	{
		v128_t v = wasm_v128_load(&data[i]);

		// decode exponent into 2^x directly: bias the exponent by 127 and shift it
		// into the IEEE-754 exponent field; the resulting bit pattern IS the float 2^x
		// (v128_t is typeless, so es can be used as an f32x4 operand below)
		v128_t ef = wasm_i32x4_shr(v, 24);
		v128_t es = wasm_i32x4_shl(wasm_i32x4_add(ef, wasm_i32x4_splat(127)), 23);

		// decode 24-bit mantissa into floating-point value (shl/shr pair sign-extends)
		v128_t mf = wasm_i32x4_shr(wasm_i32x4_shl(v, 8), 8);
		v128_t m = wasm_f32x4_convert_i32x4(mf);

		v128_t r = wasm_f32x4_mul(es, m);

		wasm_v128_store(&data[i], r);
	}
}
#endif
} // namespace meshopt
// Public entry point for octahedral normal decoding.
// vertex_size selects storage width: 4 = signed char components, 8 = short components.
// Dispatches to the SIMD kernel when a SIMD backend was selected at compile time,
// otherwise falls back to the scalar implementation.
void meshopt_decodeFilterOct(void* buffer, size_t vertex_count, size_t vertex_size)
{
	using namespace meshopt;

	assert(vertex_size == 4 || vertex_size == 8);

#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
	if (vertex_size == 4)
		dispatchSimd(decodeFilterOctSimd, static_cast<signed char*>(buffer), vertex_count, 4);
	else
		dispatchSimd(decodeFilterOctSimd, static_cast<short*>(buffer), vertex_count, 4);
#else
	if (vertex_size == 4)
		decodeFilterOct(static_cast<signed char*>(buffer), vertex_count);
	else
		decodeFilterOct(static_cast<short*>(buffer), vertex_count);
#endif
}
// Public entry point for quaternion decoding.
// Quaternions are always stored as 4 shorts per element (vertex_size must be 8).
void meshopt_decodeFilterQuat(void* buffer, size_t vertex_count, size_t vertex_size)
{
	using namespace meshopt;

	assert(vertex_size == 8);
	(void)vertex_size; // only used by the assert in release builds

#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
	dispatchSimd(decodeFilterQuatSimd, static_cast<short*>(buffer), vertex_count, 4);
#else
	decodeFilterQuat(static_cast<short*>(buffer), vertex_count);
#endif
}
// Public entry point for exponential decoding.
// Operates on every 32-bit word of the vertex, so vertex_size only needs to be a
// multiple of 4; the stream is processed as vertex_count * (vertex_size / 4) words.
void meshopt_decodeFilterExp(void* buffer, size_t vertex_count, size_t vertex_size)
{
	using namespace meshopt;

	assert(vertex_size % 4 == 0);

#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
	dispatchSimd(decodeFilterExpSimd, static_cast<unsigned int*>(buffer), vertex_count * (vertex_size / 4), 1);
#else
	decodeFilterExp(static_cast<unsigned int*>(buffer), vertex_count * (vertex_size / 4));
#endif
}
#undef SIMD_SSE
#undef SIMD_NEON
#undef SIMD_WASM

View File

@ -0,0 +1,58 @@
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"
#include <assert.h>
#include <string.h>
// Estimates the memory traffic caused by vertex fetches for the given index stream
// by simulating a direct-mapped cache. Returns total bytes fetched and the overfetch
// ratio (bytes fetched vs. the minimum needed to read each unique vertex once).
meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size)
{
	assert(index_count % 3 == 0);
	assert(vertex_size > 0 && vertex_size <= 256);

	meshopt_VertexFetchStatistics result = {};

	meshopt_Allocator allocator;

	// marks vertices that are referenced at least once by the index buffer
	unsigned char* seen = allocator.allocate<unsigned char>(vertex_count);
	memset(seen, 0, vertex_count);

	const size_t kCacheLine = 64;
	const size_t kCacheSize = 128 * 1024;

	// simple direct mapped cache; on typical mesh data this is close to 4-way cache, and this model is a gross approximation anyway
	size_t cache[kCacheSize / kCacheLine] = {};
	const size_t line_count = sizeof(cache) / sizeof(cache[0]);

	for (size_t i = 0; i < index_count; ++i)
	{
		unsigned int index = indices[i];
		assert(index < vertex_count);

		seen[index] = 1;

		// a vertex may straddle several cache lines; touch each line it overlaps
		size_t first_tag = (index * vertex_size) / kCacheLine;
		size_t last_tag = (index * vertex_size + vertex_size + kCacheLine - 1) / kCacheLine;
		assert(first_tag < last_tag);

		for (size_t tag = first_tag; tag < last_tag; ++tag)
		{
			size_t line = tag % line_count;

			// we store +1 since cache is filled with 0 by default
			if (cache[line] != tag + 1)
				result.bytes_fetched += kCacheLine;

			cache[line] = tag + 1;
		}
	}

	size_t unique_vertex_count = 0;

	for (size_t i = 0; i < vertex_count; ++i)
		unique_vertex_count += seen[i];

	result.overfetch = unique_vertex_count == 0 ? 0 : float(result.bytes_fetched) / float(unique_vertex_count * vertex_size);

	return result;
}

View File

@ -0,0 +1,74 @@
// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
#include "meshoptimizer.h"
#include <assert.h>
#include <string.h>
// Builds a remap table that assigns vertices new indices in the order they are
// first referenced by the index buffer. Entries for vertices never referenced
// stay at ~0u. Returns the number of vertices that received a new index.
size_t meshopt_optimizeVertexFetchRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count)
{
	assert(index_count % 3 == 0);

	// ~0u marks "no index assigned yet"
	memset(destination, -1, vertex_count * sizeof(unsigned int));

	unsigned int next_vertex = 0;

	for (size_t i = 0; i < index_count; ++i)
	{
		unsigned int index = indices[i];
		assert(index < vertex_count);

		unsigned int& slot = destination[index];

		if (slot == ~0u)
			slot = next_vertex++;
	}

	assert(next_vertex <= vertex_count);

	return next_vertex;
}
// Reorders the vertex buffer so that vertices appear in the order they are first
// referenced by the index buffer, and rewrites the indices in place to match.
// destination may alias vertices (in-place optimization is supported).
// Returns the number of unique vertices written to destination.
size_t meshopt_optimizeVertexFetch(void* destination, unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
{
	assert(index_count % 3 == 0);
	assert(vertex_size > 0 && vertex_size <= 256);

	meshopt_Allocator allocator;

	// support in-place optimization: snapshot the source so that our writes to
	// destination cannot be observed by subsequent reads
	if (destination == vertices)
	{
		unsigned char* copy = allocator.allocate<unsigned char>(vertex_count * vertex_size);
		memcpy(copy, vertices, vertex_count * vertex_size);
		vertices = copy;
	}

	unsigned char* dst_bytes = static_cast<unsigned char*>(destination);
	const unsigned char* src_bytes = static_cast<const unsigned char*>(vertices);

	// remap[v] is the output position of source vertex v, or ~0u if not emitted yet
	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
	memset(remap, -1, vertex_count * sizeof(unsigned int));

	unsigned int next_vertex = 0;

	for (size_t i = 0; i < index_count; ++i)
	{
		unsigned int index = indices[i];
		assert(index < vertex_count);

		if (remap[index] == ~0u)
		{
			// first reference: append the vertex to the output buffer
			memcpy(dst_bytes + next_vertex * vertex_size, src_bytes + index * vertex_size, vertex_size);
			remap[index] = next_vertex++;
		}

		// rewrite the index stream in place to point at the new location
		indices[i] = remap[index];
	}

	assert(next_vertex <= vertex_count);

	return next_vertex;
}

207
util/island_finder.h Normal file
View File

@ -0,0 +1,207 @@
#ifndef ISLAND_FINDER_H
#define ISLAND_FINDER_H
#include "math/box3i.h"
#include "span.h"
// Scans a grid of binary values and returns another grid
// where all contiguous islands are labelled with a unique ID.
// It is based on a two-pass version of Connected-Component-Labeling.
//
// In the first pass we scan the grid to identify connected chunks by giving them temporary IDs,
// and marking equivalent ones if two chunks touch.
// In the second pass, we replace IDs with consecutive ones starting from 1, which are more convenient to use.
//
// See https://en.wikipedia.org/wiki/Connected-component_labeling
//
class IslandFinder {
public:
	// Labels are stored in uint8_t cells (0 = empty), so at most 255 islands fit.
	static const int MAX_ISLANDS = 256;

	// Labels every cell of `box` for which `volume_predicate_func(global_pos)` is true.
	// `output` must have exactly box.size.volume() elements (ZXY order) and receives the
	// island ID of each cell; IDs are consecutive starting from 1, 0 means empty.
	// If `out_count` is not null, it receives the number of distinct islands found.
	// Crashes (CRASH_COND) if more than MAX_ISLANDS-1 temporary labels are needed.
	template <typename VolumePredicate_F>
	void scan_3d(Box3i box, VolumePredicate_F volume_predicate_func, Span<uint8_t> output, unsigned int *out_count) {
		int volume = box.size.volume();
		CRASH_COND(output.size() != volume);
		memset(output.data(), 0, volume * sizeof(uint8_t));
		memset(_equivalences.data(), 0, MAX_ISLANDS * sizeof(uint8_t));

		// Labels of the three already-visited neighbors of the current cell
		int top_label = 0;
		int left_label = 0;
		int back_label = 0;
		int next_unique_label = 1;

		// First pass: assign temporary labels and record equivalences between
		// labels of touching chunks.
		Vector3i pos;
		for (pos.z = 0; pos.z < box.size.z; ++pos.z) {
			for (pos.x = 0; pos.x < box.size.x; ++pos.x) {
				// TODO I initially wrote this algorithm in ZYX order, but translated to ZXY when porting to C++.
				// `left` means `top`, and `top` means `left`.
				left_label = 0;

				for (pos.y = 0; pos.y < box.size.y; ++pos.y) {
					int label = 0;

					if (volume_predicate_func(box.pos + pos)) {
						// Fetch labels of the -Z and -X neighbors (0 on the border)
						if (pos.z > 0) {
							back_label = output[Vector3i(pos.x, pos.y, pos.z - 1).get_zxy_index(box.size)];
						} else {
							back_label = 0;
						}
						if (pos.x > 0) {
							top_label = output[Vector3i(pos.x - 1, pos.y, pos.z).get_zxy_index(box.size)];
						} else {
							top_label = 0;
						}

						// Pick the smallest non-zero neighbor label and record equivalences
						// between differing neighbor labels.
						// TODO This soup of ifs is the first that worked for me, but there must be a way to simplify
						if (left_label == 0 && top_label == 0 && back_label == 0) {
							// No labelled neighbor: start a new temporary label
							// TODO Make the algorithm return instead, it's hard for the caller to handle it otherwise
							CRASH_COND(next_unique_label >= MAX_ISLANDS);
							_equivalences[next_unique_label] = 0;
							label = next_unique_label;
							++next_unique_label;

						} else if (left_label == 0 && top_label == 0) {
							// Only one neighbor is labelled: inherit it
							label = back_label;

						} else if (left_label == 0 && back_label == 0) {
							label = top_label;

						} else if (top_label == 0 && back_label == 0) {
							label = left_label;

						} else if (left_label == 0 || (top_label != 0 && back_label != 0 &&
															  (left_label == top_label || left_label == back_label))) {
							// Two effective candidates: top and back
							if (top_label == back_label) {
								label = back_label;
							} else if (top_label < back_label) {
								label = top_label;
								add_equivalence(back_label, top_label);
							} else {
								label = back_label;
								add_equivalence(top_label, back_label);
							}

						} else if (top_label == 0 || (left_label != 0 && back_label != 0 &&
															 (top_label == left_label || top_label == back_label))) {
							// Two effective candidates: left and back
							if (left_label == back_label) {
								label = back_label;
							} else if (left_label < back_label) {
								label = left_label;
								add_equivalence(back_label, left_label);
							} else {
								label = back_label;
								add_equivalence(left_label, back_label);
							}

						} else if (back_label == 0 || (left_label != 0 && top_label != 0 &&
															  (back_label == left_label || back_label == top_label))) {
							// Two effective candidates: left and top
							if (left_label == top_label) {
								label = top_label;
							} else if (left_label < top_label) {
								label = left_label;
								add_equivalence(top_label, left_label);
							} else {
								label = top_label;
								add_equivalence(left_label, top_label);
							}

						} else {
							// Three distinct labels: keep the smallest, chain the others to it
							int a[3] = { left_label, top_label, back_label };
							SortArray<int> sa;
							sa.sort(a, 3);
							label = a[0];
							add_equivalence(a[1], a[0]);
							add_equivalence(a[2], a[1]);
						}

						output[pos.get_zxy_index(box.size)] = label;
					}

					left_label = label;
				}
			}
		}

		// Second pass: collapse equivalences and renumber labels from 1.
		flatten_equivalences();
		int count = compact_labels(next_unique_label);

		if (out_count != nullptr) {
			*out_count = count;
		}

		// Rewrite the grid with final labels
		for (unsigned int i = 0; i < output.size(); ++i) {
			uint8_t &c = output[i];
			uint8_t e = _equivalences[c];
			if (e != 0) {
				c = e;
			}
		}
	}

private:
	// Records that `upper` and `lower` belong to the same island.
	// Invariant: every equivalence maps a label to a strictly smaller one;
	// conflicting previous links are re-chained recursively towards the smallest label.
	void add_equivalence(int upper, int lower) {
		CRASH_COND(upper <= lower);
		int prev_lower = _equivalences[upper];

		if (prev_lower == 0) {
			_equivalences[upper] = lower;

		} else if (prev_lower > lower) {
			_equivalences[upper] = lower;
			add_equivalence(prev_lower, lower);

		} else if (prev_lower < lower) {
			add_equivalence(lower, prev_lower);
		}
		// prev_lower == lower: already recorded, nothing to do
	}

	// Makes sure equivalences go straight to the label without transitive links
	void flatten_equivalences() {
		for (int i = 1; i < MAX_ISLANDS; ++i) {
			int e = _equivalences[i];
			if (e == 0) {
				continue;
			}
			// Follow the chain down to its root
			int e2 = _equivalences[e];
			while (e2 != 0) {
				e = e2;
				e2 = _equivalences[e];
			}
			_equivalences[i] = e;
		}
	}

	// Make sure labels obtained from equivalences are sequential and start from 1.
	// Returns total label count.
	// Requires flatten_equivalences() to have run first, so that every link points
	// directly at a root label (which, being smaller, was already renumbered).
	int compact_labels(int equivalences_count) {
		int next_label = 1;
		for (int i = 1; i < equivalences_count; ++i) {
			const int e = _equivalences[i];
			if (e == 0) {
				// That label has no equivalent, give it an index
				_equivalences[i] = next_label;
				next_label += 1;
			} else {
				// That label has an equivalent, give it that index instead
				int e2 = _equivalences[e];
				_equivalences[i] = e2;
			}
		}
		// We started from 1, but end with what would have been the next ID, so we subtract 1 to obtain the count
		return next_label - 1;
	}

private:
	// _equivalences[label] holds, after the passes above, either a smaller equivalent
	// label (first pass) or the final compacted label (after compact_labels).
	FixedArray<uint8_t, MAX_ISLANDS> _equivalences;
};
#endif // ISLAND_FINDER_H