render: significantly improve performance of TransformBuffers

Before these changes, a release build of 'many-lights' ran at about 130 fps; now it runs at about 430 fps.
SeanOMik 2024-04-21 00:54:45 -04:00
parent 24e1c0281e
commit 8eac563229
Signed by: SeanOMik
GPG Key ID: FEC9E2FC15235964
9 changed files with 410 additions and 133 deletions

Cargo.lock (generated)

@@ -1849,6 +1849,7 @@ dependencies = [
"tracing-log 0.1.4",
"tracing-subscriber",
"tracing-tracy",
"unique",
"uuid",
"wgpu",
"winit",
@@ -3565,6 +3566,12 @@ version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c"
[[package]]
name = "unique"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d360722e1f3884f5b14d332185f02ff111f771f0c76a313268fe6af1409aba96"
[[package]]
name = "url"
version = "2.5.0"


@@ -15,5 +15,8 @@ fps_counter = "3.0.0"
linker = "/usr/bin/clang"
rustflags = ["-Clink-arg=-fuse-ld=lld", "-Clink-arg=-Wl,--no-rosegment"]
[profile.dev]
opt-level = 1
[profile.release]
debug = true


@@ -1,6 +1,6 @@
use std::{collections::{HashMap, VecDeque, HashSet}, ptr::NonNull};
use tracing::{debug_span, info_span};
use tracing::{debug_span, info_span, instrument};
use super::System;
@@ -60,6 +60,7 @@ impl GraphExecutor {
}
/// Executes the systems in the graph
#[instrument(skip(self, world_ptr, stop_on_error))]
pub fn execute(&mut self, mut world_ptr: NonNull<World>, stop_on_error: bool)
-> Result<Vec<GraphExecutorError>, GraphExecutorError> {
let mut stack = VecDeque::new();
@@ -71,13 +72,11 @@ impl GraphExecutor {
let mut possible_errors = Vec::new();
let sys_span = info_span!("graph_exec", system=tracing::field::Empty);
while let Some(node) = stack.pop_front() {
let system = self.systems.get_mut(node.as_str()).unwrap();
sys_span.record("system", system.name.clone());
let _e = sys_span.enter();
let span = info_span!("graph_exec", system=system.name.clone());
let _e = span.enter();
if let Err(e) = system.system.execute(world_ptr)
.map_err(|e| GraphExecutorError::SystemError(node, e)) {


@@ -21,7 +21,7 @@ tracing-tracy = { version = "0.11.0", optional = true }
async-std = { version = "1.12.0", features = [ "unstable", "attributes" ] }
cfg-if = "1"
bytemuck = { version = "1.12", features = [ "derive" ] }
bytemuck = { version = "1.12", features = [ "derive", "min_const_generics" ] }
image = { version = "0.24", default-features = false, features = ["png", "jpeg"] }
anyhow = "1.0"
instant = "0.1"
@@ -33,6 +33,7 @@ quote = "1.0.29"
uuid = { version = "1.5.0", features = ["v4", "fast-rng"] }
itertools = "0.11.0"
thiserror = "1.0.56"
unique = "0.9.1"
[features]
tracy = ["dep:tracing-tracy"]
tracy = ["dep:tracing-tracy"]


@@ -0,0 +1,292 @@
use std::{alloc::Layout, cmp, marker::PhantomData, mem};
use std::{alloc, ptr};
use unique::Unique;
/// A [`Vec`] with its elements aligned to a runtime alignment value.
pub struct AVec<T> {
buf: Unique<u8>,
cap: usize,
len: usize,
align: usize,
_marker: PhantomData<T>,
}
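// An illustrative sketch of the intended use (assuming a 256-byte alignment,
// like wgpu's typical `min_uniform_buffer_offset_alignment`):
//
//     let mut v: AVec<glam::Mat4> = AVec::new(256);
//     v.push(glam::Mat4::IDENTITY);
//     assert_eq!(v.len(), 1); // one element, occupying a full 256-byte slot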
impl<T> AVec<T> {
// Tiny Vecs are dumb. Skip to:
// - 8 if the element size is 1, because any heap allocators are likely
// to round up a request of less than 8 bytes to at least 8 bytes.
// - 4 if elements are moderate-sized (<= 1 KiB).
// - 1 otherwise, to avoid wasting too much space for very short Vecs.
//
// Taken from Rust's standard library RawVec
pub(crate) const MIN_NON_ZERO_CAP: usize = if mem::size_of::<T>() == 1 {
8
} else if mem::size_of::<T>() <= 1024 {
4
} else {
1
};
#[inline]
pub fn new(alignment: usize) -> Self {
debug_assert!(mem::size_of::<T>() > 0, "ZSTs not yet supported");
Self {
buf: Unique::dangling(),
cap: 0,
len: 0,
align: alignment,
_marker: PhantomData
}
}
/// Constructs a new, empty `AVec` with at least the specified capacity.
///
/// The aligned vector will be able to hold at least `capacity` elements without reallocating.
/// This method may allocate for more elements than `capacity`. If `capacity` is zero,
/// the vector will not allocate.
///
/// # Panics
///
/// Panics if the capacity exceeds `usize::MAX` bytes.
#[inline]
pub fn with_capacity(alignment: usize, capacity: usize) -> Self {
let mut s = Self::new(alignment);
if capacity > 0 {
unsafe {
s.grow_amortized(0, capacity);
}
}
s
}
/// Calculates the size of the 'slot' for a single **aligned** item.
#[inline(always)]
fn slot_size(&self) -> usize {
// round the element size up to the nearest multiple of the alignment
let a = self.align - 1;
(mem::size_of::<T>() + a) & !a
}
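// e.g. with `align` = 256, a 128-byte `TransformNormalMatPair` rounds up to a
// full 256-byte slot: (128 + 255) & !255 == 256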
/// # Panics
///
/// Panics if the new capacity exceeds `usize::MAX` bytes.
#[inline]
unsafe fn grow_amortized(&mut self, len: usize, additional: usize) {
debug_assert!(additional > 0);
let required_cap = len.checked_add(additional)
.expect("Capacity overflow");
let cap = cmp::max(self.cap * 2, required_cap);
let cap = cmp::max(Self::MIN_NON_ZERO_CAP, cap);
let new_layout = Layout::from_size_align_unchecked(cap * self.slot_size(), self.align);
let ptr = alloc::alloc(new_layout);
// preserve the existing elements and free the old allocation, otherwise
// growing would lose the vector's contents and leak the old buffer
if self.cap > 0 {
ptr::copy_nonoverlapping(self.buf.as_ptr(), ptr, len * self.slot_size());
let old_layout = Layout::from_size_align_unchecked(self.cap * self.slot_size(), self.align);
alloc::dealloc(self.buf.as_ptr(), old_layout);
}
self.buf = Unique::new_unchecked(ptr);
self.cap = cap;
}
/// # Panics
///
/// Panics if the new capacity exceeds `usize::MAX` bytes.
#[inline]
unsafe fn grow_exact(&mut self, len: usize, additional: usize) {
debug_assert!(additional > 0);
let cap = len.checked_add(additional)
.expect("Capacity overflow");
let new_layout = Layout::from_size_align_unchecked(cap * self.slot_size(), self.align);
let ptr = alloc::alloc(new_layout);
// preserve the existing elements and free the old allocation, otherwise
// growing would lose the vector's contents and leak the old buffer
if self.cap > 0 {
ptr::copy_nonoverlapping(self.buf.as_ptr(), ptr, len * self.slot_size());
let old_layout = Layout::from_size_align_unchecked(self.cap * self.slot_size(), self.align);
alloc::dealloc(self.buf.as_ptr(), old_layout);
}
self.buf = Unique::new_unchecked(ptr);
self.cap = cap;
}
/// Reserves capacity for at least `additional` more elements.
///
/// The collection may reserve more space to speculatively avoid frequent reallocations.
/// After calling `reserve`, capacity will be greater than or equal to
/// `self.len() + additional`. Does nothing if capacity is already sufficient.
///
/// # Panics
///
/// Panics if the new capacity exceeds `usize::MAX` bytes.
#[inline]
pub fn reserve(&mut self, additional: usize) {
debug_assert!(additional > 0);
let remaining = self.capacity().wrapping_sub(self.len);
if additional > remaining {
unsafe { self.grow_amortized(self.len, additional) };
}
}
/// Reserves capacity for `additional` more elements.
///
/// Unlike [`reserve`], this will not over-allocate to speculatively avoid frequent
/// reallocations. After calling `reserve_exact`, capacity will be equal to
/// `self.len() + additional`. Does nothing if the capacity is already sufficient.
///
/// Prefer [`reserve`] if future insertions are expected.
///
/// # Panics
///
/// Panics if the new capacity exceeds `usize::MAX` bytes.
#[inline]
pub fn reserve_exact(&mut self, additional: usize) {
let remaining = self.capacity().wrapping_sub(self.len);
if additional > remaining {
unsafe { self.grow_exact(self.len, additional) };
}
}
/// Appends an element to the back of the collection.
///
/// # Panics
///
/// Panics if the new capacity exceeds `usize::MAX` bytes.
#[inline]
pub fn push(&mut self, val: T) {
if self.len == self.cap {
// reserve room for one more element (`reserve` grows amortized)
self.reserve(1);
}
unsafe {
// SAFETY: the length is ensured to be less than the capacity.
self.set_at_unchecked(self.len, val);
}
self.len += 1;
}
/// Sets an element at position `idx` within the vector to `val`.
///
/// # Safety
///
/// The caller must ensure that `idx` is within the vector's capacity: if
/// `idx >= self.len`, bytes past the length of the vector will be written to, and if
/// `idx >= self.cap` the write goes past the vector's allocation.
#[inline(always)]
unsafe fn set_at_unchecked(&mut self, idx: usize, val: T) {
let ptr = self.buf
.as_ptr()
.add(idx * self.slot_size());
std::ptr::write(ptr.cast::<T>(), val);
}
/// Sets an element at position `idx` within the vector to `val`.
///
/// # Panics
///
/// Panics if `idx >= self.len`.
#[inline(always)]
pub fn set_at(&mut self, idx: usize, val: T) {
assert!(self.len > idx);
unsafe {
self.set_at_unchecked(idx, val);
}
}
/// Shortens the vector, keeping the first `len` elements and dropping the rest.
///
/// If `len` is greater than or equal to the vector's current length, this has no effect.
#[inline]
pub fn truncate(&mut self, len: usize) {
if len > self.len {
return;
}
unsafe {
// drop each element past the new length
for i in len..self.len {
let ptr = self.buf.as_ptr()
.add(i * self.slot_size())
.cast::<T>();
ptr::drop_in_place(ptr);
}
}
self.len = len;
}
#[inline(always)]
pub fn as_ptr(&self) -> *const u8 {
self.buf.as_ptr()
}
#[inline(always)]
pub fn as_mut_ptr(&mut self) -> *mut u8 {
self.buf.as_ptr()
}
/// Returns the alignment of the elements in the vector.
#[inline(always)]
pub fn align(&self) -> usize {
self.align
}
/// Returns the length of the vector.
#[inline(always)]
pub fn len(&self) -> usize {
self.len
}
/// Returns the capacity of the vector.
///
/// The capacity is the number of elements that the vector can store without reallocating.
#[inline(always)]
pub fn capacity(&self) -> usize {
self.cap
}
}
impl<T: Clone> AVec<T> {
/// Resizes the `AVec` in-place so that `len` is equal to `new_len`.
///
/// If `new_len` is greater than `len`, the `AVec` is extended by the difference, and
/// each additional slot is filled with `value`. If `new_len` is less than `len`,
/// the `AVec` is truncated to `new_len`.
///
/// This method requires `T` to implement [`Clone`] in order to clone the passed value.
///
/// # Panics
///
/// Panics if the new capacity exceeds `usize::MAX` bytes.
#[inline]
pub fn resize(&mut self, new_len: usize, value: T) {
if new_len > self.len {
let additional = new_len - self.len;
self.reserve(additional);
unsafe {
let mut ptr = self.buf
.as_ptr().add(self.len * self.slot_size());
// clone `value` into every new slot besides the last one
for _ in 1..additional {
std::ptr::write(ptr.cast::<T>(), value.clone());
ptr = ptr.add(self.slot_size());
}
// the last element takes ownership of `value`, avoiding one clone
std::ptr::write(ptr.cast::<T>(), value);
self.len = new_len;
}
} else {
self.truncate(new_len);
}
}
}


@@ -12,4 +12,5 @@ pub mod camera;
pub mod window;
pub mod transform_buffer_storage;
pub mod light;
pub mod light_cull_compute;
pub mod light_cull_compute;
pub mod avec;


@@ -169,7 +169,7 @@ impl BasicRenderer {
format: surface_format,
width: size.width,
height: size.height,
present_mode,
present_mode: wgpu::PresentMode::Immediate,
alpha_mode: surface_caps.alpha_modes[0],
view_formats: vec![],
};
@@ -448,7 +448,8 @@ impl Renderer for BasicRenderer {
alive_entities.insert(entity);
if let Some((mesh_han, mesh_epoch)) = mesh_pair {
let interop_pos = self.interpolate_transforms(now_inst, last_epoch, entity, &transform, transform_epoch);
// TODO: speed up interpolating transforms
let interop_pos = *transform; //self.interpolate_transforms(now_inst, last_epoch, entity, &transform, transform_epoch);
if let Some(mesh) = mesh_han.data_ref() {
// if process mesh did not just create a new mesh, and the epoch
@@ -464,8 +465,8 @@ impl Renderer for BasicRenderer {
}
let group = TransformGroup::EntityRes(entity, mesh_han.uuid());
let transform_id = self.transform_buffers.update_or_push(&self.queue, &self.render_limits,
group, || ( interop_pos.calculate_mat4(), glam::Mat3::from_quat(interop_pos.rotation) ));
let transform_id = self.transform_buffers.update_or_push(&self.device, &self.queue, &self.render_limits,
group, interop_pos.calculate_mat4(), glam::Mat3::from_quat(interop_pos.rotation));
let material = mesh.material.as_ref().unwrap()
.data_ref().unwrap();
@@ -482,7 +483,8 @@ impl Renderer for BasicRenderer {
lyra_scene::system_update_world_transforms(scene.world(), view).unwrap();
}
let interpo_pos = self.interpolate_transforms(now_inst, last_epoch, entity, &transform, transform_epoch);
// TODO: speed up interpolating transforms
let interpo_pos = *transform; //self.interpolate_transforms(now_inst, last_epoch, entity, &transform, transform_epoch);
for (mesh_han, pos) in scene.world().view_iter::<(&MeshHandle, &WorldTransform)>() {
if let Some(mesh) = mesh_han.data_ref() {
@@ -502,8 +504,8 @@ impl Renderer for BasicRenderer {
let scene_mesh_group = TransformGroup::Res(scene_han.uuid(), mesh_han.uuid());
let group = TransformGroup::OwnedGroup(entity, scene_mesh_group.into());
let transform_id = self.transform_buffers.update_or_push(&self.queue, &self.render_limits,
group, || ( mesh_interpo.calculate_mat4(), glam::Mat3::from_quat(mesh_interpo.rotation) ));
let transform_id = self.transform_buffers.update_or_push(&self.device, &self.queue, &self.render_limits,
group, mesh_interpo.calculate_mat4(), glam::Mat3::from_quat(mesh_interpo.rotation) );
let material = mesh.material.as_ref().unwrap()
.data_ref().unwrap();
@@ -517,7 +519,7 @@ impl Renderer for BasicRenderer {
}
// collect dead entities
self.transform_buffers.tick();
self.transform_buffers.send_to_gpu(&self.queue);
// when buffer storage length does not match the amount of iterated entities,
// remove all dead entities, and their buffers, if they weren't iterated over
@@ -611,7 +613,7 @@ impl Renderer for BasicRenderer {
// Get the bindgroup for job's transform and bind to it using an offset.
let bindgroup = self.transform_buffers.bind_group(job.transform_id);
let offset = self.transform_buffers.buffer_offset(job.transform_id);
render_pass.set_bind_group(1, bindgroup, &[ offset, offset, ]);
render_pass.set_bind_group(1, bindgroup, &[ offset, ]);
render_pass.set_bind_group(2, &self.camera_buffer.bindgroup(), &[]);
render_pass.set_bind_group(3, &self.light_buffers.bind_group_pair.bindgroup, &[]);


@@ -21,6 +21,11 @@ struct VertexOutput {
@location(2) world_normal: vec3<f32>,
}
struct TransformData {
transform: mat4x4<f32>,
normal_matrix: mat4x4<f32>,
}
struct CameraUniform {
view: mat4x4<f32>,
inverse_projection: mat4x4<f32>,
@@ -51,9 +56,7 @@ struct Lights {
};
@group(1) @binding(0)
var<uniform> u_model_transform: mat4x4<f32>;
@group(1) @binding(1)
var<uniform> u_model_normal_matrix: mat4x4<f32>;
var<uniform> u_model_transform_data: TransformData;
@group(2) @binding(0)
var<uniform> u_camera: CameraUniform;
@@ -68,13 +71,14 @@ fn vs_main(
var out: VertexOutput;
out.tex_coords = model.tex_coords;
out.clip_position = u_camera.view_projection * u_model_transform * vec4<f32>(model.position, 1.0);
out.clip_position = u_camera.view_projection * u_model_transform_data.transform * vec4<f32>(model.position, 1.0);
// the normal mat is actually only a mat3x3, but there's a bug in wgpu: https://github.com/gfx-rs/wgpu-rs/issues/36
let normal_mat = mat3x3(u_model_normal_matrix[0].xyz, u_model_normal_matrix[1].xyz, u_model_normal_matrix[2].xyz);
let normal_mat4 = u_model_transform_data.normal_matrix;
let normal_mat = mat3x3(normal_mat4[0].xyz, normal_mat4[1].xyz, normal_mat4[2].xyz);
out.world_normal = normalize(normal_mat * model.normal);
var world_position: vec4<f32> = u_model_transform * vec4<f32>(model.position, 1.0);
var world_position: vec4<f32> = u_model_transform_data.transform * vec4<f32>(model.position, 1.0);
out.world_position = world_position.xyz;
return out;


@@ -7,6 +7,8 @@ use wgpu::Limits;
use std::mem;
use crate::render::avec::AVec;
/// A group id created from a [`TransformGroup`].
///
/// This is mainly created so that [`TransformGroup::OwnedGroup`] can use another group inside of it.
@@ -67,8 +69,10 @@ pub struct TransformIndex {
struct BufferEntry {
pub len: usize,
pub bindgroup: wgpu::BindGroup,
pub transform_buffer: wgpu::Buffer,
pub normal_buffer: wgpu::Buffer,
pub buffer: wgpu::Buffer,
transforms: AVec<TransformNormalMatPair>,
//pub normal_buffer: wgpu::Buffer,
}
/// A HashMap that caches values for reuse.
@@ -159,10 +163,12 @@ impl<K: Hash + Eq + PartialEq + Clone, V: Clone, S: BuildHasher> CachedValMap<K,
/// update, and retrieve the transforms.
pub struct TransformBuffers {
pub bindgroup_layout: wgpu::BindGroupLayout,
groups: CachedValMap<TransformGroupId, TransformIndex>,
//groups: CachedValMap<TransformGroupId, TransformIndex>,
//groups: SlotMap<TransformGroupId, TransformIndex>,
entries: Vec<BufferEntry>,
limits: wgpu::Limits,
max_transform_count: usize,
next_index: usize,
}
impl TransformBuffers {
@@ -181,26 +187,16 @@ impl TransformBuffers {
},
count: None,
},
wgpu::BindGroupLayoutEntry {
binding: 1,
visibility: wgpu::ShaderStages::VERTEX,
ty: wgpu::BindingType::Buffer {
ty: wgpu::BufferBindingType::Uniform,
has_dynamic_offset: true,
min_binding_size: None,
},
count: None,
}
],
label: Some("transform_bind_group_layout"),
});
let mut s = Self {
bindgroup_layout,
groups: Default::default(),
entries: Default::default(),
max_transform_count: (limits.max_uniform_buffer_binding_size / 2) as usize / (mem::size_of::<glam::Mat4>()),
max_transform_count: (limits.max_uniform_buffer_binding_size) as usize / (limits.min_uniform_buffer_offset_alignment as usize), //(mem::size_of::<glam::Mat4>()),
limits,
next_index: 0,
};
// create the first uniform buffer
@@ -209,73 +205,59 @@ impl TransformBuffers {
s
}
/// Update an existing transform in the buffers.
/// Write the transform buffers to the gpu.
///
/// # Panics
/// Panics if the `entity_group` is not already inside of the buffers.
#[instrument(skip(self, queue, limits, entity_group, transform, normal_matrix))]
pub fn update_transform(&mut self, queue: &wgpu::Queue, limits: &Limits, entity_group: TransformGroup, transform: glam::Mat4, normal_matrix: glam::Mat3) -> TransformIndex {
let index = *self.groups.get(entity_group.into())
.expect("Use 'push_transform' for new entities");
let entry = self.entries.get_mut(index.entry_index).unwrap();
/// This uses [`wgpu::Queue::write_buffer`], so the write is not immediately submitted,
/// and instead enqueued internally to happen at the start of the next submit() call.
pub fn send_to_gpu(&mut self, queue: &wgpu::Queue) {
self.next_index = 0;
let normal_matrix = glam::Mat4::from_mat3(normal_matrix);
for entry in &mut self.entries {
entry.len = 0;
// write the transform and normal to the end of the transform
let offset = Self::get_buffer_offset(limits, index) as _;
queue.write_buffer(&entry.transform_buffer, offset, bytemuck::bytes_of(&transform));
queue.write_buffer(&entry.normal_buffer, offset, bytemuck::bytes_of(&normal_matrix));
let p = entry.transforms.as_ptr();
let bytes = unsafe { std::slice::from_raw_parts(p as *const u8, entry.transforms.len() * entry.transforms.align()) };
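// each element occupies one `align`-sized slot (the slot size equals the
// alignment whenever the element is no larger than it), so `len * align`
// covers exactly the initialized bytes of the aligned vector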
index
}
/// Push a new transform into the buffers.
#[instrument(skip(self, queue, limits, entity_group, transform, normal_matrix))]
pub fn push_transform(&mut self, queue: &wgpu::Queue, limits: &Limits, entity_group: TransformGroup, transform: glam::Mat4, normal_matrix: glam::Mat3) -> TransformIndex {
self.groups.insert(entity_group.into(), || {
// this closure is only called when there are no values that can be reused,
// so we get a brand new index at the end of the last entry in the chain.
let last = self.entries.last_mut().unwrap();
// ensure the gpu buffer is not overflown
debug_assert!(last.len < self.max_transform_count,
"Transform buffer is filled and 'next_indices' was not incremented! \
Was a new buffer created?");
let tidx = last.len;
last.len += 1;
TransformIndex {
entry_index: self.entries.len() - 1,
transform_index: tidx
}
});
self.update_transform(queue, limits, entity_group, transform, normal_matrix)
}
/// Collect the dead transforms and prepare self to check next time.
pub fn tick(&mut self) {
self.groups.update();
}
/// Returns a boolean indicating if the buffer contains this group.
pub fn contains(&self, group: TransformGroup) -> bool {
self.groups.contains(group.into())
queue.write_buffer(&entry.buffer, 0, bytes);
}
}
/// Updates an existing transform group, or pushes it into the buffer if it does not yet exist.
///
/// Returns: the index that the transform is at in the buffers.
#[instrument(skip(self, queue, limits, group, transform_fn))]
pub fn update_or_push<F>(&mut self, queue: &wgpu::Queue, limits: &Limits, group: TransformGroup, transform_fn: F) -> TransformIndex
where F: Fn() -> (glam::Mat4, glam::Mat3)
#[instrument(skip(self, device, queue, limits, group, transform, normal_matrix))]
#[inline(always)]
pub fn update_or_push(&mut self, device: &wgpu::Device, queue: &wgpu::Queue, limits: &Limits, group: TransformGroup, transform: glam::Mat4, normal_matrix: glam::Mat3) -> TransformIndex
{
let (transform, normal_matrix) = transform_fn();
if self.contains(group) {
self.update_transform(queue, limits, group, transform, normal_matrix)
} else {
self.push_transform(queue, limits, group, transform, normal_matrix)
// maybe will be used at some point again
let _ = (queue, limits, group);
let normal_matrix = glam::Mat4::from_mat3(normal_matrix);
let index = self.next_index;
self.next_index += 1;
// the index of the entry to put the transform into
let entry_index = index / self.max_transform_count;
// the index of the transform in the buffer
let transform_index = index % self.max_transform_count;
if entry_index >= self.entries.len() {
self.expand_buffers(device);
}
let entry = self.entries.get_mut(entry_index).unwrap();
// write the transform and normal matrix into the entry's slot at `transform_index`
entry.transforms.set_at(transform_index, TransformNormalMatPair {
transform,
normal_mat: normal_matrix,
});
entry.len += 1;
TransformIndex {
entry_index: 0,
transform_index: index,
}
}
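// e.g. with `max_transform_count` = 1024, a global index of 1500 maps to
// entry_index 1 (1500 / 1024) and transform_index 476 (1500 % 1024)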
@@ -297,21 +279,9 @@ impl TransformBuffers {
}
);
let normal_mat_buffer = device.create_buffer(
&wgpu::BufferDescriptor {
label: Some(&format!("B_NormalMatrix_{}", self.entries.len())),
usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST,
size: max_buffer_sizes,
mapped_at_creation: false,
}
);
let tran_stride = mem::size_of::<TransformNormalMatPair>();
let tran_stride = mem::size_of::<glam::Mat4>();
// although a normal matrix only needs to be a mat3, there's a weird issue with
// misalignment from wgpu or spirv-cross: https://github.com/gfx-rs/wgpu-rs/issues/36
let norm_stride = mem::size_of::<glam::Mat4>();
let transform_bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
let bindgroup = device.create_bind_group(&wgpu::BindGroupDescriptor {
layout: &self.bindgroup_layout,
entries: &[
wgpu::BindGroupEntry {
@@ -324,42 +294,34 @@ impl TransformBuffers {
}
)
},
wgpu::BindGroupEntry {
binding: 1,
resource: wgpu::BindingResource::Buffer(
wgpu::BufferBinding {
buffer: &normal_mat_buffer,
offset: 0,
size: Some(NonZeroU64::new(norm_stride as u64).unwrap())
}
)
}
],
label: Some("BG_Transforms"),
});
let mut transforms = AVec::new(limits.min_uniform_buffer_offset_alignment as _);
transforms.resize(self.max_transform_count, TransformNormalMatPair {
transform: glam::Mat4::IDENTITY,
normal_mat: glam::Mat4::IDENTITY,
});
let entry = BufferEntry {
bindgroup: transform_bind_group,
transform_buffer,
normal_buffer: normal_mat_buffer,
bindgroup,
buffer: transform_buffer,
len: 0,
transforms,
};
self.entries.push(entry);
}
/// Returns the bind group for the transform index.
#[inline(always)]
pub fn bind_group(&self, transform_id: TransformIndex) -> &wgpu::BindGroup {
let entry = self.entries.get(transform_id.entry_index).unwrap();
let entry_index = transform_id.transform_index / self.max_transform_count;
let entry = self.entries.get(entry_index).unwrap();
&entry.bindgroup
}
/// Get the buffer offset for a transform using wgpu limits.
///
/// If its possible to borrow immutably, use [`TransformBuffers::buffer_offset`].
fn get_buffer_offset(limits: &wgpu::Limits, transform_index: TransformIndex) -> u32 {
transform_index.transform_index as u32 * limits.min_uniform_buffer_offset_alignment as u32
}
/// Returns the offset of the transform inside the bind group buffer.
///
/// ```nobuild
@@ -367,15 +329,21 @@ impl TransformBuffers {
/// let offset = transform_buffers.buffer_offset(job.transform_id);
/// render_pass.set_bind_group(1, bindgroup, &[ offset, offset, ]);
/// ```
#[inline(always)]
pub fn buffer_offset(&self, transform_index: TransformIndex) -> u32 {
Self::get_buffer_offset(&self.limits, transform_index)
//Self::get_buffer_offset(&self.limits, transform_index)
let transform_index = transform_index.transform_index % self.max_transform_count;
let t = transform_index as u32 * self.limits.min_uniform_buffer_offset_alignment as u32;
//debug!("offset: {t}");
t
}
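// e.g. transform slot 3 with a 256-byte `min_uniform_buffer_offset_alignment`
// yields a dynamic offset of 3 * 256 = 768 bytes into the entry's buffer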
/// Returns a boolean indicating if the buffers need to be expanded
pub fn needs_expand(&self) -> bool {
self.entries.last()
false
/* self.entries.last()
.map(|entry| entry.len >= self.max_transform_count)
.unwrap_or(false)
.unwrap_or(false) */
}
}