astc_decoder: Reduce workgroup size

This reduces the amount of over-dispatching when there are odd dimensions (e.g. ASTC 8x5), which rarely divide evenly into 32x32.
astc_decoder: Compute offset swizzles in-shader
2021-08-01 01:22:27 -04:00 · 2021-08-01 01:22:26 -04:00 · 2021-07-31 22:28:04 -04:00 · 2021-07-31 21:36:26 -04:00 · 2021-07-31 21:36:26 -04:00 · 2021-07-31 21:26:42 -04:00
12 changed files with 455 additions and 472 deletions
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp
@@ -166,6 +166,8 @@ NvResult nvhost_nvdec_common::MapBuffer(const std::vector<u8>& input, std::vecto
            LOG_ERROR(Service_NVDRV, "failed to map size={}", object->size);
        } else {
            cmd_buffer.map_address = object->dma_map_addr;
+            AddBufferMap(object->dma_map_addr, object->size, object->addr,
+                         object->status == nvmap::Object::Status::Allocated);
        }
    }
    std::memcpy(output.data(), &params, sizeof(IoctlMapBuffer));
@@ -176,11 +178,30 @@ NvResult nvhost_nvdec_common::MapBuffer(const std::vector<u8>& input, std::vecto
 }

 NvResult nvhost_nvdec_common::UnmapBuffer(const std::vector<u8>& input, std::vector<u8>& output) {
-    // This is intntionally stubbed.
-    // Skip unmapping buffers here, as to not break the continuity of the VP9 reference frame
-    // addresses, and risk invalidating data before the async GPU thread is done with it
+    // For every entry in the request: look up its nvmap object, drop the object's DMA address
+    // from buffer_mappings, call MemoryManager().Unmap() on it and clear dma_map_addr.
+    // Returns InvalidState on the first handle nvmap does not know, Success otherwise.
+    IoctlMapBuffer params{};
+    std::memcpy(&params, input.data(), sizeof(IoctlMapBuffer));
+    std::vector<MapBufferEntry> cmd_buffer_handles(params.num_entries);
+    // Presumably fills the entries from the bytes that follow the header -- confirm in
+    // SliceVectors
+    SliceVectors(input, cmd_buffer_handles, params.num_entries, sizeof(IoctlMapBuffer));
+
+    auto& gpu = system.GPU();
+
+    for (auto& cmd_buffer : cmd_buffer_handles) {
+        const auto object{nvmap_dev->GetObject(cmd_buffer.map_handle)};
+        if (!object) {
+            LOG_ERROR(Service_NVDRV, "invalid cmd_buffer nvmap_handle={:X}", cmd_buffer.map_handle);
+            // params is only sizeof(IoctlMapBuffer) bytes long; clamp the copy so a larger
+            // output buffer cannot make memcpy read past the end of params.
+            const std::size_t header_size = std::min(output.size(), sizeof(IoctlMapBuffer));
+            std::memcpy(output.data(), &params, header_size);
+            return NvResult::InvalidState;
+        }
+        if (const auto size{RemoveBufferMap(object->dma_map_addr)}; size) {
+            gpu.MemoryManager().Unmap(object->dma_map_addr, *size);
+        } else {
+            // This occurs quite frequently, however does not seem to impact functionality
+            LOG_DEBUG(Service_NVDRV, "invalid offset=0x{:X} dma=0x{:X}", object->addr,
+                      object->dma_map_addr);
+        }
+        object->dma_map_addr = 0;
+    }
     std::memset(output.data(), 0, output.size());
-    LOG_DEBUG(Service_NVDRV, "(STUBBED) called");
     return NvResult::Success;
 }

@@ -191,4 +212,33 @@ NvResult nvhost_nvdec_common::SetSubmitTimeout(const std::vector<u8>& input,
    return NvResult::Success;
 }

+/// Looks up the tracked mapping whose [StartAddr, EndAddr) range contains gpu_addr.
+/// Returns std::nullopt when no tracked mapping contains the address.
+std::optional<nvhost_nvdec_common::BufferMap> nvhost_nvdec_common::FindBufferMap(
+    GPUVAddr gpu_addr) const {
+    // Mappings are keyed by their start address, so only the entries that precede
+    // upper_bound(gpu_addr) can contain the address.
+    const auto last = buffer_mappings.upper_bound(gpu_addr);
+    const auto it = std::find_if(buffer_mappings.begin(), last, [&](const auto& entry) {
+        return gpu_addr >= entry.second.StartAddr() && gpu_addr < entry.second.EndAddr();
+    });
+
+    // std::find_if returns its `last` argument on a miss, which is not necessarily end();
+    // comparing against end() would hand back an unrelated mapping or dereference end().
+    if (it == last) {
+        return std::nullopt;
+    }
+    return it->second;
+}
+
+/// Records (or overwrites) the mapping that starts at gpu_addr in buffer_mappings.
+void nvhost_nvdec_common::AddBufferMap(GPUVAddr gpu_addr, std::size_t size, VAddr cpu_addr,
+                                       bool is_allocated) {
+    const BufferMap mapping{gpu_addr, size, cpu_addr, is_allocated};
+    // The start address doubles as the map key
+    buffer_mappings.insert_or_assign(gpu_addr, mapping);
+}
+
+/// Erases the mapping keyed by gpu_addr. Returns std::nullopt when no such mapping is tracked;
+/// otherwise returns the mapping's size, or 0 if the mapping was not flagged as allocated.
+std::optional<std::size_t> nvhost_nvdec_common::RemoveBufferMap(GPUVAddr gpu_addr) {
+    const auto entry = buffer_mappings.find(gpu_addr);
+    if (entry == buffer_mappings.end()) {
+        return std::nullopt;
+    }
+    // Read the size before the erase invalidates the iterator
+    const BufferMap& mapping = entry->second;
+    const std::size_t mapped_size = mapping.IsAllocated() ? mapping.Size() : 0;
+    buffer_mappings.erase(entry);
+    return mapped_size;
+}
+
 } // namespace Service::Nvidia::Devices
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h
@@ -23,6 +23,45 @@ public:
    ~nvhost_nvdec_common() override;

 protected:
+    /// Describes one GPU virtual address range tracked in buffer_mappings, together with the
+    /// address passed as cpu_addr and a flag telling whether the backing nvmap object was in
+    /// the Allocated state when the range was recorded.
+    class BufferMap final {
+    public:
+        constexpr BufferMap() = default;
+
+        constexpr BufferMap(GPUVAddr start_addr_, std::size_t size_)
+            : start_addr{start_addr_}, end_addr{start_addr_ + size_} {}
+
+        constexpr BufferMap(GPUVAddr start_addr_, std::size_t size_, VAddr cpu_addr_,
+                            bool is_allocated_)
+            : start_addr{start_addr_}, end_addr{start_addr_ + size_}, cpu_addr{cpu_addr_},
+              is_allocated{is_allocated_} {}
+
+        /// First GPU virtual address of the range (inclusive).
+        // Returns GPUVAddr to match the stored member; the original declared VAddr here.
+        // NOTE(review): assumes VAddr and GPUVAddr alias the same integer type -- confirm.
+        constexpr GPUVAddr StartAddr() const {
+            return start_addr;
+        }
+
+        /// GPU virtual address one past the end of the range (start + size).
+        constexpr GPUVAddr EndAddr() const {
+            return end_addr;
+        }
+
+        /// Length of the range in bytes, recomputed from the two stored bounds.
+        constexpr std::size_t Size() const {
+            return end_addr - start_addr;
+        }
+
+        /// Address supplied as cpu_addr at construction (zero for the two-argument form).
+        constexpr VAddr CpuAddr() const {
+            return cpu_addr;
+        }
+
+        /// Whether the mapping was recorded as allocated (false for the two-argument form).
+        constexpr bool IsAllocated() const {
+            return is_allocated;
+        }
+
+    private:
+        GPUVAddr start_addr{};
+        GPUVAddr end_addr{};
+        VAddr cpu_addr{};
+        bool is_allocated{};
+    };
+
    struct IoctlSetNvmapFD {
        s32_le nvmap_fd{};
    };
@@ -115,11 +154,17 @@ protected:
    NvResult UnmapBuffer(const std::vector<u8>& input, std::vector<u8>& output);
    NvResult SetSubmitTimeout(const std::vector<u8>& input, std::vector<u8>& output);

+    std::optional<BufferMap> FindBufferMap(GPUVAddr gpu_addr) const;
+    void AddBufferMap(GPUVAddr gpu_addr, std::size_t size, VAddr cpu_addr, bool is_allocated);
+    std::optional<std::size_t> RemoveBufferMap(GPUVAddr gpu_addr);
+
    s32_le nvmap_fd{};
    u32_le submit_timeout{};
    std::shared_ptr<nvmap> nvmap_dev;
    SyncpointManager& syncpoint_manager;
    std::array<u32, MaxSyncPoints> device_syncpoints{};
+    // This is expected to be ordered, therefore we must use a map, not unordered_map
+    std::map<GPUVAddr, BufferMap> buffer_mappings;
 };
 }; // namespace Devices
 } // namespace Service::Nvidia
--- a/src/video_core/command_classes/codecs/vp9.cpp
+++ b/src/video_core/command_classes/codecs/vp9.cpp
@@ -11,9 +11,6 @@

 namespace Tegra::Decoder {
 namespace {
-constexpr u32 diff_update_probability = 252;
-constexpr u32 frame_sync_code = 0x498342;
-
 // Default compressed header probabilities once frame context resets
 constexpr Vp9EntropyProbs default_probs{
    .y_mode_prob{
@@ -364,7 +361,8 @@ Vp9PictureInfo VP9::GetVp9PictureInfo(const NvdecCommon::NvdecRegisters& state)
    InsertEntropy(state.vp9_entropy_probs_offset, vp9_info.entropy);

    // surface_luma_offset[0:3] contains the address of the reference frame offsets in the following
-    // order: last, golden, altref, current.
+    // order: last, golden, altref, current. It may be worthwhile to track the updates done here
+    // to avoid buffering frame data needed for reference frame updating in the header composition.
    std::copy(state.surface_luma_offset.begin(), state.surface_luma_offset.begin() + 4,
              vp9_info.frame_offsets.begin());

@@ -386,18 +384,33 @@ Vp9FrameContainer VP9::GetCurrentFrame(const NvdecCommon::NvdecRegisters& state)
        gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, current_frame.bit_stream.data(),
                                      current_frame.info.bitstream_size);
    }
-    if (!next_frame.bit_stream.empty()) {
+    // Buffer two frames, saving the last show frame info
+    if (!next_next_frame.bit_stream.empty()) {
        Vp9FrameContainer temp{
            .info = current_frame.info,
            .bit_stream = std::move(current_frame.bit_stream),
        };
-        next_frame.info.show_frame = current_frame.info.last_frame_shown;
-        current_frame.info = next_frame.info;
-        current_frame.bit_stream = std::move(next_frame.bit_stream);
-        next_frame = std::move(temp);
+        next_next_frame.info.show_frame = current_frame.info.last_frame_shown;
+        current_frame.info = next_next_frame.info;
+        current_frame.bit_stream = std::move(next_next_frame.bit_stream);
+        next_next_frame = std::move(temp);
+
+        if (!next_frame.bit_stream.empty()) {
+            Vp9FrameContainer temp2{
+                .info = current_frame.info,
+                .bit_stream = std::move(current_frame.bit_stream),
+            };
+            next_frame.info.show_frame = current_frame.info.last_frame_shown;
+            current_frame.info = next_frame.info;
+            current_frame.bit_stream = std::move(next_frame.bit_stream);
+            next_frame = std::move(temp2);
+        } else {
+            next_frame.info = current_frame.info;
+            next_frame.bit_stream = std::move(current_frame.bit_stream);
+        }
    } else {
-        next_frame.info = current_frame.info;
-        next_frame.bit_stream = std::move(current_frame.bit_stream);
+        next_next_frame.info = current_frame.info;
+        next_next_frame.bit_stream = std::move(current_frame.bit_stream);
    }
    return current_frame;
 }
@@ -600,64 +613,86 @@ VpxBitStreamWriter VP9::ComposeUncompressedHeader() {

        // Reset context
        prev_frame_probs = default_probs;
-        swap_ref_indices = false;
+        swap_next_golden = false;
        loop_filter_ref_deltas.fill(0);
        loop_filter_mode_deltas.fill(0);
-        frame_ctxs.fill(default_probs);
+
+        // allow frames offsets to stabilize before checking for golden frames
+        grace_period = 4;
+
+        // On key frames, all frame slots are set to the current frame,
+        // so the value of the selected slot doesn't really matter.
+        frame_ctxs.fill({current_frame_number, false, default_probs});

        // intra only, meaning the frame can be recreated with no other references
        current_frame_info.intra_only = true;
+
    } else {
+
        if (!current_frame_info.show_frame) {
            uncomp_writer.WriteBit(current_frame_info.intra_only);
+            if (!current_frame_info.last_frame_was_key) {
+                swap_next_golden = !swap_next_golden;
+            }
        } else {
            current_frame_info.intra_only = false;
        }
        if (!current_frame_info.error_resilient_mode) {
            uncomp_writer.WriteU(0, 2); // Reset frame context.
        }
-        const auto& curr_offsets = current_frame_info.frame_offsets;
-        const auto& next_offsets = next_frame.info.frame_offsets;
-        const bool ref_frames_different = curr_offsets[1] != curr_offsets[2];
-        const bool next_references_swap =
-            (next_offsets[1] == curr_offsets[2]) || (next_offsets[2] == curr_offsets[1]);
-        const bool needs_ref_swap = ref_frames_different && next_references_swap;
-        if (needs_ref_swap) {
-            swap_ref_indices = !swap_ref_indices;
-        }
-        union {
-            u32 raw;
-            BitField<0, 1, u32> refresh_last;
-            BitField<1, 2, u32> refresh_golden;
-            BitField<2, 1, u32> refresh_alt;
-        } refresh_frame_flags;

-        refresh_frame_flags.raw = 0;
-        for (u32 index = 0; index < 3; ++index) {
-            // Refresh indices that use the current frame as an index
-            if (curr_offsets[3] == next_offsets[index]) {
-                refresh_frame_flags.raw |= 1u << index;
+        // Last, Golden, Altref frames
+        std::array<s32, 3> ref_frame_index{0, 1, 2};
+
+        // Set when next frame is hidden
+        // altref and golden references are swapped
+        if (swap_next_golden) {
+            ref_frame_index = std::array<s32, 3>{0, 2, 1};
+        }
+
+        // update Last Frame
+        u64 refresh_frame_flags = 1;
+
+        // golden frame may refresh, determined if the next golden frame offset is changed
+        bool golden_refresh = false;
+        if (grace_period <= 0) {
+            for (s32 index = 1; index < 3; ++index) {
+                if (current_frame_info.frame_offsets[index] !=
+                    next_frame.info.frame_offsets[index]) {
+                    current_frame_info.refresh_frame[index] = true;
+                    golden_refresh = true;
+                    grace_period = 3;
+                }
            }
        }
-        if (swap_ref_indices) {
-            const u32 temp = refresh_frame_flags.refresh_golden;
-            refresh_frame_flags.refresh_golden.Assign(refresh_frame_flags.refresh_alt.Value());
-            refresh_frame_flags.refresh_alt.Assign(temp);
+
+        if (current_frame_info.show_frame &&
+            (!next_frame.info.show_frame || next_frame.info.is_key_frame)) {
+            // Update golden frame
+            refresh_frame_flags = swap_next_golden ? 2 : 4;
        }
+
+        if (!current_frame_info.show_frame) {
+            // Update altref
+            refresh_frame_flags = swap_next_golden ? 2 : 4;
+        } else if (golden_refresh) {
+            refresh_frame_flags = 3;
+        }
+
        if (current_frame_info.intra_only) {
            uncomp_writer.WriteU(frame_sync_code, 24);
-            uncomp_writer.WriteU(refresh_frame_flags.raw, 8);
+            uncomp_writer.WriteU(static_cast<s32>(refresh_frame_flags), 8);
            uncomp_writer.WriteU(current_frame_info.frame_size.width - 1, 16);
            uncomp_writer.WriteU(current_frame_info.frame_size.height - 1, 16);
            uncomp_writer.WriteBit(false); // Render and frame size different.
        } else {
-            const bool swap_indices = needs_ref_swap ^ swap_ref_indices;
-            const auto ref_frame_index = swap_indices ? std::array{0, 2, 1} : std::array{0, 1, 2};
-            uncomp_writer.WriteU(refresh_frame_flags.raw, 8);
-            for (size_t index = 1; index < 4; index++) {
+            uncomp_writer.WriteU(static_cast<s32>(refresh_frame_flags), 8);
+
+            for (s32 index = 1; index < 4; index++) {
                uncomp_writer.WriteU(ref_frame_index[index - 1], 3);
                uncomp_writer.WriteU(current_frame_info.ref_frame_sign_bias[index], 1);
            }
+
            uncomp_writer.WriteBit(true);  // Frame size with refs.
            uncomp_writer.WriteBit(false); // Render and frame size different.
            uncomp_writer.WriteBit(current_frame_info.allow_high_precision_mv);
@@ -679,9 +714,10 @@ VpxBitStreamWriter VP9::ComposeUncompressedHeader() {
        frame_ctx_idx = 1;
    }

-    uncomp_writer.WriteU(frame_ctx_idx, 2);       // Frame context index.
-    prev_frame_probs = frame_ctxs[frame_ctx_idx]; // reference probabilities for compressed header
-    frame_ctxs[frame_ctx_idx] = current_frame_info.entropy;
+    uncomp_writer.WriteU(frame_ctx_idx, 2); // Frame context index.
+    prev_frame_probs =
+        frame_ctxs[frame_ctx_idx].probs; // reference probabilities for compressed header
+    frame_ctxs[frame_ctx_idx] = {current_frame_number, false, current_frame_info.entropy};

    uncomp_writer.WriteU(current_frame_info.first_level, 6);
    uncomp_writer.WriteU(current_frame_info.sharpness_level, 3);
@@ -776,6 +812,7 @@ const std::vector<u8>& VP9::ComposeFrameHeader(const NvdecCommon::NvdecRegisters
        current_frame_info = curr_frame.info;
        bitstream = std::move(curr_frame.bit_stream);
    }
+
    // The uncompressed header routine sets PrevProb parameters needed for the compressed header
    auto uncomp_writer = ComposeUncompressedHeader();
    std::vector<u8> compressed_header = ComposeCompressedHeader();
@@ -791,6 +828,13 @@ const std::vector<u8>& VP9::ComposeFrameHeader(const NvdecCommon::NvdecRegisters
              frame.begin() + uncompressed_header.size());
    std::copy(bitstream.begin(), bitstream.end(),
              frame.begin() + uncompressed_header.size() + compressed_header.size());
+
+    // keep track of frame number
+    current_frame_number++;
+    grace_period--;
+
+    // don't display hidden frames
+    hidden = !current_frame_info.show_frame;
    return frame;
 }

--- a/src/video_core/command_classes/codecs/vp9.h
+++ b/src/video_core/command_classes/codecs/vp9.h
@@ -14,6 +14,7 @@

 namespace Tegra {
 class GPU;
+enum class FrameType { KeyFrame = 0, InterFrame = 1 };
 namespace Decoder {

 /// The VpxRangeEncoder, and VpxBitStreamWriter classes are used to compose the
@@ -123,7 +124,7 @@ public:

    /// Returns true if the most recent frame was a hidden frame.
    [[nodiscard]] bool WasFrameHidden() const {
-        return !current_frame_info.show_frame;
+        return hidden;
    }

 private:
@@ -177,12 +178,19 @@ private:
    std::array<s8, 4> loop_filter_ref_deltas{};
    std::array<s8, 2> loop_filter_mode_deltas{};

+    bool hidden = false;
+    s64 current_frame_number = -2; // since we buffer 2 frames
+    s32 grace_period = 6;          // frame offsets need to stabilize
+    std::array<FrameContexts, 4> frame_ctxs{};
    Vp9FrameContainer next_frame{};
-    std::array<Vp9EntropyProbs, 4> frame_ctxs{};
-    bool swap_ref_indices{};
+    Vp9FrameContainer next_next_frame{};
+    bool swap_next_golden{};

    Vp9PictureInfo current_frame_info{};
    Vp9EntropyProbs prev_frame_probs{};
+
+    s32 diff_update_probability = 252;
+    s32 frame_sync_code = 0x498342;
 };

 } // namespace Decoder
--- a/src/video_core/command_classes/codecs/vp9_types.h
+++ b/src/video_core/command_classes/codecs/vp9_types.h
@@ -296,6 +296,12 @@ struct RefPoolElement {
    bool refresh{};
 };

+/// One slot of the VP9 frame context table (VP9::frame_ctxs).
+/// Members are value-initialized by default, matching RefPoolElement, so a default-constructed
+/// slot never holds indeterminate values. The struct stays an aggregate, so
+/// `{frame_number, false, probs}` initialization keeps working.
+struct FrameContexts {
+    s64 from{};              ///< Frame number that last wrote this slot
+    bool adapted{};          ///< Stored as false by every writer visible in this change
+    Vp9EntropyProbs probs{}; ///< Entropy probabilities saved for this slot
+};
+
 #define ASSERT_POSITION(field_name, position)                                                      \
    static_assert(offsetof(Vp9EntropyProbs, field_name) == position,                               \
                  "Field " #field_name " has invalid position")
--- a/src/video_core/host_shaders/astc_decoder.comp
+++ b/src/video_core/host_shaders/astc_decoder.comp
@@ -10,33 +10,27 @@
 #define END_PUSH_CONSTANTS };
 #define UNIFORM(n)
 #define BINDING_INPUT_BUFFER 0
-#define BINDING_ENC_BUFFER 1
-#define BINDING_SWIZZLE_BUFFER 2
-#define BINDING_OUTPUT_IMAGE 3
+#define BINDING_OUTPUT_IMAGE 1

 #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv

 #define BEGIN_PUSH_CONSTANTS
 #define END_PUSH_CONSTANTS
 #define UNIFORM(n) layout(location = n) uniform
-#define BINDING_SWIZZLE_BUFFER 0
-#define BINDING_INPUT_BUFFER 1
-#define BINDING_ENC_BUFFER 2
+#define BINDING_INPUT_BUFFER 0
 #define BINDING_OUTPUT_IMAGE 0

 #endif

-layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in;
+layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;

 BEGIN_PUSH_CONSTANTS
 UNIFORM(1) uvec2 block_dims;
-
-UNIFORM(2) uint bytes_per_block_log2;
-UNIFORM(3) uint layer_stride;
-UNIFORM(4) uint block_size;
-UNIFORM(5) uint x_shift;
-UNIFORM(6) uint block_height;
-UNIFORM(7) uint block_height_mask;
+UNIFORM(2) uint layer_stride;
+UNIFORM(3) uint block_size;
+UNIFORM(4) uint x_shift;
+UNIFORM(5) uint block_height;
+UNIFORM(6) uint block_height_mask;
 END_PUSH_CONSTANTS

 struct EncodingData {
@@ -55,45 +49,35 @@ struct TexelWeightParams {
    bool void_extent_hdr;
 };

-// Swizzle data
-layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable {
-    uint swizzle_table[];
-};
-
 layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU32 {
-    uint astc_data[];
-};
-
-// ASTC Encodings data
-layout(binding = BINDING_ENC_BUFFER, std430) readonly buffer EncodingsValues {
-    EncodingData encoding_values[];
+    uvec4 astc_data[];
 };

 layout(binding = BINDING_OUTPUT_IMAGE, rgba8) uniform writeonly image2DArray dest_image;

-const uint GOB_SIZE_X = 64;
-const uint GOB_SIZE_Y = 8;
-const uint GOB_SIZE_Z = 1;
-const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z;
-
 const uint GOB_SIZE_X_SHIFT = 6;
 const uint GOB_SIZE_Y_SHIFT = 3;
-const uint GOB_SIZE_Z_SHIFT = 0;
-const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT;
+const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT;

-const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1, GOB_SIZE_Y - 1);
-
-const int BLOCK_SIZE_IN_BYTES = 16;
-
-const int BLOCK_INFO_ERROR = 0;
-const int BLOCK_INFO_VOID_EXTENT_HDR = 1;
-const int BLOCK_INFO_VOID_EXTENT_LDR = 2;
-const int BLOCK_INFO_NORMAL = 3;
+const uint BYTES_PER_BLOCK_LOG2 = 4;

 const int JUST_BITS = 0;
 const int QUINT = 1;
 const int TRIT = 2;

+// ASTC Encodings data, sorted in ascending order based on their BitLength value
+// (see GetBitLength() function)
+EncodingData encoding_values[22] = EncodingData[](
+    EncodingData(JUST_BITS, 0, 0, 0), EncodingData(JUST_BITS, 1, 0, 0), EncodingData(TRIT, 0, 0, 0),
+    EncodingData(JUST_BITS, 2, 0, 0), EncodingData(QUINT, 0, 0, 0), EncodingData(TRIT, 1, 0, 0),
+    EncodingData(JUST_BITS, 3, 0, 0), EncodingData(QUINT, 1, 0, 0), EncodingData(TRIT, 2, 0, 0),
+    EncodingData(JUST_BITS, 4, 0, 0), EncodingData(QUINT, 2, 0, 0), EncodingData(TRIT, 3, 0, 0),
+    EncodingData(JUST_BITS, 5, 0, 0), EncodingData(QUINT, 3, 0, 0), EncodingData(TRIT, 4, 0, 0),
+    EncodingData(JUST_BITS, 6, 0, 0), EncodingData(QUINT, 4, 0, 0), EncodingData(TRIT, 5, 0, 0),
+    EncodingData(JUST_BITS, 7, 0, 0), EncodingData(QUINT, 5, 0, 0), EncodingData(TRIT, 6, 0, 0),
+    EncodingData(JUST_BITS, 8, 0, 0)
+);
+
 // The following constants are expanded variants of the Replicate()
 // function calls corresponding to the following arguments:
 // value: index into the generated table
@@ -135,44 +119,37 @@ const uint REPLICATE_7_BIT_TO_8_TABLE[128] =
 // Input ASTC texture globals
 uint current_index = 0;
 int bitsread = 0;
-uint total_bitsread = 0;
-uint local_buff[16];
+int total_bitsread = 0;
+uvec4 local_buff;

 // Color data globals
-uint color_endpoint_data[16];
+uvec4 color_endpoint_data;
 int color_bitsread = 0;
-uint total_color_bitsread = 0;
-int color_index = 0;

 // Four values, two endpoints, four maximum paritions
 uint color_values[32];
 int colvals_index = 0;

 // Weight data globals
-uint texel_weight_data[16];
+uvec4 texel_weight_data;
 int texel_bitsread = 0;
-uint total_texel_bitsread = 0;
-int texel_index = 0;

 bool texel_flag = false;

 // Global "vectors" to be pushed into when decoding
-EncodingData result_vector[100];
+EncodingData result_vector[144];
 int result_index = 0;

-EncodingData texel_vector[100];
+EncodingData texel_vector[144];
 int texel_vector_index = 0;

 uint unquantized_texel_weights[2][144];

 uint SwizzleOffset(uvec2 pos) { // computes the offset arithmetically; replaces the swizzle_table lookup
-    pos = pos & SWIZZLE_MASK;
-    return swizzle_table[pos.y * 64 + pos.x];
-}
-
-uint ReadTexel(uint offset) {
-    // extract the 8-bit value from the 32-bit packed data.
-    return bitfieldExtract(astc_data[offset / 4], int((offset * 8) & 24), 8);
+    uint x = pos.x;
+    uint y = pos.y;
+    return ((x % 64) / 32) * 256 + ((y % 8) / 2) * 64 + ((x % 32) / 16) * 32 + // only x % 64 and y % 8 contribute
+                          (y % 2) * 16 + (x % 16);
 }

 // Replicates low num_bits such that [(to_bit - 1):(to_bit - 1 - from_bit)]
@@ -278,14 +255,10 @@ uint Hash52(uint p) {
    return p;
 }

-uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bool small_block) {
-    if (partition_count == 1) {
-        return 0;
-    }
+uint Select2DPartition(uint seed, uint x, uint y, uint partition_count, bool small_block) {
    if (small_block) {
        x <<= 1;
        y <<= 1;
-        z <<= 1;
    }

    seed += (partition_count - 1) * 1024;
@@ -299,10 +272,6 @@ uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bo
    uint seed6 = uint((rnum >> 20) & 0xF);
    uint seed7 = uint((rnum >> 24) & 0xF);
    uint seed8 = uint((rnum >> 28) & 0xF);
-    uint seed9 = uint((rnum >> 18) & 0xF);
-    uint seed10 = uint((rnum >> 22) & 0xF);
-    uint seed11 = uint((rnum >> 26) & 0xF);
-    uint seed12 = uint(((rnum >> 30) | (rnum << 2)) & 0xF);

    seed1 = (seed1 * seed1);
    seed2 = (seed2 * seed2);
@@ -312,12 +281,8 @@ uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bo
    seed6 = (seed6 * seed6);
    seed7 = (seed7 * seed7);
    seed8 = (seed8 * seed8);
-    seed9 = (seed9 * seed9);
-    seed10 = (seed10 * seed10);
-    seed11 = (seed11 * seed11);
-    seed12 = (seed12 * seed12);

-    int sh1, sh2, sh3;
+    uint sh1, sh2;
    if ((seed & 1) > 0) {
        sh1 = (seed & 2) > 0 ? 4 : 5;
        sh2 = (partition_count == 3) ? 6 : 5;
@@ -325,25 +290,19 @@ uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bo
        sh1 = (partition_count == 3) ? 6 : 5;
        sh2 = (seed & 2) > 0 ? 4 : 5;
    }
-    sh3 = (seed & 0x10) > 0 ? sh1 : sh2;
+    seed1 >>= sh1;
+    seed2 >>= sh2;
+    seed3 >>= sh1;
+    seed4 >>= sh2;
+    seed5 >>= sh1;
+    seed6 >>= sh2;
+    seed7 >>= sh1;
+    seed8 >>= sh2;

-    seed1 = (seed1 >> sh1);
-    seed2 = (seed2 >> sh2);
-    seed3 = (seed3 >> sh1);
-    seed4 = (seed4 >> sh2);
-    seed5 = (seed5 >> sh1);
-    seed6 = (seed6 >> sh2);
-    seed7 = (seed7 >> sh1);
-    seed8 = (seed8 >> sh2);
-    seed9 = (seed9 >> sh3);
-    seed10 = (seed10 >> sh3);
-    seed11 = (seed11 >> sh3);
-    seed12 = (seed12 >> sh3);
-
-    uint a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14);
-    uint b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10);
-    uint c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6);
-    uint d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2);
+    uint a = seed1 * x + seed2 * y + (rnum >> 14);
+    uint b = seed3 * x + seed4 * y + (rnum >> 10);
+    uint c = seed5 * x + seed6 * y + (rnum >> 6);
+    uint d = seed7 * x + seed8 * y + (rnum >> 2);

    a &= 0x3F;
    b &= 0x3F;
@@ -368,58 +327,37 @@ uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bo
    }
 }

-uint Select2DPartition(uint seed, uint x, uint y, uint partition_count, bool small_block) {
-    return SelectPartition(seed, x, y, 0, partition_count, small_block);
-}
-
-uint ReadBit() {
-    if (current_index >= local_buff.length()) {
+uint ExtractBits(uvec4 payload, int offset, int bits) { // returns `bits` bits of payload starting at bit `offset`
+    if (bits <= 0) { // nothing requested
         return 0;
     }
-    uint bit = bitfieldExtract(local_buff[current_index], bitsread, 1);
-    ++bitsread;
-    ++total_bitsread;
-    if (bitsread == 8) {
-        ++current_index;
-        bitsread = 0;
+    int last_offset = offset + bits - 1; // position of the last requested bit
+    int shifted_offset = offset >> 5; // index of the 32-bit component holding the first bit
+    if ((last_offset >> 5) == shifted_offset) { // the whole field lies inside one component
+        return bitfieldExtract(payload[shifted_offset], offset & 31, bits);
     }
-    return bit;
+    int first_bits = 32 - (offset & 31); // bits taken from the first component
+    int result_first = int(bitfieldExtract(payload[shifted_offset], offset & 31, first_bits));
+    int result_second = int(bitfieldExtract(payload[shifted_offset + 1], 0, bits - first_bits)); // remainder comes from the next component
+    return result_first | (result_second << first_bits); // first component supplies the low bits
 }

 uint StreamBits(uint num_bits) { // reads num_bits from local_buff at the total_bitsread cursor
-    uint ret = 0;
-    for (uint i = 0; i < num_bits; i++) {
-        ret |= ((ReadBit() & 1) << i);
-    }
+    int int_bits = int(num_bits);
+    uint ret = ExtractBits(local_buff, total_bitsread, int_bits);
+    total_bitsread += int_bits; // advance the cursor past the bits just read
     return ret;
 }

-uint ReadColorBit() {
-    uint bit = 0;
-    if (texel_flag) {
-        bit = bitfieldExtract(texel_weight_data[texel_index], texel_bitsread, 1);
-        ++texel_bitsread;
-        ++total_texel_bitsread;
-        if (texel_bitsread == 8) {
-            ++texel_index;
-            texel_bitsread = 0;
-        }
-    } else {
-        bit = bitfieldExtract(color_endpoint_data[color_index], color_bitsread, 1);
-        ++color_bitsread;
-        ++total_color_bitsread;
-        if (color_bitsread == 8) {
-            ++color_index;
-            color_bitsread = 0;
-        }
-    }
-    return bit;
-}
-
 uint StreamColorBits(uint num_bits) { // reads num_bits from the stream selected by texel_flag
     uint ret = 0;
-    for (uint i = 0; i < num_bits; i++) {
-        ret |= ((ReadColorBit() & 1) << i);
+    int int_bits = int(num_bits);
+    if (texel_flag) { // texel weight stream
+        ret = ExtractBits(texel_weight_data, texel_bitsread, int_bits);
+        texel_bitsread += int_bits; // each stream keeps its own read cursor
+    } else { // color endpoint stream
+        ret = ExtractBits(color_endpoint_data, color_bitsread, int_bits);
+        color_bitsread += int_bits;
     }
     return ret;
 }
@@ -596,22 +534,16 @@ void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) {
    for (uint i = 0; i < num_partitions; i++) {
        num_values += ((modes[i] >> 2) + 1) << 1;
    }
-    int range = 256;
-    while (--range > 0) {
-        EncodingData val = encoding_values[range];
+    // Find the largest encoding that's within color_data_bits
+    // TODO(ameerj): profile with binary search
+    int range = 0;
+    while (++range < encoding_values.length()) {
        uint bit_length = GetBitLength(num_values, range);
-        if (bit_length <= color_data_bits) {
-            while (--range > 0) {
-                EncodingData newval = encoding_values[range];
-                if (newval.encoding != val.encoding && newval.num_bits != val.num_bits) {
-                    break;
-                }
-            }
-            ++range;
+        if (bit_length > color_data_bits) {
            break;
        }
    }
-    DecodeIntegerSequence(range, num_values);
+    DecodeIntegerSequence(range - 1, num_values);
    uint out_index = 0;
    for (int itr = 0; itr < result_index; ++itr) {
        if (out_index >= num_values) {
@@ -1028,7 +960,7 @@ int FindLayout(uint mode) {
    return 5;
 }

-TexelWeightParams DecodeBlockInfo(uint block_index) {
+TexelWeightParams DecodeBlockInfo() {
    TexelWeightParams params = TexelWeightParams(uvec2(0), 0, false, false, false, false);
    uint mode = StreamBits(11);
    if ((mode & 0x1ff) == 0x1fc) {
@@ -1110,10 +1042,10 @@ TexelWeightParams DecodeBlockInfo(uint block_index) {
    }
    weight_index -= 2;
    if ((mode_layout != 9) && ((mode & 0x200) != 0)) {
-        const int max_weights[6] = int[6](9, 11, 15, 19, 23, 31);
+        const int max_weights[6] = int[6](7, 8, 9, 10, 11, 12);
        params.max_weight = max_weights[weight_index];
    } else {
-        const int max_weights[6] = int[6](1, 2, 3, 4, 5, 7);
+        const int max_weights[6] = int[6](1, 2, 3, 4, 5, 6);
        params.max_weight = max_weights[weight_index];
    }
    return params;
@@ -1144,8 +1076,8 @@ void FillVoidExtentLDR(ivec3 coord) {
    }
 }

-void DecompressBlock(ivec3 coord, uint block_index) {
-    TexelWeightParams params = DecodeBlockInfo(block_index);
+void DecompressBlock(ivec3 coord) {
+    TexelWeightParams params = DecodeBlockInfo();
    if (params.error_state) {
        FillError(coord);
        return;
@@ -1212,7 +1144,7 @@ void DecompressBlock(ivec3 coord, uint block_index) {
    // Read color data...
    uint color_data_bits = remaining_bits;
    while (remaining_bits > 0) {
-        int nb = int(min(remaining_bits, 8U));
+        int nb = int(min(remaining_bits, 32U));
        uint b = StreamBits(nb);
        color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, nb));
        ++ced_pointer;
@@ -1254,25 +1186,20 @@ void DecompressBlock(ivec3 coord, uint block_index) {
        ComputeEndpoints(endpoints[i][0], endpoints[i][1], color_endpoint_mode[i]);
    }

-    for (uint i = 0; i < 16; i++) {
-        texel_weight_data[i] = local_buff[i];
-    }
-    for (uint i = 0; i < 8; i++) {
-#define REVERSE_BYTE(b) ((b * 0x0802U & 0x22110U) | (b * 0x8020U & 0x88440U)) * 0x10101U >> 16
-        uint a = REVERSE_BYTE(texel_weight_data[i]);
-        uint b = REVERSE_BYTE(texel_weight_data[15 - i]);
-#undef REVERSE_BYTE
-        texel_weight_data[i] = uint(bitfieldExtract(b, 0, 8));
-        texel_weight_data[15 - i] = uint(bitfieldExtract(a, 0, 8));
-    }
+    texel_weight_data = local_buff;
+    texel_weight_data = bitfieldReverse(texel_weight_data).wzyx;
    uint clear_byte_start =
        (GetPackedBitSize(params.size, params.dual_plane, params.max_weight) >> 3) + 1;
-    texel_weight_data[clear_byte_start - 1] =
-        texel_weight_data[clear_byte_start - 1] &
+
+    uint byte_insert = ExtractBits(texel_weight_data, int(clear_byte_start - 1) * 8, 8) &
        uint(
            ((1 << (GetPackedBitSize(params.size, params.dual_plane, params.max_weight) % 8)) - 1));
-    for (uint i = 0; i < 16 - clear_byte_start; i++) {
-        texel_weight_data[clear_byte_start + i] = 0U;
+    uint vec_index = (clear_byte_start - 1) >> 2;
+    texel_weight_data[vec_index] =
+        bitfieldInsert(texel_weight_data[vec_index], byte_insert, int((clear_byte_start - 1) % 4) * 8, 8);
+    for (uint i = clear_byte_start; i < 16; ++i) {
+        uint idx = i >> 2;
+        texel_weight_data[idx] = bitfieldInsert(texel_weight_data[idx], 0, int(i % 4) * 8, 8);
    }
    texel_flag = true; // use texel "vector" and bit stream in integer decoding
    DecodeIntegerSequence(params.max_weight, GetNumWeightValues(params.size, params.dual_plane));
@@ -1281,8 +1208,11 @@ void DecompressBlock(ivec3 coord, uint block_index) {

    for (uint j = 0; j < block_dims.y; j++) {
        for (uint i = 0; i < block_dims.x; i++) {
-            uint local_partition = Select2DPartition(partition_index, i, j, num_partitions,
+            uint local_partition = 0;
+            if (num_partitions > 1) {
+                local_partition = Select2DPartition(partition_index, i, j, num_partitions,
                                                     (block_dims.y * block_dims.x) < 32);
+            }
            vec4 p;
            uvec4 C0 = ReplicateByteTo16(endpoints[local_partition][0]);
            uvec4 C1 = ReplicateByteTo16(endpoints[local_partition][1]);
@@ -1303,7 +1233,7 @@ void DecompressBlock(ivec3 coord, uint block_index) {

 void main() {
    uvec3 pos = gl_GlobalInvocationID;
-    pos.x <<= bytes_per_block_log2;
+    pos.x <<= BYTES_PER_BLOCK_LOG2;

    // Read as soon as possible due to its latency
    const uint swizzle = SwizzleOffset(pos.xy);
@@ -1321,13 +1251,8 @@ void main() {
    if (any(greaterThanEqual(coord, imageSize(dest_image)))) {
        return;
    }
-    uint block_index =
-        pos.z * gl_WorkGroupSize.x * gl_WorkGroupSize.y + pos.y * gl_WorkGroupSize.x + pos.x;
-
    current_index = 0;
    bitsread = 0;
-    for (int i = 0; i < 16; i++) {
-        local_buff[i] = ReadTexel(offset + i);
-    }
-    DecompressBlock(coord, block_index);
+    local_buff = astc_data[offset / 16];
+    DecompressBlock(coord);
 }
--- a/src/video_core/renderer_opengl/util_shaders.cpp
+++ b/src/video_core/renderer_opengl/util_shaders.cpp
@@ -60,19 +60,14 @@ UtilShaders::UtilShaders(ProgramManager& program_manager_)
      copy_bc4_program(MakeProgram(OPENGL_COPY_BC4_COMP)) {
    const auto swizzle_table = Tegra::Texture::MakeSwizzleTable();
    swizzle_table_buffer.Create();
-    astc_buffer.Create();
    glNamedBufferStorage(swizzle_table_buffer.handle, sizeof(swizzle_table), &swizzle_table, 0);
-    glNamedBufferStorage(astc_buffer.handle, sizeof(ASTC_ENCODINGS_VALUES), &ASTC_ENCODINGS_VALUES,
-                         0);
 }

 UtilShaders::~UtilShaders() = default;

 void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map,
                             std::span<const VideoCommon::SwizzleParameters> swizzles) {
-    static constexpr GLuint BINDING_SWIZZLE_BUFFER = 0;
-    static constexpr GLuint BINDING_INPUT_BUFFER = 1;
-    static constexpr GLuint BINDING_ENC_BUFFER = 2;
+    static constexpr GLuint BINDING_INPUT_BUFFER = 0;
    static constexpr GLuint BINDING_OUTPUT_IMAGE = 0;

    const Extent2D tile_size{
@@ -80,34 +75,32 @@ void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map,
        .height = VideoCore::Surface::DefaultBlockHeight(image.info.format),
    };
    program_manager.BindComputeProgram(astc_decoder_program.handle);
-    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle);
-    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_ENC_BUFFER, astc_buffer.handle);
-
    glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes);
    glUniform2ui(1, tile_size.width, tile_size.height);
+
    // Ensure buffer data is valid before dispatching
    glFlush();
    for (const SwizzleParameters& swizzle : swizzles) {
        const size_t input_offset = swizzle.buffer_offset + map.offset;
-        const u32 num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 32U);
-        const u32 num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 32U);
+        const u32 num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 8U);
+        const u32 num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 8U);

        const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info);
        ASSERT(params.origin == (std::array<u32, 3>{0, 0, 0}));
        ASSERT(params.destination == (std::array<s32, 3>{0, 0, 0}));
+        ASSERT(params.bytes_per_block_log2 == 4);

-        glUniform1ui(2, params.bytes_per_block_log2);
-        glUniform1ui(3, params.layer_stride);
-        glUniform1ui(4, params.block_size);
-        glUniform1ui(5, params.x_shift);
-        glUniform1ui(6, params.block_height);
-        glUniform1ui(7, params.block_height_mask);
+        glUniform1ui(2, params.layer_stride);
+        glUniform1ui(3, params.block_size);
+        glUniform1ui(4, params.x_shift);
+        glUniform1ui(5, params.block_height);
+        glUniform1ui(6, params.block_height_mask);

-        glBindImageTexture(BINDING_OUTPUT_IMAGE, image.StorageHandle(), swizzle.level, GL_TRUE, 0,
-                           GL_WRITE_ONLY, GL_RGBA8);
        // ASTC texture data
        glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset,
                          image.guest_size_bytes - swizzle.buffer_offset);
+        glBindImageTexture(BINDING_OUTPUT_IMAGE, image.StorageHandle(), swizzle.level, GL_TRUE, 0,
+                           GL_WRITE_ONLY, GL_RGBA8);

        glDispatchCompute(num_dispatches_x, num_dispatches_y, image.info.resources.layers);
    }
--- a/src/video_core/renderer_opengl/util_shaders.h
+++ b/src/video_core/renderer_opengl/util_shaders.h
@@ -62,7 +62,6 @@ private:
    ProgramManager& program_manager;

    OGLBuffer swizzle_table_buffer;
-    OGLBuffer astc_buffer;

    OGLProgram astc_decoder_program;
    OGLProgram block_linear_unswizzle_2d_program;
--- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp
@@ -30,16 +30,12 @@
 namespace Vulkan {

 using Tegra::Texture::SWIZZLE_TABLE;
-using Tegra::Texture::ASTC::ASTC_ENCODINGS_VALUES;
-using namespace Tegra::Texture::ASTC;

 namespace {

 constexpr u32 ASTC_BINDING_INPUT_BUFFER = 0;
-constexpr u32 ASTC_BINDING_ENC_BUFFER = 1;
-constexpr u32 ASTC_BINDING_SWIZZLE_BUFFER = 2;
-constexpr u32 ASTC_BINDING_OUTPUT_IMAGE = 3;
-constexpr size_t ASTC_NUM_BINDINGS = 4;
+constexpr u32 ASTC_BINDING_OUTPUT_IMAGE = 1;
+constexpr size_t ASTC_NUM_BINDINGS = 2;

 template <size_t size>
 inline constexpr VkPushConstantRange COMPUTE_PUSH_CONSTANT_RANGE{
@@ -75,7 +71,7 @@ constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{
    .score = 2,
 };

-constexpr std::array<VkDescriptorSetLayoutBinding, 4> ASTC_DESCRIPTOR_SET_BINDINGS{{
+constexpr std::array<VkDescriptorSetLayoutBinding, ASTC_NUM_BINDINGS> ASTC_DESCRIPTOR_SET_BINDINGS{{
    {
        .binding = ASTC_BINDING_INPUT_BUFFER,
        .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
@@ -83,20 +79,6 @@ constexpr std::array<VkDescriptorSetLayoutBinding, 4> ASTC_DESCRIPTOR_SET_BINDIN
        .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
        .pImmutableSamplers = nullptr,
    },
-    {
-        .binding = ASTC_BINDING_ENC_BUFFER,
-        .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
-        .descriptorCount = 1,
-        .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
-        .pImmutableSamplers = nullptr,
-    },
-    {
-        .binding = ASTC_BINDING_SWIZZLE_BUFFER,
-        .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
-        .descriptorCount = 1,
-        .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
-        .pImmutableSamplers = nullptr,
-    },
    {
        .binding = ASTC_BINDING_OUTPUT_IMAGE,
        .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
@@ -108,12 +90,12 @@ constexpr std::array<VkDescriptorSetLayoutBinding, 4> ASTC_DESCRIPTOR_SET_BINDIN

 constexpr DescriptorBankInfo ASTC_BANK_INFO{
    .uniform_buffers = 0,
-    .storage_buffers = 3,
+    .storage_buffers = 1,
    .texture_buffers = 0,
    .image_buffers = 0,
    .textures = 0,
    .images = 1,
-    .score = 4,
+    .score = 2,
 };

 constexpr VkDescriptorUpdateTemplateEntryKHR INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLATE{
@@ -135,22 +117,6 @@ constexpr std::array<VkDescriptorUpdateTemplateEntryKHR, ASTC_NUM_BINDINGS>
            .offset = ASTC_BINDING_INPUT_BUFFER * sizeof(DescriptorUpdateEntry),
            .stride = sizeof(DescriptorUpdateEntry),
        },
-        {
-            .dstBinding = ASTC_BINDING_ENC_BUFFER,
-            .dstArrayElement = 0,
-            .descriptorCount = 1,
-            .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
-            .offset = ASTC_BINDING_ENC_BUFFER * sizeof(DescriptorUpdateEntry),
-            .stride = sizeof(DescriptorUpdateEntry),
-        },
-        {
-            .dstBinding = ASTC_BINDING_SWIZZLE_BUFFER,
-            .dstArrayElement = 0,
-            .descriptorCount = 1,
-            .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
-            .offset = ASTC_BINDING_SWIZZLE_BUFFER * sizeof(DescriptorUpdateEntry),
-            .stride = sizeof(DescriptorUpdateEntry),
-        },
        {
            .dstBinding = ASTC_BINDING_OUTPUT_IMAGE,
            .dstArrayElement = 0,
@@ -163,7 +129,6 @@ constexpr std::array<VkDescriptorUpdateTemplateEntryKHR, ASTC_NUM_BINDINGS>

 struct AstcPushConstants {
    std::array<u32, 2> blocks_dims;
-    u32 bytes_per_block_log2;
    u32 layer_stride;
    u32 block_size;
    u32 x_shift;
@@ -354,46 +319,6 @@ ASTCDecoderPass::ASTCDecoderPass(const Device& device_, VKScheduler& scheduler_,

 ASTCDecoderPass::~ASTCDecoderPass() = default;

-void ASTCDecoderPass::MakeDataBuffer() {
-    constexpr size_t TOTAL_BUFFER_SIZE = sizeof(ASTC_ENCODINGS_VALUES) + sizeof(SWIZZLE_TABLE);
-    data_buffer = device.GetLogical().CreateBuffer(VkBufferCreateInfo{
-        .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
-        .pNext = nullptr,
-        .flags = 0,
-        .size = TOTAL_BUFFER_SIZE,
-        .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
-        .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
-        .queueFamilyIndexCount = 0,
-        .pQueueFamilyIndices = nullptr,
-    });
-    data_buffer_commit = memory_allocator.Commit(data_buffer, MemoryUsage::Upload);
-
-    const auto staging_ref = staging_buffer_pool.Request(TOTAL_BUFFER_SIZE, MemoryUsage::Upload);
-    std::memcpy(staging_ref.mapped_span.data(), &ASTC_ENCODINGS_VALUES,
-                sizeof(ASTC_ENCODINGS_VALUES));
-    // Tack on the swizzle table at the end of the buffer
-    std::memcpy(staging_ref.mapped_span.data() + sizeof(ASTC_ENCODINGS_VALUES), &SWIZZLE_TABLE,
-                sizeof(SWIZZLE_TABLE));
-
-    scheduler.Record([src = staging_ref.buffer, offset = staging_ref.offset, dst = *data_buffer,
-                      TOTAL_BUFFER_SIZE](vk::CommandBuffer cmdbuf) {
-        static constexpr VkMemoryBarrier write_barrier{
-            .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
-            .pNext = nullptr,
-            .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
-            .dstAccessMask = VK_ACCESS_SHADER_READ_BIT,
-        };
-        const VkBufferCopy copy{
-            .srcOffset = offset,
-            .dstOffset = 0,
-            .size = TOTAL_BUFFER_SIZE,
-        };
-        cmdbuf.CopyBuffer(src, dst, copy);
-        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
-                               0, write_barrier);
-    });
-}
-
 void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
                               std::span<const VideoCommon::SwizzleParameters> swizzles) {
    using namespace VideoCommon::Accelerated;
@@ -402,9 +327,6 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
        VideoCore::Surface::DefaultBlockHeight(image.info.format),
    };
    scheduler.RequestOutsideRenderPassOperationContext();
-    if (!data_buffer) {
-        MakeDataBuffer();
-    }
    const VkPipeline vk_pipeline = *pipeline;
    const VkImageAspectFlags aspect_mask = image.AspectMask();
    const VkImage vk_image = image.Handle();
@@ -436,16 +358,13 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
        });
    for (const VideoCommon::SwizzleParameters& swizzle : swizzles) {
        const size_t input_offset = swizzle.buffer_offset + map.offset;
-        const u32 num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 32U);
-        const u32 num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 32U);
+        const u32 num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 8U);
+        const u32 num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 8U);
        const u32 num_dispatches_z = image.info.resources.layers;

        update_descriptor_queue.Acquire();
        update_descriptor_queue.AddBuffer(map.buffer, input_offset,
                                          image.guest_size_bytes - swizzle.buffer_offset);
-        update_descriptor_queue.AddBuffer(*data_buffer, 0, sizeof(ASTC_ENCODINGS_VALUES));
-        update_descriptor_queue.AddBuffer(*data_buffer, sizeof(ASTC_ENCODINGS_VALUES),
-                                          sizeof(SWIZZLE_TABLE));
        update_descriptor_queue.AddImage(image.StorageImageView(swizzle.level));
        const void* const descriptor_data{update_descriptor_queue.UpdateData()};

@@ -453,11 +372,11 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
        const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info);
        ASSERT(params.origin == (std::array<u32, 3>{0, 0, 0}));
        ASSERT(params.destination == (std::array<s32, 3>{0, 0, 0}));
+        ASSERT(params.bytes_per_block_log2 == 4);
        scheduler.Record([this, num_dispatches_x, num_dispatches_y, num_dispatches_z, block_dims,
                          params, descriptor_data](vk::CommandBuffer cmdbuf) {
            const AstcPushConstants uniforms{
                .blocks_dims = block_dims,
-                .bytes_per_block_log2 = params.bytes_per_block_log2,
                .layer_stride = params.layer_stride,
                .block_size = params.block_size,
                .x_shift = params.x_shift,
--- a/src/video_core/renderer_vulkan/vk_compute_pass.h
+++ b/src/video_core/renderer_vulkan/vk_compute_pass.h
@@ -96,15 +96,10 @@ public:
                  std::span<const VideoCommon::SwizzleParameters> swizzles);

 private:
-    void MakeDataBuffer();
-
    VKScheduler& scheduler;
    StagingBufferPool& staging_buffer_pool;
    VKUpdateDescriptorQueue& update_descriptor_queue;
    MemoryAllocator& memory_allocator;
-
-    vk::Buffer data_buffer;
-    MemoryCommit data_buffer_commit;
 };

 } // namespace Vulkan
--- a/src/video_core/textures/astc.cpp
+++ b/src/video_core/textures/astc.cpp
@@ -151,6 +151,76 @@ private:
    const IntType& m_Bits;
 };

+enum class IntegerEncoding { JustBits, Quint, Trit };
+
+struct IntegerEncodedValue {
+    constexpr IntegerEncodedValue() = default;
+
+    constexpr IntegerEncodedValue(IntegerEncoding encoding_, u32 num_bits_)
+        : encoding{encoding_}, num_bits{num_bits_} {}
+
+    constexpr bool MatchesEncoding(const IntegerEncodedValue& other) const {
+        return encoding == other.encoding && num_bits == other.num_bits;
+    }
+
+    // Returns the number of bits required to encode num_vals values.
+    u32 GetBitLength(u32 num_vals) const {
+        u32 total_bits = num_bits * num_vals;
+        if (encoding == IntegerEncoding::Trit) {
+            total_bits += (num_vals * 8 + 4) / 5;
+        } else if (encoding == IntegerEncoding::Quint) {
+            total_bits += (num_vals * 7 + 2) / 3;
+        }
+        return total_bits;
+    }
+
+    IntegerEncoding encoding{};
+    u32 num_bits = 0;
+    u32 bit_value = 0;
+    union {
+        u32 quint_value = 0;
+        u32 trit_value;
+    };
+};
+
+// Returns the integer encoding with the largest representable maximum value
+// that does not exceed mav_value (JustBits with 0 bits if mav_value is 0).
+static constexpr IntegerEncodedValue CreateEncoding(u32 mav_value) {
+    while (mav_value > 0) {
+        u32 check = mav_value + 1;
+
+        // Is mav_value of the type 2^n - 1?
+        if (!(check & (check - 1))) {
+            return IntegerEncodedValue(IntegerEncoding::JustBits, std::popcount(mav_value));
+        }
+
+        // Is mav_value of the type 3*2^n - 1?
+        if ((check % 3 == 0) && !((check / 3) & ((check / 3) - 1))) {
+            return IntegerEncodedValue(IntegerEncoding::Trit, std::popcount(check / 3 - 1));
+        }
+
+        // Is mav_value of the type 5*2^n - 1?
+        if ((check % 5 == 0) && !((check / 5) & ((check / 5) - 1))) {
+            return IntegerEncodedValue(IntegerEncoding::Quint, std::popcount(check / 5 - 1));
+        }
+
+        // Apparently it can't be represented with a bounded integer sequence...
+        // just iterate.
+        mav_value--;
+    }
+    return IntegerEncodedValue(IntegerEncoding::JustBits, 0);
+}
+
+static constexpr std::array<IntegerEncodedValue, 256> MakeEncodedValues() {
+    std::array<IntegerEncodedValue, 256> encodings{};
+    for (std::size_t i = 0; i < encodings.size(); ++i) {
+        encodings[i] = CreateEncoding(static_cast<u32>(i));
+    }
+    return encodings;
+}
+
+static constexpr std::array<IntegerEncodedValue, 256> ASTC_ENCODINGS_VALUES = MakeEncodedValues();
+
 namespace Tegra::Texture::ASTC {
 using IntegerEncodedVector = boost::container::static_vector<
    IntegerEncodedValue, 256,
@@ -521,35 +591,41 @@ static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) {
    return params;
 }

-static void FillVoidExtentLDR(InputBitStream& strm, std::span<u32> outBuf, u32 blockWidth,
-                              u32 blockHeight) {
-    // Don't actually care about the void extent, just read the bits...
-    for (s32 i = 0; i < 4; ++i) {
-        strm.ReadBits<13>();
+// Replicates the low num_bits bits of val such that bits [(to_bit - 1):(to_bit - num_bits)]
+// are the same as [(num_bits - 1):0], and repeats the pattern all the way down to bit 0.
+template <typename IntType>
+static constexpr IntType Replicate(IntType val, u32 num_bits, u32 to_bit) {
+    if (num_bits == 0 || to_bit == 0) {
+        return 0;
    }
-
-    // Decode the RGBA components and renormalize them to the range [0, 255]
-    u16 r = static_cast<u16>(strm.ReadBits<16>());
-    u16 g = static_cast<u16>(strm.ReadBits<16>());
-    u16 b = static_cast<u16>(strm.ReadBits<16>());
-    u16 a = static_cast<u16>(strm.ReadBits<16>());
-
-    u32 rgba = (r >> 8) | (g & 0xFF00) | (static_cast<u32>(b) & 0xFF00) << 8 |
-               (static_cast<u32>(a) & 0xFF00) << 16;
-
-    for (u32 j = 0; j < blockHeight; j++) {
-        for (u32 i = 0; i < blockWidth; i++) {
-            outBuf[j * blockWidth + i] = rgba;
+    const IntType v = val & static_cast<IntType>((1 << num_bits) - 1);
+    IntType res = v;
+    u32 reslen = num_bits;
+    while (reslen < to_bit) {
+        u32 comp = 0;
+        if (num_bits > to_bit - reslen) {
+            u32 newshift = to_bit - reslen;
+            comp = num_bits - newshift;
+            num_bits = newshift;
        }
+        res = static_cast<IntType>(res << num_bits);
+        res = static_cast<IntType>(res | (v >> comp));
+        reslen += num_bits;
    }
+    return res;
 }

-static void FillError(std::span<u32> outBuf, u32 blockWidth, u32 blockHeight) {
-    for (u32 j = 0; j < blockHeight; j++) {
-        for (u32 i = 0; i < blockWidth; i++) {
-            outBuf[j * blockWidth + i] = 0xFFFF00FF;
-        }
+static constexpr std::size_t NumReplicateEntries(u32 num_bits) {
+    return std::size_t(1) << num_bits;
+}
+
+template <typename IntType, u32 num_bits, u32 to_bit>
+static constexpr auto MakeReplicateTable() {
+    std::array<IntType, NumReplicateEntries(num_bits)> table{};
+    for (IntType value = 0; value < static_cast<IntType>(std::size(table)); ++value) {
+        table[value] = Replicate(value, num_bits, to_bit);
    }
+    return table;
 }

 static constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable<u32, 8, 16>();
@@ -572,6 +648,9 @@ static constexpr auto REPLICATE_2_BIT_TO_8_TABLE = MakeReplicateTable<u32, 2, 8>
 static constexpr auto REPLICATE_3_BIT_TO_8_TABLE = MakeReplicateTable<u32, 3, 8>();
 static constexpr auto REPLICATE_4_BIT_TO_8_TABLE = MakeReplicateTable<u32, 4, 8>();
 static constexpr auto REPLICATE_5_BIT_TO_8_TABLE = MakeReplicateTable<u32, 5, 8>();
+static constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable<u32, 6, 8>();
+static constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable<u32, 7, 8>();
+static constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable<u32, 8, 8>();
 /// Use a precompiled table with the most common usages, if it's not in the expected range, fallback
 /// to the runtime implementation
 static constexpr u32 FastReplicateTo8(u32 value, u32 num_bits) {
@@ -1316,6 +1395,37 @@ static void ComputeEndpoints(Pixel& ep1, Pixel& ep2, const u32*& colorValues,
 #undef READ_INT_VALUES
 }

+static void FillVoidExtentLDR(InputBitStream& strm, std::span<u32> outBuf, u32 blockWidth,
+                              u32 blockHeight) {
+    // Don't actually care about the void extent, just read the bits...
+    for (s32 i = 0; i < 4; ++i) {
+        strm.ReadBits<13>();
+    }
+
+    // Decode the RGBA components and renormalize them to the range [0, 255]
+    u16 r = static_cast<u16>(strm.ReadBits<16>());
+    u16 g = static_cast<u16>(strm.ReadBits<16>());
+    u16 b = static_cast<u16>(strm.ReadBits<16>());
+    u16 a = static_cast<u16>(strm.ReadBits<16>());
+
+    u32 rgba = (r >> 8) | (g & 0xFF00) | (static_cast<u32>(b) & 0xFF00) << 8 |
+               (static_cast<u32>(a) & 0xFF00) << 16;
+
+    for (u32 j = 0; j < blockHeight; j++) {
+        for (u32 i = 0; i < blockWidth; i++) {
+            outBuf[j * blockWidth + i] = rgba;
+        }
+    }
+}
+
+static void FillError(std::span<u32> outBuf, u32 blockWidth, u32 blockHeight) {
+    for (u32 j = 0; j < blockHeight; j++) {
+        for (u32 i = 0; i < blockWidth; i++) {
+            outBuf[j * blockWidth + i] = 0xFFFF00FF;
+        }
+    }
+}
+
 static void DecompressBlock(std::span<const u8, 16> inBuf, const u32 blockWidth,
                            const u32 blockHeight, std::span<u32, 12 * 12> outBuf) {
    InputBitStream strm(inBuf);
--- a/src/video_core/textures/astc.h
+++ b/src/video_core/textures/astc.h
@@ -9,117 +9,6 @@

 namespace Tegra::Texture::ASTC {

-enum class IntegerEncoding { JustBits, Quint, Trit };
-
-struct IntegerEncodedValue {
-    constexpr IntegerEncodedValue() = default;
-
-    constexpr IntegerEncodedValue(IntegerEncoding encoding_, u32 num_bits_)
-        : encoding{encoding_}, num_bits{num_bits_} {}
-
-    constexpr bool MatchesEncoding(const IntegerEncodedValue& other) const {
-        return encoding == other.encoding && num_bits == other.num_bits;
-    }
-
-    // Returns the number of bits required to encode num_vals values.
-    u32 GetBitLength(u32 num_vals) const {
-        u32 total_bits = num_bits * num_vals;
-        if (encoding == IntegerEncoding::Trit) {
-            total_bits += (num_vals * 8 + 4) / 5;
-        } else if (encoding == IntegerEncoding::Quint) {
-            total_bits += (num_vals * 7 + 2) / 3;
-        }
-        return total_bits;
-    }
-
-    IntegerEncoding encoding{};
-    u32 num_bits = 0;
-    u32 bit_value = 0;
-    union {
-        u32 quint_value = 0;
-        u32 trit_value;
-    };
-};
-
-// Returns a new instance of this struct that corresponds to the
-// can take no more than mav_value values
-constexpr IntegerEncodedValue CreateEncoding(u32 mav_value) {
-    while (mav_value > 0) {
-        u32 check = mav_value + 1;
-
-        // Is mav_value a power of two?
-        if (!(check & (check - 1))) {
-            return IntegerEncodedValue(IntegerEncoding::JustBits, std::popcount(mav_value));
-        }
-
-        // Is mav_value of the type 3*2^n - 1?
-        if ((check % 3 == 0) && !((check / 3) & ((check / 3) - 1))) {
-            return IntegerEncodedValue(IntegerEncoding::Trit, std::popcount(check / 3 - 1));
-        }
-
-        // Is mav_value of the type 5*2^n - 1?
-        if ((check % 5 == 0) && !((check / 5) & ((check / 5) - 1))) {
-            return IntegerEncodedValue(IntegerEncoding::Quint, std::popcount(check / 5 - 1));
-        }
-
-        // Apparently it can't be represented with a bounded integer sequence...
-        // just iterate.
-        mav_value--;
-    }
-    return IntegerEncodedValue(IntegerEncoding::JustBits, 0);
-}
-
-constexpr std::array<IntegerEncodedValue, 256> MakeEncodedValues() {
-    std::array<IntegerEncodedValue, 256> encodings{};
-    for (std::size_t i = 0; i < encodings.size(); ++i) {
-        encodings[i] = CreateEncoding(static_cast<u32>(i));
-    }
-    return encodings;
-}
-
-constexpr std::array<IntegerEncodedValue, 256> ASTC_ENCODINGS_VALUES = MakeEncodedValues();
-
-// Replicates low num_bits such that [(to_bit - 1):(to_bit - 1 - from_bit)]
-// is the same as [(num_bits - 1):0] and repeats all the way down.
-template <typename IntType>
-constexpr IntType Replicate(IntType val, u32 num_bits, u32 to_bit) {
-    if (num_bits == 0 || to_bit == 0) {
-        return 0;
-    }
-    const IntType v = val & static_cast<IntType>((1 << num_bits) - 1);
-    IntType res = v;
-    u32 reslen = num_bits;
-    while (reslen < to_bit) {
-        u32 comp = 0;
-        if (num_bits > to_bit - reslen) {
-            u32 newshift = to_bit - reslen;
-            comp = num_bits - newshift;
-            num_bits = newshift;
-        }
-        res = static_cast<IntType>(res << num_bits);
-        res = static_cast<IntType>(res | (v >> comp));
-        reslen += num_bits;
-    }
-    return res;
-}
-
-constexpr std::size_t NumReplicateEntries(u32 num_bits) {
-    return std::size_t(1) << num_bits;
-}
-
-template <typename IntType, u32 num_bits, u32 to_bit>
-constexpr auto MakeReplicateTable() {
-    std::array<IntType, NumReplicateEntries(num_bits)> table{};
-    for (IntType value = 0; value < static_cast<IntType>(std::size(table)); ++value) {
-        table[value] = Replicate(value, num_bits, to_bit);
-    }
-    return table;
-}
-
-constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable<u32, 6, 8>();
-constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable<u32, 7, 8>();
-constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable<u32, 8, 8>();
-
 void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth,
                uint32_t block_width, uint32_t block_height, std::span<uint8_t> output);
Author	SHA1	Message	Date
ameerj	c439fc9be9	astc_decoder: Reduce workgroup size This reduces the amount of over dispatching when there are odd dimensions (i.e. ASTC 8x5), which rarely evenly divide into 32x32.	2021-08-01 01:22:27 -04:00
ameerj	5ab8053511	astc_decoder: Compute offset swizzles in-shader Alleviates the dependency on the swizzle table and a uniform which is constant for all ASTC texture sizes.	2021-08-01 01:22:26 -04:00
ameerj	b2862e4772	astc_decoder: Make use of uvec4 for payload data	2021-07-31 22:28:04 -04:00
ameerj	a75d70fa90	astc_decoder: Simplify Select2DPartition	2021-07-31 21:36:26 -04:00
ameerj	5665d05547	astc_decoder: Optimize the use EncodingData This buffer was a list of EncodingData structures sorted by their bit length, with some duplication from the cpu decoder implementation. We can take advantage of its sorted property to optimize its usage in the shader. Thanks to wwylele for the optimization idea.	2021-07-31 21:36:26 -04:00
ameerj	15c0c213b1	astc.h: Move data to cpp implementation Moves leftover values that are no longer used by the gpu decoder back to the cpp implementation.	2021-07-31 21:26:42 -04:00