Add credits for MME shadow RAM

Move ShadowRamControl::Replay handler to MacroInterpreter::Send
maxwell_3d: Implement MME shadow RAM
2020-03-18 23:06:08 +03:00 · 2020-03-18 21:14:33 +03:00 · 2020-03-17 23:30:04 +03:00 · 2020-03-17 10:53:38 -04:00 · 2020-03-16 22:52:42 -04:00 · 2020-03-16 04:03:34 -03:00
65 changed files with 2109 additions and 1785 deletions
--- a/CMakeModules/GenerateSCMRev.cmake
+++ b/CMakeModules/GenerateSCMRev.cmake
@@ -57,8 +57,6 @@ set(HASH_FILES
    "${VIDEO_CORE}/renderer_opengl/gl_shader_decompiler.h"
    "${VIDEO_CORE}/renderer_opengl/gl_shader_disk_cache.cpp"
    "${VIDEO_CORE}/renderer_opengl/gl_shader_disk_cache.h"
-    "${VIDEO_CORE}/renderer_opengl/gl_shader_gen.cpp"
-    "${VIDEO_CORE}/renderer_opengl/gl_shader_gen.h"
    "${VIDEO_CORE}/shader/decode/arithmetic.cpp"
    "${VIDEO_CORE}/shader/decode/arithmetic_half.cpp"
    "${VIDEO_CORE}/shader/decode/arithmetic_half_immediate.cpp"
@@ -91,8 +89,6 @@ set(HASH_FILES
    "${VIDEO_CORE}/shader/ast.h"
    "${VIDEO_CORE}/shader/compiler_settings.cpp"
    "${VIDEO_CORE}/shader/compiler_settings.h"
-    "${VIDEO_CORE}/shader/const_buffer_locker.cpp"
-    "${VIDEO_CORE}/shader/const_buffer_locker.h"
    "${VIDEO_CORE}/shader/control_flow.cpp"
    "${VIDEO_CORE}/shader/control_flow.h"
    "${VIDEO_CORE}/shader/decode.cpp"
@@ -101,9 +97,13 @@ set(HASH_FILES
    "${VIDEO_CORE}/shader/node.h"
    "${VIDEO_CORE}/shader/node_helper.cpp"
    "${VIDEO_CORE}/shader/node_helper.h"
+    "${VIDEO_CORE}/shader/registry.cpp"
+    "${VIDEO_CORE}/shader/registry.h"
    "${VIDEO_CORE}/shader/shader_ir.cpp"
    "${VIDEO_CORE}/shader/shader_ir.h"
    "${VIDEO_CORE}/shader/track.cpp"
+    "${VIDEO_CORE}/shader/transform_feedback.cpp"
+    "${VIDEO_CORE}/shader/transform_feedback.h"
 )
 set(COMBINED "")
 foreach (F IN LISTS HASH_FILES)
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -38,8 +38,6 @@ add_custom_command(OUTPUT scm_rev.cpp
      "${VIDEO_CORE}/renderer_opengl/gl_shader_decompiler.h"
      "${VIDEO_CORE}/renderer_opengl/gl_shader_disk_cache.cpp"
      "${VIDEO_CORE}/renderer_opengl/gl_shader_disk_cache.h"
-      "${VIDEO_CORE}/renderer_opengl/gl_shader_gen.cpp"
-      "${VIDEO_CORE}/renderer_opengl/gl_shader_gen.h"
      "${VIDEO_CORE}/shader/decode/arithmetic.cpp"
      "${VIDEO_CORE}/shader/decode/arithmetic_half.cpp"
      "${VIDEO_CORE}/shader/decode/arithmetic_half_immediate.cpp"
@@ -72,8 +70,6 @@ add_custom_command(OUTPUT scm_rev.cpp
      "${VIDEO_CORE}/shader/ast.h"
      "${VIDEO_CORE}/shader/compiler_settings.cpp"
      "${VIDEO_CORE}/shader/compiler_settings.h"
-      "${VIDEO_CORE}/shader/const_buffer_locker.cpp"
-      "${VIDEO_CORE}/shader/const_buffer_locker.h"
      "${VIDEO_CORE}/shader/control_flow.cpp"
      "${VIDEO_CORE}/shader/control_flow.h"
      "${VIDEO_CORE}/shader/decode.cpp"
@@ -82,9 +78,13 @@ add_custom_command(OUTPUT scm_rev.cpp
      "${VIDEO_CORE}/shader/node.h"
      "${VIDEO_CORE}/shader/node_helper.cpp"
      "${VIDEO_CORE}/shader/node_helper.h"
+      "${VIDEO_CORE}/shader/registry.cpp"
+      "${VIDEO_CORE}/shader/registry.h"
      "${VIDEO_CORE}/shader/shader_ir.cpp"
      "${VIDEO_CORE}/shader/shader_ir.h"
      "${VIDEO_CORE}/shader/track.cpp"
+      "${VIDEO_CORE}/shader/transform_feedback.cpp"
+      "${VIDEO_CORE}/shader/transform_feedback.h"
      # and also check that the scm_rev files haven't changed
      "${CMAKE_CURRENT_SOURCE_DIR}/scm_rev.cpp.in"
      "${CMAKE_CURRENT_SOURCE_DIR}/scm_rev.h"
--- a/src/common/page_table.cpp
+++ b/src/common/page_table.cpp
@@ -16,7 +16,6 @@ void PageTable::Resize(std::size_t address_space_width_in_bits) {

    pointers.resize(num_page_table_entries);
    attributes.resize(num_page_table_entries);
-    backing_addr.resize(num_page_table_entries);

    // The default is a 39-bit address space, which causes an initial 1GB allocation size. If the
    // vector size is subsequently decreased (via resize), the vector might not automatically
@@ -25,6 +24,17 @@ void PageTable::Resize(std::size_t address_space_width_in_bits) {

    pointers.shrink_to_fit();
    attributes.shrink_to_fit();
+}
+
+BackingPageTable::BackingPageTable(std::size_t page_size_in_bits) : PageTable{page_size_in_bits} {}
+
+BackingPageTable::~BackingPageTable() = default;
+
+void BackingPageTable::Resize(std::size_t address_space_width_in_bits) {
+    PageTable::Resize(address_space_width_in_bits);
+    const std::size_t num_page_table_entries = 1ULL
+                                               << (address_space_width_in_bits - page_size_in_bits);
+    backing_addr.resize(num_page_table_entries);
    backing_addr.shrink_to_fit();
 }

--- a/src/common/page_table.h
+++ b/src/common/page_table.h
@@ -76,9 +76,20 @@ struct PageTable {
     */
    std::vector<PageType> attributes;

-    std::vector<u64> backing_addr;
-
    const std::size_t page_size_in_bits{};
 };

+/**
+ * A more advanced Page Table with the ability to save a backing address when using it
+ * depends on another MMU.
+ */
+struct BackingPageTable : PageTable {
+    explicit BackingPageTable(std::size_t page_size_in_bits);
+    ~BackingPageTable();
+
+    void Resize(std::size_t address_space_width_in_bits);
+
+    std::vector<u64> backing_addr;
+};
+
 } // namespace Common
--- a/src/core/frontend/framebuffer_layout.cpp
+++ b/src/core/frontend/framebuffer_layout.cpp
@@ -48,8 +48,8 @@ FramebufferLayout FrameLayoutFromResolutionScale(u32 res_scale) {
    u32 width, height;

    if (Settings::values.use_docked_mode) {
-        width = ScreenDocked::WidthDocked * res_scale;
-        height = ScreenDocked::HeightDocked * res_scale;
+        width = ScreenDocked::Width * res_scale;
+        height = ScreenDocked::Height * res_scale;
    } else {
        width = ScreenUndocked::Width * res_scale;
        height = ScreenUndocked::Height * res_scale;
--- a/src/core/frontend/framebuffer_layout.h
+++ b/src/core/frontend/framebuffer_layout.h
@@ -8,15 +8,15 @@

 namespace Layout {

-enum ScreenUndocked : u32 {
-    Width = 1280,
-    Height = 720,
-};
+namespace ScreenUndocked {
+constexpr u32 Width = 1280;
+constexpr u32 Height = 720;
+} // namespace ScreenUndocked

-enum ScreenDocked : u32 {
-    WidthDocked = 1920,
-    HeightDocked = 1080,
-};
+namespace ScreenDocked {
+constexpr u32 Width = 1920;
+constexpr u32 Height = 1080;
+} // namespace ScreenDocked

 enum class AspectRatio {
    Default,
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -65,8 +65,6 @@ add_library(video_core STATIC
    renderer_opengl/gl_shader_decompiler.h
    renderer_opengl/gl_shader_disk_cache.cpp
    renderer_opengl/gl_shader_disk_cache.h
-    renderer_opengl/gl_shader_gen.cpp
-    renderer_opengl/gl_shader_gen.h
    renderer_opengl/gl_shader_manager.cpp
    renderer_opengl/gl_shader_manager.h
    renderer_opengl/gl_shader_util.cpp
@@ -118,8 +116,6 @@ add_library(video_core STATIC
    shader/ast.h
    shader/compiler_settings.cpp
    shader/compiler_settings.h
-    shader/const_buffer_locker.cpp
-    shader/const_buffer_locker.h
    shader/control_flow.cpp
    shader/control_flow.h
    shader/decode.cpp
@@ -128,9 +124,13 @@ add_library(video_core STATIC
    shader/node_helper.cpp
    shader/node_helper.h
    shader/node.h
+    shader/registry.cpp
+    shader/registry.h
    shader/shader_ir.cpp
    shader/shader_ir.h
    shader/track.cpp
+    shader/transform_feedback.cpp
+    shader/transform_feedback.h
    surface.cpp
    surface.h
    texture_cache/format_lookup_table.cpp
--- a/src/video_core/engines/const_buffer_engine_interface.h
+++ b/src/video_core/engines/const_buffer_engine_interface.h
@@ -16,11 +16,12 @@ namespace Tegra::Engines {

 struct SamplerDescriptor {
    union {
-        BitField<0, 20, Tegra::Shader::TextureType> texture_type;
-        BitField<20, 1, u32> is_array;
-        BitField<21, 1, u32> is_buffer;
-        BitField<22, 1, u32> is_shadow;
-        u32 raw{};
+        u32 raw = 0;
+        BitField<0, 2, Tegra::Shader::TextureType> texture_type;
+        BitField<2, 3, Tegra::Texture::ComponentType> component_type;
+        BitField<5, 1, u32> is_array;
+        BitField<6, 1, u32> is_buffer;
+        BitField<7, 1, u32> is_shadow;
    };

    bool operator==(const SamplerDescriptor& rhs) const noexcept {
@@ -31,68 +32,48 @@ struct SamplerDescriptor {
        return !operator==(rhs);
    }

-    static SamplerDescriptor FromTicTexture(Tegra::Texture::TextureType tic_texture_type) {
+    static SamplerDescriptor FromTIC(const Tegra::Texture::TICEntry& tic) {
+        using Tegra::Shader::TextureType;
        SamplerDescriptor result;
-        switch (tic_texture_type) {
+
+        // This is going to be used to determine the shading language type.
+        // Because of that we don't care about all component types on color textures.
+        result.component_type.Assign(tic.r_type.Value());
+
+        switch (tic.texture_type.Value()) {
        case Tegra::Texture::TextureType::Texture1D:
-            result.texture_type.Assign(Tegra::Shader::TextureType::Texture1D);
-            result.is_array.Assign(0);
-            result.is_buffer.Assign(0);
-            result.is_shadow.Assign(0);
+            result.texture_type.Assign(TextureType::Texture1D);
            return result;
        case Tegra::Texture::TextureType::Texture2D:
-            result.texture_type.Assign(Tegra::Shader::TextureType::Texture2D);
-            result.is_array.Assign(0);
-            result.is_buffer.Assign(0);
-            result.is_shadow.Assign(0);
+            result.texture_type.Assign(TextureType::Texture2D);
            return result;
        case Tegra::Texture::TextureType::Texture3D:
-            result.texture_type.Assign(Tegra::Shader::TextureType::Texture3D);
-            result.is_array.Assign(0);
-            result.is_buffer.Assign(0);
-            result.is_shadow.Assign(0);
+            result.texture_type.Assign(TextureType::Texture3D);
            return result;
        case Tegra::Texture::TextureType::TextureCubemap:
-            result.texture_type.Assign(Tegra::Shader::TextureType::TextureCube);
-            result.is_array.Assign(0);
-            result.is_buffer.Assign(0);
-            result.is_shadow.Assign(0);
+            result.texture_type.Assign(TextureType::TextureCube);
            return result;
        case Tegra::Texture::TextureType::Texture1DArray:
-            result.texture_type.Assign(Tegra::Shader::TextureType::Texture1D);
+            result.texture_type.Assign(TextureType::Texture1D);
            result.is_array.Assign(1);
-            result.is_buffer.Assign(0);
-            result.is_shadow.Assign(0);
            return result;
        case Tegra::Texture::TextureType::Texture2DArray:
-            result.texture_type.Assign(Tegra::Shader::TextureType::Texture2D);
+            result.texture_type.Assign(TextureType::Texture2D);
            result.is_array.Assign(1);
-            result.is_buffer.Assign(0);
-            result.is_shadow.Assign(0);
            return result;
        case Tegra::Texture::TextureType::Texture1DBuffer:
-            result.texture_type.Assign(Tegra::Shader::TextureType::Texture1D);
-            result.is_array.Assign(0);
+            result.texture_type.Assign(TextureType::Texture1D);
            result.is_buffer.Assign(1);
-            result.is_shadow.Assign(0);
            return result;
        case Tegra::Texture::TextureType::Texture2DNoMipmap:
-            result.texture_type.Assign(Tegra::Shader::TextureType::Texture2D);
-            result.is_array.Assign(0);
-            result.is_buffer.Assign(0);
-            result.is_shadow.Assign(0);
+            result.texture_type.Assign(TextureType::Texture2D);
            return result;
        case Tegra::Texture::TextureType::TextureCubeArray:
-            result.texture_type.Assign(Tegra::Shader::TextureType::TextureCube);
+            result.texture_type.Assign(TextureType::TextureCube);
            result.is_array.Assign(1);
-            result.is_buffer.Assign(0);
-            result.is_shadow.Assign(0);
            return result;
        default:
-            result.texture_type.Assign(Tegra::Shader::TextureType::Texture2D);
-            result.is_array.Assign(0);
-            result.is_buffer.Assign(0);
-            result.is_shadow.Assign(0);
+            result.texture_type.Assign(TextureType::Texture2D);
            return result;
        }
    }
--- a/src/video_core/engines/kepler_compute.cpp
+++ b/src/video_core/engines/kepler_compute.cpp
@@ -89,7 +89,7 @@ SamplerDescriptor KeplerCompute::AccessBindlessSampler(ShaderType stage, u64 con

    const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)};
    const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle);
-    SamplerDescriptor result = SamplerDescriptor::FromTicTexture(tex_info.tic.texture_type.Value());
+    SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic);
    result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value());
    return result;
 }
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -122,6 +122,13 @@ void Maxwell3D::CallMacroMethod(u32 method, std::size_t num_parameters, const u3
 void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
    const u32 method = method_call.method;

+    // Keep track of the register value in shadow_regs when requested.
+    if (method < Regs::NUM_REGS &&
+        (regs.shadow_ram_control == Regs::ShadowRamControl::Track ||
+         regs.shadow_ram_control == Regs::ShadowRamControl::TrackWithFilter)) {
+        shadow_regs.reg_array[method] = method_call.argument;
+    }
+
    if (method == cb_data_state.current) {
        regs.reg_array[method] = method_call.argument;
        ProcessCBData(method_call.argument);
@@ -638,7 +645,7 @@ SamplerDescriptor Maxwell3D::AccessBindlessSampler(ShaderType stage, u64 const_b

    const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)};
    const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle);
-    SamplerDescriptor result = SamplerDescriptor::FromTicTexture(tex_info.tic.texture_type.Value());
+    SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic);
    result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value());
    return result;
 }
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -67,6 +67,7 @@ public:
        static constexpr std::size_t NumVaryings = 31;
        static constexpr std::size_t NumImages = 8; // TODO(Rodrigo): Investigate this number
        static constexpr std::size_t NumClipDistances = 8;
+        static constexpr std::size_t NumTransformFeedbackBuffers = 4;
        static constexpr std::size_t MaxShaderProgram = 6;
        static constexpr std::size_t MaxShaderStage = 5;
        // Maximum number of const buffers per shader stage.
@@ -524,6 +525,24 @@ public:
            FractionalEven = 2,
        };

+        enum class PolygonMode : u32 {
+            Point = 0x1b00,
+            Line = 0x1b01,
+            Fill = 0x1b02,
+        };
+
+        // Thanks to fincs for reverse engineering
+        // MME shadow RAM and Ryujinx for providing
+        // the base for the implementation.
+        // https://github.com/fincs
+        // https://github.com/Ryujinx/Ryujinx/pull/987
+        enum class ShadowRamControl : u32 {
+            Track = 0,
+            TrackWithFilter = 1,
+            Passthrough = 2,
+            Replay = 3,
+        };
+
        struct RenderTargetConfig {
            u32 address_high;
            u32 address_low;
@@ -621,6 +640,29 @@ public:
            float depth_range_far;
        };

+        struct TransformFeedbackBinding {
+            u32 buffer_enable;
+            u32 address_high;
+            u32 address_low;
+            s32 buffer_size;
+            s32 buffer_offset;
+            INSERT_UNION_PADDING_WORDS(3);
+
+            GPUVAddr Address() const {
+                return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
+                                             address_low);
+            }
+        };
+        static_assert(sizeof(TransformFeedbackBinding) == 32);
+
+        struct TransformFeedbackLayout {
+            u32 stream;
+            u32 varying_count;
+            u32 stride;
+            INSERT_UNION_PADDING_WORDS(1);
+        };
+        static_assert(sizeof(TransformFeedbackLayout) == 16);
+
        bool IsShaderConfigEnabled(std::size_t index) const {
            // The VertexB is always enabled.
            if (index == static_cast<std::size_t>(Regs::ShaderProgram::VertexB)) {
@@ -629,6 +671,10 @@ public:
            return shader_config[index].enable != 0;
        }

+        bool IsShaderConfigEnabled(Regs::ShaderProgram type) const {
+            return IsShaderConfigEnabled(static_cast<std::size_t>(type));
+        }
+
        union {
            struct {
                INSERT_UNION_PADDING_WORDS(0x45);
@@ -640,7 +686,9 @@ public:
                    u32 bind;
                } macros;

-                INSERT_UNION_PADDING_WORDS(0x17);
+                ShadowRamControl shadow_ram_control;
+
+                INSERT_UNION_PADDING_WORDS(0x16);

                Upload::Registers upload;
                struct {
@@ -677,7 +725,13 @@ public:

                u32 rasterize_enable;

-                INSERT_UNION_PADDING_WORDS(0xF1);
+                std::array<TransformFeedbackBinding, NumTransformFeedbackBuffers> tfb_bindings;
+
+                INSERT_UNION_PADDING_WORDS(0xC0);
+
+                std::array<TransformFeedbackLayout, NumTransformFeedbackBuffers> tfb_layouts;
+
+                INSERT_UNION_PADDING_WORDS(0x1);

                u32 tfb_enabled;

@@ -705,7 +759,12 @@ public:

                s32 clear_stencil;

-                INSERT_UNION_PADDING_WORDS(0x7);
+                INSERT_UNION_PADDING_WORDS(0x2);
+
+                PolygonMode polygon_mode_front;
+                PolygonMode polygon_mode_back;
+
+                INSERT_UNION_PADDING_WORDS(0x3);

                u32 polygon_offset_point_enable;
                u32 polygon_offset_line_enable;
@@ -764,7 +823,11 @@ public:
                    BitField<12, 4, u32> viewport;
                } clear_flags;

-                INSERT_UNION_PADDING_WORDS(0x19);
+                INSERT_UNION_PADDING_WORDS(0x10);
+
+                u32 fill_rectangle;
+
+                INSERT_UNION_PADDING_WORDS(0x8);

                std::array<VertexAttribute, NumVertexAttributes> vertex_attrib_format;

@@ -1187,7 +1250,11 @@ public:

                u32 tex_cb_index;

-                INSERT_UNION_PADDING_WORDS(0x395);
+                INSERT_UNION_PADDING_WORDS(0x7D);
+
+                std::array<std::array<u8, 128>, NumTransformFeedbackBuffers> tfb_varying_locs;
+
+                INSERT_UNION_PADDING_WORDS(0x298);

                struct {
                    /// Compressed address of a buffer that holds information about bound SSBOs.
@@ -1210,7 +1277,10 @@ public:
            };
            std::array<u32, NUM_REGS> reg_array;
        };
-    } regs{};
+    };
+
+    Regs regs{};
+    Regs shadow_regs{};

    static_assert(sizeof(Regs) == Regs::NUM_REGS * sizeof(u32), "Maxwell3D Regs has wrong size");
    static_assert(std::is_trivially_copyable_v<Regs>, "Maxwell3D Regs must be trivially copyable");
@@ -1405,6 +1475,7 @@ private:
                  "Field " #field_name " has invalid position")

 ASSERT_REG_POSITION(macros, 0x45);
+ASSERT_REG_POSITION(shadow_ram_control, 0x49);
 ASSERT_REG_POSITION(upload, 0x60);
 ASSERT_REG_POSITION(exec_upload, 0x6C);
 ASSERT_REG_POSITION(data_upload, 0x6D);
@@ -1413,6 +1484,8 @@ ASSERT_REG_POSITION(tess_mode, 0xC8);
 ASSERT_REG_POSITION(tess_level_outer, 0xC9);
 ASSERT_REG_POSITION(tess_level_inner, 0xCD);
 ASSERT_REG_POSITION(rasterize_enable, 0xDF);
+ASSERT_REG_POSITION(tfb_bindings, 0xE0);
+ASSERT_REG_POSITION(tfb_layouts, 0x1C0);
 ASSERT_REG_POSITION(tfb_enabled, 0x1D1);
 ASSERT_REG_POSITION(rt, 0x200);
 ASSERT_REG_POSITION(viewport_transform, 0x280);
@@ -1422,6 +1495,8 @@ ASSERT_REG_POSITION(depth_mode, 0x35F);
 ASSERT_REG_POSITION(clear_color[0], 0x360);
 ASSERT_REG_POSITION(clear_depth, 0x364);
 ASSERT_REG_POSITION(clear_stencil, 0x368);
+ASSERT_REG_POSITION(polygon_mode_front, 0x36B);
+ASSERT_REG_POSITION(polygon_mode_back, 0x36C);
 ASSERT_REG_POSITION(polygon_offset_point_enable, 0x370);
 ASSERT_REG_POSITION(polygon_offset_line_enable, 0x371);
 ASSERT_REG_POSITION(polygon_offset_fill_enable, 0x372);
@@ -1435,6 +1510,7 @@ ASSERT_REG_POSITION(rt_separate_frag_data, 0x3EB);
 ASSERT_REG_POSITION(depth_bounds, 0x3E7);
 ASSERT_REG_POSITION(zeta, 0x3F8);
 ASSERT_REG_POSITION(clear_flags, 0x43E);
+ASSERT_REG_POSITION(fill_rectangle, 0x44F);
 ASSERT_REG_POSITION(vertex_attrib_format, 0x458);
 ASSERT_REG_POSITION(rt_control, 0x487);
 ASSERT_REG_POSITION(zeta_width, 0x48a);
@@ -1508,6 +1584,7 @@ ASSERT_REG_POSITION(firmware, 0x8C0);
 ASSERT_REG_POSITION(const_buffer, 0x8E0);
 ASSERT_REG_POSITION(cb_bind[0], 0x904);
 ASSERT_REG_POSITION(tex_cb_index, 0x982);
+ASSERT_REG_POSITION(tfb_varying_locs, 0xA00);
 ASSERT_REG_POSITION(ssbo_info, 0xD18);
 ASSERT_REG_POSITION(tex_info_buffers.address[0], 0xD2A);
 ASSERT_REG_POSITION(tex_info_buffers.size[0], 0xD2F);
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -911,14 +911,9 @@ union Instruction {
    } fadd32i;

    union {
-        BitField<20, 8, u64> shift_position;
-        BitField<28, 8, u64> shift_length;
-        BitField<48, 1, u64> negate_b;
-        BitField<49, 1, u64> negate_a;
-
-        u64 GetLeftShiftValue() const {
-            return 32 - (shift_position + shift_length);
-        }
+        BitField<40, 1, u64> brev;
+        BitField<47, 1, u64> rd_cc;
+        BitField<48, 1, u64> is_signed;
    } bfe;

    union {
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -39,6 +39,7 @@ enum class RenderTargetFormat : u32 {
    RGBA32_FLOAT = 0xC0,
    RGBA32_UINT = 0xC2,
    RGBA16_UNORM = 0xC6,
+    RGBA16_SNORM = 0xC7,
    RGBA16_UINT = 0xC9,
    RGBA16_FLOAT = 0xCA,
    RG32_FLOAT = 0xCB,
--- a/src/video_core/guest_driver.cpp
+++ b/src/video_core/guest_driver.cpp
@@ -4,13 +4,15 @@

 #include <algorithm>
 #include <limits>
+#include <vector>

+#include "common/common_types.h"
 #include "video_core/guest_driver.h"

 namespace VideoCore {

-void GuestDriverProfile::DeduceTextureHandlerSize(std::vector<u32>&& bound_offsets) {
-    if (texture_handler_size_deduced) {
+void GuestDriverProfile::DeduceTextureHandlerSize(std::vector<u32> bound_offsets) {
+    if (texture_handler_size) {
        return;
    }
    const std::size_t size = bound_offsets.size();
@@ -29,7 +31,6 @@ void GuestDriverProfile::DeduceTextureHandlerSize(std::vector<u32>&& bound_offse
    if (min_val > 2) {
        return;
    }
-    texture_handler_size_deduced = true;
    texture_handler_size = min_texture_handler_size * min_val;
 }

--- a/src/video_core/guest_driver.h
+++ b/src/video_core/guest_driver.h
@@ -4,6 +4,7 @@

 #pragma once

+#include <optional>
 #include <vector>

 #include "common/common_types.h"
@@ -17,25 +18,29 @@ namespace VideoCore {
 */
 class GuestDriverProfile {
 public:
-    void DeduceTextureHandlerSize(std::vector<u32>&& bound_offsets);
+    explicit GuestDriverProfile() = default;
+    explicit GuestDriverProfile(std::optional<u32> texture_handler_size)
+        : texture_handler_size{texture_handler_size} {}
+
+    void DeduceTextureHandlerSize(std::vector<u32> bound_offsets);

    u32 GetTextureHandlerSize() const {
-        return texture_handler_size;
+        return texture_handler_size.value_or(default_texture_handler_size);
    }

-    bool TextureHandlerSizeKnown() const {
-        return texture_handler_size_deduced;
+    bool IsTextureHandlerSizeKnown() const {
+        return texture_handler_size.has_value();
    }

 private:
    // Minimum size of texture handler any driver can use.
    static constexpr u32 min_texture_handler_size = 4;
-    // This goes with Vulkan and OpenGL standards but Nvidia GPUs can easily
-    // use 4 bytes instead. Thus, certain drivers may squish the size.
+
+    // This goes with Vulkan and OpenGL standards but Nvidia GPUs can easily use 4 bytes instead.
+    // Thus, certain drivers may squish the size.
    static constexpr u32 default_texture_handler_size = 8;

-    u32 texture_handler_size = default_texture_handler_size;
-    bool texture_handler_size_deduced = false;
+    std::optional<u32> texture_handler_size = default_texture_handler_size;
 };

 } // namespace VideoCore
--- a/src/video_core/macro_interpreter.cpp
+++ b/src/video_core/macro_interpreter.cpp
@@ -328,6 +328,12 @@ void MacroInterpreter::SetMethodAddress(u32 address) {
 }

 void MacroInterpreter::Send(u32 value) {
+    // Use the tracked value in shadow_regs when requested.
+    if (method_address.address < Engines::Maxwell3D::Regs::NUM_REGS &&
+        maxwell3d.regs.shadow_ram_control == Engines::Maxwell3D::Regs::ShadowRamControl::Replay) {
+        value = maxwell3d.shadow_regs.reg_array[method_address.address];
+    }
+
    maxwell3d.CallMethodFromMME({method_address.address, value});
    // Increment the method address by the method increment.
    method_address.address.Assign(method_address.address.Value() +
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -174,7 +174,7 @@ private:
    /// End of address space, based on address space in bits.
    static constexpr GPUVAddr address_space_end{1ULL << address_space_width};

-    Common::PageTable page_table{page_bits};
+    Common::BackingPageTable page_table{page_bits};
    VMAMap vma_map;
    VideoCore::RasterizerInterface& rasterizer;

--- a/src/video_core/morton.cpp
+++ b/src/video_core/morton.cpp
@@ -51,6 +51,7 @@ static constexpr ConversionArray morton_to_linear_fns = {
    MortonCopy<true, PixelFormat::R8UI>,
    MortonCopy<true, PixelFormat::RGBA16F>,
    MortonCopy<true, PixelFormat::RGBA16U>,
+    MortonCopy<true, PixelFormat::RGBA16S>,
    MortonCopy<true, PixelFormat::RGBA16UI>,
    MortonCopy<true, PixelFormat::R11FG11FB10F>,
    MortonCopy<true, PixelFormat::RGBA32UI>,
@@ -131,6 +132,7 @@ static constexpr ConversionArray linear_to_morton_fns = {
    MortonCopy<false, PixelFormat::R8U>,
    MortonCopy<false, PixelFormat::R8UI>,
    MortonCopy<false, PixelFormat::RGBA16F>,
+    MortonCopy<false, PixelFormat::RGBA16S>,
    MortonCopy<false, PixelFormat::RGBA16U>,
    MortonCopy<false, PixelFormat::RGBA16UI>,
    MortonCopy<false, PixelFormat::R11FG11FB10F>,
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -25,7 +25,6 @@ constexpr std::size_t NumQueryTypes = 1;

 enum class LoadCallbackStage {
    Prepare,
-    Decompile,
    Build,
    Complete,
 };
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -28,7 +28,6 @@
 #include "video_core/renderer_opengl/gl_query_cache.h"
 #include "video_core/renderer_opengl/gl_rasterizer.h"
 #include "video_core/renderer_opengl/gl_shader_cache.h"
-#include "video_core/renderer_opengl/gl_shader_gen.h"
 #include "video_core/renderer_opengl/maxwell_to_gl.h"
 #include "video_core/renderer_opengl/renderer_opengl.h"

@@ -76,7 +75,7 @@ Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry
 }

 std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer,
-                               const GLShader::ConstBufferEntry& entry) {
+                               const ConstBufferEntry& entry) {
    if (!entry.IsIndirect()) {
        return entry.GetSize();
    }
@@ -272,9 +271,7 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
        SetupDrawTextures(stage, shader);
        SetupDrawImages(stage, shader);

-        const ProgramVariant variant(primitive_mode);
-        const auto program_handle = shader->GetHandle(variant);
-
+        const GLuint program_handle = shader->GetHandle();
        switch (program) {
        case Maxwell::ShaderProgram::VertexA:
        case Maxwell::ShaderProgram::VertexB:
@@ -295,7 +292,7 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
        // When a clip distance is enabled but not set in the shader it crops parts of the screen
        // (sometimes it's half the screen, sometimes three quarters). To avoid this, enable the
        // clip distances only when it's written by a shader stage.
-        clip_distances |= shader->GetShaderEntries().clip_distances;
+        clip_distances |= shader->GetEntries().clip_distances;

        // When VertexA is enabled, we have dual vertex shaders
        if (program == Maxwell::ShaderProgram::VertexA) {
@@ -487,6 +484,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {

    SyncViewport();
    SyncRasterizeEnable();
+    SyncPolygonModes();
    SyncColorMask();
    SyncFragmentColorClampState();
    SyncMultiSampleState();
@@ -498,7 +496,6 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
    SyncCullMode();
    SyncPrimitiveRestart();
    SyncScissorTest();
-    SyncTransformFeedback();
    SyncPointState();
    SyncPolygonOffset();
    SyncAlphaTest();
@@ -571,7 +568,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
        glTextureBarrier();
    }

-    ++num_queued_commands;
+    BeginTransformFeedback(primitive_mode);

    const GLuint base_instance = static_cast<GLuint>(gpu.regs.vb_base_instance);
    const GLsizei num_instances =
@@ -610,6 +607,10 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
                                              num_instances, base_instance);
        }
    }
+
+    EndTransformFeedback();
+
+    ++num_queued_commands;
 }

 void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
@@ -622,12 +623,7 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
    auto kernel = shader_cache.GetComputeKernel(code_addr);
    SetupComputeTextures(kernel);
    SetupComputeImages(kernel);
-
-    const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
-    const ProgramVariant variant(launch_desc.block_dim_x, launch_desc.block_dim_y,
-                                 launch_desc.block_dim_z, launch_desc.shared_alloc,
-                                 launch_desc.local_pos_alloc);
-    program_manager.BindComputeShader(kernel->GetHandle(variant));
+    program_manager.BindComputeShader(kernel->GetHandle());

    const std::size_t buffer_size =
        Tegra::Engines::KeplerCompute::NumConstBuffers *
@@ -645,6 +641,7 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
    bind_ubo_pushbuffer.Bind();
    bind_ssbo_pushbuffer.Bind();

+    const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
    glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z);
    ++num_queued_commands;
 }
@@ -749,7 +746,7 @@ void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shad
    const auto& shader_stage = stages[stage_index];

    u32 binding = device.GetBaseBindings(stage_index).uniform_buffer;
-    for (const auto& entry : shader->GetShaderEntries().const_buffers) {
+    for (const auto& entry : shader->GetEntries().const_buffers) {
        const auto& buffer = shader_stage.const_buffers[entry.GetIndex()];
        SetupConstBuffer(binding++, buffer, entry);
    }
@@ -760,7 +757,7 @@ void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) {
    const auto& launch_desc = system.GPU().KeplerCompute().launch_description;

    u32 binding = 0;
-    for (const auto& entry : kernel->GetShaderEntries().const_buffers) {
+    for (const auto& entry : kernel->GetEntries().const_buffers) {
        const auto& config = launch_desc.const_buffer_config[entry.GetIndex()];
        const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value();
        Tegra::Engines::ConstBufferInfo buffer;
@@ -772,7 +769,7 @@ void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) {
 }

 void RasterizerOpenGL::SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
-                                        const GLShader::ConstBufferEntry& entry) {
+                                        const ConstBufferEntry& entry) {
    if (!buffer.enabled) {
        // Set values to zero to unbind buffers
        bind_ubo_pushbuffer.Push(binding, buffer_cache.GetEmptyBuffer(sizeof(float)), 0,
@@ -796,7 +793,7 @@ void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shad
    const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]};

    u32 binding = device.GetBaseBindings(stage_index).shader_storage_buffer;
-    for (const auto& entry : shader->GetShaderEntries().global_memory_entries) {
+    for (const auto& entry : shader->GetEntries().global_memory_entries) {
        const auto addr{cbufs.const_buffers[entry.GetCbufIndex()].address + entry.GetCbufOffset()};
        const auto gpu_addr{memory_manager.Read<u64>(addr)};
        const auto size{memory_manager.Read<u32>(addr + 8)};
@@ -810,7 +807,7 @@ void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) {
    const auto cbufs{gpu.KeplerCompute().launch_description.const_buffer_config};

    u32 binding = 0;
-    for (const auto& entry : kernel->GetShaderEntries().global_memory_entries) {
+    for (const auto& entry : kernel->GetEntries().global_memory_entries) {
        const auto addr{cbufs[entry.GetCbufIndex()].Address() + entry.GetCbufOffset()};
        const auto gpu_addr{memory_manager.Read<u64>(addr)};
        const auto size{memory_manager.Read<u32>(addr + 8)};
@@ -818,7 +815,7 @@ void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) {
    }
 }

-void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GLShader::GlobalMemoryEntry& entry,
+void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry,
                                         GPUVAddr gpu_addr, std::size_t size) {
    const auto alignment{device.GetShaderStorageBufferAlignment()};
    const auto [ssbo, buffer_offset] =
@@ -830,7 +827,7 @@ void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, const Shader&
    MICROPROFILE_SCOPE(OpenGL_Texture);
    const auto& maxwell3d = system.GPU().Maxwell3D();
    u32 binding = device.GetBaseBindings(stage_index).sampler;
-    for (const auto& entry : shader->GetShaderEntries().samplers) {
+    for (const auto& entry : shader->GetEntries().samplers) {
        const auto shader_type = static_cast<ShaderType>(stage_index);
        for (std::size_t i = 0; i < entry.Size(); ++i) {
            const auto texture = GetTextureInfo(maxwell3d, entry, shader_type, i);
@@ -843,7 +840,7 @@ void RasterizerOpenGL::SetupComputeTextures(const Shader& kernel) {
    MICROPROFILE_SCOPE(OpenGL_Texture);
    const auto& compute = system.GPU().KeplerCompute();
    u32 binding = 0;
-    for (const auto& entry : kernel->GetShaderEntries().samplers) {
+    for (const auto& entry : kernel->GetEntries().samplers) {
        for (std::size_t i = 0; i < entry.Size(); ++i) {
            const auto texture = GetTextureInfo(compute, entry, ShaderType::Compute, i);
            SetupTexture(binding++, texture, entry);
@@ -852,7 +849,7 @@ void RasterizerOpenGL::SetupComputeTextures(const Shader& kernel) {
 }

 void RasterizerOpenGL::SetupTexture(u32 binding, const Tegra::Texture::FullTextureInfo& texture,
-                                    const GLShader::SamplerEntry& entry) {
+                                    const SamplerEntry& entry) {
    const auto view = texture_cache.GetTextureSurface(texture.tic, entry);
    if (!view) {
        // Can occur when texture addr is null or its memory is unmapped/invalid
@@ -875,7 +872,7 @@ void RasterizerOpenGL::SetupTexture(u32 binding, const Tegra::Texture::FullTextu
 void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& shader) {
    const auto& maxwell3d = system.GPU().Maxwell3D();
    u32 binding = device.GetBaseBindings(stage_index).image;
-    for (const auto& entry : shader->GetShaderEntries().images) {
+    for (const auto& entry : shader->GetEntries().images) {
        const auto shader_type = static_cast<Tegra::Engines::ShaderType>(stage_index);
        const auto tic = GetTextureInfo(maxwell3d, entry, shader_type).tic;
        SetupImage(binding++, tic, entry);
@@ -885,14 +882,14 @@ void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& sh
 void RasterizerOpenGL::SetupComputeImages(const Shader& shader) {
    const auto& compute = system.GPU().KeplerCompute();
    u32 binding = 0;
-    for (const auto& entry : shader->GetShaderEntries().images) {
+    for (const auto& entry : shader->GetEntries().images) {
        const auto tic = GetTextureInfo(compute, entry, Tegra::Engines::ShaderType::Compute).tic;
        SetupImage(binding++, tic, entry);
    }
 }

 void RasterizerOpenGL::SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic,
-                                  const GLShader::ImageEntry& entry) {
+                                  const ImageEntry& entry) {
    const auto view = texture_cache.GetImageSurface(tic, entry);
    if (!view) {
        glBindImageTexture(binding, 0, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8);
@@ -1096,6 +1093,45 @@ void RasterizerOpenGL::SyncRasterizeEnable() {
    oglEnable(GL_RASTERIZER_DISCARD, gpu.regs.rasterize_enable == 0);
 }

+void RasterizerOpenGL::SyncPolygonModes() {
+    auto& gpu = system.GPU().Maxwell3D();
+    auto& flags = gpu.dirty.flags;
+    if (!flags[Dirty::PolygonModes]) {
+        return;
+    }
+    flags[Dirty::PolygonModes] = false;
+
+    if (gpu.regs.fill_rectangle) {
+        if (!GLAD_GL_NV_fill_rectangle) {
+            LOG_ERROR(Render_OpenGL, "GL_NV_fill_rectangle used and not supported");
+            glPolygonMode(GL_FRONT_AND_BACK, GL_FILL);
+            return;
+        }
+
+        flags[Dirty::PolygonModeFront] = true;
+        flags[Dirty::PolygonModeBack] = true;
+        glPolygonMode(GL_FRONT_AND_BACK, GL_FILL_RECTANGLE_NV);
+        return;
+    }
+
+    if (gpu.regs.polygon_mode_front == gpu.regs.polygon_mode_back) {
+        flags[Dirty::PolygonModeFront] = false;
+        flags[Dirty::PolygonModeBack] = false;
+        glPolygonMode(GL_FRONT_AND_BACK, MaxwellToGL::PolygonMode(gpu.regs.polygon_mode_front));
+        return;
+    }
+
+    if (flags[Dirty::PolygonModeFront]) {
+        flags[Dirty::PolygonModeFront] = false;
+        glPolygonMode(GL_FRONT, MaxwellToGL::PolygonMode(gpu.regs.polygon_mode_front));
+    }
+
+    if (flags[Dirty::PolygonModeBack]) {
+        flags[Dirty::PolygonModeBack] = false;
+        glPolygonMode(GL_BACK, MaxwellToGL::PolygonMode(gpu.regs.polygon_mode_back));
+    }
+}
+
 void RasterizerOpenGL::SyncColorMask() {
    auto& gpu = system.GPU().Maxwell3D();
    auto& flags = gpu.dirty.flags;
@@ -1257,11 +1293,6 @@ void RasterizerOpenGL::SyncScissorTest() {
    }
 }

-void RasterizerOpenGL::SyncTransformFeedback() {
-    const auto& regs = system.GPU().Maxwell3D().regs;
-    UNIMPLEMENTED_IF_MSG(regs.tfb_enabled != 0, "Transform feedbacks are not implemented");
-}
-
 void RasterizerOpenGL::SyncPointState() {
    auto& gpu = system.GPU().Maxwell3D();
    auto& flags = gpu.dirty.flags;
@@ -1337,4 +1368,62 @@ void RasterizerOpenGL::SyncFramebufferSRGB() {
    oglEnable(GL_FRAMEBUFFER_SRGB, gpu.regs.framebuffer_srgb);
 }

+void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) {
+    const auto& regs = system.GPU().Maxwell3D().regs;
+    if (regs.tfb_enabled == 0) {
+        return;
+    }
+
+    UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) ||
+                     regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) ||
+                     regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry));
+
+    for (std::size_t index = 0; index < Maxwell::NumTransformFeedbackBuffers; ++index) {
+        const auto& binding = regs.tfb_bindings[index];
+        if (!binding.buffer_enable) {
+            if (enabled_transform_feedback_buffers[index]) {
+                glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, static_cast<GLuint>(index), 0, 0,
+                                  0);
+            }
+            enabled_transform_feedback_buffers[index] = false;
+            continue;
+        }
+        enabled_transform_feedback_buffers[index] = true;
+
+        auto& tfb_buffer = transform_feedback_buffers[index];
+        tfb_buffer.Create();
+
+        const GLuint handle = tfb_buffer.handle;
+        const std::size_t size = binding.buffer_size;
+        glNamedBufferData(handle, static_cast<GLsizeiptr>(size), nullptr, GL_STREAM_COPY);
+        glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, static_cast<GLuint>(index), handle, 0,
+                          static_cast<GLsizeiptr>(size));
+    }
+
+    glBeginTransformFeedback(GL_POINTS);
+}
+
+void RasterizerOpenGL::EndTransformFeedback() {
+    const auto& regs = system.GPU().Maxwell3D().regs;
+    if (regs.tfb_enabled == 0) {
+        return;
+    }
+
+    glEndTransformFeedback();
+
+    for (std::size_t index = 0; index < Maxwell::NumTransformFeedbackBuffers; ++index) {
+        const auto& binding = regs.tfb_bindings[index];
+        if (!binding.buffer_enable) {
+            continue;
+        }
+        UNIMPLEMENTED_IF(binding.buffer_offset != 0);
+
+        const GLuint handle = transform_feedback_buffers[index].handle;
+        const GPUVAddr gpu_addr = binding.Address();
+        const std::size_t size = binding.buffer_size;
+        const auto [dest_buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
+        glCopyNamedBufferSubData(handle, *dest_buffer, 0, offset, static_cast<GLsizeiptr>(size));
+    }
+}
+
 } // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -98,7 +98,7 @@ private:

    /// Configures a constant buffer.
    void SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
-                          const GLShader::ConstBufferEntry& entry);
+                          const ConstBufferEntry& entry);

    /// Configures the current global memory entries to use for the draw command.
    void SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader);
@@ -107,7 +107,7 @@ private:
    void SetupComputeGlobalMemory(const Shader& kernel);

    /// Configures a constant buffer.
-    void SetupGlobalMemory(u32 binding, const GLShader::GlobalMemoryEntry& entry, GPUVAddr gpu_addr,
+    void SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr,
                           std::size_t size);

    /// Configures the current textures to use for the draw command.
@@ -118,7 +118,7 @@ private:

    /// Configures a texture.
    void SetupTexture(u32 binding, const Tegra::Texture::FullTextureInfo& texture,
-                      const GLShader::SamplerEntry& entry);
+                      const SamplerEntry& entry);

    /// Configures images in a graphics shader.
    void SetupDrawImages(std::size_t stage_index, const Shader& shader);
@@ -127,8 +127,7 @@ private:
    void SetupComputeImages(const Shader& shader);

    /// Configures an image.
-    void SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic,
-                    const GLShader::ImageEntry& entry);
+    void SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic, const ImageEntry& entry);

    /// Syncs the viewport and depth range to match the guest state
    void SyncViewport();
@@ -169,15 +168,15 @@ private:
    /// Syncs the scissor test state to match the guest state
    void SyncScissorTest();

-    /// Syncs the transform feedback state to match the guest state
-    void SyncTransformFeedback();
-
    /// Syncs the point state to match the guest state
    void SyncPointState();

    /// Syncs the rasterizer enable state to match the guest state
    void SyncRasterizeEnable();

+    /// Syncs polygon modes to match the guest state
+    void SyncPolygonModes();
+
    /// Syncs Color Mask
    void SyncColorMask();

@@ -190,6 +189,12 @@ private:
    /// Syncs the framebuffer sRGB state to match the guest state
    void SyncFramebufferSRGB();

+    /// Begin a transform feedback
+    void BeginTransformFeedback(GLenum primitive_mode);
+
+    /// End a transform feedback
+    void EndTransformFeedback();
+
    /// Check for extension that are not strictly required but are needed for correct emulation
    void CheckExtensions();

@@ -227,6 +232,11 @@ private:
    BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER};
    BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER};

+    std::array<OGLBuffer, Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers>
+        transform_feedback_buffers;
+    std::bitset<Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers>
+        enabled_transform_feedback_buffers;
+
    /// Number of commands queued to the OpenGL driver. Reseted on flush.
    std::size_t num_queued_commands = 0;

--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -2,12 +2,16 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

+#include <atomic>
+#include <functional>
 #include <mutex>
 #include <optional>
 #include <string>
 #include <thread>
 #include <unordered_set>
+
 #include <boost/functional/hash.hpp>
+
 #include "common/alignment.h"
 #include "common/assert.h"
 #include "common/logging/log.h"
@@ -24,13 +28,14 @@
 #include "video_core/renderer_opengl/gl_shader_disk_cache.h"
 #include "video_core/renderer_opengl/gl_state_tracker.h"
 #include "video_core/renderer_opengl/utils.h"
+#include "video_core/shader/registry.h"
 #include "video_core/shader/shader_ir.h"

 namespace OpenGL {

 using Tegra::Engines::ShaderType;
-using VideoCommon::Shader::ConstBufferLocker;
 using VideoCommon::Shader::ProgramCode;
+using VideoCommon::Shader::Registry;
 using VideoCommon::Shader::ShaderIR;

 namespace {
@@ -56,7 +61,7 @@ constexpr bool IsSchedInstruction(std::size_t offset, std::size_t main_offset) {
 }

 /// Calculates the size of a program stream
-std::size_t CalculateProgramSize(const GLShader::ProgramCode& program) {
+std::size_t CalculateProgramSize(const ProgramCode& program) {
    constexpr std::size_t start_offset = 10;
    // This is the encoded version of BRA that jumps to itself. All Nvidia
    // shaders end with one.
@@ -109,32 +114,9 @@ constexpr GLenum GetGLShaderType(ShaderType shader_type) {
    }
 }

-/// Describes primitive behavior on geometry shaders
-constexpr std::pair<const char*, u32> GetPrimitiveDescription(GLenum primitive_mode) {
-    switch (primitive_mode) {
-    case GL_POINTS:
-        return {"points", 1};
-    case GL_LINES:
-    case GL_LINE_STRIP:
-        return {"lines", 2};
-    case GL_LINES_ADJACENCY:
-    case GL_LINE_STRIP_ADJACENCY:
-        return {"lines_adjacency", 4};
-    case GL_TRIANGLES:
-    case GL_TRIANGLE_STRIP:
-    case GL_TRIANGLE_FAN:
-        return {"triangles", 3};
-    case GL_TRIANGLES_ADJACENCY:
-    case GL_TRIANGLE_STRIP_ADJACENCY:
-        return {"triangles_adjacency", 6};
-    default:
-        return {"points", 1};
-    }
-}
-
 /// Hashes one (or two) program streams
 u64 GetUniqueIdentifier(ShaderType shader_type, bool is_a, const ProgramCode& code,
-                        const ProgramCode& code_b) {
+                        const ProgramCode& code_b = {}) {
    u64 unique_identifier = boost::hash_value(code);
    if (is_a) {
        // VertexA programs include two programs
@@ -143,24 +125,6 @@ u64 GetUniqueIdentifier(ShaderType shader_type, bool is_a, const ProgramCode& co
    return unique_identifier;
 }

-/// Creates an unspecialized program from code streams
-std::string GenerateGLSL(const Device& device, ShaderType shader_type, const ShaderIR& ir,
-                         const std::optional<ShaderIR>& ir_b) {
-    switch (shader_type) {
-    case ShaderType::Vertex:
-        return GLShader::GenerateVertexShader(device, ir, ir_b ? &*ir_b : nullptr);
-    case ShaderType::Geometry:
-        return GLShader::GenerateGeometryShader(device, ir);
-    case ShaderType::Fragment:
-        return GLShader::GenerateFragmentShader(device, ir);
-    case ShaderType::Compute:
-        return GLShader::GenerateComputeShader(device, ir);
-    default:
-        UNIMPLEMENTED_MSG("Unimplemented shader_type={}", static_cast<u32>(shader_type));
-        return {};
-    }
-}
-
 constexpr const char* GetShaderTypeName(ShaderType shader_type) {
    switch (shader_type) {
    case ShaderType::Vertex:
@@ -196,102 +160,38 @@ constexpr ShaderType GetShaderType(Maxwell::ShaderProgram program_type) {
    return {};
 }

-std::string GetShaderId(u64 unique_identifier, ShaderType shader_type) {
+std::string MakeShaderID(u64 unique_identifier, ShaderType shader_type) {
    return fmt::format("{}{:016X}", GetShaderTypeName(shader_type), unique_identifier);
 }

-Tegra::Engines::ConstBufferEngineInterface& GetConstBufferEngineInterface(Core::System& system,
-                                                                          ShaderType shader_type) {
-    if (shader_type == ShaderType::Compute) {
-        return system.GPU().KeplerCompute();
-    } else {
-        return system.GPU().Maxwell3D();
+std::shared_ptr<Registry> MakeRegistry(const ShaderDiskCacheEntry& entry) {
+    const VideoCore::GuestDriverProfile guest_profile{entry.texture_handler_size};
+    const VideoCommon::Shader::SerializedRegistryInfo info{guest_profile, entry.bound_buffer,
+                                                           entry.graphics_info, entry.compute_info};
+    const auto registry = std::make_shared<Registry>(entry.type, info);
+    for (const auto& [address, value] : entry.keys) {
+        const auto [buffer, offset] = address;
+        registry->InsertKey(buffer, offset, value);
    }
-}
-
-std::unique_ptr<ConstBufferLocker> MakeLocker(Core::System& system, ShaderType shader_type) {
-    return std::make_unique<ConstBufferLocker>(shader_type,
-                                               GetConstBufferEngineInterface(system, shader_type));
-}
-
-void FillLocker(ConstBufferLocker& locker, const ShaderDiskCacheUsage& usage) {
-    locker.SetBoundBuffer(usage.bound_buffer);
-    for (const auto& key : usage.keys) {
-        const auto [buffer, offset] = key.first;
-        locker.InsertKey(buffer, offset, key.second);
+    for (const auto& [offset, sampler] : entry.bound_samplers) {
+        registry->InsertBoundSampler(offset, sampler);
    }
-    for (const auto& [offset, sampler] : usage.bound_samplers) {
-        locker.InsertBoundSampler(offset, sampler);
-    }
-    for (const auto& [key, sampler] : usage.bindless_samplers) {
+    for (const auto& [key, sampler] : entry.bindless_samplers) {
        const auto [buffer, offset] = key;
-        locker.InsertBindlessSampler(buffer, offset, sampler);
+        registry->InsertBindlessSampler(buffer, offset, sampler);
    }
+    return registry;
 }

-CachedProgram BuildShader(const Device& device, u64 unique_identifier, ShaderType shader_type,
-                          const ProgramCode& code, const ProgramCode& code_b,
-                          ConstBufferLocker& locker, const ProgramVariant& variant,
-                          bool hint_retrievable = false) {
-    LOG_INFO(Render_OpenGL, "called. {}", GetShaderId(unique_identifier, shader_type));
-
-    const bool is_compute = shader_type == ShaderType::Compute;
-    const u32 main_offset = is_compute ? KERNEL_MAIN_OFFSET : STAGE_MAIN_OFFSET;
-    const ShaderIR ir(code, main_offset, COMPILER_SETTINGS, locker);
-    std::optional<ShaderIR> ir_b;
-    if (!code_b.empty()) {
-        ir_b.emplace(code_b, main_offset, COMPILER_SETTINGS, locker);
-    }
-
-    std::string source = fmt::format(R"(// {}
-#version 430 core
-#extension GL_ARB_separate_shader_objects : enable
-)",
-                                     GetShaderId(unique_identifier, shader_type));
-    if (device.HasShaderBallot()) {
-        source += "#extension GL_ARB_shader_ballot : require\n";
-    }
-    if (device.HasVertexViewportLayer()) {
-        source += "#extension GL_ARB_shader_viewport_layer_array : require\n";
-    }
-    if (device.HasImageLoadFormatted()) {
-        source += "#extension GL_EXT_shader_image_load_formatted : require\n";
-    }
-    if (device.HasWarpIntrinsics()) {
-        source += "#extension GL_NV_gpu_shader5 : require\n"
-                  "#extension GL_NV_shader_thread_group : require\n"
-                  "#extension GL_NV_shader_thread_shuffle : require\n";
-    }
-    // This pragma stops Nvidia's driver from over optimizing math (probably using fp16 operations)
-    // on places where we don't want to.
-    // Thanks to Ryujinx for finding this workaround.
-    source += "#pragma optionNV(fastmath off)\n";
-
-    if (shader_type == ShaderType::Geometry) {
-        const auto [glsl_topology, max_vertices] = GetPrimitiveDescription(variant.primitive_mode);
-        source += fmt::format("#define MAX_VERTEX_INPUT {}\n", max_vertices);
-        source += fmt::format("layout ({}) in;\n", glsl_topology);
-    }
-    if (shader_type == ShaderType::Compute) {
-        if (variant.local_memory_size > 0) {
-            source += fmt::format("#define LOCAL_MEMORY_SIZE {}\n",
-                                  Common::AlignUp(variant.local_memory_size, 4) / 4);
-        }
-        source +=
-            fmt::format("layout (local_size_x = {}, local_size_y = {}, local_size_z = {}) in;\n",
-                        variant.block_x, variant.block_y, variant.block_z);
-
-        if (variant.shared_memory_size > 0) {
-            // shared_memory_size is described in number of words
-            source += fmt::format("shared uint smem[{}];\n", variant.shared_memory_size);
-        }
-    }
-
-    source += '\n';
-    source += GenerateGLSL(device, shader_type, ir, ir_b);
+std::shared_ptr<OGLProgram> BuildShader(const Device& device, ShaderType shader_type,
+                                        u64 unique_identifier, const ShaderIR& ir,
+                                        const Registry& registry, bool hint_retrievable = false) {
+    const std::string shader_id = MakeShaderID(unique_identifier, shader_type);
+    LOG_INFO(Render_OpenGL, "{}", shader_id);

+    const std::string glsl = DecompileShader(device, ir, registry, shader_type, shader_id);
    OGLShader shader;
-    shader.Create(source.c_str(), GetGLShaderType(shader_type));
+    shader.Create(glsl.c_str(), GetGLShaderType(shader_type));

    auto program = std::make_shared<OGLProgram>();
    program->Create(true, hint_retrievable, shader.handle);
@@ -299,7 +199,7 @@ CachedProgram BuildShader(const Device& device, u64 unique_identifier, ShaderTyp
 }

 std::unordered_set<GLenum> GetSupportedFormats() {
-    GLint num_formats{};
+    GLint num_formats;
    glGetIntegerv(GL_NUM_PROGRAM_BINARY_FORMATS, &num_formats);

    std::vector<GLint> formats(num_formats);
@@ -314,115 +214,82 @@ std::unordered_set<GLenum> GetSupportedFormats() {

 } // Anonymous namespace

-CachedShader::CachedShader(const ShaderParameters& params, ShaderType shader_type,
-                           GLShader::ShaderEntries entries, ProgramCode code, ProgramCode code_b)
-    : RasterizerCacheObject{params.host_ptr}, system{params.system},
-      disk_cache{params.disk_cache}, device{params.device}, cpu_addr{params.cpu_addr},
-      unique_identifier{params.unique_identifier}, shader_type{shader_type},
-      entries{std::move(entries)}, code{std::move(code)}, code_b{std::move(code_b)} {
-    if (!params.precompiled_variants) {
-        return;
-    }
-    for (const auto& pair : *params.precompiled_variants) {
-        auto locker = MakeLocker(system, shader_type);
-        const auto& usage = pair->first;
-        FillLocker(*locker, usage);
+CachedShader::CachedShader(const u8* host_ptr, VAddr cpu_addr, std::size_t size_in_bytes,
+                           std::shared_ptr<VideoCommon::Shader::Registry> registry,
+                           ShaderEntries entries, std::shared_ptr<OGLProgram> program)
+    : RasterizerCacheObject{host_ptr}, registry{std::move(registry)}, entries{std::move(entries)},
+      cpu_addr{cpu_addr}, size_in_bytes{size_in_bytes}, program{std::move(program)} {}

-        std::unique_ptr<LockerVariant>* locker_variant = nullptr;
-        const auto it =
-            std::find_if(locker_variants.begin(), locker_variants.end(), [&](const auto& variant) {
-                return variant->locker->HasEqualKeys(*locker);
-            });
-        if (it == locker_variants.end()) {
-            locker_variant = &locker_variants.emplace_back();
-            *locker_variant = std::make_unique<LockerVariant>();
-            locker_variant->get()->locker = std::move(locker);
-        } else {
-            locker_variant = &*it;
-        }
-        locker_variant->get()->programs.emplace(usage.variant, pair->second);
-    }
+CachedShader::~CachedShader() = default;
+
+GLuint CachedShader::GetHandle() const {
+    DEBUG_ASSERT(registry->IsConsistent());
+    return program->handle;
 }

 Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params,
                                           Maxwell::ShaderProgram program_type, ProgramCode code,
                                           ProgramCode code_b) {
    const auto shader_type = GetShaderType(program_type);
-    params.disk_cache.SaveRaw(
-        ShaderDiskCacheRaw(params.unique_identifier, shader_type, code, code_b));
+    const std::size_t size_in_bytes = code.size() * sizeof(u64);

-    ConstBufferLocker locker(shader_type, params.system.GPU().Maxwell3D());
-    const ShaderIR ir(code, STAGE_MAIN_OFFSET, COMPILER_SETTINGS, locker);
+    auto registry = std::make_shared<Registry>(shader_type, params.system.GPU().Maxwell3D());
+    const ShaderIR ir(code, STAGE_MAIN_OFFSET, COMPILER_SETTINGS, *registry);
    // TODO(Rodrigo): Handle VertexA shaders
    // std::optional<ShaderIR> ir_b;
    // if (!code_b.empty()) {
    //     ir_b.emplace(code_b, STAGE_MAIN_OFFSET);
    // }
-    return std::shared_ptr<CachedShader>(new CachedShader(
-        params, shader_type, GLShader::GetEntries(ir), std::move(code), std::move(code_b)));
+    auto program = BuildShader(params.device, shader_type, params.unique_identifier, ir, *registry);
+
+    ShaderDiskCacheEntry entry;
+    entry.type = shader_type;
+    entry.code = std::move(code);
+    entry.code_b = std::move(code_b);
+    entry.unique_identifier = params.unique_identifier;
+    entry.bound_buffer = registry->GetBoundBuffer();
+    entry.graphics_info = registry->GetGraphicsInfo();
+    entry.keys = registry->GetKeys();
+    entry.bound_samplers = registry->GetBoundSamplers();
+    entry.bindless_samplers = registry->GetBindlessSamplers();
+    params.disk_cache.SaveEntry(std::move(entry));
+
+    return std::shared_ptr<CachedShader>(new CachedShader(params.host_ptr, params.cpu_addr,
+                                                          size_in_bytes, std::move(registry),
+                                                          MakeEntries(ir), std::move(program)));
 }

 Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code) {
-    params.disk_cache.SaveRaw(
-        ShaderDiskCacheRaw(params.unique_identifier, ShaderType::Compute, code));
+    const std::size_t size_in_bytes = code.size() * sizeof(u64);

-    ConstBufferLocker locker(Tegra::Engines::ShaderType::Compute,
-                             params.system.GPU().KeplerCompute());
-    const ShaderIR ir(code, KERNEL_MAIN_OFFSET, COMPILER_SETTINGS, locker);
-    return std::shared_ptr<CachedShader>(new CachedShader(
-        params, ShaderType::Compute, GLShader::GetEntries(ir), std::move(code), {}));
+    auto& engine = params.system.GPU().KeplerCompute();
+    auto registry = std::make_shared<Registry>(ShaderType::Compute, engine);
+    const ShaderIR ir(code, KERNEL_MAIN_OFFSET, COMPILER_SETTINGS, *registry);
+    const u64 uid = params.unique_identifier;
+    auto program = BuildShader(params.device, ShaderType::Compute, uid, ir, *registry);
+
+    ShaderDiskCacheEntry entry;
+    entry.type = ShaderType::Compute;
+    entry.code = std::move(code);
+    entry.unique_identifier = uid;
+    entry.bound_buffer = registry->GetBoundBuffer();
+    entry.compute_info = registry->GetComputeInfo();
+    entry.keys = registry->GetKeys();
+    entry.bound_samplers = registry->GetBoundSamplers();
+    entry.bindless_samplers = registry->GetBindlessSamplers();
+    params.disk_cache.SaveEntry(std::move(entry));
+
+    return std::shared_ptr<CachedShader>(new CachedShader(params.host_ptr, params.cpu_addr,
+                                                          size_in_bytes, std::move(registry),
+                                                          MakeEntries(ir), std::move(program)));
 }

 Shader CachedShader::CreateFromCache(const ShaderParameters& params,
-                                     const UnspecializedShader& unspecialized) {
-    return std::shared_ptr<CachedShader>(new CachedShader(params, unspecialized.type,
-                                                          unspecialized.entries, unspecialized.code,
-                                                          unspecialized.code_b));
-}
-
-GLuint CachedShader::GetHandle(const ProgramVariant& variant) {
-    EnsureValidLockerVariant();
-
-    const auto [entry, is_cache_miss] = curr_locker_variant->programs.try_emplace(variant);
-    auto& program = entry->second;
-    if (!is_cache_miss) {
-        return program->handle;
-    }
-
-    program = BuildShader(device, unique_identifier, shader_type, code, code_b,
-                          *curr_locker_variant->locker, variant);
-    disk_cache.SaveUsage(GetUsage(variant, *curr_locker_variant->locker));
-
-    LabelGLObject(GL_PROGRAM, program->handle, cpu_addr);
-    return program->handle;
-}
-
-bool CachedShader::EnsureValidLockerVariant() {
-    const auto previous_variant = curr_locker_variant;
-    if (curr_locker_variant && !curr_locker_variant->locker->IsConsistent()) {
-        curr_locker_variant = nullptr;
-    }
-    if (!curr_locker_variant) {
-        for (auto& variant : locker_variants) {
-            if (variant->locker->IsConsistent()) {
-                curr_locker_variant = variant.get();
-            }
-        }
-    }
-    if (!curr_locker_variant) {
-        auto& new_variant = locker_variants.emplace_back();
-        new_variant = std::make_unique<LockerVariant>();
-        new_variant->locker = MakeLocker(system, shader_type);
-        curr_locker_variant = new_variant.get();
-    }
-    return previous_variant == curr_locker_variant;
-}
-
-ShaderDiskCacheUsage CachedShader::GetUsage(const ProgramVariant& variant,
-                                            const ConstBufferLocker& locker) const {
-    return ShaderDiskCacheUsage{unique_identifier,         variant,
-                                locker.GetBoundBuffer(),   locker.GetKeys(),
-                                locker.GetBoundSamplers(), locker.GetBindlessSamplers()};
+                                     const PrecompiledShader& precompiled_shader,
+                                     std::size_t size_in_bytes) {
+    return std::shared_ptr<CachedShader>(new CachedShader(
+        params.host_ptr, params.cpu_addr, size_in_bytes, precompiled_shader.registry,
+        precompiled_shader.entries, precompiled_shader.program));
 }

 ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system,
@@ -432,16 +299,12 @@ ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System&

 void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
                                      const VideoCore::DiskResourceLoadCallback& callback) {
-    const auto transferable = disk_cache.LoadTransferable();
+    const std::optional transferable = disk_cache.LoadTransferable();
    if (!transferable) {
        return;
    }
-    const auto [raws, shader_usages] = *transferable;
-    if (!GenerateUnspecializedShaders(stop_loading, callback, raws) || stop_loading) {
-        return;
-    }

-    const auto dumps = disk_cache.LoadPrecompiled();
+    const std::vector gl_cache = disk_cache.LoadPrecompiled();
    const auto supported_formats = GetSupportedFormats();

    // Track if precompiled cache was altered during loading to know if we have to
@@ -450,77 +313,82 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,

    // Inform the frontend about shader build initialization
    if (callback) {
-        callback(VideoCore::LoadCallbackStage::Build, 0, shader_usages.size());
+        callback(VideoCore::LoadCallbackStage::Build, 0, transferable->size());
    }

    std::mutex mutex;
    std::size_t built_shaders = 0; // It doesn't have be atomic since it's used behind a mutex
-    std::atomic_bool compilation_failed = false;
+    std::atomic_bool gl_cache_failed = false;

-    const auto Worker = [&](Core::Frontend::GraphicsContext* context, std::size_t begin,
-                            std::size_t end, const std::vector<ShaderDiskCacheUsage>& shader_usages,
-                            const ShaderDumpsMap& dumps) {
+    const auto find_precompiled = [&gl_cache](u64 id) {
+        return std::find_if(gl_cache.begin(), gl_cache.end(),
+                            [id](const auto& entry) { return entry.unique_identifier == id; });
+    };
+
+    const auto worker = [&](Core::Frontend::GraphicsContext* context, std::size_t begin,
+                            std::size_t end) {
        context->MakeCurrent();
        SCOPE_EXIT({ return context->DoneCurrent(); });

        for (std::size_t i = begin; i < end; ++i) {
-            if (stop_loading || compilation_failed) {
+            if (stop_loading) {
                return;
            }
-            const auto& usage{shader_usages[i]};
-            const auto& unspecialized{unspecialized_shaders.at(usage.unique_identifier)};
-            const auto dump{dumps.find(usage)};
+            const auto& entry = (*transferable)[i];
+            const u64 uid = entry.unique_identifier;
+            const auto it = find_precompiled(uid);
+            const auto precompiled_entry = it != gl_cache.end() ? &*it : nullptr;

-            CachedProgram shader;
-            if (dump != dumps.end()) {
-                // If the shader is dumped, attempt to load it with
-                shader = GeneratePrecompiledProgram(dump->second, supported_formats);
-                if (!shader) {
-                    compilation_failed = true;
-                    return;
+            const bool is_compute = entry.type == ShaderType::Compute;
+            const u32 main_offset = is_compute ? KERNEL_MAIN_OFFSET : STAGE_MAIN_OFFSET;
+            auto registry = MakeRegistry(entry);
+            const ShaderIR ir(entry.code, main_offset, COMPILER_SETTINGS, *registry);
+
+            std::shared_ptr<OGLProgram> program;
+            if (precompiled_entry) {
+                // If the shader is precompiled, attempt to load it with
+                program = GeneratePrecompiledProgram(entry, *precompiled_entry, supported_formats);
+                if (!program) {
+                    gl_cache_failed = true;
                }
            }
-            if (!shader) {
-                auto locker{MakeLocker(system, unspecialized.type)};
-                FillLocker(*locker, usage);
-
-                shader = BuildShader(device, usage.unique_identifier, unspecialized.type,
-                                     unspecialized.code, unspecialized.code_b, *locker,
-                                     usage.variant, true);
+            if (!program) {
+                // Otherwise compile it from GLSL
+                program = BuildShader(device, entry.type, uid, ir, *registry, true);
            }

+            PrecompiledShader shader;
+            shader.program = std::move(program);
+            shader.registry = std::move(registry);
+            shader.entries = MakeEntries(ir);
+
            std::scoped_lock lock{mutex};
            if (callback) {
                callback(VideoCore::LoadCallbackStage::Build, ++built_shaders,
-                         shader_usages.size());
+                         transferable->size());
            }
-
-            precompiled_programs.emplace(usage, std::move(shader));
-
-            // TODO(Rodrigo): Is there a better way to do this?
-            precompiled_variants[usage.unique_identifier].push_back(
-                precompiled_programs.find(usage));
+            runtime_cache.emplace(entry.unique_identifier, std::move(shader));
        }
    };

    const auto num_workers{static_cast<std::size_t>(std::thread::hardware_concurrency() + 1ULL)};
-    const std::size_t bucket_size{shader_usages.size() / num_workers};
+    const std::size_t bucket_size{transferable->size() / num_workers};
    std::vector<std::unique_ptr<Core::Frontend::GraphicsContext>> contexts(num_workers);
    std::vector<std::thread> threads(num_workers);
    for (std::size_t i = 0; i < num_workers; ++i) {
        const bool is_last_worker = i + 1 == num_workers;
        const std::size_t start{bucket_size * i};
-        const std::size_t end{is_last_worker ? shader_usages.size() : start + bucket_size};
+        const std::size_t end{is_last_worker ? transferable->size() : start + bucket_size};

        // On some platforms the shared context has to be created from the GUI thread
        contexts[i] = emu_window.CreateSharedContext();
-        threads[i] = std::thread(Worker, contexts[i].get(), start, end, shader_usages, dumps);
+        threads[i] = std::thread(worker, contexts[i].get(), start, end);
    }
    for (auto& thread : threads) {
        thread.join();
    }

-    if (compilation_failed) {
+    if (gl_cache_failed) {
        // Invalidate the precompiled cache if a shader dumped shader was rejected
        disk_cache.InvalidatePrecompiled();
        precompiled_cache_altered = true;
@@ -533,11 +401,12 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
    // TODO(Rodrigo): Do state tracking for transferable shaders and do a dummy draw
    // before precompiling them

-    for (std::size_t i = 0; i < shader_usages.size(); ++i) {
-        const auto& usage{shader_usages[i]};
-        if (dumps.find(usage) == dumps.end()) {
-            const auto& program{precompiled_programs.at(usage)};
-            disk_cache.SaveDump(usage, program->handle);
+    for (std::size_t i = 0; i < transferable->size(); ++i) {
+        const u64 id = (*transferable)[i].unique_identifier;
+        const auto it = find_precompiled(id);
+        if (it == gl_cache.end()) {
+            const GLuint program = runtime_cache.at(id).program->handle;
+            disk_cache.SavePrecompiled(id, program);
            precompiled_cache_altered = true;
        }
    }
@@ -547,80 +416,29 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
    }
 }

-const PrecompiledVariants* ShaderCacheOpenGL::GetPrecompiledVariants(u64 unique_identifier) const {
-    const auto it = precompiled_variants.find(unique_identifier);
-    return it == precompiled_variants.end() ? nullptr : &it->second;
-}
-
-CachedProgram ShaderCacheOpenGL::GeneratePrecompiledProgram(
-    const ShaderDiskCacheDump& dump, const std::unordered_set<GLenum>& supported_formats) {
-    if (supported_formats.find(dump.binary_format) == supported_formats.end()) {
-        LOG_INFO(Render_OpenGL, "Precompiled cache entry with unsupported format - removing");
+std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram(
+    const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry,
+    const std::unordered_set<GLenum>& supported_formats) {
+    if (supported_formats.find(precompiled_entry.binary_format) == supported_formats.end()) {
+        LOG_INFO(Render_OpenGL, "Precompiled cache entry with unsupported format, removing");
        return {};
    }

-    CachedProgram shader = std::make_shared<OGLProgram>();
-    shader->handle = glCreateProgram();
-    glProgramParameteri(shader->handle, GL_PROGRAM_SEPARABLE, GL_TRUE);
-    glProgramBinary(shader->handle, dump.binary_format, dump.binary.data(),
-                    static_cast<GLsizei>(dump.binary.size()));
+    auto program = std::make_shared<OGLProgram>();
+    program->handle = glCreateProgram();
+    glProgramParameteri(program->handle, GL_PROGRAM_SEPARABLE, GL_TRUE);
+    glProgramBinary(program->handle, precompiled_entry.binary_format,
+                    precompiled_entry.binary.data(),
+                    static_cast<GLsizei>(precompiled_entry.binary.size()));

-    GLint link_status{};
-    glGetProgramiv(shader->handle, GL_LINK_STATUS, &link_status);
+    GLint link_status;
+    glGetProgramiv(program->handle, GL_LINK_STATUS, &link_status);
    if (link_status == GL_FALSE) {
-        LOG_INFO(Render_OpenGL, "Precompiled cache rejected by the driver - removing");
+        LOG_INFO(Render_OpenGL, "Precompiled cache rejected by the driver, removing");
        return {};
    }

-    return shader;
-}
-
-bool ShaderCacheOpenGL::GenerateUnspecializedShaders(
-    const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback,
-    const std::vector<ShaderDiskCacheRaw>& raws) {
-    if (callback) {
-        callback(VideoCore::LoadCallbackStage::Decompile, 0, raws.size());
-    }
-
-    for (std::size_t i = 0; i < raws.size(); ++i) {
-        if (stop_loading) {
-            return false;
-        }
-        const auto& raw{raws[i]};
-        const u64 unique_identifier{raw.GetUniqueIdentifier()};
-        const u64 calculated_hash{
-            GetUniqueIdentifier(raw.GetType(), raw.HasProgramA(), raw.GetCode(), raw.GetCodeB())};
-        if (unique_identifier != calculated_hash) {
-            LOG_ERROR(Render_OpenGL,
-                      "Invalid hash in entry={:016x} (obtained hash={:016x}) - "
-                      "removing shader cache",
-                      raw.GetUniqueIdentifier(), calculated_hash);
-            disk_cache.InvalidateTransferable();
-            return false;
-        }
-
-        const u32 main_offset =
-            raw.GetType() == ShaderType::Compute ? KERNEL_MAIN_OFFSET : STAGE_MAIN_OFFSET;
-        ConstBufferLocker locker(raw.GetType());
-        const ShaderIR ir(raw.GetCode(), main_offset, COMPILER_SETTINGS, locker);
-        // TODO(Rodrigo): Handle VertexA shaders
-        // std::optional<ShaderIR> ir_b;
-        // if (raw.HasProgramA()) {
-        //     ir_b.emplace(raw.GetProgramCodeB(), main_offset);
-        // }
-
-        UnspecializedShader unspecialized;
-        unspecialized.entries = GLShader::GetEntries(ir);
-        unspecialized.type = raw.GetType();
-        unspecialized.code = raw.GetCode();
-        unspecialized.code_b = raw.GetCodeB();
-        unspecialized_shaders.emplace(raw.GetUniqueIdentifier(), unspecialized);
-
-        if (callback) {
-            callback(VideoCore::LoadCallbackStage::Decompile, i, raws.size());
-        }
-    }
-    return true;
+    return program;
 }

 Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
@@ -648,17 +466,17 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {

    const auto unique_identifier = GetUniqueIdentifier(
        GetShaderType(program), program == Maxwell::ShaderProgram::VertexA, code, code_b);
-    const auto precompiled_variants = GetPrecompiledVariants(unique_identifier);
    const auto cpu_addr{*memory_manager.GpuToCpuAddress(address)};
-    const ShaderParameters params{system,   disk_cache, precompiled_variants, device,
+    const ShaderParameters params{system,   disk_cache, device,
                                  cpu_addr, host_ptr,   unique_identifier};

-    const auto found = unspecialized_shaders.find(unique_identifier);
-    if (found == unspecialized_shaders.end()) {
+    const auto found = runtime_cache.find(unique_identifier);
+    if (found == runtime_cache.end()) {
        shader = CachedShader::CreateStageFromMemory(params, program, std::move(code),
                                                     std::move(code_b));
    } else {
-        shader = CachedShader::CreateFromCache(params, found->second);
+        const std::size_t size_in_bytes = code.size() * sizeof(u64);
+        shader = CachedShader::CreateFromCache(params, found->second, size_in_bytes);
    }
    Register(shader);

@@ -673,19 +491,19 @@ Shader ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) {
        return kernel;
    }

-    // No kernel found - create a new one
+    // No kernel found, create a new one
    auto code{GetShaderCode(memory_manager, code_addr, host_ptr)};
-    const auto unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code, {})};
-    const auto precompiled_variants = GetPrecompiledVariants(unique_identifier);
+    const auto unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code)};
    const auto cpu_addr{*memory_manager.GpuToCpuAddress(code_addr)};
-    const ShaderParameters params{system,   disk_cache, precompiled_variants, device,
+    const ShaderParameters params{system,   disk_cache, device,
                                  cpu_addr, host_ptr,   unique_identifier};

-    const auto found = unspecialized_shaders.find(unique_identifier);
-    if (found == unspecialized_shaders.end()) {
+    const auto found = runtime_cache.find(unique_identifier);
+    if (found == runtime_cache.end()) {
        kernel = CachedShader::CreateKernelFromMemory(params, std::move(code));
    } else {
-        kernel = CachedShader::CreateFromCache(params, found->second);
+        const std::size_t size_in_bytes = code.size() * sizeof(u64);
+        kernel = CachedShader::CreateFromCache(params, found->second, size_in_bytes);
    }

    Register(kernel);
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -22,7 +22,7 @@
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_shader_decompiler.h"
 #include "video_core/renderer_opengl/gl_shader_disk_cache.h"
-#include "video_core/shader/const_buffer_locker.h"
+#include "video_core/shader/registry.h"
 #include "video_core/shader/shader_ir.h"

 namespace Core {
@@ -41,22 +41,17 @@ class RasterizerOpenGL;
 struct UnspecializedShader;

 using Shader = std::shared_ptr<CachedShader>;
-using CachedProgram = std::shared_ptr<OGLProgram>;
 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
-using PrecompiledPrograms = std::unordered_map<ShaderDiskCacheUsage, CachedProgram>;
-using PrecompiledVariants = std::vector<PrecompiledPrograms::iterator>;

-struct UnspecializedShader {
-    GLShader::ShaderEntries entries;
-    Tegra::Engines::ShaderType type;
-    ProgramCode code;
-    ProgramCode code_b;
+struct PrecompiledShader {
+    std::shared_ptr<OGLProgram> program;
+    std::shared_ptr<VideoCommon::Shader::Registry> registry;
+    ShaderEntries entries;
 };

 struct ShaderParameters {
    Core::System& system;
    ShaderDiskCacheOpenGL& disk_cache;
-    const PrecompiledVariants* precompiled_variants;
    const Device& device;
    VAddr cpu_addr;
    u8* host_ptr;
@@ -65,61 +60,45 @@ struct ShaderParameters {

 class CachedShader final : public RasterizerCacheObject {
 public:
+    ~CachedShader();
+
+    /// Gets the GL program handle for the shader
+    GLuint GetHandle() const;
+
+    /// Returns the guest CPU address of the shader
+    VAddr GetCpuAddr() const override {
+        return cpu_addr;
+    }
+
+    /// Returns the size in bytes of the shader
+    std::size_t GetSizeInBytes() const override {
+        return size_in_bytes;
+    }
+
+    /// Gets the shader entries for the shader
+    const ShaderEntries& GetEntries() const {
+        return entries;
+    }
+
    static Shader CreateStageFromMemory(const ShaderParameters& params,
                                        Maxwell::ShaderProgram program_type,
                                        ProgramCode program_code, ProgramCode program_code_b);
    static Shader CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code);

    static Shader CreateFromCache(const ShaderParameters& params,
-                                  const UnspecializedShader& unspecialized);
-
-    VAddr GetCpuAddr() const override {
-        return cpu_addr;
-    }
-
-    std::size_t GetSizeInBytes() const override {
-        return code.size() * sizeof(u64);
-    }
-
-    /// Gets the shader entries for the shader
-    const GLShader::ShaderEntries& GetShaderEntries() const {
-        return entries;
-    }
-
-    /// Gets the GL program handle for the shader
-    GLuint GetHandle(const ProgramVariant& variant);
+                                  const PrecompiledShader& precompiled_shader,
+                                  std::size_t size_in_bytes);

 private:
-    struct LockerVariant {
-        std::unique_ptr<VideoCommon::Shader::ConstBufferLocker> locker;
-        std::unordered_map<ProgramVariant, CachedProgram> programs;
-    };
+    explicit CachedShader(const u8* host_ptr, VAddr cpu_addr, std::size_t size_in_bytes,
+                          std::shared_ptr<VideoCommon::Shader::Registry> registry,
+                          ShaderEntries entries, std::shared_ptr<OGLProgram> program);

-    explicit CachedShader(const ShaderParameters& params, Tegra::Engines::ShaderType shader_type,
-                          GLShader::ShaderEntries entries, ProgramCode program_code,
-                          ProgramCode program_code_b);
-
-    bool EnsureValidLockerVariant();
-
-    ShaderDiskCacheUsage GetUsage(const ProgramVariant& variant,
-                                  const VideoCommon::Shader::ConstBufferLocker& locker) const;
-
-    Core::System& system;
-    ShaderDiskCacheOpenGL& disk_cache;
-    const Device& device;
-
-    VAddr cpu_addr{};
-
-    u64 unique_identifier{};
-    Tegra::Engines::ShaderType shader_type{};
-
-    GLShader::ShaderEntries entries;
-
-    ProgramCode code;
-    ProgramCode code_b;
-
-    LockerVariant* curr_locker_variant = nullptr;
-    std::vector<std::unique_ptr<LockerVariant>> locker_variants;
+    std::shared_ptr<VideoCommon::Shader::Registry> registry;
+    ShaderEntries entries;
+    VAddr cpu_addr = 0;
+    std::size_t size_in_bytes = 0;
+    std::shared_ptr<OGLProgram> program;
 };

 class ShaderCacheOpenGL final : public RasterizerCache<Shader> {
@@ -142,25 +121,15 @@ protected:
    void FlushObjectInner(const Shader& object) override {}

 private:
-    bool GenerateUnspecializedShaders(const std::atomic_bool& stop_loading,
-                                      const VideoCore::DiskResourceLoadCallback& callback,
-                                      const std::vector<ShaderDiskCacheRaw>& raws);
-
-    CachedProgram GeneratePrecompiledProgram(const ShaderDiskCacheDump& dump,
-                                             const std::unordered_set<GLenum>& supported_formats);
-
-    const PrecompiledVariants* GetPrecompiledVariants(u64 unique_identifier) const;
+    std::shared_ptr<OGLProgram> GeneratePrecompiledProgram(
+        const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry,
+        const std::unordered_set<GLenum>& supported_formats);

    Core::System& system;
    Core::Frontend::EmuWindow& emu_window;
    const Device& device;
-
    ShaderDiskCacheOpenGL disk_cache;
-
-    PrecompiledPrograms precompiled_programs;
-    std::unordered_map<u64, PrecompiledVariants> precompiled_variants;
-
-    std::unordered_map<u64, UnspecializedShader> unspecialized_shaders;
+    std::unordered_map<u64, PrecompiledShader> runtime_cache;

    std::array<Shader, Maxwell::MaxShaderProgram> last_shaders;
 };
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -23,8 +23,9 @@
 #include "video_core/shader/ast.h"
 #include "video_core/shader/node.h"
 #include "video_core/shader/shader_ir.h"
+#include "video_core/shader/transform_feedback.h"

-namespace OpenGL::GLShader {
+namespace OpenGL {

 namespace {

@@ -36,6 +37,8 @@ using Tegra::Shader::IpaInterpMode;
 using Tegra::Shader::IpaMode;
 using Tegra::Shader::IpaSampleMode;
 using Tegra::Shader::Register;
+using VideoCommon::Shader::BuildTransformFeedback;
+using VideoCommon::Shader::Registry;

 using namespace std::string_literals;
 using namespace VideoCommon::Shader;
@@ -48,6 +51,11 @@ class ExprDecompiler;

 enum class Type { Void, Bool, Bool2, Float, Int, Uint, HalfFloat };

+constexpr std::array FLOAT_TYPES{"float", "vec2", "vec3", "vec4"};
+
+constexpr std::string_view INPUT_ATTRIBUTE_NAME = "in_attr";
+constexpr std::string_view OUTPUT_ATTRIBUTE_NAME = "out_attr";
+
 struct TextureOffset {};
 struct TextureDerivates {};
 using TextureArgument = std::pair<Type, Node>;
@@ -56,6 +64,25 @@ using TextureIR = std::variant<TextureOffset, TextureDerivates, TextureArgument>
 constexpr u32 MAX_CONSTBUFFER_ELEMENTS =
    static_cast<u32>(Maxwell::MaxConstBufferSize) / (4 * sizeof(float));

+constexpr std::string_view CommonDeclarations = R"(#define ftoi floatBitsToInt
+#define ftou floatBitsToUint
+#define itof intBitsToFloat
+#define utof uintBitsToFloat
+
+bvec2 HalfFloatNanComparison(bvec2 comparison, vec2 pair1, vec2 pair2) {{
+    bvec2 is_nan1 = isnan(pair1);
+    bvec2 is_nan2 = isnan(pair2);
+    return bvec2(comparison.x || is_nan1.x || is_nan2.x, comparison.y || is_nan1.y || is_nan2.y);
+}}
+
+const float fswzadd_modifiers_a[] = float[4](-1.0f,  1.0f, -1.0f,  0.0f );
+const float fswzadd_modifiers_b[] = float[4](-1.0f, -1.0f,  1.0f, -1.0f );
+
+layout (std140, binding = {}) uniform vs_config {{
+    float y_direction;
+}};
+)";
+
 class ShaderWriter final {
 public:
    void AddExpression(std::string_view text) {
@@ -269,12 +296,41 @@ const char* GetImageTypeDeclaration(Tegra::Shader::ImageType image_type) {
    }
 }

+/// Describes primitive behavior on geometry shaders
+std::pair<const char*, u32> GetPrimitiveDescription(Maxwell::PrimitiveTopology topology) {
+    switch (topology) {
+    case Maxwell::PrimitiveTopology::Points:
+        return {"points", 1};
+    case Maxwell::PrimitiveTopology::Lines:
+    case Maxwell::PrimitiveTopology::LineStrip:
+        return {"lines", 2};
+    case Maxwell::PrimitiveTopology::LinesAdjacency:
+    case Maxwell::PrimitiveTopology::LineStripAdjacency:
+        return {"lines_adjacency", 4};
+    case Maxwell::PrimitiveTopology::Triangles:
+    case Maxwell::PrimitiveTopology::TriangleStrip:
+    case Maxwell::PrimitiveTopology::TriangleFan:
+        return {"triangles", 3};
+    case Maxwell::PrimitiveTopology::TrianglesAdjacency:
+    case Maxwell::PrimitiveTopology::TriangleStripAdjacency:
+        return {"triangles_adjacency", 6};
+    default:
+        UNIMPLEMENTED_MSG("topology={}", static_cast<int>(topology));
+        return {"points", 1};
+    }
+}
+
 /// Generates code to use for a swizzle operation.
-constexpr const char* GetSwizzle(u32 element) {
+constexpr const char* GetSwizzle(std::size_t element) {
    constexpr std::array swizzle = {".x", ".y", ".z", ".w"};
    return swizzle.at(element);
 }

+constexpr const char* GetColorSwizzle(std::size_t element) {
+    constexpr std::array swizzle = {".r", ".g", ".b", ".a"};
+    return swizzle.at(element);
+}
+
 /// Translate topology
 std::string GetTopologyName(Tegra::Shader::OutputTopology topology) {
    switch (topology) {
@@ -341,11 +397,66 @@ std::string FlowStackTopName(MetaStackClass stack) {
    return stage == ShaderType::Vertex;
 }

+struct GenericVaryingDescription {
+    std::string name;
+    u8 first_element = 0;
+    bool is_scalar = false;
+};
+
 class GLSLDecompiler final {
 public:
-    explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ShaderType stage,
-                            std::string suffix)
-        : device{device}, ir{ir}, stage{stage}, suffix{suffix}, header{ir.GetHeader()} {}
+    explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
+                            ShaderType stage, std::string_view identifier, std::string_view suffix)
+        : device{device}, ir{ir}, registry{registry}, stage{stage},
+          identifier{identifier}, suffix{suffix}, header{ir.GetHeader()} {
+        if (stage != ShaderType::Compute) {
+            transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo());
+        }
+    }
+
+    void Decompile() {
+        DeclareHeader();
+        DeclareVertex();
+        DeclareGeometry();
+        DeclareFragment();
+        DeclareCompute();
+        DeclareInputAttributes();
+        DeclareOutputAttributes();
+        DeclareImages();
+        DeclareSamplers();
+        DeclareGlobalMemory();
+        DeclareConstantBuffers();
+        DeclareLocalMemory();
+        DeclareRegisters();
+        DeclarePredicates();
+        DeclareInternalFlags();
+        DeclareCustomVariables();
+        DeclarePhysicalAttributeReader();
+
+        code.AddLine("void main() {{");
+        ++code.scope;
+
+        if (stage == ShaderType::Vertex) {
+            code.AddLine("gl_Position = vec4(0.0f, 0.0f, 0.0f, 1.0f);");
+        }
+
+        if (ir.IsDecompiled()) {
+            DecompileAST();
+        } else {
+            DecompileBranchMode();
+        }
+
+        --code.scope;
+        code.AddLine("}}");
+    }
+
+    std::string GetResult() {
+        return code.GetResult();
+    }
+
+private:
+    friend class ASTDecompiler;
+    friend class ExprDecompiler;

    void DecompileBranchMode() {
        // VM's program counter
@@ -387,43 +498,36 @@ public:

    void DecompileAST();

-    void Decompile() {
-        DeclareVertex();
-        DeclareGeometry();
-        DeclareRegisters();
-        DeclareCustomVariables();
-        DeclarePredicates();
-        DeclareLocalMemory();
-        DeclareInternalFlags();
-        DeclareInputAttributes();
-        DeclareOutputAttributes();
-        DeclareConstantBuffers();
-        DeclareGlobalMemory();
-        DeclareSamplers();
-        DeclareImages();
-        DeclarePhysicalAttributeReader();
-
-        code.AddLine("void execute_{}() {{", suffix);
-        ++code.scope;
-
-        if (ir.IsDecompiled()) {
-            DecompileAST();
-        } else {
-            DecompileBranchMode();
+    void DeclareHeader() {
+        if (!identifier.empty()) {
+            code.AddLine("// {}", identifier);
        }
+        code.AddLine("#version 440 core");
+        code.AddLine("#extension GL_ARB_separate_shader_objects : enable");
+        if (device.HasShaderBallot()) {
+            code.AddLine("#extension GL_ARB_shader_ballot : require");
+        }
+        if (device.HasVertexViewportLayer()) {
+            code.AddLine("#extension GL_ARB_shader_viewport_layer_array : require");
+        }
+        if (device.HasImageLoadFormatted()) {
+            code.AddLine("#extension GL_EXT_shader_image_load_formatted : require");
+        }
+        if (device.HasWarpIntrinsics()) {
+            code.AddLine("#extension GL_NV_gpu_shader5 : require");
+            code.AddLine("#extension GL_NV_shader_thread_group : require");
+            code.AddLine("#extension GL_NV_shader_thread_shuffle : require");
+        }
+        // This pragma stops Nvidia's driver from over optimizing math (probably using fp16
+        // operations) on places where we don't want to.
+        // Thanks to Ryujinx for finding this workaround.
+        code.AddLine("#pragma optionNV(fastmath off)");

-        --code.scope;
-        code.AddLine("}}");
+        code.AddNewLine();
+
+        code.AddLine(CommonDeclarations, EmulationUniformBlockBinding);
    }

-    std::string GetResult() {
-        return code.GetResult();
-    }
-
-private:
-    friend class ASTDecompiler;
-    friend class ExprDecompiler;
-
    void DeclareVertex() {
        if (!IsVertexShader(stage))
            return;
@@ -436,9 +540,15 @@ private:
            return;
        }

+        const auto& info = registry.GetGraphicsInfo();
+        const auto input_topology = info.primitive_topology;
+        const auto [glsl_topology, max_vertices] = GetPrimitiveDescription(input_topology);
+        max_input_vertices = max_vertices;
+        code.AddLine("layout ({}) in;", glsl_topology);
+
        const auto topology = GetTopologyName(header.common3.output_topology);
-        const auto max_vertices = header.common4.max_output_vertices.Value();
-        code.AddLine("layout ({}, max_vertices = {}) out;", topology, max_vertices);
+        const auto max_output_vertices = header.common4.max_output_vertices.Value();
+        code.AddLine("layout ({}, max_vertices = {}) out;", topology, max_output_vertices);
        code.AddNewLine();

        code.AddLine("in gl_PerVertex {{");
@@ -450,11 +560,40 @@ private:
        DeclareVertexRedeclarations();
    }

+    void DeclareFragment() {
+        if (stage != ShaderType::Fragment) {
+            return;
+        }
+        for (u32 rt = 0; rt < Maxwell::NumRenderTargets; ++rt) {
+            code.AddLine("layout (location = {}) out vec4 frag_color{};", rt, rt);
+        }
+    }
+
+    void DeclareCompute() {
+        if (stage != ShaderType::Compute) {
+            return;
+        }
+        const auto& info = registry.GetComputeInfo();
+        if (const u32 size = info.shared_memory_size_in_words; size > 0) {
+            code.AddLine("shared uint smem[{}];", size);
+            code.AddNewLine();
+        }
+        code.AddLine("layout (local_size_x = {}, local_size_y = {}, local_size_z = {}) in;",
+                     info.workgroup_size[0], info.workgroup_size[1], info.workgroup_size[2]);
+        code.AddNewLine();
+    }
+
    void DeclareVertexRedeclarations() {
        code.AddLine("out gl_PerVertex {{");
        ++code.scope;

-        code.AddLine("vec4 gl_Position;");
+        auto pos_xfb = GetTransformFeedbackDecoration(Attribute::Index::Position);
+        if (!pos_xfb.empty()) {
+            pos_xfb = fmt::format("layout ({}) ", pos_xfb);
+        }
+        const char* pos_type =
+            FLOAT_TYPES.at(GetNumComponents(Attribute::Index::Position).value_or(4) - 1);
+        code.AddLine("{}{} gl_Position;", pos_xfb, pos_type);

        for (const auto attribute : ir.GetOutputAttributes()) {
            if (attribute == Attribute::Index::ClipDistances0123 ||
@@ -525,18 +664,16 @@ private:
    }

    void DeclareLocalMemory() {
+        u64 local_memory_size = 0;
        if (stage == ShaderType::Compute) {
-            code.AddLine("#ifdef LOCAL_MEMORY_SIZE");
-            code.AddLine("uint {}[LOCAL_MEMORY_SIZE];", GetLocalMemory());
-            code.AddLine("#endif");
-            return;
+            local_memory_size = registry.GetComputeInfo().local_memory_size_in_words * 4ULL;
+        } else {
+            local_memory_size = header.GetLocalMemorySize();
        }
-
-        const u64 local_memory_size = header.GetLocalMemorySize();
        if (local_memory_size == 0) {
            return;
        }
-        const auto element_count = Common::AlignUp(local_memory_size, 4) / 4;
+        const u64 element_count = Common::AlignUp(local_memory_size, 4) / 4;
        code.AddLine("uint {}[{}];", GetLocalMemory(), element_count);
        code.AddNewLine();
    }
@@ -589,7 +726,7 @@ private:
    void DeclareInputAttribute(Attribute::Index index, bool skip_unused) {
        const u32 location{GetGenericAttributeIndex(index)};

-        std::string name{GetInputAttribute(index)};
+        std::string name{GetGenericInputAttribute(index)};
        if (stage == ShaderType::Geometry) {
            name = "gs_" + name + "[]";
        }
@@ -626,9 +763,59 @@ private:
        }
    }

+    std::optional<std::size_t> GetNumComponents(Attribute::Index index, u8 element = 0) const {
+        const u8 location = static_cast<u8>(static_cast<u32>(index) * 4 + element);
+        const auto it = transform_feedback.find(location);
+        if (it == transform_feedback.end()) {
+            return {};
+        }
+        return it->second.components;
+    }
+
+    std::string GetTransformFeedbackDecoration(Attribute::Index index, u8 element = 0) const {
+        const u8 location = static_cast<u8>(static_cast<u32>(index) * 4 + element);
+        const auto it = transform_feedback.find(location);
+        if (it == transform_feedback.end()) {
+            return {};
+        }
+
+        const VaryingTFB& tfb = it->second;
+        return fmt::format("xfb_buffer = {}, xfb_offset = {}, xfb_stride = {}", tfb.buffer,
+                           tfb.offset, tfb.stride);
+    }
+
    void DeclareOutputAttribute(Attribute::Index index) {
-        const u32 location{GetGenericAttributeIndex(index)};
-        code.AddLine("layout (location = {}) out vec4 {};", location, GetOutputAttribute(index));
+        static constexpr std::string_view swizzle = "xyzw";
+        u8 element = 0;
+        while (element < 4) {
+            auto xfb = GetTransformFeedbackDecoration(index, element);
+            if (!xfb.empty()) {
+                xfb = fmt::format(", {}", xfb);
+            }
+            const std::size_t remainder = 4 - element;
+            const std::size_t num_components = GetNumComponents(index, element).value_or(remainder);
+            const char* const type = FLOAT_TYPES.at(num_components - 1);
+
+            const u32 location = GetGenericAttributeIndex(index);
+
+            GenericVaryingDescription description;
+            description.first_element = static_cast<u8>(element);
+            description.is_scalar = num_components == 1;
+            description.name = AppendSuffix(location, OUTPUT_ATTRIBUTE_NAME);
+            if (element != 0 || num_components != 4) {
+                const std::string_view name_swizzle = swizzle.substr(element, num_components);
+                description.name = fmt::format("{}_{}", description.name, name_swizzle);
+            }
+            for (std::size_t i = 0; i < num_components; ++i) {
+                const u8 offset = static_cast<u8>(location * 4 + element + i);
+                varying_description.insert({offset, description});
+            }
+
+            code.AddLine("layout (location = {}, component = {}{}) out {} {};", location, element,
+                         xfb, type, description.name);
+
+            element = static_cast<u8>(static_cast<std::size_t>(element) + num_components);
+        }
    }

    void DeclareConstantBuffers() {
@@ -925,7 +1112,8 @@ private:
                // TODO(Rodrigo): Guard geometry inputs against out of bound reads. Some games
                // set an 0x80000000 index for those and the shader fails to build. Find out why
                // this happens and what's its intent.
-                return fmt::format("gs_{}[{} % MAX_VERTEX_INPUT]", name, Visit(buffer).AsUint());
+                return fmt::format("gs_{}[{} % {}]", name, Visit(buffer).AsUint(),
+                                   max_input_vertices.value());
            }
            return std::string(name);
        };
@@ -980,7 +1168,7 @@ private:
            return {"0", Type::Int};
        default:
            if (IsGenericAttribute(attribute)) {
-                return {GeometryPass(GetInputAttribute(attribute)) + GetSwizzle(element),
+                return {GeometryPass(GetGenericInputAttribute(attribute)) + GetSwizzle(element),
                        Type::Float};
            }
            break;
@@ -1049,8 +1237,7 @@ private:
            return {{fmt::format("gl_ClipDistance[{}]", abuf->GetElement() + 4), Type::Float}};
        default:
            if (IsGenericAttribute(attribute)) {
-                return {
-                    {GetOutputAttribute(attribute) + GetSwizzle(abuf->GetElement()), Type::Float}};
+                return {{GetGenericOutputAttribute(attribute, abuf->GetElement()), Type::Float}};
            }
            UNIMPLEMENTED_MSG("Unhandled output attribute: {}", static_cast<u32>(attribute));
            return {};
@@ -1822,16 +2009,19 @@ private:
        expr += GetSampler(meta->sampler);
        expr += ", ";

-        expr += constructors.at(operation.GetOperandsCount() - 1);
+        expr += constructors.at(operation.GetOperandsCount() + (meta->array ? 1 : 0) - 1);
        expr += '(';
        for (std::size_t i = 0; i < count; ++i) {
-            expr += VisitOperand(operation, i).AsInt();
-            const std::size_t next = i + 1;
-            if (next == count)
-                expr += ')';
-            else if (next < count)
+            if (i > 0) {
                expr += ", ";
+            }
+            expr += VisitOperand(operation, i).AsInt();
        }
+        if (meta->array) {
+            expr += ", ";
+            expr += Visit(meta->array).AsInt();
+        }
+        expr += ')';

        if (meta->lod && !meta->sampler.IsBuffer()) {
            expr += ", ";
@@ -1945,7 +2135,7 @@ private:
            // TODO(Subv): Figure out how dual-source blending is configured in the Switch.
            for (u32 component = 0; component < 4; ++component) {
                if (header.ps.IsColorComponentOutputEnabled(render_target, component)) {
-                    code.AddLine("FragColor{}[{}] = {};", render_target, component,
+                    code.AddLine("frag_color{}{} = {};", render_target, GetColorSwizzle(component),
                                 SafeGetRegister(current_reg).AsFloat());
                    ++current_reg;
                }
@@ -2261,27 +2451,34 @@ private:
    static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));

    std::string GetRegister(u32 index) const {
-        return GetDeclarationWithSuffix(index, "gpr");
+        return AppendSuffix(index, "gpr");
    }

    std::string GetCustomVariable(u32 index) const {
-        return GetDeclarationWithSuffix(index, "custom_var");
+        return AppendSuffix(index, "custom_var");
    }

    std::string GetPredicate(Tegra::Shader::Pred pred) const {
-        return GetDeclarationWithSuffix(static_cast<u32>(pred), "pred");
+        return AppendSuffix(static_cast<u32>(pred), "pred");
    }

-    std::string GetInputAttribute(Attribute::Index attribute) const {
-        return GetDeclarationWithSuffix(GetGenericAttributeIndex(attribute), "input_attr");
+    std::string GetGenericInputAttribute(Attribute::Index attribute) const {
+        return AppendSuffix(GetGenericAttributeIndex(attribute), INPUT_ATTRIBUTE_NAME);
    }

-    std::string GetOutputAttribute(Attribute::Index attribute) const {
-        return GetDeclarationWithSuffix(GetGenericAttributeIndex(attribute), "output_attr");
+    std::unordered_map<u8, GenericVaryingDescription> varying_description;
+
+    std::string GetGenericOutputAttribute(Attribute::Index attribute, std::size_t element) const {
+        const u8 offset = static_cast<u8>(GetGenericAttributeIndex(attribute) * 4 + element);
+        const auto& description = varying_description.at(offset);
+        if (description.is_scalar) {
+            return description.name;
+        }
+        return fmt::format("{}[{}]", description.name, element - description.first_element);
    }

    std::string GetConstBuffer(u32 index) const {
-        return GetDeclarationWithSuffix(index, "cbuf");
+        return AppendSuffix(index, "cbuf");
    }

    std::string GetGlobalMemory(const GlobalMemoryBase& descriptor) const {
@@ -2294,11 +2491,15 @@ private:
    }

    std::string GetConstBufferBlock(u32 index) const {
-        return GetDeclarationWithSuffix(index, "cbuf_block");
+        return AppendSuffix(index, "cbuf_block");
    }

    std::string GetLocalMemory() const {
-        return "lmem_" + suffix;
+        if (suffix.empty()) {
+            return "lmem";
+        } else {
+            return "lmem_" + std::string{suffix};
+        }
    }

    std::string GetInternalFlag(InternalFlag flag) const {
@@ -2307,19 +2508,27 @@ private:
        const auto index = static_cast<u32>(flag);
        ASSERT(index < static_cast<u32>(InternalFlag::Amount));

-        return fmt::format("{}_{}", InternalFlagNames[index], suffix);
+        if (suffix.empty()) {
+            return InternalFlagNames[index];
+        } else {
+            return fmt::format("{}_{}", InternalFlagNames[index], suffix);
+        }
    }

    std::string GetSampler(const Sampler& sampler) const {
-        return GetDeclarationWithSuffix(static_cast<u32>(sampler.GetIndex()), "sampler");
+        return AppendSuffix(static_cast<u32>(sampler.GetIndex()), "sampler");
    }

    std::string GetImage(const Image& image) const {
-        return GetDeclarationWithSuffix(static_cast<u32>(image.GetIndex()), "image");
+        return AppendSuffix(static_cast<u32>(image.GetIndex()), "image");
    }

-    std::string GetDeclarationWithSuffix(u32 index, std::string_view name) const {
-        return fmt::format("{}_{}_{}", name, index, suffix);
+    std::string AppendSuffix(u32 index, std::string_view name) const {
+        if (suffix.empty()) {
+            return fmt::format("{}{}", name, index);
+        } else {
+            return fmt::format("{}{}_{}", name, index, suffix);
+        }
    }

    u32 GetNumPhysicalInputAttributes() const {
@@ -2334,17 +2543,31 @@ private:
        return std::min<u32>(device.GetMaxVaryings(), Maxwell::NumVaryings);
    }

+    bool IsRenderTargetEnabled(u32 render_target) const {
+        for (u32 component = 0; component < 4; ++component) {
+            if (header.ps.IsColorComponentOutputEnabled(render_target, component)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
    const Device& device;
    const ShaderIR& ir;
+    const Registry& registry;
    const ShaderType stage;
-    const std::string suffix;
+    const std::string_view identifier;
+    const std::string_view suffix;
    const Header header;
+    std::unordered_map<u8, VaryingTFB> transform_feedback;

    ShaderWriter code;
+
+    std::optional<u32> max_input_vertices;
 };

-std::string GetFlowVariable(u32 i) {
-    return fmt::format("flow_var_{}", i);
+std::string GetFlowVariable(u32 index) {
+    return fmt::format("flow_var{}", index);
 }

 class ExprDecompiler {
@@ -2531,7 +2754,7 @@ void GLSLDecompiler::DecompileAST() {

 } // Anonymous namespace

-ShaderEntries GetEntries(const VideoCommon::Shader::ShaderIR& ir) {
+ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) {
    ShaderEntries entries;
    for (const auto& cbuf : ir.GetConstantBuffers()) {
        entries.const_buffers.emplace_back(cbuf.second.GetMaxOffset(), cbuf.second.IsIndirect(),
@@ -2555,28 +2778,12 @@ ShaderEntries GetEntries(const VideoCommon::Shader::ShaderIR& ir) {
    return entries;
 }

-std::string GetCommonDeclarations() {
-    return R"(#define ftoi floatBitsToInt
-#define ftou floatBitsToUint
-#define itof intBitsToFloat
-#define utof uintBitsToFloat
-
-bvec2 HalfFloatNanComparison(bvec2 comparison, vec2 pair1, vec2 pair2) {
-    bvec2 is_nan1 = isnan(pair1);
-    bvec2 is_nan2 = isnan(pair2);
-    return bvec2(comparison.x || is_nan1.x || is_nan2.x, comparison.y || is_nan1.y || is_nan2.y);
-}
-
-const float fswzadd_modifiers_a[] = float[4](-1.0f,  1.0f, -1.0f,  0.0f );
-const float fswzadd_modifiers_b[] = float[4](-1.0f, -1.0f,  1.0f, -1.0f );
-)";
-}
-
-std::string Decompile(const Device& device, const ShaderIR& ir, ShaderType stage,
-                      const std::string& suffix) {
-    GLSLDecompiler decompiler(device, ir, stage, suffix);
+std::string DecompileShader(const Device& device, const ShaderIR& ir, const Registry& registry,
+                            ShaderType stage, std::string_view identifier,
+                            std::string_view suffix) {
+    GLSLDecompiler decompiler(device, ir, registry, stage, identifier, suffix);
    decompiler.Decompile();
    return decompiler.GetResult();
 }

-} // namespace OpenGL::GLShader
+} // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.h
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h
@@ -6,22 +6,18 @@

 #include <array>
 #include <string>
+#include <string_view>
 #include <utility>
 #include <vector>
 #include "common/common_types.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/shader_type.h"
+#include "video_core/shader/registry.h"
 #include "video_core/shader/shader_ir.h"

-namespace VideoCommon::Shader {
-class ShaderIR;
-}
-
 namespace OpenGL {
-class Device;
-}

-namespace OpenGL::GLShader {
+class Device;

 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 using SamplerEntry = VideoCommon::Shader::Sampler;
@@ -78,11 +74,11 @@ struct ShaderEntries {
    std::size_t shader_length{};
 };

-ShaderEntries GetEntries(const VideoCommon::Shader::ShaderIR& ir);
+ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir);

-std::string GetCommonDeclarations();
+std::string DecompileShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
+                            const VideoCommon::Shader::Registry& registry,
+                            Tegra::Engines::ShaderType stage, std::string_view identifier,
+                            std::string_view suffix = {});

-std::string Decompile(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
-                      Tegra::Engines::ShaderType stage, const std::string& suffix);
-
-} // namespace OpenGL::GLShader
+} // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
@@ -31,32 +31,24 @@ namespace {

 using ShaderCacheVersionHash = std::array<u8, 64>;

-enum class TransferableEntryKind : u32 {
-    Raw,
-    Usage,
-};
-
 struct ConstBufferKey {
-    u32 cbuf{};
-    u32 offset{};
-    u32 value{};
+    u32 cbuf = 0;
+    u32 offset = 0;
+    u32 value = 0;
 };

 struct BoundSamplerKey {
-    u32 offset{};
-    Tegra::Engines::SamplerDescriptor sampler{};
+    u32 offset = 0;
+    Tegra::Engines::SamplerDescriptor sampler;
 };

 struct BindlessSamplerKey {
-    u32 cbuf{};
-    u32 offset{};
-    Tegra::Engines::SamplerDescriptor sampler{};
+    u32 cbuf = 0;
+    u32 offset = 0;
+    Tegra::Engines::SamplerDescriptor sampler;
 };

-constexpr u32 NativeVersion = 12;
-
-// Making sure sizes doesn't change by accident
-static_assert(sizeof(ProgramVariant) == 20);
+constexpr u32 NativeVersion = 20;

 ShaderCacheVersionHash GetShaderCacheVersionHash() {
    ShaderCacheVersionHash hash{};
@@ -67,61 +59,124 @@ ShaderCacheVersionHash GetShaderCacheVersionHash() {

 } // Anonymous namespace

-ShaderDiskCacheRaw::ShaderDiskCacheRaw(u64 unique_identifier, ShaderType type, ProgramCode code,
-                                       ProgramCode code_b)
-    : unique_identifier{unique_identifier}, type{type}, code{std::move(code)}, code_b{std::move(
-                                                                                   code_b)} {}
+ShaderDiskCacheEntry::ShaderDiskCacheEntry() = default;

-ShaderDiskCacheRaw::ShaderDiskCacheRaw() = default;
+ShaderDiskCacheEntry::~ShaderDiskCacheEntry() = default;

-ShaderDiskCacheRaw::~ShaderDiskCacheRaw() = default;
-
-bool ShaderDiskCacheRaw::Load(FileUtil::IOFile& file) {
-    if (file.ReadBytes(&unique_identifier, sizeof(u64)) != sizeof(u64) ||
-        file.ReadBytes(&type, sizeof(u32)) != sizeof(u32)) {
+bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) {
+    if (file.ReadBytes(&type, sizeof(u32)) != sizeof(u32)) {
        return false;
    }
-    u32 code_size{};
-    u32 code_size_b{};
+    u32 code_size;
+    u32 code_size_b;
    if (file.ReadBytes(&code_size, sizeof(u32)) != sizeof(u32) ||
        file.ReadBytes(&code_size_b, sizeof(u32)) != sizeof(u32)) {
        return false;
    }
-
    code.resize(code_size);
    code_b.resize(code_size_b);

-    if (file.ReadArray(code.data(), code_size) != code_size)
+    if (file.ReadArray(code.data(), code_size) != code_size) {
        return false;
-
+    }
    if (HasProgramA() && file.ReadArray(code_b.data(), code_size_b) != code_size_b) {
        return false;
    }
+
+    u8 is_texture_handler_size_known;
+    u32 texture_handler_size_value;
+    u32 num_keys;
+    u32 num_bound_samplers;
+    u32 num_bindless_samplers;
+    if (file.ReadArray(&unique_identifier, 1) != 1 || file.ReadArray(&bound_buffer, 1) != 1 ||
+        file.ReadArray(&is_texture_handler_size_known, 1) != 1 ||
+        file.ReadArray(&texture_handler_size_value, 1) != 1 ||
+        file.ReadArray(&graphics_info, 1) != 1 || file.ReadArray(&compute_info, 1) != 1 ||
+        file.ReadArray(&num_keys, 1) != 1 || file.ReadArray(&num_bound_samplers, 1) != 1 ||
+        file.ReadArray(&num_bindless_samplers, 1) != 1) {
+        return false;
+    }
+    if (is_texture_handler_size_known) {
+        texture_handler_size = texture_handler_size_value;
+    }
+
+    std::vector<ConstBufferKey> flat_keys(num_keys);
+    std::vector<BoundSamplerKey> flat_bound_samplers(num_bound_samplers);
+    std::vector<BindlessSamplerKey> flat_bindless_samplers(num_bindless_samplers);
+    if (file.ReadArray(flat_keys.data(), flat_keys.size()) != flat_keys.size() ||
+        file.ReadArray(flat_bound_samplers.data(), flat_bound_samplers.size()) !=
+            flat_bound_samplers.size() ||
+        file.ReadArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) !=
+            flat_bindless_samplers.size()) {
+        return false;
+    }
+    for (const auto& key : flat_keys) {
+        keys.insert({{key.cbuf, key.offset}, key.value});
+    }
+    for (const auto& key : flat_bound_samplers) {
+        bound_samplers.emplace(key.offset, key.sampler);
+    }
+    for (const auto& key : flat_bindless_samplers) {
+        bindless_samplers.insert({{key.cbuf, key.offset}, key.sampler});
+    }
+
    return true;
 }

-bool ShaderDiskCacheRaw::Save(FileUtil::IOFile& file) const {
-    if (file.WriteObject(unique_identifier) != 1 || file.WriteObject(static_cast<u32>(type)) != 1 ||
+bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const {
+    if (file.WriteObject(static_cast<u32>(type)) != 1 ||
        file.WriteObject(static_cast<u32>(code.size())) != 1 ||
        file.WriteObject(static_cast<u32>(code_b.size())) != 1) {
        return false;
    }
-
-    if (file.WriteArray(code.data(), code.size()) != code.size())
+    if (file.WriteArray(code.data(), code.size()) != code.size()) {
        return false;
-
+    }
    if (HasProgramA() && file.WriteArray(code_b.data(), code_b.size()) != code_b.size()) {
        return false;
    }
-    return true;
+
+    if (file.WriteObject(unique_identifier) != 1 || file.WriteObject(bound_buffer) != 1 ||
+        file.WriteObject(static_cast<u8>(texture_handler_size.has_value())) != 1 ||
+        file.WriteObject(texture_handler_size.value_or(0)) != 1 ||
+        file.WriteObject(graphics_info) != 1 || file.WriteObject(compute_info) != 1 ||
+        file.WriteObject(static_cast<u32>(keys.size())) != 1 ||
+        file.WriteObject(static_cast<u32>(bound_samplers.size())) != 1 ||
+        file.WriteObject(static_cast<u32>(bindless_samplers.size())) != 1) {
+        return false;
+    }
+
+    std::vector<ConstBufferKey> flat_keys;
+    flat_keys.reserve(keys.size());
+    for (const auto& [address, value] : keys) {
+        flat_keys.push_back(ConstBufferKey{address.first, address.second, value});
+    }
+
+    std::vector<BoundSamplerKey> flat_bound_samplers;
+    flat_bound_samplers.reserve(bound_samplers.size());
+    for (const auto& [address, sampler] : bound_samplers) {
+        flat_bound_samplers.push_back(BoundSamplerKey{address, sampler});
+    }
+
+    std::vector<BindlessSamplerKey> flat_bindless_samplers;
+    flat_bindless_samplers.reserve(bindless_samplers.size());
+    for (const auto& [address, sampler] : bindless_samplers) {
+        flat_bindless_samplers.push_back(
+            BindlessSamplerKey{address.first, address.second, sampler});
+    }
+
+    return file.WriteArray(flat_keys.data(), flat_keys.size()) == flat_keys.size() &&
+           file.WriteArray(flat_bound_samplers.data(), flat_bound_samplers.size()) ==
+               flat_bound_samplers.size() &&
+           file.WriteArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) ==
+               flat_bindless_samplers.size();
 }

 ShaderDiskCacheOpenGL::ShaderDiskCacheOpenGL(Core::System& system) : system{system} {}

 ShaderDiskCacheOpenGL::~ShaderDiskCacheOpenGL() = default;

-std::optional<std::pair<std::vector<ShaderDiskCacheRaw>, std::vector<ShaderDiskCacheUsage>>>
-ShaderDiskCacheOpenGL::LoadTransferable() {
+std::optional<std::vector<ShaderDiskCacheEntry>> ShaderDiskCacheOpenGL::LoadTransferable() {
    // Skip games without title id
    const bool has_title_id = system.CurrentProcess()->GetTitleID() != 0;
    if (!Settings::values.use_disk_shader_cache || !has_title_id) {
@@ -130,17 +185,14 @@ ShaderDiskCacheOpenGL::LoadTransferable() {

    FileUtil::IOFile file(GetTransferablePath(), "rb");
    if (!file.IsOpen()) {
-        LOG_INFO(Render_OpenGL, "No transferable shader cache found for game with title id={}",
-                 GetTitleID());
+        LOG_INFO(Render_OpenGL, "No transferable shader cache found");
        is_usable = true;
        return {};
    }

    u32 version{};
    if (file.ReadBytes(&version, sizeof(version)) != sizeof(version)) {
-        LOG_ERROR(Render_OpenGL,
-                  "Failed to get transferable cache version for title id={}, skipping",
-                  GetTitleID());
+        LOG_ERROR(Render_OpenGL, "Failed to get transferable cache version, skipping it");
        return {};
    }

@@ -158,105 +210,42 @@ ShaderDiskCacheOpenGL::LoadTransferable() {
    }

    // Version is valid, load the shaders
-    constexpr const char error_loading[] = "Failed to load transferable raw entry, skipping";
-    std::vector<ShaderDiskCacheRaw> raws;
-    std::vector<ShaderDiskCacheUsage> usages;
+    std::vector<ShaderDiskCacheEntry> entries;
    while (file.Tell() < file.GetSize()) {
-        TransferableEntryKind kind{};
-        if (file.ReadBytes(&kind, sizeof(u32)) != sizeof(u32)) {
-            LOG_ERROR(Render_OpenGL, "Failed to read transferable file, skipping");
-            return {};
-        }
-
-        switch (kind) {
-        case TransferableEntryKind::Raw: {
-            ShaderDiskCacheRaw entry;
-            if (!entry.Load(file)) {
-                LOG_ERROR(Render_OpenGL, error_loading);
-                return {};
-            }
-            transferable.insert({entry.GetUniqueIdentifier(), {}});
-            raws.push_back(std::move(entry));
-            break;
-        }
-        case TransferableEntryKind::Usage: {
-            ShaderDiskCacheUsage usage;
-
-            u32 num_keys{};
-            u32 num_bound_samplers{};
-            u32 num_bindless_samplers{};
-            if (file.ReadArray(&usage.unique_identifier, 1) != 1 ||
-                file.ReadArray(&usage.variant, 1) != 1 ||
-                file.ReadArray(&usage.bound_buffer, 1) != 1 || file.ReadArray(&num_keys, 1) != 1 ||
-                file.ReadArray(&num_bound_samplers, 1) != 1 ||
-                file.ReadArray(&num_bindless_samplers, 1) != 1) {
-                LOG_ERROR(Render_OpenGL, error_loading);
-                return {};
-            }
-
-            std::vector<ConstBufferKey> keys(num_keys);
-            std::vector<BoundSamplerKey> bound_samplers(num_bound_samplers);
-            std::vector<BindlessSamplerKey> bindless_samplers(num_bindless_samplers);
-            if (file.ReadArray(keys.data(), keys.size()) != keys.size() ||
-                file.ReadArray(bound_samplers.data(), bound_samplers.size()) !=
-                    bound_samplers.size() ||
-                file.ReadArray(bindless_samplers.data(), bindless_samplers.size()) !=
-                    bindless_samplers.size()) {
-                LOG_ERROR(Render_OpenGL, error_loading);
-                return {};
-            }
-            for (const auto& key : keys) {
-                usage.keys.insert({{key.cbuf, key.offset}, key.value});
-            }
-            for (const auto& key : bound_samplers) {
-                usage.bound_samplers.emplace(key.offset, key.sampler);
-            }
-            for (const auto& key : bindless_samplers) {
-                usage.bindless_samplers.insert({{key.cbuf, key.offset}, key.sampler});
-            }
-
-            usages.push_back(std::move(usage));
-            break;
-        }
-        default:
-            LOG_ERROR(Render_OpenGL, "Unknown transferable shader cache entry kind={}, skipping",
-                      static_cast<u32>(kind));
+        ShaderDiskCacheEntry& entry = entries.emplace_back();
+        if (!entry.Load(file)) {
+            LOG_ERROR(Render_OpenGL, "Failed to load transferable raw entry, skipping");
            return {};
        }
    }

    is_usable = true;
-    return {{std::move(raws), std::move(usages)}};
+    return {std::move(entries)};
 }

-std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>
-ShaderDiskCacheOpenGL::LoadPrecompiled() {
+std::vector<ShaderDiskCachePrecompiled> ShaderDiskCacheOpenGL::LoadPrecompiled() {
    if (!is_usable) {
        return {};
    }

-    std::string path = GetPrecompiledPath();
-    FileUtil::IOFile file(path, "rb");
+    FileUtil::IOFile file(GetPrecompiledPath(), "rb");
    if (!file.IsOpen()) {
-        LOG_INFO(Render_OpenGL, "No precompiled shader cache found for game with title id={}",
-                 GetTitleID());
+        LOG_INFO(Render_OpenGL, "No precompiled shader cache found");
        return {};
    }

-    const auto result = LoadPrecompiledFile(file);
-    if (!result) {
-        LOG_INFO(Render_OpenGL,
-                 "Failed to load precompiled cache for game with title id={}, removing",
-                 GetTitleID());
-        file.Close();
-        InvalidatePrecompiled();
-        return {};
+    if (const auto result = LoadPrecompiledFile(file)) {
+        return *result;
    }
-    return *result;
+
+    LOG_INFO(Render_OpenGL, "Failed to load precompiled cache");
+    file.Close();
+    InvalidatePrecompiled();
+    return {};
 }

-std::optional<std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>>
-ShaderDiskCacheOpenGL::LoadPrecompiledFile(FileUtil::IOFile& file) {
+std::optional<std::vector<ShaderDiskCachePrecompiled>> ShaderDiskCacheOpenGL::LoadPrecompiledFile(
+    FileUtil::IOFile& file) {
    // Read compressed file from disk and decompress to virtual precompiled cache file
    std::vector<u8> compressed(file.GetSize());
    file.ReadBytes(compressed.data(), compressed.size());
@@ -275,58 +264,22 @@ ShaderDiskCacheOpenGL::LoadPrecompiledFile(FileUtil::IOFile& file) {
        return {};
    }

-    ShaderDumpsMap dumps;
+    std::vector<ShaderDiskCachePrecompiled> entries;
    while (precompiled_cache_virtual_file_offset < precompiled_cache_virtual_file.GetSize()) {
-        u32 num_keys{};
-        u32 num_bound_samplers{};
-        u32 num_bindless_samplers{};
-        ShaderDiskCacheUsage usage;
-        if (!LoadObjectFromPrecompiled(usage.unique_identifier) ||
-            !LoadObjectFromPrecompiled(usage.variant) ||
-            !LoadObjectFromPrecompiled(usage.bound_buffer) ||
-            !LoadObjectFromPrecompiled(num_keys) ||
-            !LoadObjectFromPrecompiled(num_bound_samplers) ||
-            !LoadObjectFromPrecompiled(num_bindless_samplers)) {
-            return {};
-        }
-        std::vector<ConstBufferKey> keys(num_keys);
-        std::vector<BoundSamplerKey> bound_samplers(num_bound_samplers);
-        std::vector<BindlessSamplerKey> bindless_samplers(num_bindless_samplers);
-        if (!LoadArrayFromPrecompiled(keys.data(), keys.size()) ||
-            !LoadArrayFromPrecompiled(bound_samplers.data(), bound_samplers.size()) !=
-                bound_samplers.size() ||
-            !LoadArrayFromPrecompiled(bindless_samplers.data(), bindless_samplers.size()) !=
-                bindless_samplers.size()) {
-            return {};
-        }
-        for (const auto& key : keys) {
-            usage.keys.insert({{key.cbuf, key.offset}, key.value});
-        }
-        for (const auto& key : bound_samplers) {
-            usage.bound_samplers.emplace(key.offset, key.sampler);
-        }
-        for (const auto& key : bindless_samplers) {
-            usage.bindless_samplers.insert({{key.cbuf, key.offset}, key.sampler});
-        }
-
-        ShaderDiskCacheDump dump;
-        if (!LoadObjectFromPrecompiled(dump.binary_format)) {
+        u32 binary_size;
+        auto& entry = entries.emplace_back();
+        if (!LoadObjectFromPrecompiled(entry.unique_identifier) ||
+            !LoadObjectFromPrecompiled(entry.binary_format) ||
+            !LoadObjectFromPrecompiled(binary_size)) {
            return {};
        }

-        u32 binary_length{};
-        if (!LoadObjectFromPrecompiled(binary_length)) {
+        entry.binary.resize(binary_size);
+        if (!LoadArrayFromPrecompiled(entry.binary.data(), entry.binary.size())) {
            return {};
        }
-
-        dump.binary.resize(binary_length);
-        if (!LoadArrayFromPrecompiled(dump.binary.data(), dump.binary.size())) {
-            return {};
-        }
-
-        dumps.emplace(std::move(usage), dump);
    }
-    return dumps;
+    return entries;
 }

 void ShaderDiskCacheOpenGL::InvalidateTransferable() {
@@ -346,13 +299,13 @@ void ShaderDiskCacheOpenGL::InvalidatePrecompiled() {
    }
 }

-void ShaderDiskCacheOpenGL::SaveRaw(const ShaderDiskCacheRaw& entry) {
+void ShaderDiskCacheOpenGL::SaveEntry(const ShaderDiskCacheEntry& entry) {
    if (!is_usable) {
        return;
    }

-    const u64 id = entry.GetUniqueIdentifier();
-    if (transferable.find(id) != transferable.end()) {
+    const u64 id = entry.unique_identifier;
+    if (stored_transferable.find(id) != stored_transferable.end()) {
        // The shader already exists
        return;
    }
@@ -361,71 +314,17 @@ void ShaderDiskCacheOpenGL::SaveRaw(const ShaderDiskCacheRaw& entry) {
    if (!file.IsOpen()) {
        return;
    }
-    if (file.WriteObject(TransferableEntryKind::Raw) != 1 || !entry.Save(file)) {
+    if (!entry.Save(file)) {
        LOG_ERROR(Render_OpenGL, "Failed to save raw transferable cache entry, removing");
        file.Close();
        InvalidateTransferable();
        return;
    }
-    transferable.insert({id, {}});
+
+    stored_transferable.insert(id);
 }

-void ShaderDiskCacheOpenGL::SaveUsage(const ShaderDiskCacheUsage& usage) {
-    if (!is_usable) {
-        return;
-    }
-
-    const auto it = transferable.find(usage.unique_identifier);
-    ASSERT_MSG(it != transferable.end(), "Saving shader usage without storing raw previously");
-
-    auto& usages{it->second};
-    if (usages.find(usage) != usages.end()) {
-        // Skip this variant since the shader is already stored.
-        return;
-    }
-    usages.insert(usage);
-
-    FileUtil::IOFile file = AppendTransferableFile();
-    if (!file.IsOpen())
-        return;
-    const auto Close = [&] {
-        LOG_ERROR(Render_OpenGL, "Failed to save usage transferable cache entry, removing");
-        file.Close();
-        InvalidateTransferable();
-    };
-
-    if (file.WriteObject(TransferableEntryKind::Usage) != 1 ||
-        file.WriteObject(usage.unique_identifier) != 1 || file.WriteObject(usage.variant) != 1 ||
-        file.WriteObject(usage.bound_buffer) != 1 ||
-        file.WriteObject(static_cast<u32>(usage.keys.size())) != 1 ||
-        file.WriteObject(static_cast<u32>(usage.bound_samplers.size())) != 1 ||
-        file.WriteObject(static_cast<u32>(usage.bindless_samplers.size())) != 1) {
-        Close();
-        return;
-    }
-    for (const auto& [pair, value] : usage.keys) {
-        const auto [cbuf, offset] = pair;
-        if (file.WriteObject(ConstBufferKey{cbuf, offset, value}) != 1) {
-            Close();
-            return;
-        }
-    }
-    for (const auto& [offset, sampler] : usage.bound_samplers) {
-        if (file.WriteObject(BoundSamplerKey{offset, sampler}) != 1) {
-            Close();
-            return;
-        }
-    }
-    for (const auto& [pair, sampler] : usage.bindless_samplers) {
-        const auto [cbuf, offset] = pair;
-        if (file.WriteObject(BindlessSamplerKey{cbuf, offset, sampler}) != 1) {
-            Close();
-            return;
-        }
-    }
-}
-
-void ShaderDiskCacheOpenGL::SaveDump(const ShaderDiskCacheUsage& usage, GLuint program) {
+void ShaderDiskCacheOpenGL::SavePrecompiled(u64 unique_identifier, GLuint program) {
    if (!is_usable) {
        return;
    }
@@ -437,51 +336,19 @@ void ShaderDiskCacheOpenGL::SaveDump(const ShaderDiskCacheUsage& usage, GLuint p
        SavePrecompiledHeaderToVirtualPrecompiledCache();
    }

-    GLint binary_length{};
+    GLint binary_length;
    glGetProgramiv(program, GL_PROGRAM_BINARY_LENGTH, &binary_length);

-    GLenum binary_format{};
+    GLenum binary_format;
    std::vector<u8> binary(binary_length);
    glGetProgramBinary(program, binary_length, nullptr, &binary_format, binary.data());

-    const auto Close = [&] {
-        LOG_ERROR(Render_OpenGL, "Failed to save binary program file in shader={:016X}, removing",
-                  usage.unique_identifier);
-        InvalidatePrecompiled();
-    };
-
-    if (!SaveObjectToPrecompiled(usage.unique_identifier) ||
-        !SaveObjectToPrecompiled(usage.variant) || !SaveObjectToPrecompiled(usage.bound_buffer) ||
-        !SaveObjectToPrecompiled(static_cast<u32>(usage.keys.size())) ||
-        !SaveObjectToPrecompiled(static_cast<u32>(usage.bound_samplers.size())) ||
-        !SaveObjectToPrecompiled(static_cast<u32>(usage.bindless_samplers.size()))) {
-        Close();
-        return;
-    }
-    for (const auto& [pair, value] : usage.keys) {
-        const auto [cbuf, offset] = pair;
-        if (SaveObjectToPrecompiled(ConstBufferKey{cbuf, offset, value}) != 1) {
-            Close();
-            return;
-        }
-    }
-    for (const auto& [offset, sampler] : usage.bound_samplers) {
-        if (SaveObjectToPrecompiled(BoundSamplerKey{offset, sampler}) != 1) {
-            Close();
-            return;
-        }
-    }
-    for (const auto& [pair, sampler] : usage.bindless_samplers) {
-        const auto [cbuf, offset] = pair;
-        if (SaveObjectToPrecompiled(BindlessSamplerKey{cbuf, offset, sampler}) != 1) {
-            Close();
-            return;
-        }
-    }
-    if (!SaveObjectToPrecompiled(static_cast<u32>(binary_format)) ||
-        !SaveObjectToPrecompiled(static_cast<u32>(binary_length)) ||
+    if (!SaveObjectToPrecompiled(unique_identifier) || !SaveObjectToPrecompiled(binary_format) ||
+        !SaveObjectToPrecompiled(static_cast<u32>(binary.size())) ||
        !SaveArrayToPrecompiled(binary.data(), binary.size())) {
-        Close();
+        LOG_ERROR(Render_OpenGL, "Failed to save binary program file in shader={:016X}, removing",
+                  unique_identifier);
+        InvalidatePrecompiled();
    }
 }

@@ -534,7 +401,6 @@ void ShaderDiskCacheOpenGL::SaveVirtualPrecompiledFile() {
    if (file.WriteBytes(compressed.data(), compressed.size()) != compressed.size()) {
        LOG_ERROR(Render_OpenGL, "Failed to write precompiled cache version in path={}",
                  precompiled_path);
-        return;
    }
 }

--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h
@@ -19,8 +19,7 @@
 #include "common/common_types.h"
 #include "core/file_sys/vfs_vector.h"
 #include "video_core/engines/shader_type.h"
-#include "video_core/renderer_opengl/gl_shader_gen.h"
-#include "video_core/shader/const_buffer_locker.h"
+#include "video_core/shader/registry.h"

 namespace Core {
 class System;
@@ -32,139 +31,39 @@ class IOFile;

 namespace OpenGL {

-struct ShaderDiskCacheUsage;
-struct ShaderDiskCacheDump;
-
 using ProgramCode = std::vector<u64>;
-using ShaderDumpsMap = std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>;

-/// Describes the different variants a program can be compiled with.
-struct ProgramVariant final {
-    ProgramVariant() = default;
-
-    /// Graphics constructor.
-    explicit constexpr ProgramVariant(GLenum primitive_mode) noexcept
-        : primitive_mode{primitive_mode} {}
-
-    /// Compute constructor.
-    explicit constexpr ProgramVariant(u32 block_x, u32 block_y, u32 block_z, u32 shared_memory_size,
-                                      u32 local_memory_size) noexcept
-        : block_x{block_x}, block_y{static_cast<u16>(block_y)}, block_z{static_cast<u16>(block_z)},
-          shared_memory_size{shared_memory_size}, local_memory_size{local_memory_size} {}
-
-    // Graphics specific parameters.
-    GLenum primitive_mode{};
-
-    // Compute specific parameters.
-    u32 block_x{};
-    u16 block_y{};
-    u16 block_z{};
-    u32 shared_memory_size{};
-    u32 local_memory_size{};
-
-    bool operator==(const ProgramVariant& rhs) const noexcept {
-        return std::tie(primitive_mode, block_x, block_y, block_z, shared_memory_size,
-                        local_memory_size) == std::tie(rhs.primitive_mode, rhs.block_x, rhs.block_y,
-                                                       rhs.block_z, rhs.shared_memory_size,
-                                                       rhs.local_memory_size);
-    }
-
-    bool operator!=(const ProgramVariant& rhs) const noexcept {
-        return !operator==(rhs);
-    }
-};
-static_assert(std::is_trivially_copyable_v<ProgramVariant>);
-
-/// Describes how a shader is used.
-struct ShaderDiskCacheUsage {
-    u64 unique_identifier{};
-    ProgramVariant variant;
-    u32 bound_buffer{};
-    VideoCommon::Shader::KeyMap keys;
-    VideoCommon::Shader::BoundSamplerMap bound_samplers;
-    VideoCommon::Shader::BindlessSamplerMap bindless_samplers;
-
-    bool operator==(const ShaderDiskCacheUsage& rhs) const {
-        return std::tie(unique_identifier, variant, keys, bound_samplers, bindless_samplers) ==
-               std::tie(rhs.unique_identifier, rhs.variant, rhs.keys, rhs.bound_samplers,
-                        rhs.bindless_samplers);
-    }
-
-    bool operator!=(const ShaderDiskCacheUsage& rhs) const {
-        return !operator==(rhs);
-    }
-};
-
-} // namespace OpenGL
-
-namespace std {
-
-template <>
-struct hash<OpenGL::ProgramVariant> {
-    std::size_t operator()(const OpenGL::ProgramVariant& variant) const noexcept {
-        return (static_cast<std::size_t>(variant.primitive_mode) << 6) ^
-               static_cast<std::size_t>(variant.block_x) ^
-               (static_cast<std::size_t>(variant.block_y) << 32) ^
-               (static_cast<std::size_t>(variant.block_z) << 48) ^
-               (static_cast<std::size_t>(variant.shared_memory_size) << 16) ^
-               (static_cast<std::size_t>(variant.local_memory_size) << 36);
-    }
-};
-
-template <>
-struct hash<OpenGL::ShaderDiskCacheUsage> {
-    std::size_t operator()(const OpenGL::ShaderDiskCacheUsage& usage) const noexcept {
-        return static_cast<std::size_t>(usage.unique_identifier) ^
-               std::hash<OpenGL::ProgramVariant>{}(usage.variant);
-    }
-};
-
-} // namespace std
-
-namespace OpenGL {
-
-/// Describes a shader how it's used by the guest GPU
-class ShaderDiskCacheRaw {
-public:
-    explicit ShaderDiskCacheRaw(u64 unique_identifier, Tegra::Engines::ShaderType type,
-                                ProgramCode code, ProgramCode code_b = {});
-    ShaderDiskCacheRaw();
-    ~ShaderDiskCacheRaw();
+/// Describes a shader and how it's used by the guest GPU
+struct ShaderDiskCacheEntry {
+    ShaderDiskCacheEntry();
+    ~ShaderDiskCacheEntry();

    bool Load(FileUtil::IOFile& file);

    bool Save(FileUtil::IOFile& file) const;

-    u64 GetUniqueIdentifier() const {
-        return unique_identifier;
-    }
-
    bool HasProgramA() const {
        return !code.empty() && !code_b.empty();
    }

-    Tegra::Engines::ShaderType GetType() const {
-        return type;
-    }
-
-    const ProgramCode& GetCode() const {
-        return code;
-    }
-
-    const ProgramCode& GetCodeB() const {
-        return code_b;
-    }
-
-private:
-    u64 unique_identifier{};
    Tegra::Engines::ShaderType type{};
    ProgramCode code;
    ProgramCode code_b;
+
+    u64 unique_identifier = 0;
+    std::optional<u32> texture_handler_size;
+    u32 bound_buffer = 0;
+    VideoCommon::Shader::GraphicsInfo graphics_info;
+    VideoCommon::Shader::ComputeInfo compute_info;
+    VideoCommon::Shader::KeyMap keys;
+    VideoCommon::Shader::BoundSamplerMap bound_samplers;
+    VideoCommon::Shader::BindlessSamplerMap bindless_samplers;
 };

 /// Contains an OpenGL dumped binary program
-struct ShaderDiskCacheDump {
-    GLenum binary_format{};
+struct ShaderDiskCachePrecompiled {
+    u64 unique_identifier = 0;
+    GLenum binary_format = 0;
    std::vector<u8> binary;
 };

@@ -174,11 +73,10 @@ public:
    ~ShaderDiskCacheOpenGL();

    /// Loads transferable cache. If file has a old version or on failure, it deletes the file.
-    std::optional<std::pair<std::vector<ShaderDiskCacheRaw>, std::vector<ShaderDiskCacheUsage>>>
-    LoadTransferable();
+    std::optional<std::vector<ShaderDiskCacheEntry>> LoadTransferable();

    /// Loads current game's precompiled cache. Invalidates on failure.
-    std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump> LoadPrecompiled();
+    std::vector<ShaderDiskCachePrecompiled> LoadPrecompiled();

    /// Removes the transferable (and precompiled) cache file.
    void InvalidateTransferable();
@@ -187,21 +85,18 @@ public:
    void InvalidatePrecompiled();

    /// Saves a raw dump to the transferable file. Checks for collisions.
-    void SaveRaw(const ShaderDiskCacheRaw& entry);
-
-    /// Saves shader usage to the transferable file. Does not check for collisions.
-    void SaveUsage(const ShaderDiskCacheUsage& usage);
+    void SaveEntry(const ShaderDiskCacheEntry& entry);

    /// Saves a dump entry to the precompiled file. Does not check for collisions.
-    void SaveDump(const ShaderDiskCacheUsage& usage, GLuint program);
+    void SavePrecompiled(u64 unique_identifier, GLuint program);

    /// Serializes virtual precompiled shader cache file to real file
    void SaveVirtualPrecompiledFile();

 private:
    /// Loads the transferable cache. Returns empty on failure.
-    std::optional<std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>>
-    LoadPrecompiledFile(FileUtil::IOFile& file);
+    std::optional<std::vector<ShaderDiskCachePrecompiled>> LoadPrecompiledFile(
+        FileUtil::IOFile& file);

    /// Opens current game's transferable file and write it's header if it doesn't exist
    FileUtil::IOFile AppendTransferableFile() const;
@@ -270,7 +165,7 @@ private:
    std::size_t precompiled_cache_virtual_file_offset = 0;

    // Stored transferable shaders
-    std::unordered_map<u64, std::unordered_set<ShaderDiskCacheUsage>> transferable;
+    std::unordered_set<u64> stored_transferable;

    // The cache has been loaded at boot
    bool is_usable{};
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -1,109 +0,0 @@
-// Copyright 2018 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#include <string>
-
-#include <fmt/format.h>
-
-#include "video_core/engines/maxwell_3d.h"
-#include "video_core/engines/shader_type.h"
-#include "video_core/renderer_opengl/gl_device.h"
-#include "video_core/renderer_opengl/gl_shader_decompiler.h"
-#include "video_core/renderer_opengl/gl_shader_gen.h"
-#include "video_core/shader/shader_ir.h"
-
-namespace OpenGL::GLShader {
-
-using Tegra::Engines::Maxwell3D;
-using Tegra::Engines::ShaderType;
-using VideoCommon::Shader::CompileDepth;
-using VideoCommon::Shader::CompilerSettings;
-using VideoCommon::Shader::ProgramCode;
-using VideoCommon::Shader::ShaderIR;
-
-std::string GenerateVertexShader(const Device& device, const ShaderIR& ir, const ShaderIR* ir_b) {
-    std::string out = GetCommonDeclarations();
-    out += fmt::format(R"(
-layout (std140, binding = {}) uniform vs_config {{
-    float y_direction;
-}};
-
-)",
-                       EmulationUniformBlockBinding);
-    out += Decompile(device, ir, ShaderType::Vertex, "vertex");
-    if (ir_b) {
-        out += Decompile(device, *ir_b, ShaderType::Vertex, "vertex_b");
-    }
-
-    out += R"(
-void main() {
-    gl_Position = vec4(0.0f, 0.0f, 0.0f, 1.0f);
-    execute_vertex();
-)";
-    if (ir_b) {
-        out += "    execute_vertex_b();";
-    }
-    out += "}\n";
-    return out;
-}
-
-std::string GenerateGeometryShader(const Device& device, const ShaderIR& ir) {
-    std::string out = GetCommonDeclarations();
-    out += fmt::format(R"(
-layout (std140, binding = {}) uniform gs_config {{
-    float y_direction;
-}};
-
-)",
-                       EmulationUniformBlockBinding);
-    out += Decompile(device, ir, ShaderType::Geometry, "geometry");
-
-    out += R"(
-void main() {
-    execute_geometry();
-}
-)";
-    return out;
-}
-
-std::string GenerateFragmentShader(const Device& device, const ShaderIR& ir) {
-    std::string out = GetCommonDeclarations();
-    out += fmt::format(R"(
-layout (location = 0) out vec4 FragColor0;
-layout (location = 1) out vec4 FragColor1;
-layout (location = 2) out vec4 FragColor2;
-layout (location = 3) out vec4 FragColor3;
-layout (location = 4) out vec4 FragColor4;
-layout (location = 5) out vec4 FragColor5;
-layout (location = 6) out vec4 FragColor6;
-layout (location = 7) out vec4 FragColor7;
-
-layout (std140, binding = {}) uniform fs_config {{
-    float y_direction;
-}};
-
-)",
-                       EmulationUniformBlockBinding);
-    out += Decompile(device, ir, ShaderType::Fragment, "fragment");
-
-    out += R"(
-void main() {
-    execute_fragment();
-}
-)";
-    return out;
-}
-
-std::string GenerateComputeShader(const Device& device, const ShaderIR& ir) {
-    std::string out = GetCommonDeclarations();
-    out += Decompile(device, ir, ShaderType::Compute, "compute");
-    out += R"(
-void main() {
-    execute_compute();
-}
-)";
-    return out;
-}
-
-} // namespace OpenGL::GLShader
--- a/src/video_core/renderer_opengl/gl_shader_gen.h
+++ b/src/video_core/renderer_opengl/gl_shader_gen.h
@@ -1,34 +0,0 @@
-// Copyright 2018 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-#include <vector>
-
-#include "common/common_types.h"
-#include "video_core/renderer_opengl/gl_shader_decompiler.h"
-#include "video_core/shader/shader_ir.h"
-
-namespace OpenGL {
-class Device;
-}
-
-namespace OpenGL::GLShader {
-
-using VideoCommon::Shader::ProgramCode;
-using VideoCommon::Shader::ShaderIR;
-
-/// Generates the GLSL vertex shader program source code for the given VS program
-std::string GenerateVertexShader(const Device& device, const ShaderIR& ir, const ShaderIR* ir_b);
-
-/// Generates the GLSL geometry shader program source code for the given GS program
-std::string GenerateGeometryShader(const Device& device, const ShaderIR& ir);
-
-/// Generates the GLSL fragment shader program source code for the given FS program
-std::string GenerateFragmentShader(const Device& device, const ShaderIR& ir);
-
-/// Generates the GLSL compute shader program source code for the given CS program
-std::string GenerateComputeShader(const Device& device, const ShaderIR& ir);
-
-} // namespace OpenGL::GLShader
--- a/src/video_core/renderer_opengl/gl_state_tracker.cpp
+++ b/src/video_core/renderer_opengl/gl_state_tracker.cpp
@@ -94,6 +94,15 @@ void SetupDirtyShaders(Tables& tables) {
              Shaders);
 }

+void SetupDirtyPolygonModes(Tables& tables) {
+    tables[0][OFF(polygon_mode_front)] = PolygonModeFront;
+    tables[0][OFF(polygon_mode_back)] = PolygonModeBack;
+
+    tables[1][OFF(polygon_mode_front)] = PolygonModes;
+    tables[1][OFF(polygon_mode_back)] = PolygonModes;
+    tables[0][OFF(fill_rectangle)] = PolygonModes;
+}
+
 void SetupDirtyDepthTest(Tables& tables) {
    auto& table = tables[0];
    table[OFF(depth_test_enable)] = DepthTest;
@@ -211,6 +220,7 @@ void StateTracker::Initialize() {
    SetupDirtyVertexArrays(tables);
    SetupDirtyVertexFormat(tables);
    SetupDirtyShaders(tables);
+    SetupDirtyPolygonModes(tables);
    SetupDirtyDepthTest(tables);
    SetupDirtyStencilTest(tables);
    SetupDirtyAlphaTest(tables);
--- a/src/video_core/renderer_opengl/gl_state_tracker.h
+++ b/src/video_core/renderer_opengl/gl_state_tracker.h
@@ -59,6 +59,10 @@ enum : u8 {
    Shaders,
    ClipDistances,

+    PolygonModes,
+    PolygonModeFront,
+    PolygonModeBack,
+
    ColorMask,
    FrontFace,
    CullTest,
@@ -111,6 +115,13 @@ public:
        flags[OpenGL::Dirty::VertexInstance0 + 1] = true;
    }

+    void NotifyPolygonModes() {
+        auto& flags = system.GPU().Maxwell3D().dirty.flags;
+        flags[OpenGL::Dirty::PolygonModes] = true;
+        flags[OpenGL::Dirty::PolygonModeFront] = true;
+        flags[OpenGL::Dirty::PolygonModeBack] = true;
+    }
+
    void NotifyViewport0() {
        auto& flags = system.GPU().Maxwell3D().dirty.flags;
        flags[OpenGL::Dirty::Viewports] = true;
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -53,6 +53,7 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format
    {GL_R8UI, GL_RED_INTEGER, GL_UNSIGNED_BYTE, false},                             // R8UI
    {GL_RGBA16F, GL_RGBA, GL_HALF_FLOAT, false},                                    // RGBA16F
    {GL_RGBA16, GL_RGBA, GL_UNSIGNED_SHORT, false},                                 // RGBA16U
+    {GL_RGBA16_SNORM, GL_RGBA, GL_SHORT, false},                                    // RGBA16S
    {GL_RGBA16UI, GL_RGBA_INTEGER, GL_UNSIGNED_SHORT, false},                       // RGBA16UI
    {GL_R11F_G11F_B10F, GL_RGB, GL_UNSIGNED_INT_10F_11F_11F_REV, false},            // R11FG11FB10F
    {GL_RGBA32UI, GL_RGBA_INTEGER, GL_UNSIGNED_INT, false},                         // RGBA32UI
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -488,5 +488,18 @@ inline GLenum LogicOp(Maxwell::LogicOperation operation) {
    return GL_COPY;
 }

+inline GLenum PolygonMode(Maxwell::PolygonMode polygon_mode) {
+    switch (polygon_mode) {
+    case Maxwell::PolygonMode::Point:
+        return GL_POINT;
+    case Maxwell::PolygonMode::Line:
+        return GL_LINE;
+    case Maxwell::PolygonMode::Fill:
+        return GL_FILL;
+    }
+    UNREACHABLE_MSG("Invalid polygon mode={}", static_cast<int>(polygon_mode));
+    return GL_FILL;
+}
+
 } // namespace MaxwellToGL
 } // namespace OpenGL
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -5,8 +5,11 @@
 #include <algorithm>
 #include <cstddef>
 #include <cstdlib>
+#include <cstring>
 #include <memory>
+
 #include <glad/glad.h>
+
 #include "common/assert.h"
 #include "common/logging/log.h"
 #include "common/microprofile.h"
@@ -25,6 +28,8 @@

 namespace OpenGL {

+namespace {
+
 // If the size of this is too small, it ends up creating a soft cap on FPS as the renderer will have
 // to wait on available presentation frames.
 constexpr std::size_t SWAP_CHAIN_SIZE = 3;
@@ -41,124 +46,6 @@ struct Frame {
    bool is_srgb{};                   /// Framebuffer is sRGB or RGB
 };

-/**
- * For smooth Vsync rendering, we want to always present the latest frame that the core generates,
- * but also make sure that rendering happens at the pace that the frontend dictates. This is a
- * helper class that the renderer uses to sync frames between the render thread and the presentation
- * thread
- */
-class FrameMailbox {
-public:
-    std::mutex swap_chain_lock;
-    std::condition_variable present_cv;
-    std::array<Frame, SWAP_CHAIN_SIZE> swap_chain{};
-    std::queue<Frame*> free_queue;
-    std::deque<Frame*> present_queue;
-    Frame* previous_frame{};
-
-    FrameMailbox() {
-        for (auto& frame : swap_chain) {
-            free_queue.push(&frame);
-        }
-    }
-
-    ~FrameMailbox() {
-        // lock the mutex and clear out the present and free_queues and notify any people who are
-        // blocked to prevent deadlock on shutdown
-        std::scoped_lock lock{swap_chain_lock};
-        std::queue<Frame*>().swap(free_queue);
-        present_queue.clear();
-        present_cv.notify_all();
-    }
-
-    void ReloadPresentFrame(Frame* frame, u32 height, u32 width) {
-        frame->present.Release();
-        frame->present.Create();
-        GLint previous_draw_fbo{};
-        glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &previous_draw_fbo);
-        glBindFramebuffer(GL_FRAMEBUFFER, frame->present.handle);
-        glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER,
-                                  frame->color.handle);
-        if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) {
-            LOG_CRITICAL(Render_OpenGL, "Failed to recreate present FBO!");
-        }
-        glBindFramebuffer(GL_DRAW_FRAMEBUFFER, previous_draw_fbo);
-        frame->color_reloaded = false;
-    }
-
-    void ReloadRenderFrame(Frame* frame, u32 width, u32 height) {
-        // Recreate the color texture attachment
-        frame->color.Release();
-        frame->color.Create();
-        const GLenum internal_format = frame->is_srgb ? GL_SRGB8 : GL_RGB8;
-        glNamedRenderbufferStorage(frame->color.handle, internal_format, width, height);
-
-        // Recreate the FBO for the render target
-        frame->render.Release();
-        frame->render.Create();
-        glBindFramebuffer(GL_FRAMEBUFFER, frame->render.handle);
-        glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER,
-                                  frame->color.handle);
-        if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) {
-            LOG_CRITICAL(Render_OpenGL, "Failed to recreate render FBO!");
-        }
-
-        frame->width = width;
-        frame->height = height;
-        frame->color_reloaded = true;
-    }
-
-    Frame* GetRenderFrame() {
-        std::unique_lock lock{swap_chain_lock};
-
-        // If theres no free frames, we will reuse the oldest render frame
-        if (free_queue.empty()) {
-            auto frame = present_queue.back();
-            present_queue.pop_back();
-            return frame;
-        }
-
-        Frame* frame = free_queue.front();
-        free_queue.pop();
-        return frame;
-    }
-
-    void ReleaseRenderFrame(Frame* frame) {
-        std::unique_lock lock{swap_chain_lock};
-        present_queue.push_front(frame);
-        present_cv.notify_one();
-    }
-
-    Frame* TryGetPresentFrame(int timeout_ms) {
-        std::unique_lock lock{swap_chain_lock};
-        // wait for new entries in the present_queue
-        present_cv.wait_for(lock, std::chrono::milliseconds(timeout_ms),
-                            [&] { return !present_queue.empty(); });
-        if (present_queue.empty()) {
-            // timed out waiting for a frame to draw so return the previous frame
-            return previous_frame;
-        }
-
-        // free the previous frame and add it back to the free queue
-        if (previous_frame) {
-            free_queue.push(previous_frame);
-        }
-
-        // the newest entries are pushed to the front of the queue
-        Frame* frame = present_queue.front();
-        present_queue.pop_front();
-        // remove all old entries from the present queue and move them back to the free_queue
-        for (auto f : present_queue) {
-            free_queue.push(f);
-        }
-        present_queue.clear();
-        previous_frame = frame;
-        return frame;
-    }
-};
-
-namespace {
-
 constexpr char VERTEX_SHADER[] = R"(
 #version 430 core

@@ -211,6 +98,24 @@ struct ScreenRectVertex {
    std::array<GLfloat, 2> tex_coord;
 };

+/// Returns true if any debug tool is attached
+bool HasDebugTool() {
+    const bool nsight = std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED");
+    if (nsight) {
+        return true;
+    }
+
+    GLint num_extensions;
+    glGetIntegerv(GL_NUM_EXTENSIONS, &num_extensions);
+    for (GLuint index = 0; index < static_cast<GLuint>(num_extensions); ++index) {
+        const auto name = reinterpret_cast<const char*>(glGetStringi(GL_EXTENSIONS, index));
+        if (!std::strcmp(name, "GL_EXT_debug_tool")) {
+            return true;
+        }
+    }
+    return false;
+}
+
 /**
 * Defines a 1:1 pixel ortographic projection matrix with (0,0) on the top-left
 * corner and (width, height) on the lower-bottom.
@@ -294,6 +199,153 @@ void APIENTRY DebugHandler(GLenum source, GLenum type, GLuint id, GLenum severit

 } // Anonymous namespace

+/**
+ * For smooth Vsync rendering, we want to always present the latest frame that the core generates,
+ * but also make sure that rendering happens at the pace that the frontend dictates. This is a
+ * helper class that the renderer uses to sync frames between the render thread and the presentation
+ * thread
+ */
+class FrameMailbox {
+public:
+    std::mutex swap_chain_lock;
+    std::condition_variable present_cv;
+    std::array<Frame, SWAP_CHAIN_SIZE> swap_chain{};
+    std::queue<Frame*> free_queue;
+    std::deque<Frame*> present_queue;
+    Frame* previous_frame{};
+
+    FrameMailbox() : has_debug_tool{HasDebugTool()} {
+        for (auto& frame : swap_chain) {
+            free_queue.push(&frame);
+        }
+    }
+
+    ~FrameMailbox() {
+        // lock the mutex and clear out the present and free_queues and notify any people who are
+        // blocked to prevent deadlock on shutdown
+        std::scoped_lock lock{swap_chain_lock};
+        std::queue<Frame*>().swap(free_queue);
+        present_queue.clear();
+        present_cv.notify_all();
+    }
+
+    void ReloadPresentFrame(Frame* frame, u32 height, u32 width) {
+        frame->present.Release();
+        frame->present.Create();
+        GLint previous_draw_fbo{};
+        glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &previous_draw_fbo);
+        glBindFramebuffer(GL_FRAMEBUFFER, frame->present.handle);
+        glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER,
+                                  frame->color.handle);
+        if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) {
+            LOG_CRITICAL(Render_OpenGL, "Failed to recreate present FBO!");
+        }
+        glBindFramebuffer(GL_DRAW_FRAMEBUFFER, previous_draw_fbo);
+        frame->color_reloaded = false;
+    }
+
+    void ReloadRenderFrame(Frame* frame, u32 width, u32 height) {
+        // Recreate the color texture attachment
+        frame->color.Release();
+        frame->color.Create();
+        const GLenum internal_format = frame->is_srgb ? GL_SRGB8 : GL_RGB8;
+        glNamedRenderbufferStorage(frame->color.handle, internal_format, width, height);
+
+        // Recreate the FBO for the render target
+        frame->render.Release();
+        frame->render.Create();
+        glBindFramebuffer(GL_FRAMEBUFFER, frame->render.handle);
+        glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER,
+                                  frame->color.handle);
+        if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) {
+            LOG_CRITICAL(Render_OpenGL, "Failed to recreate render FBO!");
+        }
+
+        frame->width = width;
+        frame->height = height;
+        frame->color_reloaded = true;
+    }
+
+    Frame* GetRenderFrame() {
+        std::unique_lock lock{swap_chain_lock};
+
+        // If theres no free frames, we will reuse the oldest render frame
+        if (free_queue.empty()) {
+            auto frame = present_queue.back();
+            present_queue.pop_back();
+            return frame;
+        }
+
+        Frame* frame = free_queue.front();
+        free_queue.pop();
+        return frame;
+    }
+
+    void ReleaseRenderFrame(Frame* frame) {
+        std::unique_lock lock{swap_chain_lock};
+        present_queue.push_front(frame);
+        present_cv.notify_one();
+
+        DebugNotifyNextFrame();
+    }
+
+    Frame* TryGetPresentFrame(int timeout_ms) {
+        DebugWaitForNextFrame();
+
+        std::unique_lock lock{swap_chain_lock};
+        // wait for new entries in the present_queue
+        present_cv.wait_for(lock, std::chrono::milliseconds(timeout_ms),
+                            [&] { return !present_queue.empty(); });
+        if (present_queue.empty()) {
+            // timed out waiting for a frame to draw so return the previous frame
+            return previous_frame;
+        }
+
+        // free the previous frame and add it back to the free queue
+        if (previous_frame) {
+            free_queue.push(previous_frame);
+        }
+
+        // the newest entries are pushed to the front of the queue
+        Frame* frame = present_queue.front();
+        present_queue.pop_front();
+        // remove all old entries from the present queue and move them back to the free_queue
+        for (auto f : present_queue) {
+            free_queue.push(f);
+        }
+        present_queue.clear();
+        previous_frame = frame;
+        return frame;
+    }
+
+private:
+    std::mutex debug_synch_mutex;
+    std::condition_variable debug_synch_condition;
+    std::atomic_int frame_for_debug{};
+    const bool has_debug_tool; // When true, using a GPU debugger, so keep frames in lock-step
+
+    /// Signal that a new frame is available (called from GPU thread)
+    void DebugNotifyNextFrame() {
+        if (!has_debug_tool) {
+            return;
+        }
+        frame_for_debug++;
+        std::lock_guard lock{debug_synch_mutex};
+        debug_synch_condition.notify_one();
+    }
+
+    /// Wait for a new frame to be available (called from presentation thread)
+    void DebugWaitForNextFrame() {
+        if (!has_debug_tool) {
+            return;
+        }
+        const int last_frame = frame_for_debug;
+        std::unique_lock lock{debug_synch_mutex};
+        debug_synch_condition.wait(lock,
+                                   [this, last_frame] { return frame_for_debug > last_frame; });
+    }
+};
+
 RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system)
    : VideoCore::RendererBase{emu_window}, emu_window{emu_window}, system{system},
      frame_mailbox{std::make_unique<FrameMailbox>()} {}
@@ -576,6 +628,7 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {

    // TODO: Signal state tracker about these changes
    state_tracker.NotifyScreenDrawVertexArray();
+    state_tracker.NotifyPolygonModes();
    state_tracker.NotifyViewport0();
    state_tracker.NotifyScissor0();
    state_tracker.NotifyColorMask0();
@@ -611,6 +664,7 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
    glDisable(GL_ALPHA_TEST);
    glDisablei(GL_BLEND, 0);
    glDisablei(GL_SCISSOR_TEST, 0);
+    glPolygonMode(GL_FRONT_AND_BACK, GL_FILL);
    glCullFace(GL_BACK);
    glFrontFace(GL_CW);
    glColorMaski(0, GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
--- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
@@ -125,6 +125,7 @@ struct FormatTuple {
    {vk::Format::eR8Uint, Attachable | Storage},                 // R8UI
    {vk::Format::eR16G16B16A16Sfloat, Attachable | Storage},     // RGBA16F
    {vk::Format::eR16G16B16A16Unorm, Attachable | Storage},      // RGBA16U
+    {vk::Format::eR16G16B16A16Snorm, Attachable | Storage},      // RGBA16S
    {vk::Format::eR16G16B16A16Uint, Attachable | Storage},       // RGBA16UI
    {vk::Format::eB10G11R11UfloatPack32, Attachable | Storage},  // R11FG11FB10F
    {vk::Format::eR32G32B32A32Uint, Attachable | Storage},       // RGBA32UI
@@ -331,6 +332,8 @@ vk::Format VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttr
            return vk::Format::eR16G16B16Unorm;
        case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
            return vk::Format::eR16G16B16A16Unorm;
+        case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
+            return vk::Format::eA2B10G10R10UnormPack32;
        default:
            break;
        }
--- a/src/video_core/renderer_vulkan/vk_device.cpp
+++ b/src/video_core/renderer_vulkan/vk_device.cpp
@@ -107,8 +107,7 @@ bool VKDevice::Create(const vk::DispatchLoaderDynamic& dldi, vk::Instance instan
    features.occlusionQueryPrecise = true;
    features.fragmentStoresAndAtomics = true;
    features.shaderImageGatherExtended = true;
-    features.shaderStorageImageReadWithoutFormat =
-        is_shader_storage_img_read_without_format_supported;
+    features.shaderStorageImageReadWithoutFormat = is_formatless_image_load_supported;
    features.shaderStorageImageWriteWithoutFormat = true;
    features.textureCompressionASTC_LDR = is_optimal_astc_supported;

@@ -148,6 +147,15 @@ bool VKDevice::Create(const vk::DispatchLoaderDynamic& dldi, vk::Instance instan
        LOG_INFO(Render_Vulkan, "Device doesn't support uint8 indexes");
    }

+    vk::PhysicalDeviceTransformFeedbackFeaturesEXT transform_feedback;
+    if (ext_transform_feedback) {
+        transform_feedback.transformFeedback = true;
+        transform_feedback.geometryStreams = true;
+        SetNext(next, transform_feedback);
+    } else {
+        LOG_INFO(Render_Vulkan, "Device doesn't support transform feedbacks");
+    }
+
    if (!ext_depth_range_unrestricted) {
        LOG_INFO(Render_Vulkan, "Device doesn't support depth range unrestricted");
    }
@@ -385,7 +393,7 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami
        }
    };

-    extensions.reserve(14);
+    extensions.reserve(15);
    extensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME);
    extensions.push_back(VK_KHR_16BIT_STORAGE_EXTENSION_NAME);
    extensions.push_back(VK_KHR_8BIT_STORAGE_EXTENSION_NAME);
@@ -397,18 +405,22 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami

    [[maybe_unused]] const bool nsight =
        std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED");
-    bool khr_shader_float16_int8{};
-    bool ext_subgroup_size_control{};
+    bool has_khr_shader_float16_int8{};
+    bool has_ext_subgroup_size_control{};
+    bool has_ext_transform_feedback{};
    for (const auto& extension : physical.enumerateDeviceExtensionProperties(nullptr, dldi)) {
        Test(extension, khr_uniform_buffer_standard_layout,
             VK_KHR_UNIFORM_BUFFER_STANDARD_LAYOUT_EXTENSION_NAME, true);
-        Test(extension, khr_shader_float16_int8, VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME, false);
+        Test(extension, has_khr_shader_float16_int8, VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME,
+             false);
        Test(extension, ext_depth_range_unrestricted,
             VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME, true);
        Test(extension, ext_index_type_uint8, VK_EXT_INDEX_TYPE_UINT8_EXTENSION_NAME, true);
        Test(extension, ext_shader_viewport_index_layer,
             VK_EXT_SHADER_VIEWPORT_INDEX_LAYER_EXTENSION_NAME, true);
-        Test(extension, ext_subgroup_size_control, VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME,
+        Test(extension, has_ext_subgroup_size_control, VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME,
+             false);
+        Test(extension, has_ext_transform_feedback, VK_EXT_TRANSFORM_FEEDBACK_EXTENSION_NAME,
             false);
        if (Settings::values.renderer_debug) {
            Test(extension, nv_device_diagnostic_checkpoints,
@@ -416,13 +428,13 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami
        }
    }

-    if (khr_shader_float16_int8) {
+    if (has_khr_shader_float16_int8) {
        is_float16_supported =
            GetFeatures<vk::PhysicalDeviceFloat16Int8FeaturesKHR>(physical, dldi).shaderFloat16;
        extensions.push_back(VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME);
    }

-    if (ext_subgroup_size_control) {
+    if (has_ext_subgroup_size_control) {
        const auto features =
            GetFeatures<vk::PhysicalDeviceSubgroupSizeControlFeaturesEXT>(physical, dldi);
        const auto properties =
@@ -439,6 +451,20 @@ std::vector<const char*> VKDevice::LoadExtensions(const vk::DispatchLoaderDynami
        is_warp_potentially_bigger = true;
    }

+    if (has_ext_transform_feedback) {
+        const auto features =
+            GetFeatures<vk::PhysicalDeviceTransformFeedbackFeaturesEXT>(physical, dldi);
+        const auto properties =
+            GetProperties<vk::PhysicalDeviceTransformFeedbackPropertiesEXT>(physical, dldi);
+
+        if (features.transformFeedback && features.geometryStreams &&
+            properties.maxTransformFeedbackStreams >= 4 && properties.maxTransformFeedbackBuffers &&
+            properties.transformFeedbackQueries && properties.transformFeedbackDraw) {
+            extensions.push_back(VK_EXT_TRANSFORM_FEEDBACK_EXTENSION_NAME);
+            ext_transform_feedback = true;
+        }
+    }
+
    return extensions;
 }

@@ -467,8 +493,7 @@ void VKDevice::SetupFamilies(const vk::DispatchLoaderDynamic& dldi, vk::SurfaceK

 void VKDevice::SetupFeatures(const vk::DispatchLoaderDynamic& dldi) {
    const auto supported_features{physical.getFeatures(dldi)};
-    is_shader_storage_img_read_without_format_supported =
-        supported_features.shaderStorageImageReadWithoutFormat;
+    is_formatless_image_load_supported = supported_features.shaderStorageImageReadWithoutFormat;
    is_optimal_astc_supported = IsOptimalAstcSupported(supported_features, dldi);
 }

@@ -510,6 +535,7 @@ std::unordered_map<vk::Format, vk::FormatProperties> VKDevice::GetFormatProperti
                                        vk::Format::eR32G32Sfloat,
                                        vk::Format::eR32G32Uint,
                                        vk::Format::eR16G16B16A16Uint,
+                                        vk::Format::eR16G16B16A16Snorm,
                                        vk::Format::eR16G16B16A16Unorm,
                                        vk::Format::eR16G16Unorm,
                                        vk::Format::eR16G16Snorm,
--- a/src/video_core/renderer_vulkan/vk_device.h
+++ b/src/video_core/renderer_vulkan/vk_device.h
@@ -122,11 +122,6 @@ public:
        return properties.limits.maxPushConstantsSize;
    }

-    /// Returns true if Shader storage Image Read Without Format supported.
-    bool IsShaderStorageImageReadWithoutFormatSupported() const {
-        return is_shader_storage_img_read_without_format_supported;
-    }
-
    /// Returns true if ASTC is natively supported.
    bool IsOptimalAstcSupported() const {
        return is_optimal_astc_supported;
@@ -147,6 +142,11 @@ public:
        return (guest_warp_stages & stage) != vk::ShaderStageFlags{};
    }

+    /// Returns true if formatless image load is supported.
+    bool IsFormatlessImageLoadSupported() const {
+        return is_formatless_image_load_supported;
+    }
+
    /// Returns true if the device supports VK_EXT_scalar_block_layout.
    bool IsKhrUniformBufferStandardLayoutSupported() const {
        return khr_uniform_buffer_standard_layout;
@@ -167,6 +167,11 @@ public:
        return ext_shader_viewport_index_layer;
    }

+    /// Returns true if the device supports VK_EXT_transform_feedback.
+    bool IsExtTransformFeedbackSupported() const {
+        return ext_transform_feedback;
+    }
+
    /// Returns true if the device supports VK_NV_device_diagnostic_checkpoints.
    bool IsNvDeviceDiagnosticCheckpoints() const {
        return nv_device_diagnostic_checkpoints;
@@ -214,26 +219,26 @@ private:
    static std::unordered_map<vk::Format, vk::FormatProperties> GetFormatProperties(
        const vk::DispatchLoaderDynamic& dldi, vk::PhysicalDevice physical);

-    const vk::PhysicalDevice physical;         ///< Physical device.
-    vk::DispatchLoaderDynamic dld;             ///< Device function pointers.
-    vk::PhysicalDeviceProperties properties;   ///< Device properties.
-    UniqueDevice logical;                      ///< Logical device.
-    vk::Queue graphics_queue;                  ///< Main graphics queue.
-    vk::Queue present_queue;                   ///< Main present queue.
-    u32 graphics_family{};                     ///< Main graphics queue family index.
-    u32 present_family{};                      ///< Main present queue family index.
-    vk::DriverIdKHR driver_id{};               ///< Driver ID.
-    vk::ShaderStageFlags guest_warp_stages{};  ///< Stages where the guest warp size can be forced.
-    bool is_optimal_astc_supported{};          ///< Support for native ASTC.
-    bool is_float16_supported{};               ///< Support for float16 arithmetics.
-    bool is_warp_potentially_bigger{};         ///< Host warp size can be bigger than guest.
+    const vk::PhysicalDevice physical;        ///< Physical device.
+    vk::DispatchLoaderDynamic dld;            ///< Device function pointers.
+    vk::PhysicalDeviceProperties properties;  ///< Device properties.
+    UniqueDevice logical;                     ///< Logical device.
+    vk::Queue graphics_queue;                 ///< Main graphics queue.
+    vk::Queue present_queue;                  ///< Main present queue.
+    u32 graphics_family{};                    ///< Main graphics queue family index.
+    u32 present_family{};                     ///< Main present queue family index.
+    vk::DriverIdKHR driver_id{};              ///< Driver ID.
+    vk::ShaderStageFlags guest_warp_stages{}; ///< Stages where the guest warp size can be forced.ed
+    bool is_optimal_astc_supported{};         ///< Support for native ASTC.
+    bool is_float16_supported{};              ///< Support for float16 arithmetics.
+    bool is_warp_potentially_bigger{};        ///< Host warp size can be bigger than guest.
+    bool is_formatless_image_load_supported{}; ///< Support for shader image read without format.
    bool khr_uniform_buffer_standard_layout{}; ///< Support for std430 on UBOs.
    bool ext_index_type_uint8{};               ///< Support for VK_EXT_index_type_uint8.
    bool ext_depth_range_unrestricted{};       ///< Support for VK_EXT_depth_range_unrestricted.
    bool ext_shader_viewport_index_layer{};    ///< Support for VK_EXT_shader_viewport_index_layer.
+    bool ext_transform_feedback{};             ///< Support for VK_EXT_transform_feedback.
    bool nv_device_diagnostic_checkpoints{};   ///< Support for VK_NV_device_diagnostic_checkpoints.
-    bool is_shader_storage_img_read_without_format_supported{}; ///< Support for shader storage
-                                                                ///< image read without format

    // Telemetry parameters
    std::string vendor_name;                      ///< Device's driver name.
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -161,8 +161,8 @@ CachedShader::CachedShader(Core::System& system, Tegra::Engines::ShaderType stag
                           GPUVAddr gpu_addr, VAddr cpu_addr, u8* host_ptr,
                           ProgramCode program_code, u32 main_offset)
    : RasterizerCacheObject{host_ptr}, gpu_addr{gpu_addr}, cpu_addr{cpu_addr},
-      program_code{std::move(program_code)}, locker{stage, GetEngine(system, stage)},
-      shader_ir{this->program_code, main_offset, compiler_settings, locker},
+      program_code{std::move(program_code)}, registry{stage, GetEngine(system, stage)},
+      shader_ir{this->program_code, main_offset, compiler_settings, registry},
      entries{GenerateShaderEntries(shader_ir)} {}

 CachedShader::~CachedShader() = default;
@@ -273,9 +273,9 @@ VKComputePipeline& VKPipelineCache::GetComputePipeline(const ComputePipelineCach
    specialization.workgroup_size = key.workgroup_size;
    specialization.shared_memory_size = key.shared_memory_size;

-    const SPIRVShader spirv_shader{
-        Decompile(device, shader->GetIR(), ShaderType::Compute, specialization),
-        shader->GetEntries()};
+    const SPIRVShader spirv_shader{Decompile(device, shader->GetIR(), ShaderType::Compute,
+                                             shader->GetRegistry(), specialization),
+                                   shader->GetEntries()};
    entry = std::make_unique<VKComputePipeline>(device, scheduler, descriptor_pool,
                                                update_descriptor_queue, spirv_shader);
    return *entry;
@@ -324,8 +324,7 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) {
    const auto& gpu = system.GPU().Maxwell3D();

    Specialization specialization;
-    specialization.primitive_topology = fixed_state.input_assembly.topology;
-    if (specialization.primitive_topology == Maxwell::PrimitiveTopology::Points) {
+    if (fixed_state.input_assembly.topology == Maxwell::PrimitiveTopology::Points) {
        ASSERT(fixed_state.input_assembly.point_size != 0.0f);
        specialization.point_size = fixed_state.input_assembly.point_size;
    }
@@ -333,9 +332,6 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) {
        specialization.attribute_types[i] = fixed_state.vertex_input.attributes[i].type;
    }
    specialization.ndc_minus_one_to_one = fixed_state.rasterizer.ndc_minus_one_to_one;
-    specialization.tessellation.primitive = fixed_state.tessellation.primitive;
-    specialization.tessellation.spacing = fixed_state.tessellation.spacing;
-    specialization.tessellation.clockwise = fixed_state.tessellation.clockwise;

    SPIRVProgram program;
    std::vector<vk::DescriptorSetLayoutBinding> bindings;
@@ -356,8 +352,9 @@ VKPipelineCache::DecompileShaders(const GraphicsPipelineCacheKey& key) {
        const std::size_t stage = index == 0 ? 0 : index - 1; // Stage indices are 0 - 5
        const auto program_type = GetShaderType(program_enum);
        const auto& entries = shader->GetEntries();
-        program[stage] = {Decompile(device, shader->GetIR(), program_type, specialization),
-                          entries};
+        program[stage] = {
+            Decompile(device, shader->GetIR(), program_type, shader->GetRegistry(), specialization),
+            entries};

        if (program_enum == Maxwell::ShaderProgram::VertexA) {
            // VertexB was combined with VertexA, so we skip the VertexB iteration
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.h
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.h
@@ -25,7 +25,7 @@
 #include "video_core/renderer_vulkan/vk_renderpass_cache.h"
 #include "video_core/renderer_vulkan/vk_resource_manager.h"
 #include "video_core/renderer_vulkan/vk_shader_decompiler.h"
-#include "video_core/shader/const_buffer_locker.h"
+#include "video_core/shader/registry.h"
 #include "video_core/shader/shader_ir.h"
 #include "video_core/surface.h"

@@ -132,6 +132,10 @@ public:
        return shader_ir;
    }

+    const VideoCommon::Shader::Registry& GetRegistry() const {
+        return registry;
+    }
+
    const VideoCommon::Shader::ShaderIR& GetIR() const {
        return shader_ir;
    }
@@ -147,7 +151,7 @@ private:
    GPUVAddr gpu_addr{};
    VAddr cpu_addr{};
    ProgramCode program_code;
-    VideoCommon::Shader::ConstBufferLocker locker;
+    VideoCommon::Shader::Registry registry;
    VideoCommon::Shader::ShaderIR shader_ir;
    ShaderEntries entries;
 };
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -347,6 +347,8 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) {
            [&pipeline](auto cmdbuf, auto& dld) { cmdbuf.setCheckpointNV(&pipeline, dld); });
    }

+    BeginTransformFeedback();
+
    const auto pipeline_layout = pipeline.GetLayout();
    const auto descriptor_set = pipeline.CommitDescriptorSet();
    scheduler.Record([pipeline_layout, descriptor_set, draw_params](auto cmdbuf, auto& dld) {
@@ -356,6 +358,8 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) {
        }
        draw_params.Draw(cmdbuf, dld);
    });
+
+    EndTransformFeedback();
 }

 void RasterizerVulkan::Clear() {
@@ -738,6 +742,44 @@ void RasterizerVulkan::UpdateDynamicStates() {
    UpdateStencilFaces(regs);
 }

+void RasterizerVulkan::BeginTransformFeedback() {
+    const auto& regs = system.GPU().Maxwell3D().regs;
+    if (regs.tfb_enabled == 0) {
+        return;
+    }
+
+    UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) ||
+                     regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) ||
+                     regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry));
+
+    UNIMPLEMENTED_IF(regs.tfb_bindings[1].buffer_enable);
+    UNIMPLEMENTED_IF(regs.tfb_bindings[2].buffer_enable);
+    UNIMPLEMENTED_IF(regs.tfb_bindings[3].buffer_enable);
+
+    const auto& binding = regs.tfb_bindings[0];
+    UNIMPLEMENTED_IF(binding.buffer_enable == 0);
+    UNIMPLEMENTED_IF(binding.buffer_offset != 0);
+
+    const GPUVAddr gpu_addr = binding.Address();
+    const std::size_t size = binding.buffer_size;
+    const auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
+
+    scheduler.Record([buffer = *buffer, offset = offset, size](auto cmdbuf, auto& dld) {
+        cmdbuf.bindTransformFeedbackBuffersEXT(0, {buffer}, {offset}, {size}, dld);
+        cmdbuf.beginTransformFeedbackEXT(0, {}, {}, dld);
+    });
+}
+
+void RasterizerVulkan::EndTransformFeedback() {
+    const auto& regs = system.GPU().Maxwell3D().regs;
+    if (regs.tfb_enabled == 0) {
+        return;
+    }
+
+    scheduler.Record(
+        [](auto cmdbuf, auto& dld) { cmdbuf.endTransformFeedbackEXT(0, {}, {}, dld); });
+}
+
 void RasterizerVulkan::SetupVertexArrays(FixedPipelineState::VertexInput& vertex_input,
                                         BufferBindings& buffer_bindings) {
    const auto& regs = system.GPU().Maxwell3D().regs;
@@ -854,7 +896,7 @@ void RasterizerVulkan::SetupGraphicsTextures(const ShaderEntries& entries, std::

 void RasterizerVulkan::SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage) {
    MICROPROFILE_SCOPE(Vulkan_Images);
-    const auto& gpu = system.GPU().KeplerCompute();
+    const auto& gpu = system.GPU().Maxwell3D();
    for (const auto& entry : entries.images) {
        const auto tic = GetTextureInfo(gpu, entry, stage).tic;
        SetupImage(tic, entry);
@@ -915,6 +957,13 @@ void RasterizerVulkan::SetupComputeImages(const ShaderEntries& entries) {

 void RasterizerVulkan::SetupConstBuffer(const ConstBufferEntry& entry,
                                        const Tegra::Engines::ConstBufferInfo& buffer) {
+    if (!buffer.enabled) {
+        // Set values to zero to unbind buffers
+        update_descriptor_queue.AddBuffer(buffer_cache.GetEmptyBuffer(sizeof(float)), 0,
+                                          sizeof(float));
+        return;
+    }
+
    // Align the size to avoid bad std140 interactions
    const std::size_t size =
        Common::AlignUp(CalculateConstBufferSize(entry, buffer), 4 * sizeof(float));
@@ -1102,7 +1151,7 @@ std::size_t RasterizerVulkan::CalculateVertexArraysSize() const {
        // This implementation assumes that all attributes are used in the shader.
        const GPUVAddr start{regs.vertex_array[index].StartAddress()};
        const GPUVAddr end{regs.vertex_array_limit[index].LimitAddress()};
-        DEBUG_ASSERT(end > start);
+        DEBUG_ASSERT(end >= start);

        size += (end - start + 1) * regs.vertex_array[index].enable;
    }
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -169,6 +169,10 @@ private:

    void UpdateDynamicStates();

+    void BeginTransformFeedback();
+
+    void EndTransformFeedback();
+
    bool WalkAttachmentOverlaps(const CachedSurfaceView& attachment);

    void SetupVertexArrays(FixedPipelineState::VertexInput& vertex_input,
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -5,7 +5,9 @@
 #include <functional>
 #include <limits>
 #include <map>
+#include <optional>
 #include <type_traits>
+#include <unordered_map>
 #include <utility>

 #include <fmt/format.h>
@@ -24,6 +26,7 @@
 #include "video_core/renderer_vulkan/vk_shader_decompiler.h"
 #include "video_core/shader/node.h"
 #include "video_core/shader/shader_ir.h"
+#include "video_core/shader/transform_feedback.h"

 namespace Vulkan {

@@ -93,6 +96,12 @@ struct VertexIndices {
    std::optional<u32> clip_distances;
 };

+struct GenericVaryingDescription {
+    Id id = nullptr;
+    u32 first_element = 0;
+    bool is_scalar = false;
+};
+
 spv::Dim GetSamplerDim(const Sampler& sampler) {
    ASSERT(!sampler.IsBuffer());
    switch (sampler.GetType()) {
@@ -266,9 +275,13 @@ bool IsPrecise(Operation operand) {
 class SPIRVDecompiler final : public Sirit::Module {
 public:
    explicit SPIRVDecompiler(const VKDevice& device, const ShaderIR& ir, ShaderType stage,
-                             const Specialization& specialization)
+                             const Registry& registry, const Specialization& specialization)
        : Module(0x00010300), device{device}, ir{ir}, stage{stage}, header{ir.GetHeader()},
-          specialization{specialization} {
+          registry{registry}, specialization{specialization} {
+        if (stage != ShaderType::Compute) {
+            transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo());
+        }
+
        AddCapability(spv::Capability::Shader);
        AddCapability(spv::Capability::UniformAndStorageBuffer16BitAccess);
        AddCapability(spv::Capability::ImageQuery);
@@ -286,6 +299,15 @@ public:
        AddExtension("SPV_KHR_variable_pointers");
        AddExtension("SPV_KHR_shader_draw_parameters");

+        if (!transform_feedback.empty()) {
+            if (device.IsExtTransformFeedbackSupported()) {
+                AddCapability(spv::Capability::TransformFeedback);
+            } else {
+                LOG_ERROR(Render_Vulkan, "Shader requires transform feedbacks but these are not "
+                                         "supported on this device");
+            }
+        }
+
        if (ir.UsesLayer() || ir.UsesViewportIndex()) {
            if (ir.UsesViewportIndex()) {
                AddCapability(spv::Capability::MultiViewport);
@@ -296,7 +318,7 @@ public:
            }
        }

-        if (device.IsShaderStorageImageReadWithoutFormatSupported()) {
+        if (device.IsFormatlessImageLoadSupported()) {
            AddCapability(spv::Capability::StorageImageReadWithoutFormat);
        }

@@ -318,25 +340,29 @@ public:
            AddExecutionMode(main, spv::ExecutionMode::OutputVertices,
                             header.common2.threads_per_input_primitive);
            break;
-        case ShaderType::TesselationEval:
+        case ShaderType::TesselationEval: {
+            const auto& info = registry.GetGraphicsInfo();
            AddCapability(spv::Capability::Tessellation);
            AddEntryPoint(spv::ExecutionModel::TessellationEvaluation, main, "main", interfaces);
-            AddExecutionMode(main, GetExecutionMode(specialization.tessellation.primitive));
-            AddExecutionMode(main, GetExecutionMode(specialization.tessellation.spacing));
-            AddExecutionMode(main, specialization.tessellation.clockwise
+            AddExecutionMode(main, GetExecutionMode(info.tessellation_primitive));
+            AddExecutionMode(main, GetExecutionMode(info.tessellation_spacing));
+            AddExecutionMode(main, info.tessellation_clockwise
                                       ? spv::ExecutionMode::VertexOrderCw
                                       : spv::ExecutionMode::VertexOrderCcw);
            break;
-        case ShaderType::Geometry:
+        }
+        case ShaderType::Geometry: {
+            const auto& info = registry.GetGraphicsInfo();
            AddCapability(spv::Capability::Geometry);
            AddEntryPoint(spv::ExecutionModel::Geometry, main, "main", interfaces);
-            AddExecutionMode(main, GetExecutionMode(specialization.primitive_topology));
+            AddExecutionMode(main, GetExecutionMode(info.primitive_topology));
            AddExecutionMode(main, GetExecutionMode(header.common3.output_topology));
            AddExecutionMode(main, spv::ExecutionMode::OutputVertices,
                             header.common4.max_output_vertices);
            // TODO(Rodrigo): Where can we get this info from?
            AddExecutionMode(main, spv::ExecutionMode::Invocations, 1U);
            break;
+        }
        case ShaderType::Fragment:
            AddEntryPoint(spv::ExecutionModel::Fragment, main, "main", interfaces);
            AddExecutionMode(main, spv::ExecutionMode::OriginUpperLeft);
@@ -545,7 +571,8 @@ private:
        if (stage != ShaderType::Geometry) {
            return;
        }
-        const u32 num_input = GetNumPrimitiveTopologyVertices(specialization.primitive_topology);
+        const auto& info = registry.GetGraphicsInfo();
+        const u32 num_input = GetNumPrimitiveTopologyVertices(info.primitive_topology);
        DeclareInputVertexArray(num_input);
        DeclareOutputVertex();
    }
@@ -742,12 +769,34 @@ private:
    }

    void DeclareOutputAttributes() {
+        if (stage == ShaderType::Compute || stage == ShaderType::Fragment) {
+            return;
+        }
+
+        UNIMPLEMENTED_IF(registry.GetGraphicsInfo().tfb_enabled && stage != ShaderType::Vertex);
        for (const auto index : ir.GetOutputAttributes()) {
            if (!IsGenericAttribute(index)) {
                continue;
            }
-            const u32 location = GetGenericAttributeLocation(index);
-            Id type = t_float4;
+            DeclareOutputAttribute(index);
+        }
+    }
+
+    void DeclareOutputAttribute(Attribute::Index index) {
+        static constexpr std::string_view swizzle = "xyzw";
+
+        const u32 location = GetGenericAttributeLocation(index);
+        u8 element = 0;
+        while (element < 4) {
+            const std::size_t remainder = 4 - element;
+
+            std::size_t num_components = remainder;
+            const std::optional tfb = GetTransformFeedbackInfo(index, element);
+            if (tfb) {
+                num_components = tfb->components;
+            }
+
+            Id type = GetTypeVectorDefinitionLut(Type::Float).at(num_components - 1);
            Id varying_default = v_varying_default;
            if (IsOutputAttributeArray()) {
                const u32 num = GetNumOutputVertices();
@@ -760,15 +809,47 @@ private:
            }
            type = TypePointer(spv::StorageClass::Output, type);

+            std::string name = fmt::format("out_attr{}", location);
+            if (num_components < 4 || element > 0) {
+                name = fmt::format("{}_{}", name, swizzle.substr(element, num_components));
+            }
+
            const Id id = OpVariable(type, spv::StorageClass::Output, varying_default);
-            Name(AddGlobalVariable(id), fmt::format("out_attr{}", location));
-            output_attributes.emplace(index, id);
+            Name(AddGlobalVariable(id), name);
+
+            GenericVaryingDescription description;
+            description.id = id;
+            description.first_element = element;
+            description.is_scalar = num_components == 1;
+            for (u32 i = 0; i < num_components; ++i) {
+                const u8 offset = static_cast<u8>(static_cast<u32>(index) * 4 + element + i);
+                output_attributes.emplace(offset, description);
+            }
            interfaces.push_back(id);

            Decorate(id, spv::Decoration::Location, location);
+            if (element > 0) {
+                Decorate(id, spv::Decoration::Component, static_cast<u32>(element));
+            }
+            if (tfb && device.IsExtTransformFeedbackSupported()) {
+                Decorate(id, spv::Decoration::XfbBuffer, static_cast<u32>(tfb->buffer));
+                Decorate(id, spv::Decoration::XfbStride, static_cast<u32>(tfb->stride));
+                Decorate(id, spv::Decoration::Offset, static_cast<u32>(tfb->offset));
+            }
+
+            element = static_cast<u8>(static_cast<std::size_t>(element) + num_components);
        }
    }

+    std::optional<VaryingTFB> GetTransformFeedbackInfo(Attribute::Index index, u8 element = 0) {
+        const u8 location = static_cast<u8>(static_cast<u32>(index) * 4 + element);
+        const auto it = transform_feedback.find(location);
+        if (it == transform_feedback.end()) {
+            return {};
+        }
+        return it->second;
+    }
+
    u32 DeclareConstantBuffers(u32 binding) {
        for (const auto& [index, size] : ir.GetConstantBuffers()) {
            const Id type = device.IsKhrUniformBufferStandardLayoutSupported() ? t_cbuf_scalar_ubo
@@ -898,7 +979,7 @@ private:
    u32 GetNumInputVertices() const {
        switch (stage) {
        case ShaderType::Geometry:
-            return GetNumPrimitiveTopologyVertices(specialization.primitive_topology);
+            return GetNumPrimitiveTopologyVertices(registry.GetGraphicsInfo().primitive_topology);
        case ShaderType::TesselationControl:
        case ShaderType::TesselationEval:
            return NumInputPatches;
@@ -1346,8 +1427,14 @@ private:
                }
                default:
                    if (IsGenericAttribute(attribute)) {
-                        const Id composite = output_attributes.at(attribute);
-                        return {ArrayPass(t_out_float, composite, {element}), Type::Float};
+                        const u8 offset = static_cast<u8>(static_cast<u8>(attribute) * 4 + element);
+                        const GenericVaryingDescription description = output_attributes.at(offset);
+                        const Id composite = description.id;
+                        std::vector<u32> indices;
+                        if (!description.is_scalar) {
+                            indices.push_back(element - description.first_element);
+                        }
+                        return {ArrayPass(t_out_float, composite, indices), Type::Float};
                    }
                    UNIMPLEMENTED_MSG("Unhandled output attribute: {}",
                                      static_cast<u32>(attribute));
@@ -1793,7 +1880,7 @@ private:
    }

    Expression ImageLoad(Operation operation) {
-        if (!device.IsShaderStorageImageReadWithoutFormatSupported()) {
+        if (!device.IsFormatlessImageLoadSupported()) {
            return {v_float_zero, Type::Float};
        }

@@ -2258,11 +2345,11 @@ private:
    std::array<Id, 4> GetTypeVectorDefinitionLut(Type type) const {
        switch (type) {
        case Type::Float:
-            return {nullptr, t_float2, t_float3, t_float4};
+            return {t_float, t_float2, t_float3, t_float4};
        case Type::Int:
-            return {nullptr, t_int2, t_int3, t_int4};
+            return {t_int, t_int2, t_int3, t_int4};
        case Type::Uint:
-            return {nullptr, t_uint2, t_uint3, t_uint4};
+            return {t_uint, t_uint2, t_uint3, t_uint4};
        default:
            UNIMPLEMENTED();
            return {};
@@ -2495,7 +2582,9 @@ private:
    const ShaderIR& ir;
    const ShaderType stage;
    const Tegra::Shader::Header header;
+    const Registry& registry;
    const Specialization& specialization;
+    std::unordered_map<u8, VaryingTFB> transform_feedback;

    const Id t_void = Name(TypeVoid(), "void");

@@ -2584,7 +2673,7 @@ private:
    Id shared_memory{};
    std::array<Id, INTERNAL_FLAGS_COUNT> internal_flags{};
    std::map<Attribute::Index, Id> input_attributes;
-    std::map<Attribute::Index, Id> output_attributes;
+    std::unordered_map<u8, GenericVaryingDescription> output_attributes;
    std::map<u32, Id> constant_buffers;
    std::map<GlobalMemoryBase, Id> global_buffers;
    std::map<u32, TexelBuffer> texel_buffers;
@@ -2870,8 +2959,9 @@ ShaderEntries GenerateShaderEntries(const VideoCommon::Shader::ShaderIR& ir) {
 }

 std::vector<u32> Decompile(const VKDevice& device, const VideoCommon::Shader::ShaderIR& ir,
-                           ShaderType stage, const Specialization& specialization) {
-    return SPIRVDecompiler(device, ir, stage, specialization).Assemble();
+                           ShaderType stage, const VideoCommon::Shader::Registry& registry,
+                           const Specialization& specialization) {
+    return SPIRVDecompiler(device, ir, stage, registry, specialization).Assemble();
 }

 } // namespace Vulkan
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.h
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.h
@@ -15,6 +15,7 @@
 #include "common/common_types.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/shader_type.h"
+#include "video_core/shader/registry.h"
 #include "video_core/shader/shader_ir.h"

 namespace Vulkan {
@@ -91,17 +92,9 @@ struct Specialization final {
    u32 shared_memory_size{};

    // Graphics specific
-    Maxwell::PrimitiveTopology primitive_topology{};
    std::optional<float> point_size{};
    std::array<Maxwell::VertexAttribute::Type, Maxwell::NumVertexAttributes> attribute_types{};
    bool ndc_minus_one_to_one{};
-
-    // Tessellation specific
-    struct {
-        Maxwell::TessellationPrimitive primitive{};
-        Maxwell::TessellationSpacing spacing{};
-        bool clockwise{};
-    } tessellation;
 };
 // Old gcc versions don't consider this trivially copyable.
 // static_assert(std::is_trivially_copyable_v<Specialization>);
@@ -114,6 +107,8 @@ struct SPIRVShader {
 ShaderEntries GenerateShaderEntries(const VideoCommon::Shader::ShaderIR& ir);

 std::vector<u32> Decompile(const VKDevice& device, const VideoCommon::Shader::ShaderIR& ir,
-                           Tegra::Engines::ShaderType stage, const Specialization& specialization);
+                           Tegra::Engines::ShaderType stage,
+                           const VideoCommon::Shader::Registry& registry,
+                           const Specialization& specialization);

 } // namespace Vulkan
--- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp
+++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp
@@ -73,7 +73,8 @@ VKBuffer* VKStagingBufferPool::TryGetReservedBuffer(std::size_t size, bool host_
 VKBuffer& VKStagingBufferPool::CreateStagingBuffer(std::size_t size, bool host_visible) {
    const auto usage =
        vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst |
-        vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eIndexBuffer;
+        vk::BufferUsageFlagBits::eUniformBuffer | vk::BufferUsageFlagBits::eStorageBuffer |
+        vk::BufferUsageFlagBits::eIndexBuffer;
    const u32 log2 = Common::Log2Ceil64(size);
    const vk::BufferCreateInfo buffer_ci({}, 1ULL << log2, usage, vk::SharingMode::eExclusive, 0,
                                         nullptr);
--- a/src/video_core/shader/const_buffer_locker.cpp
+++ b/src/video_core/shader/const_buffer_locker.cpp
@@ -1,126 +0,0 @@
-// Copyright 2019 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#include <algorithm>
-#include <tuple>
-
-#include "common/common_types.h"
-#include "video_core/engines/maxwell_3d.h"
-#include "video_core/engines/shader_type.h"
-#include "video_core/shader/const_buffer_locker.h"
-
-namespace VideoCommon::Shader {
-
-using Tegra::Engines::SamplerDescriptor;
-
-ConstBufferLocker::ConstBufferLocker(Tegra::Engines::ShaderType shader_stage)
-    : stage{shader_stage} {}
-
-ConstBufferLocker::ConstBufferLocker(Tegra::Engines::ShaderType shader_stage,
-                                     Tegra::Engines::ConstBufferEngineInterface& engine)
-    : stage{shader_stage}, engine{&engine} {}
-
-ConstBufferLocker::~ConstBufferLocker() = default;
-
-std::optional<u32> ConstBufferLocker::ObtainKey(u32 buffer, u32 offset) {
-    const std::pair<u32, u32> key = {buffer, offset};
-    const auto iter = keys.find(key);
-    if (iter != keys.end()) {
-        return iter->second;
-    }
-    if (!engine) {
-        return std::nullopt;
-    }
-    const u32 value = engine->AccessConstBuffer32(stage, buffer, offset);
-    keys.emplace(key, value);
-    return value;
-}
-
-std::optional<SamplerDescriptor> ConstBufferLocker::ObtainBoundSampler(u32 offset) {
-    const u32 key = offset;
-    const auto iter = bound_samplers.find(key);
-    if (iter != bound_samplers.end()) {
-        return iter->second;
-    }
-    if (!engine) {
-        return std::nullopt;
-    }
-    const SamplerDescriptor value = engine->AccessBoundSampler(stage, offset);
-    bound_samplers.emplace(key, value);
-    return value;
-}
-
-std::optional<Tegra::Engines::SamplerDescriptor> ConstBufferLocker::ObtainBindlessSampler(
-    u32 buffer, u32 offset) {
-    const std::pair key = {buffer, offset};
-    const auto iter = bindless_samplers.find(key);
-    if (iter != bindless_samplers.end()) {
-        return iter->second;
-    }
-    if (!engine) {
-        return std::nullopt;
-    }
-    const SamplerDescriptor value = engine->AccessBindlessSampler(stage, buffer, offset);
-    bindless_samplers.emplace(key, value);
-    return value;
-}
-
-std::optional<u32> ConstBufferLocker::ObtainBoundBuffer() {
-    if (bound_buffer_saved) {
-        return bound_buffer;
-    }
-    if (!engine) {
-        return std::nullopt;
-    }
-    bound_buffer_saved = true;
-    bound_buffer = engine->GetBoundBuffer();
-    return bound_buffer;
-}
-
-void ConstBufferLocker::InsertKey(u32 buffer, u32 offset, u32 value) {
-    keys.insert_or_assign({buffer, offset}, value);
-}
-
-void ConstBufferLocker::InsertBoundSampler(u32 offset, SamplerDescriptor sampler) {
-    bound_samplers.insert_or_assign(offset, sampler);
-}
-
-void ConstBufferLocker::InsertBindlessSampler(u32 buffer, u32 offset, SamplerDescriptor sampler) {
-    bindless_samplers.insert_or_assign({buffer, offset}, sampler);
-}
-
-void ConstBufferLocker::SetBoundBuffer(u32 buffer) {
-    bound_buffer_saved = true;
-    bound_buffer = buffer;
-}
-
-bool ConstBufferLocker::IsConsistent() const {
-    if (!engine) {
-        return false;
-    }
-    return std::all_of(keys.begin(), keys.end(),
-                       [this](const auto& pair) {
-                           const auto [cbuf, offset] = pair.first;
-                           const auto value = pair.second;
-                           return value == engine->AccessConstBuffer32(stage, cbuf, offset);
-                       }) &&
-           std::all_of(bound_samplers.begin(), bound_samplers.end(),
-                       [this](const auto& sampler) {
-                           const auto [key, value] = sampler;
-                           return value == engine->AccessBoundSampler(stage, key);
-                       }) &&
-           std::all_of(bindless_samplers.begin(), bindless_samplers.end(),
-                       [this](const auto& sampler) {
-                           const auto [cbuf, offset] = sampler.first;
-                           const auto value = sampler.second;
-                           return value == engine->AccessBindlessSampler(stage, cbuf, offset);
-                       });
-}
-
-bool ConstBufferLocker::HasEqualKeys(const ConstBufferLocker& rhs) const {
-    return std::tie(keys, bound_samplers, bindless_samplers) ==
-           std::tie(rhs.keys, rhs.bound_samplers, rhs.bindless_samplers);
-}
-
-} // namespace VideoCommon::Shader
--- a/src/video_core/shader/const_buffer_locker.h
+++ b/src/video_core/shader/const_buffer_locker.h
@@ -1,103 +0,0 @@
-// Copyright 2019 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-#include <optional>
-#include <unordered_map>
-#include "common/common_types.h"
-#include "common/hash.h"
-#include "video_core/engines/const_buffer_engine_interface.h"
-#include "video_core/engines/shader_type.h"
-#include "video_core/guest_driver.h"
-
-namespace VideoCommon::Shader {
-
-using KeyMap = std::unordered_map<std::pair<u32, u32>, u32, Common::PairHash>;
-using BoundSamplerMap = std::unordered_map<u32, Tegra::Engines::SamplerDescriptor>;
-using BindlessSamplerMap =
-    std::unordered_map<std::pair<u32, u32>, Tegra::Engines::SamplerDescriptor, Common::PairHash>;
-
-/**
- * The ConstBufferLocker is a class use to interface the 3D and compute engines with the shader
- * compiler. with it, the shader can obtain required data from GPU state and store it for disk
- * shader compilation.
- */
-class ConstBufferLocker {
-public:
-    explicit ConstBufferLocker(Tegra::Engines::ShaderType shader_stage);
-
-    explicit ConstBufferLocker(Tegra::Engines::ShaderType shader_stage,
-                               Tegra::Engines::ConstBufferEngineInterface& engine);
-
-    ~ConstBufferLocker();
-
-    /// Retrieves a key from the locker, if it's registered, it will give the registered value, if
-    /// not it will obtain it from maxwell3d and register it.
-    std::optional<u32> ObtainKey(u32 buffer, u32 offset);
-
-    std::optional<Tegra::Engines::SamplerDescriptor> ObtainBoundSampler(u32 offset);
-
-    std::optional<Tegra::Engines::SamplerDescriptor> ObtainBindlessSampler(u32 buffer, u32 offset);
-
-    std::optional<u32> ObtainBoundBuffer();
-
-    /// Inserts a key.
-    void InsertKey(u32 buffer, u32 offset, u32 value);
-
-    /// Inserts a bound sampler key.
-    void InsertBoundSampler(u32 offset, Tegra::Engines::SamplerDescriptor sampler);
-
-    /// Inserts a bindless sampler key.
-    void InsertBindlessSampler(u32 buffer, u32 offset, Tegra::Engines::SamplerDescriptor sampler);
-
-    /// Set the bound buffer for this locker.
-    void SetBoundBuffer(u32 buffer);
-
-    /// Checks keys and samplers against engine's current const buffers. Returns true if they are
-    /// the same value, false otherwise;
-    bool IsConsistent() const;
-
-    /// Returns true if the keys are equal to the other ones in the locker.
-    bool HasEqualKeys(const ConstBufferLocker& rhs) const;
-
-    /// Gives an getter to the const buffer keys in the database.
-    const KeyMap& GetKeys() const {
-        return keys;
-    }
-
-    /// Gets samplers database.
-    const BoundSamplerMap& GetBoundSamplers() const {
-        return bound_samplers;
-    }
-
-    /// Gets bindless samplers database.
-    const BindlessSamplerMap& GetBindlessSamplers() const {
-        return bindless_samplers;
-    }
-
-    /// Gets bound buffer used on this shader
-    u32 GetBoundBuffer() const {
-        return bound_buffer;
-    }
-
-    /// Obtains access to the guest driver's profile.
-    VideoCore::GuestDriverProfile* AccessGuestDriverProfile() const {
-        if (engine) {
-            return &engine->AccessGuestDriverProfile();
-        }
-        return nullptr;
-    }
-
-private:
-    const Tegra::Engines::ShaderType stage;
-    Tegra::Engines::ConstBufferEngineInterface* engine = nullptr;
-    KeyMap keys;
-    BoundSamplerMap bound_samplers;
-    BindlessSamplerMap bindless_samplers;
-    bool bound_buffer_saved{};
-    u32 bound_buffer{};
-};
-
-} // namespace VideoCommon::Shader
--- a/src/video_core/shader/control_flow.cpp
+++ b/src/video_core/shader/control_flow.cpp
@@ -13,6 +13,7 @@
 #include "common/common_types.h"
 #include "video_core/shader/ast.h"
 #include "video_core/shader/control_flow.h"
+#include "video_core/shader/registry.h"
 #include "video_core/shader/shader_ir.h"

 namespace VideoCommon::Shader {
@@ -64,11 +65,11 @@ struct BlockInfo {
 };

 struct CFGRebuildState {
-    explicit CFGRebuildState(const ProgramCode& program_code, u32 start, ConstBufferLocker& locker)
-        : program_code{program_code}, locker{locker}, start{start} {}
+    explicit CFGRebuildState(const ProgramCode& program_code, u32 start, Registry& registry)
+        : program_code{program_code}, registry{registry}, start{start} {}

    const ProgramCode& program_code;
-    ConstBufferLocker& locker;
+    Registry& registry;
    u32 start{};
    std::vector<BlockInfo> block_info;
    std::list<u32> inspect_queries;
@@ -438,7 +439,7 @@ std::pair<ParseResult, ParseInfo> ParseCode(CFGRebuildState& state, u32 address)
            const s32 pc_target = offset + result.relative_position;
            std::vector<CaseBranch> branches;
            for (u32 i = 0; i < result.entries; i++) {
-                auto key = state.locker.ObtainKey(result.buffer, result.offset + i * 4);
+                auto key = state.registry.ObtainKey(result.buffer, result.offset + i * 4);
                if (!key) {
                    return {ParseResult::AbnormalFlow, parse_info};
                }
@@ -656,14 +657,14 @@ void DecompileShader(CFGRebuildState& state) {

 std::unique_ptr<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u32 start_address,
                                                const CompilerSettings& settings,
-                                                ConstBufferLocker& locker) {
+                                                Registry& registry) {
    auto result_out = std::make_unique<ShaderCharacteristics>();
    if (settings.depth == CompileDepth::BruteForce) {
        result_out->settings.depth = CompileDepth::BruteForce;
        return result_out;
    }

-    CFGRebuildState state{program_code, start_address, locker};
+    CFGRebuildState state{program_code, start_address, registry};
    // Inspect Code and generate blocks
    state.labels.clear();
    state.labels.emplace(start_address);
--- a/src/video_core/shader/control_flow.h
+++ b/src/video_core/shader/control_flow.h
@@ -12,6 +12,7 @@
 #include "video_core/engines/shader_bytecode.h"
 #include "video_core/shader/ast.h"
 #include "video_core/shader/compiler_settings.h"
+#include "video_core/shader/registry.h"
 #include "video_core/shader/shader_ir.h"

 namespace VideoCommon::Shader {
@@ -111,6 +112,6 @@ struct ShaderCharacteristics {

 std::unique_ptr<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, u32 start_address,
                                                const CompilerSettings& settings,
-                                                ConstBufferLocker& locker);
+                                                Registry& registry);

 } // namespace VideoCommon::Shader
--- a/src/video_core/shader/decode.cpp
+++ b/src/video_core/shader/decode.cpp
@@ -34,13 +34,9 @@ constexpr bool IsSchedInstruction(u32 offset, u32 main_offset) {
    return (absolute_offset % SchedPeriod) == 0;
 }

-void DeduceTextureHandlerSize(VideoCore::GuestDriverProfile* gpu_driver,
+void DeduceTextureHandlerSize(VideoCore::GuestDriverProfile& gpu_driver,
                              const std::list<Sampler>& used_samplers) {
-    if (gpu_driver == nullptr) {
-        LOG_CRITICAL(HW_GPU, "GPU driver profile has not been created yet");
-        return;
-    }
-    if (gpu_driver->TextureHandlerSizeKnown() || used_samplers.size() <= 1) {
+    if (gpu_driver.IsTextureHandlerSizeKnown() || used_samplers.size() <= 1) {
        return;
    }
    u32 count{};
@@ -53,17 +49,13 @@ void DeduceTextureHandlerSize(VideoCore::GuestDriverProfile* gpu_driver,
        bound_offsets.emplace_back(sampler.GetOffset());
    }
    if (count > 1) {
-        gpu_driver->DeduceTextureHandlerSize(std::move(bound_offsets));
+        gpu_driver.DeduceTextureHandlerSize(std::move(bound_offsets));
    }
 }

 std::optional<u32> TryDeduceSamplerSize(const Sampler& sampler_to_deduce,
-                                        VideoCore::GuestDriverProfile* gpu_driver,
+                                        VideoCore::GuestDriverProfile& gpu_driver,
                                        const std::list<Sampler>& used_samplers) {
-    if (gpu_driver == nullptr) {
-        LOG_CRITICAL(HW_GPU, "GPU Driver profile has not been created yet");
-        return std::nullopt;
-    }
    const u32 base_offset = sampler_to_deduce.GetOffset();
    u32 max_offset{std::numeric_limits<u32>::max()};
    for (const auto& sampler : used_samplers) {
@@ -77,7 +69,7 @@ std::optional<u32> TryDeduceSamplerSize(const Sampler& sampler_to_deduce,
    if (max_offset == std::numeric_limits<u32>::max()) {
        return std::nullopt;
    }
-    return ((max_offset - base_offset) * 4) / gpu_driver->GetTextureHandlerSize();
+    return ((max_offset - base_offset) * 4) / gpu_driver.GetTextureHandlerSize();
 }

 } // Anonymous namespace
@@ -149,7 +141,7 @@ void ShaderIR::Decode() {
    std::memcpy(&header, program_code.data(), sizeof(Tegra::Shader::Header));

    decompiled = false;
-    auto info = ScanFlow(program_code, main_offset, settings, locker);
+    auto info = ScanFlow(program_code, main_offset, settings, registry);
    auto& shader_info = *info;
    coverage_begin = shader_info.start;
    coverage_end = shader_info.end;
@@ -364,7 +356,7 @@ u32 ShaderIR::DecodeInstr(NodeBlock& bb, u32 pc) {

 void ShaderIR::PostDecode() {
    // Deduce texture handler size if needed
-    auto gpu_driver = locker.AccessGuestDriverProfile();
+    auto gpu_driver = registry.AccessGuestDriverProfile();
    DeduceTextureHandlerSize(gpu_driver, used_samplers);
    // Deduce Indexed Samplers
    if (!uses_indexed_samplers) {
--- a/src/video_core/shader/decode/bfe.cpp
+++ b/src/video_core/shader/decode/bfe.cpp
@@ -17,33 +17,60 @@ u32 ShaderIR::DecodeBfe(NodeBlock& bb, u32 pc) {
    const Instruction instr = {program_code[pc]};
    const auto opcode = OpCode::Decode(instr);

-    UNIMPLEMENTED_IF(instr.bfe.negate_b);
-
    Node op_a = GetRegister(instr.gpr8);
-    op_a = GetOperandAbsNegInteger(op_a, false, instr.bfe.negate_a, false);
+    Node op_b = [&] {
+        switch (opcode->get().GetId()) {
+        case OpCode::Id::BFE_R:
+            return GetRegister(instr.gpr20);
+        case OpCode::Id::BFE_C:
+            return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset());
+        case OpCode::Id::BFE_IMM:
+            return Immediate(instr.alu.GetSignedImm20_20());
+        default:
+            UNREACHABLE();
+            return Immediate(0);
+        }
+    }();

-    switch (opcode->get().GetId()) {
-    case OpCode::Id::BFE_IMM: {
-        UNIMPLEMENTED_IF_MSG(instr.generates_cc,
-                             "Condition codes generation in BFE is not implemented");
+    UNIMPLEMENTED_IF_MSG(instr.bfe.rd_cc, "Condition codes in BFE is not implemented");

-        const Node inner_shift_imm = Immediate(static_cast<u32>(instr.bfe.GetLeftShiftValue()));
-        const Node outer_shift_imm =
-            Immediate(static_cast<u32>(instr.bfe.GetLeftShiftValue() + instr.bfe.shift_position));
+    const bool is_signed = instr.bfe.is_signed;

-        const Node inner_shift =
-            Operation(OperationCode::ILogicalShiftLeft, NO_PRECISE, op_a, inner_shift_imm);
-        const Node outer_shift =
-            Operation(OperationCode::ILogicalShiftRight, NO_PRECISE, inner_shift, outer_shift_imm);
-
-        SetInternalFlagsFromInteger(bb, outer_shift, instr.generates_cc);
-        SetRegister(bb, instr.gpr0, outer_shift);
-        break;
-    }
-    default:
-        UNIMPLEMENTED_MSG("Unhandled BFE instruction: {}", opcode->get().GetName());
+    // using reverse parallel method in
+    // https://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel
+    // note for later if possible to implement faster method.
+    if (instr.bfe.brev) {
+        const auto swap = [&](u32 s, u32 mask) {
+            Node v1 =
+                SignedOperation(OperationCode::ILogicalShiftRight, is_signed, op_a, Immediate(s));
+            if (mask != 0) {
+                v1 = SignedOperation(OperationCode::IBitwiseAnd, is_signed, std::move(v1),
+                                     Immediate(mask));
+            }
+            Node v2 = op_a;
+            if (mask != 0) {
+                v2 = SignedOperation(OperationCode::IBitwiseAnd, is_signed, std::move(v2),
+                                     Immediate(mask));
+            }
+            v2 = SignedOperation(OperationCode::ILogicalShiftLeft, is_signed, std::move(v2),
+                                 Immediate(s));
+            return SignedOperation(OperationCode::IBitwiseOr, is_signed, std::move(v1),
+                                   std::move(v2));
+        };
+        op_a = swap(1, 0x55555555U);
+        op_a = swap(2, 0x33333333U);
+        op_a = swap(4, 0x0F0F0F0FU);
+        op_a = swap(8, 0x00FF00FFU);
+        op_a = swap(16, 0);
    }

+    const auto offset = SignedOperation(OperationCode::IBitfieldExtract, is_signed, op_b,
+                                        Immediate(0), Immediate(8));
+    const auto bits = SignedOperation(OperationCode::IBitfieldExtract, is_signed, op_b,
+                                      Immediate(8), Immediate(8));
+    auto result = SignedOperation(OperationCode::IBitfieldExtract, is_signed, op_a, offset, bits);
+    SetRegister(bb, instr.gpr0, std::move(result));
+
    return pc;
 }

--- a/src/video_core/shader/decode/texture.cpp
+++ b/src/video_core/shader/decode/texture.cpp
@@ -12,6 +12,7 @@
 #include "common/logging/log.h"
 #include "video_core/engines/shader_bytecode.h"
 #include "video_core/shader/node_helper.h"
+#include "video_core/shader/registry.h"
 #include "video_core/shader/shader_ir.h"

 namespace VideoCommon::Shader {
@@ -359,8 +360,8 @@ ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo(std::optional<SamplerInfo> sample
    if (sampler_info) {
        return *sampler_info;
    }
-    const auto sampler =
-        buffer ? locker.ObtainBindlessSampler(*buffer, offset) : locker.ObtainBoundSampler(offset);
+    const auto sampler = buffer ? registry.ObtainBindlessSampler(*buffer, offset)
+                                : registry.ObtainBoundSampler(offset);
    if (!sampler) {
        LOG_WARNING(HW_GPU, "Unknown sampler info");
        return SamplerInfo{TextureType::Texture2D, false, false, false};
--- a/src/video_core/shader/node_helper.cpp
+++ b/src/video_core/shader/node_helper.cpp
@@ -68,6 +68,8 @@ OperationCode SignedToUnsignedCode(OperationCode operation_code, bool is_signed)
        return OperationCode::UBitwiseXor;
    case OperationCode::IBitwiseNot:
        return OperationCode::UBitwiseNot;
+    case OperationCode::IBitfieldExtract:
+        return OperationCode::UBitfieldExtract;
    case OperationCode::IBitfieldInsert:
        return OperationCode::UBitfieldInsert;
    case OperationCode::IBitCount:
--- a/src/video_core/shader/registry.cpp
+++ b/src/video_core/shader/registry.cpp
@@ -0,0 +1,161 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <tuple>
+
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "video_core/engines/kepler_compute.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/engines/shader_type.h"
+#include "video_core/shader/registry.h"
+
+namespace VideoCommon::Shader {
+
+using Tegra::Engines::ConstBufferEngineInterface;
+using Tegra::Engines::SamplerDescriptor;
+using Tegra::Engines::ShaderType;
+
+namespace {
+
+GraphicsInfo MakeGraphicsInfo(ShaderType shader_stage, ConstBufferEngineInterface& engine) {
+    if (shader_stage == ShaderType::Compute) {
+        return {};
+    }
+    auto& graphics = static_cast<Tegra::Engines::Maxwell3D&>(engine);
+
+    GraphicsInfo info;
+    info.tfb_layouts = graphics.regs.tfb_layouts;
+    info.tfb_varying_locs = graphics.regs.tfb_varying_locs;
+    info.primitive_topology = graphics.regs.draw.topology;
+    info.tessellation_primitive = graphics.regs.tess_mode.prim;
+    info.tessellation_spacing = graphics.regs.tess_mode.spacing;
+    info.tfb_enabled = graphics.regs.tfb_enabled;
+    info.tessellation_clockwise = graphics.regs.tess_mode.cw;
+    return info;
+}
+
+ComputeInfo MakeComputeInfo(ShaderType shader_stage, ConstBufferEngineInterface& engine) {
+    if (shader_stage != ShaderType::Compute) {
+        return {};
+    }
+    auto& compute = static_cast<Tegra::Engines::KeplerCompute&>(engine);
+    const auto& launch = compute.launch_description;
+
+    ComputeInfo info;
+    info.workgroup_size = {launch.block_dim_x, launch.block_dim_y, launch.block_dim_z};
+    info.local_memory_size_in_words = launch.local_pos_alloc;
+    info.shared_memory_size_in_words = launch.shared_alloc;
+    return info;
+}
+
+} // Anonymous namespace
+
+Registry::Registry(Tegra::Engines::ShaderType shader_stage, const SerializedRegistryInfo& info)
+    : stage{shader_stage}, stored_guest_driver_profile{info.guest_driver_profile},
+      bound_buffer{info.bound_buffer}, graphics_info{info.graphics}, compute_info{info.compute} {}
+
+Registry::Registry(Tegra::Engines::ShaderType shader_stage,
+                   Tegra::Engines::ConstBufferEngineInterface& engine)
+    : stage{shader_stage}, engine{&engine}, bound_buffer{engine.GetBoundBuffer()},
+      graphics_info{MakeGraphicsInfo(shader_stage, engine)}, compute_info{MakeComputeInfo(
+                                                                 shader_stage, engine)} {}
+
+Registry::~Registry() = default;
+
+std::optional<u32> Registry::ObtainKey(u32 buffer, u32 offset) {
+    const std::pair<u32, u32> key = {buffer, offset};
+    const auto iter = keys.find(key);
+    if (iter != keys.end()) {
+        return iter->second;
+    }
+    if (!engine) {
+        return std::nullopt;
+    }
+    const u32 value = engine->AccessConstBuffer32(stage, buffer, offset);
+    keys.emplace(key, value);
+    return value;
+}
+
+std::optional<SamplerDescriptor> Registry::ObtainBoundSampler(u32 offset) {
+    const u32 key = offset;
+    const auto iter = bound_samplers.find(key);
+    if (iter != bound_samplers.end()) {
+        return iter->second;
+    }
+    if (!engine) {
+        return std::nullopt;
+    }
+    const SamplerDescriptor value = engine->AccessBoundSampler(stage, offset);
+    bound_samplers.emplace(key, value);
+    return value;
+}
+
+std::optional<Tegra::Engines::SamplerDescriptor> Registry::ObtainBindlessSampler(u32 buffer,
+                                                                                 u32 offset) {
+    const std::pair key = {buffer, offset};
+    const auto iter = bindless_samplers.find(key);
+    if (iter != bindless_samplers.end()) {
+        return iter->second;
+    }
+    if (!engine) {
+        return std::nullopt;
+    }
+    const SamplerDescriptor value = engine->AccessBindlessSampler(stage, buffer, offset);
+    bindless_samplers.emplace(key, value);
+    return value;
+}
+
+void Registry::InsertKey(u32 buffer, u32 offset, u32 value) {
+    keys.insert_or_assign({buffer, offset}, value);
+}
+
+void Registry::InsertBoundSampler(u32 offset, SamplerDescriptor sampler) {
+    bound_samplers.insert_or_assign(offset, sampler);
+}
+
+void Registry::InsertBindlessSampler(u32 buffer, u32 offset, SamplerDescriptor sampler) {
+    bindless_samplers.insert_or_assign({buffer, offset}, sampler);
+}
+
+bool Registry::IsConsistent() const {
+    if (!engine) {
+        return true;
+    }
+    return std::all_of(keys.begin(), keys.end(),
+                       [this](const auto& pair) {
+                           const auto [cbuf, offset] = pair.first;
+                           const auto value = pair.second;
+                           return value == engine->AccessConstBuffer32(stage, cbuf, offset);
+                       }) &&
+           std::all_of(bound_samplers.begin(), bound_samplers.end(),
+                       [this](const auto& sampler) {
+                           const auto [key, value] = sampler;
+                           return value == engine->AccessBoundSampler(stage, key);
+                       }) &&
+           std::all_of(bindless_samplers.begin(), bindless_samplers.end(),
+                       [this](const auto& sampler) {
+                           const auto [cbuf, offset] = sampler.first;
+                           const auto value = sampler.second;
+                           return value == engine->AccessBindlessSampler(stage, cbuf, offset);
+                       });
+}
+
+bool Registry::HasEqualKeys(const Registry& rhs) const {
+    return std::tie(keys, bound_samplers, bindless_samplers) ==
+           std::tie(rhs.keys, rhs.bound_samplers, rhs.bindless_samplers);
+}
+
+const GraphicsInfo& Registry::GetGraphicsInfo() const {
+    ASSERT(stage != Tegra::Engines::ShaderType::Compute);
+    return graphics_info;
+}
+
+const ComputeInfo& Registry::GetComputeInfo() const {
+    ASSERT(stage == Tegra::Engines::ShaderType::Compute);
+    return compute_info;
+}
+
+} // namespace VideoCommon::Shader
--- a/src/video_core/shader/registry.h
+++ b/src/video_core/shader/registry.h
@@ -0,0 +1,137 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <optional>
+#include <type_traits>
+#include <unordered_map>
+#include <utility>
+
+#include "common/common_types.h"
+#include "common/hash.h"
+#include "video_core/engines/const_buffer_engine_interface.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/engines/shader_type.h"
+#include "video_core/guest_driver.h"
+
+namespace VideoCommon::Shader {
+
+using KeyMap = std::unordered_map<std::pair<u32, u32>, u32, Common::PairHash>;
+using BoundSamplerMap = std::unordered_map<u32, Tegra::Engines::SamplerDescriptor>;
+using BindlessSamplerMap =
+    std::unordered_map<std::pair<u32, u32>, Tegra::Engines::SamplerDescriptor, Common::PairHash>;
+
+struct GraphicsInfo {
+    using Maxwell = Tegra::Engines::Maxwell3D::Regs;
+
+    std::array<Maxwell::TransformFeedbackLayout, Maxwell::NumTransformFeedbackBuffers>
+        tfb_layouts{};
+    std::array<std::array<u8, 128>, Maxwell::NumTransformFeedbackBuffers> tfb_varying_locs{};
+    Maxwell::PrimitiveTopology primitive_topology{};
+    Maxwell::TessellationPrimitive tessellation_primitive{};
+    Maxwell::TessellationSpacing tessellation_spacing{};
+    bool tfb_enabled = false;
+    bool tessellation_clockwise = false;
+};
+static_assert(std::is_trivially_copyable_v<GraphicsInfo> &&
+              std::is_standard_layout_v<GraphicsInfo>);
+
+struct ComputeInfo {
+    std::array<u32, 3> workgroup_size{};
+    u32 shared_memory_size_in_words = 0;
+    u32 local_memory_size_in_words = 0;
+};
+static_assert(std::is_trivially_copyable_v<ComputeInfo> && std::is_standard_layout_v<ComputeInfo>);
+
+struct SerializedRegistryInfo {
+    VideoCore::GuestDriverProfile guest_driver_profile;
+    u32 bound_buffer = 0;
+    GraphicsInfo graphics;
+    ComputeInfo compute;
+};
+
+/**
+ * The Registry is a class use to interface the 3D and compute engines with the shader compiler.
+ * With it, the shader can obtain required data from GPU state and store it for disk shader
+ * compilation.
+ */
+class Registry {
+public:
+    explicit Registry(Tegra::Engines::ShaderType shader_stage, const SerializedRegistryInfo& info);
+
+    explicit Registry(Tegra::Engines::ShaderType shader_stage,
+                      Tegra::Engines::ConstBufferEngineInterface& engine);
+
+    ~Registry();
+
+    /// Retrieves a key from the registry, if it's registered, it will give the registered value, if
+    /// not it will obtain it from maxwell3d and register it.
+    std::optional<u32> ObtainKey(u32 buffer, u32 offset);
+
+    std::optional<Tegra::Engines::SamplerDescriptor> ObtainBoundSampler(u32 offset);
+
+    std::optional<Tegra::Engines::SamplerDescriptor> ObtainBindlessSampler(u32 buffer, u32 offset);
+
+    /// Inserts a key.
+    void InsertKey(u32 buffer, u32 offset, u32 value);
+
+    /// Inserts a bound sampler key.
+    void InsertBoundSampler(u32 offset, Tegra::Engines::SamplerDescriptor sampler);
+
+    /// Inserts a bindless sampler key.
+    void InsertBindlessSampler(u32 buffer, u32 offset, Tegra::Engines::SamplerDescriptor sampler);
+
+    /// Checks keys and samplers against engine's current const buffers.
+    /// Returns true if they are the same value, false otherwise.
+    bool IsConsistent() const;
+
+    /// Returns true if the keys are equal to the other ones in the registry.
+    bool HasEqualKeys(const Registry& rhs) const;
+
+    /// Returns graphics information from this shader
+    const GraphicsInfo& GetGraphicsInfo() const;
+
+    /// Returns compute information from this shader
+    const ComputeInfo& GetComputeInfo() const;
+
+    /// Gives an getter to the const buffer keys in the database.
+    const KeyMap& GetKeys() const {
+        return keys;
+    }
+
+    /// Gets samplers database.
+    const BoundSamplerMap& GetBoundSamplers() const {
+        return bound_samplers;
+    }
+
+    /// Gets bindless samplers database.
+    const BindlessSamplerMap& GetBindlessSamplers() const {
+        return bindless_samplers;
+    }
+
+    /// Gets bound buffer used on this shader
+    u32 GetBoundBuffer() const {
+        return bound_buffer;
+    }
+
+    /// Obtains access to the guest driver's profile.
+    VideoCore::GuestDriverProfile& AccessGuestDriverProfile() {
+        return engine ? engine->AccessGuestDriverProfile() : stored_guest_driver_profile;
+    }
+
+private:
+    const Tegra::Engines::ShaderType stage;
+    VideoCore::GuestDriverProfile stored_guest_driver_profile;
+    Tegra::Engines::ConstBufferEngineInterface* engine = nullptr;
+    KeyMap keys;
+    BoundSamplerMap bound_samplers;
+    BindlessSamplerMap bindless_samplers;
+    u32 bound_buffer;
+    GraphicsInfo graphics_info;
+    ComputeInfo compute_info;
+};
+
+} // namespace VideoCommon::Shader
--- a/src/video_core/shader/shader_ir.cpp
+++ b/src/video_core/shader/shader_ir.cpp
@@ -11,6 +11,7 @@
 #include "common/logging/log.h"
 #include "video_core/engines/shader_bytecode.h"
 #include "video_core/shader/node_helper.h"
+#include "video_core/shader/registry.h"
 #include "video_core/shader/shader_ir.h"

 namespace VideoCommon::Shader {
@@ -24,8 +25,8 @@ using Tegra::Shader::PredOperation;
 using Tegra::Shader::Register;

 ShaderIR::ShaderIR(const ProgramCode& program_code, u32 main_offset, CompilerSettings settings,
-                   ConstBufferLocker& locker)
-    : program_code{program_code}, main_offset{main_offset}, settings{settings}, locker{locker} {
+                   Registry& registry)
+    : program_code{program_code}, main_offset{main_offset}, settings{settings}, registry{registry} {
    Decode();
    PostDecode();
 }
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -18,8 +18,8 @@
 #include "video_core/engines/shader_header.h"
 #include "video_core/shader/ast.h"
 #include "video_core/shader/compiler_settings.h"
-#include "video_core/shader/const_buffer_locker.h"
 #include "video_core/shader/node.h"
+#include "video_core/shader/registry.h"

 namespace VideoCommon::Shader {

@@ -69,7 +69,7 @@ struct GlobalMemoryUsage {
 class ShaderIR final {
 public:
    explicit ShaderIR(const ProgramCode& program_code, u32 main_offset, CompilerSettings settings,
-                      ConstBufferLocker& locker);
+                      Registry& registry);
    ~ShaderIR();

    const std::map<u32, NodeBlock>& GetBasicBlocks() const {
@@ -414,7 +414,7 @@ private:
    const ProgramCode& program_code;
    const u32 main_offset;
    const CompilerSettings settings;
-    ConstBufferLocker& locker;
+    Registry& registry;

    bool decompiled{};
    bool disable_flow_stack{};
--- a/src/video_core/shader/track.cpp
+++ b/src/video_core/shader/track.cpp
@@ -81,26 +81,20 @@ std::tuple<Node, TrackSampler> ShaderIR::TrackBindlessSampler(Node tracked, cons
                MakeTrackSampler<BindlessSamplerNode>(cbuf->GetIndex(), immediate->GetValue());
            return {tracked, track};
        } else if (const auto operation = std::get_if<OperationNode>(&*offset)) {
-            auto bound_buffer = locker.ObtainBoundBuffer();
-            if (!bound_buffer) {
+            const u32 bound_buffer = registry.GetBoundBuffer();
+            if (bound_buffer != cbuf->GetIndex()) {
                return {};
            }
-            if (*bound_buffer != cbuf->GetIndex()) {
-                return {};
-            }
-            auto pair = DecoupleIndirectRead(*operation);
+            const auto pair = DecoupleIndirectRead(*operation);
            if (!pair) {
                return {};
            }
            auto [gpr, base_offset] = *pair;
            const auto offset_inm = std::get_if<ImmediateNode>(&*base_offset);
-            auto gpu_driver = locker.AccessGuestDriverProfile();
-            if (gpu_driver == nullptr) {
-                return {};
-            }
+            const auto& gpu_driver = registry.AccessGuestDriverProfile();
            const u32 bindless_cv = NewCustomVariable();
-            const Node op = Operation(OperationCode::UDiv, NO_PRECISE, gpr,
-                                      Immediate(gpu_driver->GetTextureHandlerSize()));
+            const Node op =
+                Operation(OperationCode::UDiv, gpr, Immediate(gpu_driver.GetTextureHandlerSize()));

            const Node cv_node = GetCustomVariable(bindless_cv);
            Node amend_op = Operation(OperationCode::Assign, cv_node, std::move(op));
--- a/src/video_core/shader/transform_feedback.cpp
+++ b/src/video_core/shader/transform_feedback.cpp
@@ -0,0 +1,115 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <array>
+#include <unordered_map>
+
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/shader/registry.h"
+#include "video_core/shader/transform_feedback.h"
+
+namespace VideoCommon::Shader {
+
+namespace {
+
+using Maxwell = Tegra::Engines::Maxwell3D::Regs;
+
+// TODO(Rodrigo): Change this to constexpr std::unordered_set in C++20
+
+/// Attribute offsets that describe a vector
+constexpr std::array VECTORS = {
+    28,  // gl_Position
+    32,  // Generic 0
+    36,  // Generic 1
+    40,  // Generic 2
+    44,  // Generic 3
+    48,  // Generic 4
+    52,  // Generic 5
+    56,  // Generic 6
+    60,  // Generic 7
+    64,  // Generic 8
+    68,  // Generic 9
+    72,  // Generic 10
+    76,  // Generic 11
+    80,  // Generic 12
+    84,  // Generic 13
+    88,  // Generic 14
+    92,  // Generic 15
+    96,  // Generic 16
+    100, // Generic 17
+    104, // Generic 18
+    108, // Generic 19
+    112, // Generic 20
+    116, // Generic 21
+    120, // Generic 22
+    124, // Generic 23
+    128, // Generic 24
+    132, // Generic 25
+    136, // Generic 26
+    140, // Generic 27
+    144, // Generic 28
+    148, // Generic 29
+    152, // Generic 30
+    156, // Generic 31
+    160, // gl_FrontColor
+    164, // gl_FrontSecondaryColor
+    160, // gl_BackColor
+    164, // gl_BackSecondaryColor
+    192, // gl_TexCoord[0]
+    196, // gl_TexCoord[1]
+    200, // gl_TexCoord[2]
+    204, // gl_TexCoord[3]
+    208, // gl_TexCoord[4]
+    212, // gl_TexCoord[5]
+    216, // gl_TexCoord[6]
+    220, // gl_TexCoord[7]
+};
+} // namespace
+
+std::unordered_map<u8, VaryingTFB> BuildTransformFeedback(const GraphicsInfo& info) {
+
+    std::unordered_map<u8, VaryingTFB> tfb;
+
+    for (std::size_t buffer = 0; buffer < Maxwell::NumTransformFeedbackBuffers; ++buffer) {
+        const auto& locations = info.tfb_varying_locs[buffer];
+        const auto& layout = info.tfb_layouts[buffer];
+        const std::size_t varying_count = layout.varying_count;
+
+        std::size_t highest = 0;
+
+        for (std::size_t offset = 0; offset < varying_count; ++offset) {
+            const std::size_t base_offset = offset;
+            const u8 location = locations[offset];
+
+            VaryingTFB varying;
+            varying.buffer = layout.stream;
+            varying.stride = layout.stride;
+            varying.offset = offset * sizeof(u32);
+            varying.components = 1;
+
+            if (std::find(VECTORS.begin(), VECTORS.end(), location / 4 * 4) != VECTORS.end()) {
+                UNIMPLEMENTED_IF_MSG(location % 4 != 0, "Unaligned TFB");
+
+                const u8 base_index = location / 4;
+                while (offset + 1 < varying_count && base_index == locations[offset + 1] / 4) {
+                    ++offset;
+                    ++varying.components;
+                }
+            }
+
+            [[maybe_unused]] const bool inserted = tfb.emplace(location, varying).second;
+            UNIMPLEMENTED_IF_MSG(!inserted, "Varying already stored");
+
+            highest = std::max(highest, (base_offset + varying.components) * sizeof(u32));
+        }
+
+        UNIMPLEMENTED_IF(highest != layout.stride);
+    }
+    return tfb;
+}
+
+} // namespace VideoCommon::Shader
--- a/src/video_core/shader/transform_feedback.h
+++ b/src/video_core/shader/transform_feedback.h
@@ -0,0 +1,23 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <unordered_map>
+
+#include "common/common_types.h"
+#include "video_core/shader/registry.h"
+
+namespace VideoCommon::Shader {
+
+struct VaryingTFB {
+    std::size_t buffer;
+    std::size_t stride;
+    std::size_t offset;
+    std::size_t components;
+};
+
+std::unordered_map<u8, VaryingTFB> BuildTransformFeedback(const GraphicsInfo& info);
+
+} // namespace VideoCommon::Shader
--- a/src/video_core/surface.cpp
+++ b/src/video_core/surface.cpp
@@ -111,6 +111,8 @@ PixelFormat PixelFormatFromRenderTargetFormat(Tegra::RenderTargetFormat format)
        return PixelFormat::RGBA16F;
    case Tegra::RenderTargetFormat::RGBA16_UNORM:
        return PixelFormat::RGBA16U;
+    case Tegra::RenderTargetFormat::RGBA16_SNORM:
+        return PixelFormat::RGBA16S;
    case Tegra::RenderTargetFormat::RGBA16_UINT:
        return PixelFormat::RGBA16UI;
    case Tegra::RenderTargetFormat::RGBA32_FLOAT:
--- a/src/video_core/surface.h
+++ b/src/video_core/surface.h
@@ -25,82 +25,83 @@ enum class PixelFormat {
    R8UI = 7,
    RGBA16F = 8,
    RGBA16U = 9,
-    RGBA16UI = 10,
-    R11FG11FB10F = 11,
-    RGBA32UI = 12,
-    DXT1 = 13,
-    DXT23 = 14,
-    DXT45 = 15,
-    DXN1 = 16, // This is also known as BC4
-    DXN2UNORM = 17,
-    DXN2SNORM = 18,
-    BC7U = 19,
-    BC6H_UF16 = 20,
-    BC6H_SF16 = 21,
-    ASTC_2D_4X4 = 22,
-    BGRA8 = 23,
-    RGBA32F = 24,
-    RG32F = 25,
-    R32F = 26,
-    R16F = 27,
-    R16U = 28,
-    R16S = 29,
-    R16UI = 30,
-    R16I = 31,
-    RG16 = 32,
-    RG16F = 33,
-    RG16UI = 34,
-    RG16I = 35,
-    RG16S = 36,
-    RGB32F = 37,
-    RGBA8_SRGB = 38,
-    RG8U = 39,
-    RG8S = 40,
-    RG32UI = 41,
-    RGBX16F = 42,
-    R32UI = 43,
-    R32I = 44,
-    ASTC_2D_8X8 = 45,
-    ASTC_2D_8X5 = 46,
-    ASTC_2D_5X4 = 47,
-    BGRA8_SRGB = 48,
-    DXT1_SRGB = 49,
-    DXT23_SRGB = 50,
-    DXT45_SRGB = 51,
-    BC7U_SRGB = 52,
-    R4G4B4A4U = 53,
-    ASTC_2D_4X4_SRGB = 54,
-    ASTC_2D_8X8_SRGB = 55,
-    ASTC_2D_8X5_SRGB = 56,
-    ASTC_2D_5X4_SRGB = 57,
-    ASTC_2D_5X5 = 58,
-    ASTC_2D_5X5_SRGB = 59,
-    ASTC_2D_10X8 = 60,
-    ASTC_2D_10X8_SRGB = 61,
-    ASTC_2D_6X6 = 62,
-    ASTC_2D_6X6_SRGB = 63,
-    ASTC_2D_10X10 = 64,
-    ASTC_2D_10X10_SRGB = 65,
-    ASTC_2D_12X12 = 66,
-    ASTC_2D_12X12_SRGB = 67,
-    ASTC_2D_8X6 = 68,
-    ASTC_2D_8X6_SRGB = 69,
-    ASTC_2D_6X5 = 70,
-    ASTC_2D_6X5_SRGB = 71,
-    E5B9G9R9F = 72,
+    RGBA16S = 10,
+    RGBA16UI = 11,
+    R11FG11FB10F = 12,
+    RGBA32UI = 13,
+    DXT1 = 14,
+    DXT23 = 15,
+    DXT45 = 16,
+    DXN1 = 17, // This is also known as BC4
+    DXN2UNORM = 18,
+    DXN2SNORM = 19,
+    BC7U = 20,
+    BC6H_UF16 = 21,
+    BC6H_SF16 = 22,
+    ASTC_2D_4X4 = 23,
+    BGRA8 = 24,
+    RGBA32F = 25,
+    RG32F = 26,
+    R32F = 27,
+    R16F = 28,
+    R16U = 29,
+    R16S = 30,
+    R16UI = 31,
+    R16I = 32,
+    RG16 = 33,
+    RG16F = 34,
+    RG16UI = 35,
+    RG16I = 36,
+    RG16S = 37,
+    RGB32F = 38,
+    RGBA8_SRGB = 39,
+    RG8U = 40,
+    RG8S = 41,
+    RG32UI = 42,
+    RGBX16F = 43,
+    R32UI = 44,
+    R32I = 45,
+    ASTC_2D_8X8 = 46,
+    ASTC_2D_8X5 = 47,
+    ASTC_2D_5X4 = 48,
+    BGRA8_SRGB = 49,
+    DXT1_SRGB = 50,
+    DXT23_SRGB = 51,
+    DXT45_SRGB = 52,
+    BC7U_SRGB = 53,
+    R4G4B4A4U = 54,
+    ASTC_2D_4X4_SRGB = 55,
+    ASTC_2D_8X8_SRGB = 56,
+    ASTC_2D_8X5_SRGB = 57,
+    ASTC_2D_5X4_SRGB = 58,
+    ASTC_2D_5X5 = 59,
+    ASTC_2D_5X5_SRGB = 60,
+    ASTC_2D_10X8 = 61,
+    ASTC_2D_10X8_SRGB = 62,
+    ASTC_2D_6X6 = 63,
+    ASTC_2D_6X6_SRGB = 64,
+    ASTC_2D_10X10 = 65,
+    ASTC_2D_10X10_SRGB = 66,
+    ASTC_2D_12X12 = 67,
+    ASTC_2D_12X12_SRGB = 68,
+    ASTC_2D_8X6 = 69,
+    ASTC_2D_8X6_SRGB = 70,
+    ASTC_2D_6X5 = 71,
+    ASTC_2D_6X5_SRGB = 72,
+    E5B9G9R9F = 73,

    MaxColorFormat,

    // Depth formats
-    Z32F = 73,
-    Z16 = 74,
+    Z32F = 74,
+    Z16 = 75,

    MaxDepthFormat,

    // DepthStencil formats
-    Z24S8 = 75,
-    S8Z24 = 76,
-    Z32FS8 = 77,
+    Z24S8 = 76,
+    S8Z24 = 77,
+    Z32FS8 = 78,

    MaxDepthStencilFormat,

@@ -138,6 +139,7 @@ constexpr std::array<u32, MaxPixelFormat> compression_factor_shift_table = {{
    0, // R8UI
    0, // RGBA16F
    0, // RGBA16U
+    0, // RGBA16S
    0, // RGBA16UI
    0, // R11FG11FB10F
    0, // RGBA32UI
@@ -235,6 +237,7 @@ constexpr std::array<u32, MaxPixelFormat> block_width_table = {{
    1,  // R8UI
    1,  // RGBA16F
    1,  // RGBA16U
+    1,  // RGBA16S
    1,  // RGBA16UI
    1,  // R11FG11FB10F
    1,  // RGBA32UI
@@ -324,6 +327,7 @@ constexpr std::array<u32, MaxPixelFormat> block_height_table = {{
    1,  // R8UI
    1,  // RGBA16F
    1,  // RGBA16U
+    1,  // RGBA16S
    1,  // RGBA16UI
    1,  // R11FG11FB10F
    1,  // RGBA32UI
@@ -413,6 +417,7 @@ constexpr std::array<u32, MaxPixelFormat> bpp_table = {{
    8,   // R8UI
    64,  // RGBA16F
    64,  // RGBA16U
+    64,  // RGBA16S
    64,  // RGBA16UI
    32,  // R11FG11FB10F
    128, // RGBA32UI
@@ -517,6 +522,7 @@ constexpr std::array<SurfaceCompression, MaxPixelFormat> compression_type_table
    SurfaceCompression::None,       // R8UI
    SurfaceCompression::None,       // RGBA16F
    SurfaceCompression::None,       // RGBA16U
+    SurfaceCompression::None,       // RGBA16S
    SurfaceCompression::None,       // RGBA16UI
    SurfaceCompression::None,       // R11FG11FB10F
    SurfaceCompression::None,       // RGBA32UI
--- a/src/video_core/texture_cache/format_lookup_table.cpp
+++ b/src/video_core/texture_cache/format_lookup_table.cpp
@@ -41,7 +41,7 @@ struct Table {
    ComponentType alpha_component;
    bool is_srgb;
 };
-constexpr std::array<Table, 75> DefinitionTable = {{
+constexpr std::array<Table, 76> DefinitionTable = {{
    {TextureFormat::A8R8G8B8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ABGR8U},
    {TextureFormat::A8R8G8B8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::ABGR8S},
    {TextureFormat::A8R8G8B8, C, UINT, UINT, UINT, UINT, PixelFormat::ABGR8UI},
@@ -61,6 +61,7 @@ constexpr std::array<Table, 75> DefinitionTable = {{
    {TextureFormat::G8R8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::RG8U},
    {TextureFormat::G8R8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::RG8S},

+    {TextureFormat::R16_G16_B16_A16, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::RGBA16S},
    {TextureFormat::R16_G16_B16_A16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::RGBA16U},
    {TextureFormat::R16_G16_B16_A16, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::RGBA16F},
    {TextureFormat::R16_G16_B16_A16, C, UINT, UINT, UINT, UINT, PixelFormat::RGBA16UI},
--- a/src/video_core/texture_cache/surface_params.cpp
+++ b/src/video_core/texture_cache/surface_params.cpp
@@ -113,8 +113,10 @@ SurfaceParams SurfaceParams::CreateForTexture(const FormatLookupTable& lookup_ta
        params.height = tic.Height();
        params.depth = tic.Depth();
        params.pitch = params.is_tiled ? 0 : tic.Pitch();
-        if (params.target == SurfaceTarget::TextureCubemap ||
-            params.target == SurfaceTarget::TextureCubeArray) {
+        if (params.target == SurfaceTarget::Texture2D && params.depth > 1) {
+            params.depth = 1;
+        } else if (params.target == SurfaceTarget::TextureCubemap ||
+                   params.target == SurfaceTarget::TextureCubeArray) {
            params.depth *= 6;
        }
        params.num_levels = tic.max_mip_level + 1;
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -104,6 +104,11 @@ public:
        if (!cache_addr) {
            return GetNullSurface(SurfaceParams::ExpectedTarget(entry));
        }
+
+        if (!IsTypeCompatible(tic.texture_type, entry)) {
+            return GetNullSurface(SurfaceParams::ExpectedTarget(entry));
+        }
+
        const auto params{SurfaceParams::CreateForTexture(format_lookup_table, tic, entry)};
        const auto [surface, view] = GetSurface(gpu_addr, cache_addr, params, true, false);
        if (guard_samplers) {
@@ -914,13 +919,15 @@ private:
        params.width = 1;
        params.height = 1;
        params.depth = 1;
+        if (target == SurfaceTarget::TextureCubemap || target == SurfaceTarget::TextureCubeArray) {
+            params.depth = 6;
+        }
        params.pitch = 4;
        params.num_levels = 1;
        params.emulated_levels = 1;
-        params.pixel_format = VideoCore::Surface::PixelFormat::RGBA16F;
+        params.pixel_format = VideoCore::Surface::PixelFormat::R8U;
        params.type = VideoCore::Surface::SurfaceType::ColorTexture;
        auto surface = CreateSurface(0ULL, params);
-        invalid_memory.clear();
        invalid_memory.resize(surface->GetHostSizeInBytes(), 0U);
        surface->UploadTexture(invalid_memory);
        surface->MarkAsModified(false, Tick());
@@ -1082,6 +1089,36 @@ private:
        return siblings_table[static_cast<std::size_t>(format)];
    }

+    /// Returns true the shader sampler entry is compatible with the TIC texture type.
+    static bool IsTypeCompatible(Tegra::Texture::TextureType tic_type,
+                                 const VideoCommon::Shader::Sampler& entry) {
+        const auto shader_type = entry.GetType();
+        switch (tic_type) {
+        case Tegra::Texture::TextureType::Texture1D:
+        case Tegra::Texture::TextureType::Texture1DArray:
+            return shader_type == Tegra::Shader::TextureType::Texture1D;
+        case Tegra::Texture::TextureType::Texture1DBuffer:
+            // TODO(Rodrigo): Assume as valid for now
+            return true;
+        case Tegra::Texture::TextureType::Texture2D:
+        case Tegra::Texture::TextureType::Texture2DNoMipmap:
+            return shader_type == Tegra::Shader::TextureType::Texture2D;
+        case Tegra::Texture::TextureType::Texture2DArray:
+            return shader_type == Tegra::Shader::TextureType::Texture2D ||
+                   shader_type == Tegra::Shader::TextureType::TextureCube;
+        case Tegra::Texture::TextureType::Texture3D:
+            return shader_type == Tegra::Shader::TextureType::Texture3D;
+        case Tegra::Texture::TextureType::TextureCubeArray:
+        case Tegra::Texture::TextureType::TextureCubemap:
+            if (shader_type == Tegra::Shader::TextureType::TextureCube) {
+                return true;
+            }
+            return shader_type == Tegra::Shader::TextureType::Texture2D && entry.IsArray();
+        }
+        UNREACHABLE();
+        return true;
+    }
+
    struct FramebufferTargetInfo {
        TSurface target;
        TView view;
--- a/src/yuzu/loading_screen.cpp
+++ b/src/yuzu/loading_screen.cpp
@@ -34,18 +34,6 @@ constexpr char PROGRESSBAR_STYLE_PREPARE[] = R"(
 QProgressBar {}
 QProgressBar::chunk {})";

-constexpr char PROGRESSBAR_STYLE_DECOMPILE[] = R"(
-QProgressBar {
-  background-color: black;
-  border: 2px solid white;
-  border-radius: 4px;
-  padding: 2px;
-}
-QProgressBar::chunk {
-  background-color: #0ab9e6;
-  width: 1px;
-})";
-
 constexpr char PROGRESSBAR_STYLE_BUILD[] = R"(
 QProgressBar {
  background-color: black;
@@ -100,13 +88,11 @@ LoadingScreen::LoadingScreen(QWidget* parent)

    stage_translations = {
        {VideoCore::LoadCallbackStage::Prepare, tr("Loading...")},
-        {VideoCore::LoadCallbackStage::Decompile, tr("Preparing Shaders %1 / %2")},
        {VideoCore::LoadCallbackStage::Build, tr("Loading Shaders %1 / %2")},
        {VideoCore::LoadCallbackStage::Complete, tr("Launching...")},
    };
    progressbar_style = {
        {VideoCore::LoadCallbackStage::Prepare, PROGRESSBAR_STYLE_PREPARE},
-        {VideoCore::LoadCallbackStage::Decompile, PROGRESSBAR_STYLE_DECOMPILE},
        {VideoCore::LoadCallbackStage::Build, PROGRESSBAR_STYLE_BUILD},
        {VideoCore::LoadCallbackStage::Complete, PROGRESSBAR_STYLE_COMPLETE},
    };
@@ -192,8 +178,7 @@ void LoadingScreen::OnLoadProgress(VideoCore::LoadCallbackStage stage, std::size
    }

    // update labels and progress bar
-    if (stage == VideoCore::LoadCallbackStage::Decompile ||
-        stage == VideoCore::LoadCallbackStage::Build) {
+    if (stage == VideoCore::LoadCallbackStage::Build) {
        ui->stage->setText(stage_translations[stage].arg(value).arg(total));
    } else {
        ui->stage->setText(stage_translations[stage]);
Author	SHA1	Message	Date
Skyth	5f08e51727	Add credits for MME shadow RAM	2020-03-18 23:06:08 +03:00
Skyth	72c17f8de8	Move ShadowRamControl::Replay handler to MacroInterpreter::Send	2020-03-18 21:14:33 +03:00
Skyth	c8aa2fe56b	maxwell_3d: Implement MME shadow RAM Implementation was based on this Ryujinx PR: https://github.com/Ryujinx/Ryujinx/pull/987	2020-03-17 23:30:04 +03:00
bunnei	1c45c8086e	Merge pull request #3498 from ReinUsesLisp/texel-fetch-glsl gl_shader_decompiler: Add layer component to texelFetch	2020-03-17 10:53:38 -04:00
bunnei	e8ded20d24	Merge pull request #3521 from ReinUsesLisp/nsight-debug renderer_opengl: Detect Nvidia Nsight as a debugging tool	2020-03-16 22:52:42 -04:00
ReinUsesLisp	53d673a7d3	renderer_opengl: Move some logic to an anonymous namespace	2020-03-16 04:03:34 -03:00
ReinUsesLisp	311d2fc768	renderer_opengl: Detect Nvidia Nsight as a debugging tool Use getenv to detect Nsight.	2020-03-16 03:59:08 -03:00
Rodrigo Locatti	b16c8e0e8d	Merge pull request #3515 from ReinUsesLisp/vertex-vk-assert vk_rasterizer: Fix vertex range assert	2020-03-15 21:26:54 -03:00
Rodrigo Locatti	7cc46a6faa	Merge pull request #3501 from ReinUsesLisp/rgba16-snorm video_core: Implement RGBA16_SNORM	2020-03-15 21:24:53 -03:00
Rodrigo Locatti	ddafc99776	Merge pull request #3502 from namkazt/patch-3 shader_decode: Reimplement BFE instructions	2020-03-15 21:23:04 -03:00
Rodrigo Locatti	d64edf21bb	Merge pull request #3503 from makigumo/patch-2 maxwell_to_vk: add vertex format eA2B10G10R10UnormPack32	2020-03-15 21:21:38 -03:00
Rodrigo Locatti	86b1f15d9a	Merge pull request #3512 from bunnei/fix-renderdoc renderer_opengl: Keep frames synchronized when using a GPU debugger.	2020-03-15 19:28:43 -03:00
Rodrigo Locatti	d91a880f11	Merge pull request #3516 from makigumo/patch-3 vk_shader_decompiler: fix linux build	2020-03-15 18:43:40 -03:00
makigumo	f91046bf8d	vk_shader_decompiler: fix linux build	2020-03-15 18:00:14 +01:00
ReinUsesLisp	a7131af7d6	vk_rasterizer: Fix vertex range assert End can be equal to start in CalculateVertexArraysSize. This is quite common when the vertex size is zero.	2020-03-15 04:04:17 -03:00
bunnei	c5afe93dcc	renderer_opengl: Keep presentation frames in lock-step when GPU debugging. - Fixes renderdoc with OpenGL renderer.	2020-03-14 17:45:01 -04:00
bunnei	4373fa8042	gl_device: Add option to check GL_EXT_debug_tool.	2020-03-14 17:39:29 -04:00
bunnei	4dfd5c84ea	Merge pull request #3508 from FernandoS27/page-table PageTable: move backing addresses to a children class as the CPU page table does not need them.	2020-03-14 16:50:27 -04:00
Fernando Sahmkow	c51dbf8038	Merge pull request #3500 from ReinUsesLisp/incompatible-types texture_cache: Report incompatible textures as black	2020-03-14 09:49:05 -04:00
Fernando Sahmkow	41905ee467	Merge pull request #3499 from ReinUsesLisp/depth-2d-array texture_cache/surface_params: Force depth=1 on 2D textures	2020-03-14 09:48:39 -04:00
Fernando Sahmkow	35145bd529	Merge pull request #3490 from ReinUsesLisp/transform-feedbacks video_core: Initial implementation of transform feedbacks	2020-03-14 09:48:15 -04:00
Fernando Sahmkow	27cbb75e7c	PageTable: move backing addresses to a children class as the CPU page table does not need them. This PR aims to reduce the memory usage in the CPU page table by moving GPU specific parameters into a child class. This saves 1Gb of Memory for most games.	2020-03-14 09:43:57 -04:00
Nguyen Dac Nam	3287b1247d	clang-format	2020-03-14 10:07:40 +07:00
Nguyen Dac Nam	240d45830d	nit	2020-03-14 09:57:24 +07:00
ReinUsesLisp	69c7a01f88	vk/gl_shader_decompiler: Silence assertion on compute	2020-03-13 18:33:05 -03:00
ReinUsesLisp	62560f1e63	vk_shader_decompiler: Fix default varying regression	2020-03-13 18:33:05 -03:00
ReinUsesLisp	afebdda203	maxwell_3d: Add padding words to XFB entries Use INSERT_UNION_PADDING_WORDS instead of alignas to ensure a size requirement.	2020-03-13 18:33:05 -03:00
ReinUsesLisp	4bc4851d45	gl_shader_decompiler: Fix implicit conversion errors	2020-03-13 18:33:05 -03:00
Rodrigo Locatti	47459f6a36	vk_shader_decompiler: Fix implicit type conversion Co-Authored-By: Mat M. <mathew1800@gmail.com>	2020-03-13 18:33:05 -03:00
ReinUsesLisp	2fae1e6205	vk_rasterizer: Implement transform feedback binding zero	2020-03-13 18:33:05 -03:00
ReinUsesLisp	b67360c0f8	vk_shader_decompiler: Add XFB decorations to generic varyings	2020-03-13 18:33:05 -03:00
ReinUsesLisp	8d5bdcb17b	vk_device: Enable VK_EXT_transform_feedback when available	2020-03-13 18:33:05 -03:00
ReinUsesLisp	c320702092	vk_device: Shrink formatless capability name size	2020-03-13 18:33:05 -03:00
ReinUsesLisp	ae6189d7c2	shader/transform_feedback: Expose buffer stride	2020-03-13 18:33:05 -03:00
ReinUsesLisp	7acebd7eb6	vk_shader_decompiler: Use registry for specialization	2020-03-13 18:33:05 -03:00
ReinUsesLisp	8e9f23f393	gl_rasterizer: Implement transform feedback bindings	2020-03-13 18:33:04 -03:00
ReinUsesLisp	4d711dface	gl_shader_decompiler: Decorate output attributes with XFB layout We sometimes have to slice attributes in different parts. This is needed for example in instances where the game feedbacks 3 components but writes 4 from the shader (something that is possible with GL_NV_transform_feedback).	2020-03-13 18:33:04 -03:00
ReinUsesLisp	3dcaa84ba4	shader/transform_feedback: Add host API friendly TFB builder	2020-03-13 18:33:04 -03:00
Fernando Sahmkow	666d431ad8	Merge pull request #3473 from ReinUsesLisp/shader-purge gl_shader_cache: Rework shader cache and store texture arrays	2020-03-13 16:26:24 -04:00
Rodrigo Locatti	244fe13219	Merge branch 'master' into shader-purge	2020-03-13 16:44:06 -03:00
bunnei	b30b1f741d	Merge pull request #3491 from ReinUsesLisp/polygon-modes gl_rasterizer: Implement polygon modes and fill rectangles	2020-03-13 10:08:57 -04:00
makigumo	753bc2026f	fix formatting	2020-03-13 11:37:24 +01:00
makigumo	54681909be	maxwell_to_vk: add vertex format eA2B10G10R10UnormPack32	2020-03-13 11:26:13 +01:00
Nguyen Dac Nam	00607fe1e0	clang-format	2020-03-13 15:38:57 +07:00
Nguyen Dac Nam	325977c0c6	Apply suggestions from code review Co-Authored-By: Mat M. <mathew1800@gmail.com>	2020-03-13 15:35:15 +07:00
Nguyen Dac Nam	70ff82f72d	shader_decode: BFE add ref of reverse parallel method.	2020-03-13 14:20:18 +07:00
Nguyen Dac Nam	96a4abe12d	shader_decode: implement BREV on BFE Implement reverse parallel follow: https://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel	2020-03-13 14:13:31 +07:00
Nguyen Dac Nam	93547cac68	shader_bytecode: update BFE instructions struct.	2020-03-13 12:52:16 +07:00
Nguyen Dac Nam	911c56ccef	node_helper: add IBitfieldExtract case	2020-03-13 12:50:32 +07:00
Nguyen Dac Nam	465ba30d08	shader_decode: Reimplement BFE instructions	2020-03-13 12:48:01 +07:00
ReinUsesLisp	e24197bb3f	gl_shader_decompiler: Initialize gl_Position on vertex shaders	2020-03-12 23:31:06 -03:00
Fernando Sahmkow	00e9ba0603	Merge pull request #3483 from namkazt/patch-1 vk_rasterizer: fix mistype on SetupGraphicsImages	2020-03-12 22:10:48 -04:00
Fernando Sahmkow	f159a12820	Merge pull request #3480 from ReinUsesLisp/vk-disabled-ubo vk_rasterizer: Support disabled uniform buffers	2020-03-12 22:09:49 -04:00
ReinUsesLisp	3a10016e38	gl_shader_decompiler: Add missing {} on smem GLSL emission	2020-03-12 21:50:37 -03:00
ReinUsesLisp	4dcca90ef4	video_core: Implement RGBA16_SNORM Implement RGBA16_SNORM with the current API. Nothing special here.	2020-03-12 21:42:33 -03:00
ReinUsesLisp	e22816a5bb	texture_cache: Report incompatible textures as black Some games bind incompatible texture types to certain types. For example Astral Chain binds a 2D texture with 1 layer (non-array) to a cubemap slot (that's how it's used in the shader). After testing this in hardware, the expected "undefined behavior" is to report all pixels as black. We already have a path for reporting black textures in the texture cache. When textures types are incompatible, this commit binds these kind of textures. This is done on the API agnostic texture cache so no extra code has to be inserted on OpenGL or Vulkan. As a side effect, this fixes invalidations of ASTC textures on Astral Chain. This happened because yuzu detected a cube texture and forced 6 faces, generating a texture larger than what the TIC reported.	2020-03-12 18:22:05 -03:00
ReinUsesLisp	daae6a323b	texture_cache/surface_params: Force depth=1 on 2D textures Sometimes games will sample a 2D array TIC with a 2D access in the shader. This causes bad interactions with the rest of the texture cache. To emulate what the game wants to do, force a depth=1 on 2D textures (not 2D arrays) and let the texture cache handle the rest.	2020-03-12 18:11:42 -03:00
ReinUsesLisp	38fe070d78	gl_shader_decompiler: Add layer component to texelFetch TexelFetch was not emitting the array component generating invalid GLSL.	2020-03-12 18:10:29 -03:00
bunnei	ca2d228c9d	Merge pull request #3497 from FernandoS27/microprogfile-extend Small corrections and features to microprofile	2020-03-12 12:14:03 -04:00
bunnei	c21dc36cda	Merge pull request #3496 from vitor-k/remove-enum framebuffer_layout.h: drop the use of enum for screen dimensions	2020-03-12 12:00:39 -04:00
ReinUsesLisp	825d629565	gl_shader_decompiler: Fix regression in render target declarations A previous commit introduced a way to declare as few render targets as possible. Turns out this introduced a regression in some games.	2020-03-12 05:01:20 -03:00
Vitor Kiguchi	e891ff9a0c	framebuffer_layout.h: drop the use of enum for screen dimensions. +clang format	2020-03-11 14:22:28 -03:00
ReinUsesLisp	e4bc3c3342	gl_rasterizer: Implement polygon modes and fill rectangles	2020-03-09 20:39:58 -03:00
ReinUsesLisp	eb5861e0a2	engines/maxwell_3d: Add TFB registers and store them in shader registry	2020-03-09 18:40:53 -03:00
ReinUsesLisp	b1acb4f73f	shader/registry: Address feedback	2020-03-09 18:40:53 -03:00
ReinUsesLisp	b1061afed9	gl_shader_decompiler: Add identifier to decompiled code	2020-03-09 18:40:53 -03:00
ReinUsesLisp	e612242977	gl_shader_decompiler: Roll back to GLSL core 430 RenderDoc won't build shaders if we use GLSL compatibility.	2020-03-09 18:40:53 -03:00
ReinUsesLisp	978172530e	const_buffer_engine_interface: Store component types This is required for Vulkan. Sampling integer textures with float handles is illegal.	2020-03-09 18:40:53 -03:00
ReinUsesLisp	120f688272	yuzu/loading_screen: Remove unused shader progress mode	2020-03-09 18:40:53 -03:00
ReinUsesLisp	e1932351a9	gl_shader_cache: Reduce registry consistency to debug assert Registry consistency is something that practically can't happen and it has a measurable runtime cost. Reduce it to a DEBUG_ASSERT.	2020-03-09 18:40:07 -03:00
ReinUsesLisp	66a8a3e887	shader/registry: Cache tessellation state	2020-03-09 18:40:07 -03:00
ReinUsesLisp	0528be5c92	shader/registry: Store graphics and compute metadata Store information GLSL forces us to provide but it's dynamic state in hardware (workgroup sizes, primitive topology, shared memory size).	2020-03-09 18:40:07 -03:00
ReinUsesLisp	e8efd5a901	video_core: Rename "const buffer locker" to "registry"	2020-03-09 18:40:06 -03:00
ReinUsesLisp	bd8b9bbcee	gl_shader_cache: Rework shader cache and remove post-specializations Instead of pre-specializing shaders and then post-specializing them, drop the later and only "specialize" the shader while decoding it.	2020-03-09 18:40:06 -03:00
Nguyen Dac Nam	16cfbb068c	vk_reasterizer: fix mistype on SetupGraphicsImages This should use Maxwell3D engine. Fixed some GPU error on Kirby and maybe other games.	2020-03-08 10:06:59 +07:00
ReinUsesLisp	e4f9ce0379	vk_rasterizer: Support disabled uniform buffers	2020-03-06 18:47:51 -03:00