Remove comments

btdrv: Update function table
Merge pull request #4159 from ogniK5377/mem-manager-dumb-assert
2020-06-27 01:32:28 +02:00 · 2020-06-26 21:52:55 +02:00 · 2020-06-25 22:53:13 -04:00 · 2020-06-25 21:05:27 -03:00 · 2020-06-25 15:10:18 -04:00 · 2020-06-25 15:58:09 -03:00
168 changed files with 7641 additions and 2572 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -13,6 +13,9 @@
 [submodule "soundtouch"]
    path = externals/soundtouch
    url = https://github.com/citra-emu/ext-soundtouch.git
+[submodule "libressl"]
+    path = externals/libressl
+    url = https://github.com/citra-emu/ext-libressl-portable.git
 [submodule "discord-rpc"]
    path = externals/discord-rpc
    url = https://github.com/discordapp/discord-rpc.git
@@ -28,3 +31,6 @@
 [submodule "libzip"]
    path = externals/libzip/libzip
    url = https://github.com/nih-at/libzip.git
+[submodule "xbyak"]
+    path = externals/xbyak
+    url = https://github.com/herumi/xbyak.git
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.11)
+cmake_minimum_required(VERSION 3.15)

 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules")
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/externals/cmake-modules")
@@ -13,7 +13,7 @@ project(yuzu)
 option(ENABLE_SDL2 "Enable the SDL2 frontend" ON)

 option(ENABLE_QT "Enable the Qt frontend" ON)
-CMAKE_DEPENDENT_OPTION(YUZU_USE_BUNDLED_QT "Download bundled Qt binaries" OFF "ENABLE_QT;MSVC" OFF)
+CMAKE_DEPENDENT_OPTION(YUZU_USE_BUNDLED_QT "Download bundled Qt binaries" ON "ENABLE_QT;MSVC" OFF)

 option(ENABLE_WEB_SERVICE "Enable web services (telemetry, etc.)" ON)

@@ -152,7 +152,6 @@ macro(yuzu_find_packages)
        "Boost             1.71        boost/1.72.0"
        "Catch2            2.11        catch2/2.11.0"
        "fmt               6.2         fmt/6.2.0"
-        "OpenSSL           1.1         openssl/1.1.1f"
    # can't use until https://github.com/bincrafters/community/issues/1173
        #"libzip            1.5         libzip/1.5.2@bincrafters/stable"
        "lz4               1.8         lz4/1.9.2"
@@ -312,15 +311,6 @@ elseif (TARGET Boost::boost)
    add_library(boost ALIAS Boost::boost)
 endif()

-if (NOT TARGET OpenSSL::SSL)
-    set_target_properties(OpenSSL::OpenSSL PROPERTIES IMPORTED_GLOBAL TRUE)
-    add_library(OpenSSL::SSL ALIAS OpenSSL::OpenSSL)
-endif()
-if (NOT TARGET OpenSSL::Crypto)
-    set_target_properties(OpenSSL::OpenSSL PROPERTIES IMPORTED_GLOBAL TRUE)
-    add_library(OpenSSL::Crypto ALIAS OpenSSL::OpenSSL)
-endif()
-
 if (TARGET sdl2::sdl2)
    # imported from the conan generated sdl2Config.cmake
    set_target_properties(sdl2::sdl2 PROPERTIES IMPORTED_GLOBAL TRUE)
--- a/CMakeModules/GenerateSCMRev.cmake
+++ b/CMakeModules/GenerateSCMRev.cmake
@@ -51,6 +51,8 @@ endif()
 # The variable SRC_DIR must be passed into the script (since it uses the current build directory for all values of CMAKE_*_DIR)
 set(VIDEO_CORE "${SRC_DIR}/src/video_core")
 set(HASH_FILES
+    "${VIDEO_CORE}/renderer_opengl/gl_arb_decompiler.cpp"
+    "${VIDEO_CORE}/renderer_opengl/gl_arb_decompiler.h"
    "${VIDEO_CORE}/renderer_opengl/gl_shader_cache.cpp"
    "${VIDEO_CORE}/renderer_opengl/gl_shader_cache.h"
    "${VIDEO_CORE}/renderer_opengl/gl_shader_decompiler.cpp"
--- a/dist/qt_themes/qdarkstyle/style.qss
+++ b/dist/qt_themes/qdarkstyle/style.qss
@@ -673,10 +673,6 @@ QTabWidget::pane {
    border-bottom-left-radius: 2px;
 }

-QTabWidget::tab-bar {
-    overflow: visible;
-}
-
 QTabBar {
    qproperty-drawBase: 0;
    border-radius: 3px;
--- a/dist/yuzu.manifest
+++ b/dist/yuzu.manifest
@@ -1,24 +1,58 @@
 <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
-<assembly xmlns="urn:schemas-microsoft-com:asm.v1" manifestVersion="1.0">
- <trustInfo xmlns="urn:schemas-microsoft-com:asm.v3">
-  <security>
-   <requestedPrivileges>
-    <requestedExecutionLevel level="asInvoker" uiAccess="false"/>
-   </requestedPrivileges>
-  </security>
- </trustInfo>
- <application xmlns="urn:schemas-microsoft-com:asm.v3">
-  <windowsSettings>
-   <dpiAware xmlns="http://schemas.microsoft.com/SMI/2005/WindowsSettings">True/PM</dpiAware>
-   <longPathAware xmlns="http://schemas.microsoft.com/SMI/2016/WindowsSettings">true</longPathAware>
-  </windowsSettings>
- </application>
- <compatibility xmlns="urn:schemas-microsoft-com:compatibility.v1">
-  <application>
-   <supportedOS Id="{35138b9a-5d96-4fbd-8e2d-a2440225f93a}"/>
-   <supportedOS Id="{4a2f28e3-53b9-4441-ba9c-d69d4a4a6e38}"/>
-   <supportedOS Id="{1f676c76-80e1-4239-95bb-83d0f6d0da78}"/>
-   <supportedOS Id="{8e0f7a12-bfb3-4fe8-b9a5-48fd50a15a9a}"/>
-  </application>
- </compatibility>
-</assembly>
+<assembly manifestVersion="1.0"
+    xmlns="urn:schemas-microsoft-com:asm.v1"
+    xmlns:asmv3="urn:schemas-microsoft-com:asm.v3">
+  <asmv3:application>
+    <asmv3:windowsSettings>
+      <!-- Windows 7/8/8.1/10 -->
+      <dpiAware
+        xmlns="http://schemas.microsoft.com/SMI/2005/WindowsSettings">
+        true/pm
+      </dpiAware>
+      <!-- Windows 10, version 1607 or later -->
+      <dpiAwareness
+        xmlns="http://schemas.microsoft.com/SMI/2016/WindowsSettings">
+        PerMonitorV2
+      </dpiAwareness>
+      <!-- Windows 10, version 1703 or later -->
+      <gdiScaling
+          xmlns="http://schemas.microsoft.com/SMI/2017/WindowsSettings">
+        true
+      </gdiScaling>
+      <ws2:longPathAware
+          xmlns:ws2="http://schemas.microsoft.com/SMI/2016/WindowsSettings">
+        true
+      </ws2:longPathAware>
+    </asmv3:windowsSettings>
+  </asmv3:application>
+  <compatibility
+      xmlns="urn:schemas-microsoft-com:compatibility.v1">
+    <application>
+      <!-- Windows 10 -->
+      <supportedOS Id="{8e0f7a12-bfb3-4fe8-b9a5-48fd50a15a9a}"/>
+      <!-- Windows 8.1 -->
+      <supportedOS Id="{1f676c76-80e1-4239-95bb-83d0f6d0da78}"/>
+      <!-- Windows 8 -->
+      <supportedOS Id="{4a2f28e3-53b9-4441-ba9c-d69d4a4a6e38}"/>
+      <!-- Windows 7 -->
+      <supportedOS Id="{35138b9a-5d96-4fbd-8e2d-a2440225f93a}"/>
+    </application>
+  </compatibility>
+  <trustInfo
+      xmlns="urn:schemas-microsoft-com:asm.v3">
+    <security>
+      <requestedPrivileges>
+        <!--
+          UAC settings:
+          - app should run at same integrity level as calling process
+          - app does not need to manipulate windows belonging to
+            higher-integrity-level processes
+          -->
+        <requestedExecutionLevel
+            level="asInvoker"
+            uiAccess="false"
+        />
+      </requestedPrivileges>
+    </security>
+  </trustInfo>
+</assembly>
--- a/externals/CMakeLists.txt
+++ b/externals/CMakeLists.txt
@@ -4,6 +4,13 @@ list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/CMakeModules")
 list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/externals/find-modules")
 include(DownloadExternals)

+# xbyak: exposed as an INTERFACE target (usage requirements only), on x86 / x86_64 hosts only
+if (ARCHITECTURE_x86_64 OR ARCHITECTURE_x86)
+    add_library(xbyak INTERFACE)
+    target_compile_definitions(xbyak INTERFACE XBYAK_NO_OP_NAMES)
+    target_include_directories(xbyak SYSTEM INTERFACE "${CMAKE_CURRENT_SOURCE_DIR}/xbyak/xbyak")
+endif()
+
 # Catch
 add_library(catch-single-include INTERFACE)
 target_include_directories(catch-single-include INTERFACE catch/single_include)
@@ -66,6 +73,15 @@ if (NOT LIBZIP_FOUND)
 endif()

 if (ENABLE_WEB_SERVICE)
+    # LibreSSL
+    set(LIBRESSL_SKIP_INSTALL ON CACHE BOOL "")
+    add_subdirectory(libressl EXCLUDE_FROM_ALL)
+    target_include_directories(ssl INTERFACE ./libressl/include)
+    target_compile_definitions(ssl PRIVATE -DHAVE_INET_NTOP)
+    get_directory_property(OPENSSL_LIBRARIES
+        DIRECTORY libressl
+        DEFINITION OPENSSL_LIBS)
+
    # lurlparser
    add_subdirectory(lurlparser EXCLUDE_FROM_ALL)

@@ -73,5 +89,5 @@ if (ENABLE_WEB_SERVICE)
    add_library(httplib INTERFACE)
    target_include_directories(httplib INTERFACE ./httplib)
    target_compile_definitions(httplib INTERFACE -DCPPHTTPLIB_OPENSSL_SUPPORT)
-    target_link_libraries(httplib INTERFACE OpenSSL::SSL OpenSSL::Crypto)
+    target_link_libraries(httplib INTERFACE ${OPENSSL_LIBRARIES})
 endif()
--- a/externals/libressl
+++ b/externals/libressl
--- a/externals/sirit
+++ b/externals/sirit
--- a/externals/xbyak
+++ b/externals/xbyak
--- a/src/audio_core/audio_renderer.cpp
+++ b/src/audio_core/audio_renderer.cpp
@@ -180,11 +180,12 @@ ResultVal<std::vector<u8>> AudioRenderer::UpdateAudioRenderer(const std::vector<

    // Copy output header
    UpdateDataHeader response_data{worker_params};
-    std::vector<u8> output_params(response_data.total_size);
    if (behavior_info.IsElapsedFrameCountSupported()) {
-        response_data.frame_count = 0x10;
-        response_data.total_size += 0x10;
+        response_data.render_info = sizeof(RendererInfo);
+        response_data.total_size += sizeof(RendererInfo);
    }
+
+    std::vector<u8> output_params(response_data.total_size);
    std::memcpy(output_params.data(), &response_data, sizeof(UpdateDataHeader));

    // Copy output memory pool entries
@@ -219,6 +220,17 @@ ResultVal<std::vector<u8>> AudioRenderer::UpdateAudioRenderer(const std::vector<
        return Audren::ERR_INVALID_PARAMETERS;
    }

+    if (behavior_info.IsElapsedFrameCountSupported()) {
+        const std::size_t renderer_info_offset{
+            sizeof(UpdateDataHeader) + response_data.memory_pools_size + response_data.voices_size +
+            response_data.effects_size + response_data.sinks_size +
+            response_data.performance_manager_size + response_data.behavior_size};
+        RendererInfo renderer_info{};
+        renderer_info.elasped_frame_count = elapsed_frame_count;
+        std::memcpy(output_params.data() + renderer_info_offset, &renderer_info,
+                    sizeof(RendererInfo));
+    }
+
    return MakeResult(output_params);
 }

@@ -447,6 +459,7 @@ void AudioRenderer::QueueMixedBuffer(Buffer::Tag tag) {
        }
    }
    audio_out->QueueBuffer(stream, tag, std::move(buffer));
+    elapsed_frame_count++;
 }

 void AudioRenderer::ReleaseAndQueueBuffers() {
--- a/src/audio_core/audio_renderer.h
+++ b/src/audio_core/audio_renderer.h
@@ -196,6 +196,12 @@ struct EffectOutStatus {
 };
 static_assert(sizeof(EffectOutStatus) == 0x10, "EffectOutStatus is an invalid size");

+struct RendererInfo {
+    u64_le elasped_frame_count{}; // NOTE(review): "elasped" looks like a typo for "elapsed"
+    INSERT_PADDING_WORDS(2);      // Pads the struct up to the 0x10 bytes asserted below
+};
+static_assert(sizeof(RendererInfo) == 0x10, "RendererInfo is an invalid size");
+
 struct UpdateDataHeader {
    UpdateDataHeader() {}

@@ -209,7 +215,7 @@ struct UpdateDataHeader {
        mixes_size = 0x0;
        sinks_size = config.sink_count * 0x20;
        performance_manager_size = 0x10;
-        frame_count = 0;
+        render_info = 0;
        total_size = sizeof(UpdateDataHeader) + behavior_size + memory_pools_size + voices_size +
                     effects_size + sinks_size + performance_manager_size;
    }
@@ -223,8 +229,8 @@ struct UpdateDataHeader {
    u32_le mixes_size{};
    u32_le sinks_size{};
    u32_le performance_manager_size{};
-    INSERT_PADDING_WORDS(1);
-    u32_le frame_count{};
+    u32_le splitter_size{};
+    u32_le render_info{};
    INSERT_PADDING_WORDS(4);
    u32_le total_size{};
 };
@@ -258,6 +264,7 @@ private:
    std::unique_ptr<AudioOut> audio_out;
    StreamPtr stream;
    Core::Memory::Memory& memory;
+    std::size_t elapsed_frame_count{};
 };

 } // namespace AudioCore
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -32,6 +32,8 @@ add_custom_command(OUTPUT scm_rev.cpp
    DEPENDS
      # WARNING! It was too much work to try and make a common location for this list,
      # so if you need to change it, please update CMakeModules/GenerateSCMRev.cmake as well
+      "${VIDEO_CORE}/renderer_opengl/gl_arb_decompiler.cpp"
+      "${VIDEO_CORE}/renderer_opengl/gl_arb_decompiler.h"
      "${VIDEO_CORE}/renderer_opengl/gl_shader_cache.cpp"
      "${VIDEO_CORE}/renderer_opengl/gl_shader_cache.h"
      "${VIDEO_CORE}/renderer_opengl/gl_shader_decompiler.cpp"
@@ -123,6 +125,8 @@ add_library(common STATIC
    lz4_compression.cpp
    lz4_compression.h
    math_util.h
+    memory_detect.cpp
+    memory_detect.h
    memory_hook.cpp
    memory_hook.h
    microprofile.cpp
@@ -169,10 +173,12 @@ if(ARCHITECTURE_x86_64)
        PRIVATE
            x64/cpu_detect.cpp
            x64/cpu_detect.h
+            x64/xbyak_abi.h
+            x64/xbyak_util.h
    )
 endif()

 create_target_directory_groups(common)

 target_link_libraries(common PUBLIC Boost::boost fmt::fmt microprofile)
-target_link_libraries(common PRIVATE lz4::lz4 zstd::zstd)
+target_link_libraries(common PRIVATE lz4::lz4 zstd::zstd xbyak)
--- a/src/common/memory_detect.cpp
+++ b/src/common/memory_detect.cpp
@@ -0,0 +1,60 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#ifdef _WIN32
+// clang-format off
+#include <windows.h>
+#include <sysinfoapi.h>
+// clang-format on
+#else
+#include <sys/types.h>
+#ifdef __APPLE__
+#include <sys/sysctl.h>
+#else
+#include <sys/sysinfo.h>
+#endif
+#endif
+
+#include "common/memory_detect.h"
+
+namespace Common {
+
+// Detects the RAM and swapfile sizes (all zero-initialised in case an OS query fails)
+static MemoryInfo Detect() {
+    MemoryInfo mem_info{};
+#ifdef _WIN32
+    MEMORYSTATUSEX memorystatus{};
+    memorystatus.dwLength = sizeof(memorystatus);
+    if (GlobalMemoryStatusEx(&memorystatus) != 0) {
+        mem_info.TotalPhysicalMemory = memorystatus.ullTotalPhys;
+        mem_info.TotalSwapMemory = memorystatus.ullTotalPageFile - mem_info.TotalPhysicalMemory;
+    }
+#elif defined(__APPLE__)
+    u64 ramsize{};
+    struct xsw_usage vmusage {};
+    std::size_t sizeof_ramsize = sizeof(ramsize);
+    std::size_t sizeof_vmusage = sizeof(vmusage);
+    // hw and vm are defined in sysctl.h
+    // https://github.com/apple/darwin-xnu/blob/master/bsd/sys/sysctl.h#L471
+    sysctlbyname("hw.memsize", &ramsize, &sizeof_ramsize, nullptr, 0);
+    sysctlbyname("vm.swapusage", &vmusage, &sizeof_vmusage, nullptr, 0);
+    mem_info.TotalPhysicalMemory = ramsize;
+    mem_info.TotalSwapMemory = vmusage.xsu_total;
+#else
+    struct sysinfo meminfo {};
+    if (sysinfo(&meminfo) == 0) {
+        // sysinfo(2) reports totalram/totalswap in units of mem_unit bytes
+        mem_info.TotalPhysicalMemory = static_cast<u64>(meminfo.totalram) * meminfo.mem_unit;
+        mem_info.TotalSwapMemory = static_cast<u64>(meminfo.totalswap) * meminfo.mem_unit;
+    }
+#endif
+    return mem_info;
+}
+
+const MemoryInfo& GetMemInfo() {
+    static const MemoryInfo detected{Detect()}; // Local static: Detect() runs on first call only
+    return detected;
+}
+
+} // namespace Common
--- a/src/common/memory_detect.h
+++ b/src/common/memory_detect.h
@@ -0,0 +1,22 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "common/common_types.h"
+
+namespace Common {
+
+struct MemoryInfo {
+    u64 TotalPhysicalMemory{}; // Physical memory size of the host, in bytes
+    u64 TotalSwapMemory{};     // Swap memory size of the host, in bytes
+};
+
+/**
+ * Gets the memory info of the host system
+ * @return Reference to a MemoryInfo struct with the physical and swap memory sizes in bytes
+ */
+const MemoryInfo& GetMemInfo();
+
+} // namespace Common
--- a/src/common/telemetry.cpp
+++ b/src/common/telemetry.cpp
@@ -60,6 +60,7 @@ void AppendCPUInfo(FieldCollection& fc) {
    fc.AddField(FieldType::UserSystem, "CPU_Extension_x64_AES", Common::GetCPUCaps().aes);
    fc.AddField(FieldType::UserSystem, "CPU_Extension_x64_AVX", Common::GetCPUCaps().avx);
    fc.AddField(FieldType::UserSystem, "CPU_Extension_x64_AVX2", Common::GetCPUCaps().avx2);
+    fc.AddField(FieldType::UserSystem, "CPU_Extension_x64_AVX512", Common::GetCPUCaps().avx512);
    fc.AddField(FieldType::UserSystem, "CPU_Extension_x64_BMI1", Common::GetCPUCaps().bmi1);
    fc.AddField(FieldType::UserSystem, "CPU_Extension_x64_BMI2", Common::GetCPUCaps().bmi2);
    fc.AddField(FieldType::UserSystem, "CPU_Extension_x64_FMA", Common::GetCPUCaps().fma);
--- a/src/common/x64/cpu_detect.cpp
+++ b/src/common/x64/cpu_detect.cpp
@@ -110,6 +110,11 @@ static CPUCaps Detect() {
                caps.bmi1 = true;
            if ((cpu_id[1] >> 8) & 1)
                caps.bmi2 = true;
+            // Checks for AVX512F, AVX512CD, AVX512VL, AVX512DQ, AVX512BW (Intel Skylake-X/SP)
+            if ((cpu_id[1] >> 16) & 1 && (cpu_id[1] >> 28) & 1 && (cpu_id[1] >> 31) & 1 &&
+                (cpu_id[1] >> 17) & 1 && (cpu_id[1] >> 30) & 1) {
+                caps.avx512 = caps.avx2;
+            }
        }
    }

--- a/src/common/x64/cpu_detect.h
+++ b/src/common/x64/cpu_detect.h
@@ -19,6 +19,7 @@ struct CPUCaps {
    bool lzcnt;
    bool avx;
    bool avx2;
+    bool avx512;
    bool bmi1;
    bool bmi2;
    bool fma;
--- a/src/common/x64/xbyak_abi.h
+++ b/src/common/x64/xbyak_abi.h
@@ -0,0 +1,229 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <bitset>
+#include <initializer_list>
+#include <xbyak.h>
+#include "common/assert.h"
+
+namespace Common::X64 {
+
+inline std::size_t RegToIndex(const Xbyak::Reg& reg) {
+    using Kind = Xbyak::Reg::Kind;
+    ASSERT_MSG((reg.getKind() & (Kind::REG | Kind::XMM)) != 0,
+               "RegSet only supports GPRs and XMM registers.");
+    ASSERT_MSG(reg.getIdx() < 16, "RegSet only supports register indices 0-15.");
+    return reg.getIdx() + (reg.getKind() == Kind::REG ? 0 : 16); // GPRs: 0-15, others: 16-31
+}
+
+inline Xbyak::Reg64 IndexToReg64(std::size_t reg_index) {
+    ASSERT(reg_index < 16); // Only slots 0-15 are accepted here
+    return Xbyak::Reg64(static_cast<int>(reg_index));
+}
+
+inline Xbyak::Xmm IndexToXmm(std::size_t reg_index) {
+    ASSERT(reg_index >= 16 && reg_index < 32); // Only slots 16-31 are accepted here
+    return Xbyak::Xmm(static_cast<int>(reg_index - 16)); // Slot 16 maps to XMM index 0
+}
+
+inline Xbyak::Reg IndexToReg(std::size_t reg_index) {
+    // Slots from 16 upwards go to the XMM mapping; everything below is a 64-bit GPR.
+    if (reg_index >= 16) {
+        return IndexToXmm(reg_index);
+    }
+    return IndexToReg64(reg_index);
+}
+
+inline std::bitset<32> BuildRegSet(std::initializer_list<Xbyak::Reg> regs) {
+    std::bitset<32> reg_set;
+    for (auto it = regs.begin(); it != regs.end(); ++it) {
+        reg_set.set(RegToIndex(*it)); // One bit per register, at its RegToIndex() slot
+    }
+    return reg_set;
+}
+
+const std::bitset<32> ABI_ALL_GPRS(0x0000FFFF);
+const std::bitset<32> ABI_ALL_XMMS(0xFFFF0000);
+
+#ifdef _WIN32
+
+// Microsoft x64 ABI
+const Xbyak::Reg ABI_RETURN = Xbyak::util::rax;
+const Xbyak::Reg ABI_PARAM1 = Xbyak::util::rcx;
+const Xbyak::Reg ABI_PARAM2 = Xbyak::util::rdx;
+const Xbyak::Reg ABI_PARAM3 = Xbyak::util::r8;
+const Xbyak::Reg ABI_PARAM4 = Xbyak::util::r9;
+
+const std::bitset<32> ABI_ALL_CALLER_SAVED = BuildRegSet({
+    // GPRs
+    Xbyak::util::rcx,
+    Xbyak::util::rdx,
+    Xbyak::util::r8,
+    Xbyak::util::r9,
+    Xbyak::util::r10,
+    Xbyak::util::r11,
+    // XMMs
+    Xbyak::util::xmm0,
+    Xbyak::util::xmm1,
+    Xbyak::util::xmm2,
+    Xbyak::util::xmm3,
+    Xbyak::util::xmm4,
+    Xbyak::util::xmm5,
+});
+
+const std::bitset<32> ABI_ALL_CALLEE_SAVED = BuildRegSet({
+    // GPRs
+    Xbyak::util::rbx,
+    Xbyak::util::rsi,
+    Xbyak::util::rdi,
+    Xbyak::util::rbp,
+    Xbyak::util::r12,
+    Xbyak::util::r13,
+    Xbyak::util::r14,
+    Xbyak::util::r15,
+    // XMMs
+    Xbyak::util::xmm6,
+    Xbyak::util::xmm7,
+    Xbyak::util::xmm8,
+    Xbyak::util::xmm9,
+    Xbyak::util::xmm10,
+    Xbyak::util::xmm11,
+    Xbyak::util::xmm12,
+    Xbyak::util::xmm13,
+    Xbyak::util::xmm14,
+    Xbyak::util::xmm15,
+});
+
+constexpr size_t ABI_SHADOW_SPACE = 0x20;
+
+#else
+
+// System V x86-64 ABI
+const Xbyak::Reg ABI_RETURN = Xbyak::util::rax;
+const Xbyak::Reg ABI_PARAM1 = Xbyak::util::rdi;
+const Xbyak::Reg ABI_PARAM2 = Xbyak::util::rsi;
+const Xbyak::Reg ABI_PARAM3 = Xbyak::util::rdx;
+const Xbyak::Reg ABI_PARAM4 = Xbyak::util::rcx;
+
+const std::bitset<32> ABI_ALL_CALLER_SAVED = BuildRegSet({
+    // GPRs
+    Xbyak::util::rcx,
+    Xbyak::util::rdx,
+    Xbyak::util::rdi,
+    Xbyak::util::rsi,
+    Xbyak::util::r8,
+    Xbyak::util::r9,
+    Xbyak::util::r10,
+    Xbyak::util::r11,
+    // XMMs
+    Xbyak::util::xmm0,
+    Xbyak::util::xmm1,
+    Xbyak::util::xmm2,
+    Xbyak::util::xmm3,
+    Xbyak::util::xmm4,
+    Xbyak::util::xmm5,
+    Xbyak::util::xmm6,
+    Xbyak::util::xmm7,
+    Xbyak::util::xmm8,
+    Xbyak::util::xmm9,
+    Xbyak::util::xmm10,
+    Xbyak::util::xmm11,
+    Xbyak::util::xmm12,
+    Xbyak::util::xmm13,
+    Xbyak::util::xmm14,
+    Xbyak::util::xmm15,
+});
+
+const std::bitset<32> ABI_ALL_CALLEE_SAVED = BuildRegSet({
+    // GPRs
+    Xbyak::util::rbx,
+    Xbyak::util::rbp,
+    Xbyak::util::r12,
+    Xbyak::util::r13,
+    Xbyak::util::r14,
+    Xbyak::util::r15,
+});
+
+constexpr size_t ABI_SHADOW_SPACE = 0;
+
+#endif
+
+struct ABIFrameInfo {
+    s32 subtraction;
+    s32 xmm_offset;
+};
+
+inline ABIFrameInfo ABI_CalculateFrameSize(std::bitset<32> regs, size_t rsp_alignment,
+                                           size_t needed_frame_size) {
+    const auto count = (regs & ABI_ALL_GPRS).count();
+    rsp_alignment -= count * 8; // Account for 8 bytes per GPR present in regs
+    size_t subtraction = 0;
+    const auto xmm_count = (regs & ABI_ALL_XMMS).count();
+    if (xmm_count) {
+        // If we have any XMMs to save, we must align the stack here.
+        subtraction = rsp_alignment & 0xF;
+    }
+    subtraction += 0x10 * xmm_count; // One 16-byte slot per XMM present in regs
+    size_t xmm_base_subtraction = subtraction;
+    subtraction += needed_frame_size;
+    subtraction += ABI_SHADOW_SPACE;
+    // Final alignment: pad by the low four bits of what is left of rsp_alignment.
+    rsp_alignment -= subtraction;
+    subtraction += rsp_alignment & 0xF;
+    // xmm_offset is the part of the subtraction added after the XMM slots were accounted for.
+    return ABIFrameInfo{static_cast<s32>(subtraction),
+                        static_cast<s32>(subtraction - xmm_base_subtraction)};
+}
+
+inline size_t ABI_PushRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::bitset<32> regs,
+                                              size_t rsp_alignment, size_t needed_frame_size = 0) {
+    auto frame_info = ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size);
+    // Emit a push for every GPR present in regs, in ascending slot order.
+    for (std::size_t i = 0; i < regs.size(); ++i) {
+        if (regs[i] && ABI_ALL_GPRS[i]) {
+            code.push(IndexToReg64(i));
+        }
+    }
+    // Lower rsp by the computed frame size; nothing is emitted when that size is zero.
+    if (frame_info.subtraction != 0) {
+        code.sub(code.rsp, frame_info.subtraction);
+    }
+    // Store every XMM present in regs into consecutive 16-byte slots starting at xmm_offset.
+    for (std::size_t i = 0; i < regs.size(); ++i) {
+        if (regs[i] && ABI_ALL_XMMS[i]) {
+            code.movaps(code.xword[code.rsp + frame_info.xmm_offset], IndexToXmm(i));
+            frame_info.xmm_offset += 0x10;
+        }
+    }
+    // The return value is always the platform's ABI_SHADOW_SPACE constant.
+    return ABI_SHADOW_SPACE;
+}
+
+inline void ABI_PopRegistersAndAdjustStack(Xbyak::CodeGenerator& code, std::bitset<32> regs,
+                                           size_t rsp_alignment, size_t needed_frame_size = 0) {
+    const auto layout = ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size);
+    const std::bitset<32> xmms = regs & ABI_ALL_XMMS;
+    const std::bitset<32> gprs = regs & ABI_ALL_GPRS;
+    // Reload the XMM registers from consecutive 16-byte slots, lowest slot first.
+    s32 slot = layout.xmm_offset;
+    for (std::size_t i = 0; i < xmms.size(); ++i) {
+        if (xmms[i]) {
+            code.movaps(IndexToXmm(i), code.xword[code.rsp + slot]);
+            slot += 0x10;
+        }
+    }
+    if (layout.subtraction != 0) {
+        code.add(code.rsp, layout.subtraction);
+    }
+    // GPRs need to be popped in reverse order, so walk the slots from the top down.
+    for (std::size_t i = gprs.size(); i-- > 0;) {
+        if (gprs[i]) {
+            code.pop(IndexToReg64(i));
+        }
+    }
+}
+
+} // namespace Common::X64
--- a/src/common/x64/xbyak_util.h
+++ b/src/common/x64/xbyak_util.h
@@ -0,0 +1,47 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <type_traits>
+#include <xbyak.h>
+#include "common/x64/xbyak_abi.h"
+
+namespace Common::X64 {
+
+// Constants for use with cmpps/cmpss
+enum {
+    CMP_EQ = 0,
+    CMP_LT = 1,
+    CMP_LE = 2,
+    CMP_UNORD = 3,
+    CMP_NEQ = 4,
+    CMP_NLT = 5,
+    CMP_NLE = 6,
+    CMP_ORD = 7,
+};
+
+constexpr bool IsWithin2G(uintptr_t ref, uintptr_t target) {
+    const u64 delta = target - (ref + 5); // Unsigned: wraps when target is below ref + 5
+    return delta < 0x8000'0000ULL || delta > ~0x8000'0000ULL;
+}
+
+inline bool IsWithin2G(const Xbyak::CodeGenerator& code, uintptr_t target) {
+    return IsWithin2G(reinterpret_cast<uintptr_t>(code.getCurr()), target);
+}
+
+template <typename T>
+inline void CallFarFunction(Xbyak::CodeGenerator& code, const T f) {
+    static_assert(std::is_pointer_v<T>, "Argument must be a (function) pointer.");
+    const size_t target = reinterpret_cast<size_t>(f);
+    if (!IsWithin2G(code, target)) {
+        // Not within 2G of the emit position: ABI_RETURN is a safe temp register before a call
+        code.mov(ABI_RETURN, target);
+        code.call(ABI_RETURN);
+        return;
+    }
+    code.call(f); // Within 2G: call the pointer directly
+}
+
+} // namespace Common::X64
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -606,11 +606,11 @@ endif()
 create_target_directory_groups(core)

 target_link_libraries(core PUBLIC common PRIVATE audio_core video_core)
-target_link_libraries(core PUBLIC Boost::boost PRIVATE fmt::fmt nlohmann_json::nlohmann_json mbedtls Opus::Opus unicorn)
+target_link_libraries(core PUBLIC Boost::boost PRIVATE fmt::fmt nlohmann_json::nlohmann_json mbedtls Opus::Opus unicorn zip)

 if (YUZU_ENABLE_BOXCAT)
    target_compile_definitions(core PRIVATE -DYUZU_ENABLE_BOXCAT)
-    target_link_libraries(core PRIVATE httplib nlohmann_json::nlohmann_json zip)
+    target_link_libraries(core PRIVATE httplib nlohmann_json::nlohmann_json)
 endif()

 if (ENABLE_WEB_SERVICE)
--- a/src/core/arm/dynarmic/arm_dynarmic_32.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic_32.cpp
@@ -50,7 +50,8 @@ public:
    }

    void InterpreterFallback(u32 pc, std::size_t num_instructions) override {
-        UNIMPLEMENTED();
+        UNIMPLEMENTED_MSG("This should never happen, pc = {:08X}, code = {:08X}", pc,
+                          MemoryReadCode(pc));
    }

    void ExceptionRaised(u32 pc, Dynarmic::A32::Exception exception) override {
@@ -61,7 +62,7 @@ public:
        case Dynarmic::A32::Exception::Breakpoint:
            break;
        }
-        LOG_CRITICAL(HW_GPU, "ExceptionRaised(exception = {}, pc = {:08X}, code = {:08X})",
+        LOG_CRITICAL(Core_ARM, "ExceptionRaised(exception = {}, pc = {:08X}, code = {:08X})",
                     static_cast<std::size_t>(exception), pc, MemoryReadCode(pc));
        UNIMPLEMENTED();
    }
@@ -89,8 +90,6 @@ public:

    ARM_Dynarmic_32& parent;
    std::size_t num_interpreted_instructions{};
-    u64 tpidrro_el0{};
-    u64 tpidr_el0{};
 };

 std::shared_ptr<Dynarmic::A32::Jit> ARM_Dynarmic_32::MakeJit(Common::PageTable& page_table,
@@ -99,7 +98,7 @@ std::shared_ptr<Dynarmic::A32::Jit> ARM_Dynarmic_32::MakeJit(Common::PageTable&
    config.callbacks = cb.get();
    // TODO(bunnei): Implement page table for 32-bit
    // config.page_table = &page_table.pointers;
-    config.coprocessors[15] = std::make_shared<DynarmicCP15>((u32*)&CP15_regs[0]);
+    config.coprocessors[15] = cp15;
    config.define_unpredictable_behaviour = true;
    return std::make_unique<Dynarmic::A32::Jit>(config);
 }
@@ -112,13 +111,13 @@ void ARM_Dynarmic_32::Run() {
 }

 void ARM_Dynarmic_32::Step() {
-    cb->InterpreterFallback(jit->Regs()[15], 1);
+    jit->Step();
 }

 ARM_Dynarmic_32::ARM_Dynarmic_32(System& system, ExclusiveMonitor& exclusive_monitor,
                                 std::size_t core_index)
-    : ARM_Interface{system},
-      cb(std::make_unique<DynarmicCallbacks32>(*this)), core_index{core_index},
+    : ARM_Interface{system}, cb(std::make_unique<DynarmicCallbacks32>(*this)),
+      cp15(std::make_shared<DynarmicCP15>(*this)), core_index{core_index},
      exclusive_monitor{dynamic_cast<DynarmicExclusiveMonitor&>(exclusive_monitor)} {}

 ARM_Dynarmic_32::~ARM_Dynarmic_32() = default;
@@ -154,19 +153,19 @@ void ARM_Dynarmic_32::SetPSTATE(u32 cpsr) {
 }

 u64 ARM_Dynarmic_32::GetTlsAddress() const {
-    return CP15_regs[static_cast<std::size_t>(CP15Register::CP15_THREAD_URO)];
+    return cp15->uro;
 }

 void ARM_Dynarmic_32::SetTlsAddress(VAddr address) {
-    CP15_regs[static_cast<std::size_t>(CP15Register::CP15_THREAD_URO)] = static_cast<u32>(address);
+    cp15->uro = static_cast<u32>(address);
 }

 u64 ARM_Dynarmic_32::GetTPIDR_EL0() const {
-    return cb->tpidr_el0;
+    return cp15->uprw;
 }

 void ARM_Dynarmic_32::SetTPIDR_EL0(u64 value) {
-    cb->tpidr_el0 = value;
+    cp15->uprw = static_cast<u32>(value);
 }

 void ARM_Dynarmic_32::SaveContext(ThreadContext32& ctx) {
--- a/src/core/arm/dynarmic/arm_dynarmic_32.h
+++ b/src/core/arm/dynarmic/arm_dynarmic_32.h
@@ -22,6 +22,7 @@ class Memory;
 namespace Core {

 class DynarmicCallbacks32;
+class DynarmicCP15;
 class DynarmicExclusiveMonitor;
 class System;

@@ -66,12 +67,14 @@ private:
        std::unordered_map<JitCacheKey, std::shared_ptr<Dynarmic::A32::Jit>, Common::PairHash>;

    friend class DynarmicCallbacks32;
+    friend class DynarmicCP15;
+
    std::unique_ptr<DynarmicCallbacks32> cb;
    JitCacheType jit_cache;
    std::shared_ptr<Dynarmic::A32::Jit> jit;
+    std::shared_ptr<DynarmicCP15> cp15;
    std::size_t core_index;
    DynarmicExclusiveMonitor& exclusive_monitor;
-    std::array<u32, 84> CP15_regs{};
 };

 } // namespace Core
--- a/src/core/arm/dynarmic/arm_dynarmic_64.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic_64.cpp
@@ -98,8 +98,8 @@ public:
            }
            [[fallthrough]];
        default:
-            ASSERT_MSG(false, "ExceptionRaised(exception = {}, pc = {:X})",
-                       static_cast<std::size_t>(exception), pc);
+            ASSERT_MSG(false, "ExceptionRaised(exception = {}, pc = {:08X}, code = {:08X})",
+                       static_cast<std::size_t>(exception), pc, MemoryReadCode(pc));
        }
    }

--- a/src/core/arm/dynarmic/arm_dynarmic_cp15.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic_cp15.cpp
@@ -2,79 +2,132 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

+#include <fmt/format.h>
+#include "common/logging/log.h"
+#include "core/arm/dynarmic/arm_dynarmic_32.h"
 #include "core/arm/dynarmic/arm_dynarmic_cp15.h"
+#include "core/core.h"
+#include "core/core_timing.h"
+#include "core/core_timing_util.h"

 using Callback = Dynarmic::A32::Coprocessor::Callback;
 using CallbackOrAccessOneWord = Dynarmic::A32::Coprocessor::CallbackOrAccessOneWord;
 using CallbackOrAccessTwoWords = Dynarmic::A32::Coprocessor::CallbackOrAccessTwoWords;

+template <>
+struct fmt::formatter<Dynarmic::A32::CoprocReg> { // lets log calls print a CoprocReg as "cp<N>"
+    constexpr auto parse(format_parse_context& ctx) {
+        return ctx.begin(); // no format-spec characters are consumed
+    }
+    template <typename FormatContext>
+    auto format(const Dynarmic::A32::CoprocReg& reg, FormatContext& ctx) const {
+        return format_to(ctx.out(), "cp{}", static_cast<std::size_t>(reg));
+    }
+};
+
+namespace Core {
+
+static u32 dummy_value;
+
 std::optional<Callback> DynarmicCP15::CompileInternalOperation(bool two, unsigned opc1,
                                                               CoprocReg CRd, CoprocReg CRn,
                                                               CoprocReg CRm, unsigned opc2) {
+    LOG_CRITICAL(Core_ARM, "CP15: cdp{} p15, {}, {}, {}, {}, {}", two ? "2" : "", opc1, CRd, CRn,
+                 CRm, opc2);
    return {};
 }

 CallbackOrAccessOneWord DynarmicCP15::CompileSendOneWord(bool two, unsigned opc1, CoprocReg CRn,
                                                         CoprocReg CRm, unsigned opc2) {
-    // TODO(merry): Privileged CP15 registers
-
    if (!two && CRn == CoprocReg::C7 && opc1 == 0 && CRm == CoprocReg::C5 && opc2 == 4) {
+        // CP15_FLUSH_PREFETCH_BUFFER
        // This is a dummy write, we ignore the value written here.
-        return &CP15[static_cast<std::size_t>(CP15Register::CP15_FLUSH_PREFETCH_BUFFER)];
+        return &dummy_value;
    }

    if (!two && CRn == CoprocReg::C7 && opc1 == 0 && CRm == CoprocReg::C10) {
        switch (opc2) {
        case 4:
+            // CP15_DATA_SYNC_BARRIER
            // This is a dummy write, we ignore the value written here.
-            return &CP15[static_cast<std::size_t>(CP15Register::CP15_DATA_SYNC_BARRIER)];
+            return &dummy_value;
        case 5:
+            // CP15_DATA_MEMORY_BARRIER
            // This is a dummy write, we ignore the value written here.
-            return &CP15[static_cast<std::size_t>(CP15Register::CP15_DATA_MEMORY_BARRIER)];
-        default:
-            return {};
+            return &dummy_value;
        }
    }

    if (!two && CRn == CoprocReg::C13 && opc1 == 0 && CRm == CoprocReg::C0 && opc2 == 2) {
-        return &CP15[static_cast<std::size_t>(CP15Register::CP15_THREAD_UPRW)];
+        // CP15_THREAD_UPRW
+        return &uprw;
    }

+    LOG_CRITICAL(Core_ARM, "CP15: mcr{} p15, {}, <Rt>, {}, {}, {}", two ? "2" : "", opc1, CRn, CRm,
+                 opc2);
    return {};
 }

 CallbackOrAccessTwoWords DynarmicCP15::CompileSendTwoWords(bool two, unsigned opc, CoprocReg CRm) {
+    LOG_CRITICAL(Core_ARM, "CP15: mcrr{} p15, {}, <Rt>, <Rt2>, {}", two ? "2" : "", opc, CRm);
    return {};
 }

 CallbackOrAccessOneWord DynarmicCP15::CompileGetOneWord(bool two, unsigned opc1, CoprocReg CRn,
                                                        CoprocReg CRm, unsigned opc2) {
-    // TODO(merry): Privileged CP15 registers
-
    if (!two && CRn == CoprocReg::C13 && opc1 == 0 && CRm == CoprocReg::C0) {
        switch (opc2) {
        case 2:
-            return &CP15[static_cast<std::size_t>(CP15Register::CP15_THREAD_UPRW)];
+            // CP15_THREAD_UPRW
+            return &uprw;
        case 3:
-            return &CP15[static_cast<std::size_t>(CP15Register::CP15_THREAD_URO)];
-        default:
-            return {};
+            // CP15_THREAD_URO
+            return &uro;
        }
    }

+    LOG_CRITICAL(Core_ARM, "CP15: mrc{} p15, {}, <Rt>, {}, {}, {}", two ? "2" : "", opc1, CRn, CRm,
+                 opc2);
    return {};
 }

 CallbackOrAccessTwoWords DynarmicCP15::CompileGetTwoWords(bool two, unsigned opc, CoprocReg CRm) {
+    if (!two && opc == 0 && CRm == CoprocReg::C14) {
+        // CNTPCT: the callback returns the core timing tick count converted to clock cycles
+        const auto callback = static_cast<u64 (*)(Dynarmic::A32::Jit*, void*, u32, u32)>(
+            [](Dynarmic::A32::Jit*, void* arg, u32, u32) -> u64 {
+                auto& cpu = *static_cast<ARM_Dynarmic_32*>(arg); // arg == &parent (see below)
+                return Timing::CpuCyclesToClockCycles(cpu.system.CoreTiming().GetTicks());
+            });
+        return Dynarmic::A32::Coprocessor::Callback{callback, static_cast<void*>(&parent)};
+    }
+
+    LOG_CRITICAL(Core_ARM, "CP15: mrrc{} p15, {}, <Rt>, <Rt2>, {}", two ? "2" : "", opc, CRm);
     return {};
 }

 std::optional<Callback> DynarmicCP15::CompileLoadWords(bool two, bool long_transfer, CoprocReg CRd,
                                                        std::optional<u8> option) {
+    if (option) { // never handled: log the LDC form (with its option byte when one is present)
+        LOG_CRITICAL(Core_ARM, "CP15: ldc{}{} p15, {}, [...], {}", two ? "2" : "",
+                     long_transfer ? "l" : "", CRd, *option);
+    } else {
+        LOG_CRITICAL(Core_ARM, "CP15: ldc{}{} p15, {}, [...]", two ? "2" : "",
+                     long_transfer ? "l" : "", CRd);
+    }
     return {};
 }

 std::optional<Callback> DynarmicCP15::CompileStoreWords(bool two, bool long_transfer, CoprocReg CRd,
                                                         std::optional<u8> option) {
+    if (option) { // never handled: log the STC form (with its option byte when one is present)
+        LOG_CRITICAL(Core_ARM, "CP15: stc{}{} p15, {}, [...], {}", two ? "2" : "",
+                     long_transfer ? "l" : "", CRd, *option);
+    } else {
+        LOG_CRITICAL(Core_ARM, "CP15: stc{}{} p15, {}, [...]", two ? "2" : "",
+                     long_transfer ? "l" : "", CRd);
+    }
     return {};
 }
+
+} // namespace Core
--- a/src/core/arm/dynarmic/arm_dynarmic_cp15.h
+++ b/src/core/arm/dynarmic/arm_dynarmic_cp15.h
@@ -10,128 +10,15 @@
 #include <dynarmic/A32/coprocessor.h>
 #include "common/common_types.h"

-enum class CP15Register {
-    // c0 - Information registers
-    CP15_MAIN_ID,
-    CP15_CACHE_TYPE,
-    CP15_TCM_STATUS,
-    CP15_TLB_TYPE,
-    CP15_CPU_ID,
-    CP15_PROCESSOR_FEATURE_0,
-    CP15_PROCESSOR_FEATURE_1,
-    CP15_DEBUG_FEATURE_0,
-    CP15_AUXILIARY_FEATURE_0,
-    CP15_MEMORY_MODEL_FEATURE_0,
-    CP15_MEMORY_MODEL_FEATURE_1,
-    CP15_MEMORY_MODEL_FEATURE_2,
-    CP15_MEMORY_MODEL_FEATURE_3,
-    CP15_ISA_FEATURE_0,
-    CP15_ISA_FEATURE_1,
-    CP15_ISA_FEATURE_2,
-    CP15_ISA_FEATURE_3,
-    CP15_ISA_FEATURE_4,
+namespace Core {

-    // c1 - Control registers
-    CP15_CONTROL,
-    CP15_AUXILIARY_CONTROL,
-    CP15_COPROCESSOR_ACCESS_CONTROL,
-
-    // c2 - Translation table registers
-    CP15_TRANSLATION_BASE_TABLE_0,
-    CP15_TRANSLATION_BASE_TABLE_1,
-    CP15_TRANSLATION_BASE_CONTROL,
-    CP15_DOMAIN_ACCESS_CONTROL,
-    CP15_RESERVED,
-
-    // c5 - Fault status registers
-    CP15_FAULT_STATUS,
-    CP15_INSTR_FAULT_STATUS,
-    CP15_COMBINED_DATA_FSR = CP15_FAULT_STATUS,
-    CP15_INST_FSR,
-
-    // c6 - Fault Address registers
-    CP15_FAULT_ADDRESS,
-    CP15_COMBINED_DATA_FAR = CP15_FAULT_ADDRESS,
-    CP15_WFAR,
-    CP15_IFAR,
-
-    // c7 - Cache operation registers
-    CP15_WAIT_FOR_INTERRUPT,
-    CP15_PHYS_ADDRESS,
-    CP15_INVALIDATE_INSTR_CACHE,
-    CP15_INVALIDATE_INSTR_CACHE_USING_MVA,
-    CP15_INVALIDATE_INSTR_CACHE_USING_INDEX,
-    CP15_FLUSH_PREFETCH_BUFFER,
-    CP15_FLUSH_BRANCH_TARGET_CACHE,
-    CP15_FLUSH_BRANCH_TARGET_CACHE_ENTRY,
-    CP15_INVALIDATE_DATA_CACHE,
-    CP15_INVALIDATE_DATA_CACHE_LINE_USING_MVA,
-    CP15_INVALIDATE_DATA_CACHE_LINE_USING_INDEX,
-    CP15_INVALIDATE_DATA_AND_INSTR_CACHE,
-    CP15_CLEAN_DATA_CACHE,
-    CP15_CLEAN_DATA_CACHE_LINE_USING_MVA,
-    CP15_CLEAN_DATA_CACHE_LINE_USING_INDEX,
-    CP15_DATA_SYNC_BARRIER,
-    CP15_DATA_MEMORY_BARRIER,
-    CP15_CLEAN_AND_INVALIDATE_DATA_CACHE,
-    CP15_CLEAN_AND_INVALIDATE_DATA_CACHE_LINE_USING_MVA,
-    CP15_CLEAN_AND_INVALIDATE_DATA_CACHE_LINE_USING_INDEX,
-
-    // c8 - TLB operations
-    CP15_INVALIDATE_ITLB,
-    CP15_INVALIDATE_ITLB_SINGLE_ENTRY,
-    CP15_INVALIDATE_ITLB_ENTRY_ON_ASID_MATCH,
-    CP15_INVALIDATE_ITLB_ENTRY_ON_MVA,
-    CP15_INVALIDATE_DTLB,
-    CP15_INVALIDATE_DTLB_SINGLE_ENTRY,
-    CP15_INVALIDATE_DTLB_ENTRY_ON_ASID_MATCH,
-    CP15_INVALIDATE_DTLB_ENTRY_ON_MVA,
-    CP15_INVALIDATE_UTLB,
-    CP15_INVALIDATE_UTLB_SINGLE_ENTRY,
-    CP15_INVALIDATE_UTLB_ENTRY_ON_ASID_MATCH,
-    CP15_INVALIDATE_UTLB_ENTRY_ON_MVA,
-
-    // c9 - Data cache lockdown register
-    CP15_DATA_CACHE_LOCKDOWN,
-
-    // c10 - TLB/Memory map registers
-    CP15_TLB_LOCKDOWN,
-    CP15_PRIMARY_REGION_REMAP,
-    CP15_NORMAL_REGION_REMAP,
-
-    // c13 - Thread related registers
-    CP15_PID,
-    CP15_CONTEXT_ID,
-    CP15_THREAD_UPRW, // Thread ID register - User/Privileged Read/Write
-    CP15_THREAD_URO,  // Thread ID register - User Read Only (Privileged R/W)
-    CP15_THREAD_PRW,  // Thread ID register - Privileged R/W only.
-
-    // c15 - Performance and TLB lockdown registers
-    CP15_PERFORMANCE_MONITOR_CONTROL,
-    CP15_CYCLE_COUNTER,
-    CP15_COUNT_0,
-    CP15_COUNT_1,
-    CP15_READ_MAIN_TLB_LOCKDOWN_ENTRY,
-    CP15_WRITE_MAIN_TLB_LOCKDOWN_ENTRY,
-    CP15_MAIN_TLB_LOCKDOWN_VIRT_ADDRESS,
-    CP15_MAIN_TLB_LOCKDOWN_PHYS_ADDRESS,
-    CP15_MAIN_TLB_LOCKDOWN_ATTRIBUTE,
-    CP15_TLB_DEBUG_CONTROL,
-
-    // Skyeye defined
-    CP15_TLB_FAULT_ADDR,
-    CP15_TLB_FAULT_STATUS,
-
-    // Not an actual register.
-    // All registers should be defined above this.
-    CP15_REGISTER_COUNT,
-};
+class ARM_Dynarmic_32;

 class DynarmicCP15 final : public Dynarmic::A32::Coprocessor {
 public:
    using CoprocReg = Dynarmic::A32::CoprocReg;

-    explicit DynarmicCP15(u32* cp15) : CP15(cp15){};
+    explicit DynarmicCP15(ARM_Dynarmic_32& parent) : parent(parent) {}

    std::optional<Callback> CompileInternalOperation(bool two, unsigned opc1, CoprocReg CRd,
                                                     CoprocReg CRn, CoprocReg CRm,
@@ -147,6 +34,9 @@ public:
    std::optional<Callback> CompileStoreWords(bool two, bool long_transfer, CoprocReg CRd,
                                              std::optional<u8> option) override;

-private:
-    u32* CP15{};
+    ARM_Dynarmic_32& parent;
+    u32 uprw{}; // thread ID register, user read/write (p15 c13, c0, 2); zero until first written
+    u32 uro{};  // thread ID register, user read-only (p15 c13, c0, 3); zero until first written
 };
+
+} // namespace Core
--- a/src/core/file_sys/patch_manager.cpp
+++ b/src/core/file_sys/patch_manager.cpp
@@ -10,6 +10,7 @@
 #include "common/file_util.h"
 #include "common/hex_util.h"
 #include "common/logging/log.h"
+#include "common/string_util.h"
 #include "core/core.h"
 #include "core/file_sys/content_archive.h"
 #include "core/file_sys/control_metadata.h"
@@ -48,6 +49,23 @@ std::string FormatTitleVersion(u32 version, TitleVersionFormat format) {
    return fmt::format("v{}.{}.{}", bytes[3], bytes[2], bytes[1]);
 }

+std::shared_ptr<VfsDirectory> FindSubdirectoryCaseless(const std::shared_ptr<VfsDirectory> dir,
+                                                       std::string_view name) {
+#ifdef _WIN32
+    return dir->GetSubdirectory(name); // NOTE(review): relies on a case-insensitive host lookup
+#else
+    const std::string wanted = Common::ToLower(std::string{name}); // fold the requested name too
+    for (const auto& subdir : dir->GetSubdirectories()) {
+        // Compare both names lowercased so the match does not depend on the caller's casing.
+        if (Common::ToLower(subdir->GetName()) == wanted) {
+            return subdir;
+        }
+    }
+
+    return nullptr; // no subdirectory matched
+#endif
+}
+
 PatchManager::PatchManager(u64 title_id) : title_id(title_id) {}

 PatchManager::~PatchManager() = default;
@@ -104,7 +122,7 @@ VirtualDir PatchManager::PatchExeFS(VirtualDir exefs) const {
            if (std::find(disabled.begin(), disabled.end(), subdir->GetName()) != disabled.end())
                continue;

-            auto exefs_dir = subdir->GetSubdirectory("exefs");
+            auto exefs_dir = FindSubdirectoryCaseless(subdir, "exefs");
            if (exefs_dir != nullptr)
                layers.push_back(std::move(exefs_dir));
        }
@@ -130,7 +148,7 @@ std::vector<VirtualFile> PatchManager::CollectPatches(const std::vector<VirtualD
        if (std::find(disabled.cbegin(), disabled.cend(), subdir->GetName()) != disabled.cend())
            continue;

-        auto exefs_dir = subdir->GetSubdirectory("exefs");
+        auto exefs_dir = FindSubdirectoryCaseless(subdir, "exefs");
        if (exefs_dir != nullptr) {
            for (const auto& file : exefs_dir->GetFiles()) {
                if (file->GetExtension() == "ips") {
@@ -295,7 +313,7 @@ std::vector<Core::Memory::CheatEntry> PatchManager::CreateCheatList(
            continue;
        }

-        auto cheats_dir = subdir->GetSubdirectory("cheats");
+        auto cheats_dir = FindSubdirectoryCaseless(subdir, "cheats");
        if (cheats_dir != nullptr) {
            auto res = ReadCheatFileFromFolder(system, title_id, build_id_, cheats_dir, true);
            if (res.has_value()) {
@@ -340,11 +358,11 @@ static void ApplyLayeredFS(VirtualFile& romfs, u64 title_id, ContentRecordType t
            continue;
        }

-        auto romfs_dir = subdir->GetSubdirectory("romfs");
+        auto romfs_dir = FindSubdirectoryCaseless(subdir, "romfs");
        if (romfs_dir != nullptr)
            layers.push_back(std::move(romfs_dir));

-        auto ext_dir = subdir->GetSubdirectory("romfs_ext");
+        auto ext_dir = FindSubdirectoryCaseless(subdir, "romfs_ext");
        if (ext_dir != nullptr)
            layers_ext.push_back(std::move(ext_dir));
    }
@@ -470,7 +488,7 @@ std::map<std::string, std::string, std::less<>> PatchManager::GetPatchVersionNam
        for (const auto& mod : mod_dir->GetSubdirectories()) {
            std::string types;

-            const auto exefs_dir = mod->GetSubdirectory("exefs");
+            const auto exefs_dir = FindSubdirectoryCaseless(mod, "exefs");
            if (IsDirValidAndNonEmpty(exefs_dir)) {
                bool ips = false;
                bool ipswitch = false;
@@ -494,9 +512,9 @@ std::map<std::string, std::string, std::less<>> PatchManager::GetPatchVersionNam
                if (layeredfs)
                    AppendCommaIfNotEmpty(types, "LayeredExeFS");
            }
-            if (IsDirValidAndNonEmpty(mod->GetSubdirectory("romfs")))
+            if (IsDirValidAndNonEmpty(FindSubdirectoryCaseless(mod, "romfs")))
                AppendCommaIfNotEmpty(types, "LayeredFS");
-            if (IsDirValidAndNonEmpty(mod->GetSubdirectory("cheats")))
+            if (IsDirValidAndNonEmpty(FindSubdirectoryCaseless(mod, "cheats")))
                AppendCommaIfNotEmpty(types, "Cheats");

            if (types.empty())
--- a/src/core/file_sys/patch_manager.h
+++ b/src/core/file_sys/patch_manager.h
@@ -29,6 +29,11 @@ enum class TitleVersionFormat : u8 {
 std::string FormatTitleVersion(u32 version,
                               TitleVersionFormat format = TitleVersionFormat::ThreeElements);

+// Returns a directory with name matching name case-insensitive. Returns nullptr if directory
+// doesn't have a directory with name.
+std::shared_ptr<VfsDirectory> FindSubdirectoryCaseless(const std::shared_ptr<VfsDirectory> dir,
+                                                       std::string_view name);
+
 // A centralized class to manage patches to games.
 class PatchManager {
 public:
--- a/src/core/file_sys/system_archive/mii_model.cpp
+++ b/src/core/file_sys/system_archive/mii_model.cpp
@@ -40,7 +40,7 @@ VirtualDir MiiModel() {
    out->AddFile(std::make_shared<ArrayVfsFile<MiiModelData::SHAPE_MID.size()>>(
        MiiModelData::SHAPE_MID, "ShapeMid.dat"));

-    return std::move(out);
+    return out;
 }

 } // namespace FileSys::SystemArchive
--- a/src/core/file_sys/system_archive/shared_font.cpp
+++ b/src/core/file_sys/system_archive/shared_font.cpp
@@ -23,7 +23,7 @@ VirtualFile PackBFTTF(const std::array<u8, Size>& data, const std::string& name)

    std::vector<u8> bfttf(Size + sizeof(u64));

-    u64 offset = 0;
+    size_t offset = 0;
    Service::NS::EncryptSharedFont(vec, bfttf, offset);
    return std::make_shared<VectorVfsFile>(std::move(bfttf), name);
 }
--- a/src/core/file_sys/system_archive/system_version.cpp
+++ b/src/core/file_sys/system_archive/system_version.cpp
@@ -12,17 +12,17 @@ namespace SystemVersionData {
 // This section should reflect the best system version to describe yuzu's HLE api.
 // TODO(DarkLordZach): Update when HLE gets better.

-constexpr u8 VERSION_MAJOR = 5;
-constexpr u8 VERSION_MINOR = 1;
-constexpr u8 VERSION_MICRO = 0;
+constexpr u8 VERSION_MAJOR = 10;
+constexpr u8 VERSION_MINOR = 0;
+constexpr u8 VERSION_MICRO = 2;

-constexpr u8 REVISION_MAJOR = 3;
+constexpr u8 REVISION_MAJOR = 1;
 constexpr u8 REVISION_MINOR = 0;

 constexpr char PLATFORM_STRING[] = "NX";
-constexpr char VERSION_HASH[] = "23f9df53e25709d756e0c76effcb2473bd3447dd";
-constexpr char DISPLAY_VERSION[] = "5.1.0";
-constexpr char DISPLAY_TITLE[] = "NintendoSDK Firmware for NX 5.1.0-3.0";
+constexpr char VERSION_HASH[] = "f90143fa8bbc061d4f68c35f95f04f8080c0ecdc";
+constexpr char DISPLAY_VERSION[] = "10.0.2";
+constexpr char DISPLAY_TITLE[] = "NintendoSDK Firmware for NX 10.0.2-1.0";

 } // namespace SystemVersionData

--- a/src/core/hle/kernel/memory/memory_manager.cpp
+++ b/src/core/hle/kernel/memory/memory_manager.cpp
@@ -104,7 +104,7 @@ ResultCode MemoryManager::Allocate(PageLinkedList& page_list, std::size_t num_pa
    // Ensure that we don't leave anything un-freed
    auto group_guard = detail::ScopeExit([&] {
        for (const auto& it : page_list.Nodes()) {
-            const auto min_num_pages{std::min(
+            const auto min_num_pages{std::min<size_t>(
                it.GetNumPages(), (chosen_manager.GetEndAddress() - it.GetAddress()) / PageSize)};
            chosen_manager.Free(it.GetAddress(), min_num_pages);
        }
@@ -139,7 +139,6 @@ ResultCode MemoryManager::Allocate(PageLinkedList& page_list, std::size_t num_pa
    }

    // Only succeed if we allocated as many pages as we wanted
-    ASSERT(num_pages >= 0);
    if (num_pages) {
        return ERR_OUT_OF_MEMORY;
    }
@@ -165,7 +164,7 @@ ResultCode MemoryManager::Free(PageLinkedList& page_list, std::size_t num_pages,

    // Free all of the pages
    for (const auto& it : page_list.Nodes()) {
-        const auto min_num_pages{std::min(
+        const auto min_num_pages{std::min<size_t>(
            it.GetNumPages(), (chosen_manager.GetEndAddress() - it.GetAddress()) / PageSize)};
        chosen_manager.Free(it.GetAddress(), min_num_pages);
    }
--- a/src/core/hle/kernel/process.cpp
+++ b/src/core/hle/kernel/process.cpp
@@ -132,7 +132,8 @@ std::shared_ptr<ResourceLimit> Process::GetResourceLimit() const {

 u64 Process::GetTotalPhysicalMemoryAvailable() const {
    const u64 capacity{resource_limit->GetCurrentResourceValue(ResourceType::PhysicalMemory) +
-                       page_table->GetTotalHeapSize() + image_size + main_thread_stack_size};
+                       page_table->GetTotalHeapSize() + GetSystemResourceSize() + image_size +
+                       main_thread_stack_size};

    if (capacity < memory_usage_capacity) {
        return capacity;
@@ -146,7 +147,8 @@ u64 Process::GetTotalPhysicalMemoryAvailableWithoutSystemResource() const {
 }

 u64 Process::GetTotalPhysicalMemoryUsed() const {
-    return image_size + main_thread_stack_size + page_table->GetTotalHeapSize();
+    return image_size + main_thread_stack_size + page_table->GetTotalHeapSize() +
+           GetSystemResourceSize();
 }

 u64 Process::GetTotalPhysicalMemoryUsedWithoutSystemResource() const {
--- a/src/core/hle/kernel/readable_event.cpp
+++ b/src/core/hle/kernel/readable_event.cpp
@@ -38,7 +38,7 @@ void ReadableEvent::Clear() {

 ResultCode ReadableEvent::Reset() {
    if (!is_signaled) {
-        LOG_ERROR(Kernel, "Handle is not signaled! object_id={}, object_type={}, object_name={}",
+        LOG_TRACE(Kernel, "Handle is not signaled! object_id={}, object_type={}, object_name={}",
                  GetObjectId(), GetTypeName(), GetName());
        return ERR_INVALID_STATE;
    }
--- a/src/core/hle/kernel/resource_limit.cpp
+++ b/src/core/hle/kernel/resource_limit.cpp
@@ -24,13 +24,9 @@ bool ResourceLimit::Reserve(ResourceType resource, s64 amount, u64 timeout) {
    const std::size_t index{ResourceTypeToIndex(resource)};

    s64 new_value = current[index] + amount;
-    while (new_value > limit[index] && available[index] + amount <= limit[index]) {
+    if (new_value > limit[index] && available[index] + amount <= limit[index]) {
        // TODO(bunnei): This is wrong for multicore, we should wait the calling thread for timeout
        new_value = current[index] + amount;
-
-        if (timeout >= 0) {
-            break;
-        }
    }

    if (new_value <= limit[index]) {
--- a/src/core/hle/service/acc/acc.cpp
+++ b/src/core/hle/service/acc/acc.cpp
@@ -44,6 +44,218 @@ static constexpr u32 SanitizeJPEGSize(std::size_t size) {
    return static_cast<u32>(std::min(size, max_jpeg_image_size));
 }

+class IManagerForSystemService final : public ServiceFramework<IManagerForSystemService> {
+public:
+    explicit IManagerForSystemService(Common::UUID user_id)
+        : ServiceFramework("IManagerForSystemService") {
+        // clang-format off
+        static const FunctionInfo functions[] = { // every entry registers a nullptr handler
+            {0, nullptr, "CheckAvailability"},
+            {1, nullptr, "GetAccountId"},
+            {2, nullptr, "EnsureIdTokenCacheAsync"},
+            {3, nullptr, "LoadIdTokenCache"},
+            {100, nullptr, "SetSystemProgramIdentification"},
+            {101, nullptr, "RefreshNotificationTokenAsync"}, // 7.0.0+
+            {110, nullptr, "GetServiceEntryRequirementCache"}, // 4.0.0+
+            {111, nullptr, "InvalidateServiceEntryRequirementCache"}, // 4.0.0+
+            {112, nullptr, "InvalidateTokenCache"}, // 4.0.0 - 6.2.0
+            {113, nullptr, "GetServiceEntryRequirementCacheForOnlinePlay"}, // 6.1.0+
+            {120, nullptr, "GetNintendoAccountId"},
+            {121, nullptr, "CalculateNintendoAccountAuthenticationFingerprint"}, // 9.0.0+
+            {130, nullptr, "GetNintendoAccountUserResourceCache"},
+            {131, nullptr, "RefreshNintendoAccountUserResourceCacheAsync"},
+            {132, nullptr, "RefreshNintendoAccountUserResourceCacheAsyncIfSecondsElapsed"},
+            {133, nullptr, "GetNintendoAccountVerificationUrlCache"}, // 9.0.0+
+            {134, nullptr, "RefreshNintendoAccountVerificationUrlCacheAsync"}, // 9.0.0+
+            {135, nullptr, "RefreshNintendoAccountVerificationUrlCacheAsyncIfSecondsElapsed"}, // 9.0.0+
+            {140, nullptr, "GetNetworkServiceLicenseCache"}, // 5.0.0+
+            {141, nullptr, "RefreshNetworkServiceLicenseCacheAsync"}, // 5.0.0+
+            {142, nullptr, "RefreshNetworkServiceLicenseCacheAsyncIfSecondsElapsed"}, // 5.0.0+
+            {150, nullptr, "CreateAuthorizationRequest"},
+        };
+        // clang-format on
+
+        RegisterHandlers(functions);
+    }
+};
+
+// 3.0.0+
+class IFloatingRegistrationRequest final : public ServiceFramework<IFloatingRegistrationRequest> {
+public:
+    explicit IFloatingRegistrationRequest(Common::UUID user_id)
+        : ServiceFramework("IFloatingRegistrationRequest") {
+        // clang-format off
+        static const FunctionInfo functions[] = {
+            {0, nullptr, "GetSessionId"},
+            {12, nullptr, "GetAccountId"},
+            {13, nullptr, "GetLinkedNintendoAccountId"},
+            {14, nullptr, "GetNickname"},
+            {15, nullptr, "GetProfileImage"},
+            {21, nullptr, "LoadIdTokenCache"},
+            {100, nullptr, "RegisterUser"}, // [1.0.0-3.0.2] RegisterAsync
+            {101, nullptr, "RegisterUserWithUid"}, // [1.0.0-3.0.2] RegisterWithUidAsync
+            {102, nullptr, "RegisterNetworkServiceAccountAsync"}, // 4.0.0+
+            {103, nullptr, "RegisterNetworkServiceAccountWithUidAsync"}, // 4.0.0+
+            {110, nullptr, "SetSystemProgramIdentification"},
+            {111, nullptr, "EnsureIdTokenCacheAsync"},
+        };
+        // clang-format on
+
+        RegisterHandlers(functions);
+    }
+};
+
+class IAdministrator final : public ServiceFramework<IAdministrator> {
+public:
+    explicit IAdministrator(Common::UUID user_id) : ServiceFramework("IAdministrator") {
+        // clang-format off
+        static const FunctionInfo functions[] = { // every entry registers a nullptr handler
+            {0, nullptr, "CheckAvailability"},
+            {1, nullptr, "GetAccountId"},
+            {2, nullptr, "EnsureIdTokenCacheAsync"},
+            {3, nullptr, "LoadIdTokenCache"},
+            {100, nullptr, "SetSystemProgramIdentification"},
+            {101, nullptr, "RefreshNotificationTokenAsync"}, // 7.0.0+
+            {110, nullptr, "GetServiceEntryRequirementCache"}, // 4.0.0+
+            {111, nullptr, "InvalidateServiceEntryRequirementCache"}, // 4.0.0+
+            {112, nullptr, "InvalidateTokenCache"}, // 4.0.0 - 6.2.0
+            {113, nullptr, "GetServiceEntryRequirementCacheForOnlinePlay"}, // 6.1.0+
+            {120, nullptr, "GetNintendoAccountId"},
+            {121, nullptr, "CalculateNintendoAccountAuthenticationFingerprint"}, // 9.0.0+
+            {130, nullptr, "GetNintendoAccountUserResourceCache"},
+            {131, nullptr, "RefreshNintendoAccountUserResourceCacheAsync"},
+            {132, nullptr, "RefreshNintendoAccountUserResourceCacheAsyncIfSecondsElapsed"},
+            {133, nullptr, "GetNintendoAccountVerificationUrlCache"}, // 9.0.0+
+            {134, nullptr, "RefreshNintendoAccountVerificationUrlCacheAsync"}, // 9.0.0+
+            {135, nullptr, "RefreshNintendoAccountVerificationUrlCacheAsyncIfSecondsElapsed"}, // 9.0.0+
+            {140, nullptr, "GetNetworkServiceLicenseCache"}, // 5.0.0+
+            {141, nullptr, "RefreshNetworkServiceLicenseCacheAsync"}, // 5.0.0+
+            {142, nullptr, "RefreshNetworkServiceLicenseCacheAsyncIfSecondsElapsed"}, // 5.0.0+
+            {150, nullptr, "CreateAuthorizationRequest"},
+            {200, nullptr, "IsRegistered"},
+            {201, nullptr, "RegisterAsync"},
+            {202, nullptr, "UnregisterAsync"},
+            {203, nullptr, "DeleteRegistrationInfoLocally"},
+            {220, nullptr, "SynchronizeProfileAsync"},
+            {221, nullptr, "UploadProfileAsync"},
+            {222, nullptr, "SynchronizeProfileAsyncIfSecondsElapsed"},
+            {250, nullptr, "IsLinkedWithNintendoAccount"},
+            {251, nullptr, "CreateProcedureToLinkWithNintendoAccount"},
+            {252, nullptr, "ResumeProcedureToLinkWithNintendoAccount"},
+            {255, nullptr, "CreateProcedureToUpdateLinkageStateOfNintendoAccount"},
+            {256, nullptr, "ResumeProcedureToUpdateLinkageStateOfNintendoAccount"},
+            {260, nullptr, "CreateProcedureToLinkNnidWithNintendoAccount"}, // 3.0.0+
+            {261, nullptr, "ResumeProcedureToLinkNnidWithNintendoAccount"}, // 3.0.0+
+            {280, nullptr, "ProxyProcedureToAcquireApplicationAuthorizationForNintendoAccount"},
+            {290, nullptr, "GetRequestForNintendoAccountUserResourceView"}, // 8.0.0+
+            {300, nullptr, "TryRecoverNintendoAccountUserStateAsync"}, // 6.0.0+
+            {400, nullptr, "IsServiceEntryRequirementCacheRefreshRequiredForOnlinePlay"}, // 6.1.0+
+            {401, nullptr, "RefreshServiceEntryRequirementCacheForOnlinePlayAsync"}, // 6.1.0+
+            {900, nullptr, "GetAuthenticationInfoForWin"}, // 9.0.0+
+            {901, nullptr, "ImportAsyncForWin"}, // 9.0.0+
+            {997, nullptr, "DebugUnlinkNintendoAccountAsync"},
+            {998, nullptr, "DebugSetAvailabilityErrorDetail"},
+        };
+        // clang-format on
+
+        RegisterHandlers(functions);
+    }
+};
+
+class IAuthorizationRequest final : public ServiceFramework<IAuthorizationRequest> {
+public:
+    explicit IAuthorizationRequest(Common::UUID user_id)
+        : ServiceFramework("IAuthorizationRequest") {
+        // clang-format off
+        static const FunctionInfo functions[] = {
+            {0, nullptr, "GetSessionId"},
+            {10, nullptr, "InvokeWithoutInteractionAsync"},
+            {19, nullptr, "IsAuthorized"},
+            {20, nullptr, "GetAuthorizationCode"},
+            {21, nullptr, "GetIdToken"},
+            {22, nullptr, "GetState"},
+        };
+        // clang-format on
+
+        RegisterHandlers(functions);
+    }
+};
+
+class IOAuthProcedure final : public ServiceFramework<IOAuthProcedure> {
+public:
+    explicit IOAuthProcedure(Common::UUID user_id) : ServiceFramework("IOAuthProcedure") {
+        // clang-format off
+        static const FunctionInfo functions[] = {
+            {0, nullptr, "PrepareAsync"},
+            {1, nullptr, "GetRequest"},
+            {2, nullptr, "ApplyResponse"},
+            {3, nullptr, "ApplyResponseAsync"},
+            {10, nullptr, "Suspend"},
+        };
+        // clang-format on
+
+        RegisterHandlers(functions);
+    }
+};
+
+// 3.0.0+
+class IOAuthProcedureForExternalNsa final : public ServiceFramework<IOAuthProcedureForExternalNsa> {
+public:
+    explicit IOAuthProcedureForExternalNsa(Common::UUID user_id)
+        : ServiceFramework("IOAuthProcedureForExternalNsa") {
+        // clang-format off
+        static const FunctionInfo functions[] = {
+            {0, nullptr, "PrepareAsync"},
+            {1, nullptr, "GetRequest"},
+            {2, nullptr, "ApplyResponse"},
+            {3, nullptr, "ApplyResponseAsync"},
+            {10, nullptr, "Suspend"},
+            {100, nullptr, "GetAccountId"},
+            {101, nullptr, "GetLinkedNintendoAccountId"},
+            {102, nullptr, "GetNickname"},
+            {103, nullptr, "GetProfileImage"},
+        };
+        // clang-format on
+
+        RegisterHandlers(functions);
+    }
+};
+
+class IOAuthProcedureForNintendoAccountLinkage final
+    : public ServiceFramework<IOAuthProcedureForNintendoAccountLinkage> {
+public:
+    explicit IOAuthProcedureForNintendoAccountLinkage(Common::UUID user_id)
+        : ServiceFramework("IOAuthProcedureForNintendoAccountLinkage") {
+        // clang-format off
+        static const FunctionInfo functions[] = {
+            {0, nullptr, "PrepareAsync"},
+            {1, nullptr, "GetRequest"},
+            {2, nullptr, "ApplyResponse"},
+            {3, nullptr, "ApplyResponseAsync"},
+            {10, nullptr, "Suspend"},
+            {100, nullptr, "GetRequestWithTheme"},
+            {101, nullptr, "IsNetworkServiceAccountReplaced"},
+            {199, nullptr, "GetUrlForIntroductionOfExtraMembership"}, // 2.0.0 - 5.1.0
+        };
+        // clang-format on
+
+        RegisterHandlers(functions);
+    }
+};
+
+class INotifier final : public ServiceFramework<INotifier> {
+public:
+    explicit INotifier(Common::UUID user_id) : ServiceFramework("INotifier") {
+        // clang-format off
+        static const FunctionInfo functions[] = {
+            {0, nullptr, "GetSystemEvent"},
+        };
+        // clang-format on
+
+        RegisterHandlers(functions);
+    }
+};
+
 class IProfileCommon : public ServiceFramework<IProfileCommon> {
 public:
    explicit IProfileCommon(const char* name, bool editor_commands, Common::UUID user_id,
@@ -226,6 +438,54 @@ public:
        : IProfileCommon("IProfileEditor", true, user_id, profile_manager) {}
 };

+class IAsyncContext final : public ServiceFramework<IAsyncContext> { // name table, no handlers
+public:
+    explicit IAsyncContext(Common::UUID user_id) : ServiceFramework("IAsyncContext") { // id unused
+        // clang-format off
+        static const FunctionInfo functions[] = { // each entry: {id, handler, name}
+            {0, nullptr, "GetSystemEvent"},
+            {1, nullptr, "Cancel"},
+            {2, nullptr, "HasDone"},
+            {3, nullptr, "GetResult"},
+        };
+        // clang-format on
+
+        RegisterHandlers(functions);
+    }
+};
+
+class ISessionObject final : public ServiceFramework<ISessionObject> { // single unbound command
+public:
+    explicit ISessionObject(Common::UUID user_id) : ServiceFramework("ISessionObject") { // unused id
+        // clang-format off
+        static const FunctionInfo functions[] = { // each entry: {id, handler, name}
+            {999, nullptr, "Dummy"},
+        };
+        // clang-format on
+
+        RegisterHandlers(functions);
+    }
+};
+
+class IGuestLoginRequest final : public ServiceFramework<IGuestLoginRequest> { // names only
+public:
+    explicit IGuestLoginRequest(Common::UUID) : ServiceFramework("IGuestLoginRequest") { // UUID unnamed, unused
+        // clang-format off
+        static const FunctionInfo functions[] = { // each entry: {id, handler, name}
+            {0, nullptr, "GetSessionId"},
+            {11, nullptr, "Unknown"}, // 1.0.0 - 2.3.0 (the name is blank on Switchbrew)
+            {12, nullptr, "GetAccountId"},
+            {13, nullptr, "GetLinkedNintendoAccountId"},
+            {14, nullptr, "GetNickname"},
+            {15, nullptr, "GetProfileImage"},
+            {21, nullptr, "LoadIdTokenCache"}, // 3.0.0+
+        };
+        // clang-format on
+
+        RegisterHandlers(functions);
+    }
+};
+
 class IManagerForApplication final : public ServiceFramework<IManagerForApplication> {
 public:
    explicit IManagerForApplication(Common::UUID user_id)
@@ -265,6 +525,87 @@ private:
    Common::UUID user_id;
 };

+// 6.0.0+
+class IAsyncNetworkServiceLicenseKindContext final // names commands only; no handlers bound
+    : public ServiceFramework<IAsyncNetworkServiceLicenseKindContext> {
+public:
+    explicit IAsyncNetworkServiceLicenseKindContext(Common::UUID user_id) // user_id is never read
+        : ServiceFramework("IAsyncNetworkServiceLicenseKindContext") {
+        // clang-format off
+        static const FunctionInfo functions[] = { // each entry: {id, handler, name}
+            {0, nullptr, "GetSystemEvent"},
+            {1, nullptr, "Cancel"},
+            {2, nullptr, "HasDone"},
+            {3, nullptr, "GetResult"},
+            {4, nullptr, "GetNetworkServiceLicenseKind"},
+        };
+        // clang-format on
+
+        RegisterHandlers(functions);
+    }
+};
+
+// 8.0.0+
+class IOAuthProcedureForUserRegistration final // names commands only; no handlers bound
+    : public ServiceFramework<IOAuthProcedureForUserRegistration> {
+public:
+    explicit IOAuthProcedureForUserRegistration(Common::UUID user_id) // user_id is never read
+        : ServiceFramework("IOAuthProcedureForUserRegistration") {
+        // clang-format off
+        static const FunctionInfo functions[] = { // each entry: {id, handler, name}
+            {0, nullptr, "PrepareAsync"},
+            {1, nullptr, "GetRequest"},
+            {2, nullptr, "ApplyResponse"},
+            {3, nullptr, "ApplyResponseAsync"},
+            {10, nullptr, "Suspend"},
+            {100, nullptr, "GetAccountId"},
+            {101, nullptr, "GetLinkedNintendoAccountId"},
+            {102, nullptr, "GetNickname"},
+            {103, nullptr, "GetProfileImage"},
+            {110, nullptr, "RegisterUserAsync"},
+            {111, nullptr, "GetUid"},
+        };
+        // clang-format on
+
+        RegisterHandlers(functions);
+    }
+};
+
+class DAUTH_O final : public ServiceFramework<DAUTH_O> { // "dauth:o" name table, no handlers
+public:
+    explicit DAUTH_O(Common::UUID) : ServiceFramework("dauth:o") { // UUID argument unnamed, unused
+        // clang-format off
+        static const FunctionInfo functions[] = { // each entry: {id, handler, name}
+            {0, nullptr, "EnsureAuthenticationTokenCacheAsync"}, // [5.0.0-5.1.0] GeneratePostData
+            {1, nullptr, "LoadAuthenticationTokenCache"}, // 6.0.0+
+            {2, nullptr, "InvalidateAuthenticationTokenCache"}, // 6.0.0+
+            {10, nullptr, "EnsureEdgeTokenCacheAsync"}, // 6.0.0+
+            {11, nullptr, "LoadEdgeTokenCache"}, // 6.0.0+
+            {12, nullptr, "InvalidateEdgeTokenCache"}, // 6.0.0+
+        };
+        // clang-format on
+
+        RegisterHandlers(functions);
+    }
+};
+
+// 6.0.0+
+class IAsyncResult final : public ServiceFramework<IAsyncResult> { // name table, no handlers
+public:
+    explicit IAsyncResult(Common::UUID user_id) : ServiceFramework("IAsyncResult") { // id unused
+        // clang-format off
+        static const FunctionInfo functions[] = { // each entry: {id, handler, name}
+            {0, nullptr, "GetResult"},
+            {1, nullptr, "Cancel"},
+            {2, nullptr, "IsAvailable"},
+            {3, nullptr, "GetSystemEvent"},
+        };
+        // clang-format on
+
+        RegisterHandlers(functions);
+    }
+};
+
 void Module::Interface::GetUserCount(Kernel::HLERequestContext& ctx) {
    LOG_DEBUG(Service_ACC, "called");
    IPC::ResponseBuilder rb{ctx, 3};
--- a/src/core/hle/service/acc/acc_aa.cpp
+++ b/src/core/hle/service/acc/acc_aa.cpp
@@ -13,8 +13,8 @@ ACC_AA::ACC_AA(std::shared_ptr<Module> module, std::shared_ptr<ProfileManager> p
        {0, nullptr, "EnsureCacheAsync"},
        {1, nullptr, "LoadCache"},
        {2, nullptr, "GetDeviceAccountId"},
-        {50, nullptr, "RegisterNotificationTokenAsync"},
-        {51, nullptr, "UnregisterNotificationTokenAsync"},
+        {50, nullptr, "RegisterNotificationTokenAsync"},   // 1.0.0 - 6.2.0
+        {51, nullptr, "UnregisterNotificationTokenAsync"}, // 1.0.0 - 6.2.0
    };
    RegisterHandlers(functions);
 }
--- a/src/core/hle/service/acc/acc_su.cpp
+++ b/src/core/hle/service/acc/acc_su.cpp
@@ -17,28 +17,28 @@ ACC_SU::ACC_SU(std::shared_ptr<Module> module, std::shared_ptr<ProfileManager> p
        {3, &ACC_SU::ListOpenUsers, "ListOpenUsers"},
        {4, &ACC_SU::GetLastOpenedUser, "GetLastOpenedUser"},
        {5, &ACC_SU::GetProfile, "GetProfile"},
-        {6, nullptr, "GetProfileDigest"},
+        {6, nullptr, "GetProfileDigest"}, // 3.0.0+
        {50, &ACC_SU::IsUserRegistrationRequestPermitted, "IsUserRegistrationRequestPermitted"},
        {51, &ACC_SU::TrySelectUserWithoutInteraction, "TrySelectUserWithoutInteraction"},
-        {60, nullptr, "ListOpenContextStoredUsers"},
-        {99, nullptr, "DebugActivateOpenContextRetention"},
+        {60, nullptr, "ListOpenContextStoredUsers"}, // 5.0.0 - 5.1.0
+        {99, nullptr, "DebugActivateOpenContextRetention"}, // 6.0.0+
        {100, nullptr, "GetUserRegistrationNotifier"},
        {101, nullptr, "GetUserStateChangeNotifier"},
        {102, nullptr, "GetBaasAccountManagerForSystemService"},
        {103, nullptr, "GetBaasUserAvailabilityChangeNotifier"},
        {104, nullptr, "GetProfileUpdateNotifier"},
-        {105, nullptr, "CheckNetworkServiceAvailabilityAsync"},
-        {106, nullptr, "GetProfileSyncNotifier"},
+        {105, nullptr, "CheckNetworkServiceAvailabilityAsync"}, // 4.0.0+
+        {106, nullptr, "GetProfileSyncNotifier"}, // 9.0.0+
        {110, nullptr, "StoreSaveDataThumbnail"},
        {111, nullptr, "ClearSaveDataThumbnail"},
        {112, nullptr, "LoadSaveDataThumbnail"},
-        {113, nullptr, "GetSaveDataThumbnailExistence"},
-        {120, nullptr, "ListOpenUsersInApplication"},
-        {130, nullptr, "ActivateOpenContextRetention"},
-        {140, &ACC_SU::ListQualifiedUsers, "ListQualifiedUsers"},
-        {150, nullptr, "AuthenticateApplicationAsync"},
-        {190, nullptr, "GetUserLastOpenedApplication"},
-        {191, nullptr, "ActivateOpenContextHolder"},
+        {113, nullptr, "GetSaveDataThumbnailExistence"}, // 5.0.0+
+        {120, nullptr, "ListOpenUsersInApplication"}, // 10.0.0+
+        {130, nullptr, "ActivateOpenContextRetention"}, // 6.0.0+
+        {140, &ACC_SU::ListQualifiedUsers, "ListQualifiedUsers"}, // 6.0.0+
+        {150, nullptr, "AuthenticateApplicationAsync"}, // 10.0.0+
+        {190, nullptr, "GetUserLastOpenedApplication"}, // 1.0.0 - 9.2.0
+        {191, nullptr, "ActivateOpenContextHolder"}, // 7.0.0+
        {200, nullptr, "BeginUserRegistration"},
        {201, nullptr, "CompleteUserRegistration"},
        {202, nullptr, "CancelUserRegistration"},
@@ -46,15 +46,15 @@ ACC_SU::ACC_SU(std::shared_ptr<Module> module, std::shared_ptr<ProfileManager> p
        {204, nullptr, "SetUserPosition"},
        {205, &ACC_SU::GetProfileEditor, "GetProfileEditor"},
        {206, nullptr, "CompleteUserRegistrationForcibly"},
-        {210, nullptr, "CreateFloatingRegistrationRequest"},
-        {211, nullptr, "CreateProcedureToRegisterUserWithNintendoAccount"},
-        {212, nullptr, "ResumeProcedureToRegisterUserWithNintendoAccount"},
+        {210, nullptr, "CreateFloatingRegistrationRequest"}, // 3.0.0+
+        {211, nullptr, "CreateProcedureToRegisterUserWithNintendoAccount"}, // 8.0.0+
+        {212, nullptr, "ResumeProcedureToRegisterUserWithNintendoAccount"}, // 8.0.0+
        {230, nullptr, "AuthenticateServiceAsync"},
        {250, nullptr, "GetBaasAccountAdministrator"},
        {290, nullptr, "ProxyProcedureForGuestLoginWithNintendoAccount"},
-        {291, nullptr, "ProxyProcedureForFloatingRegistrationWithNintendoAccount"},
+        {291, nullptr, "ProxyProcedureForFloatingRegistrationWithNintendoAccount"}, // 3.0.0+
        {299, nullptr, "SuspendBackgroundDaemon"},
-        {997, nullptr, "DebugInvalidateTokenCacheForUser"},
+        {997, nullptr, "DebugInvalidateTokenCacheForUser"}, // 3.0.0+
        {998, nullptr, "DebugSetUserStateClose"},
        {999, nullptr, "DebugSetUserStateOpen"},
    };
--- a/src/core/hle/service/acc/acc_u0.cpp
+++ b/src/core/hle/service/acc/acc_u0.cpp
@@ -17,23 +17,23 @@ ACC_U0::ACC_U0(std::shared_ptr<Module> module, std::shared_ptr<ProfileManager> p
        {3, &ACC_U0::ListOpenUsers, "ListOpenUsers"},
        {4, &ACC_U0::GetLastOpenedUser, "GetLastOpenedUser"},
        {5, &ACC_U0::GetProfile, "GetProfile"},
-        {6, nullptr, "GetProfileDigest"},
+        {6, nullptr, "GetProfileDigest"}, // 3.0.0+
        {50, &ACC_U0::IsUserRegistrationRequestPermitted, "IsUserRegistrationRequestPermitted"},
        {51, &ACC_U0::TrySelectUserWithoutInteraction, "TrySelectUserWithoutInteraction"},
-        {60, nullptr, "ListOpenContextStoredUsers"},
-        {99, nullptr, "DebugActivateOpenContextRetention"},
+        {60, nullptr, "ListOpenContextStoredUsers"}, // 5.0.0 - 5.1.0
+        {99, nullptr, "DebugActivateOpenContextRetention"}, // 6.0.0+
        {100, &ACC_U0::InitializeApplicationInfo, "InitializeApplicationInfo"},
        {101, &ACC_U0::GetBaasAccountManagerForApplication, "GetBaasAccountManagerForApplication"},
        {102, nullptr, "AuthenticateApplicationAsync"},
-        {103, nullptr, "CheckNetworkServiceAvailabilityAsync"},
+        {103, nullptr, "CheckNetworkServiceAvailabilityAsync"}, // 4.0.0+
        {110, nullptr, "StoreSaveDataThumbnail"},
        {111, nullptr, "ClearSaveDataThumbnail"},
        {120, nullptr, "CreateGuestLoginRequest"},
-        {130, nullptr, "LoadOpenContext"},
-        {131, nullptr, "ListOpenContextStoredUsers"},
-        {140, &ACC_U0::InitializeApplicationInfoRestricted, "InitializeApplicationInfoRestricted"},
-        {141, &ACC_U0::ListQualifiedUsers, "ListQualifiedUsers"},
-        {150, &ACC_U0::IsUserAccountSwitchLocked, "IsUserAccountSwitchLocked"},
+        {130, nullptr, "LoadOpenContext"}, // 5.0.0+
+        {131, nullptr, "ListOpenContextStoredUsers"}, // 6.0.0+
+        {140, &ACC_U0::InitializeApplicationInfoRestricted, "InitializeApplicationInfoRestricted"}, // 6.0.0+
+        {141, &ACC_U0::ListQualifiedUsers, "ListQualifiedUsers"}, // 6.0.0+
+        {150, &ACC_U0::IsUserAccountSwitchLocked, "IsUserAccountSwitchLocked"}, // 6.0.0+
    };
    // clang-format on

--- a/src/core/hle/service/acc/acc_u1.cpp
+++ b/src/core/hle/service/acc/acc_u1.cpp
@@ -17,28 +17,29 @@ ACC_U1::ACC_U1(std::shared_ptr<Module> module, std::shared_ptr<ProfileManager> p
        {3, &ACC_U1::ListOpenUsers, "ListOpenUsers"},
        {4, &ACC_U1::GetLastOpenedUser, "GetLastOpenedUser"},
        {5, &ACC_U1::GetProfile, "GetProfile"},
-        {6, nullptr, "GetProfileDigest"},
+        {6, nullptr, "GetProfileDigest"}, // 3.0.0+
        {50, &ACC_U1::IsUserRegistrationRequestPermitted, "IsUserRegistrationRequestPermitted"},
        {51, &ACC_U1::TrySelectUserWithoutInteraction, "TrySelectUserWithoutInteraction"},
-        {60, nullptr, "ListOpenContextStoredUsers"},
-        {99, nullptr, "DebugActivateOpenContextRetention"},
+        {60, nullptr, "ListOpenContextStoredUsers"}, // 5.0.0 - 5.1.0
+        {99, nullptr, "DebugActivateOpenContextRetention"}, // 6.0.0+
        {100, nullptr, "GetUserRegistrationNotifier"},
        {101, nullptr, "GetUserStateChangeNotifier"},
        {102, nullptr, "GetBaasAccountManagerForSystemService"},
-        {103, nullptr, "GetProfileUpdateNotifier"},
-        {104, nullptr, "CheckNetworkServiceAvailabilityAsync"},
-        {105, nullptr, "GetBaasUserAvailabilityChangeNotifier"},
-        {106, nullptr, "GetProfileSyncNotifier"},
+        {103, nullptr, "GetBaasUserAvailabilityChangeNotifier"},
+        {104, nullptr, "GetProfileUpdateNotifier"},
+        {105, nullptr, "CheckNetworkServiceAvailabilityAsync"}, // 4.0.0+
+        {106, nullptr, "GetProfileSyncNotifier"}, // 9.0.0+
        {110, nullptr, "StoreSaveDataThumbnail"},
        {111, nullptr, "ClearSaveDataThumbnail"},
        {112, nullptr, "LoadSaveDataThumbnail"},
-        {113, nullptr, "GetSaveDataThumbnailExistence"},
-        {130, nullptr, "ActivateOpenContextRetention"},
-        {140, &ACC_U1::ListQualifiedUsers, "ListQualifiedUsers"},
-        {150, nullptr, "AuthenticateApplicationAsync"},
-        {190, nullptr, "GetUserLastOpenedApplication"},
-        {191, nullptr, "ActivateOpenContextHolder"},
-        {997, nullptr, "DebugInvalidateTokenCacheForUser"},
+        {113, nullptr, "GetSaveDataThumbnailExistence"}, // 5.0.0+
+        {120, nullptr, "ListOpenUsersInApplication"}, // 10.0.0+
+        {130, nullptr, "ActivateOpenContextRetention"}, // 6.0.0+
+        {140, &ACC_U1::ListQualifiedUsers, "ListQualifiedUsers"}, // 6.0.0+
+        {150, nullptr, "AuthenticateApplicationAsync"}, // 10.0.0+
+        {190, nullptr, "GetUserLastOpenedApplication"}, // 1.0.0 - 9.2.0
+        {191, nullptr, "ActivateOpenContextHolder"}, // 7.0.0+
+        {997, nullptr, "DebugInvalidateTokenCacheForUser"}, // 3.0.0+
        {998, nullptr, "DebugSetUserStateClose"},
        {999, nullptr, "DebugSetUserStateOpen"},
    };
--- a/src/core/hle/service/am/applets/software_keyboard.cpp
+++ b/src/core/hle/service/am/applets/software_keyboard.cpp
@@ -30,7 +30,7 @@ static Core::Frontend::SoftwareKeyboardParameters ConvertToFrontendParameters(
                                                                       config.sub_text.size());
    params.guide_text = Common::UTF16StringFromFixedZeroTerminatedBuffer(config.guide_text.data(),
                                                                         config.guide_text.size());
-    params.initial_text = initial_text;
+    params.initial_text = std::move(initial_text);
    params.max_length = config.length_limit == 0 ? DEFAULT_MAX_LENGTH : config.length_limit;
    params.password = static_cast<bool>(config.is_password);
    params.cursor_at_beginning = static_cast<bool>(config.initial_cursor_position);
@@ -109,7 +109,7 @@ void SoftwareKeyboard::Execute() {

    const auto parameters = ConvertToFrontendParameters(config, initial_text);

-    frontend.RequestText([this](std::optional<std::u16string> text) { WriteText(text); },
+    frontend.RequestText([this](std::optional<std::u16string> text) { WriteText(std::move(text)); },
                         parameters);
 }

--- a/src/core/hle/service/btdrv/btdrv.cpp
+++ b/src/core/hle/service/btdrv/btdrv.cpp
@@ -58,102 +58,103 @@ public:
            {1, nullptr, "InitializeBluetooth"},
            {2, nullptr, "EnableBluetooth"},
            {3, nullptr, "DisableBluetooth"},
-            {4, nullptr, "CleanupBluetooth"},
+            {4, nullptr, "FinalizeBluetooth"},
            {5, nullptr, "GetAdapterProperties"},
            {6, nullptr, "GetAdapterProperty"},
            {7, nullptr, "SetAdapterProperty"},
-            {8, nullptr, "StartDiscovery"},
-            {9, nullptr, "CancelDiscovery"},
+            {8, nullptr, "StartInquiry"},
+            {9, nullptr, "StopInquiry"},
            {10, nullptr, "CreateBond"},
            {11, nullptr, "RemoveBond"},
            {12, nullptr, "CancelBond"},
-            {13, nullptr, "PinReply"},
-            {14, nullptr, "SspReply"},
+            {13, nullptr, "RespondToPinRequest"},
+            {14, nullptr, "RespondToSspRequest"},
            {15, nullptr, "GetEventInfo"},
            {16, nullptr, "InitializeHid"},
-            {17, nullptr, "HidConnect"},
-            {18, nullptr, "HidDisconnect"},
-            {19, nullptr, "HidSendData"},
-            {20, nullptr, "HidSendData2"},
-            {21, nullptr, "HidSetReport"},
-            {22, nullptr, "HidGetReport"},
-            {23, nullptr, "HidWakeController"},
-            {24, nullptr, "HidAddPairedDevice"},
-            {25, nullptr, "HidGetPairedDevice"},
-            {26, nullptr, "CleanupHid"},
-            {27, nullptr, "HidGetEventInfo"},
-            {28, nullptr, "ExtSetTsi"},
-            {29, nullptr, "ExtSetBurstMode"},
-            {30, nullptr, "ExtSetZeroRetran"},
-            {31, nullptr, "ExtSetMcMode"},
-            {32, nullptr, "ExtStartLlrMode"},
-            {33, nullptr, "ExtExitLlrMode"},
-            {34, nullptr, "ExtSetRadio"},
-            {35, nullptr, "ExtSetVisibility"},
-            {36, nullptr, "ExtSetTbfcScan"},
+            {17, nullptr, "OpenHidConnection"},
+            {18, nullptr, "CloseHidConnection"},
+            {19, nullptr, "WriteHidData"},
+            {20, nullptr, "WriteHidData2"},
+            {21, nullptr, "SetHidReport"},
+            {22, nullptr, "GetHidReport"},
+            {23, nullptr, "TriggerConnection"},
+            {24, nullptr, "AddPairedDeviceInfo"},
+            {25, nullptr, "GetPairedDeviceInfo"},
+            {26, nullptr, "FinalizeHid"},
+            {27, nullptr, "GetHidEventInfo"},
+            {28, nullptr, "SetTsi"},
+            {29, nullptr, "EnableBurstMode"},
+            {30, nullptr, "SetZeroRetransmission"},
+            {31, nullptr, "EnableMcMode"},
+            {32, nullptr, "EnableLlrScan"},
+            {33, nullptr, "DisableLlrScan"},
+            {34, nullptr, "EnableRadio"},
+            {35, nullptr, "SetVisibility"},
+            {36, nullptr, "EnableTbfcScan"},
            {37, nullptr, "RegisterHidReportEvent"},
-            {38, nullptr, "HidGetReportEventInfo"},
+            {38, nullptr, "GetHidReportEventInfo"},
            {39, nullptr, "GetLatestPlr"},
-            {40, nullptr, "ExtGetPendingConnections"},
+            {40, nullptr, "GetPendingConnections"},
            {41, nullptr, "GetChannelMap"},
-            {42, nullptr, "EnableBluetoothBoostSetting"},
-            {43, nullptr, "IsBluetoothBoostSettingEnabled"},
-            {44, nullptr, "EnableBluetoothAfhSetting"},
-            {45, nullptr, "IsBluetoothAfhSettingEnabled"},
-            {46, nullptr, "InitializeBluetoothLe"},
-            {47, nullptr, "EnableBluetoothLe"},
-            {48, nullptr, "DisableBluetoothLe"},
-            {49, nullptr, "CleanupBluetoothLe"},
-            {50, nullptr, "SetLeVisibility"},
-            {51, nullptr, "SetLeConnectionParameter"},
-            {52, nullptr, "SetLeDefaultConnectionParameter"},
-            {53, nullptr, "SetLeAdvertiseData"},
-            {54, nullptr, "SetLeAdvertiseParameter"},
-            {55, nullptr, "StartLeScan"},
-            {56, nullptr, "StopLeScan"},
-            {57, nullptr, "AddLeScanFilterCondition"},
-            {58, nullptr, "DeleteLeScanFilterCondition"},
-            {59, nullptr, "DeleteLeScanFilter"},
-            {60, nullptr, "ClearLeScanFilters"},
-            {61, nullptr, "EnableLeScanFilter"},
-            {62, nullptr, "RegisterLeClient"},
-            {63, nullptr, "UnregisterLeClient"},
-            {64, nullptr, "UnregisterLeClientAll"},
-            {65, nullptr, "LeClientConnect"},
-            {66, nullptr, "LeClientCancelConnection"},
-            {67, nullptr, "LeClientDisconnect"},
-            {68, nullptr, "LeClientGetAttributes"},
-            {69, nullptr, "LeClientDiscoverService"},
-            {70, nullptr, "LeClientConfigureMtu"},
-            {71, nullptr, "RegisterLeServer"},
-            {72, nullptr, "UnregisterLeServer"},
-            {73, nullptr, "LeServerConnect"},
-            {74, nullptr, "LeServerDisconnect"},
-            {75, nullptr, "CreateLeService"},
-            {76, nullptr, "StartLeService"},
-            {77, nullptr, "AddLeCharacteristic"},
-            {78, nullptr, "AddLeDescriptor"},
-            {79, nullptr, "GetLeCoreEventInfo"},
-            {80, nullptr, "LeGetFirstCharacteristic"},
-            {81, nullptr, "LeGetNextCharacteristic"},
-            {82, nullptr, "LeGetFirstDescriptor"},
-            {83, nullptr, "LeGetNextDescriptor"},
-            {84, nullptr, "RegisterLeCoreDataPath"},
-            {85, nullptr, "UnregisterLeCoreDataPath"},
-            {86, nullptr, "RegisterLeHidDataPath"},
-            {87, nullptr, "UnregisterLeHidDataPath"},
-            {88, nullptr, "RegisterLeDataPath"},
-            {89, nullptr, "UnregisterLeDataPath"},
-            {90, nullptr, "LeClientReadCharacteristic"},
-            {91, nullptr, "LeClientReadDescriptor"},
-            {92, nullptr, "LeClientWriteCharacteristic"},
-            {93, nullptr, "LeClientWriteDescriptor"},
-            {94, nullptr, "LeClientRegisterNotification"},
-            {95, nullptr, "LeClientDeregisterNotification"},
+            {42, nullptr, "EnableTxPowerBoostSetting"},
+            {43, nullptr, "IsTxPowerBoostSettingEnabled"},
+            {44, nullptr, "EnableAfhSetting"},
+            {45, nullptr, "IsAfhSettingEnabled"},
+            {46, nullptr, "InitializeBle"},
+            {47, nullptr, "EnableBle"},
+            {48, nullptr, "DisableBle"},
+            {49, nullptr, "FinalizeBle"},
+            {50, nullptr, "SetBleVisibility"},
+            {51, nullptr, "SetBleConnectionParameter"},
+            {52, nullptr, "SetBleDefaultConnectionParameter"},
+            {53, nullptr, "SetBleAdvertiseData"},
+            {54, nullptr, "SetBleAdvertiseParameter"},
+            {55, nullptr, "StartBleScan"},
+            {56, nullptr, "StopBleScan"},
+            {57, nullptr, "AddBleScanFilterCondition"},
+            {58, nullptr, "DeleteBleScanFilterCondition"},
+            {59, nullptr, "DeleteBleScanFilter"},
+            {60, nullptr, "ClearBleScanFilters"},
+            {61, nullptr, "EnableBleScanFilter"},
+            {62, nullptr, "RegisterGattClient"},
+            {63, nullptr, "UnregisterGattClient"},
+            {64, nullptr, "UnregisterAllGattClients"},
+            {65, nullptr, "ConnectGattServer"},
+            {66, nullptr, "CancelConnectGattServer"},
+            {67, nullptr, "DisconnectGattServer"},
+            {68, nullptr, "GetGattAttribute"},
+            {69, nullptr, "GetGattService"},
+            {70, nullptr, "ConfigureAttMtu"},
+            {71, nullptr, "RegisterGattServer"},
+            {72, nullptr, "UnregisterGattServer"},
+            {73, nullptr, "ConnectGattClient"},
+            {74, nullptr, "DisconnectGattClient"},
+            {75, nullptr, "AddGattService"},
+            {76, nullptr, "EnableGattService"},
+            {77, nullptr, "AddGattCharacteristic"},
+            {78, nullptr, "AddGattDescriptor"},
+            {79, nullptr, "GetBleManagedEventInfo"},
+            {80, nullptr, "GetGattFirstCharacteristic"},
+            {81, nullptr, "GetGattNextCharacteristic"},
+            {82, nullptr, "GetGattFirstDescriptor"},
+            {83, nullptr, "GetGattNextDescriptor"},
+            {84, nullptr, "RegisterGattManagedDataPath"},
+            {85, nullptr, "UnregisterGattManagedDataPath"},
+            {86, nullptr, "RegisterGattHidDataPath"},
+            {87, nullptr, "UnregisterGattHidDataPath"},
+            {88, nullptr, "RegisterGattDataPath"},
+            {89, nullptr, "UnregisterGattDataPath"},
+            {90, nullptr, "ReadGattCharacteristic"},
+            {91, nullptr, "ReadGattDescriptor"},
+            {92, nullptr, "WriteGattCharacteristic"},
+            {93, nullptr, "WriteGattDescriptor"},
+            {94, nullptr, "RegisterGattNotification"},
+            {95, nullptr, "UnregisterGattNotification"},
            {96, nullptr, "GetLeHidEventInfo"},
            {97, nullptr, "RegisterBleHidEvent"},
-            {98, nullptr, "SetLeScanParameter"},
-            {256, nullptr, "GetIsManufacturingMode"},
+            {98, nullptr, "SetBleScanParameter"},
+            {99, nullptr, "MoveToSecondaryPiconet"},
+            {256, nullptr, "IsManufacturingMode"},
            {257, nullptr, "EmulateBluetoothCrash"},
            {258, nullptr, "GetBleChannelMap"},
        };
--- a/src/core/hle/service/hid/controllers/keyboard.cpp
+++ b/src/core/hle/service/hid/controllers/keyboard.cpp
@@ -38,10 +38,11 @@ void Controller_Keyboard::OnUpdate(const Core::Timing::CoreTiming& core_timing,
    cur_entry.sampling_number = last_entry.sampling_number + 1;
    cur_entry.sampling_number2 = cur_entry.sampling_number;

+    cur_entry.key.fill(0);
+    cur_entry.modifier = 0;
+
    for (std::size_t i = 0; i < keyboard_keys.size(); ++i) {
-        for (std::size_t k = 0; k < KEYS_PER_BYTE; ++k) {
-            cur_entry.key[i / KEYS_PER_BYTE] |= (keyboard_keys[i]->GetStatus() << k);
-        }
+        cur_entry.key[i / KEYS_PER_BYTE] |= (keyboard_keys[i]->GetStatus() << (i % KEYS_PER_BYTE));
    }

    for (std::size_t i = 0; i < keyboard_mods.size(); ++i) {
--- a/src/core/hle/service/hid/controllers/npad.cpp
+++ b/src/core/hle/service/hid/controllers/npad.cpp
@@ -566,6 +566,14 @@ void Controller_NPad::DisconnectNPad(u32 npad_id) {
    connected_controllers[NPadIdToIndex(npad_id)].is_connected = false;
 }

+void Controller_NPad::SetGyroscopeZeroDriftMode(GyroscopeZeroDriftMode drift_mode) {
+    gyroscope_zero_drift_mode = drift_mode; // plain store; no other state is touched here
+}
+
+Controller_NPad::GyroscopeZeroDriftMode Controller_NPad::GetGyroscopeZeroDriftMode() const {
+    return gyroscope_zero_drift_mode; // currently stored mode, returned by value
+}
+
 void Controller_NPad::StartLRAssignmentMode() {
    // Nothing internally is used for lr assignment mode. Since we have the ability to set the
    // controller types from boot, it doesn't really matter about showing a selection screen
--- a/src/core/hle/service/hid/controllers/npad.h
+++ b/src/core/hle/service/hid/controllers/npad.h
@@ -58,6 +58,12 @@ public:
    };
    static_assert(sizeof(Vibration) == 0x10, "Vibration is an invalid size");

+    enum class GyroscopeZeroDriftMode : u32 { // u32-backed; hid.cpp converts to/from a raw u32
+        Loose = 0,
+        Standard = 1, // value the gyroscope_zero_drift_mode member is initialised with
+        Tight = 2,
+    };
+
    enum class NpadHoldType : u64 {
        Vertical = 0,
        Horizontal = 1,
@@ -117,6 +123,8 @@ public:

    void ConnectNPad(u32 npad_id);
    void DisconnectNPad(u32 npad_id);
+    void SetGyroscopeZeroDriftMode(GyroscopeZeroDriftMode drift_mode);
+    GyroscopeZeroDriftMode GetGyroscopeZeroDriftMode() const;
    LedPattern GetLedPattern(u32 npad_id);
    void SetVibrationEnabled(bool can_vibrate);
    bool IsVibrationEnabled() const;
@@ -324,8 +332,8 @@ private:
    std::array<Kernel::EventPair, 10> styleset_changed_events;
    Vibration last_processed_vibration{};
    std::array<ControllerHolder, 10> connected_controllers{};
+    GyroscopeZeroDriftMode gyroscope_zero_drift_mode{GyroscopeZeroDriftMode::Standard};
    bool can_controllers_vibrate{true};
-
    std::array<ControllerPad, 10> npad_pad_states{};
    bool is_in_lr_assignment_mode{false};
    Core::System& system;
--- a/src/core/hle/service/hid/hid.cpp
+++ b/src/core/hle/service/hid/hid.cpp
@@ -161,7 +161,7 @@ Hid::Hid(Core::System& system) : ServiceFramework("hid"), system(system) {
        {40, nullptr, "AcquireXpadIdEventHandle"},
        {41, nullptr, "ReleaseXpadIdEventHandle"},
        {51, &Hid::ActivateXpad, "ActivateXpad"},
-        {55, nullptr, "GetXpadIds"},
+        {55, &Hid::GetXpadIDs, "GetXpadIds"},
        {56, nullptr, "ActivateJoyXpad"},
        {58, nullptr, "GetJoyXpadLifoHandle"},
        {59, nullptr, "GetJoyXpadIds"},
@@ -185,8 +185,8 @@ Hid::Hid(Core::System& system) : ServiceFramework("hid"), system(system) {
        {77, nullptr, "GetAccelerometerPlayMode"},
        {78, nullptr, "ResetAccelerometerPlayMode"},
        {79, &Hid::SetGyroscopeZeroDriftMode, "SetGyroscopeZeroDriftMode"},
-        {80, nullptr, "GetGyroscopeZeroDriftMode"},
-        {81, nullptr, "ResetGyroscopeZeroDriftMode"},
+        {80, &Hid::GetGyroscopeZeroDriftMode, "GetGyroscopeZeroDriftMode"},
+        {81, &Hid::ResetGyroscopeZeroDriftMode, "ResetGyroscopeZeroDriftMode"},
        {82, &Hid::IsSixAxisSensorAtRest, "IsSixAxisSensorAtRest"},
        {83, nullptr, "IsFirmwareUpdateAvailableForSixAxisSensor"},
        {91, &Hid::ActivateGesture, "ActivateGesture"},
@@ -230,15 +230,15 @@ Hid::Hid(Core::System& system) : ServiceFramework("hid"), system(system) {
        {211, nullptr, "IsVibrationDeviceMounted"},
        {300, &Hid::ActivateConsoleSixAxisSensor, "ActivateConsoleSixAxisSensor"},
        {301, &Hid::StartConsoleSixAxisSensor, "StartConsoleSixAxisSensor"},
-        {302, nullptr, "StopConsoleSixAxisSensor"},
-        {303, nullptr, "ActivateSevenSixAxisSensor"},
-        {304, nullptr, "StartSevenSixAxisSensor"},
+        {302, &Hid::StopConsoleSixAxisSensor, "StopConsoleSixAxisSensor"},
+        {303, &Hid::ActivateSevenSixAxisSensor, "ActivateSevenSixAxisSensor"},
+        {304, &Hid::StartSevenSixAxisSensor, "StartSevenSixAxisSensor"},
        {305, &Hid::StopSevenSixAxisSensor, "StopSevenSixAxisSensor"},
        {306, &Hid::InitializeSevenSixAxisSensor, "InitializeSevenSixAxisSensor"},
-        {307, nullptr, "FinalizeSevenSixAxisSensor"},
+        {307, &Hid::FinalizeSevenSixAxisSensor, "FinalizeSevenSixAxisSensor"},
        {308, nullptr, "SetSevenSixAxisSensorFusionStrength"},
        {309, nullptr, "GetSevenSixAxisSensorFusionStrength"},
-        {310, nullptr, "ResetSevenSixAxisSensorTimestamp"},
+        {310, &Hid::ResetSevenSixAxisSensorTimestamp, "ResetSevenSixAxisSensorTimestamp"},
        {400, nullptr, "IsUsbFullKeyControllerEnabled"},
        {401, nullptr, "EnableUsbFullKeyController"},
        {402, nullptr, "IsUsbFullKeyControllerConnected"},
@@ -319,6 +319,17 @@ void Hid::ActivateXpad(Kernel::HLERequestContext& ctx) {
    rb.Push(RESULT_SUCCESS);
 }

+void Hid::GetXpadIDs(Kernel::HLERequestContext& ctx) { // stub: logs the caller id, replies 0
+    IPC::RequestParser request{ctx};
+    const u64 applet_resource_user_id = request.Pop<u64>();
+
+    LOG_DEBUG(Service_HID, "(STUBBED) called, applet_resource_user_id={}", applet_resource_user_id);
+
+    IPC::ResponseBuilder reply{ctx, 3};
+    reply.Push(RESULT_SUCCESS);
+    reply.Push(0); // fixed placeholder value; nothing else is written to the reply
+}
+
 void Hid::ActivateDebugPad(Kernel::HLERequestContext& ctx) {
    IPC::RequestParser rp{ctx};
    const auto applet_resource_user_id{rp.Pop<u64>()};
@@ -363,6 +374,15 @@ void Hid::ActivateKeyboard(Kernel::HLERequestContext& ctx) {
    rb.Push(RESULT_SUCCESS);
 }

+void Hid::SendKeyboardLockKeyEvent(Kernel::HLERequestContext& ctx) { // stub: only logs flags
+    IPC::RequestParser request{ctx};
+    const u32 flags = request.Pop<u32>();
+    LOG_WARNING(Service_HID, "(STUBBED) called. flags={}", flags);
+
+    IPC::ResponseBuilder reply{ctx, 2};
+    reply.Push(RESULT_SUCCESS);
+}
+
 void Hid::ActivateGesture(Kernel::HLERequestContext& ctx) {
    IPC::RequestParser rp{ctx};
    const auto unknown{rp.Pop<u32>()};
@@ -402,15 +422,59 @@ void Hid::StartSixAxisSensor(Kernel::HLERequestContext& ctx) {
    rb.Push(RESULT_SUCCESS);
 }

+void Hid::StopSixAxisSensor(Kernel::HLERequestContext& ctx) {
+    IPC::RequestParser rp{ctx};
+    const auto handle{rp.Pop<u32>()};
+    const auto applet_resource_user_id{rp.Pop<u64>()};
+
+    LOG_WARNING(Service_HID, "(STUBBED) called, handle={}, applet_resource_user_id={}", handle,
+                applet_resource_user_id);
+
+    IPC::ResponseBuilder rb{ctx, 2};
+    rb.Push(RESULT_SUCCESS);
+}
+
 void Hid::SetGyroscopeZeroDriftMode(Kernel::HLERequestContext& ctx) {
    IPC::RequestParser rp{ctx};
    const auto handle{rp.Pop<u32>()};
    const auto drift_mode{rp.Pop<u32>()};
    const auto applet_resource_user_id{rp.Pop<u64>()};

-    LOG_WARNING(Service_HID,
-                "(STUBBED) called, handle={}, drift_mode={}, applet_resource_user_id={}", handle,
-                drift_mode, applet_resource_user_id);
+    applet_resource->GetController<Controller_NPad>(HidController::NPad)
+        .SetGyroscopeZeroDriftMode(Controller_NPad::GyroscopeZeroDriftMode{drift_mode});
+
+    LOG_DEBUG(Service_HID, "called, handle={}, drift_mode={}, applet_resource_user_id={}", handle,
+              drift_mode, applet_resource_user_id);
+
+    IPC::ResponseBuilder rb{ctx, 2};
+    rb.Push(RESULT_SUCCESS);
+}
+
+void Hid::GetGyroscopeZeroDriftMode(Kernel::HLERequestContext& ctx) {
+    IPC::RequestParser rp{ctx};
+    const auto handle{rp.Pop<u32>()};
+    const auto applet_resource_user_id{rp.Pop<u64>()};
+
+    LOG_DEBUG(Service_HID, "called, handle={}, applet_resource_user_id={}", handle,
+              applet_resource_user_id);
+
+    IPC::ResponseBuilder rb{ctx, 3};
+    rb.Push(RESULT_SUCCESS);
+    rb.Push<u32>(
+        static_cast<u32>(applet_resource->GetController<Controller_NPad>(HidController::NPad)
+                             .GetGyroscopeZeroDriftMode()));
+}
+
+void Hid::ResetGyroscopeZeroDriftMode(Kernel::HLERequestContext& ctx) {
+    IPC::RequestParser rp{ctx};
+    const auto handle{rp.Pop<u32>()};
+    const auto applet_resource_user_id{rp.Pop<u64>()};
+
+    applet_resource->GetController<Controller_NPad>(HidController::NPad)
+        .SetGyroscopeZeroDriftMode(Controller_NPad::GyroscopeZeroDriftMode::Standard);
+
+    LOG_DEBUG(Service_HID, "called, handle={}, applet_resource_user_id={}", handle,
+              applet_resource_user_id);

    IPC::ResponseBuilder rb{ctx, 2};
    rb.Push(RESULT_SUCCESS);
@@ -821,33 +885,35 @@ void Hid::StartConsoleSixAxisSensor(Kernel::HLERequestContext& ctx) {
    rb.Push(RESULT_SUCCESS);
 }

-void Hid::StopSixAxisSensor(Kernel::HLERequestContext& ctx) {
+void Hid::StopConsoleSixAxisSensor(Kernel::HLERequestContext& ctx) {
    IPC::RequestParser rp{ctx};
    const auto handle{rp.Pop<u32>()};
+    const auto applet_resource_user_id{rp.Pop<u64>()};

-    LOG_WARNING(Service_HID, "(STUBBED) called, handle={}", handle);
+    LOG_WARNING(Service_HID, "(STUBBED) called, handle={}, applet_resource_user_id={}", handle,
+                applet_resource_user_id);

    IPC::ResponseBuilder rb{ctx, 2};
    rb.Push(RESULT_SUCCESS);
 }

-void Hid::SetIsPalmaAllConnectable(Kernel::HLERequestContext& ctx) {
+void Hid::ActivateSevenSixAxisSensor(Kernel::HLERequestContext& ctx) {
    IPC::RequestParser rp{ctx};
    const auto applet_resource_user_id{rp.Pop<u64>()};
-    const auto unknown{rp.Pop<u32>()};

-    LOG_WARNING(Service_HID, "(STUBBED) called, applet_resource_user_id={}, unknown={}",
-                applet_resource_user_id, unknown);
+    LOG_WARNING(Service_HID, "(STUBBED) called, applet_resource_user_id={}",
+                applet_resource_user_id);

    IPC::ResponseBuilder rb{ctx, 2};
    rb.Push(RESULT_SUCCESS);
 }

-void Hid::SetPalmaBoostMode(Kernel::HLERequestContext& ctx) {
+void Hid::StartSevenSixAxisSensor(Kernel::HLERequestContext& ctx) {
    IPC::RequestParser rp{ctx};
-    const auto unknown{rp.Pop<u32>()};
+    const auto applet_resource_user_id{rp.Pop<u64>()};

-    LOG_WARNING(Service_HID, "(STUBBED) called, unknown={}", unknown);
+    LOG_WARNING(Service_HID, "(STUBBED) called, applet_resource_user_id={}",
+                applet_resource_user_id);

    IPC::ResponseBuilder rb{ctx, 2};
    rb.Push(RESULT_SUCCESS);
@@ -871,10 +937,46 @@ void Hid::InitializeSevenSixAxisSensor(Kernel::HLERequestContext& ctx) {
    rb.Push(RESULT_SUCCESS);
 }

-void Hid::SendKeyboardLockKeyEvent(Kernel::HLERequestContext& ctx) {
+void Hid::FinalizeSevenSixAxisSensor(Kernel::HLERequestContext& ctx) {
    IPC::RequestParser rp{ctx};
-    const auto flags{rp.Pop<u32>()};
-    LOG_WARNING(Service_HID, "(STUBBED) called. flags={}", flags);
+    const auto applet_resource_user_id{rp.Pop<u64>()};
+
+    LOG_WARNING(Service_HID, "(STUBBED) called, applet_resource_user_id={}",
+                applet_resource_user_id);
+
+    IPC::ResponseBuilder rb{ctx, 2};
+    rb.Push(RESULT_SUCCESS);
+}
+
+void Hid::ResetSevenSixAxisSensorTimestamp(Kernel::HLERequestContext& ctx) {
+    IPC::RequestParser rp{ctx};
+    const auto applet_resource_user_id{rp.Pop<u64>()};
+
+    LOG_WARNING(Service_HID, "(STUBBED) called, applet_resource_user_id={}",
+                applet_resource_user_id);
+
+    IPC::ResponseBuilder rb{ctx, 2};
+    rb.Push(RESULT_SUCCESS);
+}
+
+void Hid::SetIsPalmaAllConnectable(Kernel::HLERequestContext& ctx) {
+    IPC::RequestParser rp{ctx};
+    const auto applet_resource_user_id{rp.Pop<u64>()};
+    const auto is_palma_all_connectable{rp.Pop<bool>()};
+
+    LOG_WARNING(Service_HID,
+                "(STUBBED) called, applet_resource_user_id={}, is_palma_all_connectable={}",
+                applet_resource_user_id, is_palma_all_connectable);
+
+    IPC::ResponseBuilder rb{ctx, 2};
+    rb.Push(RESULT_SUCCESS);
+}
+
+void Hid::SetPalmaBoostMode(Kernel::HLERequestContext& ctx) {
+    IPC::RequestParser rp{ctx};
+    const auto palma_boost_mode{rp.Pop<bool>()};
+
+    LOG_WARNING(Service_HID, "(STUBBED) called, palma_boost_mode={}", palma_boost_mode);

    IPC::ResponseBuilder rb{ctx, 2};
    rb.Push(RESULT_SUCCESS);
--- a/src/core/hle/service/hid/hid.h
+++ b/src/core/hle/service/hid/hid.h
@@ -86,14 +86,19 @@ public:
 private:
    void CreateAppletResource(Kernel::HLERequestContext& ctx);
    void ActivateXpad(Kernel::HLERequestContext& ctx);
+    void GetXpadIDs(Kernel::HLERequestContext& ctx);
    void ActivateDebugPad(Kernel::HLERequestContext& ctx);
    void ActivateTouchScreen(Kernel::HLERequestContext& ctx);
    void ActivateMouse(Kernel::HLERequestContext& ctx);
    void ActivateKeyboard(Kernel::HLERequestContext& ctx);
+    void SendKeyboardLockKeyEvent(Kernel::HLERequestContext& ctx);
    void ActivateGesture(Kernel::HLERequestContext& ctx);
    void ActivateNpadWithRevision(Kernel::HLERequestContext& ctx);
    void StartSixAxisSensor(Kernel::HLERequestContext& ctx);
+    void StopSixAxisSensor(Kernel::HLERequestContext& ctx);
    void SetGyroscopeZeroDriftMode(Kernel::HLERequestContext& ctx);
+    void GetGyroscopeZeroDriftMode(Kernel::HLERequestContext& ctx);
+    void ResetGyroscopeZeroDriftMode(Kernel::HLERequestContext& ctx);
    void IsSixAxisSensorAtRest(Kernel::HLERequestContext& ctx);
    void SetSupportedNpadStyleSet(Kernel::HLERequestContext& ctx);
    void GetSupportedNpadStyleSet(Kernel::HLERequestContext& ctx);
@@ -125,12 +130,15 @@ private:
    void IsVibrationPermitted(Kernel::HLERequestContext& ctx);
    void ActivateConsoleSixAxisSensor(Kernel::HLERequestContext& ctx);
    void StartConsoleSixAxisSensor(Kernel::HLERequestContext& ctx);
-    void StopSixAxisSensor(Kernel::HLERequestContext& ctx);
-    void SetIsPalmaAllConnectable(Kernel::HLERequestContext& ctx);
-    void SetPalmaBoostMode(Kernel::HLERequestContext& ctx);
+    void StopConsoleSixAxisSensor(Kernel::HLERequestContext& ctx);
+    void ActivateSevenSixAxisSensor(Kernel::HLERequestContext& ctx);
+    void StartSevenSixAxisSensor(Kernel::HLERequestContext& ctx);
    void StopSevenSixAxisSensor(Kernel::HLERequestContext& ctx);
    void InitializeSevenSixAxisSensor(Kernel::HLERequestContext& ctx);
-    void SendKeyboardLockKeyEvent(Kernel::HLERequestContext& ctx);
+    void FinalizeSevenSixAxisSensor(Kernel::HLERequestContext& ctx);
+    void ResetSevenSixAxisSensorTimestamp(Kernel::HLERequestContext& ctx);
+    void SetIsPalmaAllConnectable(Kernel::HLERequestContext& ctx);
+    void SetPalmaBoostMode(Kernel::HLERequestContext& ctx);

    std::shared_ptr<IAppletResource> applet_resource;
    Core::System& system;
--- a/src/core/hle/service/lm/manager.cpp
+++ b/src/core/hle/service/lm/manager.cpp
@@ -86,7 +86,8 @@ std::string FormatField(Field type, const std::vector<u8>& data) {
        return Common::StringFromFixedZeroTerminatedBuffer(
            reinterpret_cast<const char*>(data.data()), data.size());
    default:
-        UNIMPLEMENTED();
+        UNIMPLEMENTED_MSG("Unimplemented field type={}", type);
+        return "";
    }
 }

--- a/src/core/hle/service/nifm/nifm.cpp
+++ b/src/core/hle/service/nifm/nifm.cpp
@@ -177,7 +177,8 @@ private:
    void CreateTemporaryNetworkProfile(Kernel::HLERequestContext& ctx) {
        LOG_DEBUG(Service_NIFM, "called");

-        ASSERT_MSG(ctx.GetReadBufferSize() == 0x17c, "NetworkProfileData is not the correct size");
+        ASSERT_MSG(ctx.GetReadBufferSize() == 0x17c,
+                   "SfNetworkProfileData is not the correct size");
        u128 uuid{};
        auto buffer = ctx.ReadBuffer();
        std::memcpy(&uuid, buffer.data() + 8, sizeof(u128));
--- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp
@@ -25,7 +25,7 @@ u32 nvhost_ctrl_gpu::ioctl(Ioctl command, const std::vector<u8>& input,
    case IoctlCommand::IocGetCharacteristicsCommand:
        return GetCharacteristics(input, output, output2, version);
    case IoctlCommand::IocGetTPCMasksCommand:
-        return GetTPCMasks(input, output);
+        return GetTPCMasks(input, output, output2, version);
    case IoctlCommand::IocGetActiveSlotMaskCommand:
        return GetActiveSlotMask(input, output);
    case IoctlCommand::IocZcullGetCtxSizeCommand:
@@ -98,17 +98,22 @@ u32 nvhost_ctrl_gpu::GetCharacteristics(const std::vector<u8>& input, std::vecto
    return 0;
 }

-u32 nvhost_ctrl_gpu::GetTPCMasks(const std::vector<u8>& input, std::vector<u8>& output) {
+u32 nvhost_ctrl_gpu::GetTPCMasks(const std::vector<u8>& input, std::vector<u8>& output,
+                                 std::vector<u8>& output2, IoctlVersion version) {
    IoctlGpuGetTpcMasksArgs params{};
    std::memcpy(&params, input.data(), input.size());
-    LOG_INFO(Service_NVDRV, "called, mask=0x{:X}, mask_buf_addr=0x{:X}", params.mask_buf_size,
-             params.mask_buf_addr);
-    // TODO(ogniK): Confirm value on hardware
-    if (params.mask_buf_size)
-        params.tpc_mask_size = 4 * 1; // 4 * num_gpc
-    else
-        params.tpc_mask_size = 0;
-    std::memcpy(output.data(), &params, sizeof(params));
+    LOG_DEBUG(Service_NVDRV, "called, mask_buffer_size=0x{:X}", params.mask_buffer_size);
+    if (params.mask_buffer_size != 0) {
+        params.tcp_mask = 3;
+    }
+
+    if (version == IoctlVersion::Version3) {
+        std::memcpy(output.data(), input.data(), output.size());
+        std::memcpy(output2.data(), &params.tcp_mask, output2.size());
+    } else {
+        std::memcpy(output.data(), &params, output.size());
+    }
+
    return 0;
 }

--- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h
@@ -92,16 +92,11 @@ private:
                  "IoctlCharacteristics is incorrect size");

    struct IoctlGpuGetTpcMasksArgs {
-        /// [in]  TPC mask buffer size reserved by userspace. Should be at least
-        /// sizeof(__u32) * fls(gpc_mask) to receive TPC mask for each GPC.
-        /// [out] full kernel buffer size
-        u32_le mask_buf_size;
-        u32_le reserved;
-
-        /// [in]  pointer to TPC mask buffer. It will receive one 32-bit TPC mask per GPC or 0 if
-        /// GPC is not enabled or not present. This parameter is ignored if mask_buf_size is 0.
-        u64_le mask_buf_addr;
-        u64_le tpc_mask_size; // Nintendo add this?
+        u32_le mask_buffer_size{};
+        INSERT_PADDING_WORDS(1);
+        u64_le mask_buffer_address{};
+        u32_le tcp_mask{};
+        INSERT_PADDING_WORDS(1);
    };
    static_assert(sizeof(IoctlGpuGetTpcMasksArgs) == 24,
                  "IoctlGpuGetTpcMasksArgs is incorrect size");
@@ -166,7 +161,8 @@ private:

    u32 GetCharacteristics(const std::vector<u8>& input, std::vector<u8>& output,
                           std::vector<u8>& output2, IoctlVersion version);
-    u32 GetTPCMasks(const std::vector<u8>& input, std::vector<u8>& output);
+    u32 GetTPCMasks(const std::vector<u8>& input, std::vector<u8>& output, std::vector<u8>& output2,
+                    IoctlVersion version);
    u32 GetActiveSlotMask(const std::vector<u8>& input, std::vector<u8>& output);
    u32 ZCullGetCtxSize(const std::vector<u8>& input, std::vector<u8>& output);
    u32 ZCullGetInfo(const std::vector<u8>& input, std::vector<u8>& output);
--- a/src/core/settings.cpp
+++ b/src/core/settings.cpp
@@ -112,6 +112,7 @@ void LogSettings() {
    LogSetting("Renderer_UseAsynchronousGpuEmulation",
               Settings::values.use_asynchronous_gpu_emulation);
    LogSetting("Renderer_UseVsync", Settings::values.use_vsync);
+    LogSetting("Renderer_UseAssemblyShaders", Settings::values.use_assembly_shaders);
    LogSetting("Renderer_AnisotropicFilteringLevel", Settings::values.max_anisotropy);
    LogSetting("Audio_OutputEngine", Settings::values.sink_id);
    LogSetting("Audio_EnableAudioStretching", Settings::values.enable_audio_stretching);
--- a/src/core/settings.h
+++ b/src/core/settings.h
@@ -437,7 +437,7 @@ struct Values {
    bool renderer_debug;
    int vulkan_device;

-    float resolution_factor;
+    u16 resolution_factor{1};
    int aspect_ratio;
    int max_anisotropy;
    bool use_frame_limit;
@@ -446,6 +446,7 @@ struct Values {
    GPUAccuracy gpu_accuracy;
    bool use_asynchronous_gpu_emulation;
    bool use_vsync;
+    bool use_assembly_shaders;
    bool force_30fps_mode;
    bool use_fast_gpu_time;

@@ -473,6 +474,7 @@ struct Values {
    bool reporting_services;
    bool quest_flag;
    bool disable_cpu_opt;
+    bool disable_macro_jit;

    // BCAT
    std::string bcat_backend;
--- a/src/core/telemetry_session.cpp
+++ b/src/core/telemetry_session.cpp
@@ -201,6 +201,7 @@ void TelemetrySession::AddInitialInfo(Loader::AppLoader& app_loader) {
    AddField(field_type, "Renderer_UseAsynchronousGpuEmulation",
             Settings::values.use_asynchronous_gpu_emulation);
    AddField(field_type, "Renderer_UseVsync", Settings::values.use_vsync);
+    AddField(field_type, "Renderer_UseAssemblyShaders", Settings::values.use_assembly_shaders);
    AddField(field_type, "System_UseDockedMode", Settings::values.use_docked_mode);
 }

--- a/src/input_common/keyboard.cpp
+++ b/src/input_common/keyboard.cpp
@@ -76,7 +76,7 @@ std::unique_ptr<Input::ButtonDevice> Keyboard::Create(const Common::ParamPackage
    int key_code = params.Get("code", 0);
    std::unique_ptr<KeyButton> button = std::make_unique<KeyButton>(key_button_list);
    key_button_list->AddKeyButton(key_code, button.get());
-    return std::move(button);
+    return button;
 }

 void Keyboard::PressKey(int key_code) {
--- a/src/input_common/motion_emu.cpp
+++ b/src/input_common/motion_emu.cpp
@@ -145,7 +145,7 @@ std::unique_ptr<Input::MotionDevice> MotionEmu::Create(const Common::ParamPackag
    // Previously created device is disconnected here. Having two motion devices for 3DS is not
    // expected.
    current_device = device_wrapper->device;
-    return std::move(device_wrapper);
+    return device_wrapper;
 }

 void MotionEmu::BeginTilt(int x, int y) {
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_library(video_core STATIC
    buffer_cache/buffer_block.h
    buffer_cache/buffer_cache.h
+    buffer_cache/map_interval.cpp
    buffer_cache/map_interval.h
    dirty_flags.cpp
    dirty_flags.h
@@ -24,6 +25,14 @@ add_library(video_core STATIC
    engines/shader_bytecode.h
    engines/shader_header.h
    engines/shader_type.h
+    macro/macro.cpp
+    macro/macro.h
+    macro/macro_hle.cpp
+    macro/macro_hle.h
+    macro/macro_interpreter.cpp
+    macro/macro_interpreter.h
+    macro/macro_jit_x64.cpp
+    macro/macro_jit_x64.h
    fence_manager.h
    gpu.cpp
    gpu.h
@@ -35,8 +44,6 @@ add_library(video_core STATIC
    gpu_thread.h
    guest_driver.cpp
    guest_driver.h
-    macro_interpreter.cpp
-    macro_interpreter.h
    memory_manager.cpp
    memory_manager.h
    morton.cpp
@@ -44,11 +51,11 @@ add_library(video_core STATIC
    query_cache.h
    rasterizer_accelerated.cpp
    rasterizer_accelerated.h
-    rasterizer_cache.cpp
-    rasterizer_cache.h
    rasterizer_interface.h
    renderer_base.cpp
    renderer_base.h
+    renderer_opengl/gl_arb_decompiler.cpp
+    renderer_opengl/gl_arb_decompiler.h
    renderer_opengl/gl_buffer_cache.cpp
    renderer_opengl/gl_buffer_cache.h
    renderer_opengl/gl_device.cpp
@@ -88,6 +95,7 @@ add_library(video_core STATIC
    renderer_opengl/utils.h
    sampler_cache.cpp
    sampler_cache.h
+    shader_cache.h
    shader/decode/arithmetic.cpp
    shader/decode/arithmetic_immediate.cpp
    shader/decode/bfe.cpp
@@ -228,7 +236,7 @@ endif()
 create_target_directory_groups(video_core)

 target_link_libraries(video_core PUBLIC common core)
-target_link_libraries(video_core PRIVATE glad)
+target_link_libraries(video_core PRIVATE glad xbyak)

 if (ENABLE_VULKAN)
    target_include_directories(video_core PRIVATE sirit ../../externals/Vulkan-Headers/include)
--- a/src/video_core/buffer_cache/buffer_block.h
+++ b/src/video_core/buffer_cache/buffer_block.h
@@ -15,48 +15,47 @@ namespace VideoCommon {

 class BufferBlock {
 public:
-    bool Overlaps(const VAddr start, const VAddr end) const {
+    bool Overlaps(VAddr start, VAddr end) const {
        return (cpu_addr < end) && (cpu_addr_end > start);
    }

-    bool IsInside(const VAddr other_start, const VAddr other_end) const {
+    bool IsInside(VAddr other_start, VAddr other_end) const {
        return cpu_addr <= other_start && other_end <= cpu_addr_end;
    }

-    std::size_t GetOffset(const VAddr in_addr) {
+    std::size_t Offset(VAddr in_addr) const {
        return static_cast<std::size_t>(in_addr - cpu_addr);
    }

-    VAddr GetCpuAddr() const {
+    VAddr CpuAddr() const {
        return cpu_addr;
    }

-    VAddr GetCpuAddrEnd() const {
+    VAddr CpuAddrEnd() const {
        return cpu_addr_end;
    }

-    void SetCpuAddr(const VAddr new_addr) {
+    void SetCpuAddr(VAddr new_addr) {
        cpu_addr = new_addr;
        cpu_addr_end = new_addr + size;
    }

-    std::size_t GetSize() const {
+    std::size_t Size() const {
        return size;
    }

+    u64 Epoch() const {
+        return epoch;
+    }
+
    void SetEpoch(u64 new_epoch) {
        epoch = new_epoch;
    }

-    u64 GetEpoch() {
-        return epoch;
-    }
-
 protected:
-    explicit BufferBlock(VAddr cpu_addr, const std::size_t size) : size{size} {
-        SetCpuAddr(cpu_addr);
+    explicit BufferBlock(VAddr cpu_addr_, std::size_t size_) : size{size_} {
+        SetCpuAddr(cpu_addr_);
    }
-    ~BufferBlock() = default;

 private:
    VAddr cpu_addr{};
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -12,11 +12,12 @@
 #include <utility>
 #include <vector>

-#include <boost/icl/interval_map.hpp>
+#include <boost/container/small_vector.hpp>
 #include <boost/icl/interval_set.hpp>
-#include <boost/range/iterator_range.hpp>
+#include <boost/intrusive/set.hpp>

 #include "common/alignment.h"
+#include "common/assert.h"
 #include "common/common_types.h"
 #include "common/logging/log.h"
 #include "core/core.h"
@@ -29,102 +30,124 @@

 namespace VideoCommon {

-using MapInterval = std::shared_ptr<MapIntervalBase>;
-
-template <typename OwnerBuffer, typename BufferType, typename StreamBuffer>
+template <typename Buffer, typename BufferType, typename StreamBuffer>
 class BufferCache {
+    using IntervalSet = boost::icl::interval_set<VAddr>;
+    using IntervalType = typename IntervalSet::interval_type;
+    using VectorMapInterval = boost::container::small_vector<MapInterval*, 1>;
+
+    static constexpr u64 WRITE_PAGE_BIT = 11;
+    static constexpr u64 BLOCK_PAGE_BITS = 21;
+    static constexpr u64 BLOCK_PAGE_SIZE = 1ULL << BLOCK_PAGE_BITS;
+
 public:
-    using BufferInfo = std::pair<BufferType, u64>;
+    struct BufferInfo {
+        BufferType handle;
+        u64 offset;
+        u64 address;
+    };

    BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4,
                            bool is_written = false, bool use_fast_cbuf = false) {
        std::lock_guard lock{mutex};

-        const std::optional<VAddr> cpu_addr_opt =
-            system.GPU().MemoryManager().GpuToCpuAddress(gpu_addr);
-
+        auto& memory_manager = system.GPU().MemoryManager();
+        const std::optional<VAddr> cpu_addr_opt = memory_manager.GpuToCpuAddress(gpu_addr);
        if (!cpu_addr_opt) {
-            return {GetEmptyBuffer(size), 0};
+            return GetEmptyBuffer(size);
        }
-
-        VAddr cpu_addr = *cpu_addr_opt;
+        const VAddr cpu_addr = *cpu_addr_opt;

        // Cache management is a big overhead, so only cache entries with a given size.
        // TODO: Figure out which size is the best for given games.
        constexpr std::size_t max_stream_size = 0x800;
        if (use_fast_cbuf || size < max_stream_size) {
            if (!is_written && !IsRegionWritten(cpu_addr, cpu_addr + size - 1)) {
-                auto& memory_manager = system.GPU().MemoryManager();
+                const bool is_granular = memory_manager.IsGranularRange(gpu_addr, size);
                if (use_fast_cbuf) {
-                    if (memory_manager.IsGranularRange(gpu_addr, size)) {
-                        const auto host_ptr = memory_manager.GetPointer(gpu_addr);
-                        return ConstBufferUpload(host_ptr, size);
+                    u8* dest;
+                    if (is_granular) {
+                        dest = memory_manager.GetPointer(gpu_addr);
                    } else {
                        staging_buffer.resize(size);
-                        memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size);
-                        return ConstBufferUpload(staging_buffer.data(), size);
+                        dest = staging_buffer.data();
+                        memory_manager.ReadBlockUnsafe(gpu_addr, dest, size);
                    }
+                    return ConstBufferUpload(dest, size);
+                }
+                if (is_granular) {
+                    u8* const host_ptr = memory_manager.GetPointer(gpu_addr);
+                    return StreamBufferUpload(size, alignment, [host_ptr, size](u8* dest) {
+                        std::memcpy(dest, host_ptr, size);
+                    });
                } else {
-                    if (memory_manager.IsGranularRange(gpu_addr, size)) {
-                        const auto host_ptr = memory_manager.GetPointer(gpu_addr);
-                        return StreamBufferUpload(host_ptr, size, alignment);
-                    } else {
-                        staging_buffer.resize(size);
-                        memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size);
-                        return StreamBufferUpload(staging_buffer.data(), size, alignment);
-                    }
+                    return StreamBufferUpload(
+                        size, alignment, [&memory_manager, gpu_addr, size](u8* dest) {
+                            memory_manager.ReadBlockUnsafe(gpu_addr, dest, size);
+                        });
                }
            }
        }

-        auto block = GetBlock(cpu_addr, size);
-        auto map = MapAddress(block, gpu_addr, cpu_addr, size);
+        Buffer* const block = GetBlock(cpu_addr, size);
+        MapInterval* const map = MapAddress(block, gpu_addr, cpu_addr, size);
+        if (!map) {
+            return GetEmptyBuffer(size);
+        }
        if (is_written) {
            map->MarkAsModified(true, GetModifiedTicks());
            if (Settings::IsGPULevelHigh() && Settings::values.use_asynchronous_gpu_emulation) {
                MarkForAsyncFlush(map);
            }
-            if (!map->IsWritten()) {
-                map->MarkAsWritten(true);
-                MarkRegionAsWritten(map->GetStart(), map->GetEnd() - 1);
+            if (!map->is_written) {
+                map->is_written = true;
+                MarkRegionAsWritten(map->start, map->end - 1);
            }
        }

-        return {ToHandle(block), static_cast<u64>(block->GetOffset(cpu_addr))};
+        return BufferInfo{block->Handle(), block->Offset(cpu_addr), block->Address()};
    }

    /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset.
    BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size,
                                std::size_t alignment = 4) {
        std::lock_guard lock{mutex};
-        return StreamBufferUpload(raw_pointer, size, alignment);
+        return StreamBufferUpload(size, alignment, [raw_pointer, size](u8* dest) {
+            std::memcpy(dest, raw_pointer, size);
+        });
    }

-    void Map(std::size_t max_size) {
+    /// Prepares the buffer cache for data uploading
+    /// @param max_size Maximum number of bytes that will be uploaded
+    /// @return True when a stream buffer invalidation was required, false otherwise
+    bool Map(std::size_t max_size) {
        std::lock_guard lock{mutex};

+        bool invalidated;
        std::tie(buffer_ptr, buffer_offset_base, invalidated) = stream_buffer->Map(max_size, 4);
        buffer_offset = buffer_offset_base;
+
+        return invalidated;
    }

-    /// Finishes the upload stream, returns true on bindings invalidation.
-    bool Unmap() {
+    /// Finishes the upload stream
+    void Unmap() {
        std::lock_guard lock{mutex};
-
        stream_buffer->Unmap(buffer_offset - buffer_offset_base);
-        return std::exchange(invalidated, false);
    }

+    /// Function called at the end of each frame, intended for deferred operations
    void TickFrame() {
        ++epoch;
+
        while (!pending_destruction.empty()) {
            // Delay at least 4 frames before destruction.
            // This is due to triple buffering happening on some drivers.
            static constexpr u64 epochs_to_destroy = 5;
-            if (pending_destruction.front()->GetEpoch() + epochs_to_destroy > epoch) {
+            if (pending_destruction.front()->Epoch() + epochs_to_destroy > epoch) {
                break;
            }
-            pending_destruction.pop_front();
+            pending_destruction.pop();
        }
    }

@@ -132,12 +155,11 @@ public:
    void FlushRegion(VAddr addr, std::size_t size) {
        std::lock_guard lock{mutex};

-        std::vector<MapInterval> objects = GetMapsInRange(addr, size);
-        std::sort(objects.begin(), objects.end(), [](const MapInterval& a, const MapInterval& b) {
-            return a->GetModificationTick() < b->GetModificationTick();
-        });
-        for (auto& object : objects) {
-            if (object->IsModified() && object->IsRegistered()) {
+        VectorMapInterval objects = GetMapsInRange(addr, size);
+        std::sort(objects.begin(), objects.end(),
+                  [](MapInterval* lhs, MapInterval* rhs) { return lhs->ticks < rhs->ticks; });
+        for (MapInterval* object : objects) {
+            if (object->is_modified && object->is_registered) {
                mutex.unlock();
                FlushMap(object);
                mutex.lock();
@@ -148,9 +170,9 @@ public:
    bool MustFlushRegion(VAddr addr, std::size_t size) {
        std::lock_guard lock{mutex};

-        const std::vector<MapInterval> objects = GetMapsInRange(addr, size);
-        return std::any_of(objects.cbegin(), objects.cend(), [](const MapInterval& map) {
-            return map->IsModified() && map->IsRegistered();
+        const VectorMapInterval objects = GetMapsInRange(addr, size);
+        return std::any_of(objects.cbegin(), objects.cend(), [](const MapInterval* map) {
+            return map->is_modified && map->is_registered;
        });
    }

@@ -158,9 +180,8 @@ public:
    void InvalidateRegion(VAddr addr, u64 size) {
        std::lock_guard lock{mutex};

-        std::vector<MapInterval> objects = GetMapsInRange(addr, size);
-        for (auto& object : objects) {
-            if (object->IsRegistered()) {
+        for (auto& object : GetMapsInRange(addr, size)) {
+            if (object->is_registered) {
                Unregister(object);
            }
        }
@@ -169,10 +190,10 @@ public:
    void OnCPUWrite(VAddr addr, std::size_t size) {
        std::lock_guard lock{mutex};

-        for (const auto& object : GetMapsInRange(addr, size)) {
-            if (object->IsMemoryMarked() && object->IsRegistered()) {
+        for (MapInterval* object : GetMapsInRange(addr, size)) {
+            if (object->is_memory_marked && object->is_registered) {
                UnmarkMemory(object);
-                object->SetSyncPending(true);
+                object->is_sync_pending = true;
                marked_for_unregister.emplace_back(object);
            }
        }
@@ -181,9 +202,9 @@ public:
    void SyncGuestHost() {
        std::lock_guard lock{mutex};

-        for (const auto& object : marked_for_unregister) {
-            if (object->IsRegistered()) {
-                object->SetSyncPending(false);
+        for (auto& object : marked_for_unregister) {
+            if (object->is_registered) {
+                object->is_sync_pending = false;
                Unregister(object);
            }
        }
@@ -192,9 +213,9 @@ public:

    void CommitAsyncFlushes() {
        if (uncommitted_flushes) {
-            auto commit_list = std::make_shared<std::list<MapInterval>>();
-            for (auto& map : *uncommitted_flushes) {
-                if (map->IsRegistered() && map->IsModified()) {
+            auto commit_list = std::make_shared<std::list<MapInterval*>>();
+            for (MapInterval* map : *uncommitted_flushes) {
+                if (map->is_registered && map->is_modified) {
                    // TODO(Blinkhawk): Implement backend asynchronous flushing
                    // AsyncFlushMap(map)
                    commit_list->push_back(map);
@@ -228,8 +249,8 @@ public:
            committed_flushes.pop_front();
            return;
        }
-        for (MapInterval& map : *flush_list) {
-            if (map->IsRegistered()) {
+        for (MapInterval* map : *flush_list) {
+            if (map->is_registered) {
                // TODO(Blinkhawk): Replace this for reading the asynchronous flush
                FlushMap(map);
            }
@@ -237,104 +258,90 @@ public:
        committed_flushes.pop_front();
    }

-    virtual BufferType GetEmptyBuffer(std::size_t size) = 0;
+    virtual BufferInfo GetEmptyBuffer(std::size_t size) = 0;

 protected:
    explicit BufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
                         std::unique_ptr<StreamBuffer> stream_buffer)
-        : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer)},
-          stream_buffer_handle{this->stream_buffer->GetHandle()} {}
+        : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer)} {}

    ~BufferCache() = default;

-    virtual BufferType ToHandle(const OwnerBuffer& storage) = 0;
-
-    virtual OwnerBuffer CreateBlock(VAddr cpu_addr, std::size_t size) = 0;
-
-    virtual void UploadBlockData(const OwnerBuffer& buffer, std::size_t offset, std::size_t size,
-                                 const u8* data) = 0;
-
-    virtual void DownloadBlockData(const OwnerBuffer& buffer, std::size_t offset, std::size_t size,
-                                   u8* data) = 0;
-
-    virtual void CopyBlock(const OwnerBuffer& src, const OwnerBuffer& dst, std::size_t src_offset,
-                           std::size_t dst_offset, std::size_t size) = 0;
+    virtual std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) = 0;

    virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) {
        return {};
    }

    /// Register an object into the cache
-    void Register(const MapInterval& new_map, bool inherit_written = false) {
-        const VAddr cpu_addr = new_map->GetStart();
+    MapInterval* Register(MapInterval new_map, bool inherit_written = false) {
+        const VAddr cpu_addr = new_map.start;
        if (!cpu_addr) {
            LOG_CRITICAL(HW_GPU, "Failed to register buffer with unmapped gpu_address 0x{:016x}",
-                         new_map->GetGpuAddress());
-            return;
+                         new_map.gpu_addr);
+            return nullptr;
        }
-        const std::size_t size = new_map->GetEnd() - new_map->GetStart();
-        new_map->MarkAsRegistered(true);
-        const IntervalType interval{new_map->GetStart(), new_map->GetEnd()};
-        mapped_addresses.insert({interval, new_map});
+        const std::size_t size = new_map.end - new_map.start;
+        new_map.is_registered = true;
        rasterizer.UpdatePagesCachedCount(cpu_addr, size, 1);
-        new_map->SetMemoryMarked(true);
+        new_map.is_memory_marked = true;
        if (inherit_written) {
-            MarkRegionAsWritten(new_map->GetStart(), new_map->GetEnd() - 1);
-            new_map->MarkAsWritten(true);
+            MarkRegionAsWritten(new_map.start, new_map.end - 1);
+            new_map.is_written = true;
        }
+        MapInterval* const storage = mapped_addresses_allocator.Allocate();
+        *storage = new_map;
+        mapped_addresses.insert(*storage);
+        return storage;
    }

-    void UnmarkMemory(const MapInterval& map) {
-        if (!map->IsMemoryMarked()) {
+    void UnmarkMemory(MapInterval* map) {
+        if (!map->is_memory_marked) {
            return;
        }
-        const std::size_t size = map->GetEnd() - map->GetStart();
-        rasterizer.UpdatePagesCachedCount(map->GetStart(), size, -1);
-        map->SetMemoryMarked(false);
+        const std::size_t size = map->end - map->start;
+        rasterizer.UpdatePagesCachedCount(map->start, size, -1);
+        map->is_memory_marked = false;
    }

    /// Unregisters an object from the cache
-    void Unregister(const MapInterval& map) {
+    void Unregister(MapInterval* map) {
        UnmarkMemory(map);
-        map->MarkAsRegistered(false);
-        if (map->IsSyncPending()) {
+        map->is_registered = false;
+        if (map->is_sync_pending) {
+            map->is_sync_pending = false;
            marked_for_unregister.remove(map);
-            map->SetSyncPending(false);
        }
-        if (map->IsWritten()) {
-            UnmarkRegionAsWritten(map->GetStart(), map->GetEnd() - 1);
+        if (map->is_written) {
+            UnmarkRegionAsWritten(map->start, map->end - 1);
        }
-        const IntervalType delete_interval{map->GetStart(), map->GetEnd()};
-        mapped_addresses.erase(delete_interval);
+        const auto it = mapped_addresses.find(*map);
+        ASSERT(it != mapped_addresses.end());
+        mapped_addresses.erase(it);
+        mapped_addresses_allocator.Release(map);
    }

 private:
-    MapInterval CreateMap(const VAddr start, const VAddr end, const GPUVAddr gpu_addr) {
-        return std::make_shared<MapIntervalBase>(start, end, gpu_addr);
-    }
-
-    MapInterval MapAddress(const OwnerBuffer& block, const GPUVAddr gpu_addr, const VAddr cpu_addr,
-                           const std::size_t size) {
-        std::vector<MapInterval> overlaps = GetMapsInRange(cpu_addr, size);
+    MapInterval* MapAddress(const Buffer* block, GPUVAddr gpu_addr, VAddr cpu_addr,
+                            std::size_t size) {
+        const VectorMapInterval overlaps = GetMapsInRange(cpu_addr, size);
        if (overlaps.empty()) {
            auto& memory_manager = system.GPU().MemoryManager();
            const VAddr cpu_addr_end = cpu_addr + size;
-            MapInterval new_map = CreateMap(cpu_addr, cpu_addr_end, gpu_addr);
            if (memory_manager.IsGranularRange(gpu_addr, size)) {
                u8* host_ptr = memory_manager.GetPointer(gpu_addr);
-                UploadBlockData(block, block->GetOffset(cpu_addr), size, host_ptr);
+                block->Upload(block->Offset(cpu_addr), size, host_ptr);
            } else {
                staging_buffer.resize(size);
                memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size);
-                UploadBlockData(block, block->GetOffset(cpu_addr), size, staging_buffer.data());
+                block->Upload(block->Offset(cpu_addr), size, staging_buffer.data());
            }
-            Register(new_map);
-            return new_map;
+            return Register(MapInterval(cpu_addr, cpu_addr_end, gpu_addr));
        }

        const VAddr cpu_addr_end = cpu_addr + size;
        if (overlaps.size() == 1) {
-            MapInterval& current_map = overlaps[0];
+            MapInterval* const current_map = overlaps[0];
            if (current_map->IsInside(cpu_addr, cpu_addr_end)) {
                return current_map;
            }
@@ -344,60 +351,70 @@ private:
        bool write_inheritance = false;
        bool modified_inheritance = false;
        // Calculate new buffer parameters
-        for (auto& overlap : overlaps) {
-            new_start = std::min(overlap->GetStart(), new_start);
-            new_end = std::max(overlap->GetEnd(), new_end);
-            write_inheritance |= overlap->IsWritten();
-            modified_inheritance |= overlap->IsModified();
+        for (MapInterval* overlap : overlaps) {
+            new_start = std::min(overlap->start, new_start);
+            new_end = std::max(overlap->end, new_end);
+            write_inheritance |= overlap->is_written;
+            modified_inheritance |= overlap->is_modified;
        }
        GPUVAddr new_gpu_addr = gpu_addr + new_start - cpu_addr;
        for (auto& overlap : overlaps) {
            Unregister(overlap);
        }
        UpdateBlock(block, new_start, new_end, overlaps);
-        MapInterval new_map = CreateMap(new_start, new_end, new_gpu_addr);
+
+        const MapInterval new_map{new_start, new_end, new_gpu_addr};
+        MapInterval* const map = Register(new_map, write_inheritance);
+        if (!map) {
+            return nullptr;
+        }
        if (modified_inheritance) {
-            new_map->MarkAsModified(true, GetModifiedTicks());
+            map->MarkAsModified(true, GetModifiedTicks());
            if (Settings::IsGPULevelHigh() && Settings::values.use_asynchronous_gpu_emulation) {
-                MarkForAsyncFlush(new_map);
+                MarkForAsyncFlush(map);
            }
        }
-        Register(new_map, write_inheritance);
-        return new_map;
+        return map;
    }

-    void UpdateBlock(const OwnerBuffer& block, VAddr start, VAddr end,
-                     std::vector<MapInterval>& overlaps) {
+    void UpdateBlock(const Buffer* block, VAddr start, VAddr end,
+                     const VectorMapInterval& overlaps) {
        const IntervalType base_interval{start, end};
        IntervalSet interval_set{};
        interval_set.add(base_interval);
        for (auto& overlap : overlaps) {
-            const IntervalType subtract{overlap->GetStart(), overlap->GetEnd()};
+            const IntervalType subtract{overlap->start, overlap->end};
            interval_set.subtract(subtract);
        }
        for (auto& interval : interval_set) {
-            std::size_t size = interval.upper() - interval.lower();
-            if (size > 0) {
-                staging_buffer.resize(size);
-                system.Memory().ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size);
-                UploadBlockData(block, block->GetOffset(interval.lower()), size,
-                                staging_buffer.data());
+            const std::size_t size = interval.upper() - interval.lower();
+            if (size == 0) {
+                continue;
            }
+            staging_buffer.resize(size);
+            system.Memory().ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size);
+            block->Upload(block->Offset(interval.lower()), size, staging_buffer.data());
        }
    }

-    std::vector<MapInterval> GetMapsInRange(VAddr addr, std::size_t size) {
+    VectorMapInterval GetMapsInRange(VAddr addr, std::size_t size) {
+        VectorMapInterval result;
        if (size == 0) {
-            return {};
+            return result;
        }

-        std::vector<MapInterval> objects{};
-        const IntervalType interval{addr, addr + size};
-        for (auto& pair : boost::make_iterator_range(mapped_addresses.equal_range(interval))) {
-            objects.push_back(pair.second);
+        const VAddr addr_end = addr + size;
+        auto it = mapped_addresses.lower_bound(addr);
+        if (it != mapped_addresses.begin()) {
+            --it;
        }
-
-        return objects;
+        while (it != mapped_addresses.end() && it->start < addr_end) {
+            if (it->Overlaps(addr, addr_end)) {
+                result.push_back(&*it);
+            }
+            ++it;
+        }
+        return result;
    }

    /// Returns a ticks counter used for tracking when cached objects were last modified
@@ -405,24 +422,28 @@ private:
        return ++modified_ticks;
    }

-    void FlushMap(MapInterval map) {
-        std::size_t size = map->GetEnd() - map->GetStart();
-        OwnerBuffer block = blocks[map->GetStart() >> block_page_bits];
+    void FlushMap(MapInterval* map) {
+        const auto it = blocks.find(map->start >> BLOCK_PAGE_BITS);
+        ASSERT_OR_EXECUTE(it != blocks.end(), return;);
+
+        std::shared_ptr<Buffer> block = it->second;
+
+        const std::size_t size = map->end - map->start;
        staging_buffer.resize(size);
-        DownloadBlockData(block, block->GetOffset(map->GetStart()), size, staging_buffer.data());
-        system.Memory().WriteBlockUnsafe(map->GetStart(), staging_buffer.data(), size);
+        block->Download(block->Offset(map->start), size, staging_buffer.data());
+        system.Memory().WriteBlockUnsafe(map->start, staging_buffer.data(), size);
        map->MarkAsModified(false, 0);
    }

-    BufferInfo StreamBufferUpload(const void* raw_pointer, std::size_t size,
-                                  std::size_t alignment) {
+    template <typename Callable>
+    BufferInfo StreamBufferUpload(std::size_t size, std::size_t alignment, Callable&& callable) {
        AlignBuffer(alignment);
        const std::size_t uploaded_offset = buffer_offset;
-        std::memcpy(buffer_ptr, raw_pointer, size);
+        callable(buffer_ptr);

        buffer_ptr += size;
        buffer_offset += size;
-        return {stream_buffer_handle, uploaded_offset};
+        return BufferInfo{stream_buffer->Handle(), uploaded_offset, stream_buffer->Address()};
    }

    void AlignBuffer(std::size_t alignment) {
@@ -432,97 +453,89 @@ private:
        buffer_offset = offset_aligned;
    }

-    OwnerBuffer EnlargeBlock(OwnerBuffer buffer) {
-        const std::size_t old_size = buffer->GetSize();
-        const std::size_t new_size = old_size + block_page_size;
-        const VAddr cpu_addr = buffer->GetCpuAddr();
-        OwnerBuffer new_buffer = CreateBlock(cpu_addr, new_size);
-        CopyBlock(buffer, new_buffer, 0, 0, old_size);
-        buffer->SetEpoch(epoch);
-        pending_destruction.push_back(buffer);
+    std::shared_ptr<Buffer> EnlargeBlock(std::shared_ptr<Buffer> buffer) {
+        const std::size_t old_size = buffer->Size();
+        const std::size_t new_size = old_size + BLOCK_PAGE_SIZE;
+        const VAddr cpu_addr = buffer->CpuAddr();
+        std::shared_ptr<Buffer> new_buffer = CreateBlock(cpu_addr, new_size);
+        new_buffer->CopyFrom(*buffer, 0, 0, old_size);
+        QueueDestruction(std::move(buffer));
+
        const VAddr cpu_addr_end = cpu_addr + new_size - 1;
-        u64 page_start = cpu_addr >> block_page_bits;
-        const u64 page_end = cpu_addr_end >> block_page_bits;
-        while (page_start <= page_end) {
-            blocks[page_start] = new_buffer;
-            ++page_start;
+        const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
+        for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
+            blocks.insert_or_assign(page_start, new_buffer);
        }
+
        return new_buffer;
    }

-    OwnerBuffer MergeBlocks(OwnerBuffer first, OwnerBuffer second) {
-        const std::size_t size_1 = first->GetSize();
-        const std::size_t size_2 = second->GetSize();
-        const VAddr first_addr = first->GetCpuAddr();
-        const VAddr second_addr = second->GetCpuAddr();
+    std::shared_ptr<Buffer> MergeBlocks(std::shared_ptr<Buffer> first,
+                                        std::shared_ptr<Buffer> second) {
+        const std::size_t size_1 = first->Size();
+        const std::size_t size_2 = second->Size();
+        const VAddr first_addr = first->CpuAddr();
+        const VAddr second_addr = second->CpuAddr();
        const VAddr new_addr = std::min(first_addr, second_addr);
        const std::size_t new_size = size_1 + size_2;
-        OwnerBuffer new_buffer = CreateBlock(new_addr, new_size);
-        CopyBlock(first, new_buffer, 0, new_buffer->GetOffset(first_addr), size_1);
-        CopyBlock(second, new_buffer, 0, new_buffer->GetOffset(second_addr), size_2);
-        first->SetEpoch(epoch);
-        second->SetEpoch(epoch);
-        pending_destruction.push_back(first);
-        pending_destruction.push_back(second);
+
+        std::shared_ptr<Buffer> new_buffer = CreateBlock(new_addr, new_size);
+        new_buffer->CopyFrom(*first, 0, new_buffer->Offset(first_addr), size_1);
+        new_buffer->CopyFrom(*second, 0, new_buffer->Offset(second_addr), size_2);
+        QueueDestruction(std::move(first));
+        QueueDestruction(std::move(second));
+
        const VAddr cpu_addr_end = new_addr + new_size - 1;
-        u64 page_start = new_addr >> block_page_bits;
-        const u64 page_end = cpu_addr_end >> block_page_bits;
-        while (page_start <= page_end) {
-            blocks[page_start] = new_buffer;
-            ++page_start;
+        const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
+        for (u64 page_start = new_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
+            blocks.insert_or_assign(page_start, new_buffer);
        }
        return new_buffer;
    }

-    OwnerBuffer GetBlock(const VAddr cpu_addr, const std::size_t size) {
-        OwnerBuffer found;
+    Buffer* GetBlock(VAddr cpu_addr, std::size_t size) {
+        std::shared_ptr<Buffer> found;
+
        const VAddr cpu_addr_end = cpu_addr + size - 1;
-        u64 page_start = cpu_addr >> block_page_bits;
-        const u64 page_end = cpu_addr_end >> block_page_bits;
-        while (page_start <= page_end) {
+        const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS;
+        for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) {
            auto it = blocks.find(page_start);
            if (it == blocks.end()) {
                if (found) {
                    found = EnlargeBlock(found);
-                } else {
-                    const VAddr start_addr = (page_start << block_page_bits);
-                    found = CreateBlock(start_addr, block_page_size);
-                    blocks[page_start] = found;
-                }
-            } else {
-                if (found) {
-                    if (found == it->second) {
-                        ++page_start;
-                        continue;
-                    }
-                    found = MergeBlocks(found, it->second);
-                } else {
-                    found = it->second;
+                    continue;
                }
+                const VAddr start_addr = page_start << BLOCK_PAGE_BITS;
+                found = CreateBlock(start_addr, BLOCK_PAGE_SIZE);
+                blocks.insert_or_assign(page_start, found);
+                continue;
+            }
+            if (!found) {
+                found = it->second;
+                continue;
+            }
+            if (found != it->second) {
+                found = MergeBlocks(std::move(found), it->second);
            }
-            ++page_start;
        }
-        return found;
+        return found.get();
    }

-    void MarkRegionAsWritten(const VAddr start, const VAddr end) {
-        u64 page_start = start >> write_page_bit;
-        const u64 page_end = end >> write_page_bit;
-        while (page_start <= page_end) {
+    void MarkRegionAsWritten(VAddr start, VAddr end) {
+        const u64 page_end = end >> WRITE_PAGE_BIT;
+        for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
            auto it = written_pages.find(page_start);
            if (it != written_pages.end()) {
                it->second = it->second + 1;
            } else {
-                written_pages[page_start] = 1;
+                written_pages.insert_or_assign(page_start, 1);
            }
-            page_start++;
        }
    }

-    void UnmarkRegionAsWritten(const VAddr start, const VAddr end) {
-        u64 page_start = start >> write_page_bit;
-        const u64 page_end = end >> write_page_bit;
-        while (page_start <= page_end) {
+    void UnmarkRegionAsWritten(VAddr start, VAddr end) {
+        const u64 page_end = end >> WRITE_PAGE_BIT;
+        for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
            auto it = written_pages.find(page_start);
            if (it != written_pages.end()) {
                if (it->second > 1) {
@@ -531,25 +544,27 @@ private:
                    written_pages.erase(it);
                }
            }
-            page_start++;
        }
    }

-    bool IsRegionWritten(const VAddr start, const VAddr end) const {
-        u64 page_start = start >> write_page_bit;
-        const u64 page_end = end >> write_page_bit;
-        while (page_start <= page_end) {
+    bool IsRegionWritten(VAddr start, VAddr end) const {
+        const u64 page_end = end >> WRITE_PAGE_BIT;
+        for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) {
            if (written_pages.count(page_start) > 0) {
                return true;
            }
-            page_start++;
        }
        return false;
    }

-    void MarkForAsyncFlush(MapInterval& map) {
+    /// Stamps a buffer with the current epoch and moves it into the pending destruction queue
+    void QueueDestruction(std::shared_ptr<Buffer> buffer) {
+        buffer->SetEpoch(epoch);
+        pending_destruction.emplace(std::move(buffer));
+    }
+
+    void MarkForAsyncFlush(MapInterval* map) {
        if (!uncommitted_flushes) {
-            uncommitted_flushes = std::make_shared<std::unordered_set<MapInterval>>();
+            uncommitted_flushes = std::make_shared<std::unordered_set<MapInterval*>>();
        }
        uncommitted_flushes->insert(map);
    }
@@ -558,35 +573,29 @@ private:
    Core::System& system;

    std::unique_ptr<StreamBuffer> stream_buffer;
-    BufferType stream_buffer_handle{};
-
-    bool invalidated = false;
+    BufferType stream_buffer_handle;

    u8* buffer_ptr = nullptr;
    u64 buffer_offset = 0;
    u64 buffer_offset_base = 0;

-    using IntervalSet = boost::icl::interval_set<VAddr>;
-    using IntervalCache = boost::icl::interval_map<VAddr, MapInterval>;
-    using IntervalType = typename IntervalCache::interval_type;
-    IntervalCache mapped_addresses;
+    MapIntervalAllocator mapped_addresses_allocator;
+    boost::intrusive::set<MapInterval, boost::intrusive::compare<MapIntervalCompare>>
+        mapped_addresses;

-    static constexpr u64 write_page_bit = 11;
    std::unordered_map<u64, u32> written_pages;
+    std::unordered_map<u64, std::shared_ptr<Buffer>> blocks;

-    static constexpr u64 block_page_bits = 21;
-    static constexpr u64 block_page_size = 1ULL << block_page_bits;
-    std::unordered_map<u64, OwnerBuffer> blocks;
-
-    std::list<OwnerBuffer> pending_destruction;
+    std::queue<std::shared_ptr<Buffer>> pending_destruction;
    u64 epoch = 0;
    u64 modified_ticks = 0;

    std::vector<u8> staging_buffer;
-    std::list<MapInterval> marked_for_unregister;

-    std::shared_ptr<std::unordered_set<MapInterval>> uncommitted_flushes{};
-    std::list<std::shared_ptr<std::list<MapInterval>>> committed_flushes;
+    std::list<MapInterval*> marked_for_unregister;
+
+    std::shared_ptr<std::unordered_set<MapInterval*>> uncommitted_flushes;
+    std::list<std::shared_ptr<std::list<MapInterval*>>> committed_flushes;

    std::recursive_mutex mutex;
 };
--- a/src/video_core/buffer_cache/map_interval.cpp
+++ b/src/video_core/buffer_cache/map_interval.cpp
@@ -0,0 +1,33 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+#include <memory>
+
+#include "video_core/buffer_cache/map_interval.h"
+
+namespace VideoCommon {
+
+MapIntervalAllocator::MapIntervalAllocator() {
+    FillFreeList(first_chunk); // seed the free list with the slots of the embedded first chunk
+}
+
+MapIntervalAllocator::~MapIntervalAllocator() = default; // extra chunks are owned via Chunk::next
+
+void MapIntervalAllocator::AllocateNewChunk() {
+    // new_chunk points at the tail link of the chunk list (initially first_chunk.next)
+    std::unique_ptr<Chunk>& tail = *new_chunk;
+    tail = std::make_unique<Chunk>();
+    FillFreeList(*tail);
+    // The freshly created chunk's own link becomes the new tail
+    new_chunk = &tail->next;
+}
+
+void MapIntervalAllocator::FillFreeList(Chunk& chunk) {
+    free_list.reserve(free_list.size() + chunk.data.size());
+    // Walk the chunk back to front so its first slot ends up at the back of the free list
+    std::for_each(chunk.data.rbegin(), chunk.data.rend(),
+                  [this](MapInterval& slot) { free_list.push_back(&slot); });
+}
+
+} // namespace VideoCommon
--- a/src/video_core/buffer_cache/map_interval.h
+++ b/src/video_core/buffer_cache/map_interval.h
@@ -4,104 +4,89 @@

 #pragma once

+#include <array>
+#include <cstddef>
+#include <memory>
+#include <vector>
+
+#include <boost/intrusive/set_hook.hpp>
+
 #include "common/common_types.h"
 #include "video_core/gpu.h"

 namespace VideoCommon {

-class MapIntervalBase {
-public:
-    MapIntervalBase(const VAddr start, const VAddr end, const GPUVAddr gpu_addr)
-        : start{start}, end{end}, gpu_addr{gpu_addr} {}
+struct MapInterval : public boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true>> {
+    MapInterval() = default;

-    void SetCpuAddress(VAddr new_cpu_addr) {
-        cpu_addr = new_cpu_addr;
+    /*implicit*/ MapInterval(VAddr start_) noexcept : start{start_} {}
+
+    explicit MapInterval(VAddr start_, VAddr end_, GPUVAddr gpu_addr_) noexcept
+        : start{start_}, end{end_}, gpu_addr{gpu_addr_} {}
+
+    bool IsInside(VAddr other_start, VAddr other_end) const noexcept {
+        return start <= other_start && other_end <= end;
    }

-    VAddr GetCpuAddress() const {
-        return cpu_addr;
+    bool Overlaps(VAddr other_start, VAddr other_end) const noexcept {
+        return start < other_end && other_start < end;
    }

-    GPUVAddr GetGpuAddress() const {
-        return gpu_addr;
-    }
-
-    bool IsInside(const VAddr other_start, const VAddr other_end) const {
-        return (start <= other_start && other_end <= end);
-    }
-
-    bool operator==(const MapIntervalBase& rhs) const {
-        return std::tie(start, end) == std::tie(rhs.start, rhs.end);
-    }
-
-    bool operator!=(const MapIntervalBase& rhs) const {
-        return !operator==(rhs);
-    }
-
-    void MarkAsRegistered(const bool registered) {
-        is_registered = registered;
-    }
-
-    bool IsRegistered() const {
-        return is_registered;
-    }
-
-    void SetMemoryMarked(bool is_memory_marked_) {
-        is_memory_marked = is_memory_marked_;
-    }
-
-    bool IsMemoryMarked() const {
-        return is_memory_marked;
-    }
-
-    void SetSyncPending(bool is_sync_pending_) {
-        is_sync_pending = is_sync_pending_;
-    }
-
-    bool IsSyncPending() const {
-        return is_sync_pending;
-    }
-
-    VAddr GetStart() const {
-        return start;
-    }
-
-    VAddr GetEnd() const {
-        return end;
-    }
-
-    void MarkAsModified(const bool is_modified_, const u64 tick) {
+    void MarkAsModified(bool is_modified_, u64 ticks_) noexcept {
        is_modified = is_modified_;
-        ticks = tick;
+        ticks = ticks_;
    }

-    bool IsModified() const {
-        return is_modified;
+    boost::intrusive::set_member_hook<> member_hook_;
+    VAddr start = 0;
+    VAddr end = 0;
+    GPUVAddr gpu_addr = 0;
+    u64 ticks = 0;
+    bool is_written = false;
+    bool is_modified = false;
+    bool is_registered = false;
+    bool is_memory_marked = false;
+    bool is_sync_pending = false;
+};
+
+struct MapIntervalCompare { // orders MapInterval objects by their start address only
+    constexpr bool operator()(const MapInterval& lhs, const MapInterval& rhs) const noexcept {
+        return lhs.start < rhs.start; // end, gpu_addr and the state flags do not take part
+    }
+};
+
+class MapIntervalAllocator {
+public:
+    MapIntervalAllocator();
+    ~MapIntervalAllocator();
+
+    MapInterval* Allocate() {
+        if (free_list.empty()) {
+            AllocateNewChunk();
+        }
+        MapInterval* const interval = free_list.back();
+        free_list.pop_back();
+        return interval;
    }

-    u64 GetModificationTick() const {
-        return ticks;
-    }
-
-    void MarkAsWritten(const bool is_written_) {
-        is_written = is_written_;
-    }
-
-    bool IsWritten() const {
-        return is_written;
+    void Release(MapInterval* interval) {
+        free_list.push_back(interval);
    }

 private:
-    VAddr start;
-    VAddr end;
-    GPUVAddr gpu_addr;
-    VAddr cpu_addr{};
-    bool is_written{};
-    bool is_modified{};
-    bool is_registered{};
-    bool is_memory_marked{};
-    bool is_sync_pending{};
-    u64 ticks{};
+    struct Chunk {
+        std::unique_ptr<Chunk> next;
+        std::array<MapInterval, 0x8000> data;
+    };
+
+    void AllocateNewChunk();
+
+    void FillFreeList(Chunk& chunk);
+
+    std::vector<MapInterval*> free_list;
+    std::unique_ptr<Chunk>* new_chunk = &first_chunk.next;
+
+    Chunk first_chunk;
 };

 } // namespace VideoCommon
--- a/src/video_core/engines/const_buffer_engine_interface.h
+++ b/src/video_core/engines/const_buffer_engine_interface.h
@@ -93,6 +93,7 @@ public:
    virtual SamplerDescriptor AccessBoundSampler(ShaderType stage, u64 offset) const = 0;
    virtual SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer,
                                                    u64 offset) const = 0;
+    virtual SamplerDescriptor AccessSampler(u32 handle) const = 0;
    virtual u32 GetBoundBuffer() const = 0;

    virtual VideoCore::GuestDriverProfile& AccessGuestDriverProfile() = 0;
--- a/src/video_core/engines/kepler_compute.cpp
+++ b/src/video_core/engines/kepler_compute.cpp
@@ -92,8 +92,11 @@ SamplerDescriptor KeplerCompute::AccessBindlessSampler(ShaderType stage, u64 con
    ASSERT(stage == ShaderType::Compute);
    const auto& tex_info_buffer = launch_description.const_buffer_config[const_buffer];
    const GPUVAddr tex_info_address = tex_info_buffer.Address() + offset;
+    return AccessSampler(memory_manager.Read<u32>(tex_info_address));
+}

-    const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)};
+SamplerDescriptor KeplerCompute::AccessSampler(u32 handle) const {
+    const Texture::TextureHandle tex_handle{handle};
    const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle);
    SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic);
    result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value());
--- a/src/video_core/engines/kepler_compute.h
+++ b/src/video_core/engines/kepler_compute.h
@@ -219,6 +219,8 @@ public:
    SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer,
                                            u64 offset) const override;

+    SamplerDescriptor AccessSampler(u32 handle) const override;
+
    u32 GetBoundBuffer() const override {
        return regs.tex_cb_index;
    }
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -25,9 +25,8 @@ constexpr u32 MacroRegistersStart = 0xE00;
 Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
                     MemoryManager& memory_manager)
    : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager},
-      macro_interpreter{*this}, upload_state{memory_manager, regs.upload} {
+      macro_engine{GetMacroEngine(*this)}, upload_state{memory_manager, regs.upload} {
    dirty.flags.flip();
-
    InitializeRegisterDefaults();
 }

@@ -106,7 +105,11 @@ void Maxwell3D::InitializeRegisterDefaults() {
    regs.rasterize_enable = 1;
    regs.rt_separate_frag_data = 1;
    regs.framebuffer_srgb = 1;
+    regs.line_width_aliased = 1.0f;
+    regs.line_width_smooth = 1.0f;
    regs.front_face = Maxwell3D::Regs::FrontFace::ClockWise;
+    regs.polygon_mode_back = Maxwell3D::Regs::PolygonMode::Fill;
+    regs.polygon_mode_front = Maxwell3D::Regs::PolygonMode::Fill;

    shadow_state = regs;

@@ -116,7 +119,7 @@ void Maxwell3D::InitializeRegisterDefaults() {
    mme_inline[MAXWELL3D_REG_INDEX(index_array.count)] = true;
 }

-void Maxwell3D::CallMacroMethod(u32 method, std::size_t num_parameters, const u32* parameters) {
+void Maxwell3D::CallMacroMethod(u32 method, const std::vector<u32>& parameters) {
    // Reset the current macro.
    executing_macro = 0;

@@ -125,7 +128,7 @@ void Maxwell3D::CallMacroMethod(u32 method, std::size_t num_parameters, const u3
        ((method - MacroRegistersStart) >> 1) % static_cast<u32>(macro_positions.size());

    // Execute the current macro.
-    macro_interpreter.Execute(macro_positions[entry], num_parameters, parameters);
+    macro_engine->Execute(*this, macro_positions[entry], parameters);
    if (mme_draw.current_mode != MMEDrawMode::Undefined) {
        FlushMMEInlineDraw();
    }
@@ -161,7 +164,7 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {

        // Call the macro when there are no more parameters in the command buffer
        if (is_last_call) {
-            CallMacroMethod(executing_macro, macro_params.size(), macro_params.data());
+            CallMacroMethod(executing_macro, macro_params);
            macro_params.clear();
        }
        return;
@@ -197,7 +200,7 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
        break;
    }
    case MAXWELL3D_REG_INDEX(macros.data): {
-        ProcessMacroUpload(arg);
+        macro_engine->AddCode(regs.macros.upload_address, arg);
        break;
    }
    case MAXWELL3D_REG_INDEX(macros.bind): {
@@ -306,7 +309,7 @@ void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount,

        // Call the macro when there are no more parameters in the command buffer
        if (amount == methods_pending) {
-            CallMacroMethod(executing_macro, macro_params.size(), macro_params.data());
+            CallMacroMethod(executing_macro, macro_params);
            macro_params.clear();
        }
        return;
@@ -420,9 +423,7 @@ void Maxwell3D::FlushMMEInlineDraw() {
 }

 void Maxwell3D::ProcessMacroUpload(u32 data) {
-    ASSERT_MSG(regs.macros.upload_address < macro_memory.size(),
-               "upload_address exceeded macro_memory size!");
-    macro_memory[regs.macros.upload_address++] = data;
+    macro_engine->AddCode(regs.macros.upload_address++, data);
 }

 void Maxwell3D::ProcessMacroBind(u32 data) {
@@ -457,8 +458,9 @@ void Maxwell3D::StampQueryResult(u64 payload, bool long_query) {

 void Maxwell3D::ProcessQueryGet() {
    // TODO(Subv): Support the other query units.
-    ASSERT_MSG(regs.query.query_get.unit == Regs::QueryUnit::Crop,
-               "Units other than CROP are unimplemented");
+    if (regs.query.query_get.unit != Regs::QueryUnit::Crop) {
+        LOG_DEBUG(HW_GPU, "Units other than CROP are unimplemented");
+    }

    switch (regs.query.query_get.operation) {
    case Regs::QueryOperation::Release:
@@ -534,8 +536,8 @@ void Maxwell3D::ProcessCounterReset() {
        rasterizer.ResetCounter(QueryType::SamplesPassed);
        break;
    default:
-        LOG_WARNING(Render_OpenGL, "Unimplemented counter reset={}",
-                    static_cast<int>(regs.counter_reset));
+        LOG_DEBUG(Render_OpenGL, "Unimplemented counter reset={}",
+                  static_cast<int>(regs.counter_reset));
        break;
    }
 }
@@ -592,8 +594,8 @@ std::optional<u64> Maxwell3D::GetQueryResult() {
                         system.GPU().GetTicks());
        return {};
    default:
-        UNIMPLEMENTED_MSG("Unimplemented query select type {}",
-                          static_cast<u32>(regs.query.query_get.select.Value()));
+        LOG_DEBUG(HW_GPU, "Unimplemented query select type {}",
+                  static_cast<u32>(regs.query.query_get.select.Value()));
        return 1;
    }
 }
@@ -738,8 +740,11 @@ SamplerDescriptor Maxwell3D::AccessBindlessSampler(ShaderType stage, u64 const_b
    const auto& shader = state.shader_stages[static_cast<std::size_t>(stage)];
    const auto& tex_info_buffer = shader.const_buffers[const_buffer];
    const GPUVAddr tex_info_address = tex_info_buffer.address + offset;
+    return AccessSampler(memory_manager.Read<u32>(tex_info_address));
+}

-    const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)};
+SamplerDescriptor Maxwell3D::AccessSampler(u32 handle) const {
+    const Texture::TextureHandle tex_handle{handle};
    const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle);
    SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic);
    result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value());
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -23,7 +23,7 @@
 #include "video_core/engines/engine_upload.h"
 #include "video_core/engines/shader_type.h"
 #include "video_core/gpu.h"
-#include "video_core/macro_interpreter.h"
+#include "video_core/macro/macro.h"
 #include "video_core/textures/texture.h"

 namespace Core {
@@ -598,6 +598,7 @@ public:
                BitField<4, 3, u32> block_height;
                BitField<8, 3, u32> block_depth;
                BitField<12, 1, InvMemoryLayout> type;
+                BitField<16, 1, u32> is_3d;
            } memory_layout;
            union {
                BitField<0, 16, u32> layers;
@@ -1403,6 +1404,8 @@ public:
    SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer,
                                            u64 offset) const override;

+    SamplerDescriptor AccessSampler(u32 handle) const override;
+
    u32 GetBoundBuffer() const override {
        return regs.tex_cb_index;
    }
@@ -1411,19 +1414,18 @@ public:

    const VideoCore::GuestDriverProfile& AccessGuestDriverProfile() const override;

-    /// Memory for macro code - it's undetermined how big this is, however 1MB is much larger than
-    /// we've seen used.
-    using MacroMemory = std::array<u32, 0x40000>;
-
-    /// Gets a reference to macro memory.
-    const MacroMemory& GetMacroMemory() const {
-        return macro_memory;
-    }
-
    bool ShouldExecute() const {
        return execute_on;
    }

+    VideoCore::RasterizerInterface& GetRasterizer() {
+        return rasterizer;
+    }
+
+    const VideoCore::RasterizerInterface& GetRasterizer() const {
+        return rasterizer;
+    }
+
    /// Notify a memory write has happened.
    void OnMemoryWrite() {
        dirty.flags |= dirty.on_write_stores;
@@ -1468,16 +1470,13 @@ private:

    std::array<bool, Regs::NUM_REGS> mme_inline{};

-    /// Memory for macro code
-    MacroMemory macro_memory;
-
    /// Macro method that is currently being executed / being fed parameters.
    u32 executing_macro = 0;
    /// Parameters that have been submitted to the macro call so far.
    std::vector<u32> macro_params;

    /// Interpreter for the macro codes uploaded to the GPU.
-    MacroInterpreter macro_interpreter;
+    std::unique_ptr<MacroEngine> macro_engine;

    static constexpr u32 null_cb_data = 0xFFFFFFFF;
    struct {
@@ -1506,7 +1505,7 @@ private:
     * @param num_parameters Number of arguments
     * @param parameters Arguments to the method call
     */
-    void CallMacroMethod(u32 method, std::size_t num_parameters, const u32* parameters);
+    void CallMacroMethod(u32 method, const std::vector<u32>& parameters);

    /// Handles writes to the macro uploading register.
    void ProcessMacroUpload(u32 data);
--- a/src/video_core/macro/macro.cpp
+++ b/src/video_core/macro/macro.cpp
@@ -0,0 +1,72 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <boost/container_hash/hash.hpp>
+#include "common/assert.h"
+#include "common/logging/log.h"
+#include "core/settings.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/macro/macro.h"
+#include "video_core/macro/macro_hle.h"
+#include "video_core/macro/macro_interpreter.h"
+#include "video_core/macro/macro_jit_x64.h"
+
+namespace Tegra {
+
+MacroEngine::MacroEngine(Engines::Maxwell3D& maxwell3d)
+    : hle_macros{std::make_unique<Tegra::HLEMacro>(maxwell3d)} {}
+
+MacroEngine::~MacroEngine() = default;
+
+void MacroEngine::AddCode(u32 method, u32 data) {
+    uploaded_macro_code[method].push_back(data);
+}
+
+void MacroEngine::Execute(Engines::Maxwell3D& maxwell3d, u32 method,
+                          const std::vector<u32>& parameters) {
+    auto compiled_macro = macro_cache.find(method);
+    if (compiled_macro != macro_cache.end()) {
+        const auto& cache_info = compiled_macro->second;
+        if (cache_info.has_hle_program) {
+            cache_info.hle_program->Execute(parameters, method);
+        } else {
+            cache_info.lle_program->Execute(parameters, method);
+        }
+    } else {
+        // Macro not compiled, check if it's uploaded and if so, compile it
+        auto macro_code = uploaded_macro_code.find(method);
+        if (macro_code == uploaded_macro_code.end()) {
+            UNREACHABLE_MSG("Macro 0x{0:x} was not uploaded", method);
+            return;
+        }
+        auto& cache_info = macro_cache[method];
+        cache_info.hash = boost::hash_value(macro_code->second);
+        cache_info.lle_program = Compile(macro_code->second);
+
+        auto hle_program = hle_macros->GetHLEProgram(cache_info.hash);
+        if (hle_program.has_value()) {
+            cache_info.has_hle_program = true;
+            cache_info.hle_program = std::move(hle_program.value());
+        }
+
+        if (cache_info.has_hle_program) {
+            cache_info.hle_program->Execute(parameters, method);
+        } else {
+            cache_info.lle_program->Execute(parameters, method);
+        }
+    }
+}
+
+std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d) {
+    if (Settings::values.disable_macro_jit) {
+        return std::make_unique<MacroInterpreter>(maxwell3d);
+    }
+#ifdef ARCHITECTURE_x86_64
+    return std::make_unique<MacroJITx64>(maxwell3d);
+#else
+    return std::make_unique<MacroInterpreter>(maxwell3d);
+#endif
+}
+
+} // namespace Tegra
--- a/src/video_core/macro/macro.h
+++ b/src/video_core/macro/macro.h
@@ -0,0 +1,141 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include <unordered_map>
+#include <vector>
+#include "common/bit_field.h"
+#include "common/common_types.h"
+
+namespace Tegra {
+
+namespace Engines {
+class Maxwell3D;
+}
+
+namespace Macro {
+constexpr std::size_t NUM_MACRO_REGISTERS = 8;
+enum class Operation : u32 {
+    ALU = 0,
+    AddImmediate = 1,
+    ExtractInsert = 2,
+    ExtractShiftLeftImmediate = 3,
+    ExtractShiftLeftRegister = 4,
+    Read = 5,
+    Unused = 6, // This operation doesn't seem to be a valid encoding.
+    Branch = 7,
+};
+
+enum class ALUOperation : u32 {
+    Add = 0,
+    AddWithCarry = 1,
+    Subtract = 2,
+    SubtractWithBorrow = 3,
+    // Operations 4-7 don't seem to be valid encodings.
+    Xor = 8,
+    Or = 9,
+    And = 10,
+    AndNot = 11,
+    Nand = 12
+};
+
+enum class ResultOperation : u32 {
+    IgnoreAndFetch = 0,
+    Move = 1,
+    MoveAndSetMethod = 2,
+    FetchAndSend = 3,
+    MoveAndSend = 4,
+    FetchAndSetMethod = 5,
+    MoveAndSetMethodFetchAndSend = 6,
+    MoveAndSetMethodSend = 7
+};
+
+enum class BranchCondition : u32 {
+    Zero = 0,
+    NotZero = 1,
+};
+
+union Opcode {
+    u32 raw;
+    BitField<0, 3, Operation> operation;
+    BitField<4, 3, ResultOperation> result_operation;
+    BitField<4, 1, BranchCondition> branch_condition;
+    // If set on a branch, then the branch doesn't have a delay slot.
+    BitField<5, 1, u32> branch_annul;
+    BitField<7, 1, u32> is_exit;
+    BitField<8, 3, u32> dst;
+    BitField<11, 3, u32> src_a;
+    BitField<14, 3, u32> src_b;
+    // The signed immediate overlaps the second source operand and the alu operation.
+    BitField<14, 18, s32> immediate;
+
+    BitField<17, 5, ALUOperation> alu_operation;
+
+    // Bitfield instructions data
+    BitField<17, 5, u32> bf_src_bit;
+    BitField<22, 5, u32> bf_size;
+    BitField<27, 5, u32> bf_dst_bit;
+
+    u32 GetBitfieldMask() const {
+        return (1U << bf_size) - 1; // Unsigned: bf_size == 31 would overflow a signed int
+    }
+
+    s32 GetBranchTarget() const {
+        return static_cast<s32>(immediate * sizeof(u32));
+    }
+};
+
+union MethodAddress {
+    u32 raw;
+    BitField<0, 12, u32> address;
+    BitField<12, 6, u32> increment;
+};
+
+} // namespace Macro
+
+class HLEMacro;
+
+class CachedMacro {
+public:
+    virtual ~CachedMacro() = default;
+    /**
+     * Executes the macro code with the specified input parameters.
+     * @param parameters The parameters of the macro
+     * @param method The method value the macro was looked up and invoked with
+     */
+    virtual void Execute(const std::vector<u32>& parameters, u32 method) = 0;
+};
+
+class MacroEngine {
+public:
+    explicit MacroEngine(Engines::Maxwell3D& maxwell3d);
+    virtual ~MacroEngine();
+
+    // Store the uploaded macro code to compile them when they're called.
+    void AddCode(u32 method, u32 data);
+
+    // Compiles the macro if it's not in the cache, and executes the compiled macro
+    void Execute(Engines::Maxwell3D& maxwell3d, u32 method, const std::vector<u32>& parameters);
+
+protected:
+    virtual std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) = 0;
+
+private:
+    struct CacheInfo {
+        std::unique_ptr<CachedMacro> lle_program{};
+        std::unique_ptr<CachedMacro> hle_program{};
+        u64 hash{};
+        bool has_hle_program{};
+    };
+
+    std::unordered_map<u32, CacheInfo> macro_cache;
+    std::unordered_map<u32, std::vector<u32>> uploaded_macro_code;
+    std::unique_ptr<HLEMacro> hle_macros;
+};
+
+std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d);
+
+} // namespace Tegra
--- a/src/video_core/macro/macro_hle.cpp
+++ b/src/video_core/macro/macro_hle.cpp
@@ -0,0 +1,113 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <array>
+#include <vector>
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/macro/macro_hle.h"
+#include "video_core/rasterizer_interface.h"
+
+namespace Tegra {
+
+namespace {
+// HLE'd functions
+static void HLE_771BB18C62444DA0(Engines::Maxwell3D& maxwell3d,
+                                 const std::vector<u32>& parameters) {
+    const u32 instance_count = parameters[2] & maxwell3d.GetRegisterValue(0xD1B);
+
+    maxwell3d.regs.draw.topology.Assign(
+        static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0] &
+                                                                        ~(0x3ffffffU << 26)));
+    maxwell3d.regs.vb_base_instance = parameters[5];
+    maxwell3d.mme_draw.instance_count = instance_count;
+    maxwell3d.regs.vb_element_base = parameters[3];
+    maxwell3d.regs.index_array.count = parameters[1];
+    maxwell3d.regs.index_array.first = parameters[4];
+
+    if (maxwell3d.ShouldExecute()) {
+        maxwell3d.GetRasterizer().Draw(true, true);
+    }
+    maxwell3d.regs.index_array.count = 0;
+    maxwell3d.mme_draw.instance_count = 0;
+    maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined;
+}
+
+static void HLE_0D61FC9FAAC9FCAD(Engines::Maxwell3D& maxwell3d,
+                                 const std::vector<u32>& parameters) {
+    const u32 count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]);
+
+    maxwell3d.regs.vertex_buffer.first = parameters[3];
+    maxwell3d.regs.vertex_buffer.count = parameters[1];
+    maxwell3d.regs.vb_base_instance = parameters[4];
+    maxwell3d.regs.draw.topology.Assign(
+        static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0]));
+    maxwell3d.mme_draw.instance_count = count;
+
+    if (maxwell3d.ShouldExecute()) {
+        maxwell3d.GetRasterizer().Draw(false, true);
+    }
+    maxwell3d.regs.vertex_buffer.count = 0;
+    maxwell3d.mme_draw.instance_count = 0;
+    maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined;
+}
+
+static void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d,
+                                 const std::vector<u32>& parameters) {
+    const u32 instance_count = (maxwell3d.GetRegisterValue(0xD1B) & parameters[2]);
+    const u32 element_base = parameters[4];
+    const u32 base_instance = parameters[5];
+    maxwell3d.regs.index_array.first = parameters[3];
+    maxwell3d.regs.reg_array[0x446] = element_base; // vertex id base?
+    maxwell3d.regs.index_array.count = parameters[1];
+    maxwell3d.regs.vb_element_base = element_base;
+    maxwell3d.regs.vb_base_instance = base_instance;
+    maxwell3d.mme_draw.instance_count = instance_count;
+    maxwell3d.CallMethodFromMME(0x8e3, 0x640);
+    maxwell3d.CallMethodFromMME(0x8e4, element_base);
+    maxwell3d.CallMethodFromMME(0x8e5, base_instance);
+    maxwell3d.regs.draw.topology.Assign(
+        static_cast<Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology>(parameters[0]));
+    if (maxwell3d.ShouldExecute()) {
+        maxwell3d.GetRasterizer().Draw(true, true);
+    }
+    maxwell3d.regs.reg_array[0x446] = 0x0; // vertex id base?
+    maxwell3d.regs.index_array.count = 0;
+    maxwell3d.regs.vb_element_base = 0x0;
+    maxwell3d.regs.vb_base_instance = 0x0;
+    maxwell3d.mme_draw.instance_count = 0;
+    maxwell3d.CallMethodFromMME(0x8e3, 0x640);
+    maxwell3d.CallMethodFromMME(0x8e4, 0x0);
+    maxwell3d.CallMethodFromMME(0x8e5, 0x0);
+    maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined;
+}
+} // namespace
+
+constexpr std::array<std::pair<u64, HLEFunction>, 3> hle_funcs{{
+    std::make_pair<u64, HLEFunction>(0x771BB18C62444DA0, &HLE_771BB18C62444DA0),
+    std::make_pair<u64, HLEFunction>(0x0D61FC9FAAC9FCAD, &HLE_0D61FC9FAAC9FCAD),
+    std::make_pair<u64, HLEFunction>(0x0217920100488FF7, &HLE_0217920100488FF7),
+}};
+
+HLEMacro::HLEMacro(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}
+HLEMacro::~HLEMacro() = default;
+
+std::optional<std::unique_ptr<CachedMacro>> HLEMacro::GetHLEProgram(u64 hash) const {
+    const auto it = std::find_if(hle_funcs.cbegin(), hle_funcs.cend(),
+                                 [hash](const auto& pair) { return pair.first == hash; });
+    if (it == hle_funcs.end()) {
+        return std::nullopt;
+    }
+    return std::make_unique<HLEMacroImpl>(maxwell3d, it->second);
+}
+
+HLEMacroImpl::~HLEMacroImpl() = default;
+
+HLEMacroImpl::HLEMacroImpl(Engines::Maxwell3D& maxwell3d, HLEFunction func)
+    : maxwell3d(maxwell3d), func(func) {}
+
+void HLEMacroImpl::Execute(const std::vector<u32>& parameters, u32 method) {
+    func(maxwell3d, parameters);
+}
+
+} // namespace Tegra
--- a/src/video_core/macro/macro_hle.h
+++ b/src/video_core/macro/macro_hle.h
@@ -0,0 +1,44 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include <optional>
+#include <vector>
+#include "common/common_types.h"
+#include "video_core/macro/macro.h"
+
+namespace Tegra {
+
+namespace Engines {
+class Maxwell3D;
+}
+
+using HLEFunction = void (*)(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters);
+
+class HLEMacro {
+public:
+    explicit HLEMacro(Engines::Maxwell3D& maxwell3d);
+    ~HLEMacro();
+
+    std::optional<std::unique_ptr<CachedMacro>> GetHLEProgram(u64 hash) const;
+
+private:
+    Engines::Maxwell3D& maxwell3d;
+};
+
+class HLEMacroImpl : public CachedMacro {
+public:
+    explicit HLEMacroImpl(Engines::Maxwell3D& maxwell3d, HLEFunction func);
+    ~HLEMacroImpl();
+
+    void Execute(const std::vector<u32>& parameters, u32 method) override;
+
+private:
+    Engines::Maxwell3D& maxwell3d;
+    HLEFunction func;
+};
+
+} // namespace Tegra
--- a/src/video_core/macro/macro_interpreter.cpp
+++ b/src/video_core/macro/macro_interpreter.cpp
@@ -1,4 +1,4 @@
-// Copyright 2018 yuzu Emulator Project
+// Copyright 2020 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

@@ -6,109 +6,47 @@
 #include "common/logging/log.h"
 #include "common/microprofile.h"
 #include "video_core/engines/maxwell_3d.h"
-#include "video_core/macro_interpreter.h"
+#include "video_core/macro/macro_interpreter.h"

 MICROPROFILE_DEFINE(MacroInterp, "GPU", "Execute macro interpreter", MP_RGB(128, 128, 192));

 namespace Tegra {
-namespace {
-enum class Operation : u32 {
-    ALU = 0,
-    AddImmediate = 1,
-    ExtractInsert = 2,
-    ExtractShiftLeftImmediate = 3,
-    ExtractShiftLeftRegister = 4,
-    Read = 5,
-    Unused = 6, // This operation doesn't seem to be a valid encoding.
-    Branch = 7,
-};
-} // Anonymous namespace
+MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d)
+    : MacroEngine::MacroEngine(maxwell3d), maxwell3d(maxwell3d) {}

-enum class MacroInterpreter::ALUOperation : u32 {
-    Add = 0,
-    AddWithCarry = 1,
-    Subtract = 2,
-    SubtractWithBorrow = 3,
-    // Operations 4-7 don't seem to be valid encodings.
-    Xor = 8,
-    Or = 9,
-    And = 10,
-    AndNot = 11,
-    Nand = 12
-};
+std::unique_ptr<CachedMacro> MacroInterpreter::Compile(const std::vector<u32>& code) {
+    return std::make_unique<MacroInterpreterImpl>(maxwell3d, code);
+}

-enum class MacroInterpreter::ResultOperation : u32 {
-    IgnoreAndFetch = 0,
-    Move = 1,
-    MoveAndSetMethod = 2,
-    FetchAndSend = 3,
-    MoveAndSend = 4,
-    FetchAndSetMethod = 5,
-    MoveAndSetMethodFetchAndSend = 6,
-    MoveAndSetMethodSend = 7
-};
+MacroInterpreterImpl::MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d,
+                                           const std::vector<u32>& code)
+    : maxwell3d(maxwell3d), code(code) {}

-enum class MacroInterpreter::BranchCondition : u32 {
-    Zero = 0,
-    NotZero = 1,
-};
-
-union MacroInterpreter::Opcode {
-    u32 raw;
-    BitField<0, 3, Operation> operation;
-    BitField<4, 3, ResultOperation> result_operation;
-    BitField<4, 1, BranchCondition> branch_condition;
-    // If set on a branch, then the branch doesn't have a delay slot.
-    BitField<5, 1, u32> branch_annul;
-    BitField<7, 1, u32> is_exit;
-    BitField<8, 3, u32> dst;
-    BitField<11, 3, u32> src_a;
-    BitField<14, 3, u32> src_b;
-    // The signed immediate overlaps the second source operand and the alu operation.
-    BitField<14, 18, s32> immediate;
-
-    BitField<17, 5, ALUOperation> alu_operation;
-
-    // Bitfield instructions data
-    BitField<17, 5, u32> bf_src_bit;
-    BitField<22, 5, u32> bf_size;
-    BitField<27, 5, u32> bf_dst_bit;
-
-    u32 GetBitfieldMask() const {
-        return (1 << bf_size) - 1;
-    }
-
-    s32 GetBranchTarget() const {
-        return static_cast<s32>(immediate * sizeof(u32));
-    }
-};
-
-MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}
-
-void MacroInterpreter::Execute(u32 offset, std::size_t num_parameters, const u32* parameters) {
+void MacroInterpreterImpl::Execute(const std::vector<u32>& parameters, u32 method) {
    MICROPROFILE_SCOPE(MacroInterp);
    Reset();

    registers[1] = parameters[0];
+    num_parameters = parameters.size();

    if (num_parameters > parameters_capacity) {
        parameters_capacity = num_parameters;
        this->parameters = std::make_unique<u32[]>(num_parameters);
    }
-    std::memcpy(this->parameters.get(), parameters, num_parameters * sizeof(u32));
+    std::memcpy(this->parameters.get(), parameters.data(), num_parameters * sizeof(u32));
    this->num_parameters = num_parameters;

    // Execute the code until we hit an exit condition.
    bool keep_executing = true;
    while (keep_executing) {
-        keep_executing = Step(offset, false);
+        keep_executing = Step(false);
    }

    // Assert the the macro used all the input parameters
    ASSERT(next_parameter_index == num_parameters);
 }

-void MacroInterpreter::Reset() {
+void MacroInterpreterImpl::Reset() {
    registers = {};
    pc = 0;
    delayed_pc = {};
@@ -120,10 +58,10 @@ void MacroInterpreter::Reset() {
    carry_flag = false;
 }

-bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
+bool MacroInterpreterImpl::Step(bool is_delay_slot) {
    u32 base_address = pc;

-    Opcode opcode = GetOpcode(offset);
+    Macro::Opcode opcode = GetOpcode();
    pc += 4;

    // Update the program counter if we were delayed
@@ -134,18 +72,18 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
    }

    switch (opcode.operation) {
-    case Operation::ALU: {
+    case Macro::Operation::ALU: {
        u32 result = GetALUResult(opcode.alu_operation, GetRegister(opcode.src_a),
                                  GetRegister(opcode.src_b));
        ProcessResult(opcode.result_operation, opcode.dst, result);
        break;
    }
-    case Operation::AddImmediate: {
+    case Macro::Operation::AddImmediate: {
        ProcessResult(opcode.result_operation, opcode.dst,
                      GetRegister(opcode.src_a) + opcode.immediate);
        break;
    }
-    case Operation::ExtractInsert: {
+    case Macro::Operation::ExtractInsert: {
        u32 dst = GetRegister(opcode.src_a);
        u32 src = GetRegister(opcode.src_b);

@@ -155,7 +93,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
        ProcessResult(opcode.result_operation, opcode.dst, dst);
        break;
    }
-    case Operation::ExtractShiftLeftImmediate: {
+    case Macro::Operation::ExtractShiftLeftImmediate: {
        u32 dst = GetRegister(opcode.src_a);
        u32 src = GetRegister(opcode.src_b);

@@ -164,7 +102,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
        ProcessResult(opcode.result_operation, opcode.dst, result);
        break;
    }
-    case Operation::ExtractShiftLeftRegister: {
+    case Macro::Operation::ExtractShiftLeftRegister: {
        u32 dst = GetRegister(opcode.src_a);
        u32 src = GetRegister(opcode.src_b);

@@ -173,12 +111,12 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
        ProcessResult(opcode.result_operation, opcode.dst, result);
        break;
    }
-    case Operation::Read: {
+    case Macro::Operation::Read: {
        u32 result = Read(GetRegister(opcode.src_a) + opcode.immediate);
        ProcessResult(opcode.result_operation, opcode.dst, result);
        break;
    }
-    case Operation::Branch: {
+    case Macro::Operation::Branch: {
        ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid");
        u32 value = GetRegister(opcode.src_a);
        bool taken = EvaluateBranchCondition(opcode.branch_condition, value);
@@ -191,7 +129,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {

            delayed_pc = base_address + opcode.GetBranchTarget();
            // Execute one more instruction due to the delay slot.
-            return Step(offset, true);
+            return Step(true);
        }
        break;
    }
@@ -204,51 +142,44 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
    // cause an exit if it's executed inside a delay slot.
    if (opcode.is_exit && !is_delay_slot) {
        // Exit has a delay slot, execute the next instruction
-        Step(offset, true);
+        Step(true);
        return false;
    }

    return true;
 }

-MacroInterpreter::Opcode MacroInterpreter::GetOpcode(u32 offset) const {
-    const auto& macro_memory{maxwell3d.GetMacroMemory()};
-    ASSERT((pc % sizeof(u32)) == 0);
-    ASSERT((pc + offset) < macro_memory.size() * sizeof(u32));
-    return {macro_memory[offset + pc / sizeof(u32)]};
-}
-
-u32 MacroInterpreter::GetALUResult(ALUOperation operation, u32 src_a, u32 src_b) {
+u32 MacroInterpreterImpl::GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b) {
    switch (operation) {
-    case ALUOperation::Add: {
+    case Macro::ALUOperation::Add: {
        const u64 result{static_cast<u64>(src_a) + src_b};
        carry_flag = result > 0xffffffff;
        return static_cast<u32>(result);
    }
-    case ALUOperation::AddWithCarry: {
+    case Macro::ALUOperation::AddWithCarry: {
        const u64 result{static_cast<u64>(src_a) + src_b + (carry_flag ? 1ULL : 0ULL)};
        carry_flag = result > 0xffffffff;
        return static_cast<u32>(result);
    }
-    case ALUOperation::Subtract: {
+    case Macro::ALUOperation::Subtract: {
        const u64 result{static_cast<u64>(src_a) - src_b};
        carry_flag = result < 0x100000000;
        return static_cast<u32>(result);
    }
-    case ALUOperation::SubtractWithBorrow: {
+    case Macro::ALUOperation::SubtractWithBorrow: {
        const u64 result{static_cast<u64>(src_a) - src_b - (carry_flag ? 0ULL : 1ULL)};
        carry_flag = result < 0x100000000;
        return static_cast<u32>(result);
    }
-    case ALUOperation::Xor:
+    case Macro::ALUOperation::Xor:
        return src_a ^ src_b;
-    case ALUOperation::Or:
+    case Macro::ALUOperation::Or:
        return src_a | src_b;
-    case ALUOperation::And:
+    case Macro::ALUOperation::And:
        return src_a & src_b;
-    case ALUOperation::AndNot:
+    case Macro::ALUOperation::AndNot:
        return src_a & ~src_b;
-    case ALUOperation::Nand:
+    case Macro::ALUOperation::Nand:
        return ~(src_a & src_b);

    default:
@@ -257,43 +188,43 @@ u32 MacroInterpreter::GetALUResult(ALUOperation operation, u32 src_a, u32 src_b)
    }
 }

-void MacroInterpreter::ProcessResult(ResultOperation operation, u32 reg, u32 result) {
+void MacroInterpreterImpl::ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result) {
    switch (operation) {
-    case ResultOperation::IgnoreAndFetch:
+    case Macro::ResultOperation::IgnoreAndFetch:
        // Fetch parameter and ignore result.
        SetRegister(reg, FetchParameter());
        break;
-    case ResultOperation::Move:
+    case Macro::ResultOperation::Move:
        // Move result.
        SetRegister(reg, result);
        break;
-    case ResultOperation::MoveAndSetMethod:
+    case Macro::ResultOperation::MoveAndSetMethod:
        // Move result and use as Method Address.
        SetRegister(reg, result);
        SetMethodAddress(result);
        break;
-    case ResultOperation::FetchAndSend:
+    case Macro::ResultOperation::FetchAndSend:
        // Fetch parameter and send result.
        SetRegister(reg, FetchParameter());
        Send(result);
        break;
-    case ResultOperation::MoveAndSend:
+    case Macro::ResultOperation::MoveAndSend:
        // Move and send result.
        SetRegister(reg, result);
        Send(result);
        break;
-    case ResultOperation::FetchAndSetMethod:
+    case Macro::ResultOperation::FetchAndSetMethod:
        // Fetch parameter and use result as Method Address.
        SetRegister(reg, FetchParameter());
        SetMethodAddress(result);
        break;
-    case ResultOperation::MoveAndSetMethodFetchAndSend:
+    case Macro::ResultOperation::MoveAndSetMethodFetchAndSend:
        // Move result and use as Method Address, then fetch and send parameter.
        SetRegister(reg, result);
        SetMethodAddress(result);
        Send(FetchParameter());
        break;
-    case ResultOperation::MoveAndSetMethodSend:
+    case Macro::ResultOperation::MoveAndSetMethodSend:
        // Move result and use as Method Address, then send bits 12:17 of result.
        SetRegister(reg, result);
        SetMethodAddress(result);
@@ -304,16 +235,28 @@ void MacroInterpreter::ProcessResult(ResultOperation operation, u32 reg, u32 res
    }
 }

-u32 MacroInterpreter::FetchParameter() {
-    ASSERT(next_parameter_index < num_parameters);
-    return parameters[next_parameter_index++];
+bool MacroInterpreterImpl::EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const {
+    switch (cond) {
+    case Macro::BranchCondition::Zero:
+        return value == 0;
+    case Macro::BranchCondition::NotZero:
+        return value != 0;
+    }
+    UNREACHABLE();
+    return true;
 }

-u32 MacroInterpreter::GetRegister(u32 register_id) const {
+Macro::Opcode MacroInterpreterImpl::GetOpcode() const {
+    ASSERT((pc % sizeof(u32)) == 0);
+    ASSERT(pc < code.size() * sizeof(u32));
+    return {code[pc / sizeof(u32)]};
+}
+
+u32 MacroInterpreterImpl::GetRegister(u32 register_id) const {
    return registers.at(register_id);
 }

-void MacroInterpreter::SetRegister(u32 register_id, u32 value) {
+void MacroInterpreterImpl::SetRegister(u32 register_id, u32 value) {
    // Register 0 is hardwired as the zero register.
    // Ensure no writes to it actually occur.
    if (register_id == 0) {
@@ -323,30 +266,24 @@ void MacroInterpreter::SetRegister(u32 register_id, u32 value) {
    registers.at(register_id) = value;
 }

-void MacroInterpreter::SetMethodAddress(u32 address) {
+void MacroInterpreterImpl::SetMethodAddress(u32 address) {
    method_address.raw = address;
 }

-void MacroInterpreter::Send(u32 value) {
+void MacroInterpreterImpl::Send(u32 value) {
    maxwell3d.CallMethodFromMME(method_address.address, value);
    // Increment the method address by the method increment.
    method_address.address.Assign(method_address.address.Value() +
                                  method_address.increment.Value());
 }

-u32 MacroInterpreter::Read(u32 method) const {
+u32 MacroInterpreterImpl::Read(u32 method) const {
    return maxwell3d.GetRegisterValue(method);
 }

-bool MacroInterpreter::EvaluateBranchCondition(BranchCondition cond, u32 value) const {
-    switch (cond) {
-    case BranchCondition::Zero:
-        return value == 0;
-    case BranchCondition::NotZero:
-        return value != 0;
-    }
-    UNREACHABLE();
-    return true;
+u32 MacroInterpreterImpl::FetchParameter() {
+    ASSERT(next_parameter_index < num_parameters);
+    return parameters[next_parameter_index++];
 }

 } // namespace Tegra
--- a/src/video_core/macro/macro_interpreter.h
+++ b/src/video_core/macro/macro_interpreter.h
@@ -1,44 +1,37 @@
-// Copyright 2018 yuzu Emulator Project
+// Copyright 2020 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

 #pragma once
-
 #include <array>
 #include <optional>
-
+#include <vector>
 #include "common/bit_field.h"
 #include "common/common_types.h"
+#include "video_core/macro/macro.h"

 namespace Tegra {
 namespace Engines {
 class Maxwell3D;
 }

-class MacroInterpreter final {
+class MacroInterpreter final : public MacroEngine {
 public:
    explicit MacroInterpreter(Engines::Maxwell3D& maxwell3d);

-    /**
-     * Executes the macro code with the specified input parameters.
-     * @param offset Offset to start execution at.
-     * @param parameters The parameters of the macro.
-     */
-    void Execute(u32 offset, std::size_t num_parameters, const u32* parameters);
+protected:
+    std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override;

 private:
-    enum class ALUOperation : u32;
-    enum class BranchCondition : u32;
-    enum class ResultOperation : u32;
+    Engines::Maxwell3D& maxwell3d;
+};

-    union Opcode;
-
-    union MethodAddress {
-        u32 raw;
-        BitField<0, 12, u32> address;
-        BitField<12, 6, u32> increment;
-    };
+class MacroInterpreterImpl : public CachedMacro {
+public:
+    MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code);
+    void Execute(const std::vector<u32>& parameters, u32 method) override;

+private:
    /// Resets the execution engine state, zeroing registers, etc.
    void Reset();

@@ -49,20 +42,20 @@ private:
     * @param is_delay_slot Whether the current step is being executed due to a delay slot in a
     * previous instruction.
     */
-    bool Step(u32 offset, bool is_delay_slot);
+    bool Step(bool is_delay_slot);

    /// Calculates the result of an ALU operation. src_a OP src_b;
-    u32 GetALUResult(ALUOperation operation, u32 src_a, u32 src_b);
+    u32 GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b);

    /// Performs the result operation on the input result and stores it in the specified register
    /// (if necessary).
-    void ProcessResult(ResultOperation operation, u32 reg, u32 result);
+    void ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result);

    /// Evaluates the branch condition and returns whether the branch should be taken or not.
-    bool EvaluateBranchCondition(BranchCondition cond, u32 value) const;
+    bool EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const;

    /// Reads an opcode at the current program counter location.
-    Opcode GetOpcode(u32 offset) const;
+    Macro::Opcode GetOpcode() const;

    /// Returns the specified register's value. Register 0 is hardcoded to always return 0.
    u32 GetRegister(u32 register_id) const;
@@ -89,13 +82,11 @@ private:
    /// Program counter to execute at after the delay slot is executed.
    std::optional<u32> delayed_pc;

-    static constexpr std::size_t NumMacroRegisters = 8;
-
    /// General purpose macro registers.
-    std::array<u32, NumMacroRegisters> registers = {};
+    std::array<u32, Macro::NUM_MACRO_REGISTERS> registers = {};

    /// Method address to use for the next Send instruction.
-    MethodAddress method_address = {};
+    Macro::MethodAddress method_address = {};

    /// Input parameters of the current macro.
    std::unique_ptr<u32[]> parameters;
@@ -105,5 +96,7 @@ private:
    u32 next_parameter_index = 0;

    bool carry_flag = false;
+    const std::vector<u32>& code;
 };
+
 } // namespace Tegra
--- a/src/video_core/macro/macro_jit_x64.cpp
+++ b/src/video_core/macro/macro_jit_x64.cpp
@@ -0,0 +1,621 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+#include "common/logging/log.h"
+#include "common/microprofile.h"
+#include "common/x64/xbyak_util.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/macro/macro_interpreter.h"
+#include "video_core/macro/macro_jit_x64.h"
+
+MICROPROFILE_DEFINE(MacroJitCompile, "GPU", "Compile macro JIT", MP_RGB(173, 255, 47));
+MICROPROFILE_DEFINE(MacroJitExecute, "GPU", "Execute macro JIT", MP_RGB(255, 255, 0));
+
+namespace Tegra {
+static const Xbyak::Reg64 STATE = Xbyak::util::rbx; // Holds the JITState* passed as the program's first argument
+static const Xbyak::Reg32 RESULT = Xbyak::util::ebp; // Result of the macro instruction currently being emitted
+static const Xbyak::Reg64 PARAMETERS = Xbyak::util::r12; // Cursor into the parameter array (second program argument)
+static const Xbyak::Reg32 METHOD_ADDRESS = Xbyak::util::r14d; // Raw method address word consumed by Compile_Send
+static const Xbyak::Reg64 BRANCH_HOLDER = Xbyak::util::r15; // Pending delayed-branch target or exit marker; zero means none
+
+static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({ // Registers that hold JIT state across emitted instructions
+    STATE,
+    RESULT,
+    PARAMETERS,
+    METHOD_ADDRESS,
+    BRANCH_HOLDER,
+});
+
+// Builds the JIT macro engine; the Maxwell3D reference is kept so it can be handed to each
+// macro compiled later.
+MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d)
+    : MacroEngine(maxwell3d), maxwell3d(maxwell3d) {}
+
+// Compiles the given macro words into a natively executable macro object.
+std::unique_ptr<CachedMacro> MacroJITx64::Compile(const std::vector<u32>& code) {
+    auto compiled_macro = std::make_unique<MacroJITx64Impl>(maxwell3d, code);
+    return compiled_macro;
+}
+
+MacroJITx64Impl::MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code)
+    : Xbyak::CodeGenerator(MAX_CODE_SIZE), code(code), maxwell3d(maxwell3d) { // Both arguments are stored as references, not copies
+    Compile(); // Native code is generated eagerly; Execute() only runs the finished program
+}
+
+MacroJITx64Impl::~MacroJITx64Impl() = default; // Defined out of line; performs no custom cleanup
+
+// Runs the compiled program against a fresh, zeroed JIT state. `method` is not used here.
+void MacroJITx64Impl::Execute(const std::vector<u32>& parameters, u32 method) {
+    MICROPROFILE_SCOPE(MacroJitExecute);
+    ASSERT_OR_EXECUTE(program != nullptr, { return; });
+
+    // Value-initialization already zeroes the register file and the carry flag.
+    JITState state{};
+    state.maxwell3d = &maxwell3d;
+
+    const u32* const parameter_data = parameters.data();
+    program(&state, parameter_data);
+}
+
+/// Emits code for an ALU instruction: RESULT = src_a OP src_b, followed by the result operation.
+///
+/// Both operands are always materialised (register 0 is emitted as a self-xor by
+/// Compile_GetRegister) and the operation is always emitted. The earlier zero-register
+/// shortcut skipped the operand loads and the operation whenever either source was register 0,
+/// which left RESULT or eax holding whatever a previous instruction had put there (for example
+/// `r0 + rB` never moved rB into RESULT, and `rA & r0` did not produce 0).
+void MacroJITx64Impl::Compile_ALU(Macro::Opcode opcode) {
+    const auto src_a = Compile_GetRegister(opcode.src_a, RESULT);
+    const auto src_b = Compile_GetRegister(opcode.src_b, eax);
+
+    // Stores the host carry flag into JITState::carry_flag. Must directly follow the
+    // arithmetic instruction whose carry is being captured.
+    const auto store_carry = [this] { setc(byte[STATE + offsetof(JITState, carry_flag)]); };
+    // Loads JITState::carry_flag into the host carry flag.
+    const auto load_carry = [this] { bt(dword[STATE + offsetof(JITState, carry_flag)], 0); };
+
+    switch (opcode.alu_operation) {
+    case Macro::ALUOperation::Add:
+        add(src_a, src_b);
+        // Only track the carry when some instruction in this macro consumes it.
+        if (!optimizer.can_skip_carry) {
+            store_carry();
+        }
+        break;
+    case Macro::ALUOperation::AddWithCarry:
+        load_carry();
+        adc(src_a, src_b);
+        store_carry();
+        break;
+    case Macro::ALUOperation::Subtract:
+        sub(src_a, src_b);
+        if (!optimizer.can_skip_carry) {
+            store_carry();
+        }
+        break;
+    case Macro::ALUOperation::SubtractWithBorrow:
+        load_carry();
+        sbb(src_a, src_b);
+        store_carry();
+        break;
+    case Macro::ALUOperation::Xor:
+        xor_(src_a, src_b);
+        break;
+    case Macro::ALUOperation::Or:
+        or_(src_a, src_b);
+        break;
+    case Macro::ALUOperation::And:
+        and_(src_a, src_b);
+        break;
+    case Macro::ALUOperation::AndNot:
+        // src_a & ~src_b
+        not_(src_b);
+        and_(src_a, src_b);
+        break;
+    case Macro::ALUOperation::Nand:
+        // ~(src_a & src_b)
+        and_(src_a, src_b);
+        not_(src_a);
+        break;
+    default:
+        UNIMPLEMENTED_MSG("Unimplemented ALU operation {}",
+                          static_cast<std::size_t>(opcode.alu_operation.Value()));
+        break;
+    }
+    Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+/// Emits code for AddImmediate: RESULT = src_a + immediate, followed by the result operation.
+///
+/// Two optimizer shortcuts may suppress emission entirely:
+///  - a plain Move into register 0 is a no-op and is skipped;
+///  - a MoveAndSetMethod that the very next instruction fully overwrites is skipped, but only
+///    when that is provably safe (see below).
+void MacroJITx64Impl::Compile_AddImmediate(Macro::Opcode opcode) {
+    if (optimizer.skip_dummy_addimmediate) {
+        // Games tend to use this as an exit instruction placeholder. It's to encode an instruction
+        // without doing anything. In our case we can just not emit anything.
+        if (opcode.result_operation == Macro::ResultOperation::Move && opcode.dst == 0) {
+            return;
+        }
+    }
+    // Check for redundant moves
+    if (optimizer.optimize_for_method_move &&
+        opcode.result_operation == Macro::ResultOperation::MoveAndSetMethod) {
+        if (next_opcode.has_value()) {
+            const auto next = *next_opcode;
+            // Branches never go through Compile_ProcessResult, so their result_operation and
+            // dst fields say nothing about what they write.
+            const bool next_overwrites = next.operation != Macro::Operation::Branch &&
+                                         next.result_operation ==
+                                             Macro::ResultOperation::MoveAndSetMethod &&
+                                         opcode.dst == next.dst;
+            // If the next instruction reads the register written here, this write is live.
+            // src_b is checked for every operation, which can only make the check stricter.
+            const bool next_reads_dst =
+                opcode.dst != 0 && (next.src_a == opcode.dst || next.src_b == opcode.dst);
+            // Directly after a branch this instruction may be a delay slot, in which case the
+            // textually next instruction is not necessarily the one executed next.
+            bool follows_branch = false;
+            if (pc > 0) {
+                Macro::Opcode previous{};
+                previous.raw = code[pc - 1];
+                follows_branch = previous.operation == Macro::Operation::Branch;
+            }
+            if (next_overwrites && !next_reads_dst && !follows_branch) {
+                return;
+            }
+        }
+    }
+    if (optimizer.zero_reg_skip && opcode.src_a == 0) {
+        // 0 + immediate: load the constant directly.
+        if (opcode.immediate == 0) {
+            xor_(RESULT, RESULT);
+        } else {
+            mov(RESULT, opcode.immediate);
+        }
+    } else {
+        auto result = Compile_GetRegister(opcode.src_a, RESULT);
+        // Pick the shortest encoding. Every non-zero immediate must emit something; the
+        // previous `> 2` test silently dropped an immediate of exactly 2.
+        if (opcode.immediate == 1) {
+            inc(result);
+        } else if (opcode.immediate > 0) {
+            add(result, opcode.immediate);
+        } else if (opcode.immediate < 0) {
+            sub(result, opcode.immediate * -1);
+        }
+    }
+    Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+/// Emits code for ExtractInsert: takes the bitfield of src_b that starts at bf_src_bit and
+/// inserts it into src_a at bf_dst_bit, leaving the combined value in RESULT.
+///
+/// Shift amounts and field sizes of 31 are handled like any other value. They were previously
+/// special-cased into clearing the source (or skipping the mask), which disagreed with the
+/// clear mask applied to the destination in this same function.
+void MacroJITx64Impl::Compile_ExtractInsert(Macro::Opcode opcode) {
+    auto dst = Compile_GetRegister(opcode.src_a, RESULT);
+    auto src = Compile_GetRegister(opcode.src_b, eax);
+
+    const u32 field_mask = opcode.GetBitfieldMask();
+
+    // Isolate the source field in the low bits of src...
+    if (opcode.bf_src_bit != 0) {
+        shr(src, opcode.bf_src_bit);
+    }
+    and_(src, field_mask);
+    // ...and move it to its destination position.
+    if (opcode.bf_dst_bit != 0) {
+        shl(src, opcode.bf_dst_bit);
+    }
+
+    // Clear the destination field before merging; skip the AND when nothing would be cleared.
+    const u32 mask = ~(field_mask << opcode.bf_dst_bit);
+    if (mask != 0xffffffff) {
+        and_(dst, mask);
+    }
+    or_(dst, src);
+    Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+/// Emits code for ExtractShiftLeftImmediate:
+///   RESULT = ((src_b >> src_a) & bitfield_mask) << bf_dst_bit
+/// The right-shift amount comes from a register (via cl); the left shift is an immediate.
+///
+/// A field size or destination bit of 31 is no longer special-cased: a 31-bit field still
+/// needs masking, and a left shift by 31 keeps bit 0 rather than clearing the value.
+void MacroJITx64Impl::Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode) {
+    // The variable shift count has to live in cl.
+    const auto dst = Compile_GetRegister(opcode.src_a, ecx);
+    const auto src = Compile_GetRegister(opcode.src_b, RESULT);
+
+    shr(src, dst.cvt8());
+    and_(src, opcode.GetBitfieldMask());
+    if (opcode.bf_dst_bit != 0) {
+        shl(src, opcode.bf_dst_bit);
+    }
+    Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+/// Emits code for ExtractShiftLeftRegister:
+///   RESULT = ((src_b >> bf_src_bit) & bitfield_mask) << src_a
+/// The right shift is an immediate; the left-shift amount comes from a register (via cl).
+///
+/// The mask is always applied. Skipping it for a field size of 31 let bit 31 of the shifted
+/// source leak into the result.
+void MacroJITx64Impl::Compile_ExtractShiftLeftRegister(Macro::Opcode opcode) {
+    // The variable shift count has to live in cl.
+    const auto dst = Compile_GetRegister(opcode.src_a, ecx);
+    const auto src = Compile_GetRegister(opcode.src_b, RESULT);
+
+    if (opcode.bf_src_bit != 0) {
+        shr(src, opcode.bf_src_bit);
+    }
+    and_(src, opcode.GetBitfieldMask());
+    shl(src, dst.cvt8());
+
+    Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+/// Emits code for Read: RESULT = maxwell3d.regs.reg_array[src_a + immediate], followed by the
+/// result operation. The register file is read directly from memory instead of calling into
+/// Maxwell3D.
+void MacroJITx64Impl::Compile_Read(Macro::Opcode opcode) {
+    // First compute the register index (src_a + immediate) in RESULT.
+    if (optimizer.zero_reg_skip && opcode.src_a == 0) {
+        if (opcode.immediate == 0) {
+            xor_(RESULT, RESULT);
+        } else {
+            mov(RESULT, opcode.immediate);
+        }
+    } else {
+        auto result = Compile_GetRegister(opcode.src_a, RESULT);
+        // Every non-zero immediate must emit something; the previous `> 2` test silently
+        // dropped an immediate of exactly 2 and so read the wrong register.
+        if (opcode.immediate == 1) {
+            inc(result);
+        } else if (opcode.immediate > 0) {
+            add(result, opcode.immediate);
+        } else if (opcode.immediate < 0) {
+            sub(result, opcode.immediate * -1);
+        }
+    }
+
+    // Equivalent to Engines::Maxwell3D::GetRegisterValue:
+    if (optimizer.enable_asserts) {
+        // Trap if the index is outside the register array.
+        Xbyak::Label pass_range_check;
+        cmp(RESULT, static_cast<u32>(Engines::Maxwell3D::Regs::NUM_REGS));
+        jb(pass_range_check);
+        int3();
+        L(pass_range_check);
+    }
+    // JITState::maxwell3d sits at offset 0 of the state block.
+    mov(rax, qword[STATE]);
+    mov(RESULT,
+        dword[rax + offsetof(Engines::Maxwell3D, regs) +
+              offsetof(Engines::Maxwell3D::Regs, reg_array) + RESULT.cvt64() * sizeof(u32)]);
+
+    Compile_ProcessResult(opcode.result_operation, opcode.dst);
+}
+
+// Host-side trampoline called from JITted code: forwards a macro Send to Maxwell3D using only
+// the address field of the method word.
+static void Send(Engines::Maxwell3D* maxwell3d, Macro::MethodAddress method_address, u32 value) {
+    const u32 target_method = method_address.address;
+    maxwell3d->CallMethodFromMME(target_method, value);
+}
+
+void Tegra::MacroJITx64Impl::Compile_Send(Xbyak::Reg32 value) { // Emits a call to Send(), then advances METHOD_ADDRESS by its increment field
+    Common::X64::ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); // Preserve the caller-saved JIT state registers across the call
+    mov(Common::X64::ABI_PARAM1, qword[STATE]); // Maxwell3D pointer (JITState::maxwell3d is at offset 0)
+    mov(Common::X64::ABI_PARAM2, METHOD_ADDRESS); // Whole method word; Send() itself uses only its address field
+    mov(Common::X64::ABI_PARAM3, value);
+    Common::X64::CallFarFunction(*this, &Send);
+    Common::X64::ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
+
+    Xbyak::Label dont_process{};
+    // Test the increment field (bits 12-17)
+    test(METHOD_ADDRESS, 0x3f000);
+    // If zero, method address doesn't update
+    je(dont_process);
+
+    mov(ecx, METHOD_ADDRESS);
+    and_(METHOD_ADDRESS, 0xfff); // METHOD_ADDRESS = address field (bits 0-11)
+    shr(ecx, 12);
+    and_(ecx, 0x3f); // ecx = increment field
+    lea(eax, ptr[rcx + METHOD_ADDRESS.cvt64()]); // eax = address + increment
+    sal(ecx, 12);
+    or_(eax, ecx); // Re-attach the increment. NOTE(review): the sum is not masked to 12 bits, unlike the interpreter's address.Assign() - confirm it cannot overflow
+
+    mov(METHOD_ADDRESS, eax);
+
+    L(dont_process);
+}
+
+void Tegra::MacroJITx64Impl::Compile_Branch(Macro::Opcode opcode) { // Emits a conditional branch on src_a, with optional delay slot handling
+    ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid"); // NOTE(review): is_delay_slot is never assigned in this file, so this check looks inert - confirm
+    const s32 jump_address =
+        static_cast<s32>(pc) + static_cast<s32>(opcode.GetBranchTarget() / sizeof(s32)); // Word index of the target. NOTE(review): not range-checked before indexing labels
+
+    Xbyak::Label end;
+    auto value = Compile_GetRegister(opcode.src_a, eax);
+    test(value, value); // Sets ZF for the conditional jumps below
+    if (optimizer.has_delayed_pc) { // The macro contains at least one non-annulled branch
+        switch (opcode.branch_condition) { // Skip to `end` when the branch is NOT taken
+        case Macro::BranchCondition::Zero:
+            jne(end, T_NEAR);
+            break;
+        case Macro::BranchCondition::NotZero:
+            je(end, T_NEAR);
+            break;
+        }
+
+        if (opcode.branch_annul) { // Annulled branch: no delay slot, jump straight to the target
+            xor_(BRANCH_HOLDER, BRANCH_HOLDER);
+            jmp(labels[jump_address], T_NEAR);
+        } else {
+            Xbyak::Label handle_post_exit{};
+            Xbyak::Label skip{};
+            jmp(skip, T_NEAR); // Jump over the stub emitted next; it is only reached through BRANCH_HOLDER
+            if (opcode.is_exit) {
+                L(handle_post_exit);
+                // Execute 1 instruction
+                mov(BRANCH_HOLDER, end_of_code);
+                // Jump to next instruction to skip delay slot check
+                jmp(labels[jump_address], T_NEAR);
+            } else {
+                L(handle_post_exit);
+                xor_(BRANCH_HOLDER, BRANCH_HOLDER);
+                jmp(labels[jump_address], T_NEAR);
+            }
+            L(skip);
+            mov(BRANCH_HOLDER, handle_post_exit); // Record the stub as the pending branch, to be taken after the delay slot
+            jmp(delay_skip[pc], T_NEAR); // delay_skip[pc] is bound after this instruction's own pending-branch check in Compile_NextInstruction
+        }
+    } else { // No delay slots anywhere in the macro: branch directly
+        switch (opcode.branch_condition) {
+        case Macro::BranchCondition::Zero:
+            je(labels[jump_address], T_NEAR);
+            break;
+        case Macro::BranchCondition::NotZero:
+            jne(labels[jump_address], T_NEAR);
+            break;
+        }
+    }
+
+    L(end);
+}
+
+// Pre-scans the macro to decide which bookkeeping the emitted code can leave out:
+//  - carry tracking is only needed if some ALU instruction consumes the carry flag;
+//  - delay slot handling is only needed if some branch is not annulled.
+void Tegra::MacroJITx64Impl::Optimizer_ScanFlags() {
+    bool carry_is_consumed = false;
+    bool has_delay_slot_branch = false;
+
+    for (const u32 word : code) {
+        Macro::Opcode decoded{};
+        decoded.raw = word;
+
+        if (decoded.operation == Macro::Operation::ALU) {
+            const bool reads_carry =
+                decoded.alu_operation == Macro::ALUOperation::AddWithCarry ||
+                decoded.alu_operation == Macro::ALUOperation::SubtractWithBorrow;
+            carry_is_consumed = carry_is_consumed || reads_carry;
+        } else if (decoded.operation == Macro::Operation::Branch) {
+            has_delay_slot_branch = has_delay_slot_branch || !decoded.branch_annul;
+        }
+    }
+
+    optimizer.can_skip_carry = !carry_is_consumed;
+    optimizer.has_delayed_pc = has_delay_slot_branch;
+}
+
+void MacroJITx64Impl::Compile() { // Emits prologue, one code sequence per macro word, and epilogue; then publishes `program`
+    MICROPROFILE_SCOPE(MacroJitCompile);
+    bool keep_executing = true; // NOTE(review): never read in this function
+    labels.fill(Xbyak::Label());
+
+    Common::X64::ABI_PushRegistersAndAdjustStack(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8);
+    // JIT state
+    mov(STATE, Common::X64::ABI_PARAM1);
+    mov(PARAMETERS, Common::X64::ABI_PARAM2);
+    xor_(RESULT, RESULT);
+    xor_(METHOD_ADDRESS, METHOD_ADDRESS);
+    xor_(BRANCH_HOLDER, BRANCH_HOLDER);
+
+    mov(dword[STATE + offsetof(JITState, registers) + 4], Compile_FetchParameter()); // Store the first parameter in registers[1] (byte offset 4)
+
+    // Track get register for zero registers and mark it as no-op
+    optimizer.zero_reg_skip = true;
+
+    // AddImmediate tends to be used as a NOP instruction, if we detect this we can
+    // completely skip the entire code path and not emit anything
+    optimizer.skip_dummy_addimmediate = true;
+
+    // SMO tends to emit a lot of unnecessary method moves, we can mitigate this by only emitting
+    // one if our register isn't "dirty"
+    optimizer.optimize_for_method_move = true;
+
+    // Enable run-time assertions in JITted code
+    optimizer.enable_asserts = false;
+
+    // Check to see if we can skip emitting certain instructions
+    Optimizer_ScanFlags();
+
+    const u32 op_count = static_cast<u32>(code.size());
+    for (u32 i = 0; i < op_count; i++) {
+        if (i < op_count - 1) { // Peek at the following word so the emitters can look one instruction ahead
+            pc = i + 1;
+            next_opcode = GetOpCode();
+        } else {
+            next_opcode = {};
+        }
+        pc = i;
+        Compile_NextInstruction(); // The bool result is ignored here
+    }
+
+    L(end_of_code);
+
+    Common::X64::ABI_PopRegistersAndAdjustStack(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8);
+    ret();
+    ready();
+    program = getCode<ProgramType>();
+}
+
+bool MacroJITx64Impl::Compile_NextInstruction() { // Emits the instruction at pc plus its exit / pending-branch check; returns false after an exit or if nothing was emitted
+    const auto opcode = GetOpCode();
+    if (labels[pc].getAddress()) { // Presumably non-null only once the label has been bound, i.e. this pc was already emitted
+        return false;
+    }
+
+    L(labels[pc]); // Branch target for this instruction
+
+    switch (opcode.operation) {
+    case Macro::Operation::ALU:
+        Compile_ALU(opcode);
+        break;
+    case Macro::Operation::AddImmediate:
+        Compile_AddImmediate(opcode);
+        break;
+    case Macro::Operation::ExtractInsert:
+        Compile_ExtractInsert(opcode);
+        break;
+    case Macro::Operation::ExtractShiftLeftImmediate:
+        Compile_ExtractShiftLeftImmediate(opcode);
+        break;
+    case Macro::Operation::ExtractShiftLeftRegister:
+        Compile_ExtractShiftLeftRegister(opcode);
+        break;
+    case Macro::Operation::Read:
+        Compile_Read(opcode);
+        break;
+    case Macro::Operation::Branch:
+        Compile_Branch(opcode);
+        break;
+    default:
+        UNIMPLEMENTED_MSG("Unimplemented opcode {}", opcode.operation.Value());
+        break;
+    }
+
+    if (optimizer.has_delayed_pc) { // BRANCH_HOLDER holds a code address (or zero) in this mode
+        if (opcode.is_exit) {
+            mov(rax, end_of_code);
+            test(BRANCH_HOLDER, BRANCH_HOLDER);
+            cmove(BRANCH_HOLDER, rax); // With no branch pending, schedule a jump to end_of_code after the next instruction
+            // Jump to next instruction to skip delay slot check
+            je(labels[pc + 1], T_NEAR);
+        } else {
+            // TODO(ogniK): Optimize delay slot branching
+            Xbyak::Label no_delay_slot{};
+            test(BRANCH_HOLDER, BRANCH_HOLDER);
+            je(no_delay_slot, T_NEAR);
+            mov(rax, BRANCH_HOLDER); // A branch is pending: clear the holder and take it
+            xor_(BRANCH_HOLDER, BRANCH_HOLDER);
+            jmp(rax);
+            L(no_delay_slot);
+        }
+        L(delay_skip[pc]); // Entry point used by Compile_Branch to bypass the check above
+        if (opcode.is_exit) {
+            return false;
+        }
+    } else { // Without delay slots BRANCH_HOLDER is only an "exit requested" flag
+        test(BRANCH_HOLDER, BRANCH_HOLDER);
+        jne(end_of_code, T_NEAR);
+        if (opcode.is_exit) {
+            inc(BRANCH_HOLDER); // Makes the check after the following instruction leave the program
+            return false;
+        }
+    }
+    return true;
+}
+
+Xbyak::Reg32 Tegra::MacroJITx64Impl::Compile_FetchParameter() { // Emits a load of the next parameter into eax and advances the cursor
+    mov(eax, dword[PARAMETERS]); // No bounds check is emitted here
+    add(PARAMETERS, sizeof(u32));
+    return eax; // The fetched value is only valid until eax is next written
+}
+
+// Emits code that loads macro register `index` into the host register `dst` and returns `dst`.
+Xbyak::Reg32 MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg32 dst) {
+    if (index != 0) {
+        // Ordinary register: read it from the register file inside JITState.
+        mov(dst, dword[STATE + offsetof(JITState, registers) + index * sizeof(u32)]);
+        return dst;
+    }
+
+    // Register 0 is always zero, so no memory access is needed.
+    xor_(dst, dst);
+    return dst;
+}
+
+void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u32 reg) { // Emits the write-back / method-address / send step for the value in RESULT
+    const auto SetRegister = [this](u32 reg, const Xbyak::Reg32& result) { // Emits a store of `result` into macro register `reg`
+        // Register 0 is supposed to always return 0. NOP is implemented as a store to the zero
+        // register.
+        if (reg == 0) {
+            return;
+        }
+        mov(dword[STATE + offsetof(JITState, registers) + reg * sizeof(u32)], result);
+    };
+    const auto SetMethodAddress = [this](const Xbyak::Reg32& reg) { mov(METHOD_ADDRESS, reg); };
+
+    switch (operation) {
+    case Macro::ResultOperation::IgnoreAndFetch: // RESULT is discarded; the register receives the next parameter
+        SetRegister(reg, Compile_FetchParameter());
+        break;
+    case Macro::ResultOperation::Move:
+        SetRegister(reg, RESULT);
+        break;
+    case Macro::ResultOperation::MoveAndSetMethod:
+        SetRegister(reg, RESULT);
+        SetMethodAddress(RESULT);
+        break;
+    case Macro::ResultOperation::FetchAndSend:
+        // Fetch parameter and send result.
+        SetRegister(reg, Compile_FetchParameter());
+        Compile_Send(RESULT);
+        break;
+    case Macro::ResultOperation::MoveAndSend:
+        // Move and send result.
+        SetRegister(reg, RESULT);
+        Compile_Send(RESULT);
+        break;
+    case Macro::ResultOperation::FetchAndSetMethod:
+        // Fetch parameter and use result as Method Address.
+        SetRegister(reg, Compile_FetchParameter());
+        SetMethodAddress(RESULT);
+        break;
+    case Macro::ResultOperation::MoveAndSetMethodFetchAndSend:
+        // Move result and use as Method Address, then fetch and send parameter.
+        SetRegister(reg, RESULT);
+        SetMethodAddress(RESULT);
+        Compile_Send(Compile_FetchParameter()); // The fetched parameter is handed over in eax
+        break;
+    case Macro::ResultOperation::MoveAndSetMethodSend:
+        // Move result and use as Method Address, then send bits 12:17 of result.
+        SetRegister(reg, RESULT);
+        SetMethodAddress(RESULT);
+        shr(RESULT, 12); // RESULT itself is overwritten with the extracted bits
+        and_(RESULT, 0b111111);
+        Compile_Send(RESULT);
+        break;
+    default:
+        UNIMPLEMENTED_MSG("Unimplemented macro operation {}", static_cast<std::size_t>(operation));
+    }
+}
+
+Macro::Opcode MacroJITx64Impl::GetOpCode() const { // Decodes the macro word at pc; pc is a word index here, not a byte offset
+    ASSERT(pc < code.size());
+    return {code[pc]};
+}
+
+std::bitset<32> MacroJITx64Impl::PersistentCallerSavedRegs() const { // JIT state registers that a called function is allowed to clobber
+    return PERSISTENT_REGISTERS & Common::X64::ABI_ALL_CALLER_SAVED;
+}
+
+} // namespace Tegra
--- a/src/video_core/macro/macro_jit_x64.h
+++ b/src/video_core/macro/macro_jit_x64.h
@@ -0,0 +1,98 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <bitset>
+#include <xbyak.h>
+#include "common/bit_field.h"
+#include "common/common_types.h"
+#include "common/x64/xbyak_abi.h"
+#include "video_core/macro/macro.h"
+
+namespace Tegra {
+
+namespace Engines {
+class Maxwell3D;
+}
+
+/// MAX_CODE_SIZE is arbitrarily chosen based on current booting games
+constexpr size_t MAX_CODE_SIZE = 0x10000;
+
+class MacroJITx64 final : public MacroEngine { // Macro engine whose Compile() produces x64-JITted macros
+public:
+    explicit MacroJITx64(Engines::Maxwell3D& maxwell3d);
+
+protected:
+    std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override; // Returns a MacroJITx64Impl built from `code`
+
+private:
+    Engines::Maxwell3D& maxwell3d; // Passed on to every macro this engine compiles
+};
+
+class MacroJITx64Impl : public Xbyak::CodeGenerator, public CachedMacro { // One compiled macro: emits x64 code in its constructor and runs it from Execute()
+public:
+    MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code); // Compiles `code` immediately
+    ~MacroJITx64Impl();
+
+    void Execute(const std::vector<u32>& parameters, u32 method) override; // Runs the compiled program on a fresh JITState
+
+    void Compile_ALU(Macro::Opcode opcode); // The Compile_* members emit host code for one macro instruction each
+    void Compile_AddImmediate(Macro::Opcode opcode);
+    void Compile_ExtractInsert(Macro::Opcode opcode);
+    void Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode);
+    void Compile_ExtractShiftLeftRegister(Macro::Opcode opcode);
+    void Compile_Read(Macro::Opcode opcode);
+    void Compile_Branch(Macro::Opcode opcode);
+
+private:
+    void Optimizer_ScanFlags(); // Pre-scan that fills can_skip_carry and has_delayed_pc
+
+    void Compile(); // Emits the whole program and sets `program`
+    bool Compile_NextInstruction(); // Emits the instruction at `pc`
+
+    Xbyak::Reg32 Compile_FetchParameter(); // Emits a parameter load; the value ends up in eax
+    Xbyak::Reg32 Compile_GetRegister(u32 index, Xbyak::Reg32 dst); // Emits a macro register load into `dst`
+
+    void Compile_ProcessResult(Macro::ResultOperation operation, u32 reg);
+    void Compile_Send(Xbyak::Reg32 value);
+
+    Macro::Opcode GetOpCode() const; // Decodes code[pc]
+    std::bitset<32> PersistentCallerSavedRegs() const;
+
+    struct JITState { // Run-time state addressed by emitted code through fixed offsets
+        Engines::Maxwell3D* maxwell3d{};
+        std::array<u32, Macro::NUM_MACRO_REGISTERS> registers{};
+        u32 carry_flag{};
+    };
+    static_assert(offsetof(JITState, maxwell3d) == 0, "Maxwell3D is not at 0x0"); // Emitted code reads the pointer via qword[STATE]
+    using ProgramType = void (*)(JITState*, const u32*); // Signature of the emitted entry point: (state, parameters)
+
+    struct OptimizerState { // Flags set at the start of Compile() that steer code emission
+        bool can_skip_carry{}; // No instruction consumes the carry flag
+        bool has_delayed_pc{}; // At least one branch is not annulled
+        bool zero_reg_skip{};
+        bool skip_dummy_addimmediate{};
+        bool optimize_for_method_move{};
+        bool enable_asserts{};
+    };
+    OptimizerState optimizer{};
+
+    std::optional<Macro::Opcode> next_opcode{}; // Instruction after `pc`, if any; set by Compile()
+    ProgramType program{nullptr}; // Entry point of the emitted code; null until Compile() finishes
+
+    std::array<Xbyak::Label, MAX_CODE_SIZE> labels; // One label per macro word index. NOTE(review): indices are not checked against MAX_CODE_SIZE
+    std::array<Xbyak::Label, MAX_CODE_SIZE> delay_skip; // Per-instruction label bound after the pending-branch check
+    Xbyak::Label end_of_code{}; // Bound just before the epilogue
+
+    bool is_delay_slot{}; // NOTE(review): only read (in Compile_Branch), never assigned in the .cpp
+    u32 pc{}; // Word index of the instruction being emitted
+    std::optional<u32> delayed_pc; // NOTE(review): not referenced in the .cpp
+
+    const std::vector<u32>& code; // Reference to the caller's macro words; not copied
+    Engines::Maxwell3D& maxwell3d;
+};
+
+} // namespace Tegra
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -210,10 +210,11 @@ bool MemoryManager::IsBlockContinuous(const GPUVAddr start, const std::size_t si
    return range == inner_size;
 }

-void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::size_t size) const {
+void MemoryManager::ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer,
+                              const std::size_t size) const {
    std::size_t remaining_size{size};
-    std::size_t page_index{src_addr >> page_bits};
-    std::size_t page_offset{src_addr & page_mask};
+    std::size_t page_index{gpu_src_addr >> page_bits};
+    std::size_t page_offset{gpu_src_addr & page_mask};

    auto& memory = system.Memory();

@@ -234,11 +235,11 @@ void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::s
    }
 }

-void MemoryManager::ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer,
+void MemoryManager::ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer,
                                    const std::size_t size) const {
    std::size_t remaining_size{size};
-    std::size_t page_index{src_addr >> page_bits};
-    std::size_t page_offset{src_addr & page_mask};
+    std::size_t page_index{gpu_src_addr >> page_bits};
+    std::size_t page_offset{gpu_src_addr & page_mask};

    auto& memory = system.Memory();

@@ -259,10 +260,11 @@ void MemoryManager::ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer,
    }
 }

-void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const std::size_t size) {
+void MemoryManager::WriteBlock(GPUVAddr gpu_dest_addr, const void* src_buffer,
+                               const std::size_t size) {
    std::size_t remaining_size{size};
-    std::size_t page_index{dest_addr >> page_bits};
-    std::size_t page_offset{dest_addr & page_mask};
+    std::size_t page_index{gpu_dest_addr >> page_bits};
+    std::size_t page_offset{gpu_dest_addr & page_mask};

    auto& memory = system.Memory();

@@ -283,11 +285,11 @@ void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const
    }
 }

-void MemoryManager::WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer,
+void MemoryManager::WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer,
                                     const std::size_t size) {
    std::size_t remaining_size{size};
-    std::size_t page_index{dest_addr >> page_bits};
-    std::size_t page_offset{dest_addr & page_mask};
+    std::size_t page_index{gpu_dest_addr >> page_bits};
+    std::size_t page_offset{gpu_dest_addr & page_mask};

    auto& memory = system.Memory();

@@ -306,16 +308,18 @@ void MemoryManager::WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer,
    }
 }

-void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size) {
+/// Copies `size` bytes of GPU memory from gpu_src_addr to gpu_dest_addr.
+/// The whole source range is first read into a temporary host-side vector with ReadBlock and
+/// only afterwards written out with WriteBlock, so the write never starts before the read has
+/// completed.
+void MemoryManager::CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr,
+                              const std::size_t size) {
    std::vector<u8> tmp_buffer(size);
-    ReadBlock(src_addr, tmp_buffer.data(), size);
-    WriteBlock(dest_addr, tmp_buffer.data(), size);
+    ReadBlock(gpu_src_addr, tmp_buffer.data(), size);
+    WriteBlock(gpu_dest_addr, tmp_buffer.data(), size);
 }

-void MemoryManager::CopyBlockUnsafe(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size) {
+/// Copies `size` bytes of GPU memory from gpu_src_addr to gpu_dest_addr using the *Unsafe
+/// accessors: the source range is staged in a temporary host-side vector with ReadBlockUnsafe and
+/// then written out with WriteBlockUnsafe.
+void MemoryManager::CopyBlockUnsafe(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr,
+                                    const std::size_t size) {
    std::vector<u8> tmp_buffer(size);
-    ReadBlockUnsafe(src_addr, tmp_buffer.data(), size);
-    WriteBlockUnsafe(dest_addr, tmp_buffer.data(), size);
+    ReadBlockUnsafe(gpu_src_addr, tmp_buffer.data(), size);
+    WriteBlockUnsafe(gpu_dest_addr, tmp_buffer.data(), size);
 }

 bool MemoryManager::IsGranularRange(GPUVAddr gpu_addr, std::size_t size) {
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -79,9 +79,9 @@ public:
     * in the Host Memory counterpart. Note: This functions cause Host GPU Memory
     * Flushes and Invalidations, respectively to each operation.
     */
-    void ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const;
-    void WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std::size_t size);
-    void CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size);
+    void ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const;
+    void WriteBlock(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size);
+    void CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size);

    /**
     * ReadBlockUnsafe and WriteBlockUnsafe are special versions of ReadBlock and
@@ -93,9 +93,9 @@ public:
     * WriteBlockUnsafe instead of WriteBlock since it shouldn't invalidate the texture
     * being flushed.
     */
-    void ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const;
-    void WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer, std::size_t size);
-    void CopyBlockUnsafe(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size);
+    void ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const;
+    void WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size);
+    void CopyBlockUnsafe(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size);

    /**
     * IsGranularRange checks if a gpu region can be simply read with a pointer
--- a/src/video_core/query_cache.h
+++ b/src/video_core/query_cache.h
@@ -220,8 +220,8 @@ private:
            return cache_begin < addr_end && addr_begin < cache_end;
        };

-        const u64 page_end = addr_end >> PAGE_SHIFT;
-        for (u64 page = addr_begin >> PAGE_SHIFT; page <= page_end; ++page) {
+        const u64 page_end = addr_end >> PAGE_BITS;
+        for (u64 page = addr_begin >> PAGE_BITS; page <= page_end; ++page) {
            const auto& it = cached_queries.find(page);
            if (it == std::end(cached_queries)) {
                continue;
@@ -242,14 +242,14 @@ private:
    /// Registers the passed parameters as cached and returns a pointer to the stored cached query.
+    /// The rasterizer's cached-page count is incremented for the query's byte range, and the new
+    /// CachedQuery is appended to the bucket of the page that contains cpu_addr.
    CachedQuery* Register(VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr, bool timestamp) {
        rasterizer.UpdatePagesCachedCount(cpu_addr, CachedQuery::SizeInBytes(timestamp), 1);
-        const u64 page = static_cast<u64>(cpu_addr) >> PAGE_SHIFT;
+        // Bucket key: the address with its low PAGE_BITS bits shifted out.
+        const u64 page = static_cast<u64>(cpu_addr) >> PAGE_BITS;
        return &cached_queries[page].emplace_back(static_cast<QueryCache&>(*this), type, cpu_addr,
                                                  host_ptr);
    }

    /// Tries to a get a cached query. Returns nullptr on failure.
    CachedQuery* TryGet(VAddr addr) {
-        const u64 page = static_cast<u64>(addr) >> PAGE_SHIFT;
+        const u64 page = static_cast<u64>(addr) >> PAGE_BITS;
        const auto it = cached_queries.find(page);
        if (it == std::end(cached_queries)) {
            return nullptr;
@@ -268,7 +268,7 @@ private:
    }

    static constexpr std::uintptr_t PAGE_SIZE = 4096;
-    static constexpr unsigned PAGE_SHIFT = 12;
+    static constexpr unsigned PAGE_BITS = 12;

    Core::System& system;
    VideoCore::RasterizerInterface& rasterizer;
--- a/src/video_core/rasterizer_cache.cpp
+++ b/src/video_core/rasterizer_cache.cpp
@@ -1,7 +0,0 @@
-// Copyright 2018 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#include "video_core/rasterizer_cache.h"
-
-RasterizerCacheObject::~RasterizerCacheObject() = default;
--- a/src/video_core/rasterizer_cache.h
+++ b/src/video_core/rasterizer_cache.h
@@ -1,197 +0,0 @@
-// Copyright 2018 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-#include <mutex>
-#include <set>
-#include <unordered_map>
-
-#include <boost/icl/interval_map.hpp>
-#include <boost/range/iterator_range_core.hpp>
-
-#include "common/common_types.h"
-#include "core/settings.h"
-#include "video_core/gpu.h"
-#include "video_core/rasterizer_interface.h"
-
-class RasterizerCacheObject {
-public:
-    explicit RasterizerCacheObject(const VAddr cpu_addr) : cpu_addr{cpu_addr} {}
-
-    virtual ~RasterizerCacheObject();
-
-    VAddr GetCpuAddr() const {
-        return cpu_addr;
-    }
-
-    /// Gets the size of the shader in guest memory, required for cache management
-    virtual std::size_t GetSizeInBytes() const = 0;
-
-    /// Sets whether the cached object should be considered registered
-    void SetIsRegistered(bool registered) {
-        is_registered = registered;
-    }
-
-    /// Returns true if the cached object is registered
-    bool IsRegistered() const {
-        return is_registered;
-    }
-
-    /// Returns true if the cached object is dirty
-    bool IsDirty() const {
-        return is_dirty;
-    }
-
-    /// Returns ticks from when this cached object was last modified
-    u64 GetLastModifiedTicks() const {
-        return last_modified_ticks;
-    }
-
-    /// Marks an object as recently modified, used to specify whether it is clean or dirty
-    template <class T>
-    void MarkAsModified(bool dirty, T& cache) {
-        is_dirty = dirty;
-        last_modified_ticks = cache.GetModifiedTicks();
-    }
-
-private:
-    bool is_registered{};      ///< Whether the object is currently registered with the cache
-    bool is_dirty{};           ///< Whether the object is dirty (out of sync with guest memory)
-    u64 last_modified_ticks{}; ///< When the object was last modified, used for in-order flushing
-    VAddr cpu_addr{};          ///< Cpu address memory, unique from emulated virtual address space
-};
-
-template <class T>
-class RasterizerCache : NonCopyable {
-    friend class RasterizerCacheObject;
-
-public:
-    explicit RasterizerCache(VideoCore::RasterizerInterface& rasterizer) : rasterizer{rasterizer} {}
-
-    /// Write any cached resources overlapping the specified region back to memory
-    void FlushRegion(VAddr addr, std::size_t size) {
-        std::lock_guard lock{mutex};
-
-        const auto& objects{GetSortedObjectsFromRegion(addr, size)};
-        for (auto& object : objects) {
-            FlushObject(object);
-        }
-    }
-
-    /// Mark the specified region as being invalidated
-    void InvalidateRegion(VAddr addr, u64 size) {
-        std::lock_guard lock{mutex};
-
-        const auto& objects{GetSortedObjectsFromRegion(addr, size)};
-        for (auto& object : objects) {
-            if (!object->IsRegistered()) {
-                // Skip duplicates
-                continue;
-            }
-            Unregister(object);
-        }
-    }
-
-    /// Invalidates everything in the cache
-    void InvalidateAll() {
-        std::lock_guard lock{mutex};
-
-        while (interval_cache.begin() != interval_cache.end()) {
-            Unregister(*interval_cache.begin()->second.begin());
-        }
-    }
-
-protected:
-    /// Tries to get an object from the cache with the specified cache address
-    T TryGet(VAddr addr) const {
-        const auto iter = map_cache.find(addr);
-        if (iter != map_cache.end())
-            return iter->second;
-        return nullptr;
-    }
-
-    /// Register an object into the cache
-    virtual void Register(const T& object) {
-        std::lock_guard lock{mutex};
-
-        object->SetIsRegistered(true);
-        interval_cache.add({GetInterval(object), ObjectSet{object}});
-        map_cache.insert({object->GetCpuAddr(), object});
-        rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), 1);
-    }
-
-    /// Unregisters an object from the cache
-    virtual void Unregister(const T& object) {
-        std::lock_guard lock{mutex};
-
-        object->SetIsRegistered(false);
-        rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), -1);
-        const VAddr addr = object->GetCpuAddr();
-        interval_cache.subtract({GetInterval(object), ObjectSet{object}});
-        map_cache.erase(addr);
-    }
-
-    /// Returns a ticks counter used for tracking when cached objects were last modified
-    u64 GetModifiedTicks() {
-        std::lock_guard lock{mutex};
-
-        return ++modified_ticks;
-    }
-
-    virtual void FlushObjectInner(const T& object) = 0;
-
-    /// Flushes the specified object, updating appropriate cache state as needed
-    void FlushObject(const T& object) {
-        std::lock_guard lock{mutex};
-
-        if (!object->IsDirty()) {
-            return;
-        }
-        FlushObjectInner(object);
-        object->MarkAsModified(false, *this);
-    }
-
-    std::recursive_mutex mutex;
-
-private:
-    /// Returns a list of cached objects from the specified memory region, ordered by access time
-    std::vector<T> GetSortedObjectsFromRegion(VAddr addr, u64 size) {
-        if (size == 0) {
-            return {};
-        }
-
-        std::vector<T> objects;
-        const ObjectInterval interval{addr, addr + size};
-        for (auto& pair : boost::make_iterator_range(interval_cache.equal_range(interval))) {
-            for (auto& cached_object : pair.second) {
-                if (!cached_object) {
-                    continue;
-                }
-                objects.push_back(cached_object);
-            }
-        }
-
-        std::sort(objects.begin(), objects.end(), [](const T& a, const T& b) -> bool {
-            return a->GetLastModifiedTicks() < b->GetLastModifiedTicks();
-        });
-
-        return objects;
-    }
-
-    using ObjectSet = std::set<T>;
-    using ObjectCache = std::unordered_map<VAddr, T>;
-    using IntervalCache = boost::icl::interval_map<VAddr, ObjectSet>;
-    using ObjectInterval = typename IntervalCache::interval_type;
-
-    static auto GetInterval(const T& object) {
-        return ObjectInterval::right_open(object->GetCpuAddr(),
-                                          object->GetCpuAddr() + object->GetSizeInBytes());
-    }
-
-    ObjectCache map_cache;
-    IntervalCache interval_cache; ///< Cache of objects
-    u64 modified_ticks{};         ///< Counter of cache state ticks, used for in-order flushing
-    VideoCore::RasterizerInterface& rasterizer;
-};
--- a/src/video_core/renderer_opengl/gl_arb_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp
--- a/src/video_core/renderer_opengl/gl_arb_decompiler.h
+++ b/src/video_core/renderer_opengl/gl_arb_decompiler.h
@@ -0,0 +1,29 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <string>
+#include <string_view>
+
+#include "common/common_types.h"
+
+namespace Tegra::Engines {
+enum class ShaderType : u32;
+}
+
+namespace VideoCommon::Shader {
+class ShaderIR;
+class Registry;
+} // namespace VideoCommon::Shader
+
+namespace OpenGL {
+
+class Device;
+
+/// Produces a shader as a string from the given shader IR.
+/// NOTE(review): judging by the name and the gl_arb_decompiler file, the result is presumably
+/// assembly program text for the OpenGL backend; the implementation is not visible from this
+/// header - confirm there.
+/// @param device     OpenGL device description passed to the decompiler.
+/// @param ir         Shader intermediate representation to decompile.
+/// @param registry   Shader registry associated with the IR.
+/// @param stage      Shader stage being decompiled.
+/// @param identifier Caller-supplied identifier string for the shader.
+/// @return The decompiled shader as a std::string.
+std::string DecompileAssemblyShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
+                                    const VideoCommon::Shader::Registry& registry,
+                                    Tegra::Engines::ShaderType stage, std::string_view identifier);
+
+} // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -8,6 +8,7 @@

 #include "common/assert.h"
 #include "common/microprofile.h"
+#include "video_core/buffer_cache/buffer_cache.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_opengl/gl_buffer_cache.h"
@@ -21,22 +22,46 @@ using Maxwell = Tegra::Engines::Maxwell3D::Regs;

 MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128));

-CachedBufferBlock::CachedBufferBlock(VAddr cpu_addr, const std::size_t size)
+/// Creates the OpenGL buffer object backing this block and allocates `size` bytes of
+/// uninitialized storage for it (GL_DYNAMIC_DRAW).
+/// When the device reports vertex buffer unified memory support, the buffer is additionally made
+/// resident for read/write access and its GPU address is queried into gpu_address; otherwise
+/// gpu_address is not written here.
+Buffer::Buffer(const Device& device, VAddr cpu_addr, std::size_t size)
    : VideoCommon::BufferBlock{cpu_addr, size} {
    gl_buffer.Create();
    glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW);
+    if (device.HasVertexBufferUnifiedMemory()) {
+        glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_WRITE);
+        glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address);
+    }
 }

-CachedBufferBlock::~CachedBufferBlock() = default;
+Buffer::~Buffer() = default;
+
+/// Writes `size` bytes from `data` into this buffer, starting `offset` bytes from its beginning.
+/// @param offset Byte offset inside the GL buffer where the write starts.
+/// @param size   Number of bytes to write.
+/// @param data   Host pointer to the bytes to upload.
+void Buffer::Upload(std::size_t offset, std::size_t size, const u8* data) const {
+    glNamedBufferSubData(Handle(), static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size),
+                         data);
+}
+
+/// Reads `size` bytes of this buffer, starting at byte `offset`, back into host memory at `data`.
+/// The read is timed under the OpenGL_Buffer_Download microprofile scope.
+/// @param offset Byte offset inside the GL buffer where the read starts.
+/// @param size   Number of bytes to read.
+/// @param data   Host destination pointer receiving the bytes.
+void Buffer::Download(std::size_t offset, std::size_t size, u8* data) const {
+    MICROPROFILE_SCOPE(OpenGL_Buffer_Download);
+    // A buffer-update barrier is issued before the read back.
+    // NOTE(review): presumably so earlier GPU writes to the buffer are visible to the read.
+    glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
+    glGetNamedBufferSubData(Handle(), static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size),
+                            data);
+}
+
+/// Copies `size` bytes from `src` (starting at `src_offset`) into this buffer (starting at
+/// `dst_offset`) with a GL buffer-to-buffer copy. This buffer is the destination.
+/// @param src        Buffer to read from.
+/// @param src_offset Byte offset inside `src` where the copy starts.
+/// @param dst_offset Byte offset inside this buffer where the copy is written.
+/// @param size       Number of bytes to copy.
+void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
+                      std::size_t size) const {
+    glCopyNamedBufferSubData(src.Handle(), Handle(), static_cast<GLintptr>(src_offset),
+                             static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size));
+}

 OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
-                               const Device& device, std::size_t stream_size)
-    : GenericBufferCache{rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} {
+                               const Device& device_, std::size_t stream_size)
+    : GenericBufferCache{rasterizer, system,
+                         std::make_unique<OGLStreamBuffer>(device_, stream_size, true)},
+      device{device_} {
    if (!device.HasFastBufferSubData()) {
        return;
    }

-    static constexpr auto size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize);
+    static constexpr GLsizeiptr size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize);
    glCreateBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs));
    for (const GLuint cbuf : cbufs) {
        glNamedBufferData(cbuf, size, nullptr, GL_STREAM_DRAW);
@@ -47,44 +72,21 @@ OGLBufferCache::~OGLBufferCache() {
    glDeleteBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs));
 }

-Buffer OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
-    return std::make_shared<CachedBufferBlock>(cpu_addr, size);
+std::shared_ptr<Buffer> OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
+    return std::make_shared<Buffer>(device, cpu_addr, size);
 }

-GLuint OGLBufferCache::ToHandle(const Buffer& buffer) {
-    return buffer->GetHandle();
-}
-
-GLuint OGLBufferCache::GetEmptyBuffer(std::size_t) {
-    return 0;
-}
-
-void OGLBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                                     const u8* data) {
-    glNamedBufferSubData(buffer->GetHandle(), static_cast<GLintptr>(offset),
-                         static_cast<GLsizeiptr>(size), data);
-}
-
-void OGLBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                                       u8* data) {
-    MICROPROFILE_SCOPE(OpenGL_Buffer_Download);
-    glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
-    glGetNamedBufferSubData(buffer->GetHandle(), static_cast<GLintptr>(offset),
-                            static_cast<GLsizeiptr>(size), data);
-}
-
-void OGLBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
-                               std::size_t dst_offset, std::size_t size) {
-    glCopyNamedBufferSubData(src->GetHandle(), dst->GetHandle(), static_cast<GLintptr>(src_offset),
-                             static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size));
+OGLBufferCache::BufferInfo OGLBufferCache::GetEmptyBuffer(std::size_t) {
+    return {0, 0, 0};
 }

+/// Uploads `size` bytes from `raw_pointer` to offset 0 of the next entry of `cbufs` and advances
+/// `cbuf_cursor` by one.
+/// The cursor bound is only checked with DEBUG_ASSERT.
+/// @return A BufferInfo built from the GL buffer name followed by two zeros.
+///         NOTE(review): BufferInfo's field meanings are not visible here - confirm.
 OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer,
                                                             std::size_t size) {
    DEBUG_ASSERT(cbuf_cursor < std::size(cbufs));
-    const GLuint& cbuf = cbufs[cbuf_cursor++];
+    const GLuint cbuf = cbufs[cbuf_cursor++];
+
    glNamedBufferSubData(cbuf, 0, static_cast<GLsizeiptr>(size), raw_pointer);
-    return {cbuf, 0};
+    return {cbuf, 0, 0};
 }

 } // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -10,7 +10,6 @@
 #include "common/common_types.h"
 #include "video_core/buffer_cache/buffer_cache.h"
 #include "video_core/engines/maxwell_3d.h"
-#include "video_core/rasterizer_cache.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_stream_buffer.h"

@@ -24,57 +23,57 @@ class Device;
 class OGLStreamBuffer;
 class RasterizerOpenGL;

-class CachedBufferBlock;
-
-using Buffer = std::shared_ptr<CachedBufferBlock>;
-using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>;
-
-class CachedBufferBlock : public VideoCommon::BufferBlock {
+/// OpenGL buffer block: a VideoCommon::BufferBlock that owns an OpenGL buffer object and, when
+/// the device supports vertex buffer unified memory, records that buffer's GPU address.
+class Buffer : public VideoCommon::BufferBlock {
 public:
-    explicit CachedBufferBlock(VAddr cpu_addr, const std::size_t size);
-    ~CachedBufferBlock();
+    /// Creates the GL buffer and allocates `size` bytes of storage for it.
+    explicit Buffer(const Device& device, VAddr cpu_addr, std::size_t size);
+    ~Buffer();

-    GLuint GetHandle() const {
+    /// Writes `size` bytes from `data` into the buffer at byte `offset`.
+    void Upload(std::size_t offset, std::size_t size, const u8* data) const;
+
+    /// Reads `size` bytes starting at byte `offset` from the buffer into `data`.
+    void Download(std::size_t offset, std::size_t size, u8* data) const;
+
+    /// Copies `size` bytes from `src` at `src_offset` into this buffer at `dst_offset`.
+    void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
+                  std::size_t size) const;
+
+    /// Returns the OpenGL name of the underlying buffer object.
+    GLuint Handle() const noexcept {
        return gl_buffer.handle;
    }

+    /// Returns the GPU address recorded at construction; it stays 0 when the constructor did not
+    /// query one (device without vertex buffer unified memory).
+    u64 Address() const noexcept {
+        return gpu_address;
+    }
+
 private:
    OGLBuffer gl_buffer;
+    u64 gpu_address = 0;
 };

+using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>;
 class OGLBufferCache final : public GenericBufferCache {
 public:
    explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
                            const Device& device, std::size_t stream_size);
    ~OGLBufferCache();

-    GLuint GetEmptyBuffer(std::size_t) override;
+    BufferInfo GetEmptyBuffer(std::size_t) override;

    void Acquire() noexcept {
        cbuf_cursor = 0;
    }

 protected:
-    Buffer CreateBlock(VAddr cpu_addr, std::size_t size) override;
-
-    GLuint ToHandle(const Buffer& buffer) override;
-
-    void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                         const u8* data) override;
-
-    void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
-                           u8* data) override;
-
-    void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
-                   std::size_t dst_offset, std::size_t size) override;
+    std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override;

    BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) override;

 private:
+    static constexpr std::size_t NUM_CBUFS = Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
+                                             Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram;
+
+    const Device& device;
+
    std::size_t cbuf_cursor = 0;
-    std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
-                           Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram>
-        cbufs;
+    std::array<GLuint, NUM_CBUFS> cbufs{};
 };

 } // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -6,6 +6,7 @@
 #include <array>
 #include <cstddef>
 #include <cstring>
+#include <limits>
 #include <optional>
 #include <vector>

@@ -13,6 +14,7 @@

 #include "common/logging/log.h"
 #include "common/scope_exit.h"
+#include "core/settings.h"
 #include "video_core/renderer_opengl/gl_device.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"

@@ -25,24 +27,27 @@ constexpr u32 ReservedUniformBlocks = 1;

 constexpr u32 NumStages = 5;

-constexpr std::array LimitUBOs = {GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS,
-                                  GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS,
-                                  GL_MAX_GEOMETRY_UNIFORM_BLOCKS, GL_MAX_FRAGMENT_UNIFORM_BLOCKS};
+constexpr std::array LimitUBOs = {
+    GL_MAX_VERTEX_UNIFORM_BLOCKS,          GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS,
+    GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS, GL_MAX_GEOMETRY_UNIFORM_BLOCKS,
+    GL_MAX_FRAGMENT_UNIFORM_BLOCKS,        GL_MAX_COMPUTE_UNIFORM_BLOCKS};

 constexpr std::array LimitSSBOs = {
-    GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS,
+    GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS,          GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS,
    GL_MAX_TESS_EVALUATION_SHADER_STORAGE_BLOCKS, GL_MAX_GEOMETRY_SHADER_STORAGE_BLOCKS,
-    GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS};
+    GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS,        GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS};

-constexpr std::array LimitSamplers = {
-    GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS, GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS,
-    GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS, GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS,
-    GL_MAX_TEXTURE_IMAGE_UNITS};
+constexpr std::array LimitSamplers = {GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS,
+                                      GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS,
+                                      GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS,
+                                      GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS,
+                                      GL_MAX_TEXTURE_IMAGE_UNITS,
+                                      GL_MAX_COMPUTE_TEXTURE_IMAGE_UNITS};

-constexpr std::array LimitImages = {GL_MAX_VERTEX_IMAGE_UNIFORMS,
-                                    GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS,
-                                    GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS,
-                                    GL_MAX_GEOMETRY_IMAGE_UNIFORMS, GL_MAX_FRAGMENT_IMAGE_UNIFORMS};
+constexpr std::array LimitImages = {
+    GL_MAX_VERTEX_IMAGE_UNIFORMS,          GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS,
+    GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS, GL_MAX_GEOMETRY_IMAGE_UNIFORMS,
+    GL_MAX_FRAGMENT_IMAGE_UNIFORMS,        GL_MAX_COMPUTE_IMAGE_UNIFORMS};

 template <typename T>
 T GetInteger(GLenum pname) {
@@ -84,6 +89,13 @@ u32 Extract(u32& base, u32& num, u32 amount, std::optional<GLenum> limit = {}) {
    return std::exchange(base, base + amount);
 }

+/// Queries the uniform block limit of every entry in LimitUBOs, in table order, and returns the
+/// results as an array with one slot per shader type.
+/// NOTE(review): `max` is not value-initialized, so every slot must be written by the transform;
+/// this assumes LimitUBOs has exactly Tegra::Engines::MaxShaderTypes entries - confirm.
+std::array<u32, Tegra::Engines::MaxShaderTypes> BuildMaxUniformBuffers() noexcept {
+    std::array<u32, Tegra::Engines::MaxShaderTypes> max;
+    std::transform(LimitUBOs.begin(), LimitUBOs.end(), max.begin(),
+                   [](GLenum pname) { return GetInteger<u32>(pname); });
+    return max;
+}
+
 std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindings() noexcept {
    std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> bindings;

@@ -111,16 +123,24 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin
    u32 num_images = GetInteger<u32>(GL_MAX_IMAGE_UNITS);
    u32 base_images = 0;

-    // Reserve more image bindings on fragment and vertex stages.
+    // GL_MAX_IMAGE_UNITS is guaranteed by the spec to have a minimum value of 8.
+    // Due to the limitation of GL_MAX_IMAGE_UNITS, reserve at least 4 image bindings on the
+    // fragment stage, and at least 1 for the rest of the stages.
+    // So far games are observed to use 1 image binding on vertex and 4 on fragment stages.
+
+    // Reserve at least 4 image bindings on the fragment stage.
    bindings[4].image =
-        Extract(base_images, num_images, num_images / NumStages + 2, LimitImages[4]);
-    bindings[0].image =
-        Extract(base_images, num_images, num_images / NumStages + 1, LimitImages[0]);
+        Extract(base_images, num_images, std::max(4U, num_images / NumStages), LimitImages[4]);
+
+    // This is guaranteed to be at least 1.
+    const u32 total_extracted_images = num_images / (NumStages - 1);

    // Reserve the other image bindings.
-    const u32 total_extracted_images = num_images / (NumStages - 2);
-    for (std::size_t i = 2; i < NumStages; ++i) {
+    for (std::size_t i = 0; i < NumStages; ++i) {
        const std::size_t stage = stage_swizzle[i];
+        if (stage == 4) {
+            continue;
+        }
        bindings[stage].image =
            Extract(base_images, num_images, total_extracted_images, LimitImages[stage]);
    }
@@ -132,6 +152,7 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin
 }

 bool IsASTCSupported() {
+    static constexpr std::array targets = {GL_TEXTURE_2D, GL_TEXTURE_2D_ARRAY};
    static constexpr std::array formats = {
        GL_COMPRESSED_RGBA_ASTC_4x4_KHR,           GL_COMPRESSED_RGBA_ASTC_5x4_KHR,
        GL_COMPRESSED_RGBA_ASTC_5x5_KHR,           GL_COMPRESSED_RGBA_ASTC_6x5_KHR,
@@ -148,25 +169,59 @@ bool IsASTCSupported() {
        GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR,  GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR,
        GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x10_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR,
    };
-    return std::find_if_not(formats.begin(), formats.end(), [](GLenum format) {
-               GLint supported;
-               glGetInternalformativ(GL_TEXTURE_2D, format, GL_INTERNALFORMAT_SUPPORTED, 1,
-                                     &supported);
-               return supported == GL_TRUE;
-           }) == formats.end();
+    static constexpr std::array required_support = {
+        GL_VERTEX_TEXTURE,   GL_TESS_CONTROL_TEXTURE, GL_TESS_EVALUATION_TEXTURE,
+        GL_GEOMETRY_TEXTURE, GL_FRAGMENT_TEXTURE,     GL_COMPUTE_TEXTURE,
+    };
+
+    for (const GLenum target : targets) {
+        for (const GLenum format : formats) {
+            for (const GLenum support : required_support) {
+                GLint value;
+                glGetInternalformativ(target, format, support, 1, &value);
+                if (value != GL_FULL_SUPPORT) {
+                    return false;
+                }
+            }
+        }
+    }
+    return true;
+}
+
+/// @brief Returns true when a GL_RENDERER is a Turing GPU
+/// @param renderer GL_RENDERER string
+/// @return true when `renderer` contains any entry of the hard-coded model-name list below
+/// @note The check is a case-sensitive substring search, so variants whose renderer string
+///       embeds a listed name (anything containing "GTX 1660", for example) match as well.
+bool IsTuring(std::string_view renderer) {
+    static constexpr std::array<std::string_view, 12> TURING_GPUS = {
+        "GTX 1650",        "GTX 1660",        "RTX 2060",        "RTX 2070",
+        "RTX 2080",        "TITAN RTX",       "Quadro RTX 3000", "Quadro RTX 4000",
+        "Quadro RTX 5000", "Quadro RTX 6000", "Quadro RTX 8000", "Tesla T4",
+    };
+    return std::any_of(TURING_GPUS.begin(), TURING_GPUS.end(),
+                       [renderer](std::string_view candidate) {
+                           return renderer.find(candidate) != std::string_view::npos;
+                       });
+}

 } // Anonymous namespace

-Device::Device() : base_bindings{BuildBaseBindings()} {
+Device::Device()
+    : max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} {
    const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
-    const auto renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER));
+    const std::string_view renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER));
+    const std::string_view version = reinterpret_cast<const char*>(glGetString(GL_VERSION));
    const std::vector extensions = GetExtensions();

    const bool is_nvidia = vendor == "NVIDIA Corporation";
    const bool is_amd = vendor == "ATI Technologies Inc.";
-    const bool is_intel = vendor == "Intel";
-    const bool is_intel_proprietary = is_intel && std::strstr(renderer, "Mesa") == nullptr;
+    const bool is_turing = is_nvidia && IsTuring(renderer);
+
+    bool disable_fast_buffer_sub_data = false;
+    if (is_nvidia && version == "4.6.0 NVIDIA 443.24") {
+        LOG_WARNING(
+            Render_OpenGL,
+            "Beta driver 443.24 is known to have issues. There might be performance issues.");
+        disable_fast_buffer_sub_data = true;
+    }

    uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);
    shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
@@ -177,30 +232,46 @@ Device::Device() : base_bindings{BuildBaseBindings()} {
    has_shader_ballot = GLAD_GL_ARB_shader_ballot;
    has_vertex_viewport_layer = GLAD_GL_ARB_shader_viewport_layer_array;
    has_image_load_formatted = HasExtension(extensions, "GL_EXT_shader_image_load_formatted");
+    has_texture_shadow_lod = HasExtension(extensions, "GL_EXT_texture_shadow_lod");
    has_astc = IsASTCSupported();
    has_variable_aoffi = TestVariableAoffi();
    has_component_indexing_bug = is_amd;
    has_precise_bug = TestPreciseBug();
-    has_broken_compute = is_intel_proprietary;
-    has_fast_buffer_sub_data = is_nvidia;
+    has_nv_viewport_array2 = GLAD_GL_NV_viewport_array2;
+
+    // At the moment of writing this, only Nvidia's driver optimizes BufferSubData on exclusive
+    // uniform buffers as "push constants"
+    has_fast_buffer_sub_data = is_nvidia && !disable_fast_buffer_sub_data;
+
+    // Nvidia's driver on Turing GPUs randomly crashes when the buffer is made resident, or on
+    // DeleteBuffers. Disable unified memory on these devices.
+    has_vertex_buffer_unified_memory = GLAD_GL_NV_vertex_buffer_unified_memory && !is_turing;
+
+    use_assembly_shaders = Settings::values.use_assembly_shaders && GLAD_GL_NV_gpu_program5 &&
+                           GLAD_GL_NV_compute_program5 && GLAD_GL_NV_transform_feedback &&
+                           GLAD_GL_NV_transform_feedback2;

    LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi);
    LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug);
    LOG_INFO(Render_OpenGL, "Renderer_PreciseBug: {}", has_precise_bug);
+
+    if (Settings::values.use_assembly_shaders && !use_assembly_shaders) {
+        LOG_ERROR(Render_OpenGL, "Assembly shaders enabled but not supported");
+    }
 }

+/// Builds a Device from fixed values only; no OpenGL call is made in this constructor.
+/// Members that are not assigned below are left as they were initialized at declaration.
+/// NOTE(review): presumably meant for use without a live GL context - confirm with callers.
 Device::Device(std::nullptr_t) {
-    uniform_buffer_alignment = 0;
+    // Every stage gets the largest representable uniform buffer count.
+    max_uniform_buffers.fill(std::numeric_limits<u32>::max());
+    uniform_buffer_alignment = 4;
+    shader_storage_alignment = 4;
    max_vertex_attributes = 16;
    max_varyings = 15;
    has_warp_intrinsics = true;
    has_shader_ballot = true;
    has_vertex_viewport_layer = true;
    has_image_load_formatted = true;
+    has_texture_shadow_lod = true;
    has_variable_aoffi = true;
-    has_component_indexing_bug = false;
-    has_broken_compute = false;
-    has_precise_bug = false;
 }

 bool Device::TestVariableAoffi() {
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -24,6 +24,10 @@ public:
    explicit Device();
    explicit Device(std::nullptr_t);

+    u32 GetMaxUniformBuffers(Tegra::Engines::ShaderType shader_type) const noexcept {
+        return max_uniform_buffers[static_cast<std::size_t>(shader_type)];
+    }
+
    const BaseBindings& GetBaseBindings(std::size_t stage_index) const noexcept {
        return base_bindings[stage_index];
    }
@@ -64,6 +68,14 @@ public:
        return has_image_load_formatted;
    }

+    bool HasTextureShadowLod() const {
+        return has_texture_shadow_lod;
+    }
+
+    bool HasVertexBufferUnifiedMemory() const {
+        return has_vertex_buffer_unified_memory;
+    }
+
    bool HasASTC() const {
        return has_astc;
    }
@@ -80,19 +92,24 @@ public:
        return has_precise_bug;
    }

-    bool HasBrokenCompute() const {
-        return has_broken_compute;
-    }
-
    bool HasFastBufferSubData() const {
        return has_fast_buffer_sub_data;
    }

+    bool HasNvViewportArray2() const {
+        return has_nv_viewport_array2;
+    }
+
+    bool UseAssemblyShaders() const {
+        return use_assembly_shaders;
+    }
+
 private:
    static bool TestVariableAoffi();
    static bool TestPreciseBug();

-    std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings;
+    std::array<u32, Tegra::Engines::MaxShaderTypes> max_uniform_buffers{};
+    std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings{};
    std::size_t uniform_buffer_alignment{};
    std::size_t shader_storage_alignment{};
    u32 max_vertex_attributes{};
@@ -101,12 +118,15 @@ private:
    bool has_shader_ballot{};
    bool has_vertex_viewport_layer{};
    bool has_image_load_formatted{};
+    bool has_texture_shadow_lod{};
+    bool has_vertex_buffer_unified_memory{};
    bool has_astc{};
    bool has_variable_aoffi{};
    bool has_component_indexing_bug{};
    bool has_precise_bug{};
-    bool has_broken_compute{};
    bool has_fast_buffer_sub_data{};
+    bool has_nv_viewport_array2{};
+    bool use_assembly_shaders{};
 };

 } // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_fence_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_fence_manager.cpp
@@ -4,6 +4,7 @@

 #include "common/assert.h"

+#include "video_core/renderer_opengl/gl_buffer_cache.h"
 #include "video_core/renderer_opengl/gl_fence_manager.h"

 namespace OpenGL {
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -30,6 +30,7 @@
 #include "video_core/renderer_opengl/gl_shader_cache.h"
 #include "video_core/renderer_opengl/maxwell_to_gl.h"
 #include "video_core/renderer_opengl/renderer_opengl.h"
+#include "video_core/shader_cache.h"

 namespace OpenGL {

@@ -54,15 +55,34 @@ MICROPROFILE_DEFINE(OpenGL_PrimitiveAssembly, "OpenGL", "Prim Asmbl", MP_RGB(255

 namespace {

-constexpr std::size_t NumSupportedVertexAttributes = 16;
+constexpr std::size_t NUM_CONST_BUFFERS_PER_STAGE = 18;
+constexpr std::size_t NUM_CONST_BUFFERS_BYTES_PER_STAGE =
+    NUM_CONST_BUFFERS_PER_STAGE * Maxwell::MaxConstBufferSize;
+constexpr std::size_t TOTAL_CONST_BUFFER_BYTES =
+    NUM_CONST_BUFFERS_BYTES_PER_STAGE * Maxwell::MaxShaderStage;
+
+constexpr std::size_t NUM_SUPPORTED_VERTEX_ATTRIBUTES = 16;
+constexpr std::size_t NUM_SUPPORTED_VERTEX_BINDINGS = 16;

 template <typename Engine, typename Entry>
 Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry,
                                               ShaderType shader_type, std::size_t index = 0) {
-    if (entry.is_bindless) {
-        const auto tex_handle = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset);
-        return engine.GetTextureInfo(tex_handle);
+    if constexpr (std::is_same_v<Entry, SamplerEntry>) {
+        if (entry.is_separated) {
+            const u32 buffer_1 = entry.buffer;
+            const u32 buffer_2 = entry.secondary_buffer;
+            const u32 offset_1 = entry.offset;
+            const u32 offset_2 = entry.secondary_offset;
+            const u32 handle_1 = engine.AccessConstBuffer32(shader_type, buffer_1, offset_1);
+            const u32 handle_2 = engine.AccessConstBuffer32(shader_type, buffer_2, offset_2);
+            return engine.GetTextureInfo(handle_1 | handle_2);
+        }
    }
+    if (entry.is_bindless) {
+        const u32 handle = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset);
+        return engine.GetTextureInfo(handle);
+    }
+
    const auto& gpu_profile = engine.AccessGuestDriverProfile();
    const u32 offset = entry.offset + static_cast<u32>(index * gpu_profile.GetTextureHandlerSize());
    if constexpr (std::is_same_v<Engine, Tegra::Engines::Maxwell3D>) {
@@ -87,6 +107,34 @@ std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer,
    return buffer.size;
 }

+/// Translates a hardware transform feedback location into an attribute token and its index
+/// @param location Hardware location; it is divided by 4 to obtain the attribute index
+/// @return First and third arguments of the token stream entry; {GL_POSITION, 0} if unimplemented
+/// @note Read https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_transform_feedback3.txt
+std::pair<GLint, GLint> TransformFeedbackEnum(u8 location) {
+    const u8 index = location / 4;
+    if (index >= 8 && index <= 39) {
+        return {GL_GENERIC_ATTRIB_NV, index - 8};
+    }
+    if (index >= 48 && index <= 55) {
+        return {GL_TEXTURE_COORD_NV, index - 48};
+    }
+    switch (index) {
+    case 7:
+        return {GL_POSITION, 0};
+    case 40:
+        return {GL_PRIMARY_COLOR_NV, 0};
+    case 41:
+        return {GL_SECONDARY_COLOR_NV, 0};
+    case 42:
+        return {GL_BACK_PRIMARY_COLOR_NV, 0};
+    case 43:
+        return {GL_BACK_SECONDARY_COLOR_NV, 0};
+    }
+    UNIMPLEMENTED_MSG("index={}", static_cast<int>(index));
+    return {GL_POSITION, 0};
+}
+
 void oglEnable(GLenum cap, bool state) {
    (state ? glEnable : glDisable)(cap);
 }
@@ -94,17 +142,33 @@ void oglEnable(GLenum cap, bool state) {
 } // Anonymous namespace

 RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
-                                   ScreenInfo& info, GLShader::ProgramManager& program_manager,
-                                   StateTracker& state_tracker)
-    : RasterizerAccelerated{system.Memory()}, texture_cache{system, *this, device, state_tracker},
+                                   const Device& device, ScreenInfo& info,
+                                   ProgramManager& program_manager, StateTracker& state_tracker)
+    : RasterizerAccelerated{system.Memory()}, device{device}, texture_cache{system, *this, device,
+                                                                            state_tracker},
      shader_cache{*this, system, emu_window, device}, query_cache{system, *this},
      buffer_cache{*this, system, device, STREAM_BUFFER_SIZE},
      fence_manager{system, *this, texture_cache, buffer_cache, query_cache}, system{system},
      screen_info{info}, program_manager{program_manager}, state_tracker{state_tracker} {
    CheckExtensions();
+
+    unified_uniform_buffer.Create();
+    glNamedBufferStorage(unified_uniform_buffer.handle, TOTAL_CONST_BUFFER_BYTES, nullptr, 0);
+
+    if (device.UseAssemblyShaders()) {
+        glCreateBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data());
+        for (const GLuint cbuf : staging_cbufs) {
+            glNamedBufferStorage(cbuf, static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize),
+                                 nullptr, 0);
+        }
+    }
 }

-RasterizerOpenGL::~RasterizerOpenGL() {}
+RasterizerOpenGL::~RasterizerOpenGL() {
+    if (device.UseAssemblyShaders()) {
+        glDeleteBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data());
+    }
+}

 void RasterizerOpenGL::CheckExtensions() {
    if (!GLAD_GL_ARB_texture_filter_anisotropic && !GLAD_GL_EXT_texture_filter_anisotropic) {
@@ -130,7 +194,7 @@ void RasterizerOpenGL::SetupVertexFormat() {
    // avoid OpenGL errors.
    // TODO(Subv): Analyze the shader to identify which attributes are actually used and don't
    // assume every shader uses them all.
-    for (std::size_t index = 0; index < NumSupportedVertexAttributes; ++index) {
+    for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_ATTRIBUTES; ++index) {
        if (!flags[Dirty::VertexFormat0 + index]) {
            continue;
        }
@@ -168,9 +232,11 @@ void RasterizerOpenGL::SetupVertexBuffer() {

    MICROPROFILE_SCOPE(OpenGL_VB);

+    const bool use_unified_memory = device.HasVertexBufferUnifiedMemory();
+
    // Upload all guest vertex arrays sequentially to our buffer
    const auto& regs = gpu.regs;
-    for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) {
+    for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_BINDINGS; ++index) {
        if (!flags[Dirty::VertexBuffer0 + index]) {
            continue;
        }
@@ -183,16 +249,25 @@ void RasterizerOpenGL::SetupVertexBuffer() {

        const GPUVAddr start = vertex_array.StartAddress();
        const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress();
-
        ASSERT(end >= start);
+
+        const GLuint gl_index = static_cast<GLuint>(index);
        const u64 size = end - start;
        if (size == 0) {
-            glBindVertexBuffer(static_cast<GLuint>(index), 0, 0, vertex_array.stride);
+            glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride);
+            if (use_unified_memory) {
+                glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index, 0, 0);
+            }
            continue;
        }
-        const auto [vertex_buffer, vertex_buffer_offset] = buffer_cache.UploadMemory(start, size);
-        glBindVertexBuffer(static_cast<GLuint>(index), vertex_buffer, vertex_buffer_offset,
-                           vertex_array.stride);
+        const auto info = buffer_cache.UploadMemory(start, size);
+        if (use_unified_memory) {
+            glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride);
+            glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index,
+                                   info.address + info.offset, size);
+        } else {
+            glBindVertexBuffer(gl_index, info.handle, info.offset, vertex_array.stride);
+        }
    }
 }

@@ -205,7 +280,7 @@ void RasterizerOpenGL::SetupVertexInstances() {
    flags[Dirty::VertexInstances] = false;

    const auto& regs = gpu.regs;
-    for (std::size_t index = 0; index < NumSupportedVertexAttributes; ++index) {
+    for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_ATTRIBUTES; ++index) {
        if (!flags[Dirty::VertexInstance0 + index]) {
            continue;
        }
@@ -222,14 +297,15 @@ GLintptr RasterizerOpenGL::SetupIndexBuffer() {
    MICROPROFILE_SCOPE(OpenGL_Index);
    const auto& regs = system.GPU().Maxwell3D().regs;
    const std::size_t size = CalculateIndexBufferSize();
-    const auto [buffer, offset] = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size);
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, buffer);
-    return offset;
+    const auto info = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size);
+    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, info.handle);
+    return info.offset;
 }

 void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
    MICROPROFILE_SCOPE(OpenGL_Shader);
    auto& gpu = system.GPU().Maxwell3D();
+    std::size_t num_ssbos = 0;
    u32 clip_distances = 0;

    for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {
@@ -259,7 +335,15 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
            continue;
        }

-        Shader shader{shader_cache.GetStageProgram(program)};
+        Shader* const shader = shader_cache.GetStageProgram(program);
+
+        if (device.UseAssemblyShaders()) {
+            // Check for ARB limitation. We only have 16 SSBOs per context state. To work around
+            // this, all stages share the same bindings.
+            const std::size_t num_stage_ssbos = shader->GetEntries().global_memory_entries.size();
+            ASSERT_MSG(num_stage_ssbos == 0 || num_ssbos == 0, "SSBOs on more than one stage");
+            num_ssbos += num_stage_ssbos;
+        }

        // Stage indices are 0 - 5
        const std::size_t stage = index == 0 ? 0 : index - 1;
@@ -526,6 +610,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
    SyncFramebufferSRGB();

    buffer_cache.Acquire();
+    current_cbuf = 0;

    std::size_t buffer_size = CalculateVertexArraysSize();

@@ -535,16 +620,25 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
    }

    // Uniform space for the 5 shader stages
-    buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) +
-                  (sizeof(GLShader::MaxwellUniformData) + device.GetUniformBufferAlignment()) *
-                      Maxwell::MaxShaderStage;
+    buffer_size =
+        Common::AlignUp<std::size_t>(buffer_size, 4) +
+        (sizeof(MaxwellUniformData) + device.GetUniformBufferAlignment()) * Maxwell::MaxShaderStage;

    // Add space for at least 18 constant buffers
    buffer_size += Maxwell::MaxConstBuffers *
                   (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());

    // Prepare the vertex array.
-    buffer_cache.Map(buffer_size);
+    const bool invalidated = buffer_cache.Map(buffer_size);
+
+    if (invalidated) {
+        // When the stream buffer has been invalidated, we have to consider vertex buffers as dirty
+        auto& dirty = gpu.dirty.flags;
+        dirty[Dirty::VertexBuffers] = true;
+        for (int index = Dirty::VertexBuffer0; index <= Dirty::VertexBuffer31; ++index) {
+            dirty[index] = true;
+        }
+    }

    // Prepare vertex array format.
    SetupVertexFormat();
@@ -558,12 +652,14 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
    }

    // Setup emulation uniform buffer.
-    GLShader::MaxwellUniformData ubo;
-    ubo.SetFromRegs(gpu);
-    const auto [buffer, offset] =
-        buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment());
-    glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, buffer, offset,
-                      static_cast<GLsizeiptr>(sizeof(ubo)));
+    if (!device.UseAssemblyShaders()) {
+        MaxwellUniformData ubo;
+        ubo.SetFromRegs(gpu);
+        const auto info =
+            buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment());
+        glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, info.handle, info.offset,
+                          static_cast<GLsizeiptr>(sizeof(ubo)));
+    }

    // Setup shaders and their used resources.
    texture_cache.GuardSamplers(true);
@@ -630,16 +726,12 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
 }

 void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
-    if (device.HasBrokenCompute()) {
-        return;
-    }
-
    buffer_cache.Acquire();
+    current_cbuf = 0;

    auto kernel = shader_cache.GetComputeKernel(code_addr);
    SetupComputeTextures(kernel);
    SetupComputeImages(kernel);
-    program_manager.BindComputeShader(kernel->GetHandle());

    const std::size_t buffer_size =
        Tegra::Engines::KeplerCompute::NumConstBuffers *
@@ -652,6 +744,7 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
    buffer_cache.Unmap();

    const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
+    program_manager.BindCompute(kernel->GetHandle());
    glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z);
    ++num_queued_commands;
 }
@@ -701,15 +794,15 @@ void RasterizerOpenGL::OnCPUWrite(VAddr addr, u64 size) {
        return;
    }
    texture_cache.OnCPUWrite(addr, size);
-    shader_cache.InvalidateRegion(addr, size);
+    shader_cache.OnCPUWrite(addr, size);
    buffer_cache.OnCPUWrite(addr, size);
-    query_cache.InvalidateRegion(addr, size);
 }

 void RasterizerOpenGL::SyncGuestHost() {
    MICROPROFILE_SCOPE(OpenGL_CacheManagement);
    texture_cache.SyncGuestHost();
    buffer_cache.SyncGuestHost();
+    shader_cache.SyncGuestHost();
 }

 void RasterizerOpenGL::SignalSemaphore(GPUVAddr addr, u32 value) {
@@ -811,40 +904,72 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
    return true;
 }

-void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader) {
+void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, Shader* shader) {
+    static constexpr std::array PARAMETER_LUT = {
+        GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV,
+        GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV,
+        GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV};
+
    MICROPROFILE_SCOPE(OpenGL_UBO);
    const auto& stages = system.GPU().Maxwell3D().state.shader_stages;
    const auto& shader_stage = stages[stage_index];
+    const auto& entries = shader->GetEntries();
+    const bool use_unified = entries.use_unified_uniforms;
+    const std::size_t base_unified_offset = stage_index * NUM_CONST_BUFFERS_BYTES_PER_STAGE;

-    u32 binding = device.GetBaseBindings(stage_index).uniform_buffer;
-    for (const auto& entry : shader->GetEntries().const_buffers) {
-        const auto& buffer = shader_stage.const_buffers[entry.GetIndex()];
-        SetupConstBuffer(binding++, buffer, entry);
+    const auto base_bindings = device.GetBaseBindings(stage_index);
+    u32 binding = device.UseAssemblyShaders() ? 0 : base_bindings.uniform_buffer;
+    for (const auto& entry : entries.const_buffers) {
+        const u32 index = entry.GetIndex();
+        const auto& buffer = shader_stage.const_buffers[index];
+        SetupConstBuffer(PARAMETER_LUT[stage_index], binding, buffer, entry, use_unified,
+                         base_unified_offset + index * Maxwell::MaxConstBufferSize);
+        ++binding;
+    }
+    if (use_unified) {
+        const u32 index = static_cast<u32>(base_bindings.shader_storage_buffer +
+                                           entries.global_memory_entries.size());
+        glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle,
+                          base_unified_offset, NUM_CONST_BUFFERS_BYTES_PER_STAGE);
    }
 }

-void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) {
+void RasterizerOpenGL::SetupComputeConstBuffers(Shader* kernel) {
    MICROPROFILE_SCOPE(OpenGL_UBO);
    const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
+    const auto& entries = kernel->GetEntries();
+    const bool use_unified = entries.use_unified_uniforms;

    u32 binding = 0;
-    for (const auto& entry : kernel->GetEntries().const_buffers) {
+    for (const auto& entry : entries.const_buffers) {
        const auto& config = launch_desc.const_buffer_config[entry.GetIndex()];
        const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value();
        Tegra::Engines::ConstBufferInfo buffer;
        buffer.address = config.Address();
        buffer.size = config.size;
        buffer.enabled = mask[entry.GetIndex()];
-        SetupConstBuffer(binding++, buffer, entry);
+        SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding, buffer, entry,
+                         use_unified, entry.GetIndex() * Maxwell::MaxConstBufferSize);
+        ++binding;
+    }
+    if (use_unified) {
+        const GLuint index = static_cast<GLuint>(entries.global_memory_entries.size());
+        glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle, 0,
+                          NUM_CONST_BUFFERS_BYTES_PER_STAGE);
    }
 }

-void RasterizerOpenGL::SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
-                                        const ConstBufferEntry& entry) {
+void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
+                                        const Tegra::Engines::ConstBufferInfo& buffer,
+                                        const ConstBufferEntry& entry, bool use_unified,
+                                        std::size_t unified_offset) {
    if (!buffer.enabled) {
        // Set values to zero to unbind buffers
-        glBindBufferRange(GL_UNIFORM_BUFFER, binding, buffer_cache.GetEmptyBuffer(sizeof(float)), 0,
-                          sizeof(float));
+        if (device.UseAssemblyShaders()) {
+            glBindBufferRangeNV(stage, entry.GetIndex(), 0, 0, 0);
+        } else {
+            glBindBufferRange(GL_UNIFORM_BUFFER, binding, 0, 0, sizeof(float));
+        }
        return;
    }

@@ -852,18 +977,39 @@ void RasterizerOpenGL::SetupConstBuffer(u32 binding, const Tegra::Engines::Const
    // UBO alignment requirements.
    const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4));

-    const auto alignment = device.GetUniformBufferAlignment();
-    const auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment, false,
-                                                          device.HasFastBufferSubData());
-    glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size);
+    const bool fast_upload = !use_unified && device.HasFastBufferSubData();
+
+    const std::size_t alignment = use_unified ? 4 : device.GetUniformBufferAlignment();
+    const GPUVAddr gpu_addr = buffer.address;
+    auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload);
+
+    if (device.UseAssemblyShaders()) {
+        UNIMPLEMENTED_IF(use_unified);
+        if (info.offset != 0) {
+            const GLuint staging_cbuf = staging_cbufs[current_cbuf++];
+            glCopyNamedBufferSubData(info.handle, staging_cbuf, info.offset, 0, size);
+            info.handle = staging_cbuf;
+            info.offset = 0;
+        }
+        glBindBufferRangeNV(stage, binding, info.handle, info.offset, size);
+        return;
+    }
+
+    if (use_unified) {
+        glCopyNamedBufferSubData(info.handle, unified_uniform_buffer.handle, info.offset,
+                                 unified_offset, size);
+    } else {
+        glBindBufferRange(GL_UNIFORM_BUFFER, binding, info.handle, info.offset, size);
+    }
 }

-void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader) {
+void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader) {
    auto& gpu{system.GPU()};
    auto& memory_manager{gpu.MemoryManager()};
    const auto cbufs{gpu.Maxwell3D().state.shader_stages[stage_index]};

-    u32 binding = device.GetBaseBindings(stage_index).shader_storage_buffer;
+    u32 binding =
+        device.UseAssemblyShaders() ? 0 : device.GetBaseBindings(stage_index).shader_storage_buffer;
    for (const auto& entry : shader->GetEntries().global_memory_entries) {
        const GPUVAddr addr{cbufs.const_buffers[entry.cbuf_index].address + entry.cbuf_offset};
        const GPUVAddr gpu_addr{memory_manager.Read<u64>(addr)};
@@ -872,7 +1018,7 @@ void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shad
    }
 }

-void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) {
+void RasterizerOpenGL::SetupComputeGlobalMemory(Shader* kernel) {
    auto& gpu{system.GPU()};
    auto& memory_manager{gpu.MemoryManager()};
    const auto cbufs{gpu.KeplerCompute().launch_description.const_buffer_config};
@@ -889,13 +1035,12 @@ void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) {
 void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry,
                                         GPUVAddr gpu_addr, std::size_t size) {
    const auto alignment{device.GetShaderStorageBufferAlignment()};
-    const auto [ssbo, buffer_offset] =
-        buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written);
-    glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, ssbo, buffer_offset,
+    const auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written);
+    glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, info.handle, info.offset,
                      static_cast<GLsizeiptr>(size));
 }

-void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, const Shader& shader) {
+void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, Shader* shader) {
    MICROPROFILE_SCOPE(OpenGL_Texture);
    const auto& maxwell3d = system.GPU().Maxwell3D();
    u32 binding = device.GetBaseBindings(stage_index).sampler;
@@ -908,7 +1053,7 @@ void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, const Shader&
    }
 }

-void RasterizerOpenGL::SetupComputeTextures(const Shader& kernel) {
+void RasterizerOpenGL::SetupComputeTextures(Shader* kernel) {
    MICROPROFILE_SCOPE(OpenGL_Texture);
    const auto& compute = system.GPU().KeplerCompute();
    u32 binding = 0;
@@ -929,19 +1074,15 @@ void RasterizerOpenGL::SetupTexture(u32 binding, const Tegra::Texture::FullTextu
        glBindTextureUnit(binding, 0);
        return;
    }
-    glBindTextureUnit(binding, view->GetTexture());
-
-    if (view->GetSurfaceParams().IsBuffer()) {
-        return;
+    const GLuint handle = view->GetTexture(texture.tic.x_source, texture.tic.y_source,
+                                           texture.tic.z_source, texture.tic.w_source);
+    glBindTextureUnit(binding, handle);
+    if (!view->GetSurfaceParams().IsBuffer()) {
+        glBindSampler(binding, sampler_cache.GetSampler(texture.tsc));
    }
-    // Apply swizzle to textures that are not buffers.
-    view->ApplySwizzle(texture.tic.x_source, texture.tic.y_source, texture.tic.z_source,
-                       texture.tic.w_source);
-
-    glBindSampler(binding, sampler_cache.GetSampler(texture.tsc));
 }

-void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& shader) {
+void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, Shader* shader) {
    const auto& maxwell3d = system.GPU().Maxwell3D();
    u32 binding = device.GetBaseBindings(stage_index).image;
    for (const auto& entry : shader->GetEntries().images) {
@@ -951,7 +1092,7 @@ void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, const Shader& sh
    }
 }

-void RasterizerOpenGL::SetupComputeImages(const Shader& shader) {
+void RasterizerOpenGL::SetupComputeImages(Shader* shader) {
    const auto& compute = system.GPU().KeplerCompute();
    u32 binding = 0;
    for (const auto& entry : shader->GetEntries().images) {
@@ -967,14 +1108,11 @@ void RasterizerOpenGL::SetupImage(u32 binding, const Tegra::Texture::TICEntry& t
        glBindImageTexture(binding, 0, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8);
        return;
    }
-    if (!tic.IsBuffer()) {
-        view->ApplySwizzle(tic.x_source, tic.y_source, tic.z_source, tic.w_source);
-    }
    if (entry.is_written) {
        view->MarkAsModified(texture_cache.Tick());
    }
-    glBindImageTexture(binding, view->GetTexture(), 0, GL_TRUE, 0, GL_READ_WRITE,
-                       view->GetFormat());
+    const GLuint handle = view->GetTexture(tic.x_source, tic.y_source, tic.z_source, tic.w_source);
+    glBindImageTexture(binding, handle, 0, GL_TRUE, 0, GL_READ_WRITE, view->GetFormat());
 }

 void RasterizerOpenGL::SyncViewport() {
@@ -983,6 +1121,26 @@ void RasterizerOpenGL::SyncViewport() {
    const auto& regs = gpu.regs;

    const bool dirty_viewport = flags[Dirty::Viewports];
+    const bool dirty_clip_control = flags[Dirty::ClipControl];
+
+    if (dirty_clip_control || flags[Dirty::FrontFace]) {
+        flags[Dirty::FrontFace] = false;
+
+        GLenum mode = MaxwellToGL::FrontFace(regs.front_face);
+        if (regs.screen_y_control.triangle_rast_flip != 0 &&
+            regs.viewport_transform[0].scale_y < 0.0f) {
+            switch (mode) {
+            case GL_CW:
+                mode = GL_CCW;
+                break;
+            case GL_CCW:
+                mode = GL_CW;
+                break;
+            }
+        }
+        glFrontFace(mode);
+    }
+
    if (dirty_viewport || flags[Dirty::ClipControl]) {
        flags[Dirty::ClipControl] = false;

@@ -1080,11 +1238,6 @@ void RasterizerOpenGL::SyncCullMode() {
            glDisable(GL_CULL_FACE);
        }
    }
-
-    if (flags[Dirty::FrontFace]) {
-        flags[Dirty::FrontFace] = false;
-        glFrontFace(MaxwellToGL::FrontFace(regs.front_face));
-    }
 }

 void RasterizerOpenGL::SyncPrimitiveRestart() {
@@ -1455,12 +1608,70 @@ void RasterizerOpenGL::SyncFramebufferSRGB() {
    oglEnable(GL_FRAMEBUFFER_SRGB, gpu.regs.framebuffer_srgb);
 }

+void RasterizerOpenGL::SyncTransformFeedback() {
+    // Translates the guest transform feedback layouts into an NV attribute token stream.
+    // TODO(Rodrigo): Inject SKIP_COMPONENTS*_NV when required (an unimplemented message signals it).
+    const auto& regs = system.GPU().Maxwell3D().regs;
+
+    static constexpr std::size_t STRIDE = 3;
+    std::array<GLint, 128 * STRIDE * Maxwell::NumTransformFeedbackBuffers> attribs;
+    std::array<GLint, Maxwell::NumTransformFeedbackBuffers> streams;
+
+    GLint* cursor = attribs.data();
+    GLint* current_stream = streams.data();
+
+    for (std::size_t feedback = 0; feedback < Maxwell::NumTransformFeedbackBuffers; ++feedback) {
+        const auto& layout = regs.tfb_layouts[feedback];
+        UNIMPLEMENTED_IF_MSG(layout.stride != layout.varying_count * 4, "Stride padding");
+        if (layout.varying_count == 0) {
+            continue;
+        }
+
+        *current_stream = static_cast<GLint>(feedback);
+        if (current_stream != streams.data()) {
+            // Every stream after the first one is introduced by a GL_NEXT_BUFFER_NV token
+            cursor[0] = GL_NEXT_BUFFER_NV;
+            cursor[1] = 0;
+            cursor[2] = 0;
+            cursor += STRIDE;
+        }
+        ++current_stream;
+
+        const auto& locations = regs.tfb_varying_locs[feedback];
+        std::optional<u8> current_index;
+        for (u32 offset = 0; offset < layout.varying_count; ++offset) {
+            const u8 location = locations[offset];
+            const u8 index = location / 4;
+
+            if (current_index == index) {
+                // Same attribute as the previous varying: grow the last entry's component count
+                ++cursor[-2];
+                continue;
+            }
+            current_index = index;
+
+            std::tie(cursor[0], cursor[2]) = TransformFeedbackEnum(location);
+            cursor[1] = 1;
+            cursor += STRIDE;
+        }
+    }
+
+    const GLsizei num_attribs = static_cast<GLsizei>((cursor - attribs.data()) / STRIDE);
+    const GLsizei num_strides = static_cast<GLsizei>(current_stream - streams.data());
+    glTransformFeedbackStreamAttribsNV(num_attribs, attribs.data(), num_strides, streams.data(),
+                                       GL_INTERLEAVED_ATTRIBS);
+}
+
 void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) {
    const auto& regs = system.GPU().Maxwell3D().regs;
    if (regs.tfb_enabled == 0) {
        return;
    }

+    if (device.UseAssemblyShaders()) {
+        SyncTransformFeedback();
+    }
+
    UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) ||
                     regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) ||
                     regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry));
@@ -1487,6 +1698,10 @@ void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) {
                          static_cast<GLsizeiptr>(size));
    }

+    // We may have to call BeginTransformFeedbackNV here: on Nvidia's driver it seems to be a
+    // different implementation (the function pointer differs). We use ARB_transform_feedback3
+    // features through their NV_transform_feedback interactions, and the ARB extension does not
+    // define interactions for BeginTransformFeedback (without NV). In practice it just works.
    glBeginTransformFeedback(GL_POINTS);
 }

@@ -1508,8 +1723,9 @@ void RasterizerOpenGL::EndTransformFeedback() {
        const GLuint handle = transform_feedback_buffers[index].handle;
        const GPUVAddr gpu_addr = binding.Address();
        const std::size_t size = binding.buffer_size;
-        const auto [dest_buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
-        glCopyNamedBufferSubData(handle, dest_buffer, 0, offset, static_cast<GLsizeiptr>(size));
+        const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
+        glCopyNamedBufferSubData(handle, info.handle, 0, info.offset,
+                                 static_cast<GLsizeiptr>(size));
    }
 }

--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -19,7 +19,6 @@
 #include "video_core/engines/const_buffer_info.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/rasterizer_accelerated.h"
-#include "video_core/rasterizer_cache.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_opengl/gl_buffer_cache.h"
 #include "video_core/renderer_opengl/gl_device.h"
@@ -56,8 +55,8 @@ struct DrawParameters;
 class RasterizerOpenGL : public VideoCore::RasterizerAccelerated {
 public:
    explicit RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
-                              ScreenInfo& info, GLShader::ProgramManager& program_manager,
-                              StateTracker& state_tracker);
+                              const Device& device, ScreenInfo& info,
+                              ProgramManager& program_manager, StateTracker& state_tracker);
    ~RasterizerOpenGL() override;

    void Draw(bool is_indexed, bool is_instanced) override;
@@ -100,40 +99,41 @@ private:
    void ConfigureClearFramebuffer(bool using_color, bool using_depth_stencil);

    /// Configures the current constbuffers to use for the draw command.
-    void SetupDrawConstBuffers(std::size_t stage_index, const Shader& shader);
+    void SetupDrawConstBuffers(std::size_t stage_index, Shader* shader);

    /// Configures the current constbuffers to use for the kernel invocation.
-    void SetupComputeConstBuffers(const Shader& kernel);
+    void SetupComputeConstBuffers(Shader* kernel);

    /// Configures a constant buffer.
-    void SetupConstBuffer(u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
-                          const ConstBufferEntry& entry);
+    void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
+                          const ConstBufferEntry& entry, bool use_unified,
+                          std::size_t unified_offset);

    /// Configures the current global memory entries to use for the draw command.
-    void SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader);
+    void SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader);

    /// Configures the current global memory entries to use for the kernel invocation.
-    void SetupComputeGlobalMemory(const Shader& kernel);
+    void SetupComputeGlobalMemory(Shader* kernel);

    /// Configures a constant buffer.
    void SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr,
                           std::size_t size);

    /// Configures the current textures to use for the draw command.
-    void SetupDrawTextures(std::size_t stage_index, const Shader& shader);
+    void SetupDrawTextures(std::size_t stage_index, Shader* shader);

    /// Configures the textures used in a compute shader.
-    void SetupComputeTextures(const Shader& kernel);
+    void SetupComputeTextures(Shader* kernel);

    /// Configures a texture.
    void SetupTexture(u32 binding, const Tegra::Texture::FullTextureInfo& texture,
                      const SamplerEntry& entry);

    /// Configures images in a graphics shader.
-    void SetupDrawImages(std::size_t stage_index, const Shader& shader);
+    void SetupDrawImages(std::size_t stage_index, Shader* shader);

    /// Configures images in a compute shader.
-    void SetupComputeImages(const Shader& shader);
+    void SetupComputeImages(Shader* shader);

    /// Configures an image.
    void SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic, const ImageEntry& entry);
@@ -201,6 +201,10 @@ private:
    /// Syncs the framebuffer sRGB state to match the guest state
    void SyncFramebufferSRGB();

+    /// Syncs transform feedback state to match guest state
+    /// @note Only valid on assembly shaders
+    void SyncTransformFeedback();
+
    /// Begin a transform feedback
    void BeginTransformFeedback(GLenum primitive_mode);

@@ -224,7 +228,7 @@ private:

    void SetupShaders(GLenum primitive_mode);

-    const Device device;
+    const Device& device;

    TextureCacheOpenGL texture_cache;
    ShaderCacheOpenGL shader_cache;
@@ -236,7 +240,7 @@ private:

    Core::System& system;
    ScreenInfo& screen_info;
-    GLShader::ProgramManager& program_manager;
+    ProgramManager& program_manager;
    StateTracker& state_tracker;

    static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024;
@@ -248,6 +252,13 @@ private:
    std::bitset<Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers>
        enabled_transform_feedback_buffers;

+    static constexpr std::size_t NUM_CONSTANT_BUFFERS =
+        Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
+        Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram;
+    std::array<GLuint, NUM_CONSTANT_BUFFERS> staging_cbufs{};
+    std::size_t current_cbuf = 0;
+    OGLBuffer unified_uniform_buffer;
+
    /// Number of commands queued to the OpenGL driver. Reseted on flush.
    std::size_t num_queued_commands = 0;

--- a/src/video_core/renderer_opengl/gl_resource_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp
@@ -125,6 +125,15 @@ void OGLProgram::Release() {
    handle = 0;
 }

+void OGLAssemblyProgram::Release() {
+    if (handle == 0) {
+        return;
+    }
+    MICROPROFILE_SCOPE(OpenGL_ResourceDeletion);
+    glDeleteProgramsARB(1, &handle);
+    handle = 0;
+}
+
 void OGLPipeline::Create() {
    if (handle != 0)
        return;
--- a/src/video_core/renderer_opengl/gl_resource_manager.h
+++ b/src/video_core/renderer_opengl/gl_resource_manager.h
@@ -167,6 +167,22 @@ public:
    GLuint handle = 0;
 };

+class OGLAssemblyProgram : private NonCopyable {
+public:
+    OGLAssemblyProgram() = default;
+
+    OGLAssemblyProgram(OGLAssemblyProgram&& o) noexcept : handle(std::exchange(o.handle, 0)) {}
+
+    ~OGLAssemblyProgram() {
+        Release();
+    }
+
+    /// Deletes the internal OpenGL resource
+    void Release();
+
+    GLuint handle = 0;
+};
+
 class OGLPipeline : private NonCopyable {
 public:
    OGLPipeline() = default;
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -20,6 +20,7 @@
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/shader_type.h"
 #include "video_core/memory_manager.h"
+#include "video_core/renderer_opengl/gl_arb_decompiler.h"
 #include "video_core/renderer_opengl/gl_rasterizer.h"
 #include "video_core/renderer_opengl/gl_shader_cache.h"
 #include "video_core/renderer_opengl/gl_shader_decompiler.h"
@@ -29,6 +30,7 @@
 #include "video_core/shader/memory_util.h"
 #include "video_core/shader/registry.h"
 #include "video_core/shader/shader_ir.h"
+#include "video_core/shader_cache.h"

 namespace OpenGL {

@@ -97,6 +99,24 @@ constexpr ShaderType GetShaderType(Maxwell::ShaderProgram program_type) {
    return {};
 }

+constexpr GLenum AssemblyEnum(ShaderType shader_type) {
+    switch (shader_type) {
+    case ShaderType::Vertex:
+        return GL_VERTEX_PROGRAM_NV;
+    case ShaderType::TesselationControl:
+        return GL_TESS_CONTROL_PROGRAM_NV;
+    case ShaderType::TesselationEval:
+        return GL_TESS_EVALUATION_PROGRAM_NV;
+    case ShaderType::Geometry:
+        return GL_GEOMETRY_PROGRAM_NV;
+    case ShaderType::Fragment:
+        return GL_FRAGMENT_PROGRAM_NV;
+    case ShaderType::Compute:
+        return GL_COMPUTE_PROGRAM_NV;
+    }
+    return {};
+}
+
 std::string MakeShaderID(u64 unique_identifier, ShaderType shader_type) {
    return fmt::format("{}{:016X}", GetShaderTypeName(shader_type), unique_identifier);
 }
@@ -120,18 +140,44 @@ std::shared_ptr<Registry> MakeRegistry(const ShaderDiskCacheEntry& entry) {
    return registry;
 }

-std::shared_ptr<OGLProgram> BuildShader(const Device& device, ShaderType shader_type,
-                                        u64 unique_identifier, const ShaderIR& ir,
-                                        const Registry& registry, bool hint_retrievable = false) {
+ProgramSharedPtr BuildShader(const Device& device, ShaderType shader_type, u64 unique_identifier,
+                             const ShaderIR& ir, const Registry& registry,
+                             bool hint_retrievable = false) {
    const std::string shader_id = MakeShaderID(unique_identifier, shader_type);
    LOG_INFO(Render_OpenGL, "{}", shader_id);

-    const std::string glsl = DecompileShader(device, ir, registry, shader_type, shader_id);
-    OGLShader shader;
-    shader.Create(glsl.c_str(), GetGLShaderType(shader_type));
+    auto program = std::make_shared<ProgramHandle>();
+
+    if (device.UseAssemblyShaders()) {
+        const std::string arb =
+            DecompileAssemblyShader(device, ir, registry, shader_type, shader_id);
+
+        GLuint& arb_prog = program->assembly_program.handle;
+
+// The calls in the disabled "#if 0" branch signal OpenGL errors but are compatible with apitrace.
+// Enable them only to capture and replay traces with apitrace.
+#if 0
+        glGenProgramsNV(1, &arb_prog);
+        glLoadProgramNV(AssemblyEnum(shader_type), arb_prog, static_cast<GLsizei>(arb.size()),
+                        reinterpret_cast<const GLubyte*>(arb.data()));
+#else
+        glGenProgramsARB(1, &arb_prog);
+        glNamedProgramStringEXT(arb_prog, AssemblyEnum(shader_type), GL_PROGRAM_FORMAT_ASCII_ARB,
+                                static_cast<GLsizei>(arb.size()), arb.data());
+#endif
+        const auto err = reinterpret_cast<const char*>(glGetString(GL_PROGRAM_ERROR_STRING_NV));
+        if (err && *err) {
+            LOG_CRITICAL(Render_OpenGL, "{}", err);
+            LOG_INFO(Render_OpenGL, "\n{}", arb);
+        }
+    } else {
+        const std::string glsl = DecompileShader(device, ir, registry, shader_type, shader_id);
+        OGLShader shader;
+        shader.Create(glsl.c_str(), GetGLShaderType(shader_type));
+
+        program->source_program.Create(true, hint_retrievable, shader.handle);
+    }

-    auto program = std::make_shared<OGLProgram>();
-    program->Create(true, hint_retrievable, shader.handle);
    return program;
 }

@@ -151,22 +197,26 @@ std::unordered_set<GLenum> GetSupportedFormats() {

 } // Anonymous namespace

-CachedShader::CachedShader(VAddr cpu_addr, std::size_t size_in_bytes,
-                           std::shared_ptr<VideoCommon::Shader::Registry> registry,
-                           ShaderEntries entries, std::shared_ptr<OGLProgram> program)
-    : RasterizerCacheObject{cpu_addr}, registry{std::move(registry)}, entries{std::move(entries)},
-      size_in_bytes{size_in_bytes}, program{std::move(program)} {}
-
-CachedShader::~CachedShader() = default;
-
-GLuint CachedShader::GetHandle() const {
-    DEBUG_ASSERT(registry->IsConsistent());
-    return program->handle;
+Shader::Shader(std::shared_ptr<VideoCommon::Shader::Registry> registry_, ShaderEntries entries_,
+               ProgramSharedPtr program_)
+    : registry{std::move(registry_)}, entries{std::move(entries_)}, program{std::move(program_)} {
+    handle = program->assembly_program.handle;
+    if (handle == 0) {
+        handle = program->source_program.handle;
+    }
+    ASSERT(handle != 0);
 }

-Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params,
-                                           Maxwell::ShaderProgram program_type, ProgramCode code,
-                                           ProgramCode code_b) {
+Shader::~Shader() = default;
+
+GLuint Shader::GetHandle() const {
+    DEBUG_ASSERT(registry->IsConsistent());
+    return handle;
+}
+
+std::unique_ptr<Shader> Shader::CreateStageFromMemory(const ShaderParameters& params,
+                                                      Maxwell::ShaderProgram program_type,
+                                                      ProgramCode code, ProgramCode code_b) {
    const auto shader_type = GetShaderType(program_type);
    const std::size_t size_in_bytes = code.size() * sizeof(u64);

@@ -191,11 +241,12 @@ Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params,
    entry.bindless_samplers = registry->GetBindlessSamplers();
    params.disk_cache.SaveEntry(std::move(entry));

-    return std::shared_ptr<CachedShader>(new CachedShader(
-        params.cpu_addr, size_in_bytes, std::move(registry), MakeEntries(ir), std::move(program)));
+    return std::unique_ptr<Shader>(new Shader(
+        std::move(registry), MakeEntries(params.device, ir, shader_type), std::move(program)));
 }

-Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code) {
+std::unique_ptr<Shader> Shader::CreateKernelFromMemory(const ShaderParameters& params,
+                                                       ProgramCode code) {
    const std::size_t size_in_bytes = code.size() * sizeof(u64);

    auto& engine = params.system.GPU().KeplerCompute();
@@ -215,22 +266,23 @@ Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, Prog
    entry.bindless_samplers = registry->GetBindlessSamplers();
    params.disk_cache.SaveEntry(std::move(entry));

-    return std::shared_ptr<CachedShader>(new CachedShader(
-        params.cpu_addr, size_in_bytes, std::move(registry), MakeEntries(ir), std::move(program)));
+    return std::unique_ptr<Shader>(new Shader(std::move(registry),
+                                              MakeEntries(params.device, ir, ShaderType::Compute),
+                                              std::move(program)));
 }

-Shader CachedShader::CreateFromCache(const ShaderParameters& params,
-                                     const PrecompiledShader& precompiled_shader,
-                                     std::size_t size_in_bytes) {
-    return std::shared_ptr<CachedShader>(
-        new CachedShader(params.cpu_addr, size_in_bytes, precompiled_shader.registry,
-                         precompiled_shader.entries, precompiled_shader.program));
+std::unique_ptr<Shader> Shader::CreateFromCache(const ShaderParameters& params,
+                                                const PrecompiledShader& precompiled_shader) {
+    return std::unique_ptr<Shader>(new Shader(
+        precompiled_shader.registry, precompiled_shader.entries, precompiled_shader.program));
 }

 ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system,
                                     Core::Frontend::EmuWindow& emu_window, const Device& device)
-    : RasterizerCache{rasterizer}, system{system}, emu_window{emu_window}, device{device},
-      disk_cache{system} {}
+    : VideoCommon::ShaderCache<Shader>{rasterizer}, system{system},
+      emu_window{emu_window}, device{device}, disk_cache{system} {}
+
+ShaderCacheOpenGL::~ShaderCacheOpenGL() = default;

 void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
                                      const VideoCore::DiskResourceLoadCallback& callback) {
@@ -239,7 +291,11 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
        return;
    }

-    const std::vector gl_cache = disk_cache.LoadPrecompiled();
+    std::vector<ShaderDiskCachePrecompiled> gl_cache;
+    if (!device.UseAssemblyShaders()) {
+        // Only load precompiled cache when we are not using assembly shaders
+        gl_cache = disk_cache.LoadPrecompiled();
+    }
    const auto supported_formats = GetSupportedFormats();

    // Track if precompiled cache was altered during loading to know if we have to
@@ -278,7 +334,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
            auto registry = MakeRegistry(entry);
            const ShaderIR ir(entry.code, main_offset, COMPILER_SETTINGS, *registry);

-            std::shared_ptr<OGLProgram> program;
+            ProgramSharedPtr program;
            if (precompiled_entry) {
                // If the shader is precompiled, attempt to load it with
                program = GeneratePrecompiledProgram(entry, *precompiled_entry, supported_formats);
@@ -294,7 +350,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
            PrecompiledShader shader;
            shader.program = std::move(program);
            shader.registry = std::move(registry);
-            shader.entries = MakeEntries(ir);
+            shader.entries = MakeEntries(device, ir, entry.type);

            std::scoped_lock lock{mutex};
            if (callback) {
@@ -332,6 +388,11 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
        return;
    }

+    if (device.UseAssemblyShaders()) {
+        // Don't store precompiled binaries for assembly shaders.
+        return;
+    }
+
    // TODO(Rodrigo): Do state tracking for transferable shaders and do a dummy draw
    // before precompiling them

@@ -339,7 +400,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
        const u64 id = (*transferable)[i].unique_identifier;
        const auto it = find_precompiled(id);
        if (it == gl_cache.end()) {
-            const GLuint program = runtime_cache.at(id).program->handle;
+            const GLuint program = runtime_cache.at(id).program->source_program.handle;
            disk_cache.SavePrecompiled(id, program);
            precompiled_cache_altered = true;
        }
@@ -350,7 +411,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
    }
 }

-std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram(
+ProgramSharedPtr ShaderCacheOpenGL::GeneratePrecompiledProgram(
    const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry,
    const std::unordered_set<GLenum>& supported_formats) {
    if (supported_formats.find(precompiled_entry.binary_format) == supported_formats.end()) {
@@ -358,15 +419,15 @@ std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram(
        return {};
    }

-    auto program = std::make_shared<OGLProgram>();
-    program->handle = glCreateProgram();
-    glProgramParameteri(program->handle, GL_PROGRAM_SEPARABLE, GL_TRUE);
-    glProgramBinary(program->handle, precompiled_entry.binary_format,
-                    precompiled_entry.binary.data(),
+    auto program = std::make_shared<ProgramHandle>();
+    GLuint& handle = program->source_program.handle;
+    handle = glCreateProgram();
+    glProgramParameteri(handle, GL_PROGRAM_SEPARABLE, GL_TRUE);
+    glProgramBinary(handle, precompiled_entry.binary_format, precompiled_entry.binary.data(),
                    static_cast<GLsizei>(precompiled_entry.binary.size()));

    GLint link_status;
-    glGetProgramiv(program->handle, GL_LINK_STATUS, &link_status);
+    glGetProgramiv(handle, GL_LINK_STATUS, &link_status);
    if (link_status == GL_FALSE) {
        LOG_INFO(Render_OpenGL, "Precompiled cache rejected by the driver, removing");
        return {};
@@ -375,7 +436,7 @@ std::shared_ptr<OGLProgram> ShaderCacheOpenGL::GeneratePrecompiledProgram(
    return program;
 }

-Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
+Shader* ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
    if (!system.GPU().Maxwell3D().dirty.flags[Dirty::Shaders]) {
        return last_shaders[static_cast<std::size_t>(program)];
    }
@@ -385,8 +446,7 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {

    // Look up shader in the cache based on address
    const auto cpu_addr{memory_manager.GpuToCpuAddress(address)};
-    Shader shader{cpu_addr ? TryGet(*cpu_addr) : null_shader};
-    if (shader) {
+    if (Shader* const shader{cpu_addr ? TryGet(*cpu_addr) : null_shader.get()}) {
        return last_shaders[static_cast<std::size_t>(program)] = shader;
    }

@@ -400,62 +460,64 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
        const u8* host_ptr_b = memory_manager.GetPointer(address_b);
        code_b = GetShaderCode(memory_manager, address_b, host_ptr_b, false);
    }
+    const std::size_t code_size = code.size() * sizeof(u64);

-    const auto unique_identifier = GetUniqueIdentifier(
+    const u64 unique_identifier = GetUniqueIdentifier(
        GetShaderType(program), program == Maxwell::ShaderProgram::VertexA, code, code_b);

    const ShaderParameters params{system,    disk_cache, device,
                                  *cpu_addr, host_ptr,   unique_identifier};

+    std::unique_ptr<Shader> shader;
    const auto found = runtime_cache.find(unique_identifier);
    if (found == runtime_cache.end()) {
-        shader = CachedShader::CreateStageFromMemory(params, program, std::move(code),
-                                                     std::move(code_b));
+        shader = Shader::CreateStageFromMemory(params, program, std::move(code), std::move(code_b));
    } else {
-        const std::size_t size_in_bytes = code.size() * sizeof(u64);
-        shader = CachedShader::CreateFromCache(params, found->second, size_in_bytes);
+        shader = Shader::CreateFromCache(params, found->second);
    }

+    Shader* const result = shader.get();
    if (cpu_addr) {
-        Register(shader);
+        Register(std::move(shader), *cpu_addr, code_size);
    } else {
-        null_shader = shader;
+        null_shader = std::move(shader);
    }

-    return last_shaders[static_cast<std::size_t>(program)] = shader;
+    return last_shaders[static_cast<std::size_t>(program)] = result;
 }

-Shader ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) {
+Shader* ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) {
    auto& memory_manager{system.GPU().MemoryManager()};
    const auto cpu_addr{memory_manager.GpuToCpuAddress(code_addr)};

-    auto kernel = cpu_addr ? TryGet(*cpu_addr) : null_kernel;
-    if (kernel) {
+    if (Shader* const kernel = cpu_addr ? TryGet(*cpu_addr) : null_kernel.get()) {
        return kernel;
    }

    const auto host_ptr{memory_manager.GetPointer(code_addr)};
    // No kernel found, create a new one
-    auto code{GetShaderCode(memory_manager, code_addr, host_ptr, true)};
-    const auto unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code)};
+    ProgramCode code{GetShaderCode(memory_manager, code_addr, host_ptr, true)};
+    const std::size_t code_size{code.size() * sizeof(u64)};
+    const u64 unique_identifier{GetUniqueIdentifier(ShaderType::Compute, false, code)};

    const ShaderParameters params{system,    disk_cache, device,
                                  *cpu_addr, host_ptr,   unique_identifier};

+    std::unique_ptr<Shader> kernel;
    const auto found = runtime_cache.find(unique_identifier);
    if (found == runtime_cache.end()) {
-        kernel = CachedShader::CreateKernelFromMemory(params, std::move(code));
+        kernel = Shader::CreateKernelFromMemory(params, std::move(code));
    } else {
-        const std::size_t size_in_bytes = code.size() * sizeof(u64);
-        kernel = CachedShader::CreateFromCache(params, found->second, size_in_bytes);
+        kernel = Shader::CreateFromCache(params, found->second);
    }

+    Shader* const result = kernel.get();
    if (cpu_addr) {
-        Register(kernel);
+        Register(std::move(kernel), *cpu_addr, code_size);
    } else {
-        null_kernel = kernel;
+        null_kernel = std::move(kernel);
    }
-    return kernel;
+    return result;
 }

 } // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -18,12 +18,12 @@

 #include "common/common_types.h"
 #include "video_core/engines/shader_type.h"
-#include "video_core/rasterizer_cache.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_shader_decompiler.h"
 #include "video_core/renderer_opengl/gl_shader_disk_cache.h"
 #include "video_core/shader/registry.h"
 #include "video_core/shader/shader_ir.h"
+#include "video_core/shader_cache.h"

 namespace Core {
 class System;
@@ -35,16 +35,19 @@ class EmuWindow;

 namespace OpenGL {

-class CachedShader;
 class Device;
 class RasterizerOpenGL;
-struct UnspecializedShader;

-using Shader = std::shared_ptr<CachedShader>;
 using Maxwell = Tegra::Engines::Maxwell3D::Regs;

+struct ProgramHandle {
+    OGLProgram source_program;
+    OGLAssemblyProgram assembly_program;
+};
+using ProgramSharedPtr = std::shared_ptr<ProgramHandle>;
+
 struct PrecompiledShader {
-    std::shared_ptr<OGLProgram> program;
+    ProgramSharedPtr program;
    std::shared_ptr<VideoCommon::Shader::Registry> registry;
    ShaderEntries entries;
 };
@@ -58,64 +61,56 @@ struct ShaderParameters {
    u64 unique_identifier;
 };

-class CachedShader final : public RasterizerCacheObject {
+class Shader final {
 public:
-    ~CachedShader();
+    ~Shader();

    /// Gets the GL program handle for the shader
    GLuint GetHandle() const;

-    /// Returns the size in bytes of the shader
-    std::size_t GetSizeInBytes() const override {
-        return size_in_bytes;
-    }
-
    /// Gets the shader entries for the shader
    const ShaderEntries& GetEntries() const {
        return entries;
    }

-    static Shader CreateStageFromMemory(const ShaderParameters& params,
-                                        Maxwell::ShaderProgram program_type,
-                                        ProgramCode program_code, ProgramCode program_code_b);
-    static Shader CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code);
+    static std::unique_ptr<Shader> CreateStageFromMemory(const ShaderParameters& params,
+                                                         Maxwell::ShaderProgram program_type,
+                                                         ProgramCode program_code,
+                                                         ProgramCode program_code_b);
+    static std::unique_ptr<Shader> CreateKernelFromMemory(const ShaderParameters& params,
+                                                          ProgramCode code);

-    static Shader CreateFromCache(const ShaderParameters& params,
-                                  const PrecompiledShader& precompiled_shader,
-                                  std::size_t size_in_bytes);
+    static std::unique_ptr<Shader> CreateFromCache(const ShaderParameters& params,
+                                                   const PrecompiledShader& precompiled_shader);

 private:
-    explicit CachedShader(VAddr cpu_addr, std::size_t size_in_bytes,
-                          std::shared_ptr<VideoCommon::Shader::Registry> registry,
-                          ShaderEntries entries, std::shared_ptr<OGLProgram> program);
+    explicit Shader(std::shared_ptr<VideoCommon::Shader::Registry> registry, ShaderEntries entries,
+                    ProgramSharedPtr program);

    std::shared_ptr<VideoCommon::Shader::Registry> registry;
    ShaderEntries entries;
-    std::size_t size_in_bytes = 0;
-    std::shared_ptr<OGLProgram> program;
+    ProgramSharedPtr program;
+    GLuint handle = 0;
 };

-class ShaderCacheOpenGL final : public RasterizerCache<Shader> {
+class ShaderCacheOpenGL final : public VideoCommon::ShaderCache<Shader> {
 public:
    explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system,
                               Core::Frontend::EmuWindow& emu_window, const Device& device);
+    ~ShaderCacheOpenGL() override;

    /// Loads disk cache for the current game
    void LoadDiskCache(const std::atomic_bool& stop_loading,
                       const VideoCore::DiskResourceLoadCallback& callback);

    /// Gets the current specified shader stage program
-    Shader GetStageProgram(Maxwell::ShaderProgram program);
+    Shader* GetStageProgram(Maxwell::ShaderProgram program);

    /// Gets a compute kernel in the passed address
-    Shader GetComputeKernel(GPUVAddr code_addr);
-
-protected:
-    // We do not have to flush this cache as things in it are never modified by us.
-    void FlushObjectInner(const Shader& object) override {}
+    Shader* GetComputeKernel(GPUVAddr code_addr);

 private:
-    std::shared_ptr<OGLProgram> GeneratePrecompiledProgram(
+    ProgramSharedPtr GeneratePrecompiledProgram(
        const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry,
        const std::unordered_set<GLenum>& supported_formats);

@@ -125,10 +120,10 @@ private:
    ShaderDiskCacheOpenGL disk_cache;
    std::unordered_map<u64, PrecompiledShader> runtime_cache;

-    Shader null_shader{};
-    Shader null_kernel{};
+    std::unique_ptr<Shader> null_shader;
+    std::unique_ptr<Shader> null_kernel;

-    std::array<Shader, Maxwell::MaxShaderProgram> last_shaders;
+    std::array<Shader*, Maxwell::MaxShaderProgram> last_shaders{};
 };

 } // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -37,6 +37,7 @@ using Tegra::Shader::IpaMode;
 using Tegra::Shader::IpaSampleMode;
 using Tegra::Shader::PixelImap;
 using Tegra::Shader::Register;
+using Tegra::Shader::TextureType;
 using VideoCommon::Shader::BuildTransformFeedback;
 using VideoCommon::Shader::Registry;

@@ -61,8 +62,8 @@ struct TextureDerivates {};
 using TextureArgument = std::pair<Type, Node>;
 using TextureIR = std::variant<TextureOffset, TextureDerivates, TextureArgument>;

-constexpr u32 MAX_CONSTBUFFER_ELEMENTS =
-    static_cast<u32>(Maxwell::MaxConstBufferSize) / (4 * sizeof(float));
+constexpr u32 MAX_CONSTBUFFER_SCALARS = static_cast<u32>(Maxwell::MaxConstBufferSize) / sizeof(u32);
+constexpr u32 MAX_CONSTBUFFER_ELEMENTS = MAX_CONSTBUFFER_SCALARS / 4; // 4 scalars per element

 constexpr std::string_view CommonDeclarations = R"(#define ftoi floatBitsToInt
 #define ftou floatBitsToUint
@@ -402,6 +403,13 @@ std::string FlowStackTopName(MetaStackClass stack) {
    return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack));
 }

+bool UseUnifiedUniforms(const Device& device, const ShaderIR& ir, ShaderType stage) {
+    const u32 num_ubos = static_cast<u32>(ir.GetConstantBuffers().size());
+    // We waste one UBO for emulation
+    const u32 num_available_ubos = device.GetMaxUniformBuffers(stage) - 1;
+    return num_ubos > num_available_ubos;
+}
+
 struct GenericVaryingDescription {
    std::string name;
    u8 first_element = 0;
@@ -412,8 +420,9 @@ class GLSLDecompiler final {
 public:
    explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry,
                            ShaderType stage, std::string_view identifier, std::string_view suffix)
-        : device{device}, ir{ir}, registry{registry}, stage{stage},
-          identifier{identifier}, suffix{suffix}, header{ir.GetHeader()} {
+        : device{device}, ir{ir}, registry{registry}, stage{stage}, identifier{identifier},
+          suffix{suffix}, header{ir.GetHeader()}, use_unified_uniforms{
+                                                      UseUnifiedUniforms(device, ir, stage)} {
        if (stage != ShaderType::Compute) {
            transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo());
        }
@@ -518,6 +527,9 @@ private:
        if (device.HasImageLoadFormatted()) {
            code.AddLine("#extension GL_EXT_shader_image_load_formatted : require");
        }
+        if (device.HasTextureShadowLod()) {
+            code.AddLine("#extension GL_EXT_texture_shadow_lod : require");
+        }
        if (device.HasWarpIntrinsics()) {
            code.AddLine("#extension GL_NV_gpu_shader5 : require");
            code.AddLine("#extension GL_NV_shader_thread_group : require");
@@ -618,7 +630,9 @@ private:
                break;
            }
        }
-        if (stage != ShaderType::Vertex || device.HasVertexViewportLayer()) {
+
+        if (stage != ShaderType::Geometry &&
+            (stage != ShaderType::Vertex || device.HasVertexViewportLayer())) {
            if (ir.UsesLayer()) {
                code.AddLine("int gl_Layer;");
            }
@@ -647,6 +661,16 @@ private:
        --code.scope;
        code.AddLine("}};");
        code.AddNewLine();
+
+        if (stage == ShaderType::Geometry) {
+            if (ir.UsesLayer()) {
+                code.AddLine("out int gl_Layer;");
+            }
+            if (ir.UsesViewportIndex()) {
+                code.AddLine("out int gl_ViewportIndex;");
+            }
+        }
+        code.AddNewLine();
    }

    void DeclareRegisters() {
@@ -834,12 +858,24 @@ private:
    }

    void DeclareConstantBuffers() {
+        if (use_unified_uniforms) {
+            const u32 binding = device.GetBaseBindings(stage).shader_storage_buffer +
+                                static_cast<u32>(ir.GetGlobalMemory().size());
+            code.AddLine("layout (std430, binding = {}) readonly buffer UnifiedUniforms {{",
+                         binding);
+            code.AddLine("    uint cbufs[];");
+            code.AddLine("}};");
+            code.AddNewLine();
+            return;
+        }
+
        u32 binding = device.GetBaseBindings(stage).uniform_buffer;
-        for (const auto& buffers : ir.GetConstantBuffers()) {
-            const auto index = buffers.first;
+        for (const auto [index, info] : ir.GetConstantBuffers()) {
+            const u32 num_elements = Common::AlignUp(info.GetSize(), 4) / 4;
+            const u32 size = info.IsIndirect() ? MAX_CONSTBUFFER_ELEMENTS : num_elements;
            code.AddLine("layout (std140, binding = {}) uniform {} {{", binding++,
                         GetConstBufferBlock(index));
-            code.AddLine("    uvec4 {}[{}];", GetConstBuffer(index), MAX_CONSTBUFFER_ELEMENTS);
+            code.AddLine("    uvec4 {}[{}];", GetConstBuffer(index), size);
            code.AddLine("}};");
            code.AddNewLine();
        }
@@ -877,13 +913,13 @@ private:
                    return "samplerBuffer";
                }
                switch (sampler.type) {
-                case Tegra::Shader::TextureType::Texture1D:
+                case TextureType::Texture1D:
                    return "sampler1D";
-                case Tegra::Shader::TextureType::Texture2D:
+                case TextureType::Texture2D:
                    return "sampler2D";
-                case Tegra::Shader::TextureType::Texture3D:
+                case TextureType::Texture3D:
                    return "sampler3D";
-                case Tegra::Shader::TextureType::TextureCube:
+                case TextureType::TextureCube:
                    return "samplerCube";
                default:
                    UNREACHABLE();
@@ -1038,42 +1074,51 @@ private:

        if (const auto cbuf = std::get_if<CbufNode>(&*node)) {
            const Node offset = cbuf->GetOffset();
+            const u32 base_unified_offset = cbuf->GetIndex() * MAX_CONSTBUFFER_SCALARS;
+
            if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) {
                // Direct access
                const u32 offset_imm = immediate->GetValue();
                ASSERT_MSG(offset_imm % 4 == 0, "Unaligned cbuf direct access");
-                return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()),
-                                    offset_imm / (4 * 4), (offset_imm / 4) % 4),
+                if (use_unified_uniforms) {
+                    return {fmt::format("cbufs[{}]", base_unified_offset + offset_imm / 4),
+                            Type::Uint};
+                } else {
+                    return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()),
+                                        offset_imm / (4 * 4), (offset_imm / 4) % 4),
+                            Type::Uint};
+                }
+            }
+
+            // Indirect access
+            if (use_unified_uniforms) {
+                return {fmt::format("cbufs[{} + ({} >> 2)]", base_unified_offset,
+                                    Visit(offset).AsUint()),
                        Type::Uint};
            }

-            if (std::holds_alternative<OperationNode>(*offset)) {
-                // Indirect access
-                const std::string final_offset = code.GenerateTemporary();
-                code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint());
+            const std::string final_offset = code.GenerateTemporary();
+            code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint());

-                if (!device.HasComponentIndexingBug()) {
-                    return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()),
-                                        final_offset, final_offset),
-                            Type::Uint};
-                }
-
-                // AMD's proprietary GLSL compiler emits ill code for variable component access.
-                // To bypass this driver bug generate 4 ifs, one per each component.
-                const std::string pack = code.GenerateTemporary();
-                code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()),
-                             final_offset);
-
-                const std::string result = code.GenerateTemporary();
-                code.AddLine("uint {};", result);
-                for (u32 swizzle = 0; swizzle < 4; ++swizzle) {
-                    code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result,
-                                 pack, GetSwizzle(swizzle));
-                }
-                return {result, Type::Uint};
+            if (!device.HasComponentIndexingBug()) {
+                return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()),
+                                    final_offset, final_offset),
+                        Type::Uint};
            }

-            UNREACHABLE_MSG("Unmanaged offset node type");
+            // AMD's proprietary GLSL compiler emits ill code for variable component access.
+            // To bypass this driver bug generate 4 ifs, one per each component.
+            const std::string pack = code.GenerateTemporary();
+            code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()),
+                         final_offset);
+
+            const std::string result = code.GenerateTemporary();
+            code.AddLine("uint {};", result);
+            for (u32 swizzle = 0; swizzle < 4; ++swizzle) {
+                code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result, pack,
+                             GetSwizzle(swizzle));
+            }
+            return {result, Type::Uint};
        }

        if (const auto gmem = std::get_if<GmemNode>(&*node)) {
@@ -1339,8 +1384,19 @@ private:
        const std::size_t count = operation.GetOperandsCount();
        const bool has_array = meta->sampler.is_array;
        const bool has_shadow = meta->sampler.is_shadow;
+        const bool workaround_lod_array_shadow_as_grad =
+            !device.HasTextureShadowLod() && function_suffix == "Lod" && meta->sampler.is_shadow &&
+            ((meta->sampler.type == TextureType::Texture2D && meta->sampler.is_array) ||
+             meta->sampler.type == TextureType::TextureCube);
+
+        std::string expr = "texture";
+
+        if (workaround_lod_array_shadow_as_grad) {
+            expr += "Grad";
+        } else {
+            expr += function_suffix;
+        }

-        std::string expr = "texture" + function_suffix;
        if (!meta->aoffi.empty()) {
            expr += "Offset";
        } else if (!meta->ptp.empty()) {
@@ -1374,6 +1430,16 @@ private:
            expr += ')';
        }

+        if (workaround_lod_array_shadow_as_grad) {
+            switch (meta->sampler.type) {
+            case TextureType::Texture2D:
+                return expr + ", vec2(0.0), vec2(0.0))";
+            case TextureType::TextureCube:
+                return expr + ", vec3(0.0), vec3(0.0))";
+            }
+            UNREACHABLE();
+        }
+
        for (const auto& variant : extras) {
            if (const auto argument = std::get_if<TextureArgument>(&variant)) {
                expr += GenerateTextureArgument(*argument);
@@ -1538,7 +1604,9 @@ private:
        Expression target;
        if (const auto gpr = std::get_if<GprNode>(&*dest)) {
            if (gpr->GetIndex() == Register::ZeroIndex) {
-                // Writing to Register::ZeroIndex is a no op
+                // Writing to Register::ZeroIndex is a no op but we still have to visit the source
+                // as it might have side effects.
+                code.AddLine("{};", Visit(src).GetCode());
                return {};
            }
            target = {GetRegister(gpr->GetIndex()), Type::Float};
@@ -1998,8 +2066,19 @@ private:
        const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
        ASSERT(meta);

-        std::string expr = GenerateTexture(
-            operation, "Lod", {TextureArgument{Type::Float, meta->lod}, TextureOffset{}});
+        std::string expr{};
+
+        if (!device.HasTextureShadowLod() && meta->sampler.is_shadow &&
+            ((meta->sampler.type == TextureType::Texture2D && meta->sampler.is_array) ||
+             meta->sampler.type == TextureType::TextureCube)) {
+            LOG_ERROR(Render_OpenGL,
+                      "Device lacks GL_EXT_texture_shadow_lod, using textureGrad as a workaround");
+            expr = GenerateTexture(operation, "Lod", {});
+        } else {
+            expr = GenerateTexture(operation, "Lod",
+                                   {TextureArgument{Type::Float, meta->lod}, TextureOffset{}});
+        }
+
        if (meta->sampler.is_shadow) {
            expr = "vec4(" + expr + ')';
        }
@@ -2309,6 +2388,18 @@ private:
        return {"gl_SubGroupInvocationARB", Type::Uint};
    }

+    /// Emits the thread mask built-in selected by the comparison suffix (e.g. "Eq", "Ge").
+    /// Uses the NV built-in when warp intrinsics are available, otherwise the ARB shader
+    /// ballot one; when neither is available an error is logged and a zero mask is emitted.
+    template <const std::string_view& comparison>
+    Expression ThreadMask(Operation) {
+        const bool has_warp_intrinsics = device.HasWarpIntrinsics();
+        if (!has_warp_intrinsics && !device.HasShaderBallot()) {
+            LOG_ERROR(Render_OpenGL, "Thread mask intrinsics are required by the shader");
+            return {"0U", Type::Uint};
+        }
+        if (has_warp_intrinsics) {
+            return {fmt::format("gl_Thread{}MaskNV", comparison), Type::Uint};
+        }
+        return {fmt::format("uint(gl_SubGroup{}MaskARB)", comparison), Type::Uint};
+    }
+
    Expression ShuffleIndexed(Operation operation) {
        std::string value = VisitOperand(operation, 0).AsFloat();

@@ -2321,7 +2412,21 @@ private:
        return {fmt::format("readInvocationARB({}, {})", value, index), Type::Float};
    }

-    Expression MemoryBarrierGL(Operation) {
+    /// Emits a GLSL barrier() call. When the IR reports that the shader is not decompiled,
+    /// nothing is emitted and an error is logged instead.
+    Expression Barrier(Operation) {
+        if (ir.IsDecompiled()) {
+            code.AddLine("barrier();");
+        } else {
+            LOG_ERROR(Render_OpenGL, "barrier() used but shader is not decompiled");
+        }
+        return {};
+    }
+
+    /// Emits a GLSL groupMemoryBarrier() statement; the operation yields no value.
+    Expression MemoryBarrierGroup(Operation) {
+        code.AddLine("groupMemoryBarrier();");
+        return {};
+    }
+
+    Expression MemoryBarrierGlobal(Operation) {
        code.AddLine("memoryBarrier();");
        return {};
    }
@@ -2337,6 +2442,12 @@ private:
        static constexpr std::string_view NotEqual = "!=";
        static constexpr std::string_view GreaterEqual = ">=";

+        static constexpr std::string_view Eq = "Eq";
+        static constexpr std::string_view Ge = "Ge";
+        static constexpr std::string_view Gt = "Gt";
+        static constexpr std::string_view Le = "Le";
+        static constexpr std::string_view Lt = "Lt";
+
        static constexpr std::string_view Add = "Add";
        static constexpr std::string_view Min = "Min";
        static constexpr std::string_view Max = "Max";
@@ -2554,9 +2665,16 @@ private:
        &GLSLDecompiler::VoteEqual,

        &GLSLDecompiler::ThreadId,
+        &GLSLDecompiler::ThreadMask<Func::Eq>,
+        &GLSLDecompiler::ThreadMask<Func::Ge>,
+        &GLSLDecompiler::ThreadMask<Func::Gt>,
+        &GLSLDecompiler::ThreadMask<Func::Le>,
+        &GLSLDecompiler::ThreadMask<Func::Lt>,
        &GLSLDecompiler::ShuffleIndexed,

-        &GLSLDecompiler::MemoryBarrierGL,
+        &GLSLDecompiler::Barrier,
+        &GLSLDecompiler::MemoryBarrierGroup,
+        &GLSLDecompiler::MemoryBarrierGlobal,
    };
    static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));

@@ -2669,6 +2787,7 @@ private:
    const std::string_view identifier;
    const std::string_view suffix;
    const Header header;
+    const bool use_unified_uniforms;
    std::unordered_map<u8, VaryingTFB> transform_feedback;

    ShaderWriter code;
@@ -2864,7 +2983,7 @@ void GLSLDecompiler::DecompileAST() {

 } // Anonymous namespace

-ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) {
+ShaderEntries MakeEntries(const Device& device, const ShaderIR& ir, ShaderType stage) {
    ShaderEntries entries;
    for (const auto& cbuf : ir.GetConstantBuffers()) {
        entries.const_buffers.emplace_back(cbuf.second.GetMaxOffset(), cbuf.second.IsIndirect(),
@@ -2885,6 +3004,7 @@ ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) {
        entries.clip_distances = (clip_distances[i] ? 1U : 0U) << i;
    }
    entries.shader_length = ir.GetLength();
+    entries.use_unified_uniforms = UseUnifiedUniforms(device, ir, stage);
    return entries;
 }

--- a/src/video_core/renderer_opengl/gl_shader_decompiler.h
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h
@@ -53,11 +53,13 @@ struct ShaderEntries {
    std::vector<GlobalMemoryEntry> global_memory_entries;
    std::vector<SamplerEntry> samplers;
    std::vector<ImageEntry> images;
-    u32 clip_distances{};
    std::size_t shader_length{};
+    u32 clip_distances{};
+    bool use_unified_uniforms{};
 };

-ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir);
+ShaderEntries MakeEntries(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
+                          Tegra::Engines::ShaderType stage);

 std::string DecompileShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
                            const VideoCommon::Shader::Registry& registry,
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
@@ -29,6 +29,8 @@ using VideoCommon::Shader::KeyMap;

 namespace {

+using VideoCommon::Shader::SeparateSamplerKey;
+
 using ShaderCacheVersionHash = std::array<u8, 64>;

 struct ConstBufferKey {
@@ -37,18 +39,26 @@ struct ConstBufferKey {
    u32 value = 0;
 };

-struct BoundSamplerKey {
+struct BoundSamplerEntry {
    u32 offset = 0;
    Tegra::Engines::SamplerDescriptor sampler;
 };

-struct BindlessSamplerKey {
+/// Flat on-disk record for one separate sampler entry.
+/// Arrays of this struct are read and written directly with ReadArray/WriteArray, so changing
+/// the fields changes the cache file layout.
+struct SeparateSamplerEntry {
+    // First and second constant buffer indices (SeparateSamplerKey::buffers)
+    u32 cbuf1 = 0;
+    u32 cbuf2 = 0;
+    // First and second offsets (SeparateSamplerKey::offsets)
+    u32 offset1 = 0;
+    u32 offset2 = 0;
+    // Sampler descriptor stored for this key
+    Tegra::Engines::SamplerDescriptor sampler;
+};
+
+struct BindlessSamplerEntry {
    u32 cbuf = 0;
    u32 offset = 0;
    Tegra::Engines::SamplerDescriptor sampler;
 };

-constexpr u32 NativeVersion = 20;
+constexpr u32 NativeVersion = 21;

 ShaderCacheVersionHash GetShaderCacheVersionHash() {
    ShaderCacheVersionHash hash{};
@@ -87,12 +97,14 @@ bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) {
    u32 texture_handler_size_value;
    u32 num_keys;
    u32 num_bound_samplers;
+    u32 num_separate_samplers;
    u32 num_bindless_samplers;
    if (file.ReadArray(&unique_identifier, 1) != 1 || file.ReadArray(&bound_buffer, 1) != 1 ||
        file.ReadArray(&is_texture_handler_size_known, 1) != 1 ||
        file.ReadArray(&texture_handler_size_value, 1) != 1 ||
        file.ReadArray(&graphics_info, 1) != 1 || file.ReadArray(&compute_info, 1) != 1 ||
        file.ReadArray(&num_keys, 1) != 1 || file.ReadArray(&num_bound_samplers, 1) != 1 ||
+        file.ReadArray(&num_separate_samplers, 1) != 1 ||
        file.ReadArray(&num_bindless_samplers, 1) != 1) {
        return false;
    }
@@ -101,23 +113,32 @@ bool ShaderDiskCacheEntry::Load(FileUtil::IOFile& file) {
    }

    std::vector<ConstBufferKey> flat_keys(num_keys);
-    std::vector<BoundSamplerKey> flat_bound_samplers(num_bound_samplers);
-    std::vector<BindlessSamplerKey> flat_bindless_samplers(num_bindless_samplers);
+    std::vector<BoundSamplerEntry> flat_bound_samplers(num_bound_samplers);
+    std::vector<SeparateSamplerEntry> flat_separate_samplers(num_separate_samplers);
+    std::vector<BindlessSamplerEntry> flat_bindless_samplers(num_bindless_samplers);
    if (file.ReadArray(flat_keys.data(), flat_keys.size()) != flat_keys.size() ||
        file.ReadArray(flat_bound_samplers.data(), flat_bound_samplers.size()) !=
            flat_bound_samplers.size() ||
+        file.ReadArray(flat_separate_samplers.data(), flat_separate_samplers.size()) !=
+            flat_separate_samplers.size() ||
        file.ReadArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) !=
            flat_bindless_samplers.size()) {
        return false;
    }
-    for (const auto& key : flat_keys) {
-        keys.insert({{key.cbuf, key.offset}, key.value});
+    for (const auto& entry : flat_keys) {
+        keys.insert({{entry.cbuf, entry.offset}, entry.value});
    }
-    for (const auto& key : flat_bound_samplers) {
-        bound_samplers.emplace(key.offset, key.sampler);
+    for (const auto& entry : flat_bound_samplers) {
+        bound_samplers.emplace(entry.offset, entry.sampler);
    }
-    for (const auto& key : flat_bindless_samplers) {
-        bindless_samplers.insert({{key.cbuf, key.offset}, key.sampler});
+    for (const auto& entry : flat_separate_samplers) {
+        SeparateSamplerKey key;
+        key.buffers = {entry.cbuf1, entry.cbuf2};
+        key.offsets = {entry.offset1, entry.offset2};
+        separate_samplers.emplace(key, entry.sampler);
+    }
+    for (const auto& entry : flat_bindless_samplers) {
+        bindless_samplers.insert({{entry.cbuf, entry.offset}, entry.sampler});
    }

    return true;
@@ -142,6 +163,7 @@ bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const {
        file.WriteObject(graphics_info) != 1 || file.WriteObject(compute_info) != 1 ||
        file.WriteObject(static_cast<u32>(keys.size())) != 1 ||
        file.WriteObject(static_cast<u32>(bound_samplers.size())) != 1 ||
+        file.WriteObject(static_cast<u32>(separate_samplers.size())) != 1 ||
        file.WriteObject(static_cast<u32>(bindless_samplers.size())) != 1) {
        return false;
    }
@@ -152,22 +174,34 @@ bool ShaderDiskCacheEntry::Save(FileUtil::IOFile& file) const {
        flat_keys.push_back(ConstBufferKey{address.first, address.second, value});
    }

-    std::vector<BoundSamplerKey> flat_bound_samplers;
+    std::vector<BoundSamplerEntry> flat_bound_samplers;
    flat_bound_samplers.reserve(bound_samplers.size());
    for (const auto& [address, sampler] : bound_samplers) {
-        flat_bound_samplers.push_back(BoundSamplerKey{address, sampler});
+        flat_bound_samplers.push_back(BoundSamplerEntry{address, sampler});
    }

-    std::vector<BindlessSamplerKey> flat_bindless_samplers;
+    std::vector<SeparateSamplerEntry> flat_separate_samplers;
+    flat_separate_samplers.reserve(separate_samplers.size());
+    for (const auto& [key, sampler] : separate_samplers) {
+        SeparateSamplerEntry entry;
+        std::tie(entry.cbuf1, entry.cbuf2) = key.buffers;
+        std::tie(entry.offset1, entry.offset2) = key.offsets;
+        entry.sampler = sampler;
+        flat_separate_samplers.push_back(entry);
+    }
+
+    std::vector<BindlessSamplerEntry> flat_bindless_samplers;
    flat_bindless_samplers.reserve(bindless_samplers.size());
    for (const auto& [address, sampler] : bindless_samplers) {
        flat_bindless_samplers.push_back(
-            BindlessSamplerKey{address.first, address.second, sampler});
+            BindlessSamplerEntry{address.first, address.second, sampler});
    }

    return file.WriteArray(flat_keys.data(), flat_keys.size()) == flat_keys.size() &&
           file.WriteArray(flat_bound_samplers.data(), flat_bound_samplers.size()) ==
               flat_bound_samplers.size() &&
+           file.WriteArray(flat_separate_samplers.data(), flat_separate_samplers.size()) ==
+               flat_separate_samplers.size() &&
           file.WriteArray(flat_bindless_samplers.data(), flat_bindless_samplers.size()) ==
               flat_bindless_samplers.size();
 }
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h
@@ -57,6 +57,7 @@ struct ShaderDiskCacheEntry {
    VideoCommon::Shader::ComputeInfo compute_info;
    VideoCommon::Shader::KeyMap keys;
    VideoCommon::Shader::BoundSamplerMap bound_samplers;
+    VideoCommon::Shader::SeparateSamplerMap separate_samplers;
    VideoCommon::Shader::BindlessSamplerMap bindless_samplers;
 };

--- a/src/video_core/renderer_opengl/gl_shader_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp
@@ -6,47 +6,111 @@

 #include "common/common_types.h"
 #include "video_core/engines/maxwell_3d.h"
+#include "video_core/renderer_opengl/gl_device.h"
 #include "video_core/renderer_opengl/gl_shader_manager.h"

-namespace OpenGL::GLShader {
+namespace OpenGL {

-ProgramManager::ProgramManager() = default;
+/// Selects between assembly programs and GLSL programs based on the device, then prepares the
+/// matching GL state: the compute program target is enabled for assembly programs, otherwise
+/// the graphics program pipeline object is created and bound.
+ProgramManager::ProgramManager(const Device& device)
+    : use_assembly_programs(device.UseAssemblyShaders()) {
+    if (!use_assembly_programs) {
+        graphics_pipeline.Create();
+        glBindProgramPipeline(graphics_pipeline.handle);
+        return;
+    }
+    glEnable(GL_COMPUTE_PROGRAM_NV);
+}

 ProgramManager::~ProgramManager() = default;

-void ProgramManager::Create() {
-    graphics_pipeline.Create();
-    glBindProgramPipeline(graphics_pipeline.handle);
+void ProgramManager::BindCompute(GLuint program) {
+    if (use_assembly_programs) {
+        glBindProgramARB(GL_COMPUTE_PROGRAM_NV, program);
+    } else {
+        is_graphics_bound = false;
+        glUseProgram(program);
+    }
 }

 void ProgramManager::BindGraphicsPipeline() {
+    if (use_assembly_programs) {
+        UpdateAssemblyPrograms();
+    } else {
+        UpdateSourcePrograms();
+    }
+}
+
+/// Binds a program pipeline object that is not tracked as guest state.
+/// RestoreGuestPipeline() rebinds the guest pipeline afterwards.
+void ProgramManager::BindHostPipeline(GLuint pipeline) {
+    if (use_assembly_programs) {
+        if (geometry_enabled) {
+            // Turn the geometry program stage off and forget the cached handle, so the next
+            // UpdateAssemblyPrograms() sees a difference and re-enables and rebinds the stage
+            // if a geometry program is in use.
+            geometry_enabled = false;
+            old_state.geometry = 0;
+            glDisable(GL_GEOMETRY_PROGRAM_NV);
+        }
+    } else {
+        if (!is_graphics_bound) {
+            // A program set through BindCompute() is still current; unbind it.
+            // is_graphics_bound is intentionally left untouched here.
+            glUseProgram(0);
+        }
+    }
+    glBindProgramPipeline(pipeline);
+}
+
+/// Rebinds the pipeline used for guest rendering: no pipeline object when assembly programs
+/// are in use, the graphics pipeline object otherwise.
+void ProgramManager::RestoreGuestPipeline() {
+    const GLuint guest_pipeline = use_assembly_programs ? 0 : graphics_pipeline.handle;
+    glBindProgramPipeline(guest_pipeline);
+}
+
+/// Applies the pending vertex, geometry and fragment assembly programs, touching GL state
+/// only for stages whose program handle changed since the last update.
+void ProgramManager::UpdateAssemblyPrograms() {
+    // stage:   program target enum passed to glEnable/glDisable/glBindProgramARB
+    // enabled: cached enable flag for that target, updated in place
+    // current: handle requested for this update, old: handle applied by the previous update
+    const auto update_state = [](GLenum stage, bool& enabled, GLuint current, GLuint old) {
+        if (current == old) {
+            // Nothing changed for this stage
+            return;
+        }
+        if (current == 0) {
+            // A zero handle means the stage is unused: disable it if it was enabled
+            if (enabled) {
+                enabled = false;
+                glDisable(stage);
+            }
+            return;
+        }
+        if (!enabled) {
+            enabled = true;
+            glEnable(stage);
+        }
+        glBindProgramARB(stage, current);
+    };
+
+    update_state(GL_VERTEX_PROGRAM_NV, vertex_enabled, current_state.vertex, old_state.vertex);
+    update_state(GL_GEOMETRY_PROGRAM_NV, geometry_enabled, current_state.geometry,
+                 old_state.geometry);
+    update_state(GL_FRAGMENT_PROGRAM_NV, fragment_enabled, current_state.fragment,
+                 old_state.fragment);
+
+    // Remember what was applied so the next call can skip unchanged stages
+    old_state = current_state;
+}
+
+void ProgramManager::UpdateSourcePrograms() {
    if (!is_graphics_bound) {
        is_graphics_bound = true;
        glUseProgram(0);
    }

-    // Avoid updating the pipeline when values have no changed
-    if (old_state == current_state) {
-        return;
-    }
-
-    // Workaround for AMD bug
-    static constexpr GLenum all_used_stages{GL_VERTEX_SHADER_BIT | GL_GEOMETRY_SHADER_BIT |
-                                            GL_FRAGMENT_SHADER_BIT};
    const GLuint handle = graphics_pipeline.handle;
-    glUseProgramStages(handle, all_used_stages, 0);
-    glUseProgramStages(handle, GL_VERTEX_SHADER_BIT, current_state.vertex_shader);
-    glUseProgramStages(handle, GL_GEOMETRY_SHADER_BIT, current_state.geometry_shader);
-    glUseProgramStages(handle, GL_FRAGMENT_SHADER_BIT, current_state.fragment_shader);
+    const auto update_state = [handle](GLenum stage, GLuint current, GLuint old) {
+        if (current == old) {
+            return;
+        }
+        glUseProgramStages(handle, stage, current);
+    };
+    update_state(GL_VERTEX_SHADER_BIT, current_state.vertex, old_state.vertex);
+    update_state(GL_GEOMETRY_SHADER_BIT, current_state.geometry, old_state.geometry);
+    update_state(GL_FRAGMENT_SHADER_BIT, current_state.fragment, old_state.fragment);

    old_state = current_state;
 }

-void ProgramManager::BindComputeShader(GLuint program) {
-    is_graphics_bound = false;
-    glUseProgram(program);
-}
-
 void MaxwellUniformData::SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell) {
    const auto& regs = maxwell.regs;

@@ -54,4 +118,4 @@ void MaxwellUniformData::SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell) {
    y_direction = regs.screen_y_control.y_negate == 0 ? 1.0f : -1.0f;
 }

-} // namespace OpenGL::GLShader
+} // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_shader_manager.h
+++ b/src/video_core/renderer_opengl/gl_shader_manager.h
@@ -11,7 +11,9 @@
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/maxwell_to_gl.h"

-namespace OpenGL::GLShader {
+namespace OpenGL {
+
+class Device;

 /// Uniform structure for the Uniform Buffer Object, all vectors must be 16-byte aligned
 /// @note Always keep a vec4 at the end. The GL spec is not clear whether the alignment at
@@ -28,50 +30,58 @@ static_assert(sizeof(MaxwellUniformData) < 16384,

 class ProgramManager {
 public:
-    explicit ProgramManager();
+    explicit ProgramManager(const Device& device);
    ~ProgramManager();

-    void Create();
+    /// Binds a compute program
+    void BindCompute(GLuint program);

-    /// Updates the graphics pipeline and binds it.
+    /// Updates bound programs.
    void BindGraphicsPipeline();

-    /// Binds a compute shader.
-    void BindComputeShader(GLuint program);
+    /// Binds an OpenGL pipeline object unsynchronized with the guest state.
+    void BindHostPipeline(GLuint pipeline);
+
+    /// Rewinds BindHostPipeline state changes.
+    void RestoreGuestPipeline();

    void UseVertexShader(GLuint program) {
-        current_state.vertex_shader = program;
+        current_state.vertex = program;
    }

    void UseGeometryShader(GLuint program) {
-        current_state.geometry_shader = program;
+        current_state.geometry = program;
    }

    void UseFragmentShader(GLuint program) {
-        current_state.fragment_shader = program;
+        current_state.fragment = program;
    }

 private:
    struct PipelineState {
-        bool operator==(const PipelineState& rhs) const noexcept {
-            return vertex_shader == rhs.vertex_shader && fragment_shader == rhs.fragment_shader &&
-                   geometry_shader == rhs.geometry_shader;
-        }
-
-        bool operator!=(const PipelineState& rhs) const noexcept {
-            return !operator==(rhs);
-        }
-
-        GLuint vertex_shader = 0;
-        GLuint fragment_shader = 0;
-        GLuint geometry_shader = 0;
+        GLuint vertex = 0;
+        GLuint geometry = 0;
+        GLuint fragment = 0;
    };

+    /// Update NV_gpu_program5 programs.
+    void UpdateAssemblyPrograms();
+
+    /// Update GLSL programs.
+    void UpdateSourcePrograms();
+
    OGLPipeline graphics_pipeline;
-    OGLPipeline compute_pipeline;
+
    PipelineState current_state;
    PipelineState old_state;
+
+    bool use_assembly_programs = false;
+
    bool is_graphics_bound = true;
+
+    bool vertex_enabled = false;
+    bool geometry_enabled = false;
+    bool fragment_enabled = false;
 };

-} // namespace OpenGL::GLShader
+} // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
@@ -2,11 +2,13 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

-#include <deque>
+#include <tuple>
 #include <vector>
+
 #include "common/alignment.h"
 #include "common/assert.h"
 #include "common/microprofile.h"
+#include "video_core/renderer_opengl/gl_device.h"
 #include "video_core/renderer_opengl/gl_stream_buffer.h"

 MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning",
@@ -14,8 +16,7 @@ MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning",

 namespace OpenGL {

-OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent,
-                                 bool use_persistent)
+OGLStreamBuffer::OGLStreamBuffer(const Device& device, GLsizeiptr size, bool vertex_data_usage)
    : buffer_size(size) {
    gl_buffer.Create();

@@ -29,34 +30,22 @@ OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool p
        allocate_size *= 2;
    }

-    if (use_persistent) {
-        persistent = true;
-        coherent = prefer_coherent;
-        const GLbitfield flags =
-            GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0);
-        glNamedBufferStorage(gl_buffer.handle, allocate_size, nullptr, flags);
-        mapped_ptr = static_cast<u8*>(glMapNamedBufferRange(
-            gl_buffer.handle, 0, buffer_size, flags | (coherent ? 0 : GL_MAP_FLUSH_EXPLICIT_BIT)));
-    } else {
-        glNamedBufferData(gl_buffer.handle, allocate_size, nullptr, GL_STREAM_DRAW);
+    static constexpr GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT;
+    glNamedBufferStorage(gl_buffer.handle, allocate_size, nullptr, flags);
+    mapped_ptr = static_cast<u8*>(
+        glMapNamedBufferRange(gl_buffer.handle, 0, buffer_size, flags | GL_MAP_FLUSH_EXPLICIT_BIT));
+
+    if (device.HasVertexBufferUnifiedMemory()) {
+        glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_ONLY);
+        glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address);
    }
 }

 OGLStreamBuffer::~OGLStreamBuffer() {
-    if (persistent) {
-        glUnmapNamedBuffer(gl_buffer.handle);
-    }
+    glUnmapNamedBuffer(gl_buffer.handle);
    gl_buffer.Release();
 }

-GLuint OGLStreamBuffer::GetHandle() const {
-    return gl_buffer.handle;
-}
-
-GLsizeiptr OGLStreamBuffer::GetSize() const {
-    return buffer_size;
-}
-
 std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr alignment) {
    ASSERT(size <= buffer_size);
    ASSERT(alignment <= buffer_size);
@@ -68,36 +57,21 @@ std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr a

    bool invalidate = false;
    if (buffer_pos + size > buffer_size) {
+        MICROPROFILE_SCOPE(OpenGL_StreamBuffer);
+        glInvalidateBufferData(gl_buffer.handle);
+
        buffer_pos = 0;
        invalidate = true;
-
-        if (persistent) {
-            glUnmapNamedBuffer(gl_buffer.handle);
-        }
    }

-    if (invalidate || !persistent) {
-        MICROPROFILE_SCOPE(OpenGL_StreamBuffer);
-        GLbitfield flags = GL_MAP_WRITE_BIT | (persistent ? GL_MAP_PERSISTENT_BIT : 0) |
-                           (coherent ? GL_MAP_COHERENT_BIT : GL_MAP_FLUSH_EXPLICIT_BIT) |
-                           (invalidate ? GL_MAP_INVALIDATE_BUFFER_BIT : GL_MAP_UNSYNCHRONIZED_BIT);
-        mapped_ptr = static_cast<u8*>(
-            glMapNamedBufferRange(gl_buffer.handle, buffer_pos, buffer_size - buffer_pos, flags));
-        mapped_offset = buffer_pos;
-    }
-
-    return std::make_tuple(mapped_ptr + buffer_pos - mapped_offset, buffer_pos, invalidate);
+    return std::make_tuple(mapped_ptr + buffer_pos, buffer_pos, invalidate);
 }

 void OGLStreamBuffer::Unmap(GLsizeiptr size) {
    ASSERT(size <= mapped_size);

-    if (!coherent && size > 0) {
-        glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos - mapped_offset, size);
-    }
-
-    if (!persistent) {
-        glUnmapNamedBuffer(gl_buffer.handle);
+    if (size > 0) {
+        glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos, size);
    }

    buffer_pos += size;
--- a/src/video_core/renderer_opengl/gl_stream_buffer.h
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.h
@@ -11,15 +11,13 @@

 namespace OpenGL {

+class Device;
+
 class OGLStreamBuffer : private NonCopyable {
 public:
-    explicit OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent = false,
-                             bool use_persistent = true);
+    explicit OGLStreamBuffer(const Device& device, GLsizeiptr size, bool vertex_data_usage);
    ~OGLStreamBuffer();

-    GLuint GetHandle() const;
-    GLsizeiptr GetSize() const;
-
    /*
     * Allocates a linear chunk of memory in the GPU buffer with at least "size" bytes
     * and the optional alignment requirement.
@@ -32,15 +30,24 @@ public:

    void Unmap(GLsizeiptr size);

+    GLuint Handle() const {
+        return gl_buffer.handle;
+    }
+
+    u64 Address() const {
+        return gpu_address;
+    }
+
+    GLsizeiptr Size() const noexcept {
+        return buffer_size;
+    }
+
 private:
    OGLBuffer gl_buffer;

-    bool coherent = false;
-    bool persistent = false;
-
+    GLuint64EXT gpu_address = 0;
    GLintptr buffer_pos = 0;
    GLsizeiptr buffer_size = 0;
-    GLintptr mapped_offset = 0;
    GLsizeiptr mapped_size = 0;
    u8* mapped_ptr = nullptr;
 };
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -35,7 +35,7 @@ MICROPROFILE_DEFINE(OpenGL_Texture_Buffer_Copy, "OpenGL", "Texture Buffer Copy",
 namespace {

 struct FormatTuple {
-    GLint internal_format;
+    GLenum internal_format;
    GLenum format = GL_NONE;
    GLenum type = GL_NONE;
 };
@@ -238,6 +238,12 @@ OGLTexture CreateTexture(const SurfaceParams& params, GLenum target, GLenum inte
    return texture;
 }

+constexpr u32 EncodeSwizzle(SwizzleSource x_source, SwizzleSource y_source, SwizzleSource z_source,
+                            SwizzleSource w_source) {
+    return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) |
+           (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source);
+}
+
 } // Anonymous namespace

 CachedSurface::CachedSurface(const GPUVAddr gpu_addr, const SurfaceParams& params,
@@ -257,9 +263,14 @@ CachedSurface::CachedSurface(const GPUVAddr gpu_addr, const SurfaceParams& param
    target = GetTextureTarget(params.target);
    texture = CreateTexture(params, target, internal_format, texture_buffer);
    DecorateSurfaceName();
-    main_view = CreateViewInner(
-        ViewParams(params.target, 0, params.is_layered ? params.depth : 1, 0, params.num_levels),
-        true);
+
+    u32 num_layers = 1;
+    if (params.is_layered || params.target == SurfaceTarget::Texture3D) {
+        num_layers = params.depth;
+    }
+
+    main_view =
+        CreateViewInner(ViewParams(params.target, 0, num_layers, 0, params.num_levels), true);
 }

 CachedSurface::~CachedSurface() = default;
@@ -381,7 +392,7 @@ void CachedSurface::DecorateSurfaceName() {
 }

 void CachedSurfaceView::DecorateViewName(GPUVAddr gpu_addr, std::string prefix) {
-    LabelGLObject(GL_TEXTURE, texture_view.handle, gpu_addr, prefix);
+    LabelGLObject(GL_TEXTURE, main_view.handle, gpu_addr, prefix);
 }

 View CachedSurface::CreateView(const ViewParams& view_key) {
@@ -397,49 +408,50 @@ View CachedSurface::CreateViewInner(const ViewParams& view_key, const bool is_pr
 }

 CachedSurfaceView::CachedSurfaceView(CachedSurface& surface, const ViewParams& params,
-                                     const bool is_proxy)
-    : VideoCommon::ViewBase(params), surface{surface}, is_proxy{is_proxy} {
-    target = GetTextureTarget(params.target);
-    format = GetFormatTuple(surface.GetSurfaceParams().pixel_format).internal_format;
+                                     bool is_proxy)
+    : VideoCommon::ViewBase(params), surface{surface}, format{surface.internal_format},
+      target{GetTextureTarget(params.target)}, is_proxy{is_proxy} {
    if (!is_proxy) {
-        texture_view = CreateTextureView();
+        main_view = CreateTextureView();
    }
-    swizzle = EncodeSwizzle(SwizzleSource::R, SwizzleSource::G, SwizzleSource::B, SwizzleSource::A);
 }

 CachedSurfaceView::~CachedSurfaceView() = default;

-void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const {
+void CachedSurfaceView::Attach(GLenum attachment, GLenum fb_target) const {
    ASSERT(params.num_levels == 1);

-    if (params.num_layers > 1) {
-        // Layered framebuffer attachments
-        UNIMPLEMENTED_IF(params.base_layer != 0);
-
-        switch (params.target) {
-        case SurfaceTarget::Texture2DArray:
-            glFramebufferTexture(target, attachment, GetTexture(), 0);
-            break;
-        default:
-            UNIMPLEMENTED();
+    if (params.target == SurfaceTarget::Texture3D) {
+        if (params.num_layers > 1) {
+            ASSERT(params.base_layer == 0);
+            glFramebufferTexture(fb_target, attachment, surface.texture.handle, params.base_level);
+        } else {
+            glFramebufferTexture3D(fb_target, attachment, target, surface.texture.handle,
+                                   params.base_level, params.base_layer);
        }
        return;
    }

+    if (params.num_layers > 1) {
+        UNIMPLEMENTED_IF(params.base_layer != 0);
+        glFramebufferTexture(fb_target, attachment, GetTexture(), 0);
+        return;
+    }
+
    const GLenum view_target = surface.GetTarget();
    const GLuint texture = surface.GetTexture();
    switch (surface.GetSurfaceParams().target) {
    case SurfaceTarget::Texture1D:
-        glFramebufferTexture1D(target, attachment, view_target, texture, params.base_level);
+        glFramebufferTexture1D(fb_target, attachment, view_target, texture, params.base_level);
        break;
    case SurfaceTarget::Texture2D:
-        glFramebufferTexture2D(target, attachment, view_target, texture, params.base_level);
+        glFramebufferTexture2D(fb_target, attachment, view_target, texture, params.base_level);
        break;
    case SurfaceTarget::Texture1DArray:
    case SurfaceTarget::Texture2DArray:
    case SurfaceTarget::TextureCubemap:
    case SurfaceTarget::TextureCubeArray:
-        glFramebufferTextureLayer(target, attachment, texture, params.base_level,
+        glFramebufferTextureLayer(fb_target, attachment, texture, params.base_level,
                                  params.base_layer);
        break;
    default:
@@ -447,35 +459,62 @@ void CachedSurfaceView::Attach(GLenum attachment, GLenum target) const {
    }
 }

-void CachedSurfaceView::ApplySwizzle(SwizzleSource x_source, SwizzleSource y_source,
+GLuint CachedSurfaceView::GetTexture(SwizzleSource x_source, SwizzleSource y_source,
                                     SwizzleSource z_source, SwizzleSource w_source) {
-    u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source);
-    if (new_swizzle == swizzle)
-        return;
-    swizzle = new_swizzle;
-    const std::array gl_swizzle = {GetSwizzleSource(x_source), GetSwizzleSource(y_source),
-                                   GetSwizzleSource(z_source), GetSwizzleSource(w_source)};
-    const GLuint handle = GetTexture();
-    const PixelFormat format = surface.GetSurfaceParams().pixel_format;
-    switch (format) {
+    if (GetSurfaceParams().IsBuffer()) {
+        return GetTexture();
+    }
+    const u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source);
+    if (current_swizzle == new_swizzle) {
+        return current_view;
+    }
+    current_swizzle = new_swizzle;
+
+    const auto [entry, is_cache_miss] = view_cache.try_emplace(new_swizzle);
+    OGLTextureView& view = entry->second;
+    if (!is_cache_miss) {
+        current_view = view.handle;
+        return view.handle;
+    }
+    view = CreateTextureView();
+    current_view = view.handle;
+
+    std::array swizzle{x_source, y_source, z_source, w_source};
+
+    switch (const PixelFormat format = GetSurfaceParams().pixel_format) {
    case PixelFormat::Z24S8:
    case PixelFormat::Z32FS8:
    case PixelFormat::S8Z24:
-        glTextureParameteri(handle, GL_DEPTH_STENCIL_TEXTURE_MODE,
+        UNIMPLEMENTED_IF(x_source != SwizzleSource::R && x_source != SwizzleSource::G);
+        glTextureParameteri(view.handle, GL_DEPTH_STENCIL_TEXTURE_MODE,
                            GetComponent(format, x_source == SwizzleSource::R));
-        break;
-    default:
-        glTextureParameteriv(handle, GL_TEXTURE_SWIZZLE_RGBA, gl_swizzle.data());
+
+        // Make sure we sample the first component
+        std::transform(swizzle.begin(), swizzle.end(), swizzle.begin(), [](SwizzleSource value) {
+            return value == SwizzleSource::G ? SwizzleSource::R : value;
+        });
+        [[fallthrough]];
+    default: {
+        const std::array gl_swizzle = {GetSwizzleSource(swizzle[0]), GetSwizzleSource(swizzle[1]),
+                                       GetSwizzleSource(swizzle[2]), GetSwizzleSource(swizzle[3])};
+        glTextureParameteriv(view.handle, GL_TEXTURE_SWIZZLE_RGBA, gl_swizzle.data());
        break;
    }
+    }
+    return view.handle;
 }

 OGLTextureView CachedSurfaceView::CreateTextureView() const {
    OGLTextureView texture_view;
    texture_view.Create();

-    glTextureView(texture_view.handle, target, surface.texture.handle, format, params.base_level,
-                  params.num_levels, params.base_layer, params.num_layers);
+    if (target == GL_TEXTURE_3D) {
+        glTextureView(texture_view.handle, target, surface.texture.handle, format,
+                      params.base_level, params.num_levels, 0, 1);
+    } else {
+        glTextureView(texture_view.handle, target, surface.texture.handle, format,
+                      params.base_level, params.num_levels, params.base_layer, params.num_layers);
+    }
    ApplyTextureDefaults(surface.GetSurfaceParams(), texture_view.handle);

    return texture_view;
@@ -518,8 +557,8 @@ void TextureCacheOpenGL::ImageBlit(View& src_view, View& dst_view,
                                   const Tegra::Engines::Fermi2D::Config& copy_config) {
    const auto& src_params{src_view->GetSurfaceParams()};
    const auto& dst_params{dst_view->GetSurfaceParams()};
-    UNIMPLEMENTED_IF(src_params.target == SurfaceTarget::Texture3D);
-    UNIMPLEMENTED_IF(dst_params.target == SurfaceTarget::Texture3D);
+    UNIMPLEMENTED_IF(src_params.depth != 1);
+    UNIMPLEMENTED_IF(dst_params.depth != 1);

    state_tracker.NotifyScissor0();
    state_tracker.NotifyFramebuffer();
--- a/Show More
+++ b/Show More