gl_shader_decompiler: Implement image binding settings

shader: Implement bindless images
shader: Decode SUST and implement backing image functionality
2019-05-16 20:03:51 -03:00 · 2019-05-16 20:03:51 -03:00 · 2019-05-16 20:03:51 -03:00 · 2019-05-16 20:03:51 -03:00 · 2019-05-16 20:03:50 -03:00 · 2019-05-16 20:03:50 -03:00
155 changed files with 3595 additions and 5961 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -132,7 +132,7 @@ find_package(Threads REQUIRED)
 if (ENABLE_SDL2)
    if (YUZU_USE_BUNDLED_SDL2)
        # Detect toolchain and platform
-        if ((MSVC_VERSION GREATER_EQUAL 1910 AND MSVC_VERSION LESS 1930) AND ARCHITECTURE_x86_64)
+        if ((MSVC_VERSION GREATER_EQUAL 1910 AND MSVC_VERSION LESS 1920) AND ARCHITECTURE_x86_64)
            set(SDL2_VER "SDL2-2.0.8")
        else()
            message(FATAL_ERROR "No bundled SDL2 binaries for your toolchain. Disable YUZU_USE_BUNDLED_SDL2 and provide your own.")
@@ -165,7 +165,7 @@ if (YUZU_USE_BUNDLED_UNICORN)
    if (MSVC)
        message(STATUS "unicorn not found, falling back to bundled")
        # Detect toolchain and platform
-        if ((MSVC_VERSION GREATER_EQUAL 1910 AND MSVC_VERSION LESS 1930) AND ARCHITECTURE_x86_64)
+        if ((MSVC_VERSION GREATER_EQUAL 1910 AND MSVC_VERSION LESS 1920) AND ARCHITECTURE_x86_64)
            set(UNICORN_VER "unicorn-yuzu")
        else()
            message(FATAL_ERROR "No bundled Unicorn binaries for your toolchain. Disable YUZU_USE_BUNDLED_UNICORN and provide your own.")
@@ -233,7 +233,7 @@ endif()

 if (ENABLE_QT)
    if (YUZU_USE_BUNDLED_QT)
-        if ((MSVC_VERSION GREATER_EQUAL 1910 AND MSVC_VERSION LESS 1930) AND ARCHITECTURE_x86_64)
+        if ((MSVC_VERSION GREATER_EQUAL 1910 AND MSVC_VERSION LESS 1920) AND ARCHITECTURE_x86_64)
            set(QT_VER qt-5.12.0-msvc2017_64)
        else()
            message(FATAL_ERROR "No bundled Qt binaries for your toolchain. Disable YUZU_USE_BUNDLED_QT and provide your own.")
--- a/CMakeModules/GenerateSCMRev.cmake
+++ b/CMakeModules/GenerateSCMRev.cmake
@@ -70,6 +70,7 @@ set(HASH_FILES
    "${VIDEO_CORE}/shader/decode/half_set.cpp"
    "${VIDEO_CORE}/shader/decode/half_set_predicate.cpp"
    "${VIDEO_CORE}/shader/decode/hfma2.cpp"
+    "${VIDEO_CORE}/shader/decode/image.cpp"
    "${VIDEO_CORE}/shader/decode/integer_set.cpp"
    "${VIDEO_CORE}/shader/decode/integer_set_predicate.cpp"
    "${VIDEO_CORE}/shader/decode/memory.cpp"
--- a/externals/glad/include/KHR/khrplatform.h
+++ b/externals/glad/include/KHR/khrplatform.h
@@ -90,20 +90,12 @@
 *                                  int arg2) KHRONOS_APIATTRIBUTES;
 */

-#if defined(__SCITECH_SNAP__) && !defined(KHRONOS_STATIC)
-#   define KHRONOS_STATIC 1
-#endif
-
 /*-------------------------------------------------------------------------
 * Definition of KHRONOS_APICALL
 *-------------------------------------------------------------------------
 * This precedes the return type of the function in the function prototype.
 */
-#if defined(KHRONOS_STATIC)
-    /* If the preprocessor constant KHRONOS_STATIC is defined, make the
-     * header compatible with static linking. */
-#   define KHRONOS_APICALL
-#elif defined(_WIN32)
+#if defined(_WIN32) && !defined(__SCITECH_SNAP__)
 #   define KHRONOS_APICALL __declspec(dllimport)
 #elif defined (__SYMBIAN32__)
 #   define KHRONOS_APICALL IMPORT_C
@@ -119,7 +111,7 @@
 * This follows the return type of the function  and precedes the function
 * name in the function prototype.
 */
-#if defined(_WIN32) && !defined(_WIN32_WCE) && !defined(KHRONOS_STATIC)
+#if defined(_WIN32) && !defined(_WIN32_WCE) && !defined(__SCITECH_SNAP__)
    /* Win32 but not WinCE */
 #   define KHRONOS_APIENTRY __stdcall
 #else
--- a/externals/glad/include/glad/glad.h
+++ b/externals/glad/include/glad/glad.h
--- a/externals/glad/src/glad.c
+++ b/externals/glad/src/glad.c
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -44,6 +44,7 @@ add_custom_command(OUTPUT scm_rev.cpp
      "${VIDEO_CORE}/shader/decode/half_set.cpp"
      "${VIDEO_CORE}/shader/decode/half_set_predicate.cpp"
      "${VIDEO_CORE}/shader/decode/hfma2.cpp"
+      "${VIDEO_CORE}/shader/decode/image.cpp"
      "${VIDEO_CORE}/shader/decode/integer_set.cpp"
      "${VIDEO_CORE}/shader/decode/integer_set_predicate.cpp"
      "${VIDEO_CORE}/shader/decode/memory.cpp"
--- a/src/common/file_util.cpp
+++ b/src/common/file_util.cpp
@@ -78,17 +78,16 @@ namespace FileUtil {
 // Remove any ending forward slashes from directory paths
 // Modifies argument.
 static void StripTailDirSlashes(std::string& fname) {
-    if (fname.length() <= 1) {
-        return;
+    if (fname.length() > 1) {
+        std::size_t i = fname.length();
+        while (i > 0 && fname[i - 1] == DIR_SEP_CHR)
+            --i;
+        fname.resize(i);
    }
-
-    std::size_t i = fname.length();
-    while (i > 0 && fname[i - 1] == DIR_SEP_CHR) {
-        --i;
-    }
-    fname.resize(i);
+    return;
 }

+// Returns true if file filename exists
 bool Exists(const std::string& filename) {
    struct stat file_info;

@@ -108,6 +107,7 @@ bool Exists(const std::string& filename) {
    return (result == 0);
 }

+// Returns true if filename is a directory
 bool IsDirectory(const std::string& filename) {
    struct stat file_info;

@@ -132,6 +132,8 @@ bool IsDirectory(const std::string& filename) {
    return S_ISDIR(file_info.st_mode);
 }

+// Deletes a given filename, returns true on success
+// Doesn't support deleting a directory
 bool Delete(const std::string& filename) {
    LOG_TRACE(Common_Filesystem, "file {}", filename);

@@ -163,6 +165,7 @@ bool Delete(const std::string& filename) {
    return true;
 }

+// Returns true if successful, or path already exists.
 bool CreateDir(const std::string& path) {
    LOG_TRACE(Common_Filesystem, "directory {}", path);
 #ifdef _WIN32
@@ -191,6 +194,7 @@ bool CreateDir(const std::string& path) {
 #endif
 }

+// Creates the full path of fullPath, returns true on success
 bool CreateFullPath(const std::string& fullPath) {
    int panicCounter = 100;
    LOG_TRACE(Common_Filesystem, "path {}", fullPath);
@@ -226,6 +230,7 @@ bool CreateFullPath(const std::string& fullPath) {
    }
 }

+// Deletes a directory filename, returns true on success
 bool DeleteDir(const std::string& filename) {
    LOG_TRACE(Common_Filesystem, "directory {}", filename);

@@ -247,6 +252,7 @@ bool DeleteDir(const std::string& filename) {
    return false;
 }

+// Renames file srcFilename to destFilename, returns true on success
 bool Rename(const std::string& srcFilename, const std::string& destFilename) {
    LOG_TRACE(Common_Filesystem, "{} --> {}", srcFilename, destFilename);
 #ifdef _WIN32
@@ -262,6 +268,7 @@ bool Rename(const std::string& srcFilename, const std::string& destFilename) {
    return false;
 }

+// Copies file srcFilename to destFilename, returns true on success
 bool Copy(const std::string& srcFilename, const std::string& destFilename) {
    LOG_TRACE(Common_Filesystem, "{} --> {}", srcFilename, destFilename);
 #ifdef _WIN32
@@ -317,6 +324,7 @@ bool Copy(const std::string& srcFilename, const std::string& destFilename) {
 #endif
 }

+// Returns the size of filename (64bit)
 u64 GetSize(const std::string& filename) {
    if (!Exists(filename)) {
        LOG_ERROR(Common_Filesystem, "failed {}: No such file", filename);
@@ -343,6 +351,7 @@ u64 GetSize(const std::string& filename) {
    return 0;
 }

+// Overloaded GetSize, accepts file descriptor
 u64 GetSize(const int fd) {
    struct stat buf;
    if (fstat(fd, &buf) != 0) {
@@ -352,6 +361,7 @@ u64 GetSize(const int fd) {
    return buf.st_size;
 }

+// Overloaded GetSize, accepts FILE*
 u64 GetSize(FILE* f) {
    // can't use off_t here because it can be 32-bit
    u64 pos = ftello(f);
@@ -367,6 +377,7 @@ u64 GetSize(FILE* f) {
    return size;
 }

+// Creates an empty file filename, returns true on success
 bool CreateEmptyFile(const std::string& filename) {
    LOG_TRACE(Common_Filesystem, "{}", filename);

@@ -491,6 +502,7 @@ bool DeleteDirRecursively(const std::string& directory, unsigned int recursion)
    return true;
 }

+// Create directory and copy contents (does not overwrite existing files)
 void CopyDir(const std::string& source_path, const std::string& dest_path) {
 #ifndef _WIN32
    if (source_path == dest_path)
@@ -527,7 +539,8 @@ void CopyDir(const std::string& source_path, const std::string& dest_path) {
 #endif
 }

-std::optional<std::string> GetCurrentDir() {
+// Returns the current directory
+std::string GetCurrentDir() {
 // Get the current working directory (getcwd uses malloc)
 #ifdef _WIN32
    wchar_t* dir;
@@ -537,7 +550,7 @@ std::optional<std::string> GetCurrentDir() {
    if (!(dir = getcwd(nullptr, 0))) {
 #endif
        LOG_ERROR(Common_Filesystem, "GetCurrentDirectory failed: {}", GetLastErrorMsg());
-        return {};
+        return nullptr;
    }
 #ifdef _WIN32
    std::string strDir = Common::UTF16ToUTF8(dir);
@@ -548,6 +561,7 @@ std::optional<std::string> GetCurrentDir() {
    return strDir;
 }

+// Sets the current directory to the given directory
 bool SetCurrentDir(const std::string& directory) {
 #ifdef _WIN32
    return _wchdir(Common::UTF8ToUTF16W(directory).c_str()) == 0;
@@ -659,6 +673,8 @@ std::string GetSysDirectory() {
    return sysDir;
 }

+// Returns a string with a yuzu data dir or file in the user's home
+// directory. To be used in "multi-user" mode (that is, installed).
 const std::string& GetUserPath(UserPath path, const std::string& new_path) {
    static std::unordered_map<UserPath, std::string> paths;
    auto& user_path = paths[UserPath::UserDir];
@@ -746,11 +762,11 @@ std::string GetNANDRegistrationDir(bool system) {
    return GetUserPath(UserPath::NANDDir) + "user/Contents/registered/";
 }

-std::size_t WriteStringToFile(bool text_file, const std::string& filename, std::string_view str) {
-    return IOFile(filename, text_file ? "w" : "wb").WriteString(str);
+std::size_t WriteStringToFile(bool text_file, const std::string& str, const char* filename) {
+    return FileUtil::IOFile(filename, text_file ? "w" : "wb").WriteBytes(str.data(), str.size());
 }

-std::size_t ReadFileToString(bool text_file, const std::string& filename, std::string& str) {
+std::size_t ReadFileToString(bool text_file, const char* filename, std::string& str) {
    IOFile file(filename, text_file ? "r" : "rb");

    if (!file.IsOpen())
@@ -760,6 +776,13 @@ std::size_t ReadFileToString(bool text_file, const std::string& filename, std::s
    return file.ReadArray(&str[0], str.size());
 }

+/**
+ * Splits the filename into 8.3 format
+ * Loosely implemented following https://en.wikipedia.org/wiki/8.3_filename
+ * @param filename The normal filename to use
+ * @param short_name A 9-char array in which the short name will be written
+ * @param extension A 4-char array in which the extension will be written
+ */
 void SplitFilename83(const std::string& filename, std::array<char, 9>& short_name,
                     std::array<char, 4>& extension) {
    const std::string forbidden_characters = ".\"/\\[]:;=, ";
--- a/src/common/file_util.h
+++ b/src/common/file_util.h
@@ -9,7 +9,6 @@
 #include <fstream>
 #include <functional>
 #include <limits>
-#include <optional>
 #include <string>
 #include <string_view>
 #include <type_traits>
@@ -119,7 +118,7 @@ u64 ScanDirectoryTree(const std::string& directory, FSTEntry& parent_entry,
 bool DeleteDirRecursively(const std::string& directory, unsigned int recursion = 256);

 // Returns the current directory
-std::optional<std::string> GetCurrentDir();
+std::string GetCurrentDir();

 // Create directory and copy contents (does not overwrite existing files)
 void CopyDir(const std::string& source_path, const std::string& dest_path);
@@ -147,9 +146,9 @@ const std::string& GetExeDirectory();
 std::string AppDataRoamingDirectory();
 #endif

-std::size_t WriteStringToFile(bool text_file, const std::string& filename, std::string_view str);
+std::size_t WriteStringToFile(bool text_file, const std::string& str, const char* filename);

-std::size_t ReadFileToString(bool text_file, const std::string& filename, std::string& str);
+std::size_t ReadFileToString(bool text_file, const char* filename, std::string& str);

 /**
 * Splits the filename into 8.3 format
@@ -258,8 +257,8 @@ public:
        return WriteArray(&object, 1);
    }

-    std::size_t WriteString(std::string_view str) {
-        return WriteArray(str.data(), str.length());
+    std::size_t WriteString(const std::string& str) {
+        return WriteArray(str.c_str(), str.length());
    }

    bool IsOpen() const {
@@ -287,8 +286,8 @@ private:
 template <typename T>
 void OpenFStream(T& fstream, const std::string& filename, std::ios_base::openmode openmode) {
 #ifdef _MSC_VER
-    fstream.open(Common::UTF8ToUTF16W(filename), openmode);
+    fstream.open(Common::UTF8ToUTF16W(filename).c_str(), openmode);
 #else
-    fstream.open(filename, openmode);
+    fstream.open(filename.c_str(), openmode);
 #endif
 }
--- a/src/core/core_timing_util.cpp
+++ b/src/core/core_timing_util.cpp
@@ -14,11 +14,11 @@ namespace Core::Timing {
 constexpr u64 MAX_VALUE_TO_MULTIPLY = std::numeric_limits<s64>::max() / BASE_CLOCK_RATE;

 s64 usToCycles(s64 us) {
-    if (static_cast<u64>(us / 1000000) > MAX_VALUE_TO_MULTIPLY) {
+    if (us / 1000000 > MAX_VALUE_TO_MULTIPLY) {
        LOG_ERROR(Core_Timing, "Integer overflow, use max value");
        return std::numeric_limits<s64>::max();
    }
-    if (static_cast<u64>(us) > MAX_VALUE_TO_MULTIPLY) {
+    if (us > MAX_VALUE_TO_MULTIPLY) {
        LOG_DEBUG(Core_Timing, "Time very big, do rounding");
        return BASE_CLOCK_RATE * (us / 1000000);
    }
@@ -38,11 +38,11 @@ s64 usToCycles(u64 us) {
 }

 s64 nsToCycles(s64 ns) {
-    if (static_cast<u64>(ns / 1000000000) > MAX_VALUE_TO_MULTIPLY) {
+    if (ns / 1000000000 > MAX_VALUE_TO_MULTIPLY) {
        LOG_ERROR(Core_Timing, "Integer overflow, use max value");
        return std::numeric_limits<s64>::max();
    }
-    if (static_cast<u64>(ns) > MAX_VALUE_TO_MULTIPLY) {
+    if (ns > MAX_VALUE_TO_MULTIPLY) {
        LOG_DEBUG(Core_Timing, "Time very big, do rounding");
        return BASE_CLOCK_RATE * (ns / 1000000000);
    }
--- a/src/core/frontend/emu_window.h
+++ b/src/core/frontend/emu_window.h
@@ -169,7 +169,8 @@ private:
     * For the request to be honored, EmuWindow implementations will usually reimplement this
     * function.
     */
-    virtual void OnMinimalClientAreaChangeRequest(std::pair<unsigned, unsigned>) {
+    virtual void OnMinimalClientAreaChangeRequest(
+        const std::pair<unsigned, unsigned>& minimal_size) {
        // By default, ignore this request and do nothing.
    }

--- a/src/core/hle/ipc_helpers.h
+++ b/src/core/hle/ipc_helpers.h
@@ -438,7 +438,7 @@ inline float RequestParser::Pop() {
 template <>
 inline double RequestParser::Pop() {
    const u64 value = Pop<u64>();
-    double real;
+    float real;
    std::memcpy(&real, &value, sizeof(real));
    return real;
 }
--- a/src/core/hle/kernel/hle_ipc.cpp
+++ b/src/core/hle/kernel/hle_ipc.cpp
@@ -43,7 +43,7 @@ void SessionRequestHandler::ClientDisconnected(const SharedPtr<ServerSession>& s
 }

 SharedPtr<WritableEvent> HLERequestContext::SleepClientThread(
-    const std::string& reason, u64 timeout, WakeupCallback&& callback,
+    SharedPtr<Thread> thread, const std::string& reason, u64 timeout, WakeupCallback&& callback,
    SharedPtr<WritableEvent> writable_event) {
    // Put the client thread to sleep until the wait event is signaled or the timeout expires.
    thread->SetWakeupCallback([context = *this, callback](
@@ -58,7 +58,7 @@ SharedPtr<WritableEvent> HLERequestContext::SleepClientThread(
    auto& kernel = Core::System::GetInstance().Kernel();
    if (!writable_event) {
        // Create event if not provided
-        const auto pair = WritableEvent::CreateEventPair(kernel, ResetType::Automatic,
+        const auto pair = WritableEvent::CreateEventPair(kernel, Kernel::ResetType::OneShot,
                                                         "HLE Pause Event: " + reason);
        writable_event = pair.writable;
    }
@@ -76,9 +76,8 @@ SharedPtr<WritableEvent> HLERequestContext::SleepClientThread(
    return writable_event;
 }

-HLERequestContext::HLERequestContext(SharedPtr<Kernel::ServerSession> server_session,
-                                     SharedPtr<Thread> thread)
-    : server_session(std::move(server_session)), thread(std::move(thread)) {
+HLERequestContext::HLERequestContext(SharedPtr<Kernel::ServerSession> server_session)
+    : server_session(std::move(server_session)) {
    cmd_buf[0] = 0;
 }

--- a/src/core/hle/kernel/hle_ipc.h
+++ b/src/core/hle/kernel/hle_ipc.h
@@ -97,7 +97,7 @@ protected:
 */
 class HLERequestContext {
 public:
-    explicit HLERequestContext(SharedPtr<ServerSession> session, SharedPtr<Thread> thread);
+    explicit HLERequestContext(SharedPtr<ServerSession> session);
    ~HLERequestContext();

    /// Returns a pointer to the IPC command buffer for this request.
@@ -119,6 +119,7 @@ public:
    /**
     * Puts the specified guest thread to sleep until the returned event is signaled or until the
     * specified timeout expires.
+     * @param thread Thread to be put to sleep.
     * @param reason Reason for pausing the thread, to be used for debugging purposes.
     * @param timeout Timeout in nanoseconds after which the thread will be awoken and the callback
     * invoked with a Timeout reason.
@@ -129,8 +130,8 @@ public:
     * created.
     * @returns Event that when signaled will resume the thread and call the callback function.
     */
-    SharedPtr<WritableEvent> SleepClientThread(const std::string& reason, u64 timeout,
-                                               WakeupCallback&& callback,
+    SharedPtr<WritableEvent> SleepClientThread(SharedPtr<Thread> thread, const std::string& reason,
+                                               u64 timeout, WakeupCallback&& callback,
                                               SharedPtr<WritableEvent> writable_event = nullptr);

    /// Populates this context with data from the requesting process/thread.
@@ -267,7 +268,6 @@ private:

    std::array<u32, IPC::COMMAND_BUFFER_LENGTH> cmd_buf;
    SharedPtr<Kernel::ServerSession> server_session;
-    SharedPtr<Thread> thread;
    // TODO(yuriks): Check common usage of this and optimize size accordingly
    boost::container::small_vector<SharedPtr<Object>, 8> move_objects;
    boost::container::small_vector<SharedPtr<Object>, 8> copy_objects;
--- a/src/core/hle/kernel/object.h
+++ b/src/core/hle/kernel/object.h
@@ -33,8 +33,8 @@ enum class HandleType : u32 {
 };

 enum class ResetType {
-    Automatic, ///< Reset automatically on object acquisition
-    Manual,    ///< Never reset automatically
+    OneShot, ///< Reset automatically on object acquisition
+    Sticky,  ///< Never reset automatically
 };

 class Object : NonCopyable {
--- a/src/core/hle/kernel/readable_event.cpp
+++ b/src/core/hle/kernel/readable_event.cpp
@@ -21,9 +21,8 @@ bool ReadableEvent::ShouldWait(const Thread* thread) const {
 void ReadableEvent::Acquire(Thread* thread) {
    ASSERT_MSG(!ShouldWait(thread), "object unavailable!");

-    if (reset_type == ResetType::Automatic) {
+    if (reset_type == ResetType::OneShot)
        signaled = false;
-    }
 }

 void ReadableEvent::Signal() {
--- a/src/core/hle/kernel/server_session.cpp
+++ b/src/core/hle/kernel/server_session.cpp
@@ -130,7 +130,7 @@ ResultCode ServerSession::HandleSyncRequest(SharedPtr<Thread> thread) {
    // The ServerSession received a sync request, this means that there's new data available
    // from its ClientSession, so wake up any threads that may be waiting on a svcReplyAndReceive or
    // similar.
-    Kernel::HLERequestContext context(this, thread);
+    Kernel::HLERequestContext context(this);
    u32* cmd_buf = (u32*)Memory::GetPointer(thread->GetTLSAddress());
    context.PopulateFromIncomingCommandBuffer(kernel.CurrentProcess()->GetHandleTable(), cmd_buf);

--- a/src/core/hle/kernel/svc.cpp
+++ b/src/core/hle/kernel/svc.cpp
@@ -1255,8 +1255,8 @@ static ResultCode MapProcessCodeMemory(Core::System& system, Handle process_hand
    return vm_manager.MapCodeMemory(dst_address, src_address, size);
 }

-static ResultCode UnmapProcessCodeMemory(Core::System& system, Handle process_handle,
-                                         u64 dst_address, u64 src_address, u64 size) {
+ResultCode UnmapProcessCodeMemory(Core::System& system, Handle process_handle, u64 dst_address,
+                                  u64 src_address, u64 size) {
    LOG_DEBUG(Kernel_SVC,
              "called. process_handle=0x{:08X}, dst_address=0x{:016X}, src_address=0x{:016X}, "
              "size=0x{:016X}",
@@ -1342,7 +1342,7 @@ static void ExitProcess(Core::System& system) {
 /// Creates a new thread
 static ResultCode CreateThread(Core::System& system, Handle* out_handle, VAddr entry_point, u64 arg,
                               VAddr stack_top, u32 priority, s32 processor_id) {
-    LOG_DEBUG(Kernel_SVC,
+    LOG_TRACE(Kernel_SVC,
              "called entrypoint=0x{:08X}, arg=0x{:08X}, stacktop=0x{:08X}, "
              "threadpriority=0x{:08X}, processorid=0x{:08X} : created handle=0x{:08X}",
              entry_point, arg, stack_top, priority, processor_id, *out_handle);
@@ -1402,7 +1402,7 @@ static ResultCode CreateThread(Core::System& system, Handle* out_handle, VAddr e

 /// Starts the thread for the provided handle
 static ResultCode StartThread(Core::System& system, Handle thread_handle) {
-    LOG_DEBUG(Kernel_SVC, "called thread=0x{:08X}", thread_handle);
+    LOG_TRACE(Kernel_SVC, "called thread=0x{:08X}", thread_handle);

    const auto& handle_table = system.Kernel().CurrentProcess()->GetHandleTable();
    const SharedPtr<Thread> thread = handle_table.Get<Thread>(thread_handle);
@@ -1425,7 +1425,7 @@ static ResultCode StartThread(Core::System& system, Handle thread_handle) {

 /// Called when a thread exits
 static void ExitThread(Core::System& system) {
-    LOG_DEBUG(Kernel_SVC, "called, pc=0x{:08X}", system.CurrentArmInterface().GetPC());
+    LOG_TRACE(Kernel_SVC, "called, pc=0x{:08X}", system.CurrentArmInterface().GetPC());

    auto* const current_thread = system.CurrentScheduler().GetCurrentThread();
    current_thread->Stop();
@@ -1435,7 +1435,7 @@ static void ExitThread(Core::System& system) {

 /// Sleep the current thread
 static void SleepThread(Core::System& system, s64 nanoseconds) {
-    LOG_DEBUG(Kernel_SVC, "called nanoseconds={}", nanoseconds);
+    LOG_TRACE(Kernel_SVC, "called nanoseconds={}", nanoseconds);

    enum class SleepType : s64 {
        YieldWithoutLoadBalancing = 0,
@@ -1880,51 +1880,11 @@ static ResultCode GetThreadCoreMask(Core::System& system, Handle thread_handle,
 }

 static ResultCode SetThreadCoreMask(Core::System& system, Handle thread_handle, u32 core,
-                                    u64 affinity_mask) {
-    LOG_DEBUG(Kernel_SVC, "called, handle=0x{:08X}, core=0x{:X}, affinity_mask=0x{:016X}",
-              thread_handle, core, affinity_mask);
+                                    u64 mask) {
+    LOG_DEBUG(Kernel_SVC, "called, handle=0x{:08X}, mask=0x{:016X}, core=0x{:X}", thread_handle,
+              mask, core);

-    const auto* const current_process = system.Kernel().CurrentProcess();
-
-    if (core == static_cast<u32>(THREADPROCESSORID_IDEAL)) {
-        const u8 ideal_cpu_core = current_process->GetIdealCore();
-
-        ASSERT(ideal_cpu_core != static_cast<u8>(THREADPROCESSORID_IDEAL));
-
-        // Set the target CPU to the ideal core specified by the process.
-        core = ideal_cpu_core;
-        affinity_mask = 1ULL << core;
-    } else {
-        const u64 core_mask = current_process->GetCoreMask();
-
-        if ((core_mask | affinity_mask) != core_mask) {
-            LOG_ERROR(
-                Kernel_SVC,
-                "Invalid processor ID specified (core_mask=0x{:08X}, affinity_mask=0x{:016X})",
-                core_mask, affinity_mask);
-            return ERR_INVALID_PROCESSOR_ID;
-        }
-
-        if (affinity_mask == 0) {
-            LOG_ERROR(Kernel_SVC, "Specfified affinity mask is zero.");
-            return ERR_INVALID_COMBINATION;
-        }
-
-        if (core < Core::NUM_CPU_CORES) {
-            if ((affinity_mask & (1ULL << core)) == 0) {
-                LOG_ERROR(Kernel_SVC,
-                          "Core is not enabled for the current mask, core={}, mask={:016X}", core,
-                          affinity_mask);
-                return ERR_INVALID_COMBINATION;
-            }
-        } else if (core != static_cast<u32>(THREADPROCESSORID_DONT_CARE) &&
-                   core != static_cast<u32>(THREADPROCESSORID_DONT_UPDATE)) {
-            LOG_ERROR(Kernel_SVC, "Invalid processor ID specified (core={}).", core);
-            return ERR_INVALID_PROCESSOR_ID;
-        }
-    }
-
-    const auto& handle_table = current_process->GetHandleTable();
+    const auto& handle_table = system.Kernel().CurrentProcess()->GetHandleTable();
    const SharedPtr<Thread> thread = handle_table.Get<Thread>(thread_handle);
    if (!thread) {
        LOG_ERROR(Kernel_SVC, "Thread handle does not exist, thread_handle=0x{:08X}",
@@ -1932,7 +1892,40 @@ static ResultCode SetThreadCoreMask(Core::System& system, Handle thread_handle,
        return ERR_INVALID_HANDLE;
    }

-    thread->ChangeCore(core, affinity_mask);
+    if (core == static_cast<u32>(THREADPROCESSORID_IDEAL)) {
+        const u8 ideal_cpu_core = thread->GetOwnerProcess()->GetIdealCore();
+
+        ASSERT(ideal_cpu_core != static_cast<u8>(THREADPROCESSORID_IDEAL));
+
+        // Set the target CPU to the ideal core specified by the process.
+        core = ideal_cpu_core;
+        mask = 1ULL << core;
+    }
+
+    if (mask == 0) {
+        LOG_ERROR(Kernel_SVC, "Mask is 0");
+        return ERR_INVALID_COMBINATION;
+    }
+
+    /// This value is used to only change the affinity mask without changing the current ideal core.
+    static constexpr u32 OnlyChangeMask = static_cast<u32>(-3);
+
+    if (core == OnlyChangeMask) {
+        core = thread->GetIdealCore();
+    } else if (core >= Core::NUM_CPU_CORES && core != static_cast<u32>(-1)) {
+        LOG_ERROR(Kernel_SVC, "Invalid core specified, got {}", core);
+        return ERR_INVALID_PROCESSOR_ID;
+    }
+
+    // Error out if the input core isn't enabled in the input mask.
+    if (core < Core::NUM_CPU_CORES && (mask & (1ull << core)) == 0) {
+        LOG_ERROR(Kernel_SVC, "Core is not enabled for the current mask, core={}, mask={:016X}",
+                  core, mask);
+        return ERR_INVALID_COMBINATION;
+    }
+
+    thread->ChangeCore(core, mask);
+
    return RESULT_SUCCESS;
 }

@@ -1987,7 +1980,7 @@ static ResultCode CreateEvent(Core::System& system, Handle* write_handle, Handle

    auto& kernel = system.Kernel();
    const auto [readable_event, writable_event] =
-        WritableEvent::CreateEventPair(kernel, ResetType::Manual, "CreateEvent");
+        WritableEvent::CreateEventPair(kernel, ResetType::Sticky, "CreateEvent");

    HandleTable& handle_table = kernel.CurrentProcess()->GetHandleTable();

@@ -2190,8 +2183,8 @@ static ResultCode GetProcessList(Core::System& system, u32* out_num_processes,
    return RESULT_SUCCESS;
 }

-static ResultCode GetThreadList(Core::System& system, u32* out_num_threads, VAddr out_thread_ids,
-                                u32 out_thread_ids_size, Handle debug_handle) {
+ResultCode GetThreadList(Core::System& system, u32* out_num_threads, VAddr out_thread_ids,
+                         u32 out_thread_ids_size, Handle debug_handle) {
    // TODO: Handle this case when debug events are supported.
    UNIMPLEMENTED_IF(debug_handle != InvalidHandle);

--- a/src/core/hle/kernel/thread.h
+++ b/src/core/hle/kernel/thread.h
@@ -30,21 +30,12 @@ enum ThreadPriority : u32 {
 };

 enum ThreadProcessorId : s32 {
-    /// Indicates that no particular processor core is preferred.
-    THREADPROCESSORID_DONT_CARE = -1,
-
-    /// Run thread on the ideal core specified by the process.
-    THREADPROCESSORID_IDEAL = -2,
-
-    /// Indicates that the preferred processor ID shouldn't be updated in
-    /// a core mask setting operation.
-    THREADPROCESSORID_DONT_UPDATE = -3,
-
-    THREADPROCESSORID_0 = 0,   ///< Run thread on core 0
-    THREADPROCESSORID_1 = 1,   ///< Run thread on core 1
-    THREADPROCESSORID_2 = 2,   ///< Run thread on core 2
-    THREADPROCESSORID_3 = 3,   ///< Run thread on core 3
-    THREADPROCESSORID_MAX = 4, ///< Processor ID must be less than this
+    THREADPROCESSORID_IDEAL = -2, ///< Run thread on the ideal core specified by the process.
+    THREADPROCESSORID_0 = 0,      ///< Run thread on core 0
+    THREADPROCESSORID_1 = 1,      ///< Run thread on core 1
+    THREADPROCESSORID_2 = 2,      ///< Run thread on core 2
+    THREADPROCESSORID_3 = 3,      ///< Run thread on core 3
+    THREADPROCESSORID_MAX = 4,    ///< Processor ID must be less than this

    /// Allowed CPU mask
    THREADPROCESSORID_DEFAULT_MASK = (1 << THREADPROCESSORID_0) | (1 << THREADPROCESSORID_1) |
--- a/src/core/hle/service/am/am.cpp
+++ b/src/core/hle/service/am/am.cpp
@@ -276,7 +276,7 @@ ISelfController::ISelfController(std::shared_ptr<NVFlinger::NVFlinger> nvflinger
    RegisterHandlers(functions);

    auto& kernel = Core::System::GetInstance().Kernel();
-    launchable_event = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::Manual,
+    launchable_event = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::Sticky,
                                                              "ISelfController:LaunchableEvent");
 }

@@ -442,10 +442,10 @@ void ISelfController::GetIdleTimeDetectionExtension(Kernel::HLERequestContext& c

 AppletMessageQueue::AppletMessageQueue() {
    auto& kernel = Core::System::GetInstance().Kernel();
-    on_new_message = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::Manual,
+    on_new_message = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::Sticky,
                                                            "AMMessageQueue:OnMessageRecieved");
    on_operation_mode_changed = Kernel::WritableEvent::CreateEventPair(
-        kernel, Kernel::ResetType::Automatic, "AMMessageQueue:OperationModeChanged");
+        kernel, Kernel::ResetType::OneShot, "AMMessageQueue:OperationModeChanged");
 }

 AppletMessageQueue::~AppletMessageQueue() = default;
@@ -835,7 +835,6 @@ void IStorageAccessor::Write(Kernel::HLERequestContext& ctx) {

        IPC::ResponseBuilder rb{ctx, 2};
        rb.Push(ERR_SIZE_OUT_OF_BOUNDS);
-        return;
    }

    std::memcpy(backing.buffer.data() + offset, data.data(), data.size());
@@ -858,7 +857,6 @@ void IStorageAccessor::Read(Kernel::HLERequestContext& ctx) {

        IPC::ResponseBuilder rb{ctx, 2};
        rb.Push(ERR_SIZE_OUT_OF_BOUNDS);
-        return;
    }

    ctx.WriteBuffer(backing.buffer.data() + offset, size);
--- a/src/core/hle/service/am/applets/applets.cpp
+++ b/src/core/hle/service/am/applets/applets.cpp
@@ -26,11 +26,11 @@ namespace Service::AM::Applets {
 AppletDataBroker::AppletDataBroker() {
    auto& kernel = Core::System::GetInstance().Kernel();
    state_changed_event = Kernel::WritableEvent::CreateEventPair(
-        kernel, Kernel::ResetType::Manual, "ILibraryAppletAccessor:StateChangedEvent");
+        kernel, Kernel::ResetType::Sticky, "ILibraryAppletAccessor:StateChangedEvent");
    pop_out_data_event = Kernel::WritableEvent::CreateEventPair(
-        kernel, Kernel::ResetType::Manual, "ILibraryAppletAccessor:PopDataOutEvent");
+        kernel, Kernel::ResetType::Sticky, "ILibraryAppletAccessor:PopDataOutEvent");
    pop_interactive_out_data_event = Kernel::WritableEvent::CreateEventPair(
-        kernel, Kernel::ResetType::Manual, "ILibraryAppletAccessor:PopInteractiveDataOutEvent");
+        kernel, Kernel::ResetType::Sticky, "ILibraryAppletAccessor:PopInteractiveDataOutEvent");
 }

 AppletDataBroker::~AppletDataBroker() = default;
--- a/src/core/hle/service/aoc/aoc_u.cpp
+++ b/src/core/hle/service/aoc/aoc_u.cpp
@@ -9,6 +9,7 @@
 #include "core/file_sys/content_archive.h"
 #include "core/file_sys/control_metadata.h"
 #include "core/file_sys/nca_metadata.h"
+#include "core/file_sys/partition_filesystem.h"
 #include "core/file_sys/patch_manager.h"
 #include "core/file_sys/registered_cache.h"
 #include "core/hle/ipc_helpers.h"
@@ -17,6 +18,7 @@
 #include "core/hle/kernel/readable_event.h"
 #include "core/hle/kernel/writable_event.h"
 #include "core/hle/service/aoc/aoc_u.h"
+#include "core/hle/service/filesystem/filesystem.h"
 #include "core/loader/loader.h"
 #include "core/settings.h"

@@ -66,22 +68,14 @@ AOC_U::AOC_U() : ServiceFramework("aoc:u"), add_on_content(AccumulateAOCTitleIDs
    RegisterHandlers(functions);

    auto& kernel = Core::System::GetInstance().Kernel();
-    aoc_change_event = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::Manual,
+    aoc_change_event = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::Sticky,
                                                              "GetAddOnContentListChanged:Event");
 }

 AOC_U::~AOC_U() = default;

 void AOC_U::CountAddOnContent(Kernel::HLERequestContext& ctx) {
-    struct Parameters {
-        u64 process_id;
-    };
-    static_assert(sizeof(Parameters) == 8);
-
-    IPC::RequestParser rp{ctx};
-    const auto params = rp.PopRaw<Parameters>();
-
-    LOG_DEBUG(Service_AOC, "called. process_id={}", params.process_id);
+    LOG_DEBUG(Service_AOC, "called");

    IPC::ResponseBuilder rb{ctx, 3};
    rb.Push(RESULT_SUCCESS);
@@ -100,33 +94,24 @@ void AOC_U::CountAddOnContent(Kernel::HLERequestContext& ctx) {
 }

 void AOC_U::ListAddOnContent(Kernel::HLERequestContext& ctx) {
-    struct Parameters {
-        u32 offset;
-        u32 count;
-        u64 process_id;
-    };
-    static_assert(sizeof(Parameters) == 16);
-
    IPC::RequestParser rp{ctx};
-    const auto [offset, count, process_id] = rp.PopRaw<Parameters>();

-    LOG_DEBUG(Service_AOC, "called with offset={}, count={}, process_id={}", offset, count,
-              process_id);
+    const auto offset = rp.PopRaw<u32>();
+    auto count = rp.PopRaw<u32>();
+    LOG_DEBUG(Service_AOC, "called with offset={}, count={}", offset, count);

    const auto current = Core::System::GetInstance().CurrentProcess()->GetTitleID();

    std::vector<u32> out;
-    const auto& disabled = Settings::values.disabled_addons[current];
-    if (std::find(disabled.begin(), disabled.end(), "DLC") == disabled.end()) {
-        for (u64 content_id : add_on_content) {
-            if ((content_id & DLC_BASE_TITLE_ID_MASK) != current) {
-                continue;
-            }
-
-            out.push_back(static_cast<u32>(content_id & 0x7FF));
-        }
+    for (size_t i = 0; i < add_on_content.size(); ++i) {
+        if ((add_on_content[i] & DLC_BASE_TITLE_ID_MASK) == current)
+            out.push_back(static_cast<u32>(add_on_content[i] & 0x7FF));
    }

+    const auto& disabled = Settings::values.disabled_addons[current];
+    if (std::find(disabled.begin(), disabled.end(), "DLC") != disabled.end())
+        out = {};
+
    if (out.size() < offset) {
        IPC::ResponseBuilder rb{ctx, 2};
        // TODO(DarkLordZach): Find the correct error code.
@@ -134,31 +119,22 @@ void AOC_U::ListAddOnContent(Kernel::HLERequestContext& ctx) {
        return;
    }

-    const auto out_count = static_cast<u32>(std::min<size_t>(out.size() - offset, count));
+    count = static_cast<u32>(std::min<size_t>(out.size() - offset, count));
    std::rotate(out.begin(), out.begin() + offset, out.end());
-    out.resize(out_count);
+    out.resize(count);

    ctx.WriteBuffer(out);

    IPC::ResponseBuilder rb{ctx, 3};
    rb.Push(RESULT_SUCCESS);
-    rb.Push(out_count);
+    rb.Push(count);
 }

 void AOC_U::GetAddOnContentBaseId(Kernel::HLERequestContext& ctx) {
-    struct Parameters {
-        u64 process_id;
-    };
-    static_assert(sizeof(Parameters) == 8);
-
-    IPC::RequestParser rp{ctx};
-    const auto params = rp.PopRaw<Parameters>();
-
-    LOG_DEBUG(Service_AOC, "called. process_id={}", params.process_id);
+    LOG_DEBUG(Service_AOC, "called");

    IPC::ResponseBuilder rb{ctx, 4};
    rb.Push(RESULT_SUCCESS);
-
    const auto title_id = Core::System::GetInstance().CurrentProcess()->GetTitleID();
    FileSys::PatchManager pm{title_id};

@@ -172,17 +148,10 @@ void AOC_U::GetAddOnContentBaseId(Kernel::HLERequestContext& ctx) {
 }

 void AOC_U::PrepareAddOnContent(Kernel::HLERequestContext& ctx) {
-    struct Parameters {
-        s32 addon_index;
-        u64 process_id;
-    };
-    static_assert(sizeof(Parameters) == 16);
-
    IPC::RequestParser rp{ctx};
-    const auto [addon_index, process_id] = rp.PopRaw<Parameters>();

-    LOG_WARNING(Service_AOC, "(STUBBED) called with addon_index={}, process_id={}", addon_index,
-                process_id);
+    const auto aoc_id = rp.PopRaw<u32>();
+    LOG_WARNING(Service_AOC, "(STUBBED) called with aoc_id={:08X}", aoc_id);

    IPC::ResponseBuilder rb{ctx, 2};
    rb.Push(RESULT_SUCCESS);
--- a/src/core/hle/service/audio/audout_u.cpp
+++ b/src/core/hle/service/audio/audout_u.cpp
@@ -67,7 +67,7 @@ public:
        // This is the event handle used to check if the audio buffer was released
        auto& system = Core::System::GetInstance();
        buffer_event = Kernel::WritableEvent::CreateEventPair(
-            system.Kernel(), Kernel::ResetType::Manual, "IAudioOutBufferReleased");
+            system.Kernel(), Kernel::ResetType::Sticky, "IAudioOutBufferReleased");

        stream = audio_core.OpenStream(system.CoreTiming(), audio_params.sample_rate,
                                       audio_params.channel_count, std::move(unique_name),
--- a/src/core/hle/service/audio/audren_u.cpp
+++ b/src/core/hle/service/audio/audren_u.cpp
@@ -8,7 +8,6 @@

 #include "audio_core/audio_renderer.h"
 #include "common/alignment.h"
-#include "common/bit_util.h"
 #include "common/common_funcs.h"
 #include "common/logging/log.h"
 #include "common/string_util.h"
@@ -47,7 +46,7 @@ public:

        auto& system = Core::System::GetInstance();
        system_event = Kernel::WritableEvent::CreateEventPair(
-            system.Kernel(), Kernel::ResetType::Manual, "IAudioRenderer:SystemEvent");
+            system.Kernel(), Kernel::ResetType::Sticky, "IAudioRenderer:SystemEvent");
        renderer = std::make_unique<AudioCore::AudioRenderer>(system.CoreTiming(), audren_params,
                                                              system_event.writable);
    }
@@ -179,7 +178,7 @@ public:
        RegisterHandlers(functions);

        auto& kernel = Core::System::GetInstance().Kernel();
-        buffer_event = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::Automatic,
+        buffer_event = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::OneShot,
                                                              "IAudioOutBufferReleasedEvent");
    }

@@ -263,304 +262,64 @@ void AudRenU::OpenAudioRenderer(Kernel::HLERequestContext& ctx) {
    OpenAudioRendererImpl(ctx);
 }

-static u64 CalculateNumPerformanceEntries(const AudioCore::AudioRendererParameter& params) {
-    // +1 represents the final mix.
-    return u64{params.effect_count} + params.submix_count + params.sink_count + params.voice_count +
-           1;
-}
-
 void AudRenU::GetAudioRendererWorkBufferSize(Kernel::HLERequestContext& ctx) {
+    IPC::RequestParser rp{ctx};
+    auto params = rp.PopRaw<AudioCore::AudioRendererParameter>();
    LOG_DEBUG(Service_Audio, "called");

-    // Several calculations below align the sizes being calculated
-    // onto a 64 byte boundary.
-    static constexpr u64 buffer_alignment_size = 64;
+    u64 buffer_sz = Common::AlignUp(4 * params.mix_buffer_count, 0x40);
+    buffer_sz += params.submix_count * 1024;
+    buffer_sz += 0x940 * (params.submix_count + 1);
+    buffer_sz += 0x3F0 * params.voice_count;
+    buffer_sz += Common::AlignUp(8 * (params.submix_count + 1), 0x10);
+    buffer_sz += Common::AlignUp(8 * params.voice_count, 0x10);
+    buffer_sz += Common::AlignUp(
+        (0x3C0 * (params.sink_count + params.submix_count) + 4 * params.sample_count) *
+            (params.mix_buffer_count + 6),
+        0x40);

-    // Some calculations that calculate portions of the buffer
-    // that will contain information, on the other hand, align
-    // the result of some of their calcularions on a 16 byte boundary.
-    static constexpr u64 info_field_alignment_size = 16;
-
-    // Maximum detail entries that may exist at one time for performance
-    // frame statistics.
-    static constexpr u64 max_perf_detail_entries = 100;
-
-    // Size of the data structure representing the bulk of the voice-related state.
-    static constexpr u64 voice_state_size = 0x100;
-
-    // Size of the upsampler manager data structure
-    constexpr u64 upsampler_manager_size = 0x48;
-
-    // Calculates the part of the size that relates to mix buffers.
-    const auto calculate_mix_buffer_sizes = [](const AudioCore::AudioRendererParameter& params) {
-        // As of 8.0.0 this is the maximum on voice channels.
-        constexpr u64 max_voice_channels = 6;
-
-        // The service expects the sample_count member of the parameters to either be
-        // a value of 160 or 240, so the maximum sample count is assumed in order
-        // to adequately handle all values at runtime.
-        constexpr u64 default_max_sample_count = 240;
-
-        const u64 total_mix_buffers = params.mix_buffer_count + max_voice_channels;
-
-        u64 size = 0;
-        size += total_mix_buffers * (sizeof(s32) * params.sample_count);
-        size += total_mix_buffers * (sizeof(s32) * default_max_sample_count);
-        size += u64{params.submix_count} + params.sink_count;
-        size = Common::AlignUp(size, buffer_alignment_size);
-        size += Common::AlignUp(params.unknown_30, buffer_alignment_size);
-        size += Common::AlignUp(sizeof(s32) * params.mix_buffer_count, buffer_alignment_size);
-        return size;
-    };
-
-    // Calculates the portion of the size related to the mix data (and the sorting thereof).
-    const auto calculate_mix_info_size = [this](const AudioCore::AudioRendererParameter& params) {
-        // The size of the mixing info data structure.
-        constexpr u64 mix_info_size = 0x940;
-
-        // Consists of total submixes with the final mix included.
-        const u64 total_mix_count = u64{params.submix_count} + 1;
-
-        // The total number of effects that may be available to the audio renderer at any time.
-        constexpr u64 max_effects = 256;
-
-        // Calculates the part of the size related to the audio node state.
-        // This will only be used if the audio revision supports the splitter.
-        const auto calculate_node_state_size = [](std::size_t num_nodes) {
-            // Internally within a nodestate, it appears to use a data structure
-            // similar to a std::bitset<64> twice.
-            constexpr u64 bit_size = Common::BitSize<u64>();
-            constexpr u64 num_bitsets = 2;
-
-            // Node state instances have three states internally for performing
-            // depth-first searches of nodes. Initialized, Found, and Done Sorting.
-            constexpr u64 num_states = 3;
-
-            u64 size = 0;
-            size += (num_nodes * num_nodes) * sizeof(s32);
-            size += num_states * (num_nodes * sizeof(s32));
-            size += num_bitsets * (Common::AlignUp(num_nodes, bit_size) / Common::BitSize<u8>());
-            return size;
-        };
-
-        // Calculates the part of the size related to the adjacency (aka edge) matrix.
-        const auto calculate_edge_matrix_size = [](std::size_t num_nodes) {
-            return (num_nodes * num_nodes) * sizeof(s32);
-        };
-
-        u64 size = 0;
-        size += Common::AlignUp(sizeof(void*) * total_mix_count, info_field_alignment_size);
-        size += Common::AlignUp(mix_info_size * total_mix_count, info_field_alignment_size);
-        size += Common::AlignUp(sizeof(s32) * max_effects * params.submix_count,
-                                info_field_alignment_size);
-
-        if (IsFeatureSupported(AudioFeatures::Splitter, params.revision)) {
-            size += Common::AlignUp(calculate_node_state_size(total_mix_count) +
-                                        calculate_edge_matrix_size(total_mix_count),
-                                    info_field_alignment_size);
+    if (IsFeatureSupported(AudioFeatures::Splitter, params.revision)) {
+        const u32 count = params.submix_count + 1;
+        u64 node_count = Common::AlignUp(count, 0x40);
+        const u64 node_state_buffer_sz =
+            4 * (node_count * node_count) + 0xC * node_count + 2 * (node_count / 8);
+        u64 edge_matrix_buffer_sz = 0;
+        node_count = Common::AlignUp(count * count, 0x40);
+        if (node_count >> 31 != 0) {
+            edge_matrix_buffer_sz = (node_count | 7) / 8;
+        } else {
+            edge_matrix_buffer_sz = node_count / 8;
        }
+        buffer_sz += Common::AlignUp(node_state_buffer_sz + edge_matrix_buffer_sz, 0x10);
+    }

-        return size;
-    };
+    buffer_sz += 0x20 * (params.effect_count + 4 * params.voice_count) + 0x50;
+    if (IsFeatureSupported(AudioFeatures::Splitter, params.revision)) {
+        buffer_sz += 0xE0 * params.num_splitter_send_channels;
+        buffer_sz += 0x20 * params.splitter_count;
+        buffer_sz += Common::AlignUp(4 * params.num_splitter_send_channels, 0x10);
+    }
+    buffer_sz = Common::AlignUp(buffer_sz, 0x40) + 0x170 * params.sink_count;
+    u64 output_sz = buffer_sz + 0x280 * params.sink_count + 0x4B0 * params.effect_count +
+                    ((params.voice_count * 256) | 0x40);

-    // Calculates the part of the size related to voice channel info.
-    const auto calculate_voice_info_size = [](const AudioCore::AudioRendererParameter& params) {
-        constexpr u64 voice_info_size = 0x220;
-        constexpr u64 voice_resource_size = 0xD0;
-
-        u64 size = 0;
-        size += Common::AlignUp(sizeof(void*) * params.voice_count, info_field_alignment_size);
-        size += Common::AlignUp(voice_info_size * params.voice_count, info_field_alignment_size);
-        size +=
-            Common::AlignUp(voice_resource_size * params.voice_count, info_field_alignment_size);
-        size += Common::AlignUp(voice_state_size * params.voice_count, info_field_alignment_size);
-        return size;
-    };
-
-    // Calculates the part of the size related to memory pools.
-    const auto calculate_memory_pools_size = [](const AudioCore::AudioRendererParameter& params) {
-        const u64 num_memory_pools = sizeof(s32) * (u64{params.effect_count} + params.voice_count);
-        const u64 memory_pool_info_size = 0x20;
-        return Common::AlignUp(num_memory_pools * memory_pool_info_size, info_field_alignment_size);
-    };
-
-    // Calculates the part of the size related to the splitter context.
-    const auto calculate_splitter_context_size =
-        [this](const AudioCore::AudioRendererParameter& params) -> u64 {
-        if (!IsFeatureSupported(AudioFeatures::Splitter, params.revision)) {
-            return 0;
-        }
-
-        constexpr u64 splitter_info_size = 0x20;
-        constexpr u64 splitter_destination_data_size = 0xE0;
-
-        u64 size = 0;
-        size += params.num_splitter_send_channels;
-        size +=
-            Common::AlignUp(splitter_info_size * params.splitter_count, info_field_alignment_size);
-        size += Common::AlignUp(splitter_destination_data_size * params.num_splitter_send_channels,
-                                info_field_alignment_size);
-
-        return size;
-    };
-
-    // Calculates the part of the size related to the upsampler info.
-    const auto calculate_upsampler_info_size = [](const AudioCore::AudioRendererParameter& params) {
-        constexpr u64 upsampler_info_size = 0x280;
-        // Yes, using the buffer size over info alignment size is intentional here.
-        return Common::AlignUp(upsampler_info_size * (u64{params.submix_count} + params.sink_count),
-                               buffer_alignment_size);
-    };
-
-    // Calculates the part of the size related to effect info.
-    const auto calculate_effect_info_size = [](const AudioCore::AudioRendererParameter& params) {
-        constexpr u64 effect_info_size = 0x2B0;
-        return Common::AlignUp(effect_info_size * params.effect_count, info_field_alignment_size);
-    };
-
-    // Calculates the part of the size related to audio sink info.
-    const auto calculate_sink_info_size = [](const AudioCore::AudioRendererParameter& params) {
-        const u64 sink_info_size = 0x170;
-        return Common::AlignUp(sink_info_size * params.sink_count, info_field_alignment_size);
-    };
-
-    // Calculates the part of the size related to voice state info.
-    const auto calculate_voice_state_size = [](const AudioCore::AudioRendererParameter& params) {
-        const u64 voice_state_size = 0x100;
-        const u64 additional_size = buffer_alignment_size - 1;
-        return Common::AlignUp(voice_state_size * params.voice_count + additional_size,
-                               info_field_alignment_size);
-    };
-
-    // Calculates the part of the size related to performance statistics.
-    const auto calculate_perf_size = [this](const AudioCore::AudioRendererParameter& params) {
-        // Extra size value appended to the end of the calculation.
-        constexpr u64 appended = 128;
-
-        // Whether or not we assume the newer version of performance metrics data structures.
-        const bool is_v2 =
-            IsFeatureSupported(AudioFeatures::PerformanceMetricsVersion2, params.revision);
-
-        // Data structure sizes
-        constexpr u64 perf_statistics_size = 0x0C;
-        const u64 header_size = is_v2 ? 0x30 : 0x18;
-        const u64 entry_size = is_v2 ? 0x18 : 0x10;
-        const u64 detail_size = is_v2 ? 0x18 : 0x10;
-
-        const u64 entry_count = CalculateNumPerformanceEntries(params);
-        const u64 size_per_frame =
-            header_size + (entry_size * entry_count) + (detail_size * max_perf_detail_entries);
-
-        u64 size = 0;
-        size += Common::AlignUp(size_per_frame * params.performance_frame_count + 1,
-                                buffer_alignment_size);
-        size += Common::AlignUp(perf_statistics_size, buffer_alignment_size);
-        size += appended;
-        return size;
-    };
-
-    // Calculates the part of the size that relates to the audio command buffer.
-    const auto calculate_command_buffer_size =
-        [this](const AudioCore::AudioRendererParameter& params) {
-            constexpr u64 alignment = (buffer_alignment_size - 1) * 2;
-
-            if (!IsFeatureSupported(AudioFeatures::VariadicCommandBuffer, params.revision)) {
-                constexpr u64 command_buffer_size = 0x18000;
-
-                return command_buffer_size + alignment;
-            }
-
-            // When the variadic command buffer is supported, this means
-            // the command generator for the audio renderer can issue commands
-            // that are (as one would expect), variable in size. So what we need to do
-            // is determine the maximum possible size for a few command data structures
-            // then multiply them by the amount of present commands indicated by the given
-            // respective audio parameters.
-
-            constexpr u64 max_biquad_filters = 2;
-            constexpr u64 max_mix_buffers = 24;
-
-            constexpr u64 biquad_filter_command_size = 0x2C;
-
-            constexpr u64 depop_mix_command_size = 0x24;
-            constexpr u64 depop_setup_command_size = 0x50;
-
-            constexpr u64 effect_command_max_size = 0x540;
-
-            constexpr u64 mix_command_size = 0x1C;
-            constexpr u64 mix_ramp_command_size = 0x24;
-            constexpr u64 mix_ramp_grouped_command_size = 0x13C;
-
-            constexpr u64 perf_command_size = 0x28;
-
-            constexpr u64 sink_command_size = 0x130;
-
-            constexpr u64 submix_command_max_size =
-                depop_mix_command_size + (mix_command_size * max_mix_buffers) * max_mix_buffers;
-
-            constexpr u64 volume_command_size = 0x1C;
-            constexpr u64 volume_ramp_command_size = 0x20;
-
-            constexpr u64 voice_biquad_filter_command_size =
-                biquad_filter_command_size * max_biquad_filters;
-            constexpr u64 voice_data_command_size = 0x9C;
-            const u64 voice_command_max_size =
-                (params.splitter_count * depop_setup_command_size) +
-                (voice_data_command_size + voice_biquad_filter_command_size +
-                 volume_ramp_command_size + mix_ramp_grouped_command_size);
-
-            // Now calculate the individual elements that comprise the size and add them together.
-            const u64 effect_commands_size = params.effect_count * effect_command_max_size;
-
-            const u64 final_mix_commands_size =
-                depop_mix_command_size + volume_command_size * max_mix_buffers;
-
-            const u64 perf_commands_size =
-                perf_command_size *
-                (CalculateNumPerformanceEntries(params) + max_perf_detail_entries);
-
-            const u64 sink_commands_size = params.sink_count * sink_command_size;
-
-            const u64 splitter_commands_size =
-                params.num_splitter_send_channels * max_mix_buffers * mix_ramp_command_size;
-
-            const u64 submix_commands_size = params.submix_count * submix_command_max_size;
-
-            const u64 voice_commands_size = params.voice_count * voice_command_max_size;
-
-            return effect_commands_size + final_mix_commands_size + perf_commands_size +
-                   sink_commands_size + splitter_commands_size + submix_commands_size +
-                   voice_commands_size + alignment;
-        };
-
-    IPC::RequestParser rp{ctx};
-    const auto params = rp.PopRaw<AudioCore::AudioRendererParameter>();
-
-    u64 size = 0;
-    size += calculate_mix_buffer_sizes(params);
-    size += calculate_mix_info_size(params);
-    size += calculate_voice_info_size(params);
-    size += upsampler_manager_size;
-    size += calculate_memory_pools_size(params);
-    size += calculate_splitter_context_size(params);
-
-    size = Common::AlignUp(size, buffer_alignment_size);
-
-    size += calculate_upsampler_info_size(params);
-    size += calculate_effect_info_size(params);
-    size += calculate_sink_info_size(params);
-    size += calculate_voice_state_size(params);
-    size += calculate_perf_size(params);
-    size += calculate_command_buffer_size(params);
-
-    // finally, 4KB page align the size, and we're done.
-    size = Common::AlignUp(size, 4096);
+    if (params.performance_frame_count >= 1) {
+        output_sz = Common::AlignUp(((16 * params.sink_count + 16 * params.effect_count +
+                                      16 * params.voice_count + 16) +
+                                     0x658) *
+                                            (params.performance_frame_count + 1) +
+                                        0xc0,
+                                    0x40) +
+                    output_sz;
+    }
+    output_sz = Common::AlignUp(output_sz + 0x1807e, 0x1000);

    IPC::ResponseBuilder rb{ctx, 4};
-    rb.Push(RESULT_SUCCESS);
-    rb.Push<u64>(size);

-    LOG_DEBUG(Service_Audio, "buffer_size=0x{:X}", size);
+    rb.Push(RESULT_SUCCESS);
+    rb.Push<u64>(output_sz);
+
+    LOG_DEBUG(Service_Audio, "buffer_size=0x{:X}", output_sz);
 }

 void AudRenU::GetAudioDeviceService(Kernel::HLERequestContext& ctx) {
@@ -598,15 +357,10 @@ void AudRenU::OpenAudioRendererImpl(Kernel::HLERequestContext& ctx) {
 }

 bool AudRenU::IsFeatureSupported(AudioFeatures feature, u32_le revision) const {
-    // Byte swap
-    const u32_be version_num = revision - Common::MakeMagic('R', 'E', 'V', '0');
-
+    u32_be version_num = (revision - Common::MakeMagic('R', 'E', 'V', '0')); // Byte swap
    switch (feature) {
    case AudioFeatures::Splitter:
-        return version_num >= 2U;
-    case AudioFeatures::PerformanceMetricsVersion2:
-    case AudioFeatures::VariadicCommandBuffer:
-        return version_num >= 5U;
+        return version_num >= 2u;
    default:
        return false;
    }
--- a/src/core/hle/service/audio/audren_u.h
+++ b/src/core/hle/service/audio/audren_u.h
@@ -28,8 +28,6 @@ private:

    enum class AudioFeatures : u32 {
        Splitter,
-        PerformanceMetricsVersion2,
-        VariadicCommandBuffer,
    };

    bool IsFeatureSupported(AudioFeatures feature, u32_le revision) const;
--- a/src/core/hle/service/btdrv/btdrv.cpp
+++ b/src/core/hle/service/btdrv/btdrv.cpp
@@ -34,8 +34,8 @@ public:
        RegisterHandlers(functions);

        auto& kernel = Core::System::GetInstance().Kernel();
-        register_event = Kernel::WritableEvent::CreateEventPair(
-            kernel, Kernel::ResetType::Automatic, "BT:RegisterEvent");
+        register_event = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::OneShot,
+                                                                "BT:RegisterEvent");
    }

 private:
--- a/src/core/hle/service/btm/btm.cpp
+++ b/src/core/hle/service/btm/btm.cpp
@@ -57,13 +57,13 @@ public:
        RegisterHandlers(functions);

        auto& kernel = Core::System::GetInstance().Kernel();
-        scan_event = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::Automatic,
+        scan_event = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::OneShot,
                                                            "IBtmUserCore:ScanEvent");
        connection_event = Kernel::WritableEvent::CreateEventPair(
-            kernel, Kernel::ResetType::Automatic, "IBtmUserCore:ConnectionEvent");
+            kernel, Kernel::ResetType::OneShot, "IBtmUserCore:ConnectionEvent");
        service_discovery = Kernel::WritableEvent::CreateEventPair(
-            kernel, Kernel::ResetType::Automatic, "IBtmUserCore:Discovery");
-        config_event = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::Automatic,
+            kernel, Kernel::ResetType::OneShot, "IBtmUserCore:Discovery");
+        config_event = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::OneShot,
                                                              "IBtmUserCore:ConfigEvent");
    }

--- a/src/core/hle/service/hid/controllers/npad.cpp
+++ b/src/core/hle/service/hid/controllers/npad.cpp
@@ -170,7 +170,7 @@ void Controller_NPad::InitNewlyAddedControler(std::size_t controller_idx) {
 void Controller_NPad::OnInit() {
    auto& kernel = Core::System::GetInstance().Kernel();
    styleset_changed_event = Kernel::WritableEvent::CreateEventPair(
-        kernel, Kernel::ResetType::Automatic, "npad:NpadStyleSetChanged");
+        kernel, Kernel::ResetType::OneShot, "npad:NpadStyleSetChanged");

    if (!IsControllerActivated()) {
        return;
--- a/src/core/hle/service/nfp/nfp.cpp
+++ b/src/core/hle/service/nfp/nfp.cpp
@@ -26,7 +26,7 @@ constexpr ResultCode ERR_NO_APPLICATION_AREA(ErrorModule::NFP, 152);
 Module::Interface::Interface(std::shared_ptr<Module> module, const char* name)
    : ServiceFramework(name), module(std::move(module)) {
    auto& kernel = Core::System::GetInstance().Kernel();
-    nfc_tag_load = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::Automatic,
+    nfc_tag_load = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::OneShot,
                                                          "IUser:NFCTagDetected");
 }

@@ -67,9 +67,9 @@ public:

        auto& kernel = Core::System::GetInstance().Kernel();
        deactivate_event = Kernel::WritableEvent::CreateEventPair(
-            kernel, Kernel::ResetType::Automatic, "IUser:DeactivateEvent");
+            kernel, Kernel::ResetType::OneShot, "IUser:DeactivateEvent");
        availability_change_event = Kernel::WritableEvent::CreateEventPair(
-            kernel, Kernel::ResetType::Automatic, "IUser:AvailabilityChangeEvent");
+            kernel, Kernel::ResetType::OneShot, "IUser:AvailabilityChangeEvent");
    }

 private:
--- a/src/core/hle/service/nifm/nifm.cpp
+++ b/src/core/hle/service/nifm/nifm.cpp
@@ -62,9 +62,9 @@ public:
        RegisterHandlers(functions);

        auto& kernel = Core::System::GetInstance().Kernel();
-        event1 = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::Automatic,
+        event1 = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::OneShot,
                                                        "IRequest:Event1");
-        event2 = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::Automatic,
+        event2 = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::OneShot,
                                                        "IRequest:Event2");
    }

--- a/src/core/hle/service/nim/nim.cpp
+++ b/src/core/hle/service/nim/nim.cpp
@@ -141,7 +141,7 @@ public:

        auto& kernel = Core::System::GetInstance().Kernel();
        finished_event = Kernel::WritableEvent::CreateEventPair(
-            kernel, Kernel::ResetType::Automatic,
+            kernel, Kernel::ResetType::OneShot,
            "IEnsureNetworkClockAvailabilityService:FinishEvent");
    }

--- a/src/core/hle/service/nvdrv/interface.cpp
+++ b/src/core/hle/service/nvdrv/interface.cpp
@@ -129,7 +129,7 @@ NVDRV::NVDRV(std::shared_ptr<Module> nvdrv, const char* name)
    RegisterHandlers(functions);

    auto& kernel = Core::System::GetInstance().Kernel();
-    query_event = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::Automatic,
+    query_event = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::OneShot,
                                                         "NVDRV::query_event");
 }

--- a/src/core/hle/service/nvflinger/buffer_queue.cpp
+++ b/src/core/hle/service/nvflinger/buffer_queue.cpp
@@ -16,7 +16,7 @@ namespace Service::NVFlinger {

 BufferQueue::BufferQueue(u32 id, u64 layer_id) : id(id), layer_id(layer_id) {
    auto& kernel = Core::System::GetInstance().Kernel();
-    buffer_wait_event = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::Manual,
+    buffer_wait_event = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::Sticky,
                                                               "BufferQueue NativeHandle");
 }

--- a/src/core/hle/service/set/set.cpp
+++ b/src/core/hle/service/set/set.cpp
@@ -2,15 +2,16 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

-#include <algorithm>
 #include <chrono>
 #include "common/logging/log.h"
 #include "core/hle/ipc_helpers.h"
+#include "core/hle/kernel/client_port.h"
+#include "core/hle/kernel/client_session.h"
 #include "core/hle/service/set/set.h"
 #include "core/settings.h"

 namespace Service::Set {
-namespace {
+
 constexpr std::array<LanguageCode, 17> available_language_codes = {{
    LanguageCode::JA,
    LanguageCode::EN_US,
@@ -31,35 +32,45 @@ constexpr std::array<LanguageCode, 17> available_language_codes = {{
    LanguageCode::ZH_HANT,
 }};

-constexpr std::size_t pre4_0_0_max_entries = 15;
-constexpr std::size_t post4_0_0_max_entries = 17;
+constexpr std::size_t pre4_0_0_max_entries = 0xF;
+constexpr std::size_t post4_0_0_max_entries = 0x40;

 constexpr ResultCode ERR_INVALID_LANGUAGE{ErrorModule::Settings, 625};

-void PushResponseLanguageCode(Kernel::HLERequestContext& ctx, std::size_t num_language_codes) {
-    IPC::ResponseBuilder rb{ctx, 3};
-    rb.Push(RESULT_SUCCESS);
-    rb.Push(static_cast<u32>(num_language_codes));
-}
-
-void GetAvailableLanguageCodesImpl(Kernel::HLERequestContext& ctx, std::size_t max_size) {
-    const std::size_t requested_amount = ctx.GetWriteBufferSize() / sizeof(LanguageCode);
-    const std::size_t copy_amount = std::min(requested_amount, max_size);
-    const std::size_t copy_size = copy_amount * sizeof(LanguageCode);
-
-    ctx.WriteBuffer(available_language_codes.data(), copy_size);
-    PushResponseLanguageCode(ctx, copy_amount);
-}
-} // Anonymous namespace
-
 LanguageCode GetLanguageCodeFromIndex(std::size_t index) {
    return available_language_codes.at(index);
 }

+/// Returns the first `size` entries of available_language_codes as a fixed-size array.
+/// When `size` is larger than the table, the remaining slots stay value-initialized.
+template <std::size_t size>
+static std::array<LanguageCode, size> MakeLanguageCodeSubset() {
+    std::array<LanguageCode, size> arr{};
+    // Never copy more than the table holds: this is instantiated with 0x40 while the table
+    // only has 17 entries, so an unclamped copy_n would read out of bounds.
+    constexpr std::size_t count = std::min(size, available_language_codes.size());
+    std::copy_n(available_language_codes.begin(), count, arr.begin());
+    return arr;
+}
+
+/// Pushes a successful response whose payload is the reported language code count:
+/// the table size, capped at `max_size`.
+static void PushResponseLanguageCode(Kernel::HLERequestContext& ctx, std::size_t max_size) {
+    IPC::ResponseBuilder rb{ctx, 3};
+    rb.Push(RESULT_SUCCESS);
+    rb.Push(static_cast<u32>(std::min(available_language_codes.size(), max_size)));
+}
+
 void SET::GetAvailableLanguageCodes(Kernel::HLERequestContext& ctx) {
    LOG_DEBUG(Service_SET, "called");

-    GetAvailableLanguageCodesImpl(ctx, pre4_0_0_max_entries);
+    // Hand out at most pre4_0_0_max_entries codes, then report the capped count.
+    if (available_language_codes.size() > pre4_0_0_max_entries) {
+        ctx.WriteBuffer(MakeLanguageCodeSubset<pre4_0_0_max_entries>());
+    } else {
+        ctx.WriteBuffer(available_language_codes);
+    }
+    PushResponseLanguageCode(ctx, pre4_0_0_max_entries);
 }

 void SET::MakeLanguageCode(Kernel::HLERequestContext& ctx) {
@@ -80,7 +87,12 @@ void SET::MakeLanguageCode(Kernel::HLERequestContext& ctx) {
 void SET::GetAvailableLanguageCodes2(Kernel::HLERequestContext& ctx) {
    LOG_DEBUG(Service_SET, "called");

-    GetAvailableLanguageCodesImpl(ctx, post4_0_0_max_entries);
+    if (available_language_codes.size() > post4_0_0_max_entries) { // table larger than the cap
+        ctx.WriteBuffer(MakeLanguageCodeSubset<post4_0_0_max_entries>());
+    } else {
+        ctx.WriteBuffer(available_language_codes); // whole table fits under the cap
+    }
+    PushResponseLanguageCode(ctx, post4_0_0_max_entries); // reports min(table size, cap)
 }

 void SET::GetAvailableLanguageCodeCount(Kernel::HLERequestContext& ctx) {
@@ -90,9 +102,9 @@ void SET::GetAvailableLanguageCodeCount(Kernel::HLERequestContext& ctx) {
 }

 void SET::GetAvailableLanguageCodeCount2(Kernel::HLERequestContext& ctx) {
-    LOG_DEBUG(Service_SET, "called");
-
    PushResponseLanguageCode(ctx, post4_0_0_max_entries);
+
+    LOG_DEBUG(Service_SET, "called"); // NOTE(review): logged after the response, unlike siblings
 }

 void SET::GetLanguageCode(Kernel::HLERequestContext& ctx) {
--- a/src/core/hle/service/vi/display/vi_display.cpp
+++ b/src/core/hle/service/vi/display/vi_display.cpp
@@ -17,7 +17,7 @@ namespace Service::VI {

 Display::Display(u64 id, std::string name) : id{id}, name{std::move(name)} {
    auto& kernel = Core::System::GetInstance().Kernel();
-    vsync_event = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::Manual,
+    vsync_event = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::Sticky,
                                                         fmt::format("Display VSync Event {}", id));
 }

--- a/src/core/hle/service/vi/vi.cpp
+++ b/src/core/hle/service/vi/vi.cpp
@@ -556,7 +556,7 @@ private:
            } else {
                // Wait the current thread until a buffer becomes available
                ctx.SleepClientThread(
-                    "IHOSBinderDriver::DequeueBuffer", -1,
+                    Kernel::GetCurrentThread(), "IHOSBinderDriver::DequeueBuffer", -1,
                    [=](Kernel::SharedPtr<Kernel::Thread> thread, Kernel::HLERequestContext& ctx,
                        Kernel::ThreadWakeupReason reason) {
                        // Repeat TransactParcel DequeueBuffer when a buffer is available
--- a/src/core/loader/nso.cpp
+++ b/src/core/loader/nso.cpp
@@ -39,7 +39,9 @@ std::vector<u8> DecompressSegment(const std::vector<u8>& compressed_data,
    const std::vector<u8> uncompressed_data =
        Common::Compression::DecompressDataLZ4(compressed_data, header.size);

-    ASSERT_MSG(uncompressed_data.size() == header.size, "{} != {}", header.size,
-               uncompressed_data.size());
+    // Compare as unsigned sizes: narrowing header.size to int would turn sizes >= 2^31 negative
+    // and make the check fail spuriously. NOTE(review): assumes header.size is unsigned - confirm.
+    ASSERT_MSG(uncompressed_data.size() == static_cast<std::size_t>(header.size), "{} != {}",
+               header.size, uncompressed_data.size());

    return uncompressed_data;
--- a/src/core/settings.cpp
+++ b/src/core/settings.cpp
@@ -90,6 +90,7 @@ void LogSettings() {
    LogSetting("Renderer_UseResolutionFactor", Settings::values.resolution_factor);
    LogSetting("Renderer_UseFrameLimit", Settings::values.use_frame_limit);
    LogSetting("Renderer_FrameLimit", Settings::values.frame_limit);
+    LogSetting("Renderer_UseCompatibilityProfile", Settings::values.use_compatibility_profile);
    LogSetting("Renderer_UseDiskShaderCache", Settings::values.use_disk_shader_cache);
    LogSetting("Renderer_UseAccurateGpuEmulation", Settings::values.use_accurate_gpu_emulation);
    LogSetting("Renderer_UseAsynchronousGpuEmulation",
--- a/src/core/settings.h
+++ b/src/core/settings.h
@@ -390,6 +390,7 @@ struct Values {
    float resolution_factor;
    bool use_frame_limit;
    u16 frame_limit;
+    bool use_compatibility_profile;
    bool use_disk_shader_cache;
    bool use_accurate_gpu_emulation;
    bool use_asynchronous_gpu_emulation;
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -42,6 +42,8 @@ add_library(video_core STATIC
    renderer_opengl/gl_device.h
    renderer_opengl/gl_global_cache.cpp
    renderer_opengl/gl_global_cache.h
+    renderer_opengl/gl_primitive_assembler.cpp
+    renderer_opengl/gl_primitive_assembler.h
    renderer_opengl/gl_rasterizer.cpp
    renderer_opengl/gl_rasterizer.h
    renderer_opengl/gl_rasterizer_cache.cpp
@@ -87,6 +89,7 @@ add_library(video_core STATIC
    shader/decode/conversion.cpp
    shader/decode/memory.cpp
    shader/decode/texture.cpp
+    shader/decode/image.cpp
    shader/decode/float_set_predicate.cpp
    shader/decode/integer_set_predicate.cpp
    shader/decode/half_set_predicate.cpp
--- a/src/video_core/dma_pusher.cpp
+++ b/src/video_core/dma_pusher.cpp
@@ -40,13 +40,6 @@ bool DmaPusher::Step() {
    }

    const CommandList& command_list{dma_pushbuffer.front()};
-    ASSERT_OR_EXECUTE(!command_list.empty(), {
-        // Somehow the command_list is empty, in order to avoid a crash
-        // We ignore it and assume its size is 0.
-        dma_pushbuffer.pop();
-        dma_pushbuffer_subindex = 0;
-        return true;
-    });
    const CommandListHeader command_list_header{command_list[dma_pushbuffer_subindex++]};
    GPUVAddr dma_get = command_list_header.addr;
    GPUVAddr dma_put = dma_get + command_list_header.size * sizeof(u32);
--- a/src/video_core/engines/engine_upload.cpp
+++ b/src/video_core/engines/engine_upload.cpp
@@ -2,8 +2,6 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

-#include <cstring>
-
 #include "common/assert.h"
 #include "video_core/engines/engine_upload.h"
 #include "video_core/memory_manager.h"
@@ -12,9 +10,7 @@
 namespace Tegra::Engines::Upload {

 State::State(MemoryManager& memory_manager, Registers& regs)
-    : regs{regs}, memory_manager{memory_manager} {}
-
-State::~State() = default;
+    : memory_manager(memory_manager), regs(regs) {}

 void State::ProcessExec(const bool is_linear) {
    write_offset = 0;
--- a/src/video_core/engines/engine_upload.h
+++ b/src/video_core/engines/engine_upload.h
@@ -4,8 +4,10 @@

 #pragma once

+#include <cstddef>
 #include <vector>
 #include "common/bit_field.h"
+#include "common/common_funcs.h"
 #include "common/common_types.h"

 namespace Tegra {
@@ -55,10 +57,10 @@ struct Registers {
 class State {
 public:
    State(MemoryManager& memory_manager, Registers& regs);
-    ~State();
+    ~State() = default;

-    void ProcessExec(bool is_linear);
-    void ProcessData(u32 data, bool is_last_call);
+    void ProcessExec(const bool is_linear);
+    void ProcessData(const u32 data, const bool is_last_call);

 private:
    u32 write_offset = 0;
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -34,9 +34,9 @@ void Maxwell3D::InitializeRegisterDefaults() {

    // Depth range near/far is not always set, but is expected to be the default 0.0f, 1.0f. This is
    // needed for ARMS.
-    for (auto& viewport : regs.viewports) {
-        viewport.depth_range_near = 0.0f;
-        viewport.depth_range_far = 1.0f;
+    for (std::size_t viewport{}; viewport < Regs::NumViewports; ++viewport) {
+        regs.viewports[viewport].depth_range_near = 0.0f;
+        regs.viewports[viewport].depth_range_far = 1.0f;
    }

    // Doom and Bomberman seems to use the uninitialized registers and just enable blend
@@ -47,13 +47,13 @@ void Maxwell3D::InitializeRegisterDefaults() {
    regs.blend.equation_a = Regs::Blend::Equation::Add;
    regs.blend.factor_source_a = Regs::Blend::Factor::One;
    regs.blend.factor_dest_a = Regs::Blend::Factor::Zero;
-    for (auto& blend : regs.independent_blend) {
-        blend.equation_rgb = Regs::Blend::Equation::Add;
-        blend.factor_source_rgb = Regs::Blend::Factor::One;
-        blend.factor_dest_rgb = Regs::Blend::Factor::Zero;
-        blend.equation_a = Regs::Blend::Equation::Add;
-        blend.factor_source_a = Regs::Blend::Factor::One;
-        blend.factor_dest_a = Regs::Blend::Factor::Zero;
+    for (std::size_t blend_index = 0; blend_index < Regs::NumRenderTargets; blend_index++) {
+        regs.independent_blend[blend_index].equation_rgb = Regs::Blend::Equation::Add;
+        regs.independent_blend[blend_index].factor_source_rgb = Regs::Blend::Factor::One;
+        regs.independent_blend[blend_index].factor_dest_rgb = Regs::Blend::Factor::Zero;
+        regs.independent_blend[blend_index].equation_a = Regs::Blend::Equation::Add;
+        regs.independent_blend[blend_index].factor_source_a = Regs::Blend::Factor::One;
+        regs.independent_blend[blend_index].factor_dest_a = Regs::Blend::Factor::Zero;
    }
    regs.stencil_front_op_fail = Regs::StencilOp::Keep;
    regs.stencil_front_op_zfail = Regs::StencilOp::Keep;
@@ -75,11 +75,11 @@ void Maxwell3D::InitializeRegisterDefaults() {

    // TODO(bunnei): Some games do not initialize the color masks (e.g. Sonic Mania). Assuming a
    // default of enabled fixes rendering here.
-    for (auto& color_mask : regs.color_mask) {
-        color_mask.R.Assign(1);
-        color_mask.G.Assign(1);
-        color_mask.B.Assign(1);
-        color_mask.A.Assign(1);
+    for (std::size_t color_mask = 0; color_mask < Regs::NumRenderTargets; color_mask++) {
+        regs.color_mask[color_mask].R.Assign(1);
+        regs.color_mask[color_mask].G.Assign(1);
+        regs.color_mask[color_mask].B.Assign(1);
+        regs.color_mask[color_mask].A.Assign(1);
    }

    // Commercial games seem to assume this value is enabled and nouveau sets this value manually.
@@ -178,13 +178,13 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {

        // Vertex buffer
        if (method >= MAXWELL3D_REG_INDEX(vertex_array) &&
-            method < MAXWELL3D_REG_INDEX(vertex_array) + 4 * Regs::NumVertexArrays) {
+            method < MAXWELL3D_REG_INDEX(vertex_array) + 4 * 32) {
            dirty_flags.vertex_array.set((method - MAXWELL3D_REG_INDEX(vertex_array)) >> 2);
        } else if (method >= MAXWELL3D_REG_INDEX(vertex_array_limit) &&
-                   method < MAXWELL3D_REG_INDEX(vertex_array_limit) + 2 * Regs::NumVertexArrays) {
+                   method < MAXWELL3D_REG_INDEX(vertex_array_limit) + 2 * 32) {
            dirty_flags.vertex_array.set((method - MAXWELL3D_REG_INDEX(vertex_array_limit)) >> 1);
        } else if (method >= MAXWELL3D_REG_INDEX(instanced_arrays) &&
-                   method < MAXWELL3D_REG_INDEX(instanced_arrays) + Regs::NumVertexArrays) {
+                   method < MAXWELL3D_REG_INDEX(instanced_arrays) + 32) {
            dirty_flags.vertex_array.set(method - MAXWELL3D_REG_INDEX(instanced_arrays));
        }
    }
@@ -432,17 +432,13 @@ Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const {
    Texture::TICEntry tic_entry;
    memory_manager.ReadBlockUnsafe(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry));

-    ASSERT_MSG(tic_entry.header_version == Texture::TICHeaderVersion::BlockLinear ||
-                   tic_entry.header_version == Texture::TICHeaderVersion::Pitch,
-               "TIC versions other than BlockLinear or Pitch are unimplemented");
-
-    const auto r_type = tic_entry.r_type.Value();
-    const auto g_type = tic_entry.g_type.Value();
-    const auto b_type = tic_entry.b_type.Value();
-    const auto a_type = tic_entry.a_type.Value();
+    const auto r_type{tic_entry.r_type.Value()};
+    const auto g_type{tic_entry.g_type.Value()};
+    const auto b_type{tic_entry.b_type.Value()};
+    const auto a_type{tic_entry.a_type.Value()};

    // TODO(Subv): Different data types for separate components are not supported
-    DEBUG_ASSERT(r_type == g_type && r_type == b_type && r_type == a_type);
+    ASSERT(r_type == g_type && r_type == b_type && r_type == a_type);

    return tic_entry;
 }
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -6,7 +6,6 @@

 #include <array>
 #include <bitset>
-#include <type_traits>
 #include <unordered_map>
 #include <vector>

@@ -59,7 +58,6 @@ public:
        static constexpr std::size_t NumCBData = 16;
        static constexpr std::size_t NumVertexArrays = 32;
        static constexpr std::size_t NumVertexAttributes = 32;
-        static constexpr std::size_t NumVaryings = 31;
        static constexpr std::size_t NumTextureSamplers = 32;
        static constexpr std::size_t NumClipDistances = 8;
        static constexpr std::size_t MaxShaderProgram = 6;
@@ -1109,7 +1107,6 @@ public:
    } regs{};

    static_assert(sizeof(Regs) == Regs::NUM_REGS * sizeof(u32), "Maxwell3D Regs has wrong size");
-    static_assert(std::is_trivially_copyable_v<Regs>, "Maxwell3D Regs must be trivially copyable");

    struct State {
        struct ConstBufferInfo {
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -98,10 +98,6 @@ union Attribute {
        BitField<22, 2, u64> element;
        BitField<24, 6, Index> index;
        BitField<47, 3, AttributeSize> size;
-
-        bool IsPhysical() const {
-            return element == 0 && static_cast<u64>(index.Value()) == 0;
-        }
    } fmt20;

    union {
@@ -126,6 +122,15 @@ union Sampler {
    u64 value{};
 };

+union Image { // Image operand of an Instruction, viewed through bit fields of `value`
+    Image() = default;
+
+    constexpr explicit Image(u64 value) : value{value} {}
+
+    BitField<36, 13, u64> index; // bits 36..48 of `value`
+    u64 value{}; // zero-initialized like Sampler::value, so a default Image is well-defined
+};
+
 } // namespace Tegra::Shader

 namespace std {
@@ -344,6 +349,26 @@ enum class TextureMiscMode : u64 {
    PTP,
 };

+enum class SurfaceDataMode : u64 { // type of the 1-bit Instruction::sust.mode field
+    P = 0,    // mode asserted by sust.IsComponentEnabled
+    D_BA = 1, // mode asserted by sust.GetStoreDataLayout
+};
+
+enum class OutOfBoundsStore : u64 { // type of the 2-bit Instruction::sust.out_of_bounds_store
+    Ignore = 0,
+    Clamp = 1,
+    Trap = 2,
+};
+
+enum class ImageType : u64 { // type of the 3-bit Instruction::sust.image_type field
+    Texture1D = 0,
+    TextureBuffer = 1,
+    Texture1DArray = 2,
+    Texture2D = 3,
+    Texture2DArray = 4,
+    Texture3D = 5,
+};
+
 enum class IsberdMode : u64 {
    None = 0,
    Patch = 1,
@@ -398,7 +423,7 @@ enum class LmemLoadCacheManagement : u64 {
    CV = 3,
 };

-enum class LmemStoreCacheManagement : u64 {
+enum class StoreCacheManagement : u64 {
    Default = 0,
    CG = 1,
    CS = 2,
@@ -503,11 +528,6 @@ enum class SystemVariable : u64 {
    CircularQueueEntryAddressHigh = 0x63,
 };

-enum class PhysicalAttributeDirection : u64 {
-    Input = 0,
-    Output = 1,
-};
-
 union Instruction {
    Instruction& operator=(const Instruction& instr) {
        value = instr.value;
@@ -529,11 +549,6 @@ union Instruction {
    BitField<39, 8, Register> gpr39;
    BitField<48, 16, u64> opcode;

-    union {
-        BitField<8, 8, Register> gpr;
-        BitField<20, 24, s64> offset;
-    } gmem;
-
    union {
        BitField<20, 16, u64> imm20_16;
        BitField<20, 19, u64> imm20_19;
@@ -601,7 +616,6 @@ union Instruction {
    } alu;

    union {
-        BitField<38, 1, u64> idx;
        BitField<51, 1, u64> saturate;
        BitField<52, 2, IpaSampleMode> sample_mode;
        BitField<54, 2, IpaInterpMode> interp_mode;
@@ -811,30 +825,21 @@ union Instruction {
    } ld_l;

    union {
-        BitField<44, 2, LmemStoreCacheManagement> cache_management;
+        BitField<44, 2, StoreCacheManagement> cache_management;
    } st_l;

    union {
        BitField<48, 3, UniformType> type;
        BitField<46, 2, u64> cache_mode;
+        BitField<20, 24, s64> immediate_offset;
    } ldg;

    union {
        BitField<48, 3, UniformType> type;
        BitField<46, 2, u64> cache_mode;
+        BitField<20, 24, s64> immediate_offset;
    } stg;

-    union {
-        BitField<32, 1, PhysicalAttributeDirection> direction;
-        BitField<47, 3, AttributeSize> size;
-        BitField<20, 11, u64> address;
-    } al2p;
-
-    union {
-        BitField<53, 3, UniformType> type;
-        BitField<52, 1, u64> extended;
-    } generic;
-
    union {
        BitField<0, 3, u64> pred0;
        BitField<3, 3, u64> pred3;
@@ -1231,6 +1236,20 @@ union Instruction {
        }
    } texs;

+    union { // Field layout for the TLD (Texture Load) opcode
+        BitField<28, 1, u64> is_array;
+        BitField<29, 2, TextureType> texture_type;
+        BitField<35, 1, u64> aoffi;
+        BitField<49, 1, u64> nodep_flag;
+        BitField<50, 1, u64> ms; // Multisample?
+        BitField<54, 1, u64> cl;
+        BitField<55, 1, u64> process_mode;
+
+        TextureProcessMode GetTextureProcessMode() const { // LZ when process_mode is 0, else LL
+            return process_mode == 0 ? TextureProcessMode::LZ : TextureProcessMode::LL;
+        }
+    } tld;
+
    union {
        BitField<49, 1, u64> nodep_flag;
        BitField<53, 4, u64> texture_info;
@@ -1280,6 +1299,33 @@ union Instruction {
        }
    } tlds;

+    union { // Field layout for the SUST (Surface Store) opcode
+        BitField<24, 2, StoreCacheManagement> cache_management;
+        BitField<33, 3, ImageType> image_type;
+        BitField<49, 2, OutOfBoundsStore> out_of_bounds_store;
+        BitField<51, 1, u64> is_immediate;
+        BitField<52, 1, SurfaceDataMode> mode;
+
+        // Both views start at bit 20 and overlap; the accessors below assert the matching mode.
+        BitField<20, 3, StoreType> store_data_layout;
+        BitField<20, 4, u64> component_mask_selector;
+
+        /// Returns whether `component` (0 = R, 1 = G, 2 = B, 3 = A) is selected by the mask.
+        /// Asserts SurfaceDataMode::P; test() throws std::out_of_range for component >= 4.
+        bool IsComponentEnabled(std::size_t component) const {
+            ASSERT(mode == SurfaceDataMode::P);
+            // The selector already is the RGBA bit mask (bit 0 = R ... bit 3 = A), so no
+            // translation table is needed.
+            return std::bitset<4>{component_mask_selector.Value()}.test(component);
+        }
+
+        /// Returns the store data layout. Asserts SurfaceDataMode::D_BA.
+        StoreType GetStoreDataLayout() const {
+            ASSERT(mode == SurfaceDataMode::D_BA);
+            return store_data_layout;
+        }
+    } sust;
+
    union {
        BitField<20, 24, u64> target;
        BitField<5, 1, u64> constant_buffer;
@@ -1371,6 +1419,7 @@ union Instruction {

    Attribute attribute;
    Sampler sampler;
+    Image image;

    u64 value;
 };
@@ -1395,24 +1444,23 @@ public:
        LD_L,
        LD_S,
        LD_C,
-        LD,  // Load from generic memory
-        LDG, // Load from global memory
        ST_A,
        ST_L,
        ST_S,
-        ST,   // Store in generic memory
-        STG,  // Store in global memory
-        AL2P, // Transforms attribute memory into physical memory
+        LDG, // Load from global memory
+        STG, // Store in global memory
        TEX,
        TEX_B,  // Texture Load Bindless
        TXQ,    // Texture Query
        TXQ_B,  // Texture Query Bindless
        TEXS,   // Texture Fetch with scalar/non-vec4 source/destinations
+        TLD,    // Texture Load
        TLDS,   // Texture Load with scalar/non-vec4 source/destinations
        TLD4,   // Texture Load 4
        TLD4S,  // Texture Load 4 with scalar / non - vec4 source / destinations
        TMML_B, // Texture Mip Map Level
        TMML,   // Texture Mip Map Level
+        SUST,   // Surface Store
        EXIT,
        IPA,
        OUT_R, // Emit vertex/primitive
@@ -1543,6 +1591,7 @@ public:
        Synch,
        Memory,
        Texture,
+        Image,
        FloatSet,
        FloatSetPredicate,
        IntegerSet,
@@ -1668,24 +1717,23 @@ private:
            INST("1110111101001---", Id::LD_S, Type::Memory, "LD_S"),
            INST("1110111101000---", Id::LD_L, Type::Memory, "LD_L"),
            INST("1110111110010---", Id::LD_C, Type::Memory, "LD_C"),
-            INST("100-------------", Id::LD, Type::Memory, "LD"),
-            INST("1110111011010---", Id::LDG, Type::Memory, "LDG"),
            INST("1110111111110---", Id::ST_A, Type::Memory, "ST_A"),
            INST("1110111101011---", Id::ST_S, Type::Memory, "ST_S"),
            INST("1110111101010---", Id::ST_L, Type::Memory, "ST_L"),
-            INST("101-------------", Id::ST, Type::Memory, "ST"),
+            INST("1110111011010---", Id::LDG, Type::Memory, "LDG"),
            INST("1110111011011---", Id::STG, Type::Memory, "STG"),
-            INST("1110111110100---", Id::AL2P, Type::Memory, "AL2P"),
            INST("110000----111---", Id::TEX, Type::Texture, "TEX"),
            INST("1101111010111---", Id::TEX_B, Type::Texture, "TEX_B"),
            INST("1101111101001---", Id::TXQ, Type::Texture, "TXQ"),
            INST("1101111101010---", Id::TXQ_B, Type::Texture, "TXQ_B"),
            INST("1101-00---------", Id::TEXS, Type::Texture, "TEXS"),
+            INST("11011100--11----", Id::TLD, Type::Texture, "TLD"),
            INST("1101101---------", Id::TLDS, Type::Texture, "TLDS"),
            INST("110010----111---", Id::TLD4, Type::Texture, "TLD4"),
            INST("1101111100------", Id::TLD4S, Type::Texture, "TLD4S"),
            INST("110111110110----", Id::TMML_B, Type::Texture, "TMML_B"),
            INST("1101111101011---", Id::TMML, Type::Texture, "TMML"),
+            INST("11101011001-----", Id::SUST, Type::Image, "SUST"),
            INST("111000110000----", Id::EXIT, Type::Trivial, "EXIT"),
            INST("11100000--------", Id::IPA, Type::Trivial, "IPA"),
            INST("1111101111100---", Id::OUT_R, Type::Trivial, "OUT_R"),
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -118,7 +118,7 @@ void SynchState::WaitForSynchronization(u64 fence) {
    // Wait for the GPU to be idle (all commands to be executed)
    {
        MICROPROFILE_SCOPE(GPU_wait);
-        std::unique_lock lock{synchronization_mutex};
+        std::unique_lock<std::mutex> lock{synchronization_mutex};
        synchronization_condition.wait(lock, [this, fence] { return signaled_fence >= fence; });
    }
 }
--- a/src/video_core/gpu_thread.h
+++ b/src/video_core/gpu_thread.h
@@ -81,6 +81,14 @@ struct CommandDataContainer {
    CommandDataContainer(CommandData&& data, u64 next_fence)
        : data{std::move(data)}, fence{next_fence} {}

+    /// Copy-assigns both the command payload and its fence from `t`.
+    /// `t` is const, so this is always a copy; wrapping it in std::move would only disguise that.
+    CommandDataContainer& operator=(const CommandDataContainer& t) {
+        data = t.data;
+        fence = t.fence;
+        return *this;
+    }
+
    CommandData data;
    u64 fence{};
 };
@@ -103,7 +109,7 @@ struct SynchState final {

    void TrySynchronize() {
        if (IsSynchronized()) {
-            std::lock_guard lock{synchronization_mutex};
+            std::lock_guard<std::mutex> lock{synchronization_mutex};
            synchronization_condition.notify_one();
        }
    }
--- a/src/video_core/macro_interpreter.cpp
+++ b/src/video_core/macro_interpreter.cpp
@@ -118,12 +118,10 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
                          static_cast<u32>(opcode.operation.Value()));
    }

-    // An instruction with the Exit flag will not actually
-    // cause an exit if it's executed inside a delay slot.
-    // TODO(Blinkhawk): Reversed to always exit. The behavior explained above requires further
-    // testing on the MME code.
    if (opcode.is_exit) {
        // Exit has a delay slot, execute the next instruction
+        // Note: Executing an exit during a branch delay slot will cause the instruction at the
+        // branch target to be executed before exiting.
        Step(offset, true);
        return false;
    }
--- a/src/video_core/rasterizer_cache.h
+++ b/src/video_core/rasterizer_cache.h
@@ -144,9 +144,8 @@ protected:

        object->SetIsRegistered(false);
        rasterizer.UpdatePagesCachedCount(object->GetCpuAddr(), object->GetSizeInBytes(), -1);
-        const CacheAddr addr = object->GetCacheAddr();
        interval_cache.subtract({GetInterval(object), ObjectSet{object}});
-        map_cache.erase(addr);
+        map_cache.erase(object->GetCacheAddr());
    }

    /// Returns a ticks counter used for tracking when cached objects were last modified
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -71,6 +71,16 @@ GLintptr OGLBufferCache::UploadHostMemory(const void* raw_pointer, std::size_t s
    return uploaded_offset;
 }

+std::tuple<u8*, GLintptr> OGLBufferCache::ReserveMemory(std::size_t size, std::size_t alignment) {
+    AlignBuffer(alignment);
+    // Capture the write cursor right after AlignBuffer: the caller gets the start of the range.
+    const std::tuple<u8*, GLintptr> reservation{buffer_ptr, buffer_offset};
+
+    buffer_ptr += size;
+    buffer_offset += size;
+    return reservation;
+}
+
 bool OGLBufferCache::Map(std::size_t max_size) {
    bool invalidate;
    std::tie(buffer_ptr, buffer_offset_base, invalidate) =
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -61,6 +61,9 @@ public:
    /// Uploads from a host memory. Returns host's buffer offset where it's been allocated.
    GLintptr UploadHostMemory(const void* raw_pointer, std::size_t size, std::size_t alignment = 4);

+    /// Reserves memory to be used by host's CPU. Returns mapped address and offset.
+    std::tuple<u8*, GLintptr> ReserveMemory(std::size_t size, std::size_t alignment = 4);
+
    bool Map(std::size_t max_size);
    void Unmap();

--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -21,21 +21,16 @@ T GetInteger(GLenum pname) {

 Device::Device() {
    uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);
-    max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS);
-    max_varyings = GetInteger<u32>(GL_MAX_VARYING_VECTORS);
    has_variable_aoffi = TestVariableAoffi();
 }

 Device::Device(std::nullptr_t) {
    uniform_buffer_alignment = 0;
-    max_vertex_attributes = 16;
-    max_varyings = 15;
    has_variable_aoffi = true;
 }

 bool Device::TestVariableAoffi() {
    const GLchar* AOFFI_TEST = R"(#version 430 core
-// This is a unit test, please ignore me on apitrace bug reports.
 uniform sampler2D tex;
 uniform ivec2 variable_offset;
 void main() {
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -5,7 +5,6 @@
 #pragma once

 #include <cstddef>
-#include "common/common_types.h"

 namespace OpenGL {

@@ -18,14 +17,6 @@ public:
        return uniform_buffer_alignment;
    }

-    u32 GetMaxVertexAttributes() const {
-        return max_vertex_attributes;
-    }
-
-    u32 GetMaxVaryings() const {
-        return max_varyings;
-    }
-
    bool HasVariableAoffi() const {
        return has_variable_aoffi;
    }
@@ -34,8 +25,6 @@ private:
    static bool TestVariableAoffi();

    std::size_t uniform_buffer_alignment{};
-    u32 max_vertex_attributes{};
-    u32 max_varyings{};
    bool has_variable_aoffi{};
 };

--- a/src/video_core/renderer_opengl/gl_primitive_assembler.cpp
+++ b/src/video_core/renderer_opengl/gl_primitive_assembler.cpp
@@ -0,0 +1,77 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <array>
+#include <cstring>
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "core/core.h"
+#include "video_core/memory_manager.h"
+#include "video_core/renderer_opengl/gl_buffer_cache.h"
+#include "video_core/renderer_opengl/gl_primitive_assembler.h"
+
+namespace OpenGL {
+
+/// Number of indices emitted for every quad: two triangles of three indices each.
+constexpr u32 INDICES_PER_QUAD = 6;
+/// Quad-local vertex order of the two triangles, (0, 1, 2) and (0, 2, 3), that replace a quad.
+constexpr std::array<u32, INDICES_PER_QUAD> QUAD_MAP = {0, 1, 2, 0, 2, 3};
+
+PrimitiveAssembler::PrimitiveAssembler(OGLBufferCache& buffer_cache) : buffer_cache(buffer_cache) {}
+
+PrimitiveAssembler::~PrimitiveAssembler() = default;
+
+/// Returns the number of bytes needed to hold the GLuint triangle indices for `count` quad
+/// vertices. Asserts that `count` is a multiple of 4.
+std::size_t PrimitiveAssembler::CalculateQuadSize(u32 count) const {
+    ASSERT_MSG(count % 4 == 0, "Quad count is expected to be a multiple of 4");
+    // Widen before multiplying so a large vertex count cannot wrap in 32-bit arithmetic.
+    return static_cast<std::size_t>(count / 4) * INDICES_PER_QUAD * sizeof(GLuint);
+}
+
+/// Generates the index list that draws the non-indexed quad vertices [first, first + count) as
+/// triangles, writing it into memory reserved from the buffer cache.
+/// Returns the offset of the generated indices as reported by ReserveMemory.
+GLintptr PrimitiveAssembler::MakeQuadArray(u32 first, u32 count) {
+    const std::size_t size{CalculateQuadSize(count)};
+    auto [dst_pointer, index_offset] = buffer_cache.ReserveMemory(size);
+
+    for (u32 primitive = 0; primitive < count / 4; ++primitive) {
+        for (const u32 corner : QUAD_MAP) {
+            const u32 index = first + primitive * 4 + corner;
+            std::memcpy(dst_pointer, &index, sizeof(index));
+            dst_pointer += sizeof(index);
+        }
+    }
+
+    return index_offset;
+}
+
+/// Rewrites an indexed quad draw as triangles: copies the `index_size`-byte guest indices found
+/// at `gpu_addr` into memory reserved from the buffer cache, reordered per QUAD_MAP.
+/// Returns the offset of the copied indices as reported by ReserveMemory.
+/// NOTE(review): the reservation is sized for GLuint indices, so `index_size` is presumably
+/// never larger than sizeof(GLuint) - confirm against callers.
+GLintptr PrimitiveAssembler::MakeQuadIndexed(GPUVAddr gpu_addr, std::size_t index_size, u32 count) {
+    const std::size_t map_size{CalculateQuadSize(count)};
+    auto [dst_pointer, index_offset] = buffer_cache.ReserveMemory(map_size);
+
+    auto& memory_manager = Core::System::GetInstance().GPU().MemoryManager();
+    const u8* source{memory_manager.GetPointer(gpu_addr)};
+
+    for (u32 primitive = 0; primitive < count / 4; ++primitive) {
+        for (const u32 corner : QUAD_MAP) {
+            const u32 index = primitive * 4 + corner;
+            const u8* src_offset = source + (index * index_size);
+
+            std::memcpy(dst_pointer, src_offset, index_size);
+            dst_pointer += index_size;
+        }
+    }
+
+    return index_offset;
+}
+
+} // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_primitive_assembler.h
+++ b/src/video_core/renderer_opengl/gl_primitive_assembler.h
@@ -0,0 +1,31 @@
+// Copyright 2018 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <glad/glad.h>
+
+#include "common/common_types.h"
+
+namespace OpenGL {
+
+class OGLBufferCache;
+
+class PrimitiveAssembler {
+public:
+    explicit PrimitiveAssembler(OGLBufferCache& buffer_cache);
+    ~PrimitiveAssembler();
+
+    /// Calculates the size required by MakeQuadArray and MakeQuadIndexed.
+    std::size_t CalculateQuadSize(u32 count) const;
+    /// Emits triangle indices for `count` quad vertices from `first`; returns their offset.
+    GLintptr MakeQuadArray(u32 first, u32 count);
+    /// Copies the guest indices at `gpu_addr` in triangle order; returns their offset.
+    GLintptr MakeQuadIndexed(GPUVAddr gpu_addr, std::size_t index_size, u32 count);
+
+private:
+    OGLBufferCache& buffer_cache;
+};
+
+} // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -29,8 +29,10 @@
 namespace OpenGL {

 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
-using PixelFormat = VideoCore::Surface::PixelFormat;
-using SurfaceType = VideoCore::Surface::SurfaceType;
+
+using VideoCore::Surface::PixelFormat;
+using VideoCore::Surface::SurfaceTarget;
+using VideoCore::Surface::SurfaceType;

 MICROPROFILE_DEFINE(OpenGL_VAO, "OpenGL", "Vertex Format Setup", MP_RGB(128, 128, 192));
 MICROPROFILE_DEFINE(OpenGL_VB, "OpenGL", "Vertex Buffer Setup", MP_RGB(128, 128, 192));
@@ -98,11 +100,9 @@ struct FramebufferCacheKey {
    }
 };

-RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
-                                   ScreenInfo& info)
-    : res_cache{*this}, shader_cache{*this, system, emu_window, device},
-      global_cache{*this}, system{system}, screen_info{info},
-      buffer_cache(*this, STREAM_BUFFER_SIZE) {
+RasterizerOpenGL::RasterizerOpenGL(Core::System& system, ScreenInfo& info)
+    : res_cache{*this}, shader_cache{*this, system, device}, global_cache{*this}, system{system},
+      screen_info{info}, buffer_cache(*this, STREAM_BUFFER_SIZE) {
    OpenGLState::ApplyDefaultState();

    shader_program_manager = std::make_unique<GLShader::ProgramManager>();
@@ -121,11 +121,6 @@ void RasterizerOpenGL::CheckExtensions() {
            Render_OpenGL,
            "Anisotropic filter is not supported! This can cause graphical issues in some games.");
    }
-    if (!GLAD_GL_ARB_buffer_storage) {
-        LOG_WARNING(
-            Render_OpenGL,
-            "Buffer storage control is not supported! This can cause performance degradation.");
-    }
 }

 GLuint RasterizerOpenGL::SetupVertexFormat() {
@@ -246,6 +241,29 @@ DrawParameters RasterizerOpenGL::SetupDraw() {
    DrawParameters params{};
    params.current_instance = gpu.state.current_instance;

+    if (regs.draw.topology == Maxwell::PrimitiveTopology::Quads) {
+        MICROPROFILE_SCOPE(OpenGL_PrimitiveAssembly);
+
+        params.use_indexed = true;
+        params.primitive_mode = GL_TRIANGLES;
+
+        if (is_indexed) {
+            params.index_format = MaxwellToGL::IndexFormat(regs.index_array.format);
+            params.count = (regs.index_array.count / 4) * 6;
+            params.index_buffer_offset = primitive_assembler.MakeQuadIndexed(
+                regs.index_array.IndexStart(), regs.index_array.FormatSizeInBytes(),
+                regs.index_array.count);
+            params.base_vertex = static_cast<GLint>(regs.vb_element_base);
+        } else {
+            // MakeQuadArray always generates u32 indexes; it takes the source vertex count
+            params.index_format = GL_UNSIGNED_INT;
+            params.count = (regs.vertex_buffer.count / 4) * 6;
+            params.index_buffer_offset = primitive_assembler.MakeQuadArray(
+                regs.vertex_buffer.first, regs.vertex_buffer.count);
+        }
+        return params;
+    }
+
    params.use_indexed = is_indexed;
    params.primitive_mode = MaxwellToGL::PrimitiveTopology(regs.draw.topology);

@@ -302,8 +320,14 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
                                 static_cast<GLsizeiptr>(sizeof(ubo)));

        Shader shader{shader_cache.GetStageProgram(program)};
-        const auto [program_handle, next_bindings] =
-            shader->GetProgramHandle(primitive_mode, base_bindings);
+
+        const auto stage_enum{static_cast<Maxwell::ShaderStage>(stage)};
+        SetupConstBuffers(stage_enum, shader, base_bindings);
+        SetupGlobalRegions(stage_enum, shader, base_bindings);
+        const auto texture_buffer_usage{SetupTextures(stage_enum, shader, base_bindings)};
+
+        const ProgramVariant variant{base_bindings, primitive_mode, texture_buffer_usage};
+        const auto [program_handle, next_bindings] = shader->GetProgramHandle(variant);

        switch (program) {
        case Maxwell::ShaderProgram::VertexA:
@@ -321,11 +345,6 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
                              shader_config.enable.Value(), shader_config.offset);
        }

-        const auto stage_enum = static_cast<Maxwell::ShaderStage>(stage);
-        SetupConstBuffers(stage_enum, shader, program_handle, base_bindings);
-        SetupGlobalRegions(stage_enum, shader, program_handle, base_bindings);
-        SetupTextures(stage_enum, shader, program_handle, base_bindings);
-
        // Workaround for Intel drivers.
        // When a clip distance is enabled but not set in the shader it crops parts of the screen
        // (sometimes it's half the screen, sometimes three quarters). To avoid this, enable the
@@ -663,19 +682,30 @@ void RasterizerOpenGL::DrawArrays() {
    SyncCullMode();
    SyncPrimitiveRestart();
    SyncScissorTest(state);
+    // Alpha Testing is synced on shaders.
    SyncTransformFeedback();
    SyncPointState();
+    CheckAlphaTests();
    SyncPolygonOffset();
-    SyncAlphaTest();
+    // TODO(bunnei): Sync framebuffer_scale uniform here
+    // TODO(bunnei): Sync scissorbox uniform(s) here

    // Draw the vertex batch
    const bool is_indexed = accelerate_draw == AccelDraw::Indexed;

    std::size_t buffer_size = CalculateVertexArraysSize();

-    // Add space for index buffer
-    if (is_indexed) {
-        buffer_size = Common::AlignUp(buffer_size, 4) + CalculateIndexBufferSize();
+    // Add space for index buffer (keeping in mind non-core primitives)
+    switch (regs.draw.topology) {
+    case Maxwell::PrimitiveTopology::Quads:
+        buffer_size = Common::AlignUp(buffer_size, 4) + primitive_assembler.CalculateQuadSize(
+            is_indexed ? regs.index_array.count : regs.vertex_buffer.count);
+        break;
+    default:
+        if (is_indexed) {
+            buffer_size = Common::AlignUp(buffer_size, 4) + CalculateIndexBufferSize();
+        }
+        break;
    }

    // Uniform space for the 5 shader stages
@@ -777,8 +807,7 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
 }

 void RasterizerOpenGL::SetupConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
-                                         const Shader& shader, GLuint program_handle,
-                                         BaseBindings base_bindings) {
+                                         const Shader& shader, BaseBindings base_bindings) {
    MICROPROFILE_SCOPE(OpenGL_UBO);
    const auto& gpu = system.GPU();
    const auto& maxwell3d = gpu.Maxwell3D();
@@ -825,8 +854,7 @@ void RasterizerOpenGL::SetupConstBuffers(Tegra::Engines::Maxwell3D::Regs::Shader
 }

 void RasterizerOpenGL::SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
-                                          const Shader& shader, GLenum primitive_mode,
-                                          BaseBindings base_bindings) {
+                                          const Shader& shader, BaseBindings base_bindings) {
    const auto& entries = shader->GetShaderEntries().global_memory_entries;
    for (std::size_t bindpoint = 0; bindpoint < entries.size(); ++bindpoint) {
        const auto& entry{entries[bindpoint]};
@@ -839,8 +867,8 @@ void RasterizerOpenGL::SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::Shade
    }
 }

-void RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, const Shader& shader,
-                                     GLuint program_handle, BaseBindings base_bindings) {
+TextureBufferUsage RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, const Shader& shader,
+                                                   BaseBindings base_bindings) {
    MICROPROFILE_SCOPE(OpenGL_Texture);
    const auto& gpu = system.GPU();
    const auto& maxwell3d = gpu.Maxwell3D();
@@ -849,6 +877,8 @@ void RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, const Shader& s
    ASSERT_MSG(base_bindings.sampler + entries.size() <= std::size(state.texture_units),
               "Exceeded the number of active textures.");

+    TextureBufferUsage texture_buffer_usage{0};
+
    for (u32 bindpoint = 0; bindpoint < entries.size(); ++bindpoint) {
        const auto& entry = entries[bindpoint];
        Tegra::Texture::FullTextureInfo texture;
@@ -862,18 +892,25 @@ void RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, const Shader& s
        }
        const u32 current_bindpoint = base_bindings.sampler + bindpoint;

-        state.texture_units[current_bindpoint].sampler = sampler_cache.GetSampler(texture.tsc);
+        auto& unit{state.texture_units[current_bindpoint]};
+        unit.sampler = sampler_cache.GetSampler(texture.tsc);

        if (Surface surface = res_cache.GetTextureSurface(texture, entry); surface) {
-            state.texture_units[current_bindpoint].texture =
-                surface->Texture(entry.IsArray()).handle;
+            if (surface->GetSurfaceParams().target == SurfaceTarget::TextureBuffer) {
+                // Record that this texture is a texture buffer.
+                texture_buffer_usage.set(bindpoint);
+            }
+
+            unit.texture = surface->Texture(entry.IsArray()).handle;
            surface->UpdateSwizzle(texture.tic.x_source, texture.tic.y_source, texture.tic.z_source,
                                   texture.tic.w_source);
        } else {
            // Can occur when texture addr is null or its memory is unmapped/invalid
-            state.texture_units[current_bindpoint].texture = 0;
+            unit.texture = 0;
        }
    }
+
+    return texture_buffer_usage;
 }

 void RasterizerOpenGL::SyncViewport(OpenGLState& current_state) {
@@ -1103,9 +1140,7 @@ void RasterizerOpenGL::SyncTransformFeedback() {

 void RasterizerOpenGL::SyncPointState() {
    const auto& regs = system.GPU().Maxwell3D().regs;
-    // Limit the point size to 1 since nouveau sometimes sets a point size of 0 (and that's invalid
-    // in OpenGL).
-    state.point.size = std::max(1.0f, regs.point_size);
+    state.point.size = regs.point_size;
 }

 void RasterizerOpenGL::SyncPolygonOffset() {
@@ -1118,17 +1153,10 @@ void RasterizerOpenGL::SyncPolygonOffset() {
    state.polygon_offset.clamp = regs.polygon_offset_clamp;
 }

-void RasterizerOpenGL::SyncAlphaTest() {
+void RasterizerOpenGL::CheckAlphaTests() {
    const auto& regs = system.GPU().Maxwell3D().regs;
    UNIMPLEMENTED_IF_MSG(regs.alpha_test_enabled != 0 && regs.rt_control.count > 1,
                         "Alpha Testing is enabled with more than one rendertarget");
-
-    state.alpha_test.enabled = regs.alpha_test_enabled;
-    if (!state.alpha_test.enabled) {
-        return;
-    }
-    state.alpha_test.func = MaxwellToGL::ComparisonOp(regs.alpha_test_func);
-    state.alpha_test.ref = regs.alpha_test_ref;
 }

 } // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -23,6 +23,7 @@
 #include "video_core/renderer_opengl/gl_buffer_cache.h"
 #include "video_core/renderer_opengl/gl_device.h"
 #include "video_core/renderer_opengl/gl_global_cache.h"
+#include "video_core/renderer_opengl/gl_primitive_assembler.h"
 #include "video_core/renderer_opengl/gl_rasterizer_cache.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_sampler_cache.h"
@@ -47,8 +48,7 @@ struct FramebufferCacheKey;

 class RasterizerOpenGL : public VideoCore::RasterizerInterface {
 public:
-    explicit RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
-                              ScreenInfo& info);
+    explicit RasterizerOpenGL(Core::System& system, ScreenInfo& info);
    ~RasterizerOpenGL() override;

    void DrawArrays() override;
@@ -106,16 +106,16 @@ private:

    /// Configures the current constbuffers to use for the draw command.
    void SetupConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, const Shader& shader,
-                           GLuint program_handle, BaseBindings base_bindings);
+                           BaseBindings base_bindings);

    /// Configures the current global memory entries to use for the draw command.
    void SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
-                            const Shader& shader, GLenum primitive_mode,
-                            BaseBindings base_bindings);
+                            const Shader& shader, BaseBindings base_bindings);

-    /// Configures the current textures to use for the draw command.
-    void SetupTextures(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage, const Shader& shader,
-                       GLuint program_handle, BaseBindings base_bindings);
+    /// Configures the current textures to use for the draw command. Returns shaders texture buffer
+    /// usage.
+    TextureBufferUsage SetupTextures(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
+                                     const Shader& shader, BaseBindings base_bindings);

    /// Syncs the viewport and depth range to match the guest state
    void SyncViewport(OpenGLState& current_state);
@@ -166,8 +166,8 @@ private:
    /// Syncs the polygon offsets
    void SyncPolygonOffset();

-    /// Syncs the alpha test state to match the guest state
-    void SyncAlphaTest();
+    /// Check asserts for alpha testing.
+    void CheckAlphaTests();

    /// Check for extension that are not strictly required
    /// but are needed for correct emulation
@@ -196,6 +196,7 @@ private:

    static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024;
    OGLBufferCache buffer_cache;
+    PrimitiveAssembler primitive_assembler{buffer_cache};

    BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER};
    BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER};
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -140,7 +140,7 @@ std::size_t SurfaceParams::InnerMemorySize(bool force_gl, bool layer_only,

    params.width = Common::AlignUp(config.tic.Width(), GetCompressionFactor(params.pixel_format));
    params.height = Common::AlignUp(config.tic.Height(), GetCompressionFactor(params.pixel_format));
-    if (!params.is_tiled) {
+    if (config.tic.IsLineal()) {
        params.pitch = config.tic.Pitch();
    }
    params.unaligned_height = config.tic.Height();
@@ -149,6 +149,7 @@ std::size_t SurfaceParams::InnerMemorySize(bool force_gl, bool layer_only,

    switch (params.target) {
    case SurfaceTarget::Texture1D:
+    case SurfaceTarget::TextureBuffer:
    case SurfaceTarget::Texture2D:
        params.depth = 1;
        break;
@@ -389,6 +390,8 @@ static GLenum SurfaceTargetToGL(SurfaceTarget target) {
    switch (target) {
    case SurfaceTarget::Texture1D:
        return GL_TEXTURE_1D;
+    case SurfaceTarget::TextureBuffer:
+        return GL_TEXTURE_BUFFER;
    case SurfaceTarget::Texture2D:
        return GL_TEXTURE_2D;
    case SurfaceTarget::Texture3D:
@@ -600,29 +603,35 @@ CachedSurface::CachedSurface(const SurfaceParams& params)

    switch (params.target) {
    case SurfaceTarget::Texture1D:
-        glTextureStorage1D(texture.handle, params.max_mip_level, format_tuple.internal_format,
-                           width);
+        glTextureStorage1D(texture.handle, params.max_mip_level, gl_internal_format, width);
+        break;
+    case SurfaceTarget::TextureBuffer:
+        texture_buffer.Create();
+        glNamedBufferStorage(texture_buffer.handle,
+                             params.width * GetBytesPerPixel(params.pixel_format), nullptr,
+                             GL_DYNAMIC_STORAGE_BIT);
+        glTextureBuffer(texture.handle, gl_internal_format, texture_buffer.handle);
        break;
    case SurfaceTarget::Texture2D:
    case SurfaceTarget::TextureCubemap:
-        glTextureStorage2D(texture.handle, params.max_mip_level, format_tuple.internal_format,
-                           width, height);
+        glTextureStorage2D(texture.handle, params.max_mip_level, gl_internal_format, width, height);
        break;
    case SurfaceTarget::Texture3D:
    case SurfaceTarget::Texture2DArray:
    case SurfaceTarget::TextureCubeArray:
-        glTextureStorage3D(texture.handle, params.max_mip_level, format_tuple.internal_format,
-                           width, height, params.depth);
+        glTextureStorage3D(texture.handle, params.max_mip_level, gl_internal_format, width, height,
+                           params.depth);
        break;
    default:
        LOG_CRITICAL(Render_OpenGL, "Unimplemented surface target={}",
                     static_cast<u32>(params.target));
        UNREACHABLE();
-        glTextureStorage2D(texture.handle, params.max_mip_level, format_tuple.internal_format,
-                           width, height);
+        glTextureStorage2D(texture.handle, params.max_mip_level, gl_internal_format, width, height);
    }

-    ApplyTextureDefaults(texture.handle, params.max_mip_level);
+    if (params.target != SurfaceTarget::TextureBuffer) {
+        ApplyTextureDefaults(texture.handle, params.max_mip_level);
+    }

    OpenGL::LabelGLObject(GL_TEXTURE, texture.handle, params.gpu_addr, params.IdentityString());
 }
@@ -785,6 +794,13 @@ void CachedSurface::UploadGLMipmapTexture(RasterizerTemporaryMemory& res_cache_t
            glTextureSubImage1D(texture.handle, mip_map, x0, static_cast<GLsizei>(rect.GetWidth()),
                                tuple.format, tuple.type, &gl_buffer[mip_map][buffer_offset]);
            break;
+        case SurfaceTarget::TextureBuffer:
+            ASSERT(mip_map == 0);
+            glNamedBufferSubData(texture_buffer.handle, x0,
+                                 static_cast<GLsizeiptr>(rect.GetWidth()) *
+                                     GetBytesPerPixel(params.pixel_format),
+                                 &gl_buffer[mip_map][buffer_offset]);
+            break;
        case SurfaceTarget::Texture2D:
            glTextureSubImage2D(texture.handle, mip_map, x0, y0,
                                static_cast<GLsizei>(rect.GetWidth()),
@@ -860,6 +876,9 @@ void CachedSurface::UpdateSwizzle(Tegra::Texture::SwizzleSource swizzle_x,
                                  Tegra::Texture::SwizzleSource swizzle_y,
                                  Tegra::Texture::SwizzleSource swizzle_z,
                                  Tegra::Texture::SwizzleSource swizzle_w) {
+    if (params.target == SurfaceTarget::TextureBuffer) {
+        return;
+    }
    const GLenum new_x = MaxwellToGL::SwizzleSource(swizzle_x);
    const GLenum new_y = MaxwellToGL::SwizzleSource(swizzle_y);
    const GLenum new_z = MaxwellToGL::SwizzleSource(swizzle_z);
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
@@ -250,6 +250,8 @@ struct SurfaceParams {
        switch (target) {
        case SurfaceTarget::Texture1D:
            return "1D";
+        case SurfaceTarget::TextureBuffer:
+            return "Buffer";
        case SurfaceTarget::Texture2D:
            return "2D";
        case SurfaceTarget::Texture3D:
@@ -439,6 +441,7 @@ private:

    OGLTexture texture;
    OGLTexture discrepant_view;
+    OGLBuffer texture_buffer;
    SurfaceParams params{};
    GLenum gl_target{};
    GLenum gl_internal_format{};
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -2,14 +2,10 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

-#include <mutex>
-#include <thread>
 #include <boost/functional/hash.hpp>
 #include "common/assert.h"
 #include "common/hash.h"
-#include "common/scope_exit.h"
 #include "core/core.h"
-#include "core/frontend/emu_window.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/memory_manager.h"
 #include "video_core/renderer_opengl/gl_rasterizer.h"
@@ -168,10 +164,13 @@ GLShader::ProgramResult CreateProgram(const Device& device, Maxwell::ShaderProgr
 }

 CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEntries& entries,
-                               Maxwell::ShaderProgram program_type, BaseBindings base_bindings,
-                               GLenum primitive_mode, bool hint_retrievable = false) {
-    std::string source = "#version 430 core\n"
-                         "#extension GL_ARB_separate_shader_objects : enable\n\n";
+                               Maxwell::ShaderProgram program_type, const ProgramVariant& variant,
+                               bool hint_retrievable = false) {
+    auto base_bindings{variant.base_bindings};
+    const auto primitive_mode{variant.primitive_mode};
+    const auto texture_buffer_usage{variant.texture_buffer_usage};
+
+    std::string source = "#version 430 core\n";
    source += fmt::format("#define EMULATION_UBO_BINDING {}\n", base_bindings.cbuf++);

    for (const auto& cbuf : entries.const_buffers) {
@@ -186,6 +185,18 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn
        source += fmt::format("#define SAMPLER_BINDING_{} {}\n", sampler.GetIndex(),
                              base_bindings.sampler++);
    }
+    for (const auto& image : entries.images) {
+        source +=
+            fmt::format("#define IMAGE_BINDING_{} {}\n", image.GetIndex(), base_bindings.image++);
+    }
+
+    // Transform 1D textures to texture samplers by declaring its preprocessor macros.
+    // Each directive must end in a newline or it swallows whatever is appended next.
+    for (std::size_t i = 0; i < texture_buffer_usage.size(); ++i) {
+        if (texture_buffer_usage.test(i)) {
+            source += fmt::format("#define SAMPLER_{}_IS_BUFFER\n", i);
+        }
+    }

    if (program_type == Maxwell::ShaderProgram::Geometry) {
        const auto [glsl_topology, debug_name, max_vertices] =
@@ -261,20 +272,18 @@ CachedShader::CachedShader(VAddr cpu_addr, u64 unique_identifier,
    shader_length = entries.shader_length;
 }

-std::tuple<GLuint, BaseBindings> CachedShader::GetProgramHandle(GLenum primitive_mode,
-                                                                BaseBindings base_bindings) {
+std::tuple<GLuint, BaseBindings> CachedShader::GetProgramHandle(const ProgramVariant& variant) {
    GLuint handle{};
    if (program_type == Maxwell::ShaderProgram::Geometry) {
-        handle = GetGeometryShader(primitive_mode, base_bindings);
+        handle = GetGeometryShader(variant);
    } else {
-        const auto [entry, is_cache_miss] = programs.try_emplace(base_bindings);
+        const auto [entry, is_cache_miss] = programs.try_emplace(variant);
        auto& program = entry->second;
        if (is_cache_miss) {
-            program = TryLoadProgram(primitive_mode, base_bindings);
+            program = TryLoadProgram(variant);
            if (!program) {
-                program =
-                    SpecializeShader(code, entries, program_type, base_bindings, primitive_mode);
-                disk_cache.SaveUsage(GetUsage(primitive_mode, base_bindings));
+                program = SpecializeShader(code, entries, program_type, variant);
+                disk_cache.SaveUsage(GetUsage(variant));
            }

            LabelGLObject(GL_PROGRAM, program->handle, cpu_addr);
@@ -283,6 +292,7 @@ std::tuple<GLuint, BaseBindings> CachedShader::GetProgramHandle(GLenum primitive
        handle = program->handle;
    }

+    auto base_bindings{variant.base_bindings};
    base_bindings.cbuf += static_cast<u32>(entries.const_buffers.size()) + RESERVED_UBOS;
    base_bindings.gmem += static_cast<u32>(entries.global_memory_entries.size());
    base_bindings.sampler += static_cast<u32>(entries.samplers.size());
@@ -290,43 +300,42 @@ std::tuple<GLuint, BaseBindings> CachedShader::GetProgramHandle(GLenum primitive
    return {handle, base_bindings};
 }

-GLuint CachedShader::GetGeometryShader(GLenum primitive_mode, BaseBindings base_bindings) {
-    const auto [entry, is_cache_miss] = geometry_programs.try_emplace(base_bindings);
+GLuint CachedShader::GetGeometryShader(const ProgramVariant& variant) {
+    const auto [entry, is_cache_miss] = geometry_programs.try_emplace(variant);
    auto& programs = entry->second;

-    switch (primitive_mode) {
+    switch (variant.primitive_mode) {
    case GL_POINTS:
-        return LazyGeometryProgram(programs.points, base_bindings, primitive_mode);
+        return LazyGeometryProgram(programs.points, variant);
    case GL_LINES:
    case GL_LINE_STRIP:
-        return LazyGeometryProgram(programs.lines, base_bindings, primitive_mode);
+        return LazyGeometryProgram(programs.lines, variant);
    case GL_LINES_ADJACENCY:
    case GL_LINE_STRIP_ADJACENCY:
-        return LazyGeometryProgram(programs.lines_adjacency, base_bindings, primitive_mode);
+        return LazyGeometryProgram(programs.lines_adjacency, variant);
    case GL_TRIANGLES:
    case GL_TRIANGLE_STRIP:
    case GL_TRIANGLE_FAN:
-        return LazyGeometryProgram(programs.triangles, base_bindings, primitive_mode);
+        return LazyGeometryProgram(programs.triangles, variant);
    case GL_TRIANGLES_ADJACENCY:
    case GL_TRIANGLE_STRIP_ADJACENCY:
-        return LazyGeometryProgram(programs.triangles_adjacency, base_bindings, primitive_mode);
+        return LazyGeometryProgram(programs.triangles_adjacency, variant);
    default:
        UNREACHABLE_MSG("Unknown primitive mode.");
-        return LazyGeometryProgram(programs.points, base_bindings, primitive_mode);
+        return LazyGeometryProgram(programs.points, variant);
    }
 }

-GLuint CachedShader::LazyGeometryProgram(CachedProgram& target_program, BaseBindings base_bindings,
-                                         GLenum primitive_mode) {
+GLuint CachedShader::LazyGeometryProgram(CachedProgram& target_program,
+                                         const ProgramVariant& variant) {
    if (target_program) {
        return target_program->handle;
    }
-    const auto [glsl_name, debug_name, vertices] = GetPrimitiveDescription(primitive_mode);
-    target_program = TryLoadProgram(primitive_mode, base_bindings);
+    const auto [glsl_name, debug_name, vertices] = GetPrimitiveDescription(variant.primitive_mode);
+    target_program = TryLoadProgram(variant);
    if (!target_program) {
-        target_program =
-            SpecializeShader(code, entries, program_type, base_bindings, primitive_mode);
-        disk_cache.SaveUsage(GetUsage(primitive_mode, base_bindings));
+        target_program = SpecializeShader(code, entries, program_type, variant);
+        disk_cache.SaveUsage(GetUsage(variant));
    }

    LabelGLObject(GL_PROGRAM, target_program->handle, cpu_addr, debug_name);
@@ -334,23 +343,24 @@ GLuint CachedShader::LazyGeometryProgram(CachedProgram& target_program, BaseBind
    return target_program->handle;
 };

-CachedProgram CachedShader::TryLoadProgram(GLenum primitive_mode,
-                                           BaseBindings base_bindings) const {
-    const auto found = precompiled_programs.find(GetUsage(primitive_mode, base_bindings));
+CachedProgram CachedShader::TryLoadProgram(const ProgramVariant& variant) const {
+    const auto found = precompiled_programs.find(GetUsage(variant));
    if (found == precompiled_programs.end()) {
        return {};
    }
    return found->second;
 }

-ShaderDiskCacheUsage CachedShader::GetUsage(GLenum primitive_mode,
-                                            BaseBindings base_bindings) const {
-    return {unique_identifier, base_bindings, primitive_mode};
+ShaderDiskCacheUsage CachedShader::GetUsage(const ProgramVariant& variant) const {
+    ShaderDiskCacheUsage usage;
+    usage.unique_identifier = unique_identifier;
+    usage.variant = variant;
+    return usage;
 }

 ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system,
-                                     Core::Frontend::EmuWindow& emu_window, const Device& device)
-    : RasterizerCache{rasterizer}, emu_window{emu_window}, device{device}, disk_cache{system} {}
+                                     const Device& device)
+    : RasterizerCache{rasterizer}, device{device}, disk_cache{system} {}

 void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
                                      const VideoCore::DiskResourceLoadCallback& callback) {
@@ -358,107 +368,62 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
    if (!transferable) {
        return;
    }
-    const auto [raws, shader_usages] = *transferable;
+    const auto [raws, usages] = *transferable;

    auto [decompiled, dumps] = disk_cache.LoadPrecompiled();

    const auto supported_formats{GetSupportedFormats()};
-    const auto unspecialized_shaders{
+    const auto unspecialized{
        GenerateUnspecializedShaders(stop_loading, callback, raws, decompiled)};
-    if (stop_loading) {
+    if (stop_loading)
        return;
-    }

    // Track if precompiled cache was altered during loading to know if we have to serialize the
    // virtual precompiled cache file back to the hard drive
    bool precompiled_cache_altered = false;

-    // Inform the frontend about shader build initialization
-    if (callback) {
-        callback(VideoCore::LoadCallbackStage::Build, 0, shader_usages.size());
-    }
+    // Build shaders
+    if (callback)
+        callback(VideoCore::LoadCallbackStage::Build, 0, usages.size());
+    for (std::size_t i = 0; i < usages.size(); ++i) {
+        if (stop_loading)
+            return;

-    std::mutex mutex;
-    std::size_t built_shaders = 0; // It doesn't have be atomic since it's used behind a mutex
-    std::atomic_bool compilation_failed = false;
+        const auto& usage{usages[i]};
+        LOG_INFO(Render_OpenGL, "Building shader {:016x} ({} of {})", usage.unique_identifier,
+                 i + 1, usages.size());

-    const auto Worker = [&](Core::Frontend::GraphicsContext* context, std::size_t begin,
-                            std::size_t end, const std::vector<ShaderDiskCacheUsage>& shader_usages,
-                            const ShaderDumpsMap& dumps) {
-        context->MakeCurrent();
-        SCOPE_EXIT({ return context->DoneCurrent(); });
+        const auto& unspec{unspecialized.at(usage.unique_identifier)};
+        const auto dump_it = dumps.find(usage);

-        for (std::size_t i = begin; i < end; ++i) {
-            if (stop_loading || compilation_failed) {
-                return;
-            }
-            const auto& usage{shader_usages[i]};
-            LOG_INFO(Render_OpenGL, "Building shader {:016x} (index {} of {})",
-                     usage.unique_identifier, i, shader_usages.size());
-
-            const auto& unspecialized{unspecialized_shaders.at(usage.unique_identifier)};
-            const auto dump{dumps.find(usage)};
-
-            CachedProgram shader;
-            if (dump != dumps.end()) {
-                // If the shader is dumped, attempt to load it with
-                shader = GeneratePrecompiledProgram(dump->second, supported_formats);
-                if (!shader) {
-                    compilation_failed = true;
-                    return;
-                }
-            }
+        CachedProgram shader;
+        if (dump_it != dumps.end()) {
+            // If the shader is dumped, attempt to load it with
+            shader = GeneratePrecompiledProgram(dump_it->second, supported_formats);
            if (!shader) {
-                shader = SpecializeShader(unspecialized.code, unspecialized.entries,
-                                          unspecialized.program_type, usage.bindings,
-                                          usage.primitive, true);
+                // Invalidate the precompiled cache if a shader dumped shader was rejected
+                disk_cache.InvalidatePrecompiled();
+                precompiled_cache_altered = true;
+                dumps.clear();
            }
-
-            std::scoped_lock lock(mutex);
-            if (callback) {
-                callback(VideoCore::LoadCallbackStage::Build, ++built_shaders,
-                         shader_usages.size());
-            }
-
-            precompiled_programs.emplace(usage, std::move(shader));
        }
-    };
+        if (!shader) {
+            shader = SpecializeShader(unspec.code, unspec.entries, unspec.program_type,
+                                      usage.variant, true);
+        }
+        precompiled_programs.insert({usage, std::move(shader)});

-    const auto num_workers{static_cast<std::size_t>(std::thread::hardware_concurrency() + 1)};
-    const std::size_t bucket_size{shader_usages.size() / num_workers};
-    std::vector<std::unique_ptr<Core::Frontend::GraphicsContext>> contexts(num_workers);
-    std::vector<std::thread> threads(num_workers);
-    for (std::size_t i = 0; i < num_workers; ++i) {
-        const bool is_last_worker = i + 1 == num_workers;
-        const std::size_t start{bucket_size * i};
-        const std::size_t end{is_last_worker ? shader_usages.size() : start + bucket_size};
-
-        // On some platforms the shared context has to be created from the GUI thread
-        contexts[i] = emu_window.CreateSharedContext();
-        threads[i] = std::thread(Worker, contexts[i].get(), start, end, shader_usages, dumps);
-    }
-    for (auto& thread : threads) {
-        thread.join();
-    }
-
-    if (compilation_failed) {
-        // Invalidate the precompiled cache if a shader dumped shader was rejected
-        disk_cache.InvalidatePrecompiled();
-        dumps.clear();
-        precompiled_cache_altered = true;
-        return;
-    }
-    if (stop_loading) {
-        return;
+        if (callback)
+            callback(VideoCore::LoadCallbackStage::Build, i + 1, usages.size());
    }

    // TODO(Rodrigo): Do state tracking for transferable shaders and do a dummy draw before
    // precompiling them

-    for (std::size_t i = 0; i < shader_usages.size(); ++i) {
-        const auto& usage{shader_usages[i]};
+    for (std::size_t i = 0; i < usages.size(); ++i) {
+        const auto& usage{usages[i]};
        if (dumps.find(usage) == dumps.end()) {
-            const auto& program{precompiled_programs.at(usage)};
+            const auto& program = precompiled_programs.at(usage);
            disk_cache.SaveDump(usage, program->handle);
            precompiled_cache_altered = true;
        }
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -6,6 +6,7 @@

 #include <array>
 #include <atomic>
+#include <bitset>
 #include <memory>
 #include <set>
 #include <tuple>
@@ -24,10 +25,6 @@ namespace Core {
 class System;
 }

-namespace Core::Frontend {
-class EmuWindow;
-}
-
 namespace OpenGL {

 class CachedShader;
@@ -67,8 +64,7 @@ public:
    }

    /// Gets the GL program handle for the shader
-    std::tuple<GLuint, BaseBindings> GetProgramHandle(GLenum primitive_mode,
-                                                      BaseBindings base_bindings);
+    std::tuple<GLuint, BaseBindings> GetProgramHandle(const ProgramVariant& variant);

 private:
    // Geometry programs. These are needed because GLSL needs an input topology but it's not
@@ -82,15 +78,14 @@ private:
        CachedProgram triangles_adjacency;
    };

-    GLuint GetGeometryShader(GLenum primitive_mode, BaseBindings base_bindings);
+    GLuint GetGeometryShader(const ProgramVariant& variant);

    /// Generates a geometry shader or returns one that already exists.
-    GLuint LazyGeometryProgram(CachedProgram& target_program, BaseBindings base_bindings,
-                               GLenum primitive_mode);
+    GLuint LazyGeometryProgram(CachedProgram& target_program, const ProgramVariant& variant);

-    CachedProgram TryLoadProgram(GLenum primitive_mode, BaseBindings base_bindings) const;
+    CachedProgram TryLoadProgram(const ProgramVariant& variant) const;

-    ShaderDiskCacheUsage GetUsage(GLenum primitive_mode, BaseBindings base_bindings) const;
+    ShaderDiskCacheUsage GetUsage(const ProgramVariant& variant) const;

    u8* host_ptr{};
    VAddr cpu_addr{};
@@ -104,8 +99,8 @@ private:

    std::string code;

-    std::unordered_map<BaseBindings, CachedProgram> programs;
-    std::unordered_map<BaseBindings, GeometryPrograms> geometry_programs;
+    std::unordered_map<ProgramVariant, CachedProgram> programs;
+    std::unordered_map<ProgramVariant, GeometryPrograms> geometry_programs;

    std::unordered_map<u32, GLuint> cbuf_resource_cache;
    std::unordered_map<u32, GLuint> gmem_resource_cache;
@@ -115,7 +110,7 @@ private:
 class ShaderCacheOpenGL final : public RasterizerCache<Shader> {
 public:
    explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system,
-                               Core::Frontend::EmuWindow& emu_window, const Device& device);
+                               const Device& device);

    /// Loads disk cache for the current game
    void LoadDiskCache(const std::atomic_bool& stop_loading,
@@ -137,13 +132,13 @@ private:
    CachedProgram GeneratePrecompiledProgram(const ShaderDiskCacheDump& dump,
                                             const std::set<GLenum>& supported_formats);

-    Core::Frontend::EmuWindow& emu_window;
    const Device& device;
-    ShaderDiskCacheOpenGL disk_cache;

+    std::array<Shader, Maxwell::MaxShaderProgram> last_shaders;
+
+    ShaderDiskCacheOpenGL disk_cache;
    PrecompiledShaders precompiled_shaders;
    PrecompiledPrograms precompiled_programs;
-    std::array<Shader, Maxwell::MaxShaderProgram> last_shaders;
 };

 } // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.h
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h
@@ -27,6 +27,7 @@ struct ShaderEntries;
 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 using ProgramResult = std::pair<std::string, ShaderEntries>;
 using SamplerEntry = VideoCommon::Shader::Sampler;
+using ImageEntry = VideoCommon::Shader::Image;

 class ConstBufferEntry : public VideoCommon::Shader::ConstBuffer {
 public:
@@ -74,6 +75,7 @@ struct ShaderEntries {
    std::vector<ConstBufferEntry> const_buffers;
    std::vector<SamplerEntry> samplers;
    std::vector<SamplerEntry> bindless_samplers;
+    std::vector<ImageEntry> images;
    std::vector<GlobalMemoryEntry> global_memory_entries;
    std::array<bool, Maxwell::NumClipDistances> clip_distances{};
    std::size_t shader_length{};
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
@@ -34,11 +34,11 @@ enum class PrecompiledEntryKind : u32 {
    Dump,
 };

-constexpr u32 NativeVersion = 1;
+constexpr u32 NativeVersion = 3;

 // Making sure sizes doesn't change by accident
-static_assert(sizeof(BaseBindings) == 12);
-static_assert(sizeof(ShaderDiskCacheUsage) == 24);
+static_assert(sizeof(BaseBindings) == 16);
+static_assert(sizeof(ShaderDiskCacheUsage) == 40);

 namespace {

@@ -104,9 +104,8 @@ bool ShaderDiskCacheRaw::Save(FileUtil::IOFile& file) const {
    return true;
 }

-ShaderDiskCacheOpenGL::ShaderDiskCacheOpenGL(Core::System& system) : system{system} {}
-
-ShaderDiskCacheOpenGL::~ShaderDiskCacheOpenGL() = default;
+ShaderDiskCacheOpenGL::ShaderDiskCacheOpenGL(Core::System& system)
+    : system{system}, precompiled_cache_virtual_file_offset{0} {}

 std::optional<std::pair<std::vector<ShaderDiskCacheRaw>, std::vector<ShaderDiskCacheUsage>>>
 ShaderDiskCacheOpenGL::LoadTransferable() {
@@ -183,7 +182,8 @@ ShaderDiskCacheOpenGL::LoadTransferable() {
    return {{raws, usages}};
 }

-std::pair<std::unordered_map<u64, ShaderDiskCacheDecompiled>, ShaderDumpsMap>
+std::pair<std::unordered_map<u64, ShaderDiskCacheDecompiled>,
+          std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>>
 ShaderDiskCacheOpenGL::LoadPrecompiled() {
    if (!IsUsable())
        return {};
@@ -207,7 +207,8 @@ ShaderDiskCacheOpenGL::LoadPrecompiled() {
    return *result;
 }

-std::optional<std::pair<std::unordered_map<u64, ShaderDiskCacheDecompiled>, ShaderDumpsMap>>
+std::optional<std::pair<std::unordered_map<u64, ShaderDiskCacheDecompiled>,
+                        std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>>>
 ShaderDiskCacheOpenGL::LoadPrecompiledFile(FileUtil::IOFile& file) {
    // Read compressed file from disk and decompress to virtual precompiled cache file
    std::vector<u8> compressed(file.GetSize());
@@ -228,7 +229,7 @@ ShaderDiskCacheOpenGL::LoadPrecompiledFile(FileUtil::IOFile& file) {
    }

    std::unordered_map<u64, ShaderDiskCacheDecompiled> decompiled;
-    ShaderDumpsMap dumps;
+    std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump> dumps;
    while (precompiled_cache_virtual_file_offset < precompiled_cache_virtual_file.GetSize()) {
        PrecompiledEntryKind kind{};
        if (!LoadObjectFromPrecompiled(kind)) {
@@ -242,7 +243,7 @@ ShaderDiskCacheOpenGL::LoadPrecompiledFile(FileUtil::IOFile& file) {
                return {};
            }

-            auto entry = LoadDecompiledEntry();
+            const auto entry = LoadDecompiledEntry();
            if (!entry) {
                return {};
            }
@@ -285,82 +286,97 @@ std::optional<ShaderDiskCacheDecompiled> ShaderDiskCacheOpenGL::LoadDecompiledEn
    if (!LoadObjectFromPrecompiled(code_size)) {
        return {};
    }
-
-    std::string code(code_size, '\0');
+    std::vector<u8> code(code_size);
    if (!LoadArrayFromPrecompiled(code.data(), code.size())) {
        return {};
    }

    ShaderDiskCacheDecompiled entry;
-    entry.code = std::move(code);
+    entry.code = std::string(reinterpret_cast<const char*>(code.data()), code_size);

    u32 const_buffers_count{};
    if (!LoadObjectFromPrecompiled(const_buffers_count)) {
        return {};
    }
-
    for (u32 i = 0; i < const_buffers_count; ++i) {
        u32 max_offset{};
        u32 index{};
-        bool is_indirect{};
+        u8 is_indirect{};
        if (!LoadObjectFromPrecompiled(max_offset) || !LoadObjectFromPrecompiled(index) ||
            !LoadObjectFromPrecompiled(is_indirect)) {
            return {};
        }
-        entry.entries.const_buffers.emplace_back(max_offset, is_indirect, index);
+        entry.entries.const_buffers.emplace_back(max_offset, is_indirect != 0, index);
    }

    u32 samplers_count{};
    if (!LoadObjectFromPrecompiled(samplers_count)) {
        return {};
    }
-
    for (u32 i = 0; i < samplers_count; ++i) {
        u64 offset{};
        u64 index{};
        u32 type{};
-        bool is_array{};
-        bool is_shadow{};
-        bool is_bindless{};
+        u8 is_array{};
+        u8 is_shadow{};
+        u8 is_bindless{};
        if (!LoadObjectFromPrecompiled(offset) || !LoadObjectFromPrecompiled(index) ||
            !LoadObjectFromPrecompiled(type) || !LoadObjectFromPrecompiled(is_array) ||
            !LoadObjectFromPrecompiled(is_shadow) || !LoadObjectFromPrecompiled(is_bindless)) {
            return {};
        }
-        entry.entries.samplers.emplace_back(
+        entry.entries.samplers.emplace_back(static_cast<std::size_t>(offset),
+                                            static_cast<std::size_t>(index),
+                                            static_cast<Tegra::Shader::TextureType>(type),
+                                            is_array != 0, is_shadow != 0, is_bindless != 0);
+    }
+
+    u32 images_count{};
+    if (!LoadObjectFromPrecompiled(images_count)) {
+        return {};
+    }
+    for (u32 i = 0; i < images_count; ++i) {
+        u64 offset{};
+        u64 index{};
+        u32 type{};
+        u8 is_bindless{};
+        if (!LoadObjectFromPrecompiled(offset) || !LoadObjectFromPrecompiled(index) ||
+            !LoadObjectFromPrecompiled(type) || !LoadObjectFromPrecompiled(is_bindless)) {
+            return {};
+        }
+        entry.entries.images.emplace_back(
            static_cast<std::size_t>(offset), static_cast<std::size_t>(index),
-            static_cast<Tegra::Shader::TextureType>(type), is_array, is_shadow, is_bindless);
+            static_cast<Tegra::Shader::ImageType>(type), is_bindless != 0);
    }

    u32 global_memory_count{};
    if (!LoadObjectFromPrecompiled(global_memory_count)) {
        return {};
    }
-
    for (u32 i = 0; i < global_memory_count; ++i) {
        u32 cbuf_index{};
        u32 cbuf_offset{};
-        bool is_read{};
-        bool is_written{};
+        u8 is_read{};
+        u8 is_written{};
        if (!LoadObjectFromPrecompiled(cbuf_index) || !LoadObjectFromPrecompiled(cbuf_offset) ||
            !LoadObjectFromPrecompiled(is_read) || !LoadObjectFromPrecompiled(is_written)) {
            return {};
        }
-        entry.entries.global_memory_entries.emplace_back(cbuf_index, cbuf_offset, is_read,
-                                                         is_written);
+        entry.entries.global_memory_entries.emplace_back(cbuf_index, cbuf_offset, is_read != 0,
+                                                         is_written != 0);
    }

    for (auto& clip_distance : entry.entries.clip_distances) {
-        if (!LoadObjectFromPrecompiled(clip_distance)) {
+        u8 clip_distance_raw{};
+        if (!LoadObjectFromPrecompiled(clip_distance_raw))
            return {};
-        }
+        clip_distance = clip_distance_raw != 0;
    }

    u64 shader_length{};
    if (!LoadObjectFromPrecompiled(shader_length)) {
        return {};
    }
-
    entry.entries.shader_length = static_cast<std::size_t>(shader_length);

    return entry;
@@ -381,7 +397,7 @@ bool ShaderDiskCacheOpenGL::SaveDecompiledFile(u64 unique_identifier, const std:
    for (const auto& cbuf : entries.const_buffers) {
        if (!SaveObjectToPrecompiled(static_cast<u32>(cbuf.GetMaxOffset())) ||
            !SaveObjectToPrecompiled(static_cast<u32>(cbuf.GetIndex())) ||
-            !SaveObjectToPrecompiled(cbuf.IsIndirect())) {
+            !SaveObjectToPrecompiled(static_cast<u8>(cbuf.IsIndirect() ? 1 : 0))) {
            return false;
        }
    }
@@ -393,9 +409,21 @@ bool ShaderDiskCacheOpenGL::SaveDecompiledFile(u64 unique_identifier, const std:
        if (!SaveObjectToPrecompiled(static_cast<u64>(sampler.GetOffset())) ||
            !SaveObjectToPrecompiled(static_cast<u64>(sampler.GetIndex())) ||
            !SaveObjectToPrecompiled(static_cast<u32>(sampler.GetType())) ||
-            !SaveObjectToPrecompiled(sampler.IsArray()) ||
-            !SaveObjectToPrecompiled(sampler.IsShadow()) ||
-            !SaveObjectToPrecompiled(sampler.IsBindless())) {
+            !SaveObjectToPrecompiled(static_cast<u8>(sampler.IsArray() ? 1 : 0)) ||
+            !SaveObjectToPrecompiled(static_cast<u8>(sampler.IsShadow() ? 1 : 0)) ||
+            !SaveObjectToPrecompiled(static_cast<u8>(sampler.IsBindless() ? 1 : 0))) {
+            return false;
+        }
+    }
+
+    if (!SaveObjectToPrecompiled(static_cast<u32>(entries.images.size()))) {
+        return false;
+    }
+    for (const auto& image : entries.images) {
+        if (!SaveObjectToPrecompiled(static_cast<u64>(image.GetOffset())) ||
+            !SaveObjectToPrecompiled(static_cast<u64>(image.GetIndex())) ||
+            !SaveObjectToPrecompiled(static_cast<u32>(image.GetType())) ||
+            !SaveObjectToPrecompiled(static_cast<u8>(image.IsBindless() ? 1 : 0))) {
            return false;
        }
    }
@@ -406,13 +434,14 @@ bool ShaderDiskCacheOpenGL::SaveDecompiledFile(u64 unique_identifier, const std:
    for (const auto& gmem : entries.global_memory_entries) {
        if (!SaveObjectToPrecompiled(static_cast<u32>(gmem.GetCbufIndex())) ||
            !SaveObjectToPrecompiled(static_cast<u32>(gmem.GetCbufOffset())) ||
-            !SaveObjectToPrecompiled(gmem.IsRead()) || !SaveObjectToPrecompiled(gmem.IsWritten())) {
+            !SaveObjectToPrecompiled(static_cast<u8>(gmem.IsRead() ? 1 : 0)) ||
+            !SaveObjectToPrecompiled(static_cast<u8>(gmem.IsWritten() ? 1 : 0))) {
            return false;
        }
    }

    for (const bool clip_distance : entries.clip_distances) {
-        if (!SaveObjectToPrecompiled(clip_distance)) {
+        if (!SaveObjectToPrecompiled(static_cast<u8>(clip_distance ? 1 : 0))) {
            return false;
        }
    }
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h
@@ -33,19 +33,18 @@ namespace OpenGL {
 using ProgramCode = std::vector<u64>;
 using Maxwell = Tegra::Engines::Maxwell3D::Regs;

-struct ShaderDiskCacheUsage;
-struct ShaderDiskCacheDump;
+using TextureBufferUsage = std::bitset<64>;

-using ShaderDumpsMap = std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>;
-
-/// Allocated bindings used by an OpenGL shader program
+/// Allocated bindings used by an OpenGL shader program.
 struct BaseBindings {
    u32 cbuf{};
    u32 gmem{};
    u32 sampler{};
+    u32 image{};

    bool operator==(const BaseBindings& rhs) const {
-        return std::tie(cbuf, gmem, sampler) == std::tie(rhs.cbuf, rhs.gmem, rhs.sampler);
+        return std::tie(cbuf, gmem, sampler, image) ==
+               std::tie(rhs.cbuf, rhs.gmem, rhs.sampler, rhs.image);
    }

    bool operator!=(const BaseBindings& rhs) const {
@@ -53,15 +52,29 @@ struct BaseBindings {
    }
 };

-/// Describes how a shader is used
+/// Describes the different variants a single program can be compiled with.
+struct ProgramVariant {
+    BaseBindings base_bindings;
+    GLenum primitive_mode{};
+    TextureBufferUsage texture_buffer_usage{};
+
+    bool operator==(const ProgramVariant& rhs) const {
+        return std::tie(base_bindings, primitive_mode, texture_buffer_usage) ==
+               std::tie(rhs.base_bindings, rhs.primitive_mode, rhs.texture_buffer_usage);
+    }
+
+    bool operator!=(const ProgramVariant& rhs) const {
+        return !operator==(rhs);
+    }
+};
+
+/// Describes how a shader is used.
 struct ShaderDiskCacheUsage {
    u64 unique_identifier{};
-    BaseBindings bindings;
-    GLenum primitive{};
+    ProgramVariant variant;

    bool operator==(const ShaderDiskCacheUsage& rhs) const {
-        return std::tie(unique_identifier, bindings, primitive) ==
-               std::tie(rhs.unique_identifier, rhs.bindings, rhs.primitive);
+        return std::tie(unique_identifier, variant) == std::tie(rhs.unique_identifier, rhs.variant);
    }

    bool operator!=(const ShaderDiskCacheUsage& rhs) const {
@@ -75,16 +88,28 @@ namespace std {

 template <>
 struct hash<OpenGL::BaseBindings> {
-    std::size_t operator()(const OpenGL::BaseBindings& bindings) const noexcept {
-        return bindings.cbuf | bindings.gmem << 8 | bindings.sampler << 16;
+    std::size_t operator()(const OpenGL::BaseBindings& bindings) const {
+        return static_cast<std::size_t>(bindings.cbuf) ^
+               (static_cast<std::size_t>(bindings.gmem) << 8) ^
+               (static_cast<std::size_t>(bindings.sampler) << 16) ^
+               (static_cast<std::size_t>(bindings.image) << 24);
+    }
+};
+
+template <>
+struct hash<OpenGL::ProgramVariant> {
+    std::size_t operator()(const OpenGL::ProgramVariant& variant) const {
+        return std::hash<OpenGL::BaseBindings>()(variant.base_bindings) ^
+               std::hash<OpenGL::TextureBufferUsage>()(variant.texture_buffer_usage) ^
+               (static_cast<std::size_t>(variant.primitive_mode) << 6);
    }
 };

 template <>
 struct hash<OpenGL::ShaderDiskCacheUsage> {
-    std::size_t operator()(const OpenGL::ShaderDiskCacheUsage& usage) const noexcept {
+    std::size_t operator()(const OpenGL::ShaderDiskCacheUsage& usage) const {
        return static_cast<std::size_t>(usage.unique_identifier) ^
-               std::hash<OpenGL::BaseBindings>()(usage.bindings) ^ usage.primitive << 16;
+               std::hash<OpenGL::ProgramVariant>()(usage.variant);
    }
 };

@@ -167,7 +192,6 @@ struct ShaderDiskCacheDump {
 class ShaderDiskCacheOpenGL {
 public:
    explicit ShaderDiskCacheOpenGL(Core::System& system);
-    ~ShaderDiskCacheOpenGL();

    /// Loads transferable cache. If file has a old version or on failure, it deletes the file.
    std::optional<std::pair<std::vector<ShaderDiskCacheRaw>, std::vector<ShaderDiskCacheUsage>>>
@@ -265,38 +289,24 @@ private:
        return SaveArrayToPrecompiled(&object, 1);
    }

-    bool SaveObjectToPrecompiled(bool object) {
-        const auto value = static_cast<u8>(object);
-        return SaveArrayToPrecompiled(&value, 1);
-    }
-
    template <typename T>
    bool LoadObjectFromPrecompiled(T& object) {
        return LoadArrayFromPrecompiled(&object, 1);
    }

-    bool LoadObjectFromPrecompiled(bool& object) {
-        u8 value;
-        const bool read_ok = LoadArrayFromPrecompiled(&value, 1);
-        if (!read_ok) {
-            return false;
-        }
-
-        object = value != 0;
-        return true;
-    }
-
-    // Core system
    Core::System& system;
-    // Stored transferable shaders
-    std::map<u64, std::unordered_set<ShaderDiskCacheUsage>> transferable;
-    // Stores whole precompiled cache which will be read from/saved to the precompiled cache file
+
+    // Stores whole precompiled cache which will be read from or saved to the precompiled cache
+    // file
    FileSys::VectorVfsFile precompiled_cache_virtual_file;
    // Stores the current offset of the precompiled cache file for IO purposes
-    std::size_t precompiled_cache_virtual_file_offset = 0;
+    std::size_t precompiled_cache_virtual_file_offset;
+
+    // Stored transferable shaders
+    std::unordered_map<u64, std::unordered_set<ShaderDiskCacheUsage>> transferable;

    // The cache has been loaded at boot
    bool tried_to_load{};
 };

-} // namespace OpenGL
+} // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -19,7 +19,8 @@ static constexpr u32 PROGRAM_OFFSET{10};
 ProgramResult GenerateVertexShader(const Device& device, const ShaderSetup& setup) {
    const std::string id = fmt::format("{:016x}", setup.program.unique_identifier);

-    std::string out = "// Shader Unique Id: VS" + id + "\n\n";
+    std::string out = "#extension GL_ARB_separate_shader_objects : enable\n\n";
+    out += "// Shader Unique Id: VS" + id + "\n\n";
    out += GetCommonDeclarations();

    out += R"(
@@ -28,17 +29,18 @@ layout (location = 0) out vec4 position;
 layout (std140, binding = EMULATION_UBO_BINDING) uniform vs_config {
    vec4 viewport_flip;
    uvec4 config_pack; // instance_id, flip_stage, y_direction, padding
+    uvec4 alpha_test;
 };

 )";
-    const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET);
+    ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET);
    ProgramResult program =
        Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Vertex, "vertex");

    out += program.first;

    if (setup.IsDualProgram()) {
-        const ShaderIR program_ir_b(setup.program.code_b, PROGRAM_OFFSET);
+        ShaderIR program_ir_b(setup.program.code_b, PROGRAM_OFFSET);
        ProgramResult program_b =
            Decompile(device, program_ir_b, Maxwell3D::Regs::ShaderStage::Vertex, "vertex_b");

@@ -74,13 +76,14 @@ void main() {
    }
 })";

-    return {std::move(out), std::move(program.second)};
+    return {out, program.second};
 }

 ProgramResult GenerateGeometryShader(const Device& device, const ShaderSetup& setup) {
    const std::string id = fmt::format("{:016x}", setup.program.unique_identifier);

-    std::string out = "// Shader Unique Id: GS" + id + "\n\n";
+    std::string out = "#extension GL_ARB_separate_shader_objects : enable\n\n";
+    out += "// Shader Unique Id: GS" + id + "\n\n";
    out += GetCommonDeclarations();

    out += R"(
@@ -90,10 +93,11 @@ layout (location = 0) out vec4 position;
 layout (std140, binding = EMULATION_UBO_BINDING) uniform gs_config {
    vec4 viewport_flip;
    uvec4 config_pack; // instance_id, flip_stage, y_direction, padding
+    uvec4 alpha_test;
 };

 )";
-    const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET);
+    ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET);
    ProgramResult program =
        Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Geometry, "geometry");
    out += program.first;
@@ -103,13 +107,14 @@ void main() {
    execute_geometry();
 };)";

-    return {std::move(out), std::move(program.second)};
+    return {out, program.second};
 }

 ProgramResult GenerateFragmentShader(const Device& device, const ShaderSetup& setup) {
    const std::string id = fmt::format("{:016x}", setup.program.unique_identifier);

-    std::string out = "// Shader Unique Id: FS" + id + "\n\n";
+    std::string out = "#extension GL_ARB_separate_shader_objects : enable\n\n";
+    out += "// Shader Unique Id: FS" + id + "\n\n";
    out += GetCommonDeclarations();

    out += R"(
@@ -127,10 +132,35 @@ layout (location = 0) in noperspective vec4 position;
 layout (std140, binding = EMULATION_UBO_BINDING) uniform fs_config {
    vec4 viewport_flip;
    uvec4 config_pack; // instance_id, flip_stage, y_direction, padding
+    uvec4 alpha_test;
 };

+bool AlphaFunc(in float value) {
+    float ref = uintBitsToFloat(alpha_test[2]);
+    switch (alpha_test[1]) {
+        case 1:
+            return false;
+        case 2:
+            return value < ref;
+        case 3:
+            return value == ref;
+        case 4:
+            return value <= ref;
+        case 5:
+            return value > ref;
+        case 6:
+            return value != ref;
+        case 7:
+            return value >= ref;
+        case 8:
+            return true;
+        default:
+            return false;
+    }
+}
+
 )";
-    const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET);
+    ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET);
    ProgramResult program =
        Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Fragment, "fragment");

@@ -142,7 +172,7 @@ void main() {
 }

 )";
-    return {std::move(out), std::move(program.second)};
+    return {out, program.second};
 }

 } // namespace OpenGL::GLShader
--- a/src/video_core/renderer_opengl/gl_shader_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp
@@ -48,6 +48,17 @@ void MaxwellUniformData::SetFromRegs(const Maxwell3D& maxwell, std::size_t shade
    viewport_flip[0] = regs.viewport_transform[0].scale_x < 0.0 ? -1.0f : 1.0f;
    viewport_flip[1] = regs.viewport_transform[0].scale_y < 0.0 ? -1.0f : 1.0f;

+    auto func{static_cast<u32>(regs.alpha_test_func)};
+    // Normalize the gl variants of opCompare to be the same as the normal variants
+    const u32 op_gl_variant_base = static_cast<u32>(Maxwell3D::Regs::ComparisonOp::Never);
+    if (func >= op_gl_variant_base) {
+        func = func - op_gl_variant_base + 1U;
+    }
+
+    alpha_test.enabled = regs.alpha_test_enabled;
+    alpha_test.func = func;
+    alpha_test.ref = regs.alpha_test_ref;
+
    instance_id = state.current_instance;

    // Assign in which stage the position has to be flipped
--- a/src/video_core/renderer_opengl/gl_shader_manager.h
+++ b/src/video_core/renderer_opengl/gl_shader_manager.h
@@ -27,8 +27,14 @@ struct MaxwellUniformData {
        GLuint flip_stage;
        GLfloat y_direction;
    };
+    struct alignas(16) {
+        GLuint enabled;
+        GLuint func;
+        GLfloat ref;
+        GLuint padding;
+    } alpha_test;
 };
-static_assert(sizeof(MaxwellUniformData) == 32, "MaxwellUniformData structure size is incorrect");
+static_assert(sizeof(MaxwellUniformData) == 48, "MaxwellUniformData structure size is incorrect");
 static_assert(sizeof(MaxwellUniformData) < 16384,
              "MaxwellUniformData structure must be less than 16kb as per the OpenGL spec");

--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -156,10 +156,6 @@ OpenGLState::OpenGLState() {
    polygon_offset.factor = 0.0f;
    polygon_offset.units = 0.0f;
    polygon_offset.clamp = 0.0f;
-
-    alpha_test.enabled = false;
-    alpha_test.func = GL_ALWAYS;
-    alpha_test.ref = 0.0f;
 }

 void OpenGLState::ApplyDefaultState() {
@@ -465,14 +461,6 @@ void OpenGLState::ApplyPolygonOffset() const {
    }
 }

-void OpenGLState::ApplyAlphaTest() const {
-    Enable(GL_ALPHA_TEST, cur_state.alpha_test.enabled, alpha_test.enabled);
-    if (UpdateTie(std::tie(cur_state.alpha_test.func, cur_state.alpha_test.ref),
-                  std::tie(alpha_test.func, alpha_test.ref))) {
-        glAlphaFunc(alpha_test.func, alpha_test.ref);
-    }
-}
-
 void OpenGLState::ApplyTextures() const {
    bool has_delta{};
    std::size_t first{};
@@ -545,7 +533,6 @@ void OpenGLState::Apply() const {
    ApplyTextures();
    ApplySamplers();
    ApplyPolygonOffset();
-    ApplyAlphaTest();
 }

 void OpenGLState::EmulateViewportWithScissor() {
--- a/src/video_core/renderer_opengl/gl_state.h
+++ b/src/video_core/renderer_opengl/gl_state.h
@@ -172,12 +172,6 @@ public:
        GLfloat clamp;
    } polygon_offset;

-    struct {
-        bool enabled; // GL_ALPHA_TEST
-        GLenum func;  // GL_ALPHA_TEST_FUNC
-        GLfloat ref;  // GL_ALPHA_TEST_REF
-    } alpha_test;
-
    std::array<bool, 8> clip_distance; // GL_CLIP_DISTANCE

    OpenGLState();
@@ -221,7 +215,6 @@ public:
    void ApplySamplers() const;
    void ApplyDepthClamp() const;
    void ApplyPolygonOffset() const;
-    void ApplyAlphaTest() const;

    /// Set the initial OpenGL state
    static void ApplyDefaultState();
--- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
@@ -15,7 +15,8 @@ MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning",

 namespace OpenGL {

-OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent)
+OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent,
+                                 bool use_persistent)
    : buffer_size(size) {
    gl_buffer.Create();

@@ -29,7 +30,7 @@ OGLStreamBuffer::OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool p
        allocate_size *= 2;
    }

-    if (GLAD_GL_ARB_buffer_storage) {
+    if (use_persistent) {
        persistent = true;
        coherent = prefer_coherent;
        const GLbitfield flags =
--- a/src/video_core/renderer_opengl/gl_stream_buffer.h
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.h
@@ -13,7 +13,8 @@ namespace OpenGL {

 class OGLStreamBuffer : private NonCopyable {
 public:
-    explicit OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent = false);
+    explicit OGLStreamBuffer(GLsizeiptr size, bool vertex_data_usage, bool prefer_coherent = false,
+                             bool use_persistent = true);
    ~OGLStreamBuffer();

    GLuint GetHandle() const;
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -126,10 +126,6 @@ inline GLenum PrimitiveTopology(Maxwell::PrimitiveTopology topology) {
        return GL_TRIANGLES;
    case Maxwell::PrimitiveTopology::TriangleStrip:
        return GL_TRIANGLE_STRIP;
-    case Maxwell::PrimitiveTopology::TriangleFan:
-        return GL_TRIANGLE_FAN;
-    case Maxwell::PrimitiveTopology::Quads:
-        return GL_QUADS;
    default:
        LOG_CRITICAL(Render_OpenGL, "Unimplemented topology={}", static_cast<u32>(topology));
        UNREACHABLE();
@@ -175,8 +171,11 @@ inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) {
        return GL_CLAMP_TO_EDGE;
    case Tegra::Texture::WrapMode::Border:
        return GL_CLAMP_TO_BORDER;
-    case Tegra::Texture::WrapMode::Clamp:
-        return GL_CLAMP;
+    case Tegra::Texture::WrapMode::ClampOGL:
+        // TODO(Subv): GL_CLAMP was removed as of OpenGL 3.1. To implement GL_CLAMP, we can use
+        // GL_CLAMP_TO_BORDER to get the border color of the texture, and then sample the edge to
+        // manually mix them. However the shader part of this is not yet implemented.
+        return GL_CLAMP_TO_BORDER;
    case Tegra::Texture::WrapMode::MirrorOnceClampToEdge:
        return GL_MIRROR_CLAMP_TO_EDGE;
    case Tegra::Texture::WrapMode::MirrorOnceBorder:
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -97,8 +97,8 @@ static std::array<GLfloat, 3 * 2> MakeOrthographicMatrix(const float width, cons
    return matrix;
 }

-RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system)
-    : VideoCore::RendererBase{emu_window}, emu_window{emu_window}, system{system} {}
+RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& window, Core::System& system)
+    : VideoCore::RendererBase{window}, system{system} {}

 RendererOpenGL::~RendererOpenGL() = default;

@@ -265,7 +265,7 @@ void RendererOpenGL::CreateRasterizer() {
    }
    // Initialize sRGB Usage
    OpenGLState::ClearsRGBUsed();
-    rasterizer = std::make_unique<RasterizerOpenGL>(system, emu_window, screen_info);
+    rasterizer = std::make_unique<RasterizerOpenGL>(system, screen_info);
 }

 void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture,
@@ -472,7 +472,6 @@ static void APIENTRY DebugHandler(GLenum source, GLenum type, GLuint id, GLenum
    }
 }

-/// Initialize the renderer
 bool RendererOpenGL::Init() {
    Core::Frontend::ScopeAcquireWindowContext acquire_context{render_window};

--- a/src/video_core/renderer_opengl/renderer_opengl.h
+++ b/src/video_core/renderer_opengl/renderer_opengl.h
@@ -45,7 +45,7 @@ struct ScreenInfo {

 class RendererOpenGL : public VideoCore::RendererBase {
 public:
-    explicit RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system);
+    explicit RendererOpenGL(Core::Frontend::EmuWindow& window, Core::System& system);
    ~RendererOpenGL() override;

    /// Swap buffers (render frame)
@@ -77,7 +77,6 @@ private:
    void LoadColorToActiveGLTexture(u8 color_r, u8 color_g, u8 color_b, u8 color_a,
                                    const TextureInfo& texture);

-    Core::Frontend::EmuWindow& emu_window;
    Core::System& system;

    OpenGLState state;
--- a/src/video_core/renderer_opengl/utils.cpp
+++ b/src/video_core/renderer_opengl/utils.cpp
@@ -38,27 +38,27 @@ void BindBuffersRangePushBuffer::Bind() const {
                       sizes.data());
 }

-void LabelGLObject(GLenum identifier, GLuint handle, VAddr addr, std::string_view extra_info) {
+void LabelGLObject(GLenum identifier, GLuint handle, VAddr addr, std::string extra_info) {
    if (!GLAD_GL_KHR_debug) {
-        // We don't need to throw an error as this is just for debugging
-        return;
+        return; // We don't need to throw an error as this is just for debugging
    }
-
+    const std::string nice_addr = fmt::format("0x{:016x}", addr);
    std::string object_label;
+
    if (extra_info.empty()) {
        switch (identifier) {
        case GL_TEXTURE:
-            object_label = fmt::format("Texture@0x{:016X}", addr);
+            object_label = "Texture@" + nice_addr;
            break;
        case GL_PROGRAM:
-            object_label = fmt::format("Shader@0x{:016X}", addr);
+            object_label = "Shader@" + nice_addr;
            break;
        default:
-            object_label = fmt::format("Object(0x{:X})@0x{:016X}", identifier, addr);
+            object_label = fmt::format("Object(0x{:x})@{}", identifier, nice_addr);
            break;
        }
    } else {
-        object_label = fmt::format("{}@0x{:016X}", extra_info, addr);
+        object_label = extra_info + '@' + nice_addr;
    }
    glObjectLabel(identifier, handle, -1, static_cast<const GLchar*>(object_label.c_str()));
 }
--- a/src/video_core/renderer_opengl/utils.h
+++ b/src/video_core/renderer_opengl/utils.h
@@ -4,7 +4,7 @@

 #pragma once

-#include <string_view>
+#include <string>
 #include <vector>
 #include <glad/glad.h>
 #include "common/common_types.h"
@@ -30,6 +30,6 @@ private:
    std::vector<GLsizeiptr> sizes;
 };

-void LabelGLObject(GLenum identifier, GLuint handle, VAddr addr, std::string_view extra_info = {});
+void LabelGLObject(GLenum identifier, GLuint handle, VAddr addr, std::string extra_info = "");

 } // namespace OpenGL
--- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
@@ -52,7 +52,7 @@ vk::SamplerAddressMode WrapMode(Tegra::Texture::WrapMode wrap_mode) {
        return vk::SamplerAddressMode::eClampToEdge;
    case Tegra::Texture::WrapMode::Border:
        return vk::SamplerAddressMode::eClampToBorder;
-    case Tegra::Texture::WrapMode::Clamp:
+    case Tegra::Texture::WrapMode::ClampOGL:
        // TODO(Rodrigo): GL_CLAMP was removed as of OpenGL 3.1, to implement GL_CLAMP, we can use
        // eClampToBorder to get the border color of the texture, and then sample the edge to
        // manually mix them. However the shader part of this is not yet implemented.
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -194,8 +194,8 @@ public:
        for (const auto& sampler : ir.GetSamplers()) {
            entries.samplers.emplace_back(sampler);
        }
-        for (const auto& attribute : ir.GetInputAttributes()) {
-            entries.attributes.insert(GetGenericAttributeLocation(attribute));
+        for (const auto& attr : ir.GetInputAttributes()) {
+            entries.attributes.insert(GetGenericAttributeLocation(attr.first));
        }
        entries.clip_distances = ir.GetClipDistances();
        entries.shader_length = ir.GetLength();
@@ -321,7 +321,8 @@ private:
    }

    void DeclareInputAttributes() {
-        for (const auto index : ir.GetInputAttributes()) {
+        for (const auto element : ir.GetInputAttributes()) {
+            const Attribute::Index index = element.first;
            if (!IsGenericAttribute(index)) {
                continue;
            }
@@ -929,6 +930,11 @@ private:
        return {};
    }

+    Id ImageStore(Operation operation) {
+        UNIMPLEMENTED(); // Stub: flags the operation as unimplemented, emits nothing
+        return {};
+    }
+
    Id Branch(Operation operation) {
        const auto target = std::get_if<ImmediateNode>(operation[0]);
        UNIMPLEMENTED_IF(!target);
@@ -1035,18 +1041,6 @@ private:
        return {};
    }

-    template <u32 element>
-    Id LocalInvocationId(Operation) {
-        UNIMPLEMENTED();
-        return {};
-    }
-
-    template <u32 element>
-    Id WorkGroupId(Operation) {
-        UNIMPLEMENTED();
-        return {};
-    }
-
    Id DeclareBuiltIn(spv::BuiltIn builtin, spv::StorageClass storage, Id type,
                      const std::string& name) {
        const Id id = OpVariable(type, storage);
@@ -1293,6 +1287,8 @@ private:
        &SPIRVDecompiler::TextureQueryLod,
        &SPIRVDecompiler::TexelFetch,

+        &SPIRVDecompiler::ImageStore,
+
        &SPIRVDecompiler::Branch,
        &SPIRVDecompiler::PushFlowStack,
        &SPIRVDecompiler::PopFlowStack,
@@ -1303,12 +1299,6 @@ private:
        &SPIRVDecompiler::EndPrimitive,

        &SPIRVDecompiler::YNegate,
-        &SPIRVDecompiler::LocalInvocationId<0>,
-        &SPIRVDecompiler::LocalInvocationId<1>,
-        &SPIRVDecompiler::LocalInvocationId<2>,
-        &SPIRVDecompiler::WorkGroupId<0>,
-        &SPIRVDecompiler::WorkGroupId<1>,
-        &SPIRVDecompiler::WorkGroupId<2>,
    };

    const ShaderIR& ir;
--- a/src/video_core/shader/decode.cpp
+++ b/src/video_core/shader/decode.cpp
@@ -168,6 +168,7 @@ u32 ShaderIR::DecodeInstr(NodeBlock& bb, u32 pc) {
        {OpCode::Type::Conversion, &ShaderIR::DecodeConversion},
        {OpCode::Type::Memory, &ShaderIR::DecodeMemory},
        {OpCode::Type::Texture, &ShaderIR::DecodeTexture},
+        {OpCode::Type::Image, &ShaderIR::DecodeImage},
        {OpCode::Type::FloatSetPredicate, &ShaderIR::DecodeFloatSetPredicate},
        {OpCode::Type::IntegerSetPredicate, &ShaderIR::DecodeIntegerSetPredicate},
        {OpCode::Type::HalfSetPredicate, &ShaderIR::DecodeHalfSetPredicate},
--- a/src/video_core/shader/decode/arithmetic.cpp
+++ b/src/video_core/shader/decode/arithmetic.cpp
@@ -4,7 +4,6 @@

 #include "common/assert.h"
 #include "common/common_types.h"
-#include "common/logging/log.h"
 #include "video_core/engines/shader_bytecode.h"
 #include "video_core/shader/shader_ir.h"

@@ -153,4 +152,4 @@ u32 ShaderIR::DecodeArithmetic(NodeBlock& bb, u32 pc) {
    return pc;
 }

-} // namespace VideoCommon::Shader
+} // namespace VideoCommon::Shader
--- a/src/video_core/shader/decode/arithmetic_half.cpp
+++ b/src/video_core/shader/decode/arithmetic_half.cpp
@@ -4,7 +4,6 @@

 #include "common/assert.h"
 #include "common/common_types.h"
-#include "common/logging/log.h"
 #include "video_core/engines/shader_bytecode.h"
 #include "video_core/shader/shader_ir.h"

--- a/src/video_core/shader/decode/arithmetic_half_immediate.cpp
+++ b/src/video_core/shader/decode/arithmetic_half_immediate.cpp
@@ -4,7 +4,6 @@

 #include "common/assert.h"
 #include "common/common_types.h"
-#include "common/logging/log.h"
 #include "video_core/engines/shader_bytecode.h"
 #include "video_core/shader/shader_ir.h"

@@ -48,4 +47,4 @@ u32 ShaderIR::DecodeArithmeticHalfImmediate(NodeBlock& bb, u32 pc) {
    return pc;
 }

-} // namespace VideoCommon::Shader
+} // namespace VideoCommon::Shader
--- a/src/video_core/shader/decode/arithmetic_immediate.cpp
+++ b/src/video_core/shader/decode/arithmetic_immediate.cpp
@@ -49,4 +49,4 @@ u32 ShaderIR::DecodeArithmeticImmediate(NodeBlock& bb, u32 pc) {
    return pc;
 }

-} // namespace VideoCommon::Shader
+} // namespace VideoCommon::Shader
--- a/src/video_core/shader/decode/arithmetic_integer_immediate.cpp
+++ b/src/video_core/shader/decode/arithmetic_integer_immediate.cpp
@@ -93,4 +93,4 @@ void ShaderIR::WriteLogicOperation(NodeBlock& bb, Register dest, LogicOperation
    }
 }

-} // namespace VideoCommon::Shader
+} // namespace VideoCommon::Shader
--- a/src/video_core/shader/decode/bfe.cpp
+++ b/src/video_core/shader/decode/bfe.cpp
@@ -46,4 +46,4 @@ u32 ShaderIR::DecodeBfe(NodeBlock& bb, u32 pc) {
    return pc;
 }

-} // namespace VideoCommon::Shader
+} // namespace VideoCommon::Shader
--- a/src/video_core/shader/decode/bfi.cpp
+++ b/src/video_core/shader/decode/bfi.cpp
@@ -38,4 +38,4 @@ u32 ShaderIR::DecodeBfi(NodeBlock& bb, u32 pc) {
    return pc;
 }

-} // namespace VideoCommon::Shader
+} // namespace VideoCommon::Shader
--- a/src/video_core/shader/decode/ffma.cpp
+++ b/src/video_core/shader/decode/ffma.cpp
@@ -56,4 +56,4 @@ u32 ShaderIR::DecodeFfma(NodeBlock& bb, u32 pc) {
    return pc;
 }

-} // namespace VideoCommon::Shader
+} // namespace VideoCommon::Shader
--- a/src/video_core/shader/decode/float_set.cpp
+++ b/src/video_core/shader/decode/float_set.cpp
@@ -55,4 +55,4 @@ u32 ShaderIR::DecodeFloatSet(NodeBlock& bb, u32 pc) {
    return pc;
 }

-} // namespace VideoCommon::Shader
+} // namespace VideoCommon::Shader
--- a/src/video_core/shader/decode/float_set_predicate.cpp
+++ b/src/video_core/shader/decode/float_set_predicate.cpp
@@ -53,4 +53,4 @@ u32 ShaderIR::DecodeFloatSetPredicate(NodeBlock& bb, u32 pc) {
    return pc;
 }

-} // namespace VideoCommon::Shader
+} // namespace VideoCommon::Shader
--- a/src/video_core/shader/decode/half_set.cpp
+++ b/src/video_core/shader/decode/half_set.cpp
@@ -6,7 +6,6 @@

 #include "common/assert.h"
 #include "common/common_types.h"
-#include "common/logging/log.h"
 #include "video_core/engines/shader_bytecode.h"
 #include "video_core/shader/shader_ir.h"

@@ -65,4 +64,4 @@ u32 ShaderIR::DecodeHalfSet(NodeBlock& bb, u32 pc) {
    return pc;
 }

-} // namespace VideoCommon::Shader
+} // namespace VideoCommon::Shader
--- a/src/video_core/shader/decode/half_set_predicate.cpp
+++ b/src/video_core/shader/decode/half_set_predicate.cpp
@@ -59,4 +59,4 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) {
    return pc;
 }

-} // namespace VideoCommon::Shader
+} // namespace VideoCommon::Shader
--- a/src/video_core/shader/decode/image.cpp
+++ b/src/video_core/shader/decode/image.cpp
@@ -0,0 +1,132 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/shader_ir.h"
+
+namespace VideoCommon::Shader {
+
+using Tegra::Shader::Instruction;
+using Tegra::Shader::OpCode;
+
+namespace {
+/// Returns how many coordinate registers are read for the given image type.
+/// Values outside the listed cases reach UNREACHABLE(), followed by a fallback return of 1.
+std::size_t GetImageTypeNumCoordinates(Tegra::Shader::ImageType image_type) {
+    switch (image_type) {
+    case Tegra::Shader::ImageType::Texture1D:
+    case Tegra::Shader::ImageType::TextureBuffer:
+        return 1;
+    case Tegra::Shader::ImageType::Texture1DArray:
+    case Tegra::Shader::ImageType::Texture2D:
+        return 2;
+    case Tegra::Shader::ImageType::Texture2DArray:
+    case Tegra::Shader::ImageType::Texture3D:
+        return 3;
+    }
+    UNREACHABLE();
+    return 1;
+}
+} // Anonymous namespace
+
+/// Decodes the image instruction at pc and appends the resulting IR nodes to bb.
+/// Only SUST is handled; its four source values start at gpr0 and its coordinates at gpr8.
+/// Returns pc unchanged.
+u32 ShaderIR::DecodeImage(NodeBlock& bb, u32 pc) {
+    const Instruction instr = {program_code[pc]};
+    const auto opcode = OpCode::Decode(instr);
+
+    switch (opcode->get().GetId()) {
+    case OpCode::Id::SUST: {
+        UNIMPLEMENTED_IF(instr.sust.mode != Tegra::Shader::SurfaceDataMode::P);
+        UNIMPLEMENTED_IF(instr.sust.image_type == Tegra::Shader::ImageType::TextureBuffer);
+        UNIMPLEMENTED_IF(instr.sust.out_of_bounds_store != Tegra::Shader::OutOfBoundsStore::Ignore);
+        UNIMPLEMENTED_IF(instr.sust.component_mask_selector != 0xf); // Ensure we have an RGBA store
+
+        // The stored values live in consecutive registers starting at gpr0.
+        std::vector<Node> values;
+        constexpr std::size_t hardcoded_size{4};
+        values.reserve(hardcoded_size);
+        for (std::size_t i = 0; i < hardcoded_size; ++i) {
+            values.push_back(GetRegister(instr.gpr0.Value() + i));
+        }
+
+        // The coordinates live in consecutive registers starting at gpr8.
+        std::vector<Node> coords;
+        const std::size_t num_coords{GetImageTypeNumCoordinates(instr.sust.image_type)};
+        coords.reserve(num_coords);
+        for (std::size_t i = 0; i < num_coords; ++i) {
+            coords.push_back(GetRegister(instr.gpr8.Value() + i));
+        }
+
+        const auto type{instr.sust.image_type};
+        const auto& image{instr.sust.is_immediate ? GetImage(instr.image, type)
+                                                  : GetBindlessImage(instr.gpr39, type)};
+        MetaImage meta{image, values};
+        const Node store{Operation(OperationCode::ImageStore, meta, std::move(coords))};
+        bb.push_back(store);
+        break;
+    }
+    default:
+        UNIMPLEMENTED_MSG("Unhandled image instruction: {}", opcode->get().GetName());
+    }
+
+    return pc;
+}
+
+/// Looks up (or registers) the used_images entry for an image addressed by an immediate index.
+/// Entries are matched by offset; a repeated use must have the same type (checked by ASSERT).
+const Image& ShaderIR::GetImage(Tegra::Shader::Image image, Tegra::Shader::ImageType type) {
+    const auto offset{static_cast<std::size_t>(image.index.Value())};
+
+    // If this image has already been used, return the existing mapping.
+    const auto itr{std::find_if(used_images.begin(), used_images.end(),
+                                [=](const Image& entry) { return entry.GetOffset() == offset; })};
+    if (itr != used_images.end()) {
+        ASSERT(itr->GetType() == type);
+        return *itr;
+    }
+
+    // Otherwise create a new mapping for this image.
+    const std::size_t next_index{used_images.size()};
+    const Image entry{offset, next_index, type};
+    return *used_images.emplace(entry).first;
+}
+
+/// Looks up (or registers) the used_images entry for an image whose handle is read from reg.
+/// The register is tracked with TrackCbuf to a CbufNode with an immediate offset; the buffer
+/// index and that offset are packed into a 64-bit key used to match existing entries.
+const Image& ShaderIR::GetBindlessImage(Tegra::Shader::Register reg,
+                                        Tegra::Shader::ImageType type) {
+    const Node image_register{GetRegister(reg)};
+    const Node base_image{
+        TrackCbuf(image_register, global_code, static_cast<s64>(global_code.size()))};
+    // std::get_if yields nullptr when the node is not of the expected kind; assert first.
+    const auto cbuf{std::get_if<CbufNode>(base_image)};
+    ASSERT(cbuf != nullptr);
+    const auto cbuf_offset_imm{std::get_if<ImmediateNode>(cbuf->GetOffset())};
+    ASSERT(cbuf_offset_imm != nullptr);
+    const auto cbuf_offset{cbuf_offset_imm->GetValue()};
+    const auto cbuf_index{cbuf->GetIndex()};
+    const auto cbuf_key{(static_cast<u64>(cbuf_index) << 32) | static_cast<u64>(cbuf_offset)};
+
+    // If this image has already been used, return the existing mapping.
+    const auto itr{std::find_if(used_images.begin(), used_images.end(),
+                                [=](const Image& entry) { return entry.GetOffset() == cbuf_key; })};
+    if (itr != used_images.end()) {
+        ASSERT(itr->GetType() == type);
+        return *itr;
+    }
+
+    // Otherwise create a new mapping for this image.
+    const std::size_t next_index{used_images.size()};
+    const Image entry{cbuf_index, cbuf_offset, next_index, type};
+    return *used_images.emplace(entry).first;
+}
+
+} // namespace VideoCommon::Shader
--- a/src/video_core/shader/decode/integer_set.cpp
+++ b/src/video_core/shader/decode/integer_set.cpp
@@ -2,6 +2,7 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

+#include "common/assert.h"
 #include "common/common_types.h"
 #include "video_core/engines/shader_bytecode.h"
 #include "video_core/shader/shader_ir.h"
@@ -46,4 +47,4 @@ u32 ShaderIR::DecodeIntegerSet(NodeBlock& bb, u32 pc) {
    return pc;
 }

-} // namespace VideoCommon::Shader
+} // namespace VideoCommon::Shader
--- a/src/video_core/shader/decode/integer_set_predicate.cpp
+++ b/src/video_core/shader/decode/integer_set_predicate.cpp
@@ -50,4 +50,4 @@ u32 ShaderIR::DecodeIntegerSetPredicate(NodeBlock& bb, u32 pc) {
    return pc;
 }

-} // namespace VideoCommon::Shader
+} // namespace VideoCommon::Shader
--- a/src/video_core/shader/decode/memory.cpp
+++ b/src/video_core/shader/decode/memory.cpp
@@ -47,20 +47,17 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
                             "Indirect attribute loads are not supported");
        UNIMPLEMENTED_IF_MSG((instr.attribute.fmt20.immediate.Value() % sizeof(u32)) != 0,
                             "Unaligned attribute loads are not supported");
-        UNIMPLEMENTED_IF_MSG(instr.attribute.fmt20.IsPhysical() &&
-                                 instr.attribute.fmt20.size != Tegra::Shader::AttributeSize::Word,
-                             "Non-32 bits PHYS reads are not implemented");

-        const Node buffer{GetRegister(instr.gpr39)};
+        Tegra::Shader::IpaMode input_mode{Tegra::Shader::IpaInterpMode::Pass,
+                                          Tegra::Shader::IpaSampleMode::Default};

        u64 next_element = instr.attribute.fmt20.element;
        auto next_index = static_cast<u64>(instr.attribute.fmt20.index.Value());

        const auto LoadNextElement = [&](u32 reg_offset) {
-            const Node attribute{instr.attribute.fmt20.IsPhysical()
-                                     ? GetPhysicalInputAttribute(instr.gpr8, buffer)
-                                     : GetInputAttribute(static_cast<Attribute::Index>(next_index),
-                                                         next_element, buffer)};
+            const Node buffer = GetRegister(instr.gpr39);
+            const Node attribute = GetInputAttribute(static_cast<Attribute::Index>(next_index),
+                                                     next_element, input_mode, buffer);

            SetRegister(bb, instr.gpr0.Value() + reg_offset, attribute);

@@ -146,25 +143,12 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
        }
        break;
    }
-    case OpCode::Id::LD:
    case OpCode::Id::LDG: {
-        const auto type = [instr, &opcode]() -> Tegra::Shader::UniformType {
-            switch (opcode->get().GetId()) {
-            case OpCode::Id::LD:
-                UNIMPLEMENTED_IF_MSG(!instr.generic.extended, "Unextended LD is not implemented");
-                return instr.generic.type;
-            case OpCode::Id::LDG:
-                return instr.ldg.type;
-            default:
-                UNREACHABLE();
-                return {};
-            }
-        }();
-
        const auto [real_address_base, base_address, descriptor] =
-            TrackAndGetGlobalMemory(bb, instr, false);
+            TrackAndGetGlobalMemory(bb, GetRegister(instr.gpr8),
+                                    static_cast<u32>(instr.ldg.immediate_offset.Value()), false);

-        const u32 count = GetUniformTypeElementsCount(type);
+        const u32 count = GetUniformTypeElementsCount(instr.ldg.type);
        for (u32 i = 0; i < count; ++i) {
            const Node it_offset = Immediate(i * 4);
            const Node real_address =
@@ -178,6 +162,28 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
        }
        break;
    }
+    case OpCode::Id::STG: {
+        const auto [real_address_base, base_address, descriptor] =
+            TrackAndGetGlobalMemory(bb, GetRegister(instr.gpr8),
+                                    static_cast<u32>(instr.stg.immediate_offset.Value()), true);
+
+        // Encode in temporary registers like this: real_base_address, {registers_to_be_written...}
+        SetTemporal(bb, 0, real_address_base);
+
+        const u32 count = GetUniformTypeElementsCount(instr.stg.type);
+        for (u32 i = 0; i < count; ++i) {
+            SetTemporal(bb, i + 1, GetRegister(instr.gpr0.Value() + i));
+        }
+        for (u32 i = 0; i < count; ++i) {
+            const Node it_offset = Immediate(i * 4);
+            const Node real_address =
+                Operation(OperationCode::UAdd, NO_PRECISE, real_address_base, it_offset);
+            const Node gmem = StoreNode(GmemNode(real_address, base_address, descriptor));
+
+            bb.push_back(Operation(OperationCode::Assign, gmem, GetTemporal(i + 1)));
+        }
+        break;
+    }
    case OpCode::Id::ST_A: {
        UNIMPLEMENTED_IF_MSG(instr.gpr8.Value() != Register::ZeroIndex,
                             "Indirect attribute loads are not supported");
@@ -233,56 +239,6 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
        }
        break;
    }
-    case OpCode::Id::ST:
-    case OpCode::Id::STG: {
-        const auto type = [instr, &opcode]() -> Tegra::Shader::UniformType {
-            switch (opcode->get().GetId()) {
-            case OpCode::Id::ST:
-                UNIMPLEMENTED_IF_MSG(!instr.generic.extended, "Unextended ST is not implemented");
-                return instr.generic.type;
-            case OpCode::Id::STG:
-                return instr.stg.type;
-            default:
-                UNREACHABLE();
-                return {};
-            }
-        }();
-
-        const auto [real_address_base, base_address, descriptor] =
-            TrackAndGetGlobalMemory(bb, instr, true);
-
-        // Encode in temporary registers like this: real_base_address, {registers_to_be_written...}
-        SetTemporal(bb, 0, real_address_base);
-
-        const u32 count = GetUniformTypeElementsCount(type);
-        for (u32 i = 0; i < count; ++i) {
-            SetTemporal(bb, i + 1, GetRegister(instr.gpr0.Value() + i));
-        }
-        for (u32 i = 0; i < count; ++i) {
-            const Node it_offset = Immediate(i * 4);
-            const Node real_address =
-                Operation(OperationCode::UAdd, NO_PRECISE, real_address_base, it_offset);
-            const Node gmem = StoreNode(GmemNode(real_address, base_address, descriptor));
-
-            bb.push_back(Operation(OperationCode::Assign, gmem, GetTemporal(i + 1)));
-        }
-        break;
-    }
-    case OpCode::Id::AL2P: {
-        // Ignore al2p.direction since we don't care about it.
-
-        // Calculate emulation fake physical address.
-        const Node fixed_address{Immediate(static_cast<u32>(instr.al2p.address))};
-        const Node reg{GetRegister(instr.gpr8)};
-        const Node fake_address{Operation(OperationCode::IAdd, NO_PRECISE, reg, fixed_address)};
-
-        // Set the fake address to target register.
-        SetRegister(bb, instr.gpr0, fake_address);
-
-        // Signal the shader IR to declare all possible attributes and varyings
-        uses_physical_attributes = true;
-        break;
-    }
    default:
        UNIMPLEMENTED_MSG("Unhandled memory instruction: {}", opcode->get().GetName());
    }
@@ -291,11 +247,9 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
 }

 std::tuple<Node, Node, GlobalMemoryBase> ShaderIR::TrackAndGetGlobalMemory(NodeBlock& bb,
-                                                                           Instruction instr,
+                                                                           Node addr_register,
+                                                                           u32 immediate_offset,
                                                                           bool is_write) {
-    const auto addr_register{GetRegister(instr.gmem.gpr)};
-    const auto immediate_offset{static_cast<u32>(instr.gmem.offset)};
-
    const Node base_address{
        TrackCbuf(addr_register, global_code, static_cast<s64>(global_code.size()))};
    const auto cbuf = std::get_if<CbufNode>(base_address);
--- a/src/video_core/shader/decode/other.cpp
+++ b/src/video_core/shader/decode/other.cpp
@@ -4,7 +4,6 @@

 #include "common/assert.h"
 #include "common/common_types.h"
-#include "common/logging/log.h"
 #include "video_core/engines/shader_bytecode.h"
 #include "video_core/shader/shader_ir.h"

@@ -14,7 +13,6 @@ using Tegra::Shader::ConditionCode;
 using Tegra::Shader::Instruction;
 using Tegra::Shader::OpCode;
 using Tegra::Shader::Register;
-using Tegra::Shader::SystemVariable;

 u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
    const Instruction instr = {program_code[pc]};
@@ -60,33 +58,20 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
        break;
    }
    case OpCode::Id::MOV_SYS: {
-        const Node value = [&]() {
-            switch (instr.sys20) {
-            case SystemVariable::Ydirection:
-                return Operation(OperationCode::YNegate);
-            case SystemVariable::InvocationInfo:
-                LOG_WARNING(HW_GPU, "MOV_SYS instruction with InvocationInfo is incomplete");
-                return Immediate(0u);
-            case SystemVariable::TidX:
-                return Operation(OperationCode::LocalInvocationIdX);
-            case SystemVariable::TidY:
-                return Operation(OperationCode::LocalInvocationIdY);
-            case SystemVariable::TidZ:
-                return Operation(OperationCode::LocalInvocationIdZ);
-            case SystemVariable::CtaIdX:
-                return Operation(OperationCode::WorkGroupIdX);
-            case SystemVariable::CtaIdY:
-                return Operation(OperationCode::WorkGroupIdY);
-            case SystemVariable::CtaIdZ:
-                return Operation(OperationCode::WorkGroupIdZ);
-            default:
-                UNIMPLEMENTED_MSG("Unhandled system move: {}",
-                                  static_cast<u32>(instr.sys20.Value()));
-                return Immediate(0u);
-            }
-        }();
-        SetRegister(bb, instr.gpr0, value);
-
+        switch (instr.sys20) {
+        case Tegra::Shader::SystemVariable::InvocationInfo: {
+            LOG_WARNING(HW_GPU, "MOV_SYS instruction with InvocationInfo is incomplete");
+            SetRegister(bb, instr.gpr0, Immediate(0u));
+            break;
+        }
+        case Tegra::Shader::SystemVariable::Ydirection: {
+            // Config pack's third value is Y_NEGATE's state.
+            SetRegister(bb, instr.gpr0, Operation(OperationCode::YNegate));
+            break;
+        }
+        default:
+            UNIMPLEMENTED_MSG("Unhandled system move: {}", static_cast<u32>(instr.sys20.Value()));
+        }
        break;
    }
    case OpCode::Id::BRA: {
@@ -145,18 +130,15 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
        break;
    }
    case OpCode::Id::IPA: {
-        const bool is_physical = instr.ipa.idx && instr.gpr8.Value() != 0xff;
-
-        const auto attribute = instr.attribute.fmt28;
+        const auto& attribute = instr.attribute.fmt28;
        const Tegra::Shader::IpaMode input_mode{instr.ipa.interp_mode.Value(),
                                                instr.ipa.sample_mode.Value()};

-        Node value = is_physical ? GetPhysicalInputAttribute(instr.gpr8)
-                                 : GetInputAttribute(attribute.index, attribute.element);
+        const Node attr = GetInputAttribute(attribute.index, attribute.element, input_mode);
+        Node value = attr;
        const Tegra::Shader::Attribute::Index index = attribute.index.Value();
-        const bool is_generic = index >= Tegra::Shader::Attribute::Index::Attribute_0 &&
-                                index <= Tegra::Shader::Attribute::Index::Attribute_31;
-        if (is_generic || is_physical) {
+        if (index >= Tegra::Shader::Attribute::Index::Attribute_0 &&
+            index <= Tegra::Shader::Attribute::Index::Attribute_31) {
            // TODO(Blinkhawk): There are cases where a perspective attribute use PASS.
            // In theory by setting them as perspective, OpenGL does the perspective correction.
            // A way must figured to reverse the last step of it.
--- a/src/video_core/shader/decode/predicate_set_predicate.cpp
+++ b/src/video_core/shader/decode/predicate_set_predicate.cpp
@@ -64,4 +64,4 @@ u32 ShaderIR::DecodePredicateSetPredicate(NodeBlock& bb, u32 pc) {
    return pc;
 }

-} // namespace VideoCommon::Shader
+} // namespace VideoCommon::Shader
--- a/src/video_core/shader/decode/predicate_set_register.cpp
+++ b/src/video_core/shader/decode/predicate_set_register.cpp
@@ -43,4 +43,4 @@ u32 ShaderIR::DecodePredicateSetRegister(NodeBlock& bb, u32 pc) {
    return pc;
 }

-} // namespace VideoCommon::Shader
+} // namespace VideoCommon::Shader
--- a/src/video_core/shader/decode/register_set_predicate.cpp
+++ b/src/video_core/shader/decode/register_set_predicate.cpp
@@ -48,4 +48,4 @@ u32 ShaderIR::DecodeRegisterSetPredicate(NodeBlock& bb, u32 pc) {
    return pc;
 }

-} // namespace VideoCommon::Shader
+} // namespace VideoCommon::Shader
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
ReinUsesLisp	9a8c1745f1	gl_shader_decompiler: Implement image binding settings	2019-05-16 20:03:51 -03:00
ReinUsesLisp	f96d50165f	shader: Implement bindless images	2019-05-16 20:03:51 -03:00
ReinUsesLisp	f9f541470e	shader: Decode SUST and implement backing image functionality	2019-05-16 20:03:51 -03:00
ReinUsesLisp	ce691745dc	gl_rasterizer: Track texture buffer usage	2019-05-16 20:03:51 -03:00
ReinUsesLisp	1d59af8f7c	video_core: Make ARB_buffer_storage a required extension	2019-05-16 20:03:50 -03:00
ReinUsesLisp	a6252257eb	gl_rasterizer_cache: Use texture buffers to emulate texture buffers	2019-05-16 20:03:50 -03:00
ReinUsesLisp	dc5e5ac3b0	maxwell_3d: Partially implement texture buffers as 1D textures	2019-05-16 18:55:20 -03:00
ReinUsesLisp	4f612052b2	gl_shader_decompiler: Allow 1D textures to be texture buffers	2019-05-16 18:55:20 -03:00
ReinUsesLisp	89eef17670	shader: Implement texture buffers	2019-05-16 18:55:20 -03:00