renderer_vulkan: Wait on present semaphore at queue submit

The present semaphore is signalled by the call that acquires the swapchain image, and it is meant to be waited on when rendering to that image. Currently it is waited on when presenting; waiting on it at command buffer submission instead is the proper usage of this semaphore. Fixes the device-lost error when launching titles on the Intel Linux Mesa driver.
Merge pull request #6900 from ameerj/attr-reorder
2021-09-02 13:13:20 -04:00 · 2021-09-01 17:36:26 -07:00 · 2021-09-01 20:21:15 -04:00 · 2021-09-01 19:13:33 -05:00 · 2021-08-31 09:11:21 -07:00 · 2021-08-30 18:16:31 -07:00
133 changed files with 3346 additions and 2673 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -376,7 +376,7 @@ if (ENABLE_SDL2)
    if (YUZU_USE_BUNDLED_SDL2)
        # Detect toolchain and platform
        if ((MSVC_VERSION GREATER_EQUAL 1910 AND MSVC_VERSION LESS 1930) AND ARCHITECTURE_x86_64)
-            set(SDL2_VER "SDL2-2.0.15-prerelease")
+            set(SDL2_VER "SDL2-2.0.16")
        else()
            message(FATAL_ERROR "No bundled SDL2 binaries for your toolchain. Disable YUZU_USE_BUNDLED_SDL2 and provide your own.")
        endif()
@@ -396,7 +396,7 @@ if (ENABLE_SDL2)
    elseif (YUZU_USE_EXTERNAL_SDL2)
        message(STATUS "Using SDL2 from externals.")
    else()
-        find_package(SDL2 2.0.15 REQUIRED)
+        find_package(SDL2 2.0.16 REQUIRED)

        # Some installations don't set SDL2_LIBRARIES
        if("${SDL2_LIBRARIES}" STREQUAL "")
@@ -583,8 +583,32 @@ if (YUZU_USE_BUNDLED_FFMPEG)
            "${FFmpeg_PREFIX};${FFmpeg_BUILD_DIR}"
            CACHE PATH "Path to FFmpeg headers" FORCE)

+        # VA-API hardware decoding support is only probed on Linux
+        if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Linux")
+            find_package(PkgConfig REQUIRED)
+            pkg_check_modules(LIBVA libva)
+        endif()
+        if (LIBVA_FOUND)
+            # With libva available, its DRM and X11 companions become hard requirements
+            pkg_check_modules(LIBDRM libdrm REQUIRED)
+            find_package(X11 REQUIRED)
+            pkg_check_modules(LIBVA-DRM libva-drm REQUIRED)
+            pkg_check_modules(LIBVA-X11 libva-x11 REQUIRED)
+            set(FFmpeg_LIBVA_LIBRARIES
+                ${LIBDRM_LIBRARIES} ${X11_LIBRARIES}
+                ${LIBVA-DRM_LIBRARIES} ${LIBVA-X11_LIBRARIES} ${LIBVA_LIBRARIES})
+            set(FFmpeg_HWACCEL_FLAGS
+                --enable-hwaccel=h264_vaapi
+                --enable-hwaccel=vp9_vaapi
+                --enable-libdrm)
+            message(STATUS "VA-API found")
+        else()
+            # No libva: have FFmpeg's `configure` leave VA-API out entirely
+            set(FFmpeg_HWACCEL_FLAGS --disable-vaapi)
+        endif()
+
        # `configure` parameters builds only exactly what yuzu needs from FFmpeg
-        # `--disable-{vaapi,vdpau}` is needed to avoid linking issues
+        # `--disable-vdpau` is needed to avoid linking issues
        add_custom_command(
            OUTPUT
                ${FFmpeg_MAKEFILE}
@@ -600,15 +624,16 @@ if (YUZU_USE_BUNDLED_FFMPEG)
                    --disable-network
                    --disable-postproc
                    --disable-swresample
-                    --disable-vaapi
                    --disable-vdpau
                    --enable-decoder=h264
                    --enable-decoder=vp9
                    --cc="${CMAKE_C_COMPILER}"
                    --cxx="${CMAKE_CXX_COMPILER}"
+                    ${FFmpeg_HWACCEL_FLAGS}
            WORKING_DIRECTORY
                ${FFmpeg_BUILD_DIR}
        )
+        unset(FFmpeg_HWACCEL_FLAGS)

        # Workaround for Ubuntu 18.04's older version of make not being able to call make as a child
        # with context of the jobserver. Also helps ninja users.
@@ -618,9 +643,10 @@ if (YUZU_USE_BUNDLED_FFMPEG)
            OUTPUT_VARIABLE
                SYSTEM_THREADS)

+        set(FFmpeg_BUILD_LIBRARIES ${FFmpeg_LIBRARIES})
        add_custom_command(
            OUTPUT
-                ${FFmpeg_LIBRARIES}
+                ${FFmpeg_BUILD_LIBRARIES}
            COMMAND
                make -j${SYSTEM_THREADS}
            WORKING_DIRECTORY
@@ -630,7 +656,12 @@ if (YUZU_USE_BUNDLED_FFMPEG)
        # ALL makes this custom target build every time
        # but it won't actually build if the DEPENDS parameter is up to date
        add_custom_target(ffmpeg-configure ALL DEPENDS ${FFmpeg_MAKEFILE})
-        add_custom_target(ffmpeg-build ALL DEPENDS ${FFmpeg_LIBRARIES} ffmpeg-configure)
+        add_custom_target(ffmpeg-build ALL DEPENDS ${FFmpeg_BUILD_LIBRARIES} ffmpeg-configure)
+        link_libraries(${FFmpeg_LIBVA_LIBRARIES})
+        set(FFmpeg_LIBRARIES ${FFmpeg_LIBVA_LIBRARIES} ${FFmpeg_BUILD_LIBRARIES}
+            CACHE PATH "Paths to FFmpeg libraries" FORCE)
+        unset(FFmpeg_BUILD_LIBRARIES)
+        unset(FFmpeg_LIBVA_LIBRARIES)

        if (FFmpeg_FOUND)
            message(STATUS "Found FFmpeg version ${FFmpeg_VERSION}")
@@ -670,7 +701,7 @@ if (APPLE)
 elseif (WIN32)
    # WSAPoll and SHGetKnownFolderPath (AppData/Roaming) didn't exist before WinNT 6.x (Vista)
    add_definitions(-D_WIN32_WINNT=0x0600 -DWINVER=0x0600)
-    set(PLATFORM_LIBRARIES winmm ws2_32)
+    set(PLATFORM_LIBRARIES winmm ws2_32 iphlpapi)
    if (MINGW)
        # PSAPI is the Process Status API
        set(PLATFORM_LIBRARIES ${PLATFORM_LIBRARIES} psapi imm32 version)
--- a/dist/qt_themes/default/style.qss
+++ b/dist/qt_themes/default/style.qss
@@ -51,11 +51,11 @@ QPushButton#GPUStatusBarButton:hover {
 }

 QPushButton#GPUStatusBarButton:checked {
-    color: #ff8040;
+    color: #b06020;
 }

 QPushButton#GPUStatusBarButton:!checked {
-    color: #40dd40;
+    color: #109010;
 }

 QPushButton#buttonRefreshDevices {
--- a/externals/CMakeLists.txt
+++ b/externals/CMakeLists.txt
@@ -7,7 +7,9 @@ include(DownloadExternals)
 # xbyak
 if (ARCHITECTURE_x86 OR ARCHITECTURE_x86_64)
    add_library(xbyak INTERFACE)
-    target_include_directories(xbyak SYSTEM INTERFACE ./xbyak/xbyak)
+    file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/xbyak/include)
+    file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/xbyak/xbyak DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/xbyak/include)
+    target_include_directories(xbyak SYSTEM INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/xbyak/include)
    target_compile_definitions(xbyak INTERFACE XBYAK_NO_OP_NAMES)
 endif()

@@ -19,6 +21,7 @@ target_include_directories(catch-single-include INTERFACE catch/single_include)
 if (ARCHITECTURE_x86_64)
    set(DYNARMIC_TESTS OFF)
    set(DYNARMIC_NO_BUNDLED_FMT ON)
+    set(DYNARMIC_IGNORE_ASSERTS ON CACHE BOOL "" FORCE)
    add_subdirectory(dynarmic)
 endif()

--- a/externals/SDL
+++ b/externals/SDL
--- a/externals/dynarmic
+++ b/externals/dynarmic
--- a/src/common/assert.h
+++ b/src/common/assert.h
@@ -52,8 +52,12 @@ assert_noinline_call(const Fn& fn) {
 #define DEBUG_ASSERT(_a_) ASSERT(_a_)
 #define DEBUG_ASSERT_MSG(_a_, ...) ASSERT_MSG(_a_, __VA_ARGS__)
 #else // not debug
-#define DEBUG_ASSERT(_a_)
-#define DEBUG_ASSERT_MSG(_a_, _desc_, ...)
+#define DEBUG_ASSERT(_a_)                                                                          \
+    do {                                                                                           \
+    } while (0)
+#define DEBUG_ASSERT_MSG(_a_, _desc_, ...)                                                         \
+    do {                                                                                           \
+    } while (0)
 #endif

 #define UNIMPLEMENTED() ASSERT_MSG(false, "Unimplemented code!")
--- a/src/common/hex_util.h
+++ b/src/common/hex_util.h
@@ -61,7 +61,7 @@ template <typename ContiguousContainer>
    return out;
 }

-[[nodiscard]] constexpr std::array<u8, 16> AsArray(const char (&data)[17]) {
+[[nodiscard]] constexpr std::array<u8, 16> AsArray(const char (&data)[33]) {
    return HexStringToArray<16>(data);
 }

--- a/src/common/logging/backend.cpp
+++ b/src/common/logging/backend.cpp
@@ -2,13 +2,10 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

-#include <algorithm>
 #include <atomic>
 #include <chrono>
 #include <climits>
-#include <condition_variable>
-#include <memory>
-#include <mutex>
+#include <exception>
 #include <thread>
 #include <vector>

@@ -16,28 +13,174 @@
 #include <windows.h> // For OutputDebugStringW
 #endif

-#include "common/assert.h"
 #include "common/fs/file.h"
 #include "common/fs/fs.h"
+#include "common/fs/fs_paths.h"
+#include "common/fs/path_util.h"
 #include "common/literals.h"
+#include "common/thread.h"

 #include "common/logging/backend.h"
 #include "common/logging/log.h"
 #include "common/logging/text_formatter.h"
 #include "common/settings.h"
+#ifdef _WIN32
 #include "common/string_util.h"
+#endif
 #include "common/threadsafe_queue.h"

 namespace Common::Log {

+namespace {
+
+/**
+ * Interface for logging backends.
+ */
+class Backend {
+public:
+    virtual ~Backend() = default;
+    /// Emits a single log entry through this backend.
+    virtual void Write(const Entry& entry) = 0;
+    /// Hook for (re-)enabling output ahead of a stacktrace dump; the effect is backend-defined.
+    virtual void EnableForStacktrace() = 0;
+    /// Pushes out any buffered output; backends without a buffer implement it as a no-op.
+    virtual void Flush() = 0;
+};
+
+/**
+ * Backend that writes colored messages to stderr; it stays silent until it is enabled
+ */
+class ColorConsoleBackend final : public Backend {
+public:
+    explicit ColorConsoleBackend() = default;
+    ~ColorConsoleBackend() override = default;
+    /// Prints the entry in color; entries are dropped while the backend is disabled.
+    void Write(const Entry& entry) override {
+        if (!enabled.load(std::memory_order_relaxed)) {
+            return;
+        }
+        PrintColoredMessage(entry);
+    }
+
+    /// Intentionally empty: stderr shouldn't be buffered.
+    void Flush() override {}
+
+    /// Turns console output on unconditionally.
+    void EnableForStacktrace() override {
+        enabled.store(true);
+    }
+
+    /// Turns console output on or off.
+    void SetEnabled(bool enabled_) {
+        enabled.store(enabled_);
+    }
+private:
+    std::atomic_bool enabled{false};
+};
+
+/**
+ * Backend that writes to the file passed into the constructor, up to a size limit
+ */
+class FileBackend final : public Backend {
+public:
+    /// Moves a previous log aside as "<filename>.old.txt", then opens filename for writing.
+    explicit FileBackend(const std::filesystem::path& filename) {
+        std::filesystem::path backup_filename{filename};
+        backup_filename += ".old.txt";
+
+        // Both helpers check for existence themselves, and a failure here is not a concern.
+        static_cast<void>(FS::RemoveFile(backup_filename));
+        static_cast<void>(FS::RenameFile(filename, backup_filename));
+        file = std::make_unique<FS::IOFile>(filename, FS::FileAccessMode::Write,
+                                            FS::FileType::TextFile);
+    }
+
+    ~FileBackend() override = default;
+
+    /// Appends the formatted entry; flushes on errors and goes quiet once the size cap is hit.
+    void Write(const Entry& entry) override {
+        if (!enabled) {
+            return;
+        }
+        bytes_written += file->WriteString(FormatLogMessage(entry).append(1, '\n'));
+
+        using namespace Common::Literals;
+        // Cap the log size so that spammed entries cannot grow the file indefinitely.
+        const auto write_limit = Settings::values.extended_logging ? 1_GiB : 100_MiB;
+        const bool over_limit = bytes_written > write_limit;
+        if (over_limit) {
+            // Stop writing, but keep the file open so a stacktrace can still be printed.
+            enabled = false;
+        }
+        if (over_limit || entry.log_level >= Level::Error) {
+            file->Flush();
+        }
+    }
+
+    /// Flushes the underlying file.
+    void Flush() override {
+        file->Flush();
+    }
+
+    /// Re-enables writing and restarts the byte count.
+    void EnableForStacktrace() override {
+        enabled = true;
+        bytes_written = 0;
+    }
+
+private:
+    std::unique_ptr<FS::IOFile> file;
+    bool enabled = true;
+    std::size_t bytes_written = 0;
+};
+
+/**
+ * Backend that writes to Visual Studio's output window; Write compiles to nothing off Windows
+ */
+class DebuggerBackend final : public Backend {
+public:
+    explicit DebuggerBackend() = default;
+    ~DebuggerBackend() override = default;
+
+    void Write(const Entry& entry) override {
+#ifdef _WIN32
+        auto line = FormatLogMessage(entry);
+        line.append(1, '\n');
+        ::OutputDebugStringW(UTF8ToUTF16W(line).c_str());
+#endif
+    }
+
+    void Flush() override {} // this class buffers nothing itself
+    void EnableForStacktrace() override {} // this class has no enabled/disabled state
+};
+
+bool initialization_in_progress_suppress_logging = true;
+
 /**
 * Static state as a singleton.
 */
 class Impl {
 public:
    static Impl& Instance() {
-        static Impl backend;
-        return backend;
+        if (!instance) {
+            throw std::runtime_error("Using Logging instance before its initialization");
+        }
+        return *instance;
+    }
+
+    static void Initialize() {
+        if (instance) {
+            LOG_WARNING(Log, "Reinitializing logging backend");
+            return;
+        }
+        using namespace Common::FS;
+        const auto& log_dir = GetYuzuPath(YuzuPath::LogDir);
+        void(CreateDir(log_dir));
+        Filter filter;
+        filter.ParseFilterString(Settings::values.log_filter.GetValue());
+        instance = std::unique_ptr<Impl, decltype(&Deleter)>(new Impl(log_dir / LOG_FILE, filter),
+                                                             Deleter);
+        initialization_in_progress_suppress_logging = false;
    }

    Impl(const Impl&) = delete;
@@ -46,74 +189,54 @@ public:
    Impl(Impl&&) = delete;
    Impl& operator=(Impl&&) = delete;

-    void PushEntry(Class log_class, Level log_level, const char* filename, unsigned int line_num,
-                   const char* function, std::string message) {
-        message_queue.Push(
-            CreateEntry(log_class, log_level, filename, line_num, function, std::move(message)));
-    }
-
-    void AddBackend(std::unique_ptr<Backend> backend) {
-        std::lock_guard lock{writing_mutex};
-        backends.push_back(std::move(backend));
-    }
-
-    void RemoveBackend(std::string_view backend_name) {
-        std::lock_guard lock{writing_mutex};
-
-        std::erase_if(backends, [&backend_name](const auto& backend) {
-            return backend_name == backend->GetName();
-        });
-    }
-
-    const Filter& GetGlobalFilter() const {
-        return filter;
-    }
-
    void SetGlobalFilter(const Filter& f) {
        filter = f;
    }

-    Backend* GetBackend(std::string_view backend_name) {
-        const auto it =
-            std::find_if(backends.begin(), backends.end(),
-                         [&backend_name](const auto& i) { return backend_name == i->GetName(); });
-        if (it == backends.end())
-            return nullptr;
-        return it->get();
+    void SetColorConsoleBackendEnabled(bool enabled) {
+        color_console_backend.SetEnabled(enabled);
+    }
+
+    void PushEntry(Class log_class, Level log_level, const char* filename, unsigned int line_num,
+                   const char* function, std::string message) {
+        if (!filter.CheckMessage(log_class, log_level))
+            return;
+        const Entry& entry =
+            CreateEntry(log_class, log_level, filename, line_num, function, std::move(message));
+        message_queue.Push(entry);
    }

 private:
-    Impl() {
-        backend_thread = std::thread([&] {
-            Entry entry;
-            auto write_logs = [&](Entry& e) {
-                std::lock_guard lock{writing_mutex};
-                for (const auto& backend : backends) {
-                    backend->Write(e);
-                }
-            };
-            while (true) {
-                entry = message_queue.PopWait();
-                if (entry.final_entry) {
-                    break;
-                }
-                write_logs(entry);
-            }
-
-            // Drain the logging queue. Only writes out up to MAX_LOGS_TO_WRITE to prevent a
-            // case where a system is repeatedly spamming logs even on close.
-            const int MAX_LOGS_TO_WRITE = filter.IsDebug() ? INT_MAX : 100;
-            int logs_written = 0;
-            while (logs_written++ < MAX_LOGS_TO_WRITE && message_queue.Pop(entry)) {
-                write_logs(entry);
-            }
-        });
-    }
+    Impl(const std::filesystem::path& file_backend_filename, const Filter& filter_)
+        : filter{filter_}, file_backend{file_backend_filename}, backend_thread{std::thread([this] {
+              Common::SetCurrentThreadName("yuzu:Log");
+              Entry entry;
+              const auto write_logs = [this, &entry]() {
+                  ForEachBackend([&entry](Backend& backend) { backend.Write(entry); });
+              };
+              while (true) {
+                  entry = message_queue.PopWait();
+                  if (entry.final_entry) {
+                      break;
+                  }
+                  write_logs();
+              }
+              // Drain the logging queue. Only writes out up to MAX_LOGS_TO_WRITE to prevent a
+              // case where a system is repeatedly spamming logs even on close.
+              int max_logs_to_write = filter.IsDebug() ? INT_MAX : 100;
+              while (max_logs_to_write-- && message_queue.Pop(entry)) {
+                  write_logs();
+              }
+          })} {}

    ~Impl() {
-        Entry entry;
-        entry.final_entry = true;
-        message_queue.Push(entry);
+        StopBackendThread();
+    }
+
+    void StopBackendThread() {
+        Entry stop_entry{};
+        stop_entry.final_entry = true;
+        message_queue.Push(stop_entry);
        backend_thread.join();
    }

@@ -135,100 +258,51 @@ private:
        };
    }

-    std::mutex writing_mutex;
-    std::thread backend_thread;
-    std::vector<std::unique_ptr<Backend>> backends;
-    MPSCQueue<Entry> message_queue;
+    void ForEachBackend(auto lambda) {
+        lambda(static_cast<Backend&>(debugger_backend));
+        lambda(static_cast<Backend&>(color_console_backend));
+        lambda(static_cast<Backend&>(file_backend));
+    }
+
+    static void Deleter(Impl* ptr) {
+        delete ptr;
+    }
+
+    static inline std::unique_ptr<Impl, decltype(&Deleter)> instance{nullptr, Deleter};
+
    Filter filter;
+    DebuggerBackend debugger_backend{};
+    ColorConsoleBackend color_console_backend{};
+    FileBackend file_backend;
+    // The queue must be declared (and thus constructed) before the thread that consumes it.
+    MPSCQueue<Entry> message_queue{};
+    std::thread backend_thread;
    std::chrono::steady_clock::time_point time_origin{std::chrono::steady_clock::now()};
 };
+} // namespace

-ConsoleBackend::~ConsoleBackend() = default;
-
-void ConsoleBackend::Write(const Entry& entry) {
-    PrintMessage(entry);
+void Initialize() {
+    Impl::Initialize();
 }

-ColorConsoleBackend::~ColorConsoleBackend() = default;
-
-void ColorConsoleBackend::Write(const Entry& entry) {
-    PrintColoredMessage(entry);
-}
-
-FileBackend::FileBackend(const std::filesystem::path& filename) {
-    auto old_filename = filename;
-    old_filename += ".old.txt";
-
-    // Existence checks are done within the functions themselves.
-    // We don't particularly care if these succeed or not.
-    FS::RemoveFile(old_filename);
-    void(FS::RenameFile(filename, old_filename));
-
-    file =
-        std::make_unique<FS::IOFile>(filename, FS::FileAccessMode::Write, FS::FileType::TextFile);
-}
-
-FileBackend::~FileBackend() = default;
-
-void FileBackend::Write(const Entry& entry) {
-    if (!file->IsOpen()) {
-        return;
-    }
-
-    using namespace Common::Literals;
-    // Prevent logs from exceeding a set maximum size in the event that log entries are spammed.
-    constexpr std::size_t MAX_BYTES_WRITTEN = 100_MiB;
-    constexpr std::size_t MAX_BYTES_WRITTEN_EXTENDED = 1_GiB;
-
-    const bool write_limit_exceeded =
-        bytes_written > MAX_BYTES_WRITTEN_EXTENDED ||
-        (bytes_written > MAX_BYTES_WRITTEN && !Settings::values.extended_logging);
-
-    // Close the file after the write limit is exceeded.
-    if (write_limit_exceeded) {
-        file->Close();
-        return;
-    }
-
-    bytes_written += file->WriteString(FormatLogMessage(entry).append(1, '\n'));
-    if (entry.log_level >= Level::Error) {
-        file->Flush();
-    }
-}
-
-DebuggerBackend::~DebuggerBackend() = default;
-
-void DebuggerBackend::Write(const Entry& entry) {
-#ifdef _WIN32
-    ::OutputDebugStringW(UTF8ToUTF16W(FormatLogMessage(entry).append(1, '\n')).c_str());
-#endif
+void DisableLoggingInTests() {
+    initialization_in_progress_suppress_logging = true;
 }

 void SetGlobalFilter(const Filter& filter) {
    Impl::Instance().SetGlobalFilter(filter);
 }

-void AddBackend(std::unique_ptr<Backend> backend) {
-    Impl::Instance().AddBackend(std::move(backend));
-}
-
-void RemoveBackend(std::string_view backend_name) {
-    Impl::Instance().RemoveBackend(backend_name);
-}
-
-Backend* GetBackend(std::string_view backend_name) {
-    return Impl::Instance().GetBackend(backend_name);
+void SetColorConsoleBackendEnabled(bool enabled) {
+    Impl::Instance().SetColorConsoleBackendEnabled(enabled);
 }

 void FmtLogMessageImpl(Class log_class, Level log_level, const char* filename,
                       unsigned int line_num, const char* function, const char* format,
                       const fmt::format_args& args) {
-    auto& instance = Impl::Instance();
-    const auto& filter = instance.GetGlobalFilter();
-    if (!filter.CheckMessage(log_class, log_level))
-        return;
-
-    instance.PushEntry(log_class, log_level, filename, line_num, function,
-                       fmt::vformat(format, args));
+    if (!initialization_in_progress_suppress_logging) {
+        Impl::Instance().PushEntry(log_class, log_level, filename, line_num, function,
+                                   fmt::vformat(format, args));
+    }
 }
 } // namespace Common::Log
--- a/src/common/logging/backend.h
+++ b/src/common/logging/backend.h
@@ -5,120 +5,21 @@
 #pragma once

 #include <filesystem>
-#include <memory>
-#include <string>
-#include <string_view>
 #include "common/logging/filter.h"
-#include "common/logging/log.h"
-
-namespace Common::FS {
-class IOFile;
-}

 namespace Common::Log {

 class Filter;

-/**
- * Interface for logging backends. As loggers can be created and removed at runtime, this can be
- * used by a frontend for adding a custom logging backend as needed
- */
-class Backend {
-public:
-    virtual ~Backend() = default;
+/// Initializes the logging system. This should be the first thing called in main.
+void Initialize();

-    virtual void SetFilter(const Filter& new_filter) {
-        filter = new_filter;
-    }
-    virtual const char* GetName() const = 0;
-    virtual void Write(const Entry& entry) = 0;
-
-private:
-    Filter filter;
-};
+void DisableLoggingInTests();

 /**
- * Backend that writes to stderr without any color commands
- */
-class ConsoleBackend : public Backend {
-public:
-    ~ConsoleBackend() override;
-
-    static const char* Name() {
-        return "console";
-    }
-    const char* GetName() const override {
-        return Name();
-    }
-    void Write(const Entry& entry) override;
-};
-
-/**
- * Backend that writes to stderr and with color
- */
-class ColorConsoleBackend : public Backend {
-public:
-    ~ColorConsoleBackend() override;
-
-    static const char* Name() {
-        return "color_console";
-    }
-
-    const char* GetName() const override {
-        return Name();
-    }
-    void Write(const Entry& entry) override;
-};
-
-/**
- * Backend that writes to a file passed into the constructor
- */
-class FileBackend : public Backend {
-public:
-    explicit FileBackend(const std::filesystem::path& filename);
-    ~FileBackend() override;
-
-    static const char* Name() {
-        return "file";
-    }
-
-    const char* GetName() const override {
-        return Name();
-    }
-
-    void Write(const Entry& entry) override;
-
-private:
-    std::unique_ptr<FS::IOFile> file;
-    std::size_t bytes_written = 0;
-};
-
-/**
- * Backend that writes to Visual Studio's output window
- */
-class DebuggerBackend : public Backend {
-public:
-    ~DebuggerBackend() override;
-
-    static const char* Name() {
-        return "debugger";
-    }
-    const char* GetName() const override {
-        return Name();
-    }
-    void Write(const Entry& entry) override;
-};
-
-void AddBackend(std::unique_ptr<Backend> backend);
-
-void RemoveBackend(std::string_view backend_name);
-
-Backend* GetBackend(std::string_view backend_name);
-
-/**
- * The global filter will prevent any messages from even being processed if they are filtered. Each
- * backend can have a filter, but if the level is lower than the global filter, the backend will
- * never get the message
+ * The global filter will prevent any messages from even being processed if they are filtered.
 */
 void SetGlobalFilter(const Filter& filter);
-} // namespace Common::Log
+
+void SetColorConsoleBackendEnabled(bool enabled);
+} // namespace Common::Log
--- a/src/common/logging/filter.cpp
+++ b/src/common/logging/filter.cpp
@@ -111,6 +111,7 @@ bool ParseFilterRule(Filter& instance, Iterator begin, Iterator end) {
    SUB(Service, NCM)                                                                              \
    SUB(Service, NFC)                                                                              \
    SUB(Service, NFP)                                                                              \
+    SUB(Service, NGCT)                                                                             \
    SUB(Service, NIFM)                                                                             \
    SUB(Service, NIM)                                                                              \
    SUB(Service, NPNS)                                                                             \
--- a/src/common/logging/types.h
+++ b/src/common/logging/types.h
@@ -81,6 +81,7 @@ enum class Class : u8 {
    Service_NCM,       ///< The NCM service
    Service_NFC,       ///< The NFC (Near-field communication) service
    Service_NFP,       ///< The NFP service
+    Service_NGCT,      ///< The NGCT (No Good Content for Terra) service
    Service_NIFM,      ///< The NIFM (Network interface) service
    Service_NIM,       ///< The NIM service
    Service_NPNS,      ///< The NPNS service
--- a/src/common/lru_cache.h
+++ b/src/common/lru_cache.h
@@ -0,0 +1,161 @@
+// Copyright 2021 yuzu Emulator Project
+// Licensed under GPLv2+ or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <deque>
+#include <memory>
+#include <type_traits>
+
+#include "common/common_types.h"
+
+namespace Common {
+
+/**
+ * Pool of items kept in a doubly linked list that runs from the least recently used item
+ * (front) to the most recently used one (back).
+ *
+ * Traits must provide `ObjectType` (the stored value) and `TickType` (the usage timestamp).
+ * Items are addressed through the ids handed out by Insert; ids of freed items get reused.
+ */
+template <class Traits>
+class LeastRecentlyUsedCache {
+    using ObjectType = typename Traits::ObjectType;
+    using TickType = typename Traits::TickType;
+
+    struct Item {
+        ObjectType obj;
+        TickType tick;
+        Item* next{};
+        Item* prev{};
+    };
+
+public:
+    LeastRecentlyUsedCache() : first_item{}, last_item{} {}
+    ~LeastRecentlyUsedCache() = default;
+
+    /// Stores obj with the given tick at the most recently used end and returns its id.
+    size_t Insert(ObjectType obj, TickType tick) {
+        const auto new_id = Build();
+        auto& item = item_pool[new_id];
+        item.obj = obj;
+        item.tick = tick;
+        Attach(item);
+        return new_id;
+    }
+
+    /// Updates the tick of item `id` and moves it to the most recently used end.
+    /// Does nothing when the stored tick is already greater than or equal to `tick`.
+    void Touch(size_t id, TickType tick) {
+        auto& item = item_pool[id];
+        if (item.tick >= tick) {
+            return;
+        }
+        item.tick = tick;
+        if (&item == last_item) {
+            return;
+        }
+        Detach(item);
+        Attach(item);
+    }
+
+    /// Unlinks item `id` from the list and makes its id available for reuse by Insert.
+    void Free(size_t id) {
+        auto& item = item_pool[id];
+        Detach(item);
+        item.prev = nullptr;
+        item.next = nullptr;
+        free_items.push_back(id);
+    }
+
+    /**
+     * Visits items from the least recently used end while their tick is not newer than `tick`.
+     *
+     * @param tick Iteration stops at the first item whose tick is newer than this value
+     * @param func Called with each visited object. If it returns bool, returning true stops the
+     *             iteration; any other return type is ignored.
+     */
+    template <typename Func>
+    void ForEachItemBelow(TickType tick, Func&& func) {
+        // invoke_result_t yields the callback's return type for the call made below; the bare
+        // invoke_result trait struct is never `bool` and would disable the early-exit branch.
+        static constexpr bool RETURNS_BOOL =
+            std::is_same_v<std::invoke_result_t<Func&, ObjectType&>, bool>;
+        Item* iterator = first_item;
+        while (iterator) {
+            if (static_cast<s64>(tick) - static_cast<s64>(iterator->tick) < 0) {
+                return;
+            }
+            // Fetch the successor first, in case func frees the current item (Free resets links).
+            Item* next = iterator->next;
+            if constexpr (RETURNS_BOOL) {
+                if (func(iterator->obj)) {
+                    return;
+                }
+            } else {
+                func(iterator->obj);
+            }
+            iterator = next;
+        }
+    }
+
+private:
+    /// Returns the id of an unlinked item slot: a recycled one if available, else a new one.
+    size_t Build() {
+        size_t item_id;
+        if (free_items.empty()) {
+            item_id = item_pool.size();
+            item_pool.emplace_back();
+        } else {
+            item_id = free_items.front();
+            free_items.pop_front();
+        }
+        auto& item = item_pool[item_id];
+        item.next = nullptr;
+        item.prev = nullptr;
+        return item_id;
+    }
+
+    /// Appends item at the most recently used end of the list.
+    void Attach(Item& item) {
+        if (!first_item) {
+            first_item = &item;
+        }
+        if (last_item) {
+            item.prev = last_item;
+            item.next = nullptr;
+            last_item->next = &item;
+        }
+        last_item = &item;
+    }
+
+    /// Unlinks item from its neighbours and fixes up the list ends if it was one of them.
+    void Detach(Item& item) {
+        if (item.prev) {
+            item.prev->next = item.next;
+        }
+        if (item.next) {
+            item.next->prev = item.prev;
+        }
+        if (&item == first_item) {
+            first_item = item.next;
+            if (first_item) {
+                first_item->prev = nullptr;
+            }
+        }
+        if (&item == last_item) {
+            last_item = item.prev;
+            if (last_item) {
+                last_item->next = nullptr;
+            }
+        }
+    }
+
+    std::deque<Item> item_pool;
+    std::deque<size_t> free_items;
+    Item* first_item{};
+    Item* last_item{};
+};
+
+} // namespace Common
--- a/src/common/settings.cpp
+++ b/src/common/settings.cpp
@@ -59,7 +59,6 @@ void LogSettings() {
    log_setting("Renderer_UseVsync", values.use_vsync.GetValue());
    log_setting("Renderer_ShaderBackend", values.shader_backend.GetValue());
    log_setting("Renderer_UseAsynchronousShaders", values.use_asynchronous_shaders.GetValue());
-    log_setting("Renderer_UseGarbageCollection", values.use_caches_gc.GetValue());
    log_setting("Renderer_AnisotropicFilteringLevel", values.max_anisotropy.GetValue());
    log_setting("Audio_OutputEngine", values.sink_id.GetValue());
    log_setting("Audio_EnableAudioStretching", values.enable_audio_stretching.GetValue());
@@ -143,7 +142,6 @@ void RestoreGlobalState(bool is_powered_on) {
    values.shader_backend.SetGlobal(true);
    values.use_asynchronous_shaders.SetGlobal(true);
    values.use_fast_gpu_time.SetGlobal(true);
-    values.use_caches_gc.SetGlobal(true);
    values.bg_red.SetGlobal(true);
    values.bg_green.SetGlobal(true);
    values.bg_blue.SetGlobal(true);
--- a/src/common/settings.h
+++ b/src/common/settings.h
@@ -4,6 +4,7 @@

 #pragma once

+#include <algorithm>
 #include <array>
 #include <atomic>
 #include <chrono>
@@ -74,14 +75,14 @@ public:
     */
    explicit BasicSetting(const Type& default_val, const std::string& name)
        : default_value{default_val}, global{default_val}, label{name} {}
-    ~BasicSetting() = default;
+    virtual ~BasicSetting() = default;

    /**
     *  Returns a reference to the setting's value.
     *
     * @returns A reference to the setting
     */
-    [[nodiscard]] const Type& GetValue() const {
+    [[nodiscard]] virtual const Type& GetValue() const {
        return global;
    }

@@ -90,7 +91,7 @@ public:
     *
     * @param value The desired value
     */
-    void SetValue(const Type& value) {
+    virtual void SetValue(const Type& value) {
        Type temp{value};
        std::swap(global, temp);
    }
@@ -120,7 +121,7 @@ public:
     *
     * @returns A reference to the setting
     */
-    const Type& operator=(const Type& value) {
+    virtual const Type& operator=(const Type& value) {
        Type temp{value};
        std::swap(global, temp);
        return global;
@@ -131,7 +132,7 @@ public:
     *
     * @returns A reference to the setting
     */
-    explicit operator const Type&() const {
+    explicit virtual operator const Type&() const {
        return global;
    }

@@ -141,6 +142,51 @@ protected:
    const std::string label{};  ///< The setting's label
 };

+/**
+ * BasicRangedSetting class is intended for use with quantifiable settings that need a more
+ * restrictive range than implicitly defined by its type. Implements a minimum and maximum that are
+ * used to sanitize SetValue and the assignment overload.
+ *
+ * Only values written through SetValue or the assignment overload are clamped; the default value
+ * passed to the constructor is stored as given.
+ */
+template <typename Type>
+class BasicRangedSetting : virtual public BasicSetting<Type> {
+public:
+    /**
+     * Sets a default value, minimum value, maximum value, and label.
+     *
+     * @param default_val Initial value of the setting, and default value of the setting
+     * @param min_val Sets the minimum allowed value of the setting
+     * @param max_val Sets the maximum allowed value of the setting (std::clamp requires that
+     *                max_val is not less than min_val)
+     * @param name Label for the setting
+     */
+    explicit BasicRangedSetting(const Type& default_val, const Type& min_val, const Type& max_val,
+                                const std::string& name)
+        : BasicSetting<Type>{default_val, name}, minimum{min_val}, maximum{max_val} {}
+    virtual ~BasicRangedSetting() = default;
+
+    /**
+     * Like BasicSetting's SetValue, except value is clamped to the range of the setting.
+     *
+     * @param value The desired value
+     */
+    void SetValue(const Type& value) override {
+        this->global = std::clamp(value, minimum, maximum);
+    }
+
+    /**
+     * Like BasicSetting's assignment overload, except value is clamped to the range of the setting.
+     *
+     * @param value The desired value
+     * @returns A reference to the setting's value
+     */
+    const Type& operator=(const Type& value) override {
+        this->global = std::clamp(value, minimum, maximum);
+        return this->global;
+    }
+
+    const Type minimum; ///< Minimum allowed value of the setting
+    const Type maximum; ///< Maximum allowed value of the setting
+};
+
 /**
 * The Setting class is a slightly more complex version of the BasicSetting class. This adds a
 * custom setting to switch to when a guest application specifically requires it. The effect is that
@@ -152,7 +198,7 @@ protected:
 * Like the BasicSetting, this requires setting a default value and label to use.
 */
 template <typename Type>
-class Setting final : public BasicSetting<Type> {
+class Setting : virtual public BasicSetting<Type> {
 public:
    /**
     * Sets a default value, label, and setting value.
@@ -162,7 +208,7 @@ public:
     */
    explicit Setting(const Type& default_val, const std::string& name)
        : BasicSetting<Type>(default_val, name) {}
-    ~Setting() = default;
+    virtual ~Setting() = default;

    /**
     * Tells this setting to represent either the global or custom setting when other member
@@ -191,7 +237,13 @@ public:
     *
     * @returns The required value of the setting
     */
-    [[nodiscard]] const Type& GetValue(bool need_global = false) const {
+    [[nodiscard]] virtual const Type& GetValue() const override {
+        if (use_global) {
+            return this->global;
+        }
+        return custom;
+    }
+    [[nodiscard]] virtual const Type& GetValue(bool need_global) const {
        if (use_global || need_global) {
            return this->global;
        }
@@ -203,7 +255,7 @@ public:
     *
     * @param value The new value
     */
-    void SetValue(const Type& value) {
+    void SetValue(const Type& value) override {
        Type temp{value};
        if (use_global) {
            std::swap(this->global, temp);
@@ -219,7 +271,7 @@ public:
     *
     * @returns A reference to the current setting value
     */
-    const Type& operator=(const Type& value) {
+    const Type& operator=(const Type& value) override {
        Type temp{value};
        if (use_global) {
            std::swap(this->global, temp);
@@ -234,18 +286,87 @@ public:
     *
     * @returns A reference to the current setting value
     */
-    explicit operator const Type&() const {
+    virtual explicit operator const Type&() const override {
        if (use_global) {
            return this->global;
        }
        return custom;
    }

-private:
+protected:
    bool use_global{true}; ///< The setting's global state
    Type custom{};         ///< The custom value of the setting
 };

+/**
+ * RangedSetting is a Setting that implements a maximum and minimum value for its setting. Intended
+ * for use with quantifiable settings.
+ *
+ * It combines BasicRangedSetting (which supplies the range) with Setting (which supplies the
+ * global/custom switch); both share a single virtual BasicSetting base.
+ */
+template <typename Type>
+class RangedSetting final : public BasicRangedSetting<Type>, public Setting<Type> {
+public:
+    /**
+     * Sets a default value, minimum value, maximum value, and label.
+     *
+     * @param default_val Initial value of the setting, and default value of the setting
+     * @param min_val Sets the minimum allowed value of the setting
+     * @param max_val Sets the maximum allowed value of the setting
+     * @param name Label for the setting
+     */
+    explicit RangedSetting(const Type& default_val, const Type& min_val, const Type& max_val,
+                           const std::string& name)
+        // BasicSetting is a virtual base, so the most derived class has to initialize it itself.
+        : BasicSetting<Type>{default_val, name},
+          BasicRangedSetting<Type>{default_val, min_val, max_val, name},
+          Setting<Type>{default_val, name} {}
+    virtual ~RangedSetting() = default;
+
+    // The following are needed to avoid a MSVC bug
+    // (source: https://stackoverflow.com/questions/469508)
+    [[nodiscard]] const Type& GetValue() const override {
+        return Setting<Type>::GetValue();
+    }
+    [[nodiscard]] const Type& GetValue(bool need_global) const override {
+        return Setting<Type>::GetValue(need_global);
+    }
+    explicit operator const Type&() const override {
+        if (this->use_global) {
+            return this->global;
+        }
+        return this->custom;
+    }
+
+    /**
+     * Like BasicSetting's SetValue, except value is clamped to the range of the setting. Sets the
+     * appropriate value depending on the global state: the global value while the global state is
+     * in use, the custom value otherwise. The other value is left untouched.
+     *
+     * @param value The desired value
+     */
+    void SetValue(const Type& value) override {
+        const Type temp = std::clamp(value, this->minimum, this->maximum);
+        if (this->use_global) {
+            this->global = temp;
+            // Return here so a global write does not also overwrite the custom value; this
+            // mirrors the assignment overload below.
+            return;
+        }
+        this->custom = temp;
+    }
+
+    /**
+     * Like BasicSetting's assignment overload, except value is clamped to the range of the setting.
+     * Uses the appropriate value depending on the global state.
+     *
+     * @param value The desired value
+     * @returns A reference to the setting's value
+     */
+    const Type& operator=(const Type& value) override {
+        const Type temp = std::clamp(value, this->minimum, this->maximum);
+        if (this->use_global) {
+            this->global = temp;
+            return this->global;
+        }
+        this->custom = temp;
+        return this->custom;
+    }
+};
+
 /**
 * The InputSetting class allows for getting a reference to either the global or custom members.
 * This is required as we cannot easily modify the values of user-defined types within containers
@@ -289,13 +410,14 @@ struct Values {
    BasicSetting<std::string> sink_id{"auto", "output_engine"};
    BasicSetting<bool> audio_muted{false, "audio_muted"};
    Setting<bool> enable_audio_stretching{true, "enable_audio_stretching"};
-    Setting<u8> volume{100, "volume"};
+    RangedSetting<u8> volume{100, 0, 100, "volume"};

    // Core
    Setting<bool> use_multi_core{true, "use_multi_core"};

    // Cpu
-    Setting<CPUAccuracy> cpu_accuracy{CPUAccuracy::Auto, "cpu_accuracy"};
+    RangedSetting<CPUAccuracy> cpu_accuracy{CPUAccuracy::Auto, CPUAccuracy::Auto,
+                                            CPUAccuracy::Unsafe, "cpu_accuracy"};
    // TODO: remove cpu_accuracy_first_time, migration setting added 8 July 2021
    BasicSetting<bool> cpu_accuracy_first_time{true, "cpu_accuracy_first_time"};
    BasicSetting<bool> cpu_debug_mode{false, "cpu_debug_mode"};
@@ -317,7 +439,8 @@ struct Values {
    Setting<bool> cpuopt_unsafe_fastmem_check{true, "cpuopt_unsafe_fastmem_check"};

    // Renderer
-    Setting<RendererBackend> renderer_backend{RendererBackend::OpenGL, "backend"};
+    RangedSetting<RendererBackend> renderer_backend{
+        RendererBackend::OpenGL, RendererBackend::OpenGL, RendererBackend::Vulkan, "backend"};
    BasicSetting<bool> renderer_debug{false, "debug"};
    BasicSetting<bool> renderer_shader_feedback{false, "shader_feedback"};
    BasicSetting<bool> enable_nsight_aftermath{false, "nsight_aftermath"};
@@ -328,29 +451,30 @@ struct Values {
    Setting<u16> resolution_factor{1, "resolution_factor"};
    // *nix platforms may have issues with the borderless windowed fullscreen mode.
    // Default to exclusive fullscreen on these platforms for now.
-    Setting<FullscreenMode> fullscreen_mode{
+    RangedSetting<FullscreenMode> fullscreen_mode{
 #ifdef _WIN32
        FullscreenMode::Borderless,
 #else
        FullscreenMode::Exclusive,
 #endif
-        "fullscreen_mode"};
-    Setting<int> aspect_ratio{0, "aspect_ratio"};
-    Setting<int> max_anisotropy{0, "max_anisotropy"};
+        FullscreenMode::Borderless, FullscreenMode::Exclusive, "fullscreen_mode"};
+    RangedSetting<int> aspect_ratio{0, 0, 3, "aspect_ratio"};
+    RangedSetting<int> max_anisotropy{0, 0, 4, "max_anisotropy"};
    Setting<bool> use_speed_limit{true, "use_speed_limit"};
-    Setting<u16> speed_limit{100, "speed_limit"};
+    RangedSetting<u16> speed_limit{100, 0, 9999, "speed_limit"};
    Setting<bool> use_disk_shader_cache{true, "use_disk_shader_cache"};
-    Setting<GPUAccuracy> gpu_accuracy{GPUAccuracy::High, "gpu_accuracy"};
+    RangedSetting<GPUAccuracy> gpu_accuracy{GPUAccuracy::High, GPUAccuracy::Normal,
+                                            GPUAccuracy::Extreme, "gpu_accuracy"};
    Setting<bool> use_asynchronous_gpu_emulation{true, "use_asynchronous_gpu_emulation"};
    Setting<bool> use_nvdec_emulation{true, "use_nvdec_emulation"};
    Setting<bool> accelerate_astc{true, "accelerate_astc"};
    Setting<bool> use_vsync{true, "use_vsync"};
-    BasicSetting<u16> fps_cap{1000, "fps_cap"};
+    BasicRangedSetting<u16> fps_cap{1000, 1, 1000, "fps_cap"};
    BasicSetting<bool> disable_fps_limit{false, "disable_fps_limit"};
-    Setting<ShaderBackend> shader_backend{ShaderBackend::GLASM, "shader_backend"};
+    RangedSetting<ShaderBackend> shader_backend{ShaderBackend::GLASM, ShaderBackend::GLSL,
+                                                ShaderBackend::SPIRV, "shader_backend"};
    Setting<bool> use_asynchronous_shaders{false, "use_asynchronous_shaders"};
    Setting<bool> use_fast_gpu_time{true, "use_fast_gpu_time"};
-    Setting<bool> use_caches_gc{false, "use_caches_gc"};

    Setting<u8> bg_red{0, "bg_red"};
    Setting<u8> bg_green{0, "bg_green"};
@@ -364,10 +488,10 @@ struct Values {
    std::chrono::seconds custom_rtc_differential;

    BasicSetting<s32> current_user{0, "current_user"};
-    Setting<s32> language_index{1, "language_index"};
-    Setting<s32> region_index{1, "region_index"};
-    Setting<s32> time_zone_index{0, "time_zone_index"};
-    Setting<s32> sound_index{1, "sound_index"};
+    RangedSetting<s32> language_index{1, 0, 17, "language_index"};
+    RangedSetting<s32> region_index{1, 0, 6, "region_index"};
+    RangedSetting<s32> time_zone_index{0, 0, 45, "time_zone_index"};
+    RangedSetting<s32> sound_index{1, 0, 2, "sound_index"};

    // Controls
    InputSetting<std::array<PlayerInput, 10>> players;
@@ -384,7 +508,7 @@ struct Values {
                                                "udp_input_servers"};

    BasicSetting<bool> mouse_panning{false, "mouse_panning"};
-    BasicSetting<u8> mouse_panning_sensitivity{10, "mouse_panning_sensitivity"};
+    BasicRangedSetting<u8> mouse_panning_sensitivity{10, 1, 100, "mouse_panning_sensitivity"};
    BasicSetting<bool> mouse_enabled{false, "mouse_enabled"};
    std::string mouse_device;
    MouseButtonsRaw mouse_buttons;
@@ -433,9 +557,10 @@ struct Values {
    BasicSetting<std::string> log_filter{"*:Info", "log_filter"};
    BasicSetting<bool> use_dev_keys{false, "use_dev_keys"};

-    // Services
+    // Network
    BasicSetting<std::string> bcat_backend{"none", "bcat_backend"};
    BasicSetting<bool> bcat_boxcat_local{false, "bcat_boxcat_local"};
+    BasicSetting<std::string> network_interface{std::string(), "network_interface"};

    // WebService
    BasicSetting<bool> enable_telemetry{true, "enable_telemetry"};
--- a/src/common/threadsafe_queue.h
+++ b/src/common/threadsafe_queue.h
@@ -46,15 +46,13 @@ public:
        ElementPtr* new_ptr = new ElementPtr();
        write_ptr->next.store(new_ptr, std::memory_order_release);
        write_ptr = new_ptr;
+        ++size;

-        const size_t previous_size{size++};
-
-        // Acquire the mutex and then immediately release it as a fence.
+        // cv_mutex must be held or else there will be a missed wakeup if the other thread is in the
+        // line before cv.wait
        // TODO(bunnei): This can be replaced with C++20 waitable atomics when properly supported.
        // See discussion on https://github.com/yuzu-emu/yuzu/pull/3173 for details.
-        if (previous_size == 0) {
-            std::lock_guard lock{cv_mutex};
-        }
+        std::lock_guard lock{cv_mutex};
        cv.notify_one();
    }

--- a/src/common/uuid.cpp
+++ b/src/common/uuid.cpp
@@ -6,10 +6,64 @@

 #include <fmt/format.h>

+#include "common/assert.h"
 #include "common/uuid.h"

 namespace Common {

+namespace {
+
+/// Returns true when c is one of the characters 0-9, a-f or A-F.
+bool IsHexDigit(char c) {
+    const bool is_decimal = '0' <= c && c <= '9';
+    const bool is_lower_hex = 'a' <= c && c <= 'f';
+    const bool is_upper_hex = 'A' <= c && c <= 'F';
+    return is_decimal || is_lower_hex || is_upper_hex;
+}
+
+/// Converts a single hexadecimal character to its numeric value (0-15).
+/// Asserts and returns 0 when c is not a hexadecimal digit.
+u8 HexCharToByte(char c) {
+    // -1 marks "no match yet"; every valid digit maps into 0..15.
+    int nibble = -1;
+    if ('0' <= c && c <= '9') {
+        nibble = c - '0';
+    } else if ('a' <= c && c <= 'f') {
+        nibble = c - 'a' + 10;
+    } else if ('A' <= c && c <= 'F') {
+        nibble = c - 'A' + 10;
+    }
+
+    if (nibble < 0) {
+        ASSERT_MSG(false, "{} is not a hexadecimal digit!", c);
+        return u8{0};
+    }
+    return static_cast<u8>(nibble);
+}
+
+} // Anonymous namespace
+
+u128 HexStringToU128(std::string_view hex_string) {
+    // Strip a leading "0x" so that only the digits themselves remain. The prefix is only
+    // recognized when at least one character follows it.
+    std::string_view digits = hex_string;
+    if (digits.length() > 2 && digits[0] == '0' && digits[1] == 'x') {
+        digits.remove_prefix(2);
+    }
+
+    // 128 bits hold at most 32 hexadecimal digits.
+    if (digits.length() > 32) {
+        ASSERT_MSG(false, "hex_string has more than 32 hexadecimal characters!");
+        return INVALID_UUID;
+    }
+
+    u64 lo = 0;
+    u64 hi = 0;
+    // Walk from the least significant digit (end of the string) towards the most significant one.
+    size_t nibble_index = 0;
+    for (auto it = digits.rbegin(); it != digits.rend(); ++it, ++nibble_index) {
+        const char digit = *it;
+        if (!IsHexDigit(digit)) {
+            ASSERT_MSG(false, "{} is not a hexadecimal digit!", digit);
+            return INVALID_UUID;
+        }
+
+        const u64 nibble = HexCharToByte(digit);
+        if (nibble_index < 16) {
+            // The first 16 digits fill the low 64 bits.
+            lo |= nibble << (nibble_index * 4);
+        } else {
+            // The remaining digits fill the high 64 bits.
+            hi |= nibble << ((nibble_index - 16) * 4);
+        }
+    }
+    return u128{lo, hi};
+}
+
 UUID UUID::Generate() {
    std::random_device device;
    std::mt19937 gen(device());
--- a/src/common/uuid.h
+++ b/src/common/uuid.h
@@ -5,6 +5,7 @@
 #pragma once

 #include <string>
+#include <string_view>

 #include "common/common_types.h"

@@ -12,12 +13,30 @@ namespace Common {

 constexpr u128 INVALID_UUID{{0, 0}};

+/**
+ * Converts a hex string to a 128-bit unsigned integer.
+ *
+ * The hex string can be formatted in lowercase or uppercase, with or without the "0x" prefix.
+ *
+ * This function will assert and return INVALID_UUID under the following conditions:
+ * - If the hex string is more than 32 characters long
+ * - If the hex string contains non-hexadecimal characters
+ *
+ * @param hex_string Hexadecimal string
+ *
+ * @returns A 128-bit unsigned integer if successfully converted, INVALID_UUID otherwise.
+ */
+[[nodiscard]] u128 HexStringToU128(std::string_view hex_string);
+
 struct UUID {
    // UUIDs which are 0 are considered invalid!
    u128 uuid;
    UUID() = default;
    constexpr explicit UUID(const u128& id) : uuid{id} {}
    constexpr explicit UUID(const u64 lo, const u64 hi) : uuid{{lo, hi}} {}
+    /// Builds a UUID from a hexadecimal string; see HexStringToU128 for the accepted format.
+    explicit UUID(std::string_view hex_string) : uuid{HexStringToU128(hex_string)} {}

    [[nodiscard]] constexpr explicit operator bool() const {
        return uuid != INVALID_UUID;
@@ -50,3 +69,14 @@ struct UUID {
 static_assert(sizeof(UUID) == 16, "UUID is an invalid size!");

 } // namespace Common
+
+namespace std {
+
+/// std::hash specialization for Common::UUID: XORs the two 64-bit words of the UUID.
+template <>
+struct hash<Common::UUID> {
+    size_t operator()(const Common::UUID& uuid) const noexcept {
+        const auto& words = uuid.uuid;
+        return words[0] ^ words[1];
+    }
+};
+
+} // namespace std
--- a/src/common/x64/xbyak_abi.h
+++ b/src/common/x64/xbyak_abi.h
@@ -6,7 +6,7 @@

 #include <bitset>
 #include <initializer_list>
-#include <xbyak.h>
+#include <xbyak/xbyak.h>
 #include "common/assert.h"

 namespace Common::X64 {
--- a/src/common/x64/xbyak_util.h
+++ b/src/common/x64/xbyak_util.h
@@ -5,7 +5,7 @@
 #pragma once

 #include <type_traits>
-#include <xbyak.h>
+#include <xbyak/xbyak.h>
 #include "common/x64/xbyak_abi.h"

 namespace Common::X64 {
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -452,6 +452,8 @@ add_library(core STATIC
    hle/service/nfp/nfp.h
    hle/service/nfp/nfp_user.cpp
    hle/service/nfp/nfp_user.h
+    hle/service/ngct/ngct.cpp
+    hle/service/ngct/ngct.h
    hle/service/nifm/nifm.cpp
    hle/service/nifm/nifm.h
    hle/service/nim/nim.cpp
@@ -636,6 +638,8 @@ add_library(core STATIC
    memory.h
    network/network.cpp
    network/network.h
+    network/network_interface.cpp
+    network/network_interface.h
    network/sockets.h
    perf_stats.cpp
    perf_stats.h
--- a/src/core/core.cpp
+++ b/src/core/core.cpp
@@ -4,6 +4,7 @@

 #include <array>
 #include <atomic>
+#include <exception>
 #include <memory>
+#include <stdexcept>
 #include <utility>

@@ -84,8 +85,6 @@ FileSys::StorageId GetStorageIdForFrontendSlot(

 } // Anonymous namespace

-/*static*/ System System::s_instance;
-
 FileSys::VirtualFile GetGameFileFromPath(const FileSys::VirtualFilesystem& vfs,
                                         const std::string& path) {
    // To account for split 00+01+etc files.
@@ -425,6 +424,20 @@ struct System::Impl {
 System::System() : impl{std::make_unique<Impl>(*this)} {}
 System::~System() = default;

+/// Returns the global System instance.
+/// Throws std::runtime_error (declared in <stdexcept>) when the instance has not been created
+/// with InitializeGlobalInstance() yet.
+System& System::GetInstance() {
+    if (!s_instance) {
+        throw std::runtime_error("Using System instance before its initialization");
+    }
+    return *s_instance;
+}
+
+/// Creates the global System instance.
+/// Throws std::runtime_error when the instance already exists.
+void System::InitializeGlobalInstance() {
+    if (s_instance) {
+        throw std::runtime_error("Reinitializing Global System instance.");
+    }
+    // NOTE(review): presumably `new` is used instead of std::make_unique because the System
+    // constructor is not publicly accessible - confirm against core.h.
+    s_instance = std::unique_ptr<System>(new System);
+}
+
 CpuManager& System::GetCpuManager() {
    return impl->cpu_manager;
 }
--- a/src/core/core.h
+++ b/src/core/core.h
@@ -120,9 +120,9 @@ public:
     * Gets the instance of the System singleton class.
     * @returns Reference to the instance of the System singleton class.
     */
-    [[deprecated("Use of the global system instance is deprecated")]] static System& GetInstance() {
-        return s_instance;
-    }
+    [[deprecated("Use of the global system instance is deprecated")]] static System& GetInstance();
+
+    static void InitializeGlobalInstance();

    /// Enumeration representing the return values of the System Initialize and Load process.
    enum class ResultStatus : u32 {
@@ -396,7 +396,7 @@ private:
    struct Impl;
    std::unique_ptr<Impl> impl;

-    static System s_instance;
+    inline static std::unique_ptr<System> s_instance{};
 };

 } // namespace Core
--- a/src/core/hle/kernel/kernel.cpp
+++ b/src/core/hle/kernel/kernel.cpp
@@ -267,20 +267,23 @@ struct KernelCore::Impl {
        }
    }

-    /// Creates a new host thread ID, should only be called by GetHostThreadId
-    u32 AllocateHostThreadId(std::optional<std::size_t> core_id) {
-        if (core_id) {
-            // The first for slots are reserved for CPU core threads
-            ASSERT(*core_id < Core::Hardware::NUM_CPU_CORES);
-            return static_cast<u32>(*core_id);
-        } else {
-            return next_host_thread_id++;
+    static inline thread_local u32 host_thread_id = UINT32_MAX;
+
+    /// Gets the host thread ID for the caller, allocating a new one if this is the first time
+    u32 GetHostThreadId(std::size_t core_id) {
+        if (host_thread_id == UINT32_MAX) {
+            // The first four slots are reserved for CPU core threads
+            ASSERT(core_id < Core::Hardware::NUM_CPU_CORES);
+            host_thread_id = static_cast<u32>(core_id);
        }
+        return host_thread_id;
    }

    /// Gets the host thread ID for the caller, allocating a new one if this is the first time
-    u32 GetHostThreadId(std::optional<std::size_t> core_id = std::nullopt) {
-        const thread_local auto host_thread_id{AllocateHostThreadId(core_id)};
+    u32 GetHostThreadId() {
+        if (host_thread_id == UINT32_MAX) {
+            host_thread_id = next_host_thread_id++;
+        }
        return host_thread_id;
    }

--- a/src/core/hle/kernel/svc.cpp
+++ b/src/core/hle/kernel/svc.cpp
@@ -1078,8 +1078,8 @@ static ResultCode GetThreadContext(Core::System& system, VAddr out_context, Hand
            for (auto i = 0; i < static_cast<s32>(Core::Hardware::NUM_CPU_CORES); ++i) {
                if (thread.GetPointerUnsafe() == kernel.Scheduler(i).GetCurrentThread()) {
                    current = true;
+                    break;
                }
-                break;
            }

            // If the thread is current, retry until it isn't.
--- a/src/core/hle/service/am/applets/applet_error.cpp
+++ b/src/core/hle/service/am/applets/applet_error.cpp
@@ -16,6 +16,30 @@

 namespace Service::AM::Applets {

+/// Error code as a pair of 32-bit values: a category and a number.
+struct ErrorCode {
+    u32 error_category{};
+    u32 error_number{};
+
+    /// Splits a 64-bit value: the upper 32 bits become the category, the lower 32 bits the number.
+    static constexpr ErrorCode FromU64(u64 error_code) {
+        const auto upper_half = static_cast<u32>(error_code >> 32);
+        const auto lower_half = static_cast<u32>(error_code & 0xFFFFFFFF);
+        return ErrorCode{upper_half, lower_half};
+    }
+
+    /// Builds an ErrorCode from a ResultCode: category is 2000 + module, number is the description.
+    static constexpr ErrorCode FromResultCode(ResultCode result) {
+        const u32 category = 2000 + static_cast<u32>(result.module.Value());
+        const u32 number = result.description.Value();
+        return ErrorCode{category, number};
+    }
+
+    /// Inverse of FromResultCode: subtracts the 2000 offset from the category to get the module.
+    constexpr ResultCode ToResultCode() const {
+        const auto error_module = static_cast<ErrorModule>(error_category - 2000);
+        return ResultCode{error_module, error_number};
+    }
+};
+static_assert(sizeof(ErrorCode) == 0x8, "ErrorCode has incorrect size.");
+
 #pragma pack(push, 4)
 struct ShowError {
    u8 mode;
@@ -76,12 +100,7 @@ void CopyArgumentData(const std::vector<u8>& data, T& variable) {
 }

 ResultCode Decode64BitError(u64 error) {
-    const auto description = (error >> 32) & 0x1FFF;
-    auto module = error & 0x3FF;
-    if (module >= 2000)
-        module -= 2000;
-    module &= 0x1FF;
-    return {static_cast<ErrorModule>(module), static_cast<u32>(description)};
+    return ErrorCode::FromU64(error).ToResultCode();
 }

 } // Anonymous namespace
--- a/src/core/hle/service/am/applets/applet_software_keyboard.cpp
+++ b/src/core/hle/service/am/applets/applet_software_keyboard.cpp
@@ -377,7 +377,8 @@ void SoftwareKeyboard::SubmitForTextCheck(std::u16string submitted_text) {

    if (swkbd_config_common.use_utf8) {
        std::string utf8_submitted_text = Common::UTF16ToUTF8(current_text);
-        const u64 buffer_size = utf8_submitted_text.size();
+        // Include the null terminator in the buffer size.
+        const u64 buffer_size = utf8_submitted_text.size() + 1;

        LOG_DEBUG(Service_AM, "\nBuffer Size: {}\nUTF-8 Submitted Text: {}", buffer_size,
                  utf8_submitted_text);
@@ -386,7 +387,8 @@ void SoftwareKeyboard::SubmitForTextCheck(std::u16string submitted_text) {
        std::memcpy(out_data.data() + sizeof(u64), utf8_submitted_text.data(),
                    utf8_submitted_text.size());
    } else {
-        const u64 buffer_size = current_text.size() * sizeof(char16_t);
+        // Include the null terminator in the buffer size.
+        const u64 buffer_size = (current_text.size() + 1) * sizeof(char16_t);

        LOG_DEBUG(Service_AM, "\nBuffer Size: {}\nUTF-16 Submitted Text: {}", buffer_size,
                  Common::UTF16ToUTF8(current_text));
--- a/src/core/hle/service/ngct/ngct.cpp
+++ b/src/core/hle/service/ngct/ngct.cpp
@@ -0,0 +1,46 @@
+// Copyright 2021 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included
+
+#include "common/string_util.h"
+#include "core/core.h"
+#include "core/hle/ipc_helpers.h"
+#include "core/hle/service/ngct/ngct.h"
+#include "core/hle/service/service.h"
+
+namespace Service::NGCT {
+
+/// Implementation of the "ngct:u" service interface.
+class IService final : public ServiceFramework<IService> {
+public:
+    explicit IService(Core::System& system_) : ServiceFramework{system_, "ngct:u"} {
+        // Command table: ID 0 ("Match") has no handler registered, ID 1 is routed to Filter.
+        // clang-format off
+        static const FunctionInfo functions[] = {
+            {0, nullptr, "Match"},
+            {1, &IService::Filter, "Filter"},
+        };
+        // clang-format on
+
+        RegisterHandlers(functions);
+    }
+
+private:
+    /// Stubbed text filter: logs the request's text and writes the input back unchanged.
+    void Filter(Kernel::HLERequestContext& ctx) {
+        const auto buffer = ctx.ReadBuffer();
+        // The text is only decoded for the log message below.
+        const auto text = Common::StringFromFixedZeroTerminatedBuffer(
+            reinterpret_cast<const char*>(buffer.data()), buffer.size());
+
+        LOG_WARNING(Service_NGCT, "(STUBBED) called, text={}", text);
+
+        // Return the same string since we don't censor anything
+        ctx.WriteBuffer(buffer);
+
+        IPC::ResponseBuilder rb{ctx, 2};
+        rb.Push(ResultSuccess);
+    }
+};
+
+/// Registers the "ngct:u" service with the specified service manager.
+void InstallInterfaces(SM::ServiceManager& service_manager, Core::System& system) {
+    // Install into the manager supplied by the caller instead of ignoring the parameter and
+    // fetching one from the system.
+    std::make_shared<IService>(system)->InstallAsService(service_manager);
+}
+
+} // namespace Service::NGCT
--- a/src/core/hle/service/ngct/ngct.h
+++ b/src/core/hle/service/ngct/ngct.h
@@ -0,0 +1,20 @@
+// Copyright 2021 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included
+
+#pragma once
+
+namespace Core {
+class System;
+}
+
+namespace Service::SM {
+class ServiceManager;
+}
+
+namespace Service::NGCT {
+
+/// Registers all NGCT services with the specified service manager.
+void InstallInterfaces(SM::ServiceManager& service_manager, Core::System& system);
+
+} // namespace Service::NGCT
--- a/src/core/hle/service/nifm/nifm.cpp
+++ b/src/core/hle/service/nifm/nifm.cpp
@@ -11,6 +11,7 @@
 #include "core/hle/service/nifm/nifm.h"
 #include "core/hle/service/service.h"
 #include "core/network/network.h"
+#include "core/network/network_interface.h"

 namespace Service::NIFM {

@@ -179,10 +180,10 @@ private:
        IPC::ResponseBuilder rb{ctx, 3};
        rb.Push(ResultSuccess);

-        if (Settings::values.bcat_backend.GetValue() == "none") {
-            rb.PushEnum(RequestState::NotSubmitted);
-        } else {
+        if (Network::GetHostIPv4Address().has_value()) {
            rb.PushEnum(RequestState::Connected);
+        } else {
+            rb.PushEnum(RequestState::NotSubmitted);
        }
    }

@@ -276,37 +277,45 @@ private:
    void GetCurrentNetworkProfile(Kernel::HLERequestContext& ctx) {
        LOG_WARNING(Service_NIFM, "(STUBBED) called");

-        const SfNetworkProfileData network_profile_data{
-            .ip_setting_data{
-                .ip_address_setting{
-                    .is_automatic{true},
-                    .current_address{192, 168, 1, 100},
-                    .subnet_mask{255, 255, 255, 0},
-                    .gateway{192, 168, 1, 1},
+        const auto net_iface = Network::GetSelectedNetworkInterface();
+
+        const SfNetworkProfileData network_profile_data = [&net_iface] {
+            if (!net_iface) {
+                return SfNetworkProfileData{};
+            }
+
+            return SfNetworkProfileData{
+                .ip_setting_data{
+                    .ip_address_setting{
+                        .is_automatic{true},
+                        .current_address{Network::TranslateIPv4(net_iface->ip_address)},
+                        .subnet_mask{Network::TranslateIPv4(net_iface->subnet_mask)},
+                        .gateway{Network::TranslateIPv4(net_iface->gateway)},
+                    },
+                    .dns_setting{
+                        .is_automatic{true},
+                        .primary_dns{1, 1, 1, 1},
+                        .secondary_dns{1, 0, 0, 1},
+                    },
+                    .proxy_setting{
+                        .enabled{false},
+                        .port{},
+                        .proxy_server{},
+                        .automatic_auth_enabled{},
+                        .user{},
+                        .password{},
+                    },
+                    .mtu{1500},
                },
-                .dns_setting{
-                    .is_automatic{true},
-                    .primary_dns{1, 1, 1, 1},
-                    .secondary_dns{1, 0, 0, 1},
+                .uuid{0xdeadbeef, 0xdeadbeef},
+                .network_name{"yuzu Network"},
+                .wireless_setting_data{
+                    .ssid_length{12},
+                    .ssid{"yuzu Network"},
+                    .passphrase{"yuzupassword"},
                },
-                .proxy_setting{
-                    .enabled{false},
-                    .port{},
-                    .proxy_server{},
-                    .automatic_auth_enabled{},
-                    .user{},
-                    .password{},
-                },
-                .mtu{1500},
-            },
-            .uuid{0xdeadbeef, 0xdeadbeef},
-            .network_name{"yuzu Network"},
-            .wireless_setting_data{
-                .ssid_length{12},
-                .ssid{"yuzu Network"},
-                .passphrase{"yuzupassword"},
-            },
-        };
+            };
+        }();

        ctx.WriteBuffer(network_profile_data);

@@ -322,12 +331,15 @@ private:
    void GetCurrentIpAddress(Kernel::HLERequestContext& ctx) {
        LOG_WARNING(Service_NIFM, "(STUBBED) called");

-        const auto [ipv4, error] = Network::GetHostIPv4Address();
-        UNIMPLEMENTED_IF(error != Network::Errno::SUCCESS);
+        auto ipv4 = Network::GetHostIPv4Address();
+        if (!ipv4) {
+            LOG_ERROR(Service_NIFM, "Couldn't get host IPv4 address, defaulting to 0.0.0.0");
+            ipv4.emplace(Network::IPv4Address{0, 0, 0, 0});
+        }

        IPC::ResponseBuilder rb{ctx, 3};
        rb.Push(ResultSuccess);
-        rb.PushRaw(ipv4);
+        rb.PushRaw(*ipv4);
    }
    void CreateTemporaryNetworkProfile(Kernel::HLERequestContext& ctx) {
        LOG_DEBUG(Service_NIFM, "called");
@@ -348,25 +360,33 @@ private:
        LOG_WARNING(Service_NIFM, "(STUBBED) called");

        struct IpConfigInfo {
-            IpAddressSetting ip_address_setting;
-            DnsSetting dns_setting;
+            IpAddressSetting ip_address_setting{};
+            DnsSetting dns_setting{};
        };
        static_assert(sizeof(IpConfigInfo) == sizeof(IpAddressSetting) + sizeof(DnsSetting),
                      "IpConfigInfo has incorrect size.");

-        const IpConfigInfo ip_config_info{
-            .ip_address_setting{
-                .is_automatic{true},
-                .current_address{192, 168, 1, 100},
-                .subnet_mask{255, 255, 255, 0},
-                .gateway{192, 168, 1, 1},
-            },
-            .dns_setting{
-                .is_automatic{true},
-                .primary_dns{1, 1, 1, 1},
-                .secondary_dns{1, 0, 0, 1},
-            },
-        };
+        const auto net_iface = Network::GetSelectedNetworkInterface();
+
+        const IpConfigInfo ip_config_info = [&net_iface] {
+            if (!net_iface) {
+                return IpConfigInfo{};
+            }
+
+            return IpConfigInfo{
+                .ip_address_setting{
+                    .is_automatic{true},
+                    .current_address{Network::TranslateIPv4(net_iface->ip_address)},
+                    .subnet_mask{Network::TranslateIPv4(net_iface->subnet_mask)},
+                    .gateway{Network::TranslateIPv4(net_iface->gateway)},
+                },
+                .dns_setting{
+                    .is_automatic{true},
+                    .primary_dns{1, 1, 1, 1},
+                    .secondary_dns{1, 0, 0, 1},
+                },
+            };
+        }();

        IPC::ResponseBuilder rb{ctx, 2 + (sizeof(IpConfigInfo) + 3) / sizeof(u32)};
        rb.Push(ResultSuccess);
@@ -384,10 +404,10 @@ private:

        IPC::ResponseBuilder rb{ctx, 3};
        rb.Push(ResultSuccess);
-        if (Settings::values.bcat_backend.GetValue() == "none") {
-            rb.Push<u8>(0);
-        } else {
+        if (Network::GetHostIPv4Address().has_value()) {
            rb.Push<u8>(1);
+        } else {
+            rb.Push<u8>(0);
        }
    }
    void IsAnyInternetRequestAccepted(Kernel::HLERequestContext& ctx) {
@@ -395,10 +415,10 @@ private:

        IPC::ResponseBuilder rb{ctx, 3};
        rb.Push(ResultSuccess);
-        if (Settings::values.bcat_backend.GetValue() == "none") {
-            rb.Push<u8>(0);
-        } else {
+        if (Network::GetHostIPv4Address().has_value()) {
            rb.Push<u8>(1);
+        } else {
+            rb.Push<u8>(0);
        }
    }
 };
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp
@@ -166,8 +166,6 @@ NvResult nvhost_nvdec_common::MapBuffer(const std::vector<u8>& input, std::vecto
            LOG_ERROR(Service_NVDRV, "failed to map size={}", object->size);
        } else {
            cmd_buffer.map_address = object->dma_map_addr;
-            AddBufferMap(object->dma_map_addr, object->size, object->addr,
-                         object->status == nvmap::Object::Status::Allocated);
        }
    }
    std::memcpy(output.data(), &params, sizeof(IoctlMapBuffer));
@@ -178,30 +176,11 @@ NvResult nvhost_nvdec_common::MapBuffer(const std::vector<u8>& input, std::vecto
 }

 NvResult nvhost_nvdec_common::UnmapBuffer(const std::vector<u8>& input, std::vector<u8>& output) {
-    IoctlMapBuffer params{};
-    std::memcpy(&params, input.data(), sizeof(IoctlMapBuffer));
-    std::vector<MapBufferEntry> cmd_buffer_handles(params.num_entries);
-    SliceVectors(input, cmd_buffer_handles, params.num_entries, sizeof(IoctlMapBuffer));
-
-    auto& gpu = system.GPU();
-
-    for (auto& cmd_buffer : cmd_buffer_handles) {
-        const auto object{nvmap_dev->GetObject(cmd_buffer.map_handle)};
-        if (!object) {
-            LOG_ERROR(Service_NVDRV, "invalid cmd_buffer nvmap_handle={:X}", cmd_buffer.map_handle);
-            std::memcpy(output.data(), &params, output.size());
-            return NvResult::InvalidState;
-        }
-        if (const auto size{RemoveBufferMap(object->dma_map_addr)}; size) {
-            gpu.MemoryManager().Unmap(object->dma_map_addr, *size);
-        } else {
-            // This occurs quite frequently, however does not seem to impact functionality
-            LOG_DEBUG(Service_NVDRV, "invalid offset=0x{:X} dma=0x{:X}", object->addr,
-                      object->dma_map_addr);
-        }
-        object->dma_map_addr = 0;
-    }
+    // This is intentionally stubbed.
+    // Skip unmapping buffers here, so as not to break the continuity of the VP9 reference frame
+    // addresses, or risk invalidating data before the async GPU thread is done with it
    std::memset(output.data(), 0, output.size());
+    LOG_DEBUG(Service_NVDRV, "(STUBBED) called");
    return NvResult::Success;
 }

@@ -212,33 +191,4 @@ NvResult nvhost_nvdec_common::SetSubmitTimeout(const std::vector<u8>& input,
    return NvResult::Success;
 }

-std::optional<nvhost_nvdec_common::BufferMap> nvhost_nvdec_common::FindBufferMap(
-    GPUVAddr gpu_addr) const {
-    const auto it = std::find_if(
-        buffer_mappings.begin(), buffer_mappings.upper_bound(gpu_addr), [&](const auto& entry) {
-            return (gpu_addr >= entry.second.StartAddr() && gpu_addr < entry.second.EndAddr());
-        });
-
-    ASSERT(it != buffer_mappings.end());
-    return it->second;
-}
-
-void nvhost_nvdec_common::AddBufferMap(GPUVAddr gpu_addr, std::size_t size, VAddr cpu_addr,
-                                       bool is_allocated) {
-    buffer_mappings.insert_or_assign(gpu_addr, BufferMap{gpu_addr, size, cpu_addr, is_allocated});
-}
-
-std::optional<std::size_t> nvhost_nvdec_common::RemoveBufferMap(GPUVAddr gpu_addr) {
-    const auto iter{buffer_mappings.find(gpu_addr)};
-    if (iter == buffer_mappings.end()) {
-        return std::nullopt;
-    }
-    std::size_t size = 0;
-    if (iter->second.IsAllocated()) {
-        size = iter->second.Size();
-    }
-    buffer_mappings.erase(iter);
-    return size;
-}
-
 } // namespace Service::Nvidia::Devices
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h
@@ -23,45 +23,6 @@ public:
    ~nvhost_nvdec_common() override;

 protected:
-    class BufferMap final {
-    public:
-        constexpr BufferMap() = default;
-
-        constexpr BufferMap(GPUVAddr start_addr_, std::size_t size_)
-            : start_addr{start_addr_}, end_addr{start_addr_ + size_} {}
-
-        constexpr BufferMap(GPUVAddr start_addr_, std::size_t size_, VAddr cpu_addr_,
-                            bool is_allocated_)
-            : start_addr{start_addr_}, end_addr{start_addr_ + size_}, cpu_addr{cpu_addr_},
-              is_allocated{is_allocated_} {}
-
-        constexpr VAddr StartAddr() const {
-            return start_addr;
-        }
-
-        constexpr VAddr EndAddr() const {
-            return end_addr;
-        }
-
-        constexpr std::size_t Size() const {
-            return end_addr - start_addr;
-        }
-
-        constexpr VAddr CpuAddr() const {
-            return cpu_addr;
-        }
-
-        constexpr bool IsAllocated() const {
-            return is_allocated;
-        }
-
-    private:
-        GPUVAddr start_addr{};
-        GPUVAddr end_addr{};
-        VAddr cpu_addr{};
-        bool is_allocated{};
-    };
-
    struct IoctlSetNvmapFD {
        s32_le nvmap_fd{};
    };
@@ -154,17 +115,11 @@ protected:
    NvResult UnmapBuffer(const std::vector<u8>& input, std::vector<u8>& output);
    NvResult SetSubmitTimeout(const std::vector<u8>& input, std::vector<u8>& output);

-    std::optional<BufferMap> FindBufferMap(GPUVAddr gpu_addr) const;
-    void AddBufferMap(GPUVAddr gpu_addr, std::size_t size, VAddr cpu_addr, bool is_allocated);
-    std::optional<std::size_t> RemoveBufferMap(GPUVAddr gpu_addr);
-
    s32_le nvmap_fd{};
    u32_le submit_timeout{};
    std::shared_ptr<nvmap> nvmap_dev;
    SyncpointManager& syncpoint_manager;
    std::array<u32, MaxSyncPoints> device_syncpoints{};
-    // This is expected to be ordered, therefore we must use a map, not unordered_map
-    std::map<GPUVAddr, BufferMap> buffer_mappings;
 };
 }; // namespace Devices
 } // namespace Service::Nvidia
--- a/src/core/hle/service/service.cpp
+++ b/src/core/hle/service/service.cpp
@@ -46,6 +46,7 @@
 #include "core/hle/service/ncm/ncm.h"
 #include "core/hle/service/nfc/nfc.h"
 #include "core/hle/service/nfp/nfp.h"
+#include "core/hle/service/ngct/ngct.h"
 #include "core/hle/service/nifm/nifm.h"
 #include "core/hle/service/nim/nim.h"
 #include "core/hle/service/npns/npns.h"
@@ -271,6 +272,7 @@ Services::Services(std::shared_ptr<SM::ServiceManager>& sm, Core::System& system
    NCM::InstallInterfaces(*sm, system);
    NFC::InstallInterfaces(*sm, system);
    NFP::InstallInterfaces(*sm, system);
+    NGCT::InstallInterfaces(*sm, system);
    NIFM::InstallInterfaces(*sm, system);
    NIM::InstallInterfaces(*sm, system);
    NPNS::InstallInterfaces(*sm, system);
--- a/src/core/hle/service/set/set.cpp
+++ b/src/core/hle/service/set/set.cpp
@@ -85,7 +85,8 @@ void PushResponseLanguageCode(Kernel::HLERequestContext& ctx, std::size_t num_la

 void GetAvailableLanguageCodesImpl(Kernel::HLERequestContext& ctx, std::size_t max_entries) {
    const std::size_t requested_amount = ctx.GetWriteBufferSize() / sizeof(LanguageCode);
-    const std::size_t copy_amount = std::min(requested_amount, max_entries);
+    const std::size_t max_amount = std::min(requested_amount, max_entries);
+    const std::size_t copy_amount = std::min(available_language_codes.size(), max_amount);
    const std::size_t copy_size = copy_amount * sizeof(LanguageCode);

    ctx.WriteBuffer(available_language_codes.data(), copy_size);
--- a/src/core/hle/service/vi/vi.cpp
+++ b/src/core/hle/service/vi/vi.cpp
@@ -1158,7 +1158,7 @@ private:

        const auto layer_id = nv_flinger.CreateLayer(display_id);
        if (!layer_id) {
-            LOG_ERROR(Service_VI, "Layer not found! layer_id={}", *layer_id);
+            LOG_ERROR(Service_VI, "Layer not found! display_id={}", display_id);
            IPC::ResponseBuilder rb{ctx, 2};
            rb.Push(ERR_NOT_FOUND);
            return;
--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@@ -4,8 +4,6 @@

 #include <algorithm>
 #include <cstring>
-#include <optional>
-#include <utility>

 #include "common/assert.h"
 #include "common/atomic_ops.h"
@@ -14,12 +12,10 @@
 #include "common/page_table.h"
 #include "common/settings.h"
 #include "common/swap.h"
-#include "core/arm/arm_interface.h"
 #include "core/core.h"
 #include "core/device_memory.h"
 #include "core/hle/kernel/k_page_table.h"
 #include "core/hle/kernel/k_process.h"
-#include "core/hle/kernel/physical_memory.h"
 #include "core/memory.h"
 #include "video_core/gpu.h"

@@ -62,17 +58,7 @@ struct Memory::Impl {
        }
    }

-    bool IsValidVirtualAddress(const Kernel::KProcess& process, const VAddr vaddr) const {
-        const auto& page_table = process.PageTable().PageTableImpl();
-        const auto [pointer, type] = page_table.pointers[vaddr >> PAGE_BITS].PointerType();
-        return pointer != nullptr || type == Common::PageType::RasterizerCachedMemory;
-    }
-
-    bool IsValidVirtualAddress(VAddr vaddr) const {
-        return IsValidVirtualAddress(*system.CurrentProcess(), vaddr);
-    }
-
-    u8* GetPointerFromRasterizerCachedMemory(VAddr vaddr) const {
+    [[nodiscard]] u8* GetPointerFromRasterizerCachedMemory(VAddr vaddr) const {
        const PAddr paddr{current_page_table->backing_addr[vaddr >> PAGE_BITS]};

        if (!paddr) {
@@ -82,18 +68,6 @@ struct Memory::Impl {
        return system.DeviceMemory().GetPointer(paddr) + vaddr;
    }

-    u8* GetPointer(const VAddr vaddr) const {
-        const uintptr_t raw_pointer = current_page_table->pointers[vaddr >> PAGE_BITS].Raw();
-        if (u8* const pointer = Common::PageTable::PageInfo::ExtractPointer(raw_pointer)) {
-            return pointer + vaddr;
-        }
-        const auto type = Common::PageTable::PageInfo::ExtractType(raw_pointer);
-        if (type == Common::PageType::RasterizerCachedMemory) {
-            return GetPointerFromRasterizerCachedMemory(vaddr);
-        }
-        return nullptr;
-    }
-
    u8 Read8(const VAddr addr) {
        return Read<u8>(addr);
    }
@@ -179,7 +153,7 @@ struct Memory::Impl {
        std::string string;
        string.reserve(max_length);
        for (std::size_t i = 0; i < max_length; ++i) {
-            const char c = Read8(vaddr);
+            const char c = Read<s8>(vaddr);
            if (c == '\0') {
                break;
            }
@@ -190,15 +164,14 @@ struct Memory::Impl {
        return string;
    }

-    void ReadBlock(const Kernel::KProcess& process, const VAddr src_addr, void* dest_buffer,
-                   const std::size_t size) {
+    void WalkBlock(const Kernel::KProcess& process, const VAddr addr, const std::size_t size,
+                   auto on_unmapped, auto on_memory, auto on_rasterizer, auto increment) {
        const auto& page_table = process.PageTable().PageTableImpl();
-
        std::size_t remaining_size = size;
-        std::size_t page_index = src_addr >> PAGE_BITS;
-        std::size_t page_offset = src_addr & PAGE_MASK;
+        std::size_t page_index = addr >> PAGE_BITS;
+        std::size_t page_offset = addr & PAGE_MASK;

-        while (remaining_size > 0) {
+        while (remaining_size) {
            const std::size_t copy_amount =
                std::min(static_cast<std::size_t>(PAGE_SIZE) - page_offset, remaining_size);
            const auto current_vaddr = static_cast<VAddr>((page_index << PAGE_BITS) + page_offset);
@@ -206,22 +179,18 @@ struct Memory::Impl {
            const auto [pointer, type] = page_table.pointers[page_index].PointerType();
            switch (type) {
            case Common::PageType::Unmapped: {
-                LOG_ERROR(HW_Memory,
-                          "Unmapped ReadBlock @ 0x{:016X} (start address = 0x{:016X}, size = {})",
-                          current_vaddr, src_addr, size);
-                std::memset(dest_buffer, 0, copy_amount);
+                on_unmapped(copy_amount, current_vaddr);
                break;
            }
            case Common::PageType::Memory: {
                DEBUG_ASSERT(pointer);
-                const u8* const src_ptr = pointer + page_offset + (page_index << PAGE_BITS);
-                std::memcpy(dest_buffer, src_ptr, copy_amount);
+                u8* mem_ptr = pointer + page_offset + (page_index << PAGE_BITS);
+                on_memory(copy_amount, mem_ptr);
                break;
            }
            case Common::PageType::RasterizerCachedMemory: {
-                const u8* const host_ptr{GetPointerFromRasterizerCachedMemory(current_vaddr)};
-                system.GPU().FlushRegion(current_vaddr, copy_amount);
-                std::memcpy(dest_buffer, host_ptr, copy_amount);
+                u8* const host_ptr{GetPointerFromRasterizerCachedMemory(current_vaddr)};
+                on_rasterizer(current_vaddr, copy_amount, host_ptr);
                break;
            }
            default:
@@ -230,248 +199,122 @@ struct Memory::Impl {

            page_index++;
            page_offset = 0;
-            dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount;
+            increment(copy_amount);
            remaining_size -= copy_amount;
        }
    }

-    void ReadBlockUnsafe(const Kernel::KProcess& process, const VAddr src_addr, void* dest_buffer,
-                         const std::size_t size) {
-        const auto& page_table = process.PageTable().PageTableImpl();
-
-        std::size_t remaining_size = size;
-        std::size_t page_index = src_addr >> PAGE_BITS;
-        std::size_t page_offset = src_addr & PAGE_MASK;
-
-        while (remaining_size > 0) {
-            const std::size_t copy_amount =
-                std::min(static_cast<std::size_t>(PAGE_SIZE) - page_offset, remaining_size);
-            const auto current_vaddr = static_cast<VAddr>((page_index << PAGE_BITS) + page_offset);
-
-            const auto [pointer, type] = page_table.pointers[page_index].PointerType();
-            switch (type) {
-            case Common::PageType::Unmapped: {
+    template <bool UNSAFE>
+    void ReadBlockImpl(const Kernel::KProcess& process, const VAddr src_addr, void* dest_buffer,
+                       const std::size_t size) {
+        WalkBlock(
+            process, src_addr, size,
+            [src_addr, size, &dest_buffer](const std::size_t copy_amount,
+                                           const VAddr current_vaddr) {
                LOG_ERROR(HW_Memory,
                          "Unmapped ReadBlock @ 0x{:016X} (start address = 0x{:016X}, size = {})",
                          current_vaddr, src_addr, size);
                std::memset(dest_buffer, 0, copy_amount);
-                break;
-            }
-            case Common::PageType::Memory: {
-                DEBUG_ASSERT(pointer);
-                const u8* const src_ptr = pointer + page_offset + (page_index << PAGE_BITS);
+            },
+            [&dest_buffer](const std::size_t copy_amount, const u8* const src_ptr) {
                std::memcpy(dest_buffer, src_ptr, copy_amount);
-                break;
-            }
-            case Common::PageType::RasterizerCachedMemory: {
-                const u8* const host_ptr{GetPointerFromRasterizerCachedMemory(current_vaddr)};
+            },
+            [&system = system, &dest_buffer](const VAddr current_vaddr,
+                                             const std::size_t copy_amount,
+                                             const u8* const host_ptr) {
+                if constexpr (!UNSAFE) {
+                    system.GPU().FlushRegion(current_vaddr, copy_amount);
+                }
                std::memcpy(dest_buffer, host_ptr, copy_amount);
-                break;
-            }
-            default:
-                UNREACHABLE();
-            }
-
-            page_index++;
-            page_offset = 0;
-            dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount;
-            remaining_size -= copy_amount;
-        }
+            },
+            [&dest_buffer](const std::size_t copy_amount) {
+                dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount;
+            });
    }

    void ReadBlock(const VAddr src_addr, void* dest_buffer, const std::size_t size) {
-        ReadBlock(*system.CurrentProcess(), src_addr, dest_buffer, size);
+        ReadBlockImpl<false>(*system.CurrentProcess(), src_addr, dest_buffer, size);
    }

    void ReadBlockUnsafe(const VAddr src_addr, void* dest_buffer, const std::size_t size) {
-        ReadBlockUnsafe(*system.CurrentProcess(), src_addr, dest_buffer, size);
+        ReadBlockImpl<true>(*system.CurrentProcess(), src_addr, dest_buffer, size);
    }

-    void WriteBlock(const Kernel::KProcess& process, const VAddr dest_addr, const void* src_buffer,
-                    const std::size_t size) {
-        const auto& page_table = process.PageTable().PageTableImpl();
-        std::size_t remaining_size = size;
-        std::size_t page_index = dest_addr >> PAGE_BITS;
-        std::size_t page_offset = dest_addr & PAGE_MASK;
-
-        while (remaining_size > 0) {
-            const std::size_t copy_amount =
-                std::min(static_cast<std::size_t>(PAGE_SIZE) - page_offset, remaining_size);
-            const auto current_vaddr = static_cast<VAddr>((page_index << PAGE_BITS) + page_offset);
-
-            const auto [pointer, type] = page_table.pointers[page_index].PointerType();
-            switch (type) {
-            case Common::PageType::Unmapped: {
+    template <bool UNSAFE>
+    void WriteBlockImpl(const Kernel::KProcess& process, const VAddr dest_addr,
+                        const void* src_buffer, const std::size_t size) {
+        WalkBlock(
+            process, dest_addr, size,
+            [dest_addr, size](const std::size_t copy_amount, const VAddr current_vaddr) {
                LOG_ERROR(HW_Memory,
                          "Unmapped WriteBlock @ 0x{:016X} (start address = 0x{:016X}, size = {})",
                          current_vaddr, dest_addr, size);
-                break;
-            }
-            case Common::PageType::Memory: {
-                DEBUG_ASSERT(pointer);
-                u8* const dest_ptr = pointer + page_offset + (page_index << PAGE_BITS);
+            },
+            [&src_buffer](const std::size_t copy_amount, u8* const dest_ptr) {
                std::memcpy(dest_ptr, src_buffer, copy_amount);
-                break;
-            }
-            case Common::PageType::RasterizerCachedMemory: {
-                u8* const host_ptr{GetPointerFromRasterizerCachedMemory(current_vaddr)};
-                system.GPU().InvalidateRegion(current_vaddr, copy_amount);
+            },
+            [&system = system, &src_buffer](const VAddr current_vaddr,
+                                            const std::size_t copy_amount, u8* const host_ptr) {
+                if constexpr (!UNSAFE) {
+                    system.GPU().InvalidateRegion(current_vaddr, copy_amount);
+                }
                std::memcpy(host_ptr, src_buffer, copy_amount);
-                break;
-            }
-            default:
-                UNREACHABLE();
-            }
-
-            page_index++;
-            page_offset = 0;
-            src_buffer = static_cast<const u8*>(src_buffer) + copy_amount;
-            remaining_size -= copy_amount;
-        }
-    }
-
-    void WriteBlockUnsafe(const Kernel::KProcess& process, const VAddr dest_addr,
-                          const void* src_buffer, const std::size_t size) {
-        const auto& page_table = process.PageTable().PageTableImpl();
-        std::size_t remaining_size = size;
-        std::size_t page_index = dest_addr >> PAGE_BITS;
-        std::size_t page_offset = dest_addr & PAGE_MASK;
-
-        while (remaining_size > 0) {
-            const std::size_t copy_amount =
-                std::min(static_cast<std::size_t>(PAGE_SIZE) - page_offset, remaining_size);
-            const auto current_vaddr = static_cast<VAddr>((page_index << PAGE_BITS) + page_offset);
-
-            const auto [pointer, type] = page_table.pointers[page_index].PointerType();
-            switch (type) {
-            case Common::PageType::Unmapped: {
-                LOG_ERROR(HW_Memory,
-                          "Unmapped WriteBlock @ 0x{:016X} (start address = 0x{:016X}, size = {})",
-                          current_vaddr, dest_addr, size);
-                break;
-            }
-            case Common::PageType::Memory: {
-                DEBUG_ASSERT(pointer);
-                u8* const dest_ptr = pointer + page_offset + (page_index << PAGE_BITS);
-                std::memcpy(dest_ptr, src_buffer, copy_amount);
-                break;
-            }
-            case Common::PageType::RasterizerCachedMemory: {
-                u8* const host_ptr{GetPointerFromRasterizerCachedMemory(current_vaddr)};
-                std::memcpy(host_ptr, src_buffer, copy_amount);
-                break;
-            }
-            default:
-                UNREACHABLE();
-            }
-
-            page_index++;
-            page_offset = 0;
-            src_buffer = static_cast<const u8*>(src_buffer) + copy_amount;
-            remaining_size -= copy_amount;
-        }
+            },
+            [&src_buffer](const std::size_t copy_amount) {
+                src_buffer = static_cast<const u8*>(src_buffer) + copy_amount;
+            });
    }

    void WriteBlock(const VAddr dest_addr, const void* src_buffer, const std::size_t size) {
-        WriteBlock(*system.CurrentProcess(), dest_addr, src_buffer, size);
+        WriteBlockImpl<false>(*system.CurrentProcess(), dest_addr, src_buffer, size);
    }

    void WriteBlockUnsafe(const VAddr dest_addr, const void* src_buffer, const std::size_t size) {
-        WriteBlockUnsafe(*system.CurrentProcess(), dest_addr, src_buffer, size);
+        WriteBlockImpl<true>(*system.CurrentProcess(), dest_addr, src_buffer, size);
    }

    void ZeroBlock(const Kernel::KProcess& process, const VAddr dest_addr, const std::size_t size) {
-        const auto& page_table = process.PageTable().PageTableImpl();
-        std::size_t remaining_size = size;
-        std::size_t page_index = dest_addr >> PAGE_BITS;
-        std::size_t page_offset = dest_addr & PAGE_MASK;
-
-        while (remaining_size > 0) {
-            const std::size_t copy_amount =
-                std::min(static_cast<std::size_t>(PAGE_SIZE) - page_offset, remaining_size);
-            const auto current_vaddr = static_cast<VAddr>((page_index << PAGE_BITS) + page_offset);
-
-            const auto [pointer, type] = page_table.pointers[page_index].PointerType();
-            switch (type) {
-            case Common::PageType::Unmapped: {
+        WalkBlock(
+            process, dest_addr, size,
+            [dest_addr, size](const std::size_t copy_amount, const VAddr current_vaddr) {
                LOG_ERROR(HW_Memory,
                          "Unmapped ZeroBlock @ 0x{:016X} (start address = 0x{:016X}, size = {})",
                          current_vaddr, dest_addr, size);
-                break;
-            }
-            case Common::PageType::Memory: {
-                DEBUG_ASSERT(pointer);
-                u8* const dest_ptr = pointer + page_offset + (page_index << PAGE_BITS);
+            },
+            [](const std::size_t copy_amount, u8* const dest_ptr) {
                std::memset(dest_ptr, 0, copy_amount);
-                break;
-            }
-            case Common::PageType::RasterizerCachedMemory: {
-                u8* const host_ptr{GetPointerFromRasterizerCachedMemory(current_vaddr)};
+            },
+            [&system = system](const VAddr current_vaddr, const std::size_t copy_amount,
+                               u8* const host_ptr) {
                system.GPU().InvalidateRegion(current_vaddr, copy_amount);
                std::memset(host_ptr, 0, copy_amount);
-                break;
-            }
-            default:
-                UNREACHABLE();
-            }
-
-            page_index++;
-            page_offset = 0;
-            remaining_size -= copy_amount;
-        }
-    }
-
-    void ZeroBlock(const VAddr dest_addr, const std::size_t size) {
-        ZeroBlock(*system.CurrentProcess(), dest_addr, size);
+            },
+            [](const std::size_t copy_amount) {});
    }

    void CopyBlock(const Kernel::KProcess& process, VAddr dest_addr, VAddr src_addr,
                   const std::size_t size) {
-        const auto& page_table = process.PageTable().PageTableImpl();
-        std::size_t remaining_size = size;
-        std::size_t page_index = src_addr >> PAGE_BITS;
-        std::size_t page_offset = src_addr & PAGE_MASK;
-
-        while (remaining_size > 0) {
-            const std::size_t copy_amount =
-                std::min(static_cast<std::size_t>(PAGE_SIZE) - page_offset, remaining_size);
-            const auto current_vaddr = static_cast<VAddr>((page_index << PAGE_BITS) + page_offset);
-
-            const auto [pointer, type] = page_table.pointers[page_index].PointerType();
-            switch (type) {
-            case Common::PageType::Unmapped: {
+        WalkBlock(
+            process, src_addr, size,
+            [this, &process, &dest_addr, &src_addr, size](const std::size_t copy_amount,
+                                                          const VAddr current_vaddr) {
                LOG_ERROR(HW_Memory,
                          "Unmapped CopyBlock @ 0x{:016X} (start address = 0x{:016X}, size = {})",
                          current_vaddr, src_addr, size);
                ZeroBlock(process, dest_addr, copy_amount);
-                break;
-            }
-            case Common::PageType::Memory: {
-                DEBUG_ASSERT(pointer);
-                const u8* src_ptr = pointer + page_offset + (page_index << PAGE_BITS);
-                WriteBlock(process, dest_addr, src_ptr, copy_amount);
-                break;
-            }
-            case Common::PageType::RasterizerCachedMemory: {
-                const u8* const host_ptr{GetPointerFromRasterizerCachedMemory(current_vaddr)};
+            },
+            [this, &process, &dest_addr](const std::size_t copy_amount, const u8* const src_ptr) {
+                WriteBlockImpl<false>(process, dest_addr, src_ptr, copy_amount);
+            },
+            [this, &system = system, &process, &dest_addr](
+                const VAddr current_vaddr, const std::size_t copy_amount, u8* const host_ptr) {
                system.GPU().FlushRegion(current_vaddr, copy_amount);
-                WriteBlock(process, dest_addr, host_ptr, copy_amount);
-                break;
-            }
-            default:
-                UNREACHABLE();
-            }
-
-            page_index++;
-            page_offset = 0;
-            dest_addr += static_cast<VAddr>(copy_amount);
-            src_addr += static_cast<VAddr>(copy_amount);
-            remaining_size -= copy_amount;
-        }
-    }
-
-    void CopyBlock(VAddr dest_addr, VAddr src_addr, std::size_t size) {
-        return CopyBlock(*system.CurrentProcess(), dest_addr, src_addr, size);
+                WriteBlockImpl<false>(process, dest_addr, host_ptr, copy_amount);
+            },
+            [&dest_addr, &src_addr](const std::size_t copy_amount) {
+                dest_addr += static_cast<VAddr>(copy_amount);
+                src_addr += static_cast<VAddr>(copy_amount);
+            });
    }

    void RasterizerMarkRegionCached(VAddr vaddr, u64 size, bool cached) {
@@ -514,7 +357,7 @@ struct Memory::Impl {
            } else {
                // Switch page type to uncached if now uncached
                switch (page_type) {
-                case Common::PageType::Unmapped:
+                case Common::PageType::Unmapped: // NOLINT(bugprone-branch-clone)
                    // It is not necessary for a process to have this region mapped into its address
                    // space, for example, a system module need not have a VRAM mapping.
                    break;
@@ -597,6 +440,44 @@ struct Memory::Impl {
        }
    }

+    [[nodiscard]] u8* GetPointerImpl(VAddr vaddr, auto on_unmapped, auto on_rasterizer) const {
+        // AARCH64 masks the upper 16 bits of all memory accesses
+        vaddr &= 0xffffffffffffLL;
+
+        if (vaddr >= 1uLL << current_page_table->GetAddressSpaceBits()) {
+            on_unmapped();
+            return nullptr;
+        }
+
+        // Avoid adding any extra logic to this fast-path block
+        const uintptr_t raw_pointer = current_page_table->pointers[vaddr >> PAGE_BITS].Raw();
+        if (u8* const pointer = Common::PageTable::PageInfo::ExtractPointer(raw_pointer)) {
+            return &pointer[vaddr];
+        }
+        switch (Common::PageTable::PageInfo::ExtractType(raw_pointer)) {
+        case Common::PageType::Unmapped:
+            on_unmapped();
+            return nullptr;
+        case Common::PageType::Memory:
+            ASSERT_MSG(false, "Mapped memory page without a pointer @ 0x{:016X}", vaddr);
+            return nullptr;
+        case Common::PageType::RasterizerCachedMemory: {
+            u8* const host_ptr{GetPointerFromRasterizerCachedMemory(vaddr)};
+            on_rasterizer();
+            return host_ptr;
+        }
+        default:
+            UNREACHABLE();
+        }
+        return nullptr;
+    }
+
+    [[nodiscard]] u8* GetPointer(const VAddr vaddr) const {
+        return GetPointerImpl(
+            vaddr, [vaddr]() { LOG_ERROR(HW_Memory, "Unmapped GetPointer @ 0x{:016X}", vaddr); },
+            []() {});
+    }
+
    /**
     * Reads a particular data type out of memory at the given virtual address.
     *
@@ -610,39 +491,17 @@ struct Memory::Impl {
     */
    template <typename T>
    T Read(VAddr vaddr) {
-        // AARCH64 masks the upper 16 bit of all memory accesses
-        vaddr &= 0xffffffffffffLL;
-
-        if (vaddr >= 1uLL << current_page_table->GetAddressSpaceBits()) {
-            LOG_ERROR(HW_Memory, "Unmapped Read{} @ 0x{:08X}", sizeof(T) * 8, vaddr);
-            return 0;
+        T result = 0;
+        const u8* const ptr = GetPointerImpl(
+            vaddr,
+            [vaddr]() {
+                LOG_ERROR(HW_Memory, "Unmapped Read{} @ 0x{:016X}", sizeof(T) * 8, vaddr);
+            },
+            [&system = system, vaddr]() { system.GPU().FlushRegion(vaddr, sizeof(T)); });
+        if (ptr) {
+            std::memcpy(&result, ptr, sizeof(T));
        }
-
-        // Avoid adding any extra logic to this fast-path block
-        const uintptr_t raw_pointer = current_page_table->pointers[vaddr >> PAGE_BITS].Raw();
-        if (const u8* const pointer = Common::PageTable::PageInfo::ExtractPointer(raw_pointer)) {
-            T value;
-            std::memcpy(&value, &pointer[vaddr], sizeof(T));
-            return value;
-        }
-        switch (Common::PageTable::PageInfo::ExtractType(raw_pointer)) {
-        case Common::PageType::Unmapped:
-            LOG_ERROR(HW_Memory, "Unmapped Read{} @ 0x{:08X}", sizeof(T) * 8, vaddr);
-            return 0;
-        case Common::PageType::Memory:
-            ASSERT_MSG(false, "Mapped memory page without a pointer @ {:016X}", vaddr);
-            break;
-        case Common::PageType::RasterizerCachedMemory: {
-            const u8* const host_ptr{GetPointerFromRasterizerCachedMemory(vaddr)};
-            system.GPU().FlushRegion(vaddr, sizeof(T));
-            T value;
-            std::memcpy(&value, host_ptr, sizeof(T));
-            return value;
-        }
-        default:
-            UNREACHABLE();
-        }
-        return {};
+        return result;
    }

    /**
@@ -656,110 +515,46 @@ struct Memory::Impl {
     */
    template <typename T>
    void Write(VAddr vaddr, const T data) {
-        // AARCH64 masks the upper 16 bit of all memory accesses
-        vaddr &= 0xffffffffffffLL;
-
-        if (vaddr >= 1uLL << current_page_table->GetAddressSpaceBits()) {
-            LOG_ERROR(HW_Memory, "Unmapped Write{} 0x{:08X} @ 0x{:016X}", sizeof(data) * 8,
-                      static_cast<u32>(data), vaddr);
-            return;
-        }
-
-        // Avoid adding any extra logic to this fast-path block
-        const uintptr_t raw_pointer = current_page_table->pointers[vaddr >> PAGE_BITS].Raw();
-        if (u8* const pointer = Common::PageTable::PageInfo::ExtractPointer(raw_pointer)) {
-            std::memcpy(&pointer[vaddr], &data, sizeof(T));
-            return;
-        }
-        switch (Common::PageTable::PageInfo::ExtractType(raw_pointer)) {
-        case Common::PageType::Unmapped:
-            LOG_ERROR(HW_Memory, "Unmapped Write{} 0x{:08X} @ 0x{:016X}", sizeof(data) * 8,
-                      static_cast<u32>(data), vaddr);
-            return;
-        case Common::PageType::Memory:
-            ASSERT_MSG(false, "Mapped memory page without a pointer @ {:016X}", vaddr);
-            break;
-        case Common::PageType::RasterizerCachedMemory: {
-            u8* const host_ptr{GetPointerFromRasterizerCachedMemory(vaddr)};
-            system.GPU().InvalidateRegion(vaddr, sizeof(T));
-            std::memcpy(host_ptr, &data, sizeof(T));
-            break;
-        }
-        default:
-            UNREACHABLE();
+        u8* const ptr = GetPointerImpl(
+            vaddr,
+            [vaddr, data]() { // callback that logs the write as unmapped
+                LOG_ERROR(HW_Memory, "Unmapped Write{} @ 0x{:016X} = 0x{:016X}", sizeof(T) * 8,
+                          vaddr, static_cast<u64>(data));
+            },
+            [&system = system, vaddr]() { system.GPU().InvalidateRegion(vaddr, sizeof(T)); }); // callback that invalidates the GPU region being written
+        if (ptr) { // without a host pointer nothing is written
+            std::memcpy(ptr, &data, sizeof(T));
        }
    }

    template <typename T>
    bool WriteExclusive(VAddr vaddr, const T data, const T expected) {
-        // AARCH64 masks the upper 16 bit of all memory accesses
-        vaddr &= 0xffffffffffffLL;
-
-        if (vaddr >= 1uLL << current_page_table->GetAddressSpaceBits()) {
-            LOG_ERROR(HW_Memory, "Unmapped Write{} 0x{:08X} @ 0x{:016X}", sizeof(data) * 8,
-                      static_cast<u32>(data), vaddr);
-            return true;
-        }
-
-        const uintptr_t raw_pointer = current_page_table->pointers[vaddr >> PAGE_BITS].Raw();
-        if (u8* const pointer = Common::PageTable::PageInfo::ExtractPointer(raw_pointer)) {
-            // NOTE: Avoid adding any extra logic to this fast-path block
-            const auto volatile_pointer = reinterpret_cast<volatile T*>(&pointer[vaddr]);
+        u8* const ptr = GetPointerImpl(
+            vaddr,
+            [vaddr, data]() { // callback that logs the exclusive write as unmapped
+                LOG_ERROR(HW_Memory, "Unmapped WriteExclusive{} @ 0x{:016X} = 0x{:016X}",
+                          sizeof(T) * 8, vaddr, static_cast<u64>(data));
+            },
+            [&system = system, vaddr]() { system.GPU().InvalidateRegion(vaddr, sizeof(T)); }); // callback that invalidates the GPU region being written
+        if (ptr) { // without a host pointer, control falls through to the final return
+            const auto volatile_pointer = reinterpret_cast<volatile T*>(ptr); // compare-and-swap acts directly on host memory
            return Common::AtomicCompareAndSwap(volatile_pointer, data, expected);
        }
-        switch (Common::PageTable::PageInfo::ExtractType(raw_pointer)) {
-        case Common::PageType::Unmapped:
-            LOG_ERROR(HW_Memory, "Unmapped Write{} 0x{:08X} @ 0x{:016X}", sizeof(data) * 8,
-                      static_cast<u32>(data), vaddr);
-            return true;
-        case Common::PageType::Memory:
-            ASSERT_MSG(false, "Mapped memory page without a pointer @ {:016X}", vaddr);
-            break;
-        case Common::PageType::RasterizerCachedMemory: {
-            u8* host_ptr{GetPointerFromRasterizerCachedMemory(vaddr)};
-            system.GPU().InvalidateRegion(vaddr, sizeof(T));
-            auto* pointer = reinterpret_cast<volatile T*>(&host_ptr);
-            return Common::AtomicCompareAndSwap(pointer, data, expected);
-        }
-        default:
-            UNREACHABLE();
-        }
        return true;
    }

    bool WriteExclusive128(VAddr vaddr, const u128 data, const u128 expected) {
-        // AARCH64 masks the upper 16 bit of all memory accesses
-        vaddr &= 0xffffffffffffLL;
-
-        if (vaddr >= 1uLL << current_page_table->GetAddressSpaceBits()) {
-            LOG_ERROR(HW_Memory, "Unmapped Write{} 0x{:08X} @ 0x{:016X}", sizeof(data) * 8,
-                      static_cast<u32>(data[0]), vaddr);
-            return true;
-        }
-
-        const uintptr_t raw_pointer = current_page_table->pointers[vaddr >> PAGE_BITS].Raw();
-        if (u8* const pointer = Common::PageTable::PageInfo::ExtractPointer(raw_pointer)) {
-            // NOTE: Avoid adding any extra logic to this fast-path block
-            const auto volatile_pointer = reinterpret_cast<volatile u64*>(&pointer[vaddr]);
+        u8* const ptr = GetPointerImpl(
+            vaddr,
+            [vaddr, data]() { // callback that logs the 128-bit exclusive write as unmapped
+                LOG_ERROR(HW_Memory, "Unmapped WriteExclusive128 @ 0x{:016X} = 0x{:016X}{:016X}",
+                          vaddr, static_cast<u64>(data[1]), static_cast<u64>(data[0])); // data[1] is printed first, then data[0]
+            },
+            [&system = system, vaddr]() { system.GPU().InvalidateRegion(vaddr, sizeof(u128)); }); // callback that invalidates the GPU region being written
+        if (ptr) { // without a host pointer, control falls through to the final return
+            const auto volatile_pointer = reinterpret_cast<volatile u64*>(ptr); // compare-and-swap acts directly on host memory
            return Common::AtomicCompareAndSwap(volatile_pointer, data, expected);
        }
-        switch (Common::PageTable::PageInfo::ExtractType(raw_pointer)) {
-        case Common::PageType::Unmapped:
-            LOG_ERROR(HW_Memory, "Unmapped Write{} 0x{:08X} @ 0x{:016X}{:016X}", sizeof(data) * 8,
-                      static_cast<u64>(data[1]), static_cast<u64>(data[0]), vaddr);
-            return true;
-        case Common::PageType::Memory:
-            ASSERT_MSG(false, "Mapped memory page without a pointer @ {:016X}", vaddr);
-            break;
-        case Common::PageType::RasterizerCachedMemory: {
-            u8* host_ptr{GetPointerFromRasterizerCachedMemory(vaddr)};
-            system.GPU().InvalidateRegion(vaddr, sizeof(u128));
-            auto* pointer = reinterpret_cast<volatile u64*>(&host_ptr);
-            return Common::AtomicCompareAndSwap(pointer, data, expected);
-        }
-        default:
-            UNREACHABLE();
-        }
        return true;
    }

@@ -789,12 +584,11 @@ void Memory::UnmapRegion(Common::PageTable& page_table, VAddr base, u64 size) {
    impl->UnmapRegion(page_table, base, size);
 }

-bool Memory::IsValidVirtualAddress(const Kernel::KProcess& process, const VAddr vaddr) const {
-    return impl->IsValidVirtualAddress(process, vaddr);
-}
-
 bool Memory::IsValidVirtualAddress(const VAddr vaddr) const {
-    return impl->IsValidVirtualAddress(vaddr);
+    const Kernel::KProcess& process = *system.CurrentProcess();
+    const auto& page_table = process.PageTable().PageTableImpl();
+    // An address beyond the process' address space has no page-table entry. Report it as
+    // invalid instead of indexing past the end of the table.
+    if (vaddr >= 1uLL << page_table.GetAddressSpaceBits()) {
+        return false;
+    }
+    // A page is valid when it is backed by host memory or tracked by the rasterizer cache.
+    const auto [pointer, type] = page_table.pointers[vaddr >> PAGE_BITS].PointerType();
+    return pointer != nullptr || type == Common::PageType::RasterizerCachedMemory;
 }

 u8* Memory::GetPointer(VAddr vaddr) {
@@ -863,64 +657,38 @@ std::string Memory::ReadCString(VAddr vaddr, std::size_t max_length) {

 void Memory::ReadBlock(const Kernel::KProcess& process, const VAddr src_addr, void* dest_buffer,
                       const std::size_t size) {
-    impl->ReadBlock(process, src_addr, dest_buffer, size);
+    impl->ReadBlockImpl<false>(process, src_addr, dest_buffer, size);
 }

 void Memory::ReadBlock(const VAddr src_addr, void* dest_buffer, const std::size_t size) {
    impl->ReadBlock(src_addr, dest_buffer, size);
 }

-void Memory::ReadBlockUnsafe(const Kernel::KProcess& process, const VAddr src_addr,
-                             void* dest_buffer, const std::size_t size) {
-    impl->ReadBlockUnsafe(process, src_addr, dest_buffer, size);
-}
-
 void Memory::ReadBlockUnsafe(const VAddr src_addr, void* dest_buffer, const std::size_t size) {
    impl->ReadBlockUnsafe(src_addr, dest_buffer, size);
 }

 void Memory::WriteBlock(const Kernel::KProcess& process, VAddr dest_addr, const void* src_buffer,
                        std::size_t size) {
-    impl->WriteBlock(process, dest_addr, src_buffer, size);
+    impl->WriteBlockImpl<false>(process, dest_addr, src_buffer, size);
 }

 void Memory::WriteBlock(const VAddr dest_addr, const void* src_buffer, const std::size_t size) {
    impl->WriteBlock(dest_addr, src_buffer, size);
 }

-void Memory::WriteBlockUnsafe(const Kernel::KProcess& process, VAddr dest_addr,
-                              const void* src_buffer, std::size_t size) {
-    impl->WriteBlockUnsafe(process, dest_addr, src_buffer, size);
-}
-
 void Memory::WriteBlockUnsafe(const VAddr dest_addr, const void* src_buffer,
                              const std::size_t size) {
    impl->WriteBlockUnsafe(dest_addr, src_buffer, size);
 }

-void Memory::ZeroBlock(const Kernel::KProcess& process, VAddr dest_addr, std::size_t size) {
-    impl->ZeroBlock(process, dest_addr, size);
-}
-
-void Memory::ZeroBlock(VAddr dest_addr, std::size_t size) {
-    impl->ZeroBlock(dest_addr, size);
-}
-
 void Memory::CopyBlock(const Kernel::KProcess& process, VAddr dest_addr, VAddr src_addr,
                       const std::size_t size) {
    impl->CopyBlock(process, dest_addr, src_addr, size);
 }

-void Memory::CopyBlock(VAddr dest_addr, VAddr src_addr, std::size_t size) {
-    impl->CopyBlock(dest_addr, src_addr, size);
-}
-
 void Memory::RasterizerMarkRegionCached(VAddr vaddr, u64 size, bool cached) {
    impl->RasterizerMarkRegionCached(vaddr, size, cached);
 }

-bool IsKernelVirtualAddress(const VAddr vaddr) {
-    return KERNEL_REGION_VADDR <= vaddr && vaddr < KERNEL_REGION_END;
-}
-
 } // namespace Core::Memory
--- a/src/core/memory.h
+++ b/src/core/memory.h
@@ -39,11 +39,6 @@ enum : VAddr {

    /// Application stack
    DEFAULT_STACK_SIZE = 0x100000,
-
-    /// Kernel Virtual Address Range
-    KERNEL_REGION_VADDR = 0xFFFFFF8000000000,
-    KERNEL_REGION_SIZE = 0x7FFFE00000,
-    KERNEL_REGION_END = KERNEL_REGION_VADDR + KERNEL_REGION_SIZE,
 };

 /// Central class that handles all memory operations and state.
@@ -56,7 +51,7 @@ public:
    Memory& operator=(const Memory&) = delete;

    Memory(Memory&&) = default;
-    Memory& operator=(Memory&&) = default;
+    Memory& operator=(Memory&&) = delete;

    /**
     * Resets the state of the Memory system.
@@ -90,17 +85,6 @@ public:
     */
    void UnmapRegion(Common::PageTable& page_table, VAddr base, u64 size);

-    /**
-     * Checks whether or not the supplied address is a valid virtual
-     * address for the given process.
-     *
-     * @param process The emulated process to check the address against.
-     * @param vaddr   The virtual address to check the validity of.
-     *
-     * @returns True if the given virtual address is valid, false otherwise.
-     */
-    bool IsValidVirtualAddress(const Kernel::KProcess& process, VAddr vaddr) const;
-
    /**
     * Checks whether or not the supplied address is a valid virtual
     * address for the current process.
@@ -109,7 +93,7 @@ public:
     *
     * @returns True if the given virtual address is valid, false otherwise.
     */
-    bool IsValidVirtualAddress(VAddr vaddr) const;
+    [[nodiscard]] bool IsValidVirtualAddress(VAddr vaddr) const;

    /**
     * Gets a pointer to the given address.
@@ -134,7 +118,7 @@ public:
     * @returns The pointer to the given address, if the address is valid.
     *          If the address is not valid, nullptr will be returned.
     */
-    const u8* GetPointer(VAddr vaddr) const;
+    [[nodiscard]] const u8* GetPointer(VAddr vaddr) const;

    template <typename T>
    const T* GetPointer(VAddr vaddr) const {
@@ -327,27 +311,6 @@ public:
    void ReadBlock(const Kernel::KProcess& process, VAddr src_addr, void* dest_buffer,
                   std::size_t size);

-    /**
-     * Reads a contiguous block of bytes from a specified process' address space.
-     * This unsafe version does not trigger GPU flushing.
-     *
-     * @param process     The process to read the data from.
-     * @param src_addr    The virtual address to begin reading from.
-     * @param dest_buffer The buffer to place the read bytes into.
-     * @param size        The amount of data to read, in bytes.
-     *
-     * @note If a size of 0 is specified, then this function reads nothing and
-     *       no attempts to access memory are made at all.
-     *
-     * @pre dest_buffer must be at least size bytes in length, otherwise a
-     *      buffer overrun will occur.
-     *
-     * @post The range [dest_buffer, size) contains the read bytes from the
-     *       process' address space.
-     */
-    void ReadBlockUnsafe(const Kernel::KProcess& process, VAddr src_addr, void* dest_buffer,
-                         std::size_t size);
-
    /**
     * Reads a contiguous block of bytes from the current process' address space.
     *
@@ -408,26 +371,6 @@ public:
    void WriteBlock(const Kernel::KProcess& process, VAddr dest_addr, const void* src_buffer,
                    std::size_t size);

-    /**
-     * Writes a range of bytes into a given process' address space at the specified
-     * virtual address.
-     * This unsafe version does not invalidate GPU Memory.
-     *
-     * @param process    The process to write data into the address space of.
-     * @param dest_addr  The destination virtual address to begin writing the data at.
-     * @param src_buffer The data to write into the process' address space.
-     * @param size       The size of the data to write, in bytes.
-     *
-     * @post The address range [dest_addr, size) in the process' address space
-     *       contains the data that was within src_buffer.
-     *
-     * @post If an attempt is made to write into an unmapped region of memory, the writes
-     *       will be ignored and an error will be logged.
-     *
-     */
-    void WriteBlockUnsafe(const Kernel::KProcess& process, VAddr dest_addr, const void* src_buffer,
-                          std::size_t size);
-
    /**
     * Writes a range of bytes into the current process' address space at the specified
     * virtual address.
@@ -467,29 +410,6 @@ public:
     */
    void WriteBlockUnsafe(VAddr dest_addr, const void* src_buffer, std::size_t size);

-    /**
-     * Fills the specified address range within a process' address space with zeroes.
-     *
-     * @param process   The process that will have a portion of its memory zeroed out.
-     * @param dest_addr The starting virtual address of the range to zero out.
-     * @param size      The size of the address range to zero out, in bytes.
-     *
-     * @post The range [dest_addr, size) within the process' address space is
-     *       filled with zeroes.
-     */
-    void ZeroBlock(const Kernel::KProcess& process, VAddr dest_addr, std::size_t size);
-
-    /**
-     * Fills the specified address range within the current process' address space with zeroes.
-     *
-     * @param dest_addr The starting virtual address of the range to zero out.
-     * @param size      The size of the address range to zero out, in bytes.
-     *
-     * @post The range [dest_addr, size) within the current process' address space is
-     *       filled with zeroes.
-     */
-    void ZeroBlock(VAddr dest_addr, std::size_t size);
-
    /**
     * Copies data within a process' address space to another location within the
     * same address space.
@@ -505,19 +425,6 @@ public:
    void CopyBlock(const Kernel::KProcess& process, VAddr dest_addr, VAddr src_addr,
                   std::size_t size);

-    /**
-     * Copies data within the current process' address space to another location within the
-     * same address space.
-     *
-     * @param dest_addr The destination virtual address to begin copying the data into.
-     * @param src_addr  The source virtual address to begin copying the data from.
-     * @param size      The size of the data to copy, in bytes.
-     *
-     * @post The range [dest_addr, size) within the current process' address space
-     *       contains the same data within the range [src_addr, size).
-     */
-    void CopyBlock(VAddr dest_addr, VAddr src_addr, std::size_t size);
-
    /**
     * Marks each page within the specified address range as cached or uncached.
     *
@@ -535,7 +442,4 @@ private:
    std::unique_ptr<Impl> impl;
 };

-/// Determines if the given VAddr is a kernel address
-bool IsKernelVirtualAddress(VAddr vaddr);
-
 } // namespace Core::Memory
--- a/src/core/network/network.cpp
+++ b/src/core/network/network.cpp
@@ -10,9 +10,10 @@
 #include "common/common_funcs.h"

 #ifdef _WIN32
-#define _WINSOCK_DEPRECATED_NO_WARNINGS // gethostname
 #include <winsock2.h>
+#include <ws2tcpip.h>
 #elif YUZU_UNIX
+#include <arpa/inet.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <netdb.h>
@@ -27,7 +28,9 @@
 #include "common/assert.h"
 #include "common/common_types.h"
 #include "common/logging/log.h"
+#include "common/settings.h"
 #include "core/network/network.h"
+#include "core/network/network_interface.h"
 #include "core/network/sockets.h"

 namespace Network {
@@ -47,11 +50,6 @@ void Finalize() {
    WSACleanup();
 }

-constexpr IPv4Address TranslateIPv4(in_addr addr) {
-    auto& bytes = addr.S_un.S_un_b;
-    return IPv4Address{bytes.s_b1, bytes.s_b2, bytes.s_b3, bytes.s_b4};
-}
-
 sockaddr TranslateFromSockAddrIn(SockAddrIn input) {
    sockaddr_in result;

@@ -138,12 +136,6 @@ void Initialize() {}

 void Finalize() {}

-constexpr IPv4Address TranslateIPv4(in_addr addr) {
-    const u32 bytes = addr.s_addr;
-    return IPv4Address{static_cast<u8>(bytes), static_cast<u8>(bytes >> 8),
-                       static_cast<u8>(bytes >> 16), static_cast<u8>(bytes >> 24)};
-}
-
 sockaddr TranslateFromSockAddrIn(SockAddrIn input) {
    sockaddr_in result;

@@ -182,7 +174,7 @@ linger MakeLinger(bool enable, u32 linger_value) {
 }

 bool EnableNonBlock(int fd, bool enable) {
-    int flags = fcntl(fd, F_GETFD);
+    int flags = fcntl(fd, F_GETFL);
    if (flags == -1) {
        return false;
    }
@@ -191,7 +183,7 @@ bool EnableNonBlock(int fd, bool enable) {
    } else {
        flags &= ~O_NONBLOCK;
    }
-    return fcntl(fd, F_SETFD, flags) == 0;
+    return fcntl(fd, F_SETFL, flags) == 0;
 }

 Errno TranslateNativeError(int e) {
@@ -227,8 +219,12 @@ Errno GetAndLogLastError() {
 #else
    int e = errno;
 #endif
+    const Errno err = TranslateNativeError(e);
+    if (err == Errno::AGAIN) {
+        return err;
+    }
    LOG_ERROR(Network, "Socket operation error: {}", NativeErrorToString(e));
-    return TranslateNativeError(e);
+    return err;
 }

 int TranslateDomain(Domain domain) {
@@ -353,27 +349,29 @@ NetworkInstance::~NetworkInstance() {
    Finalize();
 }

-std::pair<IPv4Address, Errno> GetHostIPv4Address() {
-    std::array<char, 256> name{};
-    if (gethostname(name.data(), static_cast<int>(name.size()) - 1) == SOCKET_ERROR) {
-        return {IPv4Address{}, GetAndLogLastError()};
+/// Returns the IPv4 address of the network interface selected in the settings, or
+/// std::nullopt when no interface is available or the selected one cannot be found.
+std::optional<IPv4Address> GetHostIPv4Address() {
+    // The lookup and its error logging live in GetSelectedNetworkInterface, so a missing
+    // interface is reported there with the same messages.
+    const auto selected_interface = GetSelectedNetworkInterface();
+    if (!selected_interface.has_value()) {
+        return std::nullopt;
    }

-    hostent* const ent = gethostbyname(name.data());
-    if (!ent) {
-        return {IPv4Address{}, GetAndLogLastError()};
-    }
-    if (ent->h_addr_list == nullptr) {
-        UNIMPLEMENTED_MSG("No addr provided in hostent->h_addr_list");
-        return {IPv4Address{}, Errno::SUCCESS};
-    }
-    if (ent->h_length != sizeof(in_addr)) {
-        UNIMPLEMENTED_MSG("Unexpected size={} in hostent->h_length", ent->h_length);
-    }
+    // 16 bytes hold the longest dotted-quad string plus its terminator.
+    char ip_addr[16] = {};
+    // Do the conversion outside of ASSERT so the call never depends on how the macro
+    // treats its argument.
+    const char* const ip_text =
+        inet_ntop(AF_INET, &selected_interface->ip_address, ip_addr, sizeof(ip_addr));
+    ASSERT(ip_text != nullptr);
+    LOG_INFO(Network, "IP address: {}", ip_addr);

-    in_addr addr;
-    std::memcpy(&addr, ent->h_addr_list[0], sizeof(addr));
-    return {TranslateIPv4(addr), Errno::SUCCESS};
+    return TranslateIPv4(selected_interface->ip_address);
 }

 std::pair<s32, Errno> Poll(std::vector<PollFD>& pollfds, s32 timeout) {
@@ -570,7 +568,7 @@ std::pair<s32, Errno> Socket::SendTo(u32 flags, const std::vector<u8>& message,
    ASSERT(flags == 0);

    const sockaddr* to = nullptr;
-    const int tolen = addr ? 0 : sizeof(sockaddr);
+    const int tolen = addr ? sizeof(sockaddr) : 0;
    sockaddr host_addr_in;

    if (addr) {
--- a/src/core/network/network.h
+++ b/src/core/network/network.h
@@ -5,11 +5,18 @@
 #pragma once

 #include <array>
+#include <optional>
 #include <utility>

 #include "common/common_funcs.h"
 #include "common/common_types.h"

+#ifdef _WIN32
+#include <winsock2.h>
+#elif YUZU_UNIX
+#include <netinet/in.h>
+#endif
+
 namespace Network {

 class Socket;
@@ -92,8 +99,21 @@ public:
    ~NetworkInstance();
 };

+#ifdef _WIN32
+constexpr IPv4Address TranslateIPv4(in_addr addr) {
+    auto& bytes = addr.S_un.S_un_b; // per-octet view of the address
+    return IPv4Address{bytes.s_b1, bytes.s_b2, bytes.s_b3, bytes.s_b4};
+}
+#elif YUZU_UNIX
+constexpr IPv4Address TranslateIPv4(in_addr addr) {
+    const u32 bytes = addr.s_addr; // NOTE(review): taking the low byte first assumes a little-endian host — confirm
+    return IPv4Address{static_cast<u8>(bytes), static_cast<u8>(bytes >> 8),
+                       static_cast<u8>(bytes >> 16), static_cast<u8>(bytes >> 24)};
+}
+#endif
+
 /// @brief Returns host's IPv4 address
-/// @return Pair of an array of human ordered IPv4 address (e.g. 192.168.0.1) and an error code
-std::pair<IPv4Address, Errno> GetHostIPv4Address();
+/// @return human ordered IPv4 address (e.g. 192.168.0.1) as an array
+std::optional<IPv4Address> GetHostIPv4Address();

 } // namespace Network
--- a/src/core/network/network_interface.cpp
+++ b/src/core/network/network_interface.cpp
@@ -0,0 +1,210 @@
+// Copyright 2021 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <cstring>
+#include <fstream>
+#include <limits>
+#include <sstream>
+#include <vector>
+
+#include "common/bit_cast.h"
+#include "common/common_types.h"
+#include "common/logging/log.h"
+#include "common/settings.h"
+#include "common/string_util.h"
+#include "core/network/network_interface.h"
+
+#ifdef _WIN32
+#include <iphlpapi.h>
+#else
+#include <cerrno>
+#include <ifaddrs.h>
+#include <net/if.h>
+#endif
+
+namespace Network {
+
+#ifdef _WIN32
+
+std::vector<NetworkInterface> GetAvailableNetworkInterfaces() {
+    std::vector<IP_ADAPTER_ADDRESSES> adapter_addresses;
+    DWORD ret = ERROR_BUFFER_OVERFLOW;
+    DWORD buf_size = 0; // in/out byte count passed to GetAdaptersAddresses; the vector starts empty
+
+    // retry up to 5 times
+    for (int i = 0; i < 5 && ret == ERROR_BUFFER_OVERFLOW; i++) {
+        ret = GetAdaptersAddresses(
+            AF_INET, GAA_FLAG_SKIP_MULTICAST | GAA_FLAG_SKIP_DNS_SERVER | GAA_FLAG_INCLUDE_GATEWAYS,
+            nullptr, adapter_addresses.data(), &buf_size);
+
+        if (ret != ERROR_BUFFER_OVERFLOW) {
+            break;
+        }
+
+        adapter_addresses.resize((buf_size / sizeof(IP_ADAPTER_ADDRESSES)) + 1); // rounded up so the vector spans at least buf_size bytes
+    }
+
+    if (ret != NO_ERROR) {
+        LOG_ERROR(Network, "Failed to get network interfaces with GetAdaptersAddresses");
+        return {};
+    }
+
+    std::vector<NetworkInterface> result;
+
+    for (auto current_address = adapter_addresses.data(); current_address != nullptr;
+         current_address = current_address->Next) { // entries are walked through their Next pointers
+        if (current_address->FirstUnicastAddress == nullptr ||
+            current_address->FirstUnicastAddress->Address.lpSockaddr == nullptr) {
+            continue;
+        }
+
+        if (current_address->OperStatus != IfOperStatusUp) { // skip adapters that are not up
+            continue;
+        }
+
+        const auto ip_addr = Common::BitCast<struct sockaddr_in>(
+                                 *current_address->FirstUnicastAddress->Address.lpSockaddr)
+                                 .sin_addr;
+
+        ULONG mask = 0;
+        if (ConvertLengthToIpv4Mask(current_address->FirstUnicastAddress->OnLinkPrefixLength,
+                                    &mask) != NO_ERROR) {
+            LOG_ERROR(Network, "Failed to convert IPv4 prefix length to subnet mask");
+            continue;
+        }
+
+        struct in_addr gateway = {.S_un{.S_addr{0}}}; // stays zero when the adapter lists no gateway
+        if (current_address->FirstGatewayAddress != nullptr &&
+            current_address->FirstGatewayAddress->Address.lpSockaddr != nullptr) {
+            gateway = Common::BitCast<struct sockaddr_in>(
+                          *current_address->FirstGatewayAddress->Address.lpSockaddr)
+                          .sin_addr;
+        }
+
+        result.emplace_back(NetworkInterface{
+            .name{Common::UTF16ToUTF8(std::wstring{current_address->FriendlyName})},
+            .ip_address{ip_addr},
+            .subnet_mask = in_addr{.S_un{.S_addr{mask}}},
+            .gateway = gateway});
+    }
+
+    return result;
+}
+
+#else
+
+namespace {
+
+/// Looks up the default-route gateway listed for `iface_name` in /proc/net/route.
+/// Returns the gateway value exactly as stored in the table, or 0 when the table cannot
+/// be opened or holds no default route with a gateway for that interface.
+u32 FindDefaultGateway(const char* iface_name) {
+    std::ifstream file{"/proc/net/route"};
+    if (!file.is_open()) {
+        LOG_ERROR(Network, "Failed to open \"/proc/net/route\"");
+        return 0;
+    }
+
+    // ignore header
+    file.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
+
+    for (std::string line; std::getline(file, line);) {
+        std::istringstream iss{line};
+
+        std::string route_iface;
+        iss >> route_iface;
+        if (route_iface != iface_name) {
+            continue;
+        }
+
+        // The next three columns (destination, gateway, flags) are parsed as hexadecimal.
+        u32 dest{};
+        u32 gateway{};
+        u16 flags{};
+        iss >> std::hex >> dest >> gateway >> flags;
+
+        if (dest != 0) {
+            // not the default route
+            continue;
+        }
+
+        // flag RTF_GATEWAY (defined in <linux/route.h>)
+        if ((flags & 0x2) == 0) {
+            continue;
+        }
+
+        return gateway;
+    }
+
+    return 0;
+}
+
+} // Anonymous namespace
+
+/// Enumerates the IPv4 interfaces that are up and are not loopback devices, together with
+/// their netmask and default gateway (0 when none is found).
+/// Returns an empty vector when getifaddrs fails.
+std::vector<NetworkInterface> GetAvailableNetworkInterfaces() {
+    struct ifaddrs* ifaddr = nullptr;
+
+    if (getifaddrs(&ifaddr) != 0) {
+        LOG_ERROR(Network, "Failed to get network interfaces with getifaddrs: {}",
+                  std::strerror(errno));
+        return {};
+    }
+
+    std::vector<NetworkInterface> result;
+
+    for (auto ifa = ifaddr; ifa != nullptr; ifa = ifa->ifa_next) {
+        if (ifa->ifa_addr == nullptr || ifa->ifa_netmask == nullptr) {
+            continue;
+        }
+
+        if (ifa->ifa_addr->sa_family != AF_INET) {
+            continue;
+        }
+
+        if ((ifa->ifa_flags & IFF_UP) == 0 || (ifa->ifa_flags & IFF_LOOPBACK) != 0) {
+            continue;
+        }
+
+        // A failed lookup still yields an entry; its gateway is simply left at 0.
+        const u32 gateway = FindDefaultGateway(ifa->ifa_name);
+
+        result.emplace_back(NetworkInterface{
+            .name{ifa->ifa_name},
+            .ip_address{Common::BitCast<struct sockaddr_in>(*ifa->ifa_addr).sin_addr},
+            .subnet_mask{Common::BitCast<struct sockaddr_in>(*ifa->ifa_netmask).sin_addr},
+            .gateway{in_addr{.s_addr = gateway}}});
+    }
+
+    freeifaddrs(ifaddr);
+
+    return result;
+}
+
+#endif
+
+/// Returns the available interface whose name equals the one stored in the settings,
+/// or std::nullopt (after logging an error) when there are no interfaces or no match.
+std::optional<NetworkInterface> GetSelectedNetworkInterface() {
+    const auto& wanted_name = Settings::values.network_interface.GetValue();
+    const auto candidates = Network::GetAvailableNetworkInterfaces();
+
+    if (candidates.empty()) {
+        LOG_ERROR(Network, "GetAvailableNetworkInterfaces returned no interfaces");
+        return std::nullopt;
+    }
+
+    const auto has_wanted_name = [&wanted_name](const auto& iface) {
+        return iface.name == wanted_name;
+    };
+    const auto match = std::ranges::find_if(candidates, has_wanted_name);
+
+    if (match != candidates.end()) {
+        return *match;
+    }
+
+    LOG_ERROR(Network, "Couldn't find selected interface \"{}\"", wanted_name);
+    return std::nullopt;
+}
+
+} // namespace Network
--- a/src/core/network/network_interface.h
+++ b/src/core/network/network_interface.h
@@ -0,0 +1,29 @@
+// Copyright 2021 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <optional>
+#include <string>
+#include <vector>
+
+#ifdef _WIN32
+#include <winsock2.h>
+#else
+#include <netinet/in.h>
+#endif
+
+namespace Network {
+
+struct NetworkInterface {
+    std::string name; // interface name (adapter FriendlyName on Windows, ifa_name otherwise)
+    struct in_addr ip_address; // IPv4 address of the interface
+    struct in_addr subnet_mask; // IPv4 netmask of the interface
+    struct in_addr gateway; // default gateway; zero when none was found
+};
+
+std::vector<NetworkInterface> GetAvailableNetworkInterfaces();
+std::optional<NetworkInterface> GetSelectedNetworkInterface();
+
+} // namespace Network
--- a/src/input_common/main.cpp
+++ b/src/input_common/main.cpp
@@ -304,10 +304,10 @@ std::vector<std::unique_ptr<Polling::DevicePoller>> InputSubsystem::GetPollers([
 }

 std::string GenerateKeyboardParam(int key_code) {
-    Common::ParamPackage param{
-        {"engine", "keyboard"},
-        {"code", std::to_string(key_code)},
-    };
+    Common::ParamPackage param;
+    param.Set("engine", "keyboard");
+    param.Set("code", key_code); // stored via the integer overload instead of a pre-formatted string
+    param.Set("toggle", false); // generated bindings start with "toggle" off
    return param.Serialize();
 }

--- a/src/input_common/mouse/mouse_poller.cpp
+++ b/src/input_common/mouse/mouse_poller.cpp
@@ -57,6 +57,7 @@ Common::ParamPackage MouseButtonFactory::GetNextInput() const {
        if (pad.button != MouseInput::MouseButton::Undefined) {
            params.Set("engine", "mouse");
            params.Set("button", static_cast<u16>(pad.button));
+            params.Set("toggle", false);
            return params;
        }
    }
--- a/src/input_common/sdl/sdl_impl.cpp
+++ b/src/input_common/sdl/sdl_impl.cpp
@@ -82,6 +82,12 @@ public:
        state.buttons.insert_or_assign(button, value);
    }

+    /// Records `button` as released unless a state has already been stored for it.
+    void PreSetButton(int button) {
+        // Look up and insert under a single lock so the check cannot race with a
+        // concurrent update; try_emplace leaves an existing entry untouched.
+        std::lock_guard lock{mutex};
+        state.buttons.try_emplace(button, false);
+    }
+
    void SetMotion(SDL_ControllerSensorEvent event) {
        constexpr float gravity_constant = 9.80665f;
        std::lock_guard lock{mutex};
@@ -155,9 +161,16 @@ public:
        state.axes.insert_or_assign(axis, value);
    }

-    float GetAxis(int axis, float range) const {
+    /// Records `axis` at position 0 unless a value has already been stored for it.
+    void PreSetAxis(int axis) {
+        // Look up and insert under a single lock so the check cannot race with a
+        // concurrent update; try_emplace leaves an existing entry untouched.
+        std::lock_guard lock{mutex};
+        state.axes.try_emplace(axis, 0);
+    }
+
+    float GetAxis(int axis, float range, float offset) const {
        std::lock_guard lock{mutex};
-        return static_cast<float>(state.axes.at(axis)) / (32767.0f * range);
+        const float value = static_cast<float>(state.axes.at(axis)) / 32767.0f;
+        return (value + offset) / range;
    }

    bool RumblePlay(u16 amp_low, u16 amp_high) {
@@ -174,9 +187,10 @@ public:
        return false;
    }

-    std::tuple<float, float> GetAnalog(int axis_x, int axis_y, float range) const {
-        float x = GetAxis(axis_x, range);
-        float y = GetAxis(axis_y, range);
+    std::tuple<float, float> GetAnalog(int axis_x, int axis_y, float range, float offset_x,
+                                       float offset_y) const {
+        float x = GetAxis(axis_x, range, offset_x);
+        float y = GetAxis(axis_y, range, offset_y);
        y = -y; // 3DS uses an y-axis inverse from SDL

        // Make sure the coordinates are in the unit circle,
@@ -483,7 +497,7 @@ public:
          trigger_if_greater(trigger_if_greater_) {}

    bool GetStatus() const override {
-        const float axis_value = joystick->GetAxis(axis, 1.0f);
+        const float axis_value = joystick->GetAxis(axis, 1.0f, 0.0f);
        if (trigger_if_greater) {
            return axis_value > threshold;
        }
@@ -500,12 +514,14 @@ private:
 class SDLAnalog final : public Input::AnalogDevice {
 public:
    explicit SDLAnalog(std::shared_ptr<SDLJoystick> joystick_, int axis_x_, int axis_y_,
-                       bool invert_x_, bool invert_y_, float deadzone_, float range_)
+                       bool invert_x_, bool invert_y_, float deadzone_, float range_,
+                       float offset_x_, float offset_y_)
        : joystick(std::move(joystick_)), axis_x(axis_x_), axis_y(axis_y_), invert_x(invert_x_),
-          invert_y(invert_y_), deadzone(deadzone_), range(range_) {}
+          invert_y(invert_y_), deadzone(deadzone_), range(range_), offset_x(offset_x_),
+          offset_y(offset_y_) {}

    std::tuple<float, float> GetStatus() const override {
-        auto [x, y] = joystick->GetAnalog(axis_x, axis_y, range);
+        auto [x, y] = joystick->GetAnalog(axis_x, axis_y, range, offset_x, offset_y);
        const float r = std::sqrt((x * x) + (y * y));
        if (invert_x) {
            x = -x;
@@ -522,8 +538,8 @@ public:
    }

    std::tuple<float, float> GetRawStatus() const override {
-        const float x = joystick->GetAxis(axis_x, range);
-        const float y = joystick->GetAxis(axis_y, range);
+        const float x = joystick->GetAxis(axis_x, range, offset_x);
+        const float y = joystick->GetAxis(axis_y, range, offset_y);
        return {x, -y};
    }

@@ -555,6 +571,8 @@ private:
    const bool invert_y;
    const float deadzone;
    const float range;
+    const float offset_x;
+    const float offset_y;
 };

 class SDLVibration final : public Input::VibrationDevice {
@@ -621,7 +639,7 @@ public:
          trigger_if_greater(trigger_if_greater_) {}

    Input::MotionStatus GetStatus() const override {
-        const float axis_value = joystick->GetAxis(axis, 1.0f);
+        const float axis_value = joystick->GetAxis(axis, 1.0f, 0.0f);
        bool trigger = axis_value < threshold;
        if (trigger_if_greater) {
            trigger = axis_value > threshold;
@@ -720,13 +738,13 @@ public:
                LOG_ERROR(Input, "Unknown direction {}", direction_name);
            }
            // This is necessary so accessing GetAxis with axis won't crash
-            joystick->SetAxis(axis, 0);
+            joystick->PreSetAxis(axis);
            return std::make_unique<SDLAxisButton>(joystick, axis, threshold, trigger_if_greater);
        }

        const int button = params.Get("button", 0);
        // This is necessary so accessing GetButton with button won't crash
-        joystick->SetButton(button, false);
+        joystick->PreSetButton(button);
        return std::make_unique<SDLButton>(joystick, button, toggle);
    }

@@ -757,13 +775,15 @@ public:
        const std::string invert_y_value = params.Get("invert_y", "+");
        const bool invert_x = invert_x_value == "-";
        const bool invert_y = invert_y_value == "-";
+        const float offset_x = params.Get("offset_x", 0.0f);
+        const float offset_y = params.Get("offset_y", 0.0f);
        auto joystick = state.GetSDLJoystickByGUID(guid, port);

        // This is necessary so accessing GetAxis with axis_x and axis_y won't crash
-        joystick->SetAxis(axis_x, 0);
-        joystick->SetAxis(axis_y, 0);
+        joystick->PreSetAxis(axis_x);
+        joystick->PreSetAxis(axis_y);
        return std::make_unique<SDLAnalog>(joystick, axis_x, axis_y, invert_x, invert_y, deadzone,
-                                           range);
+                                           range, offset_x, offset_y);
    }

 private:
@@ -844,13 +864,13 @@ public:
                LOG_ERROR(Input, "Unknown direction {}", direction_name);
            }
            // This is necessary so accessing GetAxis with axis won't crash
-            joystick->SetAxis(axis, 0);
+            joystick->PreSetAxis(axis);
            return std::make_unique<SDLAxisMotion>(joystick, axis, threshold, trigger_if_greater);
        }

        const int button = params.Get("button", 0);
        // This is necessary so accessing GetButton with button won't crash
-        joystick->SetButton(button, false);
+        joystick->PreSetButton(button);
        return std::make_unique<SDLButtonMotion>(joystick, button);
    }

@@ -869,6 +889,9 @@ SDLState::SDLState() {
    RegisterFactory<VibrationDevice>("sdl", vibration_factory);
    RegisterFactory<MotionDevice>("sdl", motion_factory);

+    // Disable raw input. When enabled this setting causes SDL to die when a web applet opens
+    SDL_SetHint(SDL_HINT_JOYSTICK_RAWINPUT, "0");
+
    // Enable HIDAPI rumble. This prevents SDL from disabling motion on PS4 and PS5 controllers
    SDL_SetHint(SDL_HINT_JOYSTICK_HIDAPI_PS4_RUMBLE, "1");
    SDL_SetHint(SDL_HINT_JOYSTICK_HIDAPI_PS5_RUMBLE, "1");
@@ -995,6 +1018,7 @@ Common::ParamPackage BuildButtonParamPackageForButton(int port, std::string guid
    params.Set("port", port);
    params.Set("guid", std::move(guid));
    params.Set("button", button);
+    params.Set("toggle", false);
    return params;
 }

@@ -1134,13 +1158,15 @@ Common::ParamPackage BuildParamPackageForBinding(int port, const std::string& gu
 }

 Common::ParamPackage BuildParamPackageForAnalog(int port, const std::string& guid, int axis_x,
-                                                int axis_y) {
+                                                int axis_y, float offset_x, float offset_y) {
    Common::ParamPackage params;
    params.Set("engine", "sdl");
    params.Set("port", port);
    params.Set("guid", guid);
    params.Set("axis_x", axis_x);
    params.Set("axis_y", axis_y);
+    params.Set("offset_x", offset_x);
+    params.Set("offset_y", offset_y);
    params.Set("invert_x", "+");
    params.Set("invert_y", "+");
    return params;
@@ -1342,24 +1368,39 @@ AnalogMapping SDLState::GetAnalogMappingForDevice(const Common::ParamPackage& pa
    const auto& binding_left_y =
        SDL_GameControllerGetBindForAxis(controller, SDL_CONTROLLER_AXIS_LEFTY);
    if (params.Has("guid2")) {
+        joystick2->PreSetAxis(binding_left_x.value.axis);
+        joystick2->PreSetAxis(binding_left_y.value.axis);
+        const auto left_offset_x = -joystick2->GetAxis(binding_left_x.value.axis, 1.0f, 0);
+        const auto left_offset_y = -joystick2->GetAxis(binding_left_y.value.axis, 1.0f, 0);
        mapping.insert_or_assign(
            Settings::NativeAnalog::LStick,
            BuildParamPackageForAnalog(joystick2->GetPort(), joystick2->GetGUID(),
-                                       binding_left_x.value.axis, binding_left_y.value.axis));
+                                       binding_left_x.value.axis, binding_left_y.value.axis,
+                                       left_offset_x, left_offset_y));
    } else {
+        joystick->PreSetAxis(binding_left_x.value.axis);
+        joystick->PreSetAxis(binding_left_y.value.axis);
+        const auto left_offset_x = -joystick->GetAxis(binding_left_x.value.axis, 1.0f, 0);
+        const auto left_offset_y = -joystick->GetAxis(binding_left_y.value.axis, 1.0f, 0);
        mapping.insert_or_assign(
            Settings::NativeAnalog::LStick,
            BuildParamPackageForAnalog(joystick->GetPort(), joystick->GetGUID(),
-                                       binding_left_x.value.axis, binding_left_y.value.axis));
+                                       binding_left_x.value.axis, binding_left_y.value.axis,
+                                       left_offset_x, left_offset_y));
    }
    const auto& binding_right_x =
        SDL_GameControllerGetBindForAxis(controller, SDL_CONTROLLER_AXIS_RIGHTX);
    const auto& binding_right_y =
        SDL_GameControllerGetBindForAxis(controller, SDL_CONTROLLER_AXIS_RIGHTY);
+    joystick->PreSetAxis(binding_right_x.value.axis);
+    joystick->PreSetAxis(binding_right_y.value.axis);
+    const auto right_offset_x = -joystick->GetAxis(binding_right_x.value.axis, 1.0f, 0);
+    const auto right_offset_y = -joystick->GetAxis(binding_right_y.value.axis, 1.0f, 0);
    mapping.insert_or_assign(Settings::NativeAnalog::RStick,
                             BuildParamPackageForAnalog(joystick->GetPort(), joystick->GetGUID(),
                                                        binding_right_x.value.axis,
-                                                        binding_right_y.value.axis));
+                                                        binding_right_y.value.axis, right_offset_x,
+                                                        right_offset_y));
    return mapping;
 }

@@ -1563,8 +1604,9 @@ public:
            }

            if (const auto joystick = state.GetSDLJoystickBySDLID(event.jaxis.which)) {
+                // Set offset to zero since the joystick is not on center
                auto params = BuildParamPackageForAnalog(joystick->GetPort(), joystick->GetGUID(),
-                                                         first_axis, axis);
+                                                         first_axis, axis, 0, 0);
                first_axis = -1;
                return params;
            }
--- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
@@ -298,14 +298,10 @@ Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, Id vertex) {
    if (IR::IsGeneric(attr)) {
        const u32 index{IR::GenericAttributeIndex(attr)};
        const std::optional<AttrInfo> type{AttrTypes(ctx, index)};
-        if (!type) {
-            // Attribute is disabled
+        if (!type || !ctx.runtime_info.previous_stage_stores.Generic(index, element)) {
+            // Attribute is disabled or varying component is not written
            return ctx.Const(element == 3 ? 1.0f : 0.0f);
        }
-        if (!ctx.runtime_info.previous_stage_stores.Generic(index, element)) {
-            // Varying component is not written
-            return ctx.Const(type && element == 3 ? 1.0f : 0.0f);
-        }
        const Id generic_id{ctx.input_generics.at(index)};
        const Id pointer{AttrPointer(ctx, type->pointer, vertex, generic_id, ctx.Const(element))};
        const Id value{ctx.OpLoad(type->id, pointer)};
@@ -337,8 +333,9 @@ Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, Id vertex) {
            return ctx.OpBitcast(ctx.F32[1], ctx.OpISub(ctx.U32[1], index, base));
        }
    case IR::Attribute::FrontFace:
-        return ctx.OpSelect(ctx.U32[1], ctx.OpLoad(ctx.U1, ctx.front_face),
-                            ctx.Const(std::numeric_limits<u32>::max()), ctx.u32_zero_value);
+        return ctx.OpSelect(ctx.F32[1], ctx.OpLoad(ctx.U1, ctx.front_face),
+                            ctx.OpBitcast(ctx.F32[1], ctx.Const(std::numeric_limits<u32>::max())),
+                            ctx.f32_zero_value);
    case IR::Attribute::PointSpriteS:
        return ctx.OpLoad(ctx.F32[1],
                          ctx.OpAccessChain(ctx.input_f32, ctx.point_coord, ctx.u32_zero_value));
--- a/src/shader_recompiler/frontend/maxwell/structured_control_flow.cpp
+++ b/src/shader_recompiler/frontend/maxwell/structured_control_flow.cpp
@@ -20,6 +20,7 @@
 #include "shader_recompiler/frontend/maxwell/decode.h"
 #include "shader_recompiler/frontend/maxwell/structured_control_flow.h"
 #include "shader_recompiler/frontend/maxwell/translate/translate.h"
+#include "shader_recompiler/host_translate_info.h"
 #include "shader_recompiler/object_pool.h"

 namespace Shader::Maxwell {
@@ -652,7 +653,7 @@ class TranslatePass {
 public:
    TranslatePass(ObjectPool<IR::Inst>& inst_pool_, ObjectPool<IR::Block>& block_pool_,
                  ObjectPool<Statement>& stmt_pool_, Environment& env_, Statement& root_stmt,
-                  IR::AbstractSyntaxList& syntax_list_)
+                  IR::AbstractSyntaxList& syntax_list_, const HostTranslateInfo& host_info)
        : stmt_pool{stmt_pool_}, inst_pool{inst_pool_}, block_pool{block_pool_}, env{env_},
          syntax_list{syntax_list_} {
        Visit(root_stmt, nullptr, nullptr);
@@ -660,6 +661,9 @@ public:
        IR::Block& first_block{*syntax_list.front().data.block};
        IR::IREmitter ir(first_block, first_block.begin());
        ir.Prologue();
+        if (uses_demote_to_helper && host_info.needs_demote_reorder) {
+            DemoteCombinationPass();
+        }
    }

 private:
@@ -809,7 +813,14 @@ private:
            }
            case StatementType::Return: {
                ensure_block();
-                IR::IREmitter{*current_block}.Epilogue();
+                IR::Block* return_block{block_pool.Create(inst_pool)};
+                IR::IREmitter{*return_block}.Epilogue();
+                current_block->AddBranch(return_block);
+
+                auto& merge{syntax_list.emplace_back()};
+                merge.type = IR::AbstractSyntaxNode::Type::Block;
+                merge.data.block = return_block;
+
                current_block = nullptr;
                syntax_list.emplace_back().type = IR::AbstractSyntaxNode::Type::Return;
                break;
@@ -824,6 +835,7 @@ private:
                auto& merge{syntax_list.emplace_back()};
                merge.type = IR::AbstractSyntaxNode::Type::Block;
                merge.data.block = demote_block;
+                uses_demote_to_helper = true;
                break;
            }
            case StatementType::Unreachable: {
@@ -855,11 +867,128 @@ private:
        return block_pool.Create(inst_pool);
    }

+    /// Combines the top-level conditional demotes into one conditional demote that is placed
+    /// right before the block holding the Epilogue. The combined condition is the logical OR of
+    /// every collected demote condition. Blocks nested more than one branch deep are skipped.
+    void DemoteCombinationPass() {
+        using Type = IR::AbstractSyntaxNode::Type;
+        std::vector<IR::Block*> demote_blocks;
+        std::vector<IR::U1> demote_conds;
+        u32 num_epilogues{};
+        u32 branch_depth{};
+        for (const IR::AbstractSyntaxNode& node : syntax_list) {
+            if (node.type == Type::If) {
+                ++branch_depth;
+            }
+            if (node.type == Type::EndIf) {
+                --branch_depth;
+            }
+            if (node.type != Type::Block) {
+                continue;
+            }
+            if (branch_depth > 1) {
+                // Skip reordering nested demote branches.
+                continue;
+            }
+            for (const IR::Inst& inst : node.data.block->Instructions()) {
+                const IR::Opcode op{inst.GetOpcode()};
+                if (op == IR::Opcode::DemoteToHelperInvocation) {
+                    demote_blocks.push_back(node.data.block);
+                    break;
+                }
+                if (op == IR::Opcode::Epilogue) {
+                    ++num_epilogues;
+                }
+            }
+        }
+        if (demote_blocks.size() == 0) {
+            return;
+        }
+        if (num_epilogues > 1) {
+            LOG_DEBUG(Shader, "Combining demotes with more than one return is not implemented.");
+            return;
+        }
+        const auto epilogue_func{[](const IR::AbstractSyntaxNode& asn) {
+            if (asn.type != Type::Block) {
+                return false;
+            }
+            for (const auto& inst : asn.data.block->Instructions()) {
+                if (inst.GetOpcode() == IR::Opcode::Epilogue) {
+                    return true;
+                }
+            }
+            return false;
+        }};
+        auto& asl{syntax_list};
+        if (std::none_of(asl.begin(), asl.end(), epilogue_func)) {
+            // Bail out before touching the list: there is no return block to anchor the demote to.
+            return;
+        }
+        s64 last_iterator_offset{};
+        for (const IR::Block* demote_block : demote_blocks) {
+            const auto start_it{asl.begin() + last_iterator_offset};
+            auto asl_it{std::find_if(start_it, asl.end(), [&](const IR::AbstractSyntaxNode& asn) {
+                return asn.type == Type::If && asn.data.if_node.body == demote_block;
+            })};
+            if (asl_it == asl.end()) {
+                // Demote without a conditional branch.
+                // No need to proceed since all fragment instances will be demoted regardless.
+                // NOTE(review): branches erased by earlier iterations are not restored here.
+                return;
+            }
+            const IR::Block* const end_if = asl_it->data.if_node.merge;
+            demote_conds.push_back(asl_it->data.if_node.cond);
+            last_iterator_offset = std::distance(asl.begin(), asl_it);
+
+            asl_it = asl.erase(asl_it);
+            asl_it = std::find_if(asl_it, asl.end(), [&](const IR::AbstractSyntaxNode& asn) {
+                return asn.type == Type::Block && asn.data.block == demote_block;
+            });
+
+            asl_it = asl.erase(asl_it);
+            asl_it = std::find_if(asl_it, asl.end(), [&](const IR::AbstractSyntaxNode& asn) {
+                return asn.type == Type::EndIf && asn.data.end_if.merge == end_if;
+            });
+            asl_it = asl.erase(asl_it);
+        }
+        const auto reverse_it{std::find_if(asl.rbegin(), asl.rend(), epilogue_func)};
+        const auto return_block_it{(reverse_it + 1).base()};
+
+        // NOTE(review): assumes the node preceding the return block is a Block node - confirm.
+        IR::IREmitter ir{*(return_block_it - 1)->data.block};
+        IR::U1 cond(IR::Value(false));
+        for (const auto& demote_cond : demote_conds) {
+            cond = ir.LogicalOr(cond, demote_cond);
+        }
+        cond.Inst()->DestructiveAddUsage(1);
+
+        IR::AbstractSyntaxNode demote_if_node{};
+        demote_if_node.type = Type::If;
+        demote_if_node.data.if_node.cond = cond;
+        demote_if_node.data.if_node.body = demote_blocks[0];
+        demote_if_node.data.if_node.merge = return_block_it->data.block;
+
+        IR::AbstractSyntaxNode demote_node{};
+        demote_node.type = Type::Block;
+        demote_node.data.block = demote_blocks[0];
+
+        IR::AbstractSyntaxNode demote_endif_node{};
+        demote_endif_node.type = Type::EndIf;
+        demote_endif_node.data.end_if.merge = return_block_it->data.block;
+
+        // Each insert invalidates iterators at or after the insertion point, so continue from the
+        // iterator it returns. Inserting in reverse leaves If, Block, EndIf ahead of the return.
+        auto insert_it{asl.insert(return_block_it, demote_endif_node)};
+        insert_it = asl.insert(insert_it, demote_node);
+        asl.insert(insert_it, demote_if_node);
+    }
+
    ObjectPool<Statement>& stmt_pool;
    ObjectPool<IR::Inst>& inst_pool;
    ObjectPool<IR::Block>& block_pool;
    Environment& env;
    IR::AbstractSyntaxList& syntax_list;
+    bool uses_demote_to_helper{};

 // TODO: C++20 Remove this when all compilers support constexpr std::vector
 #if __cpp_lib_constexpr_vector >= 201907
@@ -871,12 +989,13 @@ private:
 } // Anonymous namespace

 IR::AbstractSyntaxList BuildASL(ObjectPool<IR::Inst>& inst_pool, ObjectPool<IR::Block>& block_pool,
-                                Environment& env, Flow::CFG& cfg) {
+                                Environment& env, Flow::CFG& cfg,
+                                const HostTranslateInfo& host_info) {
    ObjectPool<Statement> stmt_pool{64};
    GotoPass goto_pass{cfg, stmt_pool};
    Statement& root{goto_pass.RootStatement()};
    IR::AbstractSyntaxList syntax_list;
-    TranslatePass{inst_pool, block_pool, stmt_pool, env, root, syntax_list};
+    TranslatePass{inst_pool, block_pool, stmt_pool, env, root, syntax_list, host_info};
    return syntax_list;
 }

--- a/src/shader_recompiler/frontend/maxwell/structured_control_flow.h
+++ b/src/shader_recompiler/frontend/maxwell/structured_control_flow.h
@@ -11,10 +11,13 @@
 #include "shader_recompiler/frontend/maxwell/control_flow.h"
 #include "shader_recompiler/object_pool.h"

-namespace Shader::Maxwell {
+namespace Shader {
+struct HostTranslateInfo;
+namespace Maxwell {

 [[nodiscard]] IR::AbstractSyntaxList BuildASL(ObjectPool<IR::Inst>& inst_pool,
                                              ObjectPool<IR::Block>& block_pool, Environment& env,
-                                              Flow::CFG& cfg);
+                                              Flow::CFG& cfg, const HostTranslateInfo& host_info);

-} // namespace Shader::Maxwell
+} // namespace Maxwell
+} // namespace Shader
--- a/src/shader_recompiler/frontend/maxwell/translate_program.cpp
+++ b/src/shader_recompiler/frontend/maxwell/translate_program.cpp
@@ -130,7 +130,7 @@ void AddNVNStorageBuffers(IR::Program& program) {
 IR::Program TranslateProgram(ObjectPool<IR::Inst>& inst_pool, ObjectPool<IR::Block>& block_pool,
                             Environment& env, Flow::CFG& cfg, const HostTranslateInfo& host_info) {
    IR::Program program;
-    program.syntax_list = BuildASL(inst_pool, block_pool, env, cfg);
+    program.syntax_list = BuildASL(inst_pool, block_pool, env, cfg, host_info);
    program.blocks = GenerateBlocks(program.syntax_list);
    program.post_order_blocks = PostOrder(program.syntax_list.front());
    program.stage = env.ShaderStage();
--- a/src/shader_recompiler/host_translate_info.h
+++ b/src/shader_recompiler/host_translate_info.h
@@ -11,8 +11,9 @@ namespace Shader {

 /// Misc information about the host
 struct HostTranslateInfo {
-    bool support_float16{}; ///< True when the device supports 16-bit floats
-    bool support_int64{};   ///< True when the device supports 64-bit integers
+    bool support_float16{};      ///< True when the device supports 16-bit floats
+    bool support_int64{};        ///< True when the device supports 64-bit integers
+    bool needs_demote_reorder{}; ///< True when the device needs DemoteToHelperInvocation reordered
 };

 } // namespace Shader
--- a/src/tests/common/param_package.cpp
+++ b/src/tests/common/param_package.cpp
@@ -4,11 +4,13 @@

 #include <catch2/catch.hpp>
 #include <math.h>
+#include "common/logging/backend.h"
 #include "common/param_package.h"

 namespace Common {

 TEST_CASE("ParamPackage", "[common]") {
+    Common::Log::DisableLoggingInTests();
    ParamPackage original{
        {"abc", "xyz"},
        {"def", "42"},
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -1,5 +1,10 @@
 add_subdirectory(host_shaders)

+if(LIBVA_FOUND)
+    set_source_files_properties(command_classes/codecs/codec.cpp
+        PROPERTIES COMPILE_DEFINITIONS LIBVA_FOUND=1)
+endif()
+
 add_library(video_core STATIC
    buffer_cache/buffer_base.h
    buffer_cache/buffer_cache.cpp
@@ -92,6 +97,7 @@ add_library(video_core STATIC
    renderer_opengl/gl_stream_buffer.h
    renderer_opengl/gl_texture_cache.cpp
    renderer_opengl/gl_texture_cache.h
+    renderer_opengl/gl_texture_cache_base.cpp
    renderer_opengl/gl_query_cache.cpp
    renderer_opengl/gl_query_cache.h
    renderer_opengl/maxwell_to_gl.h
@@ -150,6 +156,7 @@ add_library(video_core STATIC
    renderer_vulkan/vk_swapchain.h
    renderer_vulkan/vk_texture_cache.cpp
    renderer_vulkan/vk_texture_cache.h
+    renderer_vulkan/vk_texture_cache_base.cpp
    renderer_vulkan/vk_update_descriptor.cpp
    renderer_vulkan/vk_update_descriptor.h
    shader_cache.cpp
@@ -181,6 +188,7 @@ add_library(video_core STATIC
    texture_cache/samples_helper.h
    texture_cache/slot_vector.h
    texture_cache/texture_cache.h
+    texture_cache/texture_cache_base.h
    texture_cache/types.h
    texture_cache/util.cpp
    texture_cache/util.h
--- a/src/video_core/buffer_cache/buffer_base.h
+++ b/src/video_core/buffer_cache/buffer_base.h
@@ -261,16 +261,6 @@ public:
        stream_score += score;
    }

-    /// Sets the new frame tick
-    void SetFrameTick(u64 new_frame_tick) noexcept {
-        frame_tick = new_frame_tick;
-    }
-
-    /// Returns the new frame tick
-    [[nodiscard]] u64 FrameTick() const noexcept {
-        return frame_tick;
-    }
-
    /// Returns the likeliness of this being a stream buffer
    [[nodiscard]] int StreamScore() const noexcept {
        return stream_score;
@@ -307,6 +297,16 @@ public:
        return words.size_bytes;
    }

+    /// Returns the identifier last stored with setLRUID
+    [[nodiscard]] size_t getLRUID() const noexcept {
+        return lru_id;
+    }
+
+    /// Stores the identifier later returned by getLRUID
+    void setLRUID(size_t lru_id_) noexcept {
+        lru_id = lru_id_;
+    }
+
 private:
    template <Type type>
    u64* Array() noexcept {
@@ -603,9 +601,9 @@ private:
    RasterizerInterface* rasterizer = nullptr;
    VAddr cpu_addr = 0;
    Words words;
-    u64 frame_tick = 0;
    BufferFlagBits flags{};
    int stream_score = 0;
+    size_t lru_id = SIZE_MAX;
 };

 } // namespace VideoCommon
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -20,6 +20,7 @@
 #include "common/common_types.h"
 #include "common/div_ceil.h"
 #include "common/literals.h"
+#include "common/lru_cache.h"
 #include "common/microprofile.h"
 #include "common/scope_exit.h"
 #include "common/settings.h"
@@ -330,7 +331,7 @@ private:
    template <bool insert>
    void ChangeRegister(BufferId buffer_id);

-    void TouchBuffer(Buffer& buffer) const noexcept;
+    void TouchBuffer(Buffer& buffer, BufferId buffer_id) noexcept;

    bool SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size);

@@ -428,7 +429,11 @@ private:
    size_t immediate_buffer_capacity = 0;
    std::unique_ptr<u8[]> immediate_buffer_alloc;

-    typename SlotVector<Buffer>::Iterator deletion_iterator;
+    struct LRUItemParams {
+        using ObjectType = BufferId;
+        using TickType = u64;
+    };
+    Common::LeastRecentlyUsedCache<LRUItemParams> lru_cache;
    u64 frame_tick = 0;
    u64 total_used_memory = 0;

@@ -445,7 +450,6 @@ BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_,
      kepler_compute{kepler_compute_}, gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_} {
    // Ensure the first slot is used for the null buffer
    void(slot_buffers.insert(runtime, NullBufferParams{}));
-    deletion_iterator = slot_buffers.end();
    common_ranges.clear();
 }

@@ -454,20 +458,17 @@ void BufferCache<P>::RunGarbageCollector() {
    const bool aggressive_gc = total_used_memory >= CRITICAL_MEMORY;
    const u64 ticks_to_destroy = aggressive_gc ? 60 : 120;
    int num_iterations = aggressive_gc ? 64 : 32;
-    for (; num_iterations > 0; --num_iterations) {
-        if (deletion_iterator == slot_buffers.end()) {
-            deletion_iterator = slot_buffers.begin();
+    const auto clean_up = [this, &num_iterations](BufferId buffer_id) {
+        if (num_iterations == 0) {
+            return true;
        }
-        ++deletion_iterator;
-        if (deletion_iterator == slot_buffers.end()) {
-            break;
-        }
-        const auto [buffer_id, buffer] = *deletion_iterator;
-        if (buffer->FrameTick() + ticks_to_destroy < frame_tick) {
-            DownloadBufferMemory(*buffer);
-            DeleteBuffer(buffer_id);
-        }
-    }
+        --num_iterations;
+        auto& buffer = slot_buffers[buffer_id];
+        DownloadBufferMemory(buffer);
+        DeleteBuffer(buffer_id);
+        return false;
+    };
+    lru_cache.ForEachItemBelow(frame_tick - ticks_to_destroy, clean_up);
 }

 template <class P>
@@ -485,7 +486,7 @@ void BufferCache<P>::TickFrame() {
    const bool skip_preferred = hits * 256 < shots * 251;
    uniform_buffer_skip_cache_size = skip_preferred ? DEFAULT_SKIP_CACHE_SIZE : 0;

-    if (Settings::values.use_caches_gc.GetValue() && total_used_memory >= EXPECTED_MEMORY) {
+    if (total_used_memory >= EXPECTED_MEMORY) {
        RunGarbageCollector();
    }
    ++frame_tick;
@@ -954,7 +955,7 @@ bool BufferCache<P>::IsRegionCpuModified(VAddr addr, size_t size) {
 template <class P>
 void BufferCache<P>::BindHostIndexBuffer() {
    Buffer& buffer = slot_buffers[index_buffer.buffer_id];
-    TouchBuffer(buffer);
+    TouchBuffer(buffer, index_buffer.buffer_id);
    const u32 offset = buffer.Offset(index_buffer.cpu_addr);
    const u32 size = index_buffer.size;
    SynchronizeBuffer(buffer, index_buffer.cpu_addr, size);
@@ -975,7 +976,7 @@ void BufferCache<P>::BindHostVertexBuffers() {
    for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) {
        const Binding& binding = vertex_buffers[index];
        Buffer& buffer = slot_buffers[binding.buffer_id];
-        TouchBuffer(buffer);
+        TouchBuffer(buffer, binding.buffer_id);
        SynchronizeBuffer(buffer, binding.cpu_addr, binding.size);
        if (!flags[Dirty::VertexBuffer0 + index]) {
            continue;
@@ -1011,7 +1012,7 @@ void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32
    const VAddr cpu_addr = binding.cpu_addr;
    const u32 size = std::min(binding.size, (*uniform_buffer_sizes)[stage][index]);
    Buffer& buffer = slot_buffers[binding.buffer_id];
-    TouchBuffer(buffer);
+    TouchBuffer(buffer, binding.buffer_id);
    const bool use_fast_buffer = binding.buffer_id != NULL_BUFFER_ID &&
                                 size <= uniform_buffer_skip_cache_size &&
                                 !buffer.IsRegionGpuModified(cpu_addr, size);
@@ -1083,7 +1084,7 @@ void BufferCache<P>::BindHostGraphicsStorageBuffers(size_t stage) {
    ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) {
        const Binding& binding = storage_buffers[stage][index];
        Buffer& buffer = slot_buffers[binding.buffer_id];
-        TouchBuffer(buffer);
+        TouchBuffer(buffer, binding.buffer_id);
        const u32 size = binding.size;
        SynchronizeBuffer(buffer, binding.cpu_addr, size);

@@ -1128,7 +1129,7 @@ void BufferCache<P>::BindHostTransformFeedbackBuffers() {
    for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) {
        const Binding& binding = transform_feedback_buffers[index];
        Buffer& buffer = slot_buffers[binding.buffer_id];
-        TouchBuffer(buffer);
+        TouchBuffer(buffer, binding.buffer_id);
        const u32 size = binding.size;
        SynchronizeBuffer(buffer, binding.cpu_addr, size);

@@ -1148,7 +1149,7 @@ void BufferCache<P>::BindHostComputeUniformBuffers() {
    ForEachEnabledBit(enabled_compute_uniform_buffer_mask, [&](u32 index) {
        const Binding& binding = compute_uniform_buffers[index];
        Buffer& buffer = slot_buffers[binding.buffer_id];
-        TouchBuffer(buffer);
+        TouchBuffer(buffer, binding.buffer_id);
        const u32 size = std::min(binding.size, (*compute_uniform_buffer_sizes)[index]);
        SynchronizeBuffer(buffer, binding.cpu_addr, size);

@@ -1168,7 +1169,7 @@ void BufferCache<P>::BindHostComputeStorageBuffers() {
    ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) {
        const Binding& binding = compute_storage_buffers[index];
        Buffer& buffer = slot_buffers[binding.buffer_id];
-        TouchBuffer(buffer);
+        TouchBuffer(buffer, binding.buffer_id);
        const u32 size = binding.size;
        SynchronizeBuffer(buffer, binding.cpu_addr, size);

@@ -1513,11 +1514,11 @@ BufferId BufferCache<P>::CreateBuffer(VAddr cpu_addr, u32 wanted_size) {
    const OverlapResult overlap = ResolveOverlaps(cpu_addr, wanted_size);
    const u32 size = static_cast<u32>(overlap.end - overlap.begin);
    const BufferId new_buffer_id = slot_buffers.insert(runtime, rasterizer, overlap.begin, size);
-    TouchBuffer(slot_buffers[new_buffer_id]);
    for (const BufferId overlap_id : overlap.ids) {
        JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap);
    }
    Register(new_buffer_id);
+    TouchBuffer(slot_buffers[new_buffer_id], new_buffer_id);
    return new_buffer_id;
 }

@@ -1534,12 +1535,14 @@ void BufferCache<P>::Unregister(BufferId buffer_id) {
 template <class P>
 template <bool insert>
 void BufferCache<P>::ChangeRegister(BufferId buffer_id) {
-    const Buffer& buffer = slot_buffers[buffer_id];
+    Buffer& buffer = slot_buffers[buffer_id];
    const auto size = buffer.SizeBytes();
    if (insert) {
        total_used_memory += Common::AlignUp(size, 1024);
+        buffer.setLRUID(lru_cache.Insert(buffer_id, frame_tick));
    } else {
        total_used_memory -= Common::AlignUp(size, 1024);
+        lru_cache.Free(buffer.getLRUID());
    }
    const VAddr cpu_addr_begin = buffer.CpuAddr();
    const VAddr cpu_addr_end = cpu_addr_begin + size;
@@ -1555,8 +1558,10 @@ void BufferCache<P>::ChangeRegister(BufferId buffer_id) {
 }

 template <class P>
-void BufferCache<P>::TouchBuffer(Buffer& buffer) const noexcept {
-    buffer.SetFrameTick(frame_tick);
+void BufferCache<P>::TouchBuffer(Buffer& buffer, BufferId buffer_id) noexcept {
+    // The null buffer is skipped: no LRU touch is issued for it
+    if (buffer_id == NULL_BUFFER_ID) {
+        return;
+    }
+    // Touch the buffer's LRU entry with the current frame tick
+    lru_cache.Touch(buffer.getLRUID(), frame_tick);
 }

 template <class P>
--- a/src/video_core/command_classes/codecs/codec.cpp
+++ b/src/video_core/command_classes/codecs/codec.cpp
@@ -2,7 +2,6 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

-#include <cstring>
 #include <fstream>
 #include <vector>
 #include "common/assert.h"
@@ -17,10 +16,47 @@ extern "C" {
 }

 namespace Tegra {
+#if defined(LIBVA_FOUND)
+// Hardware acceleration code from FFmpeg/doc/examples/hw_decode.c originally under MIT license
+namespace {
+// Values passed as the "kernel_driver" option when creating the VA-API device.
+// CreateVaapiHwdevice tries them in the order listed here.
+constexpr std::array<const char*, 2> VAAPI_DRIVERS = {
+    "i915",
+    "amdgpu",
+};
+
+// AVCodecContext::get_format callback. Selects VA-API when the decoder offers it; otherwise
+// defers to FFmpeg's default selection instead of blindly returning the first offered format.
+AVPixelFormat GetHwFormat(AVCodecContext* av_codec_ctx, const AVPixelFormat* pix_fmts) {
+    for (const AVPixelFormat* p = pix_fmts; *p != AV_PIX_FMT_NONE; ++p) {
+        if (*p == AV_PIX_FMT_VAAPI) {
+            return AV_PIX_FMT_VAAPI;
+        }
+    }
+    LOG_INFO(Service_NVDRV, "Could not find compatible GPU AV format, falling back to CPU");
+    if (*pix_fmts == AV_PIX_FMT_NONE) {
+        // Empty list: there is nothing to choose from
+        return AV_PIX_FMT_NONE;
+    }
+    // The first entry may be another hardware format that the VA-API device cannot serve.
+    // avcodec_default_get_format picks a format that works with the current configuration
+    // (the software format when one is offered).
+    return avcodec_default_get_format(av_codec_ctx, pix_fmts);
+}
+
+// Tries to create a VA-API hardware device over DRM, once per entry of VAAPI_DRIVERS.
+// On success the device reference is stored in *av_hw_device and true is returned.
+bool CreateVaapiHwdevice(AVBufferRef** av_hw_device) {
+    AVDictionary* options = nullptr;
+    av_dict_set(&options, "connection_type", "drm", 0);
+    bool created = false;
+    for (const char* const driver_name : VAAPI_DRIVERS) {
+        // Overwrites the kernel_driver entry left by the previous attempt
+        av_dict_set(&options, "kernel_driver", driver_name, 0);
+        const int create_result =
+            av_hwdevice_ctx_create(av_hw_device, AV_HWDEVICE_TYPE_VAAPI, nullptr, options, 0);
+        if (create_result < 0) {
+            LOG_DEBUG(Service_NVDRV, "VA-API av_hwdevice_ctx_create failed {}", create_result);
+            continue;
+        }
+        LOG_INFO(Service_NVDRV, "Using VA-API with {}", driver_name);
+        created = true;
+        break;
+    }
+    if (!created) {
+        LOG_DEBUG(Service_NVDRV, "VA-API av_hwdevice_ctx_create failed for all drivers");
+    }
+    // The options dictionary is released on every path
+    av_dict_free(&options);
+    return created;
+}
+} // namespace
+#endif

 void AVFrameDeleter(AVFrame* ptr) {
-    av_frame_unref(ptr);
-    av_free(ptr);
+    // Deleter used by AVFramePtr. av_frame_free releases the frame together with any
+    // dynamically allocated objects it holds, replacing the manual unref + free pair.
+    av_frame_free(&ptr);
 }

 Codec::Codec(GPU& gpu_, const NvdecCommon::NvdecRegisters& regs)
@@ -32,19 +68,31 @@ Codec::~Codec() {
        return;
    }
    // Free libav memory
-    AVFrame* av_frame{nullptr};
    avcodec_send_packet(av_codec_ctx, nullptr);
-    av_frame = av_frame_alloc();
+    AVFrame* av_frame = av_frame_alloc();
    avcodec_receive_frame(av_codec_ctx, av_frame);
    avcodec_flush_buffers(av_codec_ctx);
-
-    av_frame_unref(av_frame);
-    av_free(av_frame);
+    av_frame_free(&av_frame);
    avcodec_close(av_codec_ctx);
+    av_buffer_unref(&av_hw_device);
+}
+
+// Attempts to attach a hardware decoding device to av_codec_ctx. When no device can be set up,
+// hw_device_ctx is left unset and the context keeps decoding in software.
+void Codec::InitializeHwdec() {
+    // Prioritize integrated GPU to mitigate bandwidth bottlenecks
+#if defined(LIBVA_FOUND)
+    if (CreateVaapiHwdevice(&av_hw_device)) {
+        AVBufferRef* const hw_device_ctx = av_buffer_ref(av_hw_device);
+        if (hw_device_ctx) {
+            av_codec_ctx->hw_device_ctx = hw_device_ctx;
+            av_codec_ctx->get_format = GetHwFormat;
+            return;
+        }
+        // Without a device reference the VA-API callback must not be installed; drop the
+        // device and fall through to software decoding
+        LOG_ERROR(Service_NVDRV, "av_buffer_ref failed");
+        av_buffer_unref(&av_hw_device);
+    }
+#endif
+    // TODO more GPU accelerated decoders
 }

 void Codec::Initialize() {
-    AVCodecID codec{AV_CODEC_ID_NONE};
+    AVCodecID codec;
    switch (current_codec) {
    case NvdecCommon::VideoCodec::H264:
        codec = AV_CODEC_ID_H264;
@@ -53,22 +101,24 @@ void Codec::Initialize() {
        codec = AV_CODEC_ID_VP9;
        break;
    default:
+        UNIMPLEMENTED_MSG("Unknown codec {}", current_codec);
        return;
    }
    av_codec = avcodec_find_decoder(codec);
    av_codec_ctx = avcodec_alloc_context3(av_codec);
    av_opt_set(av_codec_ctx->priv_data, "tune", "zerolatency", 0);
-
-    // TODO(ameerj): libavcodec gpu hw acceleration
-
+    InitializeHwdec();
+    if (!av_codec_ctx->hw_device_ctx) {
+        LOG_INFO(Service_NVDRV, "Using FFmpeg software decoding");
+    }
    const auto av_error = avcodec_open2(av_codec_ctx, av_codec, nullptr);
    if (av_error < 0) {
        LOG_ERROR(Service_NVDRV, "avcodec_open2() Failed.");
        avcodec_close(av_codec_ctx);
+        av_buffer_unref(&av_hw_device);
        return;
    }
    initialized = true;
-    return;
 }

 void Codec::SetTargetCodec(NvdecCommon::VideoCodec codec) {
@@ -80,36 +130,64 @@ void Codec::SetTargetCodec(NvdecCommon::VideoCodec codec) {

 void Codec::Decode() {
    const bool is_first_frame = !initialized;
-    if (!initialized) {
+    if (is_first_frame) {
        Initialize();
    }
-
    bool vp9_hidden_frame = false;
-    AVPacket packet{};
-    av_init_packet(&packet);
    std::vector<u8> frame_data;
-
    if (current_codec == NvdecCommon::VideoCodec::H264) {
        frame_data = h264_decoder->ComposeFrameHeader(state, is_first_frame);
    } else if (current_codec == NvdecCommon::VideoCodec::Vp9) {
        frame_data = vp9_decoder->ComposeFrameHeader(state);
        vp9_hidden_frame = vp9_decoder->WasFrameHidden();
    }
-
+    AVPacket packet{};
+    av_init_packet(&packet);
    packet.data = frame_data.data();
    packet.size = static_cast<s32>(frame_data.size());
-
-    avcodec_send_packet(av_codec_ctx, &packet);
-
-    if (!vp9_hidden_frame) {
-        // Only receive/store visible frames
-        AVFramePtr frame = AVFramePtr{av_frame_alloc(), AVFrameDeleter};
-        avcodec_receive_frame(av_codec_ctx, frame.get());
-        av_frames.push(std::move(frame));
-        // Limit queue to 10 frames. Workaround for ZLA decode and queue spam
-        if (av_frames.size() > 10) {
-            av_frames.pop();
-        }
+    if (const int ret = avcodec_send_packet(av_codec_ctx, &packet); ret) {
+        LOG_DEBUG(Service_NVDRV, "avcodec_send_packet error {}", ret);
+        return;
+    }
+    // Only receive/store visible frames
+    if (vp9_hidden_frame) {
+        return;
+    }
+    AVFrame* hw_frame = av_frame_alloc();
+    AVFrame* sw_frame = hw_frame;
+    ASSERT_MSG(hw_frame, "av_frame_alloc hw_frame failed");
+    if (const int ret = avcodec_receive_frame(av_codec_ctx, hw_frame); ret) {
+        LOG_DEBUG(Service_NVDRV, "avcodec_receive_frame error {}", ret);
+        av_frame_free(&hw_frame);
+        return;
+    }
+    if (!hw_frame->width || !hw_frame->height) {
+        LOG_WARNING(Service_NVDRV, "Zero width or height in frame");
+        av_frame_free(&hw_frame);
+        return;
+    }
+#if defined(LIBVA_FOUND)
+    // Hardware acceleration code from FFmpeg/doc/examples/hw_decode.c under MIT license
+    if (hw_frame->format == AV_PIX_FMT_VAAPI) {
+        sw_frame = av_frame_alloc();
+        ASSERT_MSG(sw_frame, "av_frame_alloc sw_frame failed");
+        // Can't use AV_PIX_FMT_YUV420P and share code with software decoding in vic.cpp
+        // because Intel drivers crash unless using AV_PIX_FMT_NV12
+        sw_frame->format = AV_PIX_FMT_NV12;
+        const int transfer_data_ret = av_hwframe_transfer_data(sw_frame, hw_frame, 0);
+        ASSERT_MSG(!transfer_data_ret, "av_hwframe_transfer_data error {}", transfer_data_ret);
+        av_frame_free(&hw_frame);
+    }
+#endif
+    if (sw_frame->format != AV_PIX_FMT_YUV420P && sw_frame->format != AV_PIX_FMT_NV12) {
+        UNIMPLEMENTED_MSG("Unexpected video format from host graphics: {}", sw_frame->format);
+        av_frame_free(&sw_frame);
+        return;
+    }
+    av_frames.push(AVFramePtr{sw_frame, AVFrameDeleter});
+    if (av_frames.size() > 10) {
+        LOG_TRACE(Service_NVDRV, "av_frames.push overflow dropped frame");
+        av_frames.pop();
    }
 }

@@ -119,7 +197,6 @@ AVFramePtr Codec::GetCurrentFrame() {
    if (av_frames.empty()) {
        return AVFramePtr{nullptr, AVFrameDeleter};
    }
-
    AVFramePtr frame = std::move(av_frames.front());
    av_frames.pop();
    return frame;
@@ -144,6 +221,5 @@ std::string_view Codec::GetCurrentCodecName() const {
    default:
        return "Unknown";
    }
-};
-
+}
 } // namespace Tegra
--- a/src/video_core/command_classes/codecs/codec.h
+++ b/src/video_core/command_classes/codecs/codec.h
@@ -22,7 +22,6 @@ extern "C" {

 namespace Tegra {
 class GPU;
-struct VicRegisters;

 void AVFrameDeleter(AVFrame* ptr);
 using AVFramePtr = std::unique_ptr<AVFrame, decltype(&AVFrameDeleter)>;
@@ -55,10 +54,13 @@ public:
    [[nodiscard]] std::string_view GetCurrentCodecName() const;

 private:
+    void InitializeHwdec();
+
    bool initialized{};
    NvdecCommon::VideoCodec current_codec{NvdecCommon::VideoCodec::None};

    AVCodec* av_codec{nullptr};
+    AVBufferRef* av_hw_device{nullptr};
    AVCodecContext* av_codec_ctx{nullptr};

    GPU& gpu;
--- a/src/video_core/command_classes/codecs/vp9.cpp
+++ b/src/video_core/command_classes/codecs/vp9.cpp
@@ -11,6 +11,9 @@

 namespace Tegra::Decoder {
 namespace {
+constexpr u32 diff_update_probability = 252;
+constexpr u32 frame_sync_code = 0x498342;
+
 // Default compressed header probabilities once frame context resets
 constexpr Vp9EntropyProbs default_probs{
    .y_mode_prob{
@@ -361,8 +364,7 @@ Vp9PictureInfo VP9::GetVp9PictureInfo(const NvdecCommon::NvdecRegisters& state)
    InsertEntropy(state.vp9_entropy_probs_offset, vp9_info.entropy);

    // surface_luma_offset[0:3] contains the address of the reference frame offsets in the following
-    // order: last, golden, altref, current. It may be worthwhile to track the updates done here
-    // to avoid buffering frame data needed for reference frame updating in the header composition.
+    // order: last, golden, altref, current.
    std::copy(state.surface_luma_offset.begin(), state.surface_luma_offset.begin() + 4,
              vp9_info.frame_offsets.begin());

@@ -384,40 +386,25 @@ Vp9FrameContainer VP9::GetCurrentFrame(const NvdecCommon::NvdecRegisters& state)
        gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, current_frame.bit_stream.data(),
                                      current_frame.info.bitstream_size);
    }
-    // Buffer two frames, saving the last show frame info
-    if (!next_next_frame.bit_stream.empty()) {
+    if (!next_frame.bit_stream.empty()) {
        Vp9FrameContainer temp{
            .info = current_frame.info,
            .bit_stream = std::move(current_frame.bit_stream),
        };
-        next_next_frame.info.show_frame = current_frame.info.last_frame_shown;
-        current_frame.info = next_next_frame.info;
-        current_frame.bit_stream = std::move(next_next_frame.bit_stream);
-        next_next_frame = std::move(temp);
-
-        if (!next_frame.bit_stream.empty()) {
-            Vp9FrameContainer temp2{
-                .info = current_frame.info,
-                .bit_stream = std::move(current_frame.bit_stream),
-            };
-            next_frame.info.show_frame = current_frame.info.last_frame_shown;
-            current_frame.info = next_frame.info;
-            current_frame.bit_stream = std::move(next_frame.bit_stream);
-            next_frame = std::move(temp2);
-        } else {
-            next_frame.info = current_frame.info;
-            next_frame.bit_stream = std::move(current_frame.bit_stream);
-        }
+        next_frame.info.show_frame = current_frame.info.last_frame_shown;
+        current_frame.info = next_frame.info;
+        current_frame.bit_stream = std::move(next_frame.bit_stream);
+        next_frame = std::move(temp);
    } else {
-        next_next_frame.info = current_frame.info;
-        next_next_frame.bit_stream = std::move(current_frame.bit_stream);
+        next_frame.info = current_frame.info;
+        next_frame.bit_stream = current_frame.bit_stream;
    }
    return current_frame;
 }

 std::vector<u8> VP9::ComposeCompressedHeader() {
    VpxRangeEncoder writer{};
-    const bool update_probs = current_frame_info.show_frame && !current_frame_info.is_key_frame;
+    const bool update_probs = !current_frame_info.is_key_frame && current_frame_info.show_frame;
    if (!current_frame_info.lossless) {
        if (static_cast<u32>(current_frame_info.transform_mode) >= 3) {
            writer.Write(3, 2);
@@ -613,86 +600,64 @@ VpxBitStreamWriter VP9::ComposeUncompressedHeader() {

        // Reset context
        prev_frame_probs = default_probs;
-        swap_next_golden = false;
+        swap_ref_indices = false;
        loop_filter_ref_deltas.fill(0);
        loop_filter_mode_deltas.fill(0);
-
-        // allow frames offsets to stabilize before checking for golden frames
-        grace_period = 4;
-
-        // On key frames, all frame slots are set to the current frame,
-        // so the value of the selected slot doesn't really matter.
-        frame_ctxs.fill({current_frame_number, false, default_probs});
+        frame_ctxs.fill(default_probs);

        // intra only, meaning the frame can be recreated with no other references
        current_frame_info.intra_only = true;
-
    } else {
-
        if (!current_frame_info.show_frame) {
            uncomp_writer.WriteBit(current_frame_info.intra_only);
-            if (!current_frame_info.last_frame_was_key) {
-                swap_next_golden = !swap_next_golden;
-            }
        } else {
            current_frame_info.intra_only = false;
        }
        if (!current_frame_info.error_resilient_mode) {
            uncomp_writer.WriteU(0, 2); // Reset frame context.
        }
-
-        // Last, Golden, Altref frames
-        std::array<s32, 3> ref_frame_index{0, 1, 2};
-
-        // Set when next frame is hidden
-        // altref and golden references are swapped
-        if (swap_next_golden) {
-            ref_frame_index = std::array<s32, 3>{0, 2, 1};
+        const auto& curr_offsets = current_frame_info.frame_offsets;
+        const auto& next_offsets = next_frame.info.frame_offsets;
+        const bool ref_frames_different = curr_offsets[1] != curr_offsets[2];
+        const bool next_references_swap =
+            (next_offsets[1] == curr_offsets[2]) || (next_offsets[2] == curr_offsets[1]);
+        const bool needs_ref_swap = ref_frames_different && next_references_swap;
+        if (needs_ref_swap) {
+            swap_ref_indices = !swap_ref_indices;
        }
+        union {
+            u32 raw;
+            BitField<0, 1, u32> refresh_last;
+            BitField<1, 2, u32> refresh_golden;
+            BitField<2, 1, u32> refresh_alt;
+        } refresh_frame_flags;

-        // update Last Frame
-        u64 refresh_frame_flags = 1;
-
-        // golden frame may refresh, determined if the next golden frame offset is changed
-        bool golden_refresh = false;
-        if (grace_period <= 0) {
-            for (s32 index = 1; index < 3; ++index) {
-                if (current_frame_info.frame_offsets[index] !=
-                    next_frame.info.frame_offsets[index]) {
-                    current_frame_info.refresh_frame[index] = true;
-                    golden_refresh = true;
-                    grace_period = 3;
-                }
+        refresh_frame_flags.raw = 0;
+        for (u32 index = 0; index < 3; ++index) {
+            // Refresh indices that use the current frame as an index
+            if (curr_offsets[3] == next_offsets[index]) {
+                refresh_frame_flags.raw |= 1u << index;
            }
        }
-
-        if (current_frame_info.show_frame &&
-            (!next_frame.info.show_frame || next_frame.info.is_key_frame)) {
-            // Update golden frame
-            refresh_frame_flags = swap_next_golden ? 2 : 4;
+        if (swap_ref_indices) {
+            const u32 temp = refresh_frame_flags.refresh_golden;
+            refresh_frame_flags.refresh_golden.Assign(refresh_frame_flags.refresh_alt.Value());
+            refresh_frame_flags.refresh_alt.Assign(temp);
        }
-
-        if (!current_frame_info.show_frame) {
-            // Update altref
-            refresh_frame_flags = swap_next_golden ? 2 : 4;
-        } else if (golden_refresh) {
-            refresh_frame_flags = 3;
-        }
-
        if (current_frame_info.intra_only) {
            uncomp_writer.WriteU(frame_sync_code, 24);
-            uncomp_writer.WriteU(static_cast<s32>(refresh_frame_flags), 8);
+            uncomp_writer.WriteU(refresh_frame_flags.raw, 8);
            uncomp_writer.WriteU(current_frame_info.frame_size.width - 1, 16);
            uncomp_writer.WriteU(current_frame_info.frame_size.height - 1, 16);
            uncomp_writer.WriteBit(false); // Render and frame size different.
        } else {
-            uncomp_writer.WriteU(static_cast<s32>(refresh_frame_flags), 8);
-
-            for (s32 index = 1; index < 4; index++) {
+            const bool swap_indices = needs_ref_swap ^ swap_ref_indices;
+            const auto ref_frame_index = swap_indices ? std::array{0, 2, 1} : std::array{0, 1, 2};
+            uncomp_writer.WriteU(refresh_frame_flags.raw, 8);
+            for (size_t index = 1; index < 4; index++) {
                uncomp_writer.WriteU(ref_frame_index[index - 1], 3);
                uncomp_writer.WriteU(current_frame_info.ref_frame_sign_bias[index], 1);
            }
-
            uncomp_writer.WriteBit(true);  // Frame size with refs.
            uncomp_writer.WriteBit(false); // Render and frame size different.
            uncomp_writer.WriteBit(current_frame_info.allow_high_precision_mv);
@@ -714,10 +679,9 @@ VpxBitStreamWriter VP9::ComposeUncompressedHeader() {
        frame_ctx_idx = 1;
    }

-    uncomp_writer.WriteU(frame_ctx_idx, 2); // Frame context index.
-    prev_frame_probs =
-        frame_ctxs[frame_ctx_idx].probs; // reference probabilities for compressed header
-    frame_ctxs[frame_ctx_idx] = {current_frame_number, false, current_frame_info.entropy};
+    uncomp_writer.WriteU(frame_ctx_idx, 2);       // Frame context index.
+    prev_frame_probs = frame_ctxs[frame_ctx_idx]; // reference probabilities for compressed header
+    frame_ctxs[frame_ctx_idx] = current_frame_info.entropy;

    uncomp_writer.WriteU(current_frame_info.first_level, 6);
    uncomp_writer.WriteU(current_frame_info.sharpness_level, 3);
@@ -778,6 +742,7 @@ VpxBitStreamWriter VP9::ComposeUncompressedHeader() {
    uncomp_writer.WriteDeltaQ(current_frame_info.uv_dc_delta_q);
    uncomp_writer.WriteDeltaQ(current_frame_info.uv_ac_delta_q);

+    ASSERT(!current_frame_info.segment_enabled);
    uncomp_writer.WriteBit(false); // Segmentation enabled (TODO).

    const s32 min_tile_cols_log2 = CalcMinLog2TileCols(current_frame_info.frame_size.width);
@@ -812,7 +777,6 @@ const std::vector<u8>& VP9::ComposeFrameHeader(const NvdecCommon::NvdecRegisters
        current_frame_info = curr_frame.info;
        bitstream = std::move(curr_frame.bit_stream);
    }
-
    // The uncompressed header routine sets PrevProb parameters needed for the compressed header
    auto uncomp_writer = ComposeUncompressedHeader();
    std::vector<u8> compressed_header = ComposeCompressedHeader();
@@ -828,13 +792,6 @@ const std::vector<u8>& VP9::ComposeFrameHeader(const NvdecCommon::NvdecRegisters
              frame.begin() + uncompressed_header.size());
    std::copy(bitstream.begin(), bitstream.end(),
              frame.begin() + uncompressed_header.size() + compressed_header.size());
-
-    // keep track of frame number
-    current_frame_number++;
-    grace_period--;
-
-    // don't display hidden frames
-    hidden = !current_frame_info.show_frame;
    return frame;
 }

--- a/src/video_core/command_classes/codecs/vp9.h
+++ b/src/video_core/command_classes/codecs/vp9.h
@@ -14,7 +14,6 @@

 namespace Tegra {
 class GPU;
-enum class FrameType { KeyFrame = 0, InterFrame = 1 };
 namespace Decoder {

 /// The VpxRangeEncoder, and VpxBitStreamWriter classes are used to compose the
@@ -124,7 +123,7 @@ public:

    /// Returns true if the most recent frame was a hidden frame.
    [[nodiscard]] bool WasFrameHidden() const {
-        return hidden;
+        // Derived from the current frame's picture info instead of a separately stored flag
+        return !current_frame_info.show_frame;
    }

 private:
@@ -178,19 +177,12 @@ private:
    std::array<s8, 4> loop_filter_ref_deltas{};
    std::array<s8, 2> loop_filter_mode_deltas{};

-    bool hidden = false;
-    s64 current_frame_number = -2; // since we buffer 2 frames
-    s32 grace_period = 6;          // frame offsets need to stabilize
-    std::array<FrameContexts, 4> frame_ctxs{};
    Vp9FrameContainer next_frame{};
-    Vp9FrameContainer next_next_frame{};
-    bool swap_next_golden{};
+    std::array<Vp9EntropyProbs, 4> frame_ctxs{};
+    bool swap_ref_indices{};

    Vp9PictureInfo current_frame_info{};
    Vp9EntropyProbs prev_frame_probs{};
-
-    s32 diff_update_probability = 252;
-    s32 frame_sync_code = 0x498342;
 };

 } // namespace Decoder
--- a/src/video_core/command_classes/codecs/vp9_types.h
+++ b/src/video_core/command_classes/codecs/vp9_types.h
@@ -22,7 +22,7 @@ struct Vp9FrameDimensions {
 };
 static_assert(sizeof(Vp9FrameDimensions) == 0x8, "Vp9 Vp9FrameDimensions is an invalid size");

-enum FrameFlags : u32 {
+enum class FrameFlags : u32 {
    IsKeyFrame = 1 << 0,
    LastFrameIsKeyFrame = 1 << 1,
    FrameSizeChanged = 1 << 2,
@@ -30,6 +30,7 @@ enum FrameFlags : u32 {
    LastShowFrame = 1 << 4,
    IntraOnly = 1 << 5,
 };
+DECLARE_ENUM_FLAG_OPERATORS(FrameFlags)

 enum class TxSize {
    Tx4x4 = 0,   // 4x4 transform
@@ -92,44 +93,34 @@ struct Vp9EntropyProbs {
 static_assert(sizeof(Vp9EntropyProbs) == 0x7B4, "Vp9EntropyProbs is an invalid size");

 struct Vp9PictureInfo {
-    bool is_key_frame;
-    bool intra_only;
-    bool last_frame_was_key;
-    bool frame_size_changed;
-    bool error_resilient_mode;
-    bool last_frame_shown;
-    bool show_frame;
+    // Members are declared in the same order as the designated initializers used by
+    // PictureInfo::Convert(); keep both in sync when adding or moving fields.
+    u32 bitstream_size;
+    std::array<u64, 4> frame_offsets;
    std::array<s8, 4> ref_frame_sign_bias;
    s32 base_q_index;
    s32 y_dc_delta_q;
    s32 uv_dc_delta_q;
    s32 uv_ac_delta_q;
-    bool lossless;
    s32 transform_mode;
-    bool allow_high_precision_mv;
    s32 interp_filter;
    s32 reference_mode;
-    s8 comp_fixed_ref;
-    std::array<s8, 2> comp_var_ref;
    s32 log2_tile_cols;
    s32 log2_tile_rows;
-    bool segment_enabled;
-    bool segment_map_update;
-    bool segment_map_temporal_update;
-    s32 segment_abs_delta;
-    std::array<u32, 8> segment_feature_enable;
-    std::array<std::array<s16, 4>, 8> segment_feature_data;
-    bool mode_ref_delta_enabled;
-    bool use_prev_in_find_mv_refs;
    std::array<s8, 4> ref_deltas;
    std::array<s8, 2> mode_deltas;
    Vp9EntropyProbs entropy;
    Vp9FrameDimensions frame_size;
    u8 first_level;
    u8 sharpness_level;
-    u32 bitstream_size;
-    std::array<u64, 4> frame_offsets;
-    std::array<bool, 4> refresh_frame;
+    bool is_key_frame;
+    bool intra_only;
+    bool last_frame_was_key;
+    bool error_resilient_mode;
+    bool last_frame_shown;
+    bool show_frame;
+    bool lossless;
+    bool allow_high_precision_mv;
+    bool segment_enabled;
+    bool mode_ref_delta_enabled;
 };

 struct Vp9FrameContainer {
@@ -145,7 +136,7 @@ struct PictureInfo {
    Vp9FrameDimensions golden_frame_size;  ///< 0x50
    Vp9FrameDimensions alt_frame_size;     ///< 0x58
    Vp9FrameDimensions current_frame_size; ///< 0x60
-    u32 vp9_flags;                         ///< 0x68
+    FrameFlags vp9_flags;                  ///< 0x68
    std::array<s8, 4> ref_frame_sign_bias; ///< 0x6C
    u8 first_level;                        ///< 0x70
    u8 sharpness_level;                    ///< 0x71
@@ -158,60 +149,43 @@ struct PictureInfo {
    u8 allow_high_precision_mv;            ///< 0x78
    u8 interp_filter;                      ///< 0x79
    u8 reference_mode;                     ///< 0x7A
-    s8 comp_fixed_ref;                     ///< 0x7B
-    std::array<s8, 2> comp_var_ref;        ///< 0x7C
+    INSERT_PADDING_BYTES_NOINIT(3);        ///< 0x7B
    u8 log2_tile_cols;                     ///< 0x7E
    u8 log2_tile_rows;                     ///< 0x7F
    Segmentation segmentation;             ///< 0x80
    LoopFilter loop_filter;                ///< 0xE4
-    INSERT_PADDING_BYTES_NOINIT(5);        ///< 0xEB
-    u32 surface_params;                    ///< 0xF0
-    INSERT_PADDING_WORDS_NOINIT(3);        ///< 0xF4
+    INSERT_PADDING_BYTES_NOINIT(21);       ///< 0xEB

    [[nodiscard]] Vp9PictureInfo Convert() const {
        return {
-            .is_key_frame = (vp9_flags & FrameFlags::IsKeyFrame) != 0,
-            .intra_only = (vp9_flags & FrameFlags::IntraOnly) != 0,
-            .last_frame_was_key = (vp9_flags & FrameFlags::LastFrameIsKeyFrame) != 0,
-            .frame_size_changed = (vp9_flags & FrameFlags::FrameSizeChanged) != 0,
-            .error_resilient_mode = (vp9_flags & FrameFlags::ErrorResilientMode) != 0,
-            .last_frame_shown = (vp9_flags & FrameFlags::LastShowFrame) != 0,
-            .show_frame = false,
+            .bitstream_size = bitstream_size,
+            .frame_offsets{},
            .ref_frame_sign_bias = ref_frame_sign_bias,
            .base_q_index = base_q_index,
            .y_dc_delta_q = y_dc_delta_q,
            .uv_dc_delta_q = uv_dc_delta_q,
            .uv_ac_delta_q = uv_ac_delta_q,
-            .lossless = lossless != 0,
            .transform_mode = tx_mode,
-            .allow_high_precision_mv = allow_high_precision_mv != 0,
            .interp_filter = interp_filter,
            .reference_mode = reference_mode,
-            .comp_fixed_ref = comp_fixed_ref,
-            .comp_var_ref = comp_var_ref,
            .log2_tile_cols = log2_tile_cols,
            .log2_tile_rows = log2_tile_rows,
-            .segment_enabled = segmentation.enabled != 0,
-            .segment_map_update = segmentation.update_map != 0,
-            .segment_map_temporal_update = segmentation.temporal_update != 0,
-            .segment_abs_delta = segmentation.abs_delta,
-            .segment_feature_enable = segmentation.feature_mask,
-            .segment_feature_data = segmentation.feature_data,
-            .mode_ref_delta_enabled = loop_filter.mode_ref_delta_enabled != 0,
-            .use_prev_in_find_mv_refs = !(vp9_flags == (FrameFlags::ErrorResilientMode)) &&
-                                        !(vp9_flags == (FrameFlags::FrameSizeChanged)) &&
-                                        !(vp9_flags == (FrameFlags::IntraOnly)) &&
-                                        (vp9_flags == (FrameFlags::LastShowFrame)) &&
-                                        !(vp9_flags == (FrameFlags::LastFrameIsKeyFrame)),
            .ref_deltas = loop_filter.ref_deltas,
            .mode_deltas = loop_filter.mode_deltas,
            .entropy{},
            .frame_size = current_frame_size,
            .first_level = first_level,
            .sharpness_level = sharpness_level,
-            .bitstream_size = bitstream_size,
-            .frame_offsets{},
-            .refresh_frame{},
+            .is_key_frame = True(vp9_flags & FrameFlags::IsKeyFrame),
+            .intra_only = True(vp9_flags & FrameFlags::IntraOnly),
+            .last_frame_was_key = True(vp9_flags & FrameFlags::LastFrameIsKeyFrame),
+            .error_resilient_mode = True(vp9_flags & FrameFlags::ErrorResilientMode),
+            .last_frame_shown = True(vp9_flags & FrameFlags::LastShowFrame),
+            .show_frame = true,
+            .lossless = lossless != 0,
+            .allow_high_precision_mv = allow_high_precision_mv != 0,
+            .segment_enabled = segmentation.enabled != 0,
+            .mode_ref_delta_enabled = loop_filter.mode_ref_delta_enabled != 0,
        };
    }
 };
@@ -296,12 +270,6 @@ struct RefPoolElement {
    bool refresh{};
 };

-struct FrameContexts {
-    s64 from;
-    bool adapted;
-    Vp9EntropyProbs probs;
-};
-
 #define ASSERT_POSITION(field_name, position)                                                      \
    static_assert(offsetof(Vp9EntropyProbs, field_name) == position,                               \
                  "Field " #field_name " has invalid position")
@@ -322,7 +290,6 @@ ASSERT_POSITION(last_frame_size, 0x48);
 ASSERT_POSITION(first_level, 0x70);
 ASSERT_POSITION(segmentation, 0x80);
 ASSERT_POSITION(loop_filter, 0xE4);
-ASSERT_POSITION(surface_params, 0xF0);
 #undef ASSERT_POSITION

 #define ASSERT_POSITION(field_name, position)                                                      \
--- a/src/video_core/command_classes/nvdec.cpp
+++ b/src/video_core/command_classes/nvdec.cpp
@@ -39,7 +39,7 @@ void Nvdec::Execute() {
        codec->Decode();
        break;
    default:
-        UNIMPLEMENTED_MSG("Unknown codec {}", static_cast<u32>(codec->GetCurrentCodec()));
+        UNIMPLEMENTED_MSG("Codec {}", codec->GetCurrentCodecName());
        break;
    }
 }
--- a/src/video_core/command_classes/vic.cpp
+++ b/src/video_core/command_classes/vic.cpp
@@ -46,11 +46,8 @@ void Vic::ProcessMethod(Method method, u32 argument) {
    case Method::SetOutputSurfaceLumaOffset:
        output_surface_luma_address = arg;
        break;
-    case Method::SetOutputSurfaceChromaUOffset:
-        output_surface_chroma_u_address = arg;
-        break;
-    case Method::SetOutputSurfaceChromaVOffset:
-        output_surface_chroma_v_address = arg;
+    case Method::SetOutputSurfaceChromaOffset:
+        output_surface_chroma_address = arg;
        break;
    default:
        break;
@@ -65,11 +62,10 @@ void Vic::Execute() {
    const VicConfig config{gpu.MemoryManager().Read<u64>(config_struct_address + 0x20)};
    const AVFramePtr frame_ptr = nvdec_processor->GetFrame();
    const auto* frame = frame_ptr.get();
-    if (!frame || frame->width == 0 || frame->height == 0) {
+    if (!frame) {
        return;
    }
-    const VideoPixelFormat pixel_format =
-        static_cast<VideoPixelFormat>(config.pixel_format.Value());
+    const auto pixel_format = static_cast<VideoPixelFormat>(config.pixel_format.Value());
    switch (pixel_format) {
    case VideoPixelFormat::BGRA8:
    case VideoPixelFormat::RGBA8: {
@@ -83,37 +79,37 @@ void Vic::Execute() {
            sws_freeContext(scaler_ctx);
            scaler_ctx = nullptr;

-            // FFmpeg returns all frames in YUV420, convert it into expected format
-            scaler_ctx =
-                sws_getContext(frame->width, frame->height, AV_PIX_FMT_YUV420P, frame->width,
-                               frame->height, target_format, 0, nullptr, nullptr, nullptr);
+            // Frames are decoded into either YUV420 or NV12 formats. Convert to desired format
+            scaler_ctx = sws_getContext(frame->width, frame->height,
+                                        static_cast<AVPixelFormat>(frame->format), frame->width,
+                                        frame->height, target_format, 0, nullptr, nullptr, nullptr);

            scaler_width = frame->width;
            scaler_height = frame->height;
        }
        // Get Converted frame
-        const std::size_t linear_size = frame->width * frame->height * 4;
+        const u32 width = static_cast<u32>(frame->width);
+        const u32 height = static_cast<u32>(frame->height);
+        const std::size_t linear_size = width * height * 4;

        // Only allocate frame_buffer once per stream, as the size is not expected to change
        if (!converted_frame_buffer) {
            converted_frame_buffer = AVMallocPtr{static_cast<u8*>(av_malloc(linear_size)), av_free};
        }
-
-        const int converted_stride{frame->width * 4};
+        const std::array<int, 4> converted_stride{frame->width * 4, frame->height * 4, 0, 0};
        u8* const converted_frame_buf_addr{converted_frame_buffer.get()};

        sws_scale(scaler_ctx, frame->data, frame->linesize, 0, frame->height,
-                  &converted_frame_buf_addr, &converted_stride);
+                  &converted_frame_buf_addr, converted_stride.data());

        const u32 blk_kind = static_cast<u32>(config.block_linear_kind);
        if (blk_kind != 0) {
            // swizzle pitch linear to block linear
            const u32 block_height = static_cast<u32>(config.block_linear_height_log2);
-            const auto size = Tegra::Texture::CalculateSize(true, 4, frame->width, frame->height, 1,
-                                                            block_height, 0);
+            const auto size =
+                Tegra::Texture::CalculateSize(true, 4, width, height, 1, block_height, 0);
            luma_buffer.resize(size);
-            Tegra::Texture::SwizzleSubrect(frame->width, frame->height, frame->width * 4,
-                                           frame->width, 4, luma_buffer.data(),
+            Tegra::Texture::SwizzleSubrect(width, height, width * 4, width, 4, luma_buffer.data(),
                                           converted_frame_buffer.get(), block_height, 0, 0);

            gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), size);
@@ -131,41 +127,65 @@ void Vic::Execute() {
        const std::size_t surface_height = config.surface_height_minus1 + 1;
        const auto frame_width = std::min(surface_width, static_cast<size_t>(frame->width));
        const auto frame_height = std::min(surface_height, static_cast<size_t>(frame->height));
-        const std::size_t half_width = frame_width / 2;
-        const std::size_t half_height = frame_height / 2;
-        const std::size_t aligned_width = (surface_width + 0xff) & ~0xff;
+        const std::size_t aligned_width = (surface_width + 0xff) & ~std::size_t{0xff};

-        const auto* luma_ptr = frame->data[0];
-        const auto* chroma_b_ptr = frame->data[1];
-        const auto* chroma_r_ptr = frame->data[2];
        const auto stride = static_cast<size_t>(frame->linesize[0]);
-        const auto half_stride = static_cast<size_t>(frame->linesize[1]);

        luma_buffer.resize(aligned_width * surface_height);
        chroma_buffer.resize(aligned_width * surface_height / 2);

        // Populate luma buffer
+        const u8* luma_src = frame->data[0];
        for (std::size_t y = 0; y < frame_height; ++y) {
            const std::size_t src = y * stride;
            const std::size_t dst = y * aligned_width;
            for (std::size_t x = 0; x < frame_width; ++x) {
-                luma_buffer[dst + x] = luma_ptr[src + x];
+                luma_buffer[dst + x] = luma_src[src + x];
            }
        }
        gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(),
                                       luma_buffer.size());

-        // Populate chroma buffer from both channels with interleaving.
-        for (std::size_t y = 0; y < half_height; ++y) {
-            const std::size_t src = y * half_stride;
-            const std::size_t dst = y * aligned_width;
+        // Chroma
+        const std::size_t half_height = frame_height / 2;
+        const auto half_stride = static_cast<size_t>(frame->linesize[1]);

-            for (std::size_t x = 0; x < half_width; ++x) {
-                chroma_buffer[dst + x * 2] = chroma_b_ptr[src + x];
-                chroma_buffer[dst + x * 2 + 1] = chroma_r_ptr[src + x];
+        switch (frame->format) {
+        case AV_PIX_FMT_YUV420P: {
+            // Frame from FFmpeg software
+            // Populate chroma buffer from both channels with interleaving.
+            const std::size_t half_width = frame_width / 2;
+            const u8* chroma_b_src = frame->data[1];
+            const u8* chroma_r_src = frame->data[2];
+            for (std::size_t y = 0; y < half_height; ++y) {
+                const std::size_t src = y * half_stride;
+                const std::size_t dst = y * aligned_width;
+
+                for (std::size_t x = 0; x < half_width; ++x) {
+                    chroma_buffer[dst + x * 2] = chroma_b_src[src + x];
+                    chroma_buffer[dst + x * 2 + 1] = chroma_r_src[src + x];
+                }
            }
+            break;
        }
-        gpu.MemoryManager().WriteBlock(output_surface_chroma_u_address, chroma_buffer.data(),
+        case AV_PIX_FMT_NV12: {
+            // Frame from VA-API hardware: plane 1 already holds interleaved chroma, so copy
+            // it row by row, stepping the source by plane 1's own stride (linesize[1]).
+            const u8* chroma_src = frame->data[1];
+            for (std::size_t y = 0; y < half_height; ++y) {
+                const std::size_t src = y * half_stride;
+                const std::size_t dst = y * aligned_width;
+                for (std::size_t x = 0; x < frame_width; ++x) {
+                    chroma_buffer[dst + x] = chroma_src[src + x];
+                }
+            }
+            break;
+        }
+        default:
+            UNREACHABLE();
+            break;
+        }
+        gpu.MemoryManager().WriteBlock(output_surface_chroma_address, chroma_buffer.data(),
                                       chroma_buffer.size());
        break;
    }
--- a/src/video_core/command_classes/vic.h
+++ b/src/video_core/command_classes/vic.h
@@ -22,8 +22,8 @@ public:
        SetControlParams = 0x1c1,
        SetConfigStructOffset = 0x1c2,
        SetOutputSurfaceLumaOffset = 0x1c8,
-        SetOutputSurfaceChromaUOffset = 0x1c9,
-        SetOutputSurfaceChromaVOffset = 0x1ca
+        SetOutputSurfaceChromaOffset = 0x1c9,
+        SetOutputSurfaceChromaUnusedOffset = 0x1ca
    };

    explicit Vic(GPU& gpu, std::shared_ptr<Nvdec> nvdec_processor);
@@ -64,8 +64,7 @@ private:

    GPUVAddr config_struct_address{};
    GPUVAddr output_surface_luma_address{};
-    GPUVAddr output_surface_chroma_u_address{};
-    GPUVAddr output_surface_chroma_v_address{};
+    GPUVAddr output_surface_chroma_address{};

    SwsContext* scaler_ctx{};
    s32 scaler_width{};
--- a/src/video_core/host_shaders/astc_decoder.comp
+++ b/src/video_core/host_shaders/astc_decoder.comp
@@ -10,33 +10,27 @@
 #define END_PUSH_CONSTANTS };
 #define UNIFORM(n)
 #define BINDING_INPUT_BUFFER 0
-#define BINDING_ENC_BUFFER 1
-#define BINDING_SWIZZLE_BUFFER 2
-#define BINDING_OUTPUT_IMAGE 3
+#define BINDING_OUTPUT_IMAGE 1

 #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv

 #define BEGIN_PUSH_CONSTANTS
 #define END_PUSH_CONSTANTS
 #define UNIFORM(n) layout(location = n) uniform
-#define BINDING_SWIZZLE_BUFFER 0
-#define BINDING_INPUT_BUFFER 1
-#define BINDING_ENC_BUFFER 2
+#define BINDING_INPUT_BUFFER 0
 #define BINDING_OUTPUT_IMAGE 0

 #endif

-layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in;
+layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;

 BEGIN_PUSH_CONSTANTS
 UNIFORM(1) uvec2 block_dims;
-
-UNIFORM(2) uint bytes_per_block_log2;
-UNIFORM(3) uint layer_stride;
-UNIFORM(4) uint block_size;
-UNIFORM(5) uint x_shift;
-UNIFORM(6) uint block_height;
-UNIFORM(7) uint block_height_mask;
+UNIFORM(2) uint layer_stride;
+UNIFORM(3) uint block_size;
+UNIFORM(4) uint x_shift;
+UNIFORM(5) uint block_height;
+UNIFORM(6) uint block_height_mask;
 END_PUSH_CONSTANTS

 struct EncodingData {
@@ -55,45 +49,35 @@ struct TexelWeightParams {
    bool void_extent_hdr;
 };

-// Swizzle data
-layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable {
-    uint swizzle_table[];
-};
-
 layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU32 {
-    uint astc_data[];
-};
-
-// ASTC Encodings data
-layout(binding = BINDING_ENC_BUFFER, std430) readonly buffer EncodingsValues {
-    EncodingData encoding_values[];
+    uvec4 astc_data[];
 };

 layout(binding = BINDING_OUTPUT_IMAGE, rgba8) uniform writeonly image2DArray dest_image;

-const uint GOB_SIZE_X = 64;
-const uint GOB_SIZE_Y = 8;
-const uint GOB_SIZE_Z = 1;
-const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z;
-
 const uint GOB_SIZE_X_SHIFT = 6;
 const uint GOB_SIZE_Y_SHIFT = 3;
-const uint GOB_SIZE_Z_SHIFT = 0;
-const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT;
+const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT;

-const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1, GOB_SIZE_Y - 1);
-
-const int BLOCK_SIZE_IN_BYTES = 16;
-
-const int BLOCK_INFO_ERROR = 0;
-const int BLOCK_INFO_VOID_EXTENT_HDR = 1;
-const int BLOCK_INFO_VOID_EXTENT_LDR = 2;
-const int BLOCK_INFO_NORMAL = 3;
+const uint BYTES_PER_BLOCK_LOG2 = 4;

 const int JUST_BITS = 0;
 const int QUINT = 1;
 const int TRIT = 2;

+// ASTC Encodings data, sorted in ascending order based on their BitLength value
+// (see GetBitLength() function)
+EncodingData encoding_values[22] = EncodingData[](
+    EncodingData(JUST_BITS, 0, 0, 0), EncodingData(JUST_BITS, 1, 0, 0), EncodingData(TRIT, 0, 0, 0),
+    EncodingData(JUST_BITS, 2, 0, 0), EncodingData(QUINT, 0, 0, 0), EncodingData(TRIT, 1, 0, 0),
+    EncodingData(JUST_BITS, 3, 0, 0), EncodingData(QUINT, 1, 0, 0), EncodingData(TRIT, 2, 0, 0),
+    EncodingData(JUST_BITS, 4, 0, 0), EncodingData(QUINT, 2, 0, 0), EncodingData(TRIT, 3, 0, 0),
+    EncodingData(JUST_BITS, 5, 0, 0), EncodingData(QUINT, 3, 0, 0), EncodingData(TRIT, 4, 0, 0),
+    EncodingData(JUST_BITS, 6, 0, 0), EncodingData(QUINT, 4, 0, 0), EncodingData(TRIT, 5, 0, 0),
+    EncodingData(JUST_BITS, 7, 0, 0), EncodingData(QUINT, 5, 0, 0), EncodingData(TRIT, 6, 0, 0),
+    EncodingData(JUST_BITS, 8, 0, 0)
+);
+
 // The following constants are expanded variants of the Replicate()
 // function calls corresponding to the following arguments:
 // value: index into the generated table
@@ -135,44 +119,37 @@ const uint REPLICATE_7_BIT_TO_8_TABLE[128] =
 // Input ASTC texture globals
 uint current_index = 0;
 int bitsread = 0;
-uint total_bitsread = 0;
-uint local_buff[16];
+int total_bitsread = 0;
+uvec4 local_buff;

 // Color data globals
-uint color_endpoint_data[16];
+uvec4 color_endpoint_data;
 int color_bitsread = 0;
-uint total_color_bitsread = 0;
-int color_index = 0;

 // Four values, two endpoints, four maximum paritions
 uint color_values[32];
 int colvals_index = 0;

 // Weight data globals
-uint texel_weight_data[16];
+uvec4 texel_weight_data;
 int texel_bitsread = 0;
-uint total_texel_bitsread = 0;
-int texel_index = 0;

 bool texel_flag = false;

 // Global "vectors" to be pushed into when decoding
-EncodingData result_vector[100];
+EncodingData result_vector[144];
 int result_index = 0;

-EncodingData texel_vector[100];
+EncodingData texel_vector[144];
 int texel_vector_index = 0;

 uint unquantized_texel_weights[2][144];

 uint SwizzleOffset(uvec2 pos) {
-    pos = pos & SWIZZLE_MASK;
-    return swizzle_table[pos.y * 64 + pos.x];
-}
-
-uint ReadTexel(uint offset) {
-    // extract the 8-bit value from the 32-bit packed data.
-    return bitfieldExtract(astc_data[offset / 4], int((offset * 8) & 24), 8);
+    uint x = pos.x;
+    uint y = pos.y;
+    return ((x % 64) / 32) * 256 + ((y % 8) / 2) * 64 + ((x % 32) / 16) * 32 +
+                          (y % 2) * 16 + (x % 16);
 }

 // Replicates low num_bits such that [(to_bit - 1):(to_bit - 1 - from_bit)]
@@ -278,14 +255,10 @@ uint Hash52(uint p) {
    return p;
 }

-uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bool small_block) {
-    if (partition_count == 1) {
-        return 0;
-    }
+uint Select2DPartition(uint seed, uint x, uint y, uint partition_count, bool small_block) {
    if (small_block) {
        x <<= 1;
        y <<= 1;
-        z <<= 1;
    }

    seed += (partition_count - 1) * 1024;
@@ -299,10 +272,6 @@ uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bo
    uint seed6 = uint((rnum >> 20) & 0xF);
    uint seed7 = uint((rnum >> 24) & 0xF);
    uint seed8 = uint((rnum >> 28) & 0xF);
-    uint seed9 = uint((rnum >> 18) & 0xF);
-    uint seed10 = uint((rnum >> 22) & 0xF);
-    uint seed11 = uint((rnum >> 26) & 0xF);
-    uint seed12 = uint(((rnum >> 30) | (rnum << 2)) & 0xF);

    seed1 = (seed1 * seed1);
    seed2 = (seed2 * seed2);
@@ -312,12 +281,8 @@ uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bo
    seed6 = (seed6 * seed6);
    seed7 = (seed7 * seed7);
    seed8 = (seed8 * seed8);
-    seed9 = (seed9 * seed9);
-    seed10 = (seed10 * seed10);
-    seed11 = (seed11 * seed11);
-    seed12 = (seed12 * seed12);

-    int sh1, sh2, sh3;
+    uint sh1, sh2;
    if ((seed & 1) > 0) {
        sh1 = (seed & 2) > 0 ? 4 : 5;
        sh2 = (partition_count == 3) ? 6 : 5;
@@ -325,25 +290,19 @@ uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bo
        sh1 = (partition_count == 3) ? 6 : 5;
        sh2 = (seed & 2) > 0 ? 4 : 5;
    }
-    sh3 = (seed & 0x10) > 0 ? sh1 : sh2;
+    seed1 >>= sh1;
+    seed2 >>= sh2;
+    seed3 >>= sh1;
+    seed4 >>= sh2;
+    seed5 >>= sh1;
+    seed6 >>= sh2;
+    seed7 >>= sh1;
+    seed8 >>= sh2;

-    seed1 = (seed1 >> sh1);
-    seed2 = (seed2 >> sh2);
-    seed3 = (seed3 >> sh1);
-    seed4 = (seed4 >> sh2);
-    seed5 = (seed5 >> sh1);
-    seed6 = (seed6 >> sh2);
-    seed7 = (seed7 >> sh1);
-    seed8 = (seed8 >> sh2);
-    seed9 = (seed9 >> sh3);
-    seed10 = (seed10 >> sh3);
-    seed11 = (seed11 >> sh3);
-    seed12 = (seed12 >> sh3);
-
-    uint a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14);
-    uint b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10);
-    uint c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6);
-    uint d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2);
+    uint a = seed1 * x + seed2 * y + (rnum >> 14);
+    uint b = seed3 * x + seed4 * y + (rnum >> 10);
+    uint c = seed5 * x + seed6 * y + (rnum >> 6);
+    uint d = seed7 * x + seed8 * y + (rnum >> 2);

    a &= 0x3F;
    b &= 0x3F;
@@ -368,58 +327,37 @@ uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bo
    }
 }

-uint Select2DPartition(uint seed, uint x, uint y, uint partition_count, bool small_block) {
-    return SelectPartition(seed, x, y, 0, partition_count, small_block);
-}
-
-uint ReadBit() {
-    if (current_index >= local_buff.length()) {
+uint ExtractBits(uvec4 payload, int offset, int bits) {
+    if (bits <= 0) {
        return 0;
    }
-    uint bit = bitfieldExtract(local_buff[current_index], bitsread, 1);
-    ++bitsread;
-    ++total_bitsread;
-    if (bitsread == 8) {
-        ++current_index;
-        bitsread = 0;
+    int last_offset = offset + bits - 1;
+    int shifted_offset = offset >> 5;
+    if ((last_offset >> 5) == shifted_offset) {
+        return bitfieldExtract(payload[shifted_offset], offset & 31, bits);
    }
-    return bit;
+    int first_bits = 32 - (offset & 31);
+    int result_first = int(bitfieldExtract(payload[shifted_offset], offset & 31, first_bits));
+    int result_second = int(bitfieldExtract(payload[shifted_offset + 1], 0, bits - first_bits));
+    return result_first | (result_second << first_bits);
 }

 uint StreamBits(uint num_bits) {
-    uint ret = 0;
-    for (uint i = 0; i < num_bits; i++) {
-        ret |= ((ReadBit() & 1) << i);
-    }
+    int int_bits = int(num_bits);
+    uint ret = ExtractBits(local_buff, total_bitsread, int_bits);
+    total_bitsread += int_bits;
    return ret;
 }

-uint ReadColorBit() {
-    uint bit = 0;
-    if (texel_flag) {
-        bit = bitfieldExtract(texel_weight_data[texel_index], texel_bitsread, 1);
-        ++texel_bitsread;
-        ++total_texel_bitsread;
-        if (texel_bitsread == 8) {
-            ++texel_index;
-            texel_bitsread = 0;
-        }
-    } else {
-        bit = bitfieldExtract(color_endpoint_data[color_index], color_bitsread, 1);
-        ++color_bitsread;
-        ++total_color_bitsread;
-        if (color_bitsread == 8) {
-            ++color_index;
-            color_bitsread = 0;
-        }
-    }
-    return bit;
-}
-
 uint StreamColorBits(uint num_bits) {
    uint ret = 0;
-    for (uint i = 0; i < num_bits; i++) {
-        ret |= ((ReadColorBit() & 1) << i);
+    int int_bits = int(num_bits);
+    if (texel_flag) {
+        ret = ExtractBits(texel_weight_data, texel_bitsread, int_bits);
+        texel_bitsread += int_bits;
+    } else {
+        ret = ExtractBits(color_endpoint_data, color_bitsread, int_bits);
+        color_bitsread += int_bits;
    }
    return ret;
 }
@@ -596,22 +534,16 @@ void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) {
    for (uint i = 0; i < num_partitions; i++) {
        num_values += ((modes[i] >> 2) + 1) << 1;
    }
-    int range = 256;
-    while (--range > 0) {
-        EncodingData val = encoding_values[range];
+    // Find the largest encoding that's within color_data_bits
+    // TODO(ameerj): profile with binary search
+    int range = 0;
+    while (++range < encoding_values.length()) {
        uint bit_length = GetBitLength(num_values, range);
-        if (bit_length <= color_data_bits) {
-            while (--range > 0) {
-                EncodingData newval = encoding_values[range];
-                if (newval.encoding != val.encoding && newval.num_bits != val.num_bits) {
-                    break;
-                }
-            }
-            ++range;
+        if (bit_length > color_data_bits) {
            break;
        }
    }
-    DecodeIntegerSequence(range, num_values);
+    DecodeIntegerSequence(range - 1, num_values);
    uint out_index = 0;
    for (int itr = 0; itr < result_index; ++itr) {
        if (out_index >= num_values) {
@@ -1028,7 +960,7 @@ int FindLayout(uint mode) {
    return 5;
 }

-TexelWeightParams DecodeBlockInfo(uint block_index) {
+TexelWeightParams DecodeBlockInfo() {
    TexelWeightParams params = TexelWeightParams(uvec2(0), 0, false, false, false, false);
    uint mode = StreamBits(11);
    if ((mode & 0x1ff) == 0x1fc) {
@@ -1110,10 +1042,10 @@ TexelWeightParams DecodeBlockInfo(uint block_index) {
    }
    weight_index -= 2;
    if ((mode_layout != 9) && ((mode & 0x200) != 0)) {
-        const int max_weights[6] = int[6](9, 11, 15, 19, 23, 31);
+        const int max_weights[6] = int[6](7, 8, 9, 10, 11, 12);
        params.max_weight = max_weights[weight_index];
    } else {
-        const int max_weights[6] = int[6](1, 2, 3, 4, 5, 7);
+        const int max_weights[6] = int[6](1, 2, 3, 4, 5, 6);
        params.max_weight = max_weights[weight_index];
    }
    return params;
@@ -1144,8 +1076,8 @@ void FillVoidExtentLDR(ivec3 coord) {
    }
 }

-void DecompressBlock(ivec3 coord, uint block_index) {
-    TexelWeightParams params = DecodeBlockInfo(block_index);
+void DecompressBlock(ivec3 coord) {
+    TexelWeightParams params = DecodeBlockInfo();
    if (params.error_state) {
        FillError(coord);
        return;
@@ -1212,7 +1144,7 @@ void DecompressBlock(ivec3 coord, uint block_index) {
    // Read color data...
    uint color_data_bits = remaining_bits;
    while (remaining_bits > 0) {
-        int nb = int(min(remaining_bits, 8U));
+        int nb = int(min(remaining_bits, 32U));
        uint b = StreamBits(nb);
        color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, nb));
        ++ced_pointer;
@@ -1254,25 +1186,20 @@ void DecompressBlock(ivec3 coord, uint block_index) {
        ComputeEndpoints(endpoints[i][0], endpoints[i][1], color_endpoint_mode[i]);
    }

-    for (uint i = 0; i < 16; i++) {
-        texel_weight_data[i] = local_buff[i];
-    }
-    for (uint i = 0; i < 8; i++) {
-#define REVERSE_BYTE(b) ((b * 0x0802U & 0x22110U) | (b * 0x8020U & 0x88440U)) * 0x10101U >> 16
-        uint a = REVERSE_BYTE(texel_weight_data[i]);
-        uint b = REVERSE_BYTE(texel_weight_data[15 - i]);
-#undef REVERSE_BYTE
-        texel_weight_data[i] = uint(bitfieldExtract(b, 0, 8));
-        texel_weight_data[15 - i] = uint(bitfieldExtract(a, 0, 8));
-    }
+    texel_weight_data = local_buff;
+    texel_weight_data = bitfieldReverse(texel_weight_data).wzyx;
    uint clear_byte_start =
        (GetPackedBitSize(params.size, params.dual_plane, params.max_weight) >> 3) + 1;
-    texel_weight_data[clear_byte_start - 1] =
-        texel_weight_data[clear_byte_start - 1] &
+
+    uint byte_insert = ExtractBits(texel_weight_data, int(clear_byte_start - 1) * 8, 8) &
        uint(
            ((1 << (GetPackedBitSize(params.size, params.dual_plane, params.max_weight) % 8)) - 1));
-    for (uint i = 0; i < 16 - clear_byte_start; i++) {
-        texel_weight_data[clear_byte_start + i] = 0U;
+    uint vec_index = (clear_byte_start - 1) >> 2;
+    texel_weight_data[vec_index] =
+        bitfieldInsert(texel_weight_data[vec_index], byte_insert, int((clear_byte_start - 1) % 4) * 8, 8);
+    for (uint i = clear_byte_start; i < 16; ++i) {
+        uint idx = i >> 2;
+        texel_weight_data[idx] = bitfieldInsert(texel_weight_data[idx], 0, int(i % 4) * 8, 8);
    }
    texel_flag = true; // use texel "vector" and bit stream in integer decoding
    DecodeIntegerSequence(params.max_weight, GetNumWeightValues(params.size, params.dual_plane));
@@ -1281,8 +1208,11 @@ void DecompressBlock(ivec3 coord, uint block_index) {

    for (uint j = 0; j < block_dims.y; j++) {
        for (uint i = 0; i < block_dims.x; i++) {
-            uint local_partition = Select2DPartition(partition_index, i, j, num_partitions,
+            uint local_partition = 0;
+            if (num_partitions > 1) {
+                local_partition = Select2DPartition(partition_index, i, j, num_partitions,
                                                     (block_dims.y * block_dims.x) < 32);
+            }
            vec4 p;
            uvec4 C0 = ReplicateByteTo16(endpoints[local_partition][0]);
            uvec4 C1 = ReplicateByteTo16(endpoints[local_partition][1]);
@@ -1303,7 +1233,7 @@ void DecompressBlock(ivec3 coord, uint block_index) {

 void main() {
    uvec3 pos = gl_GlobalInvocationID;
-    pos.x <<= bytes_per_block_log2;
+    pos.x <<= BYTES_PER_BLOCK_LOG2;

    // Read as soon as possible due to its latency
    const uint swizzle = SwizzleOffset(pos.xy);
@@ -1321,13 +1251,8 @@ void main() {
    if (any(greaterThanEqual(coord, imageSize(dest_image)))) {
        return;
    }
-    uint block_index =
-        pos.z * gl_WorkGroupSize.x * gl_WorkGroupSize.y + pos.y * gl_WorkGroupSize.x + pos.x;
-
    current_index = 0;
    bitsread = 0;
-    for (int i = 0; i < 16; i++) {
-        local_buff[i] = ReadTexel(offset + i);
-    }
-    DecompressBlock(coord, block_index);
+    local_buff = astc_data[offset / 16];
+    DecompressBlock(coord);
 }
--- a/src/video_core/macro/macro_jit_x64.h
+++ b/src/video_core/macro/macro_jit_x64.h
@@ -6,7 +6,7 @@

 #include <array>
 #include <bitset>
-#include <xbyak.h>
+#include <xbyak/xbyak.h>
 #include "common/bit_field.h"
 #include "common/common_types.h"
 #include "common/x64/xbyak_abi.h"
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -463,6 +463,7 @@ std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
        ++page_index;
        page_offset = 0;
        remaining_size -= num_bytes;
+        old_page_addr = page_addr;
    }
    split();
    return result;
--- a/src/video_core/renderer_base.cpp
+++ b/src/video_core/renderer_base.cpp
@@ -27,7 +27,7 @@ void RendererBase::UpdateCurrentFramebufferLayout() {
    render_window.UpdateCurrentFramebufferLayout(layout.width, layout.height);
 }

-void RendererBase::RequestScreenshot(void* data, std::function<void()> callback,
+void RendererBase::RequestScreenshot(void* data, std::function<void(bool)> callback,
                                     const Layout::FramebufferLayout& layout) {
    if (renderer_settings.screenshot_requested) {
        LOG_ERROR(Render, "A screenshot is already requested or in progress, ignoring the request");
--- a/src/video_core/renderer_base.h
+++ b/src/video_core/renderer_base.h
@@ -24,7 +24,7 @@ struct RendererSettings {
    // Screenshot
    std::atomic<bool> screenshot_requested{false};
    void* screenshot_bits{};
-    std::function<void()> screenshot_complete_callback;
+    std::function<void(bool)> screenshot_complete_callback;
    Layout::FramebufferLayout screenshot_framebuffer_layout;
 };

@@ -80,7 +80,7 @@ public:
    void RefreshBaseSettings();

    /// Request a screenshot of the next frame
-    void RequestScreenshot(void* data, std::function<void()> callback,
+    void RequestScreenshot(void* data, std::function<void(bool)> callback,
                           const Layout::FramebufferLayout& layout);

 protected:
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -156,6 +156,10 @@ public:
        return shader_backend;
    }

+    bool IsAmd() const {
+        return vendor_name == "ATI Technologies Inc.";
+    }
+
 private:
    static bool TestVariableAoffi();
    static bool TestPreciseBug();
--- a/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp
+++ b/src/video_core/renderer_opengl/gl_graphics_pipeline.cpp
@@ -15,7 +15,7 @@
 #include "video_core/renderer_opengl/gl_shader_util.h"
 #include "video_core/renderer_opengl/gl_state_tracker.h"
 #include "video_core/shader_notify.h"
-#include "video_core/texture_cache/texture_cache.h"
+#include "video_core/texture_cache/texture_cache_base.h"

 #if defined(_MSC_VER) && defined(NDEBUG)
 #define LAMBDA_FORCEINLINE [[msvc::forceinline]]
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -32,7 +32,7 @@
 #include "video_core/renderer_opengl/maxwell_to_gl.h"
 #include "video_core/renderer_opengl/renderer_opengl.h"
 #include "video_core/shader_cache.h"
-#include "video_core/texture_cache/texture_cache.h"
+#include "video_core/texture_cache/texture_cache_base.h"

 namespace OpenGL {

--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -219,6 +219,7 @@ ShaderCache::ShaderCache(RasterizerOpenGL& rasterizer_, Core::Frontend::EmuWindo
      host_info{
          .support_float16 = false,
          .support_int64 = device.HasShaderInt64(),
+          .needs_demote_reorder = device.IsAmd(),
      } {
    if (use_asynchronous_shaders) {
        workers = CreateWorkers();
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -18,10 +18,8 @@
 #include "video_core/renderer_opengl/maxwell_to_gl.h"
 #include "video_core/renderer_opengl/util_shaders.h"
 #include "video_core/surface.h"
-#include "video_core/texture_cache/format_lookup_table.h"
+#include "video_core/texture_cache/formatter.h"
 #include "video_core/texture_cache/samples_helper.h"
-#include "video_core/texture_cache/texture_cache.h"
-#include "video_core/textures/decoders.h"

 namespace OpenGL {
 namespace {
--- a/src/video_core/renderer_opengl/gl_texture_cache.h
+++ b/src/video_core/renderer_opengl/gl_texture_cache.h
@@ -12,7 +12,7 @@
 #include "shader_recompiler/shader_info.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/util_shaders.h"
-#include "video_core/texture_cache/texture_cache.h"
+#include "video_core/texture_cache/texture_cache_base.h"

 namespace OpenGL {

--- a/src/video_core/renderer_opengl/gl_texture_cache_base.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache_base.cpp
@@ -0,0 +1,10 @@
+// Copyright 2021 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "video_core/renderer_opengl/gl_texture_cache.h"
+#include "video_core/texture_cache/texture_cache.h"
+
+namespace VideoCommon {
+template class VideoCommon::TextureCache<OpenGL::TextureCacheParams>;
+}
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -486,7 +486,7 @@ void RendererOpenGL::RenderScreenshot() {
    glBindFramebuffer(GL_READ_FRAMEBUFFER, old_read_fb);
    glBindFramebuffer(GL_DRAW_FRAMEBUFFER, old_draw_fb);

-    renderer_settings.screenshot_complete_callback();
+    renderer_settings.screenshot_complete_callback(true);
    renderer_settings.screenshot_requested = false;
 }

--- a/src/video_core/renderer_opengl/util_shaders.cpp
+++ b/src/video_core/renderer_opengl/util_shaders.cpp
@@ -60,19 +60,14 @@ UtilShaders::UtilShaders(ProgramManager& program_manager_)
      copy_bc4_program(MakeProgram(OPENGL_COPY_BC4_COMP)) {
    const auto swizzle_table = Tegra::Texture::MakeSwizzleTable();
    swizzle_table_buffer.Create();
-    astc_buffer.Create();
    glNamedBufferStorage(swizzle_table_buffer.handle, sizeof(swizzle_table), &swizzle_table, 0);
-    glNamedBufferStorage(astc_buffer.handle, sizeof(ASTC_ENCODINGS_VALUES), &ASTC_ENCODINGS_VALUES,
-                         0);
 }

 UtilShaders::~UtilShaders() = default;

 void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map,
                             std::span<const VideoCommon::SwizzleParameters> swizzles) {
-    static constexpr GLuint BINDING_SWIZZLE_BUFFER = 0;
-    static constexpr GLuint BINDING_INPUT_BUFFER = 1;
-    static constexpr GLuint BINDING_ENC_BUFFER = 2;
+    static constexpr GLuint BINDING_INPUT_BUFFER = 0;
    static constexpr GLuint BINDING_OUTPUT_IMAGE = 0;

    const Extent2D tile_size{
@@ -80,34 +75,32 @@ void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map,
        .height = VideoCore::Surface::DefaultBlockHeight(image.info.format),
    };
    program_manager.BindComputeProgram(astc_decoder_program.handle);
-    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle);
-    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_ENC_BUFFER, astc_buffer.handle);
-
    glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes);
    glUniform2ui(1, tile_size.width, tile_size.height);
+
    // Ensure buffer data is valid before dispatching
    glFlush();
    for (const SwizzleParameters& swizzle : swizzles) {
        const size_t input_offset = swizzle.buffer_offset + map.offset;
-        const u32 num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 32U);
-        const u32 num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 32U);
+        const u32 num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 8U);
+        const u32 num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 8U);

        const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info);
        ASSERT(params.origin == (std::array<u32, 3>{0, 0, 0}));
        ASSERT(params.destination == (std::array<s32, 3>{0, 0, 0}));
+        ASSERT(params.bytes_per_block_log2 == 4);

-        glUniform1ui(2, params.bytes_per_block_log2);
-        glUniform1ui(3, params.layer_stride);
-        glUniform1ui(4, params.block_size);
-        glUniform1ui(5, params.x_shift);
-        glUniform1ui(6, params.block_height);
-        glUniform1ui(7, params.block_height_mask);
+        glUniform1ui(2, params.layer_stride);
+        glUniform1ui(3, params.block_size);
+        glUniform1ui(4, params.x_shift);
+        glUniform1ui(5, params.block_height);
+        glUniform1ui(6, params.block_height_mask);

-        glBindImageTexture(BINDING_OUTPUT_IMAGE, image.StorageHandle(), swizzle.level, GL_TRUE, 0,
-                           GL_WRITE_ONLY, GL_RGBA8);
        // ASTC texture data
        glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset,
                          image.guest_size_bytes - swizzle.buffer_offset);
+        glBindImageTexture(BINDING_OUTPUT_IMAGE, image.StorageHandle(), swizzle.level, GL_TRUE, 0,
+                           GL_WRITE_ONLY, GL_RGBA8);

        glDispatchCompute(num_dispatches_x, num_dispatches_y, image.info.resources.layers);
    }
--- a/src/video_core/renderer_opengl/util_shaders.h
+++ b/src/video_core/renderer_opengl/util_shaders.h
@@ -62,7 +62,6 @@ private:
    ProgramManager& program_manager;

    OGLBuffer swizzle_table_buffer;
-    OGLBuffer astc_buffer;

    OGLProgram astc_decoder_program;
    OGLProgram block_linear_unswizzle_2d_program;
--- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp
+++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp
@@ -138,6 +138,7 @@ void RendererVulkan::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
    const bool use_accelerated =
        rasterizer.AccelerateDisplay(*framebuffer, framebuffer_addr, framebuffer->stride);
    const bool is_srgb = use_accelerated && screen_info.is_srgb;
+    RenderScreenshot(*framebuffer, use_accelerated);

    bool has_been_recreated = false;
    const auto recreate_swapchain = [&] {
@@ -162,8 +163,9 @@ void RendererVulkan::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
    if (has_been_recreated) {
        blit_screen.Recreate();
    }
-    const VkSemaphore render_semaphore = blit_screen.Draw(*framebuffer, use_accelerated);
-    scheduler.Flush(render_semaphore);
+    const VkSemaphore render_semaphore = blit_screen.DrawToSwapchain(*framebuffer, use_accelerated);
+    const VkSemaphore present_semaphore = swapchain.CurrentPresentSemaphore();
+    scheduler.Flush(render_semaphore, present_semaphore);
    scheduler.WaitWorker();
    swapchain.Present(render_semaphore);

@@ -193,4 +195,169 @@ void RendererVulkan::Report() const {
    telemetry_session.AddField(field, "GPU_Vulkan_Extensions", extensions);
 }

+/// Renders the guest framebuffer into an offscreen image and copies the resulting pixels into
+/// renderer_settings.screenshot_bits. Returns immediately when no screenshot was requested.
+/// @param framebuffer Guest framebuffer configuration forwarded to VKBlitScreen::Draw.
+/// @param use_accelerated Forwarded unchanged to VKBlitScreen::Draw.
+void RendererVulkan::RenderScreenshot(const Tegra::FramebufferConfig& framebuffer,
+                                      bool use_accelerated) {
+    if (!renderer_settings.screenshot_requested) {
+        return;
+    }
+    const Layout::FramebufferLayout layout{renderer_settings.screenshot_framebuffer_layout};
+    // Offscreen render target sized to the screenshot layout. MUTABLE_FORMAT is requested
+    // because the image view created below may use the sRGB variant of the image format.
+    vk::Image staging_image = device.GetLogical().CreateImage(VkImageCreateInfo{
+        .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
+        .pNext = nullptr,
+        .flags = VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT,
+        .imageType = VK_IMAGE_TYPE_2D,
+        .format = VK_FORMAT_B8G8R8A8_UNORM,
+        .extent =
+            {
+                .width = layout.width,
+                .height = layout.height,
+                .depth = 1,
+            },
+        .mipLevels = 1,
+        .arrayLayers = 1,
+        .samples = VK_SAMPLE_COUNT_1_BIT,
+        .tiling = VK_IMAGE_TILING_OPTIMAL,
+        .usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT |
+                 VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
+        .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
+        .queueFamilyIndexCount = 0,
+        .pQueueFamilyIndices = nullptr,
+        .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED,
+    });
+    // Memory committed for staging_image; kept in scope until the function returns.
+    const auto image_commit = memory_allocator.Commit(staging_image, MemoryUsage::DeviceLocal);
+
+    const vk::ImageView dst_view = device.GetLogical().CreateImageView(VkImageViewCreateInfo{
+        .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
+        .pNext = nullptr,
+        .flags = 0,
+        .image = *staging_image,
+        .viewType = VK_IMAGE_VIEW_TYPE_2D,
+        .format = screen_info.is_srgb ? VK_FORMAT_B8G8R8A8_SRGB : VK_FORMAT_B8G8R8A8_UNORM,
+        .components{
+            .r = VK_COMPONENT_SWIZZLE_IDENTITY,
+            .g = VK_COMPONENT_SWIZZLE_IDENTITY,
+            .b = VK_COMPONENT_SWIZZLE_IDENTITY,
+            .a = VK_COMPONENT_SWIZZLE_IDENTITY,
+        },
+        .subresourceRange{
+            .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+            .baseMipLevel = 0,
+            .levelCount = 1,
+            .baseArrayLayer = 0,
+            .layerCount = VK_REMAINING_ARRAY_LAYERS,
+        },
+    });
+    const VkExtent2D render_area{.width = layout.width, .height = layout.height};
+    const vk::Framebuffer screenshot_fb = blit_screen.CreateFramebuffer(*dst_view, render_area);
+    // Since we're not rendering to the screen, ignore the render semaphore.
+    void(blit_screen.Draw(framebuffer, *screenshot_fb, layout, render_area, use_accelerated));
+
+    // Widen before multiplying so the byte count is computed as a VkDeviceSize instead of in
+    // the narrower type of the layout dimensions, where the product could wrap around.
+    const auto buffer_size = static_cast<VkDeviceSize>(layout.width) * layout.height * 4;
+    const VkBufferCreateInfo dst_buffer_info{
+        .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
+        .pNext = nullptr,
+        .flags = 0,
+        .size = buffer_size,
+        .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT,
+        .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
+        .queueFamilyIndexCount = 0,
+        .pQueueFamilyIndices = nullptr,
+    };
+    const vk::Buffer dst_buffer = device.GetLogical().CreateBuffer(dst_buffer_info);
+    MemoryCommit dst_buffer_memory = memory_allocator.Commit(dst_buffer, MemoryUsage::Download);
+
+    scheduler.RequestOutsideRenderPassOperationContext();
+    scheduler.Record([&](vk::CommandBuffer cmdbuf) {
+        // Transition the rendered image so the transfer below can read from it.
+        const VkImageMemoryBarrier read_barrier{
+            .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
+            .pNext = nullptr,
+            .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
+            .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,
+            .oldLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR,
+            .newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
+            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+            .image = *staging_image,
+            .subresourceRange{
+                .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+                .baseMipLevel = 0,
+                .levelCount = VK_REMAINING_MIP_LEVELS,
+                .baseArrayLayer = 0,
+                .layerCount = VK_REMAINING_ARRAY_LAYERS,
+            },
+        };
+        const VkImageMemoryBarrier image_write_barrier{
+            .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
+            .pNext = nullptr,
+            .srcAccessMask = 0,
+            .dstAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
+            .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
+            .newLayout = VK_IMAGE_LAYOUT_GENERAL,
+            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+            .image = *staging_image,
+            .subresourceRange{
+                .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+                .baseMipLevel = 0,
+                .levelCount = VK_REMAINING_MIP_LEVELS,
+                .baseArrayLayer = 0,
+                .layerCount = VK_REMAINING_ARRAY_LAYERS,
+            },
+        };
+        static constexpr VkMemoryBarrier memory_write_barrier{
+            .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
+            .pNext = nullptr,
+            .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
+            .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT,
+        };
+        const VkBufferImageCopy copy{
+            .bufferOffset = 0,
+            .bufferRowLength = 0,
+            .bufferImageHeight = 0,
+            .imageSubresource{
+                .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+                .mipLevel = 0,
+                .baseArrayLayer = 0,
+                .layerCount = 1,
+            },
+            .imageOffset{.x = 0, .y = 0, .z = 0},
+            .imageExtent{
+                .width = layout.width,
+                .height = layout.height,
+                .depth = 1,
+            },
+        };
+        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
+                               0, read_barrier);
+        cmdbuf.CopyImageToBuffer(*staging_image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, *dst_buffer,
+                                 copy);
+        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
+                               0, memory_write_barrier, nullptr, image_write_barrier);
+    });
+    // Ensure the copy is fully completed before saving the screenshot
+    scheduler.Finish();
+
+    // Copy backing image data to the QImage screenshot buffer
+    const auto dst_memory_map = dst_buffer_memory.Map();
+    // NOTE(review): the mapped span presumably covers the whole memory commit, which may be
+    // larger than the buffer that was requested - confirm against MemoryCommit::Map. Clamp so
+    // that no more than buffer_size bytes are ever written to the screenshot buffer.
+    const auto mapped_size = static_cast<VkDeviceSize>(dst_memory_map.size());
+    const auto copy_size =
+        static_cast<std::size_t>(mapped_size < buffer_size ? mapped_size : buffer_size);
+    std::memcpy(renderer_settings.screenshot_bits, dst_memory_map.data(), copy_size);
+    renderer_settings.screenshot_complete_callback(false);
+    renderer_settings.screenshot_requested = false;
+}
+
 } // namespace Vulkan
--- a/src/video_core/renderer_vulkan/renderer_vulkan.h
+++ b/src/video_core/renderer_vulkan/renderer_vulkan.h
@@ -54,6 +54,8 @@ public:
 private:
    void Report() const;

+    void RenderScreenshot(const Tegra::FramebufferConfig& framebuffer, bool use_accelerated);
+
    Core::TelemetrySession& telemetry_session;
    Core::Memory::Memory& cpu_memory;
    Tegra::GPU& gpu;
--- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp
+++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp
@@ -130,7 +130,10 @@ void VKBlitScreen::Recreate() {
    CreateDynamicResources();
 }

-VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, bool use_accelerated) {
+VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer,
+                               const VkFramebuffer& host_framebuffer,
+                               const Layout::FramebufferLayout layout, VkExtent2D render_area,
+                               bool use_accelerated) {
    RefreshResources(framebuffer);

    // Finish any pending renderpass
@@ -145,8 +148,8 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, bool
                        use_accelerated ? screen_info.image_view : *raw_image_views[image_index]);

    BufferData data;
-    SetUniformData(data, framebuffer);
-    SetVertexData(data, framebuffer);
+    SetUniformData(data, layout);
+    SetVertexData(data, framebuffer, layout);

    const std::span<u8> mapped_span = buffer_commit.Map();
    std::memcpy(mapped_span.data(), &data, sizeof(data));
@@ -156,11 +159,13 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, bool

        const VAddr framebuffer_addr = framebuffer.address + framebuffer.offset;
        const u8* const host_ptr = cpu_memory.GetPointer(framebuffer_addr);
-        const size_t size_bytes = GetSizeInBytes(framebuffer);

        // TODO(Rodrigo): Read this from HLE
        constexpr u32 block_height_log2 = 4;
        const u32 bytes_per_pixel = GetBytesPerPixel(framebuffer);
+        const u64 size_bytes{Tegra::Texture::CalculateSize(true, bytes_per_pixel,
+                                                           framebuffer.stride, framebuffer.height,
+                                                           1, block_height_log2, 0)};
        Tegra::Texture::UnswizzleTexture(
            mapped_span.subspan(image_offset, size_bytes), std::span(host_ptr, size_bytes),
            bytes_per_pixel, framebuffer.width, framebuffer.height, 1, block_height_log2, 0);
@@ -220,52 +225,75 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, bool
                                   VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, 0, write_barrier);
        });
    }
-    scheduler.Record([this, image_index, size = swapchain.GetSize()](vk::CommandBuffer cmdbuf) {
-        const f32 bg_red = Settings::values.bg_red.GetValue() / 255.0f;
-        const f32 bg_green = Settings::values.bg_green.GetValue() / 255.0f;
-        const f32 bg_blue = Settings::values.bg_blue.GetValue() / 255.0f;
-        const VkClearValue clear_color{
-            .color = {.float32 = {bg_red, bg_green, bg_blue, 1.0f}},
-        };
-        const VkRenderPassBeginInfo renderpass_bi{
-            .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
-            .pNext = nullptr,
-            .renderPass = *renderpass,
-            .framebuffer = *framebuffers[image_index],
-            .renderArea =
-                {
-                    .offset = {0, 0},
-                    .extent = size,
-                },
-            .clearValueCount = 1,
-            .pClearValues = &clear_color,
-        };
-        const VkViewport viewport{
-            .x = 0.0f,
-            .y = 0.0f,
-            .width = static_cast<float>(size.width),
-            .height = static_cast<float>(size.height),
-            .minDepth = 0.0f,
-            .maxDepth = 1.0f,
-        };
-        const VkRect2D scissor{
-            .offset = {0, 0},
-            .extent = size,
-        };
-        cmdbuf.BeginRenderPass(renderpass_bi, VK_SUBPASS_CONTENTS_INLINE);
-        cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, *pipeline);
-        cmdbuf.SetViewport(0, viewport);
-        cmdbuf.SetScissor(0, scissor);
+    scheduler.Record(
+        [this, host_framebuffer, image_index, size = render_area](vk::CommandBuffer cmdbuf) {
+            const f32 bg_red = Settings::values.bg_red.GetValue() / 255.0f;
+            const f32 bg_green = Settings::values.bg_green.GetValue() / 255.0f;
+            const f32 bg_blue = Settings::values.bg_blue.GetValue() / 255.0f;
+            const VkClearValue clear_color{
+                .color = {.float32 = {bg_red, bg_green, bg_blue, 1.0f}},
+            };
+            const VkRenderPassBeginInfo renderpass_bi{
+                .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
+                .pNext = nullptr,
+                .renderPass = *renderpass,
+                .framebuffer = host_framebuffer,
+                .renderArea =
+                    {
+                        .offset = {0, 0},
+                        .extent = size,
+                    },
+                .clearValueCount = 1,
+                .pClearValues = &clear_color,
+            };
+            const VkViewport viewport{
+                .x = 0.0f,
+                .y = 0.0f,
+                .width = static_cast<float>(size.width),
+                .height = static_cast<float>(size.height),
+                .minDepth = 0.0f,
+                .maxDepth = 1.0f,
+            };
+            const VkRect2D scissor{
+                .offset = {0, 0},
+                .extent = size,
+            };
+            cmdbuf.BeginRenderPass(renderpass_bi, VK_SUBPASS_CONTENTS_INLINE);
+            cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, *pipeline);
+            cmdbuf.SetViewport(0, viewport);
+            cmdbuf.SetScissor(0, scissor);

-        cmdbuf.BindVertexBuffer(0, *buffer, offsetof(BufferData, vertices));
-        cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_GRAPHICS, *pipeline_layout, 0,
-                                  descriptor_sets[image_index], {});
-        cmdbuf.Draw(4, 1, 0, 0);
-        cmdbuf.EndRenderPass();
-    });
+            cmdbuf.BindVertexBuffer(0, *buffer, offsetof(BufferData, vertices));
+            cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_GRAPHICS, *pipeline_layout, 0,
+                                      descriptor_sets[image_index], {});
+            cmdbuf.Draw(4, 1, 0, 0);
+            cmdbuf.EndRenderPass();
+        });
    return *semaphores[image_index];
 }

+VkSemaphore VKBlitScreen::DrawToSwapchain(const Tegra::FramebufferConfig& framebuffer,
+                                          bool use_accelerated) { // Draw() targeting swapchain
+    const std::size_t image_index = swapchain.GetImageIndex(); // picks the framebuffer below
+    const VkExtent2D render_area = swapchain.GetSize(); // render over the full swapchain extent
+    const Layout::FramebufferLayout layout = render_window.GetFramebufferLayout();
+    return Draw(framebuffer, *framebuffers[image_index], layout, render_area, use_accelerated);
+}
+
+vk::Framebuffer VKBlitScreen::CreateFramebuffer(const VkImageView& image_view, VkExtent2D extent) {
+    return device.GetLogical().CreateFramebuffer(VkFramebufferCreateInfo{
+        .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
+        .pNext = nullptr,
+        .flags = 0,
+        .renderPass = *renderpass, // same render pass that Draw() begins
+        .attachmentCount = 1, // image_view is the only attachment
+        .pAttachments = &image_view,
+        .width = extent.width,
+        .height = extent.height,
+        .layers = 1,
+    });
+}
+
 void VKBlitScreen::CreateStaticResources() {
    CreateShaders();
    CreateSemaphores();
@@ -609,22 +637,9 @@ void VKBlitScreen::CreateFramebuffers() {
    const VkExtent2D size{swapchain.GetSize()};
    framebuffers.resize(image_count);

-    VkFramebufferCreateInfo ci{
-        .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
-        .pNext = nullptr,
-        .flags = 0,
-        .renderPass = *renderpass,
-        .attachmentCount = 1,
-        .pAttachments = nullptr,
-        .width = size.width,
-        .height = size.height,
-        .layers = 1,
-    };
-
    for (std::size_t i = 0; i < image_count; ++i) {
        const VkImageView image_view{swapchain.GetImageViewIndex(i)};
-        ci.pAttachments = &image_view;
-        framebuffers[i] = device.GetLogical().CreateFramebuffer(ci);
+        framebuffers[i] = CreateFramebuffer(image_view, size);
    }
 }

@@ -752,15 +767,13 @@ void VKBlitScreen::UpdateDescriptorSet(std::size_t image_index, VkImageView imag
    device.GetLogical().UpdateDescriptorSets(std::array{ubo_write, sampler_write}, {});
 }

-void VKBlitScreen::SetUniformData(BufferData& data,
-                                  const Tegra::FramebufferConfig& framebuffer) const {
-    const auto& layout = render_window.GetFramebufferLayout();
+void VKBlitScreen::SetUniformData(BufferData& data, const Layout::FramebufferLayout layout) const {
    data.uniform.modelview_matrix =
        MakeOrthographicMatrix(static_cast<f32>(layout.width), static_cast<f32>(layout.height));
 }

-void VKBlitScreen::SetVertexData(BufferData& data,
-                                 const Tegra::FramebufferConfig& framebuffer) const {
+void VKBlitScreen::SetVertexData(BufferData& data, const Tegra::FramebufferConfig& framebuffer,
+                                 const Layout::FramebufferLayout layout) const {
    const auto& framebuffer_transform_flags = framebuffer.transform_flags;
    const auto& framebuffer_crop_rect = framebuffer.crop_rect;

@@ -798,7 +811,7 @@ void VKBlitScreen::SetVertexData(BufferData& data,
                  static_cast<f32>(screen_info.height);
    }

-    const auto& screen = render_window.GetFramebufferLayout().screen;
+    const auto& screen = layout.screen;
    const auto x = static_cast<f32>(screen.left);
    const auto y = static_cast<f32>(screen.top);
    const auto w = static_cast<f32>(screen.GetWidth());
--- a/src/video_core/renderer_vulkan/vk_blit_screen.h
+++ b/src/video_core/renderer_vulkan/vk_blit_screen.h
@@ -56,8 +56,16 @@ public:
    void Recreate();

    [[nodiscard]] VkSemaphore Draw(const Tegra::FramebufferConfig& framebuffer,
+                                   const VkFramebuffer& host_framebuffer,
+                                   const Layout::FramebufferLayout layout, VkExtent2D render_area,
                                   bool use_accelerated);

+    [[nodiscard]] VkSemaphore DrawToSwapchain(const Tegra::FramebufferConfig& framebuffer,
+                                              bool use_accelerated);
+
+    [[nodiscard]] vk::Framebuffer CreateFramebuffer(const VkImageView& image_view,
+                                                    VkExtent2D extent);
+
 private:
    struct BufferData;

@@ -81,8 +89,9 @@ private:
    void CreateRawImages(const Tegra::FramebufferConfig& framebuffer);

    void UpdateDescriptorSet(std::size_t image_index, VkImageView image_view) const;
-    void SetUniformData(BufferData& data, const Tegra::FramebufferConfig& framebuffer) const;
-    void SetVertexData(BufferData& data, const Tegra::FramebufferConfig& framebuffer) const;
+    void SetUniformData(BufferData& data, const Layout::FramebufferLayout layout) const;
+    void SetVertexData(BufferData& data, const Tegra::FramebufferConfig& framebuffer,
+                       const Layout::FramebufferLayout layout) const;

    u64 CalculateBufferSize(const Tegra::FramebufferConfig& framebuffer) const;
    u64 GetRawImageOffset(const Tegra::FramebufferConfig& framebuffer,
--- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp
@@ -30,16 +30,12 @@
 namespace Vulkan {

 using Tegra::Texture::SWIZZLE_TABLE;
-using Tegra::Texture::ASTC::ASTC_ENCODINGS_VALUES;
-using namespace Tegra::Texture::ASTC;

 namespace {

 constexpr u32 ASTC_BINDING_INPUT_BUFFER = 0;
-constexpr u32 ASTC_BINDING_ENC_BUFFER = 1;
-constexpr u32 ASTC_BINDING_SWIZZLE_BUFFER = 2;
-constexpr u32 ASTC_BINDING_OUTPUT_IMAGE = 3;
-constexpr size_t ASTC_NUM_BINDINGS = 4;
+constexpr u32 ASTC_BINDING_OUTPUT_IMAGE = 1;
+constexpr size_t ASTC_NUM_BINDINGS = 2;

 template <size_t size>
 inline constexpr VkPushConstantRange COMPUTE_PUSH_CONSTANT_RANGE{
@@ -75,7 +71,7 @@ constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{
    .score = 2,
 };

-constexpr std::array<VkDescriptorSetLayoutBinding, 4> ASTC_DESCRIPTOR_SET_BINDINGS{{
+constexpr std::array<VkDescriptorSetLayoutBinding, ASTC_NUM_BINDINGS> ASTC_DESCRIPTOR_SET_BINDINGS{{
    {
        .binding = ASTC_BINDING_INPUT_BUFFER,
        .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
@@ -83,20 +79,6 @@ constexpr std::array<VkDescriptorSetLayoutBinding, 4> ASTC_DESCRIPTOR_SET_BINDIN
        .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
        .pImmutableSamplers = nullptr,
    },
-    {
-        .binding = ASTC_BINDING_ENC_BUFFER,
-        .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
-        .descriptorCount = 1,
-        .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
-        .pImmutableSamplers = nullptr,
-    },
-    {
-        .binding = ASTC_BINDING_SWIZZLE_BUFFER,
-        .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
-        .descriptorCount = 1,
-        .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
-        .pImmutableSamplers = nullptr,
-    },
    {
        .binding = ASTC_BINDING_OUTPUT_IMAGE,
        .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
@@ -108,12 +90,12 @@ constexpr std::array<VkDescriptorSetLayoutBinding, 4> ASTC_DESCRIPTOR_SET_BINDIN

 constexpr DescriptorBankInfo ASTC_BANK_INFO{
    .uniform_buffers = 0,
-    .storage_buffers = 3,
+    .storage_buffers = 1,
    .texture_buffers = 0,
    .image_buffers = 0,
    .textures = 0,
    .images = 1,
-    .score = 4,
+    .score = 2,
 };

 constexpr VkDescriptorUpdateTemplateEntryKHR INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLATE{
@@ -135,22 +117,6 @@ constexpr std::array<VkDescriptorUpdateTemplateEntryKHR, ASTC_NUM_BINDINGS>
            .offset = ASTC_BINDING_INPUT_BUFFER * sizeof(DescriptorUpdateEntry),
            .stride = sizeof(DescriptorUpdateEntry),
        },
-        {
-            .dstBinding = ASTC_BINDING_ENC_BUFFER,
-            .dstArrayElement = 0,
-            .descriptorCount = 1,
-            .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
-            .offset = ASTC_BINDING_ENC_BUFFER * sizeof(DescriptorUpdateEntry),
-            .stride = sizeof(DescriptorUpdateEntry),
-        },
-        {
-            .dstBinding = ASTC_BINDING_SWIZZLE_BUFFER,
-            .dstArrayElement = 0,
-            .descriptorCount = 1,
-            .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
-            .offset = ASTC_BINDING_SWIZZLE_BUFFER * sizeof(DescriptorUpdateEntry),
-            .stride = sizeof(DescriptorUpdateEntry),
-        },
        {
            .dstBinding = ASTC_BINDING_OUTPUT_IMAGE,
            .dstArrayElement = 0,
@@ -163,7 +129,6 @@ constexpr std::array<VkDescriptorUpdateTemplateEntryKHR, ASTC_NUM_BINDINGS>

 struct AstcPushConstants {
    std::array<u32, 2> blocks_dims;
-    u32 bytes_per_block_log2;
    u32 layer_stride;
    u32 block_size;
    u32 x_shift;
@@ -354,46 +319,6 @@ ASTCDecoderPass::ASTCDecoderPass(const Device& device_, VKScheduler& scheduler_,

 ASTCDecoderPass::~ASTCDecoderPass() = default;

-void ASTCDecoderPass::MakeDataBuffer() {
-    constexpr size_t TOTAL_BUFFER_SIZE = sizeof(ASTC_ENCODINGS_VALUES) + sizeof(SWIZZLE_TABLE);
-    data_buffer = device.GetLogical().CreateBuffer(VkBufferCreateInfo{
-        .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
-        .pNext = nullptr,
-        .flags = 0,
-        .size = TOTAL_BUFFER_SIZE,
-        .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
-        .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
-        .queueFamilyIndexCount = 0,
-        .pQueueFamilyIndices = nullptr,
-    });
-    data_buffer_commit = memory_allocator.Commit(data_buffer, MemoryUsage::Upload);
-
-    const auto staging_ref = staging_buffer_pool.Request(TOTAL_BUFFER_SIZE, MemoryUsage::Upload);
-    std::memcpy(staging_ref.mapped_span.data(), &ASTC_ENCODINGS_VALUES,
-                sizeof(ASTC_ENCODINGS_VALUES));
-    // Tack on the swizzle table at the end of the buffer
-    std::memcpy(staging_ref.mapped_span.data() + sizeof(ASTC_ENCODINGS_VALUES), &SWIZZLE_TABLE,
-                sizeof(SWIZZLE_TABLE));
-
-    scheduler.Record([src = staging_ref.buffer, offset = staging_ref.offset, dst = *data_buffer,
-                      TOTAL_BUFFER_SIZE](vk::CommandBuffer cmdbuf) {
-        static constexpr VkMemoryBarrier write_barrier{
-            .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
-            .pNext = nullptr,
-            .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
-            .dstAccessMask = VK_ACCESS_SHADER_READ_BIT,
-        };
-        const VkBufferCopy copy{
-            .srcOffset = offset,
-            .dstOffset = 0,
-            .size = TOTAL_BUFFER_SIZE,
-        };
-        cmdbuf.CopyBuffer(src, dst, copy);
-        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
-                               0, write_barrier);
-    });
-}
-
 void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
                               std::span<const VideoCommon::SwizzleParameters> swizzles) {
    using namespace VideoCommon::Accelerated;
@@ -402,9 +327,6 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
        VideoCore::Surface::DefaultBlockHeight(image.info.format),
    };
    scheduler.RequestOutsideRenderPassOperationContext();
-    if (!data_buffer) {
-        MakeDataBuffer();
-    }
    const VkPipeline vk_pipeline = *pipeline;
    const VkImageAspectFlags aspect_mask = image.AspectMask();
    const VkImage vk_image = image.Handle();
@@ -436,16 +358,13 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
        });
    for (const VideoCommon::SwizzleParameters& swizzle : swizzles) {
        const size_t input_offset = swizzle.buffer_offset + map.offset;
-        const u32 num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 32U);
-        const u32 num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 32U);
+        const u32 num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 8U);
+        const u32 num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 8U);
        const u32 num_dispatches_z = image.info.resources.layers;

        update_descriptor_queue.Acquire();
        update_descriptor_queue.AddBuffer(map.buffer, input_offset,
                                          image.guest_size_bytes - swizzle.buffer_offset);
-        update_descriptor_queue.AddBuffer(*data_buffer, 0, sizeof(ASTC_ENCODINGS_VALUES));
-        update_descriptor_queue.AddBuffer(*data_buffer, sizeof(ASTC_ENCODINGS_VALUES),
-                                          sizeof(SWIZZLE_TABLE));
        update_descriptor_queue.AddImage(image.StorageImageView(swizzle.level));
        const void* const descriptor_data{update_descriptor_queue.UpdateData()};

@@ -453,11 +372,11 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
        const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info);
        ASSERT(params.origin == (std::array<u32, 3>{0, 0, 0}));
        ASSERT(params.destination == (std::array<s32, 3>{0, 0, 0}));
+        ASSERT(params.bytes_per_block_log2 == 4);
        scheduler.Record([this, num_dispatches_x, num_dispatches_y, num_dispatches_z, block_dims,
                          params, descriptor_data](vk::CommandBuffer cmdbuf) {
            const AstcPushConstants uniforms{
                .blocks_dims = block_dims,
-                .bytes_per_block_log2 = params.bytes_per_block_log2,
                .layer_stride = params.layer_stride,
                .block_size = params.block_size,
                .x_shift = params.x_shift,
--- a/src/video_core/renderer_vulkan/vk_compute_pass.h
+++ b/src/video_core/renderer_vulkan/vk_compute_pass.h
@@ -96,15 +96,10 @@ public:
                  std::span<const VideoCommon::SwizzleParameters> swizzles);

 private:
-    void MakeDataBuffer();
-
    VKScheduler& scheduler;
    StagingBufferPool& staging_buffer_pool;
    VKUpdateDescriptorQueue& update_descriptor_queue;
    MemoryAllocator& memory_allocator;
-
-    vk::Buffer data_buffer;
-    MemoryCommit data_buffer_commit;
 };

 } // namespace Vulkan
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -281,7 +281,7 @@ PipelineCache::PipelineCache(RasterizerVulkan& rasterizer_, Tegra::Engines::Maxw
        .supported_spirv = device.IsKhrSpirv1_4Supported() ? 0x00010400U : 0x00010000U,
        .unified_descriptor_binding = true,
        .support_descriptor_aliasing = true,
-        .support_int8 = true,
+        .support_int8 = device.IsInt8Supported(),
        .support_int16 = device.IsShaderInt16Supported(),
        .support_int64 = device.IsShaderInt64Supported(),
        .support_vertex_instance_id = false,
@@ -325,6 +325,8 @@ PipelineCache::PipelineCache(RasterizerVulkan& rasterizer_, Tegra::Engines::Maxw
    host_info = Shader::HostTranslateInfo{
        .support_float16 = device.IsFloat16Supported(),
        .support_int64 = device.IsShaderInt64Supported(),
+        .needs_demote_reorder = driver_id == VK_DRIVER_ID_AMD_PROPRIETARY_KHR ||
+                                driver_id == VK_DRIVER_ID_AMD_OPEN_SOURCE_KHR,
    };
 }

--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -32,7 +32,7 @@
 #include "video_core/renderer_vulkan/vk_texture_cache.h"
 #include "video_core/renderer_vulkan/vk_update_descriptor.h"
 #include "video_core/shader_cache.h"
-#include "video_core/texture_cache/texture_cache.h"
+#include "video_core/texture_cache/texture_cache_base.h"
 #include "video_core/vulkan_common/vulkan_device.h"
 #include "video_core/vulkan_common/vulkan_wrapper.h"

--- a/src/video_core/renderer_vulkan/vk_scheduler.cpp
+++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp
@@ -55,14 +55,14 @@ VKScheduler::~VKScheduler() {
    worker_thread.join();
 }

-void VKScheduler::Flush(VkSemaphore semaphore) {
-    SubmitExecution(semaphore);
+void VKScheduler::Flush(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore) {
+    SubmitExecution(signal_semaphore, wait_semaphore);
    AllocateNewContext();
 }

-void VKScheduler::Finish(VkSemaphore semaphore) {
+void VKScheduler::Finish(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore) {
    const u64 presubmit_tick = CurrentTick();
-    SubmitExecution(semaphore);
+    SubmitExecution(signal_semaphore, wait_semaphore);
    WaitWorker();
    Wait(presubmit_tick);
    AllocateNewContext();
@@ -171,37 +171,41 @@ void VKScheduler::AllocateWorkerCommandBuffer() {
    });
 }

-void VKScheduler::SubmitExecution(VkSemaphore semaphore) {
+void VKScheduler::SubmitExecution(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore) {
    EndPendingOperations();
    InvalidateState();

    const u64 signal_value = master_semaphore->NextTick();
-    Record([semaphore, signal_value, this](vk::CommandBuffer cmdbuf) {
+    Record([signal_semaphore, wait_semaphore, signal_value, this](vk::CommandBuffer cmdbuf) {
        cmdbuf.End();
-
-        const u32 num_signal_semaphores = semaphore ? 2U : 1U;
-
-        const u64 wait_value = signal_value - 1;
-        const VkPipelineStageFlags wait_stage_mask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;
-
        const VkSemaphore timeline_semaphore = master_semaphore->Handle();
+
+        const u32 num_signal_semaphores = signal_semaphore ? 2U : 1U;
        const std::array signal_values{signal_value, u64(0)};
-        const std::array signal_semaphores{timeline_semaphore, semaphore};
+        const std::array signal_semaphores{timeline_semaphore, signal_semaphore};
+
+        const u32 num_wait_semaphores = wait_semaphore ? 2U : 1U;
+        const std::array wait_values{signal_value - 1, u64(1)};
+        const std::array wait_semaphores{timeline_semaphore, wait_semaphore};
+        static constexpr std::array<VkPipelineStageFlags, 2> wait_stage_masks{
+            VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
+            VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
+        };

        const VkTimelineSemaphoreSubmitInfoKHR timeline_si{
            .sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR,
            .pNext = nullptr,
-            .waitSemaphoreValueCount = 1,
-            .pWaitSemaphoreValues = &wait_value,
+            .waitSemaphoreValueCount = num_wait_semaphores,
+            .pWaitSemaphoreValues = wait_values.data(),
            .signalSemaphoreValueCount = num_signal_semaphores,
            .pSignalSemaphoreValues = signal_values.data(),
        };
        const VkSubmitInfo submit_info{
            .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
            .pNext = &timeline_si,
-            .waitSemaphoreCount = 1,
-            .pWaitSemaphores = &timeline_semaphore,
-            .pWaitDstStageMask = &wait_stage_mask,
+            .waitSemaphoreCount = num_wait_semaphores,
+            .pWaitSemaphores = wait_semaphores.data(),
+            .pWaitDstStageMask = wait_stage_masks.data(),
            .commandBufferCount = 1,
            .pCommandBuffers = cmdbuf.address(),
            .signalSemaphoreCount = num_signal_semaphores,
--- a/src/video_core/renderer_vulkan/vk_scheduler.h
+++ b/src/video_core/renderer_vulkan/vk_scheduler.h
@@ -34,10 +34,10 @@ public:
    ~VKScheduler();

    /// Sends the current execution context to the GPU.
-    void Flush(VkSemaphore semaphore = nullptr);
+    void Flush(VkSemaphore signal_semaphore = nullptr, VkSemaphore wait_semaphore = nullptr);

    /// Sends the current execution context to the GPU and waits for it to complete.
-    void Finish(VkSemaphore semaphore = nullptr);
+    void Finish(VkSemaphore signal_semaphore = nullptr, VkSemaphore wait_semaphore = nullptr);

    /// Waits for the worker thread to finish executing everything. After this function returns it's
    /// safe to touch worker resources.
@@ -191,7 +191,7 @@ private:

    void AllocateWorkerCommandBuffer();

-    void SubmitExecution(VkSemaphore semaphore);
+    void SubmitExecution(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore);

    void AllocateNewContext();

--- a/src/video_core/renderer_vulkan/vk_swapchain.cpp
+++ b/src/video_core/renderer_vulkan/vk_swapchain.cpp
@@ -107,14 +107,12 @@ void VKSwapchain::AcquireNextImage() {
 }

 void VKSwapchain::Present(VkSemaphore render_semaphore) {
-    const VkSemaphore present_semaphore{*present_semaphores[frame_index]};
-    const std::array<VkSemaphore, 2> semaphores{present_semaphore, render_semaphore};
    const auto present_queue{device.GetPresentQueue()};
    const VkPresentInfoKHR present_info{
        .sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR,
        .pNext = nullptr,
-        .waitSemaphoreCount = render_semaphore ? 2U : 1U,
-        .pWaitSemaphores = semaphores.data(),
+        .waitSemaphoreCount = render_semaphore ? 1U : 0U,
+        .pWaitSemaphores = &render_semaphore,
        .swapchainCount = 1,
        .pSwapchains = swapchain.address(),
        .pImageIndices = &image_index,
--- a/src/video_core/renderer_vulkan/vk_swapchain.h
+++ b/src/video_core/renderer_vulkan/vk_swapchain.h
@@ -72,6 +72,10 @@ public:
        return image_format;
    }

+    VkSemaphore CurrentPresentSemaphore() const {
+        return *present_semaphores[frame_index];
+    }
+
 private:
    void CreateSwapchain(const VkSurfaceCapabilitiesKHR& capabilities, u32 width, u32 height,
                         bool srgb);
--- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp
@@ -19,6 +19,8 @@
 #include "video_core/renderer_vulkan/vk_scheduler.h"
 #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
 #include "video_core/renderer_vulkan/vk_texture_cache.h"
+#include "video_core/texture_cache/formatter.h"
+#include "video_core/texture_cache/samples_helper.h"
 #include "video_core/vulkan_common/vulkan_device.h"
 #include "video_core/vulkan_common/vulkan_memory_allocator.h"
 #include "video_core/vulkan_common/vulkan_wrapper.h"
--- a/src/video_core/renderer_vulkan/vk_texture_cache.h
+++ b/src/video_core/renderer_vulkan/vk_texture_cache.h
@@ -9,7 +9,7 @@

 #include "shader_recompiler/shader_info.h"
 #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
-#include "video_core/texture_cache/texture_cache.h"
+#include "video_core/texture_cache/texture_cache_base.h"
 #include "video_core/vulkan_common/vulkan_memory_allocator.h"
 #include "video_core/vulkan_common/vulkan_wrapper.h"

--- a/src/video_core/renderer_vulkan/vk_texture_cache_base.cpp
+++ b/src/video_core/renderer_vulkan/vk_texture_cache_base.cpp
@@ -0,0 +1,10 @@
+// Copyright 2021 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "video_core/renderer_vulkan/vk_texture_cache.h"
+#include "video_core/texture_cache/texture_cache.h"
+
+namespace VideoCommon {
+template class VideoCommon::TextureCache<Vulkan::TextureCacheParams>;
+}
--- a/src/video_core/texture_cache/image_base.h
+++ b/src/video_core/texture_cache/image_base.h
@@ -80,7 +80,7 @@ struct ImageBase {
    VAddr cpu_addr_end = 0;

    u64 modification_tick = 0;
-    u64 frame_tick = 0;
+    size_t lru_index = SIZE_MAX;

    std::array<u32, MAX_MIP_LEVELS> mip_level_offsets{};

--- a/src/video_core/texture_cache/image_view_info.cpp
+++ b/src/video_core/texture_cache/image_view_info.cpp
@@ -6,7 +6,7 @@

 #include "common/assert.h"
 #include "video_core/texture_cache/image_view_info.h"
-#include "video_core/texture_cache/texture_cache.h"
+#include "video_core/texture_cache/texture_cache_base.h"
 #include "video_core/texture_cache/types.h"
 #include "video_core/textures/texture.h"

@@ -14,6 +14,8 @@ namespace VideoCommon {

 namespace {

+using Tegra::Texture::TextureType;
+
 constexpr u8 RENDER_TARGET_SWIZZLE = std::numeric_limits<u8>::max();

 [[nodiscard]] u8 CastSwizzle(SwizzleSource source) {
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -4,48 +4,10 @@

 #pragma once

-#include <algorithm>
-#include <array>
-#include <bit>
-#include <memory>
-#include <mutex>
-#include <optional>
-#include <span>
-#include <type_traits>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-
-#include <boost/container/small_vector.hpp>
-
 #include "common/alignment.h"
-#include "common/common_types.h"
-#include "common/literals.h"
-#include "common/logging/log.h"
-#include "common/settings.h"
-#include "video_core/compatible_formats.h"
-#include "video_core/delayed_destruction_ring.h"
 #include "video_core/dirty_flags.h"
-#include "video_core/engines/fermi_2d.h"
-#include "video_core/engines/kepler_compute.h"
-#include "video_core/engines/maxwell_3d.h"
-#include "video_core/memory_manager.h"
-#include "video_core/rasterizer_interface.h"
-#include "video_core/surface.h"
-#include "video_core/texture_cache/descriptor_table.h"
-#include "video_core/texture_cache/format_lookup_table.h"
-#include "video_core/texture_cache/formatter.h"
-#include "video_core/texture_cache/image_base.h"
-#include "video_core/texture_cache/image_info.h"
-#include "video_core/texture_cache/image_view_base.h"
-#include "video_core/texture_cache/image_view_info.h"
-#include "video_core/texture_cache/render_targets.h"
 #include "video_core/texture_cache/samples_helper.h"
-#include "video_core/texture_cache/slot_vector.h"
-#include "video_core/texture_cache/types.h"
-#include "video_core/texture_cache/util.h"
-#include "video_core/textures/texture.h"
+#include "video_core/texture_cache/texture_cache_base.h"

 namespace VideoCommon {

@@ -61,352 +23,6 @@ using VideoCore::Surface::PixelFormatFromRenderTargetFormat;
 using VideoCore::Surface::SurfaceType;
 using namespace Common::Literals;

-template <class P>
-class TextureCache {
-    /// Address shift for caching images into a hash table
-    static constexpr u64 PAGE_BITS = 20;
-
-    /// Enables debugging features to the texture cache
-    static constexpr bool ENABLE_VALIDATION = P::ENABLE_VALIDATION;
-    /// Implement blits as copies between framebuffers
-    static constexpr bool FRAMEBUFFER_BLITS = P::FRAMEBUFFER_BLITS;
-    /// True when some copies have to be emulated
-    static constexpr bool HAS_EMULATED_COPIES = P::HAS_EMULATED_COPIES;
-    /// True when the API can provide info about the memory of the device.
-    static constexpr bool HAS_DEVICE_MEMORY_INFO = P::HAS_DEVICE_MEMORY_INFO;
-
-    /// Image view ID for null descriptors
-    static constexpr ImageViewId NULL_IMAGE_VIEW_ID{0};
-    /// Sampler ID for bugged sampler ids
-    static constexpr SamplerId NULL_SAMPLER_ID{0};
-
-    static constexpr u64 DEFAULT_EXPECTED_MEMORY = 1_GiB;
-    static constexpr u64 DEFAULT_CRITICAL_MEMORY = 2_GiB;
-
-    using Runtime = typename P::Runtime;
-    using Image = typename P::Image;
-    using ImageAlloc = typename P::ImageAlloc;
-    using ImageView = typename P::ImageView;
-    using Sampler = typename P::Sampler;
-    using Framebuffer = typename P::Framebuffer;
-
-    struct BlitImages {
-        ImageId dst_id;
-        ImageId src_id;
-        PixelFormat dst_format;
-        PixelFormat src_format;
-    };
-
-    template <typename T>
-    struct IdentityHash {
-        [[nodiscard]] size_t operator()(T value) const noexcept {
-            return static_cast<size_t>(value);
-        }
-    };
-
-public:
-    explicit TextureCache(Runtime&, VideoCore::RasterizerInterface&, Tegra::Engines::Maxwell3D&,
-                          Tegra::Engines::KeplerCompute&, Tegra::MemoryManager&);
-
-    /// Notify the cache that a new frame has been queued
-    void TickFrame();
-
-    /// Return a constant reference to the given image view id
-    [[nodiscard]] const ImageView& GetImageView(ImageViewId id) const noexcept;
-
-    /// Return a reference to the given image view id
-    [[nodiscard]] ImageView& GetImageView(ImageViewId id) noexcept;
-
-    /// Mark an image as modified from the GPU
-    void MarkModification(ImageId id) noexcept;
-
-    /// Fill image_view_ids with the graphics images in indices
-    void FillGraphicsImageViews(std::span<const u32> indices,
-                                std::span<ImageViewId> image_view_ids);
-
-    /// Fill image_view_ids with the compute images in indices
-    void FillComputeImageViews(std::span<const u32> indices, std::span<ImageViewId> image_view_ids);
-
-    /// Get the sampler from the graphics descriptor table in the specified index
-    Sampler* GetGraphicsSampler(u32 index);
-
-    /// Get the sampler from the compute descriptor table in the specified index
-    Sampler* GetComputeSampler(u32 index);
-
-    /// Refresh the state for graphics image view and sampler descriptors
-    void SynchronizeGraphicsDescriptors();
-
-    /// Refresh the state for compute image view and sampler descriptors
-    void SynchronizeComputeDescriptors();
-
-    /// Update bound render targets and upload memory if necessary
-    /// @param is_clear True when the render targets are being used for clears
-    void UpdateRenderTargets(bool is_clear);
-
-    /// Find a framebuffer with the currently bound render targets
-    /// UpdateRenderTargets should be called before this
-    Framebuffer* GetFramebuffer();
-
-    /// Mark images in a range as modified from the CPU
-    void WriteMemory(VAddr cpu_addr, size_t size);
-
-    /// Download contents of host images to guest memory in a region
-    void DownloadMemory(VAddr cpu_addr, size_t size);
-
-    /// Remove images in a region
-    void UnmapMemory(VAddr cpu_addr, size_t size);
-
-    /// Remove images in a region
-    void UnmapGPUMemory(GPUVAddr gpu_addr, size_t size);
-
-    /// Blit an image with the given parameters
-    void BlitImage(const Tegra::Engines::Fermi2D::Surface& dst,
-                   const Tegra::Engines::Fermi2D::Surface& src,
-                   const Tegra::Engines::Fermi2D::Config& copy);
-
-    /// Invalidate the contents of the color buffer index
-    /// These contents become unspecified, the cache can assume aggressive optimizations.
-    void InvalidateColorBuffer(size_t index);
-
-    /// Invalidate the contents of the depth buffer
-    /// These contents become unspecified, the cache can assume aggressive optimizations.
-    void InvalidateDepthBuffer();
-
-    /// Try to find a cached image view in the given CPU address
-    [[nodiscard]] ImageView* TryFindFramebufferImageView(VAddr cpu_addr);
-
-    /// Return true when there are uncommitted images to be downloaded
-    [[nodiscard]] bool HasUncommittedFlushes() const noexcept;
-
-    /// Return true when the caller should wait for async downloads
-    [[nodiscard]] bool ShouldWaitAsyncFlushes() const noexcept;
-
-    /// Commit asynchronous downloads
-    void CommitAsyncFlushes();
-
-    /// Pop asynchronous downloads
-    void PopAsyncFlushes();
-
-    /// Return true when a CPU region is modified from the GPU
-    [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size);
-
-    std::mutex mutex;
-
-private:
-    /// Iterate over all page indices in a range
-    template <typename Func>
-    static void ForEachCPUPage(VAddr addr, size_t size, Func&& func) {
-        static constexpr bool RETURNS_BOOL = std::is_same_v<std::invoke_result<Func, u64>, bool>;
-        const u64 page_end = (addr + size - 1) >> PAGE_BITS;
-        for (u64 page = addr >> PAGE_BITS; page <= page_end; ++page) {
-            if constexpr (RETURNS_BOOL) {
-                if (func(page)) {
-                    break;
-                }
-            } else {
-                func(page);
-            }
-        }
-    }
-
-    template <typename Func>
-    static void ForEachGPUPage(GPUVAddr addr, size_t size, Func&& func) {
-        static constexpr bool RETURNS_BOOL = std::is_same_v<std::invoke_result<Func, u64>, bool>;
-        const u64 page_end = (addr + size - 1) >> PAGE_BITS;
-        for (u64 page = addr >> PAGE_BITS; page <= page_end; ++page) {
-            if constexpr (RETURNS_BOOL) {
-                if (func(page)) {
-                    break;
-                }
-            } else {
-                func(page);
-            }
-        }
-    }
-
-    /// Runs the Garbage Collector.
-    void RunGarbageCollector();
-
-    /// Fills image_view_ids in the image views in indices
-    void FillImageViews(DescriptorTable<TICEntry>& table,
-                        std::span<ImageViewId> cached_image_view_ids, std::span<const u32> indices,
-                        std::span<ImageViewId> image_view_ids);
-
-    /// Find or create an image view in the guest descriptor table
-    ImageViewId VisitImageView(DescriptorTable<TICEntry>& table,
-                               std::span<ImageViewId> cached_image_view_ids, u32 index);
-
-    /// Find or create a framebuffer with the given render target parameters
-    FramebufferId GetFramebufferId(const RenderTargets& key);
-
-    /// Refresh the contents (pixel data) of an image
-    void RefreshContents(Image& image, ImageId image_id);
-
-    /// Upload data from guest to an image
-    template <typename StagingBuffer>
-    void UploadImageContents(Image& image, StagingBuffer& staging_buffer);
-
-    /// Find or create an image view from a guest descriptor
-    [[nodiscard]] ImageViewId FindImageView(const TICEntry& config);
-
-    /// Create a new image view from a guest descriptor
-    [[nodiscard]] ImageViewId CreateImageView(const TICEntry& config);
-
-    /// Find or create an image from the given parameters
-    [[nodiscard]] ImageId FindOrInsertImage(const ImageInfo& info, GPUVAddr gpu_addr,
-                                            RelaxedOptions options = RelaxedOptions{});
-
-    /// Find an image from the given parameters
-    [[nodiscard]] ImageId FindImage(const ImageInfo& info, GPUVAddr gpu_addr,
-                                    RelaxedOptions options);
-
-    /// Create an image from the given parameters
-    [[nodiscard]] ImageId InsertImage(const ImageInfo& info, GPUVAddr gpu_addr,
-                                      RelaxedOptions options);
-
-    /// Create a new image and join perfectly matching existing images
-    /// Remove joined images from the cache
-    [[nodiscard]] ImageId JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VAddr cpu_addr);
-
-    /// Return a blit image pair from the given guest blit parameters
-    [[nodiscard]] BlitImages GetBlitImages(const Tegra::Engines::Fermi2D::Surface& dst,
-                                           const Tegra::Engines::Fermi2D::Surface& src);
-
-    /// Find or create a sampler from a guest descriptor sampler
-    [[nodiscard]] SamplerId FindSampler(const TSCEntry& config);
-
-    /// Find or create an image view for the given color buffer index
-    [[nodiscard]] ImageViewId FindColorBuffer(size_t index, bool is_clear);
-
-    /// Find or create an image view for the depth buffer
-    [[nodiscard]] ImageViewId FindDepthBuffer(bool is_clear);
-
-    /// Find or create a view for a render target with the given image parameters
-    [[nodiscard]] ImageViewId FindRenderTargetView(const ImageInfo& info, GPUVAddr gpu_addr,
-                                                   bool is_clear);
-
-    /// Iterates over all the images in a region calling func
-    template <typename Func>
-    void ForEachImageInRegion(VAddr cpu_addr, size_t size, Func&& func);
-
-    template <typename Func>
-    void ForEachImageInRegionGPU(GPUVAddr gpu_addr, size_t size, Func&& func);
-
-    template <typename Func>
-    void ForEachSparseImageInRegion(GPUVAddr gpu_addr, size_t size, Func&& func);
-
-    /// Iterates over all the images in a region calling func
-    template <typename Func>
-    void ForEachSparseSegment(ImageBase& image, Func&& func);
-
-    /// Find or create an image view in the given image with the passed parameters
-    [[nodiscard]] ImageViewId FindOrEmplaceImageView(ImageId image_id, const ImageViewInfo& info);
-
-    /// Register image in the page table
-    void RegisterImage(ImageId image);
-
-    /// Unregister image from the page table
-    void UnregisterImage(ImageId image);
-
-    /// Track CPU reads and writes for image
-    void TrackImage(ImageBase& image, ImageId image_id);
-
-    /// Stop tracking CPU reads and writes for image
-    void UntrackImage(ImageBase& image, ImageId image_id);
-
-    /// Delete image from the cache
-    void DeleteImage(ImageId image);
-
-    /// Remove image views references from the cache
-    void RemoveImageViewReferences(std::span<const ImageViewId> removed_views);
-
-    /// Remove framebuffers using the given image views from the cache
-    void RemoveFramebuffers(std::span<const ImageViewId> removed_views);
-
-    /// Mark an image as modified from the GPU
-    void MarkModification(ImageBase& image) noexcept;
-
-    /// Synchronize image aliases, copying data if needed
-    void SynchronizeAliases(ImageId image_id);
-
-    /// Prepare an image to be used
-    void PrepareImage(ImageId image_id, bool is_modification, bool invalidate);
-
-    /// Prepare an image view to be used
-    void PrepareImageView(ImageViewId image_view_id, bool is_modification, bool invalidate);
-
-    /// Execute copies from one image to the other, even if they are incompatible
-    void CopyImage(ImageId dst_id, ImageId src_id, std::span<const ImageCopy> copies);
-
-    /// Bind an image view as render target, downloading resources preemtively if needed
-    void BindRenderTarget(ImageViewId* old_id, ImageViewId new_id);
-
-    /// Create a render target from a given image and image view parameters
-    [[nodiscard]] std::pair<FramebufferId, ImageViewId> RenderTargetFromImage(
-        ImageId, const ImageViewInfo& view_info);
-
-    /// Returns true if the current clear parameters clear the whole image of a given image view
-    [[nodiscard]] bool IsFullClear(ImageViewId id);
-
-    Runtime& runtime;
-    VideoCore::RasterizerInterface& rasterizer;
-    Tegra::Engines::Maxwell3D& maxwell3d;
-    Tegra::Engines::KeplerCompute& kepler_compute;
-    Tegra::MemoryManager& gpu_memory;
-
-    DescriptorTable<TICEntry> graphics_image_table{gpu_memory};
-    DescriptorTable<TSCEntry> graphics_sampler_table{gpu_memory};
-    std::vector<SamplerId> graphics_sampler_ids;
-    std::vector<ImageViewId> graphics_image_view_ids;
-
-    DescriptorTable<TICEntry> compute_image_table{gpu_memory};
-    DescriptorTable<TSCEntry> compute_sampler_table{gpu_memory};
-    std::vector<SamplerId> compute_sampler_ids;
-    std::vector<ImageViewId> compute_image_view_ids;
-
-    RenderTargets render_targets;
-
-    std::unordered_map<TICEntry, ImageViewId> image_views;
-    std::unordered_map<TSCEntry, SamplerId> samplers;
-    std::unordered_map<RenderTargets, FramebufferId> framebuffers;
-
-    std::unordered_map<u64, std::vector<ImageMapId>, IdentityHash<u64>> page_table;
-    std::unordered_map<u64, std::vector<ImageId>, IdentityHash<u64>> gpu_page_table;
-    std::unordered_map<u64, std::vector<ImageId>, IdentityHash<u64>> sparse_page_table;
-
-    std::unordered_map<ImageId, std::vector<ImageViewId>> sparse_views;
-
-    VAddr virtual_invalid_space{};
-
-    bool has_deleted_images = false;
-    u64 total_used_memory = 0;
-    u64 minimum_memory;
-    u64 expected_memory;
-    u64 critical_memory;
-
-    SlotVector<Image> slot_images;
-    SlotVector<ImageMapView> slot_map_views;
-    SlotVector<ImageView> slot_image_views;
-    SlotVector<ImageAlloc> slot_image_allocs;
-    SlotVector<Sampler> slot_samplers;
-    SlotVector<Framebuffer> slot_framebuffers;
-
-    // TODO: This data structure is not optimal and it should be reworked
-    std::vector<ImageId> uncommitted_downloads;
-    std::queue<std::vector<ImageId>> committed_downloads;
-
-    static constexpr size_t TICKS_TO_DESTROY = 6;
-    DelayedDestructionRing<Image, TICKS_TO_DESTROY> sentenced_images;
-    DelayedDestructionRing<ImageView, TICKS_TO_DESTROY> sentenced_image_view;
-    DelayedDestructionRing<Framebuffer, TICKS_TO_DESTROY> sentenced_framebuffers;
-
-    std::unordered_map<GPUVAddr, ImageAllocId> image_allocs_table;
-
-    u64 modification_tick = 0;
-    u64 frame_tick = 0;
-    typename SlotVector<Image>::Iterator deletion_iterator;
-};
-
 template <class P>
 TextureCache<P>::TextureCache(Runtime& runtime_, VideoCore::RasterizerInterface& rasterizer_,
                              Tegra::Engines::Maxwell3D& maxwell3d_,
@@ -426,8 +42,6 @@ TextureCache<P>::TextureCache(Runtime& runtime_, VideoCore::RasterizerInterface&
    void(slot_image_views.insert(runtime, NullImageParams{}));
    void(slot_samplers.insert(runtime, sampler_descriptor));

-    deletion_iterator = slot_images.begin();
-
    if constexpr (HAS_DEVICE_MEMORY_INFO) {
        const auto device_memory = runtime.GetDeviceLocalMemory();
        const u64 possible_expected_memory = (device_memory * 3) / 10;
@@ -447,70 +61,38 @@ template <class P>
 void TextureCache<P>::RunGarbageCollector() {
    const bool high_priority_mode = total_used_memory >= expected_memory;
    const bool aggressive_mode = total_used_memory >= critical_memory;
-    const u64 ticks_to_destroy = high_priority_mode ? 60 : 100;
-    int num_iterations = aggressive_mode ? 256 : (high_priority_mode ? 128 : 64);
-    for (; num_iterations > 0; --num_iterations) {
-        if (deletion_iterator == slot_images.end()) {
-            deletion_iterator = slot_images.begin();
-            if (deletion_iterator == slot_images.end()) {
-                break;
-            }
+    const u64 ticks_to_destroy = aggressive_mode ? 10ULL : high_priority_mode ? 25ULL : 100ULL;
+    size_t num_iterations = aggressive_mode ? 10000 : (high_priority_mode ? 100 : 5);
+    const auto clean_up = [this, &num_iterations, high_priority_mode](ImageId image_id) {
+        if (num_iterations == 0) {
+            return true;
        }
-        auto [image_id, image_tmp] = *deletion_iterator;
-        Image* image = image_tmp; // fix clang error.
-        const bool is_alias = True(image->flags & ImageFlagBits::Alias);
-        const bool is_bad_overlap = True(image->flags & ImageFlagBits::BadOverlap);
-        const bool must_download = image->IsSafeDownload();
-        bool should_care = is_bad_overlap || is_alias || (high_priority_mode && !must_download);
-        const u64 ticks_needed =
-            is_bad_overlap
-                ? ticks_to_destroy >> 4
-                : ((should_care && aggressive_mode) ? ticks_to_destroy >> 1 : ticks_to_destroy);
-        should_care |= aggressive_mode;
-        if (should_care && image->frame_tick + ticks_needed < frame_tick) {
-            if (is_bad_overlap) {
-                const bool overlap_check = std::ranges::all_of(
-                    image->overlapping_images, [&, image](const ImageId& overlap_id) {
-                        auto& overlap = slot_images[overlap_id];
-                        return overlap.frame_tick >= image->frame_tick;
-                    });
-                if (!overlap_check) {
-                    ++deletion_iterator;
-                    continue;
-                }
-            }
-            if (!is_bad_overlap && must_download) {
-                const bool alias_check = std::ranges::none_of(
-                    image->aliased_images, [&, image](const AliasedImage& alias) {
-                        auto& alias_image = slot_images[alias.id];
-                        return (alias_image.frame_tick < image->frame_tick) ||
-                               (alias_image.modification_tick < image->modification_tick);
-                    });
-
-                if (alias_check) {
-                    auto map = runtime.DownloadStagingBuffer(image->unswizzled_size_bytes);
-                    const auto copies = FullDownloadCopies(image->info);
-                    image->DownloadMemory(map, copies);
-                    runtime.Finish();
-                    SwizzleImage(gpu_memory, image->gpu_addr, image->info, copies, map.mapped_span);
-                }
-            }
-            if (True(image->flags & ImageFlagBits::Tracked)) {
-                UntrackImage(*image, image_id);
-            }
-            UnregisterImage(image_id);
-            DeleteImage(image_id);
-            if (is_bad_overlap) {
-                ++num_iterations;
-            }
+        --num_iterations;
+        auto& image = slot_images[image_id];
+        const bool must_download = image.IsSafeDownload();
+        if (!high_priority_mode && must_download) {
+            return false;
        }
-        ++deletion_iterator;
-    }
+        if (must_download) {
+            auto map = runtime.DownloadStagingBuffer(image.unswizzled_size_bytes);
+            const auto copies = FullDownloadCopies(image.info);
+            image.DownloadMemory(map, copies);
+            runtime.Finish();
+            SwizzleImage(gpu_memory, image.gpu_addr, image.info, copies, map.mapped_span);
+        }
+        if (True(image.flags & ImageFlagBits::Tracked)) {
+            UntrackImage(image, image_id);
+        }
+        UnregisterImage(image_id);
+        DeleteImage(image_id);
+        return false;
+    };
+    lru_cache.ForEachItemBelow(frame_tick - ticks_to_destroy, clean_up);
 }

 template <class P>
 void TextureCache<P>::TickFrame() {
-    if (Settings::values.use_caches_gc.GetValue() && total_used_memory > minimum_memory) {
+    if (total_used_memory > minimum_memory) {
        RunGarbageCollector();
    }
    sentenced_images.Tick();
@@ -820,40 +402,6 @@ void TextureCache<P>::BlitImage(const Tegra::Engines::Fermi2D::Surface& dst,
    }
 }

-template <class P>
-void TextureCache<P>::InvalidateColorBuffer(size_t index) {
-    ImageViewId& color_buffer_id = render_targets.color_buffer_ids[index];
-    color_buffer_id = FindColorBuffer(index, false);
-    if (!color_buffer_id) {
-        LOG_ERROR(HW_GPU, "Invalidating invalid color buffer in index={}", index);
-        return;
-    }
-    // When invalidating a color buffer, the old contents are no longer relevant
-    ImageView& color_buffer = slot_image_views[color_buffer_id];
-    Image& image = slot_images[color_buffer.image_id];
-    image.flags &= ~ImageFlagBits::CpuModified;
-    image.flags &= ~ImageFlagBits::GpuModified;
-
-    runtime.InvalidateColorBuffer(color_buffer, index);
-}
-
-template <class P>
-void TextureCache<P>::InvalidateDepthBuffer() {
-    ImageViewId& depth_buffer_id = render_targets.depth_buffer_id;
-    depth_buffer_id = FindDepthBuffer(false);
-    if (!depth_buffer_id) {
-        LOG_ERROR(HW_GPU, "Invalidating invalid depth buffer");
-        return;
-    }
-    // When invalidating the depth buffer, the old contents are no longer relevant
-    ImageBase& image = slot_images[slot_image_views[depth_buffer_id].image_id];
-    image.flags &= ~ImageFlagBits::CpuModified;
-    image.flags &= ~ImageFlagBits::GpuModified;
-
-    ImageView& depth_buffer = slot_image_views[depth_buffer_id];
-    runtime.InvalidateDepthBuffer(depth_buffer);
-}
-
 template <class P>
 typename P::ImageView* TextureCache<P>::TryFindFramebufferImageView(VAddr cpu_addr) {
    // TODO: Properly implement this
@@ -1495,6 +1043,8 @@ void TextureCache<P>::RegisterImage(ImageId image_id) {
        tentative_size = EstimatedDecompressedSize(tentative_size, image.info.format);
    }
    total_used_memory += Common::AlignUp(tentative_size, 1024);
+    image.lru_index = lru_cache.Insert(image_id, frame_tick);
+
    ForEachGPUPage(image.gpu_addr, image.guest_size_bytes,
                   [this, image_id](u64 page) { gpu_page_table[page].push_back(image_id); });
    if (False(image.flags & ImageFlagBits::Sparse)) {
@@ -1532,6 +1082,7 @@ void TextureCache<P>::UnregisterImage(ImageId image_id) {
        tentative_size = EstimatedDecompressedSize(tentative_size, image.info.format);
    }
    total_used_memory -= Common::AlignUp(tentative_size, 1024);
+    lru_cache.Free(image.lru_index);
    const auto& clear_page_table =
        [this, image_id](
            u64 page,
@@ -1801,7 +1352,7 @@ void TextureCache<P>::PrepareImage(ImageId image_id, bool is_modification, bool
    if (is_modification) {
        MarkModification(image);
    }
-    image.frame_tick = frame_tick;
+    lru_cache.Touch(image.lru_index, frame_tick);
 }

 template <class P>
--- a/src/video_core/texture_cache/texture_cache_base.h
+++ b/src/video_core/texture_cache/texture_cache_base.h
@@ -0,0 +1,391 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <mutex>
+#include <span>
+#include <type_traits>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "common/common_types.h"
+#include "common/literals.h"
+#include "common/lru_cache.h"
+#include "video_core/compatible_formats.h"
+#include "video_core/delayed_destruction_ring.h"
+#include "video_core/engines/fermi_2d.h"
+#include "video_core/engines/kepler_compute.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/memory_manager.h"
+#include "video_core/rasterizer_interface.h"
+#include "video_core/surface.h"
+#include "video_core/texture_cache/descriptor_table.h"
+#include "video_core/texture_cache/image_base.h"
+#include "video_core/texture_cache/image_info.h"
+#include "video_core/texture_cache/image_view_info.h"
+#include "video_core/texture_cache/render_targets.h"
+#include "video_core/texture_cache/slot_vector.h"
+#include "video_core/texture_cache/types.h"
+#include "video_core/texture_cache/util.h"
+#include "video_core/textures/texture.h"
+
+namespace VideoCommon {
+
+using Tegra::Texture::SwizzleSource;
+using Tegra::Texture::TICEntry;
+using Tegra::Texture::TSCEntry;
+using VideoCore::Surface::GetFormatType;
+using VideoCore::Surface::IsCopyCompatible;
+using VideoCore::Surface::PixelFormat;
+using VideoCore::Surface::PixelFormatFromDepthFormat;
+using VideoCore::Surface::PixelFormatFromRenderTargetFormat;
+using namespace Common::Literals;
+
+template <class P>
+class TextureCache {
+    /// Address shift for caching images into a hash table
+    static constexpr u64 PAGE_BITS = 20;
+
+    /// Enables debugging features to the texture cache
+    static constexpr bool ENABLE_VALIDATION = P::ENABLE_VALIDATION;
+    /// Implement blits as copies between framebuffers
+    static constexpr bool FRAMEBUFFER_BLITS = P::FRAMEBUFFER_BLITS;
+    /// True when some copies have to be emulated
+    static constexpr bool HAS_EMULATED_COPIES = P::HAS_EMULATED_COPIES;
+    /// True when the API can provide info about the memory of the device.
+    static constexpr bool HAS_DEVICE_MEMORY_INFO = P::HAS_DEVICE_MEMORY_INFO;
+
+    /// Image view ID for null descriptors
+    static constexpr ImageViewId NULL_IMAGE_VIEW_ID{0};
+    /// Sampler ID for bugged sampler ids
+    static constexpr SamplerId NULL_SAMPLER_ID{0};
+
+    static constexpr u64 DEFAULT_EXPECTED_MEMORY = 1_GiB;
+    static constexpr u64 DEFAULT_CRITICAL_MEMORY = 2_GiB;
+
+    using Runtime = typename P::Runtime;
+    using Image = typename P::Image;
+    using ImageAlloc = typename P::ImageAlloc;
+    using ImageView = typename P::ImageView;
+    using Sampler = typename P::Sampler;
+    using Framebuffer = typename P::Framebuffer;
+
+    struct BlitImages {
+        ImageId dst_id;
+        ImageId src_id;
+        PixelFormat dst_format;
+        PixelFormat src_format;
+    };
+
+    template <typename T>
+    struct IdentityHash {
+        [[nodiscard]] size_t operator()(T value) const noexcept {
+            return static_cast<size_t>(value);
+        }
+    };
+
+public:
+    explicit TextureCache(Runtime&, VideoCore::RasterizerInterface&, Tegra::Engines::Maxwell3D&,
+                          Tegra::Engines::KeplerCompute&, Tegra::MemoryManager&);
+
+    /// Notify the cache that a new frame has been queued
+    void TickFrame();
+
+    /// Return a constant reference to the given image view id
+    [[nodiscard]] const ImageView& GetImageView(ImageViewId id) const noexcept;
+
+    /// Return a reference to the given image view id
+    [[nodiscard]] ImageView& GetImageView(ImageViewId id) noexcept;
+
+    /// Mark an image as modified from the GPU
+    void MarkModification(ImageId id) noexcept;
+
+    /// Fill image_view_ids with the graphics images in indices
+    void FillGraphicsImageViews(std::span<const u32> indices,
+                                std::span<ImageViewId> image_view_ids);
+
+    /// Fill image_view_ids with the compute images in indices
+    void FillComputeImageViews(std::span<const u32> indices, std::span<ImageViewId> image_view_ids);
+
+    /// Get the sampler from the graphics descriptor table in the specified index
+    Sampler* GetGraphicsSampler(u32 index);
+
+    /// Get the sampler from the compute descriptor table in the specified index
+    Sampler* GetComputeSampler(u32 index);
+
+    /// Refresh the state for graphics image view and sampler descriptors
+    void SynchronizeGraphicsDescriptors();
+
+    /// Refresh the state for compute image view and sampler descriptors
+    void SynchronizeComputeDescriptors();
+
+    /// Update bound render targets and upload memory if necessary
+    /// @param is_clear True when the render targets are being used for clears
+    void UpdateRenderTargets(bool is_clear);
+
+    /// Find a framebuffer with the currently bound render targets
+    /// UpdateRenderTargets should be called before this
+    Framebuffer* GetFramebuffer();
+
+    /// Mark images in a range as modified from the CPU
+    void WriteMemory(VAddr cpu_addr, size_t size);
+
+    /// Download contents of host images to guest memory in a region
+    void DownloadMemory(VAddr cpu_addr, size_t size);
+
+    /// Remove images in a region
+    void UnmapMemory(VAddr cpu_addr, size_t size);
+
+    /// Remove images in a GPU address region
+    void UnmapGPUMemory(GPUVAddr gpu_addr, size_t size);
+
+    /// Blit an image with the given parameters
+    void BlitImage(const Tegra::Engines::Fermi2D::Surface& dst,
+                   const Tegra::Engines::Fermi2D::Surface& src,
+                   const Tegra::Engines::Fermi2D::Config& copy);
+
+    /// Try to find a cached image view in the given CPU address
+    [[nodiscard]] ImageView* TryFindFramebufferImageView(VAddr cpu_addr);
+
+    /// Return true when there are uncommitted images to be downloaded
+    [[nodiscard]] bool HasUncommittedFlushes() const noexcept;
+
+    /// Return true when the caller should wait for async downloads
+    [[nodiscard]] bool ShouldWaitAsyncFlushes() const noexcept;
+
+    /// Commit asynchronous downloads
+    void CommitAsyncFlushes();
+
+    /// Pop asynchronous downloads
+    void PopAsyncFlushes();
+
+    /// Return true when a CPU region is modified from the GPU
+    [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size);
+
+    std::mutex mutex;
+
+private:
+    /// Iterate over all page indices in a range, stopping early when func returns true
+    template <typename Func>
+    static void ForEachCPUPage(VAddr addr, size_t size, Func&& func) {
+        static constexpr bool RETURNS_BOOL = std::is_same_v<std::invoke_result_t<Func, u64>, bool>;
+        const u64 page_end = (addr + size - 1) >> PAGE_BITS;
+        for (u64 page = addr >> PAGE_BITS; page <= page_end; ++page) {
+            if constexpr (RETURNS_BOOL) {
+                if (func(page)) {
+                    break; // func requested an early stop
+                }
+            } else {
+                func(page);
+            }
+        }
+    }
+
+    template <typename Func>
+    static void ForEachGPUPage(GPUVAddr addr, size_t size, Func&& func) {
+        static constexpr bool RETURNS_BOOL = std::is_same_v<std::invoke_result_t<Func, u64>, bool>;
+        const u64 page_end = (addr + size - 1) >> PAGE_BITS;
+        for (u64 page = addr >> PAGE_BITS; page <= page_end; ++page) {
+            if constexpr (RETURNS_BOOL) {
+                if (func(page)) {
+                    break; // func requested an early stop
+                }
+            } else {
+                func(page);
+            }
+        }
+    }
+
+    /// Runs the Garbage Collector.
+    void RunGarbageCollector();
+
+    /// Fills image_view_ids in the image views in indices
+    void FillImageViews(DescriptorTable<TICEntry>& table,
+                        std::span<ImageViewId> cached_image_view_ids, std::span<const u32> indices,
+                        std::span<ImageViewId> image_view_ids);
+
+    /// Find or create an image view in the guest descriptor table
+    ImageViewId VisitImageView(DescriptorTable<TICEntry>& table,
+                               std::span<ImageViewId> cached_image_view_ids, u32 index);
+
+    /// Find or create a framebuffer with the given render target parameters
+    FramebufferId GetFramebufferId(const RenderTargets& key);
+
+    /// Refresh the contents (pixel data) of an image
+    void RefreshContents(Image& image, ImageId image_id);
+
+    /// Upload data from guest to an image
+    template <typename StagingBuffer>
+    void UploadImageContents(Image& image, StagingBuffer& staging_buffer);
+
+    /// Find or create an image view from a guest descriptor
+    [[nodiscard]] ImageViewId FindImageView(const TICEntry& config);
+
+    /// Create a new image view from a guest descriptor
+    [[nodiscard]] ImageViewId CreateImageView(const TICEntry& config);
+
+    /// Find or create an image from the given parameters
+    [[nodiscard]] ImageId FindOrInsertImage(const ImageInfo& info, GPUVAddr gpu_addr,
+                                            RelaxedOptions options = RelaxedOptions{});
+
+    /// Find an image from the given parameters
+    [[nodiscard]] ImageId FindImage(const ImageInfo& info, GPUVAddr gpu_addr,
+                                    RelaxedOptions options);
+
+    /// Create an image from the given parameters
+    [[nodiscard]] ImageId InsertImage(const ImageInfo& info, GPUVAddr gpu_addr,
+                                      RelaxedOptions options);
+
+    /// Create a new image and join perfectly matching existing images
+    /// Remove joined images from the cache
+    [[nodiscard]] ImageId JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VAddr cpu_addr);
+
+    /// Return a blit image pair from the given guest blit parameters
+    [[nodiscard]] BlitImages GetBlitImages(const Tegra::Engines::Fermi2D::Surface& dst,
+                                           const Tegra::Engines::Fermi2D::Surface& src);
+
+    /// Find or create a sampler from a guest descriptor sampler
+    [[nodiscard]] SamplerId FindSampler(const TSCEntry& config);
+
+    /// Find or create an image view for the given color buffer index
+    [[nodiscard]] ImageViewId FindColorBuffer(size_t index, bool is_clear);
+
+    /// Find or create an image view for the depth buffer
+    [[nodiscard]] ImageViewId FindDepthBuffer(bool is_clear);
+
+    /// Find or create a view for a render target with the given image parameters
+    [[nodiscard]] ImageViewId FindRenderTargetView(const ImageInfo& info, GPUVAddr gpu_addr,
+                                                   bool is_clear);
+
+    /// Iterates over all the images in a region calling func
+    template <typename Func>
+    void ForEachImageInRegion(VAddr cpu_addr, size_t size, Func&& func);
+
+    template <typename Func>
+    void ForEachImageInRegionGPU(GPUVAddr gpu_addr, size_t size, Func&& func);
+
+    template <typename Func>
+    void ForEachSparseImageInRegion(GPUVAddr gpu_addr, size_t size, Func&& func);
+
+    /// Iterates over the segments of a sparse image calling func
+    template <typename Func>
+    void ForEachSparseSegment(ImageBase& image, Func&& func);
+
+    /// Find or create an image view in the given image with the passed parameters
+    [[nodiscard]] ImageViewId FindOrEmplaceImageView(ImageId image_id, const ImageViewInfo& info);
+
+    /// Register image in the page table
+    void RegisterImage(ImageId image);
+
+    /// Unregister image from the page table
+    void UnregisterImage(ImageId image);
+
+    /// Track CPU reads and writes for image
+    void TrackImage(ImageBase& image, ImageId image_id);
+
+    /// Stop tracking CPU reads and writes for image
+    void UntrackImage(ImageBase& image, ImageId image_id);
+
+    /// Delete image from the cache
+    void DeleteImage(ImageId image);
+
+    /// Remove image views references from the cache
+    void RemoveImageViewReferences(std::span<const ImageViewId> removed_views);
+
+    /// Remove framebuffers using the given image views from the cache
+    void RemoveFramebuffers(std::span<const ImageViewId> removed_views);
+
+    /// Mark an image as modified from the GPU
+    void MarkModification(ImageBase& image) noexcept;
+
+    /// Synchronize image aliases, copying data if needed
+    void SynchronizeAliases(ImageId image_id);
+
+    /// Prepare an image to be used
+    void PrepareImage(ImageId image_id, bool is_modification, bool invalidate);
+
+    /// Prepare an image view to be used
+    void PrepareImageView(ImageViewId image_view_id, bool is_modification, bool invalidate);
+
+    /// Execute copies from one image to the other, even if they are incompatible
+    void CopyImage(ImageId dst_id, ImageId src_id, std::span<const ImageCopy> copies);
+
+    /// Bind an image view as render target, downloading resources preemptively if needed
+    void BindRenderTarget(ImageViewId* old_id, ImageViewId new_id);
+
+    /// Create a render target from a given image and image view parameters
+    [[nodiscard]] std::pair<FramebufferId, ImageViewId> RenderTargetFromImage(
+        ImageId, const ImageViewInfo& view_info);
+
+    /// Returns true if the current clear parameters clear the whole image of a given image view
+    [[nodiscard]] bool IsFullClear(ImageViewId id);
+
+    Runtime& runtime;
+    VideoCore::RasterizerInterface& rasterizer;
+    Tegra::Engines::Maxwell3D& maxwell3d;
+    Tegra::Engines::KeplerCompute& kepler_compute;
+    Tegra::MemoryManager& gpu_memory;
+
+    DescriptorTable<TICEntry> graphics_image_table{gpu_memory};
+    DescriptorTable<TSCEntry> graphics_sampler_table{gpu_memory};
+    std::vector<SamplerId> graphics_sampler_ids;
+    std::vector<ImageViewId> graphics_image_view_ids;
+
+    DescriptorTable<TICEntry> compute_image_table{gpu_memory};
+    DescriptorTable<TSCEntry> compute_sampler_table{gpu_memory};
+    std::vector<SamplerId> compute_sampler_ids;
+    std::vector<ImageViewId> compute_image_view_ids;
+
+    RenderTargets render_targets;
+
+    std::unordered_map<TICEntry, ImageViewId> image_views;
+    std::unordered_map<TSCEntry, SamplerId> samplers;
+    std::unordered_map<RenderTargets, FramebufferId> framebuffers;
+
+    std::unordered_map<u64, std::vector<ImageMapId>, IdentityHash<u64>> page_table;
+    std::unordered_map<u64, std::vector<ImageId>, IdentityHash<u64>> gpu_page_table;
+    std::unordered_map<u64, std::vector<ImageId>, IdentityHash<u64>> sparse_page_table;
+
+    std::unordered_map<ImageId, std::vector<ImageViewId>> sparse_views;
+
+    VAddr virtual_invalid_space{};
+
+    bool has_deleted_images = false;
+    u64 total_used_memory = 0;
+    u64 minimum_memory;
+    u64 expected_memory;
+    u64 critical_memory;
+
+    SlotVector<Image> slot_images;
+    SlotVector<ImageMapView> slot_map_views;
+    SlotVector<ImageView> slot_image_views;
+    SlotVector<ImageAlloc> slot_image_allocs;
+    SlotVector<Sampler> slot_samplers;
+    SlotVector<Framebuffer> slot_framebuffers;
+
+    // TODO: This data structure is not optimal and it should be reworked
+    std::vector<ImageId> uncommitted_downloads;
+    std::queue<std::vector<ImageId>> committed_downloads;
+
+    struct LRUItemParams {
+        using ObjectType = ImageId;
+        using TickType = u64;
+    };
+    Common::LeastRecentlyUsedCache<LRUItemParams> lru_cache;
+
+    static constexpr size_t TICKS_TO_DESTROY = 6;
+    DelayedDestructionRing<Image, TICKS_TO_DESTROY> sentenced_images;
+    DelayedDestructionRing<ImageView, TICKS_TO_DESTROY> sentenced_image_view;
+    DelayedDestructionRing<Framebuffer, TICKS_TO_DESTROY> sentenced_framebuffers;
+
+    std::unordered_map<GPUVAddr, ImageAllocId> image_allocs_table;
+
+    u64 modification_tick = 0;
+    u64 frame_tick = 0;
+};
+
+} // namespace VideoCommon
--- a/src/video_core/textures/astc.cpp
+++ b/src/video_core/textures/astc.cpp
@@ -151,6 +151,76 @@ private:
    const IntType& m_Bits;
 };

+enum class IntegerEncoding { JustBits, Quint, Trit };
+
+struct IntegerEncodedValue {
+    constexpr IntegerEncodedValue() = default;
+
+    constexpr IntegerEncodedValue(IntegerEncoding encoding_, u32 num_bits_)
+        : encoding{encoding_}, num_bits{num_bits_} {}
+
+    constexpr bool MatchesEncoding(const IntegerEncodedValue& other) const {
+        return encoding == other.encoding && num_bits == other.num_bits;
+    }
+
+    // Returns the number of bits required to encode num_vals values.
+    u32 GetBitLength(u32 num_vals) const {
+        u32 total_bits = num_bits * num_vals;
+        if (encoding == IntegerEncoding::Trit) {
+            total_bits += (num_vals * 8 + 4) / 5;
+        } else if (encoding == IntegerEncoding::Quint) {
+            total_bits += (num_vals * 7 + 2) / 3;
+        }
+        return total_bits;
+    }
+
+    IntegerEncoding encoding{};
+    u32 num_bits = 0;
+    u32 bit_value = 0;
+    union {
+        u32 quint_value = 0;
+        u32 trit_value;
+    };
+};
+
+// Returns a new instance of this struct for the largest encoding whose values
+// do not exceed mav_value
+static constexpr IntegerEncodedValue CreateEncoding(u32 mav_value) {
+    while (mav_value > 0) {
+        u32 check = mav_value + 1;
+
+        // Is mav_value of the type 2^n - 1?
+        if (!(check & (check - 1))) {
+            return IntegerEncodedValue(IntegerEncoding::JustBits, std::popcount(mav_value));
+        }
+
+        // Is mav_value of the type 3*2^n - 1?
+        if ((check % 3 == 0) && !((check / 3) & ((check / 3) - 1))) {
+            return IntegerEncodedValue(IntegerEncoding::Trit, std::popcount(check / 3 - 1));
+        }
+
+        // Is mav_value of the type 5*2^n - 1?
+        if ((check % 5 == 0) && !((check / 5) & ((check / 5) - 1))) {
+            return IntegerEncodedValue(IntegerEncoding::Quint, std::popcount(check / 5 - 1));
+        }
+
+        // Apparently it can't be represented with a bounded integer sequence...
+        // just iterate.
+        mav_value--;
+    }
+    return IntegerEncodedValue(IntegerEncoding::JustBits, 0);
+}
+
+static constexpr std::array<IntegerEncodedValue, 256> MakeEncodedValues() {
+    std::array<IntegerEncodedValue, 256> encodings{};
+    for (std::size_t i = 0; i < encodings.size(); ++i) {
+        encodings[i] = CreateEncoding(static_cast<u32>(i));
+    }
+    return encodings;
+}
+
+static constexpr std::array<IntegerEncodedValue, 256> ASTC_ENCODINGS_VALUES = MakeEncodedValues();
+
 namespace Tegra::Texture::ASTC {
 using IntegerEncodedVector = boost::container::static_vector<
    IntegerEncodedValue, 256,
@@ -521,35 +591,41 @@ static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) {
    return params;
 }

-static void FillVoidExtentLDR(InputBitStream& strm, std::span<u32> outBuf, u32 blockWidth,
-                              u32 blockHeight) {
-    // Don't actually care about the void extent, just read the bits...
-    for (s32 i = 0; i < 4; ++i) {
-        strm.ReadBits<13>();
+// Replicates low num_bits such that [(to_bit - 1):(to_bit - num_bits)]
+// is the same as [(num_bits - 1):0] and repeats all the way down.
+template <typename IntType>
+static constexpr IntType Replicate(IntType val, u32 num_bits, u32 to_bit) {
+    if (num_bits == 0 || to_bit == 0) {
+        return 0;
    }
-
-    // Decode the RGBA components and renormalize them to the range [0, 255]
-    u16 r = static_cast<u16>(strm.ReadBits<16>());
-    u16 g = static_cast<u16>(strm.ReadBits<16>());
-    u16 b = static_cast<u16>(strm.ReadBits<16>());
-    u16 a = static_cast<u16>(strm.ReadBits<16>());
-
-    u32 rgba = (r >> 8) | (g & 0xFF00) | (static_cast<u32>(b) & 0xFF00) << 8 |
-               (static_cast<u32>(a) & 0xFF00) << 16;
-
-    for (u32 j = 0; j < blockHeight; j++) {
-        for (u32 i = 0; i < blockWidth; i++) {
-            outBuf[j * blockWidth + i] = rgba;
+    const IntType v = val & static_cast<IntType>((1 << num_bits) - 1);
+    IntType res = v;
+    u32 reslen = num_bits;
+    while (reslen < to_bit) {
+        u32 comp = 0;
+        if (num_bits > to_bit - reslen) {
+            u32 newshift = to_bit - reslen;
+            comp = num_bits - newshift;
+            num_bits = newshift;
        }
+        res = static_cast<IntType>(res << num_bits);
+        res = static_cast<IntType>(res | (v >> comp));
+        reslen += num_bits;
    }
+    return res;
 }

-static void FillError(std::span<u32> outBuf, u32 blockWidth, u32 blockHeight) {
-    for (u32 j = 0; j < blockHeight; j++) {
-        for (u32 i = 0; i < blockWidth; i++) {
-            outBuf[j * blockWidth + i] = 0xFFFF00FF;
-        }
+static constexpr std::size_t NumReplicateEntries(u32 num_bits) {
+    return std::size_t(1) << num_bits;
+}
+
+template <typename IntType, u32 num_bits, u32 to_bit>
+static constexpr auto MakeReplicateTable() {
+    std::array<IntType, NumReplicateEntries(num_bits)> table{};
+    for (IntType value = 0; value < static_cast<IntType>(std::size(table)); ++value) {
+        table[value] = Replicate(value, num_bits, to_bit);
    }
+    return table;
 }

 static constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable<u32, 8, 16>();
@@ -572,6 +648,9 @@ static constexpr auto REPLICATE_2_BIT_TO_8_TABLE = MakeReplicateTable<u32, 2, 8>
 static constexpr auto REPLICATE_3_BIT_TO_8_TABLE = MakeReplicateTable<u32, 3, 8>();
 static constexpr auto REPLICATE_4_BIT_TO_8_TABLE = MakeReplicateTable<u32, 4, 8>();
 static constexpr auto REPLICATE_5_BIT_TO_8_TABLE = MakeReplicateTable<u32, 5, 8>();
+static constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable<u32, 6, 8>();
+static constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable<u32, 7, 8>();
+static constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable<u32, 8, 8>();
 /// Use a precompiled table with the most common usages, if it's not in the expected range, fallback
 /// to the runtime implementation
 static constexpr u32 FastReplicateTo8(u32 value, u32 num_bits) {
@@ -1316,6 +1395,37 @@ static void ComputeEndpoints(Pixel& ep1, Pixel& ep2, const u32*& colorValues,
 #undef READ_INT_VALUES
 }

+static void FillVoidExtentLDR(InputBitStream& strm, std::span<u32> outBuf, u32 blockWidth,
+                              u32 blockHeight) {
+    // Don't actually care about the void extent, just read the bits...
+    for (s32 i = 0; i < 4; ++i) {
+        strm.ReadBits<13>();
+    }
+
+    // Decode the RGBA components and renormalize them to the range [0, 255]
+    u16 r = static_cast<u16>(strm.ReadBits<16>());
+    u16 g = static_cast<u16>(strm.ReadBits<16>());
+    u16 b = static_cast<u16>(strm.ReadBits<16>());
+    u16 a = static_cast<u16>(strm.ReadBits<16>());
+
+    u32 rgba = (r >> 8) | (g & 0xFF00) | (static_cast<u32>(b) & 0xFF00) << 8 |
+               (static_cast<u32>(a) & 0xFF00) << 16;
+
+    for (u32 j = 0; j < blockHeight; j++) {
+        for (u32 i = 0; i < blockWidth; i++) {
+            outBuf[j * blockWidth + i] = rgba;
+        }
+    }
+}
+
+static void FillError(std::span<u32> outBuf, u32 blockWidth, u32 blockHeight) {
+    for (u32 j = 0; j < blockHeight; j++) {
+        for (u32 i = 0; i < blockWidth; i++) {
+            outBuf[j * blockWidth + i] = 0xFFFF00FF;
+        }
+    }
+}
+
 static void DecompressBlock(std::span<const u8, 16> inBuf, const u32 blockWidth,
                            const u32 blockHeight, std::span<u32, 12 * 12> outBuf) {
    InputBitStream strm(inBuf);
--- a/src/video_core/textures/astc.h
+++ b/src/video_core/textures/astc.h
@@ -9,117 +9,6 @@

 namespace Tegra::Texture::ASTC {

-enum class IntegerEncoding { JustBits, Quint, Trit };
-
-struct IntegerEncodedValue {
-    constexpr IntegerEncodedValue() = default;
-
-    constexpr IntegerEncodedValue(IntegerEncoding encoding_, u32 num_bits_)
-        : encoding{encoding_}, num_bits{num_bits_} {}
-
-    constexpr bool MatchesEncoding(const IntegerEncodedValue& other) const {
-        return encoding == other.encoding && num_bits == other.num_bits;
-    }
-
-    // Returns the number of bits required to encode num_vals values.
-    u32 GetBitLength(u32 num_vals) const {
-        u32 total_bits = num_bits * num_vals;
-        if (encoding == IntegerEncoding::Trit) {
-            total_bits += (num_vals * 8 + 4) / 5;
-        } else if (encoding == IntegerEncoding::Quint) {
-            total_bits += (num_vals * 7 + 2) / 3;
-        }
-        return total_bits;
-    }
-
-    IntegerEncoding encoding{};
-    u32 num_bits = 0;
-    u32 bit_value = 0;
-    union {
-        u32 quint_value = 0;
-        u32 trit_value;
-    };
-};
-
-// Returns a new instance of this struct that corresponds to the
-// can take no more than mav_value values
-constexpr IntegerEncodedValue CreateEncoding(u32 mav_value) {
-    while (mav_value > 0) {
-        u32 check = mav_value + 1;
-
-        // Is mav_value a power of two?
-        if (!(check & (check - 1))) {
-            return IntegerEncodedValue(IntegerEncoding::JustBits, std::popcount(mav_value));
-        }
-
-        // Is mav_value of the type 3*2^n - 1?
-        if ((check % 3 == 0) && !((check / 3) & ((check / 3) - 1))) {
-            return IntegerEncodedValue(IntegerEncoding::Trit, std::popcount(check / 3 - 1));
-        }
-
-        // Is mav_value of the type 5*2^n - 1?
-        if ((check % 5 == 0) && !((check / 5) & ((check / 5) - 1))) {
-            return IntegerEncodedValue(IntegerEncoding::Quint, std::popcount(check / 5 - 1));
-        }
-
-        // Apparently it can't be represented with a bounded integer sequence...
-        // just iterate.
-        mav_value--;
-    }
-    return IntegerEncodedValue(IntegerEncoding::JustBits, 0);
-}
-
-constexpr std::array<IntegerEncodedValue, 256> MakeEncodedValues() {
-    std::array<IntegerEncodedValue, 256> encodings{};
-    for (std::size_t i = 0; i < encodings.size(); ++i) {
-        encodings[i] = CreateEncoding(static_cast<u32>(i));
-    }
-    return encodings;
-}
-
-constexpr std::array<IntegerEncodedValue, 256> ASTC_ENCODINGS_VALUES = MakeEncodedValues();
-
-// Replicates low num_bits such that [(to_bit - 1):(to_bit - 1 - from_bit)]
-// is the same as [(num_bits - 1):0] and repeats all the way down.
-template <typename IntType>
-constexpr IntType Replicate(IntType val, u32 num_bits, u32 to_bit) {
-    if (num_bits == 0 || to_bit == 0) {
-        return 0;
-    }
-    const IntType v = val & static_cast<IntType>((1 << num_bits) - 1);
-    IntType res = v;
-    u32 reslen = num_bits;
-    while (reslen < to_bit) {
-        u32 comp = 0;
-        if (num_bits > to_bit - reslen) {
-            u32 newshift = to_bit - reslen;
-            comp = num_bits - newshift;
-            num_bits = newshift;
-        }
-        res = static_cast<IntType>(res << num_bits);
-        res = static_cast<IntType>(res | (v >> comp));
-        reslen += num_bits;
-    }
-    return res;
-}
-
-constexpr std::size_t NumReplicateEntries(u32 num_bits) {
-    return std::size_t(1) << num_bits;
-}
-
-template <typename IntType, u32 num_bits, u32 to_bit>
-constexpr auto MakeReplicateTable() {
-    std::array<IntType, NumReplicateEntries(num_bits)> table{};
-    for (IntType value = 0; value < static_cast<IntType>(std::size(table)); ++value) {
-        table[value] = Replicate(value, num_bits, to_bit);
-    }
-    return table;
-}
-
-constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable<u32, 6, 8>();
-constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable<u32, 7, 8>();
-constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable<u32, 8, 8>();
-
 void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth,
                uint32_t block_width, uint32_t block_height, std::span<uint8_t> output);

--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@@ -18,9 +18,9 @@

 namespace Tegra::Texture {
 namespace {
-template <bool TO_LINEAR>
-void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width,
-             u32 height, u32 depth, u32 block_height, u32 block_depth, u32 stride_alignment) {
+template <bool TO_LINEAR, u32 BYTES_PER_PIXEL>
+void SwizzleImpl(std::span<u8> output, std::span<const u8> input, u32 width, u32 height, u32 depth,
+                 u32 block_height, u32 block_depth, u32 stride_alignment) {
    // The origin of the transformation can be configured here, leave it as zero as the current API
    // doesn't expose it.
    static constexpr u32 origin_x = 0;
@@ -28,9 +28,9 @@ void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixe
    static constexpr u32 origin_z = 0;

    // We can configure here a custom pitch
-    // As it's not exposed 'width * bpp' will be the expected pitch.
-    const u32 pitch = width * bytes_per_pixel;
-    const u32 stride = Common::AlignUpLog2(width, stride_alignment) * bytes_per_pixel;
+    // As it's not exposed 'width * BYTES_PER_PIXEL' will be the expected pitch.
+    const u32 pitch = width * BYTES_PER_PIXEL;
+    const u32 stride = Common::AlignUpLog2(width, stride_alignment) * BYTES_PER_PIXEL;

    const u32 gobs_in_x = Common::DivCeilLog2(stride, GOB_SIZE_X_SHIFT);
    const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height + block_depth);
@@ -54,30 +54,129 @@ void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixe
                                 ((block_y & block_height_mask) << GOB_SIZE_SHIFT);

            for (u32 column = 0; column < width; ++column) {
-                const u32 x = (column + origin_x) * bytes_per_pixel;
+                const u32 x = (column + origin_x) * BYTES_PER_PIXEL;
                const u32 offset_x = (x >> GOB_SIZE_X_SHIFT) << x_shift;

                const u32 base_swizzled_offset = offset_z + offset_y + offset_x;
                const u32 swizzled_offset = base_swizzled_offset + table[x % GOB_SIZE_X];

                const u32 unswizzled_offset =
-                    slice * pitch * height + line * pitch + column * bytes_per_pixel;
-
-                if (const auto offset = (TO_LINEAR ? unswizzled_offset : swizzled_offset);
-                    offset >= input.size()) {
-                    // TODO(Rodrigo): This is an out of bounds access that should never happen. To
-                    // avoid crashing the emulator, break.
-                    ASSERT_MSG(false, "offset {} exceeds input size {}!", offset, input.size());
-                    break;
-                }
+                    slice * pitch * height + line * pitch + column * BYTES_PER_PIXEL;

                u8* const dst = &output[TO_LINEAR ? swizzled_offset : unswizzled_offset];
                const u8* const src = &input[TO_LINEAR ? unswizzled_offset : swizzled_offset];
-                std::memcpy(dst, src, bytes_per_pixel);
+
+                std::memcpy(dst, src, BYTES_PER_PIXEL);
            }
        }
    }
 }
+
+template <bool TO_LINEAR>
+void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width,
+             u32 height, u32 depth, u32 block_height, u32 block_depth, u32 stride_alignment) {
+    switch (bytes_per_pixel) {
+#define BPP_CASE(x)                                                                                \
+    case x:                                                                                        \
+        return SwizzleImpl<TO_LINEAR, x>(output, input, width, height, depth, block_height,        \
+                                         block_depth, stride_alignment);
+        BPP_CASE(1)
+        BPP_CASE(2)
+        BPP_CASE(3)
+        BPP_CASE(4)
+        BPP_CASE(6)
+        BPP_CASE(8)
+        BPP_CASE(12)
+        BPP_CASE(16)
+#undef BPP_CASE
+    default:
+        UNREACHABLE_MSG("Invalid bytes_per_pixel={}", bytes_per_pixel);
+    }
+}
+
+template <u32 BYTES_PER_PIXEL>
+void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
+                    u8* swizzled_data, const u8* unswizzled_data, u32 block_height_bit,
+                    u32 offset_x, u32 offset_y) {
+    const u32 block_height = 1U << block_height_bit;
+    const u32 image_width_in_gobs =
+        (swizzled_width * BYTES_PER_PIXEL + (GOB_SIZE_X - 1)) / GOB_SIZE_X;
+    for (u32 line = 0; line < subrect_height; ++line) {
+        const u32 dst_y = line + offset_y;
+        const u32 gob_address_y =
+            (dst_y / (GOB_SIZE_Y * block_height)) * GOB_SIZE * block_height * image_width_in_gobs +
+            ((dst_y % (GOB_SIZE_Y * block_height)) / GOB_SIZE_Y) * GOB_SIZE;
+        const auto& table = SWIZZLE_TABLE[dst_y % GOB_SIZE_Y];
+        for (u32 x = 0; x < subrect_width; ++x) {
+            const u32 dst_x = x + offset_x;
+            const u32 gob_address =
+                gob_address_y + (dst_x * BYTES_PER_PIXEL / GOB_SIZE_X) * GOB_SIZE * block_height;
+            const u32 swizzled_offset = gob_address + table[(dst_x * BYTES_PER_PIXEL) % GOB_SIZE_X];
+            const u32 unswizzled_offset = line * source_pitch + x * BYTES_PER_PIXEL;
+
+            const u8* const source_line = unswizzled_data + unswizzled_offset;
+            u8* const dest_addr = swizzled_data + swizzled_offset;
+            std::memcpy(dest_addr, source_line, BYTES_PER_PIXEL);
+        }
+    }
+}
+
+template <u32 BYTES_PER_PIXEL>
+void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 block_height,
+                      u32 origin_x, u32 origin_y, u8* output, const u8* input) {
+    const u32 stride = width * BYTES_PER_PIXEL;
+    const u32 gobs_in_x = (stride + GOB_SIZE_X - 1) / GOB_SIZE_X;
+    const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height);
+
+    const u32 block_height_mask = (1U << block_height) - 1;
+    const u32 x_shift = GOB_SIZE_SHIFT + block_height;
+
+    for (u32 line = 0; line < line_count; ++line) {
+        const u32 src_y = line + origin_y;
+        const auto& table = SWIZZLE_TABLE[src_y % GOB_SIZE_Y];
+
+        const u32 block_y = src_y >> GOB_SIZE_Y_SHIFT;
+        const u32 src_offset_y = (block_y >> block_height) * block_size +
+                                 ((block_y & block_height_mask) << GOB_SIZE_SHIFT);
+        for (u32 column = 0; column < line_length_in; ++column) {
+            const u32 src_x = (column + origin_x) * BYTES_PER_PIXEL;
+            const u32 src_offset_x = (src_x >> GOB_SIZE_X_SHIFT) << x_shift;
+
+            const u32 swizzled_offset = src_offset_y + src_offset_x + table[src_x % GOB_SIZE_X];
+            const u32 unswizzled_offset = line * pitch + column * BYTES_PER_PIXEL;
+
+            std::memcpy(output + unswizzled_offset, input + swizzled_offset, BYTES_PER_PIXEL);
+        }
+    }
+}
+
+template <u32 BYTES_PER_PIXEL>
+void SwizzleSliceToVoxel(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 height,
+                         u32 block_height, u32 block_depth, u32 origin_x, u32 origin_y, u8* output,
+                         const u8* input) {
+    UNIMPLEMENTED_IF(origin_x > 0);
+    UNIMPLEMENTED_IF(origin_y > 0);
+
+    const u32 stride = width * BYTES_PER_PIXEL;
+    const u32 gobs_in_x = (stride + GOB_SIZE_X - 1) / GOB_SIZE_X;
+    const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height + block_depth);
+
+    const u32 block_height_mask = (1U << block_height) - 1;
+    const u32 x_shift = static_cast<u32>(GOB_SIZE_SHIFT) + block_height + block_depth;
+
+    for (u32 line = 0; line < line_count; ++line) {
+        const auto& table = SWIZZLE_TABLE[line % GOB_SIZE_Y];
+        const u32 block_y = line / GOB_SIZE_Y;
+        const u32 dst_offset_y =
+            (block_y >> block_height) * block_size + (block_y & block_height_mask) * GOB_SIZE;
+        for (u32 x = 0; x < line_length_in; ++x) {
+            const u32 dst_offset =
+                ((x / GOB_SIZE_X) << x_shift) + dst_offset_y + table[x % GOB_SIZE_X];
+            const u32 src_offset = x * BYTES_PER_PIXEL + line * pitch;
+            std::memcpy(output + dst_offset, input + src_offset, BYTES_PER_PIXEL);
+        }
+    }
+}
 } // Anonymous namespace

 void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel,
@@ -97,81 +196,67 @@ void SwizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_p
 void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
                    u32 bytes_per_pixel, u8* swizzled_data, const u8* unswizzled_data,
                    u32 block_height_bit, u32 offset_x, u32 offset_y) {
-    const u32 block_height = 1U << block_height_bit;
-    const u32 image_width_in_gobs =
-        (swizzled_width * bytes_per_pixel + (GOB_SIZE_X - 1)) / GOB_SIZE_X;
-    for (u32 line = 0; line < subrect_height; ++line) {
-        const u32 dst_y = line + offset_y;
-        const u32 gob_address_y =
-            (dst_y / (GOB_SIZE_Y * block_height)) * GOB_SIZE * block_height * image_width_in_gobs +
-            ((dst_y % (GOB_SIZE_Y * block_height)) / GOB_SIZE_Y) * GOB_SIZE;
-        const auto& table = SWIZZLE_TABLE[dst_y % GOB_SIZE_Y];
-        for (u32 x = 0; x < subrect_width; ++x) {
-            const u32 dst_x = x + offset_x;
-            const u32 gob_address =
-                gob_address_y + (dst_x * bytes_per_pixel / GOB_SIZE_X) * GOB_SIZE * block_height;
-            const u32 swizzled_offset = gob_address + table[(dst_x * bytes_per_pixel) % GOB_SIZE_X];
-            const u32 unswizzled_offset = line * source_pitch + x * bytes_per_pixel;
-
-            const u8* const source_line = unswizzled_data + unswizzled_offset;
-            u8* const dest_addr = swizzled_data + swizzled_offset;
-            std::memcpy(dest_addr, source_line, bytes_per_pixel);
-        }
+    switch (bytes_per_pixel) {
+#define BPP_CASE(x)                                                                                \
+    case x:                                                                                        \
+        return SwizzleSubrect<x>(subrect_width, subrect_height, source_pitch, swizzled_width,      \
+                                 swizzled_data, unswizzled_data, block_height_bit, offset_x,       \
+                                 offset_y);
+        BPP_CASE(1)
+        BPP_CASE(2)
+        BPP_CASE(3)
+        BPP_CASE(4)
+        BPP_CASE(6)
+        BPP_CASE(8)
+        BPP_CASE(12)
+        BPP_CASE(16)
+#undef BPP_CASE
+    default:
+        UNREACHABLE_MSG("Invalid bytes_per_pixel={}", bytes_per_pixel);
    }
 }

 void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 bytes_per_pixel,
                      u32 block_height, u32 origin_x, u32 origin_y, u8* output, const u8* input) {
-    const u32 stride = width * bytes_per_pixel;
-    const u32 gobs_in_x = (stride + GOB_SIZE_X - 1) / GOB_SIZE_X;
-    const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height);
-
-    const u32 block_height_mask = (1U << block_height) - 1;
-    const u32 x_shift = GOB_SIZE_SHIFT + block_height;
-
-    for (u32 line = 0; line < line_count; ++line) {
-        const u32 src_y = line + origin_y;
-        const auto& table = SWIZZLE_TABLE[src_y % GOB_SIZE_Y];
-
-        const u32 block_y = src_y >> GOB_SIZE_Y_SHIFT;
-        const u32 src_offset_y = (block_y >> block_height) * block_size +
-                                 ((block_y & block_height_mask) << GOB_SIZE_SHIFT);
-        for (u32 column = 0; column < line_length_in; ++column) {
-            const u32 src_x = (column + origin_x) * bytes_per_pixel;
-            const u32 src_offset_x = (src_x >> GOB_SIZE_X_SHIFT) << x_shift;
-
-            const u32 swizzled_offset = src_offset_y + src_offset_x + table[src_x % GOB_SIZE_X];
-            const u32 unswizzled_offset = line * pitch + column * bytes_per_pixel;
-
-            std::memcpy(output + unswizzled_offset, input + swizzled_offset, bytes_per_pixel);
-        }
+    switch (bytes_per_pixel) {
+#define BPP_CASE(x)                                                                                \
+    case x:                                                                                        \
+        return UnswizzleSubrect<x>(line_length_in, line_count, pitch, width, block_height,         \
+                                   origin_x, origin_y, output, input);
+        BPP_CASE(1)
+        BPP_CASE(2)
+        BPP_CASE(3)
+        BPP_CASE(4)
+        BPP_CASE(6)
+        BPP_CASE(8)
+        BPP_CASE(12)
+        BPP_CASE(16)
+#undef BPP_CASE
+    default:
+        UNREACHABLE_MSG("Invalid bytes_per_pixel={}", bytes_per_pixel);
    }
 }

 void SwizzleSliceToVoxel(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 height,
                         u32 bytes_per_pixel, u32 block_height, u32 block_depth, u32 origin_x,
                         u32 origin_y, u8* output, const u8* input) {
-    UNIMPLEMENTED_IF(origin_x > 0);
-    UNIMPLEMENTED_IF(origin_y > 0);
-
-    const u32 stride = width * bytes_per_pixel;
-    const u32 gobs_in_x = (stride + GOB_SIZE_X - 1) / GOB_SIZE_X;
-    const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height + block_depth);
-
-    const u32 block_height_mask = (1U << block_height) - 1;
-    const u32 x_shift = static_cast<u32>(GOB_SIZE_SHIFT) + block_height + block_depth;
-
-    for (u32 line = 0; line < line_count; ++line) {
-        const auto& table = SWIZZLE_TABLE[line % GOB_SIZE_Y];
-        const u32 block_y = line / GOB_SIZE_Y;
-        const u32 dst_offset_y =
-            (block_y >> block_height) * block_size + (block_y & block_height_mask) * GOB_SIZE;
-        for (u32 x = 0; x < line_length_in; ++x) {
-            const u32 dst_offset =
-                ((x / GOB_SIZE_X) << x_shift) + dst_offset_y + table[x % GOB_SIZE_X];
-            const u32 src_offset = x * bytes_per_pixel + line * pitch;
-            std::memcpy(output + dst_offset, input + src_offset, bytes_per_pixel);
-        }
+    switch (bytes_per_pixel) {
+#define BPP_CASE(x)                                                                                \
+    case x:                                                                                        \
+        return SwizzleSliceToVoxel<x>(line_length_in, line_count, pitch, width, height,            \
+                                      block_height, block_depth, origin_x, origin_y, output,       \
+                                      input);
+        BPP_CASE(1)
+        BPP_CASE(2)
+        BPP_CASE(3)
+        BPP_CASE(4)
+        BPP_CASE(6)
+        BPP_CASE(8)
+        BPP_CASE(12)
+        BPP_CASE(16)
+#undef BPP_CASE
+    default:
+        UNREACHABLE_MSG("Invalid bytes_per_pixel={}", bytes_per_pixel);
    }
 }

@@ -194,7 +279,7 @@ void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32
            u8* dest_addr = swizzle_data + swizzled_offset;
            count++;

-            std::memcpy(dest_addr, source_line, 1);
+            *dest_addr = *source_line;
        }
    }
 }
--- a/src/video_core/textures/texture.h
+++ b/src/video_core/textures/texture.h
@@ -159,7 +159,7 @@ static_assert(sizeof(TextureHandle) == 4, "TextureHandle has wrong size");
        return {raw, raw};
    } else {
        const Tegra::Texture::TextureHandle handle{raw};
-        return {handle.tic_id, via_header_index ? handle.tic_id : handle.tsc_id};
+        return {handle.tic_id, handle.tsc_id};
    }
 }

--- a/src/video_core/vulkan_common/vulkan_device.cpp
+++ b/src/video_core/vulkan_common/vulkan_device.cpp
@@ -368,18 +368,24 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR
    };
    SetNext(next, demote);

+    // Keep this struct at constructor scope: it is linked into the pNext chain via SetNext,
+    // so it must stay alive after the conditional block below.
    VkPhysicalDeviceFloat16Int8FeaturesKHR float16_int8;
-    if (is_float16_supported) {
+    if (is_int8_supported || is_float16_supported) {
        float16_int8 = {
            .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR,
            .pNext = nullptr,
-            .shaderFloat16 = true,
-            .shaderInt8 = false,
+            .shaderFloat16 = is_float16_supported,
+            .shaderInt8 = is_int8_supported,
        };
        SetNext(next, float16_int8);
-    } else {
+    }
+    if (!is_float16_supported) {
        LOG_INFO(Render_Vulkan, "Device doesn't support float16 natively");
    }
+    if (!is_int8_supported) {
+        LOG_INFO(Render_Vulkan, "Device doesn't support int8 natively");
+    }

    if (!nv_viewport_swizzle) {
        LOG_INFO(Render_Vulkan, "Device doesn't support viewport swizzles");
@@ -909,6 +912,7 @@ std::vector<const char*> Device::LoadExtensions(bool requires_surface) {

        physical.GetFeatures2KHR(features);
        is_float16_supported = float16_int8_features.shaderFloat16;
+        is_int8_supported = float16_int8_features.shaderInt8;
        extensions.push_back(VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME);
    }
    if (has_ext_subgroup_size_control) {
--- a/src/video_core/vulkan_common/vulkan_device.h
+++ b/src/video_core/vulkan_common/vulkan_device.h
@@ -139,11 +139,16 @@ public:
        return is_optimal_astc_supported;
    }

-    /// Returns true if the device supports float16 natively
+    /// Returns true if the device supports float16 natively.
    bool IsFloat16Supported() const {
        return is_float16_supported;
    }

+    /// Returns true if the device supports int8 natively.
+    bool IsInt8Supported() const {
+        return is_int8_supported;
+    }
+
    /// Returns true if the device warp size can potentially be bigger than guest's warp size.
    bool IsWarpSizePotentiallyBiggerThanGuest() const {
        return is_warp_potentially_bigger;
@@ -367,7 +372,8 @@ private:
    u64 device_access_memory{};                 ///< Total size of device local memory in bytes.
    u32 max_push_descriptors{};                 ///< Maximum number of push descriptors
    bool is_optimal_astc_supported{};           ///< Support for native ASTC.
-    bool is_float16_supported{};                ///< Support for float16 arithmetics.
+    bool is_float16_supported{};                ///< Support for float16 arithmetic.
+    bool is_int8_supported{};                   ///< Support for int8 arithmetic.
    bool is_warp_potentially_bigger{};          ///< Host warp size can be bigger than guest.
    bool is_formatless_image_load_supported{};  ///< Support for shader image read without format.
    bool is_depth_bounds_supported{};           ///< Support for depth bounds.
--- a/Show More
+++ b/Show More