vulkan_instance: Allow different Vulkan versions and enforce 1.1

For listing the available physical devices, we can use Vulkan 1.0. Now that MoltenVK supports 1.1, we can require it for running games. Add missing documentation.
vk_device: Use an array to report lacking device limits
2020-12-31 02:07:34 -03:00 · 2020-12-31 02:07:34 -03:00 · 2020-12-31 02:07:33 -03:00 · 2020-12-31 02:07:33 -03:00 · 2020-12-31 02:07:33 -03:00 · 2020-12-31 02:07:33 -03:00
263 changed files with 12784 additions and 9503 deletions
--- a/externals/dynarmic
+++ b/externals/dynarmic
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -62,6 +62,7 @@ else()
        -Werror=implicit-fallthrough
        -Werror=missing-declarations
        -Werror=reorder
+        -Werror=uninitialized
        -Werror=unused-result
        -Wextra
        -Wmissing-declarations
--- a/src/audio_core/audio_renderer.cpp
+++ b/src/audio_core/audio_renderer.cpp
@@ -11,6 +11,7 @@
 #include "audio_core/info_updater.h"
 #include "audio_core/voice_context.h"
 #include "common/logging/log.h"
+#include "core/hle/kernel/writable_event.h"
 #include "core/memory.h"
 #include "core/settings.h"

@@ -70,9 +71,10 @@ namespace {
 namespace AudioCore {
 AudioRenderer::AudioRenderer(Core::Timing::CoreTiming& core_timing, Core::Memory::Memory& memory_,
                             AudioCommon::AudioRendererParameter params,
-                             Stream::ReleaseCallback&& release_callback,
+                             std::shared_ptr<Kernel::WritableEvent> buffer_event_,
                             std::size_t instance_number)
-    : worker_params{params}, memory_pool_info(params.effect_count + params.voice_count * 4),
+    : worker_params{params}, buffer_event{buffer_event_},
+      memory_pool_info(params.effect_count + params.voice_count * 4),
      voice_context(params.voice_count), effect_context(params.effect_count), mix_context(),
      sink_context(params.sink_count), splitter_context(),
      voices(params.voice_count), memory{memory_},
@@ -83,9 +85,10 @@ AudioRenderer::AudioRenderer(Core::Timing::CoreTiming& core_timing, Core::Memory
                                params.num_splitter_send_channels);
    mix_context.Initialize(behavior_info, params.submix_count + 1, params.effect_count);
    audio_out = std::make_unique<AudioCore::AudioOut>();
-    stream = audio_out->OpenStream(
-        core_timing, params.sample_rate, AudioCommon::STREAM_NUM_CHANNELS,
-        fmt::format("AudioRenderer-Instance{}", instance_number), std::move(release_callback));
+    stream =
+        audio_out->OpenStream(core_timing, params.sample_rate, AudioCommon::STREAM_NUM_CHANNELS,
+                              fmt::format("AudioRenderer-Instance{}", instance_number),
+                              [=]() { buffer_event_->Signal(); });
    audio_out->StartStream(stream);

    QueueMixedBuffer(0);
--- a/src/audio_core/audio_renderer.h
+++ b/src/audio_core/audio_renderer.h
@@ -27,6 +27,10 @@ namespace Core::Timing {
 class CoreTiming;
 }

+namespace Kernel {
+class WritableEvent;
+}
+
 namespace Core::Memory {
 class Memory;
 }
@@ -40,7 +44,8 @@ class AudioRenderer {
 public:
    AudioRenderer(Core::Timing::CoreTiming& core_timing, Core::Memory::Memory& memory_,
                  AudioCommon::AudioRendererParameter params,
-                  Stream::ReleaseCallback&& release_callback, std::size_t instance_number);
+                  std::shared_ptr<Kernel::WritableEvent> buffer_event_,
+                  std::size_t instance_number);
    ~AudioRenderer();

    [[nodiscard]] ResultCode UpdateAudioRenderer(const std::vector<u8>& input_params,
@@ -56,6 +61,7 @@ private:
    BehaviorInfo behavior_info{};

    AudioCommon::AudioRendererParameter worker_params;
+    std::shared_ptr<Kernel::WritableEvent> buffer_event;
    std::vector<ServerMemoryPoolInfo> memory_pool_info;
    VoiceContext voice_context;
    EffectContext effect_context;
--- a/src/audio_core/stream.cpp
+++ b/src/audio_core/stream.cpp
@@ -130,11 +130,7 @@ bool Stream::ContainsBuffer([[maybe_unused]] Buffer::Tag tag) const {
 std::vector<Buffer::Tag> Stream::GetTagsAndReleaseBuffers(std::size_t max_count) {
    std::vector<Buffer::Tag> tags;
    for (std::size_t count = 0; count < max_count && !released_buffers.empty(); ++count) {
-        if (released_buffers.front()) {
-            tags.push_back(released_buffers.front()->GetTag());
-        } else {
-            ASSERT_MSG(false, "Invalid tag in released_buffers!");
-        }
+        tags.push_back(released_buffers.front()->GetTag());
        released_buffers.pop();
    }
    return tags;
@@ -144,11 +140,7 @@ std::vector<Buffer::Tag> Stream::GetTagsAndReleaseBuffers() {
    std::vector<Buffer::Tag> tags;
    tags.reserve(released_buffers.size());
    while (!released_buffers.empty()) {
-        if (released_buffers.front()) {
-            tags.push_back(released_buffers.front()->GetTag());
-        } else {
-            ASSERT_MSG(false, "Invalid tag in released_buffers!");
-        }
+        tags.push_back(released_buffers.front()->GetTag());
        released_buffers.pop();
    }
    return tags;
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -162,8 +162,6 @@ add_library(common STATIC
    thread.cpp
    thread.h
    thread_queue_list.h
-    thread_worker.cpp
-    thread_worker.h
    threadsafe_queue.h
    time_zone.cpp
    time_zone.h
--- a/src/common/concepts.h
+++ b/src/common/concepts.h
@@ -31,4 +31,8 @@ concept DerivedFrom = requires {
    std::is_convertible_v<const volatile Derived*, const volatile Base*>;
 };

+// TODO: Replace with std::convertible_to when libc++ implements it.
+template <typename From, typename To>
+concept ConvertibleTo = std::is_convertible_v<From, To>;
+
 } // namespace Common
--- a/src/common/thread_worker.cpp
+++ b/src/common/thread_worker.cpp
@@ -1,58 +0,0 @@
-// Copyright 2020 yuzu emulator team
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#include "common/thread.h"
-#include "common/thread_worker.h"
-
-namespace Common {
-
-ThreadWorker::ThreadWorker(std::size_t num_workers, const std::string& name) {
-    for (std::size_t i = 0; i < num_workers; ++i)
-        threads.emplace_back([this, thread_name{std::string{name}}] {
-            Common::SetCurrentThreadName(thread_name.c_str());
-
-            // Wait for first request
-            {
-                std::unique_lock lock{queue_mutex};
-                condition.wait(lock, [this] { return stop || !requests.empty(); });
-            }
-
-            while (true) {
-                std::function<void()> task;
-
-                {
-                    std::unique_lock lock{queue_mutex};
-                    condition.wait(lock, [this] { return stop || !requests.empty(); });
-                    if (stop || requests.empty()) {
-                        return;
-                    }
-                    task = std::move(requests.front());
-                    requests.pop();
-                }
-
-                task();
-            }
-        });
-}
-
-ThreadWorker::~ThreadWorker() {
-    {
-        std::unique_lock lock{queue_mutex};
-        stop = true;
-    }
-    condition.notify_all();
-    for (std::thread& thread : threads) {
-        thread.join();
-    }
-}
-
-void ThreadWorker::QueueWork(std::function<void()>&& work) {
-    {
-        std::unique_lock lock{queue_mutex};
-        requests.emplace(work);
-    }
-    condition.notify_one();
-}
-
-} // namespace Common
--- a/src/common/thread_worker.h
+++ b/src/common/thread_worker.h
@@ -1,30 +0,0 @@
-// Copyright 2020 yuzu emulator team
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-#include <atomic>
-#include <functional>
-#include <mutex>
-#include <string>
-#include <vector>
-#include <queue>
-
-namespace Common {
-
-class ThreadWorker final {
-public:
-    explicit ThreadWorker(std::size_t num_workers, const std::string& name);
-    ~ThreadWorker();
-    void QueueWork(std::function<void()>&& work);
-
-private:
-    std::vector<std::thread> threads;
-    std::queue<std::function<void()>> requests;
-    std::mutex queue_mutex;
-    std::condition_variable condition;
-    std::atomic_bool stop{};
-};
-
-} // namespace Common
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -202,8 +202,6 @@ add_library(core STATIC
    hle/kernel/server_port.h
    hle/kernel/server_session.cpp
    hle/kernel/server_session.h
-    hle/kernel/service_thread.cpp
-    hle/kernel/service_thread.h
    hle/kernel/session.cpp
    hle/kernel/session.h
    hle/kernel/shared_memory.cpp
@@ -502,6 +500,7 @@ add_library(core STATIC
    hle/service/sm/controller.h
    hle/service/sm/sm.cpp
    hle/service/sm/sm.h
+    hle/service/sockets/blocking_worker.h
    hle/service/sockets/bsd.cpp
    hle/service/sockets/bsd.h
    hle/service/sockets/ethc.cpp
--- a/src/core/core.cpp
+++ b/src/core/core.cpp
@@ -159,7 +159,7 @@ struct System::Impl {
        device_memory = std::make_unique<Core::DeviceMemory>();

        is_multicore = Settings::values.use_multi_core.GetValue();
-        is_async_gpu = Settings::values.use_asynchronous_gpu_emulation.GetValue();
+        is_async_gpu = is_multicore || Settings::values.use_asynchronous_gpu_emulation.GetValue();

        kernel.SetMulticore(is_multicore);
        cpu_manager.SetMulticore(is_multicore);
@@ -307,6 +307,7 @@ struct System::Impl {
        service_manager.reset();
        cheat_engine.reset();
        telemetry_session.reset();
+        device_memory.reset();

        // Close all CPU/threading state
        cpu_manager.Shutdown();
--- a/src/core/hle/kernel/hle_ipc.cpp
+++ b/src/core/hle/kernel/hle_ipc.cpp
@@ -46,6 +46,43 @@ void SessionRequestHandler::ClientDisconnected(
    boost::range::remove_erase(connected_sessions, server_session);
 }

+std::shared_ptr<WritableEvent> HLERequestContext::SleepClientThread(
+    const std::string& reason, u64 timeout, WakeupCallback&& callback,
+    std::shared_ptr<WritableEvent> writable_event) {
+    // Put the client thread to sleep until the wait event is signaled or the timeout expires.
+
+    if (!writable_event) {
+        // Create event if not provided
+        const auto pair = WritableEvent::CreateEventPair(kernel, "HLE Pause Event: " + reason);
+        writable_event = pair.writable;
+    }
+
+    Handle event_handle = InvalidHandle;
+    {
+        KScopedSchedulerLockAndSleep lock(kernel, event_handle, thread.get(), timeout);
+        thread->SetHLECallback(
+            [context = *this, callback](std::shared_ptr<Thread> thread) mutable -> bool {
+                ThreadWakeupReason reason = thread->GetSignalingResult() == RESULT_TIMEOUT
+                                                ? ThreadWakeupReason::Timeout
+                                                : ThreadWakeupReason::Signal;
+                callback(thread, context, reason);
+                context.WriteToOutgoingCommandBuffer(*thread);
+                return true;
+            });
+        const auto readable_event{writable_event->GetReadableEvent()};
+        writable_event->Clear();
+        thread->SetHLESyncObject(readable_event.get());
+        thread->SetStatus(ThreadStatus::WaitHLEEvent);
+        thread->SetSynchronizationResults(nullptr, RESULT_TIMEOUT);
+        readable_event->AddWaitingThread(thread);
+    }
+    thread->SetHLETimeEvent(event_handle);
+
+    is_thread_waiting = true;
+
+    return writable_event;
+}
+
 HLERequestContext::HLERequestContext(KernelCore& kernel, Core::Memory::Memory& memory,
                                     std::shared_ptr<ServerSession> server_session,
                                     std::shared_ptr<Thread> thread)
--- a/src/core/hle/kernel/hle_ipc.h
+++ b/src/core/hle/kernel/hle_ipc.h
@@ -129,6 +129,23 @@ public:
    using WakeupCallback = std::function<void(
        std::shared_ptr<Thread> thread, HLERequestContext& context, ThreadWakeupReason reason)>;

+    /**
+     * Puts the specified guest thread to sleep until the returned event is signaled or until the
+     * specified timeout expires.
+     * @param reason Reason for pausing the thread, to be used for debugging purposes.
+     * @param timeout Timeout in nanoseconds after which the thread will be awoken and the callback
+     * invoked with a Timeout reason.
+     * @param callback Callback to be invoked when the thread is resumed. This callback must write
+     * the entire command response once again, regardless of the state of it before this function
+     * was called.
+     * @param writable_event Event to use to wake up the thread. If unspecified, an event will be
+     * created.
+     * @returns Event that when signaled will resume the thread and call the callback function.
+     */
+    std::shared_ptr<WritableEvent> SleepClientThread(
+        const std::string& reason, u64 timeout, WakeupCallback&& callback,
+        std::shared_ptr<WritableEvent> writable_event = nullptr);
+
    /// Populates this context with data from the requesting process/thread.
    ResultCode PopulateFromIncomingCommandBuffer(const HandleTable& handle_table,
                                                 u32_le* src_cmdbuf);
--- a/src/core/hle/kernel/k_priority_queue.h
+++ b/src/core/hle/kernel/k_priority_queue.h
@@ -8,11 +8,13 @@
 #pragma once

 #include <array>
+#include <concepts>

 #include "common/assert.h"
 #include "common/bit_set.h"
 #include "common/bit_util.h"
 #include "common/common_types.h"
+#include "common/concepts.h"

 namespace Kernel {

@@ -21,7 +23,7 @@ class Thread;
 template <typename T>
 concept KPriorityQueueAffinityMask = !std::is_reference_v<T> && requires(T & t) {
    { t.GetAffinityMask() }
-    ->std::convertible_to<u64>;
+    ->Common::ConvertibleTo<u64>;
    {t.SetAffinityMask(std::declval<u64>())};

    { t.GetAffinity(std::declval<int32_t>()) }
@@ -48,9 +50,9 @@ concept KPriorityQueueMember = !std::is_reference_v<T> && requires(T & t) {
    ->KPriorityQueueAffinityMask;

    { t.GetActiveCore() }
-    ->std::convertible_to<s32>;
+    ->Common::ConvertibleTo<s32>;
    { t.GetPriority() }
-    ->std::convertible_to<s32>;
+    ->Common::ConvertibleTo<s32>;
 };

 template <typename Member, size_t _NumCores, int LowestPriority, int HighestPriority>
--- a/src/core/hle/kernel/k_scheduler_lock.h
+++ b/src/core/hle/kernel/k_scheduler_lock.h
@@ -10,6 +10,7 @@
 #include "common/assert.h"
 #include "common/spin_lock.h"
 #include "core/hardware_properties.h"
+#include "core/hle/kernel/kernel.h"

 namespace Kernel {

--- a/src/core/hle/kernel/kernel.cpp
+++ b/src/core/hle/kernel/kernel.cpp
@@ -8,14 +8,13 @@
 #include <functional>
 #include <memory>
 #include <thread>
-#include <unordered_set>
+#include <unordered_map>
 #include <utility>

 #include "common/assert.h"
 #include "common/logging/log.h"
 #include "common/microprofile.h"
 #include "common/thread.h"
-#include "common/thread_worker.h"
 #include "core/arm/arm_interface.h"
 #include "core/arm/cpu_interrupt_handler.h"
 #include "core/arm/exclusive_monitor.h"
@@ -36,7 +35,6 @@
 #include "core/hle/kernel/physical_core.h"
 #include "core/hle/kernel/process.h"
 #include "core/hle/kernel/resource_limit.h"
-#include "core/hle/kernel/service_thread.h"
 #include "core/hle/kernel/shared_memory.h"
 #include "core/hle/kernel/synchronization.h"
 #include "core/hle/kernel/thread.h"
@@ -62,8 +60,6 @@ struct KernelCore::Impl {
        RegisterHostThread();

        global_scheduler_context = std::make_unique<Kernel::GlobalSchedulerContext>(kernel);
-        service_thread_manager =
-            std::make_unique<Common::ThreadWorker>(1, "yuzu:ServiceThreadManager");

        InitializePhysicalCores();
        InitializeSystemResourceLimit(kernel);
@@ -80,12 +76,6 @@ struct KernelCore::Impl {
    }

    void Shutdown() {
-        process_list.clear();
-
-        // Ensures all service threads gracefully shutdown
-        service_thread_manager.reset();
-        service_threads.clear();
-
        next_object_id = 0;
        next_kernel_process_id = Process::InitialKIPIDMin;
        next_user_process_id = Process::ProcessIDMin;
@@ -99,6 +89,8 @@ struct KernelCore::Impl {

        cores.clear();

+        process_list.clear();
+
        current_process = nullptr;

        system_resource_limit = nullptr;
@@ -111,8 +103,10 @@ struct KernelCore::Impl {

        exclusive_monitor.reset();

-        // Next host thead ID to use, 0-3 IDs represent core threads, >3 represent others
-        next_host_thread_id = Core::Hardware::NUM_CPU_CORES;
+        num_host_threads = 0;
+        std::fill(register_host_thread_keys.begin(), register_host_thread_keys.end(),
+                  std::thread::id{});
+        std::fill(register_host_thread_values.begin(), register_host_thread_values.end(), 0);
    }

    void InitializePhysicalCores() {
@@ -192,46 +186,52 @@ struct KernelCore::Impl {
        }
    }

-    /// Creates a new host thread ID, should only be called by GetHostThreadId
-    u32 AllocateHostThreadId(std::optional<std::size_t> core_id) {
-        if (core_id) {
-            // The first for slots are reserved for CPU core threads
-            ASSERT(*core_id < Core::Hardware::NUM_CPU_CORES);
-            return static_cast<u32>(*core_id);
-        } else {
-            return next_host_thread_id++;
-        }
-    }
-
-    /// Gets the host thread ID for the caller, allocating a new one if this is the first time
-    u32 GetHostThreadId(std::optional<std::size_t> core_id = std::nullopt) {
-        const thread_local auto host_thread_id{AllocateHostThreadId(core_id)};
-        return host_thread_id;
-    }
-
-    /// Registers a CPU core thread by allocating a host thread ID for it
    void RegisterCoreThread(std::size_t core_id) {
-        ASSERT(core_id < Core::Hardware::NUM_CPU_CORES);
-        const auto this_id = GetHostThreadId(core_id);
+        const std::thread::id this_id = std::this_thread::get_id();
        if (!is_multicore) {
            single_core_thread_id = this_id;
        }
+        const auto end =
+            register_host_thread_keys.begin() + static_cast<ptrdiff_t>(num_host_threads);
+        const auto it = std::find(register_host_thread_keys.begin(), end, this_id);
+        ASSERT(core_id < Core::Hardware::NUM_CPU_CORES);
+        ASSERT(it == end);
+        InsertHostThread(static_cast<u32>(core_id));
    }

-    /// Registers a new host thread by allocating a host thread ID for it
    void RegisterHostThread() {
-        [[maybe_unused]] const auto this_id = GetHostThreadId();
+        const std::thread::id this_id = std::this_thread::get_id();
+        const auto end =
+            register_host_thread_keys.begin() + static_cast<ptrdiff_t>(num_host_threads);
+        const auto it = std::find(register_host_thread_keys.begin(), end, this_id);
+        if (it == end) {
+            InsertHostThread(registered_thread_ids++);
+        }
    }

-    [[nodiscard]] u32 GetCurrentHostThreadID() {
-        const auto this_id = GetHostThreadId();
+    void InsertHostThread(u32 value) {
+        const size_t index = num_host_threads++;
+        ASSERT_MSG(index < NUM_REGISTRABLE_HOST_THREADS, "Too many host threads");
+        register_host_thread_values[index] = value;
+        register_host_thread_keys[index] = std::this_thread::get_id();
+    }
+
+    [[nodiscard]] u32 GetCurrentHostThreadID() const {
+        const std::thread::id this_id = std::this_thread::get_id();
        if (!is_multicore && single_core_thread_id == this_id) {
            return static_cast<u32>(system.GetCpuManager().CurrentCore());
        }
-        return this_id;
+        const auto end =
+            register_host_thread_keys.begin() + static_cast<ptrdiff_t>(num_host_threads);
+        const auto it = std::find(register_host_thread_keys.begin(), end, this_id);
+        if (it == end) {
+            return Core::INVALID_HOST_THREAD_ID;
+        }
+        return register_host_thread_values[static_cast<size_t>(
+            std::distance(register_host_thread_keys.begin(), it))];
    }

-    [[nodiscard]] Core::EmuThreadHandle GetCurrentEmuThreadID() {
+    Core::EmuThreadHandle GetCurrentEmuThreadID() const {
        Core::EmuThreadHandle result = Core::EmuThreadHandle::InvalidHandle();
        result.host_handle = GetCurrentHostThreadID();
        if (result.host_handle >= Core::Hardware::NUM_CPU_CORES) {
@@ -325,8 +325,15 @@ struct KernelCore::Impl {
    std::unique_ptr<Core::ExclusiveMonitor> exclusive_monitor;
    std::vector<Kernel::PhysicalCore> cores;

-    // Next host thead ID to use, 0-3 IDs represent core threads, >3 represent others
-    std::atomic<u32> next_host_thread_id{Core::Hardware::NUM_CPU_CORES};
+    // 0-3 IDs represent core threads, >3 represent others
+    std::atomic<u32> registered_thread_ids{Core::Hardware::NUM_CPU_CORES};
+
+    // Number of host threads is a relatively high number to avoid overflowing
+    static constexpr size_t NUM_REGISTRABLE_HOST_THREADS = 64;
+    std::atomic<size_t> num_host_threads{0};
+    std::array<std::atomic<std::thread::id>, NUM_REGISTRABLE_HOST_THREADS>
+        register_host_thread_keys{};
+    std::array<std::atomic<u32>, NUM_REGISTRABLE_HOST_THREADS> register_host_thread_values{};

    // Kernel memory management
    std::unique_ptr<Memory::MemoryManager> memory_manager;
@@ -338,19 +345,12 @@ struct KernelCore::Impl {
    std::shared_ptr<Kernel::SharedMemory> irs_shared_mem;
    std::shared_ptr<Kernel::SharedMemory> time_shared_mem;

-    // Threads used for services
-    std::unordered_set<std::shared_ptr<Kernel::ServiceThread>> service_threads;
-
-    // Service threads are managed by a worker thread, so that a calling service thread can queue up
-    // the release of itself
-    std::unique_ptr<Common::ThreadWorker> service_thread_manager;
-
    std::array<std::shared_ptr<Thread>, Core::Hardware::NUM_CPU_CORES> suspend_threads{};
    std::array<Core::CPUInterruptHandler, Core::Hardware::NUM_CPU_CORES> interrupts{};
    std::array<std::unique_ptr<Kernel::KScheduler>, Core::Hardware::NUM_CPU_CORES> schedulers{};

    bool is_multicore{};
-    u32 single_core_thread_id{};
+    std::thread::id single_core_thread_id{};

    std::array<u64, Core::Hardware::NUM_CPU_CORES> svc_ticks{};

@@ -639,19 +639,4 @@ void KernelCore::ExitSVCProfile() {
    MicroProfileLeave(MICROPROFILE_TOKEN(Kernel_SVC), impl->svc_ticks[core]);
 }

-std::weak_ptr<Kernel::ServiceThread> KernelCore::CreateServiceThread(const std::string& name) {
-    auto service_thread = std::make_shared<Kernel::ServiceThread>(*this, 1, name);
-    impl->service_thread_manager->QueueWork(
-        [this, service_thread] { impl->service_threads.emplace(service_thread); });
-    return service_thread;
-}
-
-void KernelCore::ReleaseServiceThread(std::weak_ptr<Kernel::ServiceThread> service_thread) {
-    impl->service_thread_manager->QueueWork([this, service_thread] {
-        if (auto strong_ptr = service_thread.lock()) {
-            impl->service_threads.erase(strong_ptr);
-        }
-    });
-}
-
 } // namespace Kernel
--- a/src/core/hle/kernel/kernel.h
+++ b/src/core/hle/kernel/kernel.h
@@ -42,7 +42,6 @@ class Process;
 class ResourceLimit;
 class KScheduler;
 class SharedMemory;
-class ServiceThread;
 class Synchronization;
 class Thread;
 class TimeManager;
@@ -228,22 +227,6 @@ public:

    void ExitSVCProfile();

-    /**
-     * Creates an HLE service thread, which are used to execute service routines asynchronously.
-     * While these are allocated per ServerSession, these need to be owned and managed outside of
-     * ServerSession to avoid a circular dependency.
-     * @param name String name for the ServerSession creating this thread, used for debug purposes.
-     * @returns The a weak pointer newly created service thread.
-     */
-    std::weak_ptr<Kernel::ServiceThread> CreateServiceThread(const std::string& name);
-
-    /**
-     * Releases a HLE service thread, instructing KernelCore to free it. This should be called when
-     * the ServerSession associated with the thread is destroyed.
-     * @param service_thread Service thread to release.
-     */
-    void ReleaseServiceThread(std::weak_ptr<Kernel::ServiceThread> service_thread);
-
 private:
    friend class Object;
    friend class Process;
--- a/src/core/hle/kernel/server_session.cpp
+++ b/src/core/hle/kernel/server_session.cpp
@@ -25,19 +25,19 @@
 namespace Kernel {

 ServerSession::ServerSession(KernelCore& kernel) : SynchronizationObject{kernel} {}
-
-ServerSession::~ServerSession() {
-    kernel.ReleaseServiceThread(service_thread);
-}
+ServerSession::~ServerSession() = default;

 ResultVal<std::shared_ptr<ServerSession>> ServerSession::Create(KernelCore& kernel,
                                                                std::shared_ptr<Session> parent,
                                                                std::string name) {
    std::shared_ptr<ServerSession> session{std::make_shared<ServerSession>(kernel)};

+    session->request_event =
+        Core::Timing::CreateEvent(name, [session](std::uintptr_t, std::chrono::nanoseconds) {
+            session->CompleteSyncRequest();
+        });
    session->name = std::move(name);
    session->parent = std::move(parent);
-    session->service_thread = kernel.CreateServiceThread(session->name);

    return MakeResult(std::move(session));
 }
@@ -142,16 +142,16 @@ ResultCode ServerSession::QueueSyncRequest(std::shared_ptr<Thread> thread,
        std::make_shared<HLERequestContext>(kernel, memory, SharedFrom(this), std::move(thread));

    context->PopulateFromIncomingCommandBuffer(kernel.CurrentProcess()->GetHandleTable(), cmd_buf);
-
-    if (auto strong_ptr = service_thread.lock()) {
-        strong_ptr->QueueSyncRequest(*this, std::move(context));
-        return RESULT_SUCCESS;
-    }
+    request_queue.Push(std::move(context));

    return RESULT_SUCCESS;
 }

-ResultCode ServerSession::CompleteSyncRequest(HLERequestContext& context) {
+ResultCode ServerSession::CompleteSyncRequest() {
+    ASSERT(!request_queue.Empty());
+
+    auto& context = *request_queue.Front();
+
    ResultCode result = RESULT_SUCCESS;
    // If the session has been converted to a domain, handle the domain request
    if (IsDomain() && context.HasDomainMessageHeader()) {
@@ -177,13 +177,18 @@ ResultCode ServerSession::CompleteSyncRequest(HLERequestContext& context) {
        }
    }

+    request_queue.Pop();
+
    return result;
 }

 ResultCode ServerSession::HandleSyncRequest(std::shared_ptr<Thread> thread,
                                            Core::Memory::Memory& memory,
                                            Core::Timing::CoreTiming& core_timing) {
-    return QueueSyncRequest(std::move(thread), memory);
+    const ResultCode result = QueueSyncRequest(std::move(thread), memory);
+    const auto delay = std::chrono::nanoseconds{kernel.IsMulticore() ? 0 : 20000};
+    core_timing.ScheduleEvent(delay, request_event, {});
+    return result;
 }

 } // namespace Kernel
--- a/src/core/hle/kernel/server_session.h
+++ b/src/core/hle/kernel/server_session.h
@@ -10,7 +10,6 @@
 #include <vector>

 #include "common/threadsafe_queue.h"
-#include "core/hle/kernel/service_thread.h"
 #include "core/hle/kernel/synchronization_object.h"
 #include "core/hle/result.h"

@@ -44,8 +43,6 @@ class Thread;
 * TLS buffer and control is transferred back to it.
 */
 class ServerSession final : public SynchronizationObject {
-    friend class ServiceThread;
-
 public:
    explicit ServerSession(KernelCore& kernel);
    ~ServerSession() override;
@@ -135,7 +132,7 @@ private:
    ResultCode QueueSyncRequest(std::shared_ptr<Thread> thread, Core::Memory::Memory& memory);

    /// Completes a sync request from the emulated application.
-    ResultCode CompleteSyncRequest(HLERequestContext& context);
+    ResultCode CompleteSyncRequest();

    /// Handles a SyncRequest to a domain, forwarding the request to the proper object or closing an
    /// object handle.
@@ -166,8 +163,11 @@ private:
    /// The name of this session (optional)
    std::string name;

-    /// Thread to dispatch service requests
-    std::weak_ptr<ServiceThread> service_thread;
+    /// Core timing event used to schedule the service request at some point in the future
+    std::shared_ptr<Core::Timing::EventType> request_event;
+
+    /// Queue of scheduled service requests
+    Common::MPSCQueue<std::shared_ptr<Kernel::HLERequestContext>> request_queue;
 };

 } // namespace Kernel
--- a/src/core/hle/kernel/service_thread.cpp
+++ b/src/core/hle/kernel/service_thread.cpp
@@ -1,110 +0,0 @@
-// Copyright 2020 yuzu emulator team
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#include <condition_variable>
-#include <functional>
-#include <mutex>
-#include <thread>
-#include <vector>
-#include <queue>
-
-#include "common/assert.h"
-#include "common/scope_exit.h"
-#include "common/thread.h"
-#include "core/core.h"
-#include "core/hle/kernel/kernel.h"
-#include "core/hle/kernel/server_session.h"
-#include "core/hle/kernel/service_thread.h"
-#include "core/hle/lock.h"
-#include "video_core/renderer_base.h"
-
-namespace Kernel {
-
-class ServiceThread::Impl final {
-public:
-    explicit Impl(KernelCore& kernel, std::size_t num_threads, const std::string& name);
-    ~Impl();
-
-    void QueueSyncRequest(ServerSession& session, std::shared_ptr<HLERequestContext>&& context);
-
-private:
-    std::vector<std::thread> threads;
-    std::queue<std::function<void()>> requests;
-    std::mutex queue_mutex;
-    std::condition_variable condition;
-    const std::string service_name;
-    bool stop{};
-};
-
-ServiceThread::Impl::Impl(KernelCore& kernel, std::size_t num_threads, const std::string& name)
-    : service_name{name} {
-    for (std::size_t i = 0; i < num_threads; ++i)
-        threads.emplace_back([this, &kernel] {
-            Common::SetCurrentThreadName(std::string{"yuzu:HleService:" + service_name}.c_str());
-
-            // Wait for first request before trying to acquire a render context
-            {
-                std::unique_lock lock{queue_mutex};
-                condition.wait(lock, [this] { return stop || !requests.empty(); });
-            }
-
-            kernel.RegisterHostThread();
-
-            while (true) {
-                std::function<void()> task;
-
-                {
-                    std::unique_lock lock{queue_mutex};
-                    condition.wait(lock, [this] { return stop || !requests.empty(); });
-                    if (stop || requests.empty()) {
-                        return;
-                    }
-                    task = std::move(requests.front());
-                    requests.pop();
-                }
-
-                task();
-            }
-        });
-}
-
-void ServiceThread::Impl::QueueSyncRequest(ServerSession& session,
-                                           std::shared_ptr<HLERequestContext>&& context) {
-    {
-        std::unique_lock lock{queue_mutex};
-
-        // ServerSession owns the service thread, so we cannot caption a strong pointer here in the
-        // event that the ServerSession is terminated.
-        std::weak_ptr<ServerSession> weak_ptr{SharedFrom(&session)};
-        requests.emplace([weak_ptr, context{std::move(context)}]() {
-            if (auto strong_ptr = weak_ptr.lock()) {
-                strong_ptr->CompleteSyncRequest(*context);
-            }
-        });
-    }
-    condition.notify_one();
-}
-
-ServiceThread::Impl::~Impl() {
-    {
-        std::unique_lock lock{queue_mutex};
-        stop = true;
-    }
-    condition.notify_all();
-    for (std::thread& thread : threads) {
-        thread.join();
-    }
-}
-
-ServiceThread::ServiceThread(KernelCore& kernel, std::size_t num_threads, const std::string& name)
-    : impl{std::make_unique<Impl>(kernel, num_threads, name)} {}
-
-ServiceThread::~ServiceThread() = default;
-
-void ServiceThread::QueueSyncRequest(ServerSession& session,
-                                     std::shared_ptr<HLERequestContext>&& context) {
-    impl->QueueSyncRequest(session, std::move(context));
-}
-
-} // namespace Kernel
--- a/src/core/hle/kernel/service_thread.h
+++ b/src/core/hle/kernel/service_thread.h
@@ -1,28 +0,0 @@
-// Copyright 2020 yuzu emulator team
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-#include <memory>
-#include <string>
-
-namespace Kernel {
-
-class HLERequestContext;
-class KernelCore;
-class ServerSession;
-
-class ServiceThread final {
-public:
-    explicit ServiceThread(KernelCore& kernel, std::size_t num_threads, const std::string& name);
-    ~ServiceThread();
-
-    void QueueSyncRequest(ServerSession& session, std::shared_ptr<HLERequestContext>&& context);
-
-private:
-    class Impl;
-    std::unique_ptr<Impl> impl;
-};
-
-} // namespace Kernel
--- a/src/core/hle/kernel/svc.cpp
+++ b/src/core/hle/kernel/svc.cpp
@@ -1583,7 +1583,7 @@ static void ExitThread32(Core::System& system) {

 /// Sleep the current thread
 static void SleepThread(Core::System& system, s64 nanoseconds) {
-    LOG_DEBUG(Kernel_SVC, "called nanoseconds={}", nanoseconds);
+    LOG_TRACE(Kernel_SVC, "called nanoseconds={}", nanoseconds);

    enum class SleepType : s64 {
        YieldWithoutCoreMigration = 0,
--- a/src/core/hle/service/audio/audout_u.cpp
+++ b/src/core/hle/service/audio/audout_u.cpp
@@ -70,10 +70,8 @@ public:
            Kernel::WritableEvent::CreateEventPair(system.Kernel(), "IAudioOutBufferReleased");

        stream = audio_core.OpenStream(system.CoreTiming(), audio_params.sample_rate,
-                                       audio_params.channel_count, std::move(unique_name), [this] {
-                                           const auto guard = LockService();
-                                           buffer_event.writable->Signal();
-                                       });
+                                       audio_params.channel_count, std::move(unique_name),
+                                       [this] { buffer_event.writable->Signal(); });
    }

 private:
--- a/src/core/hle/service/audio/audren_u.cpp
+++ b/src/core/hle/service/audio/audren_u.cpp
@@ -49,16 +49,16 @@ public:

        system_event =
            Kernel::WritableEvent::CreateEventPair(system.Kernel(), "IAudioRenderer:SystemEvent");
-        renderer = std::make_unique<AudioCore::AudioRenderer>(
-            system.CoreTiming(), system.Memory(), audren_params,
-            [this]() {
-                const auto guard = LockService();
-                system_event.writable->Signal();
-            },
-            instance_number);
+        renderer = std::make_unique<AudioCore::AudioRenderer>(system.CoreTiming(), system.Memory(),
+                                                              audren_params, system_event.writable,
+                                                              instance_number);
    }

 private:
+    void UpdateAudioCallback() {
+        system_event.writable->Signal();
+    }
+
    void GetSampleRate(Kernel::HLERequestContext& ctx) {
        LOG_DEBUG(Service_Audio, "called");

--- a/src/core/hle/service/hid/hid.cpp
+++ b/src/core/hle/service/hid/hid.cpp
@@ -78,13 +78,11 @@ IAppletResource::IAppletResource(Core::System& system_)
    pad_update_event = Core::Timing::CreateEvent(
        "HID::UpdatePadCallback",
        [this](std::uintptr_t user_data, std::chrono::nanoseconds ns_late) {
-            const auto guard = LockService();
            UpdateControllers(user_data, ns_late);
        });
    motion_update_event = Core::Timing::CreateEvent(
        "HID::MotionPadCallback",
        [this](std::uintptr_t user_data, std::chrono::nanoseconds ns_late) {
-            const auto guard = LockService();
            UpdateMotion(user_data, ns_late);
        });

--- a/src/core/hle/service/nvdrv/devices/nvdevice.h
+++ b/src/core/hle/service/nvdrv/devices/nvdevice.h
@@ -31,8 +31,8 @@ public:
     * @param output A buffer where the output data will be written to.
     * @returns The result code of the ioctl.
     */
-    virtual NvResult Ioctl1(Ioctl command, const std::vector<u8>& input,
-                            std::vector<u8>& output) = 0;
+    virtual NvResult Ioctl1(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+                            IoctlCtrl& ctrl) = 0;

    /**
     * Handles an ioctl2 request.
@@ -43,7 +43,8 @@ public:
     * @returns The result code of the ioctl.
     */
    virtual NvResult Ioctl2(Ioctl command, const std::vector<u8>& input,
-                            const std::vector<u8>& inline_input, std::vector<u8>& output) = 0;
+                            const std::vector<u8>& inline_input, std::vector<u8>& output,
+                            IoctlCtrl& ctrl) = 0;

    /**
     * Handles an ioctl3 request.
@@ -54,7 +55,7 @@ public:
     * @returns The result code of the ioctl.
     */
    virtual NvResult Ioctl3(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
-                            std::vector<u8>& inline_output) = 0;
+                            std::vector<u8>& inline_output, IoctlCtrl& ctrl) = 0;

 protected:
    Core::System& system;
--- a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp
@@ -18,20 +18,21 @@ nvdisp_disp0::nvdisp_disp0(Core::System& system, std::shared_ptr<nvmap> nvmap_de
    : nvdevice(system), nvmap_dev(std::move(nvmap_dev)) {}
 nvdisp_disp0 ::~nvdisp_disp0() = default;

-NvResult nvdisp_disp0::Ioctl1(Ioctl command, const std::vector<u8>& input,
-                              std::vector<u8>& output) {
+NvResult nvdisp_disp0::Ioctl1(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+                              IoctlCtrl& ctrl) {
    UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw);
    return NvResult::NotImplemented;
 }

 NvResult nvdisp_disp0::Ioctl2(Ioctl command, const std::vector<u8>& input,
-                              const std::vector<u8>& inline_input, std::vector<u8>& output) {
+                              const std::vector<u8>& inline_input, std::vector<u8>& output,
+                              IoctlCtrl& ctrl) {
    UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw);
    return NvResult::NotImplemented;
 }

 NvResult nvdisp_disp0::Ioctl3(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
-                              std::vector<u8>& inline_output) {
+                              std::vector<u8>& inline_output, IoctlCtrl& ctrl) {
    UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw);
    return NvResult::NotImplemented;
 }
--- a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.h
+++ b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.h
@@ -20,11 +20,13 @@ public:
    explicit nvdisp_disp0(Core::System& system, std::shared_ptr<nvmap> nvmap_dev);
    ~nvdisp_disp0() override;

-    NvResult Ioctl1(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override;
+    NvResult Ioctl1(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+                    IoctlCtrl& ctrl) override;
    NvResult Ioctl2(Ioctl command, const std::vector<u8>& input,
-                    const std::vector<u8>& inline_input, std::vector<u8>& output) override;
+                    const std::vector<u8>& inline_input, std::vector<u8>& output,
+                    IoctlCtrl& ctrl) override;
    NvResult Ioctl3(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
-                    std::vector<u8>& inline_output) override;
+                    std::vector<u8>& inline_output, IoctlCtrl& ctrl) override;

    /// Performs a screen flip, drawing the buffer pointed to by the handle.
    void flip(u32 buffer_handle, u32 offset, u32 format, u32 width, u32 height, u32 stride,
--- a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp
@@ -21,8 +21,8 @@ nvhost_as_gpu::nvhost_as_gpu(Core::System& system, std::shared_ptr<nvmap> nvmap_
    : nvdevice(system), nvmap_dev(std::move(nvmap_dev)) {}
 nvhost_as_gpu::~nvhost_as_gpu() = default;

-NvResult nvhost_as_gpu::Ioctl1(Ioctl command, const std::vector<u8>& input,
-                               std::vector<u8>& output) {
+NvResult nvhost_as_gpu::Ioctl1(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+                               IoctlCtrl& ctrl) {
    switch (command.group) {
    case 'A':
        switch (command.cmd) {
@@ -55,13 +55,14 @@ NvResult nvhost_as_gpu::Ioctl1(Ioctl command, const std::vector<u8>& input,
 }

 NvResult nvhost_as_gpu::Ioctl2(Ioctl command, const std::vector<u8>& input,
-                               const std::vector<u8>& inline_input, std::vector<u8>& output) {
+                               const std::vector<u8>& inline_input, std::vector<u8>& output,
+                               IoctlCtrl& ctrl) {
    UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw);
    return NvResult::NotImplemented;
 }

 NvResult nvhost_as_gpu::Ioctl3(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
-                               std::vector<u8>& inline_output) {
+                               std::vector<u8>& inline_output, IoctlCtrl& ctrl) {
    switch (command.group) {
    case 'A':
        switch (command.cmd) {
--- a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.h
@@ -30,11 +30,13 @@ public:
    explicit nvhost_as_gpu(Core::System& system, std::shared_ptr<nvmap> nvmap_dev);
    ~nvhost_as_gpu() override;

-    NvResult Ioctl1(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override;
+    NvResult Ioctl1(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+                    IoctlCtrl& ctrl) override;
    NvResult Ioctl2(Ioctl command, const std::vector<u8>& input,
-                    const std::vector<u8>& inline_input, std::vector<u8>& output) override;
+                    const std::vector<u8>& inline_input, std::vector<u8>& output,
+                    IoctlCtrl& ctrl) override;
    NvResult Ioctl3(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
-                    std::vector<u8>& inline_output) override;
+                    std::vector<u8>& inline_output, IoctlCtrl& ctrl) override;

 private:
    class BufferMap final {
--- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl.cpp
@@ -20,7 +20,8 @@ nvhost_ctrl::nvhost_ctrl(Core::System& system, EventInterface& events_interface,
    : nvdevice(system), events_interface{events_interface}, syncpoint_manager{syncpoint_manager} {}
 nvhost_ctrl::~nvhost_ctrl() = default;

-NvResult nvhost_ctrl::Ioctl1(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) {
+NvResult nvhost_ctrl::Ioctl1(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+                             IoctlCtrl& ctrl) {
    switch (command.group) {
    case 0x0:
        switch (command.cmd) {
@@ -29,9 +30,9 @@ NvResult nvhost_ctrl::Ioctl1(Ioctl command, const std::vector<u8>& input, std::v
        case 0x1c:
            return IocCtrlClearEventWait(input, output);
        case 0x1d:
-            return IocCtrlEventWait(input, output, false);
+            return IocCtrlEventWait(input, output, false, ctrl);
        case 0x1e:
-            return IocCtrlEventWait(input, output, true);
+            return IocCtrlEventWait(input, output, true, ctrl);
        case 0x1f:
            return IocCtrlEventRegister(input, output);
        case 0x20:
@@ -47,13 +48,14 @@ NvResult nvhost_ctrl::Ioctl1(Ioctl command, const std::vector<u8>& input, std::v
 }

 NvResult nvhost_ctrl::Ioctl2(Ioctl command, const std::vector<u8>& input,
-                             const std::vector<u8>& inline_input, std::vector<u8>& output) {
+                             const std::vector<u8>& inline_input, std::vector<u8>& output,
+                             IoctlCtrl& ctrl) {
    UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw);
    return NvResult::NotImplemented;
 }

 NvResult nvhost_ctrl::Ioctl3(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
-                             std::vector<u8>& inline_outpu) {
+                             std::vector<u8>& inline_output, IoctlCtrl& ctrl) {
    UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw);
    return NvResult::NotImplemented;
 }
@@ -67,7 +69,7 @@ NvResult nvhost_ctrl::NvOsGetConfigU32(const std::vector<u8>& input, std::vector
 }

 NvResult nvhost_ctrl::IocCtrlEventWait(const std::vector<u8>& input, std::vector<u8>& output,
-                                       bool is_async) {
+                                       bool is_async, IoctlCtrl& ctrl) {
    IocCtrlEventWaitParams params{};
    std::memcpy(&params, input.data(), sizeof(params));
    LOG_DEBUG(Service_NVDRV, "syncpt_id={}, threshold={}, timeout={}, is_async={}",
@@ -139,6 +141,12 @@ NvResult nvhost_ctrl::IocCtrlEventWait(const std::vector<u8>& input, std::vector
        params.value |= event_id;
        event.event.writable->Clear();
        gpu.RegisterSyncptInterrupt(params.syncpt_id, target_value);
+        if (!is_async && ctrl.fresh_call) {
+            ctrl.must_delay = true;
+            ctrl.timeout = params.timeout;
+            ctrl.event_id = event_id;
+            return NvResult::Timeout;
+        }
        std::memcpy(output.data(), &params, sizeof(params));
        return NvResult::Timeout;
    }
--- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl.h
@@ -18,11 +18,13 @@ public:
                         SyncpointManager& syncpoint_manager);
    ~nvhost_ctrl() override;

-    NvResult Ioctl1(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override;
+    NvResult Ioctl1(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+                    IoctlCtrl& ctrl) override;
    NvResult Ioctl2(Ioctl command, const std::vector<u8>& input,
-                    const std::vector<u8>& inline_input, std::vector<u8>& output) override;
+                    const std::vector<u8>& inline_input, std::vector<u8>& output,
+                    IoctlCtrl& ctrl) override;
    NvResult Ioctl3(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
-                    std::vector<u8>& inline_output) override;
+                    std::vector<u8>& inline_output, IoctlCtrl& ctrl) override;

 private:
    struct IocSyncptReadParams {
@@ -121,7 +123,8 @@ private:
    static_assert(sizeof(IocCtrlEventKill) == 8, "IocCtrlEventKill is incorrect size");

    NvResult NvOsGetConfigU32(const std::vector<u8>& input, std::vector<u8>& output);
-    NvResult IocCtrlEventWait(const std::vector<u8>& input, std::vector<u8>& output, bool is_async);
+    NvResult IocCtrlEventWait(const std::vector<u8>& input, std::vector<u8>& output, bool is_async,
+                              IoctlCtrl& ctrl);
    NvResult IocCtrlEventRegister(const std::vector<u8>& input, std::vector<u8>& output);
    NvResult IocCtrlEventUnregister(const std::vector<u8>& input, std::vector<u8>& output);
    NvResult IocCtrlClearEventWait(const std::vector<u8>& input, std::vector<u8>& output);
--- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.cpp
@@ -16,7 +16,7 @@ nvhost_ctrl_gpu::nvhost_ctrl_gpu(Core::System& system) : nvdevice(system) {}
 nvhost_ctrl_gpu::~nvhost_ctrl_gpu() = default;

 NvResult nvhost_ctrl_gpu::Ioctl1(Ioctl command, const std::vector<u8>& input,
-                                 std::vector<u8>& output) {
+                                 std::vector<u8>& output, IoctlCtrl& ctrl) {
    switch (command.group) {
    case 'G':
        switch (command.cmd) {
@@ -48,13 +48,15 @@ NvResult nvhost_ctrl_gpu::Ioctl1(Ioctl command, const std::vector<u8>& input,
 }

 NvResult nvhost_ctrl_gpu::Ioctl2(Ioctl command, const std::vector<u8>& input,
-                                 const std::vector<u8>& inline_input, std::vector<u8>& output) {
+                                 const std::vector<u8>& inline_input, std::vector<u8>& output,
+                                 IoctlCtrl& ctrl) {
    UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw);
    return NvResult::NotImplemented;
 }

 NvResult nvhost_ctrl_gpu::Ioctl3(Ioctl command, const std::vector<u8>& input,
-                                 std::vector<u8>& output, std::vector<u8>& inline_output) {
+                                 std::vector<u8>& output, std::vector<u8>& inline_output,
+                                 IoctlCtrl& ctrl) {
    switch (command.group) {
    case 'G':
        switch (command.cmd) {
--- a/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_ctrl_gpu.h
@@ -16,11 +16,13 @@ public:
    explicit nvhost_ctrl_gpu(Core::System& system);
    ~nvhost_ctrl_gpu() override;

-    NvResult Ioctl1(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override;
+    NvResult Ioctl1(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+                    IoctlCtrl& ctrl) override;
    NvResult Ioctl2(Ioctl command, const std::vector<u8>& input,
-                    const std::vector<u8>& inline_input, std::vector<u8>& output) override;
+                    const std::vector<u8>& inline_input, std::vector<u8>& output,
+                    IoctlCtrl& ctrl) override;
    NvResult Ioctl3(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
-                    std::vector<u8>& inline_output) override;
+                    std::vector<u8>& inline_output, IoctlCtrl& ctrl) override;

 private:
    struct IoctlGpuCharacteristics {
--- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
@@ -23,7 +23,8 @@ nvhost_gpu::nvhost_gpu(Core::System& system, std::shared_ptr<nvmap> nvmap_dev,

 nvhost_gpu::~nvhost_gpu() = default;

-NvResult nvhost_gpu::Ioctl1(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) {
+NvResult nvhost_gpu::Ioctl1(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+                            IoctlCtrl& ctrl) {
    switch (command.group) {
    case 0x0:
        switch (command.cmd) {
@@ -75,7 +76,8 @@ NvResult nvhost_gpu::Ioctl1(Ioctl command, const std::vector<u8>& input, std::ve
 };

 NvResult nvhost_gpu::Ioctl2(Ioctl command, const std::vector<u8>& input,
-                            const std::vector<u8>& inline_input, std::vector<u8>& output) {
+                            const std::vector<u8>& inline_input, std::vector<u8>& output,
+                            IoctlCtrl& ctrl) {
    switch (command.group) {
    case 'H':
        switch (command.cmd) {
@@ -89,7 +91,7 @@ NvResult nvhost_gpu::Ioctl2(Ioctl command, const std::vector<u8>& input,
 }

 NvResult nvhost_gpu::Ioctl3(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
-                            std::vector<u8>& inline_output) {
+                            std::vector<u8>& inline_output, IoctlCtrl& ctrl) {
    UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw);
    return NvResult::NotImplemented;
 }
--- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.h
@@ -26,11 +26,13 @@ public:
                        SyncpointManager& syncpoint_manager);
    ~nvhost_gpu() override;

-    NvResult Ioctl1(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override;
+    NvResult Ioctl1(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+                    IoctlCtrl& ctrl) override;
    NvResult Ioctl2(Ioctl command, const std::vector<u8>& input,
-                    const std::vector<u8>& inline_input, std::vector<u8>& output) override;
+                    const std::vector<u8>& inline_input, std::vector<u8>& output,
+                    IoctlCtrl& ctrl) override;
    NvResult Ioctl3(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
-                    std::vector<u8>& inline_output) override;
+                    std::vector<u8>& inline_output, IoctlCtrl& ctrl) override;

 private:
    enum class CtxObjects : u32_le {
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp
@@ -15,8 +15,8 @@ nvhost_nvdec::nvhost_nvdec(Core::System& system, std::shared_ptr<nvmap> nvmap_de
    : nvhost_nvdec_common(system, std::move(nvmap_dev)) {}
 nvhost_nvdec::~nvhost_nvdec() = default;

-NvResult nvhost_nvdec::Ioctl1(Ioctl command, const std::vector<u8>& input,
-                              std::vector<u8>& output) {
+NvResult nvhost_nvdec::Ioctl1(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+                              IoctlCtrl& ctrl) {
    switch (command.group) {
    case 0x0:
        switch (command.cmd) {
@@ -58,13 +58,14 @@ NvResult nvhost_nvdec::Ioctl1(Ioctl command, const std::vector<u8>& input,
 }

 NvResult nvhost_nvdec::Ioctl2(Ioctl command, const std::vector<u8>& input,
-                              const std::vector<u8>& inline_input, std::vector<u8>& output) {
+                              const std::vector<u8>& inline_input, std::vector<u8>& output,
+                              IoctlCtrl& ctrl) {
    UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw);
    return NvResult::NotImplemented;
 }

 NvResult nvhost_nvdec::Ioctl3(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
-                              std::vector<u8>& inline_output) {
+                              std::vector<u8>& inline_output, IoctlCtrl& ctrl) {
    UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw);
    return NvResult::NotImplemented;
 }
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.h
@@ -14,11 +14,13 @@ public:
    explicit nvhost_nvdec(Core::System& system, std::shared_ptr<nvmap> nvmap_dev);
    ~nvhost_nvdec() override;

-    NvResult Ioctl1(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override;
+    NvResult Ioctl1(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+                    IoctlCtrl& ctrl) override;
    NvResult Ioctl2(Ioctl command, const std::vector<u8>& input,
-                    const std::vector<u8>& inline_input, std::vector<u8>& output) override;
+                    const std::vector<u8>& inline_input, std::vector<u8>& output,
+                    IoctlCtrl& ctrl) override;
    NvResult Ioctl3(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
-                    std::vector<u8>& inline_output) override;
+                    std::vector<u8>& inline_output, IoctlCtrl& ctrl) override;
 };

 } // namespace Service::Nvidia::Devices
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.cpp
@@ -13,8 +13,8 @@ namespace Service::Nvidia::Devices {
 nvhost_nvjpg::nvhost_nvjpg(Core::System& system) : nvdevice(system) {}
 nvhost_nvjpg::~nvhost_nvjpg() = default;

-NvResult nvhost_nvjpg::Ioctl1(Ioctl command, const std::vector<u8>& input,
-                              std::vector<u8>& output) {
+NvResult nvhost_nvjpg::Ioctl1(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+                              IoctlCtrl& ctrl) {
    switch (command.group) {
    case 'H':
        switch (command.cmd) {
@@ -33,13 +33,14 @@ NvResult nvhost_nvjpg::Ioctl1(Ioctl command, const std::vector<u8>& input,
 }

 NvResult nvhost_nvjpg::Ioctl2(Ioctl command, const std::vector<u8>& input,
-                              const std::vector<u8>& inline_input, std::vector<u8>& output) {
+                              const std::vector<u8>& inline_input, std::vector<u8>& output,
+                              IoctlCtrl& ctrl) {
    UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw);
    return NvResult::NotImplemented;
 }

 NvResult nvhost_nvjpg::Ioctl3(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
-                              std::vector<u8>& inline_output) {
+                              std::vector<u8>& inline_output, IoctlCtrl& ctrl) {
    UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw);
    return NvResult::NotImplemented;
 }
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvjpg.h
@@ -16,11 +16,13 @@ public:
    explicit nvhost_nvjpg(Core::System& system);
    ~nvhost_nvjpg() override;

-    NvResult Ioctl1(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override;
+    NvResult Ioctl1(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+                    IoctlCtrl& ctrl) override;
    NvResult Ioctl2(Ioctl command, const std::vector<u8>& input,
-                    const std::vector<u8>& inline_input, std::vector<u8>& output) override;
+                    const std::vector<u8>& inline_input, std::vector<u8>& output,
+                    IoctlCtrl& ctrl) override;
    NvResult Ioctl3(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
-                    std::vector<u8>& inline_output) override;
+                    std::vector<u8>& inline_output, IoctlCtrl& ctrl) override;

 private:
    struct IoctlSetNvmapFD {
--- a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp
@@ -15,7 +15,8 @@ nvhost_vic::nvhost_vic(Core::System& system, std::shared_ptr<nvmap> nvmap_dev)

 nvhost_vic::~nvhost_vic() = default;

-NvResult nvhost_vic::Ioctl1(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) {
+NvResult nvhost_vic::Ioctl1(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+                            IoctlCtrl& ctrl) {
    switch (command.group) {
    case 0x0:
        switch (command.cmd) {
@@ -50,13 +51,14 @@ NvResult nvhost_vic::Ioctl1(Ioctl command, const std::vector<u8>& input, std::ve
 }

 NvResult nvhost_vic::Ioctl2(Ioctl command, const std::vector<u8>& input,
-                            const std::vector<u8>& inline_input, std::vector<u8>& output) {
+                            const std::vector<u8>& inline_input, std::vector<u8>& output,
+                            IoctlCtrl& ctrl) {
    UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw);
    return NvResult::NotImplemented;
 }

 NvResult nvhost_vic::Ioctl3(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
-                            std::vector<u8>& inline_output) {
+                            std::vector<u8>& inline_output, IoctlCtrl& ctrl) {
    UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw);
    return NvResult::NotImplemented;
 }
--- a/src/core/hle/service/nvdrv/devices/nvhost_vic.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_vic.h
@@ -14,10 +14,12 @@ public:
    explicit nvhost_vic(Core::System& system, std::shared_ptr<nvmap> nvmap_dev);
    ~nvhost_vic();

-    NvResult Ioctl1(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override;
+    NvResult Ioctl1(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+                    IoctlCtrl& ctrl) override;
    NvResult Ioctl2(Ioctl command, const std::vector<u8>& input,
-                    const std::vector<u8>& inline_input, std::vector<u8>& output) override;
+                    const std::vector<u8>& inline_input, std::vector<u8>& output,
+                    IoctlCtrl& ctrl) override;
    NvResult Ioctl3(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
-                    std::vector<u8>& inline_output) override;
+                    std::vector<u8>& inline_output, IoctlCtrl& ctrl) override;
 };
 } // namespace Service::Nvidia::Devices
--- a/src/core/hle/service/nvdrv/devices/nvmap.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvmap.cpp
@@ -19,7 +19,8 @@ nvmap::nvmap(Core::System& system) : nvdevice(system) {

 nvmap::~nvmap() = default;

-NvResult nvmap::Ioctl1(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) {
+NvResult nvmap::Ioctl1(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+                       IoctlCtrl& ctrl) {
    switch (command.group) {
    case 0x1:
        switch (command.cmd) {
@@ -48,13 +49,14 @@ NvResult nvmap::Ioctl1(Ioctl command, const std::vector<u8>& input, std::vector<
 }

 NvResult nvmap::Ioctl2(Ioctl command, const std::vector<u8>& input,
-                       const std::vector<u8>& inline_input, std::vector<u8>& output) {
+                       const std::vector<u8>& inline_input, std::vector<u8>& output,
+                       IoctlCtrl& ctrl) {
    UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw);
    return NvResult::NotImplemented;
 }

 NvResult nvmap::Ioctl3(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
-                       std::vector<u8>& inline_output) {
+                       std::vector<u8>& inline_output, IoctlCtrl& ctrl) {
    UNIMPLEMENTED_MSG("Unimplemented ioctl={:08X}", command.raw);
    return NvResult::NotImplemented;
 }
--- a/src/core/hle/service/nvdrv/devices/nvmap.h
+++ b/src/core/hle/service/nvdrv/devices/nvmap.h
@@ -19,11 +19,13 @@ public:
    explicit nvmap(Core::System& system);
    ~nvmap() override;

-    NvResult Ioctl1(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output) override;
+    NvResult Ioctl1(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
+                    IoctlCtrl& ctrl) override;
    NvResult Ioctl2(Ioctl command, const std::vector<u8>& input,
-                    const std::vector<u8>& inline_input, std::vector<u8>& output) override;
+                    const std::vector<u8>& inline_input, std::vector<u8>& output,
+                    IoctlCtrl& ctrl) override;
    NvResult Ioctl3(Ioctl command, const std::vector<u8>& input, std::vector<u8>& output,
-                    std::vector<u8>& inline_output) override;
+                    std::vector<u8>& inline_output, IoctlCtrl& ctrl) override;

    /// Returns the allocated address of an nvmap object given its handle.
    VAddr GetObjectAddress(u32 handle) const;
--- a/src/core/hle/service/nvdrv/interface.cpp
+++ b/src/core/hle/service/nvdrv/interface.cpp
@@ -61,9 +61,32 @@ void NVDRV::Ioctl1(Kernel::HLERequestContext& ctx) {
    std::vector<u8> output_buffer(ctx.GetWriteBufferSize(0));
    const auto input_buffer = ctx.ReadBuffer(0);

-    const auto nv_result = nvdrv->Ioctl1(fd, command, input_buffer, output_buffer);
-    if (command.is_out != 0) {
-        ctx.WriteBuffer(output_buffer);
+    IoctlCtrl ctrl{};
+
+    const auto nv_result = nvdrv->Ioctl1(fd, command, input_buffer, output_buffer, ctrl);
+    if (ctrl.must_delay) {
+        ctrl.fresh_call = false;
+        ctx.SleepClientThread(
+            "NVServices::DelayedResponse", ctrl.timeout,
+            [=, this](std::shared_ptr<Kernel::Thread> thread, Kernel::HLERequestContext& ctx_,
+                      Kernel::ThreadWakeupReason reason) {
+                IoctlCtrl ctrl2{ctrl};
+                std::vector<u8> tmp_output = output_buffer;
+                const auto nv_result2 = nvdrv->Ioctl1(fd, command, input_buffer, tmp_output, ctrl2);
+
+                if (command.is_out != 0) {
+                    ctx.WriteBuffer(tmp_output);
+                }
+
+                IPC::ResponseBuilder rb{ctx_, 3};
+                rb.Push(RESULT_SUCCESS);
+                rb.PushEnum(nv_result2);
+            },
+            nvdrv->GetEventWriteable(ctrl.event_id));
+    } else {
+        if (command.is_out != 0) {
+            ctx.WriteBuffer(output_buffer);
+        }
    }

    IPC::ResponseBuilder rb{ctx, 3};
@@ -87,8 +110,36 @@ void NVDRV::Ioctl2(Kernel::HLERequestContext& ctx) {
    const auto input_inlined_buffer = ctx.ReadBuffer(1);
    std::vector<u8> output_buffer(ctx.GetWriteBufferSize(0));

+    IoctlCtrl ctrl{};
+
    const auto nv_result =
-        nvdrv->Ioctl2(fd, command, input_buffer, input_inlined_buffer, output_buffer);
+        nvdrv->Ioctl2(fd, command, input_buffer, input_inlined_buffer, output_buffer, ctrl);
+    if (ctrl.must_delay) {
+        ctrl.fresh_call = false;
+        ctx.SleepClientThread(
+            "NVServices::DelayedResponse", ctrl.timeout,
+            [=, this](std::shared_ptr<Kernel::Thread> thread, Kernel::HLERequestContext& ctx_,
+                      Kernel::ThreadWakeupReason reason) {
+                IoctlCtrl ctrl2{ctrl};
+                std::vector<u8> tmp_output = output_buffer;
+                const auto nv_result2 = nvdrv->Ioctl2(fd, command, input_buffer,
+                                                      input_inlined_buffer, tmp_output, ctrl2);
+
+                if (command.is_out != 0) {
+                    ctx.WriteBuffer(tmp_output);
+                }
+
+                IPC::ResponseBuilder rb{ctx_, 3};
+                rb.Push(RESULT_SUCCESS);
+                rb.PushEnum(nv_result2);
+            },
+            nvdrv->GetEventWriteable(ctrl.event_id));
+    } else {
+        if (command.is_out != 0) {
+            ctx.WriteBuffer(output_buffer);
+        }
+    }
+
    if (command.is_out != 0) {
        ctx.WriteBuffer(output_buffer);
    }
@@ -114,11 +165,36 @@ void NVDRV::Ioctl3(Kernel::HLERequestContext& ctx) {
    std::vector<u8> output_buffer(ctx.GetWriteBufferSize(0));
    std::vector<u8> output_buffer_inline(ctx.GetWriteBufferSize(1));

+    IoctlCtrl ctrl{};
    const auto nv_result =
-        nvdrv->Ioctl3(fd, command, input_buffer, output_buffer, output_buffer_inline);
-    if (command.is_out != 0) {
-        ctx.WriteBuffer(output_buffer, 0);
-        ctx.WriteBuffer(output_buffer_inline, 1);
+        nvdrv->Ioctl3(fd, command, input_buffer, output_buffer, output_buffer_inline, ctrl);
+    if (ctrl.must_delay) {
+        ctrl.fresh_call = false;
+        ctx.SleepClientThread(
+            "NVServices::DelayedResponse", ctrl.timeout,
+            [=, this](std::shared_ptr<Kernel::Thread> thread, Kernel::HLERequestContext& ctx_,
+                      Kernel::ThreadWakeupReason reason) {
+                IoctlCtrl ctrl2{ctrl}; // copy of ctrl, captured with fresh_call already false
+                std::vector<u8> tmp_output = output_buffer;
+                std::vector<u8> tmp_output2 = output_buffer_inline; // inline buffer (index 1)
+                const auto nv_result2 =
+                    nvdrv->Ioctl3(fd, command, input_buffer, tmp_output, tmp_output2, ctrl2);
+
+                if (command.is_out != 0) {
+                    ctx.WriteBuffer(tmp_output, 0);
+                    ctx.WriteBuffer(tmp_output2, 1);
+                }
+
+                IPC::ResponseBuilder rb{ctx_, 3};
+                rb.Push(RESULT_SUCCESS);
+                rb.PushEnum(nv_result2);
+            },
+            nvdrv->GetEventWriteable(ctrl.event_id));
+    } else {
+        if (command.is_out != 0) {
+            ctx.WriteBuffer(output_buffer, 0);
+            ctx.WriteBuffer(output_buffer_inline, 1);
+        }
    }

    IPC::ResponseBuilder rb{ctx, 3};
--- a/src/core/hle/service/nvdrv/nvdata.h
+++ b/src/core/hle/service/nvdrv/nvdata.h
@@ -97,4 +97,15 @@ union Ioctl {
    BitField<31, 1, u32> is_out;
 };

+struct IoctlCtrl {
+    // True on the first call to the service; cleared before the call is issued again after a delay.
+    bool fresh_call{true};
+    // Tells the Ioctl wrapper that it must delay the IPC response and put the thread to sleep
+    bool must_delay{};
+    // Timeout for the delay
+    s64 timeout{};
+    // NV event ID
+    s32 event_id{-1};
+};
+
 } // namespace Service::Nvidia
--- a/src/core/hle/service/nvdrv/nvdrv.cpp
+++ b/src/core/hle/service/nvdrv/nvdrv.cpp
@@ -91,7 +91,7 @@ DeviceFD Module::Open(const std::string& device_name) {
 }

 NvResult Module::Ioctl1(DeviceFD fd, Ioctl command, const std::vector<u8>& input,
-                        std::vector<u8>& output) {
+                        std::vector<u8>& output, IoctlCtrl& ctrl) {
    if (fd < 0) {
        LOG_ERROR(Service_NVDRV, "Invalid DeviceFD={}!", fd);
        return NvResult::InvalidState;
@@ -104,11 +104,12 @@ NvResult Module::Ioctl1(DeviceFD fd, Ioctl command, const std::vector<u8>& input
        return NvResult::NotImplemented;
    }

-    return itr->second->Ioctl1(command, input, output);
+    return itr->second->Ioctl1(command, input, output, ctrl);
 }

 NvResult Module::Ioctl2(DeviceFD fd, Ioctl command, const std::vector<u8>& input,
-                        const std::vector<u8>& inline_input, std::vector<u8>& output) {
+                        const std::vector<u8>& inline_input, std::vector<u8>& output,
+                        IoctlCtrl& ctrl) {
    if (fd < 0) {
        LOG_ERROR(Service_NVDRV, "Invalid DeviceFD={}!", fd);
        return NvResult::InvalidState;
@@ -121,11 +122,11 @@ NvResult Module::Ioctl2(DeviceFD fd, Ioctl command, const std::vector<u8>& input
        return NvResult::NotImplemented;
    }

-    return itr->second->Ioctl2(command, input, inline_input, output);
+    return itr->second->Ioctl2(command, input, inline_input, output, ctrl);
 }

 NvResult Module::Ioctl3(DeviceFD fd, Ioctl command, const std::vector<u8>& input,
-                        std::vector<u8>& output, std::vector<u8>& inline_output) {
+                        std::vector<u8>& output, std::vector<u8>& inline_output, IoctlCtrl& ctrl) {
    if (fd < 0) {
        LOG_ERROR(Service_NVDRV, "Invalid DeviceFD={}!", fd);
        return NvResult::InvalidState;
@@ -138,7 +139,7 @@ NvResult Module::Ioctl3(DeviceFD fd, Ioctl command, const std::vector<u8>& input
        return NvResult::NotImplemented;
    }

-    return itr->second->Ioctl3(command, input, output, inline_output);
+    return itr->second->Ioctl3(command, input, output, inline_output, ctrl);
 }

 NvResult Module::Close(DeviceFD fd) {
--- a/src/core/hle/service/nvdrv/nvdrv.h
+++ b/src/core/hle/service/nvdrv/nvdrv.h
@@ -119,13 +119,13 @@ public:

    /// Sends an ioctl command to the specified file descriptor.
    NvResult Ioctl1(DeviceFD fd, Ioctl command, const std::vector<u8>& input,
-                    std::vector<u8>& output);
+                    std::vector<u8>& output, IoctlCtrl& ctrl);

    NvResult Ioctl2(DeviceFD fd, Ioctl command, const std::vector<u8>& input,
-                    const std::vector<u8>& inline_input, std::vector<u8>& output);
+                    const std::vector<u8>& inline_input, std::vector<u8>& output, IoctlCtrl& ctrl);

    NvResult Ioctl3(DeviceFD fd, Ioctl command, const std::vector<u8>& input,
-                    std::vector<u8>& output, std::vector<u8>& inline_output);
+                    std::vector<u8>& output, std::vector<u8>& inline_output, IoctlCtrl& ctrl);

    /// Closes a device file descriptor and returns operation success.
    NvResult Close(DeviceFD fd);
--- a/src/core/hle/service/nvflinger/buffer_queue.cpp
+++ b/src/core/hle/service/nvflinger/buffer_queue.cpp
@@ -25,12 +25,7 @@ void BufferQueue::SetPreallocatedBuffer(u32 slot, const IGBPBuffer& igbp_buffer)
    ASSERT(slot < buffer_slots);
    LOG_WARNING(Service, "Adding graphics buffer {}", slot);

-    {
-        std::unique_lock lock{queue_mutex};
-        free_buffers.push_back(slot);
-    }
-    condition.notify_one();
-
+    free_buffers.push_back(slot);
    buffers[slot] = {
        .slot = slot,
        .status = Buffer::Status::Free,
@@ -46,20 +41,10 @@ void BufferQueue::SetPreallocatedBuffer(u32 slot, const IGBPBuffer& igbp_buffer)

 std::optional<std::pair<u32, Service::Nvidia::MultiFence*>> BufferQueue::DequeueBuffer(u32 width,
                                                                                       u32 height) {
-    // Wait for first request before trying to dequeue
-    {
-        std::unique_lock lock{queue_mutex};
-        condition.wait(lock, [this] { return !free_buffers.empty() || !is_connect; });
-    }

-    if (!is_connect) {
-        // Buffer was disconnected while the thread was blocked, this is most likely due to
-        // emulation being stopped
+    if (free_buffers.empty()) {
        return std::nullopt;
    }
-
-    std::unique_lock lock{queue_mutex};
-
    auto f_itr = free_buffers.begin();
    auto slot = buffers.size();

@@ -112,11 +97,7 @@ void BufferQueue::CancelBuffer(u32 slot, const Service::Nvidia::MultiFence& mult
    buffers[slot].multi_fence = multi_fence;
    buffers[slot].swap_interval = 0;

-    {
-        std::unique_lock lock{queue_mutex};
-        free_buffers.push_back(slot);
-    }
-    condition.notify_one();
+    free_buffers.push_back(slot);

    buffer_wait_event.writable->Signal();
 }
@@ -146,28 +127,15 @@ void BufferQueue::ReleaseBuffer(u32 slot) {
    ASSERT(buffers[slot].slot == slot);

    buffers[slot].status = Buffer::Status::Free;
-    {
-        std::unique_lock lock{queue_mutex};
-        free_buffers.push_back(slot);
-    }
-    condition.notify_one();
+    free_buffers.push_back(slot);

    buffer_wait_event.writable->Signal();
 }

-void BufferQueue::Connect() {
-    queue_sequence.clear();
-    id = 1;
-    layer_id = 1;
-    is_connect = true;
-}
-
 void BufferQueue::Disconnect() {
    buffers.fill({});
    queue_sequence.clear();
    buffer_wait_event.writable->Signal();
-    is_connect = false;
-    condition.notify_one();
 }

 u32 BufferQueue::Query(QueryType type) {
--- a/src/core/hle/service/nvflinger/buffer_queue.h
+++ b/src/core/hle/service/nvflinger/buffer_queue.h
@@ -4,9 +4,7 @@

 #pragma once

-#include <condition_variable>
 #include <list>
-#include <mutex>
 #include <optional>
 #include <vector>

@@ -101,7 +99,6 @@ public:
    void CancelBuffer(u32 slot, const Service::Nvidia::MultiFence& multi_fence);
    std::optional<std::reference_wrapper<const Buffer>> AcquireBuffer();
    void ReleaseBuffer(u32 slot);
-    void Connect();
    void Disconnect();
    u32 Query(QueryType type);

@@ -109,28 +106,18 @@ public:
        return id;
    }

-    bool IsConnected() const {
-        return is_connect;
-    }
-
    std::shared_ptr<Kernel::WritableEvent> GetWritableBufferWaitEvent() const;

    std::shared_ptr<Kernel::ReadableEvent> GetBufferWaitEvent() const;

 private:
-    BufferQueue(const BufferQueue&) = delete;
-
-    u32 id{};
-    u64 layer_id{};
-    std::atomic_bool is_connect{};
+    u32 id;
+    u64 layer_id;

    std::list<u32> free_buffers;
    std::array<Buffer, buffer_slots> buffers;
    std::list<u32> queue_sequence;
    Kernel::EventPair buffer_wait_event;
-
-    std::mutex queue_mutex;
-    std::condition_variable condition;
 };

 } // namespace Service::NVFlinger
--- a/src/core/hle/service/nvflinger/nvflinger.cpp
+++ b/src/core/hle/service/nvflinger/nvflinger.cpp
@@ -88,10 +88,6 @@ NVFlinger::NVFlinger(Core::System& system) : system(system) {
 }

 NVFlinger::~NVFlinger() {
-    for (auto& buffer_queue : buffer_queues) {
-        buffer_queue->Disconnect();
-    }
-
    if (system.IsMulticore()) {
        is_running = false;
        wait_event->Set();
@@ -108,8 +104,6 @@ void NVFlinger::SetNVDrvInstance(std::shared_ptr<Nvidia::Module> instance) {
 }

 std::optional<u64> NVFlinger::OpenDisplay(std::string_view name) {
-    const auto guard = Lock();
-
    LOG_DEBUG(Service, "Opening \"{}\" display", name);

    // TODO(Subv): Currently we only support the Default display.
@@ -127,7 +121,6 @@ std::optional<u64> NVFlinger::OpenDisplay(std::string_view name) {
 }

 std::optional<u64> NVFlinger::CreateLayer(u64 display_id) {
-    const auto guard = Lock();
    auto* const display = FindDisplay(display_id);

    if (display == nullptr) {
@@ -136,22 +129,18 @@ std::optional<u64> NVFlinger::CreateLayer(u64 display_id) {

    const u64 layer_id = next_layer_id++;
    const u32 buffer_queue_id = next_buffer_queue_id++;
-    buffer_queues.emplace_back(
-        std::make_unique<BufferQueue>(system.Kernel(), buffer_queue_id, layer_id));
-    display->CreateLayer(layer_id, *buffer_queues.back());
+    buffer_queues.emplace_back(system.Kernel(), buffer_queue_id, layer_id);
+    display->CreateLayer(layer_id, buffer_queues.back());
    return layer_id;
 }

 void NVFlinger::CloseLayer(u64 layer_id) {
-    const auto guard = Lock();
-
    for (auto& display : displays) {
        display.CloseLayer(layer_id);
    }
 }

 std::optional<u32> NVFlinger::FindBufferQueueId(u64 display_id, u64 layer_id) const {
-    const auto guard = Lock();
    const auto* const layer = FindLayer(display_id, layer_id);

    if (layer == nullptr) {
@@ -162,7 +151,6 @@ std::optional<u32> NVFlinger::FindBufferQueueId(u64 display_id, u64 layer_id) co
 }

 std::shared_ptr<Kernel::ReadableEvent> NVFlinger::FindVsyncEvent(u64 display_id) const {
-    const auto guard = Lock();
    auto* const display = FindDisplay(display_id);

    if (display == nullptr) {
@@ -172,16 +160,20 @@ std::shared_ptr<Kernel::ReadableEvent> NVFlinger::FindVsyncEvent(u64 display_id)
    return display->GetVSyncEvent();
 }

-BufferQueue* NVFlinger::FindBufferQueue(u32 id) {
-    const auto guard = Lock();
+BufferQueue& NVFlinger::FindBufferQueue(u32 id) {
    const auto itr = std::find_if(buffer_queues.begin(), buffer_queues.end(),
-                                  [id](const auto& queue) { return queue->GetId() == id; });
+                                  [id](const auto& queue) { return queue.GetId() == id; });

-    if (itr == buffer_queues.end()) {
-        return nullptr;
-    }
+    ASSERT(itr != buffer_queues.end());
+    return *itr;
+}

-    return itr->get();
+const BufferQueue& NVFlinger::FindBufferQueue(u32 id) const {
+    const auto itr = std::find_if(buffer_queues.begin(), buffer_queues.end(),
+                                  [id](const auto& queue) { return queue.GetId() == id; });
+
+    ASSERT(itr != buffer_queues.end());
+    return *itr;
 }

 VI::Display* NVFlinger::FindDisplay(u64 display_id) {
--- a/src/core/hle/service/nvflinger/nvflinger.h
+++ b/src/core/hle/service/nvflinger/nvflinger.h
@@ -75,7 +75,10 @@ public:
    [[nodiscard]] std::shared_ptr<Kernel::ReadableEvent> FindVsyncEvent(u64 display_id) const;

    /// Obtains a buffer queue identified by the ID.
-    [[nodiscard]] BufferQueue* FindBufferQueue(u32 id);
+    [[nodiscard]] BufferQueue& FindBufferQueue(u32 id);
+
+    /// Obtains a buffer queue identified by the ID.
+    [[nodiscard]] const BufferQueue& FindBufferQueue(u32 id) const;

    /// Performs a composition request to the emulated nvidia GPU and triggers the vsync events when
    /// finished.
@@ -83,11 +86,11 @@ public:

    [[nodiscard]] s64 GetNextTicks() const;

-private:
    [[nodiscard]] std::unique_lock<std::mutex> Lock() const {
        return std::unique_lock{*guard};
    }

+private:
    /// Finds the display identified by the specified ID.
    [[nodiscard]] VI::Display* FindDisplay(u64 display_id);

@@ -107,7 +110,7 @@ private:
    std::shared_ptr<Nvidia::Module> nvdrv;

    std::vector<VI::Display> displays;
-    std::vector<std::unique_ptr<BufferQueue>> buffer_queues;
+    std::vector<BufferQueue> buffer_queues;

    /// Id to use for the next layer that is created, this counter is shared among all displays.
    u64 next_layer_id = 1;
--- a/src/core/hle/service/pcie/pcie.cpp
+++ b/src/core/hle/service/pcie/pcie.cpp
@@ -48,7 +48,7 @@ public:

 class PCIe final : public ServiceFramework<PCIe> {
 public:
-    explicit PCIe(Core::System& system_) : ServiceFramework{system, "pcie"} {
+    explicit PCIe(Core::System& system_) : ServiceFramework{system_, "pcie"} {
        // clang-format off
        static const FunctionInfo functions[] = {
            {0, nullptr, "RegisterClassDriver"},
--- a/src/core/hle/service/service.cpp
+++ b/src/core/hle/service/service.cpp
@@ -95,14 +95,9 @@ ServiceFrameworkBase::ServiceFrameworkBase(Core::System& system_, const char* se
    : system{system_}, service_name{service_name_}, max_sessions{max_sessions_},
      handler_invoker{handler_invoker_} {}

-ServiceFrameworkBase::~ServiceFrameworkBase() {
-    // Wait for other threads to release access before destroying
-    const auto guard = LockService();
-}
+ServiceFrameworkBase::~ServiceFrameworkBase() = default;

 void ServiceFrameworkBase::InstallAsService(SM::ServiceManager& service_manager) {
-    const auto guard = LockService();
-
    ASSERT(!port_installed);

    auto port = service_manager.RegisterService(service_name, max_sessions).Unwrap();
@@ -111,8 +106,6 @@ void ServiceFrameworkBase::InstallAsService(SM::ServiceManager& service_manager)
 }

 void ServiceFrameworkBase::InstallAsNamedPort(Kernel::KernelCore& kernel) {
-    const auto guard = LockService();
-
    ASSERT(!port_installed);

    auto [server_port, client_port] =
@@ -122,6 +115,17 @@ void ServiceFrameworkBase::InstallAsNamedPort(Kernel::KernelCore& kernel) {
    port_installed = true;
 }

+std::shared_ptr<Kernel::ClientPort> ServiceFrameworkBase::CreatePort(Kernel::KernelCore& kernel) {
+    ASSERT(!port_installed);
+
+    auto [server_port, client_port] =
+        Kernel::ServerPort::CreatePortPair(kernel, max_sessions, service_name);
+    auto port = MakeResult(std::move(server_port)).Unwrap();
+    port->SetHleHandler(shared_from_this());
+    port_installed = true;
+    return client_port;
+}
+
 void ServiceFrameworkBase::RegisterHandlersBase(const FunctionInfoBase* functions, std::size_t n) {
    handlers.reserve(handlers.size() + n);
    for (std::size_t i = 0; i < n; ++i) {
@@ -160,8 +164,6 @@ void ServiceFrameworkBase::InvokeRequest(Kernel::HLERequestContext& ctx) {
 }

 ResultCode ServiceFrameworkBase::HandleSyncRequest(Kernel::HLERequestContext& context) {
-    const auto guard = LockService();
-
    switch (context.GetCommandType()) {
    case IPC::CommandType::Close: {
        IPC::ResponseBuilder rb{context, 2};
@@ -182,11 +184,7 @@ ResultCode ServiceFrameworkBase::HandleSyncRequest(Kernel::HLERequestContext& co
        UNIMPLEMENTED_MSG("command_type={}", context.GetCommandType());
    }

-    // If emulation was shutdown, we are closing service threads, do not write the response back to
-    // memory that may be shutting down as well.
-    if (system.IsPoweredOn()) {
-        context.WriteToOutgoingCommandBuffer(context.GetThread());
-    }
+    context.WriteToOutgoingCommandBuffer(context.GetThread());

    return RESULT_SUCCESS;
 }
--- a/src/core/hle/service/service.h
+++ b/src/core/hle/service/service.h
@@ -5,11 +5,9 @@
 #pragma once

 #include <cstddef>
-#include <mutex>
 #include <string>
 #include <boost/container/flat_map.hpp>
 #include "common/common_types.h"
-#include "common/spin_lock.h"
 #include "core/hle/kernel/hle_ipc.h"
 #include "core/hle/kernel/object.h"

@@ -70,9 +68,11 @@ public:
    void InstallAsService(SM::ServiceManager& service_manager);
    /// Creates a port pair and registers it on the kernel's global port registry.
    void InstallAsNamedPort(Kernel::KernelCore& kernel);
-    /// Invokes a service request routine.
+    /// Creates and returns an unregistered port for the service.
+    std::shared_ptr<Kernel::ClientPort> CreatePort(Kernel::KernelCore& kernel);
+
    void InvokeRequest(Kernel::HLERequestContext& ctx);
-    /// Handles a synchronization request for the service.
+
    ResultCode HandleSyncRequest(Kernel::HLERequestContext& context) override;

 protected:
@@ -80,11 +80,6 @@ protected:
    template <typename Self>
    using HandlerFnP = void (Self::*)(Kernel::HLERequestContext&);

-    /// Used to gain exclusive access to the service members, e.g. from CoreTiming thread.
-    [[nodiscard]] std::scoped_lock<Common::SpinLock> LockService() {
-        return std::scoped_lock{lock_service};
-    }
-
    /// System context that the service operates under.
    Core::System& system;

@@ -120,9 +115,6 @@ private:
    /// Function used to safely up-cast pointers to the derived class before invoking a handler.
    InvokerFn* handler_invoker;
    boost::container::flat_map<u32, FunctionInfoBase> handlers;
-
-    /// Used to gain exclusive access to the service members, e.g. from CoreTiming thread.
-    Common::SpinLock lock_service;
 };

 /**
--- a/src/core/hle/service/sockets/blocking_worker.h
+++ b/src/core/hle/service/sockets/blocking_worker.h
@@ -0,0 +1,161 @@
+// Copyright 2020 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <atomic>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <thread>
+#include <variant>
+#include <vector>
+
+#include <fmt/format.h>
+
+#include "common/assert.h"
+#include "common/microprofile.h"
+#include "common/thread.h"
+#include "core/core.h"
+#include "core/hle/kernel/hle_ipc.h"
+#include "core/hle/kernel/kernel.h"
+#include "core/hle/kernel/thread.h"
+#include "core/hle/kernel/writable_event.h"
+
+namespace Service::Sockets {
+
+/**
+ * Worker abstraction to execute blocking calls on host without blocking the guest thread
+ *
+ * @tparam Service  Service where the work is executed
+ * @tparam Types Types of work to execute
+ */
+template <class Service, class... Types>
+class BlockingWorker {
+    using This = BlockingWorker<Service, Types...>;
+    using WorkVariant = std::variant<std::monostate, Types...>;
+
+public:
+    /// Create a new worker
+    static std::unique_ptr<This> Create(Core::System& system, Service* service,
+                                        std::string_view name) {
+        return std::unique_ptr<This>(new This(system, service, name));
+    }
+
+    ~BlockingWorker() {
+        while (!is_available.load(std::memory_order_relaxed)) {
+            // Busy wait until work is finished
+            std::this_thread::yield();
+        }
+        // Monostate means to exit the thread
+        work = std::monostate{};
+        work_event.Set();
+        thread.join();
+    }
+
+    /**
+     * Try to capture the worker; work may be sent to it only after this succeeds
+     * @returns True when the worker was available and is now captured by the caller
+     */
+    bool TryCapture() {
+        bool expected = true; // single attempt, so use the strong CAS: weak may fail spuriously
+        return is_available.compare_exchange_strong(expected, false, std::memory_order_relaxed,
+                                                    std::memory_order_relaxed);
+    }
+
+    /**
+     * Send work to this worker abstraction
+     * @see TryCapture must be called before attempting to call this function
+     */
+    template <class Work>
+    void SendWork(Work new_work) {
+        ASSERT_MSG(!is_available, "Trying to send work on a worker that's not captured");
+        work = std::move(new_work);
+        work_event.Set();
+    }
+
+    /// Generate a callback for @see SleepClientThread
+    template <class Work>
+    auto Callback() {
+        return [this](std::shared_ptr<Kernel::Thread>, Kernel::HLERequestContext& ctx,
+                      Kernel::ThreadWakeupReason reason) {
+            ASSERT(reason == Kernel::ThreadWakeupReason::Signal);
+            std::get<Work>(work).Response(ctx);
+            is_available.store(true);
+        };
+    }
+
+    /// Get kernel event that will be signalled by the worker when the host operation finishes
+    std::shared_ptr<Kernel::WritableEvent> KernelEvent() const {
+        return kernel_event;
+    }
+
+private:
+    explicit BlockingWorker(Core::System& system, Service* service, std::string_view name) {
+        std::string label{name}; // owned copy: `name` may dangle before the thread reads it
+        kernel_event = Kernel::WritableEvent::CreateEventPair(system.Kernel(), label).writable;
+        thread = std::thread([this, &system, service, label] { Run(system, service, label); });
+    }
+
+    void Run(Core::System& system, Service* service, std::string_view name) {
+        system.RegisterHostThread();
+
+        const std::string thread_name = fmt::format("yuzu:{}", name);
+        MicroProfileOnThreadCreate(thread_name.c_str());
+        Common::SetCurrentThreadName(thread_name.c_str());
+
+        bool keep_running = true;
+        while (keep_running) {
+            work_event.Wait();
+
+            const auto visit_fn = [service, &keep_running]<typename T>(T&& w) {
+                if constexpr (std::is_same_v<std::decay_t<T>, std::monostate>) {
+                    keep_running = false;
+                } else {
+                    w.Execute(service);
+                }
+            };
+            std::visit(visit_fn, work);
+
+            kernel_event->Signal();
+        }
+    }
+
+    std::thread thread;
+    WorkVariant work;
+    Common::Event work_event;
+    std::shared_ptr<Kernel::WritableEvent> kernel_event;
+    std::atomic_bool is_available{true};
+};
+
+template <class Service, class... Types>
+class BlockingWorkerPool {
+    using Worker = BlockingWorker<Service, Types...>;
+
+public:
+    explicit BlockingWorkerPool(Core::System& system_, Service* service_)
+        : system{system_}, service{service_} {}
+
+    /// Returns a captured worker thread, creating new ones if necessary
+    Worker* CaptureWorker() {
+        for (auto& worker : workers) {
+            if (worker->TryCapture()) {
+                return worker.get();
+            }
+        }
+        auto new_worker = Worker::Create(system, service, fmt::format("BSD:{}", workers.size()));
+        [[maybe_unused]] const bool success = new_worker->TryCapture();
+        ASSERT(success);
+
+        return workers.emplace_back(std::move(new_worker)).get();
+    }
+
+private:
+    Core::System& system;
+    Service* const service;
+
+    std::vector<std::unique_ptr<Worker>> workers;
+};
+
+} // namespace Service::Sockets
--- a/src/core/hle/service/sockets/bsd.cpp
+++ b/src/core/hle/service/sockets/bsd.cpp
@@ -178,12 +178,13 @@ void BSD::Poll(Kernel::HLERequestContext& ctx) {

    LOG_DEBUG(Service, "called. nfds={} timeout={}", nfds, timeout);

-    ExecuteWork(ctx, PollWork{
-                         .nfds = nfds,
-                         .timeout = timeout,
-                         .read_buffer = ctx.ReadBuffer(),
-                         .write_buffer = std::vector<u8>(ctx.GetWriteBufferSize()),
-                     });
+    ExecuteWork(ctx, "BSD:Poll", timeout != 0,
+                PollWork{
+                    .nfds = nfds,
+                    .timeout = timeout,
+                    .read_buffer = ctx.ReadBuffer(),
+                    .write_buffer = std::vector<u8>(ctx.GetWriteBufferSize()),
+                });
 }

 void BSD::Accept(Kernel::HLERequestContext& ctx) {
@@ -192,10 +193,11 @@ void BSD::Accept(Kernel::HLERequestContext& ctx) {

    LOG_DEBUG(Service, "called. fd={}", fd);

-    ExecuteWork(ctx, AcceptWork{
-                         .fd = fd,
-                         .write_buffer = std::vector<u8>(ctx.GetWriteBufferSize()),
-                     });
+    ExecuteWork(ctx, "BSD:Accept", IsBlockingSocket(fd),
+                AcceptWork{
+                    .fd = fd,
+                    .write_buffer = std::vector<u8>(ctx.GetWriteBufferSize()),
+                });
 }

 void BSD::Bind(Kernel::HLERequestContext& ctx) {
@@ -213,10 +215,11 @@ void BSD::Connect(Kernel::HLERequestContext& ctx) {

    LOG_DEBUG(Service, "called. fd={} addrlen={}", fd, ctx.GetReadBufferSize());

-    ExecuteWork(ctx, ConnectWork{
-                         .fd = fd,
-                         .addr = ctx.ReadBuffer(),
-                     });
+    ExecuteWork(ctx, "BSD:Connect", IsBlockingSocket(fd),
+                ConnectWork{
+                    .fd = fd,
+                    .addr = ctx.ReadBuffer(),
+                });
 }

 void BSD::GetPeerName(Kernel::HLERequestContext& ctx) {
@@ -324,11 +327,12 @@ void BSD::Recv(Kernel::HLERequestContext& ctx) {

    LOG_DEBUG(Service, "called. fd={} flags=0x{:x} len={}", fd, flags, ctx.GetWriteBufferSize());

-    ExecuteWork(ctx, RecvWork{
-                         .fd = fd,
-                         .flags = flags,
-                         .message = std::vector<u8>(ctx.GetWriteBufferSize()),
-                     });
+    ExecuteWork(ctx, "BSD:Recv", IsBlockingSocket(fd),
+                RecvWork{
+                    .fd = fd,
+                    .flags = flags,
+                    .message = std::vector<u8>(ctx.GetWriteBufferSize()),
+                });
 }

 void BSD::RecvFrom(Kernel::HLERequestContext& ctx) {
@@ -340,12 +344,13 @@ void BSD::RecvFrom(Kernel::HLERequestContext& ctx) {
    LOG_DEBUG(Service, "called. fd={} flags=0x{:x} len={} addrlen={}", fd, flags,
              ctx.GetWriteBufferSize(0), ctx.GetWriteBufferSize(1));

-    ExecuteWork(ctx, RecvFromWork{
-                         .fd = fd,
-                         .flags = flags,
-                         .message = std::vector<u8>(ctx.GetWriteBufferSize(0)),
-                         .addr = std::vector<u8>(ctx.GetWriteBufferSize(1)),
-                     });
+    ExecuteWork(ctx, "BSD:RecvFrom", IsBlockingSocket(fd),
+                RecvFromWork{
+                    .fd = fd,
+                    .flags = flags,
+                    .message = std::vector<u8>(ctx.GetWriteBufferSize(0)),
+                    .addr = std::vector<u8>(ctx.GetWriteBufferSize(1)),
+                });
 }

 void BSD::Send(Kernel::HLERequestContext& ctx) {
@@ -356,11 +361,12 @@ void BSD::Send(Kernel::HLERequestContext& ctx) {

    LOG_DEBUG(Service, "called. fd={} flags=0x{:x} len={}", fd, flags, ctx.GetReadBufferSize());

-    ExecuteWork(ctx, SendWork{
-                         .fd = fd,
-                         .flags = flags,
-                         .message = ctx.ReadBuffer(),
-                     });
+    ExecuteWork(ctx, "BSD:Send", IsBlockingSocket(fd),
+                SendWork{
+                    .fd = fd,
+                    .flags = flags,
+                    .message = ctx.ReadBuffer(),
+                });
 }

 void BSD::SendTo(Kernel::HLERequestContext& ctx) {
@@ -371,12 +377,13 @@ void BSD::SendTo(Kernel::HLERequestContext& ctx) {
    LOG_DEBUG(Service, "called. fd={} flags=0x{} len={} addrlen={}", fd, flags,
              ctx.GetReadBufferSize(0), ctx.GetReadBufferSize(1));

-    ExecuteWork(ctx, SendToWork{
-                         .fd = fd,
-                         .flags = flags,
-                         .message = ctx.ReadBuffer(0),
-                         .addr = ctx.ReadBuffer(1),
-                     });
+    ExecuteWork(ctx, "BSD:SendTo", IsBlockingSocket(fd),
+                SendToWork{
+                    .fd = fd,
+                    .flags = flags,
+                    .message = ctx.ReadBuffer(0),
+                    .addr = ctx.ReadBuffer(1),
+                });
 }

 void BSD::Write(Kernel::HLERequestContext& ctx) {
@@ -385,11 +392,12 @@ void BSD::Write(Kernel::HLERequestContext& ctx) {

    LOG_DEBUG(Service, "called. fd={} len={}", fd, ctx.GetReadBufferSize());

-    ExecuteWork(ctx, SendWork{
-                         .fd = fd,
-                         .flags = 0,
-                         .message = ctx.ReadBuffer(),
-                     });
+    ExecuteWork(ctx, "BSD:Write", IsBlockingSocket(fd),
+                SendWork{
+                    .fd = fd,
+                    .flags = 0,
+                    .message = ctx.ReadBuffer(),
+                });
 }

 void BSD::Close(Kernel::HLERequestContext& ctx) {
@@ -402,9 +410,24 @@ void BSD::Close(Kernel::HLERequestContext& ctx) {
 }

 template <typename Work>
-void BSD::ExecuteWork(Kernel::HLERequestContext& ctx, Work work) {
-    work.Execute(this);
+void BSD::ExecuteWork(Kernel::HLERequestContext& ctx, std::string_view sleep_reason,
+                      bool is_blocking, Work work) {
+    if (!is_blocking) {
+        work.Execute(this);
+        work.Response(ctx);
+        return;
+    }
+
+    // Signal a dummy response to make IPC validation happy
+    // This will be overwritten by the SleepClientThread callback
    work.Response(ctx);
+
+    auto worker = worker_pool.CaptureWorker();
+
+    ctx.SleepClientThread(std::string(sleep_reason), std::numeric_limits<u64>::max(),
+                          worker->Callback<Work>(), worker->KernelEvent());
+
+    worker->SendWork(std::move(work));
 }

 std::pair<s32, Errno> BSD::SocketImpl(Domain domain, Type type, Protocol protocol) {
@@ -784,6 +807,18 @@ bool BSD::IsFileDescriptorValid(s32 fd) const noexcept {
    return true;
 }

+// Returns true when fd refers to an open socket that does not have O_NONBLOCK set
+bool BSD::IsBlockingSocket(s32 fd) const noexcept {
+    // Inform invalid sockets as non-blocking
+    // This way we avoid using a worker thread as it will fail without blocking host
+    // file_descriptors has MAX_FD entries, so fd == MAX_FD must be rejected as well
+    if (fd < 0 || fd >= static_cast<s32>(MAX_FD) || !file_descriptors[fd]) {
+        return false;
+    }
+    // A socket is blocking unless O_NONBLOCK was set on it
+    return (file_descriptors[fd]->flags & FLAG_O_NONBLOCK) == 0;
+}
+
 void BSD::BuildErrnoResponse(Kernel::HLERequestContext& ctx, Errno bsd_errno) const noexcept {
    IPC::ResponseBuilder rb{ctx, 4};

@@ -792,7 +827,8 @@ void BSD::BuildErrnoResponse(Kernel::HLERequestContext& ctx, Errno bsd_errno) co
    rb.PushEnum(bsd_errno);
 }

-BSD::BSD(Core::System& system_, const char* name) : ServiceFramework{system_, name} {
+BSD::BSD(Core::System& system_, const char* name)
+    : ServiceFramework{system_, name}, worker_pool{system_, this} {
    // clang-format off
    static const FunctionInfo functions[] = {
        {0, &BSD::RegisterClient, "RegisterClient"},
--- a/src/core/hle/service/sockets/bsd.h
+++ b/src/core/hle/service/sockets/bsd.h
@@ -11,6 +11,7 @@
 #include "common/common_types.h"
 #include "core/hle/kernel/hle_ipc.h"
 #include "core/hle/service/service.h"
+#include "core/hle/service/sockets/blocking_worker.h"
 #include "core/hle/service/sockets/sockets.h"

 namespace Core {
@@ -137,7 +138,8 @@ private:
    void Close(Kernel::HLERequestContext& ctx);

    template <typename Work>
-    void ExecuteWork(Kernel::HLERequestContext& ctx, Work work);
+    void ExecuteWork(Kernel::HLERequestContext& ctx, std::string_view sleep_reason,
+                     bool is_blocking, Work work);

    std::pair<s32, Errno> SocketImpl(Domain domain, Type type, Protocol protocol);
    std::pair<s32, Errno> PollImpl(std::vector<u8>& write_buffer, std::vector<u8> read_buffer,
@@ -161,10 +163,15 @@ private:

    s32 FindFreeFileDescriptorHandle() noexcept;
    bool IsFileDescriptorValid(s32 fd) const noexcept;
+    bool IsBlockingSocket(s32 fd) const noexcept;

    void BuildErrnoResponse(Kernel::HLERequestContext& ctx, Errno bsd_errno) const noexcept;

    std::array<std::optional<FileDescriptor>, MAX_FD> file_descriptors;
+
+    BlockingWorkerPool<BSD, PollWork, AcceptWork, ConnectWork, RecvWork, RecvFromWork, SendWork,
+                       SendToWork>
+        worker_pool;
 };

 class BSDCFG final : public ServiceFramework<BSDCFG> {
--- a/src/core/hle/service/vi/vi.cpp
+++ b/src/core/hle/service/vi/vi.cpp
@@ -536,7 +536,8 @@ private:
        LOG_DEBUG(Service_VI, "called. id=0x{:08X} transaction={:X}, flags=0x{:08X}", id,
                  transaction, flags);

-        auto& buffer_queue = *nv_flinger.FindBufferQueue(id);
+        const auto guard = nv_flinger.Lock();
+        auto& buffer_queue = nv_flinger.FindBufferQueue(id);

        switch (transaction) {
        case TransactionId::Connect: {
@@ -546,9 +547,6 @@ private:
                                 Settings::values.resolution_factor.GetValue()),
                static_cast<u32>(static_cast<u32>(DisplayResolution::UndockedHeight) *
                                 Settings::values.resolution_factor.GetValue())};
-
-            buffer_queue.Connect();
-
            ctx.WriteBuffer(response.Serialize());
            break;
        }
@@ -565,25 +563,40 @@ private:
            IGBPDequeueBufferRequestParcel request{ctx.ReadBuffer()};
            const u32 width{request.data.width};
            const u32 height{request.data.height};
+            auto result = buffer_queue.DequeueBuffer(width, height);

-            do {
-                if (auto result = buffer_queue.DequeueBuffer(width, height); result) {
-                    // Buffer is available
-                    IGBPDequeueBufferResponseParcel response{result->first, *result->second};
-                    ctx.WriteBuffer(response.Serialize());
-                    break;
-                }
-            } while (buffer_queue.IsConnected());
+            if (result) {
+                // Buffer is available
+                IGBPDequeueBufferResponseParcel response{result->first, *result->second};
+                ctx.WriteBuffer(response.Serialize());
+            } else {
+                // Put the current thread to sleep until a buffer becomes available
+                ctx.SleepClientThread(
+                    "IHOSBinderDriver::DequeueBuffer", UINT64_MAX,
+                    [=, this](std::shared_ptr<Kernel::Thread> thread,
+                              Kernel::HLERequestContext& ctx, Kernel::ThreadWakeupReason reason) {
+                        // Repeat TransactParcel DequeueBuffer when a buffer is available
+                        const auto guard = nv_flinger.Lock();
+                        auto& buffer_queue = nv_flinger.FindBufferQueue(id);
+                        auto result = buffer_queue.DequeueBuffer(width, height);
+                        ASSERT_MSG(result != std::nullopt, "Could not dequeue buffer.");

+                        IGBPDequeueBufferResponseParcel response{result->first, *result->second};
+                        ctx.WriteBuffer(response.Serialize());
+                        IPC::ResponseBuilder rb{ctx, 2};
+                        rb.Push(RESULT_SUCCESS);
+                    },
+                    buffer_queue.GetWritableBufferWaitEvent());
+            }
            break;
        }
        case TransactionId::RequestBuffer: {
            IGBPRequestBufferRequestParcel request{ctx.ReadBuffer()};

            auto& buffer = buffer_queue.RequestBuffer(request.slot);
+
            IGBPRequestBufferResponseParcel response{buffer};
            ctx.WriteBuffer(response.Serialize());
-
            break;
        }
        case TransactionId::QueueBuffer: {
@@ -669,7 +682,7 @@ private:

        LOG_WARNING(Service_VI, "(STUBBED) called id={}, unknown={:08X}", id, unknown);

-        const auto& buffer_queue = *nv_flinger.FindBufferQueue(id);
+        const auto& buffer_queue = nv_flinger.FindBufferQueue(id);

        // TODO(Subv): Find out what this actually is.
        IPC::ResponseBuilder rb{ctx, 2, 1};
--- a/src/core/settings.cpp
+++ b/src/core/settings.cpp
@@ -148,4 +148,9 @@ void RestoreGlobalState(bool is_powered_on) {
    values.motion_enabled.SetGlobal(true);
 }

+void Sanitize() {
+    values.use_asynchronous_gpu_emulation.SetValue(
+        values.use_asynchronous_gpu_emulation.GetValue() || values.use_multi_core.GetValue());
+}
+
 } // namespace Settings
--- a/src/core/settings.h
+++ b/src/core/settings.h
@@ -257,4 +257,7 @@ void LogSettings();
 // Restore the global state of all applicable settings in the Values struct
 void RestoreGlobalState(bool is_powered_on);

+// Fixes settings that are known to cause issues with the emulator
+void Sanitize();
+
 } // namespace Settings
--- a/src/input_common/udp/client.cpp
+++ b/src/input_common/udp/client.cpp
@@ -225,6 +225,11 @@ void Client::OnPortInfo([[maybe_unused]] Response::PortInfo data) {
 }

 void Client::OnPadData(Response::PadData data, std::size_t client) {
+    // Accept packets only for the correct pad
+    if (static_cast<u8>(clients[client].pad_index) != data.info.id) {
+        return;
+    }
+
    LOG_TRACE(Input, "PadData packet received");
    if (data.packet_counter == clients[client].packet_sequence) {
        LOG_WARNING(
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -25,6 +25,7 @@ add_library(video_core STATIC
    command_classes/vic.h
    compatible_formats.cpp
    compatible_formats.h
+    delayed_destruction_ring.h
    dirty_flags.cpp
    dirty_flags.h
    dma_pusher.cpp
@@ -47,7 +48,6 @@ add_library(video_core STATIC
    engines/shader_bytecode.h
    engines/shader_header.h
    engines/shader_type.h
-    framebuffer_config.h
    macro/macro.cpp
    macro/macro.h
    macro/macro_hle.cpp
@@ -59,6 +59,10 @@ add_library(video_core STATIC
    fence_manager.h
    gpu.cpp
    gpu.h
+    gpu_asynch.cpp
+    gpu_asynch.h
+    gpu_synch.cpp
+    gpu_synch.h
    gpu_thread.cpp
    gpu_thread.h
    guest_driver.cpp
@@ -81,14 +85,10 @@ add_library(video_core STATIC
    renderer_opengl/gl_device.h
    renderer_opengl/gl_fence_manager.cpp
    renderer_opengl/gl_fence_manager.h
-    renderer_opengl/gl_framebuffer_cache.cpp
-    renderer_opengl/gl_framebuffer_cache.h
    renderer_opengl/gl_rasterizer.cpp
    renderer_opengl/gl_rasterizer.h
    renderer_opengl/gl_resource_manager.cpp
    renderer_opengl/gl_resource_manager.h
-    renderer_opengl/gl_sampler_cache.cpp
-    renderer_opengl/gl_sampler_cache.h
    renderer_opengl/gl_shader_cache.cpp
    renderer_opengl/gl_shader_cache.h
    renderer_opengl/gl_shader_decompiler.cpp
@@ -110,8 +110,10 @@ add_library(video_core STATIC
    renderer_opengl/maxwell_to_gl.h
    renderer_opengl/renderer_opengl.cpp
    renderer_opengl/renderer_opengl.h
-    renderer_opengl/utils.cpp
-    renderer_opengl/utils.h
+    renderer_opengl/util_shaders.cpp
+    renderer_opengl/util_shaders.h
+    renderer_vulkan/blit_image.cpp
+    renderer_vulkan/blit_image.h
    renderer_vulkan/fixed_pipeline_state.cpp
    renderer_vulkan/fixed_pipeline_state.h
    renderer_vulkan/maxwell_to_vk.cpp
@@ -138,8 +140,6 @@ add_library(video_core STATIC
    renderer_vulkan/vk_fence_manager.h
    renderer_vulkan/vk_graphics_pipeline.cpp
    renderer_vulkan/vk_graphics_pipeline.h
-    renderer_vulkan/vk_image.cpp
-    renderer_vulkan/vk_image.h
    renderer_vulkan/vk_master_semaphore.cpp
    renderer_vulkan/vk_master_semaphore.h
    renderer_vulkan/vk_memory_manager.cpp
@@ -150,12 +150,8 @@ add_library(video_core STATIC
    renderer_vulkan/vk_query_cache.h
    renderer_vulkan/vk_rasterizer.cpp
    renderer_vulkan/vk_rasterizer.h
-    renderer_vulkan/vk_renderpass_cache.cpp
-    renderer_vulkan/vk_renderpass_cache.h
    renderer_vulkan/vk_resource_pool.cpp
    renderer_vulkan/vk_resource_pool.h
-    renderer_vulkan/vk_sampler_cache.cpp
-    renderer_vulkan/vk_sampler_cache.h
    renderer_vulkan/vk_scheduler.cpp
    renderer_vulkan/vk_scheduler.h
    renderer_vulkan/vk_shader_decompiler.cpp
@@ -174,10 +170,6 @@ add_library(video_core STATIC
    renderer_vulkan/vk_texture_cache.h
    renderer_vulkan/vk_update_descriptor.cpp
    renderer_vulkan/vk_update_descriptor.h
-    renderer_vulkan/wrapper.cpp
-    renderer_vulkan/wrapper.h
-    sampler_cache.cpp
-    sampler_cache.h
    shader_cache.h
    shader_notify.cpp
    shader_notify.h
@@ -234,25 +226,48 @@ add_library(video_core STATIC
    shader/transform_feedback.h
    surface.cpp
    surface.h
+    texture_cache/accelerated_swizzle.cpp
+    texture_cache/accelerated_swizzle.h
+    texture_cache/decode_bc4.cpp
+    texture_cache/decode_bc4.h
+    texture_cache/descriptor_table.h
+    texture_cache/formatter.cpp
+    texture_cache/formatter.h
    texture_cache/format_lookup_table.cpp
    texture_cache/format_lookup_table.h
-    texture_cache/surface_base.cpp
-    texture_cache/surface_base.h
-    texture_cache/surface_params.cpp
-    texture_cache/surface_params.h
-    texture_cache/surface_view.cpp
-    texture_cache/surface_view.h
+    texture_cache/image_base.cpp
+    texture_cache/image_base.h
+    texture_cache/image_info.cpp
+    texture_cache/image_info.h
+    texture_cache/image_view_base.cpp
+    texture_cache/image_view_base.h
+    texture_cache/image_view_info.cpp
+    texture_cache/image_view_info.h
+    texture_cache/render_targets.h
+    texture_cache/samples_helper.h
+    texture_cache/slot_vector.h
    texture_cache/texture_cache.h
+    texture_cache/types.h
+    texture_cache/util.cpp
+    texture_cache/util.h
    textures/astc.cpp
    textures/astc.h
-    textures/convert.cpp
-    textures/convert.h
    textures/decoders.cpp
    textures/decoders.h
    textures/texture.cpp
    textures/texture.h
    video_core.cpp
    video_core.h
+    vulkan_common/vulkan_debug_callback.cpp
+    vulkan_common/vulkan_debug_callback.h
+    vulkan_common/vulkan_instance.cpp
+    vulkan_common/vulkan_instance.h
+    vulkan_common/vulkan_library.cpp
+    vulkan_common/vulkan_library.h
+    vulkan_common/vulkan_surface.cpp
+    vulkan_common/vulkan_surface.h
+    vulkan_common/vulkan_wrapper.cpp
+    vulkan_common/vulkan_wrapper.h
 )

 create_target_directory_groups(video_core)
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -118,20 +118,17 @@ public:
    /// Prepares the buffer cache for data uploading
    /// @param max_size Maximum number of bytes that will be uploaded
    /// @return True when a stream buffer invalidation was required, false otherwise
-    bool Map(std::size_t max_size) {
+    void Map(std::size_t max_size) {
        std::lock_guard lock{mutex};

-        bool invalidated;
-        std::tie(buffer_ptr, buffer_offset_base, invalidated) = stream_buffer->Map(max_size, 4);
+        std::tie(buffer_ptr, buffer_offset_base) = stream_buffer.Map(max_size, 4);
        buffer_offset = buffer_offset_base;
-
-        return invalidated;
    }

    /// Finishes the upload stream
    void Unmap() {
        std::lock_guard lock{mutex};
-        stream_buffer->Unmap(buffer_offset - buffer_offset_base);
+        stream_buffer.Unmap(buffer_offset - buffer_offset_base);
    }

    /// Function called at the end of each frame, intended for deferred operations
@@ -261,9 +258,9 @@ public:
 protected:
    explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_,
                         Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
-                         std::unique_ptr<StreamBuffer> stream_buffer_)
+                         StreamBuffer& stream_buffer_)
        : rasterizer{rasterizer_}, gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_},
-          stream_buffer{std::move(stream_buffer_)}, stream_buffer_handle{stream_buffer->Handle()} {}
+          stream_buffer{stream_buffer_} {}

    ~BufferCache() = default;

@@ -441,7 +438,7 @@ private:

        buffer_ptr += size;
        buffer_offset += size;
-        return BufferInfo{stream_buffer->Handle(), uploaded_offset, stream_buffer->Address()};
+        return BufferInfo{stream_buffer.Handle(), uploaded_offset, stream_buffer.Address()};
    }

    void AlignBuffer(std::size_t alignment) {
@@ -567,9 +564,7 @@ private:
    VideoCore::RasterizerInterface& rasterizer;
    Tegra::MemoryManager& gpu_memory;
    Core::Memory::Memory& cpu_memory;
-
-    std::unique_ptr<StreamBuffer> stream_buffer;
-    BufferType stream_buffer_handle;
+    StreamBuffer& stream_buffer;

    u8* buffer_ptr = nullptr;
    u64 buffer_offset = 0;
--- a/src/video_core/command_classes/vic.cpp
+++ b/src/video_core/command_classes/vic.cpp
@@ -9,7 +9,7 @@
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/gpu.h"
 #include "video_core/memory_manager.h"
-#include "video_core/texture_cache/surface_params.h"
+#include "video_core/textures/decoders.h"

 extern "C" {
 #include <libswscale/swscale.h>
@@ -105,9 +105,9 @@ void Vic::Execute() {
            const auto size = Tegra::Texture::CalculateSize(true, 4, frame->width, frame->height, 1,
                                                            block_height, 0);
            std::vector<u8> swizzled_data(size);
-            Tegra::Texture::CopySwizzledData(frame->width, frame->height, 1, 4, 4,
-                                             swizzled_data.data(), converted_frame_buffer.get(),
-                                             false, block_height, 0, 1);
+            Tegra::Texture::SwizzleSubrect(frame->width, frame->height, frame->width * 4,
+                                           frame->width, 4, swizzled_data.data(),
+                                           converted_frame_buffer.get(), block_height, 0, 0);

            gpu.MemoryManager().WriteBlock(output_surface_luma_address, swizzled_data.data(), size);
            gpu.Maxwell3D().OnMemoryWrite();
--- a/src/video_core/compatible_formats.cpp
+++ b/src/video_core/compatible_formats.cpp
@@ -3,9 +3,9 @@
 // Refer to the license.txt file included.

 #include <array>
-#include <bitset>
 #include <cstddef>

+#include "common/common_types.h"
 #include "video_core/compatible_formats.h"
 #include "video_core/surface.h"

@@ -13,23 +13,25 @@ namespace VideoCore::Surface {

 namespace {

+using Table = std::array<std::array<u64, 2>, MaxPixelFormat>;
+
 // Compatibility table taken from Table 3.X.2 in:
 // https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_view.txt

-constexpr std::array VIEW_CLASS_128_BITS = {
+constexpr std::array VIEW_CLASS_128_BITS{
    PixelFormat::R32G32B32A32_FLOAT,
    PixelFormat::R32G32B32A32_UINT,
    PixelFormat::R32G32B32A32_SINT,
 };

-constexpr std::array VIEW_CLASS_96_BITS = {
+constexpr std::array VIEW_CLASS_96_BITS{
    PixelFormat::R32G32B32_FLOAT,
 };
 // Missing formats:
 // PixelFormat::RGB32UI,
 // PixelFormat::RGB32I,

-constexpr std::array VIEW_CLASS_64_BITS = {
+constexpr std::array VIEW_CLASS_64_BITS{
    PixelFormat::R32G32_FLOAT,       PixelFormat::R32G32_UINT,
    PixelFormat::R32G32_SINT,        PixelFormat::R16G16B16A16_FLOAT,
    PixelFormat::R16G16B16A16_UNORM, PixelFormat::R16G16B16A16_SNORM,
@@ -38,7 +40,7 @@ constexpr std::array VIEW_CLASS_64_BITS = {

 // TODO: How should we handle 48 bits?

-constexpr std::array VIEW_CLASS_32_BITS = {
+constexpr std::array VIEW_CLASS_32_BITS{
    PixelFormat::R16G16_FLOAT,      PixelFormat::B10G11R11_FLOAT, PixelFormat::R32_FLOAT,
    PixelFormat::A2B10G10R10_UNORM, PixelFormat::R16G16_UINT,     PixelFormat::R32_UINT,
    PixelFormat::R16G16_SINT,       PixelFormat::R32_SINT,        PixelFormat::A8B8G8R8_UNORM,
@@ -50,43 +52,105 @@ constexpr std::array VIEW_CLASS_32_BITS = {

 // TODO: How should we handle 24 bits?

-constexpr std::array VIEW_CLASS_16_BITS = {
+constexpr std::array VIEW_CLASS_16_BITS{
    PixelFormat::R16_FLOAT,  PixelFormat::R8G8_UINT,  PixelFormat::R16_UINT,
    PixelFormat::R16_SINT,   PixelFormat::R8G8_UNORM, PixelFormat::R16_UNORM,
    PixelFormat::R8G8_SNORM, PixelFormat::R16_SNORM,  PixelFormat::R8G8_SINT,
 };

-constexpr std::array VIEW_CLASS_8_BITS = {
+constexpr std::array VIEW_CLASS_8_BITS{
    PixelFormat::R8_UINT,
    PixelFormat::R8_UNORM,
    PixelFormat::R8_SINT,
    PixelFormat::R8_SNORM,
 };

-constexpr std::array VIEW_CLASS_RGTC1_RED = {
+constexpr std::array VIEW_CLASS_RGTC1_RED{
    PixelFormat::BC4_UNORM,
    PixelFormat::BC4_SNORM,
 };

-constexpr std::array VIEW_CLASS_RGTC2_RG = {
+constexpr std::array VIEW_CLASS_RGTC2_RG{
    PixelFormat::BC5_UNORM,
    PixelFormat::BC5_SNORM,
 };

-constexpr std::array VIEW_CLASS_BPTC_UNORM = {
+constexpr std::array VIEW_CLASS_BPTC_UNORM{
    PixelFormat::BC7_UNORM,
    PixelFormat::BC7_SRGB,
 };

-constexpr std::array VIEW_CLASS_BPTC_FLOAT = {
+constexpr std::array VIEW_CLASS_BPTC_FLOAT{
    PixelFormat::BC6H_SFLOAT,
    PixelFormat::BC6H_UFLOAT,
 };

+constexpr std::array VIEW_CLASS_ASTC_4x4_RGBA{
+    PixelFormat::ASTC_2D_4X4_UNORM,
+    PixelFormat::ASTC_2D_4X4_SRGB,
+};
+
+constexpr std::array VIEW_CLASS_ASTC_5x4_RGBA{
+    PixelFormat::ASTC_2D_5X4_UNORM,
+    PixelFormat::ASTC_2D_5X4_SRGB,
+};
+
+constexpr std::array VIEW_CLASS_ASTC_5x5_RGBA{
+    PixelFormat::ASTC_2D_5X5_UNORM,
+    PixelFormat::ASTC_2D_5X5_SRGB,
+};
+
+constexpr std::array VIEW_CLASS_ASTC_6x5_RGBA{
+    PixelFormat::ASTC_2D_6X5_UNORM,
+    PixelFormat::ASTC_2D_6X5_SRGB,
+};
+
+constexpr std::array VIEW_CLASS_ASTC_6x6_RGBA{
+    PixelFormat::ASTC_2D_6X6_UNORM,
+    PixelFormat::ASTC_2D_6X6_SRGB,
+};
+
+constexpr std::array VIEW_CLASS_ASTC_8x5_RGBA{
+    PixelFormat::ASTC_2D_8X5_UNORM,
+    PixelFormat::ASTC_2D_8X5_SRGB,
+};
+
+constexpr std::array VIEW_CLASS_ASTC_8x8_RGBA{
+    PixelFormat::ASTC_2D_8X8_UNORM,
+    PixelFormat::ASTC_2D_8X8_SRGB,
+};
+
+// Missing formats:
+// PixelFormat::ASTC_2D_10X5_UNORM
+// PixelFormat::ASTC_2D_10X5_SRGB
+
+// Missing formats:
+// PixelFormat::ASTC_2D_10X6_UNORM
+// PixelFormat::ASTC_2D_10X6_SRGB
+
+constexpr std::array VIEW_CLASS_ASTC_10x8_RGBA{
+    PixelFormat::ASTC_2D_10X8_UNORM,
+    PixelFormat::ASTC_2D_10X8_SRGB,
+};
+
+constexpr std::array VIEW_CLASS_ASTC_10x10_RGBA{
+    PixelFormat::ASTC_2D_10X10_UNORM,
+    PixelFormat::ASTC_2D_10X10_SRGB,
+};
+
+// Missing formats
+// ASTC_2D_12X10_UNORM,
+// ASTC_2D_12X10_SRGB,
+
+constexpr std::array VIEW_CLASS_ASTC_12x12_RGBA{
+    PixelFormat::ASTC_2D_12X12_UNORM,
+    PixelFormat::ASTC_2D_12X12_SRGB,
+};
+
 // Compatibility table taken from Table 4.X.1 in:
 // https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_copy_image.txt

-constexpr std::array COPY_CLASS_128_BITS = {
+constexpr std::array COPY_CLASS_128_BITS{
    PixelFormat::R32G32B32A32_UINT, PixelFormat::R32G32B32A32_FLOAT, PixelFormat::R32G32B32A32_SINT,
    PixelFormat::BC2_UNORM,         PixelFormat::BC2_SRGB,           PixelFormat::BC3_UNORM,
    PixelFormat::BC3_SRGB,          PixelFormat::BC5_UNORM,          PixelFormat::BC5_SNORM,
@@ -97,7 +161,7 @@ constexpr std::array COPY_CLASS_128_BITS = {
 // PixelFormat::RGBA32I
 // COMPRESSED_RG_RGTC2

-constexpr std::array COPY_CLASS_64_BITS = {
+constexpr std::array COPY_CLASS_64_BITS{
    PixelFormat::R16G16B16A16_FLOAT, PixelFormat::R16G16B16A16_UINT,
    PixelFormat::R16G16B16A16_UNORM, PixelFormat::R16G16B16A16_SNORM,
    PixelFormat::R16G16B16A16_SINT,  PixelFormat::R32G32_UINT,
@@ -110,32 +174,36 @@ constexpr std::array COPY_CLASS_64_BITS = {
 // COMPRESSED_RGBA_S3TC_DXT1_EXT
 // COMPRESSED_SIGNED_RED_RGTC1

-void Enable(FormatCompatibility::Table& compatiblity, size_t format_a, size_t format_b) {
-    compatiblity[format_a][format_b] = true;
-    compatiblity[format_b][format_a] = true;
+constexpr void Enable(Table& table, size_t format_a, size_t format_b) {
+    table[format_a][format_b / 64] |= u64(1) << (format_b % 64);
+    table[format_b][format_a / 64] |= u64(1) << (format_a % 64);
 }

-void Enable(FormatCompatibility::Table& compatibility, PixelFormat format_a, PixelFormat format_b) {
-    Enable(compatibility, static_cast<size_t>(format_a), static_cast<size_t>(format_b));
+constexpr void Enable(Table& table, PixelFormat format_a, PixelFormat format_b) {
+    Enable(table, static_cast<size_t>(format_a), static_cast<size_t>(format_b));
 }

 template <typename Range>
-void EnableRange(FormatCompatibility::Table& compatibility, const Range& range) {
+constexpr void EnableRange(Table& table, const Range& range) {
    for (auto it_a = range.begin(); it_a != range.end(); ++it_a) {
        for (auto it_b = it_a; it_b != range.end(); ++it_b) {
-            Enable(compatibility, *it_a, *it_b);
+            Enable(table, *it_a, *it_b);
        }
    }
 }

-} // Anonymous namespace
+constexpr bool IsSupported(const Table& table, PixelFormat format_a, PixelFormat format_b) {
+    const size_t a = static_cast<size_t>(format_a);
+    const size_t b = static_cast<size_t>(format_b);
+    return ((table[a][b / 64] >> (b % 64)) & 1) != 0;
+}

-FormatCompatibility::FormatCompatibility() {
+constexpr Table MakeViewTable() {
+    Table view{};
    for (size_t i = 0; i < MaxPixelFormat; ++i) {
        // Identity is allowed
        Enable(view, i, i);
    }
-
    EnableRange(view, VIEW_CLASS_128_BITS);
    EnableRange(view, VIEW_CLASS_96_BITS);
    EnableRange(view, VIEW_CLASS_64_BITS);
@@ -146,10 +214,36 @@ FormatCompatibility::FormatCompatibility() {
    EnableRange(view, VIEW_CLASS_RGTC2_RG);
    EnableRange(view, VIEW_CLASS_BPTC_UNORM);
    EnableRange(view, VIEW_CLASS_BPTC_FLOAT);
+    EnableRange(view, VIEW_CLASS_ASTC_4x4_RGBA);
+    EnableRange(view, VIEW_CLASS_ASTC_5x4_RGBA);
+    EnableRange(view, VIEW_CLASS_ASTC_5x5_RGBA);
+    EnableRange(view, VIEW_CLASS_ASTC_6x5_RGBA);
+    EnableRange(view, VIEW_CLASS_ASTC_6x6_RGBA);
+    EnableRange(view, VIEW_CLASS_ASTC_8x5_RGBA);
+    EnableRange(view, VIEW_CLASS_ASTC_8x8_RGBA);
+    EnableRange(view, VIEW_CLASS_ASTC_10x8_RGBA);
+    EnableRange(view, VIEW_CLASS_ASTC_10x10_RGBA);
+    EnableRange(view, VIEW_CLASS_ASTC_12x12_RGBA);
+    return view;
+}

-    copy = view;
+constexpr Table MakeCopyTable() {
+    Table copy = MakeViewTable();
    EnableRange(copy, COPY_CLASS_128_BITS);
    EnableRange(copy, COPY_CLASS_64_BITS);
+    return copy;
+}
+
+} // Anonymous namespace
+
+bool IsViewCompatible(PixelFormat format_a, PixelFormat format_b) {
+    static constexpr Table TABLE = MakeViewTable();
+    return IsSupported(TABLE, format_a, format_b);
+}
+
+bool IsCopyCompatible(PixelFormat format_a, PixelFormat format_b) {
+    static constexpr Table TABLE = MakeCopyTable();
+    return IsSupported(TABLE, format_a, format_b);
 }

 } // namespace VideoCore::Surface
--- a/src/video_core/compatible_formats.h
+++ b/src/video_core/compatible_formats.h
@@ -4,31 +4,12 @@

 #pragma once

-#include <array>
-#include <bitset>
-#include <cstddef>
-
 #include "video_core/surface.h"

 namespace VideoCore::Surface {

-class FormatCompatibility {
-public:
-    using Table = std::array<std::bitset<MaxPixelFormat>, MaxPixelFormat>;
+bool IsViewCompatible(PixelFormat format_a, PixelFormat format_b);

-    explicit FormatCompatibility();
-
-    bool TestView(PixelFormat format_a, PixelFormat format_b) const noexcept {
-        return view[static_cast<size_t>(format_a)][static_cast<size_t>(format_b)];
-    }
-
-    bool TestCopy(PixelFormat format_a, PixelFormat format_b) const noexcept {
-        return copy[static_cast<size_t>(format_a)][static_cast<size_t>(format_b)];
-    }
-
-private:
-    Table view;
-    Table copy;
-};
+bool IsCopyCompatible(PixelFormat format_a, PixelFormat format_b);

 } // namespace VideoCore::Surface
--- a/src/video_core/delayed_destruction_ring.h
+++ b/src/video_core/delayed_destruction_ring.h
@@ -0,0 +1,32 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <cstddef>
+#include <utility>
+#include <vector>
+
+namespace VideoCommon {
+
+/// Container to push objects to be destroyed a few ticks in the future
+template <typename T, size_t TICKS_TO_DESTROY>
+class DelayedDestructionRing {
+public:
+    void Tick() {
+        index = (index + 1) % TICKS_TO_DESTROY;
+        elements[index].clear();
+    }
+
+    void Push(T&& object) {
+        elements[index].push_back(std::move(object));
+    }
+
+private:
+    size_t index = 0;
+    std::array<std::vector<T>, TICKS_TO_DESTROY> elements;
+};
+
+} // namespace VideoCommon
--- a/src/video_core/dirty_flags.cpp
+++ b/src/video_core/dirty_flags.cpp
@@ -16,6 +16,9 @@ namespace VideoCommon::Dirty {
 using Tegra::Engines::Maxwell3D;

 void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables) {
+    FillBlock(tables[0], OFF(tic), NUM(tic), Descriptors);
+    FillBlock(tables[0], OFF(tsc), NUM(tsc), Descriptors);
+
    static constexpr std::size_t num_per_rt = NUM(rt[0]);
    static constexpr std::size_t begin = OFF(rt);
    static constexpr std::size_t num = num_per_rt * Maxwell3D::Regs::NumRenderTargets;
@@ -23,6 +26,10 @@ void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tabl
        FillBlock(tables[0], begin + rt * num_per_rt, num_per_rt, ColorBuffer0 + rt);
    }
    FillBlock(tables[1], begin, num, RenderTargets);
+    FillBlock(tables[0], OFF(render_area), NUM(render_area), RenderTargets);
+
+    tables[0][OFF(rt_control)] = RenderTargets;
+    tables[1][OFF(rt_control)] = RenderTargetControl;

    static constexpr std::array zeta_flags{ZetaBuffer, RenderTargets};
    for (std::size_t i = 0; i < std::size(zeta_flags); ++i) {
--- a/src/video_core/dirty_flags.h
+++ b/src/video_core/dirty_flags.h
@@ -16,7 +16,10 @@ namespace VideoCommon::Dirty {
 enum : u8 {
    NullEntry = 0,

+    Descriptors,
+
    RenderTargets,
+    RenderTargetControl,
    ColorBuffer0,
    ColorBuffer1,
    ColorBuffer2,
--- a/src/video_core/engines/fermi_2d.cpp
+++ b/src/video_core/engines/fermi_2d.cpp
@@ -10,7 +10,11 @@

 namespace Tegra::Engines {

-Fermi2D::Fermi2D() = default;
+Fermi2D::Fermi2D() {
+    // Nvidia's OpenGL driver seems to assume these values
+    regs.src.depth = 1;
+    regs.dst.depth = 1;
+}

 Fermi2D::~Fermi2D() = default;

@@ -21,78 +25,43 @@ void Fermi2D::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) {
 void Fermi2D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
    ASSERT_MSG(method < Regs::NUM_REGS,
               "Invalid Fermi2D register, increase the size of the Regs structure");
-
    regs.reg_array[method] = method_argument;

-    switch (method) {
-    // Trigger the surface copy on the last register write. This is blit_src_y, but this is 64-bit,
-    // so trigger on the second 32-bit write.
-    case FERMI2D_REG_INDEX(blit_src_y) + 1: {
-        HandleSurfaceCopy();
-        break;
-    }
+    if (method == FERMI2D_REG_INDEX(pixels_from_memory.src_y0) + 1) {
+        Blit();
    }
 }

 void Fermi2D::CallMultiMethod(u32 method, const u32* base_start, u32 amount, u32 methods_pending) {
-    for (std::size_t i = 0; i < amount; i++) {
-        CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1);
+    for (u32 i = 0; i < amount; ++i) {
+        CallMethod(method, base_start[i], methods_pending - i <= 1);
    }
 }

-static std::pair<u32, u32> DelimitLine(u32 src_1, u32 src_2, u32 dst_1, u32 dst_2, u32 src_line) {
-    const u32 line_a = src_2 - src_1;
-    const u32 line_b = dst_2 - dst_1;
-    const u32 excess = std::max<s32>(0, line_a - src_line + src_1);
-    return {line_b - (excess * line_b) / line_a, excess};
-}
+void Fermi2D::Blit() {
+    LOG_DEBUG(HW_GPU, "called. source address=0x{:x}, destination address=0x{:x}",
+              regs.src.Address(), regs.dst.Address());

-void Fermi2D::HandleSurfaceCopy() {
-    LOG_DEBUG(HW_GPU, "Requested a surface copy with operation {}", regs.operation);
+    UNIMPLEMENTED_IF_MSG(regs.operation != Operation::SrcCopy, "Operation is not copy");
+    UNIMPLEMENTED_IF_MSG(regs.src.layer != 0, "Source layer is not zero");
+    UNIMPLEMENTED_IF_MSG(regs.dst.layer != 0, "Destination layer is not zero");
+    UNIMPLEMENTED_IF_MSG(regs.src.depth != 1, "Source depth is not one");
+    UNIMPLEMENTED_IF_MSG(regs.clip_enable != 0, "Clipped blit enabled");

-    // TODO(Subv): Only raw copies are implemented.
-    ASSERT(regs.operation == Operation::SrcCopy);
-
-    const u32 src_blit_x1{static_cast<u32>(regs.blit_src_x >> 32)};
-    const u32 src_blit_y1{static_cast<u32>(regs.blit_src_y >> 32)};
-    u32 src_blit_x2, src_blit_y2;
-    if (regs.blit_control.origin == Origin::Corner) {
-        src_blit_x2 =
-            static_cast<u32>((regs.blit_src_x + (regs.blit_du_dx * regs.blit_dst_width)) >> 32);
-        src_blit_y2 =
-            static_cast<u32>((regs.blit_src_y + (regs.blit_dv_dy * regs.blit_dst_height)) >> 32);
-    } else {
-        src_blit_x2 = static_cast<u32>((regs.blit_src_x >> 32) + regs.blit_dst_width);
-        src_blit_y2 = static_cast<u32>((regs.blit_src_y >> 32) + regs.blit_dst_height);
-    }
-    u32 dst_blit_x2 = regs.blit_dst_x + regs.blit_dst_width;
-    u32 dst_blit_y2 = regs.blit_dst_y + regs.blit_dst_height;
-    const auto [new_dst_w, src_excess_x] =
-        DelimitLine(src_blit_x1, src_blit_x2, regs.blit_dst_x, dst_blit_x2, regs.src.width);
-    const auto [new_dst_h, src_excess_y] =
-        DelimitLine(src_blit_y1, src_blit_y2, regs.blit_dst_y, dst_blit_y2, regs.src.height);
-    dst_blit_x2 = new_dst_w + regs.blit_dst_x;
-    src_blit_x2 = src_blit_x2 - src_excess_x;
-    dst_blit_y2 = new_dst_h + regs.blit_dst_y;
-    src_blit_y2 = src_blit_y2 - src_excess_y;
-    const auto [new_src_w, dst_excess_x] =
-        DelimitLine(regs.blit_dst_x, dst_blit_x2, src_blit_x1, src_blit_x2, regs.dst.width);
-    const auto [new_src_h, dst_excess_y] =
-        DelimitLine(regs.blit_dst_y, dst_blit_y2, src_blit_y1, src_blit_y2, regs.dst.height);
-    src_blit_x2 = new_src_w + src_blit_x1;
-    dst_blit_x2 = dst_blit_x2 - dst_excess_x;
-    src_blit_y2 = new_src_h + src_blit_y1;
-    dst_blit_y2 = dst_blit_y2 - dst_excess_y;
-    const Common::Rectangle<u32> src_rect{src_blit_x1, src_blit_y1, src_blit_x2, src_blit_y2};
-    const Common::Rectangle<u32> dst_rect{regs.blit_dst_x, regs.blit_dst_y, dst_blit_x2,
-                                          dst_blit_y2};
-    const Config copy_config{
+    const auto& args = regs.pixels_from_memory;
+    const Config config{
        .operation = regs.operation,
-        .filter = regs.blit_control.filter,
-        .src_rect = src_rect,
-        .dst_rect = dst_rect,
+        .filter = args.sample_mode.filter,
+        .dst_x0 = args.dst_x0,
+        .dst_y0 = args.dst_y0,
+        .dst_x1 = args.dst_x0 + args.dst_width,
+        .dst_y1 = args.dst_y0 + args.dst_height,
+        .src_x0 = static_cast<s32>(args.src_x0 >> 32),
+        .src_y0 = static_cast<s32>(args.src_y0 >> 32),
+        .src_x1 = static_cast<s32>((args.du_dx * args.dst_width + args.src_x0) >> 32),
+        .src_y1 = static_cast<s32>((args.dv_dy * args.dst_height + args.src_y0) >> 32),
    };
-    if (!rasterizer->AccelerateSurfaceCopy(regs.src, regs.dst, copy_config)) {
+    if (!rasterizer->AccelerateSurfaceCopy(regs.src, regs.dst, config)) {
        UNIMPLEMENTED();
    }
 }
--- a/src/video_core/engines/fermi_2d.h
+++ b/src/video_core/engines/fermi_2d.h
@@ -53,8 +53,8 @@ public:
    };

    enum class Filter : u32 {
-        PointSample = 0, // Nearest
-        Linear = 1,
+        Point = 0,
+        Bilinear = 1,
    };

    enum class Operation : u32 {
@@ -67,88 +67,235 @@ public:
        BlendPremult = 6,
    };

-    struct Regs {
-        static constexpr std::size_t NUM_REGS = 0x258;
+    enum class MemoryLayout : u32 { // Value type of Surface::linear
+        BlockLinear = 0, // note: zero selects block-linear, not pitch
+        Pitch = 1,
+    };

-        struct Surface {
-            RenderTargetFormat format;
-            BitField<0, 1, u32> linear;
-            union {
-                BitField<0, 4, u32> block_width;
-                BitField<4, 4, u32> block_height;
-                BitField<8, 4, u32> block_depth;
-            };
-            u32 depth;
-            u32 layer;
-            u32 pitch;
-            u32 width;
-            u32 height;
-            u32 address_high;
-            u32 address_low;
-
-            GPUVAddr Address() const {
-                return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
-                                             address_low);
-            }
-
-            u32 BlockWidth() const {
-                return block_width.Value();
-            }
-
-            u32 BlockHeight() const {
-                return block_height.Value();
-            }
-
-            u32 BlockDepth() const {
-                return block_depth.Value();
-            }
-        };
-        static_assert(sizeof(Surface) == 0x28, "Surface has incorrect size");
+    enum class CpuIndexWrap : u32 { // Value type of Regs::pixels_from_cpu_index_wrap
+        Wrap = 0,
+        NoWrap = 1,
+    };

+    struct Surface { // Register layout shared by Regs::dst and Regs::src (ten 32-bit words)
+        RenderTargetFormat format;
+        MemoryLayout linear; // BlockLinear (0) or Pitch (1)
        union { // the three block fields below share a single 32-bit word
+            BitField<0, 4, u32> block_width;  // bits 0-3
+            BitField<4, 4, u32> block_height; // bits 4-7
+            BitField<8, 4, u32> block_depth;  // bits 8-11
+        };
+        u32 depth;
+        u32 layer;
+        u32 pitch;
+        u32 width;
+        u32 height;
+        u32 addr_upper; // bits 32-63 of the value returned by Address()
+        u32 addr_lower; // bits 0-31 of the value returned by Address()
+
+        [[nodiscard]] constexpr GPUVAddr Address() const noexcept { // combine both halves into one GPUVAddr
+            return (static_cast<GPUVAddr>(addr_upper) << 32) | static_cast<GPUVAddr>(addr_lower);
+        }
+    };
+    static_assert(sizeof(Surface) == 0x28, "Surface has incorrect size"); // 0x28 bytes == 10 words
+
+    enum class SectorPromotion : u32 { // Value type of Regs::pixels_from_memory_sector_promotion
+        NoPromotion = 0,
+        PromoteTo2V = 1,
+        PromoteTo2H = 2,
+        PromoteTo4 = 3,
+    };
+
+    enum class NumTpcs : u32 { // Value type of Regs::num_tpcs
+        All = 0,
+        One = 1,
+    };
+
+    enum class RenderEnableMode : u32 { // Value type of Regs::render_enable_mode
+        False = 0,
+        True = 1,
+        Conditional = 2,
+        RenderIfEqual = 3,
+        RenderIfNotEqual = 4,
+    };
+
+    enum class ColorKeyFormat : u32 { // Value type of the 3-bit Regs::color_key_format field
+        A16R56G6B5 = 0, // NOTE(review): channel widths do not add up; possibly meant A16R5G6B5 - confirm
+        A1R5G55B5 = 1,  // NOTE(review): possibly meant A1R5G5B5 (spelling used by MonochromePatternColorFormat) - confirm
+        A8R8G8B8 = 2,
+        A2R10G10B10 = 3,
+        Y8 = 4,
+        Y16 = 5,
+        Y32 = 6,
+    };
+
+    union Beta4 { // Value type of Regs::beta4: four 8-bit fields packed into one 32-bit word
+        BitField<0, 8, u32> b;  // bits 0-7
+        BitField<8, 8, u32> g;  // bits 8-15
+        BitField<16, 8, u32> r; // bits 16-23
+        BitField<24, 8, u32> a; // bits 24-31
+    };
+
+    struct Point { // Element type of Regs::render_solid.prim_point (two 32-bit words per entry)
+        u32 x;
+        u32 y;
+    };
+
+    enum class PatternSelect : u32 { // Value type of the 2-bit Regs::pattern_select field
+        MonoChrome8x8 = 0,
+        MonoChrome64x1 = 1,
+        MonoChrome1x64 = 2,
+        Color = 3,
+    };
+
+    enum class NotifyType : u32 { // Value type of Regs::notify
+        WriteOnly = 0,
+        WriteThenAwaken = 1,
+    };
+
+    enum class MonochromePatternColorFormat : u32 { // Value type of the 3-bit monochrome_pattern.color_format field
+        A8X8R8G6B5 = 0, // NOTE(review): 8+8+8+6+5 bits exceeds 32; possibly meant A8X8R5G6B5 - confirm
+        A1R5G5B5 = 1,
+        A8R8G8B8 = 2,
+        A8Y8 = 3,
+        A8X8Y16 = 4,
+        Y32 = 5,
+    };
+
+    enum class MonochromePatternFormat : u32 { // Value type of the 1-bit monochrome_pattern.format field
+        CGA6_M1 = 0,
+        LE_M1 = 1,
+    };
+
+    union Regs {
+        static constexpr std::size_t NUM_REGS = 0x258;
+        struct {
+            u32 object;
+            INSERT_UNION_PADDING_WORDS(0x3F);
+            u32 no_operation;
+            NotifyType notify;
+            INSERT_UNION_PADDING_WORDS(0x2);
+            u32 wait_for_idle;
+            INSERT_UNION_PADDING_WORDS(0xB);
+            u32 pm_trigger;
+            INSERT_UNION_PADDING_WORDS(0xF);
+            u32 context_dma_notify;
+            u32 dst_context_dma;
+            u32 src_context_dma;
+            u32 semaphore_context_dma;
+            INSERT_UNION_PADDING_WORDS(0x1C);
+            Surface dst;
+            CpuIndexWrap pixels_from_cpu_index_wrap;
+            u32 kind2d_check_enable;
+            Surface src;
+            SectorPromotion pixels_from_memory_sector_promotion;
+            INSERT_UNION_PADDING_WORDS(0x1);
+            NumTpcs num_tpcs;
+            u32 render_enable_addr_upper;
+            u32 render_enable_addr_lower;
+            RenderEnableMode render_enable_mode;
+            INSERT_UNION_PADDING_WORDS(0x4);
+            u32 clip_x0;
+            u32 clip_y0;
+            u32 clip_width;
+            u32 clip_height;
+            BitField<0, 1, u32> clip_enable;
+            BitField<0, 3, ColorKeyFormat> color_key_format;
+            u32 color_key;
+            BitField<0, 1, u32> color_key_enable;
+            BitField<0, 8, u32> rop;
+            u32 beta1;
+            Beta4 beta4;
+            Operation operation;
+            union {
+                BitField<0, 6, u32> x;
+                BitField<8, 6, u32> y;
+            } pattern_offset;
+            BitField<0, 2, PatternSelect> pattern_select;
+            INSERT_UNION_PADDING_WORDS(0xC);
            struct {
-                INSERT_UNION_PADDING_WORDS(0x80);
-
-                Surface dst;
-
-                INSERT_UNION_PADDING_WORDS(2);
-
-                Surface src;
-
-                INSERT_UNION_PADDING_WORDS(0x15);
-
-                Operation operation;
-
-                INSERT_UNION_PADDING_WORDS(0x177);
-
+                BitField<0, 3, MonochromePatternColorFormat> color_format;
+                BitField<0, 1, MonochromePatternFormat> format;
+                u32 color0;
+                u32 color1;
+                u32 pattern0;
+                u32 pattern1;
+            } monochrome_pattern;
+            struct {
+                std::array<u32, 0x40> X8R8G8B8;
+                std::array<u32, 0x20> R5G6B5;
+                std::array<u32, 0x20> X1R5G5B5;
+                std::array<u32, 0x10> Y8;
+            } color_pattern;
+            INSERT_UNION_PADDING_WORDS(0x10);
+            struct {
+                u32 prim_mode;
+                u32 prim_color_format;
+                u32 prim_color;
+                u32 line_tie_break_bits;
+                INSERT_UNION_PADDING_WORDS(0x14);
+                u32 prim_point_xy;
+                INSERT_UNION_PADDING_WORDS(0x7);
+                std::array<Point, 0x40> prim_point;
+            } render_solid;
+            struct { // pixels_from_cpu register group; asserted below to start at byte offset 0x800
+                u32 data_type;
+                u32 color_format;
+                u32 index_format;
+                u32 mono_format;
+                u32 wrap;
+                u32 color0;
+                u32 color1;
+                u32 mono_opacity;
+                INSERT_UNION_PADDING_WORDS(0x6);
+                u32 src_width;
+                u32 src_height;
+                u32 dx_du_frac;
+                u32 dx_du_int;
+                u32 dx_dv_frac; // NOTE(review): paired with dy_dv_int below; name is probably meant to be dy_dv_frac - confirm
+                u32 dy_dv_int;
+                u32 dst_x0_frac;
+                u32 dst_x0_int;
+                u32 dst_y0_frac;
+                u32 dst_y0_int;
+                u32 data;
+            } pixels_from_cpu;
+            INSERT_UNION_PADDING_WORDS(0x3);
+            u32 big_endian_control;
+            INSERT_UNION_PADDING_WORDS(0x3);
+            struct {
+                BitField<0, 3, u32> block_shape;
+                BitField<0, 5, u32> corral_size;
+                BitField<0, 1, u32> safe_overlap;
                union {
-                    u32 raw;
                    BitField<0, 1, Origin> origin;
                    BitField<4, 1, Filter> filter;
-                } blit_control;
-
+                } sample_mode;
                INSERT_UNION_PADDING_WORDS(0x8);
-
-                u32 blit_dst_x;
-                u32 blit_dst_y;
-                u32 blit_dst_width;
-                u32 blit_dst_height;
-                u64 blit_du_dx;
-                u64 blit_dv_dy;
-                u64 blit_src_x;
-                u64 blit_src_y;
-
-                INSERT_UNION_PADDING_WORDS(0x21);
-            };
-            std::array<u32, NUM_REGS> reg_array;
+                s32 dst_x0;
+                s32 dst_y0;
+                s32 dst_width;
+                s32 dst_height;
+                s64 du_dx;
+                s64 dv_dy;
+                s64 src_x0;
+                s64 src_y0;
+            } pixels_from_memory;
        };
+        std::array<u32, NUM_REGS> reg_array;
    } regs{};

    struct Config { // Blit parameters built from the registers and passed to AccelerateSurfaceCopy
-        Operation operation{};
-        Filter filter{};
-        Common::Rectangle<u32> src_rect;
-        Common::Rectangle<u32> dst_rect;
+        Operation operation;
+        Filter filter;
+        s32 dst_x0;
+        s32 dst_y0;
+        s32 dst_x1; // Blit() sets this to dst_x0 + dst_width
+        s32 dst_y1; // Blit() sets this to dst_y0 + dst_height
+        s32 src_x0; // Blit() takes the upper 32 bits of the 64-bit register value
+        s32 src_y0; // same conversion as src_x0
+        s32 src_x1; // Blit() sets this to (du_dx * dst_width + src_x0) >> 32
+        s32 src_y1; // Blit() sets this to (dv_dy * dst_height + src_y0) >> 32
    };

 private:
@@ -156,25 +303,49 @@ private:

    /// Performs the copy from the source surface to the destination surface as configured in the
    /// registers.
-    void HandleSurfaceCopy();
+    void Blit();
 };

 #define ASSERT_REG_POSITION(field_name, position)                                                  \
-    static_assert(offsetof(Fermi2D::Regs, field_name) == position * 4,                             \
+    static_assert(offsetof(Fermi2D::Regs, field_name) == position,                                 \
                  "Field " #field_name " has invalid position") // position is compared as a byte offset (no longer multiplied by 4)

-ASSERT_REG_POSITION(dst, 0x80);
-ASSERT_REG_POSITION(src, 0x8C);
-ASSERT_REG_POSITION(operation, 0xAB);
-ASSERT_REG_POSITION(blit_control, 0x223);
-ASSERT_REG_POSITION(blit_dst_x, 0x22c);
-ASSERT_REG_POSITION(blit_dst_y, 0x22d);
-ASSERT_REG_POSITION(blit_dst_width, 0x22e);
-ASSERT_REG_POSITION(blit_dst_height, 0x22f);
-ASSERT_REG_POSITION(blit_du_dx, 0x230);
-ASSERT_REG_POSITION(blit_dv_dy, 0x232);
-ASSERT_REG_POSITION(blit_src_x, 0x234);
-ASSERT_REG_POSITION(blit_src_y, 0x236);
+ASSERT_REG_POSITION(object, 0x0);
+ASSERT_REG_POSITION(no_operation, 0x100);
+ASSERT_REG_POSITION(notify, 0x104);
+ASSERT_REG_POSITION(wait_for_idle, 0x110);
+ASSERT_REG_POSITION(pm_trigger, 0x140);
+ASSERT_REG_POSITION(context_dma_notify, 0x180);
+ASSERT_REG_POSITION(dst_context_dma, 0x184);
+ASSERT_REG_POSITION(src_context_dma, 0x188);
+ASSERT_REG_POSITION(semaphore_context_dma, 0x18C);
+ASSERT_REG_POSITION(dst, 0x200);
+ASSERT_REG_POSITION(pixels_from_cpu_index_wrap, 0x228);
+ASSERT_REG_POSITION(kind2d_check_enable, 0x22C);
+ASSERT_REG_POSITION(src, 0x230);
+ASSERT_REG_POSITION(pixels_from_memory_sector_promotion, 0x258);
+ASSERT_REG_POSITION(num_tpcs, 0x260);
+ASSERT_REG_POSITION(render_enable_addr_upper, 0x264);
+ASSERT_REG_POSITION(render_enable_addr_lower, 0x268);
+ASSERT_REG_POSITION(clip_x0, 0x280);
+ASSERT_REG_POSITION(clip_y0, 0x284);
+ASSERT_REG_POSITION(clip_width, 0x288);
+ASSERT_REG_POSITION(clip_height, 0x28c);
+ASSERT_REG_POSITION(clip_enable, 0x290);
+ASSERT_REG_POSITION(color_key_format, 0x294);
+ASSERT_REG_POSITION(color_key, 0x298);
+ASSERT_REG_POSITION(rop, 0x2A0);
+ASSERT_REG_POSITION(beta1, 0x2A4);
+ASSERT_REG_POSITION(beta4, 0x2A8);
+ASSERT_REG_POSITION(operation, 0x2AC);
+ASSERT_REG_POSITION(pattern_offset, 0x2B0);
+ASSERT_REG_POSITION(pattern_select, 0x2B4);
+ASSERT_REG_POSITION(monochrome_pattern, 0x2E8);
+ASSERT_REG_POSITION(color_pattern, 0x300);
+ASSERT_REG_POSITION(render_solid, 0x580);
+ASSERT_REG_POSITION(pixels_from_cpu, 0x800);
+ASSERT_REG_POSITION(big_endian_control, 0x870);
+ASSERT_REG_POSITION(pixels_from_memory, 0x880);

 #undef ASSERT_REG_POSITION

--- a/src/video_core/engines/kepler_compute.cpp
+++ b/src/video_core/engines/kepler_compute.cpp
@@ -58,24 +58,6 @@ void KeplerCompute::CallMultiMethod(u32 method, const u32* base_start, u32 amoun
    }
 }

-Texture::FullTextureInfo KeplerCompute::GetTexture(std::size_t offset) const {
-    const std::bitset<8> cbuf_mask = launch_description.const_buffer_enable_mask.Value();
-    ASSERT(cbuf_mask[regs.tex_cb_index]);
-
-    const auto& texinfo = launch_description.const_buffer_config[regs.tex_cb_index];
-    ASSERT(texinfo.Address() != 0);
-
-    const GPUVAddr address = texinfo.Address() + offset * sizeof(Texture::TextureHandle);
-    ASSERT(address < texinfo.Address() + texinfo.size);
-
-    const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(address)};
-    return GetTextureInfo(tex_handle);
-}
-
-Texture::FullTextureInfo KeplerCompute::GetTextureInfo(Texture::TextureHandle tex_handle) const {
-    return Texture::FullTextureInfo{GetTICEntry(tex_handle.tic_id), GetTSCEntry(tex_handle.tsc_id)};
-}
-
 u32 KeplerCompute::AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offset) const {
    ASSERT(stage == ShaderType::Compute);
    const auto& buffer = launch_description.const_buffer_config[const_buffer];
@@ -98,9 +80,11 @@ SamplerDescriptor KeplerCompute::AccessBindlessSampler(ShaderType stage, u64 con

 SamplerDescriptor KeplerCompute::AccessSampler(u32 handle) const {
    const Texture::TextureHandle tex_handle{handle};
-    const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle);
-    SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic);
-    result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value());
+    const Texture::TICEntry tic = GetTICEntry(tex_handle.tic_id); // the handle carries both ids; fetch each entry directly
+    const Texture::TSCEntry tsc = GetTSCEntry(tex_handle.tsc_id);
+
+    SamplerDescriptor result = SamplerDescriptor::FromTIC(tic); // descriptor is derived from the TIC entry
+    result.is_shadow.Assign(tsc.depth_compare_enabled.Value()); // shadow flag mirrors the TSC depth-compare bit
    return result;
 }

--- a/src/video_core/engines/kepler_compute.h
+++ b/src/video_core/engines/kepler_compute.h
@@ -209,11 +209,6 @@ public:
    void CallMultiMethod(u32 method, const u32* base_start, u32 amount,
                         u32 methods_pending) override;

-    Texture::FullTextureInfo GetTexture(std::size_t offset) const;
-
-    /// Given a texture handle, returns the TSC and TIC entries.
-    Texture::FullTextureInfo GetTextureInfo(Texture::TextureHandle tex_handle) const;
-
    u32 AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offset) const override;

    SamplerDescriptor AccessBoundSampler(ShaderType stage, u64 offset) const override;
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -2,7 +2,6 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

-#include <cinttypes>
 #include <cstring>
 #include <optional>
 #include "common/assert.h"
@@ -227,6 +226,10 @@ void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argume
            OnMemoryWrite();
        }
        return;
+    case MAXWELL3D_REG_INDEX(fragment_barrier):
+        return rasterizer->FragmentBarrier();
+    case MAXWELL3D_REG_INDEX(tiled_cache_barrier):
+        return rasterizer->TiledCacheBarrier();
    }
 }

@@ -639,7 +642,7 @@ void Maxwell3D::FinishCBData() {
 }

 Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const {
-    const GPUVAddr tic_address_gpu{regs.tic.TICAddress() + tic_index * sizeof(Texture::TICEntry)};
+    const GPUVAddr tic_address_gpu{regs.tic.Address() + tic_index * sizeof(Texture::TICEntry)};

    Texture::TICEntry tic_entry;
    memory_manager.ReadBlockUnsafe(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry));
@@ -648,43 +651,19 @@ Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const {
 }

 Texture::TSCEntry Maxwell3D::GetTSCEntry(u32 tsc_index) const {
-    const GPUVAddr tsc_address_gpu{regs.tsc.TSCAddress() + tsc_index * sizeof(Texture::TSCEntry)};
+    const GPUVAddr tsc_address_gpu{regs.tsc.Address() + tsc_index * sizeof(Texture::TSCEntry)};

    Texture::TSCEntry tsc_entry;
    memory_manager.ReadBlockUnsafe(tsc_address_gpu, &tsc_entry, sizeof(Texture::TSCEntry));
    return tsc_entry;
 }

-Texture::FullTextureInfo Maxwell3D::GetTextureInfo(Texture::TextureHandle tex_handle) const {
-    return Texture::FullTextureInfo{GetTICEntry(tex_handle.tic_id), GetTSCEntry(tex_handle.tsc_id)};
-}
-
-Texture::FullTextureInfo Maxwell3D::GetStageTexture(ShaderType stage, std::size_t offset) const {
-    const auto stage_index = static_cast<std::size_t>(stage);
-    const auto& shader = state.shader_stages[stage_index];
-    const auto& tex_info_buffer = shader.const_buffers[regs.tex_cb_index];
-    ASSERT(tex_info_buffer.enabled && tex_info_buffer.address != 0);
-
-    const GPUVAddr tex_info_address =
-        tex_info_buffer.address + offset * sizeof(Texture::TextureHandle);
-
-    ASSERT(tex_info_address < tex_info_buffer.address + tex_info_buffer.size);
-
-    const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)};
-
-    return GetTextureInfo(tex_handle);
-}
-
 u32 Maxwell3D::GetRegisterValue(u32 method) const {
    ASSERT_MSG(method < Regs::NUM_REGS, "Invalid Maxwell3D register");
    return regs.reg_array[method];
 }

 void Maxwell3D::ProcessClearBuffers() {
-    ASSERT(regs.clear_buffers.R == regs.clear_buffers.G &&
-           regs.clear_buffers.R == regs.clear_buffers.B &&
-           regs.clear_buffers.R == regs.clear_buffers.A);
-
    rasterizer->Clear();
 }

@@ -692,9 +671,7 @@ u32 Maxwell3D::AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offse
    ASSERT(stage != ShaderType::Compute);
    const auto& shader_stage = state.shader_stages[static_cast<std::size_t>(stage)];
    const auto& buffer = shader_stage.const_buffers[const_buffer];
-    u32 result;
-    std::memcpy(&result, memory_manager.GetPointer(buffer.address + offset), sizeof(u32));
-    return result;
+    return memory_manager.Read<u32>(buffer.address + offset);
 }

 SamplerDescriptor Maxwell3D::AccessBoundSampler(ShaderType stage, u64 offset) const {
@@ -712,9 +689,11 @@ SamplerDescriptor Maxwell3D::AccessBindlessSampler(ShaderType stage, u64 const_b

 SamplerDescriptor Maxwell3D::AccessSampler(u32 handle) const {
    const Texture::TextureHandle tex_handle{handle};
-    const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle);
-    SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic);
-    result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value());
+    const Texture::TICEntry tic = GetTICEntry(tex_handle.tic_id); // the handle carries both ids; fetch each entry directly
+    const Texture::TSCEntry tsc = GetTSCEntry(tex_handle.tsc_id);
+
+    SamplerDescriptor result = SamplerDescriptor::FromTIC(tic); // descriptor is derived from the TIC entry
+    result.is_shadow.Assign(tsc.depth_compare_enabled.Value()); // shadow flag mirrors the TSC depth-compare bit
    return result;
 }

--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -438,16 +438,6 @@ public:
            DecrWrapOGL = 0x8508,
        };

-        enum class MemoryLayout : u32 {
-            Linear = 0,
-            BlockLinear = 1,
-        };
-
-        enum class InvMemoryLayout : u32 {
-            BlockLinear = 0,
-            Linear = 1,
-        };
-
        enum class CounterReset : u32 {
            SampleCnt = 0x01,
            Unk02 = 0x02,
@@ -589,21 +579,31 @@ public:
            NegativeW = 7,
        };

+        enum class SamplerIndex : u32 { // Value type of the sampler_index register (asserted at position 0x48D)
+            Independently = 0,
+            ViaHeaderIndex = 1,
+        };
+
+        struct TileMode { // One 32-bit word; used by RenderTargetConfig::tile_mode and zeta.tile_mode
+            union {
+                BitField<0, 4, u32> block_width;      // bits 0-3
+                BitField<4, 4, u32> block_height;     // bits 4-7
+                BitField<8, 4, u32> block_depth;      // bits 8-11
+                BitField<12, 1, u32> is_pitch_linear; // bit 12
+                BitField<16, 1, u32> is_3d;           // bit 16
+            };
+        };
+        static_assert(sizeof(TileMode) == 4); // must stay exactly one register wide
+
        struct RenderTargetConfig {
            u32 address_high;
            u32 address_low;
            u32 width;
            u32 height;
            Tegra::RenderTargetFormat format;
+            TileMode tile_mode;
            union {
-                BitField<0, 3, u32> block_width;
-                BitField<4, 3, u32> block_height;
-                BitField<8, 3, u32> block_depth;
-                BitField<12, 1, InvMemoryLayout> type;
-                BitField<16, 1, u32> is_3d;
-            } memory_layout;
-            union {
-                BitField<0, 16, u32> layers;
+                BitField<0, 16, u32> depth;
                BitField<16, 1, u32> volume;
            };
            u32 layer_stride;
@@ -832,7 +832,11 @@ public:

                u32 patch_vertices;

-                INSERT_UNION_PADDING_WORDS(0xC);
+                INSERT_UNION_PADDING_WORDS(0x4);
+
+                u32 fragment_barrier;
+
+                INSERT_UNION_PADDING_WORDS(0x7);

                std::array<ScissorTest, NumViewports> scissor_test;

@@ -842,7 +846,15 @@ public:
                u32 stencil_back_mask;
                u32 stencil_back_func_mask;

-                INSERT_UNION_PADDING_WORDS(0xC);
+                INSERT_UNION_PADDING_WORDS(0x5);
+
+                u32 invalidate_texture_data_cache;
+
+                INSERT_UNION_PADDING_WORDS(0x1);
+
+                u32 tiled_cache_barrier;
+
+                INSERT_UNION_PADDING_WORDS(0x4);

                u32 color_mask_common;

@@ -866,12 +878,7 @@ public:
                    u32 address_high;
                    u32 address_low;
                    Tegra::DepthFormat format;
-                    union {
-                        BitField<0, 4, u32> block_width;
-                        BitField<4, 4, u32> block_height;
-                        BitField<8, 4, u32> block_depth;
-                        BitField<20, 1, InvMemoryLayout> type;
-                    } memory_layout;
+                    TileMode tile_mode;
                    u32 layer_stride;

                    GPUVAddr Address() const {
@@ -880,7 +887,18 @@ public:
                    }
                } zeta;

-                INSERT_UNION_PADDING_WORDS(0x41);
+                struct {
+                    union {
+                        BitField<0, 16, u32> x;
+                        BitField<16, 16, u32> width;
+                    };
+                    union {
+                        BitField<0, 16, u32> y;
+                        BitField<16, 16, u32> height;
+                    };
+                } render_area;
+
+                INSERT_UNION_PADDING_WORDS(0x3F);

                union {
                    BitField<0, 4, u32> stencil;
@@ -921,7 +939,7 @@ public:
                        BitField<25, 3, u32> map_7;
                    };

-                    u32 GetMap(std::size_t index) const {
+                    u32 Map(std::size_t index) const {
                        const std::array<u32, NumRenderTargets> maps{map_0, map_1, map_2, map_3,
                                                                     map_4, map_5, map_6, map_7};
                        ASSERT(index < maps.size());
@@ -934,11 +952,13 @@ public:
                u32 zeta_width;
                u32 zeta_height;
                union {
-                    BitField<0, 16, u32> zeta_layers;
+                    BitField<0, 16, u32> zeta_depth;
                    BitField<16, 1, u32> zeta_volume;
                };

-                INSERT_UNION_PADDING_WORDS(0x26);
+                SamplerIndex sampler_index;
+
+                INSERT_UNION_PADDING_WORDS(0x25);

                u32 depth_test_enable;

@@ -964,6 +984,7 @@ public:
                    float b;
                    float a;
                } blend_color;
+
                INSERT_UNION_PADDING_WORDS(0x4);

                struct {
@@ -1001,7 +1022,12 @@ public:
                float line_width_smooth;
                float line_width_aliased;

-                INSERT_UNION_PADDING_WORDS(0x1F);
+                INSERT_UNION_PADDING_WORDS(0x1B);
+
+                u32 invalidate_sampler_cache_no_wfi;
+                u32 invalidate_texture_header_cache_no_wfi;
+
+                INSERT_UNION_PADDING_WORDS(0x2);

                u32 vb_element_base;
                u32 vb_base_instance;
@@ -1045,13 +1071,13 @@ public:
                } condition;

                struct {
-                    u32 tsc_address_high;
-                    u32 tsc_address_low;
-                    u32 tsc_limit;
+                    u32 address_high;
+                    u32 address_low;
+                    u32 limit;

-                    GPUVAddr TSCAddress() const {
-                        return static_cast<GPUVAddr>(
-                            (static_cast<GPUVAddr>(tsc_address_high) << 32) | tsc_address_low);
+                    GPUVAddr Address() const {
+                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
+                                                     address_low);
                    }
                } tsc;

@@ -1062,13 +1088,13 @@ public:
                u32 line_smooth_enable;

                struct {
-                    u32 tic_address_high;
-                    u32 tic_address_low;
-                    u32 tic_limit;
+                    u32 address_high;
+                    u32 address_low;
+                    u32 limit;

-                    GPUVAddr TICAddress() const {
-                        return static_cast<GPUVAddr>(
-                            (static_cast<GPUVAddr>(tic_address_high) << 32) | tic_address_low);
+                    GPUVAddr Address() const {
+                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
+                                                     address_low);
                    }
                } tic;

@@ -1397,12 +1423,6 @@ public:

    void FlushMMEInlineDraw();

-    /// Given a texture handle, returns the TSC and TIC entries.
-    Texture::FullTextureInfo GetTextureInfo(Texture::TextureHandle tex_handle) const;
-
-    /// Returns the texture information for a specific texture in a specific shader stage.
-    Texture::FullTextureInfo GetStageTexture(ShaderType stage, std::size_t offset) const;
-
    u32 AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offset) const override;

    SamplerDescriptor AccessBoundSampler(ShaderType stage, u64 offset) const override;
@@ -1598,10 +1618,13 @@ ASSERT_REG_POSITION(polygon_offset_point_enable, 0x370);
 ASSERT_REG_POSITION(polygon_offset_line_enable, 0x371);
 ASSERT_REG_POSITION(polygon_offset_fill_enable, 0x372);
 ASSERT_REG_POSITION(patch_vertices, 0x373);
+ASSERT_REG_POSITION(fragment_barrier, 0x378);
 ASSERT_REG_POSITION(scissor_test, 0x380);
 ASSERT_REG_POSITION(stencil_back_func_ref, 0x3D5);
 ASSERT_REG_POSITION(stencil_back_mask, 0x3D6);
 ASSERT_REG_POSITION(stencil_back_func_mask, 0x3D7);
+ASSERT_REG_POSITION(invalidate_texture_data_cache, 0x3DD);
+ASSERT_REG_POSITION(tiled_cache_barrier, 0x3DF);
 ASSERT_REG_POSITION(color_mask_common, 0x3E4);
 ASSERT_REG_POSITION(depth_bounds, 0x3E7);
 ASSERT_REG_POSITION(rt_separate_frag_data, 0x3EB);
@@ -1609,6 +1632,7 @@ ASSERT_REG_POSITION(multisample_raster_enable, 0x3ED);
 ASSERT_REG_POSITION(multisample_raster_samples, 0x3EE);
 ASSERT_REG_POSITION(multisample_sample_mask, 0x3EF);
 ASSERT_REG_POSITION(zeta, 0x3F8);
+ASSERT_REG_POSITION(render_area, 0x3FD);
 ASSERT_REG_POSITION(clear_flags, 0x43E);
 ASSERT_REG_POSITION(fill_rectangle, 0x44F);
 ASSERT_REG_POSITION(vertex_attrib_format, 0x458);
@@ -1617,7 +1641,8 @@ ASSERT_REG_POSITION(multisample_coverage_to_color, 0x47E);
 ASSERT_REG_POSITION(rt_control, 0x487);
 ASSERT_REG_POSITION(zeta_width, 0x48a);
 ASSERT_REG_POSITION(zeta_height, 0x48b);
-ASSERT_REG_POSITION(zeta_layers, 0x48c);
+ASSERT_REG_POSITION(zeta_depth, 0x48c);
+ASSERT_REG_POSITION(sampler_index, 0x48D);
 ASSERT_REG_POSITION(depth_test_enable, 0x4B3);
 ASSERT_REG_POSITION(independent_blend_enable, 0x4B9);
 ASSERT_REG_POSITION(depth_write_enabled, 0x4BA);
@@ -1641,6 +1666,8 @@ ASSERT_REG_POSITION(frag_color_clamp, 0x4EA);
 ASSERT_REG_POSITION(screen_y_control, 0x4EB);
 ASSERT_REG_POSITION(line_width_smooth, 0x4EC);
 ASSERT_REG_POSITION(line_width_aliased, 0x4ED);
+ASSERT_REG_POSITION(invalidate_sampler_cache_no_wfi, 0x509);
+ASSERT_REG_POSITION(invalidate_texture_header_cache_no_wfi, 0x50A);
 ASSERT_REG_POSITION(vb_element_base, 0x50D);
 ASSERT_REG_POSITION(vb_base_instance, 0x50E);
 ASSERT_REG_POSITION(clip_distance_enabled, 0x544);
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -96,6 +96,7 @@ void MaxwellDMA::CopyPitchToPitch() {
 }

 void MaxwellDMA::CopyBlockLinearToPitch() {
+    UNIMPLEMENTED_IF(regs.src_params.block_size.width != 0);
    UNIMPLEMENTED_IF(regs.src_params.block_size.depth != 0);
    UNIMPLEMENTED_IF(regs.src_params.layer != 0);

@@ -135,6 +136,8 @@ void MaxwellDMA::CopyBlockLinearToPitch() {
 }

 void MaxwellDMA::CopyPitchToBlockLinear() {
+    UNIMPLEMENTED_IF_MSG(regs.dst_params.block_size.width != 0, "Block width is not one");
+
    const auto& dst_params = regs.dst_params;
    const u32 bytes_per_pixel = regs.pitch_in / regs.line_length_in;
    const u32 width = dst_params.width;
--- a/src/video_core/fence_manager.h
+++ b/src/video_core/fence_manager.h
@@ -9,6 +9,7 @@

 #include "common/common_types.h"
 #include "core/core.h"
+#include "video_core/delayed_destruction_ring.h"
 #include "video_core/gpu.h"
 #include "video_core/memory_manager.h"
 #include "video_core/rasterizer_interface.h"
@@ -47,6 +48,11 @@ protected:
 template <typename TFence, typename TTextureCache, typename TTBufferCache, typename TQueryCache>
 class FenceManager {
 public:
+    /// Notify the fence manager about a new frame
+    void TickFrame() {
+        delayed_destruction_ring.Tick(); // forwards the frame tick to the ring that PopFence() pushes fences into
+    }
+
    void SignalSemaphore(GPUVAddr addr, u32 value) {
        TryReleasePendingFences();
        const bool should_flush = ShouldFlush();
@@ -86,7 +92,7 @@ public:
            } else {
                gpu.IncrementSyncPoint(current_fence->GetPayload());
            }
-            fences.pop();
+            PopFence();
        }
    }

@@ -132,7 +138,7 @@ private:
            } else {
                gpu.IncrementSyncPoint(current_fence->GetPayload());
            }
-            fences.pop();
+            PopFence();
        }
    }

@@ -158,7 +164,14 @@ private:
        query_cache.CommitAsyncFlushes();
    }

+    void PopFence() { // Removes the front fence from the queue, handing it to the destruction ring first
+        delayed_destruction_ring.Push(std::move(fences.front())); // move the fence into the ring before the pop below
+        fences.pop(); // only the moved-from queue slot is removed here
+    }
+
    std::queue<TFence> fences;
+
+    DelayedDestructionRing<TFence, 6> delayed_destruction_ring;
 };

 } // namespace VideoCommon
--- a/src/video_core/framebuffer_config.h
+++ b/src/video_core/framebuffer_config.h
@@ -1,31 +0,0 @@
-// Copyright 2020 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-namespace Tegra {
-
-/**
- * Struct describing framebuffer configuration
- */
-struct FramebufferConfig {
-    enum class PixelFormat : u32 {
-        A8B8G8R8_UNORM = 1,
-        RGB565_UNORM = 4,
-        B8G8R8A8_UNORM = 5,
-    };
-
-    VAddr address{};
-    u32 offset{};
-    u32 width{};
-    u32 height{};
-    u32 stride{};
-    PixelFormat pixel_format{};
-
-    using TransformFlags = Service::NVFlinger::BufferQueue::BufferTransformFlags;
-    TransformFlags transform_flags{};
-    Common::Rectangle<int> crop_rect;
-};
-
-} // namespace Tegra
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -10,7 +10,6 @@
 #include "core/core_timing.h"
 #include "core/core_timing_util.h"
 #include "core/frontend/emu_window.h"
-#include "core/hardware_interrupt_manager.h"
 #include "core/memory.h"
 #include "core/settings.h"
 #include "video_core/engines/fermi_2d.h"
@@ -37,8 +36,7 @@ GPU::GPU(Core::System& system_, bool is_async_, bool use_nvdec_)
      kepler_compute{std::make_unique<Engines::KeplerCompute>(system, *memory_manager)},
      maxwell_dma{std::make_unique<Engines::MaxwellDMA>(system, *memory_manager)},
      kepler_memory{std::make_unique<Engines::KeplerMemory>(system, *memory_manager)},
-      shader_notify{std::make_unique<VideoCore::ShaderNotify>()}, is_async{is_async_},
-      gpu_thread{system_, is_async_} {}
+      shader_notify{std::make_unique<VideoCore::ShaderNotify>()}, is_async{is_async_} {}

 GPU::~GPU() = default;

@@ -200,6 +198,10 @@ void GPU::SyncGuestHost() {
    renderer->Rasterizer().SyncGuestHost();
 }

+void GPU::OnCommandListEnd() {
+    renderer->Rasterizer().ReleaseFences();
+}
+
 enum class GpuSemaphoreOperation {
    AcquireEqual = 0x1,
    WriteLong = 0x2,
@@ -459,75 +461,4 @@ void GPU::ProcessSemaphoreAcquire() {
    }
 }

-void GPU::Start() {
-    gpu_thread.StartThread(*renderer, renderer->Context(), *dma_pusher, *cdma_pusher);
-    cpu_context = renderer->GetRenderWindow().CreateSharedContext();
-    cpu_context->MakeCurrent();
-}
-
-void GPU::ObtainContext() {
-    cpu_context->MakeCurrent();
-}
-
-void GPU::ReleaseContext() {
-    cpu_context->DoneCurrent();
-}
-
-void GPU::PushGPUEntries(Tegra::CommandList&& entries) {
-    gpu_thread.SubmitList(std::move(entries));
-}
-
-void GPU::PushCommandBuffer(Tegra::ChCommandHeaderList& entries) {
-    if (!use_nvdec) {
-        return;
-    }
-    // This condition fires when a video stream ends, clear all intermediary data
-    if (entries[0].raw == 0xDEADB33F) {
-        cdma_pusher.reset();
-        return;
-    }
-    if (!cdma_pusher) {
-        cdma_pusher = std::make_unique<Tegra::CDmaPusher>(*this);
-    }
-
-    // SubmitCommandBuffer would make the nvdec operations async, this is not currently working
-    // TODO(ameerj): RE proper async nvdec operation
-    // gpu_thread.SubmitCommandBuffer(std::move(entries));
-
-    cdma_pusher->Push(std::move(entries));
-    cdma_pusher->DispatchCalls();
-}
-
-void GPU::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
-    gpu_thread.SwapBuffers(framebuffer);
-}
-
-void GPU::FlushRegion(VAddr addr, u64 size) {
-    gpu_thread.FlushRegion(addr, size);
-}
-
-void GPU::InvalidateRegion(VAddr addr, u64 size) {
-    gpu_thread.InvalidateRegion(addr, size);
-}
-
-void GPU::FlushAndInvalidateRegion(VAddr addr, u64 size) {
-    gpu_thread.FlushAndInvalidateRegion(addr, size);
-}
-
-void GPU::TriggerCpuInterrupt(const u32 syncpoint_id, const u32 value) const {
-    auto& interrupt_manager = system.InterruptManager();
-    interrupt_manager.GPUInterruptSyncpt(syncpoint_id, value);
-}
-
-void GPU::WaitIdle() const {
-    gpu_thread.WaitIdle();
-}
-
-void GPU::OnCommandListEnd() {
-    if (is_async) {
-        // This command only applies to asynchronous GPU mode
-        gpu_thread.OnCommandListEnd();
-    }
-}
-
 } // namespace Tegra
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -15,8 +15,6 @@
 #include "core/hle/service/nvflinger/buffer_queue.h"
 #include "video_core/cdma_pusher.h"
 #include "video_core/dma_pusher.h"
-#include "video_core/framebuffer_config.h"
-#include "video_core/gpu_thread.h"

 using CacheAddr = std::uintptr_t;
 [[nodiscard]] inline CacheAddr ToCacheAddr(const void* host_ptr) {
@@ -103,6 +101,28 @@ enum class DepthFormat : u32 {
 struct CommandListHeader;
 class DebugContext;

+/**
+ * Struct describing framebuffer configuration; plain fields default to value-initialization
+ */
+struct FramebufferConfig {
+    enum class PixelFormat : u32 {
+        A8B8G8R8_UNORM = 1,
+        RGB565_UNORM = 4,
+        B8G8R8A8_UNORM = 5,
+    };
+
+    VAddr address{};
+    u32 offset{};
+    u32 width{};
+    u32 height{};
+    u32 stride{};
+    PixelFormat pixel_format{};
+
+    using TransformFlags = Service::NVFlinger::BufferQueue::BufferTransformFlags;
+    TransformFlags transform_flags{};
+    Common::Rectangle<int> crop_rect;
+};
+
 namespace Engines {
 class Fermi2D;
 class Maxwell3D;
@@ -121,7 +141,7 @@ enum class EngineID {

 class MemoryManager;

-class GPU final {
+class GPU {
 public:
    struct MethodCall {
        u32 method{};
@@ -139,7 +159,7 @@ public:
    };

    explicit GPU(Core::System& system_, bool is_async_, bool use_nvdec_);
-    ~GPU();
+    virtual ~GPU();

    /// Binds a renderer to the GPU.
    void BindRenderer(std::unique_ptr<VideoCore::RendererBase> renderer);
@@ -156,7 +176,7 @@ public:
    /// Synchronizes CPU writes with Host GPU memory.
    void SyncGuestHost();
    /// Signal the ending of command list.
-    void OnCommandListEnd();
+    virtual void OnCommandListEnd();

    /// Request a host GPU memory flush from the CPU.
    [[nodiscard]] u64 RequestFlush(VAddr addr, std::size_t size);
@@ -220,7 +240,7 @@ public:
    }

    // Waits for the GPU to finish working
-    void WaitIdle() const;
+    virtual void WaitIdle() const = 0;

    /// Allows the CPU/NvFlinger to wait on the GPU before presenting a frame.
    void WaitFence(u32 syncpoint_id, u32 value);
@@ -310,34 +330,34 @@ public:
    /// Performs any additional setup necessary in order to begin GPU emulation.
    /// This can be used to launch any necessary threads and register any necessary
    /// core timing events.
-    void Start();
+    virtual void Start() = 0;

    /// Obtain the CPU Context
-    void ObtainContext();
+    virtual void ObtainContext() = 0;

    /// Release the CPU Context
-    void ReleaseContext();
+    virtual void ReleaseContext() = 0;

    /// Push GPU command entries to be processed
-    void PushGPUEntries(Tegra::CommandList&& entries);
+    virtual void PushGPUEntries(Tegra::CommandList&& entries) = 0;

    /// Push GPU command buffer entries to be processed
-    void PushCommandBuffer(Tegra::ChCommandHeaderList& entries);
+    virtual void PushCommandBuffer(Tegra::ChCommandHeaderList& entries) = 0;

    /// Swap buffers (render frame)
-    void SwapBuffers(const Tegra::FramebufferConfig* framebuffer);
+    virtual void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) = 0;

    /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
-    void FlushRegion(VAddr addr, u64 size);
+    virtual void FlushRegion(VAddr addr, u64 size) = 0;

    /// Notify rasterizer that any caches of the specified region should be invalidated
-    void InvalidateRegion(VAddr addr, u64 size);
+    virtual void InvalidateRegion(VAddr addr, u64 size) = 0;

    /// Notify rasterizer that any caches of the specified region should be flushed and invalidated
-    void FlushAndInvalidateRegion(VAddr addr, u64 size);
+    virtual void FlushAndInvalidateRegion(VAddr addr, u64 size) = 0;

 protected:
-    void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const;
+    virtual void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const = 0;

 private:
    void ProcessBindMethod(const MethodCall& method_call);
@@ -407,9 +427,6 @@ private:
    std::mutex flush_request_mutex;

    const bool is_async;
-
-    VideoCommon::GPUThread::ThreadManager gpu_thread;
-    std::unique_ptr<Core::Frontend::GraphicsContext> cpu_context;
 };

 #define ASSERT_REG_POSITION(field_name, position)                                                  \
--- a/src/video_core/gpu_asynch.cpp
+++ b/src/video_core/gpu_asynch.cpp
@@ -0,0 +1,86 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "core/core.h"
+#include "core/hardware_interrupt_manager.h"
+#include "video_core/gpu_asynch.h"
+#include "video_core/gpu_thread.h"
+#include "video_core/renderer_base.h"
+
+namespace VideoCommon {
+
+GPUAsynch::GPUAsynch(Core::System& system_, bool use_nvdec_)
+    : GPU{system_, true, use_nvdec_}, gpu_thread{system_} {}
+
+GPUAsynch::~GPUAsynch() = default;
+
+void GPUAsynch::Start() {
+    gpu_thread.StartThread(*renderer, renderer->Context(), *dma_pusher, *cdma_pusher);
+    cpu_context = renderer->GetRenderWindow().CreateSharedContext();
+    cpu_context->MakeCurrent();
+}
+
+void GPUAsynch::ObtainContext() {
+    cpu_context->MakeCurrent();
+}
+
+void GPUAsynch::ReleaseContext() {
+    cpu_context->DoneCurrent();
+}
+
+void GPUAsynch::PushGPUEntries(Tegra::CommandList&& entries) {
+    gpu_thread.SubmitList(std::move(entries));
+}
+
+void GPUAsynch::PushCommandBuffer(Tegra::ChCommandHeaderList& entries) {
+    if (!use_nvdec) { // Command buffers are ignored entirely when nvdec is disabled
+        return;
+    }
+    // This condition fires when a video stream ends; clear all intermediary data
+    if (entries[0].raw == 0xDEADB33F) { // NOTE(review): assumes entries is non-empty - confirm
+        cdma_pusher.reset();
+        return;
+    }
+    if (!cdma_pusher) { // Create the pusher on demand, e.g. after an end-of-stream reset above
+        cdma_pusher = std::make_unique<Tegra::CDmaPusher>(*this);
+    }
+
+    // SubmitCommandBuffer would make the nvdec operations async, but this is not currently working
+    // TODO(ameerj): RE proper async nvdec operation
+    // gpu_thread.SubmitCommandBuffer(std::move(entries));
+
+    cdma_pusher->Push(std::move(entries));
+    cdma_pusher->DispatchCalls();
+}
+
+void GPUAsynch::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
+    gpu_thread.SwapBuffers(framebuffer);
+}
+
+void GPUAsynch::FlushRegion(VAddr addr, u64 size) {
+    gpu_thread.FlushRegion(addr, size);
+}
+
+void GPUAsynch::InvalidateRegion(VAddr addr, u64 size) {
+    gpu_thread.InvalidateRegion(addr, size);
+}
+
+void GPUAsynch::FlushAndInvalidateRegion(VAddr addr, u64 size) {
+    gpu_thread.FlushAndInvalidateRegion(addr, size);
+}
+
+void GPUAsynch::TriggerCpuInterrupt(const u32 syncpoint_id, const u32 value) const {
+    auto& interrupt_manager = system.InterruptManager();
+    interrupt_manager.GPUInterruptSyncpt(syncpoint_id, value);
+}
+
+void GPUAsynch::WaitIdle() const {
+    gpu_thread.WaitIdle();
+}
+
+void GPUAsynch::OnCommandListEnd() {
+    gpu_thread.OnCommandListEnd();
+}
+
+} // namespace VideoCommon
--- a/src/video_core/gpu_asynch.h
+++ b/src/video_core/gpu_asynch.h
@@ -0,0 +1,47 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "video_core/gpu.h"
+#include "video_core/gpu_thread.h"
+
+namespace Core::Frontend {
+class GraphicsContext;
+}
+
+namespace VideoCore {
+class RendererBase;
+} // namespace VideoCore
+
+namespace VideoCommon {
+
+/// Implementation of GPU interface that runs the GPU asynchronously
+class GPUAsynch final : public Tegra::GPU {
+public:
+    explicit GPUAsynch(Core::System& system_, bool use_nvdec_);
+    ~GPUAsynch() override;
+
+    void Start() override;
+    void ObtainContext() override;
+    void ReleaseContext() override;
+    void PushGPUEntries(Tegra::CommandList&& entries) override;
+    void PushCommandBuffer(Tegra::ChCommandHeaderList& entries) override;
+    void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;
+    void FlushRegion(VAddr addr, u64 size) override;
+    void InvalidateRegion(VAddr addr, u64 size) override;
+    void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
+    void WaitIdle() const override;
+
+    void OnCommandListEnd() override;
+
+protected:
+    void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const override;
+
+private:
+    GPUThread::ThreadManager gpu_thread;
+    std::unique_ptr<Core::Frontend::GraphicsContext> cpu_context;
+};
+
+} // namespace VideoCommon
--- a/src/video_core/gpu_synch.cpp
+++ b/src/video_core/gpu_synch.cpp
@@ -0,0 +1,61 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "video_core/gpu_synch.h"
+#include "video_core/renderer_base.h"
+
+namespace VideoCommon {
+
+GPUSynch::GPUSynch(Core::System& system_, bool use_nvdec_) : GPU{system_, false, use_nvdec_} {}
+
+GPUSynch::~GPUSynch() = default;
+
+void GPUSynch::Start() {}
+
+void GPUSynch::ObtainContext() {
+    renderer->Context().MakeCurrent();
+}
+
+void GPUSynch::ReleaseContext() {
+    renderer->Context().DoneCurrent();
+}
+
+void GPUSynch::PushGPUEntries(Tegra::CommandList&& entries) {
+    dma_pusher->Push(std::move(entries));
+    dma_pusher->DispatchCalls();
+}
+
+void GPUSynch::PushCommandBuffer(Tegra::ChCommandHeaderList& entries) {
+    if (!use_nvdec) { // Command buffers are ignored entirely when nvdec is disabled
+        return;
+    }
+    // This condition fires when a video stream ends; clear all intermediary data
+    if (entries[0].raw == 0xDEADB33F) { // NOTE(review): assumes entries is non-empty - confirm
+        cdma_pusher.reset();
+        return;
+    }
+    if (!cdma_pusher) { // Create the pusher on demand, e.g. after an end-of-stream reset above
+        cdma_pusher = std::make_unique<Tegra::CDmaPusher>(*this);
+    }
+    cdma_pusher->Push(std::move(entries));
+    cdma_pusher->DispatchCalls();
+}
+
+void GPUSynch::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
+    renderer->SwapBuffers(framebuffer);
+}
+
+void GPUSynch::FlushRegion(VAddr addr, u64 size) {
+    renderer->Rasterizer().FlushRegion(addr, size);
+}
+
+void GPUSynch::InvalidateRegion(VAddr addr, u64 size) {
+    renderer->Rasterizer().InvalidateRegion(addr, size);
+}
+
+void GPUSynch::FlushAndInvalidateRegion(VAddr addr, u64 size) {
+    renderer->Rasterizer().FlushAndInvalidateRegion(addr, size);
+}
+
+} // namespace VideoCommon
--- a/src/video_core/gpu_synch.h
+++ b/src/video_core/gpu_synch.h
@@ -0,0 +1,41 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "video_core/gpu.h"
+
+namespace Core::Frontend {
+class GraphicsContext;
+}
+
+namespace VideoCore {
+class RendererBase;
+} // namespace VideoCore
+
+namespace VideoCommon {
+
+/// Implementation of GPU interface that runs the GPU synchronously
+class GPUSynch final : public Tegra::GPU {
+public:
+    explicit GPUSynch(Core::System& system_, bool use_nvdec_);
+    ~GPUSynch() override;
+
+    void Start() override;
+    void ObtainContext() override;
+    void ReleaseContext() override;
+    void PushGPUEntries(Tegra::CommandList&& entries) override;
+    void PushCommandBuffer(Tegra::ChCommandHeaderList& entries) override;
+    void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;
+    void FlushRegion(VAddr addr, u64 size) override;
+    void InvalidateRegion(VAddr addr, u64 size) override;
+    void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
+    void WaitIdle() const override {}
+
+protected:
+    void TriggerCpuInterrupt([[maybe_unused]] u32 syncpoint_id,
+                             [[maybe_unused]] u32 value) const override {}
+};
+
+} // namespace VideoCommon
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -4,7 +4,6 @@

 #include "common/assert.h"
 #include "common/microprofile.h"
-#include "common/scope_exit.h"
 #include "common/thread.h"
 #include "core/core.h"
 #include "core/frontend/emu_window.h"
@@ -22,8 +21,6 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
                      SynchState& state, Tegra::CDmaPusher& cdma_pusher) {
    std::string name = "yuzu:GPU";
    MicroProfileOnThreadCreate(name.c_str());
-    SCOPE_EXIT({ MicroProfileOnThreadExit(); });
-
    Common::SetCurrentThreadName(name.c_str());
    Common::SetCurrentThreadPriority(Common::ThreadPriority::High);
    system.RegisterHostThread();
@@ -68,8 +65,7 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
    }
 }

-ThreadManager::ThreadManager(Core::System& system_, bool is_async_)
-    : system{system_}, is_async{is_async_} {}
+ThreadManager::ThreadManager(Core::System& system_) : system{system_} {}

 ThreadManager::~ThreadManager() {
    if (!thread.joinable()) {
@@ -101,30 +97,19 @@ void ThreadManager::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
 }

 void ThreadManager::FlushRegion(VAddr addr, u64 size) {
-    if (!is_async) {
-        // Always flush with synchronous GPU mode
+    if (!Settings::IsGPULevelHigh()) {
        PushCommand(FlushRegionCommand(addr, size));
        return;
    }
-
-    // Asynchronous GPU mode
-    switch (Settings::values.gpu_accuracy.GetValue()) {
-    case Settings::GPUAccuracy::Normal:
-        PushCommand(FlushRegionCommand(addr, size));
-        break;
-    case Settings::GPUAccuracy::High:
-        // TODO(bunnei): Is this right? Preserving existing behavior for now
-        break;
-    case Settings::GPUAccuracy::Extreme: {
+    if (!Settings::IsGPULevelExtreme()) {
+        return;
+    }
+    if (system.Renderer().Rasterizer().MustFlushRegion(addr, size)) {
        auto& gpu = system.GPU();
        u64 fence = gpu.RequestFlush(addr, size);
        PushCommand(GPUTickCommand());
        while (fence > gpu.CurrentFlushRequestFence()) {
        }
-        break;
-    }
-    default:
-        UNIMPLEMENTED_MSG("Unsupported gpu_accuracy {}", Settings::values.gpu_accuracy.GetValue());
    }
 }

@@ -138,8 +123,7 @@ void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) {
 }

 void ThreadManager::WaitIdle() const {
-    while (state.last_fence > state.signaled_fence.load(std::memory_order_relaxed) &&
-           system.IsPoweredOn()) {
+    while (state.last_fence > state.signaled_fence.load(std::memory_order_relaxed)) {
    }
 }

@@ -150,12 +134,6 @@ void ThreadManager::OnCommandListEnd() {
 u64 ThreadManager::PushCommand(CommandData&& command_data) {
    const u64 fence{++state.last_fence};
    state.queue.Push(CommandDataContainer(std::move(command_data), fence));
-
-    if (!is_async) {
-        // In synchronous GPU mode, block the caller until the command has executed
-        WaitIdle();
-    }
-
    return fence;
 }

--- a/src/video_core/gpu_thread.h
+++ b/src/video_core/gpu_thread.h
@@ -10,9 +10,8 @@
 #include <optional>
 #include <thread>
 #include <variant>
-
 #include "common/threadsafe_queue.h"
-#include "video_core/framebuffer_config.h"
+#include "video_core/gpu.h"

 namespace Tegra {
 struct FramebufferConfig;
@@ -26,10 +25,6 @@ class GraphicsContext;
 class System;
 } // namespace Core

-namespace VideoCore {
-class RendererBase;
-} // namespace VideoCore
-
 namespace VideoCommon::GPUThread {

 /// Command to signal to the GPU thread that processing has ended
@@ -117,7 +112,7 @@ struct SynchState final {
 /// Class used to manage the GPU thread
 class ThreadManager final {
 public:
-    explicit ThreadManager(Core::System& system_, bool is_async_);
+    explicit ThreadManager(Core::System& system_);
    ~ThreadManager();

    /// Creates and starts the GPU thread.
@@ -155,7 +150,6 @@ private:
    Core::System& system;
    std::thread thread;
    std::thread::id thread_id;
-    const bool is_async;
 };

 } // namespace VideoCommon::GPUThread
--- a/src/video_core/host_shaders/CMakeLists.txt
+++ b/src/video_core/host_shaders/CMakeLists.txt
@@ -1,8 +1,26 @@
-set(SHADER_SOURCES
+set(SHADER_FILES
+    block_linear_unswizzle_2d.comp
+    block_linear_unswizzle_3d.comp
+    convert_depth_to_float.frag
+    convert_float_to_depth.frag
+    full_screen_triangle.vert
+    opengl_copy_bc4.comp
    opengl_present.frag
    opengl_present.vert
+    pitch_unswizzle.comp
+    vulkan_blit_color_float.frag
+    vulkan_blit_depth_stencil.frag
+    vulkan_present.frag
+    vulkan_present.vert
+    vulkan_quad_array.comp
+    vulkan_quad_indexed.comp
+    vulkan_uint8.comp
 )

+find_program(GLSLANGVALIDATOR "glslangValidator" REQUIRED)
+
+set(GLSL_FLAGS "")
+
 set(SHADER_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/include)
 set(SHADER_DIR ${SHADER_INCLUDE}/video_core/host_shaders)
 set(HOST_SHADERS_INCLUDE ${SHADER_INCLUDE} PARENT_SCOPE)
@@ -10,27 +28,44 @@ set(HOST_SHADERS_INCLUDE ${SHADER_INCLUDE} PARENT_SCOPE)
 set(INPUT_FILE ${CMAKE_CURRENT_SOURCE_DIR}/source_shader.h.in)
 set(HEADER_GENERATOR ${CMAKE_CURRENT_SOURCE_DIR}/StringShaderHeader.cmake)

-foreach(FILENAME IN ITEMS ${SHADER_SOURCES})
+foreach(FILENAME IN ITEMS ${SHADER_FILES})
    string(REPLACE "." "_" SHADER_NAME ${FILENAME})
    set(SOURCE_FILE ${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME})
-    set(HEADER_FILE ${SHADER_DIR}/${SHADER_NAME}.h)
-    add_custom_command(
-        OUTPUT
-            ${HEADER_FILE}
-        COMMAND
-            ${CMAKE_COMMAND} -P ${HEADER_GENERATOR} ${SOURCE_FILE} ${HEADER_FILE} ${INPUT_FILE}
-        MAIN_DEPENDENCY
-            ${SOURCE_FILE}
-        DEPENDS
-            ${INPUT_FILE}
-            # HEADER_GENERATOR should be included here but msbuild seems to assume it's always modified
-    )
-    set(SHADER_HEADERS ${SHADER_HEADERS} ${HEADER_FILE})
+    # Skip generating source headers on Vulkan exclusive files
+    if (NOT ${FILENAME} MATCHES "vulkan.*")
+        set(SOURCE_HEADER_FILE ${SHADER_DIR}/${SHADER_NAME}.h)
+        add_custom_command(
+            OUTPUT
+                ${SOURCE_HEADER_FILE}
+            COMMAND
+                ${CMAKE_COMMAND} -P ${HEADER_GENERATOR} ${SOURCE_FILE} ${SOURCE_HEADER_FILE} ${INPUT_FILE}
+            MAIN_DEPENDENCY
+                ${SOURCE_FILE}
+            DEPENDS
+                ${INPUT_FILE}
+                # HEADER_GENERATOR should be included here but msbuild seems to assume it's always modified
+        )
+        set(SHADER_HEADERS ${SHADER_HEADERS} ${SOURCE_HEADER_FILE})
+    endif()
+    # Skip compiling to SPIR-V OpenGL exclusive files
+    if (NOT ${FILENAME} MATCHES "opengl.*")
+        string(TOUPPER ${SHADER_NAME}_SPV SPIRV_VARIABLE_NAME)
+        set(SPIRV_HEADER_FILE ${SHADER_DIR}/${SHADER_NAME}_spv.h)
+        add_custom_command(
+            OUTPUT
+                ${SPIRV_HEADER_FILE}
+            COMMAND
+                ${GLSLANGVALIDATOR} -V ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE}
+            MAIN_DEPENDENCY
+                ${SOURCE_FILE}
+        )
+        set(SHADER_HEADERS ${SHADER_HEADERS} ${SPIRV_HEADER_FILE})
+    endif()
 endforeach()

 add_custom_target(host_shaders
    DEPENDS
        ${SHADER_HEADERS}
    SOURCES
-        ${SHADER_SOURCES}
+        ${SHADER_FILES}
 )
--- a/src/video_core/host_shaders/block_linear_unswizzle_2d.comp
+++ b/src/video_core/host_shaders/block_linear_unswizzle_2d.comp
@@ -0,0 +1,122 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#version 430
+
+#ifdef VULKAN
+
+#extension GL_EXT_shader_16bit_storage : require
+#extension GL_EXT_shader_8bit_storage : require
+#define HAS_EXTENDED_TYPES 1
+#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
+#define END_PUSH_CONSTANTS };
+#define UNIFORM(n)
+#define BINDING_SWIZZLE_BUFFER 0
+#define BINDING_INPUT_BUFFER 1
+#define BINDING_OUTPUT_IMAGE 2
+
+#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
+
+#extension GL_NV_gpu_shader5 : enable
+#ifdef GL_NV_gpu_shader5
+#define HAS_EXTENDED_TYPES 1
+#else
+#define HAS_EXTENDED_TYPES 0
+#endif
+#define BEGIN_PUSH_CONSTANTS
+#define END_PUSH_CONSTANTS
+#define UNIFORM(n) layout (location = n) uniform
+#define BINDING_SWIZZLE_BUFFER 0
+#define BINDING_INPUT_BUFFER 1
+#define BINDING_OUTPUT_IMAGE 0
+
+#endif
+
+BEGIN_PUSH_CONSTANTS
+UNIFORM(0) uvec3 origin;
+UNIFORM(1) ivec3 destination;
+UNIFORM(2) uint bytes_per_block_log2;
+UNIFORM(3) uint layer_stride;
+UNIFORM(4) uint block_size;
+UNIFORM(5) uint x_shift;
+UNIFORM(6) uint block_height;
+UNIFORM(7) uint block_height_mask;
+END_PUSH_CONSTANTS
+
+layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable {
+    uint swizzle_table[];
+};
+
+#if HAS_EXTENDED_TYPES
+layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU8 { uint8_t u8data[]; };
+layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU16 { uint16_t u16data[]; };
+#endif
+layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU32 { uint u32data[]; };
+layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU64 { uvec2 u64data[]; };
+layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU128 { uvec4 u128data[]; };
+
+layout(binding = BINDING_OUTPUT_IMAGE) uniform writeonly uimage2DArray output_image;
+
+layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in;
+
+const uint GOB_SIZE_X = 64;
+const uint GOB_SIZE_Y = 8;
+const uint GOB_SIZE_Z = 1;
+const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z;
+
+const uint GOB_SIZE_X_SHIFT = 6;
+const uint GOB_SIZE_Y_SHIFT = 3;
+const uint GOB_SIZE_Z_SHIFT = 0;
+const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT;
+
+const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1, GOB_SIZE_Y - 1);
+
+uint SwizzleOffset(uvec2 pos) { // Returns the swizzle table entry for pos wrapped into one GOB
+    pos = pos & SWIZZLE_MASK;
+    return swizzle_table[pos.y * GOB_SIZE_X + pos.x]; // Table rows are GOB_SIZE_X entries wide
+}
+
+uvec4 ReadTexel(uint offset) {
+    switch (bytes_per_block_log2) {
+#if HAS_EXTENDED_TYPES
+    case 0:
+        return uvec4(u8data[offset], 0, 0, 0);
+    case 1:
+        return uvec4(u16data[offset / 2], 0, 0, 0);
+#else
+    case 0:
+        return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 24), 8), 0, 0, 0);
+    case 1:
+        return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 16), 16), 0, 0, 0);
+#endif
+    case 2:
+        return uvec4(u32data[offset / 4], 0, 0, 0);
+    case 3:
+        return uvec4(u64data[offset / 8], 0, 0);
+    case 4:
+        return u128data[offset / 16];
+    }
+    return uvec4(0);
+}
+
+void main() {
+    uvec3 pos = gl_GlobalInvocationID + origin;
+    pos.x <<= bytes_per_block_log2;
+
+    // Read as soon as possible due to its latency
+    const uint swizzle = SwizzleOffset(pos.xy);
+
+    const uint block_y = pos.y >> GOB_SIZE_Y_SHIFT;
+
+    uint offset = 0;
+    offset += pos.z * layer_stride;
+    offset += (block_y >> block_height) * block_size;
+    offset += (block_y & block_height_mask) << GOB_SIZE_SHIFT;
+    offset += (pos.x >> GOB_SIZE_X_SHIFT) << x_shift;
+    offset += swizzle;
+
+    const uvec4 texel = ReadTexel(offset);
+    const ivec3 coord = ivec3(gl_GlobalInvocationID) + destination;
+    imageStore(output_image, coord, texel);
+}
--- a/src/video_core/host_shaders/block_linear_unswizzle_3d.comp
+++ b/src/video_core/host_shaders/block_linear_unswizzle_3d.comp
@@ -0,0 +1,125 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#version 430
+
+#ifdef VULKAN
+
+#extension GL_EXT_shader_16bit_storage : require
+#extension GL_EXT_shader_8bit_storage : require
+#define HAS_EXTENDED_TYPES 1
+#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
+#define END_PUSH_CONSTANTS };
+#define UNIFORM(n)
+#define BINDING_SWIZZLE_BUFFER 0
+#define BINDING_INPUT_BUFFER 1
+#define BINDING_OUTPUT_IMAGE 2
+
+#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
+
+#extension GL_NV_gpu_shader5 : enable
+#ifdef GL_NV_gpu_shader5
+#define HAS_EXTENDED_TYPES 1
+#else
+#define HAS_EXTENDED_TYPES 0
+#endif
+#define BEGIN_PUSH_CONSTANTS
+#define END_PUSH_CONSTANTS
+#define UNIFORM(n) layout (location = n) uniform
+#define BINDING_SWIZZLE_BUFFER 0
+#define BINDING_INPUT_BUFFER 1
+#define BINDING_OUTPUT_IMAGE 0
+
+#endif
+
+BEGIN_PUSH_CONSTANTS
+UNIFORM(0) uvec3 origin;
+UNIFORM(1) ivec3 destination;
+UNIFORM(2) uint bytes_per_block_log2;
+UNIFORM(3) uint slice_size;
+UNIFORM(4) uint block_size;
+UNIFORM(5) uint x_shift;
+UNIFORM(6) uint block_height;
+UNIFORM(7) uint block_height_mask;
+UNIFORM(8) uint block_depth;
+UNIFORM(9) uint block_depth_mask;
+END_PUSH_CONSTANTS
+
+layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable {
+    uint swizzle_table[];
+};
+
+#if HAS_EXTENDED_TYPES
+layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU8 { uint8_t u8data[]; };
+layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU16 { uint16_t u16data[]; };
+#endif
+layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU32 { uint u32data[]; };
+layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU64 { uvec2 u64data[]; };
+layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU128 { uvec4 u128data[]; };
+
+layout(binding = BINDING_OUTPUT_IMAGE) uniform writeonly uimage3D output_image;
+
+layout(local_size_x = 16, local_size_y = 8, local_size_z = 8) in;
+
+const uint GOB_SIZE_X = 64;
+const uint GOB_SIZE_Y = 8;
+const uint GOB_SIZE_Z = 1;
+const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z;
+
+const uint GOB_SIZE_X_SHIFT = 6;
+const uint GOB_SIZE_Y_SHIFT = 3;
+const uint GOB_SIZE_Z_SHIFT = 0;
+const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT;
+
+const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1, GOB_SIZE_Y - 1);
+
+uint SwizzleOffset(uvec2 pos) { // Returns the swizzle table entry for pos wrapped into one GOB
+    pos = pos & SWIZZLE_MASK;
+    return swizzle_table[pos.y * GOB_SIZE_X + pos.x]; // Table rows are GOB_SIZE_X entries wide
+}
+
+uvec4 ReadTexel(uint offset) {
+    switch (bytes_per_block_log2) {
+#if HAS_EXTENDED_TYPES
+    case 0:
+        return uvec4(u8data[offset], 0, 0, 0);
+    case 1:
+        return uvec4(u16data[offset / 2], 0, 0, 0);
+#else
+    case 0:
+        return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 24), 8), 0, 0, 0);
+    case 1:
+        return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 16), 16), 0, 0, 0);
+#endif
+    case 2:
+        return uvec4(u32data[offset / 4], 0, 0, 0);
+    case 3:
+        return uvec4(u64data[offset / 8], 0, 0);
+    case 4:
+        return u128data[offset / 16];
+    }
+    return uvec4(0);
+}
+
+void main() {
+    uvec3 pos = gl_GlobalInvocationID + origin;
+    pos.x <<= bytes_per_block_log2;
+
+    // Read as soon as possible due to its latency
+    const uint swizzle = SwizzleOffset(pos.xy);
+
+    const uint block_y = pos.y >> GOB_SIZE_Y_SHIFT;
+
+    uint offset = 0;
+    offset += (pos.z >> block_depth) * slice_size;
+    offset += (pos.z & block_depth_mask) << (GOB_SIZE_SHIFT + block_height);
+    offset += (block_y >> block_height) * block_size;
+    offset += (block_y & block_height_mask) << GOB_SIZE_SHIFT;
+    offset += (pos.x >> GOB_SIZE_X_SHIFT) << x_shift;
+    offset += swizzle;
+
+    const uvec4 texel = ReadTexel(offset);
+    const ivec3 coord = ivec3(gl_GlobalInvocationID) + destination;
+    imageStore(output_image, coord, texel);
+}
--- a/src/video_core/host_shaders/convert_depth_to_float.frag
+++ b/src/video_core/host_shaders/convert_depth_to_float.frag
@@ -0,0 +1,13 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#version 450
+
+layout(binding = 0) uniform sampler2D depth_texture;
+layout(location = 0) out float output_color;
+
+void main() {
+    ivec2 coord = ivec2(gl_FragCoord.xy);
+    output_color = texelFetch(depth_texture, coord, 0).r;
+}
--- a/src/video_core/host_shaders/convert_float_to_depth.frag
+++ b/src/video_core/host_shaders/convert_float_to_depth.frag
@@ -0,0 +1,13 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#version 450
+
+layout(binding = 0) uniform sampler2D color_texture;
+
+void main() {
+    ivec2 coord = ivec2(gl_FragCoord.xy);
+    float color = texelFetch(color_texture, coord, 0).r;
+    gl_FragDepth = color;
+}
--- a/src/video_core/host_shaders/full_screen_triangle.vert
+++ b/src/video_core/host_shaders/full_screen_triangle.vert
@@ -0,0 +1,38 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#version 450
+
+// Derives the vertex position from the vertex id alone (no vertex inputs are declared) and
+// outputs texture coordinates transformed by tex_scale and tex_offset.
+
+#ifdef VULKAN
+// Vulkan GLSL exposes the vertex id as gl_VertexIndex
+#define VERTEX_ID gl_VertexIndex
+#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
+#define END_PUSH_CONSTANTS };
+#define UNIFORM(n)
+#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
+// OpenGL GLSL has no gl_VertexIndex; its counterpart is gl_VertexID
+#define VERTEX_ID gl_VertexID
+#define BEGIN_PUSH_CONSTANTS
+#define END_PUSH_CONSTANTS
+#define UNIFORM(n) layout (location = n) uniform
+#endif
+
+BEGIN_PUSH_CONSTANTS
+UNIFORM(0) vec2 tex_scale;
+UNIFORM(1) vec2 tex_offset;
+END_PUSH_CONSTANTS
+
+layout(location = 0) out vec2 texcoord;
+
+void main() {
+    // Vertex ids 0, 1, 2 produce (x, y) = (0, 0), (4, 0), (0, 4)
+    float x = float((VERTEX_ID & 1) << 2);
+    float y = float((VERTEX_ID & 2) << 1);
+    // Shifted by -1 these become (-1, -1), (3, -1), (-1, 3)
+    gl_Position = vec4(x - 1.0, y - 1.0, 0.0, 1.0);
+    texcoord = fma(vec2(x, y) / 2.0, tex_scale, tex_offset);
+}
--- a/src/video_core/host_shaders/opengl_copy_bc4.comp
+++ b/src/video_core/host_shaders/opengl_copy_bc4.comp
@@ -0,0 +1,70 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#version 430 core
+#extension GL_ARB_gpu_shader_int64 : require
+
+layout (local_size_x = 4, local_size_y = 4) in; // 4x4 invocations per work group
+
+layout(binding = 0, rg32ui) readonly uniform uimage3D bc4_input; // each texel: two 32-bit halves of a block
+layout(binding = 1, rgba8ui) writeonly uniform uimage3D bc4_output; // decoded texels
+
+layout(location = 0) uniform uvec3 src_offset; // added to the work group id when loading
+layout(location = 1) uniform uvec3 dst_offset; // added to the global invocation id when storing
+
+// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_compression_rgtc.txt
+uint DecompressBlock(uint64_t bits, uvec2 coord) { // red value (0..255) of texel coord in the 4x4 block bits
+    const uint code_offset = 16 + 3 * (4 * coord.y + coord.x); // 3-bit selectors start at bit 16, row by row
+    const uint code = uint(bits >> code_offset) & 7; // selector of this texel
+    const uint red0 = uint(bits >> 0) & 0xff; // first endpoint, bits 0-7
+    const uint red1 = uint(bits >> 8) & 0xff; // second endpoint, bits 8-15
+    if (red0 > red1) { // selectors 2-7 blend the two endpoints in sevenths
+        switch (code) {
+        case 0:
+            return red0;
+        case 1:
+            return red1;
+        case 2:
+            return (6 * red0 + 1 * red1) / 7;
+        case 3:
+            return (5 * red0 + 2 * red1) / 7;
+        case 4:
+            return (4 * red0 + 3 * red1) / 7;
+        case 5:
+            return (3 * red0 + 4 * red1) / 7;
+        case 6:
+            return (2 * red0 + 5 * red1) / 7;
+        case 7:
+            return (1 * red0 + 6 * red1) / 7;
+        }
+    } else { // selectors 2-5 blend in fifths; 6 and 7 are the constants 0 and 0xff
+        switch (code) {
+        case 0:
+            return red0;
+        case 1:
+            return red1;
+        case 2:
+            return (4 * red0 + 1 * red1) / 5;
+        case 3:
+            return (3 * red0 + 2 * red1) / 5;
+        case 4:
+            return (2 * red0 + 3 * red1) / 5;
+        case 5:
+            return (1 * red0 + 4 * red1) / 5;
+        case 6:
+            return 0;
+        case 7:
+            return 0xff;
+        }
+    }
+    return 0; // not reached for code in 0..7; gives every path a return value
+}
+
+void main() {
+    // The work group id selects the source block; the local id selects the texel inside it.
+    uint64_t block_bits = packUint2x32(imageLoad(bc4_input, ivec3(gl_WorkGroupID + src_offset)).rg);
+    uint red_value = DecompressBlock(block_bits, gl_LocalInvocationID.xy) & 0xff;
+    ivec3 texel_pos = ivec3(gl_GlobalInvocationID + dst_offset);
+    imageStore(bc4_output, texel_pos, uvec4(red_value, 0, 0, 0xff));
+}
--- a/src/video_core/host_shaders/opengl_present.frag
+++ b/src/video_core/host_shaders/opengl_present.frag
@@ -1,3 +1,7 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
 #version 430 core

 layout (location = 0) in vec2 frag_tex_coord;
--- a/src/video_core/host_shaders/opengl_present.vert
+++ b/src/video_core/host_shaders/opengl_present.vert
@@ -1,3 +1,7 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
 #version 430 core

 out gl_PerVertex {
--- a/src/video_core/host_shaders/pitch_unswizzle.comp
+++ b/src/video_core/host_shaders/pitch_unswizzle.comp
@@ -0,0 +1,86 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#version 430
+
+#ifdef VULKAN // parameters become push constants; 8/16-bit storage extensions are required
+
+#extension GL_EXT_shader_16bit_storage : require
+#extension GL_EXT_shader_8bit_storage : require
+#define HAS_EXTENDED_TYPES 1
+#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
+#define END_PUSH_CONSTANTS };
+#define UNIFORM(n)
+#define BINDING_INPUT_BUFFER 0
+#define BINDING_OUTPUT_IMAGE 1
+
+#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
+
+#extension GL_NV_gpu_shader5 : enable
+#ifdef GL_NV_gpu_shader5 // 8/16-bit types are used only when this extension is available
+#define HAS_EXTENDED_TYPES 1
+#else
+#define HAS_EXTENDED_TYPES 0
+#endif
+#define BEGIN_PUSH_CONSTANTS
+#define END_PUSH_CONSTANTS
+#define UNIFORM(n) layout (location = n) uniform
+#define BINDING_INPUT_BUFFER 0
+#define BINDING_OUTPUT_IMAGE 0
+
+#endif
+
+BEGIN_PUSH_CONSTANTS
+UNIFORM(0) uvec2 origin; // added to the invocation id to get the source texel position
+UNIFORM(1) ivec2 destination; // added to the invocation id to get the output image coordinate
+UNIFORM(2) uint bytes_per_block; // size of one source texel in bytes (1, 2, 4, 8 or 16 are handled)
+UNIFORM(3) uint pitch; // byte distance between consecutive source rows
+END_PUSH_CONSTANTS
+
+#if HAS_EXTENDED_TYPES // the blocks below all use BINDING_INPUT_BUFFER, differing only in element type
+layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU8 { uint8_t u8data[]; };
+layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU16 { uint16_t u16data[]; };
+#endif
+layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU32 { uint u32data[]; };
+layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU64 { uvec2 u64data[]; };
+layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU128 { uvec4 u128data[]; };
+
+layout(binding = BINDING_OUTPUT_IMAGE) writeonly uniform uimage2D output_image;
+
+layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in; // one invocation per texel
+
+uvec4 ReadTexel(uint offset) { // loads the texel at byte offset of the input buffer; unused components are 0
+    switch (bytes_per_block) { // the texel size picks which typed view of the buffer is indexed
+#if HAS_EXTENDED_TYPES // narrow element types available: index the 8/16-bit views directly
+    case 1:
+        return uvec4(u8data[offset], 0, 0, 0);
+    case 2:
+        return uvec4(u16data[offset / 2], 0, 0, 0);
+#else // no narrow types: extract the byte/halfword from the 32-bit word that contains it
+    case 1:
+        return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 24), 8), 0, 0, 0);
+    case 2:
+        return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 16), 16), 0, 0, 0);
+#endif
+    case 4:
+        return uvec4(u32data[offset / 4], 0, 0, 0);
+    case 8:
+        return uvec4(u64data[offset / 8], 0, 0);
+    case 16:
+        return u128data[offset / 16];
+    }
+    return uvec4(0); // bytes_per_block is none of the handled sizes
+}
+
+void main() {
+    // Locate this invocation's texel in the input buffer: bytes_per_block bytes per
+    // texel along x and pitch bytes per row along y.
+    const uvec2 src_pos = gl_GlobalInvocationID.xy + origin;
+    const uint byte_offset = src_pos.x * bytes_per_block + src_pos.y * pitch;
+
+    // Store the texel at the invocation's position in the output image, shifted by
+    // destination.
+    const ivec2 dst_pos = ivec2(gl_GlobalInvocationID.xy) + destination;
+    imageStore(output_image, dst_pos, ReadTexel(byte_offset));
+}
--- a/src/video_core/host_shaders/vulkan_blit_color_float.frag
+++ b/src/video_core/host_shaders/vulkan_blit_color_float.frag
@@ -0,0 +1,14 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#version 450
+
+layout(binding = 0) uniform sampler2D tex; // source image
+
+layout(location = 0) in vec2 texcoord; // coordinates to sample tex at
+layout(location = 0) out vec4 color;
+
+void main() { // samples mip level 0 of tex at texcoord and outputs the result unchanged
+    color = textureLod(tex, texcoord, 0);
+}
--- a/src/video_core/host_shaders/vulkan_blit_depth_stencil.frag
+++ b/src/video_core/host_shaders/vulkan_blit_depth_stencil.frag
@@ -0,0 +1,16 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#version 450
+#extension GL_ARB_shader_stencil_export : require
+
+layout(binding = 0) uniform sampler2D depth_tex; // source depth values
+layout(binding = 1) uniform isampler2D stencil_tex; // source stencil values, sampled as integers
+
+layout(location = 0) in vec2 texcoord; // coordinates both textures are sampled at
+
+void main() { // writes depth and stencil reference from mip level 0 of the two textures
+    gl_FragDepth = textureLod(depth_tex, texcoord, 0).r;
+    gl_FragStencilRefARB = textureLod(stencil_tex, texcoord, 0).r; // needs GL_ARB_shader_stencil_export
+}
--- a/src/video_core/renderer_vulkan/shaders/blit.frag
+++ b/src/video_core/renderer_vulkan/shaders/blit.frag
@@ -2,15 +2,6 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

-/*
- * Build instructions:
- * $ glslangValidator -V $THIS_FILE -o output.spv
- * $ spirv-opt -O --strip-debug output.spv -o optimized.spv
- * $ xxd -i optimized.spv
- *
- * Then copy that bytecode to the C++ file
- */
-
 #version 460 core

 layout (location = 0) in vec2 frag_tex_coord;
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
ReinUsesLisp	cdbee27692	vulkan_instance: Allow different Vulkan versions and enforce 1.1 For listing the available physical devices we can use Vulkan 1.0. Now that MoltenVK supports 1.1 we can require it for running games. Add missing documentation.	2020-12-31 02:07:34 -03:00
ReinUsesLisp	7344a7c447	vk_device: Use an array to report lacking device limits This makes it easier to add and tune the required device limits.	2020-12-31 02:07:34 -03:00
ReinUsesLisp	f687392e6f	vk_device: Stop initialization when device is not suitable VKDevice::IsSuitable was not being called. To address this issue, check suitability before initialization and throw an exception if it fails. By doing this, we can deduplicate some code on queue searches. Previously we would first search if a present and graphics queue existed, then on initialization we would search again to find the index.	2020-12-31 02:07:33 -03:00
ReinUsesLisp	53ea06dc17	renderer_vulkan: Remove two step initialization on VKDevice The Vulkan device abstraction either initializes successfully on the constructor or throws a Vulkan exception.	2020-12-31 02:07:33 -03:00
ReinUsesLisp	085adfea00	renderer_vulkan: Throw when enumerating devices fails Report device enumeration errors with exceptions to be consistent with other initialization related function calls. Reduces the amount of code to maintain.	2020-12-31 02:07:33 -03:00
ReinUsesLisp	11f0f7598d	renderer_vulkan: Initialize surface in separate file Move surface initialization code to a separate file. It's unlikely to use this code outside of Vulkan, but keeping platform-specific code (Win32, Xlib, Wayland) in its own translation unit keeps things cleaner.	2020-12-31 02:07:33 -03:00
ReinUsesLisp	dce8720780	renderer_vulkan: Catch and report exceptions Move more Vulkan code to report errors with exceptions and report them through a log before notifying it with an error boolean for backwards compatibility. In the future we can replace the rasterizer two-step initialization to always use exceptions.	2020-12-31 02:07:33 -03:00
ReinUsesLisp	47843b4f09	renderer_vulkan: Create debug callback on separate file and throw Initialize debug callbacks (messenger) from a separate file. This allows sharing code with different backends. Change our Vulkan error handling to use exceptions instead of error codes, simplifying the initialization process.	2020-12-31 02:07:33 -03:00
ReinUsesLisp	25f88d99ce	renderer_vulkan: Move instance initialization to a separate file Simplify Vulkan's backend initialization code by moving it to a separate file, allowing us to initialize a Vulkan instance from different backends.	2020-12-31 02:07:33 -03:00
ReinUsesLisp	d1435009ed	vulkan_common: Rename renderer_vulkan/wrapper.h to vulkan_common/vulkan_wrapper.h Allows sharing Vulkan wrapper code between different rendering backends.	2020-12-31 02:07:14 -03:00
ReinUsesLisp	d937421422	vulkan_common: Move dynamic library load to a separate file Allows us to initialize a Vulkan dynamic library from different backends without duplicating code.	2020-12-31 02:02:48 -03:00
bunnei	53e49e5360	Merge pull request #5263 from lioncash/uninit half_set: Resolve -Wmaybe-uninitialized warnings	2020-12-30 15:17:05 -08:00
Lioncash	bcafef4b94	half_set: Resolve -Wmaybe-uninitialized warnings	2020-12-30 17:59:42 -05:00
Rodrigo Locatti	dab7711524	Merge pull request #5260 from lioncash/uninit maxwell_to_vk: Initialize usage variable in SurfaceFormat()	2020-12-30 16:17:01 -03:00
Lioncash	f0d9ab0717	maxwell_to_vk: Initialize usage variable in SurfaceFormat() Silences a -Wmaybe-uninitialized warning	2020-12-30 13:25:03 -05:00
LC	da07977db0	Merge pull request #5251 from ReinUsesLisp/wuninitialized cmake: Enforce -Wuninitialized	2020-12-30 06:34:42 -05:00
bunnei	d5fe722a30	Merge pull request #4967 from ReinUsesLisp/new-texcache video_core/texture_cache: Rewrite the texture cache	2020-12-29 23:20:09 -08:00
ReinUsesLisp	9764c13d6d	video_core: Rewrite the texture cache The current texture cache has several points that hurt maintainability and performance. It's easy to break unrelated parts of the cache when doing minor changes. The cache can easily forget valuable information about the cached textures by CPU writes or simply by its normal usage. This commit aims to address those issues.	2020-12-30 03:38:50 -03:00
ReinUsesLisp	ac2e2ebe97	cmake: Enforce -Wuninitialized	2020-12-30 02:58:58 -03:00
ReinUsesLisp	157fc2d785	service/pcie: Fix invalid initialization argument	2020-12-30 02:58:38 -03:00
ReinUsesLisp	9106ac1e6b	video_core: Add a delayed destruction ring abstraction	2020-12-30 02:10:19 -03:00
ReinUsesLisp	21b18057f7	host_shaders: Add Vulkan assembler compute shaders	2020-12-30 02:03:50 -03:00
ReinUsesLisp	87ff58b1d7	host_shaders: Add helper to blit depth stencil fragment shader	2020-12-30 02:02:07 -03:00
ReinUsesLisp	ae5725b709	host_shaders: Add texture color blit fragment shader	2020-12-30 02:00:48 -03:00
ReinUsesLisp	64fbf319f1	host_shaders: Add shaders to present to the swapchain	2020-12-30 01:59:12 -03:00
ReinUsesLisp	82b7daed9c	host_shaders: Add shaders to convert between depth and color images	2020-12-30 01:48:44 -03:00
ReinUsesLisp	dc81a90640	host_shaders: Add compute shader to copy BC4 as RG32UI to RGBA8	2020-12-30 01:47:08 -03:00
ReinUsesLisp	5169ce9fcd	host_shaders: Add shader to render a full screen triangle	2020-12-30 01:44:09 -03:00
ReinUsesLisp	59c46f9de9	host_shaders: Add pitch linear upload compute shader	2020-12-30 01:41:42 -03:00
ReinUsesLisp	12d16248dd	host_shaders: Add block linear upload compute shaders	2020-12-30 01:39:35 -03:00
ReinUsesLisp	f20e18f60d	host_shaders: Add copyright headers to OpenGL present shaders	2020-12-30 01:35:56 -03:00
ReinUsesLisp	95d156a150	video_core/host_shaders: Add support for prebuilt SPIR-V shaders Add support for building SPIR-V shaders from GLSL and generating headers to include the text of those same GLSL shaders to consume from OpenGL.	2020-12-30 01:29:07 -03:00
bunnei	85cfd96f62	Merge pull request #5247 from comex/xx-concepts k_priority_queue: Fix concepts use	2020-12-29 16:50:20 -08:00
bunnei	b02464f685	Merge pull request #5246 from comex/xx-include Add missing include of "core/hle/kernel/kernel.h"	2020-12-29 16:43:17 -08:00
LC	8d55c8c855	Merge pull request #5248 from ReinUsesLisp/update-dynarmic externals: Update Dynarmic	2020-12-29 18:11:30 -05:00
ReinUsesLisp	3f048c8646	externals: Update Dynarmic Keeps yuzu up to date with the latest changes and introduces a change needed for a lock-free optimization our side.	2020-12-29 19:30:52 -03:00
comex	388cf58b31	k_priority_queue: Fix concepts use - For `std::same_as`, add missing include of `<concepts>`. - For `std::convertible_to`, create a replacement in `common/concepts.h` and use that instead. This would also be found in `<concepts>`, but unlike `std::same_as`, `std::convertible_to` is not yet implemented in libc++, LLVM's STL implementation - not even in master. (In fact, `std::same_as` is the only concept currently implemented. For some reason.)	2020-12-29 14:33:41 -05:00
comex	b36896b90e	Add missing include of "core/hle/kernel/kernel.h" This is needed as the header invokes methods on KernelCore.	2020-12-29 14:22:35 -05:00
LC	aa87278bf0	Merge pull request #5245 from ameerj/sleepthread-log svc: demote SleepThread log to LOG_TRACE	2020-12-29 14:03:24 -05:00
ameerj	0383363a8f	svc: demote SleepThread log to LOG_TRACE This log is called often, and introduces a lot of noise when debug logging is enabled, making it difficult to see other debug logs.	2020-12-29 14:01:56 -05:00
bunnei	22ba437aa4	Merge pull request #5236 from gal20/udp_client_patch input_common: process udp packets only for the correct pad	2020-12-29 02:51:40 -08:00
gal20	1defd0847a	udp client: process packets only for the correct pad	2020-12-27 22:22:48 +02:00