citra_qt: Settings (configuration) rework

Merge pull request #2055 from bunnei/gpu-thread
Asynchronous GPU command processing
2019-03-07 16:55:50 +01:00 · 2019-03-07 10:41:53 -05:00 · 2019-03-06 21:56:20 -05:00 · 2019-03-06 21:55:16 -05:00 · 2019-03-06 21:48:57 -05:00 · 2019-03-06 21:48:57 -05:00
140 changed files with 3805 additions and 1260 deletions
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@ yuzu is an experimental open-source emulator for the Nintendo Switch from the cr

 It is written in C++ with portability in mind, with builds actively maintained for Windows, Linux and macOS. The emulator is currently only useful for homebrew development and research purposes.

-yuzu only emulates a subset of Switch hardware and therefore is generally only useful for running/debugging homebrew applications. At this time, yuzu cannot play any commercial games without major problems. yuzu can boot some games, to varying degrees of success, but does not implement any of the necessary GPU features to render 3D graphics.
+yuzu only emulates a subset of Switch hardware and therefore is generally only useful for running/debugging homebrew applications. At this time, yuzu cannot play any commercial games without major problems. yuzu can boot some games, to varying degrees of success.

 yuzu is licensed under the GPLv2 (or any later version). Refer to the license.txt file included.

--- a/externals/Vulkan-Headers
+++ b/externals/Vulkan-Headers
--- a/externals/cubeb
+++ b/externals/cubeb
--- a/src/audio_core/audio_renderer.h
+++ b/src/audio_core/audio_renderer.h
@@ -46,16 +46,18 @@ struct AudioRendererParameter {
    u32_le sample_rate;
    u32_le sample_count;
    u32_le mix_buffer_count;
-    u32_le unknown_c;
+    u32_le submix_count;
    u32_le voice_count;
    u32_le sink_count;
    u32_le effect_count;
-    u32_le unknown_1c;
-    u8 unknown_20;
-    INSERT_PADDING_BYTES(3);
+    u32_le performance_frame_count;
+    u8 is_voice_drop_enabled;
+    u8 unknown_21;
+    u8 unknown_22;
+    u8 execution_mode;
    u32_le splitter_count;
-    u32_le unknown_2c;
-    INSERT_PADDING_WORDS(1);
+    u32_le num_splitter_send_channels;
+    u32_le unknown_30;
    u32_le revision;
 };
 static_assert(sizeof(AudioRendererParameter) == 52, "AudioRendererParameter is an invalid size");
--- a/src/audio_core/buffer.h
+++ b/src/audio_core/buffer.h
@@ -21,7 +21,7 @@ public:
    Buffer(Tag tag, std::vector<s16>&& samples) : tag{tag}, samples{std::move(samples)} {}

    /// Returns the raw audio data for the buffer
-    std::vector<s16>& Samples() {
+    std::vector<s16>& GetSamples() {
        return samples;
    }

--- a/src/audio_core/codec.cpp
+++ b/src/audio_core/codec.cpp
@@ -68,8 +68,8 @@ std::vector<s16> DecodeADPCM(const u8* const data, std::size_t size, const ADPCM
        }
    }

-    state.yn1 = yn1;
-    state.yn2 = yn2;
+    state.yn1 = static_cast<s16>(yn1);
+    state.yn2 = static_cast<s16>(yn2);

    return ret;
 }
--- a/src/audio_core/cubeb_sink.cpp
+++ b/src/audio_core/cubeb_sink.cpp
@@ -12,6 +12,10 @@
 #include "common/ring_buffer.h"
 #include "core/settings.h"

+#ifdef _MSC_VER
+#include <objbase.h>
+#endif
+
 namespace AudioCore {

 class CubebSinkStream final : public SinkStream {
@@ -46,7 +50,7 @@ public:
        }
    }

-    ~CubebSinkStream() {
+    ~CubebSinkStream() override {
        if (!ctx) {
            return;
        }
@@ -75,11 +79,11 @@ public:
        queue.Push(samples);
    }

-    std::size_t SamplesInQueue(u32 num_channels) const override {
+    std::size_t SamplesInQueue(u32 channel_count) const override {
        if (!ctx)
            return 0;

-        return queue.Size() / num_channels;
+        return queue.Size() / channel_count;
    }

    void Flush() override {
@@ -98,7 +102,7 @@ private:
    u32 num_channels{};

    Common::RingBuffer<s16, 0x10000> queue;
-    std::array<s16, 2> last_frame;
+    std::array<s16, 2> last_frame{};
    std::atomic<bool> should_flush{};
    TimeStretcher time_stretch;

@@ -108,6 +112,11 @@ private:
 };

 CubebSink::CubebSink(std::string_view target_device_name) {
+    // Cubeb requires COM to be initialized on the thread calling cubeb_init on Windows
+#ifdef _MSC_VER
+    com_init_result = CoInitializeEx(nullptr, COINIT_MULTITHREADED);
+#endif
+
    if (cubeb_init(&ctx, "yuzu", nullptr) != CUBEB_OK) {
        LOG_CRITICAL(Audio_Sink, "cubeb_init failed");
        return;
@@ -142,6 +151,12 @@ CubebSink::~CubebSink() {
    }

    cubeb_destroy(ctx);
+
+#ifdef _MSC_VER
+    if (SUCCEEDED(com_init_result)) {
+        CoUninitialize();
+    }
+#endif
 }

 SinkStream& CubebSink::AcquireSinkStream(u32 sample_rate, u32 num_channels,
--- a/src/audio_core/cubeb_sink.h
+++ b/src/audio_core/cubeb_sink.h
@@ -25,6 +25,10 @@ private:
    cubeb* ctx{};
    cubeb_devid output_device{};
    std::vector<SinkStreamPtr> sink_streams;
+
+#ifdef _MSC_VER
+    u32 com_init_result = 0;
+#endif
 };

 std::vector<std::string> ListCubebSinkDevices();
--- a/src/audio_core/stream.cpp
+++ b/src/audio_core/stream.cpp
@@ -95,7 +95,7 @@ void Stream::PlayNextBuffer() {
    active_buffer = queued_buffers.front();
    queued_buffers.pop();

-    VolumeAdjustSamples(active_buffer->Samples());
+    VolumeAdjustSamples(active_buffer->GetSamples());

    sink_stream.EnqueueSamples(GetNumChannels(), active_buffer->GetSamples());

--- a/src/common/color.h
+++ b/src/common/color.h
@@ -55,36 +55,36 @@ constexpr u8 Convert8To6(u8 value) {
 /**
 * Decode a color stored in RGBA8 format
 * @param bytes Pointer to encoded source color
- * @return Result color decoded as Math::Vec4<u8>
+ * @return Result color decoded as Common::Vec4<u8>
 */
-inline Math::Vec4<u8> DecodeRGBA8(const u8* bytes) {
+inline Common::Vec4<u8> DecodeRGBA8(const u8* bytes) {
    return {bytes[3], bytes[2], bytes[1], bytes[0]};
 }

 /**
 * Decode a color stored in RGB8 format
 * @param bytes Pointer to encoded source color
- * @return Result color decoded as Math::Vec4<u8>
+ * @return Result color decoded as Common::Vec4<u8>
 */
-inline Math::Vec4<u8> DecodeRGB8(const u8* bytes) {
+inline Common::Vec4<u8> DecodeRGB8(const u8* bytes) {
    return {bytes[2], bytes[1], bytes[0], 255};
 }

 /**
 * Decode a color stored in RG8 (aka HILO8) format
 * @param bytes Pointer to encoded source color
- * @return Result color decoded as Math::Vec4<u8>
+ * @return Result color decoded as Common::Vec4<u8>
 */
-inline Math::Vec4<u8> DecodeRG8(const u8* bytes) {
+inline Common::Vec4<u8> DecodeRG8(const u8* bytes) {
    return {bytes[1], bytes[0], 0, 255};
 }

 /**
 * Decode a color stored in RGB565 format
 * @param bytes Pointer to encoded source color
- * @return Result color decoded as Math::Vec4<u8>
+ * @return Result color decoded as Common::Vec4<u8>
 */
-inline Math::Vec4<u8> DecodeRGB565(const u8* bytes) {
+inline Common::Vec4<u8> DecodeRGB565(const u8* bytes) {
    u16_le pixel;
    std::memcpy(&pixel, bytes, sizeof(pixel));
    return {Convert5To8((pixel >> 11) & 0x1F), Convert6To8((pixel >> 5) & 0x3F),
@@ -94,9 +94,9 @@ inline Math::Vec4<u8> DecodeRGB565(const u8* bytes) {
 /**
 * Decode a color stored in RGB5A1 format
 * @param bytes Pointer to encoded source color
- * @return Result color decoded as Math::Vec4<u8>
+ * @return Result color decoded as Common::Vec4<u8>
 */
-inline Math::Vec4<u8> DecodeRGB5A1(const u8* bytes) {
+inline Common::Vec4<u8> DecodeRGB5A1(const u8* bytes) {
    u16_le pixel;
    std::memcpy(&pixel, bytes, sizeof(pixel));
    return {Convert5To8((pixel >> 11) & 0x1F), Convert5To8((pixel >> 6) & 0x1F),
@@ -106,9 +106,9 @@ inline Math::Vec4<u8> DecodeRGB5A1(const u8* bytes) {
 /**
 * Decode a color stored in RGBA4 format
 * @param bytes Pointer to encoded source color
- * @return Result color decoded as Math::Vec4<u8>
+ * @return Result color decoded as Common::Vec4<u8>
 */
-inline Math::Vec4<u8> DecodeRGBA4(const u8* bytes) {
+inline Common::Vec4<u8> DecodeRGBA4(const u8* bytes) {
    u16_le pixel;
    std::memcpy(&pixel, bytes, sizeof(pixel));
    return {Convert4To8((pixel >> 12) & 0xF), Convert4To8((pixel >> 8) & 0xF),
@@ -138,9 +138,9 @@ inline u32 DecodeD24(const u8* bytes) {
 /**
 * Decode a depth value and a stencil value stored in D24S8 format
 * @param bytes Pointer to encoded source values
- * @return Resulting values stored as a Math::Vec2
+ * @return Resulting values stored as a Common::Vec2
 */
-inline Math::Vec2<u32> DecodeD24S8(const u8* bytes) {
+inline Common::Vec2<u32> DecodeD24S8(const u8* bytes) {
    return {static_cast<u32>((bytes[2] << 16) | (bytes[1] << 8) | bytes[0]), bytes[3]};
 }

@@ -149,7 +149,7 @@ inline Math::Vec2<u32> DecodeD24S8(const u8* bytes) {
 * @param color Source color to encode
 * @param bytes Destination pointer to store encoded color
 */
-inline void EncodeRGBA8(const Math::Vec4<u8>& color, u8* bytes) {
+inline void EncodeRGBA8(const Common::Vec4<u8>& color, u8* bytes) {
    bytes[3] = color.r();
    bytes[2] = color.g();
    bytes[1] = color.b();
@@ -161,7 +161,7 @@ inline void EncodeRGBA8(const Math::Vec4<u8>& color, u8* bytes) {
 * @param color Source color to encode
 * @param bytes Destination pointer to store encoded color
 */
-inline void EncodeRGB8(const Math::Vec4<u8>& color, u8* bytes) {
+inline void EncodeRGB8(const Common::Vec4<u8>& color, u8* bytes) {
    bytes[2] = color.r();
    bytes[1] = color.g();
    bytes[0] = color.b();
@@ -172,7 +172,7 @@ inline void EncodeRGB8(const Math::Vec4<u8>& color, u8* bytes) {
 * @param color Source color to encode
 * @param bytes Destination pointer to store encoded color
 */
-inline void EncodeRG8(const Math::Vec4<u8>& color, u8* bytes) {
+inline void EncodeRG8(const Common::Vec4<u8>& color, u8* bytes) {
    bytes[1] = color.r();
    bytes[0] = color.g();
 }
@@ -181,7 +181,7 @@ inline void EncodeRG8(const Math::Vec4<u8>& color, u8* bytes) {
 * @param color Source color to encode
 * @param bytes Destination pointer to store encoded color
 */
-inline void EncodeRGB565(const Math::Vec4<u8>& color, u8* bytes) {
+inline void EncodeRGB565(const Common::Vec4<u8>& color, u8* bytes) {
    const u16_le data =
        (Convert8To5(color.r()) << 11) | (Convert8To6(color.g()) << 5) | Convert8To5(color.b());

@@ -193,7 +193,7 @@ inline void EncodeRGB565(const Math::Vec4<u8>& color, u8* bytes) {
 * @param color Source color to encode
 * @param bytes Destination pointer to store encoded color
 */
-inline void EncodeRGB5A1(const Math::Vec4<u8>& color, u8* bytes) {
+inline void EncodeRGB5A1(const Common::Vec4<u8>& color, u8* bytes) {
    const u16_le data = (Convert8To5(color.r()) << 11) | (Convert8To5(color.g()) << 6) |
                        (Convert8To5(color.b()) << 1) | Convert8To1(color.a());

@@ -205,7 +205,7 @@ inline void EncodeRGB5A1(const Math::Vec4<u8>& color, u8* bytes) {
 * @param color Source color to encode
 * @param bytes Destination pointer to store encoded color
 */
-inline void EncodeRGBA4(const Math::Vec4<u8>& color, u8* bytes) {
+inline void EncodeRGBA4(const Common::Vec4<u8>& color, u8* bytes) {
    const u16 data = (Convert8To4(color.r()) << 12) | (Convert8To4(color.g()) << 8) |
                     (Convert8To4(color.b()) << 4) | Convert8To4(color.a());

--- a/src/common/logging/backend.cpp
+++ b/src/common/logging/backend.cpp
@@ -39,10 +39,10 @@ public:
    Impl(Impl const&) = delete;
    const Impl& operator=(Impl const&) = delete;

-    void PushEntry(Entry e) {
-        std::lock_guard<std::mutex> lock(message_mutex);
-        message_queue.Push(std::move(e));
-        message_cv.notify_one();
+    void PushEntry(Class log_class, Level log_level, const char* filename, unsigned int line_num,
+                   const char* function, std::string message) {
+        message_queue.Push(
+            CreateEntry(log_class, log_level, filename, line_num, function, std::move(message)));
    }

    void AddBackend(std::unique_ptr<Backend> backend) {
@@ -86,15 +86,13 @@ private:
                }
            };
            while (true) {
-                {
-                    std::unique_lock<std::mutex> lock(message_mutex);
-                    message_cv.wait(lock, [&] { return !running || message_queue.Pop(entry); });
-                }
-                if (!running) {
+                entry = message_queue.PopWait();
+                if (entry.final_entry) {
                    break;
                }
                write_logs(entry);
            }
+
            // Drain the logging queue. Only writes out up to MAX_LOGS_TO_WRITE to prevent a case
            // where a system is repeatedly spamming logs even on close.
            const int MAX_LOGS_TO_WRITE = filter.IsDebug() ? INT_MAX : 100;
@@ -106,18 +104,36 @@ private:
    }

    ~Impl() {
-        running = false;
-        message_cv.notify_one();
+        Entry entry;
+        entry.final_entry = true;
+        message_queue.Push(entry);
        backend_thread.join();
    }

-    std::atomic_bool running{true};
-    std::mutex message_mutex, writing_mutex;
-    std::condition_variable message_cv;
+    Entry CreateEntry(Class log_class, Level log_level, const char* filename, unsigned int line_nr,
+                      const char* function, std::string message) const {
+        using std::chrono::duration_cast;
+        using std::chrono::steady_clock;
+
+        Entry entry;
+        entry.timestamp =
+            duration_cast<std::chrono::microseconds>(steady_clock::now() - time_origin);
+        entry.log_class = log_class;
+        entry.log_level = log_level;
+        entry.filename = Common::TrimSourcePath(filename);
+        entry.line_num = line_nr;
+        entry.function = function;
+        entry.message = std::move(message);
+
+        return entry;
+    }
+
+    std::mutex writing_mutex;
    std::thread backend_thread;
    std::vector<std::unique_ptr<Backend>> backends;
    Common::MPSCQueue<Log::Entry> message_queue;
    Filter filter;
+    std::chrono::steady_clock::time_point time_origin{std::chrono::steady_clock::now()};
 };

 void ConsoleBackend::Write(const Entry& entry) {
@@ -276,25 +292,6 @@ const char* GetLevelName(Level log_level) {
 #undef LVL
 }

-Entry CreateEntry(Class log_class, Level log_level, const char* filename, unsigned int line_nr,
-                  const char* function, std::string message) {
-    using std::chrono::duration_cast;
-    using std::chrono::steady_clock;
-
-    static steady_clock::time_point time_origin = steady_clock::now();
-
-    Entry entry;
-    entry.timestamp = duration_cast<std::chrono::microseconds>(steady_clock::now() - time_origin);
-    entry.log_class = log_class;
-    entry.log_level = log_level;
-    entry.filename = Common::TrimSourcePath(filename);
-    entry.line_num = line_nr;
-    entry.function = function;
-    entry.message = std::move(message);
-
-    return entry;
-}
-
 void SetGlobalFilter(const Filter& filter) {
    Impl::Instance().SetGlobalFilter(filter);
 }
@@ -319,9 +316,7 @@ void FmtLogMessageImpl(Class log_class, Level log_level, const char* filename,
    if (!filter.CheckMessage(log_class, log_level))
        return;

-    Entry entry =
-        CreateEntry(log_class, log_level, filename, line_num, function, fmt::vformat(format, args));
-
-    instance.PushEntry(std::move(entry));
+    instance.PushEntry(log_class, log_level, filename, line_num, function,
+                       fmt::vformat(format, args));
 }
 } // namespace Log
--- a/src/common/logging/backend.h
+++ b/src/common/logging/backend.h
@@ -27,6 +27,7 @@ struct Entry {
    unsigned int line_num;
    std::string function;
    std::string message;
+    bool final_entry = false;

    Entry() = default;
    Entry(Entry&& o) = default;
@@ -134,10 +135,6 @@ const char* GetLogClassName(Class log_class);
 */
 const char* GetLevelName(Level log_level);

-/// Creates a log entry by formatting the given source location, and message.
-Entry CreateEntry(Class log_class, Level log_level, const char* filename, unsigned int line_nr,
-                  const char* function, std::string message);
-
 /**
 * The global filter will prevent any messages from even being processed if they are filtered. Each
 * backend can have a filter, but if the level is lower than the global filter, the backend will
--- a/src/common/math_util.h
+++ b/src/common/math_util.h
@@ -7,7 +7,7 @@
 #include <cstdlib>
 #include <type_traits>

-namespace MathUtil {
+namespace Common {

 constexpr float PI = 3.14159265f;

@@ -41,4 +41,4 @@ struct Rectangle {
    }
 };

-} // namespace MathUtil
+} // namespace Common
--- a/src/common/quaternion.h
+++ b/src/common/quaternion.h
@@ -6,12 +6,12 @@

 #include "common/vector_math.h"

-namespace Math {
+namespace Common {

 template <typename T>
 class Quaternion {
 public:
-    Math::Vec3<T> xyz;
+    Vec3<T> xyz;
    T w{};

    Quaternion<decltype(-T{})> Inverse() const {
@@ -38,12 +38,12 @@ public:
 };

 template <typename T>
-auto QuaternionRotate(const Quaternion<T>& q, const Math::Vec3<T>& v) {
+auto QuaternionRotate(const Quaternion<T>& q, const Vec3<T>& v) {
    return v + 2 * Cross(q.xyz, Cross(q.xyz, v) + v * q.w);
 }

-inline Quaternion<float> MakeQuaternion(const Math::Vec3<float>& axis, float angle) {
+inline Quaternion<float> MakeQuaternion(const Vec3<float>& axis, float angle) {
    return {axis * std::sin(angle / 2), std::cos(angle / 2)};
 }

-} // namespace Math
+} // namespace Common
--- a/src/common/swap.h
+++ b/src/common/swap.h
@@ -28,8 +28,8 @@
 #include <cstring>
 #include "common/common_types.h"

-// GCC 4.6+
-#if __GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
+// GCC
+#ifdef __GNUC__

 #if __BYTE_ORDER__ && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) && !defined(COMMON_LITTLE_ENDIAN)
 #define COMMON_LITTLE_ENDIAN 1
@@ -38,7 +38,7 @@
 #endif

 // LLVM/clang
-#elif __clang__
+#elif defined(__clang__)

 #if __LITTLE_ENDIAN__ && !defined(COMMON_LITTLE_ENDIAN)
 #define COMMON_LITTLE_ENDIAN 1
--- a/src/common/threadsafe_queue.h
+++ b/src/common/threadsafe_queue.h
@@ -8,6 +8,7 @@
 // single reader, single writer queue

 #include <atomic>
+#include <condition_variable>
 #include <cstddef>
 #include <mutex>
 #include <utility>
@@ -45,6 +46,7 @@ public:
        ElementPtr* new_ptr = new ElementPtr();
        write_ptr->next.store(new_ptr, std::memory_order_release);
        write_ptr = new_ptr;
+        cv.notify_one();

        ++size;
    }
@@ -74,6 +76,16 @@ public:
        return true;
    }

+    T PopWait() {
+        if (Empty()) {
+            std::unique_lock<std::mutex> lock(cv_mutex);
+            cv.wait(lock, [this]() { return !Empty(); });
+        }
+        T t;
+        Pop(t);
+        return t;
+    }
+
    // not thread-safe
    void Clear() {
        size.store(0);
@@ -101,6 +113,8 @@ private:
    ElementPtr* write_ptr;
    ElementPtr* read_ptr;
    std::atomic_size_t size{0};
+    std::mutex cv_mutex;
+    std::condition_variable cv;
 };

 // a simple thread-safe,
@@ -135,6 +149,10 @@ public:
        return spsc_queue.Pop(t);
    }

+    T PopWait() {
+        return spsc_queue.PopWait();
+    }
+
    // not thread-safe
    void Clear() {
        spsc_queue.Clear();
--- a/src/common/vector_math.h
+++ b/src/common/vector_math.h
@@ -33,7 +33,7 @@
 #include <cmath>
 #include <type_traits>

-namespace Math {
+namespace Common {

 template <typename T>
 class Vec2;
@@ -690,4 +690,4 @@ constexpr Vec4<T> MakeVec(const T& x, const Vec3<T>& yzw) {
    return MakeVec(x, yzw[0], yzw[1], yzw[2]);
 }

-} // namespace Math
+} // namespace Common
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -217,6 +217,7 @@ add_library(core STATIC
    hle/service/audio/audren_u.h
    hle/service/audio/codecctl.cpp
    hle/service/audio/codecctl.h
+    hle/service/audio/errors.h
    hle/service/audio/hwopus.cpp
    hle/service/audio/hwopus.h
    hle/service/bcat/bcat.cpp
@@ -400,6 +401,10 @@ add_library(core STATIC
    hle/service/time/time.h
    hle/service/usb/usb.cpp
    hle/service/usb/usb.h
+    hle/service/vi/display/vi_display.cpp
+    hle/service/vi/display/vi_display.h
+    hle/service/vi/layer/vi_layer.cpp
+    hle/service/vi/layer/vi_layer.h
    hle/service/vi/vi.cpp
    hle/service/vi/vi.h
    hle/service/vi/vi_m.cpp
--- a/src/core/core.cpp
+++ b/src/core/core.cpp
@@ -36,7 +36,8 @@
 #include "frontend/applets/software_keyboard.h"
 #include "frontend/applets/web_browser.h"
 #include "video_core/debug_utils/debug_utils.h"
-#include "video_core/gpu.h"
+#include "video_core/gpu_asynch.h"
+#include "video_core/gpu_synch.h"
 #include "video_core/renderer_base.h"
 #include "video_core/video_core.h"

@@ -78,6 +79,7 @@ FileSys::VirtualFile GetGameFileFromPath(const FileSys::VirtualFilesystem& vfs,
    return vfs->OpenFile(path, FileSys::Mode::Read);
 }
 struct System::Impl {
+    explicit Impl(System& system) : kernel{system} {}

    Cpu& CurrentCpuCore() {
        return cpu_core_manager.GetCurrentCore();
@@ -95,7 +97,7 @@ struct System::Impl {
        LOG_DEBUG(HW_Memory, "initialized OK");

        core_timing.Initialize();
-        kernel.Initialize(core_timing);
+        kernel.Initialize();

        const auto current_time = std::chrono::duration_cast<std::chrono::seconds>(
            std::chrono::system_clock::now().time_since_epoch());
@@ -128,10 +130,16 @@ struct System::Impl {
            return ResultStatus::ErrorVideoCore;
        }

-        gpu_core = std::make_unique<Tegra::GPU>(renderer->Rasterizer());
+        is_powered_on = true;
+
+        if (Settings::values.use_asynchronous_gpu_emulation) {
+            gpu_core = std::make_unique<VideoCommon::GPUAsynch>(system, *renderer);
+        } else {
+            gpu_core = std::make_unique<VideoCommon::GPUSynch>(system, *renderer);
+        }

        cpu_core_manager.Initialize(system);
-        is_powered_on = true;
+
        LOG_DEBUG(Core, "Initialized OK");

        // Reset counters and set time origin to current frame
@@ -182,13 +190,13 @@ struct System::Impl {

    void Shutdown() {
        // Log last frame performance stats
-        auto perf_results = GetAndResetPerfStats();
-        Telemetry().AddField(Telemetry::FieldType::Performance, "Shutdown_EmulationSpeed",
-                             perf_results.emulation_speed * 100.0);
-        Telemetry().AddField(Telemetry::FieldType::Performance, "Shutdown_Framerate",
-                             perf_results.game_fps);
-        Telemetry().AddField(Telemetry::FieldType::Performance, "Shutdown_Frametime",
-                             perf_results.frametime * 1000.0);
+        const auto perf_results = GetAndResetPerfStats();
+        telemetry_session->AddField(Telemetry::FieldType::Performance, "Shutdown_EmulationSpeed",
+                                    perf_results.emulation_speed * 100.0);
+        telemetry_session->AddField(Telemetry::FieldType::Performance, "Shutdown_Framerate",
+                                    perf_results.game_fps);
+        telemetry_session->AddField(Telemetry::FieldType::Performance, "Shutdown_Frametime",
+                                    perf_results.frametime * 1000.0);

        is_powered_on = false;

@@ -265,7 +273,7 @@ struct System::Impl {
    Core::FrameLimiter frame_limiter;
 };

-System::System() : impl{std::make_unique<Impl>()} {}
+System::System() : impl{std::make_unique<Impl>(*this)} {}
 System::~System() = default;

 Cpu& System::CurrentCpuCore() {
--- a/src/core/core.h
+++ b/src/core/core.h
@@ -293,10 +293,6 @@ inline ARM_Interface& CurrentArmInterface() {
    return System::GetInstance().CurrentArmInterface();
 }

-inline TelemetrySession& Telemetry() {
-    return System::GetInstance().TelemetrySession();
-}
-
 inline Kernel::Process* CurrentProcess() {
    return System::GetInstance().CurrentProcess();
 }
--- a/src/core/crypto/key_manager.cpp
+++ b/src/core/crypto/key_manager.cpp
@@ -398,7 +398,8 @@ static bool ValidCryptoRevisionString(std::string_view base, size_t begin, size_
 }

 void KeyManager::LoadFromFile(const std::string& filename, bool is_title_keys) {
-    std::ifstream file(filename);
+    std::ifstream file;
+    OpenFStream(file, filename, std::ios_base::in);
    if (!file.is_open())
        return;

--- a/src/core/file_sys/vfs_vector.cpp
+++ b/src/core/file_sys/vfs_vector.cpp
@@ -47,7 +47,7 @@ std::size_t VectorVfsFile::Write(const u8* data_, std::size_t length, std::size_
    if (offset + length > data.size())
        data.resize(offset + length);
    const auto write = std::min(length, data.size() - offset);
-    std::memcpy(data.data(), data_, write);
+    std::memcpy(data.data() + offset, data_, write);
    return write;
 }

--- a/src/core/frontend/emu_window.cpp
+++ b/src/core/frontend/emu_window.cpp
@@ -67,7 +67,7 @@ static bool IsWithinTouchscreen(const Layout::FramebufferLayout& layout, unsigne
            framebuffer_x >= layout.screen.left && framebuffer_x < layout.screen.right);
 }

-std::tuple<unsigned, unsigned> EmuWindow::ClipToTouchScreen(unsigned new_x, unsigned new_y) {
+std::tuple<unsigned, unsigned> EmuWindow::ClipToTouchScreen(unsigned new_x, unsigned new_y) const {
    new_x = std::max(new_x, framebuffer_layout.screen.left);
    new_x = std::min(new_x, framebuffer_layout.screen.right - 1);

--- a/src/core/frontend/emu_window.h
+++ b/src/core/frontend/emu_window.h
@@ -166,7 +166,7 @@ private:
    /**
     * Clip the provided coordinates to be inside the touchscreen area.
     */
-    std::tuple<unsigned, unsigned> ClipToTouchScreen(unsigned new_x, unsigned new_y);
+    std::tuple<unsigned, unsigned> ClipToTouchScreen(unsigned new_x, unsigned new_y) const;
 };

 } // namespace Core::Frontend
--- a/src/core/frontend/framebuffer_layout.cpp
+++ b/src/core/frontend/framebuffer_layout.cpp
@@ -12,12 +12,12 @@ namespace Layout {

 // Finds the largest size subrectangle contained in window area that is confined to the aspect ratio
 template <class T>
-static MathUtil::Rectangle<T> maxRectangle(MathUtil::Rectangle<T> window_area,
-                                           float screen_aspect_ratio) {
+static Common::Rectangle<T> MaxRectangle(Common::Rectangle<T> window_area,
+                                         float screen_aspect_ratio) {
    float scale = std::min(static_cast<float>(window_area.GetWidth()),
                           window_area.GetHeight() / screen_aspect_ratio);
-    return MathUtil::Rectangle<T>{0, 0, static_cast<T>(std::round(scale)),
-                                  static_cast<T>(std::round(scale * screen_aspect_ratio))};
+    return Common::Rectangle<T>{0, 0, static_cast<T>(std::round(scale)),
+                                static_cast<T>(std::round(scale * screen_aspect_ratio))};
 }

 FramebufferLayout DefaultFrameLayout(unsigned width, unsigned height) {
@@ -29,8 +29,8 @@ FramebufferLayout DefaultFrameLayout(unsigned width, unsigned height) {

    const float emulation_aspect_ratio{static_cast<float>(ScreenUndocked::Height) /
                                       ScreenUndocked::Width};
-    MathUtil::Rectangle<unsigned> screen_window_area{0, 0, width, height};
-    MathUtil::Rectangle<unsigned> screen = maxRectangle(screen_window_area, emulation_aspect_ratio);
+    Common::Rectangle<unsigned> screen_window_area{0, 0, width, height};
+    Common::Rectangle<unsigned> screen = MaxRectangle(screen_window_area, emulation_aspect_ratio);

    float window_aspect_ratio = static_cast<float>(height) / width;

--- a/src/core/frontend/framebuffer_layout.h
+++ b/src/core/frontend/framebuffer_layout.h
@@ -16,7 +16,7 @@ struct FramebufferLayout {
    unsigned width{ScreenUndocked::Width};
    unsigned height{ScreenUndocked::Height};

-    MathUtil::Rectangle<unsigned> screen;
+    Common::Rectangle<unsigned> screen;

    /**
     * Returns the ration of pixel size of the screen, compared to the native size of the undocked
--- a/src/core/frontend/input.h
+++ b/src/core/frontend/input.h
@@ -124,7 +124,7 @@ using AnalogDevice = InputDevice<std::tuple<float, float>>;
 *   Orientation is determined by right-hand rule.
 *   Units: deg/sec
 */
-using MotionDevice = InputDevice<std::tuple<Math::Vec3<float>, Math::Vec3<float>>>;
+using MotionDevice = InputDevice<std::tuple<Common::Vec3<float>, Common::Vec3<float>>>;

 /**
 * A touch device is an input device that returns a tuple of two floats and a bool. The floats are
--- a/src/core/hle/ipc.h
+++ b/src/core/hle/ipc.h
@@ -4,10 +4,10 @@

 #pragma once

+#include "common/bit_field.h"
+#include "common/common_funcs.h"
 #include "common/common_types.h"
 #include "common/swap.h"
-#include "core/hle/kernel/errors.h"
-#include "core/memory.h"

 namespace IPC {

--- a/src/core/hle/kernel/address_arbiter.cpp
+++ b/src/core/hle/kernel/address_arbiter.cpp
@@ -9,6 +9,7 @@
 #include "common/common_types.h"
 #include "core/core.h"
 #include "core/core_cpu.h"
+#include "core/hle/kernel/address_arbiter.h"
 #include "core/hle/kernel/errors.h"
 #include "core/hle/kernel/object.h"
 #include "core/hle/kernel/process.h"
@@ -18,32 +19,143 @@
 #include "core/memory.h"

 namespace Kernel {
-namespace AddressArbiter {
+namespace {
+// Wake up num_to_wake (or all) threads in a vector.
+void WakeThreads(const std::vector<SharedPtr<Thread>>& waiting_threads, s32 num_to_wake) {
+    // Only process up to 'target' threads, unless 'target' is <= 0, in which case process
+    // them all.
+    std::size_t last = waiting_threads.size();
+    if (num_to_wake > 0) {
+        last = num_to_wake;
+    }

-// Performs actual address waiting logic.
-static ResultCode WaitForAddress(VAddr address, s64 timeout) {
-    SharedPtr<Thread> current_thread = GetCurrentThread();
+    // Signal the waiting threads.
+    for (std::size_t i = 0; i < last; i++) {
+        ASSERT(waiting_threads[i]->GetStatus() == ThreadStatus::WaitArb);
+        waiting_threads[i]->SetWaitSynchronizationResult(RESULT_SUCCESS);
+        waiting_threads[i]->SetArbiterWaitAddress(0);
+        waiting_threads[i]->ResumeFromWait();
+    }
+}
+} // Anonymous namespace
+
+AddressArbiter::AddressArbiter(Core::System& system) : system{system} {}
+AddressArbiter::~AddressArbiter() = default;
+
+ResultCode AddressArbiter::SignalToAddress(VAddr address, s32 num_to_wake) {
+    const std::vector<SharedPtr<Thread>> waiting_threads = GetThreadsWaitingOnAddress(address);
+    WakeThreads(waiting_threads, num_to_wake);
+    return RESULT_SUCCESS;
+}
+
+ResultCode AddressArbiter::IncrementAndSignalToAddressIfEqual(VAddr address, s32 value,
+                                                              s32 num_to_wake) {
+    // Ensure that we can write to the address.
+    if (!Memory::IsValidVirtualAddress(address)) {
+        return ERR_INVALID_ADDRESS_STATE;
+    }
+
+    if (static_cast<s32>(Memory::Read32(address)) != value) {
+        return ERR_INVALID_STATE;
+    }
+
+    Memory::Write32(address, static_cast<u32>(value + 1));
+    return SignalToAddress(address, num_to_wake);
+}
+
+ResultCode AddressArbiter::ModifyByWaitingCountAndSignalToAddressIfEqual(VAddr address, s32 value,
+                                                                         s32 num_to_wake) {
+    // Ensure that we can write to the address.
+    if (!Memory::IsValidVirtualAddress(address)) {
+        return ERR_INVALID_ADDRESS_STATE;
+    }
+
+    // Get threads waiting on the address.
+    const std::vector<SharedPtr<Thread>> waiting_threads = GetThreadsWaitingOnAddress(address);
+
+    // Determine the modified value depending on the waiting count.
+    s32 updated_value;
+    if (waiting_threads.empty()) {
+        updated_value = value - 1;
+    } else if (num_to_wake <= 0 || waiting_threads.size() <= static_cast<u32>(num_to_wake)) {
+        updated_value = value + 1;
+    } else {
+        updated_value = value;
+    }
+
+    if (static_cast<s32>(Memory::Read32(address)) != value) {
+        return ERR_INVALID_STATE;
+    }
+
+    Memory::Write32(address, static_cast<u32>(updated_value));
+    WakeThreads(waiting_threads, num_to_wake);
+    return RESULT_SUCCESS;
+}
+
+ResultCode AddressArbiter::WaitForAddressIfLessThan(VAddr address, s32 value, s64 timeout,
+                                                    bool should_decrement) {
+    // Ensure that we can read the address.
+    if (!Memory::IsValidVirtualAddress(address)) {
+        return ERR_INVALID_ADDRESS_STATE;
+    }
+
+    const s32 cur_value = static_cast<s32>(Memory::Read32(address));
+    if (cur_value >= value) {
+        return ERR_INVALID_STATE;
+    }
+
+    if (should_decrement) {
+        Memory::Write32(address, static_cast<u32>(cur_value - 1));
+    }
+
+    // Short-circuit without rescheduling, if timeout is zero.
+    if (timeout == 0) {
+        return RESULT_TIMEOUT;
+    }
+
+    return WaitForAddress(address, timeout);
+}
+
+ResultCode AddressArbiter::WaitForAddressIfEqual(VAddr address, s32 value, s64 timeout) {
+    // Ensure that we can read the address.
+    if (!Memory::IsValidVirtualAddress(address)) {
+        return ERR_INVALID_ADDRESS_STATE;
+    }
+    // Only wait for the address if equal.
+    if (static_cast<s32>(Memory::Read32(address)) != value) {
+        return ERR_INVALID_STATE;
+    }
+    // Short-circuit without rescheduling, if timeout is zero.
+    if (timeout == 0) {
+        return RESULT_TIMEOUT;
+    }
+
+    return WaitForAddress(address, timeout);
+}
+
+ResultCode AddressArbiter::WaitForAddress(VAddr address, s64 timeout) {
+    SharedPtr<Thread> current_thread = system.CurrentScheduler().GetCurrentThread();
    current_thread->SetArbiterWaitAddress(address);
    current_thread->SetStatus(ThreadStatus::WaitArb);
    current_thread->InvalidateWakeupCallback();

    current_thread->WakeAfterDelay(timeout);

-    Core::System::GetInstance().CpuCore(current_thread->GetProcessorID()).PrepareReschedule();
+    system.CpuCore(current_thread->GetProcessorID()).PrepareReschedule();
    return RESULT_TIMEOUT;
 }

-// Gets the threads waiting on an address.
-static std::vector<SharedPtr<Thread>> GetThreadsWaitingOnAddress(VAddr address) {
-    const auto RetrieveWaitingThreads = [](std::size_t core_index,
-                                           std::vector<SharedPtr<Thread>>& waiting_threads,
-                                           VAddr arb_addr) {
-        const auto& scheduler = Core::System::GetInstance().Scheduler(core_index);
+std::vector<SharedPtr<Thread>> AddressArbiter::GetThreadsWaitingOnAddress(VAddr address) const {
+    const auto RetrieveWaitingThreads = [this](std::size_t core_index,
+                                               std::vector<SharedPtr<Thread>>& waiting_threads,
+                                               VAddr arb_addr) {
+        const auto& scheduler = system.Scheduler(core_index);
        const auto& thread_list = scheduler.GetThreadList();

        for (const auto& thread : thread_list) {
-            if (thread->GetArbiterWaitAddress() == arb_addr)
+            if (thread->GetArbiterWaitAddress() == arb_addr) {
                waiting_threads.push_back(thread);
+            }
        }
    };

@@ -62,119 +174,4 @@ static std::vector<SharedPtr<Thread>> GetThreadsWaitingOnAddress(VAddr address)

    return threads;
 }
-
-// Wake up num_to_wake (or all) threads in a vector.
-static void WakeThreads(std::vector<SharedPtr<Thread>>& waiting_threads, s32 num_to_wake) {
-    // Only process up to 'target' threads, unless 'target' is <= 0, in which case process
-    // them all.
-    std::size_t last = waiting_threads.size();
-    if (num_to_wake > 0)
-        last = num_to_wake;
-
-    // Signal the waiting threads.
-    for (std::size_t i = 0; i < last; i++) {
-        ASSERT(waiting_threads[i]->GetStatus() == ThreadStatus::WaitArb);
-        waiting_threads[i]->SetWaitSynchronizationResult(RESULT_SUCCESS);
-        waiting_threads[i]->SetArbiterWaitAddress(0);
-        waiting_threads[i]->ResumeFromWait();
-    }
-}
-
-// Signals an address being waited on.
-ResultCode SignalToAddress(VAddr address, s32 num_to_wake) {
-    std::vector<SharedPtr<Thread>> waiting_threads = GetThreadsWaitingOnAddress(address);
-
-    WakeThreads(waiting_threads, num_to_wake);
-    return RESULT_SUCCESS;
-}
-
-// Signals an address being waited on and increments its value if equal to the value argument.
-ResultCode IncrementAndSignalToAddressIfEqual(VAddr address, s32 value, s32 num_to_wake) {
-    // Ensure that we can write to the address.
-    if (!Memory::IsValidVirtualAddress(address)) {
-        return ERR_INVALID_ADDRESS_STATE;
-    }
-
-    if (static_cast<s32>(Memory::Read32(address)) == value) {
-        Memory::Write32(address, static_cast<u32>(value + 1));
-    } else {
-        return ERR_INVALID_STATE;
-    }
-
-    return SignalToAddress(address, num_to_wake);
-}
-
-// Signals an address being waited on and modifies its value based on waiting thread count if equal
-// to the value argument.
-ResultCode ModifyByWaitingCountAndSignalToAddressIfEqual(VAddr address, s32 value,
-                                                         s32 num_to_wake) {
-    // Ensure that we can write to the address.
-    if (!Memory::IsValidVirtualAddress(address)) {
-        return ERR_INVALID_ADDRESS_STATE;
-    }
-
-    // Get threads waiting on the address.
-    std::vector<SharedPtr<Thread>> waiting_threads = GetThreadsWaitingOnAddress(address);
-
-    // Determine the modified value depending on the waiting count.
-    s32 updated_value;
-    if (waiting_threads.empty()) {
-        updated_value = value - 1;
-    } else if (num_to_wake <= 0 || waiting_threads.size() <= static_cast<u32>(num_to_wake)) {
-        updated_value = value + 1;
-    } else {
-        updated_value = value;
-    }
-
-    if (static_cast<s32>(Memory::Read32(address)) == value) {
-        Memory::Write32(address, static_cast<u32>(updated_value));
-    } else {
-        return ERR_INVALID_STATE;
-    }
-
-    WakeThreads(waiting_threads, num_to_wake);
-    return RESULT_SUCCESS;
-}
-
-// Waits on an address if the value passed is less than the argument value, optionally decrementing.
-ResultCode WaitForAddressIfLessThan(VAddr address, s32 value, s64 timeout, bool should_decrement) {
-    // Ensure that we can read the address.
-    if (!Memory::IsValidVirtualAddress(address)) {
-        return ERR_INVALID_ADDRESS_STATE;
-    }
-
-    s32 cur_value = static_cast<s32>(Memory::Read32(address));
-    if (cur_value < value) {
-        if (should_decrement) {
-            Memory::Write32(address, static_cast<u32>(cur_value - 1));
-        }
-    } else {
-        return ERR_INVALID_STATE;
-    }
-    // Short-circuit without rescheduling, if timeout is zero.
-    if (timeout == 0) {
-        return RESULT_TIMEOUT;
-    }
-
-    return WaitForAddress(address, timeout);
-}
-
-// Waits on an address if the value passed is equal to the argument value.
-ResultCode WaitForAddressIfEqual(VAddr address, s32 value, s64 timeout) {
-    // Ensure that we can read the address.
-    if (!Memory::IsValidVirtualAddress(address)) {
-        return ERR_INVALID_ADDRESS_STATE;
-    }
-    // Only wait for the address if equal.
-    if (static_cast<s32>(Memory::Read32(address)) != value) {
-        return ERR_INVALID_STATE;
-    }
-    // Short-circuit without rescheduling, if timeout is zero.
-    if (timeout == 0) {
-        return RESULT_TIMEOUT;
-    }
-
-    return WaitForAddress(address, timeout);
-}
-} // namespace AddressArbiter
 } // namespace Kernel
--- a/src/core/hle/kernel/address_arbiter.h
+++ b/src/core/hle/kernel/address_arbiter.h
@@ -5,30 +5,68 @@
 #pragma once

 #include "common/common_types.h"
+#include "core/hle/kernel/address_arbiter.h"

 union ResultCode;

+namespace Core {
+class System;
+}
+
 namespace Kernel {

-namespace AddressArbiter {
-enum class ArbitrationType {
-    WaitIfLessThan = 0,
-    DecrementAndWaitIfLessThan = 1,
-    WaitIfEqual = 2,
+class Thread;
+
+class AddressArbiter {
+public:
+    enum class ArbitrationType {
+        WaitIfLessThan = 0,
+        DecrementAndWaitIfLessThan = 1,
+        WaitIfEqual = 2,
+    };
+
+    enum class SignalType {
+        Signal = 0,
+        IncrementAndSignalIfEqual = 1,
+        ModifyByWaitingCountAndSignalIfEqual = 2,
+    };
+
+    explicit AddressArbiter(Core::System& system);
+    ~AddressArbiter();
+
+    AddressArbiter(const AddressArbiter&) = delete;
+    AddressArbiter& operator=(const AddressArbiter&) = delete;
+
+    AddressArbiter(AddressArbiter&&) = default;
+    AddressArbiter& operator=(AddressArbiter&&) = delete;
+
+    /// Signals an address being waited on.
+    ResultCode SignalToAddress(VAddr address, s32 num_to_wake);
+
+    /// Signals an address being waited on and increments its value if equal to the value argument.
+    ResultCode IncrementAndSignalToAddressIfEqual(VAddr address, s32 value, s32 num_to_wake);
+
+    /// Signals an address being waited on and modifies its value based on waiting thread count if
+    /// equal to the value argument.
+    ResultCode ModifyByWaitingCountAndSignalToAddressIfEqual(VAddr address, s32 value,
+                                                             s32 num_to_wake);
+
+    /// Waits on an address if the value passed is less than the argument value,
+    /// optionally decrementing.
+    ResultCode WaitForAddressIfLessThan(VAddr address, s32 value, s64 timeout,
+                                        bool should_decrement);
+
+    /// Waits on an address if the value passed is equal to the argument value.
+    ResultCode WaitForAddressIfEqual(VAddr address, s32 value, s64 timeout);
+
+private:
+    // Waits on the given address with a timeout in nanoseconds
+    ResultCode WaitForAddress(VAddr address, s64 timeout);
+
+    // Gets the threads waiting on an address.
+    std::vector<SharedPtr<Thread>> GetThreadsWaitingOnAddress(VAddr address) const;
+
+    Core::System& system;
 };

-enum class SignalType {
-    Signal = 0,
-    IncrementAndSignalIfEqual = 1,
-    ModifyByWaitingCountAndSignalIfEqual = 2,
-};
-
-ResultCode SignalToAddress(VAddr address, s32 num_to_wake);
-ResultCode IncrementAndSignalToAddressIfEqual(VAddr address, s32 value, s32 num_to_wake);
-ResultCode ModifyByWaitingCountAndSignalToAddressIfEqual(VAddr address, s32 value, s32 num_to_wake);
-
-ResultCode WaitForAddressIfLessThan(VAddr address, s32 value, s64 timeout, bool should_decrement);
-ResultCode WaitForAddressIfEqual(VAddr address, s32 value, s64 timeout);
-} // namespace AddressArbiter
-
 } // namespace Kernel
--- a/src/core/hle/kernel/errors.h
+++ b/src/core/hle/kernel/errors.h
@@ -14,6 +14,7 @@ constexpr ResultCode ERR_MAX_CONNECTIONS_REACHED{ErrorModule::Kernel, 7};
 constexpr ResultCode ERR_INVALID_CAPABILITY_DESCRIPTOR{ErrorModule::Kernel, 14};
 constexpr ResultCode ERR_INVALID_SIZE{ErrorModule::Kernel, 101};
 constexpr ResultCode ERR_INVALID_ADDRESS{ErrorModule::Kernel, 102};
+constexpr ResultCode ERR_OUT_OF_MEMORY{ErrorModule::Kernel, 104};
 constexpr ResultCode ERR_HANDLE_TABLE_FULL{ErrorModule::Kernel, 105};
 constexpr ResultCode ERR_INVALID_ADDRESS_STATE{ErrorModule::Kernel, 106};
 constexpr ResultCode ERR_INVALID_MEMORY_PERMISSIONS{ErrorModule::Kernel, 108};
--- a/src/core/hle/kernel/handle_table.cpp
+++ b/src/core/hle/kernel/handle_table.cpp
@@ -14,32 +14,47 @@
 namespace Kernel {
 namespace {
 constexpr u16 GetSlot(Handle handle) {
-    return handle >> 15;
+    return static_cast<u16>(handle >> 15);
 }

 constexpr u16 GetGeneration(Handle handle) {
-    return handle & 0x7FFF;
+    return static_cast<u16>(handle & 0x7FFF);
 }
 } // Anonymous namespace

 HandleTable::HandleTable() {
-    next_generation = 1;
    Clear();
 }

 HandleTable::~HandleTable() = default;

+ResultCode HandleTable::SetSize(s32 handle_table_size) {
+    if (static_cast<u32>(handle_table_size) > MAX_COUNT) {
+        return ERR_OUT_OF_MEMORY;
+    }
+
+    // Values less than or equal to zero indicate to use the maximum allowable
+    // size for the handle table in the actual kernel, so we ignore the given
+    // value in that case, since we assume this by default unless this function
+    // is called.
+    if (handle_table_size > 0) {
+        table_size = static_cast<u16>(handle_table_size);
+    }
+
+    return RESULT_SUCCESS;
+}
+
 ResultVal<Handle> HandleTable::Create(SharedPtr<Object> obj) {
    DEBUG_ASSERT(obj != nullptr);

-    u16 slot = next_free_slot;
-    if (slot >= generations.size()) {
+    const u16 slot = next_free_slot;
+    if (slot >= table_size) {
        LOG_ERROR(Kernel, "Unable to allocate Handle, too many slots in use.");
        return ERR_HANDLE_TABLE_FULL;
    }
    next_free_slot = generations[slot];

-    u16 generation = next_generation++;
+    const u16 generation = next_generation++;

    // Overflow count so it fits in the 15 bits dedicated to the generation in the handle.
    // Horizon OS uses zero to represent an invalid handle, so skip to 1.
@@ -64,10 +79,11 @@ ResultVal<Handle> HandleTable::Duplicate(Handle handle) {
 }

 ResultCode HandleTable::Close(Handle handle) {
-    if (!IsValid(handle))
+    if (!IsValid(handle)) {
        return ERR_INVALID_HANDLE;
+    }

-    u16 slot = GetSlot(handle);
+    const u16 slot = GetSlot(handle);

    objects[slot] = nullptr;

@@ -77,10 +93,10 @@ ResultCode HandleTable::Close(Handle handle) {
 }

 bool HandleTable::IsValid(Handle handle) const {
-    std::size_t slot = GetSlot(handle);
-    u16 generation = GetGeneration(handle);
+    const std::size_t slot = GetSlot(handle);
+    const u16 generation = GetGeneration(handle);

-    return slot < MAX_COUNT && objects[slot] != nullptr && generations[slot] == generation;
+    return slot < table_size && objects[slot] != nullptr && generations[slot] == generation;
 }

 SharedPtr<Object> HandleTable::GetGeneric(Handle handle) const {
@@ -97,7 +113,7 @@ SharedPtr<Object> HandleTable::GetGeneric(Handle handle) const {
 }

 void HandleTable::Clear() {
-    for (u16 i = 0; i < MAX_COUNT; ++i) {
+    for (u16 i = 0; i < table_size; ++i) {
        generations[i] = i + 1;
        objects[i] = nullptr;
    }
--- a/src/core/hle/kernel/handle_table.h
+++ b/src/core/hle/kernel/handle_table.h
@@ -49,6 +49,20 @@ public:
    HandleTable();
    ~HandleTable();

+    /**
+     * Sets the number of handles that may be in use at one time
+     * for this handle table.
+     *
+     * @param handle_table_size The desired size to limit the handle table to.
+     *
+     * @returns an error code indicating if initialization was successful.
+     *          If initialization was not successful, then ERR_OUT_OF_MEMORY
+     *          will be returned.
+     *
+     * @pre handle_table_size must be within the range [0, 1024]
+     */
+    ResultCode SetSize(s32 handle_table_size);
+
    /**
     * Allocates a handle for the given object.
     * @return The created Handle or one of the following errors:
@@ -103,14 +117,21 @@ private:
     */
    std::array<u16, MAX_COUNT> generations;

+    /**
+     * The limited size of the handle table. This can be specified by process
+     * capabilities in order to restrict the overall number of handles that
+     * can be created in a process instance
+     */
+    u16 table_size = static_cast<u16>(MAX_COUNT);
+
    /**
     * Global counter of the number of created handles. Stored in `generations` when a handle is
     * created, and wraps around to 1 when it hits 0x8000.
     */
-    u16 next_generation;
+    u16 next_generation = 1;

    /// Head of the free slots linked list.
-    u16 next_free_slot;
+    u16 next_free_slot = 0;
 };

 } // namespace Kernel
--- a/src/core/hle/kernel/hle_ipc.h
+++ b/src/core/hle/kernel/hle_ipc.h
@@ -15,6 +15,8 @@
 #include "core/hle/ipc.h"
 #include "core/hle/kernel/object.h"

+union ResultCode;
+
 namespace Service {
 class ServiceFrameworkBase;
 }
@@ -208,14 +210,12 @@ public:

    template <typename T>
    SharedPtr<T> GetCopyObject(std::size_t index) {
-        ASSERT(index < copy_objects.size());
-        return DynamicObjectCast<T>(copy_objects[index]);
+        return DynamicObjectCast<T>(copy_objects.at(index));
    }

    template <typename T>
    SharedPtr<T> GetMoveObject(std::size_t index) {
-        ASSERT(index < move_objects.size());
-        return DynamicObjectCast<T>(move_objects[index]);
+        return DynamicObjectCast<T>(move_objects.at(index));
    }

    void AddMoveObject(SharedPtr<Object> object) {
@@ -232,7 +232,7 @@ public:

    template <typename T>
    std::shared_ptr<T> GetDomainRequestHandler(std::size_t index) const {
-        return std::static_pointer_cast<T>(domain_request_handlers[index]);
+        return std::static_pointer_cast<T>(domain_request_handlers.at(index));
    }

    void SetDomainRequestHandlers(
--- a/src/core/hle/kernel/kernel.cpp
+++ b/src/core/hle/kernel/kernel.cpp
@@ -12,6 +12,7 @@

 #include "core/core.h"
 #include "core/core_timing.h"
+#include "core/hle/kernel/address_arbiter.h"
 #include "core/hle/kernel/client_port.h"
 #include "core/hle/kernel/handle_table.h"
 #include "core/hle/kernel/kernel.h"
@@ -86,11 +87,13 @@ static void ThreadWakeupCallback(u64 thread_handle, [[maybe_unused]] int cycles_
 }

 struct KernelCore::Impl {
-    void Initialize(KernelCore& kernel, Core::Timing::CoreTiming& core_timing) {
+    explicit Impl(Core::System& system) : address_arbiter{system}, system{system} {}
+
+    void Initialize(KernelCore& kernel) {
        Shutdown();

        InitializeSystemResourceLimit(kernel);
-        InitializeThreads(core_timing);
+        InitializeThreads();
    }

    void Shutdown() {
@@ -122,9 +125,9 @@ struct KernelCore::Impl {
        ASSERT(system_resource_limit->SetLimitValue(ResourceType::Sessions, 900).IsSuccess());
    }

-    void InitializeThreads(Core::Timing::CoreTiming& core_timing) {
+    void InitializeThreads() {
        thread_wakeup_event_type =
-            core_timing.RegisterEvent("ThreadWakeupCallback", ThreadWakeupCallback);
+            system.CoreTiming().RegisterEvent("ThreadWakeupCallback", ThreadWakeupCallback);
    }

    std::atomic<u32> next_object_id{0};
@@ -135,6 +138,8 @@ struct KernelCore::Impl {
    std::vector<SharedPtr<Process>> process_list;
    Process* current_process = nullptr;

+    Kernel::AddressArbiter address_arbiter;
+
    SharedPtr<ResourceLimit> system_resource_limit;

    Core::Timing::EventType* thread_wakeup_event_type = nullptr;
@@ -145,15 +150,18 @@ struct KernelCore::Impl {
    /// Map of named ports managed by the kernel, which can be retrieved using
    /// the ConnectToPort SVC.
    NamedPortTable named_ports;
+
+    // System context
+    Core::System& system;
 };

-KernelCore::KernelCore() : impl{std::make_unique<Impl>()} {}
+KernelCore::KernelCore(Core::System& system) : impl{std::make_unique<Impl>(system)} {}
 KernelCore::~KernelCore() {
    Shutdown();
 }

-void KernelCore::Initialize(Core::Timing::CoreTiming& core_timing) {
-    impl->Initialize(*this, core_timing);
+void KernelCore::Initialize() {
+    impl->Initialize(*this);
 }

 void KernelCore::Shutdown() {
@@ -184,6 +192,14 @@ const Process* KernelCore::CurrentProcess() const {
    return impl->current_process;
 }

+AddressArbiter& KernelCore::AddressArbiter() {
+    return impl->address_arbiter;
+}
+
+const AddressArbiter& KernelCore::AddressArbiter() const {
+    return impl->address_arbiter;
+}
+
 void KernelCore::AddNamedPort(std::string name, SharedPtr<ClientPort> port) {
    impl->named_ports.emplace(std::move(name), std::move(port));
 }
--- a/src/core/hle/kernel/kernel.h
+++ b/src/core/hle/kernel/kernel.h
@@ -11,6 +11,10 @@
 template <typename T>
 class ResultVal;

+namespace Core {
+class System;
+}
+
 namespace Core::Timing {
 class CoreTiming;
 struct EventType;
@@ -18,6 +22,7 @@ struct EventType;

 namespace Kernel {

+class AddressArbiter;
 class ClientPort;
 class HandleTable;
 class Process;
@@ -30,7 +35,14 @@ private:
    using NamedPortTable = std::unordered_map<std::string, SharedPtr<ClientPort>>;

 public:
-    KernelCore();
+    /// Constructs an instance of the kernel using the given System
+    /// instance as a context for any necessary system-related state,
+    /// such as threads, CPU core state, etc.
+    ///
+    /// @post After execution of the constructor, the provided System
+    ///       object *must* outlive the kernel instance itself.
+    ///
+    explicit KernelCore(Core::System& system);
    ~KernelCore();

    KernelCore(const KernelCore&) = delete;
@@ -40,11 +52,7 @@ public:
    KernelCore& operator=(KernelCore&&) = delete;

    /// Resets the kernel to a clean slate for use.
-    ///
-    /// @param core_timing CoreTiming instance used to create any necessary
-    ///                    kernel-specific callback events.
-    ///
-    void Initialize(Core::Timing::CoreTiming& core_timing);
+    void Initialize();

    /// Clears all resources in use by the kernel instance.
    void Shutdown();
@@ -67,6 +75,12 @@ public:
    /// Retrieves a const pointer to the current process.
    const Process* CurrentProcess() const;

+    /// Provides a reference to the kernel's address arbiter.
+    Kernel::AddressArbiter& AddressArbiter();
+
+    /// Provides a const reference to the kernel's address arbiter.
+    const Kernel::AddressArbiter& AddressArbiter() const;
+
    /// Adds a port to the named port table
    void AddNamedPort(std::string name, SharedPtr<ClientPort> port);

--- a/src/core/hle/kernel/process.cpp
+++ b/src/core/hle/kernel/process.cpp
@@ -99,7 +99,13 @@ ResultCode Process::LoadFromMetadata(const FileSys::ProgramMetadata& metadata) {
    vm_manager.Reset(metadata.GetAddressSpaceType());

    const auto& caps = metadata.GetKernelCapabilities();
-    return capabilities.InitializeForUserProcess(caps.data(), caps.size(), vm_manager);
+    const auto capability_init_result =
+        capabilities.InitializeForUserProcess(caps.data(), caps.size(), vm_manager);
+    if (capability_init_result.IsError()) {
+        return capability_init_result;
+    }
+
+    return handle_table.SetSize(capabilities.GetHandleTableSize());
 }

 void Process::Run(VAddr entry_point, s32 main_thread_priority, u32 stack_size) {
--- a/src/core/hle/kernel/process_capability.cpp
+++ b/src/core/hle/kernel/process_capability.cpp
@@ -96,7 +96,7 @@ void ProcessCapabilities::InitializeForMetadatalessProcess() {
    interrupt_capabilities.set();

    // Allow using the maximum possible amount of handles
-    handle_table_size = static_cast<u32>(HandleTable::MAX_COUNT);
+    handle_table_size = static_cast<s32>(HandleTable::MAX_COUNT);

    // Allow all debugging capabilities.
    is_debuggable = true;
@@ -337,7 +337,7 @@ ResultCode ProcessCapabilities::HandleHandleTableFlags(u32 flags) {
        return ERR_RESERVED_VALUE;
    }

-    handle_table_size = (flags >> 16) & 0x3FF;
+    handle_table_size = static_cast<s32>((flags >> 16) & 0x3FF);
    return RESULT_SUCCESS;
 }

--- a/src/core/hle/kernel/process_capability.h
+++ b/src/core/hle/kernel/process_capability.h
@@ -156,7 +156,7 @@ public:
    }

    /// Gets the number of total allowable handles for the process' handle table.
-    u32 GetHandleTableSize() const {
+    s32 GetHandleTableSize() const {
        return handle_table_size;
    }

@@ -252,7 +252,7 @@ private:
    u64 core_mask = 0;
    u64 priority_mask = 0;

-    u32 handle_table_size = 0;
+    s32 handle_table_size = 0;
    u32 kernel_version = 0;

    ProgramType program_type = ProgramType::SysModule;
--- a/src/core/hle/kernel/svc.cpp
+++ b/src/core/hle/kernel/svc.cpp
@@ -20,6 +20,7 @@
 #include "core/hle/kernel/address_arbiter.h"
 #include "core/hle/kernel/client_port.h"
 #include "core/hle/kernel/client_session.h"
+#include "core/hle/kernel/errors.h"
 #include "core/hle/kernel/handle_table.h"
 #include "core/hle/kernel/kernel.h"
 #include "core/hle/kernel/mutex.h"
@@ -47,23 +48,6 @@ constexpr bool IsValidAddressRange(VAddr address, u64 size) {
    return address + size > address;
 }

-// Checks if a given address range lies within a larger address range.
-constexpr bool IsInsideAddressRange(VAddr address, u64 size, VAddr address_range_begin,
-                                    VAddr address_range_end) {
-    const VAddr end_address = address + size - 1;
-    return address_range_begin <= address && end_address <= address_range_end - 1;
-}
-
-bool IsInsideAddressSpace(const VMManager& vm, VAddr address, u64 size) {
-    return IsInsideAddressRange(address, size, vm.GetAddressSpaceBaseAddress(),
-                                vm.GetAddressSpaceEndAddress());
-}
-
-bool IsInsideNewMapRegion(const VMManager& vm, VAddr address, u64 size) {
-    return IsInsideAddressRange(address, size, vm.GetNewMapRegionBaseAddress(),
-                                vm.GetNewMapRegionEndAddress());
-}
-
 // 8 GiB
 constexpr u64 MAIN_MEMORY_SIZE = 0x200000000;

@@ -105,14 +89,14 @@ ResultCode MapUnmapMemorySanityChecks(const VMManager& vm_manager, VAddr dst_add
        return ERR_INVALID_ADDRESS_STATE;
    }

-    if (!IsInsideAddressSpace(vm_manager, src_addr, size)) {
+    if (!vm_manager.IsWithinAddressSpace(src_addr, size)) {
        LOG_ERROR(Kernel_SVC,
                  "Source is not within the address space, addr=0x{:016X}, size=0x{:016X}",
                  src_addr, size);
        return ERR_INVALID_ADDRESS_STATE;
    }

-    if (!IsInsideNewMapRegion(vm_manager, dst_addr, size)) {
+    if (!vm_manager.IsWithinNewMapRegion(dst_addr, size)) {
        LOG_ERROR(Kernel_SVC,
                  "Destination is not within the new map region, addr=0x{:016X}, size=0x{:016X}",
                  dst_addr, size);
@@ -238,7 +222,7 @@ static ResultCode SetMemoryPermission(VAddr addr, u64 size, u32 prot) {
    auto* const current_process = Core::CurrentProcess();
    auto& vm_manager = current_process->VMManager();

-    if (!IsInsideAddressSpace(vm_manager, addr, size)) {
+    if (!vm_manager.IsWithinAddressSpace(addr, size)) {
        LOG_ERROR(Kernel_SVC,
                  "Source is not within the address space, addr=0x{:016X}, size=0x{:016X}", addr,
                  size);
@@ -299,7 +283,7 @@ static ResultCode SetMemoryAttribute(VAddr address, u64 size, u32 mask, u32 attr
    }

    auto& vm_manager = Core::CurrentProcess()->VMManager();
-    if (!IsInsideAddressSpace(vm_manager, address, size)) {
+    if (!vm_manager.IsWithinAddressSpace(address, size)) {
        LOG_ERROR(Kernel_SVC,
                  "Given address (0x{:016X}) is outside the bounds of the address space.", address);
        return ERR_INVALID_ADDRESS_STATE;
@@ -1495,13 +1479,14 @@ static ResultCode WaitForAddress(VAddr address, u32 type, s32 value, s64 timeout
        return ERR_INVALID_ADDRESS;
    }

+    auto& address_arbiter = Core::System::GetInstance().Kernel().AddressArbiter();
    switch (static_cast<AddressArbiter::ArbitrationType>(type)) {
    case AddressArbiter::ArbitrationType::WaitIfLessThan:
-        return AddressArbiter::WaitForAddressIfLessThan(address, value, timeout, false);
+        return address_arbiter.WaitForAddressIfLessThan(address, value, timeout, false);
    case AddressArbiter::ArbitrationType::DecrementAndWaitIfLessThan:
-        return AddressArbiter::WaitForAddressIfLessThan(address, value, timeout, true);
+        return address_arbiter.WaitForAddressIfLessThan(address, value, timeout, true);
    case AddressArbiter::ArbitrationType::WaitIfEqual:
-        return AddressArbiter::WaitForAddressIfEqual(address, value, timeout);
+        return address_arbiter.WaitForAddressIfEqual(address, value, timeout);
    default:
        LOG_ERROR(Kernel_SVC,
                  "Invalid arbitration type, expected WaitIfLessThan, DecrementAndWaitIfLessThan "
@@ -1526,13 +1511,14 @@ static ResultCode SignalToAddress(VAddr address, u32 type, s32 value, s32 num_to
        return ERR_INVALID_ADDRESS;
    }

+    auto& address_arbiter = Core::System::GetInstance().Kernel().AddressArbiter();
    switch (static_cast<AddressArbiter::SignalType>(type)) {
    case AddressArbiter::SignalType::Signal:
-        return AddressArbiter::SignalToAddress(address, num_to_wake);
+        return address_arbiter.SignalToAddress(address, num_to_wake);
    case AddressArbiter::SignalType::IncrementAndSignalIfEqual:
-        return AddressArbiter::IncrementAndSignalToAddressIfEqual(address, value, num_to_wake);
+        return address_arbiter.IncrementAndSignalToAddressIfEqual(address, value, num_to_wake);
    case AddressArbiter::SignalType::ModifyByWaitingCountAndSignalIfEqual:
-        return AddressArbiter::ModifyByWaitingCountAndSignalToAddressIfEqual(address, value,
+        return address_arbiter.ModifyByWaitingCountAndSignalToAddressIfEqual(address, value,
                                                                             num_to_wake);
    default:
        LOG_ERROR(Kernel_SVC,
--- a/src/core/hle/kernel/thread.cpp
+++ b/src/core/hle/kernel/thread.cpp
@@ -184,8 +184,6 @@ ResultVal<SharedPtr<Thread>> Thread::Create(KernelCore& kernel, std::string name
        return ERR_INVALID_PROCESSOR_ID;
    }

-    // TODO(yuriks): Other checks, returning 0xD9001BEA
-
    if (!Memory::IsValidVirtualAddress(owner_process, entry_point)) {
        LOG_ERROR(Kernel_SVC, "(name={}): invalid entry {:016X}", name, entry_point);
        // TODO (bunnei): Find the correct error code to use here
--- a/src/core/hle/kernel/vm_manager.cpp
+++ b/src/core/hle/kernel/vm_manager.cpp
@@ -17,8 +17,8 @@
 #include "core/memory_setup.h"

 namespace Kernel {
-
-static const char* GetMemoryStateName(MemoryState state) {
+namespace {
+const char* GetMemoryStateName(MemoryState state) {
    static constexpr const char* names[] = {
        "Unmapped",         "Io",
        "Normal",           "CodeStatic",
@@ -35,6 +35,14 @@ static const char* GetMemoryStateName(MemoryState state) {
    return names[ToSvcMemoryState(state)];
 }

+// Checks if a given address range lies within a larger address range.
+constexpr bool IsInsideAddressRange(VAddr address, u64 size, VAddr address_range_begin,
+                                    VAddr address_range_end) {
+    const VAddr end_address = address + size - 1;
+    return address_range_begin <= address && end_address <= address_range_end - 1;
+}
+} // Anonymous namespace
+
 bool VirtualMemoryArea::CanBeMergedWith(const VirtualMemoryArea& next) const {
    ASSERT(base + size == next.base);
    if (permissions != next.permissions || state != next.state || attribute != next.attribute ||
@@ -249,8 +257,7 @@ ResultCode VMManager::ReprotectRange(VAddr target, u64 size, VMAPermission new_p
 }

 ResultVal<VAddr> VMManager::HeapAllocate(VAddr target, u64 size, VMAPermission perms) {
-    if (target < GetHeapRegionBaseAddress() || target + size > GetHeapRegionEndAddress() ||
-        target + size < target) {
+    if (!IsWithinHeapRegion(target, size)) {
        return ERR_INVALID_ADDRESS;
    }

@@ -285,8 +292,7 @@ ResultVal<VAddr> VMManager::HeapAllocate(VAddr target, u64 size, VMAPermission p
 }

 ResultCode VMManager::HeapFree(VAddr target, u64 size) {
-    if (target < GetHeapRegionBaseAddress() || target + size > GetHeapRegionEndAddress() ||
-        target + size < target) {
+    if (!IsWithinHeapRegion(target, size)) {
        return ERR_INVALID_ADDRESS;
    }

@@ -706,6 +712,11 @@ u64 VMManager::GetAddressSpaceWidth() const {
    return address_space_width;
 }

+bool VMManager::IsWithinAddressSpace(VAddr address, u64 size) const {
+    return IsInsideAddressRange(address, size, GetAddressSpaceBaseAddress(),
+                                GetAddressSpaceEndAddress());
+}
+
 VAddr VMManager::GetASLRRegionBaseAddress() const {
    return aslr_region_base;
 }
@@ -750,6 +761,11 @@ u64 VMManager::GetCodeRegionSize() const {
    return code_region_end - code_region_base;
 }

+bool VMManager::IsWithinCodeRegion(VAddr address, u64 size) const {
+    return IsInsideAddressRange(address, size, GetCodeRegionBaseAddress(),
+                                GetCodeRegionEndAddress());
+}
+
 VAddr VMManager::GetHeapRegionBaseAddress() const {
    return heap_region_base;
 }
@@ -762,6 +778,11 @@ u64 VMManager::GetHeapRegionSize() const {
    return heap_region_end - heap_region_base;
 }

+bool VMManager::IsWithinHeapRegion(VAddr address, u64 size) const {
+    return IsInsideAddressRange(address, size, GetHeapRegionBaseAddress(),
+                                GetHeapRegionEndAddress());
+}
+
 VAddr VMManager::GetMapRegionBaseAddress() const {
    return map_region_base;
 }
@@ -774,6 +795,10 @@ u64 VMManager::GetMapRegionSize() const {
    return map_region_end - map_region_base;
 }

+bool VMManager::IsWithinMapRegion(VAddr address, u64 size) const {
+    return IsInsideAddressRange(address, size, GetMapRegionBaseAddress(), GetMapRegionEndAddress());
+}
+
 VAddr VMManager::GetNewMapRegionBaseAddress() const {
    return new_map_region_base;
 }
@@ -786,6 +811,11 @@ u64 VMManager::GetNewMapRegionSize() const {
    return new_map_region_end - new_map_region_base;
 }

+bool VMManager::IsWithinNewMapRegion(VAddr address, u64 size) const {
+    return IsInsideAddressRange(address, size, GetNewMapRegionBaseAddress(),
+                                GetNewMapRegionEndAddress());
+}
+
 VAddr VMManager::GetTLSIORegionBaseAddress() const {
    return tls_io_region_base;
 }
@@ -798,4 +828,9 @@ u64 VMManager::GetTLSIORegionSize() const {
    return tls_io_region_end - tls_io_region_base;
 }

+bool VMManager::IsWithinTLSIORegion(VAddr address, u64 size) const {
+    return IsInsideAddressRange(address, size, GetTLSIORegionBaseAddress(),
+                                GetTLSIORegionEndAddress());
+}
+
 } // namespace Kernel
--- a/src/core/hle/kernel/vm_manager.h
+++ b/src/core/hle/kernel/vm_manager.h
@@ -432,18 +432,21 @@ public:
    /// Gets the address space width in bits.
    u64 GetAddressSpaceWidth() const;

+    /// Determines whether or not the given address range lies within the address space.
+    bool IsWithinAddressSpace(VAddr address, u64 size) const;
+
    /// Gets the base address of the ASLR region.
    VAddr GetASLRRegionBaseAddress() const;

    /// Gets the end address of the ASLR region.
    VAddr GetASLRRegionEndAddress() const;

-    /// Determines whether or not the specified address range is within the ASLR region.
-    bool IsWithinASLRRegion(VAddr address, u64 size) const;
-
    /// Gets the size of the ASLR region
    u64 GetASLRRegionSize() const;

+    /// Determines whether or not the specified address range is within the ASLR region.
+    bool IsWithinASLRRegion(VAddr address, u64 size) const;
+
    /// Gets the base address of the code region.
    VAddr GetCodeRegionBaseAddress() const;

@@ -453,6 +456,9 @@ public:
    /// Gets the total size of the code region in bytes.
    u64 GetCodeRegionSize() const;

+    /// Determines whether or not the specified range is within the code region.
+    bool IsWithinCodeRegion(VAddr address, u64 size) const;
+
    /// Gets the base address of the heap region.
    VAddr GetHeapRegionBaseAddress() const;

@@ -462,6 +468,9 @@ public:
    /// Gets the total size of the heap region in bytes.
    u64 GetHeapRegionSize() const;

+    /// Determines whether or not the specified range is within the heap region.
+    bool IsWithinHeapRegion(VAddr address, u64 size) const;
+
    /// Gets the base address of the map region.
    VAddr GetMapRegionBaseAddress() const;

@@ -471,6 +480,9 @@ public:
    /// Gets the total size of the map region in bytes.
    u64 GetMapRegionSize() const;

+    /// Determines whether or not the specified range is within the map region.
+    bool IsWithinMapRegion(VAddr address, u64 size) const;
+
    /// Gets the base address of the new map region.
    VAddr GetNewMapRegionBaseAddress() const;

@@ -480,6 +492,9 @@ public:
    /// Gets the total size of the new map region in bytes.
    u64 GetNewMapRegionSize() const;

+    /// Determines whether or not the given address range is within the new map region
+    bool IsWithinNewMapRegion(VAddr address, u64 size) const;
+
    /// Gets the base address of the TLS IO region.
    VAddr GetTLSIORegionBaseAddress() const;

@@ -489,6 +504,9 @@ public:
    /// Gets the total size of the TLS IO region in bytes.
    u64 GetTLSIORegionSize() const;

+    /// Determines if the given address range is within the TLS IO region.
+    bool IsWithinTLSIORegion(VAddr address, u64 size) const;
+
    /// Each VMManager has its own page table, which is set as the main one when the owning process
    /// is scheduled.
    Memory::PageTable page_table;
--- a/src/core/hle/result.h
+++ b/src/core/hle/result.h
@@ -8,7 +8,6 @@
 #include <utility>
 #include "common/assert.h"
 #include "common/bit_field.h"
-#include "common/common_funcs.h"
 #include "common/common_types.h"

 // All the constants in this file come from http://switchbrew.org/index.php?title=Error_codes
--- a/src/core/hle/service/am/applets/software_keyboard.cpp
+++ b/src/core/hle/service/am/applets/software_keyboard.cpp
@@ -7,6 +7,7 @@
 #include "common/string_util.h"
 #include "core/core.h"
 #include "core/frontend/applets/software_keyboard.h"
+#include "core/hle/result.h"
 #include "core/hle/service/am/am.h"
 #include "core/hle/service/am/applets/software_keyboard.h"

--- a/src/core/hle/service/am/applets/software_keyboard.h
+++ b/src/core/hle/service/am/applets/software_keyboard.h
@@ -9,10 +9,13 @@
 #include <vector>

 #include "common/common_funcs.h"
+#include "common/common_types.h"
 #include "common/swap.h"
 #include "core/hle/service/am/am.h"
 #include "core/hle/service/am/applets/applets.h"

+union ResultCode;
+
 namespace Service::AM::Applets {

 enum class KeysetDisable : u32 {
--- a/src/core/hle/service/audio/audout_u.cpp
+++ b/src/core/hle/service/audio/audout_u.cpp
@@ -18,17 +18,11 @@
 #include "core/hle/kernel/readable_event.h"
 #include "core/hle/kernel/writable_event.h"
 #include "core/hle/service/audio/audout_u.h"
+#include "core/hle/service/audio/errors.h"
 #include "core/memory.h"

 namespace Service::Audio {

-namespace ErrCodes {
-enum {
-    ErrorUnknown = 2,
-    BufferCountExceeded = 8,
-};
-}
-
 constexpr std::array<char, 10> DefaultDevice{{"DeviceOut"}};
 constexpr int DefaultSampleRate{48000};

@@ -100,7 +94,7 @@ private:

        if (stream->IsPlaying()) {
            IPC::ResponseBuilder rb{ctx, 2};
-            rb.Push(ResultCode(ErrorModule::Audio, ErrCodes::ErrorUnknown));
+            rb.Push(ERR_OPERATION_FAILED);
            return;
        }

@@ -143,7 +137,8 @@ private:

        if (!audio_core.QueueBuffer(stream, tag, std::move(samples))) {
            IPC::ResponseBuilder rb{ctx, 2};
-            rb.Push(ResultCode(ErrorModule::Audio, ErrCodes::BufferCountExceeded));
+            rb.Push(ERR_BUFFER_COUNT_EXCEEDED);
+            return;
        }

        IPC::ResponseBuilder rb{ctx, 2};
--- a/src/core/hle/service/audio/audren_u.cpp
+++ b/src/core/hle/service/audio/audren_u.cpp
@@ -17,6 +17,7 @@
 #include "core/hle/kernel/readable_event.h"
 #include "core/hle/kernel/writable_event.h"
 #include "core/hle/service/audio/audren_u.h"
+#include "core/hle/service/audio/errors.h"

 namespace Service::Audio {

@@ -37,7 +38,7 @@ public:
            {8, &IAudioRenderer::SetRenderingTimeLimit, "SetRenderingTimeLimit"},
            {9, &IAudioRenderer::GetRenderingTimeLimit, "GetRenderingTimeLimit"},
            {10, &IAudioRenderer::RequestUpdateImpl, "RequestUpdateAuto"},
-            {11, nullptr, "ExecuteAudioRendererRendering"},
+            {11, &IAudioRenderer::ExecuteAudioRendererRendering, "ExecuteAudioRendererRendering"},
        };
        // clang-format on
        RegisterHandlers(functions);
@@ -138,6 +139,17 @@ private:
        rb.Push(rendering_time_limit_percent);
    }

+    void ExecuteAudioRendererRendering(Kernel::HLERequestContext& ctx) {
+        LOG_DEBUG(Service_Audio, "called");
+
+        // This service command currently only reports an unsupported operation
+        // error code, or aborts. Given that, we just always return an error
+        // code in this case.
+
+        IPC::ResponseBuilder rb{ctx, 2};
+        rb.Push(ERR_NOT_SUPPORTED);
+    }
+
    Kernel::EventPair system_event;
    std::unique_ptr<AudioCore::AudioRenderer> renderer;
    u32 rendering_time_limit_percent = 100;
@@ -235,7 +247,7 @@ AudRenU::AudRenU() : ServiceFramework("audren:u") {
        {0, &AudRenU::OpenAudioRenderer, "OpenAudioRenderer"},
        {1, &AudRenU::GetAudioRendererWorkBufferSize, "GetAudioRendererWorkBufferSize"},
        {2, &AudRenU::GetAudioDeviceService, "GetAudioDeviceService"},
-        {3, nullptr, "OpenAudioRendererAuto"},
+        {3, &AudRenU::OpenAudioRendererAuto, "OpenAudioRendererAuto"},
        {4, &AudRenU::GetAudioDeviceServiceWithRevisionInfo, "GetAudioDeviceServiceWithRevisionInfo"},
    };
    // clang-format on
@@ -248,12 +260,7 @@ AudRenU::~AudRenU() = default;
 void AudRenU::OpenAudioRenderer(Kernel::HLERequestContext& ctx) {
    LOG_DEBUG(Service_Audio, "called");

-    IPC::RequestParser rp{ctx};
-    auto params = rp.PopRaw<AudioCore::AudioRendererParameter>();
-    IPC::ResponseBuilder rb{ctx, 2, 0, 1};
-
-    rb.Push(RESULT_SUCCESS);
-    rb.PushIpcInterface<Audio::IAudioRenderer>(std::move(params));
+    OpenAudioRendererImpl(ctx);
 }

 void AudRenU::GetAudioRendererWorkBufferSize(Kernel::HLERequestContext& ctx) {
@@ -262,20 +269,20 @@ void AudRenU::GetAudioRendererWorkBufferSize(Kernel::HLERequestContext& ctx) {
    LOG_DEBUG(Service_Audio, "called");

    u64 buffer_sz = Common::AlignUp(4 * params.mix_buffer_count, 0x40);
-    buffer_sz += params.unknown_c * 1024;
-    buffer_sz += 0x940 * (params.unknown_c + 1);
+    buffer_sz += params.submix_count * 1024;
+    buffer_sz += 0x940 * (params.submix_count + 1);
    buffer_sz += 0x3F0 * params.voice_count;
-    buffer_sz += Common::AlignUp(8 * (params.unknown_c + 1), 0x10);
+    buffer_sz += Common::AlignUp(8 * (params.submix_count + 1), 0x10);
    buffer_sz += Common::AlignUp(8 * params.voice_count, 0x10);
-    buffer_sz +=
-        Common::AlignUp((0x3C0 * (params.sink_count + params.unknown_c) + 4 * params.sample_count) *
-                            (params.mix_buffer_count + 6),
-                        0x40);
+    buffer_sz += Common::AlignUp(
+        (0x3C0 * (params.sink_count + params.submix_count) + 4 * params.sample_count) *
+            (params.mix_buffer_count + 6),
+        0x40);

    if (IsFeatureSupported(AudioFeatures::Splitter, params.revision)) {
-        u32 count = params.unknown_c + 1;
+        const u32 count = params.submix_count + 1;
        u64 node_count = Common::AlignUp(count, 0x40);
-        u64 node_state_buffer_sz =
+        const u64 node_state_buffer_sz =
            4 * (node_count * node_count) + 0xC * node_count + 2 * (node_count / 8);
        u64 edge_matrix_buffer_sz = 0;
        node_count = Common::AlignUp(count * count, 0x40);
@@ -289,19 +296,19 @@ void AudRenU::GetAudioRendererWorkBufferSize(Kernel::HLERequestContext& ctx) {

    buffer_sz += 0x20 * (params.effect_count + 4 * params.voice_count) + 0x50;
    if (IsFeatureSupported(AudioFeatures::Splitter, params.revision)) {
-        buffer_sz += 0xE0 * params.unknown_2c;
+        buffer_sz += 0xE0 * params.num_splitter_send_channels;
        buffer_sz += 0x20 * params.splitter_count;
-        buffer_sz += Common::AlignUp(4 * params.unknown_2c, 0x10);
+        buffer_sz += Common::AlignUp(4 * params.num_splitter_send_channels, 0x10);
    }
    buffer_sz = Common::AlignUp(buffer_sz, 0x40) + 0x170 * params.sink_count;
    u64 output_sz = buffer_sz + 0x280 * params.sink_count + 0x4B0 * params.effect_count +
                    ((params.voice_count * 256) | 0x40);

-    if (params.unknown_1c >= 1) {
+    if (params.performance_frame_count >= 1) {
        output_sz = Common::AlignUp(((16 * params.sink_count + 16 * params.effect_count +
                                      16 * params.voice_count + 16) +
                                     0x658) *
-                                            (params.unknown_1c + 1) +
+                                            (params.performance_frame_count + 1) +
                                        0xc0,
                                    0x40) +
                    output_sz;
@@ -325,6 +332,12 @@ void AudRenU::GetAudioDeviceService(Kernel::HLERequestContext& ctx) {
    rb.PushIpcInterface<Audio::IAudioDevice>();
 }

+void AudRenU::OpenAudioRendererAuto(Kernel::HLERequestContext& ctx) {
+    LOG_DEBUG(Service_Audio, "called");
+
+    OpenAudioRendererImpl(ctx);
+}
+
 void AudRenU::GetAudioDeviceServiceWithRevisionInfo(Kernel::HLERequestContext& ctx) {
    LOG_WARNING(Service_Audio, "(STUBBED) called");

@@ -335,6 +348,15 @@ void AudRenU::GetAudioDeviceServiceWithRevisionInfo(Kernel::HLERequestContext& c
                                                // based on the current revision
 }

+void AudRenU::OpenAudioRendererImpl(Kernel::HLERequestContext& ctx) {
+    IPC::RequestParser rp{ctx};
+    const auto params = rp.PopRaw<AudioCore::AudioRendererParameter>();
+    IPC::ResponseBuilder rb{ctx, 2, 0, 1};
+
+    rb.Push(RESULT_SUCCESS);
+    rb.PushIpcInterface<IAudioRenderer>(params);
+}
+
 bool AudRenU::IsFeatureSupported(AudioFeatures feature, u32_le revision) const {
    u32_be version_num = (revision - Common::MakeMagic('R', 'E', 'V', '0')); // Byte swap
    switch (feature) {
--- a/src/core/hle/service/audio/audren_u.h
+++ b/src/core/hle/service/audio/audren_u.h
@@ -21,8 +21,11 @@ private:
    void OpenAudioRenderer(Kernel::HLERequestContext& ctx);
    void GetAudioRendererWorkBufferSize(Kernel::HLERequestContext& ctx);
    void GetAudioDeviceService(Kernel::HLERequestContext& ctx);
+    void OpenAudioRendererAuto(Kernel::HLERequestContext& ctx);
    void GetAudioDeviceServiceWithRevisionInfo(Kernel::HLERequestContext& ctx);

+    void OpenAudioRendererImpl(Kernel::HLERequestContext& ctx);
+
    enum class AudioFeatures : u32 {
        Splitter,
    };
--- a/src/core/hle/service/audio/errors.h
+++ b/src/core/hle/service/audio/errors.h
@@ -0,0 +1,15 @@
+// Copyright 2019 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "core/hle/result.h"
+
+namespace Service::Audio {
+
+constexpr ResultCode ERR_OPERATION_FAILED{ErrorModule::Audio, 2};
+constexpr ResultCode ERR_BUFFER_COUNT_EXCEEDED{ErrorModule::Audio, 8};
+constexpr ResultCode ERR_NOT_SUPPORTED{ErrorModule::Audio, 513};
+
+} // namespace Service::Audio
--- a/src/core/hle/service/hid/hid.h
+++ b/src/core/hle/service/hid/hid.h
@@ -15,7 +15,7 @@ namespace Kernel {
 class SharedMemory;
 }

-namespace SM {
+namespace Service::SM {
 class ServiceManager;
 }

--- a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp
@@ -23,7 +23,7 @@ u32 nvdisp_disp0::ioctl(Ioctl command, const std::vector<u8>& input, std::vector

 void nvdisp_disp0::flip(u32 buffer_handle, u32 offset, u32 format, u32 width, u32 height,
                        u32 stride, NVFlinger::BufferQueue::BufferTransformFlags transform,
-                        const MathUtil::Rectangle<int>& crop_rect) {
+                        const Common::Rectangle<int>& crop_rect) {
    VAddr addr = nvmap_dev->GetObjectAddress(buffer_handle);
    LOG_TRACE(Service,
              "Drawing from address {:X} offset {:08X} Width {} Height {} Stride {} Format {}",
@@ -36,7 +36,7 @@ void nvdisp_disp0::flip(u32 buffer_handle, u32 offset, u32 format, u32 width, u3

    auto& instance = Core::System::GetInstance();
    instance.GetPerfStats().EndGameFrame();
-    instance.Renderer().SwapBuffers(framebuffer);
+    instance.GPU().SwapBuffers(framebuffer);
 }

 } // namespace Service::Nvidia::Devices
--- a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.h
+++ b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.h
@@ -25,7 +25,7 @@ public:
    /// Performs a screen flip, drawing the buffer pointed to by the handle.
    void flip(u32 buffer_handle, u32 offset, u32 format, u32 width, u32 height, u32 stride,
              NVFlinger::BufferQueue::BufferTransformFlags transform,
-              const MathUtil::Rectangle<int>& crop_rect);
+              const Common::Rectangle<int>& crop_rect);

 private:
    std::shared_ptr<nvmap> nvmap_dev;
--- a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp
@@ -178,7 +178,7 @@ u32 nvhost_as_gpu::UnmapBuffer(const std::vector<u8>& input, std::vector<u8>& ou
    auto& gpu = system_instance.GPU();
    auto cpu_addr = gpu.MemoryManager().GpuToCpuAddress(params.offset);
    ASSERT(cpu_addr);
-    system_instance.Renderer().Rasterizer().FlushAndInvalidateRegion(*cpu_addr, itr->second.size);
+    gpu.FlushAndInvalidateRegion(*cpu_addr, itr->second.size);

    params.offset = gpu.MemoryManager().UnmapBuffer(params.offset, itr->second.size);

--- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
@@ -136,16 +136,6 @@ u32 nvhost_gpu::AllocateObjectContext(const std::vector<u8>& input, std::vector<
    return 0;
 }

-static void PushGPUEntries(Tegra::CommandList&& entries) {
-    if (entries.empty()) {
-        return;
-    }
-
-    auto& dma_pusher{Core::System::GetInstance().GPU().DmaPusher()};
-    dma_pusher.Push(std::move(entries));
-    dma_pusher.DispatchCalls();
-}
-
 u32 nvhost_gpu::SubmitGPFIFO(const std::vector<u8>& input, std::vector<u8>& output) {
    if (input.size() < sizeof(IoctlSubmitGpfifo)) {
        UNIMPLEMENTED();
@@ -163,7 +153,7 @@ u32 nvhost_gpu::SubmitGPFIFO(const std::vector<u8>& input, std::vector<u8>& outp
    std::memcpy(entries.data(), &input[sizeof(IoctlSubmitGpfifo)],
                params.num_entries * sizeof(Tegra::CommandListHeader));

-    PushGPUEntries(std::move(entries));
+    Core::System::GetInstance().GPU().PushGPUEntries(std::move(entries));

    params.fence_out.id = 0;
    params.fence_out.value = 0;
@@ -184,7 +174,7 @@ u32 nvhost_gpu::KickoffPB(const std::vector<u8>& input, std::vector<u8>& output)
    Memory::ReadBlock(params.address, entries.data(),
                      params.num_entries * sizeof(Tegra::CommandListHeader));

-    PushGPUEntries(std::move(entries));
+    Core::System::GetInstance().GPU().PushGPUEntries(std::move(entries));

    params.fence_out.id = 0;
    params.fence_out.value = 0;
--- a/src/core/hle/service/nvflinger/buffer_queue.cpp
+++ b/src/core/hle/service/nvflinger/buffer_queue.cpp
@@ -63,7 +63,7 @@ const IGBPBuffer& BufferQueue::RequestBuffer(u32 slot) const {
 }

 void BufferQueue::QueueBuffer(u32 slot, BufferTransformFlags transform,
-                              const MathUtil::Rectangle<int>& crop_rect) {
+                              const Common::Rectangle<int>& crop_rect) {
    auto itr = std::find_if(queue.begin(), queue.end(),
                            [&](const Buffer& buffer) { return buffer.slot == slot; });
    ASSERT(itr != queue.end());
--- a/src/core/hle/service/nvflinger/buffer_queue.h
+++ b/src/core/hle/service/nvflinger/buffer_queue.h
@@ -67,14 +67,14 @@ public:
        Status status = Status::Free;
        IGBPBuffer igbp_buffer;
        BufferTransformFlags transform;
-        MathUtil::Rectangle<int> crop_rect;
+        Common::Rectangle<int> crop_rect;
    };

    void SetPreallocatedBuffer(u32 slot, const IGBPBuffer& igbp_buffer);
    std::optional<u32> DequeueBuffer(u32 width, u32 height);
    const IGBPBuffer& RequestBuffer(u32 slot) const;
    void QueueBuffer(u32 slot, BufferTransformFlags transform,
-                     const MathUtil::Rectangle<int>& crop_rect);
+                     const Common::Rectangle<int>& crop_rect);
    std::optional<std::reference_wrapper<const Buffer>> AcquireBuffer();
    void ReleaseBuffer(u32 slot);
    u32 Query(QueryType type);
--- a/src/core/hle/service/nvflinger/nvflinger.cpp
+++ b/src/core/hle/service/nvflinger/nvflinger.cpp
@@ -14,11 +14,12 @@
 #include "core/core_timing_util.h"
 #include "core/hle/kernel/kernel.h"
 #include "core/hle/kernel/readable_event.h"
-#include "core/hle/kernel/writable_event.h"
 #include "core/hle/service/nvdrv/devices/nvdisp_disp0.h"
 #include "core/hle/service/nvdrv/nvdrv.h"
 #include "core/hle/service/nvflinger/buffer_queue.h"
 #include "core/hle/service/nvflinger/nvflinger.h"
+#include "core/hle/service/vi/display/vi_display.h"
+#include "core/hle/service/vi/layer/vi_layer.h"
 #include "core/perf_stats.h"
 #include "video_core/renderer_base.h"

@@ -28,6 +29,12 @@ constexpr std::size_t SCREEN_REFRESH_RATE = 60;
 constexpr u64 frame_ticks = static_cast<u64>(Core::Timing::BASE_CLOCK_RATE / SCREEN_REFRESH_RATE);

 NVFlinger::NVFlinger(Core::Timing::CoreTiming& core_timing) : core_timing{core_timing} {
+    displays.emplace_back(0, "Default");
+    displays.emplace_back(1, "External");
+    displays.emplace_back(2, "Edid");
+    displays.emplace_back(3, "Internal");
+    displays.emplace_back(4, "Null");
+
    // Schedule the screen composition events
    composition_event =
        core_timing.RegisterEvent("ScreenComposition", [this](u64 userdata, int cycles_late) {
@@ -52,13 +59,14 @@ std::optional<u64> NVFlinger::OpenDisplay(std::string_view name) {
    // TODO(Subv): Currently we only support the Default display.
    ASSERT(name == "Default");

-    const auto itr = std::find_if(displays.begin(), displays.end(),
-                                  [&](const Display& display) { return display.name == name; });
+    const auto itr =
+        std::find_if(displays.begin(), displays.end(),
+                     [&](const VI::Display& display) { return display.GetName() == name; });
    if (itr == displays.end()) {
        return {};
    }

-    return itr->id;
+    return itr->GetID();
 }

 std::optional<u64> NVFlinger::CreateLayer(u64 display_id) {
@@ -68,13 +76,10 @@ std::optional<u64> NVFlinger::CreateLayer(u64 display_id) {
        return {};
    }

-    ASSERT_MSG(display->layers.empty(), "Only one layer is supported per display at the moment");
-
    const u64 layer_id = next_layer_id++;
    const u32 buffer_queue_id = next_buffer_queue_id++;
-    auto buffer_queue = std::make_shared<BufferQueue>(buffer_queue_id, layer_id);
-    display->layers.emplace_back(layer_id, buffer_queue);
-    buffer_queues.emplace_back(std::move(buffer_queue));
+    buffer_queues.emplace_back(buffer_queue_id, layer_id);
+    display->CreateLayer(layer_id, buffer_queues.back());
    return layer_id;
 }

@@ -85,7 +90,7 @@ std::optional<u32> NVFlinger::FindBufferQueueId(u64 display_id, u64 layer_id) co
        return {};
    }

-    return layer->buffer_queue->GetId();
+    return layer->GetBufferQueue().GetId();
 }

 Kernel::SharedPtr<Kernel::ReadableEvent> NVFlinger::FindVsyncEvent(u64 display_id) const {
@@ -95,20 +100,29 @@ Kernel::SharedPtr<Kernel::ReadableEvent> NVFlinger::FindVsyncEvent(u64 display_i
        return nullptr;
    }

-    return display->vsync_event.readable;
+    return display->GetVSyncEvent();
 }

-std::shared_ptr<BufferQueue> NVFlinger::FindBufferQueue(u32 id) const {
+BufferQueue& NVFlinger::FindBufferQueue(u32 id) {
    const auto itr = std::find_if(buffer_queues.begin(), buffer_queues.end(),
-                                  [&](const auto& queue) { return queue->GetId() == id; });
+                                  [id](const auto& queue) { return queue.GetId() == id; });

    ASSERT(itr != buffer_queues.end());
    return *itr;
 }

-Display* NVFlinger::FindDisplay(u64 display_id) {
-    const auto itr = std::find_if(displays.begin(), displays.end(),
-                                  [&](const Display& display) { return display.id == display_id; });
+const BufferQueue& NVFlinger::FindBufferQueue(u32 id) const {
+    const auto itr = std::find_if(buffer_queues.begin(), buffer_queues.end(),
+                                  [id](const auto& queue) { return queue.GetId() == id; });
+
+    ASSERT(itr != buffer_queues.end());
+    return *itr;
+}
+
+VI::Display* NVFlinger::FindDisplay(u64 display_id) {
+    const auto itr =
+        std::find_if(displays.begin(), displays.end(),
+                     [&](const VI::Display& display) { return display.GetID() == display_id; });

    if (itr == displays.end()) {
        return nullptr;
@@ -117,9 +131,10 @@ Display* NVFlinger::FindDisplay(u64 display_id) {
    return &*itr;
 }

-const Display* NVFlinger::FindDisplay(u64 display_id) const {
-    const auto itr = std::find_if(displays.begin(), displays.end(),
-                                  [&](const Display& display) { return display.id == display_id; });
+const VI::Display* NVFlinger::FindDisplay(u64 display_id) const {
+    const auto itr =
+        std::find_if(displays.begin(), displays.end(),
+                     [&](const VI::Display& display) { return display.GetID() == display_id; });

    if (itr == displays.end()) {
        return nullptr;
@@ -128,57 +143,41 @@ const Display* NVFlinger::FindDisplay(u64 display_id) const {
    return &*itr;
 }

-Layer* NVFlinger::FindLayer(u64 display_id, u64 layer_id) {
+VI::Layer* NVFlinger::FindLayer(u64 display_id, u64 layer_id) {
    auto* const display = FindDisplay(display_id);

    if (display == nullptr) {
        return nullptr;
    }

-    const auto itr = std::find_if(display->layers.begin(), display->layers.end(),
-                                  [&](const Layer& layer) { return layer.id == layer_id; });
-
-    if (itr == display->layers.end()) {
-        return nullptr;
-    }
-
-    return &*itr;
+    return display->FindLayer(layer_id);
 }

-const Layer* NVFlinger::FindLayer(u64 display_id, u64 layer_id) const {
+const VI::Layer* NVFlinger::FindLayer(u64 display_id, u64 layer_id) const {
    const auto* const display = FindDisplay(display_id);

    if (display == nullptr) {
        return nullptr;
    }

-    const auto itr = std::find_if(display->layers.begin(), display->layers.end(),
-                                  [&](const Layer& layer) { return layer.id == layer_id; });
-
-    if (itr == display->layers.end()) {
-        return nullptr;
-    }
-
-    return &*itr;
+    return display->FindLayer(layer_id);
 }

 void NVFlinger::Compose() {
    for (auto& display : displays) {
        // Trigger vsync for this display at the end of drawing
-        SCOPE_EXIT({ display.vsync_event.writable->Signal(); });
+        SCOPE_EXIT({ display.SignalVSyncEvent(); });

        // Don't do anything for displays without layers.
-        if (display.layers.empty())
+        if (!display.HasLayers())
            continue;

        // TODO(Subv): Support more than 1 layer.
-        ASSERT_MSG(display.layers.size() == 1, "Max 1 layer per display is supported");
-
-        Layer& layer = display.layers[0];
-        auto& buffer_queue = layer.buffer_queue;
+        VI::Layer& layer = display.GetLayer(0);
+        auto& buffer_queue = layer.GetBufferQueue();

        // Search for a queued buffer and acquire it
-        auto buffer = buffer_queue->AcquireBuffer();
+        auto buffer = buffer_queue.AcquireBuffer();

        MicroProfileFlip();

@@ -187,7 +186,7 @@ void NVFlinger::Compose() {

            // There was no queued buffer to draw, render previous frame
            system_instance.GetPerfStats().EndGameFrame();
-            system_instance.Renderer().SwapBuffers({});
+            system_instance.GPU().SwapBuffers({});
            continue;
        }

@@ -203,19 +202,8 @@ void NVFlinger::Compose() {
                     igbp_buffer.width, igbp_buffer.height, igbp_buffer.stride,
                     buffer->get().transform, buffer->get().crop_rect);

-        buffer_queue->ReleaseBuffer(buffer->get().slot);
+        buffer_queue.ReleaseBuffer(buffer->get().slot);
    }
 }

-Layer::Layer(u64 id, std::shared_ptr<BufferQueue> queue) : id(id), buffer_queue(std::move(queue)) {}
-Layer::~Layer() = default;
-
-Display::Display(u64 id, std::string name) : id(id), name(std::move(name)) {
-    auto& kernel = Core::System::GetInstance().Kernel();
-    vsync_event = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::Sticky,
-                                                         fmt::format("Display VSync Event {}", id));
-}
-
-Display::~Display() = default;
-
 } // namespace Service::NVFlinger
--- a/src/core/hle/service/nvflinger/nvflinger.h
+++ b/src/core/hle/service/nvflinger/nvflinger.h
@@ -4,7 +4,6 @@

 #pragma once

-#include <array>
 #include <memory>
 #include <optional>
 #include <string>
@@ -26,31 +25,17 @@ class WritableEvent;

 namespace Service::Nvidia {
 class Module;
-}
+} // namespace Service::Nvidia
+
+namespace Service::VI {
+class Display;
+class Layer;
+} // namespace Service::VI

 namespace Service::NVFlinger {

 class BufferQueue;

-struct Layer {
-    Layer(u64 id, std::shared_ptr<BufferQueue> queue);
-    ~Layer();
-
-    u64 id;
-    std::shared_ptr<BufferQueue> buffer_queue;
-};
-
-struct Display {
-    Display(u64 id, std::string name);
-    ~Display();
-
-    u64 id;
-    std::string name;
-
-    std::vector<Layer> layers;
-    Kernel::EventPair vsync_event;
-};
-
 class NVFlinger final {
 public:
    explicit NVFlinger(Core::Timing::CoreTiming& core_timing);
@@ -80,7 +65,10 @@ public:
    Kernel::SharedPtr<Kernel::ReadableEvent> FindVsyncEvent(u64 display_id) const;

    /// Obtains a buffer queue identified by the ID.
-    std::shared_ptr<BufferQueue> FindBufferQueue(u32 id) const;
+    BufferQueue& FindBufferQueue(u32 id);
+
+    /// Obtains a buffer queue identified by the ID.
+    const BufferQueue& FindBufferQueue(u32 id) const;

    /// Performs a composition request to the emulated nvidia GPU and triggers the vsync events when
    /// finished.
@@ -88,27 +76,21 @@ public:

 private:
    /// Finds the display identified by the specified ID.
-    Display* FindDisplay(u64 display_id);
+    VI::Display* FindDisplay(u64 display_id);

    /// Finds the display identified by the specified ID.
-    const Display* FindDisplay(u64 display_id) const;
+    const VI::Display* FindDisplay(u64 display_id) const;

    /// Finds the layer identified by the specified ID in the desired display.
-    Layer* FindLayer(u64 display_id, u64 layer_id);
+    VI::Layer* FindLayer(u64 display_id, u64 layer_id);

    /// Finds the layer identified by the specified ID in the desired display.
-    const Layer* FindLayer(u64 display_id, u64 layer_id) const;
+    const VI::Layer* FindLayer(u64 display_id, u64 layer_id) const;

    std::shared_ptr<Nvidia::Module> nvdrv;

-    std::array<Display, 5> displays{{
-        {0, "Default"},
-        {1, "External"},
-        {2, "Edid"},
-        {3, "Internal"},
-        {4, "Null"},
-    }};
-    std::vector<std::shared_ptr<BufferQueue>> buffer_queues;
+    std::vector<VI::Display> displays;
+    std::vector<BufferQueue> buffer_queues;

    /// Id to use for the next layer that is created, this counter is shared among all displays.
    u64 next_layer_id = 1;
--- a/src/core/hle/service/vi/display/vi_display.cpp
+++ b/src/core/hle/service/vi/display/vi_display.cpp
@@ -0,0 +1,71 @@
+// Copyright 2019 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <utility>
+
+#include <fmt/format.h>
+
+#include "common/assert.h"
+#include "core/core.h"
+#include "core/hle/kernel/readable_event.h"
+#include "core/hle/service/vi/display/vi_display.h"
+#include "core/hle/service/vi/layer/vi_layer.h"
+
+namespace Service::VI {
+
+Display::Display(u64 id, std::string name) : id{id}, name{std::move(name)} {
+    auto& kernel = Core::System::GetInstance().Kernel();
+    vsync_event = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::Sticky,
+                                                         fmt::format("Display VSync Event {}", id));
+}
+
+Display::~Display() = default;
+
+Layer& Display::GetLayer(std::size_t index) {
+    return layers.at(index);
+}
+
+const Layer& Display::GetLayer(std::size_t index) const {
+    return layers.at(index);
+}
+
+Kernel::SharedPtr<Kernel::ReadableEvent> Display::GetVSyncEvent() const {
+    return vsync_event.readable;
+}
+
+void Display::SignalVSyncEvent() {
+    vsync_event.writable->Signal();
+}
+
+void Display::CreateLayer(u64 id, NVFlinger::BufferQueue& buffer_queue) {
+    // TODO(Subv): Support more than 1 layer.
+    ASSERT_MSG(layers.empty(), "Only one layer is supported per display at the moment");
+
+    layers.emplace_back(id, buffer_queue);
+}
+
+Layer* Display::FindLayer(u64 id) {
+    const auto itr = std::find_if(layers.begin(), layers.end(),
+                                  [id](const VI::Layer& layer) { return layer.GetID() == id; });
+
+    if (itr == layers.end()) {
+        return nullptr;
+    }
+
+    return &*itr;
+}
+
+const Layer* Display::FindLayer(u64 id) const {
+    const auto itr = std::find_if(layers.begin(), layers.end(),
+                                  [id](const VI::Layer& layer) { return layer.GetID() == id; });
+
+    if (itr == layers.end()) {
+        return nullptr;
+    }
+
+    return &*itr;
+}
+
+} // namespace Service::VI
--- a/src/core/hle/service/vi/display/vi_display.h
+++ b/src/core/hle/service/vi/display/vi_display.h
@@ -0,0 +1,98 @@
+// Copyright 2019 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "common/common_types.h"
+#include "core/hle/kernel/writable_event.h"
+
+namespace Service::NVFlinger {
+class BufferQueue;
+}
+
+namespace Service::VI {
+
+class Layer;
+
+/// Represents a single display type
+class Display {
+public:
+    /// Constructs a display with a given unique ID and name.
+    ///
+    /// @param id   The unique ID for this display.
+    /// @param name The name for this display.
+    ///
+    Display(u64 id, std::string name);
+    ~Display();
+
+    Display(const Display&) = delete;
+    Display& operator=(const Display&) = delete;
+
+    Display(Display&&) = default;
+    Display& operator=(Display&&) = default;
+
+    /// Gets the unique ID assigned to this display.
+    u64 GetID() const {
+        return id;
+    }
+
+    /// Gets the name of this display
+    const std::string& GetName() const {
+        return name;
+    }
+
+    /// Whether or not this display has any layers added to it.
+    bool HasLayers() const {
+        return !layers.empty();
+    }
+
+    /// Gets a layer for this display based off an index.
+    Layer& GetLayer(std::size_t index);
+
+    /// Gets a layer for this display based off an index.
+    const Layer& GetLayer(std::size_t index) const;
+
+    /// Gets the readable vsync event.
+    Kernel::SharedPtr<Kernel::ReadableEvent> GetVSyncEvent() const;
+
+    /// Signals the internal vsync event.
+    void SignalVSyncEvent();
+
+    /// Creates and adds a layer to this display with the given ID.
+    ///
+    /// @param id           The ID to assign to the created layer.
+    /// @param buffer_queue The buffer queue for the layer instance to use.
+    ///
+    void CreateLayer(u64 id, NVFlinger::BufferQueue& buffer_queue);
+
+    /// Attempts to find a layer with the given ID.
+    ///
+    /// @param id The layer ID.
+    ///
+    /// @returns If found, the Layer instance with the given ID.
+    ///          If not found, then nullptr is returned.
+    ///
+    Layer* FindLayer(u64 id);
+
+    /// Attempts to find a layer with the given ID.
+    ///
+    /// @param id The layer ID.
+    ///
+    /// @returns If found, the Layer instance with the given ID.
+    ///          If not found, then nullptr is returned.
+    ///
+    const Layer* FindLayer(u64 id) const;
+
+private:
+    u64 id;
+    std::string name;
+
+    std::vector<Layer> layers;
+    Kernel::EventPair vsync_event;
+};
+
+} // namespace Service::VI
--- a/src/core/hle/service/vi/layer/vi_layer.cpp
+++ b/src/core/hle/service/vi/layer/vi_layer.cpp
@@ -0,0 +1,13 @@
+// Copyright 2019 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "core/hle/service/vi/layer/vi_layer.h"
+
+namespace Service::VI {
+
+Layer::Layer(u64 id, NVFlinger::BufferQueue& queue) : id{id}, buffer_queue{queue} {}
+
+Layer::~Layer() = default;
+
+} // namespace Service::VI
--- a/src/core/hle/service/vi/layer/vi_layer.h
+++ b/src/core/hle/service/vi/layer/vi_layer.h
@@ -0,0 +1,52 @@
+// Copyright 2019 yuzu emulator team
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "common/common_types.h"
+
+namespace Service::NVFlinger {
+class BufferQueue;
+}
+
+namespace Service::VI {
+
+/// Represents a single display layer.
+class Layer {
+public:
+    /// Constructs a layer with a given ID and buffer queue.
+    ///
+    /// @param id    The ID to assign to this layer.
+    /// @param queue The buffer queue for this layer to use.
+    ///
+    Layer(u64 id, NVFlinger::BufferQueue& queue);
+    ~Layer();
+
+    Layer(const Layer&) = delete;
+    Layer& operator=(const Layer&) = delete;
+
+    Layer(Layer&&) = default;
+    Layer& operator=(Layer&&) = delete;
+
+    /// Gets the ID for this layer.
+    u64 GetID() const {
+        return id;
+    }
+
+    /// Gets a reference to the buffer queue this layer is using.
+    NVFlinger::BufferQueue& GetBufferQueue() {
+        return buffer_queue;
+    }
+
+    /// Gets a const reference to the buffer queue this layer is using.
+    const NVFlinger::BufferQueue& GetBufferQueue() const {
+        return buffer_queue;
+    }
+
+private:
+    u64 id;
+    NVFlinger::BufferQueue& buffer_queue;
+};
+
+} // namespace Service::VI
--- a/src/core/hle/service/vi/vi.cpp
+++ b/src/core/hle/service/vi/vi.cpp
@@ -420,7 +420,7 @@ public:
        u32_le fence_is_valid;
        std::array<Fence, 2> fences;

-        MathUtil::Rectangle<int> GetCropRect() const {
+        Common::Rectangle<int> GetCropRect() const {
            return {crop_left, crop_top, crop_right, crop_bottom};
        }
    };
@@ -525,7 +525,7 @@ private:
        LOG_DEBUG(Service_VI, "called. id=0x{:08X} transaction={:X}, flags=0x{:08X}", id,
                  static_cast<u32>(transaction), flags);

-        auto buffer_queue = nv_flinger->FindBufferQueue(id);
+        auto& buffer_queue = nv_flinger->FindBufferQueue(id);

        if (transaction == TransactionId::Connect) {
            IGBPConnectRequestParcel request{ctx.ReadBuffer()};
@@ -538,7 +538,7 @@ private:
        } else if (transaction == TransactionId::SetPreallocatedBuffer) {
            IGBPSetPreallocatedBufferRequestParcel request{ctx.ReadBuffer()};

-            buffer_queue->SetPreallocatedBuffer(request.data.slot, request.buffer);
+            buffer_queue.SetPreallocatedBuffer(request.data.slot, request.buffer);

            IGBPSetPreallocatedBufferResponseParcel response{};
            ctx.WriteBuffer(response.Serialize());
@@ -546,7 +546,7 @@ private:
            IGBPDequeueBufferRequestParcel request{ctx.ReadBuffer()};
            const u32 width{request.data.width};
            const u32 height{request.data.height};
-            std::optional<u32> slot = buffer_queue->DequeueBuffer(width, height);
+            std::optional<u32> slot = buffer_queue.DequeueBuffer(width, height);

            if (slot) {
                // Buffer is available
@@ -559,8 +559,8 @@ private:
                    [=](Kernel::SharedPtr<Kernel::Thread> thread, Kernel::HLERequestContext& ctx,
                        Kernel::ThreadWakeupReason reason) {
                        // Repeat TransactParcel DequeueBuffer when a buffer is available
-                        auto buffer_queue = nv_flinger->FindBufferQueue(id);
-                        std::optional<u32> slot = buffer_queue->DequeueBuffer(width, height);
+                        auto& buffer_queue = nv_flinger->FindBufferQueue(id);
+                        std::optional<u32> slot = buffer_queue.DequeueBuffer(width, height);
                        ASSERT_MSG(slot != std::nullopt, "Could not dequeue buffer.");

                        IGBPDequeueBufferResponseParcel response{*slot};
@@ -568,28 +568,28 @@ private:
                        IPC::ResponseBuilder rb{ctx, 2};
                        rb.Push(RESULT_SUCCESS);
                    },
-                    buffer_queue->GetWritableBufferWaitEvent());
+                    buffer_queue.GetWritableBufferWaitEvent());
            }
        } else if (transaction == TransactionId::RequestBuffer) {
            IGBPRequestBufferRequestParcel request{ctx.ReadBuffer()};

-            auto& buffer = buffer_queue->RequestBuffer(request.slot);
+            auto& buffer = buffer_queue.RequestBuffer(request.slot);

            IGBPRequestBufferResponseParcel response{buffer};
            ctx.WriteBuffer(response.Serialize());
        } else if (transaction == TransactionId::QueueBuffer) {
            IGBPQueueBufferRequestParcel request{ctx.ReadBuffer()};

-            buffer_queue->QueueBuffer(request.data.slot, request.data.transform,
-                                      request.data.GetCropRect());
+            buffer_queue.QueueBuffer(request.data.slot, request.data.transform,
+                                     request.data.GetCropRect());

            IGBPQueueBufferResponseParcel response{1280, 720};
            ctx.WriteBuffer(response.Serialize());
        } else if (transaction == TransactionId::Query) {
            IGBPQueryRequestParcel request{ctx.ReadBuffer()};

-            u32 value =
-                buffer_queue->Query(static_cast<NVFlinger::BufferQueue::QueryType>(request.type));
+            const u32 value =
+                buffer_queue.Query(static_cast<NVFlinger::BufferQueue::QueryType>(request.type));

            IGBPQueryResponseParcel response{value};
            ctx.WriteBuffer(response.Serialize());
@@ -629,12 +629,12 @@ private:

        LOG_WARNING(Service_VI, "(STUBBED) called id={}, unknown={:08X}", id, unknown);

-        const auto buffer_queue = nv_flinger->FindBufferQueue(id);
+        const auto& buffer_queue = nv_flinger->FindBufferQueue(id);

        // TODO(Subv): Find out what this actually is.
        IPC::ResponseBuilder rb{ctx, 2, 1};
        rb.Push(RESULT_SUCCESS);
-        rb.PushCopyObjects(buffer_queue->GetBufferWaitEvent());
+        rb.PushCopyObjects(buffer_queue.GetBufferWaitEvent());
    }

    std::shared_ptr<NVFlinger::NVFlinger> nv_flinger;
@@ -752,6 +752,7 @@ public:
            {1102, nullptr, "GetDisplayResolution"},
            {2010, &IManagerDisplayService::CreateManagedLayer, "CreateManagedLayer"},
            {2011, nullptr, "DestroyManagedLayer"},
+            {2012, nullptr, "CreateStrayLayer"},
            {2050, nullptr, "CreateIndirectLayer"},
            {2051, nullptr, "DestroyIndirectLayer"},
            {2052, nullptr, "CreateIndirectProducerEndPoint"},
--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@@ -71,15 +71,20 @@ static void MapPages(PageTable& page_table, VAddr base, u64 size, u8* memory, Pa
                                 FlushMode::FlushAndInvalidate);

    VAddr end = base + size;
-    while (base != end) {
-        ASSERT_MSG(base < page_table.pointers.size(), "out of range mapping at {:016X}", base);
+    ASSERT_MSG(end <= page_table.pointers.size(), "out of range mapping at {:016X}",
+               base + page_table.pointers.size());

-        page_table.attributes[base] = type;
-        page_table.pointers[base] = memory;
+    std::fill(page_table.attributes.begin() + base, page_table.attributes.begin() + end, type);

-        base += 1;
-        if (memory != nullptr)
+    if (memory == nullptr) {
+        std::fill(page_table.pointers.begin() + base, page_table.pointers.begin() + end, memory);
+    } else {
+        while (base != end) {
+            page_table.pointers[base] = memory;
+
+            base += 1;
            memory += PAGE_SIZE;
+        }
    }
 }

@@ -166,9 +171,6 @@ T Read(const VAddr vaddr) {
        return value;
    }

-    // The memory access might do an MMIO or cached access, so we have to lock the HLE kernel state
-    std::lock_guard<std::recursive_mutex> lock(HLE::g_hle_lock);
-
    PageType type = current_page_table->attributes[vaddr >> PAGE_BITS];
    switch (type) {
    case PageType::Unmapped:
@@ -199,9 +201,6 @@ void Write(const VAddr vaddr, const T data) {
        return;
    }

-    // The memory access might do an MMIO or cached access, so we have to lock the HLE kernel state
-    std::lock_guard<std::recursive_mutex> lock(HLE::g_hle_lock);
-
    PageType type = current_page_table->attributes[vaddr >> PAGE_BITS];
    switch (type) {
    case PageType::Unmapped:
@@ -357,16 +356,16 @@ void RasterizerFlushVirtualRegion(VAddr start, u64 size, FlushMode mode) {
        const VAddr overlap_end = std::min(end, region_end);
        const VAddr overlap_size = overlap_end - overlap_start;

-        auto& rasterizer = system_instance.Renderer().Rasterizer();
+        auto& gpu = system_instance.GPU();
        switch (mode) {
        case FlushMode::Flush:
-            rasterizer.FlushRegion(overlap_start, overlap_size);
+            gpu.FlushRegion(overlap_start, overlap_size);
            break;
        case FlushMode::Invalidate:
-            rasterizer.InvalidateRegion(overlap_start, overlap_size);
+            gpu.InvalidateRegion(overlap_start, overlap_size);
            break;
        case FlushMode::FlushAndInvalidate:
-            rasterizer.FlushAndInvalidateRegion(overlap_start, overlap_size);
+            gpu.FlushAndInvalidateRegion(overlap_start, overlap_size);
            break;
        }
    };
--- a/src/core/settings.h
+++ b/src/core/settings.h
@@ -393,6 +393,7 @@ struct Values {
    u16 frame_limit;
    bool use_disk_shader_cache;
    bool use_accurate_gpu_emulation;
+    bool use_asynchronous_gpu_emulation;

    float bg_red;
    float bg_green;
--- a/src/core/telemetry_session.cpp
+++ b/src/core/telemetry_session.cpp
@@ -162,6 +162,8 @@ TelemetrySession::TelemetrySession() {
             Settings::values.use_disk_shader_cache);
    AddField(Telemetry::FieldType::UserConfig, "Renderer_UseAccurateGpuEmulation",
             Settings::values.use_accurate_gpu_emulation);
+    AddField(Telemetry::FieldType::UserConfig, "Renderer_UseAsynchronousGpuEmulation",
+             Settings::values.use_asynchronous_gpu_emulation);
    AddField(Telemetry::FieldType::UserConfig, "System_UseDockedMode",
             Settings::values.use_docked_mode);
 }
--- a/src/input_common/motion_emu.cpp
+++ b/src/input_common/motion_emu.cpp
@@ -32,12 +32,12 @@ public:
    }

    void BeginTilt(int x, int y) {
-        mouse_origin = Math::MakeVec(x, y);
+        mouse_origin = Common::MakeVec(x, y);
        is_tilting = true;
    }

    void Tilt(int x, int y) {
-        auto mouse_move = Math::MakeVec(x, y) - mouse_origin;
+        auto mouse_move = Common::MakeVec(x, y) - mouse_origin;
        if (is_tilting) {
            std::lock_guard<std::mutex> guard(tilt_mutex);
            if (mouse_move.x == 0 && mouse_move.y == 0) {
@@ -45,7 +45,7 @@ public:
            } else {
                tilt_direction = mouse_move.Cast<float>();
                tilt_angle =
-                    std::clamp(tilt_direction.Normalize() * sensitivity, 0.0f, MathUtil::PI * 0.5f);
+                    std::clamp(tilt_direction.Normalize() * sensitivity, 0.0f, Common::PI * 0.5f);
            }
        }
    }
@@ -56,7 +56,7 @@ public:
        is_tilting = false;
    }

-    std::tuple<Math::Vec3<float>, Math::Vec3<float>> GetStatus() {
+    std::tuple<Common::Vec3<float>, Common::Vec3<float>> GetStatus() {
        std::lock_guard<std::mutex> guard(status_mutex);
        return status;
    }
@@ -66,17 +66,17 @@ private:
    const std::chrono::steady_clock::duration update_duration;
    const float sensitivity;

-    Math::Vec2<int> mouse_origin;
+    Common::Vec2<int> mouse_origin;

    std::mutex tilt_mutex;
-    Math::Vec2<float> tilt_direction;
+    Common::Vec2<float> tilt_direction;
    float tilt_angle = 0;

    bool is_tilting = false;

    Common::Event shutdown_event;

-    std::tuple<Math::Vec3<float>, Math::Vec3<float>> status;
+    std::tuple<Common::Vec3<float>, Common::Vec3<float>> status;
    std::mutex status_mutex;

    // Note: always keep the thread declaration at the end so that other objects are initialized
@@ -85,8 +85,8 @@ private:

    void MotionEmuThread() {
        auto update_time = std::chrono::steady_clock::now();
-        Math::Quaternion<float> q = MakeQuaternion(Math::Vec3<float>(), 0);
-        Math::Quaternion<float> old_q;
+        Common::Quaternion<float> q = Common::MakeQuaternion(Common::Vec3<float>(), 0);
+        Common::Quaternion<float> old_q;

        while (!shutdown_event.WaitUntil(update_time)) {
            update_time += update_duration;
@@ -96,18 +96,18 @@ private:
                std::lock_guard<std::mutex> guard(tilt_mutex);

                // Find the quaternion describing current 3DS tilting
-                q = MakeQuaternion(Math::MakeVec(-tilt_direction.y, 0.0f, tilt_direction.x),
-                                   tilt_angle);
+                q = Common::MakeQuaternion(
+                    Common::MakeVec(-tilt_direction.y, 0.0f, tilt_direction.x), tilt_angle);
            }

            auto inv_q = q.Inverse();

            // Set the gravity vector in world space
-            auto gravity = Math::MakeVec(0.0f, -1.0f, 0.0f);
+            auto gravity = Common::MakeVec(0.0f, -1.0f, 0.0f);

            // Find the angular rate vector in world space
            auto angular_rate = ((q - old_q) * inv_q).xyz * 2;
-            angular_rate *= 1000 / update_millisecond / MathUtil::PI * 180;
+            angular_rate *= 1000 / update_millisecond / Common::PI * 180;

            // Transform the two vectors from world space to 3DS space
            gravity = QuaternionRotate(inv_q, gravity);
@@ -131,7 +131,7 @@ public:
        device = std::make_shared<MotionEmuDevice>(update_millisecond, sensitivity);
    }

-    std::tuple<Math::Vec3<float>, Math::Vec3<float>> GetStatus() const override {
+    std::tuple<Common::Vec3<float>, Common::Vec3<float>> GetStatus() const override {
        return device->GetStatus();
    }

--- a/src/tests/core/arm/arm_test_common.cpp
+++ b/src/tests/core/arm/arm_test_common.cpp
@@ -13,11 +13,11 @@
 namespace ArmTests {

 TestEnvironment::TestEnvironment(bool mutable_memory_)
-    : mutable_memory(mutable_memory_), test_memory(std::make_shared<TestMemory>(this)) {
-
+    : mutable_memory(mutable_memory_),
+      test_memory(std::make_shared<TestMemory>(this)), kernel{Core::System::GetInstance()} {
    auto process = Kernel::Process::Create(kernel, "");
    kernel.MakeCurrentProcess(process.get());
-    page_table = &Core::CurrentProcess()->VMManager().page_table;
+    page_table = &process->VMManager().page_table;

    std::fill(page_table->pointers.begin(), page_table->pointers.end(), nullptr);
    page_table->special_regions.clear();
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -17,6 +17,12 @@ add_library(video_core STATIC
    engines/shader_header.h
    gpu.cpp
    gpu.h
+    gpu_asynch.cpp
+    gpu_asynch.h
+    gpu_synch.cpp
+    gpu_synch.h
+    gpu_thread.cpp
+    gpu_thread.h
    macro_interpreter.cpp
    macro_interpreter.h
    memory_manager.cpp
@@ -94,6 +100,8 @@ add_library(video_core STATIC
    surface.h
    textures/astc.cpp
    textures/astc.h
+    textures/convert.cpp
+    textures/convert.h
    textures/decoders.cpp
    textures/decoders.h
    textures/texture.h
@@ -104,8 +112,18 @@ add_library(video_core STATIC
 if (ENABLE_VULKAN)
    target_sources(video_core PRIVATE
        renderer_vulkan/declarations.h
+        renderer_vulkan/vk_buffer_cache.cpp
+        renderer_vulkan/vk_buffer_cache.h
        renderer_vulkan/vk_device.cpp
-        renderer_vulkan/vk_device.h)
+        renderer_vulkan/vk_device.h
+        renderer_vulkan/vk_memory_manager.cpp
+        renderer_vulkan/vk_memory_manager.h
+        renderer_vulkan/vk_resource_manager.cpp
+        renderer_vulkan/vk_resource_manager.h
+        renderer_vulkan/vk_scheduler.cpp
+        renderer_vulkan/vk_scheduler.h
+        renderer_vulkan/vk_stream_buffer.cpp
+        renderer_vulkan/vk_stream_buffer.h)

    target_include_directories(video_core PRIVATE ../../externals/Vulkan-Headers/include)
    target_compile_definitions(video_core PRIVATE HAS_VULKAN)
--- a/src/video_core/dma_pusher.cpp
+++ b/src/video_core/dma_pusher.cpp
@@ -33,18 +33,36 @@ void DmaPusher::DispatchCalls() {
 }

 bool DmaPusher::Step() {
-    if (dma_get != dma_put) {
-        // Push buffer non-empty, read a word
-        const auto address = gpu.MemoryManager().GpuToCpuAddress(dma_get);
-        ASSERT_MSG(address, "Invalid GPU address");
+    if (!ib_enable || dma_pushbuffer.empty()) {
+        // pushbuffer empty and IB empty or nonexistent - nothing to do
+        return false;
+    }

-        const CommandHeader command_header{Memory::Read32(*address)};
+    const CommandList& command_list{dma_pushbuffer.front()};
+    const CommandListHeader& command_list_header{command_list[dma_pushbuffer_subindex++]};
+    GPUVAddr dma_get = command_list_header.addr;
+    GPUVAddr dma_put = dma_get + command_list_header.size * sizeof(u32);
+    bool non_main = command_list_header.is_non_main;

-        dma_get += sizeof(u32);
+    if (dma_pushbuffer_subindex >= command_list.size()) {
+        // We've gone through the current list, remove it from the queue
+        dma_pushbuffer.pop();
+        dma_pushbuffer_subindex = 0;
+    }

-        if (!non_main) {
-            dma_mget = dma_get;
-        }
+    if (command_list_header.size == 0) {
+        return true;
+    }
+
+    // Push buffer non-empty, read a word
+    const auto address = gpu.MemoryManager().GpuToCpuAddress(dma_get);
+    ASSERT_MSG(address, "Invalid GPU address");
+
+    command_headers.resize(command_list_header.size);
+
+    Memory::ReadBlock(*address, command_headers.data(), command_list_header.size * sizeof(u32));
+
+    for (const CommandHeader& command_header : command_headers) {

        // now, see if we're in the middle of a command
        if (dma_state.length_pending) {
@@ -91,22 +109,11 @@ bool DmaPusher::Step() {
                break;
            }
        }
-    } else if (ib_enable && !dma_pushbuffer.empty()) {
-        // Current pushbuffer empty, but we have more IB entries to read
-        const CommandList& command_list{dma_pushbuffer.front()};
-        const CommandListHeader& command_list_header{command_list[dma_pushbuffer_subindex++]};
-        dma_get = command_list_header.addr;
-        dma_put = dma_get + command_list_header.size * sizeof(u32);
-        non_main = command_list_header.is_non_main;
+    }

-        if (dma_pushbuffer_subindex >= command_list.size()) {
-            // We've gone through the current list, remove it from the queue
-            dma_pushbuffer.pop();
-            dma_pushbuffer_subindex = 0;
-        }
-    } else {
-        // Otherwise, pushbuffer empty and IB empty or nonexistent - nothing to do
-        return {};
+    if (!non_main) {
+        // TODO (degasus): This is dead code, as dma_mget is never read.
+        dma_mget = dma_put;
    }

    return true;
--- a/src/video_core/dma_pusher.h
+++ b/src/video_core/dma_pusher.h
@@ -75,6 +75,8 @@ private:

    GPU& gpu;

+    std::vector<CommandHeader> command_headers; ///< Buffer for list of commands fetched at once
+
    std::queue<CommandList> dma_pushbuffer; ///< Queue of command lists to be processed
    std::size_t dma_pushbuffer_subindex{};  ///< Index within a command list within the pushbuffer

@@ -89,11 +91,8 @@ private:
    DmaState dma_state{};
    bool dma_increment_once{};

-    GPUVAddr dma_put{};   ///< pushbuffer current end address
-    GPUVAddr dma_get{};   ///< pushbuffer current read address
    GPUVAddr dma_mget{};  ///< main pushbuffer last read address
    bool ib_enable{true}; ///< IB mode enabled
-    bool non_main{};      ///< non-main pushbuffer active
 };

 } // namespace Tegra
--- a/src/video_core/engines/fermi_2d.cpp
+++ b/src/video_core/engines/fermi_2d.cpp
@@ -2,12 +2,11 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

-#include "core/core.h"
-#include "core/memory.h"
+#include "common/assert.h"
+#include "common/logging/log.h"
+#include "common/math_util.h"
 #include "video_core/engines/fermi_2d.h"
-#include "video_core/engines/maxwell_3d.h"
 #include "video_core/rasterizer_interface.h"
-#include "video_core/textures/decoders.h"

 namespace Tegra::Engines {

@@ -44,10 +43,10 @@ void Fermi2D::HandleSurfaceCopy() {
    const u32 src_blit_y2{
        static_cast<u32>((regs.blit_src_y + (regs.blit_dst_height * regs.blit_dv_dy)) >> 32)};

-    const MathUtil::Rectangle<u32> src_rect{src_blit_x1, src_blit_y1, src_blit_x2, src_blit_y2};
-    const MathUtil::Rectangle<u32> dst_rect{regs.blit_dst_x, regs.blit_dst_y,
-                                            regs.blit_dst_x + regs.blit_dst_width,
-                                            regs.blit_dst_y + regs.blit_dst_height};
+    const Common::Rectangle<u32> src_rect{src_blit_x1, src_blit_y1, src_blit_x2, src_blit_y2};
+    const Common::Rectangle<u32> dst_rect{regs.blit_dst_x, regs.blit_dst_y,
+                                          regs.blit_dst_x + regs.blit_dst_width,
+                                          regs.blit_dst_y + regs.blit_dst_height};

    if (!rasterizer.AccelerateSurfaceCopy(regs.src, regs.dst, src_rect, dst_rect)) {
        UNIMPLEMENTED();
--- a/src/video_core/engines/fermi_2d.h
+++ b/src/video_core/engines/fermi_2d.h
@@ -5,7 +5,7 @@
 #pragma once

 #include <array>
-#include "common/assert.h"
+#include <cstddef>
 #include "common/bit_field.h"
 #include "common/common_funcs.h"
 #include "common/common_types.h"
--- a/src/video_core/engines/kepler_compute.cpp
+++ b/src/video_core/engines/kepler_compute.cpp
@@ -2,9 +2,8 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

+#include "common/assert.h"
 #include "common/logging/log.h"
-#include "core/core.h"
-#include "core/memory.h"
 #include "video_core/engines/kepler_compute.h"
 #include "video_core/memory_manager.h"

--- a/src/video_core/engines/kepler_compute.h
+++ b/src/video_core/engines/kepler_compute.h
@@ -5,8 +5,7 @@
 #pragma once

 #include <array>
-#include "common/assert.h"
-#include "common/bit_field.h"
+#include <cstddef>
 #include "common/common_funcs.h"
 #include "common/common_types.h"
 #include "video_core/gpu.h"
--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -2,6 +2,7 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

+#include "common/assert.h"
 #include "common/logging/log.h"
 #include "core/core.h"
 #include "core/memory.h"
@@ -11,9 +12,9 @@

 namespace Tegra::Engines {

-KeplerMemory::KeplerMemory(VideoCore::RasterizerInterface& rasterizer,
+KeplerMemory::KeplerMemory(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
                           MemoryManager& memory_manager)
-    : memory_manager(memory_manager), rasterizer{rasterizer} {}
+    : system{system}, memory_manager(memory_manager), rasterizer{rasterizer} {}

 KeplerMemory::~KeplerMemory() = default;

@@ -47,10 +48,10 @@ void KeplerMemory::ProcessData(u32 data) {
    // We have to invalidate the destination region to evict any outdated surfaces from the cache.
    // We do this before actually writing the new data because the destination address might contain
    // a dirty surface that will have to be written back to memory.
-    rasterizer.InvalidateRegion(*dest_address, sizeof(u32));
+    Core::System::GetInstance().GPU().InvalidateRegion(*dest_address, sizeof(u32));

    Memory::Write32(*dest_address, data);
-    Core::System::GetInstance().GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
+    system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();

    state.write_offset++;
 }
--- a/src/video_core/engines/kepler_memory.h
+++ b/src/video_core/engines/kepler_memory.h
@@ -5,13 +5,17 @@
 #pragma once

 #include <array>
-#include "common/assert.h"
+#include <cstddef>
 #include "common/bit_field.h"
 #include "common/common_funcs.h"
 #include "common/common_types.h"
 #include "video_core/gpu.h"
 #include "video_core/memory_manager.h"

+namespace Core {
+class System;
+}
+
 namespace VideoCore {
 class RasterizerInterface;
 }
@@ -23,7 +27,8 @@ namespace Tegra::Engines {

 class KeplerMemory final {
 public:
-    KeplerMemory(VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager);
+    KeplerMemory(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
+                 MemoryManager& memory_manager);
    ~KeplerMemory();

    /// Write the value to the register identified by method.
@@ -76,6 +81,7 @@ public:
    } state{};

 private:
+    Core::System& system;
    MemoryManager& memory_manager;
    VideoCore::RasterizerInterface& rasterizer;

--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -19,8 +19,10 @@ namespace Tegra::Engines {
 /// First register id that is actually a Macro call.
 constexpr u32 MacroRegistersStart = 0xE00;

-Maxwell3D::Maxwell3D(VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager)
-    : memory_manager(memory_manager), rasterizer{rasterizer}, macro_interpreter(*this) {
+Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
+                     MemoryManager& memory_manager)
+    : memory_manager(memory_manager), system{system}, rasterizer{rasterizer},
+      macro_interpreter(*this) {
    InitializeRegisterDefaults();
 }

@@ -103,23 +105,25 @@ void Maxwell3D::CallMacroMethod(u32 method, std::vector<u32> parameters) {
 }

 void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
-    auto debug_context = Core::System::GetInstance().GetGPUDebugContext();
+    auto debug_context = system.GetGPUDebugContext();
+
+    const u32 method = method_call.method;

    // It is an error to write to a register other than the current macro's ARG register before it
    // has finished execution.
    if (executing_macro != 0) {
-        ASSERT(method_call.method == executing_macro + 1);
+        ASSERT(method == executing_macro + 1);
    }

    // Methods after 0xE00 are special, they're actually triggers for some microcode that was
    // uploaded to the GPU during initialization.
-    if (method_call.method >= MacroRegistersStart) {
+    if (method >= MacroRegistersStart) {
        // We're trying to execute a macro
        if (executing_macro == 0) {
            // A macro call must begin by writing the macro method's register, not its argument.
-            ASSERT_MSG((method_call.method % 2) == 0,
+            ASSERT_MSG((method % 2) == 0,
                       "Can't start macro execution by writing to the ARGS register");
-            executing_macro = method_call.method;
+            executing_macro = method;
        }

        macro_params.push_back(method_call.argument);
@@ -131,66 +135,62 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
        return;
    }

-    ASSERT_MSG(method_call.method < Regs::NUM_REGS,
+    ASSERT_MSG(method < Regs::NUM_REGS,
               "Invalid Maxwell3D register, increase the size of the Regs structure");

    if (debug_context) {
        debug_context->OnEvent(Tegra::DebugContext::Event::MaxwellCommandLoaded, nullptr);
    }

-    if (regs.reg_array[method_call.method] != method_call.argument) {
-        regs.reg_array[method_call.method] = method_call.argument;
+    if (regs.reg_array[method] != method_call.argument) {
+        regs.reg_array[method] = method_call.argument;
        // Color buffers
        constexpr u32 first_rt_reg = MAXWELL3D_REG_INDEX(rt);
        constexpr u32 registers_per_rt = sizeof(regs.rt[0]) / sizeof(u32);
-        if (method_call.method >= first_rt_reg &&
-            method_call.method < first_rt_reg + registers_per_rt * Regs::NumRenderTargets) {
-            const std::size_t rt_index = (method_call.method - first_rt_reg) / registers_per_rt;
-            dirty_flags.color_buffer |= 1u << static_cast<u32>(rt_index);
+        if (method >= first_rt_reg &&
+            method < first_rt_reg + registers_per_rt * Regs::NumRenderTargets) {
+            const std::size_t rt_index = (method - first_rt_reg) / registers_per_rt;
+            dirty_flags.color_buffer.set(rt_index);
        }

        // Zeta buffer
        constexpr u32 registers_in_zeta = sizeof(regs.zeta) / sizeof(u32);
-        if (method_call.method == MAXWELL3D_REG_INDEX(zeta_enable) ||
-            method_call.method == MAXWELL3D_REG_INDEX(zeta_width) ||
-            method_call.method == MAXWELL3D_REG_INDEX(zeta_height) ||
-            (method_call.method >= MAXWELL3D_REG_INDEX(zeta) &&
-             method_call.method < MAXWELL3D_REG_INDEX(zeta) + registers_in_zeta)) {
+        if (method == MAXWELL3D_REG_INDEX(zeta_enable) ||
+            method == MAXWELL3D_REG_INDEX(zeta_width) ||
+            method == MAXWELL3D_REG_INDEX(zeta_height) ||
+            (method >= MAXWELL3D_REG_INDEX(zeta) &&
+             method < MAXWELL3D_REG_INDEX(zeta) + registers_in_zeta)) {
            dirty_flags.zeta_buffer = true;
        }

        // Shader
        constexpr u32 shader_registers_count =
            sizeof(regs.shader_config[0]) * Regs::MaxShaderProgram / sizeof(u32);
-        if (method_call.method >= MAXWELL3D_REG_INDEX(shader_config[0]) &&
-            method_call.method < MAXWELL3D_REG_INDEX(shader_config[0]) + shader_registers_count) {
+        if (method >= MAXWELL3D_REG_INDEX(shader_config[0]) &&
+            method < MAXWELL3D_REG_INDEX(shader_config[0]) + shader_registers_count) {
            dirty_flags.shaders = true;
        }

        // Vertex format
-        if (method_call.method >= MAXWELL3D_REG_INDEX(vertex_attrib_format) &&
-            method_call.method <
-                MAXWELL3D_REG_INDEX(vertex_attrib_format) + regs.vertex_attrib_format.size()) {
+        if (method >= MAXWELL3D_REG_INDEX(vertex_attrib_format) &&
+            method < MAXWELL3D_REG_INDEX(vertex_attrib_format) + regs.vertex_attrib_format.size()) {
            dirty_flags.vertex_attrib_format = true;
        }

        // Vertex buffer
-        if (method_call.method >= MAXWELL3D_REG_INDEX(vertex_array) &&
-            method_call.method < MAXWELL3D_REG_INDEX(vertex_array) + 4 * 32) {
-            dirty_flags.vertex_array |=
-                1u << ((method_call.method - MAXWELL3D_REG_INDEX(vertex_array)) >> 2);
-        } else if (method_call.method >= MAXWELL3D_REG_INDEX(vertex_array_limit) &&
-                   method_call.method < MAXWELL3D_REG_INDEX(vertex_array_limit) + 2 * 32) {
-            dirty_flags.vertex_array |=
-                1u << ((method_call.method - MAXWELL3D_REG_INDEX(vertex_array_limit)) >> 1);
-        } else if (method_call.method >= MAXWELL3D_REG_INDEX(instanced_arrays) &&
-                   method_call.method < MAXWELL3D_REG_INDEX(instanced_arrays) + 32) {
-            dirty_flags.vertex_array |=
-                1u << (method_call.method - MAXWELL3D_REG_INDEX(instanced_arrays));
+        if (method >= MAXWELL3D_REG_INDEX(vertex_array) &&
+            method < MAXWELL3D_REG_INDEX(vertex_array) + 4 * 32) {
+            dirty_flags.vertex_array.set((method - MAXWELL3D_REG_INDEX(vertex_array)) >> 2);
+        } else if (method >= MAXWELL3D_REG_INDEX(vertex_array_limit) &&
+                   method < MAXWELL3D_REG_INDEX(vertex_array_limit) + 2 * 32) {
+            dirty_flags.vertex_array.set((method - MAXWELL3D_REG_INDEX(vertex_array_limit)) >> 1);
+        } else if (method >= MAXWELL3D_REG_INDEX(instanced_arrays) &&
+                   method < MAXWELL3D_REG_INDEX(instanced_arrays) + 32) {
+            dirty_flags.vertex_array.set(method - MAXWELL3D_REG_INDEX(instanced_arrays));
        }
    }

-    switch (method_call.method) {
+    switch (method) {
    case MAXWELL3D_REG_INDEX(macros.data): {
        ProcessMacroUpload(method_call.argument);
        break;
@@ -317,7 +317,7 @@ void Maxwell3D::ProcessQueryGet() {
            LongQueryResult query_result{};
            query_result.value = result;
            // TODO(Subv): Generate a real GPU timestamp and write it here instead of CoreTiming
-            query_result.timestamp = Core::System::GetInstance().CoreTiming().GetTicks();
+            query_result.timestamp = system.CoreTiming().GetTicks();
            Memory::WriteBlock(*address, &query_result, sizeof(query_result));
        }
        dirty_flags.OnMemoryWrite();
@@ -334,7 +334,7 @@ void Maxwell3D::DrawArrays() {
              regs.vertex_buffer.count);
    ASSERT_MSG(!(regs.index_array.count && regs.vertex_buffer.count), "Both indexed and direct?");

-    auto debug_context = Core::System::GetInstance().GetGPUDebugContext();
+    auto debug_context = system.GetGPUDebugContext();

    if (debug_context) {
        debug_context->OnEvent(Tegra::DebugContext::Event::IncomingPrimitiveBatch, nullptr);
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -5,8 +5,10 @@
 #pragma once

 #include <array>
+#include <bitset>
 #include <unordered_map>
 #include <vector>
+
 #include "common/assert.h"
 #include "common/bit_field.h"
 #include "common/common_funcs.h"
@@ -17,6 +19,10 @@
 #include "video_core/memory_manager.h"
 #include "video_core/textures/texture.h"

+namespace Core {
+class System;
+}
+
 namespace VideoCore {
 class RasterizerInterface;
 }
@@ -28,7 +34,8 @@ namespace Tegra::Engines {

 class Maxwell3D final {
 public:
-    explicit Maxwell3D(VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager);
+    explicit Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
+                       MemoryManager& memory_manager);
    ~Maxwell3D() = default;

    /// Register structure of the Maxwell3D engine.
@@ -498,7 +505,7 @@ public:
            f32 translate_z;
            INSERT_PADDING_WORDS(2);

-            MathUtil::Rectangle<s32> GetRect() const {
+            Common::Rectangle<s32> GetRect() const {
                return {
                    GetX(),               // left
                    GetY() + GetHeight(), // top
@@ -1089,19 +1096,18 @@ public:
    MemoryManager& memory_manager;

    struct DirtyFlags {
-        u8 color_buffer = 0xFF;
-        bool zeta_buffer = true;
-
-        bool shaders = true;
+        std::bitset<8> color_buffer{0xFF};
+        std::bitset<32> vertex_array{0xFFFFFFFF};

        bool vertex_attrib_format = true;
-        u32 vertex_array = 0xFFFFFFFF;
+        bool zeta_buffer = true;
+        bool shaders = true;

        void OnMemoryWrite() {
-            color_buffer = 0xFF;
            zeta_buffer = true;
            shaders = true;
-            vertex_array = 0xFFFFFFFF;
+            color_buffer.set();
+            vertex_array.set();
        }
    };

@@ -1131,6 +1137,8 @@ public:
 private:
    void InitializeRegisterDefaults();

+    Core::System& system;
+
    VideoCore::RasterizerInterface& rasterizer;

    /// Start offsets of each macro in macro_memory
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -2,6 +2,8 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

+#include "common/assert.h"
+#include "common/logging/log.h"
 #include "core/core.h"
 #include "core/memory.h"
 #include "video_core/engines/maxwell_3d.h"
@@ -11,8 +13,9 @@

 namespace Tegra::Engines {

-MaxwellDMA::MaxwellDMA(VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager)
-    : memory_manager(memory_manager), rasterizer{rasterizer} {}
+MaxwellDMA::MaxwellDMA(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
+                       MemoryManager& memory_manager)
+    : memory_manager(memory_manager), system{system}, rasterizer{rasterizer} {}

 void MaxwellDMA::CallMethod(const GPU::MethodCall& method_call) {
    ASSERT_MSG(method_call.method < Regs::NUM_REGS,
@@ -59,7 +62,7 @@ void MaxwellDMA::HandleCopy() {
    }

    // All copies here update the main memory, so mark all rasterizer states as invalid.
-    Core::System::GetInstance().GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
+    system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();

    if (regs.exec.is_dst_linear && regs.exec.is_src_linear) {
        // When the enable_2d bit is disabled, the copy is performed as if we were copying a 1D
@@ -89,12 +92,12 @@ void MaxwellDMA::HandleCopy() {
    const auto FlushAndInvalidate = [&](u32 src_size, u64 dst_size) {
        // TODO(Subv): For now, manually flush the regions until we implement GPU-accelerated
        // copying.
-        rasterizer.FlushRegion(*source_cpu, src_size);
+        Core::System::GetInstance().GPU().FlushRegion(*source_cpu, src_size);

        // We have to invalidate the destination region to evict any outdated surfaces from the
        // cache. We do this before actually writing the new data because the destination address
        // might contain a dirty surface that will have to be written back to memory.
-        rasterizer.InvalidateRegion(*dest_cpu, dst_size);
+        Core::System::GetInstance().GPU().InvalidateRegion(*dest_cpu, dst_size);
    };

    if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) {
--- a/src/video_core/engines/maxwell_dma.h
+++ b/src/video_core/engines/maxwell_dma.h
@@ -5,13 +5,17 @@
 #pragma once

 #include <array>
-#include "common/assert.h"
+#include <cstddef>
 #include "common/bit_field.h"
 #include "common/common_funcs.h"
 #include "common/common_types.h"
 #include "video_core/gpu.h"
 #include "video_core/memory_manager.h"

+namespace Core {
+class System;
+}
+
 namespace VideoCore {
 class RasterizerInterface;
 }
@@ -20,7 +24,8 @@ namespace Tegra::Engines {

 class MaxwellDMA final {
 public:
-    explicit MaxwellDMA(VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager);
+    explicit MaxwellDMA(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
+                        MemoryManager& memory_manager);
    ~MaxwellDMA() = default;

    /// Write the value to the register identified by method.
@@ -137,6 +142,8 @@ public:
    MemoryManager& memory_manager;

 private:
+    Core::System& system;
+
    VideoCore::RasterizerInterface& rasterizer;

    /// Performs the copy from the source buffer to the destination buffer as configured in the
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -6,7 +6,6 @@

 #include <bitset>
 #include <optional>
-#include <string>
 #include <tuple>
 #include <vector>

@@ -376,9 +375,9 @@ enum class R2pMode : u64 {
 };

 enum class IpaInterpMode : u64 {
-    Linear = 0,
-    Perspective = 1,
-    Flat = 2,
+    Pass = 0,
+    Multiply = 1,
+    Constant = 2,
    Sc = 3,
 };

--- a/src/video_core/engines/shader_header.h
+++ b/src/video_core/engines/shader_header.h
@@ -16,6 +16,13 @@ enum class OutputTopology : u32 {
    TriangleStrip = 7,
 };

+enum class AttributeUse : u8 {
+    Unused = 0,
+    Constant = 1,
+    Perspective = 2,
+    ScreenLinear = 3,
+};
+
 // Documentation in:
 // http://download.nvidia.com/open-gpu-doc/Shader-Program-Header/1/Shader-Program-Header.html#ImapTexture
 struct Header {
@@ -84,9 +91,15 @@ struct Header {
        } vtg;

        struct {
-            INSERT_PADDING_BYTES(3);  // ImapSystemValuesA
-            INSERT_PADDING_BYTES(1);  // ImapSystemValuesB
-            INSERT_PADDING_BYTES(32); // ImapGenericVector[32]
+            INSERT_PADDING_BYTES(3); // ImapSystemValuesA
+            INSERT_PADDING_BYTES(1); // ImapSystemValuesB
+            union {
+                BitField<0, 2, AttributeUse> x;
+                BitField<2, 2, AttributeUse> y;
+                BitField<4, 2, AttributeUse> w;
+                BitField<6, 2, AttributeUse> z;
+                u8 raw;
+            } imap_generic_vector[32];
            INSERT_PADDING_BYTES(2);  // ImapColor
            INSERT_PADDING_BYTES(2);  // ImapSystemValuesC
            INSERT_PADDING_BYTES(10); // ImapFixedFncTexture[10]
@@ -103,6 +116,28 @@ struct Header {
                const u32 bit = render_target * 4 + component;
                return omap.target & (1 << bit);
            }
+            AttributeUse GetAttributeIndexUse(u32 attribute, u32 index) const {
+                return static_cast<AttributeUse>(
+                    (imap_generic_vector[attribute].raw >> (index * 2)) & 0x03);
+            }
+            AttributeUse GetAttributeUse(u32 attribute) const {
+                AttributeUse result = AttributeUse::Unused;
+                for (u32 i = 0; i < 4; i++) {
+                    const auto index = GetAttributeIndexUse(attribute, i);
+                    if (index == AttributeUse::Unused) {
+                        continue;
+                    }
+                    if (result == AttributeUse::Unused || result == index) {
+                        result = index;
+                        continue;
+                    }
+                    LOG_CRITICAL(HW_GPU, "Generic Attribute Conflict in Interpolation Mode");
+                    if (index == AttributeUse::Perspective) {
+                        result = index;
+                    }
+                }
+                return result;
+            }
        } ps;
    };

--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -12,7 +12,7 @@
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/maxwell_dma.h"
 #include "video_core/gpu.h"
-#include "video_core/rasterizer_interface.h"
+#include "video_core/renderer_base.h"

 namespace Tegra {

@@ -28,14 +28,15 @@ u32 FramebufferConfig::BytesPerPixel(PixelFormat format) {
    UNREACHABLE();
 }

-GPU::GPU(VideoCore::RasterizerInterface& rasterizer) {
+GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer) : renderer{renderer} {
+    auto& rasterizer{renderer.Rasterizer()};
    memory_manager = std::make_unique<Tegra::MemoryManager>();
    dma_pusher = std::make_unique<Tegra::DmaPusher>(*this);
-    maxwell_3d = std::make_unique<Engines::Maxwell3D>(rasterizer, *memory_manager);
+    maxwell_3d = std::make_unique<Engines::Maxwell3D>(system, rasterizer, *memory_manager);
    fermi_2d = std::make_unique<Engines::Fermi2D>(rasterizer, *memory_manager);
    kepler_compute = std::make_unique<Engines::KeplerCompute>(*memory_manager);
-    maxwell_dma = std::make_unique<Engines::MaxwellDMA>(rasterizer, *memory_manager);
-    kepler_memory = std::make_unique<Engines::KeplerMemory>(rasterizer, *memory_manager);
+    maxwell_dma = std::make_unique<Engines::MaxwellDMA>(system, rasterizer, *memory_manager);
+    kepler_memory = std::make_unique<Engines::KeplerMemory>(system, rasterizer, *memory_manager);
 }

 GPU::~GPU() = default;
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -6,16 +6,19 @@

 #include <array>
 #include <memory>
-#include <vector>
 #include "common/common_types.h"
 #include "core/hle/service/nvflinger/buffer_queue.h"
 #include "video_core/dma_pusher.h"
 #include "video_core/memory_manager.h"

-namespace VideoCore {
-class RasterizerInterface;
+namespace Core {
+class System;
 }

+namespace VideoCore {
+class RendererBase;
+} // namespace VideoCore
+
 namespace Tegra {

 enum class RenderTargetFormat : u32 {
@@ -97,7 +100,7 @@ struct FramebufferConfig {

    using TransformFlags = Service::NVFlinger::BufferQueue::BufferTransformFlags;
    TransformFlags transform_flags;
-    MathUtil::Rectangle<int> crop_rect;
+    Common::Rectangle<int> crop_rect;
 };

 namespace Engines {
@@ -116,9 +119,10 @@ enum class EngineID {
    MAXWELL_DMA_COPY_A = 0xB0B5,
 };

-class GPU final {
+class GPU {
 public:
-    explicit GPU(VideoCore::RasterizerInterface& rasterizer);
+    explicit GPU(Core::System& system, VideoCore::RendererBase& renderer);
+
    ~GPU();

    struct MethodCall {
@@ -197,8 +201,42 @@ public:
        };
    } regs{};

+    /// Push GPU command entries to be processed
+    virtual void PushGPUEntries(Tegra::CommandList&& entries) = 0;
+
+    /// Swap buffers (render frame)
+    virtual void SwapBuffers(
+        std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) = 0;
+
+    /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
+    virtual void FlushRegion(VAddr addr, u64 size) = 0;
+
+    /// Notify rasterizer that any caches of the specified region should be invalidated
+    virtual void InvalidateRegion(VAddr addr, u64 size) = 0;
+
+    /// Notify rasterizer that any caches of the specified region should be flushed and invalidated
+    virtual void FlushAndInvalidateRegion(VAddr addr, u64 size) = 0;
+
 private:
+    void ProcessBindMethod(const MethodCall& method_call);
+    void ProcessSemaphoreTriggerMethod();
+    void ProcessSemaphoreRelease();
+    void ProcessSemaphoreAcquire();
+
+    /// Calls a GPU puller method.
+    void CallPullerMethod(const MethodCall& method_call);
+
+    /// Calls a GPU engine method.
+    void CallEngineMethod(const MethodCall& method_call);
+
+    /// Determines where the method should be executed.
+    bool ExecuteMethodOnEngine(const MethodCall& method_call);
+
+protected:
    std::unique_ptr<Tegra::DmaPusher> dma_pusher;
+    VideoCore::RendererBase& renderer;
+
+private:
    std::unique_ptr<Tegra::MemoryManager> memory_manager;

    /// Mapping of command subchannels to their bound engine ids.
@@ -214,18 +252,6 @@ private:
    std::unique_ptr<Engines::MaxwellDMA> maxwell_dma;
    /// Inline memory engine
    std::unique_ptr<Engines::KeplerMemory> kepler_memory;
-
-    void ProcessBindMethod(const MethodCall& method_call);
-    void ProcessSemaphoreTriggerMethod();
-    void ProcessSemaphoreRelease();
-    void ProcessSemaphoreAcquire();
-
-    // Calls a GPU puller method.
-    void CallPullerMethod(const MethodCall& method_call);
-    // Calls a GPU engine method.
-    void CallEngineMethod(const MethodCall& method_call);
-    // Determines where the method should be executed.
-    bool ExecuteMethodOnEngine(const MethodCall& method_call);
 };

 #define ASSERT_REG_POSITION(field_name, position)                                                  \
--- a/src/video_core/gpu_asynch.cpp
+++ b/src/video_core/gpu_asynch.cpp
@@ -0,0 +1,37 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "video_core/gpu_asynch.h"
+#include "video_core/gpu_thread.h"
+#include "video_core/renderer_base.h"
+
+namespace VideoCommon {
+
+GPUAsynch::GPUAsynch(Core::System& system, VideoCore::RendererBase& renderer)
+    : Tegra::GPU(system, renderer), gpu_thread{renderer, *dma_pusher} {}
+
+GPUAsynch::~GPUAsynch() = default;
+
+void GPUAsynch::PushGPUEntries(Tegra::CommandList&& entries) {
+    gpu_thread.SubmitList(std::move(entries));
+}
+
+void GPUAsynch::SwapBuffers(
+    std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) {
+    gpu_thread.SwapBuffers(std::move(framebuffer));
+}
+
+void GPUAsynch::FlushRegion(VAddr addr, u64 size) {
+    gpu_thread.FlushRegion(addr, size);
+}
+
+void GPUAsynch::InvalidateRegion(VAddr addr, u64 size) {
+    gpu_thread.InvalidateRegion(addr, size);
+}
+
+void GPUAsynch::FlushAndInvalidateRegion(VAddr addr, u64 size) {
+    gpu_thread.FlushAndInvalidateRegion(addr, size);
+}
+
+} // namespace VideoCommon
--- a/src/video_core/gpu_asynch.h
+++ b/src/video_core/gpu_asynch.h
@@ -0,0 +1,37 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "video_core/gpu.h"
+#include "video_core/gpu_thread.h"
+
+namespace VideoCore {
+class RendererBase;
+} // namespace VideoCore
+
+namespace VideoCommon {
+
+namespace GPUThread {
+class ThreadManager;
+} // namespace GPUThread
+
+/// Implementation of GPU interface that runs the GPU asynchronously
+class GPUAsynch : public Tegra::GPU {
+public:
+    explicit GPUAsynch(Core::System& system, VideoCore::RendererBase& renderer);
+    ~GPUAsynch();
+
+    void PushGPUEntries(Tegra::CommandList&& entries) override;
+    void SwapBuffers(
+        std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) override;
+    void FlushRegion(VAddr addr, u64 size) override;
+    void InvalidateRegion(VAddr addr, u64 size) override;
+    void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
+
+private:
+    GPUThread::ThreadManager gpu_thread;
+};
+
+} // namespace VideoCommon
--- a/src/video_core/gpu_synch.cpp
+++ b/src/video_core/gpu_synch.cpp
@@ -0,0 +1,37 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "video_core/gpu_synch.h"
+#include "video_core/renderer_base.h"
+
+namespace VideoCommon {
+
+GPUSynch::GPUSynch(Core::System& system, VideoCore::RendererBase& renderer)
+    : Tegra::GPU(system, renderer) {}
+
+GPUSynch::~GPUSynch() = default;
+
+void GPUSynch::PushGPUEntries(Tegra::CommandList&& entries) {
+    dma_pusher->Push(std::move(entries));
+    dma_pusher->DispatchCalls();
+}
+
+void GPUSynch::SwapBuffers(
+    std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) {
+    renderer.SwapBuffers(std::move(framebuffer));
+}
+
+void GPUSynch::FlushRegion(VAddr addr, u64 size) {
+    renderer.Rasterizer().FlushRegion(addr, size);
+}
+
+void GPUSynch::InvalidateRegion(VAddr addr, u64 size) {
+    renderer.Rasterizer().InvalidateRegion(addr, size);
+}
+
+void GPUSynch::FlushAndInvalidateRegion(VAddr addr, u64 size) {
+    renderer.Rasterizer().FlushAndInvalidateRegion(addr, size);
+}
+
+} // namespace VideoCommon
--- a/src/video_core/gpu_synch.h
+++ b/src/video_core/gpu_synch.h
@@ -0,0 +1,29 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "video_core/gpu.h"
+
+namespace VideoCore {
+class RendererBase;
+} // namespace VideoCore
+
+namespace VideoCommon {
+
+/// Implementation of GPU interface that runs the GPU synchronously
+class GPUSynch : public Tegra::GPU {
+public:
+    explicit GPUSynch(Core::System& system, VideoCore::RendererBase& renderer);
+    ~GPUSynch();
+
+    void PushGPUEntries(Tegra::CommandList&& entries) override;
+    void SwapBuffers(
+        std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) override;
+    void FlushRegion(VAddr addr, u64 size) override;
+    void InvalidateRegion(VAddr addr, u64 size) override;
+    void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
+};
+
+} // namespace VideoCommon
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -0,0 +1,152 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+#include "common/microprofile.h"
+#include "core/frontend/scope_acquire_window_context.h"
+#include "core/settings.h"
+#include "video_core/dma_pusher.h"
+#include "video_core/gpu.h"
+#include "video_core/gpu_thread.h"
+#include "video_core/renderer_base.h"
+
+namespace VideoCommon::GPUThread {
+
+/// Executes a single GPU thread command
+static void ExecuteCommand(CommandData* command, VideoCore::RendererBase& renderer,
+                           Tegra::DmaPusher& dma_pusher) {
+    if (const auto submit_list = std::get_if<SubmitListCommand>(command)) {
+        dma_pusher.Push(std::move(submit_list->entries));
+        dma_pusher.DispatchCalls();
+    } else if (const auto data = std::get_if<SwapBuffersCommand>(command)) {
+        renderer.SwapBuffers(data->framebuffer);
+    } else if (const auto data = std::get_if<FlushRegionCommand>(command)) {
+        renderer.Rasterizer().FlushRegion(data->addr, data->size);
+    } else if (const auto data = std::get_if<InvalidateRegionCommand>(command)) {
+        renderer.Rasterizer().InvalidateRegion(data->addr, data->size);
+    } else if (const auto data = std::get_if<FlushAndInvalidateRegionCommand>(command)) {
+        renderer.Rasterizer().FlushAndInvalidateRegion(data->addr, data->size);
+    } else {
+        UNREACHABLE();
+    }
+}
+
+/// Runs the GPU thread
+static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher,
+                      SynchState& state) {
+
+    MicroProfileOnThreadCreate("GpuThread");
+
+    auto WaitForWakeup = [&]() {
+        std::unique_lock<std::mutex> lock{state.signal_mutex};
+        state.signal_condition.wait(lock, [&] { return !state.is_idle || !state.is_running; });
+    };
+
+    // Wait for first GPU command before acquiring the window context
+    WaitForWakeup();
+
+    // If emulation was stopped during disk shader loading, abort before trying to acquire context
+    if (!state.is_running) {
+        return;
+    }
+
+    Core::Frontend::ScopeAcquireWindowContext acquire_context{renderer.GetRenderWindow()};
+
+    while (state.is_running) {
+        if (!state.is_running) {
+            return;
+        }
+
+        {
+            // Thread has been woken up, so make the previous write queue the next read queue
+            std::lock_guard<std::mutex> lock{state.signal_mutex};
+            std::swap(state.push_queue, state.pop_queue);
+        }
+
+        // Execute all of the GPU commands
+        while (!state.pop_queue->empty()) {
+            ExecuteCommand(&state.pop_queue->front(), renderer, dma_pusher);
+            state.pop_queue->pop();
+        }
+
+        state.UpdateIdleState();
+
+        // Signal that the GPU thread has finished processing commands
+        if (state.is_idle) {
+            state.idle_condition.notify_one();
+        }
+
+        // Wait for CPU thread to send more GPU commands
+        WaitForWakeup();
+    }
+}
+
+ThreadManager::ThreadManager(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher)
+    : renderer{renderer}, dma_pusher{dma_pusher}, thread{RunThread, std::ref(renderer),
+                                                         std::ref(dma_pusher), std::ref(state)},
+      thread_id{thread.get_id()} {}
+
+ThreadManager::~ThreadManager() {
+    {
+        // Notify GPU thread that a shutdown is pending
+        std::lock_guard<std::mutex> lock{state.signal_mutex};
+        state.is_running = false;
+    }
+
+    state.signal_condition.notify_one();
+    thread.join();
+}
+
+void ThreadManager::SubmitList(Tegra::CommandList&& entries) {
+    if (entries.empty()) {
+        return;
+    }
+
+    PushCommand(SubmitListCommand(std::move(entries)), false, false);
+}
+
+void ThreadManager::SwapBuffers(
+    std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) {
+    PushCommand(SwapBuffersCommand(std::move(framebuffer)), true, false);
+}
+
+void ThreadManager::FlushRegion(VAddr addr, u64 size) {
+    // Block the CPU when using accurate emulation
+    PushCommand(FlushRegionCommand(addr, size), Settings::values.use_accurate_gpu_emulation, false);
+}
+
+void ThreadManager::InvalidateRegion(VAddr addr, u64 size) {
+    PushCommand(InvalidateRegionCommand(addr, size), true, true);
+}
+
+void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) {
+    InvalidateRegion(addr, size);
+}
+
+void ThreadManager::PushCommand(CommandData&& command_data, bool wait_for_idle, bool allow_on_cpu) {
+    {
+        std::lock_guard<std::mutex> lock{state.signal_mutex};
+
+        if ((allow_on_cpu && state.is_idle) || IsGpuThread()) {
+            // Execute the command synchronously on the current thread
+            ExecuteCommand(&command_data, renderer, dma_pusher);
+            return;
+        }
+
+        // Push the command to the GPU thread
+        state.UpdateIdleState();
+        state.push_queue->emplace(command_data);
+    }
+
+    // Signal the GPU thread that commands are pending
+    state.signal_condition.notify_one();
+
+    if (wait_for_idle) {
+        // Wait for the GPU to be idle (all commands to be executed)
+        std::unique_lock<std::mutex> lock{state.idle_mutex};
+        state.idle_condition.wait(lock, [this] { return static_cast<bool>(state.is_idle); });
+    }
+}
+
+} // namespace VideoCommon::GPUThread
--- a/src/video_core/gpu_thread.h
+++ b/src/video_core/gpu_thread.h
@@ -0,0 +1,136 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <atomic>
+#include <condition_variable>
+#include <memory>
+#include <mutex>
+#include <optional>
+#include <thread>
+#include <variant>
+
+namespace Tegra {
+struct FramebufferConfig;
+class DmaPusher;
+} // namespace Tegra
+
+namespace VideoCore {
+class RendererBase;
+} // namespace VideoCore
+
+namespace VideoCommon::GPUThread {
+
+/// Command to signal to the GPU thread that a command list is ready for processing
+struct SubmitListCommand final {
+    explicit SubmitListCommand(Tegra::CommandList&& entries) : entries{std::move(entries)} {}
+
+    Tegra::CommandList entries;
+};
+
+/// Command to signal to the GPU thread that a swap buffers is pending
+struct SwapBuffersCommand final {
+    explicit SwapBuffersCommand(std::optional<const Tegra::FramebufferConfig> framebuffer)
+        : framebuffer{std::move(framebuffer)} {}
+
+    std::optional<const Tegra::FramebufferConfig> framebuffer;
+};
+
+/// Command to signal to the GPU thread to flush a region
+struct FlushRegionCommand final {
+    explicit constexpr FlushRegionCommand(VAddr addr, u64 size) : addr{addr}, size{size} {}
+
+    const VAddr addr;
+    const u64 size;
+};
+
+/// Command to signal to the GPU thread to invalidate a region
+struct InvalidateRegionCommand final {
+    explicit constexpr InvalidateRegionCommand(VAddr addr, u64 size) : addr{addr}, size{size} {}
+
+    const VAddr addr;
+    const u64 size;
+};
+
+/// Command to signal to the GPU thread to flush and invalidate a region
+struct FlushAndInvalidateRegionCommand final {
+    explicit constexpr FlushAndInvalidateRegionCommand(VAddr addr, u64 size)
+        : addr{addr}, size{size} {}
+
+    const VAddr addr;
+    const u64 size;
+};
+
+using CommandData = std::variant<SubmitListCommand, SwapBuffersCommand, FlushRegionCommand,
+                                 InvalidateRegionCommand, FlushAndInvalidateRegionCommand>;
+
+/// Struct used to synchronize the GPU thread
+struct SynchState final {
+    std::atomic<bool> is_running{true};
+    std::atomic<bool> is_idle{true};
+    std::condition_variable signal_condition;
+    std::mutex signal_mutex;
+    std::condition_variable idle_condition;
+    std::mutex idle_mutex;
+
+    // We use two queues for sending commands to the GPU thread, one for writing (push_queue) to and
+    // one for reading from (pop_queue). These are swapped whenever the current pop_queue becomes
+    // empty. This allows for efficient thread-safe access, as it does not require any copies.
+
+    using CommandQueue = std::queue<CommandData>;
+    std::array<CommandQueue, 2> command_queues;
+    CommandQueue* push_queue{&command_queues[0]};
+    CommandQueue* pop_queue{&command_queues[1]};
+
+    void UpdateIdleState() {
+        std::lock_guard<std::mutex> lock{idle_mutex};
+        is_idle = command_queues[0].empty() && command_queues[1].empty();
+    }
+};
+
+/// Class used to manage the GPU thread
+class ThreadManager final {
+public:
+    explicit ThreadManager(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher);
+    ~ThreadManager();
+
+    /// Push GPU command entries to be processed
+    void SubmitList(Tegra::CommandList&& entries);
+
+    /// Swap buffers (render frame)
+    void SwapBuffers(
+        std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer);
+
+    /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
+    void FlushRegion(VAddr addr, u64 size);
+
+    /// Notify rasterizer that any caches of the specified region should be invalidated
+    void InvalidateRegion(VAddr addr, u64 size);
+
+    /// Notify rasterizer that any caches of the specified region should be flushed and invalidated
+    void FlushAndInvalidateRegion(VAddr addr, u64 size);
+
+    /// Waits the caller until the GPU thread is idle, used for synchronization
+    void WaitForIdle();
+
+private:
+    /// Pushes a command to be executed by the GPU thread
+    void PushCommand(CommandData&& command_data, bool wait_for_idle, bool allow_on_cpu);
+
+    /// Returns true if this is called by the GPU thread
+    bool IsGpuThread() const {
+        return std::this_thread::get_id() == thread_id;
+    }
+
+private:
+    SynchState state;
+    std::thread thread;
+    std::thread::id thread_id;
+    VideoCore::RendererBase& renderer;
+    Tegra::DmaPusher& dma_pusher;
+};
+
+} // namespace VideoCommon::GPUThread
--- a/src/video_core/rasterizer_cache.h
+++ b/src/video_core/rasterizer_cache.h
@@ -129,6 +129,15 @@ protected:
        return ++modified_ticks;
    }

+    /// Flushes the specified object, updating appropriate cache state as needed
+    void FlushObject(const T& object) {
+        if (!object->IsDirty()) {
+            return;
+        }
+        object->Flush();
+        object->MarkAsModified(false, *this);
+    }
+
 private:
    /// Returns a list of cached objects from the specified memory region, ordered by access time
    std::vector<T> GetSortedObjectsFromRegion(VAddr addr, u64 size) {
@@ -154,15 +163,6 @@ private:
        return objects;
    }

-    /// Flushes the specified object, updating appropriate cache state as needed
-    void FlushObject(const T& object) {
-        if (!object->IsDirty()) {
-            return;
-        }
-        object->Flush();
-        object->MarkAsModified(false, *this);
-    }
-
    using ObjectSet = std::set<T>;
    using ObjectCache = std::unordered_map<VAddr, T>;
    using IntervalCache = boost::icl::interval_map<VAddr, ObjectSet>;
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -47,8 +47,8 @@ public:
    /// Attempt to use a faster method to perform a surface copy
    virtual bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
                                       const Tegra::Engines::Fermi2D::Regs::Surface& dst,
-                                       const MathUtil::Rectangle<u32>& src_rect,
-                                       const MathUtil::Rectangle<u32>& dst_rect) {
+                                       const Common::Rectangle<u32>& src_rect,
+                                       const Common::Rectangle<u32>& dst_rect) {
        return false;
    }

--- a/src/video_core/renderer_base.cpp
+++ b/src/video_core/renderer_base.cpp
@@ -2,6 +2,7 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

+#include "common/logging/log.h"
 #include "core/frontend/emu_window.h"
 #include "core/settings.h"
 #include "video_core/renderer_base.h"
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -102,8 +102,8 @@ struct FramebufferCacheKey {

 RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& window, Core::System& system,
                                   ScreenInfo& info)
-    : res_cache{*this}, shader_cache{*this, system}, emu_window{window}, screen_info{info},
-      buffer_cache(*this, STREAM_BUFFER_SIZE), global_cache{*this} {
+    : res_cache{*this}, shader_cache{*this, system}, global_cache{*this}, emu_window{window},
+      screen_info{info}, buffer_cache(*this, STREAM_BUFFER_SIZE) {
    // Create sampler objects
    for (std::size_t i = 0; i < texture_samplers.size(); ++i) {
        texture_samplers[i].Create();
@@ -200,7 +200,7 @@ GLuint RasterizerOpenGL::SetupVertexFormat() {
    }

    // Rebinding the VAO invalidates the vertex buffer bindings.
-    gpu.dirty_flags.vertex_array = 0xFFFFFFFF;
+    gpu.dirty_flags.vertex_array.set();

    state.draw.vertex_array = vao_entry.handle;
    return vao_entry.handle;
@@ -210,14 +210,14 @@ void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) {
    auto& gpu = Core::System::GetInstance().GPU().Maxwell3D();
    const auto& regs = gpu.regs;

-    if (!gpu.dirty_flags.vertex_array)
+    if (gpu.dirty_flags.vertex_array.none())
        return;

    MICROPROFILE_SCOPE(OpenGL_VB);

    // Upload all guest vertex arrays sequentially to our buffer
    for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) {
-        if (~gpu.dirty_flags.vertex_array & (1u << index))
+        if (!gpu.dirty_flags.vertex_array[index])
            continue;

        const auto& vertex_array = regs.vertex_array[index];
@@ -244,7 +244,7 @@ void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) {
        }
    }

-    gpu.dirty_flags.vertex_array = 0;
+    gpu.dirty_flags.vertex_array.reset();
 }

 DrawParameters RasterizerOpenGL::SetupDraw() {
@@ -488,13 +488,13 @@ std::pair<bool, bool> RasterizerOpenGL::ConfigureFramebuffers(
    OpenGLState& current_state, bool using_color_fb, bool using_depth_fb, bool preserve_contents,
    std::optional<std::size_t> single_color_target) {
    MICROPROFILE_SCOPE(OpenGL_Framebuffer);
-    const auto& gpu = Core::System::GetInstance().GPU().Maxwell3D();
+    auto& gpu = Core::System::GetInstance().GPU().Maxwell3D();
    const auto& regs = gpu.regs;

    const FramebufferConfigState fb_config_state{using_color_fb, using_depth_fb, preserve_contents,
                                                 single_color_target};
-    if (fb_config_state == current_framebuffer_config_state && gpu.dirty_flags.color_buffer == 0 &&
-        !gpu.dirty_flags.zeta_buffer) {
+    if (fb_config_state == current_framebuffer_config_state &&
+        gpu.dirty_flags.color_buffer.none() && !gpu.dirty_flags.zeta_buffer) {
        // Only skip if the previous ConfigureFramebuffers call was from the same kind (multiple or
        // single color targets). This is done because the guest registers may not change but the
        // host framebuffer may contain different attachments
@@ -721,10 +721,10 @@ void RasterizerOpenGL::DrawArrays() {
    // Add space for at least 18 constant buffers
    buffer_size += Maxwell::MaxConstBuffers * (MaxConstbufferSize + uniform_buffer_alignment);

-    bool invalidate = buffer_cache.Map(buffer_size);
+    const bool invalidate = buffer_cache.Map(buffer_size);
    if (invalidate) {
        // As all cached buffers are invalidated, we need to recheck their state.
-        gpu.dirty_flags.vertex_array = 0xFFFFFFFF;
+        gpu.dirty_flags.vertex_array.set();
    }

    const GLuint vao = SetupVertexFormat();
@@ -738,30 +738,18 @@ void RasterizerOpenGL::DrawArrays() {
    shader_program_manager->ApplyTo(state);
    state.Apply();

-    // Execute draw call
+    res_cache.SignalPreDrawCall();
    params.DispatchDraw();
-
-    // Disable scissor test
-    state.viewports[0].scissor.enabled = false;
+    res_cache.SignalPostDrawCall();

    accelerate_draw = AccelDraw::Disabled;
-
-    // Unbind textures for potential future use as framebuffer attachments
-    for (auto& texture_unit : state.texture_units) {
-        texture_unit.Unbind();
-    }
-    state.Apply();
 }

 void RasterizerOpenGL::FlushAll() {}

 void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) {
    MICROPROFILE_SCOPE(OpenGL_CacheManagement);
-
-    if (Settings::values.use_accurate_gpu_emulation) {
-        // Only flush if use_accurate_gpu_emulation is enabled, as it incurs a performance hit
-        res_cache.FlushRegion(addr, size);
-    }
+    res_cache.FlushRegion(addr, size);
 }

 void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) {
@@ -779,8 +767,8 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size) {

 bool RasterizerOpenGL::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
                                             const Tegra::Engines::Fermi2D::Regs::Surface& dst,
-                                             const MathUtil::Rectangle<u32>& src_rect,
-                                             const MathUtil::Rectangle<u32>& dst_rect) {
+                                             const Common::Rectangle<u32>& src_rect,
+                                             const Common::Rectangle<u32>& dst_rect) {
    MICROPROFILE_SCOPE(OpenGL_Blits);
    res_cache.FermiCopySurface(src, dst, src_rect, dst_rect);
    return true;
@@ -1034,7 +1022,7 @@ void RasterizerOpenGL::SyncViewport(OpenGLState& current_state) {
    for (std::size_t i = 0; i < viewport_count; i++) {
        auto& viewport = current_state.viewports[i];
        const auto& src = regs.viewports[i];
-        const MathUtil::Rectangle<s32> viewport_rect{regs.viewport_transform[i].GetRect()};
+        const Common::Rectangle<s32> viewport_rect{regs.viewport_transform[i].GetRect()};
        viewport.x = viewport_rect.left;
        viewport.y = viewport_rect.bottom;
        viewport.width = viewport_rect.GetWidth();
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -62,8 +62,8 @@ public:
    void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
    bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
                               const Tegra::Engines::Fermi2D::Regs::Surface& dst,
-                               const MathUtil::Rectangle<u32>& src_rect,
-                               const MathUtil::Rectangle<u32>& dst_rect) override;
+                               const Common::Rectangle<u32>& src_rect,
+                               const Common::Rectangle<u32>& dst_rect) override;
    bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr,
                           u32 pixel_stride) override;
    bool AccelerateDrawBatch(bool is_indexed) override;
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -3,6 +3,7 @@
 // Refer to the license.txt file included.

 #include <algorithm>
+#include <optional>
 #include <glad/glad.h>

 #include "common/alignment.h"
@@ -20,7 +21,7 @@
 #include "video_core/renderer_opengl/gl_rasterizer_cache.h"
 #include "video_core/renderer_opengl/utils.h"
 #include "video_core/surface.h"
-#include "video_core/textures/astc.h"
+#include "video_core/textures/convert.h"
 #include "video_core/textures/decoders.h"

 namespace OpenGL {
@@ -399,7 +400,7 @@ static const FormatTuple& GetFormatTuple(PixelFormat pixel_format, ComponentType
    return format;
 }

-MathUtil::Rectangle<u32> SurfaceParams::GetRect(u32 mip_level) const {
+Common::Rectangle<u32> SurfaceParams::GetRect(u32 mip_level) const {
    u32 actual_height{std::max(1U, unaligned_height >> mip_level)};
    if (IsPixelFormatASTC(pixel_format)) {
        // ASTC formats must stop at the ATSC block size boundary
@@ -423,7 +424,7 @@ void SwizzleFunc(const MortonSwizzleMode& mode, const SurfaceParams& params,
        for (u32 i = 0; i < params.depth; i++) {
            MortonSwizzle(mode, params.pixel_format, params.MipWidth(mip_level),
                          params.MipBlockHeight(mip_level), params.MipHeight(mip_level),
-                          params.MipBlockDepth(mip_level), params.tile_width_spacing, 1,
+                          params.MipBlockDepth(mip_level), 1, params.tile_width_spacing,
                          gl_buffer.data() + offset_gl, gl_size, params.addr + offset);
            offset += layer_size;
            offset_gl += gl_size;
@@ -549,6 +550,8 @@ CachedSurface::CachedSurface(const SurfaceParams& params)
    // alternatives. This signals a bug on those functions.
    const auto width = static_cast<GLsizei>(params.MipWidth(0));
    const auto height = static_cast<GLsizei>(params.MipHeight(0));
+    memory_size = params.MemorySize();
+    reinterpreted = false;

    const auto& format_tuple = GetFormatTuple(params.pixel_format, params.component_type);
    gl_internal_format = format_tuple.internal_format;
@@ -594,103 +597,6 @@ CachedSurface::CachedSurface(const SurfaceParams& params)
    }
 }

-static void ConvertS8Z24ToZ24S8(std::vector<u8>& data, u32 width, u32 height, bool reverse) {
-    union S8Z24 {
-        BitField<0, 24, u32> z24;
-        BitField<24, 8, u32> s8;
-    };
-    static_assert(sizeof(S8Z24) == 4, "S8Z24 is incorrect size");
-
-    union Z24S8 {
-        BitField<0, 8, u32> s8;
-        BitField<8, 24, u32> z24;
-    };
-    static_assert(sizeof(Z24S8) == 4, "Z24S8 is incorrect size");
-
-    S8Z24 s8z24_pixel{};
-    Z24S8 z24s8_pixel{};
-    constexpr auto bpp{GetBytesPerPixel(PixelFormat::S8Z24)};
-    for (std::size_t y = 0; y < height; ++y) {
-        for (std::size_t x = 0; x < width; ++x) {
-            const std::size_t offset{bpp * (y * width + x)};
-            if (reverse) {
-                std::memcpy(&z24s8_pixel, &data[offset], sizeof(Z24S8));
-                s8z24_pixel.s8.Assign(z24s8_pixel.s8);
-                s8z24_pixel.z24.Assign(z24s8_pixel.z24);
-                std::memcpy(&data[offset], &s8z24_pixel, sizeof(S8Z24));
-            } else {
-                std::memcpy(&s8z24_pixel, &data[offset], sizeof(S8Z24));
-                z24s8_pixel.s8.Assign(s8z24_pixel.s8);
-                z24s8_pixel.z24.Assign(s8z24_pixel.z24);
-                std::memcpy(&data[offset], &z24s8_pixel, sizeof(Z24S8));
-            }
-        }
-    }
-}
-
-/**
- * Helper function to perform software conversion (as needed) when loading a buffer from Switch
- * memory. This is for Maxwell pixel formats that cannot be represented as-is in OpenGL or with
- * typical desktop GPUs.
- */
-static void ConvertFormatAsNeeded_LoadGLBuffer(std::vector<u8>& data, PixelFormat pixel_format,
-                                               u32 width, u32 height, u32 depth) {
-    switch (pixel_format) {
-    case PixelFormat::ASTC_2D_4X4:
-    case PixelFormat::ASTC_2D_8X8:
-    case PixelFormat::ASTC_2D_8X5:
-    case PixelFormat::ASTC_2D_5X4:
-    case PixelFormat::ASTC_2D_5X5:
-    case PixelFormat::ASTC_2D_4X4_SRGB:
-    case PixelFormat::ASTC_2D_8X8_SRGB:
-    case PixelFormat::ASTC_2D_8X5_SRGB:
-    case PixelFormat::ASTC_2D_5X4_SRGB:
-    case PixelFormat::ASTC_2D_5X5_SRGB:
-    case PixelFormat::ASTC_2D_10X8:
-    case PixelFormat::ASTC_2D_10X8_SRGB: {
-        // Convert ASTC pixel formats to RGBA8, as most desktop GPUs do not support ASTC.
-        u32 block_width{};
-        u32 block_height{};
-        std::tie(block_width, block_height) = GetASTCBlockSize(pixel_format);
-        data =
-            Tegra::Texture::ASTC::Decompress(data, width, height, depth, block_width, block_height);
-        break;
-    }
-    case PixelFormat::S8Z24:
-        // Convert the S8Z24 depth format to Z24S8, as OpenGL does not support S8Z24.
-        ConvertS8Z24ToZ24S8(data, width, height, false);
-        break;
-    }
-}
-
-/**
- * Helper function to perform software conversion (as needed) when flushing a buffer from OpenGL to
- * Switch memory. This is for Maxwell pixel formats that cannot be represented as-is in OpenGL or
- * with typical desktop GPUs.
- */
-static void ConvertFormatAsNeeded_FlushGLBuffer(std::vector<u8>& data, PixelFormat pixel_format,
-                                                u32 width, u32 height) {
-    switch (pixel_format) {
-    case PixelFormat::ASTC_2D_4X4:
-    case PixelFormat::ASTC_2D_8X8:
-    case PixelFormat::ASTC_2D_4X4_SRGB:
-    case PixelFormat::ASTC_2D_8X8_SRGB:
-    case PixelFormat::ASTC_2D_5X5:
-    case PixelFormat::ASTC_2D_5X5_SRGB:
-    case PixelFormat::ASTC_2D_10X8:
-    case PixelFormat::ASTC_2D_10X8_SRGB: {
-        LOG_CRITICAL(HW_GPU, "Conversion of format {} after texture flushing is not implemented",
-                     static_cast<u32>(pixel_format));
-        UNREACHABLE();
-        break;
-    }
-    case PixelFormat::S8Z24:
-        // Convert the Z24S8 depth format to S8Z24, as OpenGL does not support S8Z24.
-        ConvertS8Z24ToZ24S8(data, width, height, true);
-        break;
-    }
-}
-
 MICROPROFILE_DEFINE(OpenGL_SurfaceLoad, "OpenGL", "Surface Load", MP_RGB(128, 192, 64));
 void CachedSurface::LoadGLBuffer() {
    MICROPROFILE_SCOPE(OpenGL_SurfaceLoad);
@@ -719,8 +625,16 @@ void CachedSurface::LoadGLBuffer() {
        }
    }
    for (u32 i = 0; i < params.max_mip_level; i++) {
-        ConvertFormatAsNeeded_LoadGLBuffer(gl_buffer[i], params.pixel_format, params.MipWidth(i),
-                                           params.MipHeight(i), params.MipDepth(i));
+        const u32 width = params.MipWidth(i);
+        const u32 height = params.MipHeight(i);
+        const u32 depth = params.MipDepth(i);
+        if (VideoCore::Surface::IsPixelFormatASTC(params.pixel_format)) {
+            // Reserve size for RGBA8 conversion
+            constexpr std::size_t rgba_bpp = 4;
+            gl_buffer[i].resize(std::max(gl_buffer[i].size(), width * height * depth * rgba_bpp));
+        }
+        Tegra::Texture::ConvertFromGuestToHost(gl_buffer[i].data(), params.pixel_format, width,
+                                               height, depth, true, true);
    }
 }

@@ -743,8 +657,8 @@ void CachedSurface::FlushGLBuffer() {
    glGetTextureImage(texture.handle, 0, tuple.format, tuple.type,
                      static_cast<GLsizei>(gl_buffer[0].size()), gl_buffer[0].data());
    glPixelStorei(GL_PACK_ROW_LENGTH, 0);
-    ConvertFormatAsNeeded_FlushGLBuffer(gl_buffer[0], params.pixel_format, params.width,
-                                        params.height);
+    Tegra::Texture::ConvertFromHostToGuest(gl_buffer[0].data(), params.pixel_format, params.width,
+                                           params.height, params.depth, true, true);
    const u8* const texture_src_data = Memory::GetPointer(params.addr);
    ASSERT(texture_src_data);
    if (params.is_tiled) {
@@ -962,30 +876,31 @@ Surface RasterizerCacheOpenGL::GetColorBufferSurface(std::size_t index, bool pre
    auto& gpu{Core::System::GetInstance().GPU().Maxwell3D()};
    const auto& regs{gpu.regs};

-    if ((gpu.dirty_flags.color_buffer & (1u << static_cast<u32>(index))) == 0) {
-        return last_color_buffers[index];
+    if (!gpu.dirty_flags.color_buffer[index]) {
+        return current_color_buffers[index];
    }
-    gpu.dirty_flags.color_buffer &= ~(1u << static_cast<u32>(index));
+    gpu.dirty_flags.color_buffer.reset(index);

    ASSERT(index < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets);

    if (index >= regs.rt_control.count) {
-        return last_color_buffers[index] = {};
+        return current_color_buffers[index] = {};
    }

    if (regs.rt[index].Address() == 0 || regs.rt[index].format == Tegra::RenderTargetFormat::NONE) {
-        return last_color_buffers[index] = {};
+        return current_color_buffers[index] = {};
    }

    const SurfaceParams color_params{SurfaceParams::CreateForFramebuffer(index)};

-    return last_color_buffers[index] = GetSurface(color_params, preserve_contents);
+    return current_color_buffers[index] = GetSurface(color_params, preserve_contents);
 }

 void RasterizerCacheOpenGL::LoadSurface(const Surface& surface) {
    surface->LoadGLBuffer();
    surface->UploadGLTexture(read_framebuffer.handle, draw_framebuffer.handle);
    surface->MarkAsModified(false, *this);
+    surface->MarkForReload(false);
 }

 Surface RasterizerCacheOpenGL::GetSurface(const SurfaceParams& params, bool preserve_contents) {
@@ -997,18 +912,23 @@ Surface RasterizerCacheOpenGL::GetSurface(const SurfaceParams& params, bool pres
    Surface surface{TryGet(params.addr)};
    if (surface) {
        if (surface->GetSurfaceParams().IsCompatibleSurface(params)) {
-            // Use the cached surface as-is
+            // Use the cached surface as-is unless it's not synced with memory
+            if (surface->MustReload())
+                LoadSurface(surface);
            return surface;
        } else if (preserve_contents) {
            // If surface parameters changed and we care about keeping the previous data, recreate
            // the surface from the old one
            Surface new_surface{RecreateSurface(surface, params)};
-            Unregister(surface);
+            UnregisterSurface(surface);
            Register(new_surface);
+            if (new_surface->IsUploaded()) {
+                RegisterReinterpretSurface(new_surface);
+            }
            return new_surface;
        } else {
            // Delete the old surface before creating a new one to prevent collisions.
-            Unregister(surface);
+            UnregisterSurface(surface);
        }
    }

@@ -1062,8 +982,8 @@ void RasterizerCacheOpenGL::FastLayeredCopySurface(const Surface& src_surface,
 }

 static bool BlitSurface(const Surface& src_surface, const Surface& dst_surface,
-                        const MathUtil::Rectangle<u32>& src_rect,
-                        const MathUtil::Rectangle<u32>& dst_rect, GLuint read_fb_handle,
+                        const Common::Rectangle<u32>& src_rect,
+                        const Common::Rectangle<u32>& dst_rect, GLuint read_fb_handle,
                        GLuint draw_fb_handle, GLenum src_attachment = 0, GLenum dst_attachment = 0,
                        std::size_t cubemap_face = 0) {

@@ -1193,7 +1113,7 @@ static bool BlitSurface(const Surface& src_surface, const Surface& dst_surface,
 void RasterizerCacheOpenGL::FermiCopySurface(
    const Tegra::Engines::Fermi2D::Regs::Surface& src_config,
    const Tegra::Engines::Fermi2D::Regs::Surface& dst_config,
-    const MathUtil::Rectangle<u32>& src_rect, const MathUtil::Rectangle<u32>& dst_rect) {
+    const Common::Rectangle<u32>& src_rect, const Common::Rectangle<u32>& dst_rect) {

    const auto& src_params = SurfaceParams::CreateForFermiCopySurface(src_config);
    const auto& dst_params = SurfaceParams::CreateForFermiCopySurface(dst_config);
@@ -1257,7 +1177,11 @@ Surface RasterizerCacheOpenGL::RecreateSurface(const Surface& old_surface,
    case SurfaceTarget::TextureCubemap:
    case SurfaceTarget::Texture2DArray:
    case SurfaceTarget::TextureCubeArray:
-        FastLayeredCopySurface(old_surface, new_surface);
+        if (old_params.pixel_format == new_params.pixel_format)
+            FastLayeredCopySurface(old_surface, new_surface);
+        else {
+            AccurateCopySurface(old_surface, new_surface);
+        }
        break;
    default:
        LOG_CRITICAL(Render_OpenGL, "Unimplemented surface target={}",
@@ -1286,4 +1210,107 @@ Surface RasterizerCacheOpenGL::TryGetReservedSurface(const SurfaceParams& params
    return {};
 }

+static std::optional<u32> TryFindBestMipMap(std::size_t memory, const SurfaceParams params,
+                                            u32 height) {
+    for (u32 i = 0; i < params.max_mip_level; i++) {
+        if (memory == params.GetMipmapSingleSize(i) && params.MipHeight(i) == height) {
+            return {i};
+        }
+    }
+    return {};
+}
+
+static std::optional<u32> TryFindBestLayer(VAddr addr, const SurfaceParams params, u32 mipmap) {
+    const std::size_t size = params.LayerMemorySize();
+    VAddr start = params.addr + params.GetMipmapLevelOffset(mipmap);
+    for (u32 i = 0; i < params.depth; i++) {
+        if (start == addr) {
+            return {i};
+        }
+        start += size;
+    }
+    return {};
+}
+
+static bool LayerFitReinterpretSurface(RasterizerCacheOpenGL& cache, const Surface render_surface,
+                                       const Surface blitted_surface) {
+    const auto& dst_params = blitted_surface->GetSurfaceParams();
+    const auto& src_params = render_surface->GetSurfaceParams();
+    const std::size_t src_memory_size = src_params.size_in_bytes;
+    const std::optional<u32> level =
+        TryFindBestMipMap(src_memory_size, dst_params, src_params.height);
+    if (level.has_value()) {
+        if (src_params.width == dst_params.MipWidthGobAligned(*level) &&
+            src_params.height == dst_params.MipHeight(*level) &&
+            src_params.block_height >= dst_params.MipBlockHeight(*level)) {
+            const std::optional<u32> slot =
+                TryFindBestLayer(render_surface->GetAddr(), dst_params, *level);
+            if (slot.has_value()) {
+                glCopyImageSubData(render_surface->Texture().handle,
+                                   SurfaceTargetToGL(src_params.target), 0, 0, 0, 0,
+                                   blitted_surface->Texture().handle,
+                                   SurfaceTargetToGL(dst_params.target), *level, 0, 0, *slot,
+                                   dst_params.MipWidth(*level), dst_params.MipHeight(*level), 1);
+                blitted_surface->MarkAsModified(true, cache);
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
+static bool IsReinterpretInvalid(const Surface render_surface, const Surface blitted_surface) {
+    const VAddr bound1 = blitted_surface->GetAddr() + blitted_surface->GetMemorySize();
+    const VAddr bound2 = render_surface->GetAddr() + render_surface->GetMemorySize();
+    if (bound2 > bound1)
+        return true;
+    const auto& dst_params = blitted_surface->GetSurfaceParams();
+    const auto& src_params = render_surface->GetSurfaceParams();
+    return (dst_params.component_type != src_params.component_type);
+}
+
+static bool IsReinterpretInvalidSecond(const Surface render_surface,
+                                       const Surface blitted_surface) {
+    const auto& dst_params = blitted_surface->GetSurfaceParams();
+    const auto& src_params = render_surface->GetSurfaceParams();
+    return (dst_params.height > src_params.height && dst_params.width > src_params.width);
+}
+
+bool RasterizerCacheOpenGL::PartialReinterpretSurface(Surface triggering_surface,
+                                                      Surface intersect) {
+    if (IsReinterpretInvalid(triggering_surface, intersect)) {
+        UnregisterSurface(intersect);
+        return false;
+    }
+    if (!LayerFitReinterpretSurface(*this, triggering_surface, intersect)) {
+        if (IsReinterpretInvalidSecond(triggering_surface, intersect)) {
+            UnregisterSurface(intersect);
+            return false;
+        }
+        FlushObject(intersect);
+        FlushObject(triggering_surface);
+        intersect->MarkForReload(true);
+    }
+    return true;
+}
+
+void RasterizerCacheOpenGL::SignalPreDrawCall() {
+    if (texception && GLAD_GL_ARB_texture_barrier) {
+        glTextureBarrier();
+    }
+    texception = false;
+}
+
+void RasterizerCacheOpenGL::SignalPostDrawCall() {
+    for (u32 i = 0; i < Maxwell::NumRenderTargets; i++) {
+        if (current_color_buffers[i] != nullptr) {
+            Surface intersect = CollideOnReinterpretedSurface(current_color_buffers[i]->GetAddr());
+            if (intersect != nullptr) {
+                PartialReinterpretSurface(current_color_buffers[i], intersect);
+                texception = true;
+            }
+        }
+    }
+}
+
 } // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
@@ -28,15 +28,15 @@ namespace OpenGL {

 class CachedSurface;
 using Surface = std::shared_ptr<CachedSurface>;
-using SurfaceSurfaceRect_Tuple = std::tuple<Surface, Surface, MathUtil::Rectangle<u32>>;
+using SurfaceSurfaceRect_Tuple = std::tuple<Surface, Surface, Common::Rectangle<u32>>;

 using SurfaceTarget = VideoCore::Surface::SurfaceTarget;
 using SurfaceType = VideoCore::Surface::SurfaceType;
 using PixelFormat = VideoCore::Surface::PixelFormat;
 using ComponentType = VideoCore::Surface::ComponentType;
+using Maxwell = Tegra::Engines::Maxwell3D::Regs;

 struct SurfaceParams {
-
    enum class SurfaceClass {
        Uploaded,
        RenderTarget,
@@ -72,7 +72,7 @@ struct SurfaceParams {
    }

    /// Returns the rectangle corresponding to this surface
-    MathUtil::Rectangle<u32> GetRect(u32 mip_level = 0) const;
+    Common::Rectangle<u32> GetRect(u32 mip_level = 0) const;

    /// Returns the total size of this surface in bytes, adjusted for compression
    std::size_t SizeInBytesRaw(bool ignore_tiled = false) const {
@@ -141,10 +141,18 @@ struct SurfaceParams {
        return offset;
    }

+    std::size_t GetMipmapSingleSize(u32 mip_level) const {
+        return InnerMipmapMemorySize(mip_level, false, is_layered);
+    }
+
    u32 MipWidth(u32 mip_level) const {
        return std::max(1U, width >> mip_level);
    }

+    u32 MipWidthGobAligned(u32 mip_level) const {
+        return Common::AlignUp(std::max(1U, width >> mip_level), 64U * 8U / GetFormatBpp());
+    }
+
    u32 MipHeight(u32 mip_level) const {
        return std::max(1U, height >> mip_level);
    }
@@ -169,20 +177,27 @@ struct SurfaceParams {
    }

    u32 MipBlockDepth(u32 mip_level) const {
-        if (mip_level == 0)
+        if (mip_level == 0) {
            return block_depth;
-        if (is_layered)
+        }
+
+        if (is_layered) {
            return 1;
-        u32 depth = MipDepth(mip_level);
+        }
+
+        const u32 mip_depth = MipDepth(mip_level);
        u32 bd = 32;
-        while (bd > 1 && depth * 2 <= bd) {
+        while (bd > 1 && mip_depth * 2 <= bd) {
            bd >>= 1;
        }
+
        if (bd == 32) {
-            u32 bh = MipBlockHeight(mip_level);
-            if (bh >= 4)
+            const u32 bh = MipBlockHeight(mip_level);
+            if (bh >= 4) {
                return 16;
+            }
        }
+
        return bd;
    }

@@ -340,6 +355,10 @@ public:
        return cached_size_in_bytes;
    }

+    std::size_t GetMemorySize() const {
+        return memory_size;
+    }
+
    void Flush() override {
        FlushGLBuffer();
    }
@@ -389,6 +408,26 @@ public:
                       Tegra::Texture::SwizzleSource swizzle_z,
                       Tegra::Texture::SwizzleSource swizzle_w);

+    void MarkReinterpreted() {
+        reinterpreted = true;
+    }
+
+    bool IsReinterpreted() const {
+        return reinterpreted;
+    }
+
+    void MarkForReload(bool reload) {
+        must_reload = reload;
+    }
+
+    bool MustReload() const {
+        return must_reload;
+    }
+
+    bool IsUploaded() const {
+        return params.identity == SurfaceParams::SurfaceClass::Uploaded;
+    }
+
 private:
    void UploadGLMipmapTexture(u32 mip_map, GLuint read_fb_handle, GLuint draw_fb_handle);

@@ -402,6 +441,9 @@ private:
    GLenum gl_internal_format{};
    std::size_t cached_size_in_bytes{};
    std::array<GLenum, 4> swizzle{GL_RED, GL_GREEN, GL_BLUE, GL_ALPHA};
+    std::size_t memory_size;
+    bool reinterpreted = false;
+    bool must_reload = false;
 };

 class RasterizerCacheOpenGL final : public RasterizerCache<Surface> {
@@ -424,8 +466,11 @@ public:
    /// Copies the contents of one surface to another
    void FermiCopySurface(const Tegra::Engines::Fermi2D::Regs::Surface& src_config,
                          const Tegra::Engines::Fermi2D::Regs::Surface& dst_config,
-                          const MathUtil::Rectangle<u32>& src_rect,
-                          const MathUtil::Rectangle<u32>& dst_rect);
+                          const Common::Rectangle<u32>& src_rect,
+                          const Common::Rectangle<u32>& dst_rect);
+
+    void SignalPreDrawCall();
+    void SignalPostDrawCall();

 private:
    void LoadSurface(const Surface& surface);
@@ -443,6 +488,10 @@ private:
    /// Tries to get a reserved surface for the specified parameters
    Surface TryGetReservedSurface(const SurfaceParams& params);

+    // Partialy reinterpret a surface based on a triggering_surface that collides with it.
+    // returns true if the reinterpret was successful, false in case it was not.
+    bool PartialReinterpretSurface(Surface triggering_surface, Surface intersect);
+
    /// Performs a slow but accurate surface copy, flushing to RAM and reinterpreting the data
    void AccurateCopySurface(const Surface& src_surface, const Surface& dst_surface);
    void FastLayeredCopySurface(const Surface& src_surface, const Surface& dst_surface);
@@ -459,12 +508,50 @@ private:
    OGLFramebuffer read_framebuffer;
    OGLFramebuffer draw_framebuffer;

+    bool texception = false;
+
    /// Use a Pixel Buffer Object to download the previous texture and then upload it to the new one
    /// using the new format.
    OGLBuffer copy_pbo;

-    std::array<Surface, Tegra::Engines::Maxwell3D::Regs::NumRenderTargets> last_color_buffers;
+    std::array<Surface, Maxwell::NumRenderTargets> last_color_buffers;
+    std::array<Surface, Maxwell::NumRenderTargets> current_color_buffers;
    Surface last_depth_buffer;
+
+    using SurfaceIntervalCache = boost::icl::interval_map<VAddr, Surface>;
+    using SurfaceInterval = typename SurfaceIntervalCache::interval_type;
+
+    static auto GetReinterpretInterval(const Surface& object) {
+        return SurfaceInterval::right_open(object->GetAddr() + 1,
+                                           object->GetAddr() + object->GetMemorySize() - 1);
+    }
+
+    // Reinterpreted surfaces are very fragil as the game may keep rendering into them.
+    SurfaceIntervalCache reinterpreted_surfaces;
+
+    void RegisterReinterpretSurface(Surface reinterpret_surface) {
+        auto interval = GetReinterpretInterval(reinterpret_surface);
+        reinterpreted_surfaces.insert({interval, reinterpret_surface});
+        reinterpret_surface->MarkReinterpreted();
+    }
+
+    Surface CollideOnReinterpretedSurface(VAddr addr) const {
+        const SurfaceInterval interval{addr};
+        for (auto& pair :
+             boost::make_iterator_range(reinterpreted_surfaces.equal_range(interval))) {
+            return pair.second;
+        }
+        return nullptr;
+    }
+
+    /// Unregisters an object from the cache
+    void UnregisterSurface(const Surface& object) {
+        if (object->IsReinterpreted()) {
+            auto interval = GetReinterpretInterval(object);
+            reinterpreted_surfaces.erase(interval);
+        }
+        Unregister(object);
+    }
 };

 } // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -20,6 +20,7 @@
 namespace OpenGL::GLShader {

 using Tegra::Shader::Attribute;
+using Tegra::Shader::AttributeUse;
 using Tegra::Shader::Header;
 using Tegra::Shader::IpaInterpMode;
 using Tegra::Shader::IpaMode;
@@ -288,34 +289,22 @@ private:
        code.AddNewLine();
    }

-    std::string GetInputFlags(const IpaMode& input_mode) {
-        const IpaSampleMode sample_mode = input_mode.sampling_mode;
-        const IpaInterpMode interp_mode = input_mode.interpolation_mode;
+    std::string GetInputFlags(AttributeUse attribute) {
        std::string out;

-        switch (interp_mode) {
-        case IpaInterpMode::Flat:
+        switch (attribute) {
+        case AttributeUse::Constant:
            out += "flat ";
            break;
-        case IpaInterpMode::Linear:
+        case AttributeUse::ScreenLinear:
            out += "noperspective ";
            break;
-        case IpaInterpMode::Perspective:
+        case AttributeUse::Perspective:
            // Default, Smooth
            break;
        default:
-            UNIMPLEMENTED_MSG("Unhandled IPA interp mode: {}", static_cast<u32>(interp_mode));
-        }
-        switch (sample_mode) {
-        case IpaSampleMode::Centroid:
-            // It can be implemented with the "centroid " keyword in GLSL
-            UNIMPLEMENTED_MSG("Unimplemented IPA sampler mode centroid");
-            break;
-        case IpaSampleMode::Default:
-            // Default, n/a
-            break;
-        default:
-            UNIMPLEMENTED_MSG("Unimplemented IPA sampler mode: {}", static_cast<u32>(sample_mode));
+            LOG_CRITICAL(HW_GPU, "Unused attribute being fetched");
+            UNREACHABLE();
        }
        return out;
    }
@@ -324,16 +313,11 @@ private:
        const auto& attributes = ir.GetInputAttributes();
        for (const auto element : attributes) {
            const Attribute::Index index = element.first;
-            const IpaMode& input_mode = *element.second.begin();
            if (index < Attribute::Index::Attribute_0 || index > Attribute::Index::Attribute_31) {
                // Skip when it's not a generic attribute
                continue;
            }

-            ASSERT(element.second.size() > 0);
-            UNIMPLEMENTED_IF_MSG(element.second.size() > 1,
-                                 "Multiple input flag modes are not supported in GLSL");
-
            // TODO(bunnei): Use proper number of elements for these
            u32 idx = static_cast<u32>(index) - static_cast<u32>(Attribute::Index::Attribute_0);
            if (stage != ShaderStage::Vertex) {
@@ -345,8 +329,14 @@ private:
            if (stage == ShaderStage::Geometry) {
                attr = "gs_" + attr + "[]";
            }
-            code.AddLine("layout (location = " + std::to_string(idx) + ") " +
-                         GetInputFlags(input_mode) + "in vec4 " + attr + ';');
+            std::string suffix;
+            if (stage == ShaderStage::Fragment) {
+                const auto input_mode =
+                    header.ps.GetAttributeUse(idx - GENERIC_VARYING_START_LOCATION);
+                suffix = GetInputFlags(input_mode);
+            }
+            code.AddLine("layout (location = " + std::to_string(idx) + ") " + suffix + "in vec4 " +
+                         attr + ';');
        }
        if (!attributes.empty())
            code.AddNewLine();
@@ -616,17 +606,8 @@ private:

    std::string VisitOperand(Operation operation, std::size_t operand_index, Type type) {
        std::string value = VisitOperand(operation, operand_index);
-
        switch (type) {
-        case Type::Bool:
-        case Type::Bool2:
-        case Type::Float:
-            return value;
-        case Type::Int:
-            return "ftoi(" + value + ')';
-        case Type::Uint:
-            return "ftou(" + value + ')';
-        case Type::HalfFloat:
+        case Type::HalfFloat: {
            const auto half_meta = std::get_if<MetaHalfArithmetic>(&operation.GetMeta());
            if (!half_meta) {
                value = "toHalf2(" + value + ')';
@@ -643,6 +624,26 @@ private:
                return "vec2(toHalf2(" + value + ")[1])";
            }
        }
+        default:
+            return CastOperand(value, type);
+        }
+    }
+
+    std::string CastOperand(const std::string& value, Type type) const {
+        switch (type) {
+        case Type::Bool:
+        case Type::Bool2:
+        case Type::Float:
+            return value;
+        case Type::Int:
+            return "ftoi(" + value + ')';
+        case Type::Uint:
+            return "ftou(" + value + ')';
+        case Type::HalfFloat:
+            // Can't be handled as a stand-alone value
+            UNREACHABLE();
+            return value;
+        }
        UNREACHABLE();
        return value;
    }
@@ -650,6 +651,7 @@ private:
    std::string BitwiseCastResult(std::string value, Type type, bool needs_parenthesis = false) {
        switch (type) {
        case Type::Bool:
+        case Type::Bool2:
        case Type::Float:
            if (needs_parenthesis) {
                return '(' + value + ')';
@@ -721,7 +723,7 @@ private:
        const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
        ASSERT(meta);

-        const auto count = static_cast<u32>(operation.GetOperandsCount());
+        const std::size_t count = operation.GetOperandsCount();
        const bool has_array = meta->sampler.IsArray();
        const bool has_shadow = meta->sampler.IsShadow();

@@ -732,10 +734,10 @@ private:

        expr += coord_constructors.at(count + (has_array ? 1 : 0) + (has_shadow ? 1 : 0) - 1);
        expr += '(';
-        for (u32 i = 0; i < count; ++i) {
+        for (std::size_t i = 0; i < count; ++i) {
            expr += Visit(operation[i]);

-            const u32 next = i + 1;
+            const std::size_t next = i + 1;
            if (next < count || has_array || has_shadow)
                expr += ", ";
        }
@@ -1206,25 +1208,26 @@ private:
        const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
        ASSERT(meta);
        UNIMPLEMENTED_IF(meta->sampler.IsArray());
-        UNIMPLEMENTED_IF(!meta->extras.empty());
-
-        const auto count = static_cast<u32>(operation.GetOperandsCount());
+        const std::size_t count = operation.GetOperandsCount();

        std::string expr = "texelFetch(";
        expr += GetSampler(meta->sampler);
        expr += ", ";

-        expr += constructors.at(count - 1);
+        expr += constructors.at(operation.GetOperandsCount() - 1);
        expr += '(';
-        for (u32 i = 0; i < count; ++i) {
+        for (std::size_t i = 0; i < count; ++i) {
            expr += VisitOperand(operation, i, Type::Int);
-
-            const u32 next = i + 1;
+            const std::size_t next = i + 1;
            if (next == count)
                expr += ')';
-            if (next < count)
+            else if (next < count)
                expr += ", ";
        }
+        for (std::size_t i = 0; i < meta->extras.size(); ++i) {
+            expr += ", ";
+            expr += CastOperand(Visit(meta->extras.at(i)), Type::Int);
+        }
        expr += ')';

        return expr + GetSwizzle(meta->element);
@@ -1571,4 +1574,4 @@ ProgramResult Decompile(const ShaderIR& ir, Maxwell::ShaderStage stage, const st
    return {decompiler.GetResult(), decompiler.GetShaderEntries()};
 }

-} // namespace OpenGL::GLShader
+} // namespace OpenGL::GLShader
--- a/Show More
+++ b/Show More