Compare commits

..

41 Commits

Author SHA1 Message Date
Wunkolo
d9b1199ffb cpu_detect: Revert __cpuid{ex} array-type argument
Restores compatibility with MSVC's `__cpuid` intrinsic.
2022-03-09 19:50:01 -08:00
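Context for the revert (a sketch, not yuzu's code): MSVC declares `__cpuid`/`__cpuidex` in `<intrin.h>` with a plain `int[4]` output parameter, so the cross-platform wrapper has to keep that array-type argument to stay source-compatible.

```cpp
// Sketch only: the MSVC intrinsic signatures the revert restores compatibility with.
//   void __cpuid(int cpuInfo[4], int function_id);
//   void __cpuidex(int cpuInfo[4], int function_id, int subfunction_id);
#include <intrin.h>
#include <cstdio>

int main() {
    int info[4]{};          // filled as EAX, EBX, ECX, EDX
    __cpuid(info, 0);       // leaf 0: max standard leaf + vendor string
    __cpuidex(info, 7, 0);  // leaf 7, subleaf 0: extended feature flags
    std::printf("EBX=%08x\n", info[1]);
}
```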
bunnei
9a97ef4647 Merge pull request #7936 from Wunkolo/cpu_detect
cpu_detect: Refactor detection of processor features
2022-03-09 15:34:42 -08:00
Wunkolo
873a9fa7e5 cpu_detect: Add missing lzcnt detection 2022-03-09 13:57:47 -08:00
Wunkolo
ec5f3351b6 cpu_detect: Refactor cpu/manufacturer identification
Set the zero-enum value to Unknown
Move the Manufacturer enum into the CPUCaps structure namespace
Add a "ParseManufacturer" utility function
Fix cpu/brand string buffer sizes(!)
2022-03-09 13:57:47 -08:00
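Background for the buffer-size fix and the new helper (a sketch, not yuzu's code): CPUID leaf 0 returns the 12-byte vendor string split across EBX, EDX and ECX in that order, so the buffer needs 13 bytes including the terminator, and the resulting string ("GenuineIntel", "AuthenticAMD", "HygonGenuine") is what ParseManufacturer matches on.

```cpp
// Minimal sketch assuming MSVC's <intrin.h> __cpuid; other compilers need a
// wrapper like the one shown in the cpu_detect.cpp diff further down.
#include <intrin.h>
#include <cstring>

void ReadVendorString(char (&vendor)[13]) {
    int regs[4]{};                          // EAX, EBX, ECX, EDX
    __cpuid(regs, 0);
    std::memcpy(&vendor[0], &regs[1], 4);   // EBX
    std::memcpy(&vendor[4], &regs[3], 4);   // EDX
    std::memcpy(&vendor[8], &regs[2], 4);   // ECX
    vendor[12] = '\0';                      // 12 chars + NUL => 13-byte buffer
}
```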
Wunkolo
86e9e60f07 cpu_detect: Update array-types to span and array
Update some uses of `int` to more explicitly sized types as well
2022-03-09 13:57:47 -08:00
Wunkolo
3c33ba7f18 cpu_detect: Utilize Bit<N> utility function 2022-03-09 13:57:47 -08:00
Wunkolo
d233de8194 cpu_detect: Compact capability fields
As this structure gets more explicit, bools can be bitfields and
small enums can use smaller types for their span of values.
2022-03-09 13:57:47 -08:00
Wunkolo
add2cfcb96 bit_util: Add bit utility function
Extracts a single bit, as a bool, at the specified compile-time index.
2022-03-09 13:57:47 -08:00
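A usage sketch of the new helper; the definition below mirrors the one added in the bit_util.h diff near the end of this page so that the example is self-contained.

```cpp
// Sketch: extract one bit of a CPUID register as a bool, with the index checked
// at compile time (standalone copy of the Bit<N> helper added by this commit).
#include <cstddef>
#include <cstdint>

template <std::size_t bit_index, typename T>
constexpr bool Bit(const T value) {
    static_assert(bit_index < 8 * sizeof(T), "bit_index must be smaller than size of T");
    return ((value >> bit_index) & T(1)) == T(1);
}

constexpr std::uint32_t ecx = 0x0200'0001;  // example CPUID.1:ECX value
static_assert(Bit<0>(ecx));    // SSE3
static_assert(Bit<25>(ecx));   // AES
static_assert(!Bit<20>(ecx));  // SSE4.2 not set
```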
bunnei
6f670381cf Merge pull request #7975 from bunnei/ldr-fix
hle: service: ldr: Use deterministic addresses when mapping NROs.
2022-03-08 17:39:03 -08:00
bunnei
853e58e593 hle: service: ldr: Use deterministic addresses when mapping NROs.
- Instead of randomization, choose in-order addresses for where to map NROs into memory.
- This results in predictable behavior when debugging and consistent behavior when reproducing issues.
2022-03-08 17:38:20 -08:00
bunnei
f2743b41b0 Merge pull request #7986 from lat9nq/vk-callback
core, video_core: Fix two crashes when failing to create the emulated GPU instance
2022-03-08 12:36:57 -08:00
Fernando S
35309f27ed Merge pull request #7989 from degasus/maxwell_LUT3
shader_recompiler/LOP3: Use brute force python results within switch/case.
2022-03-08 15:40:31 +01:00
Markus Wick
c78c8190d5 shader_recompiler/LOP3: Use brute force python results within switch/case.
Thanks to @asLody for optimizing this function. This drew attention to the fact that this function should be optimized further.

The current table assumes that the host GPU is able to invert for free, so only AND, OR, and XOR are accumulated in the performance metric.

Performance results:

Instructions
0: 8
1: 30
2: 114
3: 80
4: 24

Latency
0: 8
1: 30
2: 194
3: 24
2022-03-08 09:44:28 +01:00
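For reference, LOP3.LUT evaluates an arbitrary three-input boolean function: the 8-bit immediate is a truth table indexed by the bits of (a, b, c). The brute-forced table scores each of the 256 immediates by how many AND/OR/XOR operations the emitted expression needs, with inversions assumed free. A software reference evaluation (a sketch, not the recompiler's switch/case) looks like this:

```cpp
// Reference semantics of LOP3.LUT (sketch, not shader_recompiler code): for
// every bit position, the bits of a, b, c form a 3-bit index into the immediate.
#include <cstdint>

constexpr std::uint32_t Lop3(std::uint32_t a, std::uint32_t b, std::uint32_t c,
                             std::uint8_t lut) {
    std::uint32_t result = 0;
    for (int bit = 0; bit < 32; ++bit) {
        const std::uint32_t index =
            (((a >> bit) & 1) << 2) | (((b >> bit) & 1) << 1) | ((c >> bit) & 1);
        result |= ((lut >> index) & 1u) << bit;
    }
    return result;
}

static_assert(Lop3(0xF0F0F0F0u, 0xCCCCCCCCu, 0xAAAAAAAAu, 0x80) ==
              (0xF0F0F0F0u & 0xCCCCCCCCu & 0xAAAAAAAAu));  // immediate 0x80 == a & b & c
```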
bunnei
1f37896066 Merge pull request #7974 from bunnei/improve-code-mem
Kernel Memory Updates (Part 5): Revamp MapCodeMemory and UnmapCodeMemory.
2022-03-07 20:28:39 -08:00
bunnei
749f76e6fe hle: kernel: KPageTable: Improve implementations of MapCodeMemory and UnmapCodeMemory.
- This makes these functions more accurate to the real HOS implementations.
- Fixes memory access issues in Super Smash Bros. Ultimate that occur when un/mapping NROs.
2022-03-07 17:18:20 -08:00
lat9nq
b5e60ae1b0 video_core: Cancel Scoped's exit call on GPU failure
When CreateRenderer fails, the GraphicsContext that was std::move'd into
it is destroyed before the Scoped that was created to keep it current. In
that case, the GraphicsContext::Scoped destructor still runs at the end of
the function, and because the context has already been destroyed, the
Scoped crashes as it attempts to call the destroyed object's DoneCurrent
function.

Since we know when the call would be invalid, call the Scoped's Cancel
method. This prevents it from calling a method on a destroyed object.
2022-03-07 18:21:56 -05:00
lat9nq
1f24a4e520 emu_window: Create a way to Cancel the exit of a Scoped
If a GraphicsContext is destroyed before its Scoped is destroyed, this
causes a crash as the Scoped tries to call a method on the destroyed
context on exit.

Add a way to Cancel the call when we know that calling the
GraphicsContext will not work.
2022-03-07 18:21:56 -05:00
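A minimal sketch of the mechanism these two commits add. The Scoped/Cancel/MakeCurrent/DoneCurrent names match the emu_window diff further down; TryCreateRenderer and the surrounding flow are illustrative assumptions, not yuzu's exact code.

```cpp
// Sketch: cancel the Scoped when the context it wraps will not outlive it,
// so ~Scoped() does not call DoneCurrent() on a dead object.
struct GraphicsContext {
    void MakeCurrent() {}
    void DoneCurrent() {}

    class Scoped {
    public:
        explicit Scoped(GraphicsContext& context_) : context{context_} { context.MakeCurrent(); }
        ~Scoped() {
            if (active) {
                context.DoneCurrent();
            }
        }
        // Call when the context is about to be destroyed before this Scoped.
        void Cancel() { active = false; }

    private:
        GraphicsContext& context;
        bool active{true};
    };
};

bool TryCreateRenderer(GraphicsContext& context, bool creation_fails) {
    GraphicsContext::Scoped scope{context};
    if (creation_fails) {
        scope.Cancel();  // skip DoneCurrent() in ~Scoped()
        return false;
    }
    return true;
}
```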
Fernando S
58b52f4884 Merge pull request #7930 from asLody/dma-semaphore
MaxwellDMA: Implement semaphore operations
2022-03-07 21:53:38 +01:00
lat9nq
381f1dd2c9 core: Don't shutdown a null GPU
When CreateGPU fails, yuzu would try to shut down the GPU instance
regardless of whether any instance was actually created.

Check for nullptr before calling its methods to prevent a crash.
2022-03-07 15:25:20 -05:00
Lody
4498908e72 MaxwellDMA: Implement semaphore operations 2022-03-07 13:46:18 +08:00
Ameer J
370e480c8c gl_graphics_pipeline: Improve shader builder synchronization using fences (#7969)
* gl_graphics_pipeline: Improve shader builder synchronization

Make use of GLsync objects to ensure better synchronization between shader builder threads and the main context

* gl_graphics_pipeline: Make built_fence access threadsafe

* gl_graphics_pipeline: Use GLsync objects only when building in parallel

* gl_graphics_pipeline: Replace GetSync calls with non-blocking waits

The spec states that a ClientWait on a Fence object ensures the changes propagate to the calling context
2022-03-06 16:46:49 +01:00
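A hedged sketch of the fence pattern this PR describes, using plain OpenGL calls. `IsBuilt` and `built_fence` are illustrative names, and an OpenGL 3.2+ loader header (e.g. glad) is assumed to be included; this is not the yuzu implementation.

```cpp
// Builder thread, after compiling/linking on its shared context:
//     GLsync built_fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
//     glFlush();  // ensure the fence command reaches the GL server
//
// Main context: poll without blocking. Per the spec, a client wait that
// completes also makes the builder context's changes visible here.
bool IsBuilt(GLsync built_fence) {
    const GLenum state = glClientWaitSync(built_fence, 0, 0);  // timeout 0 => non-blocking poll
    return state == GL_ALREADY_SIGNALED || state == GL_CONDITION_SATISFIED;
}
```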
Fernando S
5192c64991 Merge pull request #7973 from Morph1984/debug-crash
host_memory: Fix fastmem crashes in debug builds
2022-03-06 04:49:27 +01:00
bunnei
a31c195749 Merge pull request #7935 from Wunkolo/logging-join-fix
logging: Convert `backend_thread` into an `std::jthread`
2022-03-02 19:09:26 -08:00
bunnei
3ab82e7582 Merge pull request #7956 from bunnei/improve-mem-manager
Kernel Memory Updates (Part 4): Revamp KMemoryManager & other fixes
2022-03-02 17:55:51 -08:00
Morph
b33f23cc46 host_memory: Fix fastmem crashes in debug builds
It is possible for virtual_offset to be non-zero when the iterator is at the beginning, and std::prev(it) may then be evaluated on a begin iterator, leading to a crash in debug mode.

Co-Authored-By: Fernando S. <1731197+FernandoS27@users.noreply.github.com>
2022-03-02 18:36:59 -05:00
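A minimal illustration of the pitfall (generic code, not the host_memory implementation): with short-circuit `||`, `std::prev(it)` is still evaluated whenever the first operand is false, which includes `it == begin()` with a non-zero offset, and debug iterators then assert. The ternary in the diff below only evaluates `std::prev` when the iterator is not at the beginning.

```cpp
// Generic sketch of the debug-iterator crash and its fix.
#include <iterator>
#include <set>

bool PrecededBy(const std::set<int>& values, std::set<int>::const_iterator it, int expected) {
    // Buggy form: if `it == values.begin()` but the first condition is false,
    // std::prev(values.begin()) is still evaluated -> UB / debug assertion.
    //   return (it == values.begin() && expected == 0) || *std::prev(it) == expected;
    // Fixed form: only touch std::prev(it) when the iterator is not at begin().
    return it == values.begin() ? expected == 0 : *std::prev(it) == expected;
}
```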
Fernando S
e06a133717 Merge pull request #7959 from merryhime/cmpxchg
dynarmic: Inline exclusive memory accesses
2022-03-01 22:50:52 +01:00
Mai M
3c47570563 Merge pull request #7967 from zhaobot/tx-update-20220301023432
Update translations (2022-03-01)
2022-03-01 00:50:28 -05:00
merry
ec9689f200 dynarmic: Update to latest master 2022-02-28 20:10:13 +00:00
bunnei
14d28a043d hle: kernel: Re-create memory layout at initialization.
- As this can only be derived once.
2022-02-27 18:00:09 -08:00
bunnei
16e5954fcb hle: kernel: Remove unused pool locals. 2022-02-27 18:00:09 -08:00
bunnei
f87f076162 hle: kernel: k_memory_manager: Rework for latest kernel behavior.
- Updates the KMemoryManager implementation against latest documentation.
- Reworks KMemoryLayout to be accessed throughout the kernel.
- Fixes an issue with pool sizes being incorrectly reported.
2022-02-27 18:00:09 -08:00
Wunkolo
913c2bd2cb logging: Convert backend_thread into an std::jthread
Was getting an unhandled `invalid_argument` [exception](https://en.cppreference.com/w/cpp/thread/thread/join) during
shutdown on my linux machine. This removes the need for a `StopBackendThread` function entirely, since `jthread`
[automatically checks that the thread is joinable and requests stop on its token before joining](https://en.cppreference.com/w/cpp/thread/jthread/~jthread), which also covers the case where `StartBackendThread` was never called.
2022-02-27 16:23:52 -08:00
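A minimal C++20 sketch of the `std::jthread` behaviour being relied on here (not the yuzu logging backend): the destructor requests stop via the associated `std::stop_token` and then joins, and only does so if the thread is joinable, so a thread that was never started no longer throws from an explicit `join()`.

```cpp
#include <chrono>
#include <thread>

int main() {
    std::jthread worker([](std::stop_token token) {
        while (!token.stop_requested()) {
            std::this_thread::sleep_for(std::chrono::milliseconds(10));
        }
    });
    // No explicit request_stop()/join(): ~jthread() requests stop and joins,
    // and silently does nothing if the thread is not joinable.
}
```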
merry
16784e5bb3 dynarmic: Inline exclusive memory accesses
Inlines implementation of exclusive instructions into JITted code,
improving performance of applications relying heavily on these
instructions.

We also fastmem these instructions for additional speed, with
support for appropriate recompilation on fastmem failure.

An unsafe optimization to disable the intercore global_monitor is also
provided, should one wish to rely solely on cmpxchg semantics for
safety.

See also: merryhime/dynarmic#664
2022-02-27 19:40:05 +00:00
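A rough model, as an assumption rather than dynarmic's implementation, of the cmpxchg semantics the unsafe option falls back on: a store-exclusive only succeeds if memory still holds the value observed by the paired load-exclusive, which a host compare-and-swap expresses directly but without the cross-core monitor bookkeeping.

```cpp
// Sketch of modelling LDXR/STXR with a host compare-exchange (not dynarmic code).
#include <atomic>
#include <cstdint>

std::uint32_t LoadExclusive32(const std::atomic<std::uint32_t>& mem) {
    return mem.load(std::memory_order_acquire);  // remember the observed value
}

bool StoreExclusive32(std::atomic<std::uint32_t>& mem, std::uint32_t observed,
                      std::uint32_t desired) {
    // Succeeds only if no other core wrote a different value in between; this is
    // weaker than a true exclusive monitor (ABA, no address reservation), hence "unsafe".
    return mem.compare_exchange_strong(observed, desired, std::memory_order_release,
                                       std::memory_order_relaxed);
}
```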
bunnei
adbb9c2b00 hle: kernel: k_page_heap: GetPhysicalAddr can be const. 2022-02-27 10:34:02 -08:00
bunnei
f7e65eb971 hle: kernel: k_page_heap: Remove superfluous consexpr. 2022-02-27 10:34:02 -08:00
bunnei
06e2b76c75 hle: kernel: k_page_heap: Various updates and improvements.
- KPageHeap tracks physical addresses, not virtual addresses.
- Various updates and improvements to match latest documentation for this type.
2022-02-27 10:34:02 -08:00
bunnei
5d1a81520c hle: kernel: Add initial_process.h header. 2022-02-27 10:34:02 -08:00
bunnei
a6496deeed hle: kernel: board: nx: Add k_memory_layout.h header. 2022-02-27 10:34:02 -08:00
bunnei
9b5e7971dc hle: kernel: k_system_control: Add GetRealMemorySize and update GetKernelPhysicalBaseAddress. 2022-02-27 10:34:02 -08:00
bunnei
18e77a54c3 hle: kernel: k_memory_layout: Add GetPhysicalLinearRegion. 2022-02-27 10:34:02 -08:00
bunnei
06a21ac229 hle: kernel: k_memory_region_types: Update for new regions. 2022-02-27 10:34:02 -08:00
47 changed files with 1935 additions and 634 deletions

View File

@@ -57,4 +57,11 @@ requires std::is_integral_v<T>
return static_cast<T>(1ULL << ((8U * sizeof(T)) - std::countl_zero(value - 1U)));
}
template <size_t bit_index, typename T>
requires std::is_integral_v<T>
[[nodiscard]] constexpr bool Bit(const T value) {
static_assert(bit_index < BitSize<T>(), "bit_index must be smaller than size of T");
return ((value >> bit_index) & T(1)) == T(1);
}
} // namespace Common

View File

@@ -327,8 +327,8 @@ private:
bool IsNiechePlaceholder(size_t virtual_offset, size_t length) const {
const auto it = placeholders.upper_bound({virtual_offset, virtual_offset + length});
if (it != placeholders.end() && it->lower() == virtual_offset + length) {
const bool is_root = it == placeholders.begin() && virtual_offset == 0;
return is_root || std::prev(it)->upper() == virtual_offset;
return it == placeholders.begin() ? virtual_offset == 0
: std::prev(it)->upper() == virtual_offset;
}
return false;
}

View File

@@ -218,19 +218,17 @@ private:
Impl(const std::filesystem::path& file_backend_filename, const Filter& filter_)
: filter{filter_}, file_backend{file_backend_filename} {}
~Impl() {
StopBackendThread();
}
~Impl() = default;
void StartBackendThread() {
backend_thread = std::thread([this] {
backend_thread = std::jthread([this](std::stop_token stop_token) {
Common::SetCurrentThreadName("yuzu:Log");
Entry entry;
const auto write_logs = [this, &entry]() {
ForEachBackend([&entry](Backend& backend) { backend.Write(entry); });
};
while (!stop.stop_requested()) {
entry = message_queue.PopWait(stop.get_token());
while (!stop_token.stop_requested()) {
entry = message_queue.PopWait(stop_token);
if (entry.filename != nullptr) {
write_logs();
}
@@ -244,11 +242,6 @@ private:
});
}
void StopBackendThread() {
stop.request_stop();
backend_thread.join();
}
Entry CreateEntry(Class log_class, Level log_level, const char* filename, unsigned int line_nr,
const char* function, std::string&& message) const {
using std::chrono::duration_cast;
@@ -283,8 +276,7 @@ private:
ColorConsoleBackend color_console_backend{};
FileBackend file_backend;
std::stop_source stop;
std::thread backend_thread;
std::jthread backend_thread;
MPSCQueue<Entry, true> message_queue{};
std::chrono::steady_clock::time_point time_origin{std::chrono::steady_clock::now()};
};

View File

@@ -176,6 +176,7 @@ void RestoreGlobalState(bool is_powered_on) {
values.cpuopt_unsafe_ignore_standard_fpcr.SetGlobal(true);
values.cpuopt_unsafe_inaccurate_nan.SetGlobal(true);
values.cpuopt_unsafe_fastmem_check.SetGlobal(true);
values.cpuopt_unsafe_ignore_global_monitor.SetGlobal(true);
// Renderer
values.renderer_backend.SetGlobal(true);

View File

@@ -484,12 +484,15 @@ struct Values {
BasicSetting<bool> cpuopt_misc_ir{true, "cpuopt_misc_ir"};
BasicSetting<bool> cpuopt_reduce_misalign_checks{true, "cpuopt_reduce_misalign_checks"};
BasicSetting<bool> cpuopt_fastmem{true, "cpuopt_fastmem"};
BasicSetting<bool> cpuopt_fastmem_exclusives{true, "cpuopt_fastmem_exclusives"};
BasicSetting<bool> cpuopt_recompile_exclusives{true, "cpuopt_recompile_exclusives"};
Setting<bool> cpuopt_unsafe_unfuse_fma{true, "cpuopt_unsafe_unfuse_fma"};
Setting<bool> cpuopt_unsafe_reduce_fp_error{true, "cpuopt_unsafe_reduce_fp_error"};
Setting<bool> cpuopt_unsafe_ignore_standard_fpcr{true, "cpuopt_unsafe_ignore_standard_fpcr"};
Setting<bool> cpuopt_unsafe_inaccurate_nan{true, "cpuopt_unsafe_inaccurate_nan"};
Setting<bool> cpuopt_unsafe_fastmem_check{true, "cpuopt_unsafe_fastmem_check"};
Setting<bool> cpuopt_unsafe_ignore_global_monitor{true, "cpuopt_unsafe_ignore_global_monitor"};
// Renderer
RangedSetting<RendererBackend> renderer_backend{

View File

@@ -1,8 +1,12 @@
// Copyright 2013 Dolphin Emulator Project / 2015 Citra Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
// Copyright 2013 Dolphin Emulator Project / 2015 Citra Emulator Project / 2022 Yuzu Emulator
// Project Licensed under GPLv2 or any later version Refer to the license.txt file included.
#include <array>
#include <cstring>
#include <iterator>
#include <span>
#include <string_view>
#include "common/bit_util.h"
#include "common/common_types.h"
#include "common/x64/cpu_detect.h"
@@ -17,7 +21,7 @@
// clang-format on
#endif
static inline void __cpuidex(int info[4], int function_id, int subfunction_id) {
static inline void __cpuidex(int info[4], u32 function_id, u32 subfunction_id) {
#if defined(__DragonFly__) || defined(__FreeBSD__)
// Despite the name, this is just do_cpuid() with ECX as second input.
cpuid_count((u_int)function_id, (u_int)subfunction_id, (u_int*)info);
@@ -30,7 +34,7 @@ static inline void __cpuidex(int info[4], int function_id, int subfunction_id) {
#endif
}
static inline void __cpuid(int info[4], int function_id) {
static inline void __cpuid(int info[4], u32 function_id) {
return __cpuidex(info, function_id, 0);
}
@@ -45,6 +49,17 @@ static inline u64 _xgetbv(u32 index) {
namespace Common {
CPUCaps::Manufacturer CPUCaps::ParseManufacturer(std::string_view brand_string) {
if (brand_string == "GenuineIntel") {
return Manufacturer::Intel;
} else if (brand_string == "AuthenticAMD") {
return Manufacturer::AMD;
} else if (brand_string == "HygonGenuine") {
return Manufacturer::Hygon;
}
return Manufacturer::Unknown;
}
// Detects the various CPU features
static CPUCaps Detect() {
CPUCaps caps = {};
@@ -53,57 +68,44 @@ static CPUCaps Detect() {
// yuzu at all anyway
int cpu_id[4];
memset(caps.brand_string, 0, sizeof(caps.brand_string));
// Detect CPU's CPUID capabilities and grab CPU string
// Detect CPU's CPUID capabilities and grab manufacturer string
__cpuid(cpu_id, 0x00000000);
u32 max_std_fn = cpu_id[0]; // EAX
const u32 max_std_fn = cpu_id[0]; // EAX
std::memcpy(&caps.brand_string[0], &cpu_id[1], sizeof(int));
std::memcpy(&caps.brand_string[4], &cpu_id[3], sizeof(int));
std::memcpy(&caps.brand_string[8], &cpu_id[2], sizeof(int));
if (cpu_id[1] == 0x756e6547 && cpu_id[2] == 0x6c65746e && cpu_id[3] == 0x49656e69)
caps.manufacturer = Manufacturer::Intel;
else if (cpu_id[1] == 0x68747541 && cpu_id[2] == 0x444d4163 && cpu_id[3] == 0x69746e65)
caps.manufacturer = Manufacturer::AMD;
else if (cpu_id[1] == 0x6f677948 && cpu_id[2] == 0x656e6975 && cpu_id[3] == 0x6e65476e)
caps.manufacturer = Manufacturer::Hygon;
else
caps.manufacturer = Manufacturer::Unknown;
std::memset(caps.brand_string, 0, std::size(caps.brand_string));
std::memcpy(&caps.brand_string[0], &cpu_id[1], sizeof(u32));
std::memcpy(&caps.brand_string[4], &cpu_id[3], sizeof(u32));
std::memcpy(&caps.brand_string[8], &cpu_id[2], sizeof(u32));
caps.manufacturer = CPUCaps::ParseManufacturer(caps.brand_string);
// Set reasonable default cpu string even if brand string not available
std::strncpy(caps.cpu_string, caps.brand_string, std::size(caps.brand_string));
__cpuid(cpu_id, 0x80000000);
u32 max_ex_fn = cpu_id[0];
// Set reasonable default brand string even if brand string not available
strcpy(caps.cpu_string, caps.brand_string);
const u32 max_ex_fn = cpu_id[0];
// Detect family and other miscellaneous features
if (max_std_fn >= 1) {
__cpuid(cpu_id, 0x00000001);
if ((cpu_id[3] >> 25) & 1)
caps.sse = true;
if ((cpu_id[3] >> 26) & 1)
caps.sse2 = true;
if ((cpu_id[2]) & 1)
caps.sse3 = true;
if ((cpu_id[2] >> 9) & 1)
caps.ssse3 = true;
if ((cpu_id[2] >> 19) & 1)
caps.sse4_1 = true;
if ((cpu_id[2] >> 20) & 1)
caps.sse4_2 = true;
if ((cpu_id[2] >> 25) & 1)
caps.aes = true;
caps.sse = Common::Bit<25>(cpu_id[3]);
caps.sse2 = Common::Bit<26>(cpu_id[3]);
caps.sse3 = Common::Bit<0>(cpu_id[2]);
caps.ssse3 = Common::Bit<9>(cpu_id[2]);
caps.sse4_1 = Common::Bit<19>(cpu_id[2]);
caps.sse4_2 = Common::Bit<20>(cpu_id[2]);
caps.aes = Common::Bit<25>(cpu_id[2]);
// AVX support requires 3 separate checks:
// - Is the AVX bit set in CPUID?
// - Is the XSAVE bit set in CPUID?
// - XGETBV result has the XCR bit set.
if (((cpu_id[2] >> 28) & 1) && ((cpu_id[2] >> 27) & 1)) {
if (Common::Bit<28>(cpu_id[2]) && Common::Bit<27>(cpu_id[2])) {
if ((_xgetbv(_XCR_XFEATURE_ENABLED_MASK) & 0x6) == 0x6) {
caps.avx = true;
if ((cpu_id[2] >> 12) & 1)
if (Common::Bit<12>(cpu_id[2]))
caps.fma = true;
}
}
@@ -111,15 +113,13 @@ static CPUCaps Detect() {
if (max_std_fn >= 7) {
__cpuidex(cpu_id, 0x00000007, 0x00000000);
// Can't enable AVX2 unless the XSAVE/XGETBV checks above passed
if ((cpu_id[1] >> 5) & 1)
caps.avx2 = caps.avx;
if ((cpu_id[1] >> 3) & 1)
caps.bmi1 = true;
if ((cpu_id[1] >> 8) & 1)
caps.bmi2 = true;
caps.avx2 = caps.avx && Common::Bit<5>(cpu_id[1]);
caps.bmi1 = Common::Bit<3>(cpu_id[1]);
caps.bmi2 = Common::Bit<8>(cpu_id[1]);
// Checks for AVX512F, AVX512CD, AVX512VL, AVX512DQ, AVX512BW (Intel Skylake-X/SP)
if ((cpu_id[1] >> 16) & 1 && (cpu_id[1] >> 28) & 1 && (cpu_id[1] >> 31) & 1 &&
(cpu_id[1] >> 17) & 1 && (cpu_id[1] >> 30) & 1) {
if (Common::Bit<16>(cpu_id[1]) && Common::Bit<28>(cpu_id[1]) &&
Common::Bit<31>(cpu_id[1]) && Common::Bit<17>(cpu_id[1]) &&
Common::Bit<30>(cpu_id[1])) {
caps.avx512 = caps.avx2;
}
}
@@ -138,15 +138,13 @@ static CPUCaps Detect() {
if (max_ex_fn >= 0x80000001) {
// Check for more features
__cpuid(cpu_id, 0x80000001);
if ((cpu_id[2] >> 16) & 1)
caps.fma4 = true;
caps.lzcnt = Common::Bit<5>(cpu_id[2]);
caps.fma4 = Common::Bit<16>(cpu_id[2]);
}
if (max_ex_fn >= 0x80000007) {
__cpuid(cpu_id, 0x80000007);
if (cpu_id[3] & (1 << 8)) {
caps.invariant_tsc = true;
}
caps.invariant_tsc = Common::Bit<8>(cpu_id[3]);
}
if (max_std_fn >= 0x16) {

View File

@@ -1,42 +1,50 @@
// Copyright 2013 Dolphin Emulator Project / 2015 Citra Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
// Copyright 2013 Dolphin Emulator Project / 2015 Citra Emulator Project / 2022 Yuzu Emulator
// Project Licensed under GPLv2 or any later version Refer to the license.txt file included.
#pragma once
namespace Common {
#include <string_view>
#include "common/common_types.h"
enum class Manufacturer : u32 {
Intel = 0,
AMD = 1,
Hygon = 2,
Unknown = 3,
};
namespace Common {
/// x86/x64 CPU capabilities that may be detected by this module
struct CPUCaps {
enum class Manufacturer : u8 {
Unknown = 0,
Intel = 1,
AMD = 2,
Hygon = 3,
};
static Manufacturer ParseManufacturer(std::string_view brand_string);
Manufacturer manufacturer;
char cpu_string[0x21];
char brand_string[0x41];
bool sse;
bool sse2;
bool sse3;
bool ssse3;
bool sse4_1;
bool sse4_2;
bool lzcnt;
bool avx;
bool avx2;
bool avx512;
bool bmi1;
bool bmi2;
bool fma;
bool fma4;
bool aes;
bool invariant_tsc;
char brand_string[13];
char cpu_string[48];
u32 base_frequency;
u32 max_frequency;
u32 bus_frequency;
bool sse : 1;
bool sse2 : 1;
bool sse3 : 1;
bool ssse3 : 1;
bool sse4_1 : 1;
bool sse4_2 : 1;
bool lzcnt : 1;
bool avx : 1;
bool avx2 : 1;
bool avx512 : 1;
bool bmi1 : 1;
bool bmi2 : 1;
bool fma : 1;
bool fma4 : 1;
bool aes : 1;
bool invariant_tsc : 1;
};
/**

View File

@@ -152,6 +152,7 @@ add_library(core STATIC
hle/api_version.h
hle/ipc.h
hle/ipc_helpers.h
hle/kernel/board/nintendo/nx/k_memory_layout.h
hle/kernel/board/nintendo/nx/k_system_control.cpp
hle/kernel/board/nintendo/nx/k_system_control.h
hle/kernel/board/nintendo/nx/secure_monitor.h
@@ -164,6 +165,7 @@ add_library(core STATIC
hle/kernel/hle_ipc.h
hle/kernel/init/init_slab_setup.cpp
hle/kernel/init/init_slab_setup.h
hle/kernel/initial_process.h
hle/kernel/k_address_arbiter.cpp
hle/kernel/k_address_arbiter.h
hle/kernel/k_address_space_info.cpp

View File

@@ -137,6 +137,8 @@ std::shared_ptr<Dynarmic::A32::Jit> ARM_Dynarmic_32::MakeJit(Common::PageTable*
config.page_table_pointer_mask_bits = Common::PageTable::ATTRIBUTE_BITS;
config.detect_misaligned_access_via_page_table = 16 | 32 | 64 | 128;
config.only_detect_misalignment_via_page_table_on_page_boundary = true;
config.fastmem_exclusive_access = true;
config.recompile_on_exclusive_fastmem_failure = true;
// Multi-process state
config.processor_id = core_index;
@@ -178,6 +180,12 @@ std::shared_ptr<Dynarmic::A32::Jit> ARM_Dynarmic_32::MakeJit(Common::PageTable*
if (!Settings::values.cpuopt_fastmem) {
config.fastmem_pointer = nullptr;
}
if (!Settings::values.cpuopt_fastmem_exclusives) {
config.fastmem_exclusive_access = false;
}
if (!Settings::values.cpuopt_recompile_exclusives) {
config.recompile_on_exclusive_fastmem_failure = false;
}
}
// Unsafe optimizations
@@ -195,6 +203,9 @@ std::shared_ptr<Dynarmic::A32::Jit> ARM_Dynarmic_32::MakeJit(Common::PageTable*
if (Settings::values.cpuopt_unsafe_inaccurate_nan) {
config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_InaccurateNaN;
}
if (Settings::values.cpuopt_unsafe_ignore_global_monitor) {
config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_IgnoreGlobalMonitor;
}
}
// Curated optimizations
@@ -203,6 +214,7 @@ std::shared_ptr<Dynarmic::A32::Jit> ARM_Dynarmic_32::MakeJit(Common::PageTable*
config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_UnfuseFMA;
config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_IgnoreStandardFPCRValue;
config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_InaccurateNaN;
config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_IgnoreGlobalMonitor;
}
return std::make_unique<Dynarmic::A32::Jit>(config);

View File

@@ -185,6 +185,9 @@ std::shared_ptr<Dynarmic::A64::Jit> ARM_Dynarmic_64::MakeJit(Common::PageTable*
config.fastmem_pointer = page_table->fastmem_arena;
config.fastmem_address_space_bits = address_space_bits;
config.silently_mirror_fastmem = false;
config.fastmem_exclusive_access = true;
config.recompile_on_exclusive_fastmem_failure = true;
}
// Multi-process state
@@ -237,6 +240,12 @@ std::shared_ptr<Dynarmic::A64::Jit> ARM_Dynarmic_64::MakeJit(Common::PageTable*
if (!Settings::values.cpuopt_fastmem) {
config.fastmem_pointer = nullptr;
}
if (!Settings::values.cpuopt_fastmem_exclusives) {
config.fastmem_exclusive_access = false;
}
if (!Settings::values.cpuopt_recompile_exclusives) {
config.recompile_on_exclusive_fastmem_failure = false;
}
}
// Unsafe optimizations
@@ -254,6 +263,9 @@ std::shared_ptr<Dynarmic::A64::Jit> ARM_Dynarmic_64::MakeJit(Common::PageTable*
if (Settings::values.cpuopt_unsafe_fastmem_check) {
config.fastmem_address_space_bits = 64;
}
if (Settings::values.cpuopt_unsafe_ignore_global_monitor) {
config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_IgnoreGlobalMonitor;
}
}
// Curated optimizations
@@ -262,6 +274,7 @@ std::shared_ptr<Dynarmic::A64::Jit> ARM_Dynarmic_64::MakeJit(Common::PageTable*
config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_UnfuseFMA;
config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_InaccurateNaN;
config.fastmem_address_space_bits = 64;
config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_IgnoreGlobalMonitor;
}
return std::make_shared<Dynarmic::A64::Jit>(config);

View File

@@ -37,8 +37,8 @@ u128 DynarmicExclusiveMonitor::ExclusiveRead128(std::size_t core_index, VAddr ad
});
}
void DynarmicExclusiveMonitor::ClearExclusive() {
monitor.Clear();
void DynarmicExclusiveMonitor::ClearExclusive(std::size_t core_index) {
monitor.ClearProcessor(core_index);
}
bool DynarmicExclusiveMonitor::ExclusiveWrite8(std::size_t core_index, VAddr vaddr, u8 value) {

View File

@@ -29,7 +29,7 @@ public:
u32 ExclusiveRead32(std::size_t core_index, VAddr addr) override;
u64 ExclusiveRead64(std::size_t core_index, VAddr addr) override;
u128 ExclusiveRead128(std::size_t core_index, VAddr addr) override;
void ClearExclusive() override;
void ClearExclusive(std::size_t core_index) override;
bool ExclusiveWrite8(std::size_t core_index, VAddr vaddr, u8 value) override;
bool ExclusiveWrite16(std::size_t core_index, VAddr vaddr, u16 value) override;

View File

@@ -23,7 +23,7 @@ public:
virtual u32 ExclusiveRead32(std::size_t core_index, VAddr addr) = 0;
virtual u64 ExclusiveRead64(std::size_t core_index, VAddr addr) = 0;
virtual u128 ExclusiveRead128(std::size_t core_index, VAddr addr) = 0;
virtual void ClearExclusive() = 0;
virtual void ClearExclusive(std::size_t core_index) = 0;
virtual bool ExclusiveWrite8(std::size_t core_index, VAddr vaddr, u8 value) = 0;
virtual bool ExclusiveWrite16(std::size_t core_index, VAddr vaddr, u16 value) = 0;

View File

@@ -326,7 +326,9 @@ struct System::Impl {
is_powered_on = false;
exit_lock = false;
gpu_core->NotifyShutdown();
if (gpu_core != nullptr) {
gpu_core->NotifyShutdown();
}
services.reset();
service_manager.reset();

View File

@@ -42,11 +42,20 @@ public:
context.MakeCurrent();
}
~Scoped() {
context.DoneCurrent();
if (active) {
context.DoneCurrent();
}
}
/// In the event that context was destroyed before the Scoped is destroyed, this provides a
/// mechanism to prevent calling a destroyed object's method during the deconstructor
void Cancel() {
active = false;
}
private:
GraphicsContext& context;
bool active{true};
};
/// Calls MakeCurrent on the context and calls DoneCurrent when the scope for the returned value

View File

@@ -0,0 +1,13 @@
// Copyright 2022 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include "common/common_types.h"
namespace Kernel {
constexpr inline PAddr MainMemoryAddress = 0x80000000;
} // namespace Kernel

View File

@@ -39,6 +39,10 @@ Smc::MemoryArrangement GetMemoryArrangeForInit() {
}
} // namespace
size_t KSystemControl::Init::GetRealMemorySize() {
return GetIntendedMemorySize();
}
// Initialization.
size_t KSystemControl::Init::GetIntendedMemorySize() {
switch (GetMemorySizeForInit()) {
@@ -53,7 +57,13 @@ size_t KSystemControl::Init::GetIntendedMemorySize() {
}
PAddr KSystemControl::Init::GetKernelPhysicalBaseAddress(u64 base_address) {
return base_address;
const size_t real_dram_size = KSystemControl::Init::GetRealMemorySize();
const size_t intended_dram_size = KSystemControl::Init::GetIntendedMemorySize();
if (intended_dram_size * 2 < real_dram_size) {
return base_address;
} else {
return base_address + ((real_dram_size - intended_dram_size) / 2);
}
}
bool KSystemControl::Init::ShouldIncreaseThreadResourceLimit() {

View File

@@ -13,6 +13,7 @@ public:
class Init {
public:
// Initialization.
static std::size_t GetRealMemorySize();
static std::size_t GetIntendedMemorySize();
static PAddr GetKernelPhysicalBaseAddress(u64 base_address);
static bool ShouldIncreaseThreadResourceLimit();

View File

@@ -0,0 +1,23 @@
// Copyright 2022 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include "common/common_types.h"
#include "common/literals.h"
#include "core/hle/kernel/board/nintendo/nx/k_memory_layout.h"
#include "core/hle/kernel/board/nintendo/nx/k_system_control.h"
namespace Kernel {
using namespace Common::Literals;
constexpr std::size_t InitialProcessBinarySizeMax = 12_MiB;
static inline PAddr GetInitialProcessBinaryPhysicalAddress() {
return Kernel::Board::Nintendo::Nx::KSystemControl::Init::GetKernelPhysicalBaseAddress(
MainMemoryAddress);
}
} // namespace Kernel

View File

@@ -49,7 +49,7 @@ bool DecrementIfLessThan(Core::System& system, s32* out, VAddr address, s32 valu
}
} else {
// Otherwise, clear our exclusive hold and finish
monitor.ClearExclusive();
monitor.ClearExclusive(current_core);
}
// We're done.
@@ -78,7 +78,7 @@ bool UpdateIfEqual(Core::System& system, s32* out, VAddr address, s32 value, s32
}
} else {
// Otherwise, clear our exclusive hold and finish.
monitor.ClearExclusive();
monitor.ClearExclusive(current_core);
}
// We're done.

View File

@@ -173,6 +173,10 @@ public:
return Dereference(FindVirtualLinear(address));
}
const KMemoryRegion& GetPhysicalLinearRegion(PAddr address) const {
return Dereference(FindPhysicalLinear(address));
}
const KMemoryRegion* GetPhysicalKernelTraceBufferRegion() const {
return GetPhysicalMemoryRegionTree().FindFirstDerived(KMemoryRegionType_KernelTraceBuffer);
}

View File

@@ -10,189 +10,412 @@
#include "common/scope_exit.h"
#include "core/core.h"
#include "core/device_memory.h"
#include "core/hle/kernel/initial_process.h"
#include "core/hle/kernel/k_memory_manager.h"
#include "core/hle/kernel/k_page_linked_list.h"
#include "core/hle/kernel/kernel.h"
#include "core/hle/kernel/svc_results.h"
#include "core/memory.h"
namespace Kernel {
KMemoryManager::KMemoryManager(Core::System& system_) : system{system_} {}
namespace {
std::size_t KMemoryManager::Impl::Initialize(Pool new_pool, u64 start_address, u64 end_address) {
const auto size{end_address - start_address};
// Calculate metadata sizes
const auto ref_count_size{(size / PageSize) * sizeof(u16)};
const auto optimize_map_size{(Common::AlignUp((size / PageSize), 64) / 64) * sizeof(u64)};
const auto manager_size{Common::AlignUp(optimize_map_size + ref_count_size, PageSize)};
const auto page_heap_size{KPageHeap::CalculateManagementOverheadSize(size)};
const auto total_metadata_size{manager_size + page_heap_size};
ASSERT(manager_size <= total_metadata_size);
ASSERT(Common::IsAligned(total_metadata_size, PageSize));
// Setup region
pool = new_pool;
// Initialize the manager's KPageHeap
heap.Initialize(start_address, size, page_heap_size);
// Free the memory to the heap
heap.Free(start_address, size / PageSize);
// Update the heap's used size
heap.UpdateUsedSize();
return total_metadata_size;
constexpr KMemoryManager::Pool GetPoolFromMemoryRegionType(u32 type) {
if ((type | KMemoryRegionType_DramApplicationPool) == type) {
return KMemoryManager::Pool::Application;
} else if ((type | KMemoryRegionType_DramAppletPool) == type) {
return KMemoryManager::Pool::Applet;
} else if ((type | KMemoryRegionType_DramSystemPool) == type) {
return KMemoryManager::Pool::System;
} else if ((type | KMemoryRegionType_DramSystemNonSecurePool) == type) {
return KMemoryManager::Pool::SystemNonSecure;
} else {
UNREACHABLE_MSG("InvalidMemoryRegionType for conversion to Pool");
return {};
}
}
void KMemoryManager::InitializeManager(Pool pool, u64 start_address, u64 end_address) {
ASSERT(pool < Pool::Count);
managers[static_cast<std::size_t>(pool)].Initialize(pool, start_address, end_address);
} // namespace
KMemoryManager::KMemoryManager(Core::System& system_)
: system{system_}, pool_locks{
KLightLock{system_.Kernel()},
KLightLock{system_.Kernel()},
KLightLock{system_.Kernel()},
KLightLock{system_.Kernel()},
} {}
void KMemoryManager::Initialize(VAddr management_region, size_t management_region_size) {
// Clear the management region to zero.
const VAddr management_region_end = management_region + management_region_size;
// Reset our manager count.
num_managers = 0;
// Traverse the virtual memory layout tree, initializing each manager as appropriate.
while (num_managers != MaxManagerCount) {
// Locate the region that should initialize the current manager.
PAddr region_address = 0;
size_t region_size = 0;
Pool region_pool = Pool::Count;
for (const auto& it : system.Kernel().MemoryLayout().GetPhysicalMemoryRegionTree()) {
// We only care about regions that we need to create managers for.
if (!it.IsDerivedFrom(KMemoryRegionType_DramUserPool)) {
continue;
}
// We want to initialize the managers in order.
if (it.GetAttributes() != num_managers) {
continue;
}
const PAddr cur_start = it.GetAddress();
const PAddr cur_end = it.GetEndAddress();
// Validate the region.
ASSERT(cur_end != 0);
ASSERT(cur_start != 0);
ASSERT(it.GetSize() > 0);
// Update the region's extents.
if (region_address == 0) {
region_address = cur_start;
region_size = it.GetSize();
region_pool = GetPoolFromMemoryRegionType(it.GetType());
} else {
ASSERT(cur_start == region_address + region_size);
// Update the size.
region_size = cur_end - region_address;
ASSERT(GetPoolFromMemoryRegionType(it.GetType()) == region_pool);
}
}
// If we didn't find a region, we're done.
if (region_size == 0) {
break;
}
// Initialize a new manager for the region.
Impl* manager = std::addressof(managers[num_managers++]);
ASSERT(num_managers <= managers.size());
const size_t cur_size = manager->Initialize(region_address, region_size, management_region,
management_region_end, region_pool);
management_region += cur_size;
ASSERT(management_region <= management_region_end);
// Insert the manager into the pool list.
const auto region_pool_index = static_cast<u32>(region_pool);
if (pool_managers_tail[region_pool_index] == nullptr) {
pool_managers_head[region_pool_index] = manager;
} else {
pool_managers_tail[region_pool_index]->SetNext(manager);
manager->SetPrev(pool_managers_tail[region_pool_index]);
}
pool_managers_tail[region_pool_index] = manager;
}
// Free each region to its corresponding heap.
size_t reserved_sizes[MaxManagerCount] = {};
const PAddr ini_start = GetInitialProcessBinaryPhysicalAddress();
const PAddr ini_end = ini_start + InitialProcessBinarySizeMax;
const PAddr ini_last = ini_end - 1;
for (const auto& it : system.Kernel().MemoryLayout().GetPhysicalMemoryRegionTree()) {
if (it.IsDerivedFrom(KMemoryRegionType_DramUserPool)) {
// Get the manager for the region.
auto index = it.GetAttributes();
auto& manager = managers[index];
const PAddr cur_start = it.GetAddress();
const PAddr cur_last = it.GetLastAddress();
const PAddr cur_end = it.GetEndAddress();
if (cur_start <= ini_start && ini_last <= cur_last) {
// Free memory before the ini to the heap.
if (cur_start != ini_start) {
manager.Free(cur_start, (ini_start - cur_start) / PageSize);
}
// Open/reserve the ini memory.
manager.OpenFirst(ini_start, InitialProcessBinarySizeMax / PageSize);
reserved_sizes[it.GetAttributes()] += InitialProcessBinarySizeMax;
// Free memory after the ini to the heap.
if (ini_last != cur_last) {
ASSERT(cur_end != 0);
manager.Free(ini_end, cur_end - ini_end);
}
} else {
// Ensure there's no partial overlap with the ini image.
if (cur_start <= ini_last) {
ASSERT(cur_last < ini_start);
} else {
// Otherwise, check the region for general validity.
ASSERT(cur_end != 0);
}
// Free the memory to the heap.
manager.Free(cur_start, it.GetSize() / PageSize);
}
}
}
// Update the used size for all managers.
for (size_t i = 0; i < num_managers; ++i) {
managers[i].SetInitialUsedHeapSize(reserved_sizes[i]);
}
}
VAddr KMemoryManager::AllocateAndOpenContinuous(std::size_t num_pages, std::size_t align_pages,
u32 option) {
// Early return if we're allocating no pages
PAddr KMemoryManager::AllocateAndOpenContinuous(size_t num_pages, size_t align_pages, u32 option) {
// Early return if we're allocating no pages.
if (num_pages == 0) {
return {};
return 0;
}
// Lock the pool that we're allocating from
// Lock the pool that we're allocating from.
const auto [pool, dir] = DecodeOption(option);
const auto pool_index{static_cast<std::size_t>(pool)};
std::lock_guard lock{pool_locks[pool_index]};
KScopedLightLock lk(pool_locks[static_cast<std::size_t>(pool)]);
// Choose a heap based on our page size request
const s32 heap_index{KPageHeap::GetAlignedBlockIndex(num_pages, align_pages)};
// Choose a heap based on our page size request.
const s32 heap_index = KPageHeap::GetAlignedBlockIndex(num_pages, align_pages);
// Loop, trying to iterate from each block
// TODO (bunnei): Support multiple managers
Impl& chosen_manager{managers[pool_index]};
VAddr allocated_block{chosen_manager.AllocateBlock(heap_index, false)};
// If we failed to allocate, quit now
if (!allocated_block) {
return {};
// Loop, trying to iterate from each block.
Impl* chosen_manager = nullptr;
PAddr allocated_block = 0;
for (chosen_manager = this->GetFirstManager(pool, dir); chosen_manager != nullptr;
chosen_manager = this->GetNextManager(chosen_manager, dir)) {
allocated_block = chosen_manager->AllocateBlock(heap_index, true);
if (allocated_block != 0) {
break;
}
}
// If we allocated more than we need, free some
const auto allocated_pages{KPageHeap::GetBlockNumPages(heap_index)};
// If we failed to allocate, quit now.
if (allocated_block == 0) {
return 0;
}
// If we allocated more than we need, free some.
const size_t allocated_pages = KPageHeap::GetBlockNumPages(heap_index);
if (allocated_pages > num_pages) {
chosen_manager.Free(allocated_block + num_pages * PageSize, allocated_pages - num_pages);
chosen_manager->Free(allocated_block + num_pages * PageSize, allocated_pages - num_pages);
}
// Open the first reference to the pages.
chosen_manager->OpenFirst(allocated_block, num_pages);
return allocated_block;
}
ResultCode KMemoryManager::Allocate(KPageLinkedList& page_list, std::size_t num_pages, Pool pool,
Direction dir, u32 heap_fill_value) {
ASSERT(page_list.GetNumPages() == 0);
ResultCode KMemoryManager::AllocatePageGroupImpl(KPageLinkedList* out, size_t num_pages, Pool pool,
Direction dir, bool random) {
// Choose a heap based on our page size request.
const s32 heap_index = KPageHeap::GetBlockIndex(num_pages);
R_UNLESS(0 <= heap_index, ResultOutOfMemory);
// Early return if we're allocating no pages
if (num_pages == 0) {
return ResultSuccess;
}
// Lock the pool that we're allocating from
const auto pool_index{static_cast<std::size_t>(pool)};
std::lock_guard lock{pool_locks[pool_index]};
// Choose a heap based on our page size request
const s32 heap_index{KPageHeap::GetBlockIndex(num_pages)};
if (heap_index < 0) {
return ResultOutOfMemory;
}
// TODO (bunnei): Support multiple managers
Impl& chosen_manager{managers[pool_index]};
// Ensure that we don't leave anything un-freed
auto group_guard = detail::ScopeExit([&] {
for (const auto& it : page_list.Nodes()) {
const auto min_num_pages{std::min<size_t>(
it.GetNumPages(), (chosen_manager.GetEndAddress() - it.GetAddress()) / PageSize)};
chosen_manager.Free(it.GetAddress(), min_num_pages);
// Ensure that we don't leave anything un-freed.
auto group_guard = SCOPE_GUARD({
for (const auto& it : out->Nodes()) {
auto& manager = this->GetManager(system.Kernel().MemoryLayout(), it.GetAddress());
const size_t num_pages_to_free =
std::min(it.GetNumPages(), (manager.GetEndAddress() - it.GetAddress()) / PageSize);
manager.Free(it.GetAddress(), num_pages_to_free);
}
});
// Keep allocating until we've allocated all our pages
for (s32 index{heap_index}; index >= 0 && num_pages > 0; index--) {
const auto pages_per_alloc{KPageHeap::GetBlockNumPages(index)};
while (num_pages >= pages_per_alloc) {
// Allocate a block
VAddr allocated_block{chosen_manager.AllocateBlock(index, false)};
if (!allocated_block) {
break;
}
// Safely add it to our group
{
auto block_guard = detail::ScopeExit(
[&] { chosen_manager.Free(allocated_block, pages_per_alloc); });
if (const ResultCode result{page_list.AddBlock(allocated_block, pages_per_alloc)};
result.IsError()) {
return result;
// Keep allocating until we've allocated all our pages.
for (s32 index = heap_index; index >= 0 && num_pages > 0; index--) {
const size_t pages_per_alloc = KPageHeap::GetBlockNumPages(index);
for (Impl* cur_manager = this->GetFirstManager(pool, dir); cur_manager != nullptr;
cur_manager = this->GetNextManager(cur_manager, dir)) {
while (num_pages >= pages_per_alloc) {
// Allocate a block.
PAddr allocated_block = cur_manager->AllocateBlock(index, random);
if (allocated_block == 0) {
break;
}
block_guard.Cancel();
}
// Safely add it to our group.
{
auto block_guard =
SCOPE_GUARD({ cur_manager->Free(allocated_block, pages_per_alloc); });
R_TRY(out->AddBlock(allocated_block, pages_per_alloc));
block_guard.Cancel();
}
num_pages -= pages_per_alloc;
num_pages -= pages_per_alloc;
}
}
}
// Clear allocated memory.
for (const auto& it : page_list.Nodes()) {
std::memset(system.DeviceMemory().GetPointer(it.GetAddress()), heap_fill_value,
it.GetSize());
}
// Only succeed if we allocated as many pages as we wanted
if (num_pages) {
return ResultOutOfMemory;
}
// Only succeed if we allocated as many pages as we wanted.
R_UNLESS(num_pages == 0, ResultOutOfMemory);
// We succeeded!
group_guard.Cancel();
return ResultSuccess;
}
ResultCode KMemoryManager::Free(KPageLinkedList& page_list, std::size_t num_pages, Pool pool,
Direction dir, u32 heap_fill_value) {
// Early return if we're freeing no pages
if (!num_pages) {
return ResultSuccess;
}
ResultCode KMemoryManager::AllocateAndOpen(KPageLinkedList* out, size_t num_pages, u32 option) {
ASSERT(out != nullptr);
ASSERT(out->GetNumPages() == 0);
// Lock the pool that we're freeing from
const auto pool_index{static_cast<std::size_t>(pool)};
std::lock_guard lock{pool_locks[pool_index]};
// Early return if we're allocating no pages.
R_SUCCEED_IF(num_pages == 0);
// TODO (bunnei): Support multiple managers
Impl& chosen_manager{managers[pool_index]};
// Lock the pool that we're allocating from.
const auto [pool, dir] = DecodeOption(option);
KScopedLightLock lk(pool_locks[static_cast<size_t>(pool)]);
// Free all of the pages
for (const auto& it : page_list.Nodes()) {
const auto min_num_pages{std::min<size_t>(
it.GetNumPages(), (chosen_manager.GetEndAddress() - it.GetAddress()) / PageSize)};
chosen_manager.Free(it.GetAddress(), min_num_pages);
// Allocate the page group.
R_TRY(this->AllocatePageGroupImpl(out, num_pages, pool, dir, false));
// Open the first reference to the pages.
for (const auto& block : out->Nodes()) {
PAddr cur_address = block.GetAddress();
size_t remaining_pages = block.GetNumPages();
while (remaining_pages > 0) {
// Get the manager for the current address.
auto& manager = this->GetManager(system.Kernel().MemoryLayout(), cur_address);
// Process part or all of the block.
const size_t cur_pages =
std::min(remaining_pages, manager.GetPageOffsetToEnd(cur_address));
manager.OpenFirst(cur_address, cur_pages);
// Advance.
cur_address += cur_pages * PageSize;
remaining_pages -= cur_pages;
}
}
return ResultSuccess;
}
std::size_t KMemoryManager::Impl::CalculateManagementOverheadSize(std::size_t region_size) {
const std::size_t ref_count_size = (region_size / PageSize) * sizeof(u16);
const std::size_t optimize_map_size =
ResultCode KMemoryManager::AllocateAndOpenForProcess(KPageLinkedList* out, size_t num_pages,
u32 option, u64 process_id, u8 fill_pattern) {
ASSERT(out != nullptr);
ASSERT(out->GetNumPages() == 0);
// Decode the option.
const auto [pool, dir] = DecodeOption(option);
// Allocate the memory.
{
// Lock the pool that we're allocating from.
KScopedLightLock lk(pool_locks[static_cast<size_t>(pool)]);
// Allocate the page group.
R_TRY(this->AllocatePageGroupImpl(out, num_pages, pool, dir, false));
// Open the first reference to the pages.
for (const auto& block : out->Nodes()) {
PAddr cur_address = block.GetAddress();
size_t remaining_pages = block.GetNumPages();
while (remaining_pages > 0) {
// Get the manager for the current address.
auto& manager = this->GetManager(system.Kernel().MemoryLayout(), cur_address);
// Process part or all of the block.
const size_t cur_pages =
std::min(remaining_pages, manager.GetPageOffsetToEnd(cur_address));
manager.OpenFirst(cur_address, cur_pages);
// Advance.
cur_address += cur_pages * PageSize;
remaining_pages -= cur_pages;
}
}
}
// Set all the allocated memory.
for (const auto& block : out->Nodes()) {
std::memset(system.DeviceMemory().GetPointer(block.GetAddress()), fill_pattern,
block.GetSize());
}
return ResultSuccess;
}
void KMemoryManager::Open(PAddr address, size_t num_pages) {
// Repeatedly open references until we've done so for all pages.
while (num_pages) {
auto& manager = this->GetManager(system.Kernel().MemoryLayout(), address);
const size_t cur_pages = std::min(num_pages, manager.GetPageOffsetToEnd(address));
{
KScopedLightLock lk(pool_locks[static_cast<size_t>(manager.GetPool())]);
manager.Open(address, cur_pages);
}
num_pages -= cur_pages;
address += cur_pages * PageSize;
}
}
void KMemoryManager::Close(PAddr address, size_t num_pages) {
// Repeatedly close references until we've done so for all pages.
while (num_pages) {
auto& manager = this->GetManager(system.Kernel().MemoryLayout(), address);
const size_t cur_pages = std::min(num_pages, manager.GetPageOffsetToEnd(address));
{
KScopedLightLock lk(pool_locks[static_cast<size_t>(manager.GetPool())]);
manager.Close(address, cur_pages);
}
num_pages -= cur_pages;
address += cur_pages * PageSize;
}
}
void KMemoryManager::Close(const KPageLinkedList& pg) {
for (const auto& node : pg.Nodes()) {
Close(node.GetAddress(), node.GetNumPages());
}
}
void KMemoryManager::Open(const KPageLinkedList& pg) {
for (const auto& node : pg.Nodes()) {
Open(node.GetAddress(), node.GetNumPages());
}
}
size_t KMemoryManager::Impl::Initialize(PAddr address, size_t size, VAddr management,
VAddr management_end, Pool p) {
// Calculate management sizes.
const size_t ref_count_size = (size / PageSize) * sizeof(u16);
const size_t optimize_map_size = CalculateOptimizedProcessOverheadSize(size);
const size_t manager_size = Common::AlignUp(optimize_map_size + ref_count_size, PageSize);
const size_t page_heap_size = KPageHeap::CalculateManagementOverheadSize(size);
const size_t total_management_size = manager_size + page_heap_size;
ASSERT(manager_size <= total_management_size);
ASSERT(management + total_management_size <= management_end);
ASSERT(Common::IsAligned(total_management_size, PageSize));
// Setup region.
pool = p;
management_region = management;
page_reference_counts.resize(
Kernel::Board::Nintendo::Nx::KSystemControl::Init::GetIntendedMemorySize() / PageSize);
ASSERT(Common::IsAligned(management_region, PageSize));
// Initialize the manager's KPageHeap.
heap.Initialize(address, size, management + manager_size, page_heap_size);
return total_management_size;
}
size_t KMemoryManager::Impl::CalculateManagementOverheadSize(size_t region_size) {
const size_t ref_count_size = (region_size / PageSize) * sizeof(u16);
const size_t optimize_map_size =
(Common::AlignUp((region_size / PageSize), Common::BitSize<u64>()) /
Common::BitSize<u64>()) *
sizeof(u64);
const std::size_t manager_meta_size =
Common::AlignUp(optimize_map_size + ref_count_size, PageSize);
const std::size_t page_heap_size = KPageHeap::CalculateManagementOverheadSize(region_size);
const size_t manager_meta_size = Common::AlignUp(optimize_map_size + ref_count_size, PageSize);
const size_t page_heap_size = KPageHeap::CalculateManagementOverheadSize(region_size);
return manager_meta_size + page_heap_size;
}

View File

@@ -5,11 +5,12 @@
#pragma once
#include <array>
#include <mutex>
#include <tuple>
#include "common/common_funcs.h"
#include "common/common_types.h"
#include "core/hle/kernel/k_light_lock.h"
#include "core/hle/kernel/k_memory_layout.h"
#include "core/hle/kernel/k_page_heap.h"
#include "core/hle/result.h"
@@ -52,22 +53,33 @@ public:
explicit KMemoryManager(Core::System& system_);
constexpr std::size_t GetSize(Pool pool) const {
return managers[static_cast<std::size_t>(pool)].GetSize();
void Initialize(VAddr management_region, size_t management_region_size);
constexpr size_t GetSize(Pool pool) const {
constexpr Direction GetSizeDirection = Direction::FromFront;
size_t total = 0;
for (auto* manager = this->GetFirstManager(pool, GetSizeDirection); manager != nullptr;
manager = this->GetNextManager(manager, GetSizeDirection)) {
total += manager->GetSize();
}
return total;
}
void InitializeManager(Pool pool, u64 start_address, u64 end_address);
PAddr AllocateAndOpenContinuous(size_t num_pages, size_t align_pages, u32 option);
ResultCode AllocateAndOpen(KPageLinkedList* out, size_t num_pages, u32 option);
ResultCode AllocateAndOpenForProcess(KPageLinkedList* out, size_t num_pages, u32 option,
u64 process_id, u8 fill_pattern);
VAddr AllocateAndOpenContinuous(size_t num_pages, size_t align_pages, u32 option);
ResultCode Allocate(KPageLinkedList& page_list, std::size_t num_pages, Pool pool, Direction dir,
u32 heap_fill_value = 0);
ResultCode Free(KPageLinkedList& page_list, std::size_t num_pages, Pool pool, Direction dir,
u32 heap_fill_value = 0);
static constexpr size_t MaxManagerCount = 10;
static constexpr std::size_t MaxManagerCount = 10;
void Close(PAddr address, size_t num_pages);
void Close(const KPageLinkedList& pg);
void Open(PAddr address, size_t num_pages);
void Open(const KPageLinkedList& pg);
public:
static std::size_t CalculateManagementOverheadSize(std::size_t region_size) {
static size_t CalculateManagementOverheadSize(size_t region_size) {
return Impl::CalculateManagementOverheadSize(region_size);
}
@@ -100,17 +112,26 @@ private:
Impl() = default;
~Impl() = default;
std::size_t Initialize(Pool new_pool, u64 start_address, u64 end_address);
size_t Initialize(PAddr address, size_t size, VAddr management, VAddr management_end,
Pool p);
VAddr AllocateBlock(s32 index, bool random) {
return heap.AllocateBlock(index, random);
}
void Free(VAddr addr, std::size_t num_pages) {
void Free(VAddr addr, size_t num_pages) {
heap.Free(addr, num_pages);
}
constexpr std::size_t GetSize() const {
void SetInitialUsedHeapSize(size_t reserved_size) {
heap.SetInitialUsedSize(reserved_size);
}
constexpr Pool GetPool() const {
return pool;
}
constexpr size_t GetSize() const {
return heap.GetSize();
}
@@ -122,10 +143,88 @@ private:
return heap.GetEndAddress();
}
static std::size_t CalculateManagementOverheadSize(std::size_t region_size);
constexpr size_t GetPageOffset(PAddr address) const {
return heap.GetPageOffset(address);
}
static constexpr std::size_t CalculateOptimizedProcessOverheadSize(
std::size_t region_size) {
constexpr size_t GetPageOffsetToEnd(PAddr address) const {
return heap.GetPageOffsetToEnd(address);
}
constexpr void SetNext(Impl* n) {
next = n;
}
constexpr void SetPrev(Impl* n) {
prev = n;
}
constexpr Impl* GetNext() const {
return next;
}
constexpr Impl* GetPrev() const {
return prev;
}
void OpenFirst(PAddr address, size_t num_pages) {
size_t index = this->GetPageOffset(address);
const size_t end = index + num_pages;
while (index < end) {
const RefCount ref_count = (++page_reference_counts[index]);
ASSERT(ref_count == 1);
index++;
}
}
void Open(PAddr address, size_t num_pages) {
size_t index = this->GetPageOffset(address);
const size_t end = index + num_pages;
while (index < end) {
const RefCount ref_count = (++page_reference_counts[index]);
ASSERT(ref_count > 1);
index++;
}
}
void Close(PAddr address, size_t num_pages) {
size_t index = this->GetPageOffset(address);
const size_t end = index + num_pages;
size_t free_start = 0;
size_t free_count = 0;
while (index < end) {
ASSERT(page_reference_counts[index] > 0);
const RefCount ref_count = (--page_reference_counts[index]);
// Keep track of how many zero refcounts we see in a row, to minimize calls to free.
if (ref_count == 0) {
if (free_count > 0) {
free_count++;
} else {
free_start = index;
free_count = 1;
}
} else {
if (free_count > 0) {
this->Free(heap.GetAddress() + free_start * PageSize, free_count);
free_count = 0;
}
}
index++;
}
if (free_count > 0) {
this->Free(heap.GetAddress() + free_start * PageSize, free_count);
}
}
static size_t CalculateManagementOverheadSize(size_t region_size);
static constexpr size_t CalculateOptimizedProcessOverheadSize(size_t region_size) {
return (Common::AlignUp((region_size / PageSize), Common::BitSize<u64>()) /
Common::BitSize<u64>()) *
sizeof(u64);
@@ -135,13 +234,45 @@ private:
using RefCount = u16;
KPageHeap heap;
std::vector<RefCount> page_reference_counts;
VAddr management_region{};
Pool pool{};
Impl* next{};
Impl* prev{};
};
private:
Impl& GetManager(const KMemoryLayout& memory_layout, PAddr address) {
return managers[memory_layout.GetPhysicalLinearRegion(address).GetAttributes()];
}
const Impl& GetManager(const KMemoryLayout& memory_layout, PAddr address) const {
return managers[memory_layout.GetPhysicalLinearRegion(address).GetAttributes()];
}
constexpr Impl* GetFirstManager(Pool pool, Direction dir) const {
return dir == Direction::FromBack ? pool_managers_tail[static_cast<size_t>(pool)]
: pool_managers_head[static_cast<size_t>(pool)];
}
constexpr Impl* GetNextManager(Impl* cur, Direction dir) const {
if (dir == Direction::FromBack) {
return cur->GetPrev();
} else {
return cur->GetNext();
}
}
ResultCode AllocatePageGroupImpl(KPageLinkedList* out, size_t num_pages, Pool pool,
Direction dir, bool random);
private:
Core::System& system;
std::array<std::mutex, static_cast<std::size_t>(Pool::Count)> pool_locks;
std::array<KLightLock, static_cast<size_t>(Pool::Count)> pool_locks;
std::array<Impl*, MaxManagerCount> pool_managers_head{};
std::array<Impl*, MaxManagerCount> pool_managers_tail{};
std::array<Impl, MaxManagerCount> managers;
size_t num_managers{};
};
} // namespace Kernel

View File

@@ -14,7 +14,8 @@
namespace Kernel {
enum KMemoryRegionType : u32 {
KMemoryRegionAttr_CarveoutProtected = 0x04000000,
KMemoryRegionAttr_CarveoutProtected = 0x02000000,
KMemoryRegionAttr_Uncached = 0x04000000,
KMemoryRegionAttr_DidKernelMap = 0x08000000,
KMemoryRegionAttr_ShouldKernelMap = 0x10000000,
KMemoryRegionAttr_UserReadOnly = 0x20000000,
@@ -239,6 +240,11 @@ static_assert(KMemoryRegionType_VirtualDramHeapBase.GetValue() == 0x1A);
static_assert(KMemoryRegionType_VirtualDramKernelPtHeap.GetValue() == 0x2A);
static_assert(KMemoryRegionType_VirtualDramKernelTraceBuffer.GetValue() == 0x4A);
// UNUSED: .DeriveSparse(2, 2, 0);
constexpr auto KMemoryRegionType_VirtualDramUnknownDebug =
KMemoryRegionType_Dram.DeriveSparse(2, 2, 1);
static_assert(KMemoryRegionType_VirtualDramUnknownDebug.GetValue() == (0x52));
constexpr auto KMemoryRegionType_VirtualDramKernelInitPt =
KMemoryRegionType_VirtualDramHeapBase.Derive(3, 0);
constexpr auto KMemoryRegionType_VirtualDramPoolManagement =
@@ -330,6 +336,8 @@ constexpr KMemoryRegionType GetTypeForVirtualLinearMapping(u32 type_id) {
return KMemoryRegionType_VirtualDramKernelTraceBuffer;
} else if (KMemoryRegionType_DramKernelPtHeap.IsAncestorOf(type_id)) {
return KMemoryRegionType_VirtualDramKernelPtHeap;
} else if ((type_id | KMemoryRegionAttr_ShouldKernelMap) == type_id) {
return KMemoryRegionType_VirtualDramUnknownDebug;
} else {
return KMemoryRegionType_Dram;
}

View File

@@ -7,35 +7,51 @@
namespace Kernel {
void KPageHeap::Initialize(VAddr address, std::size_t size, std::size_t metadata_size) {
// Check our assumptions
ASSERT(Common::IsAligned((address), PageSize));
void KPageHeap::Initialize(PAddr address, size_t size, VAddr management_address,
size_t management_size, const size_t* block_shifts,
size_t num_block_shifts) {
// Check our assumptions.
ASSERT(Common::IsAligned(address, PageSize));
ASSERT(Common::IsAligned(size, PageSize));
ASSERT(0 < num_block_shifts && num_block_shifts <= NumMemoryBlockPageShifts);
const VAddr management_end = management_address + management_size;
// Set our members
heap_address = address;
heap_size = size;
// Set our members.
m_heap_address = address;
m_heap_size = size;
m_num_blocks = num_block_shifts;
// Setup bitmaps
metadata.resize(metadata_size / sizeof(u64));
u64* cur_bitmap_storage{metadata.data()};
for (std::size_t i = 0; i < MemoryBlockPageShifts.size(); i++) {
const std::size_t cur_block_shift{MemoryBlockPageShifts[i]};
const std::size_t next_block_shift{
(i != MemoryBlockPageShifts.size() - 1) ? MemoryBlockPageShifts[i + 1] : 0};
cur_bitmap_storage = blocks[i].Initialize(heap_address, heap_size, cur_block_shift,
next_block_shift, cur_bitmap_storage);
// Setup bitmaps.
m_management_data.resize(management_size / sizeof(u64));
u64* cur_bitmap_storage{m_management_data.data()};
for (size_t i = 0; i < num_block_shifts; i++) {
const size_t cur_block_shift = block_shifts[i];
const size_t next_block_shift = (i != num_block_shifts - 1) ? block_shifts[i + 1] : 0;
cur_bitmap_storage = m_blocks[i].Initialize(m_heap_address, m_heap_size, cur_block_shift,
next_block_shift, cur_bitmap_storage);
}
// Ensure we didn't overextend our bounds.
ASSERT(VAddr(cur_bitmap_storage) <= management_end);
}
VAddr KPageHeap::AllocateBlock(s32 index, bool random) {
const std::size_t needed_size{blocks[index].GetSize()};
size_t KPageHeap::GetNumFreePages() const {
size_t num_free = 0;
for (s32 i{index}; i < static_cast<s32>(MemoryBlockPageShifts.size()); i++) {
if (const VAddr addr{blocks[i].PopBlock(random)}; addr) {
if (const std::size_t allocated_size{blocks[i].GetSize()};
allocated_size > needed_size) {
Free(addr + needed_size, (allocated_size - needed_size) / PageSize);
for (size_t i = 0; i < m_num_blocks; i++) {
num_free += m_blocks[i].GetNumFreePages();
}
return num_free;
}
PAddr KPageHeap::AllocateBlock(s32 index, bool random) {
const size_t needed_size = m_blocks[index].GetSize();
for (s32 i = index; i < static_cast<s32>(m_num_blocks); i++) {
if (const PAddr addr = m_blocks[i].PopBlock(random); addr != 0) {
if (const size_t allocated_size = m_blocks[i].GetSize(); allocated_size > needed_size) {
this->Free(addr + needed_size, (allocated_size - needed_size) / PageSize);
}
return addr;
}
@@ -44,34 +60,34 @@ VAddr KPageHeap::AllocateBlock(s32 index, bool random) {
return 0;
}
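
AllocateBlock above pops from the first non-empty block list at or above the requested index and immediately frees the surplus back to the heap. A hedged standalone sketch of the size arithmetic, with the shifts taken from MemoryBlockPageShifts and the scenario invented: the 4 KiB list is empty, so a 64 KiB block ends up serving a 3-page request.

#include <cstddef>
#include <cstdio>

int main() {
    constexpr std::size_t PageSize = 0x1000;
    constexpr std::size_t needed_size = 3 * PageSize;              // request: 3 pages
    constexpr std::size_t allocated_size = std::size_t{1} << 0x10; // a 64 KiB block was popped
    // The allocator keeps the leading needed_size bytes and frees the tail back page-wise.
    std::printf("pages returned to the heap: %zu\n",
                (allocated_size - needed_size) / PageSize); // prints 13 (16 - 3)
    return 0;
}
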
void KPageHeap::FreeBlock(VAddr block, s32 index) {
void KPageHeap::FreeBlock(PAddr block, s32 index) {
do {
block = blocks[index++].PushBlock(block);
block = m_blocks[index++].PushBlock(block);
} while (block != 0);
}
void KPageHeap::Free(VAddr addr, std::size_t num_pages) {
// Freeing no pages is a no-op
void KPageHeap::Free(PAddr addr, size_t num_pages) {
// Freeing no pages is a no-op.
if (num_pages == 0) {
return;
}
// Find the largest block size that we can free, and free as many as possible
s32 big_index{static_cast<s32>(MemoryBlockPageShifts.size()) - 1};
const VAddr start{addr};
const VAddr end{(num_pages * PageSize) + addr};
VAddr before_start{start};
VAddr before_end{start};
VAddr after_start{end};
VAddr after_end{end};
// Find the largest block size that we can free, and free as many as possible.
s32 big_index = static_cast<s32>(m_num_blocks) - 1;
const PAddr start = addr;
const PAddr end = addr + num_pages * PageSize;
PAddr before_start = start;
PAddr before_end = start;
PAddr after_start = end;
PAddr after_end = end;
while (big_index >= 0) {
const std::size_t block_size{blocks[big_index].GetSize()};
const VAddr big_start{Common::AlignUp((start), block_size)};
const VAddr big_end{Common::AlignDown((end), block_size)};
const size_t block_size = m_blocks[big_index].GetSize();
const PAddr big_start = Common::AlignUp(start, block_size);
const PAddr big_end = Common::AlignDown(end, block_size);
if (big_start < big_end) {
// Free as many big blocks as we can
for (auto block{big_start}; block < big_end; block += block_size) {
FreeBlock(block, big_index);
// Free as many big blocks as we can.
for (auto block = big_start; block < big_end; block += block_size) {
this->FreeBlock(block, big_index);
}
before_end = big_start;
after_start = big_end;
@@ -81,31 +97,31 @@ void KPageHeap::Free(VAddr addr, std::size_t num_pages) {
}
ASSERT(big_index >= 0);
// Free space before the big blocks
for (s32 i{big_index - 1}; i >= 0; i--) {
const std::size_t block_size{blocks[i].GetSize()};
// Free space before the big blocks.
for (s32 i = big_index - 1; i >= 0; i--) {
const size_t block_size = m_blocks[i].GetSize();
while (before_start + block_size <= before_end) {
before_end -= block_size;
FreeBlock(before_end, i);
this->FreeBlock(before_end, i);
}
}
// Free space after the big blocks
for (s32 i{big_index - 1}; i >= 0; i--) {
const std::size_t block_size{blocks[i].GetSize()};
// Free space after the big blocks.
for (s32 i = big_index - 1; i >= 0; i--) {
const size_t block_size = m_blocks[i].GetSize();
while (after_start + block_size <= after_end) {
FreeBlock(after_start, i);
this->FreeBlock(after_start, i);
after_start += block_size;
}
}
}
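
Free above splits the range into an aligned middle that is released with the largest block size that fits, plus leading and trailing remainders that are swept with progressively smaller blocks. A hedged standalone sketch of the alignment arithmetic for one invented 40-page range; the addresses are illustrative, not real layout values:

#include <cstdint>
#include <cstdio>

int main() {
    constexpr std::uint64_t PageSize = 0x1000;
    constexpr std::uint64_t BigBlock = std::uint64_t{1} << 0x10;  // 64 KiB blocks
    constexpr std::uint64_t start = 0x80003000;                   // illustrative PAddr
    constexpr std::uint64_t end = start + 40 * PageSize;          // free 40 pages
    const std::uint64_t big_start = (start + BigBlock - 1) & ~(BigBlock - 1); // AlignUp
    const std::uint64_t big_end = end & ~(BigBlock - 1);                      // AlignDown
    // One full 64 KiB block in the middle, 13 leading pages, 11 trailing pages (13 + 16 + 11 = 40).
    std::printf("middle 64 KiB blocks: %llu\n",
                static_cast<unsigned long long>((big_end - big_start) / BigBlock));
    std::printf("leading pages: %llu\n",
                static_cast<unsigned long long>((big_start - start) / PageSize));
    std::printf("trailing pages: %llu\n",
                static_cast<unsigned long long>((end - big_end) / PageSize));
    return 0;
}
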
std::size_t KPageHeap::CalculateManagementOverheadSize(std::size_t region_size) {
std::size_t overhead_size = 0;
for (std::size_t i = 0; i < MemoryBlockPageShifts.size(); i++) {
const std::size_t cur_block_shift{MemoryBlockPageShifts[i]};
const std::size_t next_block_shift{
(i != MemoryBlockPageShifts.size() - 1) ? MemoryBlockPageShifts[i + 1] : 0};
size_t KPageHeap::CalculateManagementOverheadSize(size_t region_size, const size_t* block_shifts,
size_t num_block_shifts) {
size_t overhead_size = 0;
for (size_t i = 0; i < num_block_shifts; i++) {
const size_t cur_block_shift = block_shifts[i];
const size_t next_block_shift = (i != num_block_shifts - 1) ? block_shifts[i + 1] : 0;
overhead_size += KPageHeap::Block::CalculateManagementOverheadSize(
region_size, cur_block_shift, next_block_shift);
}


@@ -23,54 +23,73 @@ public:
KPageHeap() = default;
~KPageHeap() = default;
constexpr VAddr GetAddress() const {
return heap_address;
constexpr PAddr GetAddress() const {
return m_heap_address;
}
constexpr std::size_t GetSize() const {
return heap_size;
constexpr size_t GetSize() const {
return m_heap_size;
}
constexpr VAddr GetEndAddress() const {
return GetAddress() + GetSize();
constexpr PAddr GetEndAddress() const {
return this->GetAddress() + this->GetSize();
}
constexpr std::size_t GetPageOffset(VAddr block) const {
return (block - GetAddress()) / PageSize;
constexpr size_t GetPageOffset(PAddr block) const {
return (block - this->GetAddress()) / PageSize;
}
constexpr size_t GetPageOffsetToEnd(PAddr block) const {
return (this->GetEndAddress() - block) / PageSize;
}
void Initialize(VAddr heap_address, std::size_t heap_size, std::size_t metadata_size);
VAddr AllocateBlock(s32 index, bool random);
void Free(VAddr addr, std::size_t num_pages);
void UpdateUsedSize() {
used_size = heap_size - (GetNumFreePages() * PageSize);
void Initialize(PAddr heap_address, size_t heap_size, VAddr management_address,
size_t management_size) {
return this->Initialize(heap_address, heap_size, management_address, management_size,
MemoryBlockPageShifts.data(), NumMemoryBlockPageShifts);
}
static std::size_t CalculateManagementOverheadSize(std::size_t region_size);
size_t GetFreeSize() const {
return this->GetNumFreePages() * PageSize;
}
static constexpr s32 GetAlignedBlockIndex(std::size_t num_pages, std::size_t align_pages) {
const auto target_pages{std::max(num_pages, align_pages)};
for (std::size_t i = 0; i < NumMemoryBlockPageShifts; i++) {
if (target_pages <=
(static_cast<std::size_t>(1) << MemoryBlockPageShifts[i]) / PageSize) {
void SetInitialUsedSize(size_t reserved_size) {
// Check that the reserved size is valid.
const size_t free_size = this->GetNumFreePages() * PageSize;
ASSERT(m_heap_size >= free_size + reserved_size);
// Set the initial used size.
m_initial_used_size = m_heap_size - free_size - reserved_size;
}
PAddr AllocateBlock(s32 index, bool random);
void Free(PAddr addr, size_t num_pages);
static size_t CalculateManagementOverheadSize(size_t region_size) {
return CalculateManagementOverheadSize(region_size, MemoryBlockPageShifts.data(),
NumMemoryBlockPageShifts);
}
static constexpr s32 GetAlignedBlockIndex(size_t num_pages, size_t align_pages) {
const size_t target_pages = std::max(num_pages, align_pages);
for (size_t i = 0; i < NumMemoryBlockPageShifts; i++) {
if (target_pages <= (size_t(1) << MemoryBlockPageShifts[i]) / PageSize) {
return static_cast<s32>(i);
}
}
return -1;
}
static constexpr s32 GetBlockIndex(std::size_t num_pages) {
for (s32 i{static_cast<s32>(NumMemoryBlockPageShifts) - 1}; i >= 0; i--) {
if (num_pages >= (static_cast<std::size_t>(1) << MemoryBlockPageShifts[i]) / PageSize) {
static constexpr s32 GetBlockIndex(size_t num_pages) {
for (s32 i = static_cast<s32>(NumMemoryBlockPageShifts) - 1; i >= 0; i--) {
if (num_pages >= (size_t(1) << MemoryBlockPageShifts[i]) / PageSize) {
return i;
}
}
return -1;
}
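
GetAlignedBlockIndex walks the shift table upward for the smallest block that can hold the whole request in one piece, while GetBlockIndex walks downward for the largest block the request can still fill. A hedged standalone sketch with the same shift table; the helper names here are invented:

#include <array>
#include <cstddef>
#include <cstdio>

constexpr std::size_t PageSize = 0x1000;
constexpr std::array<std::size_t, 7> Shifts{0xC, 0x10, 0x15, 0x16, 0x19, 0x1D, 0x1E};

int AlignedIndex(std::size_t pages) { // smallest block that holds the request contiguously
    for (std::size_t i = 0; i < Shifts.size(); ++i) {
        if (pages <= (std::size_t{1} << Shifts[i]) / PageSize) {
            return static_cast<int>(i);
        }
    }
    return -1;
}

int BlockIndex(std::size_t pages) { // largest block the request completely fills
    for (int i = static_cast<int>(Shifts.size()) - 1; i >= 0; --i) {
        if (pages >= (std::size_t{1} << Shifts[i]) / PageSize) {
            return i;
        }
    }
    return -1;
}

int main() {
    // Prints "1 0": 3 pages need a 64 KiB block to stay contiguous, but only fill a 4 KiB block.
    std::printf("%d %d\n", AlignedIndex(3), BlockIndex(3));
    return 0;
}
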
static constexpr std::size_t GetBlockSize(std::size_t index) {
return static_cast<std::size_t>(1) << MemoryBlockPageShifts[index];
static constexpr size_t GetBlockSize(size_t index) {
return size_t(1) << MemoryBlockPageShifts[index];
}
static constexpr std::size_t GetBlockNumPages(std::size_t index) {
static constexpr size_t GetBlockNumPages(size_t index) {
return GetBlockSize(index) / PageSize;
}
@@ -83,114 +102,116 @@ private:
Block() = default;
~Block() = default;
constexpr std::size_t GetShift() const {
return block_shift;
constexpr size_t GetShift() const {
return m_block_shift;
}
constexpr std::size_t GetNextShift() const {
return next_block_shift;
constexpr size_t GetNextShift() const {
return m_next_block_shift;
}
constexpr std::size_t GetSize() const {
return static_cast<std::size_t>(1) << GetShift();
constexpr size_t GetSize() const {
return u64(1) << this->GetShift();
}
constexpr std::size_t GetNumPages() const {
return GetSize() / PageSize;
constexpr size_t GetNumPages() const {
return this->GetSize() / PageSize;
}
constexpr std::size_t GetNumFreeBlocks() const {
return bitmap.GetNumBits();
constexpr size_t GetNumFreeBlocks() const {
return m_bitmap.GetNumBits();
}
constexpr std::size_t GetNumFreePages() const {
return GetNumFreeBlocks() * GetNumPages();
constexpr size_t GetNumFreePages() const {
return this->GetNumFreeBlocks() * this->GetNumPages();
}
u64* Initialize(VAddr addr, std::size_t size, std::size_t bs, std::size_t nbs,
u64* bit_storage) {
// Set shifts
block_shift = bs;
next_block_shift = nbs;
u64* Initialize(PAddr addr, size_t size, size_t bs, size_t nbs, u64* bit_storage) {
// Set shifts.
m_block_shift = bs;
m_next_block_shift = nbs;
// Align up the address
VAddr end{addr + size};
const auto align{(next_block_shift != 0) ? (1ULL << next_block_shift)
: (1ULL << block_shift)};
addr = Common::AlignDown((addr), align);
end = Common::AlignUp((end), align);
// Align up the address.
PAddr end = addr + size;
const size_t align = (m_next_block_shift != 0) ? (u64(1) << m_next_block_shift)
: (u64(1) << m_block_shift);
addr = Common::AlignDown(addr, align);
end = Common::AlignUp(end, align);
heap_address = addr;
end_offset = (end - addr) / (1ULL << block_shift);
return bitmap.Initialize(bit_storage, end_offset);
m_heap_address = addr;
m_end_offset = (end - addr) / (u64(1) << m_block_shift);
return m_bitmap.Initialize(bit_storage, m_end_offset);
}
VAddr PushBlock(VAddr address) {
// Set the bit for the free block
std::size_t offset{(address - heap_address) >> GetShift()};
bitmap.SetBit(offset);
PAddr PushBlock(PAddr address) {
// Set the bit for the free block.
size_t offset = (address - m_heap_address) >> this->GetShift();
m_bitmap.SetBit(offset);
// If we have a next shift, try to clear the blocks below and return the address
if (GetNextShift()) {
const auto diff{1ULL << (GetNextShift() - GetShift())};
// If we have a next shift, try to clear the blocks below this one and return the new
// address.
if (this->GetNextShift()) {
const size_t diff = u64(1) << (this->GetNextShift() - this->GetShift());
offset = Common::AlignDown(offset, diff);
if (bitmap.ClearRange(offset, diff)) {
return heap_address + (offset << GetShift());
if (m_bitmap.ClearRange(offset, diff)) {
return m_heap_address + (offset << this->GetShift());
}
}
// We couldn't coalesce, or we're already as big as possible
return 0;
// We couldn't coalesce, or we're already as big as possible.
return {};
}
VAddr PopBlock(bool random) {
// Find a free block
const s64 soffset{bitmap.FindFreeBlock(random)};
PAddr PopBlock(bool random) {
// Find a free block.
s64 soffset = m_bitmap.FindFreeBlock(random);
if (soffset < 0) {
return 0;
return {};
}
const auto offset{static_cast<std::size_t>(soffset)};
const size_t offset = static_cast<size_t>(soffset);
// Update our tracking and return it
bitmap.ClearBit(offset);
return heap_address + (offset << GetShift());
// Update our tracking and return it.
m_bitmap.ClearBit(offset);
return m_heap_address + (offset << this->GetShift());
}
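
PushBlock above coalesces buddies through the bitmap: the freed block's bit is set, and if every bit in its aligned group of 2^(next_shift - shift) siblings is now set, the group is cleared and the merged block's address is returned so FreeBlock can push it at the next level. A hedged standalone sketch of that group test; std::bitset stands in for KPageBitmap, whose ClearRange performs the check and the clear in one call:

#include <bitset>
#include <cstddef>
#include <cstdio>

int main() {
    constexpr std::size_t shift = 0xC, next_shift = 0x10;                 // 4 KiB blocks, 64 KiB parents
    constexpr std::size_t diff = std::size_t{1} << (next_shift - shift);  // 16 siblings per group
    std::bitset<64> free_bits;
    for (std::size_t offset = 16; offset < 32; ++offset) { // free all 16 blocks of one group
        free_bits.set(offset);
        const std::size_t group = offset / diff * diff;     // AlignDown(offset, diff)
        bool all_free = true;
        for (std::size_t i = group; i < group + diff; ++i) {
            all_free = all_free && free_bits.test(i);
        }
        if (all_free) { // promote: clear the group and hand one 64 KiB block to the next level
            for (std::size_t i = group; i < group + diff; ++i) {
                free_bits.reset(i);
            }
            std::printf("coalesced group starting at offset %zu\n", group); // fires once, at offset 31
        }
    }
    return 0;
}
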
static constexpr std::size_t CalculateManagementOverheadSize(std::size_t region_size,
std::size_t cur_block_shift,
std::size_t next_block_shift) {
const auto cur_block_size{(1ULL << cur_block_shift)};
const auto next_block_size{(1ULL << next_block_shift)};
const auto align{(next_block_shift != 0) ? next_block_size : cur_block_size};
public:
static constexpr size_t CalculateManagementOverheadSize(size_t region_size,
size_t cur_block_shift,
size_t next_block_shift) {
const size_t cur_block_size = (u64(1) << cur_block_shift);
const size_t next_block_size = (u64(1) << next_block_shift);
const size_t align = (next_block_shift != 0) ? next_block_size : cur_block_size;
return KPageBitmap::CalculateManagementOverheadSize(
(align * 2 + Common::AlignUp(region_size, align)) / cur_block_size);
}
private:
KPageBitmap bitmap;
VAddr heap_address{};
uintptr_t end_offset{};
std::size_t block_shift{};
std::size_t next_block_shift{};
KPageBitmap m_bitmap;
PAddr m_heap_address{};
uintptr_t m_end_offset{};
size_t m_block_shift{};
size_t m_next_block_shift{};
};
constexpr std::size_t GetNumFreePages() const {
std::size_t num_free{};
private:
void Initialize(PAddr heap_address, size_t heap_size, VAddr management_address,
size_t management_size, const size_t* block_shifts, size_t num_block_shifts);
size_t GetNumFreePages() const;
for (const auto& block : blocks) {
num_free += block.GetNumFreePages();
}
void FreeBlock(PAddr block, s32 index);
return num_free;
}
void FreeBlock(VAddr block, s32 index);
static constexpr std::size_t NumMemoryBlockPageShifts{7};
static constexpr std::array<std::size_t, NumMemoryBlockPageShifts> MemoryBlockPageShifts{
static constexpr size_t NumMemoryBlockPageShifts{7};
static constexpr std::array<size_t, NumMemoryBlockPageShifts> MemoryBlockPageShifts{
0xC, 0x10, 0x15, 0x16, 0x19, 0x1D, 0x1E,
};
VAddr heap_address{};
std::size_t heap_size{};
std::size_t used_size{};
std::array<Block, NumMemoryBlockPageShifts> blocks{};
std::vector<u64> metadata;
private:
static size_t CalculateManagementOverheadSize(size_t region_size, const size_t* block_shifts,
size_t num_block_shifts);
private:
PAddr m_heap_address{};
size_t m_heap_size{};
size_t m_initial_used_size{};
size_t m_num_blocks{};
std::array<Block, NumMemoryBlockPageShifts> m_blocks{};
std::vector<u64> m_management_data;
};
} // namespace Kernel


@@ -273,83 +273,153 @@ ResultCode KPageTable::MapProcessCode(VAddr addr, std::size_t num_pages, KMemory
R_TRY(this->CheckMemoryState(addr, size, KMemoryState::All, KMemoryState::Free,
KMemoryPermission::None, KMemoryPermission::None,
KMemoryAttribute::None, KMemoryAttribute::None));
KPageLinkedList pg;
R_TRY(system.Kernel().MemoryManager().AllocateAndOpen(
&pg, num_pages,
KMemoryManager::EncodeOption(KMemoryManager::Pool::Application, allocation_option)));
KPageLinkedList page_linked_list;
R_TRY(system.Kernel().MemoryManager().Allocate(page_linked_list, num_pages, memory_pool,
allocation_option));
R_TRY(Operate(addr, num_pages, page_linked_list, OperationType::MapGroup));
R_TRY(Operate(addr, num_pages, pg, OperationType::MapGroup));
block_manager->Update(addr, num_pages, state, perm);
return ResultSuccess;
}
ResultCode KPageTable::MapCodeMemory(VAddr dst_addr, VAddr src_addr, std::size_t size) {
ResultCode KPageTable::MapCodeMemory(VAddr dst_address, VAddr src_address, std::size_t size) {
// Validate the mapping request.
R_UNLESS(this->CanContain(dst_address, size, KMemoryState::AliasCode),
ResultInvalidMemoryRegion);
// Lock the table.
KScopedLightLock lk(general_lock);
const std::size_t num_pages{size / PageSize};
// Verify that the source memory is normal heap.
KMemoryState src_state{};
KMemoryPermission src_perm{};
std::size_t num_src_allocator_blocks{};
R_TRY(this->CheckMemoryState(&src_state, &src_perm, nullptr, &num_src_allocator_blocks,
src_address, size, KMemoryState::All, KMemoryState::Normal,
KMemoryPermission::All, KMemoryPermission::UserReadWrite,
KMemoryAttribute::All, KMemoryAttribute::None));
KMemoryState state{};
KMemoryPermission perm{};
CASCADE_CODE(CheckMemoryState(&state, &perm, nullptr, nullptr, src_addr, size,
KMemoryState::All, KMemoryState::Normal, KMemoryPermission::All,
KMemoryPermission::UserReadWrite, KMemoryAttribute::Mask,
KMemoryAttribute::None, KMemoryAttribute::IpcAndDeviceMapped));
if (IsRegionMapped(dst_addr, size)) {
return ResultInvalidCurrentMemory;
}
KPageLinkedList page_linked_list;
AddRegionToPages(src_addr, num_pages, page_linked_list);
// Verify that the destination memory is unmapped.
std::size_t num_dst_allocator_blocks{};
R_TRY(this->CheckMemoryState(&num_dst_allocator_blocks, dst_address, size, KMemoryState::All,
KMemoryState::Free, KMemoryPermission::None,
KMemoryPermission::None, KMemoryAttribute::None,
KMemoryAttribute::None));
// Map the code memory.
{
auto block_guard = detail::ScopeExit(
[&] { Operate(src_addr, num_pages, perm, OperationType::ChangePermissions); });
// Determine the number of pages being operated on.
const std::size_t num_pages = size / PageSize;
CASCADE_CODE(Operate(src_addr, num_pages, KMemoryPermission::None,
OperationType::ChangePermissions));
CASCADE_CODE(MapPages(dst_addr, page_linked_list, KMemoryPermission::None));
// Create page groups for the memory being mapped.
KPageLinkedList pg;
AddRegionToPages(src_address, num_pages, pg);
block_guard.Cancel();
// Reprotect the source as kernel-read/not mapped.
const auto new_perm = static_cast<KMemoryPermission>(KMemoryPermission::KernelRead |
KMemoryPermission::NotMapped);
R_TRY(Operate(src_address, num_pages, new_perm, OperationType::ChangePermissions));
// Ensure that we unprotect the source pages on failure.
auto unprot_guard = SCOPE_GUARD({
ASSERT(this->Operate(src_address, num_pages, src_perm, OperationType::ChangePermissions)
.IsSuccess());
});
// Map the alias pages.
R_TRY(MapPages(dst_address, pg, new_perm));
// We successfully mapped the alias pages, so we don't need to unprotect the src pages on
// failure.
unprot_guard.Cancel();
// Apply the memory block updates.
block_manager->Update(src_address, num_pages, src_state, new_perm,
KMemoryAttribute::Locked);
block_manager->Update(dst_address, num_pages, KMemoryState::AliasCode, new_perm,
KMemoryAttribute::None);
}
block_manager->Update(src_addr, num_pages, state, KMemoryPermission::None,
KMemoryAttribute::Locked);
block_manager->Update(dst_addr, num_pages, KMemoryState::AliasCode);
return ResultSuccess;
}
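
The unprot_guard / Cancel pair above is the usual cancellable scope-guard pattern: the rollback runs automatically on any early return unless the success path explicitly disarms it. A hedged, generic standalone sketch of the idea, not the kernel's ScopeExit or SCOPE_GUARD implementation:

#include <cstdio>
#include <utility>

template <typename F>
class ScopeGuard {
public:
    explicit ScopeGuard(F f) : func{std::move(f)} {}
    ~ScopeGuard() {
        if (active) {
            func(); // the rollback only runs if nobody cancelled it
        }
    }
    void Cancel() {
        active = false;
    }

private:
    F func;
    bool active{true};
};

int main() {
    ScopeGuard guard{[] { std::printf("rolling back source permissions\n"); }};
    const bool mapping_succeeded = true; // pretend the alias mapping returned success
    if (mapping_succeeded) {
        guard.Cancel(); // keep the new protections; no rollback on destruction
    }
    return 0;
}
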
ResultCode KPageTable::UnmapCodeMemory(VAddr dst_addr, VAddr src_addr, std::size_t size) {
ResultCode KPageTable::UnmapCodeMemory(VAddr dst_address, VAddr src_address, std::size_t size) {
// Validate the mapping request.
R_UNLESS(this->CanContain(dst_address, size, KMemoryState::AliasCode),
ResultInvalidMemoryRegion);
// Lock the table.
KScopedLightLock lk(general_lock);
if (!size) {
return ResultSuccess;
// Verify that the source memory is locked normal heap.
std::size_t num_src_allocator_blocks{};
R_TRY(this->CheckMemoryState(std::addressof(num_src_allocator_blocks), src_address, size,
KMemoryState::All, KMemoryState::Normal, KMemoryPermission::None,
KMemoryPermission::None, KMemoryAttribute::All,
KMemoryAttribute::Locked));
// Verify that the destination memory is aliasable code.
std::size_t num_dst_allocator_blocks{};
R_TRY(this->CheckMemoryStateContiguous(
std::addressof(num_dst_allocator_blocks), dst_address, size, KMemoryState::FlagCanCodeAlias,
KMemoryState::FlagCanCodeAlias, KMemoryPermission::None, KMemoryPermission::None,
KMemoryAttribute::All, KMemoryAttribute::None));
// Determine whether any pages being unmapped are code.
bool any_code_pages = false;
{
KMemoryBlockManager::const_iterator it = block_manager->FindIterator(dst_address);
while (true) {
// Get the memory info.
const KMemoryInfo info = it->GetMemoryInfo();
// Check if the memory has code flag.
if ((info.GetState() & KMemoryState::FlagCode) != KMemoryState::None) {
any_code_pages = true;
break;
}
// Check if we're done.
if (dst_address + size - 1 <= info.GetLastAddress()) {
break;
}
// Advance.
++it;
}
}
const std::size_t num_pages{size / PageSize};
// Ensure that we maintain the instruction cache.
bool reprotected_pages = false;
SCOPE_EXIT({
if (reprotected_pages && any_code_pages) {
system.InvalidateCpuInstructionCacheRange(dst_address, size);
}
});
CASCADE_CODE(CheckMemoryState(nullptr, nullptr, nullptr, nullptr, src_addr, size,
KMemoryState::All, KMemoryState::Normal, KMemoryPermission::None,
KMemoryPermission::None, KMemoryAttribute::Mask,
KMemoryAttribute::Locked, KMemoryAttribute::IpcAndDeviceMapped));
// Unmap.
{
// Determine the number of pages being operated on.
const std::size_t num_pages = size / PageSize;
KMemoryState state{};
CASCADE_CODE(CheckMemoryState(
&state, nullptr, nullptr, nullptr, dst_addr, PageSize, KMemoryState::FlagCanCodeAlias,
KMemoryState::FlagCanCodeAlias, KMemoryPermission::None, KMemoryPermission::None,
KMemoryAttribute::Mask, KMemoryAttribute::None, KMemoryAttribute::IpcAndDeviceMapped));
CASCADE_CODE(CheckMemoryState(dst_addr, size, KMemoryState::All, state, KMemoryPermission::None,
KMemoryPermission::None, KMemoryAttribute::Mask,
KMemoryAttribute::None));
CASCADE_CODE(Operate(dst_addr, num_pages, KMemoryPermission::None, OperationType::Unmap));
// Unmap the aliased copy of the pages.
R_TRY(Operate(dst_address, num_pages, KMemoryPermission::None, OperationType::Unmap));
block_manager->Update(dst_addr, num_pages, KMemoryState::Free);
block_manager->Update(src_addr, num_pages, KMemoryState::Normal,
KMemoryPermission::UserReadWrite);
// Try to set the permissions for the source pages back to what they should be.
R_TRY(Operate(src_address, num_pages, KMemoryPermission::UserReadWrite,
OperationType::ChangePermissions));
system.InvalidateCpuInstructionCacheRange(dst_addr, size);
// Apply the memory block updates.
block_manager->Update(dst_address, num_pages, KMemoryState::None);
block_manager->Update(src_address, num_pages, KMemoryState::Normal,
KMemoryPermission::UserReadWrite);
// Note that we reprotected pages.
reprotected_pages = true;
}
return ResultSuccess;
}
@@ -443,9 +513,10 @@ ResultCode KPageTable::MapPhysicalMemory(VAddr address, std::size_t size) {
R_UNLESS(memory_reservation.Succeeded(), ResultLimitReached);
// Allocate pages for the new memory.
KPageLinkedList page_linked_list;
R_TRY(system.Kernel().MemoryManager().Allocate(
page_linked_list, (size - mapped_size) / PageSize, memory_pool, allocation_option));
KPageLinkedList pg;
R_TRY(system.Kernel().MemoryManager().AllocateAndOpenForProcess(
&pg, (size - mapped_size) / PageSize,
KMemoryManager::EncodeOption(memory_pool, allocation_option), 0, 0));
// Map the memory.
{
@@ -547,7 +618,7 @@ ResultCode KPageTable::MapPhysicalMemory(VAddr address, std::size_t size) {
});
// Iterate over the memory.
auto pg_it = page_linked_list.Nodes().begin();
auto pg_it = pg.Nodes().begin();
PAddr pg_phys_addr = pg_it->GetAddress();
size_t pg_pages = pg_it->GetNumPages();
@@ -571,7 +642,7 @@ ResultCode KPageTable::MapPhysicalMemory(VAddr address, std::size_t size) {
// Check if we're at the end of the physical block.
if (pg_pages == 0) {
// Ensure there are more pages to map.
ASSERT(pg_it != page_linked_list.Nodes().end());
ASSERT(pg_it != pg.Nodes().end());
// Advance our physical block.
++pg_it;
@@ -841,10 +912,14 @@ ResultCode KPageTable::UnmapPhysicalMemory(VAddr address, std::size_t size) {
process->GetResourceLimit()->Release(LimitableResource::PhysicalMemory, mapped_size);
// Update memory blocks.
system.Kernel().MemoryManager().Free(pg, size / PageSize, memory_pool, allocation_option);
block_manager->Update(address, size / PageSize, KMemoryState::Free, KMemoryPermission::None,
KMemoryAttribute::None);
// TODO(bunnei): This is a workaround until the next set of changes, where we add reference
// counting for mapped pages. Until then, we must manually close the reference to the page
// group.
system.Kernel().MemoryManager().Close(pg);
// We succeeded.
remap_guard.Cancel();
@@ -1270,9 +1345,16 @@ ResultCode KPageTable::SetHeapSize(VAddr* out, std::size_t size) {
R_UNLESS(memory_reservation.Succeeded(), ResultLimitReached);
// Allocate pages for the heap extension.
KPageLinkedList page_linked_list;
R_TRY(system.Kernel().MemoryManager().Allocate(page_linked_list, allocation_size / PageSize,
memory_pool, allocation_option));
KPageLinkedList pg;
R_TRY(system.Kernel().MemoryManager().AllocateAndOpen(
&pg, allocation_size / PageSize,
KMemoryManager::EncodeOption(memory_pool, allocation_option)));
// Clear all the newly allocated pages.
for (const auto& it : pg.Nodes()) {
std::memset(system.DeviceMemory().GetPointer(it.GetAddress()), heap_fill_value,
it.GetSize());
}
// Map the pages.
{
@@ -1291,7 +1373,7 @@ ResultCode KPageTable::SetHeapSize(VAddr* out, std::size_t size) {
// Map the pages.
const auto num_pages = allocation_size / PageSize;
R_TRY(Operate(current_heap_end, num_pages, page_linked_list, OperationType::MapGroup));
R_TRY(Operate(current_heap_end, num_pages, pg, OperationType::MapGroup));
// Clear all the newly allocated pages.
for (std::size_t cur_page = 0; cur_page < num_pages; ++cur_page) {
@@ -1339,8 +1421,9 @@ ResultVal<VAddr> KPageTable::AllocateAndMapMemory(std::size_t needed_num_pages,
R_TRY(Operate(addr, needed_num_pages, perm, OperationType::Map, map_addr));
} else {
KPageLinkedList page_group;
R_TRY(system.Kernel().MemoryManager().Allocate(page_group, needed_num_pages, memory_pool,
allocation_option));
R_TRY(system.Kernel().MemoryManager().AllocateAndOpenForProcess(
&page_group, needed_num_pages,
KMemoryManager::EncodeOption(memory_pool, allocation_option), 0, 0));
R_TRY(Operate(addr, needed_num_pages, page_group, OperationType::MapGroup));
}
@@ -1547,7 +1630,7 @@ ResultCode KPageTable::Operate(VAddr addr, std::size_t num_pages, KMemoryPermiss
return ResultSuccess;
}
constexpr VAddr KPageTable::GetRegionAddress(KMemoryState state) const {
VAddr KPageTable::GetRegionAddress(KMemoryState state) const {
switch (state) {
case KMemoryState::Free:
case KMemoryState::Kernel:
@@ -1583,7 +1666,7 @@ constexpr VAddr KPageTable::GetRegionAddress(KMemoryState state) const {
}
}
constexpr std::size_t KPageTable::GetRegionSize(KMemoryState state) const {
std::size_t KPageTable::GetRegionSize(KMemoryState state) const {
switch (state) {
case KMemoryState::Free:
case KMemoryState::Kernel:


@@ -36,8 +36,8 @@ public:
KMemoryManager::Pool pool);
ResultCode MapProcessCode(VAddr addr, std::size_t pages_count, KMemoryState state,
KMemoryPermission perm);
ResultCode MapCodeMemory(VAddr dst_addr, VAddr src_addr, std::size_t size);
ResultCode UnmapCodeMemory(VAddr dst_addr, VAddr src_addr, std::size_t size);
ResultCode MapCodeMemory(VAddr dst_address, VAddr src_address, std::size_t size);
ResultCode UnmapCodeMemory(VAddr dst_address, VAddr src_address, std::size_t size);
ResultCode UnmapProcessMemory(VAddr dst_addr, std::size_t size, KPageTable& src_page_table,
VAddr src_addr);
ResultCode MapPhysicalMemory(VAddr addr, std::size_t size);
@@ -102,8 +102,8 @@ private:
OperationType operation);
ResultCode Operate(VAddr addr, std::size_t num_pages, KMemoryPermission perm,
OperationType operation, PAddr map_addr = 0);
constexpr VAddr GetRegionAddress(KMemoryState state) const;
constexpr std::size_t GetRegionSize(KMemoryState state) const;
VAddr GetRegionAddress(KMemoryState state) const;
std::size_t GetRegionSize(KMemoryState state) const;
ResultCode CheckMemoryStateContiguous(std::size_t* out_blocks_needed, VAddr addr,
std::size_t size, KMemoryState state_mask,
@@ -253,9 +253,10 @@ public:
constexpr bool IsInsideASLRRegion(VAddr address, std::size_t size) const {
return !IsOutsideASLRRegion(address, size);
}
PAddr GetPhysicalAddr(VAddr addr) {
ASSERT(IsLockedByCurrentThread());
constexpr std::size_t GetNumGuardPages() const {
return IsKernel() ? 1 : 4;
}
PAddr GetPhysicalAddr(VAddr addr) const {
const auto backing_addr = page_table_impl.backing_addr[addr >> PageBits];
ASSERT(backing_addr);
return backing_addr + addr;
@@ -276,10 +277,6 @@ private:
return is_aslr_enabled;
}
constexpr std::size_t GetNumGuardPages() const {
return IsKernel() ? 1 : 4;
}
constexpr bool ContainsPages(VAddr addr, std::size_t num_pages) const {
return (address_space_start <= addr) &&
(num_pages <= (address_space_end - address_space_start) / PageSize) &&
@@ -311,6 +308,8 @@ private:
bool is_kernel{};
bool is_aslr_enabled{};
u32 heap_fill_value{};
KMemoryManager::Pool memory_pool{KMemoryManager::Pool::Application};
KMemoryManager::Direction allocation_option{KMemoryManager::Direction::FromFront};


@@ -70,13 +70,12 @@ struct KernelCore::Impl {
// Derive the initial memory layout from the emulated board
Init::InitializeSlabResourceCounts(kernel);
KMemoryLayout memory_layout;
DeriveInitialMemoryLayout(memory_layout);
Init::InitializeSlabHeaps(system, memory_layout);
DeriveInitialMemoryLayout();
Init::InitializeSlabHeaps(system, *memory_layout);
// Initialize kernel memory and resources.
InitializeSystemResourceLimit(kernel, system.CoreTiming(), memory_layout);
InitializeMemoryLayout(memory_layout);
InitializeSystemResourceLimit(kernel, system.CoreTiming());
InitializeMemoryLayout();
InitializePageSlab();
InitializeSchedulers();
InitializeSuspendThreads();
@@ -219,12 +218,11 @@ struct KernelCore::Impl {
// Creates the default system resource limit
void InitializeSystemResourceLimit(KernelCore& kernel,
const Core::Timing::CoreTiming& core_timing,
const KMemoryLayout& memory_layout) {
const Core::Timing::CoreTiming& core_timing) {
system_resource_limit = KResourceLimit::Create(system.Kernel());
system_resource_limit->Initialize(&core_timing);
const auto [total_size, kernel_size] = memory_layout.GetTotalAndKernelMemorySizes();
const auto [total_size, kernel_size] = memory_layout->GetTotalAndKernelMemorySizes();
// If setting the default system values fails, then something seriously wrong has occurred.
ASSERT(system_resource_limit->SetLimitValue(LimitableResource::PhysicalMemory, total_size)
@@ -353,16 +351,18 @@ struct KernelCore::Impl {
return schedulers[thread_id]->GetCurrentThread();
}
void DeriveInitialMemoryLayout(KMemoryLayout& memory_layout) {
void DeriveInitialMemoryLayout() {
memory_layout = std::make_unique<KMemoryLayout>();
// Insert the root region for the virtual memory tree, from which all other regions will
// derive.
memory_layout.GetVirtualMemoryRegionTree().InsertDirectly(
memory_layout->GetVirtualMemoryRegionTree().InsertDirectly(
KernelVirtualAddressSpaceBase,
KernelVirtualAddressSpaceBase + KernelVirtualAddressSpaceSize - 1);
// Insert the root region for the physical memory tree, from which all other regions will
// derive.
memory_layout.GetPhysicalMemoryRegionTree().InsertDirectly(
memory_layout->GetPhysicalMemoryRegionTree().InsertDirectly(
KernelPhysicalAddressSpaceBase,
KernelPhysicalAddressSpaceBase + KernelPhysicalAddressSpaceSize - 1);
@@ -379,7 +379,7 @@ struct KernelCore::Impl {
if (!(kernel_region_start + KernelRegionSize - 1 <= KernelVirtualAddressSpaceLast)) {
kernel_region_size = KernelVirtualAddressSpaceEnd - kernel_region_start;
}
ASSERT(memory_layout.GetVirtualMemoryRegionTree().Insert(
ASSERT(memory_layout->GetVirtualMemoryRegionTree().Insert(
kernel_region_start, kernel_region_size, KMemoryRegionType_Kernel));
// Setup the code region.
@@ -388,11 +388,11 @@ struct KernelCore::Impl {
Common::AlignDown(code_start_virt_addr, CodeRegionAlign);
constexpr VAddr code_region_end = Common::AlignUp(code_end_virt_addr, CodeRegionAlign);
constexpr size_t code_region_size = code_region_end - code_region_start;
ASSERT(memory_layout.GetVirtualMemoryRegionTree().Insert(
ASSERT(memory_layout->GetVirtualMemoryRegionTree().Insert(
code_region_start, code_region_size, KMemoryRegionType_KernelCode));
// Setup board-specific device physical regions.
Init::SetupDevicePhysicalMemoryRegions(memory_layout);
Init::SetupDevicePhysicalMemoryRegions(*memory_layout);
// Determine the amount of space needed for the misc region.
size_t misc_region_needed_size;
@@ -401,7 +401,7 @@ struct KernelCore::Impl {
misc_region_needed_size = Core::Hardware::NUM_CPU_CORES * (3 * (PageSize + PageSize));
// Account for each auto-map device.
for (const auto& region : memory_layout.GetPhysicalMemoryRegionTree()) {
for (const auto& region : memory_layout->GetPhysicalMemoryRegionTree()) {
if (region.HasTypeAttribute(KMemoryRegionAttr_ShouldKernelMap)) {
// Check that the region is valid.
ASSERT(region.GetEndAddress() != 0);
@@ -426,22 +426,22 @@ struct KernelCore::Impl {
// Setup the misc region.
const VAddr misc_region_start =
memory_layout.GetVirtualMemoryRegionTree().GetRandomAlignedRegion(
memory_layout->GetVirtualMemoryRegionTree().GetRandomAlignedRegion(
misc_region_size, MiscRegionAlign, KMemoryRegionType_Kernel);
ASSERT(memory_layout.GetVirtualMemoryRegionTree().Insert(
ASSERT(memory_layout->GetVirtualMemoryRegionTree().Insert(
misc_region_start, misc_region_size, KMemoryRegionType_KernelMisc));
// Setup the stack region.
constexpr size_t StackRegionSize = 14_MiB;
constexpr size_t StackRegionAlign = KernelAslrAlignment;
const VAddr stack_region_start =
memory_layout.GetVirtualMemoryRegionTree().GetRandomAlignedRegion(
memory_layout->GetVirtualMemoryRegionTree().GetRandomAlignedRegion(
StackRegionSize, StackRegionAlign, KMemoryRegionType_Kernel);
ASSERT(memory_layout.GetVirtualMemoryRegionTree().Insert(
ASSERT(memory_layout->GetVirtualMemoryRegionTree().Insert(
stack_region_start, StackRegionSize, KMemoryRegionType_KernelStack));
// Determine the size of the resource region.
const size_t resource_region_size = memory_layout.GetResourceRegionSizeForInit();
const size_t resource_region_size = memory_layout->GetResourceRegionSizeForInit();
// Determine the size of the slab region.
const size_t slab_region_size =
@@ -458,23 +458,23 @@ struct KernelCore::Impl {
Common::AlignUp(code_end_phys_addr + slab_region_size, SlabRegionAlign) -
Common::AlignDown(code_end_phys_addr, SlabRegionAlign);
const VAddr slab_region_start =
memory_layout.GetVirtualMemoryRegionTree().GetRandomAlignedRegion(
memory_layout->GetVirtualMemoryRegionTree().GetRandomAlignedRegion(
slab_region_needed_size, SlabRegionAlign, KMemoryRegionType_Kernel) +
(code_end_phys_addr % SlabRegionAlign);
ASSERT(memory_layout.GetVirtualMemoryRegionTree().Insert(
ASSERT(memory_layout->GetVirtualMemoryRegionTree().Insert(
slab_region_start, slab_region_size, KMemoryRegionType_KernelSlab));
// Setup the temp region.
constexpr size_t TempRegionSize = 128_MiB;
constexpr size_t TempRegionAlign = KernelAslrAlignment;
const VAddr temp_region_start =
memory_layout.GetVirtualMemoryRegionTree().GetRandomAlignedRegion(
memory_layout->GetVirtualMemoryRegionTree().GetRandomAlignedRegion(
TempRegionSize, TempRegionAlign, KMemoryRegionType_Kernel);
ASSERT(memory_layout.GetVirtualMemoryRegionTree().Insert(temp_region_start, TempRegionSize,
KMemoryRegionType_KernelTemp));
ASSERT(memory_layout->GetVirtualMemoryRegionTree().Insert(temp_region_start, TempRegionSize,
KMemoryRegionType_KernelTemp));
// Automatically map in devices that have auto-map attributes.
for (auto& region : memory_layout.GetPhysicalMemoryRegionTree()) {
for (auto& region : memory_layout->GetPhysicalMemoryRegionTree()) {
// We only care about kernel regions.
if (!region.IsDerivedFrom(KMemoryRegionType_Kernel)) {
continue;
@@ -501,21 +501,21 @@ struct KernelCore::Impl {
const size_t map_size =
Common::AlignUp(region.GetEndAddress(), PageSize) - map_phys_addr;
const VAddr map_virt_addr =
memory_layout.GetVirtualMemoryRegionTree().GetRandomAlignedRegionWithGuard(
memory_layout->GetVirtualMemoryRegionTree().GetRandomAlignedRegionWithGuard(
map_size, PageSize, KMemoryRegionType_KernelMisc, PageSize);
ASSERT(memory_layout.GetVirtualMemoryRegionTree().Insert(
ASSERT(memory_layout->GetVirtualMemoryRegionTree().Insert(
map_virt_addr, map_size, KMemoryRegionType_KernelMiscMappedDevice));
region.SetPairAddress(map_virt_addr + region.GetAddress() - map_phys_addr);
}
Init::SetupDramPhysicalMemoryRegions(memory_layout);
Init::SetupDramPhysicalMemoryRegions(*memory_layout);
// Insert a physical region for the kernel code region.
ASSERT(memory_layout.GetPhysicalMemoryRegionTree().Insert(
ASSERT(memory_layout->GetPhysicalMemoryRegionTree().Insert(
code_start_phys_addr, code_region_size, KMemoryRegionType_DramKernelCode));
// Insert a physical region for the kernel slab region.
ASSERT(memory_layout.GetPhysicalMemoryRegionTree().Insert(
ASSERT(memory_layout->GetPhysicalMemoryRegionTree().Insert(
slab_start_phys_addr, slab_region_size, KMemoryRegionType_DramKernelSlab));
// Determine size available for kernel page table heaps, requiring > 8 MB.
@@ -524,12 +524,12 @@ struct KernelCore::Impl {
ASSERT(page_table_heap_size / 4_MiB > 2);
// Insert a physical region for the kernel page table heap region
ASSERT(memory_layout.GetPhysicalMemoryRegionTree().Insert(
ASSERT(memory_layout->GetPhysicalMemoryRegionTree().Insert(
slab_end_phys_addr, page_table_heap_size, KMemoryRegionType_DramKernelPtHeap));
// All DRAM regions that we haven't tagged by this point will be mapped under the linear
// mapping. Tag them.
for (auto& region : memory_layout.GetPhysicalMemoryRegionTree()) {
for (auto& region : memory_layout->GetPhysicalMemoryRegionTree()) {
if (region.GetType() == KMemoryRegionType_Dram) {
// Check that the region is valid.
ASSERT(region.GetEndAddress() != 0);
@@ -541,7 +541,7 @@ struct KernelCore::Impl {
// Get the linear region extents.
const auto linear_extents =
memory_layout.GetPhysicalMemoryRegionTree().GetDerivedRegionExtents(
memory_layout->GetPhysicalMemoryRegionTree().GetDerivedRegionExtents(
KMemoryRegionAttr_LinearMapped);
ASSERT(linear_extents.GetEndAddress() != 0);
@@ -553,7 +553,7 @@ struct KernelCore::Impl {
Common::AlignUp(linear_extents.GetEndAddress(), LinearRegionAlign) -
aligned_linear_phys_start;
const VAddr linear_region_start =
memory_layout.GetVirtualMemoryRegionTree().GetRandomAlignedRegionWithGuard(
memory_layout->GetVirtualMemoryRegionTree().GetRandomAlignedRegionWithGuard(
linear_region_size, LinearRegionAlign, KMemoryRegionType_None, LinearRegionAlign);
const u64 linear_region_phys_to_virt_diff = linear_region_start - aligned_linear_phys_start;
@@ -562,7 +562,7 @@ struct KernelCore::Impl {
{
PAddr cur_phys_addr = 0;
u64 cur_size = 0;
for (auto& region : memory_layout.GetPhysicalMemoryRegionTree()) {
for (auto& region : memory_layout->GetPhysicalMemoryRegionTree()) {
if (!region.HasTypeAttribute(KMemoryRegionAttr_LinearMapped)) {
continue;
}
@@ -581,55 +581,49 @@ struct KernelCore::Impl {
const VAddr region_virt_addr =
region.GetAddress() + linear_region_phys_to_virt_diff;
ASSERT(memory_layout.GetVirtualMemoryRegionTree().Insert(
ASSERT(memory_layout->GetVirtualMemoryRegionTree().Insert(
region_virt_addr, region.GetSize(),
GetTypeForVirtualLinearMapping(region.GetType())));
region.SetPairAddress(region_virt_addr);
KMemoryRegion* virt_region =
memory_layout.GetVirtualMemoryRegionTree().FindModifiable(region_virt_addr);
memory_layout->GetVirtualMemoryRegionTree().FindModifiable(region_virt_addr);
ASSERT(virt_region != nullptr);
virt_region->SetPairAddress(region.GetAddress());
}
}
// Insert regions for the initial page table region.
ASSERT(memory_layout.GetPhysicalMemoryRegionTree().Insert(
ASSERT(memory_layout->GetPhysicalMemoryRegionTree().Insert(
resource_end_phys_addr, KernelPageTableHeapSize, KMemoryRegionType_DramKernelInitPt));
ASSERT(memory_layout.GetVirtualMemoryRegionTree().Insert(
ASSERT(memory_layout->GetVirtualMemoryRegionTree().Insert(
resource_end_phys_addr + linear_region_phys_to_virt_diff, KernelPageTableHeapSize,
KMemoryRegionType_VirtualDramKernelInitPt));
// All linear-mapped DRAM regions that we haven't tagged by this point will be allocated to
// some pool partition. Tag them.
for (auto& region : memory_layout.GetPhysicalMemoryRegionTree()) {
for (auto& region : memory_layout->GetPhysicalMemoryRegionTree()) {
if (region.GetType() == (KMemoryRegionType_Dram | KMemoryRegionAttr_LinearMapped)) {
region.SetType(KMemoryRegionType_DramPoolPartition);
}
}
// Setup all other memory regions needed to arrange the pool partitions.
Init::SetupPoolPartitionMemoryRegions(memory_layout);
Init::SetupPoolPartitionMemoryRegions(*memory_layout);
// Cache all linear regions in their own trees for faster access, later.
memory_layout.InitializeLinearMemoryRegionTrees(aligned_linear_phys_start,
linear_region_start);
memory_layout->InitializeLinearMemoryRegionTrees(aligned_linear_phys_start,
linear_region_start);
}
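
The linear mapping derived above is a constant offset: every linear-mapped physical region lands at its physical address plus linear_region_phys_to_virt_diff in the virtual tree, and SetPairAddress lets either tree find its counterpart. A hedged standalone sketch of that arithmetic with made-up addresses:

#include <cstdint>
#include <cstdio>

int main() {
    constexpr std::uint64_t aligned_linear_phys_start = 0x80000000;        // illustrative PAddr
    constexpr std::uint64_t linear_region_start = 0xFFFFFF8080000000;      // illustrative VAddr
    constexpr std::uint64_t diff = linear_region_start - aligned_linear_phys_start;
    constexpr std::uint64_t region_phys = 0x80123000;                      // some DRAM region
    std::printf("virtual pair address: 0x%llX\n",
                static_cast<unsigned long long>(region_phys + diff));
    return 0;
}
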
void InitializeMemoryLayout(const KMemoryLayout& memory_layout) {
const auto system_pool = memory_layout.GetKernelSystemPoolRegionPhysicalExtents();
const auto applet_pool = memory_layout.GetKernelAppletPoolRegionPhysicalExtents();
const auto application_pool = memory_layout.GetKernelApplicationPoolRegionPhysicalExtents();
void InitializeMemoryLayout() {
const auto system_pool = memory_layout->GetKernelSystemPoolRegionPhysicalExtents();
// Initialize memory managers
// Initialize the memory manager.
memory_manager = std::make_unique<KMemoryManager>(system);
memory_manager->InitializeManager(KMemoryManager::Pool::Application,
application_pool.GetAddress(),
application_pool.GetEndAddress());
memory_manager->InitializeManager(KMemoryManager::Pool::Applet, applet_pool.GetAddress(),
applet_pool.GetEndAddress());
memory_manager->InitializeManager(KMemoryManager::Pool::System, system_pool.GetAddress(),
system_pool.GetEndAddress());
const auto& management_region = memory_layout->GetPoolManagementRegion();
ASSERT(management_region.GetEndAddress() != 0);
memory_manager->Initialize(management_region.GetAddress(), management_region.GetSize());
// Setup memory regions for emulated processes
// TODO(bunnei): These should not be hardcoded regions initialized within the kernel
@@ -770,6 +764,9 @@ struct KernelCore::Impl {
Kernel::KSharedMemory* irs_shared_mem{};
Kernel::KSharedMemory* time_shared_mem{};
// Memory layout
std::unique_ptr<KMemoryLayout> memory_layout;
// Threads used for services
std::unordered_set<std::shared_ptr<Kernel::ServiceThread>> service_threads;
Common::ThreadWorker service_threads_manager;
@@ -1135,6 +1132,10 @@ const KWorkerTaskManager& KernelCore::WorkerTaskManager() const {
return impl->worker_task_manager;
}
const KMemoryLayout& KernelCore::MemoryLayout() const {
return *impl->memory_layout;
}
bool KernelCore::IsPhantomModeForSingleCore() const {
return impl->IsPhantomModeForSingleCore();
}


@@ -41,6 +41,7 @@ class KClientSession;
class KEvent;
class KHandleTable;
class KLinkedListNode;
class KMemoryLayout;
class KMemoryManager;
class KPort;
class KProcess;
@@ -350,6 +351,9 @@ public:
/// Gets the current worker task manager, used for dispatching KThread/KProcess tasks.
const KWorkerTaskManager& WorkerTaskManager() const;
/// Gets the memory layout.
const KMemoryLayout& MemoryLayout() const;
private:
friend class KProcess;
friend class KThread;


@@ -288,7 +288,7 @@ public:
}
bool ValidateRegionForMap(Kernel::KPageTable& page_table, VAddr start, std::size_t size) const {
constexpr std::size_t padding_size{4 * Kernel::PageSize};
const std::size_t padding_size{page_table.GetNumGuardPages() * Kernel::PageSize};
const auto start_info{page_table.QueryInfo(start - 1)};
if (start_info.state != Kernel::KMemoryState::Free) {
@@ -308,31 +308,69 @@ public:
return (start + size + padding_size) <= (end_info.GetAddress() + end_info.GetSize());
}
VAddr GetRandomMapRegion(const Kernel::KPageTable& page_table, std::size_t size) const {
VAddr addr{};
const std::size_t end_pages{(page_table.GetAliasCodeRegionSize() - size) >>
Kernel::PageBits};
do {
addr = page_table.GetAliasCodeRegionStart() +
(Kernel::KSystemControl::GenerateRandomRange(0, end_pages) << Kernel::PageBits);
} while (!page_table.IsInsideAddressSpace(addr, size) ||
page_table.IsInsideHeapRegion(addr, size) ||
page_table.IsInsideAliasRegion(addr, size));
return addr;
ResultCode GetAvailableMapRegion(Kernel::KPageTable& page_table, u64 size, VAddr& out_addr) {
size = Common::AlignUp(size, Kernel::PageSize);
size += page_table.GetNumGuardPages() * Kernel::PageSize * 4;
const auto is_region_available = [&](VAddr addr) {
const auto end_addr = addr + size;
while (addr < end_addr) {
if (system.Memory().IsValidVirtualAddress(addr)) {
return false;
}
if (!page_table.IsInsideAddressSpace(out_addr, size)) {
return false;
}
if (page_table.IsInsideHeapRegion(out_addr, size)) {
return false;
}
if (page_table.IsInsideAliasRegion(out_addr, size)) {
return false;
}
addr += Kernel::PageSize;
}
return true;
};
bool succeeded = false;
const auto map_region_end =
page_table.GetAliasCodeRegionStart() + page_table.GetAliasCodeRegionSize();
while (current_map_addr < map_region_end) {
if (is_region_available(current_map_addr)) {
succeeded = true;
break;
}
current_map_addr += 0x100000;
}
if (!succeeded) {
UNREACHABLE_MSG("Out of address space!");
return Kernel::ResultOutOfMemory;
}
out_addr = current_map_addr;
current_map_addr += size;
return ResultSuccess;
}
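
GetAvailableMapRegion above replaces the old randomized placement with a deterministic cursor: scan forward from current_map_addr in 1 MiB steps until a window of the requested size passes all the checks, hand that address out, and move the cursor past it so the next NRO continues from there. A hedged standalone sketch of the cursor logic; the is_free predicate and the occupied-slot set are invented stand-ins for the real page-table and memory checks:

#include <cstdint>
#include <cstdio>
#include <set>

int main() {
    constexpr std::uint64_t PageSize = 0x1000;
    std::set<std::uint64_t> taken{0x10000000, 0x10100000}; // illustrative occupied 1 MiB slots
    const auto is_free = [&](std::uint64_t addr, std::uint64_t size) {
        for (std::uint64_t a = addr; a < addr + size; a += PageSize) {
            if (taken.count(a & ~std::uint64_t{0xFFFFF}) != 0) {
                return false;
            }
        }
        return true;
    };
    std::uint64_t current_map_addr = 0x10000000; // alias-code region start, illustrative
    const std::uint64_t size = 0x80000;          // request: 512 KiB
    while (!is_free(current_map_addr, size)) {
        current_map_addr += 0x100000;            // advance one 1 MiB step, like the service does
    }
    std::printf("mapped at 0x%llX\n", static_cast<unsigned long long>(current_map_addr));
    current_map_addr += size;                    // the next NRO continues from here
    return 0;
}
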
ResultVal<VAddr> MapProcessCodeMemory(Kernel::KProcess* process, VAddr baseAddress,
u64 size) const {
for (std::size_t retry = 0; retry < MAXIMUM_MAP_RETRIES; retry++) {
auto& page_table{process->PageTable()};
const VAddr addr{GetRandomMapRegion(page_table, size)};
const ResultCode result{page_table.MapCodeMemory(addr, baseAddress, size)};
ResultVal<VAddr> MapProcessCodeMemory(Kernel::KProcess* process, VAddr base_addr, u64 size) {
auto& page_table{process->PageTable()};
VAddr addr{};
for (std::size_t retry = 0; retry < MAXIMUM_MAP_RETRIES; retry++) {
R_TRY(GetAvailableMapRegion(page_table, size, addr));
const ResultCode result{page_table.MapCodeMemory(addr, base_addr, size)};
if (result == Kernel::ResultInvalidCurrentMemory) {
continue;
}
CASCADE_CODE(result);
R_TRY(result);
if (ValidateRegionForMap(page_table, addr, size)) {
return addr;
@@ -343,7 +381,7 @@ public:
}
ResultVal<VAddr> MapNro(Kernel::KProcess* process, VAddr nro_addr, std::size_t nro_size,
VAddr bss_addr, std::size_t bss_size, std::size_t size) const {
VAddr bss_addr, std::size_t bss_size, std::size_t size) {
for (std::size_t retry = 0; retry < MAXIMUM_MAP_RETRIES; retry++) {
auto& page_table{process->PageTable()};
VAddr addr{};
@@ -597,6 +635,7 @@ public:
LOG_WARNING(Service_LDR, "(STUBBED) called");
initialized = true;
current_map_addr = system.CurrentProcess()->PageTable().GetAliasCodeRegionStart();
IPC::ResponseBuilder rb{ctx, 2};
rb.Push(ResultSuccess);
@@ -607,6 +646,7 @@ private:
std::map<VAddr, NROInfo> nro;
std::map<VAddr, std::vector<SHA256Hash>> nrr;
VAddr current_map_addr{};
bool IsValidNROHash(const SHA256Hash& hash) const {
return std::any_of(nrr.begin(), nrr.end(), [&hash](const auto& p) {


@@ -13,59 +13,535 @@ namespace {
// Emulate GPU's LOP3.LUT (three-input logic op with 8-bit truth table)
IR::U32 ApplyLUT(IR::IREmitter& ir, const IR::U32& a, const IR::U32& b, const IR::U32& c,
u64 ttbl) {
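// Hedged note: LOP3.LUT's 8-bit immediate is a truth table. For each bit position the lookup
// index is (a_bit << 2) | (b_bit << 1) | c_bit, and that bit of ttbl gives the result bit.
// Spot checks against the table below: 0x01 is set only for a = b = c = 0 (the old
// ~a & ~b & ~c term), 170 (0xAA) reproduces c, and 60 (0x3C) is a ^ b.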
IR::U32 r{ir.Imm32(0)};
const IR::U32 not_a{ir.BitwiseNot(a)};
const IR::U32 not_b{ir.BitwiseNot(b)};
const IR::U32 not_c{ir.BitwiseNot(c)};
if (ttbl & 0x01) {
// r |= ~a & ~b & ~c;
const auto lhs{ir.BitwiseAnd(not_a, not_b)};
const auto rhs{ir.BitwiseAnd(lhs, not_c)};
r = ir.BitwiseOr(r, rhs);
switch (ttbl) {
// generated code, do not edit manually
case 0:
return ir.Imm32(0);
case 1:
return ir.BitwiseNot(ir.BitwiseOr(a, ir.BitwiseOr(b, c)));
case 2:
return ir.BitwiseAnd(c, ir.BitwiseNot(ir.BitwiseOr(a, b)));
case 3:
return ir.BitwiseNot(ir.BitwiseOr(a, b));
case 4:
return ir.BitwiseAnd(b, ir.BitwiseNot(ir.BitwiseOr(a, c)));
case 5:
return ir.BitwiseNot(ir.BitwiseOr(a, c));
case 6:
return ir.BitwiseAnd(ir.BitwiseNot(a), ir.BitwiseXor(b, c));
case 7:
return ir.BitwiseNot(ir.BitwiseOr(a, ir.BitwiseAnd(b, c)));
case 8:
return ir.BitwiseAnd(ir.BitwiseAnd(b, c), ir.BitwiseNot(a));
case 9:
return ir.BitwiseNot(ir.BitwiseOr(a, ir.BitwiseXor(b, c)));
case 10:
return ir.BitwiseAnd(c, ir.BitwiseNot(a));
case 11:
return ir.BitwiseAnd(ir.BitwiseNot(a), ir.BitwiseOr(c, ir.BitwiseNot(b)));
case 12:
return ir.BitwiseAnd(b, ir.BitwiseNot(a));
case 13:
return ir.BitwiseAnd(ir.BitwiseNot(a), ir.BitwiseOr(b, ir.BitwiseNot(c)));
case 14:
return ir.BitwiseAnd(ir.BitwiseNot(a), ir.BitwiseOr(b, c));
case 15:
return ir.BitwiseNot(a);
case 16:
return ir.BitwiseAnd(a, ir.BitwiseNot(ir.BitwiseOr(b, c)));
case 17:
return ir.BitwiseNot(ir.BitwiseOr(b, c));
case 18:
return ir.BitwiseAnd(ir.BitwiseNot(b), ir.BitwiseXor(a, c));
case 19:
return ir.BitwiseNot(ir.BitwiseOr(b, ir.BitwiseAnd(a, c)));
case 20:
return ir.BitwiseAnd(ir.BitwiseNot(c), ir.BitwiseXor(a, b));
case 21:
return ir.BitwiseNot(ir.BitwiseOr(c, ir.BitwiseAnd(a, b)));
case 22:
return ir.BitwiseXor(ir.BitwiseOr(a, b), ir.BitwiseOr(c, ir.BitwiseAnd(a, b)));
case 23:
return ir.BitwiseXor(ir.BitwiseAnd(ir.BitwiseXor(a, b), ir.BitwiseXor(a, c)),
ir.BitwiseNot(a));
case 24:
return ir.BitwiseAnd(ir.BitwiseXor(a, b), ir.BitwiseXor(a, c));
case 25:
return ir.BitwiseNot(ir.BitwiseOr(ir.BitwiseAnd(a, b), ir.BitwiseXor(b, c)));
case 26:
return ir.BitwiseAnd(ir.BitwiseOr(c, ir.BitwiseNot(b)), ir.BitwiseXor(a, c));
case 27:
return ir.BitwiseXor(ir.BitwiseOr(a, ir.BitwiseNot(c)), ir.BitwiseOr(b, c));
case 28:
return ir.BitwiseAnd(ir.BitwiseOr(b, ir.BitwiseNot(c)), ir.BitwiseXor(a, b));
case 29:
return ir.BitwiseXor(ir.BitwiseOr(a, ir.BitwiseNot(b)), ir.BitwiseOr(b, c));
case 30:
return ir.BitwiseXor(a, ir.BitwiseOr(b, c));
case 31:
return ir.BitwiseNot(ir.BitwiseAnd(a, ir.BitwiseOr(b, c)));
case 32:
return ir.BitwiseAnd(ir.BitwiseAnd(a, c), ir.BitwiseNot(b));
case 33:
return ir.BitwiseNot(ir.BitwiseOr(b, ir.BitwiseXor(a, c)));
case 34:
return ir.BitwiseAnd(c, ir.BitwiseNot(b));
case 35:
return ir.BitwiseAnd(ir.BitwiseNot(b), ir.BitwiseOr(c, ir.BitwiseNot(a)));
case 36:
return ir.BitwiseAnd(ir.BitwiseXor(a, b), ir.BitwiseXor(b, c));
case 37:
return ir.BitwiseNot(ir.BitwiseOr(ir.BitwiseAnd(a, b), ir.BitwiseXor(a, c)));
case 38:
return ir.BitwiseAnd(ir.BitwiseOr(c, ir.BitwiseNot(a)), ir.BitwiseXor(b, c));
case 39:
return ir.BitwiseXor(ir.BitwiseOr(a, c), ir.BitwiseOr(b, ir.BitwiseNot(c)));
case 40:
return ir.BitwiseAnd(c, ir.BitwiseXor(a, b));
case 41:
return ir.BitwiseXor(ir.BitwiseOr(a, b),
ir.BitwiseOr(ir.BitwiseAnd(a, b), ir.BitwiseNot(c)));
case 42:
return ir.BitwiseAnd(c, ir.BitwiseNot(ir.BitwiseAnd(a, b)));
case 43:
return ir.BitwiseXor(ir.BitwiseOr(a, ir.BitwiseNot(c)),
ir.BitwiseOr(b, ir.BitwiseXor(a, c)));
case 44:
return ir.BitwiseAnd(ir.BitwiseOr(b, c), ir.BitwiseXor(a, b));
case 45:
return ir.BitwiseXor(a, ir.BitwiseOr(b, ir.BitwiseNot(c)));
case 46:
return ir.BitwiseXor(ir.BitwiseAnd(a, b), ir.BitwiseOr(b, c));
case 47:
return ir.BitwiseOr(ir.BitwiseAnd(c, ir.BitwiseNot(b)), ir.BitwiseNot(a));
case 48:
return ir.BitwiseAnd(a, ir.BitwiseNot(b));
case 49:
return ir.BitwiseAnd(ir.BitwiseNot(b), ir.BitwiseOr(a, ir.BitwiseNot(c)));
case 50:
return ir.BitwiseAnd(ir.BitwiseNot(b), ir.BitwiseOr(a, c));
case 51:
return ir.BitwiseNot(b);
case 52:
return ir.BitwiseAnd(ir.BitwiseOr(a, ir.BitwiseNot(c)), ir.BitwiseXor(a, b));
case 53:
return ir.BitwiseXor(ir.BitwiseOr(a, c), ir.BitwiseOr(b, ir.BitwiseNot(a)));
case 54:
return ir.BitwiseXor(b, ir.BitwiseOr(a, c));
case 55:
return ir.BitwiseNot(ir.BitwiseAnd(b, ir.BitwiseOr(a, c)));
case 56:
return ir.BitwiseAnd(ir.BitwiseOr(a, c), ir.BitwiseXor(a, b));
case 57:
return ir.BitwiseXor(b, ir.BitwiseOr(a, ir.BitwiseNot(c)));
case 58:
return ir.BitwiseXor(ir.BitwiseAnd(a, b), ir.BitwiseOr(a, c));
case 59:
return ir.BitwiseOr(ir.BitwiseAnd(c, ir.BitwiseNot(a)), ir.BitwiseNot(b));
case 60:
return ir.BitwiseXor(a, b);
case 61:
return ir.BitwiseOr(ir.BitwiseNot(ir.BitwiseOr(a, c)), ir.BitwiseXor(a, b));
case 62:
return ir.BitwiseOr(ir.BitwiseAnd(c, ir.BitwiseNot(a)), ir.BitwiseXor(a, b));
case 63:
return ir.BitwiseNot(ir.BitwiseAnd(a, b));
case 64:
return ir.BitwiseAnd(ir.BitwiseAnd(a, b), ir.BitwiseNot(c));
case 65:
return ir.BitwiseNot(ir.BitwiseOr(c, ir.BitwiseXor(a, b)));
case 66:
return ir.BitwiseAnd(ir.BitwiseXor(a, c), ir.BitwiseXor(b, c));
case 67:
return ir.BitwiseNot(ir.BitwiseOr(ir.BitwiseAnd(a, c), ir.BitwiseXor(a, b)));
case 68:
return ir.BitwiseAnd(b, ir.BitwiseNot(c));
case 69:
return ir.BitwiseAnd(ir.BitwiseNot(c), ir.BitwiseOr(b, ir.BitwiseNot(a)));
case 70:
return ir.BitwiseAnd(ir.BitwiseOr(b, ir.BitwiseNot(a)), ir.BitwiseXor(b, c));
case 71:
return ir.BitwiseXor(ir.BitwiseOr(a, b), ir.BitwiseOr(c, ir.BitwiseNot(b)));
case 72:
return ir.BitwiseAnd(b, ir.BitwiseXor(a, c));
case 73:
return ir.BitwiseXor(ir.BitwiseOr(a, c),
ir.BitwiseOr(ir.BitwiseAnd(a, c), ir.BitwiseNot(b)));
case 74:
return ir.BitwiseAnd(ir.BitwiseOr(b, c), ir.BitwiseXor(a, c));
case 75:
return ir.BitwiseXor(a, ir.BitwiseOr(c, ir.BitwiseNot(b)));
case 76:
return ir.BitwiseAnd(b, ir.BitwiseNot(ir.BitwiseAnd(a, c)));
case 77:
return ir.BitwiseXor(ir.BitwiseOr(a, ir.BitwiseNot(b)),
ir.BitwiseOr(c, ir.BitwiseXor(a, b)));
case 78:
return ir.BitwiseXor(ir.BitwiseAnd(a, c), ir.BitwiseOr(b, c));
case 79:
return ir.BitwiseOr(ir.BitwiseAnd(b, ir.BitwiseNot(c)), ir.BitwiseNot(a));
case 80:
return ir.BitwiseAnd(a, ir.BitwiseNot(c));
case 81:
return ir.BitwiseAnd(ir.BitwiseNot(c), ir.BitwiseOr(a, ir.BitwiseNot(b)));
case 82:
return ir.BitwiseAnd(ir.BitwiseOr(a, ir.BitwiseNot(b)), ir.BitwiseXor(a, c));
case 83:
return ir.BitwiseXor(ir.BitwiseOr(a, b), ir.BitwiseOr(c, ir.BitwiseNot(a)));
case 84:
return ir.BitwiseAnd(ir.BitwiseNot(c), ir.BitwiseOr(a, b));
case 85:
return ir.BitwiseNot(c);
case 86:
return ir.BitwiseXor(c, ir.BitwiseOr(a, b));
case 87:
return ir.BitwiseNot(ir.BitwiseAnd(c, ir.BitwiseOr(a, b)));
case 88:
return ir.BitwiseAnd(ir.BitwiseOr(a, b), ir.BitwiseXor(a, c));
case 89:
return ir.BitwiseXor(c, ir.BitwiseOr(a, ir.BitwiseNot(b)));
case 90:
return ir.BitwiseXor(a, c);
case 91:
return ir.BitwiseOr(ir.BitwiseNot(ir.BitwiseOr(a, b)), ir.BitwiseXor(a, c));
case 92:
return ir.BitwiseXor(ir.BitwiseAnd(a, c), ir.BitwiseOr(a, b));
case 93:
return ir.BitwiseOr(ir.BitwiseAnd(b, ir.BitwiseNot(a)), ir.BitwiseNot(c));
case 94:
return ir.BitwiseOr(ir.BitwiseAnd(b, ir.BitwiseNot(a)), ir.BitwiseXor(a, c));
case 95:
return ir.BitwiseNot(ir.BitwiseAnd(a, c));
case 96:
return ir.BitwiseAnd(a, ir.BitwiseXor(b, c));
case 97:
return ir.BitwiseXor(ir.BitwiseOr(b, c),
ir.BitwiseOr(ir.BitwiseAnd(b, c), ir.BitwiseNot(a)));
case 98:
return ir.BitwiseAnd(ir.BitwiseOr(a, c), ir.BitwiseXor(b, c));
case 99:
return ir.BitwiseXor(b, ir.BitwiseOr(c, ir.BitwiseNot(a)));
case 100:
return ir.BitwiseAnd(ir.BitwiseOr(a, b), ir.BitwiseXor(b, c));
case 101:
return ir.BitwiseXor(c, ir.BitwiseOr(b, ir.BitwiseNot(a)));
case 102:
return ir.BitwiseXor(b, c);
case 103:
return ir.BitwiseOr(ir.BitwiseNot(ir.BitwiseOr(a, b)), ir.BitwiseXor(b, c));
case 104:
return ir.BitwiseAnd(ir.BitwiseOr(a, b), ir.BitwiseXor(c, ir.BitwiseAnd(a, b)));
case 105:
return ir.BitwiseXor(ir.BitwiseNot(a), ir.BitwiseXor(b, c));
case 106:
return ir.BitwiseXor(c, ir.BitwiseAnd(a, b));
case 107:
return ir.BitwiseXor(ir.BitwiseAnd(c, ir.BitwiseOr(a, b)),
ir.BitwiseXor(a, ir.BitwiseNot(b)));
case 108:
return ir.BitwiseXor(b, ir.BitwiseAnd(a, c));
case 109:
return ir.BitwiseXor(ir.BitwiseAnd(b, ir.BitwiseOr(a, c)),
ir.BitwiseXor(a, ir.BitwiseNot(c)));
case 110:
return ir.BitwiseOr(ir.BitwiseAnd(b, ir.BitwiseNot(a)), ir.BitwiseXor(b, c));
case 111:
return ir.BitwiseOr(ir.BitwiseNot(a), ir.BitwiseXor(b, c));
case 112:
return ir.BitwiseAnd(a, ir.BitwiseNot(ir.BitwiseAnd(b, c)));
case 113:
return ir.BitwiseXor(ir.BitwiseOr(b, ir.BitwiseNot(a)),
ir.BitwiseOr(c, ir.BitwiseXor(a, b)));
case 114:
return ir.BitwiseXor(ir.BitwiseAnd(b, c), ir.BitwiseOr(a, c));
case 115:
return ir.BitwiseOr(ir.BitwiseAnd(a, ir.BitwiseNot(c)), ir.BitwiseNot(b));
case 116:
return ir.BitwiseXor(ir.BitwiseAnd(b, c), ir.BitwiseOr(a, b));
case 117:
return ir.BitwiseOr(ir.BitwiseAnd(a, ir.BitwiseNot(b)), ir.BitwiseNot(c));
case 118:
return ir.BitwiseOr(ir.BitwiseAnd(a, ir.BitwiseNot(b)), ir.BitwiseXor(b, c));
case 119:
return ir.BitwiseNot(ir.BitwiseAnd(b, c));
case 120:
return ir.BitwiseXor(a, ir.BitwiseAnd(b, c));
case 121:
return ir.BitwiseXor(ir.BitwiseAnd(a, ir.BitwiseOr(b, c)),
ir.BitwiseXor(b, ir.BitwiseNot(c)));
case 122:
return ir.BitwiseOr(ir.BitwiseAnd(a, ir.BitwiseNot(b)), ir.BitwiseXor(a, c));
case 123:
return ir.BitwiseOr(ir.BitwiseNot(b), ir.BitwiseXor(a, c));
case 124:
return ir.BitwiseOr(ir.BitwiseAnd(a, ir.BitwiseNot(c)), ir.BitwiseXor(a, b));
case 125:
return ir.BitwiseOr(ir.BitwiseNot(c), ir.BitwiseXor(a, b));
case 126:
return ir.BitwiseOr(ir.BitwiseXor(a, b), ir.BitwiseXor(a, c));
case 127:
return ir.BitwiseNot(ir.BitwiseAnd(a, ir.BitwiseAnd(b, c)));
case 128:
return ir.BitwiseAnd(a, ir.BitwiseAnd(b, c));
case 129:
return ir.BitwiseNot(ir.BitwiseOr(ir.BitwiseXor(a, b), ir.BitwiseXor(a, c)));
case 130:
return ir.BitwiseAnd(c, ir.BitwiseXor(a, ir.BitwiseNot(b)));
case 131:
return ir.BitwiseAnd(ir.BitwiseOr(c, ir.BitwiseNot(a)), ir.BitwiseXor(a, ir.BitwiseNot(b)));
case 132:
return ir.BitwiseAnd(b, ir.BitwiseXor(a, ir.BitwiseNot(c)));
case 133:
return ir.BitwiseAnd(ir.BitwiseOr(b, ir.BitwiseNot(a)), ir.BitwiseXor(a, ir.BitwiseNot(c)));
case 134:
return ir.BitwiseAnd(ir.BitwiseOr(b, c), ir.BitwiseXor(a, ir.BitwiseXor(b, c)));
case 135:
return ir.BitwiseXor(ir.BitwiseAnd(b, c), ir.BitwiseNot(a));
case 136:
return ir.BitwiseAnd(b, c);
case 137:
return ir.BitwiseAnd(ir.BitwiseOr(b, ir.BitwiseNot(a)), ir.BitwiseXor(b, ir.BitwiseNot(c)));
case 138:
return ir.BitwiseAnd(c, ir.BitwiseOr(b, ir.BitwiseNot(a)));
case 139:
return ir.BitwiseOr(ir.BitwiseAnd(b, c), ir.BitwiseNot(ir.BitwiseOr(a, b)));
case 140:
return ir.BitwiseAnd(b, ir.BitwiseOr(c, ir.BitwiseNot(a)));
case 141:
return ir.BitwiseOr(ir.BitwiseAnd(b, c), ir.BitwiseNot(ir.BitwiseOr(a, c)));
case 142:
return ir.BitwiseXor(a, ir.BitwiseOr(ir.BitwiseXor(a, b), ir.BitwiseXor(a, c)));
case 143:
return ir.BitwiseOr(ir.BitwiseAnd(b, c), ir.BitwiseNot(a));
case 144:
return ir.BitwiseAnd(a, ir.BitwiseXor(b, ir.BitwiseNot(c)));
case 145:
return ir.BitwiseAnd(ir.BitwiseOr(a, ir.BitwiseNot(b)), ir.BitwiseXor(b, ir.BitwiseNot(c)));
case 146:
return ir.BitwiseAnd(ir.BitwiseOr(a, c), ir.BitwiseXor(a, ir.BitwiseXor(b, c)));
case 147:
return ir.BitwiseXor(ir.BitwiseAnd(a, c), ir.BitwiseNot(b));
case 148:
return ir.BitwiseAnd(ir.BitwiseOr(a, b), ir.BitwiseXor(a, ir.BitwiseXor(b, c)));
case 149:
return ir.BitwiseXor(ir.BitwiseAnd(a, b), ir.BitwiseNot(c));
case 150:
return ir.BitwiseXor(a, ir.BitwiseXor(b, c));
case 151:
return ir.BitwiseOr(ir.BitwiseNot(ir.BitwiseOr(a, b)),
ir.BitwiseXor(a, ir.BitwiseXor(b, c)));
case 152:
return ir.BitwiseAnd(ir.BitwiseOr(a, b), ir.BitwiseXor(b, ir.BitwiseNot(c)));
case 153:
return ir.BitwiseXor(b, ir.BitwiseNot(c));
case 154:
return ir.BitwiseXor(c, ir.BitwiseAnd(a, ir.BitwiseNot(b)));
case 155:
return ir.BitwiseNot(ir.BitwiseAnd(ir.BitwiseOr(a, b), ir.BitwiseXor(b, c)));
case 156:
return ir.BitwiseXor(b, ir.BitwiseAnd(a, ir.BitwiseNot(c)));
case 157:
return ir.BitwiseNot(ir.BitwiseAnd(ir.BitwiseOr(a, c), ir.BitwiseXor(b, c)));
case 158:
return ir.BitwiseOr(ir.BitwiseAnd(b, c), ir.BitwiseXor(a, ir.BitwiseOr(b, c)));
case 159:
return ir.BitwiseNot(ir.BitwiseAnd(a, ir.BitwiseXor(b, c)));
case 160:
return ir.BitwiseAnd(a, c);
case 161:
return ir.BitwiseAnd(ir.BitwiseOr(a, ir.BitwiseNot(b)), ir.BitwiseXor(a, ir.BitwiseNot(c)));
case 162:
return ir.BitwiseAnd(c, ir.BitwiseOr(a, ir.BitwiseNot(b)));
case 163:
return ir.BitwiseOr(ir.BitwiseAnd(a, c), ir.BitwiseNot(ir.BitwiseOr(a, b)));
case 164:
return ir.BitwiseAnd(ir.BitwiseOr(a, b), ir.BitwiseXor(a, ir.BitwiseNot(c)));
case 165:
return ir.BitwiseXor(a, ir.BitwiseNot(c));
case 166:
return ir.BitwiseXor(c, ir.BitwiseAnd(b, ir.BitwiseNot(a)));
case 167:
return ir.BitwiseNot(ir.BitwiseAnd(ir.BitwiseOr(a, b), ir.BitwiseXor(a, c)));
case 168:
return ir.BitwiseAnd(c, ir.BitwiseOr(a, b));
case 169:
return ir.BitwiseXor(ir.BitwiseNot(c), ir.BitwiseOr(a, b));
case 170:
return c;
case 171:
return ir.BitwiseOr(c, ir.BitwiseNot(ir.BitwiseOr(a, b)));
case 172:
return ir.BitwiseAnd(ir.BitwiseOr(a, b), ir.BitwiseOr(c, ir.BitwiseNot(a)));
case 173:
return ir.BitwiseOr(ir.BitwiseAnd(b, c), ir.BitwiseXor(a, ir.BitwiseNot(c)));
case 174:
return ir.BitwiseOr(c, ir.BitwiseAnd(b, ir.BitwiseNot(a)));
case 175:
return ir.BitwiseOr(c, ir.BitwiseNot(a));
case 176:
return ir.BitwiseAnd(a, ir.BitwiseOr(c, ir.BitwiseNot(b)));
case 177:
return ir.BitwiseOr(ir.BitwiseAnd(a, c), ir.BitwiseNot(ir.BitwiseOr(b, c)));
case 178:
return ir.BitwiseXor(b, ir.BitwiseOr(ir.BitwiseXor(a, b), ir.BitwiseXor(a, c)));
case 179:
return ir.BitwiseOr(ir.BitwiseAnd(a, c), ir.BitwiseNot(b));
case 180:
return ir.BitwiseXor(a, ir.BitwiseAnd(b, ir.BitwiseNot(c)));
case 181:
return ir.BitwiseNot(ir.BitwiseAnd(ir.BitwiseOr(b, c), ir.BitwiseXor(a, c)));
case 182:
return ir.BitwiseOr(ir.BitwiseAnd(a, c), ir.BitwiseXor(b, ir.BitwiseOr(a, c)));
case 183:
return ir.BitwiseNot(ir.BitwiseAnd(b, ir.BitwiseXor(a, c)));
case 184:
return ir.BitwiseAnd(ir.BitwiseOr(a, b), ir.BitwiseOr(c, ir.BitwiseNot(b)));
case 185:
return ir.BitwiseOr(ir.BitwiseAnd(a, c), ir.BitwiseXor(b, ir.BitwiseNot(c)));
case 186:
return ir.BitwiseOr(c, ir.BitwiseAnd(a, ir.BitwiseNot(b)));
case 187:
return ir.BitwiseOr(c, ir.BitwiseNot(b));
case 188:
return ir.BitwiseOr(ir.BitwiseAnd(a, c), ir.BitwiseXor(a, b));
case 189:
return ir.BitwiseOr(ir.BitwiseXor(a, b), ir.BitwiseXor(a, ir.BitwiseNot(c)));
case 190:
return ir.BitwiseOr(c, ir.BitwiseXor(a, b));
case 191:
return ir.BitwiseOr(c, ir.BitwiseNot(ir.BitwiseAnd(a, b)));
case 192:
return ir.BitwiseAnd(a, b);
case 193:
return ir.BitwiseAnd(ir.BitwiseOr(a, ir.BitwiseNot(c)), ir.BitwiseXor(a, ir.BitwiseNot(b)));
case 194:
return ir.BitwiseAnd(ir.BitwiseOr(a, c), ir.BitwiseXor(a, ir.BitwiseNot(b)));
case 195:
return ir.BitwiseXor(a, ir.BitwiseNot(b));
case 196:
return ir.BitwiseAnd(b, ir.BitwiseOr(a, ir.BitwiseNot(c)));
case 197:
return ir.BitwiseOr(ir.BitwiseAnd(a, b), ir.BitwiseNot(ir.BitwiseOr(a, c)));
case 198:
return ir.BitwiseXor(b, ir.BitwiseAnd(c, ir.BitwiseNot(a)));
case 199:
return ir.BitwiseNot(ir.BitwiseAnd(ir.BitwiseOr(a, c), ir.BitwiseXor(a, b)));
case 200:
return ir.BitwiseAnd(b, ir.BitwiseOr(a, c));
case 201:
return ir.BitwiseXor(ir.BitwiseNot(b), ir.BitwiseOr(a, c));
case 202:
return ir.BitwiseAnd(ir.BitwiseOr(a, c), ir.BitwiseOr(b, ir.BitwiseNot(a)));
case 203:
return ir.BitwiseOr(ir.BitwiseAnd(b, c), ir.BitwiseXor(a, ir.BitwiseNot(b)));
case 204:
return b;
case 205:
return ir.BitwiseOr(b, ir.BitwiseNot(ir.BitwiseOr(a, c)));
case 206:
return ir.BitwiseOr(b, ir.BitwiseAnd(c, ir.BitwiseNot(a)));
case 207:
return ir.BitwiseOr(b, ir.BitwiseNot(a));
case 208:
return ir.BitwiseAnd(a, ir.BitwiseOr(b, ir.BitwiseNot(c)));
case 209:
return ir.BitwiseOr(ir.BitwiseAnd(a, b), ir.BitwiseNot(ir.BitwiseOr(b, c)));
case 210:
return ir.BitwiseXor(a, ir.BitwiseAnd(c, ir.BitwiseNot(b)));
case 211:
return ir.BitwiseNot(ir.BitwiseAnd(ir.BitwiseOr(b, c), ir.BitwiseXor(a, b)));
case 212:
return ir.BitwiseXor(c, ir.BitwiseOr(ir.BitwiseXor(a, b), ir.BitwiseXor(a, c)));
case 213:
return ir.BitwiseOr(ir.BitwiseAnd(a, b), ir.BitwiseNot(c));
case 214:
return ir.BitwiseOr(ir.BitwiseAnd(a, b), ir.BitwiseXor(c, ir.BitwiseOr(a, b)));
case 215:
return ir.BitwiseNot(ir.BitwiseAnd(c, ir.BitwiseXor(a, b)));
case 216:
return ir.BitwiseAnd(ir.BitwiseOr(a, c), ir.BitwiseOr(b, ir.BitwiseNot(c)));
case 217:
return ir.BitwiseOr(ir.BitwiseAnd(a, b), ir.BitwiseXor(b, ir.BitwiseNot(c)));
case 218:
return ir.BitwiseOr(ir.BitwiseAnd(a, b), ir.BitwiseXor(a, c));
case 219:
return ir.BitwiseOr(ir.BitwiseXor(a, c), ir.BitwiseXor(a, ir.BitwiseNot(b)));
case 220:
return ir.BitwiseOr(b, ir.BitwiseAnd(a, ir.BitwiseNot(c)));
case 221:
return ir.BitwiseOr(b, ir.BitwiseNot(c));
case 222:
return ir.BitwiseOr(b, ir.BitwiseXor(a, c));
case 223:
return ir.BitwiseOr(b, ir.BitwiseNot(ir.BitwiseAnd(a, c)));
case 224:
return ir.BitwiseAnd(a, ir.BitwiseOr(b, c));
case 225:
return ir.BitwiseXor(ir.BitwiseNot(a), ir.BitwiseOr(b, c));
case 226:
return ir.BitwiseAnd(ir.BitwiseOr(a, ir.BitwiseNot(b)), ir.BitwiseOr(b, c));
case 227:
return ir.BitwiseOr(ir.BitwiseAnd(a, c), ir.BitwiseXor(a, ir.BitwiseNot(b)));
case 228:
return ir.BitwiseAnd(ir.BitwiseOr(a, ir.BitwiseNot(c)), ir.BitwiseOr(b, c));
case 229:
return ir.BitwiseOr(ir.BitwiseAnd(a, b), ir.BitwiseXor(a, ir.BitwiseNot(c)));
case 230:
return ir.BitwiseOr(ir.BitwiseAnd(a, b), ir.BitwiseXor(b, c));
case 231:
return ir.BitwiseOr(ir.BitwiseXor(a, ir.BitwiseNot(b)), ir.BitwiseXor(b, c));
case 232:
return ir.BitwiseAnd(ir.BitwiseOr(a, b), ir.BitwiseOr(c, ir.BitwiseAnd(a, b)));
case 233:
return ir.BitwiseOr(ir.BitwiseAnd(a, b),
ir.BitwiseXor(ir.BitwiseNot(c), ir.BitwiseOr(a, b)));
case 234:
return ir.BitwiseOr(c, ir.BitwiseAnd(a, b));
case 235:
return ir.BitwiseOr(c, ir.BitwiseXor(a, ir.BitwiseNot(b)));
case 236:
return ir.BitwiseOr(b, ir.BitwiseAnd(a, c));
case 237:
return ir.BitwiseOr(b, ir.BitwiseXor(a, ir.BitwiseNot(c)));
case 238:
return ir.BitwiseOr(b, c);
case 239:
return ir.BitwiseOr(ir.BitwiseNot(a), ir.BitwiseOr(b, c));
case 240:
return a;
case 241:
return ir.BitwiseOr(a, ir.BitwiseNot(ir.BitwiseOr(b, c)));
case 242:
return ir.BitwiseOr(a, ir.BitwiseAnd(c, ir.BitwiseNot(b)));
case 243:
return ir.BitwiseOr(a, ir.BitwiseNot(b));
case 244:
return ir.BitwiseOr(a, ir.BitwiseAnd(b, ir.BitwiseNot(c)));
case 245:
return ir.BitwiseOr(a, ir.BitwiseNot(c));
case 246:
return ir.BitwiseOr(a, ir.BitwiseXor(b, c));
case 247:
return ir.BitwiseOr(a, ir.BitwiseNot(ir.BitwiseAnd(b, c)));
case 248:
return ir.BitwiseOr(a, ir.BitwiseAnd(b, c));
case 249:
return ir.BitwiseOr(a, ir.BitwiseXor(b, ir.BitwiseNot(c)));
case 250:
return ir.BitwiseOr(a, c);
case 251:
return ir.BitwiseOr(ir.BitwiseNot(b), ir.BitwiseOr(a, c));
case 252:
return ir.BitwiseOr(a, b);
case 253:
return ir.BitwiseOr(ir.BitwiseNot(c), ir.BitwiseOr(a, b));
case 254:
return ir.BitwiseOr(a, ir.BitwiseOr(b, c));
case 255:
return ir.Imm32(0xFFFFFFFF);
// end of generated code
}
if (ttbl & 0x02) {
// r |= ~a & ~b & c;
const auto lhs{ir.BitwiseAnd(not_a, not_b)};
const auto rhs{ir.BitwiseAnd(lhs, c)};
r = ir.BitwiseOr(r, rhs);
}
if (ttbl & 0x04) {
// r |= ~a & b & ~c;
const auto lhs{ir.BitwiseAnd(not_a, b)};
const auto rhs{ir.BitwiseAnd(lhs, not_c)};
r = ir.BitwiseOr(r, rhs);
}
if (ttbl & 0x08) {
// r |= ~a & b & c;
const auto lhs{ir.BitwiseAnd(not_a, b)};
const auto rhs{ir.BitwiseAnd(lhs, c)};
r = ir.BitwiseOr(r, rhs);
}
if (ttbl & 0x10) {
// r |= a & ~b & ~c;
const auto lhs{ir.BitwiseAnd(a, not_b)};
const auto rhs{ir.BitwiseAnd(lhs, not_c)};
r = ir.BitwiseOr(r, rhs);
}
if (ttbl & 0x20) {
// r |= a & ~b & c;
const auto lhs{ir.BitwiseAnd(a, not_b)};
const auto rhs{ir.BitwiseAnd(lhs, c)};
r = ir.BitwiseOr(r, rhs);
}
if (ttbl & 0x40) {
// r |= a & b & ~c;
const auto lhs{ir.BitwiseAnd(a, b)};
const auto rhs{ir.BitwiseAnd(lhs, not_c)};
r = ir.BitwiseOr(r, rhs);
}
if (ttbl & 0x80) {
// r |= a & b & c;
const auto lhs{ir.BitwiseAnd(a, b)};
const auto rhs{ir.BitwiseAnd(lhs, c)};
r = ir.BitwiseOr(r, rhs);
}
return r;
throw NotImplementedException("LOP3 with out of range ttbl");
}
IR::U32 LOP3(TranslatorVisitor& v, u64 insn, const IR::U32& op_b, const IR::U32& op_c, u64 lut) {
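Each case index in the generated switch above is the 8-bit truth table of the expression it returns, evaluated over the three inputs with a encoded as 0xF0, b as 0xCC and c as 0xAA (the same constants the brute-force script below uses). A minimal Python sketch, assuming only that encoding, checks two of the generated cases:

# Truth-table constants for the three LOP3 inputs (matching the generator script below).
A, B, C = 0xF0, 0xCC, 0xAA

def lut(f):
    # Evaluate a bitwise function on the encoded inputs, keeping the low 8 bits,
    # one bit per (a, b, c) combination.
    return f(A, B, C) & 0xFF

# case 150 above is generated as a ^ b ^ c
assert lut(lambda a, b, c: a ^ b ^ c) == 150
# case 232 above is generated as (a | b) & (c | (a & b))
assert lut(lambda a, b, c: (a | b) & (c | (a & b))) == 232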


@@ -0,0 +1,92 @@
# Copyright © 2022 degasus <markus@selfnet.de>
# This work is free. You can redistribute it and/or modify it under the
# terms of the Do What The Fuck You Want To Public License, Version 2,
# as published by Sam Hocevar. See http://www.wtfpl.net/ for more details.
from itertools import product
# The primitive instructions
OPS = {
'ir.BitwiseAnd({}, {})' : (2, 1, lambda a,b: a&b),
'ir.BitwiseOr({}, {})' : (2, 1, lambda a,b: a|b),
'ir.BitwiseXor({}, {})' : (2, 1, lambda a,b: a^b),
'ir.BitwiseNot({})' : (1, 0.1, lambda a: (~a) & 255), # Only a tiny cost, as this can often be inlined into other instructions
}
# Our database of combinations of instructions
optimized_calls = {}
def cmp(lhs, rhs):
if lhs is None: # new entry
return True
if lhs[3] > rhs[3]: # costs
return True
if lhs[3] < rhs[3]: # costs
return False
if len(lhs[0]) > len(rhs[0]): # string len
return True
if len(lhs[0]) < len(rhs[0]): # string len
return False
if lhs[0] > rhs[0]: # string sorting
return True
if lhs[0] < rhs[0]: # string sorting
return False
assert lhs == rhs, "redundant instruction, bug in brute force"
return False
def register(imm, instruction, count, latency):
# Use the sum of instruction count and latency as costs to evaluate which combination is best
costs = count + latency
old = optimized_calls.get(imm, None)
new = (instruction, count, latency, costs)
# Update if new or better
if cmp(old, new):
optimized_calls[imm] = new
return True
return False
# Constants: 0, 1 (for free)
register(0, 'ir.Imm32(0)', 0, 0)
register(255, 'ir.Imm32(0xFFFFFFFF)', 0, 0)
# Inputs: a, b, c (for free)
ta = 0xF0
tb = 0xCC
tc = 0xAA
inputs = {
ta : 'a',
tb : 'b',
tc : 'c',
}
for imm, instruction in inputs.items():
register(imm, instruction, 0, 0)
register((~imm) & 255, 'ir.BitwiseNot({})'.format(instruction), 0.099, 0.099) # slightly cheaper NOT on the inputs
# Try to combine two values from the db with an instruction.
# If it is better than the old method, update it.
while True:
registered = 0
calls_copy = optimized_calls.copy()
for OP, (argc, cost, f) in OPS.items():
for args in product(calls_copy.items(), repeat=argc):
# unpack (transpose) the arrays
imm = [arg[0] for arg in args]
value = [arg[1][0] for arg in args]
count = [arg[1][1] for arg in args]
latency = [arg[1][2] for arg in args]
registered += register(
f(*imm),
OP.format(*value),
sum(count) + cost,
max(latency) + cost)
if registered == 0:
# No updates in this pass, so terminate
break
# Hacky output. Please improve me to output valid C++ instead.
s = """ case {imm}:
return {op};"""
for imm in range(256):
print(s.format(imm=imm, op=optimized_calls[imm][0]))
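The printing loop above indexes optimized_calls[imm] for every value from 0 to 255, so a coverage check placed just before it would make a gap in the brute force fail loudly instead of surfacing as a KeyError mid-output. A hedged addition for illustration, not part of the committed script:

# Illustrative sanity check: ensure the search covered every immediate before printing.
missing = [imm for imm in range(256) if imm not in optimized_calls]
assert not missing, f"brute force did not cover immediates: {missing}"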


@@ -53,7 +53,6 @@ void MaxwellDMA::Launch() {
// TODO(Subv): Perform more research and implement all features of this engine.
const LaunchDMA& launch = regs.launch_dma;
ASSERT(launch.semaphore_type == LaunchDMA::SemaphoreType::NONE);
ASSERT(launch.interrupt_type == LaunchDMA::InterruptType::NONE);
ASSERT(launch.data_transfer_type == LaunchDMA::DataTransferType::NON_PIPELINED);
ASSERT(regs.dst_params.origin.x == 0);
@@ -79,6 +78,7 @@ void MaxwellDMA::Launch() {
CopyPitchToBlockLinear();
}
}
ReleaseSemaphore();
}
void MaxwellDMA::CopyPitchToPitch() {
@@ -244,4 +244,22 @@ void MaxwellDMA::FastCopyBlockLinearToPitch() {
memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
}
void MaxwellDMA::ReleaseSemaphore() {
const auto type = regs.launch_dma.semaphore_type;
const GPUVAddr address = regs.semaphore.address;
switch (type) {
case LaunchDMA::SemaphoreType::NONE:
break;
case LaunchDMA::SemaphoreType::RELEASE_ONE_WORD_SEMAPHORE:
memory_manager.Write<u32>(address, regs.semaphore.payload);
break;
case LaunchDMA::SemaphoreType::RELEASE_FOUR_WORD_SEMAPHORE:
memory_manager.Write<u64>(address, static_cast<u64>(regs.semaphore.payload));
memory_manager.Write<u64>(address + 8, system.GPU().GetTicks());
break;
default:
UNREACHABLE_MSG("Unknown semaphore type: {}", static_cast<u32>(type.Value()));
}
}
} // namespace Tegra::Engines
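For reference, the four-word release above writes a 16-byte record: the u64 payload at the semaphore address, followed by the u64 GPU tick counter at offset 8. A small Python sketch of that layout, assuming little-endian word order (an assumption, not stated in the diff):

import struct

def four_word_semaphore(payload: int, gpu_ticks: int) -> bytes:
    # u64 payload at offset 0, u64 timestamp at offset 8 (16 bytes total), mirroring
    # the two Write<u64> calls in MaxwellDMA::ReleaseSemaphore above.
    return struct.pack('<QQ', payload, gpu_ticks)

assert len(four_word_semaphore(1, 123456)) == 16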


@@ -224,6 +224,8 @@ private:
void FastCopyBlockLinearToPitch();
void ReleaseSemaphore();
Core::System& system;
MemoryManager& memory_manager;


@@ -243,10 +243,6 @@ GraphicsPipeline::GraphicsPipeline(
case Settings::ShaderBackend::GLASM:
if (!sources[stage].empty()) {
assembly_programs[stage] = CompileProgram(sources[stage], AssemblyStage(stage));
if (in_parallel) {
// Make sure program is built before continuing when building in parallel
glGetString(GL_PROGRAM_ERROR_STRING_NV);
}
}
break;
case Settings::ShaderBackend::SPIRV:
@@ -256,20 +252,18 @@ GraphicsPipeline::GraphicsPipeline(
break;
}
}
if (in_parallel && backend != Settings::ShaderBackend::GLASM) {
// Make sure programs have built if we are building shaders in parallel
for (OGLProgram& program : source_programs) {
if (program.handle != 0) {
GLint status{};
glGetProgramiv(program.handle, GL_LINK_STATUS, &status);
}
}
if (in_parallel) {
std::lock_guard lock{built_mutex};
built_fence.Create();
// Flush this context to ensure compilation commands and fence are in the GPU pipe.
glFlush();
built_condvar.notify_one();
} else {
is_built = true;
}
if (shader_notify) {
shader_notify->MarkShaderComplete();
}
is_built = true;
built_condvar.notify_one();
}};
if (thread_worker) {
thread_worker->QueueWork(std::move(func));
@@ -440,7 +434,7 @@ void GraphicsPipeline::ConfigureImpl(bool is_indexed) {
buffer_cache.UpdateGraphicsBuffers(is_indexed);
buffer_cache.BindHostGeometryBuffers(is_indexed);
if (!is_built.load(std::memory_order::relaxed)) {
if (!IsBuilt()) {
WaitForBuild();
}
const bool use_assembly{assembly_programs[0].handle != 0};
@@ -585,8 +579,26 @@ void GraphicsPipeline::GenerateTransformFeedbackState() {
}
void GraphicsPipeline::WaitForBuild() {
std::unique_lock lock{built_mutex};
built_condvar.wait(lock, [this] { return is_built.load(std::memory_order::relaxed); });
if (built_fence.handle == 0) {
std::unique_lock lock{built_mutex};
built_condvar.wait(lock, [this] { return built_fence.handle != 0; });
}
ASSERT(glClientWaitSync(built_fence.handle, 0, GL_TIMEOUT_IGNORED) != GL_WAIT_FAILED);
is_built = true;
}
bool GraphicsPipeline::IsBuilt() noexcept {
if (is_built) {
return true;
}
if (built_fence.handle == 0) {
return false;
}
// Timeout of zero means this is non-blocking
const auto sync_status = glClientWaitSync(built_fence.handle, 0, 0);
ASSERT(sync_status != GL_WAIT_FAILED);
is_built = sync_status != GL_TIMEOUT_EXPIRED;
return is_built;
}
} // namespace OpenGL


@@ -100,9 +100,7 @@ public:
return writes_global_memory;
}
[[nodiscard]] bool IsBuilt() const noexcept {
return is_built.load(std::memory_order::relaxed);
}
[[nodiscard]] bool IsBuilt() noexcept;
template <typename Spec>
static auto MakeConfigureSpecFunc() {
@@ -154,7 +152,8 @@ private:
std::mutex built_mutex;
std::condition_variable built_condvar;
std::atomic_bool is_built{false};
OGLSync built_fence{};
bool is_built{false};
};
} // namespace OpenGL


@@ -50,6 +50,7 @@ std::unique_ptr<Tegra::GPU> CreateGPU(Core::Frontend::EmuWindow& emu_window, Cor
gpu->BindRenderer(std::move(renderer));
return gpu;
} catch (const std::runtime_error& exception) {
scope.Cancel();
LOG_ERROR(HW_GPU, "Failed to initialize GPU: {}", exception.what());
return nullptr;
}


@@ -609,6 +609,7 @@ void Config::ReadCpuValues() {
ReadGlobalSetting(Settings::values.cpuopt_unsafe_ignore_standard_fpcr);
ReadGlobalSetting(Settings::values.cpuopt_unsafe_inaccurate_nan);
ReadGlobalSetting(Settings::values.cpuopt_unsafe_fastmem_check);
ReadGlobalSetting(Settings::values.cpuopt_unsafe_ignore_global_monitor);
if (global) {
ReadBasicSetting(Settings::values.cpu_debug_mode);
@@ -621,6 +622,8 @@ void Config::ReadCpuValues() {
ReadBasicSetting(Settings::values.cpuopt_misc_ir);
ReadBasicSetting(Settings::values.cpuopt_reduce_misalign_checks);
ReadBasicSetting(Settings::values.cpuopt_fastmem);
ReadBasicSetting(Settings::values.cpuopt_fastmem_exclusives);
ReadBasicSetting(Settings::values.cpuopt_recompile_exclusives);
}
qt_config->endGroup();
@@ -1139,6 +1142,7 @@ void Config::SaveCpuValues() {
WriteGlobalSetting(Settings::values.cpuopt_unsafe_ignore_standard_fpcr);
WriteGlobalSetting(Settings::values.cpuopt_unsafe_inaccurate_nan);
WriteGlobalSetting(Settings::values.cpuopt_unsafe_fastmem_check);
WriteGlobalSetting(Settings::values.cpuopt_unsafe_ignore_global_monitor);
if (global) {
WriteBasicSetting(Settings::values.cpu_debug_mode);


@@ -36,6 +36,7 @@ void ConfigureCpu::SetConfiguration() {
ui->cpuopt_unsafe_ignore_standard_fpcr->setEnabled(runtime_lock);
ui->cpuopt_unsafe_inaccurate_nan->setEnabled(runtime_lock);
ui->cpuopt_unsafe_fastmem_check->setEnabled(runtime_lock);
ui->cpuopt_unsafe_ignore_global_monitor->setEnabled(runtime_lock);
ui->cpuopt_unsafe_unfuse_fma->setChecked(Settings::values.cpuopt_unsafe_unfuse_fma.GetValue());
ui->cpuopt_unsafe_reduce_fp_error->setChecked(
@@ -46,6 +47,8 @@ void ConfigureCpu::SetConfiguration() {
Settings::values.cpuopt_unsafe_inaccurate_nan.GetValue());
ui->cpuopt_unsafe_fastmem_check->setChecked(
Settings::values.cpuopt_unsafe_fastmem_check.GetValue());
ui->cpuopt_unsafe_ignore_global_monitor->setChecked(
Settings::values.cpuopt_unsafe_ignore_global_monitor.GetValue());
if (Settings::IsConfiguringGlobal()) {
ui->accuracy->setCurrentIndex(static_cast<int>(Settings::values.cpu_accuracy.GetValue()));
@@ -82,6 +85,9 @@ void ConfigureCpu::ApplyConfiguration() {
ConfigurationShared::ApplyPerGameSetting(&Settings::values.cpuopt_unsafe_fastmem_check,
ui->cpuopt_unsafe_fastmem_check,
cpuopt_unsafe_fastmem_check);
ConfigurationShared::ApplyPerGameSetting(&Settings::values.cpuopt_unsafe_ignore_global_monitor,
ui->cpuopt_unsafe_ignore_global_monitor,
cpuopt_unsafe_ignore_global_monitor);
}
void ConfigureCpu::changeEvent(QEvent* event) {
@@ -120,4 +126,7 @@ void ConfigureCpu::SetupPerGameUI() {
ConfigurationShared::SetColoredTristate(ui->cpuopt_unsafe_fastmem_check,
Settings::values.cpuopt_unsafe_fastmem_check,
cpuopt_unsafe_fastmem_check);
ConfigurationShared::SetColoredTristate(ui->cpuopt_unsafe_ignore_global_monitor,
Settings::values.cpuopt_unsafe_ignore_global_monitor,
cpuopt_unsafe_ignore_global_monitor);
}


@@ -45,6 +45,7 @@ private:
ConfigurationShared::CheckState cpuopt_unsafe_ignore_standard_fpcr;
ConfigurationShared::CheckState cpuopt_unsafe_inaccurate_nan;
ConfigurationShared::CheckState cpuopt_unsafe_fastmem_check;
ConfigurationShared::CheckState cpuopt_unsafe_ignore_global_monitor;
const Core::System& system;
};


@@ -150,6 +150,18 @@
</property>
</widget>
</item>
<item>
<widget class="QCheckBox" name="cpuopt_unsafe_ignore_global_monitor">
<property name="toolTip">
<string>
&lt;div&gt;This option improves speed by relying only on the semantics of cmpxchg to ensure safety of exclusive access instructions. Please note this may result in deadlocks and other race conditions.&lt;/div&gt;
</string>
</property>
<property name="text">
<string>Ignore global monitor</string>
</property>
</widget>
</item>
</layout>
</widget>
</item>


@@ -44,6 +44,12 @@ void ConfigureCpuDebug::SetConfiguration() {
Settings::values.cpuopt_reduce_misalign_checks.GetValue());
ui->cpuopt_fastmem->setEnabled(runtime_lock);
ui->cpuopt_fastmem->setChecked(Settings::values.cpuopt_fastmem.GetValue());
ui->cpuopt_fastmem_exclusives->setEnabled(runtime_lock);
ui->cpuopt_fastmem_exclusives->setChecked(
Settings::values.cpuopt_fastmem_exclusives.GetValue());
ui->cpuopt_recompile_exclusives->setEnabled(runtime_lock);
ui->cpuopt_recompile_exclusives->setChecked(
Settings::values.cpuopt_recompile_exclusives.GetValue());
}
void ConfigureCpuDebug::ApplyConfiguration() {
@@ -56,6 +62,8 @@ void ConfigureCpuDebug::ApplyConfiguration() {
Settings::values.cpuopt_misc_ir = ui->cpuopt_misc_ir->isChecked();
Settings::values.cpuopt_reduce_misalign_checks = ui->cpuopt_reduce_misalign_checks->isChecked();
Settings::values.cpuopt_fastmem = ui->cpuopt_fastmem->isChecked();
Settings::values.cpuopt_fastmem_exclusives = ui->cpuopt_fastmem_exclusives->isChecked();
Settings::values.cpuopt_recompile_exclusives = ui->cpuopt_recompile_exclusives->isChecked();
}
void ConfigureCpuDebug::changeEvent(QEvent* event) {


@@ -144,7 +144,34 @@
</string>
</property>
<property name="text">
<string>Enable Host MMU Emulation</string>
<string>Enable Host MMU Emulation (general memory instructions)</string>
</property>
</widget>
</item>
<item>
<widget class="QCheckBox" name="cpuopt_fastmem_exclusives">
<property name="toolTip">
<string>
&lt;div style=&quot;white-space: nowrap&quot;&gt;This optimization speeds up exclusive memory accesses by the guest program.&lt;/div&gt;
&lt;div style=&quot;white-space: nowrap&quot;&gt;Enabling it causes guest exclusive memory reads/writes to be done directly into memory and to make use of the host's MMU.&lt;/div&gt;
&lt;div style=&quot;white-space: nowrap&quot;&gt;Disabling this forces all exclusive memory accesses to use Software MMU Emulation.&lt;/div&gt;
</string>
</property>
<property name="text">
<string>Enable Host MMU Emulation (exclusive memory instructions)</string>
</property>
</widget>
</item>
<item>
<widget class="QCheckBox" name="cpuopt_recompile_exclusives">
<property name="toolTip">
<string>
&lt;div style=&quot;white-space: nowrap&quot;&gt;This optimization speeds up exclusive memory accesses by the guest program.&lt;/div&gt;
&lt;div style=&quot;white-space: nowrap&quot;&gt;Enabling it reduces the overhead of fastmem failures for exclusive memory accesses.&lt;/div&gt;
</string>
</property>
<property name="text">
<string>Enable recompilation of exclusive memory instructions</string>
</property>
</widget>
</item>


@@ -280,11 +280,14 @@ void Config::ReadValues() {
ReadSetting("Cpu", Settings::values.cpuopt_misc_ir);
ReadSetting("Cpu", Settings::values.cpuopt_reduce_misalign_checks);
ReadSetting("Cpu", Settings::values.cpuopt_fastmem);
ReadSetting("Cpu", Settings::values.cpuopt_fastmem_exclusives);
ReadSetting("Cpu", Settings::values.cpuopt_recompile_exclusives);
ReadSetting("Cpu", Settings::values.cpuopt_unsafe_unfuse_fma);
ReadSetting("Cpu", Settings::values.cpuopt_unsafe_reduce_fp_error);
ReadSetting("Cpu", Settings::values.cpuopt_unsafe_ignore_standard_fpcr);
ReadSetting("Cpu", Settings::values.cpuopt_unsafe_inaccurate_nan);
ReadSetting("Cpu", Settings::values.cpuopt_unsafe_fastmem_check);
ReadSetting("Cpu", Settings::values.cpuopt_unsafe_ignore_global_monitor);
// Renderer
ReadSetting("Renderer", Settings::values.renderer_backend);


@@ -174,6 +174,14 @@ cpuopt_reduce_misalign_checks =
# 0: Disabled, 1 (default): Enabled
cpuopt_fastmem =
# Enable Host MMU Emulation for exclusive memory instructions (faster guest memory access)
# 0: Disabled, 1 (default): Enabled
cpuopt_fastmem_exclusives =
# Enable recompilation as a fallback when fastmem fails for exclusive memory instructions (faster guest memory access)
# 0: Disabled, 1 (default): Enabled
cpuopt_recompile_exclusives =
# Enable unfuse FMA (improve performance on CPUs without FMA)
# Only enabled if cpu_accuracy is set to Unsafe. Automatically chosen with cpu_accuracy = Auto-select.
# 0: Disabled, 1 (default): Enabled
@@ -199,6 +207,11 @@ cpuopt_unsafe_inaccurate_nan =
# 0: Disabled, 1 (default): Enabled
cpuopt_unsafe_fastmem_check =
# Enable faster exclusive instructions
# Only enabled if cpu_accuracy is set to Unsafe. Automatically chosen with cpu_accuracy = Auto-select.
# 0: Disabled, 1 (default): Enabled
cpuopt_unsafe_ignore_global_monitor =
[Renderer]
# Which backend API to use.
# 0 (default): OpenGL, 1: Vulkan