"Merge Tagged PR 1012"

"Merge Tagged PR 1340"
"Merge Tagged PR 1703"
2019-11-03 10:19:13 +00:00 · 2019-11-03 10:19:11 +00:00 · 2019-11-03 10:19:10 +00:00 · 2019-11-03 10:19:09 +00:00 · 2019-11-03 10:19:09 +00:00 · 2019-11-03 10:19:08 +00:00
130 changed files with 4452 additions and 2306 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -26,11 +26,11 @@
    path = externals/mbedtls
    url = https://github.com/DarkLordZach/mbedtls
 [submodule "opus"]
-    path = externals/opus
-    url = https://github.com/ogniK5377/opus.git
+    path = externals/opus/opus
+    url = https://github.com/xiph/opus.git
 [submodule "soundtouch"]
-	path = externals/soundtouch
-	url = https://github.com/citra-emu/ext-soundtouch.git
+    path = externals/soundtouch
+    url = https://github.com/citra-emu/ext-soundtouch.git
 [submodule "libressl"]
    path = externals/libressl
    url = https://github.com/citra-emu/ext-libressl-portable.git
--- a/CMakeModules/GenerateSCMRev.cmake
+++ b/CMakeModules/GenerateSCMRev.cmake
@@ -85,10 +85,12 @@ set(HASH_FILES
    "${VIDEO_CORE}/shader/decode/xmad.cpp"
    "${VIDEO_CORE}/shader/ast.cpp"
    "${VIDEO_CORE}/shader/ast.h"
-    "${VIDEO_CORE}/shader/control_flow.cpp"
-    "${VIDEO_CORE}/shader/control_flow.h"
    "${VIDEO_CORE}/shader/compiler_settings.cpp"
    "${VIDEO_CORE}/shader/compiler_settings.h"
+    "${VIDEO_CORE}/shader/const_buffer_locker.cpp"
+    "${VIDEO_CORE}/shader/const_buffer_locker.h"
+    "${VIDEO_CORE}/shader/control_flow.cpp"
+    "${VIDEO_CORE}/shader/control_flow.h"
    "${VIDEO_CORE}/shader/decode.cpp"
    "${VIDEO_CORE}/shader/expr.cpp"
    "${VIDEO_CORE}/shader/expr.h"
--- a/externals/opus
+++ b/externals/opus
--- a/externals/opus/CMakeLists.txt
+++ b/externals/opus/CMakeLists.txt
@@ -0,0 +1,250 @@
+cmake_minimum_required(VERSION 3.8)
+
+project(opus)
+
+option(OPUS_STACK_PROTECTOR "Use stack protection" OFF)
+option(OPUS_USE_ALLOCA "Use alloca for stack arrays (on non-C99 compilers)" OFF)
+option(OPUS_CUSTOM_MODES "Enable non-Opus modes, e.g. 44.1 kHz & 2^n frames" OFF)
+option(OPUS_FIXED_POINT "Compile as fixed-point (for machines without a fast enough FPU)" OFF)
+option(OPUS_ENABLE_FLOAT_API "Compile with the floating point API (for machines with float library)" ON)
+
+include(opus/opus_functions.cmake)
+
+# Stack protection:
+#  - requested, non-MSVC: hand -fstack-protector-strong to check_and_set_flag
+#    (nothing to do on MSVC, where /GS is on by default)
+#  - not requested, MSVC: hand /GS- to check_and_set_flag (BUFFER_SECURITY_CHECK)
+if(OPUS_STACK_PROTECTOR AND NOT MSVC)
+    check_and_set_flag(STACK_PROTECTION_STRONG -fstack-protector-strong)
+elseif(MSVC AND NOT OPUS_STACK_PROTECTOR)
+    check_and_set_flag(BUFFER_SECURITY_CHECK /GS-)
+endif()
+
+add_library(opus STATIC # static library target; source paths are relative to this directory
+    # CELT sources (opus/celt/)
+    opus/celt/bands.c
+    opus/celt/celt.c
+    opus/celt/celt_decoder.c
+    opus/celt/celt_encoder.c
+    opus/celt/celt_lpc.c
+    opus/celt/cwrs.c
+    opus/celt/entcode.c
+    opus/celt/entdec.c
+    opus/celt/entenc.c
+    opus/celt/kiss_fft.c
+    opus/celt/laplace.c
+    opus/celt/mathops.c
+    opus/celt/mdct.c
+    opus/celt/modes.c
+    opus/celt/pitch.c
+    opus/celt/quant_bands.c
+    opus/celt/rate.c
+    opus/celt/vq.c
+
+    # SILK sources (opus/silk/) that are compiled regardless of OPUS_FIXED_POINT
+    opus/silk/A2NLSF.c
+    opus/silk/CNG.c
+    opus/silk/HP_variable_cutoff.c
+    opus/silk/LPC_analysis_filter.c
+    opus/silk/LPC_fit.c
+    opus/silk/LPC_inv_pred_gain.c
+    opus/silk/LP_variable_cutoff.c
+    opus/silk/NLSF2A.c
+    opus/silk/NLSF_VQ.c
+    opus/silk/NLSF_VQ_weights_laroia.c
+    opus/silk/NLSF_decode.c
+    opus/silk/NLSF_del_dec_quant.c
+    opus/silk/NLSF_encode.c
+    opus/silk/NLSF_stabilize.c
+    opus/silk/NLSF_unpack.c
+    opus/silk/NSQ.c
+    opus/silk/NSQ_del_dec.c
+    opus/silk/PLC.c
+    opus/silk/VAD.c
+    opus/silk/VQ_WMat_EC.c
+    opus/silk/ana_filt_bank_1.c
+    opus/silk/biquad_alt.c
+    opus/silk/bwexpander.c
+    opus/silk/bwexpander_32.c
+    opus/silk/check_control_input.c
+    opus/silk/code_signs.c
+    opus/silk/control_SNR.c
+    opus/silk/control_audio_bandwidth.c
+    opus/silk/control_codec.c
+    opus/silk/dec_API.c
+    opus/silk/decode_core.c
+    opus/silk/decode_frame.c
+    opus/silk/decode_indices.c
+    opus/silk/decode_parameters.c
+    opus/silk/decode_pitch.c
+    opus/silk/decode_pulses.c
+    opus/silk/decoder_set_fs.c
+    opus/silk/enc_API.c
+    opus/silk/encode_indices.c
+    opus/silk/encode_pulses.c
+    opus/silk/gain_quant.c
+    opus/silk/init_decoder.c
+    opus/silk/init_encoder.c
+    opus/silk/inner_prod_aligned.c
+    opus/silk/interpolate.c
+    opus/silk/lin2log.c
+    opus/silk/log2lin.c
+    opus/silk/pitch_est_tables.c
+    opus/silk/process_NLSFs.c
+    opus/silk/quant_LTP_gains.c
+    opus/silk/resampler.c
+    opus/silk/resampler_down2.c
+    opus/silk/resampler_down2_3.c
+    opus/silk/resampler_private_AR2.c
+    opus/silk/resampler_private_IIR_FIR.c
+    opus/silk/resampler_private_down_FIR.c
+    opus/silk/resampler_private_up2_HQ.c
+    opus/silk/resampler_rom.c
+    opus/silk/shell_coder.c
+    opus/silk/sigm_Q15.c
+    opus/silk/sort.c
+    opus/silk/stereo_LR_to_MS.c
+    opus/silk/stereo_MS_to_LR.c
+    opus/silk/stereo_decode_pred.c
+    opus/silk/stereo_encode_pred.c
+    opus/silk/stereo_find_predictor.c
+    opus/silk/stereo_quant_pred.c
+    opus/silk/sum_sqr_shift.c
+    opus/silk/table_LSF_cos.c
+    opus/silk/tables_LTP.c
+    opus/silk/tables_NLSF_CB_NB_MB.c
+    opus/silk/tables_NLSF_CB_WB.c
+    opus/silk/tables_gain.c
+    opus/silk/tables_other.c
+    opus/silk/tables_pitch_lag.c
+    opus/silk/tables_pulses_per_block.c
+
+    # Opus sources (opus/src/)
+    opus/src/analysis.c
+    opus/src/mapping_matrix.c
+    opus/src/mlp.c
+    opus/src/mlp_data.c
+    opus/src/opus.c
+    opus/src/opus_decoder.c
+    opus/src/opus_encoder.c
+    opus/src/opus_multistream.c
+    opus/src/opus_multistream_decoder.c
+    opus/src/opus_multistream_encoder.c
+    opus/src/opus_projection_decoder.c
+    opus/src/opus_projection_encoder.c
+    opus/src/repacketizer.c
+)
+
+if (DEBUG) # NOTE(review): DEBUG is not set anywhere in this file; presumably provided by the parent project or -DDEBUG=ON -- confirm
+    target_sources(opus PRIVATE opus/silk/debug.c) # extra SILK debug source, only when DEBUG is truthy
+endif()
+
+if (NOT OPUS_FIXED_POINT) # floating-point SILK sources (OPUS_FIXED_POINT defaults to OFF)
+    target_sources(opus PRIVATE
+        opus/silk/float/LPC_analysis_filter_FLP.c
+        opus/silk/float/LPC_inv_pred_gain_FLP.c
+        opus/silk/float/LTP_analysis_filter_FLP.c
+        opus/silk/float/LTP_scale_ctrl_FLP.c
+        opus/silk/float/apply_sine_window_FLP.c
+        opus/silk/float/autocorrelation_FLP.c
+        opus/silk/float/burg_modified_FLP.c
+        opus/silk/float/bwexpander_FLP.c
+        opus/silk/float/corrMatrix_FLP.c
+        opus/silk/float/encode_frame_FLP.c
+        opus/silk/float/energy_FLP.c
+        opus/silk/float/find_LPC_FLP.c
+        opus/silk/float/find_LTP_FLP.c
+        opus/silk/float/find_pitch_lags_FLP.c
+        opus/silk/float/find_pred_coefs_FLP.c
+        opus/silk/float/inner_product_FLP.c
+        opus/silk/float/k2a_FLP.c
+        opus/silk/float/noise_shape_analysis_FLP.c
+        opus/silk/float/pitch_analysis_core_FLP.c
+        opus/silk/float/process_gains_FLP.c
+        opus/silk/float/regularize_correlations_FLP.c
+        opus/silk/float/residual_energy_FLP.c
+        opus/silk/float/scale_copy_vector_FLP.c
+        opus/silk/float/scale_vector_FLP.c
+        opus/silk/float/schur_FLP.c
+        opus/silk/float/sort_FLP.c
+        opus/silk/float/warped_autocorrelation_FLP.c
+        opus/silk/float/wrappers_FLP.c
+    )
+else() # fixed-point SILK sources
+    target_sources(opus PRIVATE
+        opus/silk/fixed/LTP_analysis_filter_FIX.c
+        opus/silk/fixed/LTP_scale_ctrl_FIX.c
+        opus/silk/fixed/apply_sine_window_FIX.c
+        opus/silk/fixed/autocorr_FIX.c
+        opus/silk/fixed/burg_modified_FIX.c
+        opus/silk/fixed/corrMatrix_FIX.c
+        opus/silk/fixed/encode_frame_FIX.c
+        opus/silk/fixed/find_LPC_FIX.c
+        opus/silk/fixed/find_LTP_FIX.c
+        opus/silk/fixed/find_pitch_lags_FIX.c
+        opus/silk/fixed/find_pred_coefs_FIX.c
+        opus/silk/fixed/k2a_FIX.c
+        opus/silk/fixed/k2a_Q16_FIX.c
+        opus/silk/fixed/noise_shape_analysis_FIX.c
+        opus/silk/fixed/pitch_analysis_core_FIX.c
+        opus/silk/fixed/prefilter_FIX.c
+        opus/silk/fixed/process_gains_FIX.c
+        opus/silk/fixed/regularize_correlations_FIX.c
+        opus/silk/fixed/residual_energy16_FIX.c
+        opus/silk/fixed/residual_energy_FIX.c
+        opus/silk/fixed/schur64_FIX.c
+        opus/silk/fixed/schur_FIX.c
+        opus/silk/fixed/solve_LS_FIX.c
+        opus/silk/fixed/vector_ops_FIX.c
+        opus/silk/fixed/warped_autocorrelation_FIX.c
+    )
+endif()
+
+# Definitions for every opus source file; _FORTIFY_SOURCE=2 is added for non-MSVC compilers only.
+target_compile_definitions(opus PRIVATE
+    OPUS_BUILD ENABLE_HARDENING
+    $<$<NOT:$<BOOL:${MSVC}>>:_FORTIFY_SOURCE=2>
+)
+
+# Stack allocation strategy -- exactly one of these is always defined:
+#   USE_ALLOCA: use alloca() for stack allocation (OPUS_USE_ALLOCA=ON, or any MSVC build)
+#   VAR_ARRAYS: use C99 variable-length arrays for stack allocation (everything else)
+# If neither were defined, the fallback would be a non-threadsafe global array.
+if(NOT OPUS_USE_ALLOCA AND NOT MSVC)
+    target_compile_definitions(opus PRIVATE VAR_ARRAYS)
+else()
+    target_compile_definitions(opus PRIVATE USE_ALLOCA)
+endif()
+
+# Optional feature switches derived from the OPUS_* options:
+#   OPUS_CUSTOM_MODES=ON      -> CUSTOM_MODES
+#   OPUS_ENABLE_FLOAT_API=OFF -> DISABLE_FLOAT_API
+target_compile_definitions(opus PRIVATE
+    $<$<BOOL:${OPUS_CUSTOM_MODES}>:CUSTOM_MODES>
+    $<$<NOT:$<BOOL:${OPUS_ENABLE_FLOAT_API}>>:DISABLE_FLOAT_API>
+)
+
+target_compile_definitions(opus
+PUBLIC
+    # Version string; CMake strips the leading -D. PUBLIC also passes it to targets that link opus.
+    -DOPUS_VERSION="\\"1.3.1\\""
+PRIVATE
+    # Use C99 intrinsics to speed up float-to-int conversion
+    HAVE_LRINTF
+)
+
+if (OPUS_FIXED_POINT) # must be the same option that selects the silk/fixed sources
+    target_compile_definitions(opus PRIVATE FIXED_POINT=1 DISABLE_FLOAT_API)
+endif()
+
+# Only opus/include is exported to consumers; the other directories are used while building opus.
+target_include_directories(opus
+    PUBLIC
+        "${CMAKE_CURRENT_SOURCE_DIR}/opus/include"
+    PRIVATE
+        "${CMAKE_CURRENT_SOURCE_DIR}/opus/celt"
+        "${CMAKE_CURRENT_SOURCE_DIR}/opus/silk"
+        "${CMAKE_CURRENT_SOURCE_DIR}/opus/silk/fixed"
+        "${CMAKE_CURRENT_SOURCE_DIR}/opus/silk/float"
+        "${CMAKE_CURRENT_SOURCE_DIR}/opus/src"
+)
--- a/externals/opus/opus
+++ b/externals/opus/opus
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -74,10 +74,12 @@ add_custom_command(OUTPUT scm_rev.cpp
      "${VIDEO_CORE}/shader/decode/xmad.cpp"
      "${VIDEO_CORE}/shader/ast.cpp"
      "${VIDEO_CORE}/shader/ast.h"
-      "${VIDEO_CORE}/shader/control_flow.cpp"
-      "${VIDEO_CORE}/shader/control_flow.h"
      "${VIDEO_CORE}/shader/compiler_settings.cpp"
      "${VIDEO_CORE}/shader/compiler_settings.h"
+      "${VIDEO_CORE}/shader/const_buffer_locker.cpp"
+      "${VIDEO_CORE}/shader/const_buffer_locker.h"
+      "${VIDEO_CORE}/shader/control_flow.cpp"
+      "${VIDEO_CORE}/shader/control_flow.h"
      "${VIDEO_CORE}/shader/decode.cpp"
      "${VIDEO_CORE}/shader/expr.cpp"
      "${VIDEO_CORE}/shader/expr.h"
--- a/src/common/assert.h
+++ b/src/common/assert.h
@@ -28,18 +28,14 @@ __declspec(noinline, noreturn)
 }

 #define ASSERT(_a_)                                                                                \
-    do                                                                                             \
-        if (!(_a_)) {                                                                              \
-            assert_noinline_call([] { LOG_CRITICAL(Debug, "Assertion Failed!"); });                \
-        }                                                                                          \
-    while (0)
+    do { /* do/while(0) keeps the macro a single statement, safe inside if/else */                 \
+        if (!(_a_)) { LOG_CRITICAL(Debug, "Assertion Failed!"); }                                  \
+    } while (0)

 #define ASSERT_MSG(_a_, ...)                                                                       \
-    do                                                                                             \
-        if (!(_a_)) {                                                                              \
-            assert_noinline_call([&] { LOG_CRITICAL(Debug, "Assertion Failed!\n" __VA_ARGS__); }); \
-        }                                                                                          \
-    while (0)
+    do { /* do/while(0) keeps the macro a single statement, safe inside if/else */                 \
+        if (!(_a_)) { LOG_CRITICAL(Debug, "Assertion Failed! " __VA_ARGS__); }                     \
+    } while (0)

 #define UNREACHABLE() ASSERT_MSG(false, "Unreachable code!")
 #define UNREACHABLE_MSG(...) ASSERT_MSG(false, __VA_ARGS__)
--- a/src/common/bit_field.h
+++ b/src/common/bit_field.h
@@ -168,11 +168,11 @@ public:
    constexpr BitField(BitField&&) noexcept = default;
    constexpr BitField& operator=(BitField&&) noexcept = default;

-    constexpr FORCE_INLINE operator T() const {
+    constexpr operator T() const {
        return Value();
    }

-    constexpr FORCE_INLINE void Assign(const T& value) {
+    constexpr void Assign(const T& value) {
        storage = (static_cast<StorageType>(storage) & ~mask) | FormatValue(value);
    }

--- a/src/common/hash.h
+++ b/src/common/hash.h
@@ -6,6 +6,8 @@

 #include <cstddef>
 #include <cstring>
+#include <utility>
+#include <boost/functional/hash.hpp>
 #include "common/cityhash.h"
 #include "common/common_types.h"

@@ -68,4 +70,13 @@ struct HashableStruct {
    }
 };

+struct PairHash { // Hash functor for std::pair: std::hash of `first` combined with that of `second`
+    template <class First, class Second>
+    std::size_t operator()(const std::pair<First, Second>& pair) const noexcept {
+        std::size_t combined = std::hash<First>{}(pair.first);
+        boost::hash_combine(combined, std::hash<Second>{}(pair.second));
+        return combined;
+    }
+};
+
 } // namespace Common
--- a/src/common/multi_level_queue.h
+++ b/src/common/multi_level_queue.h
@@ -304,6 +304,13 @@ public:
        return levels[priority == Depth ? 63 : priority].back();
    }

+    void clear() { // Empties all Depth levels and resets the used-priorities value to zero
+        for (std::size_t level = 0; level != Depth; ++level) {
+            levels[level].clear();
+        }
+        used_priorities = 0;
+    }
+
 private:
    using const_list_iterator = typename std::list<T>::const_iterator;

--- a/src/core/core.cpp
+++ b/src/core/core.cpp
@@ -409,6 +409,12 @@ void System::PrepareReschedule() {
    CurrentCpuCore().PrepareReschedule();
 }

+void System::PrepareReschedule(const u32 core_index) { // per-core variant of PrepareReschedule()
+    if (core_index < GlobalScheduler().CpuCoresCount()) { // out-of-range indices are ignored
+        CpuCore(core_index).PrepareReschedule();
+    }
+}
+
 PerfStatsResults System::GetAndResetPerfStats() {
    return impl->GetAndResetPerfStats();
 }
@@ -449,6 +455,16 @@ const Kernel::Scheduler& System::Scheduler(std::size_t core_index) const {
    return CpuCore(core_index).Scheduler();
 }

+/// Gets the global scheduler
+Kernel::GlobalScheduler& System::GlobalScheduler() {
+    return impl->kernel.GlobalScheduler();
+}
+
+/// Gets the global scheduler
+const Kernel::GlobalScheduler& System::GlobalScheduler() const {
+    return impl->kernel.GlobalScheduler();
+}
+
 Kernel::Process* System::CurrentProcess() {
    return impl->kernel.CurrentProcess();
 }
--- a/src/core/core.h
+++ b/src/core/core.h
@@ -24,6 +24,7 @@ class VfsFilesystem;
 } // namespace FileSys

 namespace Kernel {
+class GlobalScheduler;
 class KernelCore;
 class Process;
 class Scheduler;
@@ -184,6 +185,9 @@ public:
    /// Prepare the core emulation for a reschedule
    void PrepareReschedule();

+    /// Prepare the CPU core with the given index for a reschedule; out-of-range indices are ignored
+    void PrepareReschedule(u32 core_index);
+
    /// Gets and resets core performance statistics
    PerfStatsResults GetAndResetPerfStats();

@@ -238,6 +242,12 @@ public:
    /// Gets the scheduler for the CPU core with the specified index
    const Kernel::Scheduler& Scheduler(std::size_t core_index) const;

+    /// Gets the global scheduler
+    Kernel::GlobalScheduler& GlobalScheduler();
+
+    /// Gets the global scheduler
+    const Kernel::GlobalScheduler& GlobalScheduler() const;
+
    /// Provides a pointer to the current process
    Kernel::Process* CurrentProcess();

--- a/src/core/core_cpu.cpp
+++ b/src/core/core_cpu.cpp
@@ -52,7 +52,8 @@ bool CpuBarrier::Rendezvous() {

 Cpu::Cpu(System& system, ExclusiveMonitor& exclusive_monitor, CpuBarrier& cpu_barrier,
         std::size_t core_index)
-    : cpu_barrier{cpu_barrier}, core_timing{system.CoreTiming()}, core_index{core_index} {
+    : cpu_barrier{cpu_barrier}, global_scheduler{system.GlobalScheduler()},
+      core_timing{system.CoreTiming()}, core_index{core_index} {
 #ifdef ARCHITECTURE_x86_64
    arm_interface = std::make_unique<ARM_Dynarmic>(system, exclusive_monitor, core_index);
 #else
@@ -60,7 +61,7 @@ Cpu::Cpu(System& system, ExclusiveMonitor& exclusive_monitor, CpuBarrier& cpu_ba
    LOG_WARNING(Core, "CPU JIT requested, but Dynarmic not available");
 #endif

-    scheduler = std::make_unique<Kernel::Scheduler>(system, *arm_interface);
+    scheduler = std::make_unique<Kernel::Scheduler>(system, *arm_interface, core_index);
 }

 Cpu::~Cpu() = default;
@@ -81,21 +82,21 @@ void Cpu::RunLoop(bool tight_loop) {
        return;
    }

+    Reschedule();
+
    // If we don't have a currently active thread then don't execute instructions,
    // instead advance to the next event and try to yield to the next thread
    if (Kernel::GetCurrentThread() == nullptr) {
        LOG_TRACE(Core, "Core-{} idling", core_index);
        core_timing.Idle();
-        core_timing.Advance();
-        PrepareReschedule();
    } else {
        if (tight_loop) {
            arm_interface->Run();
        } else {
            arm_interface->Step();
        }
-        core_timing.Advance();
    }
+    core_timing.Advance();

    Reschedule();
 }
@@ -106,18 +107,18 @@ void Cpu::SingleStep() {

 void Cpu::PrepareReschedule() {
    arm_interface->PrepareReschedule();
-    reschedule_pending = true;
 }

 void Cpu::Reschedule() {
-    if (!reschedule_pending) {
-        return;
-    }
-
-    reschedule_pending = false;
    // Lock the global kernel mutex when we manipulate the HLE state
-    std::lock_guard lock{HLE::g_hle_lock};
-    scheduler->Reschedule();
+    std::lock_guard lock(HLE::g_hle_lock);
+    // Have the global scheduler select for this core, then attempt the context switch.
+    global_scheduler.SelectThread(core_index);
+    scheduler->TryDoContextSwitch();
+}
+
+void Cpu::Shutdown() {
+    scheduler->Shutdown();
 }

 } // namespace Core
--- a/src/core/core_cpu.h
+++ b/src/core/core_cpu.h
@@ -12,8 +12,9 @@
 #include "common/common_types.h"

 namespace Kernel {
+class GlobalScheduler;
 class Scheduler;
-}
+} // namespace Kernel

 namespace Core {
 class System;
@@ -83,6 +84,8 @@ public:
        return core_index;
    }

+    void Shutdown();
+
    static std::unique_ptr<ExclusiveMonitor> MakeExclusiveMonitor(std::size_t num_cores);

 private:
@@ -90,6 +93,7 @@ private:

    std::unique_ptr<ARM_Interface> arm_interface;
    CpuBarrier& cpu_barrier;
+    Kernel::GlobalScheduler& global_scheduler;
    std::unique_ptr<Kernel::Scheduler> scheduler;
    Timing::CoreTiming& core_timing;

--- a/src/core/cpu_core_manager.cpp
+++ b/src/core/cpu_core_manager.cpp
@@ -58,6 +58,7 @@ void CpuCoreManager::Shutdown() {

    thread_to_cpu.clear();
    for (auto& cpu_core : cores) {
+        cpu_core->Shutdown();
        cpu_core.reset();
    }

--- a/src/core/file_sys/savedata_factory.cpp
+++ b/src/core/file_sys/savedata_factory.cpp
@@ -16,6 +16,7 @@ namespace FileSys {
 constexpr char SAVE_DATA_SIZE_FILENAME[] = ".yuzu_save_size";

 namespace {
+
 void PrintSaveDataDescriptorWarnings(SaveDataDescriptor meta) {
    if (meta.type == SaveDataType::SystemSaveData || meta.type == SaveDataType::SaveData) {
        if (meta.zero_1 != 0) {
@@ -52,6 +53,13 @@ void PrintSaveDataDescriptorWarnings(SaveDataDescriptor meta) {
                    meta.user_id[1], meta.user_id[0]);
    }
 }
+
+bool ShouldSaveDataBeAutomaticallyCreated(SaveDataSpaceId space, const SaveDataDescriptor& desc) {
+    return desc.type == SaveDataType::CacheStorage || desc.type == SaveDataType::TemporaryStorage ||
+           (space == SaveDataSpaceId::NandUser && // Normal save data -- current title & user
+            desc.type == SaveDataType::SaveData && desc.title_id == 0 && desc.save_id == 0);
+}
+
 } // Anonymous namespace

 std::string SaveDataDescriptor::DebugInfo() const {
@@ -96,6 +104,10 @@ ResultVal<VirtualDir> SaveDataFactory::Open(SaveDataSpaceId space,

    auto out = dir->GetDirectoryRelative(save_directory);

+    if (out == nullptr && ShouldSaveDataBeAutomaticallyCreated(space, meta)) {
+        return Create(space, meta);
+    }
+
    // Return an error if the save data doesn't actually exist.
    if (out == nullptr) {
        // TODO(Subv): Find out correct error code.
--- a/src/core/gdbstub/gdbstub.cpp
+++ b/src/core/gdbstub/gdbstub.cpp
@@ -202,13 +202,11 @@ void RegisterModule(std::string name, VAddr beg, VAddr end, bool add_elf_ext) {
 }

 static Kernel::Thread* FindThreadById(s64 id) {
-    for (u32 core = 0; core < Core::NUM_CPU_CORES; core++) {
-        const auto& threads = Core::System::GetInstance().Scheduler(core).GetThreadList();
-        for (auto& thread : threads) {
-            if (thread->GetThreadID() == static_cast<u64>(id)) {
-                current_core = core;
-                return thread.get();
-            }
+    const auto& threads = Core::System::GetInstance().GlobalScheduler().GetThreadList();
+    for (auto& thread : threads) {
+        if (thread->GetThreadID() == static_cast<u64>(id)) {
+            current_core = thread->GetProcessorID();
+            return thread.get();
        }
    }
    return nullptr;
@@ -647,11 +645,9 @@ static void HandleQuery() {
        SendReply(buffer.c_str());
    } else if (strncmp(query, "fThreadInfo", strlen("fThreadInfo")) == 0) {
        std::string val = "m";
-        for (u32 core = 0; core < Core::NUM_CPU_CORES; core++) {
-            const auto& threads = Core::System::GetInstance().Scheduler(core).GetThreadList();
-            for (const auto& thread : threads) {
-                val += fmt::format("{:x},", thread->GetThreadID());
-            }
+        const auto& threads = Core::System::GetInstance().GlobalScheduler().GetThreadList();
+        for (const auto& thread : threads) {
+            val += fmt::format("{:x},", thread->GetThreadID());
        }
        val.pop_back();
        SendReply(val.c_str());
@@ -661,13 +657,11 @@ static void HandleQuery() {
        std::string buffer;
        buffer += "l<?xml version=\"1.0\"?>";
        buffer += "<threads>";
-        for (u32 core = 0; core < Core::NUM_CPU_CORES; core++) {
-            const auto& threads = Core::System::GetInstance().Scheduler(core).GetThreadList();
-            for (const auto& thread : threads) {
-                buffer +=
-                    fmt::format(R"*(<thread id="{:x}" core="{:d}" name="Thread {:x}"></thread>)*",
-                                thread->GetThreadID(), core, thread->GetThreadID());
-            }
+        const auto& threads = Core::System::GetInstance().GlobalScheduler().GetThreadList();
+        for (const auto& thread : threads) {
+            buffer +=
+                fmt::format(R"*(<thread id="{:x}" core="{:d}" name="Thread {:x}"></thread>)*",
+                            thread->GetThreadID(), thread->GetProcessorID(), thread->GetThreadID());
        }
        buffer += "</threads>";
        SendReply(buffer.c_str());
--- a/src/core/hle/kernel/address_arbiter.cpp
+++ b/src/core/hle/kernel/address_arbiter.cpp
@@ -22,6 +22,7 @@ namespace Kernel {
 namespace {
 // Wake up num_to_wake (or all) threads in a vector.
 void WakeThreads(const std::vector<SharedPtr<Thread>>& waiting_threads, s32 num_to_wake) {
+    auto& system = Core::System::GetInstance();
    // Only process up to 'target' threads, unless 'target' is <= 0, in which case process
    // them all.
    std::size_t last = waiting_threads.size();
@@ -35,6 +36,7 @@ void WakeThreads(const std::vector<SharedPtr<Thread>>& waiting_threads, s32 num_
        waiting_threads[i]->SetWaitSynchronizationResult(RESULT_SUCCESS);
        waiting_threads[i]->SetArbiterWaitAddress(0);
        waiting_threads[i]->ResumeFromWait();
+        system.PrepareReschedule(waiting_threads[i]->GetProcessorID());
    }
 }
 } // Anonymous namespace
@@ -89,12 +91,20 @@ ResultCode AddressArbiter::ModifyByWaitingCountAndSignalToAddressIfEqual(VAddr a

    // Determine the modified value depending on the waiting count.
    s32 updated_value;
-    if (waiting_threads.empty()) {
-        updated_value = value + 1;
-    } else if (num_to_wake <= 0 || waiting_threads.size() <= static_cast<u32>(num_to_wake)) {
-        updated_value = value - 1;
+    if (num_to_wake <= 0) {
+        if (waiting_threads.empty()) {
+            updated_value = value + 1;
+        } else {
+            updated_value = value - 1;
+        }
    } else {
-        updated_value = value;
+        if (waiting_threads.empty()) {
+            updated_value = value + 1;
+        } else if (waiting_threads.size() <= static_cast<u32>(num_to_wake)) {
+            updated_value = value - 1;
+        } else {
+            updated_value = value;
+        }
    }

    if (static_cast<s32>(Memory::Read32(address)) != value) {
@@ -169,30 +179,22 @@ ResultCode AddressArbiter::WaitForAddressImpl(VAddr address, s64 timeout) {

    current_thread->WakeAfterDelay(timeout);

-    system.CpuCore(current_thread->GetProcessorID()).PrepareReschedule();
+    system.PrepareReschedule(current_thread->GetProcessorID());
    return RESULT_TIMEOUT;
 }

 std::vector<SharedPtr<Thread>> AddressArbiter::GetThreadsWaitingOnAddress(VAddr address) const {
-    const auto RetrieveWaitingThreads = [this](std::size_t core_index,
-                                               std::vector<SharedPtr<Thread>>& waiting_threads,
-                                               VAddr arb_addr) {
-        const auto& scheduler = system.Scheduler(core_index);
-        const auto& thread_list = scheduler.GetThreadList();
-
-        for (const auto& thread : thread_list) {
-            if (thread->GetArbiterWaitAddress() == arb_addr) {
-                waiting_threads.push_back(thread);
-            }
-        }
-    };

    // Retrieve all threads that are waiting for this address.
    std::vector<SharedPtr<Thread>> threads;
-    RetrieveWaitingThreads(0, threads, address);
-    RetrieveWaitingThreads(1, threads, address);
-    RetrieveWaitingThreads(2, threads, address);
-    RetrieveWaitingThreads(3, threads, address);
+    const auto& scheduler = system.GlobalScheduler();
+    const auto& thread_list = scheduler.GetThreadList();
+
+    for (const auto& thread : thread_list) {
+        if (thread->GetArbiterWaitAddress() == address) {
+            threads.push_back(thread);
+        }
+    }

    // Sort them by priority, such that the highest priority ones come first.
    std::sort(threads.begin(), threads.end(),
--- a/src/core/hle/kernel/hle_ipc.cpp
+++ b/src/core/hle/kernel/hle_ipc.cpp
@@ -58,8 +58,7 @@ SharedPtr<WritableEvent> HLERequestContext::SleepClientThread(
    auto& kernel = Core::System::GetInstance().Kernel();
    if (!writable_event) {
        // Create event if not provided
-        const auto pair = WritableEvent::CreateEventPair(kernel, ResetType::Automatic,
-                                                         "HLE Pause Event: " + reason);
+        const auto pair = WritableEvent::CreateEventPair(kernel, "HLE Pause Event: " + reason);
        writable_event = pair.writable;
    }

--- a/src/core/hle/kernel/kernel.cpp
+++ b/src/core/hle/kernel/kernel.cpp
@@ -12,12 +12,15 @@

 #include "core/core.h"
 #include "core/core_timing.h"
+#include "core/core_timing_util.h"
 #include "core/hle/kernel/address_arbiter.h"
 #include "core/hle/kernel/client_port.h"
+#include "core/hle/kernel/errors.h"
 #include "core/hle/kernel/handle_table.h"
 #include "core/hle/kernel/kernel.h"
 #include "core/hle/kernel/process.h"
 #include "core/hle/kernel/resource_limit.h"
+#include "core/hle/kernel/scheduler.h"
 #include "core/hle/kernel/thread.h"
 #include "core/hle/lock.h"
 #include "core/hle/result.h"
@@ -58,12 +61,8 @@ static void ThreadWakeupCallback(u64 thread_handle, [[maybe_unused]] s64 cycles_
        if (thread->HasWakeupCallback()) {
            resume = thread->InvokeWakeupCallback(ThreadWakeupReason::Timeout, thread, nullptr, 0);
        }
-    }
-
-    if (thread->GetMutexWaitAddress() != 0 || thread->GetCondVarWaitAddress() != 0 ||
-        thread->GetWaitHandle() != 0) {
-        ASSERT(thread->GetStatus() == ThreadStatus::WaitMutex ||
-               thread->GetStatus() == ThreadStatus::WaitCondVar);
+    } else if (thread->GetStatus() == ThreadStatus::WaitMutex ||
+               thread->GetStatus() == ThreadStatus::WaitCondVar) {
        thread->SetMutexWaitAddress(0);
        thread->SetCondVarWaitAddress(0);
        thread->SetWaitHandle(0);
@@ -83,18 +82,23 @@ static void ThreadWakeupCallback(u64 thread_handle, [[maybe_unused]] s64 cycles_
    }

    if (resume) {
+        if (thread->GetStatus() == ThreadStatus::WaitCondVar ||
+            thread->GetStatus() == ThreadStatus::WaitArb) {
+            thread->SetWaitSynchronizationResult(RESULT_TIMEOUT);
+        }
        thread->ResumeFromWait();
    }
 }

 struct KernelCore::Impl {
-    explicit Impl(Core::System& system) : system{system} {}
+    explicit Impl(Core::System& system) : system{system}, global_scheduler{system} {}

    void Initialize(KernelCore& kernel) {
        Shutdown();

        InitializeSystemResourceLimit(kernel);
        InitializeThreads();
+        InitializePreemption();
    }

    void Shutdown() {
@@ -110,6 +114,9 @@ struct KernelCore::Impl {

        thread_wakeup_callback_handle_table.Clear();
        thread_wakeup_event_type = nullptr;
+        preemption_event = nullptr;
+
+        global_scheduler.Shutdown();

        named_ports.clear();
    }
@@ -132,6 +139,18 @@ struct KernelCore::Impl {
            system.CoreTiming().RegisterEvent("ThreadWakeupCallback", ThreadWakeupCallback);
    }

+    void InitializePreemption() {
+        preemption_event = system.CoreTiming().RegisterEvent(
+            "PreemptionCallback", [this](u64 userdata, s64 cycles_late) {
+                global_scheduler.PreemptThreads();
+                s64 time_interval = Core::Timing::msToCycles(std::chrono::milliseconds(10));
+                system.CoreTiming().ScheduleEvent(time_interval, preemption_event); // re-arm
+            });
+        // Queue the first run; the callback above then re-schedules itself 10 ms (in cycles) ahead.
+        s64 time_interval = Core::Timing::msToCycles(std::chrono::milliseconds(10));
+        system.CoreTiming().ScheduleEvent(time_interval, preemption_event);
+    }
+
    std::atomic<u32> next_object_id{0};
    std::atomic<u64> next_kernel_process_id{Process::InitialKIPIDMin};
    std::atomic<u64> next_user_process_id{Process::ProcessIDMin};
@@ -140,10 +159,12 @@ struct KernelCore::Impl {
    // Lists all processes that exist in the current session.
    std::vector<SharedPtr<Process>> process_list;
    Process* current_process = nullptr;
+    Kernel::GlobalScheduler global_scheduler;

    SharedPtr<ResourceLimit> system_resource_limit;

    Core::Timing::EventType* thread_wakeup_event_type = nullptr;
+    Core::Timing::EventType* preemption_event = nullptr;
    // TODO(yuriks): This can be removed if Thread objects are explicitly pooled in the future,
    // allowing us to simply use a pool index or similar.
    Kernel::HandleTable thread_wakeup_callback_handle_table;
@@ -203,6 +224,14 @@ const std::vector<SharedPtr<Process>>& KernelCore::GetProcessList() const {
    return impl->process_list;
 }

+Kernel::GlobalScheduler& KernelCore::GlobalScheduler() {
+    return impl->global_scheduler;
+}
+
+const Kernel::GlobalScheduler& KernelCore::GlobalScheduler() const {
+    return impl->global_scheduler;
+}
+
 void KernelCore::AddNamedPort(std::string name, SharedPtr<ClientPort> port) {
    impl->named_ports.emplace(std::move(name), std::move(port));
 }
--- a/src/core/hle/kernel/kernel.h
+++ b/src/core/hle/kernel/kernel.h
@@ -21,6 +21,7 @@ namespace Kernel {

 class AddressArbiter;
 class ClientPort;
+class GlobalScheduler;
 class HandleTable;
 class Process;
 class ResourceLimit;
@@ -75,6 +76,12 @@ public:
    /// Retrieves the list of processes.
    const std::vector<SharedPtr<Process>>& GetProcessList() const;

+    /// Gets the sole instance of the global scheduler
+    Kernel::GlobalScheduler& GlobalScheduler();
+
+    /// Gets the sole instance of the global scheduler
+    const Kernel::GlobalScheduler& GlobalScheduler() const;
+
    /// Adds a port to the named port table
    void AddNamedPort(std::string name, SharedPtr<ClientPort> port);

--- a/src/core/hle/kernel/mutex.cpp
+++ b/src/core/hle/kernel/mutex.cpp
@@ -7,6 +7,7 @@

 #include "common/assert.h"
 #include "core/core.h"
+#include "core/core_cpu.h"
 #include "core/hle/kernel/errors.h"
 #include "core/hle/kernel/handle_table.h"
 #include "core/hle/kernel/kernel.h"
@@ -78,7 +79,7 @@ ResultCode Mutex::TryAcquire(VAddr address, Handle holding_thread_handle,
    // thread.
    ASSERT(requesting_thread == current_thread);

-    const u32 addr_value = Memory::Read32(address);
+    u32 addr_value = Memory::Read32(address);

    // If the mutex isn't being held, just return success.
    if (addr_value != (holding_thread_handle | Mutex::MutexHasWaitersFlag)) {
@@ -89,6 +90,20 @@ ResultCode Mutex::TryAcquire(VAddr address, Handle holding_thread_handle,
        return ERR_INVALID_HANDLE;
    }

+    // This is a workaround for an unknown bug that writes the mutex value so that ownership is
+    // given to a thread waiting on a condition variable.
+    if (holding_thread->GetStatus() == ThreadStatus::WaitCondVar) {
+        if (holding_thread->GetMutexWaitAddress() == address) {
+            Release(address, holding_thread.get());
+            addr_value = Memory::Read32(address);
+            if (addr_value == 0)
+                return RESULT_SUCCESS;
+            else {
+                holding_thread = handle_table.Get<Thread>(addr_value & Mutex::MutexOwnerMask);
+            }
+        }
+    }
+
    // Wait until the mutex is released
    current_thread->SetMutexWaitAddress(address);
    current_thread->SetWaitHandle(requesting_thread_handle);
@@ -104,14 +119,13 @@ ResultCode Mutex::TryAcquire(VAddr address, Handle holding_thread_handle,
    return RESULT_SUCCESS;
 }

-ResultCode Mutex::Release(VAddr address) {
+ResultCode Mutex::Release(VAddr address, Thread* holding_thread) {
    // The mutex address must be 4-byte aligned
    if ((address % sizeof(u32)) != 0) {
        return ERR_INVALID_ADDRESS;
    }

-    auto* const current_thread = system.CurrentScheduler().GetCurrentThread();
-    auto [thread, num_waiters] = GetHighestPriorityMutexWaitingThread(current_thread, address);
+    auto [thread, num_waiters] = GetHighestPriorityMutexWaitingThread(holding_thread, address);

    // There are no more threads waiting for the mutex, release it completely.
    if (thread == nullptr) {
@@ -120,7 +134,7 @@ ResultCode Mutex::Release(VAddr address) {
    }

    // Transfer the ownership of the mutex from the previous owner to the new one.
-    TransferMutexOwnership(address, current_thread, thread);
+    TransferMutexOwnership(address, holding_thread, thread);

    u32 mutex_value = thread->GetWaitHandle();

@@ -139,6 +153,12 @@ ResultCode Mutex::Release(VAddr address) {
    thread->SetCondVarWaitAddress(0);
    thread->SetMutexWaitAddress(0);
    thread->SetWaitHandle(0);
+    thread->SetWaitSynchronizationResult(RESULT_SUCCESS);
+
+    if (thread->GetProcessorID() >= 0)
+        system.CpuCore(thread->GetProcessorID()).PrepareReschedule();
+    if (holding_thread->GetProcessorID() >= 0)
+        system.CpuCore(holding_thread->GetProcessorID()).PrepareReschedule();

    return RESULT_SUCCESS;
 }
--- a/src/core/hle/kernel/mutex.h
+++ b/src/core/hle/kernel/mutex.h
@@ -29,7 +29,7 @@ public:
                          Handle requesting_thread_handle);

    /// Releases the mutex at the specified address.
-    ResultCode Release(VAddr address);
+    ResultCode Release(VAddr address, Thread* holding_thread);

 private:
    Core::System& system;
--- a/src/core/hle/kernel/object.h
+++ b/src/core/hle/kernel/object.h
@@ -32,11 +32,6 @@ enum class HandleType : u32 {
    ServerSession,
 };

-enum class ResetType {
-    Automatic, ///< Reset automatically on object acquisition
-    Manual,    ///< Never reset automatically
-};
-
 class Object : NonCopyable {
 public:
    explicit Object(KernelCore& kernel);
--- a/src/core/hle/kernel/process.cpp
+++ b/src/core/hle/kernel/process.cpp
@@ -213,10 +213,7 @@ void Process::PrepareForTermination() {
        }
    };

-    stop_threads(system.Scheduler(0).GetThreadList());
-    stop_threads(system.Scheduler(1).GetThreadList());
-    stop_threads(system.Scheduler(2).GetThreadList());
-    stop_threads(system.Scheduler(3).GetThreadList());
+    stop_threads(system.GlobalScheduler().GetThreadList());

    FreeTLSRegion(tls_region_address);
    tls_region_address = 0;
--- a/src/core/hle/kernel/readable_event.cpp
+++ b/src/core/hle/kernel/readable_event.cpp
@@ -20,15 +20,13 @@ bool ReadableEvent::ShouldWait(const Thread* thread) const {

 void ReadableEvent::Acquire(Thread* thread) {
    ASSERT_MSG(!ShouldWait(thread), "object unavailable!");
-
-    if (reset_type == ResetType::Automatic) {
-        signaled = false;
-    }
 }

 void ReadableEvent::Signal() {
-    signaled = true;
-    WakeupAllWaitingThreads();
+    if (!signaled) {
+        signaled = true;
+        WakeupAllWaitingThreads();
+    }
 }

 void ReadableEvent::Clear() {
--- a/src/core/hle/kernel/readable_event.h
+++ b/src/core/hle/kernel/readable_event.h
@@ -27,10 +27,6 @@ public:
        return name;
    }

-    ResetType GetResetType() const {
-        return reset_type;
-    }
-
    static constexpr HandleType HANDLE_TYPE = HandleType::ReadableEvent;
    HandleType GetHandleType() const override {
        return HANDLE_TYPE;
@@ -55,8 +51,7 @@ private:

    void Signal();

-    ResetType reset_type;
-    bool signaled;
+    bool signaled{};

    std::string name; ///< Name of event (optional)
 };
--- a/src/core/hle/kernel/scheduler.cpp
+++ b/src/core/hle/kernel/scheduler.cpp
@@ -1,8 +1,13 @@
 // Copyright 2018 yuzu emulator team
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
+//
+// SelectThreads, Yield functions originally by TuxSH.
+// licensed under GPLv2 or later under exception provided by the author.

 #include <algorithm>
+#include <set>
+#include <unordered_set>
 #include <utility>

 #include "common/assert.h"
@@ -17,56 +22,374 @@

 namespace Kernel {

-std::mutex Scheduler::scheduler_mutex;
+GlobalScheduler::GlobalScheduler(Core::System& system) : system{system} {}

-Scheduler::Scheduler(Core::System& system, Core::ARM_Interface& cpu_core)
-    : cpu_core{cpu_core}, system{system} {}
+GlobalScheduler::~GlobalScheduler() = default;

-Scheduler::~Scheduler() {
-    for (auto& thread : thread_list) {
-        thread->Stop();
+void GlobalScheduler::AddThread(SharedPtr<Thread> thread) {
+    thread_list.push_back(std::move(thread));
+}
+
+void GlobalScheduler::RemoveThread(const Thread* thread) {
+    thread_list.erase(std::remove(thread_list.begin(), thread_list.end(), thread),
+                      thread_list.end());
+}
+
+void GlobalScheduler::UnloadThread(s32 core) {
+    Scheduler& sched = system.Scheduler(core);
+    sched.UnloadThread();
+}
+
+void GlobalScheduler::SelectThread(u32 core) {
+    const auto update_thread = [](Thread* thread, Scheduler& sched) {
+        if (thread != sched.selected_thread) {
+            if (thread == nullptr) {
+                ++sched.idle_selection_count;
+            }
+            sched.selected_thread = thread;
+        }
+        sched.is_context_switch_pending = sched.selected_thread != sched.current_thread;
+        std::atomic_thread_fence(std::memory_order_seq_cst);
+    };
+    Scheduler& sched = system.Scheduler(core);
+    Thread* current_thread = nullptr;
+    // Step 1: Get top thread in schedule queue.
+    current_thread = scheduled_queue[core].empty() ? nullptr : scheduled_queue[core].front();
+    if (current_thread) {
+        update_thread(current_thread, sched);
+        return;
+    }
+    // Step 2: Try selecting a suggested thread.
+    Thread* winner = nullptr;
+    std::set<s32> sug_cores;
+    for (auto thread : suggested_queue[core]) {
+        s32 this_core = thread->GetProcessorID();
+        Thread* thread_on_core = nullptr;
+        if (this_core >= 0 && !scheduled_queue[this_core].empty()) {
+            thread_on_core = scheduled_queue[this_core].front();
+        }
+        if (this_core < 0 || thread != thread_on_core) {
+            winner = thread;
+            break;
+        }
+        sug_cores.insert(this_core);
+    }
+    // If we got a suggested thread, select it, else do a second pass.
+    if (winner && winner->GetPriority() > 2) {
+        if (winner->IsRunning()) {
+            UnloadThread(winner->GetProcessorID());
+        }
+        TransferToCore(winner->GetPriority(), core, winner);
+        update_thread(winner, sched);
+        return;
+    }
+    // Step 3: Select a suggested thread from another core
+    for (auto& src_core : sug_cores) {
+        auto it = scheduled_queue[src_core].begin();
+        it++;
+        if (it != scheduled_queue[src_core].end()) {
+            Thread* thread_on_core = scheduled_queue[src_core].front();
+            Thread* to_change = *it;
+            if (thread_on_core->IsRunning() || to_change->IsRunning()) {
+                UnloadThread(src_core);
+            }
+            TransferToCore(thread_on_core->GetPriority(), core, thread_on_core);
+            current_thread = thread_on_core;
+            break;
+        }
+    }
+    update_thread(current_thread, sched);
+}
+
+bool GlobalScheduler::YieldThread(Thread* yielding_thread) {
+    // Note: caller should use critical section, etc.
+    const u32 core_id = static_cast<u32>(yielding_thread->GetProcessorID());
+    const u32 priority = yielding_thread->GetPriority();
+
+    // Yield the thread; the winner must be read after the yield, otherwise it is always the yielder
+    ASSERT_MSG(yielding_thread == scheduled_queue[core_id].front(priority),
+               "Thread yielding without being in front");
+    scheduled_queue[core_id].yield(priority);
+    const Thread* const winner = scheduled_queue[core_id].front(priority);
+    return AskForReselectionOrMarkRedundant(yielding_thread, winner);
+}
+
+bool GlobalScheduler::YieldThreadAndBalanceLoad(Thread* yielding_thread) {
+    // Note: caller should check if !thread.IsSchedulerOperationRedundant and use critical section,
+    // etc.
+    const u32 core_id = static_cast<u32>(yielding_thread->GetProcessorID());
+    const u32 priority = yielding_thread->GetPriority();
+
+    // Yield the thread
+    ASSERT_MSG(yielding_thread == scheduled_queue[core_id].front(priority),
+               "Thread yielding without being in front");
+    scheduled_queue[core_id].yield(priority);
+
+    std::array<Thread*, NUM_CPU_CORES> current_threads;
+    for (u32 i = 0; i < NUM_CPU_CORES; i++) {
+        current_threads[i] = scheduled_queue[i].empty() ? nullptr : scheduled_queue[i].front();
+    }
+
+    Thread* next_thread = scheduled_queue[core_id].front(priority);
+    Thread* winner = nullptr;
+    for (auto& thread : suggested_queue[core_id]) {
+        const s32 source_core = thread->GetProcessorID();
+        if (source_core >= 0) {
+            if (current_threads[source_core] != nullptr) {
+                if (thread == current_threads[source_core] ||
+                    current_threads[source_core]->GetPriority() < min_regular_priority) {
+                    continue;
+                }
+            }
+        }
+        if (next_thread->GetLastRunningTicks() >= thread->GetLastRunningTicks() ||
+            next_thread->GetPriority() < thread->GetPriority()) {
+            if (thread->GetPriority() <= priority) {
+                winner = thread;
+                break;
+            }
+        }
+    }
+
+    if (winner != nullptr) {
+        if (winner != yielding_thread) {
+            if (winner->IsRunning()) {
+                UnloadThread(winner->GetProcessorID());
+            }
+            TransferToCore(winner->GetPriority(), core_id, winner);
+        }
+    } else {
+        winner = next_thread;
+    }
+
+    return AskForReselectionOrMarkRedundant(yielding_thread, winner);
+}
+
+bool GlobalScheduler::YieldThreadAndWaitForLoadBalancing(Thread* yielding_thread) {
+    // Note: caller should check if !thread.IsSchedulerOperationRedundant and use critical section,
+    // etc.
+    Thread* winner = nullptr;
+    const u32 core_id = static_cast<u32>(yielding_thread->GetProcessorID());
+
+    // Remove the thread from its scheduled mlq, put it on the corresponding "suggested" one instead
+    TransferToCore(yielding_thread->GetPriority(), -1, yielding_thread);
+
+    // If the core is idle, perform load balancing, excluding the threads that have just used this
+    // function...
+    if (scheduled_queue[core_id].empty()) {
+        // Here, "current_threads" is calculated after the ""yield"", unlike yield -1
+        std::array<Thread*, NUM_CPU_CORES> current_threads;
+        for (u32 i = 0; i < NUM_CPU_CORES; i++) {
+            current_threads[i] = scheduled_queue[i].empty() ? nullptr : scheduled_queue[i].front();
+        }
+        for (auto& thread : suggested_queue[core_id]) {
+            const s32 source_core = thread->GetProcessorID();
+            if (source_core < 0 || thread == current_threads[source_core]) {
+                continue;
+            }
+            if (current_threads[source_core] == nullptr ||
+                current_threads[source_core]->GetPriority() >= min_regular_priority) {
+                winner = thread;
+            }
+            break;
+        }
+        if (winner != nullptr) {
+            if (winner != yielding_thread) {
+                if (winner->IsRunning()) {
+                    UnloadThread(winner->GetProcessorID());
+                }
+                TransferToCore(winner->GetPriority(), core_id, winner);
+            }
+        } else {
+            winner = yielding_thread;
+        }
+    }
+
+    return AskForReselectionOrMarkRedundant(yielding_thread, winner);
+}
+
+void GlobalScheduler::PreemptThreads() {
+    for (std::size_t core_id = 0; core_id < NUM_CPU_CORES; core_id++) {
+        const u32 priority = preemption_priorities[core_id];
+
+        if (scheduled_queue[core_id].size(priority) > 0) {
+            scheduled_queue[core_id].front(priority)->IncrementYieldCount();
+            scheduled_queue[core_id].yield(priority);
+            if (scheduled_queue[core_id].size(priority) > 1) {
+                scheduled_queue[core_id].front(priority)->IncrementYieldCount();
+            }
+        }
+
+        Thread* current_thread =
+            scheduled_queue[core_id].empty() ? nullptr : scheduled_queue[core_id].front();
+        Thread* winner = nullptr;
+        for (auto& thread : suggested_queue[core_id]) {
+            const s32 source_core = thread->GetProcessorID();
+            if (thread->GetPriority() != priority) {
+                continue;
+            }
+            if (source_core >= 0) {
+                Thread* next_thread = scheduled_queue[source_core].empty()
+                                          ? nullptr
+                                          : scheduled_queue[source_core].front();
+                if (next_thread != nullptr && next_thread->GetPriority() < 2) {
+                    break;
+                }
+                if (next_thread == thread) {
+                    continue;
+                }
+            }
+            if (current_thread != nullptr &&
+                current_thread->GetLastRunningTicks() >= thread->GetLastRunningTicks()) {
+                winner = thread;
+                break;
+            }
+        }
+
+        if (winner != nullptr) {
+            if (winner->IsRunning()) {
+                UnloadThread(winner->GetProcessorID());
+            }
+            TransferToCore(winner->GetPriority(), s32(core_id), winner);
+            current_thread =
+                winner->GetPriority() <= current_thread->GetPriority() ? winner : current_thread;
+        }
+
+        if (current_thread != nullptr && current_thread->GetPriority() > priority) {
+            for (auto& thread : suggested_queue[core_id]) {
+                const s32 source_core = thread->GetProcessorID();
+                if (thread->GetPriority() < priority) {
+                    continue;
+                }
+                if (source_core >= 0) {
+                    Thread* next_thread = scheduled_queue[source_core].empty()
+                                              ? nullptr
+                                              : scheduled_queue[source_core].front();
+                    if (next_thread != nullptr && next_thread->GetPriority() < 2) {
+                        break;
+                    }
+                    if (next_thread == thread) {
+                        continue;
+                    }
+                }
+                if (current_thread != nullptr &&
+                    current_thread->GetLastRunningTicks() >= thread->GetLastRunningTicks()) {
+                    winner = thread;
+                    break;
+                }
+            }
+
+            if (winner != nullptr) {
+                if (winner->IsRunning()) {
+                    UnloadThread(winner->GetProcessorID());
+                }
+                TransferToCore(winner->GetPriority(), s32(core_id), winner);
+                current_thread = winner;
+            }
+        }
+
+        is_reselection_pending.store(true, std::memory_order_release);
    }
 }

+void GlobalScheduler::Suggest(u32 priority, u32 core, Thread* thread) {
+    suggested_queue[core].add(thread, priority);
+}
+
+void GlobalScheduler::Unsuggest(u32 priority, u32 core, Thread* thread) {
+    suggested_queue[core].remove(thread, priority);
+}
+
+void GlobalScheduler::Schedule(u32 priority, u32 core, Thread* thread) {
+    ASSERT_MSG(thread->GetProcessorID() == s32(core), "Thread must be assigned to this core.");
+    scheduled_queue[core].add(thread, priority);
+}
+
+void GlobalScheduler::SchedulePrepend(u32 priority, u32 core, Thread* thread) {
+    ASSERT_MSG(thread->GetProcessorID() == s32(core), "Thread must be assigned to this core.");
+    scheduled_queue[core].add(thread, priority, false);
+}
+
+void GlobalScheduler::Reschedule(u32 priority, u32 core, Thread* thread) {
+    scheduled_queue[core].remove(thread, priority);
+    scheduled_queue[core].add(thread, priority);
+}
+
+void GlobalScheduler::Unschedule(u32 priority, u32 core, Thread* thread) {
+    scheduled_queue[core].remove(thread, priority);
+}
+
+void GlobalScheduler::TransferToCore(u32 priority, s32 destination_core, Thread* thread) {
+    const bool schedulable = thread->GetPriority() < THREADPRIO_COUNT;
+    const s32 source_core = thread->GetProcessorID();
+    if (source_core == destination_core || !schedulable) {
+        return;
+    }
+    thread->SetProcessorID(destination_core);
+    if (source_core >= 0) {
+        Unschedule(priority, source_core, thread);
+    }
+    if (destination_core >= 0) {
+        Unsuggest(priority, destination_core, thread);
+        Schedule(priority, destination_core, thread);
+    }
+    if (source_core >= 0) {
+        Suggest(priority, source_core, thread);
+    }
+}
+
+bool GlobalScheduler::AskForReselectionOrMarkRedundant(Thread* current_thread,
+                                                       const Thread* winner) {
+    if (current_thread == winner) {
+        current_thread->IncrementYieldCount();
+        return true;
+    }
+    // A different thread won: flag that a reselection is pending and report "not redundant".
+    is_reselection_pending.store(true, std::memory_order_release);
+    return false;
+}
+
+void GlobalScheduler::Shutdown() {
+    for (std::size_t core = 0; core < NUM_CPU_CORES; core++) {
+        scheduled_queue[core].clear();
+        suggested_queue[core].clear();
+    }
+    thread_list.clear();
+}
+
+Scheduler::Scheduler(Core::System& system, Core::ARM_Interface& cpu_core, u32 core_id)
+    : system(system), cpu_core(cpu_core), core_id(core_id) {}
+
+Scheduler::~Scheduler() = default;
+
 bool Scheduler::HaveReadyThreads() const {
-    std::lock_guard lock{scheduler_mutex};
-    return !ready_queue.empty();
+    return system.GlobalScheduler().HaveReadyThreads(core_id);
 }

 Thread* Scheduler::GetCurrentThread() const {
    return current_thread.get();
 }

+Thread* Scheduler::GetSelectedThread() const {
+    return selected_thread.get();
+}
+
+void Scheduler::SelectThreads() {
+    system.GlobalScheduler().SelectThread(core_id);
+}
+
 u64 Scheduler::GetLastContextSwitchTicks() const {
    return last_context_switch_time;
 }

-Thread* Scheduler::PopNextReadyThread() {
-    Thread* next = nullptr;
-    Thread* thread = GetCurrentThread();
-
-    if (thread && thread->GetStatus() == ThreadStatus::Running) {
-        if (ready_queue.empty()) {
-            return thread;
-        }
-        // We have to do better than the current thread.
-        // This call returns null when that's not possible.
-        next = ready_queue.front();
-        if (next == nullptr || next->GetPriority() >= thread->GetPriority()) {
-            next = thread;
-        }
-    } else {
-        if (ready_queue.empty()) {
-            return nullptr;
-        }
-        next = ready_queue.front();
+void Scheduler::TryDoContextSwitch() {
+    if (is_context_switch_pending) {
+        SwitchContext();
    }
-
-    return next;
 }

-void Scheduler::SwitchContext(Thread* new_thread) {
-    Thread* previous_thread = GetCurrentThread();
+void Scheduler::UnloadThread() {
+    Thread* const previous_thread = GetCurrentThread();
    Process* const previous_process = system.Kernel().CurrentProcess();

    UpdateLastContextSwitchTime(previous_thread, previous_process);
@@ -80,23 +403,52 @@ void Scheduler::SwitchContext(Thread* new_thread) {
        if (previous_thread->GetStatus() == ThreadStatus::Running) {
            // This is only the case when a reschedule is triggered without the current thread
            // yielding execution (i.e. an event triggered, system core time-sliced, etc)
-            ready_queue.add(previous_thread, previous_thread->GetPriority(), false);
            previous_thread->SetStatus(ThreadStatus::Ready);
        }
+        previous_thread->SetIsRunning(false);
+    }
+    current_thread = nullptr;
+}
+
+void Scheduler::SwitchContext() {
+    Thread* const previous_thread = GetCurrentThread();
+    Thread* const new_thread = GetSelectedThread();
+
+    is_context_switch_pending = false;
+    if (new_thread == previous_thread) {
+        return;
+    }
+
+    Process* const previous_process = system.Kernel().CurrentProcess();
+
+    UpdateLastContextSwitchTime(previous_thread, previous_process);
+
+    // Save context for previous thread
+    if (previous_thread) {
+        cpu_core.SaveContext(previous_thread->GetContext());
+        // Save the TPIDR_EL0 system register in case it was modified.
+        previous_thread->SetTPIDR_EL0(cpu_core.GetTPIDR_EL0());
+
+        if (previous_thread->GetStatus() == ThreadStatus::Running) {
+            // This is only the case when a reschedule is triggered without the current thread
+            // yielding execution (i.e. an event triggered, system core time-sliced, etc)
+            previous_thread->SetStatus(ThreadStatus::Ready);
+        }
+        previous_thread->SetIsRunning(false);
    }

    // Load context of new thread
    if (new_thread) {
+        ASSERT_MSG(new_thread->GetProcessorID() == s32(this->core_id),
+                   "Thread must be assigned to this core.");
        ASSERT_MSG(new_thread->GetStatus() == ThreadStatus::Ready,
                   "Thread must be ready to become running.");

        // Cancel any outstanding wakeup events for this thread
        new_thread->CancelWakeupTimer();
-
        current_thread = new_thread;
-
-        ready_queue.remove(new_thread, new_thread->GetPriority());
        new_thread->SetStatus(ThreadStatus::Running);
+        new_thread->SetIsRunning(true);

        auto* const thread_owner_process = current_thread->GetOwnerProcess();
        if (previous_process != thread_owner_process) {
@@ -130,124 +482,9 @@ void Scheduler::UpdateLastContextSwitchTime(Thread* thread, Process* process) {
    last_context_switch_time = most_recent_switch_ticks;
 }

-void Scheduler::Reschedule() {
-    std::lock_guard lock{scheduler_mutex};
-
-    Thread* cur = GetCurrentThread();
-    Thread* next = PopNextReadyThread();
-
-    if (cur && next) {
-        LOG_TRACE(Kernel, "context switch {} -> {}", cur->GetObjectId(), next->GetObjectId());
-    } else if (cur) {
-        LOG_TRACE(Kernel, "context switch {} -> idle", cur->GetObjectId());
-    } else if (next) {
-        LOG_TRACE(Kernel, "context switch idle -> {}", next->GetObjectId());
-    }
-
-    SwitchContext(next);
-}
-
-void Scheduler::AddThread(SharedPtr<Thread> thread) {
-    std::lock_guard lock{scheduler_mutex};
-
-    thread_list.push_back(std::move(thread));
-}
-
-void Scheduler::RemoveThread(Thread* thread) {
-    std::lock_guard lock{scheduler_mutex};
-
-    thread_list.erase(std::remove(thread_list.begin(), thread_list.end(), thread),
-                      thread_list.end());
-}
-
-void Scheduler::ScheduleThread(Thread* thread, u32 priority) {
-    std::lock_guard lock{scheduler_mutex};
-
-    ASSERT(thread->GetStatus() == ThreadStatus::Ready);
-    ready_queue.add(thread, priority);
-}
-
-void Scheduler::UnscheduleThread(Thread* thread, u32 priority) {
-    std::lock_guard lock{scheduler_mutex};
-
-    ASSERT(thread->GetStatus() == ThreadStatus::Ready);
-    ready_queue.remove(thread, priority);
-}
-
-void Scheduler::SetThreadPriority(Thread* thread, u32 priority) {
-    std::lock_guard lock{scheduler_mutex};
-    if (thread->GetPriority() == priority) {
-        return;
-    }
-
-    // If thread was ready, adjust queues
-    if (thread->GetStatus() == ThreadStatus::Ready)
-        ready_queue.adjust(thread, thread->GetPriority(), priority);
-}
-
-Thread* Scheduler::GetNextSuggestedThread(u32 core, u32 maximum_priority) const {
-    std::lock_guard lock{scheduler_mutex};
-
-    const u32 mask = 1U << core;
-    for (auto* thread : ready_queue) {
-        if ((thread->GetAffinityMask() & mask) != 0 && thread->GetPriority() < maximum_priority) {
-            return thread;
-        }
-    }
-    return nullptr;
-}
-
-void Scheduler::YieldWithoutLoadBalancing(Thread* thread) {
-    ASSERT(thread != nullptr);
-    // Avoid yielding if the thread isn't even running.
-    ASSERT(thread->GetStatus() == ThreadStatus::Running);
-
-    // Sanity check that the priority is valid
-    ASSERT(thread->GetPriority() < THREADPRIO_COUNT);
-
-    // Yield this thread -- sleep for zero time and force reschedule to different thread
-    GetCurrentThread()->Sleep(0);
-}
-
-void Scheduler::YieldWithLoadBalancing(Thread* thread) {
-    ASSERT(thread != nullptr);
-    const auto priority = thread->GetPriority();
-    const auto core = static_cast<u32>(thread->GetProcessorID());
-
-    // Avoid yielding if the thread isn't even running.
-    ASSERT(thread->GetStatus() == ThreadStatus::Running);
-
-    // Sanity check that the priority is valid
-    ASSERT(priority < THREADPRIO_COUNT);
-
-    // Sleep for zero time to be able to force reschedule to different thread
-    GetCurrentThread()->Sleep(0);
-
-    Thread* suggested_thread = nullptr;
-
-    // Search through all of the cpu cores (except this one) for a suggested thread.
-    // Take the first non-nullptr one
-    for (unsigned cur_core = 0; cur_core < Core::NUM_CPU_CORES; ++cur_core) {
-        const auto res =
-            system.CpuCore(cur_core).Scheduler().GetNextSuggestedThread(core, priority);
-
-        // If scheduler provides a suggested thread
-        if (res != nullptr) {
-            // And its better than the current suggested thread (or is the first valid one)
-            if (suggested_thread == nullptr ||
-                suggested_thread->GetPriority() > res->GetPriority()) {
-                suggested_thread = res;
-            }
-        }
-    }
-
-    // If a suggested thread was found, queue that for this core
-    if (suggested_thread != nullptr)
-        suggested_thread->ChangeCore(core, suggested_thread->GetAffinityMask());
-}
-
-void Scheduler::YieldAndWaitForLoadBalancing(Thread* thread) {
-    UNIMPLEMENTED_MSG("Wait for load balancing thread yield type is not implemented!");
+void Scheduler::Shutdown() {
+    current_thread = nullptr;
+    selected_thread = nullptr;
 }

 } // namespace Kernel
--- a/src/core/hle/kernel/scheduler.h
+++ b/src/core/hle/kernel/scheduler.h
@@ -20,124 +20,185 @@ namespace Kernel {

 class Process;

-class Scheduler final {
+class GlobalScheduler final {
 public:
-    explicit Scheduler(Core::System& system, Core::ARM_Interface& cpu_core);
-    ~Scheduler();
+    static constexpr u32 NUM_CPU_CORES = 4;

-    /// Returns whether there are any threads that are ready to run.
-    bool HaveReadyThreads() const;
-
-    /// Reschedules to the next available thread (call after current thread is suspended)
-    void Reschedule();
-
-    /// Gets the current running thread
-    Thread* GetCurrentThread() const;
-
-    /// Gets the timestamp for the last context switch in ticks.
-    u64 GetLastContextSwitchTicks() const;
+    explicit GlobalScheduler(Core::System& system);
+    ~GlobalScheduler();

    /// Adds a new thread to the scheduler
    void AddThread(SharedPtr<Thread> thread);

    /// Removes a thread from the scheduler
-    void RemoveThread(Thread* thread);
-
-    /// Schedules a thread that has become "ready"
-    void ScheduleThread(Thread* thread, u32 priority);
-
-    /// Unschedules a thread that was already scheduled
-    void UnscheduleThread(Thread* thread, u32 priority);
-
-    /// Sets the priority of a thread in the scheduler
-    void SetThreadPriority(Thread* thread, u32 priority);
-
-    /// Gets the next suggested thread for load balancing
-    Thread* GetNextSuggestedThread(u32 core, u32 minimum_priority) const;
-
-    /**
-     * YieldWithoutLoadBalancing -- analogous to normal yield on a system
-     * Moves the thread to the end of the ready queue for its priority, and then reschedules the
-     * system to the new head of the queue.
-     *
-     * Example (Single Core -- but can be extrapolated to multi):
-     * ready_queue[prio=0]: ThreadA, ThreadB, ThreadC (->exec order->)
-     * Currently Running: ThreadR
-     *
-     * ThreadR calls YieldWithoutLoadBalancing
-     *
-     * ThreadR is moved to the end of ready_queue[prio=0]:
-     * ready_queue[prio=0]: ThreadA, ThreadB, ThreadC, ThreadR (->exec order->)
-     * Currently Running: Nothing
-     *
-     * System is rescheduled (ThreadA is popped off of queue):
-     * ready_queue[prio=0]: ThreadB, ThreadC, ThreadR (->exec order->)
-     * Currently Running: ThreadA
-     *
-     * If the queue is empty at time of call, no yielding occurs. This does not cross between cores
-     * or priorities at all.
-     */
-    void YieldWithoutLoadBalancing(Thread* thread);
-
-    /**
-     * YieldWithLoadBalancing -- yield but with better selection of the new running thread
-     * Moves the current thread to the end of the ready queue for its priority, then selects a
-     * 'suggested thread' (a thread on a different core that could run on this core) from the
-     * scheduler, changes its core, and reschedules the current core to that thread.
-     *
-     * Example (Dual Core -- can be extrapolated to Quad Core, this is just normal yield if it were
-     * single core):
-     * ready_queue[core=0][prio=0]: ThreadA, ThreadB (affinities not pictured as irrelevant
-     * ready_queue[core=1][prio=0]: ThreadC[affinity=both], ThreadD[affinity=core1only]
-     * Currently Running: ThreadQ on Core 0 || ThreadP on Core 1
-     *
-     * ThreadQ calls YieldWithLoadBalancing
-     *
-     * ThreadQ is moved to the end of ready_queue[core=0][prio=0]:
-     * ready_queue[core=0][prio=0]: ThreadA, ThreadB
-     * ready_queue[core=1][prio=0]: ThreadC[affinity=both], ThreadD[affinity=core1only]
-     * Currently Running: ThreadQ on Core 0 || ThreadP on Core 1
-     *
-     * A list of suggested threads for each core is compiled
-     * Suggested Threads: {ThreadC on Core 1}
-     * If this were quad core (as the switch is), there could be between 0 and 3 threads in this
-     * list. If there are more than one, the thread is selected by highest prio.
-     *
-     * ThreadC is core changed to Core 0:
-     * ready_queue[core=0][prio=0]: ThreadC, ThreadA, ThreadB, ThreadQ
-     * ready_queue[core=1][prio=0]: ThreadD
-     * Currently Running: None on Core 0 || ThreadP on Core 1
-     *
-     * System is rescheduled (ThreadC is popped off of queue):
-     * ready_queue[core=0][prio=0]: ThreadA, ThreadB, ThreadQ
-     * ready_queue[core=1][prio=0]: ThreadD
-     * Currently Running: ThreadC on Core 0 || ThreadP on Core 1
-     *
-     * If no suggested threads can be found this will behave just as normal yield. If there are
-     * multiple candidates for the suggested thread on a core, the highest prio is taken.
-     */
-    void YieldWithLoadBalancing(Thread* thread);
-
-    /// Currently unknown -- asserts as unimplemented on call
-    void YieldAndWaitForLoadBalancing(Thread* thread);
+    void RemoveThread(const Thread* thread);

    /// Returns a list of all threads managed by the scheduler
    const std::vector<SharedPtr<Thread>>& GetThreadList() const {
        return thread_list;
    }

-private:
    /**
-     * Pops and returns the next thread from the thread queue
-     * @return A pointer to the next ready thread
+     * Add a thread to the suggested queue of a cpu core. Suggested threads may be
+     * picked if no thread is scheduled to run on the core.
     */
-    Thread* PopNextReadyThread();
+    void Suggest(u32 priority, u32 core, Thread* thread);

    /**
-     * Switches the CPU's active thread context to that of the specified thread
-     * @param new_thread The thread to switch to
+     * Remove a thread from the suggested queue of a cpu core. Suggested threads may be
+     * picked if no thread is scheduled to run on the core.
     */
-    void SwitchContext(Thread* new_thread);
+    void Unsuggest(u32 priority, u32 core, Thread* thread);
+
+    /**
+     * Add a thread to the scheduling queue of a cpu core. The thread is added at the
+     * back of the queue in its priority level.
+     */
+    void Schedule(u32 priority, u32 core, Thread* thread);
+
+    /**
+     * Add a thread to the scheduling queue of a cpu core. The thread is added at the
+     * front of the queue in its priority level.
+     */
+    void SchedulePrepend(u32 priority, u32 core, Thread* thread);
+
+    /// Reschedule an already scheduled thread based on a new priority
+    void Reschedule(u32 priority, u32 core, Thread* thread);
+
+    /// Unschedules a thread.
+    void Unschedule(u32 priority, u32 core, Thread* thread);
+
+    /**
+     * Transfers a thread into a specific core. If the destination_core is -1
+     * it will be unscheduled from its source core and added into its suggested
+     * queue.
+     */
+    void TransferToCore(u32 priority, s32 destination_core, Thread* thread);
+
+    /// Selects a core and forces it to unload its current thread's context
+    void UnloadThread(s32 core);
+
+    /**
+     * Takes care of selecting the new scheduled thread in three steps:
+     *
+     * 1. First a thread is selected from the top of the priority queue. If no thread
+     *    is obtained then we move to step two, else we are done.
+     *
+     * 2. Second we try to get a suggested thread that's not assigned to any core or
+     *    that is not the top thread in that core.
+     *
+     * 3. Third, if no suggested thread is found, we do a second pass and pick a running
+     *    thread in another core and swap it with its current thread.
+     */
+    void SelectThread(u32 core);
+
+    bool HaveReadyThreads(u32 core_id) const {
+        return !scheduled_queue[core_id].empty();
+    }
+
+    /**
+     * Takes a thread and moves it to the back of its priority list.
+     *
+     * @note This operation can be redundant and no scheduling is changed if marked as so.
+     */
+    bool YieldThread(Thread* thread);
+
+    /**
+     * Takes a thread and moves it to the back of its priority list.
+     * Afterwards, tries to pick a suggested thread from the suggested queue that has worse time or
+     * a better priority than the next thread in the core.
+     *
+     * @note This operation can be redundant and no scheduling is changed if marked as so.
+     */
+    bool YieldThreadAndBalanceLoad(Thread* thread);
+
+    /**
+     * Takes a thread and moves it out of the scheduling queue
+     * and into the suggested queue. If no thread can be scheduled afterwards in that core,
+     * a suggested thread is obtained instead.
+     *
+     * @note This operation can be redundant and no scheduling is changed if marked as so.
+     */
+    bool YieldThreadAndWaitForLoadBalancing(Thread* thread);
+
+    /**
+     * Rotates the scheduling queues of threads at a preemption priority and then does
+     * some core rebalancing. Preemption priorities can be found in the array
+     * 'preemption_priorities'.
+     *
+     * @note This operation happens every 10ms.
+     */
+    void PreemptThreads();
+
+    u32 CpuCoresCount() const {
+        return NUM_CPU_CORES;
+    }
+
+    void SetReselectionPending() {
+        is_reselection_pending.store(true, std::memory_order_release);
+    }
+
+    bool IsReselectionPending() const {
+        return is_reselection_pending.load(std::memory_order_acquire);
+    }
+
+    void Shutdown();
+
+private:
+    bool AskForReselectionOrMarkRedundant(Thread* current_thread, const Thread* winner);
+
+    static constexpr u32 min_regular_priority = 2;
+    std::array<Common::MultiLevelQueue<Thread*, THREADPRIO_COUNT>, NUM_CPU_CORES> scheduled_queue;
+    std::array<Common::MultiLevelQueue<Thread*, THREADPRIO_COUNT>, NUM_CPU_CORES> suggested_queue;
+    std::atomic<bool> is_reselection_pending{false};
+
+    // The priority levels at which the global scheduler preempts threads every 10 ms. They are
+    // ordered from Core 0 to Core 3.
+    std::array<u32, NUM_CPU_CORES> preemption_priorities = {59, 59, 59, 62};
+
+    /// Lists all thread ids that aren't deleted/etc.
+    std::vector<SharedPtr<Thread>> thread_list;
+    Core::System& system;
+};
+
+class Scheduler final {
+public:
+    explicit Scheduler(Core::System& system, Core::ARM_Interface& cpu_core, u32 core_id);
+    ~Scheduler();
+
+    /// Returns whether there are any threads that are ready to run.
+    bool HaveReadyThreads() const;
+
+    /// Reschedules to the next available thread (call after current thread is suspended)
+    void TryDoContextSwitch();
+
+    /// Unloads currently running thread
+    void UnloadThread();
+
+    /// Selects the threads at the top of the scheduling multilist.
+    void SelectThreads();
+
+    /// Gets the current running thread
+    Thread* GetCurrentThread() const;
+
+    /// Gets the currently selected thread from the top of the multilevel queue
+    Thread* GetSelectedThread() const;
+
+    /// Gets the timestamp for the last context switch in ticks.
+    u64 GetLastContextSwitchTicks() const;
+
+    bool ContextSwitchPending() const {
+        return is_context_switch_pending;
+    }
+
+    /// Shuts down the scheduler.
+    void Shutdown();
+
+private:
+    friend class GlobalScheduler;
+
+    /// Switches the CPU's active thread context to that of the specified thread
+    void SwitchContext();

    /**
     * Called on every context switch to update the internal timestamp
@@ -152,19 +213,16 @@ private:
     */
    void UpdateLastContextSwitchTime(Thread* thread, Process* process);

-    /// Lists all thread ids that aren't deleted/etc.
-    std::vector<SharedPtr<Thread>> thread_list;
-
-    /// Lists only ready thread ids.
-    Common::MultiLevelQueue<Thread*, THREADPRIO_LOWEST + 1> ready_queue;
-
    SharedPtr<Thread> current_thread = nullptr;
-
-    Core::ARM_Interface& cpu_core;
-    u64 last_context_switch_time = 0;
+    SharedPtr<Thread> selected_thread = nullptr;

    Core::System& system;
-    static std::mutex scheduler_mutex;
+    Core::ARM_Interface& cpu_core;
+    u64 last_context_switch_time = 0;
+    u64 idle_selection_count = 0;
+    const u32 core_id;
+
+    bool is_context_switch_pending = false;
 };

 } // namespace Kernel
--- a/src/core/hle/kernel/svc.cpp
+++ b/src/core/hle/kernel/svc.cpp
@@ -516,7 +516,7 @@ static ResultCode WaitSynchronization(Core::System& system, Handle* index, VAddr
    thread->WakeAfterDelay(nano_seconds);
    thread->SetWakeupCallback(DefaultThreadWakeupCallback);

-    system.CpuCore(thread->GetProcessorID()).PrepareReschedule();
+    system.PrepareReschedule(thread->GetProcessorID());

    return RESULT_TIMEOUT;
 }
@@ -534,6 +534,7 @@ static ResultCode CancelSynchronization(Core::System& system, Handle thread_hand
    }

    thread->CancelWait();
+    system.PrepareReschedule(thread->GetProcessorID());
    return RESULT_SUCCESS;
 }

@@ -577,7 +578,8 @@ static ResultCode ArbitrateUnlock(Core::System& system, VAddr mutex_addr) {
    }

    auto* const current_process = system.Kernel().CurrentProcess();
-    return current_process->GetMutex().Release(mutex_addr);
+    return current_process->GetMutex().Release(mutex_addr,
+                                               system.CurrentScheduler().GetCurrentThread());
 }

 enum class BreakType : u32 {
@@ -1066,6 +1068,8 @@ static ResultCode SetThreadActivity(Core::System& system, Handle handle, u32 act
    }

    thread->SetActivity(static_cast<ThreadActivity>(activity));
+
+    system.PrepareReschedule(thread->GetProcessorID());
    return RESULT_SUCCESS;
 }

@@ -1147,7 +1151,7 @@ static ResultCode SetThreadPriority(Core::System& system, Handle handle, u32 pri

    thread->SetPriority(priority);

-    system.CpuCore(thread->GetProcessorID()).PrepareReschedule();
+    system.PrepareReschedule(thread->GetProcessorID());
    return RESULT_SUCCESS;
 }

@@ -1503,7 +1507,7 @@ static ResultCode CreateThread(Core::System& system, Handle* out_handle, VAddr e
    thread->SetName(
        fmt::format("thread[entry_point={:X}, handle={:X}]", entry_point, *new_thread_handle));

-    system.CpuCore(thread->GetProcessorID()).PrepareReschedule();
+    system.PrepareReschedule(thread->GetProcessorID());

    return RESULT_SUCCESS;
 }
@@ -1525,7 +1529,7 @@ static ResultCode StartThread(Core::System& system, Handle thread_handle) {
    thread->ResumeFromWait();

    if (thread->GetStatus() == ThreadStatus::Ready) {
-        system.CpuCore(thread->GetProcessorID()).PrepareReschedule();
+        system.PrepareReschedule(thread->GetProcessorID());
    }

    return RESULT_SUCCESS;
@@ -1537,7 +1541,7 @@ static void ExitThread(Core::System& system) {

    auto* const current_thread = system.CurrentScheduler().GetCurrentThread();
    current_thread->Stop();
-    system.CurrentScheduler().RemoveThread(current_thread);
+    system.GlobalScheduler().RemoveThread(current_thread);
    system.PrepareReschedule();
 }

@@ -1553,17 +1557,18 @@ static void SleepThread(Core::System& system, s64 nanoseconds) {

    auto& scheduler = system.CurrentScheduler();
    auto* const current_thread = scheduler.GetCurrentThread();
+    bool is_redundant = false;

    if (nanoseconds <= 0) {
        switch (static_cast<SleepType>(nanoseconds)) {
        case SleepType::YieldWithoutLoadBalancing:
-            scheduler.YieldWithoutLoadBalancing(current_thread);
+            is_redundant = current_thread->YieldSimple();
            break;
        case SleepType::YieldWithLoadBalancing:
-            scheduler.YieldWithLoadBalancing(current_thread);
+            is_redundant = current_thread->YieldAndBalanceLoad();
            break;
        case SleepType::YieldAndWaitForLoadBalancing:
-            scheduler.YieldAndWaitForLoadBalancing(current_thread);
+            is_redundant = current_thread->YieldAndWaitForLoadBalancing();
            break;
        default:
            UNREACHABLE_MSG("Unimplemented sleep yield type '{:016X}'!", nanoseconds);
@@ -1572,10 +1577,13 @@ static void SleepThread(Core::System& system, s64 nanoseconds) {
        current_thread->Sleep(nanoseconds);
    }

-    // Reschedule all CPU cores
-    for (std::size_t i = 0; i < Core::NUM_CPU_CORES; ++i) {
-        system.CpuCore(i).PrepareReschedule();
+    if (is_redundant) {
+        // If it's redundant, the core is pretty much idle. Some games keep idling
+        // a core while it's doing nothing, so we advance timing to avoid costly
+        // continuous calls.
+        system.CoreTiming().AddTicks(2000);
    }
+    system.PrepareReschedule(current_thread->GetProcessorID());
 }

 /// Wait process wide key atomic
@@ -1601,17 +1609,21 @@ static ResultCode WaitProcessWideKeyAtomic(Core::System& system, VAddr mutex_add
        return ERR_INVALID_ADDRESS;
    }

+    ASSERT(condition_variable_addr == Common::AlignDown(condition_variable_addr, 4));
+
    auto* const current_process = system.Kernel().CurrentProcess();
    const auto& handle_table = current_process->GetHandleTable();
    SharedPtr<Thread> thread = handle_table.Get<Thread>(thread_handle);
    ASSERT(thread);

-    const auto release_result = current_process->GetMutex().Release(mutex_addr);
+    SharedPtr<Thread> current_thread = system.CurrentScheduler().GetCurrentThread();
+
+    const auto release_result =
+        current_process->GetMutex().Release(mutex_addr, current_thread.get());
    if (release_result.IsError()) {
        return release_result;
    }

-    SharedPtr<Thread> current_thread = system.CurrentScheduler().GetCurrentThread();
    current_thread->SetCondVarWaitAddress(condition_variable_addr);
    current_thread->SetMutexWaitAddress(mutex_addr);
    current_thread->SetWaitHandle(thread_handle);
@@ -1622,7 +1634,7 @@ static ResultCode WaitProcessWideKeyAtomic(Core::System& system, VAddr mutex_add

    // Note: Deliberately don't attempt to inherit the lock owner's priority.

-    system.CpuCore(current_thread->GetProcessorID()).PrepareReschedule();
+    system.PrepareReschedule(current_thread->GetProcessorID());
    return RESULT_SUCCESS;
 }

@@ -1632,24 +1644,19 @@ static ResultCode SignalProcessWideKey(Core::System& system, VAddr condition_var
    LOG_TRACE(Kernel_SVC, "called, condition_variable_addr=0x{:X}, target=0x{:08X}",
              condition_variable_addr, target);

-    const auto RetrieveWaitingThreads = [&system](std::size_t core_index,
-                                                  std::vector<SharedPtr<Thread>>& waiting_threads,
-                                                  VAddr condvar_addr) {
-        const auto& scheduler = system.Scheduler(core_index);
-        const auto& thread_list = scheduler.GetThreadList();
-
-        for (const auto& thread : thread_list) {
-            if (thread->GetCondVarWaitAddress() == condvar_addr)
-                waiting_threads.push_back(thread);
-        }
-    };
+    ASSERT(condition_variable_addr == Common::AlignDown(condition_variable_addr, 4));

    // Retrieve a list of all threads that are waiting for this condition variable.
    std::vector<SharedPtr<Thread>> waiting_threads;
-    RetrieveWaitingThreads(0, waiting_threads, condition_variable_addr);
-    RetrieveWaitingThreads(1, waiting_threads, condition_variable_addr);
-    RetrieveWaitingThreads(2, waiting_threads, condition_variable_addr);
-    RetrieveWaitingThreads(3, waiting_threads, condition_variable_addr);
+    const auto& scheduler = system.GlobalScheduler();
+    const auto& thread_list = scheduler.GetThreadList();
+
+    for (const auto& thread : thread_list) {
+        if (thread->GetCondVarWaitAddress() == condition_variable_addr) {
+            waiting_threads.push_back(thread);
+        }
+    }
+
    // Sort them by priority, such that the highest priority ones come first.
    std::sort(waiting_threads.begin(), waiting_threads.end(),
              [](const SharedPtr<Thread>& lhs, const SharedPtr<Thread>& rhs) {
@@ -1679,18 +1686,20 @@ static ResultCode SignalProcessWideKey(Core::System& system, VAddr condition_var

        // Atomically read the value of the mutex.
        u32 mutex_val = 0;
+        u32 update_val = 0;
+        const VAddr mutex_address = thread->GetMutexWaitAddress();
        do {
-            monitor.SetExclusive(current_core, thread->GetMutexWaitAddress());
+            monitor.SetExclusive(current_core, mutex_address);

            // If the mutex is not yet acquired, acquire it.
-            mutex_val = Memory::Read32(thread->GetMutexWaitAddress());
+            mutex_val = Memory::Read32(mutex_address);

            if (mutex_val != 0) {
-                monitor.ClearExclusive();
-                break;
+                update_val = mutex_val | Mutex::MutexHasWaitersFlag;
+            } else {
+                update_val = thread->GetWaitHandle();
            }
-        } while (!monitor.ExclusiveWrite32(current_core, thread->GetMutexWaitAddress(),
-                                           thread->GetWaitHandle()));
+        } while (!monitor.ExclusiveWrite32(current_core, mutex_address, update_val));
        if (mutex_val == 0) {
            // We were able to acquire the mutex, resume this thread.
            ASSERT(thread->GetStatus() == ThreadStatus::WaitCondVar);
@@ -1704,20 +1713,9 @@ static ResultCode SignalProcessWideKey(Core::System& system, VAddr condition_var
            thread->SetLockOwner(nullptr);
            thread->SetMutexWaitAddress(0);
            thread->SetWaitHandle(0);
-            system.CpuCore(thread->GetProcessorID()).PrepareReschedule();
+            thread->SetWaitSynchronizationResult(RESULT_SUCCESS);
+            system.PrepareReschedule(thread->GetProcessorID());
        } else {
-            // Atomically signal that the mutex now has a waiting thread.
-            do {
-                monitor.SetExclusive(current_core, thread->GetMutexWaitAddress());
-
-                // Ensure that the mutex value is still what we expect.
-                u32 value = Memory::Read32(thread->GetMutexWaitAddress());
-                // TODO(Subv): When this happens, the kernel just clears the exclusive state and
-                // retries the initial read for this thread.
-                ASSERT_MSG(mutex_val == value, "Unhandled synchronization primitive case");
-            } while (!monitor.ExclusiveWrite32(current_core, thread->GetMutexWaitAddress(),
-                                               mutex_val | Mutex::MutexHasWaitersFlag));
-
            // The mutex is already owned by some other thread, make this thread wait on it.
            const Handle owner_handle = static_cast<Handle>(mutex_val & Mutex::MutexOwnerMask);
            const auto& handle_table = system.Kernel().CurrentProcess()->GetHandleTable();
@@ -1728,6 +1726,7 @@ static ResultCode SignalProcessWideKey(Core::System& system, VAddr condition_var
            thread->SetStatus(ThreadStatus::WaitMutex);

            owner->AddMutexWaiter(thread);
+            system.PrepareReschedule(thread->GetProcessorID());
        }
    }

@@ -1754,7 +1753,12 @@ static ResultCode WaitForAddress(Core::System& system, VAddr address, u32 type,

    const auto arbitration_type = static_cast<AddressArbiter::ArbitrationType>(type);
    auto& address_arbiter = system.Kernel().CurrentProcess()->GetAddressArbiter();
-    return address_arbiter.WaitForAddress(address, arbitration_type, value, timeout);
+    const ResultCode result =
+        address_arbiter.WaitForAddress(address, arbitration_type, value, timeout);
+    if (result == RESULT_SUCCESS) {
+        system.PrepareReschedule();
+    }
+    return result;
 }

 // Signals to an address (via Address Arbiter)
@@ -2040,7 +2044,10 @@ static ResultCode SetThreadCoreMask(Core::System& system, Handle thread_handle,
        return ERR_INVALID_HANDLE;
    }

+    system.PrepareReschedule(thread->GetProcessorID());
    thread->ChangeCore(core, affinity_mask);
+    system.PrepareReschedule(thread->GetProcessorID());
+
    return RESULT_SUCCESS;
 }

@@ -2095,7 +2102,7 @@ static ResultCode CreateEvent(Core::System& system, Handle* write_handle, Handle

    auto& kernel = system.Kernel();
    const auto [readable_event, writable_event] =
-        WritableEvent::CreateEventPair(kernel, ResetType::Manual, "CreateEvent");
+        WritableEvent::CreateEventPair(kernel, "CreateEvent");

    HandleTable& handle_table = kernel.CurrentProcess()->GetHandleTable();

@@ -2151,6 +2158,7 @@ static ResultCode SignalEvent(Core::System& system, Handle handle) {
    }

    writable_event->Signal();
+    system.PrepareReschedule();
    return RESULT_SUCCESS;
 }

--- a/src/core/hle/kernel/thread.cpp
+++ b/src/core/hle/kernel/thread.cpp
@@ -45,15 +45,7 @@ void Thread::Stop() {
                                                             callback_handle);
    kernel.ThreadWakeupCallbackHandleTable().Close(callback_handle);
    callback_handle = 0;
-
-    // Clean up thread from ready queue
-    // This is only needed when the thread is terminated forcefully (SVC TerminateProcess)
-    if (status == ThreadStatus::Ready || status == ThreadStatus::Paused) {
-        scheduler->UnscheduleThread(this, current_priority);
-    }
-
-    status = ThreadStatus::Dead;
-
+    SetStatus(ThreadStatus::Dead);
    WakeupAllWaitingThreads();

    // Clean up any dangling references in objects that this thread was waiting for
@@ -132,17 +124,16 @@ void Thread::ResumeFromWait() {
    wakeup_callback = nullptr;

    if (activity == ThreadActivity::Paused) {
-        status = ThreadStatus::Paused;
+        SetStatus(ThreadStatus::Paused);
        return;
    }

-    status = ThreadStatus::Ready;
-
-    ChangeScheduler();
+    SetStatus(ThreadStatus::Ready);
 }

 void Thread::CancelWait() {
    ASSERT(GetStatus() == ThreadStatus::WaitSynch);
+    ClearWaitObjects();
    SetWaitSynchronizationResult(ERR_SYNCHRONIZATION_CANCELED);
    ResumeFromWait();
 }
@@ -205,9 +196,9 @@ ResultVal<SharedPtr<Thread>> Thread::Create(KernelCore& kernel, std::string name
    thread->name = std::move(name);
    thread->callback_handle = kernel.ThreadWakeupCallbackHandleTable().Create(thread).Unwrap();
    thread->owner_process = &owner_process;
+    auto& scheduler = kernel.GlobalScheduler();
+    scheduler.AddThread(thread);
    thread->tls_address = thread->owner_process->CreateTLSRegion();
-    thread->scheduler = &system.Scheduler(processor_id);
-    thread->scheduler->AddThread(thread);

    thread->owner_process->RegisterThread(thread.get());

@@ -250,6 +241,22 @@ void Thread::SetStatus(ThreadStatus new_status) {
        return;
    }

+    switch (new_status) {
+    case ThreadStatus::Ready:
+    case ThreadStatus::Running:
+        SetSchedulingStatus(ThreadSchedStatus::Runnable);
+        break;
+    case ThreadStatus::Dormant:
+        SetSchedulingStatus(ThreadSchedStatus::None);
+        break;
+    case ThreadStatus::Dead:
+        SetSchedulingStatus(ThreadSchedStatus::Exited);
+        break;
+    default:
+        SetSchedulingStatus(ThreadSchedStatus::Paused);
+        break;
+    }
+
    if (status == ThreadStatus::Running) {
        last_running_ticks = Core::System::GetInstance().CoreTiming().GetTicks();
    }
@@ -311,8 +318,7 @@ void Thread::UpdatePriority() {
        return;
    }

-    scheduler->SetThreadPriority(this, new_priority);
-    current_priority = new_priority;
+    SetCurrentPriority(new_priority);

    if (!lock_owner) {
        return;
@@ -328,47 +334,7 @@ void Thread::UpdatePriority() {
 }

 void Thread::ChangeCore(u32 core, u64 mask) {
-    ideal_core = core;
-    affinity_mask = mask;
-    ChangeScheduler();
-}
-
-void Thread::ChangeScheduler() {
-    if (status != ThreadStatus::Ready) {
-        return;
-    }
-
-    auto& system = Core::System::GetInstance();
-    std::optional<s32> new_processor_id{GetNextProcessorId(affinity_mask)};
-
-    if (!new_processor_id) {
-        new_processor_id = processor_id;
-    }
-    if (ideal_core != -1 && system.Scheduler(ideal_core).GetCurrentThread() == nullptr) {
-        new_processor_id = ideal_core;
-    }
-
-    ASSERT(*new_processor_id < 4);
-
-    // Add thread to new core's scheduler
-    auto& next_scheduler = system.Scheduler(*new_processor_id);
-
-    if (*new_processor_id != processor_id) {
-        // Remove thread from previous core's scheduler
-        scheduler->RemoveThread(this);
-        next_scheduler.AddThread(this);
-    }
-
-    processor_id = *new_processor_id;
-
-    // If the thread was ready, unschedule from the previous core and schedule on the new core
-    scheduler->UnscheduleThread(this, current_priority);
-    next_scheduler.ScheduleThread(this, current_priority);
-
-    // Change thread's scheduler
-    scheduler = &next_scheduler;
-
-    system.CpuCore(processor_id).PrepareReschedule();
+    SetCoreAndAffinityMask(core, mask);
 }

 bool Thread::AllWaitObjectsReady() const {
@@ -388,10 +354,8 @@ void Thread::SetActivity(ThreadActivity value) {

    if (value == ThreadActivity::Paused) {
        // Set status if not waiting
-        if (status == ThreadStatus::Ready) {
-            status = ThreadStatus::Paused;
-        } else if (status == ThreadStatus::Running) {
-            status = ThreadStatus::Paused;
+        if (status == ThreadStatus::Ready || status == ThreadStatus::Running) {
+            SetStatus(ThreadStatus::Paused);
            Core::System::GetInstance().CpuCore(processor_id).PrepareReschedule();
        }
    } else if (status == ThreadStatus::Paused) {
@@ -408,6 +372,170 @@ void Thread::Sleep(s64 nanoseconds) {
    WakeAfterDelay(nanoseconds);
 }

+bool Thread::YieldSimple() {
+    // Forward the request to the global scheduler and hand its result straight back.
+    return kernel.GlobalScheduler().YieldThread(this);
+}
+
+bool Thread::YieldAndBalanceLoad() {
+    // Forward to the global scheduler's load-balancing yield and return what it reports.
+    return kernel.GlobalScheduler().YieldThreadAndBalanceLoad(this);
+}
+
+bool Thread::YieldAndWaitForLoadBalancing() {
+    // Forward to the global scheduler's yield-and-wait variant and return what it reports.
+    return kernel.GlobalScheduler().YieldThreadAndWaitForLoadBalancing(this);
+}
+
+void Thread::SetSchedulingStatus(ThreadSchedStatus new_status) {
+    const u32 previous_state = scheduling_state;
+    const u32 kept_flags = previous_state & static_cast<u32>(ThreadSchedMasks::HighMask);
+    scheduling_state = kept_flags | static_cast<u32>(new_status); // Only the status bits change
+    AdjustSchedulingOnStatus(previous_state);
+}
+
+void Thread::SetCurrentPriority(u32 new_priority) {
+    // Store the new priority, then requeue using the priority it replaced.
+    AdjustSchedulingOnPriority(std::exchange(current_priority, new_priority));
+}
+
+ResultCode Thread::SetCoreAndAffinityMask(s32 new_core, u64 new_affinity_mask) {
+    // Applies a new ideal core and affinity mask, requeueing the thread when the mask changes.
+    const auto HighestSetCore = [](u64 mask, u32 max_cores) {
+        for (s32 core = max_cores - 1; core >= 0; core--) {
+            if (((mask >> core) & 1) != 0) {
+                return core;
+            }
+        }
+        return -1; // No core below max_cores is set in mask
+    };
+    const bool use_override = affinity_override_count != 0; // Updates then go to the overrides
+    if (new_core == THREADPROCESSORID_DONT_UPDATE) {
+        new_core = use_override ? ideal_core_override : ideal_core; // Keep the stored ideal core
+        if (new_core < 0 || (new_affinity_mask & (1ULL << new_core)) == 0) {
+            return ERR_INVALID_COMBINATION; // A negative shift count would be undefined
+        }
+    }
+    if (use_override) {
+        ideal_core_override = new_core;
+        affinity_mask_override = new_affinity_mask;
+    } else {
+        const u64 old_affinity_mask = std::exchange(affinity_mask, new_affinity_mask);
+        ideal_core = new_core;
+        if (old_affinity_mask != new_affinity_mask) {
+            const s32 old_core = processor_id;
+            if (processor_id >= 0 && ((affinity_mask >> processor_id) & 1) == 0) {
+                if (ideal_core < 0) { // Current core is no longer allowed: pick a replacement
+                    processor_id = HighestSetCore(affinity_mask, GlobalScheduler::NUM_CPU_CORES);
+                } else {
+                    processor_id = ideal_core;
+                }
+            }
+            AdjustSchedulingOnAffinity(old_affinity_mask, old_core);
+        }
+    }
+    return RESULT_SUCCESS;
+}
+
+void Thread::AdjustSchedulingOnStatus(u32 old_flags) {
+    if (old_flags == scheduling_state) {
+        return; // State is unchanged, so the queues need no update
+    }
+
+    auto& scheduler = kernel.GlobalScheduler();
+    if (static_cast<ThreadSchedStatus>(old_flags & static_cast<u32>(ThreadSchedMasks::LowMask)) ==
+        ThreadSchedStatus::Runnable) {
+        // The thread was runnable before this change: remove it from the scheduler queues.
+        if (processor_id >= 0) {
+            scheduler.Unschedule(current_priority, processor_id, this);
+        }
+
+        for (s32 core = 0; core < GlobalScheduler::NUM_CPU_CORES; core++) {
+            if (core != processor_id && ((affinity_mask >> core) & 1) != 0) {
+                scheduler.Unsuggest(current_priority, static_cast<u32>(core), this);
+            }
+        }
+    } else if (GetSchedulingStatus() == ThreadSchedStatus::Runnable) {
+        // The thread has just become runnable: add it to the scheduler queues.
+        if (processor_id >= 0) {
+            scheduler.Schedule(current_priority, processor_id, this);
+        }
+
+        for (s32 core = 0; core < GlobalScheduler::NUM_CPU_CORES; core++) {
+            if (core != processor_id && ((affinity_mask >> core) & 1) != 0) {
+                scheduler.Suggest(current_priority, static_cast<u32>(core), this);
+            }
+        }
+    }
+
+    scheduler.SetReselectionPending();
+}
+
+void Thread::AdjustSchedulingOnPriority(u32 old_priority) {
+    if (GetSchedulingStatus() != ThreadSchedStatus::Runnable) {
+        return; // Queue entries are only maintained while the thread is runnable
+    }
+    auto& scheduler = Core::System::GetInstance().GlobalScheduler();
+    if (processor_id >= 0) {
+        scheduler.Unschedule(old_priority, processor_id, this); // Leave the old-priority queue
+    }
+
+    for (u32 core = 0; core < GlobalScheduler::NUM_CPU_CORES; core++) {
+        if (core != processor_id && ((affinity_mask >> core) & 1) != 0) {
+            scheduler.Unsuggest(old_priority, core, this);
+        }
+    }
+
+    // Add the thread to the new priority queues; the current thread uses SchedulePrepend.
+    Thread* current_thread = GetCurrentThread();
+
+    if (processor_id >= 0) {
+        if (current_thread == this) {
+            scheduler.SchedulePrepend(current_priority, processor_id, this);
+        } else {
+            scheduler.Schedule(current_priority, processor_id, this);
+        }
+    }
+
+    for (u32 core = 0; core < GlobalScheduler::NUM_CPU_CORES; core++) {
+        if (core != processor_id && ((affinity_mask >> core) & 1) != 0) {
+            scheduler.Suggest(current_priority, core, this);
+        }
+    }
+
+    scheduler.SetReselectionPending();
+}
+
+void Thread::AdjustSchedulingOnAffinity(u64 old_affinity_mask, s32 old_core) {
+    auto& scheduler = Core::System::GetInstance().GlobalScheduler();
+    if (GetSchedulingStatus() != ThreadSchedStatus::Runnable ||
+        current_priority >= THREADPRIO_COUNT) {
+        return; // Not runnable, or priority is not below THREADPRIO_COUNT: nothing to move
+    }
+
+    for (u32 core = 0; core < GlobalScheduler::NUM_CPU_CORES; core++) {
+        if (((old_affinity_mask >> core) & 1) != 0) {
+            if (core == old_core) {
+                scheduler.Unschedule(current_priority, core, this); // Entry made under old mask
+            } else {
+                scheduler.Unsuggest(current_priority, core, this);
+            }
+        }
+    }
+
+    for (u32 core = 0; core < GlobalScheduler::NUM_CPU_CORES; core++) {
+        if (((affinity_mask >> core) & 1) != 0) {
+            if (core == processor_id) {
+                scheduler.Schedule(current_priority, core, this); // Re-add under the new mask
+            } else {
+                scheduler.Suggest(current_priority, core, this);
+            }
+        }
+    }
+
+    scheduler.SetReselectionPending();
+}
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////

 /**
--- a/src/core/hle/kernel/thread.h
+++ b/src/core/hle/kernel/thread.h
@@ -75,6 +75,26 @@ enum class ThreadActivity : u32 {
    Paused = 1,
 };

+enum class ThreadSchedStatus : u32 { // Kept in the LowMask bits of Thread::scheduling_state
+    None = 0,
+    Paused = 1,
+    Runnable = 2, // The status for which Thread places itself in the scheduler queues
+    Exited = 3,
+};
+
+enum class ThreadSchedFlags : u32 { // NOTE(review): presumably OR-ed into scheduling_state
+    ProcessPauseFlag = 1 << 4,      // Bit 4, inside ThreadSchedMasks::ForcePauseMask
+    ThreadPauseFlag = 1 << 5,       // Bit 5, inside ThreadSchedMasks::ForcePauseMask
+    ProcessDebugPauseFlag = 1 << 6, // Bit 6, inside ThreadSchedMasks::ForcePauseMask
+    KernelInitPauseFlag = 1 << 8,   // Bit 8, outside ForcePauseMask (0x0070)
+};
+
+enum class ThreadSchedMasks : u32 {
+    LowMask = 0x000f,        // Bits 0-3: the ThreadSchedStatus value
+    HighMask = 0xfff0,       // Bits 4-15: preserved when the status is replaced
+    ForcePauseMask = 0x0070, // Bits 4-6: the Process/Thread/ProcessDebug pause flags
+};
+
 class Thread final : public WaitObject {
 public:
    using MutexWaitingThreads = std::vector<SharedPtr<Thread>>;
@@ -278,6 +298,10 @@ public:
        return processor_id;
    }

+    void SetProcessorID(s32 new_core) {
+        processor_id = new_core; // Plain store; no scheduler queue is updated here
+    }
+
    Process* GetOwnerProcess() {
        return owner_process;
    }
@@ -295,6 +319,9 @@ public:
    }

    void ClearWaitObjects() {
+        for (const auto& waiting_object : wait_objects) {
+            waiting_object->RemoveWaitingThread(this);
+        }
        wait_objects.clear();
    }

@@ -383,11 +410,47 @@ public:
    /// Sleeps this thread for the given amount of nanoseconds.
    void Sleep(s64 nanoseconds);

+    /// Yields this thread without rebalancing loads.
+    bool YieldSimple();
+
+    /// Yields this thread and does a load rebalancing.
+    bool YieldAndBalanceLoad();
+
+    /// Yields this thread and if the core is left idle, loads are rebalanced
+    bool YieldAndWaitForLoadBalancing();
+
+    void IncrementYieldCount() {
+        yield_count++; // One more redundant yield recorded for this thread
+    }
+
+    u64 GetYieldCount() const {
+        return yield_count; // Number of redundant yields recorded so far
+    }
+
+    ThreadSchedStatus GetSchedulingStatus() const {
+        const u32 status_bits = scheduling_state & static_cast<u32>(ThreadSchedMasks::LowMask);
+        return static_cast<ThreadSchedStatus>(status_bits); // High flag bits are masked away
+    }
+
+    bool IsRunning() const {
+        return is_running; // Flag written only through SetIsRunning in this header
+    }
+
+    void SetIsRunning(bool value) {
+        is_running = value; // Plain store; scheduling_state is left untouched
+    }
+
 private:
    explicit Thread(KernelCore& kernel);
    ~Thread() override;

-    void ChangeScheduler();
+    void SetSchedulingStatus(ThreadSchedStatus new_status);
+    void SetCurrentPriority(u32 new_priority);
+    ResultCode SetCoreAndAffinityMask(s32 new_core, u64 new_affinity_mask);
+
+    void AdjustSchedulingOnStatus(u32 old_flags);
+    void AdjustSchedulingOnPriority(u32 old_priority);
+    void AdjustSchedulingOnAffinity(u64 old_affinity_mask, s32 old_core);

    Core::ARM_Interface::ThreadContext context{};

@@ -409,6 +472,8 @@ private:

    u64 total_cpu_time_ticks = 0; ///< Total CPU running ticks.
    u64 last_running_ticks = 0;   ///< CPU tick when thread was last running
+    u64 yield_count = 0;          ///< Number of redundant yields performed by this thread.
+                                  ///< A redundant yield is one that changes no scheduling.

    s32 processor_id = 0;

@@ -453,6 +518,13 @@ private:

    ThreadActivity activity = ThreadActivity::Normal;

+    s32 ideal_core_override = -1;
+    u64 affinity_mask_override = 0x1;
+    u32 affinity_override_count = 0;
+
+    u32 scheduling_state = 0;
+    bool is_running = false;
+
    std::string name;
 };

--- a/src/core/hle/kernel/wait_object.cpp
+++ b/src/core/hle/kernel/wait_object.cpp
@@ -6,6 +6,9 @@
 #include "common/assert.h"
 #include "common/common_types.h"
 #include "common/logging/log.h"
+#include "core/core.h"
+#include "core/core_cpu.h"
+#include "core/hle/kernel/kernel.h"
 #include "core/hle/kernel/object.h"
 #include "core/hle/kernel/process.h"
 #include "core/hle/kernel/thread.h"
@@ -82,9 +85,6 @@ void WaitObject::WakeupWaitingThread(SharedPtr<Thread> thread) {

    const std::size_t index = thread->GetWaitObjectIndex(this);

-    for (const auto& object : thread->GetWaitObjects()) {
-        object->RemoveWaitingThread(thread.get());
-    }
    thread->ClearWaitObjects();

    thread->CancelWakeupTimer();
@@ -95,6 +95,7 @@ void WaitObject::WakeupWaitingThread(SharedPtr<Thread> thread) {
    }
    if (resume) {
        thread->ResumeFromWait();
+        Core::System::GetInstance().PrepareReschedule(thread->GetProcessorID());
    }
 }

--- a/src/core/hle/kernel/writable_event.cpp
+++ b/src/core/hle/kernel/writable_event.cpp
@@ -15,8 +15,7 @@ namespace Kernel {
 WritableEvent::WritableEvent(KernelCore& kernel) : Object{kernel} {}
 WritableEvent::~WritableEvent() = default;

-EventPair WritableEvent::CreateEventPair(KernelCore& kernel, ResetType reset_type,
-                                         std::string name) {
+EventPair WritableEvent::CreateEventPair(KernelCore& kernel, std::string name) {
    SharedPtr<WritableEvent> writable_event(new WritableEvent(kernel));
    SharedPtr<ReadableEvent> readable_event(new ReadableEvent(kernel));

@@ -24,7 +23,6 @@ EventPair WritableEvent::CreateEventPair(KernelCore& kernel, ResetType reset_typ
    writable_event->readable = readable_event;
    readable_event->name = name + ":Readable";
    readable_event->signaled = false;
-    readable_event->reset_type = reset_type;

    return {std::move(readable_event), std::move(writable_event)};
 }
@@ -33,10 +31,6 @@ SharedPtr<ReadableEvent> WritableEvent::GetReadableEvent() const {
    return readable;
 }

-ResetType WritableEvent::GetResetType() const {
-    return readable->reset_type;
-}
-
 void WritableEvent::Signal() {
    readable->Signal();
 }
--- a/src/core/hle/kernel/writable_event.h
+++ b/src/core/hle/kernel/writable_event.h
@@ -24,11 +24,9 @@ public:
    /**
     * Creates an event
     * @param kernel The kernel instance to create this event under.
-     * @param reset_type ResetType describing how to create event
     * @param name Optional name of event
     */
-    static EventPair CreateEventPair(KernelCore& kernel, ResetType reset_type,
-                                     std::string name = "Unknown");
+    static EventPair CreateEventPair(KernelCore& kernel, std::string name = "Unknown");

    std::string GetTypeName() const override {
        return "WritableEvent";
@@ -44,8 +42,6 @@ public:

    SharedPtr<ReadableEvent> GetReadableEvent() const;

-    ResetType GetResetType() const;
-
    void Signal();
    void Clear();
    bool IsSignaled() const;
--- a/src/core/hle/service/am/am.cpp
+++ b/src/core/hle/service/am/am.cpp
@@ -289,8 +289,8 @@ ISelfController::ISelfController(Core::System& system,
    RegisterHandlers(functions);

    auto& kernel = system.Kernel();
-    launchable_event = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::Manual,
-                                                              "ISelfController:LaunchableEvent");
+    launchable_event =
+        Kernel::WritableEvent::CreateEventPair(kernel, "ISelfController:LaunchableEvent");

    // This event is created by AM on the first time GetAccumulatedSuspendedTickChangedEvent() is
    // called. Yuzu can just create it unconditionally, since it doesn't need to support multiple
@@ -298,7 +298,7 @@ ISelfController::ISelfController(Core::System& system,
    // suspended if the event has previously been created by a call to
    // GetAccumulatedSuspendedTickChangedEvent.
    accumulated_suspended_tick_changed_event = Kernel::WritableEvent::CreateEventPair(
-        kernel, Kernel::ResetType::Manual, "ISelfController:AccumulatedSuspendedTickChangedEvent");
+        kernel, "ISelfController:AccumulatedSuspendedTickChangedEvent");
    accumulated_suspended_tick_changed_event.writable->Signal();
 }

@@ -523,10 +523,10 @@ void ISelfController::GetAccumulatedSuspendedTickChangedEvent(Kernel::HLERequest
 }

 AppletMessageQueue::AppletMessageQueue(Kernel::KernelCore& kernel) {
-    on_new_message = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::Manual,
-                                                            "AMMessageQueue:OnMessageRecieved");
-    on_operation_mode_changed = Kernel::WritableEvent::CreateEventPair(
-        kernel, Kernel::ResetType::Automatic, "AMMessageQueue:OperationModeChanged");
+    on_new_message =
+        Kernel::WritableEvent::CreateEventPair(kernel, "AMMessageQueue:OnMessageRecieved");
+    on_operation_mode_changed =
+        Kernel::WritableEvent::CreateEventPair(kernel, "AMMessageQueue:OperationModeChanged");
 }

 AppletMessageQueue::~AppletMessageQueue() = default;
@@ -1091,7 +1091,7 @@ IApplicationFunctions::IApplicationFunctions(Core::System& system_)

    auto& kernel = system.Kernel();
    gpu_error_detected_event = Kernel::WritableEvent::CreateEventPair(
-        kernel, Kernel::ResetType::Manual, "IApplicationFunctions:GpuErrorDetectedSystemEvent");
+        kernel, "IApplicationFunctions:GpuErrorDetectedSystemEvent");
 }

 IApplicationFunctions::~IApplicationFunctions() = default;
--- a/src/core/hle/service/am/applets/applets.cpp
+++ b/src/core/hle/service/am/applets/applets.cpp
@@ -24,12 +24,12 @@
 namespace Service::AM::Applets {

 AppletDataBroker::AppletDataBroker(Kernel::KernelCore& kernel) {
-    state_changed_event = Kernel::WritableEvent::CreateEventPair(
-        kernel, Kernel::ResetType::Manual, "ILibraryAppletAccessor:StateChangedEvent");
-    pop_out_data_event = Kernel::WritableEvent::CreateEventPair(
-        kernel, Kernel::ResetType::Manual, "ILibraryAppletAccessor:PopDataOutEvent");
+    state_changed_event =
+        Kernel::WritableEvent::CreateEventPair(kernel, "ILibraryAppletAccessor:StateChangedEvent");
+    pop_out_data_event =
+        Kernel::WritableEvent::CreateEventPair(kernel, "ILibraryAppletAccessor:PopDataOutEvent");
    pop_interactive_out_data_event = Kernel::WritableEvent::CreateEventPair(
-        kernel, Kernel::ResetType::Manual, "ILibraryAppletAccessor:PopInteractiveDataOutEvent");
+        kernel, "ILibraryAppletAccessor:PopInteractiveDataOutEvent");
 }

 AppletDataBroker::~AppletDataBroker() = default;
--- a/src/core/hle/service/aoc/aoc_u.cpp
+++ b/src/core/hle/service/aoc/aoc_u.cpp
@@ -67,8 +67,8 @@ AOC_U::AOC_U(Core::System& system)
    RegisterHandlers(functions);

    auto& kernel = system.Kernel();
-    aoc_change_event = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::Manual,
-                                                              "GetAddOnContentListChanged:Event");
+    aoc_change_event =
+        Kernel::WritableEvent::CreateEventPair(kernel, "GetAddOnContentListChanged:Event");
 }

 AOC_U::~AOC_U() = default;
--- a/src/core/hle/service/audio/audout_u.cpp
+++ b/src/core/hle/service/audio/audout_u.cpp
@@ -65,8 +65,8 @@ public:
        RegisterHandlers(functions);

        // This is the event handle used to check if the audio buffer was released
-        buffer_event = Kernel::WritableEvent::CreateEventPair(
-            system.Kernel(), Kernel::ResetType::Manual, "IAudioOutBufferReleased");
+        buffer_event =
+            Kernel::WritableEvent::CreateEventPair(system.Kernel(), "IAudioOutBufferReleased");

        stream = audio_core.OpenStream(system.CoreTiming(), audio_params.sample_rate,
                                       audio_params.channel_count, std::move(unique_name),
--- a/src/core/hle/service/audio/audren_u.cpp
+++ b/src/core/hle/service/audio/audren_u.cpp
@@ -47,8 +47,8 @@ public:
        // clang-format on
        RegisterHandlers(functions);

-        system_event = Kernel::WritableEvent::CreateEventPair(
-            system.Kernel(), Kernel::ResetType::Manual, "IAudioRenderer:SystemEvent");
+        system_event =
+            Kernel::WritableEvent::CreateEventPair(system.Kernel(), "IAudioRenderer:SystemEvent");
        renderer = std::make_unique<AudioCore::AudioRenderer>(
            system.CoreTiming(), audren_params, system_event.writable, instance_number);
    }
@@ -180,17 +180,17 @@ public:
        RegisterHandlers(functions);

        auto& kernel = system.Kernel();
-        buffer_event = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::Automatic,
-                                                              "IAudioOutBufferReleasedEvent");
+        buffer_event =
+            Kernel::WritableEvent::CreateEventPair(kernel, "IAudioOutBufferReleasedEvent");

        // Should be similar to audio_output_device_switch_event
        audio_input_device_switch_event = Kernel::WritableEvent::CreateEventPair(
-            kernel, Kernel::ResetType::Automatic, "IAudioDevice:AudioInputDeviceSwitchedEvent");
+            kernel, "IAudioDevice:AudioInputDeviceSwitchedEvent");

        // Should only be signalled when an audio output device has been changed, example: speaker
        // to headset
        audio_output_device_switch_event = Kernel::WritableEvent::CreateEventPair(
-            kernel, Kernel::ResetType::Automatic, "IAudioDevice:AudioOutputDeviceSwitchedEvent");
+            kernel, "IAudioDevice:AudioOutputDeviceSwitchedEvent");
    }

 private:
--- a/src/core/hle/service/bcat/backend/backend.cpp
+++ b/src/core/hle/service/bcat/backend/backend.cpp
@@ -13,8 +13,7 @@ namespace Service::BCAT {
 ProgressServiceBackend::ProgressServiceBackend(Kernel::KernelCore& kernel,
                                               std::string_view event_name) {
    event = Kernel::WritableEvent::CreateEventPair(
-        kernel, Kernel::ResetType::Automatic,
-        std::string("ProgressServiceBackend:UpdateEvent:").append(event_name));
+        kernel, std::string("ProgressServiceBackend:UpdateEvent:").append(event_name));
 }

 Kernel::SharedPtr<Kernel::ReadableEvent> ProgressServiceBackend::GetEvent() const {
--- a/src/core/hle/service/btdrv/btdrv.cpp
+++ b/src/core/hle/service/btdrv/btdrv.cpp
@@ -34,8 +34,7 @@ public:
        RegisterHandlers(functions);

        auto& kernel = system.Kernel();
-        register_event = Kernel::WritableEvent::CreateEventPair(
-            kernel, Kernel::ResetType::Automatic, "BT:RegisterEvent");
+        register_event = Kernel::WritableEvent::CreateEventPair(kernel, "BT:RegisterEvent");
    }

 private:
--- a/src/core/hle/service/btm/btm.cpp
+++ b/src/core/hle/service/btm/btm.cpp
@@ -57,14 +57,12 @@ public:
        RegisterHandlers(functions);

        auto& kernel = system.Kernel();
-        scan_event = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::Automatic,
-                                                            "IBtmUserCore:ScanEvent");
-        connection_event = Kernel::WritableEvent::CreateEventPair(
-            kernel, Kernel::ResetType::Automatic, "IBtmUserCore:ConnectionEvent");
-        service_discovery = Kernel::WritableEvent::CreateEventPair(
-            kernel, Kernel::ResetType::Automatic, "IBtmUserCore:Discovery");
-        config_event = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::Automatic,
-                                                              "IBtmUserCore:ConfigEvent");
+        scan_event = Kernel::WritableEvent::CreateEventPair(kernel, "IBtmUserCore:ScanEvent");
+        connection_event =
+            Kernel::WritableEvent::CreateEventPair(kernel, "IBtmUserCore:ConnectionEvent");
+        service_discovery =
+            Kernel::WritableEvent::CreateEventPair(kernel, "IBtmUserCore:Discovery");
+        config_event = Kernel::WritableEvent::CreateEventPair(kernel, "IBtmUserCore:ConfigEvent");
    }

 private:
--- a/src/core/hle/service/filesystem/filesystem.cpp
+++ b/src/core/hle/service/filesystem/filesystem.cpp
@@ -40,7 +40,10 @@ static FileSys::VirtualDir GetDirectoryRelativeWrapped(FileSys::VirtualDir base,
    if (dir_name.empty() || dir_name == "." || dir_name == "/" || dir_name == "\\")
        return base;

-    return base->GetDirectoryRelative(dir_name);
+    const auto res = base->GetDirectoryRelative(dir_name);
+    if (res == nullptr)
+        return base->CreateDirectoryRelative(dir_name);
+    return res;
 }

 VfsDirectoryServiceWrapper::VfsDirectoryServiceWrapper(FileSys::VirtualDir backing_)
--- a/src/core/hle/service/friend/friend.cpp
+++ b/src/core/hle/service/friend/friend.cpp
@@ -162,7 +162,7 @@ public:
        RegisterHandlers(functions);

        notification_event = Kernel::WritableEvent::CreateEventPair(
-            system.Kernel(), Kernel::ResetType::Manual, "INotificationService:NotifyEvent");
+            system.Kernel(), "INotificationService:NotifyEvent");
    }

 private:
--- a/src/core/hle/service/hid/controllers/npad.cpp
+++ b/src/core/hle/service/hid/controllers/npad.cpp
@@ -174,7 +174,7 @@ void Controller_NPad::OnInit() {
    auto& kernel = system.Kernel();
    for (std::size_t i = 0; i < styleset_changed_events.size(); i++) {
        styleset_changed_events[i] = Kernel::WritableEvent::CreateEventPair(
-            kernel, Kernel::ResetType::Manual, fmt::format("npad:NpadStyleSetChanged_{}", i));
+            kernel, fmt::format("npad:NpadStyleSetChanged_{}", i));
    }

    if (!IsControllerActivated()) {
@@ -583,36 +583,6 @@ bool Controller_NPad::SwapNpadAssignment(u32 npad_id_1, u32 npad_id_2) {
    return true;
 }

-bool Controller_NPad::IsControllerSupported(NPadControllerType controller) {
-    if (controller == NPadControllerType::Handheld) {
-        // Handheld is not even a supported type, lets stop here
-        if (std::find(supported_npad_id_types.begin(), supported_npad_id_types.end(),
-                      NPAD_HANDHELD) == supported_npad_id_types.end()) {
-            return false;
-        }
-        // Handheld should not be supported in docked mode
-        if (Settings::values.use_docked_mode) {
-            return false;
-        }
-    }
-    switch (controller) {
-    case NPadControllerType::ProController:
-        return style.pro_controller;
-    case NPadControllerType::Handheld:
-        return style.handheld;
-    case NPadControllerType::JoyDual:
-        return style.joycon_dual;
-    case NPadControllerType::JoyLeft:
-        return style.joycon_left;
-    case NPadControllerType::JoyRight:
-        return style.joycon_right;
-    case NPadControllerType::Pokeball:
-        return style.pokeball;
-    default:
-        return false;
-    }
-}
-
 Controller_NPad::LedPattern Controller_NPad::GetLedPattern(u32 npad_id) {
    if (npad_id == npad_id_list.back() || npad_id == npad_id_list[npad_id_list.size() - 2]) {
        // These are controllers without led patterns
@@ -659,25 +629,24 @@ void Controller_NPad::ClearAllConnectedControllers() {
 }

 void Controller_NPad::DisconnectAllConnectedControllers() {
-    std::for_each(connected_controllers.begin(), connected_controllers.end(),
-                  [](ControllerHolder& controller) { controller.is_connected = false; });
+    for (ControllerHolder& controller : connected_controllers) {
+        controller.is_connected = false;
+    }
 }

 void Controller_NPad::ConnectAllDisconnectedControllers() {
-    std::for_each(connected_controllers.begin(), connected_controllers.end(),
-                  [](ControllerHolder& controller) {
-                      if (controller.type != NPadControllerType::None && !controller.is_connected) {
-                          controller.is_connected = false;
-                      }
-                  });
+    for (ControllerHolder& controller : connected_controllers) {
+        if (controller.type != NPadControllerType::None && !controller.is_connected) {
+            controller.is_connected = true;
+        }
+    }
 }

 void Controller_NPad::ClearAllControllers() {
-    std::for_each(connected_controllers.begin(), connected_controllers.end(),
-                  [](ControllerHolder& controller) {
-                      controller.type = NPadControllerType::None;
-                      controller.is_connected = false;
-                  });
+    for (ControllerHolder& controller : connected_controllers) {
+        controller.type = NPadControllerType::None;
+        controller.is_connected = false;
+    }
 }

 u32 Controller_NPad::GetAndResetPressState() {
@@ -685,10 +654,10 @@ u32 Controller_NPad::GetAndResetPressState() {
 }

 bool Controller_NPad::IsControllerSupported(NPadControllerType controller) const {
-    const bool support_handheld =
-        std::find(supported_npad_id_types.begin(), supported_npad_id_types.end(), NPAD_HANDHELD) !=
-        supported_npad_id_types.end();
    if (controller == NPadControllerType::Handheld) {
+        const bool support_handheld =
+            std::find(supported_npad_id_types.begin(), supported_npad_id_types.end(),
+                      NPAD_HANDHELD) != supported_npad_id_types.end();
        // Handheld is not even a supported type, lets stop here
        if (!support_handheld) {
            return false;
@@ -700,6 +669,7 @@ bool Controller_NPad::IsControllerSupported(NPadControllerType controller) const

        return true;
    }
+
    if (std::any_of(supported_npad_id_types.begin(), supported_npad_id_types.end(),
                    [](u32 npad_id) { return npad_id <= MAX_NPAD_ID; })) {
        switch (controller) {
@@ -717,6 +687,7 @@ bool Controller_NPad::IsControllerSupported(NPadControllerType controller) const
            return false;
        }
    }
+
    return false;
 }

@@ -795,6 +766,7 @@ Controller_NPad::NPadControllerType Controller_NPad::DecideBestController(
        priority_list.push_back(NPadControllerType::JoyLeft);
        priority_list.push_back(NPadControllerType::JoyRight);
        priority_list.push_back(NPadControllerType::JoyDual);
+        break;
    }

    const auto iter = std::find_if(priority_list.begin(), priority_list.end(),
--- a/src/core/hle/service/hid/controllers/npad.h
+++ b/src/core/hle/service/hid/controllers/npad.h
@@ -301,6 +301,11 @@ private:
        bool is_connected;
    };

+    void InitNewlyAddedControler(std::size_t controller_idx);
+    bool IsControllerSupported(NPadControllerType controller) const;
+    NPadControllerType DecideBestController(NPadControllerType priority) const;
+    void RequestPadStateUpdate(u32 npad_id);
+
    u32 press_state{};

    NPadType style{};
@@ -321,12 +326,7 @@ private:
    std::array<ControllerHolder, 10> connected_controllers{};
    bool can_controllers_vibrate{true};

-    void InitNewlyAddedControler(std::size_t controller_idx);
-    bool IsControllerSupported(NPadControllerType controller) const;
-    NPadControllerType DecideBestController(NPadControllerType priority) const;
-    void RequestPadStateUpdate(u32 npad_id);
    std::array<ControllerPad, 10> npad_pad_states{};
-    bool IsControllerSupported(NPadControllerType controller);
    bool is_in_lr_assignment_mode{false};
    Core::System& system;
 };
--- a/src/core/hle/service/hid/hid.cpp
+++ b/src/core/hle/service/hid/hid.cpp
@@ -203,13 +203,13 @@ Hid::Hid(Core::System& system) : ServiceFramework("hid"), system(system) {
        {120, &Hid::SetNpadJoyHoldType, "SetNpadJoyHoldType"},
        {121, &Hid::GetNpadJoyHoldType, "GetNpadJoyHoldType"},
        {122, &Hid::SetNpadJoyAssignmentModeSingleByDefault, "SetNpadJoyAssignmentModeSingleByDefault"},
-        {123, nullptr, "SetNpadJoyAssignmentModeSingleByDefault"},
+        {123, &Hid::SetNpadJoyAssignmentModeSingle, "SetNpadJoyAssignmentModeSingle"},
        {124, &Hid::SetNpadJoyAssignmentModeDual, "SetNpadJoyAssignmentModeDual"},
        {125, &Hid::MergeSingleJoyAsDualJoy, "MergeSingleJoyAsDualJoy"},
        {126, &Hid::StartLrAssignmentMode, "StartLrAssignmentMode"},
        {127, &Hid::StopLrAssignmentMode, "StopLrAssignmentMode"},
        {128, &Hid::SetNpadHandheldActivationMode, "SetNpadHandheldActivationMode"},
-        {129, nullptr, "GetNpadHandheldActivationMode"},
+        {129, &Hid::GetNpadHandheldActivationMode, "GetNpadHandheldActivationMode"},
        {130, &Hid::SwapNpadAssignment, "SwapNpadAssignment"},
        {131, nullptr, "IsUnintendedHomeButtonInputProtectionEnabled"},
        {132, nullptr, "EnableUnintendedHomeButtonInputProtection"},
@@ -557,10 +557,126 @@ void Hid::SetNpadJoyAssignmentModeSingleByDefault(Kernel::HLERequestContext& ctx
    LOG_WARNING(Service_HID, "(STUBBED) called, npad_id={}, applet_resource_user_id={}", npad_id,
                applet_resource_user_id);

+    auto& controller = applet_resource->GetController<Controller_NPad>(HidController::NPad);
+    controller.SetNpadMode(npad_id, Controller_NPad::NPadAssignments::Single);
+
    IPC::ResponseBuilder rb{ctx, 2};
    rb.Push(RESULT_SUCCESS);
 }

+void Hid::SetNpadJoyAssignmentModeSingle(Kernel::HLERequestContext& ctx) {
+    // TODO: Check the differences between this and SetNpadJoyAssignmentModeSingleByDefault
+    IPC::RequestParser rp{ctx};
+    const auto npad_id{rp.Pop<u32>()};
+    const auto applet_resource_user_id{rp.Pop<u64>()};
+    const auto npad_joy_device_type{rp.Pop<u64>()};
+
+    LOG_WARNING(Service_HID,
+                "(STUBBED) called, npad_id={}, applet_resource_user_id={}, npad_joy_device_type={}",
+                npad_id, applet_resource_user_id, npad_joy_device_type);
+
+    auto& controller = applet_resource->GetController<Controller_NPad>(HidController::NPad);
+    controller.SetNpadMode(npad_id, Controller_NPad::NPadAssignments::Single);
+
+    IPC::ResponseBuilder rb{ctx, 2};
+    rb.Push(RESULT_SUCCESS);
+}
+
+void Hid::SetNpadJoyAssignmentModeDual(Kernel::HLERequestContext& ctx) {
+    IPC::RequestParser rp{ctx};
+    const auto npad_id{rp.Pop<u32>()};
+    const auto applet_resource_user_id{rp.Pop<u64>()};
+
+    LOG_DEBUG(Service_HID, "called, npad_id={}, applet_resource_user_id={}", npad_id,
+              applet_resource_user_id);
+
+    auto& controller = applet_resource->GetController<Controller_NPad>(HidController::NPad);
+    controller.SetNpadMode(npad_id, Controller_NPad::NPadAssignments::Dual);
+
+    IPC::ResponseBuilder rb{ctx, 2};
+    rb.Push(RESULT_SUCCESS);
+}
+
+void Hid::MergeSingleJoyAsDualJoy(Kernel::HLERequestContext& ctx) {
+    IPC::RequestParser rp{ctx};
+    const auto unknown_1{rp.Pop<u32>()};
+    const auto unknown_2{rp.Pop<u32>()};
+    const auto applet_resource_user_id{rp.Pop<u64>()};
+
+    LOG_WARNING(Service_HID,
+                "(STUBBED) called, unknown_1={}, unknown_2={}, applet_resource_user_id={}",
+                unknown_1, unknown_2, applet_resource_user_id);
+
+    IPC::ResponseBuilder rb{ctx, 2};
+    rb.Push(RESULT_SUCCESS);
+}
+
+void Hid::StartLrAssignmentMode(Kernel::HLERequestContext& ctx) {
+    IPC::RequestParser rp{ctx};
+    const auto applet_resource_user_id{rp.Pop<u64>()};
+
+    LOG_DEBUG(Service_HID, "called, applet_resource_user_id={}", applet_resource_user_id);
+    auto& controller = applet_resource->GetController<Controller_NPad>(HidController::NPad);
+    controller.StartLRAssignmentMode();
+
+    IPC::ResponseBuilder rb{ctx, 2};
+    rb.Push(RESULT_SUCCESS);
+}
+
+void Hid::StopLrAssignmentMode(Kernel::HLERequestContext& ctx) {
+    IPC::RequestParser rp{ctx};
+    const auto applet_resource_user_id{rp.Pop<u64>()};
+
+    LOG_DEBUG(Service_HID, "called, applet_resource_user_id={}", applet_resource_user_id);
+    auto& controller = applet_resource->GetController<Controller_NPad>(HidController::NPad);
+    controller.StopLRAssignmentMode();
+
+    IPC::ResponseBuilder rb{ctx, 2};
+    rb.Push(RESULT_SUCCESS);
+}
+
+void Hid::SetNpadHandheldActivationMode(Kernel::HLERequestContext& ctx) {
+    IPC::RequestParser rp{ctx};
+    const auto applet_resource_user_id{rp.Pop<u64>()};
+    const auto mode{rp.Pop<u64>()};
+
+    LOG_WARNING(Service_HID, "(STUBBED) called, applet_resource_user_id={}, mode={}",
+                applet_resource_user_id, mode);
+
+    IPC::ResponseBuilder rb{ctx, 2};
+    rb.Push(RESULT_SUCCESS);
+}
+
+void Hid::GetNpadHandheldActivationMode(Kernel::HLERequestContext& ctx) {
+    IPC::RequestParser rp{ctx};
+    const auto applet_resource_user_id{rp.Pop<u64>()};
+
+    LOG_WARNING(Service_HID, "(STUBBED) called, applet_resource_user_id={}",
+                applet_resource_user_id);
+
+    IPC::ResponseBuilder rb{ctx, 2};
+    rb.Push(RESULT_SUCCESS);
+}
+
+void Hid::SwapNpadAssignment(Kernel::HLERequestContext& ctx) {
+    IPC::RequestParser rp{ctx};
+    const auto npad_1{rp.Pop<u32>()};
+    const auto npad_2{rp.Pop<u32>()};
+    const auto applet_resource_user_id{rp.Pop<u64>()};
+
+    LOG_DEBUG(Service_HID, "called, applet_resource_user_id={}, npad_1={}, npad_2={}",
+              applet_resource_user_id, npad_1, npad_2);
+
+    auto& controller = applet_resource->GetController<Controller_NPad>(HidController::NPad);
+    IPC::ResponseBuilder rb{ctx, 2};
+    if (controller.SwapNpadAssignment(npad_1, npad_2)) {
+        rb.Push(RESULT_SUCCESS);
+    } else {
+        LOG_ERROR(Service_HID, "Npads are not connected!");
+        rb.Push(ERR_NPAD_NOT_CONNECTED);
+    }
+}
+
 void Hid::BeginPermitVibrationSession(Kernel::HLERequestContext& ctx) {
    IPC::RequestParser rp{ctx};
    const auto applet_resource_user_id{rp.Pop<u64>()};
@@ -635,47 +751,6 @@ void Hid::GetActualVibrationValue(Kernel::HLERequestContext& ctx) {
        applet_resource->GetController<Controller_NPad>(HidController::NPad).GetLastVibration());
 }

-void Hid::SetNpadJoyAssignmentModeDual(Kernel::HLERequestContext& ctx) {
-    IPC::RequestParser rp{ctx};
-    const auto npad_id{rp.Pop<u32>()};
-    const auto applet_resource_user_id{rp.Pop<u64>()};
-
-    LOG_DEBUG(Service_HID, "called, npad_id={}, applet_resource_user_id={}", npad_id,
-              applet_resource_user_id);
-
-    auto& controller = applet_resource->GetController<Controller_NPad>(HidController::NPad);
-    controller.SetNpadMode(npad_id, Controller_NPad::NPadAssignments::Dual);
-
-    IPC::ResponseBuilder rb{ctx, 2};
-    rb.Push(RESULT_SUCCESS);
-}
-
-void Hid::MergeSingleJoyAsDualJoy(Kernel::HLERequestContext& ctx) {
-    IPC::RequestParser rp{ctx};
-    const auto unknown_1{rp.Pop<u32>()};
-    const auto unknown_2{rp.Pop<u32>()};
-    const auto applet_resource_user_id{rp.Pop<u64>()};
-
-    LOG_WARNING(Service_HID,
-                "(STUBBED) called, unknown_1={}, unknown_2={}, applet_resource_user_id={}",
-                unknown_1, unknown_2, applet_resource_user_id);
-
-    IPC::ResponseBuilder rb{ctx, 2};
-    rb.Push(RESULT_SUCCESS);
-}
-
-void Hid::SetNpadHandheldActivationMode(Kernel::HLERequestContext& ctx) {
-    IPC::RequestParser rp{ctx};
-    const auto applet_resource_user_id{rp.Pop<u64>()};
-    const auto mode{rp.Pop<u64>()};
-
-    LOG_WARNING(Service_HID, "(STUBBED) called, applet_resource_user_id={}, mode={}",
-                applet_resource_user_id, mode);
-
-    IPC::ResponseBuilder rb{ctx, 2};
-    rb.Push(RESULT_SUCCESS);
-}
-
 void Hid::GetVibrationDeviceInfo(Kernel::HLERequestContext& ctx) {
    LOG_DEBUG(Service_HID, "called");

@@ -769,49 +844,6 @@ void Hid::SetPalmaBoostMode(Kernel::HLERequestContext& ctx) {
    rb.Push(RESULT_SUCCESS);
 }

-void Hid::StartLrAssignmentMode(Kernel::HLERequestContext& ctx) {
-    IPC::RequestParser rp{ctx};
-    const auto applet_resource_user_id{rp.Pop<u64>()};
-
-    LOG_DEBUG(Service_HID, "called, applet_resource_user_id={}", applet_resource_user_id);
-    auto& controller = applet_resource->GetController<Controller_NPad>(HidController::NPad);
-    controller.StartLRAssignmentMode();
-
-    IPC::ResponseBuilder rb{ctx, 2};
-    rb.Push(RESULT_SUCCESS);
-}
-
-void Hid::StopLrAssignmentMode(Kernel::HLERequestContext& ctx) {
-    IPC::RequestParser rp{ctx};
-    const auto applet_resource_user_id{rp.Pop<u64>()};
-
-    LOG_DEBUG(Service_HID, "called, applet_resource_user_id={}", applet_resource_user_id);
-    auto& controller = applet_resource->GetController<Controller_NPad>(HidController::NPad);
-    controller.StopLRAssignmentMode();
-
-    IPC::ResponseBuilder rb{ctx, 2};
-    rb.Push(RESULT_SUCCESS);
-}
-
-void Hid::SwapNpadAssignment(Kernel::HLERequestContext& ctx) {
-    IPC::RequestParser rp{ctx};
-    const auto npad_1{rp.Pop<u32>()};
-    const auto npad_2{rp.Pop<u32>()};
-    const auto applet_resource_user_id{rp.Pop<u64>()};
-
-    LOG_DEBUG(Service_HID, "called, applet_resource_user_id={}, npad_1={}, npad_2={}",
-              applet_resource_user_id, npad_1, npad_2);
-
-    auto& controller = applet_resource->GetController<Controller_NPad>(HidController::NPad);
-    IPC::ResponseBuilder rb{ctx, 2};
-    if (controller.SwapNpadAssignment(npad_1, npad_2)) {
-        rb.Push(RESULT_SUCCESS);
-    } else {
-        LOG_ERROR(Service_HID, "Npads are not connected!");
-        rb.Push(ERR_NPAD_NOT_CONNECTED);
-    }
-}
-
 class HidDbg final : public ServiceFramework<HidDbg> {
 public:
    explicit HidDbg() : ServiceFramework{"hid:dbg"} {
--- a/src/core/hle/service/hid/hid.h
+++ b/src/core/hle/service/hid/hid.h
@@ -106,14 +106,19 @@ private:
    void SetNpadJoyHoldType(Kernel::HLERequestContext& ctx);
    void GetNpadJoyHoldType(Kernel::HLERequestContext& ctx);
    void SetNpadJoyAssignmentModeSingleByDefault(Kernel::HLERequestContext& ctx);
+    void SetNpadJoyAssignmentModeSingle(Kernel::HLERequestContext& ctx);
+    void SetNpadJoyAssignmentModeDual(Kernel::HLERequestContext& ctx);
+    void MergeSingleJoyAsDualJoy(Kernel::HLERequestContext& ctx);
+    void StartLrAssignmentMode(Kernel::HLERequestContext& ctx);
+    void StopLrAssignmentMode(Kernel::HLERequestContext& ctx);
+    void SetNpadHandheldActivationMode(Kernel::HLERequestContext& ctx);
+    void GetNpadHandheldActivationMode(Kernel::HLERequestContext& ctx);
+    void SwapNpadAssignment(Kernel::HLERequestContext& ctx);
    void BeginPermitVibrationSession(Kernel::HLERequestContext& ctx);
    void EndPermitVibrationSession(Kernel::HLERequestContext& ctx);
    void SendVibrationValue(Kernel::HLERequestContext& ctx);
    void SendVibrationValues(Kernel::HLERequestContext& ctx);
    void GetActualVibrationValue(Kernel::HLERequestContext& ctx);
-    void SetNpadJoyAssignmentModeDual(Kernel::HLERequestContext& ctx);
-    void MergeSingleJoyAsDualJoy(Kernel::HLERequestContext& ctx);
-    void SetNpadHandheldActivationMode(Kernel::HLERequestContext& ctx);
    void GetVibrationDeviceInfo(Kernel::HLERequestContext& ctx);
    void CreateActiveVibrationDeviceList(Kernel::HLERequestContext& ctx);
    void PermitVibration(Kernel::HLERequestContext& ctx);
@@ -123,9 +128,6 @@ private:
    void StopSixAxisSensor(Kernel::HLERequestContext& ctx);
    void SetIsPalmaAllConnectable(Kernel::HLERequestContext& ctx);
    void SetPalmaBoostMode(Kernel::HLERequestContext& ctx);
-    void StartLrAssignmentMode(Kernel::HLERequestContext& ctx);
-    void StopLrAssignmentMode(Kernel::HLERequestContext& ctx);
-    void SwapNpadAssignment(Kernel::HLERequestContext& ctx);

    std::shared_ptr<IAppletResource> applet_resource;
    Core::System& system;
--- a/src/core/hle/service/lbl/lbl.cpp
+++ b/src/core/hle/service/lbl/lbl.cpp
@@ -10,6 +10,8 @@
 #include "core/hle/service/lbl/lbl.h"
 #include "core/hle/service/service.h"
 #include "core/hle/service/sm/sm.h"
+#include "core/settings.h"
+#include "video_core/renderer_base.h"

 namespace Service::LBL {

@@ -18,21 +20,21 @@ public:
    explicit LBL() : ServiceFramework{"lbl"} {
        // clang-format off
        static const FunctionInfo functions[] = {
-            {0, nullptr, "SaveCurrentSetting"},
-            {1, nullptr, "LoadCurrentSetting"},
-            {2, nullptr, "SetCurrentBrightnessSetting"},
-            {3, nullptr, "GetCurrentBrightnessSetting"},
-            {4, nullptr, "ApplyCurrentBrightnessSettingToBacklight"},
-            {5, nullptr, "GetBrightnessSettingAppliedToBacklight"},
-            {6, nullptr, "SwitchBacklightOn"},
-            {7, nullptr, "SwitchBacklightOff"},
-            {8, nullptr, "GetBacklightSwitchStatus"},
-            {9, nullptr, "EnableDimming"},
-            {10, nullptr, "DisableDimming"},
-            {11, nullptr, "IsDimmingEnabled"},
-            {12, nullptr, "EnableAutoBrightnessControl"},
-            {13, nullptr, "DisableAutoBrightnessControl"},
-            {14, nullptr, "IsAutoBrightnessControlEnabled"},
+            {0, &LBL::SaveCurrentSetting, "SaveCurrentSetting"},
+            {1, &LBL::LoadCurrentSetting, "LoadCurrentSetting"},
+            {2, &LBL::SetCurrentBrightnessSetting, "SetCurrentBrightnessSetting"},
+            {3, &LBL::GetCurrentBrightnessSetting, "GetCurrentBrightnessSetting"},
+            {4, &LBL::ApplyCurrentBrightnessSettingToBacklight, "ApplyCurrentBrightnessSettingToBacklight"},
+            {5, &LBL::GetBrightnessSettingAppliedToBacklight, "GetBrightnessSettingAppliedToBacklight"},
+            {6, &LBL::SwitchBacklightOn, "SwitchBacklightOn"},
+            {7, &LBL::SwitchBacklightOff, "SwitchBacklightOff"},
+            {8, &LBL::GetBacklightSwitchStatus, "GetBacklightSwitchStatus"},
+            {9, &LBL::EnableDimming, "EnableDimming"},
+            {10, &LBL::DisableDimming, "DisableDimming"},
+            {11, &LBL::IsDimmingEnabled, "IsDimmingEnabled"},
+            {12, &LBL::EnableAutoBrightnessControl, "EnableAutoBrightnessControl"},
+            {13, &LBL::DisableAutoBrightnessControl, "DisableAutoBrightnessControl"},
+            {14, &LBL::IsAutoBrightnessControlEnabled, "IsAutoBrightnessControlEnabled"},
            {15, nullptr, "SetAmbientLightSensorValue"},
            {16, nullptr, "GetAmbientLightSensorValue"},
            {17, nullptr, "SetBrightnessReflectionDelayLevel"},
@@ -42,8 +44,8 @@ public:
            {21, nullptr, "SetCurrentAmbientLightSensorMapping"},
            {22, nullptr, "GetCurrentAmbientLightSensorMapping"},
            {23, nullptr, "IsAmbientLightSensorAvailable"},
-            {24, nullptr, "SetCurrentBrightnessSettingForVrMode"},
-            {25, nullptr, "GetCurrentBrightnessSettingForVrMode"},
+            {24, &LBL::SetCurrentBrightnessSettingForVrMode, "SetCurrentBrightnessSettingForVrMode"},
+            {25, &LBL::GetCurrentBrightnessSettingForVrMode, "GetCurrentBrightnessSettingForVrMode"},
            {26, &LBL::EnableVrMode, "EnableVrMode"},
            {27, &LBL::DisableVrMode, "DisableVrMode"},
            {28, &LBL::IsVrModeEnabled, "IsVrModeEnabled"},
@@ -53,13 +55,209 @@ public:
        RegisterHandlers(functions);
    }

+    void LoadFromSettings() {
+        current_brightness = Settings::values.backlight_brightness;
+        current_vr_mode_brightness = Settings::values.backlight_brightness;
+
+        if (auto_brightness_enabled) {
+            return;
+        }
+
+        if (vr_mode_enabled) {
+            Renderer().SetCurrentBrightness(current_vr_mode_brightness);
+        } else {
+            Renderer().SetCurrentBrightness(current_brightness);
+        }
+    }
+
 private:
+    f32 GetAutoBrightnessValue() const {
+        return 0.5f;
+    }
+
+    VideoCore::RendererBase& Renderer() {
+        return Core::System::GetInstance().Renderer();
+    }
+
+    void SaveCurrentSetting(Kernel::HLERequestContext& ctx) {
+        LOG_DEBUG(Service_LBL, "called");
+
+        Settings::values.backlight_brightness = current_brightness;
+
+        IPC::ResponseBuilder rb{ctx, 2};
+        rb.Push(RESULT_SUCCESS);
+    }
+
+    void LoadCurrentSetting(Kernel::HLERequestContext& ctx) {
+        LOG_DEBUG(Service_LBL, "called");
+
+        LoadFromSettings();
+
+        IPC::ResponseBuilder rb{ctx, 2};
+        rb.Push(RESULT_SUCCESS);
+    }
+
+    void SetCurrentBrightnessSetting(Kernel::HLERequestContext& ctx) {
+        IPC::RequestParser rp{ctx};
+        const auto value = rp.PopRaw<f32>();
+
+        LOG_DEBUG(Service_LBL, "called, value={:.3f}", value);
+
+        current_brightness = std::clamp(value, 0.0f, 1.0f);
+
+        IPC::ResponseBuilder rb{ctx, 2};
+        rb.Push(RESULT_SUCCESS);
+    }
+
+    void GetCurrentBrightnessSetting(Kernel::HLERequestContext& ctx) {
+        LOG_DEBUG(Service_LBL, "called");
+
+        IPC::ResponseBuilder rb{ctx, 3};
+        rb.Push(RESULT_SUCCESS);
+        rb.Push(current_brightness);
+    }
+
+    void ApplyCurrentBrightnessSettingToBacklight(Kernel::HLERequestContext& ctx) {
+        LOG_DEBUG(Service_LBL, "called");
+
+        if (!auto_brightness_enabled) {
+            Renderer().SetCurrentBrightness(vr_mode_enabled ? current_vr_mode_brightness
+                                                            : current_brightness);
+        }
+
+        IPC::ResponseBuilder rb{ctx, 2};
+        rb.Push(RESULT_SUCCESS);
+    }
+
+    void GetBrightnessSettingAppliedToBacklight(Kernel::HLERequestContext& ctx) {
+        LOG_DEBUG(Service_LBL, "called");
+
+        IPC::ResponseBuilder rb{ctx, 3};
+        rb.Push(RESULT_SUCCESS);
+        rb.Push(Renderer().GetCurrentResultantBrightness());
+    }
+
+    void SwitchBacklightOn(Kernel::HLERequestContext& ctx) {
+        IPC::RequestParser rp{ctx};
+        const auto fade_time = rp.PopRaw<u64>();
+
+        LOG_DEBUG(Service_LBL, "called, fade_time={:016X}", fade_time);
+
+        Renderer().SetBacklightStatus(true, fade_time);
+
+        IPC::ResponseBuilder rb{ctx, 2};
+        rb.Push(RESULT_SUCCESS);
+    }
+
+    void SwitchBacklightOff(Kernel::HLERequestContext& ctx) {
+        IPC::RequestParser rp{ctx};
+        const auto fade_time = rp.PopRaw<u64>();
+
+        LOG_DEBUG(Service_LBL, "called, fade_time={:016X}", fade_time);
+
+        Renderer().SetBacklightStatus(false, fade_time);
+
+        IPC::ResponseBuilder rb{ctx, 2};
+        rb.Push(RESULT_SUCCESS);
+    }
+
+    void GetBacklightSwitchStatus(Kernel::HLERequestContext& ctx) {
+        LOG_DEBUG(Service_LBL, "called");
+
+        IPC::ResponseBuilder rb{ctx, 3};
+        rb.Push(RESULT_SUCCESS);
+        rb.Push<u8>(Renderer().GetBacklightStatus());
+    }
+
+    void EnableDimming(Kernel::HLERequestContext& ctx) {
+        LOG_DEBUG(Service_LBL, "called");
+
+        dimming_enabled = true;
+
+        IPC::ResponseBuilder rb{ctx, 2};
+        rb.Push(RESULT_SUCCESS);
+    }
+
+    void DisableDimming(Kernel::HLERequestContext& ctx) {
+        LOG_DEBUG(Service_LBL, "called");
+
+        dimming_enabled = false; // flag is reported back by IsDimmingEnabled
+
+        IPC::ResponseBuilder rb{ctx, 2};
+        rb.Push(RESULT_SUCCESS);
+    }
+
+    void IsDimmingEnabled(Kernel::HLERequestContext& ctx) {
+        LOG_DEBUG(Service_LBL, "called");
+
+        IPC::ResponseBuilder rb{ctx, 3};
+        rb.Push(RESULT_SUCCESS);
+        rb.Push<u8>(dimming_enabled);
+    }
+
+    void EnableAutoBrightnessControl(Kernel::HLERequestContext& ctx) {
+        LOG_DEBUG(Service_LBL, "called");
+
+        auto_brightness_enabled = true;
+        Renderer().SetCurrentBrightness(GetAutoBrightnessValue());
+
+        IPC::ResponseBuilder rb{ctx, 2};
+        rb.Push(RESULT_SUCCESS);
+    }
+
+    void DisableAutoBrightnessControl(Kernel::HLERequestContext& ctx) {
+        LOG_DEBUG(Service_LBL, "called");
+
+        auto_brightness_enabled = false; // manual brightness applies again, honoring VR mode
+        Renderer().SetCurrentBrightness(vr_mode_enabled ? current_vr_mode_brightness
+                                                        : current_brightness);
+        IPC::ResponseBuilder rb{ctx, 2};
+        rb.Push(RESULT_SUCCESS);
+    }
+
+    void IsAutoBrightnessControlEnabled(Kernel::HLERequestContext& ctx) {
+        LOG_DEBUG(Service_LBL, "called");
+
+        IPC::ResponseBuilder rb{ctx, 3};
+        rb.Push(RESULT_SUCCESS);
+        rb.Push<u8>(auto_brightness_enabled);
+    }
+
+    void SetCurrentBrightnessSettingForVrMode(Kernel::HLERequestContext& ctx) {
+        IPC::RequestParser rp{ctx};
+        const auto value = rp.PopRaw<f32>();
+
+        LOG_DEBUG(Service_LBL, "called, value={:.3f}", value);
+
+        current_vr_mode_brightness = std::clamp(value, 0.0f, 1.0f); // keep within [0, 1]
+
+        if (vr_mode_enabled && !auto_brightness_enabled) {
+            Renderer().SetCurrentBrightness(current_vr_mode_brightness);
+        }
+
+        IPC::ResponseBuilder rb{ctx, 2};
+        rb.Push(RESULT_SUCCESS);
+    }
+
+    void GetCurrentBrightnessSettingForVrMode(Kernel::HLERequestContext& ctx) {
+        LOG_DEBUG(Service_LBL, "called");
+
+        IPC::ResponseBuilder rb{ctx, 3};
+        rb.Push(RESULT_SUCCESS);
+        rb.Push(current_vr_mode_brightness);
+    }
+
    void EnableVrMode(Kernel::HLERequestContext& ctx) {
        LOG_DEBUG(Service_LBL, "called");

        IPC::ResponseBuilder rb{ctx, 2};
        rb.Push(RESULT_SUCCESS);

+        if (!vr_mode_enabled && !auto_brightness_enabled &&
+            current_brightness != current_vr_mode_brightness) {
+            Renderer().SetCurrentBrightness(current_vr_mode_brightness);
+        }
+
        vr_mode_enabled = true;
    }

@@ -69,6 +267,11 @@ private:
        IPC::ResponseBuilder rb{ctx, 2};
        rb.Push(RESULT_SUCCESS);

+        if (vr_mode_enabled && !auto_brightness_enabled &&
+            current_brightness != current_vr_mode_brightness) {
+            Renderer().SetCurrentBrightness(current_brightness);
+        }
+
        vr_mode_enabled = false;
    }

@@ -80,9 +283,24 @@ private:
        rb.Push(vr_mode_enabled);
    }

+    bool auto_brightness_enabled = false;
+    bool dimming_enabled = true;
+
+    f32 current_brightness = GetAutoBrightnessValue();
+    f32 current_vr_mode_brightness = GetAutoBrightnessValue();
+
    bool vr_mode_enabled = false;
 };

+void RequestLoadCurrentSetting(SM::ServiceManager& sm) {
+    // A reference cannot be null, so only the service lookup result needs checking.
+    const auto lbl = sm.GetService<LBL>("lbl");
+
+    if (lbl) {
+        lbl->LoadFromSettings();
+    }
+}
+
 void InstallInterfaces(SM::ServiceManager& sm) {
    std::make_shared<LBL>()->InstallAsService(sm);
 }
--- a/src/core/hle/service/lbl/lbl.h
+++ b/src/core/hle/service/lbl/lbl.h
@@ -10,6 +10,9 @@ class ServiceManager;

 namespace Service::LBL {

+// Requests that the LBL service registered with sm reload its brightness values from Settings
+void RequestLoadCurrentSetting(SM::ServiceManager& sm);
+
 void InstallInterfaces(SM::ServiceManager& sm);

 } // namespace Service::LBL
--- a/src/core/hle/service/nfp/nfp.cpp
+++ b/src/core/hle/service/nfp/nfp.cpp
@@ -26,8 +26,7 @@ constexpr ResultCode ERR_NO_APPLICATION_AREA(ErrorModule::NFP, 152);
 Module::Interface::Interface(std::shared_ptr<Module> module, Core::System& system, const char* name)
    : ServiceFramework(name), module(std::move(module)), system(system) {
    auto& kernel = system.Kernel();
-    nfc_tag_load = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::Automatic,
-                                                          "IUser:NFCTagDetected");
+    nfc_tag_load = Kernel::WritableEvent::CreateEventPair(kernel, "IUser:NFCTagDetected");
 }

 Module::Interface::~Interface() = default;
@@ -66,10 +65,9 @@ public:
        RegisterHandlers(functions);

        auto& kernel = system.Kernel();
-        deactivate_event = Kernel::WritableEvent::CreateEventPair(
-            kernel, Kernel::ResetType::Automatic, "IUser:DeactivateEvent");
-        availability_change_event = Kernel::WritableEvent::CreateEventPair(
-            kernel, Kernel::ResetType::Automatic, "IUser:AvailabilityChangeEvent");
+        deactivate_event = Kernel::WritableEvent::CreateEventPair(kernel, "IUser:DeactivateEvent");
+        availability_change_event =
+            Kernel::WritableEvent::CreateEventPair(kernel, "IUser:AvailabilityChangeEvent");
    }

 private:
--- a/src/core/hle/service/nifm/nifm.cpp
+++ b/src/core/hle/service/nifm/nifm.cpp
@@ -9,6 +9,7 @@
 #include "core/hle/kernel/writable_event.h"
 #include "core/hle/service/nifm/nifm.h"
 #include "core/hle/service/service.h"
+#include "core/settings.h"

 namespace Service::NIFM {

@@ -69,10 +70,8 @@ public:
        RegisterHandlers(functions);

        auto& kernel = system.Kernel();
-        event1 = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::Automatic,
-                                                        "IRequest:Event1");
-        event2 = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::Automatic,
-                                                        "IRequest:Event2");
+        event1 = Kernel::WritableEvent::CreateEventPair(kernel, "IRequest:Event1");
+        event2 = Kernel::WritableEvent::CreateEventPair(kernel, "IRequest:Event2");
    }

 private:
@@ -88,7 +87,12 @@ private:

        IPC::ResponseBuilder rb{ctx, 3};
        rb.Push(RESULT_SUCCESS);
-        rb.PushEnum(RequestState::Connected);
+
+        if (Settings::values.bcat_backend == "none") {
+            rb.PushEnum(RequestState::NotSubmitted);
+        } else {
+            rb.PushEnum(RequestState::Connected);
+        }
    }

    void GetResult(Kernel::HLERequestContext& ctx) {
@@ -196,14 +200,22 @@ private:

        IPC::ResponseBuilder rb{ctx, 3};
        rb.Push(RESULT_SUCCESS);
-        rb.Push<u8>(1);
+        if (Settings::values.bcat_backend == "none") {
+            rb.Push<u8>(0);
+        } else {
+            rb.Push<u8>(1);
+        }
    }
    void IsAnyInternetRequestAccepted(Kernel::HLERequestContext& ctx) {
        LOG_WARNING(Service_NIFM, "(STUBBED) called");

        IPC::ResponseBuilder rb{ctx, 3};
        rb.Push(RESULT_SUCCESS);
-        rb.Push<u8>(1);
+        if (Settings::values.bcat_backend == "none") {
+            rb.Push<u8>(0);
+        } else {
+            rb.Push<u8>(1);
+        }
    }
    Core::System& system;
 };
--- a/src/core/hle/service/nim/nim.cpp
+++ b/src/core/hle/service/nim/nim.cpp
@@ -141,8 +141,7 @@ public:

        auto& kernel = system.Kernel();
        finished_event = Kernel::WritableEvent::CreateEventPair(
-            kernel, Kernel::ResetType::Automatic,
-            "IEnsureNetworkClockAvailabilityService:FinishEvent");
+            kernel, "IEnsureNetworkClockAvailabilityService:FinishEvent");
    }

 private:
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp
@@ -22,6 +22,18 @@ u32 nvhost_nvdec::ioctl(Ioctl command, const std::vector<u8>& input, const std::
    switch (static_cast<IoctlCommand>(command.raw)) {
    case IoctlCommand::IocSetNVMAPfdCommand:
        return SetNVMAPfd(input, output);
+    case IoctlCommand::IocSubmit:
+        return Submit(input, output);
+    case IoctlCommand::IocGetSyncpoint:
+        return GetSyncpoint(input, output);
+    case IoctlCommand::IocGetWaitbase:
+        return GetWaitbase(input, output);
+    case IoctlCommand::IocMapBuffer:
+        return MapBuffer(input, output);
+    case IoctlCommand::IocMapBufferEx:
+        return MapBufferEx(input, output);
+    case IoctlCommand::IocUnmapBufferEx:
+        return UnmapBufferEx(input, output);
    }

    UNIMPLEMENTED_MSG("Unimplemented ioctl");
@@ -30,11 +42,67 @@ u32 nvhost_nvdec::ioctl(Ioctl command, const std::vector<u8>& input, const std::

 u32 nvhost_nvdec::SetNVMAPfd(const std::vector<u8>& input, std::vector<u8>& output) {
    IoctlSetNvmapFD params{};
-    std::memcpy(&params, input.data(), input.size());
+    std::memcpy(&params, input.data(), sizeof(IoctlSetNvmapFD));
    LOG_DEBUG(Service_NVDRV, "called, fd={}", params.nvmap_fd);

    nvmap_fd = params.nvmap_fd;
    return 0;
 }

+u32 nvhost_nvdec::Submit(const std::vector<u8>& input, std::vector<u8>& output) {
+    IoctlSubmit params{};
+    std::memcpy(&params, input.data(), sizeof(IoctlSubmit));
+    LOG_WARNING(Service_NVDRV, "(STUBBED) called");
+    std::memcpy(output.data(), &params, sizeof(IoctlSubmit));
+    return 0;
+}
+
+u32 nvhost_nvdec::GetSyncpoint(const std::vector<u8>& input, std::vector<u8>& output) {
+    IoctlGetSyncpoint params{};
+    std::memcpy(&params, input.data(), sizeof(IoctlGetSyncpoint));
+    LOG_INFO(Service_NVDRV, "called, unknown=0x{:X}", params.unknown);
+    params.value = 0; // Seems to be hard coded at 0
+    std::memcpy(output.data(), &params, sizeof(IoctlGetSyncpoint));
+    return 0;
+}
+
+u32 nvhost_nvdec::GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output) {
+    IoctlGetWaitbase params{};
+    std::memcpy(&params, input.data(), sizeof(IoctlGetWaitbase));
+    LOG_INFO(Service_NVDRV, "called, unknown=0x{:X}", params.unknown);
+    params.value = 0; // Seems to be hard coded at 0
+    std::memcpy(output.data(), &params, sizeof(IoctlGetWaitbase));
+    return 0;
+}
+
+u32 nvhost_nvdec::MapBuffer(const std::vector<u8>& input, std::vector<u8>& output) {
+    IoctlMapBuffer params{};
+    std::memcpy(&params, input.data(), sizeof(IoctlMapBuffer));
+    LOG_WARNING(Service_NVDRV, "(STUBBED) called with address={:08X}{:08X}", params.address_2,
+                params.address_1);
+    params.address_1 = 0;
+    params.address_2 = 0;
+    std::memcpy(output.data(), &params, sizeof(IoctlMapBuffer));
+    return 0;
+}
+
+u32 nvhost_nvdec::MapBufferEx(const std::vector<u8>& input, std::vector<u8>& output) {
+    IoctlMapBufferEx params{};
+    std::memcpy(&params, input.data(), sizeof(IoctlMapBufferEx));
+    LOG_WARNING(Service_NVDRV, "(STUBBED) called with address={:08X}{:08X}", params.address_2,
+                params.address_1);
+    params.address_1 = 0;
+    params.address_2 = 0;
+    std::memcpy(output.data(), &params, sizeof(IoctlMapBufferEx));
+    return 0;
+}
+
+u32 nvhost_nvdec::UnmapBufferEx(const std::vector<u8>& input, std::vector<u8>& output) {
+    IoctlUnmapBufferEx params{};
+    std::memcpy(&params, input.data(), sizeof(IoctlUnmapBufferEx));
+    LOG_WARNING(Service_NVDRV, "(STUBBED) called");
+    std::memcpy(output.data(), &params, sizeof(IoctlUnmapBufferEx));
+    return 0;
+}
+
 } // namespace Service::Nvidia::Devices
--- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.h
@@ -23,16 +23,66 @@ public:
 private:
    enum class IoctlCommand : u32_le {
        IocSetNVMAPfdCommand = 0x40044801,
+        IocSubmit = 0xC0400001,
+        IocGetSyncpoint = 0xC0080002,
+        IocGetWaitbase = 0xC0080003,
+        IocMapBuffer = 0xC01C0009,
+        IocMapBufferEx = 0xC0A40009,
+        IocUnmapBufferEx = 0xC0A4000A,
    };

    struct IoctlSetNvmapFD {
        u32_le nvmap_fd;
    };
-    static_assert(sizeof(IoctlSetNvmapFD) == 4, "IoctlSetNvmapFD is incorrect size");
+    static_assert(sizeof(IoctlSetNvmapFD) == 0x4, "IoctlSetNvmapFD is incorrect size");
+
+    struct IoctlSubmit {
+        INSERT_PADDING_BYTES(0x40); // TODO(DarkLordZach): RE this structure
+    };
+    static_assert(sizeof(IoctlSubmit) == 0x40, "IoctlSubmit has incorrect size");
+
+    struct IoctlGetSyncpoint {
+        u32 unknown; // seems to be ignored? Nintendo added this
+        u32 value;
+    };
+    static_assert(sizeof(IoctlGetSyncpoint) == 0x08, "IoctlGetSyncpoint has incorrect size");
+
+    struct IoctlGetWaitbase {
+        u32 unknown; // seems to be ignored? Nintendo added this
+        u32 value;
+    };
+    static_assert(sizeof(IoctlGetWaitbase) == 0x08, "IoctlGetWaitbase has incorrect size");
+
+    struct IoctlMapBuffer {
+        u32 unknown;
+        u32 address_1;
+        u32 address_2;
+        INSERT_PADDING_BYTES(0x10); // TODO(DarkLordZach): RE this structure
+    };
+    static_assert(sizeof(IoctlMapBuffer) == 0x1C, "IoctlMapBuffer is incorrect size");
+
+    struct IoctlMapBufferEx {
+        u32 unknown;
+        u32 address_1;
+        u32 address_2;
+        INSERT_PADDING_BYTES(0x98); // TODO(DarkLordZach): RE this structure
+    };
+    static_assert(sizeof(IoctlMapBufferEx) == 0xA4, "IoctlMapBufferEx has incorrect size");
+
+    struct IoctlUnmapBufferEx {
+        INSERT_PADDING_BYTES(0xA4); // TODO(DarkLordZach): RE this structure
+    };
+    static_assert(sizeof(IoctlUnmapBufferEx) == 0xA4, "IoctlUnmapBufferEx has incorrect size");

    u32_le nvmap_fd{};

    u32 SetNVMAPfd(const std::vector<u8>& input, std::vector<u8>& output);
+    u32 Submit(const std::vector<u8>& input, std::vector<u8>& output);
+    u32 GetSyncpoint(const std::vector<u8>& input, std::vector<u8>& output);
+    u32 GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output);
+    u32 MapBuffer(const std::vector<u8>& input, std::vector<u8>& output);
+    u32 MapBufferEx(const std::vector<u8>& input, std::vector<u8>& output);
+    u32 UnmapBufferEx(const std::vector<u8>& input, std::vector<u8>& output);
 };

 } // namespace Service::Nvidia::Devices
--- a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp
@@ -22,6 +22,18 @@ u32 nvhost_vic::ioctl(Ioctl command, const std::vector<u8>& input, const std::ve
    switch (static_cast<IoctlCommand>(command.raw)) {
    case IoctlCommand::IocSetNVMAPfdCommand:
        return SetNVMAPfd(input, output);
+    case IoctlCommand::IocSubmit:
+        return Submit(input, output);
+    case IoctlCommand::IocGetSyncpoint:
+        return GetSyncpoint(input, output);
+    case IoctlCommand::IocGetWaitbase:
+        return GetWaitbase(input, output);
+    case IoctlCommand::IocMapBuffer:
+        return MapBuffer(input, output);
+    case IoctlCommand::IocMapBufferEx:
+        return MapBufferEx(input, output); // Ex variant uses the 0x3C-byte IoctlMapBufferEx.
+    case IoctlCommand::IocUnmapBufferEx:
+        return UnmapBufferEx(input, output);
    }

    UNIMPLEMENTED_MSG("Unimplemented ioctl");
@@ -30,11 +42,67 @@ u32 nvhost_vic::ioctl(Ioctl command, const std::vector<u8>& input, const std::ve

 u32 nvhost_vic::SetNVMAPfd(const std::vector<u8>& input, std::vector<u8>& output) {
    IoctlSetNvmapFD params{};
-    std::memcpy(&params, input.data(), input.size());
+    std::memcpy(&params, input.data(), sizeof(IoctlSetNvmapFD));
    LOG_DEBUG(Service_NVDRV, "called, fd={}", params.nvmap_fd);

    nvmap_fd = params.nvmap_fd;
    return 0;
 }

+u32 nvhost_vic::Submit(const std::vector<u8>& input, std::vector<u8>& output) {
+    IoctlSubmit params{}; // Opaque 0x40-byte block; its layout is not reverse engineered yet.
+    std::memcpy(&params, input.data(), sizeof(IoctlSubmit)); // NOTE(review): size unchecked.
+    LOG_WARNING(Service_NVDRV, "(STUBBED) called");
+    std::memcpy(output.data(), &params, sizeof(IoctlSubmit)); // Stub: echo the request back.
+    return 0;
+}
+
+u32 nvhost_vic::GetSyncpoint(const std::vector<u8>& input, std::vector<u8>& output) {
+    IoctlGetSyncpoint params{}; // Request and reply share this 8-byte layout.
+    std::memcpy(&params, input.data(), sizeof(IoctlGetSyncpoint)); // NOTE(review): size unchecked.
+    LOG_INFO(Service_NVDRV, "called, unknown=0x{:X}", params.unknown);
+    params.value = 0; // Seems to be hard coded at 0
+    std::memcpy(output.data(), &params, sizeof(IoctlGetSyncpoint)); // `unknown` is echoed back.
+    return 0;
+}
+
+u32 nvhost_vic::GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output) {
+    IoctlGetWaitbase params{}; // Request and reply share this 8-byte layout.
+    std::memcpy(&params, input.data(), sizeof(IoctlGetWaitbase)); // NOTE(review): size unchecked.
+    LOG_INFO(Service_NVDRV, "called, unknown=0x{:X}", params.unknown);
+    params.value = 0; // Seems to be hard coded at 0
+    std::memcpy(output.data(), &params, sizeof(IoctlGetWaitbase)); // `unknown` is echoed back.
+    return 0;
+}
+
+u32 nvhost_vic::MapBuffer(const std::vector<u8>& input, std::vector<u8>& output) {
+    IoctlMapBuffer params{}; // address_2/address_1 are logged below as high/low words.
+    std::memcpy(&params, input.data(), sizeof(IoctlMapBuffer)); // NOTE(review): size unchecked.
+    LOG_WARNING(Service_NVDRV, "(STUBBED) called with address={:08X}{:08X}", params.address_2,
+                params.address_1);
+    params.address_1 = 0; // Stub: no mapping is performed, report a zero address.
+    params.address_2 = 0;
+    std::memcpy(output.data(), &params, sizeof(IoctlMapBuffer)); // Other fields are echoed back.
+    return 0;
+}
+
+u32 nvhost_vic::MapBufferEx(const std::vector<u8>& input, std::vector<u8>& output) {
+    IoctlMapBufferEx params{}; // address_2/address_1 are logged below as high/low words.
+    std::memcpy(&params, input.data(), sizeof(IoctlMapBufferEx)); // NOTE(review): size unchecked.
+    LOG_WARNING(Service_NVDRV, "(STUBBED) called with address={:08X}{:08X}", params.address_2,
+                params.address_1);
+    params.address_1 = 0; // Stub: no mapping is performed, report a zero address.
+    params.address_2 = 0;
+    std::memcpy(output.data(), &params, sizeof(IoctlMapBufferEx)); // Other fields are echoed back.
+    return 0;
+}
+
+u32 nvhost_vic::UnmapBufferEx(const std::vector<u8>& input, std::vector<u8>& output) {
+    IoctlUnmapBufferEx params{}; // Opaque 0x3C-byte block; its layout is not known yet.
+    std::memcpy(&params, input.data(), sizeof(IoctlUnmapBufferEx)); // NOTE(review): size unchecked.
+    LOG_WARNING(Service_NVDRV, "(STUBBED) called");
+    std::memcpy(output.data(), &params, sizeof(IoctlUnmapBufferEx)); // Stub: echo the request.
+    return 0;
+}
+
 } // namespace Service::Nvidia::Devices
--- a/src/core/hle/service/nvdrv/devices/nvhost_vic.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_vic.h
@@ -23,6 +23,12 @@ public:
 private:
    enum class IoctlCommand : u32_le {
        IocSetNVMAPfdCommand = 0x40044801,
+        IocSubmit = 0xC0400001,        // parameter block: IoctlSubmit (0x40 bytes)
+        IocGetSyncpoint = 0xC0080002,  // parameter block: IoctlGetSyncpoint (0x08 bytes)
+        IocGetWaitbase = 0xC0080003,   // parameter block: IoctlGetWaitbase (0x08 bytes)
+        IocMapBuffer = 0xC01C0009,     // parameter block: IoctlMapBuffer (0x1C bytes)
+        IocMapBufferEx = 0xC03C0009,   // parameter block: IoctlMapBufferEx (0x3C bytes)
+        IocUnmapBufferEx = 0xC03C000A, // parameter block: IoctlUnmapBufferEx (0x3C bytes)
    };

    struct IoctlSetNvmapFD {
@@ -30,9 +36,53 @@ private:
    };
    static_assert(sizeof(IoctlSetNvmapFD) == 4, "IoctlSetNvmapFD is incorrect size");

+    struct IoctlSubmit {
+        INSERT_PADDING_BYTES(0x40); // TODO(DarkLordZach): RE this structure
+    };
+    static_assert(sizeof(IoctlSubmit) == 0x40, "IoctlSubmit is incorrect size");
+
+    struct IoctlGetSyncpoint {
+        u32 unknown; // seems to be ignored? Nintendo added this
+        u32 value;
+    };
+    static_assert(sizeof(IoctlGetSyncpoint) == 0x8, "IoctlGetSyncpoint is incorrect size");
+
+    struct IoctlGetWaitbase {
+        u32 unknown; // seems to be ignored? Nintendo added this
+        u32 value;
+    };
+    static_assert(sizeof(IoctlGetWaitbase) == 0x8, "IoctlGetWaitbase is incorrect size");
+
+    struct IoctlMapBuffer {
+        u32 unknown;
+        u32 address_1;
+        u32 address_2;
+        INSERT_PADDING_BYTES(0x10); // TODO(DarkLordZach): RE this structure
+    };
+    static_assert(sizeof(IoctlMapBuffer) == 0x1C, "IoctlMapBuffer is incorrect size");
+
+    struct IoctlMapBufferEx {
+        u32 unknown;
+        u32 address_1;
+        u32 address_2;
+        INSERT_PADDING_BYTES(0x30); // TODO(DarkLordZach): RE this structure
+    };
+    static_assert(sizeof(IoctlMapBufferEx) == 0x3C, "IoctlMapBufferEx is incorrect size");
+
+    struct IoctlUnmapBufferEx {
+        INSERT_PADDING_BYTES(0x3C); // TODO(DarkLordZach): RE this structure
+    };
+    static_assert(sizeof(IoctlUnmapBufferEx) == 0x3C, "IoctlUnmapBufferEx is incorrect size");
+
    u32_le nvmap_fd{};

    u32 SetNVMAPfd(const std::vector<u8>& input, std::vector<u8>& output);
+    u32 Submit(const std::vector<u8>& input, std::vector<u8>& output);
+    u32 GetSyncpoint(const std::vector<u8>& input, std::vector<u8>& output);
+    u32 GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output);
+    u32 MapBuffer(const std::vector<u8>& input, std::vector<u8>& output);
+    u32 MapBufferEx(const std::vector<u8>& input, std::vector<u8>& output);
+    u32 UnmapBufferEx(const std::vector<u8>& input, std::vector<u8>& output);
 };

 } // namespace Service::Nvidia::Devices
--- a/src/core/hle/service/nvdrv/nvdrv.cpp
+++ b/src/core/hle/service/nvdrv/nvdrv.cpp
@@ -40,8 +40,7 @@ Module::Module(Core::System& system) {
    auto& kernel = system.Kernel();
    for (u32 i = 0; i < MaxNvEvents; i++) {
        std::string event_label = fmt::format("NVDRV::NvEvent_{}", i);
-        events_interface.events[i] =
-            Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::Manual, event_label);
+        events_interface.events[i] = Kernel::WritableEvent::CreateEventPair(kernel, event_label);
        events_interface.status[i] = EventState::Free;
        events_interface.registered[i] = false;
    }
--- a/src/core/hle/service/nvflinger/buffer_queue.cpp
+++ b/src/core/hle/service/nvflinger/buffer_queue.cpp
@@ -16,8 +16,7 @@ namespace Service::NVFlinger {

 BufferQueue::BufferQueue(Kernel::KernelCore& kernel, u32 id, u64 layer_id)
    : id(id), layer_id(layer_id) {
-    buffer_wait_event = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::Manual,
-                                                               "BufferQueue NativeHandle");
+    buffer_wait_event = Kernel::WritableEvent::CreateEventPair(kernel, "BufferQueue NativeHandle");
 }

 BufferQueue::~BufferQueue() = default;
--- a/src/core/hle/service/vi/display/vi_display.cpp
+++ b/src/core/hle/service/vi/display/vi_display.cpp
@@ -17,8 +17,8 @@ namespace Service::VI {

 Display::Display(u64 id, std::string name, Core::System& system) : id{id}, name{std::move(name)} {
    auto& kernel = system.Kernel();
-    vsync_event = Kernel::WritableEvent::CreateEventPair(kernel, Kernel::ResetType::Manual,
-                                                         fmt::format("Display VSync Event {}", id));
+    vsync_event =
+        Kernel::WritableEvent::CreateEventPair(kernel, fmt::format("Display VSync Event {}", id));
 }

 Display::~Display() = default;
--- a/src/core/settings.cpp
+++ b/src/core/settings.cpp
@@ -6,6 +6,8 @@
 #include "core/core.h"
 #include "core/gdbstub/gdbstub.h"
 #include "core/hle/service/hid/hid.h"
+#include "core/hle/service/lbl/lbl.h"
+#include "core/hle/service/sm/sm.h"
 #include "core/settings.h"
 #include "video_core/renderer_base.h"

@@ -70,6 +72,7 @@ void Apply() {
    auto& system_instance = Core::System::GetInstance();
    if (system_instance.IsPoweredOn()) {
        system_instance.Renderer().RefreshBaseSettings();
+        Service::LBL::RequestLoadCurrentSetting(system_instance.ServiceManager());
    }

    Service::HID::ReloadInputDevices();
--- a/src/core/settings.h
+++ b/src/core/settings.h
@@ -428,6 +428,8 @@ struct Values {
    float bg_green;
    float bg_blue;

+    float backlight_brightness = 0.5f;
+
    std::string log_filter;

    bool use_dev_keys;
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -6,6 +6,7 @@ add_library(video_core STATIC
    dma_pusher.h
    debug_utils/debug_utils.cpp
    debug_utils/debug_utils.h
+    engines/const_buffer_engine_interface.h
    engines/const_buffer_info.h
    engines/engine_upload.cpp
    engines/engine_upload.h
@@ -35,6 +36,8 @@ add_library(video_core STATIC
    memory_manager.h
    morton.cpp
    morton.h
+    rasterizer_accelerated.cpp
+    rasterizer_accelerated.h
    rasterizer_cache.cpp
    rasterizer_cache.h
    rasterizer_interface.h
@@ -107,10 +110,12 @@ add_library(video_core STATIC
    shader/decode/other.cpp
    shader/ast.cpp
    shader/ast.h
-    shader/control_flow.cpp
-    shader/control_flow.h
    shader/compiler_settings.cpp
    shader/compiler_settings.h
+    shader/const_buffer_locker.cpp
+    shader/const_buffer_locker.h
+    shader/control_flow.cpp
+    shader/control_flow.h
    shader/decode.cpp
    shader/expr.cpp
    shader/expr.h
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -30,7 +30,7 @@ public:
    using BufferInfo = std::pair<const TBufferType*, u64>;

    BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4,
-                            bool is_written = false) {
+                            bool is_written = false, bool use_fast_cbuf = false) {
        std::lock_guard lock{mutex};

        auto& memory_manager = system.GPU().MemoryManager();
@@ -43,9 +43,13 @@ public:
        // Cache management is a big overhead, so only cache entries with a given size.
        // TODO: Figure out which size is the best for given games.
        constexpr std::size_t max_stream_size = 0x800;
-        if (size < max_stream_size) {
+        if (use_fast_cbuf || size < max_stream_size) {
            if (!is_written && !IsRegionWritten(cache_addr, cache_addr + size - 1)) {
-                return StreamBufferUpload(host_ptr, size, alignment);
+                if (use_fast_cbuf) {
+                    return ConstBufferUpload(host_ptr, size);
+                } else {
+                    return StreamBufferUpload(host_ptr, size, alignment);
+                }
            }
        }

@@ -152,6 +156,10 @@ protected:
    virtual void CopyBlock(const TBuffer& src, const TBuffer& dst, std::size_t src_offset,
                           std::size_t dst_offset, std::size_t size) = 0;

+    virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) {
+        return {}; // Default: empty BufferInfo (null buffer, offset 0); presumably overridden.
+    }
+
    /// Register an object into the cache
    void Register(const MapInterval& new_map, bool inherit_written = false) {
        const CacheAddr cache_ptr = new_map->GetStart();
--- a/src/video_core/engines/const_buffer_engine_interface.h
+++ b/src/video_core/engines/const_buffer_engine_interface.h
@@ -0,0 +1,96 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <type_traits>
+#include "common/bit_field.h"
+#include "common/common_types.h"
+#include "video_core/engines/shader_bytecode.h"
+#include "video_core/textures/texture.h"
+
+namespace Tegra::Engines {
+
+/// Shader stages that can be passed to the ConstBufferEngineInterface queries.
+enum class ShaderType : u32 {
+    Vertex = 0,
+    TesselationControl = 1,
+    TesselationEval = 2,
+    Geometry = 3,
+    Fragment = 4,
+    Compute = 5,
+};
+
+/// Sampler description packed into one u32: texture type plus array/buffer/shadow flags.
+struct SamplerDescriptor {
+    union {
+        BitField<0, 20, Tegra::Shader::TextureType> texture_type;
+        BitField<20, 1, u32> is_array;
+        BitField<21, 1, u32> is_buffer;
+        BitField<22, 1, u32> is_shadow;
+        u32 raw{};
+    };
+
+    bool operator==(const SamplerDescriptor& rhs) const noexcept {
+        return raw == rhs.raw;
+    }
+
+    bool operator!=(const SamplerDescriptor& rhs) const noexcept {
+        return !operator==(rhs);
+    }
+
+    /// Builds the descriptor matching a TIC texture type. Types without a dedicated case fall
+    /// back to a non-array 2D texture, and is_shadow is always left cleared.
+    static SamplerDescriptor FromTicTexture(Tegra::Texture::TextureType tic_texture_type) {
+        using TicType = Tegra::Texture::TextureType;
+        using TexType = Tegra::Shader::TextureType;
+
+        // Each case differs only in the shader texture type and the array/buffer flags.
+        const auto make = [](TexType type, u32 array_flag, u32 buffer_flag) {
+            SamplerDescriptor result;
+            result.texture_type.Assign(type);
+            result.is_array.Assign(array_flag);
+            result.is_buffer.Assign(buffer_flag);
+            result.is_shadow.Assign(0);
+            return result;
+        };
+
+        switch (tic_texture_type) {
+        case TicType::Texture1D:
+            return make(TexType::Texture1D, 0, 0);
+        case TicType::Texture2D:
+            return make(TexType::Texture2D, 0, 0);
+        case TicType::Texture3D:
+            return make(TexType::Texture3D, 0, 0);
+        case TicType::TextureCubemap:
+            return make(TexType::TextureCube, 0, 0);
+        case TicType::Texture1DArray:
+            return make(TexType::Texture1D, 1, 0);
+        case TicType::Texture2DArray:
+            return make(TexType::Texture2D, 1, 0);
+        case TicType::Texture1DBuffer:
+            return make(TexType::Texture1D, 0, 1);
+        case TicType::Texture2DNoMipmap:
+            return make(TexType::Texture2D, 0, 0);
+        case TicType::TextureCubeArray:
+            return make(TexType::TextureCube, 1, 0);
+        default:
+            return make(TexType::Texture2D, 0, 0);
+        }
+    }
+};
+static_assert(std::is_trivially_copyable_v<SamplerDescriptor>);
+
+/// Interface for reading const buffer words and sampler descriptors from an engine.
+class ConstBufferEngineInterface {
+public:
+    virtual ~ConstBufferEngineInterface() = default;
+    virtual u32 AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offset) const = 0;
+    virtual SamplerDescriptor AccessBoundSampler(ShaderType stage, u64 offset) const = 0;
+    virtual SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer,
+                                                    u64 offset) const = 0;
+    virtual u32 GetBoundBuffer() const = 0;
+};
+
+} // namespace Tegra::Engines
--- a/src/video_core/engines/fermi_2d.cpp
+++ b/src/video_core/engines/fermi_2d.cpp
@@ -28,6 +28,13 @@ void Fermi2D::CallMethod(const GPU::MethodCall& method_call) {
    }
 }

+std::pair<u32, u32> DelimitLine(u32 src_1, u32 src_2, u32 dst_1, u32 dst_2, u32 src_line) {
+    const u32 line_a = src_2 - src_1; // Source extent; zero when the blit is empty.
+    const u32 line_b = dst_2 - dst_1; // Destination extent.
+    const u32 excess = line_a != 0 ? std::max<s32>(0, line_a - src_line + src_1) : 0;
+    return {line_b - (excess * line_b) / std::max<u32>(line_a, 1), excess}; // Never divide by 0.
+}
+
 void Fermi2D::HandleSurfaceCopy() {
    LOG_DEBUG(HW_GPU, "Requested a surface copy with operation {}",
              static_cast<u32>(regs.operation));
@@ -47,10 +54,27 @@ void Fermi2D::HandleSurfaceCopy() {
        src_blit_x2 = static_cast<u32>((regs.blit_src_x >> 32) + regs.blit_dst_width);
        src_blit_y2 = static_cast<u32>((regs.blit_src_y >> 32) + regs.blit_dst_height);
    }
+    u32 dst_blit_x2 = regs.blit_dst_x + regs.blit_dst_width;
+    u32 dst_blit_y2 = regs.blit_dst_y + regs.blit_dst_height;
+    const auto [new_dst_w, src_excess_x] =
+        DelimitLine(src_blit_x1, src_blit_x2, regs.blit_dst_x, dst_blit_x2, regs.src.width);
+    const auto [new_dst_h, src_excess_y] =
+        DelimitLine(src_blit_y1, src_blit_y2, regs.blit_dst_y, dst_blit_y2, regs.src.height);
+    dst_blit_x2 = new_dst_w + regs.blit_dst_x;
+    src_blit_x2 = src_blit_x2 - src_excess_x;
+    dst_blit_y2 = new_dst_h + regs.blit_dst_y;
+    src_blit_y2 = src_blit_y2 - src_excess_y;
+    const auto [new_src_w, dst_excess_x] =
+        DelimitLine(regs.blit_dst_x, dst_blit_x2, src_blit_x1, src_blit_x2, regs.dst.width);
+    const auto [new_src_h, dst_excess_y] =
+        DelimitLine(regs.blit_dst_y, dst_blit_y2, src_blit_y1, src_blit_y2, regs.dst.height);
+    src_blit_x2 = new_src_w + src_blit_x1;
+    dst_blit_x2 = dst_blit_x2 - dst_excess_x;
+    src_blit_y2 = new_src_h + src_blit_y1;
+    dst_blit_y2 = dst_blit_y2 - dst_excess_y;
    const Common::Rectangle<u32> src_rect{src_blit_x1, src_blit_y1, src_blit_x2, src_blit_y2};
-    const Common::Rectangle<u32> dst_rect{regs.blit_dst_x, regs.blit_dst_y,
-                                          regs.blit_dst_x + regs.blit_dst_width,
-                                          regs.blit_dst_y + regs.blit_dst_height};
+    const Common::Rectangle<u32> dst_rect{regs.blit_dst_x, regs.blit_dst_y, dst_blit_x2,
+                                          dst_blit_y2};
    Config copy_config;
    copy_config.operation = regs.operation;
    copy_config.filter = regs.blit_control.filter;
--- a/src/video_core/engines/kepler_compute.cpp
+++ b/src/video_core/engines/kepler_compute.cpp
@@ -50,7 +50,7 @@ void KeplerCompute::CallMethod(const GPU::MethodCall& method_call) {
    }
 }

-Tegra::Texture::FullTextureInfo KeplerCompute::GetTexture(std::size_t offset) const {
+Texture::FullTextureInfo KeplerCompute::GetTexture(std::size_t offset) const {
    const std::bitset<8> cbuf_mask = launch_description.const_buffer_enable_mask.Value();
    ASSERT(cbuf_mask[regs.tex_cb_index]);

@@ -61,22 +61,38 @@ Tegra::Texture::FullTextureInfo KeplerCompute::GetTexture(std::size_t offset) co
    ASSERT(address < texinfo.Address() + texinfo.size);

    const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(address)};
-    return GetTextureInfo(tex_handle, offset);
+    return GetTextureInfo(tex_handle);
 }

-Texture::FullTextureInfo KeplerCompute::GetTextureInfo(const Texture::TextureHandle tex_handle,
-                                                       std::size_t offset) const {
-    return Texture::FullTextureInfo{static_cast<u32>(offset), GetTICEntry(tex_handle.tic_id),
-                                    GetTSCEntry(tex_handle.tsc_id)};
+Texture::FullTextureInfo KeplerCompute::GetTextureInfo(Texture::TextureHandle tex_handle) const {
+    return Texture::FullTextureInfo{GetTICEntry(tex_handle.tic_id), GetTSCEntry(tex_handle.tsc_id)};
 }

-u32 KeplerCompute::AccessConstBuffer32(u64 const_buffer, u64 offset) const {
+u32 KeplerCompute::AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offset) const {
+    ASSERT(stage == ShaderType::Compute);
    const auto& buffer = launch_description.const_buffer_config[const_buffer];
    u32 result;
    std::memcpy(&result, memory_manager.GetPointer(buffer.Address() + offset), sizeof(u32));
    return result;
 }

+SamplerDescriptor KeplerCompute::AccessBoundSampler(ShaderType stage, u64 offset) const {
+    return AccessBindlessSampler(stage, regs.tex_cb_index, offset * sizeof(Texture::TextureHandle));
+}
+
+SamplerDescriptor KeplerCompute::AccessBindlessSampler(ShaderType stage, u64 const_buffer,
+                                                       u64 offset) const {
+    ASSERT(stage == ShaderType::Compute); // Only the compute stage is accepted here.
+    const auto& tex_info_buffer = launch_description.const_buffer_config[const_buffer];
+    const GPUVAddr tex_info_address = tex_info_buffer.Address() + offset; // Byte offset.
+
+    const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)};
+    const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle);
+    SamplerDescriptor result = SamplerDescriptor::FromTicTexture(tex_info.tic.texture_type.Value());
+    result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value()); // Taken from the TSC.
+    return result;
+}
+
 void KeplerCompute::ProcessLaunch() {
    const GPUVAddr launch_desc_loc = regs.launch_desc_loc.Address();
    memory_manager.ReadBlockUnsafe(launch_desc_loc, &launch_description,
--- a/src/video_core/engines/kepler_compute.h
+++ b/src/video_core/engines/kepler_compute.h
@@ -10,6 +10,7 @@
 #include "common/bit_field.h"
 #include "common/common_funcs.h"
 #include "common/common_types.h"
+#include "video_core/engines/const_buffer_engine_interface.h"
 #include "video_core/engines/engine_upload.h"
 #include "video_core/gpu.h"
 #include "video_core/textures/texture.h"
@@ -37,7 +38,7 @@ namespace Tegra::Engines {
 #define KEPLER_COMPUTE_REG_INDEX(field_name)                                                       \
    (offsetof(Tegra::Engines::KeplerCompute::Regs, field_name) / sizeof(u32))

-class KeplerCompute final {
+class KeplerCompute final : public ConstBufferEngineInterface {
 public:
    explicit KeplerCompute(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
                           MemoryManager& memory_manager);
@@ -195,13 +196,21 @@ public:
    /// Write the value to the register identified by method.
    void CallMethod(const GPU::MethodCall& method_call);

-    Tegra::Texture::FullTextureInfo GetTexture(std::size_t offset) const;
+    Texture::FullTextureInfo GetTexture(std::size_t offset) const;

-    /// Given a Texture Handle, returns the TSC and TIC entries.
-    Texture::FullTextureInfo GetTextureInfo(const Texture::TextureHandle tex_handle,
-                                            std::size_t offset) const;
+    /// Given a texture handle, returns the TSC and TIC entries.
+    Texture::FullTextureInfo GetTextureInfo(Texture::TextureHandle tex_handle) const;

-    u32 AccessConstBuffer32(u64 const_buffer, u64 offset) const;
+    u32 AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offset) const override;
+
+    SamplerDescriptor AccessBoundSampler(ShaderType stage, u64 offset) const override;
+
+    SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer,
+                                            u64 offset) const override;
+
+    u32 GetBoundBuffer() const override {
+        return regs.tex_cb_index;
+    }

 private:
    Core::System& system;
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -98,11 +98,10 @@ void Maxwell3D::InitializeRegisterDefaults() {
    mme_inline[MAXWELL3D_REG_INDEX(index_array.count)] = true;
 }

-#define DIRTY_REGS_POS(field_name) (offsetof(Maxwell3D::DirtyRegs, field_name))
+#define DIRTY_REGS_POS(field_name) static_cast<u8>(offsetof(Maxwell3D::DirtyRegs, field_name))

 void Maxwell3D::InitDirtySettings() {
-    const auto set_block = [this](const std::size_t start, const std::size_t range,
-                                  const u8 position) {
+    const auto set_block = [this](std::size_t start, std::size_t range, u8 position) {
        const auto start_itr = dirty_pointers.begin() + start;
        const auto end_itr = start_itr + range;
        std::fill(start_itr, end_itr, position);
@@ -113,10 +112,10 @@ void Maxwell3D::InitDirtySettings() {
    constexpr u32 registers_per_rt = sizeof(regs.rt[0]) / sizeof(u32);
    constexpr u32 rt_start_reg = MAXWELL3D_REG_INDEX(rt);
    constexpr u32 rt_end_reg = rt_start_reg + registers_per_rt * 8;
-    u32 rt_dirty_reg = DIRTY_REGS_POS(render_target);
+    u8 rt_dirty_reg = DIRTY_REGS_POS(render_target);
    for (u32 rt_reg = rt_start_reg; rt_reg < rt_end_reg; rt_reg += registers_per_rt) {
        set_block(rt_reg, registers_per_rt, rt_dirty_reg);
-        rt_dirty_reg++;
+        ++rt_dirty_reg;
    }
    constexpr u32 depth_buffer_flag = DIRTY_REGS_POS(depth_buffer);
    dirty_pointers[MAXWELL3D_REG_INDEX(zeta_enable)] = depth_buffer_flag;
@@ -130,35 +129,35 @@ void Maxwell3D::InitDirtySettings() {
    constexpr u32 vertex_array_start = MAXWELL3D_REG_INDEX(vertex_array);
    constexpr u32 vertex_array_size = sizeof(regs.vertex_array[0]) / sizeof(u32);
    constexpr u32 vertex_array_end = vertex_array_start + vertex_array_size * Regs::NumVertexArrays;
-    u32 va_reg = DIRTY_REGS_POS(vertex_array);
-    u32 vi_reg = DIRTY_REGS_POS(vertex_instance);
+    u8 va_dirty_reg = DIRTY_REGS_POS(vertex_array);
+    u8 vi_dirty_reg = DIRTY_REGS_POS(vertex_instance);
    for (u32 vertex_reg = vertex_array_start; vertex_reg < vertex_array_end;
         vertex_reg += vertex_array_size) {
-        set_block(vertex_reg, 3, va_reg);
+        set_block(vertex_reg, 3, va_dirty_reg);
        // The divisor concerns vertex array instances
-        dirty_pointers[vertex_reg + 3] = vi_reg;
-        va_reg++;
-        vi_reg++;
+        dirty_pointers[static_cast<std::size_t>(vertex_reg) + 3] = vi_dirty_reg;
+        ++va_dirty_reg;
+        ++vi_dirty_reg;
    }
    constexpr u32 vertex_limit_start = MAXWELL3D_REG_INDEX(vertex_array_limit);
    constexpr u32 vertex_limit_size = sizeof(regs.vertex_array_limit[0]) / sizeof(u32);
    constexpr u32 vertex_limit_end = vertex_limit_start + vertex_limit_size * Regs::NumVertexArrays;
-    va_reg = DIRTY_REGS_POS(vertex_array);
+    va_dirty_reg = DIRTY_REGS_POS(vertex_array);
    for (u32 vertex_reg = vertex_limit_start; vertex_reg < vertex_limit_end;
         vertex_reg += vertex_limit_size) {
-        set_block(vertex_reg, vertex_limit_size, va_reg);
-        va_reg++;
+        set_block(vertex_reg, vertex_limit_size, va_dirty_reg);
+        va_dirty_reg++;
    }
    constexpr u32 vertex_instance_start = MAXWELL3D_REG_INDEX(instanced_arrays);
    constexpr u32 vertex_instance_size =
        sizeof(regs.instanced_arrays.is_instanced[0]) / sizeof(u32);
    constexpr u32 vertex_instance_end =
        vertex_instance_start + vertex_instance_size * Regs::NumVertexArrays;
-    vi_reg = DIRTY_REGS_POS(vertex_instance);
+    vi_dirty_reg = DIRTY_REGS_POS(vertex_instance);
    for (u32 vertex_reg = vertex_instance_start; vertex_reg < vertex_instance_end;
         vertex_reg += vertex_instance_size) {
-        set_block(vertex_reg, vertex_instance_size, vi_reg);
-        vi_reg++;
+        set_block(vertex_reg, vertex_instance_size, vi_dirty_reg);
+        vi_dirty_reg++;
    }
    set_block(MAXWELL3D_REG_INDEX(vertex_attrib_format), regs.vertex_attrib_format.size(),
              DIRTY_REGS_POS(vertex_attrib_format));
@@ -172,7 +171,7 @@ void Maxwell3D::InitDirtySettings() {
    // State

    // Viewport
-    constexpr u32 viewport_dirty_reg = DIRTY_REGS_POS(viewport);
+    constexpr u8 viewport_dirty_reg = DIRTY_REGS_POS(viewport);
    constexpr u32 viewport_start = MAXWELL3D_REG_INDEX(viewports);
    constexpr u32 viewport_size = sizeof(regs.viewports) / sizeof(u32);
    set_block(viewport_start, viewport_size, viewport_dirty_reg);
@@ -199,7 +198,7 @@ void Maxwell3D::InitDirtySettings() {
    set_block(primitive_restart_start, primitive_restart_size, DIRTY_REGS_POS(primitive_restart));

    // Depth Test
-    constexpr u32 depth_test_dirty_reg = DIRTY_REGS_POS(depth_test);
+    constexpr u8 depth_test_dirty_reg = DIRTY_REGS_POS(depth_test);
    dirty_pointers[MAXWELL3D_REG_INDEX(depth_test_enable)] = depth_test_dirty_reg;
    dirty_pointers[MAXWELL3D_REG_INDEX(depth_write_enabled)] = depth_test_dirty_reg;
    dirty_pointers[MAXWELL3D_REG_INDEX(depth_test_func)] = depth_test_dirty_reg;
@@ -224,12 +223,12 @@ void Maxwell3D::InitDirtySettings() {
    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_mask)] = stencil_test_dirty_reg;

    // Color Mask
-    constexpr u32 color_mask_dirty_reg = DIRTY_REGS_POS(color_mask);
+    constexpr u8 color_mask_dirty_reg = DIRTY_REGS_POS(color_mask);
    dirty_pointers[MAXWELL3D_REG_INDEX(color_mask_common)] = color_mask_dirty_reg;
    set_block(MAXWELL3D_REG_INDEX(color_mask), sizeof(regs.color_mask) / sizeof(u32),
              color_mask_dirty_reg);
    // Blend State
-    constexpr u32 blend_state_dirty_reg = DIRTY_REGS_POS(blend_state);
+    constexpr u8 blend_state_dirty_reg = DIRTY_REGS_POS(blend_state);
    set_block(MAXWELL3D_REG_INDEX(blend_color), sizeof(regs.blend_color) / sizeof(u32),
              blend_state_dirty_reg);
    dirty_pointers[MAXWELL3D_REG_INDEX(independent_blend_enable)] = blend_state_dirty_reg;
@@ -238,12 +237,12 @@ void Maxwell3D::InitDirtySettings() {
              blend_state_dirty_reg);

    // Scissor State
-    constexpr u32 scissor_test_dirty_reg = DIRTY_REGS_POS(scissor_test);
+    constexpr u8 scissor_test_dirty_reg = DIRTY_REGS_POS(scissor_test);
    set_block(MAXWELL3D_REG_INDEX(scissor_test), sizeof(regs.scissor_test) / sizeof(u32),
              scissor_test_dirty_reg);

    // Polygon Offset
-    constexpr u32 polygon_offset_dirty_reg = DIRTY_REGS_POS(polygon_offset);
+    constexpr u8 polygon_offset_dirty_reg = DIRTY_REGS_POS(polygon_offset);
    dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_fill_enable)] = polygon_offset_dirty_reg;
    dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_line_enable)] = polygon_offset_dirty_reg;
    dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_point_enable)] = polygon_offset_dirty_reg;
@@ -252,7 +251,7 @@ void Maxwell3D::InitDirtySettings() {
    dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_clamp)] = polygon_offset_dirty_reg;

    // Depth bounds
-    constexpr u32 depth_bounds_values_dirty_reg = DIRTY_REGS_POS(depth_bounds_values);
+    constexpr u8 depth_bounds_values_dirty_reg = DIRTY_REGS_POS(depth_bounds_values);
    dirty_pointers[MAXWELL3D_REG_INDEX(depth_bounds[0])] = depth_bounds_values_dirty_reg;
    dirty_pointers[MAXWELL3D_REG_INDEX(depth_bounds[1])] = depth_bounds_values_dirty_reg;
 }
@@ -761,61 +760,8 @@ Texture::TSCEntry Maxwell3D::GetTSCEntry(u32 tsc_index) const {
    return tsc_entry;
 }

-std::vector<Texture::FullTextureInfo> Maxwell3D::GetStageTextures(Regs::ShaderStage stage) const {
-    std::vector<Texture::FullTextureInfo> textures;
-
-    auto& fragment_shader = state.shader_stages[static_cast<std::size_t>(stage)];
-    auto& tex_info_buffer = fragment_shader.const_buffers[regs.tex_cb_index];
-    ASSERT(tex_info_buffer.enabled && tex_info_buffer.address != 0);
-
-    GPUVAddr tex_info_buffer_end = tex_info_buffer.address + tex_info_buffer.size;
-
-    // Offset into the texture constbuffer where the texture info begins.
-    static constexpr std::size_t TextureInfoOffset = 0x20;
-
-    for (GPUVAddr current_texture = tex_info_buffer.address + TextureInfoOffset;
-         current_texture < tex_info_buffer_end; current_texture += sizeof(Texture::TextureHandle)) {
-
-        const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(current_texture)};
-
-        Texture::FullTextureInfo tex_info{};
-        // TODO(Subv): Use the shader to determine which textures are actually accessed.
-        tex_info.index =
-            static_cast<u32>(current_texture - tex_info_buffer.address - TextureInfoOffset) /
-            sizeof(Texture::TextureHandle);
-
-        // Load the TIC data.
-        auto tic_entry = GetTICEntry(tex_handle.tic_id);
-        // TODO(Subv): Workaround for BitField's move constructor being deleted.
-        std::memcpy(&tex_info.tic, &tic_entry, sizeof(tic_entry));
-
-        // Load the TSC data
-        auto tsc_entry = GetTSCEntry(tex_handle.tsc_id);
-        // TODO(Subv): Workaround for BitField's move constructor being deleted.
-        std::memcpy(&tex_info.tsc, &tsc_entry, sizeof(tsc_entry));
-
-        textures.push_back(tex_info);
-    }
-
-    return textures;
-}
-
-Texture::FullTextureInfo Maxwell3D::GetTextureInfo(const Texture::TextureHandle tex_handle,
-                                                   std::size_t offset) const {
-    Texture::FullTextureInfo tex_info{};
-    tex_info.index = static_cast<u32>(offset);
-
-    // Load the TIC data.
-    auto tic_entry = GetTICEntry(tex_handle.tic_id);
-    // TODO(Subv): Workaround for BitField's move constructor being deleted.
-    std::memcpy(&tex_info.tic, &tic_entry, sizeof(tic_entry));
-
-    // Load the TSC data
-    auto tsc_entry = GetTSCEntry(tex_handle.tsc_id);
-    // TODO(Subv): Workaround for BitField's move constructor being deleted.
-    std::memcpy(&tex_info.tsc, &tsc_entry, sizeof(tsc_entry));
-
-    return tex_info;
+Texture::FullTextureInfo Maxwell3D::GetTextureInfo(Texture::TextureHandle tex_handle) const {
+    return Texture::FullTextureInfo{GetTICEntry(tex_handle.tic_id), GetTSCEntry(tex_handle.tsc_id)};
 }

 Texture::FullTextureInfo Maxwell3D::GetStageTexture(Regs::ShaderStage stage,
@@ -831,7 +777,7 @@ Texture::FullTextureInfo Maxwell3D::GetStageTexture(Regs::ShaderStage stage,

    const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)};

-    return GetTextureInfo(tex_handle, offset);
+    return GetTextureInfo(tex_handle);
 }

 u32 Maxwell3D::GetRegisterValue(u32 method) const {
@@ -847,7 +793,8 @@ void Maxwell3D::ProcessClearBuffers() {
    rasterizer.Clear();
 }

-u32 Maxwell3D::AccessConstBuffer32(Regs::ShaderStage stage, u64 const_buffer, u64 offset) const {
+u32 Maxwell3D::AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offset) const {
+    ASSERT(stage != ShaderType::Compute);
    const auto& shader_stage = state.shader_stages[static_cast<std::size_t>(stage)];
    const auto& buffer = shader_stage.const_buffers[const_buffer];
    u32 result;
@@ -855,4 +802,22 @@ u32 Maxwell3D::AccessConstBuffer32(Regs::ShaderStage stage, u64 const_buffer, u6
    return result;
 }

+// Bound samplers are texture handles indexed by `offset` in the const buffer at tex_cb_index.
+SamplerDescriptor Maxwell3D::AccessBoundSampler(ShaderType stage, u64 offset) const {
+    return AccessBindlessSampler(stage, regs.tex_cb_index, offset * sizeof(Texture::TextureHandle));
+}
+
+SamplerDescriptor Maxwell3D::AccessBindlessSampler(ShaderType stage, u64 const_buffer,
+                                                   u64 offset) const {
+    ASSERT(stage != ShaderType::Compute);
+    // Resolve the GPU address of the 32-bit texture handle inside the requested const buffer.
+    const auto& cbufs = state.shader_stages[static_cast<std::size_t>(stage)].const_buffers;
+    const GPUVAddr handle_address = cbufs[const_buffer].address + offset;
+    const Texture::TextureHandle handle{memory_manager.Read<u32>(handle_address)};
+    const Texture::FullTextureInfo info = GetTextureInfo(handle);
+    SamplerDescriptor descriptor = SamplerDescriptor::FromTicTexture(info.tic.texture_type.Value());
+    descriptor.is_shadow.Assign(info.tsc.depth_compare_enabled.Value());
+    return descriptor;
+}
+
 } // namespace Tegra::Engines
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -15,6 +15,7 @@
 #include "common/common_funcs.h"
 #include "common/common_types.h"
 #include "common/math_util.h"
+#include "video_core/engines/const_buffer_engine_interface.h"
 #include "video_core/engines/const_buffer_info.h"
 #include "video_core/engines/engine_upload.h"
 #include "video_core/gpu.h"
@@ -44,7 +45,7 @@ namespace Tegra::Engines {
 #define MAXWELL3D_REG_INDEX(field_name)                                                            \
    (offsetof(Tegra::Engines::Maxwell3D::Regs, field_name) / sizeof(u32))

-class Maxwell3D final {
+class Maxwell3D final : public ConstBufferEngineInterface {
 public:
    explicit Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
                       MemoryManager& memory_manager);
@@ -1165,6 +1166,8 @@ public:

    struct DirtyRegs {
        static constexpr std::size_t NUM_REGS = 256;
+        static_assert(NUM_REGS - 1 <= std::numeric_limits<u8>::max());
+
        union {
            struct {
                bool null_dirty;
@@ -1247,17 +1250,22 @@ public:

    void FlushMMEInlineDraw();

-    /// Given a Texture Handle, returns the TSC and TIC entries.
-    Texture::FullTextureInfo GetTextureInfo(const Texture::TextureHandle tex_handle,
-                                            std::size_t offset) const;
-
-    /// Returns a list of enabled textures for the specified shader stage.
-    std::vector<Texture::FullTextureInfo> GetStageTextures(Regs::ShaderStage stage) const;
+    /// Given a texture handle, returns the TSC and TIC entries.
+    Texture::FullTextureInfo GetTextureInfo(Texture::TextureHandle tex_handle) const;

    /// Returns the texture information for a specific texture in a specific shader stage.
    Texture::FullTextureInfo GetStageTexture(Regs::ShaderStage stage, std::size_t offset) const;

-    u32 AccessConstBuffer32(Regs::ShaderStage stage, u64 const_buffer, u64 offset) const;
+    u32 AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offset) const override;
+
+    SamplerDescriptor AccessBoundSampler(ShaderType stage, u64 offset) const override;
+
+    SamplerDescriptor AccessBindlessSampler(ShaderType stage, u64 const_buffer,
+                                            u64 offset) const override;
+
+    u32 GetBoundBuffer() const override {
+        return regs.tex_cb_index;
+    }

    /// Memory for macro code - it's undetermined how big this is, however 1MB is much larger than
    /// we've seen used.
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -574,7 +574,7 @@ enum class ShuffleOperation : u64 {
 };

 union Instruction {
-    Instruction& operator=(const Instruction& instr) {
+    constexpr Instruction& operator=(const Instruction& instr) {
        value = instr.value;
        return *this;
    }
@@ -1237,6 +1237,32 @@ union Instruction {
        }
    } tld4;

+    union {
+        BitField<33, 2, u64> info;
+        BitField<35, 1, u64> ndv_flag;
+        BitField<37, 2, u64> component;
+        BitField<49, 1, u64> nodep_flag;
+        BitField<50, 1, u64> dc_flag;
+
+        /// Reports whether this encoding requests `mode`; any mode not listed here yields false.
+        bool UsesMiscMode(TextureMiscMode mode) const {
+            switch (mode) {
+            case TextureMiscMode::NDV:
+                return ndv_flag != 0;
+            case TextureMiscMode::NODEP:
+                return nodep_flag != 0;
+            case TextureMiscMode::DC:
+                return dc_flag != 0;
+            case TextureMiscMode::AOFFI:
+                return info == 1;
+            case TextureMiscMode::PTP:
+                return info == 2;
+            default:
+                return false;
+            }
+        }
+    } tld4_b;
+
    union {
        BitField<49, 1, u64> nodep_flag;
        BitField<50, 1, u64> dc_flag;
@@ -1590,7 +1616,8 @@ public:
        TEXS,   // Texture Fetch with scalar/non-vec4 source/destinations
        TLD,    // Texture Load
        TLDS,   // Texture Load with scalar/non-vec4 source/destinations
-        TLD4,   // Texture Load 4
+        TLD4,   // Texture Gather 4
+        TLD4_B, // Texture Gather 4 Bindless
        TLD4S,  // Texture Load 4 with scalar / non - vec4 source / destinations
        TMML_B, // Texture Mip Map Level
        TMML,   // Texture Mip Map Level
@@ -1760,22 +1787,22 @@ public:

    class Matcher {
    public:
-        Matcher(const char* const name, u16 mask, u16 expected, OpCode::Id id, OpCode::Type type)
+        constexpr Matcher(const char* const name, u16 mask, u16 expected, Id id, Type type)
            : name{name}, mask{mask}, expected{expected}, id{id}, type{type} {}

-        const char* GetName() const {
+        constexpr const char* GetName() const {
            return name;
        }

-        u16 GetMask() const {
+        constexpr u16 GetMask() const {
            return mask;
        }

-        Id GetId() const {
+        constexpr Id GetId() const {
            return id;
        }

-        Type GetType() const {
+        constexpr Type GetType() const {
            return type;
        }

@@ -1784,7 +1811,7 @@ public:
         * @param instruction The instruction to test
         * @returns true if the given instruction matches.
         */
-        bool Matches(u16 instruction) const {
+        constexpr bool Matches(u16 instruction) const {
            return (instruction & mask) == expected;
        }

@@ -1818,7 +1845,7 @@ private:
         * A '0' in a bitstring indicates that a zero must be present at that bit position.
         * A '1' in a bitstring indicates that a one must be present at that bit position.
         */
-        static auto GetMaskAndExpect(const char* const bitstring) {
+        static constexpr auto GetMaskAndExpect(const char* const bitstring) {
            u16 mask = 0, expect = 0;
            for (std::size_t i = 0; i < opcode_bitsize; i++) {
                const std::size_t bit_position = opcode_bitsize - i - 1;
@@ -1835,15 +1862,15 @@ private:
                    break;
                }
            }
-            return std::make_tuple(mask, expect);
+            return std::make_pair(mask, expect);
        }

    public:
        /// Creates a matcher that can match and parse instructions based on bitstring.
-        static auto GetMatcher(const char* const bitstring, OpCode::Id op, OpCode::Type type,
-                               const char* const name) {
-            const auto mask_expect = GetMaskAndExpect(bitstring);
-            return Matcher(name, std::get<0>(mask_expect), std::get<1>(mask_expect), op, type);
+        static constexpr auto GetMatcher(const char* const bitstring, Id op, Type type,
+                                         const char* const name) {
+            const auto [mask, expected] = GetMaskAndExpect(bitstring);
+            return Matcher(name, mask, expected, op, type);
        }
    };

@@ -1881,6 +1908,7 @@ private:
            INST("11011100--11----", Id::TLD, Type::Texture, "TLD"),
            INST("1101-01---------", Id::TLDS, Type::Texture, "TLDS"),
            INST("110010----111---", Id::TLD4, Type::Texture, "TLD4"),
+            INST("1101111011111---", Id::TLD4_B, Type::Texture, "TLD4_B"),
            INST("1101111100------", Id::TLD4S, Type::Texture, "TLD4S"),
            INST("110111110110----", Id::TMML_B, Type::Texture, "TMML_B"),
            INST("1101111101011---", Id::TMML, Type::Texture, "TMML"),
--- a/src/video_core/morton.cpp
+++ b/src/video_core/morton.cpp
@@ -112,6 +112,7 @@ static constexpr ConversionArray morton_to_linear_fns = {
    MortonCopy<true, PixelFormat::ASTC_2D_8X6_SRGB>,
    MortonCopy<true, PixelFormat::ASTC_2D_6X5>,
    MortonCopy<true, PixelFormat::ASTC_2D_6X5_SRGB>,
+    MortonCopy<true, PixelFormat::E5B9G9R9F>,
    MortonCopy<true, PixelFormat::Z32F>,
    MortonCopy<true, PixelFormat::Z16>,
    MortonCopy<true, PixelFormat::Z24S8>,
@@ -192,6 +193,7 @@ static constexpr ConversionArray linear_to_morton_fns = {
    nullptr,
    nullptr,
    nullptr,
+    MortonCopy<false, PixelFormat::E5B9G9R9F>,
    MortonCopy<false, PixelFormat::Z32F>,
    MortonCopy<false, PixelFormat::Z16>,
    MortonCopy<false, PixelFormat::Z24S8>,
--- a/src/video_core/rasterizer_accelerated.cpp
+++ b/src/video_core/rasterizer_accelerated.cpp
@@ -0,0 +1,63 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <mutex>
+
+#include <boost/icl/interval_map.hpp>
+
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "core/memory.h"
+#include "video_core/rasterizer_accelerated.h"
+
+namespace VideoCore {
+
+namespace {
+
+template <typename Map, typename Interval>
+constexpr auto RangeFromInterval(Map& map, const Interval& interval) {
+    return boost::make_iterator_range(map.equal_range(interval)); // usable in a range-based for
+}
+
+} // Anonymous namespace
+
+RasterizerAccelerated::RasterizerAccelerated() = default;
+
+RasterizerAccelerated::~RasterizerAccelerated() = default;
+
+void RasterizerAccelerated::UpdatePagesCachedCount(VAddr addr, u64 size, int delta) {
+    std::lock_guard lock{pages_mutex};
+    // Page range [first_page, last_page) covering [addr, addr + size), with the end rounded up.
+    const u64 first_page{addr >> Memory::PAGE_BITS};
+    const u64 last_page{(addr + size + Memory::PAGE_SIZE - 1) >> Memory::PAGE_BITS};
+    const auto touched = CachedPageMap::interval_type::right_open(first_page, last_page);
+
+    // The map drops segments whose count hits 0, so increments are applied before the scan and
+    // decrements after it; this keeps every affected segment visible while iterating.
+    if (delta > 0) {
+        cached_pages.add({touched, delta});
+    }
+
+    for (const auto& entry : RangeFromInterval(cached_pages, touched)) {
+        const auto clipped = entry.first & touched;
+        const int count = entry.second;
+        const VAddr begin_addr = boost::icl::first(clipped) << Memory::PAGE_BITS;
+        const VAddr end_addr = boost::icl::last_next(clipped) << Memory::PAGE_BITS;
+        const u64 length = end_addr - begin_addr;
+
+        if (delta > 0 && count == delta) {
+            Memory::RasterizerMarkRegionCached(begin_addr, length, true);
+        } else if (delta < 0 && count == -delta) {
+            Memory::RasterizerMarkRegionCached(begin_addr, length, false);
+        } else {
+            ASSERT(count >= 0);
+        }
+    }
+
+    if (delta < 0) {
+        cached_pages.add({touched, delta});
+    }
+}
+
+} // namespace VideoCore
--- a/src/video_core/rasterizer_accelerated.h
+++ b/src/video_core/rasterizer_accelerated.h
@@ -0,0 +1,31 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <mutex>
+
+#include <boost/icl/interval_map.hpp>
+
+#include "common/common_types.h"
+#include "video_core/rasterizer_interface.h"
+
+namespace VideoCore {
+
+/// Implements the part of RasterizerInterface that is shared by GPU accelerated rasterizers.
+class RasterizerAccelerated : public RasterizerInterface {
+public:
+    explicit RasterizerAccelerated();
+    ~RasterizerAccelerated() override;
+
+    void UpdatePagesCachedCount(VAddr addr, u64 size, int delta) override; // adds delta per page
+
+private:
+    using CachedPageMap = boost::icl::interval_map<u64, int>; // page number -> cached count
+    CachedPageMap cached_pages;
+
+    std::mutex pages_mutex; // held for the whole of UpdatePagesCachedCount
+};
+
+} // namespace VideoCore
--- a/src/video_core/renderer_base.cpp
+++ b/src/video_core/renderer_base.cpp
@@ -40,4 +40,35 @@ void RendererBase::RequestScreenshot(void* data, std::function<void()> callback,
    renderer_settings.screenshot_requested = true;
 }

+f32 RendererBase::GetCurrentResultantBrightness() const {
+    return renderer_settings.current_brightness / 2.0f; // undo SetCurrentBrightness' 2x scaling
+}
+
+void RendererBase::SetBacklightStatus(bool enabled, u64 fade_transition_time) {
+    // A zero fade time is bumped to one so the renderer still recognizes that a change occurred.
+    const u64 fade_time = fade_transition_time == 0 ? 1 : fade_transition_time;
+    const bool is_lit = renderer_settings.current_brightness != 0;
+    if (enabled == is_lit) {
+        return; // Already in the requested state; settings stay untouched.
+    }
+    if (enabled) {
+        renderer_settings.current_brightness = current_brightness_backup;
+    } else {
+        current_brightness_backup = renderer_settings.current_brightness;
+        renderer_settings.current_brightness = 0;
+    }
+    renderer_settings.backlight_fade_time = fade_time;
+}
+
+bool RendererBase::GetBacklightStatus() const {
+    return renderer_settings.current_brightness != 0; // zero brightness means backlight off
+}
+
+void RendererBase::SetCurrentBrightness(f32 value) {
+    if (value * 2.0f != renderer_settings.current_brightness) { // stored brightness is 2x value
+        renderer_settings.current_brightness = value * 2.0f;
+        renderer_settings.backlight_fade_time = 1;
+    }
+}
+
 } // namespace VideoCore
--- a/src/video_core/renderer_base.h
+++ b/src/video_core/renderer_base.h
@@ -28,6 +28,10 @@ struct RendererSettings {
    void* screenshot_bits;
    std::function<void()> screenshot_complete_callback;
    Layout::FramebufferLayout screenshot_framebuffer_layout;
+
+    // Backlight & Brightness
+    std::atomic<f32> current_brightness{1.f};
+    std::atomic<u64> backlight_fade_time{0};
 };

 class RendererBase : NonCopyable {
@@ -86,6 +90,17 @@ public:
    void RequestScreenshot(void* data, std::function<void()> callback,
                           const Layout::FramebufferLayout& layout);

+    // Gets the brightness currently in effect, which may differ from the value that was last set.
+    // Most of the time for yuzu this will simply match what was set, but implementations are free
+    // to change the value in settings.
+    f32 GetCurrentResultantBrightness() const;
+
+    void SetBacklightStatus(bool enabled, u64 fade_transition_time);
+
+    bool GetBacklightStatus() const;
+
+    void SetCurrentBrightness(f32 value);
+
 protected:
    Core::Frontend::EmuWindow& render_window; ///< Reference to the render window handle.
    std::unique_ptr<RasterizerInterface> rasterizer;
@@ -97,6 +112,9 @@ protected:
 private:
    /// Updates the framebuffer layout of the contained render window handle.
    void UpdateCurrentFramebufferLayout();
+
+    // Brightness saved while the backlight is off; defaults to RendererSettings' initial value.
+    f32 current_brightness_backup{1.f};
 };

 } // namespace VideoCore
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -8,13 +8,17 @@

 #include "common/assert.h"
 #include "common/microprofile.h"
+#include "video_core/engines/maxwell_3d.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_opengl/gl_buffer_cache.h"
+#include "video_core/renderer_opengl/gl_device.h"
 #include "video_core/renderer_opengl/gl_rasterizer.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"

 namespace OpenGL {

+using Maxwell = Tegra::Engines::Maxwell3D::Regs;
+
 MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128));

 CachedBufferBlock::CachedBufferBlock(CacheAddr cache_addr, const std::size_t size)
@@ -26,11 +30,22 @@ CachedBufferBlock::CachedBufferBlock(CacheAddr cache_addr, const std::size_t siz
 CachedBufferBlock::~CachedBufferBlock() = default;

 OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
-                               std::size_t stream_size)
-    : VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>{
-          rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} {}
+                               const Device& device, std::size_t stream_size)
+    : GenericBufferCache{rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} {
+    if (!device.HasFastBufferSubData()) {
+        return;
+    }

-OGLBufferCache::~OGLBufferCache() = default;
+    static constexpr auto size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize);
+    glCreateBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs));
+    for (const GLuint cbuf : cbufs) {
+        glNamedBufferData(cbuf, size, nullptr, GL_STREAM_DRAW);
+    }
+}
+
+OGLBufferCache::~OGLBufferCache() {
+    glDeleteBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs));
+}

 Buffer OGLBufferCache::CreateBlock(CacheAddr cache_addr, std::size_t size) {
    return std::make_shared<CachedBufferBlock>(cache_addr, size);
@@ -69,4 +84,12 @@ void OGLBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t
                             static_cast<GLsizeiptr>(size));
 }

+OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer,
+                                                             std::size_t size) {
+    DEBUG_ASSERT(cbuf_cursor < std::size(cbufs));
+    const GLuint* const handle = &cbufs[cbuf_cursor++]; // hand out the next dedicated buffer
+    glNamedBufferSubData(*handle, 0, static_cast<GLsizeiptr>(size), raw_pointer);
+    return {handle, 0};
+}
+
 } // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -4,10 +4,12 @@

 #pragma once

+#include <array>
 #include <memory>

 #include "common/common_types.h"
 #include "video_core/buffer_cache/buffer_cache.h"
+#include "video_core/engines/maxwell_3d.h"
 #include "video_core/rasterizer_cache.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_stream_buffer.h"
@@ -18,12 +20,14 @@ class System;

 namespace OpenGL {

+class Device;
 class OGLStreamBuffer;
 class RasterizerOpenGL;

 class CachedBufferBlock;

 using Buffer = std::shared_ptr<CachedBufferBlock>;
+using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>;

 class CachedBufferBlock : public VideoCommon::BufferBlock {
 public:
@@ -38,14 +42,18 @@ private:
    OGLBuffer gl_buffer{};
 };

-class OGLBufferCache final : public VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer> {
+class OGLBufferCache final : public GenericBufferCache {
 public:
    explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
-                            std::size_t stream_size);
+                            const Device& device, std::size_t stream_size);
    ~OGLBufferCache();

    const GLuint* GetEmptyBuffer(std::size_t) override;

+    void Acquire() noexcept {
+        cbuf_cursor = 0;
+    }
+
 protected:
    Buffer CreateBlock(CacheAddr cache_addr, std::size_t size) override;

@@ -61,6 +69,14 @@ protected:

    void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
                   std::size_t dst_offset, std::size_t size) override;
+
+    BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) override;
+
+private:
+    std::size_t cbuf_cursor = 0; // next free entry of cbufs, rewound by Acquire()
+    std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
+                           Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram>
+        cbufs{}; // zeroed: the destructor deletes these even when they were never created
 };

 } // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -51,8 +51,11 @@ bool HasExtension(const std::vector<std::string_view>& images, std::string_view
 } // Anonymous namespace

 Device::Device() {
+    const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
    const std::vector extensions = GetExtensions();

+    const bool is_nvidia = vendor == "NVIDIA Corporation";
+
    uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);
    shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
    max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS);
@@ -64,6 +67,7 @@ Device::Device() {
    has_variable_aoffi = TestVariableAoffi();
    has_component_indexing_bug = TestComponentIndexingBug();
    has_precise_bug = TestPreciseBug();
+    has_fast_buffer_sub_data = is_nvidia;

    LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi);
    LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug);
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -54,6 +54,10 @@ public:
        return has_precise_bug;
    }

+    bool HasFastBufferSubData() const {
+        return has_fast_buffer_sub_data;
+    }
+
 private:
    static bool TestVariableAoffi();
    static bool TestComponentIndexingBug();
@@ -69,6 +73,7 @@ private:
    bool has_variable_aoffi{};
    bool has_component_indexing_bug{};
    bool has_precise_bug{};
+    bool has_fast_buffer_sub_data{};
 };

 } // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -67,9 +67,7 @@ static std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buf
 RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
                                   ScreenInfo& info)
    : texture_cache{system, *this, device}, shader_cache{*this, system, emu_window, device},
-      system{system}, screen_info{info}, buffer_cache{*this, system, STREAM_BUFFER_SIZE} {
-    OpenGLState::ApplyDefaultState();
-
+      system{system}, screen_info{info}, buffer_cache{*this, system, device, STREAM_BUFFER_SIZE} {
    shader_program_manager = std::make_unique<GLShader::ProgramManager>();
    state.draw.shader_program = 0;
    state.Apply();
@@ -259,10 +257,8 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
            continue;
        }

-        const std::size_t stage{index == 0 ? 0 : index - 1}; // Stage indices are 0 - 5
-
        GLShader::MaxwellUniformData ubo{};
-        ubo.SetFromRegs(gpu, stage);
+        ubo.SetFromRegs(gpu);
        const auto [buffer, offset] =
            buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment());

@@ -271,10 +267,11 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {

        Shader shader{shader_cache.GetStageProgram(program)};

-        const auto stage_enum = static_cast<Maxwell::ShaderStage>(stage);
-        SetupDrawConstBuffers(stage_enum, shader);
-        SetupDrawGlobalMemory(stage_enum, shader);
-        const auto texture_buffer_usage{SetupDrawTextures(stage_enum, shader, base_bindings)};
+        // Stage indices are 0 - 5
+        const auto stage = static_cast<Maxwell::ShaderStage>(index == 0 ? 0 : index - 1);
+        SetupDrawConstBuffers(stage, shader);
+        SetupDrawGlobalMemory(stage, shader);
+        const auto texture_buffer_usage{SetupDrawTextures(stage, shader, base_bindings)};

        const ProgramVariant variant{base_bindings, primitive_mode, texture_buffer_usage};
        const auto [program_handle, next_bindings] = shader->GetProgramHandle(variant);
@@ -342,42 +339,6 @@ std::size_t RasterizerOpenGL::CalculateIndexBufferSize() const {
           static_cast<std::size_t>(regs.index_array.FormatSizeInBytes());
 }

-template <typename Map, typename Interval>
-static constexpr auto RangeFromInterval(Map& map, const Interval& interval) {
-    return boost::make_iterator_range(map.equal_range(interval));
-}
-
-void RasterizerOpenGL::UpdatePagesCachedCount(VAddr addr, u64 size, int delta) {
-    std::lock_guard lock{pages_mutex};
-    const u64 page_start{addr >> Memory::PAGE_BITS};
-    const u64 page_end{(addr + size + Memory::PAGE_SIZE - 1) >> Memory::PAGE_BITS};
-
-    // Interval maps will erase segments if count reaches 0, so if delta is negative we have to
-    // subtract after iterating
-    const auto pages_interval = CachedPageMap::interval_type::right_open(page_start, page_end);
-    if (delta > 0)
-        cached_pages.add({pages_interval, delta});
-
-    for (const auto& pair : RangeFromInterval(cached_pages, pages_interval)) {
-        const auto interval = pair.first & pages_interval;
-        const int count = pair.second;
-
-        const VAddr interval_start_addr = boost::icl::first(interval) << Memory::PAGE_BITS;
-        const VAddr interval_end_addr = boost::icl::last_next(interval) << Memory::PAGE_BITS;
-        const u64 interval_size = interval_end_addr - interval_start_addr;
-
-        if (delta > 0 && count == delta)
-            Memory::RasterizerMarkRegionCached(interval_start_addr, interval_size, true);
-        else if (delta < 0 && count == -delta)
-            Memory::RasterizerMarkRegionCached(interval_start_addr, interval_size, false);
-        else
-            ASSERT(count >= 0);
-    }
-
-    if (delta < 0)
-        cached_pages.add({pages_interval, delta});
-}
-
 void RasterizerOpenGL::LoadDiskResources(const std::atomic_bool& stop_loading,
                                         const VideoCore::DiskResourceLoadCallback& callback) {
    shader_cache.LoadDiskCache(stop_loading, callback);
@@ -596,6 +557,8 @@ void RasterizerOpenGL::DrawPrelude() {
    SyncPolygonOffset();
    SyncAlphaTest();

+    buffer_cache.Acquire();
+
    // Draw the vertex batch
    const bool is_indexed = accelerate_draw == AccelDraw::Indexed;

@@ -917,7 +880,8 @@ void RasterizerOpenGL::SetupConstBuffer(const Tegra::Engines::ConstBufferInfo& b
    const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4));

    const auto alignment = device.GetUniformBufferAlignment();
-    const auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment);
+    const auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment, false,
+                                                          device.HasFastBufferSubData());
    bind_ubo_pushbuffer.Push(cbuf, offset, size);
 }

@@ -969,14 +933,15 @@ TextureBufferUsage RasterizerOpenGL::SetupDrawTextures(Maxwell::ShaderStage stag

    for (u32 bindpoint = 0; bindpoint < entries.size(); ++bindpoint) {
        const auto& entry = entries[bindpoint];
-        const auto texture = [&]() {
+        const auto texture = [&] {
            if (!entry.IsBindless()) {
                return maxwell3d.GetStageTexture(stage, entry.GetOffset());
            }
            const auto cbuf = entry.GetBindlessCBuf();
            Tegra::Texture::TextureHandle tex_handle;
-            tex_handle.raw = maxwell3d.AccessConstBuffer32(stage, cbuf.first, cbuf.second);
-            return maxwell3d.GetTextureInfo(tex_handle, entry.GetOffset());
+            Tegra::Engines::ShaderType shader_type = static_cast<Tegra::Engines::ShaderType>(stage);
+            tex_handle.raw = maxwell3d.AccessConstBuffer32(shader_type, cbuf.first, cbuf.second);
+            return maxwell3d.GetTextureInfo(tex_handle);
        }();

        if (SetupTexture(base_bindings.sampler + bindpoint, texture, entry)) {
@@ -999,14 +964,15 @@ TextureBufferUsage RasterizerOpenGL::SetupComputeTextures(const Shader& kernel)

    for (u32 bindpoint = 0; bindpoint < entries.size(); ++bindpoint) {
        const auto& entry = entries[bindpoint];
-        const auto texture = [&]() {
+        const auto texture = [&] {
            if (!entry.IsBindless()) {
                return compute.GetTexture(entry.GetOffset());
            }
            const auto cbuf = entry.GetBindlessCBuf();
            Tegra::Texture::TextureHandle tex_handle;
-            tex_handle.raw = compute.AccessConstBuffer32(cbuf.first, cbuf.second);
-            return compute.GetTextureInfo(tex_handle, entry.GetOffset());
+            tex_handle.raw = compute.AccessConstBuffer32(Tegra::Engines::ShaderType::Compute,
+                                                         cbuf.first, cbuf.second);
+            return compute.GetTextureInfo(tex_handle);
        }();

        if (SetupTexture(bindpoint, texture, entry)) {
@@ -1044,14 +1010,15 @@ void RasterizerOpenGL::SetupComputeImages(const Shader& shader) {
    const auto& entries = shader->GetShaderEntries().images;
    for (u32 bindpoint = 0; bindpoint < entries.size(); ++bindpoint) {
        const auto& entry = entries[bindpoint];
-        const auto tic = [&]() {
+        const auto tic = [&] {
            if (!entry.IsBindless()) {
                return compute.GetTexture(entry.GetOffset()).tic;
            }
            const auto cbuf = entry.GetBindlessCBuf();
            Tegra::Texture::TextureHandle tex_handle;
-            tex_handle.raw = compute.AccessConstBuffer32(cbuf.first, cbuf.second);
-            return compute.GetTextureInfo(tex_handle, entry.GetOffset()).tic;
+            tex_handle.raw = compute.AccessConstBuffer32(Tegra::Engines::ShaderType::Compute,
+                                                         cbuf.first, cbuf.second);
+            return compute.GetTextureInfo(tex_handle).tic;
        }();
        SetupImage(bindpoint, tic, entry);
    }
@@ -1092,6 +1059,15 @@ void RasterizerOpenGL::SyncViewport(OpenGLState& current_state) {
    }
    state.depth_clamp.far_plane = regs.view_volume_clip_control.depth_clamp_far != 0;
    state.depth_clamp.near_plane = regs.view_volume_clip_control.depth_clamp_near != 0;
+
+    bool flip_y = false;
+    if (regs.viewport_transform[0].scale_y < 0.0) {
+        flip_y = !flip_y;
+    }
+    if (regs.screen_y_control.y_negate != 0) {
+        flip_y = !flip_y;
+    }
+    state.clip_control.origin = flip_y ? GL_UPPER_LEFT : GL_LOWER_LEFT;
 }

 void RasterizerOpenGL::SyncClipEnabled(
@@ -1114,26 +1090,24 @@ void RasterizerOpenGL::SyncClipCoef() {
 }

 void RasterizerOpenGL::SyncCullMode() {
-    auto& maxwell3d = system.GPU().Maxwell3D();
-
-    const auto& regs = maxwell3d.regs;
+    const auto& regs = system.GPU().Maxwell3D().regs;

    state.cull.enabled = regs.cull.enabled != 0;
    if (state.cull.enabled) {
-        state.cull.front_face = MaxwellToGL::FrontFace(regs.cull.front_face);
        state.cull.mode = MaxwellToGL::CullFace(regs.cull.cull_face);
+    }

-        const bool flip_triangles{regs.screen_y_control.triangle_rast_flip == 0 ||
-                                  regs.viewport_transform[0].scale_y < 0.0f};
+    state.cull.front_face = MaxwellToGL::FrontFace(regs.cull.front_face);

-        // If the GPU is configured to flip the rasterized triangles, then we need to flip the
-        // notion of front and back. Note: We flip the triangles when the value of the register is 0
-        // because OpenGL already does it for us.
-        if (flip_triangles) {
-            if (state.cull.front_face == GL_CCW)
-                state.cull.front_face = GL_CW;
-            else if (state.cull.front_face == GL_CW)
-                state.cull.front_face = GL_CCW;
+    // If the GPU is configured to flip the rasterized triangles, then we need to flip the
+    // notion of front and back.
+    const bool flip_triangles{regs.screen_y_control.triangle_rast_flip != 0 &&
+                              regs.viewport_transform[0].scale_y > 0.0f};
+    if (flip_triangles) {
+        if (state.cull.front_face == GL_CCW) {
+            state.cull.front_face = GL_CW;
+        } else if (state.cull.front_face == GL_CW) {
+            state.cull.front_face = GL_CCW;
        }
    }
 }
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -9,17 +9,16 @@
 #include <cstddef>
 #include <map>
 #include <memory>
-#include <mutex>
 #include <optional>
 #include <tuple>
 #include <utility>

-#include <boost/icl/interval_map.hpp>
 #include <glad/glad.h>

 #include "common/common_types.h"
 #include "video_core/engines/const_buffer_info.h"
 #include "video_core/engines/maxwell_3d.h"
+#include "video_core/rasterizer_accelerated.h"
 #include "video_core/rasterizer_cache.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_opengl/gl_buffer_cache.h"
@@ -52,7 +51,7 @@ namespace OpenGL {
 struct ScreenInfo;
 struct DrawParameters;

-class RasterizerOpenGL : public VideoCore::RasterizerInterface {
+class RasterizerOpenGL : public VideoCore::RasterizerAccelerated {
 public:
    explicit RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
                              ScreenInfo& info);
@@ -73,7 +72,6 @@ public:
                               const Tegra::Engines::Fermi2D::Config& copy_config) override;
    bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr,
                           u32 pixel_stride) override;
-    void UpdatePagesCachedCount(VAddr addr, u64 size, int delta) override;
    void LoadDiskResources(const std::atomic_bool& stop_loading,
                           const VideoCore::DiskResourceLoadCallback& callback) override;

@@ -228,11 +226,6 @@ private:
    AccelDraw accelerate_draw = AccelDraw::Disabled;

    OGLFramebuffer clear_framebuffer;
-
-    using CachedPageMap = boost::icl::interval_map<u64, int>;
-    CachedPageMap cached_pages;
-
-    std::mutex pages_mutex;
 };

 } // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -3,13 +3,16 @@
 // Refer to the license.txt file included.

 #include <mutex>
+#include <optional>
+#include <string>
 #include <thread>
+#include <unordered_set>
 #include <boost/functional/hash.hpp>
 #include "common/assert.h"
-#include "common/hash.h"
 #include "common/scope_exit.h"
 #include "core/core.h"
 #include "core/frontend/emu_window.h"
+#include "video_core/engines/kepler_compute.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/memory_manager.h"
 #include "video_core/renderer_opengl/gl_rasterizer.h"
@@ -21,18 +24,20 @@

 namespace OpenGL {

+using Tegra::Engines::ShaderType;
+using VideoCommon::Shader::ConstBufferLocker;
 using VideoCommon::Shader::ProgramCode;
+using VideoCommon::Shader::ShaderIR;
+
+namespace {

 // One UBO is always reserved for emulation values on staged shaders
 constexpr u32 STAGE_RESERVED_UBOS = 1;

-struct UnspecializedShader {
-    std::string code;
-    GLShader::ShaderEntries entries;
-    ProgramType program_type;
-};
+constexpr u32 STAGE_MAIN_OFFSET = 10;
+constexpr u32 KERNEL_MAIN_OFFSET = 0;

-namespace {
+constexpr VideoCommon::Shader::CompilerSettings COMPILER_SETTINGS{};

 /// Gets the address for the specified shader stage program
 GPUVAddr GetShaderAddress(Core::System& system, Maxwell::ShaderProgram program) {
@@ -41,6 +46,39 @@ GPUVAddr GetShaderAddress(Core::System& system, Maxwell::ShaderProgram program)
    return gpu.regs.code_address.CodeAddress() + shader_config.offset;
 }

+/// Returns whether the instruction at the given offset is a scheduler instruction
+constexpr bool IsSchedInstruction(std::size_t offset, std::size_t main_offset) {
+    // Sched instructions appear once every 4 instructions.
+    constexpr std::size_t SchedPeriod = 4;
+    const std::size_t absolute_offset = offset - main_offset;
+    return (absolute_offset % SchedPeriod) == 0;
+}
+
+/// Calculates the size of a program stream
+std::size_t CalculateProgramSize(const GLShader::ProgramCode& program) {
+    constexpr std::size_t start_offset = 10;
+    // This is the encoded version of BRA that jumps to itself. All Nvidia
+    // shaders end with one.
+    constexpr u64 self_jumping_branch = 0xE2400FFFFF07000FULL;
+    constexpr u64 mask = 0xFFFFFFFFFF7FFFFFULL;
+    std::size_t offset = start_offset;
+    while (offset < program.size()) {
+        const u64 instruction = program[offset];
+        if (!IsSchedInstruction(offset, start_offset)) {
+            if ((instruction & mask) == self_jumping_branch) {
+                // Stop at the self-jumping BRA that terminates every program
+                break;
+            }
+            if (instruction == 0) {
+                break;
+            }
+        }
+        offset++;
+    }
+    // The last instruction is included in the program size
+    return std::min(offset + 1, program.size());
+}
+
 /// Gets the shader program code from memory for the specified address
 ProgramCode GetShaderCode(Tegra::MemoryManager& memory_manager, const GPUVAddr gpu_addr,
                          const u8* host_ptr) {
@@ -51,6 +89,7 @@ ProgramCode GetShaderCode(Tegra::MemoryManager& memory_manager, const GPUVAddr g
    });
    memory_manager.ReadBlockUnsafe(gpu_addr, program_code.data(),
                                   program_code.size() * sizeof(u64));
+    program_code.resize(CalculateProgramSize(program_code));
    return program_code;
 }

@@ -71,14 +110,6 @@ constexpr GLenum GetShaderType(ProgramType program_type) {
    }
 }

-/// Gets if the current instruction offset is a scheduler instruction
-constexpr bool IsSchedInstruction(std::size_t offset, std::size_t main_offset) {
-    // Sched instructions appear once every 4 instructions.
-    constexpr std::size_t SchedPeriod = 4;
-    const std::size_t absolute_offset = offset - main_offset;
-    return (absolute_offset % SchedPeriod) == 0;
-}
-
 /// Describes primitive behavior on geometry shaders
 constexpr std::tuple<const char*, const char*, u32> GetPrimitiveDescription(GLenum primitive_mode) {
    switch (primitive_mode) {
@@ -121,110 +152,142 @@ ProgramType GetProgramType(Maxwell::ShaderProgram program) {
    return {};
 }

-/// Calculates the size of a program stream
-std::size_t CalculateProgramSize(const GLShader::ProgramCode& program) {
-    constexpr std::size_t start_offset = 10;
-    // This is the encoded version of BRA that jumps to itself. All Nvidia
-    // shaders end with one.
-    constexpr u64 self_jumping_branch = 0xE2400FFFFF07000FULL;
-    constexpr u64 mask = 0xFFFFFFFFFF7FFFFFULL;
-    std::size_t offset = start_offset;
-    std::size_t size = start_offset * sizeof(u64);
-    while (offset < program.size()) {
-        const u64 instruction = program[offset];
-        if (!IsSchedInstruction(offset, start_offset)) {
-            if ((instruction & mask) == self_jumping_branch) {
-                // End on Maxwell's "nop" instruction
-                break;
-            }
-            if (instruction == 0) {
-                break;
-            }
-        }
-        size += sizeof(u64);
-        offset++;
-    }
-    // The last instruction is included in the program size
-    return std::min(size + sizeof(u64), program.size() * sizeof(u64));
-}
-
 /// Hashes one (or two) program streams
 u64 GetUniqueIdentifier(ProgramType program_type, const ProgramCode& code,
-                        const ProgramCode& code_b, std::size_t size_a = 0, std::size_t size_b = 0) {
-    if (size_a == 0) {
-        size_a = CalculateProgramSize(code);
+                        const ProgramCode& code_b) {
+    u64 unique_identifier = boost::hash_value(code);
+    if (program_type == ProgramType::VertexA) {
+        // VertexA programs include two programs
+        boost::hash_combine(unique_identifier, boost::hash_value(code_b));
    }
-    u64 unique_identifier = Common::CityHash64(reinterpret_cast<const char*>(code.data()), size_a);
-    if (program_type != ProgramType::VertexA) {
-        return unique_identifier;
-    }
-    // VertexA programs include two programs
-
-    std::size_t seed = 0;
-    boost::hash_combine(seed, unique_identifier);
-
-    if (size_b == 0) {
-        size_b = CalculateProgramSize(code_b);
-    }
-    const u64 identifier_b =
-        Common::CityHash64(reinterpret_cast<const char*>(code_b.data()), size_b);
-    boost::hash_combine(seed, identifier_b);
-    return static_cast<u64>(seed);
+    return unique_identifier;
 }

 /// Creates an unspecialized program from code streams
-GLShader::ProgramResult CreateProgram(const Device& device, ProgramType program_type,
-                                      ProgramCode program_code, ProgramCode program_code_b) {
-    GLShader::ShaderSetup setup(program_code);
-    setup.program.size_a = CalculateProgramSize(program_code);
-    setup.program.size_b = 0;
-    if (program_type == ProgramType::VertexA) {
-        // VertexB is always enabled, so when VertexA is enabled, we have two vertex shaders.
-        // Conventional HW does not support this, so we combine VertexA and VertexB into one
-        // stage here.
-        setup.SetProgramB(program_code_b);
-        setup.program.size_b = CalculateProgramSize(program_code_b);
-    }
-    setup.program.unique_identifier = GetUniqueIdentifier(
-        program_type, program_code, program_code_b, setup.program.size_a, setup.program.size_b);
-
+std::string GenerateGLSL(const Device& device, ProgramType program_type, const ShaderIR& ir,
+                         const std::optional<ShaderIR>& ir_b) {
    switch (program_type) {
    case ProgramType::VertexA:
    case ProgramType::VertexB:
-        return GLShader::GenerateVertexShader(device, setup);
+        return GLShader::GenerateVertexShader(device, ir, ir_b ? &*ir_b : nullptr);
    case ProgramType::Geometry:
-        return GLShader::GenerateGeometryShader(device, setup);
+        return GLShader::GenerateGeometryShader(device, ir);
    case ProgramType::Fragment:
-        return GLShader::GenerateFragmentShader(device, setup);
+        return GLShader::GenerateFragmentShader(device, ir);
    case ProgramType::Compute:
-        return GLShader::GenerateComputeShader(device, setup);
+        return GLShader::GenerateComputeShader(device, ir);
    default:
        UNIMPLEMENTED_MSG("Unimplemented program_type={}", static_cast<u32>(program_type));
        return {};
    }
 }

-CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEntries& entries,
-                               ProgramType program_type, const ProgramVariant& variant,
-                               bool hint_retrievable = false) {
+constexpr const char* GetProgramTypeName(ProgramType program_type) {
+    switch (program_type) {
+    case ProgramType::VertexA:
+    case ProgramType::VertexB:
+        return "VS";
+    case ProgramType::TessellationControl:
+        return "TCS";
+    case ProgramType::TessellationEval:
+        return "TES";
+    case ProgramType::Geometry:
+        return "GS";
+    case ProgramType::Fragment:
+        return "FS";
+    case ProgramType::Compute:
+        return "CS";
+    }
+    return "UNK";
+}
+
+Tegra::Engines::ShaderType GetEnginesShaderType(ProgramType program_type) {
+    switch (program_type) {
+    case ProgramType::VertexA:
+    case ProgramType::VertexB:
+        return Tegra::Engines::ShaderType::Vertex;
+    case ProgramType::TessellationControl:
+        return Tegra::Engines::ShaderType::TesselationControl;
+    case ProgramType::TessellationEval:
+        return Tegra::Engines::ShaderType::TesselationEval;
+    case ProgramType::Geometry:
+        return Tegra::Engines::ShaderType::Geometry;
+    case ProgramType::Fragment:
+        return Tegra::Engines::ShaderType::Fragment;
+    case ProgramType::Compute:
+        return Tegra::Engines::ShaderType::Compute;
+    }
+    UNREACHABLE();
+    return {};
+}
+
+std::string GetShaderId(u64 unique_identifier, ProgramType program_type) {
+    return fmt::format("{}{:016X}", GetProgramTypeName(program_type), unique_identifier);
+}
+
+Tegra::Engines::ConstBufferEngineInterface& GetConstBufferEngineInterface(
+    Core::System& system, ProgramType program_type) {
+    if (program_type == ProgramType::Compute) {
+        return system.GPU().KeplerCompute();
+    } else {
+        return system.GPU().Maxwell3D();
+    }
+}
+
+std::unique_ptr<ConstBufferLocker> MakeLocker(Core::System& system, ProgramType program_type) {
+    return std::make_unique<ConstBufferLocker>(GetEnginesShaderType(program_type),
+                                               GetConstBufferEngineInterface(system, program_type));
+}
+
+void FillLocker(ConstBufferLocker& locker, const ShaderDiskCacheUsage& usage) {
+    for (const auto& key : usage.keys) {
+        const auto [buffer, offset] = key.first;
+        locker.InsertKey(buffer, offset, key.second);
+    }
+    for (const auto& [offset, sampler] : usage.bound_samplers) {
+        locker.InsertBoundSampler(offset, sampler);
+    }
+    for (const auto& [key, sampler] : usage.bindless_samplers) {
+        const auto [buffer, offset] = key;
+        locker.InsertBindlessSampler(buffer, offset, sampler);
+    }
+}
+
+CachedProgram BuildShader(const Device& device, u64 unique_identifier, ProgramType program_type,
+                          const ProgramCode& program_code, const ProgramCode& program_code_b,
+                          const ProgramVariant& variant, ConstBufferLocker& locker,
+                          bool hint_retrievable = false) {
+    LOG_INFO(Render_OpenGL, "called. {}", GetShaderId(unique_identifier, program_type));
+
+    const bool is_compute = program_type == ProgramType::Compute;
+    const u32 main_offset = is_compute ? KERNEL_MAIN_OFFSET : STAGE_MAIN_OFFSET;
+    const ShaderIR ir(program_code, main_offset, COMPILER_SETTINGS, locker);
+    std::optional<ShaderIR> ir_b;
+    if (!program_code_b.empty()) {
+        ir_b.emplace(program_code_b, main_offset, COMPILER_SETTINGS, locker);
+    }
+    const auto entries = GLShader::GetEntries(ir);
+
    auto base_bindings{variant.base_bindings};
    const auto primitive_mode{variant.primitive_mode};
    const auto texture_buffer_usage{variant.texture_buffer_usage};

-    std::string source = R"(#version 430 core
+    std::string source = fmt::format(R"(// {}
+#version 430 core
 #extension GL_ARB_separate_shader_objects : enable
 #extension GL_ARB_shader_viewport_layer_array : enable
 #extension GL_EXT_shader_image_load_formatted : enable
 #extension GL_NV_gpu_shader5 : enable
 #extension GL_NV_shader_thread_group : enable
 #extension GL_NV_shader_thread_shuffle : enable
-)";
-    if (program_type == ProgramType::Compute) {
+)",
+                                     GetShaderId(unique_identifier, program_type));
+    if (is_compute) {
        source += "#extension GL_ARB_compute_variable_group_size : require\n";
    }
    source += '\n';

-    if (program_type != ProgramType::Compute) {
+    if (!is_compute) {
        source += fmt::format("#define EMULATION_UBO_BINDING {}\n", base_bindings.cbuf++);
    }

@@ -268,7 +331,7 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn
    }

    source += '\n';
-    source += code;
+    source += GenerateGLSL(device, program_type, ir, ir_b);

    OGLShader shader;
    shader.Create(source.c_str(), GetShaderType(program_type));
@@ -278,85 +341,97 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn
    return program;
 }

-std::set<GLenum> GetSupportedFormats() {
-    std::set<GLenum> supported_formats;
-
+std::unordered_set<GLenum> GetSupportedFormats() {
    GLint num_formats{};
    glGetIntegerv(GL_NUM_PROGRAM_BINARY_FORMATS, &num_formats);

    std::vector<GLint> formats(num_formats);
    glGetIntegerv(GL_PROGRAM_BINARY_FORMATS, formats.data());

-    for (const GLint format : formats)
+    std::unordered_set<GLenum> supported_formats;
+    for (const GLint format : formats) {
        supported_formats.insert(static_cast<GLenum>(format));
+    }
    return supported_formats;
 }

 } // Anonymous namespace

 CachedShader::CachedShader(const ShaderParameters& params, ProgramType program_type,
-                           GLShader::ProgramResult result)
-    : RasterizerCacheObject{params.host_ptr}, cpu_addr{params.cpu_addr},
-      unique_identifier{params.unique_identifier}, program_type{program_type},
-      disk_cache{params.disk_cache}, precompiled_programs{params.precompiled_programs},
-      entries{result.second}, code{std::move(result.first)}, shader_length{entries.shader_length} {}
+                           GLShader::ShaderEntries entries, ProgramCode program_code,
+                           ProgramCode program_code_b)
+    : RasterizerCacheObject{params.host_ptr}, system{params.system},
+      disk_cache{params.disk_cache}, device{params.device}, cpu_addr{params.cpu_addr},
+      unique_identifier{params.unique_identifier}, program_type{program_type}, entries{entries},
+      program_code{std::move(program_code)}, program_code_b{std::move(program_code_b)} {
+    if (!params.precompiled_variants) {
+        return;
+    }
+    for (const auto& pair : *params.precompiled_variants) {
+        auto locker = MakeLocker(system, program_type);
+        const auto& usage = pair->first;
+        FillLocker(*locker, usage);
+
+        std::unique_ptr<LockerVariant>* locker_variant = nullptr;
+        const auto it =
+            std::find_if(locker_variants.begin(), locker_variants.end(), [&](const auto& variant) {
+                return variant->locker->HasEqualKeys(*locker);
+            });
+        if (it == locker_variants.end()) {
+            locker_variant = &locker_variants.emplace_back();
+            *locker_variant = std::make_unique<LockerVariant>();
+            locker_variant->get()->locker = std::move(locker);
+        } else {
+            locker_variant = &*it;
+        }
+        locker_variant->get()->programs.emplace(usage.variant, pair->second);
+    }
+}

 Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params,
                                           Maxwell::ShaderProgram program_type,
-                                           ProgramCode&& program_code,
-                                           ProgramCode&& program_code_b) {
-    const auto code_size{CalculateProgramSize(program_code)};
-    const auto code_size_b{CalculateProgramSize(program_code_b)};
-    auto result{
-        CreateProgram(params.device, GetProgramType(program_type), program_code, program_code_b)};
-    if (result.first.empty()) {
-        // TODO(Rodrigo): Unimplemented shader stages hit here, avoid using these for now
-        return {};
-    }
-
+                                           ProgramCode program_code, ProgramCode program_code_b) {
    params.disk_cache.SaveRaw(ShaderDiskCacheRaw(
-        params.unique_identifier, GetProgramType(program_type),
-        static_cast<u32>(code_size / sizeof(u64)), static_cast<u32>(code_size_b / sizeof(u64)),
-        std::move(program_code), std::move(program_code_b)));
+        params.unique_identifier, GetProgramType(program_type), program_code, program_code_b));

+    ConstBufferLocker locker(GetEnginesShaderType(GetProgramType(program_type)));
+    const ShaderIR ir(program_code, STAGE_MAIN_OFFSET, COMPILER_SETTINGS, locker);
+    // TODO(Rodrigo): Handle VertexA shaders
+    // std::optional<ShaderIR> ir_b;
+    // if (!program_code_b.empty()) {
+    //     ir_b.emplace(program_code_b, STAGE_MAIN_OFFSET);
+    // }
    return std::shared_ptr<CachedShader>(
-        new CachedShader(params, GetProgramType(program_type), std::move(result)));
+        new CachedShader(params, GetProgramType(program_type), GLShader::GetEntries(ir),
+                         std::move(program_code), std::move(program_code_b)));
 }

-Shader CachedShader::CreateStageFromCache(const ShaderParameters& params,
-                                          Maxwell::ShaderProgram program_type,
-                                          GLShader::ProgramResult result) {
-    return std::shared_ptr<CachedShader>(
-        new CachedShader(params, GetProgramType(program_type), std::move(result)));
+Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code) {
+    params.disk_cache.SaveRaw(
+        ShaderDiskCacheRaw(params.unique_identifier, ProgramType::Compute, code));
+
+    ConstBufferLocker locker(Tegra::Engines::ShaderType::Compute);
+    const ShaderIR ir(code, KERNEL_MAIN_OFFSET, COMPILER_SETTINGS, locker);
+    return std::shared_ptr<CachedShader>(new CachedShader(
+        params, ProgramType::Compute, GLShader::GetEntries(ir), std::move(code), {}));
 }

-Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode&& code) {
-    auto result{CreateProgram(params.device, ProgramType::Compute, code, {})};
-
-    const auto code_size{CalculateProgramSize(code)};
-    params.disk_cache.SaveRaw(ShaderDiskCacheRaw(params.unique_identifier, ProgramType::Compute,
-                                                 static_cast<u32>(code_size / sizeof(u64)), 0,
-                                                 std::move(code), {}));
-
-    return std::shared_ptr<CachedShader>(
-        new CachedShader(params, ProgramType::Compute, std::move(result)));
-}
-
-Shader CachedShader::CreateKernelFromCache(const ShaderParameters& params,
-                                           GLShader::ProgramResult result) {
-    return std::shared_ptr<CachedShader>(
-        new CachedShader(params, ProgramType::Compute, std::move(result)));
+Shader CachedShader::CreateFromCache(const ShaderParameters& params,
+                                     const UnspecializedShader& unspecialized) {
+    return std::shared_ptr<CachedShader>(new CachedShader(params, unspecialized.program_type,
+                                                          unspecialized.entries, unspecialized.code,
+                                                          unspecialized.code_b));
 }

 std::tuple<GLuint, BaseBindings> CachedShader::GetProgramHandle(const ProgramVariant& variant) {
-    const auto [entry, is_cache_miss] = programs.try_emplace(variant);
+    UpdateVariant();
+
+    const auto [entry, is_cache_miss] = curr_variant->programs.try_emplace(variant);
    auto& program = entry->second;
    if (is_cache_miss) {
-        program = TryLoadProgram(variant);
-        if (!program) {
-            program = SpecializeShader(code, entries, program_type, variant);
-            disk_cache.SaveUsage(GetUsage(variant));
-        }
+        program = BuildShader(device, unique_identifier, program_type, program_code, program_code_b,
+                              variant, *curr_variant->locker);
+        disk_cache.SaveUsage(GetUsage(variant, *curr_variant->locker));

        LabelGLObject(GL_PROGRAM, program->handle, cpu_addr);
    }
@@ -372,18 +447,33 @@ std::tuple<GLuint, BaseBindings> CachedShader::GetProgramHandle(const ProgramVar
    return {program->handle, base_bindings};
 }

-CachedProgram CachedShader::TryLoadProgram(const ProgramVariant& variant) const {
-    const auto found = precompiled_programs.find(GetUsage(variant));
-    if (found == precompiled_programs.end()) {
-        return {};
+void CachedShader::UpdateVariant() {
+    if (curr_variant && !curr_variant->locker->IsConsistent()) {
+        curr_variant = nullptr;
+    }
+    if (!curr_variant) {
+        for (auto& variant : locker_variants) {
+            if (variant->locker->IsConsistent()) {
+                curr_variant = variant.get();
+            }
+        }
+    }
+    if (!curr_variant) {
+        auto& new_variant = locker_variants.emplace_back();
+        new_variant = std::make_unique<LockerVariant>();
+        new_variant->locker = MakeLocker(system, program_type);
+        curr_variant = new_variant.get();
    }
-    return found->second;
 }

-ShaderDiskCacheUsage CachedShader::GetUsage(const ProgramVariant& variant) const {
+ShaderDiskCacheUsage CachedShader::GetUsage(const ProgramVariant& variant,
+                                            const ConstBufferLocker& locker) const {
    ShaderDiskCacheUsage usage;
    usage.unique_identifier = unique_identifier;
    usage.variant = variant;
+    usage.keys = locker.GetKeys();
+    usage.bound_samplers = locker.GetBoundSamplers();
+    usage.bindless_samplers = locker.GetBindlessSamplers();
    return usage;
 }

@@ -399,18 +489,15 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
        return;
    }
    const auto [raws, shader_usages] = *transferable;
-
-    auto [decompiled, dumps] = disk_cache.LoadPrecompiled();
-
-    const auto supported_formats{GetSupportedFormats()};
-    const auto unspecialized_shaders{
-        GenerateUnspecializedShaders(stop_loading, callback, raws, decompiled)};
-    if (stop_loading) {
+    if (!GenerateUnspecializedShaders(stop_loading, callback, raws) || stop_loading) {
        return;
    }

-    // Track if precompiled cache was altered during loading to know if we have to serialize the
-    // virtual precompiled cache file back to the hard drive
+    const auto dumps = disk_cache.LoadPrecompiled();
+    const auto supported_formats = GetSupportedFormats();
+
+    // Track if precompiled cache was altered during loading to know if we have to
+    // serialize the virtual precompiled cache file back to the hard drive
    bool precompiled_cache_altered = false;

    // Inform the frontend about shader build initialization
@@ -433,9 +520,6 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
                return;
            }
            const auto& usage{shader_usages[i]};
-            LOG_INFO(Render_OpenGL, "Building shader {:016x} (index {} of {})",
-                     usage.unique_identifier, i, shader_usages.size());
-
            const auto& unspecialized{unspecialized_shaders.at(usage.unique_identifier)};
            const auto dump{dumps.find(usage)};

@@ -449,21 +533,28 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
                }
            }
            if (!shader) {
-                shader = SpecializeShader(unspecialized.code, unspecialized.entries,
-                                          unspecialized.program_type, usage.variant, true);
+                auto locker{MakeLocker(system, unspecialized.program_type)};
+                FillLocker(*locker, usage);
+                shader = BuildShader(device, usage.unique_identifier, unspecialized.program_type,
+                                     unspecialized.code, unspecialized.code_b, usage.variant,
+                                     *locker, true);
            }

-            std::scoped_lock lock(mutex);
+            std::scoped_lock lock{mutex};
            if (callback) {
                callback(VideoCore::LoadCallbackStage::Build, ++built_shaders,
                         shader_usages.size());
            }

            precompiled_programs.emplace(usage, std::move(shader));
+
+            // TODO(Rodrigo): Is there a better way to do this?
+            precompiled_variants[usage.unique_identifier].push_back(
+                precompiled_programs.find(usage));
        }
    };

-    const auto num_workers{static_cast<std::size_t>(std::thread::hardware_concurrency() + 1)};
+    const auto num_workers{static_cast<std::size_t>(std::thread::hardware_concurrency() + 1ULL)};
    const std::size_t bucket_size{shader_usages.size() / num_workers};
    std::vector<std::unique_ptr<Core::Frontend::GraphicsContext>> contexts(num_workers);
    std::vector<std::thread> threads(num_workers);
@@ -483,7 +574,6 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
    if (compilation_failed) {
        // Invalidate the precompiled cache if a shader dumped shader was rejected
        disk_cache.InvalidatePrecompiled();
-        dumps.clear();
        precompiled_cache_altered = true;
        return;
    }
@@ -491,8 +581,8 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
        return;
    }

-    // TODO(Rodrigo): Do state tracking for transferable shaders and do a dummy draw before
-    // precompiling them
+    // TODO(Rodrigo): Do state tracking for transferable shaders and do a dummy draw
+    // before precompiling them

    for (std::size_t i = 0; i < shader_usages.size(); ++i) {
        const auto& usage{shader_usages[i]};
@@ -508,9 +598,13 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
    }
 }

-CachedProgram ShaderCacheOpenGL::GeneratePrecompiledProgram(
-    const ShaderDiskCacheDump& dump, const std::set<GLenum>& supported_formats) {
+const PrecompiledVariants* ShaderCacheOpenGL::GetPrecompiledVariants(u64 unique_identifier) const {
+    const auto it = precompiled_variants.find(unique_identifier);
+    return it == precompiled_variants.end() ? nullptr : &it->second;
+}

+CachedProgram ShaderCacheOpenGL::GeneratePrecompiledProgram(
+    const ShaderDiskCacheDump& dump, const std::unordered_set<GLenum>& supported_formats) {
    if (supported_formats.find(dump.binary_format) == supported_formats.end()) {
        LOG_INFO(Render_OpenGL, "Precompiled cache entry with unsupported format - removing");
        return {};
@@ -532,56 +626,52 @@ CachedProgram ShaderCacheOpenGL::GeneratePrecompiledProgram(
    return shader;
 }

-std::unordered_map<u64, UnspecializedShader> ShaderCacheOpenGL::GenerateUnspecializedShaders(
+bool ShaderCacheOpenGL::GenerateUnspecializedShaders(
    const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback,
-    const std::vector<ShaderDiskCacheRaw>& raws,
-    const std::unordered_map<u64, ShaderDiskCacheDecompiled>& decompiled) {
-    std::unordered_map<u64, UnspecializedShader> unspecialized;
-
+    const std::vector<ShaderDiskCacheRaw>& raws) {
    if (callback) {
        callback(VideoCore::LoadCallbackStage::Decompile, 0, raws.size());
    }

    for (std::size_t i = 0; i < raws.size(); ++i) {
        if (stop_loading) {
-            return {};
+            return false;
        }
        const auto& raw{raws[i]};
        const u64 unique_identifier{raw.GetUniqueIdentifier()};
        const u64 calculated_hash{
            GetUniqueIdentifier(raw.GetProgramType(), raw.GetProgramCode(), raw.GetProgramCodeB())};
        if (unique_identifier != calculated_hash) {
-            LOG_ERROR(
-                Render_OpenGL,
-                "Invalid hash in entry={:016x} (obtained hash={:016x}) - removing shader cache",
-                raw.GetUniqueIdentifier(), calculated_hash);
+            LOG_ERROR(Render_OpenGL,
+                      "Invalid hash in entry={:016x} (obtained hash={:016x}) - "
+                      "removing shader cache",
+                      raw.GetUniqueIdentifier(), calculated_hash);
            disk_cache.InvalidateTransferable();
-            return {};
+            return false;
        }

-        GLShader::ProgramResult result;
-        if (const auto it = decompiled.find(unique_identifier); it != decompiled.end()) {
-            // If it's stored in the precompiled file, avoid decompiling it here
-            const auto& stored_decompiled{it->second};
-            result = {stored_decompiled.code, stored_decompiled.entries};
-        } else {
-            // Otherwise decompile the shader at boot and save the result to the decompiled file
-            result = CreateProgram(device, raw.GetProgramType(), raw.GetProgramCode(),
-                                   raw.GetProgramCodeB());
-            disk_cache.SaveDecompiled(unique_identifier, result.first, result.second);
-        }
+        const u32 main_offset =
+            raw.GetProgramType() == ProgramType::Compute ? KERNEL_MAIN_OFFSET : STAGE_MAIN_OFFSET;
+        ConstBufferLocker locker(GetEnginesShaderType(raw.GetProgramType()));
+        const ShaderIR ir(raw.GetProgramCode(), main_offset, COMPILER_SETTINGS, locker);
+        // TODO(Rodrigo): Handle VertexA shaders
+        // std::optional<ShaderIR> ir_b;
+        // if (raw.HasProgramA()) {
+        //     ir_b.emplace(raw.GetProgramCodeB(), main_offset);
+        // }

-        precompiled_shaders.insert({unique_identifier, result});
-
-        unspecialized.insert(
-            {raw.GetUniqueIdentifier(),
-             {std::move(result.first), std::move(result.second), raw.GetProgramType()}});
+        UnspecializedShader unspecialized;
+        unspecialized.entries = GLShader::GetEntries(ir);
+        unspecialized.program_type = raw.GetProgramType();
+        unspecialized.code = raw.GetProgramCode();
+        unspecialized.code_b = raw.GetProgramCodeB();
+        unspecialized_shaders.emplace(raw.GetUniqueIdentifier(), unspecialized);

        if (callback) {
            callback(VideoCore::LoadCallbackStage::Decompile, i, raws.size());
        }
    }
-    return unspecialized;
+    return true;
 }

 Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
@@ -590,37 +680,35 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
    }

    auto& memory_manager{system.GPU().MemoryManager()};
-    const GPUVAddr program_addr{GetShaderAddress(system, program)};
+    const GPUVAddr address{GetShaderAddress(system, program)};

    // Look up shader in the cache based on address
-    const auto host_ptr{memory_manager.GetPointer(program_addr)};
+    const auto host_ptr{memory_manager.GetPointer(address)};
    Shader shader{TryGet(host_ptr)};
    if (shader) {
        return last_shaders[static_cast<std::size_t>(program)] = shader;
    }

    // No shader found - create a new one
-    ProgramCode program_code{GetShaderCode(memory_manager, program_addr, host_ptr)};
-    ProgramCode program_code_b;
-    const bool is_program_a{program == Maxwell::ShaderProgram::VertexA};
-    if (is_program_a) {
-        const GPUVAddr program_addr_b{GetShaderAddress(system, Maxwell::ShaderProgram::VertexB)};
-        program_code_b = GetShaderCode(memory_manager, program_addr_b,
-                                       memory_manager.GetPointer(program_addr_b));
+    ProgramCode code{GetShaderCode(memory_manager, address, host_ptr)};
+    ProgramCode code_b;
+    if (program == Maxwell::ShaderProgram::VertexA) {
+        const GPUVAddr address_b{GetShaderAddress(system, Maxwell::ShaderProgram::VertexB)};
+        code_b = GetShaderCode(memory_manager, address_b, memory_manager.GetPointer(address_b));
    }

-    const auto unique_identifier =
-        GetUniqueIdentifier(GetProgramType(program), program_code, program_code_b);
-    const auto cpu_addr{*memory_manager.GpuToCpuAddress(program_addr)};
-    const ShaderParameters params{disk_cache, precompiled_programs, device, cpu_addr,
-                                  host_ptr,   unique_identifier};
+    const auto unique_identifier = GetUniqueIdentifier(GetProgramType(program), code, code_b);
+    const auto precompiled_variants = GetPrecompiledVariants(unique_identifier);
+    const auto cpu_addr{*memory_manager.GpuToCpuAddress(address)};
+    const ShaderParameters params{system,   disk_cache, precompiled_variants, device,
+                                  cpu_addr, host_ptr,   unique_identifier};

-    const auto found = precompiled_shaders.find(unique_identifier);
-    if (found == precompiled_shaders.end()) {
-        shader = CachedShader::CreateStageFromMemory(params, program, std::move(program_code),
-                                                     std::move(program_code_b));
+    const auto found = unspecialized_shaders.find(unique_identifier);
+    if (found == unspecialized_shaders.end()) {
+        shader = CachedShader::CreateStageFromMemory(params, program, std::move(code),
+                                                     std::move(code_b));
    } else {
-        shader = CachedShader::CreateStageFromCache(params, program, found->second);
+        shader = CachedShader::CreateFromCache(params, found->second);
    }
    Register(shader);

@@ -638,15 +726,16 @@ Shader ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) {
    // No kernel found - create a new one
    auto code{GetShaderCode(memory_manager, code_addr, host_ptr)};
    const auto unique_identifier{GetUniqueIdentifier(ProgramType::Compute, code, {})};
+    const auto precompiled_variants = GetPrecompiledVariants(unique_identifier);
    const auto cpu_addr{*memory_manager.GpuToCpuAddress(code_addr)};
-    const ShaderParameters params{disk_cache, precompiled_programs, device, cpu_addr,
-                                  host_ptr,   unique_identifier};
+    const ShaderParameters params{system,   disk_cache, precompiled_variants, device,
+                                  cpu_addr, host_ptr,   unique_identifier};

-    const auto found = precompiled_shaders.find(unique_identifier);
-    if (found == precompiled_shaders.end()) {
+    const auto found = unspecialized_shaders.find(unique_identifier);
+    if (found == unspecialized_shaders.end()) {
        kernel = CachedShader::CreateKernelFromMemory(params, std::move(code));
    } else {
-        kernel = CachedShader::CreateKernelFromCache(params, found->second);
+        kernel = CachedShader::CreateFromCache(params, found->second);
    }

    Register(kernel);
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -8,9 +8,10 @@
 #include <atomic>
 #include <bitset>
 #include <memory>
-#include <set>
+#include <string>
 #include <tuple>
 #include <unordered_map>
+#include <unordered_set>
 #include <vector>

 #include <glad/glad.h>
@@ -20,6 +21,8 @@
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_shader_decompiler.h"
 #include "video_core/renderer_opengl/gl_shader_disk_cache.h"
+#include "video_core/shader/const_buffer_locker.h"
+#include "video_core/shader/shader_ir.h"

 namespace Core {
 class System;
@@ -40,11 +43,19 @@ using Shader = std::shared_ptr<CachedShader>;
 using CachedProgram = std::shared_ptr<OGLProgram>;
 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 using PrecompiledPrograms = std::unordered_map<ShaderDiskCacheUsage, CachedProgram>;
-using PrecompiledShaders = std::unordered_map<u64, GLShader::ProgramResult>;
+using PrecompiledVariants = std::vector<PrecompiledPrograms::iterator>;
+
+struct UnspecializedShader {
+    GLShader::ShaderEntries entries;
+    ProgramType program_type;
+    ProgramCode code;
+    ProgramCode code_b;
+};

 struct ShaderParameters {
+    Core::System& system;
    ShaderDiskCacheOpenGL& disk_cache;
-    const PrecompiledPrograms& precompiled_programs;
+    const PrecompiledVariants* precompiled_variants;
    const Device& device;
    VAddr cpu_addr;
    u8* host_ptr;
@@ -55,23 +66,18 @@ class CachedShader final : public RasterizerCacheObject {
 public:
    static Shader CreateStageFromMemory(const ShaderParameters& params,
                                        Maxwell::ShaderProgram program_type,
-                                        ProgramCode&& program_code, ProgramCode&& program_code_b);
+                                        ProgramCode program_code, ProgramCode program_code_b);
+    static Shader CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code);

-    static Shader CreateStageFromCache(const ShaderParameters& params,
-                                       Maxwell::ShaderProgram program_type,
-                                       GLShader::ProgramResult result);
-
-    static Shader CreateKernelFromMemory(const ShaderParameters& params, ProgramCode&& code);
-
-    static Shader CreateKernelFromCache(const ShaderParameters& params,
-                                        GLShader::ProgramResult result);
+    static Shader CreateFromCache(const ShaderParameters& params,
+                                  const UnspecializedShader& unspecialized);

    VAddr GetCpuAddr() const override {
        return cpu_addr;
    }

    std::size_t GetSizeInBytes() const override {
-        return shader_length;
+        return program_code.size() * sizeof(u64);
    }

    /// Gets the shader entries for the shader
@@ -83,24 +89,36 @@ public:
    std::tuple<GLuint, BaseBindings> GetProgramHandle(const ProgramVariant& variant);

 private:
+    struct LockerVariant {
+        std::unique_ptr<VideoCommon::Shader::ConstBufferLocker> locker;
+        std::unordered_map<ProgramVariant, CachedProgram> programs;
+    };
+
    explicit CachedShader(const ShaderParameters& params, ProgramType program_type,
-                          GLShader::ProgramResult result);
+                          GLShader::ShaderEntries entries, ProgramCode program_code,
+                          ProgramCode program_code_b);

-    CachedProgram TryLoadProgram(const ProgramVariant& variant) const;
+    void UpdateVariant();

-    ShaderDiskCacheUsage GetUsage(const ProgramVariant& variant) const;
+    ShaderDiskCacheUsage GetUsage(const ProgramVariant& variant,
+                                  const VideoCommon::Shader::ConstBufferLocker& locker) const;
+
+    Core::System& system;
+    ShaderDiskCacheOpenGL& disk_cache;
+    const Device& device;

    VAddr cpu_addr{};
+
    u64 unique_identifier{};
    ProgramType program_type{};
-    ShaderDiskCacheOpenGL& disk_cache;
-    const PrecompiledPrograms& precompiled_programs;

    GLShader::ShaderEntries entries;
-    std::string code;
-    std::size_t shader_length{};

-    std::unordered_map<ProgramVariant, CachedProgram> programs;
+    ProgramCode program_code;
+    ProgramCode program_code_b;
+
+    LockerVariant* curr_variant = nullptr;
+    std::vector<std::unique_ptr<LockerVariant>> locker_variants;
 };

 class ShaderCacheOpenGL final : public RasterizerCache<Shader> {
@@ -123,21 +141,26 @@ protected:
    void FlushObjectInner(const Shader& object) override {}

 private:
-    std::unordered_map<u64, UnspecializedShader> GenerateUnspecializedShaders(
-        const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback,
-        const std::vector<ShaderDiskCacheRaw>& raws,
-        const std::unordered_map<u64, ShaderDiskCacheDecompiled>& decompiled);
+    bool GenerateUnspecializedShaders(const std::atomic_bool& stop_loading,
+                                      const VideoCore::DiskResourceLoadCallback& callback,
+                                      const std::vector<ShaderDiskCacheRaw>& raws);

    CachedProgram GeneratePrecompiledProgram(const ShaderDiskCacheDump& dump,
-                                             const std::set<GLenum>& supported_formats);
+                                             const std::unordered_set<GLenum>& supported_formats);
+
+    const PrecompiledVariants* GetPrecompiledVariants(u64 unique_identifier) const;

    Core::System& system;
    Core::Frontend::EmuWindow& emu_window;
    const Device& device;
+
    ShaderDiskCacheOpenGL disk_cache;

-    PrecompiledShaders precompiled_shaders;
    PrecompiledPrograms precompiled_programs;
+    std::unordered_map<u64, PrecompiledVariants> precompiled_variants;
+
+    std::unordered_map<u64, UnspecializedShader> unspecialized_shaders;
+
    std::array<Shader, Maxwell::MaxShaderProgram> last_shaders;
 };

--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -415,27 +415,6 @@ public:
        return code.GetResult();
    }

-    ShaderEntries GetShaderEntries() const {
-        ShaderEntries entries;
-        for (const auto& cbuf : ir.GetConstantBuffers()) {
-            entries.const_buffers.emplace_back(cbuf.second.GetMaxOffset(), cbuf.second.IsIndirect(),
-                                               cbuf.first);
-        }
-        for (const auto& sampler : ir.GetSamplers()) {
-            entries.samplers.emplace_back(sampler);
-        }
-        for (const auto& [offset, image] : ir.GetImages()) {
-            entries.images.emplace_back(image);
-        }
-        for (const auto& [base, usage] : ir.GetGlobalMemory()) {
-            entries.global_memory_entries.emplace_back(base.cbuf_index, base.cbuf_offset,
-                                                       usage.is_read, usage.is_written);
-        }
-        entries.clip_distances = ir.GetClipDistances();
-        entries.shader_length = ir.GetLength();
-        return entries;
-    }
-
 private:
    friend class ASTDecompiler;
    friend class ExprDecompiler;
@@ -1893,10 +1872,6 @@ private:
    Expression EmitVertex(Operation operation) {
        ASSERT_MSG(stage == ProgramType::Geometry,
                   "EmitVertex is expected to be used in a geometry shader.");
-
-        // If a geometry shader is attached, it will always flip (it's the last stage before
-        // fragment). For more info about flipping, refer to gl_shader_gen.cpp.
-        code.AddLine("gl_Position.xy *= viewport_flip.xy;");
        code.AddLine("EmitVertex();");
        return {};
    }
@@ -1904,14 +1879,12 @@ private:
    Expression EndPrimitive(Operation operation) {
        ASSERT_MSG(stage == ProgramType::Geometry,
                   "EndPrimitive is expected to be used in a geometry shader.");
-
        code.AddLine("EndPrimitive();");
        return {};
    }

    Expression YNegate(Operation operation) {
-        // Config pack's third value is Y_NEGATE's state.
-        return {"config_pack[2]", Type::Uint};
+        return {"y_negate", Type::Float};
    }

    template <u32 element>
@@ -2314,10 +2287,13 @@ public:
            switch (index) {
            case Tegra::Shader::Pred::NeverExecute:
                target = "false";
+                break;
            case Tegra::Shader::Pred::UnusedIndex:
                target = "true";
+                break;
            default:
                target = decomp.GetPredicate(index);
+                break;
            }
        } else if (const auto flag = std::get_if<InternalFlagNode>(&*cc)) {
            target = decomp.GetInternalFlag(flag->GetFlag());
@@ -2335,6 +2311,11 @@ public:
        inner += expr.value ? "true" : "false";
    }

+    void operator()(VideoCommon::Shader::ExprGprEqual& expr) {
+        inner +=
+            "( ftou(" + decomp.GetRegister(expr.gpr) + ") == " + std::to_string(expr.value) + ')';
+    }
+
    const std::string& GetResult() const {
        return inner;
    }
@@ -2473,25 +2454,46 @@ void GLSLDecompiler::DecompileAST() {

 } // Anonymous namespace

-std::string GetCommonDeclarations() {
-    return fmt::format(
-        "#define ftoi floatBitsToInt\n"
-        "#define ftou floatBitsToUint\n"
-        "#define itof intBitsToFloat\n"
-        "#define utof uintBitsToFloat\n\n"
-        "bvec2 HalfFloatNanComparison(bvec2 comparison, vec2 pair1, vec2 pair2) {{\n"
-        "    bvec2 is_nan1 = isnan(pair1);\n"
-        "    bvec2 is_nan2 = isnan(pair2);\n"
-        "    return bvec2(comparison.x || is_nan1.x || is_nan2.x, comparison.y || is_nan1.y || "
-        "is_nan2.y);\n"
-        "}}\n\n");
+ShaderEntries GetEntries(const VideoCommon::Shader::ShaderIR& ir) {
+    ShaderEntries entries;
+    for (const auto& cbuf : ir.GetConstantBuffers()) {
+        entries.const_buffers.emplace_back(cbuf.second.GetMaxOffset(), cbuf.second.IsIndirect(),
+                                           cbuf.first);
+    }
+    for (const auto& sampler : ir.GetSamplers()) {
+        entries.samplers.emplace_back(sampler);
+    }
+    for (const auto& [offset, image] : ir.GetImages()) {
+        entries.images.emplace_back(image);
+    }
+    for (const auto& [base, usage] : ir.GetGlobalMemory()) {
+        entries.global_memory_entries.emplace_back(base.cbuf_index, base.cbuf_offset, usage.is_read,
+                                                   usage.is_written);
+    }
+    entries.clip_distances = ir.GetClipDistances();
+    entries.shader_length = ir.GetLength();
+    return entries;
 }

-ProgramResult Decompile(const Device& device, const ShaderIR& ir, ProgramType stage,
-                        const std::string& suffix) {
+std::string GetCommonDeclarations() {
+    return R"(#define ftoi floatBitsToInt
+#define ftou floatBitsToUint
+#define itof intBitsToFloat
+#define utof uintBitsToFloat
+
+bvec2 HalfFloatNanComparison(bvec2 comparison, vec2 pair1, vec2 pair2) {
+    bvec2 is_nan1 = isnan(pair1);
+    bvec2 is_nan2 = isnan(pair2);
+    return bvec2(comparison.x || is_nan1.x || is_nan2.x, comparison.y || is_nan1.y || is_nan2.y);
+}
+)";
+}
+
+std::string Decompile(const Device& device, const ShaderIR& ir, ProgramType stage,
+                      const std::string& suffix) {
    GLSLDecompiler decompiler(device, ir, stage, suffix);
    decompiler.Decompile();
-    return {decompiler.GetResult(), decompiler.GetShaderEntries()};
+    return decompiler.GetResult();
 }

 } // namespace OpenGL::GLShader
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.h
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h
@@ -34,10 +34,7 @@ enum class ProgramType : u32 {

 namespace OpenGL::GLShader {

-struct ShaderEntries;
-
 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
-using ProgramResult = std::pair<std::string, ShaderEntries>;
 using SamplerEntry = VideoCommon::Shader::Sampler;
 using ImageEntry = VideoCommon::Shader::Image;

@@ -93,9 +90,11 @@ struct ShaderEntries {
    std::size_t shader_length{};
 };

+ShaderEntries GetEntries(const VideoCommon::Shader::ShaderIR& ir);
+
 std::string GetCommonDeclarations();

-ProgramResult Decompile(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
-                        ProgramType stage, const std::string& suffix);
+std::string Decompile(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
+                      ProgramType stage, const std::string& suffix);

 } // namespace OpenGL::GLShader
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp
@@ -22,6 +22,29 @@

 namespace OpenGL {

+using VideoCommon::Shader::BindlessSamplerMap;
+using VideoCommon::Shader::BoundSamplerMap;
+using VideoCommon::Shader::KeyMap;
+
+namespace {
+
+struct ConstBufferKey {
+    u32 cbuf;
+    u32 offset;
+    u32 value;
+};
+
+struct BoundSamplerKey {
+    u32 offset;
+    Tegra::Engines::SamplerDescriptor sampler;
+};
+
+struct BindlessSamplerKey {
+    u32 cbuf;
+    u32 offset;
+    Tegra::Engines::SamplerDescriptor sampler;
+};
+
 using ShaderCacheVersionHash = std::array<u8, 64>;

 enum class TransferableEntryKind : u32 {
@@ -29,18 +52,10 @@ enum class TransferableEntryKind : u32 {
    Usage,
 };

-enum class PrecompiledEntryKind : u32 {
-    Decompiled,
-    Dump,
-};
-
-constexpr u32 NativeVersion = 4;
+constexpr u32 NativeVersion = 5;

 // Making sure sizes doesn't change by accident
 static_assert(sizeof(BaseBindings) == 16);
-static_assert(sizeof(ShaderDiskCacheUsage) == 40);
-
-namespace {

 ShaderCacheVersionHash GetShaderCacheVersionHash() {
    ShaderCacheVersionHash hash{};
@@ -49,13 +64,11 @@ ShaderCacheVersionHash GetShaderCacheVersionHash() {
    return hash;
 }

-} // namespace
+} // Anonymous namespace

 ShaderDiskCacheRaw::ShaderDiskCacheRaw(u64 unique_identifier, ProgramType program_type,
-                                       u32 program_code_size, u32 program_code_size_b,
                                       ProgramCode program_code, ProgramCode program_code_b)
    : unique_identifier{unique_identifier}, program_type{program_type},
-      program_code_size{program_code_size}, program_code_size_b{program_code_size_b},
      program_code{std::move(program_code)}, program_code_b{std::move(program_code_b)} {}

 ShaderDiskCacheRaw::ShaderDiskCacheRaw() = default;
@@ -90,15 +103,16 @@ bool ShaderDiskCacheRaw::Load(FileUtil::IOFile& file) {
 bool ShaderDiskCacheRaw::Save(FileUtil::IOFile& file) const {
    if (file.WriteObject(unique_identifier) != 1 ||
        file.WriteObject(static_cast<u32>(program_type)) != 1 ||
-        file.WriteObject(program_code_size) != 1 || file.WriteObject(program_code_size_b) != 1) {
+        file.WriteObject(static_cast<u32>(program_code.size())) != 1 ||
+        file.WriteObject(static_cast<u32>(program_code_b.size())) != 1) {
        return false;
    }

-    if (file.WriteArray(program_code.data(), program_code_size) != program_code_size)
+    if (file.WriteArray(program_code.data(), program_code.size()) != program_code.size())
        return false;

    if (HasProgramA() &&
-        file.WriteArray(program_code_b.data(), program_code_size_b) != program_code_size_b) {
+        file.WriteArray(program_code_b.data(), program_code_b.size()) != program_code_b.size()) {
        return false;
    }
    return true;
@@ -127,13 +141,13 @@ ShaderDiskCacheOpenGL::LoadTransferable() {
    u32 version{};
    if (file.ReadBytes(&version, sizeof(version)) != sizeof(version)) {
        LOG_ERROR(Render_OpenGL,
-                  "Failed to get transferable cache version for title id={} - skipping",
+                  "Failed to get transferable cache version for title id={}, skipping",
                  GetTitleID());
        return {};
    }

    if (version < NativeVersion) {
-        LOG_INFO(Render_OpenGL, "Transferable shader cache is old - removing");
+        LOG_INFO(Render_OpenGL, "Transferable shader cache is old, removing");
        file.Close();
        InvalidateTransferable();
        is_usable = true;
@@ -141,17 +155,18 @@ ShaderDiskCacheOpenGL::LoadTransferable() {
    }
    if (version > NativeVersion) {
        LOG_WARNING(Render_OpenGL, "Transferable shader cache was generated with a newer version "
-                                   "of the emulator - skipping");
+                                   "of the emulator, skipping");
        return {};
    }

    // Version is valid, load the shaders
+    constexpr const char error_loading[] = "Failed to load transferable raw entry, skipping";
    std::vector<ShaderDiskCacheRaw> raws;
    std::vector<ShaderDiskCacheUsage> usages;
    while (file.Tell() < file.GetSize()) {
        TransferableEntryKind kind{};
        if (file.ReadBytes(&kind, sizeof(u32)) != sizeof(u32)) {
-            LOG_ERROR(Render_OpenGL, "Failed to read transferable file - skipping");
+            LOG_ERROR(Render_OpenGL, "Failed to read transferable file, skipping");
            return {};
        }

@@ -159,7 +174,7 @@ ShaderDiskCacheOpenGL::LoadTransferable() {
        case TransferableEntryKind::Raw: {
            ShaderDiskCacheRaw entry;
            if (!entry.Load(file)) {
-                LOG_ERROR(Render_OpenGL, "Failed to load transferable raw entry - skipping");
+                LOG_ERROR(Render_OpenGL, error_loading);
                return {};
            }
            transferable.insert({entry.GetUniqueIdentifier(), {}});
@@ -167,16 +182,45 @@ ShaderDiskCacheOpenGL::LoadTransferable() {
            break;
        }
        case TransferableEntryKind::Usage: {
-            ShaderDiskCacheUsage usage{};
-            if (file.ReadBytes(&usage, sizeof(usage)) != sizeof(usage)) {
-                LOG_ERROR(Render_OpenGL, "Failed to load transferable usage entry - skipping");
+            ShaderDiskCacheUsage usage;
+
+            u32 num_keys{};
+            u32 num_bound_samplers{};
+            u32 num_bindless_samplers{};
+            if (file.ReadArray(&usage.unique_identifier, 1) != 1 ||
+                file.ReadArray(&usage.variant, 1) != 1 || file.ReadArray(&num_keys, 1) != 1 ||
+                file.ReadArray(&num_bound_samplers, 1) != 1 ||
+                file.ReadArray(&num_bindless_samplers, 1) != 1) {
+                LOG_ERROR(Render_OpenGL, error_loading);
                return {};
            }
+
+            std::vector<ConstBufferKey> keys(num_keys);
+            std::vector<BoundSamplerKey> bound_samplers(num_bound_samplers);
+            std::vector<BindlessSamplerKey> bindless_samplers(num_bindless_samplers);
+            if (file.ReadArray(keys.data(), keys.size()) != keys.size() ||
+                file.ReadArray(bound_samplers.data(), bound_samplers.size()) !=
+                    bound_samplers.size() ||
+                file.ReadArray(bindless_samplers.data(), bindless_samplers.size()) !=
+                    bindless_samplers.size()) {
+                LOG_ERROR(Render_OpenGL, error_loading);
+                return {};
+            }
+            for (const auto& key : keys) {
+                usage.keys.insert({{key.cbuf, key.offset}, key.value});
+            }
+            for (const auto& key : bound_samplers) {
+                usage.bound_samplers.emplace(key.offset, key.sampler);
+            }
+            for (const auto& key : bindless_samplers) {
+                usage.bindless_samplers.insert({{key.cbuf, key.offset}, key.sampler});
+            }
+
            usages.push_back(std::move(usage));
            break;
        }
        default:
-            LOG_ERROR(Render_OpenGL, "Unknown transferable shader cache entry kind={} - skipping",
+            LOG_ERROR(Render_OpenGL, "Unknown transferable shader cache entry kind={}, skipping",
                      static_cast<u32>(kind));
            return {};
        }
@@ -186,13 +230,14 @@ ShaderDiskCacheOpenGL::LoadTransferable() {
    return {{std::move(raws), std::move(usages)}};
 }

-std::pair<std::unordered_map<u64, ShaderDiskCacheDecompiled>, ShaderDumpsMap>
+std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>
 ShaderDiskCacheOpenGL::LoadPrecompiled() {
    if (!is_usable) {
        return {};
    }

-    FileUtil::IOFile file(GetPrecompiledPath(), "rb");
+    std::string path = GetPrecompiledPath();
+    FileUtil::IOFile file(path, "rb");
    if (!file.IsOpen()) {
        LOG_INFO(Render_OpenGL, "No precompiled shader cache found for game with title id={}",
                 GetTitleID());
@@ -202,7 +247,7 @@ ShaderDiskCacheOpenGL::LoadPrecompiled() {
    const auto result = LoadPrecompiledFile(file);
    if (!result) {
        LOG_INFO(Render_OpenGL,
-                 "Failed to load precompiled cache for game with title id={} - removing",
+                 "Failed to load precompiled cache for game with title id={}, removing",
                 GetTitleID());
        file.Close();
        InvalidatePrecompiled();
@@ -211,7 +256,7 @@ ShaderDiskCacheOpenGL::LoadPrecompiled() {
    return *result;
 }

-std::optional<std::pair<std::unordered_map<u64, ShaderDiskCacheDecompiled>, ShaderDumpsMap>>
+std::optional<std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>>
 ShaderDiskCacheOpenGL::LoadPrecompiledFile(FileUtil::IOFile& file) {
    // Read compressed file from disk and decompress to virtual precompiled cache file
    std::vector<u8> compressed(file.GetSize());
@@ -231,238 +276,56 @@ ShaderDiskCacheOpenGL::LoadPrecompiledFile(FileUtil::IOFile& file) {
        return {};
    }

-    std::unordered_map<u64, ShaderDiskCacheDecompiled> decompiled;
    ShaderDumpsMap dumps;
    while (precompiled_cache_virtual_file_offset < precompiled_cache_virtual_file.GetSize()) {
-        PrecompiledEntryKind kind{};
-        if (!LoadObjectFromPrecompiled(kind)) {
+        u32 num_keys{};
+        u32 num_bound_samplers{};
+        u32 num_bindless_samplers{};
+        ShaderDiskCacheUsage usage;
+        if (!LoadObjectFromPrecompiled(usage.unique_identifier) ||
+            !LoadObjectFromPrecompiled(usage.variant) || !LoadObjectFromPrecompiled(num_keys) ||
+            !LoadObjectFromPrecompiled(num_bound_samplers) ||
+            !LoadObjectFromPrecompiled(num_bindless_samplers)) {
+            return {};
+        }
+        std::vector<ConstBufferKey> keys(num_keys);
+        std::vector<BoundSamplerKey> bound_samplers(num_bound_samplers);
+        std::vector<BindlessSamplerKey> bindless_samplers(num_bindless_samplers);
+        // Each read is tested directly, like the keys read: comparing the negated result with
+        // size() rejected every entry holding samplers and invalidated the whole cache.
+        if (!LoadArrayFromPrecompiled(keys.data(), keys.size()) ||
+            !LoadArrayFromPrecompiled(bound_samplers.data(), bound_samplers.size()) ||
+            !LoadArrayFromPrecompiled(bindless_samplers.data(), bindless_samplers.size())) {
+            return {};
+        }
+        for (const auto& key : keys) {
+            usage.keys.insert({{key.cbuf, key.offset}, key.value});
+        }
+        for (const auto& key : bound_samplers) {
+            usage.bound_samplers.emplace(key.offset, key.sampler);
+        }
+        for (const auto& key : bindless_samplers) {
+            usage.bindless_samplers.insert({{key.cbuf, key.offset}, key.sampler});
+        }
+
+        ShaderDiskCacheDump dump;
+        if (!LoadObjectFromPrecompiled(dump.binary_format)) {
            return {};
        }

-        switch (kind) {
-        case PrecompiledEntryKind::Decompiled: {
-            u64 unique_identifier{};
-            if (!LoadObjectFromPrecompiled(unique_identifier)) {
-                return {};
-            }
-
-            auto entry = LoadDecompiledEntry();
-            if (!entry) {
-                return {};
-            }
-            decompiled.insert({unique_identifier, std::move(*entry)});
-            break;
-        }
-        case PrecompiledEntryKind::Dump: {
-            ShaderDiskCacheUsage usage;
-            if (!LoadObjectFromPrecompiled(usage)) {
-                return {};
-            }
-
-            ShaderDiskCacheDump dump;
-            if (!LoadObjectFromPrecompiled(dump.binary_format)) {
-                return {};
-            }
-
-            u32 binary_length{};
-            if (!LoadObjectFromPrecompiled(binary_length)) {
-                return {};
-            }
-
-            dump.binary.resize(binary_length);
-            if (!LoadArrayFromPrecompiled(dump.binary.data(), dump.binary.size())) {
-                return {};
-            }
-
-            dumps.insert({usage, dump});
-            break;
-        }
-        default:
+        u32 binary_length{};
+        if (!LoadObjectFromPrecompiled(binary_length)) {
            return {};
        }
-    }
-    return {{decompiled, dumps}};
-}

-std::optional<ShaderDiskCacheDecompiled> ShaderDiskCacheOpenGL::LoadDecompiledEntry() {
-    u32 code_size{};
-    if (!LoadObjectFromPrecompiled(code_size)) {
-        return {};
-    }
-
-    std::string code(code_size, '\0');
-    if (!LoadArrayFromPrecompiled(code.data(), code.size())) {
-        return {};
-    }
-
-    ShaderDiskCacheDecompiled entry;
-    entry.code = std::move(code);
-
-    u32 const_buffers_count{};
-    if (!LoadObjectFromPrecompiled(const_buffers_count)) {
-        return {};
-    }
-
-    for (u32 i = 0; i < const_buffers_count; ++i) {
-        u32 max_offset{};
-        u32 index{};
-        bool is_indirect{};
-        if (!LoadObjectFromPrecompiled(max_offset) || !LoadObjectFromPrecompiled(index) ||
-            !LoadObjectFromPrecompiled(is_indirect)) {
+        dump.binary.resize(binary_length);
+        if (!LoadArrayFromPrecompiled(dump.binary.data(), dump.binary.size())) {
            return {};
        }
-        entry.entries.const_buffers.emplace_back(max_offset, is_indirect, index);
-    }

-    u32 samplers_count{};
-    if (!LoadObjectFromPrecompiled(samplers_count)) {
-        return {};
+        dumps.emplace(std::move(usage), std::move(dump));
    }
-
-    for (u32 i = 0; i < samplers_count; ++i) {
-        u64 offset{};
-        u64 index{};
-        u32 type{};
-        bool is_array{};
-        bool is_shadow{};
-        bool is_bindless{};
-        if (!LoadObjectFromPrecompiled(offset) || !LoadObjectFromPrecompiled(index) ||
-            !LoadObjectFromPrecompiled(type) || !LoadObjectFromPrecompiled(is_array) ||
-            !LoadObjectFromPrecompiled(is_shadow) || !LoadObjectFromPrecompiled(is_bindless)) {
-            return {};
-        }
-        entry.entries.samplers.emplace_back(
-            static_cast<std::size_t>(offset), static_cast<std::size_t>(index),
-            static_cast<Tegra::Shader::TextureType>(type), is_array, is_shadow, is_bindless);
-    }
-
-    u32 images_count{};
-    if (!LoadObjectFromPrecompiled(images_count)) {
-        return {};
-    }
-    for (u32 i = 0; i < images_count; ++i) {
-        u64 offset{};
-        u64 index{};
-        u32 type{};
-        u8 is_bindless{};
-        u8 is_written{};
-        u8 is_read{};
-        u8 is_atomic{};
-        if (!LoadObjectFromPrecompiled(offset) || !LoadObjectFromPrecompiled(index) ||
-            !LoadObjectFromPrecompiled(type) || !LoadObjectFromPrecompiled(is_bindless) ||
-            !LoadObjectFromPrecompiled(is_written) || !LoadObjectFromPrecompiled(is_read) ||
-            !LoadObjectFromPrecompiled(is_atomic)) {
-            return {};
-        }
-        entry.entries.images.emplace_back(
-            static_cast<std::size_t>(offset), static_cast<std::size_t>(index),
-            static_cast<Tegra::Shader::ImageType>(type), is_bindless != 0, is_written != 0,
-            is_read != 0, is_atomic != 0);
-    }
-
-    u32 global_memory_count{};
-    if (!LoadObjectFromPrecompiled(global_memory_count)) {
-        return {};
-    }
-    for (u32 i = 0; i < global_memory_count; ++i) {
-        u32 cbuf_index{};
-        u32 cbuf_offset{};
-        bool is_read{};
-        bool is_written{};
-        if (!LoadObjectFromPrecompiled(cbuf_index) || !LoadObjectFromPrecompiled(cbuf_offset) ||
-            !LoadObjectFromPrecompiled(is_read) || !LoadObjectFromPrecompiled(is_written)) {
-            return {};
-        }
-        entry.entries.global_memory_entries.emplace_back(cbuf_index, cbuf_offset, is_read,
-                                                         is_written);
-    }
-
-    for (auto& clip_distance : entry.entries.clip_distances) {
-        if (!LoadObjectFromPrecompiled(clip_distance)) {
-            return {};
-        }
-    }
-
-    u64 shader_length{};
-    if (!LoadObjectFromPrecompiled(shader_length)) {
-        return {};
-    }
-    entry.entries.shader_length = static_cast<std::size_t>(shader_length);
-
-    return entry;
-}
-
-bool ShaderDiskCacheOpenGL::SaveDecompiledFile(u64 unique_identifier, const std::string& code,
-                                               const GLShader::ShaderEntries& entries) {
-    if (!SaveObjectToPrecompiled(static_cast<u32>(PrecompiledEntryKind::Decompiled)) ||
-        !SaveObjectToPrecompiled(unique_identifier) ||
-        !SaveObjectToPrecompiled(static_cast<u32>(code.size())) ||
-        !SaveArrayToPrecompiled(code.data(), code.size())) {
-        return false;
-    }
-
-    if (!SaveObjectToPrecompiled(static_cast<u32>(entries.const_buffers.size()))) {
-        return false;
-    }
-    for (const auto& cbuf : entries.const_buffers) {
-        if (!SaveObjectToPrecompiled(static_cast<u32>(cbuf.GetMaxOffset())) ||
-            !SaveObjectToPrecompiled(static_cast<u32>(cbuf.GetIndex())) ||
-            !SaveObjectToPrecompiled(cbuf.IsIndirect())) {
-            return false;
-        }
-    }
-
-    if (!SaveObjectToPrecompiled(static_cast<u32>(entries.samplers.size()))) {
-        return false;
-    }
-    for (const auto& sampler : entries.samplers) {
-        if (!SaveObjectToPrecompiled(static_cast<u64>(sampler.GetOffset())) ||
-            !SaveObjectToPrecompiled(static_cast<u64>(sampler.GetIndex())) ||
-            !SaveObjectToPrecompiled(static_cast<u32>(sampler.GetType())) ||
-            !SaveObjectToPrecompiled(sampler.IsArray()) ||
-            !SaveObjectToPrecompiled(sampler.IsShadow()) ||
-            !SaveObjectToPrecompiled(sampler.IsBindless())) {
-            return false;
-        }
-    }
-
-    if (!SaveObjectToPrecompiled(static_cast<u32>(entries.images.size()))) {
-        return false;
-    }
-    for (const auto& image : entries.images) {
-        if (!SaveObjectToPrecompiled(static_cast<u64>(image.GetOffset())) ||
-            !SaveObjectToPrecompiled(static_cast<u64>(image.GetIndex())) ||
-            !SaveObjectToPrecompiled(static_cast<u32>(image.GetType())) ||
-            !SaveObjectToPrecompiled(static_cast<u8>(image.IsBindless() ? 1 : 0)) ||
-            !SaveObjectToPrecompiled(static_cast<u8>(image.IsWritten() ? 1 : 0)) ||
-            !SaveObjectToPrecompiled(static_cast<u8>(image.IsRead() ? 1 : 0)) ||
-            !SaveObjectToPrecompiled(static_cast<u8>(image.IsAtomic() ? 1 : 0))) {
-            return false;
-        }
-    }
-
-    if (!SaveObjectToPrecompiled(static_cast<u32>(entries.global_memory_entries.size()))) {
-        return false;
-    }
-    for (const auto& gmem : entries.global_memory_entries) {
-        if (!SaveObjectToPrecompiled(static_cast<u32>(gmem.GetCbufIndex())) ||
-            !SaveObjectToPrecompiled(static_cast<u32>(gmem.GetCbufOffset())) ||
-            !SaveObjectToPrecompiled(gmem.IsRead()) || !SaveObjectToPrecompiled(gmem.IsWritten())) {
-            return false;
-        }
-    }
-
-    for (const bool clip_distance : entries.clip_distances) {
-        if (!SaveObjectToPrecompiled(clip_distance)) {
-            return false;
-        }
-    }
-
-    if (!SaveObjectToPrecompiled(static_cast<u64>(entries.shader_length))) {
-        return false;
-    }
-
-    return true;
+    return dumps;
 }

 void ShaderDiskCacheOpenGL::InvalidateTransferable() {
@@ -494,10 +357,11 @@ void ShaderDiskCacheOpenGL::SaveRaw(const ShaderDiskCacheRaw& entry) {
    }

    FileUtil::IOFile file = AppendTransferableFile();
-    if (!file.IsOpen())
+    if (!file.IsOpen()) {
        return;
+    }
    if (file.WriteObject(TransferableEntryKind::Raw) != 1 || !entry.Save(file)) {
-        LOG_ERROR(Render_OpenGL, "Failed to save raw transferable cache entry - removing");
+        LOG_ERROR(Render_OpenGL, "Failed to save raw transferable cache entry, removing");
        file.Close();
        InvalidateTransferable();
        return;
@@ -523,29 +387,39 @@ void ShaderDiskCacheOpenGL::SaveUsage(const ShaderDiskCacheUsage& usage) {
    FileUtil::IOFile file = AppendTransferableFile();
    if (!file.IsOpen())
        return;
-
-    if (file.WriteObject(TransferableEntryKind::Usage) != 1 || file.WriteObject(usage) != 1) {
-        LOG_ERROR(Render_OpenGL, "Failed to save usage transferable cache entry - removing");
+    const auto Close = [&] {
+        LOG_ERROR(Render_OpenGL, "Failed to save usage transferable cache entry, removing");
        file.Close();
        InvalidateTransferable();
+    };
+
+    if (file.WriteObject(TransferableEntryKind::Usage) != 1 ||
+        file.WriteObject(usage.unique_identifier) != 1 || file.WriteObject(usage.variant) != 1 ||
+        file.WriteObject(static_cast<u32>(usage.keys.size())) != 1 ||
+        file.WriteObject(static_cast<u32>(usage.bound_samplers.size())) != 1 ||
+        file.WriteObject(static_cast<u32>(usage.bindless_samplers.size())) != 1) {
+        Close();
        return;
    }
-}
-
-void ShaderDiskCacheOpenGL::SaveDecompiled(u64 unique_identifier, const std::string& code,
-                                           const GLShader::ShaderEntries& entries) {
-    if (!is_usable) {
-        return;
+    for (const auto& [pair, value] : usage.keys) {
+        const auto [cbuf, offset] = pair;
+        if (file.WriteObject(ConstBufferKey{cbuf, offset, value}) != 1) {
+            Close();
+            return;
+        }
    }
-
-    if (precompiled_cache_virtual_file.GetSize() == 0) {
-        SavePrecompiledHeaderToVirtualPrecompiledCache();
+    for (const auto& [offset, sampler] : usage.bound_samplers) {
+        if (file.WriteObject(BoundSamplerKey{offset, sampler}) != 1) {
+            Close();
+            return;
+        }
    }
-
-    if (!SaveDecompiledFile(unique_identifier, code, entries)) {
-        LOG_ERROR(Render_OpenGL,
-                  "Failed to save decompiled entry to the precompiled file - removing");
-        InvalidatePrecompiled();
+    for (const auto& [pair, sampler] : usage.bindless_samplers) {
+        const auto [cbuf, offset] = pair;
+        if (file.WriteObject(BindlessSamplerKey{cbuf, offset, sampler}) != 1) {
+            Close();
+            return;
+        }
    }
 }

@@ -554,6 +428,13 @@ void ShaderDiskCacheOpenGL::SaveDump(const ShaderDiskCacheUsage& usage, GLuint p
        return;
    }

+    // TODO(Rodrigo): This is a design smell. I shouldn't be having to manually write the header
+    // when writing the dump. This should be done the moment I get access to write to the virtual
+    // file.
+    if (precompiled_cache_virtual_file.GetSize() == 0) {
+        SavePrecompiledHeaderToVirtualPrecompiledCache();
+    }
+
    GLint binary_length{};
    glGetProgramiv(program, GL_PROGRAM_BINARY_LENGTH, &binary_length);

@@ -561,21 +442,51 @@ void ShaderDiskCacheOpenGL::SaveDump(const ShaderDiskCacheUsage& usage, GLuint p
    std::vector<u8> binary(binary_length);
    glGetProgramBinary(program, binary_length, nullptr, &binary_format, binary.data());

-    if (!SaveObjectToPrecompiled(static_cast<u32>(PrecompiledEntryKind::Dump)) ||
-        !SaveObjectToPrecompiled(usage) ||
-        !SaveObjectToPrecompiled(static_cast<u32>(binary_format)) ||
-        !SaveObjectToPrecompiled(static_cast<u32>(binary_length)) ||
-        !SaveArrayToPrecompiled(binary.data(), binary.size())) {
-        LOG_ERROR(Render_OpenGL, "Failed to save binary program file in shader={:016x} - removing",
+    const auto Close = [&] {
+        LOG_ERROR(Render_OpenGL, "Failed to save binary program file in shader={:016X}, removing",
                  usage.unique_identifier);
        InvalidatePrecompiled();
+    };
+
+    if (!SaveObjectToPrecompiled(usage.unique_identifier) ||
+        !SaveObjectToPrecompiled(usage.variant) ||
+        !SaveObjectToPrecompiled(static_cast<u32>(usage.keys.size())) ||
+        !SaveObjectToPrecompiled(static_cast<u32>(usage.bound_samplers.size())) ||
+        !SaveObjectToPrecompiled(static_cast<u32>(usage.bindless_samplers.size()))) {
+        Close();
        return;
    }
+    for (const auto& [pair, value] : usage.keys) {
+        const auto [cbuf, offset] = pair;
+        if (!SaveObjectToPrecompiled(ConstBufferKey{cbuf, offset, value})) {
+            Close();
+            return;
+        }
+    }
+    for (const auto& [offset, sampler] : usage.bound_samplers) {
+        if (!SaveObjectToPrecompiled(BoundSamplerKey{offset, sampler})) {
+            Close();
+            return;
+        }
+    }
+    for (const auto& [pair, sampler] : usage.bindless_samplers) {
+        const auto [cbuf, offset] = pair;
+        if (!SaveObjectToPrecompiled(BindlessSamplerKey{cbuf, offset, sampler})) {
+            Close();
+            return;
+        }
+    }
+    if (!SaveObjectToPrecompiled(static_cast<u32>(binary_format)) ||
+        !SaveObjectToPrecompiled(static_cast<u32>(binary_length)) ||
+        !SaveArrayToPrecompiled(binary.data(), binary.size())) {
+        Close();
+    }
 }

 FileUtil::IOFile ShaderDiskCacheOpenGL::AppendTransferableFile() const {
-    if (!EnsureDirectories())
+    if (!EnsureDirectories()) {
        return {};
+    }

    const auto transferable_path{GetTransferablePath()};
    const bool existed = FileUtil::Exists(transferable_path);
@@ -607,8 +518,8 @@ void ShaderDiskCacheOpenGL::SavePrecompiledHeaderToVirtualPrecompiledCache() {

 void ShaderDiskCacheOpenGL::SaveVirtualPrecompiledFile() {
    precompiled_cache_virtual_file_offset = 0;
-    const std::vector<u8>& uncompressed = precompiled_cache_virtual_file.ReadAllBytes();
-    const std::vector<u8>& compressed =
+    const std::vector<u8> uncompressed = precompiled_cache_virtual_file.ReadAllBytes();
+    const std::vector<u8> compressed =
        Common::Compression::CompressDataZSTDDefault(uncompressed.data(), uncompressed.size());

    const auto precompiled_path{GetPrecompiledPath()};
--- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h
@@ -8,6 +8,7 @@
 #include <optional>
 #include <string>
 #include <tuple>
+#include <type_traits>
 #include <unordered_map>
 #include <unordered_set>
 #include <utility>
@@ -19,6 +20,7 @@
 #include "common/common_types.h"
 #include "core/file_sys/vfs_vector.h"
 #include "video_core/renderer_opengl/gl_shader_gen.h"
+#include "video_core/shader/const_buffer_locker.h"

 namespace Core {
 class System;
@@ -53,6 +55,7 @@ struct BaseBindings {
        return !operator==(rhs);
    }
 };
+static_assert(std::is_trivially_copyable_v<BaseBindings>);

 /// Describes the different variants a single program can be compiled.
 struct ProgramVariant {
@@ -70,13 +73,20 @@ struct ProgramVariant {
    }
 };

+static_assert(std::is_trivially_copyable_v<ProgramVariant>);
+
 /// Describes how a shader is used.
 struct ShaderDiskCacheUsage {
    u64 unique_identifier{};
    ProgramVariant variant;
+    VideoCommon::Shader::KeyMap keys;
+    VideoCommon::Shader::BoundSamplerMap bound_samplers;
+    VideoCommon::Shader::BindlessSamplerMap bindless_samplers;

    bool operator==(const ShaderDiskCacheUsage& rhs) const {
-        return std::tie(unique_identifier, variant) == std::tie(rhs.unique_identifier, rhs.variant);
+        return std::tie(unique_identifier, variant, keys, bound_samplers, bindless_samplers) ==
+               std::tie(rhs.unique_identifier, rhs.variant, rhs.keys, rhs.bound_samplers,
+                        rhs.bindless_samplers);
    }

    bool operator!=(const ShaderDiskCacheUsage& rhs) const {
@@ -123,8 +133,7 @@ namespace OpenGL {
 class ShaderDiskCacheRaw {
 public:
    explicit ShaderDiskCacheRaw(u64 unique_identifier, ProgramType program_type,
-                                u32 program_code_size, u32 program_code_size_b,
-                                ProgramCode program_code, ProgramCode program_code_b);
+                                ProgramCode program_code, ProgramCode program_code_b = {});
    ShaderDiskCacheRaw();
    ~ShaderDiskCacheRaw();

@@ -155,22 +164,14 @@ public:
 private:
    u64 unique_identifier{};
    ProgramType program_type{};
-    u32 program_code_size{};
-    u32 program_code_size_b{};

    ProgramCode program_code;
    ProgramCode program_code_b;
 };

-/// Contains decompiled data from a shader
-struct ShaderDiskCacheDecompiled {
-    std::string code;
-    GLShader::ShaderEntries entries;
-};
-
 /// Contains an OpenGL dumped binary program
 struct ShaderDiskCacheDump {
-    GLenum binary_format;
+    GLenum binary_format{};
    std::vector<u8> binary;
 };

@@ -184,9 +185,7 @@ public:
    LoadTransferable();

    /// Loads current game's precompiled cache. Invalidates on failure.
-    std::pair<std::unordered_map<u64, ShaderDiskCacheDecompiled>,
-              std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>>
-    LoadPrecompiled();
+    std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump> LoadPrecompiled();

    /// Removes the transferable (and precompiled) cache file.
    void InvalidateTransferable();
@@ -200,10 +199,6 @@ public:
    /// Saves shader usage to the transferable file. Does not check for collisions.
    void SaveUsage(const ShaderDiskCacheUsage& usage);

-    /// Saves a decompiled entry to the precompiled file. Does not check for collisions.
-    void SaveDecompiled(u64 unique_identifier, const std::string& code,
-                        const GLShader::ShaderEntries& entries);
-
    /// Saves a dump entry to the precompiled file. Does not check for collisions.
    void SaveDump(const ShaderDiskCacheUsage& usage, GLuint program);

@@ -212,18 +207,9 @@ public:

 private:
    /// Loads the transferable cache. Returns empty on failure.
-    std::optional<std::pair<std::unordered_map<u64, ShaderDiskCacheDecompiled>,
-                            std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>>>
+    std::optional<std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>>
    LoadPrecompiledFile(FileUtil::IOFile& file);

-    /// Loads a decompiled cache entry from m_precompiled_cache_virtual_file. Returns empty on
-    /// failure.
-    std::optional<ShaderDiskCacheDecompiled> LoadDecompiledEntry();
-
-    /// Saves a decompiled entry to the passed file. Returns true on success.
-    bool SaveDecompiledFile(u64 unique_identifier, const std::string& code,
-                            const GLShader::ShaderEntries& entries);
-
    /// Opens current game's transferable file and write it's header if it doesn't exist
    FileUtil::IOFile AppendTransferableFile() const;

--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -16,93 +16,51 @@ using VideoCommon::Shader::CompilerSettings;
 using VideoCommon::Shader::ProgramCode;
 using VideoCommon::Shader::ShaderIR;

-static constexpr u32 PROGRAM_OFFSET = 10;
-static constexpr u32 COMPUTE_OFFSET = 0;
-
-static constexpr CompilerSettings settings{CompileDepth::NoFlowStack, true};
-
-ProgramResult GenerateVertexShader(const Device& device, const ShaderSetup& setup) {
-    const std::string id = fmt::format("{:016x}", setup.program.unique_identifier);
-
-    std::string out = "// Shader Unique Id: VS" + id + "\n\n";
-    out += GetCommonDeclarations();
-
+std::string GenerateVertexShader(const Device& device, const ShaderIR& ir, const ShaderIR* ir_b) {
+    std::string out = GetCommonDeclarations();
    out += R"(
 layout (std140, binding = EMULATION_UBO_BINDING) uniform vs_config {
-    vec4 viewport_flip;
-    uvec4 config_pack; // instance_id, flip_stage, y_direction, padding
+    float y_direction;
 };

 )";
-
-    const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a, settings);
-    const auto stage = setup.IsDualProgram() ? ProgramType::VertexA : ProgramType::VertexB;
-    ProgramResult program = Decompile(device, program_ir, stage, "vertex");
-    out += program.first;
-
-    if (setup.IsDualProgram()) {
-        const ShaderIR program_ir_b(setup.program.code_b, PROGRAM_OFFSET, setup.program.size_b,
-                                    settings);
-        ProgramResult program_b = Decompile(device, program_ir_b, ProgramType::VertexB, "vertex_b");
-        out += program_b.first;
+    const auto stage = ir_b ? ProgramType::VertexA : ProgramType::VertexB;
+    out += Decompile(device, ir, stage, "vertex");
+    if (ir_b) {
+        out += Decompile(device, *ir_b, ProgramType::VertexB, "vertex_b");
    }

    out += R"(
 void main() {
    execute_vertex();
 )";
-
-    if (setup.IsDualProgram()) {
+    if (ir_b) {
        out += "    execute_vertex_b();";
    }
-
-    out += R"(
-
-    // Set Position Y direction
-    gl_Position.y *= utof(config_pack[2]);
-    // Check if the flip stage is VertexB
-    // Config pack's second value is flip_stage
-    if (config_pack[1] == 1) {
-        // Viewport can be flipped, which is unsupported by glViewport
-        gl_Position.xy *= viewport_flip.xy;
-    }
-})";
-
-    return {std::move(out), std::move(program.second)};
+    out += "}\n";
+    return out;
 }

-ProgramResult GenerateGeometryShader(const Device& device, const ShaderSetup& setup) {
-    const std::string id = fmt::format("{:016x}", setup.program.unique_identifier);
-
-    std::string out = "// Shader Unique Id: GS" + id + "\n\n";
-    out += GetCommonDeclarations();
-
+std::string GenerateGeometryShader(const Device& device, const ShaderIR& ir) {
+    std::string out = GetCommonDeclarations();
    out += R"(
 layout (std140, binding = EMULATION_UBO_BINDING) uniform gs_config {
-    vec4 viewport_flip;
-    uvec4 config_pack; // instance_id, flip_stage, y_direction, padding
+    float y_direction;
 };

 )";
-
-    const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a, settings);
-    ProgramResult program = Decompile(device, program_ir, ProgramType::Geometry, "geometry");
-    out += program.first;
+    out += Decompile(device, ir, ProgramType::Geometry, "geometry");

    out += R"(
 void main() {
    execute_geometry();
-};)";
-
-    return {std::move(out), std::move(program.second)};
+}
+)";
+    return out;
 }

-ProgramResult GenerateFragmentShader(const Device& device, const ShaderSetup& setup) {
-    const std::string id = fmt::format("{:016x}", setup.program.unique_identifier);
-
-    std::string out = "// Shader Unique Id: FS" + id + "\n\n";
-    out += GetCommonDeclarations();
-
+std::string GenerateFragmentShader(const Device& device, const ShaderIR& ir) {
+    std::string out = GetCommonDeclarations();
    out += R"(
 layout (location = 0) out vec4 FragColor0;
 layout (location = 1) out vec4 FragColor1;
@@ -114,41 +72,29 @@ layout (location = 6) out vec4 FragColor6;
 layout (location = 7) out vec4 FragColor7;

 layout (std140, binding = EMULATION_UBO_BINDING) uniform fs_config {
-    vec4 viewport_flip;
-    uvec4 config_pack; // instance_id, flip_stage, y_direction, padding
+    float y_direction;
 };

 )";
-
-    const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a, settings);
-    ProgramResult program = Decompile(device, program_ir, ProgramType::Fragment, "fragment");
-    out += program.first;
+    out += Decompile(device, ir, ProgramType::Fragment, "fragment");

    out += R"(
 void main() {
    execute_fragment();
 }
-
 )";
-    return {std::move(out), std::move(program.second)};
+    return out;
 }

-ProgramResult GenerateComputeShader(const Device& device, const ShaderSetup& setup) {
-    const std::string id = fmt::format("{:016x}", setup.program.unique_identifier);
-
-    std::string out = "// Shader Unique Id: CS" + id + "\n\n";
-    out += GetCommonDeclarations();
-
-    const ShaderIR program_ir(setup.program.code, COMPUTE_OFFSET, setup.program.size_a, settings);
-    ProgramResult program = Decompile(device, program_ir, ProgramType::Compute, "compute");
-    out += program.first;
-
+std::string GenerateComputeShader(const Device& device, const ShaderIR& ir) {
+    std::string out = GetCommonDeclarations();
+    out += Decompile(device, ir, ProgramType::Compute, "compute");
    out += R"(
 void main() {
    execute_compute();
 }
 )";
-    return {std::move(out), std::move(program.second)};
+    return out;
 }

 } // namespace OpenGL::GLShader
--- a/src/video_core/renderer_opengl/gl_shader_gen.h
+++ b/src/video_core/renderer_opengl/gl_shader_gen.h
@@ -17,44 +17,18 @@ class Device;
 namespace OpenGL::GLShader {

 using VideoCommon::Shader::ProgramCode;
-
-struct ShaderSetup {
-    explicit ShaderSetup(ProgramCode program_code) {
-        program.code = std::move(program_code);
-    }
-
-    struct {
-        ProgramCode code;
-        ProgramCode code_b; // Used for dual vertex shaders
-        u64 unique_identifier;
-        std::size_t size_a;
-        std::size_t size_b;
-    } program;
-
-    /// Used in scenarios where we have a dual vertex shaders
-    void SetProgramB(ProgramCode program_b) {
-        program.code_b = std::move(program_b);
-        has_program_b = true;
-    }
-
-    bool IsDualProgram() const {
-        return has_program_b;
-    }
-
-private:
-    bool has_program_b{};
-};
+using VideoCommon::Shader::ShaderIR;

 /// Generates the GLSL vertex shader program source code for the given VS program
-ProgramResult GenerateVertexShader(const Device& device, const ShaderSetup& setup);
+std::string GenerateVertexShader(const Device& device, const ShaderIR& ir, const ShaderIR* ir_b);

 /// Generates the GLSL geometry shader program source code for the given GS program
-ProgramResult GenerateGeometryShader(const Device& device, const ShaderSetup& setup);
+std::string GenerateGeometryShader(const Device& device, const ShaderIR& ir);

 /// Generates the GLSL fragment shader program source code for the given FS program
-ProgramResult GenerateFragmentShader(const Device& device, const ShaderSetup& setup);
+std::string GenerateFragmentShader(const Device& device, const ShaderIR& ir);

 /// Generates the GLSL compute shader program source code for the given CS program
-ProgramResult GenerateComputeShader(const Device& device, const ShaderSetup& setup);
+std::string GenerateComputeShader(const Device& device, const ShaderIR& ir);

 } // namespace OpenGL::GLShader
--- a/src/video_core/renderer_opengl/gl_shader_manager.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp
@@ -40,27 +40,11 @@ void ProgramManager::UpdatePipeline() {
    old_state = current_state;
 }

-void MaxwellUniformData::SetFromRegs(const Maxwell3D& maxwell, std::size_t shader_stage) {
+void MaxwellUniformData::SetFromRegs(const Maxwell3D& maxwell) {
    const auto& regs = maxwell.regs;
-    const auto& state = maxwell.state;
-
-    // TODO(bunnei): Support more than one viewport
-    viewport_flip[0] = regs.viewport_transform[0].scale_x < 0.0 ? -1.0f : 1.0f;
-    viewport_flip[1] = regs.viewport_transform[0].scale_y < 0.0 ? -1.0f : 1.0f;
-
-    instance_id = state.current_instance;
-
-    // Assign in which stage the position has to be flipped
-    // (the last stage before the fragment shader).
-    constexpr u32 geometry_index = static_cast<u32>(Maxwell3D::Regs::ShaderProgram::Geometry);
-    if (maxwell.regs.shader_config[geometry_index].enable) {
-        flip_stage = geometry_index;
-    } else {
-        flip_stage = static_cast<u32>(Maxwell3D::Regs::ShaderProgram::VertexB);
-    }

    // Y_NEGATE controls what value S2R returns for the Y_DIRECTION system value.
-    y_direction = regs.screen_y_control.y_negate == 0 ? 1.f : -1.f;
+    y_direction = regs.screen_y_control.y_negate == 0 ? 1.0f : -1.0f;
 }

 } // namespace OpenGL::GLShader
--- a/src/video_core/renderer_opengl/gl_shader_manager.h
+++ b/src/video_core/renderer_opengl/gl_shader_manager.h
@@ -18,17 +18,12 @@ namespace OpenGL::GLShader {
 /// @note Always keep a vec4 at the end. The GL spec is not clear whether the alignment at
 ///       the end of a uniform block is included in UNIFORM_BLOCK_DATA_SIZE or not.
 ///       Not following that rule will cause problems on some AMD drivers.
-struct MaxwellUniformData {
-    void SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell, std::size_t shader_stage);
+struct alignas(16) MaxwellUniformData {
+    void SetFromRegs(const Tegra::Engines::Maxwell3D& maxwell);

-    alignas(16) GLvec4 viewport_flip;
-    struct alignas(16) {
-        GLuint instance_id;
-        GLuint flip_stage;
-        GLfloat y_direction;
-    };
+    GLfloat y_direction;
 };
-static_assert(sizeof(MaxwellUniformData) == 32, "MaxwellUniformData structure size is incorrect");
+static_assert(sizeof(MaxwellUniformData) == 16, "MaxwellUniformData structure size is incorrect");
 static_assert(sizeof(MaxwellUniformData) < 16384,
              "MaxwellUniformData structure must be less than 16kb as per the OpenGL spec");

--- a/src/video_core/renderer_opengl/gl_state.cpp
+++ b/src/video_core/renderer_opengl/gl_state.cpp
@@ -2,6 +2,7 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

+#include <algorithm>
 #include <iterator>
 #include <glad/glad.h>
 #include "common/assert.h"
@@ -69,147 +70,29 @@ void Enable(GLenum cap, GLuint index, bool enable) {
 }

 void Enable(GLenum cap, bool& current_value, bool new_value) {
-    if (UpdateValue(current_value, new_value))
+    if (UpdateValue(current_value, new_value)) {
        Enable(cap, new_value);
+    }
 }

 void Enable(GLenum cap, GLuint index, bool& current_value, bool new_value) {
-    if (UpdateValue(current_value, new_value))
+    if (UpdateValue(current_value, new_value)) {
        Enable(cap, index, new_value);
+    }
 }

-} // namespace
+} // Anonymous namespace

-OpenGLState::OpenGLState() {
-    // These all match default OpenGL values
-    framebuffer_srgb.enabled = false;
-
-    multisample_control.alpha_to_coverage = false;
-    multisample_control.alpha_to_one = false;
-
-    cull.enabled = false;
-    cull.mode = GL_BACK;
-    cull.front_face = GL_CCW;
-
-    depth.test_enabled = false;
-    depth.test_func = GL_LESS;
-    depth.write_mask = GL_TRUE;
-
-    primitive_restart.enabled = false;
-    primitive_restart.index = 0;
-
-    for (auto& item : color_mask) {
-        item.red_enabled = GL_TRUE;
-        item.green_enabled = GL_TRUE;
-        item.blue_enabled = GL_TRUE;
-        item.alpha_enabled = GL_TRUE;
-    }
-
-    const auto ResetStencil = [](auto& config) {
-        config.test_func = GL_ALWAYS;
-        config.test_ref = 0;
-        config.test_mask = 0xFFFFFFFF;
-        config.write_mask = 0xFFFFFFFF;
-        config.action_depth_fail = GL_KEEP;
-        config.action_depth_pass = GL_KEEP;
-        config.action_stencil_fail = GL_KEEP;
-    };
-    stencil.test_enabled = false;
-    ResetStencil(stencil.front);
-    ResetStencil(stencil.back);
-
-    for (auto& item : viewports) {
-        item.x = 0;
-        item.y = 0;
-        item.width = 0;
-        item.height = 0;
-        item.depth_range_near = 0.0f;
-        item.depth_range_far = 1.0f;
-        item.scissor.enabled = false;
-        item.scissor.x = 0;
-        item.scissor.y = 0;
-        item.scissor.width = 0;
-        item.scissor.height = 0;
-    }
-
-    for (auto& item : blend) {
-        item.enabled = true;
-        item.rgb_equation = GL_FUNC_ADD;
-        item.a_equation = GL_FUNC_ADD;
-        item.src_rgb_func = GL_ONE;
-        item.dst_rgb_func = GL_ZERO;
-        item.src_a_func = GL_ONE;
-        item.dst_a_func = GL_ZERO;
-    }
-
-    independant_blend.enabled = false;
-
-    blend_color.red = 0.0f;
-    blend_color.green = 0.0f;
-    blend_color.blue = 0.0f;
-    blend_color.alpha = 0.0f;
-
-    logic_op.enabled = false;
-    logic_op.operation = GL_COPY;
-
-    draw.read_framebuffer = 0;
-    draw.draw_framebuffer = 0;
-    draw.vertex_array = 0;
-    draw.shader_program = 0;
-    draw.program_pipeline = 0;
-
-    clip_distance = {};
-
-    point.size = 1;
-
-    fragment_color_clamp.enabled = false;
-
-    depth_clamp.far_plane = false;
-    depth_clamp.near_plane = false;
-
-    polygon_offset.fill_enable = false;
-    polygon_offset.line_enable = false;
-    polygon_offset.point_enable = false;
-    polygon_offset.factor = 0.0f;
-    polygon_offset.units = 0.0f;
-    polygon_offset.clamp = 0.0f;
-
-    alpha_test.enabled = false;
-    alpha_test.func = GL_ALWAYS;
-    alpha_test.ref = 0.0f;
-}
+OpenGLState::OpenGLState() = default;

 void OpenGLState::SetDefaultViewports() {
-    for (auto& item : viewports) {
-        item.x = 0;
-        item.y = 0;
-        item.width = 0;
-        item.height = 0;
-        item.depth_range_near = 0.0f;
-        item.depth_range_far = 1.0f;
-        item.scissor.enabled = false;
-        item.scissor.x = 0;
-        item.scissor.y = 0;
-        item.scissor.width = 0;
-        item.scissor.height = 0;
-    }
+    viewports.fill(Viewport{});

    depth_clamp.far_plane = false;
    depth_clamp.near_plane = false;
 }

-void OpenGLState::ApplyDefaultState() {
-    glEnable(GL_BLEND);
-    glDisable(GL_FRAMEBUFFER_SRGB);
-    glDisable(GL_CULL_FACE);
-    glDisable(GL_DEPTH_TEST);
-    glDisable(GL_PRIMITIVE_RESTART);
-    glDisable(GL_STENCIL_TEST);
-    glDisable(GL_COLOR_LOGIC_OP);
-    glDisable(GL_SCISSOR_TEST);
-}
-
-void OpenGLState::ApplyFramebufferState() const {
+void OpenGLState::ApplyFramebufferState() {
    if (UpdateValue(cur_state.draw.read_framebuffer, draw.read_framebuffer)) {
        glBindFramebuffer(GL_READ_FRAMEBUFFER, draw.read_framebuffer);
    }
@@ -218,52 +101,52 @@ void OpenGLState::ApplyFramebufferState() const {
    }
 }

-void OpenGLState::ApplyVertexArrayState() const {
+void OpenGLState::ApplyVertexArrayState() {
    if (UpdateValue(cur_state.draw.vertex_array, draw.vertex_array)) {
        glBindVertexArray(draw.vertex_array);
    }
 }

-void OpenGLState::ApplyShaderProgram() const {
+void OpenGLState::ApplyShaderProgram() {
    if (UpdateValue(cur_state.draw.shader_program, draw.shader_program)) {
        glUseProgram(draw.shader_program);
    }
 }

-void OpenGLState::ApplyProgramPipeline() const {
+void OpenGLState::ApplyProgramPipeline() {
    if (UpdateValue(cur_state.draw.program_pipeline, draw.program_pipeline)) {
        glBindProgramPipeline(draw.program_pipeline);
    }
 }

-void OpenGLState::ApplyClipDistances() const {
+void OpenGLState::ApplyClipDistances() {
    for (std::size_t i = 0; i < clip_distance.size(); ++i) {
        Enable(GL_CLIP_DISTANCE0 + static_cast<GLenum>(i), cur_state.clip_distance[i],
               clip_distance[i]);
    }
 }

-void OpenGLState::ApplyPointSize() const {
+void OpenGLState::ApplyPointSize() {
    if (UpdateValue(cur_state.point.size, point.size)) {
        glPointSize(point.size);
    }
 }

-void OpenGLState::ApplyFragmentColorClamp() const {
+void OpenGLState::ApplyFragmentColorClamp() {
    if (UpdateValue(cur_state.fragment_color_clamp.enabled, fragment_color_clamp.enabled)) {
        glClampColor(GL_CLAMP_FRAGMENT_COLOR_ARB,
                     fragment_color_clamp.enabled ? GL_TRUE : GL_FALSE);
    }
 }

-void OpenGLState::ApplyMultisample() const {
+void OpenGLState::ApplyMultisample() {
    Enable(GL_SAMPLE_ALPHA_TO_COVERAGE, cur_state.multisample_control.alpha_to_coverage,
           multisample_control.alpha_to_coverage);
    Enable(GL_SAMPLE_ALPHA_TO_ONE, cur_state.multisample_control.alpha_to_one,
           multisample_control.alpha_to_one);
 }

-void OpenGLState::ApplyDepthClamp() const {
+void OpenGLState::ApplyDepthClamp() {
    if (depth_clamp.far_plane == cur_state.depth_clamp.far_plane &&
        depth_clamp.near_plane == cur_state.depth_clamp.near_plane) {
        return;
@@ -276,7 +159,7 @@ void OpenGLState::ApplyDepthClamp() const {
    Enable(GL_DEPTH_CLAMP, depth_clamp.far_plane || depth_clamp.near_plane);
 }

-void OpenGLState::ApplySRgb() const {
+void OpenGLState::ApplySRgb() {
    if (cur_state.framebuffer_srgb.enabled == framebuffer_srgb.enabled)
        return;
    cur_state.framebuffer_srgb.enabled = framebuffer_srgb.enabled;
@@ -287,7 +170,7 @@ void OpenGLState::ApplySRgb() const {
    }
 }

-void OpenGLState::ApplyCulling() const {
+void OpenGLState::ApplyCulling() {
    Enable(GL_CULL_FACE, cur_state.cull.enabled, cull.enabled);

    if (UpdateValue(cur_state.cull.mode, cull.mode)) {
@@ -299,7 +182,12 @@ void OpenGLState::ApplyCulling() const {
    }
 }

-void OpenGLState::ApplyColorMask() const {
+void OpenGLState::ApplyColorMask() {
+    if (!dirty.color_mask) {
+        return;
+    }
+    dirty.color_mask = false;
+
    for (std::size_t i = 0; i < Maxwell::NumRenderTargets; ++i) {
        const auto& updated = color_mask[i];
        auto& current = cur_state.color_mask[i];
@@ -314,7 +202,7 @@ void OpenGLState::ApplyColorMask() const {
    }
 }

-void OpenGLState::ApplyDepth() const {
+void OpenGLState::ApplyDepth() {
    Enable(GL_DEPTH_TEST, cur_state.depth.test_enabled, depth.test_enabled);

    if (cur_state.depth.test_func != depth.test_func) {
@@ -328,7 +216,7 @@ void OpenGLState::ApplyDepth() const {
    }
 }

-void OpenGLState::ApplyPrimitiveRestart() const {
+void OpenGLState::ApplyPrimitiveRestart() {
    Enable(GL_PRIMITIVE_RESTART, cur_state.primitive_restart.enabled, primitive_restart.enabled);

    if (cur_state.primitive_restart.index != primitive_restart.index) {
@@ -337,7 +225,12 @@ void OpenGLState::ApplyPrimitiveRestart() const {
    }
 }

-void OpenGLState::ApplyStencilTest() const {
+void OpenGLState::ApplyStencilTest() {
+    if (!dirty.stencil_state) {
+        return;
+    }
+    dirty.stencil_state = false;
+
    Enable(GL_STENCIL_TEST, cur_state.stencil.test_enabled, stencil.test_enabled);

    const auto ConfigStencil = [](GLenum face, const auto& config, auto& current) {
@@ -366,7 +259,7 @@ void OpenGLState::ApplyStencilTest() const {
    ConfigStencil(GL_BACK, stencil.back, cur_state.stencil.back);
 }

-void OpenGLState::ApplyViewport() const {
+void OpenGLState::ApplyViewport() {
    for (GLuint i = 0; i < static_cast<GLuint>(Maxwell::NumViewports); ++i) {
        const auto& updated = viewports[i];
        auto& current = cur_state.viewports[i];
@@ -403,7 +296,7 @@ void OpenGLState::ApplyViewport() const {
    }
 }

-void OpenGLState::ApplyGlobalBlending() const {
+void OpenGLState::ApplyGlobalBlending() {
    const Blend& updated = blend[0];
    Blend& current = cur_state.blend[0];

@@ -427,7 +320,7 @@ void OpenGLState::ApplyGlobalBlending() const {
    }
 }

-void OpenGLState::ApplyTargetBlending(std::size_t target, bool force) const {
+void OpenGLState::ApplyTargetBlending(std::size_t target, bool force) {
    const Blend& updated = blend[target];
    Blend& current = cur_state.blend[target];

@@ -451,7 +344,12 @@ void OpenGLState::ApplyTargetBlending(std::size_t target, bool force) const {
    }
 }

-void OpenGLState::ApplyBlending() const {
+void OpenGLState::ApplyBlending() {
+    if (!dirty.blend_state) {
+        return;
+    }
+    dirty.blend_state = false;
+
    if (independant_blend.enabled) {
        const bool force = independant_blend.enabled != cur_state.independant_blend.enabled;
        for (std::size_t target = 0; target < Maxwell::NumRenderTargets; ++target) {
@@ -470,7 +368,7 @@ void OpenGLState::ApplyBlending() const {
    }
 }

-void OpenGLState::ApplyLogicOp() const {
+void OpenGLState::ApplyLogicOp() {
    Enable(GL_COLOR_LOGIC_OP, cur_state.logic_op.enabled, logic_op.enabled);

    if (UpdateValue(cur_state.logic_op.operation, logic_op.operation)) {
@@ -478,7 +376,12 @@ void OpenGLState::ApplyLogicOp() const {
    }
 }

-void OpenGLState::ApplyPolygonOffset() const {
+void OpenGLState::ApplyPolygonOffset() {
+    if (!dirty.polygon_offset) {
+        return;
+    }
+    dirty.polygon_offset = false;
+
    Enable(GL_POLYGON_OFFSET_FILL, cur_state.polygon_offset.fill_enable,
           polygon_offset.fill_enable);
    Enable(GL_POLYGON_OFFSET_LINE, cur_state.polygon_offset.line_enable,
@@ -499,7 +402,7 @@ void OpenGLState::ApplyPolygonOffset() const {
    }
 }

-void OpenGLState::ApplyAlphaTest() const {
+void OpenGLState::ApplyAlphaTest() {
    Enable(GL_ALPHA_TEST, cur_state.alpha_test.enabled, alpha_test.enabled);
    if (UpdateTie(std::tie(cur_state.alpha_test.func, cur_state.alpha_test.ref),
                  std::tie(alpha_test.func, alpha_test.ref))) {
@@ -507,19 +410,25 @@ void OpenGLState::ApplyAlphaTest() const {
    }
 }

-void OpenGLState::ApplyTextures() const {
+void OpenGLState::ApplyClipControl() { // Pushes clip_control.origin to GL, gated by UpdateValue
+    if (UpdateValue(cur_state.clip_control.origin, clip_control.origin)) {
+        glClipControl(clip_control.origin, GL_NEGATIVE_ONE_TO_ONE); // depth mode is hard-coded
+    }
+}
+
+void OpenGLState::ApplyTextures() {
    if (const auto update = UpdateArray(cur_state.textures, textures)) {
        glBindTextures(update->first, update->second, textures.data() + update->first);
    }
 }

-void OpenGLState::ApplySamplers() const {
+void OpenGLState::ApplySamplers() {
    if (const auto update = UpdateArray(cur_state.samplers, samplers)) {
        glBindSamplers(update->first, update->second, samplers.data() + update->first);
    }
 }

-void OpenGLState::ApplyImages() const {
+void OpenGLState::ApplyImages() {
    if (const auto update = UpdateArray(cur_state.images, images)) {
        glBindImageTextures(update->first, update->second, images.data() + update->first);
    }
@@ -535,33 +444,22 @@ void OpenGLState::Apply() {
    ApplyPointSize();
    ApplyFragmentColorClamp();
    ApplyMultisample();
-    if (dirty.color_mask) {
-        ApplyColorMask();
-        dirty.color_mask = false;
-    }
+    ApplyColorMask();
    ApplyDepthClamp();
    ApplyViewport();
-    if (dirty.stencil_state) {
-        ApplyStencilTest();
-        dirty.stencil_state = false;
-    }
+    ApplyStencilTest();
    ApplySRgb();
    ApplyCulling();
    ApplyDepth();
    ApplyPrimitiveRestart();
-    if (dirty.blend_state) {
-        ApplyBlending();
-        dirty.blend_state = false;
-    }
+    ApplyBlending();
    ApplyLogicOp();
    ApplyTextures();
    ApplySamplers();
    ApplyImages();
-    if (dirty.polygon_offset) {
-        ApplyPolygonOffset();
-        dirty.polygon_offset = false;
-    }
+    ApplyPolygonOffset();
    ApplyAlphaTest();
+    ApplyClipControl();
 }

 void OpenGLState::EmulateViewportWithScissor() {
--- a/src/video_core/renderer_opengl/gl_state.h
+++ b/src/video_core/renderer_opengl/gl_state.h
@@ -5,168 +5,150 @@
 #pragma once

 #include <array>
+#include <type_traits>
 #include <glad/glad.h>
 #include "video_core/engines/maxwell_3d.h"

 namespace OpenGL {

-namespace TextureUnits {
-
-struct TextureUnit {
-    GLint id;
-    constexpr GLenum Enum() const {
-        return static_cast<GLenum>(GL_TEXTURE0 + id);
-    }
-};
-
-constexpr TextureUnit MaxwellTexture(int unit) {
-    return TextureUnit{unit};
-}
-
-constexpr TextureUnit LightingLUT{3};
-constexpr TextureUnit FogLUT{4};
-constexpr TextureUnit ProcTexNoiseLUT{5};
-constexpr TextureUnit ProcTexColorMap{6};
-constexpr TextureUnit ProcTexAlphaMap{7};
-constexpr TextureUnit ProcTexLUT{8};
-constexpr TextureUnit ProcTexDiffLUT{9};
-
-} // namespace TextureUnits
-
 class OpenGLState {
 public:
    struct {
-        bool enabled; // GL_FRAMEBUFFER_SRGB
+        bool enabled = false; // GL_FRAMEBUFFER_SRGB
    } framebuffer_srgb;

    struct {
-        bool alpha_to_coverage; // GL_ALPHA_TO_COVERAGE
-        bool alpha_to_one;      // GL_ALPHA_TO_ONE
+        bool alpha_to_coverage = false; // GL_ALPHA_TO_COVERAGE
+        bool alpha_to_one = false;      // GL_ALPHA_TO_ONE
    } multisample_control;

    struct {
-        bool enabled; // GL_CLAMP_FRAGMENT_COLOR_ARB
+        bool enabled = false; // GL_CLAMP_FRAGMENT_COLOR_ARB
    } fragment_color_clamp;

    struct {
-        bool far_plane;
-        bool near_plane;
+        bool far_plane = false;
+        bool near_plane = false;
    } depth_clamp; // GL_DEPTH_CLAMP

    struct {
-        bool enabled;      // GL_CULL_FACE
-        GLenum mode;       // GL_CULL_FACE_MODE
-        GLenum front_face; // GL_FRONT_FACE
+        bool enabled = false;       // GL_CULL_FACE
+        GLenum mode = GL_BACK;      // GL_CULL_FACE_MODE
+        GLenum front_face = GL_CCW; // GL_FRONT_FACE
    } cull;

    struct {
-        bool test_enabled;    // GL_DEPTH_TEST
-        GLenum test_func;     // GL_DEPTH_FUNC
-        GLboolean write_mask; // GL_DEPTH_WRITEMASK
+        bool test_enabled = false;      // GL_DEPTH_TEST
+        GLboolean write_mask = GL_TRUE; // GL_DEPTH_WRITEMASK
+        GLenum test_func = GL_LESS;     // GL_DEPTH_FUNC
    } depth;

    struct {
-        bool enabled;
-        GLuint index;
+        bool enabled = false;
+        GLuint index = 0;
    } primitive_restart; // GL_PRIMITIVE_RESTART

    struct ColorMask {
-        GLboolean red_enabled;
-        GLboolean green_enabled;
-        GLboolean blue_enabled;
-        GLboolean alpha_enabled;
+        GLboolean red_enabled = GL_TRUE;
+        GLboolean green_enabled = GL_TRUE;
+        GLboolean blue_enabled = GL_TRUE;
+        GLboolean alpha_enabled = GL_TRUE;
    };
    std::array<ColorMask, Tegra::Engines::Maxwell3D::Regs::NumRenderTargets>
        color_mask; // GL_COLOR_WRITEMASK
    struct {
-        bool test_enabled; // GL_STENCIL_TEST
+        bool test_enabled = false; // GL_STENCIL_TEST
        struct {
-            GLenum test_func;           // GL_STENCIL_FUNC
-            GLint test_ref;             // GL_STENCIL_REF
-            GLuint test_mask;           // GL_STENCIL_VALUE_MASK
-            GLuint write_mask;          // GL_STENCIL_WRITEMASK
-            GLenum action_stencil_fail; // GL_STENCIL_FAIL
-            GLenum action_depth_fail;   // GL_STENCIL_PASS_DEPTH_FAIL
-            GLenum action_depth_pass;   // GL_STENCIL_PASS_DEPTH_PASS
+            GLenum test_func = GL_ALWAYS;         // GL_STENCIL_FUNC
+            GLint test_ref = 0;                   // GL_STENCIL_REF
+            GLuint test_mask = 0xFFFFFFFF;        // GL_STENCIL_VALUE_MASK
+            GLuint write_mask = 0xFFFFFFFF;       // GL_STENCIL_WRITEMASK
+            GLenum action_stencil_fail = GL_KEEP; // GL_STENCIL_FAIL
+            GLenum action_depth_fail = GL_KEEP;   // GL_STENCIL_PASS_DEPTH_FAIL
+            GLenum action_depth_pass = GL_KEEP;   // GL_STENCIL_PASS_DEPTH_PASS
        } front, back;
    } stencil;

    struct Blend {
-        bool enabled;        // GL_BLEND
-        GLenum rgb_equation; // GL_BLEND_EQUATION_RGB
-        GLenum a_equation;   // GL_BLEND_EQUATION_ALPHA
-        GLenum src_rgb_func; // GL_BLEND_SRC_RGB
-        GLenum dst_rgb_func; // GL_BLEND_DST_RGB
-        GLenum src_a_func;   // GL_BLEND_SRC_ALPHA
-        GLenum dst_a_func;   // GL_BLEND_DST_ALPHA
+        bool enabled = false;              // GL_BLEND
+        GLenum rgb_equation = GL_FUNC_ADD; // GL_BLEND_EQUATION_RGB
+        GLenum a_equation = GL_FUNC_ADD;   // GL_BLEND_EQUATION_ALPHA
+        GLenum src_rgb_func = GL_ONE;      // GL_BLEND_SRC_RGB
+        GLenum dst_rgb_func = GL_ZERO;     // GL_BLEND_DST_RGB
+        GLenum src_a_func = GL_ONE;        // GL_BLEND_SRC_ALPHA
+        GLenum dst_a_func = GL_ZERO;       // GL_BLEND_DST_ALPHA
    };
    std::array<Blend, Tegra::Engines::Maxwell3D::Regs::NumRenderTargets> blend;

    struct {
-        bool enabled;
+        bool enabled = false;
    } independant_blend;

    struct {
-        GLclampf red;
-        GLclampf green;
-        GLclampf blue;
-        GLclampf alpha;
+        GLclampf red = 0.0f;
+        GLclampf green = 0.0f;
+        GLclampf blue = 0.0f;
+        GLclampf alpha = 0.0f;
    } blend_color; // GL_BLEND_COLOR

    struct {
-        bool enabled; // GL_LOGIC_OP_MODE
-        GLenum operation;
+        bool enabled = false; // GL_LOGIC_OP_MODE
+        GLenum operation = GL_COPY;
    } logic_op;

-    std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::NumTextureSamplers> textures{};
-    std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::NumTextureSamplers> samplers{};
-    std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::NumImages> images{};
+    std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::NumTextureSamplers> textures = {};
+    std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::NumTextureSamplers> samplers = {};
+    std::array<GLuint, Tegra::Engines::Maxwell3D::Regs::NumImages> images = {};

    struct {
-        GLuint read_framebuffer; // GL_READ_FRAMEBUFFER_BINDING
-        GLuint draw_framebuffer; // GL_DRAW_FRAMEBUFFER_BINDING
-        GLuint vertex_array;     // GL_VERTEX_ARRAY_BINDING
-        GLuint shader_program;   // GL_CURRENT_PROGRAM
-        GLuint program_pipeline; // GL_PROGRAM_PIPELINE_BINDING
+        GLuint read_framebuffer = 0; // GL_READ_FRAMEBUFFER_BINDING
+        GLuint draw_framebuffer = 0; // GL_DRAW_FRAMEBUFFER_BINDING
+        GLuint vertex_array = 0;     // GL_VERTEX_ARRAY_BINDING
+        GLuint shader_program = 0;   // GL_CURRENT_PROGRAM
+        GLuint program_pipeline = 0; // GL_PROGRAM_PIPELINE_BINDING
    } draw;

-    struct viewport {
-        GLint x;
-        GLint y;
-        GLint width;
-        GLint height;
-        GLfloat depth_range_near; // GL_DEPTH_RANGE
-        GLfloat depth_range_far;  // GL_DEPTH_RANGE
+    struct Viewport {
+        GLint x = 0;
+        GLint y = 0;
+        GLint width = 0;
+        GLint height = 0;
+        GLfloat depth_range_near = 0.0f; // GL_DEPTH_RANGE
+        GLfloat depth_range_far = 1.0f;  // GL_DEPTH_RANGE
        struct {
-            bool enabled; // GL_SCISSOR_TEST
-            GLint x;
-            GLint y;
-            GLsizei width;
-            GLsizei height;
+            bool enabled = false; // GL_SCISSOR_TEST
+            GLint x = 0;
+            GLint y = 0;
+            GLsizei width = 0;
+            GLsizei height = 0;
        } scissor;
    };
-    std::array<viewport, Tegra::Engines::Maxwell3D::Regs::NumViewports> viewports;
+    std::array<Viewport, Tegra::Engines::Maxwell3D::Regs::NumViewports> viewports;

    struct {
-        float size; // GL_POINT_SIZE
+        float size = 1.0f; // GL_POINT_SIZE
    } point;

    struct {
-        bool point_enable;
-        bool line_enable;
-        bool fill_enable;
-        GLfloat units;
-        GLfloat factor;
-        GLfloat clamp;
+        bool point_enable = false;
+        bool line_enable = false;
+        bool fill_enable = false;
+        GLfloat units = 0.0f;
+        GLfloat factor = 0.0f;
+        GLfloat clamp = 0.0f;
    } polygon_offset;

    struct {
-        bool enabled; // GL_ALPHA_TEST
-        GLenum func;  // GL_ALPHA_TEST_FUNC
-        GLfloat ref;  // GL_ALPHA_TEST_REF
+        bool enabled = false;    // GL_ALPHA_TEST
+        GLenum func = GL_ALWAYS; // GL_ALPHA_TEST_FUNC
+        GLfloat ref = 0.0f;      // GL_ALPHA_TEST_REF
    } alpha_test;

-    std::array<bool, 8> clip_distance; // GL_CLIP_DISTANCE
+    std::array<bool, 8> clip_distance = {}; // GL_CLIP_DISTANCE
+
+    struct {
+        GLenum origin = GL_LOWER_LEFT;
+    } clip_control;

    OpenGLState();

@@ -179,34 +161,32 @@ public:
    /// Apply this state as the current OpenGL state
    void Apply();

-    void ApplyFramebufferState() const;
-    void ApplyVertexArrayState() const;
-    void ApplyShaderProgram() const;
-    void ApplyProgramPipeline() const;
-    void ApplyClipDistances() const;
-    void ApplyPointSize() const;
-    void ApplyFragmentColorClamp() const;
-    void ApplyMultisample() const;
-    void ApplySRgb() const;
-    void ApplyCulling() const;
-    void ApplyColorMask() const;
-    void ApplyDepth() const;
-    void ApplyPrimitiveRestart() const;
-    void ApplyStencilTest() const;
-    void ApplyViewport() const;
-    void ApplyTargetBlending(std::size_t target, bool force) const;
-    void ApplyGlobalBlending() const;
-    void ApplyBlending() const;
-    void ApplyLogicOp() const;
-    void ApplyTextures() const;
-    void ApplySamplers() const;
-    void ApplyImages() const;
-    void ApplyDepthClamp() const;
-    void ApplyPolygonOffset() const;
-    void ApplyAlphaTest() const;
-
-    /// Set the initial OpenGL state
-    static void ApplyDefaultState();
+    void ApplyFramebufferState();
+    void ApplyVertexArrayState();
+    void ApplyShaderProgram();
+    void ApplyProgramPipeline();
+    void ApplyClipDistances();
+    void ApplyPointSize();
+    void ApplyFragmentColorClamp();
+    void ApplyMultisample();
+    void ApplySRgb();
+    void ApplyCulling();
+    void ApplyColorMask();
+    void ApplyDepth();
+    void ApplyPrimitiveRestart();
+    void ApplyStencilTest();
+    void ApplyViewport();
+    void ApplyTargetBlending(std::size_t target, bool force);
+    void ApplyGlobalBlending();
+    void ApplyBlending();
+    void ApplyLogicOp();
+    void ApplyTextures();
+    void ApplySamplers();
+    void ApplyImages();
+    void ApplyDepthClamp();
+    void ApplyPolygonOffset();
+    void ApplyAlphaTest();
+    void ApplyClipControl();

    /// Resets any references to the given resource
    OpenGLState& UnbindTexture(GLuint handle);
@@ -253,5 +233,6 @@ private:
        bool color_mask;
    } dirty{};
 };
+static_assert(std::is_trivially_copyable_v<OpenGLState>);

 } // namespace OpenGL
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -131,6 +131,7 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format
    {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_8X6_SRGB
    {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false},        // ASTC_2D_6X5
    {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_BYTE, ComponentType::UNorm, false}, // ASTC_2D_6X5_SRGB
+    {GL_RGB9_E5, GL_RGB, GL_UNSIGNED_INT_5_9_9_9_REV, ComponentType::Float, false}, // E5B9G9R9F

    // Depth formats
    {GL_DEPTH_COMPONENT32F, GL_DEPTH_COMPONENT, GL_FLOAT, ComponentType::Float, false}, // Z32F
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -54,11 +54,13 @@ in vec2 frag_tex_coord;
 out vec4 color;

 uniform sampler2D color_texture;
+uniform vec4 backlight;

 void main() {
    // Swap RGBA -> ABGR so we don't have to do this on the CPU. This needs to change if we have to
    // support more framebuffer pixel formats.
-    color = texture(color_texture, frag_tex_coord);
+    // Also multiply the color by the backlight multiplier supplied.
+    color = texture(color_texture, frag_tex_coord) * backlight;
 }
 )";

@@ -121,8 +123,13 @@ void RendererOpenGL::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
        // Load the framebuffer from memory, draw it to the screen, and swap buffers
        LoadFBToScreenInfo(*framebuffer);

-        if (renderer_settings.screenshot_requested)
+        if (renderer_settings.screenshot_requested) {
            CaptureScreenshot();
+        }
+
+        if (renderer_settings.backlight_fade_time > 0) {
+            UpdateBacklight();
+        }

        DrawScreen(render_window.GetFramebufferLayout());

@@ -205,9 +212,13 @@ void RendererOpenGL::InitOpenGLObjects() {
    state.Apply();
    uniform_modelview_matrix = glGetUniformLocation(shader.handle, "modelview_matrix");
    uniform_color_texture = glGetUniformLocation(shader.handle, "color_texture");
+    uniform_backlight = glGetUniformLocation(shader.handle, "backlight");
    attrib_position = glGetAttribLocation(shader.handle, "vert_position");
    attrib_tex_coord = glGetAttribLocation(shader.handle, "vert_tex_coord");

+    // Initialize backlight
+    glUniform4f(uniform_backlight, 1.f, 1.f, 1.f, 1.f);
+
    // Generate VBO handle for drawing
    vertex_buffer.Create();

@@ -416,6 +427,29 @@ void RendererOpenGL::CaptureScreenshot() {
    renderer_settings.screenshot_requested = false;
 }

+// Advances the backlight fade by one step and uploads the brightness to the "backlight" uniform.
+void RendererOpenGL::UpdateBacklight() {
+    constexpr u64 PER_FRAME_FADE_TIME = 1000000000.0f / 60; // presumably ns per 60 Hz frame
+    const auto remaining = renderer_settings.backlight_fade_time.load(std::memory_order_relaxed);
+    auto brightness = renderer_settings.current_brightness.load(std::memory_order_relaxed);
+    // At most one step left: upload the stored brightness unchanged and end the fade.
+    if (remaining <= PER_FRAME_FADE_TIME) {
+        glUniform4f(uniform_backlight, brightness, brightness, brightness, brightness);
+        renderer_settings.backlight_fade_time = 0;
+        fade_time_max = 0;
+        return;
+    }
+    // fade_time_max == 0 marks a fresh fade: latch its duration and the brightness at its start.
+    if (fade_time_max == 0) {
+        fade_time_max = remaining;
+        value_max = brightness;
+    }
+    brightness += (value_max - brightness) * PER_FRAME_FADE_TIME / fade_time_max;
+    glUniform4f(uniform_backlight, brightness, brightness, brightness, brightness);
+    renderer_settings.backlight_fade_time -= PER_FRAME_FADE_TIME;
+    renderer_settings.current_brightness = brightness;
+}
+
 static const char* GetSource(GLenum source) {
 #define RET(s)                                                                                     \
    case GL_DEBUG_SOURCE_##s:                                                                      \
--- a/src/video_core/renderer_opengl/renderer_opengl.h
+++ b/src/video_core/renderer_opengl/renderer_opengl.h
@@ -70,6 +70,7 @@ private:
    void UpdateFramerate();

    void CaptureScreenshot();
+    void UpdateBacklight();

    // Loads framebuffer from emulated memory into the display information structure
    void LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuffer);
@@ -97,6 +98,7 @@ private:
    // Shader uniform location indices
    GLuint uniform_modelview_matrix;
    GLuint uniform_color_texture;
+    GLuint uniform_backlight;

    // Shader attribute input indices
    GLuint attrib_position;
@@ -105,6 +107,10 @@ private:
    /// Used for transforming the framebuffer orientation
    Tegra::FramebufferConfig::TransformFlags framebuffer_transform_flags;
    Common::Rectangle<int> framebuffer_crop_rect;
+
+    // Used for backlight transitions
+    u64 fade_time_max = 0;
+    f32 value_max = 0;
 };

 } // namespace OpenGL
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -1682,10 +1682,13 @@ public:
            switch (index) {
            case Tegra::Shader::Pred::NeverExecute:
                target = decomp.v_false;
+                break;
            case Tegra::Shader::Pred::UnusedIndex:
                target = decomp.v_true;
+                break;
            default:
                target = decomp.predicates.at(index);
+                break;
            }
        } else if (const auto flag = std::get_if<InternalFlagNode>(&*cc)) {
            target = decomp.internal_flags.at(static_cast<u32>(flag->GetFlag()));
@@ -1701,6 +1704,13 @@ public:
        return expr.value ? decomp.v_true : decomp.v_false;
    }

+    Id operator()(const ExprGprEqual& expr) {
+        // Bool result: GPR bits == expr.value. OpLogicalEqual is only valid on boolean operands.
+        const Id imm = decomp.Constant(decomp.t_uint, expr.value);
+        const Id raw = decomp.Emit(decomp.OpLoad(decomp.t_float, decomp.registers.at(expr.gpr)));
+        return decomp.Emit(decomp.OpIEqual(decomp.t_bool, decomp.BitcastTo<Type::Uint>(raw), imm));
+    }
+
    Id Visit(const Expr& node) {
        return std::visit(*this, *node);
    }
--- a/src/video_core/shader/ast.cpp
+++ b/src/video_core/shader/ast.cpp
@@ -228,6 +228,10 @@ public:
        inner += expr.value ? "true" : "false";
    }

+    void operator()(const ExprGprEqual& expr) { // Appends "( gpr_<index> == <value>)" to inner
+        inner += "( gpr_" + std::to_string(expr.gpr) + " == " + std::to_string(expr.value) + ')';
+    }
+
    const std::string& GetResult() const {
        return inner;
    }
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
yuzubot	e7d6207a61	"Merge Tagged PR 1012"	2019-11-03 10:19:13 +00:00
yuzubot	2b0d48d67c	"Merge Tagged PR 1340"	2019-11-03 10:19:11 +00:00
yuzubot	7d2fa09e77	"Merge Tagged PR 1703"	2019-11-03 10:19:10 +00:00
yuzubot	f1b371016d	"Merge Tagged PR 2365"	2019-11-03 10:19:09 +00:00
yuzubot	ce49a47e88	"Merge Tagged PR 2542"	2019-11-03 10:19:09 +00:00
yuzubot	cf63652688	"Merge Tagged PR 2859"	2019-11-03 10:19:08 +00:00
yuzubot	2c67c9ac82	"Merge Tagged PR 2914"	2019-11-03 10:19:07 +00:00
yuzubot	026f0a0e4d	"Merge Tagged PR 2945"	2019-11-03 10:19:06 +00:00
yuzubot	269e21aa3e	"Merge Tagged PR 2987"	2019-11-03 10:19:05 +00:00
yuzubot	e233b2d6a8	"Merge Tagged PR 3047"	2019-11-03 10:19:04 +00:00
yuzubot	12ed1a05ab	"Merge Tagged PR 3057"	2019-11-03 10:19:04 +00:00
yuzubot	72e0907574	"Merge Tagged PR 3062"	2019-11-03 10:19:03 +00:00
Mat M	2b4208254e	Merge pull request #3060 from FearlessTobi/patch-1 common/bit_field: Remove FORCE_INLINE calls	2019-11-03 04:59:06 -05:00
bunnei	84887b0088	Merge pull request #3064 from yuzu-emu/revert-3063-zero-init-padding Revert "common_func: Use std::array for INSERT_PADDING_* macros."	2019-11-03 04:40:04 -05:00
bunnei	4edf73344f	Revert "common_func: Use std::array for INSERT_PADDING_* macros."	2019-11-03 04:39:51 -05:00
bunnei	8c1e38f744	Merge pull request #3063 from bunnei/zero-init-padding common_func: Use std::array for INSERT_PADDING_* macros.	2019-11-03 04:35:12 -05:00
bunnei	fdc5791b08	common_func: Use std::array for INSERT_PADDING_* macros. - Zero initialization here is useful for determinism.	2019-11-03 04:30:16 -05:00
Tobias	a81987a7cb	common/bit_field: Remove FORCE_INLINE calls See bunneis comment here https://github.com/citra-emu/citra/pull/4629#discussion_r258533167. They were supposed to be removed by him, but he missed them.	2019-11-03 08:25:37 +01:00
bunnei	bec7e3b7d9	Merge pull request #3058 from FearlessTobi/port-4948 Port citra-emu/citra#4948: "citra_qt: add amiibo drag and drop support"	2019-11-03 01:55:21 -04:00
FearlessTobi	727ba2f2d0	citra_qt: add amiibo drag and drop support Co-Authored-By: Valentin Vanelslande <vvanelslandedev@gmail.com>	2019-11-03 05:24:47 +01:00
Rodrigo Locatti	11e39da02b	Merge pull request #3054 from FernandoS27/fix-tld4-2 shader_ir: Fix regression on TLD4	2019-10-31 01:56:29 +00:00
Fernando Sahmkow	23cabc98db	Shader_IR: Fix regression on TLD4 Originally on the last commit I thought TLD4 acted the same as TLD4S and didn't have a mask. It actually does have a component mask. This commit corrects that.	2019-10-30 21:14:57 -04:00
Rodrigo Locatti	658489ebf7	Merge pull request #3050 from FernandoS27/fix-tld4 shader_ir: Fix TLD4 and add bindless variant	2019-10-30 18:37:17 +00:00
Fernando Sahmkow	9293c3a0f2	Shader_IR: Fix TLD4 and add Bindless Variant. This commit fixes an issue where not all 4 results of tld4 were being written, the color component was defaulted to red, among other things. It also implements the bindless variant.	2019-10-30 12:02:03 -04:00
Rodrigo Locatti	04b838c857	Merge pull request #3038 from lioncash/docs kernel/scheduler: Minor changes	2019-10-30 03:47:28 +00:00
bunnei	2382bbe3ac	Merge pull request #3046 from ReinUsesLisp/clean-gl-state gl_state: Miscellaneous clean up	2019-10-29 22:50:04 -04:00
bunnei	b5138f3c35	Merge pull request #3035 from ReinUsesLisp/rasterizer-accelerated rasterizer_accelerated: Add intermediary for GPU rasterizers	2019-10-29 22:06:41 -04:00
bunnei	a81bd962ab	Merge pull request #3007 from DarkLordZach/fsc-regress savedata_factory: Automatically create certain savedata	2019-10-29 22:05:09 -04:00
Rodrigo Locatti	3d0cde6a75	gl_state: Use std::array::fill instead of std::fill Co-Authored-By: Mat M. <mathew1800@gmail.com>	2019-10-30 01:30:31 +00:00
ReinUsesLisp	ce20ed8e4e	gl_state: Move dirty checks to individual apply calls instead of Apply This requires removing constness from some methods, but for consistency it's removed in all methods.	2019-10-29 21:27:25 -03:00
ReinUsesLisp	3c6557c235	gl_state: Remove ApplyDefaultState OpenGL has defaults values we can trust. Remove these.	2019-10-29 21:27:25 -03:00
ReinUsesLisp	d3651b0b82	gl_state: Change SetDefaultViewports to use default constructor	2019-10-29 21:27:24 -03:00
ReinUsesLisp	c7698d0bc8	gl_state: Minor style changes	2019-10-29 21:27:24 -03:00
ReinUsesLisp	a14d202ac2	gl_state: Remove unused Citra TextureUnits	2019-10-29 21:27:24 -03:00
ReinUsesLisp	28fece8e9b	gl_state: Move initializers from constructor to class declaration	2019-10-29 21:27:23 -03:00
Rodrigo Locatti	2ec5b55ee3	Merge pull request #3004 from ReinUsesLisp/maxwell3d-cleanup maxwell_3d: Remove unused entries	2019-10-29 23:46:33 +00:00
Rodrigo Locatti	9f93ad08a5	Merge pull request #3023 from lioncash/opus externals: Track upstream opus	2019-10-28 02:45:01 -03:00
Rodrigo Locatti	c5d9589942	Merge pull request #3037 from FernandoS27/new-formats video_core: Implement texture format E5B9G9R9_SHAREDEXP.	2019-10-28 01:36:58 -03:00
Lioncash	6c8f28813c	scheduler: Mark parameter of AskForReselectionOrMarkRedundant() as const This is only compared against, so it can be made const.	2019-10-27 23:35:50 -04:00
ReinUsesLisp	fa31e5b868	maxwell_3d/kepler_compute: Remove unused arguments in GetTexture	2019-10-28 00:23:42 -03:00
ReinUsesLisp	538ddd220e	video_core/textures: Remove unused index entry in FullTextureInfo	2019-10-28 00:14:38 -03:00
ReinUsesLisp	961fe4d19b	maxwell_3d: Remove unused method GetStageTextures	2019-10-28 00:14:29 -03:00
Lioncash	f19c1a7cda	scheduler: Silence sign conversion warnings	2019-10-27 22:44:52 -04:00
Lioncash	2fb0bbff29	scheduler: Initialize class members directly where applicable Reduces the overall amount of code.	2019-10-27 22:13:55 -04:00
Lioncash	2dc469ceba	scheduler: Amend documentation comments Adjusts the formatting of a few of the comments an ensures they get recognized as proper Doxygen comments.	2019-10-27 22:12:32 -04:00
David	4c5731c34f	Merge pull request #2971 from FernandoS27/new-scheduler-v2 Kernel: Implement a New Thread Scheduler V2	2019-10-28 10:53:27 +11:00
Fernando Sahmkow	3f9262195b	Video_Core: Implement texture format E5B9G9R9_SHAREDEXP. This commit implements the E5B9G9R9 Texture format into the general system and OpenGL backend.	2019-10-27 16:44:09 -04:00
bunnei	6909b2f0f9	Merge pull request #3034 from ReinUsesLisp/w4244-maxwell3d maxwell_3d: Silence implicit conversion warnings	2019-10-27 15:08:59 -04:00
ReinUsesLisp	3e469cecc1	maxwell_3d: Silence implicit conversion warnings While we are at it, unify types for dirty reg pointers.	2019-10-27 15:22:17 -03:00
bunnei	7e2494e987	Merge pull request #3033 from ReinUsesLisp/w4244-astc astc: Silence implicit conversion warnings	2019-10-27 14:09:53 -04:00
ReinUsesLisp	bd2aff3e26	rasterizer_accelerated: Add intermediary for GPU rasterizers Add an intermediary class that implements common functions across GPU accelerated rasterizers. This avoids code repetition on different backends.	2019-10-27 03:40:08 -03:00
ReinUsesLisp	a5aa1bb174	astc: Silence implicit conversion warnings	2019-10-27 03:04:50 -03:00
Rodrigo Locatti	26f3e18c5c	Merge pull request #2976 from FernandoS27/cache-fast-brx-rebased Implement Fast BRX, fix TXQ and addapt the Shader Cache for it	2019-10-26 16:56:13 -03:00
Fernando Sahmkow	be856a38d6	Shader_IR: Address Feedback.	2019-10-26 15:38:30 -04:00
Rodrigo Locatti	a0d79085c4	Merge pull request #3027 from lioncash/lookup shader_ir: Use std::array with std::pair instead of std::unordered_map	2019-10-26 05:49:15 -03:00
Rodrigo Locatti	d52598173d	Merge pull request #3013 from FernandoS27/tld4s-fix Shader_Ir: Fix TLD4S from using a component mask.	2019-10-25 20:06:26 -03:00
Fernando Sahmkow	e3afd6595a	Shader_IR: Clang format	2019-10-25 09:01:32 -04:00
ReinUsesLisp	78f3e8a757	gl_shader_cache: Implement locker variants invalidation	2019-10-25 09:01:32 -04:00
ReinUsesLisp	ec85648af3	gl_shader_disk_cache: Store and load fast BRX	2019-10-25 09:01:31 -04:00
ReinUsesLisp	fa2c297f3e	const_buffer_locker: Minor style changes	2019-10-25 09:01:31 -04:00
ReinUsesLisp	7b81ba4d8a	gl_shader_decompiler: Move entries to a separate function	2019-10-25 09:01:31 -04:00
Fernando Sahmkow	1244f2d368	Shader_IR: Implement Fast BRX and allow multi-branches in the CFG.	2019-10-25 09:01:31 -04:00
Fernando Sahmkow	a05120ec0b	Shader_IR: Correct typo in Consistent method.	2019-10-25 09:01:30 -04:00
Fernando Sahmkow	33fcec3502	Shader_IR: allow lookup of texture samplers within the shader_ir for instructions that don't provide it	2019-10-25 09:01:30 -04:00
Fernando Sahmkow	8909f52166	Shader_IR: Implement Fast BRX and allow multi-branches in the CFG.	2019-10-25 09:01:30 -04:00
Fernando Sahmkow	acd6441134	Shader_Cache: setup connection of ConstBufferLocker	2019-10-25 09:01:29 -04:00
Fernando Sahmkow	1a58f45d76	VideoCore: Unify const buffer accessing along engines and provide ConstBufferLocker class to shaders.	2019-10-25 09:01:29 -04:00
Fernando Sahmkow	2ef696c85a	Shader_IR: Implement BRX tracking.	2019-10-25 09:01:29 -04:00
James Rowe	5ee4fb6e12	Merge pull request #3029 from jroweboy/revert Revert "ci: Add build name to archive root folder"	2019-10-24 12:53:30 -06:00
James Rowe	969f0afa4e	Revert "ci: Add build name to archive root folder" This reverts commit `5e553a6c26`.	2019-10-24 12:46:15 -06:00
Rodrigo Locatti	5062728669	Merge pull request #3028 from lioncash/constexpr shader_bytecode: Make Matcher constexpr capable	2019-10-24 15:10:40 -03:00
Lioncash	7fdf991097	shader_bytecode: Make Matcher constexpr capable Greatly shrinks the amount of generated code for GetDecodeTable(). Collapses an assembly output of 9000+ lines down to ~3621 with Clang, and 6513 down to ~2616 with GCC, given it's now allowed to construct all the entries as a sequence of constant data.	2019-10-24 01:10:10 -04:00
Lioncash	382717172e	shader_ir: Use std::array with pair instead of unordered_map Given the overall size of the maps are very small, we can use arrays of pairs here instead of always heap allocating a new map every time the functions are called. Given the small size of the maps, the difference in container lookups are negligible, especially given the entries are already sorted.	2019-10-24 00:25:38 -04:00
Rodrigo Locatti	5328d570df	Merge pull request #3024 from lioncash/shadow video_core/shader: Resolve instances of variable shadowing	2019-10-24 00:45:23 -03:00
Lioncash	1f5401c89c	video_core/shader: Resolve instances of variable shadowing Silences a few -Wshadow warnings.	2019-10-23 23:00:31 -04:00
Lioncash	611236c883	externals: Track upstream opus Tracks upstream opus, allowing the library to be easily updated. While we're at it, we incorporate the CMakeLists.txt so that we have easy control over the requirements of the build.	2019-10-23 20:58:54 -04:00
bunnei	012d7f5233	Merge pull request #3022 from DarkLordZach/azure-folder-rename ci: Add build name to archive root folder	2019-10-23 15:52:37 -04:00
Zach Hilman	5e553a6c26	ci: Add build name to archive root folder	2019-10-23 15:23:43 -04:00
bunnei	6fe89acf0d	Merge pull request #2991 from lioncash/npad hid/npad: Minor cleanup	2019-10-22 19:51:24 -04:00
Zach Hilman	bb207fe27a	savedata_factory: Automatically create certain savedata After further hardware investigation, it appears that some games, perhaps those more lazily coded, will not call EnsureSaveData, meaning that they expect the normal (current) save to be automatically made. Additionally, some games do not create a cache or temporary save before use. In these 3 specific instances, the save is created automatically for the game if it doesn't exist.	2019-10-22 15:47:38 -04:00
Fernando Sahmkow	c4a0aa9207	Merge pull request #2995 from ReinUsesLisp/ignore-gmem shader_ir/memory: Ignore global memory when tracking fails	2019-10-22 13:22:43 -04:00
Fernando Sahmkow	7ecf9f7228	Merge pull request #2983 from lioncash/fallthrough gl_shader_decompiler/vk_shader_decompiler: Resolve implicit fallthrough cases	2019-10-22 13:16:46 -04:00
Fernando Sahmkow	1509d2ffbd	Shader_Ir: Fix TLD4S from using a component mask. TLD4S always outputs 4 values, the previous code checked a component mask and omitted those values that weren't part of it. This commit corrects that and makes sure all 4 values are set.	2019-10-22 10:59:07 -04:00
ReinUsesLisp	1ea07954fb	shader_ir/memory: Ignore global memory when tracking fails Ignore global memory operations instead of invoking undefined behaviour when constant buffer tracking fails and we are blasting through asserts, ignore the operation. In the case of LDG this means filling the destination registers with zeroes; for STG this means ignore the instruction as a whole. The default behaviour is still to abort execution on failure.	2019-10-22 02:49:17 -03:00
Lioncash	8d8e495248	hid/npad: Fix incorrect connection boolean value in ConnectAllDisconnectedControllers() We should be setting the connection state to true, otherwise we aren't actually making the controllers connected like the function name indicates.	2019-10-17 18:19:47 -04:00
Lioncash	d076466f26	hid/npad: Add missing break in default case While not an issue, it does prevent fallthrough from occurring if anything is ever added after this case (unlikely to occur, but this turns a trivial "should not cause issues" into a definite "won't cause issues).	2019-10-17 18:17:42 -04:00
Lioncash	26c84718c8	hid/npad: Replace std::for_each with ranged for loops Performs the same behavior, but is built into the core language itself. No functional change.	2019-10-17 18:16:36 -04:00
Lioncash	e433e99191	hid/npad: Remove redundant non-const variant of IsControllerSupported() The const qualified variant can also be called in non-const contexts, so we can remove the non-const variant to eliminate a bit of code duplication.	2019-10-17 18:11:41 -04:00
Lioncash	a71e8066a1	hid/npad: Move function declarations Clearly separate these from the variable declarations to make them more visible.	2019-10-17 18:09:08 -04:00
Lioncash	6947bf8e44	vk_shader_decompiler: Resolve fallthrough within ExprDecompiler's ExprCondCode operator() This would previously result in NeverExecute and UnusedIndex being treated as regular predicates.	2019-10-15 19:40:58 -04:00
Lioncash	b42a74ff2c	gl_shader_decompiler: Resolve fallthrough within ExprDecompiler's ExprCondCode operator() This would previously result in NeverExecute and UnusedIndex being treated as regular predicates.	2019-10-15 19:38:55 -04:00
Fernando Sahmkow	64e652d8cb	Kernel Thread: Cleanup THREADPROCESSORID_DONT_UPDATE.	2019-10-15 11:55:30 -04:00
Fernando Sahmkow	e28c7f5217	Kernel: Address Feedback 2	2019-10-15 11:55:28 -04:00
Fernando Sahmkow	a3524879be	Kernel: Clang Format	2019-10-15 11:55:27 -04:00
Fernando Sahmkow	c32520ceb7	Kernel: Reverse global accessor removal.	2019-10-15 11:55:26 -04:00
Fernando Sahmkow	3073615dbc	Kernel: Address Feedback.	2019-10-15 11:55:25 -04:00
Fernando Sahmkow	25f8606a6d	Kernel Scheduler: Make sure the global scheduler shutdowns correctly.	2019-10-15 11:55:24 -04:00
Fernando Sahmkow	b3c1deba49	Kernel_Thread: Eliminate most global accessors.	2019-10-15 11:55:23 -04:00
Fernando Sahmkow	0b72b34d89	KernelSVC: Assert that condition variable address is aligned to 4 bytes.	2019-10-15 11:55:22 -04:00
Fernando Sahmkow	96b1b144af	Kernel: Correct Paused scheduling	2019-10-15 11:55:21 -04:00
Fernando Sahmkow	1c6a11ab14	Kernel: Corrections to Wait Objects clearing in which a thread could still be signalled after a timeout or a cancel.	2019-10-15 11:55:20 -04:00
Fernando Sahmkow	27d571c084	Kernel: Correct redundant yields to only advance time forward.	2019-10-15 11:55:20 -04:00
Fernando Sahmkow	7176857177	Kernel: Corrections to ModifyByWaitingCountAndSignalToAddressIfEqual	2019-10-15 11:55:19 -04:00
Fernando Sahmkow	44e09e5f21	Kernel: Correct Results in Condition Variables and Mutexes	2019-10-15 11:55:18 -04:00
Fernando Sahmkow	1ec1e81373	Kernel: Clang Format	2019-10-15 11:55:17 -04:00
Fernando Sahmkow	e05a8c2385	Kernel: Remove global system accessor from WaitObject	2019-10-15 11:55:16 -04:00
Fernando Sahmkow	0cf26cee59	Scheduler: Implement Yield Count and Core migration on Thread Preemption.	2019-10-15 11:55:16 -04:00
Fernando Sahmkow	2d382de6fa	Scheduler: Corrections to YieldAndBalanceLoad and Yield bombing protection.	2019-10-15 11:55:15 -04:00
Fernando Sahmkow	b49c0dab87	Kernel: Initial implementation of thread preemption.	2019-10-15 11:55:14 -04:00
Fernando Sahmkow	103f3a2fe5	Scheduler: Add protections for Yield bombing In case of redundant yields, the scheduler will now idle the core for it's timeslice, in order to avoid continuously yielding the same thing over and over.	2019-10-15 11:55:13 -04:00
Fernando Sahmkow	82218c925a	Kernel: Style and Corrections	2019-10-15 11:55:12 -04:00
Fernando Sahmkow	fcc6b34fff	Correct PrepareReschedule	2019-10-15 11:55:12 -04:00
Fernando Sahmkow	3a94e7ea33	Comment and reorganize the scheduler	2019-10-15 11:55:11 -04:00
Fernando Sahmkow	b5d1e44782	Add PrepareReschedule where required.	2019-10-15 11:55:10 -04:00
Fernando Sahmkow	b8b7ebcece	Correct compiling errors and addapt to the new interface.	2019-10-15 11:55:09 -04:00
Fernando Sahmkow	9031502974	Correct Supervisor Calls to work with the new scheduler,	2019-10-15 11:55:08 -04:00
Fernando Sahmkow	47c6c78c03	Redesign CPU Cores to work with the new scheduler	2019-10-15 11:55:07 -04:00
Fernando Sahmkow	57a71f899a	Add interfacing to the Global Scheduler	2019-10-15 11:55:07 -04:00
Fernando Sahmkow	a1ac0c6cb4	Addapt thread class to the new Scheduler	2019-10-15 11:55:06 -04:00
Fernando Sahmkow	b164d8ee53	Implement a new Core Scheduler	2019-10-15 11:55:04 -04:00