Compare commits


67 Commits

Author SHA1 Message Date
ameerj
e394e1ecc4 emit_glsl_atomic: Implement 32x2 fallback atomic ops 2022-01-29 19:56:03 -05:00
ameerj
90a0506d56 lower_int64_to_int32: Add 64-bit atomic fallbacks 2022-01-29 19:56:02 -05:00
ameerj
ad58d7eae7 shaders: Add U64->U32x2 Atomic fallback functions 2022-01-29 19:55:53 -05:00
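
Taken together, these three commits lower unsupported 64-bit atomics to pairs of 32-bit operations on a U32x2 value. A minimal C++ sketch of the idea, with illustrative names (and, like the shader fallback, not atomic across the two halves):

#include <atomic>
#include <cstdint>

// Illustrative stand-in for a 64-bit storage value split into two 32-bit words.
struct U32x2 {
    std::atomic<uint32_t> lo;
    std::atomic<uint32_t> hi;
};

// Fallback 64-bit atomic add: two independent 32-bit atomic adds. Any carry
// from the low word is lost, which is the compromise the fallback accepts.
inline uint64_t AtomicIAdd32x2(U32x2& dst, uint64_t value) {
    const uint32_t old_lo = dst.lo.fetch_add(static_cast<uint32_t>(value));
    const uint32_t old_hi = dst.hi.fetch_add(static_cast<uint32_t>(value >> 32));
    return (uint64_t{old_hi} << 32) | old_lo;
}
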
Morph
11099dda2e Merge pull request #7791 from german77/wall_clock
wall_clock: Use standard wall clock if rtsc frequency is too low
2022-01-28 20:04:24 -05:00
Morph
64a68ccbb4 Merge pull request #7800 from ameerj/spirv-int64-storage
spirv_atomic: Define U32x2 storage buffers for 64-bit storage atomics
2022-01-28 20:03:50 -05:00
ameerj
4790ba7839 spirv_atomic: Define U32x2 storage buffers for 64-bit storage atomics
Some drivers do not support 64-bit atomics and fall back to atomically modifying U32x2 vectors. This change ensures that U32x2 storage vectors are defined in the SPIR-V shader when 64-bit atomics are used.

Fixes a hang on some devices, notably Intel GPUs, when booting Pokemon Legends Arceus
2022-01-28 19:00:04 -05:00
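
A sketch of the declaration-side decision the commit describes (hypothetical names, not yuzu's actual emitter API): the shader emitter must declare a U32x2-typed storage buffer whenever the fallback path will be taken, or the lowered atomics would reference a type that was never defined.

// Hypothetical sketch: pick the SSBO element type up front so fallback
// 32x2 atomics always have a matching declaration to target.
enum class SsboElementType { U64, U32x2 };

SsboElementType ChooseSsboElementType(bool device_supports_int64_atomics,
                                      bool shader_uses_64bit_atomics) {
    if (shader_uses_64bit_atomics && !device_supports_int64_atomics) {
        return SsboElementType::U32x2;  // declare uvec2 storage for the fallback
    }
    return SsboElementType::U64;
}
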
Morph
1900abde13 Merge pull request #7784 from german77/ds5
input_common: Add DS5 to HD rumble list
2022-01-28 18:36:28 -05:00
Morph
60b5670577 Merge pull request #7787 from bunnei/scheduler-deadlock-fix
hle: kernel: KScheduler: Fix deadlock with core waiting for a thread lock that has migrated.
2022-01-28 18:30:29 -05:00
Morph
b00406c8e4 Merge pull request #7788 from ameerj/stream-buffer-begin
buffer_cache: Reduce stream buffer allocations when expanding from the left
2022-01-28 18:30:01 -05:00
Morph
8dea7fa129 Merge pull request #7786 from ameerj/vmnmx-sel
video_minimum_maximum: Implement src operand selectors
2022-01-28 18:24:56 -05:00
Morph
2241d8c971 Merge pull request #7799 from ameerj/amd-xfb
emit_spirv: Add Xfb execution mode when transform feedback is used
2022-01-28 17:55:17 -05:00
ameerj
beaf7654bb emit_spirv: Add Xfb execution mode when transform feedback is used
Fixes Transform Feedback on Vulkan AMD drivers.
2022-01-28 16:32:48 -05:00
bunnei
0dec42431f Merge pull request #7770 from german77/motion-threshold
input_common: Add option to configure gyro threshold
2022-01-27 15:44:04 -08:00
german77
e4c63d432d wall_clock: use standard wall clock if rtsc frequency is too low 2022-01-27 17:07:52 -06:00
ameerj
f300a1d54b buffer_cache: Reduce stream buffer allocations when expanding from the left
The existing stream buffer optimization accounts for size increases at the end of the allocated buffer.
This adds the same optimization, increasing the size from the beginning of the buffer as well to reduce buffer allocations when expanding the same buffer from the left.
2022-01-27 15:31:43 -05:00
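
A simplified sketch of the interval check (illustrative types, not the real buffer cache): the existing path grows the tracked range to the right; the commit adds the symmetric case so a request that only extends past the current begin also reuses the allocation.

#include <cstdint>

struct StreamRange {
    uint64_t begin;
    uint64_t end;
};

// Returns true if the request overlaps the current range and can be served
// by growing it in place instead of allocating a new buffer.
bool TryExpandInPlace(StreamRange& current, const StreamRange& request) {
    if (request.end < current.begin || request.begin > current.end) {
        return false;  // disjoint: a new allocation is required
    }
    if (request.begin < current.begin) {
        current.begin = request.begin;  // new: expand from the left
    }
    if (request.end > current.end) {
        current.end = request.end;      // existing: expand from the right
    }
    return true;
}
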
bunnei
3a1a3dd0db hle: kernel: KScheduler: Fix deadlock with core waiting for a thread lock that has migrated.
- Previously, it was possible for a thread migration to occur from core A to core B.
- Next, core B waits on a guest lock that must be released by a thread queued for core A.
- Meanwhile, core A is still waiting on the core B's current thread lock - resulting in a deadlock.
- Fix this by try-locking the thread lock.
- Fixes softlocks in FF8 and Pokemon Legends Arceus.
2022-01-27 12:17:14 -08:00
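
The shape of the fix in miniature (hypothetical types): the scheduler try-locks the candidate thread's guard and, on failure, loops back to re-read the current thread instead of blocking, so two cores can no longer wait on each other's locks forever.

#include <atomic>
#include <mutex>

struct Thread {
    std::mutex context_guard;
};

void SwitchToCurrent(std::atomic<Thread*>& current_thread) {
    for (;;) {
        Thread* const next = current_thread.load();
        if (next == nullptr) {
            return;  // real code would fall back to the idle thread
        }
        if (!next->context_guard.try_lock()) {
            continue;  // owner may be another core; retry rather than deadlock
        }
        // ... switch to 'next' here ...
        next->context_guard.unlock();
        return;
    }
}
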
ameerj
74e6e3623f video_minimum_maximum: Implement src operand selectors
Used by Pokemon Legends: Arceus
2022-01-27 14:55:08 -05:00
Morph
8a244dd3d3 Merge pull request #7783 from lioncash/abi-cexpr
common/xbyak_api: Make BuildRegSet() constexpr
2022-01-27 10:29:34 -05:00
Narr the Reg
fd1cef5616 input_common: Add DS5 to HD rumble list 2022-01-26 21:49:32 -06:00
bunnei
adcac857f8 Merge pull request #7762 from bunnei/un-map-improve
Kernel Memory Updates (Part 4): Improve Un/MapPages, and more.
2022-01-26 17:54:20 -08:00
Lioncash
f6a049337e common/xbyak_api: Make BuildRegSet() constexpr
This allows us to eliminate any static constructors that would have been
emitted due to the function not being constexpr.
2022-01-26 16:29:15 -05:00
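
Why this matters, in a generic example: a namespace-scope std::bitset built by a non-constexpr function needs dynamic initialization (a static constructor run at startup), while a constexpr builder lets the compiler fold the value into the binary. The sketch mirrors the diff further down, which accumulates bits in an integer because std::bitset's operator[] is not usable in constant expressions here.

#include <bitset>
#include <cstddef>
#include <initializer_list>

constexpr std::bitset<32> BuildSet(std::initializer_list<int> indices) {
    std::size_t bits = 0;
    for (const int index : indices) {
        bits |= std::size_t{1} << index;  // integer accumulation is constexpr-friendly
    }
    return std::bitset<32>{bits};
}

// Evaluated at compile time: no static constructor is emitted for this global.
constexpr inline std::bitset<32> kExampleCallerSaved = BuildSet({1, 2, 8, 9});
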
bunnei
40050c1188 Merge pull request #7780 from lioncash/macro
video_core/macro: Move impl classes into their cpp files
2022-01-26 12:39:59 -08:00
bunnei
9bf7ad97f5 Merge pull request #7769 from german77/no-control
yuzu: Add setting to disable controller navigation
2022-01-25 20:24:42 -08:00
bunnei
5723145165 Merge pull request #7768 from Moonlacer/fsr-1.0.2
Update AMD FidelityFX Super Resolution™ to 1.0.2
2022-01-25 17:32:44 -08:00
Morph
84cc22b21b Merge pull request #7777 from lioncash/nodisc
shader_recompiler: Remove unnecessary [[nodiscard]] specifier
2022-01-25 16:16:20 -05:00
Morph
c93dd45997 Merge pull request #7779 from lioncash/gpu-iface
gpu: Remove obsoleted CDmaPusher() accessors
2022-01-25 16:16:04 -05:00
Morph
a1c4bca908 Merge pull request #7778 from lioncash/comma
vk_fsr: Replace comma operator with semicolon
2022-01-25 16:15:53 -05:00
Morph
432f4441b9 Merge pull request #7774 from lioncash/mapping
input_common/main: Pass MappingData by const reference in callbacks
2022-01-25 16:15:45 -05:00
Morph
306b3491c4 Merge pull request #7773 from lioncash/udp-deprecated
input_common/udp_client: Replace deprecated from_string()/to_ulong() functions
2022-01-25 16:15:27 -05:00
Morph
8dbad556ec Merge pull request #7771 from lioncash/assert
kernel/k_affinity_mask: Remove duplicated assert
2022-01-25 16:15:18 -05:00
Lioncash
a8a4f37628 video_core/macro: Add missing <cstring> header
Necessary since memcpy is used.
2022-01-25 14:10:02 -05:00
Lioncash
81d1a1133d video_core/macro_interpreter: Move impl class to the cpp file
Keeps the implementation hidden from the intended API and lessens the
header dependencies on the interpreter's header.
2022-01-25 14:03:48 -05:00
Lioncash
cfd9f7d25b video_core/macro_hle: Return unique_ptr directly from GetHLEProgram()
Same behavior, but less code and header dependencies.
2022-01-25 13:50:14 -05:00
Lioncash
a05d9405b9 video_core/macro: Remove unused parameter from Execute()
Simplifies the function interface.
2022-01-25 13:41:38 -05:00
Lioncash
74f80299b0 video_core/macro_jit_x64: Remove unused impl class member
Reduces the size of the impl class a tiny bit.
2022-01-25 13:33:09 -05:00
Lioncash
f11eefed56 video_core/macro_jit_x64: Decouple PersistentCallerSavedRegs() from impl
This doesn't depend on class state and can just be a regular function.
2022-01-25 13:31:54 -05:00
Lioncash
6b873b72ae video_core/macro_jit_x64: Move impl class into cpp file
Keeps the implementation internalized and also reduces API-facing header
dependencies.

Notably, this fully internalizes all of the xbyak externals.
2022-01-25 13:31:46 -05:00
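
The same pimpl-style pattern underlies this and the neighboring "move impl class into cpp" commits; a generic sketch (names simplified) of what the header ends up looking like:

// macro_jit_x64.h (sketch) -- no xbyak include is needed anymore
#include <memory>

class MacroJitX64 {
public:
    MacroJitX64();
    ~MacroJitX64();  // defined in the .cpp, where Impl is a complete type
    void Execute();

private:
    struct Impl;                 // forward declaration only
    std::unique_ptr<Impl> impl;  // xbyak lives behind this pointer, in the .cpp
};
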
Lioncash
a3c81745b1 video_core/macro_hle: Move impl class into cpp file
Given it's intended to be an internal implementation class, we can move
it into the cpp file to ensure that.

This also lets us move some header dependencies into the cpp file as
well.
2022-01-25 13:15:48 -05:00
Lioncash
d8486a9968 gpu: Tidy up forward declarations
Over time a few forward declarations became unnecessary, so we can
remove these to tidy up the header a little bit.
2022-01-25 13:05:39 -05:00
Lioncash
9b38c8ef08 gpu: Remove obsoleted CDMAPusher() accessors
These were obsoleted in 2c47f8aa18 but
were accidentally overlooked.
2022-01-25 12:53:56 -05:00
Lioncash
e7af84670d vk_fsr: Replace comma operator with semicolon
Generally, we should be ending statements with a semicolon, not a comma.

Resolves a clang diagnostic.
2022-01-25 12:42:27 -05:00
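
The pitfall in a generic example: the comma operator silently joins what reads like two statements into one expression, which is the kind of construct clang's -Wcomma diagnostic flags.

void Example() {
    int x = 0;
    x = 1,  // comma operator: this and the next line are ONE expression
        x = 2;
    // Same effect, clearer intent, no diagnostic:
    x = 1;
    x = 2;
}
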
Lioncash
b46ec4efea shader_recompiler: Remove unnecessary [[nodiscard]]
Since ConvertLegacyToGeneric has a void return value, there's nothing
that is actually returned by the function.
2022-01-25 12:16:09 -05:00
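
In a generic example of why the attribute was dead weight: [[nodiscard]] asks callers not to ignore a returned value, and a void function returns nothing to ignore.

[[nodiscard]] void ConvertExample();  // nothing to discard; attribute has no effect
[[nodiscard]] int ComputeExample();   // meaningful: ignoring the result warns
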
bunnei
4f9f55ec21 Merge pull request #7765 from bunnei/update-thread-count
hle: kernel: KThread: Improve Increment/Decrement RunningThreadCount.
2022-01-24 18:58:48 -08:00
bunnei
3442365127 Merge pull request #7760 from german77/inverted_keyboard
yuzu: Add modifiers for keyboard
2022-01-24 15:41:49 -08:00
Lioncash
651358d0b6 input_common/input_engine: Ensure PadIdentifier UUIDs have a valid initial state
The default constructor of a UUID instance doesn't initialize the
underlying array.
2022-01-24 11:57:48 -05:00
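
The hazard in miniature (hypothetical type, not yuzu's UUID): a raw array member without an initializer is left indeterminate by the implicit default constructor, so two "empty" identifiers can compare unequal. Giving the member an initializer guarantees a valid initial state.

#include <array>
#include <cstdint>

struct BadId {
    uint8_t bytes[16];  // 'BadId id;' leaves these bytes indeterminate
};

struct GoodId {
    std::array<uint8_t, 16> bytes{};  // always zero-initialized
};
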
Lioncash
187c9d7e33 input_common/input_mapping: Simplify UUID validity checks
Makes the checks a little more intuitive to read and doesn't construct
an extra UUID instance
2022-01-24 11:49:52 -05:00
Lioncash
0849be094e input_common/input_mapping: Add missing includes
Ensures that the class always sees the types it needs.
2022-01-24 11:49:31 -05:00
Lioncash
8bb39750a1 input_common/input_mapping: Remove const from return value
Top-level const on a return by value can inhibit move semantics, and is
unnecessary.
2022-01-24 11:39:20 -05:00
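
A generic example of the problem: a const rvalue cannot bind to the move constructor's or move assignment's T&& parameter, so the copy is chosen instead.

#include <string>

struct Package {
    std::string data;
};

const Package MakeConst() { return Package{"payload"}; }  // top-level const return
Package Make() { return Package{"payload"}; }

void Use() {
    Package p;
    p = Make();       // move assignment: cheap
    p = MakeConst();  // copy assignment: a const rvalue can't bind to Package&&
}
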
Lioncash
12e7d3b254 input_common/input_mapping: Default constructor 2022-01-24 11:37:48 -05:00
Lioncash
51dd3da11c input_common/main: Pass MappingData by const reference in callbacks
Avoids creating an unnecessary 168-byte copy per callback invocation.
2022-01-24 11:31:43 -05:00
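
The shape of the fix (the 168-byte size comes from the commit message; the struct here is a stand-in): std::function<void(T)> copies its argument on every invocation, while const T& forwards the caller's object.

#include <functional>

struct MappingData {
    char payload[168];  // stand-in matching the size cited above
};

const std::function<void(MappingData)> by_value =        // copies 168 bytes per call
    [](MappingData data) { (void)data; };
const std::function<void(const MappingData&)> by_cref =  // passes a reference instead
    [](const MappingData& data) { (void)data; };
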
Lioncash
87eb3cb083 input_common/udp_client: Replace deprecated from_string()/to_ulong() functions
These are deprecated; the make_address variants and to_uint() should be used instead.
2022-01-24 11:14:30 -05:00
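
The replacement in isolation, using Boost.Asio's documented successors (boost::asio::ip::make_address_v4 and address_v4::to_uint, available since Boost 1.66):

#include <boost/asio/ip/address_v4.hpp>
#include <cstdint>
#include <string>

uint32_t HostToUint(const std::string& host) {
    // Deprecated: address_v4::from_string(host).to_ulong()
    const auto ip = boost::asio::ip::make_address_v4(host);
    return ip.to_uint();
}
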
Lioncash
b084a9bf0a input_common/udp_client: Prevent unnecessary string copies
We can also remove some redundant const on the return values, since
these don't do anything
2022-01-24 10:58:25 -05:00
Lioncash
2f12caccf9 kernel/k_affinity_mask: Remove duplicated assert
This is already checked inside GetCoreBit()
2022-01-24 10:35:22 -05:00
german77
b998aa5504 yuzu: Add setting to disable controller navigation 2022-01-23 21:08:49 -06:00
Moonlacer
fdde08bd01 Update FSR to 1.0.2
Updates yuzu's FSR implementation to 1.0.2
2022-01-23 14:38:48 -06:00
bunnei
59add00d4a hle: kernel: KThread: Improve Increment/Decrement RunningThreadCount.
- Previously, the implementation was incorrect and would occasionally underflow.
2022-01-22 21:09:45 -08:00
bunnei
e791da9791 core: hle: kernel: KPageTable: Various improvements to MapPages and UnmapPages. 2022-01-22 20:51:34 -08:00
bunnei
07add23251 core: hle: kernel: KPageTable: MapProcessCode: Various cleanup. 2022-01-22 20:51:34 -08:00
bunnei
ee25e0a40b core: hle: kernel: KPageTable: ReserveTransferMemory: Various cleanup. 2022-01-22 20:51:34 -08:00
bunnei
0cee5e1af8 core: hle: kernel: KPageTable: ResetTransferMemory: Various cleanup. 2022-01-22 20:51:34 -08:00
bunnei
ffcaf5af90 core: hle: kernel: KPageTable: SetMemoryAttribute: Various cleanup. 2022-01-22 20:51:34 -08:00
bunnei
2935c9d8de core: hle: kernel: KPageTable: Assert valid address on GetPhysicalAddr. 2022-01-22 01:33:26 -08:00
bunnei
264bb5abf7 core: hle: kernel: KPageTable: Operate: Assert lock ownership. 2022-01-22 01:33:26 -08:00
bunnei
0137f2e6e1 core: hle: kernel: KPageTable: SetHeapSize: Cleanup & take physical memory lock. 2022-01-22 01:33:26 -08:00
bunnei
6d8e498f76 core: hle: kernel: Refactor Un/MapPhysicalMemory to remove unnecessary methods. 2022-01-22 01:33:26 -08:00
bunnei
b8b1b58f36 core: hle: kernel: Rename Un/Map to Un/MapMemory. 2022-01-22 01:33:26 -08:00
Narr the Reg
7d133fd37e yuzu: Add modifiers for keyboard 2022-01-21 20:41:50 -06:00
54 changed files with 1080 additions and 490 deletions

View File

@@ -747,12 +747,12 @@ AF1 sharpness){
// Immediate constants for peak range.
AF2 peakC=AF2(1.0,-1.0*4.0);
// Limiters, these need to be high precision RCPs.
AF1 hitMinR=mn4R*ARcpF1(AF1_(4.0)*mx4R);
AF1 hitMinG=mn4G*ARcpF1(AF1_(4.0)*mx4G);
AF1 hitMinB=mn4B*ARcpF1(AF1_(4.0)*mx4B);
AF1 hitMaxR=(peakC.x-mx4R)*ARcpF1(AF1_(4.0)*mn4R+peakC.y);
AF1 hitMaxG=(peakC.x-mx4G)*ARcpF1(AF1_(4.0)*mn4G+peakC.y);
AF1 hitMaxB=(peakC.x-mx4B)*ARcpF1(AF1_(4.0)*mn4B+peakC.y);
AF1 hitMinR=min(mn4R,eR)*ARcpF1(AF1_(4.0)*mx4R);
AF1 hitMinG=min(mn4G,eG)*ARcpF1(AF1_(4.0)*mx4G);
AF1 hitMinB=min(mn4B,eB)*ARcpF1(AF1_(4.0)*mx4B);
AF1 hitMaxR=(peakC.x-max(mx4R,eR))*ARcpF1(AF1_(4.0)*mn4R+peakC.y);
AF1 hitMaxG=(peakC.x-max(mx4G,eG))*ARcpF1(AF1_(4.0)*mn4G+peakC.y);
AF1 hitMaxB=(peakC.x-max(mx4B,eB))*ARcpF1(AF1_(4.0)*mn4B+peakC.y);
AF1 lobeR=max(-hitMinR,hitMaxR);
AF1 lobeG=max(-hitMinG,hitMaxG);
AF1 lobeB=max(-hitMinB,hitMaxB);
@@ -845,12 +845,12 @@ AF1 sharpness){
// Immediate constants for peak range.
AH2 peakC=AH2(1.0,-1.0*4.0);
// Limiters, these need to be high precision RCPs.
AH1 hitMinR=mn4R*ARcpH1(AH1_(4.0)*mx4R);
AH1 hitMinG=mn4G*ARcpH1(AH1_(4.0)*mx4G);
AH1 hitMinB=mn4B*ARcpH1(AH1_(4.0)*mx4B);
AH1 hitMaxR=(peakC.x-mx4R)*ARcpH1(AH1_(4.0)*mn4R+peakC.y);
AH1 hitMaxG=(peakC.x-mx4G)*ARcpH1(AH1_(4.0)*mn4G+peakC.y);
AH1 hitMaxB=(peakC.x-mx4B)*ARcpH1(AH1_(4.0)*mn4B+peakC.y);
AH1 hitMinR=min(mn4R,eR)*ARcpH1(AH1_(4.0)*mx4R);
AH1 hitMinG=min(mn4G,eG)*ARcpH1(AH1_(4.0)*mx4G);
AH1 hitMinB=min(mn4B,eB)*ARcpH1(AH1_(4.0)*mx4B);
AH1 hitMaxR=(peakC.x-max(mx4R,eR))*ARcpH1(AH1_(4.0)*mn4R+peakC.y);
AH1 hitMaxG=(peakC.x-max(mx4G,eG))*ARcpH1(AH1_(4.0)*mn4G+peakC.y);
AH1 hitMaxB=(peakC.x-max(mx4B,eB))*ARcpH1(AH1_(4.0)*mn4B+peakC.y);
AH1 lobeR=max(-hitMinR,hitMaxR);
AH1 lobeG=max(-hitMinG,hitMaxG);
AH1 lobeB=max(-hitMinB,hitMaxB);
@@ -963,12 +963,12 @@ AF1 sharpness){
// Immediate constants for peak range.
AH2 peakC=AH2(1.0,-1.0*4.0);
// Limiters, these need to be high precision RCPs.
AH2 hitMinR=mn4R*ARcpH2(AH2_(4.0)*mx4R);
AH2 hitMinG=mn4G*ARcpH2(AH2_(4.0)*mx4G);
AH2 hitMinB=mn4B*ARcpH2(AH2_(4.0)*mx4B);
AH2 hitMaxR=(peakC.x-mx4R)*ARcpH2(AH2_(4.0)*mn4R+peakC.y);
AH2 hitMaxG=(peakC.x-mx4G)*ARcpH2(AH2_(4.0)*mn4G+peakC.y);
AH2 hitMaxB=(peakC.x-mx4B)*ARcpH2(AH2_(4.0)*mn4B+peakC.y);
AH2 hitMinR=min(mn4R,eR)*ARcpH2(AH2_(4.0)*mx4R);
AH2 hitMinG=min(mn4G,eG)*ARcpH2(AH2_(4.0)*mx4G);
AH2 hitMinB=min(mn4B,eB)*ARcpH2(AH2_(4.0)*mx4B);
AH2 hitMaxR=(peakC.x-max(mx4R,eR))*ARcpH2(AH2_(4.0)*mn4R+peakC.y);
AH2 hitMaxG=(peakC.x-max(mx4G,eG))*ARcpH2(AH2_(4.0)*mn4G+peakC.y);
AH2 hitMaxB=(peakC.x-max(mx4B,eB))*ARcpH2(AH2_(4.0)*mn4B+peakC.y);
AH2 lobeR=max(-hitMinR,hitMaxR);
AH2 lobeG=max(-hitMinG,hitMaxG);
AH2 lobeB=max(-hitMinB,hitMaxB);

View File

@@ -554,6 +554,7 @@ struct Values {
Setting<bool> use_docked_mode{true, "use_docked_mode"};
BasicSetting<bool> enable_raw_input{false, "enable_raw_input"};
BasicSetting<bool> controller_navigation{true, "controller_navigation"};
Setting<bool> vibration_enabled{true, "vibration_enabled"};
Setting<bool> enable_accurate_vibrations{false, "enable_accurate_vibrations"};

View File

@@ -72,7 +72,9 @@ std::unique_ptr<WallClock> CreateBestMatchingClock(u32 emulated_cpu_frequency,
if (caps.invariant_tsc) {
rtsc_frequency = EstimateRDTSCFrequency();
}
if (rtsc_frequency == 0) {
// Fall back to StandardWallClock if the rtsc period is longer than a nanosecond
if (rtsc_frequency <= 1000000000) {
return std::make_unique<StandardWallClock>(emulated_cpu_frequency,
emulated_clock_frequency);
} else {

View File

@@ -37,12 +37,12 @@ constexpr Xbyak::Reg IndexToReg(size_t reg_index) {
}
}
inline std::bitset<32> BuildRegSet(std::initializer_list<Xbyak::Reg> regs) {
std::bitset<32> bits;
constexpr std::bitset<32> BuildRegSet(std::initializer_list<Xbyak::Reg> regs) {
size_t bits = 0;
for (const Xbyak::Reg& reg : regs) {
bits[RegToIndex(reg)] = true;
bits |= size_t{1} << RegToIndex(reg);
}
return bits;
return {bits};
}
constexpr inline std::bitset<32> ABI_ALL_GPRS(0x0000FFFF);
@@ -57,7 +57,7 @@ constexpr inline Xbyak::Reg ABI_PARAM2 = Xbyak::util::rdx;
constexpr inline Xbyak::Reg ABI_PARAM3 = Xbyak::util::r8;
constexpr inline Xbyak::Reg ABI_PARAM4 = Xbyak::util::r9;
const std::bitset<32> ABI_ALL_CALLER_SAVED = BuildRegSet({
constexpr inline std::bitset<32> ABI_ALL_CALLER_SAVED = BuildRegSet({
// GPRs
Xbyak::util::rcx,
Xbyak::util::rdx,
@@ -74,7 +74,7 @@ const std::bitset<32> ABI_ALL_CALLER_SAVED = BuildRegSet({
Xbyak::util::xmm5,
});
const std::bitset<32> ABI_ALL_CALLEE_SAVED = BuildRegSet({
constexpr inline std::bitset<32> ABI_ALL_CALLEE_SAVED = BuildRegSet({
// GPRs
Xbyak::util::rbx,
Xbyak::util::rsi,
@@ -108,7 +108,7 @@ constexpr inline Xbyak::Reg ABI_PARAM2 = Xbyak::util::rsi;
constexpr inline Xbyak::Reg ABI_PARAM3 = Xbyak::util::rdx;
constexpr inline Xbyak::Reg ABI_PARAM4 = Xbyak::util::rcx;
const std::bitset<32> ABI_ALL_CALLER_SAVED = BuildRegSet({
constexpr inline std::bitset<32> ABI_ALL_CALLER_SAVED = BuildRegSet({
// GPRs
Xbyak::util::rcx,
Xbyak::util::rdx,
@@ -137,7 +137,7 @@ const std::bitset<32> ABI_ALL_CALLER_SAVED = BuildRegSet({
Xbyak::util::xmm15,
});
const std::bitset<32> ABI_ALL_CALLEE_SAVED = BuildRegSet({
constexpr inline std::bitset<32> ABI_ALL_CALLEE_SAVED = BuildRegSet({
// GPRs
Xbyak::util::rbx,
Xbyak::util::rbp,

View File

@@ -31,8 +31,6 @@ public:
}
constexpr void SetAffinity(s32 core, bool set) {
ASSERT(0 <= core && core < static_cast<s32>(Core::Hardware::NUM_CPU_CORES));
if (set) {
this->mask |= GetCoreBit(core);
} else {

View File

@@ -276,22 +276,23 @@ ResultCode KPageTable::InitializeForProcess(FileSys::ProgramAddressSpaceType as_
ResultCode KPageTable::MapProcessCode(VAddr addr, std::size_t num_pages, KMemoryState state,
KMemoryPermission perm) {
std::lock_guard lock{page_table_lock};
const u64 size{num_pages * PageSize};
if (!CanContain(addr, size, state)) {
return ResultInvalidCurrentMemory;
}
// Validate the mapping request.
R_UNLESS(this->CanContain(addr, size, state), ResultInvalidCurrentMemory);
if (IsRegionMapped(addr, size)) {
return ResultInvalidCurrentMemory;
}
// Lock the table.
std::lock_guard lock{page_table_lock};
// Verify that the destination memory is unmapped.
R_TRY(this->CheckMemoryState(addr, size, KMemoryState::All, KMemoryState::Free,
KMemoryPermission::None, KMemoryPermission::None,
KMemoryAttribute::None, KMemoryAttribute::None));
KPageLinkedList page_linked_list;
CASCADE_CODE(system.Kernel().MemoryManager().Allocate(page_linked_list, num_pages, memory_pool,
allocation_option));
CASCADE_CODE(Operate(addr, num_pages, page_linked_list, OperationType::MapGroup));
R_TRY(system.Kernel().MemoryManager().Allocate(page_linked_list, num_pages, memory_pool,
allocation_option));
R_TRY(Operate(addr, num_pages, page_linked_list, OperationType::MapGroup));
block_manager->Update(addr, num_pages, state, perm);
@@ -395,39 +396,12 @@ ResultCode KPageTable::UnmapProcessMemory(VAddr dst_addr, std::size_t size,
return ResultSuccess;
}
void KPageTable::MapPhysicalMemory(KPageLinkedList& page_linked_list, VAddr start, VAddr end) {
auto node{page_linked_list.Nodes().begin()};
PAddr map_addr{node->GetAddress()};
std::size_t src_num_pages{node->GetNumPages()};
block_manager->IterateForRange(start, end, [&](const KMemoryInfo& info) {
if (info.state != KMemoryState::Free) {
return;
}
std::size_t dst_num_pages{GetSizeInRange(info, start, end) / PageSize};
VAddr dst_addr{GetAddressInRange(info, start)};
while (dst_num_pages) {
if (!src_num_pages) {
node = std::next(node);
map_addr = node->GetAddress();
src_num_pages = node->GetNumPages();
}
const std::size_t num_pages{std::min(src_num_pages, dst_num_pages)};
Operate(dst_addr, num_pages, KMemoryPermission::UserReadWrite, OperationType::Map,
map_addr);
dst_addr += num_pages * PageSize;
map_addr += num_pages * PageSize;
src_num_pages -= num_pages;
dst_num_pages -= num_pages;
}
});
}
ResultCode KPageTable::MapPhysicalMemory(VAddr addr, std::size_t size) {
// Lock the physical memory lock.
std::lock_guard phys_lk(map_physical_memory_lock);
// Lock the table.
std::lock_guard lock{page_table_lock};
std::size_t mapped_size{};
@@ -463,7 +437,35 @@ ResultCode KPageTable::MapPhysicalMemory(VAddr addr, std::size_t size) {
// We succeeded, so commit the memory reservation.
memory_reservation.Commit();
MapPhysicalMemory(page_linked_list, addr, end_addr);
// Map the memory.
auto node{page_linked_list.Nodes().begin()};
PAddr map_addr{node->GetAddress()};
std::size_t src_num_pages{node->GetNumPages()};
block_manager->IterateForRange(addr, end_addr, [&](const KMemoryInfo& info) {
if (info.state != KMemoryState::Free) {
return;
}
std::size_t dst_num_pages{GetSizeInRange(info, addr, end_addr) / PageSize};
VAddr dst_addr{GetAddressInRange(info, addr)};
while (dst_num_pages) {
if (!src_num_pages) {
node = std::next(node);
map_addr = node->GetAddress();
src_num_pages = node->GetNumPages();
}
const std::size_t num_pages{std::min(src_num_pages, dst_num_pages)};
Operate(dst_addr, num_pages, KMemoryPermission::UserReadWrite, OperationType::Map,
map_addr);
dst_addr += num_pages * PageSize;
map_addr += num_pages * PageSize;
src_num_pages -= num_pages;
dst_num_pages -= num_pages;
}
});
mapped_physical_memory_size += remaining_size;
@@ -503,23 +505,8 @@ ResultCode KPageTable::UnmapPhysicalMemory(VAddr addr, std::size_t size) {
return ResultSuccess;
}
CASCADE_CODE(UnmapMemory(addr, size));
auto process{system.Kernel().CurrentProcess()};
process->GetResourceLimit()->Release(LimitableResource::PhysicalMemory, mapped_size);
mapped_physical_memory_size -= mapped_size;
return ResultSuccess;
}
ResultCode KPageTable::UnmapMemory(VAddr addr, std::size_t size) {
std::lock_guard lock{page_table_lock};
const VAddr end_addr{addr + size};
ResultCode result{ResultSuccess};
KPageLinkedList page_linked_list;
// Unmap each region within the range
KPageLinkedList page_linked_list;
block_manager->IterateForRange(addr, end_addr, [&](const KMemoryInfo& info) {
if (info.state == KMemoryState::Normal) {
const std::size_t block_size{GetSizeInRange(info, addr, end_addr)};
@@ -535,7 +522,6 @@ ResultCode KPageTable::UnmapMemory(VAddr addr, std::size_t size) {
}
}
});
if (result.IsError()) {
return result;
}
@@ -546,10 +532,14 @@ ResultCode KPageTable::UnmapMemory(VAddr addr, std::size_t size) {
block_manager->Update(addr, num_pages, KMemoryState::Free);
auto process{system.Kernel().CurrentProcess()};
process->GetResourceLimit()->Release(LimitableResource::PhysicalMemory, mapped_size);
mapped_physical_memory_size -= mapped_size;
return ResultSuccess;
}
ResultCode KPageTable::Map(VAddr dst_addr, VAddr src_addr, std::size_t size) {
ResultCode KPageTable::MapMemory(VAddr dst_addr, VAddr src_addr, std::size_t size) {
std::lock_guard lock{page_table_lock};
KMemoryState src_state{};
@@ -588,7 +578,7 @@ ResultCode KPageTable::Map(VAddr dst_addr, VAddr src_addr, std::size_t size) {
return ResultSuccess;
}
ResultCode KPageTable::Unmap(VAddr dst_addr, VAddr src_addr, std::size_t size) {
ResultCode KPageTable::UnmapMemory(VAddr dst_addr, VAddr src_addr, std::size_t size) {
std::lock_guard lock{page_table_lock};
KMemoryState src_state{};
@@ -652,24 +642,26 @@ ResultCode KPageTable::MapPages(VAddr addr, const KPageLinkedList& page_linked_l
return ResultSuccess;
}
ResultCode KPageTable::MapPages(VAddr addr, KPageLinkedList& page_linked_list, KMemoryState state,
KMemoryPermission perm) {
std::lock_guard lock{page_table_lock};
ResultCode KPageTable::MapPages(VAddr address, KPageLinkedList& page_linked_list,
KMemoryState state, KMemoryPermission perm) {
// Check that the map is in range.
const std::size_t num_pages{page_linked_list.GetNumPages()};
const std::size_t size{num_pages * PageSize};
R_UNLESS(this->CanContain(address, size, state), ResultInvalidCurrentMemory);
if (!CanContain(addr, size, state)) {
return ResultInvalidCurrentMemory;
}
// Lock the table.
std::lock_guard lock{page_table_lock};
if (IsRegionMapped(addr, num_pages * PageSize)) {
return ResultInvalidCurrentMemory;
}
// Check the memory state.
R_TRY(this->CheckMemoryState(address, size, KMemoryState::All, KMemoryState::Free,
KMemoryPermission::None, KMemoryPermission::None,
KMemoryAttribute::None, KMemoryAttribute::None));
CASCADE_CODE(MapPages(addr, page_linked_list, perm));
// Map the pages.
R_TRY(MapPages(address, page_linked_list, perm));
block_manager->Update(addr, num_pages, state, perm);
// Update the blocks.
block_manager->Update(address, num_pages, state, perm);
return ResultSuccess;
}
@@ -693,21 +685,23 @@ ResultCode KPageTable::UnmapPages(VAddr addr, const KPageLinkedList& page_linked
ResultCode KPageTable::UnmapPages(VAddr addr, KPageLinkedList& page_linked_list,
KMemoryState state) {
std::lock_guard lock{page_table_lock};
// Check that the unmap is in range.
const std::size_t num_pages{page_linked_list.GetNumPages()};
const std::size_t size{num_pages * PageSize};
R_UNLESS(this->Contains(addr, size), ResultInvalidCurrentMemory);
if (!CanContain(addr, size, state)) {
return ResultInvalidCurrentMemory;
}
// Lock the table.
std::lock_guard lock{page_table_lock};
if (IsRegionMapped(addr, num_pages * PageSize)) {
return ResultInvalidCurrentMemory;
}
// Check the memory state.
R_TRY(this->CheckMemoryState(addr, size, KMemoryState::All, state, KMemoryPermission::None,
KMemoryPermission::None, KMemoryAttribute::All,
KMemoryAttribute::None));
CASCADE_CODE(UnmapPages(addr, page_linked_list));
// Perform the unmap.
R_TRY(UnmapPages(addr, page_linked_list));
// Update the blocks.
block_manager->Update(addr, num_pages, state, KMemoryPermission::None);
return ResultSuccess;
@@ -765,7 +759,6 @@ ResultCode KPageTable::SetProcessMemoryPermission(VAddr addr, std::size_t size,
// Ensure cache coherency, if we're setting pages as executable.
if (is_x) {
// Memory execution state is changing, invalidate CPU cache range
system.InvalidateCpuInstructionCacheRange(addr, size);
}
@@ -793,12 +786,12 @@ ResultCode KPageTable::ReserveTransferMemory(VAddr addr, std::size_t size, KMemo
KMemoryState state{};
KMemoryAttribute attribute{};
CASCADE_CODE(CheckMemoryState(
&state, nullptr, &attribute, nullptr, addr, size,
KMemoryState::FlagCanTransfer | KMemoryState::FlagReferenceCounted,
KMemoryState::FlagCanTransfer | KMemoryState::FlagReferenceCounted, KMemoryPermission::All,
KMemoryPermission::UserReadWrite, KMemoryAttribute::Mask, KMemoryAttribute::None,
KMemoryAttribute::IpcAndDeviceMapped));
R_TRY(CheckMemoryState(&state, nullptr, &attribute, nullptr, addr, size,
KMemoryState::FlagCanTransfer | KMemoryState::FlagReferenceCounted,
KMemoryState::FlagCanTransfer | KMemoryState::FlagReferenceCounted,
KMemoryPermission::All, KMemoryPermission::UserReadWrite,
KMemoryAttribute::Mask, KMemoryAttribute::None,
KMemoryAttribute::IpcAndDeviceMapped));
block_manager->Update(addr, size / PageSize, state, perm, attribute | KMemoryAttribute::Locked);
@@ -810,12 +803,11 @@ ResultCode KPageTable::ResetTransferMemory(VAddr addr, std::size_t size) {
KMemoryState state{};
CASCADE_CODE(
CheckMemoryState(&state, nullptr, nullptr, nullptr, addr, size,
KMemoryState::FlagCanTransfer | KMemoryState::FlagReferenceCounted,
KMemoryState::FlagCanTransfer | KMemoryState::FlagReferenceCounted,
KMemoryPermission::None, KMemoryPermission::None, KMemoryAttribute::Mask,
KMemoryAttribute::Locked, KMemoryAttribute::IpcAndDeviceMapped));
R_TRY(CheckMemoryState(&state, nullptr, nullptr, nullptr, addr, size,
KMemoryState::FlagCanTransfer | KMemoryState::FlagReferenceCounted,
KMemoryState::FlagCanTransfer | KMemoryState::FlagReferenceCounted,
KMemoryPermission::None, KMemoryPermission::None, KMemoryAttribute::Mask,
KMemoryAttribute::Locked, KMemoryAttribute::IpcAndDeviceMapped));
block_manager->Update(addr, size / PageSize, state, KMemoryPermission::UserReadWrite);
return ResultSuccess;
@@ -871,8 +863,9 @@ ResultCode KPageTable::SetMemoryAttribute(VAddr addr, std::size_t size, u32 mask
AttributeTestMask, KMemoryAttribute::None, ~AttributeTestMask));
// Determine the new attribute.
const auto new_attr = ((old_attr & static_cast<KMemoryAttribute>(~mask)) |
static_cast<KMemoryAttribute>(attr & mask));
const KMemoryAttribute new_attr =
static_cast<KMemoryAttribute>(((old_attr & static_cast<KMemoryAttribute>(~mask)) |
static_cast<KMemoryAttribute>(attr & mask)));
// Perform operation.
this->Operate(addr, num_pages, old_perm, OperationType::ChangePermissionsAndRefresh);
@@ -896,6 +889,9 @@ ResultCode KPageTable::SetMaxHeapSize(std::size_t size) {
}
ResultCode KPageTable::SetHeapSize(VAddr* out, std::size_t size) {
// Lock the physical memory lock.
std::lock_guard phys_lk(map_physical_memory_lock);
// Try to perform a reduction in heap, instead of an extension.
VAddr cur_address{};
std::size_t allocation_size{};
@@ -1025,12 +1021,12 @@ ResultVal<VAddr> KPageTable::AllocateAndMapMemory(std::size_t needed_num_pages,
}
if (is_map_only) {
CASCADE_CODE(Operate(addr, needed_num_pages, perm, OperationType::Map, map_addr));
R_TRY(Operate(addr, needed_num_pages, perm, OperationType::Map, map_addr));
} else {
KPageLinkedList page_group;
CASCADE_CODE(system.Kernel().MemoryManager().Allocate(page_group, needed_num_pages,
memory_pool, allocation_option));
CASCADE_CODE(Operate(addr, needed_num_pages, page_group, OperationType::MapGroup));
R_TRY(system.Kernel().MemoryManager().Allocate(page_group, needed_num_pages, memory_pool,
allocation_option));
R_TRY(Operate(addr, needed_num_pages, page_group, OperationType::MapGroup));
}
block_manager->Update(addr, needed_num_pages, state, perm);
@@ -1186,7 +1182,7 @@ VAddr KPageTable::AllocateVirtualMemory(VAddr start, std::size_t region_num_page
ResultCode KPageTable::Operate(VAddr addr, std::size_t num_pages, const KPageLinkedList& page_group,
OperationType operation) {
std::lock_guard lock{page_table_lock};
ASSERT(this->IsLockedByCurrentThread());
ASSERT(Common::IsAligned(addr, PageSize));
ASSERT(num_pages > 0);
@@ -1211,7 +1207,7 @@ ResultCode KPageTable::Operate(VAddr addr, std::size_t num_pages, const KPageLin
ResultCode KPageTable::Operate(VAddr addr, std::size_t num_pages, KMemoryPermission perm,
OperationType operation, PAddr map_addr) {
std::lock_guard lock{page_table_lock};
ASSERT(this->IsLockedByCurrentThread());
ASSERT(num_pages > 0);
ASSERT(Common::IsAligned(addr, PageSize));

View File

@@ -37,9 +37,8 @@ public:
VAddr src_addr);
ResultCode MapPhysicalMemory(VAddr addr, std::size_t size);
ResultCode UnmapPhysicalMemory(VAddr addr, std::size_t size);
ResultCode UnmapMemory(VAddr addr, std::size_t size);
ResultCode Map(VAddr dst_addr, VAddr src_addr, std::size_t size);
ResultCode Unmap(VAddr dst_addr, VAddr src_addr, std::size_t size);
ResultCode MapMemory(VAddr dst_addr, VAddr src_addr, std::size_t size);
ResultCode UnmapMemory(VAddr dst_addr, VAddr src_addr, std::size_t size);
ResultCode MapPages(VAddr addr, KPageLinkedList& page_linked_list, KMemoryState state,
KMemoryPermission perm);
ResultCode UnmapPages(VAddr addr, KPageLinkedList& page_linked_list, KMemoryState state);
@@ -88,7 +87,6 @@ private:
ResultCode MapPages(VAddr addr, const KPageLinkedList& page_linked_list,
KMemoryPermission perm);
ResultCode UnmapPages(VAddr addr, const KPageLinkedList& page_linked_list);
void MapPhysicalMemory(KPageLinkedList& page_linked_list, VAddr start, VAddr end);
bool IsRegionMapped(VAddr address, u64 size);
bool IsRegionContiguous(VAddr addr, u64 size) const;
void AddRegionToPages(VAddr start, std::size_t num_pages, KPageLinkedList& page_linked_list);
@@ -148,6 +146,7 @@ private:
}
std::recursive_mutex page_table_lock;
std::mutex map_physical_memory_lock;
std::unique_ptr<KMemoryBlockManager> block_manager;
public:
@@ -249,7 +248,9 @@ public:
return !IsOutsideASLRRegion(address, size);
}
constexpr PAddr GetPhysicalAddr(VAddr addr) {
return page_table_impl.backing_addr[addr >> PageBits] + addr;
const auto backing_addr = page_table_impl.backing_addr[addr >> PageBits];
ASSERT(backing_addr);
return backing_addr + addr;
}
constexpr bool Contains(VAddr addr) const {
return address_space_start <= addr && addr <= address_space_end - 1;

View File

@@ -258,7 +258,7 @@ private:
private:
constexpr void ClearAffinityBit(u64& affinity, s32 core) {
affinity &= ~(u64(1) << core);
affinity &= ~(UINT64_C(1) << core);
}
constexpr s32 GetNextCore(u64& affinity) {

View File

@@ -146,6 +146,13 @@ ResultCode KProcess::Initialize(KProcess* process, Core::System& system, std::st
// Open a reference to the resource limit.
process->resource_limit->Open();
// Clear remaining fields.
process->num_running_threads = 0;
process->is_signaled = false;
process->exception_thread = nullptr;
process->is_suspended = false;
process->schedule_count = 0;
return ResultSuccess;
}
@@ -157,20 +164,17 @@ KResourceLimit* KProcess::GetResourceLimit() const {
return resource_limit;
}
void KProcess::IncrementThreadCount() {
ASSERT(num_threads >= 0);
num_created_threads++;
if (const auto count = ++num_threads; count > peak_num_threads) {
peak_num_threads = count;
}
void KProcess::IncrementRunningThreadCount() {
ASSERT(num_running_threads.load() >= 0);
++num_running_threads;
}
void KProcess::DecrementThreadCount() {
ASSERT(num_threads > 0);
void KProcess::DecrementRunningThreadCount() {
ASSERT(num_running_threads.load() > 0);
if (const auto count = --num_threads; count == 0) {
LOG_WARNING(Kernel, "Process termination is not fully implemented.");
if (const auto prev = num_running_threads--; prev == 1) {
// TODO(bunnei): Process termination to be implemented when multiprocess is supported.
UNIMPLEMENTED_MSG("KProcess termination is not implemented!");
}
}

View File

@@ -235,8 +235,8 @@ public:
++schedule_count;
}
void IncrementThreadCount();
void DecrementThreadCount();
void IncrementRunningThreadCount();
void DecrementRunningThreadCount();
void SetRunningThread(s32 core, KThread* thread, u64 idle_count) {
running_threads[core] = thread;
@@ -473,9 +473,7 @@ private:
bool is_suspended{};
bool is_initialized{};
std::atomic<s32> num_created_threads{};
std::atomic<u16> num_threads{};
u16 peak_num_threads{};
std::atomic<u16> num_running_threads{};
std::array<KThread*, Core::Hardware::NUM_CPU_CORES> running_threads{};
std::array<u64, Core::Hardware::NUM_CPU_CORES> running_thread_idle_counts{};

View File

@@ -710,23 +710,19 @@ void KScheduler::Unload(KThread* thread) {
}
void KScheduler::Reload(KThread* thread) {
LOG_TRACE(Kernel, "core {}, reload thread {}", core_id, thread ? thread->GetName() : "nullptr");
LOG_TRACE(Kernel, "core {}, reload thread {}", core_id, thread->GetName());
if (thread) {
ASSERT_MSG(thread->GetState() == ThreadState::Runnable, "Thread must be runnable.");
Core::ARM_Interface& cpu_core = system.ArmInterface(core_id);
cpu_core.LoadContext(thread->GetContext32());
cpu_core.LoadContext(thread->GetContext64());
cpu_core.SetTlsAddress(thread->GetTLSAddress());
cpu_core.SetTPIDR_EL0(thread->GetTPIDR_EL0());
cpu_core.ClearExclusiveState();
}
Core::ARM_Interface& cpu_core = system.ArmInterface(core_id);
cpu_core.LoadContext(thread->GetContext32());
cpu_core.LoadContext(thread->GetContext64());
cpu_core.SetTlsAddress(thread->GetTLSAddress());
cpu_core.SetTPIDR_EL0(thread->GetTPIDR_EL0());
cpu_core.ClearExclusiveState();
}
void KScheduler::SwitchContextStep2() {
// Load context of new thread
Reload(current_thread.load());
Reload(GetCurrentThread());
RescheduleCurrentCore();
}
@@ -735,13 +731,17 @@ void KScheduler::ScheduleImpl() {
KThread* previous_thread = GetCurrentThread();
KThread* next_thread = state.highest_priority_thread;
state.needs_scheduling = false;
state.needs_scheduling.store(false);
// We never want to schedule a null thread, so use the idle thread if we don't have a next.
if (next_thread == nullptr) {
next_thread = idle_thread;
}
if (next_thread->GetCurrentCore() != core_id) {
next_thread->SetCurrentCore(core_id);
}
// We never want to schedule a dummy thread, as these are only used by host threads for locking.
if (next_thread->GetThreadType() == ThreadType::Dummy) {
ASSERT_MSG(false, "Dummy threads should never be scheduled!");
@@ -755,14 +755,8 @@ void KScheduler::ScheduleImpl() {
return;
}
if (next_thread->GetCurrentCore() != core_id) {
next_thread->SetCurrentCore(core_id);
}
current_thread.store(next_thread);
// Update the CPU time tracking variables.
KProcess* const previous_process = system.Kernel().CurrentProcess();
UpdateLastContextSwitchTime(previous_thread, previous_process);
// Save context for previous thread
@@ -770,6 +764,10 @@ void KScheduler::ScheduleImpl() {
std::shared_ptr<Common::Fiber>* old_context;
old_context = &previous_thread->GetHostContext();
// Set the new thread.
current_thread.store(next_thread);
guard.Unlock();
Common::Fiber::YieldTo(*old_context, *switch_fiber);
@@ -797,8 +795,8 @@ void KScheduler::SwitchToCurrent() {
do {
auto next_thread = current_thread.load();
if (next_thread != nullptr) {
next_thread->context_guard.Lock();
if (next_thread->GetRawState() != ThreadState::Runnable) {
const auto locked = next_thread->context_guard.TryLock();
if (state.needs_scheduling.load()) {
next_thread->context_guard.Unlock();
break;
}
@@ -806,6 +804,9 @@ void KScheduler::SwitchToCurrent() {
next_thread->context_guard.Unlock();
break;
}
if (!locked) {
continue;
}
}
auto thread = next_thread ? next_thread : idle_thread;
Common::Fiber::YieldTo(switch_fiber, *thread->GetHostContext());

View File

@@ -215,7 +215,6 @@ ResultCode KThread::Initialize(KThreadFunction func, uintptr_t arg, VAddr user_s
parent = owner;
parent->Open();
parent->IncrementThreadCount();
}
// Initialize thread context.
@@ -327,11 +326,6 @@ void KThread::Finalize() {
}
}
// Decrement the parent process's thread count.
if (parent != nullptr) {
parent->DecrementThreadCount();
}
// Perform inherited finalization.
KSynchronizationObject::Finalize();
}
@@ -1011,7 +1005,7 @@ ResultCode KThread::Run() {
if (IsUserThread() && IsSuspended()) {
this->UpdateState();
}
owner->IncrementThreadCount();
owner->IncrementRunningThreadCount();
}
// Set our state and finish.
@@ -1026,10 +1020,11 @@ ResultCode KThread::Run() {
void KThread::Exit() {
ASSERT(this == GetCurrentThreadPointer(kernel));
// Release the thread resource hint from parent.
// Release the thread resource hint and running thread count from parent.
if (parent != nullptr) {
parent->GetResourceLimit()->Release(Kernel::LimitableResource::Threads, 0, 1);
resource_limit_release_hint = true;
parent->DecrementRunningThreadCount();
}
// Perform termination.

View File

@@ -230,7 +230,7 @@ static ResultCode MapMemory(Core::System& system, VAddr dst_addr, VAddr src_addr
return result;
}
return page_table.Map(dst_addr, src_addr, size);
return page_table.MapMemory(dst_addr, src_addr, size);
}
static ResultCode MapMemory32(Core::System& system, u32 dst_addr, u32 src_addr, u32 size) {
@@ -249,7 +249,7 @@ static ResultCode UnmapMemory(Core::System& system, VAddr dst_addr, VAddr src_ad
return result;
}
return page_table.Unmap(dst_addr, src_addr, size);
return page_table.UnmapMemory(dst_addr, src_addr, size);
}
static ResultCode UnmapMemory32(Core::System& system, u32 dst_addr, u32 src_addr, u32 size) {

View File

@@ -109,8 +109,9 @@ public:
bool HasHDRumble() const {
if (sdl_controller) {
return (SDL_GameControllerGetType(sdl_controller.get()) ==
SDL_CONTROLLER_TYPE_NINTENDO_SWITCH_PRO);
const auto type = SDL_GameControllerGetType(sdl_controller.get());
return (type == SDL_CONTROLLER_TYPE_NINTENDO_SWITCH_PRO) ||
(type == SDL_CONTROLLER_TYPE_PS5);
}
return false;
}

View File

@@ -339,7 +339,7 @@ void UDPClient::StartCommunication(std::size_t client, const std::string& host,
}
}
const PadIdentifier UDPClient::GetPadIdentifier(std::size_t pad_index) const {
PadIdentifier UDPClient::GetPadIdentifier(std::size_t pad_index) const {
const std::size_t client = pad_index / PADS_PER_CLIENT;
return {
.guid = clients[client].uuid,
@@ -348,9 +348,9 @@ const PadIdentifier UDPClient::GetPadIdentifier(std::size_t pad_index) const {
};
}
const Common::UUID UDPClient::GetHostUUID(const std::string host) const {
const auto ip = boost::asio::ip::address_v4::from_string(host);
const auto hex_host = fmt::format("{:06x}", ip.to_ulong());
Common::UUID UDPClient::GetHostUUID(const std::string& host) const {
const auto ip = boost::asio::ip::make_address_v4(host);
const auto hex_host = fmt::format("{:06x}", ip.to_uint());
return Common::UUID{hex_host};
}

View File

@@ -145,8 +145,8 @@ private:
void OnPortInfo(Response::PortInfo);
void OnPadData(Response::PadData, std::size_t client);
void StartCommunication(std::size_t client, const std::string& host, u16 port);
const PadIdentifier GetPadIdentifier(std::size_t pad_index) const;
const Common::UUID GetHostUUID(const std::string host) const;
PadIdentifier GetPadIdentifier(std::size_t pad_index) const;
Common::UUID GetHostUUID(const std::string& host) const;
Common::Input::ButtonNames GetUIButtonName(const Common::ParamPackage& params) const;

View File

@@ -16,7 +16,7 @@
// Pad Identifier of data source
struct PadIdentifier {
Common::UUID guid{};
Common::UUID guid{Common::INVALID_UUID};
std::size_t port{};
std::size_t pad{};
@@ -89,7 +89,7 @@ struct UpdateCallback {
// Triggered if data changed on the controller and the engine is on configuring mode
struct MappingCallback {
std::function<void(MappingData)> on_data;
std::function<void(const MappingData&)> on_data;
};
// Input Identifier of data source

View File

@@ -2,14 +2,13 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included
#include "common/common_types.h"
#include "common/settings.h"
#include "input_common/input_engine.h"
#include "input_common/input_mapping.h"
namespace InputCommon {
MappingFactory::MappingFactory() {}
MappingFactory::MappingFactory() = default;
void MappingFactory::BeginMapping(Polling::InputType type) {
is_enabled = true;
@@ -19,7 +18,7 @@ void MappingFactory::BeginMapping(Polling::InputType type) {
second_axis = -1;
}
[[nodiscard]] const Common::ParamPackage MappingFactory::GetNextInput() {
Common::ParamPackage MappingFactory::GetNextInput() {
Common::ParamPackage input;
input_queue.Pop(input);
return input;
@@ -57,7 +56,7 @@ void MappingFactory::StopMapping() {
void MappingFactory::RegisterButton(const MappingData& data) {
Common::ParamPackage new_input;
new_input.Set("engine", data.engine);
if (data.pad.guid != Common::UUID{}) {
if (data.pad.guid.IsValid()) {
new_input.Set("guid", data.pad.guid.Format());
}
new_input.Set("port", static_cast<int>(data.pad.port));
@@ -93,7 +92,7 @@ void MappingFactory::RegisterButton(const MappingData& data) {
void MappingFactory::RegisterStick(const MappingData& data) {
Common::ParamPackage new_input;
new_input.Set("engine", data.engine);
if (data.pad.guid != Common::UUID{}) {
if (data.pad.guid.IsValid()) {
new_input.Set("guid", data.pad.guid.Format());
}
new_input.Set("port", static_cast<int>(data.pad.port));
@@ -138,7 +137,7 @@ void MappingFactory::RegisterStick(const MappingData& data) {
void MappingFactory::RegisterMotion(const MappingData& data) {
Common::ParamPackage new_input;
new_input.Set("engine", data.engine);
if (data.pad.guid != Common::UUID{}) {
if (data.pad.guid.IsValid()) {
new_input.Set("guid", data.pad.guid.Format());
}
new_input.Set("port", static_cast<int>(data.pad.port));

View File

@@ -3,8 +3,14 @@
// Refer to the license.txt file included
#pragma once
#include "common/param_package.h"
#include "common/threadsafe_queue.h"
namespace InputCommon::Polling {
enum class InputType;
}
namespace InputCommon {
class InputEngine;
struct MappingData;
@@ -20,7 +26,7 @@ public:
void BeginMapping(Polling::InputType type);
/// Returns an input event with mapping information from the input_queue
[[nodiscard]] const Common::ParamPackage GetNextInput();
[[nodiscard]] Common::ParamPackage GetNextInput();
/**
* Registers mapping input data from the driver

View File

@@ -27,7 +27,7 @@ namespace InputCommon {
struct InputSubsystem::Impl {
void Initialize() {
mapping_factory = std::make_shared<MappingFactory>();
MappingCallback mapping_callback{[this](MappingData data) { RegisterInput(data); }};
MappingCallback mapping_callback{[this](const MappingData& data) { RegisterInput(data); }};
keyboard = std::make_shared<Keyboard>("keyboard");
keyboard->SetMappingCallback(mapping_callback);
@@ -284,7 +284,7 @@ struct InputSubsystem::Impl {
#endif
}
void RegisterInput(MappingData data) {
void RegisterInput(const MappingData& data) {
mapping_factory->RegisterInput(data);
}
@@ -394,7 +394,7 @@ void InputSubsystem::BeginMapping(Polling::InputType type) {
impl->mapping_factory->BeginMapping(type);
}
const Common::ParamPackage InputSubsystem::GetNextInput() const {
Common::ParamPackage InputSubsystem::GetNextInput() const {
return impl->mapping_factory->GetNextInput();
}

View File

@@ -126,7 +126,7 @@ public:
void BeginMapping(Polling::InputType type);
/// Returns an input event with mapping information.
[[nodiscard]] const Common::ParamPackage GetNextInput() const;
[[nodiscard]] Common::ParamPackage GetNextInput() const;
/// Stop polling from all backends.
void StopMapping() const;

View File

@@ -372,6 +372,8 @@ void EmitSharedAtomicExchange32(EmitContext& ctx, IR::Inst& inst, ScalarU32 poin
ScalarU32 value);
void EmitSharedAtomicExchange64(EmitContext& ctx, IR::Inst& inst, ScalarU32 pointer_offset,
Register value);
void EmitSharedAtomicExchange32x2(EmitContext& ctx, IR::Inst& inst, ScalarU32 pointer_offset,
Register value);
void EmitStorageAtomicIAdd32(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
ScalarU32 offset, ScalarU32 value);
void EmitStorageAtomicSMin32(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
@@ -412,6 +414,24 @@ void EmitStorageAtomicXor64(EmitContext& ctx, IR::Inst& inst, const IR::Value& b
ScalarU32 offset, Register value);
void EmitStorageAtomicExchange64(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
ScalarU32 offset, Register value);
void EmitStorageAtomicIAdd32x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
ScalarU32 offset, Register value);
void EmitStorageAtomicSMin32x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
ScalarU32 offset, Register value);
void EmitStorageAtomicUMin32x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
ScalarU32 offset, Register value);
void EmitStorageAtomicSMax32x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
ScalarU32 offset, Register value);
void EmitStorageAtomicUMax32x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
ScalarU32 offset, Register value);
void EmitStorageAtomicAnd32x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
ScalarU32 offset, Register value);
void EmitStorageAtomicOr32x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
ScalarU32 offset, Register value);
void EmitStorageAtomicXor32x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
ScalarU32 offset, Register value);
void EmitStorageAtomicExchange32x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
ScalarU32 offset, Register value);
void EmitStorageAtomicAddF32(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
ScalarU32 offset, ScalarF32 value);
void EmitStorageAtomicAddF16x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
@@ -448,6 +468,17 @@ void EmitGlobalAtomicAnd64(EmitContext& ctx);
void EmitGlobalAtomicOr64(EmitContext& ctx);
void EmitGlobalAtomicXor64(EmitContext& ctx);
void EmitGlobalAtomicExchange64(EmitContext& ctx);
void EmitGlobalAtomicIAdd32x2(EmitContext& ctx);
void EmitGlobalAtomicSMin32x2(EmitContext& ctx);
void EmitGlobalAtomicUMin32x2(EmitContext& ctx);
void EmitGlobalAtomicSMax32x2(EmitContext& ctx);
void EmitGlobalAtomicUMax32x2(EmitContext& ctx);
void EmitGlobalAtomicInc32x2(EmitContext& ctx);
void EmitGlobalAtomicDec32x2(EmitContext& ctx);
void EmitGlobalAtomicAnd32x2(EmitContext& ctx);
void EmitGlobalAtomicOr32x2(EmitContext& ctx);
void EmitGlobalAtomicXor32x2(EmitContext& ctx);
void EmitGlobalAtomicExchange32x2(EmitContext& ctx);
void EmitGlobalAtomicAddF32(EmitContext& ctx);
void EmitGlobalAtomicAddF16x2(EmitContext& ctx);
void EmitGlobalAtomicAddF32x2(EmitContext& ctx);

View File

@@ -311,6 +311,13 @@ void EmitSharedAtomicExchange64(EmitContext& ctx, IR::Inst& inst, ScalarU32 poin
ctx.LongAdd("ATOMS.EXCH.U64 {}.x,{},shared_mem[{}];", inst, value, pointer_offset);
}
void EmitSharedAtomicExchange32x2([[maybe_unused]] EmitContext& ctx,
[[maybe_unused]] IR::Inst& inst,
[[maybe_unused]] ScalarU32 pointer_offset,
[[maybe_unused]] Register value) {
throw NotImplementedException("GLASM instruction");
}
void EmitStorageAtomicIAdd32(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
ScalarU32 offset, ScalarU32 value) {
Atom(ctx, inst, binding, offset, value, "ADD", "U32");
@@ -411,6 +418,62 @@ void EmitStorageAtomicExchange64(EmitContext& ctx, IR::Inst& inst, const IR::Val
Atom(ctx, inst, binding, offset, value, "EXCH", "U64");
}
void EmitStorageAtomicIAdd32x2([[maybe_unused]] EmitContext& ctx, [[maybe_unused]] IR::Inst& inst,
[[maybe_unused]] const IR::Value& binding,
[[maybe_unused]] ScalarU32 offset, [[maybe_unused]] Register value) {
throw NotImplementedException("GLASM instruction");
}
void EmitStorageAtomicSMin32x2([[maybe_unused]] EmitContext& ctx, [[maybe_unused]] IR::Inst& inst,
[[maybe_unused]] const IR::Value& binding,
[[maybe_unused]] ScalarU32 offset, [[maybe_unused]] Register value) {
throw NotImplementedException("GLASM instruction");
}
void EmitStorageAtomicUMin32x2([[maybe_unused]] EmitContext& ctx, [[maybe_unused]] IR::Inst& inst,
[[maybe_unused]] const IR::Value& binding,
[[maybe_unused]] ScalarU32 offset, [[maybe_unused]] Register value) {
throw NotImplementedException("GLASM instruction");
}
void EmitStorageAtomicSMax32x2([[maybe_unused]] EmitContext& ctx, [[maybe_unused]] IR::Inst& inst,
[[maybe_unused]] const IR::Value& binding,
[[maybe_unused]] ScalarU32 offset, [[maybe_unused]] Register value) {
throw NotImplementedException("GLASM instruction");
}
void EmitStorageAtomicUMax32x2([[maybe_unused]] EmitContext& ctx, [[maybe_unused]] IR::Inst& inst,
[[maybe_unused]] const IR::Value& binding,
[[maybe_unused]] ScalarU32 offset, [[maybe_unused]] Register value) {
throw NotImplementedException("GLASM instruction");
}
void EmitStorageAtomicAnd32x2([[maybe_unused]] EmitContext& ctx, [[maybe_unused]] IR::Inst& inst,
[[maybe_unused]] const IR::Value& binding,
[[maybe_unused]] ScalarU32 offset, [[maybe_unused]] Register value) {
throw NotImplementedException("GLASM instruction");
}
void EmitStorageAtomicOr32x2([[maybe_unused]] EmitContext& ctx, [[maybe_unused]] IR::Inst& inst,
[[maybe_unused]] const IR::Value& binding,
[[maybe_unused]] ScalarU32 offset, [[maybe_unused]] Register value) {
throw NotImplementedException("GLASM instruction");
}
void EmitStorageAtomicXor32x2([[maybe_unused]] EmitContext& ctx, [[maybe_unused]] IR::Inst& inst,
[[maybe_unused]] const IR::Value& binding,
[[maybe_unused]] ScalarU32 offset, [[maybe_unused]] Register value) {
throw NotImplementedException("GLASM instruction");
}
void EmitStorageAtomicExchange32x2([[maybe_unused]] EmitContext& ctx,
[[maybe_unused]] IR::Inst& inst,
[[maybe_unused]] const IR::Value& binding,
[[maybe_unused]] ScalarU32 offset,
[[maybe_unused]] Register value) {
throw NotImplementedException("GLASM instruction");
}
void EmitStorageAtomicAddF32(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
ScalarU32 offset, ScalarF32 value) {
Atom(ctx, inst, binding, offset, value, "ADD", "F32");
@@ -537,6 +600,50 @@ void EmitGlobalAtomicExchange64(EmitContext&) {
throw NotImplementedException("GLASM instruction");
}
void EmitGlobalAtomicIAdd32x2(EmitContext&) {
throw NotImplementedException("GLASM instruction");
}
void EmitGlobalAtomicSMin32x2(EmitContext&) {
throw NotImplementedException("GLASM instruction");
}
void EmitGlobalAtomicUMin32x2(EmitContext&) {
throw NotImplementedException("GLASM instruction");
}
void EmitGlobalAtomicSMax32x2(EmitContext&) {
throw NotImplementedException("GLASM instruction");
}
void EmitGlobalAtomicUMax32x2(EmitContext&) {
throw NotImplementedException("GLASM instruction");
}
void EmitGlobalAtomicInc32x2(EmitContext&) {
throw NotImplementedException("GLASM instruction");
}
void EmitGlobalAtomicDec32x2(EmitContext&) {
throw NotImplementedException("GLASM instruction");
}
void EmitGlobalAtomicAnd32x2(EmitContext&) {
throw NotImplementedException("GLASM instruction");
}
void EmitGlobalAtomicOr32x2(EmitContext&) {
throw NotImplementedException("GLASM instruction");
}
void EmitGlobalAtomicXor32x2(EmitContext&) {
throw NotImplementedException("GLASM instruction");
}
void EmitGlobalAtomicExchange32x2(EmitContext&) {
throw NotImplementedException("GLASM instruction");
}
void EmitGlobalAtomicAddF32(EmitContext&) {
throw NotImplementedException("GLASM instruction");
}

View File

@@ -105,6 +105,13 @@ void EmitSharedAtomicExchange64(EmitContext& ctx, IR::Inst& inst, std::string_vi
pointer_offset, value, pointer_offset, value);
}
void EmitSharedAtomicExchange32x2(EmitContext& ctx, IR::Inst& inst, std::string_view pointer_offset,
std::string_view value) {
LOG_WARNING(Shader_GLSL, "Int64 atomics not supported, fallback to non-atomic");
ctx.AddU32x2("{}=uvec2(smem[{}>>2],smem[({}+4)>>2]);", inst, pointer_offset, pointer_offset);
ctx.Add("smem[{}>>2]={}.x;smem[({}+4)>>2]={}.y;", pointer_offset, value, pointer_offset, value);
}
void EmitStorageAtomicIAdd32(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
const IR::Value& offset, std::string_view value) {
ctx.AddU32("{}=atomicAdd({}_ssbo{}[{}>>2],{});", inst, ctx.stage_name, binding.U32(),
@@ -265,6 +272,97 @@ void EmitStorageAtomicExchange64(EmitContext& ctx, IR::Inst& inst, const IR::Val
ctx.stage_name, binding.U32(), ctx.var_alloc.Consume(offset), value);
}
void EmitStorageAtomicIAdd32x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
const IR::Value& offset, std::string_view value) {
LOG_WARNING(Shader_GLSL, "Int64 atomics not supported, fallback to non-atomic");
ctx.AddU32x2("{}=uvec2({}_ssbo{}[{}>>2],{}_ssbo{}[({}>>2)+1]);", inst, ctx.stage_name,
binding.U32(), ctx.var_alloc.Consume(offset), ctx.stage_name, binding.U32(),
ctx.var_alloc.Consume(offset));
ctx.Add("{}_ssbo{}[{}>>2]+={}.x;{}_ssbo{}[({}>>2)+1]+={}.y;", ctx.stage_name, binding.U32(),
ctx.var_alloc.Consume(offset), value, ctx.stage_name, binding.U32(),
ctx.var_alloc.Consume(offset), value);
}
void EmitStorageAtomicSMin32x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
const IR::Value& offset, std::string_view value) {
LOG_WARNING(Shader_GLSL, "Int64 atomics not supported, fallback to non-atomic");
ctx.AddU32x2("{}=ivec2({}_ssbo{}[{}>>2],{}_ssbo{}[({}>>2)+1]);", inst, ctx.stage_name,
binding.U32(), ctx.var_alloc.Consume(offset), ctx.stage_name, binding.U32(),
ctx.var_alloc.Consume(offset));
ctx.Add("for(int "
"i=0;i<2;++i){{{}_ssbo{}[({}>>2)+i]=uint(min(int({}_ssbo{}[({}>>2)+i]),int({}[i])));}}",
ctx.stage_name, binding.U32(), ctx.var_alloc.Consume(offset), ctx.stage_name,
binding.U32(), ctx.var_alloc.Consume(offset), value);
}
void EmitStorageAtomicUMin32x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
const IR::Value& offset, std::string_view value) {
LOG_WARNING(Shader_GLSL, "Int64 atomics not supported, fallback to non-atomic");
ctx.AddU32x2("{}=uvec2({}_ssbo{}[{}>>2],{}_ssbo{}[({}>>2)+1]);", inst, ctx.stage_name,
binding.U32(), ctx.var_alloc.Consume(offset), ctx.stage_name, binding.U32(),
ctx.var_alloc.Consume(offset));
ctx.Add("for(int i=0;i<2;++i){{ "
"{}_ssbo{}[({}>>2)+i]=min({}_ssbo{}[({}>>2)+i],{}[i]);}}",
ctx.stage_name, binding.U32(), ctx.var_alloc.Consume(offset), ctx.stage_name,
binding.U32(), ctx.var_alloc.Consume(offset), value);
}
void EmitStorageAtomicSMax32x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
const IR::Value& offset, std::string_view value) {
LOG_WARNING(Shader_GLSL, "Int64 atomics not supported, fallback to non-atomic");
ctx.AddU32x2("{}=ivec2({}_ssbo{}[{}>>2],{}_ssbo{}[({}>>2)+1]);", inst, ctx.stage_name,
binding.U32(), ctx.var_alloc.Consume(offset), ctx.stage_name, binding.U32(),
ctx.var_alloc.Consume(offset));
ctx.Add("for(int "
"i=0;i<2;++i){{{}_ssbo{}[({}>>2)+i]=uint(max(int({}_ssbo{}[({}>>2)+i]),int({}[i])));}}",
ctx.stage_name, binding.U32(), ctx.var_alloc.Consume(offset), ctx.stage_name,
binding.U32(), ctx.var_alloc.Consume(offset), value);
}
void EmitStorageAtomicUMax32x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
const IR::Value& offset, std::string_view value) {
LOG_WARNING(Shader_GLSL, "Int64 atomics not supported, fallback to non-atomic");
ctx.AddU32x2("{}=uvec2({}_ssbo{}[{}>>2],{}_ssbo{}[({}>>2)+1]);", inst, ctx.stage_name,
binding.U32(), ctx.var_alloc.Consume(offset), ctx.stage_name, binding.U32(),
ctx.var_alloc.Consume(offset));
ctx.Add("for(int i=0;i<2;++i){{{}_ssbo{}[({}>>2)+i]=max({}_ssbo{}[({}>>2)+i],{}[i]);}}",
ctx.stage_name, binding.U32(), ctx.var_alloc.Consume(offset), ctx.stage_name,
binding.U32(), ctx.var_alloc.Consume(offset), value);
}
void EmitStorageAtomicAnd32x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
const IR::Value& offset, std::string_view value) {
LOG_WARNING(Shader_GLSL, "Int64 atomics not supported, fallback to 32x2");
ctx.AddU32x2("{}=uvec2(atomicAnd({}_ssbo{}[{}>>2],{}.x),atomicAnd({}_ssbo{}[({}>>2)+1],{}.y));",
inst, ctx.stage_name, binding.U32(), ctx.var_alloc.Consume(offset), value,
ctx.stage_name, binding.U32(), ctx.var_alloc.Consume(offset), value);
}
void EmitStorageAtomicOr32x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
const IR::Value& offset, std::string_view value) {
LOG_WARNING(Shader_GLSL, "Int64 atomics not supported, fallback to 32x2");
ctx.AddU32x2("{}=uvec2(atomicOr({}_ssbo{}[{}>>2],{}.x),atomicOr({}_ssbo{}[({}>>2)+1],{}.y));",
inst, ctx.stage_name, binding.U32(), ctx.var_alloc.Consume(offset), value,
ctx.stage_name, binding.U32(), ctx.var_alloc.Consume(offset), value);
}
void EmitStorageAtomicXor32x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
const IR::Value& offset, std::string_view value) {
LOG_WARNING(Shader_GLSL, "Int64 atomics not supported, fallback to 32x2");
ctx.AddU32x2("{}=uvec2(atomicXor({}_ssbo{}[{}>>2],{}.x),atomicXor({}_ssbo{}[({}>>2)+1],{}.y));",
inst, ctx.stage_name, binding.U32(), ctx.var_alloc.Consume(offset), value,
ctx.stage_name, binding.U32(), ctx.var_alloc.Consume(offset), value);
}
void EmitStorageAtomicExchange32x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
const IR::Value& offset, std::string_view value) {
LOG_WARNING(Shader_GLSL, "Int64 atomics not supported, fallback to 32x2");
ctx.AddU32x2("{}=uvec2(atomicExchange({}_ssbo{}[{}>>2],{}.x),atomicExchange({}_ssbo{}[({}>>2)+"
"1],{}.y));",
inst, ctx.stage_name, binding.U32(), ctx.var_alloc.Consume(offset), value,
ctx.stage_name, binding.U32(), ctx.var_alloc.Consume(offset), value);
}
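Note the distinction from the min/max paths above: the bitwise and exchange fallbacks still issue a real atomic op on each 32-bit word, so only atomicity across the pair is lost. A CPU-side analogue of the exchange case (a sketch; std::atomic stands in for the GLSL atomics):
#include <atomic>
#include <cstdint>
#include <utility>
// Sketch: exchange a 64-bit value as two independent 32-bit atomic
// exchanges. Each word is exchanged atomically; the pair as a whole is not.
std::pair<uint32_t, uint32_t> Exchange32x2(std::atomic<uint32_t>* words, uint32_t lo, uint32_t hi) {
    return {words[0].exchange(lo), words[1].exchange(hi)};
}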
void EmitStorageAtomicAddF32(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
const IR::Value& offset, std::string_view value) {
SsboCasFunctionF32(ctx, inst, binding, offset, value, "CasFloatAdd");
@@ -388,6 +486,50 @@ void EmitGlobalAtomicExchange64(EmitContext&) {
throw NotImplementedException("GLSL Instruction");
}
void EmitGlobalAtomicIAdd32x2(EmitContext&) {
throw NotImplementedException("GLSL Instruction");
}
void EmitGlobalAtomicSMin32x2(EmitContext&) {
throw NotImplementedException("GLSL Instruction");
}
void EmitGlobalAtomicUMin32x2(EmitContext&) {
throw NotImplementedException("GLSL Instruction");
}
void EmitGlobalAtomicSMax32x2(EmitContext&) {
throw NotImplementedException("GLSL Instruction");
}
void EmitGlobalAtomicUMax32x2(EmitContext&) {
throw NotImplementedException("GLSL Instruction");
}
void EmitGlobalAtomicInc32x2(EmitContext&) {
throw NotImplementedException("GLSL Instruction");
}
void EmitGlobalAtomicDec32x2(EmitContext&) {
throw NotImplementedException("GLSL Instruction");
}
void EmitGlobalAtomicAnd32x2(EmitContext&) {
throw NotImplementedException("GLSL Instruction");
}
void EmitGlobalAtomicOr32x2(EmitContext&) {
throw NotImplementedException("GLSL Instruction");
}
void EmitGlobalAtomicXor32x2(EmitContext&) {
throw NotImplementedException("GLSL Instruction");
}
void EmitGlobalAtomicExchange32x2(EmitContext&) {
throw NotImplementedException("GLSL Instruction");
}
void EmitGlobalAtomicAddF32(EmitContext&) {
throw NotImplementedException("GLSL Instruction");
}


@@ -442,6 +442,8 @@ void EmitSharedAtomicExchange32(EmitContext& ctx, IR::Inst& inst, std::string_vi
std::string_view value);
void EmitSharedAtomicExchange64(EmitContext& ctx, IR::Inst& inst, std::string_view pointer_offset,
std::string_view value);
void EmitSharedAtomicExchange32x2(EmitContext& ctx, IR::Inst& inst, std::string_view pointer_offset,
std::string_view value);
void EmitStorageAtomicIAdd32(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
const IR::Value& offset, std::string_view value);
void EmitStorageAtomicSMin32(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
@@ -482,6 +484,24 @@ void EmitStorageAtomicXor64(EmitContext& ctx, IR::Inst& inst, const IR::Value& b
const IR::Value& offset, std::string_view value);
void EmitStorageAtomicExchange64(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
const IR::Value& offset, std::string_view value);
void EmitStorageAtomicIAdd32x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
const IR::Value& offset, std::string_view value);
void EmitStorageAtomicSMin32x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
const IR::Value& offset, std::string_view value);
void EmitStorageAtomicUMin32x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
const IR::Value& offset, std::string_view value);
void EmitStorageAtomicSMax32x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
const IR::Value& offset, std::string_view value);
void EmitStorageAtomicUMax32x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
const IR::Value& offset, std::string_view value);
void EmitStorageAtomicAnd32x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
const IR::Value& offset, std::string_view value);
void EmitStorageAtomicOr32x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
const IR::Value& offset, std::string_view value);
void EmitStorageAtomicXor32x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
const IR::Value& offset, std::string_view value);
void EmitStorageAtomicExchange32x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
const IR::Value& offset, std::string_view value);
void EmitStorageAtomicAddF32(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
const IR::Value& offset, std::string_view value);
void EmitStorageAtomicAddF16x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
@@ -518,6 +538,17 @@ void EmitGlobalAtomicAnd64(EmitContext& ctx);
void EmitGlobalAtomicOr64(EmitContext& ctx);
void EmitGlobalAtomicXor64(EmitContext& ctx);
void EmitGlobalAtomicExchange64(EmitContext& ctx);
void EmitGlobalAtomicIAdd32x2(EmitContext& ctx);
void EmitGlobalAtomicSMin32x2(EmitContext& ctx);
void EmitGlobalAtomicUMin32x2(EmitContext& ctx);
void EmitGlobalAtomicSMax32x2(EmitContext& ctx);
void EmitGlobalAtomicUMax32x2(EmitContext& ctx);
void EmitGlobalAtomicInc32x2(EmitContext& ctx);
void EmitGlobalAtomicDec32x2(EmitContext& ctx);
void EmitGlobalAtomicAnd32x2(EmitContext& ctx);
void EmitGlobalAtomicOr32x2(EmitContext& ctx);
void EmitGlobalAtomicXor32x2(EmitContext& ctx);
void EmitGlobalAtomicExchange32x2(EmitContext& ctx);
void EmitGlobalAtomicAddF32(EmitContext& ctx);
void EmitGlobalAtomicAddF16x2(EmitContext& ctx);
void EmitGlobalAtomicAddF32x2(EmitContext& ctx);


@@ -387,6 +387,14 @@ void SetupSignedNanCapabilities(const Profile& profile, const IR::Program& progr
}
}
void SetupTransformFeedbackCapabilities(EmitContext& ctx, Id main_func) {
if (ctx.runtime_info.xfb_varyings.empty()) {
return;
}
ctx.AddCapability(spv::Capability::TransformFeedback);
ctx.AddExecutionMode(main_func, spv::ExecutionMode::Xfb);
}
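With this split, any shader that records transform feedback varyings gets both declarations emitted together; the assembled module carries roughly the following (illustrative SPIR-V, %main being the entry point):
OpCapability TransformFeedback
OpExecutionMode %main Xfb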
void SetupCapabilities(const Profile& profile, const Info& info, EmitContext& ctx) {
if (info.uses_sampled_1d) {
ctx.AddCapability(spv::Capability::Sampled1D);
@@ -442,9 +450,6 @@ void SetupCapabilities(const Profile& profile, const Info& info, EmitContext& ct
if (info.uses_sample_id) {
ctx.AddCapability(spv::Capability::SampleRateShading);
}
if (!ctx.runtime_info.xfb_varyings.empty()) {
ctx.AddCapability(spv::Capability::TransformFeedback);
}
if (info.uses_derivatives) {
ctx.AddCapability(spv::Capability::DerivativeControl);
}
@@ -484,6 +489,7 @@ std::vector<u32> EmitSPIRV(const Profile& profile, const RuntimeInfo& runtime_in
SetupSignedNanCapabilities(profile, program, ctx, main);
}
SetupCapabilities(profile, program.info, ctx);
SetupTransformFeedbackCapabilities(ctx, main);
PatchPhiNodes(program, ctx);
return ctx.Assemble();
}


@@ -74,7 +74,7 @@ Id StorageAtomicU64(EmitContext& ctx, const IR::Value& binding, const IR::Value&
const auto [scope, semantics]{AtomicArgs(ctx)};
return (ctx.*atomic_func)(ctx.U64, pointer, scope, semantics, value);
}
LOG_ERROR(Shader_SPIRV, "Int64 atomics not supported, fallback to non-atomic");
LOG_WARNING(Shader_SPIRV, "Int64 atomics not supported, fallback to non-atomic");
const Id pointer{StoragePointer(ctx, ctx.storage_types.U32x2, &StorageDefinitions::U32x2,
binding, offset, sizeof(u32[2]))};
const Id original_value{ctx.OpBitcast(ctx.U64, ctx.OpLoad(ctx.U32[2], pointer))};
@@ -82,6 +82,17 @@ Id StorageAtomicU64(EmitContext& ctx, const IR::Value& binding, const IR::Value&
ctx.OpStore(pointer, ctx.OpBitcast(ctx.U32[2], result));
return original_value;
}
Id StorageAtomicU32x2(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset, Id value,
Id (Sirit::Module::*non_atomic_func)(Id, Id, Id)) {
LOG_WARNING(Shader_SPIRV, "Int64 atomics not supported, fallback to non-atomic");
const Id pointer{StoragePointer(ctx, ctx.storage_types.U32x2, &StorageDefinitions::U32x2,
binding, offset, sizeof(u32[2]))};
const Id original_value{ctx.OpLoad(ctx.U32[2], pointer)};
const Id result{(ctx.*non_atomic_func)(ctx.U32[2], value, original_value)};
ctx.OpStore(pointer, result);
return original_value;
}
} // Anonymous namespace
Id EmitSharedAtomicIAdd32(EmitContext& ctx, Id offset, Id value) {
@@ -141,7 +152,7 @@ Id EmitSharedAtomicExchange64(EmitContext& ctx, Id offset, Id value) {
const auto [scope, semantics]{AtomicArgs(ctx)};
return ctx.OpAtomicExchange(ctx.U64, pointer, scope, semantics, value);
}
LOG_ERROR(Shader_SPIRV, "Int64 atomics not supported, fallback to non-atomic");
LOG_WARNING(Shader_SPIRV, "Int64 atomics not supported, fallback to non-atomic");
const Id pointer_1{SharedPointer(ctx, offset, 0)};
const Id pointer_2{SharedPointer(ctx, offset, 1)};
const Id value_1{ctx.OpLoad(ctx.U32[1], pointer_1)};
@@ -152,6 +163,18 @@ Id EmitSharedAtomicExchange64(EmitContext& ctx, Id offset, Id value) {
return ctx.OpBitcast(ctx.U64, ctx.OpCompositeConstruct(ctx.U32[2], value_1, value_2));
}
Id EmitSharedAtomicExchange32x2(EmitContext& ctx, Id offset, Id value) {
LOG_WARNING(Shader_SPIRV, "Int64 atomics not supported, fallback to non-atomic");
const Id pointer_1{SharedPointer(ctx, offset, 0)};
const Id pointer_2{SharedPointer(ctx, offset, 1)};
const Id value_1{ctx.OpLoad(ctx.U32[1], pointer_1)};
const Id value_2{ctx.OpLoad(ctx.U32[1], pointer_2)};
const Id new_vector{ctx.OpBitcast(ctx.U32[2], value)};
ctx.OpStore(pointer_1, ctx.OpCompositeExtract(ctx.U32[1], new_vector, 0U));
ctx.OpStore(pointer_2, ctx.OpCompositeExtract(ctx.U32[1], new_vector, 1U));
return ctx.OpCompositeConstruct(ctx.U32[2], value_1, value_2);
}
Id EmitStorageAtomicIAdd32(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset,
Id value) {
return StorageAtomicU32(ctx, binding, offset, value, &Sirit::Module::OpAtomicIAdd);
@@ -267,7 +290,7 @@ Id EmitStorageAtomicExchange64(EmitContext& ctx, const IR::Value& binding, const
const auto [scope, semantics]{AtomicArgs(ctx)};
return ctx.OpAtomicExchange(ctx.U64, pointer, scope, semantics, value);
}
LOG_ERROR(Shader_SPIRV, "Int64 atomics not supported, fallback to non-atomic");
LOG_WARNING(Shader_SPIRV, "Int64 atomics not supported, fallback to non-atomic");
const Id pointer{StoragePointer(ctx, ctx.storage_types.U32x2, &StorageDefinitions::U32x2,
binding, offset, sizeof(u32[2]))};
const Id original{ctx.OpBitcast(ctx.U64, ctx.OpLoad(ctx.U32[2], pointer))};
@@ -275,6 +298,56 @@ Id EmitStorageAtomicExchange64(EmitContext& ctx, const IR::Value& binding, const
return original;
}
Id EmitStorageAtomicIAdd32x2(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset,
Id value) {
return StorageAtomicU32x2(ctx, binding, offset, value, &Sirit::Module::OpIAdd);
}
Id EmitStorageAtomicSMin32x2(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset,
Id value) {
return StorageAtomicU32x2(ctx, binding, offset, value, &Sirit::Module::OpSMin);
}
Id EmitStorageAtomicUMin32x2(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset,
Id value) {
return StorageAtomicU32x2(ctx, binding, offset, value, &Sirit::Module::OpUMin);
}
Id EmitStorageAtomicSMax32x2(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset,
Id value) {
return StorageAtomicU32x2(ctx, binding, offset, value, &Sirit::Module::OpSMax);
}
Id EmitStorageAtomicUMax32x2(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset,
Id value) {
return StorageAtomicU32x2(ctx, binding, offset, value, &Sirit::Module::OpUMax);
}
Id EmitStorageAtomicAnd32x2(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset,
Id value) {
return StorageAtomicU32x2(ctx, binding, offset, value, &Sirit::Module::OpBitwiseAnd);
}
Id EmitStorageAtomicOr32x2(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset,
Id value) {
return StorageAtomicU32x2(ctx, binding, offset, value, &Sirit::Module::OpBitwiseOr);
}
Id EmitStorageAtomicXor32x2(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset,
Id value) {
return StorageAtomicU32x2(ctx, binding, offset, value, &Sirit::Module::OpBitwiseXor);
}
Id EmitStorageAtomicExchange32x2(EmitContext& ctx, const IR::Value& binding,
const IR::Value& offset, Id value) {
LOG_WARNING(Shader_SPIRV, "Int64 atomics not supported, fallback to non-atomic");
const Id pointer{StoragePointer(ctx, ctx.storage_types.U32x2, &StorageDefinitions::U32x2,
binding, offset, sizeof(u32[2]))};
const Id original{ctx.OpLoad(ctx.U32[2], pointer)};
ctx.OpStore(pointer, value);
return original;
}
Id EmitStorageAtomicAddF32(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset,
Id value) {
const Id ssbo{ctx.ssbos[binding.U32()].U32};
@@ -418,6 +491,50 @@ Id EmitGlobalAtomicExchange64(EmitContext&) {
throw NotImplementedException("SPIR-V Instruction");
}
Id EmitGlobalAtomicIAdd32x2(EmitContext&) {
throw NotImplementedException("SPIR-V Instruction");
}
Id EmitGlobalAtomicSMin32x2(EmitContext&) {
throw NotImplementedException("SPIR-V Instruction");
}
Id EmitGlobalAtomicUMin32x2(EmitContext&) {
throw NotImplementedException("SPIR-V Instruction");
}
Id EmitGlobalAtomicSMax32x2(EmitContext&) {
throw NotImplementedException("SPIR-V Instruction");
}
Id EmitGlobalAtomicUMax32x2(EmitContext&) {
throw NotImplementedException("SPIR-V Instruction");
}
Id EmitGlobalAtomicInc32x2(EmitContext&) {
throw NotImplementedException("SPIR-V Instruction");
}
Id EmitGlobalAtomicDec32x2(EmitContext&) {
throw NotImplementedException("SPIR-V Instruction");
}
Id EmitGlobalAtomicAnd32x2(EmitContext&) {
throw NotImplementedException("SPIR-V Instruction");
}
Id EmitGlobalAtomicOr32x2(EmitContext&) {
throw NotImplementedException("SPIR-V Instruction");
}
Id EmitGlobalAtomicXor32x2(EmitContext&) {
throw NotImplementedException("SPIR-V Instruction");
}
Id EmitGlobalAtomicExchange32x2(EmitContext&) {
throw NotImplementedException("SPIR-V Instruction");
}
Id EmitGlobalAtomicAddF32(EmitContext&) {
throw NotImplementedException("SPIR-V Instruction");
}


@@ -335,6 +335,7 @@ Id EmitSharedAtomicOr32(EmitContext& ctx, Id pointer_offset, Id value);
Id EmitSharedAtomicXor32(EmitContext& ctx, Id pointer_offset, Id value);
Id EmitSharedAtomicExchange32(EmitContext& ctx, Id pointer_offset, Id value);
Id EmitSharedAtomicExchange64(EmitContext& ctx, Id pointer_offset, Id value);
Id EmitSharedAtomicExchange32x2(EmitContext& ctx, Id pointer_offset, Id value);
Id EmitStorageAtomicIAdd32(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset,
Id value);
Id EmitStorageAtomicSMin32(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset,
@@ -375,6 +376,24 @@ Id EmitStorageAtomicXor64(EmitContext& ctx, const IR::Value& binding, const IR::
Id value);
Id EmitStorageAtomicExchange64(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset,
Id value);
Id EmitStorageAtomicIAdd32x2(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset,
Id value);
Id EmitStorageAtomicSMin32x2(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset,
Id value);
Id EmitStorageAtomicUMin32x2(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset,
Id value);
Id EmitStorageAtomicSMax32x2(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset,
Id value);
Id EmitStorageAtomicUMax32x2(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset,
Id value);
Id EmitStorageAtomicAnd32x2(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset,
Id value);
Id EmitStorageAtomicOr32x2(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset,
Id value);
Id EmitStorageAtomicXor32x2(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset,
Id value);
Id EmitStorageAtomicExchange32x2(EmitContext& ctx, const IR::Value& binding,
const IR::Value& offset, Id value);
Id EmitStorageAtomicAddF32(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset,
Id value);
Id EmitStorageAtomicAddF16x2(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset,
@@ -411,6 +430,17 @@ Id EmitGlobalAtomicAnd64(EmitContext& ctx);
Id EmitGlobalAtomicOr64(EmitContext& ctx);
Id EmitGlobalAtomicXor64(EmitContext& ctx);
Id EmitGlobalAtomicExchange64(EmitContext& ctx);
Id EmitGlobalAtomicIAdd32x2(EmitContext& ctx);
Id EmitGlobalAtomicSMin32x2(EmitContext& ctx);
Id EmitGlobalAtomicUMin32x2(EmitContext& ctx);
Id EmitGlobalAtomicSMax32x2(EmitContext& ctx);
Id EmitGlobalAtomicUMax32x2(EmitContext& ctx);
Id EmitGlobalAtomicInc32x2(EmitContext& ctx);
Id EmitGlobalAtomicDec32x2(EmitContext& ctx);
Id EmitGlobalAtomicAnd32x2(EmitContext& ctx);
Id EmitGlobalAtomicOr32x2(EmitContext& ctx);
Id EmitGlobalAtomicXor32x2(EmitContext& ctx);
Id EmitGlobalAtomicExchange32x2(EmitContext& ctx);
Id EmitGlobalAtomicAddF32(EmitContext& ctx);
Id EmitGlobalAtomicAddF16x2(EmitContext& ctx);
Id EmitGlobalAtomicAddF32x2(EmitContext& ctx);


@@ -118,6 +118,7 @@ bool Inst::MayHaveSideEffects() const noexcept {
case Opcode::SharedAtomicXor32:
case Opcode::SharedAtomicExchange32:
case Opcode::SharedAtomicExchange64:
case Opcode::SharedAtomicExchange32x2:
case Opcode::GlobalAtomicIAdd32:
case Opcode::GlobalAtomicSMin32:
case Opcode::GlobalAtomicUMin32:
@@ -138,6 +139,15 @@ bool Inst::MayHaveSideEffects() const noexcept {
case Opcode::GlobalAtomicOr64:
case Opcode::GlobalAtomicXor64:
case Opcode::GlobalAtomicExchange64:
case Opcode::GlobalAtomicIAdd32x2:
case Opcode::GlobalAtomicSMin32x2:
case Opcode::GlobalAtomicUMin32x2:
case Opcode::GlobalAtomicSMax32x2:
case Opcode::GlobalAtomicUMax32x2:
case Opcode::GlobalAtomicAnd32x2:
case Opcode::GlobalAtomicOr32x2:
case Opcode::GlobalAtomicXor32x2:
case Opcode::GlobalAtomicExchange32x2:
case Opcode::GlobalAtomicAddF32:
case Opcode::GlobalAtomicAddF16x2:
case Opcode::GlobalAtomicAddF32x2:
@@ -165,6 +175,15 @@ bool Inst::MayHaveSideEffects() const noexcept {
case Opcode::StorageAtomicOr64:
case Opcode::StorageAtomicXor64:
case Opcode::StorageAtomicExchange64:
case Opcode::StorageAtomicIAdd32x2:
case Opcode::StorageAtomicSMin32x2:
case Opcode::StorageAtomicUMin32x2:
case Opcode::StorageAtomicSMax32x2:
case Opcode::StorageAtomicUMax32x2:
case Opcode::StorageAtomicAnd32x2:
case Opcode::StorageAtomicOr32x2:
case Opcode::StorageAtomicXor32x2:
case Opcode::StorageAtomicExchange32x2:
case Opcode::StorageAtomicAddF32:
case Opcode::StorageAtomicAddF16x2:
case Opcode::StorageAtomicAddF32x2:


@@ -341,6 +341,7 @@ OPCODE(SharedAtomicOr32, U32, U32,
OPCODE(SharedAtomicXor32, U32, U32, U32, )
OPCODE(SharedAtomicExchange32, U32, U32, U32, )
OPCODE(SharedAtomicExchange64, U64, U32, U64, )
OPCODE(SharedAtomicExchange32x2, U32x2, U32, U32x2, )
OPCODE(GlobalAtomicIAdd32, U32, U64, U32, )
OPCODE(GlobalAtomicSMin32, U32, U64, U32, )
@@ -362,6 +363,15 @@ OPCODE(GlobalAtomicAnd64, U64, U64,
OPCODE(GlobalAtomicOr64, U64, U64, U64, )
OPCODE(GlobalAtomicXor64, U64, U64, U64, )
OPCODE(GlobalAtomicExchange64, U64, U64, U64, )
OPCODE(GlobalAtomicIAdd32x2, U32x2, U32x2, U32x2, )
OPCODE(GlobalAtomicSMin32x2, U32x2, U32x2, U32x2, )
OPCODE(GlobalAtomicUMin32x2, U32x2, U32x2, U32x2, )
OPCODE(GlobalAtomicSMax32x2, U32x2, U32x2, U32x2, )
OPCODE(GlobalAtomicUMax32x2, U32x2, U32x2, U32x2, )
OPCODE(GlobalAtomicAnd32x2, U32x2, U32x2, U32x2, )
OPCODE(GlobalAtomicOr32x2, U32x2, U32x2, U32x2, )
OPCODE(GlobalAtomicXor32x2, U32x2, U32x2, U32x2, )
OPCODE(GlobalAtomicExchange32x2, U32x2, U32x2, U32x2, )
OPCODE(GlobalAtomicAddF32, F32, U64, F32, )
OPCODE(GlobalAtomicAddF16x2, U32, U64, F16x2, )
OPCODE(GlobalAtomicAddF32x2, U32, U64, F32x2, )
@@ -390,6 +400,15 @@ OPCODE(StorageAtomicAnd64, U64, U32,
OPCODE(StorageAtomicOr64, U64, U32, U32, U64, )
OPCODE(StorageAtomicXor64, U64, U32, U32, U64, )
OPCODE(StorageAtomicExchange64, U64, U32, U32, U64, )
OPCODE(StorageAtomicIAdd32x2, U32x2, U32, U32, U32x2, )
OPCODE(StorageAtomicSMin32x2, U32x2, U32, U32, U32x2, )
OPCODE(StorageAtomicUMin32x2, U32x2, U32, U32, U32x2, )
OPCODE(StorageAtomicSMax32x2, U32x2, U32, U32, U32x2, )
OPCODE(StorageAtomicUMax32x2, U32x2, U32, U32, U32x2, )
OPCODE(StorageAtomicAnd32x2, U32x2, U32, U32, U32x2, )
OPCODE(StorageAtomicOr32x2, U32x2, U32, U32, U32x2, )
OPCODE(StorageAtomicXor32x2, U32x2, U32, U32, U32x2, )
OPCODE(StorageAtomicExchange32x2, U32x2, U32, U32, U32x2, )
OPCODE(StorageAtomicAddF32, F32, U32, U32, F32, )
OPCODE(StorageAtomicAddF16x2, U32, U32, U32, F16x2, )
OPCODE(StorageAtomicAddF32x2, U32, U32, U32, F32x2, )


@@ -57,16 +57,6 @@ void TranslatorVisitor::VMNMX(u64 insn) {
if (vmnmx.sat != 0) {
throw NotImplementedException("VMNMX SAT");
}
// Selectors were shown to default to 2 in unit tests
if (vmnmx.src_a_selector != 2) {
throw NotImplementedException("VMNMX Selector {}", vmnmx.src_a_selector.Value());
}
if (vmnmx.src_b_selector != 2) {
throw NotImplementedException("VMNMX Selector {}", vmnmx.src_b_selector.Value());
}
if (vmnmx.src_a_width != VideoWidth::Word) {
throw NotImplementedException("VMNMX Source Width {}", vmnmx.src_a_width.Value());
}
const bool is_b_imm{vmnmx.is_src_b_reg == 0};
const IR::U32 src_a{GetReg8(insn)};
@@ -76,10 +66,14 @@ void TranslatorVisitor::VMNMX(u64 insn) {
const VideoWidth a_width{vmnmx.src_a_width};
const VideoWidth b_width{GetVideoSourceWidth(vmnmx.src_b_width, is_b_imm)};
const u32 a_selector{static_cast<u32>(vmnmx.src_a_selector)};
// Immediate values can't have a selector
const u32 b_selector{is_b_imm ? 0U : static_cast<u32>(vmnmx.src_b_selector)};
const bool src_a_signed{vmnmx.src_a_sign != 0};
const bool src_b_signed{vmnmx.src_b_sign != 0};
const IR::U32 op_a{ExtractVideoOperandValue(ir, src_a, a_width, 0, src_a_signed)};
const IR::U32 op_b{ExtractVideoOperandValue(ir, src_b, b_width, 0, src_b_signed)};
const IR::U32 op_a{ExtractVideoOperandValue(ir, src_a, a_width, a_selector, src_a_signed)};
const IR::U32 op_b{ExtractVideoOperandValue(ir, src_b, b_width, b_selector, src_b_signed)};
// First operation's sign is only dependent on operand b's sign
const bool op_1_signed{src_b_signed};
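The selector chooses which lane of the source register feeds the operation. A minimal sketch of lane extraction for byte-width operands; the lane encoding here is an assumption, and the real handling lives in ExtractVideoOperandValue:
#include <cstdint>
// Sketch: pick byte lane `selector` (0-3) from a 32-bit register and
// sign- or zero-extend it to 32 bits (lane encoding assumed).
uint32_t ExtractByteLane(uint32_t reg, uint32_t selector, bool is_signed) {
    const uint32_t lane = (reg >> (selector * 8)) & 0xFF;
    return is_signed ? static_cast<uint32_t>(static_cast<int32_t>(static_cast<int8_t>(lane))) : lane;
}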


@@ -21,7 +21,6 @@ namespace Shader::Maxwell {
[[nodiscard]] IR::Program MergeDualVertexPrograms(IR::Program& vertex_a, IR::Program& vertex_b,
Environment& env_vertex_b);
[[nodiscard]] void ConvertLegacyToGeneric(IR::Program& program,
const Shader::RuntimeInfo& runtime_info);
void ConvertLegacyToGeneric(IR::Program& program, const RuntimeInfo& runtime_info);
} // namespace Shader::Maxwell


@@ -360,6 +360,15 @@ void VisitUsages(Info& info, IR::Inst& inst) {
case IR::Opcode::GlobalAtomicOr64:
case IR::Opcode::GlobalAtomicXor64:
case IR::Opcode::GlobalAtomicExchange64:
case IR::Opcode::GlobalAtomicIAdd32x2:
case IR::Opcode::GlobalAtomicSMin32x2:
case IR::Opcode::GlobalAtomicUMin32x2:
case IR::Opcode::GlobalAtomicSMax32x2:
case IR::Opcode::GlobalAtomicUMax32x2:
case IR::Opcode::GlobalAtomicAnd32x2:
case IR::Opcode::GlobalAtomicOr32x2:
case IR::Opcode::GlobalAtomicXor32x2:
case IR::Opcode::GlobalAtomicExchange32x2:
case IR::Opcode::GlobalAtomicAddF32:
case IR::Opcode::GlobalAtomicAddF16x2:
case IR::Opcode::GlobalAtomicAddF32x2:
@@ -597,6 +606,15 @@ void VisitUsages(Info& info, IR::Inst& inst) {
break;
case IR::Opcode::LoadStorage64:
case IR::Opcode::WriteStorage64:
case IR::Opcode::StorageAtomicIAdd32x2:
case IR::Opcode::StorageAtomicSMin32x2:
case IR::Opcode::StorageAtomicUMin32x2:
case IR::Opcode::StorageAtomicSMax32x2:
case IR::Opcode::StorageAtomicUMax32x2:
case IR::Opcode::StorageAtomicAnd32x2:
case IR::Opcode::StorageAtomicOr32x2:
case IR::Opcode::StorageAtomicXor32x2:
case IR::Opcode::StorageAtomicExchange32x2:
info.used_storage_buffer_types |= IR::Type::U32x2;
break;
case IR::Opcode::LoadStorage128:
@@ -688,7 +706,7 @@ void VisitUsages(Info& info, IR::Inst& inst) {
case IR::Opcode::StorageAtomicAnd64:
case IR::Opcode::StorageAtomicOr64:
case IR::Opcode::StorageAtomicXor64:
info.used_storage_buffer_types |= IR::Type::U64;
info.used_storage_buffer_types |= IR::Type::U64 | IR::Type::U32x2;
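// U32x2 must be marked as well: drivers without Int64 atomics lower these
// ops to non-atomic U32x2 accesses, which need U32x2 storage definitions.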
info.uses_int64_bit_atomics = true;
break;
case IR::Opcode::BindlessImageAtomicIAdd32:


@@ -92,6 +92,15 @@ bool IsGlobalMemory(const IR::Inst& inst) {
case IR::Opcode::GlobalAtomicOr64:
case IR::Opcode::GlobalAtomicXor64:
case IR::Opcode::GlobalAtomicExchange64:
case IR::Opcode::GlobalAtomicIAdd32x2:
case IR::Opcode::GlobalAtomicSMin32x2:
case IR::Opcode::GlobalAtomicUMin32x2:
case IR::Opcode::GlobalAtomicSMax32x2:
case IR::Opcode::GlobalAtomicUMax32x2:
case IR::Opcode::GlobalAtomicAnd32x2:
case IR::Opcode::GlobalAtomicOr32x2:
case IR::Opcode::GlobalAtomicXor32x2:
case IR::Opcode::GlobalAtomicExchange32x2:
case IR::Opcode::GlobalAtomicAddF32:
case IR::Opcode::GlobalAtomicAddF16x2:
case IR::Opcode::GlobalAtomicAddF32x2:
@@ -135,6 +144,15 @@ bool IsGlobalMemoryWrite(const IR::Inst& inst) {
case IR::Opcode::GlobalAtomicOr64:
case IR::Opcode::GlobalAtomicXor64:
case IR::Opcode::GlobalAtomicExchange64:
case IR::Opcode::GlobalAtomicIAdd32x2:
case IR::Opcode::GlobalAtomicSMin32x2:
case IR::Opcode::GlobalAtomicUMin32x2:
case IR::Opcode::GlobalAtomicSMax32x2:
case IR::Opcode::GlobalAtomicUMax32x2:
case IR::Opcode::GlobalAtomicAnd32x2:
case IR::Opcode::GlobalAtomicOr32x2:
case IR::Opcode::GlobalAtomicXor32x2:
case IR::Opcode::GlobalAtomicExchange32x2:
case IR::Opcode::GlobalAtomicAddF32:
case IR::Opcode::GlobalAtomicAddF16x2:
case IR::Opcode::GlobalAtomicAddF32x2:
@@ -199,6 +217,8 @@ IR::Opcode GlobalToStorage(IR::Opcode opcode) {
return IR::Opcode::StorageAtomicOr32;
case IR::Opcode::GlobalAtomicXor32:
return IR::Opcode::StorageAtomicXor32;
case IR::Opcode::GlobalAtomicExchange32:
return IR::Opcode::StorageAtomicExchange32;
case IR::Opcode::GlobalAtomicIAdd64:
return IR::Opcode::StorageAtomicIAdd64;
case IR::Opcode::GlobalAtomicSMin64:
@@ -215,10 +235,26 @@ IR::Opcode GlobalToStorage(IR::Opcode opcode) {
return IR::Opcode::StorageAtomicOr64;
case IR::Opcode::GlobalAtomicXor64:
return IR::Opcode::StorageAtomicXor64;
case IR::Opcode::GlobalAtomicExchange32:
return IR::Opcode::StorageAtomicExchange32;
case IR::Opcode::GlobalAtomicExchange64:
return IR::Opcode::StorageAtomicExchange64;
case IR::Opcode::GlobalAtomicIAdd32x2:
return IR::Opcode::StorageAtomicIAdd32x2;
case IR::Opcode::GlobalAtomicSMin32x2:
return IR::Opcode::StorageAtomicSMin32x2;
case IR::Opcode::GlobalAtomicUMin32x2:
return IR::Opcode::StorageAtomicUMin32x2;
case IR::Opcode::GlobalAtomicSMax32x2:
return IR::Opcode::StorageAtomicSMax32x2;
case IR::Opcode::GlobalAtomicUMax32x2:
return IR::Opcode::StorageAtomicUMax32x2;
case IR::Opcode::GlobalAtomicAnd32x2:
return IR::Opcode::StorageAtomicAnd32x2;
case IR::Opcode::GlobalAtomicOr32x2:
return IR::Opcode::StorageAtomicOr32x2;
case IR::Opcode::GlobalAtomicXor32x2:
return IR::Opcode::StorageAtomicXor32x2;
case IR::Opcode::GlobalAtomicExchange32x2:
return IR::Opcode::StorageAtomicExchange32x2;
case IR::Opcode::GlobalAtomicAddF32:
return IR::Opcode::StorageAtomicAddF32;
case IR::Opcode::GlobalAtomicAddF16x2:
@@ -454,6 +490,15 @@ void Replace(IR::Block& block, IR::Inst& inst, const IR::U32& storage_index,
case IR::Opcode::GlobalAtomicOr64:
case IR::Opcode::GlobalAtomicXor64:
case IR::Opcode::GlobalAtomicExchange64:
case IR::Opcode::GlobalAtomicIAdd32x2:
case IR::Opcode::GlobalAtomicSMin32x2:
case IR::Opcode::GlobalAtomicUMin32x2:
case IR::Opcode::GlobalAtomicSMax32x2:
case IR::Opcode::GlobalAtomicUMax32x2:
case IR::Opcode::GlobalAtomicAnd32x2:
case IR::Opcode::GlobalAtomicOr32x2:
case IR::Opcode::GlobalAtomicXor32x2:
case IR::Opcode::GlobalAtomicExchange32x2:
case IR::Opcode::GlobalAtomicAddF32:
case IR::Opcode::GlobalAtomicAddF16x2:
case IR::Opcode::GlobalAtomicAddF32x2:


@@ -199,6 +199,26 @@ void Lower(IR::Block& block, IR::Inst& inst) {
return ShiftRightLogical64To32(block, inst);
case IR::Opcode::ShiftRightArithmetic64:
return ShiftRightArithmetic64To32(block, inst);
case IR::Opcode::SharedAtomicExchange64:
return inst.ReplaceOpcode(IR::Opcode::SharedAtomicExchange32x2);
case IR::Opcode::GlobalAtomicIAdd64:
return inst.ReplaceOpcode(IR::Opcode::GlobalAtomicIAdd32x2);
case IR::Opcode::GlobalAtomicSMin64:
return inst.ReplaceOpcode(IR::Opcode::GlobalAtomicSMin32x2);
case IR::Opcode::GlobalAtomicUMin64:
return inst.ReplaceOpcode(IR::Opcode::GlobalAtomicUMin32x2);
case IR::Opcode::GlobalAtomicSMax64:
return inst.ReplaceOpcode(IR::Opcode::GlobalAtomicSMax32x2);
case IR::Opcode::GlobalAtomicUMax64:
return inst.ReplaceOpcode(IR::Opcode::GlobalAtomicUMax32x2);
case IR::Opcode::GlobalAtomicAnd64:
return inst.ReplaceOpcode(IR::Opcode::GlobalAtomicAnd32x2);
case IR::Opcode::GlobalAtomicOr64:
return inst.ReplaceOpcode(IR::Opcode::GlobalAtomicOr32x2);
case IR::Opcode::GlobalAtomicXor64:
return inst.ReplaceOpcode(IR::Opcode::GlobalAtomicXor32x2);
case IR::Opcode::GlobalAtomicExchange64:
return inst.ReplaceOpcode(IR::Opcode::GlobalAtomicExchange32x2);
default:
break;
}


@@ -1474,6 +1474,8 @@ typename BufferCache<P>::OverlapResult BufferCache<P>::ResolveOverlaps(VAddr cpu
// When this memory region has been joined a bunch of times, we assume it's being used
// as a stream buffer. Increase the size to skip constantly recreating buffers.
has_stream_leap = true;
begin -= PAGE_SIZE * 256;
cpu_addr = begin;
end += PAGE_SIZE * 256;
}
}
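For scale: the leap now grows the resolved range by 256 pages on each side. Assuming a 4 KiB page for illustration (the actual PAGE_SIZE depends on the cache configuration), that is 1 MiB per side; a quick check of the arithmetic:
#include <cstddef>
constexpr std::size_t PAGE_SIZE = 4096;               // assumed for illustration
constexpr std::size_t STREAM_LEAP = PAGE_SIZE * 256;  // growth per side
static_assert(STREAM_LEAP == std::size_t{1} << 20, "256 x 4 KiB pages == 1 MiB");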


@@ -240,7 +240,7 @@ void Maxwell3D::CallMacroMethod(u32 method, const std::vector<u32>& parameters)
((method - MacroRegistersStart) >> 1) % static_cast<u32>(macro_positions.size());
// Execute the current macro.
macro_engine->Execute(*this, macro_positions[entry], parameters);
macro_engine->Execute(macro_positions[entry], parameters);
if (mme_draw.current_mode != MMEDrawMode::Undefined) {
FlushMMEInlineDraw();
}


@@ -12,9 +12,6 @@
#include "video_core/framebuffer_config.h"
namespace Core {
namespace Frontend {
class EmuWindow;
}
class System;
} // namespace Core
@@ -25,7 +22,6 @@ class ShaderNotify;
namespace Tegra {
class DmaPusher;
class CDmaPusher;
struct CommandList;
enum class RenderTargetFormat : u32 {
@@ -88,15 +84,9 @@ enum class DepthFormat : u32 {
D32_FLOAT_S8X24_UINT = 0x19,
};
struct CommandListHeader;
class DebugContext;
namespace Engines {
class Fermi2D;
class Maxwell3D;
class MaxwellDMA;
class KeplerCompute;
class KeplerMemory;
} // namespace Engines
enum class EngineID {
@@ -190,12 +180,6 @@ public:
/// Returns a const reference to the GPU DMA pusher.
[[nodiscard]] const Tegra::DmaPusher& DmaPusher() const;
/// Returns a reference to the GPU CDMA pusher.
[[nodiscard]] Tegra::CDmaPusher& CDmaPusher();
/// Returns a const reference to the GPU CDMA pusher.
[[nodiscard]] const Tegra::CDmaPusher& CDmaPusher() const;
/// Returns a reference to the underlying renderer.
[[nodiscard]] VideoCore::RendererBase& Renderer();


@@ -2,12 +2,13 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <cstring>
#include <optional>
#include <boost/container_hash/hash.hpp>
#include "common/assert.h"
#include "common/logging/log.h"
#include "common/settings.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/macro/macro.h"
#include "video_core/macro/macro_hle.h"
#include "video_core/macro/macro_interpreter.h"
@@ -24,8 +25,7 @@ void MacroEngine::AddCode(u32 method, u32 data) {
uploaded_macro_code[method].push_back(data);
}
void MacroEngine::Execute(Engines::Maxwell3D& maxwell3d, u32 method,
const std::vector<u32>& parameters) {
void MacroEngine::Execute(u32 method, const std::vector<u32>& parameters) {
auto compiled_macro = macro_cache.find(method);
if (compiled_macro != macro_cache.end()) {
const auto& cache_info = compiled_macro->second;
@@ -66,10 +66,9 @@ void MacroEngine::Execute(Engines::Maxwell3D& maxwell3d, u32 method,
cache_info.lle_program = Compile(code);
}
auto hle_program = hle_macros->GetHLEProgram(cache_info.hash);
if (hle_program.has_value()) {
if (auto hle_program = hle_macros->GetHLEProgram(cache_info.hash)) {
cache_info.has_hle_program = true;
cache_info.hle_program = std::move(hle_program.value());
cache_info.hle_program = std::move(hle_program);
cache_info.hle_program->Execute(parameters, method);
} else {
cache_info.lle_program->Execute(parameters, method);
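The std::optional wrapper around the returned pointer was dropped here: a null std::unique_ptr already encodes the miss case, which enables the if-initializer form above. A minimal self-contained sketch of the idiom:
#include <memory>
struct CachedMacro {};
// Sketch: nullptr encodes "no HLE program for this hash", so no
// std::optional wrapper is needed around the unique_ptr.
std::unique_ptr<CachedMacro> GetHLEProgram(unsigned long long hash) {
    return nullptr;  // hash lookup elided in this sketch
}
int main() {
    if (auto program = GetHLEProgram(0)) {
        // execute *program
    }
}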


@@ -119,7 +119,7 @@ public:
void AddCode(u32 method, u32 data);
// Compiles the macro if it's not in the cache, and executes the compiled macro
void Execute(Engines::Maxwell3D& maxwell3d, u32 method, const std::vector<u32>& parameters);
void Execute(u32 method, const std::vector<u32>& parameters);
protected:
virtual std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) = 0;


@@ -5,12 +5,15 @@
#include <array>
#include <vector>
#include "video_core/engines/maxwell_3d.h"
#include "video_core/macro/macro.h"
#include "video_core/macro/macro_hle.h"
#include "video_core/rasterizer_interface.h"
namespace Tegra {
namespace {
using HLEFunction = void (*)(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters);
// HLE'd functions
void HLE_771BB18C62444DA0(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters) {
const u32 instance_count = parameters[2] & maxwell3d.GetRegisterValue(0xD1B);
@@ -77,7 +80,6 @@ void HLE_0217920100488FF7(Engines::Maxwell3D& maxwell3d, const std::vector<u32>&
maxwell3d.CallMethodFromMME(0x8e5, 0x0);
maxwell3d.mme_draw.current_mode = Engines::Maxwell3D::MMEDrawMode::Undefined;
}
} // Anonymous namespace
constexpr std::array<std::pair<u64, HLEFunction>, 3> hle_funcs{{
{0x771BB18C62444DA0, &HLE_771BB18C62444DA0},
@@ -85,25 +87,31 @@ constexpr std::array<std::pair<u64, HLEFunction>, 3> hle_funcs{{
{0x0217920100488FF7, &HLE_0217920100488FF7},
}};
class HLEMacroImpl final : public CachedMacro {
public:
explicit HLEMacroImpl(Engines::Maxwell3D& maxwell3d_, HLEFunction func_)
: maxwell3d{maxwell3d_}, func{func_} {}
void Execute(const std::vector<u32>& parameters, u32 method) override {
func(maxwell3d, parameters);
}
private:
Engines::Maxwell3D& maxwell3d;
HLEFunction func;
};
} // Anonymous namespace
HLEMacro::HLEMacro(Engines::Maxwell3D& maxwell3d_) : maxwell3d{maxwell3d_} {}
HLEMacro::~HLEMacro() = default;
std::optional<std::unique_ptr<CachedMacro>> HLEMacro::GetHLEProgram(u64 hash) const {
std::unique_ptr<CachedMacro> HLEMacro::GetHLEProgram(u64 hash) const {
const auto it = std::find_if(hle_funcs.cbegin(), hle_funcs.cend(),
[hash](const auto& pair) { return pair.first == hash; });
if (it == hle_funcs.end()) {
return std::nullopt;
return nullptr;
}
return std::make_unique<HLEMacroImpl>(maxwell3d, it->second);
}
HLEMacroImpl::~HLEMacroImpl() = default;
HLEMacroImpl::HLEMacroImpl(Engines::Maxwell3D& maxwell3d_, HLEFunction func_)
: maxwell3d{maxwell3d_}, func{func_} {}
void HLEMacroImpl::Execute(const std::vector<u32>& parameters, u32 method) {
func(maxwell3d, parameters);
}
} // namespace Tegra


@@ -5,10 +5,7 @@
#pragma once
#include <memory>
#include <optional>
#include <vector>
#include "common/common_types.h"
#include "video_core/macro/macro.h"
namespace Tegra {
@@ -16,29 +13,17 @@ namespace Engines {
class Maxwell3D;
}
using HLEFunction = void (*)(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& parameters);
class HLEMacro {
public:
explicit HLEMacro(Engines::Maxwell3D& maxwell3d_);
~HLEMacro();
std::optional<std::unique_ptr<CachedMacro>> GetHLEProgram(u64 hash) const;
// Allocates and returns a cached macro if the hash matches a known function.
// Returns nullptr otherwise.
[[nodiscard]] std::unique_ptr<CachedMacro> GetHLEProgram(u64 hash) const;
private:
Engines::Maxwell3D& maxwell3d;
};
class HLEMacroImpl : public CachedMacro {
public:
explicit HLEMacroImpl(Engines::Maxwell3D& maxwell3d, HLEFunction func);
~HLEMacroImpl();
void Execute(const std::vector<u32>& parameters, u32 method) override;
private:
Engines::Maxwell3D& maxwell3d;
HLEFunction func;
};
} // namespace Tegra


@@ -2,6 +2,9 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <array>
#include <optional>
#include "common/assert.h"
#include "common/logging/log.h"
#include "common/microprofile.h"
@@ -11,16 +14,81 @@
MICROPROFILE_DEFINE(MacroInterp, "GPU", "Execute macro interpreter", MP_RGB(128, 128, 192));
namespace Tegra {
MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d_)
: MacroEngine{maxwell3d_}, maxwell3d{maxwell3d_} {}
namespace {
class MacroInterpreterImpl final : public CachedMacro {
public:
explicit MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d_, const std::vector<u32>& code_)
: maxwell3d{maxwell3d_}, code{code_} {}
std::unique_ptr<CachedMacro> MacroInterpreter::Compile(const std::vector<u32>& code) {
return std::make_unique<MacroInterpreterImpl>(maxwell3d, code);
}
void Execute(const std::vector<u32>& params, u32 method) override;
MacroInterpreterImpl::MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d_,
const std::vector<u32>& code_)
: maxwell3d{maxwell3d_}, code{code_} {}
private:
/// Resets the execution engine state, zeroing registers, etc.
void Reset();
/**
* Executes a single macro instruction located at the current program counter. Returns whether
* the interpreter should keep running.
*
* @param is_delay_slot Whether the current step is being executed due to a delay slot in a
* previous instruction.
*/
bool Step(bool is_delay_slot);
/// Calculates the result of an ALU operation. src_a OP src_b;
u32 GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b);
/// Performs the result operation on the input result and stores it in the specified register
/// (if necessary).
void ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result);
/// Evaluates the branch condition and returns whether the branch should be taken or not.
bool EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const;
/// Reads an opcode at the current program counter location.
Macro::Opcode GetOpcode() const;
/// Returns the specified register's value. Register 0 is hardcoded to always return 0.
u32 GetRegister(u32 register_id) const;
/// Sets the register to the input value.
void SetRegister(u32 register_id, u32 value);
/// Sets the method address to use for the next Send instruction.
void SetMethodAddress(u32 address);
/// Calls a GPU Engine method with the input parameter.
void Send(u32 value);
/// Reads a GPU register located at the method address.
u32 Read(u32 method) const;
/// Returns the next parameter in the parameter queue.
u32 FetchParameter();
Engines::Maxwell3D& maxwell3d;
/// Current program counter
u32 pc{};
/// Program counter to execute at after the delay slot is executed.
std::optional<u32> delayed_pc;
/// General purpose macro registers.
std::array<u32, Macro::NUM_MACRO_REGISTERS> registers = {};
/// Method address to use for the next Send instruction.
Macro::MethodAddress method_address = {};
/// Input parameters of the current macro.
std::unique_ptr<u32[]> parameters;
std::size_t num_parameters = 0;
std::size_t parameters_capacity = 0;
/// Index of the next parameter that will be fetched by the 'parm' instruction.
u32 next_parameter_index = 0;
bool carry_flag = false;
const std::vector<u32>& code;
};
void MacroInterpreterImpl::Execute(const std::vector<u32>& params, u32 method) {
MICROPROFILE_SCOPE(MacroInterp);
@@ -283,5 +351,13 @@ u32 MacroInterpreterImpl::FetchParameter() {
ASSERT(next_parameter_index < num_parameters);
return parameters[next_parameter_index++];
}
} // Anonymous namespace
MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d_)
: MacroEngine{maxwell3d_}, maxwell3d{maxwell3d_} {}
std::unique_ptr<CachedMacro> MacroInterpreter::Compile(const std::vector<u32>& code) {
return std::make_unique<MacroInterpreterImpl>(maxwell3d, code);
}
} // namespace Tegra


@@ -3,10 +3,9 @@
// Refer to the license.txt file included.
#pragma once
#include <array>
#include <optional>
#include <vector>
#include "common/bit_field.h"
#include "common/common_types.h"
#include "video_core/macro/macro.h"
@@ -26,77 +25,4 @@ private:
Engines::Maxwell3D& maxwell3d;
};
class MacroInterpreterImpl : public CachedMacro {
public:
explicit MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d_, const std::vector<u32>& code_);
void Execute(const std::vector<u32>& params, u32 method) override;
private:
/// Resets the execution engine state, zeroing registers, etc.
void Reset();
/**
* Executes a single macro instruction located at the current program counter. Returns whether
* the interpreter should keep running.
*
* @param is_delay_slot Whether the current step is being executed due to a delay slot in a
* previous instruction.
*/
bool Step(bool is_delay_slot);
/// Calculates the result of an ALU operation. src_a OP src_b;
u32 GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b);
/// Performs the result operation on the input result and stores it in the specified register
/// (if necessary).
void ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result);
/// Evaluates the branch condition and returns whether the branch should be taken or not.
bool EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const;
/// Reads an opcode at the current program counter location.
Macro::Opcode GetOpcode() const;
/// Returns the specified register's value. Register 0 is hardcoded to always return 0.
u32 GetRegister(u32 register_id) const;
/// Sets the register to the input value.
void SetRegister(u32 register_id, u32 value);
/// Sets the method address to use for the next Send instruction.
void SetMethodAddress(u32 address);
/// Calls a GPU Engine method with the input parameter.
void Send(u32 value);
/// Reads a GPU register located at the method address.
u32 Read(u32 method) const;
/// Returns the next parameter in the parameter queue.
u32 FetchParameter();
Engines::Maxwell3D& maxwell3d;
/// Current program counter
u32 pc;
/// Program counter to execute at after the delay slot is executed.
std::optional<u32> delayed_pc;
/// General purpose macro registers.
std::array<u32, Macro::NUM_MACRO_REGISTERS> registers = {};
/// Method address to use for the next Send instruction.
Macro::MethodAddress method_address = {};
/// Input parameters of the current macro.
std::unique_ptr<u32[]> parameters;
std::size_t num_parameters = 0;
std::size_t parameters_capacity = 0;
/// Index of the next parameter that will be fetched by the 'parm' instruction.
u32 next_parameter_index = 0;
bool carry_flag = false;
const std::vector<u32>& code;
};
} // namespace Tegra


@@ -2,9 +2,17 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <array>
#include <bitset>
#include <optional>
#include <xbyak/xbyak.h>
#include "common/assert.h"
#include "common/bit_field.h"
#include "common/logging/log.h"
#include "common/microprofile.h"
#include "common/x64/xbyak_abi.h"
#include "common/x64/xbyak_util.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/macro/macro_interpreter.h"
@@ -14,13 +22,14 @@ MICROPROFILE_DEFINE(MacroJitCompile, "GPU", "Compile macro JIT", MP_RGB(173, 255
MICROPROFILE_DEFINE(MacroJitExecute, "GPU", "Execute macro JIT", MP_RGB(255, 255, 0));
namespace Tegra {
namespace {
constexpr Xbyak::Reg64 STATE = Xbyak::util::rbx;
constexpr Xbyak::Reg32 RESULT = Xbyak::util::ebp;
constexpr Xbyak::Reg64 PARAMETERS = Xbyak::util::r12;
constexpr Xbyak::Reg32 METHOD_ADDRESS = Xbyak::util::r14d;
constexpr Xbyak::Reg64 BRANCH_HOLDER = Xbyak::util::r15;
static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({
constexpr std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({
STATE,
RESULT,
PARAMETERS,
@@ -28,19 +37,75 @@ static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({
BRANCH_HOLDER,
});
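Declaring PERSISTENT_REGISTERS constexpr requires BuildRegSet() itself to be usable in a constant expression. A minimal sketch of such a builder, assuming an xbyak version whose Reg accessors are constexpr:
#include <bitset>
#include <cstddef>
#include <initializer_list>
#include <xbyak/xbyak.h>
// Sketch: accumulate register indices into an integer, then construct the
// bitset from it (std::bitset's integer constructor is constexpr).
constexpr std::bitset<32> BuildRegSet(std::initializer_list<Xbyak::Reg> regs) {
    std::size_t bits = 0;
    for (const Xbyak::Reg& reg : regs) {
        bits |= std::size_t{1} << reg.getIdx();
    }
    return std::bitset<32>{bits};
}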
MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d_)
: MacroEngine{maxwell3d_}, maxwell3d{maxwell3d_} {}
// Arbitrarily chosen based on current booting games.
constexpr size_t MAX_CODE_SIZE = 0x10000;
std::unique_ptr<CachedMacro> MacroJITx64::Compile(const std::vector<u32>& code) {
return std::make_unique<MacroJITx64Impl>(maxwell3d, code);
std::bitset<32> PersistentCallerSavedRegs() {
return PERSISTENT_REGISTERS & Common::X64::ABI_ALL_CALLER_SAVED;
}
MacroJITx64Impl::MacroJITx64Impl(Engines::Maxwell3D& maxwell3d_, const std::vector<u32>& code_)
: CodeGenerator{MAX_CODE_SIZE}, code{code_}, maxwell3d{maxwell3d_} {
Compile();
}
class MacroJITx64Impl final : public Xbyak::CodeGenerator, public CachedMacro {
public:
explicit MacroJITx64Impl(Engines::Maxwell3D& maxwell3d_, const std::vector<u32>& code_)
: CodeGenerator{MAX_CODE_SIZE}, code{code_}, maxwell3d{maxwell3d_} {
Compile();
}
MacroJITx64Impl::~MacroJITx64Impl() = default;
void Execute(const std::vector<u32>& parameters, u32 method) override;
void Compile_ALU(Macro::Opcode opcode);
void Compile_AddImmediate(Macro::Opcode opcode);
void Compile_ExtractInsert(Macro::Opcode opcode);
void Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode);
void Compile_ExtractShiftLeftRegister(Macro::Opcode opcode);
void Compile_Read(Macro::Opcode opcode);
void Compile_Branch(Macro::Opcode opcode);
private:
void Optimizer_ScanFlags();
void Compile();
bool Compile_NextInstruction();
Xbyak::Reg32 Compile_FetchParameter();
Xbyak::Reg32 Compile_GetRegister(u32 index, Xbyak::Reg32 dst);
void Compile_ProcessResult(Macro::ResultOperation operation, u32 reg);
void Compile_Send(Xbyak::Reg32 value);
Macro::Opcode GetOpCode() const;
struct JITState {
Engines::Maxwell3D* maxwell3d{};
std::array<u32, Macro::NUM_MACRO_REGISTERS> registers{};
u32 carry_flag{};
};
static_assert(offsetof(JITState, maxwell3d) == 0, "Maxwell3D is not at 0x0");
using ProgramType = void (*)(JITState*, const u32*);
struct OptimizerState {
bool can_skip_carry{};
bool has_delayed_pc{};
bool zero_reg_skip{};
bool skip_dummy_addimmediate{};
bool optimize_for_method_move{};
bool enable_asserts{};
};
OptimizerState optimizer{};
std::optional<Macro::Opcode> next_opcode{};
ProgramType program{nullptr};
std::array<Xbyak::Label, MAX_CODE_SIZE> labels;
std::array<Xbyak::Label, MAX_CODE_SIZE> delay_skip;
Xbyak::Label end_of_code{};
bool is_delay_slot{};
u32 pc{};
const std::vector<u32>& code;
Engines::Maxwell3D& maxwell3d;
};
void MacroJITx64Impl::Execute(const std::vector<u32>& parameters, u32 method) {
MICROPROFILE_SCOPE(MacroJitExecute);
@@ -307,11 +372,11 @@ void MacroJITx64Impl::Compile_Read(Macro::Opcode opcode) {
Compile_ProcessResult(opcode.result_operation, opcode.dst);
}
static void Send(Engines::Maxwell3D* maxwell3d, Macro::MethodAddress method_address, u32 value) {
void Send(Engines::Maxwell3D* maxwell3d, Macro::MethodAddress method_address, u32 value) {
maxwell3d->CallMethodFromMME(method_address.address, value);
}
void Tegra::MacroJITx64Impl::Compile_Send(Xbyak::Reg32 value) {
void MacroJITx64Impl::Compile_Send(Xbyak::Reg32 value) {
Common::X64::ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
mov(Common::X64::ABI_PARAM1, qword[STATE]);
mov(Common::X64::ABI_PARAM2, METHOD_ADDRESS);
@@ -338,7 +403,7 @@ void Tegra::MacroJITx64Impl::Compile_Send(Xbyak::Reg32 value) {
L(dont_process);
}
void Tegra::MacroJITx64Impl::Compile_Branch(Macro::Opcode opcode) {
void MacroJITx64Impl::Compile_Branch(Macro::Opcode opcode) {
ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid");
const s32 jump_address =
static_cast<s32>(pc) + static_cast<s32>(opcode.GetBranchTarget() / sizeof(s32));
@@ -392,7 +457,7 @@ void Tegra::MacroJITx64Impl::Compile_Branch(Macro::Opcode opcode) {
L(end);
}
void Tegra::MacroJITx64Impl::Optimizer_ScanFlags() {
void MacroJITx64Impl::Optimizer_ScanFlags() {
optimizer.can_skip_carry = true;
optimizer.has_delayed_pc = false;
for (auto raw_op : code) {
@@ -534,7 +599,7 @@ bool MacroJITx64Impl::Compile_NextInstruction() {
return true;
}
Xbyak::Reg32 Tegra::MacroJITx64Impl::Compile_FetchParameter() {
Xbyak::Reg32 MacroJITx64Impl::Compile_FetchParameter() {
mov(eax, dword[PARAMETERS]);
add(PARAMETERS, sizeof(u32));
return eax;
@@ -611,9 +676,12 @@ Macro::Opcode MacroJITx64Impl::GetOpCode() const {
ASSERT(pc < code.size());
return {code[pc]};
}
} // Anonymous namespace
std::bitset<32> MacroJITx64Impl::PersistentCallerSavedRegs() const {
return PERSISTENT_REGISTERS & Common::X64::ABI_ALL_CALLER_SAVED;
MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d_)
: MacroEngine{maxwell3d_}, maxwell3d{maxwell3d_} {}
std::unique_ptr<CachedMacro> MacroJITx64::Compile(const std::vector<u32>& code) {
return std::make_unique<MacroJITx64Impl>(maxwell3d, code);
}
} // namespace Tegra


@@ -4,12 +4,7 @@
#pragma once
#include <array>
#include <bitset>
#include <xbyak/xbyak.h>
#include "common/bit_field.h"
#include "common/common_types.h"
#include "common/x64/xbyak_abi.h"
#include "video_core/macro/macro.h"
namespace Tegra {
@@ -18,9 +13,6 @@ namespace Engines {
class Maxwell3D;
}
/// MAX_CODE_SIZE is arbitrarily chosen based on current booting games
constexpr size_t MAX_CODE_SIZE = 0x10000;
class MacroJITx64 final : public MacroEngine {
public:
explicit MacroJITx64(Engines::Maxwell3D& maxwell3d_);
@@ -32,67 +24,4 @@ private:
Engines::Maxwell3D& maxwell3d;
};
class MacroJITx64Impl : public Xbyak::CodeGenerator, public CachedMacro {
public:
explicit MacroJITx64Impl(Engines::Maxwell3D& maxwell3d_, const std::vector<u32>& code_);
~MacroJITx64Impl();
void Execute(const std::vector<u32>& parameters, u32 method) override;
void Compile_ALU(Macro::Opcode opcode);
void Compile_AddImmediate(Macro::Opcode opcode);
void Compile_ExtractInsert(Macro::Opcode opcode);
void Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode);
void Compile_ExtractShiftLeftRegister(Macro::Opcode opcode);
void Compile_Read(Macro::Opcode opcode);
void Compile_Branch(Macro::Opcode opcode);
private:
void Optimizer_ScanFlags();
void Compile();
bool Compile_NextInstruction();
Xbyak::Reg32 Compile_FetchParameter();
Xbyak::Reg32 Compile_GetRegister(u32 index, Xbyak::Reg32 dst);
void Compile_ProcessResult(Macro::ResultOperation operation, u32 reg);
void Compile_Send(Xbyak::Reg32 value);
Macro::Opcode GetOpCode() const;
std::bitset<32> PersistentCallerSavedRegs() const;
struct JITState {
Engines::Maxwell3D* maxwell3d{};
std::array<u32, Macro::NUM_MACRO_REGISTERS> registers{};
u32 carry_flag{};
};
static_assert(offsetof(JITState, maxwell3d) == 0, "Maxwell3D is not at 0x0");
using ProgramType = void (*)(JITState*, const u32*);
struct OptimizerState {
bool can_skip_carry{};
bool has_delayed_pc{};
bool zero_reg_skip{};
bool skip_dummy_addimmediate{};
bool optimize_for_method_move{};
bool enable_asserts{};
};
OptimizerState optimizer{};
std::optional<Macro::Opcode> next_opcode{};
ProgramType program{nullptr};
std::array<Xbyak::Label, MAX_CODE_SIZE> labels;
std::array<Xbyak::Label, MAX_CODE_SIZE> delay_skip;
Xbyak::Label end_of_code{};
bool is_delay_slot{};
u32 pc{};
std::optional<u32> delayed_pc;
const std::vector<u32>& code;
Engines::Maxwell3D& maxwell3d;
};
} // namespace Tegra


@@ -214,7 +214,7 @@ VkImageView FSR::Draw(VKScheduler& scheduler, size_t image_index, VkImageView im
{
VkImageMemoryBarrier fsr_write_barrier = base_barrier;
fsr_write_barrier.image = *images[image_index],
fsr_write_barrier.image = *images[image_index];
fsr_write_barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,


@@ -393,6 +393,8 @@ void Config::ReadControlValues() {
ReadGlobalSetting(Settings::values.enable_accurate_vibrations);
ReadGlobalSetting(Settings::values.motion_enabled);
ReadBasicSetting(Settings::values.controller_navigation);
qt_config->endGroup();
}
@@ -1001,6 +1003,7 @@ void Config::SaveControlValues() {
WriteBasicSetting(Settings::values.keyboard_enabled);
WriteBasicSetting(Settings::values.emulate_analog_keyboard);
WriteBasicSetting(Settings::values.mouse_panning_sensitivity);
WriteBasicSetting(Settings::values.controller_navigation);
WriteBasicSetting(Settings::values.tas_enable);
WriteBasicSetting(Settings::values.tas_loop);


@@ -429,7 +429,7 @@
</item>
<item>
<property name="text">
<string>AMD FidelityFX™ Super Resolution [Vulkan Only]</string>
<string>AMD FidelityFX™ Super Resolution (Vulkan Only)</string>
</property>
</item>
</widget>


@@ -131,6 +131,7 @@ void ConfigureInputAdvanced::ApplyConfiguration() {
Settings::values.touchscreen.enabled = ui->touchscreen_enabled->isChecked();
Settings::values.enable_raw_input = ui->enable_raw_input->isChecked();
Settings::values.enable_udp_controller = ui->enable_udp_controller->isChecked();
Settings::values.controller_navigation = ui->controller_navigation->isChecked();
}
void ConfigureInputAdvanced::LoadConfiguration() {
@@ -162,6 +163,7 @@ void ConfigureInputAdvanced::LoadConfiguration() {
ui->touchscreen_enabled->setChecked(Settings::values.touchscreen.enabled);
ui->enable_raw_input->setChecked(Settings::values.enable_raw_input.GetValue());
ui->enable_udp_controller->setChecked(Settings::values.enable_udp_controller.GetValue());
ui->controller_navigation->setChecked(Settings::values.controller_navigation.GetValue());
UpdateUIEnabled();
}


@@ -2655,6 +2655,19 @@
</widget>
</item>
<item row="4" column="0">
<widget class="QCheckBox" name="controller_navigation">
<property name="minimumSize">
<size>
<width>0</width>
<height>23</height>
</size>
</property>
<property name="text">
<string>Controller navigation</string>
</property>
</widget>
</item>
<item row="5" column="0">
<widget class="QCheckBox" name="mouse_panning">
<property name="minimumSize">
<size>
@@ -2667,7 +2680,7 @@
</property>
</widget>
</item>
<item row="4" column="2">
<item row="5" column="2">
<widget class="QSpinBox" name="mouse_panning_sensitivity">
<property name="toolTip">
<string>Mouse sensitivity</string>
@@ -2689,14 +2702,14 @@
</property>
</widget>
</item>
<item row="5" column="0">
<item row="6" column="0">
<widget class="QLabel" name="motion_touch">
<property name="text">
<string>Motion / Touch</string>
</property>
</widget>
</item>
<item row="5" column="2">
<item row="6" column="2">
<widget class="QPushButton" name="buttonMotionTouch">
<property name="text">
<string>Configure</string>


@@ -147,7 +147,7 @@ QString ConfigureInputPlayer::ButtonToText(const Common::ParamPackage& param) {
// Retrieve the names from Qt
if (param.Get("engine", "") == "keyboard") {
const QString button_str = GetKeyName(param.Get("code", 0));
return QObject::tr("%1%2").arg(toggle, button_str);
return QObject::tr("%1%2%3").arg(toggle, inverted, button_str);
}
if (common_button_name == Common::Input::ButtonNames::Invalid) {
@@ -341,7 +341,7 @@ ConfigureInputPlayer::ConfigureInputPlayer(QWidget* parent, std::size_t player_i
emulated_controller->SetButtonParam(button_id, {});
button_map[button_id]->setText(tr("[not set]"));
});
if (param.Has("button") || param.Has("hat")) {
if (param.Has("code") || param.Has("button") || param.Has("hat")) {
context_menu.addAction(tr("Toggle button"), [&] {
const bool toggle_value = !param.Get("toggle", false);
param.Set("toggle", toggle_value);
@@ -349,8 +349,8 @@ ConfigureInputPlayer::ConfigureInputPlayer(QWidget* parent, std::size_t player_i
emulated_controller->SetButtonParam(button_id, param);
});
context_menu.addAction(tr("Invert button"), [&] {
const bool toggle_value = !param.Get("inverted", false);
param.Set("inverted", toggle_value);
const bool invert_value = !param.Get("inverted", false);
param.Set("inverted", invert_value);
button_map[button_id]->setText(ButtonToText(param));
emulated_controller->SetButtonParam(button_id, param);
});
@@ -522,28 +522,37 @@ ConfigureInputPlayer::ConfigureInputPlayer(QWidget* parent, std::size_t player_i
analog_map_modifier_button[analog_id]->setContextMenuPolicy(Qt::CustomContextMenu);
connect(analog_map_modifier_button[analog_id], &QPushButton::customContextMenuRequested,
[=, this](const QPoint& menu_location) {
QMenu context_menu;
Common::ParamPackage param = emulated_controller->GetStickParam(analog_id);
context_menu.addAction(tr("Clear"), [&] {
param.Set("modifier", "");
analog_map_modifier_button[analog_id]->setText(tr("[not set]"));
emulated_controller->SetStickParam(analog_id, param);
});
context_menu.addAction(tr("Toggle button"), [&] {
Common::ParamPackage modifier_param =
Common::ParamPackage{param.Get("modifier", "")};
const bool toggle_value = !modifier_param.Get("toggle", false);
modifier_param.Set("toggle", toggle_value);
param.Set("modifier", modifier_param.Serialize());
analog_map_modifier_button[analog_id]->setText(
ButtonToText(modifier_param));
emulated_controller->SetStickParam(analog_id, param);
});
context_menu.exec(
analog_map_modifier_button[analog_id]->mapToGlobal(menu_location));
connect(
analog_map_modifier_button[analog_id], &QPushButton::customContextMenuRequested,
[=, this](const QPoint& menu_location) {
QMenu context_menu;
Common::ParamPackage param = emulated_controller->GetStickParam(analog_id);
context_menu.addAction(tr("Clear"), [&] {
param.Set("modifier", "");
analog_map_modifier_button[analog_id]->setText(tr("[not set]"));
emulated_controller->SetStickParam(analog_id, param);
});
context_menu.addAction(tr("Toggle button"), [&] {
Common::ParamPackage modifier_param =
Common::ParamPackage{param.Get("modifier", "")};
const bool toggle_value = !modifier_param.Get("toggle", false);
modifier_param.Set("toggle", toggle_value);
param.Set("modifier", modifier_param.Serialize());
analog_map_modifier_button[analog_id]->setText(ButtonToText(modifier_param));
emulated_controller->SetStickParam(analog_id, param);
});
context_menu.addAction(tr("Invert button"), [&] {
Common::ParamPackage modifier_param =
Common::ParamPackage{param.Get("modifier", "")};
const bool invert_value = !modifier_param.Get("inverted", false);
modifier_param.Set("inverted", invert_value);
param.Set("modifier", modifier_param.Serialize());
analog_map_modifier_button[analog_id]->setText(ButtonToText(modifier_param));
emulated_controller->SetStickParam(analog_id, param);
});
context_menu.exec(
analog_map_modifier_button[analog_id]->mapToGlobal(menu_location));
});
connect(analog_map_range_spinbox[analog_id], qOverload<int>(&QSpinBox::valueChanged),
[=, this] {


@@ -190,6 +190,9 @@ void ControllerShortcut::ControllerUpdateEvent(Core::HID::ControllerTriggerType
if (type != Core::HID::ControllerTriggerType::Button) {
return;
}
if (!Settings::values.controller_navigation) {
return;
}
if (button_sequence.npad.raw == Core::HID::NpadButton::None &&
button_sequence.capture.raw == 0 && button_sequence.home.raw == 0) {
return;


@@ -40,6 +40,9 @@ void ControllerNavigation::TriggerButton(Settings::NativeButton::Values native_b
void ControllerNavigation::ControllerUpdateEvent(Core::HID::ControllerTriggerType type) {
std::lock_guard lock{mutex};
if (!Settings::values.controller_navigation) {
return;
}
if (type == Core::HID::ControllerTriggerType::Button) {
ControllerUpdateButton();
return;