Compare commits
40 Commits
| Author | SHA1 | Date |
|---|---|---|
|  | 622830f4e1 |  |
|  | 9ea8cffe35 |  |
|  | c22d0d9945 |  |
|  | 9b24197ca0 |  |
|  | 8008b5ddc9 |  |
|  | da7be67daf |  |
|  | 0aad914527 |  |
|  | 7bf9f9ae49 |  |
|  | 9f5facc3aa |  |
|  | 0785796372 |  |
|  | e829973742 |  |
|  | 1e149dc18b |  |
|  | dc5396a466 |  |
|  | af477fb8c5 |  |
|  | a0d7a2732d |  |
|  | f6a89edb67 |  |
|  | 00fb79b2f3 |  |
|  | 91a45834fd |  |
|  | 0b75ec5316 |  |
|  | c0ab5b79dc |  |
|  | a111a9ae2c |  |
|  | 6f006d051e |  |
|  | d62d28522b |  |
|  | 087f52e872 |  |
|  | 6bbbbe8f85 |  |
|  | fc6db97a09 |  |
|  | 4bfa411ddc |  |
|  | 4f0f481f63 |  |
|  | 848bdf8a40 |  |
|  | 7d2839d7a3 |  |
|  | e67b8678f8 |  |
|  | c6e1c46ac7 |  |
|  | c64545d07a |  |
|  | 1d4cbb92f2 |  |
|  | 6053b95552 |  |
|  | 66edfd61c6 |  |
|  | 4a3fd97e48 |  |
|  | d567b7e841 |  |
|  | 5553bd3ba2 |  |
|  | 657771bdcb |  |
.gitmodules (vendored, 3 changes)
```diff
@@ -7,9 +7,6 @@
 [submodule "dynarmic"]
 	path = externals/dynarmic
 	url = https://github.com/MerryMage/dynarmic.git
-[submodule "unicorn"]
-	path = externals/unicorn
-	url = https://github.com/yuzu-emu/unicorn
 [submodule "soundtouch"]
 	path = externals/soundtouch
 	url = https://github.com/citra-emu/ext-soundtouch.git
```
```diff
@@ -4,16 +4,8 @@ cd /yuzu
 # override Travis CI unreasonable ccache size
 echo 'max_size = 3.0G' > "$HOME/.ccache/ccache.conf"
 
-# Dirty hack to trick unicorn makefile into believing we are in a MINGW system
-mv /bin/uname /bin/uname1 && echo -e '#!/bin/sh\necho MINGW64' >> /bin/uname
-chmod +x /bin/uname
-
-# Dirty hack to trick unicorn makefile into believing we have cmd
-echo '' >> /bin/cmd
-chmod +x /bin/cmd
-
 mkdir build && cd build
-cmake .. -G Ninja -DCMAKE_TOOLCHAIN_FILE="$(pwd)/../CMakeModules/MinGWCross.cmake" -DUSE_CCACHE=ON -DYUZU_USE_BUNDLED_UNICORN=ON -DENABLE_COMPATIBILITY_LIST_DOWNLOAD=ON -DCMAKE_BUILD_TYPE=Release
+cmake .. -G Ninja -DCMAKE_TOOLCHAIN_FILE="$(pwd)/../CMakeModules/MinGWCross.cmake" -DUSE_CCACHE=ON -DENABLE_COMPATIBILITY_LIST_DOWNLOAD=ON -DCMAKE_BUILD_TYPE=Release
 ninja
 
-# Clean up the dirty hacks
```
```diff
@@ -3,7 +3,7 @@
 cd /yuzu
 
 mkdir build && cd build
-cmake .. -G Ninja -DYUZU_USE_BUNDLED_UNICORN=ON -DYUZU_USE_QT_WEB_ENGINE=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=/usr/lib/ccache/gcc -DCMAKE_CXX_COMPILER=/usr/lib/ccache/g++ -DYUZU_ENABLE_COMPATIBILITY_REPORTING=${ENABLE_COMPATIBILITY_REPORTING:-"OFF"} -DENABLE_COMPATIBILITY_LIST_DOWNLOAD=ON -DUSE_DISCORD_PRESENCE=ON
+cmake .. -G Ninja -DYUZU_USE_QT_WEB_ENGINE=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=/usr/lib/ccache/gcc -DCMAKE_CXX_COMPILER=/usr/lib/ccache/g++ -DYUZU_ENABLE_COMPATIBILITY_REPORTING=${ENABLE_COMPATIBILITY_REPORTING:-"OFF"} -DENABLE_COMPATIBILITY_LIST_DOWNLOAD=ON -DUSE_DISCORD_PRESENCE=ON
 ninja
 
 ccache -s
```
```diff
@@ -4,13 +4,12 @@ set -o pipefail
 
 export MACOSX_DEPLOYMENT_TARGET=10.14
 export Qt5_DIR=$(brew --prefix)/opt/qt5
-export UNICORNDIR=$(pwd)/externals/unicorn
 export PATH="/usr/local/opt/ccache/libexec:$PATH"
 
 # TODO: Build using ninja instead of make
 mkdir build && cd build
 cmake --version
-cmake .. -DYUZU_USE_BUNDLED_UNICORN=ON -DYUZU_USE_QT_WEB_ENGINE=ON -DCMAKE_BUILD_TYPE=Release -DENABLE_COMPATIBILITY_LIST_DOWNLOAD=ON -DYUZU_ENABLE_COMPATIBILITY_REPORTING=${ENABLE_COMPATIBILITY_REPORTING:-"OFF"} -DUSE_DISCORD_PRESENCE=ON
+cmake .. -DYUZU_USE_QT_WEB_ENGINE=ON -DCMAKE_BUILD_TYPE=Release -DENABLE_COMPATIBILITY_LIST_DOWNLOAD=ON -DYUZU_ENABLE_COMPATIBILITY_REPORTING=${ENABLE_COMPATIBILITY_REPORTING:-"OFF"} -DUSE_DISCORD_PRESENCE=ON
 make -j4
 
 ccache -s
```
```diff
@@ -18,8 +18,6 @@ CMAKE_DEPENDENT_OPTION(YUZU_USE_BUNDLED_QT "Download bundled Qt binaries" ON "EN
 
 option(ENABLE_WEB_SERVICE "Enable web services (telemetry, etc.)" ON)
 
-option(YUZU_USE_BUNDLED_UNICORN "Build/Download bundled Unicorn" ON)
-
 option(YUZU_USE_QT_WEB_ENGINE "Use QtWebEngine for web applet implementation" OFF)
 
 option(YUZU_ENABLE_BOXCAT "Enable the Boxcat service, a yuzu high-level implementation of BCAT" ON)
```
```diff
@@ -161,7 +159,7 @@ macro(yuzu_find_packages)
     # Cmake Pkg Prefix  Version  Conan Pkg
     "Boost 1.73 boost/1.73.0"
     "Catch2 2.13 catch2/2.13.0"
-    "fmt 7.1 fmt/7.1.0"
+    "fmt 7.1 fmt/7.1.2"
     # can't use until https://github.com/bincrafters/community/issues/1173
     #"libzip 1.5 libzip/1.5.2@bincrafters/stable"
     "lz4 1.8 lz4/1.9.2"
```
```diff
@@ -372,81 +370,6 @@ endif()
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
 
-# If unicorn isn't found, msvc -> download bundled unicorn; everyone else -> build external
-if (YUZU_USE_BUNDLED_UNICORN)
-    if (MSVC)
-        message(STATUS "unicorn not found, falling back to bundled")
-        # Detect toolchain and platform
-        if ((MSVC_VERSION GREATER_EQUAL 1910 AND MSVC_VERSION LESS 1930) AND ARCHITECTURE_x86_64)
-            set(UNICORN_VER "unicorn-yuzu")
-        else()
-            message(FATAL_ERROR "No bundled Unicorn binaries for your toolchain. Disable YUZU_USE_BUNDLED_UNICORN and provide your own.")
-        endif()
-
-        if (DEFINED UNICORN_VER)
-            download_bundled_external("unicorn/" ${UNICORN_VER} UNICORN_PREFIX)
-        endif()
-
-        set(UNICORN_FOUND YES)
-        set(LIBUNICORN_INCLUDE_DIR "${UNICORN_PREFIX}/include" CACHE PATH "Path to Unicorn headers" FORCE)
-        set(LIBUNICORN_LIBRARY "${UNICORN_PREFIX}/lib/x64/unicorn_dynload.lib" CACHE PATH "Path to Unicorn library" FORCE)
-        set(UNICORN_DLL_DIR "${UNICORN_PREFIX}/lib/x64/" CACHE PATH "Path to unicorn.dll" FORCE)
-    else()
-        message(STATUS "unicorn not found, falling back to externals")
-        if (MINGW)
-            set(UNICORN_LIB_NAME "unicorn.a")
-        else()
-            set(UNICORN_LIB_NAME "libunicorn.a")
-        endif()
-
-        set(UNICORN_FOUND YES)
-        set(UNICORN_PREFIX ${PROJECT_SOURCE_DIR}/externals/unicorn)
-        set(LIBUNICORN_LIBRARY "${UNICORN_PREFIX}/${UNICORN_LIB_NAME}" CACHE PATH "Path to Unicorn library" FORCE)
-        set(LIBUNICORN_INCLUDE_DIR "${UNICORN_PREFIX}/include" CACHE PATH "Path to Unicorn headers" FORCE)
-        set(UNICORN_DLL_DIR "${UNICORN_PREFIX}/" CACHE PATH "Path to unicorn dynamic library" FORCE)
-
-        find_package(PythonInterp 2.7 REQUIRED)
-
-        if (MINGW)
-            # Intentionally call the unicorn makefile directly instead of using make.sh so that we can override the
-            # UNAME_S makefile variable to MINGW. This way we don't have to hack at the uname binary to build
-            # Additionally, overriding DO_WINDOWS_EXPORT prevents unicorn from patching the static unicorn.a by using msvc and cmd,
-            # which are both things we don't have in a mingw cross compiling environment.
-            add_custom_command(OUTPUT ${LIBUNICORN_LIBRARY}
-                COMMAND ${CMAKE_COMMAND} -E env UNICORN_ARCHS="aarch64" PYTHON="${PYTHON_EXECUTABLE}" CC=x86_64-w64-mingw32-gcc AR=x86_64-w64-mingw32-gcc-ar RANLIB=x86_64-w64-mingw32-gcc-ranlib make UNAME_S=MINGW DO_WINDOWS_EXPORT=0
-                WORKING_DIRECTORY ${UNICORN_PREFIX}
-            )
-        else()
-            add_custom_command(OUTPUT ${LIBUNICORN_LIBRARY}
-                COMMAND ${CMAKE_COMMAND} -E env UNICORN_ARCHS="aarch64" PYTHON="${PYTHON_EXECUTABLE}" /bin/sh make.sh macos-universal-no
-                WORKING_DIRECTORY ${UNICORN_PREFIX}
-            )
-        endif()
-
-        # ALL makes this custom target build every time
-        # but it won't actually build if LIBUNICORN_LIBRARY is up to date
-        add_custom_target(unicorn-build ALL
-            DEPENDS ${LIBUNICORN_LIBRARY}
-        )
-        unset(UNICORN_LIB_NAME)
-    endif()
-else()
-    find_package(Unicorn REQUIRED)
-endif()
-
-if (UNICORN_FOUND)
-    add_library(unicorn INTERFACE)
-    add_dependencies(unicorn unicorn-build)
-    target_link_libraries(unicorn INTERFACE "${LIBUNICORN_LIBRARY}")
-    target_include_directories(unicorn INTERFACE "${LIBUNICORN_INCLUDE_DIR}")
-else()
-    message(FATAL_ERROR "Could not find or build unicorn which is required.")
-endif()
-
 # Platform-specific library requirements
 # ======================================
```
```diff
@@ -1,9 +0,0 @@
-function(copy_yuzu_unicorn_deps target_dir)
-    include(WindowsCopyFiles)
-    set(DLL_DEST "${CMAKE_BINARY_DIR}/bin/$<CONFIG>/")
-    windows_copy_files(${target_dir} ${UNICORN_DLL_DIR} ${DLL_DEST}
-        libgcc_s_seh-1.dll
-        libwinpthread-1.dll
-        unicorn.dll
-    )
-endfunction(copy_yuzu_unicorn_deps)
```
externals/unicorn (vendored, 1 change)

Submodule externals/unicorn deleted from 73f4573535
```diff
@@ -4,6 +4,8 @@
 
+#include "common/assert.h"
 #include "common/fiber.h"
+#include "common/spin_lock.h"
 
 #if defined(_WIN32) || defined(WIN32)
 #include <windows.h>
 #else
```
```diff
@@ -14,18 +16,45 @@ namespace Common {
 
 constexpr std::size_t default_stack_size = 256 * 1024; // 256kb
 
-#if defined(_WIN32) || defined(WIN32)
-
 struct Fiber::FiberImpl {
+    SpinLock guard{};
+    std::function<void(void*)> entry_point;
+    std::function<void(void*)> rewind_point;
+    void* rewind_parameter{};
+    void* start_parameter{};
+    std::shared_ptr<Fiber> previous_fiber;
+    bool is_thread_fiber{};
+    bool released{};
+
+#if defined(_WIN32) || defined(WIN32)
     LPVOID handle = nullptr;
     LPVOID rewind_handle = nullptr;
+#else
+    alignas(64) std::array<u8, default_stack_size> stack;
+    alignas(64) std::array<u8, default_stack_size> rewind_stack;
+    u8* stack_limit;
+    u8* rewind_stack_limit;
+    boost::context::detail::fcontext_t context;
+    boost::context::detail::fcontext_t rewind_context;
+#endif
 };
 
+void Fiber::SetStartParameter(void* new_parameter) {
+    impl->start_parameter = new_parameter;
+}
+
+void Fiber::SetRewindPoint(std::function<void(void*)>&& rewind_func, void* rewind_param) {
+    impl->rewind_point = std::move(rewind_func);
+    impl->rewind_parameter = rewind_param;
+}
+
+#if defined(_WIN32) || defined(WIN32)
+
 void Fiber::Start() {
-    ASSERT(previous_fiber != nullptr);
-    previous_fiber->guard.unlock();
-    previous_fiber.reset();
-    entry_point(start_parameter);
+    ASSERT(impl->previous_fiber != nullptr);
+    impl->previous_fiber->impl->guard.unlock();
+    impl->previous_fiber.reset();
+    impl->entry_point(impl->start_parameter);
     UNREACHABLE();
 }
```
```diff
@@ -34,58 +63,54 @@ void Fiber::OnRewind() {
     DeleteFiber(impl->handle);
     impl->handle = impl->rewind_handle;
     impl->rewind_handle = nullptr;
-    rewind_point(rewind_parameter);
+    impl->rewind_point(impl->rewind_parameter);
     UNREACHABLE();
 }
 
 void Fiber::FiberStartFunc(void* fiber_parameter) {
-    auto fiber = static_cast<Fiber*>(fiber_parameter);
+    auto* fiber = static_cast<Fiber*>(fiber_parameter);
     fiber->Start();
 }
 
 void Fiber::RewindStartFunc(void* fiber_parameter) {
-    auto fiber = static_cast<Fiber*>(fiber_parameter);
+    auto* fiber = static_cast<Fiber*>(fiber_parameter);
     fiber->OnRewind();
 }
 
 Fiber::Fiber(std::function<void(void*)>&& entry_point_func, void* start_parameter)
-    : entry_point{std::move(entry_point_func)}, start_parameter{start_parameter} {
-    impl = std::make_unique<FiberImpl>();
+    : impl{std::make_unique<FiberImpl>()} {
+    impl->entry_point = std::move(entry_point_func);
+    impl->start_parameter = start_parameter;
     impl->handle = CreateFiber(default_stack_size, &FiberStartFunc, this);
 }
 
 Fiber::Fiber() : impl{std::make_unique<FiberImpl>()} {}
 
 Fiber::~Fiber() {
-    if (released) {
+    if (impl->released) {
         return;
     }
     // Make sure the Fiber is not being used
-    const bool locked = guard.try_lock();
+    const bool locked = impl->guard.try_lock();
     ASSERT_MSG(locked, "Destroying a fiber that's still running");
     if (locked) {
-        guard.unlock();
+        impl->guard.unlock();
     }
     DeleteFiber(impl->handle);
 }
 
 void Fiber::Exit() {
-    ASSERT_MSG(is_thread_fiber, "Exitting non main thread fiber");
-    if (!is_thread_fiber) {
+    ASSERT_MSG(impl->is_thread_fiber, "Exitting non main thread fiber");
+    if (!impl->is_thread_fiber) {
         return;
     }
     ConvertFiberToThread();
-    guard.unlock();
-    released = true;
-}
-
-void Fiber::SetRewindPoint(std::function<void(void*)>&& rewind_func, void* rewind_param) {
-    rewind_point = std::move(rewind_func);
-    rewind_parameter = rewind_param;
+    impl->guard.unlock();
+    impl->released = true;
 }
 
 void Fiber::Rewind() {
-    ASSERT(rewind_point);
+    ASSERT(impl->rewind_point);
     ASSERT(impl->rewind_handle == nullptr);
     impl->rewind_handle = CreateFiber(default_stack_size, &RewindStartFunc, this);
     SwitchToFiber(impl->rewind_handle);
```
```diff
@@ -94,39 +119,30 @@ void Fiber::Rewind() {
 void Fiber::YieldTo(std::shared_ptr<Fiber> from, std::shared_ptr<Fiber> to) {
     ASSERT_MSG(from != nullptr, "Yielding fiber is null!");
     ASSERT_MSG(to != nullptr, "Next fiber is null!");
-    to->guard.lock();
-    to->previous_fiber = from;
+    to->impl->guard.lock();
+    to->impl->previous_fiber = from;
     SwitchToFiber(to->impl->handle);
-    ASSERT(from->previous_fiber != nullptr);
-    from->previous_fiber->guard.unlock();
-    from->previous_fiber.reset();
+    ASSERT(from->impl->previous_fiber != nullptr);
+    from->impl->previous_fiber->impl->guard.unlock();
+    from->impl->previous_fiber.reset();
 }
 
 std::shared_ptr<Fiber> Fiber::ThreadToFiber() {
     std::shared_ptr<Fiber> fiber = std::shared_ptr<Fiber>{new Fiber()};
-    fiber->guard.lock();
+    fiber->impl->guard.lock();
     fiber->impl->handle = ConvertThreadToFiber(nullptr);
-    fiber->is_thread_fiber = true;
+    fiber->impl->is_thread_fiber = true;
     return fiber;
 }
 
 #else
 
-struct Fiber::FiberImpl {
-    alignas(64) std::array<u8, default_stack_size> stack;
-    alignas(64) std::array<u8, default_stack_size> rewind_stack;
-    u8* stack_limit;
-    u8* rewind_stack_limit;
-    boost::context::detail::fcontext_t context;
-    boost::context::detail::fcontext_t rewind_context;
-};
-
 void Fiber::Start(boost::context::detail::transfer_t& transfer) {
-    ASSERT(previous_fiber != nullptr);
-    previous_fiber->impl->context = transfer.fctx;
-    previous_fiber->guard.unlock();
-    previous_fiber.reset();
-    entry_point(start_parameter);
+    ASSERT(impl->previous_fiber != nullptr);
+    impl->previous_fiber->impl->context = transfer.fctx;
+    impl->previous_fiber->impl->guard.unlock();
+    impl->previous_fiber.reset();
+    impl->entry_point(impl->start_parameter);
     UNREACHABLE();
 }
```
```diff
@@ -137,23 +153,24 @@ void Fiber::OnRewind([[maybe_unused]] boost::context::detail::transfer_t& transf
     u8* tmp = impl->stack_limit;
     impl->stack_limit = impl->rewind_stack_limit;
     impl->rewind_stack_limit = tmp;
-    rewind_point(rewind_parameter);
+    impl->rewind_point(impl->rewind_parameter);
     UNREACHABLE();
 }
 
 void Fiber::FiberStartFunc(boost::context::detail::transfer_t transfer) {
-    auto fiber = static_cast<Fiber*>(transfer.data);
+    auto* fiber = static_cast<Fiber*>(transfer.data);
     fiber->Start(transfer);
 }
 
 void Fiber::RewindStartFunc(boost::context::detail::transfer_t transfer) {
-    auto fiber = static_cast<Fiber*>(transfer.data);
+    auto* fiber = static_cast<Fiber*>(transfer.data);
     fiber->OnRewind(transfer);
 }
 
 Fiber::Fiber(std::function<void(void*)>&& entry_point_func, void* start_parameter)
-    : entry_point{std::move(entry_point_func)}, start_parameter{start_parameter} {
-    impl = std::make_unique<FiberImpl>();
+    : impl{std::make_unique<FiberImpl>()} {
+    impl->entry_point = std::move(entry_point_func);
+    impl->start_parameter = start_parameter;
     impl->stack_limit = impl->stack.data();
     impl->rewind_stack_limit = impl->rewind_stack.data();
     u8* stack_base = impl->stack_limit + default_stack_size;
```
```diff
@@ -161,37 +178,31 @@ Fiber::Fiber(std::function<void(void*)>&& entry_point_func, void* start_paramete
         boost::context::detail::make_fcontext(stack_base, impl->stack.size(), FiberStartFunc);
 }
 
-void Fiber::SetRewindPoint(std::function<void(void*)>&& rewind_func, void* rewind_param) {
-    rewind_point = std::move(rewind_func);
-    rewind_parameter = rewind_param;
-}
-
 Fiber::Fiber() : impl{std::make_unique<FiberImpl>()} {}
 
 Fiber::~Fiber() {
-    if (released) {
+    if (impl->released) {
         return;
     }
     // Make sure the Fiber is not being used
-    const bool locked = guard.try_lock();
+    const bool locked = impl->guard.try_lock();
     ASSERT_MSG(locked, "Destroying a fiber that's still running");
     if (locked) {
-        guard.unlock();
+        impl->guard.unlock();
     }
 }
 
 void Fiber::Exit() {
-
-    ASSERT_MSG(is_thread_fiber, "Exitting non main thread fiber");
-    if (!is_thread_fiber) {
+    ASSERT_MSG(impl->is_thread_fiber, "Exitting non main thread fiber");
+    if (!impl->is_thread_fiber) {
         return;
     }
-    guard.unlock();
-    released = true;
+    impl->guard.unlock();
+    impl->released = true;
 }
 
 void Fiber::Rewind() {
-    ASSERT(rewind_point);
+    ASSERT(impl->rewind_point);
     ASSERT(impl->rewind_context == nullptr);
     u8* stack_base = impl->rewind_stack_limit + default_stack_size;
     impl->rewind_context =
```
```diff
@@ -202,19 +213,19 @@ void Fiber::Rewind() {
 void Fiber::YieldTo(std::shared_ptr<Fiber> from, std::shared_ptr<Fiber> to) {
     ASSERT_MSG(from != nullptr, "Yielding fiber is null!");
     ASSERT_MSG(to != nullptr, "Next fiber is null!");
-    to->guard.lock();
-    to->previous_fiber = from;
+    to->impl->guard.lock();
+    to->impl->previous_fiber = from;
     auto transfer = boost::context::detail::jump_fcontext(to->impl->context, to.get());
-    ASSERT(from->previous_fiber != nullptr);
-    from->previous_fiber->impl->context = transfer.fctx;
-    from->previous_fiber->guard.unlock();
-    from->previous_fiber.reset();
+    ASSERT(from->impl->previous_fiber != nullptr);
+    from->impl->previous_fiber->impl->context = transfer.fctx;
+    from->impl->previous_fiber->impl->guard.unlock();
+    from->impl->previous_fiber.reset();
 }
 
 std::shared_ptr<Fiber> Fiber::ThreadToFiber() {
     std::shared_ptr<Fiber> fiber = std::shared_ptr<Fiber>{new Fiber()};
-    fiber->guard.lock();
-    fiber->is_thread_fiber = true;
+    fiber->impl->guard.lock();
+    fiber->impl->is_thread_fiber = true;
     return fiber;
 }
```
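For orientation, and not part of the diff itself: the public Fiber interface is unchanged by this refactor. A minimal usage sketch, assuming only the semantics visible in the hunks above (ThreadToFiber adopts the calling thread, YieldTo transfers control, Exit converts back):

```cpp
// Hypothetical usage of Common::Fiber; illustrative only, not yuzu code.
#include <memory>
#include "common/fiber.h"

static std::shared_ptr<Common::Fiber> thread_fiber;
static std::shared_ptr<Common::Fiber> worker_fiber;

static void WorkerEntry(void* /*start_parameter*/) {
    // ... cooperative work happens here ...
    Common::Fiber::YieldTo(worker_fiber, thread_fiber); // hand control back to the thread
}

static void RunOnce() {
    thread_fiber = Common::Fiber::ThreadToFiber();      // adopt the current thread as a fiber
    worker_fiber = std::make_shared<Common::Fiber>(WorkerEntry, nullptr);
    Common::Fiber::YieldTo(thread_fiber, worker_fiber); // runs WorkerEntry until it yields back
    thread_fiber->Exit();                               // restore the plain thread
}
```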
```diff
@@ -7,9 +7,6 @@
 #include <functional>
 #include <memory>
 
 #include "common/common_types.h"
-#include "common/spin_lock.h"
-
-#if !defined(_WIN32) && !defined(WIN32)
 namespace boost::context::detail {
 struct transfer_t;
```
```diff
@@ -57,9 +54,7 @@ public:
     void Exit();
 
     /// Changes the start parameter of the fiber. Has no effect if the fiber already started
-    void SetStartParameter(void* new_parameter) {
-        start_parameter = new_parameter;
-    }
+    void SetStartParameter(void* new_parameter);
 
 private:
     Fiber();
```
```diff
@@ -77,16 +72,7 @@ private:
 #endif
 
     struct FiberImpl;
 
-    SpinLock guard{};
-    std::function<void(void*)> entry_point;
-    std::function<void(void*)> rewind_point;
-    void* rewind_parameter{};
-    void* start_parameter{};
-    std::shared_ptr<Fiber> previous_fiber;
     std::unique_ptr<FiberImpl> impl;
-    bool is_thread_fiber{};
-    bool released{};
 };
 
 } // namespace Common
```
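The fiber.h changes are a textbook pimpl (pointer-to-implementation) refactor: every data member moves behind a forward-declared `FiberImpl`, so the header no longer needs spin_lock.h and member changes no longer ripple recompiles through every includer. A generic sketch of the pattern with illustrative names (not yuzu code):

```cpp
// widget.h and widget.cpp shown together; the comments mark the split.
#include <memory>

// --- widget.h ---
class Widget {
public:
    Widget();
    ~Widget(); // must be defined where Impl is a complete type
    void Frob();

private:
    struct Impl;                // forward declaration only; layout is hidden
    std::unique_ptr<Impl> impl; // all state lives behind this pointer
};

// --- widget.cpp ---
struct Widget::Impl {
    int counter = 0;
};

Widget::Widget() : impl{std::make_unique<Impl>()} {}
Widget::~Widget() = default; // unique_ptr<Impl> is destroyed here, where Impl is complete

void Widget::Frob() {
    ++impl->counter;
}
```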
```diff
@@ -15,6 +15,14 @@ namespace Common {
  */
 class SpinLock {
 public:
+    SpinLock() = default;
+
+    SpinLock(const SpinLock&) = delete;
+    SpinLock& operator=(const SpinLock&) = delete;
+
+    SpinLock(SpinLock&&) = delete;
+    SpinLock& operator=(SpinLock&&) = delete;
+
     void lock();
     void unlock();
     [[nodiscard]] bool try_lock();
```
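Deleting the copy and move operations is the right call for a lock type: duplicating or relocating a held lock would desynchronize its waiters. The header only declares lock/unlock/try_lock (the interface std::scoped_lock and std::unique_lock expect); the definition lives in spin_lock.cpp. As a rough sketch of what such a lock typically looks like, an assumption rather than yuzu's exact implementation:

```cpp
// Minimal test-and-set spin lock matching the interface above.
// A production version may add a pause/yield hint inside the spin loop.
#include <atomic>

class SpinLock {
public:
    void lock() {
        // Spin until test_and_set returns false, i.e. we took the flag.
        while (lck.test_and_set(std::memory_order_acquire)) {
        }
    }
    void unlock() {
        lck.clear(std::memory_order_release);
    }
    [[nodiscard]] bool try_lock() {
        return !lck.test_and_set(std::memory_order_acquire);
    }

private:
    std::atomic_flag lck = ATOMIC_FLAG_INIT;
};
```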
```diff
@@ -13,8 +13,6 @@ add_library(core STATIC
     arm/dynarmic/arm_exclusive_monitor.h
     arm/exclusive_monitor.cpp
     arm/exclusive_monitor.h
-    arm/unicorn/arm_unicorn.cpp
-    arm/unicorn/arm_unicorn.h
     constants.cpp
     constants.h
     core.cpp
```
```diff
@@ -454,6 +452,8 @@ add_library(core STATIC
     hle/service/nvdrv/nvdrv.h
     hle/service/nvdrv/nvmemp.cpp
     hle/service/nvdrv/nvmemp.h
+    hle/service/nvdrv/syncpoint_manager.cpp
+    hle/service/nvdrv/syncpoint_manager.h
     hle/service/nvflinger/buffer_queue.cpp
     hle/service/nvflinger/buffer_queue.h
     hle/service/nvflinger/nvflinger.cpp
```
```diff
@@ -644,7 +644,7 @@ endif()
 create_target_directory_groups(core)
 
 target_link_libraries(core PUBLIC common PRIVATE audio_core video_core)
-target_link_libraries(core PUBLIC Boost::boost PRIVATE fmt::fmt nlohmann_json::nlohmann_json mbedtls opus unicorn zip)
+target_link_libraries(core PUBLIC Boost::boost PRIVATE fmt::fmt nlohmann_json::nlohmann_json mbedtls opus zip)
 
 if (YUZU_ENABLE_BOXCAT)
     target_compile_definitions(core PRIVATE -DYUZU_ENABLE_BOXCAT)
```
```diff
@@ -147,10 +147,18 @@ std::vector<ARM_Interface::BacktraceEntry> ARM_Interface::GetBacktraceFromContex
     auto fp = ctx.cpu_registers[29];
     auto lr = ctx.cpu_registers[30];
     while (true) {
-        out.push_back({"", 0, lr, 0});
-        if (!fp) {
+        out.push_back({
+            .module = "",
+            .address = 0,
+            .original_address = lr,
+            .offset = 0,
+            .name = {},
+        });
+
+        if (fp == 0) {
             break;
         }
+
         lr = memory.Read64(fp + 8) - 4;
         fp = memory.Read64(fp);
     }
```
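The loop above walks the AArch64 frame-pointer chain: under that ABI each frame records the previous frame pointer at [fp] and the saved link register at [fp+8], and subtracting 4 turns the return address back into the address of the BL itself. A standalone sketch of the same walk, with a hypothetical read64 callback standing in for memory.Read64:

```cpp
#include <cstdint>
#include <vector>

// Walk an AArch64 frame-pointer chain. 'read64' abstracts guest memory access;
// fp/lr are the values of x29/x30 captured from the thread context.
template <typename Read64>
std::vector<std::uint64_t> WalkBacktrace(std::uint64_t fp, std::uint64_t lr, Read64&& read64) {
    std::vector<std::uint64_t> call_sites;
    while (true) {
        call_sites.push_back(lr);
        if (fp == 0) {
            break; // reached the root frame
        }
        lr = read64(fp + 8) - 4; // saved LR points after the call; back up to the BL
        fp = read64(fp);         // follow the chain to the previous frame
    }
    return call_sites;
}
```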
```diff
@@ -21,8 +21,8 @@ public:
     CPUInterruptHandler(const CPUInterruptHandler&) = delete;
     CPUInterruptHandler& operator=(const CPUInterruptHandler&) = delete;
 
-    CPUInterruptHandler(CPUInterruptHandler&&) = default;
-    CPUInterruptHandler& operator=(CPUInterruptHandler&&) = default;
+    CPUInterruptHandler(CPUInterruptHandler&&) = delete;
+    CPUInterruptHandler& operator=(CPUInterruptHandler&&) = delete;
 
     bool IsInterrupted() const {
         return is_interrupted;
```
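Turning the defaulted moves into deleted ones pins the handler in place: other components hold references to it, so relocating the object would leave those references dangling. A minimal illustration of the pattern with an invented type name:

```cpp
// A type whose address is shared with other components should be immovable:
// deleting both copy and move operations makes accidental relocation a compile error.
class PinnedHandler {
public:
    PinnedHandler() = default;

    PinnedHandler(const PinnedHandler&) = delete;
    PinnedHandler& operator=(const PinnedHandler&) = delete;

    PinnedHandler(PinnedHandler&&) = delete;
    PinnedHandler& operator=(PinnedHandler&&) = delete;
};
```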
```diff
@@ -7,6 +7,7 @@
 #include <dynarmic/A32/a32.h>
 #include <dynarmic/A32/config.h>
 #include <dynarmic/A32/context.h>
+#include "common/assert.h"
 #include "common/logging/log.h"
 #include "common/page_table.h"
 #include "core/arm/cpu_interrupt_handler.h"
```
```diff
@@ -6,6 +6,7 @@
 #include <memory>
 #include <dynarmic/A64/a64.h>
 #include <dynarmic/A64/config.h>
+#include "common/assert.h"
 #include "common/logging/log.h"
 #include "common/page_table.h"
 #include "core/arm/cpu_interrupt_handler.h"
```
```diff
@@ -13,7 +14,6 @@
 #include "core/arm/dynarmic/arm_exclusive_monitor.h"
 #include "core/core.h"
 #include "core/core_timing.h"
 #include "core/core_timing_util.h"
-#include "core/gdbstub/gdbstub.h"
 #include "core/hardware_properties.h"
 #include "core/hle/kernel/process.h"
```
```diff
@@ -82,16 +82,9 @@ public:
     }
 
     void InterpreterFallback(u64 pc, std::size_t num_instructions) override {
-        LOG_INFO(Core_ARM, "Unicorn fallback @ 0x{:X} for {} instructions (instr = {:08X})", pc,
-                 num_instructions, MemoryReadCode(pc));
-
-        ARM_Interface::ThreadContext64 ctx;
-        parent.SaveContext(ctx);
-        parent.inner_unicorn.LoadContext(ctx);
-        parent.inner_unicorn.ExecuteInstructions(num_instructions);
-        parent.inner_unicorn.SaveContext(ctx);
-        parent.LoadContext(ctx);
-        num_interpreted_instructions += num_instructions;
+        LOG_ERROR(Core_ARM,
+                  "Unimplemented instruction @ 0x{:X} for {} instructions (instr = {:08X})", pc,
+                  num_instructions, MemoryReadCode(pc));
     }
 
     void ExceptionRaised(u64 pc, Dynarmic::A64::Exception exception) override {
```
```diff
@@ -127,18 +120,17 @@ public:
         if (parent.uses_wall_clock) {
             return;
         }
+
         // Divide the number of ticks by the amount of CPU cores. TODO(Subv): This yields only a
         // rough approximation of the amount of executed ticks in the system, it may be thrown off
         // if not all cores are doing a similar amount of work. Instead of doing this, we should
         // device a way so that timing is consistent across all cores without increasing the ticks 4
         // times.
-        u64 amortized_ticks =
-            (ticks - num_interpreted_instructions) / Core::Hardware::NUM_CPU_CORES;
+        u64 amortized_ticks = ticks / Core::Hardware::NUM_CPU_CORES;
         // Always execute at least one tick.
         amortized_ticks = std::max<u64>(amortized_ticks, 1);
 
         parent.system.CoreTiming().AddTicks(amortized_ticks);
-        num_interpreted_instructions = 0;
     }
 
     u64 GetTicksRemaining() override {
```
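With the Unicorn interpreter fallback gone there are no interpreted instructions to subtract, so the amortization collapses to a plain division across cores. A worked example of the arithmetic, with invented tick counts:

```cpp
#include <algorithm>
#include <cstdint>

// 4000 guest ticks for a slice spread over 4 cores gives 1000 ticks per core;
// the max() keeps forward progress when truncation would yield zero.
constexpr std::uint64_t NUM_CPU_CORES = 4;

constexpr std::uint64_t AmortizedTicks(std::uint64_t ticks) {
    return std::max<std::uint64_t>(ticks / NUM_CPU_CORES, 1);
}

static_assert(AmortizedTicks(4000) == 1000);
static_assert(AmortizedTicks(3) == 1); // 3 / 4 == 0, clamped to one tick
```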
```diff
@@ -156,7 +148,6 @@ public:
     }
 
     ARM_Dynarmic_64& parent;
-    std::size_t num_interpreted_instructions = 0;
     u64 tpidrro_el0 = 0;
     u64 tpidr_el0 = 0;
     static constexpr u64 minimum_run_cycles = 1000U;
```
```diff
@@ -248,12 +239,8 @@ ARM_Dynarmic_64::ARM_Dynarmic_64(System& system, CPUInterrupts& interrupt_handle
                                  bool uses_wall_clock, ExclusiveMonitor& exclusive_monitor,
                                  std::size_t core_index)
     : ARM_Interface{system, interrupt_handlers, uses_wall_clock},
-      cb(std::make_unique<DynarmicCallbacks64>(*this)), inner_unicorn{system, interrupt_handlers,
-                                                                      uses_wall_clock,
-                                                                      ARM_Unicorn::Arch::AArch64,
-                                                                      core_index},
-      core_index{core_index}, exclusive_monitor{
-                                  dynamic_cast<DynarmicExclusiveMonitor&>(exclusive_monitor)} {}
+      cb(std::make_unique<DynarmicCallbacks64>(*this)), core_index{core_index},
+      exclusive_monitor{dynamic_cast<DynarmicExclusiveMonitor&>(exclusive_monitor)} {}
 
 ARM_Dynarmic_64::~ARM_Dynarmic_64() = default;
```
```diff
@@ -12,7 +12,6 @@
 #include "common/hash.h"
 #include "core/arm/arm_interface.h"
 #include "core/arm/exclusive_monitor.h"
-#include "core/arm/unicorn/arm_unicorn.h"
 
 namespace Core::Memory {
 class Memory;
```
```diff
@@ -71,7 +70,6 @@ private:
     std::unique_ptr<DynarmicCallbacks64> cb;
     JitCacheType jit_cache;
     std::shared_ptr<Dynarmic::A64::Jit> jit;
-    ARM_Unicorn inner_unicorn;
 
     std::size_t core_index;
     DynarmicExclusiveMonitor& exclusive_monitor;
```
```diff
@@ -1,295 +0,0 @@
-// Copyright 2018 yuzu emulator team
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#include <algorithm>
-#include <unicorn/arm64.h>
-#include "common/assert.h"
-#include "common/microprofile.h"
-#include "core/arm/cpu_interrupt_handler.h"
-#include "core/arm/unicorn/arm_unicorn.h"
-#include "core/core.h"
-#include "core/core_timing.h"
-#include "core/hle/kernel/scheduler.h"
-#include "core/hle/kernel/svc.h"
-#include "core/memory.h"
-
-namespace Core {
-
-// Load Unicorn DLL once on Windows using RAII
-#ifdef _MSC_VER
-#include <unicorn_dynload.h>
-struct LoadDll {
-private:
-    LoadDll() {
-        ASSERT(uc_dyn_load(NULL, 0));
-    }
-    ~LoadDll() {
-        ASSERT(uc_dyn_free());
-    }
-    static LoadDll g_load_dll;
-};
-LoadDll LoadDll::g_load_dll;
-#endif
-
-#define CHECKED(expr)                                                                              \
-    do {                                                                                           \
-        if (auto _cerr = (expr)) {                                                                 \
-            ASSERT_MSG(false, "Call " #expr " failed with error: {} ({})\n", _cerr,                \
-                       uc_strerror(_cerr));                                                        \
-        }                                                                                          \
-    } while (0)
-
-static void CodeHook(uc_engine* uc, uint64_t address, uint32_t size, void* user_data) {
-    GDBStub::BreakpointAddress bkpt =
-        GDBStub::GetNextBreakpointFromAddress(address, GDBStub::BreakpointType::Execute);
-    if (GDBStub::IsMemoryBreak() ||
-        (bkpt.type != GDBStub::BreakpointType::None && address == bkpt.address)) {
-        auto core = static_cast<ARM_Unicorn*>(user_data);
-        core->RecordBreak(bkpt);
-        uc_emu_stop(uc);
-    }
-}
-
-static bool UnmappedMemoryHook(uc_engine* uc, uc_mem_type type, u64 addr, int size, u64 value,
-                               void* user_data) {
-    auto* const system = static_cast<System*>(user_data);
-
-    ARM_Interface::ThreadContext64 ctx{};
-    system->CurrentArmInterface().SaveContext(ctx);
-    ASSERT_MSG(false, "Attempted to read from unmapped memory: 0x{:X}, pc=0x{:X}, lr=0x{:X}", addr,
-               ctx.pc, ctx.cpu_registers[30]);
-
-    return false;
-}
-
-ARM_Unicorn::ARM_Unicorn(System& system, CPUInterrupts& interrupt_handlers, bool uses_wall_clock,
-                         Arch architecture, std::size_t core_index)
-    : ARM_Interface{system, interrupt_handlers, uses_wall_clock}, core_index{core_index} {
-    const auto arch = architecture == Arch::AArch32 ? UC_ARCH_ARM : UC_ARCH_ARM64;
-    CHECKED(uc_open(arch, UC_MODE_ARM, &uc));
-
-    auto fpv = 3 << 20;
-    CHECKED(uc_reg_write(uc, UC_ARM64_REG_CPACR_EL1, &fpv));
-
-    uc_hook hook{};
-    CHECKED(uc_hook_add(uc, &hook, UC_HOOK_INTR, (void*)InterruptHook, this, 0, UINT64_MAX));
-    CHECKED(uc_hook_add(uc, &hook, UC_HOOK_MEM_INVALID, (void*)UnmappedMemoryHook, &system, 0,
-                        UINT64_MAX));
-    if (GDBStub::IsServerEnabled()) {
-        CHECKED(uc_hook_add(uc, &hook, UC_HOOK_CODE, (void*)CodeHook, this, 0, UINT64_MAX));
-        last_bkpt_hit = false;
-    }
-}
-
-ARM_Unicorn::~ARM_Unicorn() {
-    CHECKED(uc_close(uc));
-}
-
-void ARM_Unicorn::SetPC(u64 pc) {
-    CHECKED(uc_reg_write(uc, UC_ARM64_REG_PC, &pc));
-}
-
-u64 ARM_Unicorn::GetPC() const {
-    u64 val{};
-    CHECKED(uc_reg_read(uc, UC_ARM64_REG_PC, &val));
-    return val;
-}
-
-u64 ARM_Unicorn::GetReg(int regn) const {
-    u64 val{};
-    auto treg = UC_ARM64_REG_SP;
-    if (regn <= 28) {
-        treg = (uc_arm64_reg)(UC_ARM64_REG_X0 + regn);
-    } else if (regn < 31) {
-        treg = (uc_arm64_reg)(UC_ARM64_REG_X29 + regn - 29);
-    }
-    CHECKED(uc_reg_read(uc, treg, &val));
-    return val;
-}
-
-void ARM_Unicorn::SetReg(int regn, u64 val) {
-    auto treg = UC_ARM64_REG_SP;
-    if (regn <= 28) {
-        treg = (uc_arm64_reg)(UC_ARM64_REG_X0 + regn);
-    } else if (regn < 31) {
-        treg = (uc_arm64_reg)(UC_ARM64_REG_X29 + regn - 29);
-    }
-    CHECKED(uc_reg_write(uc, treg, &val));
-}
-
-u128 ARM_Unicorn::GetVectorReg(int /*index*/) const {
-    UNIMPLEMENTED();
-    static constexpr u128 res{};
-    return res;
-}
-
-void ARM_Unicorn::SetVectorReg(int /*index*/, u128 /*value*/) {
-    UNIMPLEMENTED();
-}
-
-u32 ARM_Unicorn::GetPSTATE() const {
-    u64 nzcv{};
-    CHECKED(uc_reg_read(uc, UC_ARM64_REG_NZCV, &nzcv));
-    return static_cast<u32>(nzcv);
-}
-
-void ARM_Unicorn::SetPSTATE(u32 pstate) {
-    u64 nzcv = pstate;
-    CHECKED(uc_reg_write(uc, UC_ARM64_REG_NZCV, &nzcv));
-}
-
-VAddr ARM_Unicorn::GetTlsAddress() const {
-    u64 base{};
-    CHECKED(uc_reg_read(uc, UC_ARM64_REG_TPIDRRO_EL0, &base));
-    return base;
-}
-
-void ARM_Unicorn::SetTlsAddress(VAddr base) {
-    CHECKED(uc_reg_write(uc, UC_ARM64_REG_TPIDRRO_EL0, &base));
-}
-
-u64 ARM_Unicorn::GetTPIDR_EL0() const {
-    u64 value{};
-    CHECKED(uc_reg_read(uc, UC_ARM64_REG_TPIDR_EL0, &value));
-    return value;
-}
-
-void ARM_Unicorn::SetTPIDR_EL0(u64 value) {
-    CHECKED(uc_reg_write(uc, UC_ARM64_REG_TPIDR_EL0, &value));
-}
-
-void ARM_Unicorn::ChangeProcessorID(std::size_t new_core_id) {
-    core_index = new_core_id;
-}
-
-void ARM_Unicorn::Run() {
-    if (GDBStub::IsServerEnabled()) {
-        ExecuteInstructions(std::max(4000000U, 0U));
-    } else {
-        while (true) {
-            if (interrupt_handlers[core_index].IsInterrupted()) {
-                return;
-            }
-            ExecuteInstructions(10);
-        }
-    }
-}
-
-void ARM_Unicorn::Step() {
-    ExecuteInstructions(1);
-}
-
-MICROPROFILE_DEFINE(ARM_Jit_Unicorn, "ARM JIT", "Unicorn", MP_RGB(255, 64, 64));
-
-void ARM_Unicorn::ExecuteInstructions(std::size_t num_instructions) {
-    MICROPROFILE_SCOPE(ARM_Jit_Unicorn);
-
-    // Temporarily map the code page for Unicorn
-    u64 map_addr{GetPC() & ~Memory::PAGE_MASK};
-    std::vector<u8> page_buffer(Memory::PAGE_SIZE);
-    system.Memory().ReadBlock(map_addr, page_buffer.data(), page_buffer.size());
-
-    CHECKED(uc_mem_map_ptr(uc, map_addr, page_buffer.size(),
-                           UC_PROT_READ | UC_PROT_WRITE | UC_PROT_EXEC, page_buffer.data()));
-    CHECKED(uc_emu_start(uc, GetPC(), 1ULL << 63, 0, num_instructions));
-    CHECKED(uc_mem_unmap(uc, map_addr, page_buffer.size()));
-    if (GDBStub::IsServerEnabled()) {
-        if (last_bkpt_hit && last_bkpt.type == GDBStub::BreakpointType::Execute) {
-            uc_reg_write(uc, UC_ARM64_REG_PC, &last_bkpt.address);
-        }
-
-        Kernel::Thread* const thread = system.CurrentScheduler().GetCurrentThread();
-        SaveContext(thread->GetContext64());
-        if (last_bkpt_hit || GDBStub::IsMemoryBreak() || GDBStub::GetCpuStepFlag()) {
-            last_bkpt_hit = false;
-            GDBStub::Break();
-            GDBStub::SendTrap(thread, 5);
-        }
-    }
-}
-
-void ARM_Unicorn::SaveContext(ThreadContext64& ctx) {
-    int uregs[32];
-    void* tregs[32];
-
-    CHECKED(uc_reg_read(uc, UC_ARM64_REG_SP, &ctx.sp));
-    CHECKED(uc_reg_read(uc, UC_ARM64_REG_PC, &ctx.pc));
-    CHECKED(uc_reg_read(uc, UC_ARM64_REG_NZCV, &ctx.pstate));
-
-    for (auto i = 0; i < 29; ++i) {
-        uregs[i] = UC_ARM64_REG_X0 + i;
-        tregs[i] = &ctx.cpu_registers[i];
-    }
-    uregs[29] = UC_ARM64_REG_X29;
-    tregs[29] = (void*)&ctx.cpu_registers[29];
-    uregs[30] = UC_ARM64_REG_X30;
-    tregs[30] = (void*)&ctx.cpu_registers[30];
-
-    CHECKED(uc_reg_read_batch(uc, uregs, tregs, 31));
-
-    for (int i = 0; i < 32; ++i) {
-        uregs[i] = UC_ARM64_REG_Q0 + i;
-        tregs[i] = &ctx.vector_registers[i];
-    }
-
-    CHECKED(uc_reg_read_batch(uc, uregs, tregs, 32));
-}
-
-void ARM_Unicorn::LoadContext(const ThreadContext64& ctx) {
-    int uregs[32];
-    void* tregs[32];
-
-    CHECKED(uc_reg_write(uc, UC_ARM64_REG_SP, &ctx.sp));
-    CHECKED(uc_reg_write(uc, UC_ARM64_REG_PC, &ctx.pc));
-    CHECKED(uc_reg_write(uc, UC_ARM64_REG_NZCV, &ctx.pstate));
-
-    for (int i = 0; i < 29; ++i) {
-        uregs[i] = UC_ARM64_REG_X0 + i;
-        tregs[i] = (void*)&ctx.cpu_registers[i];
-    }
-    uregs[29] = UC_ARM64_REG_X29;
-    tregs[29] = (void*)&ctx.cpu_registers[29];
-    uregs[30] = UC_ARM64_REG_X30;
-    tregs[30] = (void*)&ctx.cpu_registers[30];
-
-    CHECKED(uc_reg_write_batch(uc, uregs, tregs, 31));
-
-    for (auto i = 0; i < 32; ++i) {
-        uregs[i] = UC_ARM64_REG_Q0 + i;
-        tregs[i] = (void*)&ctx.vector_registers[i];
-    }
-
-    CHECKED(uc_reg_write_batch(uc, uregs, tregs, 32));
-}
-
-void ARM_Unicorn::PrepareReschedule() {
-    CHECKED(uc_emu_stop(uc));
-}
-
-void ARM_Unicorn::ClearExclusiveState() {}
-
-void ARM_Unicorn::ClearInstructionCache() {}
-
-void ARM_Unicorn::RecordBreak(GDBStub::BreakpointAddress bkpt) {
-    last_bkpt = bkpt;
-    last_bkpt_hit = true;
-}
-
-void ARM_Unicorn::InterruptHook(uc_engine* uc, u32 int_no, void* user_data) {
-    u32 esr{};
-    CHECKED(uc_reg_read(uc, UC_ARM64_REG_ESR, &esr));
-
-    const auto ec = esr >> 26;
-    const auto iss = esr & 0xFFFFFF;
-
-    auto* const arm_instance = static_cast<ARM_Unicorn*>(user_data);
-
-    switch (ec) {
-    case 0x15: // SVC
-        Kernel::Svc::Call(arm_instance->system, iss);
-        break;
-    }
-}
-
-} // namespace Core
```
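One idiom worth noting from the deleted file: the CHECKED macro wraps its body in do { ... } while (0) so the expansion behaves as a single statement. A self-contained illustration of why that matters:

```cpp
#include <cstdio>

// Wrapping a multi-statement macro in do/while(0) makes it expand to exactly one
// statement, so it composes safely with brace-less if/else.
#define LOG_TWICE(msg)  \
    do {                \
        std::puts(msg); \
        std::puts(msg); \
    } while (0)

void Example(bool verbose) {
    if (verbose)
        LOG_TWICE("hello"); // a bare { ... } expansion here would break the else below
    else
        std::puts("quiet");
}
```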
```diff
@@ -1,63 +0,0 @@
-// Copyright 2018 yuzu emulator team
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-#include <unicorn/unicorn.h>
-#include "common/common_types.h"
-#include "core/arm/arm_interface.h"
-#include "core/gdbstub/gdbstub.h"
-
-namespace Core {
-
-class System;
-
-class ARM_Unicorn final : public ARM_Interface {
-public:
-    enum class Arch {
-        AArch32, // 32-bit ARM
-        AArch64, // 64-bit ARM
-    };
-
-    explicit ARM_Unicorn(System& system, CPUInterrupts& interrupt_handlers, bool uses_wall_clock,
-                         Arch architecture, std::size_t core_index);
-    ~ARM_Unicorn() override;
-
-    void SetPC(u64 pc) override;
-    u64 GetPC() const override;
-    u64 GetReg(int index) const override;
-    void SetReg(int index, u64 value) override;
-    u128 GetVectorReg(int index) const override;
-    void SetVectorReg(int index, u128 value) override;
-    u32 GetPSTATE() const override;
-    void SetPSTATE(u32 pstate) override;
-    VAddr GetTlsAddress() const override;
-    void SetTlsAddress(VAddr address) override;
-    void SetTPIDR_EL0(u64 value) override;
-    u64 GetTPIDR_EL0() const override;
-    void ChangeProcessorID(std::size_t new_core_id) override;
-    void PrepareReschedule() override;
-    void ClearExclusiveState() override;
-    void ExecuteInstructions(std::size_t num_instructions);
-    void Run() override;
-    void Step() override;
-    void ClearInstructionCache() override;
-    void PageTableChanged(Common::PageTable&, std::size_t) override {}
-    void RecordBreak(GDBStub::BreakpointAddress bkpt);
-
-    void SaveContext(ThreadContext32& ctx) override {}
-    void SaveContext(ThreadContext64& ctx) override;
-    void LoadContext(const ThreadContext32& ctx) override {}
-    void LoadContext(const ThreadContext64& ctx) override;
-
-private:
-    static void InterruptHook(uc_engine* uc, u32 int_no, void* user_data);
-
-    uc_engine* uc{};
-    GDBStub::BreakpointAddress last_bkpt{};
-    bool last_bkpt_hit = false;
-    std::size_t core_index;
-};
-
-} // namespace Core
```
```diff
@@ -179,16 +179,18 @@ struct System::Impl {
         arp_manager.ResetAll();
 
         telemetry_session = std::make_unique<Core::TelemetrySession>();
+
+        gpu_core = VideoCore::CreateGPU(emu_window, system);
+        if (!gpu_core) {
+            return ResultStatus::ErrorVideoCore;
+        }
+
         service_manager = std::make_shared<Service::SM::ServiceManager>(kernel);
 
         Service::Init(service_manager, system);
         GDBStub::DeferStart();
 
         interrupt_manager = std::make_unique<Core::Hardware::InterruptManager>(system);
-        gpu_core = VideoCore::CreateGPU(emu_window, system);
-        if (!gpu_core) {
-            return ResultStatus::ErrorVideoCore;
-        }
 
         // Initialize time manager, which must happen after kernel is created
         time_manager.Initialize();
```
```diff
@@ -12,7 +12,6 @@
 #include <utility>
 #include "common/assert.h"
 #include "common/common_types.h"
-#include "core/core.h"
 #include "core/hle/ipc.h"
 #include "core/hle/kernel/client_port.h"
 #include "core/hle/kernel/client_session.h"
```
```diff
@@ -73,14 +72,12 @@ public:
         AlwaysMoveHandles = 1,
     };
 
-    explicit ResponseBuilder(u32* command_buffer) : RequestHelperBase(command_buffer) {}
-
     explicit ResponseBuilder(Kernel::HLERequestContext& context, u32 normal_params_size,
                              u32 num_handles_to_copy = 0, u32 num_objects_to_move = 0,
                              Flags flags = Flags::None)
-
         : RequestHelperBase(context), normal_params_size(normal_params_size),
-          num_handles_to_copy(num_handles_to_copy), num_objects_to_move(num_objects_to_move) {
+          num_handles_to_copy(num_handles_to_copy),
+          num_objects_to_move(num_objects_to_move), kernel{context.kernel} {
 
         memset(cmdbuf, 0, sizeof(u32) * IPC::COMMAND_BUFFER_LENGTH);
```
```diff
@@ -140,7 +137,6 @@ public:
         if (context->Session()->IsDomain()) {
             context->AddDomainObject(std::move(iface));
         } else {
-            auto& kernel = Core::System::GetInstance().Kernel();
             auto [client, server] = Kernel::Session::Create(kernel, iface->GetServiceName());
             context->AddMoveObject(std::move(client));
             iface->ClientConnected(std::move(server));
```
```diff
@@ -214,6 +210,7 @@ private:
     u32 num_handles_to_copy{};
     u32 num_objects_to_move{}; ///< Domain objects or move handles, context dependent
     std::ptrdiff_t datapayload_index{};
+    Kernel::KernelCore& kernel;
 };
 
 /// Push ///
```
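The common thread of these IPC changes: ResponseBuilder previously reached for the global Core::System::GetInstance() to obtain the kernel; now the kernel reference is captured once, at construction, from the HLERequestContext. A condensed sketch of the pattern with invented type names:

```cpp
// Before: a hidden global lookup inside the helper,
//     auto& kernel = Core::System::GetInstance().Kernel();
// After: the dependency is captured from the context that already knows it.
struct Kernel {};

struct Context {
    Kernel& kernel;
};

class Builder {
public:
    explicit Builder(Context& ctx) : kernel{ctx.kernel} {}

private:
    Kernel& kernel; // held for the builder's lifetime; no global access needed
};
```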
```diff
@@ -24,6 +24,10 @@ namespace Core::Memory {
 class Memory;
 }
 
+namespace IPC {
+class ResponseBuilder;
+}
+
 namespace Service {
 class ServiceFrameworkBase;
 }
```
```diff
@@ -287,6 +291,8 @@ public:
     }
 
 private:
+    friend class IPC::ResponseBuilder;
+
     void ParseCommandBuffer(const HandleTable& handle_table, u32_le* src_cmdbuf, bool incoming);
 
     std::array<u32, IPC::COMMAND_BUFFER_LENGTH> cmd_buf;
```
```diff
@@ -2,30 +2,18 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
-#include "common/assert.h"
-#include "common/logging/log.h"
 #include "common/spin_lock.h"
 #include "core/arm/arm_interface.h"
 #ifdef ARCHITECTURE_x86_64
 #include "core/arm/dynarmic/arm_dynarmic_32.h"
 #include "core/arm/dynarmic/arm_dynarmic_64.h"
 #endif
 #include "core/arm/cpu_interrupt_handler.h"
 #include "core/arm/exclusive_monitor.h"
-#include "core/arm/unicorn/arm_unicorn.h"
 #include "core/core.h"
 #include "core/hle/kernel/physical_core.h"
 #include "core/hle/kernel/scheduler.h"
 #include "core/hle/kernel/thread.h"
 
 namespace Kernel {
 
 PhysicalCore::PhysicalCore(Core::System& system, std::size_t id, Kernel::Scheduler& scheduler,
                            Core::CPUInterruptHandler& interrupt_handler)
-    : interrupt_handler{interrupt_handler}, core_index{id}, scheduler{scheduler} {
-
-    guard = std::make_unique<Common::SpinLock>();
-}
+    : interrupt_handler{interrupt_handler},
+      core_index{id}, scheduler{scheduler}, guard{std::make_unique<Common::SpinLock>()} {}
 
 PhysicalCore::~PhysicalCore() = default;
```
```diff
@@ -13,7 +13,6 @@
 #include "common/logging/log.h"
 #include "common/thread_queue_list.h"
 #include "core/arm/arm_interface.h"
-#include "core/arm/unicorn/arm_unicorn.h"
 #include "core/core.h"
 #include "core/cpu_manager.h"
 #include "core/hardware_properties.h"
```
```diff
@@ -217,8 +216,7 @@ ResultVal<std::shared_ptr<Thread>> Thread::Create(Core::System& system, ThreadTy
     } else {
         thread->tls_address = 0;
     }
     // TODO(peachum): move to ScheduleThread() when scheduler is added so selected core is used
     // to initialize the context
-
     thread->arm_interface.reset();
     if ((type_flags & THREADTYPE_HLE) == 0) {
 #ifdef ARCHITECTURE_x86_64
```
```diff
@@ -231,19 +229,10 @@ ResultVal<std::shared_ptr<Thread>> Thread::Create(Core::System& system, ThreadTy
             system, kernel.Interrupts(), kernel.IsMulticore(), kernel.GetExclusiveMonitor(),
             processor_id);
     }
-
 #else
-        if (owner_process && !owner_process->Is64BitProcess()) {
-            thread->arm_interface = std::make_shared<Core::ARM_Unicorn>(
-                system, kernel.Interrupts(), kernel.IsMulticore(), ARM_Unicorn::Arch::AArch32,
-                processor_id);
-        } else {
-            thread->arm_interface = std::make_shared<Core::ARM_Unicorn>(
-                system, kernel.Interrupts(), kernel.IsMulticore(), ARM_Unicorn::Arch::AArch64,
-                processor_id);
-        }
-        LOG_WARNING(Core, "CPU JIT requested, but Dynarmic not available");
+#error Platform not supported yet.
 #endif
 
     ResetThreadContext32(thread->context_32, static_cast<u32>(stack_top),
                          static_cast<u32>(entry_point), static_cast<u32>(arg));
     ResetThreadContext64(thread->context_64, stack_top, entry_point, arg);
```
||||
@@ -11,6 +11,7 @@
|
||||
#include "common/string_util.h"
|
||||
#include "common/swap.h"
|
||||
#include "core/constants.h"
|
||||
#include "core/core.h"
|
||||
#include "core/core_timing.h"
|
||||
#include "core/file_sys/control_metadata.h"
|
||||
#include "core/file_sys/patch_manager.h"
|
||||
|
||||
```diff
@@ -3,8 +3,8 @@
 
 #include "common/logging/log.h"
+#include "core/core.h"
 #include "core/hle/ipc_helpers.h"
 #include "core/hle/kernel/process.h"
 #include "core/hle/service/am/am.h"
 #include "core/hle/service/am/applet_ae.h"
 #include "core/hle/service/nvflinger/nvflinger.h"
-
```
```diff
@@ -6,6 +6,7 @@
 #include <numeric>
 #include <vector>
 #include "common/logging/log.h"
+#include "core/core.h"
 #include "core/file_sys/content_archive.h"
 #include "core/file_sys/control_metadata.h"
 #include "core/file_sys/nca_metadata.h"
```
```diff
@@ -2,6 +2,7 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include "core/core.h"
 #include "core/hle/ipc_helpers.h"
 #include "core/hle/service/apm/apm.h"
 #include "core/hle/service/apm/interface.h"
```
```diff
@@ -8,6 +8,7 @@
 #include "common/hex_util.h"
 #include "common/logging/log.h"
 #include "common/string_util.h"
+#include "core/core.h"
 #include "core/file_sys/vfs.h"
 #include "core/hle/ipc_helpers.h"
 #include "core/hle/kernel/process.h"
```
```diff
@@ -3,6 +3,7 @@
 // Refer to the license.txt file included.
 
 #include "common/logging/log.h"
+#include "core/core.h"
 #include "core/hle/ipc_helpers.h"
 #include "core/hle/kernel/hle_ipc.h"
 #include "core/hle/kernel/kernel.h"
```
```diff
@@ -5,6 +5,7 @@
 #include <memory>
 
 #include "common/logging/log.h"
+#include "core/core.h"
 #include "core/hle/ipc_helpers.h"
 #include "core/hle/kernel/hle_ipc.h"
 #include "core/hle/kernel/kernel.h"
```
```diff
@@ -41,7 +41,7 @@ CAPS_U::CAPS_U() : ServiceFramework("caps:u") {
         {130, nullptr, "PrecheckToCreateContentsForApplication"},
         {140, nullptr, "GetAlbumFileList1AafeAruidDeprecated"},
         {141, nullptr, "GetAlbumFileList2AafeUidAruidDeprecated"},
-        {142, nullptr, "GetAlbumFileList3AaeAruid"},
+        {142, &CAPS_U::GetAlbumFileList3AaeAruid, "GetAlbumFileList3AaeAruid"},
         {143, nullptr, "GetAlbumFileList4AaeUidAruid"},
         {60002, nullptr, "OpenAccessorSessionForApplication"},
     };
```
```diff
@@ -77,17 +77,24 @@ void CAPS_U::GetAlbumContentsFileListForApplication(Kernel::HLERequestContext& c
 
     // TODO: Update this when we implement the album.
     // Currently we do not have a method of accessing album entries, set this to 0 for now.
-    constexpr s32 total_entries{0};
+    constexpr u32 total_entries_1{};
+    constexpr u32 total_entries_2{};
 
-    LOG_WARNING(Service_Capture,
-                "(STUBBED) called. pid={}, content_type={}, start_posix_time={}, "
-                "end_posix_time={}, applet_resource_user_id={}, total_entries={}",
-                pid, content_type, start_posix_time, end_posix_time, applet_resource_user_id,
-                total_entries);
+    LOG_WARNING(
+        Service_Capture,
+        "(STUBBED) called. pid={}, content_type={}, start_posix_time={}, "
+        "end_posix_time={}, applet_resource_user_id={}, total_entries_1={}, total_entries_2={}",
+        pid, content_type, start_posix_time, end_posix_time, applet_resource_user_id,
+        total_entries_1, total_entries_2);
 
-    IPC::ResponseBuilder rb{ctx, 3};
+    IPC::ResponseBuilder rb{ctx, 4};
     rb.Push(RESULT_SUCCESS);
-    rb.Push(total_entries);
+    rb.Push(total_entries_1);
+    rb.Push(total_entries_2);
 }
 
+void CAPS_U::GetAlbumFileList3AaeAruid(Kernel::HLERequestContext& ctx) {
+    GetAlbumContentsFileListForApplication(ctx);
+}
+
 } // namespace Service::Capture
```
```diff
@@ -20,6 +20,7 @@ public:
 private:
     void SetShimLibraryVersion(Kernel::HLERequestContext& ctx);
     void GetAlbumContentsFileListForApplication(Kernel::HLERequestContext& ctx);
+    void GetAlbumFileList3AaeAruid(Kernel::HLERequestContext& ctx);
 };
 
 } // namespace Service::Capture
```
||||
@@ -5,6 +5,7 @@
|
||||
#include <queue>
|
||||
#include "common/logging/log.h"
|
||||
#include "common/uuid.h"
|
||||
#include "core/core.h"
|
||||
#include "core/hle/ipc_helpers.h"
|
||||
#include "core/hle/kernel/readable_event.h"
|
||||
#include "core/hle/kernel/writable_event.h"
|
||||
|
||||
```diff
@@ -5,6 +5,7 @@
 #include <memory>
 
 #include "common/logging/log.h"
+#include "core/core.h"
 #include "core/file_sys/control_metadata.h"
 #include "core/hle/ipc_helpers.h"
 #include "core/hle/kernel/hle_ipc.h"
```
```diff
@@ -9,6 +9,7 @@
 #include "common/alignment.h"
 #include "common/hex_util.h"
 #include "common/scope_exit.h"
+#include "core/core.h"
 #include "core/hle/ipc_helpers.h"
 #include "core/hle/kernel/errors.h"
 #include "core/hle/kernel/memory/page_table.h"
```
```diff
@@ -7,6 +7,7 @@
 
 #include "common/logging/log.h"
 #include "common/scope_exit.h"
+#include "core/core.h"
 #include "core/hle/ipc_helpers.h"
 #include "core/hle/service/lm/lm.h"
 #include "core/hle/service/lm/manager.h"
```
```diff
@@ -15,8 +15,9 @@
 
 namespace Service::Nvidia::Devices {
 
-nvhost_ctrl::nvhost_ctrl(Core::System& system, EventInterface& events_interface)
-    : nvdevice(system), events_interface{events_interface} {}
+nvhost_ctrl::nvhost_ctrl(Core::System& system, EventInterface& events_interface,
+                         SyncpointManager& syncpoint_manager)
+    : nvdevice(system), events_interface{events_interface}, syncpoint_manager{syncpoint_manager} {}
+
 nvhost_ctrl::~nvhost_ctrl() = default;
 
 u32 nvhost_ctrl::ioctl(Ioctl command, const std::vector<u8>& input, const std::vector<u8>& input2,
```
```diff
@@ -70,19 +71,33 @@ u32 nvhost_ctrl::IocCtrlEventWait(const std::vector<u8>& input, std::vector<u8>&
         return NvResult::BadParameter;
     }
 
+    if (syncpoint_manager.IsSyncpointExpired(params.syncpt_id, params.threshold)) {
+        params.value = syncpoint_manager.GetSyncpointMin(params.syncpt_id);
+        std::memcpy(output.data(), &params, sizeof(params));
+        return NvResult::Success;
+    }
+
+    if (const auto new_value = syncpoint_manager.RefreshSyncpoint(params.syncpt_id);
+        syncpoint_manager.IsSyncpointExpired(params.syncpt_id, params.threshold)) {
+        params.value = new_value;
+        std::memcpy(output.data(), &params, sizeof(params));
+        return NvResult::Success;
+    }
+
     auto event = events_interface.events[event_id];
     auto& gpu = system.GPU();
+
     // This is mostly to take into account unimplemented features. As synced
     // gpu is always synced.
     if (!gpu.IsAsync()) {
-        event.writable->Signal();
+        event.event.writable->Signal();
         return NvResult::Success;
     }
     auto lock = gpu.LockSync();
-    const u32 current_syncpoint_value = gpu.GetSyncpointValue(params.syncpt_id);
+    const u32 current_syncpoint_value = event.fence.value;
     const s32 diff = current_syncpoint_value - params.threshold;
     if (diff >= 0) {
-        event.writable->Signal();
+        event.event.writable->Signal();
         params.value = current_syncpoint_value;
         std::memcpy(output.data(), &params, sizeof(params));
         return NvResult::Success;
```
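The two early-outs added above are the syncpoint fast path: first test expiry against the cached value, then refresh the cache once and test again, and only fall through to arming an interrupt when the threshold is genuinely in the future. A condensed sketch of that decision ladder, using a hypothetical mirror of the interface rather than yuzu's actual SyncpointManager:

```cpp
#include <cstdint>

// The signed difference makes the comparison robust against counter wraparound,
// matching the 's32 diff' computation in the hunk above.
inline bool SyncpointExpired(std::uint32_t value, std::uint32_t threshold) {
    return static_cast<std::int32_t>(value - threshold) >= 0;
}

// 'cached' is the last value seen for the syncpoint; 'refreshed' is what one
// query of the (emulated) GPU returns right now. Returns true when the wait
// can complete immediately, without registering an interrupt.
inline bool CanCompleteImmediately(std::uint32_t cached, std::uint32_t refreshed,
                                   std::uint32_t threshold) {
    if (SyncpointExpired(cached, threshold)) {
        return true; // fast path: no GPU query needed at all
    }
    return SyncpointExpired(refreshed, threshold); // re-test after one refresh
}
```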
```diff
@@ -109,7 +124,7 @@ u32 nvhost_ctrl::IocCtrlEventWait(const std::vector<u8>& input, std::vector<u8>&
         params.value = ((params.syncpt_id & 0xfff) << 16) | 0x10000000;
     }
     params.value |= event_id;
-    event.writable->Clear();
+    event.event.writable->Clear();
     gpu.RegisterSyncptInterrupt(params.syncpt_id, target_value);
     if (!is_async && ctrl.fresh_call) {
         ctrl.must_delay = true;
```
```diff
@@ -157,15 +172,19 @@ u32 nvhost_ctrl::IocCtrlEventUnregister(const std::vector<u8>& input, std::vecto
 u32 nvhost_ctrl::IocCtrlClearEventWait(const std::vector<u8>& input, std::vector<u8>& output) {
     IocCtrlEventSignalParams params{};
     std::memcpy(&params, input.data(), sizeof(params));
 
     u32 event_id = params.event_id & 0x00FF;
     LOG_WARNING(Service_NVDRV, "cleared event wait on, event_id: {:X}", event_id);
 
     if (event_id >= MaxNvEvents) {
         return NvResult::BadParameter;
     }
     if (events_interface.status[event_id] == EventState::Waiting) {
         events_interface.LiberateEvent(event_id);
+        events_interface.events[event_id].writable->Signal();
     }
 
+    syncpoint_manager.RefreshSyncpoint(events_interface.events[event_id].fence.id);
+
     return NvResult::Success;
 }
```
```diff
@@ -14,7 +14,8 @@ namespace Service::Nvidia::Devices {
 
 class nvhost_ctrl final : public nvdevice {
 public:
-    explicit nvhost_ctrl(Core::System& system, EventInterface& events_interface);
+    explicit nvhost_ctrl(Core::System& system, EventInterface& events_interface,
+                         SyncpointManager& syncpoint_manager);
     ~nvhost_ctrl() override;
 
     u32 ioctl(Ioctl command, const std::vector<u8>& input, const std::vector<u8>& input2,
```
```diff
@@ -145,6 +146,7 @@ private:
     u32 IocCtrlClearEventWait(const std::vector<u8>& input, std::vector<u8>& output);
 
     EventInterface& events_interface;
+    SyncpointManager& syncpoint_manager;
 };
 
 } // namespace Service::Nvidia::Devices
```
```diff
@@ -7,14 +7,20 @@
 #include "common/logging/log.h"
 #include "core/core.h"
 #include "core/hle/service/nvdrv/devices/nvhost_gpu.h"
+#include "core/hle/service/nvdrv/syncpoint_manager.h"
 #include "core/memory.h"
 #include "video_core/gpu.h"
 #include "video_core/memory_manager.h"
 
 namespace Service::Nvidia::Devices {
 
-nvhost_gpu::nvhost_gpu(Core::System& system, std::shared_ptr<nvmap> nvmap_dev)
-    : nvdevice(system), nvmap_dev(std::move(nvmap_dev)) {}
+nvhost_gpu::nvhost_gpu(Core::System& system, std::shared_ptr<nvmap> nvmap_dev,
+                       SyncpointManager& syncpoint_manager)
+    : nvdevice(system), nvmap_dev(std::move(nvmap_dev)), syncpoint_manager{syncpoint_manager} {
+    channel_fence.id = syncpoint_manager.AllocateSyncpoint();
+    channel_fence.value = system.GPU().GetSyncpointValue(channel_fence.id);
+}
 
 nvhost_gpu::~nvhost_gpu() = default;
 
 u32 nvhost_gpu::ioctl(Ioctl command, const std::vector<u8>& input, const std::vector<u8>& input2,
```
|
||||
u32 nvhost_gpu::ioctl(Ioctl command, const std::vector<u8>& input, const std::vector<u8>& input2,
|
||||
@@ -126,10 +132,10 @@ u32 nvhost_gpu::AllocGPFIFOEx2(const std::vector<u8>& input, std::vector<u8>& ou
|
||||
params.num_entries, params.flags, params.unk0, params.unk1, params.unk2,
|
||||
params.unk3);
|
||||
|
||||
auto& gpu = system.GPU();
|
||||
params.fence_out.id = assigned_syncpoints;
|
||||
params.fence_out.value = gpu.GetSyncpointValue(assigned_syncpoints);
|
||||
assigned_syncpoints++;
|
||||
channel_fence.value = system.GPU().GetSyncpointValue(channel_fence.id);
|
||||
|
||||
params.fence_out = channel_fence;
|
||||
|
||||
std::memcpy(output.data(), ¶ms, output.size());
|
||||
return 0;
|
||||
}
|
||||
@@ -145,37 +151,97 @@ u32 nvhost_gpu::AllocateObjectContext(const std::vector<u8>& input, std::vector<
    return 0;
}

static std::vector<Tegra::CommandHeader> BuildWaitCommandList(Fence fence) {
    return {
        Tegra::BuildCommandHeader(Tegra::BufferMethods::FenceValue, 1,
                                  Tegra::SubmissionMode::Increasing),
        {fence.value},
        Tegra::BuildCommandHeader(Tegra::BufferMethods::FenceAction, 1,
                                  Tegra::SubmissionMode::Increasing),
        Tegra::GPU::FenceAction::Build(Tegra::GPU::FenceOperation::Acquire, fence.id),
    };
}

static std::vector<Tegra::CommandHeader> BuildIncrementCommandList(Fence fence, u32 add_increment) {
    std::vector<Tegra::CommandHeader> result{
        Tegra::BuildCommandHeader(Tegra::BufferMethods::FenceValue, 1,
                                  Tegra::SubmissionMode::Increasing),
        {}};

    for (u32 count = 0; count < add_increment; ++count) {
        result.emplace_back(Tegra::BuildCommandHeader(Tegra::BufferMethods::FenceAction, 1,
                                                      Tegra::SubmissionMode::Increasing));
        result.emplace_back(
            Tegra::GPU::FenceAction::Build(Tegra::GPU::FenceOperation::Increment, fence.id));
    }

    return result;
}

static std::vector<Tegra::CommandHeader> BuildIncrementWithWfiCommandList(Fence fence,
                                                                          u32 add_increment) {
    std::vector<Tegra::CommandHeader> result{
        Tegra::BuildCommandHeader(Tegra::BufferMethods::WaitForInterrupt, 1,
                                  Tegra::SubmissionMode::Increasing),
        {}};
    const std::vector<Tegra::CommandHeader> increment{
        BuildIncrementCommandList(fence, add_increment)};

    result.insert(result.end(), increment.begin(), increment.end());

    return result;
}

u32 nvhost_gpu::SubmitGPFIFOImpl(IoctlSubmitGpfifo& params, std::vector<u8>& output,
                                 Tegra::CommandList&& entries) {
    LOG_TRACE(Service_NVDRV, "called, gpfifo={:X}, num_entries={:X}, flags={:X}", params.address,
              params.num_entries, params.flags.raw);

    auto& gpu = system.GPU();

    params.fence_out.id = channel_fence.id;

    if (params.flags.add_wait.Value() &&
        !syncpoint_manager.IsSyncpointExpired(params.fence_out.id, params.fence_out.value)) {
        gpu.PushGPUEntries(Tegra::CommandList{BuildWaitCommandList(params.fence_out)});
    }

    if (params.flags.add_increment.Value() || params.flags.increment.Value()) {
        const u32 increment_value = params.flags.increment.Value() ? params.fence_out.value : 0;
        params.fence_out.value = syncpoint_manager.IncreaseSyncpoint(
            params.fence_out.id, params.AddIncrementValue() + increment_value);
    } else {
        params.fence_out.value = syncpoint_manager.GetSyncpointMax(params.fence_out.id);
    }

    gpu.PushGPUEntries(std::move(entries));

    if (params.flags.add_increment.Value()) {
        if (params.flags.suppress_wfi) {
            gpu.PushGPUEntries(Tegra::CommandList{
                BuildIncrementCommandList(params.fence_out, params.AddIncrementValue())});
        } else {
            gpu.PushGPUEntries(Tegra::CommandList{
                BuildIncrementWithWfiCommandList(params.fence_out, params.AddIncrementValue())});
        }
    }

    std::memcpy(output.data(), &params, sizeof(IoctlSubmitGpfifo));
    return 0;
}

u32 nvhost_gpu::SubmitGPFIFO(const std::vector<u8>& input, std::vector<u8>& output) {
    if (input.size() < sizeof(IoctlSubmitGpfifo)) {
        UNIMPLEMENTED();
    }
    IoctlSubmitGpfifo params{};
    std::memcpy(&params, input.data(), sizeof(IoctlSubmitGpfifo));
    LOG_TRACE(Service_NVDRV, "called, gpfifo={:X}, num_entries={:X}, flags={:X}", params.address,
              params.num_entries, params.flags.raw);

    ASSERT_MSG(input.size() == sizeof(IoctlSubmitGpfifo) +
                                   params.num_entries * sizeof(Tegra::CommandListHeader),
               "Incorrect input size");

    Tegra::CommandList entries(params.num_entries);
    std::memcpy(entries.data(), &input[sizeof(IoctlSubmitGpfifo)],
    std::memcpy(entries.command_lists.data(), &input[sizeof(IoctlSubmitGpfifo)],
                params.num_entries * sizeof(Tegra::CommandListHeader));

    UNIMPLEMENTED_IF(params.flags.add_wait.Value() != 0);
    UNIMPLEMENTED_IF(params.flags.add_increment.Value() != 0);

    auto& gpu = system.GPU();
    u32 current_syncpoint_value = gpu.GetSyncpointValue(params.fence_out.id);
    if (params.flags.increment.Value()) {
        params.fence_out.value += current_syncpoint_value;
    } else {
        params.fence_out.value = current_syncpoint_value;
    }
    gpu.PushGPUEntries(std::move(entries));

    std::memcpy(output.data(), &params, sizeof(IoctlSubmitGpfifo));
    return 0;
    return SubmitGPFIFOImpl(params, output, std::move(entries));
}

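The fence bookkeeping in SubmitGPFIFOImpl is dense, so a minimal sketch of how the returned fence value is derived from the ioctl flags may help. The names below are simplified, hypothetical stand-ins, not the actual yuzu types:

// Hypothetical, simplified model of the fence handling above. add_increment
// contributes two syncpoint increments (hence AddIncrementValue()'s `<< 1`),
// while `increment` treats the passed-in fence value as a relative offset.
#include <cstdint>

struct SubmitFlags {
    bool add_wait;      // prepend a syncpoint wait before the entries
    bool add_increment; // append syncpoint increments after the entries
    bool increment;     // fence_out.value is relative, not absolute
};

uint32_t NextFenceValue(uint32_t fence_value, uint32_t syncpoint_max, SubmitFlags f) {
    const uint32_t add_increment_value = f.add_increment ? 2u : 0u; // flag << 1
    if (f.add_increment || f.increment) {
        const uint32_t relative = f.increment ? fence_value : 0u;
        return syncpoint_max + add_increment_value + relative; // new syncpoint max
    }
    return syncpoint_max; // nothing to increment: report the current max
}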
u32 nvhost_gpu::KickoffPB(const std::vector<u8>& input, std::vector<u8>& output,
@@ -185,31 +251,17 @@ u32 nvhost_gpu::KickoffPB(const std::vector<u8>& input, std::vector<u8>& output,
    }
    IoctlSubmitGpfifo params{};
    std::memcpy(&params, input.data(), sizeof(IoctlSubmitGpfifo));
    LOG_TRACE(Service_NVDRV, "called, gpfifo={:X}, num_entries={:X}, flags={:X}", params.address,
              params.num_entries, params.flags.raw);

    Tegra::CommandList entries(params.num_entries);
    if (version == IoctlVersion::Version2) {
        std::memcpy(entries.data(), input2.data(),
        std::memcpy(entries.command_lists.data(), input2.data(),
                    params.num_entries * sizeof(Tegra::CommandListHeader));
    } else {
        system.Memory().ReadBlock(params.address, entries.data(),
        system.Memory().ReadBlock(params.address, entries.command_lists.data(),
                                  params.num_entries * sizeof(Tegra::CommandListHeader));
    }
    UNIMPLEMENTED_IF(params.flags.add_wait.Value() != 0);
    UNIMPLEMENTED_IF(params.flags.add_increment.Value() != 0);

    auto& gpu = system.GPU();
    u32 current_syncpoint_value = gpu.GetSyncpointValue(params.fence_out.id);
    if (params.flags.increment.Value()) {
        params.fence_out.value += current_syncpoint_value;
    } else {
        params.fence_out.value = current_syncpoint_value;
    }
    gpu.PushGPUEntries(std::move(entries));

    std::memcpy(output.data(), &params, output.size());
    return 0;
    return SubmitGPFIFOImpl(params, output, std::move(entries));
}

u32 nvhost_gpu::GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output) {

@@ -11,6 +11,11 @@
#include "common/swap.h"
#include "core/hle/service/nvdrv/devices/nvdevice.h"
#include "core/hle/service/nvdrv/nvdata.h"
#include "video_core/dma_pusher.h"

namespace Service::Nvidia {
class SyncpointManager;
}

namespace Service::Nvidia::Devices {

@@ -21,7 +26,8 @@ constexpr u32 NVGPU_IOCTL_CHANNEL_KICKOFF_PB(0x1b);

class nvhost_gpu final : public nvdevice {
public:
    explicit nvhost_gpu(Core::System& system, std::shared_ptr<nvmap> nvmap_dev);
    explicit nvhost_gpu(Core::System& system, std::shared_ptr<nvmap> nvmap_dev,
                        SyncpointManager& syncpoint_manager);
    ~nvhost_gpu() override;

    u32 ioctl(Ioctl command, const std::vector<u8>& input, const std::vector<u8>& input2,
@@ -162,10 +168,15 @@ private:
        u32_le raw;
        BitField<0, 1, u32_le> add_wait;      // append a wait sync_point to the list
        BitField<1, 1, u32_le> add_increment; // append an increment to the list
        BitField<2, 1, u32_le> new_hw_format; // Mostly ignored
        BitField<2, 1, u32_le> new_hw_format; // mostly ignored
        BitField<4, 1, u32_le> suppress_wfi;  // suppress wait for interrupt
        BitField<8, 1, u32_le> increment;     // increment the returned fence
    } flags;
    Fence fence_out; // returned new fence object for others to wait on

    u32 AddIncrementValue() const {
        return flags.add_increment.Value() << 1;
    }
};
static_assert(sizeof(IoctlSubmitGpfifo) == 16 + sizeof(Fence),
              "IoctlSubmitGpfifo is incorrect size");
@@ -190,6 +201,8 @@ private:
    u32 SetChannelPriority(const std::vector<u8>& input, std::vector<u8>& output);
    u32 AllocGPFIFOEx2(const std::vector<u8>& input, std::vector<u8>& output);
    u32 AllocateObjectContext(const std::vector<u8>& input, std::vector<u8>& output);
    u32 SubmitGPFIFOImpl(IoctlSubmitGpfifo& params, std::vector<u8>& output,
                         Tegra::CommandList&& entries);
    u32 SubmitGPFIFO(const std::vector<u8>& input, std::vector<u8>& output);
    u32 KickoffPB(const std::vector<u8>& input, std::vector<u8>& output,
                  const std::vector<u8>& input2, IoctlVersion version);
@@ -198,7 +211,8 @@ private:
    u32 ChannelSetTimeslice(const std::vector<u8>& input, std::vector<u8>& output);

    std::shared_ptr<nvmap> nvmap_dev;
    u32 assigned_syncpoints{};
    SyncpointManager& syncpoint_manager;
    Fence channel_fence;
};

} // namespace Service::Nvidia::Devices

@@ -5,6 +5,7 @@
#include <utility>

#include <fmt/format.h>
#include "core/core.h"
#include "core/hle/ipc_helpers.h"
#include "core/hle/kernel/readable_event.h"
#include "core/hle/kernel/writable_event.h"
@@ -21,6 +22,7 @@
#include "core/hle/service/nvdrv/interface.h"
#include "core/hle/service/nvdrv/nvdrv.h"
#include "core/hle/service/nvdrv/nvmemp.h"
#include "core/hle/service/nvdrv/syncpoint_manager.h"
#include "core/hle/service/nvflinger/nvflinger.h"

namespace Service::Nvidia {

@@ -36,21 +38,23 @@ void InstallInterfaces(SM::ServiceManager& service_manager, NVFlinger::NVFlinger
    nvflinger.SetNVDrvInstance(module_);
}

Module::Module(Core::System& system) {
Module::Module(Core::System& system) : syncpoint_manager{system.GPU()} {
    auto& kernel = system.Kernel();
    for (u32 i = 0; i < MaxNvEvents; i++) {
        std::string event_label = fmt::format("NVDRV::NvEvent_{}", i);
        events_interface.events[i] = Kernel::WritableEvent::CreateEventPair(kernel, event_label);
        events_interface.events[i] = {Kernel::WritableEvent::CreateEventPair(kernel, event_label)};
        events_interface.status[i] = EventState::Free;
        events_interface.registered[i] = false;
    }
    auto nvmap_dev = std::make_shared<Devices::nvmap>(system);
    devices["/dev/nvhost-as-gpu"] = std::make_shared<Devices::nvhost_as_gpu>(system, nvmap_dev);
    devices["/dev/nvhost-gpu"] = std::make_shared<Devices::nvhost_gpu>(system, nvmap_dev);
    devices["/dev/nvhost-gpu"] =
        std::make_shared<Devices::nvhost_gpu>(system, nvmap_dev, syncpoint_manager);
    devices["/dev/nvhost-ctrl-gpu"] = std::make_shared<Devices::nvhost_ctrl_gpu>(system);
    devices["/dev/nvmap"] = nvmap_dev;
    devices["/dev/nvdisp_disp0"] = std::make_shared<Devices::nvdisp_disp0>(system, nvmap_dev);
    devices["/dev/nvhost-ctrl"] = std::make_shared<Devices::nvhost_ctrl>(system, events_interface);
    devices["/dev/nvhost-ctrl"] =
        std::make_shared<Devices::nvhost_ctrl>(system, events_interface, syncpoint_manager);
    devices["/dev/nvhost-nvdec"] = std::make_shared<Devices::nvhost_nvdec>(system, nvmap_dev);
    devices["/dev/nvhost-nvjpg"] = std::make_shared<Devices::nvhost_nvjpg>(system);
    devices["/dev/nvhost-vic"] = std::make_shared<Devices::nvhost_vic>(system, nvmap_dev);
@@ -95,17 +99,17 @@ void Module::SignalSyncpt(const u32 syncpoint_id, const u32 value) {
        if (events_interface.assigned_syncpt[i] == syncpoint_id &&
            events_interface.assigned_value[i] == value) {
            events_interface.LiberateEvent(i);
            events_interface.events[i].writable->Signal();
            events_interface.events[i].event.writable->Signal();
        }
    }
}

std::shared_ptr<Kernel::ReadableEvent> Module::GetEvent(const u32 event_id) const {
    return events_interface.events[event_id].readable;
    return events_interface.events[event_id].event.readable;
}

std::shared_ptr<Kernel::WritableEvent> Module::GetEventWriteable(const u32 event_id) const {
    return events_interface.events[event_id].writable;
    return events_interface.events[event_id].event.writable;
}

} // namespace Service::Nvidia

@@ -10,6 +10,7 @@
#include "common/common_types.h"
#include "core/hle/kernel/writable_event.h"
#include "core/hle/service/nvdrv/nvdata.h"
#include "core/hle/service/nvdrv/syncpoint_manager.h"
#include "core/hle/service/service.h"

namespace Core {
@@ -22,15 +23,23 @@ class NVFlinger;

namespace Service::Nvidia {

class SyncpointManager;

namespace Devices {
class nvdevice;
}

/// Represents an Nvidia event
struct NvEvent {
    Kernel::EventPair event;
    Fence fence{};
};

struct EventInterface {
    // Mask representing currently busy events
    u64 events_mask{};
    // Each kernel event associated to an NV event
    std::array<Kernel::EventPair, MaxNvEvents> events;
    std::array<NvEvent, MaxNvEvents> events;
    // The status of the current NVEvent
    std::array<EventState, MaxNvEvents> status{};
    // Tells if an NVEvent is registered or not
@@ -119,6 +128,9 @@ public:
    std::shared_ptr<Kernel::WritableEvent> GetEventWriteable(u32 event_id) const;

private:
    /// Manages syncpoints on the host
    SyncpointManager syncpoint_manager;

    /// Id to use for the next open file descriptor.
    u32 next_fd = 1;

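NvEvent ties each kernel event to the fence it waits on, which is what lets SignalSyncpt (in nvdrv.cpp above) match a completed syncpoint back to the right event. A small hypothetical model of that pairing, with FakeEvent and the array size as illustrative stand-ins rather than the real kernel types:

// Illustrative-only model of the NvEvent pairing introduced above.
#include <array>
#include <cstdint>

struct Fence {
    uint32_t id;    // syncpoint backing this event
    uint32_t value; // target value the waiter is blocked on
};

struct FakeEvent {
    bool signaled = false;
    void Signal() { signaled = true; }
};

struct NvEventSlot {
    FakeEvent event; // stand-in for Kernel::EventPair
    Fence fence{};
};

// Signal every slot whose fence target was just reached.
void SignalReached(std::array<NvEventSlot, 64>& slots, uint32_t syncpoint_id, uint32_t value) {
    for (auto& slot : slots) {
        if (slot.fence.id == syncpoint_id && slot.fence.value == value) {
            slot.event.Signal();
        }
    }
}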
39
src/core/hle/service/nvdrv/syncpoint_manager.cpp
Normal file
@@ -0,0 +1,39 @@
// Copyright 2020 yuzu emulator team
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.

#include "common/assert.h"
#include "core/hle/service/nvdrv/syncpoint_manager.h"
#include "video_core/gpu.h"

namespace Service::Nvidia {

SyncpointManager::SyncpointManager(Tegra::GPU& gpu) : gpu{gpu} {}

SyncpointManager::~SyncpointManager() = default;

u32 SyncpointManager::RefreshSyncpoint(u32 syncpoint_id) {
    syncpoints[syncpoint_id].min = gpu.GetSyncpointValue(syncpoint_id);
    return GetSyncpointMin(syncpoint_id);
}

u32 SyncpointManager::AllocateSyncpoint() {
    for (u32 syncpoint_id = 1; syncpoint_id < MaxSyncPoints; syncpoint_id++) {
        if (!syncpoints[syncpoint_id].is_allocated) {
            syncpoints[syncpoint_id].is_allocated = true;
            return syncpoint_id;
        }
    }
    UNREACHABLE_MSG("No more available syncpoints!");
    return {};
}

u32 SyncpointManager::IncreaseSyncpoint(u32 syncpoint_id, u32 value) {
    for (u32 index = 0; index < value; ++index) {
        syncpoints[syncpoint_id].max.fetch_add(1, std::memory_order_relaxed);
    }

    return GetSyncpointMax(syncpoint_id);
}

} // namespace Service::Nvidia
85
src/core/hle/service/nvdrv/syncpoint_manager.h
Normal file
@@ -0,0 +1,85 @@
// Copyright 2020 yuzu emulator team
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.

#pragma once

#include <array>
#include <atomic>

#include "common/common_types.h"
#include "core/hle/service/nvdrv/nvdata.h"

namespace Tegra {
class GPU;
}

namespace Service::Nvidia {

class SyncpointManager final {
public:
    explicit SyncpointManager(Tegra::GPU& gpu);
    ~SyncpointManager();

    /**
     * Returns true if the specified syncpoint is expired for the given value.
     * @param syncpoint_id Syncpoint ID to check.
     * @param value Value to check against the specified syncpoint.
     * @returns True if the specified syncpoint is expired for the given value, otherwise False.
     */
    bool IsSyncpointExpired(u32 syncpoint_id, u32 value) const {
        return (GetSyncpointMax(syncpoint_id) - value) >= (GetSyncpointMin(syncpoint_id) - value);
    }

    /**
     * Gets the lower bound for the specified syncpoint.
     * @param syncpoint_id Syncpoint ID to get the lower bound for.
     * @returns The lower bound for the specified syncpoint.
     */
    u32 GetSyncpointMin(u32 syncpoint_id) const {
        return syncpoints[syncpoint_id].min.load(std::memory_order_relaxed);
    }

    /**
     * Gets the upper bound for the specified syncpoint.
     * @param syncpoint_id Syncpoint ID to get the upper bound for.
     * @returns The upper bound for the specified syncpoint.
     */
    u32 GetSyncpointMax(u32 syncpoint_id) const {
        return syncpoints[syncpoint_id].max.load(std::memory_order_relaxed);
    }

    /**
     * Refreshes the minimum value for the specified syncpoint.
     * @param syncpoint_id Syncpoint ID to be refreshed.
     * @returns The new syncpoint minimum value.
     */
    u32 RefreshSyncpoint(u32 syncpoint_id);

    /**
     * Allocates a new syncpoint.
     * @returns The syncpoint ID for the newly allocated syncpoint.
     */
    u32 AllocateSyncpoint();

    /**
     * Increases the maximum value for the specified syncpoint.
     * @param syncpoint_id Syncpoint ID to be increased.
     * @param value Value to increase the specified syncpoint by.
     * @returns The new syncpoint maximum value.
     */
    u32 IncreaseSyncpoint(u32 syncpoint_id, u32 value);

private:
    struct Syncpoint {
        std::atomic<u32> min;
        std::atomic<u32> max;
        std::atomic<bool> is_allocated;
    };

    std::array<Syncpoint, MaxSyncPoints> syncpoints{};

    Tegra::GPU& gpu;
};

} // namespace Service::Nvidia
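The comparison in IsSyncpointExpired is written to survive 32-bit wraparound: re-basing both bounds against `value` turns the test into a distance comparison on the modular number line. A self-contained check of that behaviour, using the same arithmetic as the header above with a simplified signature:

// Wraparound-safe expiry test, mirroring IsSyncpointExpired above.
#include <cassert>
#include <cstdint>

bool IsExpired(uint32_t min, uint32_t max, uint32_t value) {
    return (max - value) >= (min - value);
}

int main() {
    // Plain case: GPU progress (min) has reached the fence value 5.
    assert(IsExpired(/*min=*/6, /*max=*/10, /*value=*/5));
    assert(!IsExpired(/*min=*/4, /*max=*/10, /*value=*/5));
    // Wrapped case: the counters crossed 0xFFFFFFFF; a naive `min >= value` would fail here.
    assert(IsExpired(/*min=*/2, /*max=*/3, /*value=*/0xFFFFFFF0u));
    return 0;
}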
@@ -29,6 +29,10 @@ void BufferQueue::SetPreallocatedBuffer(u32 slot, const IGBPBuffer& igbp_buffer)
        .slot = slot,
        .status = Buffer::Status::Free,
        .igbp_buffer = igbp_buffer,
        .transform = {},
        .crop_rect = {},
        .swap_interval = 0,
        .multi_fence = {},
    });

    buffer_wait_event.writable->Signal();

@@ -242,6 +242,10 @@ void NVFlinger::Compose() {

        const auto& igbp_buffer = buffer->get().igbp_buffer;

        if (!system.IsPoweredOn()) {
            return; // We are likely shutting down
        }

        auto& gpu = system.GPU();
        const auto& multi_fence = buffer->get().multi_fence;
        guard->unlock();

@@ -2,6 +2,7 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.

#include "core/core.h"
#include "core/hle/ipc_helpers.h"
#include "core/hle/kernel/kernel.h"
#include "core/hle/kernel/process.h"

@@ -4,6 +4,7 @@

#include "common/hex_util.h"
#include "common/logging/log.h"
#include "core/core.h"
#include "core/hle/ipc_helpers.h"
#include "core/hle/kernel/process.h"
#include "core/hle/service/acc/profile_manager.h"

@@ -56,6 +56,7 @@ void LogSettings() {
    log_setting("System_RegionIndex", values.region_index.GetValue());
    log_setting("System_TimeZoneIndex", values.time_zone_index.GetValue());
    log_setting("Core_UseMultiCore", values.use_multi_core.GetValue());
    log_setting("CPU_Accuracy", values.cpu_accuracy);
    log_setting("Renderer_UseResolutionFactor", values.resolution_factor.GetValue());
    log_setting("Renderer_UseFrameLimit", values.use_frame_limit.GetValue());
    log_setting("Renderer_FrameLimit", values.frame_limit.GetValue());

@@ -101,7 +101,7 @@ struct Values {
    bool renderer_debug;
    Setting<int> vulkan_device;

    Setting<u16> resolution_factor = Setting(static_cast<u16>(1));
    Setting<u16> resolution_factor{1};
    Setting<int> aspect_ratio;
    Setting<int> max_anisotropy;
    Setting<bool> use_frame_limit;

@@ -42,11 +42,11 @@ public:
    void Decode();

    /// Returns most recently decoded frame
    AVFrame* GetCurrentFrame();
    const AVFrame* GetCurrentFrame() const;
    [[nodiscard]] AVFrame* GetCurrentFrame();
    [[nodiscard]] const AVFrame* GetCurrentFrame() const;

    /// Returns the value of current_codec
    NvdecCommon::VideoCodec GetCurrentCodec() const;
    [[nodiscard]] NvdecCommon::VideoCodec GetCurrentCodec() const;

private:
    bool initialized{};

@@ -43,7 +43,8 @@ H264::H264(GPU& gpu_) : gpu(gpu_) {}

H264::~H264() = default;

std::vector<u8>& H264::ComposeFrameHeader(NvdecCommon::NvdecRegisters& state, bool is_first_frame) {
const std::vector<u8>& H264::ComposeFrameHeader(NvdecCommon::NvdecRegisters& state,
                                                bool is_first_frame) {
    H264DecoderContext context{};
    gpu.MemoryManager().ReadBlock(state.picture_info_offset, &context, sizeof(H264DecoderContext));

@@ -51,14 +51,14 @@ public:
    void WriteScalingList(const std::vector<u8>& list, s32 start, s32 count);

    /// Return the bitstream as a vector.
    std::vector<u8>& GetByteArray();
    const std::vector<u8>& GetByteArray() const;
    [[nodiscard]] std::vector<u8>& GetByteArray();
    [[nodiscard]] const std::vector<u8>& GetByteArray() const;

private:
    void WriteBits(s32 value, s32 bit_count);
    void WriteExpGolombCodedInt(s32 value);
    void WriteExpGolombCodedUInt(u32 value);
    s32 GetFreeBufferBits();
    [[nodiscard]] s32 GetFreeBufferBits();
    void Flush();

    s32 buffer_size{8};

@@ -74,8 +74,8 @@ public:
    ~H264();

    /// Compose the H264 header of the frame for FFmpeg decoding
    std::vector<u8>& ComposeFrameHeader(NvdecCommon::NvdecRegisters& state,
                                        bool is_first_frame = false);
    [[nodiscard]] const std::vector<u8>& ComposeFrameHeader(NvdecCommon::NvdecRegisters& state,
                                                            bool is_first_frame = false);

private:
    struct H264ParameterSet {

@@ -854,7 +854,7 @@ VpxBitStreamWriter VP9::ComposeUncompressedHeader() {
    return uncomp_writer;
}

std::vector<u8>& VP9::ComposeFrameHeader(NvdecCommon::NvdecRegisters& state) {
const std::vector<u8>& VP9::ComposeFrameHeader(NvdecCommon::NvdecRegisters& state) {
    std::vector<u8> bitstream;
    {
        Vp9FrameContainer curr_frame = GetCurrentFrame(state);

@@ -119,7 +119,7 @@ public:

    /// Composes the VP9 frame from the GPU state information. Based on the official VP9 spec
    /// documentation
    std::vector<u8>& ComposeFrameHeader(NvdecCommon::NvdecRegisters& state);
    [[nodiscard]] const std::vector<u8>& ComposeFrameHeader(NvdecCommon::NvdecRegisters& state);

    /// Returns true if the most recent frame was a hidden frame.
    [[nodiscard]] bool WasFrameHidden() const {

@@ -231,9 +231,8 @@ struct PictureInfo {
    u32 surface_params{};
    INSERT_PADDING_WORDS(3);

    Vp9PictureInfo Convert() const {
        return Vp9PictureInfo{
    [[nodiscard]] Vp9PictureInfo Convert() const {
        return {
            .is_key_frame = (vp9_flags & FrameFlags::IsKeyFrame) != 0,
            .intra_only = (vp9_flags & FrameFlags::IntraOnly) != 0,
            .last_frame_was_key = (vp9_flags & FrameFlags::LastFrameIsKeyFrame) != 0,

@@ -26,8 +26,8 @@ public:
    void ProcessMethod(Method method, const std::vector<u32>& arguments);

    /// Return most recently decoded frame
    AVFrame* GetFrame();
    const AVFrame* GetFrame() const;
    [[nodiscard]] AVFrame* GetFrame();
    [[nodiscard]] const AVFrame* GetFrame() const;

private:
    /// Invoke codec to decode a frame

@@ -2,6 +2,7 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.

#include "common/cityhash.h"
#include "common/microprofile.h"
#include "core/core.h"
#include "core/memory.h"
@@ -45,32 +46,41 @@ bool DmaPusher::Step() {
        return false;
    }

    const CommandList& command_list{dma_pushbuffer.front()};
    ASSERT_OR_EXECUTE(!command_list.empty(), {
        // Somehow the command_list is empty, in order to avoid a crash
        // We ignore it and assume its size is 0.
    CommandList& command_list{dma_pushbuffer.front()};

    ASSERT_OR_EXECUTE(
        command_list.command_lists.size() || command_list.prefetch_command_list.size(), {
            // Somehow the command_list is empty, in order to avoid a crash
            // We ignore it and assume its size is 0.
            dma_pushbuffer.pop();
            dma_pushbuffer_subindex = 0;
            return true;
        });

    if (command_list.prefetch_command_list.size()) {
        // Prefetched command list from nvdrv, used for things like synchronization
        command_headers = std::move(command_list.prefetch_command_list);
        dma_pushbuffer.pop();
        dma_pushbuffer_subindex = 0;
        return true;
    });
    const CommandListHeader command_list_header{command_list[dma_pushbuffer_subindex++]};
    const GPUVAddr dma_get = command_list_header.addr;
    } else {
        const CommandListHeader command_list_header{
            command_list.command_lists[dma_pushbuffer_subindex++]};
        const GPUVAddr dma_get = command_list_header.addr;

    if (dma_pushbuffer_subindex >= command_list.size()) {
        // We've gone through the current list, remove it from the queue
        dma_pushbuffer.pop();
        dma_pushbuffer_subindex = 0;
        if (dma_pushbuffer_subindex >= command_list.command_lists.size()) {
            // We've gone through the current list, remove it from the queue
            dma_pushbuffer.pop();
            dma_pushbuffer_subindex = 0;
        }

        if (command_list_header.size == 0) {
            return true;
        }

        // Push buffer non-empty, read a word
        command_headers.resize(command_list_header.size);
        gpu.MemoryManager().ReadBlockUnsafe(dma_get, command_headers.data(),
                                            command_list_header.size * sizeof(u32));
    }

    if (command_list_header.size == 0) {
        return true;
    }

    // Push buffer non-empty, read a word
    command_headers.resize(command_list_header.size);
    gpu.MemoryManager().ReadBlockUnsafe(dma_get, command_headers.data(),
                                        command_list_header.size * sizeof(u32));

    for (std::size_t index = 0; index < command_headers.size();) {
        const CommandHeader& command_header = command_headers[index];

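Step() now has two sources of command headers: prefetched headers handed over by nvdrv (already host-side, used for the fence waits and increments built in nvhost_gpu.cpp) and ordinary GPFIFO entries that must still be fetched from guest memory. A reduced sketch of that dispatch, with hypothetical types standing in for the real ones:

// Reduced model of the Step() dispatch above; Header/CmdList are stand-ins.
#include <cstdint>
#include <utility>
#include <vector>

struct Header {
    uint32_t raw;
};

struct CmdList {
    std::vector<uint64_t> command_lists;       // GPFIFO entries (guest addresses)
    std::vector<Header> prefetch_command_list; // headers already built host-side
};

std::vector<Header> TakeHeaders(CmdList& list, std::vector<Header> (*read_guest)(uint64_t)) {
    if (!list.prefetch_command_list.empty()) {
        // Prefetched path: nothing to read from guest memory, just move them.
        return std::move(list.prefetch_command_list);
    }
    // Guest path: decode the entry and read the headers out of guest memory.
    return read_guest(list.command_lists.front());
}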
@@ -18,6 +18,8 @@ class System;

namespace Tegra {

class GPU;

enum class SubmissionMode : u32 {
    IncreasingOld = 0,
    Increasing = 1,
@@ -27,6 +29,31 @@ enum class SubmissionMode : u32 {
    IncreaseOnce = 5
};

// Note that, traditionally, methods are treated as 4-byte addressable locations, and hence
// their numbers are written down multiplied by 4 in docs. Here we do not multiply by 4,
// so the values you see in docs may be four times the values listed below.
enum class BufferMethods : u32 {
    BindObject = 0x0,
    Nop = 0x2,
    SemaphoreAddressHigh = 0x4,
    SemaphoreAddressLow = 0x5,
    SemaphoreSequence = 0x6,
    SemaphoreTrigger = 0x7,
    NotifyIntr = 0x8,
    WrcacheFlush = 0x9,
    Unk28 = 0xA,
    UnkCacheFlush = 0xB,
    RefCnt = 0x14,
    SemaphoreAcquire = 0x1A,
    SemaphoreRelease = 0x1B,
    FenceValue = 0x1C,
    FenceAction = 0x1D,
    WaitForInterrupt = 0x1E,
    Unk7c = 0x1F,
    Yield = 0x20,
    NonPullerMethods = 0x40,
};

struct CommandListHeader {
    union {
        u64 raw;
@@ -49,9 +76,23 @@ union CommandHeader {
static_assert(std::is_standard_layout_v<CommandHeader>, "CommandHeader is not standard layout");
static_assert(sizeof(CommandHeader) == sizeof(u32), "CommandHeader has incorrect size!");

class GPU;
inline CommandHeader BuildCommandHeader(BufferMethods method, u32 arg_count, SubmissionMode mode) {
    CommandHeader result{};
    result.method.Assign(static_cast<u32>(method));
    result.arg_count.Assign(arg_count);
    result.mode.Assign(mode);
    return result;
}

using CommandList = std::vector<Tegra::CommandListHeader>;
struct CommandList final {
    CommandList() = default;
    explicit CommandList(std::size_t size) : command_lists(size) {}
    explicit CommandList(std::vector<Tegra::CommandHeader>&& prefetch_command_list)
        : prefetch_command_list{std::move(prefetch_command_list)} {}

    std::vector<Tegra::CommandListHeader> command_lists;
    std::vector<Tegra::CommandHeader> prefetch_command_list;
};

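The addressing note on BufferMethods deserves a worked example: because methods are indexed in 32-bit words here, a documented byte offset of 0x70 would correspond to method 0x1C (FenceValue). The asserts below only restate that arithmetic; the byte offsets are the assumed times-four values, not quotations from a specific manual:

// Worked example of the "multiplied by 4 in docs" note above.
static_assert(0x70 / 4 == 0x1C, "doc byte offset 0x70 -> word method 0x1C (FenceValue)");
static_assert(0x74 / 4 == 0x1D, "doc byte offset 0x74 -> word method 0x1D (FenceAction)");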
/**
 * The DmaPusher class implements DMA submission to FIFOs, providing an area of memory that the
@@ -60,7 +101,7 @@ using CommandList = std::vector<Tegra::CommandListHeader>;
 * See https://envytools.readthedocs.io/en/latest/hw/fifo/dma-pusher.html#fifo-dma-pusher for
 * details on this implementation.
 */
class DmaPusher {
class DmaPusher final {
public:
    explicit DmaPusher(Core::System& system, GPU& gpu);
    ~DmaPusher();

@@ -124,6 +124,112 @@ void Maxwell3D::InitializeRegisterDefaults() {
    mme_inline[MAXWELL3D_REG_INDEX(index_array.count)] = true;
}

void Maxwell3D::ProcessMacro(u32 method, const u32* base_start, u32 amount, bool is_last_call) {
    if (executing_macro == 0) {
        // A macro call must begin by writing the macro method's register, not its argument.
        ASSERT_MSG((method % 2) == 0,
                   "Can't start macro execution by writing to the ARGS register");
        executing_macro = method;
    }

    macro_params.insert(macro_params.end(), base_start, base_start + amount);

    // Call the macro when there are no more parameters in the command buffer
    if (is_last_call) {
        CallMacroMethod(executing_macro, macro_params);
        macro_params.clear();
    }
}

u32 Maxwell3D::ProcessShadowRam(u32 method, u32 argument) {
    // Keep track of the register value in shadow_state when requested.
    const auto control = shadow_state.shadow_ram_control;
    if (control == Regs::ShadowRamControl::Track ||
        control == Regs::ShadowRamControl::TrackWithFilter) {
        shadow_state.reg_array[method] = argument;
        return argument;
    }
    if (control == Regs::ShadowRamControl::Replay) {
        return shadow_state.reg_array[method];
    }
    return argument;
}

void Maxwell3D::ProcessDirtyRegisters(u32 method, u32 argument) {
    if (regs.reg_array[method] == argument) {
        return;
    }
    regs.reg_array[method] = argument;

    for (const auto& table : dirty.tables) {
        dirty.flags[table[method]] = true;
    }
}

void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argument,
                                  bool is_last_call) {
    switch (method) {
    case MAXWELL3D_REG_INDEX(wait_for_idle):
        return rasterizer->WaitForIdle();
    case MAXWELL3D_REG_INDEX(shadow_ram_control):
        shadow_state.shadow_ram_control = static_cast<Regs::ShadowRamControl>(nonshadow_argument);
        return;
    case MAXWELL3D_REG_INDEX(macros.data):
        return macro_engine->AddCode(regs.macros.upload_address, argument);
    case MAXWELL3D_REG_INDEX(macros.bind):
        return ProcessMacroBind(argument);
    case MAXWELL3D_REG_INDEX(firmware[4]):
        return ProcessFirmwareCall4();
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[1]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[2]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[3]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[4]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[5]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[6]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[7]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[8]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[9]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[10]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[11]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[12]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]):
        return StartCBData(method);
    case MAXWELL3D_REG_INDEX(cb_bind[0]):
        return ProcessCBBind(0);
    case MAXWELL3D_REG_INDEX(cb_bind[1]):
        return ProcessCBBind(1);
    case MAXWELL3D_REG_INDEX(cb_bind[2]):
        return ProcessCBBind(2);
    case MAXWELL3D_REG_INDEX(cb_bind[3]):
        return ProcessCBBind(3);
    case MAXWELL3D_REG_INDEX(cb_bind[4]):
        return ProcessCBBind(4);
    case MAXWELL3D_REG_INDEX(draw.vertex_end_gl):
        return DrawArrays();
    case MAXWELL3D_REG_INDEX(clear_buffers):
        return ProcessClearBuffers();
    case MAXWELL3D_REG_INDEX(query.query_get):
        return ProcessQueryGet();
    case MAXWELL3D_REG_INDEX(condition.mode):
        return ProcessQueryCondition();
    case MAXWELL3D_REG_INDEX(counter_reset):
        return ProcessCounterReset();
    case MAXWELL3D_REG_INDEX(sync_info):
        return ProcessSyncPoint();
    case MAXWELL3D_REG_INDEX(exec_upload):
        return upload_state.ProcessExec(regs.exec_upload.linear != 0);
    case MAXWELL3D_REG_INDEX(data_upload):
        upload_state.ProcessData(argument, is_last_call);
        if (is_last_call) {
            OnMemoryWrite();
        }
        return;
    }
}

void Maxwell3D::CallMacroMethod(u32 method, const std::vector<u32>& parameters) {
    // Reset the current macro.
    executing_macro = 0;
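ProcessShadowRam isolates the shadow-RAM behaviours that were previously inlined in CallMethod: Track and TrackWithFilter mirror guest writes into shadow state, while Replay substitutes the recorded value for the incoming argument. A toy model of the same control flow, with hypothetical names and 16 registers instead of the full register file:

// Toy model of the shadow-RAM modes factored out above.
#include <array>
#include <cstdint>

enum class ShadowCtrl { Track, TrackWithFilter, Replay };

struct ShadowModel {
    ShadowCtrl control = ShadowCtrl::Track;
    std::array<uint32_t, 16> shadow{};

    uint32_t Process(uint32_t method, uint32_t argument) {
        if (control == ShadowCtrl::Track || control == ShadowCtrl::TrackWithFilter) {
            shadow[method] = argument; // record what the guest wrote
            return argument;
        }
        if (control == ShadowCtrl::Replay) {
            return shadow[method]; // ignore the write, replay the recording
        }
        return argument;
    }
};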
@@ -157,142 +263,16 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
    // Methods after 0xE00 are special, they're actually triggers for some microcode that was
    // uploaded to the GPU during initialization.
    if (method >= MacroRegistersStart) {
        // We're trying to execute a macro
        if (executing_macro == 0) {
            // A macro call must begin by writing the macro method's register, not its argument.
            ASSERT_MSG((method % 2) == 0,
                       "Can't start macro execution by writing to the ARGS register");
            executing_macro = method;
        }

        macro_params.push_back(method_argument);

        // Call the macro when there are no more parameters in the command buffer
        if (is_last_call) {
            CallMacroMethod(executing_macro, macro_params);
            macro_params.clear();
        }
        ProcessMacro(method, &method_argument, 1, is_last_call);
        return;
    }

    ASSERT_MSG(method < Regs::NUM_REGS,
               "Invalid Maxwell3D register, increase the size of the Regs structure");

    u32 arg = method_argument;
    // Keep track of the register value in shadow_state when requested.
    if (shadow_state.shadow_ram_control == Regs::ShadowRamControl::Track ||
        shadow_state.shadow_ram_control == Regs::ShadowRamControl::TrackWithFilter) {
        shadow_state.reg_array[method] = arg;
    } else if (shadow_state.shadow_ram_control == Regs::ShadowRamControl::Replay) {
        arg = shadow_state.reg_array[method];
    }

    if (regs.reg_array[method] != arg) {
        regs.reg_array[method] = arg;

        for (const auto& table : dirty.tables) {
            dirty.flags[table[method]] = true;
        }
    }

    switch (method) {
    case MAXWELL3D_REG_INDEX(wait_for_idle): {
        rasterizer->WaitForIdle();
        break;
    }
    case MAXWELL3D_REG_INDEX(shadow_ram_control): {
        shadow_state.shadow_ram_control = static_cast<Regs::ShadowRamControl>(method_argument);
        break;
    }
    case MAXWELL3D_REG_INDEX(macros.data): {
        macro_engine->AddCode(regs.macros.upload_address, arg);
        break;
    }
    case MAXWELL3D_REG_INDEX(macros.bind): {
        ProcessMacroBind(arg);
        break;
    }
    case MAXWELL3D_REG_INDEX(firmware[4]): {
        ProcessFirmwareCall4();
        break;
    }
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[1]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[2]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[3]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[4]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[5]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[6]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[7]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[8]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[9]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[10]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[11]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[12]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]): {
        StartCBData(method);
        break;
    }
    case MAXWELL3D_REG_INDEX(cb_bind[0]): {
        ProcessCBBind(0);
        break;
    }
    case MAXWELL3D_REG_INDEX(cb_bind[1]): {
        ProcessCBBind(1);
        break;
    }
    case MAXWELL3D_REG_INDEX(cb_bind[2]): {
        ProcessCBBind(2);
        break;
    }
    case MAXWELL3D_REG_INDEX(cb_bind[3]): {
        ProcessCBBind(3);
        break;
    }
    case MAXWELL3D_REG_INDEX(cb_bind[4]): {
        ProcessCBBind(4);
        break;
    }
    case MAXWELL3D_REG_INDEX(draw.vertex_end_gl): {
        DrawArrays();
        break;
    }
    case MAXWELL3D_REG_INDEX(clear_buffers): {
        ProcessClearBuffers();
        break;
    }
    case MAXWELL3D_REG_INDEX(query.query_get): {
        ProcessQueryGet();
        break;
    }
    case MAXWELL3D_REG_INDEX(condition.mode): {
        ProcessQueryCondition();
        break;
    }
    case MAXWELL3D_REG_INDEX(counter_reset): {
        ProcessCounterReset();
        break;
    }
    case MAXWELL3D_REG_INDEX(sync_info): {
        ProcessSyncPoint();
        break;
    }
    case MAXWELL3D_REG_INDEX(exec_upload): {
        upload_state.ProcessExec(regs.exec_upload.linear != 0);
        break;
    }
    case MAXWELL3D_REG_INDEX(data_upload): {
        upload_state.ProcessData(arg, is_last_call);
        if (is_last_call) {
            OnMemoryWrite();
        }
        break;
    }
    default:
        break;
    }
    const u32 argument = ProcessShadowRam(method, method_argument);
    ProcessDirtyRegisters(method, argument);
    ProcessMethodCall(method, argument, method_argument, is_last_call);
}

void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
@@ -300,23 +280,7 @@ void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
    // Methods after 0xE00 are special, they're actually triggers for some microcode that was
    // uploaded to the GPU during initialization.
    if (method >= MacroRegistersStart) {
        // We're trying to execute a macro
        if (executing_macro == 0) {
            // A macro call must begin by writing the macro method's register, not its argument.
            ASSERT_MSG((method % 2) == 0,
                       "Can't start macro execution by writing to the ARGS register");
            executing_macro = method;
        }

        for (std::size_t i = 0; i < amount; i++) {
            macro_params.push_back(base_start[i]);
        }

        // Call the macro when there are no more parameters in the command buffer
        if (amount == methods_pending) {
            CallMacroMethod(executing_macro, macro_params);
            macro_params.clear();
        }
        ProcessMacro(method, base_start, amount, amount == methods_pending);
        return;
    }
    switch (method) {
@@ -335,15 +299,14 @@ void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[12]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]): {
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]):
        ProcessCBMultiData(method, base_start, amount);
        break;
    }
    default: {
    default:
        for (std::size_t i = 0; i < amount; i++) {
            CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1);
        }
    }
        break;
    }
}

@@ -1461,6 +1461,14 @@ public:
private:
    void InitializeRegisterDefaults();

    void ProcessMacro(u32 method, const u32* base_start, u32 amount, bool is_last_call);

    u32 ProcessShadowRam(u32 method, u32 argument);

    void ProcessDirtyRegisters(u32 method, u32 argument);

    void ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argument, bool is_last_call);

    Core::System& system;
    MemoryManager& memory_manager;

@@ -194,30 +194,6 @@ void GPU::SyncGuestHost() {
void GPU::OnCommandListEnd() {
    renderer->Rasterizer().ReleaseFences();
}
// Note that, traditionally, methods are treated as 4-byte addressable locations, and hence
// their numbers are written down multiplied by 4 in Docs. Here we are not multiply by 4.
// So the values you see in docs might be multiplied by 4.
enum class BufferMethods {
    BindObject = 0x0,
    Nop = 0x2,
    SemaphoreAddressHigh = 0x4,
    SemaphoreAddressLow = 0x5,
    SemaphoreSequence = 0x6,
    SemaphoreTrigger = 0x7,
    NotifyIntr = 0x8,
    WrcacheFlush = 0x9,
    Unk28 = 0xA,
    UnkCacheFlush = 0xB,
    RefCnt = 0x14,
    SemaphoreAcquire = 0x1A,
    SemaphoreRelease = 0x1B,
    FenceValue = 0x1C,
    FenceAction = 0x1D,
    Unk78 = 0x1E,
    Unk7c = 0x1F,
    Yield = 0x20,
    NonPullerMethods = 0x40,
};

enum class GpuSemaphoreOperation {
    AcquireEqual = 0x1,
@@ -277,7 +253,12 @@ void GPU::CallPullerMethod(const MethodCall& method_call) {
    case BufferMethods::UnkCacheFlush:
    case BufferMethods::WrcacheFlush:
    case BufferMethods::FenceValue:
        break;
    case BufferMethods::FenceAction:
        ProcessFenceActionMethod();
        break;
    case BufferMethods::WaitForInterrupt:
        ProcessWaitForInterruptMethod();
        break;
    case BufferMethods::SemaphoreTrigger: {
        ProcessSemaphoreTriggerMethod();
@@ -391,6 +372,25 @@ void GPU::ProcessBindMethod(const MethodCall& method_call) {
    }
}

void GPU::ProcessFenceActionMethod() {
    switch (regs.fence_action.op) {
    case FenceOperation::Acquire:
        WaitFence(regs.fence_action.syncpoint_id, regs.fence_value);
        break;
    case FenceOperation::Increment:
        IncrementSyncPoint(regs.fence_action.syncpoint_id);
        break;
    default:
        UNIMPLEMENTED_MSG("Unimplemented operation {}",
                          static_cast<u32>(regs.fence_action.op.Value()));
    }
}

void GPU::ProcessWaitForInterruptMethod() {
    // TODO(bunnei) ImplementMe
    LOG_WARNING(HW_GPU, "(STUBBED) called");
}

void GPU::ProcessSemaphoreTriggerMethod() {
    const auto semaphoreOperationMask = 0xF;
    const auto op =

@@ -263,6 +263,24 @@ public:
        return use_nvdec;
    }

    enum class FenceOperation : u32 {
        Acquire = 0,
        Increment = 1,
    };

    union FenceAction {
        u32 raw;
        BitField<0, 1, FenceOperation> op;
        BitField<8, 24, u32> syncpoint_id;

        static CommandHeader Build(FenceOperation op, u32 syncpoint_id) {
            FenceAction result{};
            result.op.Assign(op);
            result.syncpoint_id.Assign(syncpoint_id);
            return {result.raw};
        }
    };

    struct Regs {
        static constexpr size_t NUM_REGS = 0x40;

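FenceAction::Build packs the operation into bit 0 and the syncpoint ID into bits 8..31, so Build(Increment, 5) produces a raw word of (5 << 8) | 1 = 0x501. A standalone check of that layout, using plain shifts instead of the BitField helpers:

// Standalone check of the FenceAction bit layout declared above.
#include <cassert>
#include <cstdint>

uint32_t BuildFenceAction(uint32_t op, uint32_t syncpoint_id) {
    return (op & 0x1u) | ((syncpoint_id & 0xFFFFFFu) << 8);
}

int main() {
    assert(BuildFenceAction(/*Acquire=*/0, /*syncpoint_id=*/5) == 0x500u);
    assert(BuildFenceAction(/*Increment=*/1, /*syncpoint_id=*/5) == 0x501u);
    return 0;
}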
@@ -291,10 +309,7 @@ public:
        u32 semaphore_acquire;
        u32 semaphore_release;
        u32 fence_value;
        union {
            BitField<4, 4, u32> operation;
            BitField<8, 8, u32> id;
        } fence_action;
        FenceAction fence_action;
        INSERT_UNION_PADDING_WORDS(0xE2);

        // Puller state
@@ -342,6 +357,8 @@ protected:

private:
    void ProcessBindMethod(const MethodCall& method_call);
    void ProcessFenceActionMethod();
    void ProcessWaitForInterruptMethod();
    void ProcessSemaphoreTriggerMethod();
    void ProcessSemaphoreRelease();
    void ProcessSemaphoreAcquire();

@@ -39,8 +39,8 @@ using Operation = const OperationNode&;
constexpr std::array INTERNAL_FLAG_NAMES = {"ZERO", "SIGN", "CARRY", "OVERFLOW"};

char Swizzle(std::size_t component) {
    ASSERT(component < 4);
    return component["xyzw"];
    static constexpr std::string_view SWIZZLE{"xyzw"};
    return SWIZZLE.at(component);
}

constexpr bool IsGenericAttribute(Attribute::Index index) {
@@ -224,7 +224,7 @@ private:

    std::string Visit(const Node& node);

    std::pair<std::string, std::size_t> BuildCoords(Operation);
    std::tuple<std::string, std::string, std::size_t> BuildCoords(Operation);
    std::string BuildAoffi(Operation);
    std::string GlobalMemoryPointer(const GmemNode& gmem);
    void Exit();
@@ -1416,12 +1416,12 @@ std::string ARBDecompiler::Visit(const Node& node) {
    return {};
}

std::pair<std::string, std::size_t> ARBDecompiler::BuildCoords(Operation operation) {
std::tuple<std::string, std::string, std::size_t> ARBDecompiler::BuildCoords(Operation operation) {
    const auto& meta = std::get<MetaTexture>(operation.GetMeta());
    UNIMPLEMENTED_IF(meta.sampler.is_indexed);
    UNIMPLEMENTED_IF(meta.sampler.is_shadow && meta.sampler.is_array &&
                     meta.sampler.type == Tegra::Shader::TextureType::TextureCube);

    const bool is_extended = meta.sampler.is_shadow && meta.sampler.is_array &&
                             meta.sampler.type == Tegra::Shader::TextureType::TextureCube;
    const std::size_t count = operation.GetOperandsCount();
    std::string temporary = AllocVectorTemporary();
    std::size_t i = 0;
@@ -1429,12 +1429,21 @@ std::pair<std::string, std::size_t> ARBDecompiler::BuildCoords(Operati
        AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i), Visit(operation[i]));
    }
    if (meta.sampler.is_array) {
        AddLine("I2F.S {}.{}, {};", temporary, Swizzle(i++), Visit(meta.array));
        AddLine("I2F.S {}.{}, {};", temporary, Swizzle(i), Visit(meta.array));
        ++i;
    }
    if (meta.sampler.is_shadow) {
        AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i++), Visit(meta.depth_compare));
        std::string compare = Visit(meta.depth_compare);
        if (is_extended) {
            ASSERT(i == 4);
            std::string extra_coord = AllocVectorTemporary();
            AddLine("MOV.F {}.x, {};", extra_coord, compare);
            return {fmt::format("{}, {}", temporary, extra_coord), extra_coord, 0};
        }
        AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i), compare);
        ++i;
    }
    return {std::move(temporary), i};
    return {temporary, temporary, i};
}

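The extended path exists because a shadow lookup on a cube array needs five scalars: a three-component direction, the array layer, and the depth-compare reference. That is one more than a four-wide register holds, so BuildCoords now returns the operand string (which may name two registers) separately from the result register. A small constexpr sketch of the component count, under the usual layout assumptions:

// Why cube-array-shadow spills into a second register: 3 + 1 + 1 = 5 > 4.
#include <cstddef>

constexpr std::size_t CoordComponents(bool is_cube, bool is_array, bool is_shadow) {
    std::size_t count = is_cube ? 3 : 2; // direction vector vs. 2D coordinates
    if (is_array) {
        ++count; // array layer index
    }
    if (is_shadow) {
        ++count; // depth-compare reference
    }
    return count;
}

static_assert(CoordComponents(true, true, true) == 5, "needs an extra register");
static_assert(CoordComponents(true, true, false) == 4, "still fits in one register");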
std::string ARBDecompiler::BuildAoffi(Operation operation) {
@@ -1859,7 +1868,7 @@ std::string ARBDecompiler::LogicalAddCarry(Operation operation) {
std::string ARBDecompiler::Texture(Operation operation) {
    const auto& meta = std::get<MetaTexture>(operation.GetMeta());
    const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
    const auto [temporary, swizzle] = BuildCoords(operation);
    const auto [coords, temporary, swizzle] = BuildCoords(operation);

    std::string_view opcode = "TEX";
    std::string extra;
@@ -1888,7 +1897,7 @@ std::string ARBDecompiler::Texture(Operation operation) {
        }
    }

    AddLine("{}.F {}, {},{} texture[{}], {}{};", opcode, temporary, temporary, extra, sampler_id,
    AddLine("{}.F {}, {},{} texture[{}], {}{};", opcode, temporary, coords, extra, sampler_id,
            TextureType(meta), BuildAoffi(operation));
    AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element));
    return fmt::format("{}.x", temporary);
@@ -1897,7 +1906,7 @@ std::string ARBDecompiler::Texture(Operation operation) {
std::string ARBDecompiler::TextureGather(Operation operation) {
    const auto& meta = std::get<MetaTexture>(operation.GetMeta());
    const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
    const auto [temporary, swizzle] = BuildCoords(operation);
    const auto [coords, temporary, swizzle] = BuildCoords(operation);

    std::string comp;
    if (!meta.sampler.is_shadow) {
@@ -1907,7 +1916,7 @@ std::string ARBDecompiler::TextureGather(Operation operation) {

    AddLine("TXG.F {}, {}, texture[{}]{}, {}{};", temporary, temporary, sampler_id, comp,
            TextureType(meta), BuildAoffi(operation));
    AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element));
    AddLine("MOV.U {}.x, {}.{};", temporary, coords, Swizzle(meta.element));
    return fmt::format("{}.x", temporary);
}

@@ -1945,13 +1954,13 @@ std::string ARBDecompiler::TextureQueryLod(Operation operation) {
std::string ARBDecompiler::TexelFetch(Operation operation) {
    const auto& meta = std::get<MetaTexture>(operation.GetMeta());
    const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
    const auto [temporary, swizzle] = BuildCoords(operation);
    const auto [coords, temporary, swizzle] = BuildCoords(operation);

    if (!meta.sampler.is_buffer) {
        ASSERT(swizzle < 4);
        AddLine("MOV.F {}.w, {};", temporary, Visit(meta.lod));
    }
    AddLine("TXF.F {}, {}, texture[{}], {}{};", temporary, temporary, sampler_id, TextureType(meta),
    AddLine("TXF.F {}, {}, texture[{}], {}{};", temporary, coords, sampler_id, TextureType(meta),
            BuildAoffi(operation));
    AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element));
    return fmt::format("{}.x", temporary);
@@ -1962,7 +1971,7 @@ std::string ARBDecompiler::TextureGradient(Operation operation) {
    const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index;
    const std::string ddx = AllocVectorTemporary();
    const std::string ddy = AllocVectorTemporary();
    const std::string coord = BuildCoords(operation).first;
    const std::string coord = std::get<1>(BuildCoords(operation));

    const std::size_t num_components = meta.derivates.size() / 2;
    for (std::size_t index = 0; index < num_components; ++index) {

@@ -2056,15 +2056,19 @@ private:
    }

    Expression Texture(Operation operation) {
        const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
        ASSERT(meta);

        std::string expr = GenerateTexture(
            operation, "", {TextureOffset{}, TextureArgument{Type::Float, meta->bias}});
        if (meta->sampler.is_shadow) {
            expr = "vec4(" + expr + ')';
        const auto meta = std::get<MetaTexture>(operation.GetMeta());
        const bool separate_dc = meta.sampler.type == TextureType::TextureCube &&
                                 meta.sampler.is_array && meta.sampler.is_shadow;
        // TODO: Replace this with an array and make GenerateTexture use C++20 std::span
        const std::vector<TextureIR> extras{
            TextureOffset{},
            TextureArgument{Type::Float, meta.bias},
        };
        std::string expr = GenerateTexture(operation, "", extras, separate_dc);
        if (meta.sampler.is_shadow) {
            expr = fmt::format("vec4({})", expr);
        }
        return {expr + GetSwizzle(meta->element), Type::Float};
        return {expr + GetSwizzle(meta.element), Type::Float};
    }

    Expression TextureLod(Operation operation) {

@@ -771,13 +771,18 @@ void VKDevice::CollectTelemetryParameters() {
    VkPhysicalDeviceDriverPropertiesKHR driver{
        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRIVER_PROPERTIES_KHR,
        .pNext = nullptr,
        .driverID = {},
        .driverName = {},
        .driverInfo = {},
        .conformanceVersion = {},
    };

    VkPhysicalDeviceProperties2KHR properties{
    VkPhysicalDeviceProperties2KHR device_properties{
        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR,
        .pNext = &driver,
        .properties = {},
    };
    physical.GetProperties2KHR(properties);
    physical.GetProperties2KHR(device_properties);

    driver_id = driver.driverID;
    vendor_name = driver.driverName;

@@ -159,6 +159,7 @@ std::vector<vk::ShaderModule> VKGraphicsPipeline::CreateShaderModules(
        .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
        .pNext = nullptr,
        .flags = 0,
        .codeSize = 0,
    };

    std::vector<vk::ShaderModule> modules;
@@ -388,6 +389,7 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa
        .logicOp = VK_LOGIC_OP_COPY,
        .attachmentCount = static_cast<u32>(num_attachments),
        .pAttachments = cb_attachments.data(),
        .blendConstants = {},
    };

    std::vector dynamic_states{

@@ -556,7 +556,6 @@ Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type,
    const bool is_shadow = depth_compare != nullptr;
    const bool is_bindless = bindless_reg.has_value();

    UNIMPLEMENTED_IF(texture_type == TextureType::TextureCube && is_array && is_shadow);
    ASSERT_MSG(texture_type != TextureType::Texture3D || !is_array || !is_shadow,
               "Illegal texture type");

@@ -240,6 +240,7 @@ SurfaceParams SurfaceParams::CreateForFermiCopySurface(
        .is_tiled = is_tiled,
        .srgb_conversion = config.format == Tegra::RenderTargetFormat::B8G8R8A8_SRGB ||
                           config.format == Tegra::RenderTargetFormat::A8B8G8R8_SRGB,
        .is_layered = false,
        .block_width = is_tiled ? std::min(config.BlockWidth(), 5U) : 0U,
        .block_height = is_tiled ? std::min(config.BlockHeight(), 5U) : 0U,
        .block_depth = is_tiled ? std::min(config.BlockDepth(), 5U) : 0U,

@@ -264,11 +264,9 @@ endif()
if (MSVC)
    include(CopyYuzuQt5Deps)
    include(CopyYuzuSDLDeps)
    include(CopyYuzuUnicornDeps)
    include(CopyYuzuFFmpegDeps)
    copy_yuzu_Qt5_deps(yuzu)
    copy_yuzu_SDL_deps(yuzu)
    copy_yuzu_unicorn_deps(yuzu)
    copy_yuzu_FFmpeg_deps(yuzu)
endif()

@@ -39,7 +39,5 @@ endif()

if (MSVC)
    include(CopyYuzuSDLDeps)
    include(CopyYuzuUnicornDeps)
    copy_yuzu_SDL_deps(yuzu-cmd)
    copy_yuzu_unicorn_deps(yuzu-cmd)
endif()

@@ -28,7 +28,5 @@ endif()

if (MSVC)
    include(CopyYuzuSDLDeps)
    include(CopyYuzuUnicornDeps)
    copy_yuzu_SDL_deps(yuzu-tester)
    copy_yuzu_unicorn_deps(yuzu-tester)
endif()