Compare commits

...

18 Commits

Author SHA1 Message Date
yuzubot
039a84338a Android 228 2024-02-04 01:00:37 +00:00
yuzubot
f0eb935051 Merge yuzu-emu#12903 2024-02-04 01:00:36 +00:00
yuzubot
db727765f1 Merge yuzu-emu#12892 2024-02-04 01:00:36 +00:00
yuzubot
60dda4ed9a Merge yuzu-emu#12756 2024-02-04 01:00:36 +00:00
yuzubot
dde2c301f3 Merge yuzu-emu#12749 2024-02-04 01:00:36 +00:00
yuzubot
0bf2470bdf Merge yuzu-emu#12461 2024-02-04 01:00:36 +00:00
liamwhite
5da55cbac9 Merge pull request #12901 from Kelebek1/timezone_firmware_fix
Fix firmware timezone boot load check.
2024-02-03 11:10:30 -05:00
liamwhite
81cc4df1f9 Merge pull request #12895 from german77/files
service: fs: Skip non user id folders
2024-02-03 11:10:24 -05:00
liamwhite
25f3d358b1 Merge pull request #12877 from german77/npad-fixed
service: hid: Multiple fixes
2024-02-03 11:10:14 -05:00
liamwhite
a3c8bb251d Merge pull request #12852 from Calinou/multiplayer-color-player-counts
Color player counts in the multiplayer public lobby list
2024-02-03 11:10:00 -05:00
liamwhite
327533be1f Merge pull request #12851 from Calinou/multiplayer-persist-filters
Persist filters in multiplayer public lobby list
2024-02-03 11:09:51 -05:00
liamwhite
61ea2115c7 Merge pull request #12850 from Calinou/multiplayer-add-hotkeys
Add hotkeys for multiplayer actions
2024-02-03 11:09:41 -05:00
Kelebek1
108a72ea8a Fix firmware timezone boot load check. 2024-02-03 15:21:10 +00:00
Narr the Reg
fb3ef957bb service: fs: Skip non user id folders 2024-02-02 13:25:38 -06:00
Narr the Reg
818721d12d service: hid: Multiple fixes 2024-02-01 10:37:44 -06:00
Hugo Locurcio
442aad9b27 Persist filters in multiplayer public lobby list
After connecting to a room, the chosen filter text and the "Games I Own",
"Hide Empty Rooms" and "Hide Full Rooms" values are persisted
to the configuration so they are preserved across restarts.

This makes it easier to rejoin a room if you regularly play the same
game, or after a crash.
2024-01-30 17:40:29 +01:00
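A minimal sketch of how that persistence could be wired with Qt's QSettings, for illustration only; the group and key names ("multiplayer_lobby", "filter_text", and so on) are assumptions, not yuzu's actual configuration keys:

#include <QSettings>
#include <QString>

struct LobbyFilters {
    QString search_text;
    bool games_i_own = false;
    bool hide_empty = false;
    bool hide_full = false;
};

// Write the current filter state as soon as the user connects,
// so it survives restarts and crashes.
void SaveFilters(const LobbyFilters& f) {
    QSettings settings(QStringLiteral("yuzu-emu"), QStringLiteral("yuzu"));
    settings.beginGroup(QStringLiteral("multiplayer_lobby"));
    settings.setValue(QStringLiteral("filter_text"), f.search_text);
    settings.setValue(QStringLiteral("games_i_own"), f.games_i_own);
    settings.setValue(QStringLiteral("hide_empty_rooms"), f.hide_empty);
    settings.setValue(QStringLiteral("hide_full_rooms"), f.hide_full);
    settings.endGroup();
}

// Read the state back when the lobby dialog is constructed.
LobbyFilters LoadFilters() {
    QSettings settings(QStringLiteral("yuzu-emu"), QStringLiteral("yuzu"));
    settings.beginGroup(QStringLiteral("multiplayer_lobby"));
    LobbyFilters f;
    f.search_text = settings.value(QStringLiteral("filter_text")).toString();
    f.games_i_own = settings.value(QStringLiteral("games_i_own"), false).toBool();
    f.hide_empty = settings.value(QStringLiteral("hide_empty_rooms"), false).toBool();
    f.hide_full = settings.value(QStringLiteral("hide_full_rooms"), false).toBool();
    settings.endGroup();
    return f;
}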
Hugo Locurcio
8e0f97ac96 Color player counts in the multiplayer public lobby list
- Full lobbies have their player count displayed in red.
- Lobbies with one slot left have their player count displayed in orange.
- Empty lobbies have their player count grayed out.
2024-01-30 17:38:21 +01:00
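The coloring amounts to a threshold check on the current and maximum player counts. A minimal standalone sketch of that rule; the enum and helper names are illustrative and not part of the actual frontend code:

#include <cstdio>

enum class PlayerCountColor { Normal, GrayedOut, Orange, Red };

// Map a lobby's player count to the color described above.
PlayerCountColor PickPlayerCountColor(int current_players, int max_players) {
    if (current_players == 0) {
        return PlayerCountColor::GrayedOut; // empty lobby
    }
    if (current_players >= max_players) {
        return PlayerCountColor::Red; // full lobby
    }
    if (current_players == max_players - 1) {
        return PlayerCountColor::Orange; // one slot left
    }
    return PlayerCountColor::Normal;
}

int main() {
    // 7 of 8 players -> one slot left -> Orange (prints 2).
    std::printf("%d\n", static_cast<int>(PickPlayerCountColor(7, 8)));
    return 0;
}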
Hugo Locurcio
345d691328 Add hotkeys for multiplayer actions
Default shortcuts were chosen so as to be intuitive (use the first letter
of the action, or the second word's first letter) and work on all
types of keyboards. The hotkeys can be used while playing a game too,
as they are application-wide.
2024-01-30 01:32:14 +01:00
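As a hedged illustration of the "application-wide" part, one way such a hotkey could be registered in a Qt frontend is a QShortcut with an application-level context; the key sequence, function name and lambda body here are assumptions, not yuzu's actual hotkey registry:

#include <QKeySequence>
#include <QMainWindow>
#include <QObject>
#include <QShortcut>

void RegisterMultiplayerHotkey(QMainWindow* window) {
    // Hypothetical binding: Ctrl+B to browse the public lobby list.
    auto* browse = new QShortcut(QKeySequence(QStringLiteral("Ctrl+B")), window);
    // Application-wide context, so the shortcut keeps working while a game is running.
    browse->setContext(Qt::ApplicationShortcut);
    QObject::connect(browse, &QShortcut::activated, window, []() {
        // Open the multiplayer public lobby dialog here.
    });
}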
133 changed files with 4397 additions and 1589 deletions

View File

@@ -1,3 +1,16 @@
| Pull Request | Commit | Title | Author | Merged? |
|----|----|----|----|----|
| [12461](https://github.com/yuzu-emu/yuzu-android//pull/12461) | [`4c08a0e6d`](https://github.com/yuzu-emu/yuzu-android//pull/12461/files) | Rework Nvdec and VIC to fix out-of-order videos, and speed up decoding. | [Kelebek1](https://github.com/Kelebek1/) | Yes |
| [12749](https://github.com/yuzu-emu/yuzu-android//pull/12749) | [`aad4b0d6f`](https://github.com/yuzu-emu/yuzu-android//pull/12749/files) | general: workarounds for SMMU syncing issues | [liamwhite](https://github.com/liamwhite/) | Yes |
| [12756](https://github.com/yuzu-emu/yuzu-android//pull/12756) | [`b19285d4f`](https://github.com/yuzu-emu/yuzu-android//pull/12756/files) | general: applet multiprocess | [liamwhite](https://github.com/liamwhite/) | Yes |
| [12892](https://github.com/yuzu-emu/yuzu-android//pull/12892) | [`78f72b3bf`](https://github.com/yuzu-emu/yuzu-android//pull/12892/files) | cmif_serialization: enforce const for references | [liamwhite](https://github.com/liamwhite/) | Yes |
| [12903](https://github.com/yuzu-emu/yuzu-android//pull/12903) | [`5be8121af`](https://github.com/yuzu-emu/yuzu-android//pull/12903/files) | shader_recompiler: use only ConstOffset for OpImageFetch | [liamwhite](https://github.com/liamwhite/) | Yes |
End of merge log. You can find the original README.md below the break.
-----
<!--
SPDX-FileCopyrightText: 2018 yuzu Emulator Project
SPDX-License-Identifier: GPL-2.0-or-later

View File

@@ -164,6 +164,7 @@ else()
if (MINGW)
add_definitions(-DMINGW_HAS_SECURE_API)
add_compile_options("-msse4.1")
if (MINGW_STATIC_BUILD)
add_definitions(-DQT_STATICPLUGIN)

View File

@@ -9,6 +9,7 @@
#include <string>
#include <vector>
#include <fmt/format.h>
#include "common/assert.h"
#include "common/common_types.h"
namespace Common {
@@ -29,6 +30,8 @@ namespace Common {
template <std::size_t Size, bool le = false>
[[nodiscard]] constexpr std::array<u8, Size> HexStringToArray(std::string_view str) {
ASSERT_MSG(Size * 2 <= str.size(), "Invalid string size");
std::array<u8, Size> out{};
if constexpr (le) {
for (std::size_t i = 2 * Size - 2; i <= 2 * Size; i -= 2) {

View File

@@ -30,6 +30,7 @@ namespace Settings {
#define SETTING(TYPE, RANGED) template class Setting<TYPE, RANGED>
#define SWITCHABLE(TYPE, RANGED) template class SwitchableSetting<TYPE, RANGED>
SETTING(AppletMode, false);
SETTING(AudioEngine, false);
SETTING(bool, false);
SETTING(int, false);
@@ -215,6 +216,8 @@ const char* TranslateCategory(Category category) {
return "Debugging";
case Category::GpuDriver:
return "GpuDriver";
case Category::LibraryApplet:
return "LibraryApplet";
case Category::Miscellaneous:
return "Miscellaneous";
case Category::Network:

View File

@@ -133,6 +133,38 @@ struct TouchFromButtonMap {
struct Values {
Linkage linkage{};
// Applet
Setting<AppletMode> cabinet_applet_mode{linkage, AppletMode::LLE, "cabinet_applet_mode",
Category::LibraryApplet};
Setting<AppletMode> controller_applet_mode{linkage, AppletMode::HLE, "controller_applet_mode",
Category::LibraryApplet};
Setting<AppletMode> data_erase_applet_mode{linkage, AppletMode::HLE, "data_erase_applet_mode",
Category::LibraryApplet};
Setting<AppletMode> error_applet_mode{linkage, AppletMode::HLE, "error_applet_mode",
Category::LibraryApplet};
Setting<AppletMode> net_connect_applet_mode{linkage, AppletMode::HLE, "net_connect_applet_mode",
Category::LibraryApplet};
Setting<AppletMode> player_select_applet_mode{
linkage, AppletMode::HLE, "player_select_applet_mode", Category::LibraryApplet};
Setting<AppletMode> swkbd_applet_mode{linkage, AppletMode::LLE, "swkbd_applet_mode",
Category::LibraryApplet};
Setting<AppletMode> mii_edit_applet_mode{linkage, AppletMode::LLE, "mii_edit_applet_mode",
Category::LibraryApplet};
Setting<AppletMode> web_applet_mode{linkage, AppletMode::HLE, "web_applet_mode",
Category::LibraryApplet};
Setting<AppletMode> shop_applet_mode{linkage, AppletMode::HLE, "shop_applet_mode",
Category::LibraryApplet};
Setting<AppletMode> photo_viewer_applet_mode{
linkage, AppletMode::LLE, "photo_viewer_applet_mode", Category::LibraryApplet};
Setting<AppletMode> offline_web_applet_mode{linkage, AppletMode::LLE, "offline_web_applet_mode",
Category::LibraryApplet};
Setting<AppletMode> login_share_applet_mode{linkage, AppletMode::HLE, "login_share_applet_mode",
Category::LibraryApplet};
Setting<AppletMode> wifi_web_auth_applet_mode{
linkage, AppletMode::HLE, "wifi_web_auth_applet_mode", Category::LibraryApplet};
Setting<AppletMode> my_page_applet_mode{linkage, AppletMode::LLE, "my_page_applet_mode",
Category::LibraryApplet};
// Audio
SwitchableSetting<AudioEngine> sink_id{linkage, AudioEngine::Auto, "output_engine",
Category::Audio, Specialization::RuntimeList};

View File

@@ -44,6 +44,7 @@ enum class Category : u32 {
Services,
Paths,
Linux,
LibraryApplet,
MaxEnum,
};

View File

@@ -151,6 +151,8 @@ ENUM(AspectRatio, R16_9, R4_3, R21_9, R16_10, Stretch);
ENUM(ConsoleMode, Handheld, Docked);
ENUM(AppletMode, HLE, LLE);
template <typename Type>
inline std::string CanonicalizeEnum(Type id) {
const auto group = EnumMetadata<Type>::Canonicalizations();

View File

@@ -43,6 +43,8 @@ public:
DeviceMemoryManager(const DeviceMemory& device_memory);
~DeviceMemoryManager();
static constexpr bool HAS_FLUSH_INVALIDATION = true;
void BindInterface(DeviceInterface* device_inter);
DAddr Allocate(size_t size);

View File

@@ -44,15 +44,32 @@ public:
GuestMemory() = delete;
explicit GuestMemory(M& memory, u64 addr, std::size_t size,
Common::ScratchBuffer<T>* backup = nullptr)
: m_memory{memory}, m_addr{addr}, m_size{size} {
: m_memory{&memory}, m_addr{addr}, m_size{size} {
static_assert(FLAGS & GuestMemoryFlags::Read || FLAGS & GuestMemoryFlags::Write);
if constexpr (FLAGS & GuestMemoryFlags::Read) {
if constexpr (!(FLAGS & GuestMemoryFlags::Read)) {
if (!this->TrySetSpan()) {
if (backup) {
backup->resize_destructive(this->size());
m_data_span = *backup;
m_span_valid = true;
m_is_data_copy = true;
} else {
m_data_copy.resize(this->size());
m_data_span = std::span(m_data_copy);
m_span_valid = true;
m_is_data_copy = true;
}
}
} else if constexpr (FLAGS & GuestMemoryFlags::Read) {
Read(addr, size, backup);
}
}
~GuestMemory() = default;
GuestMemory(GuestMemory&& rhs) = default;
GuestMemory& operator=(GuestMemory&& rhs) = default;
T* data() noexcept {
return m_data_span.data();
}
@@ -109,8 +126,8 @@ public:
}
if (this->TrySetSpan()) {
if constexpr (FLAGS & GuestMemoryFlags::Safe) {
m_memory.FlushRegion(m_addr, this->size_bytes());
if constexpr (FLAGS & GuestMemoryFlags::Safe && M::HAS_FLUSH_INVALIDATION) {
m_memory->FlushRegion(m_addr, this->size_bytes());
}
} else {
if (backup) {
@@ -123,9 +140,9 @@ public:
m_is_data_copy = true;
m_span_valid = true;
if constexpr (FLAGS & GuestMemoryFlags::Safe) {
m_memory.ReadBlock(m_addr, this->data(), this->size_bytes());
m_memory->ReadBlock(m_addr, this->data(), this->size_bytes());
} else {
m_memory.ReadBlockUnsafe(m_addr, this->data(), this->size_bytes());
m_memory->ReadBlockUnsafe(m_addr, this->data(), this->size_bytes());
}
}
return m_data_span;
@@ -133,18 +150,19 @@ public:
void Write(std::span<T> write_data) noexcept {
if constexpr (FLAGS & GuestMemoryFlags::Cached) {
m_memory.WriteBlockCached(m_addr, write_data.data(), this->size_bytes());
m_memory->WriteBlockCached(m_addr, write_data.data(), this->size_bytes());
} else if constexpr (FLAGS & GuestMemoryFlags::Safe) {
m_memory.WriteBlock(m_addr, write_data.data(), this->size_bytes());
m_memory->WriteBlock(m_addr, write_data.data(), this->size_bytes());
} else {
m_memory.WriteBlockUnsafe(m_addr, write_data.data(), this->size_bytes());
m_memory->WriteBlockUnsafe(m_addr, write_data.data(), this->size_bytes());
}
}
bool TrySetSpan() noexcept {
if (u8* ptr = m_memory.GetSpan(m_addr, this->size_bytes()); ptr) {
if (u8* ptr = m_memory->GetSpan(m_addr, this->size_bytes()); ptr) {
m_data_span = {reinterpret_cast<T*>(ptr), this->size()};
m_span_valid = true;
m_is_data_copy = false;
return true;
}
return false;
@@ -159,7 +177,7 @@ protected:
return m_addr_changed;
}
M& m_memory;
M* m_memory;
u64 m_addr{};
size_t m_size{};
std::span<T> m_data_span{};
@@ -175,17 +193,7 @@ public:
GuestMemoryScoped() = delete;
explicit GuestMemoryScoped(M& memory, u64 addr, std::size_t size,
Common::ScratchBuffer<T>* backup = nullptr)
: GuestMemory<M, T, FLAGS>(memory, addr, size, backup) {
if constexpr (!(FLAGS & GuestMemoryFlags::Read)) {
if (!this->TrySetSpan()) {
if (backup) {
this->m_data_span = *backup;
this->m_span_valid = true;
this->m_is_data_copy = true;
}
}
}
}
: GuestMemory<M, T, FLAGS>(memory, addr, size, backup) {}
~GuestMemoryScoped() {
if constexpr (FLAGS & GuestMemoryFlags::Write) {
@@ -196,15 +204,17 @@ public:
if (this->AddressChanged() || this->IsDataCopy()) {
ASSERT(this->m_span_valid);
if constexpr (FLAGS & GuestMemoryFlags::Cached) {
this->m_memory.WriteBlockCached(this->m_addr, this->data(), this->size_bytes());
this->m_memory->WriteBlockCached(this->m_addr, this->data(),
this->size_bytes());
} else if constexpr (FLAGS & GuestMemoryFlags::Safe) {
this->m_memory.WriteBlock(this->m_addr, this->data(), this->size_bytes());
this->m_memory->WriteBlock(this->m_addr, this->data(), this->size_bytes());
} else {
this->m_memory.WriteBlockUnsafe(this->m_addr, this->data(), this->size_bytes());
this->m_memory->WriteBlockUnsafe(this->m_addr, this->data(),
this->size_bytes());
}
} else if constexpr ((FLAGS & GuestMemoryFlags::Safe) ||
(FLAGS & GuestMemoryFlags::Cached)) {
this->m_memory.InvalidateRegion(this->m_addr, this->size_bytes());
this->m_memory->InvalidateRegion(this->m_addr, this->size_bytes());
}
}
}

View File

@@ -4,8 +4,9 @@
#include <random>
#include "common/scope_exit.h"
#include "common/settings.h"
#include "core/arm/dynarmic/arm_dynarmic.h"
#include "core/arm/dynarmic/dynarmic_exclusive_monitor.h"
#include "core/core.h"
#include "core/gpu_dirty_memory_manager.h"
#include "core/hle/kernel/k_process.h"
#include "core/hle/kernel/k_scoped_resource_reservation.h"
#include "core/hle/kernel/k_shared_memory.h"
@@ -1258,6 +1259,10 @@ void KProcess::InitializeInterfaces() {
#ifdef HAS_NCE
if (this->IsApplication() && Settings::IsNceEnabled()) {
// Register the scoped JIT handler before creating any NCE instances
// so that its signal handler will appear first in the signal chain.
Core::ScopedJitExecution::RegisterHandler();
for (size_t i = 0; i < Core::Hardware::NUM_CPU_CORES; i++) {
m_arm_interfaces[i] = std::make_unique<Core::ArmNce>(m_kernel.System(), true, i);
}

View File

@@ -130,9 +130,9 @@ enum class AppletProgramId : u64 {
enum class LibraryAppletMode : u32 {
AllForeground = 0,
Background = 1,
NoUI = 2,
BackgroundIndirectDisplay = 3,
PartialForeground = 1,
NoUi = 2,
PartialForegroundIndirectDisplay = 3,
AllForegroundInitiallyHidden = 4,
};

View File

@@ -68,9 +68,9 @@ void SoftwareKeyboard::Initialize() {
case LibraryAppletMode::AllForeground:
InitializeForeground();
break;
case LibraryAppletMode::Background:
case LibraryAppletMode::BackgroundIndirectDisplay:
InitializeBackground(applet_mode);
case LibraryAppletMode::PartialForeground:
case LibraryAppletMode::PartialForegroundIndirectDisplay:
InitializePartialForeground(applet_mode);
break;
default:
ASSERT_MSG(false, "Invalid LibraryAppletMode={}", applet_mode);
@@ -243,7 +243,7 @@ void SoftwareKeyboard::InitializeForeground() {
InitializeFrontendNormalKeyboard();
}
void SoftwareKeyboard::InitializeBackground(LibraryAppletMode library_applet_mode) {
void SoftwareKeyboard::InitializePartialForeground(LibraryAppletMode library_applet_mode) {
LOG_INFO(Service_AM, "Initializing Inline Software Keyboard Applet.");
is_background = true;
@@ -258,9 +258,9 @@ void SoftwareKeyboard::InitializeBackground(LibraryAppletMode library_applet_mod
swkbd_inline_initialize_arg.size());
if (swkbd_initialize_arg.library_applet_mode_flag) {
ASSERT(library_applet_mode == LibraryAppletMode::Background);
ASSERT(library_applet_mode == LibraryAppletMode::PartialForeground);
} else {
ASSERT(library_applet_mode == LibraryAppletMode::BackgroundIndirectDisplay);
ASSERT(library_applet_mode == LibraryAppletMode::PartialForegroundIndirectDisplay);
}
}

View File

@@ -62,7 +62,7 @@ private:
void InitializeForeground();
/// Initializes the inline software keyboard.
void InitializeBackground(LibraryAppletMode library_applet_mode);
void InitializePartialForeground(LibraryAppletMode library_applet_mode);
/// Processes the text check sent by the application.
void ProcessTextCheck();

View File

@@ -1,6 +1,7 @@
// SPDX-FileCopyrightText: Copyright 2024 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include "common/settings.h"
#include "core/hle/kernel/k_transfer_memory.h"
#include "core/hle/service/am/applet_data_broker.h"
#include "core/hle/service/am/applet_manager.h"
@@ -16,6 +17,34 @@ namespace Service::AM {
namespace {
bool ShouldCreateGuestApplet(AppletId applet_id) {
#define X(Name, name) \
if (applet_id == AppletId::Name && \
Settings::values.name##_applet_mode.GetValue() != Settings::AppletMode::LLE) { \
return false; \
}
X(Cabinet, cabinet)
X(Controller, controller)
X(DataErase, data_erase)
X(Error, error)
X(NetConnect, net_connect)
X(ProfileSelect, player_select)
X(SoftwareKeyboard, swkbd)
X(MiiEdit, mii_edit)
X(Web, web)
X(Shop, shop)
X(PhotoViewer, photo_viewer)
X(OfflineWeb, offline_web)
X(LoginShare, login_share)
X(WebAuth, wifi_web_auth)
X(MyPage, my_page)
#undef X
return true;
}
AppletProgramId AppletIdToProgramId(AppletId applet_id) {
switch (applet_id) {
case AppletId::OverlayDisplay:
@@ -63,9 +92,10 @@ AppletProgramId AppletIdToProgramId(AppletId applet_id) {
}
}
[[maybe_unused]] std::shared_ptr<ILibraryAppletAccessor> CreateGuestApplet(
Core::System& system, std::shared_ptr<Applet> caller_applet, AppletId applet_id,
LibraryAppletMode mode) {
std::shared_ptr<ILibraryAppletAccessor> CreateGuestApplet(Core::System& system,
std::shared_ptr<Applet> caller_applet,
AppletId applet_id,
LibraryAppletMode mode) {
const auto program_id = static_cast<u64>(AppletIdToProgramId(applet_id));
if (program_id == 0) {
// Unknown applet
@@ -87,7 +117,7 @@ AppletProgramId AppletIdToProgramId(AppletId applet_id) {
// Set focus state
switch (mode) {
case LibraryAppletMode::AllForeground:
case LibraryAppletMode::NoUI:
case LibraryAppletMode::NoUi:
applet->focus_state = FocusState::InFocus;
applet->hid_registration.EnableAppletToGetInput(true);
applet->message_queue.PushMessage(AppletMessageQueue::AppletMessage::ChangeIntoForeground);
@@ -99,8 +129,8 @@ AppletProgramId AppletIdToProgramId(AppletId applet_id) {
applet->hid_registration.EnableAppletToGetInput(false);
applet->message_queue.PushMessage(AppletMessageQueue::AppletMessage::FocusStateChanged);
break;
case LibraryAppletMode::Background:
case LibraryAppletMode::BackgroundIndirectDisplay:
case LibraryAppletMode::PartialForeground:
case LibraryAppletMode::PartialForegroundIndirectDisplay:
default:
applet->focus_state = FocusState::Background;
applet->hid_registration.EnableAppletToGetInput(true);
@@ -117,9 +147,10 @@ AppletProgramId AppletIdToProgramId(AppletId applet_id) {
return std::make_shared<ILibraryAppletAccessor>(system, broker, applet);
}
[[maybe_unused]] std::shared_ptr<ILibraryAppletAccessor> CreateFrontendApplet(
Core::System& system, std::shared_ptr<Applet> caller_applet, AppletId applet_id,
LibraryAppletMode mode) {
std::shared_ptr<ILibraryAppletAccessor> CreateFrontendApplet(Core::System& system,
std::shared_ptr<Applet> caller_applet,
AppletId applet_id,
LibraryAppletMode mode) {
const auto program_id = static_cast<u64>(AppletIdToProgramId(applet_id));
auto process = std::make_unique<Process>(system);
@@ -163,7 +194,13 @@ void ILibraryAppletCreator::CreateLibraryApplet(HLERequestContext& ctx) {
LOG_DEBUG(Service_AM, "called with applet_id={:08X}, applet_mode={:08X}", applet_id,
applet_mode);
auto library_applet = CreateFrontendApplet(system, applet, applet_id, applet_mode);
std::shared_ptr<ILibraryAppletAccessor> library_applet;
if (ShouldCreateGuestApplet(applet_id)) {
library_applet = CreateGuestApplet(system, applet, applet_id, applet_mode);
}
if (!library_applet) {
library_applet = CreateFrontendApplet(system, applet, applet_id, applet_mode);
}
if (!library_applet) {
LOG_ERROR(Service_AM, "Applet doesn't exist! applet_id={}", applet_id);

View File

@@ -1,10 +1,13 @@
// SPDX-FileCopyrightText: Copyright 2024 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include "common/logging/log.h"
#include "core/hle/result.h"
#include "core/hle/service/am/am_results.h"
#include "core/hle/service/am/frontend/applets.h"
#include "core/hle/service/am/self_controller.h"
#include "core/hle/service/caps/caps_su.h"
#include "core/hle/service/hle_ipc.h"
#include "core/hle/service/ipc_helpers.h"
#include "core/hle/service/nvnflinger/fb_share_buffer_manager.h"
#include "core/hle/service/nvnflinger/nvnflinger.h"
@@ -47,7 +50,7 @@ ISelfController::ISelfController(Core::System& system_, std::shared_ptr<Applet>
{50, &ISelfController::SetHandlesRequestToDisplay, "SetHandlesRequestToDisplay"},
{51, &ISelfController::ApproveToDisplay, "ApproveToDisplay"},
{60, nullptr, "OverrideAutoSleepTimeAndDimmingTime"},
{61, nullptr, "SetMediaPlaybackState"},
{61, &ISelfController::SetMediaPlaybackState, "SetMediaPlaybackState"},
{62, &ISelfController::SetIdleTimeDetectionExtension, "SetIdleTimeDetectionExtension"},
{63, &ISelfController::GetIdleTimeDetectionExtension, "GetIdleTimeDetectionExtension"},
{64, nullptr, "SetInputDetectionSourceSet"},
@@ -288,7 +291,8 @@ void ISelfController::GetSystemSharedBufferHandle(HLERequestContext& ctx) {
}
Result ISelfController::EnsureBufferSharingEnabled(Kernel::KProcess* process) {
if (applet->system_buffer_manager.Initialize(&nvnflinger, process, applet->applet_id)) {
if (applet->system_buffer_manager.Initialize(&nvnflinger, process, applet->applet_id,
applet->library_applet_mode)) {
return ResultSuccess;
}
@@ -323,6 +327,16 @@ void ISelfController::ApproveToDisplay(HLERequestContext& ctx) {
rb.Push(ResultSuccess);
}
void ISelfController::SetMediaPlaybackState(HLERequestContext& ctx) {
IPC::RequestParser rp{ctx};
const u8 state = rp.Pop<u8>();
LOG_WARNING(Service_AM, "(STUBBED) called, state={}", state);
IPC::ResponseBuilder rb{ctx, 2};
rb.Push(ResultSuccess);
}
void ISelfController::SetIdleTimeDetectionExtension(HLERequestContext& ctx) {
IPC::RequestParser rp{ctx};

View File

@@ -3,6 +3,7 @@
#pragma once
#include "core/hle/service/hle_ipc.h"
#include "core/hle/service/kernel_helpers.h"
#include "core/hle/service/service.h"
@@ -38,6 +39,7 @@ private:
void CreateManagedDisplaySeparableLayer(HLERequestContext& ctx);
void SetHandlesRequestToDisplay(HLERequestContext& ctx);
void ApproveToDisplay(HLERequestContext& ctx);
void SetMediaPlaybackState(HLERequestContext& ctx);
void SetIdleTimeDetectionExtension(HLERequestContext& ctx);
void GetIdleTimeDetectionExtension(HLERequestContext& ctx);
void ReportUserIsActive(HLERequestContext& ctx);

View File

@@ -17,11 +17,12 @@ SystemBufferManager::~SystemBufferManager() {
// Clean up shared layers.
if (m_buffer_sharing_enabled) {
m_nvnflinger->GetSystemBufferManager().Finalize(m_process);
}
}
bool SystemBufferManager::Initialize(Nvnflinger::Nvnflinger* nvnflinger, Kernel::KProcess* process,
AppletId applet_id) {
AppletId applet_id, LibraryAppletMode mode) {
if (m_nvnflinger) {
return m_buffer_sharing_enabled;
}
@@ -36,9 +37,14 @@ bool SystemBufferManager::Initialize(Nvnflinger::Nvnflinger* nvnflinger, Kernel:
return false;
}
Nvnflinger::LayerBlending blending = Nvnflinger::LayerBlending::None;
if (mode == LibraryAppletMode::PartialForeground) {
blending = Nvnflinger::LayerBlending::Coverage;
}
const auto display_id = m_nvnflinger->OpenDisplay("Default").value();
const auto res = m_nvnflinger->GetSystemBufferManager().Initialize(
&m_system_shared_buffer_id, &m_system_shared_layer_id, display_id);
m_process, &m_system_shared_buffer_id, &m_system_shared_layer_id, display_id, blending);
if (res.IsSuccess()) {
m_buffer_sharing_enabled = true;
@@ -62,8 +68,12 @@ void SystemBufferManager::SetWindowVisibility(bool visible) {
Result SystemBufferManager::WriteAppletCaptureBuffer(bool* out_was_written,
s32* out_fbshare_layer_index) {
// TODO
R_SUCCEED();
if (!m_buffer_sharing_enabled) {
return VI::ResultPermissionDenied;
}
return m_nvnflinger->GetSystemBufferManager().WriteAppletCaptureBuffer(out_was_written,
out_fbshare_layer_index);
}
} // namespace Service::AM

View File

@@ -27,7 +27,8 @@ public:
SystemBufferManager();
~SystemBufferManager();
bool Initialize(Nvnflinger::Nvnflinger* flinger, Kernel::KProcess* process, AppletId applet_id);
bool Initialize(Nvnflinger::Nvnflinger* flinger, Kernel::KProcess* process, AppletId applet_id,
LibraryAppletMode mode);
void GetSystemSharedLayerHandle(u64* out_system_shared_buffer_id,
u64* out_system_shared_layer_id) {

View File

@@ -115,6 +115,11 @@ struct ArgumentTraits {
static constexpr ArgumentType Type = ArgumentType::InData;
};
template <typename... Ts>
consteval bool ConstIfReference() {
return ((!std::is_reference_v<Ts> || std::is_const_v<std::remove_reference_t<Ts>>) && ... && true);
}
struct RequestLayout {
u32 copy_handle_count;
u32 move_handle_count;
@@ -435,6 +440,7 @@ void CmifReplyWrapImpl(HLERequestContext& ctx, T& t, Result (T::*f)(A...)) {
}
const bool is_domain = Domain ? ctx.GetManager()->IsDomain() : false;
static_assert(ConstIfReference<A...>(), "Arguments taken by reference must be const");
using MethodArguments = std::tuple<std::remove_cvref_t<A>...>;
OutTemporaryBuffers buffers{};

View File

@@ -4,10 +4,9 @@
#pragma once
#include <memory>
#include <span>
#include "common/common_funcs.h"
#include "common/common_types.h"
#include "core/hle/service/hle_ipc.h"
namespace Service {
@@ -22,8 +21,10 @@ class Out {
public:
using Type = T;
/* implicit */ Out(const Out& t) : raw(t.raw) {}
/* implicit */ Out(AutoOut<Type>& t) : raw(&t.raw) {}
/* implicit */ Out(Type* t) : raw(t) {}
Out& operator=(const Out&) = delete;
Type* Get() const {
return raw;
@@ -37,6 +38,10 @@ public:
return raw;
}
operator Type*() const {
return raw;
}
private:
Type* raw;
};
@@ -113,8 +118,10 @@ class OutCopyHandle {
public:
using Type = T*;
/* implicit */ OutCopyHandle(const OutCopyHandle& t) : raw(t.raw) {}
/* implicit */ OutCopyHandle(AutoOut<Type>& t) : raw(&t.raw) {}
/* implicit */ OutCopyHandle(Type* t) : raw(t) {}
OutCopyHandle& operator=(const OutCopyHandle&) = delete;
Type* Get() const {
return raw;
@@ -128,6 +135,10 @@ public:
return raw;
}
operator Type*() const {
return raw;
}
private:
Type* raw;
};
@@ -137,8 +148,10 @@ class OutMoveHandle {
public:
using Type = T*;
/* implicit */ OutMoveHandle(const OutMoveHandle& t) : raw(t.raw) {}
/* implicit */ OutMoveHandle(AutoOut<Type>& t) : raw(&t.raw) {}
/* implicit */ OutMoveHandle(Type* t) : raw(t) {}
OutMoveHandle& operator=(const OutMoveHandle&) = delete;
Type* Get() const {
return raw;
@@ -152,6 +165,10 @@ public:
return raw;
}
operator Type*() const {
return raw;
}
private:
Type* raw;
};
@@ -248,8 +265,10 @@ public:
static constexpr BufferAttr Attr = static_cast<BufferAttr>(A | BufferAttr_In | BufferAttr_FixedSize);
using Type = T;
/* implicit */ OutLargeData(const OutLargeData& t) : raw(t.raw) {}
/* implicit */ OutLargeData(Type* t) : raw(t) {}
/* implicit */ OutLargeData(AutoOut<T>& t) : raw(&t.raw) {}
OutLargeData& operator=(const OutLargeData&) = delete;
Type* Get() const {
return raw;
@@ -263,6 +282,10 @@ public:
return raw;
}
operator Type*() const {
return raw;
}
private:
Type* raw;
};

View File

@@ -115,6 +115,11 @@ private:
if (type->GetName() == "save") {
for (const auto& save_id : type->GetSubdirectories()) {
for (const auto& user_id : save_id->GetSubdirectories()) {
// Skip non user id subdirectories
if (user_id->GetName().size() != 0x20) {
continue;
}
const auto save_id_numeric = stoull_be(save_id->GetName());
auto user_id_numeric = Common::HexStringToArray<0x10>(user_id->GetName());
std::reverse(user_id_numeric.begin(), user_id_numeric.end());
@@ -160,6 +165,10 @@ private:
} else if (space == FileSys::SaveDataSpaceId::TemporaryStorage) {
// Temporary Storage
for (const auto& user_id : type->GetSubdirectories()) {
// Skip non user id subdirectories
if (user_id->GetName().size() != 0x20) {
continue;
}
for (const auto& title_id : user_id->GetSubdirectories()) {
if (!title_id->GetFiles().empty() ||
!title_id->GetSubdirectories().empty()) {

View File

@@ -65,6 +65,7 @@ Result MountTimeZoneBinary(Core::System& system) {
// Validate that the romfs is readable; using invalid firmware keys can cause this to get
// set even though the files are garbage. In that case, we want to hit the next path and
// synthesise them instead.
g_time_zone_binary_mount_result = ResultSuccess;
Service::PSC::Time::LocationName name{"Etc/GMT"};
if (!IsTimeZoneBinaryValid(name)) {
ResetTimeZoneBinary();

View File

@@ -49,6 +49,7 @@ SessionId Container::OpenSession(Kernel::KProcess* process) {
continue;
}
if (session.process == process) {
session.ref_count++;
return session.id;
}
}
@@ -66,6 +67,7 @@ SessionId Container::OpenSession(Kernel::KProcess* process) {
}
auto& session = impl->sessions[new_id];
session.is_active = true;
session.ref_count = 1;
// Optimization
if (process->IsApplication()) {
auto& page_table = process->GetPageTable().GetBasePageTable();
@@ -114,8 +116,11 @@ SessionId Container::OpenSession(Kernel::KProcess* process) {
void Container::CloseSession(SessionId session_id) {
std::scoped_lock lk(impl->session_guard);
impl->file.UnmapAllHandles(session_id);
auto& session = impl->sessions[session_id.id];
if (--session.ref_count > 0) {
return;
}
impl->file.UnmapAllHandles(session_id);
auto& smmu = impl->host1x.MemoryManager();
if (session.has_preallocated_area) {
const DAddr region_start = session.mapper->GetRegionStart();

View File

@@ -46,6 +46,7 @@ struct Session {
bool has_preallocated_area{};
std::unique_ptr<HeapMapper> mapper{};
bool is_active{};
s32 ref_count{};
};
class Container {
@@ -67,10 +68,7 @@ public:
const SyncpointManager& GetSyncpointManager() const;
struct Host1xDeviceFileData {
std::unordered_map<DeviceFD, u32> fd_to_id{};
std::deque<u32> syncpts_accumulated{};
u32 nvdec_next_id{};
u32 vic_next_id{};
};
Host1xDeviceFileData& Host1xDeviceFile();

View File

@@ -333,9 +333,13 @@ void NvMap::UnmapAllHandles(NvCore::SessionId session_id) {
}();
for (auto& [id, handle] : handles_copy) {
if (handle->session_id.id == session_id.id) {
FreeHandle(id, false);
{
std::scoped_lock lk{handle->mutex};
if (handle->session_id.id != session_id.id || handle->dupes <= 0) {
continue;
}
}
FreeHandle(id, false);
}
}

View File

@@ -15,6 +15,22 @@
namespace Service::Nvidia::Devices {
namespace {
Tegra::BlendMode ConvertBlending(Service::Nvnflinger::LayerBlending blending) {
switch (blending) {
case Service::Nvnflinger::LayerBlending::None:
default:
return Tegra::BlendMode::Opaque;
case Service::Nvnflinger::LayerBlending::Premultiplied:
return Tegra::BlendMode::Premultiplied;
case Service::Nvnflinger::LayerBlending::Coverage:
return Tegra::BlendMode::Coverage;
}
}
} // namespace
nvdisp_disp0::nvdisp_disp0(Core::System& system_, NvCore::Container& core)
: nvdevice{system_}, container{core}, nvmap{core.GetNvMapFile()} {}
nvdisp_disp0::~nvdisp_disp0() = default;
@@ -56,6 +72,7 @@ void nvdisp_disp0::Composite(std::span<const Nvnflinger::HwcLayer> sorted_layers
.pixel_format = layer.format,
.transform_flags = layer.transform,
.crop_rect = layer.crop_rect,
.blending = ConvertBlending(layer.blending),
});
for (size_t i = 0; i < layer.acquire_fence.num_fences; i++) {

View File

@@ -8,6 +8,7 @@
#include "core/hle/service/nvdrv/core/container.h"
#include "core/hle/service/nvdrv/devices/ioctl_serialization.h"
#include "core/hle/service/nvdrv/devices/nvhost_nvdec.h"
#include "video_core/host1x/host1x.h"
#include "video_core/renderer_base.h"
namespace Service::Nvidia::Devices {
@@ -21,13 +22,8 @@ NvResult nvhost_nvdec::Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8> in
switch (command.group) {
case 0x0:
switch (command.cmd) {
case 0x1: {
auto& host1x_file = core.Host1xDeviceFile();
if (!host1x_file.fd_to_id.contains(fd)) {
host1x_file.fd_to_id[fd] = host1x_file.nvdec_next_id++;
}
case 0x1:
return WrapFixedVariable(this, &nvhost_nvdec::Submit, input, output, fd);
}
case 0x2:
return WrapFixed(this, &nvhost_nvdec::GetSyncpoint, input, output);
case 0x3:
@@ -72,15 +68,12 @@ void nvhost_nvdec::OnOpen(NvCore::SessionId session_id, DeviceFD fd) {
LOG_INFO(Service_NVDRV, "NVDEC video stream started");
system.SetNVDECActive(true);
sessions[fd] = session_id;
host1x.StartDevice(fd, Tegra::Host1x::ChannelType::NvDec, channel_syncpoint);
}
void nvhost_nvdec::OnClose(DeviceFD fd) {
LOG_INFO(Service_NVDRV, "NVDEC video stream ended");
auto& host1x_file = core.Host1xDeviceFile();
const auto iter = host1x_file.fd_to_id.find(fd);
if (iter != host1x_file.fd_to_id.end()) {
system.GPU().ClearCdmaInstance(iter->second);
}
host1x.StopDevice(fd, Tegra::Host1x::ChannelType::NvDec);
system.SetNVDECActive(false);
auto it = sessions.find(fd);
if (it != sessions.end()) {

View File

@@ -55,8 +55,9 @@ std::size_t WriteVectors(std::span<u8> dst, const std::vector<T>& src, std::size
nvhost_nvdec_common::nvhost_nvdec_common(Core::System& system_, NvCore::Container& core_,
NvCore::ChannelType channel_type_)
: nvdevice{system_}, core{core_}, syncpoint_manager{core.GetSyncpointManager()},
nvmap{core.GetNvMapFile()}, channel_type{channel_type_} {
: nvdevice{system_}, host1x{system_.Host1x()}, core{core_},
syncpoint_manager{core.GetSyncpointManager()}, nvmap{core.GetNvMapFile()},
channel_type{channel_type_} {
auto& syncpts_accumulated = core.Host1xDeviceFile().syncpts_accumulated;
if (syncpts_accumulated.empty()) {
channel_syncpoint = syncpoint_manager.AllocateSyncpoint(false);
@@ -95,24 +96,24 @@ NvResult nvhost_nvdec_common::Submit(IoctlSubmit& params, std::span<u8> data, De
offset += SliceVectors(data, syncpt_increments, params.syncpoint_count, offset);
offset += SliceVectors(data, fence_thresholds, params.fence_count, offset);
auto& gpu = system.GPU();
auto* session = core.GetSession(sessions[fd]);
if (gpu.UseNvdec()) {
for (std::size_t i = 0; i < syncpt_increments.size(); i++) {
const SyncptIncr& syncpt_incr = syncpt_increments[i];
fence_thresholds[i] =
syncpoint_manager.IncrementSyncpointMaxExt(syncpt_incr.id, syncpt_incr.increments);
}
for (std::size_t i = 0; i < syncpt_increments.size(); i++) {
const SyncptIncr& syncpt_incr = syncpt_increments[i];
fence_thresholds[i] =
syncpoint_manager.IncrementSyncpointMaxExt(syncpt_incr.id, syncpt_incr.increments);
}
for (const auto& cmd_buffer : command_buffers) {
const auto object = nvmap.GetHandle(cmd_buffer.memory_id);
ASSERT_OR_EXECUTE(object, return NvResult::InvalidState;);
Tegra::ChCommandHeaderList cmdlist(cmd_buffer.word_count);
session->process->GetMemory().ReadBlock(object->address + cmd_buffer.offset, cmdlist.data(),
cmdlist.size() * sizeof(u32));
gpu.PushCommandBuffer(core.Host1xDeviceFile().fd_to_id[fd], cmdlist);
Core::Memory::CpuGuestMemory<Tegra::ChCommandHeader,
Core::Memory::GuestMemoryFlags::SafeRead>
cmdlist(session->process->GetMemory(), object->address + cmd_buffer.offset,
cmd_buffer.word_count);
host1x.PushEntries(fd, std::move(cmdlist));
}
// Some games expect command_buffers to be written back
offset = 0;
offset += WriteVectors(data, command_buffers, offset);

View File

@@ -119,6 +119,7 @@ protected:
Kernel::KEvent* QueryEvent(u32 event_id) override;
Tegra::Host1x::Host1x& host1x;
u32 channel_syncpoint;
s32_le nvmap_fd{};
u32_le submit_timeout{};

View File

@@ -7,6 +7,7 @@
#include "core/hle/service/nvdrv/core/container.h"
#include "core/hle/service/nvdrv/devices/ioctl_serialization.h"
#include "core/hle/service/nvdrv/devices/nvhost_vic.h"
#include "video_core/host1x/host1x.h"
#include "video_core/renderer_base.h"
namespace Service::Nvidia::Devices {
@@ -21,13 +22,8 @@ NvResult nvhost_vic::Ioctl1(DeviceFD fd, Ioctl command, std::span<const u8> inpu
switch (command.group) {
case 0x0:
switch (command.cmd) {
case 0x1: {
auto& host1x_file = core.Host1xDeviceFile();
if (!host1x_file.fd_to_id.contains(fd)) {
host1x_file.fd_to_id[fd] = host1x_file.vic_next_id++;
}
case 0x1:
return WrapFixedVariable(this, &nvhost_vic::Submit, input, output, fd);
}
case 0x2:
return WrapFixed(this, &nvhost_vic::GetSyncpoint, input, output);
case 0x3:
@@ -70,14 +66,11 @@ NvResult nvhost_vic::Ioctl3(DeviceFD fd, Ioctl command, std::span<const u8> inpu
void nvhost_vic::OnOpen(NvCore::SessionId session_id, DeviceFD fd) {
sessions[fd] = session_id;
host1x.StartDevice(fd, Tegra::Host1x::ChannelType::VIC, channel_syncpoint);
}
void nvhost_vic::OnClose(DeviceFD fd) {
auto& host1x_file = core.Host1xDeviceFile();
const auto iter = host1x_file.fd_to_id.find(fd);
if (iter != host1x_file.fd_to_id.end()) {
system.GPU().ClearCdmaInstance(iter->second);
}
host1x.StopDevice(fd, Tegra::Host1x::ChannelType::VIC);
sessions.erase(fd);
}

View File

@@ -14,24 +14,20 @@
#include "core/hle/service/nvnflinger/ui/graphic_buffer.h"
#include "core/hle/service/vi/layer/vi_layer.h"
#include "core/hle/service/vi/vi_results.h"
#include "video_core/gpu.h"
#include "video_core/host1x/host1x.h"
namespace Service::Nvnflinger {
namespace {
Result AllocateIoForProcessAddressSpace(Common::ProcessAddress* out_map_address,
std::unique_ptr<Kernel::KPageGroup>* out_page_group,
Core::System& system, u32 size) {
Result AllocateSharedBufferMemory(std::unique_ptr<Kernel::KPageGroup>* out_page_group,
Core::System& system, u32 size) {
using Core::Memory::YUZU_PAGESIZE;
// Allocate memory for the system shared buffer.
// FIXME: Because the gmmu can only point to cpu addresses, we need
// to map this in the application space to allow it to be used.
// FIXME: Add proper smmu emulation.
// FIXME: This memory belongs to vi's .data section.
auto& kernel = system.Kernel();
auto* process = system.ApplicationProcess();
auto& page_table = process->GetPageTable();
// Hold a temporary page group reference while we try to map it.
auto pg = std::make_unique<Kernel::KPageGroup>(
@@ -43,6 +39,30 @@ Result AllocateIoForProcessAddressSpace(Common::ProcessAddress* out_map_address,
Kernel::KMemoryManager::EncodeOption(Kernel::KMemoryManager::Pool::Secure,
Kernel::KMemoryManager::Direction::FromBack)));
// Fill the output data with red.
for (auto& block : *pg) {
u32* start = system.DeviceMemory().GetPointer<u32>(block.GetAddress());
u32* end = system.DeviceMemory().GetPointer<u32>(block.GetAddress() + block.GetSize());
for (; start < end; start++) {
*start = 0xFF0000FF;
}
}
// Return the mapped page group.
*out_page_group = std::move(pg);
// We succeeded.
R_SUCCEED();
}
Result MapSharedBufferIntoProcessAddressSpace(Common::ProcessAddress* out_map_address,
std::unique_ptr<Kernel::KPageGroup>& pg,
Kernel::KProcess* process, Core::System& system) {
using Core::Memory::YUZU_PAGESIZE;
auto& page_table = process->GetPageTable();
// Get bounds of where mapping is possible.
const VAddr alias_code_begin = GetInteger(page_table.GetAliasCodeRegionStart());
const VAddr alias_code_size = page_table.GetAliasCodeRegionSize() / YUZU_PAGESIZE;
@@ -64,9 +84,6 @@ Result AllocateIoForProcessAddressSpace(Common::ProcessAddress* out_map_address,
// Return failure, if necessary
R_UNLESS(i < 64, res);
// Return the mapped page group.
*out_page_group = std::move(pg);
// We succeeded.
R_SUCCEED();
}
@@ -135,6 +152,13 @@ Result AllocateHandleForBuffer(u32* out_handle, Nvidia::Module& nvdrv, Nvidia::D
R_RETURN(AllocNvMapHandle(*nvmap, *out_handle, buffer, size, nvmap_fd));
}
void FreeHandle(u32 handle, Nvidia::Module& nvdrv, Nvidia::DeviceFD nvmap_fd) {
auto nvmap = nvdrv.GetDevice<Nvidia::Devices::nvmap>(nvmap_fd);
ASSERT(nvmap != nullptr);
R_ASSERT(FreeNvMapHandle(*nvmap, handle, nvmap_fd));
}
constexpr auto SharedBufferBlockLinearFormat = android::PixelFormat::Rgba8888;
constexpr u32 SharedBufferBlockLinearBpp = 4;
@@ -186,53 +210,97 @@ FbShareBufferManager::FbShareBufferManager(Core::System& system, Nvnflinger& fli
FbShareBufferManager::~FbShareBufferManager() = default;
Result FbShareBufferManager::Initialize(u64* out_buffer_id, u64* out_layer_id, u64 display_id) {
Result FbShareBufferManager::Initialize(Kernel::KProcess* owner_process, u64* out_buffer_id,
u64* out_layer_handle, u64 display_id,
LayerBlending blending) {
std::scoped_lock lk{m_guard};
// Ensure we have not already created a buffer.
R_UNLESS(m_buffer_id == 0, VI::ResultOperationFailed);
// Ensure we haven't already created.
const u64 aruid = owner_process->GetProcessId();
R_UNLESS(!m_sessions.contains(aruid), VI::ResultPermissionDenied);
// Allocate memory and space for the shared buffer.
Common::ProcessAddress map_address;
R_TRY(AllocateIoForProcessAddressSpace(std::addressof(map_address),
std::addressof(m_buffer_page_group), m_system,
SharedBufferSize));
// Allocate memory for the shared buffer if needed.
if (!m_buffer_page_group) {
R_TRY(AllocateSharedBufferMemory(std::addressof(m_buffer_page_group), m_system,
SharedBufferSize));
// Record buffer id.
m_buffer_id = m_next_buffer_id++;
// Record display id.
m_display_id = display_id;
}
// Map into process.
Common::ProcessAddress map_address{};
R_TRY(MapSharedBufferIntoProcessAddressSpace(std::addressof(map_address), m_buffer_page_group,
owner_process, m_system));
// Create new session.
auto [it, was_emplaced] = m_sessions.emplace(aruid, FbShareSession{});
auto& session = it->second;
auto& container = m_nvdrv->GetContainer();
m_session_id = container.OpenSession(m_system.ApplicationProcess());
m_nvmap_fd = m_nvdrv->Open("/dev/nvmap", m_session_id);
session.session_id = container.OpenSession(owner_process);
session.nvmap_fd = m_nvdrv->Open("/dev/nvmap", session.session_id);
// Create an nvmap handle for the buffer and assign the memory to it.
R_TRY(AllocateHandleForBuffer(std::addressof(m_buffer_nvmap_handle), *m_nvdrv, m_nvmap_fd,
map_address, SharedBufferSize));
// Record the display id.
m_display_id = display_id;
R_TRY(AllocateHandleForBuffer(std::addressof(session.buffer_nvmap_handle), *m_nvdrv,
session.nvmap_fd, map_address, SharedBufferSize));
// Create and open a layer for the display.
m_layer_id = m_flinger.CreateLayer(m_display_id).value();
m_flinger.OpenLayer(m_layer_id);
// Set up the buffer.
m_buffer_id = m_next_buffer_id++;
session.layer_id = m_flinger.CreateLayer(m_display_id, blending).value();
m_flinger.OpenLayer(session.layer_id);
// Get the layer.
VI::Layer* layer = m_flinger.FindLayer(m_display_id, m_layer_id);
VI::Layer* layer = m_flinger.FindLayer(m_display_id, session.layer_id);
ASSERT(layer != nullptr);
// Get the producer and set preallocated buffers.
auto& producer = layer->GetBufferQueue();
MakeGraphicBuffer(producer, 0, m_buffer_nvmap_handle);
MakeGraphicBuffer(producer, 1, m_buffer_nvmap_handle);
MakeGraphicBuffer(producer, 0, session.buffer_nvmap_handle);
MakeGraphicBuffer(producer, 1, session.buffer_nvmap_handle);
// Assign outputs.
*out_buffer_id = m_buffer_id;
*out_layer_id = m_layer_id;
*out_layer_handle = session.layer_id;
// We succeeded.
R_SUCCEED();
}
void FbShareBufferManager::Finalize(Kernel::KProcess* owner_process) {
std::scoped_lock lk{m_guard};
if (m_buffer_id == 0) {
return;
}
const u64 aruid = owner_process->GetProcessId();
const auto it = m_sessions.find(aruid);
if (it == m_sessions.end()) {
return;
}
auto& session = it->second;
// Destroy the layer.
m_flinger.DestroyLayer(session.layer_id);
// Close nvmap handle.
FreeHandle(session.buffer_nvmap_handle, *m_nvdrv, session.nvmap_fd);
// Close nvmap device.
m_nvdrv->Close(session.nvmap_fd);
// Close session.
auto& container = m_nvdrv->GetContainer();
container.CloseSession(session.session_id);
// Erase.
m_sessions.erase(it);
}
Result FbShareBufferManager::GetSharedBufferMemoryHandleId(u64* out_buffer_size,
s32* out_nvmap_handle,
SharedMemoryPoolLayout* out_pool_layout,
@@ -242,17 +310,18 @@ Result FbShareBufferManager::GetSharedBufferMemoryHandleId(u64* out_buffer_size,
R_UNLESS(m_buffer_id > 0, VI::ResultNotFound);
R_UNLESS(buffer_id == m_buffer_id, VI::ResultNotFound);
R_UNLESS(m_sessions.contains(applet_resource_user_id), VI::ResultNotFound);
*out_pool_layout = SharedBufferPoolLayout;
*out_buffer_size = SharedBufferSize;
*out_nvmap_handle = m_buffer_nvmap_handle;
*out_nvmap_handle = m_sessions[applet_resource_user_id].buffer_nvmap_handle;
R_SUCCEED();
}
Result FbShareBufferManager::GetLayerFromId(VI::Layer** out_layer, u64 layer_id) {
// Ensure the layer id is valid.
R_UNLESS(m_layer_id > 0 && layer_id == m_layer_id, VI::ResultNotFound);
R_UNLESS(layer_id > 0, VI::ResultNotFound);
// Get the layer.
VI::Layer* layer = m_flinger.FindLayer(m_display_id, layer_id);
@@ -309,6 +378,10 @@ Result FbShareBufferManager::PresentSharedFrameBuffer(android::Fence fence,
android::Status::NoError,
VI::ResultOperationFailed);
ON_RESULT_FAILURE {
producer.CancelBuffer(static_cast<s32>(slot), fence);
};
// Queue the buffer to the producer.
android::QueueBufferInput input{};
android::QueueBufferOutput output{};
@@ -342,4 +415,33 @@ Result FbShareBufferManager::GetSharedFrameBufferAcquirableEvent(Kernel::KReadab
R_SUCCEED();
}
Result FbShareBufferManager::WriteAppletCaptureBuffer(bool* out_was_written, s32* out_layer_index) {
std::vector<u8> capture_buffer(m_system.GPU().GetAppletCaptureBuffer());
Common::ScratchBuffer<u32> scratch;
// TODO: this could be optimized
s64 e = -1280 * 768 * 4;
for (auto& block : *m_buffer_page_group) {
u8* start = m_system.DeviceMemory().GetPointer<u8>(block.GetAddress());
u8* end = m_system.DeviceMemory().GetPointer<u8>(block.GetAddress() + block.GetSize());
for (; start < end; start++) {
*start = 0;
if (e >= 0 && e < static_cast<s64>(capture_buffer.size())) {
*start = capture_buffer[e];
}
e++;
}
m_system.GPU().Host1x().MemoryManager().ApplyOpOnPointer(start, scratch, [&](DAddr addr) {
m_system.GPU().InvalidateRegion(addr, end - start);
});
}
*out_was_written = true;
*out_layer_index = 1;
R_SUCCEED();
}
} // namespace Service::Nvnflinger

View File

@@ -3,9 +3,12 @@
#pragma once
#include <map>
#include "common/math_util.h"
#include "core/hle/service/nvdrv/core/container.h"
#include "core/hle/service/nvdrv/nvdata.h"
#include "core/hle/service/nvnflinger/hwc_layer.h"
#include "core/hle/service/nvnflinger/nvnflinger.h"
#include "core/hle/service/nvnflinger/ui/fence.h"
@@ -29,13 +32,18 @@ struct SharedMemoryPoolLayout {
};
static_assert(sizeof(SharedMemoryPoolLayout) == 0x188, "SharedMemoryPoolLayout has wrong size");
struct FbShareSession;
class FbShareBufferManager final {
public:
explicit FbShareBufferManager(Core::System& system, Nvnflinger& flinger,
std::shared_ptr<Nvidia::Module> nvdrv);
~FbShareBufferManager();
Result Initialize(u64* out_buffer_id, u64* out_layer_handle, u64 display_id);
Result Initialize(Kernel::KProcess* owner_process, u64* out_buffer_id, u64* out_layer_handle,
u64 display_id, LayerBlending blending);
void Finalize(Kernel::KProcess* owner_process);
Result GetSharedBufferMemoryHandleId(u64* out_buffer_size, s32* out_nvmap_handle,
SharedMemoryPoolLayout* out_pool_layout, u64 buffer_id,
u64 applet_resource_user_id);
@@ -45,6 +53,8 @@ public:
u32 transform, s32 swap_interval, u64 layer_id, s64 slot);
Result GetSharedFrameBufferAcquirableEvent(Kernel::KReadableEvent** out_event, u64 layer_id);
Result WriteAppletCaptureBuffer(bool* out_was_written, s32* out_layer_index);
private:
Result GetLayerFromId(VI::Layer** out_layer, u64 layer_id);
@@ -52,11 +62,8 @@ private:
u64 m_next_buffer_id = 1;
u64 m_display_id = 0;
u64 m_buffer_id = 0;
u64 m_layer_id = 0;
u32 m_buffer_nvmap_handle = 0;
SharedMemoryPoolLayout m_pool_layout = {};
Nvidia::DeviceFD m_nvmap_fd = {};
Nvidia::NvCore::SessionId m_session_id = {};
std::map<u64, FbShareSession> m_sessions;
std::unique_ptr<Kernel::KPageGroup> m_buffer_page_group;
std::mutex m_guard;
@@ -65,4 +72,11 @@ private:
std::shared_ptr<Nvidia::Module> m_nvdrv;
};
struct FbShareSession {
Nvidia::DeviceFD nvmap_fd = {};
Nvidia::NvCore::SessionId session_id = {};
u64 layer_id = {};
u32 buffer_nvmap_handle = 0;
};
} // namespace Service::Nvnflinger

View File

@@ -109,6 +109,7 @@ u32 HardwareComposer::ComposeLocked(f32* out_speed_scale, VI::Display& display,
.height = igbp_buffer.Height(),
.stride = igbp_buffer.Stride(),
.z_index = 0,
.blending = layer.GetBlending(),
.transform = static_cast<android::BufferTransformFlags>(item.transform),
.crop_rect = item.crop,
.acquire_fence = item.fence,

View File

@@ -11,6 +11,18 @@
namespace Service::Nvnflinger {
// hwc_layer_t::blending values
enum class LayerBlending : u32 {
// No blending
None = 0x100,
// ONE / ONE_MINUS_SRC_ALPHA
Premultiplied = 0x105,
// SRC_ALPHA / ONE_MINUS_SRC_ALPHA
Coverage = 0x405,
};
struct HwcLayer {
u32 buffer_handle;
u32 offset;
@@ -19,6 +31,7 @@ struct HwcLayer {
u32 height;
u32 stride;
s32 z_index;
LayerBlending blending;
android::BufferTransformFlags transform;
Common::Rectangle<int> crop_rect;
android::Fence acquire_fence;

View File

@@ -157,7 +157,7 @@ bool Nvnflinger::CloseDisplay(u64 display_id) {
return true;
}
std::optional<u64> Nvnflinger::CreateLayer(u64 display_id) {
std::optional<u64> Nvnflinger::CreateLayer(u64 display_id, LayerBlending blending) {
const auto lock_guard = Lock();
auto* const display = FindDisplay(display_id);
@@ -166,13 +166,14 @@ std::optional<u64> Nvnflinger::CreateLayer(u64 display_id) {
}
const u64 layer_id = next_layer_id++;
CreateLayerAtId(*display, layer_id);
CreateLayerAtId(*display, layer_id, blending);
return layer_id;
}
void Nvnflinger::CreateLayerAtId(VI::Display& display, u64 layer_id) {
void Nvnflinger::CreateLayerAtId(VI::Display& display, u64 layer_id, LayerBlending blending) {
const auto buffer_id = next_buffer_queue_id++;
display.CreateLayer(layer_id, buffer_id, nvdrv->container);
display.FindLayer(layer_id)->SetBlending(blending);
}
bool Nvnflinger::OpenLayer(u64 layer_id) {

View File

@@ -15,6 +15,7 @@
#include "common/thread.h"
#include "core/hle/result.h"
#include "core/hle/service/kernel_helpers.h"
#include "core/hle/service/nvnflinger/hwc_layer.h"
namespace Common {
class Event;
@@ -72,7 +73,8 @@ public:
/// Creates a layer on the specified display and returns the layer ID.
///
/// If an invalid display ID is specified, then an empty optional is returned.
[[nodiscard]] std::optional<u64> CreateLayer(u64 display_id);
[[nodiscard]] std::optional<u64> CreateLayer(u64 display_id,
LayerBlending blending = LayerBlending::None);
/// Opens a layer on all displays for the given layer ID.
bool OpenLayer(u64 layer_id);
@@ -128,7 +130,7 @@ private:
[[nodiscard]] VI::Layer* FindLayer(u64 display_id, u64 layer_id);
/// Creates a layer with the specified layer ID in the desired display.
void CreateLayerAtId(VI::Display& display, u64 layer_id);
void CreateLayerAtId(VI::Display& display, u64 layer_id, LayerBlending blending);
void SplitVSync(std::stop_token stop_token);

View File

@@ -1,6 +1,7 @@
// SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include "core/hle/service/nvnflinger/hwc_layer.h"
#include "core/hle/service/vi/layer/vi_layer.h"
namespace Service::VI {
@@ -8,8 +9,9 @@ namespace Service::VI {
Layer::Layer(u64 layer_id_, u32 binder_id_, android::BufferQueueCore& core_,
android::BufferQueueProducer& binder_,
std::shared_ptr<android::BufferItemConsumer>&& consumer_)
: layer_id{layer_id_}, binder_id{binder_id_}, core{core_}, binder{binder_},
consumer{std::move(consumer_)}, open{false}, visible{true} {}
: layer_id{layer_id_}, binder_id{binder_id_}, core{core_}, binder{binder_}, consumer{std::move(
consumer_)},
blending{Nvnflinger::LayerBlending::None}, open{false}, visible{true} {}
Layer::~Layer() = default;

View File

@@ -14,6 +14,10 @@ class BufferQueueCore;
class BufferQueueProducer;
} // namespace Service::android
namespace Service::Nvnflinger {
enum class LayerBlending : u32;
}
namespace Service::VI {
/// Represents a single display layer.
@@ -92,12 +96,21 @@ public:
return !std::exchange(open, true);
}
Nvnflinger::LayerBlending GetBlending() {
return blending;
}
void SetBlending(Nvnflinger::LayerBlending b) {
blending = b;
}
private:
const u64 layer_id;
const u32 binder_id;
android::BufferQueueCore& core;
android::BufferQueueProducer& binder;
std::shared_ptr<android::BufferItemConsumer> consumer;
Service::Nvnflinger::LayerBlending blending;
bool open;
bool visible;
};

View File

@@ -64,6 +64,8 @@ public:
Memory(Memory&&) = default;
Memory& operator=(Memory&&) = delete;
static constexpr bool HAS_FLUSH_INVALIDATION = false;
/**
* Resets the state of the Memory system.
*/

View File

@@ -401,6 +401,14 @@ void Config::ReadNetworkValues() {
EndGroup();
}
void Config::ReadLibraryAppletValues() {
BeginGroup(Settings::TranslateCategory(Settings::Category::LibraryApplet));
ReadCategory(Settings::Category::LibraryApplet);
EndGroup();
}
void Config::ReadValues() {
if (global) {
ReadDataStorageValues();
@@ -410,6 +418,7 @@ void Config::ReadValues() {
ReadServiceValues();
ReadWebServiceValues();
ReadMiscellaneousValues();
ReadLibraryAppletValues();
}
ReadControlValues();
ReadCoreValues();
@@ -511,6 +520,7 @@ void Config::SaveValues() {
SaveNetworkValues();
SaveWebServiceValues();
SaveMiscellaneousValues();
SaveLibraryAppletValues();
} else {
LOG_DEBUG(Config, "Saving only generic configuration values");
}
@@ -691,6 +701,14 @@ void Config::SaveWebServiceValues() {
EndGroup();
}
void Config::SaveLibraryAppletValues() {
BeginGroup(Settings::TranslateCategory(Settings::Category::LibraryApplet));
WriteCategory(Settings::Category::LibraryApplet);
EndGroup();
}
bool Config::ReadBooleanSetting(const std::string& key, const std::optional<bool> default_value) {
std::string full_key = GetFullKey(key, false);
if (!default_value.has_value()) {

View File

@@ -88,6 +88,7 @@ protected:
void ReadSystemValues();
void ReadWebServiceValues();
void ReadNetworkValues();
void ReadLibraryAppletValues();
// Read platform specific sections
virtual void ReadHidbusValues() = 0;
@@ -121,6 +122,7 @@ protected:
void SaveScreenshotValues();
void SaveSystemValues();
void SaveWebServiceValues();
void SaveLibraryAppletValues();
// Save platform specific sections
virtual void SaveHidbusValues() = 0;

View File

@@ -52,9 +52,42 @@ ResourceManager::ResourceManager(Core::System& system_,
std::shared_ptr<HidFirmwareSettings> settings)
: firmware_settings{settings}, system{system_}, service_context{system_, "hid"} {
applet_resource = std::make_shared<AppletResource>(system);
// Register update callbacks
npad_update_event = Core::Timing::CreateEvent("HID::UpdatePadCallback",
[this](s64 time, std::chrono::nanoseconds ns_late)
-> std::optional<std::chrono::nanoseconds> {
UpdateNpad(ns_late);
return std::nullopt;
});
default_update_event = Core::Timing::CreateEvent(
"HID::UpdateDefaultCallback",
[this](s64 time,
std::chrono::nanoseconds ns_late) -> std::optional<std::chrono::nanoseconds> {
UpdateControllers(ns_late);
return std::nullopt;
});
mouse_keyboard_update_event = Core::Timing::CreateEvent(
"HID::UpdateMouseKeyboardCallback",
[this](s64 time,
std::chrono::nanoseconds ns_late) -> std::optional<std::chrono::nanoseconds> {
UpdateMouseKeyboard(ns_late);
return std::nullopt;
});
motion_update_event = Core::Timing::CreateEvent(
"HID::UpdateMotionCallback",
[this](s64 time,
std::chrono::nanoseconds ns_late) -> std::optional<std::chrono::nanoseconds> {
UpdateMotion(ns_late);
return std::nullopt;
});
}
ResourceManager::~ResourceManager() {
system.CoreTiming().UnscheduleEvent(npad_update_event);
system.CoreTiming().UnscheduleEvent(default_update_event);
system.CoreTiming().UnscheduleEvent(mouse_keyboard_update_event);
system.CoreTiming().UnscheduleEvent(motion_update_event);
system.CoreTiming().UnscheduleEvent(touch_update_event);
input_event->Finalize();
};
@@ -201,6 +234,7 @@ void ResourceManager::InitializeHidCommonSampler() {
debug_pad->SetAppletResource(applet_resource, &shared_mutex);
digitizer->SetAppletResource(applet_resource, &shared_mutex);
unique_pad->SetAppletResource(applet_resource, &shared_mutex);
keyboard->SetAppletResource(applet_resource, &shared_mutex);
const auto settings =
@@ -214,6 +248,14 @@ void ResourceManager::InitializeHidCommonSampler() {
home_button->SetAppletResource(applet_resource, &shared_mutex);
sleep_button->SetAppletResource(applet_resource, &shared_mutex);
capture_button->SetAppletResource(applet_resource, &shared_mutex);
system.CoreTiming().ScheduleLoopingEvent(npad_update_ns, npad_update_ns, npad_update_event);
system.CoreTiming().ScheduleLoopingEvent(default_update_ns, default_update_ns,
default_update_event);
system.CoreTiming().ScheduleLoopingEvent(mouse_keyboard_update_ns, mouse_keyboard_update_ns,
mouse_keyboard_update_event);
system.CoreTiming().ScheduleLoopingEvent(motion_update_ns, motion_update_ns,
motion_update_event);
}
void ResourceManager::InitializeTouchScreenSampler() {
@@ -465,55 +507,9 @@ IAppletResource::IAppletResource(Core::System& system_, std::shared_ptr<Resource
{0, &IAppletResource::GetSharedMemoryHandle, "GetSharedMemoryHandle"},
};
RegisterHandlers(functions);
// Register update callbacks
npad_update_event = Core::Timing::CreateEvent(
"HID::UpdatePadCallback",
[this, resource](
s64 time, std::chrono::nanoseconds ns_late) -> std::optional<std::chrono::nanoseconds> {
const auto guard = LockService();
resource->UpdateNpad(ns_late);
return std::nullopt;
});
default_update_event = Core::Timing::CreateEvent(
"HID::UpdateDefaultCallback",
[this, resource](
s64 time, std::chrono::nanoseconds ns_late) -> std::optional<std::chrono::nanoseconds> {
const auto guard = LockService();
resource->UpdateControllers(ns_late);
return std::nullopt;
});
mouse_keyboard_update_event = Core::Timing::CreateEvent(
"HID::UpdateMouseKeyboardCallback",
[this, resource](
s64 time, std::chrono::nanoseconds ns_late) -> std::optional<std::chrono::nanoseconds> {
const auto guard = LockService();
resource->UpdateMouseKeyboard(ns_late);
return std::nullopt;
});
motion_update_event = Core::Timing::CreateEvent(
"HID::UpdateMotionCallback",
[this, resource](
s64 time, std::chrono::nanoseconds ns_late) -> std::optional<std::chrono::nanoseconds> {
const auto guard = LockService();
resource->UpdateMotion(ns_late);
return std::nullopt;
});
system.CoreTiming().ScheduleLoopingEvent(npad_update_ns, npad_update_ns, npad_update_event);
system.CoreTiming().ScheduleLoopingEvent(default_update_ns, default_update_ns,
default_update_event);
system.CoreTiming().ScheduleLoopingEvent(mouse_keyboard_update_ns, mouse_keyboard_update_ns,
mouse_keyboard_update_event);
system.CoreTiming().ScheduleLoopingEvent(motion_update_ns, motion_update_ns,
motion_update_event);
}
IAppletResource::~IAppletResource() {
system.CoreTiming().UnscheduleEvent(npad_update_event);
system.CoreTiming().UnscheduleEvent(default_update_event);
system.CoreTiming().UnscheduleEvent(mouse_keyboard_update_event);
system.CoreTiming().UnscheduleEvent(motion_update_event);
resource_manager->FreeAppletResourceId(aruid);
}
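The hunks above move the periodic HID update callbacks out of IAppletResource and into ResourceManager: each event is created once with Core::Timing::CreateEvent, scheduled as a looping CoreTiming event, and unscheduled in the destructor. As a rough standalone analogue of that register-then-loop lifetime (plain C++20 with std::jthread; this is illustrative only and not yuzu's actual Core::Timing API, which runs on the emulated clock):

```cpp
// Minimal sketch of "create a callback, schedule it as a looping event, stop it on destruction".
// Standalone analogue for illustration; not yuzu's Core::Timing.
#include <chrono>
#include <functional>
#include <thread>
#include <vector>

class LoopingScheduler {
public:
    using Callback = std::function<void(std::chrono::nanoseconds lateness)>;

    // Fires `callback` every `interval` until the scheduler is destroyed,
    // passing how late the invocation was relative to its deadline.
    void ScheduleLooping(std::chrono::nanoseconds interval, Callback callback) {
        workers.emplace_back([interval, cb = std::move(callback)](std::stop_token stop) {
            auto deadline = std::chrono::steady_clock::now() + interval;
            while (!stop.stop_requested()) {
                std::this_thread::sleep_until(deadline);
                cb(std::chrono::steady_clock::now() - deadline);
                deadline += interval;
            }
        });
    }

private:
    std::vector<std::jthread> workers; // request_stop() and join() on destruction
};
```

ResourceManager's destructor unscheduling each event plays the role of the jthread members being stopped and joined here.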


@@ -147,6 +147,10 @@ private:
std::shared_ptr<SixAxis> six_axis{nullptr};
std::shared_ptr<SleepButton> sleep_button{nullptr};
std::shared_ptr<UniquePad> unique_pad{nullptr};
std::shared_ptr<Core::Timing::EventType> npad_update_event;
std::shared_ptr<Core::Timing::EventType> default_update_event;
std::shared_ptr<Core::Timing::EventType> mouse_keyboard_update_event;
std::shared_ptr<Core::Timing::EventType> motion_update_event;
// TODO: Create these resources
// std::shared_ptr<AudioControl> audio_control{nullptr};
@@ -179,11 +183,6 @@ public:
private:
void GetSharedMemoryHandle(HLERequestContext& ctx);
std::shared_ptr<Core::Timing::EventType> npad_update_event{nullptr};
std::shared_ptr<Core::Timing::EventType> default_update_event{nullptr};
std::shared_ptr<Core::Timing::EventType> mouse_keyboard_update_event{nullptr};
std::shared_ptr<Core::Timing::EventType> motion_update_event{nullptr};
u64 aruid{};
std::shared_ptr<ResourceManager> resource_manager;
};


@@ -17,10 +17,6 @@ void Digitizer::OnInit() {}
void Digitizer::OnRelease() {}
void Digitizer::OnUpdate(const Core::Timing::CoreTiming& core_timing) {
if (!smart_update) {
return;
}
std::scoped_lock shared_lock{*shared_mutex};
const u64 aruid = applet_resource->GetActiveAruid();
auto* data = applet_resource->GetAruidData(aruid);


@@ -20,8 +20,5 @@ public:
// When the controller is requesting an update for the shared memory
void OnUpdate(const Core::Timing::CoreTiming& core_timing) override;
private:
bool smart_update{};
};
} // namespace Service::HID


@@ -102,6 +102,8 @@ Result NPad::Activate(u64 aruid) {
for (std::size_t i = 0; i < 19; ++i) {
WriteEmptyEntry(npad);
}
controller.is_active = true;
}
return ResultSuccess;
@@ -467,6 +469,13 @@ void NPad::OnUpdate(const Core::Timing::CoreTiming& core_timing) {
continue;
}
bool is_set{};
npad_resource.IsSupportedNpadStyleSet(is_set, aruid);
// Wait until style is defined
if (!is_set) {
continue;
}
for (std::size_t i = 0; i < controller_data[aruid_index].size(); ++i) {
auto& controller = controller_data[aruid_index][i];
controller.shared_memory =
@@ -484,6 +493,10 @@ void NPad::OnUpdate(const Core::Timing::CoreTiming& core_timing) {
continue;
}
if (!controller.is_active) {
continue;
}
RequestPadStateUpdate(aruid, controller.device->GetNpadIdType());
auto& pad_state = controller.npad_pad_state;
auto& libnx_state = controller.npad_libnx_state;
@@ -592,7 +605,9 @@ void NPad::OnUpdate(const Core::Timing::CoreTiming& core_timing) {
libnx_state.npad_buttons.raw = pad_state.npad_buttons.raw;
libnx_state.l_stick = pad_state.l_stick;
libnx_state.r_stick = pad_state.r_stick;
npad->system_ext_lifo.WriteNextEntry(pad_state);
libnx_state.sampling_number =
npad->system_ext_lifo.ReadCurrentEntry().state.sampling_number + 1;
npad->system_ext_lifo.WriteNextEntry(libnx_state);
press_state |= static_cast<u64>(pad_state.npad_buttons.raw);
}
@@ -1060,6 +1075,7 @@ void NPad::UnregisterAppletResourceUserId(u64 aruid) {
// TODO: Remove this once abstract pad is emulated properly
const auto aruid_index = npad_resource.GetIndexFromAruid(aruid);
for (auto& controller : controller_data[aruid_index]) {
controller.is_active = false;
controller.is_connected = false;
controller.shared_memory = nullptr;
}


@@ -164,6 +164,7 @@ private:
NpadInternalState* shared_memory = nullptr;
Core::HID::EmulatedController* device = nullptr;
bool is_active{};
bool is_connected{};
// Dual joycons can have only one side connected


@@ -17,10 +17,6 @@ void UniquePad::OnInit() {}
void UniquePad::OnRelease() {}
void UniquePad::OnUpdate(const Core::Timing::CoreTiming& core_timing) {
if (!smart_update) {
return;
}
const u64 aruid = applet_resource->GetActiveAruid();
auto* data = applet_resource->GetAruidData(aruid);


@@ -20,8 +20,5 @@ public:
// When the controller is requesting an update for the shared memory
void OnUpdate(const Core::Timing::CoreTiming& core_timing) override;
private:
bool smart_update{};
};
} // namespace Service::HID


@@ -60,7 +60,8 @@ public:
Add(spv::ImageOperandsMask::ConstOffsets, offsets);
}
explicit ImageOperands(Id lod, Id ms) {
explicit ImageOperands(EmitContext& ctx, const IR::Value& offset, Id lod, Id ms) {
AddOffset(ctx, offset, ImageFetchOffsetAllowed);
if (Sirit::ValidId(lod)) {
Add(spv::ImageOperandsMask::Lod, lod);
}
@@ -311,37 +312,6 @@ Id ImageGatherSubpixelOffset(EmitContext& ctx, const IR::TextureInstInfo& info,
return coords;
}
}
void AddOffsetToCoordinates(EmitContext& ctx, const IR::TextureInstInfo& info, Id& coords,
Id offset) {
if (!Sirit::ValidId(offset)) {
return;
}
Id result_type{};
switch (info.type) {
case TextureType::Buffer:
case TextureType::Color1D:
case TextureType::ColorArray1D: {
result_type = ctx.U32[1];
break;
}
case TextureType::Color2D:
case TextureType::Color2DRect:
case TextureType::ColorArray2D: {
result_type = ctx.U32[2];
break;
}
case TextureType::Color3D: {
result_type = ctx.U32[3];
break;
}
case TextureType::ColorCube:
case TextureType::ColorArrayCube:
return;
}
coords = ctx.OpIAdd(result_type, coords, offset);
}
} // Anonymous namespace
Id EmitBindlessImageSampleImplicitLod(EmitContext&) {
@@ -524,10 +494,9 @@ Id EmitImageGatherDref(EmitContext& ctx, IR::Inst* inst, const IR::Value& index,
operands.Span());
}
Id EmitImageFetch(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id coords, Id offset,
Id lod, Id ms) {
Id EmitImageFetch(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id coords,
const IR::Value& offset, Id lod, Id ms) {
const auto info{inst->Flags<IR::TextureInstInfo>()};
AddOffsetToCoordinates(ctx, info, coords, offset);
if (info.type == TextureType::Buffer) {
lod = Id{};
}
@@ -535,7 +504,7 @@ Id EmitImageFetch(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id c
// This image is multisampled, lod must be implicit
lod = Id{};
}
const ImageOperands operands(lod, ms);
const ImageOperands operands(ctx, offset, lod, ms);
return Emit(&EmitContext::OpImageSparseFetch, &EmitContext::OpImageFetch, ctx, inst, ctx.F32[4],
TextureImage(ctx, info, index), coords, operands.MaskOptional(), operands.Span());
}


@@ -537,8 +537,8 @@ Id EmitImageGather(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id
const IR::Value& offset, const IR::Value& offset2);
Id EmitImageGatherDref(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id coords,
const IR::Value& offset, const IR::Value& offset2, Id dref);
Id EmitImageFetch(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id coords, Id offset,
Id lod, Id ms);
Id EmitImageFetch(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id coords,
const IR::Value& offset, Id lod, Id ms);
Id EmitImageQueryDimensions(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id lod,
const IR::Value& skip_mips);
Id EmitImageQueryLod(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id coords);


@@ -18,6 +18,7 @@ add_library(video_core STATIC
buffer_cache/usage_tracker.h
buffer_cache/word_manager.h
cache_types.h
capture.h
cdma_pusher.cpp
cdma_pusher.h
compatible_formats.cpp
@@ -59,8 +60,8 @@ add_library(video_core STATIC
framebuffer_config.h
fsr.cpp
fsr.h
host1x/codecs/codec.cpp
host1x/codecs/codec.h
host1x/codecs/decoder.cpp
host1x/codecs/decoder.h
host1x/codecs/h264.cpp
host1x/codecs/h264.h
host1x/codecs/vp8.cpp
@@ -79,8 +80,6 @@ add_library(video_core STATIC
host1x/nvdec.cpp
host1x/nvdec.h
host1x/nvdec_common.h
host1x/sync_manager.cpp
host1x/sync_manager.h
host1x/syncpoint_manager.cpp
host1x/syncpoint_manager.h
host1x/vic.cpp
@@ -101,6 +100,7 @@ add_library(video_core STATIC
memory_manager.cpp
memory_manager.h
precompiled_headers.h
present.h
pte_kind.h
query_cache/bank_base.h
query_cache/query_base.h


@@ -1546,7 +1546,10 @@ void BufferCache<P>::ImmediateUploadMemory([[maybe_unused]] Buffer& buffer,
std::span<const u8> upload_span;
const DAddr device_addr = buffer.CpuAddr() + copy.dst_offset;
if (IsRangeGranular(device_addr, copy.size)) {
upload_span = std::span(device_memory.GetPointer<u8>(device_addr), copy.size);
auto* const ptr = device_memory.GetPointer<u8>(device_addr);
if (ptr != nullptr) {
upload_span = std::span(ptr, copy.size);
}
} else {
if (immediate_buffer.empty()) {
immediate_buffer = ImmediateBuffer(largest_copy);

src/video_core/capture.h (new file, 36 lines)

@@ -0,0 +1,36 @@
// SPDX-FileCopyrightText: Copyright 2024 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once
#include "common/alignment.h"
#include "common/bit_util.h"
#include "common/common_types.h"
#include "core/frontend/framebuffer_layout.h"
#include "video_core/surface.h"
namespace VideoCore::Capture {
constexpr u32 BlockHeight = 4;
constexpr u32 BlockDepth = 0;
constexpr u32 BppLog2 = 2;
constexpr auto PixelFormat = Surface::PixelFormat::B8G8R8A8_UNORM;
constexpr auto LinearWidth = Layout::ScreenUndocked::Width;
constexpr auto LinearHeight = Layout::ScreenUndocked::Height;
constexpr auto LinearDepth = 1U;
constexpr auto BytesPerPixel = 4U;
constexpr auto TiledWidth = LinearWidth;
constexpr auto TiledHeight = Common::AlignUpLog2(LinearHeight, BlockHeight + BlockDepth + BppLog2);
constexpr auto TiledSize = TiledWidth * TiledHeight * (1 << BppLog2);
constexpr Layout::FramebufferLayout Layout{
.width = LinearWidth,
.height = LinearHeight,
.screen = {0, 0, LinearWidth, LinearHeight},
.is_srgb = false,
};
} // namespace VideoCore::Capture
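With the constants above, and assuming the undocked layout is 1280×720, the capture surface is a 4-byte-per-pixel B8G8R8A8 image whose tiled height is rounded up to a multiple of 2^(BlockHeight + BlockDepth + BppLog2) = 64 lines: 768 rows, for a 1280 × 768 × 4 = 3,932,160-byte tiled buffer. A small standalone re-derivation (the AlignUpLog2 here mirrors what Common::AlignUpLog2 is assumed to do):

```cpp
#include <cstdint>

// Standalone re-derivation of the tiled capture buffer size.
// Assumes the undocked screen is 1280x720 and that AlignUpLog2 rounds up to a power-of-two multiple.
constexpr std::uint32_t AlignUpLog2(std::uint32_t value, std::uint32_t align_log2) {
    const std::uint32_t mask = (1u << align_log2) - 1u;
    return (value + mask) & ~mask;
}

constexpr std::uint32_t LinearWidth = 1280;  // Layout::ScreenUndocked::Width (assumed)
constexpr std::uint32_t LinearHeight = 720;  // Layout::ScreenUndocked::Height (assumed)
constexpr std::uint32_t BlockHeight = 4, BlockDepth = 0, BppLog2 = 2;

constexpr std::uint32_t TiledHeight = AlignUpLog2(LinearHeight, BlockHeight + BlockDepth + BppLog2);
constexpr std::uint32_t TiledSize = LinearWidth * TiledHeight * (1u << BppLog2);

static_assert(TiledHeight == 768);
static_assert(TiledSize == 3'932'160);
```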


@@ -2,136 +2,130 @@
// SPDX-License-Identifier: MIT
#include <bit>
#include "common/thread.h"
#include "core/core.h"
#include "video_core/cdma_pusher.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/host1x/control.h"
#include "video_core/host1x/host1x.h"
#include "video_core/host1x/nvdec.h"
#include "video_core/host1x/nvdec_common.h"
#include "video_core/host1x/sync_manager.h"
#include "video_core/host1x/vic.h"
#include "video_core/memory_manager.h"
namespace Tegra {
CDmaPusher::CDmaPusher(Host1x::Host1x& host1x_)
: host1x{host1x_}, nvdec_processor(std::make_shared<Host1x::Nvdec>(host1x)),
vic_processor(std::make_unique<Host1x::Vic>(host1x, nvdec_processor)),
host1x_processor(std::make_unique<Host1x::Control>(host1x)),
sync_manager(std::make_unique<Host1x::SyncptIncrManager>(host1x)) {}
CDmaPusher::CDmaPusher(Host1x::Host1x& host1x_, s32 id)
: host1x{host1x_}, memory_manager{host1x.GMMU()},
host_processor{std::make_unique<Host1x::Control>(host1x_)}, current_class{
static_cast<ChClassId>(id)} {
thread = std::jthread([this](std::stop_token stop_token) { ProcessEntries(stop_token); });
}
CDmaPusher::~CDmaPusher() = default;
void CDmaPusher::ProcessEntries(ChCommandHeaderList&& entries) {
for (const auto& value : entries) {
if (mask != 0) {
const auto lbs = static_cast<u32>(std::countr_zero(mask));
mask &= ~(1U << lbs);
ExecuteCommand(offset + lbs, value.raw);
continue;
} else if (count != 0) {
--count;
ExecuteCommand(offset, value.raw);
if (incrementing) {
++offset;
void CDmaPusher::ProcessEntries(std::stop_token stop_token) {
Common::SetCurrentThreadPriority(Common::ThreadPriority::High);
ChCommandHeaderList command_list{host1x.System().ApplicationMemory(), 0, 0};
u32 count{};
u32 method_offset{};
u32 mask{};
bool incrementing{};
while (!stop_token.stop_requested()) {
{
std::unique_lock l{command_mutex};
Common::CondvarWait(command_cv, l, stop_token,
[this]() { return command_lists.size() > 0; });
if (stop_token.stop_requested()) {
return;
}
continue;
command_list = std::move(command_lists.front());
command_lists.pop_front();
}
const auto mode = value.submission_mode.Value();
switch (mode) {
case ChSubmissionMode::SetClass: {
mask = value.value & 0x3f;
offset = value.method_offset;
current_class = static_cast<ChClassId>((value.value >> 6) & 0x3ff);
break;
}
case ChSubmissionMode::Incrementing:
case ChSubmissionMode::NonIncrementing:
count = value.value;
offset = value.method_offset;
incrementing = mode == ChSubmissionMode::Incrementing;
break;
case ChSubmissionMode::Mask:
mask = value.value;
offset = value.method_offset;
break;
case ChSubmissionMode::Immediate: {
const u32 data = value.value & 0xfff;
offset = value.method_offset;
ExecuteCommand(offset, data);
break;
}
default:
UNIMPLEMENTED_MSG("ChSubmission mode {} is not implemented!", static_cast<u32>(mode));
break;
size_t i = 0;
for (const auto value : command_list) {
i++;
if (mask != 0) {
const auto lbs = static_cast<u32>(std::countr_zero(mask));
mask &= ~(1U << lbs);
ExecuteCommand(method_offset + lbs, value.raw);
continue;
} else if (count != 0) {
--count;
ExecuteCommand(method_offset, value.raw);
if (incrementing) {
++method_offset;
}
continue;
}
const auto mode = value.submission_mode.Value();
switch (mode) {
case ChSubmissionMode::SetClass: {
mask = value.value & 0x3f;
method_offset = value.method_offset;
current_class = static_cast<ChClassId>((value.value >> 6) & 0x3ff);
break;
}
case ChSubmissionMode::Incrementing:
case ChSubmissionMode::NonIncrementing:
count = value.value;
method_offset = value.method_offset;
incrementing = mode == ChSubmissionMode::Incrementing;
break;
case ChSubmissionMode::Mask:
mask = value.value;
method_offset = value.method_offset;
break;
case ChSubmissionMode::Immediate: {
const u32 data = value.value & 0xfff;
method_offset = value.method_offset;
ExecuteCommand(method_offset, data);
break;
}
default:
LOG_ERROR(HW_GPU, "Bad command at index {} (bytes 0x{:X}), buffer size {}", i - 1,
(i - 1) * sizeof(u32), command_list.size());
UNIMPLEMENTED_MSG("ChSubmission mode {} is not implemented!",
static_cast<u32>(mode));
break;
}
}
}
}
void CDmaPusher::ExecuteCommand(u32 state_offset, u32 data) {
void CDmaPusher::ExecuteCommand(u32 method, u32 arg) {
switch (current_class) {
case ChClassId::NvDec:
ThiStateWrite(nvdec_thi_state, offset, data);
switch (static_cast<ThiMethod>(offset)) {
case ThiMethod::IncSyncpt: {
LOG_DEBUG(Service_NVDRV, "NVDEC Class IncSyncpt Method");
const auto syncpoint_id = static_cast<u32>(data & 0xFF);
const auto cond = static_cast<u32>((data >> 8) & 0xFF);
if (cond == 0) {
sync_manager->Increment(syncpoint_id);
} else {
sync_manager->SignalDone(
sync_manager->IncrementWhenDone(static_cast<u32>(current_class), syncpoint_id));
}
break;
}
case ThiMethod::SetMethod1:
LOG_DEBUG(Service_NVDRV, "NVDEC method 0x{:X}",
static_cast<u32>(nvdec_thi_state.method_0));
nvdec_processor->ProcessMethod(nvdec_thi_state.method_0, data);
break;
default:
break;
}
break;
case ChClassId::GraphicsVic:
ThiStateWrite(vic_thi_state, static_cast<u32>(state_offset), {data});
switch (static_cast<ThiMethod>(state_offset)) {
case ThiMethod::IncSyncpt: {
LOG_DEBUG(Service_NVDRV, "VIC Class IncSyncpt Method");
const auto syncpoint_id = static_cast<u32>(data & 0xFF);
const auto cond = static_cast<u32>((data >> 8) & 0xFF);
if (cond == 0) {
sync_manager->Increment(syncpoint_id);
} else {
sync_manager->SignalDone(
sync_manager->IncrementWhenDone(static_cast<u32>(current_class), syncpoint_id));
}
break;
}
case ThiMethod::SetMethod1:
LOG_DEBUG(Service_NVDRV, "VIC method 0x{:X}, Args=({})",
static_cast<u32>(vic_thi_state.method_0), data);
vic_processor->ProcessMethod(static_cast<Host1x::Vic::Method>(vic_thi_state.method_0),
data);
break;
default:
break;
}
break;
case ChClassId::Control:
// This device is mainly for syncpoint synchronization
LOG_DEBUG(Service_NVDRV, "Host1X Class Method");
host1x_processor->ProcessMethod(static_cast<Host1x::Control::Method>(offset), data);
LOG_TRACE(Service_NVDRV, "Class {} method 0x{:X} arg 0x{:X}",
static_cast<u32>(current_class), method, arg);
host_processor->ProcessMethod(static_cast<Host1x::Control::Method>(method), arg);
break;
default:
UNIMPLEMENTED_MSG("Current class not implemented {:X}", static_cast<u32>(current_class));
break;
thi_regs.reg_array[method] = arg;
switch (static_cast<ThiMethod>(method)) {
case ThiMethod::IncSyncpt: {
const auto syncpoint_id = static_cast<u32>(arg & 0xFF);
[[maybe_unused]] const auto cond = static_cast<u32>((arg >> 8) & 0xFF);
LOG_TRACE(Service_NVDRV, "Class {} IncSyncpt Method, syncpt {} cond {}",
static_cast<u32>(current_class), syncpoint_id, cond);
auto& syncpoint_manager = host1x.GetSyncpointManager();
syncpoint_manager.IncrementGuest(syncpoint_id);
syncpoint_manager.IncrementHost(syncpoint_id);
break;
}
case ThiMethod::SetMethod1:
LOG_TRACE(Service_NVDRV, "Class {} method 0x{:X} arg 0x{:X}",
static_cast<u32>(current_class), static_cast<u32>(thi_regs.method_0), arg);
ProcessMethod(thi_regs.method_0, arg);
break;
default:
break;
}
}
}
void CDmaPusher::ThiStateWrite(ThiRegisters& state, u32 state_offset, u32 argument) {
u8* const offset_ptr = reinterpret_cast<u8*>(&state) + sizeof(u32) * state_offset;
std::memcpy(offset_ptr, &argument, sizeof(u32));
}
} // namespace Tegra


@@ -3,12 +3,18 @@
#pragma once
#include <condition_variable>
#include <deque>
#include <memory>
#include <mutex>
#include <thread>
#include <vector>
#include "common/bit_field.h"
#include "common/common_funcs.h"
#include "common/common_types.h"
#include "common/polyfill_thread.h"
#include "core/memory.h"
namespace Tegra {
@@ -62,23 +68,31 @@ struct ChCommand {
std::vector<u32> arguments;
};
using ChCommandHeaderList = std::vector<ChCommandHeader>;
using ChCommandHeaderList =
Core::Memory::CpuGuestMemory<Tegra::ChCommandHeader, Core::Memory::GuestMemoryFlags::SafeRead>;
struct ThiRegisters {
u32_le increment_syncpt{};
INSERT_PADDING_WORDS(1);
u32_le increment_syncpt_error{};
u32_le ctx_switch_incremement_syncpt{};
INSERT_PADDING_WORDS(4);
u32_le ctx_switch{};
INSERT_PADDING_WORDS(1);
u32_le ctx_syncpt_eof{};
INSERT_PADDING_WORDS(5);
u32_le method_0{};
u32_le method_1{};
INSERT_PADDING_WORDS(12);
u32_le int_status{};
u32_le int_mask{};
static constexpr std::size_t NUM_REGS = 0x20;
union {
struct {
u32_le increment_syncpt;
INSERT_PADDING_WORDS_NOINIT(1);
u32_le increment_syncpt_error;
u32_le ctx_switch_incremement_syncpt;
INSERT_PADDING_WORDS_NOINIT(4);
u32_le ctx_switch;
INSERT_PADDING_WORDS_NOINIT(1);
u32_le ctx_syncpt_eof;
INSERT_PADDING_WORDS_NOINIT(5);
u32_le method_0;
u32_le method_1;
INSERT_PADDING_WORDS_NOINIT(12);
u32_le int_status;
u32_le int_mask;
};
std::array<u32, NUM_REGS> reg_array;
};
};
enum class ThiMethod : u32 {
@@ -89,32 +103,39 @@ enum class ThiMethod : u32 {
class CDmaPusher {
public:
explicit CDmaPusher(Host1x::Host1x& host1x);
~CDmaPusher();
CDmaPusher() = delete;
virtual ~CDmaPusher();
/// Process the command entry
void ProcessEntries(ChCommandHeaderList&& entries);
void PushEntries(ChCommandHeaderList&& entries) {
std::scoped_lock l{command_mutex};
command_lists.push_back(std::move(entries));
command_cv.notify_one();
}
protected:
explicit CDmaPusher(Host1x::Host1x& host1x, s32 id);
virtual void ProcessMethod(u32 method, u32 arg) = 0;
Host1x::Host1x& host1x;
Tegra::MemoryManager& memory_manager;
private:
/// Process the command entry
void ProcessEntries(std::stop_token stop_token);
/// Invoke command class devices to execute the command based on the current state
void ExecuteCommand(u32 state_offset, u32 data);
/// Write arguments value to the ThiRegisters member at the specified offset
void ThiStateWrite(ThiRegisters& state, u32 offset, u32 argument);
std::unique_ptr<Host1x::Control> host_processor;
Host1x::Host1x& host1x;
std::shared_ptr<Tegra::Host1x::Nvdec> nvdec_processor;
std::unique_ptr<Tegra::Host1x::Vic> vic_processor;
std::unique_ptr<Tegra::Host1x::Control> host1x_processor;
std::unique_ptr<Host1x::SyncptIncrManager> sync_manager;
ChClassId current_class{};
ThiRegisters vic_thi_state{};
ThiRegisters nvdec_thi_state{};
std::mutex command_mutex;
std::condition_variable_any command_cv;
std::deque<ChCommandHeaderList> command_lists;
std::jthread thread;
u32 count{};
u32 offset{};
u32 mask{};
bool incrementing{};
ThiRegisters thi_regs{};
ChClassId current_class;
};
} // namespace Tegra
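The reworked CDmaPusher hands command lists to a dedicated worker: PushEntries locks the mutex, queues the list, and notifies, while ProcessEntries waits on a stop-token-aware condition variable and drains the deque. A stripped-down standalone sketch of that hand-off (the Entry type here is hypothetical; the real code uses ChCommandHeaderList and Common::CondvarWait):

```cpp
#include <condition_variable>
#include <deque>
#include <mutex>
#include <thread>
#include <vector>

// Minimal producer/consumer analogue of PushEntries -> ProcessEntries.
class CommandWorker {
public:
    using Entry = std::vector<unsigned>; // hypothetical stand-in for ChCommandHeaderList

    CommandWorker() {
        thread = std::jthread([this](std::stop_token stop) { Run(stop); });
    }

    void Push(Entry entries) {
        std::scoped_lock lock{mutex};
        queue.push_back(std::move(entries));
        cv.notify_one();
    }

private:
    void Run(std::stop_token stop) {
        while (!stop.stop_requested()) {
            Entry list;
            {
                std::unique_lock lock{mutex};
                // The stop_token overload wakes up and returns false when stop is requested.
                if (!cv.wait(lock, stop, [this] { return !queue.empty(); })) {
                    return;
                }
                list = std::move(queue.front());
                queue.pop_front();
            }
            Process(list);
        }
    }

    void Process(const Entry& /*list*/) {} // stand-in for the ChSubmissionMode state machine

    std::mutex mutex;
    std::condition_variable_any cv;
    std::deque<Entry> queue;
    std::jthread thread; // declared last: stopped and joined before the queue is destroyed
};
```

Declaring the std::jthread after the mutex, condition variable, and queue matters: it is stopped and joined first during destruction, which is also why the stop-token overload of wait is used.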


@@ -11,6 +11,12 @@
namespace Tegra {
enum class BlendMode {
Opaque,
Premultiplied,
Coverage,
};
/**
* Struct describing framebuffer configuration
*/
@@ -23,6 +29,7 @@ struct FramebufferConfig {
Service::android::PixelFormat pixel_format{};
Service::android::BufferTransformFlags transform_flags{};
Common::Rectangle<int> crop_rect{};
BlendMode blending{};
};
Common::Rectangle<f32> NormalizeCrop(const FramebufferConfig& framebuffer, u32 texture_width,


@@ -250,30 +250,6 @@ struct GPU::Impl {
gpu_thread.SubmitList(channel, std::move(entries));
}
/// Push GPU command buffer entries to be processed
void PushCommandBuffer(u32 id, Tegra::ChCommandHeaderList& entries) {
if (!use_nvdec) {
return;
}
if (!cdma_pushers.contains(id)) {
cdma_pushers.insert_or_assign(id, std::make_unique<Tegra::CDmaPusher>(host1x));
}
// SubmitCommandBuffer would make the nvdec operations async, this is not currently working
// TODO(ameerj): RE proper async nvdec operation
// gpu_thread.SubmitCommandBuffer(std::move(entries));
cdma_pushers[id]->ProcessEntries(std::move(entries));
}
/// Frees the CDMAPusher instance to free up resources
void ClearCdmaInstance(u32 id) {
const auto iter = cdma_pushers.find(id);
if (iter != cdma_pushers.end()) {
cdma_pushers.erase(iter);
}
}
/// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
void FlushRegion(DAddr addr, u64 size) {
gpu_thread.FlushRegion(addr, size);
@@ -347,11 +323,21 @@ struct GPU::Impl {
WaitForSyncOperation(wait_fence);
}
std::vector<u8> GetAppletCaptureBuffer() {
std::vector<u8> out;
const auto wait_fence =
RequestSyncOperation([&] { out = renderer->GetAppletCaptureBuffer(); });
gpu_thread.TickGPU();
WaitForSyncOperation(wait_fence);
return out;
}
GPU& gpu;
Core::System& system;
Host1x::Host1x& host1x;
std::map<u32, std::unique_ptr<Tegra::CDmaPusher>> cdma_pushers;
std::unique_ptr<VideoCore::RendererBase> renderer;
VideoCore::RasterizerInterface* rasterizer = nullptr;
const bool use_nvdec;
@@ -505,6 +491,10 @@ void GPU::RequestComposite(std::vector<Tegra::FramebufferConfig>&& layers,
impl->RequestComposite(std::move(layers), std::move(fences));
}
std::vector<u8> GPU::GetAppletCaptureBuffer() {
return impl->GetAppletCaptureBuffer();
}
u64 GPU::GetTicks() const {
return impl->GetTicks();
}
@@ -541,14 +531,6 @@ void GPU::PushGPUEntries(s32 channel, Tegra::CommandList&& entries) {
impl->PushGPUEntries(channel, std::move(entries));
}
void GPU::PushCommandBuffer(u32 id, Tegra::ChCommandHeaderList& entries) {
impl->PushCommandBuffer(id, entries);
}
void GPU::ClearCdmaInstance(u32 id) {
impl->ClearCdmaInstance(id);
}
VideoCore::RasterizerDownloadArea GPU::OnCPURead(PAddr addr, u64 size) {
return impl->OnCPURead(addr, size);
}


@@ -215,6 +215,8 @@ public:
void RequestComposite(std::vector<Tegra::FramebufferConfig>&& layers,
std::vector<Service::Nvidia::NvFence>&& fences);
std::vector<u8> GetAppletCaptureBuffer();
/// Performs any additional setup necessary in order to begin GPU emulation.
/// This can be used to launch any necessary threads and register any necessary
/// core timing events.
@@ -232,15 +234,6 @@ public:
/// Push GPU command entries to be processed
void PushGPUEntries(s32 channel, Tegra::CommandList&& entries);
/// Push GPU command buffer entries to be processed
void PushCommandBuffer(u32 id, Tegra::ChCommandHeaderList& entries);
/// Frees the CDMAPusher instance to free up resources
void ClearCdmaInstance(u32 id);
/// Swap buffers (render frame)
void SwapBuffers(const Tegra::FramebufferConfig* framebuffer);
/// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
[[nodiscard]] VideoCore::RasterizerDownloadArea OnCPURead(DAddr addr, u64 size);


@@ -12,6 +12,7 @@
#include "video_core/dma_pusher.h"
#include "video_core/gpu.h"
#include "video_core/gpu_thread.h"
#include "video_core/host1x/host1x.h"
#include "video_core/renderer_base.h"
namespace VideoCommon::GPUThread {


@@ -1,113 +0,0 @@
// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include "common/assert.h"
#include "common/settings.h"
#include "video_core/host1x/codecs/codec.h"
#include "video_core/host1x/codecs/h264.h"
#include "video_core/host1x/codecs/vp8.h"
#include "video_core/host1x/codecs/vp9.h"
#include "video_core/host1x/host1x.h"
#include "video_core/memory_manager.h"
namespace Tegra {
Codec::Codec(Host1x::Host1x& host1x_, const Host1x::NvdecCommon::NvdecRegisters& regs)
: host1x(host1x_), state{regs}, h264_decoder(std::make_unique<Decoder::H264>(host1x)),
vp8_decoder(std::make_unique<Decoder::VP8>(host1x)),
vp9_decoder(std::make_unique<Decoder::VP9>(host1x)) {}
Codec::~Codec() = default;
void Codec::Initialize() {
initialized = decode_api.Initialize(current_codec);
}
void Codec::SetTargetCodec(Host1x::NvdecCommon::VideoCodec codec) {
if (current_codec != codec) {
current_codec = codec;
LOG_INFO(Service_NVDRV, "NVDEC video codec initialized to {}", GetCurrentCodecName());
}
}
void Codec::Decode() {
const bool is_first_frame = !initialized;
if (is_first_frame) {
Initialize();
}
if (!initialized) {
return;
}
// Assemble bitstream.
bool vp9_hidden_frame = false;
size_t configuration_size = 0;
const auto packet_data = [&]() {
switch (current_codec) {
case Tegra::Host1x::NvdecCommon::VideoCodec::H264:
return h264_decoder->ComposeFrame(state, &configuration_size, is_first_frame);
case Tegra::Host1x::NvdecCommon::VideoCodec::VP8:
return vp8_decoder->ComposeFrame(state);
case Tegra::Host1x::NvdecCommon::VideoCodec::VP9:
vp9_decoder->ComposeFrame(state);
vp9_hidden_frame = vp9_decoder->WasFrameHidden();
return vp9_decoder->GetFrameBytes();
default:
ASSERT(false);
return std::span<const u8>{};
}
}();
// Send assembled bitstream to decoder.
if (!decode_api.SendPacket(packet_data, configuration_size)) {
return;
}
// Only receive/store visible frames.
if (vp9_hidden_frame) {
return;
}
// Receive output frames from decoder.
decode_api.ReceiveFrames(frames);
while (frames.size() > 10) {
LOG_DEBUG(HW_GPU, "ReceiveFrames overflow, dropped frame");
frames.pop();
}
}
std::unique_ptr<FFmpeg::Frame> Codec::GetCurrentFrame() {
// Sometimes VIC will request more frames than have been decoded.
// in this case, return a blank frame and don't overwrite previous data.
if (frames.empty()) {
return {};
}
auto frame = std::move(frames.front());
frames.pop();
return frame;
}
Host1x::NvdecCommon::VideoCodec Codec::GetCurrentCodec() const {
return current_codec;
}
std::string_view Codec::GetCurrentCodecName() const {
switch (current_codec) {
case Host1x::NvdecCommon::VideoCodec::None:
return "None";
case Host1x::NvdecCommon::VideoCodec::H264:
return "H264";
case Host1x::NvdecCommon::VideoCodec::VP8:
return "VP8";
case Host1x::NvdecCommon::VideoCodec::H265:
return "H265";
case Host1x::NvdecCommon::VideoCodec::VP9:
return "VP9";
default:
return "Unknown";
}
}
} // namespace Tegra


@@ -1,63 +0,0 @@
// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once
#include <memory>
#include <optional>
#include <string_view>
#include <queue>
#include "common/common_types.h"
#include "video_core/host1x/ffmpeg/ffmpeg.h"
#include "video_core/host1x/nvdec_common.h"
namespace Tegra {
namespace Decoder {
class H264;
class VP8;
class VP9;
} // namespace Decoder
namespace Host1x {
class Host1x;
} // namespace Host1x
class Codec {
public:
explicit Codec(Host1x::Host1x& host1x, const Host1x::NvdecCommon::NvdecRegisters& regs);
~Codec();
/// Initialize the codec, returning success or failure
void Initialize();
/// Sets NVDEC video stream codec
void SetTargetCodec(Host1x::NvdecCommon::VideoCodec codec);
/// Call decoders to construct headers, decode AVFrame with ffmpeg
void Decode();
/// Returns next decoded frame
[[nodiscard]] std::unique_ptr<FFmpeg::Frame> GetCurrentFrame();
/// Returns the value of current_codec
[[nodiscard]] Host1x::NvdecCommon::VideoCodec GetCurrentCodec() const;
/// Return name of the current codec
[[nodiscard]] std::string_view GetCurrentCodecName() const;
private:
bool initialized{};
Host1x::NvdecCommon::VideoCodec current_codec{Host1x::NvdecCommon::VideoCodec::None};
FFmpeg::DecodeApi decode_api;
Host1x::Host1x& host1x;
const Host1x::NvdecCommon::NvdecRegisters& state;
std::unique_ptr<Decoder::H264> h264_decoder;
std::unique_ptr<Decoder::VP8> vp8_decoder;
std::unique_ptr<Decoder::VP9> vp9_decoder;
std::queue<std::unique_ptr<FFmpeg::Frame>> frames{};
};
} // namespace Tegra


@@ -0,0 +1,69 @@
// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include "common/assert.h"
#include "common/settings.h"
#include "video_core/host1x/codecs/decoder.h"
#include "video_core/host1x/host1x.h"
#include "video_core/memory_manager.h"
namespace Tegra {
Decoder::Decoder(Host1x::Host1x& host1x_, s32 id_, const Host1x::NvdecCommon::NvdecRegisters& regs_,
Host1x::FrameQueue& frame_queue_)
: host1x(host1x_), memory_manager{host1x.GMMU()}, regs{regs_}, id{id_}, frame_queue{
frame_queue_} {}
Decoder::~Decoder() = default;
void Decoder::Decode() {
if (!initialized) {
return;
}
const auto packet_data = ComposeFrame();
// Send assembled bitstream to decoder.
if (!decode_api.SendPacket(packet_data)) {
return;
}
// Only receive/store visible frames.
if (vp9_hidden_frame) {
return;
}
// Receive output frames from decoder.
auto frame = decode_api.ReceiveFrame();
if (IsInterlaced()) {
auto [luma_top, luma_bottom, chroma_top, chroma_bottom] = GetInterlacedOffsets();
auto frame_copy = frame;
if (!frame.get()) {
LOG_ERROR(HW_GPU, "Failed to decode interlaced frame for top 0x{:X} bottom 0x{:X}",
luma_top, luma_bottom);
}
if (UsingDecodeOrder()) {
frame_queue.PushDecodeOrder(id, luma_top, std::move(frame));
frame_queue.PushDecodeOrder(id, luma_bottom, std::move(frame_copy));
} else {
frame_queue.PushPresentOrder(id, luma_top, std::move(frame));
frame_queue.PushPresentOrder(id, luma_bottom, std::move(frame_copy));
}
} else {
auto [luma_offset, chroma_offset] = GetProgressiveOffsets();
if (!frame.get()) {
LOG_ERROR(HW_GPU, "Failed to decode progressive frame for luma 0x{:X}", luma_offset);
}
if (UsingDecodeOrder()) {
frame_queue.PushDecodeOrder(id, luma_offset, std::move(frame));
} else {
frame_queue.PushPresentOrder(id, luma_offset, std::move(frame));
}
}
}
} // namespace Tegra


@@ -0,0 +1,64 @@
// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once
#include <memory>
#include <mutex>
#include <optional>
#include <string_view>
#include <unordered_map>
#include <queue>
#include "common/common_types.h"
#include "video_core/host1x/ffmpeg/ffmpeg.h"
#include "video_core/host1x/nvdec_common.h"
namespace Tegra {
namespace Host1x {
class Host1x;
class FrameQueue;
} // namespace Host1x
class Decoder {
public:
virtual ~Decoder();
/// Call decoders to construct headers, decode AVFrame with ffmpeg
void Decode();
bool UsingDecodeOrder() const {
return decode_api.UsingDecodeOrder();
}
/// Returns the value of current_codec
[[nodiscard]] Host1x::NvdecCommon::VideoCodec GetCurrentCodec() const {
return codec;
}
/// Return name of the current codec
[[nodiscard]] virtual std::string_view GetCurrentCodecName() const = 0;
protected:
explicit Decoder(Host1x::Host1x& host1x, s32 id,
const Host1x::NvdecCommon::NvdecRegisters& regs,
Host1x::FrameQueue& frame_queue);
virtual std::span<const u8> ComposeFrame() = 0;
virtual std::tuple<u64, u64> GetProgressiveOffsets() = 0;
virtual std::tuple<u64, u64, u64, u64> GetInterlacedOffsets() = 0;
virtual bool IsInterlaced() = 0;
Host1x::Host1x& host1x;
Tegra::MemoryManager& memory_manager;
const Host1x::NvdecCommon::NvdecRegisters& regs;
s32 id;
Host1x::FrameQueue& frame_queue;
Host1x::NvdecCommon::VideoCodec codec;
FFmpeg::DecodeApi decode_api;
bool initialized{};
bool vp9_hidden_frame{};
};
} // namespace Tegra


@@ -10,7 +10,7 @@
#include "video_core/host1x/host1x.h"
#include "video_core/memory_manager.h"
namespace Tegra::Decoder {
namespace Tegra::Decoders {
namespace {
// ZigZag LUTs from libavcodec.
constexpr std::array<u8, 64> zig_zag_direct{
@@ -25,23 +25,56 @@ constexpr std::array<u8, 16> zig_zag_scan{
};
} // Anonymous namespace
H264::H264(Host1x::Host1x& host1x_) : host1x{host1x_} {}
H264::H264(Host1x::Host1x& host1x_, const Host1x::NvdecCommon::NvdecRegisters& regs_, s32 id_,
Host1x::FrameQueue& frame_queue_)
: Decoder{host1x_, id_, regs_, frame_queue_} {
codec = Host1x::NvdecCommon::VideoCodec::H264;
initialized = decode_api.Initialize(codec);
}
H264::~H264() = default;
std::span<const u8> H264::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state,
size_t* out_configuration_size, bool is_first_frame) {
H264DecoderContext context;
host1x.GMMU().ReadBlock(state.picture_info_offset, &context, sizeof(H264DecoderContext));
std::tuple<u64, u64> H264::GetProgressiveOffsets() {
auto pic_idx{current_context.h264_parameter_set.curr_pic_idx};
auto luma{regs.surface_luma_offsets[pic_idx].Address() +
current_context.h264_parameter_set.luma_frame_offset.Address()};
auto chroma{regs.surface_chroma_offsets[pic_idx].Address() +
current_context.h264_parameter_set.chroma_frame_offset.Address()};
return {luma, chroma};
}
const s64 frame_number = context.h264_parameter_set.frame_number.Value();
std::tuple<u64, u64, u64, u64> H264::GetInterlacedOffsets() {
auto pic_idx{current_context.h264_parameter_set.curr_pic_idx};
auto luma_top{regs.surface_luma_offsets[pic_idx].Address() +
current_context.h264_parameter_set.luma_top_offset.Address()};
auto luma_bottom{regs.surface_luma_offsets[pic_idx].Address() +
current_context.h264_parameter_set.luma_bot_offset.Address()};
auto chroma_top{regs.surface_chroma_offsets[pic_idx].Address() +
current_context.h264_parameter_set.chroma_top_offset.Address()};
auto chroma_bottom{regs.surface_chroma_offsets[pic_idx].Address() +
current_context.h264_parameter_set.chroma_bot_offset.Address()};
return {luma_top, luma_bottom, chroma_top, chroma_bottom};
}
bool H264::IsInterlaced() {
return current_context.h264_parameter_set.luma_top_offset.Address() != 0 ||
current_context.h264_parameter_set.luma_bot_offset.Address() != 0;
}
std::span<const u8> H264::ComposeFrame() {
memory_manager.ReadBlock(regs.picture_info_offset.Address(), &current_context,
sizeof(H264DecoderContext));
const s64 frame_number = current_context.h264_parameter_set.frame_number.Value();
if (!is_first_frame && frame_number != 0) {
frame.resize_destructive(context.stream_len);
host1x.GMMU().ReadBlock(state.frame_bitstream_offset, frame.data(), frame.size());
*out_configuration_size = 0;
return frame;
frame_scratch.resize_destructive(current_context.stream_len);
memory_manager.ReadBlock(regs.frame_bitstream_offset.Address(), frame_scratch.data(),
frame_scratch.size());
return frame_scratch;
}
is_first_frame = false;
// Encode header
H264BitWriter writer{};
writer.WriteU(1, 24);
@@ -53,7 +86,7 @@ std::span<const u8> H264::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters
writer.WriteU(31, 8);
writer.WriteUe(0);
const u32 chroma_format_idc =
static_cast<u32>(context.h264_parameter_set.chroma_format_idc.Value());
static_cast<u32>(current_context.h264_parameter_set.chroma_format_idc.Value());
writer.WriteUe(chroma_format_idc);
if (chroma_format_idc == 3) {
writer.WriteBit(false);
@@ -61,42 +94,44 @@ std::span<const u8> H264::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters
writer.WriteUe(0);
writer.WriteUe(0);
writer.WriteBit(false); // QpprimeYZeroTransformBypassFlag
writer.WriteBit(current_context.qpprime_y_zero_transform_bypass_flag.Value() != 0);
writer.WriteBit(false); // Scaling matrix present flag
writer.WriteUe(static_cast<u32>(context.h264_parameter_set.log2_max_frame_num_minus4.Value()));
writer.WriteUe(
static_cast<u32>(current_context.h264_parameter_set.log2_max_frame_num_minus4.Value()));
const auto order_cnt_type =
static_cast<u32>(context.h264_parameter_set.pic_order_cnt_type.Value());
static_cast<u32>(current_context.h264_parameter_set.pic_order_cnt_type.Value());
writer.WriteUe(order_cnt_type);
if (order_cnt_type == 0) {
writer.WriteUe(context.h264_parameter_set.log2_max_pic_order_cnt_lsb_minus4);
writer.WriteUe(current_context.h264_parameter_set.log2_max_pic_order_cnt_lsb_minus4);
} else if (order_cnt_type == 1) {
writer.WriteBit(context.h264_parameter_set.delta_pic_order_always_zero_flag != 0);
writer.WriteBit(current_context.h264_parameter_set.delta_pic_order_always_zero_flag != 0);
writer.WriteSe(0);
writer.WriteSe(0);
writer.WriteUe(0);
}
const s32 pic_height = context.h264_parameter_set.frame_height_in_map_units /
(context.h264_parameter_set.frame_mbs_only_flag ? 1 : 2);
const s32 pic_height = current_context.h264_parameter_set.frame_height_in_mbs /
(current_context.h264_parameter_set.frame_mbs_only_flag ? 1 : 2);
// TODO (ameerj): Where do we get this number, it seems to be particular for each stream
const auto nvdec_decoding = Settings::values.nvdec_emulation.GetValue();
const bool uses_gpu_decoding = nvdec_decoding == Settings::NvdecEmulation::Gpu;
const u32 max_num_ref_frames = uses_gpu_decoding ? 6u : 16u;
u32 max_num_ref_frames =
std::max(std::max(current_context.h264_parameter_set.num_refidx_l0_default_active,
current_context.h264_parameter_set.num_refidx_l1_default_active) +
1,
4);
writer.WriteUe(max_num_ref_frames);
writer.WriteBit(false);
writer.WriteUe(context.h264_parameter_set.pic_width_in_mbs - 1);
writer.WriteUe(current_context.h264_parameter_set.pic_width_in_mbs - 1);
writer.WriteUe(pic_height - 1);
writer.WriteBit(context.h264_parameter_set.frame_mbs_only_flag != 0);
writer.WriteBit(current_context.h264_parameter_set.frame_mbs_only_flag != 0);
if (!context.h264_parameter_set.frame_mbs_only_flag) {
writer.WriteBit(context.h264_parameter_set.flags.mbaff_frame.Value() != 0);
if (!current_context.h264_parameter_set.frame_mbs_only_flag) {
writer.WriteBit(current_context.h264_parameter_set.flags.mbaff_frame.Value() != 0);
}
writer.WriteBit(context.h264_parameter_set.flags.direct_8x8_inference.Value() != 0);
writer.WriteBit(current_context.h264_parameter_set.flags.direct_8x8_inference.Value() != 0);
writer.WriteBit(false); // Frame cropping flag
writer.WriteBit(false); // VUI parameter present flag
@@ -111,57 +146,59 @@ std::span<const u8> H264::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters
writer.WriteUe(0);
writer.WriteUe(0);
writer.WriteBit(context.h264_parameter_set.entropy_coding_mode_flag != 0);
writer.WriteBit(context.h264_parameter_set.pic_order_present_flag != 0);
writer.WriteBit(current_context.h264_parameter_set.entropy_coding_mode_flag != 0);
writer.WriteBit(current_context.h264_parameter_set.pic_order_present_flag != 0);
writer.WriteUe(0);
writer.WriteUe(context.h264_parameter_set.num_refidx_l0_default_active);
writer.WriteUe(context.h264_parameter_set.num_refidx_l1_default_active);
writer.WriteBit(context.h264_parameter_set.flags.weighted_pred.Value() != 0);
writer.WriteU(static_cast<s32>(context.h264_parameter_set.weighted_bipred_idc.Value()), 2);
s32 pic_init_qp = static_cast<s32>(context.h264_parameter_set.pic_init_qp_minus26.Value());
writer.WriteUe(current_context.h264_parameter_set.num_refidx_l0_default_active);
writer.WriteUe(current_context.h264_parameter_set.num_refidx_l1_default_active);
writer.WriteBit(current_context.h264_parameter_set.flags.weighted_pred.Value() != 0);
writer.WriteU(static_cast<s32>(current_context.h264_parameter_set.weighted_bipred_idc.Value()),
2);
s32 pic_init_qp =
static_cast<s32>(current_context.h264_parameter_set.pic_init_qp_minus26.Value());
writer.WriteSe(pic_init_qp);
writer.WriteSe(0);
s32 chroma_qp_index_offset =
static_cast<s32>(context.h264_parameter_set.chroma_qp_index_offset.Value());
static_cast<s32>(current_context.h264_parameter_set.chroma_qp_index_offset.Value());
writer.WriteSe(chroma_qp_index_offset);
writer.WriteBit(context.h264_parameter_set.deblocking_filter_control_present_flag != 0);
writer.WriteBit(context.h264_parameter_set.flags.constrained_intra_pred.Value() != 0);
writer.WriteBit(context.h264_parameter_set.redundant_pic_cnt_present_flag != 0);
writer.WriteBit(context.h264_parameter_set.transform_8x8_mode_flag != 0);
writer.WriteBit(current_context.h264_parameter_set.deblocking_filter_control_present_flag != 0);
writer.WriteBit(current_context.h264_parameter_set.flags.constrained_intra_pred.Value() != 0);
writer.WriteBit(current_context.h264_parameter_set.redundant_pic_cnt_present_flag != 0);
writer.WriteBit(current_context.h264_parameter_set.transform_8x8_mode_flag != 0);
writer.WriteBit(true); // pic_scaling_matrix_present_flag
for (s32 index = 0; index < 6; index++) {
writer.WriteBit(true);
std::span<const u8> matrix{context.weight_scale};
writer.WriteScalingList(scan, matrix, index * 16, 16);
std::span<const u8> matrix{current_context.weight_scale_4x4};
writer.WriteScalingList(scan_scratch, matrix, index * 16, 16);
}
if (context.h264_parameter_set.transform_8x8_mode_flag) {
if (current_context.h264_parameter_set.transform_8x8_mode_flag) {
for (s32 index = 0; index < 2; index++) {
writer.WriteBit(true);
std::span<const u8> matrix{context.weight_scale_8x8};
writer.WriteScalingList(scan, matrix, index * 64, 64);
std::span<const u8> matrix{current_context.weight_scale_8x8};
writer.WriteScalingList(scan_scratch, matrix, index * 64, 64);
}
}
s32 chroma_qp_index_offset2 =
static_cast<s32>(context.h264_parameter_set.second_chroma_qp_index_offset.Value());
static_cast<s32>(current_context.h264_parameter_set.second_chroma_qp_index_offset.Value());
writer.WriteSe(chroma_qp_index_offset2);
writer.End();
const auto& encoded_header = writer.GetByteArray();
frame.resize(encoded_header.size() + context.stream_len);
std::memcpy(frame.data(), encoded_header.data(), encoded_header.size());
frame_scratch.resize(encoded_header.size() + current_context.stream_len);
std::memcpy(frame_scratch.data(), encoded_header.data(), encoded_header.size());
*out_configuration_size = encoded_header.size();
host1x.GMMU().ReadBlock(state.frame_bitstream_offset, frame.data() + encoded_header.size(),
context.stream_len);
memory_manager.ReadBlock(regs.frame_bitstream_offset.Address(),
frame_scratch.data() + encoded_header.size(),
current_context.stream_len);
return frame;
return frame_scratch;
}
H264BitWriter::H264BitWriter() = default;
@@ -278,4 +315,4 @@ void H264BitWriter::Flush() {
buffer = 0;
buffer_pos = 0;
}
} // namespace Tegra::Decoder
} // namespace Tegra::Decoders


@@ -10,6 +10,7 @@
#include "common/common_funcs.h"
#include "common/common_types.h"
#include "common/scratch_buffer.h"
#include "video_core/host1x/codecs/decoder.h"
#include "video_core/host1x/nvdec_common.h"
namespace Tegra {
@@ -18,7 +19,7 @@ namespace Host1x {
class Host1x;
} // namespace Host1x
namespace Decoder {
namespace Decoders {
class H264BitWriter {
public:
@@ -60,123 +61,213 @@ private:
std::vector<u8> byte_array;
};
class H264 {
public:
explicit H264(Host1x::Host1x& host1x);
~H264();
/// Compose the H264 frame for FFmpeg decoding
[[nodiscard]] std::span<const u8> ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state,
size_t* out_configuration_size,
bool is_first_frame = false);
struct Offset {
constexpr u32 Address() const noexcept {
return offset << 8;
}
private:
Common::ScratchBuffer<u8> frame;
Common::ScratchBuffer<u8> scan;
Host1x::Host1x& host1x;
u32 offset;
};
static_assert(std::is_trivial_v<Offset>, "Offset must be trivial");
static_assert(sizeof(Offset) == 0x4, "Offset has the wrong size!");
struct H264ParameterSet {
s32 log2_max_pic_order_cnt_lsb_minus4; ///< 0x00
s32 delta_pic_order_always_zero_flag; ///< 0x04
s32 frame_mbs_only_flag; ///< 0x08
u32 pic_width_in_mbs; ///< 0x0C
u32 frame_height_in_map_units; ///< 0x10
union { ///< 0x14
BitField<0, 2, u32> tile_format;
BitField<2, 3, u32> gob_height;
};
u32 entropy_coding_mode_flag; ///< 0x18
s32 pic_order_present_flag; ///< 0x1C
s32 num_refidx_l0_default_active; ///< 0x20
s32 num_refidx_l1_default_active; ///< 0x24
s32 deblocking_filter_control_present_flag; ///< 0x28
s32 redundant_pic_cnt_present_flag; ///< 0x2C
u32 transform_8x8_mode_flag; ///< 0x30
u32 pitch_luma; ///< 0x34
u32 pitch_chroma; ///< 0x38
u32 luma_top_offset; ///< 0x3C
u32 luma_bot_offset; ///< 0x40
u32 luma_frame_offset; ///< 0x44
u32 chroma_top_offset; ///< 0x48
u32 chroma_bot_offset; ///< 0x4C
u32 chroma_frame_offset; ///< 0x50
u32 hist_buffer_size; ///< 0x54
union { ///< 0x58
union {
BitField<0, 1, u64> mbaff_frame;
BitField<1, 1, u64> direct_8x8_inference;
BitField<2, 1, u64> weighted_pred;
BitField<3, 1, u64> constrained_intra_pred;
BitField<4, 1, u64> ref_pic;
BitField<5, 1, u64> field_pic;
BitField<6, 1, u64> bottom_field;
BitField<7, 1, u64> second_field;
} flags;
BitField<8, 4, u64> log2_max_frame_num_minus4;
BitField<12, 2, u64> chroma_format_idc;
BitField<14, 2, u64> pic_order_cnt_type;
BitField<16, 6, s64> pic_init_qp_minus26;
BitField<22, 5, s64> chroma_qp_index_offset;
BitField<27, 5, s64> second_chroma_qp_index_offset;
BitField<32, 2, u64> weighted_bipred_idc;
BitField<34, 7, u64> curr_pic_idx;
BitField<41, 5, u64> curr_col_idx;
BitField<46, 16, u64> frame_number;
BitField<62, 1, u64> frame_surfaces;
BitField<63, 1, u64> output_memory_layout;
};
struct H264ParameterSet {
s32 log2_max_pic_order_cnt_lsb_minus4; ///< 0x00
s32 delta_pic_order_always_zero_flag; ///< 0x04
s32 frame_mbs_only_flag; ///< 0x08
u32 pic_width_in_mbs; ///< 0x0C
u32 frame_height_in_mbs; ///< 0x10
union { ///< 0x14
BitField<0, 2, u32> tile_format;
BitField<2, 3, u32> gob_height;
BitField<5, 27, u32> reserved_surface_format;
};
static_assert(sizeof(H264ParameterSet) == 0x60, "H264ParameterSet is an invalid size");
struct H264DecoderContext {
INSERT_PADDING_WORDS_NOINIT(18); ///< 0x0000
u32 stream_len; ///< 0x0048
INSERT_PADDING_WORDS_NOINIT(3); ///< 0x004C
H264ParameterSet h264_parameter_set; ///< 0x0058
INSERT_PADDING_WORDS_NOINIT(66); ///< 0x00B8
std::array<u8, 0x60> weight_scale; ///< 0x01C0
std::array<u8, 0x80> weight_scale_8x8; ///< 0x0220
u32 entropy_coding_mode_flag; ///< 0x18
s32 pic_order_present_flag; ///< 0x1C
s32 num_refidx_l0_default_active; ///< 0x20
s32 num_refidx_l1_default_active; ///< 0x24
s32 deblocking_filter_control_present_flag; ///< 0x28
s32 redundant_pic_cnt_present_flag; ///< 0x2C
u32 transform_8x8_mode_flag; ///< 0x30
u32 pitch_luma; ///< 0x34
u32 pitch_chroma; ///< 0x38
Offset luma_top_offset; ///< 0x3C
Offset luma_bot_offset; ///< 0x40
Offset luma_frame_offset; ///< 0x44
Offset chroma_top_offset; ///< 0x48
Offset chroma_bot_offset; ///< 0x4C
Offset chroma_frame_offset; ///< 0x50
u32 hist_buffer_size; ///< 0x54
union { ///< 0x58
union {
BitField<0, 1, u64> mbaff_frame;
BitField<1, 1, u64> direct_8x8_inference;
BitField<2, 1, u64> weighted_pred;
BitField<3, 1, u64> constrained_intra_pred;
BitField<4, 1, u64> ref_pic;
BitField<5, 1, u64> field_pic;
BitField<6, 1, u64> bottom_field;
BitField<7, 1, u64> second_field;
} flags;
BitField<8, 4, u64> log2_max_frame_num_minus4;
BitField<12, 2, u64> chroma_format_idc;
BitField<14, 2, u64> pic_order_cnt_type;
BitField<16, 6, s64> pic_init_qp_minus26;
BitField<22, 5, s64> chroma_qp_index_offset;
BitField<27, 5, s64> second_chroma_qp_index_offset;
BitField<32, 2, u64> weighted_bipred_idc;
BitField<34, 7, u64> curr_pic_idx;
BitField<41, 5, u64> curr_col_idx;
BitField<46, 16, u64> frame_number;
BitField<62, 1, u64> frame_surfaces;
BitField<63, 1, u64> output_memory_layout;
};
static_assert(sizeof(H264DecoderContext) == 0x2A0, "H264DecoderContext is an invalid size");
};
static_assert(sizeof(H264ParameterSet) == 0x60, "H264ParameterSet is an invalid size");
#define ASSERT_POSITION(field_name, position) \
static_assert(offsetof(H264ParameterSet, field_name) == position, \
"Field " #field_name " has invalid position")
ASSERT_POSITION(log2_max_pic_order_cnt_lsb_minus4, 0x00);
ASSERT_POSITION(delta_pic_order_always_zero_flag, 0x04);
ASSERT_POSITION(frame_mbs_only_flag, 0x08);
ASSERT_POSITION(pic_width_in_mbs, 0x0C);
ASSERT_POSITION(frame_height_in_map_units, 0x10);
ASSERT_POSITION(tile_format, 0x14);
ASSERT_POSITION(entropy_coding_mode_flag, 0x18);
ASSERT_POSITION(pic_order_present_flag, 0x1C);
ASSERT_POSITION(num_refidx_l0_default_active, 0x20);
ASSERT_POSITION(num_refidx_l1_default_active, 0x24);
ASSERT_POSITION(deblocking_filter_control_present_flag, 0x28);
ASSERT_POSITION(redundant_pic_cnt_present_flag, 0x2C);
ASSERT_POSITION(transform_8x8_mode_flag, 0x30);
ASSERT_POSITION(pitch_luma, 0x34);
ASSERT_POSITION(pitch_chroma, 0x38);
ASSERT_POSITION(luma_top_offset, 0x3C);
ASSERT_POSITION(luma_bot_offset, 0x40);
ASSERT_POSITION(luma_frame_offset, 0x44);
ASSERT_POSITION(chroma_top_offset, 0x48);
ASSERT_POSITION(chroma_bot_offset, 0x4C);
ASSERT_POSITION(chroma_frame_offset, 0x50);
ASSERT_POSITION(hist_buffer_size, 0x54);
ASSERT_POSITION(flags, 0x58);
ASSERT_POSITION(log2_max_pic_order_cnt_lsb_minus4, 0x00);
ASSERT_POSITION(delta_pic_order_always_zero_flag, 0x04);
ASSERT_POSITION(frame_mbs_only_flag, 0x08);
ASSERT_POSITION(pic_width_in_mbs, 0x0C);
ASSERT_POSITION(frame_height_in_mbs, 0x10);
ASSERT_POSITION(tile_format, 0x14);
ASSERT_POSITION(entropy_coding_mode_flag, 0x18);
ASSERT_POSITION(pic_order_present_flag, 0x1C);
ASSERT_POSITION(num_refidx_l0_default_active, 0x20);
ASSERT_POSITION(num_refidx_l1_default_active, 0x24);
ASSERT_POSITION(deblocking_filter_control_present_flag, 0x28);
ASSERT_POSITION(redundant_pic_cnt_present_flag, 0x2C);
ASSERT_POSITION(transform_8x8_mode_flag, 0x30);
ASSERT_POSITION(pitch_luma, 0x34);
ASSERT_POSITION(pitch_chroma, 0x38);
ASSERT_POSITION(luma_top_offset, 0x3C);
ASSERT_POSITION(luma_bot_offset, 0x40);
ASSERT_POSITION(luma_frame_offset, 0x44);
ASSERT_POSITION(chroma_top_offset, 0x48);
ASSERT_POSITION(chroma_bot_offset, 0x4C);
ASSERT_POSITION(chroma_frame_offset, 0x50);
ASSERT_POSITION(hist_buffer_size, 0x54);
ASSERT_POSITION(flags, 0x58);
#undef ASSERT_POSITION
struct DpbEntry {
union {
BitField<0, 7, u32> index;
BitField<7, 5, u32> col_idx;
BitField<12, 2, u32> state;
BitField<14, 1, u32> is_long_term;
BitField<15, 1, u32> non_existing;
BitField<16, 1, u32> is_field;
BitField<17, 4, u32> top_field_marking;
BitField<21, 4, u32> bottom_field_marking;
BitField<25, 1, u32> output_memory_layout;
BitField<26, 6, u32> reserved;
} flags;
std::array<u32, 2> field_order_cnt;
u32 frame_idx;
};
static_assert(sizeof(DpbEntry) == 0x10, "DpbEntry has the wrong size!");
struct DisplayParam {
union {
BitField<0, 1, u32> enable_tf_output;
BitField<1, 1, u32> vc1_map_y_flag;
BitField<2, 3, u32> map_y_value;
BitField<5, 1, u32> vc1_map_uv_flag;
BitField<6, 3, u32> map_uv_value;
BitField<9, 8, u32> out_stride;
BitField<17, 3, u32> tiling_format;
BitField<20, 1, u32> output_structure; // 0=frame, 1=field
BitField<21, 11, u32> reserved0;
};
std::array<s32, 2> output_top;
std::array<s32, 2> output_bottom;
union {
BitField<0, 1, u32> enable_histogram;
BitField<1, 12, u32> histogram_start_x;
BitField<13, 12, u32> histogram_start_y;
BitField<25, 7, u32> reserved1;
};
union {
BitField<0, 12, u32> histogram_end_x;
BitField<12, 12, u32> histogram_end_y;
BitField<24, 8, u32> reserved2;
};
};
static_assert(sizeof(DisplayParam) == 0x1C, "DisplayParam has the wrong size!");
struct H264DecoderContext {
INSERT_PADDING_WORDS_NOINIT(13); ///< 0x0000
std::array<u8, 16> eos; ///< 0x0034
u8 explicit_eos_present_flag; ///< 0x0044
u8 hint_dump_en; ///< 0x0045
INSERT_PADDING_BYTES_NOINIT(2); ///< 0x0046
u32 stream_len; ///< 0x0048
u32 slice_count; ///< 0x004C
u32 mbhist_buffer_size; ///< 0x0050
u32 gptimer_timeout_value; ///< 0x0054
H264ParameterSet h264_parameter_set; ///< 0x0058
std::array<s32, 2> curr_field_order_cnt; ///< 0x00B8
std::array<DpbEntry, 16> dpb; ///< 0x00C0
std::array<u8, 0x60> weight_scale_4x4; ///< 0x01C0
std::array<u8, 0x80> weight_scale_8x8; ///< 0x0220
std::array<u8, 2> num_inter_view_refs_lX; ///< 0x02A0
std::array<u8, 14> reserved2; ///< 0x02A2
std::array<std::array<s8, 16>, 2> inter_view_refidx_lX; ///< 0x02B0
union { ///< 0x02D0
BitField<0, 1, u32> lossless_ipred8x8_filter_enable;
BitField<1, 1, u32> qpprime_y_zero_transform_bypass_flag;
BitField<2, 30, u32> reserved3;
};
DisplayParam display_param; ///< 0x02D4
std::array<u32, 3> reserved4; ///< 0x02F0
};
static_assert(sizeof(H264DecoderContext) == 0x2FC, "H264DecoderContext is an invalid size");
#define ASSERT_POSITION(field_name, position) \
static_assert(offsetof(H264DecoderContext, field_name) == position, \
"Field " #field_name " has invalid position")
ASSERT_POSITION(stream_len, 0x48);
ASSERT_POSITION(h264_parameter_set, 0x58);
ASSERT_POSITION(weight_scale, 0x1C0);
ASSERT_POSITION(stream_len, 0x48);
ASSERT_POSITION(h264_parameter_set, 0x58);
ASSERT_POSITION(dpb, 0xC0);
ASSERT_POSITION(weight_scale_4x4, 0x1C0);
#undef ASSERT_POSITION
class H264 final : public Decoder {
public:
explicit H264(Host1x::Host1x& host1x, const Host1x::NvdecCommon::NvdecRegisters& regs, s32 id,
Host1x::FrameQueue& frame_queue);
~H264() override;
H264(const H264&) = delete;
H264& operator=(const H264&) = delete;
H264(H264&&) = delete;
H264& operator=(H264&&) = delete;
/// Compose the H264 frame for FFmpeg decoding
[[nodiscard]] std::span<const u8> ComposeFrame() override;
std::tuple<u64, u64> GetProgressiveOffsets() override;
std::tuple<u64, u64, u64, u64> GetInterlacedOffsets() override;
bool IsInterlaced() override;
std::string_view GetCurrentCodecName() const override {
return "H264";
}
private:
bool is_first_frame{true};
Common::ScratchBuffer<u8> frame_scratch;
Common::ScratchBuffer<u8> scan_scratch;
H264DecoderContext current_context{};
};
} // namespace Decoder
} // namespace Decoders
} // namespace Tegra
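The Offset wrapper introduced above stores NVDEC surface addresses in 256-byte units, so GetProgressiveOffsets and GetInterlacedOffsets can simply add the surface base and the per-frame offset once both have been expanded with Address(). A tiny sketch of that arithmetic (illustrative values only):

```cpp
#include <cstdint>

// Mirrors the Offset wrapper above: stored values are in 256-byte units.
struct Offset {
    constexpr std::uint32_t Address() const noexcept { return offset << 8; }
    std::uint32_t offset;
};

// e.g. a surface base stored as 0x1234 plus a luma frame offset stored as 0x10:
static_assert(Offset{0x1234}.Address() + Offset{0x10}.Address() == 0x123400 + 0x1000);
```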


@@ -7,47 +7,70 @@
#include "video_core/host1x/host1x.h"
#include "video_core/memory_manager.h"
namespace Tegra::Decoder {
VP8::VP8(Host1x::Host1x& host1x_) : host1x{host1x_} {}
namespace Tegra::Decoders {
VP8::VP8(Host1x::Host1x& host1x_, const Host1x::NvdecCommon::NvdecRegisters& regs_, s32 id_,
Host1x::FrameQueue& frame_queue_)
: Decoder{host1x_, id_, regs_, frame_queue_} {
codec = Host1x::NvdecCommon::VideoCodec::VP8;
initialized = decode_api.Initialize(codec);
}
VP8::~VP8() = default;
std::span<const u8> VP8::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state) {
VP8PictureInfo info;
host1x.GMMU().ReadBlock(state.picture_info_offset, &info, sizeof(VP8PictureInfo));
std::tuple<u64, u64> VP8::GetProgressiveOffsets() {
auto luma{regs.surface_luma_offsets[static_cast<u32>(Vp8SurfaceIndex::Current)].Address()};
auto chroma{regs.surface_chroma_offsets[static_cast<u32>(Vp8SurfaceIndex::Current)].Address()};
return {luma, chroma};
}
const bool is_key_frame = info.key_frame == 1u;
const auto bitstream_size = static_cast<size_t>(info.vld_buffer_size);
std::tuple<u64, u64, u64, u64> VP8::GetInterlacedOffsets() {
auto luma_top{regs.surface_luma_offsets[static_cast<u32>(Vp8SurfaceIndex::Current)].Address()};
auto luma_bottom{
regs.surface_luma_offsets[static_cast<u32>(Vp8SurfaceIndex::Current)].Address()};
auto chroma_top{
regs.surface_chroma_offsets[static_cast<u32>(Vp8SurfaceIndex::Current)].Address()};
auto chroma_bottom{
regs.surface_chroma_offsets[static_cast<u32>(Vp8SurfaceIndex::Current)].Address()};
return {luma_top, luma_bottom, chroma_top, chroma_bottom};
}
std::span<const u8> VP8::ComposeFrame() {
memory_manager.ReadBlock(regs.picture_info_offset.Address(), &current_context,
sizeof(VP8PictureInfo));
const bool is_key_frame = current_context.key_frame == 1u;
const auto bitstream_size = static_cast<size_t>(current_context.vld_buffer_size);
const size_t header_size = is_key_frame ? 10u : 3u;
frame.resize(header_size + bitstream_size);
frame_scratch.resize(header_size + bitstream_size);
// Based on page 30 of the VP8 specification.
// https://datatracker.ietf.org/doc/rfc6386/
frame[0] = is_key_frame ? 0u : 1u; // 1-bit frame type (0: keyframe, 1: interframes).
frame[0] |= static_cast<u8>((info.version & 7u) << 1u); // 3-bit version number
frame[0] |= static_cast<u8>(1u << 4u); // 1-bit show_frame flag
frame_scratch[0] = is_key_frame ? 0u : 1u; // 1-bit frame type (0: keyframe, 1: interframes).
frame_scratch[0] |=
static_cast<u8>((current_context.version & 7u) << 1u); // 3-bit version number
frame_scratch[0] |= static_cast<u8>(1u << 4u); // 1-bit show_frame flag
// The next 19-bits are the first partition size
frame[0] |= static_cast<u8>((info.first_part_size & 7u) << 5u);
frame[1] = static_cast<u8>((info.first_part_size & 0x7f8u) >> 3u);
frame[2] = static_cast<u8>((info.first_part_size & 0x7f800u) >> 11u);
frame_scratch[0] |= static_cast<u8>((current_context.first_part_size & 7u) << 5u);
frame_scratch[1] = static_cast<u8>((current_context.first_part_size & 0x7f8u) >> 3u);
frame_scratch[2] = static_cast<u8>((current_context.first_part_size & 0x7f800u) >> 11u);
if (is_key_frame) {
frame[3] = 0x9du;
frame[4] = 0x01u;
frame[5] = 0x2au;
frame_scratch[3] = 0x9du;
frame_scratch[4] = 0x01u;
frame_scratch[5] = 0x2au;
// TODO(ameerj): Horizontal/Vertical Scale
// 16 bits: (2 bits Horizontal Scale << 14) | Width (14 bits)
frame[6] = static_cast<u8>(info.frame_width & 0xff);
frame[7] = static_cast<u8>(((info.frame_width >> 8) & 0x3f));
frame_scratch[6] = static_cast<u8>(current_context.frame_width & 0xff);
frame_scratch[7] = static_cast<u8>(((current_context.frame_width >> 8) & 0x3f));
// 16 bits: (2 bits Vertical Scale << 14) | Height (14 bits)
frame[8] = static_cast<u8>(info.frame_height & 0xff);
frame[9] = static_cast<u8>(((info.frame_height >> 8) & 0x3f));
frame_scratch[8] = static_cast<u8>(current_context.frame_height & 0xff);
frame_scratch[9] = static_cast<u8>(((current_context.frame_height >> 8) & 0x3f));
}
const u64 bitstream_offset = state.frame_bitstream_offset;
host1x.GMMU().ReadBlock(bitstream_offset, frame.data() + header_size, bitstream_size);
const u64 bitstream_offset = regs.frame_bitstream_offset.Address();
memory_manager.ReadBlock(bitstream_offset, frame_scratch.data() + header_size, bitstream_size);
return frame;
return frame_scratch;
}
} // namespace Tegra::Decoder
} // namespace Tegra::Decoders
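The header bytes written in ComposeFrame above follow the uncompressed data chunk ("frame tag") from RFC 6386. A self-contained sketch of that 3-byte packing may make the bit layout easier to follow; PackVp8FrameTag is a hypothetical helper, not part of this change:

#include <array>
#include <cstdint>

// Packs the 3-byte VP8 frame tag: frame type, version, show_frame, and the
// 19-bit first partition size (RFC 6386, section 9.1).
std::array<std::uint8_t, 3> PackVp8FrameTag(bool is_key_frame, std::uint8_t version,
                                            std::uint32_t first_part_size) {
    std::array<std::uint8_t, 3> tag{};
    tag[0] = is_key_frame ? 0u : 1u;                                   // bit 0: 0 = key frame
    tag[0] |= static_cast<std::uint8_t>((version & 7u) << 1u);         // bits 1-3: version
    tag[0] |= static_cast<std::uint8_t>(1u << 4u);                     // bit 4: show_frame
    tag[0] |= static_cast<std::uint8_t>((first_part_size & 7u) << 5u); // bits 5-23: partition size
    tag[1] = static_cast<std::uint8_t>((first_part_size >> 3) & 0xffu);
    tag[2] = static_cast<std::uint8_t>((first_part_size >> 11) & 0xffu);
    return tag;
}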

View File

@@ -9,6 +9,7 @@
#include "common/common_funcs.h"
#include "common/common_types.h"
#include "common/scratch_buffer.h"
#include "video_core/host1x/codecs/decoder.h"
#include "video_core/host1x/nvdec_common.h"
namespace Tegra {
@@ -17,20 +18,41 @@ namespace Host1x {
class Host1x;
} // namespace Host1x
namespace Decoder {
namespace Decoders {
enum class Vp8SurfaceIndex : u32 {
Last = 0,
Golden = 1,
AltRef = 2,
Current = 3,
};
class VP8 {
class VP8 final : public Decoder {
public:
explicit VP8(Host1x::Host1x& host1x);
~VP8();
explicit VP8(Host1x::Host1x& host1x, const Host1x::NvdecCommon::NvdecRegisters& regs, s32 id,
Host1x::FrameQueue& frame_queue);
~VP8() override;
/// Compose the VP8 frame for FFmpeg decoding
[[nodiscard]] std::span<const u8> ComposeFrame(
const Host1x::NvdecCommon::NvdecRegisters& state);
VP8(const VP8&) = delete;
VP8& operator=(const VP8&) = delete;
VP8(VP8&&) = delete;
VP8& operator=(VP8&&) = delete;
[[nodiscard]] std::span<const u8> ComposeFrame() override;
std::tuple<u64, u64> GetProgressiveOffsets() override;
std::tuple<u64, u64, u64, u64> GetInterlacedOffsets() override;
bool IsInterlaced() override {
return false;
}
std::string_view GetCurrentCodecName() const override {
return "VP8";
}
private:
Common::ScratchBuffer<u8> frame;
Host1x::Host1x& host1x;
Common::ScratchBuffer<u8> frame_scratch;
struct VP8PictureInfo {
INSERT_PADDING_WORDS_NOINIT(14);
@@ -73,7 +95,9 @@ private:
INSERT_PADDING_WORDS_NOINIT(3);
};
static_assert(sizeof(VP8PictureInfo) == 0xc0, "PictureInfo is an invalid size");
VP8PictureInfo current_context{};
};
} // namespace Decoder
} // namespace Decoders
} // namespace Tegra

View File

@@ -4,12 +4,13 @@
#include <algorithm> // for std::copy
#include <numeric>
#include "common/alignment.h"
#include "common/assert.h"
#include "video_core/host1x/codecs/vp9.h"
#include "video_core/host1x/host1x.h"
#include "video_core/memory_manager.h"
namespace Tegra::Decoder {
namespace Tegra::Decoders {
namespace {
constexpr u32 diff_update_probability = 252;
constexpr u32 frame_sync_code = 0x498342;
@@ -237,7 +238,12 @@ constexpr std::array<u8, 254> map_lut{
}
} // Anonymous namespace
VP9::VP9(Host1x::Host1x& host1x_) : host1x{host1x_} {}
VP9::VP9(Host1x::Host1x& host1x_, const Host1x::NvdecCommon::NvdecRegisters& regs_, s32 id_,
Host1x::FrameQueue& frame_queue_)
: Decoder{host1x_, id_, regs_, frame_queue_} {
codec = Host1x::NvdecCommon::VideoCodec::VP9;
initialized = decode_api.Initialize(codec);
}
VP9::~VP9() = default;
@@ -356,35 +362,113 @@ void VP9::WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_
}
}
Vp9PictureInfo VP9::GetVp9PictureInfo(const Host1x::NvdecCommon::NvdecRegisters& state) {
PictureInfo picture_info;
host1x.GMMU().ReadBlock(state.picture_info_offset, &picture_info, sizeof(PictureInfo));
Vp9PictureInfo vp9_info = picture_info.Convert();
void VP9::WriteSegmentation(VpxBitStreamWriter& writer) {
bool enabled = current_picture_info.segmentation.enabled != 0;
writer.WriteBit(enabled);
if (!enabled) {
return;
}
InsertEntropy(state.vp9_entropy_probs_offset, vp9_info.entropy);
auto update_map = current_picture_info.segmentation.update_map != 0;
writer.WriteBit(update_map);
if (update_map) {
EntropyProbs entropy_probs{};
memory_manager.ReadBlock(regs.vp9_prob_tab_buffer_offset.Address(), &entropy_probs,
sizeof(entropy_probs));
auto WriteProb = [&](u8 prob) {
bool coded = prob != 255;
writer.WriteBit(coded);
if (coded) {
writer.WriteU(prob, 8);
}
};
for (size_t i = 0; i < entropy_probs.mb_segment_tree_probs.size(); i++) {
WriteProb(entropy_probs.mb_segment_tree_probs[i]);
}
auto temporal_update = current_picture_info.segmentation.temporal_update != 0;
writer.WriteBit(temporal_update);
if (temporal_update) {
for (s32 i = 0; i < 3; i++) {
WriteProb(entropy_probs.segment_pred_probs[i]);
}
}
}
if (last_segmentation == current_picture_info.segmentation) {
writer.WriteBit(false);
return;
}
last_segmentation = current_picture_info.segmentation;
writer.WriteBit(true);
writer.WriteBit(current_picture_info.segmentation.abs_delta != 0);
constexpr s32 MAX_SEGMENTS = 8;
constexpr std::array SegmentationFeatureBits = {8, 6, 2, 0};
for (s32 i = 0; i < MAX_SEGMENTS; i++) {
auto q_enabled = current_picture_info.segmentation.feature_enabled[i][0] != 0;
writer.WriteBit(q_enabled);
if (q_enabled) {
writer.WriteS(current_picture_info.segmentation.feature_data[i][0],
SegmentationFeatureBits[0]);
}
auto lf_enabled = current_picture_info.segmentation.feature_enabled[i][1] != 0;
writer.WriteBit(lf_enabled);
if (lf_enabled) {
writer.WriteS(current_picture_info.segmentation.feature_data[i][1],
SegmentationFeatureBits[1]);
}
auto ref_enabled = current_picture_info.segmentation.feature_enabled[i][2] != 0;
writer.WriteBit(ref_enabled);
if (ref_enabled) {
writer.WriteU(current_picture_info.segmentation.feature_data[i][2],
SegmentationFeatureBits[2]);
}
auto skip_enabled = current_picture_info.segmentation.feature_enabled[i][3] != 0;
writer.WriteBit(skip_enabled);
}
}
Vp9PictureInfo VP9::GetVp9PictureInfo() {
memory_manager.ReadBlock(regs.picture_info_offset.Address(), &current_picture_info,
sizeof(PictureInfo));
Vp9PictureInfo vp9_info = current_picture_info.Convert();
InsertEntropy(regs.vp9_prob_tab_buffer_offset.Address(), vp9_info.entropy);
// surface_luma_offset[0:3] contains the address of the reference frame offsets in the following
// order: last, golden, altref, current.
std::copy(state.surface_luma_offset.begin(), state.surface_luma_offset.begin() + 4,
vp9_info.frame_offsets.begin());
for (size_t i = 0; i < 4; i++) {
vp9_info.frame_offsets[i] = regs.surface_luma_offsets[i].Address();
}
return vp9_info;
}
void VP9::InsertEntropy(u64 offset, Vp9EntropyProbs& dst) {
EntropyProbs entropy;
host1x.GMMU().ReadBlock(offset, &entropy, sizeof(EntropyProbs));
memory_manager.ReadBlock(offset, &entropy, sizeof(EntropyProbs));
entropy.Convert(dst);
}
Vp9FrameContainer VP9::GetCurrentFrame(const Host1x::NvdecCommon::NvdecRegisters& state) {
Vp9FrameContainer VP9::GetCurrentFrame() {
Vp9FrameContainer current_frame{};
{
// gpu.SyncGuestHost(); epic, why?
current_frame.info = GetVp9PictureInfo(state);
current_frame.info = GetVp9PictureInfo();
current_frame.bit_stream.resize(current_frame.info.bitstream_size);
host1x.GMMU().ReadBlock(state.frame_bitstream_offset, current_frame.bit_stream.data(),
current_frame.info.bitstream_size);
memory_manager.ReadBlock(regs.frame_bitstream_offset.Address(),
current_frame.bit_stream.data(),
current_frame.info.bitstream_size);
}
if (!next_frame.bit_stream.empty()) {
Vp9FrameContainer temp{
@@ -742,8 +826,7 @@ VpxBitStreamWriter VP9::ComposeUncompressedHeader() {
uncomp_writer.WriteDeltaQ(current_frame_info.uv_dc_delta_q);
uncomp_writer.WriteDeltaQ(current_frame_info.uv_ac_delta_q);
ASSERT(!current_frame_info.segment_enabled);
uncomp_writer.WriteBit(false); // Segmentation enabled (TODO).
WriteSegmentation(uncomp_writer);
const s32 min_tile_cols_log2 = CalcMinLog2TileCols(current_frame_info.frame_size.width);
const s32 max_tile_cols_log2 = CalcMaxLog2TileCols(current_frame_info.frame_size.width);
@@ -770,10 +853,29 @@ VpxBitStreamWriter VP9::ComposeUncompressedHeader() {
return uncomp_writer;
}
void VP9::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state) {
std::tuple<u64, u64> VP9::GetProgressiveOffsets() {
auto luma{regs.surface_luma_offsets[static_cast<u32>(Vp9SurfaceIndex::Current)].Address()};
auto chroma{regs.surface_chroma_offsets[static_cast<u32>(Vp9SurfaceIndex::Current)].Address()};
return {luma, chroma};
}
std::tuple<u64, u64, u64, u64> VP9::GetInterlacedOffsets() {
auto luma_top{regs.surface_luma_offsets[static_cast<u32>(Vp9SurfaceIndex::Current)].Address()};
auto luma_bottom{
regs.surface_luma_offsets[static_cast<u32>(Vp9SurfaceIndex::Current)].Address()};
auto chroma_top{
regs.surface_chroma_offsets[static_cast<u32>(Vp9SurfaceIndex::Current)].Address()};
auto chroma_bottom{
regs.surface_chroma_offsets[static_cast<u32>(Vp9SurfaceIndex::Current)].Address()};
return {luma_top, luma_bottom, chroma_top, chroma_bottom};
}
std::span<const u8> VP9::ComposeFrame() {
vp9_hidden_frame = false;
std::vector<u8> bitstream;
{
Vp9FrameContainer curr_frame = GetCurrentFrame(state);
Vp9FrameContainer curr_frame = GetCurrentFrame();
current_frame_info = curr_frame.info;
bitstream = std::move(curr_frame.bit_stream);
}
@@ -786,12 +888,16 @@ void VP9::ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state) {
std::vector<u8> uncompressed_header = uncomp_writer.GetByteArray();
// Write headers and frame to buffer
frame.resize(uncompressed_header.size() + compressed_header.size() + bitstream.size());
std::copy(uncompressed_header.begin(), uncompressed_header.end(), frame.begin());
frame_scratch.resize(uncompressed_header.size() + compressed_header.size() + bitstream.size());
std::copy(uncompressed_header.begin(), uncompressed_header.end(), frame_scratch.begin());
std::copy(compressed_header.begin(), compressed_header.end(),
frame.begin() + uncompressed_header.size());
frame_scratch.begin() + uncompressed_header.size());
std::copy(bitstream.begin(), bitstream.end(),
frame.begin() + uncompressed_header.size() + compressed_header.size());
frame_scratch.begin() + uncompressed_header.size() + compressed_header.size());
vp9_hidden_frame = WasFrameHidden();
return GetFrameBytes();
}
VpxRangeEncoder::VpxRangeEncoder() {
@@ -944,4 +1050,4 @@ const std::vector<u8>& VpxBitStreamWriter::GetByteArray() const {
return byte_array;
}
} // namespace Tegra::Decoder
} // namespace Tegra::Decoders
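The new WriteSegmentation path follows the VP9 uncompressed-header syntax for segmentation: every probability is preceded by a one-bit "coded" flag (255 means "keep the default, write nothing"), and each per-segment feature writes an enable bit followed by a fixed-width value. The feature widths are, in order, an 8-bit signed quantizer delta, a 6-bit signed loop filter delta, a 2-bit reference frame index, and a data-less skip flag. A minimal sketch of the probability pattern, assuming the VpxBitStreamWriter interface from this file:

// Sketch only; `writer` is assumed to be the VpxBitStreamWriter defined above.
void WriteCodedProb(VpxBitStreamWriter& writer, u8 prob) {
    const bool coded = prob != 255;   // 255 = not coded, the decoder keeps its default
    writer.WriteBit(coded);
    if (coded) {
        writer.WriteU(prob, 8);
    }
}
// Per-segment feature widths used above: {quantizer: 8 signed, loop filter: 6 signed,
// reference frame: 2 unsigned, skip: 0 (enable bit only)}.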

View File

@@ -10,6 +10,7 @@
#include "common/common_types.h"
#include "common/scratch_buffer.h"
#include "common/stream.h"
#include "video_core/host1x/codecs/decoder.h"
#include "video_core/host1x/codecs/vp9_types.h"
#include "video_core/host1x/nvdec_common.h"
@@ -19,7 +20,7 @@ namespace Host1x {
class Host1x;
} // namespace Host1x
namespace Decoder {
namespace Decoders {
/// The VpxRangeEncoder and VpxBitStreamWriter classes are used to compose the
/// VP9 header bitstreams.
@@ -110,21 +111,32 @@ private:
std::vector<u8> byte_array;
};
class VP9 {
class VP9 final : public Decoder {
public:
explicit VP9(Host1x::Host1x& host1x);
~VP9();
explicit VP9(Host1x::Host1x& host1x, const Host1x::NvdecCommon::NvdecRegisters& regs, s32 id,
Host1x::FrameQueue& frame_queue);
~VP9() override;
VP9(const VP9&) = delete;
VP9& operator=(const VP9&) = delete;
VP9(VP9&&) = default;
VP9(VP9&&) = delete;
VP9& operator=(VP9&&) = delete;
/// Composes the VP9 frame from the GPU state information.
/// Based on the official VP9 spec documentation
void ComposeFrame(const Host1x::NvdecCommon::NvdecRegisters& state);
[[nodiscard]] std::span<const u8> ComposeFrame() override;
std::tuple<u64, u64> GetProgressiveOffsets() override;
std::tuple<u64, u64, u64, u64> GetInterlacedOffsets() override;
bool IsInterlaced() override {
return false;
}
std::string_view GetCurrentCodecName() const override {
return "VP9";
}
private:
/// Returns true if the most recent frame was a hidden frame.
[[nodiscard]] bool WasFrameHidden() const {
return !current_frame_info.show_frame;
@@ -132,10 +144,9 @@ public:
/// Returns a const span to the composed frame data.
[[nodiscard]] std::span<const u8> GetFrameBytes() const {
return frame;
return frame_scratch;
}
private:
/// Generates compressed header probability updates in the bitstream writer
template <typename T, std::size_t N>
void WriteProbabilityUpdate(VpxRangeEncoder& writer, const std::array<T, N>& new_prob,
@@ -167,23 +178,22 @@ private:
/// Write motion vector probability updates. 6.3.17 in the spec
void WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob);
void WriteSegmentation(VpxBitStreamWriter& writer);
/// Returns VP9 information from NVDEC provided offset and size
[[nodiscard]] Vp9PictureInfo GetVp9PictureInfo(
const Host1x::NvdecCommon::NvdecRegisters& state);
[[nodiscard]] Vp9PictureInfo GetVp9PictureInfo();
/// Read and convert NVDEC provided entropy probs to Vp9EntropyProbs struct
void InsertEntropy(u64 offset, Vp9EntropyProbs& dst);
/// Returns frame to be decoded after buffering
[[nodiscard]] Vp9FrameContainer GetCurrentFrame(
const Host1x::NvdecCommon::NvdecRegisters& state);
[[nodiscard]] Vp9FrameContainer GetCurrentFrame();
/// Use NVDEC provided information to compose the headers for the current frame
[[nodiscard]] std::vector<u8> ComposeCompressedHeader();
[[nodiscard]] VpxBitStreamWriter ComposeUncompressedHeader();
Host1x::Host1x& host1x;
Common::ScratchBuffer<u8> frame;
Common::ScratchBuffer<u8> frame_scratch;
std::array<s8, 4> loop_filter_ref_deltas{};
std::array<s8, 2> loop_filter_mode_deltas{};
@@ -192,9 +202,11 @@ private:
std::array<Vp9EntropyProbs, 4> frame_ctxs{};
bool swap_ref_indices{};
Segmentation last_segmentation{};
PictureInfo current_picture_info{};
Vp9PictureInfo current_frame_info{};
Vp9EntropyProbs prev_frame_probs{};
};
} // namespace Decoder
} // namespace Decoders
} // namespace Tegra

View File

@@ -11,7 +11,14 @@
namespace Tegra {
namespace Decoder {
namespace Decoders {
enum class Vp9SurfaceIndex : u32 {
Last = 0,
Golden = 1,
AltRef = 2,
Current = 3,
};
struct Vp9FrameDimensions {
s16 width;
s16 height;
@@ -48,11 +55,13 @@ enum class TxMode {
};
struct Segmentation {
constexpr bool operator==(const Segmentation& rhs) const = default;
u8 enabled;
u8 update_map;
u8 temporal_update;
u8 abs_delta;
std::array<u32, 8> feature_mask;
std::array<std::array<u8, 4>, 8> feature_enabled;
std::array<std::array<s16, 4>, 8> feature_data;
};
static_assert(sizeof(Segmentation) == 0x64, "Segmentation is an invalid size");
@@ -190,7 +199,17 @@ struct PictureInfo {
static_assert(sizeof(PictureInfo) == 0x100, "PictureInfo is an invalid size");
struct EntropyProbs {
INSERT_PADDING_BYTES_NOINIT(1024); ///< 0x0000
std::array<u8, 10 * 10 * 8> kf_bmode_prob; ///< 0x0000
std::array<u8, 10 * 10 * 1> kf_bmode_probB; ///< 0x0320
std::array<u8, 3> ref_pred_probs; ///< 0x0384
std::array<u8, 7> mb_segment_tree_probs; ///< 0x0387
std::array<u8, 3> segment_pred_probs; ///< 0x038E
std::array<u8, 4> ref_scores; ///< 0x0391
std::array<u8, 2> prob_comppred; ///< 0x0395
INSERT_PADDING_BYTES_NOINIT(9); ///< 0x0397
std::array<u8, 10 * 8> kf_uv_mode_prob; ///< 0x03A0
std::array<u8, 10 * 1> kf_uv_mode_probB; ///< 0x03F0
INSERT_PADDING_BYTES_NOINIT(6); ///< 0x03FA
std::array<u8, 28> inter_mode_prob; ///< 0x0400
std::array<u8, 4> intra_inter_prob; ///< 0x041C
INSERT_PADDING_BYTES_NOINIT(80); ///< 0x0420
@@ -302,5 +321,5 @@ ASSERT_POSITION(class_0_fr, 0x560);
ASSERT_POSITION(coef_probs, 0x5A0);
#undef ASSERT_POSITION
}; // namespace Decoder
}; // namespace Decoders
}; // namespace Tegra

View File

@@ -27,6 +27,7 @@ void Control::ProcessMethod(Method method, u32 argument) {
}
void Control::Execute(u32 data) {
LOG_TRACE(Service_NVDRV, "Control wait syncpt {} value {}", data, syncpoint_value);
host1x.GetSyncpointManager().WaitHost(data, syncpoint_value);
}

View File

@@ -6,9 +6,7 @@
#include "common/common_types.h"
namespace Tegra {
namespace Host1x {
namespace Tegra::Host1x {
class Host1x;
class Nvdec;
@@ -31,10 +29,8 @@ private:
/// For Host1x, execute is waiting on a syncpoint previously written into the state
void Execute(u32 data);
u32 syncpoint_value{};
Host1x& host1x;
u32 syncpoint_value{};
};
} // namespace Host1x
} // namespace Tegra
} // namespace Tegra::Host1x

View File

@@ -5,7 +5,9 @@
#include "common/logging/log.h"
#include "common/scope_exit.h"
#include "common/settings.h"
#include "core/memory.h"
#include "video_core/host1x/ffmpeg/ffmpeg.h"
#include "video_core/memory_manager.h"
extern "C" {
#ifdef LIBVA_FOUND
@@ -149,6 +151,7 @@ bool HardwareContext::InitializeForDecoder(DecoderContext& decoder_context,
}
}
LOG_INFO(HW_GPU, "Hardware decoding is disabled due to implementation issues, using CPU.");
return false;
}
@@ -183,8 +186,8 @@ bool HardwareContext::InitializeWithType(AVHWDeviceType type) {
return true;
}
DecoderContext::DecoderContext(const Decoder& decoder) {
m_codec_context = avcodec_alloc_context3(decoder.GetCodec());
DecoderContext::DecoderContext(const Decoder& decoder) : m_decoder{decoder} {
m_codec_context = avcodec_alloc_context3(m_decoder.GetCodec());
av_opt_set(m_codec_context->priv_data, "tune", "zerolatency", 0);
m_codec_context->thread_count = 0;
m_codec_context->thread_type &= ~FF_THREAD_FRAME;
@@ -216,6 +219,25 @@ bool DecoderContext::OpenContext(const Decoder& decoder) {
}
bool DecoderContext::SendPacket(const Packet& packet) {
m_temp_frame = std::make_shared<Frame>();
m_got_frame = 0;
// Android can randomly crash when calling decode directly, so skip.
// TODO update ffmpeg and hope that fixes it.
#ifndef ANDROID
if (!m_codec_context->hw_device_ctx && m_codec_context->codec_id == AV_CODEC_ID_H264) {
m_decode_order = true;
auto* codec{ffcodec(m_decoder.GetCodec())};
if (const int ret = codec->cb.decode(m_codec_context, m_temp_frame->GetFrame(),
&m_got_frame, packet.GetPacket());
ret < 0) {
LOG_DEBUG(Service_NVDRV, "avcodec_send_packet error {}", AVError(ret));
return false;
}
return true;
}
#endif
if (const int ret = avcodec_send_packet(m_codec_context, packet.GetPacket()); ret < 0) {
LOG_ERROR(HW_GPU, "avcodec_send_packet error: {}", AVError(ret));
return false;
@@ -224,139 +246,73 @@ bool DecoderContext::SendPacket(const Packet& packet) {
return true;
}
std::unique_ptr<Frame> DecoderContext::ReceiveFrame(bool* out_is_interlaced) {
auto dst_frame = std::make_unique<Frame>();
std::shared_ptr<Frame> DecoderContext::ReceiveFrame() {
// Android can randomly crash when calling decode directly, so skip.
// TODO update ffmpeg and hope that fixes it.
#ifndef ANDROID
if (!m_codec_context->hw_device_ctx && m_codec_context->codec_id == AV_CODEC_ID_H264) {
m_decode_order = true;
auto* codec{ffcodec(m_decoder.GetCodec())};
int ret{0};
const auto ReceiveImpl = [&](AVFrame* frame) {
if (const int ret = avcodec_receive_frame(m_codec_context, frame); ret < 0) {
LOG_ERROR(HW_GPU, "avcodec_receive_frame error: {}", AVError(ret));
return false;
if (m_got_frame == 0) {
Packet packet{{}};
auto* pkt = packet.GetPacket();
pkt->data = nullptr;
pkt->size = 0;
ret = codec->cb.decode(m_codec_context, m_temp_frame->GetFrame(), &m_got_frame, pkt);
m_codec_context->has_b_frames = 0;
}
*out_is_interlaced =
#if defined(FF_API_INTERLACED_FRAME) || LIBAVUTIL_VERSION_MAJOR >= 59
(frame->flags & AV_FRAME_FLAG_INTERLACED) != 0;
#else
frame->interlaced_frame != 0;
if (m_got_frame == 0 || ret < 0) {
LOG_ERROR(Service_NVDRV, "Failed to receive a frame! error {}", ret);
return {};
}
} else
#endif
return true;
};
{
if (m_codec_context->hw_device_ctx) {
// If we have a hardware context, make a separate frame here to receive the
// hardware result before sending it to the output.
Frame intermediate_frame;
const auto ReceiveImpl = [&](AVFrame* frame) {
if (const int ret = avcodec_receive_frame(m_codec_context, frame); ret < 0) {
LOG_ERROR(HW_GPU, "avcodec_receive_frame error: {}", AVError(ret));
return false;
}
if (!ReceiveImpl(intermediate_frame.GetFrame())) {
return {};
}
return true;
};
dst_frame->SetFormat(PreferredGpuFormat);
if (const int ret =
av_hwframe_transfer_data(dst_frame->GetFrame(), intermediate_frame.GetFrame(), 0);
ret < 0) {
LOG_ERROR(HW_GPU, "av_hwframe_transfer_data error: {}", AVError(ret));
return {};
}
} else {
// Otherwise, decode the frame as normal.
if (!ReceiveImpl(dst_frame->GetFrame())) {
return {};
if (m_codec_context->hw_device_ctx) {
// If we have a hardware context, make a separate frame here to receive the
// hardware result before sending it to the output.
Frame intermediate_frame;
if (!ReceiveImpl(intermediate_frame.GetFrame())) {
return {};
}
m_temp_frame->SetFormat(PreferredGpuFormat);
if (const int ret = av_hwframe_transfer_data(m_temp_frame->GetFrame(),
intermediate_frame.GetFrame(), 0);
ret < 0) {
LOG_ERROR(HW_GPU, "av_hwframe_transfer_data error: {}", AVError(ret));
return {};
}
} else {
// Otherwise, decode the frame as normal.
if (!ReceiveImpl(m_temp_frame->GetFrame())) {
return {};
}
}
}
return dst_frame;
}
DeinterlaceFilter::DeinterlaceFilter(const Frame& frame) {
const AVFilter* buffer_src = avfilter_get_by_name("buffer");
const AVFilter* buffer_sink = avfilter_get_by_name("buffersink");
AVFilterInOut* inputs = avfilter_inout_alloc();
AVFilterInOut* outputs = avfilter_inout_alloc();
SCOPE_EXIT({
avfilter_inout_free(&inputs);
avfilter_inout_free(&outputs);
});
// Don't know how to get the accurate time_base but it doesn't matter for yadif filter
// so just use 1/1 to make buffer filter happy
std::string args = fmt::format("video_size={}x{}:pix_fmt={}:time_base=1/1", frame.GetWidth(),
frame.GetHeight(), static_cast<int>(frame.GetPixelFormat()));
m_filter_graph = avfilter_graph_alloc();
int ret = avfilter_graph_create_filter(&m_source_context, buffer_src, "in", args.c_str(),
nullptr, m_filter_graph);
if (ret < 0) {
LOG_ERROR(HW_GPU, "avfilter_graph_create_filter source error: {}", AVError(ret));
return;
}
ret = avfilter_graph_create_filter(&m_sink_context, buffer_sink, "out", nullptr, nullptr,
m_filter_graph);
if (ret < 0) {
LOG_ERROR(HW_GPU, "avfilter_graph_create_filter sink error: {}", AVError(ret));
return;
}
inputs->name = av_strdup("out");
inputs->filter_ctx = m_sink_context;
inputs->pad_idx = 0;
inputs->next = nullptr;
outputs->name = av_strdup("in");
outputs->filter_ctx = m_source_context;
outputs->pad_idx = 0;
outputs->next = nullptr;
const char* description = "yadif=1:-1:0";
ret = avfilter_graph_parse_ptr(m_filter_graph, description, &inputs, &outputs, nullptr);
if (ret < 0) {
LOG_ERROR(HW_GPU, "avfilter_graph_parse_ptr error: {}", AVError(ret));
return;
}
ret = avfilter_graph_config(m_filter_graph, nullptr);
if (ret < 0) {
LOG_ERROR(HW_GPU, "avfilter_graph_config error: {}", AVError(ret));
return;
}
m_initialized = true;
}
bool DeinterlaceFilter::AddSourceFrame(const Frame& frame) {
if (const int ret = av_buffersrc_add_frame_flags(m_source_context, frame.GetFrame(),
AV_BUFFERSRC_FLAG_KEEP_REF);
ret < 0) {
LOG_ERROR(HW_GPU, "av_buffersrc_add_frame_flags error: {}", AVError(ret));
return false;
}
return true;
}
std::unique_ptr<Frame> DeinterlaceFilter::DrainSinkFrame() {
auto dst_frame = std::make_unique<Frame>();
const int ret = av_buffersink_get_frame(m_sink_context, dst_frame->GetFrame());
if (ret == AVERROR(EAGAIN) || ret == AVERROR(AVERROR_EOF)) {
return {};
}
if (ret < 0) {
LOG_ERROR(HW_GPU, "av_buffersink_get_frame error: {}", AVError(ret));
return {};
}
return dst_frame;
}
DeinterlaceFilter::~DeinterlaceFilter() {
avfilter_graph_free(&m_filter_graph);
#if defined(FF_API_INTERLACED_FRAME) || LIBAVUTIL_VERSION_MAJOR >= 59
m_temp_frame->GetFrame()->interlaced_frame =
(m_temp_frame->GetFrame()->flags & AV_FRAME_FLAG_INTERLACED) != 0;
#endif
return std::move(m_temp_frame);
}
void DecodeApi::Reset() {
m_deinterlace_filter.reset();
m_hardware_context.reset();
m_decoder_context.reset();
m_decoder.reset();
@@ -382,43 +338,14 @@ bool DecodeApi::Initialize(Tegra::Host1x::NvdecCommon::VideoCodec codec) {
return true;
}
bool DecodeApi::SendPacket(std::span<const u8> packet_data, size_t configuration_size) {
bool DecodeApi::SendPacket(std::span<const u8> packet_data) {
FFmpeg::Packet packet(packet_data);
return m_decoder_context->SendPacket(packet);
}
void DecodeApi::ReceiveFrames(std::queue<std::unique_ptr<Frame>>& frame_queue) {
std::shared_ptr<Frame> DecodeApi::ReceiveFrame() {
// Receive raw frame from decoder.
bool is_interlaced;
auto frame = m_decoder_context->ReceiveFrame(&is_interlaced);
if (!frame) {
return;
}
if (!is_interlaced) {
// If the frame is not interlaced, we can pend it now.
frame_queue.push(std::move(frame));
} else {
// Create the deinterlacer if needed.
if (!m_deinterlace_filter) {
m_deinterlace_filter.emplace(*frame);
}
// Add the frame we just received.
if (!m_deinterlace_filter->AddSourceFrame(*frame)) {
return;
}
// Pend output fields.
while (true) {
auto filter_frame = m_deinterlace_filter->DrainSinkFrame();
if (!filter_frame) {
break;
}
frame_queue.push(std::move(filter_frame));
}
}
return m_decoder_context->ReceiveFrame();
}
} // namespace FFmpeg
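With the DeinterlaceFilter removed, DecodeApi now returns exactly one frame per ReceiveFrame() call and leaves field handling to the caller. A usage sketch under those assumptions (composed_frame and the surrounding control flow are illustrative, not taken from the diff):

FFmpeg::DecodeApi decode_api;
if (decode_api.Initialize(Tegra::Host1x::NvdecCommon::VideoCodec::H264)) {
    decode_api.SendPacket(composed_frame);                         // std::span<const u8>
    std::shared_ptr<FFmpeg::Frame> frame = decode_api.ReceiveFrame();
    if (frame && frame->IsInterlaced()) {
        // No yadif filter graph anymore: interlacing is flagged on the frame and
        // the VIC consumes the top/bottom fields via the decoder's field offsets.
    }
}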

View File

@@ -20,17 +20,20 @@ extern "C" {
#endif
#include <libavcodec/avcodec.h>
#include <libavfilter/avfilter.h>
#include <libavfilter/buffersink.h>
#include <libavfilter/buffersrc.h>
#include <libavutil/avutil.h>
#include <libavutil/opt.h>
#ifndef ANDROID
#include <libavcodec/codec_internal.h>
#endif
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic pop
#endif
}
namespace Tegra {
class MemoryManager;
}
namespace FFmpeg {
class Packet;
@@ -90,6 +93,10 @@ public:
return m_frame->data[plane];
}
const u8* GetPlane(int plane) const {
return m_frame->data[plane];
}
u8** GetPlanes() const {
return m_frame->data;
}
@@ -98,6 +105,14 @@ public:
m_frame->format = format;
}
bool IsInterlaced() const {
return m_frame->interlaced_frame != 0;
}
bool IsHardwareDecoded() const {
return m_frame->hw_frames_ctx != nullptr;
}
AVFrame* GetFrame() const {
return m_frame;
}
@@ -160,33 +175,22 @@ public:
void InitializeHardwareDecoder(const HardwareContext& context, AVPixelFormat hw_pix_fmt);
bool OpenContext(const Decoder& decoder);
bool SendPacket(const Packet& packet);
std::unique_ptr<Frame> ReceiveFrame(bool* out_is_interlaced);
std::shared_ptr<Frame> ReceiveFrame();
AVCodecContext* GetCodecContext() const {
return m_codec_context;
}
bool UsingDecodeOrder() const {
return m_decode_order;
}
private:
const Decoder& m_decoder;
AVCodecContext* m_codec_context{};
};
// Wraps an AVFilterGraph.
class DeinterlaceFilter {
public:
YUZU_NON_COPYABLE(DeinterlaceFilter);
YUZU_NON_MOVEABLE(DeinterlaceFilter);
explicit DeinterlaceFilter(const Frame& frame);
~DeinterlaceFilter();
bool AddSourceFrame(const Frame& frame);
std::unique_ptr<Frame> DrainSinkFrame();
private:
AVFilterGraph* m_filter_graph{};
AVFilterContext* m_source_context{};
AVFilterContext* m_sink_context{};
bool m_initialized{};
s32 m_got_frame{};
std::shared_ptr<Frame> m_temp_frame{};
bool m_decode_order{};
};
class DecodeApi {
@@ -200,14 +204,17 @@ public:
bool Initialize(Tegra::Host1x::NvdecCommon::VideoCodec codec);
void Reset();
bool SendPacket(std::span<const u8> packet_data, size_t configuration_size);
void ReceiveFrames(std::queue<std::unique_ptr<Frame>>& frame_queue);
bool UsingDecodeOrder() const {
return m_decoder_context->UsingDecodeOrder();
}
bool SendPacket(std::span<const u8> packet_data);
std::shared_ptr<Frame> ReceiveFrame();
private:
std::optional<FFmpeg::Decoder> m_decoder;
std::optional<FFmpeg::DecoderContext> m_decoder_context;
std::optional<FFmpeg::HardwareContext> m_hardware_context;
std::optional<FFmpeg::DeinterlaceFilter> m_deinterlace_filter;
};
} // namespace FFmpeg

View File

@@ -3,10 +3,10 @@
#include "core/core.h"
#include "video_core/host1x/host1x.h"
#include "video_core/host1x/nvdec.h"
#include "video_core/host1x/vic.h"
namespace Tegra {
namespace Host1x {
namespace Tegra::Host1x {
Host1x::Host1x(Core::System& system_)
: system{system_}, syncpoint_manager{},
@@ -15,6 +15,22 @@ Host1x::Host1x(Core::System& system_)
Host1x::~Host1x() = default;
} // namespace Host1x
void Host1x::StartDevice(s32 fd, ChannelType type, u32 syncpt) {
switch (type) {
case ChannelType::NvDec:
devices[fd] = std::make_unique<Tegra::Host1x::Nvdec>(*this, fd, syncpt, frame_queue);
break;
case ChannelType::VIC:
devices[fd] = std::make_unique<Tegra::Host1x::Vic>(*this, fd, syncpt, frame_queue);
break;
default:
LOG_ERROR(HW_GPU, "Unimplemented host1x device {}", static_cast<u32>(type));
break;
}
}
} // namespace Tegra
void Host1x::StopDevice(s32 fd, ChannelType type) {
devices.erase(fd);
}
} // namespace Tegra::Host1x
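StartDevice/StopDevice let the nvhost layer create one CDmaPusher-backed device per channel fd, with PushEntries routing command lists to it. A rough lifecycle sketch (the fd and syncpoint values, and the entries variable, are made up for illustration):

host1x.StartDevice(/*fd=*/3, Tegra::Host1x::ChannelType::NvDec, /*syncpt=*/5);
host1x.PushEntries(3, std::move(entries));   // forwarded to the per-fd CDmaPusher
host1x.StopDevice(3, Tegra::Host1x::ChannelType::NvDec);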

View File

@@ -3,9 +3,14 @@
#pragma once
#include <unordered_map>
#include <unordered_set>
#include <queue>
#include "common/common_types.h"
#include "common/address_space.h"
#include "video_core/cdma_pusher.h"
#include "video_core/host1x/gpu_device_memory_manager.h"
#include "video_core/host1x/syncpoint_manager.h"
#include "video_core/memory_manager.h"
@@ -14,15 +19,128 @@ namespace Core {
class System;
} // namespace Core
namespace Tegra {
namespace FFmpeg {
class Frame;
} // namespace FFmpeg
namespace Host1x {
namespace Tegra::Host1x {
class Nvdec;
class FrameQueue {
public:
void Open(s32 fd) {
std::scoped_lock l{m_mutex};
m_presentation_order.insert({fd, {}});
m_decode_order.insert({fd, {}});
}
void Close(s32 fd) {
std::scoped_lock l{m_mutex};
m_presentation_order.erase(fd);
m_decode_order.erase(fd);
}
s32 VicFindNvdecFdFromOffset(u64 search_offset) {
std::scoped_lock l{m_mutex};
// Vic does not know which nvdec is producing frames for it, so search all the fds here for
// the given offset.
for (auto& map : m_presentation_order) {
for (auto& [offset, frame] : map.second) {
if (offset == search_offset) {
return map.first;
}
}
}
for (auto& map : m_decode_order) {
for (auto& [offset, frame] : map.second) {
if (offset == search_offset) {
return map.first;
}
}
}
return -1;
}
void PushPresentOrder(s32 fd, u64 offset, std::shared_ptr<FFmpeg::Frame>&& frame) {
std::scoped_lock l{m_mutex};
auto map = m_presentation_order.find(fd);
map->second.emplace_back(offset, std::move(frame));
}
void PushDecodeOrder(s32 fd, u64 offset, std::shared_ptr<FFmpeg::Frame>&& frame) {
std::scoped_lock l{m_mutex};
auto map = m_decode_order.find(fd);
map->second.insert_or_assign(offset, std::move(frame));
}
std::shared_ptr<FFmpeg::Frame> GetFrame(s32 fd, u64 offset) {
if (fd == -1) {
return {};
}
std::scoped_lock l{m_mutex};
auto present_map = m_presentation_order.find(fd);
if (present_map->second.size() > 0) {
return GetPresentOrderLocked(fd);
}
auto decode_map = m_decode_order.find(fd);
if (decode_map->second.size() > 0) {
return GetDecodeOrderLocked(fd, offset);
}
return {};
}
private:
std::shared_ptr<FFmpeg::Frame> GetPresentOrderLocked(s32 fd) {
auto map = m_presentation_order.find(fd);
if (map->second.size() == 0) {
return {};
}
auto frame = std::move(map->second.front().second);
map->second.pop_front();
return frame;
}
std::shared_ptr<FFmpeg::Frame> GetDecodeOrderLocked(s32 fd, u64 offset) {
auto map = m_decode_order.find(fd);
auto it = map->second.find(offset);
if (it == map->second.end()) {
return {};
}
return std::move(map->second.extract(it).mapped());
}
using FramePtr = std::shared_ptr<FFmpeg::Frame>;
std::mutex m_mutex{};
std::unordered_map<s32, std::deque<std::pair<u64, FramePtr>>> m_presentation_order;
std::unordered_map<s32, std::unordered_map<u64, FramePtr>> m_decode_order;
};
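FrameQueue decouples the nvdec producers from the VIC consumer: frames are queued either in presentation order (a per-fd deque) or in decode order (keyed by surface offset), and the VIC can recover the producing fd from an offset alone. A sketch of the intended flow; the fd, offset, and frame names are illustrative, the actual calls live in Nvdec and Vic:

frame_queue.Open(nvdec_fd);
frame_queue.PushPresentOrder(nvdec_fd, luma_offset, std::move(decoded_frame));
// ... later, the VIC looks up which nvdec produced the surface it was handed:
const s32 fd = frame_queue.VicFindNvdecFdFromOffset(luma_offset);
if (auto frame = frame_queue.GetFrame(fd, luma_offset)) {
    // convert/blit `frame` into the VIC output surface
}
frame_queue.Close(nvdec_fd);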
enum class ChannelType : u32 {
MsEnc = 0,
VIC = 1,
GPU = 2,
NvDec = 3,
Display = 4,
NvJpg = 5,
TSec = 6,
Max = 7,
};
class Host1x {
public:
explicit Host1x(Core::System& system);
~Host1x();
Core::System& System() {
return system;
}
SyncpointManager& GetSyncpointManager() {
return syncpoint_manager;
}
@@ -55,14 +173,25 @@ public:
return *allocator;
}
void StartDevice(s32 fd, ChannelType type, u32 syncpt);
void StopDevice(s32 fd, ChannelType type);
void PushEntries(s32 fd, ChCommandHeaderList&& entries) {
auto it = devices.find(fd);
if (it == devices.end()) {
return;
}
it->second->PushEntries(std::move(entries));
}
private:
Core::System& system;
SyncpointManager syncpoint_manager;
Tegra::MaxwellDeviceMemoryManager memory_manager;
Tegra::MemoryManager gmmu_manager;
std::unique_ptr<Common::FlatAllocator<u32, 0, 32>> allocator;
FrameQueue frame_queue;
std::unordered_map<s32, std::unique_ptr<CDmaPusher>> devices;
};
} // namespace Host1x
} // namespace Tegra
} // namespace Tegra::Host1x

View File

@@ -2,6 +2,12 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include "common/assert.h"
#include "common/polyfill_thread.h"
#include "common/settings.h"
#include "video_core/host1x/codecs/h264.h"
#include "video_core/host1x/codecs/vp8.h"
#include "video_core/host1x/codecs/vp9.h"
#include "video_core/host1x/host1x.h"
#include "video_core/host1x/nvdec.h"
@@ -10,37 +16,70 @@ namespace Tegra::Host1x {
#define NVDEC_REG_INDEX(field_name) \
(offsetof(NvdecCommon::NvdecRegisters, field_name) / sizeof(u64))
Nvdec::Nvdec(Host1x& host1x_)
: host1x(host1x_), state{}, codec(std::make_unique<Codec>(host1x, state)) {}
Nvdec::Nvdec(Host1x& host1x_, s32 id_, u32 syncpt, FrameQueue& frame_queue_)
: CDmaPusher{host1x_, id_}, id{id_}, syncpoint{syncpt}, frame_queue{frame_queue_} {
LOG_INFO(HW_GPU, "Created nvdec {}", id);
frame_queue.Open(id);
}
Nvdec::~Nvdec() = default;
Nvdec::~Nvdec() {
LOG_INFO(HW_GPU, "Destroying nvdec {}", id);
frame_queue.Close(id);
}
void Nvdec::ProcessMethod(u32 method, u32 argument) {
state.reg_array[method] = static_cast<u64>(argument) << 8;
regs.reg_array[method] = argument;
switch (method) {
case NVDEC_REG_INDEX(set_codec_id):
codec->SetTargetCodec(static_cast<NvdecCommon::VideoCodec>(argument));
CreateDecoder(static_cast<NvdecCommon::VideoCodec>(argument));
break;
case NVDEC_REG_INDEX(execute):
case NVDEC_REG_INDEX(execute): {
if (wait_needed) {
std::this_thread::sleep_for(std::chrono::milliseconds(32));
wait_needed = false;
}
Execute();
break;
} break;
}
}
std::unique_ptr<FFmpeg::Frame> Nvdec::GetFrame() {
return codec->GetCurrentFrame();
void Nvdec::CreateDecoder(NvdecCommon::VideoCodec codec) {
if (decoder.get()) {
return;
}
switch (codec) {
case NvdecCommon::VideoCodec::H264:
decoder = std::make_unique<Decoders::H264>(host1x, regs, id, frame_queue);
break;
case NvdecCommon::VideoCodec::VP8:
decoder = std::make_unique<Decoders::VP8>(host1x, regs, id, frame_queue);
break;
case NvdecCommon::VideoCodec::VP9:
decoder = std::make_unique<Decoders::VP9>(host1x, regs, id, frame_queue);
break;
default:
UNIMPLEMENTED_MSG("Codec {}", static_cast<u32>(codec));
return; // no decoder was created; bail out instead of dereferencing it in the log below
}
LOG_INFO(HW_GPU, "Created decoder {} for id {}", decoder->GetCurrentCodecName(), id);
}
void Nvdec::Execute() {
switch (codec->GetCurrentCodec()) {
if (Settings::values.nvdec_emulation.GetValue() == Settings::NvdecEmulation::Off) [[unlikely]] {
// Signalling syncpts too fast can cause games to get stuck as they don't expect a <1ms
// execution time. Sleep for half of a 60 fps frame just in case.
std::this_thread::sleep_for(std::chrono::milliseconds(8));
return;
}
switch (decoder->GetCurrentCodec()) {
case NvdecCommon::VideoCodec::H264:
case NvdecCommon::VideoCodec::VP8:
case NvdecCommon::VideoCodec::VP9:
codec->Decode();
decoder->Decode();
break;
default:
UNIMPLEMENTED_MSG("Codec {}", codec->GetCurrentCodecName());
UNIMPLEMENTED_MSG("Codec {}", decoder->GetCurrentCodecName());
break;
}
}
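Putting the two paths above together: a decoder is created lazily by the set_codec_id method and a frame is decoded by the execute method. Sketched as raw method writes (the pushing side is assumed; NVDEC_REG_INDEX is the macro defined at the top of this file):

nvdec.ProcessMethod(NVDEC_REG_INDEX(set_codec_id),
                    static_cast<u32>(NvdecCommon::VideoCodec::H264));  // lazily creates the H264 decoder
nvdec.ProcessMethod(NVDEC_REG_INDEX(execute), 0);                      // decodes one frame, or only sleeps
                                                                       // when nvdec emulation is off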

View File

@@ -5,33 +5,47 @@
#include <memory>
#include <vector>
#include "common/common_types.h"
#include "video_core/host1x/codecs/codec.h"
#include "video_core/cdma_pusher.h"
#include "video_core/host1x/codecs/decoder.h"
namespace Tegra {
namespace Host1x {
class Host1x;
class FrameQueue;
class Nvdec {
class Nvdec final : public CDmaPusher {
public:
explicit Nvdec(Host1x& host1x);
explicit Nvdec(Host1x& host1x, s32 id, u32 syncpt, FrameQueue& frame_queue_);
~Nvdec();
/// Writes the method into the state, invoking Execute() if encountered
void ProcessMethod(u32 method, u32 argument);
void ProcessMethod(u32 method, u32 arg) override;
/// Return most recently decoded frame
[[nodiscard]] std::unique_ptr<FFmpeg::Frame> GetFrame();
u32 GetSyncpoint() const {
return syncpoint;
}
void SetWait() {
wait_needed = true;
}
private:
/// Create the decoder when the codec id is set
void CreateDecoder(NvdecCommon::VideoCodec codec);
/// Invoke codec to decode a frame
void Execute();
Host1x& host1x;
NvdecCommon::NvdecRegisters state;
std::unique_ptr<Codec> codec;
s32 id;
u32 syncpoint;
FrameQueue& frame_queue;
NvdecCommon::NvdecRegisters regs{};
std::unique_ptr<Decoder> decoder;
bool wait_needed{false};
};
} // namespace Host1x

View File

@@ -17,6 +17,17 @@ enum class VideoCodec : u64 {
VP9 = 0x9,
};
struct Offset {
constexpr u64 Address() const noexcept {
return offset << 8;
}
private:
u64 offset;
};
static_assert(std::is_trivial_v<Offset>, "Offset must be trivial");
static_assert(sizeof(Offset) == 0x8, "Offset has the wrong size!");
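The Offset wrapper changes where the 8-bit shift happens: previously ProcessMethod stored `argument << 8` into the register file, while the reworked code stores the raw method argument and applies the shift when the offset is read through Address(). A tiny sketch of the equivalence (the value is illustrative):

constexpr u64 DecodedAddress(u64 raw_register_value) {
    return raw_register_value << 8;   // what Offset::Address() returns
}
static_assert(DecodedAddress(0x1234) == 0x123400);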
// NVDEC should use a 32-bit address space, but is mapped to 64-bit,
// doubling the sizes here compensates for that.
struct NvdecRegisters {
@@ -38,29 +49,40 @@ struct NvdecRegisters {
BitField<17, 1, u64> all_intra_frame;
};
} control_params;
u64 picture_info_offset; ///< 0x0808
u64 frame_bitstream_offset; ///< 0x0810
u64 frame_number; ///< 0x0818
u64 h264_slice_data_offsets; ///< 0x0820
u64 h264_mv_dump_offset; ///< 0x0828
INSERT_PADDING_WORDS_NOINIT(6); ///< 0x0830
u64 frame_stats_offset; ///< 0x0848
u64 h264_last_surface_luma_offset; ///< 0x0850
u64 h264_last_surface_chroma_offset; ///< 0x0858
std::array<u64, 17> surface_luma_offset; ///< 0x0860
std::array<u64, 17> surface_chroma_offset; ///< 0x08E8
INSERT_PADDING_WORDS_NOINIT(68); ///< 0x0970
u64 vp8_prob_data_offset; ///< 0x0A80
u64 vp8_header_partition_buf_offset; ///< 0x0A88
INSERT_PADDING_WORDS_NOINIT(60); ///< 0x0A90
u64 vp9_entropy_probs_offset; ///< 0x0B80
u64 vp9_backward_updates_offset; ///< 0x0B88
u64 vp9_last_frame_segmap_offset; ///< 0x0B90
u64 vp9_curr_frame_segmap_offset; ///< 0x0B98
INSERT_PADDING_WORDS_NOINIT(2); ///< 0x0BA0
u64 vp9_last_frame_mvs_offset; ///< 0x0BA8
u64 vp9_curr_frame_mvs_offset; ///< 0x0BB0
INSERT_PADDING_WORDS_NOINIT(2); ///< 0x0BB8
Offset picture_info_offset; ///< 0x0808
Offset frame_bitstream_offset; ///< 0x0810
u64 frame_number; ///< 0x0818
Offset h264_slice_data_offsets; ///< 0x0820
Offset h264_mv_dump_offset; ///< 0x0828
INSERT_PADDING_WORDS_NOINIT(6); ///< 0x0830
Offset frame_stats_offset; ///< 0x0848
Offset h264_last_surface_luma_offset; ///< 0x0850
Offset h264_last_surface_chroma_offset; ///< 0x0858
std::array<Offset, 17> surface_luma_offsets; ///< 0x0860
std::array<Offset, 17> surface_chroma_offsets; ///< 0x08E8
Offset pic_scratch_buf_offset; ///< 0x0970
Offset external_mvbuffer_offset; ///< 0x0978
INSERT_PADDING_WORDS_NOINIT(32); ///< 0x0980
Offset h264_mbhist_buffer_offset; ///< 0x0A00
INSERT_PADDING_WORDS_NOINIT(30); ///< 0x0A08
Offset vp8_prob_data_offset; ///< 0x0A80
Offset vp8_header_partition_buf_offset; ///< 0x0A88
INSERT_PADDING_WORDS_NOINIT(28); ///< 0x0A90
Offset hvec_scalist_list_offset; ///< 0x0B00
Offset hvec_tile_sizes_offset; ///< 0x0B08
Offset hvec_filter_buffer_offset; ///< 0x0B10
Offset hvec_sao_buffer_offset; ///< 0x0B18
Offset hvec_slice_info_buffer_offset; ///< 0x0B20
Offset hvec_slice_group_index_offset; ///< 0x0B28
INSERT_PADDING_WORDS_NOINIT(20); ///< 0x0B30
Offset vp9_prob_tab_buffer_offset; ///< 0x0B80
Offset vp9_ctx_counter_buffer_offset; ///< 0x0B88
Offset vp9_segment_read_buffer_offset; ///< 0x0B90
Offset vp9_segment_write_buffer_offset; ///< 0x0B98
Offset vp9_tile_size_buffer_offset; ///< 0x0BA0
Offset vp9_col_mvwrite_buffer_offset; ///< 0x0BA8
Offset vp9_col_mvread_buffer_offset; ///< 0x0BB0
Offset vp9_filter_buffer_offset; ///< 0x0BB8
};
std::array<u64, NUM_REGS> reg_array;
};
@@ -81,16 +103,16 @@ ASSERT_REG_POSITION(h264_slice_data_offsets, 0x104);
ASSERT_REG_POSITION(frame_stats_offset, 0x109);
ASSERT_REG_POSITION(h264_last_surface_luma_offset, 0x10A);
ASSERT_REG_POSITION(h264_last_surface_chroma_offset, 0x10B);
ASSERT_REG_POSITION(surface_luma_offset, 0x10C);
ASSERT_REG_POSITION(surface_chroma_offset, 0x11D);
ASSERT_REG_POSITION(surface_luma_offsets, 0x10C);
ASSERT_REG_POSITION(surface_chroma_offsets, 0x11D);
ASSERT_REG_POSITION(vp8_prob_data_offset, 0x150);
ASSERT_REG_POSITION(vp8_header_partition_buf_offset, 0x151);
ASSERT_REG_POSITION(vp9_entropy_probs_offset, 0x170);
ASSERT_REG_POSITION(vp9_backward_updates_offset, 0x171);
ASSERT_REG_POSITION(vp9_last_frame_segmap_offset, 0x172);
ASSERT_REG_POSITION(vp9_curr_frame_segmap_offset, 0x173);
ASSERT_REG_POSITION(vp9_last_frame_mvs_offset, 0x175);
ASSERT_REG_POSITION(vp9_curr_frame_mvs_offset, 0x176);
ASSERT_REG_POSITION(vp9_prob_tab_buffer_offset, 0x170);
ASSERT_REG_POSITION(vp9_ctx_counter_buffer_offset, 0x171);
ASSERT_REG_POSITION(vp9_segment_read_buffer_offset, 0x172);
ASSERT_REG_POSITION(vp9_segment_write_buffer_offset, 0x173);
ASSERT_REG_POSITION(vp9_col_mvwrite_buffer_offset, 0x175);
ASSERT_REG_POSITION(vp9_col_mvread_buffer_offset, 0x176);
#undef ASSERT_REG_POSITION

View File

@@ -1,50 +0,0 @@
// SPDX-FileCopyrightText: Ryujinx Team and Contributors
// SPDX-License-Identifier: MIT
#include <algorithm>
#include "sync_manager.h"
#include "video_core/host1x/host1x.h"
#include "video_core/host1x/syncpoint_manager.h"
namespace Tegra {
namespace Host1x {
SyncptIncrManager::SyncptIncrManager(Host1x& host1x_) : host1x(host1x_) {}
SyncptIncrManager::~SyncptIncrManager() = default;
void SyncptIncrManager::Increment(u32 id) {
increments.emplace_back(0, 0, id, true);
IncrementAllDone();
}
u32 SyncptIncrManager::IncrementWhenDone(u32 class_id, u32 id) {
const u32 handle = current_id++;
increments.emplace_back(handle, class_id, id);
return handle;
}
void SyncptIncrManager::SignalDone(u32 handle) {
const auto done_incr =
std::find_if(increments.begin(), increments.end(),
[handle](const SyncptIncr& incr) { return incr.id == handle; });
if (done_incr != increments.cend()) {
done_incr->complete = true;
}
IncrementAllDone();
}
void SyncptIncrManager::IncrementAllDone() {
std::size_t done_count = 0;
for (; done_count < increments.size(); ++done_count) {
if (!increments[done_count].complete) {
break;
}
auto& syncpoint_manager = host1x.GetSyncpointManager();
syncpoint_manager.IncrementGuest(increments[done_count].syncpt_id);
syncpoint_manager.IncrementHost(increments[done_count].syncpt_id);
}
increments.erase(increments.begin(), increments.begin() + done_count);
}
} // namespace Host1x
} // namespace Tegra

View File

@@ -1,53 +0,0 @@
// SPDX-FileCopyrightText: Ryujinx Team and Contributors
// SPDX-License-Identifier: MIT
#pragma once
#include <mutex>
#include <vector>
#include "common/common_types.h"
namespace Tegra {
namespace Host1x {
class Host1x;
struct SyncptIncr {
u32 id;
u32 class_id;
u32 syncpt_id;
bool complete;
SyncptIncr(u32 id_, u32 class_id_, u32 syncpt_id_, bool done = false)
: id(id_), class_id(class_id_), syncpt_id(syncpt_id_), complete(done) {}
};
class SyncptIncrManager {
public:
explicit SyncptIncrManager(Host1x& host1x);
~SyncptIncrManager();
/// Add syncpoint id and increment all
void Increment(u32 id);
/// Returns a handle to increment later
u32 IncrementWhenDone(u32 class_id, u32 id);
/// IncrementAllDone, including handle
void SignalDone(u32 handle);
/// Increment all sequential pending increments that are already done.
void IncrementAllDone();
private:
std::vector<SyncptIncr> increments;
std::mutex increment_lock;
u32 current_id{};
Host1x& host1x;
};
} // namespace Host1x
} // namespace Tegra

View File

@@ -18,7 +18,7 @@ SyncpointManager::ActionHandle SyncpointManager::RegisterAction(
return {};
}
std::unique_lock lk(guard);
std::scoped_lock lk(guard);
if (syncpoint.load(std::memory_order_relaxed) >= expected_value) {
action();
return {};
@@ -35,7 +35,7 @@ SyncpointManager::ActionHandle SyncpointManager::RegisterAction(
void SyncpointManager::DeregisterAction(std::list<RegisteredAction>& action_storage,
const ActionHandle& handle) {
std::unique_lock lk(guard);
std::scoped_lock lk(guard);
// We want to ensure the iterator still exists prior to erasing it
// Otherwise, if an invalid iterator was passed in then it could lead to UB
@@ -78,7 +78,7 @@ void SyncpointManager::Increment(std::atomic<u32>& syncpoint, std::condition_var
std::list<RegisteredAction>& action_storage) {
auto new_value{syncpoint.fetch_add(1, std::memory_order_acq_rel) + 1};
std::unique_lock lk(guard);
std::scoped_lock lk(guard);
auto it = action_storage.begin();
while (it != action_storage.end()) {
if (it->expected_value > new_value) {

File diff suppressed because it is too large

View File

@@ -3,65 +3,646 @@
#pragma once
#include <condition_variable>
#include <functional>
#include <memory>
#include <mutex>
#include <thread>
#include "common/common_types.h"
#include "common/scratch_buffer.h"
#include "video_core/cdma_pusher.h"
struct SwsContext;
namespace Tegra {
namespace Host1x {
namespace Tegra::Host1x {
class Host1x;
class Nvdec;
union VicConfig;
class Vic {
struct Pixel {
u16 r;
u16 g;
u16 b;
u16 a;
};
// One underscore represents separate pixels.
// Double underscore represents separate planes.
// _N represents chroma subsampling, not a separate pixel.
enum class VideoPixelFormat : u32 {
A8 = 0,
L8 = 1,
A4L4 = 2,
L4A4 = 3,
R8 = 4,
A8L8 = 5,
L8A8 = 6,
R8G8 = 7,
G8R8 = 8,
B5G6R5 = 9,
R5G6B5 = 10,
B6G5R5 = 11,
R5G5B6 = 12,
A1B5G5R5 = 13,
A1R5G5B5 = 14,
B5G5R5A1 = 15,
R5G5B5A1 = 16,
A5B5G5R1 = 17,
A5R1G5B5 = 18,
B5G5R1A5 = 19,
R1G5B5A5 = 20,
X1B5G5R5 = 21,
X1R5G5B5 = 22,
B5G5R5X1 = 23,
R5G5B5X1 = 24,
A4B4G5R4 = 25,
A4R4G4B4 = 26,
B4G4R4A4 = 27,
R4G4B4A4 = 28,
B8G8R8 = 29,
R8G8B8 = 30,
A8B8G8R8 = 31,
A8R8G8B8 = 32,
B8G8R8A8 = 33,
R8G8B8A8 = 34,
X8B8G8R8 = 35,
X8R8G8B8 = 36,
B8G8R8X8 = 37,
R8G8B8X8 = 38,
A8B10G10R10 = 39,
A2R10G10B10 = 40,
B10G10R10A2 = 41,
R10G10B10A2 = 42,
A4P4 = 43,
P4A4 = 44,
P8A8 = 45,
A8P8 = 46,
P8 = 47,
P1 = 48,
U8V8 = 49,
V8U8 = 50,
A8Y8U8V8 = 51,
V8U8Y8A8 = 52,
Y8U8V8 = 53,
Y8V8U8 = 54,
U8V8Y8 = 55,
V8U8Y8 = 56,
Y8U8_Y8V8 = 57,
Y8V8_Y8U8 = 58,
U8Y8_V8Y8 = 59,
V8Y8_U8Y8 = 60,
Y8__U8V8_N444 = 61,
Y8__V8U8_N444 = 62,
Y8__U8V8_N422 = 63,
Y8__V8U8_N422 = 64,
Y8__U8V8_N422R = 65,
Y8__V8U8_N422R = 66,
Y8__U8V8_N420 = 67,
Y8__V8U8_N420 = 68,
Y8__U8__V8_N444 = 69,
Y8__U8__V8_N422 = 70,
Y8__U8__V8_N422R = 71,
Y8__U8__V8_N420 = 72,
U8 = 73,
V8 = 74,
};
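As the comment above notes, the _N4xx suffixes describe chroma subsampling rather than extra pixels; for example Y8__V8U8_N420 is a luma plane plus an interleaved V/U plane at 4:2:0 (NV21-like), while Y8__U8__V8_N420 is three separate planes (I420-like). A small illustration of what 4:2:0 implies for plane dimensions; ChromaDimensions420 is a hypothetical helper, not part of this change:

#include <utility>

constexpr std::pair<u32, u32> ChromaDimensions420(u32 luma_width, u32 luma_height) {
    return {(luma_width + 1) / 2, (luma_height + 1) / 2};   // halved in both dimensions
}
static_assert(ChromaDimensions420(1280, 720) == std::pair<u32, u32>(640, 360));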
struct Offset {
constexpr u32 Address() const noexcept {
return offset << 8;
}
private:
u32 offset;
};
static_assert(std::is_trivial_v<Offset>, "Offset must be trivial");
static_assert(sizeof(Offset) == 0x4, "Offset has the wrong size!");
struct PlaneOffsets {
Offset luma;
Offset chroma_u;
Offset chroma_v;
};
static_assert(sizeof(PlaneOffsets) == 0xC, "PlaneOffsets has the wrong size!");
enum SurfaceIndex : u32 {
Current = 0,
Previous = 1,
Next = 2,
NextNoiseReduced = 3,
CurrentMotion = 4,
PreviousMotion = 5,
PreviousPreviousMotion = 6,
CombinedMotion = 7,
};
enum class DXVAHD_ALPHA_FILL_MODE : u32 {
OPAQUE = 0,
BACKGROUND = 1,
DESTINATION = 2,
SOURCE_STREAM = 3,
COMPOSITED = 4,
SOURCE_ALPHA = 5,
};
enum class DXVAHD_FRAME_FORMAT : u64 {
PROGRESSIVE = 0,
INTERLACED_TOP_FIELD_FIRST = 1,
INTERLACED_BOTTOM_FIELD_FIRST = 2,
TOP_FIELD = 3,
BOTTOM_FIELD = 4,
SUBPIC_PROGRESSIVE = 5,
SUBPIC_INTERLACED_TOP_FIELD_FIRST = 6,
SUBPIC_INTERLACED_BOTTOM_FIELD_FIRST = 7,
SUBPIC_TOP_FIELD = 8,
SUBPIC_BOTTOM_FIELD = 9,
TOP_FIELD_CHROMA_BOTTOM = 10,
BOTTOM_FIELD_CHROMA_TOP = 11,
SUBPIC_TOP_FIELD_CHROMA_BOTTOM = 12,
SUBPIC_BOTTOM_FIELD_CHROMA_TOP = 13,
};
enum class DXVAHD_DEINTERLACE_MODE_PRIVATE : u64 {
WEAVE = 0,
BOB_FIELD = 1,
BOB = 2,
NEWBOB = 3,
DISI1 = 4,
WEAVE_LUMA_BOB_FIELD_CHROMA = 5,
MAX = 0xF,
};
enum class BLK_KIND {
PITCH = 0,
GENERIC_16Bx2 = 1,
// These are unsupported in the vic
BL_NAIVE = 2,
BL_KEPLER_XBAR_RAW = 3,
VP2_TILED = 15,
};
enum class BLEND_SRCFACTC : u32 {
K1 = 0,
K1_TIMES_DST = 1,
NEG_K1_TIMES_DST = 2,
K1_TIMES_SRC = 3,
ZERO = 4,
};
enum class BLEND_DSTFACTC : u32 {
K1 = 0,
K2 = 1,
K1_TIMES_DST = 2,
NEG_K1_TIMES_DST = 3,
NEG_K1_TIMES_SRC = 4,
ZERO = 5,
ONE = 6,
};
enum class BLEND_SRCFACTA : u32 {
K1 = 0,
K2 = 1,
NEG_K1_TIMES_DST = 2,
ZERO = 3,
MAX = 7,
};
enum class BLEND_DSTFACTA : u32 {
K2 = 0,
NEG_K1_TIMES_SRC = 1,
ZERO = 2,
ONE = 3,
MAX = 7,
};
struct PipeConfig {
union {
BitField<0, 11, u32> downsample_horiz;
BitField<11, 5, u32> reserved0;
BitField<16, 11, u32> downsample_vert;
BitField<27, 5, u32> reserved1;
};
u32 reserved2;
u32 reserved3;
u32 reserved4;
};
static_assert(sizeof(PipeConfig) == 0x10, "PipeConfig has the wrong size!");
struct OutputConfig {
union {
BitField<0, 3, DXVAHD_ALPHA_FILL_MODE> alpha_fill_mode;
BitField<3, 3, u64> alpha_fill_slot;
BitField<6, 10, u64> background_a;
BitField<16, 10, u64> background_r;
BitField<26, 10, u64> background_g;
BitField<36, 10, u64> background_b;
BitField<46, 2, u64> regamma_mode;
BitField<48, 1, u64> output_flip_x;
BitField<49, 1, u64> output_flip_y;
BitField<50, 1, u64> output_transpose;
BitField<51, 1, u64> reserved1;
BitField<52, 12, u64> reserved2;
};
union {
BitField<0, 14, u32> target_rect_left;
BitField<14, 2, u32> reserved3;
BitField<16, 14, u32> target_rect_right;
BitField<30, 2, u32> reserved4;
};
union {
BitField<0, 14, u32> target_rect_top;
BitField<14, 2, u32> reserved5;
BitField<16, 14, u32> target_rect_bottom;
BitField<30, 2, u32> reserved6;
};
};
static_assert(sizeof(OutputConfig) == 0x10, "OutputConfig has the wrong size!");
struct OutputSurfaceConfig {
union {
BitField<0, 7, VideoPixelFormat> out_pixel_format;
BitField<7, 2, u32> out_chroma_loc_horiz;
BitField<9, 2, u32> out_chroma_loc_vert;
BitField<11, 4, BLK_KIND> out_block_kind;
BitField<15, 4, u32> out_block_height; // in gobs, log2
BitField<19, 3, u32> reserved0;
BitField<22, 10, u32> reserved1;
};
union {
BitField<0, 14, u32> out_surface_width; // - 1
BitField<14, 14, u32> out_surface_height; // - 1
BitField<28, 4, u32> reserved2;
};
union {
BitField<0, 14, u32> out_luma_width; // - 1
BitField<14, 14, u32> out_luma_height; // - 1
BitField<28, 4, u32> reserved3;
};
union {
BitField<0, 14, u32> out_chroma_width; // - 1
BitField<14, 14, u32> out_chroma_height; // - 1
BitField<28, 4, u32> reserved4;
};
};
static_assert(sizeof(OutputSurfaceConfig) == 0x10, "OutputSurfaceConfig has the wrong size!");
struct MatrixStruct {
union {
BitField<0, 20, s64> matrix_coeff00; // (0,0) of 4x3 conversion matrix
BitField<20, 20, s64> matrix_coeff10; // (1,0) of 4x3 conversion matrix
BitField<40, 20, s64> matrix_coeff20; // (2,0) of 4x3 conversion matrix
BitField<60, 4, u64> matrix_r_shift;
};
union {
BitField<0, 20, s64> matrix_coeff01; // (0,1) of 4x3 conversion matrix
BitField<20, 20, s64> matrix_coeff11; // (1,1) of 4x3 conversion matrix
BitField<40, 20, s64> matrix_coeff21; // (2,1) of 4x3 conversion matrix
BitField<60, 3, u64> reserved0;
BitField<63, 1, u64> matrix_enable;
};
union {
BitField<0, 20, s64> matrix_coeff02; // (0,2) of 4x3 conversion matrix
BitField<20, 20, s64> matrix_coeff12; // (1,2) of 4x3 conversion matrix
BitField<40, 20, s64> matrix_coeff22; // (2,2) of 4x3 conversion matrix
BitField<60, 4, u64> reserved1;
};
union {
BitField<0, 20, s64> matrix_coeff03; // (0,3) of 4x3 conversion matrix
BitField<20, 20, s64> matrix_coeff13; // (1,3) of 4x3 conversion matrix
BitField<40, 20, s64> matrix_coeff23; // (2,3) of 4x3 conversion matrix
BitField<60, 4, u64> reserved2;
};
};
static_assert(sizeof(MatrixStruct) == 0x20, "MatrixStruct has the wrong size!");
struct ClearRectStruct {
union {
BitField<0, 14, u32> clear_rect0_left;
BitField<14, 2, u32> reserved0;
BitField<16, 14, u32> clear_rect0_right;
BitField<30, 2, u32> reserved1;
};
union {
BitField<0, 14, u32> clear_rect0_top;
BitField<14, 2, u32> reserved2;
BitField<16, 14, u32> clear_rect0_bottom;
BitField<30, 2, u32> reserved3;
};
union {
BitField<0, 14, u32> clear_rect1_left;
BitField<14, 2, u32> reserved4;
BitField<16, 14, u32> clear_rect1_right;
BitField<30, 2, u32> reserved5;
};
union {
BitField<0, 14, u32> clear_rect1_top;
BitField<14, 2, u32> reserved6;
BitField<16, 14, u32> clear_rect1_bottom;
BitField<30, 2, u32> reserved7;
};
};
static_assert(sizeof(ClearRectStruct) == 0x10, "ClearRectStruct has the wrong size!");
struct SlotConfig {
union {
BitField<0, 1, u64> slot_enable;
BitField<1, 1, u64> denoise;
BitField<2, 1, u64> advanced_denoise;
BitField<3, 1, u64> cadence_detect;
BitField<4, 1, u64> motion_map;
BitField<5, 1, u64> motion_map_capture;
BitField<6, 1, u64> is_even;
BitField<7, 1, u64> chroma_even;
// fetch control struct
BitField<8, 1, u64> current_field_enable;
BitField<9, 1, u64> prev_field_enable;
BitField<10, 1, u64> next_field_enable;
BitField<11, 1, u64> next_nr_field_enable; // noise reduction
BitField<12, 1, u64> current_motion_field_enable;
BitField<13, 1, u64> prev_motion_field_enable;
BitField<14, 1, u64> prev_prev_motion_field_enable;
BitField<15, 1, u64> combined_motion_field_enable;
BitField<16, 4, DXVAHD_FRAME_FORMAT> frame_format;
BitField<20, 2, u64> filter_length_y; // 0: 1-tap, 1: 2-tap, 2: 5-tap, 3: 10-tap
BitField<22, 2, u64> filter_length_x;
BitField<24, 12, u64> panoramic;
BitField<36, 22, u64> reserved1;
BitField<58, 6, u64> detail_filter_clamp;
};
union {
BitField<0, 10, u64> filter_noise;
BitField<10, 10, u64> filter_detail;
BitField<20, 10, u64> chroma_noise;
BitField<30, 10, u64> chroma_detail;
BitField<40, 4, DXVAHD_DEINTERLACE_MODE_PRIVATE> deinterlace_mode;
BitField<44, 3, u64> motion_accumulation_weight;
BitField<47, 11, u64> noise_iir;
BitField<58, 4, u64> light_level;
BitField<62, 2, u64> reserved4;
};
union {
BitField<0, 10, u64> soft_clamp_low;
BitField<10, 10, u64> soft_clamp_high;
BitField<20, 3, u64> reserved5;
BitField<23, 9, u64> reserved6;
BitField<32, 10, u64> planar_alpha;
BitField<42, 1, u64> constant_alpha;
BitField<43, 3, u64> stereo_interleave;
BitField<46, 1, u64> clip_enabled;
BitField<47, 8, u64> clear_rect_mask;
BitField<55, 2, u64> degamma_mode;
BitField<57, 1, u64> reserved7;
BitField<58, 1, u64> decompress_enable;
BitField<59, 5, u64> reserved9;
};
union {
BitField<0, 8, u64> decompress_ctb_count;
BitField<8, 32, u64> decompress_zbc_count;
BitField<40, 24, u64> reserved12;
};
union {
BitField<0, 30, u64> source_rect_left;
BitField<30, 2, u64> reserved14;
BitField<32, 30, u64> source_rect_right;
BitField<62, 2, u64> reserved15;
};
union {
BitField<0, 30, u64> source_rect_top;
BitField<30, 2, u64> reserved16;
BitField<32, 30, u64> source_rect_bottom;
BitField<62, 2, u64> reserved17;
};
union {
BitField<0, 14, u64> dest_rect_left;
BitField<14, 2, u64> reserved18;
BitField<16, 14, u64> dest_rect_right;
BitField<30, 2, u64> reserved19;
BitField<32, 14, u64> dest_rect_top;
BitField<46, 2, u64> reserved20;
BitField<48, 14, u64> dest_rect_bottom;
BitField<62, 2, u64> reserved21;
};
u32 reserved22;
u32 reserved23;
};
static_assert(sizeof(SlotConfig) == 0x40, "SlotConfig has the wrong size!");
struct SlotSurfaceConfig {
union {
BitField<0, 7, VideoPixelFormat> slot_pixel_format;
BitField<7, 2, u32> slot_chroma_loc_horiz;
BitField<9, 2, u32> slot_chroma_loc_vert;
BitField<11, 4, u32> slot_block_kind;
BitField<15, 4, u32> slot_block_height;
BitField<19, 3, u32> slot_cache_width;
BitField<22, 10, u32> reserved0;
};
union {
BitField<0, 14, u32> slot_surface_width; // - 1
BitField<14, 14, u32> slot_surface_height; // - 1
BitField<28, 4, u32> reserved1;
};
union {
BitField<0, 14, u32> slot_luma_width; // padded, - 1
BitField<14, 14, u32> slot_luma_height; // padded, - 1
BitField<28, 4, u32> reserved2;
};
union {
BitField<0, 14, u32> slot_chroma_width; // padded, - 1
BitField<14, 14, u32> slot_chroma_height; // padded, - 1
BitField<28, 4, u32> reserved3;
};
};
static_assert(sizeof(SlotSurfaceConfig) == 0x10, "SlotSurfaceConfig has the wrong size!");
struct LumaKeyStruct {
union {
BitField<0, 20, u64> luma_coeff0; // (0) of 4x1 conversion matrix, S12.8 format
BitField<20, 20, u64> luma_coeff1; // (1) of 4x1 conversion matrix, S12.8 format
BitField<40, 20, u64> luma_coeff2; // (2) of 4x1 conversion matrix, S12.8 format
BitField<60, 4, u64> luma_r_shift;
};
union {
BitField<0, 20, u64> luma_coeff3; // (3) of 4x1 conversion matrix, S12.8 format
BitField<20, 10, u64> luma_key_lower;
BitField<30, 10, u64> luma_key_upper;
BitField<40, 1, u64> luma_key_enabled;
BitField<41, 2, u64> reserved0;
BitField<43, 21, u64> reserved1;
};
};
static_assert(sizeof(LumaKeyStruct) == 0x10, "LumaKeyStruct has the wrong size!");
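// The luma_coeffN fields are 20-bit signed fixed-point values ("S12.8", taken
// here to mean two's-complement with 8 fractional bits). A minimal decode
// sketch; the helper name is illustrative and not part of the original header:
inline f32 DecodeS12_8(u64 raw) {
    const s32 value = static_cast<s32>(static_cast<u32>(raw) << 12) >> 12; // sign-extend 20 bits
    return static_cast<f32>(value) / 256.0f;                               // 8 fractional bits
}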
struct BlendingSlotStruct {
union {
BitField<0, 10, u32> alpha_k1;
BitField<10, 6, u32> reserved0;
BitField<16, 10, u32> alpha_k2;
BitField<26, 6, u32> reserved1;
};
union {
BitField<0, 3, BLEND_SRCFACTC> src_factor_color_match_select;
BitField<3, 1, u32> reserved2;
BitField<4, 3, BLEND_DSTFACTC> dst_factor_color_match_select;
BitField<7, 1, u32> reserved3;
BitField<8, 3, BLEND_SRCFACTA> src_factor_a_match_select;
BitField<11, 1, u32> reserved4;
BitField<12, 3, BLEND_DSTFACTA> dst_factor_a_match_select;
BitField<15, 1, u32> reserved5;
BitField<16, 4, u32> reserved6;
BitField<20, 4, u32> reserved7;
BitField<24, 4, u32> reserved8;
BitField<28, 4, u32> reserved9;
};
union {
BitField<0, 2, u32> reserved10;
BitField<2, 10, u32> override_r;
BitField<12, 10, u32> override_g;
BitField<22, 10, u32> override_b;
};
union {
BitField<0, 10, u32> override_a;
BitField<10, 2, u32> reserved11;
BitField<12, 1, u32> use_override_r;
BitField<13, 1, u32> use_override_g;
BitField<14, 1, u32> use_override_b;
BitField<15, 1, u32> use_override_a;
BitField<16, 1, u32> mask_r;
BitField<17, 1, u32> mask_g;
BitField<18, 1, u32> mask_b;
BitField<19, 1, u32> mask_a;
BitField<20, 12, u32> reserved12;
};
};
static_assert(sizeof(BlendingSlotStruct) == 0x10, "BlendingSlotStruct has the wrong size!");
struct SlotStruct {
SlotConfig config;
SlotSurfaceConfig surface_config;
LumaKeyStruct luma_key;
MatrixStruct color_matrix;
MatrixStruct gamut_matrix;
BlendingSlotStruct blending;
};
static_assert(sizeof(SlotStruct) == 0xB0, "SlotStruct has the wrong size!");
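// The 0xB0 total, together with the ConfigStruct offsets below, implies
// sizeof(MatrixStruct) == 0x20: 0x40 (config) + 0x10 (surface_config)
// + 0x10 (luma_key) + 2 * 0x20 (color_matrix, gamut_matrix) + 0x10 (blending)
// = 0xB0.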
struct ConfigStruct {
PipeConfig pipe_config;
OutputConfig output_config;
OutputSurfaceConfig output_surface_config;
MatrixStruct out_color_matrix;
std::array<ClearRectStruct, 4> clear_rects;
std::array<SlotStruct, 8> slot_structs;
};
static_assert(offsetof(ConfigStruct, pipe_config) == 0x0, "pipe_config is in the wrong place!");
static_assert(offsetof(ConfigStruct, output_config) == 0x10,
"output_config is in the wrong place!");
static_assert(offsetof(ConfigStruct, output_surface_config) == 0x20,
"output_surface_config is in the wrong place!");
static_assert(offsetof(ConfigStruct, out_color_matrix) == 0x30,
"out_color_matrix is in the wrong place!");
static_assert(offsetof(ConfigStruct, clear_rects) == 0x50, "clear_rects is in the wrong place!");
static_assert(offsetof(ConfigStruct, slot_structs) == 0x90, "slot_structs is in the wrong place!");
static_assert(sizeof(ConfigStruct) == 0x610, "ConfigStruct has the wrong size!");
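// Layout arithmetic behind these asserts: PipeConfig, OutputConfig and
// OutputSurfaceConfig are 0x10 bytes each (0x00-0x2F), MatrixStruct is 0x20
// (0x30-0x4F), the four ClearRectStructs take 4 * 0x10 = 0x40 (0x50-0x8F),
// and the eight SlotStructs add 8 * 0xB0 = 0x580, for a total of 0x610.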
struct VicRegisters {
static constexpr std::size_t NUM_REGS = 0x446;
union {
struct {
INSERT_PADDING_WORDS_NOINIT(0xC0);
u32 execute;
INSERT_PADDING_WORDS_NOINIT(0x3F);
std::array<std::array<PlaneOffsets, 8>, 8> surfaces;
u32 picture_index;
u32 control_params;
Offset config_struct_offset;
Offset filter_struct_offset;
Offset palette_offset;
Offset hist_offset;
u32 context_id;
u32 fce_ucode_size;
PlaneOffsets output_surface;
Offset fce_ucode_offset;
INSERT_PADDING_WORDS_NOINIT(0x4);
std::array<u32, 8> slot_context_ids;
std::array<Offset, 8> comp_tag_buffer_offsets;
std::array<Offset, 8> history_buffer_offset;
INSERT_PADDING_WORDS_NOINIT(0x25D);
u32 pm_trigger_end;
};
std::array<u32, NUM_REGS> reg_array;
};
};
static_assert(offsetof(VicRegisters, execute) == 0x300, "execute is in the wrong place!");
static_assert(offsetof(VicRegisters, surfaces) == 0x400, "surfaces is in the wrong place!");
static_assert(offsetof(VicRegisters, picture_index) == 0x700,
"picture_index is in the wrong place!");
static_assert(offsetof(VicRegisters, control_params) == 0x704,
"control_params is in the wrong place!");
static_assert(offsetof(VicRegisters, config_struct_offset) == 0x708,
"config_struct_offset is in the wrong place!");
static_assert(offsetof(VicRegisters, output_surface) == 0x720,
"output_surface is in the wrong place!");
static_assert(offsetof(VicRegisters, slot_context_ids) == 0x740,
"slot_context_ids is in the wrong place!");
static_assert(offsetof(VicRegisters, history_buffer_offset) == 0x780,
"history_buffer_offset is in the wrong place!");
static_assert(offsetof(VicRegisters, pm_trigger_end) == 0x1114,
"pm_trigger_end is in the wrong place!");
static_assert(sizeof(VicRegisters) == 0x1118, "VicRegisters has the wrong size!");
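// Register map arithmetic: the 0xC0 padding words put execute at byte offset
// 0xC0 * 4 = 0x300; execute plus the 0x3F padding words that follow (0x40
// words in total) place the surfaces array at 0x400; the 64 PlaneOffsets
// entries spanning 0x400-0x700 are therefore 12 bytes each (three 32-bit
// offsets); and NUM_REGS * sizeof(u32) = 0x446 * 4 = 0x1118 matches
// sizeof(VicRegisters).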
class Vic final : public CDmaPusher {
public:
enum class Method : u32 {
Execute = 0xc0,
SetControlParams = 0x1c1,
SetConfigStructOffset = 0x1c2,
SetOutputSurfaceLumaOffset = 0x1c8,
SetOutputSurfaceChromaOffset = 0x1c9,
SetOutputSurfaceChromaUnusedOffset = 0x1ca
Execute = offsetof(VicRegisters, execute),
SetControlParams = offsetof(VicRegisters, control_params),
SetConfigStructOffset = offsetof(VicRegisters, config_struct_offset),
SetOutputSurfaceLumaOffset = offsetof(VicRegisters, output_surface.luma),
SetOutputSurfaceChromaOffset = offsetof(VicRegisters, output_surface.chroma_u),
SetOutputSurfaceChromaUnusedOffset = offsetof(VicRegisters, output_surface.chroma_v)
};
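// The two sets of enumerator values above refer to the same registers: the
// hardcoded constants appear to be u32 word indices into reg_array, while the
// offsetof() forms are byte offsets, so each pair differs by a factor of
// sizeof(u32) (0xC0 * 4 == 0x300 for execute, 0x1C1 * 4 == 0x704 for
// control_params, 0x1C2 * 4 == 0x708 for config_struct_offset, and
// 0x1C8-0x1CA * 4 == 0x720-0x728 for the three output surface plane offsets).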
explicit Vic(Host1x& host1x, std::shared_ptr<Nvdec> nvdec_processor);
explicit Vic(Host1x& host1x, s32 id, u32 syncpt, FrameQueue& frame_queue);
~Vic();
/// Write to the device state.
void ProcessMethod(Method method, u32 argument);
void ProcessMethod(u32 method, u32 arg) override;
private:
void Execute();
void WriteRGBFrame(std::unique_ptr<FFmpeg::Frame> frame, const VicConfig& config);
void Blend(const ConfigStruct& config, const SlotStruct& slot);
void WriteYUVFrame(std::unique_ptr<FFmpeg::Frame> frame, const VicConfig& config);
template <bool Planar, bool Interlaced = false>
void ReadProgressiveY8__V8U8_N420(const SlotStruct& slot, std::span<const PlaneOffsets> offsets,
std::shared_ptr<const FFmpeg::Frame> frame);
template <bool Planar, bool TopField>
void ReadInterlacedY8__V8U8_N420(const SlotStruct& slot, std::span<const PlaneOffsets> offsets,
std::shared_ptr<const FFmpeg::Frame> frame);
Host1x& host1x;
std::shared_ptr<Tegra::Host1x::Nvdec> nvdec_processor;
template <bool Planar>
void ReadY8__V8U8_N420(const SlotStruct& slot, std::span<const PlaneOffsets> offsets,
std::shared_ptr<const FFmpeg::Frame> frame);
/// Avoid reallocation of the following buffers every frame, as their
/// size does not change during a stream
using AVMallocPtr = std::unique_ptr<u8, decltype(&av_free)>;
AVMallocPtr converted_frame_buffer;
Common::ScratchBuffer<u8> luma_buffer;
Common::ScratchBuffer<u8> chroma_buffer;
void WriteY8__V8U8_N420(const OutputSurfaceConfig& output_surface_config);
GPUVAddr config_struct_address{};
GPUVAddr output_surface_luma_address{};
GPUVAddr output_surface_chroma_address{};
template <VideoPixelFormat Format>
void WriteABGR(const OutputSurfaceConfig& output_surface_config);
SwsContext* scaler_ctx{};
s32 scaler_width{};
s32 scaler_height{};
s32 id;
s32 nvdec_id{-1};
u32 syncpoint;
VicRegisters regs{};
FrameQueue& frame_queue;
const bool has_sse41{false};
Common::ScratchBuffer<Pixel> output_surface;
Common::ScratchBuffer<Pixel> slot_surface;
Common::ScratchBuffer<u8> luma_scratch;
Common::ScratchBuffer<u8> chroma_scratch;
Common::ScratchBuffer<u8> swizzle_scratch;
};
} // namespace Host1x
} // namespace Tegra
} // namespace Tegra::Host1x

View File

@@ -37,6 +37,7 @@ layout(set=0,binding=0) uniform sampler2D InputTexture;
#define A_GPU 1
#define A_GLSL 1
#define FSR_RCAS_PASSTHROUGH_ALPHA 1
#ifndef YUZU_USE_FP16
#include "ffx_a.h"
@@ -71,9 +72,7 @@ layout(set=0,binding=0) uniform sampler2D InputTexture;
#include "ffx_fsr1.h"
#if USE_RCAS
layout(location = 0) in vec2 frag_texcoord;
#endif
layout (location = 0) in vec2 frag_texcoord;
layout (location = 0) out vec4 frag_color;
void CurrFilter(AU2 pos) {
@@ -81,22 +80,22 @@ void CurrFilter(AU2 pos) {
#ifndef YUZU_USE_FP16
AF3 c;
FsrEasuF(c, pos, Const0, Const1, Const2, Const3);
frag_color = AF4(c, 1.0);
frag_color = AF4(c, texture(InputTexture, frag_texcoord).a);
#else
AH3 c;
FsrEasuH(c, pos, Const0, Const1, Const2, Const3);
frag_color = AH4(c, 1.0);
frag_color = AH4(c, texture(InputTexture, frag_texcoord).a);
#endif
#endif
#if USE_RCAS
#ifndef YUZU_USE_FP16
AF3 c;
FsrRcasF(c.r, c.g, c.b, pos, Const0);
frag_color = AF4(c, 1.0);
AF4 c;
FsrRcasF(c.r, c.g, c.b, c.a, pos, Const0);
frag_color = c;
#else
AH3 c;
FsrRcasH(c.r, c.g, c.b, pos, Const0);
frag_color = AH4(c, 1.0);
AH4 c;
FsrRcasH(c.r, c.g, c.b, c.a, pos, Const0);
frag_color = c;
#endif
#endif
}

View File

@@ -71,5 +71,5 @@ vec3 FxaaPixelShader(vec4 posPos, sampler2D tex) {
}
void main() {
frag_color = vec4(FxaaPixelShader(posPos, input_texture), 1.0);
frag_color = vec4(FxaaPixelShader(posPos, input_texture), texture(input_texture, posPos.xy).a);
}

View File

@@ -31,6 +31,7 @@ layout (location = 0) uniform uvec4 constants[4];
#define A_GPU 1
#define A_GLSL 1
#define FSR_RCAS_PASSTHROUGH_ALPHA 1
#ifdef YUZU_USE_FP16
#define A_HALF
@@ -67,9 +68,7 @@ layout (location = 0) uniform uvec4 constants[4];
#include "ffx_fsr1.h"
#if USE_RCAS
layout(location = 0) in vec2 frag_texcoord;
#endif
layout (location = 0) in vec2 frag_texcoord;
layout (location = 0) out vec4 frag_color;
void CurrFilter(AU2 pos)
@@ -78,22 +77,22 @@ void CurrFilter(AU2 pos)
#ifndef YUZU_USE_FP16
AF3 c;
FsrEasuF(c, pos, constants[0], constants[1], constants[2], constants[3]);
frag_color = AF4(c, 1.0);
frag_color = AF4(c, texture(InputTexture, frag_texcoord).a);
#else
AH3 c;
FsrEasuH(c, pos, constants[0], constants[1], constants[2], constants[3]);
frag_color = AH4(c, 1.0);
frag_color = AH4(c, texture(InputTexture, frag_texcoord).a);
#endif
#endif
#if USE_RCAS
#ifndef YUZU_USE_FP16
AF3 c;
FsrRcasF(c.r, c.g, c.b, pos, constants[0]);
frag_color = AF4(c, 1.0);
AF4 c;
FsrRcasF(c.r, c.g, c.b, c.a, pos, constants[0]);
frag_color = c;
#else
AH3 c;
FsrRcasH(c.r, c.g, c.b, pos, constants[0]);
frag_color = AH4(c, 1.0);
AH4 c;
FsrRcasH(c.r, c.g, c.b, c.a, pos, constants[0]);
frag_color = c;
#endif
#endif
}

View File

@@ -9,5 +9,5 @@ layout (location = 0) out vec4 color;
layout (binding = 0) uniform sampler2D color_texture;
void main() {
color = vec4(texture(color_texture, frag_tex_coord).rgb, 1.0f);
color = vec4(texture(color_texture, frag_tex_coord));
}

View File

@@ -52,5 +52,5 @@ vec4 textureBicubic( sampler2D textureSampler, vec2 texCoords ) {
}
void main() {
color = vec4(textureBicubic(color_texture, frag_tex_coord).rgb, 1.0f);
color = textureBicubic(color_texture, frag_tex_coord);
}

View File

@@ -46,14 +46,14 @@ vec4 blurDiagonal(sampler2D textureSampler, vec2 coord, vec2 norm) {
}
void main() {
vec3 base = texture(color_texture, vec2(frag_tex_coord)).rgb * weight[0];
vec4 base = texture(color_texture, vec2(frag_tex_coord)) * weight[0];
vec2 tex_offset = 1.0f / textureSize(color_texture, 0);
// TODO(Blinkhawk): This code can be optimized through shader group instructions.
vec3 horizontal = blurHorizontal(color_texture, frag_tex_coord, tex_offset).rgb;
vec3 vertical = blurVertical(color_texture, frag_tex_coord, tex_offset).rgb;
vec3 diagonalA = blurDiagonal(color_texture, frag_tex_coord, tex_offset).rgb;
vec3 diagonalB = blurDiagonal(color_texture, frag_tex_coord, tex_offset * vec2(1.0, -1.0)).rgb;
vec3 combination = mix(mix(horizontal, vertical, 0.5f), mix(diagonalA, diagonalB, 0.5f), 0.5f);
color = vec4(combination + base, 1.0f);
vec4 horizontal = blurHorizontal(color_texture, frag_tex_coord, tex_offset);
vec4 vertical = blurVertical(color_texture, frag_tex_coord, tex_offset);
vec4 diagonalA = blurDiagonal(color_texture, frag_tex_coord, tex_offset);
vec4 diagonalB = blurDiagonal(color_texture, frag_tex_coord, tex_offset * vec2(1.0, -1.0));
vec4 combination = mix(mix(horizontal, vertical, 0.5f), mix(diagonalA, diagonalB, 0.5f), 0.5f);
color = combination + base;
}

View File

@@ -6,5 +6,6 @@
#define YUZU_USE_FP16
#define USE_EASU 1
#define VERSION 1
#include "fidelityfx_fsr.frag"

View File

@@ -5,5 +5,6 @@
#extension GL_GOOGLE_include_directive : enable
#define USE_EASU 1
#define VERSION 1
#include "fidelityfx_fsr.frag"

View File

@@ -6,5 +6,6 @@
#define YUZU_USE_FP16
#define USE_RCAS 1
#define VERSION 1
#include "fidelityfx_fsr.frag"

View File

@@ -5,5 +5,6 @@
#extension GL_GOOGLE_include_directive : enable
#define USE_RCAS 1
#define VERSION 1
#include "fidelityfx_fsr.frag"

View File

@@ -5,7 +5,7 @@
#extension GL_GOOGLE_include_directive : enable
#define VERSION 1
#define VERSION 2
#define YUZU_USE_FP16
#include "opengl_present_scaleforce.frag"

View File

@@ -5,6 +5,6 @@
#extension GL_GOOGLE_include_directive : enable
#define VERSION 1
#define VERSION 2
#include "opengl_present_scaleforce.frag"

View File

@@ -42,6 +42,8 @@ public:
u64 page_bits_ = 12);
~MemoryManager();
static constexpr bool HAS_FLUSH_INVALIDATION = true;
size_t GetID() const {
return unique_identifier;
}

37 src/video_core/present.h Normal file
View File

@@ -0,0 +1,37 @@
// SPDX-FileCopyrightText: Copyright 2024 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once
#include "common/settings.h"
static inline Settings::ScalingFilter GetScalingFilter() {
return Settings::values.scaling_filter.GetValue();
}
static inline Settings::AntiAliasing GetAntiAliasing() {
return Settings::values.anti_aliasing.GetValue();
}
static inline Settings::ScalingFilter GetScalingFilterForAppletCapture() {
return Settings::ScalingFilter::Bilinear;
}
static inline Settings::AntiAliasing GetAntiAliasingForAppletCapture() {
return Settings::AntiAliasing::None;
}
struct PresentFilters {
Settings::ScalingFilter (*get_scaling_filter)();
Settings::AntiAliasing (*get_anti_aliasing)();
};
constexpr PresentFilters PresentFiltersForDisplay{
.get_scaling_filter = &GetScalingFilter,
.get_anti_aliasing = &GetAntiAliasing,
};
constexpr PresentFilters PresentFiltersForAppletCapture{
.get_scaling_filter = &GetScalingFilterForAppletCapture,
.get_anti_aliasing = &GetAntiAliasingForAppletCapture,
};
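The two constexpr tables above are consumed through their function pointers, so a presentation path and the applet-capture path can each resolve their scaling filter and anti-aliasing mode at call time. A minimal sketch of such a caller, assuming nothing beyond what present.h declares (the ChooseScalingFilter helper and its is_applet_capture flag are illustrative, not part of this change):
#include "video_core/present.h"
// Sketch: pick the filter set for the current presentation path and read the
// scaling filter through it.
Settings::ScalingFilter ChooseScalingFilter(bool is_applet_capture) {
    const PresentFilters& filters =
        is_applet_capture ? PresentFiltersForAppletCapture : PresentFiltersForDisplay;
    return filters.get_scaling_filter();
}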

View File

@@ -40,6 +40,9 @@ public:
/// Finalize rendering the guest frame and draw into the presentation texture
virtual void Composite(std::span<const Tegra::FramebufferConfig> layers) = 0;
/// Get the tiled applet layer capture buffer
virtual std::vector<u8> GetAppletCaptureBuffer() = 0;
[[nodiscard]] virtual RasterizerInterface* ReadRasterizer() = 0;
[[nodiscard]] virtual std::string GetDeviceVendor() const = 0;

Some files were not shown because too many files have changed in this diff.