Compare commits
82 commits: mainline-1...mainline-1

| SHA1 |
|---|
| 9be9600bdc |
| 12514ccd35 |
| f601f25bcc |
| 27e10e0442 |
| 6738fb5fef |
| 0a67416971 |
| 369be67039 |
| aa599ac709 |
| a2edb27158 |
| f470bcb826 |
| 7a35178ee2 |
| 45c162444d |
| 6c4985edc9 |
| 5d369112d9 |
| 63bda67a34 |
| d4b95bfc25 |
| 5e457bf258 |
| 4be61013a1 |
| 5ad889f6fd |
| 7826f0afd9 |
| 8cdbfe69b1 |
| 0ff4a5fa39 |
| fec32fed18 |
| a081dea8ab |
| 0d3db58657 |
| f2e7b29c14 |
| e42bcf2314 |
| 223a535f3f |
| c3218c110f |
| bebbdc2067 |
| 60926ac16b |
| 44d87ff641 |
| b56e7f870a |
| e2d7dda166 |
| 2a4044a858 |
| 6b0d017675 |
| 56bca83bde |
| bbecd13697 |
| 725ba6cf63 |
| 1bdb59fc6e |
| b77a1ed67a |
| afa8096df5 |
| 3477b92289 |
| 2ac7472d3f |
| 0f54b541f4 |
| 5818959e54 |
| 913b7a6872 |
| a9943222f2 |
| 5c1e1a148e |
| 5d31bab69a |
| 4882c058fd |
| 093e5440e2 |
| d4fc560c05 |
| 0eb0c24269 |
| aca40de224 |
| a1845d1dd3 |
| 697206092e |
| c9d886c84e |
| ca6f08e3b1 |
| ce64a9fab9 |
| b901cd584e |
| 1689784c19 |
| 13a8fde3ad |
| 56c7912159 |
| eb6f55d880 |
| 79a23ca5f0 |
| 83050c9495 |
| f7691ebe57 |
| 7ecf64257a |
| 9cdc576f60 |
| 1fa21fa192 |
| 32c0212b24 |
| 2bcae41a73 |
| 02ab844934 |
| d14fbfb9b5 |
| 345f852bdb |
| 8155b12d3d |
| f8ba72d491 |
| b54fb8fc4c |
| a6d2f52fc3 |
| 2b9d4088ec |
| 2e39c20da5 |

@@ -14,7 +14,7 @@ steps:
    cacheHitVar: CACHE_RESTORED
- script: chmod a+x ./.ci/scripts/$(ScriptFolder)/exec.sh && ./.ci/scripts/$(ScriptFolder)/exec.sh
  displayName: 'Build'
- script: chmod a+x ./.ci/scripts/$(ScriptFolder)/upload.sh && ./.ci/scripts/$(ScriptFolder)/upload.sh
- script: chmod a+x ./.ci/scripts/$(ScriptFolder)/upload.sh && RELEASE_NAME=$(BuildName) ./.ci/scripts/$(ScriptFolder)/upload.sh
  displayName: 'Package Artifacts'
- publish: artifacts
  artifact: 'yuzu-$(BuildName)-$(BuildSuffix)'

@@ -3,7 +3,7 @@ jobs:
  displayName: 'standard'
  pool:
    vmImage: ubuntu-latest
  strategy:
  strategy:
    maxParallel: 10
    matrix:
      windows:

@@ -3,19 +3,21 @@ jobs:
  displayName: 'testing'
  pool:
    vmImage: ubuntu-latest
  strategy:
    maxParallel: 10
  strategy:
    maxParallel: 5
    matrix:
      windows:
        BuildSuffix: 'windows-testing'
        ScriptFolder: 'windows'
  steps:
  - script: pip install requests urllib3
    displayName: 'Prepare Environment'
  - task: PythonScript@0
    condition: eq(variables['Build.Reason'], 'PullRequest')
    displayName: 'Determine Testing Status'
    inputs:
      scriptSource: 'filePath'
      scriptPath: '../scripts/merge/check-label-presence.py'
      scriptPath: '.ci/scripts/merge/check-label-presence.py'
      arguments: '$(System.PullRequest.PullRequestNumber) create-testing-build'
  - ${{ if eq(variables.enabletesting, 'true') }}:
    - template: ./sync-source.yml

@@ -27,4 +29,4 @@ jobs:
      matchLabel: 'testing-merge'
  - template: ./build-single.yml
    parameters:
      artifactSource: 'false'
      artifactSource: 'false'

@@ -1,29 +0,0 @@
steps:
- task: DownloadPipelineArtifact@2
  displayName: 'Download Windows Release'
  inputs:
    artifactName: 'yuzu-$(BuildName)-windows-mingw'
    buildType: 'current'
    targetPath: '$(Build.ArtifactStagingDirectory)'
- task: DownloadPipelineArtifact@2
  displayName: 'Download Linux Release'
  inputs:
    artifactName: 'yuzu-$(BuildName)-linux'
    buildType: 'current'
    targetPath: '$(Build.ArtifactStagingDirectory)'
- task: DownloadPipelineArtifact@2
  displayName: 'Download Release Point'
  inputs:
    artifactName: 'yuzu-$(BuildName)-release-point'
    buildType: 'current'
    targetPath: '$(Build.ArtifactStagingDirectory)'
- script: echo '##vso[task.setvariable variable=tagcommit]' && cat $(Build.ArtifactStagingDirectory)/tag-commit.sha
  displayName: 'Calculate Release Point'
- task: GitHubRelease@0
  inputs:
    gitHubConnection: $(GitHubReleaseConnectionName)
    repositoryName: '$(GitHubReleaseRepoName)'
    action: 'create'
    target: $(variables.tagcommit)
    title: 'yuzu $(BuildName) #$(Build.BuildId)'
    assets: '$(Build.ArtifactStagingDirectory)/*'

@@ -2,6 +2,7 @@ yuzu emulator
=============
[Travis CI](https://travis-ci.org/yuzu-emu/yuzu)
[AppVeyor CI](https://ci.appveyor.com/project/bunnei/yuzu)
[Azure Pipelines](https://dev.azure.com/yuzu-emu/yuzu/)

yuzu is an experimental open-source emulator for the Nintendo Switch from the creators of [Citra](https://citra-emu.org/).

@@ -50,11 +50,14 @@ static void CodeHook(uc_engine* uc, uint64_t address, uint32_t size, void* user_

static bool UnmappedMemoryHook(uc_engine* uc, uc_mem_type type, u64 addr, int size, u64 value,
                               void* user_data) {
    auto* const system = static_cast<System*>(user_data);

    ARM_Interface::ThreadContext ctx{};
    Core::CurrentArmInterface().SaveContext(ctx);
    system->CurrentArmInterface().SaveContext(ctx);
    ASSERT_MSG(false, "Attempted to read from unmapped memory: 0x{:X}, pc=0x{:X}, lr=0x{:X}", addr,
               ctx.pc, ctx.cpu_registers[30]);
    return {};

    return false;
}

ARM_Unicorn::ARM_Unicorn(System& system) : system{system} {

@@ -65,7 +68,7 @@ ARM_Unicorn::ARM_Unicorn(System& system) : system{system} {

    uc_hook hook{};
    CHECKED(uc_hook_add(uc, &hook, UC_HOOK_INTR, (void*)InterruptHook, this, 0, -1));
    CHECKED(uc_hook_add(uc, &hook, UC_HOOK_MEM_INVALID, (void*)UnmappedMemoryHook, this, 0, -1));
    CHECKED(uc_hook_add(uc, &hook, UC_HOOK_MEM_INVALID, (void*)UnmappedMemoryHook, &system, 0, -1));
    if (GDBStub::IsServerEnabled()) {
        CHECKED(uc_hook_add(uc, &hook, UC_HOOK_CODE, (void*)CodeHook, this, 0, -1));
        last_bkpt_hit = false;

@@ -327,10 +327,6 @@ private:
    static System s_instance;
};

inline ARM_Interface& CurrentArmInterface() {
    return System::GetInstance().CurrentArmInterface();
}

inline Kernel::Process* CurrentProcess() {
    return System::GetInstance().CurrentProcess();
}

@@ -94,6 +94,10 @@ u64 ProgramMetadata::GetFilesystemPermissions() const {
    return aci_file_access.permissions;
}

u32 ProgramMetadata::GetSystemResourceSize() const {
    return npdm_header.system_resource_size;
}

const ProgramMetadata::KernelCapabilityDescriptors& ProgramMetadata::GetKernelCapabilities() const {
    return aci_kernel_capabilities;
}

@@ -58,6 +58,7 @@ public:
    u32 GetMainThreadStackSize() const;
    u64 GetTitleID() const;
    u64 GetFilesystemPermissions() const;
    u32 GetSystemResourceSize() const;
    const KernelCapabilityDescriptors& GetKernelCapabilities() const;

    void Print() const;

@@ -76,7 +77,8 @@ private:
    u8 reserved_3;
    u8 main_thread_priority;
    u8 main_thread_cpu;
    std::array<u8, 8> reserved_4;
    std::array<u8, 4> reserved_4;
    u32_le system_resource_size;
    u32_le process_category;
    u32_le main_stack_size;
    std::array<u8, 0x10> application_name;

@@ -129,20 +129,17 @@ u64 Process::GetTotalPhysicalMemoryAvailable() const {
    return vm_manager.GetTotalPhysicalMemoryAvailable();
}

u64 Process::GetTotalPhysicalMemoryAvailableWithoutMmHeap() const {
    // TODO: Subtract the personal heap size from this when the
    // personal heap is implemented.
    return GetTotalPhysicalMemoryAvailable();
u64 Process::GetTotalPhysicalMemoryAvailableWithoutSystemResource() const {
    return GetTotalPhysicalMemoryAvailable() - GetSystemResourceSize();
}

u64 Process::GetTotalPhysicalMemoryUsed() const {
    return vm_manager.GetCurrentHeapSize() + main_thread_stack_size + code_memory_size;
    return vm_manager.GetCurrentHeapSize() + main_thread_stack_size + code_memory_size +
           GetSystemResourceUsage();
}

u64 Process::GetTotalPhysicalMemoryUsedWithoutMmHeap() const {
    // TODO: Subtract the personal heap size from this when the
    // personal heap is implemented.
    return GetTotalPhysicalMemoryUsed();
u64 Process::GetTotalPhysicalMemoryUsedWithoutSystemResource() const {
    return GetTotalPhysicalMemoryUsed() - GetSystemResourceUsage();
}

void Process::RegisterThread(const Thread* thread) {

@@ -172,6 +169,7 @@ ResultCode Process::LoadFromMetadata(const FileSys::ProgramMetadata& metadata) {
    program_id = metadata.GetTitleID();
    ideal_core = metadata.GetMainThreadCore();
    is_64bit_process = metadata.Is64BitProgram();
    system_resource_size = metadata.GetSystemResourceSize();

    vm_manager.Reset(metadata.GetAddressSpaceType());

@@ -186,19 +184,11 @@ ResultCode Process::LoadFromMetadata(const FileSys::ProgramMetadata& metadata) {
}

void Process::Run(s32 main_thread_priority, u64 stack_size) {
    // The kernel always ensures that the given stack size is page aligned.
    main_thread_stack_size = Common::AlignUp(stack_size, Memory::PAGE_SIZE);

    // Allocate and map the main thread stack
    // TODO(bunnei): This is heap area that should be allocated by the kernel and not mapped as part
    // of the user address space.
    const VAddr mapping_address = vm_manager.GetTLSIORegionEndAddress() - main_thread_stack_size;
    vm_manager
        .MapMemoryBlock(mapping_address, std::make_shared<std::vector<u8>>(main_thread_stack_size),
                        0, main_thread_stack_size, MemoryState::Stack)
        .Unwrap();
    AllocateMainThreadStack(stack_size);
    tls_region_address = CreateTLSRegion();

    vm_manager.LogLayout();

    ChangeStatus(ProcessStatus::Running);

    SetupMainThread(*this, kernel, main_thread_priority);

@@ -228,6 +218,9 @@ void Process::PrepareForTermination() {
    stop_threads(system.Scheduler(2).GetThreadList());
    stop_threads(system.Scheduler(3).GetThreadList());

    FreeTLSRegion(tls_region_address);
    tls_region_address = 0;

    ChangeStatus(ProcessStatus::Exited);
}

@@ -327,4 +320,16 @@ void Process::ChangeStatus(ProcessStatus new_status) {
    WakeupAllWaitingThreads();
}

void Process::AllocateMainThreadStack(u64 stack_size) {
    // The kernel always ensures that the given stack size is page aligned.
    main_thread_stack_size = Common::AlignUp(stack_size, Memory::PAGE_SIZE);

    // Allocate and map the main thread stack
    const VAddr mapping_address = vm_manager.GetTLSIORegionEndAddress() - main_thread_stack_size;
    vm_manager
        .MapMemoryBlock(mapping_address, std::make_shared<std::vector<u8>>(main_thread_stack_size),
                        0, main_thread_stack_size, MemoryState::Stack)
        .Unwrap();
}

} // namespace Kernel

@@ -135,6 +135,11 @@ public:
        return mutex;
    }

    /// Gets the address to the process' dedicated TLS region.
    VAddr GetTLSRegionAddress() const {
        return tls_region_address;
    }

    /// Gets the current status of the process
    ProcessStatus GetStatus() const {
        return status;

@@ -168,8 +173,24 @@ public:
        return capabilities.GetPriorityMask();
    }

    u32 IsVirtualMemoryEnabled() const {
        return is_virtual_address_memory_enabled;
    /// Gets the amount of secure memory to allocate for memory management.
    u32 GetSystemResourceSize() const {
        return system_resource_size;
    }

    /// Gets the amount of secure memory currently in use for memory management.
    u32 GetSystemResourceUsage() const {
        // On hardware, this returns the amount of system resource memory that has
        // been used by the kernel. This is problematic for Yuzu to emulate, because
        // system resource memory is used for page tables -- and yuzu doesn't really
        // have a way to calculate how much memory is required for page tables for
        // the current process at any given time.
        // TODO: Is this even worth implementing? Games may retrieve this value via
        // an SDK function that gets used + available system resource size for debug
        // or diagnostic purposes. However, it seems unlikely that a game would make
        // decisions based on how much system memory is dedicated to its page tables.
        // Is returning a value other than zero wise?
        return 0;
    }

    /// Whether this process is an AArch64 or AArch32 process.

@@ -196,15 +217,15 @@ public:
    u64 GetTotalPhysicalMemoryAvailable() const;

    /// Retrieves the total physical memory available to this process in bytes,
    /// without the size of the personal heap added to it.
    u64 GetTotalPhysicalMemoryAvailableWithoutMmHeap() const;
    /// without the size of the personal system resource heap added to it.
    u64 GetTotalPhysicalMemoryAvailableWithoutSystemResource() const;

    /// Retrieves the total physical memory used by this process in bytes.
    u64 GetTotalPhysicalMemoryUsed() const;

    /// Retrieves the total physical memory used by this process in bytes,
    /// without the size of the personal heap added to it.
    u64 GetTotalPhysicalMemoryUsedWithoutMmHeap() const;
    /// without the size of the personal system resource heap added to it.
    u64 GetTotalPhysicalMemoryUsedWithoutSystemResource() const;

    /// Gets the list of all threads created with this process as their owner.
    const std::list<const Thread*>& GetThreadList() const {

@@ -280,6 +301,9 @@ private:
    /// a process signal.
    void ChangeStatus(ProcessStatus new_status);

    /// Allocates the main thread stack for the process, given the stack size in bytes.
    void AllocateMainThreadStack(u64 stack_size);

    /// Memory manager for this process.
    Kernel::VMManager vm_manager;

@@ -298,12 +322,16 @@ private:
    /// Title ID corresponding to the process
    u64 program_id = 0;

    /// Specifies additional memory to be reserved for the process's memory management by the
    /// system. When this is non-zero, secure memory is allocated and used for page table allocation
    /// instead of using the normal global page tables/memory block management.
    u32 system_resource_size = 0;

    /// Resource limit descriptor for this process
    SharedPtr<ResourceLimit> resource_limit;

    /// The ideal CPU core for this process, threads are scheduled on this core by default.
    u8 ideal_core = 0;
    u32 is_virtual_address_memory_enabled = 0;

    /// The Thread Local Storage area is allocated as processes create threads,
    /// each TLS area is 0x200 bytes, so one page (0x1000) is split up in 8 parts, and each part

@@ -338,6 +366,9 @@ private:
    /// variable related facilities.
    Mutex mutex;

    /// Address indicating the location of the process' dedicated TLS region.
    VAddr tls_region_address = 0;

    /// Random values for svcGetInfo RandomEntropy
    std::array<u64, RANDOM_ENTROPY_SIZE> random_entropy{};

@@ -736,16 +736,16 @@ static ResultCode GetInfo(Core::System& system, u64* result, u64 info_id, u64 ha
        StackRegionBaseAddr = 14,
        StackRegionSize = 15,
        // 3.0.0+
        IsVirtualAddressMemoryEnabled = 16,
        PersonalMmHeapUsage = 17,
        SystemResourceSize = 16,
        SystemResourceUsage = 17,
        TitleId = 18,
        // 4.0.0+
        PrivilegedProcessId = 19,
        // 5.0.0+
        UserExceptionContextAddr = 20,
        // 6.0.0+
        TotalPhysicalMemoryAvailableWithoutMmHeap = 21,
        TotalPhysicalMemoryUsedWithoutMmHeap = 22,
        TotalPhysicalMemoryAvailableWithoutSystemResource = 21,
        TotalPhysicalMemoryUsedWithoutSystemResource = 22,
    };

    const auto info_id_type = static_cast<GetInfoType>(info_id);

@@ -763,12 +763,12 @@ static ResultCode GetInfo(Core::System& system, u64* result, u64 info_id, u64 ha
    case GetInfoType::StackRegionSize:
    case GetInfoType::TotalPhysicalMemoryAvailable:
    case GetInfoType::TotalPhysicalMemoryUsed:
    case GetInfoType::IsVirtualAddressMemoryEnabled:
    case GetInfoType::PersonalMmHeapUsage:
    case GetInfoType::SystemResourceSize:
    case GetInfoType::SystemResourceUsage:
    case GetInfoType::TitleId:
    case GetInfoType::UserExceptionContextAddr:
    case GetInfoType::TotalPhysicalMemoryAvailableWithoutMmHeap:
    case GetInfoType::TotalPhysicalMemoryUsedWithoutMmHeap: {
    case GetInfoType::TotalPhysicalMemoryAvailableWithoutSystemResource:
    case GetInfoType::TotalPhysicalMemoryUsedWithoutSystemResource: {
        if (info_sub_id != 0) {
            return ERR_INVALID_ENUM_VALUE;
        }

@@ -829,8 +829,13 @@ static ResultCode GetInfo(Core::System& system, u64* result, u64 info_id, u64 ha
        *result = process->GetTotalPhysicalMemoryUsed();
        return RESULT_SUCCESS;

    case GetInfoType::IsVirtualAddressMemoryEnabled:
        *result = process->IsVirtualMemoryEnabled();
    case GetInfoType::SystemResourceSize:
        *result = process->GetSystemResourceSize();
        return RESULT_SUCCESS;

    case GetInfoType::SystemResourceUsage:
        LOG_WARNING(Kernel_SVC, "(STUBBED) Attempted to query system resource usage");
        *result = process->GetSystemResourceUsage();
        return RESULT_SUCCESS;

    case GetInfoType::TitleId:

@@ -838,17 +843,15 @@ static ResultCode GetInfo(Core::System& system, u64* result, u64 info_id, u64 ha
        return RESULT_SUCCESS;

    case GetInfoType::UserExceptionContextAddr:
        LOG_WARNING(Kernel_SVC,
                    "(STUBBED) Attempted to query user exception context address, returned 0");
        *result = 0;
        *result = process->GetTLSRegionAddress();
        return RESULT_SUCCESS;

    case GetInfoType::TotalPhysicalMemoryAvailableWithoutMmHeap:
        *result = process->GetTotalPhysicalMemoryAvailable();
    case GetInfoType::TotalPhysicalMemoryAvailableWithoutSystemResource:
        *result = process->GetTotalPhysicalMemoryAvailableWithoutSystemResource();
        return RESULT_SUCCESS;

    case GetInfoType::TotalPhysicalMemoryUsedWithoutMmHeap:
        *result = process->GetTotalPhysicalMemoryUsedWithoutMmHeap();
    case GetInfoType::TotalPhysicalMemoryUsedWithoutSystemResource:
        *result = process->GetTotalPhysicalMemoryUsedWithoutSystemResource();
        return RESULT_SUCCESS;

    default:

@@ -953,6 +956,86 @@ static ResultCode GetInfo(Core::System& system, u64* result, u64 info_id, u64 ha
    }
}

/// Maps memory at a desired address
static ResultCode MapPhysicalMemory(Core::System& system, VAddr addr, u64 size) {
    LOG_DEBUG(Kernel_SVC, "called, addr=0x{:016X}, size=0x{:X}", addr, size);

    if (!Common::Is4KBAligned(addr)) {
        LOG_ERROR(Kernel_SVC, "Address is not aligned to 4KB, 0x{:016X}", addr);
        return ERR_INVALID_ADDRESS;
    }

    if (!Common::Is4KBAligned(size)) {
        LOG_ERROR(Kernel_SVC, "Size is not aligned to 4KB, 0x{:X}", size);
        return ERR_INVALID_SIZE;
    }

    if (size == 0) {
        LOG_ERROR(Kernel_SVC, "Size is zero");
        return ERR_INVALID_SIZE;
    }

    if (!(addr < addr + size)) {
        LOG_ERROR(Kernel_SVC, "Size causes 64-bit overflow of address");
        return ERR_INVALID_MEMORY_RANGE;
    }

    Process* const current_process = system.Kernel().CurrentProcess();
    auto& vm_manager = current_process->VMManager();

    if (current_process->GetSystemResourceSize() == 0) {
        LOG_ERROR(Kernel_SVC, "System Resource Size is zero");
        return ERR_INVALID_STATE;
    }

    if (!vm_manager.IsWithinMapRegion(addr, size)) {
        LOG_ERROR(Kernel_SVC, "Range not within map region");
        return ERR_INVALID_MEMORY_RANGE;
    }

    return vm_manager.MapPhysicalMemory(addr, size);
}

/// Unmaps memory previously mapped via MapPhysicalMemory
static ResultCode UnmapPhysicalMemory(Core::System& system, VAddr addr, u64 size) {
    LOG_DEBUG(Kernel_SVC, "called, addr=0x{:016X}, size=0x{:X}", addr, size);

    if (!Common::Is4KBAligned(addr)) {
        LOG_ERROR(Kernel_SVC, "Address is not aligned to 4KB, 0x{:016X}", addr);
        return ERR_INVALID_ADDRESS;
    }

    if (!Common::Is4KBAligned(size)) {
        LOG_ERROR(Kernel_SVC, "Size is not aligned to 4KB, 0x{:X}", size);
        return ERR_INVALID_SIZE;
    }

    if (size == 0) {
        LOG_ERROR(Kernel_SVC, "Size is zero");
        return ERR_INVALID_SIZE;
    }

    if (!(addr < addr + size)) {
        LOG_ERROR(Kernel_SVC, "Size causes 64-bit overflow of address");
        return ERR_INVALID_MEMORY_RANGE;
    }

    Process* const current_process = system.Kernel().CurrentProcess();
    auto& vm_manager = current_process->VMManager();

    if (current_process->GetSystemResourceSize() == 0) {
        LOG_ERROR(Kernel_SVC, "System Resource Size is zero");
        return ERR_INVALID_STATE;
    }

    if (!vm_manager.IsWithinMapRegion(addr, size)) {
        LOG_ERROR(Kernel_SVC, "Range not within map region");
        return ERR_INVALID_MEMORY_RANGE;
    }

    return vm_manager.UnmapPhysicalMemory(addr, size);
}

/// Sets the thread activity
static ResultCode SetThreadActivity(Core::System& system, Handle handle, u32 activity) {
    LOG_DEBUG(Kernel_SVC, "called, handle=0x{:08X}, activity=0x{:08X}", handle, activity);

@@ -1654,8 +1737,8 @@ static ResultCode SignalProcessWideKey(Core::System& system, VAddr condition_var
// Wait for an address (via Address Arbiter)
static ResultCode WaitForAddress(Core::System& system, VAddr address, u32 type, s32 value,
                                 s64 timeout) {
    LOG_WARNING(Kernel_SVC, "called, address=0x{:X}, type=0x{:X}, value=0x{:X}, timeout={}",
                address, type, value, timeout);
    LOG_TRACE(Kernel_SVC, "called, address=0x{:X}, type=0x{:X}, value=0x{:X}, timeout={}", address,
              type, value, timeout);

    // If the passed address is a kernel virtual address, return invalid memory state.
    if (Memory::IsKernelVirtualAddress(address)) {

@@ -1677,8 +1760,8 @@ static ResultCode WaitForAddress(Core::System& system, VAddr address, u32 type,
// Signals to an address (via Address Arbiter)
static ResultCode SignalToAddress(Core::System& system, VAddr address, u32 type, s32 value,
                                  s32 num_to_wake) {
    LOG_WARNING(Kernel_SVC, "called, address=0x{:X}, type=0x{:X}, value=0x{:X}, num_to_wake=0x{:X}",
                address, type, value, num_to_wake);
    LOG_TRACE(Kernel_SVC, "called, address=0x{:X}, type=0x{:X}, value=0x{:X}, num_to_wake=0x{:X}",
              address, type, value, num_to_wake);

    // If the passed address is a kernel virtual address, return invalid memory state.
    if (Memory::IsKernelVirtualAddress(address)) {

@@ -2310,8 +2393,8 @@ static const FunctionDef SVC_Table[] = {
    {0x29, SvcWrap<GetInfo>, "GetInfo"},
    {0x2A, nullptr, "FlushEntireDataCache"},
    {0x2B, nullptr, "FlushDataCache"},
    {0x2C, nullptr, "MapPhysicalMemory"},
    {0x2D, nullptr, "UnmapPhysicalMemory"},
    {0x2C, SvcWrap<MapPhysicalMemory>, "MapPhysicalMemory"},
    {0x2D, SvcWrap<UnmapPhysicalMemory>, "UnmapPhysicalMemory"},
    {0x2E, nullptr, "GetFutureThreadInfo"},
    {0x2F, nullptr, "GetLastThreadInfo"},
    {0x30, SvcWrap<GetResourceLimitLimitValue>, "GetResourceLimitLimitValue"},

@@ -32,6 +32,11 @@ void SvcWrap(Core::System& system) {
    FuncReturn(system, func(system, Param(system, 0)).raw);
}

template <ResultCode func(Core::System&, u64, u64)>
void SvcWrap(Core::System& system) {
    FuncReturn(system, func(system, Param(system, 0), Param(system, 1)).raw);
}

template <ResultCode func(Core::System&, u32)>
void SvcWrap(Core::System& system) {
    FuncReturn(system, func(system, static_cast<u32>(Param(system, 0))).raw);

@@ -11,6 +11,8 @@

#include "core/core.h"
#include "core/file_sys/program_metadata.h"
#include "core/hle/kernel/errors.h"
#include "core/hle/kernel/process.h"
#include "core/hle/kernel/resource_limit.h"
#include "core/hle/kernel/vm_manager.h"
#include "core/memory.h"
#include "core/memory_setup.h"

@@ -48,10 +50,14 @@ bool VirtualMemoryArea::CanBeMergedWith(const VirtualMemoryArea& next) const {
        type != next.type) {
        return false;
    }
    if (type == VMAType::AllocatedMemoryBlock &&
        (backing_block != next.backing_block || offset + size != next.offset)) {
    if ((attribute & MemoryAttribute::DeviceMapped) == MemoryAttribute::DeviceMapped) {
        // TODO: Can device mapped memory be merged sanely?
        // Not merging it may cause inaccuracies versus hardware when memory layout is queried.
        return false;
    }
    if (type == VMAType::AllocatedMemoryBlock) {
        return true;
    }
    if (type == VMAType::BackingMemory && backing_memory + size != next.backing_memory) {
        return false;
    }

@@ -99,7 +105,7 @@ bool VMManager::IsValidHandle(VMAHandle handle) const {
ResultVal<VMManager::VMAHandle> VMManager::MapMemoryBlock(VAddr target,
                                                          std::shared_ptr<std::vector<u8>> block,
                                                          std::size_t offset, u64 size,
                                                          MemoryState state) {
                                                          MemoryState state, VMAPermission perm) {
    ASSERT(block != nullptr);
    ASSERT(offset + size <= block->size());

@@ -109,7 +115,7 @@ ResultVal<VMManager::VMAHandle> VMManager::MapMemoryBlock(VAddr target,
    ASSERT(final_vma.size == size);

    final_vma.type = VMAType::AllocatedMemoryBlock;
    final_vma.permissions = VMAPermission::ReadWrite;
    final_vma.permissions = perm;
    final_vma.state = state;
    final_vma.backing_block = std::move(block);
    final_vma.offset = offset;

@@ -288,6 +294,166 @@ ResultVal<VAddr> VMManager::SetHeapSize(u64 size) {
    return MakeResult<VAddr>(heap_region_base);
}

ResultCode VMManager::MapPhysicalMemory(VAddr target, u64 size) {
    const auto end_addr = target + size;
    const auto last_addr = end_addr - 1;
    VAddr cur_addr = target;

    ResultCode result = RESULT_SUCCESS;

    // Check how much memory we've already mapped.
    const auto mapped_size_result = SizeOfAllocatedVMAsInRange(target, size);
    if (mapped_size_result.Failed()) {
        return mapped_size_result.Code();
    }

    // If we've already mapped the desired amount, return early.
    const std::size_t mapped_size = *mapped_size_result;
    if (mapped_size == size) {
        return RESULT_SUCCESS;
    }

    // Check that we can map the memory we want.
    const auto res_limit = system.CurrentProcess()->GetResourceLimit();
    const u64 physmem_remaining = res_limit->GetMaxResourceValue(ResourceType::PhysicalMemory) -
                                  res_limit->GetCurrentResourceValue(ResourceType::PhysicalMemory);
    if (physmem_remaining < (size - mapped_size)) {
        return ERR_RESOURCE_LIMIT_EXCEEDED;
    }

    // Keep track of the memory regions we unmap.
    std::vector<std::pair<u64, u64>> mapped_regions;

    // Iterate, trying to map memory.
    {
        cur_addr = target;

        auto iter = FindVMA(target);
        ASSERT_MSG(iter != vma_map.end(), "MapPhysicalMemory iter != end");

        while (true) {
            const auto& vma = iter->second;
            const auto vma_start = vma.base;
            const auto vma_end = vma_start + vma.size;
            const auto vma_last = vma_end - 1;

            // Map the memory block
            const auto map_size = std::min(end_addr - cur_addr, vma_end - cur_addr);
            if (vma.state == MemoryState::Unmapped) {
                const auto map_res =
                    MapMemoryBlock(cur_addr, std::make_shared<std::vector<u8>>(map_size, 0), 0,
                                   map_size, MemoryState::Heap, VMAPermission::ReadWrite);
                result = map_res.Code();
                if (result.IsError()) {
                    break;
                }

                mapped_regions.emplace_back(cur_addr, map_size);
            }

            // Break once we hit the end of the range.
            if (last_addr <= vma_last) {
                break;
            }

            // Advance to the next block.
            cur_addr = vma_end;
            iter = FindVMA(cur_addr);
            ASSERT_MSG(iter != vma_map.end(), "MapPhysicalMemory iter != end");
        }
    }

    // If we failed, unmap memory.
    if (result.IsError()) {
        for (const auto [unmap_address, unmap_size] : mapped_regions) {
            ASSERT_MSG(UnmapRange(unmap_address, unmap_size).IsSuccess(),
                       "MapPhysicalMemory un-map on error");
        }

        return result;
    }

    // Update amount of mapped physical memory.
    physical_memory_mapped += size - mapped_size;

    return RESULT_SUCCESS;
}

ResultCode VMManager::UnmapPhysicalMemory(VAddr target, u64 size) {
    const auto end_addr = target + size;
    const auto last_addr = end_addr - 1;
    VAddr cur_addr = target;

    ResultCode result = RESULT_SUCCESS;

    // Check how much memory is currently mapped.
    const auto mapped_size_result = SizeOfUnmappablePhysicalMemoryInRange(target, size);
    if (mapped_size_result.Failed()) {
        return mapped_size_result.Code();
    }

    // If we've already unmapped all the memory, return early.
    const std::size_t mapped_size = *mapped_size_result;
    if (mapped_size == 0) {
        return RESULT_SUCCESS;
    }

    // Keep track of the memory regions we unmap.
    std::vector<std::pair<u64, u64>> unmapped_regions;

    // Try to unmap regions.
    {
        cur_addr = target;

        auto iter = FindVMA(target);
        ASSERT_MSG(iter != vma_map.end(), "UnmapPhysicalMemory iter != end");

        while (true) {
            const auto& vma = iter->second;
            const auto vma_start = vma.base;
            const auto vma_end = vma_start + vma.size;
            const auto vma_last = vma_end - 1;

            // Unmap the memory block
            const auto unmap_size = std::min(end_addr - cur_addr, vma_end - cur_addr);
            if (vma.state == MemoryState::Heap) {
                result = UnmapRange(cur_addr, unmap_size);
                if (result.IsError()) {
                    break;
                }

                unmapped_regions.emplace_back(cur_addr, unmap_size);
            }

            // Break once we hit the end of the range.
            if (last_addr <= vma_last) {
                break;
            }

            // Advance to the next block.
            cur_addr = vma_end;
            iter = FindVMA(cur_addr);
            ASSERT_MSG(iter != vma_map.end(), "UnmapPhysicalMemory iter != end");
        }
    }

    // If we failed, re-map regions.
    // TODO: Preserve memory contents?
    if (result.IsError()) {
        for (const auto [map_address, map_size] : unmapped_regions) {
            const auto remap_res =
                MapMemoryBlock(map_address, std::make_shared<std::vector<u8>>(map_size, 0), 0,
                               map_size, MemoryState::Heap, VMAPermission::None);
            ASSERT_MSG(remap_res.Succeeded(), "UnmapPhysicalMemory re-map on error");
        }
    }

    // Update mapped amount
    physical_memory_mapped -= mapped_size;

    return RESULT_SUCCESS;
}

ResultCode VMManager::MapCodeMemory(VAddr dst_address, VAddr src_address, u64 size) {
    constexpr auto ignore_attribute = MemoryAttribute::LockedForIPC | MemoryAttribute::DeviceMapped;
    const auto src_check_result = CheckRangeState(

@@ -435,7 +601,7 @@ ResultCode VMManager::MirrorMemory(VAddr dst_addr, VAddr src_addr, u64 size, Mem
    // Protect mirror with permissions from old region
    Reprotect(new_vma, vma->second.permissions);
    // Remove permissions from old region
    Reprotect(vma, VMAPermission::None);
    ReprotectRange(src_addr, size, VMAPermission::None);

    return RESULT_SUCCESS;
}

@@ -568,14 +734,14 @@ VMManager::VMAIter VMManager::SplitVMA(VMAIter vma_handle, u64 offset_in_vma) {
VMManager::VMAIter VMManager::MergeAdjacent(VMAIter iter) {
    const VMAIter next_vma = std::next(iter);
    if (next_vma != vma_map.end() && iter->second.CanBeMergedWith(next_vma->second)) {
        iter->second.size += next_vma->second.size;
        MergeAdjacentVMA(iter->second, next_vma->second);
        vma_map.erase(next_vma);
    }

    if (iter != vma_map.begin()) {
        VMAIter prev_vma = std::prev(iter);
        if (prev_vma->second.CanBeMergedWith(iter->second)) {
            prev_vma->second.size += iter->second.size;
            MergeAdjacentVMA(prev_vma->second, iter->second);
            vma_map.erase(iter);
            iter = prev_vma;
        }

@@ -584,6 +750,38 @@ VMManager::VMAIter VMManager::MergeAdjacent(VMAIter iter) {
    return iter;
}

void VMManager::MergeAdjacentVMA(VirtualMemoryArea& left, const VirtualMemoryArea& right) {
    ASSERT(left.CanBeMergedWith(right));

    // Always merge allocated memory blocks, even when they don't share the same backing block.
    if (left.type == VMAType::AllocatedMemoryBlock &&
        (left.backing_block != right.backing_block || left.offset + left.size != right.offset)) {
        // Check if we can save work.
        if (left.offset == 0 && left.size == left.backing_block->size()) {
            // Fast case: left is an entire backing block.
            left.backing_block->insert(left.backing_block->end(),
                                       right.backing_block->begin() + right.offset,
                                       right.backing_block->begin() + right.offset + right.size);
        } else {
            // Slow case: make a new memory block for left and right.
            auto new_memory = std::make_shared<std::vector<u8>>();
            new_memory->insert(new_memory->end(), left.backing_block->begin() + left.offset,
                               left.backing_block->begin() + left.offset + left.size);
            new_memory->insert(new_memory->end(), right.backing_block->begin() + right.offset,
                               right.backing_block->begin() + right.offset + right.size);
            left.backing_block = new_memory;
            left.offset = 0;
        }

        // Page table update is needed, because backing memory changed.
        left.size += right.size;
        UpdatePageTableForVMA(left);
    } else {
        // Just update the size.
        left.size += right.size;
    }
}

void VMManager::UpdatePageTableForVMA(const VirtualMemoryArea& vma) {
    switch (vma.type) {
    case VMAType::Free:

@@ -758,6 +956,84 @@ VMManager::CheckResults VMManager::CheckRangeState(VAddr address, u64 size, Memo
        std::make_tuple(initial_state, initial_permissions, initial_attributes & ~ignore_mask));
}

ResultVal<std::size_t> VMManager::SizeOfAllocatedVMAsInRange(VAddr address,
                                                             std::size_t size) const {
    const VAddr end_addr = address + size;
    const VAddr last_addr = end_addr - 1;
    std::size_t mapped_size = 0;

    VAddr cur_addr = address;
    auto iter = FindVMA(cur_addr);
    ASSERT_MSG(iter != vma_map.end(), "SizeOfAllocatedVMAsInRange iter != end");

    while (true) {
        const auto& vma = iter->second;
        const VAddr vma_start = vma.base;
        const VAddr vma_end = vma_start + vma.size;
        const VAddr vma_last = vma_end - 1;

        // Add size if relevant.
        if (vma.state != MemoryState::Unmapped) {
            mapped_size += std::min(end_addr - cur_addr, vma_end - cur_addr);
        }

        // Break once we hit the end of the range.
        if (last_addr <= vma_last) {
            break;
        }

        // Advance to the next block.
        cur_addr = vma_end;
        iter = std::next(iter);
        ASSERT_MSG(iter != vma_map.end(), "SizeOfAllocatedVMAsInRange iter != end");
    }

    return MakeResult(mapped_size);
}

ResultVal<std::size_t> VMManager::SizeOfUnmappablePhysicalMemoryInRange(VAddr address,
                                                                        std::size_t size) const {
    const VAddr end_addr = address + size;
    const VAddr last_addr = end_addr - 1;
    std::size_t mapped_size = 0;

    VAddr cur_addr = address;
    auto iter = FindVMA(cur_addr);
    ASSERT_MSG(iter != vma_map.end(), "SizeOfUnmappablePhysicalMemoryInRange iter != end");

    while (true) {
        const auto& vma = iter->second;
        const auto vma_start = vma.base;
        const auto vma_end = vma_start + vma.size;
        const auto vma_last = vma_end - 1;
        const auto state = vma.state;
        const auto attr = vma.attribute;

        // Memory within region must be free or mapped heap.
        if (!((state == MemoryState::Heap && attr == MemoryAttribute::None) ||
              (state == MemoryState::Unmapped))) {
            return ERR_INVALID_ADDRESS_STATE;
        }

        // Add size if relevant.
        if (state != MemoryState::Unmapped) {
            mapped_size += std::min(end_addr - cur_addr, vma_end - cur_addr);
        }

        // Break once we hit the end of the range.
        if (last_addr <= vma_last) {
            break;
        }

        // Advance to the next block.
        cur_addr = vma_end;
        iter = std::next(iter);
        ASSERT_MSG(iter != vma_map.end(), "SizeOfUnmappablePhysicalMemoryInRange iter != end");
    }

    return MakeResult(mapped_size);
}

u64 VMManager::GetTotalPhysicalMemoryAvailable() const {
    LOG_WARNING(Kernel, "(STUBBED) called");
    return 0xF8000000;

@@ -349,7 +349,8 @@ public:
     * @param state MemoryState tag to attach to the VMA.
     */
    ResultVal<VMAHandle> MapMemoryBlock(VAddr target, std::shared_ptr<std::vector<u8>> block,
                                        std::size_t offset, u64 size, MemoryState state);
                                        std::size_t offset, u64 size, MemoryState state,
                                        VMAPermission perm = VMAPermission::ReadWrite);

    /**
     * Maps an unmanaged host memory pointer at a given address.

@@ -450,6 +451,34 @@ public:
    ///
    ResultVal<VAddr> SetHeapSize(u64 size);

    /// Maps memory at a given address.
    ///
    /// @param addr The virtual address to map memory at.
    /// @param size The amount of memory to map.
    ///
    /// @note The destination address must lie within the Map region.
    ///
    /// @note This function requires that SystemResourceSize be non-zero,
    ///       however, this is just because if it were not then the
    ///       resulting page tables could be exploited on hardware by
    ///       a malicious program. SystemResource usage does not need
    ///       to be explicitly checked or updated here.
    ResultCode MapPhysicalMemory(VAddr target, u64 size);

    /// Unmaps memory at a given address.
    ///
    /// @param addr The virtual address to unmap memory at.
    /// @param size The amount of memory to unmap.
    ///
    /// @note The destination address must lie within the Map region.
    ///
    /// @note This function requires that SystemResourceSize be non-zero,
    ///       however, this is just because if it were not then the
    ///       resulting page tables could be exploited on hardware by
    ///       a malicious program. SystemResource usage does not need
    ///       to be explicitly checked or updated here.
    ResultCode UnmapPhysicalMemory(VAddr target, u64 size);

    /// Maps a region of memory as code memory.
    ///
    /// @param dst_address The base address of the region to create the aliasing memory region.

@@ -657,6 +686,11 @@ private:
     */
    VMAIter MergeAdjacent(VMAIter vma);

    /**
     * Merges two adjacent VMAs.
     */
    void MergeAdjacentVMA(VirtualMemoryArea& left, const VirtualMemoryArea& right);

    /// Updates the pages corresponding to this VMA so they match the VMA's attributes.
    void UpdatePageTableForVMA(const VirtualMemoryArea& vma);

@@ -701,6 +735,13 @@ private:
                                 MemoryAttribute attribute_mask, MemoryAttribute attribute,
                                 MemoryAttribute ignore_mask) const;

    /// Gets the amount of memory currently mapped (state != Unmapped) in a range.
    ResultVal<std::size_t> SizeOfAllocatedVMAsInRange(VAddr address, std::size_t size) const;

    /// Gets the amount of memory unmappable by UnmapPhysicalMemory in a range.
    ResultVal<std::size_t> SizeOfUnmappablePhysicalMemoryInRange(VAddr address,
                                                                 std::size_t size) const;

    /**
     * A map covering the entirety of the managed address space, keyed by the `base` field of each
     * VMA. It must always be modified by splitting or merging VMAs, so that the invariant

@@ -742,6 +783,11 @@ private:
    // end of the range. This is essentially 'base_address + current_size'.
    VAddr heap_end = 0;

    // The current amount of memory mapped via MapPhysicalMemory.
    // This is used here (and in Nintendo's kernel) only for debugging, and does not impact
    // any behavior.
    u64 physical_memory_mapped = 0;

    Core::System& system;
};
} // namespace Kernel

@@ -1,4 +1,5 @@
add_library(video_core STATIC
    buffer_cache.h
    dma_pusher.cpp
    dma_pusher.h
    debug_utils/debug_utils.cpp

@@ -43,8 +44,6 @@ add_library(video_core STATIC
    renderer_opengl/gl_device.h
    renderer_opengl/gl_framebuffer_cache.cpp
    renderer_opengl/gl_framebuffer_cache.h
    renderer_opengl/gl_global_cache.cpp
    renderer_opengl/gl_global_cache.h
    renderer_opengl/gl_rasterizer.cpp
    renderer_opengl/gl_rasterizer.h
    renderer_opengl/gl_resource_manager.cpp

src/video_core/buffer_cache.h (new file, 299 lines)
@@ -0,0 +1,299 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.

#pragma once

#include <array>
#include <memory>
#include <mutex>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

#include "common/alignment.h"
#include "common/common_types.h"
#include "core/core.h"
#include "video_core/memory_manager.h"
#include "video_core/rasterizer_cache.h"

namespace VideoCore {
class RasterizerInterface;
}

namespace VideoCommon {

template <typename BufferStorageType>
class CachedBuffer final : public RasterizerCacheObject {
public:
    explicit CachedBuffer(VAddr cpu_addr, u8* host_ptr)
        : RasterizerCacheObject{host_ptr}, host_ptr{host_ptr}, cpu_addr{cpu_addr} {}
    ~CachedBuffer() override = default;

    VAddr GetCpuAddr() const override {
        return cpu_addr;
    }

    std::size_t GetSizeInBytes() const override {
        return size;
    }

    u8* GetWritableHostPtr() const {
        return host_ptr;
    }

    std::size_t GetSize() const {
        return size;
    }

    std::size_t GetCapacity() const {
        return capacity;
    }

    bool IsInternalized() const {
        return is_internal;
    }

    const BufferStorageType& GetBuffer() const {
        return buffer;
    }

    void SetSize(std::size_t new_size) {
        size = new_size;
    }

    void SetInternalState(bool is_internal_) {
        is_internal = is_internal_;
    }

    BufferStorageType ExchangeBuffer(BufferStorageType buffer_, std::size_t new_capacity) {
        capacity = new_capacity;
        std::swap(buffer, buffer_);
        return buffer_;
    }

private:
    u8* host_ptr{};
    VAddr cpu_addr{};
    std::size_t size{};
    std::size_t capacity{};
    bool is_internal{};
    BufferStorageType buffer;
};

template <typename BufferStorageType, typename BufferType, typename StreamBuffer>
class BufferCache : public RasterizerCache<std::shared_ptr<CachedBuffer<BufferStorageType>>> {
public:
    using Buffer = std::shared_ptr<CachedBuffer<BufferStorageType>>;
    using BufferInfo = std::pair<const BufferType*, u64>;

    explicit BufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
                         std::unique_ptr<StreamBuffer> stream_buffer)
        : RasterizerCache<Buffer>{rasterizer}, system{system},
          stream_buffer{std::move(stream_buffer)}, stream_buffer_handle{
                                                       this->stream_buffer->GetHandle()} {}
    ~BufferCache() = default;

    void Unregister(const Buffer& entry) override {
        std::lock_guard lock{RasterizerCache<Buffer>::mutex};
        if (entry->IsInternalized()) {
            internalized_entries.erase(entry->GetCacheAddr());
        }
        ReserveBuffer(entry);
        RasterizerCache<Buffer>::Unregister(entry);
    }

    void TickFrame() {
        marked_for_destruction_index =
            (marked_for_destruction_index + 1) % marked_for_destruction_ring_buffer.size();
        MarkedForDestruction().clear();
    }

    BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4,
                            bool internalize = false, bool is_written = false) {
        std::lock_guard lock{RasterizerCache<Buffer>::mutex};

        auto& memory_manager = system.GPU().MemoryManager();
        const auto host_ptr = memory_manager.GetPointer(gpu_addr);
        if (!host_ptr) {
            return {GetEmptyBuffer(size), 0};
        }
        const auto cache_addr = ToCacheAddr(host_ptr);

        // Cache management is a big overhead, so only cache entries with a given size.
        // TODO: Figure out which size is the best for given games.
        constexpr std::size_t max_stream_size = 0x800;
        if (!internalize && size < max_stream_size &&
            internalized_entries.find(cache_addr) == internalized_entries.end()) {
            return StreamBufferUpload(host_ptr, size, alignment);
        }

        auto entry = RasterizerCache<Buffer>::TryGet(cache_addr);
        if (!entry) {
            return FixedBufferUpload(gpu_addr, host_ptr, size, internalize, is_written);
        }

        if (entry->GetSize() < size) {
            IncreaseBufferSize(entry, size);
        }
        if (is_written) {
            entry->MarkAsModified(true, *this);
        }
        return {ToHandle(entry->GetBuffer()), 0};
    }

    /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset.
    BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size,
                                std::size_t alignment = 4) {
        std::lock_guard lock{RasterizerCache<Buffer>::mutex};
        return StreamBufferUpload(raw_pointer, size, alignment);
    }

    void Map(std::size_t max_size) {
        std::tie(buffer_ptr, buffer_offset_base, invalidated) = stream_buffer->Map(max_size, 4);
        buffer_offset = buffer_offset_base;
    }

    /// Finishes the upload stream, returns true on bindings invalidation.
    bool Unmap() {
        stream_buffer->Unmap(buffer_offset - buffer_offset_base);
        return std::exchange(invalidated, false);
    }

    virtual const BufferType* GetEmptyBuffer(std::size_t size) = 0;

protected:
    void FlushObjectInner(const Buffer& entry) override {
        DownloadBufferData(entry->GetBuffer(), 0, entry->GetSize(), entry->GetWritableHostPtr());
    }

    virtual BufferStorageType CreateBuffer(std::size_t size) = 0;

    virtual const BufferType* ToHandle(const BufferStorageType& storage) = 0;

    virtual void UploadBufferData(const BufferStorageType& buffer, std::size_t offset,
                                  std::size_t size, const u8* data) = 0;

    virtual void DownloadBufferData(const BufferStorageType& buffer, std::size_t offset,
                                    std::size_t size, u8* data) = 0;

    virtual void CopyBufferData(const BufferStorageType& src, const BufferStorageType& dst,
                                std::size_t src_offset, std::size_t dst_offset,
                                std::size_t size) = 0;

private:
    BufferInfo StreamBufferUpload(const void* raw_pointer, std::size_t size,
                                  std::size_t alignment) {
        AlignBuffer(alignment);
        const std::size_t uploaded_offset = buffer_offset;
        std::memcpy(buffer_ptr, raw_pointer, size);

        buffer_ptr += size;
        buffer_offset += size;
        return {&stream_buffer_handle, uploaded_offset};
    }

    BufferInfo FixedBufferUpload(GPUVAddr gpu_addr, u8* host_ptr, std::size_t size,
                                 bool internalize, bool is_written) {
        auto& memory_manager = Core::System::GetInstance().GPU().MemoryManager();
        const auto cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr);
        ASSERT(cpu_addr);

        auto entry = GetUncachedBuffer(*cpu_addr, host_ptr);
        entry->SetSize(size);
        entry->SetInternalState(internalize);
        RasterizerCache<Buffer>::Register(entry);

        if (internalize) {
            internalized_entries.emplace(ToCacheAddr(host_ptr));
        }
        if (is_written) {
            entry->MarkAsModified(true, *this);
        }

        if (entry->GetCapacity() < size) {
            MarkedForDestruction().push_back(entry->ExchangeBuffer(CreateBuffer(size), size));
        }

        UploadBufferData(entry->GetBuffer(), 0, size, host_ptr);
        return {ToHandle(entry->GetBuffer()), 0};
    }

    void IncreaseBufferSize(Buffer& entry, std::size_t new_size) {
        const std::size_t old_size = entry->GetSize();
        if (entry->GetCapacity() < new_size) {
            const auto& old_buffer = entry->GetBuffer();
            auto new_buffer = CreateBuffer(new_size);

            // Copy bits from the old buffer to the new buffer.
            CopyBufferData(old_buffer, new_buffer, 0, 0, old_size);
            MarkedForDestruction().push_back(
                entry->ExchangeBuffer(std::move(new_buffer), new_size));

            // This buffer could have been used
            invalidated = true;
        }
        // Upload the new bits.
        const std::size_t size_diff = new_size - old_size;
        UploadBufferData(entry->GetBuffer(), old_size, size_diff, entry->GetHostPtr() + old_size);

        // Update entry's size in the object and in the cache.
        Unregister(entry);

        entry->SetSize(new_size);
        RasterizerCache<Buffer>::Register(entry);
    }

    Buffer GetUncachedBuffer(VAddr cpu_addr, u8* host_ptr) {
        if (auto entry = TryGetReservedBuffer(host_ptr)) {
            return entry;
        }
        return std::make_shared<CachedBuffer<BufferStorageType>>(cpu_addr, host_ptr);
    }

    Buffer TryGetReservedBuffer(u8* host_ptr) {
        const auto it = buffer_reserve.find(ToCacheAddr(host_ptr));
        if (it == buffer_reserve.end()) {
            return {};
        }
        auto& reserve = it->second;
        auto entry = reserve.back();
        reserve.pop_back();
        return entry;
    }

    void ReserveBuffer(Buffer entry) {
        buffer_reserve[entry->GetCacheAddr()].push_back(std::move(entry));
    }

    void AlignBuffer(std::size_t alignment) {
        // Align the offset, not the mapped pointer
        const std::size_t offset_aligned = Common::AlignUp(buffer_offset, alignment);
        buffer_ptr += offset_aligned - buffer_offset;
        buffer_offset = offset_aligned;
    }

    std::vector<BufferStorageType>& MarkedForDestruction() {
        return marked_for_destruction_ring_buffer[marked_for_destruction_index];
    }

    Core::System& system;

    std::unique_ptr<StreamBuffer> stream_buffer;
    BufferType stream_buffer_handle{};

    bool invalidated = false;

    u8* buffer_ptr = nullptr;
    u64 buffer_offset = 0;
    u64 buffer_offset_base = 0;

    std::size_t marked_for_destruction_index = 0;
    std::array<std::vector<BufferStorageType>, 4> marked_for_destruction_ring_buffer;

    std::unordered_set<CacheAddr> internalized_entries;
    std::unordered_map<CacheAddr, std::vector<Buffer>> buffer_reserve;
};

} // namespace VideoCommon

@@ -22,7 +22,7 @@ void DmaPusher::DispatchCalls() {
    MICROPROFILE_SCOPE(DispatchCalls);

    // On entering GPU code, assume all memory may be touched by the ARM core.
    gpu.Maxwell3D().dirty_flags.OnMemoryWrite();
    gpu.Maxwell3D().dirty.OnMemoryWrite();

    dma_pushbuffer_subindex = 0;

@@ -37,7 +37,7 @@ void KeplerCompute::CallMethod(const GPU::MethodCall& method_call) {
        const bool is_last_call = method_call.IsLastCall();
        upload_state.ProcessData(method_call.argument, is_last_call);
        if (is_last_call) {
            system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
            system.GPU().Maxwell3D().dirty.OnMemoryWrite();
        }
        break;
    }

@@ -50,13 +50,14 @@ void KeplerCompute::CallMethod(const GPU::MethodCall& method_call) {
}

void KeplerCompute::ProcessLaunch() {

    const GPUVAddr launch_desc_loc = regs.launch_desc_loc.Address();
    memory_manager.ReadBlockUnsafe(launch_desc_loc, &launch_description,
                                   LaunchParams::NUM_LAUNCH_PARAMETERS * sizeof(u32));

    const GPUVAddr code_loc = regs.code_loc.Address() + launch_description.program_start;
    LOG_WARNING(HW_GPU, "Compute Kernel Execute at Address 0x{:016x}, STUBBED", code_loc);
    const GPUVAddr code_addr = regs.code_loc.Address() + launch_description.program_start;
    LOG_TRACE(HW_GPU, "Compute invocation launched at address 0x{:016x}", code_addr);

    rasterizer.DispatchCompute(code_addr);
}

} // namespace Tegra::Engines

@@ -34,7 +34,7 @@ void KeplerMemory::CallMethod(const GPU::MethodCall& method_call) {
        const bool is_last_call = method_call.IsLastCall();
        upload_state.ProcessData(method_call.argument, is_last_call);
        if (is_last_call) {
            system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
            system.GPU().Maxwell3D().dirty.OnMemoryWrite();
        }
        break;
    }

@@ -22,6 +22,7 @@ Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& raste
                     MemoryManager& memory_manager)
    : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager},
      macro_interpreter{*this}, upload_state{memory_manager, regs.upload} {
+    InitDirtySettings();
    InitializeRegisterDefaults();
}

@@ -69,6 +70,10 @@ void Maxwell3D::InitializeRegisterDefaults() {
    regs.stencil_back_func_mask = 0xFFFFFFFF;
    regs.stencil_back_mask = 0xFFFFFFFF;

+    regs.depth_test_func = Regs::ComparisonOp::Always;
+    regs.cull.front_face = Regs::Cull::FrontFace::CounterClockWise;
+    regs.cull.cull_face = Regs::Cull::CullFace::Back;
+
    // TODO(Rodrigo): Most games do not set a point size. I think this is a case of a
    // register carrying a default value. Assume it's OpenGL's default (1).
    regs.point_size = 1.0f;
@@ -86,6 +91,159 @@ void Maxwell3D::InitializeRegisterDefaults() {
    regs.rt_separate_frag_data = 1;
}

#define DIRTY_REGS_POS(field_name) (offsetof(Maxwell3D::DirtyRegs, field_name))

void Maxwell3D::InitDirtySettings() {
    const auto set_block = [this](const u32 start, const u32 range, const u8 position) {
        const auto start_itr = dirty_pointers.begin() + start;
        const auto end_itr = start_itr + range;
        std::fill(start_itr, end_itr, position);
    };
    dirty.regs.fill(true);

    // Init Render Targets
    constexpr u32 registers_per_rt = sizeof(regs.rt[0]) / sizeof(u32);
    constexpr u32 rt_start_reg = MAXWELL3D_REG_INDEX(rt);
    constexpr u32 rt_end_reg = rt_start_reg + registers_per_rt * 8;
    u32 rt_dirty_reg = DIRTY_REGS_POS(render_target);
    for (u32 rt_reg = rt_start_reg; rt_reg < rt_end_reg; rt_reg += registers_per_rt) {
        set_block(rt_reg, registers_per_rt, rt_dirty_reg);
        rt_dirty_reg++;
    }
    constexpr u32 depth_buffer_flag = DIRTY_REGS_POS(depth_buffer);
    dirty_pointers[MAXWELL3D_REG_INDEX(zeta_enable)] = depth_buffer_flag;
    dirty_pointers[MAXWELL3D_REG_INDEX(zeta_width)] = depth_buffer_flag;
    dirty_pointers[MAXWELL3D_REG_INDEX(zeta_height)] = depth_buffer_flag;
    constexpr u32 registers_in_zeta = sizeof(regs.zeta) / sizeof(u32);
    constexpr u32 zeta_reg = MAXWELL3D_REG_INDEX(zeta);
    set_block(zeta_reg, registers_in_zeta, depth_buffer_flag);

    // Init Vertex Arrays
    constexpr u32 vertex_array_start = MAXWELL3D_REG_INDEX(vertex_array);
    constexpr u32 vertex_array_size = sizeof(regs.vertex_array[0]) / sizeof(u32);
    constexpr u32 vertex_array_end = vertex_array_start + vertex_array_size * Regs::NumVertexArrays;
    u32 va_reg = DIRTY_REGS_POS(vertex_array);
    u32 vi_reg = DIRTY_REGS_POS(vertex_instance);
    for (u32 vertex_reg = vertex_array_start; vertex_reg < vertex_array_end;
         vertex_reg += vertex_array_size) {
        set_block(vertex_reg, 3, va_reg);
        // The divisor concerns vertex array instances
        dirty_pointers[vertex_reg + 3] = vi_reg;
        va_reg++;
        vi_reg++;
    }
    constexpr u32 vertex_limit_start = MAXWELL3D_REG_INDEX(vertex_array_limit);
    constexpr u32 vertex_limit_size = sizeof(regs.vertex_array_limit[0]) / sizeof(u32);
    constexpr u32 vertex_limit_end = vertex_limit_start + vertex_limit_size * Regs::NumVertexArrays;
    va_reg = DIRTY_REGS_POS(vertex_array);
    for (u32 vertex_reg = vertex_limit_start; vertex_reg < vertex_limit_end;
         vertex_reg += vertex_limit_size) {
        set_block(vertex_reg, vertex_limit_size, va_reg);
        va_reg++;
    }
    constexpr u32 vertex_instance_start = MAXWELL3D_REG_INDEX(instanced_arrays);
    constexpr u32 vertex_instance_size =
        sizeof(regs.instanced_arrays.is_instanced[0]) / sizeof(u32);
    constexpr u32 vertex_instance_end =
        vertex_instance_start + vertex_instance_size * Regs::NumVertexArrays;
    vi_reg = DIRTY_REGS_POS(vertex_instance);
    for (u32 vertex_reg = vertex_instance_start; vertex_reg < vertex_instance_end;
         vertex_reg += vertex_instance_size) {
        set_block(vertex_reg, vertex_instance_size, vi_reg);
        vi_reg++;
    }
    set_block(MAXWELL3D_REG_INDEX(vertex_attrib_format), regs.vertex_attrib_format.size(),
              DIRTY_REGS_POS(vertex_attrib_format));

    // Init Shaders
    constexpr u32 shader_registers_count =
        sizeof(regs.shader_config[0]) * Regs::MaxShaderProgram / sizeof(u32);
    set_block(MAXWELL3D_REG_INDEX(shader_config[0]), shader_registers_count,
              DIRTY_REGS_POS(shaders));

    // State

    // Viewport
    constexpr u32 viewport_dirty_reg = DIRTY_REGS_POS(viewport);
    constexpr u32 viewport_start = MAXWELL3D_REG_INDEX(viewports);
    constexpr u32 viewport_size = sizeof(regs.viewports) / sizeof(u32);
    set_block(viewport_start, viewport_size, viewport_dirty_reg);
    constexpr u32 view_volume_start = MAXWELL3D_REG_INDEX(view_volume_clip_control);
    constexpr u32 view_volume_size = sizeof(regs.view_volume_clip_control) / sizeof(u32);
    set_block(view_volume_start, view_volume_size, viewport_dirty_reg);

    // Viewport transformation
    constexpr u32 viewport_trans_start = MAXWELL3D_REG_INDEX(viewport_transform);
    constexpr u32 viewport_trans_size = sizeof(regs.viewport_transform) / sizeof(u32);
    set_block(viewport_trans_start, viewport_trans_size, DIRTY_REGS_POS(viewport_transform));

    // Cullmode
    constexpr u32 cull_mode_start = MAXWELL3D_REG_INDEX(cull);
    constexpr u32 cull_mode_size = sizeof(regs.cull) / sizeof(u32);
    set_block(cull_mode_start, cull_mode_size, DIRTY_REGS_POS(cull_mode));

    // Screen y control
    dirty_pointers[MAXWELL3D_REG_INDEX(screen_y_control)] = DIRTY_REGS_POS(screen_y_control);

    // Primitive Restart
    constexpr u32 primitive_restart_start = MAXWELL3D_REG_INDEX(primitive_restart);
    constexpr u32 primitive_restart_size = sizeof(regs.primitive_restart) / sizeof(u32);
    set_block(primitive_restart_start, primitive_restart_size, DIRTY_REGS_POS(primitive_restart));

    // Depth Test
    constexpr u32 depth_test_dirty_reg = DIRTY_REGS_POS(depth_test);
    dirty_pointers[MAXWELL3D_REG_INDEX(depth_test_enable)] = depth_test_dirty_reg;
    dirty_pointers[MAXWELL3D_REG_INDEX(depth_write_enabled)] = depth_test_dirty_reg;
    dirty_pointers[MAXWELL3D_REG_INDEX(depth_test_func)] = depth_test_dirty_reg;

    // Stencil Test
    constexpr u32 stencil_test_dirty_reg = DIRTY_REGS_POS(stencil_test);
    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_enable)] = stencil_test_dirty_reg;
    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_func_func)] = stencil_test_dirty_reg;
    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_func_ref)] = stencil_test_dirty_reg;
    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_func_mask)] = stencil_test_dirty_reg;
    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_op_fail)] = stencil_test_dirty_reg;
    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_op_zfail)] = stencil_test_dirty_reg;
    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_op_zpass)] = stencil_test_dirty_reg;
    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_front_mask)] = stencil_test_dirty_reg;
    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_two_side_enable)] = stencil_test_dirty_reg;
    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_func_func)] = stencil_test_dirty_reg;
    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_func_ref)] = stencil_test_dirty_reg;
    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_func_mask)] = stencil_test_dirty_reg;
    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_op_fail)] = stencil_test_dirty_reg;
    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_op_zfail)] = stencil_test_dirty_reg;
    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_op_zpass)] = stencil_test_dirty_reg;
    dirty_pointers[MAXWELL3D_REG_INDEX(stencil_back_mask)] = stencil_test_dirty_reg;

    // Color Mask
    constexpr u32 color_mask_dirty_reg = DIRTY_REGS_POS(color_mask);
    dirty_pointers[MAXWELL3D_REG_INDEX(color_mask_common)] = color_mask_dirty_reg;
    set_block(MAXWELL3D_REG_INDEX(color_mask), sizeof(regs.color_mask) / sizeof(u32),
              color_mask_dirty_reg);
    // Blend State
    constexpr u32 blend_state_dirty_reg = DIRTY_REGS_POS(blend_state);
    set_block(MAXWELL3D_REG_INDEX(blend_color), sizeof(regs.blend_color) / sizeof(u32),
              blend_state_dirty_reg);
    dirty_pointers[MAXWELL3D_REG_INDEX(independent_blend_enable)] = blend_state_dirty_reg;
    set_block(MAXWELL3D_REG_INDEX(blend), sizeof(regs.blend) / sizeof(u32), blend_state_dirty_reg);
    set_block(MAXWELL3D_REG_INDEX(independent_blend), sizeof(regs.independent_blend) / sizeof(u32),
              blend_state_dirty_reg);

    // Scissor State
    constexpr u32 scissor_test_dirty_reg = DIRTY_REGS_POS(scissor_test);
    set_block(MAXWELL3D_REG_INDEX(scissor_test), sizeof(regs.scissor_test) / sizeof(u32),
              scissor_test_dirty_reg);

    // Polygon Offset
    constexpr u32 polygon_offset_dirty_reg = DIRTY_REGS_POS(polygon_offset);
    dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_fill_enable)] = polygon_offset_dirty_reg;
    dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_line_enable)] = polygon_offset_dirty_reg;
    dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_point_enable)] = polygon_offset_dirty_reg;
    dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_units)] = polygon_offset_dirty_reg;
    dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_factor)] = polygon_offset_dirty_reg;
    dirty_pointers[MAXWELL3D_REG_INDEX(polygon_offset_clamp)] = polygon_offset_dirty_reg;
}

void Maxwell3D::CallMacroMethod(u32 method, std::vector<u32> parameters) {
    // Reset the current macro.
    executing_macro = 0;
@@ -108,6 +266,14 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {

    const u32 method = method_call.method;

+    if (method == cb_data_state.current) {
+        regs.reg_array[method] = method_call.argument;
+        ProcessCBData(method_call.argument);
+        return;
+    } else if (cb_data_state.current != null_cb_data) {
+        FinishCBData();
+    }
+
    // It is an error to write to a register other than the current macro's ARG register before it
    // has finished execution.
    if (executing_macro != 0) {
@@ -143,49 +309,19 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {

    if (regs.reg_array[method] != method_call.argument) {
        regs.reg_array[method] = method_call.argument;
-        // Color buffers
-        constexpr u32 first_rt_reg = MAXWELL3D_REG_INDEX(rt);
-        constexpr u32 registers_per_rt = sizeof(regs.rt[0]) / sizeof(u32);
-        if (method >= first_rt_reg &&
-            method < first_rt_reg + registers_per_rt * Regs::NumRenderTargets) {
-            const std::size_t rt_index = (method - first_rt_reg) / registers_per_rt;
-            dirty_flags.color_buffer.set(rt_index);
-        }
-
-        // Zeta buffer
-        constexpr u32 registers_in_zeta = sizeof(regs.zeta) / sizeof(u32);
-        if (method == MAXWELL3D_REG_INDEX(zeta_enable) ||
-            method == MAXWELL3D_REG_INDEX(zeta_width) ||
-            method == MAXWELL3D_REG_INDEX(zeta_height) ||
-            (method >= MAXWELL3D_REG_INDEX(zeta) &&
-             method < MAXWELL3D_REG_INDEX(zeta) + registers_in_zeta)) {
-            dirty_flags.zeta_buffer = true;
-        }
-
-        // Shader
-        constexpr u32 shader_registers_count =
-            sizeof(regs.shader_config[0]) * Regs::MaxShaderProgram / sizeof(u32);
-        if (method >= MAXWELL3D_REG_INDEX(shader_config[0]) &&
-            method < MAXWELL3D_REG_INDEX(shader_config[0]) + shader_registers_count) {
-            dirty_flags.shaders = true;
-        }
-
-        // Vertex format
-        if (method >= MAXWELL3D_REG_INDEX(vertex_attrib_format) &&
-            method < MAXWELL3D_REG_INDEX(vertex_attrib_format) + regs.vertex_attrib_format.size()) {
-            dirty_flags.vertex_attrib_format = true;
-        }
-
-        // Vertex buffer
-        if (method >= MAXWELL3D_REG_INDEX(vertex_array) &&
-            method < MAXWELL3D_REG_INDEX(vertex_array) + 4 * Regs::NumVertexArrays) {
-            dirty_flags.vertex_array.set((method - MAXWELL3D_REG_INDEX(vertex_array)) >> 2);
-        } else if (method >= MAXWELL3D_REG_INDEX(vertex_array_limit) &&
-                   method < MAXWELL3D_REG_INDEX(vertex_array_limit) + 2 * Regs::NumVertexArrays) {
-            dirty_flags.vertex_array.set((method - MAXWELL3D_REG_INDEX(vertex_array_limit)) >> 1);
-        } else if (method >= MAXWELL3D_REG_INDEX(instanced_arrays) &&
-                   method < MAXWELL3D_REG_INDEX(instanced_arrays) + Regs::NumVertexArrays) {
-            dirty_flags.vertex_array.set(method - MAXWELL3D_REG_INDEX(instanced_arrays));
+        const std::size_t dirty_reg = dirty_pointers[method];
+        if (dirty_reg) {
+            dirty.regs[dirty_reg] = true;
+            if (dirty_reg >= DIRTY_REGS_POS(vertex_array) &&
+                dirty_reg < DIRTY_REGS_POS(vertex_array_buffers)) {
+                dirty.vertex_array_buffers = true;
+            } else if (dirty_reg >= DIRTY_REGS_POS(vertex_instance) &&
+                       dirty_reg < DIRTY_REGS_POS(vertex_instances)) {
+                dirty.vertex_instances = true;
+            } else if (dirty_reg >= DIRTY_REGS_POS(render_target) &&
+                       dirty_reg < DIRTY_REGS_POS(render_settings)) {
+                dirty.render_settings = true;
+            }
+        }
    }
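
Register writes now resolve their dirty flag through one table lookup instead of a cascade of range checks: InitDirtySettings maps each register index to a flag slot, and CallMethod just indexes dirty_pointers. A scaled-down model of the two halves (array sizes shrunk for clarity; slot 0 is the null flag):

#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>

constexpr std::size_t NUM_METHOD_REGS = 16;
constexpr std::size_t NUM_DIRTY_FLAGS = 4;

std::array<std::uint8_t, NUM_METHOD_REGS> dirty_pointers{}; // register -> flag slot (0 = none)
std::array<bool, NUM_DIRTY_FLAGS> dirty_regs{};

// Build time, like set_block above: point a block of registers at one flag.
void SetBlock(std::size_t start, std::size_t range, std::uint8_t flag) {
    std::fill(dirty_pointers.begin() + start, dirty_pointers.begin() + start + range, flag);
}

// Write time: a single lookup replaces the old chain of range comparisons.
void OnRegisterWrite(std::size_t method) {
    if (const std::uint8_t flag = dirty_pointers[method]; flag != 0) {
        dirty_regs[flag] = true;
    }
}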
@@ -214,7 +350,7 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]):
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]): {
-        ProcessCBData(method_call.argument);
+        StartCBData(method);
        break;
    }
    case MAXWELL3D_REG_INDEX(cb_bind[0].raw_config): {
@@ -249,6 +385,10 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
        ProcessQueryGet();
        break;
    }
+    case MAXWELL3D_REG_INDEX(condition.mode): {
+        ProcessQueryCondition();
+        break;
+    }
    case MAXWELL3D_REG_INDEX(sync_info): {
        ProcessSyncPoint();
        break;
@@ -261,7 +401,7 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
        const bool is_last_call = method_call.IsLastCall();
        upload_state.ProcessData(method_call.argument, is_last_call);
        if (is_last_call) {
-            dirty_flags.OnMemoryWrite();
+            dirty.OnMemoryWrite();
        }
        break;
    }
@@ -302,6 +442,7 @@ void Maxwell3D::ProcessQueryGet() {
        result = regs.query.query_sequence;
        break;
    default:
+        result = 1;
        UNIMPLEMENTED_MSG("Unimplemented query select type {}",
                          static_cast<u32>(regs.query.query_get.select.Value()));
    }
@@ -333,7 +474,6 @@ void Maxwell3D::ProcessQueryGet() {
            query_result.timestamp = system.CoreTiming().GetTicks();
            memory_manager.WriteBlock(sequence_address, &query_result, sizeof(query_result));
        }
-        dirty_flags.OnMemoryWrite();
        break;
    }
    default:
@@ -342,6 +482,45 @@ void Maxwell3D::ProcessQueryGet() {
    }
}

void Maxwell3D::ProcessQueryCondition() {
    const GPUVAddr condition_address{regs.condition.Address()};
    switch (regs.condition.mode) {
    case Regs::ConditionMode::Always: {
        execute_on = true;
        break;
    }
    case Regs::ConditionMode::Never: {
        execute_on = false;
        break;
    }
    case Regs::ConditionMode::ResNonZero: {
        Regs::QueryCompare cmp;
        memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp));
        execute_on = cmp.initial_sequence != 0U && cmp.initial_mode != 0U;
        break;
    }
    case Regs::ConditionMode::Equal: {
        Regs::QueryCompare cmp;
        memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp));
        execute_on =
            cmp.initial_sequence == cmp.current_sequence && cmp.initial_mode == cmp.current_mode;
        break;
    }
    case Regs::ConditionMode::NotEqual: {
        Regs::QueryCompare cmp;
        memory_manager.ReadBlockUnsafe(condition_address, &cmp, sizeof(cmp));
        execute_on =
            cmp.initial_sequence != cmp.current_sequence || cmp.initial_mode != cmp.current_mode;
        break;
    }
    default: {
        UNIMPLEMENTED_MSG("Unimplemented Condition Mode!");
        execute_on = true;
        break;
    }
    }
}

void Maxwell3D::ProcessSyncPoint() {
    const u32 sync_point = regs.sync_info.sync_point.Value();
    const u32 increment = regs.sync_info.increment.Value();
@@ -405,23 +584,39 @@ void Maxwell3D::ProcessCBBind(Regs::ShaderStage stage) {
}

void Maxwell3D::ProcessCBData(u32 value) {
+    const u32 id = cb_data_state.id;
+    cb_data_state.buffer[id][cb_data_state.counter] = value;
    // Increment the current buffer position.
    regs.const_buffer.cb_pos = regs.const_buffer.cb_pos + 4;
+    cb_data_state.counter++;
}

+void Maxwell3D::StartCBData(u32 method) {
+    constexpr u32 first_cb_data = MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]);
+    cb_data_state.start_pos = regs.const_buffer.cb_pos;
+    cb_data_state.id = method - first_cb_data;
+    cb_data_state.current = method;
+    cb_data_state.counter = 0;
+    ProcessCBData(regs.const_buffer.cb_data[cb_data_state.id]);
+}
+
+void Maxwell3D::FinishCBData() {
    // Write the input value to the current const buffer at the current position.
    const GPUVAddr buffer_address = regs.const_buffer.BufferAddress();
    ASSERT(buffer_address != 0);

    // Don't allow writing past the end of the buffer.
-    ASSERT(regs.const_buffer.cb_pos + sizeof(u32) <= regs.const_buffer.cb_size);
+    ASSERT(regs.const_buffer.cb_pos <= regs.const_buffer.cb_size);

-    const GPUVAddr address{buffer_address + regs.const_buffer.cb_pos};
+    const GPUVAddr address{buffer_address + cb_data_state.start_pos};
+    const std::size_t size = regs.const_buffer.cb_pos - cb_data_state.start_pos;

-    u8* ptr{memory_manager.GetPointer(address)};
-    rasterizer.InvalidateRegion(ToCacheAddr(ptr), sizeof(u32));
-    memory_manager.Write<u32>(address, value);
+    const u32 id = cb_data_state.id;
+    memory_manager.WriteBlock(address, cb_data_state.buffer[id].data(), size);
+    dirty.OnMemoryWrite();

-    dirty_flags.OnMemoryWrite();
-
-    // Increment the current buffer position.
-    regs.const_buffer.cb_pos = regs.const_buffer.cb_pos + 4;
+    cb_data_state.id = null_cb_data;
+    cb_data_state.current = null_cb_data;
}
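
The CB_DATA path above turns a per-word write (invalidate plus Write<u32> for every value) into a staged upload: StartCBData opens a batch, ProcessCBData appends to a local buffer, and FinishCBData commits everything with one WriteBlock. A toy model of that batching, with plain integers standing in for the GPU registers:

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

class ConstBufferUploader {
public:
    void Start(std::uint32_t pos) {
        start_pos = pos;
        cur_pos = pos;
        staged.clear();
    }

    void Push(std::uint32_t value) { // mirrors ProcessCBData
        staged.push_back(value);
        cur_pos += sizeof(std::uint32_t);
    }

    // Mirrors FinishCBData: one block write instead of N word writes.
    void Finish(std::vector<std::uint8_t>& memory) {
        const std::size_t size = cur_pos - start_pos;
        std::memcpy(memory.data() + start_pos, staged.data(), size);
    }

private:
    std::uint32_t start_pos = 0;
    std::uint32_t cur_pos = 0;
    std::vector<std::uint32_t> staged;
};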
Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const {

@@ -67,6 +67,7 @@ public:
    static constexpr std::size_t MaxShaderStage = 5;
    // Maximum number of const buffers per shader stage.
    static constexpr std::size_t MaxConstBuffers = 18;
+    static constexpr std::size_t MaxConstBufferSize = 0x10000;

    enum class QueryMode : u32 {
        Write = 0,
@@ -89,6 +90,20 @@ public:

    enum class QuerySelect : u32 {
        Zero = 0,
+        TimeElapsed = 2,
+        TransformFeedbackPrimitivesGenerated = 11,
+        PrimitivesGenerated = 18,
+        SamplesPassed = 21,
+        TransformFeedbackUnknown = 26,
    };

+    struct QueryCompare {
+        u32 initial_sequence;
+        u32 initial_mode;
+        u32 unknown1;
+        u32 unknown2;
+        u32 current_sequence;
+        u32 current_mode;
+    };

    enum class QuerySyncCondition : u32 {
@@ -96,6 +111,14 @@ public:
        GreaterThan = 1,
    };

+    enum class ConditionMode : u32 {
+        Never = 0,
+        Always = 1,
+        ResNonZero = 2,
+        Equal = 3,
+        NotEqual = 4,
+    };
+
    enum class ShaderProgram : u32 {
        VertexA = 0,
        VertexB = 1,
@@ -814,7 +837,18 @@ public:
            BitField<4, 1, u32> alpha_to_one;
        } multisample_control;

-        INSERT_PADDING_WORDS(0x7);
+        INSERT_PADDING_WORDS(0x4);

+        struct {
+            u32 address_high;
+            u32 address_low;
+            ConditionMode mode;
+
+            GPUVAddr Address() const {
+                return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
+                                             address_low);
+            }
+        } condition;

        struct {
            u32 tsc_address_high;
@@ -1123,23 +1157,77 @@ public:

    State state{};

-    struct DirtyFlags {
-        std::bitset<8> color_buffer{0xFF};
-        std::bitset<32> vertex_array{0xFFFFFFFF};
+    struct DirtyRegs {
+        static constexpr std::size_t NUM_REGS = 256;
+        union {
+            struct {
+                bool null_dirty;

-        bool vertex_attrib_format = true;
-        bool zeta_buffer = true;
-        bool shaders = true;
+                // Vertex Attributes
+                bool vertex_attrib_format;

+                // Vertex Arrays
+                std::array<bool, 32> vertex_array;

+                bool vertex_array_buffers;

+                // Vertex Instances
+                std::array<bool, 32> vertex_instance;

+                bool vertex_instances;

+                // Render Targets
+                std::array<bool, 8> render_target;
+                bool depth_buffer;

+                bool render_settings;

+                // Shaders
+                bool shaders;

+                // Rasterizer State
+                bool viewport;
+                bool clip_coefficient;
+                bool cull_mode;
+                bool primitive_restart;
+                bool depth_test;
+                bool stencil_test;
+                bool blend_state;
+                bool scissor_test;
+                bool transform_feedback;
+                bool color_mask;
+                bool polygon_offset;

+                // Complementary
+                bool viewport_transform;
+                bool screen_y_control;

+                bool memory_general;
+            };
+            std::array<bool, NUM_REGS> regs;
+        };

+        void ResetVertexArrays() {
+            vertex_array.fill(true);
+            vertex_array_buffers = true;
+        }

+        void ResetRenderTargets() {
+            depth_buffer = true;
+            render_target.fill(true);
+            render_settings = true;
+        }

        void OnMemoryWrite() {
-            zeta_buffer = true;
            shaders = true;
-            color_buffer.set();
-            vertex_array.set();
+            memory_general = true;
+            ResetRenderTargets();
+            ResetVertexArrays();
        }
-    };

-    DirtyFlags dirty_flags;
+    } dirty{};

+    std::array<u8, Regs::NUM_REGS> dirty_pointers{};
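
DirtyRegs overlays its named flags with a flat regs array through an anonymous union, so table-driven code can write dirty.regs[i] while consumers read named members like dirty.cull_mode. A reduced sketch of the aliasing; note that anonymous structs inside unions are a widely supported compiler extension, and writing one union member then reading another relies on compiler-tolerated type punning rather than strict ISO C++:

#include <array>
#include <cstddef>
#include <cstdio>

struct Dirty {
    static constexpr std::size_t NUM_REGS = 4;
    union {
        struct {
            bool null_dirty; // slot 0: writes that map to no flag land here
            bool viewport;   // slot 1
            bool cull_mode;  // slot 2
            bool depth_test; // slot 3
        };
        std::array<bool, NUM_REGS> regs;
    };
};
static_assert(sizeof(std::array<bool, Dirty::NUM_REGS>) == Dirty::NUM_REGS);

int main() {
    Dirty dirty{};
    dirty.regs[2] = true;  // generic, table-driven write...
    if (dirty.cull_mode) { // ...observed through the named alias
        std::puts("cull mode needs resyncing");
    }
}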

    /// Reads a register value located at the input method address
    u32 GetRegisterValue(u32 method) const;
@@ -1168,6 +1256,10 @@ public:
        return macro_memory;
    }

+    bool ShouldExecute() const {
+        return execute_on;
+    }
+
private:
    void InitializeRegisterDefaults();

@@ -1191,14 +1283,27 @@ private:
    /// Interpreter for the macro codes uploaded to the GPU.
    MacroInterpreter macro_interpreter;

+    static constexpr u32 null_cb_data = 0xFFFFFFFF;
+    struct {
+        std::array<std::array<u32, 0x4000>, 16> buffer;
+        u32 current{null_cb_data};
+        u32 id{null_cb_data};
+        u32 start_pos{};
+        u32 counter{};
+    } cb_data_state;

    Upload::State upload_state;

+    bool execute_on{true};

    /// Retrieves information about a specific TIC entry from the TIC buffer.
    Texture::TICEntry GetTICEntry(u32 tic_index) const;

    /// Retrieves information about a specific TSC entry from the TSC buffer.
    Texture::TSCEntry GetTSCEntry(u32 tsc_index) const;

+    void InitDirtySettings();

    /**
     * Call a macro on this engine.
     * @param method Method to call
@@ -1218,11 +1323,16 @@ private:
    /// Handles a write to the QUERY_GET register.
    void ProcessQueryGet();

+    // Handles Conditional Rendering
+    void ProcessQueryCondition();
+
    /// Handles writes to syncing register.
    void ProcessSyncPoint();

    /// Handles a write to the CB_DATA[i] register.
+    void StartCBData(u32 method);
    void ProcessCBData(u32 value);
+    void FinishCBData();

    /// Handles a write to the CB_BIND register.
    void ProcessCBBind(Regs::ShaderStage stage);
@@ -1289,6 +1399,7 @@ ASSERT_REG_POSITION(clip_distance_enabled, 0x544);
ASSERT_REG_POSITION(point_size, 0x546);
ASSERT_REG_POSITION(zeta_enable, 0x54E);
ASSERT_REG_POSITION(multisample_control, 0x54F);
+ASSERT_REG_POSITION(condition, 0x554);
ASSERT_REG_POSITION(tsc, 0x557);
ASSERT_REG_POSITION(polygon_offset_factor, 0x55b);
ASSERT_REG_POSITION(tic, 0x55D);

@@ -58,7 +58,7 @@ void MaxwellDMA::HandleCopy() {
    }

    // All copies here update the main memory, so mark all rasterizer states as invalid.
-    system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
+    system.GPU().Maxwell3D().dirty.OnMemoryWrite();

    if (regs.exec.is_dst_linear && regs.exec.is_src_linear) {
        // When the enable_2d bit is disabled, the copy is performed as if we were copying a 1D
@@ -78,7 +78,7 @@ union Attribute {
    constexpr explicit Attribute(u64 value) : value(value) {}

    enum class Index : u64 {
-        PointSize = 6,
+        LayerViewportPointSize = 6,
        Position = 7,
        Attribute_0 = 8,
        Attribute_31 = 39,
@@ -931,8 +931,6 @@ union Instruction {
    } csetp;

    union {
-        BitField<35, 4, PredCondition> cond;
-        BitField<49, 1, u64> h_and;
        BitField<6, 1, u64> ftz;
        BitField<45, 2, PredOperation> op;
        BitField<3, 3, u64> pred3;
@@ -940,9 +938,21 @@ union Instruction {
        BitField<43, 1, u64> negate_a;
        BitField<44, 1, u64> abs_a;
        BitField<47, 2, HalfType> type_a;
-        BitField<31, 1, u64> negate_b;
-        BitField<30, 1, u64> abs_b;
-        BitField<28, 2, HalfType> type_b;
+        union {
+            BitField<35, 4, PredCondition> cond;
+            BitField<49, 1, u64> h_and;
+            BitField<31, 1, u64> negate_b;
+            BitField<30, 1, u64> abs_b;
+            BitField<28, 2, HalfType> type_b;
+        } reg;
+        union {
+            BitField<56, 1, u64> negate_b;
+            BitField<54, 1, u64> abs_b;
+        } cbuf;
+        union {
+            BitField<49, 4, PredCondition> cond;
+            BitField<53, 1, u64> h_and;
+        } cbuf_and_imm;
        BitField<42, 1, u64> neg_pred;
        BitField<39, 3, u64> pred39;
    } hsetp2;
@@ -1548,7 +1558,9 @@ public:
        HFMA2_RC,
        HFMA2_RR,
        HFMA2_IMM_R,
+        HSETP2_C,
        HSETP2_R,
+        HSETP2_IMM,
        HSET2_R,
        POPC_C,
        POPC_R,
@@ -1831,7 +1843,9 @@ private:
        INST("01100---1-------", Id::HFMA2_RC, Type::Hfma2, "HFMA2_RC"),
        INST("0101110100000---", Id::HFMA2_RR, Type::Hfma2, "HFMA2_RR"),
        INST("01110---0-------", Id::HFMA2_IMM_R, Type::Hfma2, "HFMA2_R_IMM"),
-        INST("0101110100100---", Id::HSETP2_R, Type::HalfSetPredicate, "HSETP_R"),
+        INST("0111111-1-------", Id::HSETP2_C, Type::HalfSetPredicate, "HSETP2_C"),
+        INST("0101110100100---", Id::HSETP2_R, Type::HalfSetPredicate, "HSETP2_R"),
+        INST("0111111-0-------", Id::HSETP2_IMM, Type::HalfSetPredicate, "HSETP2_IMM"),
        INST("0101110100011---", Id::HSET2_R, Type::HalfSet, "HSET2_R"),
        INST("0101000010000---", Id::MUFU, Type::Arithmetic, "MUFU"),
        INST("0100110010010---", Id::RRO_C, Type::Arithmetic, "RRO_C"),
@@ -31,7 +31,7 @@ u32 FramebufferConfig::BytesPerPixel(PixelFormat format) {

GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer) : renderer{renderer} {
    auto& rasterizer{renderer.Rasterizer()};
-    memory_manager = std::make_unique<Tegra::MemoryManager>(rasterizer);
+    memory_manager = std::make_unique<Tegra::MemoryManager>(system, rasterizer);
    dma_pusher = std::make_unique<Tegra::DmaPusher>(*this);
    maxwell_3d = std::make_unique<Engines::Maxwell3D>(system, rasterizer, *memory_manager);
    fermi_2d = std::make_unique<Engines::Fermi2D>(rasterizer, *memory_manager);
@@ -50,6 +50,14 @@ const Engines::Maxwell3D& GPU::Maxwell3D() const {
    return *maxwell_3d;
}

+Engines::KeplerCompute& GPU::KeplerCompute() {
+    return *kepler_compute;
+}
+
+const Engines::KeplerCompute& GPU::KeplerCompute() const {
+    return *kepler_compute;
+}
+
MemoryManager& GPU::MemoryManager() {
    return *memory_manager;
}

@@ -155,6 +155,12 @@ public:
    /// Returns a const reference to the Maxwell3D GPU engine.
    const Engines::Maxwell3D& Maxwell3D() const;

+    /// Returns a reference to the KeplerCompute GPU engine.
+    Engines::KeplerCompute& KeplerCompute();
+
+    /// Returns a const reference to the KeplerCompute GPU engine.
+    const Engines::KeplerCompute& KeplerCompute() const;
+
    /// Returns a reference to the GPU memory manager.
    Tegra::MemoryManager& MemoryManager();

@@ -4,14 +4,18 @@

#include "common/assert.h"
#include "common/logging/log.h"
+#include "common/microprofile.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/macro_interpreter.h"

+MICROPROFILE_DEFINE(MacroInterp, "GPU", "Execute macro interpreter", MP_RGB(128, 128, 192));
+
namespace Tegra {

MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}

void MacroInterpreter::Execute(u32 offset, std::vector<u32> parameters) {
+    MICROPROFILE_SCOPE(MacroInterp);
    Reset();
    registers[1] = parameters[0];
    this->parameters = std::move(parameters);
@@ -5,13 +5,17 @@
#include "common/alignment.h"
#include "common/assert.h"
#include "common/logging/log.h"
+#include "core/core.h"
+#include "core/hle/kernel/process.h"
+#include "core/hle/kernel/vm_manager.h"
#include "core/memory.h"
#include "video_core/memory_manager.h"
#include "video_core/rasterizer_interface.h"

namespace Tegra {

-MemoryManager::MemoryManager(VideoCore::RasterizerInterface& rasterizer) : rasterizer{rasterizer} {
+MemoryManager::MemoryManager(Core::System& system, VideoCore::RasterizerInterface& rasterizer)
+    : rasterizer{rasterizer}, system{system} {
    std::fill(page_table.pointers.begin(), page_table.pointers.end(), nullptr);
    std::fill(page_table.attributes.begin(), page_table.attributes.end(),
              Common::PageType::Unmapped);
@@ -49,6 +53,11 @@ GPUVAddr MemoryManager::MapBufferEx(VAddr cpu_addr, u64 size) {
    const GPUVAddr gpu_addr{FindFreeRegion(address_space_base, aligned_size)};

    MapBackingMemory(gpu_addr, Memory::GetPointer(cpu_addr), aligned_size, cpu_addr);
+    ASSERT(system.CurrentProcess()
+               ->VMManager()
+               .SetMemoryAttribute(cpu_addr, size, Kernel::MemoryAttribute::DeviceMapped,
+                                   Kernel::MemoryAttribute::DeviceMapped)
+               .IsSuccess());

    return gpu_addr;
}
@@ -59,7 +68,11 @@ GPUVAddr MemoryManager::MapBufferEx(VAddr cpu_addr, GPUVAddr gpu_addr, u64 size)
    const u64 aligned_size{Common::AlignUp(size, page_size)};

    MapBackingMemory(gpu_addr, Memory::GetPointer(cpu_addr), aligned_size, cpu_addr);

+    ASSERT(system.CurrentProcess()
+               ->VMManager()
+               .SetMemoryAttribute(cpu_addr, size, Kernel::MemoryAttribute::DeviceMapped,
+                                   Kernel::MemoryAttribute::DeviceMapped)
+               .IsSuccess());
    return gpu_addr;
}

@@ -68,9 +81,16 @@ GPUVAddr MemoryManager::UnmapBuffer(GPUVAddr gpu_addr, u64 size) {

    const u64 aligned_size{Common::AlignUp(size, page_size)};
    const CacheAddr cache_addr{ToCacheAddr(GetPointer(gpu_addr))};
+    const auto cpu_addr = GpuToCpuAddress(gpu_addr);
+    ASSERT(cpu_addr);

    rasterizer.FlushAndInvalidateRegion(cache_addr, aligned_size);
    UnmapRange(gpu_addr, aligned_size);
+    ASSERT(system.CurrentProcess()
+               ->VMManager()
+               .SetMemoryAttribute(cpu_addr.value(), size, Kernel::MemoryAttribute::DeviceMapped,
+                                   Kernel::MemoryAttribute::None)
+               .IsSuccess());

    return gpu_addr;
}
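
Map and unmap are now symmetric with respect to the DeviceMapped attribute: MapBufferEx tags the guest range, UnmapBuffer flushes the cached data and clears the tag. A usage sketch of that pairing (the caller and its arguments are illustrative, not part of this change):

// Hypothetical caller, e.g. a service handler that owns a guest buffer.
void MapThenUnmap(Tegra::MemoryManager& memory_manager, VAddr cpu_addr, u64 size) {
    // Mapping marks the range Kernel::MemoryAttribute::DeviceMapped, so the
    // kernel rejects guest operations that would move or free it meanwhile.
    const GPUVAddr gpu_addr = memory_manager.MapBufferEx(cpu_addr, size);

    // ... submit GPU work referencing gpu_addr ...

    // Unmapping flushes and invalidates the cached range, then clears the
    // attribute, returning the pages to normal guest control.
    memory_manager.UnmapBuffer(gpu_addr, size);
}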
@@ -14,6 +14,10 @@ namespace VideoCore {
class RasterizerInterface;
}

+namespace Core {
+class System;
+}
+
namespace Tegra {

/**
@@ -47,7 +51,7 @@ struct VirtualMemoryArea {

class MemoryManager final {
public:
-    explicit MemoryManager(VideoCore::RasterizerInterface& rasterizer);
+    explicit MemoryManager(Core::System& system, VideoCore::RasterizerInterface& rasterizer);
    ~MemoryManager();

    GPUVAddr AllocateSpace(u64 size, u64 align);
@@ -173,6 +177,8 @@ private:
    Common::PageTable page_table{page_bits};
    VMAMap vma_map;
    VideoCore::RasterizerInterface& rasterizer;

+    Core::System& system;
};

} // namespace Tegra

@@ -34,6 +34,9 @@ public:
    /// Clear the current framebuffer
    virtual void Clear() = 0;

+    /// Dispatches a compute shader invocation
+    virtual void DispatchCompute(GPUVAddr code_addr) = 0;
+
    /// Notify rasterizer that all caches should be flushed to Switch memory
    virtual void FlushAll() = 0;

@@ -47,6 +50,9 @@ public:
    /// and invalidated
    virtual void FlushAndInvalidateRegion(CacheAddr addr, u64 size) = 0;

+    /// Notify rasterizer that a frame is about to finish
+    virtual void TickFrame() = 0;
+
    /// Attempt to use a faster method to perform a surface copy
    virtual bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
                                       const Tegra::Engines::Fermi2D::Regs::Surface& dst,
@@ -2,103 +2,57 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.

-#include <cstring>
-#include <memory>
-
-#include "common/alignment.h"
-#include "core/core.h"
-#include "video_core/memory_manager.h"
+#include <glad/glad.h>

+#include "common/assert.h"
#include "video_core/renderer_opengl/gl_buffer_cache.h"
#include "video_core/renderer_opengl/gl_rasterizer.h"
+#include "video_core/renderer_opengl/gl_resource_manager.h"

namespace OpenGL {

-CachedBufferEntry::CachedBufferEntry(VAddr cpu_addr, std::size_t size, GLintptr offset,
-                                     std::size_t alignment, u8* host_ptr)
-    : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, size{size}, offset{offset},
-      alignment{alignment} {}
+OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
+                               std::size_t stream_size)
+    : VideoCommon::BufferCache<OGLBuffer, GLuint, OGLStreamBuffer>{
+          rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} {}

-OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, std::size_t size)
-    : RasterizerCache{rasterizer}, stream_buffer(size, true) {}
OGLBufferCache::~OGLBufferCache() = default;

-GLintptr OGLBufferCache::UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment,
-                                      bool cache) {
-    std::lock_guard lock{mutex};
-    auto& memory_manager = Core::System::GetInstance().GPU().MemoryManager();
-
-    // Cache management is a big overhead, so only cache entries with a given size.
-    // TODO: Figure out which size is the best for given games.
-    cache &= size >= 2048;
-
-    const auto& host_ptr{memory_manager.GetPointer(gpu_addr)};
-    if (cache) {
-        auto entry = TryGet(host_ptr);
-        if (entry) {
-            if (entry->GetSize() >= size && entry->GetAlignment() == alignment) {
-                return entry->GetOffset();
-            }
-            Unregister(entry);
-        }
-    }
-
-    AlignBuffer(alignment);
-    const GLintptr uploaded_offset = buffer_offset;
-
-    if (!host_ptr) {
-        return uploaded_offset;
-    }
-
-    std::memcpy(buffer_ptr, host_ptr, size);
-    buffer_ptr += size;
-    buffer_offset += size;
-
-    if (cache) {
-        auto entry = std::make_shared<CachedBufferEntry>(
-            *memory_manager.GpuToCpuAddress(gpu_addr), size, uploaded_offset, alignment, host_ptr);
-        Register(entry);
-    }
-
-    return uploaded_offset;
+OGLBuffer OGLBufferCache::CreateBuffer(std::size_t size) {
+    OGLBuffer buffer;
+    buffer.Create();
+    glNamedBufferData(buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW);
+    return buffer;
}

-GLintptr OGLBufferCache::UploadHostMemory(const void* raw_pointer, std::size_t size,
-                                          std::size_t alignment) {
-    std::lock_guard lock{mutex};
-    AlignBuffer(alignment);
-    std::memcpy(buffer_ptr, raw_pointer, size);
-    const GLintptr uploaded_offset = buffer_offset;
-
-    buffer_ptr += size;
-    buffer_offset += size;
-    return uploaded_offset;
+const GLuint* OGLBufferCache::ToHandle(const OGLBuffer& buffer) {
+    return &buffer.handle;
}

-bool OGLBufferCache::Map(std::size_t max_size) {
-    bool invalidate;
-    std::tie(buffer_ptr, buffer_offset_base, invalidate) =
-        stream_buffer.Map(static_cast<GLsizeiptr>(max_size), 4);
-    buffer_offset = buffer_offset_base;
-
-    if (invalidate) {
-        InvalidateAll();
-    }
-    return invalidate;
+const GLuint* OGLBufferCache::GetEmptyBuffer(std::size_t) {
+    static const GLuint null_buffer = 0;
+    return &null_buffer;
}

-void OGLBufferCache::Unmap() {
-    stream_buffer.Unmap(buffer_offset - buffer_offset_base);
+void OGLBufferCache::UploadBufferData(const OGLBuffer& buffer, std::size_t offset, std::size_t size,
+                                      const u8* data) {
+    glNamedBufferSubData(buffer.handle, static_cast<GLintptr>(offset),
+                         static_cast<GLsizeiptr>(size), data);
}

-GLuint OGLBufferCache::GetHandle() const {
-    return stream_buffer.GetHandle();
+void OGLBufferCache::DownloadBufferData(const OGLBuffer& buffer, std::size_t offset,
+                                        std::size_t size, u8* data) {
+    glGetNamedBufferSubData(buffer.handle, static_cast<GLintptr>(offset),
+                            static_cast<GLsizeiptr>(size), data);
}

-void OGLBufferCache::AlignBuffer(std::size_t alignment) {
-    // Align the offset, not the mapped pointer
-    const GLintptr offset_aligned =
-        static_cast<GLintptr>(Common::AlignUp(static_cast<std::size_t>(buffer_offset), alignment));
-    buffer_ptr += offset_aligned - buffer_offset;
-    buffer_offset = offset_aligned;
+void OGLBufferCache::CopyBufferData(const OGLBuffer& src, const OGLBuffer& dst,
+                                    std::size_t src_offset, std::size_t dst_offset,
+                                    std::size_t size) {
+    glCopyNamedBufferSubData(src.handle, dst.handle, static_cast<GLintptr>(src_offset),
+                             static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size));
}

} // namespace OpenGL
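
After this rewrite the OpenGL file only supplies backend primitives; caching policy lives in the generic VideoCommon::BufferCache. A skeletal illustration of that split, with a stand-in base class (the real template carries more hooks and state):

#include <cstddef>
#include <cstdint>

// Stand-in for the generic cache: it decides *when* to create, upload,
// download or copy, and calls these hooks to do the backend work.
template <typename BufferT>
class GenericBufferCache {
protected:
    virtual ~GenericBufferCache() = default;
    virtual BufferT CreateBuffer(std::size_t size) = 0;
    virtual void UploadBufferData(const BufferT& buffer, std::size_t offset,
                                  std::size_t size, const std::uint8_t* data) = 0;
    virtual void DownloadBufferData(const BufferT& buffer, std::size_t offset,
                                    std::size_t size, std::uint8_t* data) = 0;
};

// A backend only translates each hook into its own API, exactly as
// OGLBufferCache does with glNamedBuffer* calls above.
class NullBufferCache final : public GenericBufferCache<int> {
    int CreateBuffer(std::size_t) override { return 0; }
    void UploadBufferData(const int&, std::size_t, std::size_t,
                          const std::uint8_t*) override {}
    void DownloadBufferData(const int&, std::size_t, std::size_t,
                            std::uint8_t*) override {}
};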
@@ -4,80 +4,44 @@

#pragma once

#include <cstddef>
#include <memory>
-#include <tuple>

#include "common/common_types.h"
+#include "video_core/buffer_cache.h"
-#include "video_core/rasterizer_cache.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/renderer_opengl/gl_stream_buffer.h"

+namespace Core {
+class System;
+}
+
namespace OpenGL {

+class OGLStreamBuffer;
class RasterizerOpenGL;

-class CachedBufferEntry final : public RasterizerCacheObject {
+class OGLBufferCache final : public VideoCommon::BufferCache<OGLBuffer, GLuint, OGLStreamBuffer> {
public:
-    explicit CachedBufferEntry(VAddr cpu_addr, std::size_t size, GLintptr offset,
-                               std::size_t alignment, u8* host_ptr);
+    explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
+                            std::size_t stream_size);
+    ~OGLBufferCache();

-    VAddr GetCpuAddr() const override {
-        return cpu_addr;
-    }
-
-    std::size_t GetSizeInBytes() const override {
-        return size;
-    }
-
-    std::size_t GetSize() const {
-        return size;
-    }
-
-    GLintptr GetOffset() const {
-        return offset;
-    }
-
-    std::size_t GetAlignment() const {
-        return alignment;
-    }
-
-private:
-    VAddr cpu_addr{};
-    std::size_t size{};
-    GLintptr offset{};
-    std::size_t alignment{};
-};
-
-class OGLBufferCache final : public RasterizerCache<std::shared_ptr<CachedBufferEntry>> {
-public:
-    explicit OGLBufferCache(RasterizerOpenGL& rasterizer, std::size_t size);
-
-    /// Uploads data from a guest GPU address. Returns host's buffer offset where it's been
-    /// allocated.
-    GLintptr UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4,
-                          bool cache = true);
-
-    /// Uploads from a host memory. Returns host's buffer offset where it's been allocated.
-    GLintptr UploadHostMemory(const void* raw_pointer, std::size_t size, std::size_t alignment = 4);
-
-    bool Map(std::size_t max_size);
-    void Unmap();
-
-    GLuint GetHandle() const;
+    const GLuint* GetEmptyBuffer(std::size_t) override;

protected:
-    void AlignBuffer(std::size_t alignment);
+    OGLBuffer CreateBuffer(std::size_t size) override;

-    // We do not have to flush this cache as things in it are never modified by us.
-    void FlushObjectInner(const std::shared_ptr<CachedBufferEntry>& object) override {}
+    const GLuint* ToHandle(const OGLBuffer& buffer) override;

-private:
-    OGLStreamBuffer stream_buffer;
+    void UploadBufferData(const OGLBuffer& buffer, std::size_t offset, std::size_t size,
+                          const u8* data) override;

-    u8* buffer_ptr = nullptr;
-    GLintptr buffer_offset = 0;
-    GLintptr buffer_offset_base = 0;
+    void DownloadBufferData(const OGLBuffer& buffer, std::size_t offset, std::size_t size,
+                            u8* data) override;

+    void CopyBufferData(const OGLBuffer& src, const OGLBuffer& dst, std::size_t src_offset,
+                        std::size_t dst_offset, std::size_t size) override;
};

} // namespace OpenGL
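
For callers, the visible change is that UploadMemory and UploadHostMemory now return a (buffer, offset) pair rather than a bare stream-buffer offset, since an upload may land in any backend buffer. A sketch of the shape of a migrated call site, modeled on the rasterizer hunks later in this diff; the GPUVAddr alias and the assumption that the pair decomposes to a raw handle and an offset are for self-containment only:

#include <cstddef>
#include <cstdint>
#include <glad/glad.h>

using GPUVAddr = std::uint64_t; // alias for illustration

// BufferCache is any type exposing the pair-returning UploadMemory above.
template <typename BufferCache>
void BindGuestVertexBuffer(BufferCache& buffer_cache, GLuint vao, GLuint index,
                           GPUVAddr gpu_addr, std::size_t size, GLsizei stride) {
    // Previously: a GLintptr offset into one global stream buffer.
    // Now: the upload also names the destination buffer.
    const auto [buffer, offset] = buffer_cache.UploadMemory(gpu_addr, size);
    glVertexArrayVertexBuffer(vao, index, buffer, offset, stride);
}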
@@ -24,8 +24,10 @@ T GetInteger(GLenum pname) {

Device::Device() {
    uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);
+    shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
    max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS);
    max_varyings = GetInteger<u32>(GL_MAX_VARYING_VECTORS);
+    has_vertex_viewport_layer = GLAD_GL_ARB_shader_viewport_layer_array;
    has_variable_aoffi = TestVariableAoffi();
    has_component_indexing_bug = TestComponentIndexingBug();
}
@@ -34,6 +36,7 @@ Device::Device(std::nullptr_t) {
    uniform_buffer_alignment = 0;
    max_vertex_attributes = 16;
    max_varyings = 15;
+    has_vertex_viewport_layer = true;
    has_variable_aoffi = true;
    has_component_indexing_bug = false;
}

@@ -18,6 +18,10 @@ public:
        return uniform_buffer_alignment;
    }

+    std::size_t GetShaderStorageBufferAlignment() const {
+        return shader_storage_alignment;
+    }
+
    u32 GetMaxVertexAttributes() const {
        return max_vertex_attributes;
    }
@@ -26,6 +30,10 @@ public:
        return max_varyings;
    }

+    bool HasVertexViewportLayer() const {
+        return has_vertex_viewport_layer;
+    }
+
    bool HasVariableAoffi() const {
        return has_variable_aoffi;
    }
@@ -39,8 +47,10 @@ private:
    static bool TestComponentIndexingBug();

    std::size_t uniform_buffer_alignment{};
+    std::size_t shader_storage_alignment{};
    u32 max_vertex_attributes{};
    u32 max_varyings{};
+    bool has_vertex_viewport_layer{};
    bool has_variable_aoffi{};
    bool has_component_indexing_bug{};
};
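
Device reads its limits through the GetInteger<T> helper named in the first hunk header above; the helper's body is outside this diff, so the following is an assumed, typical implementation of such a wrapper:

#include <glad/glad.h>

namespace {

// Assumed implementation: query one integer limit, cast to the caller's type.
template <typename T>
T GetInteger(GLenum pname) {
    GLint temporary;
    glGetIntegerv(pname, &temporary);
    return static_cast<T>(temporary);
}

} // namespace

// Example, as used by Device::Device():
// shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);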
@@ -1,102 +0,0 @@
// Copyright 2018 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.

#include <glad/glad.h>

#include "common/logging/log.h"
#include "core/core.h"
#include "video_core/memory_manager.h"
#include "video_core/renderer_opengl/gl_global_cache.h"
#include "video_core/renderer_opengl/gl_rasterizer.h"
#include "video_core/renderer_opengl/gl_shader_decompiler.h"
#include "video_core/renderer_opengl/utils.h"

namespace OpenGL {

CachedGlobalRegion::CachedGlobalRegion(VAddr cpu_addr, u8* host_ptr, u32 size, u32 max_size)
    : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, host_ptr{host_ptr}, size{size},
      max_size{max_size} {
    buffer.Create();
    LabelGLObject(GL_BUFFER, buffer.handle, cpu_addr, "GlobalMemory");
}

CachedGlobalRegion::~CachedGlobalRegion() = default;

void CachedGlobalRegion::Reload(u32 size_) {
    size = size_;
    if (size > max_size) {
        size = max_size;
        LOG_CRITICAL(HW_GPU, "Global region size {} exceeded the supported size {}!", size_,
                     max_size);
    }
    glNamedBufferData(buffer.handle, size, host_ptr, GL_STREAM_DRAW);
}

void CachedGlobalRegion::Flush() {
    LOG_DEBUG(Render_OpenGL, "Flushing {} bytes to CPU memory address 0x{:16}", size, cpu_addr);
    glGetNamedBufferSubData(buffer.handle, 0, static_cast<GLsizeiptr>(size), host_ptr);
}

GlobalRegion GlobalRegionCacheOpenGL::TryGetReservedGlobalRegion(CacheAddr addr, u32 size) const {
    const auto search{reserve.find(addr)};
    if (search == reserve.end()) {
        return {};
    }
    return search->second;
}

GlobalRegion GlobalRegionCacheOpenGL::GetUncachedGlobalRegion(GPUVAddr addr, u8* host_ptr,
                                                              u32 size) {
    GlobalRegion region{TryGetReservedGlobalRegion(ToCacheAddr(host_ptr), size)};
    if (!region) {
        // No reserved surface available, create a new one and reserve it
        auto& memory_manager{Core::System::GetInstance().GPU().MemoryManager()};
        const auto cpu_addr{memory_manager.GpuToCpuAddress(addr)};
        ASSERT(cpu_addr);

        region = std::make_shared<CachedGlobalRegion>(*cpu_addr, host_ptr, size, max_ssbo_size);
        ReserveGlobalRegion(region);
    }
    region->Reload(size);
    return region;
}

void GlobalRegionCacheOpenGL::ReserveGlobalRegion(GlobalRegion region) {
    reserve.insert_or_assign(region->GetCacheAddr(), std::move(region));
}

GlobalRegionCacheOpenGL::GlobalRegionCacheOpenGL(RasterizerOpenGL& rasterizer)
    : RasterizerCache{rasterizer} {
    GLint max_ssbo_size_;
    glGetIntegerv(GL_MAX_SHADER_STORAGE_BLOCK_SIZE, &max_ssbo_size_);
    max_ssbo_size = static_cast<u32>(max_ssbo_size_);
}

GlobalRegion GlobalRegionCacheOpenGL::GetGlobalRegion(
    const GLShader::GlobalMemoryEntry& global_region,
    Tegra::Engines::Maxwell3D::Regs::ShaderStage stage) {
    std::lock_guard lock{mutex};

    auto& gpu{Core::System::GetInstance().GPU()};
    auto& memory_manager{gpu.MemoryManager()};
    const auto cbufs{gpu.Maxwell3D().state.shader_stages[static_cast<std::size_t>(stage)]};
    const auto addr{cbufs.const_buffers[global_region.GetCbufIndex()].address +
                    global_region.GetCbufOffset()};
    const auto actual_addr{memory_manager.Read<u64>(addr)};
    const auto size{memory_manager.Read<u32>(addr + 8)};

    // Look up global region in the cache based on address
    const auto& host_ptr{memory_manager.GetPointer(actual_addr)};
    GlobalRegion region{TryGet(host_ptr)};

    if (!region) {
        // No global region found - create a new one
        region = GetUncachedGlobalRegion(actual_addr, host_ptr, size);
        Register(region);
    }

    return region;
}

} // namespace OpenGL
@@ -1,82 +0,0 @@
// Copyright 2018 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.

#pragma once

#include <memory>
#include <unordered_map>

#include <glad/glad.h>

#include "common/assert.h"
#include "common/common_types.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/rasterizer_cache.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"

namespace OpenGL {

namespace GLShader {
class GlobalMemoryEntry;
}

class RasterizerOpenGL;
class CachedGlobalRegion;
using GlobalRegion = std::shared_ptr<CachedGlobalRegion>;

class CachedGlobalRegion final : public RasterizerCacheObject {
public:
    explicit CachedGlobalRegion(VAddr cpu_addr, u8* host_ptr, u32 size, u32 max_size);
    ~CachedGlobalRegion();

    VAddr GetCpuAddr() const override {
        return cpu_addr;
    }

    std::size_t GetSizeInBytes() const override {
        return size;
    }

    /// Gets the GL program handle for the buffer
    GLuint GetBufferHandle() const {
        return buffer.handle;
    }

    /// Reloads the global region from guest memory
    void Reload(u32 size_);

    void Flush();

private:
    VAddr cpu_addr{};
    u8* host_ptr{};
    u32 size{};
    u32 max_size{};

    OGLBuffer buffer;
};

class GlobalRegionCacheOpenGL final : public RasterizerCache<GlobalRegion> {
public:
    explicit GlobalRegionCacheOpenGL(RasterizerOpenGL& rasterizer);

    /// Gets the current specified shader stage program
    GlobalRegion GetGlobalRegion(const GLShader::GlobalMemoryEntry& descriptor,
                                 Tegra::Engines::Maxwell3D::Regs::ShaderStage stage);

protected:
    void FlushObjectInner(const GlobalRegion& object) override {
        object->Flush();
    }

private:
    GlobalRegion TryGetReservedGlobalRegion(CacheAddr addr, u32 size) const;
    GlobalRegion GetUncachedGlobalRegion(GPUVAddr addr, u8* host_ptr, u32 size);
    void ReserveGlobalRegion(GlobalRegion region);

    std::unordered_map<CacheAddr, GlobalRegion> reserve;
    u32 max_ssbo_size{};
};

} // namespace OpenGL
@@ -4,6 +4,7 @@
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <bitset>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
@@ -19,7 +20,9 @@
|
||||
#include "core/core.h"
|
||||
#include "core/hle/kernel/process.h"
|
||||
#include "core/settings.h"
|
||||
#include "video_core/engines/kepler_compute.h"
|
||||
#include "video_core/engines/maxwell_3d.h"
|
||||
#include "video_core/memory_manager.h"
|
||||
#include "video_core/renderer_opengl/gl_rasterizer.h"
|
||||
#include "video_core/renderer_opengl/gl_shader_cache.h"
|
||||
#include "video_core/renderer_opengl/gl_shader_gen.h"
|
||||
@@ -80,16 +83,31 @@ struct DrawParameters {
|
||||
}
|
||||
};
|
||||
|
||||
static std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer,
|
||||
const GLShader::ConstBufferEntry& entry) {
|
||||
if (!entry.IsIndirect()) {
|
||||
return entry.GetSize();
|
||||
}
|
||||
|
||||
if (buffer.size > Maxwell::MaxConstBufferSize) {
|
||||
LOG_WARNING(Render_OpenGL, "Indirect constbuffer size {} exceeds maximum {}", buffer.size,
|
||||
Maxwell::MaxConstBufferSize);
|
||||
return Maxwell::MaxConstBufferSize;
|
||||
}
|
||||
|
||||
return buffer.size;
|
||||
}
|
||||
|
||||
RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
|
||||
ScreenInfo& info)
|
||||
: texture_cache{system, *this, device}, shader_cache{*this, system, emu_window, device},
|
||||
global_cache{*this}, system{system}, screen_info{info},
|
||||
buffer_cache(*this, STREAM_BUFFER_SIZE) {
|
||||
system{system}, screen_info{info}, buffer_cache{*this, system, STREAM_BUFFER_SIZE} {
|
||||
OpenGLState::ApplyDefaultState();
|
||||
|
||||
shader_program_manager = std::make_unique<GLShader::ProgramManager>();
|
||||
state.draw.shader_program = 0;
|
||||
state.Apply();
|
||||
clear_framebuffer.Create();
|
||||
|
||||
LOG_DEBUG(Render_OpenGL, "Sync fixed function OpenGL state here");
|
||||
CheckExtensions();
|
||||
@@ -109,10 +127,10 @@ GLuint RasterizerOpenGL::SetupVertexFormat() {
|
||||
auto& gpu = system.GPU().Maxwell3D();
|
||||
const auto& regs = gpu.regs;
|
||||
|
||||
if (!gpu.dirty_flags.vertex_attrib_format) {
|
||||
if (!gpu.dirty.vertex_attrib_format) {
|
||||
return state.draw.vertex_array;
|
||||
}
|
||||
gpu.dirty_flags.vertex_attrib_format = false;
|
||||
gpu.dirty.vertex_attrib_format = false;
|
||||
|
||||
MICROPROFILE_SCOPE(OpenGL_VAO);
|
||||
|
||||
@@ -129,8 +147,6 @@ GLuint RasterizerOpenGL::SetupVertexFormat() {
|
||||
state.draw.vertex_array = vao;
|
||||
state.ApplyVertexArrayState();
|
||||
|
||||
glVertexArrayElementBuffer(vao, buffer_cache.GetHandle());
|
||||
|
||||
// Use the vertex array as-is, assumes that the data is formatted correctly for OpenGL.
|
||||
// Enables the first 16 vertex attributes always, as we don't know which ones are actually
|
||||
// used until shader time. Note, Tegra technically supports 32, but we're capping this to 16
@@ -168,7 +184,7 @@ GLuint RasterizerOpenGL::SetupVertexFormat() {
}

// Rebinding the VAO invalidates the vertex buffer bindings.
gpu.dirty_flags.vertex_array.set();
gpu.dirty.ResetVertexArrays();

state.draw.vertex_array = vao_entry.handle;
return vao_entry.handle;
@@ -176,17 +192,20 @@ GLuint RasterizerOpenGL::SetupVertexFormat() {

void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) {
auto& gpu = system.GPU().Maxwell3D();
const auto& regs = gpu.regs;

if (gpu.dirty_flags.vertex_array.none())
if (!gpu.dirty.vertex_array_buffers)
return;
gpu.dirty.vertex_array_buffers = false;

const auto& regs = gpu.regs;

MICROPROFILE_SCOPE(OpenGL_VB);

// Upload all guest vertex arrays sequentially to our buffer
for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) {
if (!gpu.dirty_flags.vertex_array[index])
if (!gpu.dirty.vertex_array[index])
continue;
gpu.dirty.vertex_array[index] = false;
gpu.dirty.vertex_instance[index] = false;

const auto& vertex_array = regs.vertex_array[index];
if (!vertex_array.IsEnabled())
@@ -197,11 +216,11 @@ void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) {

ASSERT(end > start);
const u64 size = end - start + 1;
const GLintptr vertex_buffer_offset = buffer_cache.UploadMemory(start, size);
const auto [vertex_buffer, vertex_buffer_offset] = buffer_cache.UploadMemory(start, size);

// Bind the vertex array to the buffer at the current offset.
glVertexArrayVertexBuffer(vao, index, buffer_cache.GetHandle(), vertex_buffer_offset,
vertex_array.stride);
vertex_array_pushbuffer.SetVertexBuffer(index, vertex_buffer, vertex_buffer_offset,
vertex_array.stride);

if (regs.instanced_arrays.IsInstancingEnabled(index) && vertex_array.divisor != 0) {
// Enable vertex buffer instancing with the specified divisor.
@@ -211,11 +230,47 @@ void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) {
glVertexArrayBindingDivisor(vao, index, 0);
}
}

gpu.dirty_flags.vertex_array.reset();
}

DrawParameters RasterizerOpenGL::SetupDraw() {
void RasterizerOpenGL::SetupVertexInstances(GLuint vao) {
auto& gpu = system.GPU().Maxwell3D();

if (!gpu.dirty.vertex_instances)
return;
gpu.dirty.vertex_instances = false;

const auto& regs = gpu.regs;
// Upload all guest vertex arrays sequentially to our buffer
for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) {
if (!gpu.dirty.vertex_instance[index])
continue;

gpu.dirty.vertex_instance[index] = false;

if (regs.instanced_arrays.IsInstancingEnabled(index) &&
regs.vertex_array[index].divisor != 0) {
// Enable vertex buffer instancing with the specified divisor.
glVertexArrayBindingDivisor(vao, index, regs.vertex_array[index].divisor);
} else {
// Disable the vertex buffer instancing.
glVertexArrayBindingDivisor(vao, index, 0);
}
}
}
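
Both branches above funnel into glVertexArrayBindingDivisor, whose semantics carry the whole instancing story: a divisor of N advances the vertex buffer binding once every N instances, and 0 restores ordinary per-vertex stepping. A minimal sketch, assuming a valid vao and binding index:

    glVertexArrayBindingDivisor(vao, index, 4); // advance this binding once every 4 instances
    glVertexArrayBindingDivisor(vao, index, 0); // plain per-vertex advancement again
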

GLintptr RasterizerOpenGL::SetupIndexBuffer() {
if (accelerate_draw != AccelDraw::Indexed) {
return 0;
}
MICROPROFILE_SCOPE(OpenGL_Index);
const auto& regs = system.GPU().Maxwell3D().regs;
const std::size_t size = CalculateIndexBufferSize();
const auto [buffer, offset] = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size);
vertex_array_pushbuffer.SetIndexBuffer(buffer);
return offset;
}

DrawParameters RasterizerOpenGL::SetupDraw(GLintptr index_buffer_offset) {
const auto& gpu = system.GPU().Maxwell3D();
const auto& regs = gpu.regs;
const bool is_indexed = accelerate_draw == AccelDraw::Indexed;
@@ -227,11 +282,9 @@ DrawParameters RasterizerOpenGL::SetupDraw() {
params.primitive_mode = MaxwellToGL::PrimitiveTopology(regs.draw.topology);

if (is_indexed) {
MICROPROFILE_SCOPE(OpenGL_Index);
params.index_format = MaxwellToGL::IndexFormat(regs.index_array.format);
params.count = regs.index_array.count;
params.index_buffer_offset =
buffer_cache.UploadMemory(regs.index_array.IndexStart(), CalculateIndexBufferSize());
params.index_buffer_offset = index_buffer_offset;
params.base_vertex = static_cast<GLint>(regs.vb_element_base);
} else {
params.count = regs.vertex_buffer.count;
@@ -247,10 +300,6 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
BaseBindings base_bindings;
std::array<bool, Maxwell::NumClipDistances> clip_distances{};

// Prepare packed bindings
bind_ubo_pushbuffer.Setup(base_bindings.cbuf);
bind_ssbo_pushbuffer.Setup(base_bindings.gmem);

for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {
const auto& shader_config = gpu.regs.shader_config[index];
const Maxwell::ShaderProgram program{static_cast<Maxwell::ShaderProgram>(index)};
@@ -271,18 +320,17 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {

GLShader::MaxwellUniformData ubo{};
ubo.SetFromRegs(gpu, stage);
const GLintptr offset =
const auto [buffer, offset] =
buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment());

// Bind the emulation info buffer
bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), offset,
static_cast<GLsizeiptr>(sizeof(ubo)));
bind_ubo_pushbuffer.Push(buffer, offset, static_cast<GLsizeiptr>(sizeof(ubo)));

Shader shader{shader_cache.GetStageProgram(program)};

const auto stage_enum{static_cast<Maxwell::ShaderStage>(stage)};
const auto stage_enum = static_cast<Maxwell::ShaderStage>(stage);
SetupDrawConstBuffers(stage_enum, shader);
SetupGlobalRegions(stage_enum, shader);
SetupDrawGlobalMemory(stage_enum, shader);
const auto texture_buffer_usage{SetupTextures(stage_enum, shader, base_bindings)};

const ProgramVariant variant{base_bindings, primitive_mode, texture_buffer_usage};
@@ -321,12 +369,9 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
base_bindings = next_bindings;
}

bind_ubo_pushbuffer.Bind();
bind_ssbo_pushbuffer.Bind();

SyncClipEnabled(clip_distances);

gpu.dirty_flags.shaders = false;
gpu.dirty.shaders = false;
}

std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const {
@@ -409,13 +454,13 @@ std::pair<bool, bool> RasterizerOpenGL::ConfigureFramebuffers(

const FramebufferConfigState fb_config_state{using_color_fb, using_depth_fb, preserve_contents,
single_color_target};
if (fb_config_state == current_framebuffer_config_state &&
gpu.dirty_flags.color_buffer.none() && !gpu.dirty_flags.zeta_buffer) {
if (fb_config_state == current_framebuffer_config_state && !gpu.dirty.render_settings) {
// Only skip if the previous ConfigureFramebuffers call was from the same kind (multiple or
// single color targets). This is done because the guest registers may not change but the
// host framebuffer may contain different attachments
return current_depth_stencil_usage;
}
gpu.dirty.render_settings = false;
current_framebuffer_config_state = fb_config_state;

texture_cache.GuardRenderTargets(true);
@@ -504,13 +549,71 @@ std::pair<bool, bool> RasterizerOpenGL::ConfigureFramebuffers(
return current_depth_stencil_usage = {static_cast<bool>(depth_surface), fbkey.stencil_enable};
}

void RasterizerOpenGL::ConfigureClearFramebuffer(OpenGLState& current_state, bool using_color_fb,
bool using_depth_fb, bool using_stencil_fb) {
auto& gpu = system.GPU().Maxwell3D();
const auto& regs = gpu.regs;

texture_cache.GuardRenderTargets(true);
View color_surface{};
if (using_color_fb) {
color_surface = texture_cache.GetColorBufferSurface(regs.clear_buffers.RT, false);
}
View depth_surface{};
if (using_depth_fb || using_stencil_fb) {
depth_surface = texture_cache.GetDepthBufferSurface(false);
}
texture_cache.GuardRenderTargets(false);

current_state.draw.draw_framebuffer = clear_framebuffer.handle;
current_state.ApplyFramebufferState();

if (color_surface) {
color_surface->Attach(GL_COLOR_ATTACHMENT0, GL_DRAW_FRAMEBUFFER);
} else {
glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0);
}

if (depth_surface) {
const auto& params = depth_surface->GetSurfaceParams();
switch (params.type) {
case VideoCore::Surface::SurfaceType::Depth: {
depth_surface->Attach(GL_DEPTH_ATTACHMENT, GL_DRAW_FRAMEBUFFER);
glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0);
break;
}
case VideoCore::Surface::SurfaceType::DepthStencil: {
depth_surface->Attach(GL_DEPTH_ATTACHMENT, GL_DRAW_FRAMEBUFFER);
break;
}
default: { UNIMPLEMENTED(); }
}
} else {
glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0,
0);
}
}

void RasterizerOpenGL::Clear() {
const auto& regs = system.GPU().Maxwell3D().regs;
const auto& maxwell3d = system.GPU().Maxwell3D();

if (!maxwell3d.ShouldExecute()) {
return;
}

const auto& regs = maxwell3d.regs;
bool use_color{};
bool use_depth{};
bool use_stencil{};

OpenGLState clear_state;
OpenGLState prev_state{OpenGLState::GetCurState()};
SCOPE_EXIT({
prev_state.AllDirty();
prev_state.Apply();
});

OpenGLState clear_state{OpenGLState::GetCurState()};
clear_state.SetDefaultViewports();
if (regs.clear_buffers.R || regs.clear_buffers.G || regs.clear_buffers.B ||
regs.clear_buffers.A) {
use_color = true;
@@ -530,6 +633,7 @@ void RasterizerOpenGL::Clear() {
// true.
clear_state.depth.test_enabled = true;
clear_state.depth.test_func = GL_ALWAYS;
clear_state.depth.write_mask = GL_TRUE;
}
if (regs.clear_buffers.S) {
ASSERT_MSG(regs.zeta_enable != 0, "Tried to clear stencil but buffer is not enabled!");
@@ -566,8 +670,9 @@ void RasterizerOpenGL::Clear() {
return;
}

const auto [clear_depth, clear_stencil] = ConfigureFramebuffers(
clear_state, use_color, use_depth || use_stencil, false, regs.clear_buffers.RT.Value());
ConfigureClearFramebuffer(clear_state, use_color, use_depth, use_stencil);

SyncViewport(clear_state);
if (regs.clear_flags.scissor) {
SyncScissorTest(clear_state);
}
@@ -576,21 +681,18 @@ void RasterizerOpenGL::Clear() {
clear_state.EmulateViewportWithScissor();
}

clear_state.ApplyColorMask();
clear_state.ApplyDepth();
clear_state.ApplyStencilTest();
clear_state.ApplyViewport();
clear_state.ApplyFramebufferState();
clear_state.AllDirty();
clear_state.Apply();

if (use_color) {
glClearBufferfv(GL_COLOR, regs.clear_buffers.RT, regs.clear_color);
glClearBufferfv(GL_COLOR, 0, regs.clear_color);
}

if (clear_depth && clear_stencil) {
if (use_depth && use_stencil) {
glClearBufferfi(GL_DEPTH_STENCIL, 0, regs.clear_depth, regs.clear_stencil);
} else if (clear_depth) {
} else if (use_depth) {
glClearBufferfv(GL_DEPTH, 0, &regs.clear_depth);
} else if (clear_stencil) {
} else if (use_stencil) {
glClearBufferiv(GL_STENCIL, 0, &regs.clear_stencil);
}
}
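
The if-chain above maps one-to-one onto the three glClearBuffer entry points. A standalone sketch of the same calls with literal values (draw buffer 0 is the attachment wired up by ConfigureClearFramebuffer):

    const GLfloat color[4] = {0.0f, 0.0f, 0.0f, 1.0f};
    glClearBufferfv(GL_COLOR, 0, color);           // color clear on draw buffer 0
    glClearBufferfi(GL_DEPTH_STENCIL, 0, 1.0f, 0); // fused depth + stencil clear
    const GLint stencil = 0;
    glClearBufferiv(GL_STENCIL, 0, &stencil);      // stencil-only clear
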
@@ -601,6 +703,11 @@ void RasterizerOpenGL::DrawArrays() {

MICROPROFILE_SCOPE(OpenGL_Drawing);
auto& gpu = system.GPU().Maxwell3D();

if (!gpu.ShouldExecute()) {
return;
}

const auto& regs = gpu.regs;

SyncColorMask();
@@ -634,26 +741,47 @@ void RasterizerOpenGL::DrawArrays() {
Maxwell::MaxShaderStage;

// Add space for at least 18 constant buffers
buffer_size +=
Maxwell::MaxConstBuffers * (MaxConstbufferSize + device.GetUniformBufferAlignment());
buffer_size += Maxwell::MaxConstBuffers *
(Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());

const bool invalidate = buffer_cache.Map(buffer_size);
if (invalidate) {
// As all cached buffers are invalidated, we need to recheck their state.
gpu.dirty_flags.vertex_array.set();
}
// Prepare the vertex array.
buffer_cache.Map(buffer_size);

// Prepare vertex array format.
const GLuint vao = SetupVertexFormat();
SetupVertexBuffer(vao);
vertex_array_pushbuffer.Setup(vao);

DrawParameters params = SetupDraw();
// Upload vertex and index data.
SetupVertexBuffer(vao);
SetupVertexInstances(vao);
const GLintptr index_buffer_offset = SetupIndexBuffer();

// Setup draw parameters. It will automatically choose what glDraw* method to use.
const DrawParameters params = SetupDraw(index_buffer_offset);

// Prepare packed bindings.
bind_ubo_pushbuffer.Setup(0);
bind_ssbo_pushbuffer.Setup(0);

// Setup shaders and their used resources.
texture_cache.GuardSamplers(true);
SetupShaders(params.primitive_mode);
texture_cache.GuardSamplers(false);

ConfigureFramebuffers(state);

buffer_cache.Unmap();
// Signal the buffer cache that we are not going to upload more things.
const bool invalidate = buffer_cache.Unmap();

// Now that we are no longer uploading data, we can safely bind the buffers to OpenGL.
vertex_array_pushbuffer.Bind();
bind_ubo_pushbuffer.Bind();
bind_ssbo_pushbuffer.Bind();

if (invalidate) {
// As all cached buffers are invalidated, we need to recheck their state.
gpu.dirty.ResetVertexArrays();
}

shader_program_manager->ApplyTo(state);
state.Apply();
@@ -665,6 +793,46 @@ void RasterizerOpenGL::DrawArrays() {
params.DispatchDraw();

accelerate_draw = AccelDraw::Disabled;
gpu.dirty.memory_general = false;
}
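
The Map/Unmap bracketing above is the core of the reordering: every Setup* call only records buffer ranges while the stream buffer is mapped, and the recorded ranges are handed to OpenGL in one go once Unmap has fixed the final handles. A minimal sketch of that record-then-flush idea; the type and names here are illustrative, not the project's actual push-buffer classes:

    #include <cstddef>
    #include <vector>
    // Assumes an OpenGL loader (e.g. glad) has been initialized.

    struct PendingRange {
        GLuint buffer;
        GLintptr offset;
        GLsizeiptr size;
    };

    class RangePushBuffer {
    public:
        void Push(GLuint buffer, GLintptr offset, GLsizeiptr size) {
            pending.push_back({buffer, offset, size}); // record only; no GL call yet
        }
        void Bind(GLenum target, GLuint first_index) {
            for (std::size_t i = 0; i < pending.size(); ++i) {
                const PendingRange& r = pending[i];
                glBindBufferRange(target, first_index + static_cast<GLuint>(i), r.buffer,
                                  r.offset, r.size);
            }
            pending.clear();
        }
    private:
        std::vector<PendingRange> pending;
    };
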

void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
if (!GLAD_GL_ARB_compute_variable_group_size) {
LOG_ERROR(Render_OpenGL, "Compute is currently not supported on this device due to the "
"lack of GL_ARB_compute_variable_group_size");
return;
}

auto kernel = shader_cache.GetComputeKernel(code_addr);
const auto [program, next_bindings] = kernel->GetProgramHandle({});
state.draw.shader_program = program;
state.draw.program_pipeline = 0;

const std::size_t buffer_size =
Tegra::Engines::KeplerCompute::NumConstBuffers *
(Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());
buffer_cache.Map(buffer_size);

bind_ubo_pushbuffer.Setup(0);
bind_ssbo_pushbuffer.Setup(0);

SetupComputeConstBuffers(kernel);
SetupComputeGlobalMemory(kernel);

// TODO(Rodrigo): Bind images and samplers

buffer_cache.Unmap();

bind_ubo_pushbuffer.Bind();
bind_ssbo_pushbuffer.Bind();

state.ApplyShaderProgram();
state.ApplyProgramPipeline();

const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
glDispatchComputeGroupSizeARB(launch_desc.grid_dim_x, launch_desc.grid_dim_y,
launch_desc.grid_dim_z, launch_desc.block_dim_x,
launch_desc.block_dim_y, launch_desc.block_dim_z);
}
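
Unlike a plain glDispatchCompute, the GroupSizeARB variant takes the workgroup (block) dimensions as arguments, which is what lets the guest-supplied launch description drive the dispatch without baking sizes into the GLSL. A minimal usage sketch under the same extension guard (the literal values are placeholders):

    if (GLAD_GL_ARB_compute_variable_group_size) {
        // 8x8x1 groups over a 64x64x1 grid; the shader declares
        // 'layout (local_size_variable) in;' instead of fixed sizes.
        glDispatchComputeGroupSizeARB(64, 64, 1, 8, 8, 1);
    }
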

void RasterizerOpenGL::FlushAll() {}
@@ -675,7 +843,7 @@ void RasterizerOpenGL::FlushRegion(CacheAddr addr, u64 size) {
return;
}
texture_cache.FlushRegion(addr, size);
global_cache.FlushRegion(addr, size);
buffer_cache.FlushRegion(addr, size);
}

void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) {
@@ -685,7 +853,6 @@ void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) {
}
texture_cache.InvalidateRegion(addr, size);
shader_cache.InvalidateRegion(addr, size);
global_cache.InvalidateRegion(addr, size);
buffer_cache.InvalidateRegion(addr, size);
}

@@ -696,6 +863,10 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
InvalidateRegion(addr, size);
}

void RasterizerOpenGL::TickFrame() {
buffer_cache.TickFrame();
}

bool RasterizerOpenGL::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
const Tegra::Engines::Fermi2D::Regs::Surface& dst,
const Tegra::Engines::Fermi2D::Config& copy_config) {
@@ -737,14 +908,25 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
void RasterizerOpenGL::SetupDrawConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
const Shader& shader) {
MICROPROFILE_SCOPE(OpenGL_UBO);
const auto stage_index = static_cast<std::size_t>(stage);
const auto& shader_stage = system.GPU().Maxwell3D().state.shader_stages[stage_index];
const auto& entries = shader->GetShaderEntries().const_buffers;
const auto& stages = system.GPU().Maxwell3D().state.shader_stages;
const auto& shader_stage = stages[static_cast<std::size_t>(stage)];
for (const auto& entry : shader->GetShaderEntries().const_buffers) {
const auto& buffer = shader_stage.const_buffers[entry.GetIndex()];
SetupConstBuffer(buffer, entry);
}
}

// Upload only the enabled buffers from the 16 constbuffers of each shader stage
for (u32 bindpoint = 0; bindpoint < entries.size(); ++bindpoint) {
const auto& entry = entries[bindpoint];
SetupConstBuffer(shader_stage.const_buffers[entry.GetIndex()], entry);
void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) {
MICROPROFILE_SCOPE(OpenGL_UBO);
const auto& launch_desc = system.GPU().KeplerCompute().launch_description;
for (const auto& entry : kernel->GetShaderEntries().const_buffers) {
const auto& config = launch_desc.const_buffer_config[entry.GetIndex()];
const std::bitset<8> mask = launch_desc.memory_config.const_buffer_enable_mask.Value();
Tegra::Engines::ConstBufferInfo buffer;
buffer.address = config.Address();
buffer.size = config.size;
buffer.enabled = mask[entry.GetIndex()];
SetupConstBuffer(buffer, entry);
}
}
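
The enable mask above is a packed bitfield with one bit per const buffer slot, so membership testing is a plain std::bitset index. A self-contained sketch of that decoding step, assuming the 8-slot mask width used above:

    #include <bitset>
    #include <cstddef>
    #include <cstdint>

    bool IsConstBufferEnabled(std::uint32_t mask_value, std::size_t index) {
        const std::bitset<8> mask(mask_value); // bit i <=> const buffer i is bound
        return mask[index];
    }
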

@@ -752,49 +934,52 @@ void RasterizerOpenGL::SetupConstBuffer(const Tegra::Engines::ConstBufferInfo& b
const GLShader::ConstBufferEntry& entry) {
if (!buffer.enabled) {
// Set values to zero to unbind buffers
bind_ubo_pushbuffer.Push(0, 0, 0);
bind_ubo_pushbuffer.Push(buffer_cache.GetEmptyBuffer(sizeof(float)), 0, sizeof(float));
return;
}

std::size_t size;
if (entry.IsIndirect()) {
// Buffer is accessed indirectly, so upload the entire thing
size = buffer.size;

if (size > MaxConstbufferSize) {
LOG_WARNING(Render_OpenGL, "Indirect constbuffer size {} exceeds maximum {}", size,
MaxConstbufferSize);
size = MaxConstbufferSize;
}
} else {
// Buffer is accessed directly, upload just what we use
size = entry.GetSize();
}

// Align the actual size so it ends up being a multiple of vec4 to meet the OpenGL std140
// UBO alignment requirements.
size = Common::AlignUp(size, sizeof(GLvec4));
ASSERT_MSG(size <= MaxConstbufferSize, "Constant buffer is too big");
const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4));

const std::size_t alignment = device.GetUniformBufferAlignment();
const GLintptr offset = buffer_cache.UploadMemory(buffer.address, size, alignment);
bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), offset, size);
const auto alignment = device.GetUniformBufferAlignment();
const auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment);
bind_ubo_pushbuffer.Push(cbuf, offset, size);
}
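
Rounding the size up to sizeof(GLvec4) keeps every uploaded range a whole number of std140 vec4 slots. A worked sketch of that rounding, with the alignment helper reproduced inline so the example stands alone (illustrative; the project's Common::AlignUp may be implemented differently):

    #include <cstddef>

    constexpr std::size_t AlignUp(std::size_t value, std::size_t alignment) {
        return (value + alignment - 1) / alignment * alignment; // round up to a multiple
    }

    // A 37-byte directly-addressed buffer becomes 48 bytes, i.e. three 16-byte vec4 slots:
    static_assert(AlignUp(37, 16) == 48);
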

void RasterizerOpenGL::SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
const Shader& shader) {
const auto& entries = shader->GetShaderEntries().global_memory_entries;
for (std::size_t bindpoint = 0; bindpoint < entries.size(); ++bindpoint) {
const auto& entry{entries[bindpoint]};
const auto& region{global_cache.GetGlobalRegion(entry, stage)};
if (entry.IsWritten()) {
region->MarkAsModified(true, global_cache);
}
bind_ssbo_pushbuffer.Push(region->GetBufferHandle(), 0,
static_cast<GLsizeiptr>(region->GetSizeInBytes()));
void RasterizerOpenGL::SetupDrawGlobalMemory(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
const Shader& shader) {
auto& gpu{system.GPU()};
auto& memory_manager{gpu.MemoryManager()};
const auto cbufs{gpu.Maxwell3D().state.shader_stages[static_cast<std::size_t>(stage)]};
for (const auto& entry : shader->GetShaderEntries().global_memory_entries) {
const auto addr{cbufs.const_buffers[entry.GetCbufIndex()].address + entry.GetCbufOffset()};
const auto gpu_addr{memory_manager.Read<u64>(addr)};
const auto size{memory_manager.Read<u32>(addr + 8)};
SetupGlobalMemory(entry, gpu_addr, size);
}
}

void RasterizerOpenGL::SetupComputeGlobalMemory(const Shader& kernel) {
auto& gpu{system.GPU()};
auto& memory_manager{gpu.MemoryManager()};
const auto cbufs{gpu.KeplerCompute().launch_description.const_buffer_config};
for (const auto& entry : kernel->GetShaderEntries().global_memory_entries) {
const auto addr{cbufs[entry.GetCbufIndex()].Address() + entry.GetCbufOffset()};
const auto gpu_addr{memory_manager.Read<u64>(addr)};
const auto size{memory_manager.Read<u32>(addr + 8)};
SetupGlobalMemory(entry, gpu_addr, size);
}
}

void RasterizerOpenGL::SetupGlobalMemory(const GLShader::GlobalMemoryEntry& entry,
GPUVAddr gpu_addr, std::size_t size) {
const auto alignment{device.GetShaderStorageBufferAlignment()};
const auto [ssbo, buffer_offset] =
buffer_cache.UploadMemory(gpu_addr, size, alignment, true, entry.IsWritten());
bind_ssbo_pushbuffer.Push(ssbo, buffer_offset, static_cast<GLsizeiptr>(size));
}

TextureBufferUsage RasterizerOpenGL::SetupTextures(Maxwell::ShaderStage stage, const Shader& shader,
BaseBindings base_bindings) {
MICROPROFILE_SCOPE(OpenGL_Texture);
@@ -883,10 +1068,11 @@ void RasterizerOpenGL::SyncClipCoef() {
}

void RasterizerOpenGL::SyncCullMode() {
const auto& regs = system.GPU().Maxwell3D().regs;
auto& maxwell3d = system.GPU().Maxwell3D();

const auto& regs = maxwell3d.regs;

state.cull.enabled = regs.cull.enabled != 0;

if (state.cull.enabled) {
state.cull.front_face = MaxwellToGL::FrontFace(regs.cull.front_face);
state.cull.mode = MaxwellToGL::CullFace(regs.cull.cull_face);
@@ -919,16 +1105,21 @@ void RasterizerOpenGL::SyncDepthTestState() {
state.depth.test_enabled = regs.depth_test_enable != 0;
state.depth.write_mask = regs.depth_write_enabled ? GL_TRUE : GL_FALSE;

if (!state.depth.test_enabled)
if (!state.depth.test_enabled) {
return;
}

state.depth.test_func = MaxwellToGL::ComparisonOp(regs.depth_test_func);
}

void RasterizerOpenGL::SyncStencilTestState() {
const auto& regs = system.GPU().Maxwell3D().regs;
state.stencil.test_enabled = regs.stencil_enable != 0;
auto& maxwell3d = system.GPU().Maxwell3D();
if (!maxwell3d.dirty.stencil_test) {
return;
}
const auto& regs = maxwell3d.regs;

state.stencil.test_enabled = regs.stencil_enable != 0;
if (!regs.stencil_enable) {
return;
}
@@ -957,10 +1148,17 @@ void RasterizerOpenGL::SyncStencilTestState() {
state.stencil.back.action_depth_fail = GL_KEEP;
state.stencil.back.action_depth_pass = GL_KEEP;
}
state.MarkDirtyStencilState();
maxwell3d.dirty.stencil_test = false;
}

void RasterizerOpenGL::SyncColorMask() {
const auto& regs = system.GPU().Maxwell3D().regs;
auto& maxwell3d = system.GPU().Maxwell3D();
if (!maxwell3d.dirty.color_mask) {
return;
}
const auto& regs = maxwell3d.regs;

const std::size_t count =
regs.independent_blend_enable ? Tegra::Engines::Maxwell3D::Regs::NumRenderTargets : 1;
for (std::size_t i = 0; i < count; i++) {
@@ -971,6 +1169,9 @@ void RasterizerOpenGL::SyncColorMask() {
dest.blue_enabled = (source.B == 0) ? GL_FALSE : GL_TRUE;
dest.alpha_enabled = (source.A == 0) ? GL_FALSE : GL_TRUE;
}

state.MarkDirtyColorMask();
maxwell3d.dirty.color_mask = false;
}

void RasterizerOpenGL::SyncMultiSampleState() {
@@ -985,7 +1186,11 @@ void RasterizerOpenGL::SyncFragmentColorClampState() {
}

void RasterizerOpenGL::SyncBlendState() {
const auto& regs = system.GPU().Maxwell3D().regs;
auto& maxwell3d = system.GPU().Maxwell3D();
if (!maxwell3d.dirty.blend_state) {
return;
}
const auto& regs = maxwell3d.regs;

state.blend_color.red = regs.blend_color.r;
state.blend_color.green = regs.blend_color.g;
@@ -1008,6 +1213,8 @@ void RasterizerOpenGL::SyncBlendState() {
for (std::size_t i = 1; i < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; i++) {
state.blend[i].enabled = false;
}
maxwell3d.dirty.blend_state = false;
state.MarkDirtyBlendState();
return;
}

@@ -1024,6 +1231,9 @@ void RasterizerOpenGL::SyncBlendState() {
blend.src_a_func = MaxwellToGL::BlendFunc(src.factor_source_a);
blend.dst_a_func = MaxwellToGL::BlendFunc(src.factor_dest_a);
}

state.MarkDirtyBlendState();
maxwell3d.dirty.blend_state = false;
}

void RasterizerOpenGL::SyncLogicOpState() {
@@ -1075,13 +1285,21 @@ void RasterizerOpenGL::SyncPointState() {
}

void RasterizerOpenGL::SyncPolygonOffset() {
const auto& regs = system.GPU().Maxwell3D().regs;
auto& maxwell3d = system.GPU().Maxwell3D();
if (!maxwell3d.dirty.polygon_offset) {
return;
}
const auto& regs = maxwell3d.regs;

state.polygon_offset.fill_enable = regs.polygon_offset_fill_enable != 0;
state.polygon_offset.line_enable = regs.polygon_offset_line_enable != 0;
state.polygon_offset.point_enable = regs.polygon_offset_point_enable != 0;
state.polygon_offset.units = regs.polygon_offset_units;
state.polygon_offset.factor = regs.polygon_offset_factor;
state.polygon_offset.clamp = regs.polygon_offset_clamp;

state.MarkDirtyPolygonOffset();
maxwell3d.dirty.polygon_offset = false;
}

void RasterizerOpenGL::SyncAlphaTest() {

@@ -24,7 +24,6 @@
#include "video_core/renderer_opengl/gl_buffer_cache.h"
#include "video_core/renderer_opengl/gl_device.h"
#include "video_core/renderer_opengl/gl_framebuffer_cache.h"
#include "video_core/renderer_opengl/gl_global_cache.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/renderer_opengl/gl_sampler_cache.h"
#include "video_core/renderer_opengl/gl_shader_cache.h"
@@ -59,10 +58,12 @@ public:

void DrawArrays() override;
void Clear() override;
void DispatchCompute(GPUVAddr code_addr) override;
void FlushAll() override;
void FlushRegion(CacheAddr addr, u64 size) override;
void InvalidateRegion(CacheAddr addr, u64 size) override;
void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;
void TickFrame() override;
bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
const Tegra::Engines::Fermi2D::Regs::Surface& dst,
const Tegra::Engines::Fermi2D::Config& copy_config) override;
@@ -73,11 +74,6 @@ public:
void LoadDiskResources(const std::atomic_bool& stop_loading,
const VideoCore::DiskResourceLoadCallback& callback) override;

/// Maximum supported size that a constbuffer can have in bytes.
static constexpr std::size_t MaxConstbufferSize = 0x10000;
static_assert(MaxConstbufferSize % sizeof(GLvec4) == 0,
"The maximum size of a constbuffer must be a multiple of the size of GLvec4");

private:
struct FramebufferConfigState {
bool using_color_fb{};
@@ -113,17 +109,30 @@ private:
OpenGLState& current_state, bool using_color_fb = true, bool using_depth_fb = true,
bool preserve_contents = true, std::optional<std::size_t> single_color_target = {});

void ConfigureClearFramebuffer(OpenGLState& current_state, bool using_color_fb,
bool using_depth_fb, bool using_stencil_fb);

/// Configures the current constbuffers to use for the draw command.
void SetupDrawConstBuffers(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
const Shader& shader);

/// Configures the current constbuffers to use for the kernel invocation.
void SetupComputeConstBuffers(const Shader& kernel);

/// Configures a constant buffer.
void SetupConstBuffer(const Tegra::Engines::ConstBufferInfo& buffer,
const GLShader::ConstBufferEntry& entry);

/// Configures the current global memory entries to use for the draw command.
void SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
const Shader& shader);
void SetupDrawGlobalMemory(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
const Shader& shader);

/// Configures the current global memory entries to use for the kernel invocation.
void SetupComputeGlobalMemory(const Shader& kernel);

/// Configures a constant buffer.
void SetupGlobalMemory(const GLShader::GlobalMemoryEntry& entry, GPUVAddr gpu_addr,
std::size_t size);

/// Configures the current textures to use for the draw command. Returns shaders texture buffer
/// usage.
@@ -191,7 +200,6 @@ private:

TextureCacheOpenGL texture_cache;
ShaderCacheOpenGL shader_cache;
GlobalRegionCacheOpenGL global_cache;
SamplerCacheOpenGL sampler_cache;
FramebufferCacheOpenGL framebuffer_cache;

@@ -210,6 +218,7 @@ private:
static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024;
OGLBufferCache buffer_cache;

VertexArrayPushBuffer vertex_array_pushbuffer;
BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER};
BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER};

@@ -221,14 +230,19 @@ private:
GLuint SetupVertexFormat();

void SetupVertexBuffer(GLuint vao);
void SetupVertexInstances(GLuint vao);

DrawParameters SetupDraw();
GLintptr SetupIndexBuffer();

DrawParameters SetupDraw(GLintptr index_buffer_offset);

void SetupShaders(GLenum primitive_mode);

enum class AccelDraw { Disabled, Arrays, Indexed };
AccelDraw accelerate_draw = AccelDraw::Disabled;

OGLFramebuffer clear_framebuffer;

using CachedPageMap = boost::icl::interval_map<u64, int>;
CachedPageMap cached_pages;
};

@@ -23,13 +23,13 @@ namespace OpenGL {

using VideoCommon::Shader::ProgramCode;

// One UBO is always reserved for emulation values
constexpr u32 RESERVED_UBOS = 1;
// One UBO is always reserved for emulation values on staged shaders
constexpr u32 STAGE_RESERVED_UBOS = 1;

struct UnspecializedShader {
std::string code;
GLShader::ShaderEntries entries;
Maxwell::ShaderProgram program_type;
ProgramType program_type;
};

namespace {
@@ -55,15 +55,17 @@ ProgramCode GetShaderCode(Tegra::MemoryManager& memory_manager, const GPUVAddr g
}

/// Gets the shader type from a Maxwell program type
constexpr GLenum GetShaderType(Maxwell::ShaderProgram program_type) {
constexpr GLenum GetShaderType(ProgramType program_type) {
switch (program_type) {
case Maxwell::ShaderProgram::VertexA:
case Maxwell::ShaderProgram::VertexB:
case ProgramType::VertexA:
case ProgramType::VertexB:
return GL_VERTEX_SHADER;
case Maxwell::ShaderProgram::Geometry:
case ProgramType::Geometry:
return GL_GEOMETRY_SHADER;
case Maxwell::ShaderProgram::Fragment:
case ProgramType::Fragment:
return GL_FRAGMENT_SHADER;
case ProgramType::Compute:
return GL_COMPUTE_SHADER;
default:
return GL_NONE;
}
@@ -100,6 +102,25 @@ constexpr std::tuple<const char*, const char*, u32> GetPrimitiveDescription(GLen
}
}

ProgramType GetProgramType(Maxwell::ShaderProgram program) {
switch (program) {
case Maxwell::ShaderProgram::VertexA:
return ProgramType::VertexA;
case Maxwell::ShaderProgram::VertexB:
return ProgramType::VertexB;
case Maxwell::ShaderProgram::TesselationControl:
return ProgramType::TessellationControl;
case Maxwell::ShaderProgram::TesselationEval:
return ProgramType::TessellationEval;
case Maxwell::ShaderProgram::Geometry:
return ProgramType::Geometry;
case Maxwell::ShaderProgram::Fragment:
return ProgramType::Fragment;
}
UNREACHABLE();
return {};
}

/// Calculates the size of a program stream
std::size_t CalculateProgramSize(const GLShader::ProgramCode& program) {
constexpr std::size_t start_offset = 10;
@@ -128,13 +149,13 @@ std::size_t CalculateProgramSize(const GLShader::ProgramCode& program) {
}

/// Hashes one (or two) program streams
u64 GetUniqueIdentifier(Maxwell::ShaderProgram program_type, const ProgramCode& code,
u64 GetUniqueIdentifier(ProgramType program_type, const ProgramCode& code,
const ProgramCode& code_b, std::size_t size_a = 0, std::size_t size_b = 0) {
if (size_a == 0) {
size_a = CalculateProgramSize(code);
}
u64 unique_identifier = Common::CityHash64(reinterpret_cast<const char*>(code.data()), size_a);
if (program_type != Maxwell::ShaderProgram::VertexA) {
if (program_type != ProgramType::VertexA) {
return unique_identifier;
}
// VertexA programs include two programs
@@ -152,12 +173,12 @@ u64 GetUniqueIdentifier(Maxwell::ShaderProgram program_type, const ProgramCode&
}
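
The identifier above hashes the primary stream with CityHash64 and, for VertexA pairs, mixes in a hash of the second stream; the exact combining step lives outside this hunk. A sketch of one common way to fold two 64-bit hashes into a single key (illustrative only, not necessarily the combiner used here):

    #include <cstdint>

    std::uint64_t CombineHashes(std::uint64_t a, std::uint64_t b) {
        // boost::hash_combine-style mixing, widened to 64 bits
        return a ^ (b + 0x9e3779b97f4a7c15ULL + (a << 6) + (a >> 2));
    }
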

/// Creates an unspecialized program from code streams
GLShader::ProgramResult CreateProgram(const Device& device, Maxwell::ShaderProgram program_type,
GLShader::ProgramResult CreateProgram(const Device& device, ProgramType program_type,
ProgramCode program_code, ProgramCode program_code_b) {
GLShader::ShaderSetup setup(program_code);
setup.program.size_a = CalculateProgramSize(program_code);
setup.program.size_b = 0;
if (program_type == Maxwell::ShaderProgram::VertexA) {
if (program_type == ProgramType::VertexA) {
// VertexB is always enabled, so when VertexA is enabled, we have two vertex shaders.
// Conventional HW does not support this, so we combine VertexA and VertexB into one
// stage here.
@@ -168,30 +189,41 @@ GLShader::ProgramResult CreateProgram(const Device& device, Maxwell::ShaderProgr
program_type, program_code, program_code_b, setup.program.size_a, setup.program.size_b);

switch (program_type) {
case Maxwell::ShaderProgram::VertexA:
case Maxwell::ShaderProgram::VertexB:
case ProgramType::VertexA:
case ProgramType::VertexB:
return GLShader::GenerateVertexShader(device, setup);
case Maxwell::ShaderProgram::Geometry:
case ProgramType::Geometry:
return GLShader::GenerateGeometryShader(device, setup);
case Maxwell::ShaderProgram::Fragment:
case ProgramType::Fragment:
return GLShader::GenerateFragmentShader(device, setup);
case ProgramType::Compute:
return GLShader::GenerateComputeShader(device, setup);
default:
LOG_CRITICAL(HW_GPU, "Unimplemented program_type={}", static_cast<u32>(program_type));
UNREACHABLE();
UNIMPLEMENTED_MSG("Unimplemented program_type={}", static_cast<u32>(program_type));
return {};
}
}

CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEntries& entries,
Maxwell::ShaderProgram program_type, const ProgramVariant& variant,
ProgramType program_type, const ProgramVariant& variant,
bool hint_retrievable = false) {
auto base_bindings{variant.base_bindings};
const auto primitive_mode{variant.primitive_mode};
const auto texture_buffer_usage{variant.texture_buffer_usage};

std::string source = "#version 430 core\n"
"#extension GL_ARB_separate_shader_objects : enable\n\n";
source += fmt::format("#define EMULATION_UBO_BINDING {}\n", base_bindings.cbuf++);
"#extension GL_ARB_separate_shader_objects : enable\n";
if (entries.shader_viewport_layer_array) {
source += "#extension GL_ARB_shader_viewport_layer_array : enable\n";
}
if (program_type == ProgramType::Compute) {
source += "#extension GL_ARB_compute_variable_group_size : require\n";
}
source += '\n';

if (program_type != ProgramType::Compute) {
source += fmt::format("#define EMULATION_UBO_BINDING {}\n", base_bindings.cbuf++);
}

for (const auto& cbuf : entries.const_buffers) {
source +=
@@ -218,13 +250,16 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn
source += fmt::format("#define SAMPLER_{}_IS_BUFFER", i);
}

if (program_type == Maxwell::ShaderProgram::Geometry) {
if (program_type == ProgramType::Geometry) {
const auto [glsl_topology, debug_name, max_vertices] =
GetPrimitiveDescription(primitive_mode);

source += "layout (" + std::string(glsl_topology) + ") in;\n";
source += "#define MAX_VERTEX_INPUT " + std::to_string(max_vertices) + '\n';
}
if (program_type == ProgramType::Compute) {
source += "layout (local_size_variable) in;\n";
}

source += code;
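
For a compute kernel, the preamble assembled above therefore comes out roughly as follows before the decompiled body is appended (illustrative; the cbuf and sampler defines depend on the shader's entries and are omitted here):

    #version 430 core
    #extension GL_ARB_separate_shader_objects : enable
    #extension GL_ARB_compute_variable_group_size : require

    layout (local_size_variable) in;

Note that EMULATION_UBO_BINDING is deliberately absent: the emulation UBO is only reserved for the graphics stages.
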

@@ -252,7 +287,7 @@ std::set<GLenum> GetSupportedFormats() {

} // Anonymous namespace

CachedShader::CachedShader(const ShaderParameters& params, Maxwell::ShaderProgram program_type,
CachedShader::CachedShader(const ShaderParameters& params, ProgramType program_type,
GLShader::ProgramResult result)
: RasterizerCacheObject{params.host_ptr}, host_ptr{params.host_ptr}, cpu_addr{params.cpu_addr},
unique_identifier{params.unique_identifier}, program_type{program_type},
@@ -265,29 +300,50 @@ Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params,
ProgramCode&& program_code_b) {
const auto code_size{CalculateProgramSize(program_code)};
const auto code_size_b{CalculateProgramSize(program_code_b)};
auto result{CreateProgram(params.device, program_type, program_code, program_code_b)};
auto result{
CreateProgram(params.device, GetProgramType(program_type), program_code, program_code_b)};
if (result.first.empty()) {
// TODO(Rodrigo): Unimplemented shader stages hit here, avoid using these for now
return {};
}

params.disk_cache.SaveRaw(ShaderDiskCacheRaw(
params.unique_identifier, program_type, static_cast<u32>(code_size / sizeof(u64)),
static_cast<u32>(code_size_b / sizeof(u64)), std::move(program_code),
std::move(program_code_b)));
params.unique_identifier, GetProgramType(program_type),
static_cast<u32>(code_size / sizeof(u64)), static_cast<u32>(code_size_b / sizeof(u64)),
std::move(program_code), std::move(program_code_b)));

return std::shared_ptr<CachedShader>(new CachedShader(params, program_type, std::move(result)));
return std::shared_ptr<CachedShader>(
new CachedShader(params, GetProgramType(program_type), std::move(result)));
}

Shader CachedShader::CreateStageFromCache(const ShaderParameters& params,
Maxwell::ShaderProgram program_type,
GLShader::ProgramResult result) {
return std::shared_ptr<CachedShader>(new CachedShader(params, program_type, std::move(result)));
return std::shared_ptr<CachedShader>(
new CachedShader(params, GetProgramType(program_type), std::move(result)));
}

Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode&& code) {
auto result{CreateProgram(params.device, ProgramType::Compute, code, {})};

const auto code_size{CalculateProgramSize(code)};
params.disk_cache.SaveRaw(ShaderDiskCacheRaw(params.unique_identifier, ProgramType::Compute,
static_cast<u32>(code_size / sizeof(u64)), 0,
std::move(code), {}));

return std::shared_ptr<CachedShader>(
new CachedShader(params, ProgramType::Compute, std::move(result)));
}

Shader CachedShader::CreateKernelFromCache(const ShaderParameters& params,
GLShader::ProgramResult result) {
return std::shared_ptr<CachedShader>(
new CachedShader(params, ProgramType::Compute, std::move(result)));
}

std::tuple<GLuint, BaseBindings> CachedShader::GetProgramHandle(const ProgramVariant& variant) {
GLuint handle{};
if (program_type == Maxwell::ShaderProgram::Geometry) {
if (program_type == ProgramType::Geometry) {
handle = GetGeometryShader(variant);
} else {
const auto [entry, is_cache_miss] = programs.try_emplace(variant);
@@ -305,8 +361,11 @@ std::tuple<GLuint, BaseBindings> CachedShader::GetProgramHandle(const ProgramVar
handle = program->handle;
}

auto base_bindings{variant.base_bindings};
base_bindings.cbuf += static_cast<u32>(entries.const_buffers.size()) + RESERVED_UBOS;
auto base_bindings = variant.base_bindings;
base_bindings.cbuf += static_cast<u32>(entries.const_buffers.size());
if (program_type != ProgramType::Compute) {
base_bindings.cbuf += STAGE_RESERVED_UBOS;
}
base_bindings.gmem += static_cast<u32>(entries.global_memory_entries.size());
base_bindings.sampler += static_cast<u32>(entries.samplers.size());
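
As a worked example of the advancement above: a fragment-stage variant with three const buffers, one global memory entry, and two samplers moves cbuf forward by 3 + 1 = 4 (the extra slot being STAGE_RESERVED_UBOS, skipped only for compute), gmem by 1, and sampler by 2, so the next stage's bindings start past everything this stage consumed.
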

@@ -569,7 +628,7 @@ std::unordered_map<u64, UnspecializedShader> ShaderCacheOpenGL::GenerateUnspecia
}

Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
if (!system.GPU().Maxwell3D().dirty_flags.shaders) {
if (!system.GPU().Maxwell3D().dirty.shaders) {
return last_shaders[static_cast<std::size_t>(program)];
}

@@ -586,13 +645,15 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
// No shader found - create a new one
ProgramCode program_code{GetShaderCode(memory_manager, program_addr, host_ptr)};
ProgramCode program_code_b;
if (program == Maxwell::ShaderProgram::VertexA) {
const bool is_program_a{program == Maxwell::ShaderProgram::VertexA};
if (is_program_a) {
const GPUVAddr program_addr_b{GetShaderAddress(system, Maxwell::ShaderProgram::VertexB)};
program_code_b = GetShaderCode(memory_manager, program_addr_b,
memory_manager.GetPointer(program_addr_b));
}

const auto unique_identifier = GetUniqueIdentifier(program, program_code, program_code_b);
const auto unique_identifier =
GetUniqueIdentifier(GetProgramType(program), program_code, program_code_b);
const auto cpu_addr{*memory_manager.GpuToCpuAddress(program_addr)};
const ShaderParameters params{disk_cache, precompiled_programs, device, cpu_addr,
host_ptr, unique_identifier};
@@ -609,4 +670,30 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
return last_shaders[static_cast<std::size_t>(program)] = shader;
}

Shader ShaderCacheOpenGL::GetComputeKernel(GPUVAddr code_addr) {
auto& memory_manager{system.GPU().MemoryManager()};
const auto host_ptr{memory_manager.GetPointer(code_addr)};
auto kernel = TryGet(host_ptr);
if (kernel) {
return kernel;
}

// No kernel found - create a new one
auto code{GetShaderCode(memory_manager, code_addr, host_ptr)};
const auto unique_identifier{GetUniqueIdentifier(ProgramType::Compute, code, {})};
const auto cpu_addr{*memory_manager.GpuToCpuAddress(code_addr)};
const ShaderParameters params{disk_cache, precompiled_programs, device, cpu_addr,
host_ptr, unique_identifier};

const auto found = precompiled_shaders.find(unique_identifier);
if (found == precompiled_shaders.end()) {
kernel = CachedShader::CreateKernelFromMemory(params, std::move(code));
} else {
kernel = CachedShader::CreateKernelFromCache(params, found->second);
}

Register(kernel);
return kernel;
}

} // namespace OpenGL

@@ -61,6 +61,11 @@ public:
Maxwell::ShaderProgram program_type,
GLShader::ProgramResult result);

static Shader CreateKernelFromMemory(const ShaderParameters& params, ProgramCode&& code);

static Shader CreateKernelFromCache(const ShaderParameters& params,
GLShader::ProgramResult result);

VAddr GetCpuAddr() const override {
return cpu_addr;
}
@@ -78,7 +83,7 @@ public:
std::tuple<GLuint, BaseBindings> GetProgramHandle(const ProgramVariant& variant);

private:
explicit CachedShader(const ShaderParameters& params, Maxwell::ShaderProgram program_type,
explicit CachedShader(const ShaderParameters& params, ProgramType program_type,
GLShader::ProgramResult result);

// Geometry programs. These are needed because GLSL needs an input topology but it's not
@@ -104,7 +109,7 @@ private:
u8* host_ptr{};
VAddr cpu_addr{};
u64 unique_identifier{};
Maxwell::ShaderProgram program_type{};
ProgramType program_type{};
ShaderDiskCacheOpenGL& disk_cache;
const PrecompiledPrograms& precompiled_programs;

@@ -132,6 +137,9 @@ public:
/// Gets the current specified shader stage program
Shader GetStageProgram(Maxwell::ShaderProgram program);

/// Gets a compute kernel in the passed address
Shader GetComputeKernel(GPUVAddr code_addr);

protected:
// We do not have to flush this cache as things in it are never modified by us.
void FlushObjectInner(const Shader& object) override {}

@@ -14,6 +14,7 @@
#include "common/alignment.h"
#include "common/assert.h"
#include "common/common_types.h"
#include "common/logging/log.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/renderer_opengl/gl_device.h"
#include "video_core/renderer_opengl/gl_rasterizer.h"
@@ -36,7 +37,6 @@ using namespace std::string_literals;
using namespace VideoCommon::Shader;

using Maxwell = Tegra::Engines::Maxwell3D::Regs;
using ShaderStage = Tegra::Engines::Maxwell3D::Regs::ShaderStage;
using Operation = const OperationNode&;

enum class Type { Bool, Bool2, Float, Int, Uint, HalfFloat };
@@ -46,7 +46,7 @@ using TextureArgument = std::pair<Type, Node>;
using TextureIR = std::variant<TextureAoffi, TextureArgument>;

constexpr u32 MAX_CONSTBUFFER_ELEMENTS =
static_cast<u32>(RasterizerOpenGL::MaxConstbufferSize) / (4 * sizeof(float));
static_cast<u32>(Maxwell::MaxConstBufferSize) / (4 * sizeof(float));

class ShaderWriter {
public:
@@ -161,9 +161,13 @@ std::string FlowStackTopName(MetaStackClass stack) {
return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack));
}

constexpr bool IsVertexShader(ProgramType stage) {
return stage == ProgramType::VertexA || stage == ProgramType::VertexB;
}

class GLSLDecompiler final {
public:
explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ShaderStage stage,
explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ProgramType stage,
std::string suffix)
: device{device}, ir{ir}, stage{stage}, suffix{suffix}, header{ir.GetHeader()} {}

@@ -246,24 +250,22 @@ public:
usage.is_read, usage.is_written);
}
entries.clip_distances = ir.GetClipDistances();
entries.shader_viewport_layer_array =
IsVertexShader(stage) && (ir.UsesLayer() || ir.UsesViewportIndex());
entries.shader_length = ir.GetLength();
return entries;
}

private:
using OperationDecompilerFn = std::string (GLSLDecompiler::*)(Operation);
using OperationDecompilersArray =
std::array<OperationDecompilerFn, static_cast<std::size_t>(OperationCode::Amount)>;

void DeclareVertex() {
if (stage != ShaderStage::Vertex)
if (!IsVertexShader(stage))
return;

DeclareVertexRedeclarations();
}

void DeclareGeometry() {
if (stage != ShaderStage::Geometry) {
if (stage != ProgramType::Geometry) {
return;
}

@@ -282,22 +284,35 @@ private:
}

void DeclareVertexRedeclarations() {
bool clip_distances_declared = false;

code.AddLine("out gl_PerVertex {{");
++code.scope;

code.AddLine("vec4 gl_Position;");

for (const auto o : ir.GetOutputAttributes()) {
if (o == Attribute::Index::PointSize)
code.AddLine("float gl_PointSize;");
if (!clip_distances_declared && (o == Attribute::Index::ClipDistances0123 ||
o == Attribute::Index::ClipDistances4567)) {
for (const auto attribute : ir.GetOutputAttributes()) {
if (attribute == Attribute::Index::ClipDistances0123 ||
attribute == Attribute::Index::ClipDistances4567) {
code.AddLine("float gl_ClipDistance[];");
clip_distances_declared = true;
break;
}
}
if (!IsVertexShader(stage) || device.HasVertexViewportLayer()) {
if (ir.UsesLayer()) {
code.AddLine("int gl_Layer;");
}
if (ir.UsesViewportIndex()) {
code.AddLine("int gl_ViewportIndex;");
}
} else if ((ir.UsesLayer() || ir.UsesViewportIndex()) && IsVertexShader(stage) &&
!device.HasVertexViewportLayer()) {
LOG_ERROR(
Render_OpenGL,
"GL_ARB_shader_viewport_layer_array is not available and its required by a shader");
}

if (ir.UsesPointSize()) {
code.AddLine("float gl_PointSize;");
}

--code.scope;
code.AddLine("}};");
@@ -325,11 +340,16 @@ private:
}

void DeclareLocalMemory() {
if (const u64 local_memory_size = header.GetLocalMemorySize(); local_memory_size > 0) {
const auto element_count = Common::AlignUp(local_memory_size, 4) / 4;
code.AddLine("float {}[{}];", GetLocalMemory(), element_count);
code.AddNewLine();
// TODO(Rodrigo): Unstub kernel local memory size and pass it from a register at
// specialization time.
const u64 local_memory_size =
stage == ProgramType::Compute ? 0x400 : header.GetLocalMemorySize();
if (local_memory_size == 0) {
return;
}
const auto element_count = Common::AlignUp(local_memory_size, 4) / 4;
code.AddLine("float {}[{}];", GetLocalMemory(), element_count);
code.AddNewLine();
}
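
Local memory is emulated as a float array addressed in 4-byte units, so the 0x400-byte compute stub above yields AlignUp(0x400, 4) / 4 = 256 elements, declared as a 'float' array of 256 entries (the array's name depends on the shader suffix returned by GetLocalMemory).
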

void DeclareInternalFlags() {
@@ -383,12 +403,12 @@ private:
const u32 location{GetGenericAttributeIndex(index)};

std::string name{GetInputAttribute(index)};
if (stage == ShaderStage::Geometry) {
if (stage == ProgramType::Geometry) {
name = "gs_" + name + "[]";
}

std::string suffix;
if (stage == ShaderStage::Fragment) {
if (stage == ProgramType::Fragment) {
const auto input_mode{header.ps.GetAttributeUse(location)};
if (skip_unused && input_mode == AttributeUse::Unused) {
return;
@@ -400,7 +420,7 @@ private:
}

void DeclareOutputAttributes() {
if (ir.HasPhysicalAttributes() && stage != ShaderStage::Fragment) {
if (ir.HasPhysicalAttributes() && stage != ProgramType::Fragment) {
for (u32 i = 0; i < GetNumPhysicalVaryings(); ++i) {
DeclareOutputAttribute(ToGenericAttribute(i));
}
@@ -522,7 +542,7 @@ private:
constexpr u32 element_stride{4};
const u32 address{generic_base + index * generic_stride + element * element_stride};

const bool declared{stage != ShaderStage::Fragment ||
const bool declared{stage != ProgramType::Fragment ||
header.ps.GetAttributeUse(index) != AttributeUse::Unused};
const std::string value{declared ? ReadAttribute(attribute, element) : "0"};
code.AddLine("case 0x{:x}: return {};", address, value);
@@ -626,7 +646,7 @@ private:
}

if (const auto abuf = std::get_if<AbufNode>(&*node)) {
UNIMPLEMENTED_IF_MSG(abuf->IsPhysicalBuffer() && stage == ShaderStage::Geometry,
UNIMPLEMENTED_IF_MSG(abuf->IsPhysicalBuffer() && stage == ProgramType::Geometry,
"Physical attributes in geometry shaders are not implemented");
if (abuf->IsPhysicalBuffer()) {
return fmt::format("readPhysicalAttribute(ftou({}))",
@@ -681,6 +701,9 @@ private:
}

if (const auto lmem = std::get_if<LmemNode>(&*node)) {
if (stage == ProgramType::Compute) {
LOG_WARNING(Render_OpenGL, "Local memory is stubbed on compute shaders");
}
return fmt::format("{}[ftou({}) / 4]", GetLocalMemory(), Visit(lmem->GetAddress()));
}

@@ -710,7 +733,7 @@ private:

std::string ReadAttribute(Attribute::Index attribute, u32 element, const Node& buffer = {}) {
const auto GeometryPass = [&](std::string_view name) {
if (stage == ShaderStage::Geometry && buffer) {
if (stage == ProgramType::Geometry && buffer) {
// TODO(Rodrigo): Guard geometry inputs against out of bound reads. Some games
// set an 0x80000000 index for those and the shader fails to build. Find out why
// this happens and what's its intent.
@@ -722,10 +745,10 @@ private:
switch (attribute) {
case Attribute::Index::Position:
switch (stage) {
case ShaderStage::Geometry:
case ProgramType::Geometry:
return fmt::format("gl_in[ftou({})].gl_Position{}", Visit(buffer),
GetSwizzle(element));
case ShaderStage::Fragment:
case ProgramType::Fragment:
return element == 3 ? "1.0f" : ("gl_FragCoord"s + GetSwizzle(element));
default:
UNREACHABLE();
@@ -746,7 +769,7 @@ private:
// TODO(Subv): Find out what the values are for the first two elements when inside a
// vertex shader, and what's the value of the fourth element when inside a Tess Eval
// shader.
ASSERT(stage == ShaderStage::Vertex);
ASSERT(IsVertexShader(stage));
switch (element) {
case 2:
// Config pack's first value is instance_id.
@@ -758,7 +781,7 @@ private:
return "0";
case Attribute::Index::FrontFacing:
// TODO(Subv): Find out what the values are for the other elements.
ASSERT(stage == ShaderStage::Fragment);
ASSERT(stage == ProgramType::Fragment);
switch (element) {
case 3:
return "itof(gl_FrontFacing ? -1 : 0)";
@@ -780,7 +803,7 @@ private:
return value;
}
// There's a bug in NVidia's proprietary drivers that makes precise fail on fragment shaders
const std::string precise = stage != ShaderStage::Fragment ? "precise " : "";
const std::string precise = stage != ProgramType::Fragment ? "precise " : "";

const std::string temporary = code.GenerateTemporary();
code.AddLine("{}float {} = {};", precise, temporary, value);
@@ -805,6 +828,45 @@ private:
return CastOperand(VisitOperand(operation, operand_index), type);
}

std::optional<std::pair<std::string, bool>> GetOutputAttribute(const AbufNode* abuf) {
switch (const auto attribute = abuf->GetIndex()) {
case Attribute::Index::Position:
return std::make_pair("gl_Position"s + GetSwizzle(abuf->GetElement()), false);
case Attribute::Index::LayerViewportPointSize:
switch (abuf->GetElement()) {
case 0:
UNIMPLEMENTED();
return {};
case 1:
if (IsVertexShader(stage) && !device.HasVertexViewportLayer()) {
return {};
}
return std::make_pair("gl_Layer", true);
case 2:
if (IsVertexShader(stage) && !device.HasVertexViewportLayer()) {
return {};
}
return std::make_pair("gl_ViewportIndex", true);
case 3:
UNIMPLEMENTED_MSG("Requires some state changes for gl_PointSize to work in shader");
return std::make_pair("gl_PointSize", false);
}
return {};
case Attribute::Index::ClipDistances0123:
return std::make_pair(fmt::format("gl_ClipDistance[{}]", abuf->GetElement()), false);
case Attribute::Index::ClipDistances4567:
return std::make_pair(fmt::format("gl_ClipDistance[{}]", abuf->GetElement() + 4),
false);
default:
if (IsGenericAttribute(attribute)) {
return std::make_pair(
GetOutputAttribute(attribute) + GetSwizzle(abuf->GetElement()), false);
}
UNIMPLEMENTED_MSG("Unhandled output attribute: {}", static_cast<u32>(attribute));
return {};
}
}
|
||||
|
||||
std::string CastOperand(const std::string& value, Type type) const {
|
||||
switch (type) {
|
||||
case Type::Bool:
|
||||
@@ -1001,6 +1063,8 @@ private:
|
||||
const Node& src = operation[1];
|
||||
|
||||
std::string target;
|
||||
bool is_integer = false;
|
||||
|
||||
if (const auto gpr = std::get_if<GprNode>(&*dest)) {
|
||||
if (gpr->GetIndex() == Register::ZeroIndex) {
|
||||
// Writing to Register::ZeroIndex is a no op
|
||||
@@ -1009,27 +1073,16 @@ private:
|
||||
target = GetRegister(gpr->GetIndex());
|
||||
} else if (const auto abuf = std::get_if<AbufNode>(&*dest)) {
|
||||
UNIMPLEMENTED_IF(abuf->IsPhysicalBuffer());
|
||||
|
||||
target = [&]() -> std::string {
|
||||
switch (const auto attribute = abuf->GetIndex(); abuf->GetIndex()) {
|
||||
case Attribute::Index::Position:
|
||||
return "gl_Position"s + GetSwizzle(abuf->GetElement());
|
||||
case Attribute::Index::PointSize:
|
||||
return "gl_PointSize";
|
||||
case Attribute::Index::ClipDistances0123:
|
||||
return fmt::format("gl_ClipDistance[{}]", abuf->GetElement());
|
||||
case Attribute::Index::ClipDistances4567:
|
||||
return fmt::format("gl_ClipDistance[{}]", abuf->GetElement() + 4);
|
||||
default:
|
||||
if (IsGenericAttribute(attribute)) {
|
||||
return GetOutputAttribute(attribute) + GetSwizzle(abuf->GetElement());
|
||||
}
|
||||
UNIMPLEMENTED_MSG("Unhandled output attribute: {}",
|
||||
static_cast<u32>(attribute));
|
||||
return "0";
|
||||
}
|
||||
}();
|
||||
const auto result = GetOutputAttribute(abuf);
|
||||
if (!result) {
|
||||
return {};
|
||||
}
|
||||
target = result->first;
|
||||
is_integer = result->second;
|
||||
} else if (const auto lmem = std::get_if<LmemNode>(&*dest)) {
|
||||
if (stage == ProgramType::Compute) {
|
||||
LOG_WARNING(Render_OpenGL, "Local memory is stubbed on compute shaders");
|
||||
}
|
||||
target = fmt::format("{}[ftou({}) / 4]", GetLocalMemory(), Visit(lmem->GetAddress()));
|
||||
} else if (const auto gmem = std::get_if<GmemNode>(&*dest)) {
|
||||
const std::string real = Visit(gmem->GetRealAddress());
|
||||
@@ -1040,7 +1093,11 @@ private:
|
||||
UNREACHABLE_MSG("Assign called without a proper target");
|
||||
}
|
||||
|
||||
code.AddLine("{} = {};", target, Visit(src));
|
||||
if (is_integer) {
|
||||
code.AddLine("{} = ftoi({});", target, Visit(src));
|
||||
} else {
|
||||
code.AddLine("{} = {};", target, Visit(src));
|
||||
}
|
||||
return {};
|
||||
}
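The boolean half of GetOutputAttribute's return value drives this branch: gl_Layer and gl_ViewportIndex are integer built-ins in GLSL, while the decompiler carries every value as a float bit pattern, so integer targets go through ftoi. An illustration of the two emitted forms (the gpr names are placeholders, and ftoi is assumed to be a floatBitsToInt-style wrapper from the shader prelude, which this diff does not show):

    // Generated GLSL, illustrative only:
    //   gl_Layer      = ftoi(gpr12);   // integer built-in: reinterpret the bits
    //   gl_Position.x = gpr13;         // float built-in: direct copy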

@@ -1353,14 +1410,10 @@ private:
return fmt::format("{}[{}]", pair, VisitOperand(operation, 1, Type::Uint));
}

std::string LogicalAll2(Operation operation) {
std::string LogicalAnd2(Operation operation) {
return GenerateUnary(operation, "all", Type::Bool, Type::Bool2);
}

std::string LogicalAny2(Operation operation) {
return GenerateUnary(operation, "any", Type::Bool, Type::Bool2);
}

template <bool with_nan>
std::string GenerateHalfComparison(Operation operation, const std::string& compare_op) {
const std::string comparison{GenerateBinaryCall(operation, compare_op, Type::Bool2,
@@ -1583,7 +1636,7 @@ private:
}

std::string Exit(Operation operation) {
if (stage != ShaderStage::Fragment) {
if (stage != ProgramType::Fragment) {
code.AddLine("return;");
return {};
}
@@ -1634,7 +1687,7 @@ private:
}

std::string EmitVertex(Operation operation) {
ASSERT_MSG(stage == ShaderStage::Geometry,
ASSERT_MSG(stage == ProgramType::Geometry,
"EmitVertex is expected to be used in a geometry shader.");

// If a geometry shader is attached, it will always flip (it's the last stage before
@@ -1645,7 +1698,7 @@ private:
}

std::string EndPrimitive(Operation operation) {
ASSERT_MSG(stage == ShaderStage::Geometry,
ASSERT_MSG(stage == ProgramType::Geometry,
"EndPrimitive is expected to be used in a geometry shader.");

code.AddLine("EndPrimitive();");
@@ -1667,7 +1720,7 @@ private:
return "utof(gl_WorkGroupID"s + GetSwizzle(element) + ')';
}

static constexpr OperationDecompilersArray operation_decompilers = {
static constexpr std::array operation_decompilers = {
&GLSLDecompiler::Assign,

&GLSLDecompiler::Select,
@@ -1751,8 +1804,7 @@ private:
&GLSLDecompiler::LogicalXor,
&GLSLDecompiler::LogicalNegate,
&GLSLDecompiler::LogicalPick2,
&GLSLDecompiler::LogicalAll2,
&GLSLDecompiler::LogicalAny2,
&GLSLDecompiler::LogicalAnd2,

&GLSLDecompiler::LogicalLessThan<Type::Float>,
&GLSLDecompiler::LogicalEqual<Type::Float>,
@@ -1816,6 +1868,7 @@ private:
&GLSLDecompiler::WorkGroupId<1>,
&GLSLDecompiler::WorkGroupId<2>,
};
static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
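Replacing the OperationDecompilersArray alias with a deduced std::array is what makes the new static_assert possible: the array's size is deduced from the initializer list and then checked against the OperationCode enum at compile time. The pattern in isolation, with placeholder names standing in for the real decompiler:

    #include <array>
    #include <cstddef>

    enum class OperationCode { Assign, Select, Amount };
    struct Decompiler {
        void Assign() {}
        void Select() {}
    };

    // C++17 CTAD deduces std::array<void (Decompiler::*)(), 2> from the list...
    static constexpr std::array operation_decompilers = {
        &Decompiler::Assign,
        &Decompiler::Select,
    };
    // ...so adding an OperationCode without a handler now fails to compile.
    static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));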

std::string GetRegister(u32 index) const {
return GetDeclarationWithSuffix(index, "gpr");
@@ -1880,7 +1933,7 @@ private:
}

u32 GetNumPhysicalInputAttributes() const {
return stage == ShaderStage::Vertex ? GetNumPhysicalAttributes() : GetNumPhysicalVaryings();
return IsVertexShader(stage) ? GetNumPhysicalAttributes() : GetNumPhysicalVaryings();
}

u32 GetNumPhysicalAttributes() const {
@@ -1893,7 +1946,7 @@ private:

const Device& device;
const ShaderIR& ir;
const ShaderStage stage;
const ProgramType stage;
const std::string suffix;
const Header header;

@@ -1924,7 +1977,7 @@ std::string GetCommonDeclarations() {
MAX_CONSTBUFFER_ELEMENTS);
}

ProgramResult Decompile(const Device& device, const ShaderIR& ir, Maxwell::ShaderStage stage,
ProgramResult Decompile(const Device& device, const ShaderIR& ir, ProgramType stage,
const std::string& suffix) {
GLSLDecompiler decompiler(device, ir, stage, suffix);
decompiler.Decompile();

@@ -12,14 +12,26 @@
#include "video_core/engines/maxwell_3d.h"
#include "video_core/shader/shader_ir.h"

namespace OpenGL {
class Device;
}

namespace VideoCommon::Shader {
class ShaderIR;
}

namespace OpenGL {

class Device;

enum class ProgramType : u32 {
VertexA = 0,
VertexB = 1,
TessellationControl = 2,
TessellationEval = 3,
Geometry = 4,
Fragment = 5,
Compute = 6
};
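ProgramType replaces both Maxwell::ShaderProgram and Maxwell::ShaderStage on the OpenGL side, so a single equality test no longer identifies vertex work: either half of a dual program counts. A plausible shape for the IsVertexShader helper the rest of this diff calls (its actual definition is not shown here):

    constexpr bool IsVertexShader(ProgramType stage) {
        return stage == ProgramType::VertexA || stage == ProgramType::VertexB;
    }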

} // namespace OpenGL

namespace OpenGL::GLShader {

struct ShaderEntries;
@@ -78,12 +90,13 @@ struct ShaderEntries {
std::vector<ImageEntry> images;
std::vector<GlobalMemoryEntry> global_memory_entries;
std::array<bool, Maxwell::NumClipDistances> clip_distances{};
bool shader_viewport_layer_array{};
std::size_t shader_length{};
};

std::string GetCommonDeclarations();

ProgramResult Decompile(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
Maxwell::ShaderStage stage, const std::string& suffix);
ProgramType stage, const std::string& suffix);

} // namespace OpenGL::GLShader

@@ -51,7 +51,7 @@ ShaderCacheVersionHash GetShaderCacheVersionHash() {

} // namespace

ShaderDiskCacheRaw::ShaderDiskCacheRaw(u64 unique_identifier, Maxwell::ShaderProgram program_type,
ShaderDiskCacheRaw::ShaderDiskCacheRaw(u64 unique_identifier, ProgramType program_type,
u32 program_code_size, u32 program_code_size_b,
ProgramCode program_code, ProgramCode program_code_b)
: unique_identifier{unique_identifier}, program_type{program_type},
@@ -373,6 +373,12 @@ std::optional<ShaderDiskCacheDecompiled> ShaderDiskCacheOpenGL::LoadDecompiledEn
}
}

bool shader_viewport_layer_array{};
if (!LoadObjectFromPrecompiled(shader_viewport_layer_array)) {
return {};
}
entry.entries.shader_viewport_layer_array = shader_viewport_layer_array;

u64 shader_length{};
if (!LoadObjectFromPrecompiled(shader_length)) {
return {};
@@ -445,6 +451,10 @@ bool ShaderDiskCacheOpenGL::SaveDecompiledFile(u64 unique_identifier, const std:
}
}

if (!SaveObjectToPrecompiled(entries.shader_viewport_layer_array)) {
return false;
}

if (!SaveObjectToPrecompiled(static_cast<u64>(entries.shader_length))) {
return false;
}

@@ -18,7 +18,6 @@
#include "common/assert.h"
#include "common/common_types.h"
#include "core/file_sys/vfs_vector.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/renderer_opengl/gl_shader_gen.h"

namespace Core {
@@ -34,14 +33,11 @@ namespace OpenGL {
struct ShaderDiskCacheUsage;
struct ShaderDiskCacheDump;

using ShaderDumpsMap = std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>;

using ProgramCode = std::vector<u64>;
using Maxwell = Tegra::Engines::Maxwell3D::Regs;

using ShaderDumpsMap = std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>;
using TextureBufferUsage = std::bitset<64>;

/// Allocated bindings used by an OpenGL shader program.
/// Allocated bindings used by an OpenGL shader program
struct BaseBindings {
u32 cbuf{};
u32 gmem{};
@@ -126,7 +122,7 @@ namespace OpenGL {
/// Describes how a shader is used by the guest GPU
class ShaderDiskCacheRaw {
public:
explicit ShaderDiskCacheRaw(u64 unique_identifier, Maxwell::ShaderProgram program_type,
explicit ShaderDiskCacheRaw(u64 unique_identifier, ProgramType program_type,
u32 program_code_size, u32 program_code_size_b,
ProgramCode program_code, ProgramCode program_code_b);
ShaderDiskCacheRaw();
@@ -141,30 +137,13 @@ public:
}

bool HasProgramA() const {
return program_type == Maxwell::ShaderProgram::VertexA;
return program_type == ProgramType::VertexA;
}

Maxwell::ShaderProgram GetProgramType() const {
ProgramType GetProgramType() const {
return program_type;
}

Maxwell::ShaderStage GetProgramStage() const {
switch (program_type) {
case Maxwell::ShaderProgram::VertexA:
case Maxwell::ShaderProgram::VertexB:
return Maxwell::ShaderStage::Vertex;
case Maxwell::ShaderProgram::TesselationControl:
return Maxwell::ShaderStage::TesselationControl;
case Maxwell::ShaderProgram::TesselationEval:
return Maxwell::ShaderStage::TesselationEval;
case Maxwell::ShaderProgram::Geometry:
return Maxwell::ShaderStage::Geometry;
case Maxwell::ShaderProgram::Fragment:
return Maxwell::ShaderStage::Fragment;
}
UNREACHABLE();
}

const ProgramCode& GetProgramCode() const {
return program_code;
}
@@ -175,7 +154,7 @@ public:

private:
u64 unique_identifier{};
Maxwell::ShaderProgram program_type{};
ProgramType program_type{};
u32 program_code_size{};
u32 program_code_size_b{};

@@ -14,7 +14,8 @@ using Tegra::Engines::Maxwell3D;
using VideoCommon::Shader::ProgramCode;
using VideoCommon::Shader::ShaderIR;

static constexpr u32 PROGRAM_OFFSET{10};
static constexpr u32 PROGRAM_OFFSET = 10;
static constexpr u32 COMPUTE_OFFSET = 0;

ProgramResult GenerateVertexShader(const Device& device, const ShaderSetup& setup) {
const std::string id = fmt::format("{:016x}", setup.program.unique_identifier);
@@ -29,17 +30,15 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform vs_config {
};

)";
const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a);
ProgramResult program =
Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Vertex, "vertex");

const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a);
const auto stage = setup.IsDualProgram() ? ProgramType::VertexA : ProgramType::VertexB;
ProgramResult program = Decompile(device, program_ir, stage, "vertex");
out += program.first;

if (setup.IsDualProgram()) {
const ShaderIR program_ir_b(setup.program.code_b, PROGRAM_OFFSET, setup.program.size_b);
ProgramResult program_b =
Decompile(device, program_ir_b, Maxwell3D::Regs::ShaderStage::Vertex, "vertex_b");

ProgramResult program_b = Decompile(device, program_ir_b, ProgramType::VertexB, "vertex_b");
out += program_b.first;
}

@@ -80,9 +79,9 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform gs_config {
};

)";

const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a);
ProgramResult program =
Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Geometry, "geometry");
ProgramResult program = Decompile(device, program_ir, ProgramType::Geometry, "geometry");
out += program.first;

out += R"(
@@ -116,9 +115,7 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform fs_config {

)";
const ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a);
ProgramResult program =
Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Fragment, "fragment");

ProgramResult program = Decompile(device, program_ir, ProgramType::Fragment, "fragment");
out += program.first;

out += R"(
@@ -130,4 +127,22 @@ void main() {
return {std::move(out), std::move(program.second)};
}

ProgramResult GenerateComputeShader(const Device& device, const ShaderSetup& setup) {
const std::string id = fmt::format("{:016x}", setup.program.unique_identifier);

std::string out = "// Shader Unique Id: CS" + id + "\n\n";
out += GetCommonDeclarations();

const ShaderIR program_ir(setup.program.code, COMPUTE_OFFSET, setup.program.size_a);
ProgramResult program = Decompile(device, program_ir, ProgramType::Compute, "compute");
out += program.first;

out += R"(
void main() {
execute_compute();
}
)";
return {std::move(out), std::move(program.second)};
}
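Graphics programs begin with a header block that instruction decoding skips, which is what PROGRAM_OFFSET encodes; compute kernels have no such header, so the new COMPUTE_OFFSET starts decoding at word zero. A short illustration of how the offset feeds ShaderIR (the signature is taken from the calls above; the header-size rationale is an assumption, not stated in this diff):

    // ShaderIR(code, main_offset, size): instruction decoding starts at code[main_offset].
    const ShaderIR graphics_ir(setup.program.code, PROGRAM_OFFSET, setup.program.size_a); // skip the program header
    const ShaderIR compute_ir(setup.program.code, COMPUTE_OFFSET, setup.program.size_a);  // kernels start at 0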

} // namespace OpenGL::GLShader

@@ -54,4 +54,7 @@ ProgramResult GenerateGeometryShader(const Device& device, const ShaderSetup& se
/// Generates the GLSL fragment shader program source code for the given FS program
ProgramResult GenerateFragmentShader(const Device& device, const ShaderSetup& setup);

/// Generates the GLSL compute shader program source code for the given CS program
ProgramResult GenerateComputeShader(const Device& device, const ShaderSetup& setup);

} // namespace OpenGL::GLShader

@@ -10,21 +10,25 @@

namespace OpenGL::GLShader {

GLuint LoadShader(const char* source, GLenum type) {
const char* debug_type;
namespace {
const char* GetStageDebugName(GLenum type) {
switch (type) {
case GL_VERTEX_SHADER:
debug_type = "vertex";
break;
return "vertex";
case GL_GEOMETRY_SHADER:
debug_type = "geometry";
break;
return "geometry";
case GL_FRAGMENT_SHADER:
debug_type = "fragment";
break;
default:
UNREACHABLE();
return "fragment";
case GL_COMPUTE_SHADER:
return "compute";
}
UNIMPLEMENTED();
return "unknown";
}
} // Anonymous namespace

GLuint LoadShader(const char* source, GLenum type) {
const char* debug_type = GetStageDebugName(type);
const GLuint shader_id = glCreateShader(type);
glShaderSource(shader_id, 1, &source, nullptr);
LOG_DEBUG(Render_OpenGL, "Compiling {} shader...", debug_type);

@@ -6,8 +6,11 @@
#include <glad/glad.h>
#include "common/assert.h"
#include "common/logging/log.h"
#include "common/microprofile.h"
#include "video_core/renderer_opengl/gl_state.h"

MICROPROFILE_DEFINE(OpenGL_State, "OpenGL", "State Change", MP_RGB(192, 128, 128));

namespace OpenGL {

using Maxwell = Tegra::Engines::Maxwell3D::Regs;
@@ -162,6 +165,25 @@ OpenGLState::OpenGLState() {
alpha_test.ref = 0.0f;
}

void OpenGLState::SetDefaultViewports() {
for (auto& item : viewports) {
item.x = 0;
item.y = 0;
item.width = 0;
item.height = 0;
item.depth_range_near = 0.0f;
item.depth_range_far = 1.0f;
item.scissor.enabled = false;
item.scissor.x = 0;
item.scissor.y = 0;
item.scissor.width = 0;
item.scissor.height = 0;
}

depth_clamp.far_plane = false;
depth_clamp.near_plane = false;
}

void OpenGLState::ApplyDefaultState() {
glEnable(GL_BLEND);
glDisable(GL_FRAMEBUFFER_SRGB);
@@ -523,7 +545,8 @@ void OpenGLState::ApplySamplers() const {
}
}

void OpenGLState::Apply() const {
void OpenGLState::Apply() {
MICROPROFILE_SCOPE(OpenGL_State);
ApplyFramebufferState();
ApplyVertexArrayState();
ApplyShaderProgram();
@@ -532,19 +555,31 @@ void OpenGLState::Apply() const {
ApplyPointSize();
ApplyFragmentColorClamp();
ApplyMultisample();
if (dirty.color_mask) {
ApplyColorMask();
dirty.color_mask = false;
}
ApplyDepthClamp();
ApplyColorMask();
ApplyViewport();
ApplyStencilTest();
if (dirty.stencil_state) {
ApplyStencilTest();
dirty.stencil_state = false;
}
ApplySRgb();
ApplyCulling();
ApplyDepth();
ApplyPrimitiveRestart();
ApplyBlending();
if (dirty.blend_state) {
ApplyBlending();
dirty.blend_state = false;
}
ApplyLogicOp();
ApplyTextures();
ApplySamplers();
ApplyPolygonOffset();
if (dirty.polygon_offset) {
ApplyPolygonOffset();
dirty.polygon_offset = false;
}
ApplyAlphaTest();
}
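Apply() drops its const qualifier because it now consumes the dirty bits it checks: a state group is re-applied only if a MarkDirty*() call flagged it since the last Apply(). Callers that mutate GL state behind the tracker's back therefore pessimize first, as the call sites later in this diff do; a typical usage sketch:

    OpenGLState state;
    state.draw.read_framebuffer = src_framebuffer.handle;
    state.AllDirty();   // state was built outside the tracker, force every group to re-apply
    state.Apply();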

@@ -195,8 +195,9 @@ public:
s_rgb_used = false;
}

void SetDefaultViewports();
/// Apply this state as the current OpenGL state
void Apply() const;
void Apply();

void ApplyFramebufferState() const;
void ApplyVertexArrayState() const;
@@ -237,11 +238,41 @@ public:
/// Viewport does not affect glClearBuffer, so emulate viewport using the scissor test
void EmulateViewportWithScissor();

void MarkDirtyBlendState() {
dirty.blend_state = true;
}

void MarkDirtyStencilState() {
dirty.stencil_state = true;
}

void MarkDirtyPolygonOffset() {
dirty.polygon_offset = true;
}

void MarkDirtyColorMask() {
dirty.color_mask = true;
}

void AllDirty() {
dirty.blend_state = true;
dirty.stencil_state = true;
dirty.polygon_offset = true;
dirty.color_mask = true;
}

private:
static OpenGLState cur_state;

// Workaround for sRGB problems caused by QT not supporting sRGB output
static bool s_rgb_used;
struct {
bool blend_state;
bool stencil_state;
bool viewport_state;
bool polygon_offset;
bool color_mask;
} dirty{};
};

} // namespace OpenGL

@@ -31,6 +31,8 @@ using VideoCore::Surface::SurfaceType;

MICROPROFILE_DEFINE(OpenGL_Texture_Upload, "OpenGL", "Texture Upload", MP_RGB(128, 192, 128));
MICROPROFILE_DEFINE(OpenGL_Texture_Download, "OpenGL", "Texture Download", MP_RGB(128, 192, 128));
MICROPROFILE_DEFINE(OpenGL_Texture_Buffer_Copy, "OpenGL", "Texture Buffer Copy",
MP_RGB(128, 192, 128));

namespace {

@@ -483,11 +485,15 @@ void TextureCacheOpenGL::ImageBlit(View& src_view, View& dst_view,
const auto& dst_params{dst_view->GetSurfaceParams()};

OpenGLState prev_state{OpenGLState::GetCurState()};
SCOPE_EXIT({ prev_state.Apply(); });
SCOPE_EXIT({
prev_state.AllDirty();
prev_state.Apply();
});

OpenGLState state;
state.draw.read_framebuffer = src_framebuffer.handle;
state.draw.draw_framebuffer = dst_framebuffer.handle;
state.AllDirty();
state.Apply();

u32 buffers{};
@@ -535,6 +541,7 @@ void TextureCacheOpenGL::ImageBlit(View& src_view, View& dst_view,
}

void TextureCacheOpenGL::BufferCopy(Surface& src_surface, Surface& dst_surface) {
MICROPROFILE_SCOPE(OpenGL_Texture_Buffer_Copy);
const auto& src_params = src_surface->GetSurfaceParams();
const auto& dst_params = dst_surface->GetSurfaceParams();
UNIMPLEMENTED_IF(src_params.num_levels > 1 || dst_params.num_levels > 1);

@@ -101,7 +101,6 @@ RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::Syst

RendererOpenGL::~RendererOpenGL() = default;

/// Swap buffers (render frame)
void RendererOpenGL::SwapBuffers(
std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) {

@@ -109,6 +108,7 @@ void RendererOpenGL::SwapBuffers(

// Maintain the rasterizer's state as a priority
OpenGLState prev_state = OpenGLState::GetCurState();
state.AllDirty();
state.Apply();

if (framebuffer) {
@@ -130,6 +130,8 @@ void RendererOpenGL::SwapBuffers(

DrawScreen(render_window.GetFramebufferLayout());

rasterizer->TickFrame();

render_window.SwapBuffers();
}

@@ -139,6 +141,7 @@ void RendererOpenGL::SwapBuffers(
system.GetPerfStats().BeginSystemFrame();

// Restore the rasterizer state
prev_state.AllDirty();
prev_state.Apply();
}

@@ -205,6 +208,7 @@ void RendererOpenGL::InitOpenGLObjects() {
// Link shaders and get variable locations
shader.CreateFromSource(vertex_shader, nullptr, fragment_shader);
state.draw.shader_program = shader.handle;
state.AllDirty();
state.Apply();
uniform_modelview_matrix = glGetUniformLocation(shader.handle, "modelview_matrix");
uniform_color_texture = glGetUniformLocation(shader.handle, "color_texture");
@@ -262,7 +266,6 @@ void RendererOpenGL::CreateRasterizer() {
if (rasterizer) {
return;
}
// Initialize sRGB Usage
OpenGLState::ClearsRGBUsed();
rasterizer = std::make_unique<RasterizerOpenGL>(system, emu_window, screen_info);
}
@@ -338,12 +341,14 @@ void RendererOpenGL::DrawScreenTriangles(const ScreenInfo& screen_info, float x,
// Workaround brightness problems in SMO by enabling sRGB in the final output
// if it has been used in the frame. Needed because of this bug in QT: QTBUG-50987
state.framebuffer_srgb.enabled = OpenGLState::GetsRGBUsed();
state.AllDirty();
state.Apply();
glNamedBufferSubData(vertex_buffer.handle, 0, sizeof(vertices), vertices.data());
glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
// Restore default state
state.framebuffer_srgb.enabled = false;
state.texture_units[0].texture = 0;
state.AllDirty();
state.Apply();
// Clear sRGB state for the next frame
OpenGLState::ClearsRGBUsed();
@@ -388,6 +393,7 @@ void RendererOpenGL::CaptureScreenshot() {
GLuint old_read_fb = state.draw.read_framebuffer;
GLuint old_draw_fb = state.draw.draw_framebuffer;
state.draw.read_framebuffer = state.draw.draw_framebuffer = screenshot_framebuffer.handle;
state.AllDirty();
state.Apply();

Layout::FramebufferLayout layout{renderer_settings.screenshot_framebuffer_layout};
@@ -407,6 +413,7 @@ void RendererOpenGL::CaptureScreenshot() {
screenshot_framebuffer.Release();
state.draw.read_framebuffer = old_read_fb;
state.draw.draw_framebuffer = old_draw_fb;
state.AllDirty();
state.Apply();
glDeleteRenderbuffers(1, &renderbuffer);

@@ -13,29 +13,67 @@

namespace OpenGL {

VertexArrayPushBuffer::VertexArrayPushBuffer() = default;

VertexArrayPushBuffer::~VertexArrayPushBuffer() = default;

void VertexArrayPushBuffer::Setup(GLuint vao_) {
vao = vao_;
index_buffer = nullptr;
vertex_buffers.clear();
}

void VertexArrayPushBuffer::SetIndexBuffer(const GLuint* buffer) {
index_buffer = buffer;
}

void VertexArrayPushBuffer::SetVertexBuffer(GLuint binding_index, const GLuint* buffer,
GLintptr offset, GLsizei stride) {
vertex_buffers.push_back(Entry{binding_index, buffer, offset, stride});
}

void VertexArrayPushBuffer::Bind() {
if (index_buffer) {
glVertexArrayElementBuffer(vao, *index_buffer);
}

// TODO(Rodrigo): Find a way to ARB_multi_bind this
for (const auto& entry : vertex_buffers) {
glVertexArrayVertexBuffer(vao, entry.binding_index, *entry.buffer, entry.offset,
entry.stride);
}
}

BindBuffersRangePushBuffer::BindBuffersRangePushBuffer(GLenum target) : target{target} {}

BindBuffersRangePushBuffer::~BindBuffersRangePushBuffer() = default;

void BindBuffersRangePushBuffer::Setup(GLuint first_) {
first = first_;
buffers.clear();
buffer_pointers.clear();
offsets.clear();
sizes.clear();
}

void BindBuffersRangePushBuffer::Push(GLuint buffer, GLintptr offset, GLsizeiptr size) {
buffers.push_back(buffer);
void BindBuffersRangePushBuffer::Push(const GLuint* buffer, GLintptr offset, GLsizeiptr size) {
buffer_pointers.push_back(buffer);
offsets.push_back(offset);
sizes.push_back(size);
}

void BindBuffersRangePushBuffer::Bind() const {
const std::size_t count{buffers.size()};
void BindBuffersRangePushBuffer::Bind() {
// Ensure sizes are valid.
const std::size_t count{buffer_pointers.size()};
DEBUG_ASSERT(count == offsets.size() && count == sizes.size());
if (count == 0) {
return;
}

// Dereference buffers.
buffers.resize(count);
std::transform(buffer_pointers.begin(), buffer_pointers.end(), buffers.begin(),
[](const GLuint* pointer) { return *pointer; });

glBindBuffersRange(target, first, static_cast<GLsizei>(count), buffers.data(), offsets.data(),
sizes.data());
}
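Both push buffers now take a pointer to the handle rather than the handle itself and only dereference it inside Bind(). The usual motivation for this pattern (not spelled out in the diff) is that the GLuint behind the pointer may still be created or replaced between queuing and binding; reading it late always picks up the final value. A usage sketch with illustrative names:

    BindBuffersRangePushBuffer ubos{GL_UNIFORM_BUFFER};
    ubos.Setup(0);
    ubos.Push(&staging.handle, offset, size);  // handle may not be generated yet
    // ... staging.handle is (re)created here ...
    ubos.Bind();                               // dereferences &staging.handle only now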

@@ -11,20 +11,49 @@

namespace OpenGL {

class BindBuffersRangePushBuffer {
class VertexArrayPushBuffer final {
public:
BindBuffersRangePushBuffer(GLenum target);
explicit VertexArrayPushBuffer();
~VertexArrayPushBuffer();

void Setup(GLuint vao_);

void SetIndexBuffer(const GLuint* buffer);

void SetVertexBuffer(GLuint binding_index, const GLuint* buffer, GLintptr offset,
GLsizei stride);

void Bind();

private:
struct Entry {
GLuint binding_index{};
const GLuint* buffer{};
GLintptr offset{};
GLsizei stride{};
};

GLuint vao{};
const GLuint* index_buffer{};
std::vector<Entry> vertex_buffers;
};

class BindBuffersRangePushBuffer final {
public:
explicit BindBuffersRangePushBuffer(GLenum target);
~BindBuffersRangePushBuffer();

void Setup(GLuint first_);

void Push(GLuint buffer, GLintptr offset, GLsizeiptr size);
void Push(const GLuint* buffer, GLintptr offset, GLsizeiptr size);

void Bind() const;
void Bind();

private:
GLenum target;
GLuint first;
GLenum target{};
GLuint first{};
std::vector<const GLuint*> buffer_pointers;

std::vector<GLuint> buffers;
std::vector<GLintptr> offsets;
std::vector<GLsizeiptr> sizes;

@@ -205,10 +205,6 @@ public:
}

private:
using OperationDecompilerFn = Id (SPIRVDecompiler::*)(Operation);
using OperationDecompilersArray =
std::array<OperationDecompilerFn, static_cast<std::size_t>(OperationCode::Amount)>;

static constexpr auto INTERNAL_FLAGS_COUNT = static_cast<std::size_t>(InternalFlag::Amount);

void AllocateBindings() {
@@ -430,20 +426,17 @@ private:
instance_index = DeclareBuiltIn(spv::BuiltIn::InstanceIndex, spv::StorageClass::Input,
t_in_uint, "instance_index");

bool is_point_size_declared = false;
bool is_clip_distances_declared = false;
for (const auto index : ir.GetOutputAttributes()) {
if (index == Attribute::Index::PointSize) {
is_point_size_declared = true;
} else if (index == Attribute::Index::ClipDistances0123 ||
index == Attribute::Index::ClipDistances4567) {
if (index == Attribute::Index::ClipDistances0123 ||
index == Attribute::Index::ClipDistances4567) {
is_clip_distances_declared = true;
}
}

std::vector<Id> members;
members.push_back(t_float4);
if (is_point_size_declared) {
if (ir.UsesPointSize()) {
members.push_back(t_float);
}
if (is_clip_distances_declared) {
@@ -466,7 +459,7 @@ private:

position_index = MemberDecorateBuiltIn(spv::BuiltIn::Position, "position", true);
point_size_index =
MemberDecorateBuiltIn(spv::BuiltIn::PointSize, "point_size", is_point_size_declared);
MemberDecorateBuiltIn(spv::BuiltIn::PointSize, "point_size", ir.UsesPointSize());
clip_distances_index = MemberDecorateBuiltIn(spv::BuiltIn::ClipDistance, "clip_distances",
is_clip_distances_declared);

@@ -712,7 +705,8 @@ private:
case Attribute::Index::Position:
return AccessElement(t_out_float, per_vertex, position_index,
abuf->GetElement());
case Attribute::Index::PointSize:
case Attribute::Index::LayerViewportPointSize:
UNIMPLEMENTED_IF(abuf->GetElement() != 3);
return AccessElement(t_out_float, per_vertex, point_size_index);
case Attribute::Index::ClipDistances0123:
return AccessElement(t_out_float, per_vertex, clip_distances_index,
@@ -806,12 +800,7 @@ private:
return {};
}

Id LogicalAll2(Operation operation) {
UNIMPLEMENTED();
return {};
}

Id LogicalAny2(Operation operation) {
Id LogicalAnd2(Operation operation) {
UNIMPLEMENTED();
return {};
}
@@ -1208,7 +1197,7 @@ private:
return {};
}

static constexpr OperationDecompilersArray operation_decompilers = {
static constexpr std::array operation_decompilers = {
&SPIRVDecompiler::Assign,

&SPIRVDecompiler::Ternary<&Module::OpSelect, Type::Float, Type::Bool, Type::Float,
@@ -1293,8 +1282,7 @@ private:
&SPIRVDecompiler::Binary<&Module::OpLogicalNotEqual, Type::Bool>,
&SPIRVDecompiler::Unary<&Module::OpLogicalNot, Type::Bool>,
&SPIRVDecompiler::LogicalPick2,
&SPIRVDecompiler::LogicalAll2,
&SPIRVDecompiler::LogicalAny2,
&SPIRVDecompiler::LogicalAnd2,

&SPIRVDecompiler::Binary<&Module::OpFOrdLessThan, Type::Bool, Type::Float>,
&SPIRVDecompiler::Binary<&Module::OpFOrdEqual, Type::Bool, Type::Float>,
@@ -1359,6 +1347,7 @@ private:
&SPIRVDecompiler::WorkGroupId<1>,
&SPIRVDecompiler::WorkGroupId<2>,
};
static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));

const VKDevice& device;
const ShaderIR& ir;

@@ -46,12 +46,12 @@ void ShaderIR::Decode() {
coverage_end = shader_info.end;
if (shader_info.decompilable) {
disable_flow_stack = true;
const auto insert_block = ([this](NodeBlock& nodes, u32 label) {
const auto insert_block = [this](NodeBlock& nodes, u32 label) {
if (label == exit_branch) {
return;
}
basic_blocks.insert({label, nodes});
});
};
const auto& blocks = shader_info.blocks;
NodeBlock current_block;
u32 current_label = exit_branch;
@@ -103,7 +103,7 @@ void ShaderIR::DecodeRangeInner(NodeBlock& bb, u32 begin, u32 end) {
}

void ShaderIR::InsertControlFlow(NodeBlock& bb, const ShaderBlock& block) {
const auto apply_conditions = ([&](const Condition& cond, Node n) -> Node {
const auto apply_conditions = [&](const Condition& cond, Node n) -> Node {
Node result = n;
if (cond.cc != ConditionCode::T) {
result = Conditional(GetConditionCode(cond.cc), {result});
@@ -117,7 +117,7 @@ void ShaderIR::InsertControlFlow(NodeBlock& bb, const ShaderBlock& block) {
result = Conditional(GetPredicate(pred, is_neg), {result});
}
return result;
});
};
if (block.branch.address < 0) {
if (block.branch.kills) {
Node n = Operation(OperationCode::Discard);

@@ -23,38 +23,51 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) {
Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hsetp2.type_a);
op_a = GetOperandAbsNegHalf(op_a, instr.hsetp2.abs_a, instr.hsetp2.negate_a);

Node op_b = [&]() {
switch (opcode->get().GetId()) {
case OpCode::Id::HSETP2_R:
return GetOperandAbsNegHalf(GetRegister(instr.gpr20), instr.hsetp2.abs_a,
instr.hsetp2.negate_b);
default:
UNREACHABLE();
return Immediate(0);
}
}();
op_b = UnpackHalfFloat(op_b, instr.hsetp2.type_b);

// We can't use the constant predicate as destination.
ASSERT(instr.hsetp2.pred3 != static_cast<u64>(Pred::UnusedIndex));

const Node second_pred = GetPredicate(instr.hsetp2.pred39, instr.hsetp2.neg_pred != 0);
Tegra::Shader::PredCondition cond{};
bool h_and{};
Node op_b{};
switch (opcode->get().GetId()) {
case OpCode::Id::HSETP2_C:
cond = instr.hsetp2.cbuf_and_imm.cond;
h_and = instr.hsetp2.cbuf_and_imm.h_and;
op_b = GetOperandAbsNegHalf(GetConstBuffer(instr.cbuf34.index, instr.cbuf34.offset),
instr.hsetp2.cbuf.abs_b, instr.hsetp2.cbuf.negate_b);
break;
case OpCode::Id::HSETP2_IMM:
cond = instr.hsetp2.cbuf_and_imm.cond;
h_and = instr.hsetp2.cbuf_and_imm.h_and;
op_b = UnpackHalfImmediate(instr, true);
break;
case OpCode::Id::HSETP2_R:
cond = instr.hsetp2.reg.cond;
h_and = instr.hsetp2.reg.h_and;
op_b =
UnpackHalfFloat(GetOperandAbsNegHalf(GetRegister(instr.gpr20), instr.hsetp2.reg.abs_b,
instr.hsetp2.reg.negate_b),
instr.hsetp2.reg.type_b);
break;
default:
UNREACHABLE();
op_b = Immediate(0);
}

const OperationCode combiner = GetPredicateCombiner(instr.hsetp2.op);
const OperationCode pair_combiner =
instr.hsetp2.h_and ? OperationCode::LogicalAll2 : OperationCode::LogicalAny2;
const Node pred39 = GetPredicate(instr.hsetp2.pred39, instr.hsetp2.neg_pred);

const Node comparison = GetPredicateComparisonHalf(instr.hsetp2.cond, op_a, op_b);
const Node first_pred = Operation(pair_combiner, comparison);
const auto Write = [&](u64 dest, Node src) {
SetPredicate(bb, dest, Operation(combiner, std::move(src), pred39));
};

// Set the primary predicate to the result of Predicate OP SecondPredicate
const Node value = Operation(combiner, first_pred, second_pred);
SetPredicate(bb, instr.hsetp2.pred3, value);

if (instr.hsetp2.pred0 != static_cast<u64>(Pred::UnusedIndex)) {
// Set the secondary predicate to the result of !Predicate OP SecondPredicate, if enabled
const Node negated_pred = Operation(OperationCode::LogicalNegate, first_pred);
SetPredicate(bb, instr.hsetp2.pred0, Operation(combiner, negated_pred, second_pred));
const Node comparison = GetPredicateComparisonHalf(cond, op_a, op_b);
const u64 first = instr.hsetp2.pred0;
const u64 second = instr.hsetp2.pred3;
if (h_and) {
const Node joined = Operation(OperationCode::LogicalAnd2, comparison);
Write(first, joined);
Write(second, Operation(OperationCode::LogicalNegate, joined));
} else {
Write(first, Operation(OperationCode::LogicalPick2, comparison, Immediate(0u)));
Write(second, Operation(OperationCode::LogicalPick2, comparison, Immediate(1u)));
}

return pc;
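HSETP2 compares both fp16 lanes of a packed pair at once, so the comparison yields a bool2. With .H_AND the lanes are folded into a single predicate and its negation; without it, LogicalPick2 routes lane 0 and lane 1 into the two destination predicates. A reduced, runnable model of the write-back (the combiner is shown as a plain logical AND for illustration; the real combiner comes from GetPredicateCombiner):

    #include <utility>

    // Returns {pred0, pred3} given the per-lane comparison results.
    std::pair<bool, bool> hsetp2_writeback(bool lane0, bool lane1, bool pred39, bool h_and) {
        if (h_and) {
            const bool joined = lane0 && lane1;            // LogicalAnd2
            return {joined && pred39, !joined && pred39};  // second gets the negation
        }
        return {lane0 && pred39, lane1 && pred39};         // LogicalPick2 on each lane
    }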

@@ -95,12 +95,8 @@ const Image& ShaderIR::GetImage(Tegra::Shader::Image image, Tegra::Shader::Image
const Image& ShaderIR::GetBindlessImage(Tegra::Shader::Register reg,
Tegra::Shader::ImageType type) {
const Node image_register{GetRegister(reg)};
const Node base_image{
const auto [base_image, cbuf_index, cbuf_offset]{
TrackCbuf(image_register, global_code, static_cast<s64>(global_code.size()))};
const auto cbuf{std::get_if<CbufNode>(&*base_image)};
const auto cbuf_offset_imm{std::get_if<ImmediateNode>(&*cbuf->GetOffset())};
const auto cbuf_offset{cbuf_offset_imm->GetValue()};
const auto cbuf_index{cbuf->GetIndex()};
const auto cbuf_key{(static_cast<u64>(cbuf_index) << 32) | static_cast<u64>(cbuf_offset)};

// If this image has already been used, return the existing mapping.

@@ -95,10 +95,10 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
const Node op_b =
GetConstBufferIndirect(instr.cbuf36.index, instr.cbuf36.GetOffset() + 4, index);

SetTemporal(bb, 0, op_a);
SetTemporal(bb, 1, op_b);
SetRegister(bb, instr.gpr0, GetTemporal(0));
SetRegister(bb, instr.gpr0.Value() + 1, GetTemporal(1));
SetTemporary(bb, 0, op_a);
SetTemporary(bb, 1, op_b);
SetRegister(bb, instr.gpr0, GetTemporary(0));
SetRegister(bb, instr.gpr0.Value() + 1, GetTemporary(1));
break;
}
default:
@@ -136,9 +136,9 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
}
}();
for (u32 i = 0; i < count; ++i)
SetTemporal(bb, i, GetLmem(i * 4));
SetTemporary(bb, i, GetLmem(i * 4));
for (u32 i = 0; i < count; ++i)
SetRegister(bb, instr.gpr0.Value() + i, GetTemporal(i));
SetRegister(bb, instr.gpr0.Value() + i, GetTemporary(i));
break;
}
default:
@@ -172,10 +172,10 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
Operation(OperationCode::UAdd, NO_PRECISE, real_address_base, it_offset);
const Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor);

SetTemporal(bb, i, gmem);
SetTemporary(bb, i, gmem);
}
for (u32 i = 0; i < count; ++i) {
SetRegister(bb, instr.gpr0.Value() + i, GetTemporal(i));
SetRegister(bb, instr.gpr0.Value() + i, GetTemporary(i));
}
break;
}
@@ -253,11 +253,11 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
TrackAndGetGlobalMemory(bb, instr, true);

// Encode in temporary registers like this: real_base_address, {registers_to_be_written...}
SetTemporal(bb, 0, real_address_base);
SetTemporary(bb, 0, real_address_base);

const u32 count = GetUniformTypeElementsCount(type);
for (u32 i = 0; i < count; ++i) {
SetTemporal(bb, i + 1, GetRegister(instr.gpr0.Value() + i));
SetTemporary(bb, i + 1, GetRegister(instr.gpr0.Value() + i));
}
for (u32 i = 0; i < count; ++i) {
const Node it_offset = Immediate(i * 4);
@@ -265,7 +265,7 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
Operation(OperationCode::UAdd, NO_PRECISE, real_address_base, it_offset);
const Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor);

bb.push_back(Operation(OperationCode::Assign, gmem, GetTemporal(i + 1)));
bb.push_back(Operation(OperationCode::Assign, gmem, GetTemporary(i + 1)));
}
break;
}
@@ -297,18 +297,13 @@ std::tuple<Node, Node, GlobalMemoryBase> ShaderIR::TrackAndGetGlobalMemory(NodeB
const auto addr_register{GetRegister(instr.gmem.gpr)};
const auto immediate_offset{static_cast<u32>(instr.gmem.offset)};

const Node base_address{
TrackCbuf(addr_register, global_code, static_cast<s64>(global_code.size()))};
const auto cbuf = std::get_if<CbufNode>(&*base_address);
ASSERT(cbuf != nullptr);
const auto cbuf_offset_imm = std::get_if<ImmediateNode>(&*cbuf->GetOffset());
ASSERT(cbuf_offset_imm != nullptr);
const auto cbuf_offset = cbuf_offset_imm->GetValue();
const auto [base_address, index, offset] =
TrackCbuf(addr_register, global_code, static_cast<s64>(global_code.size()));
ASSERT(base_address != nullptr);

bb.push_back(
Comment(fmt::format("Base address is c[0x{:x}][0x{:x}]", cbuf->GetIndex(), cbuf_offset)));
bb.push_back(Comment(fmt::format("Base address is c[0x{:x}][0x{:x}]", index, offset)));

const GlobalMemoryBase descriptor{cbuf->GetIndex(), cbuf_offset};
const GlobalMemoryBase descriptor{index, offset};
const auto& [entry, is_new] = used_global_memory.try_emplace(descriptor);
auto& usage = entry->second;
if (is_write) {

@@ -102,7 +102,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
PRECISE, op_a, Immediate(3));
const Node operand =
Operation(OperationCode::IAdd, PRECISE, convert, Immediate(target));
branch = Operation(OperationCode::BranchIndirect, convert);
branch = Operation(OperationCode::BranchIndirect, operand);
}

const Tegra::Shader::ConditionCode cc = instr.flow_condition_code;

@@ -181,10 +181,10 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
const Node value =
Operation(OperationCode::TextureQueryDimensions, meta,
GetRegister(instr.gpr8.Value() + (is_bindless ? 1 : 0)));
SetTemporal(bb, indexer++, value);
SetTemporary(bb, indexer++, value);
}
for (u32 i = 0; i < indexer; ++i) {
SetRegister(bb, instr.gpr0.Value() + i, GetTemporal(i));
SetRegister(bb, instr.gpr0.Value() + i, GetTemporary(i));
}
break;
}
@@ -238,10 +238,10 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
auto params = coords;
MetaTexture meta{sampler, {}, {}, {}, {}, {}, {}, element};
const Node value = Operation(OperationCode::TextureQueryLod, meta, std::move(params));
SetTemporal(bb, indexer++, value);
SetTemporary(bb, indexer++, value);
}
for (u32 i = 0; i < indexer; ++i) {
SetRegister(bb, instr.gpr0.Value() + i, GetTemporal(i));
SetRegister(bb, instr.gpr0.Value() + i, GetTemporary(i));
}
break;
}
@@ -308,13 +308,9 @@ const Sampler& ShaderIR::GetSampler(const Tegra::Shader::Sampler& sampler, Textu
const Sampler& ShaderIR::GetBindlessSampler(const Tegra::Shader::Register& reg, TextureType type,
bool is_array, bool is_shadow) {
const Node sampler_register = GetRegister(reg);
const Node base_sampler =
const auto [base_sampler, cbuf_index, cbuf_offset] =
TrackCbuf(sampler_register, global_code, static_cast<s64>(global_code.size()));
const auto cbuf = std::get_if<CbufNode>(&*base_sampler);
const auto cbuf_offset_imm = std::get_if<ImmediateNode>(&*cbuf->GetOffset());
ASSERT(cbuf_offset_imm != nullptr);
const auto cbuf_offset = cbuf_offset_imm->GetValue();
const auto cbuf_index = cbuf->GetIndex();
ASSERT(base_sampler != nullptr);
const auto cbuf_key = (static_cast<u64>(cbuf_index) << 32) | static_cast<u64>(cbuf_offset);

// If this sampler has already been used, return the existing mapping.
@@ -340,11 +336,11 @@ void ShaderIR::WriteTexInstructionFloat(NodeBlock& bb, Instruction instr, const
// Skip disabled components
continue;
}
SetTemporal(bb, dest_elem++, components[elem]);
SetTemporary(bb, dest_elem++, components[elem]);
}
// After writing values in temporaries, move them to the real registers
for (u32 i = 0; i < dest_elem; ++i) {
SetRegister(bb, instr.gpr0.Value() + i, GetTemporal(i));
SetRegister(bb, instr.gpr0.Value() + i, GetTemporary(i));
}
}

@@ -357,17 +353,17 @@ void ShaderIR::WriteTexsInstructionFloat(NodeBlock& bb, Instruction instr,
for (u32 component = 0; component < 4; ++component) {
if (!instr.texs.IsComponentEnabled(component))
continue;
SetTemporal(bb, dest_elem++, components[component]);
SetTemporary(bb, dest_elem++, components[component]);
}

for (u32 i = 0; i < dest_elem; ++i) {
if (i < 2) {
// Write the first two swizzle components to gpr0 and gpr0+1
SetRegister(bb, instr.gpr0.Value() + i % 2, GetTemporal(i));
SetRegister(bb, instr.gpr0.Value() + i % 2, GetTemporary(i));
} else {
ASSERT(instr.texs.HasTwoDestinations());
// Write the rest of the swizzle components to gpr28 and gpr28+1
SetRegister(bb, instr.gpr28.Value() + i % 2, GetTemporal(i));
SetRegister(bb, instr.gpr28.Value() + i % 2, GetTemporary(i));
}
}
}
@@ -395,11 +391,11 @@ void ShaderIR::WriteTexsInstructionHalfFloat(NodeBlock& bb, Instruction instr,
return;
}

SetTemporal(bb, 0, first_value);
SetTemporal(bb, 1, Operation(OperationCode::HPack2, values[2], values[3]));
SetTemporary(bb, 0, first_value);
SetTemporary(bb, 1, Operation(OperationCode::HPack2, values[2], values[3]));

SetRegister(bb, instr.gpr0, GetTemporal(0));
SetRegister(bb, instr.gpr28, GetTemporal(1));
SetRegister(bb, instr.gpr0, GetTemporary(0));
SetRegister(bb, instr.gpr28, GetTemporary(1));
}

Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type,

@@ -73,8 +73,8 @@ u32 ShaderIR::DecodeXmad(NodeBlock& bb, u32 pc) {
if (is_psl) {
product = Operation(OperationCode::ILogicalShiftLeft, NO_PRECISE, product, Immediate(16));
}
SetTemporal(bb, 0, product);
product = GetTemporal(0);
SetTemporary(bb, 0, product);
product = GetTemporary(0);

const Node original_c = op_c;
const Tegra::Shader::XmadMode set_mode = mode; // Workaround to clang compile error
@@ -98,13 +98,13 @@ u32 ShaderIR::DecodeXmad(NodeBlock& bb, u32 pc) {
}
}();

SetTemporal(bb, 1, op_c);
op_c = GetTemporal(1);
SetTemporary(bb, 1, op_c);
op_c = GetTemporary(1);

// TODO(Rodrigo): Use an appropriate sign for this operation
Node sum = Operation(OperationCode::IAdd, product, op_c);
SetTemporal(bb, 2, sum);
sum = GetTemporal(2);
SetTemporary(bb, 2, sum);
sum = GetTemporary(2);
if (is_merge) {
const Node a = BitfieldExtract(sum, 0, 16);
const Node b =

@@ -101,8 +101,7 @@ enum class OperationCode {
LogicalXor, /// (bool a, bool b) -> bool
LogicalNegate, /// (bool a) -> bool
LogicalPick2, /// (bool2 pair, uint index) -> bool
LogicalAll2, /// (bool2 a) -> bool
LogicalAny2, /// (bool2 a) -> bool
LogicalAnd2, /// (bool2 a) -> bool

LogicalFLessThan, /// (float a, float b) -> bool
LogicalFEqual, /// (float a, float b) -> bool

@@ -12,7 +12,7 @@
namespace VideoCommon::Shader {

Node Conditional(Node condition, std::vector<Node> code) {
return MakeNode<ConditionalNode>(condition, std::move(code));
return MakeNode<ConditionalNode>(std::move(condition), std::move(code));
}

Node Comment(std::string text) {

@@ -61,8 +61,17 @@ Node ShaderIR::GetConstBufferIndirect(u64 index_, u64 offset_, Node node) {
const auto [entry, is_new] = used_cbufs.try_emplace(index);
entry->second.MarkAsUsedIndirect();

const Node final_offset = Operation(OperationCode::UAdd, NO_PRECISE, node, Immediate(offset));
return MakeNode<CbufNode>(index, final_offset);
Node final_offset = [&] {
// Attempt to inline constant buffer without a variable offset. This is done to allow
// tracking LDC calls.
if (const auto gpr = std::get_if<GprNode>(&*node)) {
if (gpr->GetIndex() == Register::ZeroIndex) {
return Immediate(offset);
}
}
return Operation(OperationCode::UAdd, NO_PRECISE, std::move(node), Immediate(offset));
}();
return MakeNode<CbufNode>(index, std::move(final_offset));
}
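TrackCbuf now returns the tracked node together with the constant-buffer index and offset, and GetConstBufferIndirect folds the variable part of an LDC address away when it is the zero register, which always reads as zero on the hardware; that leaves a compile-time constant offset the tracking passes can follow. The fold in isolation, with added commentary:

    // If the variable operand is the zero register, cbuf[index][gpr + imm]
    // degenerates to cbuf[index][imm], so emit an immediate offset instead
    // of a UAdd the tracker could not resolve.
    if (const auto gpr = std::get_if<GprNode>(&*node)) {
        if (gpr->GetIndex() == Register::ZeroIndex) {
            return Immediate(offset);  // statically known constant-buffer offset
        }
    }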

Node ShaderIR::GetPredicate(u64 pred_, bool negated) {
@@ -80,7 +89,7 @@ Node ShaderIR::GetPredicate(bool immediate) {

Node ShaderIR::GetInputAttribute(Attribute::Index index, u64 element, Node buffer) {
used_input_attributes.emplace(index);
return MakeNode<AbufNode>(index, static_cast<u32>(element), buffer);
return MakeNode<AbufNode>(index, static_cast<u32>(element), std::move(buffer));
}

Node ShaderIR::GetPhysicalInputAttribute(Tegra::Shader::Register physical_address, Node buffer) {
@@ -89,6 +98,22 @@ Node ShaderIR::GetPhysicalInputAttribute(Tegra::Shader::Register physical_addres
}

Node ShaderIR::GetOutputAttribute(Attribute::Index index, u64 element, Node buffer) {
if (index == Attribute::Index::LayerViewportPointSize) {
switch (element) {
case 0:
UNIMPLEMENTED();
break;
case 1:
uses_layer = true;
break;
case 2:
uses_viewport_index = true;
break;
case 3:
uses_point_size = true;
break;
}
}
if (index == Attribute::Index::ClipDistances0123 ||
index == Attribute::Index::ClipDistances4567) {
const auto clip_index =
@@ -97,7 +122,7 @@ Node ShaderIR::GetOutputAttribute(Attribute::Index index, u64 element, Node buff
}
used_output_attributes.insert(index);

return MakeNode<AbufNode>(index, static_cast<u32>(element), buffer);
return MakeNode<AbufNode>(index, static_cast<u32>(element), std::move(buffer));
}

Node ShaderIR::GetInternalFlag(InternalFlag flag, bool negated) {
@@ -109,19 +134,19 @@ Node ShaderIR::GetInternalFlag(InternalFlag flag, bool negated) {
}

Node ShaderIR::GetLocalMemory(Node address) {
return MakeNode<LmemNode>(address);
return MakeNode<LmemNode>(std::move(address));
}

Node ShaderIR::GetTemporal(u32 id) {
Node ShaderIR::GetTemporary(u32 id) {
return GetRegister(Register::ZeroIndex + 1 + id);
}

Node ShaderIR::GetOperandAbsNegFloat(Node value, bool absolute, bool negate) {
if (absolute) {
value = Operation(OperationCode::FAbsolute, NO_PRECISE, value);
value = Operation(OperationCode::FAbsolute, NO_PRECISE, std::move(value));
}
if (negate) {
value = Operation(OperationCode::FNegate, NO_PRECISE, value);
value = Operation(OperationCode::FNegate, NO_PRECISE, std::move(value));
}
return value;
}
@@ -130,24 +155,26 @@ Node ShaderIR::GetSaturatedFloat(Node value, bool saturate) {
if (!saturate) {
return value;
}
const Node positive_zero = Immediate(std::copysignf(0, 1));
const Node positive_one = Immediate(1.0f);
return Operation(OperationCode::FClamp, NO_PRECISE, value, positive_zero, positive_one);

Node positive_zero = Immediate(std::copysignf(0, 1));
Node positive_one = Immediate(1.0f);
return Operation(OperationCode::FClamp, NO_PRECISE, std::move(value), std::move(positive_zero),
std::move(positive_one));
}

Node ShaderIR::ConvertIntegerSize(Node value, Tegra::Shader::Register::Size size, bool is_signed) {
Node ShaderIR::ConvertIntegerSize(Node value, Register::Size size, bool is_signed) {
switch (size) {
case Register::Size::Byte:
value = SignedOperation(OperationCode::ILogicalShiftLeft, is_signed, NO_PRECISE, value,
Immediate(24));
value = SignedOperation(OperationCode::IArithmeticShiftRight, is_signed, NO_PRECISE, value,
Immediate(24));
value = SignedOperation(OperationCode::ILogicalShiftLeft, is_signed, NO_PRECISE,
std::move(value), Immediate(24));
value = SignedOperation(OperationCode::IArithmeticShiftRight, is_signed, NO_PRECISE,
std::move(value), Immediate(24));
return value;
case Register::Size::Short:
value = SignedOperation(OperationCode::ILogicalShiftLeft, is_signed, NO_PRECISE, value,
Immediate(16));
value = SignedOperation(OperationCode::IArithmeticShiftRight, is_signed, NO_PRECISE, value,
Immediate(16));
value = SignedOperation(OperationCode::ILogicalShiftLeft, is_signed, NO_PRECISE,
std::move(value), Immediate(16));
value = SignedOperation(OperationCode::IArithmeticShiftRight, is_signed, NO_PRECISE,
std::move(value), Immediate(16));
case Register::Size::Word:
// Default - do nothing
return value;
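ConvertIntegerSize narrows a 32-bit register read to its logical width by shifting the payload to the top of the word and back down; SignedOperation selects an arithmetic right shift for signed reads (sign extension) and a logical one otherwise (zero extension). The Byte case in plain C++, assuming two's-complement arithmetic shifts as on the targets in question:

    #include <cstdint>

    uint32_t raw = 0x000000F5;                          // low byte 0xF5 encodes -11 as int8
    int32_t  sign_extended = int32_t(raw << 24) >> 24;  // arithmetic shift -> 0xFFFFFFF5 == -11
    uint32_t zero_extended = (raw << 24) >> 24;         // logical shift    -> 0x000000F5 == 245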
@@ -163,27 +190,29 @@ Node ShaderIR::GetOperandAbsNegInteger(Node value, bool absolute, bool negate, b
return value;
}
if (absolute) {
value = Operation(OperationCode::IAbsolute, NO_PRECISE, value);
value = Operation(OperationCode::IAbsolute, NO_PRECISE, std::move(value));
}
if (negate) {
value = Operation(OperationCode::INegate, NO_PRECISE, value);
value = Operation(OperationCode::INegate, NO_PRECISE, std::move(value));
}
return value;
}

Node ShaderIR::UnpackHalfImmediate(Instruction instr, bool has_negation) {
const Node value = Immediate(instr.half_imm.PackImmediates());
Node value = Immediate(instr.half_imm.PackImmediates());
if (!has_negation) {
return value;
}
const Node first_negate = GetPredicate(instr.half_imm.first_negate != 0);
const Node second_negate = GetPredicate(instr.half_imm.second_negate != 0);

return Operation(OperationCode::HNegate, NO_PRECISE, value, first_negate, second_negate);
Node first_negate = GetPredicate(instr.half_imm.first_negate != 0);
Node second_negate = GetPredicate(instr.half_imm.second_negate != 0);

return Operation(OperationCode::HNegate, NO_PRECISE, std::move(value), std::move(first_negate),
std::move(second_negate));
}

Node ShaderIR::UnpackHalfFloat(Node value, Tegra::Shader::HalfType type) {
return Operation(OperationCode::HUnpack, type, value);
return Operation(OperationCode::HUnpack, type, std::move(value));
}

Node ShaderIR::HalfMerge(Node dest, Node src, Tegra::Shader::HalfMerge merge) {
@@ -191,11 +220,11 @@ Node ShaderIR::HalfMerge(Node dest, Node src, Tegra::Shader::HalfMerge merge) {
case Tegra::Shader::HalfMerge::H0_H1:
return src;
case Tegra::Shader::HalfMerge::F32:
return Operation(OperationCode::HMergeF32, src);
return Operation(OperationCode::HMergeF32, std::move(src));
case Tegra::Shader::HalfMerge::Mrg_H0:
return Operation(OperationCode::HMergeH0, dest, src);
return Operation(OperationCode::HMergeH0, std::move(dest), std::move(src));
case Tegra::Shader::HalfMerge::Mrg_H1:
return Operation(OperationCode::HMergeH1, dest, src);
return Operation(OperationCode::HMergeH1, std::move(dest), std::move(src));
}
UNREACHABLE();
return src;
@@ -203,10 +232,10 @@ Node ShaderIR::HalfMerge(Node dest, Node src, Tegra::Shader::HalfMerge merge) {

Node ShaderIR::GetOperandAbsNegHalf(Node value, bool absolute, bool negate) {
if (absolute) {
value = Operation(OperationCode::HAbsolute, NO_PRECISE, value);
value = Operation(OperationCode::HAbsolute, NO_PRECISE, std::move(value));
}
if (negate) {
value = Operation(OperationCode::HNegate, NO_PRECISE, value, GetPredicate(true),
value = Operation(OperationCode::HNegate, NO_PRECISE, std::move(value), GetPredicate(true),
GetPredicate(true));
}
return value;
@@ -216,9 +245,11 @@ Node ShaderIR::GetSaturatedHalfFloat(Node value, bool saturate) {
if (!saturate) {
return value;
}
const Node positive_zero = Immediate(std::copysignf(0, 1));
const Node positive_one = Immediate(1.0f);
return Operation(OperationCode::HClamp, NO_PRECISE, value, positive_zero, positive_one);

Node positive_zero = Immediate(std::copysignf(0, 1));
Node positive_one = Immediate(1.0f);
return Operation(OperationCode::HClamp, NO_PRECISE, std::move(value), std::move(positive_zero),
std::move(positive_one));
}

Node ShaderIR::GetPredicateComparisonFloat(PredCondition condition, Node op_a, Node op_b) {
@@ -246,7 +277,6 @@ Node ShaderIR::GetPredicateComparisonFloat(PredCondition condition, Node op_a, N
condition == PredCondition::LessEqualWithNan ||
condition == PredCondition::GreaterThanWithNan ||
condition == PredCondition::GreaterEqualWithNan) {

predicate = Operation(OperationCode::LogicalOr, predicate,
Operation(OperationCode::LogicalFIsNan, op_a));
predicate = Operation(OperationCode::LogicalOr, predicate,
@@ -275,7 +305,8 @@ Node ShaderIR::GetPredicateComparisonInteger(PredCondition condition, bool is_si
UNIMPLEMENTED_IF_MSG(comparison == PredicateComparisonTable.end(),
"Unknown predicate comparison operation");

Node predicate = SignedOperation(comparison->second, is_signed, NO_PRECISE, op_a, op_b);
Node predicate = SignedOperation(comparison->second, is_signed, NO_PRECISE, std::move(op_a),
std::move(op_b));

UNIMPLEMENTED_IF_MSG(condition == PredCondition::LessThanWithNan ||
condition == PredCondition::NotEqualWithNan ||
|
||||
@@ -305,9 +336,7 @@ Node ShaderIR::GetPredicateComparisonHalf(Tegra::Shader::PredCondition condition
|
||||
UNIMPLEMENTED_IF_MSG(comparison == PredicateComparisonTable.end(),
|
||||
"Unknown predicate comparison operation");
|
||||
|
||||
const Node predicate = Operation(comparison->second, NO_PRECISE, op_a, op_b);
|
||||
|
||||
return predicate;
|
||||
return Operation(comparison->second, NO_PRECISE, std::move(op_a), std::move(op_b));
|
||||
}
|
||||
|
||||
OperationCode ShaderIR::GetPredicateCombiner(PredOperation operation) {
|
||||
@@ -333,31 +362,32 @@ Node ShaderIR::GetConditionCode(Tegra::Shader::ConditionCode cc) {
|
||||
}
|
||||
|
||||
void ShaderIR::SetRegister(NodeBlock& bb, Register dest, Node src) {
|
||||
bb.push_back(Operation(OperationCode::Assign, GetRegister(dest), src));
|
||||
bb.push_back(Operation(OperationCode::Assign, GetRegister(dest), std::move(src)));
|
||||
}
|
||||
|
||||
void ShaderIR::SetPredicate(NodeBlock& bb, u64 dest, Node src) {
|
||||
bb.push_back(Operation(OperationCode::LogicalAssign, GetPredicate(dest), src));
|
||||
bb.push_back(Operation(OperationCode::LogicalAssign, GetPredicate(dest), std::move(src)));
|
||||
}
|
||||
|
||||
void ShaderIR::SetInternalFlag(NodeBlock& bb, InternalFlag flag, Node value) {
|
||||
bb.push_back(Operation(OperationCode::LogicalAssign, GetInternalFlag(flag), value));
|
||||
bb.push_back(Operation(OperationCode::LogicalAssign, GetInternalFlag(flag), std::move(value)));
|
||||
}
|
||||
|
||||
void ShaderIR::SetLocalMemory(NodeBlock& bb, Node address, Node value) {
|
||||
bb.push_back(Operation(OperationCode::Assign, GetLocalMemory(address), value));
|
||||
bb.push_back(
|
||||
Operation(OperationCode::Assign, GetLocalMemory(std::move(address)), std::move(value)));
|
||||
}
|
||||
|
||||
void ShaderIR::SetTemporal(NodeBlock& bb, u32 id, Node value) {
|
||||
SetRegister(bb, Register::ZeroIndex + 1 + id, value);
|
||||
void ShaderIR::SetTemporary(NodeBlock& bb, u32 id, Node value) {
|
||||
SetRegister(bb, Register::ZeroIndex + 1 + id, std::move(value));
|
||||
}
|
||||
|
||||
void ShaderIR::SetInternalFlagsFromFloat(NodeBlock& bb, Node value, bool sets_cc) {
|
||||
if (!sets_cc) {
|
||||
return;
|
||||
}
|
||||
const Node zerop = Operation(OperationCode::LogicalFEqual, value, Immediate(0.0f));
|
||||
SetInternalFlag(bb, InternalFlag::Zero, zerop);
|
||||
Node zerop = Operation(OperationCode::LogicalFEqual, std::move(value), Immediate(0.0f));
|
||||
SetInternalFlag(bb, InternalFlag::Zero, std::move(zerop));
|
||||
LOG_WARNING(HW_GPU, "Condition codes implementation is incomplete");
|
||||
}
|
||||
|
||||
@@ -365,14 +395,14 @@ void ShaderIR::SetInternalFlagsFromInteger(NodeBlock& bb, Node value, bool sets_
|
||||
if (!sets_cc) {
|
||||
return;
|
||||
}
|
||||
const Node zerop = Operation(OperationCode::LogicalIEqual, value, Immediate(0));
|
||||
SetInternalFlag(bb, InternalFlag::Zero, zerop);
|
||||
Node zerop = Operation(OperationCode::LogicalIEqual, std::move(value), Immediate(0));
|
||||
SetInternalFlag(bb, InternalFlag::Zero, std::move(zerop));
|
||||
LOG_WARNING(HW_GPU, "Condition codes implementation is incomplete");
|
||||
}
|
||||
|
||||
Node ShaderIR::BitfieldExtract(Node value, u32 offset, u32 bits) {
|
||||
return Operation(OperationCode::UBitfieldExtract, NO_PRECISE, value, Immediate(offset),
|
||||
Immediate(bits));
|
||||
return Operation(OperationCode::UBitfieldExtract, NO_PRECISE, std::move(value),
|
||||
Immediate(offset), Immediate(bits));
|
||||
}
|
||||
|
||||
} // namespace VideoCommon::Shader
|
||||
|
||||
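Note on the pattern above: every hunk in this file swaps a plain copy of a Node argument for std::move. Assuming Node is an alias for std::shared_ptr<NodeData> (as video_core/shader/node.h declares), each copy costs an atomic reference-count increment and a later decrement, while a move only transfers the pointer. A minimal sketch of the difference; the Consume helper is illustrative, not yuzu API:

#include <memory>
#include <utility>

struct NodeData {};
using Node = std::shared_ptr<NodeData>; // assumed alias, mirroring node.h

// Illustrative helper that takes its operand by value, like Operation() above.
Node Consume(Node value) {
    return value; // the by-value parameter is implicitly moved into the result
}

int main() {
    Node value = std::make_shared<NodeData>();
    value = Consume(value);            // copy: atomic increment, then decrement
    value = Consume(std::move(value)); // move: no reference-count traffic
    return 0;
}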
@@ -5,13 +5,10 @@
#pragma once

#include <array>
#include <cstring>
#include <map>
#include <optional>
#include <set>
#include <string>
#include <tuple>
#include <variant>
#include <vector>

#include "common/common_types.h"
@@ -115,6 +112,18 @@ public:
        return static_cast<std::size_t>(coverage_end * sizeof(u64));
    }

    bool UsesLayer() const {
        return uses_layer;
    }

    bool UsesViewportIndex() const {
        return uses_viewport_index;
    }

    bool UsesPointSize() const {
        return uses_point_size;
    }

    bool HasPhysicalAttributes() const {
        return uses_physical_attributes;
    }
@@ -198,8 +207,8 @@ private:
    Node GetInternalFlag(InternalFlag flag, bool negated = false);
    /// Generates a node representing a local memory address
    Node GetLocalMemory(Node address);
    /// Generates a temporal, internally it uses a post-RZ register
    Node GetTemporal(u32 id);
    /// Generates a temporary, internally it uses a post-RZ register
    Node GetTemporary(u32 id);

    /// Sets a register. src value must be a number-evaluated node.
    void SetRegister(NodeBlock& bb, Tegra::Shader::Register dest, Node src);
@@ -209,8 +218,8 @@ private:
    void SetInternalFlag(NodeBlock& bb, InternalFlag flag, Node value);
    /// Sets a local memory address. address and value must be a number-evaluated node
    void SetLocalMemory(NodeBlock& bb, Node address, Node value);
    /// Sets a temporal. Internally it uses a post-RZ register
    void SetTemporal(NodeBlock& bb, u32 id, Node value);
    /// Sets a temporary. Internally it uses a post-RZ register
    void SetTemporary(NodeBlock& bb, u32 id, Node value);

    /// Sets internal flags from a float
    void SetInternalFlagsFromFloat(NodeBlock& bb, Node value, bool sets_cc = true);
@@ -316,7 +325,7 @@ private:
    void WriteLop3Instruction(NodeBlock& bb, Tegra::Shader::Register dest, Node op_a, Node op_b,
                              Node op_c, Node imm_lut, bool sets_cc);

    Node TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const;
    std::tuple<Node, u32, u32> TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const;

    std::optional<u32> TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor) const;

@@ -346,6 +355,9 @@ private:
    std::set<Image> used_images;
    std::array<bool, Tegra::Engines::Maxwell3D::Regs::NumClipDistances> used_clip_distances{};
    std::map<GlobalMemoryBase, GlobalMemoryUsage> used_global_memory;
    bool uses_layer{};
    bool uses_viewport_index{};
    bool uses_point_size{};
    bool uses_physical_attributes{}; // Shader uses AL2P or physical attribute read/writes

    Tegra::Shader::Header header;
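The new Uses* getters are plain observers over the uses_* flags added in the private section. A hedged sketch of how a backend could consume them; ShaderIRLike and QueryOutputs are stand-ins for illustration, not yuzu API:

// Stand-in mirroring the getter surface declared above.
struct ShaderIRLike {
    bool uses_layer = false;
    bool uses_viewport_index = false;
    bool uses_point_size = false;

    bool UsesLayer() const { return uses_layer; }
    bool UsesViewportIndex() const { return uses_viewport_index; }
    bool UsesPointSize() const { return uses_point_size; }
};

struct DeclaredOutputs {
    bool layer;
    bool viewport_index;
    bool point_size;
};

// A code generator could query these once and declare only the built-ins it needs.
DeclaredOutputs QueryOutputs(const ShaderIRLike& ir) {
    return {ir.UsesLayer(), ir.UsesViewportIndex(), ir.UsesPointSize()};
}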
@@ -15,56 +15,63 @@ namespace {
std::pair<Node, s64> FindOperation(const NodeBlock& code, s64 cursor,
                                   OperationCode operation_code) {
    for (; cursor >= 0; --cursor) {
        const Node node = code.at(cursor);
        Node node = code.at(cursor);

        if (const auto operation = std::get_if<OperationNode>(&*node)) {
            if (operation->GetCode() == operation_code) {
                return {node, cursor};
                return {std::move(node), cursor};
            }
        }

        if (const auto conditional = std::get_if<ConditionalNode>(&*node)) {
            const auto& conditional_code = conditional->GetCode();
            const auto [found, internal_cursor] = FindOperation(
            auto [found, internal_cursor] = FindOperation(
                conditional_code, static_cast<s64>(conditional_code.size() - 1), operation_code);
            if (found) {
                return {found, cursor};
                return {std::move(found), cursor};
            }
        }
    }
    return {};
}
} // namespace
} // Anonymous namespace

Node ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const {
std::tuple<Node, u32, u32> ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code,
                                               s64 cursor) const {
    if (const auto cbuf = std::get_if<CbufNode>(&*tracked)) {
        // Cbuf found, but it has to be immediate
        return std::holds_alternative<ImmediateNode>(*cbuf->GetOffset()) ? tracked : nullptr;
        // Constant buffer found, test if it's an immediate
        const auto offset = cbuf->GetOffset();
        if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) {
            return {tracked, cbuf->GetIndex(), immediate->GetValue()};
        }
        return {};
    }
    if (const auto gpr = std::get_if<GprNode>(&*tracked)) {
        if (gpr->GetIndex() == Tegra::Shader::Register::ZeroIndex) {
            return nullptr;
            return {};
        }
        // Reduce the cursor by one to avoid infinite loops when the instruction sets the same
        // register that it uses as an operand
        const auto [source, new_cursor] = TrackRegister(gpr, code, cursor - 1);
        if (!source) {
            return nullptr;
            return {};
        }
        return TrackCbuf(source, code, new_cursor);
    }
    if (const auto operation = std::get_if<OperationNode>(&*tracked)) {
        for (std::size_t i = 0; i < operation->GetOperandsCount(); ++i) {
            if (const auto found = TrackCbuf((*operation)[i], code, cursor)) {
                // Cbuf found in operand
            if (auto found = TrackCbuf((*operation)[i], code, cursor); std::get<0>(found)) {
                // Cbuf found in operand.
                return found;
            }
        }
        return nullptr;
        return {};
    }
    if (const auto conditional = std::get_if<ConditionalNode>(&*tracked)) {
        const auto& conditional_code = conditional->GetCode();
        return TrackCbuf(tracked, conditional_code, static_cast<s64>(conditional_code.size()));
    }
    return nullptr;
    return {};
}

std::optional<u32> ShaderIR::TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor) const {
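The reworked TrackCbuf reports the constant buffer index and immediate offset alongside the tracked node, and signals failure with a value-initialized tuple (return {}) instead of nullptr. A self-contained sketch of the calling pattern; FindConstBuffer and its names are hypothetical, not yuzu API:

#include <memory>
#include <tuple>

struct NodeData {};
using Node = std::shared_ptr<NodeData>; // assumed alias, mirroring node.h
using u32 = unsigned int;               // stand-in for common/common_types.h

// Hypothetical tracker with the same contract as TrackCbuf above: on success it
// yields {node, constant buffer index, immediate offset}; on failure it returns a
// value-initialized tuple, so the Node element is null.
std::tuple<Node, u32, u32> FindConstBuffer(bool succeed) {
    if (!succeed) {
        return {};
    }
    return {std::make_shared<NodeData>(), 3, 0x40};
}

int main() {
    const auto [node, index, offset] = FindConstBuffer(true);
    if (node) {
        // Found: e.g. record that a sampler handle lives in cbuf `index` at `offset`.
    }
    return 0;
}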
@@ -75,9 +75,12 @@ MatchStructureResult SurfaceBaseImpl::MatchesStructure(const SurfaceParams& rhs)

    // Linear Surface check
    if (!params.is_tiled) {
        if (std::tie(params.width, params.height, params.pitch) ==
            std::tie(rhs.width, rhs.height, rhs.pitch)) {
            return MatchStructureResult::FullMatch;
        if (std::tie(params.height, params.pitch) == std::tie(rhs.height, rhs.pitch)) {
            if (params.width == rhs.width) {
                return MatchStructureResult::FullMatch;
            } else {
                return MatchStructureResult::SemiMatch;
            }
        }
        return MatchStructureResult::None;
    }
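Restated compactly (enum values and field names taken from the hunk; the free function is illustrative): pitch and height must match exactly for a linear surface to be reusable, and a mismatched width now degrades to SemiMatch instead of rejecting the candidate outright.

using u32 = unsigned int; // stand-in for common/common_types.h

enum class MatchStructureResult { FullMatch, SemiMatch, None };

// Illustrative condensation of the linear-surface branch above.
MatchStructureResult MatchLinear(u32 width, u32 height, u32 pitch,
                                 u32 rhs_width, u32 rhs_height, u32 rhs_pitch) {
    if (height == rhs_height && pitch == rhs_pitch) {
        return width == rhs_width ? MatchStructureResult::FullMatch
                                  : MatchStructureResult::SemiMatch;
    }
    return MatchStructureResult::None;
}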
@@ -200,8 +200,9 @@ public:
        modification_tick = tick;
    }

    void MarkAsRenderTarget(const bool is_target) {
    void MarkAsRenderTarget(const bool is_target, const u32 index) {
        this->is_target = is_target;
        this->index = index;
    }

    void MarkAsPicked(const bool is_picked) {
@@ -221,6 +222,10 @@ public:
        return is_target;
    }

    u32 GetRenderTarget() const {
        return index;
    }

    bool IsRegistered() const {
        return is_registered;
    }
@@ -307,10 +312,13 @@ private:
        return view;
    }

    static constexpr u32 NO_RT = 0xFFFFFFFF;

    bool is_modified{};
    bool is_target{};
    bool is_registered{};
    bool is_picked{};
    u32 index{NO_RT};
    u64 modification_tick{};
};
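A surface now remembers which render target slot it is bound to, defaulting to the NO_RT sentinel until bound. A trimmed sketch of the pattern; the class name is illustrative:

#include <cstdint>

// Illustrative class; fields and methods mirror the hunk above.
class SurfaceSlot {
public:
    static constexpr std::uint32_t NO_RT = 0xFFFFFFFF;

    void MarkAsRenderTarget(bool target, std::uint32_t rt_index) {
        is_target = target;
        index = rt_index;
    }

    std::uint32_t GetRenderTarget() const {
        return index;
    }

private:
    bool is_target{};
    std::uint32_t index{NO_RT}; // sentinel until the surface is bound to a slot
};

int main() {
    SurfaceSlot surface;
    surface.MarkAsRenderTarget(true, 2);                    // bound as color target 2
    surface.MarkAsRenderTarget(false, SurfaceSlot::NO_RT);  // unbound again
    return 0;
}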
@@ -290,12 +290,19 @@ std::size_t SurfaceParams::GetLayerSize(bool as_host_size, bool uncompressed) co

std::size_t SurfaceParams::GetInnerMipmapMemorySize(u32 level, bool as_host_size,
                                                    bool uncompressed) const {
    const bool tiled{as_host_size ? false : is_tiled};
    const u32 width{GetMipmapSize(uncompressed, GetMipWidth(level), GetDefaultBlockWidth())};
    const u32 height{GetMipmapSize(uncompressed, GetMipHeight(level), GetDefaultBlockHeight())};
    const u32 depth{is_layered ? 1U : GetMipDepth(level)};
    return Tegra::Texture::CalculateSize(tiled, GetBytesPerPixel(), width, height, depth,
                                         GetMipBlockHeight(level), GetMipBlockDepth(level));
    if (is_tiled) {
        return Tegra::Texture::CalculateSize(!as_host_size, GetBytesPerPixel(), width, height,
                                             depth, GetMipBlockHeight(level),
                                             GetMipBlockDepth(level));
    } else if (as_host_size || IsBuffer()) {
        return GetBytesPerPixel() * width * height * depth;
    } else {
        // Linear Texture Case
        return pitch * height * depth;
    }
}

bool SurfaceParams::operator==(const SurfaceParams& rhs) const {
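A compact model of the new three-way branch; the block-linear case is reduced to a placeholder stand-in, since Tegra::Texture::CalculateSize is out of scope here:

#include <cstddef>

// Hypothetical stand-in for the real block-linear size computation.
std::size_t CalculateTiledSize(std::size_t bpp, std::size_t width, std::size_t height,
                               std::size_t depth) {
    return bpp * width * height * depth; // real code accounts for GOB/block dimensions
}

std::size_t InnerMipSize(bool is_tiled, bool as_host_size, bool is_buffer,
                         std::size_t bpp, std::size_t width, std::size_t height,
                         std::size_t depth, std::size_t pitch) {
    if (is_tiled) {
        // Guest copies keep the block-linear layout; host copies are linearized.
        return CalculateTiledSize(bpp, width, height, depth);
    } else if (as_host_size || is_buffer) {
        return bpp * width * height * depth;
    } else {
        // Linear guest textures are addressed by row pitch, not by width.
        return pitch * height * depth;
    }
}

Worked example with illustrative numbers: for a linear 256x128 RGBA8 mip with a 1024-byte pitch, the guest size is pitch * height * depth = 1024 * 128 * 1 = 131072 bytes, and the host size is 4 * 256 * 128 * 1 = 131072 bytes; the two only coincide because the pitch happens to equal width * bytes-per-pixel.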
@@ -116,10 +116,10 @@ public:
        std::lock_guard lock{mutex};
        auto& maxwell3d = system.GPU().Maxwell3D();

        if (!maxwell3d.dirty_flags.zeta_buffer) {
        if (!maxwell3d.dirty.depth_buffer) {
            return depth_buffer.view;
        }
        maxwell3d.dirty_flags.zeta_buffer = false;
        maxwell3d.dirty.depth_buffer = false;

        const auto& regs{maxwell3d.regs};
        const auto gpu_addr{regs.zeta.Address()};
@@ -133,11 +133,11 @@ public:
            regs.zeta.memory_layout.block_depth, regs.zeta.memory_layout.type)};
        auto surface_view = GetSurface(gpu_addr, depth_params, preserve_contents, true);
        if (depth_buffer.target)
            depth_buffer.target->MarkAsRenderTarget(false);
            depth_buffer.target->MarkAsRenderTarget(false, NO_RT);
        depth_buffer.target = surface_view.first;
        depth_buffer.view = surface_view.second;
        if (depth_buffer.target)
            depth_buffer.target->MarkAsRenderTarget(true);
            depth_buffer.target->MarkAsRenderTarget(true, DEPTH_RT);
        return surface_view.second;
    }

@@ -145,10 +145,10 @@ public:
        std::lock_guard lock{mutex};
        ASSERT(index < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets);
        auto& maxwell3d = system.GPU().Maxwell3D();
        if (!maxwell3d.dirty_flags.color_buffer[index]) {
        if (!maxwell3d.dirty.render_target[index]) {
            return render_targets[index].view;
        }
        maxwell3d.dirty_flags.color_buffer.reset(index);
        maxwell3d.dirty.render_target[index] = false;

        const auto& regs{maxwell3d.regs};
        if (index >= regs.rt_control.count || regs.rt[index].Address() == 0 ||
@@ -167,11 +167,11 @@ public:
        auto surface_view = GetSurface(gpu_addr, SurfaceParams::CreateForFramebuffer(system, index),
                                       preserve_contents, true);
        if (render_targets[index].target)
            render_targets[index].target->MarkAsRenderTarget(false);
            render_targets[index].target->MarkAsRenderTarget(false, NO_RT);
        render_targets[index].target = surface_view.first;
        render_targets[index].view = surface_view.second;
        if (render_targets[index].target)
            render_targets[index].target->MarkAsRenderTarget(true);
            render_targets[index].target->MarkAsRenderTarget(true, static_cast<u32>(index));
        return surface_view.second;
    }

@@ -191,7 +191,7 @@ public:
        if (depth_buffer.target == nullptr) {
            return;
        }
        depth_buffer.target->MarkAsRenderTarget(false);
        depth_buffer.target->MarkAsRenderTarget(false, NO_RT);
        depth_buffer.target = nullptr;
        depth_buffer.view = nullptr;
    }
@@ -200,7 +200,7 @@ public:
        if (render_targets[index].target == nullptr) {
            return;
        }
        render_targets[index].target->MarkAsRenderTarget(false);
        render_targets[index].target->MarkAsRenderTarget(false, NO_RT);
        render_targets[index].target = nullptr;
        render_targets[index].view = nullptr;
    }
@@ -270,6 +270,17 @@ protected:
    // and reading it from a separate buffer.
    virtual void BufferCopy(TSurface& src_surface, TSurface& dst_surface) = 0;

    void ManageRenderTargetUnregister(TSurface& surface) {
        auto& maxwell3d = system.GPU().Maxwell3D();
        const u32 index = surface->GetRenderTarget();
        if (index == DEPTH_RT) {
            maxwell3d.dirty.depth_buffer = true;
        } else {
            maxwell3d.dirty.render_target[index] = true;
        }
        maxwell3d.dirty.render_settings = true;
    }

    void Register(TSurface surface) {
        const GPUVAddr gpu_addr = surface->GetGpuAddr();
        const CacheAddr cache_ptr = ToCacheAddr(system.GPU().MemoryManager().GetPointer(gpu_addr));
@@ -294,6 +305,9 @@ protected:
        if (guard_render_targets && surface->IsProtected()) {
            return;
        }
        if (!guard_render_targets && surface->IsRenderTarget()) {
            ManageRenderTargetUnregister(surface);
        }
        const GPUVAddr gpu_addr = surface->GetGpuAddr();
        const CacheAddr cache_ptr = surface->GetCacheAddr();
        const std::size_t size = surface->GetSizeInBytes();
@@ -649,15 +663,6 @@ private:
            }
            return {current_surface, *view};
        }
        // The next case is unsafe, so if we are in accurate GPU emulation, just skip it
        if (Settings::values.use_accurate_gpu_emulation) {
            return RecycleSurface(overlaps, params, gpu_addr, preserve_contents,
                                  MatchTopologyResult::FullMatch);
        }
        // This is the case where the texture is a part of the parent.
        if (current_surface->MatchesSubTexture(params, gpu_addr)) {
            return RebuildSurface(current_surface, params, is_render);
        }
    } else {
        // If there are many overlaps, odds are they are subtextures of the candidate
        // surface. We try to construct a new surface based on the candidate parameters,
@@ -793,6 +798,9 @@ private:
    static constexpr u64 registry_page_size{1 << registry_page_bits};
    std::unordered_map<CacheAddr, std::vector<TSurface>> registry;

    static constexpr u32 DEPTH_RT = 8;
    static constexpr u32 NO_RT = 0xFFFFFFFF;

    // The L1 Cache is used for fast texture lookup before checking the overlaps.
    // This avoids calculating size and other stuff.
    std::unordered_map<CacheAddr, TSurface> l1_cache;
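Taken together, the texture-cache hunks replace the old dirty_flags.zeta_buffer/color_buffer pair with a dirty structure and make Unregister re-flag any slot whose surface was evicted, so the next GetColorBufferSurface/GetDepthBufferSurface call rebuilds the view instead of returning a stale one. A hedged sketch of that flow; the struct layout is inferred from the hunks, and the NO_RT guard is an extra safety check added here (the real ManageRenderTargetUnregister is only reached for bound render targets):

#include <array>
#include <cstdint>

// Assumed shape of the new dirty-flag structure, inferred from the hunks above.
struct DirtyRegs {
    bool depth_buffer = false;
    std::array<bool, 8> render_target{};
    bool render_settings = false;
};

constexpr std::uint32_t DEPTH_RT = 8;       // sentinel slot used for the depth buffer
constexpr std::uint32_t NO_RT = 0xFFFFFFFF; // "not bound to any slot"

// Mirrors ManageRenderTargetUnregister: evicting a bound surface re-flags its slot.
void FlagUnregistered(DirtyRegs& dirty, std::uint32_t index) {
    if (index == DEPTH_RT) {
        dirty.depth_buffer = true;
    } else if (index != NO_RT) {
        dirty.render_target[index] = true;
    }
    dirty.render_settings = true;
}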