Compare commits

...

65 Commits

Author SHA1 Message Date
Fernando Sahmkow
11f4e739bd Shader_Ir: Implement F16 Variants of F2F, F2I, I2F.
This commit implements the F16 variants of the conversion
instructions and ensures the conversions are performed correctly.
2019-07-20 17:38:25 -04:00
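For reference, a minimal scalar sketch of the widening an F16-to-F32 conversion performs. This is illustrative C++ only, not yuzu code; the decompiler emits the equivalent conversion in GLSL.

#include <cstdint>
#include <cstring>

// Widen an IEEE half (binary16) to float (binary32), covering zeros,
// subnormals, infinities and NaNs.
float HalfToFloat(std::uint16_t h) {
    const std::uint32_t sign = static_cast<std::uint32_t>(h & 0x8000u) << 16;
    int exponent = (h >> 10) & 0x1F;
    std::uint32_t mantissa = h & 0x3FFu;
    std::uint32_t bits;
    if (exponent == 0) {
        if (mantissa == 0) {
            bits = sign; // signed zero
        } else {
            // Subnormal half: renormalize until the implicit leading 1 appears.
            exponent = -14;
            while ((mantissa & 0x400u) == 0) {
                mantissa <<= 1;
                --exponent;
            }
            mantissa &= 0x3FFu; // drop the implicit leading 1
            bits = sign | (static_cast<std::uint32_t>(exponent + 127) << 23) | (mantissa << 13);
        }
    } else if (exponent == 0x1F) {
        bits = sign | 0x7F800000u | (mantissa << 13); // infinity or NaN
    } else {
        bits = sign | (static_cast<std::uint32_t>(exponent + 112) << 23) | (mantissa << 13);
    }
    float out;
    std::memcpy(&out, &bits, sizeof(out));
    return out;
}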
Fernando Sahmkow
0a67416971 Merge pull request #2693 from ReinUsesLisp/hsetp2
shader/half_set_predicate: Implement missing HSETP2 variants
2019-07-20 17:25:08 -04:00
Flame Sage
369be67039 Update README.md 2019-07-20 19:24:24 +00:00
Flame Sage
aa599ac709 Update README.md 2019-07-20 19:22:30 +00:00
Flame Sage
a2edb27158 Merge pull request #2752 from DarkLordZach/master
azure: Fix clang-format and releases
2019-07-20 15:20:53 -04:00
Zach Hilman
f470bcb826 azure: Fix clang-format and releases 2019-07-20 15:19:25 -04:00
ReinUsesLisp
45c162444d shader/half_set_predicate: Fix HSETP2 implementation 2019-07-19 22:21:22 -03:00
ReinUsesLisp
6c4985edc9 shader/half_set_predicate: Implement missing HSETP2 variants 2019-07-19 22:20:47 -03:00
bunnei
5d369112d9 Merge pull request #2687 from lioncash/tls-process
kernel/process: Allocate the process' TLS region during initialization
2019-07-18 13:53:04 -04:00
bunnei
63bda67a34 Merge pull request #2738 from lioncash/shader-ir
shader-ir: Minor cleanup-related changes
2019-07-18 13:52:01 -04:00
David
d4b95bfc25 Merge pull request #2741 from FernandoS27/trace-log
Kernel: Downgrade WaitForAddress and SignalToAddress messages to Trace.
2019-07-18 13:58:29 +10:00
Fernando Sahmkow
5e457bf258 Kernel: Downgrade WaitForAddress and SignalToAddress messages to Trace.
These messages were originally set to warning since few games used these
svcs and they were needed for debugging. This is no longer the case.
2019-07-17 22:05:47 -04:00
Fernando Sahmkow
223a535f3f Merge pull request #2740 from lioncash/bra
shader/decode/other: Correct branch indirect argument within BRA handling
2019-07-17 14:25:08 -04:00
Rodrigo Locatti
c3218c110f Merge pull request #2726 from lioncash/access
core: Remove CurrentArmInterface() global accessor
2019-07-17 03:42:16 -03:00
Lioncash
bebbdc2067 shader_ir: std::move Node instance where applicable
These are std::shared_ptr instances underneath the hood, which means
copying them isn't as cheap as a regular pointer. Particularly so on
weakly-ordered systems.

This avoids atomic reference count increments and decrements where they
aren't necessary for the core set of operations.
2019-07-16 19:49:23 -04:00
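As background, a minimal sketch of the cost difference this commit avoids (hypothetical names, not the actual shader_ir types):

#include <memory>
#include <utility>
#include <vector>

struct OperationNode {}; // stand-in for a shader IR node
using Node = std::shared_ptr<OperationNode>;

void AddOperand(std::vector<Node>& operands, Node node) {
    // push_back(node) would copy the shared_ptr and perform an atomic
    // reference-count increment (plus a later decrement on destruction);
    // std::move transfers ownership without touching the counter.
    operands.push_back(std::move(node));
}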
Lioncash
60926ac16b shader_ir: Rename Get/SetTemporal to Get/SetTemporary
This is more accurate in terms of describing what the functions are
actually doing. Temporal relates to time, not the setting of a temporary
itself.
2019-07-16 19:47:43 -04:00
Lioncash
44d87ff641 shader_ir: Remove unused includes
Removes unnecessary header dependencies.
2019-07-16 19:47:42 -04:00
Fernando Sahmkow
b56e7f870a Merge pull request #2565 from ReinUsesLisp/track-indirect
shader/track: Track indirect buffers
2019-07-16 14:58:35 -04:00
Lioncash
e2d7dda166 shader/decode/other: Correct branch indirect argument within BRA handling
This appears to have been a copy/paste error introduced within
8a6fc529a9
2019-07-16 12:20:45 -04:00
Fernando Sahmkow
1bdb59fc6e Merge pull request #2695 from ReinUsesLisp/layer-viewport
gl_shader_decompiler: Implement gl_ViewportIndex and gl_Layer in vertex shaders
2019-07-15 16:28:07 -04:00
bunnei
b77a1ed67a Merge pull request #2705 from FernandoS27/tex-cache-fixes
GPU: Fixes to Texture Cache and Include Microprofiles for GL State/BufferCopy/Macro Interpreter
2019-07-14 22:44:36 -04:00
ReinUsesLisp
afa8096df5 shader: Allow tracking of indirect buffers without variable offset
While changing this code, the tracking code is simplified to allow
returning the base address node; this way, callers don't have to
manually rebuild it on each invocation.
2019-07-14 22:36:44 -03:00
bunnei
3477b92289 Merge pull request #2675 from ReinUsesLisp/opengl-buffer-cache
buffer_cache: Implement a generic buffer cache and its OpenGL backend
2019-07-14 19:03:43 -04:00
Fernando Sahmkow
2ac7472d3f Texture_Cache: Address Feedback 2019-07-14 17:42:39 -04:00
Fernando Sahmkow
0f54b541f4 Texture_Cache: Remove some unprecise fallback case and clang format 2019-07-14 12:00:32 -04:00
Fernando Sahmkow
5818959e54 Texture_Cache: Force Framebuffer reset if an active render target is unregistered. 2019-07-14 12:00:31 -04:00
Fernando Sahmkow
913b7a6872 GPU: Add a microprofile for macro interpreter 2019-07-14 12:00:30 -04:00
Fernando Sahmkow
a9943222f2 GL_State: Add a microprofile timer to OpenGL state. 2019-07-14 12:00:30 -04:00
Fernando Sahmkow
5c1e1a148e Gl_Texture_Cache: Measure Buffer Copy Times 2019-07-14 12:00:29 -04:00
Fernando Sahmkow
5d31bab69a Texture_Cache: Correct Linear Structural Match. 2019-07-14 12:00:28 -04:00
Fernando Sahmkow
4882c058fd Merge pull request #2690 from SciresM/physmem_fixes
Implement MapPhysicalMemory/UnmapPhysicalMemory
2019-07-14 09:16:46 -04:00
Fernando Sahmkow
0ec9da2f9f Merge pull request #2692 from ReinUsesLisp/tlds-f16
shader/texture: Add F16 support for TLDS
2019-07-14 08:44:38 -04:00
Flame Sage
b9e1db1312 Merge pull request #2730 from DarkLordZach/master
Finalize Azure Pipelines Definitions
2019-07-13 21:35:37 -04:00
Zach Hilman
bbc5b5d62d Finalize Azure Pipelines Definitions
2019-07-13 21:34:40 -04:00
Michael Scire
d4fc560c05 Remove unicorn mappings/unmappings 2019-07-11 15:12:33 -07:00
ReinUsesLisp
0eb0c24269 gl_shader_decompiler: Fix gl_PointSize redeclaration 2019-07-11 16:10:59 -03:00
ReinUsesLisp
aca40de224 gl_shader_decompiler: Fix conditional usage of GL_ARB_shader_viewport_layer_array 2019-07-11 04:27:00 -03:00
Michael Scire
a1845d1dd3 prefer system reference over global accessor 2019-07-09 08:11:35 -07:00
Michael Scire
697206092e Prevent merging of device mapped memory blocks.
This sets the DeviceMapped attribute for GPU-mapped memory blocks
and prevents merging device-mapped blocks. This prevents memory
mapped from the GPU from having its backing address changed by
block coalescing.
2019-07-08 22:52:05 -07:00
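A condensed sketch of the guard this describes, with simplified stand-in types (the real change is in VirtualMemoryArea::CanBeMergedWith in the vm_manager.cpp diff further down):

struct Vma {
    bool device_mapped = false; // set when the GPU maps this block
};

// Refuse to coalesce any block the GPU has mapped, so its backing host
// address can never change underneath the device.
bool CanBeMerged(const Vma& left, const Vma& right) {
    if (left.device_mapped || right.device_mapped) {
        return false;
    }
    return true; // remaining type/offset checks elided
}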
ReinUsesLisp
c9d886c84e gl_shader_decompiler: Implement gl_ViewportIndex and gl_Layer in vertex shaders
This commit implements gl_ViewportIndex and gl_Layer in vertex and
geometry shaders. When used in a vertex shader, it requires
ARB_shader_viewport_layer_array. This extension is available on AMD and
Nvidia devices (Mesa and proprietary drivers), but not available on
Intel on any platform. At the time of writing this description, I don't
know whether this is a hardware limitation or a driver limitation.

In the case that ARB_shader_viewport_layer_array is not available,
writes to these registers on a vertex shader are ignored, with the
appropriate logging.
2019-07-07 20:42:55 -03:00
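A rough sketch of the described fallback (illustrative helper, not the actual gl_shader_decompiler code):

#include <cstdio>
#include <string>

// Only emit the vertex-shader write when the extension is present;
// otherwise log and drop the write, as the commit message describes.
std::string EmitViewportIndexWrite(bool has_viewport_layer_array, const std::string& value) {
    if (has_viewport_layer_array) {
        // Requires GL_ARB_shader_viewport_layer_array in a vertex shader.
        return "gl_ViewportIndex = int(" + value + ");\n";
    }
    std::fprintf(stderr, "vertex gl_ViewportIndex write ignored: "
                         "GL_ARB_shader_viewport_layer_array unavailable\n");
    return "";
}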
Michael Scire
ca6f08e3b1 Remove unused member function declaration 2019-07-07 13:02:41 -07:00
Michael Scire
ce64a9fab9 physmem: add helpers, cleanup logic. 2019-07-07 12:55:30 -07:00
Michael Scire
b901cd584e clang-format fixes 2019-07-07 12:08:29 -07:00
ReinUsesLisp
d0966b9f7c shader/texture: Add F16 support for TLDS 2019-07-07 16:05:56 -03:00
Michael Scire
1689784c19 address review commentary 2019-07-07 11:48:11 -07:00
Michael Scire
13a8fde3ad Implement MapPhysicalMemory/UnmapPhysicalMemory
This implements svcMapPhysicalMemory/svcUnmapPhysicalMemory for Yuzu,
which games since 3.0.0 can use to map memory at a desired address.

It also properly parses SystemResourceSize from NPDM, and makes
information available via svcGetInfo.

This is needed for games like Super Smash Bros. and Diablo 3 -- this
PR's implementation does not run into the "ASCII reads" issue mentioned
in the comments of #2626, which was caused by the following bugs in
Yuzu's memory management that this PR also addresses:
* Yuzu's memory coalescing does not properly merge blocks. This results
  in a polluted address space/svcQueryMemory results that would be
  impossible to replicate on hardware, which can lead to game code making
  the wrong assumptions about memory layout.
  * This implements better merging for AllocatedMemoryBlocks (a
    simplified sketch follows this commit entry).
* Yuzu's implementation of svcMirrorMemory unprotected the entire
  virtual memory range containing the range being mirrored. This could
  lead to games attempting to map data at that unprotected
  range/attempting to access that range after yuzu improperly unmapped
  it.
  * This PR fixes it by simply calling ReprotectRange instead of
    Reprotect.
2019-07-07 11:45:53 -07:00
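To make the first bullet concrete, here is a minimal sketch of the merging fix with simplified stand-in types; the real logic is MergeAdjacentVMA in the vm_manager.cpp diff below:

#include <cstddef>
#include <memory>
#include <vector>

// A VMA whose payload lives at an offset inside a shared backing block.
struct Vma {
    std::shared_ptr<std::vector<unsigned char>> backing;
    std::size_t offset = 0;
    std::size_t size = 0;
};

// Merge `right` into `left`. Summing sizes alone is only correct when both
// VMAs are contiguous views of the same backing block; otherwise the bytes
// must actually be concatenated, or the merged VMA loses right's contents.
void MergeRight(Vma& left, const Vma& right) {
    if (left.backing == right.backing && left.offset + left.size == right.offset) {
        left.size += right.size; // same block and contiguous: size bump suffices
        return;
    }
    // Different blocks: append right's bytes. This mirrors the diff's fast
    // case, which assumes left spans its entire backing block.
    left.backing->insert(left.backing->end(),
                         right.backing->begin() + right.offset,
                         right.backing->begin() + right.offset + right.size);
    left.size += right.size;
}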
Lioncash
56c7912159 kernel/process: Allocate the process' TLS region during initialization
Prior to execution within a process beginning, the process establishes
its own TLS region for uses (as far as I can tell) related to exception
handling.

Now that TLS creation has been decoupled from threads themselves, we can add
this behavior to our Process class. This is also good, as it allows us
to remove a stub within svcGetInfo, namely querying the address of that
region.
2019-07-07 14:08:28 -04:00
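As context for the TLS layout referenced here and in the process.h diff below (each TLS area is 0x200 bytes, eight per 0x1000-byte page), a small illustrative sketch of the slot arithmetic, not the actual kernel code:

#include <cstdint>

constexpr std::uint64_t TlsEntrySize = 0x200;
constexpr std::uint64_t PageSize = 0x1000;
constexpr std::uint64_t SlotsPerPage = PageSize / TlsEntrySize; // 8 slots

// Address of the n-th TLS slot within a TLS page.
constexpr std::uint64_t TlsSlotAddress(std::uint64_t page_base, std::uint64_t slot) {
    return page_base + (slot % SlotsPerPage) * TlsEntrySize;
}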
Lioncash
eb6f55d880 kernel/process: Move main thread stack allocation to its own function
Keeps this particular set of behavior isolated to its own function.
2019-07-07 14:08:25 -04:00
ReinUsesLisp
79a23ca5f0 buffer_cache: Avoid [[nodiscard]] to make clang-format happy 2019-07-06 01:17:05 -03:00
ReinUsesLisp
83050c9495 buffer_cache: Try to fix MinGW build 2019-07-06 01:14:05 -03:00
ReinUsesLisp
f7691ebe57 gl_rasterizer: Fix nullptr dereference on disabled buffers 2019-07-06 00:37:56 -03:00
ReinUsesLisp
7ecf64257a gl_rasterizer: Minor style changes 2019-07-06 00:37:55 -03:00
ReinUsesLisp
9cdc576f60 gl_rasterizer: Fix vertex and index data invalidations 2019-07-06 00:37:55 -03:00
ReinUsesLisp
1fa21fa192 gl_buffer_cache: Implement with generic buffer cache 2019-07-06 00:37:55 -03:00
ReinUsesLisp
32c0212b24 buffer_cache: Implement a generic buffer cache
Implements a templated class with a similar approach to our current
generic texture cache. It is designed to be compatible with Vulkan and
OpenGL.
2019-07-06 00:37:55 -03:00
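A rough sketch of the templated idea, with a hypothetical interface for illustration (the real buffer_cache.h begins near the end of this compare view):

#include <cstddef>
#include <cstdint>
#include <unordered_map>
#include <vector>

using GPUVAddr = std::uint64_t;

// BufferStorageType is whatever handle the backend uses for a buffer (for
// example a GLuint for OpenGL or a VkBuffer for Vulkan); UploadFn is the
// backend-specific upload hook. The cache logic itself stays API-agnostic.
template <typename BufferStorageType, typename UploadFn>
class GenericBufferCache {
public:
    explicit GenericBufferCache(UploadFn upload) : upload_{upload} {}

    BufferStorageType GetOrUpload(GPUVAddr gpu_addr, const std::vector<std::uint8_t>& data) {
        const auto it = cache_.find(gpu_addr);
        if (it != cache_.end()) {
            return it->second; // cache hit: reuse the backend buffer
        }
        const BufferStorageType buffer = upload_(data);
        cache_.emplace(gpu_addr, buffer);
        return buffer;
    }

private:
    UploadFn upload_;
    std::unordered_map<GPUVAddr, BufferStorageType> cache_;
};

A backend instantiates it with its own handle type, e.g. a GLuint plus an OpenGL upload callable; a Vulkan backend would supply a VkBuffer equivalent.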
ReinUsesLisp
2bcae41a73 gl_buffer_cache: Remove global system getters 2019-07-06 00:37:55 -03:00
ReinUsesLisp
02ab844934 gl_device: Query SSBO alignment 2019-07-06 00:37:55 -03:00
ReinUsesLisp
d14fbfb9b5 gl_buffer_cache: Implement flushing 2019-07-06 00:37:55 -03:00
ReinUsesLisp
345f852bdb gl_rasterizer: Drop gl_global_cache in favor of gl_buffer_cache 2019-07-06 00:37:55 -03:00
ReinUsesLisp
8155b12d3d gl_buffer_cache: Rework to support internalized buffers 2019-07-06 00:37:55 -03:00
ReinUsesLisp
f8ba72d491 gl_buffer_cache: Store in CachedBufferEntry the used buffer handle 2019-07-06 00:37:55 -03:00
ReinUsesLisp
b54fb8fc4c gl_buffer_cache: Return used buffer from Upload function 2019-07-06 00:37:55 -03:00
ReinUsesLisp
a6d2f52fc3 gl_rasterizer: Add some commentaries 2019-07-06 00:37:55 -03:00
ReinUsesLisp
2b9d4088ec gl_rasterizer: Make DrawParameters rasterizer instance const 2019-07-06 00:37:55 -03:00
ReinUsesLisp
2e39c20da5 gl_rasterizer: Move index buffer uploading to its own method 2019-07-06 00:37:55 -03:00
81 changed files with 2108 additions and 780 deletions


@@ -0,0 +1,15 @@
#!/bin/bash -ex
# Copy documentation
cp license.txt "$REV_NAME"
cp README.md "$REV_NAME"
tar $COMPRESSION_FLAGS "$ARCHIVE_NAME" "$REV_NAME"
mv "$REV_NAME" $RELEASE_NAME
7z a "$REV_NAME.7z" $RELEASE_NAME
# move the compiled archive into the artifacts directory to be uploaded by travis releases
mv "$ARCHIVE_NAME" artifacts/
mv "$REV_NAME.7z" artifacts/


@@ -0,0 +1,6 @@
#!/bin/bash -ex
GITDATE="`git show -s --date=short --format='%ad' | sed 's/-//g'`"
GITREV="`git show -s --format='%h'`"
mkdir -p artifacts


@@ -0,0 +1,6 @@
#!/bin/bash -ex
# Run clang-format
cd /yuzu
chmod a+x ./.ci/scripts/format/script.sh
./.ci/scripts/format/script.sh


@@ -0,0 +1,4 @@
#!/bin/bash -ex
chmod a+x ./.ci/scripts/format/docker.sh
docker run -v $(pwd):/yuzu yuzuemu/build-environments:linux-clang-format /bin/bash -ex /yuzu/.ci/scripts/format/docker.sh


@@ -0,0 +1,37 @@
#!/bin/bash -ex
if grep -nrI '\s$' src *.yml *.txt *.md Doxyfile .gitignore .gitmodules .ci* dist/*.desktop \
        dist/*.svg dist/*.xml; then
    echo Trailing whitespace found, aborting
    exit 1
fi

# The default clang-format points to the old 3.5 version
CLANG_FORMAT=clang-format-6.0
$CLANG_FORMAT --version

if [ "$TRAVIS_EVENT_TYPE" = "pull_request" ]; then
    # Get list of every file modified in this pull request
    files_to_lint="$(git diff --name-only --diff-filter=ACMRTUXB $TRAVIS_COMMIT_RANGE | grep '^src/[^.]*[.]\(cpp\|h\)$' || true)"
else
    # Check everything for branch pushes
    files_to_lint="$(find src/ -name '*.cpp' -or -name '*.h')"
fi

# Turn off tracing for this because it's too verbose
set +x
for f in $files_to_lint; do
    d=$(diff -u "$f" <($CLANG_FORMAT "$f") || true)
    if ! [ -z "$d" ]; then
        echo "!!! $f not compliant to coding style, here is the fix:"
        echo "$d"
        fail=1
    fi
done
set -x

if [ "$fail" = 1 ]; then
    exit 1
fi


@@ -0,0 +1,14 @@
#!/bin/bash -ex
cd /yuzu
ccache -s
mkdir build || true && cd build
cmake .. -G Ninja -DYUZU_USE_BUNDLED_UNICORN=ON -DYUZU_USE_QT_WEB_ENGINE=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=/usr/lib/ccache/gcc -DCMAKE_CXX_COMPILER=/usr/lib/ccache/g++ -DYUZU_ENABLE_COMPATIBILITY_REPORTING=${ENABLE_COMPATIBILITY_REPORTING:-"OFF"} -DENABLE_COMPATIBILITY_LIST_DOWNLOAD=ON -DUSE_DISCORD_PRESENCE=ON
ninja
ccache -s
ctest -VV -C Release


@@ -0,0 +1,5 @@
#!/bin/bash -ex
mkdir -p "ccache" || true
chmod a+x ./.ci/scripts/linux/docker.sh
docker run -e ENABLE_COMPATIBILITY_REPORTING -e CCACHE_DIR=/yuzu/ccache -v $(pwd):/yuzu yuzuemu/build-environments:linux-fresh /bin/bash /yuzu/.ci/scripts/linux/docker.sh


@@ -0,0 +1,14 @@
#!/bin/bash -ex
. .ci/scripts/common/pre-upload.sh
REV_NAME="yuzu-linux-${GITDATE}-${GITREV}"
ARCHIVE_NAME="${REV_NAME}.tar.xz"
COMPRESSION_FLAGS="-cJvf"
mkdir "$REV_NAME"
cp build/bin/yuzu-cmd "$REV_NAME"
cp build/bin/yuzu "$REV_NAME"
. .ci/scripts/common/post-upload.sh


@@ -0,0 +1,28 @@
# Download all pull requests as patches that match a specific label
# Usage: python download-patches-by-label.py <Label to Match> <Root Path Folder to DL to>
import requests, sys, json, urllib3.request, shutil, subprocess

http = urllib3.PoolManager()
dl_list = {}

def check_individual(labels):
    for label in labels:
        if label["name"] == sys.argv[1]:
            return True
    return False

try:
    url = 'https://api.github.com/repos/yuzu-emu/yuzu/pulls'
    response = requests.get(url)
    if response.ok:
        j = json.loads(response.content)
        for pr in j:
            if check_individual(pr["labels"]):
                pn = pr["number"]
                print("Matched PR# %s" % pn)
                print(subprocess.check_output(["git", "fetch", "https://github.com/yuzu-emu/yuzu.git",
                                               "pull/%s/head:pr-%s" % (pn, pn), "-f"]))
                print(subprocess.check_output(["git", "merge", "--squash", "pr-%s" % pn]))
                print(subprocess.check_output(["git", "commit", "-m", "Merge PR %s" % pn]))
except Exception:
    sys.exit(-1)


@@ -0,0 +1,18 @@
# Checks to see if the specified pull request has the specified label
# Usage: python check-label-presence.py <Pull Request ID> <Name of Label>

import requests, json, sys

try:
    url = 'https://api.github.com/repos/yuzu-emu/yuzu/issues/%s' % sys.argv[1]
    response = requests.get(url)
    if response.ok:
        j = json.loads(response.content)
        for label in j["labels"]:
            if label["name"] == sys.argv[2]:
                print('##vso[task.setvariable variable=enabletesting;]true')
                sys.exit()
# Catch Exception rather than using a bare except, which would also swallow
# the SystemExit raised by sys.exit() above and report failure on success.
except Exception:
    sys.exit(-1)

print('##vso[task.setvariable variable=enabletesting;]false')


@@ -0,0 +1,2 @@
git config --global user.email "yuzu@yuzu-emu.org"
git config --global user.name "yuzubot"


@@ -0,0 +1,50 @@
#!/bin/bash -ex
cd /yuzu
ccache -s
# Dirty hack to trick unicorn makefile into believing we are in a MINGW system
mv /bin/uname /bin/uname1 && echo -e '#!/bin/sh\necho MINGW64' >> /bin/uname
chmod +x /bin/uname
# Dirty hack to trick unicorn makefile into believing we have cmd
echo '' >> /bin/cmd
chmod +x /bin/cmd
mkdir build || true && cd build
cmake .. -G Ninja -DCMAKE_TOOLCHAIN_FILE="$(pwd)/../CMakeModules/MinGWCross.cmake" -DUSE_CCACHE=ON -DYUZU_USE_BUNDLED_UNICORN=ON -DENABLE_COMPATIBILITY_LIST_DOWNLOAD=ON -DCMAKE_BUILD_TYPE=Release
ninja
# Clean up the dirty hacks
rm /bin/uname && mv /bin/uname1 /bin/uname
rm /bin/cmd
ccache -s
echo "Tests skipped"
#ctest -VV -C Release
echo 'Prepare binaries...'
cd ..
mkdir package
QT_PLATFORM_DLL_PATH='/usr/x86_64-w64-mingw32/lib/qt5/plugins/platforms/'
find build/ -name "yuzu*.exe" -exec cp {} 'package' \;
# copy Qt plugins
mkdir package/platforms
cp "${QT_PLATFORM_DLL_PATH}/qwindows.dll" package/platforms/
cp -rv "${QT_PLATFORM_DLL_PATH}/../mediaservice/" package/
cp -rv "${QT_PLATFORM_DLL_PATH}/../imageformats/" package/
rm -f package/mediaservice/*d.dll
for i in package/*.exe; do
    # we would need to process the pdb here; however, cv2pdb
    # does not work in this environment, so we simply strip all the debug symbols
    x86_64-w64-mingw32-strip "${i}"
done
pip3 install pefile
python3 .ci/scripts/windows/scan_dll.py package/*.exe "package/"
python3 .ci/scripts/windows/scan_dll.py package/imageformats/*.dll "package/"


@@ -0,0 +1,5 @@
#!/bin/bash -ex
mkdir -p "ccache" || true
chmod a+x ./.ci/scripts/windows/docker.sh
docker run -e CCACHE_DIR=/yuzu/ccache -v $(pwd):/yuzu yuzuemu/build-environments:linux-mingw /bin/bash -ex /yuzu/.ci/scripts/windows/docker.sh


@@ -0,0 +1,106 @@
import pefile
import sys
import re
import os
import queue
import shutil

# constant definitions
KNOWN_SYS_DLLS = ['WINMM.DLL', 'MSVCRT.DLL', 'VERSION.DLL', 'MPR.DLL',
                  'DWMAPI.DLL', 'UXTHEME.DLL', 'DNSAPI.DLL', 'IPHLPAPI.DLL']
# below is for Ubuntu 18.04 with the specified PPA enabled; if you are using
# another distro or different repositories, change the following accordingly
DLL_PATH = [
    '/usr/x86_64-w64-mingw32/bin/',
    '/usr/x86_64-w64-mingw32/lib/',
    '/usr/lib/gcc/x86_64-w64-mingw32/7.3-posix/'
]

missing = []


def parse_imports(file_name):
    results = []
    pe = pefile.PE(file_name, fast_load=True)
    pe.parse_data_directories()

    for entry in pe.DIRECTORY_ENTRY_IMPORT:
        current = entry.dll.decode()
        current_u = current.upper()  # b/c Windows is often case insensitive
        # here we filter out system dlls
        # dlls with names like *32.dll are likely to be system dlls
        if current_u not in KNOWN_SYS_DLLS and not re.match(string=current_u, pattern=r'.*32\.DLL'):
            results.append(current)

    return results


def parse_imports_recursive(file_name, path_list=[]):
    q = queue.Queue()  # create a FIFO queue
    # file_name can be a string or a list for convenience
    if isinstance(file_name, str):
        q.put(file_name)
    elif isinstance(file_name, list):
        for i in file_name:
            q.put(i)
    full_list = []
    while q.qsize():
        current = q.get_nowait()
        print('> %s' % current)
        deps = parse_imports(current)
        # if this dll does not have any imports, ignore it
        if not deps:
            continue
        for dep in deps:
            # if the dependency is already included in the list, skip it
            if dep in full_list:
                continue
            # find the requested dll in the provided paths
            full_path = find_dll(dep)
            if not full_path:
                missing.append(dep)
                continue
            full_list.append(dep)
            q.put(full_path)
            path_list.append(full_path)
    return full_list


def find_dll(name):
    for path in DLL_PATH:
        for root, _, files in os.walk(path):
            for f in files:
                if name.lower() == f.lower():
                    return os.path.join(root, f)


def deploy(name, dst, dry_run=False):
    dlls_path = []
    parse_imports_recursive(name, dlls_path)
    for dll_entry in dlls_path:
        if not dry_run:
            shutil.copy(dll_entry, dst)
        else:
            print('[Dry-Run] Copy %s to %s' % (dll_entry, dst))
    print('Deploy completed.')
    return dlls_path


def main():
    if len(sys.argv) < 3:
        print('Usage: %s [files to examine ...] [target deploy directory]' % sys.argv[0])
        return 1
    to_deploy = sys.argv[1:-1]
    tgt_dir = sys.argv[-1]
    if not os.path.isdir(tgt_dir):
        print('%s is not a directory.' % tgt_dir)
        return 1
    print('Scanning dependencies...')
    deploy(to_deploy, tgt_dir)
    if missing:
        print('The following DLLs were not found: %s' % ('\n'.join(missing)))
    return 0


if __name__ == '__main__':
    main()


@@ -0,0 +1,13 @@
#!/bin/bash -ex
. .ci/scripts/common/pre-upload.sh
REV_NAME="yuzu-windows-mingw-${GITDATE}-${GITREV}"
ARCHIVE_NAME="${REV_NAME}.tar.gz"
COMPRESSION_FLAGS="-czvf"
mkdir "$REV_NAME"
# get around the permission issues
cp -r package/* "$REV_NAME"
. .ci/scripts/common/post-upload.sh


@@ -0,0 +1,21 @@
parameters:
  artifactSource: 'true'

steps:
- task: DockerInstaller@0
  displayName: 'Prepare Environment'
  inputs:
    dockerVersion: '17.09.0-ce'
- task: CacheBeta@0
  displayName: 'Cache Build System'
  inputs:
    key: yuzu-v1-$(BuildName)-$(BuildSuffix)-$(CacheSuffix)
    path: $(System.DefaultWorkingDirectory)/ccache
    cacheHitVar: CACHE_RESTORED
- script: chmod a+x ./.ci/scripts/$(ScriptFolder)/exec.sh && ./.ci/scripts/$(ScriptFolder)/exec.sh
  displayName: 'Build'
- script: chmod a+x ./.ci/scripts/$(ScriptFolder)/upload.sh && ./.ci/scripts/$(ScriptFolder)/upload.sh
  displayName: 'Package Artifacts'
- publish: artifacts
  artifact: 'yuzu-$(BuildName)-$(BuildSuffix)'
  displayName: 'Upload Artifacts'


@@ -0,0 +1,22 @@
jobs:
- job: build
  displayName: 'standard'
  pool:
    vmImage: ubuntu-latest
  strategy:
    maxParallel: 10
    matrix:
      windows:
        BuildSuffix: 'windows-mingw'
        ScriptFolder: 'windows'
      linux:
        BuildSuffix: 'linux'
        ScriptFolder: 'linux'
  steps:
  - template: ./sync-source.yml
    parameters:
      artifactSource: $(parameters.artifactSource)
      needSubmodules: 'true'
  - template: ./build-single.yml
    parameters:
      artifactSource: 'false'


@@ -0,0 +1,30 @@
jobs:
- job: build_test
  displayName: 'testing'
  pool:
    vmImage: ubuntu-latest
  strategy:
    maxParallel: 10
    matrix:
      windows:
        BuildSuffix: 'windows-testing'
        ScriptFolder: 'windows'
  steps:
  - task: PythonScript@0
    condition: eq(variables['Build.Reason'], 'PullRequest')
    displayName: 'Determine Testing Status'
    inputs:
      scriptSource: 'filePath'
      scriptPath: '../scripts/merge/check-label-presence.py'
      arguments: '$(System.PullRequest.PullRequestNumber) create-testing-build'
  - ${{ if eq(variables.enabletesting, 'true') }}:
    - template: ./sync-source.yml
      parameters:
        artifactSource: $(parameters.artifactSource)
        needSubmodules: 'true'
    - template: ./mergebot.yml
      parameters:
        matchLabel: 'testing-merge'
    - template: ./build-single.yml
      parameters:
        artifactSource: 'false'


@@ -0,0 +1,14 @@
parameters:
  artifactSource: 'true'

steps:
- template: ./sync-source.yml
  parameters:
    artifactSource: $(parameters.artifactSource)
    needSubmodules: 'false'
- task: DockerInstaller@0
  displayName: 'Prepare Environment'
  inputs:
    dockerVersion: '17.09.0-ce'
- script: chmod a+x ./.ci/scripts/format/exec.sh && ./.ci/scripts/format/exec.sh
  displayName: 'Verify Formatting'

.ci/templates/merge.yml (new file, 46 lines)

@@ -0,0 +1,46 @@
jobs:
- job: merge
  displayName: 'pull requests'
  steps:
  - checkout: self
    submodules: recursive
  - template: ./mergebot.yml
    parameters:
      matchLabel: '$(BuildName)-merge'
  - task: ArchiveFiles@2
    displayName: 'Package Source'
    inputs:
      rootFolderOrFile: '$(System.DefaultWorkingDirectory)'
      includeRootFolder: false
      archiveType: '7z'
      archiveFile: '$(Build.ArtifactStagingDirectory)/yuzu-$(BuildName)-source.7z'
  - task: PublishPipelineArtifact@1
    displayName: 'Upload Artifacts'
    inputs:
      targetPath: '$(Build.ArtifactStagingDirectory)/yuzu-$(BuildName)-source.7z'
      artifact: 'yuzu-$(BuildName)-source'
      replaceExistingArchive: true
- job: upload_source
  displayName: 'upload'
  dependsOn: merge
  steps:
  - template: ./sync-source.yml
    parameters:
      artifactSource: 'true'
      needSubmodules: 'true'
  - script: chmod a+x $(System.DefaultWorkingDirectory)/.ci/scripts/merge/yuzubot-git-config.sh && $(System.DefaultWorkingDirectory)/.ci/scripts/merge/yuzubot-git-config.sh
    displayName: 'Apply Git Configuration'
  - script: git tag -a $(BuildName)-$(Build.BuildId) -m "yuzu $(BuildName) $(Build.BuildNumber) $(Build.DefinitionName)"
    displayName: 'Tag Source'
  - script: git remote add other $(GitRepoPushChangesURL)
    displayName: 'Register Repository'
  - script: git push --follow-tags --force other HEAD:$(GitPushBranch)
    displayName: 'Update Code'
  - script: git rev-list -n 1 $(BuildName)-$(Build.BuildId) > $(Build.ArtifactStagingDirectory)/tag-commit.sha
    displayName: 'Calculate Release Point'
  - task: PublishPipelineArtifact@1
    displayName: 'Upload Release Point'
    inputs:
      targetPath: '$(Build.ArtifactStagingDirectory)/tag-commit.sha'
      artifact: 'yuzu-$(BuildName)-release-point'
      replaceExistingArchive: true


@@ -0,0 +1,15 @@
parameters:
  matchLabel: 'dummy-merge'

steps:
- script: mkdir $(System.DefaultWorkingDirectory)/patches && pip install requests urllib3
  displayName: 'Prepare Environment'
- script: chmod a+x $(System.DefaultWorkingDirectory)/.ci/scripts/merge/yuzubot-git-config.sh && $(System.DefaultWorkingDirectory)/.ci/scripts/merge/yuzubot-git-config.sh
  displayName: 'Apply Git Configuration'
- task: PythonScript@0
  displayName: 'Discover, Download, and Apply Patches'
  inputs:
    scriptSource: 'filePath'
    scriptPath: '.ci/scripts/merge/apply-patches-by-label.py'
    arguments: '${{ parameters.matchLabel }} patches'
    workingDirectory: '$(System.DefaultWorkingDirectory)'


@@ -0,0 +1,16 @@
steps:
- checkout: none
- task: DownloadPipelineArtifact@2
  displayName: 'Download Source'
  inputs:
    artifactName: 'yuzu-$(BuildName)-source'
    buildType: 'current'
    targetPath: '$(Build.ArtifactStagingDirectory)'
- script: rm -rf $(System.DefaultWorkingDirectory) && mkdir $(System.DefaultWorkingDirectory)
  displayName: 'Clean Working Directory'
- task: ExtractFiles@1
  displayName: 'Prepare Source'
  inputs:
    archiveFilePatterns: '$(Build.ArtifactStagingDirectory)/*.7z'
    destinationFolder: '$(System.DefaultWorkingDirectory)'
    cleanDestinationFolder: false


@@ -0,0 +1,11 @@
parameters:
  needSubmodules: 'true'

steps:
- checkout: self
  displayName: 'Checkout Recursive'
  submodules: recursive
#  condition: eq(parameters.needSubmodules, 'true')
#- checkout: self
#  displayName: 'Checkout Fast'
#  condition: ne(parameters.needSubmodules, 'true')


@@ -0,0 +1,7 @@
steps:
- ${{ if eq(parameters.artifactSource, 'true') }}:
  - template: ./retrieve-artifact-source.yml
- ${{ if ne(parameters.artifactSource, 'true') }}:
  - template: ./retrieve-master-source.yml
    parameters:
      needSubmodules: $(parameters.needSubmodules)


@@ -1,19 +1,23 @@
-# Starter pipeline
-# Start with a minimal pipeline that you can customize to build and deploy your code.
-# Add steps that build, run tests, deploy, and more:
-# https://aka.ms/yaml
trigger:
- master
pool:
  vmImage: 'ubuntu-latest'
-steps:
-- script: echo Hello, world!
-  displayName: 'Run a one-line script'
-- script: |
-    echo Add other tasks to build, test, and deploy your project.
-    echo See https://aka.ms/yaml
-  displayName: 'Run a multi-line script'
+stages:
+- stage: merge
+  displayName: 'merge'
+  jobs:
+  - template: ./templates/merge.yml
+- stage: format
+  dependsOn: merge
+  displayName: 'format'
+  jobs:
+  - job: format
+    displayName: 'clang'
+    pool:
+      vmImage: ubuntu-latest
+    steps:
+    - template: ./templates/format-check.yml
+- stage: build
+  displayName: 'build'
+  dependsOn: format
+  jobs:
+  - template: ./templates/build-standard.yml

.ci/yuzu-verify.yml (new file, 18 lines)

@@ -0,0 +1,18 @@
stages:
- stage: format
  displayName: 'format'
  jobs:
  - job: format
    displayName: 'clang'
    pool:
      vmImage: ubuntu-latest
    steps:
    - template: ./templates/format-check.yml
      parameters:
        artifactSource: 'false'
- stage: build
  displayName: 'build'
  dependsOn: format
  jobs:
  - template: ./templates/build-standard.yml
  - template: ./templates/build-testing.yml


@@ -1,19 +0,0 @@
# Starter pipeline
# Start with a minimal pipeline that you can customize to build and deploy your code.
# Add steps that build, run tests, deploy, and more:
# https://aka.ms/yaml
trigger:
- master
pool:
  vmImage: 'ubuntu-latest'
steps:
- script: echo Hello, world!
  displayName: 'Run a one-line script'
- script: |
    echo Add other tasks to build, test, and deploy your project.
    echo See https://aka.ms/yaml
  displayName: 'Run a multi-line script'


@@ -2,6 +2,7 @@ yuzu emulator
=============
[![Travis CI Build Status](https://travis-ci.org/yuzu-emu/yuzu.svg?branch=master)](https://travis-ci.org/yuzu-emu/yuzu)
[![AppVeyor CI Build Status](https://ci.appveyor.com/api/projects/status/77k97svb2usreu68?svg=true)](https://ci.appveyor.com/project/bunnei/yuzu)
[![Azure Mainline CI Build Status](https://dev.azure.com/yuzu-emu/yuzu/_apis/build/status/yuzu%20mainline?branchName=master)](https://dev.azure.com/yuzu-emu/yuzu/)
yuzu is an experimental open-source emulator for the Nintendo Switch from the creators of [Citra](https://citra-emu.org/).


@@ -94,6 +94,10 @@ u64 ProgramMetadata::GetFilesystemPermissions() const {
return aci_file_access.permissions;
}
u32 ProgramMetadata::GetSystemResourceSize() const {
return npdm_header.system_resource_size;
}
const ProgramMetadata::KernelCapabilityDescriptors& ProgramMetadata::GetKernelCapabilities() const {
return aci_kernel_capabilities;
}


@@ -58,6 +58,7 @@ public:
u32 GetMainThreadStackSize() const;
u64 GetTitleID() const;
u64 GetFilesystemPermissions() const;
u32 GetSystemResourceSize() const;
const KernelCapabilityDescriptors& GetKernelCapabilities() const;
void Print() const;
@@ -76,7 +77,8 @@ private:
u8 reserved_3;
u8 main_thread_priority;
u8 main_thread_cpu;
-    std::array<u8, 8> reserved_4;
+    std::array<u8, 4> reserved_4;
+    u32_le system_resource_size;
u32_le process_category;
u32_le main_stack_size;
std::array<u8, 0x10> application_name;


@@ -129,20 +129,17 @@ u64 Process::GetTotalPhysicalMemoryAvailable() const {
return vm_manager.GetTotalPhysicalMemoryAvailable();
}
-u64 Process::GetTotalPhysicalMemoryAvailableWithoutMmHeap() const {
-    // TODO: Subtract the personal heap size from this when the
-    // personal heap is implemented.
-    return GetTotalPhysicalMemoryAvailable();
+u64 Process::GetTotalPhysicalMemoryAvailableWithoutSystemResource() const {
+    return GetTotalPhysicalMemoryAvailable() - GetSystemResourceSize();
}
u64 Process::GetTotalPhysicalMemoryUsed() const {
-    return vm_manager.GetCurrentHeapSize() + main_thread_stack_size + code_memory_size;
+    return vm_manager.GetCurrentHeapSize() + main_thread_stack_size + code_memory_size +
+           GetSystemResourceUsage();
}
-u64 Process::GetTotalPhysicalMemoryUsedWithoutMmHeap() const {
-    // TODO: Subtract the personal heap size from this when the
-    // personal heap is implemented.
-    return GetTotalPhysicalMemoryUsed();
+u64 Process::GetTotalPhysicalMemoryUsedWithoutSystemResource() const {
+    return GetTotalPhysicalMemoryUsed() - GetSystemResourceUsage();
}
void Process::RegisterThread(const Thread* thread) {
@@ -172,6 +169,7 @@ ResultCode Process::LoadFromMetadata(const FileSys::ProgramMetadata& metadata) {
program_id = metadata.GetTitleID();
ideal_core = metadata.GetMainThreadCore();
is_64bit_process = metadata.Is64BitProgram();
system_resource_size = metadata.GetSystemResourceSize();
vm_manager.Reset(metadata.GetAddressSpaceType());
@@ -186,19 +184,11 @@ ResultCode Process::LoadFromMetadata(const FileSys::ProgramMetadata& metadata) {
}
void Process::Run(s32 main_thread_priority, u64 stack_size) {
-    // The kernel always ensures that the given stack size is page aligned.
-    main_thread_stack_size = Common::AlignUp(stack_size, Memory::PAGE_SIZE);
-    // Allocate and map the main thread stack
-    // TODO(bunnei): This is heap area that should be allocated by the kernel and not mapped as part
-    // of the user address space.
-    const VAddr mapping_address = vm_manager.GetTLSIORegionEndAddress() - main_thread_stack_size;
-    vm_manager
-        .MapMemoryBlock(mapping_address, std::make_shared<std::vector<u8>>(main_thread_stack_size),
-                        0, main_thread_stack_size, MemoryState::Stack)
-        .Unwrap();
+    AllocateMainThreadStack(stack_size);
+    tls_region_address = CreateTLSRegion();
vm_manager.LogLayout();
ChangeStatus(ProcessStatus::Running);
SetupMainThread(*this, kernel, main_thread_priority);
@@ -228,6 +218,9 @@ void Process::PrepareForTermination() {
stop_threads(system.Scheduler(2).GetThreadList());
stop_threads(system.Scheduler(3).GetThreadList());
FreeTLSRegion(tls_region_address);
tls_region_address = 0;
ChangeStatus(ProcessStatus::Exited);
}
@@ -327,4 +320,16 @@ void Process::ChangeStatus(ProcessStatus new_status) {
WakeupAllWaitingThreads();
}
void Process::AllocateMainThreadStack(u64 stack_size) {
// The kernel always ensures that the given stack size is page aligned.
main_thread_stack_size = Common::AlignUp(stack_size, Memory::PAGE_SIZE);
// Allocate and map the main thread stack
const VAddr mapping_address = vm_manager.GetTLSIORegionEndAddress() - main_thread_stack_size;
vm_manager
.MapMemoryBlock(mapping_address, std::make_shared<std::vector<u8>>(main_thread_stack_size),
0, main_thread_stack_size, MemoryState::Stack)
.Unwrap();
}
} // namespace Kernel


@@ -135,6 +135,11 @@ public:
return mutex;
}
/// Gets the address to the process' dedicated TLS region.
VAddr GetTLSRegionAddress() const {
return tls_region_address;
}
/// Gets the current status of the process
ProcessStatus GetStatus() const {
return status;
@@ -168,8 +173,24 @@ public:
return capabilities.GetPriorityMask();
}
-    u32 IsVirtualMemoryEnabled() const {
-        return is_virtual_address_memory_enabled;
+    /// Gets the amount of secure memory to allocate for memory management.
+    u32 GetSystemResourceSize() const {
+        return system_resource_size;
}
/// Gets the amount of secure memory currently in use for memory management.
u32 GetSystemResourceUsage() const {
// On hardware, this returns the amount of system resource memory that has
// been used by the kernel. This is problematic for Yuzu to emulate, because
// system resource memory is used for page tables -- and yuzu doesn't really
// have a way to calculate how much memory is required for page tables for
// the current process at any given time.
// TODO: Is this even worth implementing? Games may retrieve this value via
// an SDK function that gets used + available system resource size for debug
// or diagnostic purposes. However, it seems unlikely that a game would make
// decisions based on how much system memory is dedicated to its page tables.
// Is returning a value other than zero wise?
return 0;
}
/// Whether this process is an AArch64 or AArch32 process.
@@ -196,15 +217,15 @@ public:
u64 GetTotalPhysicalMemoryAvailable() const;
/// Retrieves the total physical memory available to this process in bytes,
-    /// without the size of the personal heap added to it.
-    u64 GetTotalPhysicalMemoryAvailableWithoutMmHeap() const;
+    /// without the size of the personal system resource heap added to it.
+    u64 GetTotalPhysicalMemoryAvailableWithoutSystemResource() const;
    /// Retrieves the total physical memory used by this process in bytes.
    u64 GetTotalPhysicalMemoryUsed() const;
    /// Retrieves the total physical memory used by this process in bytes,
-    /// without the size of the personal heap added to it.
-    u64 GetTotalPhysicalMemoryUsedWithoutMmHeap() const;
+    /// without the size of the personal system resource heap added to it.
+    u64 GetTotalPhysicalMemoryUsedWithoutSystemResource() const;
/// Gets the list of all threads created with this process as their owner.
const std::list<const Thread*>& GetThreadList() const {
@@ -280,6 +301,9 @@ private:
/// a process signal.
void ChangeStatus(ProcessStatus new_status);
/// Allocates the main thread stack for the process, given the stack size in bytes.
void AllocateMainThreadStack(u64 stack_size);
/// Memory manager for this process.
Kernel::VMManager vm_manager;
@@ -298,12 +322,16 @@ private:
/// Title ID corresponding to the process
u64 program_id = 0;
+    /// Specifies additional memory to be reserved for the process's memory management by the
+    /// system. When this is non-zero, secure memory is allocated and used for page table allocation
+    /// instead of using the normal global page tables/memory block management.
+    u32 system_resource_size = 0;
    /// Resource limit descriptor for this process
    SharedPtr<ResourceLimit> resource_limit;
    /// The ideal CPU core for this process, threads are scheduled on this core by default.
    u8 ideal_core = 0;
-    u32 is_virtual_address_memory_enabled = 0;
/// The Thread Local Storage area is allocated as processes create threads,
/// each TLS area is 0x200 bytes, so one page (0x1000) is split up in 8 parts, and each part
@@ -338,6 +366,9 @@ private:
/// variable related facilities.
Mutex mutex;
/// Address indicating the location of the process' dedicated TLS region.
VAddr tls_region_address = 0;
/// Random values for svcGetInfo RandomEntropy
std::array<u64, RANDOM_ENTROPY_SIZE> random_entropy{};


@@ -736,16 +736,16 @@ static ResultCode GetInfo(Core::System& system, u64* result, u64 info_id, u64 ha
StackRegionBaseAddr = 14,
StackRegionSize = 15,
// 3.0.0+
-    IsVirtualAddressMemoryEnabled = 16,
-    PersonalMmHeapUsage = 17,
+    SystemResourceSize = 16,
+    SystemResourceUsage = 17,
TitleId = 18,
// 4.0.0+
PrivilegedProcessId = 19,
// 5.0.0+
UserExceptionContextAddr = 20,
// 6.0.0+
-    TotalPhysicalMemoryAvailableWithoutMmHeap = 21,
-    TotalPhysicalMemoryUsedWithoutMmHeap = 22,
+    TotalPhysicalMemoryAvailableWithoutSystemResource = 21,
+    TotalPhysicalMemoryUsedWithoutSystemResource = 22,
};
const auto info_id_type = static_cast<GetInfoType>(info_id);
@@ -763,12 +763,12 @@ static ResultCode GetInfo(Core::System& system, u64* result, u64 info_id, u64 ha
case GetInfoType::StackRegionSize:
case GetInfoType::TotalPhysicalMemoryAvailable:
case GetInfoType::TotalPhysicalMemoryUsed:
-    case GetInfoType::IsVirtualAddressMemoryEnabled:
-    case GetInfoType::PersonalMmHeapUsage:
+    case GetInfoType::SystemResourceSize:
+    case GetInfoType::SystemResourceUsage:
case GetInfoType::TitleId:
case GetInfoType::UserExceptionContextAddr:
-    case GetInfoType::TotalPhysicalMemoryAvailableWithoutMmHeap:
-    case GetInfoType::TotalPhysicalMemoryUsedWithoutMmHeap: {
+    case GetInfoType::TotalPhysicalMemoryAvailableWithoutSystemResource:
+    case GetInfoType::TotalPhysicalMemoryUsedWithoutSystemResource: {
if (info_sub_id != 0) {
return ERR_INVALID_ENUM_VALUE;
}
@@ -829,8 +829,13 @@ static ResultCode GetInfo(Core::System& system, u64* result, u64 info_id, u64 ha
*result = process->GetTotalPhysicalMemoryUsed();
return RESULT_SUCCESS;
-    case GetInfoType::IsVirtualAddressMemoryEnabled:
-        *result = process->IsVirtualMemoryEnabled();
+    case GetInfoType::SystemResourceSize:
+        *result = process->GetSystemResourceSize();
+        return RESULT_SUCCESS;
+    case GetInfoType::SystemResourceUsage:
+        LOG_WARNING(Kernel_SVC, "(STUBBED) Attempted to query system resource usage");
+        *result = process->GetSystemResourceUsage();
return RESULT_SUCCESS;
case GetInfoType::TitleId:
@@ -838,17 +843,15 @@ static ResultCode GetInfo(Core::System& system, u64* result, u64 info_id, u64 ha
return RESULT_SUCCESS;
case GetInfoType::UserExceptionContextAddr:
-        LOG_WARNING(Kernel_SVC,
-                    "(STUBBED) Attempted to query user exception context address, returned 0");
-        *result = 0;
+        *result = process->GetTLSRegionAddress();
return RESULT_SUCCESS;
-    case GetInfoType::TotalPhysicalMemoryAvailableWithoutMmHeap:
-        *result = process->GetTotalPhysicalMemoryAvailable();
+    case GetInfoType::TotalPhysicalMemoryAvailableWithoutSystemResource:
+        *result = process->GetTotalPhysicalMemoryAvailableWithoutSystemResource();
        return RESULT_SUCCESS;
-    case GetInfoType::TotalPhysicalMemoryUsedWithoutMmHeap:
-        *result = process->GetTotalPhysicalMemoryUsedWithoutMmHeap();
+    case GetInfoType::TotalPhysicalMemoryUsedWithoutSystemResource:
+        *result = process->GetTotalPhysicalMemoryUsedWithoutSystemResource();
return RESULT_SUCCESS;
default:
@@ -953,6 +956,86 @@ static ResultCode GetInfo(Core::System& system, u64* result, u64 info_id, u64 ha
}
}
/// Maps memory at a desired address
static ResultCode MapPhysicalMemory(Core::System& system, VAddr addr, u64 size) {
LOG_DEBUG(Kernel_SVC, "called, addr=0x{:016X}, size=0x{:X}", addr, size);
if (!Common::Is4KBAligned(addr)) {
LOG_ERROR(Kernel_SVC, "Address is not aligned to 4KB, 0x{:016X}", addr);
return ERR_INVALID_ADDRESS;
}
if (!Common::Is4KBAligned(size)) {
LOG_ERROR(Kernel_SVC, "Size is not aligned to 4KB, 0x{:X}", size);
return ERR_INVALID_SIZE;
}
if (size == 0) {
LOG_ERROR(Kernel_SVC, "Size is zero");
return ERR_INVALID_SIZE;
}
if (!(addr < addr + size)) {
LOG_ERROR(Kernel_SVC, "Size causes 64-bit overflow of address");
return ERR_INVALID_MEMORY_RANGE;
}
Process* const current_process = system.Kernel().CurrentProcess();
auto& vm_manager = current_process->VMManager();
if (current_process->GetSystemResourceSize() == 0) {
LOG_ERROR(Kernel_SVC, "System Resource Size is zero");
return ERR_INVALID_STATE;
}
if (!vm_manager.IsWithinMapRegion(addr, size)) {
LOG_ERROR(Kernel_SVC, "Range not within map region");
return ERR_INVALID_MEMORY_RANGE;
}
return vm_manager.MapPhysicalMemory(addr, size);
}
/// Unmaps memory previously mapped via MapPhysicalMemory
static ResultCode UnmapPhysicalMemory(Core::System& system, VAddr addr, u64 size) {
LOG_DEBUG(Kernel_SVC, "called, addr=0x{:016X}, size=0x{:X}", addr, size);
if (!Common::Is4KBAligned(addr)) {
LOG_ERROR(Kernel_SVC, "Address is not aligned to 4KB, 0x{:016X}", addr);
return ERR_INVALID_ADDRESS;
}
if (!Common::Is4KBAligned(size)) {
LOG_ERROR(Kernel_SVC, "Size is not aligned to 4KB, 0x{:X}", size);
return ERR_INVALID_SIZE;
}
if (size == 0) {
LOG_ERROR(Kernel_SVC, "Size is zero");
return ERR_INVALID_SIZE;
}
if (!(addr < addr + size)) {
LOG_ERROR(Kernel_SVC, "Size causes 64-bit overflow of address");
return ERR_INVALID_MEMORY_RANGE;
}
Process* const current_process = system.Kernel().CurrentProcess();
auto& vm_manager = current_process->VMManager();
if (current_process->GetSystemResourceSize() == 0) {
LOG_ERROR(Kernel_SVC, "System Resource Size is zero");
return ERR_INVALID_STATE;
}
if (!vm_manager.IsWithinMapRegion(addr, size)) {
LOG_ERROR(Kernel_SVC, "Range not within map region");
return ERR_INVALID_MEMORY_RANGE;
}
return vm_manager.UnmapPhysicalMemory(addr, size);
}
/// Sets the thread activity
static ResultCode SetThreadActivity(Core::System& system, Handle handle, u32 activity) {
LOG_DEBUG(Kernel_SVC, "called, handle=0x{:08X}, activity=0x{:08X}", handle, activity);
@@ -1654,8 +1737,8 @@ static ResultCode SignalProcessWideKey(Core::System& system, VAddr condition_var
// Wait for an address (via Address Arbiter)
static ResultCode WaitForAddress(Core::System& system, VAddr address, u32 type, s32 value,
s64 timeout) {
-    LOG_WARNING(Kernel_SVC, "called, address=0x{:X}, type=0x{:X}, value=0x{:X}, timeout={}",
-                address, type, value, timeout);
+    LOG_TRACE(Kernel_SVC, "called, address=0x{:X}, type=0x{:X}, value=0x{:X}, timeout={}", address,
+              type, value, timeout);
// If the passed address is a kernel virtual address, return invalid memory state.
if (Memory::IsKernelVirtualAddress(address)) {
@@ -1677,8 +1760,8 @@ static ResultCode WaitForAddress(Core::System& system, VAddr address, u32 type,
// Signals to an address (via Address Arbiter)
static ResultCode SignalToAddress(Core::System& system, VAddr address, u32 type, s32 value,
s32 num_to_wake) {
-    LOG_WARNING(Kernel_SVC, "called, address=0x{:X}, type=0x{:X}, value=0x{:X}, num_to_wake=0x{:X}",
-                address, type, value, num_to_wake);
+    LOG_TRACE(Kernel_SVC, "called, address=0x{:X}, type=0x{:X}, value=0x{:X}, num_to_wake=0x{:X}",
+              address, type, value, num_to_wake);
// If the passed address is a kernel virtual address, return invalid memory state.
if (Memory::IsKernelVirtualAddress(address)) {
@@ -2310,8 +2393,8 @@ static const FunctionDef SVC_Table[] = {
{0x29, SvcWrap<GetInfo>, "GetInfo"},
{0x2A, nullptr, "FlushEntireDataCache"},
{0x2B, nullptr, "FlushDataCache"},
-    {0x2C, nullptr, "MapPhysicalMemory"},
-    {0x2D, nullptr, "UnmapPhysicalMemory"},
+    {0x2C, SvcWrap<MapPhysicalMemory>, "MapPhysicalMemory"},
+    {0x2D, SvcWrap<UnmapPhysicalMemory>, "UnmapPhysicalMemory"},
{0x2E, nullptr, "GetFutureThreadInfo"},
{0x2F, nullptr, "GetLastThreadInfo"},
{0x30, SvcWrap<GetResourceLimitLimitValue>, "GetResourceLimitLimitValue"},


@@ -32,6 +32,11 @@ void SvcWrap(Core::System& system) {
FuncReturn(system, func(system, Param(system, 0)).raw);
}
template <ResultCode func(Core::System&, u64, u64)>
void SvcWrap(Core::System& system) {
FuncReturn(system, func(system, Param(system, 0), Param(system, 1)).raw);
}
template <ResultCode func(Core::System&, u32)>
void SvcWrap(Core::System& system) {
FuncReturn(system, func(system, static_cast<u32>(Param(system, 0))).raw);


@@ -11,6 +11,8 @@
#include "core/core.h"
#include "core/file_sys/program_metadata.h"
#include "core/hle/kernel/errors.h"
#include "core/hle/kernel/process.h"
#include "core/hle/kernel/resource_limit.h"
#include "core/hle/kernel/vm_manager.h"
#include "core/memory.h"
#include "core/memory_setup.h"
@@ -48,10 +50,14 @@ bool VirtualMemoryArea::CanBeMergedWith(const VirtualMemoryArea& next) const {
type != next.type) {
return false;
}
-    if (type == VMAType::AllocatedMemoryBlock &&
-        (backing_block != next.backing_block || offset + size != next.offset)) {
+    if ((attribute & MemoryAttribute::DeviceMapped) == MemoryAttribute::DeviceMapped) {
+        // TODO: Can device mapped memory be merged sanely?
+        // Not merging it may cause inaccuracies versus hardware when memory layout is queried.
        return false;
    }
+    if (type == VMAType::AllocatedMemoryBlock) {
+        return true;
+    }
if (type == VMAType::BackingMemory && backing_memory + size != next.backing_memory) {
return false;
}
@@ -99,7 +105,7 @@ bool VMManager::IsValidHandle(VMAHandle handle) const {
ResultVal<VMManager::VMAHandle> VMManager::MapMemoryBlock(VAddr target,
std::shared_ptr<std::vector<u8>> block,
std::size_t offset, u64 size,
-                                                          MemoryState state) {
+                                                          MemoryState state, VMAPermission perm) {
ASSERT(block != nullptr);
ASSERT(offset + size <= block->size());
@@ -109,7 +115,7 @@ ResultVal<VMManager::VMAHandle> VMManager::MapMemoryBlock(VAddr target,
ASSERT(final_vma.size == size);
final_vma.type = VMAType::AllocatedMemoryBlock;
-    final_vma.permissions = VMAPermission::ReadWrite;
+    final_vma.permissions = perm;
final_vma.state = state;
final_vma.backing_block = std::move(block);
final_vma.offset = offset;
@@ -288,6 +294,166 @@ ResultVal<VAddr> VMManager::SetHeapSize(u64 size) {
return MakeResult<VAddr>(heap_region_base);
}
ResultCode VMManager::MapPhysicalMemory(VAddr target, u64 size) {
const auto end_addr = target + size;
const auto last_addr = end_addr - 1;
VAddr cur_addr = target;
ResultCode result = RESULT_SUCCESS;
// Check how much memory we've already mapped.
const auto mapped_size_result = SizeOfAllocatedVMAsInRange(target, size);
if (mapped_size_result.Failed()) {
return mapped_size_result.Code();
}
// If we've already mapped the desired amount, return early.
const std::size_t mapped_size = *mapped_size_result;
if (mapped_size == size) {
return RESULT_SUCCESS;
}
// Check that we can map the memory we want.
const auto res_limit = system.CurrentProcess()->GetResourceLimit();
const u64 physmem_remaining = res_limit->GetMaxResourceValue(ResourceType::PhysicalMemory) -
res_limit->GetCurrentResourceValue(ResourceType::PhysicalMemory);
if (physmem_remaining < (size - mapped_size)) {
return ERR_RESOURCE_LIMIT_EXCEEDED;
}
// Keep track of the memory regions we unmap.
std::vector<std::pair<u64, u64>> mapped_regions;
// Iterate, trying to map memory.
{
cur_addr = target;
auto iter = FindVMA(target);
ASSERT_MSG(iter != vma_map.end(), "MapPhysicalMemory iter != end");
while (true) {
const auto& vma = iter->second;
const auto vma_start = vma.base;
const auto vma_end = vma_start + vma.size;
const auto vma_last = vma_end - 1;
// Map the memory block
const auto map_size = std::min(end_addr - cur_addr, vma_end - cur_addr);
if (vma.state == MemoryState::Unmapped) {
const auto map_res =
MapMemoryBlock(cur_addr, std::make_shared<std::vector<u8>>(map_size, 0), 0,
map_size, MemoryState::Heap, VMAPermission::ReadWrite);
result = map_res.Code();
if (result.IsError()) {
break;
}
mapped_regions.emplace_back(cur_addr, map_size);
}
// Break once we hit the end of the range.
if (last_addr <= vma_last) {
break;
}
// Advance to the next block.
cur_addr = vma_end;
iter = FindVMA(cur_addr);
ASSERT_MSG(iter != vma_map.end(), "MapPhysicalMemory iter != end");
}
}
// If we failed, unmap memory.
if (result.IsError()) {
for (const auto [unmap_address, unmap_size] : mapped_regions) {
ASSERT_MSG(UnmapRange(unmap_address, unmap_size).IsSuccess(),
"MapPhysicalMemory un-map on error");
}
return result;
}
// Update amount of mapped physical memory.
physical_memory_mapped += size - mapped_size;
return RESULT_SUCCESS;
}
ResultCode VMManager::UnmapPhysicalMemory(VAddr target, u64 size) {
const auto end_addr = target + size;
const auto last_addr = end_addr - 1;
VAddr cur_addr = target;
ResultCode result = RESULT_SUCCESS;
// Check how much memory is currently mapped.
const auto mapped_size_result = SizeOfUnmappablePhysicalMemoryInRange(target, size);
if (mapped_size_result.Failed()) {
return mapped_size_result.Code();
}
// If we've already unmapped all the memory, return early.
const std::size_t mapped_size = *mapped_size_result;
if (mapped_size == 0) {
return RESULT_SUCCESS;
}
// Keep track of the memory regions we unmap.
std::vector<std::pair<u64, u64>> unmapped_regions;
// Try to unmap regions.
{
cur_addr = target;
auto iter = FindVMA(target);
ASSERT_MSG(iter != vma_map.end(), "UnmapPhysicalMemory iter != end");
while (true) {
const auto& vma = iter->second;
const auto vma_start = vma.base;
const auto vma_end = vma_start + vma.size;
const auto vma_last = vma_end - 1;
// Unmap the memory block
const auto unmap_size = std::min(end_addr - cur_addr, vma_end - cur_addr);
if (vma.state == MemoryState::Heap) {
result = UnmapRange(cur_addr, unmap_size);
if (result.IsError()) {
break;
}
unmapped_regions.emplace_back(cur_addr, unmap_size);
}
// Break once we hit the end of the range.
if (last_addr <= vma_last) {
break;
}
// Advance to the next block.
cur_addr = vma_end;
iter = FindVMA(cur_addr);
ASSERT_MSG(iter != vma_map.end(), "UnmapPhysicalMemory iter != end");
}
}
// If we failed, re-map regions.
// TODO: Preserve memory contents?
if (result.IsError()) {
for (const auto [map_address, map_size] : unmapped_regions) {
const auto remap_res =
MapMemoryBlock(map_address, std::make_shared<std::vector<u8>>(map_size, 0), 0,
map_size, MemoryState::Heap, VMAPermission::None);
ASSERT_MSG(remap_res.Succeeded(), "UnmapPhysicalMemory re-map on error");
}
}
// Update mapped amount
physical_memory_mapped -= mapped_size;
return RESULT_SUCCESS;
}
ResultCode VMManager::MapCodeMemory(VAddr dst_address, VAddr src_address, u64 size) {
constexpr auto ignore_attribute = MemoryAttribute::LockedForIPC | MemoryAttribute::DeviceMapped;
const auto src_check_result = CheckRangeState(
@@ -435,7 +601,7 @@ ResultCode VMManager::MirrorMemory(VAddr dst_addr, VAddr src_addr, u64 size, Mem
// Protect mirror with permissions from old region
Reprotect(new_vma, vma->second.permissions);
// Remove permissions from old region
-    Reprotect(vma, VMAPermission::None);
+    ReprotectRange(src_addr, size, VMAPermission::None);
return RESULT_SUCCESS;
}
@@ -568,14 +734,14 @@ VMManager::VMAIter VMManager::SplitVMA(VMAIter vma_handle, u64 offset_in_vma) {
VMManager::VMAIter VMManager::MergeAdjacent(VMAIter iter) {
const VMAIter next_vma = std::next(iter);
if (next_vma != vma_map.end() && iter->second.CanBeMergedWith(next_vma->second)) {
-        iter->second.size += next_vma->second.size;
+        MergeAdjacentVMA(iter->second, next_vma->second);
vma_map.erase(next_vma);
}
if (iter != vma_map.begin()) {
VMAIter prev_vma = std::prev(iter);
if (prev_vma->second.CanBeMergedWith(iter->second)) {
-            prev_vma->second.size += iter->second.size;
+            MergeAdjacentVMA(prev_vma->second, iter->second);
vma_map.erase(iter);
iter = prev_vma;
}
@@ -584,6 +750,38 @@ VMManager::VMAIter VMManager::MergeAdjacent(VMAIter iter) {
return iter;
}
void VMManager::MergeAdjacentVMA(VirtualMemoryArea& left, const VirtualMemoryArea& right) {
ASSERT(left.CanBeMergedWith(right));
// Always merge allocated memory blocks, even when they don't share the same backing block.
if (left.type == VMAType::AllocatedMemoryBlock &&
(left.backing_block != right.backing_block || left.offset + left.size != right.offset)) {
// Check if we can save work.
if (left.offset == 0 && left.size == left.backing_block->size()) {
// Fast case: left is an entire backing block.
left.backing_block->insert(left.backing_block->end(),
right.backing_block->begin() + right.offset,
right.backing_block->begin() + right.offset + right.size);
} else {
// Slow case: make a new memory block for left and right.
auto new_memory = std::make_shared<std::vector<u8>>();
new_memory->insert(new_memory->end(), left.backing_block->begin() + left.offset,
left.backing_block->begin() + left.offset + left.size);
new_memory->insert(new_memory->end(), right.backing_block->begin() + right.offset,
right.backing_block->begin() + right.offset + right.size);
left.backing_block = new_memory;
left.offset = 0;
}
// Page table update is needed, because backing memory changed.
left.size += right.size;
UpdatePageTableForVMA(left);
} else {
// Just update the size.
left.size += right.size;
}
}
void VMManager::UpdatePageTableForVMA(const VirtualMemoryArea& vma) {
switch (vma.type) {
case VMAType::Free:
@@ -758,6 +956,84 @@ VMManager::CheckResults VMManager::CheckRangeState(VAddr address, u64 size, Memo
std::make_tuple(initial_state, initial_permissions, initial_attributes & ~ignore_mask));
}
ResultVal<std::size_t> VMManager::SizeOfAllocatedVMAsInRange(VAddr address,
std::size_t size) const {
const VAddr end_addr = address + size;
const VAddr last_addr = end_addr - 1;
std::size_t mapped_size = 0;
VAddr cur_addr = address;
auto iter = FindVMA(cur_addr);
ASSERT_MSG(iter != vma_map.end(), "SizeOfAllocatedVMAsInRange iter != end");
while (true) {
const auto& vma = iter->second;
const VAddr vma_start = vma.base;
const VAddr vma_end = vma_start + vma.size;
const VAddr vma_last = vma_end - 1;
// Add size if relevant.
if (vma.state != MemoryState::Unmapped) {
mapped_size += std::min(end_addr - cur_addr, vma_end - cur_addr);
}
// Break once we hit the end of the range.
if (last_addr <= vma_last) {
break;
}
// Advance to the next block.
cur_addr = vma_end;
iter = std::next(iter);
ASSERT_MSG(iter != vma_map.end(), "SizeOfAllocatedVMAsInRange iter != end");
}
return MakeResult(mapped_size);
}
ResultVal<std::size_t> VMManager::SizeOfUnmappablePhysicalMemoryInRange(VAddr address,
std::size_t size) const {
const VAddr end_addr = address + size;
const VAddr last_addr = end_addr - 1;
std::size_t mapped_size = 0;
VAddr cur_addr = address;
auto iter = FindVMA(cur_addr);
ASSERT_MSG(iter != vma_map.end(), "SizeOfUnmappablePhysicalMemoryInRange iter != end");
while (true) {
const auto& vma = iter->second;
const auto vma_start = vma.base;
const auto vma_end = vma_start + vma.size;
const auto vma_last = vma_end - 1;
const auto state = vma.state;
const auto attr = vma.attribute;
// Memory within region must be free or mapped heap.
if (!((state == MemoryState::Heap && attr == MemoryAttribute::None) ||
(state == MemoryState::Unmapped))) {
return ERR_INVALID_ADDRESS_STATE;
}
// Add size if relevant.
if (state != MemoryState::Unmapped) {
mapped_size += std::min(end_addr - cur_addr, vma_end - cur_addr);
}
// Break once we hit the end of the range.
if (last_addr <= vma_last) {
break;
}
// Advance to the next block.
cur_addr = vma_end;
iter = std::next(iter);
ASSERT_MSG(iter != vma_map.end(), "SizeOfUnmappablePhysicalMemoryInRange iter != end");
}
return MakeResult(mapped_size);
}
u64 VMManager::GetTotalPhysicalMemoryAvailable() const {
LOG_WARNING(Kernel, "(STUBBED) called");
return 0xF8000000;


@@ -349,7 +349,8 @@ public:
* @param state MemoryState tag to attach to the VMA.
*/
ResultVal<VMAHandle> MapMemoryBlock(VAddr target, std::shared_ptr<std::vector<u8>> block,
-                                        std::size_t offset, u64 size, MemoryState state);
+                                        std::size_t offset, u64 size, MemoryState state,
+                                        VMAPermission perm = VMAPermission::ReadWrite);
/**
* Maps an unmanaged host memory pointer at a given address.
@@ -450,6 +451,34 @@ public:
///
ResultVal<VAddr> SetHeapSize(u64 size);
/// Maps memory at a given address.
///
/// @param addr The virtual address to map memory at.
/// @param size The amount of memory to map.
///
/// @note The destination address must lie within the Map region.
///
/// @note This function requires that SystemResourceSize be non-zero,
/// however, this is just because if it were not then the
/// resulting page tables could be exploited on hardware by
/// a malicious program. SystemResource usage does not need
/// to be explicitly checked or updated here.
ResultCode MapPhysicalMemory(VAddr target, u64 size);
/// Unmaps memory at a given address.
///
/// @param addr The virtual address to unmap memory at.
/// @param size The amount of memory to unmap.
///
/// @note The destination address must lie within the Map region.
///
/// @note This function requires that SystemResourceSize be non-zero,
/// however, this is just because if it were not then the
/// resulting page tables could be exploited on hardware by
/// a malicious program. SystemResource usage does not need
/// to be explicitly checked or updated here.
ResultCode UnmapPhysicalMemory(VAddr target, u64 size);
/// Maps a region of memory as code memory.
///
/// @param dst_address The base address of the region to create the aliasing memory region.
@@ -657,6 +686,11 @@ private:
*/
VMAIter MergeAdjacent(VMAIter vma);
/**
* Merges two adjacent VMAs.
*/
void MergeAdjacentVMA(VirtualMemoryArea& left, const VirtualMemoryArea& right);
/// Updates the pages corresponding to this VMA so they match the VMA's attributes.
void UpdatePageTableForVMA(const VirtualMemoryArea& vma);
@@ -701,6 +735,13 @@ private:
MemoryAttribute attribute_mask, MemoryAttribute attribute,
MemoryAttribute ignore_mask) const;
/// Gets the amount of memory currently mapped (state != Unmapped) in a range.
ResultVal<std::size_t> SizeOfAllocatedVMAsInRange(VAddr address, std::size_t size) const;
/// Gets the amount of memory unmappable by UnmapPhysicalMemory in a range.
ResultVal<std::size_t> SizeOfUnmappablePhysicalMemoryInRange(VAddr address,
std::size_t size) const;
/**
* A map covering the entirety of the managed address space, keyed by the `base` field of each
* VMA. It must always be modified by splitting or merging VMAs, so that the invariant
@@ -742,6 +783,11 @@ private:
// end of the range. This is essentially 'base_address + current_size'.
VAddr heap_end = 0;
// The current amount of memory mapped via MapPhysicalMemory.
// This is used here (and in Nintendo's kernel) only for debugging, and does not impact
// any behavior.
u64 physical_memory_mapped = 0;
Core::System& system;
};
} // namespace Kernel

View File

@@ -1,4 +1,5 @@
add_library(video_core STATIC
buffer_cache.h
dma_pusher.cpp
dma_pusher.h
debug_utils/debug_utils.cpp
@@ -43,8 +44,6 @@ add_library(video_core STATIC
renderer_opengl/gl_device.h
renderer_opengl/gl_framebuffer_cache.cpp
renderer_opengl/gl_framebuffer_cache.h
renderer_opengl/gl_global_cache.cpp
renderer_opengl/gl_global_cache.h
renderer_opengl/gl_rasterizer.cpp
renderer_opengl/gl_rasterizer.h
renderer_opengl/gl_resource_manager.cpp

View File

@@ -0,0 +1,299 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <array>
#include <memory>
#include <mutex>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "common/alignment.h"
#include "common/common_types.h"
#include "core/core.h"
#include "video_core/memory_manager.h"
#include "video_core/rasterizer_cache.h"
namespace VideoCore {
class RasterizerInterface;
}
namespace VideoCommon {
template <typename BufferStorageType>
class CachedBuffer final : public RasterizerCacheObject {
public:
explicit CachedBuffer(VAddr cpu_addr, u8* host_ptr)
: RasterizerCacheObject{host_ptr}, host_ptr{host_ptr}, cpu_addr{cpu_addr} {}
~CachedBuffer() override = default;
VAddr GetCpuAddr() const override {
return cpu_addr;
}
std::size_t GetSizeInBytes() const override {
return size;
}
u8* GetWritableHostPtr() const {
return host_ptr;
}
std::size_t GetSize() const {
return size;
}
std::size_t GetCapacity() const {
return capacity;
}
bool IsInternalized() const {
return is_internal;
}
const BufferStorageType& GetBuffer() const {
return buffer;
}
void SetSize(std::size_t new_size) {
size = new_size;
}
void SetInternalState(bool is_internal_) {
is_internal = is_internal_;
}
BufferStorageType ExchangeBuffer(BufferStorageType buffer_, std::size_t new_capacity) {
capacity = new_capacity;
std::swap(buffer, buffer_);
return buffer_;
}
private:
u8* host_ptr{};
VAddr cpu_addr{};
std::size_t size{};
std::size_t capacity{};
bool is_internal{};
BufferStorageType buffer;
};
template <typename BufferStorageType, typename BufferType, typename StreamBuffer>
class BufferCache : public RasterizerCache<std::shared_ptr<CachedBuffer<BufferStorageType>>> {
public:
using Buffer = std::shared_ptr<CachedBuffer<BufferStorageType>>;
using BufferInfo = std::pair<const BufferType*, u64>;
explicit BufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
std::unique_ptr<StreamBuffer> stream_buffer)
: RasterizerCache<Buffer>{rasterizer}, system{system},
stream_buffer{std::move(stream_buffer)}, stream_buffer_handle{
this->stream_buffer->GetHandle()} {}
~BufferCache() = default;
void Unregister(const Buffer& entry) override {
std::lock_guard lock{RasterizerCache<Buffer>::mutex};
if (entry->IsInternalized()) {
internalized_entries.erase(entry->GetCacheAddr());
}
ReserveBuffer(entry);
RasterizerCache<Buffer>::Unregister(entry);
}
void TickFrame() {
marked_for_destruction_index =
(marked_for_destruction_index + 1) % marked_for_destruction_ring_buffer.size();
MarkedForDestruction().clear();
}
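TickFrame implements a small deferred-destruction ring: buffers retired by ExchangeBuffer are parked in one of four per-frame vectors, and clearing the slot the index has just advanced onto destroys only objects parked a few frames earlier, by which point the GPU should no longer reference them. A generic, self-contained sketch of the same pattern (not the yuzu API):

#include <array>
#include <cstddef>
#include <utility>
#include <vector>

// Generic sketch of the deferred-destruction ring used above.
template <typename T, std::size_t Frames = 4>
class DeferredDeleter {
public:
    // Keep a retired resource alive for a few frames before destroying it.
    void Park(T&& resource) {
        ring[index].push_back(std::move(resource));
    }

    // Called once per frame: reuse the oldest slot, destroying its contents.
    void TickFrame() {
        index = (index + 1) % Frames;
        ring[index].clear();
    }

private:
    std::size_t index = 0;
    std::array<std::vector<T>, Frames> ring;
};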
BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4,
bool internalize = false, bool is_written = false) {
std::lock_guard lock{RasterizerCache<Buffer>::mutex};
auto& memory_manager = system.GPU().MemoryManager();
const auto host_ptr = memory_manager.GetPointer(gpu_addr);
if (!host_ptr) {
return {GetEmptyBuffer(size), 0};
}
const auto cache_addr = ToCacheAddr(host_ptr);
// Cache management is a big overhead, so only cache entries above a minimum size.
// TODO: Figure out which size is the best for given games.
constexpr std::size_t max_stream_size = 0x800;
if (!internalize && size < max_stream_size &&
internalized_entries.find(cache_addr) == internalized_entries.end()) {
return StreamBufferUpload(host_ptr, size, alignment);
}
auto entry = RasterizerCache<Buffer>::TryGet(cache_addr);
if (!entry) {
return FixedBufferUpload(gpu_addr, host_ptr, size, internalize, is_written);
}
if (entry->GetSize() < size) {
IncreaseBufferSize(entry, size);
}
if (is_written) {
entry->MarkAsModified(true, *this);
}
return {ToHandle(entry->GetBuffer()), 0};
}
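The branch structure above boils down to a simple dispatch rule: small, non-internalized uploads go through the transient stream buffer, while everything else lands in a persistent cached buffer. A self-contained restatement of just that rule (the threshold value is copied from the code; the function name is hypothetical):

#include <cstddef>
#include <cstdio>

constexpr std::size_t max_stream_size = 0x800; // same threshold as above

// Hypothetical predicate mirroring the dispatch condition in UploadMemory.
bool UsesStreamBuffer(std::size_t size, bool internalize, bool already_internalized) {
    return !internalize && size < max_stream_size && !already_internalized;
}

int main() {
    std::printf("%d\n", UsesStreamBuffer(256, false, false));    // 1: small, streamed
    std::printf("%d\n", UsesStreamBuffer(0x1000, false, false)); // 0: large, cached
    std::printf("%d\n", UsesStreamBuffer(256, true, false));     // 0: internalized (e.g. SSBO)
    return 0;
}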
/// Uploads data from host memory. Returns the backing buffer where it was placed and its offset.
BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size,
std::size_t alignment = 4) {
std::lock_guard lock{RasterizerCache<Buffer>::mutex};
return StreamBufferUpload(raw_pointer, size, alignment);
}
void Map(std::size_t max_size) {
std::tie(buffer_ptr, buffer_offset_base, invalidated) = stream_buffer->Map(max_size, 4);
buffer_offset = buffer_offset_base;
}
/// Finishes the upload stream, returns true on bindings invalidation.
bool Unmap() {
stream_buffer->Unmap(buffer_offset - buffer_offset_base);
return std::exchange(invalidated, false);
}
virtual const BufferType* GetEmptyBuffer(std::size_t size) = 0;
protected:
void FlushObjectInner(const Buffer& entry) override {
DownloadBufferData(entry->GetBuffer(), 0, entry->GetSize(), entry->GetWritableHostPtr());
}
virtual BufferStorageType CreateBuffer(std::size_t size) = 0;
virtual const BufferType* ToHandle(const BufferStorageType& storage) = 0;
virtual void UploadBufferData(const BufferStorageType& buffer, std::size_t offset,
std::size_t size, const u8* data) = 0;
virtual void DownloadBufferData(const BufferStorageType& buffer, std::size_t offset,
std::size_t size, u8* data) = 0;
virtual void CopyBufferData(const BufferStorageType& src, const BufferStorageType& dst,
std::size_t src_offset, std::size_t dst_offset,
std::size_t size) = 0;
private:
BufferInfo StreamBufferUpload(const void* raw_pointer, std::size_t size,
std::size_t alignment) {
AlignBuffer(alignment);
const std::size_t uploaded_offset = buffer_offset;
std::memcpy(buffer_ptr, raw_pointer, size);
buffer_ptr += size;
buffer_offset += size;
return {&stream_buffer_handle, uploaded_offset};
}
BufferInfo FixedBufferUpload(GPUVAddr gpu_addr, u8* host_ptr, std::size_t size,
bool internalize, bool is_written) {
auto& memory_manager = Core::System::GetInstance().GPU().MemoryManager();
const auto cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr);
ASSERT(cpu_addr);
auto entry = GetUncachedBuffer(*cpu_addr, host_ptr);
entry->SetSize(size);
entry->SetInternalState(internalize);
RasterizerCache<Buffer>::Register(entry);
if (internalize) {
internalized_entries.emplace(ToCacheAddr(host_ptr));
}
if (is_written) {
entry->MarkAsModified(true, *this);
}
if (entry->GetCapacity() < size) {
MarkedForDestruction().push_back(entry->ExchangeBuffer(CreateBuffer(size), size));
}
UploadBufferData(entry->GetBuffer(), 0, size, host_ptr);
return {ToHandle(entry->GetBuffer()), 0};
}
void IncreaseBufferSize(Buffer& entry, std::size_t new_size) {
const std::size_t old_size = entry->GetSize();
if (entry->GetCapacity() < new_size) {
const auto& old_buffer = entry->GetBuffer();
auto new_buffer = CreateBuffer(new_size);
// Copy bits from the old buffer to the new buffer.
CopyBufferData(old_buffer, new_buffer, 0, 0, old_size);
MarkedForDestruction().push_back(
entry->ExchangeBuffer(std::move(new_buffer), new_size));
// The old buffer could still be bound, so mark the current bindings as invalidated.
invalidated = true;
}
// Upload the new bits.
const std::size_t size_diff = new_size - old_size;
UploadBufferData(entry->GetBuffer(), old_size, size_diff, entry->GetHostPtr() + old_size);
// Update entry's size in the object and in the cache.
Unregister(entry);
entry->SetSize(new_size);
RasterizerCache<Buffer>::Register(entry);
}
Buffer GetUncachedBuffer(VAddr cpu_addr, u8* host_ptr) {
if (auto entry = TryGetReservedBuffer(host_ptr)) {
return entry;
}
return std::make_shared<CachedBuffer<BufferStorageType>>(cpu_addr, host_ptr);
}
Buffer TryGetReservedBuffer(u8* host_ptr) {
const auto it = buffer_reserve.find(ToCacheAddr(host_ptr));
if (it == buffer_reserve.end()) {
return {};
}
auto& reserve = it->second;
auto entry = reserve.back();
reserve.pop_back();
return entry;
}
void ReserveBuffer(Buffer entry) {
buffer_reserve[entry->GetCacheAddr()].push_back(std::move(entry));
}
void AlignBuffer(std::size_t alignment) {
// Align the offset, not the mapped pointer
const std::size_t offset_aligned = Common::AlignUp(buffer_offset, alignment);
buffer_ptr += offset_aligned - buffer_offset;
buffer_offset = offset_aligned;
}
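The comment above is the whole trick: the offset is rounded up and the CPU write cursor is advanced by the same delta, so the pointer and the offset handed to the API never drift apart. A worked example of the arithmetic (the bitmask form assumes power-of-two alignments; the real Common::AlignUp is more general):

#include <cassert>
#include <cstddef>

// Bitmask rounding equivalent to Common::AlignUp for power-of-two alignments
// (an assumption of this sketch).
constexpr std::size_t AlignUpPow2(std::size_t value, std::size_t align) {
    return (value + align - 1) & ~(align - 1);
}

int main() {
    // buffer_offset == 13, alignment == 16: both the offset and the mapped
    // pointer must advance by the same 3 bytes to stay in sync.
    const std::size_t offset = 13;
    const std::size_t aligned = AlignUpPow2(offset, 16);
    assert(aligned == 16 && aligned - offset == 3);
    return 0;
}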
std::vector<BufferStorageType>& MarkedForDestruction() {
return marked_for_destruction_ring_buffer[marked_for_destruction_index];
}
Core::System& system;
std::unique_ptr<StreamBuffer> stream_buffer;
BufferType stream_buffer_handle{};
bool invalidated = false;
u8* buffer_ptr = nullptr;
u64 buffer_offset = 0;
u64 buffer_offset_base = 0;
std::size_t marked_for_destruction_index = 0;
std::array<std::vector<BufferStorageType>, 4> marked_for_destruction_ring_buffer;
std::unordered_set<CacheAddr> internalized_entries;
std::unordered_map<CacheAddr, std::vector<Buffer>> buffer_reserve;
};
} // namespace VideoCommon

View File

@@ -67,6 +67,7 @@ public:
static constexpr std::size_t MaxShaderStage = 5;
// Maximum number of const buffers per shader stage.
static constexpr std::size_t MaxConstBuffers = 18;
static constexpr std::size_t MaxConstBufferSize = 0x10000;
enum class QueryMode : u32 {
Write = 0,

View File

@@ -78,7 +78,7 @@ union Attribute {
constexpr explicit Attribute(u64 value) : value(value) {}
enum class Index : u64 {
PointSize = 6,
LayerViewportPointSize = 6,
Position = 7,
Attribute_0 = 8,
Attribute_31 = 39,
@@ -931,8 +931,6 @@ union Instruction {
} csetp;
union {
BitField<35, 4, PredCondition> cond;
BitField<49, 1, u64> h_and;
BitField<6, 1, u64> ftz;
BitField<45, 2, PredOperation> op;
BitField<3, 3, u64> pred3;
@@ -940,9 +938,21 @@ union Instruction {
BitField<43, 1, u64> negate_a;
BitField<44, 1, u64> abs_a;
BitField<47, 2, HalfType> type_a;
BitField<31, 1, u64> negate_b;
BitField<30, 1, u64> abs_b;
BitField<28, 2, HalfType> type_b;
union {
BitField<35, 4, PredCondition> cond;
BitField<49, 1, u64> h_and;
BitField<31, 1, u64> negate_b;
BitField<30, 1, u64> abs_b;
BitField<28, 2, HalfType> type_b;
} reg;
union {
BitField<56, 1, u64> negate_b;
BitField<54, 1, u64> abs_b;
} cbuf;
union {
BitField<49, 4, PredCondition> cond;
BitField<53, 1, u64> h_and;
} cbuf_and_imm;
BitField<42, 1, u64> neg_pred;
BitField<39, 3, u64> pred39;
} hsetp2;
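Each anonymous union above is an alternate view over the same 64-bit instruction word, so HSETP2's register, cbuf, and cbuf-with-immediate encodings can place fields such as cond and h_and at different bit positions without any runtime dispatch. A self-contained stand-in for the shift-and-mask a BitField performs:

#include <cstddef>
#include <cstdint>

// Minimal stand-in for BitField<Position, Bits, T>: extract `Bits` bits
// starting at bit `Position` of a raw 64-bit instruction word.
template <std::size_t Position, std::size_t Bits>
constexpr std::uint64_t Extract(std::uint64_t value) {
    return (value >> Position) & ((std::uint64_t{1} << Bits) - 1);
}

// Equivalent of reading hsetp2.cbuf_and_imm.cond (BitField<49, 4, ...>).
static_assert(Extract<49, 4>(std::uint64_t{0b1010} << 49) == 0b1010);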
@@ -1008,8 +1018,6 @@ union Instruction {
} f2i;
union {
BitField<8, 2, Register::Size> src_size;
BitField<10, 2, Register::Size> dst_size;
BitField<39, 4, u64> rounding;
// H0, H1 extract for F16 missing
BitField<41, 1, u64> selector; // Guessed as some games set it, TODO: reverse this value
@@ -1278,6 +1286,7 @@ union Instruction {
union {
BitField<49, 1, u64> nodep_flag;
BitField<53, 4, u64> texture_info;
BitField<59, 1, u64> fp32_flag;
TextureType GetTextureType() const {
// The TLDS instruction has a weird encoding for the texture type.
@@ -1547,7 +1556,9 @@ public:
HFMA2_RC,
HFMA2_RR,
HFMA2_IMM_R,
HSETP2_C,
HSETP2_R,
HSETP2_IMM,
HSET2_R,
POPC_C,
POPC_R,
@@ -1776,7 +1787,7 @@ private:
INST("1101111101010---", Id::TXQ_B, Type::Texture, "TXQ_B"),
INST("1101-00---------", Id::TEXS, Type::Texture, "TEXS"),
INST("11011100--11----", Id::TLD, Type::Texture, "TLD"),
INST("1101101---------", Id::TLDS, Type::Texture, "TLDS"),
INST("1101-01---------", Id::TLDS, Type::Texture, "TLDS"),
INST("110010----111---", Id::TLD4, Type::Texture, "TLD4"),
INST("1101111100------", Id::TLD4S, Type::Texture, "TLD4S"),
INST("110111110110----", Id::TMML_B, Type::Texture, "TMML_B"),
@@ -1830,7 +1841,9 @@ private:
INST("01100---1-------", Id::HFMA2_RC, Type::Hfma2, "HFMA2_RC"),
INST("0101110100000---", Id::HFMA2_RR, Type::Hfma2, "HFMA2_RR"),
INST("01110---0-------", Id::HFMA2_IMM_R, Type::Hfma2, "HFMA2_R_IMM"),
INST("0101110100100---", Id::HSETP2_R, Type::HalfSetPredicate, "HSETP_R"),
INST("0111111-1-------", Id::HSETP2_C, Type::HalfSetPredicate, "HSETP2_C"),
INST("0101110100100---", Id::HSETP2_R, Type::HalfSetPredicate, "HSETP2_R"),
INST("0111111-0-------", Id::HSETP2_IMM, Type::HalfSetPredicate, "HSETP2_IMM"),
INST("0101110100011---", Id::HSET2_R, Type::HalfSet, "HSET2_R"),
INST("0101000010000---", Id::MUFU, Type::Arithmetic, "MUFU"),
INST("0100110010010---", Id::RRO_C, Type::Arithmetic, "RRO_C"),

View File

@@ -31,7 +31,7 @@ u32 FramebufferConfig::BytesPerPixel(PixelFormat format) {
GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer) : renderer{renderer} {
auto& rasterizer{renderer.Rasterizer()};
memory_manager = std::make_unique<Tegra::MemoryManager>(rasterizer);
memory_manager = std::make_unique<Tegra::MemoryManager>(system, rasterizer);
dma_pusher = std::make_unique<Tegra::DmaPusher>(*this);
maxwell_3d = std::make_unique<Engines::Maxwell3D>(system, rasterizer, *memory_manager);
fermi_2d = std::make_unique<Engines::Fermi2D>(rasterizer, *memory_manager);

View File

@@ -4,14 +4,18 @@
#include "common/assert.h"
#include "common/logging/log.h"
#include "common/microprofile.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/macro_interpreter.h"
MICROPROFILE_DEFINE(MacroInterp, "GPU", "Execute macro interpreter", MP_RGB(128, 128, 192));
namespace Tegra {
MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}
void MacroInterpreter::Execute(u32 offset, std::vector<u32> parameters) {
MICROPROFILE_SCOPE(MacroInterp);
Reset();
registers[1] = parameters[0];
this->parameters = std::move(parameters);

View File

@@ -5,13 +5,17 @@
#include "common/alignment.h"
#include "common/assert.h"
#include "common/logging/log.h"
#include "core/core.h"
#include "core/hle/kernel/process.h"
#include "core/hle/kernel/vm_manager.h"
#include "core/memory.h"
#include "video_core/memory_manager.h"
#include "video_core/rasterizer_interface.h"
namespace Tegra {
MemoryManager::MemoryManager(VideoCore::RasterizerInterface& rasterizer) : rasterizer{rasterizer} {
MemoryManager::MemoryManager(Core::System& system, VideoCore::RasterizerInterface& rasterizer)
: rasterizer{rasterizer}, system{system} {
std::fill(page_table.pointers.begin(), page_table.pointers.end(), nullptr);
std::fill(page_table.attributes.begin(), page_table.attributes.end(),
Common::PageType::Unmapped);
@@ -49,6 +53,11 @@ GPUVAddr MemoryManager::MapBufferEx(VAddr cpu_addr, u64 size) {
const GPUVAddr gpu_addr{FindFreeRegion(address_space_base, aligned_size)};
MapBackingMemory(gpu_addr, Memory::GetPointer(cpu_addr), aligned_size, cpu_addr);
ASSERT(system.CurrentProcess()
->VMManager()
.SetMemoryAttribute(cpu_addr, size, Kernel::MemoryAttribute::DeviceMapped,
Kernel::MemoryAttribute::DeviceMapped)
.IsSuccess());
return gpu_addr;
}
@@ -59,7 +68,11 @@ GPUVAddr MemoryManager::MapBufferEx(VAddr cpu_addr, GPUVAddr gpu_addr, u64 size)
const u64 aligned_size{Common::AlignUp(size, page_size)};
MapBackingMemory(gpu_addr, Memory::GetPointer(cpu_addr), aligned_size, cpu_addr);
ASSERT(system.CurrentProcess()
->VMManager()
.SetMemoryAttribute(cpu_addr, size, Kernel::MemoryAttribute::DeviceMapped,
Kernel::MemoryAttribute::DeviceMapped)
.IsSuccess());
return gpu_addr;
}
@@ -68,9 +81,16 @@ GPUVAddr MemoryManager::UnmapBuffer(GPUVAddr gpu_addr, u64 size) {
const u64 aligned_size{Common::AlignUp(size, page_size)};
const CacheAddr cache_addr{ToCacheAddr(GetPointer(gpu_addr))};
const auto cpu_addr = GpuToCpuAddress(gpu_addr);
ASSERT(cpu_addr);
rasterizer.FlushAndInvalidateRegion(cache_addr, aligned_size);
UnmapRange(gpu_addr, aligned_size);
ASSERT(system.CurrentProcess()
->VMManager()
.SetMemoryAttribute(cpu_addr.value(), size, Kernel::MemoryAttribute::DeviceMapped,
Kernel::MemoryAttribute::None)
.IsSuccess());
return gpu_addr;
}
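Note the symmetry with the mapping paths above: MapBufferEx sets the DeviceMapped attribute and UnmapBuffer clears it through the same mask, where the mask selects which attribute bits to touch and the value supplies their new state. A self-contained sketch of that mask/value convention (the exact bit position of DeviceMapped is an assumption here, for illustration only):

#include <cstdint>

// Mask/value update as used by SetMemoryAttribute above: `mask` selects the
// bits to modify, `value` provides their new contents.
constexpr std::uint32_t ApplyAttribute(std::uint32_t current, std::uint32_t mask,
                                       std::uint32_t value) {
    return (current & ~mask) | (value & mask);
}

constexpr std::uint32_t DeviceMapped = 1u << 2; // bit position assumed

static_assert(ApplyAttribute(0, DeviceMapped, DeviceMapped) == DeviceMapped); // map
static_assert(ApplyAttribute(DeviceMapped, DeviceMapped, 0) == 0);            // unmap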

View File

@@ -14,6 +14,10 @@ namespace VideoCore {
class RasterizerInterface;
}
namespace Core {
class System;
}
namespace Tegra {
/**
@@ -47,7 +51,7 @@ struct VirtualMemoryArea {
class MemoryManager final {
public:
explicit MemoryManager(VideoCore::RasterizerInterface& rasterizer);
explicit MemoryManager(Core::System& system, VideoCore::RasterizerInterface& rasterizer);
~MemoryManager();
GPUVAddr AllocateSpace(u64 size, u64 align);
@@ -173,6 +177,8 @@ private:
Common::PageTable page_table{page_bits};
VMAMap vma_map;
VideoCore::RasterizerInterface& rasterizer;
Core::System& system;
};
} // namespace Tegra

View File

@@ -47,6 +47,9 @@ public:
/// and invalidated
virtual void FlushAndInvalidateRegion(CacheAddr addr, u64 size) = 0;
/// Notify rasterizer that a frame is about to finish
virtual void TickFrame() = 0;
/// Attempt to use a faster method to perform a surface copy
virtual bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
const Tegra::Engines::Fermi2D::Regs::Surface& dst,

View File

@@ -2,103 +2,57 @@
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <cstring>
#include <memory>
#include "common/alignment.h"
#include "core/core.h"
#include "video_core/memory_manager.h"
#include <glad/glad.h>
#include "common/assert.h"
#include "video_core/renderer_opengl/gl_buffer_cache.h"
#include "video_core/renderer_opengl/gl_rasterizer.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
namespace OpenGL {
CachedBufferEntry::CachedBufferEntry(VAddr cpu_addr, std::size_t size, GLintptr offset,
std::size_t alignment, u8* host_ptr)
: RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, size{size}, offset{offset},
alignment{alignment} {}
OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
std::size_t stream_size)
: VideoCommon::BufferCache<OGLBuffer, GLuint, OGLStreamBuffer>{
rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} {}
OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, std::size_t size)
: RasterizerCache{rasterizer}, stream_buffer(size, true) {}
OGLBufferCache::~OGLBufferCache() = default;
GLintptr OGLBufferCache::UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment,
bool cache) {
std::lock_guard lock{mutex};
auto& memory_manager = Core::System::GetInstance().GPU().MemoryManager();
// Cache management is a big overhead, so only cache entries with a given size.
// TODO: Figure out which size is the best for given games.
cache &= size >= 2048;
const auto& host_ptr{memory_manager.GetPointer(gpu_addr)};
if (cache) {
auto entry = TryGet(host_ptr);
if (entry) {
if (entry->GetSize() >= size && entry->GetAlignment() == alignment) {
return entry->GetOffset();
}
Unregister(entry);
}
}
AlignBuffer(alignment);
const GLintptr uploaded_offset = buffer_offset;
if (!host_ptr) {
return uploaded_offset;
}
std::memcpy(buffer_ptr, host_ptr, size);
buffer_ptr += size;
buffer_offset += size;
if (cache) {
auto entry = std::make_shared<CachedBufferEntry>(
*memory_manager.GpuToCpuAddress(gpu_addr), size, uploaded_offset, alignment, host_ptr);
Register(entry);
}
return uploaded_offset;
OGLBuffer OGLBufferCache::CreateBuffer(std::size_t size) {
OGLBuffer buffer;
buffer.Create();
glNamedBufferData(buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW);
return buffer;
}
GLintptr OGLBufferCache::UploadHostMemory(const void* raw_pointer, std::size_t size,
std::size_t alignment) {
std::lock_guard lock{mutex};
AlignBuffer(alignment);
std::memcpy(buffer_ptr, raw_pointer, size);
const GLintptr uploaded_offset = buffer_offset;
buffer_ptr += size;
buffer_offset += size;
return uploaded_offset;
const GLuint* OGLBufferCache::ToHandle(const OGLBuffer& buffer) {
return &buffer.handle;
}
bool OGLBufferCache::Map(std::size_t max_size) {
bool invalidate;
std::tie(buffer_ptr, buffer_offset_base, invalidate) =
stream_buffer.Map(static_cast<GLsizeiptr>(max_size), 4);
buffer_offset = buffer_offset_base;
if (invalidate) {
InvalidateAll();
}
return invalidate;
const GLuint* OGLBufferCache::GetEmptyBuffer(std::size_t) {
static const GLuint null_buffer = 0;
return &null_buffer;
}
void OGLBufferCache::Unmap() {
stream_buffer.Unmap(buffer_offset - buffer_offset_base);
void OGLBufferCache::UploadBufferData(const OGLBuffer& buffer, std::size_t offset, std::size_t size,
const u8* data) {
glNamedBufferSubData(buffer.handle, static_cast<GLintptr>(offset),
static_cast<GLsizeiptr>(size), data);
}
GLuint OGLBufferCache::GetHandle() const {
return stream_buffer.GetHandle();
void OGLBufferCache::DownloadBufferData(const OGLBuffer& buffer, std::size_t offset,
std::size_t size, u8* data) {
glGetNamedBufferSubData(buffer.handle, static_cast<GLintptr>(offset),
static_cast<GLsizeiptr>(size), data);
}
void OGLBufferCache::AlignBuffer(std::size_t alignment) {
// Align the offset, not the mapped pointer
const GLintptr offset_aligned =
static_cast<GLintptr>(Common::AlignUp(static_cast<std::size_t>(buffer_offset), alignment));
buffer_ptr += offset_aligned - buffer_offset;
buffer_offset = offset_aligned;
void OGLBufferCache::CopyBufferData(const OGLBuffer& src, const OGLBuffer& dst,
std::size_t src_offset, std::size_t dst_offset,
std::size_t size) {
glCopyNamedBufferSubData(src.handle, dst.handle, static_cast<GLintptr>(src_offset),
static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size));
}
} // namespace OpenGL

View File

@@ -4,80 +4,44 @@
#pragma once
#include <cstddef>
#include <memory>
#include <tuple>
#include "common/common_types.h"
#include "video_core/buffer_cache.h"
#include "video_core/rasterizer_cache.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/renderer_opengl/gl_stream_buffer.h"
namespace Core {
class System;
}
namespace OpenGL {
class OGLStreamBuffer;
class RasterizerOpenGL;
class CachedBufferEntry final : public RasterizerCacheObject {
class OGLBufferCache final : public VideoCommon::BufferCache<OGLBuffer, GLuint, OGLStreamBuffer> {
public:
explicit CachedBufferEntry(VAddr cpu_addr, std::size_t size, GLintptr offset,
std::size_t alignment, u8* host_ptr);
explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
std::size_t stream_size);
~OGLBufferCache();
VAddr GetCpuAddr() const override {
return cpu_addr;
}
std::size_t GetSizeInBytes() const override {
return size;
}
std::size_t GetSize() const {
return size;
}
GLintptr GetOffset() const {
return offset;
}
std::size_t GetAlignment() const {
return alignment;
}
private:
VAddr cpu_addr{};
std::size_t size{};
GLintptr offset{};
std::size_t alignment{};
};
class OGLBufferCache final : public RasterizerCache<std::shared_ptr<CachedBufferEntry>> {
public:
explicit OGLBufferCache(RasterizerOpenGL& rasterizer, std::size_t size);
/// Uploads data from a guest GPU address. Returns host's buffer offset where it's been
/// allocated.
GLintptr UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4,
bool cache = true);
/// Uploads from a host memory. Returns host's buffer offset where it's been allocated.
GLintptr UploadHostMemory(const void* raw_pointer, std::size_t size, std::size_t alignment = 4);
bool Map(std::size_t max_size);
void Unmap();
GLuint GetHandle() const;
const GLuint* GetEmptyBuffer(std::size_t) override;
protected:
void AlignBuffer(std::size_t alignment);
OGLBuffer CreateBuffer(std::size_t size) override;
// We do not have to flush this cache as things in it are never modified by us.
void FlushObjectInner(const std::shared_ptr<CachedBufferEntry>& object) override {}
const GLuint* ToHandle(const OGLBuffer& buffer) override;
private:
OGLStreamBuffer stream_buffer;
void UploadBufferData(const OGLBuffer& buffer, std::size_t offset, std::size_t size,
const u8* data) override;
u8* buffer_ptr = nullptr;
GLintptr buffer_offset = 0;
GLintptr buffer_offset_base = 0;
void DownloadBufferData(const OGLBuffer& buffer, std::size_t offset, std::size_t size,
u8* data) override;
void CopyBufferData(const OGLBuffer& src, const OGLBuffer& dst, std::size_t src_offset,
std::size_t dst_offset, std::size_t size) override;
};
} // namespace OpenGL

View File

@@ -24,8 +24,10 @@ T GetInteger(GLenum pname) {
Device::Device() {
uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);
shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS);
max_varyings = GetInteger<u32>(GL_MAX_VARYING_VECTORS);
has_vertex_viewport_layer = GLAD_GL_ARB_shader_viewport_layer_array;
has_variable_aoffi = TestVariableAoffi();
has_component_indexing_bug = TestComponentIndexingBug();
}
@@ -34,6 +36,7 @@ Device::Device(std::nullptr_t) {
uniform_buffer_alignment = 0;
max_vertex_attributes = 16;
max_varyings = 15;
has_vertex_viewport_layer = true;
has_variable_aoffi = true;
has_component_indexing_bug = false;
}

View File

@@ -18,6 +18,10 @@ public:
return uniform_buffer_alignment;
}
std::size_t GetShaderStorageBufferAlignment() const {
return shader_storage_alignment;
}
u32 GetMaxVertexAttributes() const {
return max_vertex_attributes;
}
@@ -26,6 +30,10 @@ public:
return max_varyings;
}
bool HasVertexViewportLayer() const {
return has_vertex_viewport_layer;
}
bool HasVariableAoffi() const {
return has_variable_aoffi;
}
@@ -39,8 +47,10 @@ private:
static bool TestComponentIndexingBug();
std::size_t uniform_buffer_alignment{};
std::size_t shader_storage_alignment{};
u32 max_vertex_attributes{};
u32 max_varyings{};
bool has_vertex_viewport_layer{};
bool has_variable_aoffi{};
bool has_component_indexing_bug{};
};

View File

@@ -1,102 +0,0 @@
// Copyright 2018 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <glad/glad.h>
#include "common/logging/log.h"
#include "core/core.h"
#include "video_core/memory_manager.h"
#include "video_core/renderer_opengl/gl_global_cache.h"
#include "video_core/renderer_opengl/gl_rasterizer.h"
#include "video_core/renderer_opengl/gl_shader_decompiler.h"
#include "video_core/renderer_opengl/utils.h"
namespace OpenGL {
CachedGlobalRegion::CachedGlobalRegion(VAddr cpu_addr, u8* host_ptr, u32 size, u32 max_size)
: RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, host_ptr{host_ptr}, size{size},
max_size{max_size} {
buffer.Create();
LabelGLObject(GL_BUFFER, buffer.handle, cpu_addr, "GlobalMemory");
}
CachedGlobalRegion::~CachedGlobalRegion() = default;
void CachedGlobalRegion::Reload(u32 size_) {
size = size_;
if (size > max_size) {
size = max_size;
LOG_CRITICAL(HW_GPU, "Global region size {} exceeded the supported size {}!", size_,
max_size);
}
glNamedBufferData(buffer.handle, size, host_ptr, GL_STREAM_DRAW);
}
void CachedGlobalRegion::Flush() {
LOG_DEBUG(Render_OpenGL, "Flushing {} bytes to CPU memory address 0x{:16}", size, cpu_addr);
glGetNamedBufferSubData(buffer.handle, 0, static_cast<GLsizeiptr>(size), host_ptr);
}
GlobalRegion GlobalRegionCacheOpenGL::TryGetReservedGlobalRegion(CacheAddr addr, u32 size) const {
const auto search{reserve.find(addr)};
if (search == reserve.end()) {
return {};
}
return search->second;
}
GlobalRegion GlobalRegionCacheOpenGL::GetUncachedGlobalRegion(GPUVAddr addr, u8* host_ptr,
u32 size) {
GlobalRegion region{TryGetReservedGlobalRegion(ToCacheAddr(host_ptr), size)};
if (!region) {
// No reserved surface available, create a new one and reserve it
auto& memory_manager{Core::System::GetInstance().GPU().MemoryManager()};
const auto cpu_addr{memory_manager.GpuToCpuAddress(addr)};
ASSERT(cpu_addr);
region = std::make_shared<CachedGlobalRegion>(*cpu_addr, host_ptr, size, max_ssbo_size);
ReserveGlobalRegion(region);
}
region->Reload(size);
return region;
}
void GlobalRegionCacheOpenGL::ReserveGlobalRegion(GlobalRegion region) {
reserve.insert_or_assign(region->GetCacheAddr(), std::move(region));
}
GlobalRegionCacheOpenGL::GlobalRegionCacheOpenGL(RasterizerOpenGL& rasterizer)
: RasterizerCache{rasterizer} {
GLint max_ssbo_size_;
glGetIntegerv(GL_MAX_SHADER_STORAGE_BLOCK_SIZE, &max_ssbo_size_);
max_ssbo_size = static_cast<u32>(max_ssbo_size_);
}
GlobalRegion GlobalRegionCacheOpenGL::GetGlobalRegion(
const GLShader::GlobalMemoryEntry& global_region,
Tegra::Engines::Maxwell3D::Regs::ShaderStage stage) {
std::lock_guard lock{mutex};
auto& gpu{Core::System::GetInstance().GPU()};
auto& memory_manager{gpu.MemoryManager()};
const auto cbufs{gpu.Maxwell3D().state.shader_stages[static_cast<std::size_t>(stage)]};
const auto addr{cbufs.const_buffers[global_region.GetCbufIndex()].address +
global_region.GetCbufOffset()};
const auto actual_addr{memory_manager.Read<u64>(addr)};
const auto size{memory_manager.Read<u32>(addr + 8)};
// Look up global region in the cache based on address
const auto& host_ptr{memory_manager.GetPointer(actual_addr)};
GlobalRegion region{TryGet(host_ptr)};
if (!region) {
// No global region found - create a new one
region = GetUncachedGlobalRegion(actual_addr, host_ptr, size);
Register(region);
}
return region;
}
} // namespace OpenGL

View File

@@ -1,82 +0,0 @@
// Copyright 2018 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <memory>
#include <unordered_map>
#include <glad/glad.h>
#include "common/assert.h"
#include "common/common_types.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/rasterizer_cache.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
namespace OpenGL {
namespace GLShader {
class GlobalMemoryEntry;
}
class RasterizerOpenGL;
class CachedGlobalRegion;
using GlobalRegion = std::shared_ptr<CachedGlobalRegion>;
class CachedGlobalRegion final : public RasterizerCacheObject {
public:
explicit CachedGlobalRegion(VAddr cpu_addr, u8* host_ptr, u32 size, u32 max_size);
~CachedGlobalRegion();
VAddr GetCpuAddr() const override {
return cpu_addr;
}
std::size_t GetSizeInBytes() const override {
return size;
}
/// Gets the GL program handle for the buffer
GLuint GetBufferHandle() const {
return buffer.handle;
}
/// Reloads the global region from guest memory
void Reload(u32 size_);
void Flush();
private:
VAddr cpu_addr{};
u8* host_ptr{};
u32 size{};
u32 max_size{};
OGLBuffer buffer;
};
class GlobalRegionCacheOpenGL final : public RasterizerCache<GlobalRegion> {
public:
explicit GlobalRegionCacheOpenGL(RasterizerOpenGL& rasterizer);
/// Gets the current specified shader stage program
GlobalRegion GetGlobalRegion(const GLShader::GlobalMemoryEntry& descriptor,
Tegra::Engines::Maxwell3D::Regs::ShaderStage stage);
protected:
void FlushObjectInner(const GlobalRegion& object) override {
object->Flush();
}
private:
GlobalRegion TryGetReservedGlobalRegion(CacheAddr addr, u32 size) const;
GlobalRegion GetUncachedGlobalRegion(GPUVAddr addr, u8* host_ptr, u32 size);
void ReserveGlobalRegion(GlobalRegion region);
std::unordered_map<CacheAddr, GlobalRegion> reserve;
u32 max_ssbo_size{};
};
} // namespace OpenGL

View File

@@ -20,6 +20,7 @@
#include "core/hle/kernel/process.h"
#include "core/settings.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/memory_manager.h"
#include "video_core/renderer_opengl/gl_rasterizer.h"
#include "video_core/renderer_opengl/gl_shader_cache.h"
#include "video_core/renderer_opengl/gl_shader_gen.h"
@@ -80,11 +81,25 @@ struct DrawParameters {
}
};
static std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer,
const GLShader::ConstBufferEntry& entry) {
if (!entry.IsIndirect()) {
return entry.GetSize();
}
if (buffer.size > Maxwell::MaxConstBufferSize) {
LOG_WARNING(Render_OpenGL, "Indirect constbuffer size {} exceeds maximum {}", buffer.size,
Maxwell::MaxConstBufferSize);
return Maxwell::MaxConstBufferSize;
}
return buffer.size;
}
RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWindow& emu_window,
ScreenInfo& info)
: texture_cache{system, *this, device}, shader_cache{*this, system, emu_window, device},
global_cache{*this}, system{system}, screen_info{info},
buffer_cache(*this, STREAM_BUFFER_SIZE) {
system{system}, screen_info{info}, buffer_cache{*this, system, STREAM_BUFFER_SIZE} {
OpenGLState::ApplyDefaultState();
shader_program_manager = std::make_unique<GLShader::ProgramManager>();
@@ -129,8 +144,6 @@ GLuint RasterizerOpenGL::SetupVertexFormat() {
state.draw.vertex_array = vao;
state.ApplyVertexArrayState();
glVertexArrayElementBuffer(vao, buffer_cache.GetHandle());
// Use the vertex array as-is, assumes that the data is formatted correctly for OpenGL.
// Always enable the first 16 vertex attributes, as we don't know which ones are actually
// used until shader time. Note: Tegra technically supports 32, but we're capping this to 16
@@ -197,11 +210,11 @@ void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) {
ASSERT(end > start);
const u64 size = end - start + 1;
const GLintptr vertex_buffer_offset = buffer_cache.UploadMemory(start, size);
const auto [vertex_buffer, vertex_buffer_offset] = buffer_cache.UploadMemory(start, size);
// Bind the vertex array to the buffer at the current offset.
glVertexArrayVertexBuffer(vao, index, buffer_cache.GetHandle(), vertex_buffer_offset,
vertex_array.stride);
vertex_array_pushbuffer.SetVertexBuffer(index, vertex_buffer, vertex_buffer_offset,
vertex_array.stride);
if (regs.instanced_arrays.IsInstancingEnabled(index) && vertex_array.divisor != 0) {
// Enable vertex buffer instancing with the specified divisor.
@@ -215,7 +228,19 @@ void RasterizerOpenGL::SetupVertexBuffer(GLuint vao) {
gpu.dirty_flags.vertex_array.reset();
}
DrawParameters RasterizerOpenGL::SetupDraw() {
GLintptr RasterizerOpenGL::SetupIndexBuffer() {
if (accelerate_draw != AccelDraw::Indexed) {
return 0;
}
MICROPROFILE_SCOPE(OpenGL_Index);
const auto& regs = system.GPU().Maxwell3D().regs;
const std::size_t size = CalculateIndexBufferSize();
const auto [buffer, offset] = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size);
vertex_array_pushbuffer.SetIndexBuffer(buffer);
return offset;
}
DrawParameters RasterizerOpenGL::SetupDraw(GLintptr index_buffer_offset) {
const auto& gpu = system.GPU().Maxwell3D();
const auto& regs = gpu.regs;
const bool is_indexed = accelerate_draw == AccelDraw::Indexed;
@@ -227,11 +252,9 @@ DrawParameters RasterizerOpenGL::SetupDraw() {
params.primitive_mode = MaxwellToGL::PrimitiveTopology(regs.draw.topology);
if (is_indexed) {
MICROPROFILE_SCOPE(OpenGL_Index);
params.index_format = MaxwellToGL::IndexFormat(regs.index_array.format);
params.count = regs.index_array.count;
params.index_buffer_offset =
buffer_cache.UploadMemory(regs.index_array.IndexStart(), CalculateIndexBufferSize());
params.index_buffer_offset = index_buffer_offset;
params.base_vertex = static_cast<GLint>(regs.vb_element_base);
} else {
params.count = regs.vertex_buffer.count;
@@ -247,10 +270,6 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
BaseBindings base_bindings;
std::array<bool, Maxwell::NumClipDistances> clip_distances{};
// Prepare packed bindings
bind_ubo_pushbuffer.Setup(base_bindings.cbuf);
bind_ssbo_pushbuffer.Setup(base_bindings.gmem);
for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) {
const auto& shader_config = gpu.regs.shader_config[index];
const Maxwell::ShaderProgram program{static_cast<Maxwell::ShaderProgram>(index)};
@@ -271,12 +290,11 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
GLShader::MaxwellUniformData ubo{};
ubo.SetFromRegs(gpu, stage);
const GLintptr offset =
const auto [buffer, offset] =
buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment());
// Bind the emulation info buffer
bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), offset,
static_cast<GLsizeiptr>(sizeof(ubo)));
bind_ubo_pushbuffer.Push(buffer, offset, static_cast<GLsizeiptr>(sizeof(ubo)));
Shader shader{shader_cache.GetStageProgram(program)};
@@ -321,9 +339,6 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
base_bindings = next_bindings;
}
bind_ubo_pushbuffer.Bind();
bind_ssbo_pushbuffer.Bind();
SyncClipEnabled(clip_distances);
gpu.dirty_flags.shaders = false;
@@ -634,26 +649,46 @@ void RasterizerOpenGL::DrawArrays() {
Maxwell::MaxShaderStage;
// Add space for at least 18 constant buffers
buffer_size +=
Maxwell::MaxConstBuffers * (MaxConstbufferSize + device.GetUniformBufferAlignment());
buffer_size += Maxwell::MaxConstBuffers *
(Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());
const bool invalidate = buffer_cache.Map(buffer_size);
if (invalidate) {
// As all cached buffers are invalidated, we need to recheck their state.
gpu.dirty_flags.vertex_array.set();
}
// Prepare the vertex array.
buffer_cache.Map(buffer_size);
// Prepare vertex array format.
const GLuint vao = SetupVertexFormat();
SetupVertexBuffer(vao);
vertex_array_pushbuffer.Setup(vao);
DrawParameters params = SetupDraw();
// Upload vertex and index data.
SetupVertexBuffer(vao);
const GLintptr index_buffer_offset = SetupIndexBuffer();
// Setup draw parameters. It will automatically choose what glDraw* method to use.
const DrawParameters params = SetupDraw(index_buffer_offset);
// Prepare packed bindings.
bind_ubo_pushbuffer.Setup(0);
bind_ssbo_pushbuffer.Setup(0);
// Setup shaders and their used resources.
texture_cache.GuardSamplers(true);
SetupShaders(params.primitive_mode);
texture_cache.GuardSamplers(false);
ConfigureFramebuffers(state);
buffer_cache.Unmap();
// Signal the buffer cache that we are not going to upload more things.
const bool invalidate = buffer_cache.Unmap();
// Now that we are no longer uploading data, we can safely bind the buffers to OpenGL.
vertex_array_pushbuffer.Bind();
bind_ubo_pushbuffer.Bind();
bind_ssbo_pushbuffer.Bind();
if (invalidate) {
// As all cached buffers are invalidated, we need to recheck their state.
gpu.dirty_flags.vertex_array.set();
}
shader_program_manager->ApplyTo(state);
state.Apply();
@@ -675,7 +710,7 @@ void RasterizerOpenGL::FlushRegion(CacheAddr addr, u64 size) {
return;
}
texture_cache.FlushRegion(addr, size);
global_cache.FlushRegion(addr, size);
buffer_cache.FlushRegion(addr, size);
}
void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) {
@@ -685,7 +720,6 @@ void RasterizerOpenGL::InvalidateRegion(CacheAddr addr, u64 size) {
}
texture_cache.InvalidateRegion(addr, size);
shader_cache.InvalidateRegion(addr, size);
global_cache.InvalidateRegion(addr, size);
buffer_cache.InvalidateRegion(addr, size);
}
@@ -696,6 +730,10 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(CacheAddr addr, u64 size) {
InvalidateRegion(addr, size);
}
void RasterizerOpenGL::TickFrame() {
buffer_cache.TickFrame();
}
bool RasterizerOpenGL::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
const Tegra::Engines::Fermi2D::Regs::Surface& dst,
const Tegra::Engines::Fermi2D::Config& copy_config) {
@@ -739,11 +777,9 @@ void RasterizerOpenGL::SetupDrawConstBuffers(Tegra::Engines::Maxwell3D::Regs::Sh
MICROPROFILE_SCOPE(OpenGL_UBO);
const auto stage_index = static_cast<std::size_t>(stage);
const auto& shader_stage = system.GPU().Maxwell3D().state.shader_stages[stage_index];
const auto& entries = shader->GetShaderEntries().const_buffers;
// Upload only the enabled buffers from the 16 constbuffers of each shader stage
for (u32 bindpoint = 0; bindpoint < entries.size(); ++bindpoint) {
const auto& entry = entries[bindpoint];
for (const auto& entry : shader->GetShaderEntries().const_buffers) {
SetupConstBuffer(shader_stage.const_buffers[entry.GetIndex()], entry);
}
}
@@ -752,46 +788,34 @@ void RasterizerOpenGL::SetupConstBuffer(const Tegra::Engines::ConstBufferInfo& b
const GLShader::ConstBufferEntry& entry) {
if (!buffer.enabled) {
// Set values to zero to unbind buffers
bind_ubo_pushbuffer.Push(0, 0, 0);
bind_ubo_pushbuffer.Push(buffer_cache.GetEmptyBuffer(sizeof(float)), 0, sizeof(float));
return;
}
std::size_t size;
if (entry.IsIndirect()) {
// Buffer is accessed indirectly, so upload the entire thing
size = buffer.size;
if (size > MaxConstbufferSize) {
LOG_WARNING(Render_OpenGL, "Indirect constbuffer size {} exceeds maximum {}", size,
MaxConstbufferSize);
size = MaxConstbufferSize;
}
} else {
// Buffer is accessed directly, upload just what we use
size = entry.GetSize();
}
// Align the actual size so it ends up being a multiple of vec4 to meet the OpenGL std140
// UBO alignment requirements.
size = Common::AlignUp(size, sizeof(GLvec4));
ASSERT_MSG(size <= MaxConstbufferSize, "Constant buffer is too big");
const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4));
const std::size_t alignment = device.GetUniformBufferAlignment();
const GLintptr offset = buffer_cache.UploadMemory(buffer.address, size, alignment);
bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), offset, size);
const auto alignment = device.GetUniformBufferAlignment();
const auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment);
bind_ubo_pushbuffer.Push(cbuf, offset, size);
}
void RasterizerOpenGL::SetupGlobalRegions(Tegra::Engines::Maxwell3D::Regs::ShaderStage stage,
const Shader& shader) {
const auto& entries = shader->GetShaderEntries().global_memory_entries;
for (std::size_t bindpoint = 0; bindpoint < entries.size(); ++bindpoint) {
const auto& entry{entries[bindpoint]};
const auto& region{global_cache.GetGlobalRegion(entry, stage)};
if (entry.IsWritten()) {
region->MarkAsModified(true, global_cache);
}
bind_ssbo_pushbuffer.Push(region->GetBufferHandle(), 0,
static_cast<GLsizeiptr>(region->GetSizeInBytes()));
auto& gpu{system.GPU()};
auto& memory_manager{gpu.MemoryManager()};
const auto cbufs{gpu.Maxwell3D().state.shader_stages[static_cast<std::size_t>(stage)]};
const auto alignment{device.GetShaderStorageBufferAlignment()};
for (const auto& entry : shader->GetShaderEntries().global_memory_entries) {
const auto addr{cbufs.const_buffers[entry.GetCbufIndex()].address + entry.GetCbufOffset()};
const auto actual_addr{memory_manager.Read<u64>(addr)};
const auto size{memory_manager.Read<u32>(addr + 8)};
const auto [ssbo, buffer_offset] =
buffer_cache.UploadMemory(actual_addr, size, alignment, true, entry.IsWritten());
bind_ssbo_pushbuffer.Push(ssbo, buffer_offset, static_cast<GLsizeiptr>(size));
}
}
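The two reads above imply the layout of the global-memory descriptor the shader sees: a 64-bit GPU address immediately followed by a 32-bit size inside the const buffer. Restated as a struct, inferred from the reads rather than taken from a declared yuzu type:

#include <cstddef>
#include <cstdint>

// Descriptor layout inferred from Read<u64>(addr) and Read<u32>(addr + 8);
// illustrative only.
struct GlobalMemoryDescriptor {
    std::uint64_t gpu_address; // where the SSBO's backing storage lives
    std::uint32_t size;        // how many bytes to bind
};
static_assert(offsetof(GlobalMemoryDescriptor, size) == 8);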

View File

@@ -24,7 +24,6 @@
#include "video_core/renderer_opengl/gl_buffer_cache.h"
#include "video_core/renderer_opengl/gl_device.h"
#include "video_core/renderer_opengl/gl_framebuffer_cache.h"
#include "video_core/renderer_opengl/gl_global_cache.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/renderer_opengl/gl_sampler_cache.h"
#include "video_core/renderer_opengl/gl_shader_cache.h"
@@ -63,6 +62,7 @@ public:
void FlushRegion(CacheAddr addr, u64 size) override;
void InvalidateRegion(CacheAddr addr, u64 size) override;
void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;
void TickFrame() override;
bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
const Tegra::Engines::Fermi2D::Regs::Surface& dst,
const Tegra::Engines::Fermi2D::Config& copy_config) override;
@@ -73,11 +73,6 @@ public:
void LoadDiskResources(const std::atomic_bool& stop_loading,
const VideoCore::DiskResourceLoadCallback& callback) override;
/// Maximum supported size that a constbuffer can have in bytes.
static constexpr std::size_t MaxConstbufferSize = 0x10000;
static_assert(MaxConstbufferSize % sizeof(GLvec4) == 0,
"The maximum size of a constbuffer must be a multiple of the size of GLvec4");
private:
struct FramebufferConfigState {
bool using_color_fb{};
@@ -191,7 +186,6 @@ private:
TextureCacheOpenGL texture_cache;
ShaderCacheOpenGL shader_cache;
GlobalRegionCacheOpenGL global_cache;
SamplerCacheOpenGL sampler_cache;
FramebufferCacheOpenGL framebuffer_cache;
@@ -210,6 +204,7 @@ private:
static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024;
OGLBufferCache buffer_cache;
VertexArrayPushBuffer vertex_array_pushbuffer;
BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER};
BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER};
@@ -222,7 +217,9 @@ private:
void SetupVertexBuffer(GLuint vao);
DrawParameters SetupDraw();
GLintptr SetupIndexBuffer();
DrawParameters SetupDraw(GLintptr index_buffer_offset);
void SetupShaders(GLenum primitive_mode);

View File

@@ -190,8 +190,11 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn
const auto texture_buffer_usage{variant.texture_buffer_usage};
std::string source = "#version 430 core\n"
"#extension GL_ARB_separate_shader_objects : enable\n\n";
source += fmt::format("#define EMULATION_UBO_BINDING {}\n", base_bindings.cbuf++);
"#extension GL_ARB_separate_shader_objects : enable\n";
if (entries.shader_viewport_layer_array) {
source += "#extension GL_ARB_shader_viewport_layer_array : enable\n";
}
source += fmt::format("\n#define EMULATION_UBO_BINDING {}\n", base_bindings.cbuf++);
for (const auto& cbuf : entries.const_buffers) {
source +=

View File

@@ -14,6 +14,7 @@
#include "common/alignment.h"
#include "common/assert.h"
#include "common/common_types.h"
#include "common/logging/log.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/renderer_opengl/gl_device.h"
#include "video_core/renderer_opengl/gl_rasterizer.h"
@@ -46,7 +47,7 @@ using TextureArgument = std::pair<Type, Node>;
using TextureIR = std::variant<TextureAoffi, TextureArgument>;
constexpr u32 MAX_CONSTBUFFER_ELEMENTS =
static_cast<u32>(RasterizerOpenGL::MaxConstbufferSize) / (4 * sizeof(float));
static_cast<u32>(Maxwell::MaxConstBufferSize) / (4 * sizeof(float));
class ShaderWriter {
public:
@@ -246,15 +247,13 @@ public:
usage.is_read, usage.is_written);
}
entries.clip_distances = ir.GetClipDistances();
entries.shader_viewport_layer_array =
stage == ShaderStage::Vertex && (ir.UsesLayer() || ir.UsesViewportIndex());
entries.shader_length = ir.GetLength();
return entries;
}
private:
using OperationDecompilerFn = std::string (GLSLDecompiler::*)(Operation);
using OperationDecompilersArray =
std::array<OperationDecompilerFn, static_cast<std::size_t>(OperationCode::Amount)>;
void DeclareVertex() {
if (stage != ShaderStage::Vertex)
return;
@@ -282,22 +281,35 @@ private:
}
void DeclareVertexRedeclarations() {
bool clip_distances_declared = false;
code.AddLine("out gl_PerVertex {{");
++code.scope;
code.AddLine("vec4 gl_Position;");
for (const auto o : ir.GetOutputAttributes()) {
if (o == Attribute::Index::PointSize)
code.AddLine("float gl_PointSize;");
if (!clip_distances_declared && (o == Attribute::Index::ClipDistances0123 ||
o == Attribute::Index::ClipDistances4567)) {
for (const auto attribute : ir.GetOutputAttributes()) {
if (attribute == Attribute::Index::ClipDistances0123 ||
attribute == Attribute::Index::ClipDistances4567) {
code.AddLine("float gl_ClipDistance[];");
clip_distances_declared = true;
break;
}
}
if (stage != ShaderStage::Vertex || device.HasVertexViewportLayer()) {
if (ir.UsesLayer()) {
code.AddLine("int gl_Layer;");
}
if (ir.UsesViewportIndex()) {
code.AddLine("int gl_ViewportIndex;");
}
} else if ((ir.UsesLayer() || ir.UsesViewportIndex()) && stage == ShaderStage::Vertex &&
!device.HasVertexViewportLayer()) {
LOG_ERROR(
Render_OpenGL,
"GL_ARB_shader_viewport_layer_array is not available and its required by a shader");
}
if (ir.UsesPointSize()) {
code.AddLine("float gl_PointSize;");
}
--code.scope;
code.AddLine("}};");
@@ -805,6 +817,45 @@ private:
return CastOperand(VisitOperand(operation, operand_index), type);
}
std::optional<std::pair<std::string, bool>> GetOutputAttribute(const AbufNode* abuf) {
switch (const auto attribute = abuf->GetIndex()) {
case Attribute::Index::Position:
return std::make_pair("gl_Position"s + GetSwizzle(abuf->GetElement()), false);
case Attribute::Index::LayerViewportPointSize:
switch (abuf->GetElement()) {
case 0:
UNIMPLEMENTED();
return {};
case 1:
if (stage == ShaderStage::Vertex && !device.HasVertexViewportLayer()) {
return {};
}
return std::make_pair("gl_Layer", true);
case 2:
if (stage == ShaderStage::Vertex && !device.HasVertexViewportLayer()) {
return {};
}
return std::make_pair("gl_ViewportIndex", true);
case 3:
UNIMPLEMENTED_MSG("Requires some state changes for gl_PointSize to work in shader");
return std::make_pair("gl_PointSize", false);
}
return {};
case Attribute::Index::ClipDistances0123:
return std::make_pair(fmt::format("gl_ClipDistance[{}]", abuf->GetElement()), false);
case Attribute::Index::ClipDistances4567:
return std::make_pair(fmt::format("gl_ClipDistance[{}]", abuf->GetElement() + 4),
false);
default:
if (IsGenericAttribute(attribute)) {
return std::make_pair(
GetOutputAttribute(attribute) + GetSwizzle(abuf->GetElement()), false);
}
UNIMPLEMENTED_MSG("Unhandled output attribute: {}", static_cast<u32>(attribute));
return {};
}
}
std::string CastOperand(const std::string& value, Type type) const {
switch (type) {
case Type::Bool:
@@ -1001,6 +1052,8 @@ private:
const Node& src = operation[1];
std::string target;
bool is_integer = false;
if (const auto gpr = std::get_if<GprNode>(&*dest)) {
if (gpr->GetIndex() == Register::ZeroIndex) {
// Writing to Register::ZeroIndex is a no op
@@ -1009,26 +1062,12 @@ private:
target = GetRegister(gpr->GetIndex());
} else if (const auto abuf = std::get_if<AbufNode>(&*dest)) {
UNIMPLEMENTED_IF(abuf->IsPhysicalBuffer());
target = [&]() -> std::string {
switch (const auto attribute = abuf->GetIndex(); abuf->GetIndex()) {
case Attribute::Index::Position:
return "gl_Position"s + GetSwizzle(abuf->GetElement());
case Attribute::Index::PointSize:
return "gl_PointSize";
case Attribute::Index::ClipDistances0123:
return fmt::format("gl_ClipDistance[{}]", abuf->GetElement());
case Attribute::Index::ClipDistances4567:
return fmt::format("gl_ClipDistance[{}]", abuf->GetElement() + 4);
default:
if (IsGenericAttribute(attribute)) {
return GetOutputAttribute(attribute) + GetSwizzle(abuf->GetElement());
}
UNIMPLEMENTED_MSG("Unhandled output attribute: {}",
static_cast<u32>(attribute));
return "0";
}
}();
const auto result = GetOutputAttribute(abuf);
if (!result) {
return {};
}
target = result->first;
is_integer = result->second;
} else if (const auto lmem = std::get_if<LmemNode>(&*dest)) {
target = fmt::format("{}[ftou({}) / 4]", GetLocalMemory(), Visit(lmem->GetAddress()));
} else if (const auto gmem = std::get_if<GmemNode>(&*dest)) {
@@ -1040,7 +1079,11 @@ private:
UNREACHABLE_MSG("Assign called without a proper target");
}
code.AddLine("{} = {};", target, Visit(src));
if (is_integer) {
code.AddLine("{} = ftoi({});", target, Visit(src));
} else {
code.AddLine("{} = {};", target, Visit(src));
}
return {};
}
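gl_Layer and gl_ViewportIndex are integer built-ins while the IR keeps values in float registers, which is what the is_integer flag and the ftoi(...) wrapper above handle. Elsewhere in this decompiler ftoi maps to GLSL's floatBitsToInt, i.e. a bit reinterpretation rather than a numeric conversion; the C++ equivalent, for reference:

#include <cstdint>
#include <cstring>

// C++ analogue of the decompiler's ftoi helper (GLSL floatBitsToInt): the
// register holds integer bits stored in a float, so reinterpret, don't convert.
std::int32_t FloatBitsToInt(float value) {
    std::int32_t result;
    std::memcpy(&result, &value, sizeof(result)); // well-defined type pun
    return result;
}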
@@ -1079,6 +1122,16 @@ private:
Type::Float);
}
std::string FCastHalf0(Operation operation) {
const std::string op_a = VisitOperand(operation, 0, Type::HalfFloat);
return fmt::format("({})[0]", op_a);
}
std::string FCastHalf1(Operation operation) {
const std::string op_a = VisitOperand(operation, 0, Type::HalfFloat);
return fmt::format("({})[1]", op_a);
}
template <Type type>
std::string Min(Operation operation) {
return GenerateBinaryCall(operation, "min", type, type, type);
@@ -1235,6 +1288,11 @@ private:
return ApplyPrecise(operation, BitwiseCastResult(clamped, Type::HalfFloat));
}
std::string HCastFloat(Operation operation) {
const std::string op_a = VisitOperand(operation, 0, Type::Float);
return fmt::format("fromHalf2(vec2({}, 0.0f))", op_a);
}
std::string HUnpack(Operation operation) {
const std::string operand{VisitOperand(operation, 0, Type::HalfFloat)};
const auto value = [&]() -> std::string {
@@ -1353,14 +1411,10 @@ private:
return fmt::format("{}[{}]", pair, VisitOperand(operation, 1, Type::Uint));
}
std::string LogicalAll2(Operation operation) {
std::string LogicalAnd2(Operation operation) {
return GenerateUnary(operation, "all", Type::Bool, Type::Bool2);
}
std::string LogicalAny2(Operation operation) {
return GenerateUnary(operation, "any", Type::Bool, Type::Bool2);
}
template <bool with_nan>
std::string GenerateHalfComparison(Operation operation, const std::string& compare_op) {
const std::string comparison{GenerateBinaryCall(operation, compare_op, Type::Bool2,
@@ -1667,7 +1721,7 @@ private:
return "utof(gl_WorkGroupID"s + GetSwizzle(element) + ')';
}
static constexpr OperationDecompilersArray operation_decompilers = {
static constexpr std::array operation_decompilers = {
&GLSLDecompiler::Assign,
&GLSLDecompiler::Select,
@@ -1679,6 +1733,8 @@ private:
&GLSLDecompiler::Negate<Type::Float>,
&GLSLDecompiler::Absolute<Type::Float>,
&GLSLDecompiler::FClamp,
&GLSLDecompiler::FCastHalf0,
&GLSLDecompiler::FCastHalf1,
&GLSLDecompiler::Min<Type::Float>,
&GLSLDecompiler::Max<Type::Float>,
&GLSLDecompiler::FCos,
@@ -1739,6 +1795,7 @@ private:
&GLSLDecompiler::Absolute<Type::HalfFloat>,
&GLSLDecompiler::HNegate,
&GLSLDecompiler::HClamp,
&GLSLDecompiler::HCastFloat,
&GLSLDecompiler::HUnpack,
&GLSLDecompiler::HMergeF32,
&GLSLDecompiler::HMergeH0,
@@ -1751,8 +1808,7 @@ private:
&GLSLDecompiler::LogicalXor,
&GLSLDecompiler::LogicalNegate,
&GLSLDecompiler::LogicalPick2,
&GLSLDecompiler::LogicalAll2,
&GLSLDecompiler::LogicalAny2,
&GLSLDecompiler::LogicalAnd2,
&GLSLDecompiler::LogicalLessThan<Type::Float>,
&GLSLDecompiler::LogicalEqual<Type::Float>,
@@ -1816,6 +1872,7 @@ private:
&GLSLDecompiler::WorkGroupId<1>,
&GLSLDecompiler::WorkGroupId<2>,
};
static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
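The table above is a pointer-to-member array indexed directly by OperationCode, and the new static_assert keeps the two in lockstep whenever an opcode is added. A self-contained sketch of the pattern, with hypothetical names:

#include <array>
#include <cstddef>
#include <cstdio>

// Hypothetical mini version of the dispatch table above.
enum class Op { Add, Mul, Amount };

struct Dispatcher {
    const char* Add() { return "add"; }
    const char* Mul() { return "mul"; }

    const char* Visit(Op op) {
        return (this->*table[static_cast<std::size_t>(op)])();
    }

    using Fn = const char* (Dispatcher::*)();
    static constexpr std::array<Fn, 2> table = {&Dispatcher::Add, &Dispatcher::Mul};
    static_assert(table.size() == static_cast<std::size_t>(Op::Amount),
                  "every Op needs a decompiler entry");
};

int main() {
    Dispatcher d;
    std::printf("%s\n", d.Visit(Op::Mul)); // prints: mul
    return 0;
}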
std::string GetRegister(u32 index) const {
return GetDeclarationWithSuffix(index, "gpr");

View File

@@ -78,6 +78,7 @@ struct ShaderEntries {
std::vector<ImageEntry> images;
std::vector<GlobalMemoryEntry> global_memory_entries;
std::array<bool, Maxwell::NumClipDistances> clip_distances{};
bool shader_viewport_layer_array{};
std::size_t shader_length{};
};

View File

@@ -373,6 +373,12 @@ std::optional<ShaderDiskCacheDecompiled> ShaderDiskCacheOpenGL::LoadDecompiledEn
}
}
bool shader_viewport_layer_array{};
if (!LoadObjectFromPrecompiled(shader_viewport_layer_array)) {
return {};
}
entry.entries.shader_viewport_layer_array = shader_viewport_layer_array;
u64 shader_length{};
if (!LoadObjectFromPrecompiled(shader_length)) {
return {};
@@ -445,6 +451,10 @@ bool ShaderDiskCacheOpenGL::SaveDecompiledFile(u64 unique_identifier, const std:
}
}
if (!SaveObjectToPrecompiled(entries.shader_viewport_layer_array)) {
return false;
}
if (!SaveObjectToPrecompiled(static_cast<u64>(entries.shader_length))) {
return false;
}

View File

@@ -6,8 +6,11 @@
#include <glad/glad.h>
#include "common/assert.h"
#include "common/logging/log.h"
#include "common/microprofile.h"
#include "video_core/renderer_opengl/gl_state.h"
MICROPROFILE_DEFINE(OpenGL_State, "OpenGL", "State Change", MP_RGB(192, 128, 128));
namespace OpenGL {
using Maxwell = Tegra::Engines::Maxwell3D::Regs;
@@ -524,6 +527,7 @@ void OpenGLState::ApplySamplers() const {
}
void OpenGLState::Apply() const {
MICROPROFILE_SCOPE(OpenGL_State);
ApplyFramebufferState();
ApplyVertexArrayState();
ApplyShaderProgram();

View File

@@ -31,6 +31,8 @@ using VideoCore::Surface::SurfaceType;
MICROPROFILE_DEFINE(OpenGL_Texture_Upload, "OpenGL", "Texture Upload", MP_RGB(128, 192, 128));
MICROPROFILE_DEFINE(OpenGL_Texture_Download, "OpenGL", "Texture Download", MP_RGB(128, 192, 128));
MICROPROFILE_DEFINE(OpenGL_Texture_Buffer_Copy, "OpenGL", "Texture Buffer Copy",
MP_RGB(128, 192, 128));
namespace {
@@ -535,6 +537,7 @@ void TextureCacheOpenGL::ImageBlit(View& src_view, View& dst_view,
}
void TextureCacheOpenGL::BufferCopy(Surface& src_surface, Surface& dst_surface) {
MICROPROFILE_SCOPE(OpenGL_Texture_Buffer_Copy);
const auto& src_params = src_surface->GetSurfaceParams();
const auto& dst_params = dst_surface->GetSurfaceParams();
UNIMPLEMENTED_IF(src_params.num_levels > 1 || dst_params.num_levels > 1);

View File

@@ -101,7 +101,6 @@ RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::Syst
RendererOpenGL::~RendererOpenGL() = default;
/// Swap buffers (render frame)
void RendererOpenGL::SwapBuffers(
std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) {
@@ -130,6 +129,8 @@ void RendererOpenGL::SwapBuffers(
DrawScreen(render_window.GetFramebufferLayout());
rasterizer->TickFrame();
render_window.SwapBuffers();
}
@@ -262,7 +263,6 @@ void RendererOpenGL::CreateRasterizer() {
if (rasterizer) {
return;
}
// Initialize sRGB Usage
OpenGLState::ClearsRGBUsed();
rasterizer = std::make_unique<RasterizerOpenGL>(system, emu_window, screen_info);
}

View File

@@ -13,29 +13,67 @@
namespace OpenGL {
VertexArrayPushBuffer::VertexArrayPushBuffer() = default;
VertexArrayPushBuffer::~VertexArrayPushBuffer() = default;
void VertexArrayPushBuffer::Setup(GLuint vao_) {
vao = vao_;
index_buffer = nullptr;
vertex_buffers.clear();
}
void VertexArrayPushBuffer::SetIndexBuffer(const GLuint* buffer) {
index_buffer = buffer;
}
void VertexArrayPushBuffer::SetVertexBuffer(GLuint binding_index, const GLuint* buffer,
GLintptr offset, GLsizei stride) {
vertex_buffers.push_back(Entry{binding_index, buffer, offset, stride});
}
void VertexArrayPushBuffer::Bind() {
if (index_buffer) {
glVertexArrayElementBuffer(vao, *index_buffer);
}
// TODO(Rodrigo): Find a way to ARB_multi_bind this
for (const auto& entry : vertex_buffers) {
glVertexArrayVertexBuffer(vao, entry.binding_index, *entry.buffer, entry.offset,
entry.stride);
}
}
BindBuffersRangePushBuffer::BindBuffersRangePushBuffer(GLenum target) : target{target} {}
BindBuffersRangePushBuffer::~BindBuffersRangePushBuffer() = default;
void BindBuffersRangePushBuffer::Setup(GLuint first_) {
first = first_;
buffers.clear();
buffer_pointers.clear();
offsets.clear();
sizes.clear();
}
void BindBuffersRangePushBuffer::Push(GLuint buffer, GLintptr offset, GLsizeiptr size) {
buffers.push_back(buffer);
void BindBuffersRangePushBuffer::Push(const GLuint* buffer, GLintptr offset, GLsizeiptr size) {
buffer_pointers.push_back(buffer);
offsets.push_back(offset);
sizes.push_back(size);
}
void BindBuffersRangePushBuffer::Bind() const {
const std::size_t count{buffers.size()};
void BindBuffersRangePushBuffer::Bind() {
// Ensure sizes are valid.
const std::size_t count{buffer_pointers.size()};
DEBUG_ASSERT(count == offsets.size() && count == sizes.size());
if (count == 0) {
return;
}
// Dereference buffers.
buffers.resize(count);
std::transform(buffer_pointers.begin(), buffer_pointers.end(), buffers.begin(),
[](const GLuint* pointer) { return *pointer; });
glBindBuffersRange(target, first, static_cast<GLsizei>(count), buffers.data(), offsets.data(),
sizes.data());
}
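Storing GLuint pointers rather than values means a buffer name that is only written after Push() (but before Bind()) is still picked up correctly; the std::transform above dereferences everything at the last possible moment. The deferred-dereference idea in miniature, with plain integers:

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    unsigned buffer_name = 0; // not yet allocated when pushed
    std::vector<const unsigned*> pointers{&buffer_name};

    buffer_name = 42; // becomes valid after Push, before Bind

    std::vector<unsigned> values(pointers.size());
    std::transform(pointers.begin(), pointers.end(), values.begin(),
                   [](const unsigned* pointer) { return *pointer; });
    std::printf("%u\n", values.front()); // prints 42
}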

View File

@@ -11,20 +11,49 @@
namespace OpenGL {
class BindBuffersRangePushBuffer {
class VertexArrayPushBuffer final {
public:
BindBuffersRangePushBuffer(GLenum target);
explicit VertexArrayPushBuffer();
~VertexArrayPushBuffer();
void Setup(GLuint vao_);
void SetIndexBuffer(const GLuint* buffer);
void SetVertexBuffer(GLuint binding_index, const GLuint* buffer, GLintptr offset,
GLsizei stride);
void Bind();
private:
struct Entry {
GLuint binding_index{};
const GLuint* buffer{};
GLintptr offset{};
GLsizei stride{};
};
GLuint vao{};
const GLuint* index_buffer{};
std::vector<Entry> vertex_buffers;
};
class BindBuffersRangePushBuffer final {
public:
explicit BindBuffersRangePushBuffer(GLenum target);
~BindBuffersRangePushBuffer();
void Setup(GLuint first_);
void Push(GLuint buffer, GLintptr offset, GLsizeiptr size);
void Push(const GLuint* buffer, GLintptr offset, GLsizeiptr size);
void Bind() const;
void Bind();
private:
GLenum target;
GLuint first;
GLenum target{};
GLuint first{};
std::vector<const GLuint*> buffer_pointers;
std::vector<GLuint> buffers;
std::vector<GLintptr> offsets;
std::vector<GLsizeiptr> sizes;

View File

@@ -205,10 +205,6 @@ public:
}
private:
using OperationDecompilerFn = Id (SPIRVDecompiler::*)(Operation);
using OperationDecompilersArray =
std::array<OperationDecompilerFn, static_cast<std::size_t>(OperationCode::Amount)>;
static constexpr auto INTERNAL_FLAGS_COUNT = static_cast<std::size_t>(InternalFlag::Amount);
void AllocateBindings() {
@@ -430,20 +426,17 @@ private:
instance_index = DeclareBuiltIn(spv::BuiltIn::InstanceIndex, spv::StorageClass::Input,
t_in_uint, "instance_index");
bool is_point_size_declared = false;
bool is_clip_distances_declared = false;
for (const auto index : ir.GetOutputAttributes()) {
if (index == Attribute::Index::PointSize) {
is_point_size_declared = true;
} else if (index == Attribute::Index::ClipDistances0123 ||
index == Attribute::Index::ClipDistances4567) {
if (index == Attribute::Index::ClipDistances0123 ||
index == Attribute::Index::ClipDistances4567) {
is_clip_distances_declared = true;
}
}
std::vector<Id> members;
members.push_back(t_float4);
if (is_point_size_declared) {
if (ir.UsesPointSize()) {
members.push_back(t_float);
}
if (is_clip_distances_declared) {
@@ -466,7 +459,7 @@ private:
position_index = MemberDecorateBuiltIn(spv::BuiltIn::Position, "position", true);
point_size_index =
MemberDecorateBuiltIn(spv::BuiltIn::PointSize, "point_size", is_point_size_declared);
MemberDecorateBuiltIn(spv::BuiltIn::PointSize, "point_size", ir.UsesPointSize());
clip_distances_index = MemberDecorateBuiltIn(spv::BuiltIn::ClipDistance, "clip_distances",
is_clip_distances_declared);
@@ -712,7 +705,8 @@ private:
case Attribute::Index::Position:
return AccessElement(t_out_float, per_vertex, position_index,
abuf->GetElement());
case Attribute::Index::PointSize:
case Attribute::Index::LayerViewportPointSize:
UNIMPLEMENTED_IF(abuf->GetElement() != 3);
return AccessElement(t_out_float, per_vertex, point_size_index);
case Attribute::Index::ClipDistances0123:
return AccessElement(t_out_float, per_vertex, clip_distances_index,
@@ -741,6 +735,16 @@ private:
return {};
}
Id FCastHalf0(Operation operation) {
UNIMPLEMENTED();
return {};
}
Id FCastHalf1(Operation operation) {
UNIMPLEMENTED();
return {};
}
Id HNegate(Operation operation) {
UNIMPLEMENTED();
return {};
@@ -751,6 +755,11 @@ private:
return {};
}
Id HCastFloat(Operation operation) {
UNIMPLEMENTED();
return {};
}
Id HUnpack(Operation operation) {
UNIMPLEMENTED();
return {};
@@ -806,12 +815,7 @@ private:
return {};
}
Id LogicalAll2(Operation operation) {
UNIMPLEMENTED();
return {};
}
Id LogicalAny2(Operation operation) {
Id LogicalAnd2(Operation operation) {
UNIMPLEMENTED();
return {};
}
@@ -1208,7 +1212,7 @@ private:
return {};
}
static constexpr OperationDecompilersArray operation_decompilers = {
static constexpr std::array operation_decompilers = {
&SPIRVDecompiler::Assign,
&SPIRVDecompiler::Ternary<&Module::OpSelect, Type::Float, Type::Bool, Type::Float,
@@ -1221,6 +1225,8 @@ private:
&SPIRVDecompiler::Unary<&Module::OpFNegate, Type::Float>,
&SPIRVDecompiler::Unary<&Module::OpFAbs, Type::Float>,
&SPIRVDecompiler::Ternary<&Module::OpFClamp, Type::Float>,
&SPIRVDecompiler::FCastHalf0,
&SPIRVDecompiler::FCastHalf1,
&SPIRVDecompiler::Binary<&Module::OpFMin, Type::Float>,
&SPIRVDecompiler::Binary<&Module::OpFMax, Type::Float>,
&SPIRVDecompiler::Unary<&Module::OpCos, Type::Float>,
@@ -1281,6 +1287,7 @@ private:
&SPIRVDecompiler::Unary<&Module::OpFAbs, Type::HalfFloat>,
&SPIRVDecompiler::HNegate,
&SPIRVDecompiler::HClamp,
&SPIRVDecompiler::HCastFloat,
&SPIRVDecompiler::HUnpack,
&SPIRVDecompiler::HMergeF32,
&SPIRVDecompiler::HMergeH0,
@@ -1293,8 +1300,7 @@ private:
&SPIRVDecompiler::Binary<&Module::OpLogicalNotEqual, Type::Bool>,
&SPIRVDecompiler::Unary<&Module::OpLogicalNot, Type::Bool>,
&SPIRVDecompiler::LogicalPick2,
&SPIRVDecompiler::LogicalAll2,
&SPIRVDecompiler::LogicalAny2,
&SPIRVDecompiler::LogicalAnd2,
&SPIRVDecompiler::Binary<&Module::OpFOrdLessThan, Type::Bool, Type::Float>,
&SPIRVDecompiler::Binary<&Module::OpFOrdEqual, Type::Bool, Type::Float>,
@@ -1359,6 +1365,7 @@ private:
&SPIRVDecompiler::WorkGroupId<1>,
&SPIRVDecompiler::WorkGroupId<2>,
};
static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
const VKDevice& device;
const ShaderIR& ir;

View File

@@ -46,12 +46,12 @@ void ShaderIR::Decode() {
coverage_end = shader_info.end;
if (shader_info.decompilable) {
disable_flow_stack = true;
const auto insert_block = ([this](NodeBlock& nodes, u32 label) {
const auto insert_block = [this](NodeBlock& nodes, u32 label) {
if (label == exit_branch) {
return;
}
basic_blocks.insert({label, nodes});
});
};
const auto& blocks = shader_info.blocks;
NodeBlock current_block;
u32 current_label = exit_branch;
@@ -103,7 +103,7 @@ void ShaderIR::DecodeRangeInner(NodeBlock& bb, u32 begin, u32 end) {
}
void ShaderIR::InsertControlFlow(NodeBlock& bb, const ShaderBlock& block) {
const auto apply_conditions = ([&](const Condition& cond, Node n) -> Node {
const auto apply_conditions = [&](const Condition& cond, Node n) -> Node {
Node result = n;
if (cond.cc != ConditionCode::T) {
result = Conditional(GetConditionCode(cond.cc), {result});
@@ -117,7 +117,7 @@ void ShaderIR::InsertControlFlow(NodeBlock& bb, const ShaderBlock& block) {
result = Conditional(GetPredicate(pred, is_neg), {result});
}
return result;
});
};
if (block.branch.address < 0) {
if (block.branch.kills) {
Node n = Operation(OperationCode::Discard);

View File

@@ -57,7 +57,7 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
case OpCode::Id::I2F_R:
case OpCode::Id::I2F_C:
case OpCode::Id::I2F_IMM: {
UNIMPLEMENTED_IF(instr.conversion.dst_size != Register::Size::Word);
UNIMPLEMENTED_IF(instr.conversion.dst_size == Register::Size::Long);
UNIMPLEMENTED_IF(instr.conversion.selector);
UNIMPLEMENTED_IF_MSG(instr.generates_cc,
"Condition codes generation in I2F is not implemented");
@@ -82,14 +82,19 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
value = GetOperandAbsNegFloat(value, false, instr.conversion.negate_a);
SetInternalFlagsFromFloat(bb, value, instr.generates_cc);
if (instr.conversion.dst_size == Register::Size::Short) {
value = Operation(OperationCode::HCastFloat, PRECISE, value);
}
SetRegister(bb, instr.gpr0, value);
break;
}
case OpCode::Id::F2F_R:
case OpCode::Id::F2F_C:
case OpCode::Id::F2F_IMM: {
UNIMPLEMENTED_IF(instr.conversion.f2f.dst_size != Register::Size::Word);
UNIMPLEMENTED_IF(instr.conversion.f2f.src_size != Register::Size::Word);
UNIMPLEMENTED_IF(instr.conversion.dst_size == Register::Size::Long);
UNIMPLEMENTED_IF(instr.conversion.src_size == Register::Size::Long);
UNIMPLEMENTED_IF_MSG(instr.generates_cc,
"Condition codes generation in F2F is not implemented");
@@ -107,6 +112,11 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
}
}();
if (instr.conversion.src_size == Register::Size::Short) {
// TODO: figure out where the extract is set in the encoding
value = Operation(OperationCode::FCastHalf0, PRECISE, value);
}
value = GetOperandAbsNegFloat(value, instr.conversion.abs_a, instr.conversion.negate_a);
value = [&]() {
@@ -124,19 +134,24 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
default:
UNIMPLEMENTED_MSG("Unimplemented F2F rounding mode {}",
static_cast<u32>(instr.conversion.f2f.rounding.Value()));
return Immediate(0);
return value;
}
}();
value = GetSaturatedFloat(value, instr.alu.saturate_d);
SetInternalFlagsFromFloat(bb, value, instr.generates_cc);
if (instr.conversion.dst_size == Register::Size::Short) {
value = Operation(OperationCode::HCastFloat, PRECISE, value);
}
SetRegister(bb, instr.gpr0, value);
break;
}
case OpCode::Id::F2I_R:
case OpCode::Id::F2I_C:
case OpCode::Id::F2I_IMM: {
UNIMPLEMENTED_IF(instr.conversion.src_size != Register::Size::Word);
UNIMPLEMENTED_IF(instr.conversion.src_size == Register::Size::Long);
UNIMPLEMENTED_IF_MSG(instr.generates_cc,
"Condition codes generation in F2I is not implemented");
Node value = [&]() {
@@ -153,6 +168,11 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
}
}();
if (instr.conversion.src_size == Register::Size::Short) {
// TODO: figure out where the extract is set in the encoding
value = Operation(OperationCode::FCastHalf0, PRECISE, value);
}
value = GetOperandAbsNegFloat(value, instr.conversion.abs_a, instr.conversion.negate_a);
value = [&]() {

View File

@@ -23,38 +23,51 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) {
Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hsetp2.type_a);
op_a = GetOperandAbsNegHalf(op_a, instr.hsetp2.abs_a, instr.hsetp2.negate_a);
Node op_b = [&]() {
switch (opcode->get().GetId()) {
case OpCode::Id::HSETP2_R:
return GetOperandAbsNegHalf(GetRegister(instr.gpr20), instr.hsetp2.abs_a,
instr.hsetp2.negate_b);
default:
UNREACHABLE();
return Immediate(0);
}
}();
op_b = UnpackHalfFloat(op_b, instr.hsetp2.type_b);
// We can't use the constant predicate as a destination.
ASSERT(instr.hsetp2.pred3 != static_cast<u64>(Pred::UnusedIndex));
const Node second_pred = GetPredicate(instr.hsetp2.pred39, instr.hsetp2.neg_pred != 0);
Tegra::Shader::PredCondition cond{};
bool h_and{};
Node op_b{};
switch (opcode->get().GetId()) {
case OpCode::Id::HSETP2_C:
cond = instr.hsetp2.cbuf_and_imm.cond;
h_and = instr.hsetp2.cbuf_and_imm.h_and;
op_b = GetOperandAbsNegHalf(GetConstBuffer(instr.cbuf34.index, instr.cbuf34.offset),
instr.hsetp2.cbuf.abs_b, instr.hsetp2.cbuf.negate_b);
break;
case OpCode::Id::HSETP2_IMM:
cond = instr.hsetp2.cbuf_and_imm.cond;
h_and = instr.hsetp2.cbuf_and_imm.h_and;
op_b = UnpackHalfImmediate(instr, true);
break;
case OpCode::Id::HSETP2_R:
cond = instr.hsetp2.reg.cond;
h_and = instr.hsetp2.reg.h_and;
op_b =
UnpackHalfFloat(GetOperandAbsNegHalf(GetRegister(instr.gpr20), instr.hsetp2.reg.abs_b,
instr.hsetp2.reg.negate_b),
instr.hsetp2.reg.type_b);
break;
default:
UNREACHABLE();
op_b = Immediate(0);
}
const OperationCode combiner = GetPredicateCombiner(instr.hsetp2.op);
const OperationCode pair_combiner =
instr.hsetp2.h_and ? OperationCode::LogicalAll2 : OperationCode::LogicalAny2;
const Node pred39 = GetPredicate(instr.hsetp2.pred39, instr.hsetp2.neg_pred);
const Node comparison = GetPredicateComparisonHalf(instr.hsetp2.cond, op_a, op_b);
const Node first_pred = Operation(pair_combiner, comparison);
const auto Write = [&](u64 dest, Node src) {
SetPredicate(bb, dest, Operation(combiner, std::move(src), pred39));
};
// Set the primary predicate to the result of Predicate OP SecondPredicate
const Node value = Operation(combiner, first_pred, second_pred);
SetPredicate(bb, instr.hsetp2.pred3, value);
if (instr.hsetp2.pred0 != static_cast<u64>(Pred::UnusedIndex)) {
// Set the secondary predicate to the result of !Predicate OP SecondPredicate, if enabled
const Node negated_pred = Operation(OperationCode::LogicalNegate, first_pred);
SetPredicate(bb, instr.hsetp2.pred0, Operation(combiner, negated_pred, second_pred));
const Node comparison = GetPredicateComparisonHalf(cond, op_a, op_b);
const u64 first = instr.hsetp2.pred0;
const u64 second = instr.hsetp2.pred3;
if (h_and) {
const Node joined = Operation(OperationCode::LogicalAnd2, comparison);
Write(first, joined);
Write(second, Operation(OperationCode::LogicalNegate, joined));
} else {
Write(first, Operation(OperationCode::LogicalPick2, comparison, Immediate(0u)));
Write(second, Operation(OperationCode::LogicalPick2, comparison, Immediate(1u)));
}
return pc;
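Per the OperationCode comments later in this diff, LogicalAnd2 reduces a bool2 to a single value while LogicalPick2 selects one lane by index; the h_and path writes the joined result and its negation, the other path writes each lane separately. Roughly, over a plain bool pair (hypothetical free functions):

#include <array>

using bool2 = std::array<bool, 2>;

constexpr bool LogicalAnd2(bool2 a) {
    return a[0] && a[1];
}

constexpr bool LogicalPick2(bool2 pair, unsigned index) {
    return pair[index];
}

static_assert(LogicalAnd2({true, true}));
static_assert(LogicalPick2({false, true}, 1));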

View File

@@ -95,12 +95,8 @@ const Image& ShaderIR::GetImage(Tegra::Shader::Image image, Tegra::Shader::Image
const Image& ShaderIR::GetBindlessImage(Tegra::Shader::Register reg,
Tegra::Shader::ImageType type) {
const Node image_register{GetRegister(reg)};
const Node base_image{
const auto [base_image, cbuf_index, cbuf_offset]{
TrackCbuf(image_register, global_code, static_cast<s64>(global_code.size()))};
const auto cbuf{std::get_if<CbufNode>(&*base_image)};
const auto cbuf_offset_imm{std::get_if<ImmediateNode>(&*cbuf->GetOffset())};
const auto cbuf_offset{cbuf_offset_imm->GetValue()};
const auto cbuf_index{cbuf->GetIndex()};
const auto cbuf_key{(static_cast<u64>(cbuf_index) << 32) | static_cast<u64>(cbuf_offset)};
// If this image has already been used, return the existing mapping.

View File

@@ -95,10 +95,10 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
const Node op_b =
GetConstBufferIndirect(instr.cbuf36.index, instr.cbuf36.GetOffset() + 4, index);
SetTemporal(bb, 0, op_a);
SetTemporal(bb, 1, op_b);
SetRegister(bb, instr.gpr0, GetTemporal(0));
SetRegister(bb, instr.gpr0.Value() + 1, GetTemporal(1));
SetTemporary(bb, 0, op_a);
SetTemporary(bb, 1, op_b);
SetRegister(bb, instr.gpr0, GetTemporary(0));
SetRegister(bb, instr.gpr0.Value() + 1, GetTemporary(1));
break;
}
default:
@@ -136,9 +136,9 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
}
}();
for (u32 i = 0; i < count; ++i)
SetTemporal(bb, i, GetLmem(i * 4));
SetTemporary(bb, i, GetLmem(i * 4));
for (u32 i = 0; i < count; ++i)
SetRegister(bb, instr.gpr0.Value() + i, GetTemporal(i));
SetRegister(bb, instr.gpr0.Value() + i, GetTemporary(i));
break;
}
default:
@@ -172,10 +172,10 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
Operation(OperationCode::UAdd, NO_PRECISE, real_address_base, it_offset);
const Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor);
SetTemporal(bb, i, gmem);
SetTemporary(bb, i, gmem);
}
for (u32 i = 0; i < count; ++i) {
SetRegister(bb, instr.gpr0.Value() + i, GetTemporal(i));
SetRegister(bb, instr.gpr0.Value() + i, GetTemporary(i));
}
break;
}
@@ -253,11 +253,11 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
TrackAndGetGlobalMemory(bb, instr, true);
// Encode in temporary registers like this: real_base_address, {registers_to_be_written...}
SetTemporal(bb, 0, real_address_base);
SetTemporary(bb, 0, real_address_base);
const u32 count = GetUniformTypeElementsCount(type);
for (u32 i = 0; i < count; ++i) {
SetTemporal(bb, i + 1, GetRegister(instr.gpr0.Value() + i));
SetTemporary(bb, i + 1, GetRegister(instr.gpr0.Value() + i));
}
for (u32 i = 0; i < count; ++i) {
const Node it_offset = Immediate(i * 4);
@@ -265,7 +265,7 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
Operation(OperationCode::UAdd, NO_PRECISE, real_address_base, it_offset);
const Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor);
bb.push_back(Operation(OperationCode::Assign, gmem, GetTemporal(i + 1)));
bb.push_back(Operation(OperationCode::Assign, gmem, GetTemporary(i + 1)));
}
break;
}
@@ -297,18 +297,13 @@ std::tuple<Node, Node, GlobalMemoryBase> ShaderIR::TrackAndGetGlobalMemory(NodeB
const auto addr_register{GetRegister(instr.gmem.gpr)};
const auto immediate_offset{static_cast<u32>(instr.gmem.offset)};
const Node base_address{
TrackCbuf(addr_register, global_code, static_cast<s64>(global_code.size()))};
const auto cbuf = std::get_if<CbufNode>(&*base_address);
ASSERT(cbuf != nullptr);
const auto cbuf_offset_imm = std::get_if<ImmediateNode>(&*cbuf->GetOffset());
ASSERT(cbuf_offset_imm != nullptr);
const auto cbuf_offset = cbuf_offset_imm->GetValue();
const auto [base_address, index, offset] =
TrackCbuf(addr_register, global_code, static_cast<s64>(global_code.size()));
ASSERT(base_address != nullptr);
bb.push_back(
Comment(fmt::format("Base address is c[0x{:x}][0x{:x}]", cbuf->GetIndex(), cbuf_offset)));
bb.push_back(Comment(fmt::format("Base address is c[0x{:x}][0x{:x}]", index, offset)));
const GlobalMemoryBase descriptor{cbuf->GetIndex(), cbuf_offset};
const GlobalMemoryBase descriptor{index, offset};
const auto& [entry, is_new] = used_global_memory.try_emplace(descriptor);
auto& usage = entry->second;
if (is_write) {

View File

@@ -102,7 +102,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
PRECISE, op_a, Immediate(3));
const Node operand =
Operation(OperationCode::IAdd, PRECISE, convert, Immediate(target));
branch = Operation(OperationCode::BranchIndirect, convert);
branch = Operation(OperationCode::BranchIndirect, operand);
}
const Tegra::Shader::ConditionCode cc = instr.flow_condition_code;

View File

@@ -181,10 +181,10 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
const Node value =
Operation(OperationCode::TextureQueryDimensions, meta,
GetRegister(instr.gpr8.Value() + (is_bindless ? 1 : 0)));
SetTemporal(bb, indexer++, value);
SetTemporary(bb, indexer++, value);
}
for (u32 i = 0; i < indexer; ++i) {
SetRegister(bb, instr.gpr0.Value() + i, GetTemporal(i));
SetRegister(bb, instr.gpr0.Value() + i, GetTemporary(i));
}
break;
}
@@ -238,10 +238,10 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
auto params = coords;
MetaTexture meta{sampler, {}, {}, {}, {}, {}, {}, element};
const Node value = Operation(OperationCode::TextureQueryLod, meta, std::move(params));
SetTemporal(bb, indexer++, value);
SetTemporary(bb, indexer++, value);
}
for (u32 i = 0; i < indexer; ++i) {
SetRegister(bb, instr.gpr0.Value() + i, GetTemporal(i));
SetRegister(bb, instr.gpr0.Value() + i, GetTemporary(i));
}
break;
}
@@ -269,7 +269,13 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
LOG_WARNING(HW_GPU, "TLDS.NODEP implementation is incomplete");
}
WriteTexsInstructionFloat(bb, instr, GetTldsCode(instr, texture_type, is_array));
const Node4 components = GetTldsCode(instr, texture_type, is_array);
if (instr.tlds.fp32_flag) {
WriteTexsInstructionFloat(bb, instr, components);
} else {
WriteTexsInstructionHalfFloat(bb, instr, components);
}
break;
}
default:
@@ -302,13 +308,9 @@ const Sampler& ShaderIR::GetSampler(const Tegra::Shader::Sampler& sampler, Textu
const Sampler& ShaderIR::GetBindlessSampler(const Tegra::Shader::Register& reg, TextureType type,
bool is_array, bool is_shadow) {
const Node sampler_register = GetRegister(reg);
const Node base_sampler =
const auto [base_sampler, cbuf_index, cbuf_offset] =
TrackCbuf(sampler_register, global_code, static_cast<s64>(global_code.size()));
const auto cbuf = std::get_if<CbufNode>(&*base_sampler);
const auto cbuf_offset_imm = std::get_if<ImmediateNode>(&*cbuf->GetOffset());
ASSERT(cbuf_offset_imm != nullptr);
const auto cbuf_offset = cbuf_offset_imm->GetValue();
const auto cbuf_index = cbuf->GetIndex();
ASSERT(base_sampler != nullptr);
const auto cbuf_key = (static_cast<u64>(cbuf_index) << 32) | static_cast<u64>(cbuf_offset);
// If this sampler has already been used, return the existing mapping.
@@ -334,11 +336,11 @@ void ShaderIR::WriteTexInstructionFloat(NodeBlock& bb, Instruction instr, const
// Skip disabled components
continue;
}
SetTemporal(bb, dest_elem++, components[elem]);
SetTemporary(bb, dest_elem++, components[elem]);
}
// After writing values to temporaries, move them to the real registers
for (u32 i = 0; i < dest_elem; ++i) {
SetRegister(bb, instr.gpr0.Value() + i, GetTemporal(i));
SetRegister(bb, instr.gpr0.Value() + i, GetTemporary(i));
}
}
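These SetTemporary/GetTemporary pairs implement a two-pass write: every result component lands in a scratch register first and only then moves into gpr0..gprN, so a later component cannot clobber a register an earlier component still reads. The hazard in miniature, over a toy register file:

#include <array>

std::array<int, 8> regs{}; // hypothetical register file; r4/r5 act as temporaries

// Reverses a register pair. Writing regs[d] directly would clobber the
// value the second assignment still needs, hence the temporaries.
void SwapPair(int d) {
    regs[4] = regs[d + 1];
    regs[5] = regs[d];
    regs[d] = regs[4];
    regs[d + 1] = regs[5];
}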
@@ -351,17 +353,17 @@ void ShaderIR::WriteTexsInstructionFloat(NodeBlock& bb, Instruction instr,
for (u32 component = 0; component < 4; ++component) {
if (!instr.texs.IsComponentEnabled(component))
continue;
SetTemporal(bb, dest_elem++, components[component]);
SetTemporary(bb, dest_elem++, components[component]);
}
for (u32 i = 0; i < dest_elem; ++i) {
if (i < 2) {
// Write the first two swizzle components to gpr0 and gpr0+1
SetRegister(bb, instr.gpr0.Value() + i % 2, GetTemporal(i));
SetRegister(bb, instr.gpr0.Value() + i % 2, GetTemporary(i));
} else {
ASSERT(instr.texs.HasTwoDestinations());
// Write the rest of the swizzle components to gpr28 and gpr28+1
SetRegister(bb, instr.gpr28.Value() + i % 2, GetTemporal(i));
SetRegister(bb, instr.gpr28.Value() + i % 2, GetTemporary(i));
}
}
}
@@ -389,11 +391,11 @@ void ShaderIR::WriteTexsInstructionHalfFloat(NodeBlock& bb, Instruction instr,
return;
}
SetTemporal(bb, 0, first_value);
SetTemporal(bb, 1, Operation(OperationCode::HPack2, values[2], values[3]));
SetTemporary(bb, 0, first_value);
SetTemporary(bb, 1, Operation(OperationCode::HPack2, values[2], values[3]));
SetRegister(bb, instr.gpr0, GetTemporal(0));
SetRegister(bb, instr.gpr28, GetTemporal(1));
SetRegister(bb, instr.gpr0, GetTemporary(0));
SetRegister(bb, instr.gpr28, GetTemporary(1));
}
Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type,

View File

@@ -73,8 +73,8 @@ u32 ShaderIR::DecodeXmad(NodeBlock& bb, u32 pc) {
if (is_psl) {
product = Operation(OperationCode::ILogicalShiftLeft, NO_PRECISE, product, Immediate(16));
}
SetTemporal(bb, 0, product);
product = GetTemporal(0);
SetTemporary(bb, 0, product);
product = GetTemporary(0);
const Node original_c = op_c;
const Tegra::Shader::XmadMode set_mode = mode; // Workaround for a clang compile error
@@ -98,13 +98,13 @@ u32 ShaderIR::DecodeXmad(NodeBlock& bb, u32 pc) {
}
}();
SetTemporal(bb, 1, op_c);
op_c = GetTemporal(1);
SetTemporary(bb, 1, op_c);
op_c = GetTemporary(1);
// TODO(Rodrigo): Use an appropriate sign for this operation
Node sum = Operation(OperationCode::IAdd, product, op_c);
SetTemporal(bb, 2, sum);
sum = GetTemporal(2);
SetTemporary(bb, 2, sum);
sum = GetTemporary(2);
if (is_merge) {
const Node a = BitfieldExtract(sum, 0, 16);
const Node b =

View File

@@ -30,6 +30,8 @@ enum class OperationCode {
FNegate, /// (MetaArithmetic, float a) -> float
FAbsolute, /// (MetaArithmetic, float a) -> float
FClamp, /// (MetaArithmetic, float value, float min, float max) -> float
FCastHalf0, /// (MetaArithmetic, f16vec2 a) -> float
FCastHalf1, /// (MetaArithmetic, f16vec2 a) -> float
FMin, /// (MetaArithmetic, float a, float b) -> float
FMax, /// (MetaArithmetic, float a, float b) -> float
FCos, /// (MetaArithmetic, float a) -> float
@@ -83,17 +85,18 @@ enum class OperationCode {
UBitfieldExtract, /// (MetaArithmetic, uint value, int offset, int offset) -> uint
UBitCount, /// (MetaArithmetic, uint) -> uint
HAdd, /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> f16vec2
HMul, /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> f16vec2
HFma, /// (MetaArithmetic, f16vec2 a, f16vec2 b, f16vec2 c) -> f16vec2
HAbsolute, /// (f16vec2 a) -> f16vec2
HNegate, /// (f16vec2 a, bool first, bool second) -> f16vec2
HClamp, /// (f16vec2 src, float min, float max) -> f16vec2
HUnpack, /// (Tegra::Shader::HalfType, T value) -> f16vec2
HMergeF32, /// (f16vec2 src) -> float
HMergeH0, /// (f16vec2 dest, f16vec2 src) -> f16vec2
HMergeH1, /// (f16vec2 dest, f16vec2 src) -> f16vec2
HPack2, /// (float a, float b) -> f16vec2
HAdd, /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> f16vec2
HMul, /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> f16vec2
HFma, /// (MetaArithmetic, f16vec2 a, f16vec2 b, f16vec2 c) -> f16vec2
HAbsolute, /// (f16vec2 a) -> f16vec2
HNegate, /// (f16vec2 a, bool first, bool second) -> f16vec2
HClamp, /// (f16vec2 src, float min, float max) -> f16vec2
HCastFloat, /// (MetaArithmetic, float a) -> f16vec2
HUnpack, /// (Tegra::Shader::HalfType, T value) -> f16vec2
HMergeF32, /// (f16vec2 src) -> float
HMergeH0, /// (f16vec2 dest, f16vec2 src) -> f16vec2
HMergeH1, /// (f16vec2 dest, f16vec2 src) -> f16vec2
HPack2, /// (float a, float b) -> f16vec2
LogicalAssign, /// (bool& dst, bool src) -> void
LogicalAnd, /// (bool a, bool b) -> bool
@@ -101,8 +104,7 @@ enum class OperationCode {
LogicalXor, /// (bool a, bool b) -> bool
LogicalNegate, /// (bool a) -> bool
LogicalPick2, /// (bool2 pair, uint index) -> bool
LogicalAll2, /// (bool2 a) -> bool
LogicalAny2, /// (bool2 a) -> bool
LogicalAnd2, /// (bool2 a) -> bool
LogicalFLessThan, /// (float a, float b) -> bool
LogicalFEqual, /// (float a, float b) -> bool

View File

@@ -12,7 +12,7 @@
namespace VideoCommon::Shader {
Node Conditional(Node condition, std::vector<Node> code) {
return MakeNode<ConditionalNode>(condition, std::move(code));
return MakeNode<ConditionalNode>(std::move(condition), std::move(code));
}
Node Comment(std::string text) {

View File

@@ -61,8 +61,17 @@ Node ShaderIR::GetConstBufferIndirect(u64 index_, u64 offset_, Node node) {
const auto [entry, is_new] = used_cbufs.try_emplace(index);
entry->second.MarkAsUsedIndirect();
const Node final_offset = Operation(OperationCode::UAdd, NO_PRECISE, node, Immediate(offset));
return MakeNode<CbufNode>(index, final_offset);
Node final_offset = [&] {
// Attempt to inline the constant buffer without a variable offset. This is done to allow
// tracking LDC calls.
if (const auto gpr = std::get_if<GprNode>(&*node)) {
if (gpr->GetIndex() == Register::ZeroIndex) {
return Immediate(offset);
}
}
return Operation(OperationCode::UAdd, NO_PRECISE, std::move(node), Immediate(offset));
}();
return MakeNode<CbufNode>(index, std::move(final_offset));
}
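Folding the offset into an immediate when the index register is RZ (which always reads zero) keeps the resulting CbufNode trackable, since TrackCbuf below only accepts constant buffers with immediate offsets. The shape of that peephole, sketched over a toy node type:

#include <memory>
#include <variant>

struct Gpr { int index; };
struct Imm { int value; };
using Node = std::shared_ptr<std::variant<Gpr, Imm>>; // toy IR node

constexpr int ZeroIndex = 255; // stand-in for Register::ZeroIndex

Node FoldOffset(Node node, int offset) {
    if (const auto* gpr = std::get_if<Gpr>(node.get());
        gpr && gpr->index == ZeroIndex) {
        // RZ + offset is just the offset: emit an immediate, not an add.
        return std::make_shared<std::variant<Gpr, Imm>>(Imm{offset});
    }
    return node; // the real code builds a UAdd operation here
}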
Node ShaderIR::GetPredicate(u64 pred_, bool negated) {
@@ -80,7 +89,7 @@ Node ShaderIR::GetPredicate(bool immediate) {
Node ShaderIR::GetInputAttribute(Attribute::Index index, u64 element, Node buffer) {
used_input_attributes.emplace(index);
return MakeNode<AbufNode>(index, static_cast<u32>(element), buffer);
return MakeNode<AbufNode>(index, static_cast<u32>(element), std::move(buffer));
}
Node ShaderIR::GetPhysicalInputAttribute(Tegra::Shader::Register physical_address, Node buffer) {
@@ -89,6 +98,22 @@ Node ShaderIR::GetPhysicalInputAttribute(Tegra::Shader::Register physical_addres
}
Node ShaderIR::GetOutputAttribute(Attribute::Index index, u64 element, Node buffer) {
if (index == Attribute::Index::LayerViewportPointSize) {
switch (element) {
case 0:
UNIMPLEMENTED();
break;
case 1:
uses_layer = true;
break;
case 2:
uses_viewport_index = true;
break;
case 3:
uses_point_size = true;
break;
}
}
if (index == Attribute::Index::ClipDistances0123 ||
index == Attribute::Index::ClipDistances4567) {
const auto clip_index =
@@ -97,7 +122,7 @@ Node ShaderIR::GetOutputAttribute(Attribute::Index index, u64 element, Node buff
}
used_output_attributes.insert(index);
return MakeNode<AbufNode>(index, static_cast<u32>(element), buffer);
return MakeNode<AbufNode>(index, static_cast<u32>(element), std::move(buffer));
}
Node ShaderIR::GetInternalFlag(InternalFlag flag, bool negated) {
@@ -109,19 +134,19 @@ Node ShaderIR::GetInternalFlag(InternalFlag flag, bool negated) {
}
Node ShaderIR::GetLocalMemory(Node address) {
return MakeNode<LmemNode>(address);
return MakeNode<LmemNode>(std::move(address));
}
Node ShaderIR::GetTemporal(u32 id) {
Node ShaderIR::GetTemporary(u32 id) {
return GetRegister(Register::ZeroIndex + 1 + id);
}
Node ShaderIR::GetOperandAbsNegFloat(Node value, bool absolute, bool negate) {
if (absolute) {
value = Operation(OperationCode::FAbsolute, NO_PRECISE, value);
value = Operation(OperationCode::FAbsolute, NO_PRECISE, std::move(value));
}
if (negate) {
value = Operation(OperationCode::FNegate, NO_PRECISE, value);
value = Operation(OperationCode::FNegate, NO_PRECISE, std::move(value));
}
return value;
}
@@ -130,24 +155,26 @@ Node ShaderIR::GetSaturatedFloat(Node value, bool saturate) {
if (!saturate) {
return value;
}
const Node positive_zero = Immediate(std::copysignf(0, 1));
const Node positive_one = Immediate(1.0f);
return Operation(OperationCode::FClamp, NO_PRECISE, value, positive_zero, positive_one);
Node positive_zero = Immediate(std::copysignf(0, 1));
Node positive_one = Immediate(1.0f);
return Operation(OperationCode::FClamp, NO_PRECISE, std::move(value), std::move(positive_zero),
std::move(positive_one));
}
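Most changes in this file take Node by value and std::move it into Operation. Assuming Node is a shared_ptr-style handle, a move transfers the existing reference instead of bumping the atomic reference count, as in this toy comparison:

#include <memory>
#include <utility>

struct Payload {};
using Node = std::shared_ptr<Payload>;

void Consume(Node) {}

int main() {
    Node value = std::make_shared<Payload>();
    Consume(value);            // copy: atomic increment now, decrement later
    Consume(std::move(value)); // move: pointer handoff, no atomic traffic
}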
Node ShaderIR::ConvertIntegerSize(Node value, Tegra::Shader::Register::Size size, bool is_signed) {
Node ShaderIR::ConvertIntegerSize(Node value, Register::Size size, bool is_signed) {
switch (size) {
case Register::Size::Byte:
value = SignedOperation(OperationCode::ILogicalShiftLeft, is_signed, NO_PRECISE, value,
Immediate(24));
value = SignedOperation(OperationCode::IArithmeticShiftRight, is_signed, NO_PRECISE, value,
Immediate(24));
value = SignedOperation(OperationCode::ILogicalShiftLeft, is_signed, NO_PRECISE,
std::move(value), Immediate(24));
value = SignedOperation(OperationCode::IArithmeticShiftRight, is_signed, NO_PRECISE,
std::move(value), Immediate(24));
return value;
case Register::Size::Short:
value = SignedOperation(OperationCode::ILogicalShiftLeft, is_signed, NO_PRECISE, value,
Immediate(16));
value = SignedOperation(OperationCode::IArithmeticShiftRight, is_signed, NO_PRECISE, value,
Immediate(16));
value = SignedOperation(OperationCode::ILogicalShiftLeft, is_signed, NO_PRECISE,
std::move(value), Immediate(16));
value = SignedOperation(OperationCode::IArithmeticShiftRight, is_signed, NO_PRECISE,
std::move(value), Immediate(16));
case Register::Size::Word:
// Default - do nothing
return value;
@@ -163,27 +190,29 @@ Node ShaderIR::GetOperandAbsNegInteger(Node value, bool absolute, bool negate, b
return value;
}
if (absolute) {
value = Operation(OperationCode::IAbsolute, NO_PRECISE, value);
value = Operation(OperationCode::IAbsolute, NO_PRECISE, std::move(value));
}
if (negate) {
value = Operation(OperationCode::INegate, NO_PRECISE, value);
value = Operation(OperationCode::INegate, NO_PRECISE, std::move(value));
}
return value;
}
Node ShaderIR::UnpackHalfImmediate(Instruction instr, bool has_negation) {
const Node value = Immediate(instr.half_imm.PackImmediates());
Node value = Immediate(instr.half_imm.PackImmediates());
if (!has_negation) {
return value;
}
const Node first_negate = GetPredicate(instr.half_imm.first_negate != 0);
const Node second_negate = GetPredicate(instr.half_imm.second_negate != 0);
return Operation(OperationCode::HNegate, NO_PRECISE, value, first_negate, second_negate);
Node first_negate = GetPredicate(instr.half_imm.first_negate != 0);
Node second_negate = GetPredicate(instr.half_imm.second_negate != 0);
return Operation(OperationCode::HNegate, NO_PRECISE, std::move(value), std::move(first_negate),
std::move(second_negate));
}
Node ShaderIR::UnpackHalfFloat(Node value, Tegra::Shader::HalfType type) {
return Operation(OperationCode::HUnpack, type, value);
return Operation(OperationCode::HUnpack, type, std::move(value));
}
Node ShaderIR::HalfMerge(Node dest, Node src, Tegra::Shader::HalfMerge merge) {
@@ -191,11 +220,11 @@ Node ShaderIR::HalfMerge(Node dest, Node src, Tegra::Shader::HalfMerge merge) {
case Tegra::Shader::HalfMerge::H0_H1:
return src;
case Tegra::Shader::HalfMerge::F32:
return Operation(OperationCode::HMergeF32, src);
return Operation(OperationCode::HMergeF32, std::move(src));
case Tegra::Shader::HalfMerge::Mrg_H0:
return Operation(OperationCode::HMergeH0, dest, src);
return Operation(OperationCode::HMergeH0, std::move(dest), std::move(src));
case Tegra::Shader::HalfMerge::Mrg_H1:
return Operation(OperationCode::HMergeH1, dest, src);
return Operation(OperationCode::HMergeH1, std::move(dest), std::move(src));
}
UNREACHABLE();
return src;
@@ -203,10 +232,10 @@ Node ShaderIR::HalfMerge(Node dest, Node src, Tegra::Shader::HalfMerge merge) {
Node ShaderIR::GetOperandAbsNegHalf(Node value, bool absolute, bool negate) {
if (absolute) {
value = Operation(OperationCode::HAbsolute, NO_PRECISE, value);
value = Operation(OperationCode::HAbsolute, NO_PRECISE, std::move(value));
}
if (negate) {
value = Operation(OperationCode::HNegate, NO_PRECISE, value, GetPredicate(true),
value = Operation(OperationCode::HNegate, NO_PRECISE, std::move(value), GetPredicate(true),
GetPredicate(true));
}
return value;
@@ -216,9 +245,11 @@ Node ShaderIR::GetSaturatedHalfFloat(Node value, bool saturate) {
if (!saturate) {
return value;
}
const Node positive_zero = Immediate(std::copysignf(0, 1));
const Node positive_one = Immediate(1.0f);
return Operation(OperationCode::HClamp, NO_PRECISE, value, positive_zero, positive_one);
Node positive_zero = Immediate(std::copysignf(0, 1));
Node positive_one = Immediate(1.0f);
return Operation(OperationCode::HClamp, NO_PRECISE, std::move(value), std::move(positive_zero),
std::move(positive_one));
}
Node ShaderIR::GetPredicateComparisonFloat(PredCondition condition, Node op_a, Node op_b) {
@@ -246,7 +277,6 @@ Node ShaderIR::GetPredicateComparisonFloat(PredCondition condition, Node op_a, N
condition == PredCondition::LessEqualWithNan ||
condition == PredCondition::GreaterThanWithNan ||
condition == PredCondition::GreaterEqualWithNan) {
predicate = Operation(OperationCode::LogicalOr, predicate,
Operation(OperationCode::LogicalFIsNan, op_a));
predicate = Operation(OperationCode::LogicalOr, predicate,
@@ -275,7 +305,8 @@ Node ShaderIR::GetPredicateComparisonInteger(PredCondition condition, bool is_si
UNIMPLEMENTED_IF_MSG(comparison == PredicateComparisonTable.end(),
"Unknown predicate comparison operation");
Node predicate = SignedOperation(comparison->second, is_signed, NO_PRECISE, op_a, op_b);
Node predicate = SignedOperation(comparison->second, is_signed, NO_PRECISE, std::move(op_a),
std::move(op_b));
UNIMPLEMENTED_IF_MSG(condition == PredCondition::LessThanWithNan ||
condition == PredCondition::NotEqualWithNan ||
@@ -305,9 +336,7 @@ Node ShaderIR::GetPredicateComparisonHalf(Tegra::Shader::PredCondition condition
UNIMPLEMENTED_IF_MSG(comparison == PredicateComparisonTable.end(),
"Unknown predicate comparison operation");
const Node predicate = Operation(comparison->second, NO_PRECISE, op_a, op_b);
return predicate;
return Operation(comparison->second, NO_PRECISE, std::move(op_a), std::move(op_b));
}
OperationCode ShaderIR::GetPredicateCombiner(PredOperation operation) {
@@ -333,31 +362,32 @@ Node ShaderIR::GetConditionCode(Tegra::Shader::ConditionCode cc) {
}
void ShaderIR::SetRegister(NodeBlock& bb, Register dest, Node src) {
bb.push_back(Operation(OperationCode::Assign, GetRegister(dest), src));
bb.push_back(Operation(OperationCode::Assign, GetRegister(dest), std::move(src)));
}
void ShaderIR::SetPredicate(NodeBlock& bb, u64 dest, Node src) {
bb.push_back(Operation(OperationCode::LogicalAssign, GetPredicate(dest), src));
bb.push_back(Operation(OperationCode::LogicalAssign, GetPredicate(dest), std::move(src)));
}
void ShaderIR::SetInternalFlag(NodeBlock& bb, InternalFlag flag, Node value) {
bb.push_back(Operation(OperationCode::LogicalAssign, GetInternalFlag(flag), value));
bb.push_back(Operation(OperationCode::LogicalAssign, GetInternalFlag(flag), std::move(value)));
}
void ShaderIR::SetLocalMemory(NodeBlock& bb, Node address, Node value) {
bb.push_back(Operation(OperationCode::Assign, GetLocalMemory(address), value));
bb.push_back(
Operation(OperationCode::Assign, GetLocalMemory(std::move(address)), std::move(value)));
}
void ShaderIR::SetTemporal(NodeBlock& bb, u32 id, Node value) {
SetRegister(bb, Register::ZeroIndex + 1 + id, value);
void ShaderIR::SetTemporary(NodeBlock& bb, u32 id, Node value) {
SetRegister(bb, Register::ZeroIndex + 1 + id, std::move(value));
}
void ShaderIR::SetInternalFlagsFromFloat(NodeBlock& bb, Node value, bool sets_cc) {
if (!sets_cc) {
return;
}
const Node zerop = Operation(OperationCode::LogicalFEqual, value, Immediate(0.0f));
SetInternalFlag(bb, InternalFlag::Zero, zerop);
Node zerop = Operation(OperationCode::LogicalFEqual, std::move(value), Immediate(0.0f));
SetInternalFlag(bb, InternalFlag::Zero, std::move(zerop));
LOG_WARNING(HW_GPU, "Condition codes implementation is incomplete");
}
@@ -365,14 +395,14 @@ void ShaderIR::SetInternalFlagsFromInteger(NodeBlock& bb, Node value, bool sets_
if (!sets_cc) {
return;
}
const Node zerop = Operation(OperationCode::LogicalIEqual, value, Immediate(0));
SetInternalFlag(bb, InternalFlag::Zero, zerop);
Node zerop = Operation(OperationCode::LogicalIEqual, std::move(value), Immediate(0));
SetInternalFlag(bb, InternalFlag::Zero, std::move(zerop));
LOG_WARNING(HW_GPU, "Condition codes implementation is incomplete");
}
Node ShaderIR::BitfieldExtract(Node value, u32 offset, u32 bits) {
return Operation(OperationCode::UBitfieldExtract, NO_PRECISE, value, Immediate(offset),
Immediate(bits));
return Operation(OperationCode::UBitfieldExtract, NO_PRECISE, std::move(value),
Immediate(offset), Immediate(bits));
}
} // namespace VideoCommon::Shader

View File

@@ -5,13 +5,10 @@
#pragma once
#include <array>
#include <cstring>
#include <map>
#include <optional>
#include <set>
#include <string>
#include <tuple>
#include <variant>
#include <vector>
#include "common/common_types.h"
@@ -115,6 +112,18 @@ public:
return static_cast<std::size_t>(coverage_end * sizeof(u64));
}
bool UsesLayer() const {
return uses_layer;
}
bool UsesViewportIndex() const {
return uses_viewport_index;
}
bool UsesPointSize() const {
return uses_point_size;
}
bool HasPhysicalAttributes() const {
return uses_physical_attributes;
}
@@ -198,8 +207,8 @@ private:
Node GetInternalFlag(InternalFlag flag, bool negated = false);
/// Generates a node representing a local memory address
Node GetLocalMemory(Node address);
/// Generates a temporal, internally it uses a post-RZ register
Node GetTemporal(u32 id);
/// Generates a temporary; internally it uses a post-RZ register
Node GetTemporary(u32 id);
/// Sets a register. src value must be a number-evaluated node.
void SetRegister(NodeBlock& bb, Tegra::Shader::Register dest, Node src);
@@ -209,8 +218,8 @@ private:
void SetInternalFlag(NodeBlock& bb, InternalFlag flag, Node value);
/// Sets a local memory address. address and value must be a number-evaluated node
void SetLocalMemory(NodeBlock& bb, Node address, Node value);
/// Sets a temporal. Internally it uses a post-RZ register
void SetTemporal(NodeBlock& bb, u32 id, Node value);
/// Sets a temporary. Internally it uses a post-RZ register
void SetTemporary(NodeBlock& bb, u32 id, Node value);
/// Sets internal flags from a float
void SetInternalFlagsFromFloat(NodeBlock& bb, Node value, bool sets_cc = true);
@@ -316,7 +325,7 @@ private:
void WriteLop3Instruction(NodeBlock& bb, Tegra::Shader::Register dest, Node op_a, Node op_b,
Node op_c, Node imm_lut, bool sets_cc);
Node TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const;
std::tuple<Node, u32, u32> TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const;
std::optional<u32> TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor) const;
@@ -346,6 +355,9 @@ private:
std::set<Image> used_images;
std::array<bool, Tegra::Engines::Maxwell3D::Regs::NumClipDistances> used_clip_distances{};
std::map<GlobalMemoryBase, GlobalMemoryUsage> used_global_memory;
bool uses_layer{};
bool uses_viewport_index{};
bool uses_point_size{};
bool uses_physical_attributes{}; // Shader uses AL2P or physical attribute read/writes
Tegra::Shader::Header header;

View File

@@ -15,56 +15,63 @@ namespace {
std::pair<Node, s64> FindOperation(const NodeBlock& code, s64 cursor,
OperationCode operation_code) {
for (; cursor >= 0; --cursor) {
const Node node = code.at(cursor);
Node node = code.at(cursor);
if (const auto operation = std::get_if<OperationNode>(&*node)) {
if (operation->GetCode() == operation_code) {
return {node, cursor};
return {std::move(node), cursor};
}
}
if (const auto conditional = std::get_if<ConditionalNode>(&*node)) {
const auto& conditional_code = conditional->GetCode();
const auto [found, internal_cursor] = FindOperation(
auto [found, internal_cursor] = FindOperation(
conditional_code, static_cast<s64>(conditional_code.size() - 1), operation_code);
if (found) {
return {found, cursor};
return {std::move(found), cursor};
}
}
}
return {};
}
} // namespace
} // Anonymous namespace
Node ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code, s64 cursor) const {
std::tuple<Node, u32, u32> ShaderIR::TrackCbuf(Node tracked, const NodeBlock& code,
s64 cursor) const {
if (const auto cbuf = std::get_if<CbufNode>(&*tracked)) {
// Cbuf found, but it has to be immediate
return std::holds_alternative<ImmediateNode>(*cbuf->GetOffset()) ? tracked : nullptr;
// Constant buffer found, test if it's an immediate
const auto offset = cbuf->GetOffset();
if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) {
return {tracked, cbuf->GetIndex(), immediate->GetValue()};
}
return {};
}
if (const auto gpr = std::get_if<GprNode>(&*tracked)) {
if (gpr->GetIndex() == Tegra::Shader::Register::ZeroIndex) {
return nullptr;
return {};
}
// Reduce the cursor by one to avoid infinite loops when the instruction sets the same
// register that it uses as an operand
const auto [source, new_cursor] = TrackRegister(gpr, code, cursor - 1);
if (!source) {
return nullptr;
return {};
}
return TrackCbuf(source, code, new_cursor);
}
if (const auto operation = std::get_if<OperationNode>(&*tracked)) {
for (std::size_t i = 0; i < operation->GetOperandsCount(); ++i) {
if (const auto found = TrackCbuf((*operation)[i], code, cursor)) {
// Cbuf found in operand
if (auto found = TrackCbuf((*operation)[i], code, cursor); std::get<0>(found)) {
// Cbuf found in operand.
return found;
}
}
return nullptr;
return {};
}
if (const auto conditional = std::get_if<ConditionalNode>(&*tracked)) {
const auto& conditional_code = conditional->GetCode();
return TrackCbuf(tracked, conditional_code, static_cast<s64>(conditional_code.size()));
}
return nullptr;
return {};
}
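TrackCbuf now reports a {node, index, offset} tuple where a null node means the search failed, and the operand loop above tests std::get<0>(found) before propagating a hit. The recursion's control flow, reduced to a toy tree search:

#include <memory>
#include <tuple>
#include <vector>

struct TreeNode {
    int value{};
    std::vector<std::shared_ptr<TreeNode>> operands;
};

using Result = std::tuple<std::shared_ptr<TreeNode>, int>;

// A null first element signals failure, mirroring how callers of
// TrackCbuf test std::get<0>(found).
Result Find(const std::shared_ptr<TreeNode>& node, int wanted) {
    if (node->value == wanted) {
        return {node, wanted};
    }
    for (const auto& operand : node->operands) {
        if (auto found = Find(operand, wanted); std::get<0>(found)) {
            return found;
        }
    }
    return {};
}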
std::optional<u32> ShaderIR::TrackImmediate(Node tracked, const NodeBlock& code, s64 cursor) const {

View File

@@ -75,9 +75,12 @@ MatchStructureResult SurfaceBaseImpl::MatchesStructure(const SurfaceParams& rhs)
// Linear Surface check
if (!params.is_tiled) {
if (std::tie(params.width, params.height, params.pitch) ==
std::tie(rhs.width, rhs.height, rhs.pitch)) {
return MatchStructureResult::FullMatch;
if (std::tie(params.height, params.pitch) == std::tie(rhs.height, rhs.pitch)) {
if (params.width == rhs.width) {
return MatchStructureResult::FullMatch;
} else {
return MatchStructureResult::SemiMatch;
}
}
return MatchStructureResult::None;
}
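Splitting the std::tie comparison lets the width differ while height and pitch still match, which is exactly what separates a SemiMatch from a FullMatch here. The underlying idiom is element-wise comparison over tuples of references:

#include <tuple>

struct Params {
    unsigned width, height, pitch;
};

bool SameRowLayout(const Params& a, const Params& b) {
    // std::tie builds tuples of references; operator== compares element-wise.
    return std::tie(a.height, a.pitch) == std::tie(b.height, b.pitch);
}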

View File

@@ -200,8 +200,9 @@ public:
modification_tick = tick;
}
void MarkAsRenderTarget(const bool is_target) {
void MarkAsRenderTarget(const bool is_target, const u32 index) {
this->is_target = is_target;
this->index = index;
}
void MarkAsPicked(const bool is_picked) {
@@ -221,6 +222,10 @@ public:
return is_target;
}
u32 GetRenderTarget() const {
return index;
}
bool IsRegistered() const {
return is_registered;
}
@@ -307,10 +312,13 @@ private:
return view;
}
static constexpr u32 NO_RT = 0xFFFFFFFF;
bool is_modified{};
bool is_target{};
bool is_registered{};
bool is_picked{};
u32 index{NO_RT};
u64 modification_tick{};
};

View File

@@ -290,12 +290,19 @@ std::size_t SurfaceParams::GetLayerSize(bool as_host_size, bool uncompressed) co
std::size_t SurfaceParams::GetInnerMipmapMemorySize(u32 level, bool as_host_size,
bool uncompressed) const {
const bool tiled{as_host_size ? false : is_tiled};
const u32 width{GetMipmapSize(uncompressed, GetMipWidth(level), GetDefaultBlockWidth())};
const u32 height{GetMipmapSize(uncompressed, GetMipHeight(level), GetDefaultBlockHeight())};
const u32 depth{is_layered ? 1U : GetMipDepth(level)};
return Tegra::Texture::CalculateSize(tiled, GetBytesPerPixel(), width, height, depth,
GetMipBlockHeight(level), GetMipBlockDepth(level));
if (is_tiled) {
return Tegra::Texture::CalculateSize(!as_host_size, GetBytesPerPixel(), width, height,
depth, GetMipBlockHeight(level),
GetMipBlockDepth(level));
} else if (as_host_size || IsBuffer()) {
return GetBytesPerPixel() * width * height * depth;
} else {
// Linear Texture Case
return pitch * height * depth;
}
}
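The three branches give three different size formulas: tiled surfaces defer to Tegra::Texture::CalculateSize, host-side or buffer surfaces are tightly packed, and guest linear textures must honour the row pitch. A worked example with assumed values (64x64x1, 4 bytes per pixel, pitch 512):

#include <cstdio>

int main() {
    const unsigned width = 64, height = 64, depth = 1;
    const unsigned bytes_per_pixel = 4, pitch = 512;

    // Host/buffer path: rows are tightly packed.
    const unsigned host_size = bytes_per_pixel * width * height * depth; // 16384

    // Guest linear path: each row occupies the full pitch.
    const unsigned guest_size = pitch * height * depth; // 32768

    std::printf("%u vs %u bytes\n", host_size, guest_size);
}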
bool SurfaceParams::operator==(const SurfaceParams& rhs) const {

View File

@@ -133,11 +133,11 @@ public:
regs.zeta.memory_layout.block_depth, regs.zeta.memory_layout.type)};
auto surface_view = GetSurface(gpu_addr, depth_params, preserve_contents, true);
if (depth_buffer.target)
depth_buffer.target->MarkAsRenderTarget(false);
depth_buffer.target->MarkAsRenderTarget(false, NO_RT);
depth_buffer.target = surface_view.first;
depth_buffer.view = surface_view.second;
if (depth_buffer.target)
depth_buffer.target->MarkAsRenderTarget(true);
depth_buffer.target->MarkAsRenderTarget(true, DEPTH_RT);
return surface_view.second;
}
@@ -167,11 +167,11 @@ public:
auto surface_view = GetSurface(gpu_addr, SurfaceParams::CreateForFramebuffer(system, index),
preserve_contents, true);
if (render_targets[index].target)
render_targets[index].target->MarkAsRenderTarget(false);
render_targets[index].target->MarkAsRenderTarget(false, NO_RT);
render_targets[index].target = surface_view.first;
render_targets[index].view = surface_view.second;
if (render_targets[index].target)
render_targets[index].target->MarkAsRenderTarget(true);
render_targets[index].target->MarkAsRenderTarget(true, static_cast<u32>(index));
return surface_view.second;
}
@@ -191,7 +191,7 @@ public:
if (depth_buffer.target == nullptr) {
return;
}
depth_buffer.target->MarkAsRenderTarget(false);
depth_buffer.target->MarkAsRenderTarget(false, NO_RT);
depth_buffer.target = nullptr;
depth_buffer.view = nullptr;
}
@@ -200,7 +200,7 @@ public:
if (render_targets[index].target == nullptr) {
return;
}
render_targets[index].target->MarkAsRenderTarget(false);
render_targets[index].target->MarkAsRenderTarget(false, NO_RT);
render_targets[index].target = nullptr;
render_targets[index].view = nullptr;
}
@@ -270,6 +270,16 @@ protected:
// and reading it from a separate buffer.
virtual void BufferCopy(TSurface& src_surface, TSurface& dst_surface) = 0;
void ManageRenderTargetUnregister(TSurface& surface) {
auto& maxwell3d = system.GPU().Maxwell3D();
const u32 index = surface->GetRenderTarget();
if (index == DEPTH_RT) {
maxwell3d.dirty_flags.zeta_buffer = true;
} else {
maxwell3d.dirty_flags.color_buffer.set(index, true);
}
}
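Unregistering a surface that is still bound as a render target sets the matching Maxwell3D dirty flag, so the next draw re-establishes the attachment instead of sampling a stale view. The pattern is an ordinary dirty-bit set, roughly:

#include <bitset>

// Hypothetical mirror of the dirty flags touched above.
struct DirtyFlags {
    bool zeta_buffer{};
    std::bitset<8> color_buffer;
};

constexpr unsigned DEPTH_RT = 8; // matches the sentinel defined below

void MarkRenderTargetDirty(DirtyFlags& flags, unsigned index) {
    if (index == DEPTH_RT) {
        flags.zeta_buffer = true; // depth attachment must rebind
    } else {
        flags.color_buffer.set(index, true); // color target `index` must rebind
    }
}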
void Register(TSurface surface) {
const GPUVAddr gpu_addr = surface->GetGpuAddr();
const CacheAddr cache_ptr = ToCacheAddr(system.GPU().MemoryManager().GetPointer(gpu_addr));
@@ -294,6 +304,9 @@ protected:
if (guard_render_targets && surface->IsProtected()) {
return;
}
if (!guard_render_targets && surface->IsRenderTarget()) {
ManageRenderTargetUnregister(surface);
}
const GPUVAddr gpu_addr = surface->GetGpuAddr();
const CacheAddr cache_ptr = surface->GetCacheAddr();
const std::size_t size = surface->GetSizeInBytes();
@@ -649,15 +662,6 @@ private:
}
return {current_surface, *view};
}
// The next case is unsafe, so if we are in accurate GPU emulation, just skip it
if (Settings::values.use_accurate_gpu_emulation) {
return RecycleSurface(overlaps, params, gpu_addr, preserve_contents,
MatchTopologyResult::FullMatch);
}
// This is the case where the texture is part of the parent.
if (current_surface->MatchesSubTexture(params, gpu_addr)) {
return RebuildSurface(current_surface, params, is_render);
}
} else {
// If there are many overlaps, odds are they are subtextures of the candidate
// surface. We try to construct a new surface based on the candidate parameters,
@@ -793,6 +797,9 @@ private:
static constexpr u64 registry_page_size{1 << registry_page_bits};
std::unordered_map<CacheAddr, std::vector<TSurface>> registry;
static constexpr u32 DEPTH_RT = 8;
static constexpr u32 NO_RT = 0xFFFFFFFF;
// The L1 cache is used for fast texture lookup before checking the overlaps.
// This avoids calculating the size and other properties.
std::unordered_map<CacheAddr, TSurface> l1_cache;