Compare commits

...

69 Commits

Author SHA1 Message Date
Fernando Sahmkow
218ee18417 Texture Cache: Improve documentation 2019-12-22 12:29:23 -04:00
Fernando Sahmkow
a3916588b6 Texture Cache: Address Feedback 2019-12-22 12:24:34 -04:00
Fernando Sahmkow
51c9e98677 Texture Cache: Add HLE methods for building 3D textures within the GPU in certain scenarios.
This commit adds a series of HLE methods for handling 3D textures in
general. This helps games that generate 3D textures on every frame and
may reduce loading times for certain games.
2019-12-22 12:24:34 -04:00
Fernando Sahmkow
aea978e037 Merge pull request #3230 from ReinUsesLisp/vk-emu-shaders
renderer_vulkan/shader: Add helper GLSL shaders
2019-12-22 11:23:09 -04:00
Fernando Sahmkow
27efcc15e9 Merge pull request #3240 from ReinUsesLisp/decomp-cond-code
vk_shader_decompiler: Use Visit instead of reimplementing it
2019-12-22 11:20:55 -04:00
bunnei
16dcfacbfc Merge pull request #3235 from ReinUsesLisp/ldg-u8
shader/memory: Implement LDG.U8 and unaligned U8 loads
2019-12-21 22:50:28 -05:00
ReinUsesLisp
af93909c9c vk_shader_decompiler: Use Visit instead of reimplementing it
The ExprCondCode visit reimplements what the generic Visit already
does. Use the generic Visit instead.

As an intended side effect, this fixes uses of unwritten memory when a
negation of a condition code is used.
2019-12-20 21:36:25 -03:00
bunnei
7be65c6a68 Merge pull request #3234 from ReinUsesLisp/i2f-u8-selector
shader/conversion: Implement byte selector in I2F
2019-12-19 22:36:26 -05:00
bunnei
6d55b14cc0 Merge pull request #3233 from ReinUsesLisp/mismatch-sizes
shader/texture: Properly shrink unused entries in size mismatches
2019-12-19 20:40:27 -05:00
bunnei
1eb4a95d2b Merge pull request #3232 from ReinUsesLisp/gl-decompiler-images
gl_shader_decompiler: Add missing DeclareImages
2019-12-19 11:32:47 -05:00
bunnei
253aa52351 Merge pull request #3231 from ReinUsesLisp/tld4s-encoding
shader_bytecode: Fix TLD4S encoding
2019-12-19 11:32:25 -05:00
bunnei
d53cf05513 Merge pull request #3221 from ReinUsesLisp/vk-scheduler
vk_scheduler: Delegate commands to a worker thread and state track
2019-12-18 22:04:08 -05:00
ReinUsesLisp
ae8d4b6c0c shader/memory: Implement LDG.U8 and unaligned U8 loads
LDG can load single bytes instead of full integers or packs of integers.
These have the advantage of loading bytes that are not aligned to 4
bytes.

To emulate these, this commit gets the byte being referenced (by
computing "address & 3") and uses that to extract the byte from the
loaded integer:

result = bitfieldExtract(loaded_integer, (address % 4) * 8, 8)
2019-12-18 01:21:46 -03:00
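A minimal C++ sketch of the extraction described above (standalone
illustration with assumed names, not yuzu's actual decompiler code):

#include <cstdint>

// Emulate an unaligned LDG.U8: load the aligned 32-bit word that
// contains the byte, then extract the byte selected by the low two
// address bits.
std::uint32_t LoadU8Unaligned(const std::uint32_t* memory, std::uint32_t address) {
    const std::uint32_t loaded_integer = memory[address / 4];
    // (address % 4) selects the byte within the word; * 8 converts it to
    // a bit offset, as in bitfieldExtract(loaded_integer, offset, 8).
    const std::uint32_t offset = (address % 4) * 8;
    return (loaded_integer >> offset) & 0xFF;
}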
ReinUsesLisp
a7d6bd1ef1 shader/conversion: Implement byte selector in I2F
I2F's byte selector chooses which byte of the input to convert to
float, e.g. if the input is 0xaabbccdd and the selector is ".B3", it
converts 0xaa. The default (when not shown in nvdisasm) is ".B0"; in
that example the default would convert 0xdd to float.
2019-12-18 00:41:22 -03:00
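A minimal C++ sketch of the byte selector semantics described above
(hypothetical helper name, not the decompiler's code):

#include <cstdint>

// Apply an I2F byte selector .B0-.B3: pick one byte of the input and
// convert it to float.
float I2FByteSelect(std::uint32_t input, unsigned selector) {
    const std::uint32_t byte = (input >> (selector * 8)) & 0xFF;
    return static_cast<float>(byte);
}

// I2FByteSelect(0xaabbccdd, 3) converts 0xaa; selector 0 (the default)
// converts 0xdd, matching the example in the commit message.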
bunnei
c053269017 Merge pull request #3227 from amilajack/patch-1
delete appveyor config
2019-12-17 21:49:22 -05:00
ReinUsesLisp
15a753b9a5 shader/texture: Properly shrink unused entries in size mismatches
When an image format mismatches, we were inserting zeroes into the
texture itself. This did not handle cases where the mismatch uses
fewer coordinates than the guest shader code. Address that by resizing
the vector.
2019-12-17 23:38:10 -03:00
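As a rough illustration of the fix (hypothetical names; the real
change operates on the decompiler's coordinate vector):

#include <cstddef>
#include <vector>

// Shrink the coordinate list to what the mismatched image actually
// uses, instead of padding the texture access with zeroes.
void ShrinkUnusedEntries(std::vector<float>& coords, std::size_t used_count) {
    if (coords.size() > used_count) {
        coords.resize(used_count);
    }
}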
ReinUsesLisp
e438079b50 gl_shader_decompiler: Add missing DeclareImages 2019-12-17 23:34:15 -03:00
ReinUsesLisp
8b26b4228b shader_bytecode: Fix TLD4S encoding 2019-12-17 23:32:10 -03:00
bunnei
8825b88a45 Merge pull request #3173 from yuzu-emu/bunnei-spscqueue
common: SPSCQueue: Notify after incrementing queue size.
2019-12-17 14:11:20 -05:00
Amila Welihinda
8a23c32cf0 delete .appveyor dir 2019-12-17 00:20:34 -08:00
bunnei
67b8ecc73e common: SPSCQueue: Notify after incrementing queue size. 2019-12-16 20:39:53 -05:00
ReinUsesLisp
b52297767e renderer_vulkan/shader: Add helper GLSL shaders
These shaders specify code that is not dynamically generated in the
Vulkan backend. Instead of packing them into the build system, they
are manually built and copied into the C++ file to avoid adding
unnecessary build-time dependencies.

quad_array should be dropped in the future since it can be emulated with
a memory pool generated from the CPU.
2019-12-16 17:59:08 -03:00
bunnei
65b1b05e05 Merge pull request #3182 from ReinUsesLisp/renderer-opengl
renderer_opengl: Miscellaneous clean ups
2019-12-16 13:01:04 -05:00
Rodrigo Locatti
eac075692b Merge pull request #3219 from FernandoS27/fix-bindless
Corrections and fixes to TLD4S & bindless samplers failing
2019-12-16 01:26:11 -03:00
Amila Welihinda
0471eb6dc7 delete appveyor config 2019-12-15 11:16:39 -08:00
bunnei
3d51153611 Merge pull request #3222 from ReinUsesLisp/maxwell-to-vk
maxwell_to_vk: Use VK_EXT_index_type_uint8 and misc changes
2019-12-14 22:30:12 -05:00
bunnei
ccda77c8c4 Merge pull request #3224 from bunnei/boost-ext-update
externals: Update boost-ext to include safe_numerics.
2019-12-14 16:13:47 -05:00
bunnei
035ec7d9de Merge pull request #3213 from ReinUsesLisp/intel-mesa
gl_device: Enable compute shaders for Intel Mesa drivers
2019-12-14 16:04:31 -05:00
bunnei
285705b5f4 externals: Update boost-ext to include safe_numerics.
- This is useful to me for an upcoming change.
2019-12-14 03:04:42 -05:00
bunnei
2b650543c6 Merge pull request #3212 from ReinUsesLisp/fix-smem-lmem
gl_shader_cache: Add missing new-line on emitted GLSL
2019-12-13 21:35:29 -05:00
ReinUsesLisp
e3ea583893 maxwell_to_vk: Improve image format table and add more formats
A1B5G5R5 uses A1R5G5B5. This is flipped with image view swizzles;
flushing is still not properly implemented on Vulkan for this particular
format.
2019-12-13 03:12:29 -03:00
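A sketch of the swizzle flip mentioned above, assuming a plain R/B
swap in the image view (illustrative, not the exact yuzu code):

#include <vulkan/vulkan.hpp>

// View A1B5G5R5 guest data through vk::Format::eA1R5G5B5UnormPack16 by
// swapping the red and blue components in the image view.
vk::ComponentMapping MakeA1B5G5R5Swizzle() {
    return {vk::ComponentSwizzle::eB, vk::ComponentSwizzle::eG,
            vk::ComponentSwizzle::eR, vk::ComponentSwizzle::eA};
}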
ReinUsesLisp
f27b21077d maxwell_to_vk: Implement more vertex formats 2019-12-13 03:12:28 -03:00
ReinUsesLisp
8db8631d81 maxwell_to_vk: Implement more primitive topologies
Add an extra argument to query device capabilities in the future. The
intention behind this is to use native quads, quad strips, line loops
and polygons if these are ever exposed by Vulkan.
2019-12-13 03:12:28 -03:00
ReinUsesLisp
15513f0801 maxwell_to_vk: Approach GL_CLAMP closer to the GL spec
The OpenGL spec defines GL_CLAMP's formula similarly to CLAMP_TO_EDGE
and CLAMP_TO_BORDER depending on the filter mode used. It doesn't
exactly behave like this, but it's the closest we can get with what
Vulkan offers without emulating it by injecting shader code.
2019-12-13 03:12:28 -03:00
ReinUsesLisp
f845df8651 maxwell_to_vk: Use VK_EXT_index_type_uint8 when available 2019-12-13 02:37:23 -03:00
ReinUsesLisp
2df9a2dcaf vk_scheduler: Delegate commands to a worker thread and state track
Introduce a worker thread for delegating Vulkan work, derived from
dxvk's approach: https://github.com/doitsujin/dxvk

Now that the scheduler handles all Vulkan work related to command
streaming, store state tracking in it. This way we know when to
reupload Vulkan dynamic state to the queue (since that state is
invalidated between command buffers, unlike on NVN). We can also store
the current renderpass state and the bound graphics pipeline to avoid
redundant binds and renderpass begins/ends.
2019-12-13 02:24:48 -03:00
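A minimal sketch of the worker-thread delegation described above,
using simplified standalone types rather than yuzu's VKScheduler API
(the real scheduler records Vulkan commands, not std::function work
items):

#include <condition_variable>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>

class WorkerScheduler {
public:
    WorkerScheduler() : worker{[this] { WorkerLoop(); }} {}

    ~WorkerScheduler() {
        {
            std::lock_guard lock{mutex};
            stop = true;
        }
        cv.notify_one();
        worker.join();
    }

    // Record a command; it is executed asynchronously by the worker.
    void Record(std::function<void()> command) {
        {
            std::lock_guard lock{mutex};
            work.push(std::move(command));
        }
        cv.notify_one();
    }

private:
    void WorkerLoop() {
        for (;;) {
            std::function<void()> command;
            {
                std::unique_lock lock{mutex};
                cv.wait(lock, [this] { return stop || !work.empty(); });
                if (stop && work.empty()) {
                    return;
                }
                command = std::move(work.front());
                work.pop();
            }
            command();
        }
    }

    std::mutex mutex;
    std::condition_variable cv;
    std::queue<std::function<void()>> work;
    bool stop = false;
    std::thread worker; // Declared last so the members above outlive it.
};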
bunnei
6d0d79109b Merge pull request #3214 from lioncash/svc-func
kernel/svc: Amend function signature of SignalProcessWideKey
2019-12-12 21:32:36 -05:00
bunnei
8fc49a83b6 Merge pull request #3217 from jhol/fix-boost-include
Added missing include
2019-12-11 22:21:24 -05:00
Fernando Sahmkow
c0ee0aa1a8 Shader_IR: Correct TLD4S Depth Compare. 2019-12-11 19:53:17 -04:00
Fernando Sahmkow
af89723fa3 Shader_Ir: Correct TLD4S encoding and implement f16 flag. 2019-12-11 19:53:17 -04:00
Fernando Sahmkow
84a158c977 Gl_Shader_compiler: Correct Depth Compare for Texture Gather operations. 2019-12-11 19:53:16 -04:00
Fernando Sahmkow
271a3264f3 Shader_Ir: default failed tracks on bindless samplers to null values. 2019-12-11 19:53:16 -04:00
Fernando Sahmkow
900b2e5cae Merge pull request #3218 from FernandoS27/tess-gl
Gl_Rasterizer: Skip Tessellation Control and Eval stages as they are unimplemented
2019-12-11 17:50:09 -04:00
Fernando Sahmkow
1d2ba3cc97 Gl_Rasterizer: Skip Tessellation Control and Eval stages as they are unimplemented.
This commit ensures the OGL backend does not execute tessellation
shader stages, as they are currently unimplemented.
2019-12-11 15:41:26 -04:00
bunnei
1a66cde175 Merge pull request #3210 from ReinUsesLisp/memory-barrier
shader: Implement MEMBAR.GL
2019-12-11 14:24:39 -05:00
Joel Holdsworth
e9faa1617c Added missing include 2019-12-11 18:11:49 +00:00
Fernando Sahmkow
22c6b9fab2 Kernel: Correct behavior of Address Arbiter threads. (#3165)
* Kernel: Correct behavior of Address Arbiter threads.

This corrects arbitration threads to behave just like in Horizon OS.
They are added into a container and released according to the priority
they had when added. Horizon OS does not reorder them if their
priority changes.

* Kernel: Address Feedback.
2019-12-11 10:55:38 -05:00
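The insertion policy reads roughly like the sketch below (simplified
Thread type; the actual change appears in the address_arbiter diff
further down):

#include <list>
#include <memory>

struct Thread {
    int priority; // Lower value means higher priority.
};

// Insert by the priority the thread has right now; later priority
// changes do not reorder the list, matching Horizon OS behavior.
void InsertByPriority(std::list<std::shared_ptr<Thread>>& waiters,
                      std::shared_ptr<Thread> thread) {
    for (auto it = waiters.begin(); it != waiters.end(); ++it) {
        if ((*it)->priority >= thread->priority) {
            waiters.insert(it, std::move(thread));
            return;
        }
    }
    waiters.push_back(std::move(thread));
}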
Lioncash
30e365e4fc kernel/svc: Correct function signature of SignalProcessWideKey
This function doesn't actually return a result code, so we can amend the
signature of it to match.
2019-12-11 07:13:27 -05:00
ReinUsesLisp
f564eaebed gl_device: Enable compute shaders for Intel Mesa drivers
Previously we naively checked for "Intel" in GL_VENDOR, but this
matches both Intel's proprietary driver and the Mesa driver. Re-enable
compute shaders for Mesa.
2019-12-11 00:00:30 -03:00
ReinUsesLisp
48e16c4c49 gl_shader_cache: Add missing new-line on emitted GLSL
Add missing new-line. This caused shaders using local memory and shared
memory to inject a preprocessor GLSL line after an expression (resulting
in invalid code).

It looked like this:
shared uint smem[8];#define LOCAL_MEMORY_SIZE 16

It should look like this (addressed by this commit):
shared uint smem[8];
#define LOCAL_MEMORY_SIZE 16
2019-12-10 23:52:51 -03:00
bunnei
34f8881d3e Merge pull request #3201 from lioncash/dump
kernel/svc: Provide implementations for svcDumpInfo/svcDumpInfoNew
2019-12-10 21:48:37 -05:00
Rodrigo Locatti
c8db7d1399 Merge pull request #3211 from FernandoS27/depth-mode
Maxwell3D: Implement Depth Mode.
2019-12-10 21:20:52 -03:00
Fernando Sahmkow
7ffb672f61 Maxwell3D: Implement Depth Mode.
This commit finishes adding the depth mode support that was reverted
before due to other unresolved issues.
2019-12-10 19:51:46 -04:00
ReinUsesLisp
425a254fa2 shader: Implement MEMBAR.GL
Implemented using memoryBarrier() in GLSL and OpMemoryBarrier in SPIR-V.
2019-12-10 16:45:03 -03:00
Fernando Sahmkow
6edadef96d Merge pull request #3208 from ReinUsesLisp/vk-shader-decompiler
vk_shader_decompiler: Add tessellation and misc changes
2019-12-10 08:01:41 -04:00
ReinUsesLisp
233ed96a5c vk_shader_decompiler: Fix build issues on old gcc versions 2019-12-10 01:55:38 -03:00
ReinUsesLisp
d30cf51d7d vk_shader_decompiler: Reduce YNegate's severity 2019-12-09 23:52:28 -03:00
ReinUsesLisp
0b5b93053d shader_ir/other: Implement S2R InvocationId 2019-12-09 23:52:28 -03:00
ReinUsesLisp
ecbfa416f0 vk_shader_decompiler: Misc changes
Update Sirit and its usage in vk_shader_decompiler. Highlights:
- Implement tessellation shaders
- Implement geometry shaders
- Implement some missing features
- Use native half float instructions when available.
2019-12-09 23:51:57 -03:00
ReinUsesLisp
9ad6327fbd shader: Keep track of shaders using warp instructions 2019-12-09 23:40:41 -03:00
ReinUsesLisp
6233b1db08 shader_ir/memory: Implement patch stores 2019-12-09 23:25:21 -03:00
Fernando Sahmkow
f2458106e6 Merge pull request #3205 from ReinUsesLisp/vk-device
vk_device: Misc changes
2019-12-09 20:02:58 -04:00
Lioncash
67b8265bd6 kernel/svc: Provide implementations for svcDumpInfo/svcDumpInfoNew
These are fairly trivial to implement; we can just do nothing. This
also provides a spot for us to potentially dump out any relevant info
in the future (e.g. for debugging purposes with homebrew, etc.).

While we're at it, we can also correct the names of both of these
supervisor calls.
2019-12-07 22:01:17 -05:00
ReinUsesLisp
e6a0a30334 renderer_opengl: Make ScreenRectVertex's constructor constexpr 2019-11-28 20:36:02 -03:00
ReinUsesLisp
dee7844443 renderer_opengl: Remove C casts 2019-11-28 20:28:27 -03:00
ReinUsesLisp
3a44faff11 renderer_opengl: Use explicit binding for presentation shaders 2019-11-28 20:25:56 -03:00
ReinUsesLisp
75cc501d52 renderer_opengl: Drop macros for message decorations 2019-11-28 20:15:25 -03:00
ReinUsesLisp
056f049b26 renderer_opengl: Move static definitions to anonymous namespace 2019-11-28 20:14:40 -03:00
ReinUsesLisp
4589582eaf renderer_opengl: Move commentaries to header file 2019-11-28 20:11:03 -03:00
44 changed files with 3043 additions and 1387 deletions

View File

@@ -1,39 +0,0 @@
# Set-up Visual Studio Command Prompt environment for PowerShell
pushd "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\Common7\Tools\"
cmd /c "VsDevCmd.bat -arch=x64 & set" | foreach {
if ($_ -match "=") {
$v = $_.split("="); Set-Item -Force -Path "ENV:\$($v[0])" -Value "$($v[1])"
}
}
popd
function Which ($search_path, $name) {
($search_path).Split(";") | Get-ChildItem -Filter $name | Select -First 1 -Exp FullName
}
function GetDeps ($search_path, $binary) {
((dumpbin /dependents $binary).Where({ $_ -match "dependencies:"}, "SkipUntil") | Select-String "[^ ]*\.dll").Matches | foreach {
Which $search_path $_.Value
}
}
function RecursivelyGetDeps ($search_path, $binary) {
$final_deps = @()
$deps_to_process = GetDeps $search_path $binary
while ($deps_to_process.Count -gt 0) {
$current, $deps_to_process = $deps_to_process
if ($final_deps -contains $current) { continue }
# Is this a system dll file?
# We use the same algorithm that cmake uses to determine this.
if ($current -match "$([regex]::Escape($env:SystemRoot))\\sys") { continue }
if ($current -match "$([regex]::Escape($env:WinDir))\\sys") { continue }
if ($current -match "\\msvc[^\\]+dll") { continue }
if ($current -match "\\api-ms-win-[^\\]+dll") { continue }
$final_deps += $current
$new_deps = GetDeps $search_path $current
$deps_to_process += ($new_deps | ?{-not ($final_deps -contains $_)})
}
return $final_deps
}

View File

@@ -1,178 +0,0 @@
# shallow clone
clone_depth: 10
cache:
- C:\ProgramData\chocolatey\bin -> appveyor.yml
- C:\ProgramData\chocolatey\lib -> appveyor.yml
os: Visual Studio 2017
environment:
# Tell msys2 to add mingw64 to the path
MSYSTEM: MINGW64
# Tell msys2 to inherit the current directory when starting the shell
CHERE_INVOKING: 1
matrix:
- BUILD_TYPE: msvc
- BUILD_TYPE: mingw
platform:
- x64
configuration:
- Release
install:
- git submodule update --init --recursive
- ps: |
if ($env:BUILD_TYPE -eq 'mingw') {
$dependencies = "mingw64/mingw-w64-x86_64-cmake",
"mingw64/mingw-w64-x86_64-qt5",
"mingw64/mingw-w64-x86_64-SDL2"
# redirect err to null to prevent warnings from becoming errors
# workaround to prevent pacman from failing due to cyclical dependencies
C:\msys64\usr\bin\bash -lc "pacman --noconfirm -S mingw64/mingw-w64-x86_64-freetype mingw64/mingw-w64-x86_64-fontconfig" 2> $null
C:\msys64\usr\bin\bash -lc "pacman --noconfirm -S $dependencies" 2> $null
}
before_build:
- mkdir %BUILD_TYPE%_build
- cd %BUILD_TYPE%_build
- ps: |
$COMPAT = if ($env:ENABLE_COMPATIBILITY_REPORTING -eq $null) {0} else {$env:ENABLE_COMPATIBILITY_REPORTING}
if ($env:BUILD_TYPE -eq 'msvc') {
# redirect stderr and change the exit code to prevent powershell from cancelling the build if cmake prints a warning
cmd /C 'cmake -G "Visual Studio 15 2017 Win64" -DYUZU_USE_BUNDLED_QT=1 -DYUZU_USE_BUNDLED_SDL2=1 -DYUZU_USE_BUNDLED_UNICORN=1 -DYUZU_USE_QT_WEB_ENGINE=ON -DENABLE_COMPATIBILITY_LIST_DOWNLOAD=ON -DYUZU_ENABLE_COMPATIBILITY_REPORTING=${COMPAT} -DUSE_DISCORD_PRESENCE=ON .. 2>&1 && exit 0'
} else {
C:\msys64\usr\bin\bash.exe -lc "cmake -G 'MSYS Makefiles' -DYUZU_BUILD_UNICORN=1 -DCMAKE_BUILD_TYPE=Release -DENABLE_COMPATIBILITY_LIST_DOWNLOAD=ON -DYUZU_ENABLE_COMPATIBILITY_REPORTING=${COMPAT} -DUSE_DISCORD_PRESENCE=ON .. 2>&1"
}
- cd ..
build_script:
- ps: |
if ($env:BUILD_TYPE -eq 'msvc') {
# https://www.appveyor.com/docs/build-phase
msbuild msvc_build/yuzu.sln /maxcpucount /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll"
} else {
C:\msys64\usr\bin\bash.exe -lc 'mingw32-make -C mingw_build/ 2>&1'
}
after_build:
- ps: |
$GITDATE = $(git show -s --date=short --format='%ad') -replace "-",""
$GITREV = $(git show -s --format='%h')
# Find out which kind of release we are producing by tag name
if ($env:APPVEYOR_REPO_TAG_NAME) {
$RELEASE_DIST, $RELEASE_VERSION = $env:APPVEYOR_REPO_TAG_NAME.split('-')
} else {
# There is no repo tag - make assumptions
$RELEASE_DIST = "head"
}
if ($env:BUILD_TYPE -eq 'msvc') {
# Where are these spaces coming from? Regardless, let's remove them
$MSVC_BUILD_ZIP = "yuzu-windows-msvc-$GITDATE-$GITREV.zip" -replace " ", ""
$MSVC_BUILD_PDB = "yuzu-windows-msvc-$GITDATE-$GITREV-debugsymbols.zip" -replace " ", ""
$MSVC_SEVENZIP = "yuzu-windows-msvc-$GITDATE-$GITREV.7z" -replace " ", ""
# set the build names as env vars so the artifacts can upload them
$env:BUILD_ZIP = $MSVC_BUILD_ZIP
$env:BUILD_SYMBOLS = $MSVC_BUILD_PDB
$env:BUILD_UPDATE = $MSVC_SEVENZIP
$BUILD_DIR = ".\msvc_build\bin\Release"
# Make a debug symbol upload
mkdir pdb
Get-ChildItem "$BUILD_DIR\" -Recurse -Filter "*.pdb" | Copy-Item -destination .\pdb
7z a -tzip $MSVC_BUILD_PDB .\pdb\*.pdb
rm "$BUILD_DIR\*.pdb"
mkdir $RELEASE_DIST
# get rid of extra exes by copying everything over, then deleting all the exes, then copying just the exes we want
Copy-Item "$BUILD_DIR\*" -Destination $RELEASE_DIST -Recurse
rm "$RELEASE_DIST\*.exe"
Get-ChildItem "$BUILD_DIR" -Recurse -Filter "yuzu*.exe" | Copy-Item -destination $RELEASE_DIST
Get-ChildItem "$BUILD_DIR" -Recurse -Filter "QtWebEngineProcess*.exe" | Copy-Item -destination $RELEASE_DIST
Copy-Item .\license.txt -Destination $RELEASE_DIST
Copy-Item .\README.md -Destination $RELEASE_DIST
7z a -tzip $MSVC_BUILD_ZIP $RELEASE_DIST\*
7z a $MSVC_SEVENZIP $RELEASE_DIST
} else {
$MINGW_BUILD_ZIP = "yuzu-windows-mingw-$GITDATE-$GITREV.zip" -replace " ", ""
$MINGW_SEVENZIP = "yuzu-windows-mingw-$GITDATE-$GITREV.7z" -replace " ", ""
# not going to bother adding separate debug symbols for mingw, so just upload a README for it
# if someone wants to add them, change mingw to compile with -g and use objdump and strip to separate the symbols from the binary
$MINGW_NO_DEBUG_SYMBOLS = "README_No_Debug_Symbols.txt"
Set-Content -Path $MINGW_NO_DEBUG_SYMBOLS -Value "This is a workaround for Appveyor since msvc has debug symbols but mingw doesnt" -Force
# store the build information in env vars so we can use them as artifacts
$env:BUILD_ZIP = $MINGW_BUILD_ZIP
$env:BUILD_SYMBOLS = $MINGW_NO_DEBUG_SYMBOLS
$env:BUILD_UPDATE = $MINGW_SEVENZIP
$CMAKE_SOURCE_DIR = "$env:APPVEYOR_BUILD_FOLDER"
$CMAKE_BINARY_DIR = "$CMAKE_SOURCE_DIR/mingw_build/bin"
$RELEASE_DIST = $RELEASE_DIST + "-mingw"
mkdir $RELEASE_DIST
mkdir $RELEASE_DIST/platforms
mkdir $RELEASE_DIST/styles
mkdir $RELEASE_DIST/imageformats
# copy the compiled binaries and other release files to the release folder
Get-ChildItem "$CMAKE_BINARY_DIR" -Filter "yuzu*.exe" | Copy-Item -destination $RELEASE_DIST
Copy-Item -path "$CMAKE_SOURCE_DIR/license.txt" -destination $RELEASE_DIST
Copy-Item -path "$CMAKE_SOURCE_DIR/README.md" -destination $RELEASE_DIST
# copy the qt windows plugin dll to platforms
Copy-Item -path "C:/msys64/mingw64/share/qt5/plugins/platforms/qwindows.dll" -force -destination "$RELEASE_DIST/platforms"
# copy the qt windows vista style dll to platforms
Copy-Item -path "C:/msys64/mingw64/share/qt5/plugins/styles/qwindowsvistastyle.dll" -force -destination "$RELEASE_DIST/styles"
# copy the qt jpeg imageformat dll to platforms
Copy-Item -path "C:/msys64/mingw64/share/qt5/plugins/imageformats/qjpeg.dll" -force -destination "$RELEASE_DIST/imageformats"
# copy all the dll dependencies to the release folder
. "./.appveyor/UtilityFunctions.ps1"
$DLLSearchPath = "C:\msys64\mingw64\bin;$env:PATH"
$MingwDLLs = RecursivelyGetDeps $DLLSearchPath "$RELEASE_DIST\yuzu.exe"
$MingwDLLs += RecursivelyGetDeps $DLLSearchPath "$RELEASE_DIST\yuzu_cmd.exe"
$MingwDLLs += RecursivelyGetDeps $DLLSearchPath "$RELEASE_DIST\imageformats\qjpeg.dll"
Write-Host "Detected the following dependencies:"
Write-Host $MingwDLLs
foreach ($file in $MingwDLLs) {
Copy-Item -path "$file" -force -destination "$RELEASE_DIST"
}
7z a -tzip $MINGW_BUILD_ZIP $RELEASE_DIST\*
7z a $MINGW_SEVENZIP $RELEASE_DIST
}
test_script:
- cd %BUILD_TYPE%_build
- ps: |
if ($env:BUILD_TYPE -eq 'msvc') {
ctest -VV -C Release
} else {
C:\msys64\usr\bin\bash.exe -lc "ctest -VV -C Release"
}
- cd ..
artifacts:
- path: $(BUILD_ZIP)
name: build
type: zip
deploy:
provider: GitHub
release: $(appveyor_repo_tag_name)
auth_token:
secure: QqePPnXbkzmXct5c8hZ2X5AbsthbI6cS1Sr+VBzcD8oUOIjfWJJKXVAQGUbQAbb0
artifact: update,build
draft: false
prerelease: false
on:
appveyor_repo_tag: true

View File

@@ -46,9 +46,16 @@ public:
ElementPtr* new_ptr = new ElementPtr();
write_ptr->next.store(new_ptr, std::memory_order_release);
write_ptr = new_ptr;
cv.notify_one();
++size;
const size_t previous_size{size++};
// Acquire the mutex and then immediately release it as a fence.
// TODO(bunnei): This can be replaced with C++20 waitable atomics when properly supported.
// See discussion on https://github.com/yuzu-emu/yuzu/pull/3173 for details.
if (previous_size == 0) {
std::lock_guard lock{cv_mutex};
}
cv.notify_one();
}
void Pop() {

View File

@@ -17,10 +17,10 @@
#include "core/memory.h"
namespace Kernel {
namespace {
// Wake up num_to_wake (or all) threads in a vector.
void WakeThreads(const std::vector<std::shared_ptr<Thread>>& waiting_threads, s32 num_to_wake) {
auto& system = Core::System::GetInstance();
void AddressArbiter::WakeThreads(const std::vector<std::shared_ptr<Thread>>& waiting_threads,
s32 num_to_wake) {
// Only process up to 'target' threads, unless 'target' is <= 0, in which case process
// them all.
std::size_t last = waiting_threads.size();
@@ -32,12 +32,12 @@ void WakeThreads(const std::vector<std::shared_ptr<Thread>>& waiting_threads, s3
for (std::size_t i = 0; i < last; i++) {
ASSERT(waiting_threads[i]->GetStatus() == ThreadStatus::WaitArb);
waiting_threads[i]->SetWaitSynchronizationResult(RESULT_SUCCESS);
RemoveThread(waiting_threads[i]);
waiting_threads[i]->SetArbiterWaitAddress(0);
waiting_threads[i]->ResumeFromWait();
system.PrepareReschedule(waiting_threads[i]->GetProcessorID());
}
}
} // Anonymous namespace
AddressArbiter::AddressArbiter(Core::System& system) : system{system} {}
AddressArbiter::~AddressArbiter() = default;
@@ -184,6 +184,7 @@ ResultCode AddressArbiter::WaitForAddressIfEqual(VAddr address, s32 value, s64 t
ResultCode AddressArbiter::WaitForAddressImpl(VAddr address, s64 timeout) {
Thread* current_thread = system.CurrentScheduler().GetCurrentThread();
current_thread->SetArbiterWaitAddress(address);
InsertThread(SharedFrom(current_thread));
current_thread->SetStatus(ThreadStatus::WaitArb);
current_thread->InvalidateWakeupCallback();
current_thread->WakeAfterDelay(timeout);
@@ -192,26 +193,51 @@ ResultCode AddressArbiter::WaitForAddressImpl(VAddr address, s64 timeout) {
return RESULT_TIMEOUT;
}
std::vector<std::shared_ptr<Thread>> AddressArbiter::GetThreadsWaitingOnAddress(
VAddr address) const {
void AddressArbiter::HandleWakeupThread(std::shared_ptr<Thread> thread) {
ASSERT(thread->GetStatus() == ThreadStatus::WaitArb);
RemoveThread(thread);
thread->SetArbiterWaitAddress(0);
}
// Retrieve all threads that are waiting for this address.
std::vector<std::shared_ptr<Thread>> threads;
const auto& scheduler = system.GlobalScheduler();
const auto& thread_list = scheduler.GetThreadList();
for (const auto& thread : thread_list) {
if (thread->GetArbiterWaitAddress() == address) {
threads.push_back(thread);
void AddressArbiter::InsertThread(std::shared_ptr<Thread> thread) {
const VAddr arb_addr = thread->GetArbiterWaitAddress();
std::list<std::shared_ptr<Thread>>& thread_list = arb_threads[arb_addr];
auto it = thread_list.begin();
while (it != thread_list.end()) {
const std::shared_ptr<Thread>& current_thread = *it;
if (current_thread->GetPriority() >= thread->GetPriority()) {
thread_list.insert(it, thread);
return;
}
++it;
}
thread_list.push_back(std::move(thread));
}
// Sort them by priority, such that the highest priority ones come first.
std::sort(threads.begin(), threads.end(),
[](const std::shared_ptr<Thread>& lhs, const std::shared_ptr<Thread>& rhs) {
return lhs->GetPriority() < rhs->GetPriority();
});
void AddressArbiter::RemoveThread(std::shared_ptr<Thread> thread) {
const VAddr arb_addr = thread->GetArbiterWaitAddress();
std::list<std::shared_ptr<Thread>>& thread_list = arb_threads[arb_addr];
auto it = thread_list.begin();
while (it != thread_list.end()) {
const std::shared_ptr<Thread>& current_thread = *it;
if (current_thread.get() == thread.get()) {
thread_list.erase(it);
return;
}
++it;
}
UNREACHABLE();
}
return threads;
std::vector<std::shared_ptr<Thread>> AddressArbiter::GetThreadsWaitingOnAddress(VAddr address) {
std::vector<std::shared_ptr<Thread>> result;
std::list<std::shared_ptr<Thread>>& thread_list = arb_threads[address];
auto it = thread_list.begin();
while (it != thread_list.end()) {
std::shared_ptr<Thread> current_thread = *it;
result.push_back(std::move(current_thread));
++it;
}
return result;
}
} // namespace Kernel

View File

@@ -4,7 +4,9 @@
#pragma once
#include <list>
#include <memory>
#include <unordered_map>
#include <vector>
#include "common/common_types.h"
@@ -48,6 +50,9 @@ public:
/// Waits on an address with a particular arbitration type.
ResultCode WaitForAddress(VAddr address, ArbitrationType type, s32 value, s64 timeout_ns);
/// Removes a thread from the container and resets its address arbiter address to 0
void HandleWakeupThread(std::shared_ptr<Thread> thread);
private:
/// Signals an address being waited on.
ResultCode SignalToAddressOnly(VAddr address, s32 num_to_wake);
@@ -71,8 +76,20 @@ private:
// Waits on the given address with a timeout in nanoseconds
ResultCode WaitForAddressImpl(VAddr address, s64 timeout);
/// Wake up num_to_wake (or all) threads in a vector.
void WakeThreads(const std::vector<std::shared_ptr<Thread>>& waiting_threads, s32 num_to_wake);
/// Insert a thread into the address arbiter container
void InsertThread(std::shared_ptr<Thread> thread);
/// Removes a thread from the address arbiter container
void RemoveThread(std::shared_ptr<Thread> thread);
// Gets the threads waiting on an address.
std::vector<std::shared_ptr<Thread>> GetThreadsWaitingOnAddress(VAddr address) const;
std::vector<std::shared_ptr<Thread>> GetThreadsWaitingOnAddress(VAddr address);
/// List of threads waiting for an address arbiter
std::unordered_map<VAddr, std::list<std::shared_ptr<Thread>>> arb_threads;
Core::System& system;
};

View File

@@ -78,9 +78,9 @@ static void ThreadWakeupCallback(u64 thread_handle, [[maybe_unused]] s64 cycles_
}
}
if (thread->GetArbiterWaitAddress() != 0) {
ASSERT(thread->GetStatus() == ThreadStatus::WaitArb);
thread->SetArbiterWaitAddress(0);
if (thread->GetStatus() == ThreadStatus::WaitArb) {
auto& address_arbiter = thread->GetOwnerProcess()->GetAddressArbiter();
address_arbiter.HandleWakeupThread(thread);
}
if (resume) {

View File

@@ -1650,8 +1650,7 @@ static ResultCode WaitProcessWideKeyAtomic(Core::System& system, VAddr mutex_add
}
/// Signal process wide key
static ResultCode SignalProcessWideKey(Core::System& system, VAddr condition_variable_addr,
s32 target) {
static void SignalProcessWideKey(Core::System& system, VAddr condition_variable_addr, s32 target) {
LOG_TRACE(Kernel_SVC, "called, condition_variable_addr=0x{:X}, target=0x{:08X}",
condition_variable_addr, target);
@@ -1726,8 +1725,6 @@ static ResultCode SignalProcessWideKey(Core::System& system, VAddr condition_var
system.PrepareReschedule(thread->GetProcessorID());
}
}
return RESULT_SUCCESS;
}
// Wait for an address (via Address Arbiter)
@@ -1781,6 +1778,17 @@ static ResultCode SignalToAddress(Core::System& system, VAddr address, u32 type,
return address_arbiter.SignalToAddress(address, signal_type, value, num_to_wake);
}
static void KernelDebug([[maybe_unused]] Core::System& system,
[[maybe_unused]] u32 kernel_debug_type, [[maybe_unused]] u64 param1,
[[maybe_unused]] u64 param2, [[maybe_unused]] u64 param3) {
// Intentionally do nothing, as this does nothing in released kernel binaries.
}
static void ChangeKernelTraceState([[maybe_unused]] Core::System& system,
[[maybe_unused]] u32 trace_state) {
// Intentionally do nothing, as this does nothing in released kernel binaries.
}
/// This returns the total CPU ticks elapsed since the CPU was powered-on
static u64 GetSystemTick(Core::System& system) {
LOG_TRACE(Kernel_SVC, "called");
@@ -2418,8 +2426,8 @@ static const FunctionDef SVC_Table[] = {
{0x39, nullptr, "Unknown"},
{0x3A, nullptr, "Unknown"},
{0x3B, nullptr, "Unknown"},
{0x3C, nullptr, "DumpInfo"},
{0x3D, nullptr, "DumpInfoNew"},
{0x3C, SvcWrap<KernelDebug>, "KernelDebug"},
{0x3D, SvcWrap<ChangeKernelTraceState>, "ChangeKernelTraceState"},
{0x3E, nullptr, "Unknown"},
{0x3F, nullptr, "Unknown"},
{0x40, nullptr, "CreateSession"},

View File

@@ -112,11 +112,6 @@ void SvcWrap(Core::System& system) {
FuncReturn(system, retval);
}
template <ResultCode func(Core::System&, u64, s32)>
void SvcWrap(Core::System& system) {
FuncReturn(system, func(system, Param(system, 0), static_cast<s32>(Param(system, 1))).raw);
}
template <ResultCode func(Core::System&, u64, u32)>
void SvcWrap(Core::System& system) {
FuncReturn(system, func(system, Param(system, 0), static_cast<u32>(Param(system, 1))).raw);
@@ -311,11 +306,27 @@ void SvcWrap(Core::System& system) {
func(system);
}
template <void func(Core::System&, u32)>
void SvcWrap(Core::System& system) {
func(system, static_cast<u32>(Param(system, 0)));
}
template <void func(Core::System&, u32, u64, u64, u64)>
void SvcWrap(Core::System& system) {
func(system, static_cast<u32>(Param(system, 0)), Param(system, 1), Param(system, 2),
Param(system, 3));
}
template <void func(Core::System&, s64)>
void SvcWrap(Core::System& system) {
func(system, static_cast<s64>(Param(system, 0)));
}
template <void func(Core::System&, u64, s32)>
void SvcWrap(Core::System& system) {
func(system, Param(system, 0), static_cast<s32>(Param(system, 1)));
}
template <void func(Core::System&, u64, u64)>
void SvcWrap(Core::System& system) {
func(system, Param(system, 0), Param(system, 1));

View File

@@ -310,6 +310,11 @@ public:
}
};
enum class DepthMode : u32 {
MinusOneToOne = 0,
ZeroToOne = 1,
};
enum class PrimitiveTopology : u32 {
Points = 0x0,
Lines = 0x1,
@@ -491,11 +496,6 @@ public:
INSERT_UNION_PADDING_WORDS(1);
};
enum class DepthMode : u32 {
MinusOneToOne = 0,
ZeroToOne = 1,
};
enum class TessellationPrimitive : u32 {
Isolines = 0,
Triangles = 1,
@@ -676,7 +676,7 @@ public:
u32 count;
} vertex_buffer;
INSERT_UNION_PADDING_WORDS(1);
DepthMode depth_mode;
float clear_color[4];
float clear_depth;
@@ -1425,6 +1425,7 @@ ASSERT_REG_POSITION(rt, 0x200);
ASSERT_REG_POSITION(viewport_transform, 0x280);
ASSERT_REG_POSITION(viewports, 0x300);
ASSERT_REG_POSITION(vertex_buffer, 0x35D);
ASSERT_REG_POSITION(depth_mode, 0x35F);
ASSERT_REG_POSITION(clear_color[0], 0x360);
ASSERT_REG_POSITION(clear_depth, 0x364);
ASSERT_REG_POSITION(clear_stencil, 0x368);

View File

@@ -98,10 +98,11 @@ union Attribute {
BitField<20, 10, u64> immediate;
BitField<22, 2, u64> element;
BitField<24, 6, Index> index;
BitField<31, 1, u64> patch;
BitField<47, 3, AttributeSize> size;
bool IsPhysical() const {
return element == 0 && static_cast<u64>(index.Value()) == 0;
return patch == 0 && element == 0 && static_cast<u64>(index.Value()) == 0;
}
} fmt20;
@@ -383,6 +384,15 @@ enum class IsberdMode : u64 {
enum class IsberdShift : u64 { None = 0, U16 = 1, B32 = 2 };
enum class MembarType : u64 {
CTA = 0,
GL = 1,
SYS = 2,
VC = 3,
};
enum class MembarUnknown : u64 { Default = 0, IVALLD = 1, IVALLT = 2, IVALLTD = 3 };
enum class HalfType : u64 {
H0_H1 = 0,
F32 = 1,
@@ -1282,6 +1292,7 @@ union Instruction {
BitField<50, 1, u64> dc_flag;
BitField<51, 1, u64> aoffi_flag;
BitField<52, 2, u64> component;
BitField<55, 1, u64> fp16_flag;
bool UsesMiscMode(TextureMiscMode mode) const {
switch (mode) {
@@ -1544,6 +1555,11 @@ union Instruction {
BitField<47, 2, IsberdShift> shift;
} isberd;
union {
BitField<8, 2, MembarType> type;
BitField<0, 2, MembarUnknown> unknown;
} membar;
union {
BitField<48, 1, u64> signed_a;
BitField<38, 1, u64> is_byte_chunk_a;
@@ -1668,6 +1684,7 @@ public:
IPA,
OUT_R, // Emit vertex/primitive
ISBERD,
MEMBAR,
VMAD,
VSETP,
FFMA_IMM, // Fused Multiply and Add
@@ -1929,7 +1946,7 @@ private:
INST("111000100100----", Id::BRA, Type::Flow, "BRA"),
INST("111000100101----", Id::BRX, Type::Flow, "BRX"),
INST("1111000011111---", Id::SYNC, Type::Flow, "SYNC"),
INST("111000110100---", Id::BRK, Type::Flow, "BRK"),
INST("111000110100----", Id::BRK, Type::Flow, "BRK"),
INST("111000110000----", Id::EXIT, Type::Flow, "EXIT"),
INST("1111000011110---", Id::DEPBAR, Type::Synch, "DEPBAR"),
INST("0101000011011---", Id::VOTE, Type::Warp, "VOTE"),
@@ -1956,7 +1973,7 @@ private:
INST("1101-01---------", Id::TLDS, Type::Texture, "TLDS"),
INST("110010----111---", Id::TLD4, Type::Texture, "TLD4"),
INST("1101111011111---", Id::TLD4_B, Type::Texture, "TLD4_B"),
INST("1101111100------", Id::TLD4S, Type::Texture, "TLD4S"),
INST("11011111-0------", Id::TLD4S, Type::Texture, "TLD4S"),
INST("110111110110----", Id::TMML_B, Type::Texture, "TMML_B"),
INST("1101111101011---", Id::TMML, Type::Texture, "TMML"),
INST("11011110011110--", Id::TXD_B, Type::Texture, "TXD_B"),
@@ -1968,6 +1985,7 @@ private:
INST("11100000--------", Id::IPA, Type::Trivial, "IPA"),
INST("1111101111100---", Id::OUT_R, Type::Trivial, "OUT_R"),
INST("1110111111010---", Id::ISBERD, Type::Trivial, "ISBERD"),
INST("1110111110011---", Id::MEMBAR, Type::Trivial, "MEMBAR"),
INST("01011111--------", Id::VMAD, Type::Video, "VMAD"),
INST("0101000011110---", Id::VSETP, Type::Video, "VSETP"),
INST("0011001-1-------", Id::FFMA_IMM, Type::Ffma, "FFMA_IMM"),

View File

@@ -5,6 +5,7 @@
#include <mutex>
#include <boost/icl/interval_map.hpp>
#include <boost/range/iterator_range.hpp>
#include "common/assert.h"
#include "common/common_types.h"

View File

@@ -5,6 +5,7 @@
#include <algorithm>
#include <array>
#include <cstddef>
#include <cstring>
#include <optional>
#include <vector>
@@ -134,11 +135,13 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin
Device::Device() : base_bindings{BuildBaseBindings()} {
const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
const auto renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER));
const std::vector extensions = GetExtensions();
const bool is_nvidia = vendor == "NVIDIA Corporation";
const bool is_amd = vendor == "ATI Technologies Inc.";
const bool is_intel = vendor == "Intel";
const bool is_intel_proprietary = is_intel && std::strstr(renderer, "Mesa") == nullptr;
uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);
shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
@@ -152,7 +155,7 @@ Device::Device() : base_bindings{BuildBaseBindings()} {
has_variable_aoffi = TestVariableAoffi();
has_component_indexing_bug = is_amd;
has_precise_bug = TestPreciseBug();
has_broken_compute = is_intel;
has_broken_compute = is_intel_proprietary;
has_fast_buffer_sub_data = is_nvidia;
LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi);

View File

@@ -277,6 +277,14 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
continue;
}
// Currently these stages are not supported in the OpenGL backend.
// TODO(Blinkhawk): Port tessellation shaders from Vulkan to OpenGL
if (program == Maxwell::ShaderProgram::TesselationControl) {
continue;
} else if (program == Maxwell::ShaderProgram::TesselationEval) {
continue;
}
Shader shader{shader_cache.GetStageProgram(program)};
// Stage indices are 0 - 5
@@ -1028,6 +1036,10 @@ void RasterizerOpenGL::SyncViewport(OpenGLState& current_state) {
flip_y = !flip_y;
}
state.clip_control.origin = flip_y ? GL_UPPER_LEFT : GL_LOWER_LEFT;
state.clip_control.depth_mode =
regs.depth_mode == Tegra::Engines::Maxwell3D::Regs::DepthMode::ZeroToOne
? GL_ZERO_TO_ONE
: GL_NEGATIVE_ONE_TO_ONE;
}
void RasterizerOpenGL::SyncClipEnabled(

View File

@@ -281,11 +281,11 @@ CachedProgram BuildShader(const Device& device, u64 unique_identifier, ShaderTyp
if (variant.shared_memory_size > 0) {
// TODO(Rodrigo): We should divide by four here, but having a larger shared memory pool
// avoids out of bound stores. Find out why shared memory size is being invalid.
source += fmt::format("shared uint smem[{}];", variant.shared_memory_size);
source += fmt::format("shared uint smem[{}];\n", variant.shared_memory_size);
}
if (variant.local_memory_size > 0) {
source += fmt::format("#define LOCAL_MEMORY_SIZE {}",
source += fmt::format("#define LOCAL_MEMORY_SIZE {}\n",
Common::AlignUp(variant.local_memory_size, 4) / 4);
}
}

View File

@@ -399,6 +399,7 @@ public:
DeclareConstantBuffers();
DeclareGlobalMemory();
DeclareSamplers();
DeclareImages();
DeclarePhysicalAttributeReader();
code.AddLine("void execute_{}() {{", suffix);
@@ -1076,7 +1077,7 @@ private:
}
std::string GenerateTexture(Operation operation, const std::string& function_suffix,
const std::vector<TextureIR>& extras) {
const std::vector<TextureIR>& extras, bool sepparate_dc = false) {
constexpr std::array coord_constructors = {"float", "vec2", "vec3", "vec4"};
const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
@@ -1091,7 +1092,8 @@ private:
expr += "Offset";
}
expr += '(' + GetSampler(meta->sampler) + ", ";
expr += coord_constructors.at(count + (has_array ? 1 : 0) + (has_shadow ? 1 : 0) - 1);
expr += coord_constructors.at(count + (has_array ? 1 : 0) +
(has_shadow && !sepparate_dc ? 1 : 0) - 1);
expr += '(';
for (std::size_t i = 0; i < count; ++i) {
expr += Visit(operation[i]).AsFloat();
@@ -1104,9 +1106,14 @@ private:
expr += ", float(" + Visit(meta->array).AsInt() + ')';
}
if (has_shadow) {
expr += ", " + Visit(meta->depth_compare).AsFloat();
if (sepparate_dc) {
expr += "), " + Visit(meta->depth_compare).AsFloat();
} else {
expr += ", " + Visit(meta->depth_compare).AsFloat() + ')';
}
} else {
expr += ')';
}
expr += ')';
for (const auto& variant : extras) {
if (const auto argument = std::get_if<TextureArgument>(&variant)) {
@@ -1706,10 +1713,17 @@ private:
ASSERT(meta);
const auto type = meta->sampler.IsShadow() ? Type::Float : Type::Int;
return {GenerateTexture(operation, "Gather",
{TextureAoffi{}, TextureArgument{type, meta->component}}) +
GetSwizzle(meta->element),
Type::Float};
if (meta->sampler.IsShadow()) {
return {GenerateTexture(operation, "Gather", {TextureAoffi{}}, true) +
GetSwizzle(meta->element),
Type::Float};
} else {
return {GenerateTexture(operation, "Gather",
{TextureAoffi{}, TextureArgument{type, meta->component}},
false) +
GetSwizzle(meta->element),
Type::Float};
}
}
Expression TextureQueryDimensions(Operation operation) {
@@ -1915,6 +1929,10 @@ private:
return {};
}
Expression InvocationId(Operation operation) {
return {"gl_InvocationID", Type::Int};
}
Expression YNegate(Operation operation) {
return {"y_direction", Type::Float};
}
@@ -1988,6 +2006,11 @@ private:
return {fmt::format("readInvocationARB({}, {})", value, index), Type::Float};
}
Expression MemoryBarrierGL(Operation) {
code.AddLine("memoryBarrier();");
return {};
}
struct Func final {
Func() = delete;
~Func() = delete;
@@ -2153,6 +2176,7 @@ private:
&GLSLDecompiler::EmitVertex,
&GLSLDecompiler::EndPrimitive,
&GLSLDecompiler::InvocationId,
&GLSLDecompiler::YNegate,
&GLSLDecompiler::LocalInvocationId<0>,
&GLSLDecompiler::LocalInvocationId<1>,
@@ -2168,6 +2192,8 @@ private:
&GLSLDecompiler::ThreadId,
&GLSLDecompiler::ShuffleIndexed,
&GLSLDecompiler::MemoryBarrierGL,
};
static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));

View File

@@ -411,8 +411,9 @@ void OpenGLState::ApplyAlphaTest() {
}
void OpenGLState::ApplyClipControl() {
if (UpdateValue(cur_state.clip_control.origin, clip_control.origin)) {
glClipControl(clip_control.origin, GL_NEGATIVE_ONE_TO_ONE);
if (UpdateTie(std::tie(cur_state.clip_control.origin, cur_state.clip_control.depth_mode),
std::tie(clip_control.origin, clip_control.depth_mode))) {
glClipControl(clip_control.origin, clip_control.depth_mode);
}
}

View File

@@ -150,6 +150,7 @@ public:
struct {
GLenum origin = GL_LOWER_LEFT;
GLenum depth_mode = GL_NEGATIVE_ONE_TO_ONE;
} clip_control;
OpenGLState();

View File

@@ -24,19 +24,21 @@
namespace OpenGL {
static const char vertex_shader[] = R"(
#version 150 core
namespace {
in vec2 vert_position;
in vec2 vert_tex_coord;
out vec2 frag_tex_coord;
constexpr char vertex_shader[] = R"(
#version 430 core
layout (location = 0) in vec2 vert_position;
layout (location = 1) in vec2 vert_tex_coord;
layout (location = 0) out vec2 frag_tex_coord;
// This is a truncated 3x3 matrix for 2D transformations:
// The upper-left 2x2 submatrix performs scaling/rotation/mirroring.
// The third column performs translation.
// The third row could be used for projection, which we don't need in 2D. It hence is assumed to
// implicitly be [0, 0, 1]
uniform mat3x2 modelview_matrix;
layout (location = 0) uniform mat3x2 modelview_matrix;
void main() {
// Multiply input position by the rotscale part of the matrix and then manually translate by
@@ -47,34 +49,29 @@ void main() {
}
)";
static const char fragment_shader[] = R"(
#version 150 core
constexpr char fragment_shader[] = R"(
#version 430 core
in vec2 frag_tex_coord;
out vec4 color;
layout (location = 0) in vec2 frag_tex_coord;
layout (location = 0) out vec4 color;
uniform sampler2D color_texture;
layout (binding = 0) uniform sampler2D color_texture;
void main() {
// Swap RGBA -> ABGR so we don't have to do this on the CPU. This needs to change if we have to
// support more framebuffer pixel formats.
color = texture(color_texture, frag_tex_coord);
}
)";
/**
* Vertex structure that the drawn screen rectangles are composed of.
*/
struct ScreenRectVertex {
ScreenRectVertex(GLfloat x, GLfloat y, GLfloat u, GLfloat v) {
position[0] = x;
position[1] = y;
tex_coord[0] = u;
tex_coord[1] = v;
}
constexpr GLint PositionLocation = 0;
constexpr GLint TexCoordLocation = 1;
constexpr GLint ModelViewMatrixLocation = 0;
GLfloat position[2];
GLfloat tex_coord[2];
struct ScreenRectVertex {
constexpr ScreenRectVertex(GLfloat x, GLfloat y, GLfloat u, GLfloat v)
: position{{x, y}}, tex_coord{{u, v}} {}
std::array<GLfloat, 2> position;
std::array<GLfloat, 2> tex_coord;
};
/**
@@ -84,18 +81,82 @@ struct ScreenRectVertex {
* The projection part of the matrix is trivial, hence these operations are represented
* by a 3x2 matrix.
*/
static std::array<GLfloat, 3 * 2> MakeOrthographicMatrix(const float width, const float height) {
std::array<GLfloat, 3 * 2> MakeOrthographicMatrix(float width, float height) {
std::array<GLfloat, 3 * 2> matrix; // Laid out in column-major order
// clang-format off
matrix[0] = 2.f / width; matrix[2] = 0.f; matrix[4] = -1.f;
matrix[1] = 0.f; matrix[3] = -2.f / height; matrix[5] = 1.f;
matrix[0] = 2.f / width; matrix[2] = 0.f; matrix[4] = -1.f;
matrix[1] = 0.f; matrix[3] = -2.f / height; matrix[5] = 1.f;
// Last matrix row is implicitly assumed to be [0, 0, 1].
// clang-format on
return matrix;
}
const char* GetSource(GLenum source) {
switch (source) {
case GL_DEBUG_SOURCE_API:
return "API";
case GL_DEBUG_SOURCE_WINDOW_SYSTEM:
return "WINDOW_SYSTEM";
case GL_DEBUG_SOURCE_SHADER_COMPILER:
return "SHADER_COMPILER";
case GL_DEBUG_SOURCE_THIRD_PARTY:
return "THIRD_PARTY";
case GL_DEBUG_SOURCE_APPLICATION:
return "APPLICATION";
case GL_DEBUG_SOURCE_OTHER:
return "OTHER";
default:
UNREACHABLE();
return "Unknown source";
}
}
const char* GetType(GLenum type) {
switch (type) {
case GL_DEBUG_TYPE_ERROR:
return "ERROR";
case GL_DEBUG_TYPE_DEPRECATED_BEHAVIOR:
return "DEPRECATED_BEHAVIOR";
case GL_DEBUG_TYPE_UNDEFINED_BEHAVIOR:
return "UNDEFINED_BEHAVIOR";
case GL_DEBUG_TYPE_PORTABILITY:
return "PORTABILITY";
case GL_DEBUG_TYPE_PERFORMANCE:
return "PERFORMANCE";
case GL_DEBUG_TYPE_OTHER:
return "OTHER";
case GL_DEBUG_TYPE_MARKER:
return "MARKER";
default:
UNREACHABLE();
return "Unknown type";
}
}
void APIENTRY DebugHandler(GLenum source, GLenum type, GLuint id, GLenum severity, GLsizei length,
const GLchar* message, const void* user_param) {
const char format[] = "{} {} {}: {}";
const char* const str_source = GetSource(source);
const char* const str_type = GetType(type);
switch (severity) {
case GL_DEBUG_SEVERITY_HIGH:
LOG_CRITICAL(Render_OpenGL, format, str_source, str_type, id, message);
break;
case GL_DEBUG_SEVERITY_MEDIUM:
LOG_WARNING(Render_OpenGL, format, str_source, str_type, id, message);
break;
case GL_DEBUG_SEVERITY_NOTIFICATION:
case GL_DEBUG_SEVERITY_LOW:
LOG_DEBUG(Render_OpenGL, format, str_source, str_type, id, message);
break;
}
}
} // Anonymous namespace
RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system)
: VideoCore::RendererBase{emu_window}, emu_window{emu_window}, system{system} {}
@@ -138,9 +199,6 @@ void RendererOpenGL::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
prev_state.Apply();
}
/**
* Loads framebuffer from emulated memory into the active OpenGL texture.
*/
void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuffer) {
// Framebuffer orientation handling
framebuffer_transform_flags = framebuffer.transform_flags;
@@ -181,19 +239,12 @@ void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuf
glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
}
/**
* Fills active OpenGL texture with the given RGB color. Since the color is solid, the texture can
* be 1x1 but will stretch across whatever it's rendered on.
*/
void RendererOpenGL::LoadColorToActiveGLTexture(u8 color_r, u8 color_g, u8 color_b, u8 color_a,
const TextureInfo& texture) {
const u8 framebuffer_data[4] = {color_a, color_b, color_g, color_r};
glClearTexImage(texture.resource.handle, 0, GL_RGBA, GL_UNSIGNED_BYTE, framebuffer_data);
}
/**
* Initializes the OpenGL state and creates persistent objects.
*/
void RendererOpenGL::InitOpenGLObjects() {
glClearColor(Settings::values.bg_red, Settings::values.bg_green, Settings::values.bg_blue,
0.0f);
@@ -203,10 +254,6 @@ void RendererOpenGL::InitOpenGLObjects() {
state.draw.shader_program = shader.handle;
state.AllDirty();
state.Apply();
uniform_modelview_matrix = glGetUniformLocation(shader.handle, "modelview_matrix");
uniform_color_texture = glGetUniformLocation(shader.handle, "color_texture");
attrib_position = glGetAttribLocation(shader.handle, "vert_position");
attrib_tex_coord = glGetAttribLocation(shader.handle, "vert_tex_coord");
// Generate VBO handle for drawing
vertex_buffer.Create();
@@ -217,14 +264,14 @@ void RendererOpenGL::InitOpenGLObjects() {
// Attach vertex data to VAO
glNamedBufferData(vertex_buffer.handle, sizeof(ScreenRectVertex) * 4, nullptr, GL_STREAM_DRAW);
glVertexArrayAttribFormat(vertex_array.handle, attrib_position, 2, GL_FLOAT, GL_FALSE,
glVertexArrayAttribFormat(vertex_array.handle, PositionLocation, 2, GL_FLOAT, GL_FALSE,
offsetof(ScreenRectVertex, position));
glVertexArrayAttribFormat(vertex_array.handle, attrib_tex_coord, 2, GL_FLOAT, GL_FALSE,
glVertexArrayAttribFormat(vertex_array.handle, TexCoordLocation, 2, GL_FLOAT, GL_FALSE,
offsetof(ScreenRectVertex, tex_coord));
glVertexArrayAttribBinding(vertex_array.handle, attrib_position, 0);
glVertexArrayAttribBinding(vertex_array.handle, attrib_tex_coord, 0);
glEnableVertexArrayAttrib(vertex_array.handle, attrib_position);
glEnableVertexArrayAttrib(vertex_array.handle, attrib_tex_coord);
glVertexArrayAttribBinding(vertex_array.handle, PositionLocation, 0);
glVertexArrayAttribBinding(vertex_array.handle, TexCoordLocation, 0);
glEnableVertexArrayAttrib(vertex_array.handle, PositionLocation);
glEnableVertexArrayAttrib(vertex_array.handle, TexCoordLocation);
glVertexArrayVertexBuffer(vertex_array.handle, 0, vertex_buffer.handle, 0,
sizeof(ScreenRectVertex));
@@ -331,18 +378,18 @@ void RendererOpenGL::DrawScreenTriangles(const ScreenInfo& screen_info, float x,
static_cast<f32>(screen_info.texture.height);
}
std::array<ScreenRectVertex, 4> vertices = {{
const std::array vertices = {
ScreenRectVertex(x, y, texcoords.top * scale_u, left * scale_v),
ScreenRectVertex(x + w, y, texcoords.bottom * scale_u, left * scale_v),
ScreenRectVertex(x, y + h, texcoords.top * scale_u, right * scale_v),
ScreenRectVertex(x + w, y + h, texcoords.bottom * scale_u, right * scale_v),
}};
};
state.textures[0] = screen_info.display_texture;
state.framebuffer_srgb.enabled = screen_info.display_srgb;
state.AllDirty();
state.Apply();
glNamedBufferSubData(vertex_buffer.handle, 0, sizeof(vertices), vertices.data());
glNamedBufferSubData(vertex_buffer.handle, 0, sizeof(vertices), std::data(vertices));
glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
// Restore default state
state.framebuffer_srgb.enabled = false;
@@ -351,9 +398,6 @@ void RendererOpenGL::DrawScreenTriangles(const ScreenInfo& screen_info, float x,
state.Apply();
}
/**
* Draws the emulated screens to the emulator window.
*/
void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
if (renderer_settings.set_background_color) {
// Update background color before drawing
@@ -367,21 +411,17 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
glClear(GL_COLOR_BUFFER_BIT);
// Set projection matrix
std::array<GLfloat, 3 * 2> ortho_matrix =
MakeOrthographicMatrix((float)layout.width, (float)layout.height);
glUniformMatrix3x2fv(uniform_modelview_matrix, 1, GL_FALSE, ortho_matrix.data());
const std::array ortho_matrix =
MakeOrthographicMatrix(static_cast<float>(layout.width), static_cast<float>(layout.height));
glUniformMatrix3x2fv(ModelViewMatrixLocation, 1, GL_FALSE, ortho_matrix.data());
// Bind texture in Texture Unit 0
glActiveTexture(GL_TEXTURE0);
glUniform1i(uniform_color_texture, 0);
DrawScreenTriangles(screen_info, (float)screen.left, (float)screen.top,
(float)screen.GetWidth(), (float)screen.GetHeight());
DrawScreenTriangles(screen_info, static_cast<float>(screen.left),
static_cast<float>(screen.top), static_cast<float>(screen.GetWidth()),
static_cast<float>(screen.GetHeight()));
m_current_frame++;
}
/// Updates the framerate
void RendererOpenGL::UpdateFramerate() {}
void RendererOpenGL::CaptureScreenshot() {
@@ -418,63 +458,6 @@ void RendererOpenGL::CaptureScreenshot() {
renderer_settings.screenshot_requested = false;
}
static const char* GetSource(GLenum source) {
#define RET(s) \
case GL_DEBUG_SOURCE_##s: \
return #s
switch (source) {
RET(API);
RET(WINDOW_SYSTEM);
RET(SHADER_COMPILER);
RET(THIRD_PARTY);
RET(APPLICATION);
RET(OTHER);
default:
UNREACHABLE();
return "Unknown source";
}
#undef RET
}
static const char* GetType(GLenum type) {
#define RET(t) \
case GL_DEBUG_TYPE_##t: \
return #t
switch (type) {
RET(ERROR);
RET(DEPRECATED_BEHAVIOR);
RET(UNDEFINED_BEHAVIOR);
RET(PORTABILITY);
RET(PERFORMANCE);
RET(OTHER);
RET(MARKER);
default:
UNREACHABLE();
return "Unknown type";
}
#undef RET
}
static void APIENTRY DebugHandler(GLenum source, GLenum type, GLuint id, GLenum severity,
GLsizei length, const GLchar* message, const void* user_param) {
const char format[] = "{} {} {}: {}";
const char* const str_source = GetSource(source);
const char* const str_type = GetType(type);
switch (severity) {
case GL_DEBUG_SEVERITY_HIGH:
LOG_CRITICAL(Render_OpenGL, format, str_source, str_type, id, message);
break;
case GL_DEBUG_SEVERITY_MEDIUM:
LOG_WARNING(Render_OpenGL, format, str_source, str_type, id, message);
break;
case GL_DEBUG_SEVERITY_NOTIFICATION:
case GL_DEBUG_SEVERITY_LOW:
LOG_DEBUG(Render_OpenGL, format, str_source, str_type, id, message);
break;
}
}
bool RendererOpenGL::Init() {
Core::Frontend::ScopeAcquireWindowContext acquire_context{render_window};
@@ -495,7 +478,6 @@ bool RendererOpenGL::Init() {
return true;
}
/// Shutdown the renderer
void RendererOpenGL::ShutDown() {}
} // namespace OpenGL

View File

@@ -59,21 +59,31 @@ public:
void ShutDown() override;
private:
/// Initializes the OpenGL state and creates persistent objects.
void InitOpenGLObjects();
void AddTelemetryFields();
void CreateRasterizer();
void ConfigureFramebufferTexture(TextureInfo& texture,
const Tegra::FramebufferConfig& framebuffer);
/// Draws the emulated screens to the emulator window.
void DrawScreen(const Layout::FramebufferLayout& layout);
void DrawScreenTriangles(const ScreenInfo& screen_info, float x, float y, float w, float h);
/// Updates the framerate.
void UpdateFramerate();
void CaptureScreenshot();
// Loads framebuffer from emulated memory into the display information structure
/// Loads framebuffer from emulated memory into the active OpenGL texture.
void LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuffer);
// Fills active OpenGL texture with the given RGBA color.
/// Fills active OpenGL texture with the given RGB color. Since the color is solid, the texture
/// can be 1x1 but will stretch across whatever it's rendered on.
void LoadColorToActiveGLTexture(u8 color_r, u8 color_g, u8 color_b, u8 color_a,
const TextureInfo& texture);
@@ -94,14 +104,6 @@ private:
/// OpenGL framebuffer data
std::vector<u8> gl_framebuffer_data;
// Shader uniform location indices
GLuint uniform_modelview_matrix;
GLuint uniform_color_texture;
// Shader attribute input indices
GLuint attrib_position;
GLuint attrib_tex_coord;
/// Used for transforming the framebuffer orientation
Tegra::FramebufferConfig::TransformFlags framebuffer_transform_flags;
Common::Rectangle<int> framebuffer_crop_rect;

View File

@@ -44,7 +44,8 @@ vk::SamplerMipmapMode MipmapMode(Tegra::Texture::TextureMipmapFilter mipmap_filt
return {};
}
vk::SamplerAddressMode WrapMode(Tegra::Texture::WrapMode wrap_mode) {
vk::SamplerAddressMode WrapMode(Tegra::Texture::WrapMode wrap_mode,
Tegra::Texture::TextureFilter filter) {
switch (wrap_mode) {
case Tegra::Texture::WrapMode::Wrap:
return vk::SamplerAddressMode::eRepeat;
@@ -55,10 +56,15 @@ vk::SamplerAddressMode WrapMode(Tegra::Texture::WrapMode wrap_mode) {
case Tegra::Texture::WrapMode::Border:
return vk::SamplerAddressMode::eClampToBorder;
case Tegra::Texture::WrapMode::Clamp:
// TODO(Rodrigo): GL_CLAMP was removed as of OpenGL 3.1, to implement GL_CLAMP, we can use
// eClampToBorder to get the border color of the texture, and then sample the edge to
// manually mix them. However the shader part of this is not yet implemented.
return vk::SamplerAddressMode::eClampToBorder;
// TODO(Rodrigo): Emulate GL_CLAMP properly
switch (filter) {
case Tegra::Texture::TextureFilter::Nearest:
return vk::SamplerAddressMode::eClampToEdge;
case Tegra::Texture::TextureFilter::Linear:
return vk::SamplerAddressMode::eClampToBorder;
}
UNREACHABLE();
return vk::SamplerAddressMode::eClampToEdge;
case Tegra::Texture::WrapMode::MirrorOnceClampToEdge:
return vk::SamplerAddressMode::eMirrorClampToEdge;
case Tegra::Texture::WrapMode::MirrorOnceBorder:
@@ -96,106 +102,140 @@ vk::CompareOp DepthCompareFunction(Tegra::Texture::DepthCompareFunc depth_compar
} // namespace Sampler
namespace {
enum : u32 { Attachable = 1, Storage = 2 };
struct FormatTuple {
vk::Format format; ///< Vulkan format
bool attachable; ///< True when this format can be used as an attachment
};
static constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format_tuples = {{
{vk::Format::eA8B8G8R8UnormPack32, true}, // ABGR8U
{vk::Format::eUndefined, false}, // ABGR8S
{vk::Format::eUndefined, false}, // ABGR8UI
{vk::Format::eB5G6R5UnormPack16, false}, // B5G6R5U
{vk::Format::eA2B10G10R10UnormPack32, true}, // A2B10G10R10U
{vk::Format::eUndefined, false}, // A1B5G5R5U
{vk::Format::eR8Unorm, true}, // R8U
{vk::Format::eUndefined, false}, // R8UI
{vk::Format::eUndefined, false}, // RGBA16F
{vk::Format::eUndefined, false}, // RGBA16U
{vk::Format::eUndefined, false}, // RGBA16UI
{vk::Format::eUndefined, false}, // R11FG11FB10F
{vk::Format::eUndefined, false}, // RGBA32UI
{vk::Format::eBc1RgbaUnormBlock, false}, // DXT1
{vk::Format::eBc2UnormBlock, false}, // DXT23
{vk::Format::eBc3UnormBlock, false}, // DXT45
{vk::Format::eBc4UnormBlock, false}, // DXN1
{vk::Format::eUndefined, false}, // DXN2UNORM
{vk::Format::eUndefined, false}, // DXN2SNORM
{vk::Format::eUndefined, false}, // BC7U
{vk::Format::eUndefined, false}, // BC6H_UF16
{vk::Format::eUndefined, false}, // BC6H_SF16
{vk::Format::eUndefined, false}, // ASTC_2D_4X4
{vk::Format::eUndefined, false}, // BGRA8
{vk::Format::eUndefined, false}, // RGBA32F
{vk::Format::eUndefined, false}, // RG32F
{vk::Format::eUndefined, false}, // R32F
{vk::Format::eUndefined, false}, // R16F
{vk::Format::eUndefined, false}, // R16U
{vk::Format::eUndefined, false}, // R16S
{vk::Format::eUndefined, false}, // R16UI
{vk::Format::eUndefined, false}, // R16I
{vk::Format::eUndefined, false}, // RG16
{vk::Format::eUndefined, false}, // RG16F
{vk::Format::eUndefined, false}, // RG16UI
{vk::Format::eUndefined, false}, // RG16I
{vk::Format::eUndefined, false}, // RG16S
{vk::Format::eUndefined, false}, // RGB32F
{vk::Format::eA8B8G8R8SrgbPack32, true}, // RGBA8_SRGB
{vk::Format::eUndefined, false}, // RG8U
{vk::Format::eUndefined, false}, // RG8S
{vk::Format::eUndefined, false}, // RG32UI
{vk::Format::eUndefined, false}, // RGBX16F
{vk::Format::eUndefined, false}, // R32UI
{vk::Format::eUndefined, false}, // ASTC_2D_8X8
{vk::Format::eUndefined, false}, // ASTC_2D_8X5
{vk::Format::eUndefined, false}, // ASTC_2D_5X4
// Compressed sRGB formats
{vk::Format::eUndefined, false}, // BGRA8_SRGB
{vk::Format::eUndefined, false}, // DXT1_SRGB
{vk::Format::eUndefined, false}, // DXT23_SRGB
{vk::Format::eUndefined, false}, // DXT45_SRGB
{vk::Format::eUndefined, false}, // BC7U_SRGB
{vk::Format::eUndefined, false}, // ASTC_2D_4X4_SRGB
{vk::Format::eUndefined, false}, // ASTC_2D_8X8_SRGB
{vk::Format::eUndefined, false}, // ASTC_2D_8X5_SRGB
{vk::Format::eUndefined, false}, // ASTC_2D_5X4_SRGB
{vk::Format::eUndefined, false}, // ASTC_2D_5X5
{vk::Format::eUndefined, false}, // ASTC_2D_5X5_SRGB
{vk::Format::eUndefined, false}, // ASTC_2D_10X8
{vk::Format::eUndefined, false}, // ASTC_2D_10X8_SRGB
int usage; ///< Describes image format usage
} constexpr tex_format_tuples[] = {
{vk::Format::eA8B8G8R8UnormPack32, Attachable | Storage}, // ABGR8U
{vk::Format::eA8B8G8R8SnormPack32, Attachable | Storage}, // ABGR8S
{vk::Format::eA8B8G8R8UintPack32, Attachable | Storage}, // ABGR8UI
{vk::Format::eB5G6R5UnormPack16, {}}, // B5G6R5U
{vk::Format::eA2B10G10R10UnormPack32, Attachable | Storage}, // A2B10G10R10U
{vk::Format::eA1R5G5B5UnormPack16, Attachable | Storage}, // A1B5G5R5U (flipped with swizzle)
{vk::Format::eR8Unorm, Attachable | Storage}, // R8U
{vk::Format::eR8Uint, Attachable | Storage}, // R8UI
{vk::Format::eR16G16B16A16Sfloat, Attachable | Storage}, // RGBA16F
{vk::Format::eR16G16B16A16Unorm, Attachable | Storage}, // RGBA16U
{vk::Format::eR16G16B16A16Uint, Attachable | Storage}, // RGBA16UI
{vk::Format::eB10G11R11UfloatPack32, Attachable | Storage}, // R11FG11FB10F
{vk::Format::eR32G32B32A32Uint, Attachable | Storage}, // RGBA32UI
{vk::Format::eBc1RgbaUnormBlock, {}}, // DXT1
{vk::Format::eBc2UnormBlock, {}}, // DXT23
{vk::Format::eBc3UnormBlock, {}}, // DXT45
{vk::Format::eBc4UnormBlock, {}}, // DXN1
{vk::Format::eBc5UnormBlock, {}}, // DXN2UNORM
{vk::Format::eBc5SnormBlock, {}}, // DXN2SNORM
{vk::Format::eBc7UnormBlock, {}}, // BC7U
{vk::Format::eBc6HUfloatBlock, {}}, // BC6H_UF16
{vk::Format::eBc6HSfloatBlock, {}}, // BC6H_SF16
{vk::Format::eAstc4x4UnormBlock, {}}, // ASTC_2D_4X4
{vk::Format::eB8G8R8A8Unorm, {}}, // BGRA8
{vk::Format::eR32G32B32A32Sfloat, Attachable | Storage}, // RGBA32F
{vk::Format::eR32G32Sfloat, Attachable | Storage}, // RG32F
{vk::Format::eR32Sfloat, Attachable | Storage}, // R32F
{vk::Format::eR16Sfloat, Attachable | Storage}, // R16F
{vk::Format::eR16Unorm, Attachable | Storage}, // R16U
{vk::Format::eUndefined, {}}, // R16S
{vk::Format::eUndefined, {}}, // R16UI
{vk::Format::eUndefined, {}}, // R16I
{vk::Format::eR16G16Unorm, Attachable | Storage}, // RG16
{vk::Format::eR16G16Sfloat, Attachable | Storage}, // RG16F
{vk::Format::eUndefined, {}}, // RG16UI
{vk::Format::eUndefined, {}}, // RG16I
{vk::Format::eR16G16Snorm, Attachable | Storage}, // RG16S
{vk::Format::eUndefined, {}}, // RGB32F
{vk::Format::eR8G8B8A8Srgb, Attachable}, // RGBA8_SRGB
{vk::Format::eR8G8Unorm, Attachable | Storage}, // RG8U
{vk::Format::eR8G8Snorm, Attachable | Storage}, // RG8S
{vk::Format::eR32G32Uint, Attachable | Storage}, // RG32UI
{vk::Format::eUndefined, {}}, // RGBX16F
{vk::Format::eR32Uint, Attachable | Storage}, // R32UI
{vk::Format::eAstc8x8UnormBlock, {}}, // ASTC_2D_8X8
{vk::Format::eUndefined, {}}, // ASTC_2D_8X5
{vk::Format::eUndefined, {}}, // ASTC_2D_5X4
{vk::Format::eUndefined, {}}, // BGRA8_SRGB
{vk::Format::eBc1RgbaSrgbBlock, {}}, // DXT1_SRGB
{vk::Format::eUndefined, {}}, // DXT23_SRGB
{vk::Format::eBc3SrgbBlock, {}}, // DXT45_SRGB
{vk::Format::eBc7SrgbBlock, {}}, // BC7U_SRGB
{vk::Format::eR4G4B4A4UnormPack16, Attachable}, // R4G4B4A4U
{vk::Format::eAstc4x4SrgbBlock, {}}, // ASTC_2D_4X4_SRGB
{vk::Format::eAstc8x8SrgbBlock, {}}, // ASTC_2D_8X8_SRGB
{vk::Format::eAstc8x5SrgbBlock, {}}, // ASTC_2D_8X5_SRGB
{vk::Format::eAstc5x4SrgbBlock, {}}, // ASTC_2D_5X4_SRGB
{vk::Format::eAstc5x5UnormBlock, {}}, // ASTC_2D_5X5
{vk::Format::eAstc5x5SrgbBlock, {}}, // ASTC_2D_5X5_SRGB
{vk::Format::eAstc10x8UnormBlock, {}}, // ASTC_2D_10X8
{vk::Format::eAstc10x8SrgbBlock, {}}, // ASTC_2D_10X8_SRGB
{vk::Format::eAstc6x6UnormBlock, {}}, // ASTC_2D_6X6
{vk::Format::eAstc6x6SrgbBlock, {}}, // ASTC_2D_6X6_SRGB
{vk::Format::eAstc10x10UnormBlock, {}}, // ASTC_2D_10X10
{vk::Format::eAstc10x10SrgbBlock, {}}, // ASTC_2D_10X10_SRGB
{vk::Format::eAstc12x12UnormBlock, {}}, // ASTC_2D_12X12
{vk::Format::eAstc12x12SrgbBlock, {}}, // ASTC_2D_12X12_SRGB
{vk::Format::eAstc8x6UnormBlock, {}}, // ASTC_2D_8X6
{vk::Format::eAstc8x6SrgbBlock, {}}, // ASTC_2D_8X6_SRGB
{vk::Format::eAstc6x5UnormBlock, {}}, // ASTC_2D_6X5
{vk::Format::eAstc6x5SrgbBlock, {}}, // ASTC_2D_6X5_SRGB
{vk::Format::eE5B9G9R9UfloatPack32, {}}, // E5B9G9R9F
// Depth formats
{vk::Format::eD32Sfloat, true}, // Z32F
{vk::Format::eD16Unorm, true}, // Z16
{vk::Format::eD32Sfloat, Attachable}, // Z32F
{vk::Format::eD16Unorm, Attachable}, // Z16
// DepthStencil formats
{vk::Format::eD24UnormS8Uint, true}, // Z24S8
{vk::Format::eD24UnormS8Uint, true}, // S8Z24 (emulated)
{vk::Format::eUndefined, false}, // Z32FS8
}};
{vk::Format::eD24UnormS8Uint, Attachable}, // Z24S8
{vk::Format::eD24UnormS8Uint, Attachable}, // S8Z24 (emulated)
{vk::Format::eD32SfloatS8Uint, Attachable}, // Z32FS8
};
static_assert(std::size(tex_format_tuples) == VideoCore::Surface::MaxPixelFormat);
static constexpr bool IsZetaFormat(PixelFormat pixel_format) {
constexpr bool IsZetaFormat(PixelFormat pixel_format) {
return pixel_format >= PixelFormat::MaxColorFormat &&
pixel_format < PixelFormat::MaxDepthStencilFormat;
}
std::pair<vk::Format, bool> SurfaceFormat(const VKDevice& device, FormatType format_type,
PixelFormat pixel_format) {
ASSERT(static_cast<std::size_t>(pixel_format) < tex_format_tuples.size());
} // Anonymous namespace
const auto tuple = tex_format_tuples[static_cast<u32>(pixel_format)];
UNIMPLEMENTED_IF_MSG(tuple.format == vk::Format::eUndefined,
"Unimplemented texture format with pixel format={}",
static_cast<u32>(pixel_format));
FormatInfo SurfaceFormat(const VKDevice& device, FormatType format_type, PixelFormat pixel_format) {
ASSERT(static_cast<std::size_t>(pixel_format) < std::size(tex_format_tuples));
auto usage = vk::FormatFeatureFlagBits::eSampledImage |
vk::FormatFeatureFlagBits::eTransferDst | vk::FormatFeatureFlagBits::eTransferSrc;
if (tuple.attachable) {
usage |= IsZetaFormat(pixel_format) ? vk::FormatFeatureFlagBits::eDepthStencilAttachment
: vk::FormatFeatureFlagBits::eColorAttachment;
auto tuple = tex_format_tuples[static_cast<std::size_t>(pixel_format)];
if (tuple.format == vk::Format::eUndefined) {
UNIMPLEMENTED_MSG("Unimplemented texture format with pixel format={}",
static_cast<u32>(pixel_format));
return {vk::Format::eA8B8G8R8UnormPack32, true, true};
}
return {device.GetSupportedFormat(tuple.format, usage, format_type), tuple.attachable};
// Use ABGR8 on hardware that doesn't support ASTC natively
if (!device.IsOptimalAstcSupported() && VideoCore::Surface::IsPixelFormatASTC(pixel_format)) {
tuple.format = VideoCore::Surface::IsPixelFormatSRGB(pixel_format)
? vk::Format::eA8B8G8R8SrgbPack32
: vk::Format::eA8B8G8R8UnormPack32;
}
const bool attachable = tuple.usage & Attachable;
const bool storage = tuple.usage & Storage;
vk::FormatFeatureFlags usage;
if (format_type == FormatType::Buffer) {
usage = vk::FormatFeatureFlagBits::eStorageTexelBuffer |
vk::FormatFeatureFlagBits::eUniformTexelBuffer;
} else {
usage = vk::FormatFeatureFlagBits::eSampledImage | vk::FormatFeatureFlagBits::eTransferDst |
vk::FormatFeatureFlagBits::eTransferSrc;
if (attachable) {
usage |= IsZetaFormat(pixel_format) ? vk::FormatFeatureFlagBits::eDepthStencilAttachment
: vk::FormatFeatureFlagBits::eColorAttachment;
}
if (storage) {
usage |= vk::FormatFeatureFlagBits::eStorageImage;
}
}
return {device.GetSupportedFormat(tuple.format, usage, format_type), attachable, storage};
}
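Editor's note: GetSupportedFormat itself is not part of this diff, but the usage flags assembled above feed a capability query of roughly this shape. A minimal sketch, assuming vulkan.hpp and that FormatType enumerates Linear, Optimal and Buffer (assumptions, not code from the diff):

bool IsFormatSupported(vk::PhysicalDevice physical_device, vk::Format format,
                       vk::FormatFeatureFlags wanted, FormatType format_type) {
    // Ask the driver which features this format supports (hypothetical helper,
    // not the actual VKDevice implementation).
    const vk::FormatProperties properties = physical_device.getFormatProperties(format);
    vk::FormatFeatureFlags supported;
    switch (format_type) {
    case FormatType::Linear:
        supported = properties.linearTilingFeatures;
        break;
    case FormatType::Optimal:
        supported = properties.optimalTilingFeatures;
        break;
    case FormatType::Buffer:
        supported = properties.bufferFeatures;
        break;
    default:
        return false;
    }
    // Every requested feature bit must be present.
    return (supported & wanted) == wanted;
}
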
vk::ShaderStageFlagBits ShaderStage(Tegra::Engines::ShaderType stage) {
@@ -215,7 +255,8 @@ vk::ShaderStageFlagBits ShaderStage(Tegra::Engines::ShaderType stage) {
return {};
}
vk::PrimitiveTopology PrimitiveTopology(Maxwell::PrimitiveTopology topology) {
vk::PrimitiveTopology PrimitiveTopology([[maybe_unused]] const VKDevice& device,
Maxwell::PrimitiveTopology topology) {
switch (topology) {
case Maxwell::PrimitiveTopology::Points:
return vk::PrimitiveTopology::ePointList;
@@ -227,6 +268,13 @@ vk::PrimitiveTopology PrimitiveTopology(Maxwell::PrimitiveTopology topology) {
return vk::PrimitiveTopology::eTriangleList;
case Maxwell::PrimitiveTopology::TriangleStrip:
return vk::PrimitiveTopology::eTriangleStrip;
case Maxwell::PrimitiveTopology::TriangleFan:
return vk::PrimitiveTopology::eTriangleFan;
case Maxwell::PrimitiveTopology::Quads:
// TODO(Rodrigo): Use VK_PRIMITIVE_TOPOLOGY_QUAD_LIST_EXT whenever it releases
return vk::PrimitiveTopology::eTriangleList;
case Maxwell::PrimitiveTopology::Patches:
return vk::PrimitiveTopology::ePatchList;
default:
UNIMPLEMENTED_MSG("Unimplemented topology={}", static_cast<u32>(topology));
return {};
@@ -236,37 +284,111 @@ vk::PrimitiveTopology PrimitiveTopology(Maxwell::PrimitiveTopology topology) {
vk::Format VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttribute::Size size) {
switch (type) {
case Maxwell::VertexAttribute::Type::SignedNorm:
switch (size) {
case Maxwell::VertexAttribute::Size::Size_8:
return vk::Format::eR8Snorm;
case Maxwell::VertexAttribute::Size::Size_8_8:
return vk::Format::eR8G8Snorm;
case Maxwell::VertexAttribute::Size::Size_8_8_8:
return vk::Format::eR8G8B8Snorm;
case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
return vk::Format::eR8G8B8A8Snorm;
case Maxwell::VertexAttribute::Size::Size_16:
return vk::Format::eR16Snorm;
case Maxwell::VertexAttribute::Size::Size_16_16:
return vk::Format::eR16G16Snorm;
case Maxwell::VertexAttribute::Size::Size_16_16_16:
return vk::Format::eR16G16B16Snorm;
case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
return vk::Format::eR16G16B16A16Snorm;
case Maxwell::VertexAttribute::Size::Size_10_10_10_2:
return vk::Format::eA2B10G10R10SnormPack32;
default:
break;
}
break;
case Maxwell::VertexAttribute::Type::UnsignedNorm:
switch (size) {
case Maxwell::VertexAttribute::Size::Size_8:
return vk::Format::eR8Unorm;
case Maxwell::VertexAttribute::Size::Size_8_8:
return vk::Format::eR8G8Unorm;
case Maxwell::VertexAttribute::Size::Size_8_8_8:
return vk::Format::eR8G8B8Unorm;
case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
return vk::Format::eR8G8B8A8Unorm;
case Maxwell::VertexAttribute::Size::Size_16:
return vk::Format::eR16Unorm;
case Maxwell::VertexAttribute::Size::Size_16_16:
return vk::Format::eR16G16Unorm;
case Maxwell::VertexAttribute::Size::Size_16_16_16:
return vk::Format::eR16G16B16Unorm;
case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
return vk::Format::eR16G16B16A16Unorm;
default:
break;
}
break;
case Maxwell::VertexAttribute::Type::SignedInt:
break;
switch (size) {
case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
return vk::Format::eR16G16B16A16Sint;
case Maxwell::VertexAttribute::Size::Size_8:
return vk::Format::eR8Sint;
case Maxwell::VertexAttribute::Size::Size_8_8:
return vk::Format::eR8G8Sint;
case Maxwell::VertexAttribute::Size::Size_8_8_8:
return vk::Format::eR8G8B8Sint;
case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
return vk::Format::eR8G8B8A8Sint;
case Maxwell::VertexAttribute::Size::Size_32:
return vk::Format::eR32Sint;
default:
break;
}
case Maxwell::VertexAttribute::Type::UnsignedInt:
switch (size) {
case Maxwell::VertexAttribute::Size::Size_8:
return vk::Format::eR8Uint;
case Maxwell::VertexAttribute::Size::Size_8_8:
return vk::Format::eR8G8Uint;
case Maxwell::VertexAttribute::Size::Size_8_8_8:
return vk::Format::eR8G8B8Uint;
case Maxwell::VertexAttribute::Size::Size_8_8_8_8:
return vk::Format::eR8G8B8A8Uint;
case Maxwell::VertexAttribute::Size::Size_32:
return vk::Format::eR32Uint;
default:
break;
}
case Maxwell::VertexAttribute::Type::UnsignedScaled:
switch (size) {
case Maxwell::VertexAttribute::Size::Size_8_8:
return vk::Format::eR8G8Uscaled;
default:
break;
}
break;
case Maxwell::VertexAttribute::Type::SignedScaled:
break;
case Maxwell::VertexAttribute::Type::Float:
switch (size) {
case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
return vk::Format::eR32G32B32A32Sfloat;
case Maxwell::VertexAttribute::Size::Size_32_32_32:
return vk::Format::eR32G32B32Sfloat;
case Maxwell::VertexAttribute::Size::Size_32_32:
return vk::Format::eR32G32Sfloat;
case Maxwell::VertexAttribute::Size::Size_32:
return vk::Format::eR32Sfloat;
case Maxwell::VertexAttribute::Size::Size_32_32:
return vk::Format::eR32G32Sfloat;
case Maxwell::VertexAttribute::Size::Size_32_32_32:
return vk::Format::eR32G32B32Sfloat;
case Maxwell::VertexAttribute::Size::Size_32_32_32_32:
return vk::Format::eR32G32B32A32Sfloat;
case Maxwell::VertexAttribute::Size::Size_16:
return vk::Format::eR16Sfloat;
case Maxwell::VertexAttribute::Size::Size_16_16:
return vk::Format::eR16G16Sfloat;
case Maxwell::VertexAttribute::Size::Size_16_16_16:
return vk::Format::eR16G16B16Sfloat;
case Maxwell::VertexAttribute::Size::Size_16_16_16_16:
return vk::Format::eR16G16B16A16Sfloat;
default:
break;
}
@@ -308,11 +430,14 @@ vk::CompareOp ComparisonOp(Maxwell::ComparisonOp comparison) {
return {};
}
vk::IndexType IndexFormat(Maxwell::IndexFormat index_format) {
vk::IndexType IndexFormat(const VKDevice& device, Maxwell::IndexFormat index_format) {
switch (index_format) {
case Maxwell::IndexFormat::UnsignedByte:
UNIMPLEMENTED_MSG("Vulkan does not support native u8 index format");
return vk::IndexType::eUint16;
if (!device.IsExtIndexTypeUint8Supported()) {
UNIMPLEMENTED_MSG("Native uint8 indices are not supported on this device");
return vk::IndexType::eUint16;
}
return vk::IndexType::eUint8EXT;
case Maxwell::IndexFormat::UnsignedShort:
return vk::IndexType::eUint16;
case Maxwell::IndexFormat::UnsignedInt:


@@ -4,7 +4,6 @@
#pragma once
#include <utility>
#include "common/common_types.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/renderer_vulkan/declarations.h"
@@ -23,24 +22,31 @@ vk::Filter Filter(Tegra::Texture::TextureFilter filter);
vk::SamplerMipmapMode MipmapMode(Tegra::Texture::TextureMipmapFilter mipmap_filter);
vk::SamplerAddressMode WrapMode(Tegra::Texture::WrapMode wrap_mode);
vk::SamplerAddressMode WrapMode(Tegra::Texture::WrapMode wrap_mode,
Tegra::Texture::TextureFilter filter);
vk::CompareOp DepthCompareFunction(Tegra::Texture::DepthCompareFunc depth_compare_func);
} // namespace Sampler
std::pair<vk::Format, bool> SurfaceFormat(const VKDevice& device, FormatType format_type,
PixelFormat pixel_format);
struct FormatInfo {
vk::Format format;
bool attachable;
bool storage;
};
FormatInfo SurfaceFormat(const VKDevice& device, FormatType format_type, PixelFormat pixel_format);
vk::ShaderStageFlagBits ShaderStage(Tegra::Engines::ShaderType stage);
vk::PrimitiveTopology PrimitiveTopology(Maxwell::PrimitiveTopology topology);
vk::PrimitiveTopology PrimitiveTopology(const VKDevice& device,
Maxwell::PrimitiveTopology topology);
vk::Format VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttribute::Size size);
vk::CompareOp ComparisonOp(Maxwell::ComparisonOp comparison);
vk::IndexType IndexFormat(Maxwell::IndexFormat index_format);
vk::IndexType IndexFormat(const VKDevice& device, Maxwell::IndexFormat index_format);
vk::StencilOp StencilOp(Maxwell::StencilOp stencil_op);


@@ -0,0 +1,24 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
/*
* Build instructions:
* $ glslangValidator -V $THIS_FILE -o output.spv
* $ spirv-opt -O --strip-debug output.spv -o optimized.spv
* $ xxd -i optimized.spv
*
* Then copy that bytecode to the C++ file
*/
#version 460 core
layout (location = 0) in vec2 frag_tex_coord;
layout (location = 0) out vec4 color;
layout (binding = 1) uniform sampler2D color_texture;
void main() {
color = texture(color_texture, frag_tex_coord);
}


@@ -0,0 +1,28 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
/*
* Build instructions:
* $ glslangValidator -V $THIS_FILE -o output.spv
* $ spirv-opt -O --strip-debug output.spv -o optimized.spv
* $ xxd -i optimized.spv
*
* Then copy that bytecode to the C++ file
*/
#version 460 core
layout (location = 0) in vec2 vert_position;
layout (location = 1) in vec2 vert_tex_coord;
layout (location = 0) out vec2 frag_tex_coord;
layout (set = 0, binding = 0) uniform MatrixBlock {
mat4 modelview_matrix;
};
void main() {
gl_Position = modelview_matrix * vec4(vert_position, 0.0, 1.0);
frag_tex_coord = vert_tex_coord;
}


@@ -0,0 +1,37 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
/*
* Build instructions:
* $ glslangValidator -V $THIS_FILE -o output.spv
* $ spirv-opt -O --strip-debug output.spv -o optimized.spv
* $ xxd -i optimized.spv
*
* Then copy that bytecode to the C++ file
*/
#version 460 core
layout (local_size_x = 1024) in;
layout (std430, set = 0, binding = 0) buffer OutputBuffer {
uint output_indexes[];
};
layout (push_constant) uniform PushConstants {
uint first;
};
void main() {
uint primitive = gl_GlobalInvocationID.x;
if (primitive * 6 >= output_indexes.length()) {
return;
}
const uint quad_map[6] = uint[](0, 1, 2, 0, 2, 3);
for (uint vertex = 0; vertex < 6; ++vertex) {
uint index = first + primitive * 4 + quad_map[vertex];
output_indexes[primitive * 6 + vertex] = index;
}
}
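For reference, a CPU-side rendition of what this shader produces, useful for checking the index math by hand (editor's sketch, standard C++ only): each quad contributes two triangles that share its first and third vertices.

#include <cstdint>
#include <vector>

std::vector<uint32_t> QuadsToTriangles(uint32_t first, uint32_t num_quads) {
    static constexpr uint32_t quad_map[6] = {0, 1, 2, 0, 2, 3};
    std::vector<uint32_t> indices;
    indices.reserve(num_quads * 6);
    for (uint32_t primitive = 0; primitive < num_quads; ++primitive) {
        for (uint32_t vertex = 0; vertex < 6; ++vertex) {
            // Mirrors "first + primitive * 4 + quad_map[vertex]" in the shader.
            indices.push_back(first + primitive * 4 + quad_map[vertex]);
        }
    }
    return indices; // quad 0 with first == 0 yields 0, 1, 2, 0, 2, 3
}
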


@@ -0,0 +1,33 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
/*
* Build instructions:
* $ glslangValidator -V $THIS_FILE -o output.spv
* $ spirv-opt -O --strip-debug output.spv -o optimized.spv
* $ xxd -i optimized.spv
*
* Then copy that bytecode to the C++ file
*/
#version 460 core
#extension GL_EXT_shader_16bit_storage : require
#extension GL_EXT_shader_8bit_storage : require
layout (local_size_x = 1024) in;
layout (std430, set = 0, binding = 0) readonly buffer InputBuffer {
uint8_t input_indexes[];
};
layout (std430, set = 0, binding = 1) writeonly buffer OutputBuffer {
uint16_t output_indexes[];
};
void main() {
uint id = gl_GlobalInvocationID.x;
if (id < input_indexes.length()) {
output_indexes[id] = uint16_t(input_indexes[id]);
}
}


@@ -46,9 +46,10 @@ UniqueSampler VKSamplerCache::CreateSampler(const Tegra::Texture::TSCEntry& tsc)
{}, MaxwellToVK::Sampler::Filter(tsc.mag_filter),
MaxwellToVK::Sampler::Filter(tsc.min_filter),
MaxwellToVK::Sampler::MipmapMode(tsc.mipmap_filter),
MaxwellToVK::Sampler::WrapMode(tsc.wrap_u), MaxwellToVK::Sampler::WrapMode(tsc.wrap_v),
MaxwellToVK::Sampler::WrapMode(tsc.wrap_p), tsc.GetLodBias(), has_anisotropy,
max_anisotropy, tsc.depth_compare_enabled,
MaxwellToVK::Sampler::WrapMode(tsc.wrap_u, tsc.mag_filter),
MaxwellToVK::Sampler::WrapMode(tsc.wrap_v, tsc.mag_filter),
MaxwellToVK::Sampler::WrapMode(tsc.wrap_p, tsc.mag_filter), tsc.GetLodBias(),
has_anisotropy, max_anisotropy, tsc.depth_compare_enabled,
MaxwellToVK::Sampler::DepthCompareFunction(tsc.depth_compare_func), tsc.GetMinLod(),
tsc.GetMaxLod(), vk_border_color.value_or(vk::BorderColor::eFloatTransparentBlack),
unnormalized_coords);


@@ -3,7 +3,7 @@
// Refer to the license.txt file included.
#include "common/assert.h"
#include "common/logging/log.h"
#include "common/microprofile.h"
#include "video_core/renderer_vulkan/declarations.h"
#include "video_core/renderer_vulkan/vk_device.h"
#include "video_core/renderer_vulkan/vk_resource_manager.h"
@@ -11,46 +11,172 @@
namespace Vulkan {
VKScheduler::VKScheduler(const VKDevice& device, VKResourceManager& resource_manager)
: device{device}, resource_manager{resource_manager} {
next_fence = &resource_manager.CommitFence();
AllocateNewContext();
MICROPROFILE_DECLARE(Vulkan_WaitForWorker);
void VKScheduler::CommandChunk::ExecuteAll(vk::CommandBuffer cmdbuf,
const vk::DispatchLoaderDynamic& dld) {
auto command = first;
while (command != nullptr) {
auto next = command->GetNext();
command->Execute(cmdbuf, dld);
command->~Command();
command = next;
}
command_offset = 0;
first = nullptr;
last = nullptr;
}
VKScheduler::~VKScheduler() = default;
VKScheduler::VKScheduler(const VKDevice& device, VKResourceManager& resource_manager)
: device{device}, resource_manager{resource_manager}, next_fence{
&resource_manager.CommitFence()} {
AcquireNewChunk();
AllocateNewContext();
worker_thread = std::thread(&VKScheduler::WorkerThread, this);
}
VKScheduler::~VKScheduler() {
quit = true;
cv.notify_all();
worker_thread.join();
}
void VKScheduler::Flush(bool release_fence, vk::Semaphore semaphore) {
SubmitExecution(semaphore);
if (release_fence)
if (release_fence) {
current_fence->Release();
}
AllocateNewContext();
}
void VKScheduler::Finish(bool release_fence, vk::Semaphore semaphore) {
SubmitExecution(semaphore);
current_fence->Wait();
if (release_fence)
if (release_fence) {
current_fence->Release();
}
AllocateNewContext();
}
void VKScheduler::WaitWorker() {
MICROPROFILE_SCOPE(Vulkan_WaitForWorker);
DispatchWork();
bool finished = false;
do {
cv.notify_all();
std::unique_lock lock{mutex};
finished = chunk_queue.Empty();
} while (!finished);
}
void VKScheduler::DispatchWork() {
if (chunk->Empty()) {
return;
}
chunk_queue.Push(std::move(chunk));
cv.notify_all();
AcquireNewChunk();
}
void VKScheduler::RequestRenderpass(const vk::RenderPassBeginInfo& renderpass_bi) {
if (state.renderpass && renderpass_bi == *state.renderpass) {
return;
}
const bool end_renderpass = state.renderpass.has_value();
state.renderpass = renderpass_bi;
Record([renderpass_bi, end_renderpass](auto cmdbuf, auto& dld) {
if (end_renderpass) {
cmdbuf.endRenderPass(dld);
}
cmdbuf.beginRenderPass(renderpass_bi, vk::SubpassContents::eInline, dld);
});
}
void VKScheduler::RequestOutsideRenderPassOperationContext() {
EndRenderPass();
}
void VKScheduler::BindGraphicsPipeline(vk::Pipeline pipeline) {
if (state.graphics_pipeline == pipeline) {
return;
}
state.graphics_pipeline = pipeline;
Record([pipeline](auto cmdbuf, auto& dld) {
cmdbuf.bindPipeline(vk::PipelineBindPoint::eGraphics, pipeline, dld);
});
}
void VKScheduler::WorkerThread() {
std::unique_lock lock{mutex};
do {
cv.wait(lock, [this] { return !chunk_queue.Empty() || quit; });
if (quit) {
continue;
}
auto extracted_chunk = std::move(chunk_queue.Front());
chunk_queue.Pop();
extracted_chunk->ExecuteAll(current_cmdbuf, device.GetDispatchLoader());
chunk_reserve.Push(std::move(extracted_chunk));
} while (!quit);
}
void VKScheduler::SubmitExecution(vk::Semaphore semaphore) {
EndPendingOperations();
InvalidateState();
WaitWorker();
std::unique_lock lock{mutex};
const auto queue = device.GetGraphicsQueue();
const auto& dld = device.GetDispatchLoader();
current_cmdbuf.end(dld);
const auto queue = device.GetGraphicsQueue();
const vk::SubmitInfo submit_info(0, nullptr, nullptr, 1, &current_cmdbuf, semaphore ? 1u : 0u,
const vk::SubmitInfo submit_info(0, nullptr, nullptr, 1, &current_cmdbuf, semaphore ? 1U : 0U,
&semaphore);
queue.submit({submit_info}, *current_fence, dld);
queue.submit({submit_info}, static_cast<vk::Fence>(*current_fence), dld);
}
void VKScheduler::AllocateNewContext() {
std::unique_lock lock{mutex};
current_fence = next_fence;
current_cmdbuf = resource_manager.CommitCommandBuffer(*current_fence);
next_fence = &resource_manager.CommitFence();
const auto& dld = device.GetDispatchLoader();
current_cmdbuf.begin({vk::CommandBufferUsageFlagBits::eOneTimeSubmit}, dld);
current_cmdbuf = resource_manager.CommitCommandBuffer(*current_fence);
current_cmdbuf.begin({vk::CommandBufferUsageFlagBits::eOneTimeSubmit},
device.GetDispatchLoader());
}
void VKScheduler::InvalidateState() {
state.graphics_pipeline = nullptr;
state.viewports = false;
state.scissors = false;
state.depth_bias = false;
state.blend_constants = false;
state.depth_bounds = false;
state.stencil_values = false;
}
void VKScheduler::EndPendingOperations() {
EndRenderPass();
}
void VKScheduler::EndRenderPass() {
if (!state.renderpass) {
return;
}
state.renderpass = std::nullopt;
Record([](auto cmdbuf, auto& dld) { cmdbuf.endRenderPass(dld); });
}
void VKScheduler::AcquireNewChunk() {
if (chunk_reserve.Empty()) {
chunk = std::make_unique<CommandChunk>();
return;
}
chunk = std::move(chunk_reserve.Front());
chunk_reserve.Pop();
}
} // namespace Vulkan
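To make the handoff above easier to follow, here is a stripped-down model of the same producer/consumer scheme (editor's sketch in standard C++; the real scheduler uses Common::SPSCQueue, recycles chunks through chunk_reserve, and busy-notifies in WaitWorker instead of using a second condition variable):

#include <condition_variable>
#include <mutex>
#include <queue>
#include <thread>

class WorkerModel {
public:
    WorkerModel() : thread{&WorkerModel::Run, this} {}

    ~WorkerModel() {
        {
            std::unique_lock lock{mutex};
            quit = true;
        }
        cv.notify_all();
        thread.join();
    }

    // Producer side: equivalent of DispatchWork().
    void Push(int chunk) {
        {
            std::unique_lock lock{mutex};
            queue.push(chunk);
        }
        cv.notify_all();
    }

    // Equivalent of WaitWorker(): blocks until everything dispatched so far ran.
    void WaitEmpty() {
        std::unique_lock lock{mutex};
        drained.wait(lock, [this] { return queue.empty(); });
    }

private:
    // Consumer side: equivalent of WorkerThread().
    void Run() {
        std::unique_lock lock{mutex};
        while (true) {
            cv.wait(lock, [this] { return !queue.empty() || quit; });
            if (quit) {
                return;
            }
            queue.pop(); // the real code executes the chunk's recorded commands here
            drained.notify_all();
        }
    }

    std::mutex mutex;
    std::condition_variable cv;
    std::condition_variable drained;
    std::queue<int> queue;
    bool quit = false;
    std::thread thread;
};
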


@@ -4,7 +4,14 @@
#pragma once
#include <condition_variable>
#include <memory>
#include <optional>
#include <stack>
#include <thread>
#include <utility>
#include "common/common_types.h"
#include "common/threadsafe_queue.h"
#include "video_core/renderer_vulkan/declarations.h"
namespace Vulkan {
@@ -30,23 +37,6 @@ private:
VKFence* const& fence;
};
class VKCommandBufferView {
public:
VKCommandBufferView() = default;
VKCommandBufferView(const vk::CommandBuffer& cmdbuf) : cmdbuf{cmdbuf} {}
const vk::CommandBuffer* operator->() const noexcept {
return &cmdbuf;
}
operator vk::CommandBuffer() const noexcept {
return cmdbuf;
}
private:
const vk::CommandBuffer& cmdbuf;
};
/// The scheduler abstracts command buffer and fence management with an interface that's able to do
/// OpenGL-like operations on Vulkan command buffers.
class VKScheduler {
@@ -54,32 +44,190 @@ public:
explicit VKScheduler(const VKDevice& device, VKResourceManager& resource_manager);
~VKScheduler();
/// Gets a reference to the current fence.
VKFenceView GetFence() const {
return current_fence;
}
/// Gets a reference to the current command buffer.
VKCommandBufferView GetCommandBuffer() const {
return current_cmdbuf;
}
/// Sends the current execution context to the GPU.
void Flush(bool release_fence = true, vk::Semaphore semaphore = nullptr);
/// Sends the current execution context to the GPU and waits for it to complete.
void Finish(bool release_fence = true, vk::Semaphore semaphore = nullptr);
/// Waits for the worker thread to finish executing everything. After this function returns, it's
/// safe to touch worker resources.
void WaitWorker();
/// Sends currently recorded work to the worker thread.
void DispatchWork();
/// Requests to begin a renderpass.
void RequestRenderpass(const vk::RenderPassBeginInfo& renderpass_bi);
/// Requests the current execution context to be able to execute operations only allowed outside
/// of a renderpass.
void RequestOutsideRenderPassOperationContext();
/// Binds a pipeline to the current execution context.
void BindGraphicsPipeline(vk::Pipeline pipeline);
/// Returns true when viewports have been set in the current command buffer.
bool TouchViewports() {
return std::exchange(state.viewports, true);
}
/// Returns true when scissors have been set in the current command buffer.
bool TouchScissors() {
return std::exchange(state.scissors, true);
}
/// Returns true when depth bias have been set in the current command buffer.
bool TouchDepthBias() {
return std::exchange(state.depth_bias, true);
}
/// Returns true when blend constants have been set in the current command buffer.
bool TouchBlendConstants() {
return std::exchange(state.blend_constants, true);
}
/// Returns true when depth bounds have been set in the current command buffer.
bool TouchDepthBounds() {
return std::exchange(state.depth_bounds, true);
}
/// Returns true when stencil values have been set in the current command buffer.
bool TouchStencilValues() {
return std::exchange(state.stencil_values, true);
}
/// Sends work to a separate thread.
template <typename T>
void Record(T&& command) {
if (chunk->Record(command)) {
return;
}
DispatchWork();
(void)chunk->Record(command);
}
/// Gets a reference to the current fence.
VKFenceView GetFence() const {
return current_fence;
}
private:
class Command {
public:
virtual ~Command() = default;
virtual void Execute(vk::CommandBuffer cmdbuf,
const vk::DispatchLoaderDynamic& dld) const = 0;
Command* GetNext() const {
return next;
}
void SetNext(Command* next_) {
next = next_;
}
private:
Command* next = nullptr;
};
template <typename T>
class TypedCommand final : public Command {
public:
explicit TypedCommand(T&& command) : command{std::move(command)} {}
~TypedCommand() override = default;
TypedCommand(TypedCommand&&) = delete;
TypedCommand& operator=(TypedCommand&&) = delete;
void Execute(vk::CommandBuffer cmdbuf,
const vk::DispatchLoaderDynamic& dld) const override {
command(cmdbuf, dld);
}
private:
T command;
};
class CommandChunk final {
public:
void ExecuteAll(vk::CommandBuffer cmdbuf, const vk::DispatchLoaderDynamic& dld);
template <typename T>
bool Record(T& command) {
using FuncType = TypedCommand<T>;
static_assert(sizeof(FuncType) < sizeof(data), "Lambda is too large");
if (command_offset > sizeof(data) - sizeof(FuncType)) {
return false;
}
Command* current_last = last;
last = new (data.data() + command_offset) FuncType(std::move(command));
if (current_last) {
current_last->SetNext(last);
} else {
first = last;
}
command_offset += sizeof(FuncType);
return true;
}
bool Empty() const {
return command_offset == 0;
}
private:
Command* first = nullptr;
Command* last = nullptr;
std::size_t command_offset = 0;
std::array<u8, 0x8000> data{};
};
void WorkerThread();
void SubmitExecution(vk::Semaphore semaphore);
void AllocateNewContext();
void InvalidateState();
void EndPendingOperations();
void EndRenderPass();
void AcquireNewChunk();
const VKDevice& device;
VKResourceManager& resource_manager;
vk::CommandBuffer current_cmdbuf;
VKFence* current_fence = nullptr;
VKFence* next_fence = nullptr;
struct State {
std::optional<vk::RenderPassBeginInfo> renderpass;
vk::Pipeline graphics_pipeline;
bool viewports = false;
bool scissors = false;
bool depth_bias = false;
bool blend_constants = false;
bool depth_bounds = false;
bool stencil_values = false;
} state;
std::unique_ptr<CommandChunk> chunk;
std::thread worker_thread;
Common::SPSCQueue<std::unique_ptr<CommandChunk>> chunk_queue;
Common::SPSCQueue<std::unique_ptr<CommandChunk>> chunk_reserve;
std::mutex mutex;
std::condition_variable cv;
bool quit = false;
};
} // namespace Vulkan
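The chunk machinery above type-erases arbitrary lambdas into a fixed 32 KiB buffer with placement new, so recording a command never allocates. A reduced model of the pattern (editor's sketch, standard C++ only; the real code additionally links commands into an intrusive list so execution can walk the buffer and run each destructor, and Execute receives the command buffer and dispatch loader):

#include <array>
#include <cstddef>
#include <new>
#include <utility>

class ChunkModel {
public:
    template <typename T>
    bool Record(T& command) {
        using Func = TypedCommand<T>;
        static_assert(sizeof(Func) < sizeof(data), "Lambda is too large");
        if (offset + sizeof(Func) > sizeof(data)) {
            return false; // chunk full: caller dispatches it and records into a fresh one
        }
        // Construct the command in-place inside the chunk's storage.
        new (data.data() + offset) Func(std::move(command));
        offset += sizeof(Func);
        return true;
    }

private:
    struct Command {
        virtual ~Command() = default;
        virtual void Execute() const = 0;
    };

    template <typename T>
    struct TypedCommand final : Command {
        explicit TypedCommand(T&& command) : command{std::move(command)} {}
        void Execute() const override {
            command();
        }
        T command;
    };

    std::size_t offset = 0;
    std::array<std::byte, 0x8000> data{};
};
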

File diff suppressed because it is too large.


@@ -5,29 +5,28 @@
#pragma once
#include <array>
#include <bitset>
#include <memory>
#include <set>
#include <type_traits>
#include <utility>
#include <vector>
#include <sirit/sirit.h>
#include "common/common_types.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/engines/shader_type.h"
#include "video_core/shader/shader_ir.h"
namespace VideoCommon::Shader {
class ShaderIR;
}
namespace Vulkan {
class VKDevice;
}
namespace Vulkan::VKShader {
namespace Vulkan {
using Maxwell = Tegra::Engines::Maxwell3D::Regs;
using TexelBufferEntry = VideoCommon::Shader::Sampler;
using SamplerEntry = VideoCommon::Shader::Sampler;
using ImageEntry = VideoCommon::Shader::Image;
constexpr u32 DESCRIPTOR_SET = 0;
@@ -46,39 +45,74 @@ private:
class GlobalBufferEntry {
public:
explicit GlobalBufferEntry(u32 cbuf_index, u32 cbuf_offset)
: cbuf_index{cbuf_index}, cbuf_offset{cbuf_offset} {}
constexpr explicit GlobalBufferEntry(u32 cbuf_index, u32 cbuf_offset, bool is_written)
: cbuf_index{cbuf_index}, cbuf_offset{cbuf_offset}, is_written{is_written} {}
u32 GetCbufIndex() const {
constexpr u32 GetCbufIndex() const {
return cbuf_index;
}
u32 GetCbufOffset() const {
constexpr u32 GetCbufOffset() const {
return cbuf_offset;
}
constexpr bool IsWritten() const {
return is_written;
}
private:
u32 cbuf_index{};
u32 cbuf_offset{};
bool is_written{};
};
struct ShaderEntries {
u32 const_buffers_base_binding{};
u32 global_buffers_base_binding{};
u32 samplers_base_binding{};
u32 NumBindings() const {
return static_cast<u32>(const_buffers.size() + global_buffers.size() +
texel_buffers.size() + samplers.size() + images.size());
}
std::vector<ConstBufferEntry> const_buffers;
std::vector<GlobalBufferEntry> global_buffers;
std::vector<TexelBufferEntry> texel_buffers;
std::vector<SamplerEntry> samplers;
std::vector<ImageEntry> images;
std::set<u32> attributes;
std::array<bool, Maxwell::NumClipDistances> clip_distances{};
std::size_t shader_length{};
Sirit::Id entry_function{};
std::vector<Sirit::Id> interfaces;
bool uses_warps{};
};
using DecompilerResult = std::pair<std::unique_ptr<Sirit::Module>, ShaderEntries>;
struct Specialization final {
u32 base_binding{};
DecompilerResult Decompile(const VKDevice& device, const VideoCommon::Shader::ShaderIR& ir,
Tegra::Engines::ShaderType stage);
// Compute specific
std::array<u32, 3> workgroup_size{};
u32 shared_memory_size{};
} // namespace Vulkan::VKShader
// Graphics specific
Maxwell::PrimitiveTopology primitive_topology{};
std::optional<float> point_size{};
std::array<Maxwell::VertexAttribute::Type, Maxwell::NumVertexAttributes> attribute_types{};
// Tessellation specific
struct {
Maxwell::TessellationPrimitive primitive{};
Maxwell::TessellationSpacing spacing{};
bool clockwise{};
} tessellation;
};
// Old gcc versions don't consider this trivially copyable.
// static_assert(std::is_trivially_copyable_v<Specialization>);
struct SPIRVShader {
std::vector<u32> code;
ShaderEntries entries;
};
ShaderEntries GenerateShaderEntries(const VideoCommon::Shader::ShaderIR& ir);
std::vector<u32> Decompile(const VKDevice& device, const VideoCommon::Shader::ShaderIR& ir,
Tegra::Engines::ShaderType stage, const Specialization& specialization);
} // namespace Vulkan


@@ -63,12 +63,11 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
case OpCode::Id::I2F_R:
case OpCode::Id::I2F_C:
case OpCode::Id::I2F_IMM: {
UNIMPLEMENTED_IF(instr.conversion.int_src.selector != 0);
UNIMPLEMENTED_IF(instr.conversion.dst_size == Register::Size::Long);
UNIMPLEMENTED_IF_MSG(instr.generates_cc,
"Condition codes generation in I2F is not implemented");
Node value = [&]() {
Node value = [&] {
switch (opcode->get().GetId()) {
case OpCode::Id::I2F_R:
return GetRegister(instr.gpr20);
@@ -81,7 +80,19 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
return Immediate(0);
}
}();
const bool input_signed = instr.conversion.is_input_signed;
if (instr.conversion.src_size == Register::Size::Byte) {
const u32 offset = static_cast<u32>(instr.conversion.int_src.selector) * 8;
if (offset > 0) {
value = SignedOperation(OperationCode::ILogicalShiftRight, input_signed,
std::move(value), Immediate(offset));
}
} else {
UNIMPLEMENTED_IF(instr.conversion.int_src.selector != 0);
}
value = ConvertIntegerSize(value, instr.conversion.src_size, input_signed);
value = GetOperandAbsNegInteger(value, instr.conversion.abs_a, false, input_signed);
value = SignedOperation(OperationCode::FCastInteger, input_signed, PRECISE, value);
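Concretely, the selector logic above behaves like this host-side model for the unsigned case (editor's sketch, standard C++ only; the signed path would sign-extend with an arithmetic shift instead):

#include <cstdint>
#include <cstdio>

float I2FByte(uint32_t value, uint32_t selector) {
    // ".B0" through ".B3" choose which byte of the source register is converted.
    const uint32_t byte = (value >> (selector * 8)) & 0xff;
    return static_cast<float>(byte);
}

int main() {
    std::printf("%.0f\n", I2FByte(0xaabbccdd, 0)); // 221 (0xdd, the ".B0" default)
    std::printf("%.0f\n", I2FByte(0xaabbccdd, 3)); // 170 (0xaa, ".B3")
}
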


@@ -21,7 +21,24 @@ using Tegra::Shader::OpCode;
using Tegra::Shader::Register;
namespace {
u32 GetUniformTypeElementsCount(Tegra::Shader::UniformType uniform_type) {
u32 GetLdgMemorySize(Tegra::Shader::UniformType uniform_type) {
switch (uniform_type) {
case Tegra::Shader::UniformType::UnsignedByte:
case Tegra::Shader::UniformType::Single:
return 1;
case Tegra::Shader::UniformType::Double:
return 2;
case Tegra::Shader::UniformType::Quad:
case Tegra::Shader::UniformType::UnsignedQuad:
return 4;
default:
UNIMPLEMENTED_MSG("Unimplemented size={}!", static_cast<u32>(uniform_type));
return 1;
}
}
u32 GetStgMemorySize(Tegra::Shader::UniformType uniform_type) {
switch (uniform_type) {
case Tegra::Shader::UniformType::Single:
return 1;
@@ -35,6 +52,7 @@ u32 GetUniformTypeElementsCount(Tegra::Shader::UniformType uniform_type) {
return 1;
}
}
} // Anonymous namespace
u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
@@ -168,7 +186,7 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
const auto [real_address_base, base_address, descriptor] =
TrackGlobalMemory(bb, instr, false);
const u32 count = GetUniformTypeElementsCount(type);
const u32 count = GetLdgMemorySize(type);
if (!real_address_base || !base_address) {
// Tracking failed, load zeroes.
for (u32 i = 0; i < count; ++i) {
@@ -179,12 +197,22 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
for (u32 i = 0; i < count; ++i) {
const Node it_offset = Immediate(i * 4);
const Node real_address =
Operation(OperationCode::UAdd, NO_PRECISE, real_address_base, it_offset);
const Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor);
const Node real_address = Operation(OperationCode::UAdd, real_address_base, it_offset);
Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor);
if (type == Tegra::Shader::UniformType::UnsignedByte) {
// To handle unaligned loads, get the byte used to dereference global memory
// and extract that byte from the loaded uint32.
Node byte = Operation(OperationCode::UBitwiseAnd, real_address, Immediate(3));
byte = Operation(OperationCode::ULogicalShiftLeft, std::move(byte), Immediate(3));
gmem = Operation(OperationCode::UBitfieldExtract, std::move(gmem), std::move(byte),
Immediate(8));
}
SetTemporary(bb, i, gmem);
}
for (u32 i = 0; i < count; ++i) {
SetRegister(bb, instr.gpr0.Value() + i, GetTemporary(i));
}
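The emitted IR corresponds to this host-side model of an unaligned byte load (editor's sketch, standard C++ only):

#include <cstdint>

uint32_t LoadGlobalU8(const uint32_t* memory, uint32_t address) {
    // Global memory is read in aligned 32-bit words...
    const uint32_t word = memory[address / 4];
    // ...so select the byte the (possibly unaligned) address refers to:
    // bitfieldExtract(word, (address % 4) * 8, 8)
    const uint32_t shift = (address & 3) * 8;
    return (word >> shift) & 0xff;
}
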
@@ -196,28 +224,28 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
UNIMPLEMENTED_IF_MSG((instr.attribute.fmt20.immediate.Value() % sizeof(u32)) != 0,
"Unaligned attribute loads are not supported");
u64 next_element = instr.attribute.fmt20.element;
auto next_index = static_cast<u64>(instr.attribute.fmt20.index.Value());
u64 element = instr.attribute.fmt20.element;
auto index = static_cast<u64>(instr.attribute.fmt20.index.Value());
const auto StoreNextElement = [&](u32 reg_offset) {
const auto dest = GetOutputAttribute(static_cast<Attribute::Index>(next_index),
next_element, GetRegister(instr.gpr39));
const u32 num_words = static_cast<u32>(instr.attribute.fmt20.size.Value()) + 1;
for (u32 reg_offset = 0; reg_offset < num_words; ++reg_offset) {
Node dest;
if (instr.attribute.fmt20.patch) {
const u32 offset = static_cast<u32>(index) * 4 + static_cast<u32>(element);
dest = MakeNode<PatchNode>(offset);
} else {
dest = GetOutputAttribute(static_cast<Attribute::Index>(index), element,
GetRegister(instr.gpr39));
}
const auto src = GetRegister(instr.gpr0.Value() + reg_offset);
bb.push_back(Operation(OperationCode::Assign, dest, src));
// Load the next attribute element into the following register. If the element
// to load goes beyond the vec4 size, load the first element of the next
// attribute.
next_element = (next_element + 1) % 4;
next_index = next_index + (next_element == 0 ? 1 : 0);
};
const u32 num_words = static_cast<u32>(instr.attribute.fmt20.size.Value()) + 1;
for (u32 reg_offset = 0; reg_offset < num_words; ++reg_offset) {
StoreNextElement(reg_offset);
// Load the next attribute element into the following register. If the element to load
// goes beyond the vec4 size, load the first element of the next attribute.
element = (element + 1) % 4;
index = index + (element == 0 ? 1 : 0);
}
break;
}
case OpCode::Id::ST_L:
@@ -274,7 +302,7 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
break;
}
const u32 count = GetUniformTypeElementsCount(type);
const u32 count = GetStgMemorySize(type);
for (u32 i = 0; i < count; ++i) {
const Node it_offset = Immediate(i * 4);
const Node real_address = Operation(OperationCode::UAdd, real_address_base, it_offset);

View File

@@ -69,6 +69,8 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
case OpCode::Id::MOV_SYS: {
const Node value = [this, instr] {
switch (instr.sys20) {
case SystemVariable::InvocationId:
return Operation(OperationCode::InvocationId);
case SystemVariable::Ydirection:
return Operation(OperationCode::YNegate);
case SystemVariable::InvocationInfo:
@@ -255,6 +257,12 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
SetRegister(bb, instr.gpr0, GetRegister(instr.gpr8));
break;
}
case OpCode::Id::MEMBAR: {
UNIMPLEMENTED_IF(instr.membar.type != Tegra::Shader::MembarType::GL);
UNIMPLEMENTED_IF(instr.membar.unknown != Tegra::Shader::MembarUnknown::Default);
bb.push_back(Operation(OperationCode::MemoryBarrierGL));
break;
}
case OpCode::Id::DEPBAR: {
LOG_DEBUG(HW_GPU, "DEPBAR instruction is stubbed");
break;


@@ -107,8 +107,8 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
break;
}
case OpCode::Id::TLD4S: {
UNIMPLEMENTED_IF_MSG(instr.tld4s.UsesMiscMode(TextureMiscMode::AOFFI),
"AOFFI is not implemented");
const bool uses_aoffi = instr.tld4s.UsesMiscMode(TextureMiscMode::AOFFI);
UNIMPLEMENTED_IF_MSG(uses_aoffi, "AOFFI is not implemented");
const bool depth_compare = instr.tld4s.UsesMiscMode(TextureMiscMode::DC);
const Node op_a = GetRegister(instr.gpr8);
@@ -116,29 +116,40 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
// TODO(Subv): Figure out how the sampler type is encoded in the TLD4S instruction.
std::vector<Node> coords;
Node dc_reg;
if (depth_compare) {
// Note: TLD4S coordinate encoding works just like TEXS's
const Node op_y = GetRegister(instr.gpr8.Value() + 1);
coords.push_back(op_a);
coords.push_back(op_y);
coords.push_back(op_b);
dc_reg = uses_aoffi ? GetRegister(instr.gpr20.Value() + 1) : op_b;
} else {
coords.push_back(op_a);
coords.push_back(op_b);
if (uses_aoffi) {
const Node op_y = GetRegister(instr.gpr8.Value() + 1);
coords.push_back(op_y);
} else {
coords.push_back(op_b);
}
dc_reg = {};
}
const Node component = Immediate(static_cast<u32>(instr.tld4s.component));
const SamplerInfo info{TextureType::Texture2D, false, depth_compare};
const auto& sampler = GetSampler(instr.sampler, info);
const Sampler& sampler = *GetSampler(instr.sampler, info);
Node4 values;
for (u32 element = 0; element < values.size(); ++element) {
auto coords_copy = coords;
MetaTexture meta{sampler, {}, {}, {}, {}, {}, {}, component, element};
MetaTexture meta{sampler, {}, dc_reg, {}, {}, {}, {}, component, element};
values[element] = Operation(OperationCode::TextureGather, meta, std::move(coords_copy));
}
WriteTexsInstructionFloat(bb, instr, values, true);
if (instr.tld4s.fp16_flag) {
WriteTexsInstructionHalfFloat(bb, instr, values, true);
} else {
WriteTexsInstructionFloat(bb, instr, values, true);
}
break;
}
case OpCode::Id::TXD_B:
@@ -154,9 +165,17 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
const auto texture_type = instr.txd.texture_type.Value();
const auto coord_count = GetCoordCount(texture_type);
const auto& sampler = is_bindless
? GetBindlessSampler(base_reg, {{texture_type, false, false}})
: GetSampler(instr.sampler, {{texture_type, false, false}});
const Sampler* sampler = is_bindless
? GetBindlessSampler(base_reg, {{texture_type, false, false}})
: GetSampler(instr.sampler, {{texture_type, false, false}});
Node4 values;
if (sampler == nullptr) {
for (u32 element = 0; element < values.size(); ++element) {
values[element] = Immediate(0);
}
WriteTexInstructionFloat(bb, instr, values);
break;
}
if (is_bindless) {
base_reg++;
}
@@ -170,9 +189,8 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
derivates.push_back(GetRegister(derivate_reg + derivate + 1));
}
Node4 values;
for (u32 element = 0; element < values.size(); ++element) {
MetaTexture meta{sampler, {}, {}, {}, derivates, {}, {}, {}, element};
MetaTexture meta{*sampler, {}, {}, {}, derivates, {}, {}, {}, element};
values[element] = Operation(OperationCode::TextureGradient, std::move(meta), coords);
}
@@ -187,9 +205,24 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
// TODO: The new commits on the texture refactor change the way samplers work.
// Sadly, not all texture instructions specify the type of texture their sampler
// uses. This must be fixed at a later instance.
const auto& sampler =
const Sampler* sampler =
is_bindless ? GetBindlessSampler(instr.gpr8) : GetSampler(instr.sampler);
if (sampler == nullptr) {
u32 indexer = 0;
for (u32 element = 0; element < 4; ++element) {
if (!instr.txq.IsComponentEnabled(element)) {
continue;
}
const Node value = Immediate(0);
SetTemporary(bb, indexer++, value);
}
for (u32 i = 0; i < indexer; ++i) {
SetRegister(bb, instr.gpr0.Value() + i, GetTemporary(i));
}
break;
}
u32 indexer = 0;
switch (instr.txq.query_type) {
case Tegra::Shader::TextureQueryType::Dimension: {
@@ -197,7 +230,7 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
if (!instr.txq.IsComponentEnabled(element)) {
continue;
}
MetaTexture meta{sampler, {}, {}, {}, {}, {}, {}, {}, element};
MetaTexture meta{*sampler, {}, {}, {}, {}, {}, {}, {}, element};
const Node value =
Operation(OperationCode::TextureQueryDimensions, meta,
GetRegister(instr.gpr8.Value() + (is_bindless ? 1 : 0)));
@@ -223,9 +256,24 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
auto texture_type = instr.tmml.texture_type.Value();
const bool is_array = instr.tmml.array != 0;
const auto& sampler =
const Sampler* sampler =
is_bindless ? GetBindlessSampler(instr.gpr20) : GetSampler(instr.sampler);
if (sampler == nullptr) {
u32 indexer = 0;
for (u32 element = 0; element < 2; ++element) {
if (!instr.tmml.IsComponentEnabled(element)) {
continue;
}
const Node value = Immediate(0);
SetTemporary(bb, indexer++, value);
}
for (u32 i = 0; i < indexer; ++i) {
SetRegister(bb, instr.gpr0.Value() + i, GetTemporary(i));
}
break;
}
std::vector<Node> coords;
// TODO: Add coordinates for different samplers once other texture types are implemented.
@@ -251,7 +299,7 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) {
continue;
}
auto params = coords;
MetaTexture meta{sampler, {}, {}, {}, {}, {}, {}, {}, element};
MetaTexture meta{*sampler, {}, {}, {}, {}, {}, {}, {}, element};
const Node value = Operation(OperationCode::TextureQueryLod, meta, std::move(params));
SetTemporary(bb, indexer++, value);
}
@@ -307,7 +355,7 @@ ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo(std::optional<SamplerInfo> sample
sampler->is_buffer != 0};
}
const Sampler& ShaderIR::GetSampler(const Tegra::Shader::Sampler& sampler,
const Sampler* ShaderIR::GetSampler(const Tegra::Shader::Sampler& sampler,
std::optional<SamplerInfo> sampler_info) {
const auto offset = static_cast<u32>(sampler.index.Value());
const auto info = GetSamplerInfo(sampler_info, offset);
@@ -319,21 +367,24 @@ const Sampler& ShaderIR::GetSampler(const Tegra::Shader::Sampler& sampler,
if (it != used_samplers.end()) {
ASSERT(!it->IsBindless() && it->GetType() == info.type && it->IsArray() == info.is_array &&
it->IsShadow() == info.is_shadow && it->IsBuffer() == info.is_buffer);
return *it;
return &(*it);
}
// Otherwise create a new mapping for this sampler
const auto next_index = static_cast<u32>(used_samplers.size());
return used_samplers.emplace_back(next_index, offset, info.type, info.is_array, info.is_shadow,
info.is_buffer);
return &used_samplers.emplace_back(next_index, offset, info.type, info.is_array, info.is_shadow,
info.is_buffer);
}
const Sampler& ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg,
const Sampler* ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg,
std::optional<SamplerInfo> sampler_info) {
const Node sampler_register = GetRegister(reg);
const auto [base_sampler, buffer, offset] =
TrackCbuf(sampler_register, global_code, static_cast<s64>(global_code.size()));
ASSERT(base_sampler != nullptr);
if (base_sampler == nullptr) {
return nullptr;
}
const auto info = GetSamplerInfo(sampler_info, offset, buffer);
@@ -346,13 +397,13 @@ const Sampler& ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg,
if (it != used_samplers.end()) {
ASSERT(it->IsBindless() && it->GetType() == info.type && it->IsArray() == info.is_array &&
it->IsShadow() == info.is_shadow);
return *it;
return &(*it);
}
// Otherwise create a new mapping for this sampler
const auto next_index = static_cast<u32>(used_samplers.size());
return used_samplers.emplace_back(next_index, offset, buffer, info.type, info.is_array,
info.is_shadow, info.is_buffer);
return &used_samplers.emplace_back(next_index, offset, buffer, info.type, info.is_array,
info.is_shadow, info.is_buffer);
}
void ShaderIR::WriteTexInstructionFloat(NodeBlock& bb, Instruction instr, const Node4& components) {
@@ -395,14 +446,14 @@ void ShaderIR::WriteTexsInstructionFloat(NodeBlock& bb, Instruction instr, const
}
void ShaderIR::WriteTexsInstructionHalfFloat(NodeBlock& bb, Instruction instr,
const Node4& components) {
const Node4& components, bool ignore_mask) {
// TEXS.F16 destination registers are packed in two registers in pairs (just like any half
// float instruction).
Node4 values;
u32 dest_elem = 0;
for (u32 component = 0; component < 4; ++component) {
if (!instr.texs.IsComponentEnabled(component))
if (!instr.texs.IsComponentEnabled(component) && !ignore_mask)
continue;
values[dest_elem++] = components[component];
}
@@ -438,8 +489,15 @@ Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type,
"This method is not supported.");
const SamplerInfo info{texture_type, is_array, is_shadow, false};
const auto& sampler =
const Sampler* sampler =
is_bindless ? GetBindlessSampler(*bindless_reg, info) : GetSampler(instr.sampler, info);
Node4 values;
if (sampler == nullptr) {
for (u32 element = 0; element < values.size(); ++element) {
values[element] = Immediate(0);
}
return values;
}
const bool lod_needed = process_mode == TextureProcessMode::LZ ||
process_mode == TextureProcessMode::LL ||
@@ -478,10 +536,9 @@ Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type,
}
}
Node4 values;
for (u32 element = 0; element < values.size(); ++element) {
auto copy_coords = coords;
MetaTexture meta{sampler, array, depth_compare, aoffi, {}, bias, lod, {}, element};
MetaTexture meta{*sampler, array, depth_compare, aoffi, {}, bias, lod, {}, element};
values[element] = Operation(read_method, meta, std::move(copy_coords));
}
@@ -594,8 +651,15 @@ Node4 ShaderIR::GetTld4Code(Instruction instr, TextureType texture_type, bool de
u64 parameter_register = instr.gpr20.Value();
const SamplerInfo info{texture_type, is_array, depth_compare, false};
const auto& sampler = is_bindless ? GetBindlessSampler(parameter_register++, info)
: GetSampler(instr.sampler, info);
const Sampler* sampler = is_bindless ? GetBindlessSampler(parameter_register++, info)
: GetSampler(instr.sampler, info);
Node4 values;
if (sampler == nullptr) {
for (u32 element = 0; element < values.size(); ++element) {
values[element] = Immediate(0);
}
return values;
}
std::vector<Node> aoffi;
if (is_aoffi) {
@@ -610,10 +674,9 @@ Node4 ShaderIR::GetTld4Code(Instruction instr, TextureType texture_type, bool de
const Node component = is_bindless ? Immediate(static_cast<u32>(instr.tld4_b.component))
: Immediate(static_cast<u32>(instr.tld4.component));
Node4 values;
for (u32 element = 0; element < values.size(); ++element) {
auto coords_copy = coords;
MetaTexture meta{sampler, GetRegister(array_register), dc, aoffi, {}, {}, {}, component,
MetaTexture meta{*sampler, GetRegister(array_register), dc, aoffi, {}, {}, {}, component,
element};
values[element] = Operation(OperationCode::TextureGather, meta, std::move(coords_copy));
}
@@ -642,7 +705,7 @@ Node4 ShaderIR::GetTldCode(Tegra::Shader::Instruction instr) {
// const Node aoffi_register{is_aoffi ? GetRegister(gpr20_cursor++) : nullptr};
// const Node multisample{is_multisample ? GetRegister(gpr20_cursor++) : nullptr};
const auto& sampler = GetSampler(instr.sampler);
const auto& sampler = *GetSampler(instr.sampler);
Node4 values;
for (u32 element = 0; element < values.size(); ++element) {
@@ -655,7 +718,7 @@ Node4 ShaderIR::GetTldCode(Tegra::Shader::Instruction instr) {
}
Node4 ShaderIR::GetTldsCode(Instruction instr, TextureType texture_type, bool is_array) {
const auto& sampler = GetSampler(instr.sampler);
const Sampler& sampler = *GetSampler(instr.sampler);
const std::size_t type_coord_count = GetCoordCount(texture_type);
const bool lod_enabled = instr.tlds.GetTextureProcessMode() == TextureProcessMode::LL;
@@ -680,13 +743,18 @@ Node4 ShaderIR::GetTldsCode(Instruction instr, TextureType texture_type, bool is
// When lod is used, it is always in gpr20
const Node lod = lod_enabled ? GetRegister(instr.gpr20) : Immediate(0);
// Fill empty entries from the guest sampler.
// Fill empty entries from the guest sampler
const std::size_t entry_coord_count = GetCoordCount(sampler.GetType());
if (type_coord_count != entry_coord_count) {
LOG_WARNING(HW_GPU, "Bound and built texture types mismatch");
}
for (std::size_t i = type_coord_count; i < entry_coord_count; ++i) {
coords.push_back(GetRegister(Register::ZeroIndex));
// When the size is higher we insert zeroes
for (std::size_t i = type_coord_count; i < entry_coord_count; ++i) {
coords.push_back(GetRegister(Register::ZeroIndex));
}
// Then we ensure the size matches the number of entries (dropping unused values)
coords.resize(entry_coord_count);
}
Node4 values;


@@ -38,6 +38,9 @@ u32 ShaderIR::DecodeWarp(NodeBlock& bb, u32 pc) {
const Instruction instr = {program_code[pc]};
const auto opcode = OpCode::Decode(instr);
// Signal the backend that this shader uses warp instructions.
uses_warps = true;
switch (opcode->get().GetId()) {
case OpCode::Id::VOTE: {
const Node value = GetPredicate(instr.vote.value, instr.vote.negate_value != 0);


@@ -172,6 +172,7 @@ enum class OperationCode {
EmitVertex, /// () -> void
EndPrimitive, /// () -> void
InvocationId, /// () -> int
YNegate, /// () -> float
LocalInvocationIdX, /// () -> uint
LocalInvocationIdY, /// () -> uint
@@ -188,6 +189,8 @@ enum class OperationCode {
ThreadId, /// () -> uint
ShuffleIndexed, /// (uint value, uint index) -> uint
MemoryBarrierGL, /// () -> void
Amount,
};
@@ -213,13 +216,14 @@ class PredicateNode;
class AbufNode;
class CbufNode;
class LmemNode;
class PatchNode;
class SmemNode;
class GmemNode;
class CommentNode;
using NodeData =
std::variant<OperationNode, ConditionalNode, GprNode, ImmediateNode, InternalFlagNode,
PredicateNode, AbufNode, CbufNode, LmemNode, SmemNode, GmemNode, CommentNode>;
using NodeData = std::variant<OperationNode, ConditionalNode, GprNode, ImmediateNode,
InternalFlagNode, PredicateNode, AbufNode, PatchNode, CbufNode,
LmemNode, SmemNode, GmemNode, CommentNode>;
using Node = std::shared_ptr<NodeData>;
using Node4 = std::array<Node, 4>;
using NodeBlock = std::vector<Node>;
@@ -542,6 +546,19 @@ private:
u32 element{};
};
/// Patch memory (used to communicate tessellation stages).
class PatchNode final {
public:
explicit PatchNode(u32 offset) : offset{offset} {}
u32 GetOffset() const {
return offset;
}
private:
u32 offset{};
};
/// Constant buffer node, usually mapped to uniform buffers in GLSL
class CbufNode final {
public:


@@ -137,6 +137,10 @@ public:
return uses_vertex_id;
}
bool UsesWarps() const {
return uses_warps;
}
bool HasPhysicalAttributes() const {
return uses_physical_attributes;
}
@@ -309,11 +313,11 @@ private:
std::optional<u32> buffer = std::nullopt);
/// Accesses a texture sampler
const Sampler& GetSampler(const Tegra::Shader::Sampler& sampler,
const Sampler* GetSampler(const Tegra::Shader::Sampler& sampler,
std::optional<SamplerInfo> sampler_info = std::nullopt);
/// Accesses a texture sampler for a bindless texture.
const Sampler& GetBindlessSampler(Tegra::Shader::Register reg,
const Sampler* GetBindlessSampler(Tegra::Shader::Register reg,
std::optional<SamplerInfo> sampler_info = std::nullopt);
/// Accesses an image.
@@ -334,7 +338,7 @@ private:
void WriteTexsInstructionFloat(NodeBlock& bb, Tegra::Shader::Instruction instr,
const Node4& components, bool ignore_mask = false);
void WriteTexsInstructionHalfFloat(NodeBlock& bb, Tegra::Shader::Instruction instr,
const Node4& components);
const Node4& components, bool ignore_mask = false);
Node4 GetTexCode(Tegra::Shader::Instruction instr, Tegra::Shader::TextureType texture_type,
Tegra::Shader::TextureProcessMode process_mode, bool depth_compare,
@@ -415,6 +419,7 @@ private:
bool uses_physical_attributes{}; // Shader uses AL2P or physical attribute read/writes
bool uses_instance_id{};
bool uses_vertex_id{};
bool uses_warps{};
Tegra::Shader::Header header;
};


@@ -7,6 +7,7 @@
#include <variant>
#include "common/common_types.h"
#include "video_core/shader/node.h"
#include "video_core/shader/shader_ir.h"
namespace VideoCommon::Shader {


@@ -392,4 +392,42 @@ std::string SurfaceParams::TargetName() const {
}
}
u32 SurfaceParams::GetBlockSize() const {
const u32 x = 64U << block_width;
const u32 y = 8U << block_height;
const u32 z = 1U << block_depth;
return x * y * z;
}
std::pair<u32, u32> SurfaceParams::GetBlockXY() const {
const u32 x_pixels = 64U / GetBytesPerPixel();
const u32 x = x_pixels << block_width;
const u32 y = 8U << block_height;
return {x, y};
}
std::tuple<u32, u32, u32> SurfaceParams::GetBlockOffsetXYZ(u32 offset) const {
const auto div_ceil = [](const u32 x, const u32 y) { return ((x + y - 1) / y); };
const u32 block_size = GetBlockSize();
const u32 block_index = offset / block_size;
const u32 gob_offset = offset % block_size;
const u32 gob_index = gob_offset / static_cast<u32>(Tegra::Texture::GetGOBSize());
const u32 x_gob_pixels = 64U / GetBytesPerPixel();
const u32 x_block_pixels = x_gob_pixels << block_width;
const u32 y_block_pixels = 8U << block_height;
const u32 z_block_pixels = 1U << block_depth;
const u32 x_blocks = div_ceil(width, x_block_pixels);
const u32 y_blocks = div_ceil(height, y_block_pixels);
const u32 z_blocks = div_ceil(depth, z_block_pixels);
const u32 base_x = block_index % x_blocks;
const u32 base_y = (block_index / x_blocks) % y_blocks;
const u32 base_z = (block_index / (x_blocks * y_blocks)) % z_blocks;
u32 x = base_x * x_block_pixels;
u32 y = base_y * y_block_pixels;
u32 z = base_z * z_block_pixels;
z += gob_index >> block_height;
y += (gob_index * 8U) % y_block_pixels;
return {x, y, z};
}
} // namespace VideoCommon
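As a worked example of the block-linear math above, here is a hedged, self-contained recomputation of GetBlockOffsetXYZ for one assumed configuration (RGBA8, width 256, block_width 0, block_height 4, block_depth 0); none of these values come from the change itself.

// Hedged sketch: standalone recomputation of GetBlockOffsetXYZ.
#include <cstdint>
#include <cstdio>

int main() {
    constexpr std::uint32_t gob_size = 512;                    // 64 bytes x 8 rows
    constexpr std::uint32_t block_size = 64u * (8u << 4) * 1u; // 8192 bytes
    constexpr std::uint32_t x_block_pixels = (64u / 4u) << 0;  // 16 pixels (RGBA8)
    constexpr std::uint32_t y_block_pixels = 8u << 4;          // 128 rows
    constexpr std::uint32_t x_blocks = 256u / x_block_pixels;  // 16 blocks per row

    constexpr std::uint32_t offset = 20000;
    constexpr std::uint32_t block_index = offset / block_size;            // 2
    constexpr std::uint32_t gob_index = (offset % block_size) / gob_size; // 7
    const std::uint32_t x = (block_index % x_blocks) * x_block_pixels;    // 32
    // base_y is 0 here because block_index < x_blocks.
    const std::uint32_t y = (gob_index * 8u) % y_block_pixels;            // 56
    const std::uint32_t z = gob_index >> 4;                               // 0
    std::printf("x=%u y=%u z=%u\n", x, y, z);
    return 0;
}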

View File

@@ -4,6 +4,8 @@
#pragma once
#include <utility>
#include "common/alignment.h"
#include "common/bit_util.h"
#include "common/cityhash.h"
@@ -136,6 +138,15 @@ public:
std::size_t GetConvertedMipmapSize(u32 level) const;
/// Get this texture's Tegra block size in the guest memory layout
u32 GetBlockSize() const;
/// Get the maximum X and Y sizes of a single block, in pixels.
std::pair<u32, u32> GetBlockXY() const;
/// Get the X, Y, Z coordinates corresponding to a guest memory offset
std::tuple<u32, u32, u32> GetBlockOffsetXYZ(u32 offset) const;
/// Returns the size of a layer in bytes in guest memory.
std::size_t GetGuestLayerSize() const {
return GetLayerSize(false, false);
@@ -269,7 +280,8 @@ private:
/// Returns the size of all mipmap levels and aligns as needed.
std::size_t GetInnerMemorySize(bool as_host_size, bool layer_only, bool uncompressed) const {
return GetLayerSize(as_host_size, uncompressed) * (layer_only ? 1U : depth);
return GetLayerSize(as_host_size, uncompressed) *
(layer_only ? 1U : (is_layered ? depth : 1U));
}
/// Returns the size of a layer
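The one-line change above matters for 3D textures, where depth counts slices rather than array layers. As a hedged worked example with an assumed 4096-byte layer: a layered surface with depth = 6 still reports 4096 * 6 = 24576 bytes, while a non-layered 3D surface now reports 4096 * 1 = 4096 bytes, because its depth slices are already folded into GetLayerSize.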

View File

@@ -615,6 +615,86 @@ private:
return {{new_surface, new_surface->GetMainView()}};
}
/**
 * Takes care of managing 3D textures and their slices. Uses HLE methods to reconstruct the
 * 3D textures within the GPU when possible, and falls back to LLE when none of the HLE
 * methods can be used.
 *
 * @param overlaps The overlapping surfaces registered in the cache.
 * @param params The parameters of the new surface.
 * @param gpu_addr The starting address of the new surface.
 * @param cache_addr The starting address of the new surface in physical memory.
 * @param preserve_contents Indicates whether the new surface should be loaded from memory or
 * left blank.
 */
std::optional<std::pair<TSurface, TView>> Manage3DSurfaces(std::vector<TSurface>& overlaps,
const SurfaceParams& params,
const GPUVAddr gpu_addr,
const CacheAddr cache_addr,
bool preserve_contents) {
if (params.target == SurfaceTarget::Texture3D) {
bool failed = false;
if (params.num_levels > 1) {
// We can't handle mipmaps in 3D textures yet; fall back to the LLE approach
return std::nullopt;
}
TSurface new_surface = GetUncachedSurface(gpu_addr, params);
bool modified = false;
for (auto& surface : overlaps) {
const SurfaceParams& src_params = surface->GetSurfaceParams();
if (src_params.target != SurfaceTarget::Texture2D) {
failed = true;
break;
}
if (src_params.height != params.height) {
failed = true;
break;
}
if (src_params.block_depth != params.block_depth ||
src_params.block_height != params.block_height) {
failed = true;
break;
}
const u32 offset = static_cast<u32>(surface->GetCacheAddr() - cache_addr);
const auto [x, y, z] = params.GetBlockOffsetXYZ(offset);
modified |= surface->IsModified();
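// Note: only the Z coordinate from GetBlockOffsetXYZ is consumed here; each
// overlapping 2D surface becomes a single slice at depth z of the new 3D texture.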
const CopyParams copy_params(0, 0, 0, 0, 0, z, 0, 0, params.width, params.height,
1);
ImageCopy(surface, new_surface, copy_params);
}
if (failed) {
return std::nullopt;
}
for (const auto& surface : overlaps) {
Unregister(surface);
}
new_surface->MarkAsModified(modified, Tick());
Register(new_surface);
auto view = new_surface->GetMainView();
return {{std::move(new_surface), view}};
} else {
for (const auto& surface : overlaps) {
if (!surface->MatchTarget(params.target)) {
if (overlaps.size() == 1 && surface->GetCacheAddr() == cache_addr) {
if (Settings::values.use_accurate_gpu_emulation) {
return std::nullopt;
}
Unregister(surface);
return InitializeSurface(gpu_addr, params, preserve_contents);
}
return std::nullopt;
}
if (surface->GetCacheAddr() != cache_addr) {
continue;
}
if (surface->MatchesStructure(params) == MatchStructureResult::FullMatch) {
return {{surface, surface->GetMainView()}};
}
}
return InitializeSurface(gpu_addr, params, preserve_contents);
}
}
/**
* Gets the starting address and parameters of a candidate surface and tries
* to find a matching surface within the cache. This is done in 3 big steps:
@@ -687,6 +767,15 @@ private:
}
}
// Check if it's a 3D texture
if (params.block_depth > 0) {
auto surface =
Manage3DSurfaces(overlaps, params, gpu_addr, cache_addr, preserve_contents);
if (surface) {
return *surface;
}
}
// Split cases between 1 overlap or many.
if (overlaps.size() == 1) {
TSurface current_surface = overlaps[0];
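Condensed, the 3D branch above accepts the overlap set only when the new surface has a single mip level and every overlap is a 2D slice matching its height and block configuration; a hedged sketch of that acceptance test (the helper name is an assumption):

// Hedged sketch: the conditions under which the HLE 3D path can rebuild the
// texture from 2D overlaps, mirroring the checks in Manage3DSurfaces.
bool CanReconstruct3D(const SurfaceParams& dst, const SurfaceParams& src) {
    if (dst.num_levels > 1) {
        return false; // Mipmapped 3D textures fall back to the LLE approach.
    }
    return src.target == SurfaceTarget::Texture2D && src.height == dst.height &&
           src.block_depth == dst.block_depth && src.block_height == dst.block_height;
}

Each accepted overlap is image-copied into the slice derived from its address offset, the overlaps are unregistered, and the new 3D surface takes their place in the cache.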

View File

@@ -12,6 +12,10 @@ namespace Tegra::Texture {
// GOB size constant, calculated as 64 bytes in X multiplied by 8 Y coordinates; it represents
// a small rectangle of (64 / bytes_per_pixel) x 8 pixels.
inline std::size_t GetGOBSize() {
return 512;
}
inline std::size_t GetGOBSizeShift() {
return 9;
}
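A quick sanity check of the constants above (hedged, for illustration): a GOB is 64 bytes wide by 8 rows tall, so a 4-bytes-per-pixel format covers 16x8 pixels per GOB, and the shift agrees with the size.

// Hedged check that the two constants stay consistent.
static_assert(64 * 8 == 512, "a GOB is 64 bytes by 8 rows");
static_assert((std::size_t{1} << 9) == 512, "GetGOBSizeShift() must match GetGOBSize()");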