Compare commits

..

17 Commits

Author SHA1 Message Date
ameerj
c439fc9be9 astc_decoder: Reduce workgroup size
This reduces the amount of over dispatching when there are odd dimensions (i.e. ASTC 8x5), which rarely evenly divide into 32x32.
2021-08-01 01:22:27 -04:00
ameerj
5ab8053511 astc_decoder: Compute offset swizzles in-shader
Alleviates the dependency on the swizzle table and a uniform which is constant for all ASTC texture sizes.
2021-08-01 01:22:26 -04:00
ameerj
b2862e4772 astc_decoder: Make use of uvec4 for payload data 2021-07-31 22:28:04 -04:00
ameerj
a75d70fa90 astc_decoder: Simplify Select2DPartition 2021-07-31 21:36:26 -04:00
ameerj
5665d05547 astc_decoder: Optimize the use EncodingData
This buffer was a list of EncodingData structures sorted by their bit length, with some duplication from the cpu decoder implementation.
We can take advantage of its sorted property to optimize its usage in the shader.

Thanks to wwylele for the optimization idea.
2021-07-31 21:36:26 -04:00
ameerj
15c0c213b1 astc.h: Move data to cpp implementation
Moves leftover values that are no longer used by the gpu decoder back to the cpp implementation.
2021-07-31 21:26:42 -04:00
Mai M
7cf0958b06 Merge pull request #6788 from Morph1984/hle_api_12.1.0
hle: api_version: Update HOS version to 12.1.0
2021-07-31 20:12:35 -04:00
Morph
9143fe5d3a hle: api_version: Update HOS version to 12.1.0
Keeps us up to date with reporting the system version.
2021-07-31 14:39:49 -04:00
bunnei
47f13a9df4 Merge pull request #6752 from Morph1984/pt-br
service: ns, set: Add PT_BR (Brazilian Portuguese)
2021-07-30 14:42:11 -07:00
bunnei
2c7fdee7a7 Merge pull request #6775 from lat9nq/cmd-remove-global-core
emu_window: Remove global system instance
2021-07-30 13:28:24 -07:00
bunnei
7530594602 Merge pull request #6759 from ReinUsesLisp/pipeline-statistics
renderer_vulkan: Add setting to log pipeline statistics
2021-07-30 11:18:52 -07:00
bunnei
0334b9b776 Merge pull request #6770 from Morph1984/swkbd_buffer_size
applet_swkbd: Correct string buffer size calculation
2021-07-30 09:26:35 -07:00
Morph
ba3d230421 applet_swkbd: Correct string buffer size calculation
The buffer size here does not include the initial 8 bytes.
2021-07-30 02:19:04 -04:00
Morph
275db94bb8 configure_system: Add Brazilian Portuguese to the list of languages 2021-07-30 02:15:53 -04:00
Morph
6ca8ed9e58 service: set: Correct 4.0.0 max_entries to 0x40 (64) instead of 17 2021-07-30 02:15:53 -04:00
Morph
21ff0a3d6e service: ns, set: Add PT_BR (Brazilian Portuguese) 2021-07-30 02:15:53 -04:00
ReinUsesLisp
3b006f4fe2 renderer_vulkan: Add setting to log pipeline statistics
Use VK_KHR_pipeline_executable_properties when enabled and available to
log statistics about the pipeline cache in a game.

For example, this is on Turing GPUs when generating a pipeline cache
from Super Smash Bros. Ultimate:

Average pipeline statistics
==========================================
Code size:       6433.167
Register count:    32.939

More advanced results could be presented, at the moment it's just an
average of all 3D and compute pipelines.
2021-07-27 21:29:24 -03:00
32 changed files with 586 additions and 464 deletions

View File

@@ -319,6 +319,7 @@ struct Values {
// Renderer
Setting<RendererBackend> renderer_backend{RendererBackend::OpenGL, "backend"};
BasicSetting<bool> renderer_debug{false, "debug"};
BasicSetting<bool> renderer_shader_feedback{false, "shader_feedback"};
BasicSetting<bool> enable_nsight_aftermath{false, "nsight_aftermath"};
BasicSetting<bool> disable_shader_loop_safety_checks{false,
"disable_shader_loop_safety_checks"};

View File

@@ -12,9 +12,9 @@ namespace HLE::ApiVersion {
// Horizon OS version constants.
constexpr u8 HOS_VERSION_MAJOR = 11;
constexpr u8 HOS_VERSION_MINOR = 0;
constexpr u8 HOS_VERSION_MICRO = 1;
constexpr u8 HOS_VERSION_MAJOR = 12;
constexpr u8 HOS_VERSION_MINOR = 1;
constexpr u8 HOS_VERSION_MICRO = 0;
// NintendoSDK version constants.
@@ -22,15 +22,15 @@ constexpr u8 SDK_REVISION_MAJOR = 1;
constexpr u8 SDK_REVISION_MINOR = 0;
constexpr char PLATFORM_STRING[] = "NX";
constexpr char VERSION_HASH[] = "69103fcb2004dace877094c2f8c29e6113be5dbf";
constexpr char DISPLAY_VERSION[] = "11.0.1";
constexpr char DISPLAY_TITLE[] = "NintendoSDK Firmware for NX 11.0.1-1.0";
constexpr char VERSION_HASH[] = "76b10c2dab7d3aa73fc162f8dff1655e6a21caf4";
constexpr char DISPLAY_VERSION[] = "12.1.0";
constexpr char DISPLAY_TITLE[] = "NintendoSDK Firmware for NX 12.1.0-1.0";
// Atmosphere version constants.
constexpr u8 ATMOSPHERE_RELEASE_VERSION_MAJOR = 0;
constexpr u8 ATMOSPHERE_RELEASE_VERSION_MINOR = 19;
constexpr u8 ATMOSPHERE_RELEASE_VERSION_MICRO = 4;
constexpr u8 ATMOSPHERE_RELEASE_VERSION_MICRO = 5;
constexpr u32 GetTargetFirmware() {
return u32{HOS_VERSION_MAJOR} << 24 | u32{HOS_VERSION_MINOR} << 16 |

View File

@@ -377,7 +377,7 @@ void SoftwareKeyboard::SubmitForTextCheck(std::u16string submitted_text) {
if (swkbd_config_common.use_utf8) {
std::string utf8_submitted_text = Common::UTF16ToUTF8(current_text);
const u64 buffer_size = sizeof(u64) + utf8_submitted_text.size();
const u64 buffer_size = utf8_submitted_text.size();
LOG_DEBUG(Service_AM, "\nBuffer Size: {}\nUTF-8 Submitted Text: {}", buffer_size,
utf8_submitted_text);
@@ -386,7 +386,7 @@ void SoftwareKeyboard::SubmitForTextCheck(std::u16string submitted_text) {
std::memcpy(out_data.data() + sizeof(u64), utf8_submitted_text.data(),
utf8_submitted_text.size());
} else {
const u64 buffer_size = sizeof(u64) + current_text.size() * sizeof(char16_t);
const u64 buffer_size = current_text.size() * sizeof(char16_t);
LOG_DEBUG(Service_AM, "\nBuffer Size: {}\nUTF-16 Submitted Text: {}", buffer_size,
Common::UTF16ToUTF8(current_text));

View File

@@ -339,6 +339,7 @@ std::optional<ApplicationLanguage> ConvertToApplicationLanguage(
case Set::LanguageCode::FR_CA:
return ApplicationLanguage::CanadianFrench;
case Set::LanguageCode::PT:
case Set::LanguageCode::PT_BR:
return ApplicationLanguage::Portuguese;
case Set::LanguageCode::RU:
return ApplicationLanguage::Russian;

View File

@@ -12,7 +12,7 @@
namespace Service::Set {
namespace {
constexpr std::array<LanguageCode, 17> available_language_codes = {{
constexpr std::array<LanguageCode, 18> available_language_codes = {{
LanguageCode::JA,
LanguageCode::EN_US,
LanguageCode::FR,
@@ -30,6 +30,7 @@ constexpr std::array<LanguageCode, 17> available_language_codes = {{
LanguageCode::ES_419,
LanguageCode::ZH_HANS,
LanguageCode::ZH_HANT,
LanguageCode::PT_BR,
}};
enum class KeyboardLayout : u64 {
@@ -50,7 +51,7 @@ enum class KeyboardLayout : u64 {
ChineseTraditional = 14,
};
constexpr std::array<std::pair<LanguageCode, KeyboardLayout>, 17> language_to_layout{{
constexpr std::array<std::pair<LanguageCode, KeyboardLayout>, 18> language_to_layout{{
{LanguageCode::JA, KeyboardLayout::Japanese},
{LanguageCode::EN_US, KeyboardLayout::EnglishUs},
{LanguageCode::FR, KeyboardLayout::French},
@@ -68,10 +69,11 @@ constexpr std::array<std::pair<LanguageCode, KeyboardLayout>, 17> language_to_la
{LanguageCode::ES_419, KeyboardLayout::SpanishLatin},
{LanguageCode::ZH_HANS, KeyboardLayout::ChineseSimplified},
{LanguageCode::ZH_HANT, KeyboardLayout::ChineseTraditional},
{LanguageCode::PT_BR, KeyboardLayout::Portuguese},
}};
constexpr std::size_t pre4_0_0_max_entries = 15;
constexpr std::size_t post4_0_0_max_entries = 17;
constexpr std::size_t PRE_4_0_0_MAX_ENTRIES = 0xF;
constexpr std::size_t POST_4_0_0_MAX_ENTRIES = 0x40;
constexpr ResultCode ERR_INVALID_LANGUAGE{ErrorModule::Settings, 625};
@@ -81,9 +83,9 @@ void PushResponseLanguageCode(Kernel::HLERequestContext& ctx, std::size_t num_la
rb.Push(static_cast<u32>(num_language_codes));
}
void GetAvailableLanguageCodesImpl(Kernel::HLERequestContext& ctx, std::size_t max_size) {
void GetAvailableLanguageCodesImpl(Kernel::HLERequestContext& ctx, std::size_t max_entries) {
const std::size_t requested_amount = ctx.GetWriteBufferSize() / sizeof(LanguageCode);
const std::size_t copy_amount = std::min(requested_amount, max_size);
const std::size_t copy_amount = std::min(requested_amount, max_entries);
const std::size_t copy_size = copy_amount * sizeof(LanguageCode);
ctx.WriteBuffer(available_language_codes.data(), copy_size);
@@ -118,7 +120,7 @@ LanguageCode GetLanguageCodeFromIndex(std::size_t index) {
void SET::GetAvailableLanguageCodes(Kernel::HLERequestContext& ctx) {
LOG_DEBUG(Service_SET, "called");
GetAvailableLanguageCodesImpl(ctx, pre4_0_0_max_entries);
GetAvailableLanguageCodesImpl(ctx, PRE_4_0_0_MAX_ENTRIES);
}
void SET::MakeLanguageCode(Kernel::HLERequestContext& ctx) {
@@ -140,19 +142,19 @@ void SET::MakeLanguageCode(Kernel::HLERequestContext& ctx) {
void SET::GetAvailableLanguageCodes2(Kernel::HLERequestContext& ctx) {
LOG_DEBUG(Service_SET, "called");
GetAvailableLanguageCodesImpl(ctx, post4_0_0_max_entries);
GetAvailableLanguageCodesImpl(ctx, POST_4_0_0_MAX_ENTRIES);
}
void SET::GetAvailableLanguageCodeCount(Kernel::HLERequestContext& ctx) {
LOG_DEBUG(Service_SET, "called");
PushResponseLanguageCode(ctx, pre4_0_0_max_entries);
PushResponseLanguageCode(ctx, PRE_4_0_0_MAX_ENTRIES);
}
void SET::GetAvailableLanguageCodeCount2(Kernel::HLERequestContext& ctx) {
LOG_DEBUG(Service_SET, "called");
PushResponseLanguageCode(ctx, post4_0_0_max_entries);
PushResponseLanguageCode(ctx, POST_4_0_0_MAX_ENTRIES);
}
void SET::GetQuestFlag(Kernel::HLERequestContext& ctx) {

View File

@@ -31,6 +31,7 @@ enum class LanguageCode : u64 {
ES_419 = 0x00003931342D7365,
ZH_HANS = 0x00736E61482D687A,
ZH_HANT = 0x00746E61482D687A,
PT_BR = 0x00000052422D7470,
};
LanguageCode GetLanguageCodeFromIndex(std::size_t idx);

View File

@@ -106,6 +106,8 @@ add_library(video_core STATIC
renderer_vulkan/maxwell_to_vk.cpp
renderer_vulkan/maxwell_to_vk.h
renderer_vulkan/pipeline_helper.h
renderer_vulkan/pipeline_statistics.cpp
renderer_vulkan/pipeline_statistics.h
renderer_vulkan/renderer_vulkan.h
renderer_vulkan/renderer_vulkan.cpp
renderer_vulkan/vk_blit_screen.cpp

View File

@@ -10,33 +10,27 @@
#define END_PUSH_CONSTANTS };
#define UNIFORM(n)
#define BINDING_INPUT_BUFFER 0
#define BINDING_ENC_BUFFER 1
#define BINDING_SWIZZLE_BUFFER 2
#define BINDING_OUTPUT_IMAGE 3
#define BINDING_OUTPUT_IMAGE 1
#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
#define BEGIN_PUSH_CONSTANTS
#define END_PUSH_CONSTANTS
#define UNIFORM(n) layout(location = n) uniform
#define BINDING_SWIZZLE_BUFFER 0
#define BINDING_INPUT_BUFFER 1
#define BINDING_ENC_BUFFER 2
#define BINDING_INPUT_BUFFER 0
#define BINDING_OUTPUT_IMAGE 0
#endif
layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in;
layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;
BEGIN_PUSH_CONSTANTS
UNIFORM(1) uvec2 block_dims;
UNIFORM(2) uint bytes_per_block_log2;
UNIFORM(3) uint layer_stride;
UNIFORM(4) uint block_size;
UNIFORM(5) uint x_shift;
UNIFORM(6) uint block_height;
UNIFORM(7) uint block_height_mask;
UNIFORM(2) uint layer_stride;
UNIFORM(3) uint block_size;
UNIFORM(4) uint x_shift;
UNIFORM(5) uint block_height;
UNIFORM(6) uint block_height_mask;
END_PUSH_CONSTANTS
struct EncodingData {
@@ -55,45 +49,35 @@ struct TexelWeightParams {
bool void_extent_hdr;
};
// Swizzle data
layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable {
uint swizzle_table[];
};
layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU32 {
uint astc_data[];
};
// ASTC Encodings data
layout(binding = BINDING_ENC_BUFFER, std430) readonly buffer EncodingsValues {
EncodingData encoding_values[];
uvec4 astc_data[];
};
layout(binding = BINDING_OUTPUT_IMAGE, rgba8) uniform writeonly image2DArray dest_image;
const uint GOB_SIZE_X = 64;
const uint GOB_SIZE_Y = 8;
const uint GOB_SIZE_Z = 1;
const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z;
const uint GOB_SIZE_X_SHIFT = 6;
const uint GOB_SIZE_Y_SHIFT = 3;
const uint GOB_SIZE_Z_SHIFT = 0;
const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT;
const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT;
const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1, GOB_SIZE_Y - 1);
const int BLOCK_SIZE_IN_BYTES = 16;
const int BLOCK_INFO_ERROR = 0;
const int BLOCK_INFO_VOID_EXTENT_HDR = 1;
const int BLOCK_INFO_VOID_EXTENT_LDR = 2;
const int BLOCK_INFO_NORMAL = 3;
const uint BYTES_PER_BLOCK_LOG2 = 4;
const int JUST_BITS = 0;
const int QUINT = 1;
const int TRIT = 2;
// ASTC Encodings data, sorted in ascending order based on their BitLength value
// (see GetBitLength() function)
EncodingData encoding_values[22] = EncodingData[](
EncodingData(JUST_BITS, 0, 0, 0), EncodingData(JUST_BITS, 1, 0, 0), EncodingData(TRIT, 0, 0, 0),
EncodingData(JUST_BITS, 2, 0, 0), EncodingData(QUINT, 0, 0, 0), EncodingData(TRIT, 1, 0, 0),
EncodingData(JUST_BITS, 3, 0, 0), EncodingData(QUINT, 1, 0, 0), EncodingData(TRIT, 2, 0, 0),
EncodingData(JUST_BITS, 4, 0, 0), EncodingData(QUINT, 2, 0, 0), EncodingData(TRIT, 3, 0, 0),
EncodingData(JUST_BITS, 5, 0, 0), EncodingData(QUINT, 3, 0, 0), EncodingData(TRIT, 4, 0, 0),
EncodingData(JUST_BITS, 6, 0, 0), EncodingData(QUINT, 4, 0, 0), EncodingData(TRIT, 5, 0, 0),
EncodingData(JUST_BITS, 7, 0, 0), EncodingData(QUINT, 5, 0, 0), EncodingData(TRIT, 6, 0, 0),
EncodingData(JUST_BITS, 8, 0, 0)
);
// The following constants are expanded variants of the Replicate()
// function calls corresponding to the following arguments:
// value: index into the generated table
@@ -135,44 +119,37 @@ const uint REPLICATE_7_BIT_TO_8_TABLE[128] =
// Input ASTC texture globals
uint current_index = 0;
int bitsread = 0;
uint total_bitsread = 0;
uint local_buff[16];
int total_bitsread = 0;
uvec4 local_buff;
// Color data globals
uint color_endpoint_data[16];
uvec4 color_endpoint_data;
int color_bitsread = 0;
uint total_color_bitsread = 0;
int color_index = 0;
// Four values, two endpoints, four maximum paritions
uint color_values[32];
int colvals_index = 0;
// Weight data globals
uint texel_weight_data[16];
uvec4 texel_weight_data;
int texel_bitsread = 0;
uint total_texel_bitsread = 0;
int texel_index = 0;
bool texel_flag = false;
// Global "vectors" to be pushed into when decoding
EncodingData result_vector[100];
EncodingData result_vector[144];
int result_index = 0;
EncodingData texel_vector[100];
EncodingData texel_vector[144];
int texel_vector_index = 0;
uint unquantized_texel_weights[2][144];
uint SwizzleOffset(uvec2 pos) {
pos = pos & SWIZZLE_MASK;
return swizzle_table[pos.y * 64 + pos.x];
}
uint ReadTexel(uint offset) {
// extract the 8-bit value from the 32-bit packed data.
return bitfieldExtract(astc_data[offset / 4], int((offset * 8) & 24), 8);
uint x = pos.x;
uint y = pos.y;
return ((x % 64) / 32) * 256 + ((y % 8) / 2) * 64 + ((x % 32) / 16) * 32 +
(y % 2) * 16 + (x % 16);
}
// Replicates low num_bits such that [(to_bit - 1):(to_bit - 1 - from_bit)]
@@ -278,14 +255,10 @@ uint Hash52(uint p) {
return p;
}
uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bool small_block) {
if (partition_count == 1) {
return 0;
}
uint Select2DPartition(uint seed, uint x, uint y, uint partition_count, bool small_block) {
if (small_block) {
x <<= 1;
y <<= 1;
z <<= 1;
}
seed += (partition_count - 1) * 1024;
@@ -299,10 +272,6 @@ uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bo
uint seed6 = uint((rnum >> 20) & 0xF);
uint seed7 = uint((rnum >> 24) & 0xF);
uint seed8 = uint((rnum >> 28) & 0xF);
uint seed9 = uint((rnum >> 18) & 0xF);
uint seed10 = uint((rnum >> 22) & 0xF);
uint seed11 = uint((rnum >> 26) & 0xF);
uint seed12 = uint(((rnum >> 30) | (rnum << 2)) & 0xF);
seed1 = (seed1 * seed1);
seed2 = (seed2 * seed2);
@@ -312,12 +281,8 @@ uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bo
seed6 = (seed6 * seed6);
seed7 = (seed7 * seed7);
seed8 = (seed8 * seed8);
seed9 = (seed9 * seed9);
seed10 = (seed10 * seed10);
seed11 = (seed11 * seed11);
seed12 = (seed12 * seed12);
int sh1, sh2, sh3;
uint sh1, sh2;
if ((seed & 1) > 0) {
sh1 = (seed & 2) > 0 ? 4 : 5;
sh2 = (partition_count == 3) ? 6 : 5;
@@ -325,25 +290,19 @@ uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bo
sh1 = (partition_count == 3) ? 6 : 5;
sh2 = (seed & 2) > 0 ? 4 : 5;
}
sh3 = (seed & 0x10) > 0 ? sh1 : sh2;
seed1 >>= sh1;
seed2 >>= sh2;
seed3 >>= sh1;
seed4 >>= sh2;
seed5 >>= sh1;
seed6 >>= sh2;
seed7 >>= sh1;
seed8 >>= sh2;
seed1 = (seed1 >> sh1);
seed2 = (seed2 >> sh2);
seed3 = (seed3 >> sh1);
seed4 = (seed4 >> sh2);
seed5 = (seed5 >> sh1);
seed6 = (seed6 >> sh2);
seed7 = (seed7 >> sh1);
seed8 = (seed8 >> sh2);
seed9 = (seed9 >> sh3);
seed10 = (seed10 >> sh3);
seed11 = (seed11 >> sh3);
seed12 = (seed12 >> sh3);
uint a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14);
uint b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10);
uint c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6);
uint d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2);
uint a = seed1 * x + seed2 * y + (rnum >> 14);
uint b = seed3 * x + seed4 * y + (rnum >> 10);
uint c = seed5 * x + seed6 * y + (rnum >> 6);
uint d = seed7 * x + seed8 * y + (rnum >> 2);
a &= 0x3F;
b &= 0x3F;
@@ -368,58 +327,37 @@ uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bo
}
}
uint Select2DPartition(uint seed, uint x, uint y, uint partition_count, bool small_block) {
return SelectPartition(seed, x, y, 0, partition_count, small_block);
}
uint ReadBit() {
if (current_index >= local_buff.length()) {
uint ExtractBits(uvec4 payload, int offset, int bits) {
if (bits <= 0) {
return 0;
}
uint bit = bitfieldExtract(local_buff[current_index], bitsread, 1);
++bitsread;
++total_bitsread;
if (bitsread == 8) {
++current_index;
bitsread = 0;
int last_offset = offset + bits - 1;
int shifted_offset = offset >> 5;
if ((last_offset >> 5) == shifted_offset) {
return bitfieldExtract(payload[shifted_offset], offset & 31, bits);
}
return bit;
int first_bits = 32 - (offset & 31);
int result_first = int(bitfieldExtract(payload[shifted_offset], offset & 31, first_bits));
int result_second = int(bitfieldExtract(payload[shifted_offset + 1], 0, bits - first_bits));
return result_first | (result_second << first_bits);
}
uint StreamBits(uint num_bits) {
uint ret = 0;
for (uint i = 0; i < num_bits; i++) {
ret |= ((ReadBit() & 1) << i);
}
int int_bits = int(num_bits);
uint ret = ExtractBits(local_buff, total_bitsread, int_bits);
total_bitsread += int_bits;
return ret;
}
uint ReadColorBit() {
uint bit = 0;
if (texel_flag) {
bit = bitfieldExtract(texel_weight_data[texel_index], texel_bitsread, 1);
++texel_bitsread;
++total_texel_bitsread;
if (texel_bitsread == 8) {
++texel_index;
texel_bitsread = 0;
}
} else {
bit = bitfieldExtract(color_endpoint_data[color_index], color_bitsread, 1);
++color_bitsread;
++total_color_bitsread;
if (color_bitsread == 8) {
++color_index;
color_bitsread = 0;
}
}
return bit;
}
uint StreamColorBits(uint num_bits) {
uint ret = 0;
for (uint i = 0; i < num_bits; i++) {
ret |= ((ReadColorBit() & 1) << i);
int int_bits = int(num_bits);
if (texel_flag) {
ret = ExtractBits(texel_weight_data, texel_bitsread, int_bits);
texel_bitsread += int_bits;
} else {
ret = ExtractBits(color_endpoint_data, color_bitsread, int_bits);
color_bitsread += int_bits;
}
return ret;
}
@@ -596,22 +534,16 @@ void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) {
for (uint i = 0; i < num_partitions; i++) {
num_values += ((modes[i] >> 2) + 1) << 1;
}
int range = 256;
while (--range > 0) {
EncodingData val = encoding_values[range];
// Find the largest encoding that's within color_data_bits
// TODO(ameerj): profile with binary search
int range = 0;
while (++range < encoding_values.length()) {
uint bit_length = GetBitLength(num_values, range);
if (bit_length <= color_data_bits) {
while (--range > 0) {
EncodingData newval = encoding_values[range];
if (newval.encoding != val.encoding && newval.num_bits != val.num_bits) {
break;
}
}
++range;
if (bit_length > color_data_bits) {
break;
}
}
DecodeIntegerSequence(range, num_values);
DecodeIntegerSequence(range - 1, num_values);
uint out_index = 0;
for (int itr = 0; itr < result_index; ++itr) {
if (out_index >= num_values) {
@@ -1028,7 +960,7 @@ int FindLayout(uint mode) {
return 5;
}
TexelWeightParams DecodeBlockInfo(uint block_index) {
TexelWeightParams DecodeBlockInfo() {
TexelWeightParams params = TexelWeightParams(uvec2(0), 0, false, false, false, false);
uint mode = StreamBits(11);
if ((mode & 0x1ff) == 0x1fc) {
@@ -1110,10 +1042,10 @@ TexelWeightParams DecodeBlockInfo(uint block_index) {
}
weight_index -= 2;
if ((mode_layout != 9) && ((mode & 0x200) != 0)) {
const int max_weights[6] = int[6](9, 11, 15, 19, 23, 31);
const int max_weights[6] = int[6](7, 8, 9, 10, 11, 12);
params.max_weight = max_weights[weight_index];
} else {
const int max_weights[6] = int[6](1, 2, 3, 4, 5, 7);
const int max_weights[6] = int[6](1, 2, 3, 4, 5, 6);
params.max_weight = max_weights[weight_index];
}
return params;
@@ -1144,8 +1076,8 @@ void FillVoidExtentLDR(ivec3 coord) {
}
}
void DecompressBlock(ivec3 coord, uint block_index) {
TexelWeightParams params = DecodeBlockInfo(block_index);
void DecompressBlock(ivec3 coord) {
TexelWeightParams params = DecodeBlockInfo();
if (params.error_state) {
FillError(coord);
return;
@@ -1212,7 +1144,7 @@ void DecompressBlock(ivec3 coord, uint block_index) {
// Read color data...
uint color_data_bits = remaining_bits;
while (remaining_bits > 0) {
int nb = int(min(remaining_bits, 8U));
int nb = int(min(remaining_bits, 32U));
uint b = StreamBits(nb);
color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, nb));
++ced_pointer;
@@ -1254,25 +1186,20 @@ void DecompressBlock(ivec3 coord, uint block_index) {
ComputeEndpoints(endpoints[i][0], endpoints[i][1], color_endpoint_mode[i]);
}
for (uint i = 0; i < 16; i++) {
texel_weight_data[i] = local_buff[i];
}
for (uint i = 0; i < 8; i++) {
#define REVERSE_BYTE(b) ((b * 0x0802U & 0x22110U) | (b * 0x8020U & 0x88440U)) * 0x10101U >> 16
uint a = REVERSE_BYTE(texel_weight_data[i]);
uint b = REVERSE_BYTE(texel_weight_data[15 - i]);
#undef REVERSE_BYTE
texel_weight_data[i] = uint(bitfieldExtract(b, 0, 8));
texel_weight_data[15 - i] = uint(bitfieldExtract(a, 0, 8));
}
texel_weight_data = local_buff;
texel_weight_data = bitfieldReverse(texel_weight_data).wzyx;
uint clear_byte_start =
(GetPackedBitSize(params.size, params.dual_plane, params.max_weight) >> 3) + 1;
texel_weight_data[clear_byte_start - 1] =
texel_weight_data[clear_byte_start - 1] &
uint byte_insert = ExtractBits(texel_weight_data, int(clear_byte_start - 1) * 8, 8) &
uint(
((1 << (GetPackedBitSize(params.size, params.dual_plane, params.max_weight) % 8)) - 1));
for (uint i = 0; i < 16 - clear_byte_start; i++) {
texel_weight_data[clear_byte_start + i] = 0U;
uint vec_index = (clear_byte_start - 1) >> 2;
texel_weight_data[vec_index] =
bitfieldInsert(texel_weight_data[vec_index], byte_insert, int((clear_byte_start - 1) % 4) * 8, 8);
for (uint i = clear_byte_start; i < 16; ++i) {
uint idx = i >> 2;
texel_weight_data[idx] = bitfieldInsert(texel_weight_data[idx], 0, int(i % 4) * 8, 8);
}
texel_flag = true; // use texel "vector" and bit stream in integer decoding
DecodeIntegerSequence(params.max_weight, GetNumWeightValues(params.size, params.dual_plane));
@@ -1281,8 +1208,11 @@ void DecompressBlock(ivec3 coord, uint block_index) {
for (uint j = 0; j < block_dims.y; j++) {
for (uint i = 0; i < block_dims.x; i++) {
uint local_partition = Select2DPartition(partition_index, i, j, num_partitions,
uint local_partition = 0;
if (num_partitions > 1) {
local_partition = Select2DPartition(partition_index, i, j, num_partitions,
(block_dims.y * block_dims.x) < 32);
}
vec4 p;
uvec4 C0 = ReplicateByteTo16(endpoints[local_partition][0]);
uvec4 C1 = ReplicateByteTo16(endpoints[local_partition][1]);
@@ -1303,7 +1233,7 @@ void DecompressBlock(ivec3 coord, uint block_index) {
void main() {
uvec3 pos = gl_GlobalInvocationID;
pos.x <<= bytes_per_block_log2;
pos.x <<= BYTES_PER_BLOCK_LOG2;
// Read as soon as possible due to its latency
const uint swizzle = SwizzleOffset(pos.xy);
@@ -1321,13 +1251,8 @@ void main() {
if (any(greaterThanEqual(coord, imageSize(dest_image)))) {
return;
}
uint block_index =
pos.z * gl_WorkGroupSize.x * gl_WorkGroupSize.y + pos.y * gl_WorkGroupSize.x + pos.x;
current_index = 0;
bitsread = 0;
for (int i = 0; i < 16; i++) {
local_buff[i] = ReadTexel(offset + i);
}
DecompressBlock(coord, block_index);
local_buff = astc_data[offset / 16];
DecompressBlock(coord);
}

View File

@@ -60,19 +60,14 @@ UtilShaders::UtilShaders(ProgramManager& program_manager_)
copy_bc4_program(MakeProgram(OPENGL_COPY_BC4_COMP)) {
const auto swizzle_table = Tegra::Texture::MakeSwizzleTable();
swizzle_table_buffer.Create();
astc_buffer.Create();
glNamedBufferStorage(swizzle_table_buffer.handle, sizeof(swizzle_table), &swizzle_table, 0);
glNamedBufferStorage(astc_buffer.handle, sizeof(ASTC_ENCODINGS_VALUES), &ASTC_ENCODINGS_VALUES,
0);
}
UtilShaders::~UtilShaders() = default;
void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map,
std::span<const VideoCommon::SwizzleParameters> swizzles) {
static constexpr GLuint BINDING_SWIZZLE_BUFFER = 0;
static constexpr GLuint BINDING_INPUT_BUFFER = 1;
static constexpr GLuint BINDING_ENC_BUFFER = 2;
static constexpr GLuint BINDING_INPUT_BUFFER = 0;
static constexpr GLuint BINDING_OUTPUT_IMAGE = 0;
const Extent2D tile_size{
@@ -80,34 +75,32 @@ void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map,
.height = VideoCore::Surface::DefaultBlockHeight(image.info.format),
};
program_manager.BindComputeProgram(astc_decoder_program.handle);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_ENC_BUFFER, astc_buffer.handle);
glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes);
glUniform2ui(1, tile_size.width, tile_size.height);
// Ensure buffer data is valid before dispatching
glFlush();
for (const SwizzleParameters& swizzle : swizzles) {
const size_t input_offset = swizzle.buffer_offset + map.offset;
const u32 num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 32U);
const u32 num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 32U);
const u32 num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 8U);
const u32 num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 8U);
const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info);
ASSERT(params.origin == (std::array<u32, 3>{0, 0, 0}));
ASSERT(params.destination == (std::array<s32, 3>{0, 0, 0}));
ASSERT(params.bytes_per_block_log2 == 4);
glUniform1ui(2, params.bytes_per_block_log2);
glUniform1ui(3, params.layer_stride);
glUniform1ui(4, params.block_size);
glUniform1ui(5, params.x_shift);
glUniform1ui(6, params.block_height);
glUniform1ui(7, params.block_height_mask);
glUniform1ui(2, params.layer_stride);
glUniform1ui(3, params.block_size);
glUniform1ui(4, params.x_shift);
glUniform1ui(5, params.block_height);
glUniform1ui(6, params.block_height_mask);
glBindImageTexture(BINDING_OUTPUT_IMAGE, image.StorageHandle(), swizzle.level, GL_TRUE, 0,
GL_WRITE_ONLY, GL_RGBA8);
// ASTC texture data
glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset,
image.guest_size_bytes - swizzle.buffer_offset);
glBindImageTexture(BINDING_OUTPUT_IMAGE, image.StorageHandle(), swizzle.level, GL_TRUE, 0,
GL_WRITE_ONLY, GL_RGBA8);
glDispatchCompute(num_dispatches_x, num_dispatches_y, image.info.resources.layers);
}

View File

@@ -62,7 +62,6 @@ private:
ProgramManager& program_manager;
OGLBuffer swizzle_table_buffer;
OGLBuffer astc_buffer;
OGLProgram astc_decoder_program;
OGLProgram block_linear_unswizzle_2d_program;

View File

@@ -0,0 +1,100 @@
// Copyright 2021 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <string_view>
#include <fmt/format.h>
#include "common/common_types.h"
#include "common/logging/log.h"
#include "video_core/renderer_vulkan/pipeline_statistics.h"
#include "video_core/vulkan_common/vulkan_device.h"
#include "video_core/vulkan_common/vulkan_wrapper.h"
namespace Vulkan {
using namespace std::string_view_literals;
static u64 GetUint64(const VkPipelineExecutableStatisticKHR& statistic) {
switch (statistic.format) {
case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_INT64_KHR:
return static_cast<u64>(statistic.value.i64);
case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_UINT64_KHR:
return statistic.value.u64;
case VK_PIPELINE_EXECUTABLE_STATISTIC_FORMAT_FLOAT64_KHR:
return static_cast<u64>(statistic.value.f64);
default:
return 0;
}
}
PipelineStatistics::PipelineStatistics(const Device& device_) : device{device_} {}
void PipelineStatistics::Collect(VkPipeline pipeline) {
const auto& dev{device.GetLogical()};
const std::vector properties{dev.GetPipelineExecutablePropertiesKHR(pipeline)};
const u32 num_executables{static_cast<u32>(properties.size())};
for (u32 executable = 0; executable < num_executables; ++executable) {
const auto statistics{dev.GetPipelineExecutableStatisticsKHR(pipeline, executable)};
if (statistics.empty()) {
continue;
}
Stats stage_stats;
for (const auto& statistic : statistics) {
const char* const name{statistic.name};
if (name == "Binary Size"sv || name == "Code size"sv || name == "Instruction Count"sv) {
stage_stats.code_size = GetUint64(statistic);
} else if (name == "Register Count"sv) {
stage_stats.register_count = GetUint64(statistic);
} else if (name == "SGPRs"sv || name == "numUsedSgprs"sv) {
stage_stats.sgpr_count = GetUint64(statistic);
} else if (name == "VGPRs"sv || name == "numUsedVgprs"sv) {
stage_stats.vgpr_count = GetUint64(statistic);
} else if (name == "Branches"sv) {
stage_stats.branches_count = GetUint64(statistic);
} else if (name == "Basic Block Count"sv) {
stage_stats.basic_block_count = GetUint64(statistic);
}
}
std::lock_guard lock{mutex};
collected_stats.push_back(stage_stats);
}
}
void PipelineStatistics::Report() const {
double num{};
Stats total;
{
std::lock_guard lock{mutex};
for (const Stats& stats : collected_stats) {
total.code_size += stats.code_size;
total.register_count += stats.register_count;
total.sgpr_count += stats.sgpr_count;
total.vgpr_count += stats.vgpr_count;
total.branches_count += stats.branches_count;
total.basic_block_count += stats.basic_block_count;
}
num = static_cast<double>(collected_stats.size());
}
std::string report;
const auto add = [&](const char* fmt, u64 value) {
if (value > 0) {
report += fmt::format(fmt::runtime(fmt), static_cast<double>(value) / num);
}
};
add("Code size: {:9.03f}\n", total.code_size);
add("Register count: {:9.03f}\n", total.register_count);
add("SGPRs: {:9.03f}\n", total.sgpr_count);
add("VGPRs: {:9.03f}\n", total.vgpr_count);
add("Branches count: {:9.03f}\n", total.branches_count);
add("Basic blocks: {:9.03f}\n", total.basic_block_count);
LOG_INFO(Render_Vulkan,
"\nAverage pipeline statistics\n"
"==========================================\n"
"{}\n",
report);
}
} // namespace Vulkan

View File

@@ -0,0 +1,40 @@
// Copyright 2021 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <mutex>
#include <vector>
#include "common/common_types.h"
#include "video_core/vulkan_common/vulkan_wrapper.h"
namespace Vulkan {
class Device;
class PipelineStatistics {
public:
explicit PipelineStatistics(const Device& device_);
void Collect(VkPipeline pipeline);
void Report() const;
private:
struct Stats {
u64 code_size{};
u64 register_count{};
u64 sgpr_count{};
u64 vgpr_count{};
u64 branches_count{};
u64 basic_block_count{};
};
const Device& device;
mutable std::mutex mutex;
std::vector<Stats> collected_stats;
};
} // namespace Vulkan

View File

@@ -30,16 +30,12 @@
namespace Vulkan {
using Tegra::Texture::SWIZZLE_TABLE;
using Tegra::Texture::ASTC::ASTC_ENCODINGS_VALUES;
using namespace Tegra::Texture::ASTC;
namespace {
constexpr u32 ASTC_BINDING_INPUT_BUFFER = 0;
constexpr u32 ASTC_BINDING_ENC_BUFFER = 1;
constexpr u32 ASTC_BINDING_SWIZZLE_BUFFER = 2;
constexpr u32 ASTC_BINDING_OUTPUT_IMAGE = 3;
constexpr size_t ASTC_NUM_BINDINGS = 4;
constexpr u32 ASTC_BINDING_OUTPUT_IMAGE = 1;
constexpr size_t ASTC_NUM_BINDINGS = 2;
template <size_t size>
inline constexpr VkPushConstantRange COMPUTE_PUSH_CONSTANT_RANGE{
@@ -75,7 +71,7 @@ constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{
.score = 2,
};
constexpr std::array<VkDescriptorSetLayoutBinding, 4> ASTC_DESCRIPTOR_SET_BINDINGS{{
constexpr std::array<VkDescriptorSetLayoutBinding, ASTC_NUM_BINDINGS> ASTC_DESCRIPTOR_SET_BINDINGS{{
{
.binding = ASTC_BINDING_INPUT_BUFFER,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
@@ -83,20 +79,6 @@ constexpr std::array<VkDescriptorSetLayoutBinding, 4> ASTC_DESCRIPTOR_SET_BINDIN
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.pImmutableSamplers = nullptr,
},
{
.binding = ASTC_BINDING_ENC_BUFFER,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.descriptorCount = 1,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.pImmutableSamplers = nullptr,
},
{
.binding = ASTC_BINDING_SWIZZLE_BUFFER,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.descriptorCount = 1,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.pImmutableSamplers = nullptr,
},
{
.binding = ASTC_BINDING_OUTPUT_IMAGE,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
@@ -108,12 +90,12 @@ constexpr std::array<VkDescriptorSetLayoutBinding, 4> ASTC_DESCRIPTOR_SET_BINDIN
constexpr DescriptorBankInfo ASTC_BANK_INFO{
.uniform_buffers = 0,
.storage_buffers = 3,
.storage_buffers = 1,
.texture_buffers = 0,
.image_buffers = 0,
.textures = 0,
.images = 1,
.score = 4,
.score = 2,
};
constexpr VkDescriptorUpdateTemplateEntryKHR INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLATE{
@@ -135,22 +117,6 @@ constexpr std::array<VkDescriptorUpdateTemplateEntryKHR, ASTC_NUM_BINDINGS>
.offset = ASTC_BINDING_INPUT_BUFFER * sizeof(DescriptorUpdateEntry),
.stride = sizeof(DescriptorUpdateEntry),
},
{
.dstBinding = ASTC_BINDING_ENC_BUFFER,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.offset = ASTC_BINDING_ENC_BUFFER * sizeof(DescriptorUpdateEntry),
.stride = sizeof(DescriptorUpdateEntry),
},
{
.dstBinding = ASTC_BINDING_SWIZZLE_BUFFER,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.offset = ASTC_BINDING_SWIZZLE_BUFFER * sizeof(DescriptorUpdateEntry),
.stride = sizeof(DescriptorUpdateEntry),
},
{
.dstBinding = ASTC_BINDING_OUTPUT_IMAGE,
.dstArrayElement = 0,
@@ -163,7 +129,6 @@ constexpr std::array<VkDescriptorUpdateTemplateEntryKHR, ASTC_NUM_BINDINGS>
struct AstcPushConstants {
std::array<u32, 2> blocks_dims;
u32 bytes_per_block_log2;
u32 layer_stride;
u32 block_size;
u32 x_shift;
@@ -354,46 +319,6 @@ ASTCDecoderPass::ASTCDecoderPass(const Device& device_, VKScheduler& scheduler_,
ASTCDecoderPass::~ASTCDecoderPass() = default;
void ASTCDecoderPass::MakeDataBuffer() {
constexpr size_t TOTAL_BUFFER_SIZE = sizeof(ASTC_ENCODINGS_VALUES) + sizeof(SWIZZLE_TABLE);
data_buffer = device.GetLogical().CreateBuffer(VkBufferCreateInfo{
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
.pNext = nullptr,
.flags = 0,
.size = TOTAL_BUFFER_SIZE,
.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
.sharingMode = VK_SHARING_MODE_EXCLUSIVE,
.queueFamilyIndexCount = 0,
.pQueueFamilyIndices = nullptr,
});
data_buffer_commit = memory_allocator.Commit(data_buffer, MemoryUsage::Upload);
const auto staging_ref = staging_buffer_pool.Request(TOTAL_BUFFER_SIZE, MemoryUsage::Upload);
std::memcpy(staging_ref.mapped_span.data(), &ASTC_ENCODINGS_VALUES,
sizeof(ASTC_ENCODINGS_VALUES));
// Tack on the swizzle table at the end of the buffer
std::memcpy(staging_ref.mapped_span.data() + sizeof(ASTC_ENCODINGS_VALUES), &SWIZZLE_TABLE,
sizeof(SWIZZLE_TABLE));
scheduler.Record([src = staging_ref.buffer, offset = staging_ref.offset, dst = *data_buffer,
TOTAL_BUFFER_SIZE](vk::CommandBuffer cmdbuf) {
static constexpr VkMemoryBarrier write_barrier{
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT,
};
const VkBufferCopy copy{
.srcOffset = offset,
.dstOffset = 0,
.size = TOTAL_BUFFER_SIZE,
};
cmdbuf.CopyBuffer(src, dst, copy);
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
0, write_barrier);
});
}
void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
std::span<const VideoCommon::SwizzleParameters> swizzles) {
using namespace VideoCommon::Accelerated;
@@ -402,9 +327,6 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
VideoCore::Surface::DefaultBlockHeight(image.info.format),
};
scheduler.RequestOutsideRenderPassOperationContext();
if (!data_buffer) {
MakeDataBuffer();
}
const VkPipeline vk_pipeline = *pipeline;
const VkImageAspectFlags aspect_mask = image.AspectMask();
const VkImage vk_image = image.Handle();
@@ -436,16 +358,13 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
});
for (const VideoCommon::SwizzleParameters& swizzle : swizzles) {
const size_t input_offset = swizzle.buffer_offset + map.offset;
const u32 num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 32U);
const u32 num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 32U);
const u32 num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 8U);
const u32 num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 8U);
const u32 num_dispatches_z = image.info.resources.layers;
update_descriptor_queue.Acquire();
update_descriptor_queue.AddBuffer(map.buffer, input_offset,
image.guest_size_bytes - swizzle.buffer_offset);
update_descriptor_queue.AddBuffer(*data_buffer, 0, sizeof(ASTC_ENCODINGS_VALUES));
update_descriptor_queue.AddBuffer(*data_buffer, sizeof(ASTC_ENCODINGS_VALUES),
sizeof(SWIZZLE_TABLE));
update_descriptor_queue.AddImage(image.StorageImageView(swizzle.level));
const void* const descriptor_data{update_descriptor_queue.UpdateData()};
@@ -453,11 +372,11 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map,
const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info);
ASSERT(params.origin == (std::array<u32, 3>{0, 0, 0}));
ASSERT(params.destination == (std::array<s32, 3>{0, 0, 0}));
ASSERT(params.bytes_per_block_log2 == 4);
scheduler.Record([this, num_dispatches_x, num_dispatches_y, num_dispatches_z, block_dims,
params, descriptor_data](vk::CommandBuffer cmdbuf) {
const AstcPushConstants uniforms{
.blocks_dims = block_dims,
.bytes_per_block_log2 = params.bytes_per_block_log2,
.layer_stride = params.layer_stride,
.block_size = params.block_size,
.x_shift = params.x_shift,

View File

@@ -96,15 +96,10 @@ public:
std::span<const VideoCommon::SwizzleParameters> swizzles);
private:
void MakeDataBuffer();
VKScheduler& scheduler;
StagingBufferPool& staging_buffer_pool;
VKUpdateDescriptorQueue& update_descriptor_queue;
MemoryAllocator& memory_allocator;
vk::Buffer data_buffer;
MemoryCommit data_buffer_commit;
};
} // namespace Vulkan

View File

@@ -8,6 +8,7 @@
#include <boost/container/small_vector.hpp>
#include "video_core/renderer_vulkan/pipeline_helper.h"
#include "video_core/renderer_vulkan/pipeline_statistics.h"
#include "video_core/renderer_vulkan/vk_buffer_cache.h"
#include "video_core/renderer_vulkan/vk_compute_pipeline.h"
#include "video_core/renderer_vulkan/vk_descriptor_pool.h"
@@ -26,6 +27,7 @@ using Tegra::Texture::TexturePair;
ComputePipeline::ComputePipeline(const Device& device_, DescriptorPool& descriptor_pool,
VKUpdateDescriptorQueue& update_descriptor_queue_,
Common::ThreadWorker* thread_worker,
PipelineStatistics* pipeline_statistics,
VideoCore::ShaderNotify* shader_notify, const Shader::Info& info_,
vk::ShaderModule spv_module_)
: device{device_}, update_descriptor_queue{update_descriptor_queue_}, info{info_},
@@ -36,7 +38,7 @@ ComputePipeline::ComputePipeline(const Device& device_, DescriptorPool& descript
std::copy_n(info.constant_buffer_used_sizes.begin(), uniform_buffer_sizes.size(),
uniform_buffer_sizes.begin());
auto func{[this, &descriptor_pool, shader_notify] {
auto func{[this, &descriptor_pool, shader_notify, pipeline_statistics] {
DescriptorLayoutBuilder builder{device};
builder.Add(info, VK_SHADER_STAGE_COMPUTE_BIT);
@@ -50,10 +52,14 @@ ComputePipeline::ComputePipeline(const Device& device_, DescriptorPool& descript
.pNext = nullptr,
.requiredSubgroupSize = GuestWarpSize,
};
VkPipelineCreateFlags flags{};
if (device.IsKhrPipelineEexecutablePropertiesEnabled()) {
flags |= VK_PIPELINE_CREATE_CAPTURE_STATISTICS_BIT_KHR;
}
pipeline = device.GetLogical().CreateComputePipeline({
.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
.pNext = nullptr,
.flags = 0,
.flags = flags,
.stage{
.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
.pNext = device.IsExtSubgroupSizeControlSupported() ? &subgroup_size_ci : nullptr,
@@ -67,6 +73,9 @@ ComputePipeline::ComputePipeline(const Device& device_, DescriptorPool& descript
.basePipelineHandle = 0,
.basePipelineIndex = 0,
});
if (pipeline_statistics) {
pipeline_statistics->Collect(*pipeline);
}
std::lock_guard lock{build_mutex};
is_built = true;
build_condvar.notify_one();

View File

@@ -25,6 +25,7 @@ class ShaderNotify;
namespace Vulkan {
class Device;
class PipelineStatistics;
class VKScheduler;
class ComputePipeline {
@@ -32,6 +33,7 @@ public:
explicit ComputePipeline(const Device& device, DescriptorPool& descriptor_pool,
VKUpdateDescriptorQueue& update_descriptor_queue,
Common::ThreadWorker* thread_worker,
PipelineStatistics* pipeline_statistics,
VideoCore::ShaderNotify* shader_notify, const Shader::Info& info,
vk::ShaderModule spv_module);

View File

@@ -11,6 +11,7 @@
#include "common/bit_field.h"
#include "video_core/renderer_vulkan/maxwell_to_vk.h"
#include "video_core/renderer_vulkan/pipeline_helper.h"
#include "video_core/renderer_vulkan/pipeline_statistics.h"
#include "video_core/renderer_vulkan/vk_buffer_cache.h"
#include "video_core/renderer_vulkan/vk_graphics_pipeline.h"
#include "video_core/renderer_vulkan/vk_render_pass_cache.h"
@@ -217,8 +218,8 @@ GraphicsPipeline::GraphicsPipeline(
VKScheduler& scheduler_, BufferCache& buffer_cache_, TextureCache& texture_cache_,
VideoCore::ShaderNotify* shader_notify, const Device& device_, DescriptorPool& descriptor_pool,
VKUpdateDescriptorQueue& update_descriptor_queue_, Common::ThreadWorker* worker_thread,
RenderPassCache& render_pass_cache, const GraphicsPipelineCacheKey& key_,
std::array<vk::ShaderModule, NUM_STAGES> stages,
PipelineStatistics* pipeline_statistics, RenderPassCache& render_pass_cache,
const GraphicsPipelineCacheKey& key_, std::array<vk::ShaderModule, NUM_STAGES> stages,
const std::array<const Shader::Info*, NUM_STAGES>& infos)
: key{key_}, maxwell3d{maxwell3d_}, gpu_memory{gpu_memory_}, device{device_},
texture_cache{texture_cache_}, buffer_cache{buffer_cache_}, scheduler{scheduler_},
@@ -235,7 +236,7 @@ GraphicsPipeline::GraphicsPipeline(
enabled_uniform_buffer_masks[stage] = info->constant_buffer_mask;
std::ranges::copy(info->constant_buffer_used_sizes, uniform_buffer_sizes[stage].begin());
}
auto func{[this, shader_notify, &render_pass_cache, &descriptor_pool] {
auto func{[this, shader_notify, &render_pass_cache, &descriptor_pool, pipeline_statistics] {
DescriptorLayoutBuilder builder{MakeBuilder(device, stage_infos)};
uses_push_descriptor = builder.CanUsePushDescriptor();
descriptor_set_layout = builder.CreateDescriptorSetLayout(uses_push_descriptor);
@@ -250,6 +251,9 @@ GraphicsPipeline::GraphicsPipeline(
const VkRenderPass render_pass{render_pass_cache.Get(MakeRenderPassKey(key.state))};
Validate();
MakePipeline(render_pass);
if (pipeline_statistics) {
pipeline_statistics->Collect(*pipeline);
}
std::lock_guard lock{build_mutex};
is_built = true;
@@ -782,10 +786,14 @@ void GraphicsPipeline::MakePipeline(VkRenderPass render_pass) {
}
*/
}
VkPipelineCreateFlags flags{};
if (device.IsKhrPipelineEexecutablePropertiesEnabled()) {
flags |= VK_PIPELINE_CREATE_CAPTURE_STATISTICS_BIT_KHR;
}
pipeline = device.GetLogical().CreateGraphicsPipeline({
.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
.pNext = nullptr,
.flags = 0,
.flags = flags,
.stageCount = static_cast<u32>(shader_stages.size()),
.pStages = shader_stages.data(),
.pVertexInputState = &vertex_input_ci,

View File

@@ -60,6 +60,7 @@ struct hash<Vulkan::GraphicsPipelineCacheKey> {
namespace Vulkan {
class Device;
class PipelineStatistics;
class RenderPassCache;
class VKScheduler;
class VKUpdateDescriptorQueue;
@@ -73,8 +74,9 @@ public:
VKScheduler& scheduler, BufferCache& buffer_cache, TextureCache& texture_cache,
VideoCore::ShaderNotify* shader_notify, const Device& device,
DescriptorPool& descriptor_pool, VKUpdateDescriptorQueue& update_descriptor_queue,
Common::ThreadWorker* worker_thread, RenderPassCache& render_pass_cache,
const GraphicsPipelineCacheKey& key, std::array<vk::ShaderModule, NUM_STAGES> stages,
Common::ThreadWorker* worker_thread, PipelineStatistics* pipeline_statistics,
RenderPassCache& render_pass_cache, const GraphicsPipelineCacheKey& key,
std::array<vk::ShaderModule, NUM_STAGES> stages,
const std::array<const Shader::Info*, NUM_STAGES>& infos);
GraphicsPipeline& operator=(GraphicsPipeline&&) noexcept = delete;

View File

@@ -29,6 +29,7 @@
#include "video_core/renderer_vulkan/fixed_pipeline_state.h"
#include "video_core/renderer_vulkan/maxwell_to_vk.h"
#include "video_core/renderer_vulkan/pipeline_helper.h"
#include "video_core/renderer_vulkan/pipeline_statistics.h"
#include "video_core/renderer_vulkan/vk_compute_pipeline.h"
#include "video_core/renderer_vulkan/vk_descriptor_pool.h"
#include "video_core/renderer_vulkan/vk_pipeline_cache.h"
@@ -389,15 +390,19 @@ void PipelineCache::LoadDiskResources(u64 title_id, std::stop_token stop_loading
size_t total{};
size_t built{};
bool has_loaded{};
std::unique_ptr<PipelineStatistics> statistics;
} state;
if (device.IsKhrPipelineEexecutablePropertiesEnabled()) {
state.statistics = std::make_unique<PipelineStatistics>(device);
}
const auto load_compute{[&](std::ifstream& file, FileEnvironment env) {
ComputePipelineCacheKey key;
file.read(reinterpret_cast<char*>(&key), sizeof(key));
workers.QueueWork([this, key, env = std::move(env), &state, &callback]() mutable {
ShaderPools pools;
auto pipeline{CreateComputePipeline(pools, key, env, false)};
auto pipeline{CreateComputePipeline(pools, key, env, state.statistics.get(), false)};
std::lock_guard lock{state.mutex};
if (pipeline) {
compute_cache.emplace(key, std::move(pipeline));
@@ -425,7 +430,8 @@ void PipelineCache::LoadDiskResources(u64 title_id, std::stop_token stop_loading
for (auto& env : envs) {
env_ptrs.push_back(&env);
}
auto pipeline{CreateGraphicsPipeline(pools, key, MakeSpan(env_ptrs), false)};
auto pipeline{CreateGraphicsPipeline(pools, key, MakeSpan(env_ptrs),
state.statistics.get(), false)};
std::lock_guard lock{state.mutex};
graphics_cache.emplace(key, std::move(pipeline));
@@ -445,6 +451,10 @@ void PipelineCache::LoadDiskResources(u64 title_id, std::stop_token stop_loading
lock.unlock();
workers.WaitForRequests();
if (state.statistics) {
state.statistics->Report();
}
}
GraphicsPipeline* PipelineCache::CurrentGraphicsPipelineSlowPath() {
@@ -486,7 +496,8 @@ GraphicsPipeline* PipelineCache::BuiltPipeline(GraphicsPipeline* pipeline) const
std::unique_ptr<GraphicsPipeline> PipelineCache::CreateGraphicsPipeline(
ShaderPools& pools, const GraphicsPipelineCacheKey& key,
std::span<Shader::Environment* const> envs, bool build_in_parallel) try {
std::span<Shader::Environment* const> envs, PipelineStatistics* statistics,
bool build_in_parallel) try {
LOG_INFO(Render_Vulkan, "0x{:016x}", key.Hash());
size_t env_index{0};
std::array<Shader::IR::Program, Maxwell::MaxShaderProgram> programs;
@@ -540,7 +551,7 @@ std::unique_ptr<GraphicsPipeline> PipelineCache::CreateGraphicsPipeline(
Common::ThreadWorker* const thread_worker{build_in_parallel ? &workers : nullptr};
return std::make_unique<GraphicsPipeline>(
maxwell3d, gpu_memory, scheduler, buffer_cache, texture_cache, &shader_notify, device,
descriptor_pool, update_descriptor_queue, thread_worker, render_pass_cache, key,
descriptor_pool, update_descriptor_queue, thread_worker, statistics, render_pass_cache, key,
std::move(modules), infos);
} catch (const Shader::Exception& exception) {
@@ -553,7 +564,8 @@ std::unique_ptr<GraphicsPipeline> PipelineCache::CreateGraphicsPipeline() {
GetGraphicsEnvironments(environments, graphics_key.unique_hashes);
main_pools.ReleaseContents();
auto pipeline{CreateGraphicsPipeline(main_pools, graphics_key, environments.Span(), true)};
auto pipeline{
CreateGraphicsPipeline(main_pools, graphics_key, environments.Span(), nullptr, true)};
if (!pipeline || pipeline_cache_filename.empty()) {
return pipeline;
}
@@ -578,7 +590,7 @@ std::unique_ptr<ComputePipeline> PipelineCache::CreateComputePipeline(
env.SetCachedSize(shader->size_bytes);
main_pools.ReleaseContents();
auto pipeline{CreateComputePipeline(main_pools, key, env, true)};
auto pipeline{CreateComputePipeline(main_pools, key, env, nullptr, true)};
if (!pipeline || pipeline_cache_filename.empty()) {
return pipeline;
}
@@ -591,7 +603,7 @@ std::unique_ptr<ComputePipeline> PipelineCache::CreateComputePipeline(
std::unique_ptr<ComputePipeline> PipelineCache::CreateComputePipeline(
ShaderPools& pools, const ComputePipelineCacheKey& key, Shader::Environment& env,
bool build_in_parallel) try {
PipelineStatistics* statistics, bool build_in_parallel) try {
LOG_INFO(Render_Vulkan, "0x{:016x}", key.Hash());
Shader::Maxwell::Flow::CFG cfg{env, pools.flow_block, env.StartAddress()};
@@ -605,8 +617,8 @@ std::unique_ptr<ComputePipeline> PipelineCache::CreateComputePipeline(
}
Common::ThreadWorker* const thread_worker{build_in_parallel ? &workers : nullptr};
return std::make_unique<ComputePipeline>(device, descriptor_pool, update_descriptor_queue,
thread_worker, &shader_notify, program.info,
std::move(spv_module));
thread_worker, statistics, &shader_notify,
program.info, std::move(spv_module));
} catch (const Shader::Exception& exception) {
LOG_ERROR(Render_Vulkan, "{}", exception.what());

View File

@@ -80,8 +80,9 @@ struct hash<Vulkan::ComputePipelineCacheKey> {
namespace Vulkan {
class ComputePipeline;
class Device;
class DescriptorPool;
class Device;
class PipelineStatistics;
class RasterizerVulkan;
class RenderPassCache;
class VKScheduler;
@@ -128,7 +129,8 @@ private:
std::unique_ptr<GraphicsPipeline> CreateGraphicsPipeline(
ShaderPools& pools, const GraphicsPipelineCacheKey& key,
std::span<Shader::Environment* const> envs, bool build_in_parallel);
std::span<Shader::Environment* const> envs, PipelineStatistics* statistics,
bool build_in_parallel);
std::unique_ptr<ComputePipeline> CreateComputePipeline(const ComputePipelineCacheKey& key,
const ShaderInfo* shader);
@@ -136,6 +138,7 @@ private:
std::unique_ptr<ComputePipeline> CreateComputePipeline(ShaderPools& pools,
const ComputePipelineCacheKey& key,
Shader::Environment& env,
PipelineStatistics* statistics,
bool build_in_parallel);
const Device& device;

View File

@@ -151,6 +151,76 @@ private:
const IntType& m_Bits;
};
enum class IntegerEncoding { JustBits, Quint, Trit };
struct IntegerEncodedValue {
constexpr IntegerEncodedValue() = default;
constexpr IntegerEncodedValue(IntegerEncoding encoding_, u32 num_bits_)
: encoding{encoding_}, num_bits{num_bits_} {}
constexpr bool MatchesEncoding(const IntegerEncodedValue& other) const {
return encoding == other.encoding && num_bits == other.num_bits;
}
// Returns the number of bits required to encode num_vals values.
u32 GetBitLength(u32 num_vals) const {
u32 total_bits = num_bits * num_vals;
if (encoding == IntegerEncoding::Trit) {
total_bits += (num_vals * 8 + 4) / 5;
} else if (encoding == IntegerEncoding::Quint) {
total_bits += (num_vals * 7 + 2) / 3;
}
return total_bits;
}
IntegerEncoding encoding{};
u32 num_bits = 0;
u32 bit_value = 0;
union {
u32 quint_value = 0;
u32 trit_value;
};
};
// Returns a new instance of this struct that corresponds to the
// can take no more than mav_value values
static constexpr IntegerEncodedValue CreateEncoding(u32 mav_value) {
while (mav_value > 0) {
u32 check = mav_value + 1;
// Is mav_value a power of two?
if (!(check & (check - 1))) {
return IntegerEncodedValue(IntegerEncoding::JustBits, std::popcount(mav_value));
}
// Is mav_value of the type 3*2^n - 1?
if ((check % 3 == 0) && !((check / 3) & ((check / 3) - 1))) {
return IntegerEncodedValue(IntegerEncoding::Trit, std::popcount(check / 3 - 1));
}
// Is mav_value of the type 5*2^n - 1?
if ((check % 5 == 0) && !((check / 5) & ((check / 5) - 1))) {
return IntegerEncodedValue(IntegerEncoding::Quint, std::popcount(check / 5 - 1));
}
// Apparently it can't be represented with a bounded integer sequence...
// just iterate.
mav_value--;
}
return IntegerEncodedValue(IntegerEncoding::JustBits, 0);
}
static constexpr std::array<IntegerEncodedValue, 256> MakeEncodedValues() {
std::array<IntegerEncodedValue, 256> encodings{};
for (std::size_t i = 0; i < encodings.size(); ++i) {
encodings[i] = CreateEncoding(static_cast<u32>(i));
}
return encodings;
}
static constexpr std::array<IntegerEncodedValue, 256> ASTC_ENCODINGS_VALUES = MakeEncodedValues();
namespace Tegra::Texture::ASTC {
using IntegerEncodedVector = boost::container::static_vector<
IntegerEncodedValue, 256,
@@ -521,35 +591,41 @@ static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) {
return params;
}
static void FillVoidExtentLDR(InputBitStream& strm, std::span<u32> outBuf, u32 blockWidth,
u32 blockHeight) {
// Don't actually care about the void extent, just read the bits...
for (s32 i = 0; i < 4; ++i) {
strm.ReadBits<13>();
// Replicates low num_bits such that [(to_bit - 1):(to_bit - 1 - from_bit)]
// is the same as [(num_bits - 1):0] and repeats all the way down.
template <typename IntType>
static constexpr IntType Replicate(IntType val, u32 num_bits, u32 to_bit) {
if (num_bits == 0 || to_bit == 0) {
return 0;
}
// Decode the RGBA components and renormalize them to the range [0, 255]
u16 r = static_cast<u16>(strm.ReadBits<16>());
u16 g = static_cast<u16>(strm.ReadBits<16>());
u16 b = static_cast<u16>(strm.ReadBits<16>());
u16 a = static_cast<u16>(strm.ReadBits<16>());
u32 rgba = (r >> 8) | (g & 0xFF00) | (static_cast<u32>(b) & 0xFF00) << 8 |
(static_cast<u32>(a) & 0xFF00) << 16;
for (u32 j = 0; j < blockHeight; j++) {
for (u32 i = 0; i < blockWidth; i++) {
outBuf[j * blockWidth + i] = rgba;
const IntType v = val & static_cast<IntType>((1 << num_bits) - 1);
IntType res = v;
u32 reslen = num_bits;
while (reslen < to_bit) {
u32 comp = 0;
if (num_bits > to_bit - reslen) {
u32 newshift = to_bit - reslen;
comp = num_bits - newshift;
num_bits = newshift;
}
res = static_cast<IntType>(res << num_bits);
res = static_cast<IntType>(res | (v >> comp));
reslen += num_bits;
}
return res;
}
static void FillError(std::span<u32> outBuf, u32 blockWidth, u32 blockHeight) {
for (u32 j = 0; j < blockHeight; j++) {
for (u32 i = 0; i < blockWidth; i++) {
outBuf[j * blockWidth + i] = 0xFFFF00FF;
}
static constexpr std::size_t NumReplicateEntries(u32 num_bits) {
return std::size_t(1) << num_bits;
}
template <typename IntType, u32 num_bits, u32 to_bit>
static constexpr auto MakeReplicateTable() {
std::array<IntType, NumReplicateEntries(num_bits)> table{};
for (IntType value = 0; value < static_cast<IntType>(std::size(table)); ++value) {
table[value] = Replicate(value, num_bits, to_bit);
}
return table;
}
static constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable<u32, 8, 16>();
@@ -572,6 +648,9 @@ static constexpr auto REPLICATE_2_BIT_TO_8_TABLE = MakeReplicateTable<u32, 2, 8>
static constexpr auto REPLICATE_3_BIT_TO_8_TABLE = MakeReplicateTable<u32, 3, 8>();
static constexpr auto REPLICATE_4_BIT_TO_8_TABLE = MakeReplicateTable<u32, 4, 8>();
static constexpr auto REPLICATE_5_BIT_TO_8_TABLE = MakeReplicateTable<u32, 5, 8>();
static constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable<u32, 6, 8>();
static constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable<u32, 7, 8>();
static constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable<u32, 8, 8>();
/// Use a precompiled table with the most common usages, if it's not in the expected range, fallback
/// to the runtime implementation
static constexpr u32 FastReplicateTo8(u32 value, u32 num_bits) {
@@ -1316,6 +1395,37 @@ static void ComputeEndpoints(Pixel& ep1, Pixel& ep2, const u32*& colorValues,
#undef READ_INT_VALUES
}
static void FillVoidExtentLDR(InputBitStream& strm, std::span<u32> outBuf, u32 blockWidth,
u32 blockHeight) {
// Don't actually care about the void extent, just read the bits...
for (s32 i = 0; i < 4; ++i) {
strm.ReadBits<13>();
}
// Decode the RGBA components and renormalize them to the range [0, 255]
u16 r = static_cast<u16>(strm.ReadBits<16>());
u16 g = static_cast<u16>(strm.ReadBits<16>());
u16 b = static_cast<u16>(strm.ReadBits<16>());
u16 a = static_cast<u16>(strm.ReadBits<16>());
u32 rgba = (r >> 8) | (g & 0xFF00) | (static_cast<u32>(b) & 0xFF00) << 8 |
(static_cast<u32>(a) & 0xFF00) << 16;
for (u32 j = 0; j < blockHeight; j++) {
for (u32 i = 0; i < blockWidth; i++) {
outBuf[j * blockWidth + i] = rgba;
}
}
}
static void FillError(std::span<u32> outBuf, u32 blockWidth, u32 blockHeight) {
for (u32 j = 0; j < blockHeight; j++) {
for (u32 i = 0; i < blockWidth; i++) {
outBuf[j * blockWidth + i] = 0xFFFF00FF;
}
}
}
static void DecompressBlock(std::span<const u8, 16> inBuf, const u32 blockWidth,
const u32 blockHeight, std::span<u32, 12 * 12> outBuf) {
InputBitStream strm(inBuf);

View File

@@ -9,117 +9,6 @@
namespace Tegra::Texture::ASTC {
enum class IntegerEncoding { JustBits, Quint, Trit };
struct IntegerEncodedValue {
constexpr IntegerEncodedValue() = default;
constexpr IntegerEncodedValue(IntegerEncoding encoding_, u32 num_bits_)
: encoding{encoding_}, num_bits{num_bits_} {}
constexpr bool MatchesEncoding(const IntegerEncodedValue& other) const {
return encoding == other.encoding && num_bits == other.num_bits;
}
// Returns the number of bits required to encode num_vals values.
u32 GetBitLength(u32 num_vals) const {
u32 total_bits = num_bits * num_vals;
if (encoding == IntegerEncoding::Trit) {
total_bits += (num_vals * 8 + 4) / 5;
} else if (encoding == IntegerEncoding::Quint) {
total_bits += (num_vals * 7 + 2) / 3;
}
return total_bits;
}
IntegerEncoding encoding{};
u32 num_bits = 0;
u32 bit_value = 0;
union {
u32 quint_value = 0;
u32 trit_value;
};
};
// Returns a new instance of this struct that corresponds to the
// can take no more than mav_value values
constexpr IntegerEncodedValue CreateEncoding(u32 mav_value) {
while (mav_value > 0) {
u32 check = mav_value + 1;
// Is mav_value a power of two?
if (!(check & (check - 1))) {
return IntegerEncodedValue(IntegerEncoding::JustBits, std::popcount(mav_value));
}
// Is mav_value of the type 3*2^n - 1?
if ((check % 3 == 0) && !((check / 3) & ((check / 3) - 1))) {
return IntegerEncodedValue(IntegerEncoding::Trit, std::popcount(check / 3 - 1));
}
// Is mav_value of the type 5*2^n - 1?
if ((check % 5 == 0) && !((check / 5) & ((check / 5) - 1))) {
return IntegerEncodedValue(IntegerEncoding::Quint, std::popcount(check / 5 - 1));
}
// Apparently it can't be represented with a bounded integer sequence...
// just iterate.
mav_value--;
}
return IntegerEncodedValue(IntegerEncoding::JustBits, 0);
}
constexpr std::array<IntegerEncodedValue, 256> MakeEncodedValues() {
std::array<IntegerEncodedValue, 256> encodings{};
for (std::size_t i = 0; i < encodings.size(); ++i) {
encodings[i] = CreateEncoding(static_cast<u32>(i));
}
return encodings;
}
constexpr std::array<IntegerEncodedValue, 256> ASTC_ENCODINGS_VALUES = MakeEncodedValues();
// Replicates low num_bits such that [(to_bit - 1):(to_bit - 1 - from_bit)]
// is the same as [(num_bits - 1):0] and repeats all the way down.
template <typename IntType>
constexpr IntType Replicate(IntType val, u32 num_bits, u32 to_bit) {
if (num_bits == 0 || to_bit == 0) {
return 0;
}
const IntType v = val & static_cast<IntType>((1 << num_bits) - 1);
IntType res = v;
u32 reslen = num_bits;
while (reslen < to_bit) {
u32 comp = 0;
if (num_bits > to_bit - reslen) {
u32 newshift = to_bit - reslen;
comp = num_bits - newshift;
num_bits = newshift;
}
res = static_cast<IntType>(res << num_bits);
res = static_cast<IntType>(res | (v >> comp));
reslen += num_bits;
}
return res;
}
constexpr std::size_t NumReplicateEntries(u32 num_bits) {
return std::size_t(1) << num_bits;
}
template <typename IntType, u32 num_bits, u32 to_bit>
constexpr auto MakeReplicateTable() {
std::array<IntType, NumReplicateEntries(num_bits)> table{};
for (IntType value = 0; value < static_cast<IntType>(std::size(table)); ++value) {
table[value] = Replicate(value, num_bits, to_bit);
}
return table;
}
constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable<u32, 6, 8>();
constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable<u32, 7, 8>();
constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable<u32, 8, 8>();
void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth,
uint32_t block_width, uint32_t block_height, std::span<uint8_t> output);

View File

@@ -526,6 +526,17 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR
SetNext(next, workgroup_layout);
}
VkPhysicalDevicePipelineExecutablePropertiesFeaturesKHR executable_properties;
if (khr_pipeline_executable_properties) {
LOG_INFO(Render_Vulkan, "Enabling shader feedback, expect slower shader build times");
executable_properties = {
.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_EXECUTABLE_PROPERTIES_FEATURES_KHR,
.pNext = nullptr,
.pipelineExecutableInfo = VK_TRUE,
};
SetNext(next, executable_properties);
}
if (!ext_depth_range_unrestricted) {
LOG_INFO(Render_Vulkan, "Device doesn't support depth range unrestricted");
}
@@ -824,6 +835,7 @@ std::vector<const char*> Device::LoadExtensions(bool requires_surface) {
bool has_khr_shader_float16_int8{};
bool has_khr_workgroup_memory_explicit_layout{};
bool has_khr_pipeline_executable_properties{};
bool has_ext_subgroup_size_control{};
bool has_ext_transform_feedback{};
bool has_ext_custom_border_color{};
@@ -878,6 +890,10 @@ std::vector<const char*> Device::LoadExtensions(bool requires_surface) {
test(nv_device_diagnostics_config, VK_NV_DEVICE_DIAGNOSTICS_CONFIG_EXTENSION_NAME,
true);
}
if (Settings::values.renderer_shader_feedback) {
test(has_khr_pipeline_executable_properties,
VK_KHR_PIPELINE_EXECUTABLE_PROPERTIES_EXTENSION_NAME, false);
}
}
VkPhysicalDeviceFeatures2KHR features{};
features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2_KHR;
@@ -1033,6 +1049,19 @@ std::vector<const char*> Device::LoadExtensions(bool requires_surface) {
khr_workgroup_memory_explicit_layout = true;
}
}
if (has_khr_pipeline_executable_properties) {
VkPhysicalDevicePipelineExecutablePropertiesFeaturesKHR executable_properties;
executable_properties.sType =
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PIPELINE_EXECUTABLE_PROPERTIES_FEATURES_KHR;
executable_properties.pNext = nullptr;
features.pNext = &executable_properties;
physical.GetFeatures2KHR(features);
if (executable_properties.pipelineExecutableInfo) {
extensions.push_back(VK_KHR_PIPELINE_EXECUTABLE_PROPERTIES_EXTENSION_NAME);
khr_pipeline_executable_properties = true;
}
}
if (khr_push_descriptor) {
VkPhysicalDevicePushDescriptorPropertiesKHR push_descriptor;
push_descriptor.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PUSH_DESCRIPTOR_PROPERTIES_KHR;

View File

@@ -214,6 +214,11 @@ public:
return khr_push_descriptor;
}
/// Returns true if VK_KHR_pipeline_executable_properties is enabled.
bool IsKhrPipelineEexecutablePropertiesEnabled() const {
return khr_pipeline_executable_properties;
}
/// Returns true if the device supports VK_KHR_workgroup_memory_explicit_layout.
bool IsKhrWorkgroupMemoryExplicitLayoutSupported() const {
return khr_workgroup_memory_explicit_layout;
@@ -378,6 +383,7 @@ private:
bool khr_spirv_1_4{}; ///< Support for VK_KHR_spirv_1_4.
bool khr_workgroup_memory_explicit_layout{}; ///< Support for explicit workgroup layouts.
bool khr_push_descriptor{}; ///< Support for VK_KHR_push_descritor.
bool khr_pipeline_executable_properties{}; ///< Support for executable properties.
bool ext_index_type_uint8{}; ///< Support for VK_EXT_index_type_uint8.
bool ext_sampler_filter_minmax{}; ///< Support for VK_EXT_sampler_filter_minmax.
bool ext_depth_range_unrestricted{}; ///< Support for VK_EXT_depth_range_unrestricted.

View File

@@ -181,6 +181,8 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept {
X(vkGetMemoryWin32HandleKHR);
#endif
X(vkGetQueryPoolResults);
X(vkGetPipelineExecutablePropertiesKHR);
X(vkGetPipelineExecutableStatisticsKHR);
X(vkGetSemaphoreCounterValueKHR);
X(vkMapMemory);
X(vkQueueSubmit);
@@ -809,6 +811,42 @@ VkMemoryRequirements Device::GetImageMemoryRequirements(VkImage image) const noe
return requirements;
}
std::vector<VkPipelineExecutablePropertiesKHR> Device::GetPipelineExecutablePropertiesKHR(
VkPipeline pipeline) const {
const VkPipelineInfoKHR info{
.sType = VK_STRUCTURE_TYPE_PIPELINE_INFO_KHR,
.pNext = nullptr,
.pipeline = pipeline,
};
u32 num{};
dld->vkGetPipelineExecutablePropertiesKHR(handle, &info, &num, nullptr);
std::vector<VkPipelineExecutablePropertiesKHR> properties(num);
for (auto& property : properties) {
property.sType = VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_PROPERTIES_KHR;
}
Check(dld->vkGetPipelineExecutablePropertiesKHR(handle, &info, &num, properties.data()));
return properties;
}
std::vector<VkPipelineExecutableStatisticKHR> Device::GetPipelineExecutableStatisticsKHR(
VkPipeline pipeline, u32 executable_index) const {
const VkPipelineExecutableInfoKHR executable_info{
.sType = VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_INFO_KHR,
.pNext = nullptr,
.pipeline = pipeline,
.executableIndex = executable_index,
};
u32 num{};
dld->vkGetPipelineExecutableStatisticsKHR(handle, &executable_info, &num, nullptr);
std::vector<VkPipelineExecutableStatisticKHR> statistics(num);
for (auto& statistic : statistics) {
statistic.sType = VK_STRUCTURE_TYPE_PIPELINE_EXECUTABLE_STATISTIC_KHR;
}
Check(dld->vkGetPipelineExecutableStatisticsKHR(handle, &executable_info, &num,
statistics.data()));
return statistics;
}
void Device::UpdateDescriptorSets(Span<VkWriteDescriptorSet> writes,
Span<VkCopyDescriptorSet> copies) const noexcept {
dld->vkUpdateDescriptorSets(handle, writes.size(), writes.data(), copies.size(), copies.data());

View File

@@ -295,6 +295,8 @@ struct DeviceDispatch : InstanceDispatch {
#ifdef _WIN32
PFN_vkGetMemoryWin32HandleKHR vkGetMemoryWin32HandleKHR{};
#endif
PFN_vkGetPipelineExecutablePropertiesKHR vkGetPipelineExecutablePropertiesKHR{};
PFN_vkGetPipelineExecutableStatisticsKHR vkGetPipelineExecutableStatisticsKHR{};
PFN_vkGetQueryPoolResults vkGetQueryPoolResults{};
PFN_vkGetSemaphoreCounterValueKHR vkGetSemaphoreCounterValueKHR{};
PFN_vkMapMemory vkMapMemory{};
@@ -879,6 +881,12 @@ public:
VkMemoryRequirements GetImageMemoryRequirements(VkImage image) const noexcept;
std::vector<VkPipelineExecutablePropertiesKHR> GetPipelineExecutablePropertiesKHR(
VkPipeline pipeline) const;
std::vector<VkPipelineExecutableStatisticKHR> GetPipelineExecutableStatisticsKHR(
VkPipeline pipeline, u32 executable_index) const;
void UpdateDescriptorSets(Span<VkWriteDescriptorSet> writes,
Span<VkCopyDescriptorSet> copies) const noexcept;

View File

@@ -825,6 +825,7 @@ void Config::ReadRendererValues() {
if (global) {
ReadBasicSetting(Settings::values.fps_cap);
ReadBasicSetting(Settings::values.renderer_debug);
ReadBasicSetting(Settings::values.renderer_shader_feedback);
ReadBasicSetting(Settings::values.enable_nsight_aftermath);
ReadBasicSetting(Settings::values.disable_shader_loop_safety_checks);
}
@@ -1363,6 +1364,7 @@ void Config::SaveRendererValues() {
if (global) {
WriteBasicSetting(Settings::values.fps_cap);
WriteBasicSetting(Settings::values.renderer_debug);
WriteBasicSetting(Settings::values.renderer_shader_feedback);
WriteBasicSetting(Settings::values.enable_nsight_aftermath);
WriteBasicSetting(Settings::values.disable_shader_loop_safety_checks);
}

View File

@@ -43,6 +43,8 @@ void ConfigureDebug::SetConfiguration() {
ui->use_auto_stub->setChecked(Settings::values.use_auto_stub.GetValue());
ui->enable_graphics_debugging->setEnabled(runtime_lock);
ui->enable_graphics_debugging->setChecked(Settings::values.renderer_debug.GetValue());
ui->enable_shader_feedback->setEnabled(runtime_lock);
ui->enable_shader_feedback->setChecked(Settings::values.renderer_shader_feedback.GetValue());
ui->enable_cpu_debugging->setEnabled(runtime_lock);
ui->enable_cpu_debugging->setChecked(Settings::values.cpu_debug_mode.GetValue());
ui->enable_nsight_aftermath->setEnabled(runtime_lock);
@@ -65,6 +67,7 @@ void ConfigureDebug::ApplyConfiguration() {
Settings::values.use_debug_asserts = ui->use_debug_asserts->isChecked();
Settings::values.use_auto_stub = ui->use_auto_stub->isChecked();
Settings::values.renderer_debug = ui->enable_graphics_debugging->isChecked();
Settings::values.renderer_shader_feedback = ui->enable_shader_feedback->isChecked();
Settings::values.cpu_debug_mode = ui->enable_cpu_debugging->isChecked();
Settings::values.enable_nsight_aftermath = ui->enable_nsight_aftermath->isChecked();
Settings::values.disable_shader_loop_safety_checks =

View File

@@ -111,8 +111,8 @@
<property name="title">
<string>Graphics</string>
</property>
<layout class="QVBoxLayout" name="verticalLayout_6">
<item>
<layout class="QGridLayout" name="gridLayout_3">
<item row="0" column="0">
<widget class="QCheckBox" name="enable_graphics_debugging">
<property name="enabled">
<bool>true</bool>
@@ -125,7 +125,7 @@
</property>
</widget>
</item>
<item>
<item row="2" column="0">
<widget class="QCheckBox" name="enable_nsight_aftermath">
<property name="toolTip">
<string>When checked, it enables Nsight Aftermath crash dumps</string>
@@ -135,7 +135,7 @@
</property>
</widget>
</item>
<item>
<item row="0" column="1">
<widget class="QCheckBox" name="disable_macro_jit">
<property name="enabled">
<bool>true</bool>
@@ -148,7 +148,17 @@
</property>
</widget>
</item>
<item>
<item row="1" column="0">
<widget class="QCheckBox" name="enable_shader_feedback">
<property name="toolTip">
<string>When checked, yuzu will log statistics about the compiled pipeline cache</string>
</property>
<property name="text">
<string>Enable Shader Feedback</string>
</property>
</widget>
</item>
<item row="1" column="1">
<widget class="QCheckBox" name="disable_loop_safety_checks">
<property name="toolTip">
<string>When checked, it executes shaders without loop logic changes</string>
@@ -276,11 +286,14 @@
<tabstop>open_log_button</tabstop>
<tabstop>homebrew_args_edit</tabstop>
<tabstop>enable_graphics_debugging</tabstop>
<tabstop>enable_shader_feedback</tabstop>
<tabstop>enable_nsight_aftermath</tabstop>
<tabstop>disable_macro_jit</tabstop>
<tabstop>disable_loop_safety_checks</tabstop>
<tabstop>fs_access_log</tabstop>
<tabstop>reporting_services</tabstop>
<tabstop>quest_flag</tabstop>
<tabstop>enable_cpu_debugging</tabstop>
<tabstop>use_debug_asserts</tabstop>
<tabstop>use_auto_stub</tabstop>
</tabstops>

View File

@@ -401,6 +401,11 @@
<string>Traditional Chinese (正體中文)</string>
</property>
</item>
<item>
<property name="text">
<string>Brazilian Portuguese (português do Brasil)</string>
</property>
</item>
</widget>
</item>
<item row="5" column="0">

View File

@@ -444,6 +444,7 @@ void Config::ReadValues() {
// Renderer
ReadSetting("Renderer", Settings::values.renderer_backend);
ReadSetting("Renderer", Settings::values.renderer_debug);
ReadSetting("Renderer", Settings::values.renderer_shader_feedback);
ReadSetting("Renderer", Settings::values.enable_nsight_aftermath);
ReadSetting("Renderer", Settings::values.disable_shader_loop_safety_checks);
ReadSetting("Renderer", Settings::values.vulkan_device);

View File

@@ -221,6 +221,10 @@ backend =
# 0 (default): Disabled, 1: Enabled
debug =
# Enable shader feedback.
# 0 (default): Disabled, 1: Enabled
renderer_shader_feedback =
# Enable Nsight Aftermath crash dumps
# 0 (default): Disabled, 1: Enabled
nsight_aftermath =
@@ -363,7 +367,7 @@ custom_rtc =
# Sets the systems language index
# 0: Japanese, 1: English (default), 2: French, 3: German, 4: Italian, 5: Spanish, 6: Chinese,
# 7: Korean, 8: Dutch, 9: Portuguese, 10: Russian, 11: Taiwanese, 12: British English, 13: Canadian French,
# 14: Latin American Spanish, 15: Simplified Chinese, 16: Traditional Chinese
# 14: Latin American Spanish, 15: Simplified Chinese, 16: Traditional Chinese, 17: Brazilian Portuguese
language_index =
# The system region that yuzu will use during emulation