Compare commits

...

6 Commits

Author SHA1 Message Date
ReinUsesLisp
a59cdf3b4c shader/other: Implement MEMBAR.CTS
This silences an assertion we were hitting and uses workgroup memory
barriers when the game requests it.
2020-05-15 03:03:41 -03:00
ReinUsesLisp
0491b53f34 shader/other: Implement BAR.SYNC 0x0
Trivially implement this particular case of BAR. Unless games use OpenCL
or CUDA barriers, we shouldn't hit any other case here.
2020-05-15 03:03:41 -03:00
ReinUsesLisp
c0959a89f0 shader/other: Implement thread comparisons (NV_shader_thread_group)
Hardware S2R special registers match gl_Thread*MaskNV. We can trivially
implement these using Nvidia's extension on OpenGL or naively stubbing
them with the ARB instructions to match. This might cause issues if the
host device warp size doesn't match Nvidia's. That said, this is
unlikely on proper shaders.

Refer to the attached url for more documentation about these flags.
https://www.khronos.org/registry/OpenGL/extensions/NV/NV_shader_thread_group.txt
2020-05-15 03:03:41 -03:00
ReinUsesLisp
e1048851b6 shader/memory: Implement non-addition operations in RED
Trivially implement these instructions. They are used in Astral Chain.
2020-05-15 03:03:41 -03:00
ReinUsesLisp
2e475a41b9 shader_decompiler: Visit source nodes even when they assign to RZ
Some operations like atomicMin were ignored because they returned were
being stored to RZ. This operations have a side effect and it was being
ignored.
2020-05-15 03:03:41 -03:00
ReinUsesLisp
0afa8a9dcb vk_shader_decompiler: Don't assert for void returns
Atomic instructions can be used without returning anything and this is
valid code. Remove the assert.
2020-05-15 03:03:41 -03:00
6 changed files with 140 additions and 15 deletions

View File

@@ -1538,7 +1538,9 @@ private:
Expression target;
if (const auto gpr = std::get_if<GprNode>(&*dest)) {
if (gpr->GetIndex() == Register::ZeroIndex) {
// Writing to Register::ZeroIndex is a no op
// Writing to Register::ZeroIndex is a no op but we still have to visit the source
// as it might have side effects.
code.AddLine("{};", Visit(src).GetCode());
return {};
}
target = {GetRegister(gpr->GetIndex()), Type::Float};
@@ -2309,6 +2311,18 @@ private:
return {"gl_SubGroupInvocationARB", Type::Uint};
}
template <const std::string_view& comparison>
Expression ThreadMask(Operation) {
if (device.HasWarpIntrinsics()) {
return {fmt::format("gl_Thread{}MaskNV", comparison), Type::Uint};
}
if (device.HasShaderBallot()) {
return {fmt::format("uint(gl_SubGroup{}MaskARB)", comparison), Type::Uint};
}
LOG_ERROR(Render_OpenGL, "Thread mask intrinsics are required by the shader");
return {"0U", Type::Uint};
}
Expression ShuffleIndexed(Operation operation) {
std::string value = VisitOperand(operation, 0).AsFloat();
@@ -2321,7 +2335,21 @@ private:
return {fmt::format("readInvocationARB({}, {})", value, index), Type::Float};
}
Expression MemoryBarrierGL(Operation) {
Expression Barrier(Operation) {
if (!ir.IsDecompiled()) {
LOG_ERROR(Render_OpenGL, "barrier() used but shader is not decompiled");
return {};
}
code.AddLine("barrier();");
return {};
}
Expression MemoryBarrierGroup(Operation) {
code.AddLine("groupMemoryBarrier();");
return {};
}
Expression MemoryBarrierGlobal(Operation) {
code.AddLine("memoryBarrier();");
return {};
}
@@ -2337,6 +2365,12 @@ private:
static constexpr std::string_view NotEqual = "!=";
static constexpr std::string_view GreaterEqual = ">=";
static constexpr std::string_view Eq = "Eq";
static constexpr std::string_view Ge = "Ge";
static constexpr std::string_view Gt = "Gt";
static constexpr std::string_view Le = "Le";
static constexpr std::string_view Lt = "Lt";
static constexpr std::string_view Add = "Add";
static constexpr std::string_view Min = "Min";
static constexpr std::string_view Max = "Max";
@@ -2554,9 +2588,16 @@ private:
&GLSLDecompiler::VoteEqual,
&GLSLDecompiler::ThreadId,
&GLSLDecompiler::ThreadMask<Func::Eq>,
&GLSLDecompiler::ThreadMask<Func::Ge>,
&GLSLDecompiler::ThreadMask<Func::Gt>,
&GLSLDecompiler::ThreadMask<Func::Le>,
&GLSLDecompiler::ThreadMask<Func::Lt>,
&GLSLDecompiler::ShuffleIndexed,
&GLSLDecompiler::MemoryBarrierGL,
&GLSLDecompiler::Barrier,
&GLSLDecompiler::MemoryBarrierGroup,
&GLSLDecompiler::MemoryBarrierGlobal,
};
static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));

View File

@@ -515,6 +515,16 @@ private:
void DeclareCommon() {
thread_id =
DeclareInputBuiltIn(spv::BuiltIn::SubgroupLocalInvocationId, t_in_uint, "thread_id");
thread_masks[0] =
DeclareInputBuiltIn(spv::BuiltIn::SubgroupEqMask, t_in_uint4, "thread_eq_mask");
thread_masks[1] =
DeclareInputBuiltIn(spv::BuiltIn::SubgroupGeMask, t_in_uint4, "thread_ge_mask");
thread_masks[2] =
DeclareInputBuiltIn(spv::BuiltIn::SubgroupGtMask, t_in_uint4, "thread_gt_mask");
thread_masks[3] =
DeclareInputBuiltIn(spv::BuiltIn::SubgroupLeMask, t_in_uint4, "thread_le_mask");
thread_masks[4] =
DeclareInputBuiltIn(spv::BuiltIn::SubgroupLtMask, t_in_uint4, "thread_lt_mask");
}
void DeclareVertex() {
@@ -1071,8 +1081,7 @@ private:
void VisitBasicBlock(const NodeBlock& bb) {
for (const auto& node : bb) {
[[maybe_unused]] const Type type = Visit(node).type;
ASSERT(type == Type::Void);
Visit(node);
}
}
@@ -1362,7 +1371,9 @@ private:
Expression target{};
if (const auto gpr = std::get_if<GprNode>(&*dest)) {
if (gpr->GetIndex() == Register::ZeroIndex) {
// Writing to Register::ZeroIndex is a no op
// Writing to Register::ZeroIndex is a no op but we still have to visit its source
// because it might have side effects.
Visit(src);
return {};
}
target = {registers.at(gpr->GetIndex()), Type::Float};
@@ -2175,14 +2186,37 @@ private:
return {OpLoad(t_uint, thread_id), Type::Uint};
}
template <std::size_t index>
Expression ThreadMask(Operation) {
// TODO(Rodrigo): Handle devices with different warp sizes
const Id mask = thread_masks[index];
return {OpLoad(t_uint, AccessElement(t_in_uint, mask, 0)), Type::Uint};
}
Expression ShuffleIndexed(Operation operation) {
const Id value = AsFloat(Visit(operation[0]));
const Id index = AsUint(Visit(operation[1]));
return {OpSubgroupReadInvocationKHR(t_float, value, index), Type::Float};
}
Expression MemoryBarrierGL(Operation) {
const auto scope = spv::Scope::Device;
Expression Barrier(Operation) {
if (!ir.IsDecompiled()) {
LOG_ERROR(Render_Vulkan, "OpBarrier used by shader is not decompiled");
return {};
}
const auto scope = spv::Scope::Workgroup;
const auto memory = spv::Scope::Workgroup;
const auto semantics =
spv::MemorySemanticsMask::WorkgroupMemory | spv::MemorySemanticsMask::AcquireRelease;
OpControlBarrier(Constant(t_uint, static_cast<u32>(scope)),
Constant(t_uint, static_cast<u32>(memory)),
Constant(t_uint, static_cast<u32>(semantics)));
return {};
}
template <spv::Scope scope>
Expression MemoryBarrier(Operation) {
const auto semantics =
spv::MemorySemanticsMask::AcquireRelease | spv::MemorySemanticsMask::UniformMemory |
spv::MemorySemanticsMask::WorkgroupMemory |
@@ -2639,9 +2673,16 @@ private:
&SPIRVDecompiler::Vote<&Module::OpSubgroupAllEqualKHR>,
&SPIRVDecompiler::ThreadId,
&SPIRVDecompiler::ThreadMask<0>, // Eq
&SPIRVDecompiler::ThreadMask<1>, // Ge
&SPIRVDecompiler::ThreadMask<2>, // Gt
&SPIRVDecompiler::ThreadMask<3>, // Le
&SPIRVDecompiler::ThreadMask<4>, // Lt
&SPIRVDecompiler::ShuffleIndexed,
&SPIRVDecompiler::MemoryBarrierGL,
&SPIRVDecompiler::Barrier,
&SPIRVDecompiler::MemoryBarrier<spv::Scope::Workgroup>,
&SPIRVDecompiler::MemoryBarrier<spv::Scope::Device>,
};
static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
@@ -2763,6 +2804,7 @@ private:
Id workgroup_id{};
Id local_invocation_id{};
Id thread_id{};
std::array<Id, 5> thread_masks{}; // eq, ge, gt, le, lt
VertexIndices in_indices;
VertexIndices out_indices;

View File

@@ -387,7 +387,6 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
}
case OpCode::Id::RED: {
UNIMPLEMENTED_IF_MSG(instr.red.type != GlobalAtomicType::U32);
UNIMPLEMENTED_IF_MSG(instr.red.operation != AtomicOp::Add);
const auto [real_address, base_address, descriptor] =
TrackGlobalMemory(bb, instr, true, true);
if (!real_address || !base_address) {
@@ -396,7 +395,7 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) {
}
Node gmem = MakeNode<GmemNode>(real_address, base_address, descriptor);
Node value = GetRegister(instr.gpr0);
bb.push_back(Operation(OperationCode::ReduceIAdd, move(gmem), move(value)));
bb.push_back(Operation(GetAtomOperation(instr.red.operation), move(gmem), move(value)));
break;
}
case OpCode::Id::ATOM: {

View File

@@ -109,6 +109,27 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
return Operation(OperationCode::WorkGroupIdY);
case SystemVariable::CtaIdZ:
return Operation(OperationCode::WorkGroupIdZ);
case SystemVariable::EqMask:
case SystemVariable::LtMask:
case SystemVariable::LeMask:
case SystemVariable::GtMask:
case SystemVariable::GeMask:
uses_warps = true;
switch (instr.sys20) {
case SystemVariable::EqMask:
return Operation(OperationCode::ThreadEqMask);
case SystemVariable::LtMask:
return Operation(OperationCode::ThreadLtMask);
case SystemVariable::LeMask:
return Operation(OperationCode::ThreadLeMask);
case SystemVariable::GtMask:
return Operation(OperationCode::ThreadGtMask);
case SystemVariable::GeMask:
return Operation(OperationCode::ThreadGeMask);
default:
UNREACHABLE();
return Immediate(0u);
}
default:
UNIMPLEMENTED_MSG("Unhandled system move: {}",
static_cast<u32>(instr.sys20.Value()));
@@ -272,10 +293,25 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
SetRegister(bb, instr.gpr0, GetRegister(instr.gpr8));
break;
}
case OpCode::Id::BAR: {
UNIMPLEMENTED_IF_MSG(instr.value != 0xF0A81B8000070000ULL, "BAR is not BAR.SYNC 0x0");
bb.push_back(Operation(OperationCode::Barrier));
break;
}
case OpCode::Id::MEMBAR: {
UNIMPLEMENTED_IF(instr.membar.type != Tegra::Shader::MembarType::GL);
UNIMPLEMENTED_IF(instr.membar.unknown != Tegra::Shader::MembarUnknown::Default);
bb.push_back(Operation(OperationCode::MemoryBarrierGL));
const OperationCode type = [instr] {
switch (instr.membar.type) {
case Tegra::Shader::MembarType::CTA:
return OperationCode::MemoryBarrierGroup;
case Tegra::Shader::MembarType::GL:
return OperationCode::MemoryBarrierGlobal;
default:
UNIMPLEMENTED_MSG("MEMBAR type={}", static_cast<int>(instr.membar.type.Value()));
return OperationCode::MemoryBarrierGlobal;
}
}();
bb.push_back(Operation(type));
break;
}
case OpCode::Id::DEPBAR: {

View File

@@ -226,9 +226,16 @@ enum class OperationCode {
VoteEqual, /// (bool) -> bool
ThreadId, /// () -> uint
ThreadEqMask, /// () -> uint
ThreadGeMask, /// () -> uint
ThreadGtMask, /// () -> uint
ThreadLeMask, /// () -> uint
ThreadLtMask, /// () -> uint
ShuffleIndexed, /// (uint value, uint index) -> uint
MemoryBarrierGL, /// () -> void
Barrier, /// () -> void
MemoryBarrierGroup, /// () -> void
MemoryBarrierGlobal, /// () -> void
Amount,
};