core: hle: service: buffer_queue: Improve management of KEvent.

Merge pull request #6968 from bunnei/nvflinger-event
core: hle: service: nvflinger/vi: Improve management of KEvent.
2021-09-04 22:25:46 -07:00 · 2021-09-04 22:25:20 -07:00 · 2021-09-03 21:53:00 -07:00 · 2021-09-01 17:36:26 -07:00 · 2021-09-01 20:21:15 -04:00 · 2021-09-01 19:13:33 -05:00
16 changed files with 249 additions and 149 deletions
--- a/src/common/logging/backend.cpp
+++ b/src/common/logging/backend.cpp
@@ -18,6 +18,7 @@
 #include "common/fs/fs_paths.h"
 #include "common/fs/path_util.h"
 #include "common/literals.h"
+#include "common/thread.h"

 #include "common/logging/backend.h"
 #include "common/logging/log.h"
--- a/src/core/hle/service/nvflinger/buffer_queue.cpp
+++ b/src/core/hle/service/nvflinger/buffer_queue.cpp
@@ -9,17 +9,20 @@
 #include "core/core.h"
 #include "core/hle/kernel/k_writable_event.h"
 #include "core/hle/kernel/kernel.h"
+#include "core/hle/service/kernel_helpers.h"
 #include "core/hle/service/nvflinger/buffer_queue.h"

 namespace Service::NVFlinger {

-BufferQueue::BufferQueue(Kernel::KernelCore& kernel, u32 id_, u64 layer_id_)
-    : id(id_), layer_id(layer_id_), buffer_wait_event{kernel} {
-    Kernel::KAutoObject::Create(std::addressof(buffer_wait_event));
-    buffer_wait_event.Initialize("BufferQueue:WaitEvent");
+BufferQueue::BufferQueue(Kernel::KernelCore& kernel, u32 id_, u64 layer_id_,
+                         KernelHelpers::ServiceContext& service_context_)
+    : id(id_), layer_id(layer_id_), service_context{service_context_} {
+    buffer_wait_event = service_context.CreateEvent("BufferQueue:WaitEvent");
 }

-BufferQueue::~BufferQueue() = default;
+BufferQueue::~BufferQueue() {
+    service_context.CloseEvent(buffer_wait_event);
+}

 void BufferQueue::SetPreallocatedBuffer(u32 slot, const IGBPBuffer& igbp_buffer) {
    ASSERT(slot < buffer_slots);
@@ -41,7 +44,7 @@ void BufferQueue::SetPreallocatedBuffer(u32 slot, const IGBPBuffer& igbp_buffer)
        .multi_fence = {},
    };

-    buffer_wait_event.GetWritableEvent().Signal();
+    buffer_wait_event->GetWritableEvent().Signal();
 }

 std::optional<std::pair<u32, Service::Nvidia::MultiFence*>> BufferQueue::DequeueBuffer(u32 width,
@@ -119,7 +122,7 @@ void BufferQueue::CancelBuffer(u32 slot, const Service::Nvidia::MultiFence& mult
    }
    free_buffers_condition.notify_one();

-    buffer_wait_event.GetWritableEvent().Signal();
+    buffer_wait_event->GetWritableEvent().Signal();
 }

 std::optional<std::reference_wrapper<const BufferQueue::Buffer>> BufferQueue::AcquireBuffer() {
@@ -154,7 +157,7 @@ void BufferQueue::ReleaseBuffer(u32 slot) {
    }
    free_buffers_condition.notify_one();

-    buffer_wait_event.GetWritableEvent().Signal();
+    buffer_wait_event->GetWritableEvent().Signal();
 }

 void BufferQueue::Connect() {
@@ -169,7 +172,7 @@ void BufferQueue::Disconnect() {
        std::unique_lock lock{queue_sequence_mutex};
        queue_sequence.clear();
    }
-    buffer_wait_event.GetWritableEvent().Signal();
+    buffer_wait_event->GetWritableEvent().Signal();
    is_connect = false;
    free_buffers_condition.notify_one();
 }
@@ -189,11 +192,11 @@ u32 BufferQueue::Query(QueryType type) {
 }

 Kernel::KWritableEvent& BufferQueue::GetWritableBufferWaitEvent() {
-    return buffer_wait_event.GetWritableEvent();
+    return buffer_wait_event->GetWritableEvent();
 }

 Kernel::KReadableEvent& BufferQueue::GetBufferWaitEvent() {
-    return buffer_wait_event.GetReadableEvent();
+    return buffer_wait_event->GetReadableEvent();
 }

 } // namespace Service::NVFlinger
--- a/src/core/hle/service/nvflinger/buffer_queue.h
+++ b/src/core/hle/service/nvflinger/buffer_queue.h
@@ -24,6 +24,10 @@ class KReadableEvent;
 class KWritableEvent;
 } // namespace Kernel

+namespace Service::KernelHelpers {
+class ServiceContext;
+} // namespace Service::KernelHelpers
+
 namespace Service::NVFlinger {

 constexpr u32 buffer_slots = 0x40;
@@ -54,7 +58,8 @@ public:
        NativeWindowFormat = 2,
    };

-    explicit BufferQueue(Kernel::KernelCore& kernel, u32 id_, u64 layer_id_);
+    explicit BufferQueue(Kernel::KernelCore& kernel, u32 id_, u64 layer_id_,
+                         KernelHelpers::ServiceContext& service_context_);
    ~BufferQueue();

    enum class BufferTransformFlags : u32 {
@@ -130,12 +135,14 @@ private:
    std::list<u32> free_buffers;
    std::array<Buffer, buffer_slots> buffers;
    std::list<u32> queue_sequence;
-    Kernel::KEvent buffer_wait_event;
+    Kernel::KEvent* buffer_wait_event{};

    std::mutex free_buffers_mutex;
    std::condition_variable free_buffers_condition;

    std::mutex queue_sequence_mutex;
+
+    KernelHelpers::ServiceContext& service_context;
 };

 } // namespace Service::NVFlinger
--- a/src/core/hle/service/nvflinger/nvflinger.cpp
+++ b/src/core/hle/service/nvflinger/nvflinger.cpp
@@ -61,12 +61,13 @@ void NVFlinger::SplitVSync() {
    }
 }

-NVFlinger::NVFlinger(Core::System& system_) : system(system_) {
-    displays.emplace_back(0, "Default", system);
-    displays.emplace_back(1, "External", system);
-    displays.emplace_back(2, "Edid", system);
-    displays.emplace_back(3, "Internal", system);
-    displays.emplace_back(4, "Null", system);
+NVFlinger::NVFlinger(Core::System& system_)
+    : system(system_), service_context(system_, "nvflinger") {
+    displays.emplace_back(0, "Default", service_context, system);
+    displays.emplace_back(1, "External", service_context, system);
+    displays.emplace_back(2, "Edid", service_context, system);
+    displays.emplace_back(3, "Internal", service_context, system);
+    displays.emplace_back(4, "Null", service_context, system);
    guard = std::make_shared<std::mutex>();

    // Schedule the screen composition events
@@ -146,7 +147,7 @@ std::optional<u64> NVFlinger::CreateLayer(u64 display_id) {
 void NVFlinger::CreateLayerAtId(VI::Display& display, u64 layer_id) {
    const u32 buffer_queue_id = next_buffer_queue_id++;
    buffer_queues.emplace_back(
-        std::make_unique<BufferQueue>(system.Kernel(), buffer_queue_id, layer_id));
+        std::make_unique<BufferQueue>(system.Kernel(), buffer_queue_id, layer_id, service_context));
    display.CreateLayer(layer_id, *buffer_queues.back());
 }

--- a/src/core/hle/service/nvflinger/nvflinger.h
+++ b/src/core/hle/service/nvflinger/nvflinger.h
@@ -15,6 +15,7 @@
 #include <vector>

 #include "common/common_types.h"
+#include "core/hle/service/kernel_helpers.h"

 namespace Common {
 class Event;
@@ -135,6 +136,8 @@ private:
    std::unique_ptr<std::thread> vsync_thread;
    std::unique_ptr<Common::Event> wait_event;
    std::atomic<bool> is_running{};
+
+    KernelHelpers::ServiceContext service_context;
 };

 } // namespace Service::NVFlinger
--- a/src/core/hle/service/vi/display/vi_display.cpp
+++ b/src/core/hle/service/vi/display/vi_display.cpp
@@ -12,18 +12,21 @@
 #include "core/hle/kernel/k_event.h"
 #include "core/hle/kernel/k_readable_event.h"
 #include "core/hle/kernel/k_writable_event.h"
+#include "core/hle/service/kernel_helpers.h"
 #include "core/hle/service/vi/display/vi_display.h"
 #include "core/hle/service/vi/layer/vi_layer.h"

 namespace Service::VI {

-Display::Display(u64 id, std::string name_, Core::System& system)
-    : display_id{id}, name{std::move(name_)}, vsync_event{system.Kernel()} {
-    Kernel::KAutoObject::Create(std::addressof(vsync_event));
-    vsync_event.Initialize(fmt::format("Display VSync Event {}", id));
+Display::Display(u64 id, std::string name_, KernelHelpers::ServiceContext& service_context_,
+                 Core::System& system_)
+    : display_id{id}, name{std::move(name_)}, service_context{service_context_} {
+    vsync_event = service_context.CreateEvent(fmt::format("Display VSync Event {}", id));
 }

-Display::~Display() = default;
+Display::~Display() {
+    service_context.CloseEvent(vsync_event);
+}

 Layer& Display::GetLayer(std::size_t index) {
    return *layers.at(index);
@@ -34,11 +37,11 @@ const Layer& Display::GetLayer(std::size_t index) const {
 }

 Kernel::KReadableEvent& Display::GetVSyncEvent() {
-    return vsync_event.GetReadableEvent();
+    return vsync_event->GetReadableEvent();
 }

 void Display::SignalVSyncEvent() {
-    vsync_event.GetWritableEvent().Signal();
+    vsync_event->GetWritableEvent().Signal();
 }

 void Display::CreateLayer(u64 layer_id, NVFlinger::BufferQueue& buffer_queue) {
--- a/src/core/hle/service/vi/display/vi_display.h
+++ b/src/core/hle/service/vi/display/vi_display.h
@@ -18,6 +18,9 @@ class KEvent;
 namespace Service::NVFlinger {
 class BufferQueue;
 }
+namespace Service::KernelHelpers {
+class ServiceContext;
+} // namespace Service::KernelHelpers

 namespace Service::VI {

@@ -31,10 +34,13 @@ class Display {
 public:
    /// Constructs a display with a given unique ID and name.
    ///
-    /// @param id   The unique ID for this display.
+    /// @param id The unique ID for this display.
    /// @param name_ The name for this display.
+    /// @param service_context_ The ServiceContext for the owning service.
+    /// @param system_ The global system instance.
    ///
-    Display(u64 id, std::string name_, Core::System& system);
+    Display(u64 id, std::string name_, KernelHelpers::ServiceContext& service_context_,
+            Core::System& system_);
    ~Display();

    /// Gets the unique ID assigned to this display.
@@ -98,9 +104,10 @@ public:
 private:
    u64 display_id;
    std::string name;
+    KernelHelpers::ServiceContext& service_context;

    std::vector<std::shared_ptr<Layer>> layers;
-    Kernel::KEvent vsync_event;
+    Kernel::KEvent* vsync_event{};
 };

 } // namespace Service::VI
--- a/src/shader_recompiler/backend/glsl/emit_glsl_warp.cpp
+++ b/src/shader_recompiler/backend/glsl/emit_glsl_warp.cpp
@@ -11,8 +11,6 @@

 namespace Shader::Backend::GLSL {
 namespace {
-constexpr char THREAD_ID[]{"gl_SubGroupInvocationARB"};
-
 void SetInBoundsFlag(EmitContext& ctx, IR::Inst& inst) {
    IR::Inst* const in_bounds{inst.GetAssociatedPseudoOperation(IR::Opcode::GetInBoundsFromOp)};
    if (!in_bounds) {
@@ -45,100 +43,84 @@ void UseShuffleNv(EmitContext& ctx, IR::Inst& inst, std::string_view shfl_op,
    ctx.AddU32("{}={}({},{},{},shfl_in_bounds);", inst, shfl_op, value, index, width);
    SetInBoundsFlag(ctx, inst);
 }
-
-std::string_view BallotIndex(EmitContext& ctx) {
-    if (!ctx.profile.warp_size_potentially_larger_than_guest) {
-        return ".x";
-    }
-    return "[gl_SubGroupInvocationARB>>5]";
-}
-
-std::string GetMask(EmitContext& ctx, std::string_view mask) {
-    const auto ballot_index{BallotIndex(ctx)};
-    return fmt::format("uint(uvec2({}){})", mask, ballot_index);
-}
 } // Anonymous namespace

 void EmitLaneId(EmitContext& ctx, IR::Inst& inst) {
-    ctx.AddU32("{}={}&31u;", inst, THREAD_ID);
+    ctx.AddU32("{}=gl_SubGroupInvocationARB&31u;", inst);
 }

 void EmitVoteAll(EmitContext& ctx, IR::Inst& inst, std::string_view pred) {
    if (!ctx.profile.warp_size_potentially_larger_than_guest) {
        ctx.AddU1("{}=allInvocationsEqualARB({});", inst, pred);
-        return;
+    } else {
+        const auto active_mask{fmt::format("uvec2(ballotARB(true))[gl_SubGroupInvocationARB]")};
+        const auto ballot{fmt::format("uvec2(ballotARB({}))[gl_SubGroupInvocationARB]", pred)};
+        ctx.AddU1("{}=({}&{})=={};", inst, ballot, active_mask, active_mask);
    }
-    const auto ballot_index{BallotIndex(ctx)};
-    const auto active_mask{fmt::format("uvec2(ballotARB(true)){}", ballot_index)};
-    const auto ballot{fmt::format("uvec2(ballotARB({})){}", pred, ballot_index)};
-    ctx.AddU1("{}=({}&{})=={};", inst, ballot, active_mask, active_mask);
 }

 void EmitVoteAny(EmitContext& ctx, IR::Inst& inst, std::string_view pred) {
    if (!ctx.profile.warp_size_potentially_larger_than_guest) {
        ctx.AddU1("{}=anyInvocationARB({});", inst, pred);
-        return;
+    } else {
+        const auto active_mask{fmt::format("uvec2(ballotARB(true))[gl_SubGroupInvocationARB]")};
+        const auto ballot{fmt::format("uvec2(ballotARB({}))[gl_SubGroupInvocationARB]", pred)};
+        ctx.AddU1("{}=({}&{})!=0u;", inst, ballot, active_mask, active_mask);
    }
-    const auto ballot_index{BallotIndex(ctx)};
-    const auto active_mask{fmt::format("uvec2(ballotARB(true)){}", ballot_index)};
-    const auto ballot{fmt::format("uvec2(ballotARB({})){}", pred, ballot_index)};
-    ctx.AddU1("{}=({}&{})!=0u;", inst, ballot, active_mask, active_mask);
 }

 void EmitVoteEqual(EmitContext& ctx, IR::Inst& inst, std::string_view pred) {
    if (!ctx.profile.warp_size_potentially_larger_than_guest) {
        ctx.AddU1("{}=allInvocationsEqualARB({});", inst, pred);
-        return;
+    } else {
+        const auto active_mask{fmt::format("uvec2(ballotARB(true))[gl_SubGroupInvocationARB]")};
+        const auto ballot{fmt::format("uvec2(ballotARB({}))[gl_SubGroupInvocationARB]", pred)};
+        const auto value{fmt::format("({}^{})", ballot, active_mask)};
+        ctx.AddU1("{}=({}==0)||({}=={});", inst, value, value, active_mask);
    }
-    const auto ballot_index{BallotIndex(ctx)};
-    const auto active_mask{fmt::format("uvec2(ballotARB(true)){}", ballot_index)};
-    const auto ballot{fmt::format("uvec2(ballotARB({})){}", pred, ballot_index)};
-    const auto value{fmt::format("({}^{})", ballot, active_mask)};
-    ctx.AddU1("{}=({}==0)||({}=={});", inst, value, value, active_mask);
 }

 void EmitSubgroupBallot(EmitContext& ctx, IR::Inst& inst, std::string_view pred) {
-    const auto ballot_index{BallotIndex(ctx)};
-    ctx.AddU32("{}=uvec2(ballotARB({})){};", inst, pred, ballot_index);
+    if (!ctx.profile.warp_size_potentially_larger_than_guest) {
+        ctx.AddU32("{}=uvec2(ballotARB({})).x;", inst, pred);
+    } else {
+        ctx.AddU32("{}=uvec2(ballotARB({}))[gl_SubGroupInvocationARB];", inst, pred);
+    }
 }

 void EmitSubgroupEqMask(EmitContext& ctx, IR::Inst& inst) {
-    ctx.AddU32("{}={};", inst, GetMask(ctx, "gl_SubGroupEqMaskARB"));
+    ctx.AddU32("{}=uint(gl_SubGroupEqMaskARB.x);", inst);
 }

 void EmitSubgroupLtMask(EmitContext& ctx, IR::Inst& inst) {
-    ctx.AddU32("{}={};", inst, GetMask(ctx, "gl_SubGroupLtMaskARB"));
+    ctx.AddU32("{}=uint(gl_SubGroupLtMaskARB.x);", inst);
 }

 void EmitSubgroupLeMask(EmitContext& ctx, IR::Inst& inst) {
-    ctx.AddU32("{}={};", inst, GetMask(ctx, "gl_SubGroupLeMaskARB"));
+    ctx.AddU32("{}=uint(gl_SubGroupLeMaskARB.x);", inst);
 }

 void EmitSubgroupGtMask(EmitContext& ctx, IR::Inst& inst) {
-    ctx.AddU32("{}={};", inst, GetMask(ctx, "gl_SubGroupGtMaskARB"));
+    ctx.AddU32("{}=uint(gl_SubGroupGtMaskARB.x);", inst);
 }

 void EmitSubgroupGeMask(EmitContext& ctx, IR::Inst& inst) {
-    ctx.AddU32("{}={};", inst, GetMask(ctx, "gl_SubGroupGeMaskARB"));
+    ctx.AddU32("{}=uint(gl_SubGroupGeMaskARB.x);", inst);
 }

 void EmitShuffleIndex(EmitContext& ctx, IR::Inst& inst, std::string_view value,
-                      std::string_view index, std::string_view clamp, std::string_view seg_mask) {
+                      std::string_view index, std::string_view clamp,
+                      std::string_view segmentation_mask) {
    if (ctx.profile.support_gl_warp_intrinsics) {
-        UseShuffleNv(ctx, inst, "shuffleNV", value, index, clamp, seg_mask);
+        UseShuffleNv(ctx, inst, "shuffleNV", value, index, clamp, segmentation_mask);
        return;
    }
-    const bool big_warp{ctx.profile.warp_size_potentially_larger_than_guest};
-    const auto is_upper_partition{"int(gl_SubGroupInvocationARB)>=32"};
-    const auto upper_index{fmt::format("{}?{}+32:{}", is_upper_partition, index, index)};
-    const auto upper_clamp{fmt::format("{}?{}+32:{}", is_upper_partition, clamp, clamp)};
+    const auto not_seg_mask{fmt::format("(~{})", segmentation_mask)};
+    const auto thread_id{"gl_SubGroupInvocationARB"};
+    const auto min_thread_id{ComputeMinThreadId(thread_id, segmentation_mask)};
+    const auto max_thread_id{ComputeMaxThreadId(min_thread_id, clamp, not_seg_mask)};

-    const auto not_seg_mask{fmt::format("(~{})", seg_mask)};
-    const auto min_thread_id{ComputeMinThreadId(THREAD_ID, seg_mask)};
-    const auto max_thread_id{
-        ComputeMaxThreadId(min_thread_id, big_warp ? upper_clamp : clamp, not_seg_mask)};
-
-    const auto lhs{fmt::format("({}&{})", big_warp ? upper_index : index, not_seg_mask)};
+    const auto lhs{fmt::format("({}&{})", index, not_seg_mask)};
    const auto src_thread_id{fmt::format("({})|({})", lhs, min_thread_id)};
    ctx.Add("shfl_in_bounds=int({})<=int({});", src_thread_id, max_thread_id);
    SetInBoundsFlag(ctx, inst);
@@ -146,34 +128,29 @@ void EmitShuffleIndex(EmitContext& ctx, IR::Inst& inst, std::string_view value,
 }

 void EmitShuffleUp(EmitContext& ctx, IR::Inst& inst, std::string_view value, std::string_view index,
-                   std::string_view clamp, std::string_view seg_mask) {
+                   std::string_view clamp, std::string_view segmentation_mask) {
    if (ctx.profile.support_gl_warp_intrinsics) {
-        UseShuffleNv(ctx, inst, "shuffleUpNV", value, index, clamp, seg_mask);
+        UseShuffleNv(ctx, inst, "shuffleUpNV", value, index, clamp, segmentation_mask);
        return;
    }
-    const bool big_warp{ctx.profile.warp_size_potentially_larger_than_guest};
-    const auto is_upper_partition{"int(gl_SubGroupInvocationARB)>=32"};
-    const auto upper_clamp{fmt::format("{}?{}+32:{}", is_upper_partition, clamp, clamp)};
-
-    const auto max_thread_id{GetMaxThreadId(THREAD_ID, big_warp ? upper_clamp : clamp, seg_mask)};
-    const auto src_thread_id{fmt::format("({}-{})", THREAD_ID, index)};
+    const auto thread_id{"gl_SubGroupInvocationARB"};
+    const auto max_thread_id{GetMaxThreadId(thread_id, clamp, segmentation_mask)};
+    const auto src_thread_id{fmt::format("({}-{})", thread_id, index)};
    ctx.Add("shfl_in_bounds=int({})>=int({});", src_thread_id, max_thread_id);
    SetInBoundsFlag(ctx, inst);
    ctx.AddU32("{}=shfl_in_bounds?readInvocationARB({},{}):{};", inst, value, src_thread_id, value);
 }

 void EmitShuffleDown(EmitContext& ctx, IR::Inst& inst, std::string_view value,
-                     std::string_view index, std::string_view clamp, std::string_view seg_mask) {
+                     std::string_view index, std::string_view clamp,
+                     std::string_view segmentation_mask) {
    if (ctx.profile.support_gl_warp_intrinsics) {
-        UseShuffleNv(ctx, inst, "shuffleDownNV", value, index, clamp, seg_mask);
+        UseShuffleNv(ctx, inst, "shuffleDownNV", value, index, clamp, segmentation_mask);
        return;
    }
-    const bool big_warp{ctx.profile.warp_size_potentially_larger_than_guest};
-    const auto is_upper_partition{"int(gl_SubGroupInvocationARB)>=32"};
-    const auto upper_clamp{fmt::format("{}?{}+32:{}", is_upper_partition, clamp, clamp)};
-
-    const auto max_thread_id{GetMaxThreadId(THREAD_ID, big_warp ? upper_clamp : clamp, seg_mask)};
-    const auto src_thread_id{fmt::format("({}+{})", THREAD_ID, index)};
+    const auto thread_id{"gl_SubGroupInvocationARB"};
+    const auto max_thread_id{GetMaxThreadId(thread_id, clamp, segmentation_mask)};
+    const auto src_thread_id{fmt::format("({}+{})", thread_id, index)};
    ctx.Add("shfl_in_bounds=int({})<=int({});", src_thread_id, max_thread_id);
    SetInBoundsFlag(ctx, inst);
    ctx.AddU32("{}=shfl_in_bounds?readInvocationARB({},{}):{};", inst, value, src_thread_id, value);
@@ -181,17 +158,14 @@ void EmitShuffleDown(EmitContext& ctx, IR::Inst& inst, std::string_view value,

 void EmitShuffleButterfly(EmitContext& ctx, IR::Inst& inst, std::string_view value,
                          std::string_view index, std::string_view clamp,
-                          std::string_view seg_mask) {
+                          std::string_view segmentation_mask) {
    if (ctx.profile.support_gl_warp_intrinsics) {
-        UseShuffleNv(ctx, inst, "shuffleXorNV", value, index, clamp, seg_mask);
+        UseShuffleNv(ctx, inst, "shuffleXorNV", value, index, clamp, segmentation_mask);
        return;
    }
-    const bool big_warp{ctx.profile.warp_size_potentially_larger_than_guest};
-    const auto is_upper_partition{"int(gl_SubGroupInvocationARB)>=32"};
-    const auto upper_clamp{fmt::format("{}?{}+32:{}", is_upper_partition, clamp, clamp)};
-
-    const auto max_thread_id{GetMaxThreadId(THREAD_ID, big_warp ? upper_clamp : clamp, seg_mask)};
-    const auto src_thread_id{fmt::format("({}^{})", THREAD_ID, index)};
+    const auto thread_id{"gl_SubGroupInvocationARB"};
+    const auto max_thread_id{GetMaxThreadId(thread_id, clamp, segmentation_mask)};
+    const auto src_thread_id{fmt::format("({}^{})", thread_id, index)};
    ctx.Add("shfl_in_bounds=int({})<=int({});", src_thread_id, max_thread_id);
    SetInBoundsFlag(ctx, inst);
    ctx.AddU32("{}=shfl_in_bounds?readInvocationARB({},{}):{};", inst, value, src_thread_id, value);
--- a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp
@@ -7,13 +7,8 @@

 namespace Shader::Backend::SPIRV {
 namespace {
-Id GetThreadId(EmitContext& ctx) {
-    return ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id);
-}
-
 Id WarpExtract(EmitContext& ctx, Id value) {
-    const Id thread_id{GetThreadId(ctx)};
-    const Id local_index{ctx.OpShiftRightArithmetic(ctx.U32[1], thread_id, ctx.Const(5U))};
+    const Id local_index{ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id)};
    return ctx.OpVectorExtractDynamic(ctx.U32[1], value, local_index);
 }

@@ -53,17 +48,10 @@ Id SelectValue(EmitContext& ctx, Id in_range, Id value, Id src_thread_id) {
    return ctx.OpSelect(ctx.U32[1], in_range,
                        ctx.OpSubgroupReadInvocationKHR(ctx.U32[1], value, src_thread_id), value);
 }
-
-Id GetUpperClamp(EmitContext& ctx, Id invocation_id, Id clamp) {
-    const Id thirty_two{ctx.Const(32u)};
-    const Id is_upper_partition{ctx.OpSGreaterThanEqual(ctx.U1, invocation_id, thirty_two)};
-    const Id upper_clamp{ctx.OpIAdd(ctx.U32[1], thirty_two, clamp)};
-    return ctx.OpSelect(ctx.U32[1], is_upper_partition, upper_clamp, clamp);
-}
 } // Anonymous namespace

 Id EmitLaneId(EmitContext& ctx) {
-    const Id id{GetThreadId(ctx)};
+    const Id id{ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id)};
    if (!ctx.profile.warp_size_potentially_larger_than_guest) {
        return id;
    }
@@ -135,15 +123,7 @@ Id EmitSubgroupGeMask(EmitContext& ctx) {
 Id EmitShuffleIndex(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp,
                    Id segmentation_mask) {
    const Id not_seg_mask{ctx.OpNot(ctx.U32[1], segmentation_mask)};
-    const Id thread_id{GetThreadId(ctx)};
-    if (ctx.profile.warp_size_potentially_larger_than_guest) {
-        const Id thirty_two{ctx.Const(32u)};
-        const Id is_upper_partition{ctx.OpSGreaterThanEqual(ctx.U1, thread_id, thirty_two)};
-        const Id upper_index{ctx.OpIAdd(ctx.U32[1], thirty_two, index)};
-        const Id upper_clamp{ctx.OpIAdd(ctx.U32[1], thirty_two, clamp)};
-        index = ctx.OpSelect(ctx.U32[1], is_upper_partition, upper_index, index);
-        clamp = ctx.OpSelect(ctx.U32[1], is_upper_partition, upper_clamp, clamp);
-    }
+    const Id thread_id{ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id)};
    const Id min_thread_id{ComputeMinThreadId(ctx, thread_id, segmentation_mask)};
    const Id max_thread_id{ComputeMaxThreadId(ctx, min_thread_id, clamp, not_seg_mask)};

@@ -157,10 +137,7 @@ Id EmitShuffleIndex(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id cla

 Id EmitShuffleUp(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp,
                 Id segmentation_mask) {
-    const Id thread_id{GetThreadId(ctx)};
-    if (ctx.profile.warp_size_potentially_larger_than_guest) {
-        clamp = GetUpperClamp(ctx, thread_id, clamp);
-    }
+    const Id thread_id{ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id)};
    const Id max_thread_id{GetMaxThreadId(ctx, thread_id, clamp, segmentation_mask)};
    const Id src_thread_id{ctx.OpISub(ctx.U32[1], thread_id, index)};
    const Id in_range{ctx.OpSGreaterThanEqual(ctx.U1, src_thread_id, max_thread_id)};
@@ -171,10 +148,7 @@ Id EmitShuffleUp(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp,

 Id EmitShuffleDown(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp,
                   Id segmentation_mask) {
-    const Id thread_id{GetThreadId(ctx)};
-    if (ctx.profile.warp_size_potentially_larger_than_guest) {
-        clamp = GetUpperClamp(ctx, thread_id, clamp);
-    }
+    const Id thread_id{ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id)};
    const Id max_thread_id{GetMaxThreadId(ctx, thread_id, clamp, segmentation_mask)};
    const Id src_thread_id{ctx.OpIAdd(ctx.U32[1], thread_id, index)};
    const Id in_range{ctx.OpSLessThanEqual(ctx.U1, src_thread_id, max_thread_id)};
@@ -185,10 +159,7 @@ Id EmitShuffleDown(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clam

 Id EmitShuffleButterfly(EmitContext& ctx, IR::Inst* inst, Id value, Id index, Id clamp,
                        Id segmentation_mask) {
-    const Id thread_id{GetThreadId(ctx)};
-    if (ctx.profile.warp_size_potentially_larger_than_guest) {
-        clamp = GetUpperClamp(ctx, thread_id, clamp);
-    }
+    const Id thread_id{ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id)};
    const Id max_thread_id{GetMaxThreadId(ctx, thread_id, clamp, segmentation_mask)};
    const Id src_thread_id{ctx.OpBitwiseXor(ctx.U32[1], thread_id, index)};
    const Id in_range{ctx.OpSLessThanEqual(ctx.U1, src_thread_id, max_thread_id)};
--- a/src/shader_recompiler/frontend/maxwell/structured_control_flow.cpp
+++ b/src/shader_recompiler/frontend/maxwell/structured_control_flow.cpp
@@ -20,6 +20,7 @@
 #include "shader_recompiler/frontend/maxwell/decode.h"
 #include "shader_recompiler/frontend/maxwell/structured_control_flow.h"
 #include "shader_recompiler/frontend/maxwell/translate/translate.h"
+#include "shader_recompiler/host_translate_info.h"
 #include "shader_recompiler/object_pool.h"

 namespace Shader::Maxwell {
@@ -652,7 +653,7 @@ class TranslatePass {
 public:
    TranslatePass(ObjectPool<IR::Inst>& inst_pool_, ObjectPool<IR::Block>& block_pool_,
                  ObjectPool<Statement>& stmt_pool_, Environment& env_, Statement& root_stmt,
-                  IR::AbstractSyntaxList& syntax_list_)
+                  IR::AbstractSyntaxList& syntax_list_, const HostTranslateInfo& host_info)
        : stmt_pool{stmt_pool_}, inst_pool{inst_pool_}, block_pool{block_pool_}, env{env_},
          syntax_list{syntax_list_} {
        Visit(root_stmt, nullptr, nullptr);
@@ -660,6 +661,9 @@ public:
        IR::Block& first_block{*syntax_list.front().data.block};
        IR::IREmitter ir(first_block, first_block.begin());
        ir.Prologue();
+        if (uses_demote_to_helper && host_info.needs_demote_reorder) {
+            DemoteCombinationPass();
+        }
    }

 private:
@@ -809,7 +813,14 @@ private:
            }
            case StatementType::Return: {
                ensure_block();
-                IR::IREmitter{*current_block}.Epilogue();
+                IR::Block* return_block{block_pool.Create(inst_pool)};
+                IR::IREmitter{*return_block}.Epilogue();
+                current_block->AddBranch(return_block);
+
+                auto& merge{syntax_list.emplace_back()};
+                merge.type = IR::AbstractSyntaxNode::Type::Block;
+                merge.data.block = return_block;
+
                current_block = nullptr;
                syntax_list.emplace_back().type = IR::AbstractSyntaxNode::Type::Return;
                break;
@@ -824,6 +835,7 @@ private:
                auto& merge{syntax_list.emplace_back()};
                merge.type = IR::AbstractSyntaxNode::Type::Block;
                merge.data.block = demote_block;
+                uses_demote_to_helper = true;
                break;
            }
            case StatementType::Unreachable: {
@@ -855,11 +867,117 @@ private:
        return block_pool.Create(inst_pool);
    }

+    void DemoteCombinationPass() {
+        using Type = IR::AbstractSyntaxNode::Type;
+        std::vector<IR::Block*> demote_blocks;
+        std::vector<IR::U1> demote_conds;
+        u32 num_epilogues{};
+        u32 branch_depth{};
+        for (const IR::AbstractSyntaxNode& node : syntax_list) {
+            if (node.type == Type::If) {
+                ++branch_depth;
+            }
+            if (node.type == Type::EndIf) {
+                --branch_depth;
+            }
+            if (node.type != Type::Block) {
+                continue;
+            }
+            if (branch_depth > 1) {
+                // Skip reordering nested demote branches.
+                continue;
+            }
+            for (const IR::Inst& inst : node.data.block->Instructions()) {
+                const IR::Opcode op{inst.GetOpcode()};
+                if (op == IR::Opcode::DemoteToHelperInvocation) {
+                    demote_blocks.push_back(node.data.block);
+                    break;
+                }
+                if (op == IR::Opcode::Epilogue) {
+                    ++num_epilogues;
+                }
+            }
+        }
+        if (demote_blocks.size() == 0) {
+            return;
+        }
+        if (num_epilogues > 1) {
+            LOG_DEBUG(Shader, "Combining demotes with more than one return is not implemented.");
+            return;
+        }
+        s64 last_iterator_offset{};
+        auto& asl{syntax_list};
+        for (const IR::Block* demote_block : demote_blocks) {
+            const auto start_it{asl.begin() + last_iterator_offset};
+            auto asl_it{std::find_if(start_it, asl.end(), [&](const IR::AbstractSyntaxNode& asn) {
+                return asn.type == Type::If && asn.data.if_node.body == demote_block;
+            })};
+            if (asl_it == asl.end()) {
+                // Demote without a conditional branch.
+                // No need to proceed since all fragment instances will be demoted regardless.
+                return;
+            }
+            const IR::Block* const end_if = asl_it->data.if_node.merge;
+            demote_conds.push_back(asl_it->data.if_node.cond);
+            last_iterator_offset = std::distance(asl.begin(), asl_it);
+
+            asl_it = asl.erase(asl_it);
+            asl_it = std::find_if(asl_it, asl.end(), [&](const IR::AbstractSyntaxNode& asn) {
+                return asn.type == Type::Block && asn.data.block == demote_block;
+            });
+
+            asl_it = asl.erase(asl_it);
+            asl_it = std::find_if(asl_it, asl.end(), [&](const IR::AbstractSyntaxNode& asn) {
+                return asn.type == Type::EndIf && asn.data.end_if.merge == end_if;
+            });
+            asl_it = asl.erase(asl_it);
+        }
+        const auto epilogue_func{[](const IR::AbstractSyntaxNode& asn) {
+            if (asn.type != Type::Block) {
+                return false;
+            }
+            for (const auto& inst : asn.data.block->Instructions()) {
+                if (inst.GetOpcode() == IR::Opcode::Epilogue) {
+                    return true;
+                }
+            }
+            return false;
+        }};
+        const auto reverse_it{std::find_if(asl.rbegin(), asl.rend(), epilogue_func)};
+        const auto return_block_it{(reverse_it + 1).base()};
+
+        IR::IREmitter ir{*(return_block_it - 1)->data.block};
+        IR::U1 cond(IR::Value(false));
+        for (const auto& demote_cond : demote_conds) {
+            cond = ir.LogicalOr(cond, demote_cond);
+        }
+        cond.Inst()->DestructiveAddUsage(1);
+
+        IR::AbstractSyntaxNode demote_if_node{};
+        demote_if_node.type = Type::If;
+        demote_if_node.data.if_node.cond = cond;
+        demote_if_node.data.if_node.body = demote_blocks[0];
+        demote_if_node.data.if_node.merge = return_block_it->data.block;
+
+        IR::AbstractSyntaxNode demote_node{};
+        demote_node.type = Type::Block;
+        demote_node.data.block = demote_blocks[0];
+
+        IR::AbstractSyntaxNode demote_endif_node{};
+        demote_endif_node.type = Type::EndIf;
+        demote_endif_node.data.end_if.merge = return_block_it->data.block;
+
+        asl.insert(return_block_it, demote_endif_node);
+        asl.insert(return_block_it, demote_node);
+        asl.insert(return_block_it, demote_if_node);
+    }
+
    ObjectPool<Statement>& stmt_pool;
    ObjectPool<IR::Inst>& inst_pool;
    ObjectPool<IR::Block>& block_pool;
    Environment& env;
    IR::AbstractSyntaxList& syntax_list;
+    bool uses_demote_to_helper{};

 // TODO: C++20 Remove this when all compilers support constexpr std::vector
 #if __cpp_lib_constexpr_vector >= 201907
@@ -871,12 +989,13 @@ private:
 } // Anonymous namespace

 IR::AbstractSyntaxList BuildASL(ObjectPool<IR::Inst>& inst_pool, ObjectPool<IR::Block>& block_pool,
-                                Environment& env, Flow::CFG& cfg) {
+                                Environment& env, Flow::CFG& cfg,
+                                const HostTranslateInfo& host_info) { // forwarded to TranslatePass
    ObjectPool<Statement> stmt_pool{64};
    GotoPass goto_pass{cfg, stmt_pool};
    Statement& root{goto_pass.RootStatement()};
    IR::AbstractSyntaxList syntax_list;
-    TranslatePass{inst_pool, block_pool, stmt_pool, env, root, syntax_list};
+    TranslatePass{inst_pool, block_pool, stmt_pool, env, root, syntax_list, host_info}; // unnamed temporary; NOTE(review): presumably its constructor fills syntax_list - confirm
    return syntax_list;
 }

--- a/src/shader_recompiler/frontend/maxwell/structured_control_flow.h
+++ b/src/shader_recompiler/frontend/maxwell/structured_control_flow.h
@@ -11,10 +11,13 @@
 #include "shader_recompiler/frontend/maxwell/control_flow.h"
 #include "shader_recompiler/object_pool.h"

-namespace Shader::Maxwell {
+namespace Shader { // opened separately from Maxwell so HostTranslateInfo can be declared in Shader
+struct HostTranslateInfo; // forward declaration; BuildASL only takes it by const reference
+namespace Maxwell {

 [[nodiscard]] IR::AbstractSyntaxList BuildASL(ObjectPool<IR::Inst>& inst_pool,
                                              ObjectPool<IR::Block>& block_pool, Environment& env,
-                                              Flow::CFG& cfg);
+                                              Flow::CFG& cfg, const HostTranslateInfo& host_info);

-} // namespace Shader::Maxwell
+} // namespace Maxwell
+} // namespace Shader
--- a/src/shader_recompiler/frontend/maxwell/translate_program.cpp
+++ b/src/shader_recompiler/frontend/maxwell/translate_program.cpp
@@ -130,7 +130,7 @@ void AddNVNStorageBuffers(IR::Program& program) {
 IR::Program TranslateProgram(ObjectPool<IR::Inst>& inst_pool, ObjectPool<IR::Block>& block_pool,
                             Environment& env, Flow::CFG& cfg, const HostTranslateInfo& host_info) {
    IR::Program program;
-    program.syntax_list = BuildASL(inst_pool, block_pool, env, cfg);
+    program.syntax_list = BuildASL(inst_pool, block_pool, env, cfg, host_info);
    program.blocks = GenerateBlocks(program.syntax_list);
    program.post_order_blocks = PostOrder(program.syntax_list.front());
    program.stage = env.ShaderStage();
--- a/src/shader_recompiler/host_translate_info.h
+++ b/src/shader_recompiler/host_translate_info.h
@@ -11,8 +11,9 @@ namespace Shader {

 /// Misc information about the host
 struct HostTranslateInfo {
-    bool support_float16{}; ///< True when the device supports 16-bit floats
-    bool support_int64{};   ///< True when the device supports 64-bit integers
+    bool support_float16{};      ///< True when the device supports 16-bit floats
+    bool support_int64{};        ///< True when the device supports 64-bit integers
+    bool needs_demote_reorder{}; ///< True when demote branches must be moved to the program's end
 };

 } // namespace Shader
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -156,6 +156,10 @@ public:
        return shader_backend;
    }

+    bool IsAmd() const { // Reports whether the stored vendor_name is the ATI vendor string
+        return vendor_name == "ATI Technologies Inc."; // NOTE(review): exact match; drivers reporting any other vendor string are not detected
+    }
+
 private:
    static bool TestVariableAoffi();
    static bool TestPreciseBug();
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -219,6 +219,7 @@ ShaderCache::ShaderCache(RasterizerOpenGL& rasterizer_, Core::Frontend::EmuWindo
      host_info{
          .support_float16 = false,
          .support_int64 = device.HasShaderInt64(),
+          .needs_demote_reorder = device.IsAmd(),
      } {
    if (use_asynchronous_shaders) {
        workers = CreateWorkers();
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -325,6 +325,8 @@ PipelineCache::PipelineCache(RasterizerVulkan& rasterizer_, Tegra::Engines::Maxw
    host_info = Shader::HostTranslateInfo{
        .support_float16 = device.IsFloat16Supported(),
        .support_int64 = device.IsShaderInt64Supported(),
+        .needs_demote_reorder = driver_id == VK_DRIVER_ID_AMD_PROPRIETARY_KHR ||
+                                driver_id == VK_DRIVER_ID_AMD_OPEN_SOURCE_KHR,
    };
 }
Author	SHA1	Message	Date
bunnei	e05bfd2f54	core: hle: service: buffer_queue: Improve management of KEvent.	2021-09-04 22:25:46 -07:00
bunnei	d9ce179ec2	Merge pull request #6968 from bunnei/nvflinger-event core: hle: service: nvflinger/vi: Improve management of KEvent.	2021-09-04 22:25:20 -07:00
bunnei	fb3e9314b9	core: hle: service: nvflinger/vi: Improve management of KEvent.	2021-09-03 21:53:00 -07:00
bunnei	b2572a56d3	Merge pull request #6900 from ameerj/attr-reorder structured_control_flow: Add DemoteCombinationPass	2021-09-01 17:36:26 -07:00
Mai M	25444041d0	Merge pull request #6951 from german77/log common/logging: Add missing include	2021-09-01 20:21:15 -04:00
german77	c57e0b3b24	common/logging: Add missing include	2021-09-01 19:13:33 -05:00
ameerj	907dfbea71	structured_control_flow: Skip reordering nested demote branches. Nested demote branches add complexity with combining the condition if it has not been initialized yet. Skip them for the time being.	2021-08-30 11:46:25 -04:00
ameerj	4fda7f1c82	structured_control_flow: Conditionally invoke demote reorder pass This is only needed on select drivers when a fragment shader discards/demotes.	2021-08-30 11:46:24 -04:00
ameerj	862dc2b2b3	structured_control_flow: Add DemoteCombinationPass Some drivers misread data when demotes are interleaved in the program. This moves demote branches to be checked at the end of the program. Fixes "wireframe" issue in Pokemon SwSh on some drivers	2021-08-28 11:35:25 -04:00