video_core: Implement other missing Vulkan topologies

video_core: Implement Vulkan QuadStrip topology
2022-12-26 12:20:49 +08:00 · 2022-12-26 11:37:34 +08:00
21 changed files with 299 additions and 223 deletions
--- a/src/common/thread.h
+++ b/src/common/thread.h
@@ -11,7 +11,6 @@
 #include <mutex>
 #include <thread>
 #include "common/common_types.h"
-#include "common/polyfill_thread.h"

 namespace Common {

@@ -70,7 +69,7 @@ public:
    explicit Barrier(std::size_t count_) : count(count_) {}

    /// Blocks until all "count" threads have called Sync()
-    bool Sync(std::stop_token token = {}) {
+    void Sync() {
        std::unique_lock lk{mutex};
        const std::size_t current_generation = generation;

@@ -78,16 +77,14 @@ public:
            generation++;
            waiting = 0;
            condvar.notify_all();
-            return true;
        } else {
-            CondvarWait(condvar, lk, token,
-                        [this, current_generation] { return current_generation != generation; });
-            return !token.stop_requested();
+            condvar.wait(lk,
+                         [this, current_generation] { return current_generation != generation; });
        }
    }

 private:
-    std::condition_variable_any condvar;
+    std::condition_variable condvar;
    std::mutex mutex;
    std::size_t count;
    std::size_t waiting = 0;
--- a/src/core/core.cpp
+++ b/src/core/core.cpp
@@ -389,9 +389,7 @@ struct System::Impl {
        kernel.ShutdownCores();
        cpu_manager.Shutdown();
        debugger.reset();
-        if (services) {
-            services->KillNVNFlinger();
-        }
+        services->KillNVNFlinger();
        kernel.CloseServices();
        services.reset();
        service_manager.reset();
--- a/src/core/cpu_manager.cpp
+++ b/src/core/cpu_manager.cpp
@@ -20,20 +20,23 @@ namespace Core {
 CpuManager::CpuManager(System& system_) : system{system_} {}
 CpuManager::~CpuManager() = default;

+void CpuManager::ThreadStart(std::stop_token stop_token, CpuManager& cpu_manager,
+                             std::size_t core) {
+    cpu_manager.RunThread(core);
+}
+
 void CpuManager::Initialize() {
    num_cores = is_multicore ? Core::Hardware::NUM_CPU_CORES : 1;
    gpu_barrier = std::make_unique<Common::Barrier>(num_cores + 1);

    for (std::size_t core = 0; core < num_cores; core++) {
-        core_data[core].host_thread =
-            std::jthread([this, core](std::stop_token token) { RunThread(token, core); });
+        core_data[core].host_thread = std::jthread(ThreadStart, std::ref(*this), core);
    }
 }

 void CpuManager::Shutdown() {
    for (std::size_t core = 0; core < num_cores; core++) {
        if (core_data[core].host_thread.joinable()) {
-            core_data[core].host_thread.request_stop();
            core_data[core].host_thread.join();
        }
    }
@@ -181,7 +184,7 @@ void CpuManager::ShutdownThread() {
    UNREACHABLE();
 }

-void CpuManager::RunThread(std::stop_token token, std::size_t core) {
+void CpuManager::RunThread(std::size_t core) {
    /// Initialization
    system.RegisterCoreThread(core);
    std::string name;
@@ -203,9 +206,7 @@ void CpuManager::RunThread(std::stop_token token, std::size_t core) {
    });

    // Running
-    if (!gpu_barrier->Sync(token)) {
-        return;
-    }
+    gpu_barrier->Sync();

    if (!is_async_gpu && !is_multicore) {
        system.GPU().ObtainContext();
--- a/src/core/cpu_manager.h
+++ b/src/core/cpu_manager.h
@@ -81,10 +81,12 @@ private:
    void SingleCoreRunGuestThread();
    void SingleCoreRunIdleThread();

+    static void ThreadStart(std::stop_token stop_token, CpuManager& cpu_manager, std::size_t core);
+
    void GuestActivate();
    void HandleInterrupt();
    void ShutdownThread();
-    void RunThread(std::stop_token stop_token, std::size_t core);
+    void RunThread(std::size_t core);

    struct CoreData {
        std::shared_ptr<Common::Fiber> host_context;
--- a/src/core/hle/kernel/kernel.cpp
+++ b/src/core/hle/kernel/kernel.cpp
@@ -104,16 +104,12 @@ struct KernelCore::Impl {
    }

    void CloseCurrentProcess() {
-        KProcess* old_process = current_process.exchange(nullptr);
-        if (old_process == nullptr) {
-            return;
-        }
-
-        // old_process->Close();
-        // TODO: The process should be destroyed based on accurate ref counting after
+        (*current_process).Finalize();
+        // current_process->Close();
+        // TODO: The current process should be destroyed based on accurate ref counting after
        // calling Close(). Adding a manual Destroy() call instead to avoid a memory leak.
-        old_process->Finalize();
-        old_process->Destroy();
+        (*current_process).Destroy();
+        current_process = nullptr;
    }

    void Shutdown() {
--- a/src/input_common/drivers/sdl_driver.cpp
+++ b/src/input_common/drivers/sdl_driver.cpp
@@ -16,8 +16,6 @@ Common::UUID GetGUID(SDL_Joystick* joystick) {
    const SDL_JoystickGUID guid = SDL_JoystickGetGUID(joystick);
    std::array<u8, 16> data{};
    std::memcpy(data.data(), guid.data, sizeof(data));
-    // Clear controller name crc
-    std::memset(data.data() + 2, 0, sizeof(u16));
    return Common::UUID{data};
 }
 } // Anonymous namespace
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -666,9 +666,10 @@ void BufferCache<P>::BindHostGeometryBuffers(bool is_indexed) {
        BindHostIndexBuffer();
    } else if constexpr (!HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT) {
        const auto& draw_state = maxwell3d->draw_manager->GetDrawState();
-        if (draw_state.topology == Maxwell::PrimitiveTopology::Quads) {
-            runtime.BindQuadArrayIndexBuffer(draw_state.vertex_buffer.first,
-                                             draw_state.vertex_buffer.count);
+        if (draw_state.topology == Maxwell::PrimitiveTopology::Quads ||
+            draw_state.topology == Maxwell::PrimitiveTopology::QuadStrip) {
+            runtime.BindQuadIndexBuffer(draw_state.topology, draw_state.vertex_buffer.first,
+                                        draw_state.vertex_buffer.count);
        }
    }
    BindHostVertexBuffers();
--- a/src/video_core/engines/draw_manager.cpp
+++ b/src/video_core/engines/draw_manager.cpp
@@ -46,26 +46,21 @@ void DrawManager::ProcessMethodCall(u32 method, u32 argument) {
        SetInlineIndexBuffer(regs.inline_index_4x8.index2);
        SetInlineIndexBuffer(regs.inline_index_4x8.index3);
        break;
-    case MAXWELL3D_REG_INDEX(vertex_array_instance_first):
-    case MAXWELL3D_REG_INDEX(vertex_array_instance_subsequent): {
-        LOG_WARNING(HW_GPU, "(STUBBED) called");
+    case MAXWELL3D_REG_INDEX(topology_override):
+        use_topology_override = true;
        break;
-    }
    default:
        break;
    }
 }

 void DrawManager::Clear(u32 layer_count) {
-    if (maxwell3d->ShouldExecute()) {
-        maxwell3d->rasterizer->Clear(layer_count);
-    }
+    maxwell3d->rasterizer->Clear(layer_count);
 }

 void DrawManager::DrawDeferred() {
-    if (draw_state.draw_mode != DrawMode::Instance || draw_state.instance_count == 0) {
+    if (draw_state.draw_mode != DrawMode::Instance || draw_state.instance_count == 0)
        return;
-    }
    DrawEnd(draw_state.instance_count + 1, true);
    draw_state.instance_count = 0;
 }
@@ -120,9 +115,8 @@ void DrawManager::DrawEnd(u32 instance_count, bool force_draw) {
    const auto& regs{maxwell3d->regs};
    switch (draw_state.draw_mode) {
    case DrawMode::Instance:
-        if (!force_draw) {
+        if (!force_draw)
            break;
-        }
        [[fallthrough]];
    case DrawMode::General:
        draw_state.base_instance = regs.global_base_instance_index;
@@ -162,28 +156,25 @@ void DrawManager::DrawIndexSmall(u32 argument) {
    ProcessDraw(true, 1);
 }

-void DrawManager::UpdateTopology() {
+void DrawManager::ProcessTopologyOverride() {
+    if (!use_topology_override)
+        return;
+
    const auto& regs{maxwell3d->regs};
-    switch (regs.primitive_topology_control) {
-    case PrimitiveTopologyControl::UseInBeginMethods:
+    switch (regs.topology_override) {
+    case PrimitiveTopologyOverride::None:
        break;
-    case PrimitiveTopologyControl::UseSeparateState:
-        switch (regs.topology_override) {
-        case PrimitiveTopologyOverride::None:
-            break;
-        case PrimitiveTopologyOverride::Points:
-            draw_state.topology = PrimitiveTopology::Points;
-            break;
-        case PrimitiveTopologyOverride::Lines:
-            draw_state.topology = PrimitiveTopology::Lines;
-            break;
-        case PrimitiveTopologyOverride::LineStrip:
-            draw_state.topology = PrimitiveTopology::LineStrip;
-            break;
-        default:
-            draw_state.topology = static_cast<PrimitiveTopology>(regs.topology_override);
-            break;
-        }
+    case PrimitiveTopologyOverride::Points:
+        draw_state.topology = PrimitiveTopology::Points;
+        break;
+    case PrimitiveTopologyOverride::Lines:
+        draw_state.topology = PrimitiveTopology::Lines;
+        break;
+    case PrimitiveTopologyOverride::LineStrip:
+        draw_state.topology = PrimitiveTopology::LineStrip;
+        break;
+    default:
+        draw_state.topology = static_cast<PrimitiveTopology>(regs.topology_override);
        break;
    }
 }
@@ -192,10 +183,9 @@ void DrawManager::ProcessDraw(bool draw_indexed, u32 instance_count) {
    LOG_TRACE(HW_GPU, "called, topology={}, count={}", draw_state.topology,
              draw_indexed ? draw_state.index_buffer.count : draw_state.vertex_buffer.count);

-    UpdateTopology();
+    ProcessTopologyOverride();

-    if (maxwell3d->ShouldExecute()) {
+    if (maxwell3d->ShouldExecute())
        maxwell3d->rasterizer->Draw(draw_indexed, instance_count);
-    }
 }
 } // namespace Tegra::Engines
--- a/src/video_core/engines/draw_manager.h
+++ b/src/video_core/engines/draw_manager.h
@@ -10,7 +10,6 @@ class RasterizerInterface;
 }

 namespace Tegra::Engines {
-using PrimitiveTopologyControl = Maxwell3D::Regs::PrimitiveTopologyControl;
 using PrimitiveTopology = Maxwell3D::Regs::PrimitiveTopology;
 using PrimitiveTopologyOverride = Maxwell3D::Regs::PrimitiveTopologyOverride;
 using IndexBuffer = Maxwell3D::Regs::IndexBuffer;
@@ -59,11 +58,12 @@ private:

    void DrawIndexSmall(u32 argument);

-    void UpdateTopology();
+    void ProcessTopologyOverride();

    void ProcessDraw(bool draw_indexed, u32 instance_count);

    Maxwell3D* maxwell3d{};
    State draw_state{};
+    bool use_topology_override{};
 };
 } // namespace Tegra::Engines
--- a/src/video_core/host_shaders/vulkan_quad_indexed.comp
+++ b/src/video_core/host_shaders/vulkan_quad_indexed.comp
@@ -16,6 +16,7 @@ layout (std430, set = 0, binding = 1) writeonly buffer OutputBuffer {
 layout (push_constant) uniform PushConstants {
    uint base_vertex;
    int index_shift; // 0: uint8, 1: uint16, 2: uint32
+    int is_strip; // 0: quads 1: quadstrip
 };

 void main() {
@@ -28,9 +29,10 @@ void main() {
    int flipped_shift = 2 - index_shift;
    int mask = (1 << flipped_shift) - 1;

-    const int quad_swizzle[6] = int[](0, 1, 2, 0, 2, 3);
+    const int quads_swizzle[6] = int[](0, 1, 2, 0, 2, 3);
+    const int quad_strip_swizzle[6] = int[](0, 3, 1, 0, 2, 3);
    for (uint vertex = 0; vertex < 6; ++vertex) {
-        int offset = primitive * 4 + quad_swizzle[vertex];
+        int offset = (is_strip == 0 ? primitive * 4 + quads_swizzle[vertex] : primitive * 2 + quad_strip_swizzle[vertex]);
        int int_offset = offset >> flipped_shift;
        int bit_offset = (offset & mask) * index_size;
        uint packed_input = input_indexes[int_offset];
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -138,6 +138,9 @@ void RasterizerOpenGL::LoadDiskResources(u64 title_id, std::stop_token stop_load

 void RasterizerOpenGL::Clear(u32 layer_count) {
    MICROPROFILE_SCOPE(OpenGL_Clears);
+    if (!maxwell3d->ShouldExecute()) {
+        return;
+    }

    const auto& regs = maxwell3d->regs;
    bool use_color{};
--- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
@@ -301,6 +301,8 @@ VkPrimitiveTopology PrimitiveTopology([[maybe_unused]] const Device& device,
        return VK_PRIMITIVE_TOPOLOGY_POINT_LIST;
    case Maxwell::PrimitiveTopology::Lines:
        return VK_PRIMITIVE_TOPOLOGY_LINE_LIST;
+    case Maxwell::PrimitiveTopology::LineLoop:
+        return VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST;
    case Maxwell::PrimitiveTopology::LineStrip:
        return VK_PRIMITIVE_TOPOLOGY_LINE_STRIP;
    case Maxwell::PrimitiveTopology::Triangles:
@@ -309,15 +311,28 @@ VkPrimitiveTopology PrimitiveTopology([[maybe_unused]] const Device& device,
        return VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP;
    case Maxwell::PrimitiveTopology::TriangleFan:
        return VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN;
+    case Maxwell::PrimitiveTopology::LinesAdjacency:
+        return VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY;
+    case Maxwell::PrimitiveTopology::LineStripAdjacency:
+        return VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY;
+    case Maxwell::PrimitiveTopology::TrianglesAdjacency:
+        return VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY;
+    case Maxwell::PrimitiveTopology::TriangleStripAdjacency:
+        return VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY;
    case Maxwell::PrimitiveTopology::Quads:
-        // TODO(Rodrigo): Use VK_PRIMITIVE_TOPOLOGY_QUAD_LIST_EXT whenever it releases
+    case Maxwell::PrimitiveTopology::QuadStrip:
+        // TODO: Use VK_PRIMITIVE_TOPOLOGY_QUAD_LIST_EXT/VK_PRIMITIVE_TOPOLOGY_QUAD_STRIP_EXT
+        // whenever they are released
        return VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST;
    case Maxwell::PrimitiveTopology::Patches:
        return VK_PRIMITIVE_TOPOLOGY_PATCH_LIST;
-    default:
-        UNIMPLEMENTED_MSG("Unimplemented topology={}", topology);
-        return {};
+    case Maxwell::PrimitiveTopology::Polygon:
+        LOG_WARNING(Render_Vulkan, "Draw mode is Polygon with a polygon mode of lines should be a "
+                                   "single body and not a bunch of triangles.");
+        return VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN;
    }
+    UNIMPLEMENTED_MSG("Unimplemented topology={}", topology);
+    return {};
 }

 VkFormat VertexFormat(const Device& device, Maxwell::VertexAttribute::Type type,
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
@@ -51,15 +51,6 @@ size_t BytesPerIndex(VkIndexType index_type) {
    }
 }

-template <typename T>
-std::array<T, 6> MakeQuadIndices(u32 quad, u32 first) {
-    std::array<T, 6> indices{0, 1, 2, 0, 2, 3};
-    for (T& index : indices) {
-        index = static_cast<T>(first + index + quad * 4);
-    }
-    return indices;
-}
-
 vk::Buffer CreateBuffer(const Device& device, u64 size) {
    VkBufferUsageFlags flags =
        VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
@@ -123,6 +114,187 @@ VkBufferView Buffer::View(u32 offset, u32 size, VideoCore::Surface::PixelFormat
    return *views.back().handle;
 }

+class QuadIndexBuffer {
+public:
+    QuadIndexBuffer(const Device& device_, MemoryAllocator& memory_allocator_,
+                    Scheduler& scheduler_, StagingBufferPool& staging_pool_)
+        : device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_},
+          staging_pool{staging_pool_} {}
+
+    virtual ~QuadIndexBuffer() = default;
+
+    void UpdateBuffer(u32 num_indices_) {
+        if (num_indices_ <= num_indices) {
+            return;
+        }
+
+        scheduler.Finish();
+
+        num_indices = num_indices_;
+        index_type = IndexTypeFromNumElements(device, num_indices);
+
+        const u32 num_quads = GetQuadsNum(num_indices);
+        const u32 num_triangle_indices = num_quads * 6;
+        const u32 num_first_offset_copies = 4;
+        const size_t bytes_per_index = BytesPerIndex(index_type);
+        const size_t size_bytes = num_triangle_indices * bytes_per_index * num_first_offset_copies;
+        buffer = device.GetLogical().CreateBuffer(VkBufferCreateInfo{
+            .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
+            .pNext = nullptr,
+            .flags = 0,
+            .size = size_bytes,
+            .usage = VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
+            .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
+            .queueFamilyIndexCount = 0,
+            .pQueueFamilyIndices = nullptr,
+        });
+        if (device.HasDebuggingToolAttached()) {
+            buffer.SetObjectNameEXT("Quad LUT");
+        }
+        memory_commit = memory_allocator.Commit(buffer, MemoryUsage::DeviceLocal);
+
+        const StagingBufferRef staging = staging_pool.Request(size_bytes, MemoryUsage::Upload);
+        u8* staging_data = staging.mapped_span.data();
+        const size_t quad_size = bytes_per_index * 6;
+
+        for (u32 first = 0; first < num_first_offset_copies; ++first) {
+            for (u32 quad = 0; quad < num_quads; ++quad) {
+                MakeAndUpdateIndices(staging_data, quad_size, quad, first);
+                staging_data += quad_size;
+            }
+        }
+
+        scheduler.RequestOutsideRenderPassOperationContext();
+        scheduler.Record([src_buffer = staging.buffer, src_offset = staging.offset,
+                          dst_buffer = *buffer, size_bytes](vk::CommandBuffer cmdbuf) {
+            const VkBufferCopy copy{
+                .srcOffset = src_offset,
+                .dstOffset = 0,
+                .size = size_bytes,
+            };
+            const VkBufferMemoryBarrier write_barrier{
+                .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
+                .pNext = nullptr,
+                .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
+                .dstAccessMask = VK_ACCESS_INDEX_READ_BIT,
+                .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+                .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+                .buffer = dst_buffer,
+                .offset = 0,
+                .size = size_bytes,
+            };
+            cmdbuf.CopyBuffer(src_buffer, dst_buffer, copy);
+            cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT,
+                                   VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, write_barrier);
+        });
+    }
+
+    void BindBuffer(u32 first) {
+        const VkIndexType index_type_ = index_type;
+        const size_t sub_first_offset = static_cast<size_t>(first % 4) * GetQuadsNum(num_indices);
+        const size_t offset =
+            (sub_first_offset + GetQuadsNum(first)) * 6ULL * BytesPerIndex(index_type);
+        scheduler.Record([buffer = *buffer, index_type_, offset](vk::CommandBuffer cmdbuf) {
+            cmdbuf.BindIndexBuffer(buffer, offset, index_type_);
+        });
+    }
+
+protected:
+    virtual u32 GetQuadsNum(u32 num_indices) const = 0;
+
+    virtual void MakeAndUpdateIndices(u8* staging_data, size_t quad_size, u32 quad, u32 first) = 0;
+
+    const Device& device;
+    MemoryAllocator& memory_allocator;
+    Scheduler& scheduler;
+    StagingBufferPool& staging_pool;
+
+    vk::Buffer buffer{};
+    MemoryCommit memory_commit{};
+    VkIndexType index_type{};
+    u32 num_indices = 0;
+};
+
+class QuadArrayIndexBuffer : public QuadIndexBuffer {
+public:
+    QuadArrayIndexBuffer(const Device& device_, MemoryAllocator& memory_allocator_,
+                         Scheduler& scheduler_, StagingBufferPool& staging_pool_)
+        : QuadIndexBuffer(device_, memory_allocator_, scheduler_, staging_pool_) {}
+
+    ~QuadArrayIndexBuffer() = default;
+
+private:
+    u32 GetQuadsNum(u32 num_indices_) const override {
+        return num_indices_ / 4;
+    }
+
+    template <typename T>
+    static std::array<T, 6> MakeIndices(u32 quad, u32 first) {
+        std::array<T, 6> indices{0, 1, 2, 0, 2, 3};
+        for (T& index : indices) {
+            index = static_cast<T>(first + index + quad * 4);
+        }
+        return indices;
+    }
+
+    void MakeAndUpdateIndices(u8* staging_data, size_t quad_size, u32 quad, u32 first) {
+        switch (index_type) {
+        case VK_INDEX_TYPE_UINT8_EXT:
+            std::memcpy(staging_data, MakeIndices<u8>(quad, first).data(), quad_size);
+            break;
+        case VK_INDEX_TYPE_UINT16:
+            std::memcpy(staging_data, MakeIndices<u16>(quad, first).data(), quad_size);
+            break;
+        case VK_INDEX_TYPE_UINT32:
+            std::memcpy(staging_data, MakeIndices<u32>(quad, first).data(), quad_size);
+            break;
+        default:
+            ASSERT(false);
+            break;
+        }
+    }
+};
+
+class QuadStripIndexBuffer : public QuadIndexBuffer {
+public:
+    QuadStripIndexBuffer(const Device& device_, MemoryAllocator& memory_allocator_,
+                         Scheduler& scheduler_, StagingBufferPool& staging_pool_)
+        : QuadIndexBuffer(device_, memory_allocator_, scheduler_, staging_pool_) {}
+
+    ~QuadStripIndexBuffer() = default;
+
+private:
+    u32 GetQuadsNum(u32 num_indices_) const override {
+        return num_indices_ >= 4 ? (num_indices_ - 2) / 2 : 0;
+    }
+
+    template <typename T>
+    static std::array<T, 6> MakeIndices(u32 quad, u32 first) {
+        std::array<T, 6> indices{0, 3, 1, 0, 2, 3};
+        for (T& index : indices) {
+            index = static_cast<T>(first + index + quad * 2);
+        }
+        return indices;
+    }
+
+    void MakeAndUpdateIndices(u8* staging_data, size_t quad_size, u32 quad, u32 first) {
+        switch (index_type) {
+        case VK_INDEX_TYPE_UINT8_EXT:
+            std::memcpy(staging_data, MakeIndices<u8>(quad, first).data(), quad_size);
+            break;
+        case VK_INDEX_TYPE_UINT16:
+            std::memcpy(staging_data, MakeIndices<u16>(quad, first).data(), quad_size);
+            break;
+        case VK_INDEX_TYPE_UINT32:
+            std::memcpy(staging_data, MakeIndices<u32>(quad, first).data(), quad_size);
+            break;
+        default:
+            ASSERT(false);
+            break;
+        }
+    }
+};
+
 BufferCacheRuntime::BufferCacheRuntime(const Device& device_, MemoryAllocator& memory_allocator_,
                                       Scheduler& scheduler_, StagingBufferPool& staging_pool_,
                                       UpdateDescriptorQueue& update_descriptor_queue_,
@@ -130,7 +302,12 @@ BufferCacheRuntime::BufferCacheRuntime(const Device& device_, MemoryAllocator& m
    : device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_},
      staging_pool{staging_pool_}, update_descriptor_queue{update_descriptor_queue_},
      uint8_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue),
-      quad_index_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue) {}
+      quad_index_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue) {
+    quad_array_index_buffer = std::make_shared<QuadArrayIndexBuffer>(device_, memory_allocator_,
+                                                                     scheduler_, staging_pool_);
+    quad_strip_index_buffer = std::make_shared<QuadStripIndexBuffer>(device_, memory_allocator_,
+                                                                     scheduler_, staging_pool_);
+}

 StagingBufferRef BufferCacheRuntime::UploadStagingBuffer(size_t size) {
    return staging_pool.Request(size, MemoryUsage::Upload);
@@ -245,10 +422,11 @@ void BufferCacheRuntime::BindIndexBuffer(PrimitiveTopology topology, IndexFormat
    VkIndexType vk_index_type = MaxwellToVK::IndexFormat(index_format);
    VkDeviceSize vk_offset = offset;
    VkBuffer vk_buffer = buffer;
-    if (topology == PrimitiveTopology::Quads) {
+    if (topology == PrimitiveTopology::Quads || topology == PrimitiveTopology::QuadStrip) {
        vk_index_type = VK_INDEX_TYPE_UINT32;
        std::tie(vk_buffer, vk_offset) =
-            quad_index_pass.Assemble(index_format, num_indices, base_vertex, buffer, offset);
+            quad_index_pass.Assemble(index_format, num_indices, base_vertex, buffer, offset,
+                                     topology == PrimitiveTopology::QuadStrip);
    } else if (vk_index_type == VK_INDEX_TYPE_UINT8_EXT && !device.IsExtIndexTypeUint8Supported()) {
        vk_index_type = VK_INDEX_TYPE_UINT16;
        std::tie(vk_buffer, vk_offset) = uint8_pass.Assemble(num_indices, buffer, offset);
@@ -263,7 +441,7 @@ void BufferCacheRuntime::BindIndexBuffer(PrimitiveTopology topology, IndexFormat
    });
 }

-void BufferCacheRuntime::BindQuadArrayIndexBuffer(u32 first, u32 count) {
+void BufferCacheRuntime::BindQuadIndexBuffer(PrimitiveTopology topology, u32 first, u32 count) {
    if (count == 0) {
        ReserveNullBuffer();
        scheduler.Record([this](vk::CommandBuffer cmdbuf) {
@@ -271,23 +449,18 @@ void BufferCacheRuntime::BindQuadArrayIndexBuffer(u32 first, u32 count) {
        });
        return;
    }
-    ReserveQuadArrayLUT(first + count, true);

-    // The LUT has the indices 0, 1, 2, and 3 copied as an array
-    // To apply these 'first' offsets we can apply an offset based on the modulus.
-    const VkIndexType index_type = quad_array_lut_index_type;
-    const size_t sub_first_offset = static_cast<size_t>(first % 4) * (current_num_indices / 4);
-    const size_t offset = (sub_first_offset + first / 4) * 6ULL * BytesPerIndex(index_type);
-    scheduler.Record([buffer = *quad_array_lut, index_type, offset](vk::CommandBuffer cmdbuf) {
-        cmdbuf.BindIndexBuffer(buffer, offset, index_type);
-    });
+    if (topology == PrimitiveTopology::Quads) {
+        quad_array_index_buffer->UpdateBuffer(first + count);
+        quad_array_index_buffer->BindBuffer(first);
+    } else if (topology == PrimitiveTopology::QuadStrip) {
+        quad_strip_index_buffer->UpdateBuffer(first + count);
+        quad_strip_index_buffer->BindBuffer(first);
+    }
 }

 void BufferCacheRuntime::BindVertexBuffer(u32 index, VkBuffer buffer, u32 offset, u32 size,
                                          u32 stride) {
-    if (index >= device.GetMaxVertexInputBindings()) {
-        return;
-    }
    if (device.IsExtExtendedDynamicStateSupported()) {
        scheduler.Record([index, buffer, offset, size, stride](vk::CommandBuffer cmdbuf) {
            const VkDeviceSize vk_offset = buffer != VK_NULL_HANDLE ? offset : 0;
@@ -323,83 +496,6 @@ void BufferCacheRuntime::BindTransformFeedbackBuffer(u32 index, VkBuffer buffer,
    });
 }

-void BufferCacheRuntime::ReserveQuadArrayLUT(u32 num_indices, bool wait_for_idle) {
-    if (num_indices <= current_num_indices) {
-        return;
-    }
-    if (wait_for_idle) {
-        scheduler.Finish();
-    }
-    current_num_indices = num_indices;
-    quad_array_lut_index_type = IndexTypeFromNumElements(device, num_indices);
-
-    const u32 num_quads = num_indices / 4;
-    const u32 num_triangle_indices = num_quads * 6;
-    const u32 num_first_offset_copies = 4;
-    const size_t bytes_per_index = BytesPerIndex(quad_array_lut_index_type);
-    const size_t size_bytes = num_triangle_indices * bytes_per_index * num_first_offset_copies;
-    quad_array_lut = device.GetLogical().CreateBuffer(VkBufferCreateInfo{
-        .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
-        .pNext = nullptr,
-        .flags = 0,
-        .size = size_bytes,
-        .usage = VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
-        .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
-        .queueFamilyIndexCount = 0,
-        .pQueueFamilyIndices = nullptr,
-    });
-    if (device.HasDebuggingToolAttached()) {
-        quad_array_lut.SetObjectNameEXT("Quad LUT");
-    }
-    quad_array_lut_commit = memory_allocator.Commit(quad_array_lut, MemoryUsage::DeviceLocal);
-
-    const StagingBufferRef staging = staging_pool.Request(size_bytes, MemoryUsage::Upload);
-    u8* staging_data = staging.mapped_span.data();
-    const size_t quad_size = bytes_per_index * 6;
-    for (u32 first = 0; first < num_first_offset_copies; ++first) {
-        for (u32 quad = 0; quad < num_quads; ++quad) {
-            switch (quad_array_lut_index_type) {
-            case VK_INDEX_TYPE_UINT8_EXT:
-                std::memcpy(staging_data, MakeQuadIndices<u8>(quad, first).data(), quad_size);
-                break;
-            case VK_INDEX_TYPE_UINT16:
-                std::memcpy(staging_data, MakeQuadIndices<u16>(quad, first).data(), quad_size);
-                break;
-            case VK_INDEX_TYPE_UINT32:
-                std::memcpy(staging_data, MakeQuadIndices<u32>(quad, first).data(), quad_size);
-                break;
-            default:
-                ASSERT(false);
-                break;
-            }
-            staging_data += quad_size;
-        }
-    }
-    scheduler.RequestOutsideRenderPassOperationContext();
-    scheduler.Record([src_buffer = staging.buffer, src_offset = staging.offset,
-                      dst_buffer = *quad_array_lut, size_bytes](vk::CommandBuffer cmdbuf) {
-        const VkBufferCopy copy{
-            .srcOffset = src_offset,
-            .dstOffset = 0,
-            .size = size_bytes,
-        };
-        const VkBufferMemoryBarrier write_barrier{
-            .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
-            .pNext = nullptr,
-            .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
-            .dstAccessMask = VK_ACCESS_INDEX_READ_BIT,
-            .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-            .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
-            .buffer = dst_buffer,
-            .offset = 0,
-            .size = size_bytes,
-        };
-        cmdbuf.CopyBuffer(src_buffer, dst_buffer, copy);
-        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_VERTEX_INPUT_BIT,
-                               0, write_barrier);
-    });
-}
-
 void BufferCacheRuntime::ReserveNullBuffer() {
    if (null_buffer) {
        return;
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.h
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h
@@ -50,6 +50,9 @@ private:
    std::vector<BufferView> views;
 };

+class QuadArrayIndexBuffer;
+class QuadStripIndexBuffer;
+
 class BufferCacheRuntime {
    friend Buffer;

@@ -86,7 +89,7 @@ public:
    void BindIndexBuffer(PrimitiveTopology topology, IndexFormat index_format, u32 num_indices,
                         u32 base_vertex, VkBuffer buffer, u32 offset, u32 size);

-    void BindQuadArrayIndexBuffer(u32 first, u32 count);
+    void BindQuadIndexBuffer(PrimitiveTopology topology, u32 first, u32 count);

    void BindVertexBuffer(u32 index, VkBuffer buffer, u32 offset, u32 size, u32 stride);

@@ -118,8 +121,6 @@ private:
        update_descriptor_queue.AddBuffer(buffer, offset, size);
    }

-    void ReserveQuadArrayLUT(u32 num_indices, bool wait_for_idle);
-
    void ReserveNullBuffer();

    const Device& device;
@@ -128,10 +129,8 @@ private:
    StagingBufferPool& staging_pool;
    UpdateDescriptorQueue& update_descriptor_queue;

-    vk::Buffer quad_array_lut;
-    MemoryCommit quad_array_lut_commit;
-    VkIndexType quad_array_lut_index_type{};
-    u32 current_num_indices = 0;
+    std::shared_ptr<QuadArrayIndexBuffer> quad_array_index_buffer;
+    std::shared_ptr<QuadStripIndexBuffer> quad_strip_index_buffer;

    vk::Buffer null_buffer;
    MemoryCommit null_buffer_commit;
--- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp
@@ -245,7 +245,7 @@ QuadIndexedPass::QuadIndexedPass(const Device& device_, Scheduler& scheduler_,
                                 UpdateDescriptorQueue& update_descriptor_queue_)
    : ComputePass(device_, descriptor_pool_, INPUT_OUTPUT_DESCRIPTOR_SET_BINDINGS,
                  INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLATE, INPUT_OUTPUT_BANK_INFO,
-                  COMPUTE_PUSH_CONSTANT_RANGE<sizeof(u32) * 2>, VULKAN_QUAD_INDEXED_COMP_SPV),
+                  COMPUTE_PUSH_CONSTANT_RANGE<sizeof(u32) * 3>, VULKAN_QUAD_INDEXED_COMP_SPV),
      scheduler{scheduler_}, staging_buffer_pool{staging_buffer_pool_},
      update_descriptor_queue{update_descriptor_queue_} {}

@@ -253,7 +253,7 @@ QuadIndexedPass::~QuadIndexedPass() = default;

 std::pair<VkBuffer, VkDeviceSize> QuadIndexedPass::Assemble(
    Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format, u32 num_vertices, u32 base_vertex,
-    VkBuffer src_buffer, u32 src_offset) {
+    VkBuffer src_buffer, u32 src_offset, bool is_strip) {
    const u32 index_shift = [index_format] {
        switch (index_format) {
        case Tegra::Engines::Maxwell3D::Regs::IndexFormat::UnsignedByte:
@@ -267,7 +267,7 @@ std::pair<VkBuffer, VkDeviceSize> QuadIndexedPass::Assemble(
        return 2;
    }();
    const u32 input_size = num_vertices << index_shift;
-    const u32 num_tri_vertices = (num_vertices / 4) * 6;
+    const u32 num_tri_vertices = (is_strip ? (num_vertices - 2) / 2 : num_vertices / 4) * 6;

    const std::size_t staging_size = num_tri_vertices * sizeof(u32);
    const auto staging = staging_buffer_pool.Request(staging_size, MemoryUsage::DeviceLocal);
@@ -278,8 +278,8 @@ std::pair<VkBuffer, VkDeviceSize> QuadIndexedPass::Assemble(
    const void* const descriptor_data{update_descriptor_queue.UpdateData()};

    scheduler.RequestOutsideRenderPassOperationContext();
-    scheduler.Record([this, descriptor_data, num_tri_vertices, base_vertex,
-                      index_shift](vk::CommandBuffer cmdbuf) {
+    scheduler.Record([this, descriptor_data, num_tri_vertices, base_vertex, index_shift,
+                      is_strip](vk::CommandBuffer cmdbuf) {
        static constexpr u32 DISPATCH_SIZE = 1024;
        static constexpr VkMemoryBarrier WRITE_BARRIER{
            .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
@@ -287,7 +287,7 @@ std::pair<VkBuffer, VkDeviceSize> QuadIndexedPass::Assemble(
            .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
            .dstAccessMask = VK_ACCESS_INDEX_READ_BIT,
        };
-        const std::array<u32, 2> push_constants{base_vertex, index_shift};
+        const std::array<u32, 3> push_constants{base_vertex, index_shift, is_strip ? 1u : 0u};
        const VkDescriptorSet set = descriptor_allocator.Commit();
        device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data);
        cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
--- a/src/video_core/renderer_vulkan/vk_compute_pass.h
+++ b/src/video_core/renderer_vulkan/vk_compute_pass.h
@@ -74,7 +74,7 @@ public:

    std::pair<VkBuffer, VkDeviceSize> Assemble(
        Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format, u32 num_vertices,
-        u32 base_vertex, VkBuffer src_buffer, u32 src_offset);
+        u32 base_vertex, VkBuffer src_buffer, u32 src_offset, bool is_strip);

 private:
    Scheduler& scheduler;
--- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
@@ -529,9 +529,7 @@ void GraphicsPipeline::MakePipeline(VkRenderPass render_pass) {
    static_vector<VkVertexInputBindingDivisorDescriptionEXT, 32> vertex_binding_divisors;
    static_vector<VkVertexInputAttributeDescription, 32> vertex_attributes;
    if (key.state.dynamic_vertex_input) {
-        const size_t num_vertex_arrays = std::min(
-            key.state.attributes.size(), static_cast<size_t>(device.GetMaxVertexInputBindings()));
-        for (size_t index = 0; index < num_vertex_arrays; ++index) {
+        for (size_t index = 0; index < key.state.attributes.size(); ++index) {
            const u32 type = key.state.DynamicAttributeType(index);
            if (!stage_infos[0].loads.Generic(index) || type == 0) {
                continue;
@@ -553,9 +551,7 @@ void GraphicsPipeline::MakePipeline(VkRenderPass render_pass) {
            });
        }
    } else {
-        const size_t num_vertex_arrays = std::min(
-            Maxwell::NumVertexArrays, static_cast<size_t>(device.GetMaxVertexInputBindings()));
-        for (size_t index = 0; index < num_vertex_arrays; ++index) {
+        for (size_t index = 0; index < Maxwell::NumVertexArrays; ++index) {
            const bool instanced = key.state.binding_divisors[index] != 0;
            const auto rate =
                instanced ? VK_VERTEX_INPUT_RATE_INSTANCE : VK_VERTEX_INPUT_RATE_VERTEX;
@@ -584,8 +580,6 @@ void GraphicsPipeline::MakePipeline(VkRenderPass render_pass) {
            });
        }
    }
-    ASSERT(vertex_attributes.size() <= device.GetMaxVertexInputAttributes());
-
    VkPipelineVertexInputStateCreateInfo vertex_input_ci{
        .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
        .pNext = nullptr,
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -341,15 +341,6 @@ PipelineCache::PipelineCache(RasterizerVulkan& rasterizer_, const Device& device
        .support_snorm_render_buffer = true,
        .support_viewport_index_layer = device.IsExtShaderViewportIndexLayerSupported(),
    };
-
-    if (device.GetMaxVertexInputAttributes() < Maxwell::NumVertexAttributes) {
-        LOG_WARNING(Render_Vulkan, "maxVertexInputAttributes is too low: {} < {}",
-                    device.GetMaxVertexInputAttributes(), Maxwell::NumVertexAttributes);
-    }
-    if (device.GetMaxVertexInputBindings() < Maxwell::NumVertexArrays) {
-        LOG_WARNING(Render_Vulkan, "maxVertexInputBindings is too low: {} < {}",
-                    device.GetMaxVertexInputBindings(), Maxwell::NumVertexArrays);
-    }
 }

 PipelineCache::~PipelineCache() = default;
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -138,12 +138,16 @@ DrawParams MakeDrawParams(const MaxwellDrawState& draw_state, u32 num_instances,
        .first_index = is_indexed ? draw_state.index_buffer.first : 0,
        .is_indexed = is_indexed,
    };
+    // 6 triangle vertices per quad, base vertex is part of the index
+    // See BindQuadIndexBuffer for more details
    if (draw_state.topology == Maxwell::PrimitiveTopology::Quads) {
-        // 6 triangle vertices per quad, base vertex is part of the index
-        // See BindQuadArrayIndexBuffer for more details
        params.num_vertices = (params.num_vertices / 4) * 6;
        params.base_vertex = 0;
        params.is_indexed = true;
+    } else if (draw_state.topology == Maxwell::PrimitiveTopology::QuadStrip) {
+        params.num_vertices = (params.num_vertices - 2) / 2 * 6;
+        params.base_vertex = 0;
+        params.is_indexed = true;
    }
    return params;
 }
@@ -216,6 +220,9 @@ void RasterizerVulkan::Draw(bool is_indexed, u32 instance_count) {
 void RasterizerVulkan::Clear(u32 layer_count) {
    MICROPROFILE_SCOPE(Vulkan_Clearing);

+    if (!maxwell3d->ShouldExecute()) {
+        return;
+    }
    FlushWork();

    query_cache.UpdateCounters();
--- a/src/video_core/vulkan_common/vulkan_device.cpp
+++ b/src/video_core/vulkan_common/vulkan_device.cpp
@@ -1380,10 +1380,6 @@ void Device::SetupFeatures() {
    is_shader_storage_image_multisample = features.shaderStorageImageMultisample;
    is_blit_depth_stencil_supported = TestDepthStencilBlits();
    is_optimal_astc_supported = IsOptimalAstcSupported(features);
-
-    const VkPhysicalDeviceLimits& limits{properties.limits};
-    max_vertex_input_attributes = limits.maxVertexInputAttributes;
-    max_vertex_input_bindings = limits.maxVertexInputBindings;
 }

 void Device::SetupProperties() {
--- a/src/video_core/vulkan_common/vulkan_device.h
+++ b/src/video_core/vulkan_common/vulkan_device.h
@@ -368,14 +368,6 @@ public:
        return must_emulate_bgr565;
    }

-    u32 GetMaxVertexInputAttributes() const {
-        return max_vertex_input_attributes;
-    }
-
-    u32 GetMaxVertexInputBindings() const {
-        return max_vertex_input_bindings;
-    }
-
 private:
    /// Checks if the physical device is suitable.
    void CheckSuitability(bool requires_swapchain) const;
@@ -475,8 +467,6 @@ private:
    bool supports_d24_depth{};              ///< Supports D24 depth buffers.
    bool cant_blit_msaa{};                  ///< Does not support MSAA<->MSAA blitting.
    bool must_emulate_bgr565{};             ///< Emulates BGR565 by swizzling RGB565 format.
-    u32 max_vertex_input_attributes{};      ///< Max vertex input attributes in pipeline
-    u32 max_vertex_input_bindings{};        ///< Max vertex input buffers in pipeline

    // Telemetry parameters
    std::string vendor_name;                       ///< Device's driver name.
Author	SHA1	Message	Date
FengChen	6a397bc8ed	video_core: Implement other missing vulkan topology	2022-12-26 12:20:49 +08:00
FengChen	86d5b4e556	video_core: Implement vulkan QuadStrip topology	2022-12-26 11:37:34 +08:00