Compare commits

...

9 Commits

Author SHA1 Message Date
ReinUsesLisp
e3ec288568 gl_staging_buffer: Use glGetSynciv instead of glClientWaitSync
glGetSynciv is the intended API to query fence's signaled status.
2019-09-04 01:50:51 -03:00
ReinUsesLisp
58659a702d gl_staging_buffer: Add missing GL_CLIENT_STORAGE_BIT
Fixes a performance regression where the OpenGL server was transferring
data from server to client halting execution on texture usage up to 17ms
per frame.
2019-09-04 01:50:51 -03:00
ReinUsesLisp
b65871407e staging_buffer_cache: Remove [[nodiscard]] 2019-09-04 01:50:51 -03:00
ReinUsesLisp
f0161d2799 gl_texture_cache: clang-format fixes 2019-09-04 01:50:51 -03:00
ReinUsesLisp
6728b9b252 gl_staging_buffer: Move class declarations to cpp file and add commentaries 2019-09-04 01:50:51 -03:00
ReinUsesLisp
7ac5d2bac8 staging_buffer_cache: Disable OpenGL staging buffers for Intel proprietary drivers 2019-09-04 01:50:51 -03:00
ReinUsesLisp
256caec9ef surface_base: Minor style changes 2019-09-04 01:50:51 -03:00
ReinUsesLisp
f733682aa7 texture_cache: Implement asynchronous flushing 2019-09-04 01:50:51 -03:00
ReinUsesLisp
e6e75f1c12 staging_buffer_cache: Use OpenGL buffers to upload/download textures 2019-09-04 01:50:51 -03:00
12 changed files with 477 additions and 123 deletions

View File

@@ -64,6 +64,8 @@ add_library(video_core STATIC
renderer_opengl/gl_shader_manager.h
renderer_opengl/gl_shader_util.cpp
renderer_opengl/gl_shader_util.h
renderer_opengl/gl_staging_buffer.cpp
renderer_opengl/gl_staging_buffer.h
renderer_opengl/gl_state.cpp
renderer_opengl/gl_state.h
renderer_opengl/gl_stream_buffer.cpp
@@ -114,6 +116,7 @@ add_library(video_core STATIC
shader/shader_ir.cpp
shader/shader_ir.h
shader/track.cpp
staging_buffer_cache.h
surface.cpp
surface.h
texture_cache/surface_base.cpp

View File

@@ -4,6 +4,8 @@
#include <array>
#include <cstddef>
#include <string_view>
#include <glad/glad.h>
#include "common/logging/log.h"
@@ -23,6 +25,9 @@ T GetInteger(GLenum pname) {
} // Anonymous namespace
Device::Device() {
const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
const bool intel_proprietary = vendor == "Intel";
uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);
shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS);
@@ -32,6 +37,7 @@ Device::Device() {
has_vertex_viewport_layer = GLAD_GL_ARB_shader_viewport_layer_array;
has_variable_aoffi = TestVariableAoffi();
has_component_indexing_bug = TestComponentIndexingBug();
has_broken_pbo_streaming = intel_proprietary;
}
Device::Device(std::nullptr_t) {
@@ -42,6 +48,7 @@ Device::Device(std::nullptr_t) {
has_vertex_viewport_layer = true;
has_variable_aoffi = true;
has_component_indexing_bug = false;
has_broken_pbo_streaming = false;
}
bool Device::TestVariableAoffi() {

View File

@@ -46,6 +46,10 @@ public:
return has_component_indexing_bug;
}
/// Returns true when the device's PBO streaming is broken (set for Intel's
/// proprietary driver) and plain CPU staging buffers should be used instead.
bool HasBrokenPBOStreaming() const {
    return has_broken_pbo_streaming;
}
private:
static bool TestVariableAoffi();
static bool TestComponentIndexingBug();
@@ -58,6 +62,7 @@ private:
bool has_vertex_viewport_layer{};
bool has_variable_aoffi{};
bool has_component_indexing_bug{};
bool has_broken_pbo_streaming{};
};
} // namespace OpenGL

View File

@@ -0,0 +1,169 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <memory>
#include <glad/glad.h>
#include "common/assert.h"
#include "video_core/renderer_opengl/gl_device.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/renderer_opengl/gl_staging_buffer.h"
namespace OpenGL {
/// Staging buffer backed by a persistently mapped OpenGL buffer object (PBO).
/// The buffer is mapped once for its whole lifetime and synchronized with
/// OpenGL fence objects queued by its owner.
class PersistentStagingBuffer final : public StagingBuffer {
public:
    /// @param size           Size in bytes of the buffer's storage.
    /// @param is_read_buffer True when the buffer downloads (reads) data from the GPU,
    ///                       false when it uploads (writes) data to the GPU.
    explicit PersistentStagingBuffer(std::size_t size, bool is_read_buffer)
        : is_read_buffer{is_read_buffer} {
        // GL_CLIENT_STORAGE_BIT hints the driver to keep the storage client side,
        // avoiding server-to-client transfers on texture usage.
        constexpr GLenum storage_read =
            GL_CLIENT_STORAGE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_READ_BIT;
        constexpr GLenum storage_write =
            GL_CLIENT_STORAGE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_WRITE_BIT;
        constexpr GLenum map_read = GL_MAP_PERSISTENT_BIT | GL_MAP_READ_BIT;
        // Writes are flushed explicitly in Unmap; synchronization is handled by
        // the fences queued through QueueFence, hence the unsynchronized map.
        constexpr GLenum map_write = GL_MAP_PERSISTENT_BIT | GL_MAP_WRITE_BIT |
                                     GL_MAP_FLUSH_EXPLICIT_BIT | GL_MAP_UNSYNCHRONIZED_BIT;
        const GLenum storage = is_read_buffer ? storage_read : storage_write;
        const GLenum map = is_read_buffer ? map_read : map_write;
        buffer.Create();
        glNamedBufferStorage(buffer.handle, static_cast<GLsizeiptr>(size), nullptr, storage);
        // Persistent mapping: the returned pointer stays valid until destruction.
        pointer = reinterpret_cast<u8*>(
            glMapNamedBufferRange(buffer.handle, 0, static_cast<GLsizeiptr>(size), map));
    }

    ~PersistentStagingBuffer() override {
        // Release a still-pending fence; the buffer object (and its mapping) is
        // released by the OGLBuffer member's destructor.
        if (sync) {
            glDeleteSync(sync);
        }
    }

    u8* GetOpenGLPointer() const override {
        // Operations with a bound OpenGL buffer start with an offset of 0.
        return nullptr;
    }

    u8* Map([[maybe_unused]] std::size_t size) const override {
        // Already persistently mapped; size is irrelevant here.
        return pointer;
    }

    void Unmap(std::size_t size) const override {
        if (!is_read_buffer) {
            // We flush the buffer on write operations (the mapping uses
            // GL_MAP_FLUSH_EXPLICIT_BIT).
            glFlushMappedNamedBufferRange(buffer.handle, 0, size);
        }
    }

    void QueueFence(bool own) override {
        DEBUG_ASSERT(!sync);
        // An owned fence keeps the buffer unavailable until explicitly waited,
        // protecting deferred flush data from being overwritten.
        owned = own;
        sync = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
    }

    void WaitFence() override {
        DEBUG_ASSERT(sync);
        // GL_TIMEOUT_IGNORED is an effectively unbounded timeout here, so the
        // wait either succeeds or fails outright.
        switch (glClientWaitSync(sync, 0, GL_TIMEOUT_IGNORED)) {
        case GL_ALREADY_SIGNALED:
        case GL_CONDITION_SATISFIED:
            break;
        case GL_TIMEOUT_EXPIRED:
        case GL_WAIT_FAILED:
            UNREACHABLE_MSG("Fence wait failed");
            break;
        }
        // Release the fence and the buffer's ownership.
        Discard();
    }

    void Discard() override {
        DEBUG_ASSERT(sync);
        glDeleteSync(sync);
        sync = nullptr;
        owned = false;
    }

    bool IsAvailable() override {
        // Owned fences must be waited on explicitly; never recycle them here.
        if (owned) {
            return false;
        }
        if (!sync) {
            return true;
        }
        // Non-blocking signaled-status query (glGetSynciv is the intended API
        // for polling, as opposed to glClientWaitSync with a zero timeout).
        GLint status;
        glGetSynciv(sync, GL_SYNC_STATUS, sizeof(GLint), nullptr, &status);
        if (status == GL_UNSIGNALED) {
            return false;
        }
        // The fence has been signaled, we can destroy it
        glDeleteSync(sync);
        sync = nullptr;
        return true;
    }

    void Bind(GLenum target) const override {
        glBindBuffer(target, buffer.handle);
    }

private:
    OGLBuffer buffer;       ///< Owned OpenGL buffer object.
    GLsync sync{};          ///< Pending fence, null when none is queued.
    u8* pointer{};          ///< Persistently mapped base pointer.
    bool is_read_buffer{};  ///< True for download (read) buffers.
    bool owned{};           ///< True while the queued fence is protected from reuse.
};
/// Staging buffer fallback that stores texture data in ordinary heap memory.
/// Used when PBO streaming is disabled; every operation completes immediately,
/// so fences and deferred operations are no-ops.
class CpuStagingBuffer final : public StagingBuffer {
public:
    explicit CpuStagingBuffer(std::size_t size) : storage{std::make_unique<u8[]>(size)} {}

    ~CpuStagingBuffer() override = default;

    u8* GetOpenGLPointer() const override {
        return storage.get();
    }

    u8* Map([[maybe_unused]] std::size_t size) const override {
        return storage.get();
    }

    void Unmap([[maybe_unused]] std::size_t size) const override {}

    void QueueFence(bool own) override {
        // Nothing to synchronize against; CPU operations finish immediately.
    }

    void WaitFence() override {
        // Nothing was queued, so there is nothing to wait on.
    }

    void Discard() override {
        UNREACHABLE_MSG("CpuStagingBuffer doesn't support deferred operations");
    }

    bool IsAvailable() override {
        // Immediate operations make this buffer always reusable.
        return true;
    }

    void Bind(GLenum target) const override {
        // Binding zero makes OpenGL read from/write to client memory pointers.
        glBindBuffer(target, 0);
    }

private:
    std::unique_ptr<u8[]> storage;
};
StagingBufferCache::StagingBufferCache(const Device& device)
: VideoCommon::StagingBufferCache<StagingBuffer>{!device.HasBrokenPBOStreaming()},
device{device} {}
StagingBufferCache::~StagingBufferCache() = default;
std::unique_ptr<StagingBuffer> StagingBufferCache::CreateBuffer(std::size_t size, bool is_flush) {
if (device.HasBrokenPBOStreaming()) {
return std::unique_ptr<StagingBuffer>(new CpuStagingBuffer(size));
} else {
return std::unique_ptr<StagingBuffer>(new PersistentStagingBuffer(size, is_flush));
}
}
} // namespace OpenGL

View File

@@ -0,0 +1,60 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <memory>
#include <glad/glad.h>
#include "common/common_types.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/staging_buffer_cache.h"
namespace OpenGL {
class Device;
class StagingBuffer;
/// OpenGL implementation of the generic staging buffer cache. Creates CPU
/// backed buffers on devices with broken PBO streaming and persistently mapped
/// OpenGL buffers everywhere else.
class StagingBufferCache final : public VideoCommon::StagingBufferCache<StagingBuffer> {
public:
    explicit StagingBufferCache(const Device& device);
    ~StagingBufferCache() override;

protected:
    /// Creates a staging buffer of at least the given size.
    /// @param is_flush True when the buffer is used to download (flush) data.
    std::unique_ptr<StagingBuffer> CreateBuffer(std::size_t size, bool is_flush) override;

private:
    const Device& device;
};
/// Interface of a buffer used to stage texture transfers between the guest and
/// the host GPU.
class StagingBuffer : public NonCopyable {
public:
    virtual ~StagingBuffer() = default;

    /// Returns the base pointer passed to an OpenGL function.
    [[nodiscard]] virtual u8* GetOpenGLPointer() const = 0;

    /// Maps the staging buffer.
    [[nodiscard]] virtual u8* Map(std::size_t size) const = 0;

    /// Unmaps the staging buffer.
    virtual void Unmap(std::size_t size) const = 0;

    /// Inserts a fence in the OpenGL pipeline.
    /// @param own Protects the fence from being used before it's waited, intended for flushes.
    virtual void QueueFence(bool own) = 0;

    /// Waits for a fence and releases the ownership.
    virtual void WaitFence() = 0;

    /// Discards the deferred operation and its bound fence. A fence must be queued.
    virtual void Discard() = 0;

    /// Returns true when the fence is available.
    [[nodiscard]] virtual bool IsAvailable() = 0;

    /// Binds the staging buffer handle to an OpenGL target.
    virtual void Bind(GLenum target) const = 0;
};
} // namespace OpenGL

View File

@@ -10,9 +10,11 @@
#include "core/core.h"
#include "video_core/morton.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/renderer_opengl/gl_staging_buffer.h"
#include "video_core/renderer_opengl/gl_state.h"
#include "video_core/renderer_opengl/gl_texture_cache.h"
#include "video_core/renderer_opengl/utils.h"
#include "video_core/staging_buffer_cache.h"
#include "video_core/texture_cache/surface_base.h"
#include "video_core/texture_cache/texture_cache.h"
#include "video_core/textures/convert.h"
@@ -234,8 +236,9 @@ OGLTexture CreateTexture(const SurfaceParams& params, GLenum target, GLenum inte
} // Anonymous namespace
CachedSurface::CachedSurface(const GPUVAddr gpu_addr, const SurfaceParams& params)
: VideoCommon::SurfaceBase<View>(gpu_addr, params) {
CachedSurface::CachedSurface(const GPUVAddr gpu_addr, const SurfaceParams& params,
std::vector<u8>& temporary_buffer)
: VideoCommon::SurfaceBase<View, StagingBuffer>{gpu_addr, params, temporary_buffer} {
const auto& tuple{GetFormatTuple(params.pixel_format, params.component_type)};
internal_format = tuple.internal_format;
format = tuple.format;
@@ -251,45 +254,52 @@ CachedSurface::CachedSurface(const GPUVAddr gpu_addr, const SurfaceParams& param
CachedSurface::~CachedSurface() = default;
void CachedSurface::DownloadTexture(std::vector<u8>& staging_buffer) {
void CachedSurface::DownloadTexture(StagingBuffer& buffer) {
MICROPROFILE_SCOPE(OpenGL_Texture_Download);
SCOPE_EXIT({ glPixelStorei(GL_PACK_ROW_LENGTH, 0); });
buffer.Bind(GL_PIXEL_PACK_BUFFER);
const auto pointer_base = buffer.GetOpenGLPointer();
for (u32 level = 0; level < params.emulated_levels; ++level) {
glPixelStorei(GL_PACK_ALIGNMENT, std::min(8U, params.GetRowAlignment(level)));
glPixelStorei(GL_PACK_ROW_LENGTH, static_cast<GLint>(params.GetMipWidth(level)));
const std::size_t mip_offset = params.GetHostMipmapLevelOffset(level);
const auto mip_offset = pointer_base + params.GetHostMipmapLevelOffset(level);
if (is_compressed) {
glGetCompressedTextureImage(texture.handle, level,
static_cast<GLsizei>(params.GetHostMipmapSize(level)),
staging_buffer.data() + mip_offset);
mip_offset);
} else {
glGetTextureImage(texture.handle, level, format, type,
static_cast<GLsizei>(params.GetHostMipmapSize(level)),
staging_buffer.data() + mip_offset);
static_cast<GLsizei>(params.GetHostMipmapSize(level)), mip_offset);
}
}
glPixelStorei(GL_PACK_ROW_LENGTH, 0);
// According to Cemu glGetTextureImage and friends do not flush, resulting in a softlock if we
// wait for a fence. To fix this we have to explicitly flush and then queue a fence.
glFlush();
buffer.QueueFence(true);
}
void CachedSurface::UploadTexture(const std::vector<u8>& staging_buffer) {
void CachedSurface::UploadTexture(StagingBuffer& buffer) {
MICROPROFILE_SCOPE(OpenGL_Texture_Upload);
SCOPE_EXIT({ glPixelStorei(GL_UNPACK_ROW_LENGTH, 0); });
buffer.Bind(GL_PIXEL_UNPACK_BUFFER);
const auto pointer = buffer.GetOpenGLPointer();
for (u32 level = 0; level < params.emulated_levels; ++level) {
UploadTextureMipmap(level, staging_buffer);
UploadTextureMipmap(level, pointer);
}
glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
buffer.QueueFence(false);
}
void CachedSurface::UploadTextureMipmap(u32 level, const std::vector<u8>& staging_buffer) {
void CachedSurface::UploadTextureMipmap(u32 level, const u8* opengl_pointer) {
glPixelStorei(GL_UNPACK_ALIGNMENT, std::min(8U, params.GetRowAlignment(level)));
glPixelStorei(GL_UNPACK_ROW_LENGTH, static_cast<GLint>(params.GetMipWidth(level)));
auto compression_type = params.GetCompressionType();
const std::size_t mip_offset = compression_type == SurfaceCompression::Converted
? params.GetConvertedMipmapOffset(level)
: params.GetHostMipmapLevelOffset(level);
const u8* buffer{staging_buffer.data() + mip_offset};
const auto compression_type = params.GetCompressionType();
const u8* mip_offset = opengl_pointer + (compression_type == SurfaceCompression::Converted
? params.GetConvertedMipmapOffset(level)
: params.GetHostMipmapLevelOffset(level));
if (is_compressed) {
const auto image_size{static_cast<GLsizei>(params.GetHostMipmapSize(level))};
switch (params.target) {
@@ -297,7 +307,7 @@ void CachedSurface::UploadTextureMipmap(u32 level, const std::vector<u8>& stagin
glCompressedTextureSubImage2D(texture.handle, level, 0, 0,
static_cast<GLsizei>(params.GetMipWidth(level)),
static_cast<GLsizei>(params.GetMipHeight(level)),
internal_format, image_size, buffer);
internal_format, image_size, mip_offset);
break;
case SurfaceTarget::Texture3D:
case SurfaceTarget::Texture2DArray:
@@ -306,7 +316,7 @@ void CachedSurface::UploadTextureMipmap(u32 level, const std::vector<u8>& stagin
static_cast<GLsizei>(params.GetMipWidth(level)),
static_cast<GLsizei>(params.GetMipHeight(level)),
static_cast<GLsizei>(params.GetMipDepth(level)),
internal_format, image_size, buffer);
internal_format, image_size, mip_offset);
break;
case SurfaceTarget::TextureCubemap: {
const std::size_t layer_size{params.GetHostLayerSize(level)};
@@ -315,8 +325,8 @@ void CachedSurface::UploadTextureMipmap(u32 level, const std::vector<u8>& stagin
static_cast<GLsizei>(params.GetMipWidth(level)),
static_cast<GLsizei>(params.GetMipHeight(level)), 1,
internal_format, static_cast<GLsizei>(layer_size),
buffer);
buffer += layer_size;
mip_offset);
mip_offset += layer_size;
}
break;
}
@@ -327,17 +337,17 @@ void CachedSurface::UploadTextureMipmap(u32 level, const std::vector<u8>& stagin
switch (params.target) {
case SurfaceTarget::Texture1D:
glTextureSubImage1D(texture.handle, level, 0, params.GetMipWidth(level), format, type,
buffer);
mip_offset);
break;
case SurfaceTarget::TextureBuffer:
ASSERT(level == 0);
glNamedBufferSubData(texture_buffer.handle, 0,
params.GetMipWidth(level) * params.GetBytesPerPixel(), buffer);
params.GetMipWidth(level) * params.GetBytesPerPixel(), mip_offset);
break;
case SurfaceTarget::Texture1DArray:
case SurfaceTarget::Texture2D:
glTextureSubImage2D(texture.handle, level, 0, 0, params.GetMipWidth(level),
params.GetMipHeight(level), format, type, buffer);
params.GetMipHeight(level), format, type, mip_offset);
break;
case SurfaceTarget::Texture3D:
case SurfaceTarget::Texture2DArray:
@@ -345,16 +355,18 @@ void CachedSurface::UploadTextureMipmap(u32 level, const std::vector<u8>& stagin
glTextureSubImage3D(
texture.handle, level, 0, 0, 0, static_cast<GLsizei>(params.GetMipWidth(level)),
static_cast<GLsizei>(params.GetMipHeight(level)),
static_cast<GLsizei>(params.GetMipDepth(level)), format, type, buffer);
static_cast<GLsizei>(params.GetMipDepth(level)), format, type, mip_offset);
break;
case SurfaceTarget::TextureCubemap:
case SurfaceTarget::TextureCubemap: {
const std::size_t layer_size = params.GetHostLayerSize(level);
for (std::size_t face = 0; face < params.depth; ++face) {
glTextureSubImage3D(texture.handle, level, 0, 0, static_cast<GLint>(face),
params.GetMipWidth(level), params.GetMipHeight(level), 1,
format, type, buffer);
buffer += params.GetHostLayerSize(level);
format, type, mip_offset);
mip_offset += layer_size;
}
break;
}
default:
UNREACHABLE();
}
@@ -452,7 +464,7 @@ OGLTextureView CachedSurfaceView::CreateTextureView() const {
TextureCacheOpenGL::TextureCacheOpenGL(Core::System& system,
VideoCore::RasterizerInterface& rasterizer,
const Device& device)
: TextureCacheBase{system, rasterizer} {
: TextureCacheBase{system, rasterizer, std::make_unique<StagingBufferCache>(device)} {
src_framebuffer.Create();
dst_framebuffer.Create();
}
@@ -460,7 +472,7 @@ TextureCacheOpenGL::TextureCacheOpenGL(Core::System& system,
TextureCacheOpenGL::~TextureCacheOpenGL() = default;
Surface TextureCacheOpenGL::CreateSurface(GPUVAddr gpu_addr, const SurfaceParams& params) {
return std::make_shared<CachedSurface>(gpu_addr, params);
return std::make_shared<CachedSurface>(gpu_addr, params, temporary_buffer);
}
void TextureCacheOpenGL::ImageCopy(Surface& src_surface, Surface& dst_surface,
@@ -568,7 +580,6 @@ void TextureCacheOpenGL::BufferCopy(Surface& src_surface, Surface& dst_surface)
glGetTextureImage(src_surface->GetTexture(), 0, source_format.format, source_format.type,
static_cast<GLsizei>(source_size), nullptr);
}
glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, copy_pbo_handle);
@@ -604,7 +615,6 @@ void TextureCacheOpenGL::BufferCopy(Surface& src_surface, Surface& dst_surface)
UNREACHABLE();
}
}
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
glTextureBarrier();
}

View File

@@ -17,6 +17,7 @@
#include "video_core/engines/shader_bytecode.h"
#include "video_core/renderer_opengl/gl_device.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/renderer_opengl/gl_staging_buffer.h"
#include "video_core/texture_cache/texture_cache.h"
namespace OpenGL {
@@ -26,21 +27,23 @@ using VideoCommon::ViewParams;
class CachedSurfaceView;
class CachedSurface;
class StagingBuffer;
class TextureCacheOpenGL;
using Surface = std::shared_ptr<CachedSurface>;
using View = std::shared_ptr<CachedSurfaceView>;
using TextureCacheBase = VideoCommon::TextureCache<Surface, View>;
using TextureCacheBase = VideoCommon::TextureCache<Surface, View, StagingBuffer>;
class CachedSurface final : public VideoCommon::SurfaceBase<View> {
class CachedSurface final : public VideoCommon::SurfaceBase<View, StagingBuffer> {
friend CachedSurfaceView;
public:
explicit CachedSurface(GPUVAddr gpu_addr, const SurfaceParams& params);
explicit CachedSurface(GPUVAddr gpu_addr, const SurfaceParams& params,
std::vector<u8>& temporary_buffer);
~CachedSurface();
void UploadTexture(const std::vector<u8>& staging_buffer) override;
void DownloadTexture(std::vector<u8>& staging_buffer) override;
void UploadTexture(StagingBuffer& buffer) override;
void DownloadTexture(StagingBuffer& buffer) override;
GLenum GetTarget() const {
return target;
@@ -57,7 +60,7 @@ protected:
View CreateViewInner(const ViewParams& view_key, bool is_proxy);
private:
void UploadTextureMipmap(u32 level, const std::vector<u8>& staging_buffer);
void UploadTextureMipmap(u32 level, const u8* opengl_pointer);
GLenum internal_format{};
GLenum format{};
@@ -138,6 +141,7 @@ private:
OGLFramebuffer src_framebuffer;
OGLFramebuffer dst_framebuffer;
std::unordered_map<u32, OGLBuffer> copy_pbo_cache;
std::vector<u8> temporary_buffer;
};
} // namespace OpenGL

View File

@@ -172,6 +172,7 @@ void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuf
framebuffer.stride, block_height_log2, framebuffer.height, 0, 1, 1,
gl_framebuffer_data.data(), host_ptr);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
glPixelStorei(GL_UNPACK_ROW_LENGTH, static_cast<GLint>(framebuffer.stride));
// Update existing texture

View File

@@ -0,0 +1,58 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <algorithm>
#include <cstddef>
#include <memory>
#include <unordered_map>
#include <vector>
#include "common/bit_util.h"
#include "common/common_types.h"
namespace VideoCommon {
template <typename StagingBufferType>
class StagingBufferCache {
using Cache = std::unordered_map<u32, std::vector<std::unique_ptr<StagingBufferType>>>;
public:
explicit StagingBufferCache(bool can_flush_aot) : can_flush_aot{can_flush_aot} {}
virtual ~StagingBufferCache() = default;
StagingBufferType& GetWriteBuffer(std::size_t size) {
return GetBuffer(size, false);
}
StagingBufferType& GetReadBuffer(std::size_t size) {
return GetBuffer(size, true);
}
bool CanFlushAheadOfTime() const {
return can_flush_aot;
}
protected:
virtual std::unique_ptr<StagingBufferType> CreateBuffer(std::size_t size, bool is_flush) = 0;
private:
StagingBufferType& GetBuffer(std::size_t size, bool is_flush) {
const u32 ceil = Common::Log2Ceil64(size);
auto& buffers = (is_flush ? flush_cache : upload_cache)[ceil];
const auto it = std::find_if(buffers.begin(), buffers.end(),
[](auto& buffer) { return buffer->IsAvailable(); });
if (it != buffers.end()) {
return **it;
}
return *buffers.emplace_back(CreateBuffer(1ULL << ceil, is_flush));
}
bool can_flush_aot{};
Cache upload_cache;
Cache flush_cache;
};
} // namespace VideoCommon

View File

@@ -19,12 +19,10 @@ using Tegra::Texture::ConvertFromGuestToHost;
using VideoCore::MortonSwizzleMode;
using VideoCore::Surface::SurfaceCompression;
StagingCache::StagingCache() = default;
StagingCache::~StagingCache() = default;
SurfaceBaseImpl::SurfaceBaseImpl(GPUVAddr gpu_addr, const SurfaceParams& params)
: params{params}, host_memory_size{params.GetHostSizeInBytes()}, gpu_addr{gpu_addr},
SurfaceBaseImpl::SurfaceBaseImpl(GPUVAddr gpu_addr, const SurfaceParams& params,
std::vector<u8>& temporary_buffer)
: params{params}, temporary_buffer{temporary_buffer},
host_memory_size{params.GetHostSizeInBytes()}, gpu_addr{gpu_addr},
mipmap_sizes(params.num_levels), mipmap_offsets(params.num_levels) {
std::size_t offset = 0;
for (u32 level = 0; level < params.num_levels; ++level) {
@@ -45,6 +43,8 @@ SurfaceBaseImpl::SurfaceBaseImpl(GPUVAddr gpu_addr, const SurfaceParams& params)
}
}
SurfaceBaseImpl::~SurfaceBaseImpl() = default;
MatchTopologyResult SurfaceBaseImpl::MatchesTopology(const SurfaceParams& rhs) const {
const u32 src_bpp{params.GetBytesPerPixel()};
const u32 dst_bpp{rhs.GetBytesPerPixel()};
@@ -179,10 +179,8 @@ void SurfaceBaseImpl::SwizzleFunc(MortonSwizzleMode mode, u8* memory, const Surf
}
}
void SurfaceBaseImpl::LoadBuffer(Tegra::MemoryManager& memory_manager,
StagingCache& staging_cache) {
void SurfaceBaseImpl::LoadBuffer(Tegra::MemoryManager& memory_manager, u8* staging_buffer) {
MICROPROFILE_SCOPE(GPU_Load_Texture);
auto& staging_buffer = staging_cache.GetBuffer(0);
u8* host_ptr;
is_continuous = memory_manager.IsBlockContinuous(gpu_addr, guest_memory_size);
@@ -195,9 +193,8 @@ void SurfaceBaseImpl::LoadBuffer(Tegra::MemoryManager& memory_manager,
}
} else {
// Use an extra temporal buffer
auto& tmp_buffer = staging_cache.GetBuffer(1);
tmp_buffer.resize(guest_memory_size);
host_ptr = tmp_buffer.data();
temporary_buffer.resize(guest_memory_size);
host_ptr = temporary_buffer.data();
memory_manager.ReadBlockUnsafe(gpu_addr, host_ptr, guest_memory_size);
}
@@ -207,7 +204,7 @@ void SurfaceBaseImpl::LoadBuffer(Tegra::MemoryManager& memory_manager,
for (u32 level = 0; level < params.num_levels; ++level) {
const std::size_t host_offset{params.GetHostMipmapLevelOffset(level)};
SwizzleFunc(MortonSwizzleMode::MortonToLinear, host_ptr, params,
staging_buffer.data() + host_offset, level);
staging_buffer + host_offset, level);
}
} else {
ASSERT_MSG(params.num_levels == 1, "Linear mipmap loading is not implemented");
@@ -218,10 +215,10 @@ void SurfaceBaseImpl::LoadBuffer(Tegra::MemoryManager& memory_manager,
const u32 height{(params.height + block_height - 1) / block_height};
const u32 copy_size{width * bpp};
if (params.pitch == copy_size) {
std::memcpy(staging_buffer.data(), host_ptr, params.GetHostSizeInBytes());
std::memcpy(staging_buffer, host_ptr, params.GetHostSizeInBytes());
} else {
const u8* start{host_ptr};
u8* write_to{staging_buffer.data()};
u8* write_to{staging_buffer};
for (u32 h = height; h > 0; --h) {
std::memcpy(write_to, start, copy_size);
start += params.pitch;
@@ -241,18 +238,16 @@ void SurfaceBaseImpl::LoadBuffer(Tegra::MemoryManager& memory_manager,
const std::size_t out_host_offset = compression_type == SurfaceCompression::Rearranged
? in_host_offset
: params.GetConvertedMipmapOffset(level);
u8* in_buffer = staging_buffer.data() + in_host_offset;
u8* out_buffer = staging_buffer.data() + out_host_offset;
u8* in_buffer = staging_buffer + in_host_offset;
u8* out_buffer = staging_buffer + out_host_offset;
ConvertFromGuestToHost(in_buffer, out_buffer, params.pixel_format,
params.GetMipWidth(level), params.GetMipHeight(level),
params.GetMipDepth(level), true, true);
}
}
void SurfaceBaseImpl::FlushBuffer(Tegra::MemoryManager& memory_manager,
StagingCache& staging_cache) {
void SurfaceBaseImpl::FlushBuffer(Tegra::MemoryManager& memory_manager, u8* staging_buffer) {
MICROPROFILE_SCOPE(GPU_Flush_Texture);
auto& staging_buffer = staging_cache.GetBuffer(0);
u8* host_ptr;
// Handle continuouty
@@ -264,9 +259,8 @@ void SurfaceBaseImpl::FlushBuffer(Tegra::MemoryManager& memory_manager,
}
} else {
// Use an extra temporal buffer
auto& tmp_buffer = staging_cache.GetBuffer(1);
tmp_buffer.resize(guest_memory_size);
host_ptr = tmp_buffer.data();
temporary_buffer.resize(guest_memory_size);
host_ptr = temporary_buffer.data();
}
if (params.is_tiled) {
@@ -274,7 +268,7 @@ void SurfaceBaseImpl::FlushBuffer(Tegra::MemoryManager& memory_manager,
for (u32 level = 0; level < params.num_levels; ++level) {
const std::size_t host_offset{params.GetHostMipmapLevelOffset(level)};
SwizzleFunc(MortonSwizzleMode::LinearToMorton, host_ptr, params,
staging_buffer.data() + host_offset, level);
staging_buffer + host_offset, level);
}
} else {
ASSERT(params.target == SurfaceTarget::Texture2D);
@@ -283,10 +277,10 @@ void SurfaceBaseImpl::FlushBuffer(Tegra::MemoryManager& memory_manager,
const u32 bpp{params.GetBytesPerPixel()};
const u32 copy_size{params.width * bpp};
if (params.pitch == copy_size) {
std::memcpy(host_ptr, staging_buffer.data(), guest_memory_size);
std::memcpy(host_ptr, staging_buffer, guest_memory_size);
} else {
u8* start{host_ptr};
const u8* read_to{staging_buffer.data()};
const u8* read_to{staging_buffer};
for (u32 h = params.height; h > 0; --h) {
std::memcpy(start, read_to, copy_size);
start += params.pitch;

View File

@@ -38,32 +38,11 @@ enum class MatchTopologyResult : u32 {
None = 2,
};
class StagingCache {
public:
explicit StagingCache();
~StagingCache();
std::vector<u8>& GetBuffer(std::size_t index) {
return staging_buffer[index];
}
const std::vector<u8>& GetBuffer(std::size_t index) const {
return staging_buffer[index];
}
void SetSize(std::size_t size) {
staging_buffer.resize(size);
}
private:
std::vector<std::vector<u8>> staging_buffer;
};
class SurfaceBaseImpl {
public:
void LoadBuffer(Tegra::MemoryManager& memory_manager, StagingCache& staging_cache);
void LoadBuffer(Tegra::MemoryManager& memory_manager, u8* staging_buffer);
void FlushBuffer(Tegra::MemoryManager& memory_manager, StagingCache& staging_cache);
void FlushBuffer(Tegra::MemoryManager& memory_manager, u8* staging_buffer);
GPUVAddr GetGpuAddr() const {
return gpu_addr;
@@ -161,12 +140,15 @@ public:
}
protected:
explicit SurfaceBaseImpl(GPUVAddr gpu_addr, const SurfaceParams& params);
~SurfaceBaseImpl() = default;
explicit SurfaceBaseImpl(GPUVAddr gpu_addr, const SurfaceParams& params,
std::vector<u8>& temporary_buffer);
~SurfaceBaseImpl();
virtual void DecorateSurfaceName() = 0;
const SurfaceParams params;
std::vector<u8>& temporary_buffer;
std::size_t layer_size;
std::size_t guest_memory_size;
const std::size_t host_memory_size;
@@ -188,25 +170,40 @@ private:
std::vector<CopyParams> BreakDownNonLayered(const SurfaceParams& in_params) const;
};
template <typename TView>
template <typename TView, typename StagingBufferType>
class SurfaceBase : public SurfaceBaseImpl {
public:
virtual void UploadTexture(const std::vector<u8>& staging_buffer) = 0;
virtual void UploadTexture(StagingBufferType& buffer) = 0;
virtual void DownloadTexture(std::vector<u8>& staging_buffer) = 0;
virtual void DownloadTexture(StagingBufferType& buffer) = 0;
void SetFlushBuffer(StagingBufferType* buffer) {
flush_buffer = buffer;
}
StagingBufferType* GetFlushBuffer() const {
return flush_buffer;
}
void MarkAsModified(const bool is_modified_, const u64 tick) {
is_modified = is_modified_ || is_target;
modification_tick = tick;
if (is_modified && flush_buffer) {
// The buffer has been modified while we thought it was no longer being to be used and
// we queued a flush.
flush_buffer->Discard();
flush_buffer = nullptr;
}
}
void MarkAsRenderTarget(const bool is_target, const u32 index) {
this->is_target = is_target;
this->index = index;
void MarkAsRenderTarget(const bool is_target_, const u32 index_) {
is_target = is_target_;
index = index_;
}
void MarkAsPicked(const bool is_picked) {
this->is_picked = is_picked;
void MarkAsPicked(const bool is_picked_) {
is_picked = is_picked_;
}
bool IsModified() const {
@@ -214,7 +211,7 @@ public:
}
bool IsProtected() const {
// Only 3D Slices are to be protected
// Only 3D slices are to be protected
return is_target && params.block_depth > 0;
}
@@ -292,8 +289,9 @@ public:
}
protected:
explicit SurfaceBase(const GPUVAddr gpu_addr, const SurfaceParams& params)
: SurfaceBaseImpl(gpu_addr, params) {}
explicit SurfaceBase(const GPUVAddr gpu_addr, const SurfaceParams& params,
std::vector<u8>& temporary_buffer)
: SurfaceBaseImpl{gpu_addr, params, temporary_buffer} {}
~SurfaceBase() = default;
@@ -320,6 +318,8 @@ private:
bool is_picked{};
u32 index{NO_RT};
u64 modification_tick{};
StagingBufferType* flush_buffer{};
};
} // namespace VideoCommon

View File

@@ -27,6 +27,7 @@
#include "video_core/gpu.h"
#include "video_core/memory_manager.h"
#include "video_core/rasterizer_interface.h"
#include "video_core/staging_buffer_cache.h"
#include "video_core/surface.h"
#include "video_core/texture_cache/copy_params.h"
#include "video_core/texture_cache/surface_base.h"
@@ -48,7 +49,7 @@ using VideoCore::Surface::PixelFormat;
using VideoCore::Surface::SurfaceTarget;
using RenderTargetConfig = Tegra::Engines::Maxwell3D::Regs::RenderTargetConfig;
template <typename TSurface, typename TView>
template <typename TSurface, typename TView, typename StagingBufferType>
class TextureCache {
using IntervalMap = boost::icl::interval_map<CacheAddr, std::set<TSurface>>;
using IntervalType = typename IntervalMap::interval_type;
@@ -62,10 +63,10 @@ public:
}
}
/***
/**
* `Guard` guarantees that rendertargets don't unregister themselves if the
* collide. Protection is currently only done on 3D slices.
***/
*/
void GuardRenderTargets(bool new_guard) {
guard_render_targets = new_guard;
}
@@ -132,12 +133,18 @@ public:
regs.zeta.memory_layout.block_width, regs.zeta.memory_layout.block_height,
regs.zeta.memory_layout.block_depth, regs.zeta.memory_layout.type)};
auto surface_view = GetSurface(gpu_addr, depth_params, preserve_contents, true);
if (depth_buffer.target)
if (auto& old_target = depth_buffer.target; old_target != surface_view.first) {
FlushAoT(old_target);
}
if (depth_buffer.target) {
depth_buffer.target->MarkAsRenderTarget(false, NO_RT);
}
depth_buffer.target = surface_view.first;
depth_buffer.view = surface_view.second;
if (depth_buffer.target)
if (depth_buffer.target) {
depth_buffer.target->MarkAsRenderTarget(true, DEPTH_RT);
}
return surface_view.second;
}
@@ -166,12 +173,18 @@ public:
auto surface_view = GetSurface(gpu_addr, SurfaceParams::CreateForFramebuffer(system, index),
preserve_contents, true);
if (render_targets[index].target)
if (auto& old_target = render_targets[index].target; old_target != surface_view.first) {
FlushAoT(old_target);
}
if (render_targets[index].target) {
render_targets[index].target->MarkAsRenderTarget(false, NO_RT);
}
render_targets[index].target = surface_view.first;
render_targets[index].view = surface_view.second;
if (render_targets[index].target)
if (render_targets[index].target) {
render_targets[index].target->MarkAsRenderTarget(true, static_cast<u32>(index));
}
return surface_view.second;
}
@@ -188,19 +201,25 @@ public:
}
void SetEmptyDepthBuffer() {
if (depth_buffer.target == nullptr) {
auto& target = depth_buffer.target;
if (target == nullptr) {
return;
}
depth_buffer.target->MarkAsRenderTarget(false, NO_RT);
FlushAoT(target);
target->MarkAsRenderTarget(false, NO_RT);
depth_buffer.target = nullptr;
depth_buffer.view = nullptr;
}
void SetEmptyColorBuffer(std::size_t index) {
if (render_targets[index].target == nullptr) {
auto& target = render_targets[index].target;
if (target == nullptr) {
return;
}
render_targets[index].target->MarkAsRenderTarget(false, NO_RT);
FlushAoT(target);
target->MarkAsRenderTarget(false, NO_RT);
render_targets[index].target = nullptr;
render_targets[index].view = nullptr;
}
@@ -235,14 +254,15 @@ public:
}
protected:
TextureCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer)
: system{system}, rasterizer{rasterizer} {
TextureCache(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
std::unique_ptr<StagingBufferCache<StagingBufferType>> staging_buffer_cache)
: system{system}, rasterizer{rasterizer}, staging_buffer_cache{
std::move(staging_buffer_cache)} {
for (std::size_t i = 0; i < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; i++) {
SetEmptyColorBuffer(i);
}
SetEmptyDepthBuffer();
staging_cache.SetSize(2);
const auto make_siblings = [this](PixelFormat a, PixelFormat b) {
siblings_table[static_cast<std::size_t>(a)] = b;
@@ -687,9 +707,13 @@ private:
}
void LoadSurface(const TSurface& surface) {
staging_cache.GetBuffer(0).resize(surface->GetHostSizeInBytes());
surface->LoadBuffer(system.GPU().MemoryManager(), staging_cache);
surface->UploadTexture(staging_cache.GetBuffer(0));
const auto host_size = surface->GetHostSizeInBytes();
auto& buffer = staging_buffer_cache->GetWriteBuffer(host_size);
surface->LoadBuffer(system.GPU().MemoryManager(), buffer.Map(host_size));
buffer.Unmap(host_size);
surface->UploadTexture(buffer);
surface->MarkAsModified(false, Tick());
}
@@ -697,9 +721,18 @@ private:
if (!surface->IsModified()) {
return;
}
staging_cache.GetBuffer(0).resize(surface->GetHostSizeInBytes());
surface->DownloadTexture(staging_cache.GetBuffer(0));
surface->FlushBuffer(system.GPU().MemoryManager(), staging_cache);
const auto host_size = surface->GetHostSizeInBytes();
auto buffer = surface->GetFlushBuffer();
if (!buffer) {
buffer = &staging_buffer_cache->GetReadBuffer(host_size);
surface->DownloadTexture(*buffer);
}
buffer->WaitFence();
surface->SetFlushBuffer(nullptr);
surface->FlushBuffer(system.GPU().MemoryManager(), buffer->Map(host_size));
buffer->Unmap(host_size);
surface->MarkAsModified(false, Tick());
}
@@ -767,6 +800,16 @@ private:
return {};
}
/// Queues an ahead-of-time (asynchronous) download of a surface so that a
/// later flush does not have to stall downloading it synchronously. Only done
/// when the staging buffer implementation supports deferred flushes, the
/// surface exists, it is linear, and it doesn't already have a flush queued.
void FlushAoT(TSurface& surface) {
    // NOTE: the original condition returned early when CanFlushAheadOfTime()
    // was true, inverting the intent: persistent buffers (which support AoT
    // flushing) never got one, while CPU buffers (whose Discard() is
    // unreachable) did.
    if (!staging_buffer_cache->CanFlushAheadOfTime() || !surface || !surface->IsLinear() ||
        surface->GetFlushBuffer()) {
        return;
    }
    auto& buffer = staging_buffer_cache->GetReadBuffer(surface->GetHostSizeInBytes());
    surface->DownloadTexture(buffer);
    surface->SetFlushBuffer(&buffer);
}
constexpr PixelFormat GetSiblingFormat(PixelFormat format) const {
return siblings_table[static_cast<std::size_t>(format)];
}
@@ -813,7 +856,7 @@ private:
std::vector<TSurface> sampled_textures;
StagingCache staging_cache;
std::unique_ptr<StagingBufferCache<StagingBufferType>> staging_buffer_cache;
std::recursive_mutex mutex;
};