RasterizerCache: Swizzle to a staging buffer before copying it to memory when flushing a texture.

This gives us slightly increased memory safety: writing to out-of-bounds memory will be caught at the point of the write instead of surfacing several calls down the line as memory corruption.
GPU/DMA: Fixed Tiled->Linear transfers.
2018-09-21 13:53:13 -05:00 · 2018-09-21 13:53:12 -05:00 · 2018-09-21 13:53:12 -05:00 · 2018-09-21 13:53:12 -05:00 · 2018-09-21 13:53:11 -05:00 · 2018-09-21 13:53:11 -05:00
15 changed files with 327 additions and 51 deletions
--- a/src/video_core/engines/fermi_2d.cpp
+++ b/src/video_core/engines/fermi_2d.cpp
@@ -4,11 +4,13 @@

 #include "core/memory.h"
 #include "video_core/engines/fermi_2d.h"
+#include "video_core/rasterizer_interface.h"
 #include "video_core/textures/decoders.h"

 namespace Tegra::Engines {

-Fermi2D::Fermi2D(MemoryManager& memory_manager) : memory_manager(memory_manager) {}
+Fermi2D::Fermi2D(VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager)
+    : memory_manager(memory_manager), rasterizer{rasterizer} {}

 void Fermi2D::WriteReg(u32 method, u32 value) {
    ASSERT_MSG(method < Regs::NUM_REGS,
@@ -52,6 +54,12 @@ void Fermi2D::HandleSurfaceCopy() {
        return;
    }

+    rasterizer.FlushRegion(source_cpu, src_bytes_per_pixel * regs.src.width * regs.src.height);
+    // We have to invalidate the destination region to evict any outdated surfaces from the cache.
+    // We do this before actually writing the new data because the destination address might contain
+    // a dirty surface that will have to be written back to memory.
+    rasterizer.InvalidateRegion(dest_cpu, dst_bytes_per_pixel * regs.dst.width * regs.dst.height);
+
    u8* src_buffer = Memory::GetPointer(source_cpu);
    u8* dst_buffer = Memory::GetPointer(dest_cpu);

--- a/src/video_core/engines/fermi_2d.h
+++ b/src/video_core/engines/fermi_2d.h
@@ -12,6 +12,10 @@
 #include "video_core/gpu.h"
 #include "video_core/memory_manager.h"

+namespace VideoCore {
+class RasterizerInterface;
+}
+
 namespace Tegra::Engines {

 #define FERMI2D_REG_INDEX(field_name)                                                              \
@@ -19,7 +23,7 @@ namespace Tegra::Engines {

 class Fermi2D final {
 public:
-    explicit Fermi2D(MemoryManager& memory_manager);
+    explicit Fermi2D(VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager);
    ~Fermi2D() = default;

    /// Write the value to the register identified by method.
@@ -94,6 +98,8 @@ public:
    MemoryManager& memory_manager;

 private:
+    VideoCore::RasterizerInterface& rasterizer;
+
    /// Performs the copy from the source surface to the destination surface as configured in the
    /// registers.
    void HandleSurfaceCopy();
--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -5,10 +5,14 @@
 #include "common/logging/log.h"
 #include "core/memory.h"
 #include "video_core/engines/kepler_memory.h"
+#include "video_core/rasterizer_interface.h"

 namespace Tegra::Engines {

-KeplerMemory::KeplerMemory(MemoryManager& memory_manager) : memory_manager(memory_manager) {}
+KeplerMemory::KeplerMemory(VideoCore::RasterizerInterface& rasterizer,
+                           MemoryManager& memory_manager)
+    : memory_manager(memory_manager), rasterizer{rasterizer} {}
+
 KeplerMemory::~KeplerMemory() = default;

 void KeplerMemory::WriteReg(u32 method, u32 value) {
@@ -37,6 +41,11 @@ void KeplerMemory::ProcessData(u32 data) {
    VAddr dest_address =
        *memory_manager.GpuToCpuAddress(address + state.write_offset * sizeof(u32));

+    // We have to invalidate the destination region to evict any outdated surfaces from the cache.
+    // We do this before actually writing the new data because the destination address might contain
+    // a dirty surface that will have to be written back to memory.
+    rasterizer.InvalidateRegion(dest_address, sizeof(u32));
+
    Memory::Write32(dest_address, data);

    state.write_offset++;
--- a/src/video_core/engines/kepler_memory.h
+++ b/src/video_core/engines/kepler_memory.h
@@ -11,6 +11,10 @@
 #include "common/common_types.h"
 #include "video_core/memory_manager.h"

+namespace VideoCore {
+class RasterizerInterface;
+}
+
 namespace Tegra::Engines {

 #define KEPLERMEMORY_REG_INDEX(field_name)                                                         \
@@ -18,7 +22,7 @@ namespace Tegra::Engines {

 class KeplerMemory final {
 public:
-    KeplerMemory(MemoryManager& memory_manager);
+    KeplerMemory(VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager);
    ~KeplerMemory();

    /// Write the value to the register identified by method.
@@ -72,6 +76,7 @@ public:

 private:
    MemoryManager& memory_manager;
+    VideoCore::RasterizerInterface& rasterizer;

    void ProcessData(u32 data);
 };
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -4,12 +4,14 @@

 #include "core/memory.h"
 #include "video_core/engines/maxwell_dma.h"
+#include "video_core/rasterizer_interface.h"
 #include "video_core/textures/decoders.h"

 namespace Tegra {
 namespace Engines {

-MaxwellDMA::MaxwellDMA(MemoryManager& memory_manager) : memory_manager(memory_manager) {}
+MaxwellDMA::MaxwellDMA(VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager)
+    : memory_manager(memory_manager), rasterizer{rasterizer} {}

 void MaxwellDMA::WriteReg(u32 method, u32 value) {
    ASSERT_MSG(method < Regs::NUM_REGS,
@@ -44,36 +46,79 @@ void MaxwellDMA::HandleCopy() {
    ASSERT(regs.exec.query_mode == Regs::QueryMode::None);
    ASSERT(regs.exec.query_intr == Regs::QueryIntr::None);
    ASSERT(regs.exec.copy_mode == Regs::CopyMode::Unk2);
-    ASSERT(regs.src_params.pos_x == 0);
-    ASSERT(regs.src_params.pos_y == 0);
    ASSERT(regs.dst_params.pos_x == 0);
    ASSERT(regs.dst_params.pos_y == 0);

-    if (regs.exec.is_dst_linear == regs.exec.is_src_linear) {
-        std::size_t copy_size = regs.x_count;
+    if (!regs.exec.is_dst_linear && !regs.exec.is_src_linear) {
+        // If both the source and the destination are in block layout, assert.
+        UNREACHABLE_MSG("Tiled->Tiled DMA transfers are not yet implemented");
+        return;
+    }

+    if (regs.exec.is_dst_linear && regs.exec.is_src_linear) {
        // When the enable_2d bit is disabled, the copy is performed as if we were copying a 1D
-        // buffer of length `x_count`, otherwise we copy a 2D buffer of size (x_count, y_count).
-        if (regs.exec.enable_2d) {
-            copy_size = copy_size * regs.y_count;
+        // buffer of length `x_count`, otherwise we copy a 2D image of dimensions (x_count,
+        // y_count).
+        if (!regs.exec.enable_2d) {
+            Memory::CopyBlock(dest_cpu, source_cpu, regs.x_count);
+            return;
        }

-        Memory::CopyBlock(dest_cpu, source_cpu, copy_size);
+        // If both the source and the destination are in linear layout, perform a line-by-line
+        // copy. We're going to take a subrect of size (x_count, y_count) from the source
+        // rectangle. There is no need to manually flush/invalidate the regions because
+        // CopyBlock does that for us.
+        for (u32 line = 0; line < regs.y_count; ++line) {
+            const VAddr source_line = source_cpu + line * regs.src_pitch;
+            const VAddr dest_line = dest_cpu + line * regs.dst_pitch;
+            Memory::CopyBlock(dest_line, source_line, regs.x_count);
+        }
        return;
    }

    ASSERT(regs.exec.enable_2d == 1);
+
+    size_t copy_size = regs.x_count * regs.y_count;
+
+    const auto FlushAndInvalidate = [&](u32 src_size, u32 dst_size) {
+        // TODO(Subv): For now, manually flush the regions until we implement GPU-accelerated
+        // copying.
+        rasterizer.FlushRegion(source_cpu, src_size);
+
+        // We have to invalidate the destination region to evict any outdated surfaces from the
+        // cache. We do this before actually writing the new data because the destination address
+        // might contain a dirty surface that will have to be written back to memory.
+        rasterizer.InvalidateRegion(dest_cpu, dst_size);
+    };
+
    u8* src_buffer = Memory::GetPointer(source_cpu);
    u8* dst_buffer = Memory::GetPointer(dest_cpu);

    if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) {
+        ASSERT(regs.src_params.size_z == 1);
        // If the input is tiled and the output is linear, deswizzle the input and copy it over.
-        Texture::CopySwizzledData(regs.src_params.size_x, regs.src_params.size_y, 1, 1, src_buffer,
-                                  dst_buffer, true, regs.src_params.BlockHeight());
+
+        u32 src_bytes_per_pixel = regs.src_pitch / regs.src_params.size_x;
+
+        FlushAndInvalidate(regs.src_pitch * regs.src_params.size_y,
+                           copy_size * src_bytes_per_pixel);
+
+        Texture::UnswizzleSubrect(regs.x_count, regs.y_count, regs.dst_pitch,
+                                  regs.src_params.size_x, src_bytes_per_pixel, source_cpu, dest_cpu,
+                                  regs.src_params.BlockHeight(), regs.src_params.pos_x,
+                                  regs.src_params.pos_y);
    } else {
+        ASSERT(regs.dst_params.size_z == 1);
+        ASSERT(regs.src_pitch == regs.x_count);
+
+        u32 src_bpp = regs.src_pitch / regs.x_count;
+
+        FlushAndInvalidate(regs.src_pitch * regs.y_count,
+                           regs.dst_params.size_x * regs.dst_params.size_y * src_bpp);
+
        // If the input is linear and the output is tiled, swizzle the input and copy it over.
-        Texture::CopySwizzledData(regs.dst_params.size_x, regs.dst_params.size_y, 1, 1, dst_buffer,
-                                  src_buffer, false, regs.dst_params.BlockHeight());
+        Texture::SwizzleSubrect(regs.x_count, regs.y_count, regs.src_pitch, regs.dst_params.size_x,
+                                src_bpp, dest_cpu, source_cpu, regs.dst_params.BlockHeight());
    }
 }

--- a/src/video_core/engines/maxwell_dma.h
+++ b/src/video_core/engines/maxwell_dma.h
@@ -12,11 +12,15 @@
 #include "video_core/gpu.h"
 #include "video_core/memory_manager.h"

+namespace VideoCore {
+class RasterizerInterface;
+}
+
 namespace Tegra::Engines {

 class MaxwellDMA final {
 public:
-    explicit MaxwellDMA(MemoryManager& memory_manager);
+    explicit MaxwellDMA(VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager);
    ~MaxwellDMA() = default;

    /// Write the value to the register identified by method.
@@ -129,6 +133,8 @@ public:
    MemoryManager& memory_manager;

 private:
+    VideoCore::RasterizerInterface& rasterizer;
+
    /// Performs the copy from the source buffer to the destination buffer as configured in the
    /// registers.
    void HandleCopy();
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -25,10 +25,10 @@ u32 FramebufferConfig::BytesPerPixel(PixelFormat format) {
 GPU::GPU(VideoCore::RasterizerInterface& rasterizer) {
    memory_manager = std::make_unique<Tegra::MemoryManager>();
    maxwell_3d = std::make_unique<Engines::Maxwell3D>(rasterizer, *memory_manager);
-    fermi_2d = std::make_unique<Engines::Fermi2D>(*memory_manager);
+    fermi_2d = std::make_unique<Engines::Fermi2D>(rasterizer, *memory_manager);
    maxwell_compute = std::make_unique<Engines::MaxwellCompute>();
-    maxwell_dma = std::make_unique<Engines::MaxwellDMA>(*memory_manager);
-    kepler_memory = std::make_unique<Engines::KeplerMemory>(*memory_manager);
+    maxwell_dma = std::make_unique<Engines::MaxwellDMA>(rasterizer, *memory_manager);
+    kepler_memory = std::make_unique<Engines::KeplerMemory>(rasterizer, *memory_manager);
 }

 GPU::~GPU() = default;
--- a/src/video_core/rasterizer_cache.h
+++ b/src/video_core/rasterizer_cache.h
@@ -17,6 +17,22 @@
 template <class T>
 class RasterizerCache : NonCopyable {
 public:
+    /// Write any cached resources overlapping the region back to memory (if dirty)
+    void FlushRegion(VAddr addr, u64 size) {
+        if (size == 0)
+            return;
+        // Visit every cached object registered in an interval overlapping the requested region.
+        const ObjectInterval interval{addr, addr + size};
+        for (auto& pair : boost::make_iterator_range(object_cache.equal_range(interval))) {
+            for (auto& cached_object : pair.second) {
+                if (!cached_object)
+                    continue;
+                // Each object type decides in Flush() whether anything has to be written back.
+                cached_object->Flush();
+            }
+        }
+    }
+
    /// Mark the specified region as being invalidated
    void InvalidateRegion(VAddr addr, u64 size) {
        if (size == 0)
@@ -71,6 +87,7 @@ protected:
    void Unregister(const T& object) {
        auto& rasterizer = Core::System::GetInstance().Renderer().Rasterizer();
        rasterizer.UpdatePagesCachedCount(object->GetAddr(), object->GetSizeInBytes(), -1);
+        object->Flush();
        object_cache.subtract({GetInterval(object), ObjectSet{object}});
    }

--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -23,6 +23,9 @@ struct CachedBufferEntry final {
        return size;
    }

+    // We do not have to flush this cache as things in it are never modified by us.
+    void Flush() {}
+
    VAddr addr;
    std::size_t size;
    GLintptr offset;
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -322,6 +322,13 @@ void RasterizerOpenGL::ConfigureFramebuffers(bool using_color_fb, bool using_dep
            // Used when just a single color attachment is enabled, e.g. for clearing a color buffer
            Surface color_surface =
                res_cache.GetColorBufferSurface(*single_color_target, preserve_contents);
+
+            if (color_surface) {
+                // Assume that a surface will be written to if it is used as a framebuffer, even if
+                // the shader doesn't actually write to it.
+                color_surface->MarkAsDirty();
+            }
+
            glFramebufferTexture2D(
                GL_DRAW_FRAMEBUFFER,
                GL_COLOR_ATTACHMENT0 + static_cast<GLenum>(*single_color_target), GL_TEXTURE_2D,
@@ -332,6 +339,11 @@ void RasterizerOpenGL::ConfigureFramebuffers(bool using_color_fb, bool using_dep
            std::array<GLenum, Maxwell::NumRenderTargets> buffers;
            for (std::size_t index = 0; index < Maxwell::NumRenderTargets; ++index) {
                Surface color_surface = res_cache.GetColorBufferSurface(index, preserve_contents);
+                if (color_surface) {
+                    // Assume that a surface will be written to if it is used as a framebuffer, even
+                    // if the shader doesn't actually write to it.
+                    color_surface->MarkAsDirty();
+                }
                buffers[index] = GL_COLOR_ATTACHMENT0 + regs.rt_control.GetMap(index);
                glFramebufferTexture2D(
                    GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0 + static_cast<GLenum>(index),
@@ -351,6 +363,10 @@ void RasterizerOpenGL::ConfigureFramebuffers(bool using_color_fb, bool using_dep
    }

    if (depth_surface) {
+        // Assume that a surface will be written to if it is used as a framebuffer, even if
+        // the shader doesn't actually write to it.
+        depth_surface->MarkAsDirty();
+
        if (regs.stencil_enable) {
            // Attach both depth and stencil
            glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D,
@@ -538,9 +554,19 @@ void RasterizerOpenGL::DrawArrays() {
    state.Apply();
 }

-void RasterizerOpenGL::FlushAll() {}
+void RasterizerOpenGL::FlushAll() {
+    MICROPROFILE_SCOPE(OpenGL_CacheManagement);
+    res_cache.FlushRegion(0, Kernel::VMManager::MAX_ADDRESS);
+    shader_cache.FlushRegion(0, Kernel::VMManager::MAX_ADDRESS);
+    buffer_cache.FlushRegion(0, Kernel::VMManager::MAX_ADDRESS);
+}

-void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) {}
+void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) {
+    MICROPROFILE_SCOPE(OpenGL_CacheManagement);
+    res_cache.FlushRegion(addr, size);
+    shader_cache.FlushRegion(addr, size);
+    buffer_cache.FlushRegion(addr, size);
+}

 void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) {
    MICROPROFILE_SCOPE(OpenGL_CacheManagement);
@@ -550,6 +576,7 @@ void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) {
 }

 void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size) {
+    FlushRegion(addr, size);
    InvalidateRegion(addr, size);
 }

--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -265,20 +265,22 @@ void MortonCopy(u32 stride, u32 block_height, u32 height, u8* gl_buffer, std::si
    constexpr u32 bytes_per_pixel = SurfaceParams::GetFormatBpp(format) / CHAR_BIT;
    constexpr u32 gl_bytes_per_pixel = CachedSurface::GetGLBytesPerPixel(format);

+    // With the BCn formats (DXT and DXN), each 4x4 tile is swizzled instead of just individual
+    // pixel values.
+    const u32 tile_size{IsFormatBCn(format) ? 4U : 1U};
+
    if (morton_to_gl) {
-        // With the BCn formats (DXT and DXN), each 4x4 tile is swizzled instead of just individual
-        // pixel values.
-        const u32 tile_size{IsFormatBCn(format) ? 4U : 1U};
        const std::vector<u8> data = Tegra::Texture::UnswizzleTexture(
            addr, tile_size, bytes_per_pixel, stride, height, block_height);
        const std::size_t size_to_copy{std::min(gl_buffer_size, data.size())};
        memcpy(gl_buffer, data.data(), size_to_copy);
    } else {
-        // TODO(bunnei): Assumes the default rendering GOB size of 16 (128 lines). We should
-        // check the configuration for this and perform more generic un/swizzle
-        LOG_WARNING(Render_OpenGL, "need to use correct swizzle/GOB parameters!");
-        VideoCore::MortonCopyPixels128(stride, height, bytes_per_pixel, gl_bytes_per_pixel,
-                                       Memory::GetPointer(addr), gl_buffer, morton_to_gl);
+        std::vector<u8> data(height * stride * bytes_per_pixel);
+        Tegra::Texture::CopySwizzledData(stride / tile_size, height / tile_size, bytes_per_pixel,
+                                         bytes_per_pixel, data.data(), gl_buffer, false,
+                                         block_height);
+        const std::size_t size_to_copy{std::min(gl_buffer_size, data.size())};
+        memcpy(Memory::GetPointer(addr), data.data(), size_to_copy);
    }
 }

@@ -357,17 +359,16 @@ static constexpr std::array<void (*)(u32, u32, u32, u8*, std::size_t, VAddr),
        MortonCopy<false, PixelFormat::RGBA16UI>,
        MortonCopy<false, PixelFormat::R11FG11FB10F>,
        MortonCopy<false, PixelFormat::RGBA32UI>,
-        // TODO(Subv): Swizzling DXT1/DXT23/DXT45/DXN1/DXN2/BC7U/BC6H_UF16/BC6H_SF16/ASTC_2D_4X4
-        // formats are not supported
-        nullptr,
-        nullptr,
-        nullptr,
-        nullptr,
-        nullptr,
-        nullptr,
-        nullptr,
-        nullptr,
-        nullptr,
+        MortonCopy<false, PixelFormat::DXT1>,
+        MortonCopy<false, PixelFormat::DXT23>,
+        MortonCopy<false, PixelFormat::DXT45>,
+        MortonCopy<false, PixelFormat::DXN1>,
+        MortonCopy<false, PixelFormat::DXN2UNORM>,
+        MortonCopy<false, PixelFormat::DXN2SNORM>,
+        MortonCopy<false, PixelFormat::BC7U>,
+        MortonCopy<false, PixelFormat::BC6H_UF16>,
+        MortonCopy<false, PixelFormat::BC6H_SF16>,
+        // TODO(Subv): Swizzling ASTC formats are not supported
        nullptr,
        MortonCopy<false, PixelFormat::G8R8U>,
        MortonCopy<false, PixelFormat::G8R8S>,
@@ -503,7 +504,7 @@ CachedSurface::CachedSurface(const SurfaceParams& params)
    glTexParameteri(SurfaceTargetToGL(params.target), GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
 }

-static void ConvertS8Z24ToZ24S8(std::vector<u8>& data, u32 width, u32 height) {
+static void ConvertS8Z24ToZ24S8(std::vector<u8>& data, u32 width, u32 height, bool reverse) {
    union S8Z24 {
        BitField<0, 24, u32> z24;
        BitField<24, 8, u32> s8;
@@ -516,16 +517,23 @@ static void ConvertS8Z24ToZ24S8(std::vector<u8>& data, u32 width, u32 height) {
    };
    static_assert(sizeof(Z24S8) == 4, "Z24S8 is incorrect size");

-    S8Z24 input_pixel{};
-    Z24S8 output_pixel{};
+    S8Z24 s8z24_pixel{};
+    Z24S8 z24s8_pixel{};
    constexpr auto bpp{CachedSurface::GetGLBytesPerPixel(PixelFormat::S8Z24)};
    for (std::size_t y = 0; y < height; ++y) {
        for (std::size_t x = 0; x < width; ++x) {
            const std::size_t offset{bpp * (y * width + x)};
-            std::memcpy(&input_pixel, &data[offset], sizeof(S8Z24));
-            output_pixel.s8.Assign(input_pixel.s8);
-            output_pixel.z24.Assign(input_pixel.z24);
-            std::memcpy(&data[offset], &output_pixel, sizeof(Z24S8));
+            if (reverse) {
+                std::memcpy(&z24s8_pixel, &data[offset], sizeof(Z24S8));
+                s8z24_pixel.s8.Assign(z24s8_pixel.s8);
+                s8z24_pixel.z24.Assign(z24s8_pixel.z24);
+                std::memcpy(&data[offset], &s8z24_pixel, sizeof(S8Z24));
+            } else {
+                std::memcpy(&s8z24_pixel, &data[offset], sizeof(S8Z24));
+                z24s8_pixel.s8.Assign(s8z24_pixel.s8);
+                z24s8_pixel.z24.Assign(s8z24_pixel.z24);
+                std::memcpy(&data[offset], &z24s8_pixel, sizeof(Z24S8));
+            }
        }
    }
 }
@@ -561,7 +569,7 @@ static void ConvertFormatAsNeeded_LoadGLBuffer(std::vector<u8>& data, PixelForma
    }
    case PixelFormat::S8Z24:
        // Convert the S8Z24 depth format to Z24S8, as OpenGL does not support S8Z24.
-        ConvertS8Z24ToZ24S8(data, width, height);
+        ConvertS8Z24ToZ24S8(data, width, height, false);
        break;

    case PixelFormat::G8R8U:
@@ -572,6 +580,30 @@ static void ConvertFormatAsNeeded_LoadGLBuffer(std::vector<u8>& data, PixelForma
    }
 }

+/**
+ * Helper function to perform software conversion (as needed) when flushing a buffer from OpenGL to
+ * Switch memory. This is for Maxwell pixel formats that cannot be represented as-is in OpenGL or
+ * with typical desktop GPUs.
+ */
+static void ConvertFormatAsNeeded_FlushGLBuffer(std::vector<u8>& data, PixelFormat pixel_format,
+                                                u32 width, u32 height) {
+    switch (pixel_format) {
+    case PixelFormat::G8R8U:
+    case PixelFormat::G8R8S:
+    case PixelFormat::ASTC_2D_4X4:
+    case PixelFormat::ASTC_2D_8X8: {
+        LOG_CRITICAL(HW_GPU, "Conversion of format {} after texture flushing is not implemented",
+                     static_cast<u32>(pixel_format));
+        UNREACHABLE();
+        break;
+    }
+    case PixelFormat::S8Z24:
+        // Convert the Z24S8 depth format to S8Z24, as OpenGL does not support S8Z24.
+        ConvertS8Z24ToZ24S8(data, width, height, true);
+        break;
+    }
+}
+
 MICROPROFILE_DEFINE(OpenGL_SurfaceLoad, "OpenGL", "Surface Load", MP_RGB(128, 64, 192));
 void CachedSurface::LoadGLBuffer() {
    ASSERT(params.type != SurfaceType::Fill);
@@ -609,11 +641,70 @@ void CachedSurface::LoadGLBuffer() {
    }

    ConvertFormatAsNeeded_LoadGLBuffer(gl_buffer, params.pixel_format, params.width, params.height);
+
+    dirty = false;
 }

 MICROPROFILE_DEFINE(OpenGL_SurfaceFlush, "OpenGL", "Surface Flush", MP_RGB(128, 192, 64));
 void CachedSurface::FlushGLBuffer() {
-    ASSERT_MSG(false, "Unimplemented");
+    MICROPROFILE_SCOPE(OpenGL_SurfaceFlush);
+    // Reads the surface's texture back from OpenGL and writes it out to memory at params.addr.
+    const auto& rect{params.GetRect()};
+
+    // Compute the byte offset of the surface rect inside the staging buffer
+    const GLint x0 = static_cast<GLint>(rect.left);
+    const GLint y0 = static_cast<GLint>(rect.bottom);
+    const size_t buffer_offset =
+        static_cast<size_t>(static_cast<size_t>(y0) * params.width + static_cast<size_t>(x0)) *
+        GetGLBytesPerPixel(params.pixel_format);
+
+    const u32 bytes_per_pixel = GetGLBytesPerPixel(params.pixel_format);
+    const u32 copy_size = params.width * params.height * bytes_per_pixel;
+    gl_buffer.resize(static_cast<size_t>(params.depth) * copy_size);
+
+    const FormatTuple& tuple = GetFormatTuple(params.pixel_format, params.component_type);
+
+    // Ensure no bad interactions with GL_PACK_ALIGNMENT
+    ASSERT(params.width * GetGLBytesPerPixel(params.pixel_format) % 4 == 0);
+    glPixelStorei(GL_PACK_ROW_LENGTH, static_cast<GLint>(params.width));
+
+    ASSERT(!tuple.compressed);
+    ASSERT(x0 == 0 && y0 == 0);
+    // With no pixel pack buffer bound, glGetTextureImage writes straight into gl_buffer
+    glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
+    glGetTextureImage(texture.handle, 0, tuple.format, tuple.type, gl_buffer.size(),
+                      gl_buffer.data());
+    glPixelStorei(GL_PACK_ROW_LENGTH, 0);
+
+    ConvertFormatAsNeeded_FlushGLBuffer(gl_buffer, params.pixel_format, params.width,
+                                        params.height);
+
+    ASSERT(params.type != SurfaceType::Fill);
+
+    const u8* const texture_src_data = Memory::GetPointer(params.addr);
+
+    ASSERT(texture_src_data);
+
+    if (params.is_tiled) {
+        // TODO(bunnei): This only swizzles and copies a 2D texture - we do not yet know how to do
+        // this for 3D textures, etc.
+        switch (params.target) {
+        case SurfaceParams::SurfaceTarget::Texture2D:
+            // Pass impl. to the fallback code below
+            break;
+        default:
+            LOG_CRITICAL(HW_GPU, "Unimplemented tiled unload for target={}",
+                         static_cast<u32>(params.target));
+            UNREACHABLE();
+        }
+
+        gl_to_morton_fns[static_cast<size_t>(params.pixel_format)](
+            params.width, params.block_height, params.height, &gl_buffer[buffer_offset], copy_size,
+            params.addr + buffer_offset);
+    } else {
+        Memory::WriteBlock(params.addr + buffer_offset, &gl_buffer[buffer_offset],
+                           gl_buffer.size() - buffer_offset);
+    }
 }

 MICROPROFILE_DEFINE(OpenGL_TextureUL, "OpenGL", "Texture Upload", MP_RGB(128, 64, 192));
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
@@ -741,6 +741,19 @@ public:
        return params.size_in_bytes;
    }

+    void Flush() {
+        // There is no need to flush the surface if it hasn't been modified by us.
+        if (!dirty)
+            return;
+
+        FlushGLBuffer();
+        dirty = false;
+    }
+
+    void MarkAsDirty() {
+        dirty = true;
+    }
+
    const OGLTexture& Texture() const {
        return texture;
    }
@@ -772,6 +785,7 @@ private:
    std::vector<u8> gl_buffer;
    SurfaceParams params;
    GLenum gl_target;
+    bool dirty = false;
 };

 class RasterizerCacheOpenGL final : public RasterizerCache<Surface> {
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -32,6 +32,9 @@ public:
        return GLShader::MAX_PROGRAM_CODE_LENGTH * sizeof(u64);
    }

+    // We do not have to flush this cache as things in it are never modified by us.
+    void Flush() {}
+
    /// Gets the shader entries for the shader
    const GLShader::ShaderEntries& GetShaderEntries() const {
        return entries;
--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@@ -88,6 +88,38 @@ void FastSwizzleData(u32 width, u32 height, u32 bytes_per_pixel, u8* swizzled_da
    }
 }

+void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
+                    u32 bytes_per_pixel, VAddr swizzled_data, VAddr unswizzled_data,
+                    u32 block_height) {
+    for (u32 line = 0; line < subrect_height; ++line) {
+        for (u32 x = 0; x < subrect_width; ++x) {
+            u32 swizzled_offset =
+                GetSwizzleOffset(x, line, swizzled_width, bytes_per_pixel, block_height);
+
+            const VAddr source_line = unswizzled_data + line * source_pitch + x * bytes_per_pixel;
+            const VAddr dest_addr = swizzled_data + swizzled_offset;
+
+            Memory::CopyBlock(dest_addr, source_line, bytes_per_pixel);
+        }
+    }
+}
+
+void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32 swizzled_width,
+                      u32 bytes_per_pixel, VAddr swizzled_data, VAddr unswizzled_data,
+                      u32 block_height, u32 offset_x, u32 offset_y) {
+    for (u32 line = 0; line < subrect_height; ++line) {
+        for (u32 x = 0; x < subrect_width; ++x) {
+            u32 swizzled_offset = GetSwizzleOffset(x + offset_x, line + offset_y, swizzled_width,
+                                                   bytes_per_pixel, block_height);
+            // Linear destination: rows are dest_pitch bytes apart, pixels bytes_per_pixel apart.
+            const VAddr dest_line = unswizzled_data + line * dest_pitch + x * bytes_per_pixel;
+            const VAddr source_addr = swizzled_data + swizzled_offset;
+            // Copy a single pixel from the tiled source to its linear position.
+            Memory::CopyBlock(dest_line, source_addr, bytes_per_pixel);
+        }
+    }
+}
+
 u32 BytesPerPixel(TextureFormat format) {
    switch (format) {
    case TextureFormat::DXT1:
--- a/src/video_core/textures/decoders.h
+++ b/src/video_core/textures/decoders.h
@@ -26,6 +26,16 @@ std::vector<u8> UnswizzleDepthTexture(VAddr address, DepthFormat format, u32 wid
 void CopySwizzledData(u32 width, u32 height, u32 bytes_per_pixel, u32 out_bytes_per_pixel,
                      u8* swizzled_data, u8* unswizzled_data, bool unswizzle, u32 block_height);

+/// Copies an untiled subrectangle into a tiled surface.
+void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
+                    u32 bytes_per_pixel, VAddr swizzled_data, VAddr unswizzled_data,
+                    u32 block_height);
+
+/// Copies a tiled subrectangle into a linear surface.
+void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32 swizzled_width,
+                      u32 bytes_per_pixel, VAddr swizzled_data, VAddr unswizzled_data,
+                      u32 block_height, u32 offset_x, u32 offset_y);
+
 /**
 * Decodes an unswizzled texture into a A8R8G8B8 texture.
 */
Author	SHA1	Message	Date
Subv	51d4c772a2	RasterizerCache: Swizzle to a staging buffer before copying it to memory when flushing a texture. This gives us slightly increased memory safety: writing to out-of-bounds memory will be caught at the point of the write instead of surfacing several calls down the line as memory corruption.	2018-09-21 13:53:13 -05:00
Subv	2fd06acc8f	GPU/DMA: Fixed Tiled->Linear transfers. We no longer write to out-of-bounds memory.	2018-09-21 13:53:12 -05:00
Subv	bdb3920753	GPU/DMA: Copy the requested amount of data when doing Linear->Linear 2D transfers.	2018-09-21 13:53:12 -05:00
Subv	bb9eeba670	GPU/DMA: Fixed the Linear->Tiled and Linear->Linear transfer modes. This fixes the loading bar in Has-Been Heroes. The Tiled->Tiled transfer mode is not implemented yet and will assert.	2018-09-21 13:53:12 -05:00
Subv	47826fd090	RasterizerGL: Flush dirty cached objects before removing them from the cache. There might be a case where we draw to a framebuffer (thus making it dirty) and then proceed to overwrite only a portion of it from the CPU. The current code would cause the rest of the modified framebuffer to be discarded.	2018-09-21 13:53:11 -05:00
Subv	b87b8db879	GPU/Engines: Flush and invalidate source/dest regions in Fermi2D and KeplerMemory copies.	2018-09-21 13:53:11 -05:00
Subv	23e1dd3c4d	RasterizerCache: Do not flush surfaces that have not been modified. This should bring the performance closer to what it was before flushing was implemented.	2018-09-21 13:53:10 -05:00
Subv	acfc8d7fc9	RasterizerCache: Convert the S8Z24 format from Z24S8 back to S8Z24 when flushing.	2018-09-21 13:53:10 -05:00
Subv	81afcf3f4e	RasterizerCache: Allow flushing compressed formats.	2018-09-21 13:53:10 -05:00
Subv	b27ba353d4	RasterizerCache: Log problematic formats when flushing a surface.	2018-09-21 13:53:09 -05:00
Subv	8474a5adf7	GPU/DMA: Implemented source offset copies for unswizzling Tiled->Linear transfers. This is used by nouveau to implement glReadPixels.	2018-09-21 13:53:09 -05:00
Subv	ca7eb39b86	RasterizerCache: Remove the glGetError call after a surface flush	2018-09-21 13:53:08 -05:00
Subv	393f2418c5	GPU/DMA: Flush the source memory region before a DMA transfer and invalidate the destination region after the transfer.	2018-09-21 13:53:08 -05:00
Subv	d0a814eaca	GPU/DMA: Pass the current rasterizer as a variable when constructing the DMA engine.	2018-09-21 13:53:06 -05:00
Subv	047bbe0881	RasterizerCache: Re-introduced the code to flush the various resource caches.	2018-09-21 13:53:06 -05:00
Subv	181aca9af0	RasterizerCache: Reintroduced code for flushing OpenGL surfaces back to memory. This is required for a proper implementation of the DMA and 2D engines.	2018-09-21 13:53:05 -05:00