gl_shader_decompiler: Implement SEL instruction.

Merge pull request #761 from bunnei/improve-raster-cache
Improvements to rasterizer cache
2018-07-22 00:37:12 -04:00 · 2018-07-21 20:28:53 -07:00 · 2018-07-21 21:51:06 -04:00 · 2018-07-21 21:51:06 -04:00 · 2018-07-21 21:51:06 -04:00 · 2018-07-21 21:51:06 -04:00
20 changed files with 236 additions and 214 deletions
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -63,7 +63,6 @@ add_library(common STATIC
    string_util.cpp
    string_util.h
    swap.h
-    synchronized_wrapper.h
    telemetry.cpp
    telemetry.h
    thread.cpp
--- a/src/common/file_util.cpp
+++ b/src/common/file_util.cpp
@@ -838,8 +838,7 @@ std::string GetPathWithoutTop(std::string path) {
    }
    const auto name_bck_index = path.find_first_of('\\');
    const auto name_fwd_index = path.find_first_of('/');
-    return path.substr(std::min<size_t>(name_bck_index, name_fwd_index) + 1);
-    return path.substr(std::min<size_t>(name_bck_index, name_fwd_index) + 1);
+    return path.substr(std::min(name_bck_index, name_fwd_index) + 1);
 }

 std::string GetFilename(std::string path) {
--- a/src/common/synchronized_wrapper.h
+++ b/src/common/synchronized_wrapper.h
@@ -1,85 +0,0 @@
-// Copyright 2015 Citra Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-#include <algorithm>
-#include <mutex>
-
-namespace Common {
-
-template <typename T>
-class SynchronizedWrapper;
-
-/**
- * Synchronized reference, that keeps a SynchronizedWrapper's mutex locked during its lifetime. This
- * greatly reduces the chance that someone will access the wrapped resource without locking the
- * mutex.
- */
-template <typename T>
-class SynchronizedRef {
-public:
-    SynchronizedRef(SynchronizedWrapper<T>& wrapper) : wrapper(&wrapper) {
-        wrapper.mutex.lock();
-    }
-
-    SynchronizedRef(SynchronizedRef&) = delete;
-    SynchronizedRef(SynchronizedRef&& o) : wrapper(o.wrapper) {
-        o.wrapper = nullptr;
-    }
-
-    ~SynchronizedRef() {
-        if (wrapper)
-            wrapper->mutex.unlock();
-    }
-
-    SynchronizedRef& operator=(SynchronizedRef&) = delete;
-    SynchronizedRef& operator=(SynchronizedRef&& o) {
-        std::swap(wrapper, o.wrapper);
-        return *this;
-    }
-
-    T& operator*() {
-        return wrapper->data;
-    }
-    const T& operator*() const {
-        return wrapper->data;
-    }
-
-    T* operator->() {
-        return &wrapper->data;
-    }
-    const T* operator->() const {
-        return &wrapper->data;
-    }
-
-private:
-    SynchronizedWrapper<T>* wrapper;
-};
-
-/**
- * Wraps an object, only allowing access to it via a locking reference wrapper. Good to ensure no
- * one forgets to lock a mutex before acessing an object. To access the wrapped object construct a
- * SyncronizedRef on this wrapper. Inspired by Rust's Mutex type
- * (http://doc.rust-lang.org/std/sync/struct.Mutex.html).
- */
-template <typename T>
-class SynchronizedWrapper {
-public:
-    template <typename... Args>
-    SynchronizedWrapper(Args&&... args) : data(std::forward<Args>(args)...) {}
-
-    SynchronizedRef<T> Lock() {
-        return {*this};
-    }
-
-private:
-    template <typename U>
-    friend class SynchronizedRef;
-
-    std::mutex mutex;
-    T data;
-};
-
-} // namespace Common
--- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
@@ -42,6 +42,9 @@ u32 nvhost_gpu::ioctl(Ioctl command, const std::vector<u8>& input, std::vector<u
        if (command.cmd == NVGPU_IOCTL_CHANNEL_SUBMIT_GPFIFO) {
            return SubmitGPFIFO(input, output);
        }
+        if (command.cmd == NVGPU_IOCTL_CHANNEL_KICKOFF_PB) {
+            return KickoffPB(input, output);
+        }
    }

    UNIMPLEMENTED_MSG("Unimplemented ioctl");
@@ -127,14 +130,37 @@ u32 nvhost_gpu::SubmitGPFIFO(const std::vector<u8>& input, std::vector<u8>& outp
    IoctlSubmitGpfifo params{};
    std::memcpy(&params, input.data(), sizeof(IoctlSubmitGpfifo));
    LOG_WARNING(Service_NVDRV, "(STUBBED) called, gpfifo={:X}, num_entries={:X}, flags={:X}",
-                params.gpfifo, params.num_entries, params.flags);
+                params.address, params.num_entries, params.flags);

    auto entries = std::vector<IoctlGpfifoEntry>();
    entries.resize(params.num_entries);
    std::memcpy(&entries[0], &input.data()[sizeof(IoctlSubmitGpfifo)],
                params.num_entries * sizeof(IoctlGpfifoEntry));
    for (auto entry : entries) {
-        VAddr va_addr = entry.Address();
+        Tegra::GPUVAddr va_addr = entry.Address();
+        Core::System::GetInstance().GPU().ProcessCommandList(va_addr, entry.sz);
+    }
+    params.fence_out.id = 0;
+    params.fence_out.value = 0;
+    std::memcpy(output.data(), &params, output.size());
+    return 0;
+}
+
+u32 nvhost_gpu::KickoffPB(const std::vector<u8>& input, std::vector<u8>& output) {
+    if (input.size() < sizeof(IoctlSubmitGpfifo)) {
+        UNIMPLEMENTED();
+    }
+    IoctlSubmitGpfifo params{};
+    std::memcpy(&params, input.data(), sizeof(IoctlSubmitGpfifo));
+    LOG_WARNING(Service_NVDRV, "(STUBBED) called, gpfifo={:X}, num_entries={:X}, flags={:X}",
+                params.address, params.num_entries, params.flags);
+
+    std::vector<IoctlGpfifoEntry> entries(params.num_entries);
+    Memory::ReadBlock(params.address, entries.data(),
+                      params.num_entries * sizeof(IoctlGpfifoEntry));
+
+    for (auto entry : entries) {
+        Tegra::GPUVAddr va_addr = entry.Address();
        Core::System::GetInstance().GPU().ProcessCommandList(va_addr, entry.sz);
    }
    params.fence_out.id = 0;
--- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.h
+++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.h
@@ -15,6 +15,7 @@ namespace Service::Nvidia::Devices {
 class nvmap;
 constexpr u32 NVGPU_IOCTL_MAGIC('H');
 constexpr u32 NVGPU_IOCTL_CHANNEL_SUBMIT_GPFIFO(0x8);
+constexpr u32 NVGPU_IOCTL_CHANNEL_KICKOFF_PB(0x1b);

 class nvhost_gpu final : public nvdevice {
 public:
@@ -158,14 +159,14 @@ private:
            BitField<31, 1, u32_le> unk2;
        };

-        VAddr Address() const {
-            return (static_cast<VAddr>(gpu_va_hi) << 32) | entry0;
+        Tegra::GPUVAddr Address() const {
+            return (static_cast<Tegra::GPUVAddr>(gpu_va_hi) << 32) | entry0;
        }
    };
    static_assert(sizeof(IoctlGpfifoEntry) == 8, "IoctlGpfifoEntry is incorrect size");

    struct IoctlSubmitGpfifo {
-        u64_le gpfifo;      // (ignored) pointer to gpfifo fence structs
+        u64_le address;     // pointer to gpfifo entry structs
        u32_le num_entries; // number of fence objects being submitted
        u32_le flags;
        IoctlFence fence_out; // returned new fence object for others to wait on
@@ -193,6 +194,7 @@ private:
    u32 AllocGPFIFOEx2(const std::vector<u8>& input, std::vector<u8>& output);
    u32 AllocateObjectContext(const std::vector<u8>& input, std::vector<u8>& output);
    u32 SubmitGPFIFO(const std::vector<u8>& input, std::vector<u8>& output);
+    u32 KickoffPB(const std::vector<u8>& input, std::vector<u8>& output);
    u32 GetWaitbase(const std::vector<u8>& input, std::vector<u8>& output);
    u32 ChannelSetTimeout(const std::vector<u8>& input, std::vector<u8>& output);

--- a/src/core/hle/service/nvdrv/interface.cpp
+++ b/src/core/hle/service/nvdrv/interface.cpp
@@ -101,7 +101,7 @@ NVDRV::NVDRV(std::shared_ptr<Module> nvdrv, const char* name)
        {8, &NVDRV::SetClientPID, "SetClientPID"},
        {9, nullptr, "DumpGraphicsMemoryInfo"},
        {10, nullptr, "InitializeDevtools"},
-        {11, nullptr, "Ioctl2"},
+        {11, &NVDRV::Ioctl, "Ioctl2"},
        {12, nullptr, "Ioctl3"},
        {13, &NVDRV::FinishInitialize, "FinishInitialize"},
    };
--- a/src/video_core/engines/fermi_2d.cpp
+++ b/src/video_core/engines/fermi_2d.cpp
@@ -6,8 +6,7 @@
 #include "video_core/engines/fermi_2d.h"
 #include "video_core/textures/decoders.h"

-namespace Tegra {
-namespace Engines {
+namespace Tegra::Engines {

 Fermi2D::Fermi2D(MemoryManager& memory_manager) : memory_manager(memory_manager) {}

@@ -69,5 +68,4 @@ void Fermi2D::HandleSurfaceCopy() {
    }
 }

-} // namespace Engines
-} // namespace Tegra
+} // namespace Tegra::Engines
--- a/src/video_core/engines/fermi_2d.h
+++ b/src/video_core/engines/fermi_2d.h
@@ -12,8 +12,7 @@
 #include "video_core/gpu.h"
 #include "video_core/memory_manager.h"

-namespace Tegra {
-namespace Engines {
+namespace Tegra::Engines {

 #define FERMI2D_REG_INDEX(field_name)                                                              \
    (offsetof(Tegra::Engines::Fermi2D::Regs, field_name) / sizeof(u32))
@@ -110,5 +109,4 @@ ASSERT_REG_POSITION(operation, 0xAB);
 ASSERT_REG_POSITION(trigger, 0xB5);
 #undef ASSERT_REG_POSITION

-} // namespace Engines
-} // namespace Tegra
+} // namespace Tegra::Engines
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -17,8 +17,7 @@
 #include "video_core/memory_manager.h"
 #include "video_core/textures/texture.h"

-namespace Tegra {
-namespace Engines {
+namespace Tegra::Engines {

 #define MAXWELL3D_REG_INDEX(field_name)                                                            \
    (offsetof(Tegra::Engines::Maxwell3D::Regs, field_name) / sizeof(u32))
@@ -488,7 +487,12 @@ public:
                    };
                } rt_control;

-                INSERT_PADDING_WORDS(0x2B);
+                INSERT_PADDING_WORDS(0x2);
+
+                u32 zeta_width;
+                u32 zeta_height;
+
+                INSERT_PADDING_WORDS(0x27);

                u32 depth_test_enable;

@@ -541,7 +545,11 @@ public:

                u32 vb_element_base;

-                INSERT_PADDING_WORDS(0x49);
+                INSERT_PADDING_WORDS(0x40);
+
+                u32 zeta_enable;
+
+                INSERT_PADDING_WORDS(0x8);

                struct {
                    u32 tsc_address_high;
@@ -866,6 +874,8 @@ ASSERT_REG_POSITION(clear_depth, 0x364);
 ASSERT_REG_POSITION(zeta, 0x3F8);
 ASSERT_REG_POSITION(vertex_attrib_format[0], 0x458);
 ASSERT_REG_POSITION(rt_control, 0x487);
+ASSERT_REG_POSITION(zeta_width, 0x48a);
+ASSERT_REG_POSITION(zeta_height, 0x48b);
 ASSERT_REG_POSITION(depth_test_enable, 0x4B3);
 ASSERT_REG_POSITION(independent_blend_enable, 0x4B9);
 ASSERT_REG_POSITION(depth_write_enabled, 0x4BA);
@@ -875,6 +885,7 @@ ASSERT_REG_POSITION(blend, 0x4CF);
 ASSERT_REG_POSITION(stencil, 0x4E0);
 ASSERT_REG_POSITION(screen_y_control, 0x4EB);
 ASSERT_REG_POSITION(vb_element_base, 0x50D);
+ASSERT_REG_POSITION(zeta_enable, 0x54E);
 ASSERT_REG_POSITION(tsc, 0x557);
 ASSERT_REG_POSITION(tic, 0x55D);
 ASSERT_REG_POSITION(stencil_two_side, 0x565);
@@ -898,5 +909,4 @@ ASSERT_REG_POSITION(tex_info_buffers.size[0], 0xD2F);

 #undef ASSERT_REG_POSITION

-} // namespace Engines
-} // namespace Tegra
+} // namespace Tegra::Engines
--- a/src/video_core/engines/maxwell_compute.h
+++ b/src/video_core/engines/maxwell_compute.h
@@ -6,8 +6,7 @@

 #include "common/common_types.h"

-namespace Tegra {
-namespace Engines {
+namespace Tegra::Engines {

 class MaxwellCompute final {
 public:
@@ -18,5 +17,4 @@ public:
    void WriteReg(u32 method, u32 value);
 };

-} // namespace Engines
-} // namespace Tegra
+} // namespace Tegra::Engines
--- a/src/video_core/engines/maxwell_dma.h
+++ b/src/video_core/engines/maxwell_dma.h
@@ -12,8 +12,7 @@
 #include "video_core/gpu.h"
 #include "video_core/memory_manager.h"

-namespace Tegra {
-namespace Engines {
+namespace Tegra::Engines {

 class MaxwellDMA final {
 public:
@@ -151,5 +150,4 @@ ASSERT_REG_POSITION(src_params, 0x1CA);

 #undef ASSERT_REG_POSITION

-} // namespace Engines
-} // namespace Tegra
+} // namespace Tegra::Engines
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -15,8 +15,7 @@
 #include "common/bit_field.h"
 #include "common/common_types.h"

-namespace Tegra {
-namespace Shader {
+namespace Tegra::Shader {

 struct Register {
    /// Number of registers
@@ -109,8 +108,7 @@ union Sampler {
    u64 value{};
 };

-} // namespace Shader
-} // namespace Tegra
+} // namespace Tegra::Shader

 namespace std {

@@ -127,8 +125,7 @@ struct make_unsigned<Tegra::Shader::Register> {

 } // namespace std

-namespace Tegra {
-namespace Shader {
+namespace Tegra::Shader {

 enum class Pred : u64 {
    UnusedIndex = 0x7,
@@ -291,6 +288,11 @@ union Instruction {
        BitField<49, 1, u64> negate_a;
    } alu_integer;

+    union {
+        BitField<39, 3, u64> pred;
+        BitField<42, 1, u64> neg_pred;
+    } sel;
+
    union {
        BitField<39, 3, u64> pred;
        BitField<42, 1, u64> negate_pred;
@@ -516,6 +518,9 @@ public:
        ISCADD_C, // Scale and Add
        ISCADD_R,
        ISCADD_IMM,
+        SEL_C,
+        SEL_R,
+        SEL_IMM,
        MUFU,  // Multi-Function Operator
        RRO_C, // Range Reduction Operator
        RRO_R,
@@ -716,6 +721,9 @@ private:
            INST("0100110000011---", Id::ISCADD_C, Type::ArithmeticInteger, "ISCADD_C"),
            INST("0101110000011---", Id::ISCADD_R, Type::ArithmeticInteger, "ISCADD_R"),
            INST("0011100-00011---", Id::ISCADD_IMM, Type::ArithmeticInteger, "ISCADD_IMM"),
+            INST("0100110010100---", Id::SEL_C, Type::ArithmeticInteger, "SEL_C"),
+            INST("0101110010100---", Id::SEL_R, Type::ArithmeticInteger, "SEL_R"),
+            INST("0011100010100---", Id::SEL_IMM, Type::ArithmeticInteger, "SEL_IMM"),
            INST("0101000010000---", Id::MUFU, Type::Arithmetic, "MUFU"),
            INST("0100110010010---", Id::RRO_C, Type::Arithmetic, "RRO_C"),
            INST("0101110010010---", Id::RRO_R, Type::Arithmetic, "RRO_R"),
@@ -784,5 +792,4 @@ private:
    }
 };

-} // namespace Shader
-} // namespace Tegra
+} // namespace Tegra::Shader
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -387,7 +387,7 @@ void RasterizerOpenGL::Clear() {
    }
    if (regs.clear_buffers.Z) {
        clear_mask |= GL_DEPTH_BUFFER_BIT;
-        use_depth_fb = true;
+        use_depth_fb = regs.zeta_enable != 0;

        // Always enable the depth write when clearing the depth buffer. The depth write mask is
        // ignored when clearing the buffer in the Switch, but OpenGL obeys it so we set it to true.
@@ -413,11 +413,13 @@ void RasterizerOpenGL::Clear() {
    glClear(clear_mask);

    // Mark framebuffer surfaces as dirty
-    if (dirty_color_surface != nullptr) {
-        res_cache.MarkSurfaceAsDirty(dirty_color_surface);
-    }
-    if (dirty_depth_surface != nullptr) {
-        res_cache.MarkSurfaceAsDirty(dirty_depth_surface);
+    if (Settings::values.use_accurate_framebuffers) {
+        if (dirty_color_surface != nullptr) {
+            res_cache.FlushSurface(dirty_color_surface);
+        }
+        if (dirty_depth_surface != nullptr) {
+            res_cache.FlushSurface(dirty_depth_surface);
+        }
    }
 }

@@ -431,7 +433,7 @@ void RasterizerOpenGL::DrawArrays() {
    ScopeAcquireGLContext acquire_context;

    auto [dirty_color_surface, dirty_depth_surface] =
-        ConfigureFramebuffers(true, regs.zeta.Address() != 0);
+        ConfigureFramebuffers(true, regs.zeta.Address() != 0 && regs.zeta_enable != 0);

    SyncDepthTestState();
    SyncBlendState();
@@ -520,11 +522,13 @@ void RasterizerOpenGL::DrawArrays() {
    state.Apply();

    // Mark framebuffer surfaces as dirty
-    if (dirty_color_surface != nullptr) {
-        res_cache.MarkSurfaceAsDirty(dirty_color_surface);
-    }
-    if (dirty_depth_surface != nullptr) {
-        res_cache.MarkSurfaceAsDirty(dirty_depth_surface);
+    if (Settings::values.use_accurate_framebuffers) {
+        if (dirty_color_surface != nullptr) {
+            res_cache.FlushSurface(dirty_color_surface);
+        }
+        if (dirty_depth_surface != nullptr) {
+            res_cache.FlushSurface(dirty_depth_surface);
+        }
    }
 }

--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -65,9 +65,9 @@ struct FormatTuple {
    return params;
 }

-/*static*/ SurfaceParams SurfaceParams::CreateForDepthBuffer(
-    const Tegra::Engines::Maxwell3D::Regs::RenderTargetConfig& config, Tegra::GPUVAddr zeta_address,
-    Tegra::DepthFormat format) {
+/*static*/ SurfaceParams SurfaceParams::CreateForDepthBuffer(u32 zeta_width, u32 zeta_height,
+                                                             Tegra::GPUVAddr zeta_address,
+                                                             Tegra::DepthFormat format) {

    SurfaceParams params{};
    params.addr = zeta_address;
@@ -77,9 +77,9 @@ struct FormatTuple {
    params.component_type = ComponentTypeFromDepthFormat(format);
    params.type = GetFormatType(params.pixel_format);
    params.size_in_bytes = params.SizeInBytes();
-    params.width = config.width;
-    params.height = config.height;
-    params.unaligned_height = config.height;
+    params.width = zeta_width;
+    params.height = zeta_height;
+    params.unaligned_height = zeta_height;
    params.size_in_bytes = params.SizeInBytes();
    return params;
 }
@@ -254,6 +254,60 @@ static void AllocateSurfaceTexture(GLuint texture, const FormatTuple& format_tup
    cur_state.Apply();
 }

+static bool BlitTextures(GLuint src_tex, const MathUtil::Rectangle<u32>& src_rect, GLuint dst_tex,
+                         const MathUtil::Rectangle<u32>& dst_rect, SurfaceType type,
+                         GLuint read_fb_handle, GLuint draw_fb_handle) {
+    OpenGLState prev_state{OpenGLState::GetCurState()};
+    SCOPE_EXIT({ prev_state.Apply(); });
+
+    OpenGLState state;
+    state.draw.read_framebuffer = read_fb_handle;
+    state.draw.draw_framebuffer = draw_fb_handle;
+    state.Apply();
+
+    u32 buffers{};
+
+    if (type == SurfaceType::ColorTexture) {
+        glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, src_tex,
+                               0);
+        glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0,
+                               0);
+
+        glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, dst_tex,
+                               0);
+        glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0,
+                               0);
+
+        buffers = GL_COLOR_BUFFER_BIT;
+    } else if (type == SurfaceType::Depth) {
+        glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0);
+        glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D, src_tex, 0);
+        glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0);
+
+        glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0);
+        glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_TEXTURE_2D, dst_tex, 0);
+        glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0);
+
+        buffers = GL_DEPTH_BUFFER_BIT;
+    } else if (type == SurfaceType::DepthStencil) {
+        glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0);
+        glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D,
+                               src_tex, 0);
+
+        glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0);
+        glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D,
+                               dst_tex, 0);
+
+        buffers = GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT;
+    }
+
+    glBlitFramebuffer(src_rect.left, src_rect.bottom, src_rect.right, src_rect.top, dst_rect.left,
+                      dst_rect.bottom, dst_rect.right, dst_rect.top, buffers,
+                      buffers == GL_COLOR_BUFFER_BIT ? GL_LINEAR : GL_NEAREST);
+
+    return true;
+}
+
 CachedSurface::CachedSurface(const SurfaceParams& params) : params(params) {
    texture.Create();
    const auto& rect{params.GetRect()};
@@ -519,8 +573,8 @@ SurfaceSurfaceRect_Tuple RasterizerCacheOpenGL::GetFramebufferSurfaces(
    }

    if (using_depth_fb) {
-        depth_params =
-            SurfaceParams::CreateForDepthBuffer(regs.rt[0], regs.zeta.Address(), regs.zeta.format);
+        depth_params = SurfaceParams::CreateForDepthBuffer(regs.zeta_width, regs.zeta_height,
+                                                           regs.zeta.Address(), regs.zeta.format);
    }

    MathUtil::Rectangle<u32> color_rect{};
@@ -565,17 +619,9 @@ void RasterizerCacheOpenGL::LoadSurface(const Surface& surface) {
    surface->UploadGLTexture(read_framebuffer.handle, draw_framebuffer.handle);
 }

-void RasterizerCacheOpenGL::MarkSurfaceAsDirty(const Surface& surface) {
-    if (Settings::values.use_accurate_framebuffers) {
-        // If enabled, always flush dirty surfaces
-        surface->DownloadGLTexture(read_framebuffer.handle, draw_framebuffer.handle);
-        surface->FlushGLBuffer();
-    } else {
-        // Otherwise, don't mark surfaces that we write to as cached, because the resulting loads
-        // and flushes are very slow and do not seem to improve accuracy
-        const auto& params{surface->GetSurfaceParams()};
-        Memory::RasterizerMarkRegionCached(params.addr, params.size_in_bytes, false);
-    }
+void RasterizerCacheOpenGL::FlushSurface(const Surface& surface) {
+    surface->DownloadGLTexture(read_framebuffer.handle, draw_framebuffer.handle);
+    surface->FlushGLBuffer();
 }

 Surface RasterizerCacheOpenGL::GetSurface(const SurfaceParams& params) {
@@ -588,25 +634,53 @@ Surface RasterizerCacheOpenGL::GetSurface(const SurfaceParams& params) {
    if (gpu.memory_manager->GpuToCpuAddress(params.addr) == boost::none)
        return {};

-    // Check for an exact match in existing surfaces
-    const auto& surface_key{SurfaceKey::Create(params)};
-    const auto& search{surface_cache.find(surface_key)};
+    // Look up surface in the cache based on address
+    const auto& search{surface_cache.find(params.addr)};
    Surface surface;
    if (search != surface_cache.end()) {
        surface = search->second;
        if (Settings::values.use_accurate_framebuffers) {
-            // Reload the surface from Switch memory
-            LoadSurface(surface);
+            // If use_accurate_framebuffers is enabled, flush the cached copy and reload from memory
+            FlushSurface(surface);
+            UnregisterSurface(surface);
+        } else if (surface->GetSurfaceParams() != params) {
+            // If surface parameters changed, recreate the surface from the old one
+            return RecreateSurface(surface, params);
+        } else {
+            // Use the cached surface as-is
+            return surface;
        }
-    } else {
-        surface = std::make_shared<CachedSurface>(params);
-        RegisterSurface(surface);
-        LoadSurface(surface);
    }

+    // No reusable surface (not cached, or just flushed and unregistered above) - create a new one
+    surface = std::make_shared<CachedSurface>(params);
+    RegisterSurface(surface);
+    LoadSurface(surface);
+
    return surface;
 }

+Surface RasterizerCacheOpenGL::RecreateSurface(const Surface& surface,
+                                               const SurfaceParams& new_params) {
+    // Verify surface is compatible for blitting
+    const auto& params{surface->GetSurfaceParams()};
+    ASSERT(params.type == new_params.type);
+    ASSERT(params.pixel_format == new_params.pixel_format);
+    ASSERT(params.component_type == new_params.component_type);
+
+    // Create a new surface with the new parameters, and blit the previous surface to it
+    Surface new_surface{std::make_shared<CachedSurface>(new_params)};
+    BlitTextures(surface->Texture().handle, params.GetRect(), new_surface->Texture().handle,
+                 new_surface->GetSurfaceParams().GetRect(), params.type, read_framebuffer.handle,
+                 draw_framebuffer.handle);
+
+    // Update cache accordingly
+    UnregisterSurface(surface);
+    RegisterSurface(new_surface);
+
+    return new_surface;
+}
+
 Surface RasterizerCacheOpenGL::TryFindFramebufferSurface(VAddr cpu_addr) const {
    // Tries to find the GPU address of a framebuffer based on the CPU address. This is because
    // final output framebuffers are specified by CPU address, but internally our GPU cache uses
@@ -652,22 +726,20 @@ void RasterizerCacheOpenGL::InvalidateRegion(Tegra::GPUVAddr addr, size_t size)

 void RasterizerCacheOpenGL::RegisterSurface(const Surface& surface) {
    const auto& params{surface->GetSurfaceParams()};
-    const auto& surface_key{SurfaceKey::Create(params)};
-    const auto& search{surface_cache.find(surface_key)};
+    const auto& search{surface_cache.find(params.addr)};

    if (search != surface_cache.end()) {
        // Registered already
        return;
    }

-    surface_cache[surface_key] = surface;
+    surface_cache[params.addr] = surface;
    UpdatePagesCachedCount(params.addr, params.size_in_bytes, 1);
 }

 void RasterizerCacheOpenGL::UnregisterSurface(const Surface& surface) {
    const auto& params{surface->GetSurfaceParams()};
-    const auto& surface_key{SurfaceKey::Create(params)};
-    const auto& search{surface_cache.find(surface_key)};
+    const auto& search{surface_cache.find(params.addr)};

    if (search == surface_cache.end()) {
        // Unregistered already
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.h
@@ -10,7 +10,6 @@
 #include <vector>
 #include <boost/icl/interval_map.hpp>
 #include "common/common_types.h"
-#include "common/hash.h"
 #include "common/math_util.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
@@ -137,6 +136,7 @@ struct SurfaceParams {
        ASSERT(static_cast<size_t>(format) < bpp_table.size());
        return bpp_table[static_cast<size_t>(format)];
    }
+
    u32 GetFormatBpp() const {
        return GetFormatBpp(pixel_format);
    }
@@ -365,9 +365,21 @@ struct SurfaceParams {
        const Tegra::Engines::Maxwell3D::Regs::RenderTargetConfig& config);

    /// Creates SurfaceParams for a depth buffer configuration
-    static SurfaceParams CreateForDepthBuffer(
-        const Tegra::Engines::Maxwell3D::Regs::RenderTargetConfig& config,
-        Tegra::GPUVAddr zeta_address, Tegra::DepthFormat format);
+    static SurfaceParams CreateForDepthBuffer(u32 zeta_width, u32 zeta_height,
+                                              Tegra::GPUVAddr zeta_address,
+                                              Tegra::DepthFormat format);
+
+    bool operator==(const SurfaceParams& other) const {
+        return std::tie(addr, is_tiled, block_height, pixel_format, component_type, type, width,
+                        height, unaligned_height, size_in_bytes) ==
+               std::tie(other.addr, other.is_tiled, other.block_height, other.pixel_format,
+                        other.component_type, other.type, other.width, other.height,
+                        other.unaligned_height, other.size_in_bytes);
+    }
+
+    bool operator!=(const SurfaceParams& other) const {
+        return !operator==(other);
+    }

    Tegra::GPUVAddr addr;
    bool is_tiled;
@@ -381,24 +393,6 @@ struct SurfaceParams {
    size_t size_in_bytes;
 };

-/// Hashable variation of SurfaceParams, used for a key in the surface cache
-struct SurfaceKey : Common::HashableStruct<SurfaceParams> {
-    static SurfaceKey Create(const SurfaceParams& params) {
-        SurfaceKey res;
-        res.state = params;
-        return res;
-    }
-};
-
-namespace std {
-template <>
-struct hash<SurfaceKey> {
-    size_t operator()(const SurfaceKey& k) const {
-        return k.Hash();
-    }
-};
-} // namespace std
-
 class CachedSurface final {
 public:
    CachedSurface(const SurfaceParams& params);
@@ -444,8 +438,8 @@ public:
    SurfaceSurfaceRect_Tuple GetFramebufferSurfaces(bool using_color_fb, bool using_depth_fb,
                                                    const MathUtil::Rectangle<s32>& viewport);

-    /// Marks the specified surface as "dirty", in that it is out of sync with Switch memory
-    void MarkSurfaceAsDirty(const Surface& surface);
+    /// Flushes the surface to Switch memory
+    void FlushSurface(const Surface& surface);

    /// Tries to find a framebuffer GPU address based on the provided CPU address
    Surface TryFindFramebufferSurface(VAddr cpu_addr) const;
@@ -460,6 +454,9 @@ private:
    void LoadSurface(const Surface& surface);
    Surface GetSurface(const SurfaceParams& params);

+    /// Recreates a surface with new parameters
+    Surface RecreateSurface(const Surface& surface, const SurfaceParams& new_params);
+
    /// Register surface into the cache
    void RegisterSurface(const Surface& surface);

@@ -469,7 +466,7 @@ private:
    /// Increase/decrease the number of surface in pages touching the specified region
    void UpdatePagesCachedCount(Tegra::GPUVAddr addr, u64 size, int delta);

-    std::unordered_map<SurfaceKey, Surface> surface_cache;
+    std::unordered_map<Tegra::GPUVAddr, Surface> surface_cache;
    PageMap cached_pages;

    OGLFramebuffer read_framebuffer;
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -12,8 +12,7 @@
 #include "video_core/renderer_opengl/gl_rasterizer.h"
 #include "video_core/renderer_opengl/gl_shader_decompiler.h"

-namespace GLShader {
-namespace Decompiler {
+namespace GLShader::Decompiler {

 using Tegra::Shader::Attribute;
 using Tegra::Shader::Instruction;
@@ -1140,6 +1139,15 @@ private:
                                          "((" + op_a + " << " + shift + ") + " + op_b + ')', 1, 1);
                break;
            }
+            case OpCode::Id::SEL_C:
+            case OpCode::Id::SEL_R:
+            case OpCode::Id::SEL_IMM: {
+                std::string condition =
+                    GetPredicateCondition(instr.sel.pred, instr.sel.neg_pred != 0);
+                regs.SetRegisterToInteger(instr.gpr0, true, 0,
+                                          '(' + condition + ") ? " + op_a + " : " + op_b, 1, 1);
+                break;
+            }
            case OpCode::Id::LOP_C:
            case OpCode::Id::LOP_R:
            case OpCode::Id::LOP_IMM: {
@@ -1845,5 +1853,4 @@ boost::optional<ProgramResult> DecompileProgram(const ProgramCode& program_code,
    return boost::none;
 }

-} // namespace Decompiler
-} // namespace GLShader
+} // namespace GLShader::Decompiler
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.h
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h
@@ -12,8 +12,7 @@
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/renderer_opengl/gl_shader_gen.h"

-namespace GLShader {
-namespace Decompiler {
+namespace GLShader::Decompiler {

 using Tegra::Engines::Maxwell3D;

@@ -23,5 +22,4 @@ boost::optional<ProgramResult> DecompileProgram(const ProgramCode& program_code,
                                                Maxwell3D::Regs::ShaderStage stage,
                                                const std::string& suffix);

-} // namespace Decompiler
-} // namespace GLShader
+} // namespace GLShader::Decompiler
--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@@ -10,8 +10,7 @@
 #include "video_core/textures/decoders.h"
 #include "video_core/textures/texture.h"

-namespace Tegra {
-namespace Texture {
+namespace Tegra::Texture {

 /**
 * Calculates the offset of an (x, y) position within a swizzled texture.
@@ -186,5 +185,4 @@ std::vector<u8> DecodeTexture(const std::vector<u8>& texture_data, TextureFormat
    return rgba_data;
 }

-} // namespace Texture
-} // namespace Tegra
+} // namespace Tegra::Texture
--- a/src/video_core/textures/decoders.h
+++ b/src/video_core/textures/decoders.h
@@ -8,8 +8,7 @@
 #include "common/common_types.h"
 #include "video_core/textures/texture.h"

-namespace Tegra {
-namespace Texture {
+namespace Tegra::Texture {

 /**
 * Unswizzles a swizzled texture without changing its format.
@@ -33,5 +32,4 @@ void CopySwizzledData(u32 width, u32 height, u32 bytes_per_pixel, u32 out_bytes_
 std::vector<u8> DecodeTexture(const std::vector<u8>& texture_data, TextureFormat format, u32 width,
                              u32 height);

-} // namespace Texture
-} // namespace Tegra
+} // namespace Tegra::Texture
--- a/src/video_core/textures/texture.h
+++ b/src/video_core/textures/texture.h
@@ -10,8 +10,7 @@
 #include "common/common_types.h"
 #include "video_core/memory_manager.h"

-namespace Tegra {
-namespace Texture {
+namespace Tegra::Texture {

 enum class TextureFormat : u32 {
    R32_G32_B32_A32 = 0x01,
@@ -260,5 +259,4 @@ struct FullTextureInfo {
 /// Returns the number of bytes per pixel of the input texture format.
 u32 BytesPerPixel(TextureFormat format);

-} // namespace Texture
-} // namespace Tegra
+} // namespace Tegra::Texture
Author	SHA1	Message	Date
bunnei	c43eaa94f3	gl_shader_decompiler: Implement SEL instruction.	2018-07-22 00:37:12 -04:00
bunnei	4cd5df95d6	Merge pull request #761 from bunnei/improve-raster-cache Improvements to rasterizer cache	2018-07-21 20:28:53 -07:00
bunnei	63fbf9a7d3	gl_rasterizer_cache: Blit surfaces on recreation instead of flush and load.	2018-07-21 21:51:06 -04:00
bunnei	4301f0b539	gl_rasterizer_cache: Use GPUVAddr as cache key, not parameter set.	2018-07-21 21:51:06 -04:00
bunnei	cd47391c2d	gl_rasterizer_cache: Use zeta_width and zeta_height registers for depth buffer.	2018-07-21 21:51:06 -04:00
bunnei	d8c60029d6	gl_rasterizer: Use zeta_enable register to enable depth buffer.	2018-07-21 21:51:06 -04:00
bunnei	5287991a36	maxwell_3d: Add depth buffer enable, width, and height registers.	2018-07-21 21:51:05 -04:00
bunnei	53a219f163	Merge pull request #759 from lioncash/redundant file_util: Remove redundant duplicate return in GetPathWithoutTop()	2018-07-21 18:50:38 -07:00
bunnei	3ac736c003	Merge pull request #748 from lioncash/namespace video_core: Use nested namespaces where applicable	2018-07-21 18:50:14 -07:00
bunnei	f5e87f4ce1	Merge pull request #758 from lioncash/sync common: Remove synchronized_wrapper.h	2018-07-21 18:30:31 -07:00
bunnei	9533875eeb	Merge pull request #760 from lioncash/path file_util: Use an enum class for GetUserPath()	2018-07-21 18:30:04 -07:00
bunnei	d95a1a3742	Merge pull request #762 from Subv/ioctl2 GPU: Implement the NVGPU_IOCTL_CHANNEL_KICKOFF_PB ioctl2 command.	2018-07-21 18:28:55 -07:00
Subv	5c49e56d41	GPU: Implement the NVGPU_IOCTL_CHANNEL_KICKOFF_PB ioctl2 command. This behaves quite similarly to the SubmitGPFIFO command. Referenced from Ryujinx. Many thanks to @gdkchan for investigating this!	2018-07-21 15:50:02 -05:00
Lioncash	34d6a1349c	file_util: Remove explicit type from std::min() in GetPathWithoutTop() Given both operands are the same type, there won't be an issue with overload selection that requires making this explicit.	2018-07-21 15:19:32 -04:00
Lioncash	41660c8923	file_util: Remove redundant duplicate return in GetPathWithoutTop()	2018-07-21 15:18:23 -04:00
Lioncash	973fdce79b	common: Remove synchronized_wrapper.h This is entirely unused in the codebase.	2018-07-21 14:51:44 -04:00
Lioncash	bb960c8cb4	video_core: Use nested namespaces where applicable Compresses a few namespace specifiers to be more compact.	2018-07-20 18:23:54 -04:00