gpu_thread: Handle cache management before DMA commands.

configure_graphics: Disallow changing use_asynchronous_gpu_emulation while running.
gpu: Move flush and invalidate to GPU thread.
2019-01-12 02:41:22 -05:00 · 2019-01-12 01:36:47 -05:00 · 2019-01-12 01:36:47 -05:00 · 2019-01-12 01:36:46 -05:00 · 2019-01-12 01:36:46 -05:00 · 2019-01-12 01:36:46 -05:00
32 changed files with 380 additions and 88 deletions
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -95,6 +95,8 @@ add_library(core STATIC
    frontend/framebuffer_layout.cpp
    frontend/framebuffer_layout.h
    frontend/input.h
+    frontend/scope_acquire_window_context.cpp
+    frontend/scope_acquire_window_context.h
    gdbstub/gdbstub.cpp
    gdbstub/gdbstub.h
    hle/ipc.h
--- a/src/core/core.cpp
+++ b/src/core/core.cpp
@@ -128,10 +128,12 @@ struct System::Impl {
            return ResultStatus::ErrorVideoCore;
        }

-        gpu_core = std::make_unique<Tegra::GPU>(renderer->Rasterizer());
+        is_powered_on = true;
+
+        gpu_core = std::make_unique<Tegra::GPU>(*renderer);

        cpu_core_manager.Initialize(system);
-        is_powered_on = true;
+
        LOG_DEBUG(Core, "Initialized OK");

        // Reset counters and set time origin to current frame
--- a/src/core/frontend/scope_acquire_window_context.cpp
+++ b/src/core/frontend/scope_acquire_window_context.cpp
@@ -0,0 +1,18 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "core/frontend/emu_window.h"
+#include "core/frontend/scope_acquire_window_context.h"
+
+namespace Core::Frontend {
+
+ScopeAcquireWindowContext::ScopeAcquireWindowContext(Core::Frontend::EmuWindow& emu_window_)
+    : emu_window{emu_window_} {
+    emu_window.MakeCurrent(); // Released again by DoneCurrent() in the destructor
+}
+ScopeAcquireWindowContext::~ScopeAcquireWindowContext() {
+    emu_window.DoneCurrent(); // Undoes the MakeCurrent() issued by the constructor
+}
+
+} // namespace Core::Frontend
--- a/src/core/frontend/scope_acquire_window_context.h
+++ b/src/core/frontend/scope_acquire_window_context.h
@@ -0,0 +1,23 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "common/common_types.h"
+
+namespace Core::Frontend {
+
+class EmuWindow;
+
+/// Helper that calls MakeCurrent() on a window at construction and DoneCurrent() at destruction
+class ScopeAcquireWindowContext : NonCopyable {
+public:
+    explicit ScopeAcquireWindowContext(Core::Frontend::EmuWindow& window);
+    ~ScopeAcquireWindowContext();
+
+private:
+    Core::Frontend::EmuWindow& emu_window; ///< Held by reference, so it must outlive this helper
+};
+
+} // namespace Core::Frontend
--- a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp
@@ -36,7 +36,7 @@ void nvdisp_disp0::flip(u32 buffer_handle, u32 offset, u32 format, u32 width, u3

    auto& instance = Core::System::GetInstance();
    instance.GetPerfStats().EndGameFrame();
-    instance.Renderer().SwapBuffers(framebuffer);
+    instance.GPU().SwapBuffers(framebuffer);
 }

 } // namespace Service::Nvidia::Devices
--- a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp
@@ -178,7 +178,8 @@ u32 nvhost_as_gpu::UnmapBuffer(const std::vector<u8>& input, std::vector<u8>& ou
    auto& gpu = system_instance.GPU();
    auto cpu_addr = gpu.MemoryManager().GpuToCpuAddress(params.offset);
    ASSERT(cpu_addr);
-    system_instance.Renderer().Rasterizer().FlushAndInvalidateRegion(*cpu_addr, itr->second.size);
+    gpu.FlushRegion(*cpu_addr, itr->second.size);
+    gpu.InvalidateRegion(*cpu_addr, itr->second.size);

    params.offset = gpu.MemoryManager().UnmapBuffer(params.offset, itr->second.size);

--- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
@@ -136,16 +136,6 @@ u32 nvhost_gpu::AllocateObjectContext(const std::vector<u8>& input, std::vector<
    return 0;
 }

-static void PushGPUEntries(Tegra::CommandList&& entries) {
-    if (entries.empty()) {
-        return;
-    }
-
-    auto& dma_pusher{Core::System::GetInstance().GPU().DmaPusher()};
-    dma_pusher.Push(std::move(entries));
-    dma_pusher.DispatchCalls();
-}
-
 u32 nvhost_gpu::SubmitGPFIFO(const std::vector<u8>& input, std::vector<u8>& output) {
    if (input.size() < sizeof(IoctlSubmitGpfifo)) {
        UNIMPLEMENTED();
@@ -163,7 +153,7 @@ u32 nvhost_gpu::SubmitGPFIFO(const std::vector<u8>& input, std::vector<u8>& outp
    std::memcpy(entries.data(), &input[sizeof(IoctlSubmitGpfifo)],
                params.num_entries * sizeof(Tegra::CommandListHeader));

-    PushGPUEntries(std::move(entries));
+    Core::System::GetInstance().GPU().PushGPUEntries(std::move(entries));

    params.fence_out.id = 0;
    params.fence_out.value = 0;
@@ -184,7 +174,7 @@ u32 nvhost_gpu::KickoffPB(const std::vector<u8>& input, std::vector<u8>& output)
    Memory::ReadBlock(params.address, entries.data(),
                      params.num_entries * sizeof(Tegra::CommandListHeader));

-    PushGPUEntries(std::move(entries));
+    Core::System::GetInstance().GPU().PushGPUEntries(std::move(entries));

    params.fence_out.id = 0;
    params.fence_out.value = 0;
--- a/src/core/hle/service/nvflinger/nvflinger.cpp
+++ b/src/core/hle/service/nvflinger/nvflinger.cpp
@@ -141,7 +141,7 @@ void NVFlinger::Compose() {

            // There was no queued buffer to draw, render previous frame
            system_instance.GetPerfStats().EndGameFrame();
-            system_instance.Renderer().SwapBuffers({});
+            system_instance.GPU().SwapBuffers({});
            continue;
        }

--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@@ -166,9 +166,6 @@ T Read(const VAddr vaddr) {
        return value;
    }

-    // The memory access might do an MMIO or cached access, so we have to lock the HLE kernel state
-    std::lock_guard<std::recursive_mutex> lock(HLE::g_hle_lock);
-
    PageType type = current_page_table->attributes[vaddr >> PAGE_BITS];
    switch (type) {
    case PageType::Unmapped:
@@ -199,9 +196,6 @@ void Write(const VAddr vaddr, const T data) {
        return;
    }

-    // The memory access might do an MMIO or cached access, so we have to lock the HLE kernel state
-    std::lock_guard<std::recursive_mutex> lock(HLE::g_hle_lock);
-
    PageType type = current_page_table->attributes[vaddr >> PAGE_BITS];
    switch (type) {
    case PageType::Unmapped:
@@ -357,16 +351,17 @@ void RasterizerFlushVirtualRegion(VAddr start, u64 size, FlushMode mode) {
        const VAddr overlap_end = std::min(end, region_end);
        const VAddr overlap_size = overlap_end - overlap_start;

-        auto& rasterizer = system_instance.Renderer().Rasterizer();
+        auto& gpu = system_instance.GPU();
        switch (mode) {
        case FlushMode::Flush:
-            rasterizer.FlushRegion(overlap_start, overlap_size);
+            gpu.FlushRegion(overlap_start, overlap_size);
            break;
        case FlushMode::Invalidate:
-            rasterizer.InvalidateRegion(overlap_start, overlap_size);
+            gpu.InvalidateRegion(overlap_start, overlap_size);
            break;
        case FlushMode::FlushAndInvalidate:
-            rasterizer.FlushAndInvalidateRegion(overlap_start, overlap_size);
+            gpu.FlushRegion(overlap_start, overlap_size);
+            gpu.InvalidateRegion(overlap_start, overlap_size);
            break;
        }
    };
--- a/src/core/settings.h
+++ b/src/core/settings.h
@@ -392,6 +392,7 @@ struct Values {
    bool use_frame_limit;
    u16 frame_limit;
    bool use_accurate_gpu_emulation;
+    bool use_asynchronous_gpu_emulation;

    float bg_red;
    float bg_green;
--- a/src/core/telemetry_session.cpp
+++ b/src/core/telemetry_session.cpp
@@ -160,6 +160,8 @@ TelemetrySession::TelemetrySession() {
    AddField(Telemetry::FieldType::UserConfig, "Renderer_FrameLimit", Settings::values.frame_limit);
    AddField(Telemetry::FieldType::UserConfig, "Renderer_UseAccurateGpuEmulation",
             Settings::values.use_accurate_gpu_emulation);
+    AddField(Telemetry::FieldType::UserConfig, "Renderer_UseAsynchronousGpuEmulation",
+             Settings::values.use_asynchronous_gpu_emulation);
    AddField(Telemetry::FieldType::UserConfig, "System_UseDockedMode",
             Settings::values.use_docked_mode);
 }
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -17,6 +17,8 @@ add_library(video_core STATIC
    engines/shader_header.h
    gpu.cpp
    gpu.h
+    gpu_thread.cpp
+    gpu_thread.h
    macro_interpreter.cpp
    macro_interpreter.h
    memory_manager.cpp
--- a/src/video_core/dma_pusher.cpp
+++ b/src/video_core/dma_pusher.cpp
@@ -17,6 +17,13 @@ DmaPusher::~DmaPusher() = default;

 MICROPROFILE_DEFINE(DispatchCalls, "GPU", "Execute command buffer", MP_RGB(128, 128, 192));

+void DmaPusher::QueuePendingCalls() {
+    for (auto& entry : dma_writebuffer) { // Command lists staged by Push()
+        dma_readbuffer.push(std::move(entry)); // Step() reads from dma_readbuffer
+    }
+    dma_writebuffer.clear(); // Drop the moved-from lists so they are not queued twice
+}
+
 void DmaPusher::DispatchCalls() {
    MICROPROFILE_SCOPE(DispatchCalls);

@@ -89,9 +96,9 @@ bool DmaPusher::Step() {
                break;
            }
        }
-    } else if (ib_enable && !dma_pushbuffer.empty()) {
+    } else if (ib_enable && !dma_readbuffer.empty()) {
        // Current pushbuffer empty, but we have more IB entries to read
-        const CommandList& command_list{dma_pushbuffer.front()};
+        const CommandList& command_list{dma_readbuffer.front()};
        const CommandListHeader& command_list_header{command_list[dma_pushbuffer_subindex++]};
        dma_get = command_list_header.addr;
        dma_put = dma_get + command_list_header.size * sizeof(u32);
@@ -99,7 +106,7 @@ bool DmaPusher::Step() {

        if (dma_pushbuffer_subindex >= command_list.size()) {
            // We've gone through the current list, remove it from the queue
-            dma_pushbuffer.pop();
+            dma_readbuffer.pop();
            dma_pushbuffer_subindex = 0;
        }
    } else {
--- a/src/video_core/dma_pusher.h
+++ b/src/video_core/dma_pusher.h
@@ -61,9 +61,10 @@ public:
    ~DmaPusher();

    void Push(CommandList&& entries) {
-        dma_pushbuffer.push(std::move(entries));
+        dma_writebuffer.push_back(std::move(entries));
    }

+    void QueuePendingCalls();
    void DispatchCalls();

 private:
@@ -75,8 +76,9 @@ private:

    GPU& gpu;

-    std::queue<CommandList> dma_pushbuffer; ///< Queue of command lists to be processed
-    std::size_t dma_pushbuffer_subindex{};  ///< Index within a command list within the pushbuffer
+    std::vector<CommandList> dma_writebuffer;
+    std::queue<CommandList> dma_readbuffer;
+    std::size_t dma_pushbuffer_subindex{}; ///< Index within a command list within the pushbuffer

    struct DmaState {
        u32 method;            ///< Current method
--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -46,7 +46,7 @@ void KeplerMemory::ProcessData(u32 data) {
    // We have to invalidate the destination region to evict any outdated surfaces from the cache.
    // We do this before actually writing the new data because the destination address might contain
    // a dirty surface that will have to be written back to memory.
-    rasterizer.InvalidateRegion(dest_address, sizeof(u32));
+    Core::System::GetInstance().GPU().InvalidateRegion(dest_address, sizeof(u32));

    Memory::Write32(dest_address, data);
    Core::System::GetInstance().GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -87,12 +87,12 @@ void MaxwellDMA::HandleCopy() {
    const auto FlushAndInvalidate = [&](u32 src_size, u64 dst_size) {
        // TODO(Subv): For now, manually flush the regions until we implement GPU-accelerated
        // copying.
-        rasterizer.FlushRegion(source_cpu, src_size);
+        Core::System::GetInstance().GPU().FlushRegion(source_cpu, src_size);

        // We have to invalidate the destination region to evict any outdated surfaces from the
        // cache. We do this before actually writing the new data because the destination address
        // might contain a dirty surface that will have to be written back to memory.
-        rasterizer.InvalidateRegion(dest_cpu, dst_size);
+        Core::System::GetInstance().GPU().InvalidateRegion(dest_cpu, dst_size);
    };

    if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) {
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -3,13 +3,15 @@
 // Refer to the license.txt file included.

 #include "common/assert.h"
+#include "core/settings.h"
 #include "video_core/engines/fermi_2d.h"
 #include "video_core/engines/kepler_memory.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/maxwell_compute.h"
 #include "video_core/engines/maxwell_dma.h"
 #include "video_core/gpu.h"
-#include "video_core/rasterizer_interface.h"
+#include "video_core/gpu_thread.h"
+#include "video_core/renderer_base.h"

 namespace Tegra {

@@ -24,7 +26,8 @@ u32 FramebufferConfig::BytesPerPixel(PixelFormat format) {
    UNREACHABLE();
 }

-GPU::GPU(VideoCore::RasterizerInterface& rasterizer) {
+GPU::GPU(VideoCore::RendererBase& renderer) : renderer{renderer} {
+    auto& rasterizer{renderer.Rasterizer()};
    memory_manager = std::make_unique<Tegra::MemoryManager>();
    dma_pusher = std::make_unique<Tegra::DmaPusher>(*this);
    maxwell_3d = std::make_unique<Engines::Maxwell3D>(rasterizer, *memory_manager);
@@ -32,6 +35,10 @@ GPU::GPU(VideoCore::RasterizerInterface& rasterizer) {
    maxwell_compute = std::make_unique<Engines::MaxwellCompute>();
    maxwell_dma = std::make_unique<Engines::MaxwellDMA>(rasterizer, *memory_manager);
    kepler_memory = std::make_unique<Engines::KeplerMemory>(rasterizer, *memory_manager);
+
+    if (Settings::values.use_asynchronous_gpu_emulation) {
+        gpu_thread = std::make_unique<VideoCore::GPUThread>(renderer, *dma_pusher);
+    }
 }

 GPU::~GPU() = default;
@@ -60,6 +67,41 @@ const DmaPusher& GPU::DmaPusher() const {
    return *dma_pusher;
 }

+void GPU::PushGPUEntries(Tegra::CommandList&& entries) {
+    if (gpu_thread) { // Exists only if async emulation was enabled when the GPU was constructed
+        gpu_thread->PushGPUEntries(std::move(entries));
+    } else if (!entries.empty()) { // DmaPusher::Step() indexes a list before checking its size
+        dma_pusher->Push(std::move(entries));
+        dma_pusher->QueuePendingCalls();
+        dma_pusher->DispatchCalls();
+    }
+}
+
+void GPU::SwapBuffers(
+    std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) {
+    if (gpu_thread) { // Swap on the GPU thread when it exists, else on the calling thread
+        gpu_thread->SwapBuffers(std::move(framebuffer));
+    } else {
+        renderer.SwapBuffers(std::move(framebuffer));
+    }
+}
+
+void GPU::FlushRegion(VAddr addr, u64 size) {
+    if (gpu_thread) { // Queued for the GPU thread; otherwise flushed immediately
+        gpu_thread->FlushRegion(addr, size);
+    } else {
+        renderer.Rasterizer().FlushRegion(addr, size);
+    }
+}
+
+void GPU::InvalidateRegion(VAddr addr, u64 size) {
+    if (gpu_thread) { // Queued for the GPU thread; otherwise invalidated immediately
+        gpu_thread->InvalidateRegion(addr, size);
+    } else {
+        renderer.Rasterizer().InvalidateRegion(addr, size);
+    }
+}
+
 u32 RenderTargetBytesPerPixel(RenderTargetFormat format) {
    ASSERT(format != RenderTargetFormat::NONE);

--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -13,8 +13,9 @@
 #include "video_core/memory_manager.h"

 namespace VideoCore {
-class RasterizerInterface;
-}
+class GPUThread;
+class RendererBase;
+} // namespace VideoCore

 namespace Tegra {

@@ -117,7 +118,7 @@ enum class EngineID {

 class GPU final {
 public:
-    explicit GPU(VideoCore::RasterizerInterface& rasterizer);
+    explicit GPU(VideoCore::RendererBase& renderer);
    ~GPU();

    struct MethodCall {
@@ -156,9 +157,23 @@ public:
    /// Returns a const reference to the GPU DMA pusher.
    const Tegra::DmaPusher& DmaPusher() const;

+    /// Push GPU command entries to be processed
+    void PushGPUEntries(Tegra::CommandList&& entries);
+
+    /// Swap buffers (render frame)
+    void SwapBuffers(
+        std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer);
+
+    /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
+    void FlushRegion(VAddr addr, u64 size);
+
+    /// Notify rasterizer that any caches of the specified region should be invalidated
+    void InvalidateRegion(VAddr addr, u64 size);
+
 private:
    std::unique_ptr<Tegra::DmaPusher> dma_pusher;
    std::unique_ptr<Tegra::MemoryManager> memory_manager;
+    std::unique_ptr<VideoCore::GPUThread> gpu_thread;

    /// Mapping of command subchannels to their bound engine ids.
    std::array<EngineID, 8> bound_engines = {};
@@ -173,6 +188,8 @@ private:
    std::unique_ptr<Engines::MaxwellDMA> maxwell_dma;
    /// Inline memory engine
    std::unique_ptr<Engines::KeplerMemory> kepler_memory;
+
+    VideoCore::RendererBase& renderer;
 };

 } // namespace Tegra
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -0,0 +1,135 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "core/frontend/scope_acquire_window_context.h"
+#include "video_core/gpu.h"
+#include "video_core/gpu_thread.h"
+#include "video_core/renderer_base.h"
+
+namespace {
+static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher,
+                      VideoCore::GPUThreadState& state) {
+    // GPU thread entry point: holds the window context and services requests until shutdown
+    Core::Frontend::ScopeAcquireWindowContext acquire_context{renderer.GetRenderWindow()};
+
+    while (true) { // Shutdown is detected under signal_mutex below
+        bool is_dma_pending{};
+        bool is_swapbuffers_pending{};
+
+        {
+            // Wait for CPU thread to send GPU commands
+            std::unique_lock<std::mutex> lock{state.signal_mutex};
+            state.signal_condition.wait(lock, [&] {
+                return state.is_dma_pending || state.is_swapbuffers_pending || !state.is_running;
+            });
+
+            if (!state.is_running) {
+                return;
+            }
+
+            is_dma_pending = state.is_dma_pending;
+            is_swapbuffers_pending = state.is_swapbuffers_pending;
+            if (is_dma_pending) {
+                dma_pusher.QueuePendingCalls();
+                state.is_dma_pending = false;
+            }
+        }
+
+        {
+            // Cache management
+            std::lock_guard<std::recursive_mutex> lock{state.cache_mutex};
+
+            for (const auto& region : state.flush_regions) {
+                renderer.Rasterizer().FlushRegion(region.addr, region.size);
+            }
+
+            for (const auto& region : state.invalidate_regions) {
+                renderer.Rasterizer().InvalidateRegion(region.addr, region.size);
+            }
+
+            state.flush_regions.clear();
+            state.invalidate_regions.clear();
+        }
+
+        if (is_dma_pending) {
+            // Process pending DMA pushbuffer commands
+            std::lock_guard<std::mutex> lock{state.running_mutex};
+            dma_pusher.DispatchCalls();
+        }
+
+        if (is_swapbuffers_pending) {
+            renderer.SwapBuffers(state.pending_swapbuffers_config);
+            // Clear the flag under signal_mutex so the waiting CPU thread cannot miss the wakeup
+            std::lock_guard<std::mutex> lock{state.signal_mutex};
+            state.is_swapbuffers_pending = false;
+            state.signal_condition.notify_one();
+        }
+    }
+}
+} // Anonymous namespace
+
+namespace VideoCore {
+
+GPUThread::GPUThread(RendererBase& renderer, Tegra::DmaPusher& dma_pusher)
+    : dma_pusher{dma_pusher} {
+    thread = std::make_unique<std::thread>(RunThread, std::ref(renderer), std::ref(dma_pusher),
+                                           std::ref(state)); // All three are passed by reference
+}
+
+GPUThread::~GPUThread() {
+    {
+        // Request shutdown under signal_mutex, the lock RunThread holds while testing is_running
+        std::lock_guard<std::mutex> lock{state.signal_mutex};
+        state.is_running = false;
+    }
+
+    state.signal_condition.notify_one(); // Wake RunThread if it is blocked waiting for work
+    thread->join();                      // Block until RunThread has returned
+}
+
+void GPUThread::PushGPUEntries(Tegra::CommandList&& entries) {
+    if (entries.empty()) {
+        return; // Nothing to submit, so the GPU thread is not woken
+    }
+
+    {
+        // Stage the list and raise the flag under signal_mutex; RunThread drains under that lock
+        std::lock_guard<std::mutex> lock{state.signal_mutex};
+        dma_pusher.Push(std::move(entries));
+        state.is_dma_pending = true;
+    }
+
+    state.signal_condition.notify_one(); // Returns without waiting for the commands to execute
+}
+
+void GPUThread::SwapBuffers(
+    std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) {
+
+    {
+        // Notify GPU thread that we should SwapBuffers
+        std::lock_guard<std::mutex> lock{state.signal_mutex};
+        state.pending_swapbuffers_config = framebuffer; // Target is optional<FramebufferConfig>
+        state.is_swapbuffers_pending = true;
+    }
+
+    state.signal_condition.notify_one();
+
+    {
+        // Block the caller until RunThread has called SwapBuffers and cleared the pending flag
+        std::unique_lock<std::mutex> lock{state.signal_mutex};
+        state.signal_condition.wait(lock, [this] { return !state.is_swapbuffers_pending; });
+    }
+}
+
+void GPUThread::FlushRegion(VAddr addr, u64 size) {
+    std::lock_guard<std::recursive_mutex> lock{state.cache_mutex};
+    state.flush_regions.push_back({addr, size}); // Only queued here; RunThread does the flush
+}
+
+void GPUThread::InvalidateRegion(VAddr addr, u64 size) {
+    std::lock_guard<std::recursive_mutex> lock{state.cache_mutex};
+    state.invalidate_regions.push_back({addr, size}); // Only queued; RunThread invalidates
+}
+
+} // namespace VideoCore
--- a/src/video_core/gpu_thread.h
+++ b/src/video_core/gpu_thread.h
@@ -0,0 +1,67 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <condition_variable>
+#include <memory>
+#include <mutex>
+#include <optional>
+#include <thread>
+
+#include "video_core/dma_pusher.h"
+
+namespace Tegra {
+struct FramebufferConfig;
+}
+
+namespace VideoCore {
+
+class RendererBase;
+
+struct GPUThreadState final {
+    bool is_running{true};         ///< Cleared by ~GPUThread to make RunThread return
+    bool is_dma_pending{};         ///< Set by GPUThread::PushGPUEntries, cleared by RunThread
+    bool is_swapbuffers_pending{}; ///< Set by GPUThread::SwapBuffers, cleared by RunThread
+    std::optional<Tegra::FramebufferConfig> pending_swapbuffers_config; ///< Config for next swap
+    std::condition_variable signal_condition;  ///< Used with signal_mutex by both threads
+    std::condition_variable running_condition; ///< NOTE(review): no user visible in this change
+    std::mutex signal_mutex;  ///< Locked around signal_condition waits and CPU-side flag updates
+    std::mutex running_mutex; ///< Held by RunThread around DmaPusher::DispatchCalls()
+    std::recursive_mutex cache_mutex; ///< Guards flush_regions and invalidate_regions
+
+    struct MemoryRegion final {
+        const VAddr addr;
+        const u64 size;
+    };
+
+    std::vector<MemoryRegion> flush_regions;      ///< Queued by GPUThread::FlushRegion
+    std::vector<MemoryRegion> invalidate_regions; ///< Queued by GPUThread::InvalidateRegion
+};
+
+class GPUThread final {
+public:
+    explicit GPUThread(RendererBase& renderer, Tegra::DmaPusher& dma_pusher);
+    ~GPUThread();
+
+    /// Stage GPU command entries and wake the GPU thread; does not wait for them to be processed
+    void PushGPUEntries(Tegra::CommandList&& entries);
+
+    /// Swap buffers (render frame) on the GPU thread; blocks until the swap has completed
+    void SwapBuffers(
+        std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer);
+
+    /// Queue a flush of any caches of the specified region; performed later by the GPU thread
+    void FlushRegion(VAddr addr, u64 size);
+
+    /// Queue an invalidation of any caches of the specified region, done later by the GPU thread
+    void InvalidateRegion(VAddr addr, u64 size);
+
+private:
+    GPUThreadState state;                ///< Passed by reference to the thread function
+    std::unique_ptr<std::thread> thread; ///< Runs RunThread; joined in the destructor
+    Tegra::DmaPusher& dma_pusher;        ///< Not owned
+};
+
+} // namespace VideoCore
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -30,10 +30,6 @@ public:
    /// Notify rasterizer that any caches of the specified region should be invalidated
    virtual void InvalidateRegion(VAddr addr, u64 size) = 0;

-    /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
-    /// and invalidated
-    virtual void FlushAndInvalidateRegion(VAddr addr, u64 size) = 0;
-
    /// Attempt to use a faster method to perform a surface copy
    virtual bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
                                       const Tegra::Engines::Fermi2D::Regs::Surface& dst) {
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -632,8 +632,6 @@ void RasterizerOpenGL::Clear() {
        return;
    }

-    ScopeAcquireGLContext acquire_context{emu_window};
-
    ConfigureFramebuffers(clear_state, use_color, use_depth || use_stencil, false,
                          regs.clear_buffers.RT.Value());
    if (regs.clear_flags.scissor) {
@@ -667,8 +665,6 @@ void RasterizerOpenGL::DrawArrays() {
    auto& gpu = Core::System::GetInstance().GPU().Maxwell3D();
    const auto& regs = gpu.regs;

-    ScopeAcquireGLContext acquire_context{emu_window};
-
    ConfigureFramebuffers(state);
    SyncColorMask();
    SyncFragmentColorClampState();
@@ -767,11 +763,6 @@ void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) {
    buffer_cache.InvalidateRegion(addr, size);
 }

-void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size) {
-    FlushRegion(addr, size);
-    InvalidateRegion(addr, size);
-}
-
 bool RasterizerOpenGL::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
                                             const Tegra::Engines::Fermi2D::Regs::Surface& dst) {
    MICROPROFILE_SCOPE(OpenGL_Blits);
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -53,7 +53,6 @@ public:
    void FlushAll() override;
    void FlushRegion(VAddr addr, u64 size) override;
    void InvalidateRegion(VAddr addr, u64 size) override;
-    void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
    bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
                               const Tegra::Engines::Fermi2D::Regs::Surface& dst) override;
    bool AccelerateFill(const void* config) override;
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -14,6 +14,7 @@
 #include "core/core.h"
 #include "core/core_timing.h"
 #include "core/frontend/emu_window.h"
+#include "core/frontend/scope_acquire_window_context.h"
 #include "core/memory.h"
 #include "core/perf_stats.h"
 #include "core/settings.h"
@@ -97,18 +98,6 @@ static std::array<GLfloat, 3 * 2> MakeOrthographicMatrix(const float width, cons
    return matrix;
 }

-ScopeAcquireGLContext::ScopeAcquireGLContext(Core::Frontend::EmuWindow& emu_window_)
-    : emu_window{emu_window_} {
-    if (Settings::values.use_multi_core) {
-        emu_window.MakeCurrent();
-    }
-}
-ScopeAcquireGLContext::~ScopeAcquireGLContext() {
-    if (Settings::values.use_multi_core) {
-        emu_window.DoneCurrent();
-    }
-}
-
 RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& window)
    : VideoCore::RendererBase{window} {}

@@ -117,7 +106,6 @@ RendererOpenGL::~RendererOpenGL() = default;
 /// Swap buffers (render frame)
 void RendererOpenGL::SwapBuffers(
    std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) {
-    ScopeAcquireGLContext acquire_context{render_window};

    Core::System::GetInstance().GetPerfStats().EndSystemFrame();

@@ -508,7 +496,7 @@ static void APIENTRY DebugHandler(GLenum source, GLenum type, GLuint id, GLenum

 /// Initialize the renderer
 bool RendererOpenGL::Init() {
-    ScopeAcquireGLContext acquire_context{render_window};
+    Core::Frontend::ScopeAcquireWindowContext acquire_context{render_window};

    if (GLAD_GL_KHR_debug) {
        glEnable(GL_DEBUG_OUTPUT);
--- a/src/video_core/renderer_opengl/renderer_opengl.h
+++ b/src/video_core/renderer_opengl/renderer_opengl.h
@@ -39,16 +39,6 @@ struct ScreenInfo {
    TextureInfo texture;
 };

-/// Helper class to acquire/release OpenGL context within a given scope
-class ScopeAcquireGLContext : NonCopyable {
-public:
-    explicit ScopeAcquireGLContext(Core::Frontend::EmuWindow& window);
-    ~ScopeAcquireGLContext();
-
-private:
-    Core::Frontend::EmuWindow& emu_window;
-};
-
 class RendererOpenGL : public VideoCore::RendererBase {
 public:
    explicit RendererOpenGL(Core::Frontend::EmuWindow& window);
--- a/src/yuzu/bootmanager.cpp
+++ b/src/yuzu/bootmanager.cpp
@@ -21,7 +21,7 @@
 EmuThread::EmuThread(GRenderWindow* render_window) : render_window(render_window) {}

 void EmuThread::run() {
-    if (!Settings::values.use_multi_core) {
+    if (!Settings::values.use_asynchronous_gpu_emulation) {
        // Single core mode must acquire OpenGL context for entire emulation session
        render_window->MakeCurrent();
    }
--- a/src/yuzu/configuration/config.cpp
+++ b/src/yuzu/configuration/config.cpp
@@ -372,6 +372,8 @@ void Config::ReadValues() {
    Settings::values.frame_limit = qt_config->value("frame_limit", 100).toInt();
    Settings::values.use_accurate_gpu_emulation =
        qt_config->value("use_accurate_gpu_emulation", false).toBool();
+    Settings::values.use_asynchronous_gpu_emulation =
+        qt_config->value("use_asynchronous_gpu_emulation", true).toBool();

    Settings::values.bg_red = qt_config->value("bg_red", 0.0).toFloat();
    Settings::values.bg_green = qt_config->value("bg_green", 0.0).toFloat();
@@ -630,6 +632,8 @@ void Config::SaveValues() {
    qt_config->setValue("use_frame_limit", Settings::values.use_frame_limit);
    qt_config->setValue("frame_limit", Settings::values.frame_limit);
    qt_config->setValue("use_accurate_gpu_emulation", Settings::values.use_accurate_gpu_emulation);
+    qt_config->setValue("use_asynchronous_gpu_emulation",
+                        Settings::values.use_asynchronous_gpu_emulation);

    // Cast to double because Qt's written float values are not human-readable
    qt_config->setValue("bg_red", (double)Settings::values.bg_red);
--- a/src/yuzu/configuration/configure_graphics.cpp
+++ b/src/yuzu/configuration/configure_graphics.cpp
@@ -76,6 +76,8 @@ void ConfigureGraphics::setConfiguration() {
    ui->toggle_frame_limit->setChecked(Settings::values.use_frame_limit);
    ui->frame_limit->setValue(Settings::values.frame_limit);
    ui->use_accurate_gpu_emulation->setChecked(Settings::values.use_accurate_gpu_emulation);
+    ui->use_asynchronous_gpu_emulation->setEnabled(!Core::System::GetInstance().IsPoweredOn());
+    ui->use_asynchronous_gpu_emulation->setChecked(Settings::values.use_asynchronous_gpu_emulation);
    bg_color = QColor::fromRgbF(Settings::values.bg_red, Settings::values.bg_green,
                                Settings::values.bg_blue);
    ui->bg_button->setStyleSheet(
@@ -88,6 +90,8 @@ void ConfigureGraphics::applyConfiguration() {
    Settings::values.use_frame_limit = ui->toggle_frame_limit->isChecked();
    Settings::values.frame_limit = ui->frame_limit->value();
    Settings::values.use_accurate_gpu_emulation = ui->use_accurate_gpu_emulation->isChecked();
+    Settings::values.use_asynchronous_gpu_emulation =
+        ui->use_asynchronous_gpu_emulation->isChecked();
    Settings::values.bg_red = static_cast<float>(bg_color.redF());
    Settings::values.bg_green = static_cast<float>(bg_color.greenF());
    Settings::values.bg_blue = static_cast<float>(bg_color.blueF());
--- a/src/yuzu/configuration/configure_graphics.ui
+++ b/src/yuzu/configuration/configure_graphics.ui
@@ -55,6 +55,13 @@
           <string>Use accurate GPU emulation (slow)</string>
          </property>
         </widget>
+        </item>
+         <item>
+         <widget class="QCheckBox" name="use_asynchronous_gpu_emulation">
+          <property name="text">
+           <string>Use asynchronous GPU emulation</string>
+          </property>
+         </widget>
        </item>
        <item>
         <layout class="QHBoxLayout" name="horizontalLayout">
--- a/src/yuzu/main.cpp
+++ b/src/yuzu/main.cpp
@@ -14,6 +14,7 @@
 #include "configuration/configure_per_general.h"
 #include "core/file_sys/vfs.h"
 #include "core/file_sys/vfs_real.h"
+#include "core/frontend/scope_acquire_window_context.h"
 #include "core/hle/service/acc/profile_manager.h"
 #include "core/hle/service/am/applets/applets.h"
 #include "core/hle/service/hid/controllers/npad.h"
@@ -735,13 +736,15 @@ bool GMainWindow::LoadROM(const QString& filename) {
        ShutdownGame();

    render_window->InitRenderTarget();
-    render_window->MakeCurrent();

-    if (!gladLoadGL()) {
-        QMessageBox::critical(this, tr("Error while initializing OpenGL 4.3 Core!"),
-                              tr("Your GPU may not support OpenGL 4.3, or you do not "
-                                 "have the latest graphics driver."));
-        return false;
+    {
+        Core::Frontend::ScopeAcquireWindowContext acquire_context{*render_window};
+        if (!gladLoadGL()) {
+            QMessageBox::critical(this, tr("Error while initializing OpenGL 4.3 Core!"),
+                                  tr("Your GPU may not support OpenGL 4.3, or you do not "
+                                     "have the latest graphics driver."));
+            return false;
+        }
    }

    QStringList unsupported_gl_extensions = GetUnsupportedGLExtensions();
@@ -782,8 +785,6 @@ bool GMainWindow::LoadROM(const QString& filename) {
               "wiki</a>. This message will not be shown again."));
    }

-    render_window->DoneCurrent();
-
    if (result != Core::System::ResultStatus::Success) {
        switch (result) {
        case Core::System::ResultStatus::ErrorGetLoader:
--- a/src/yuzu_cmd/config.cpp
+++ b/src/yuzu_cmd/config.cpp
@@ -352,6 +352,8 @@ void Config::ReadValues() {
        static_cast<u16>(sdl2_config->GetInteger("Renderer", "frame_limit", 100));
    Settings::values.use_accurate_gpu_emulation =
        sdl2_config->GetBoolean("Renderer", "use_accurate_gpu_emulation", false);
+    Settings::values.use_asynchronous_gpu_emulation =
+        sdl2_config->GetBoolean("Renderer", "use_asynchronous_gpu_emulation", true);

    Settings::values.bg_red = (float)sdl2_config->GetReal("Renderer", "bg_red", 0.0);
    Settings::values.bg_green = (float)sdl2_config->GetReal("Renderer", "bg_green", 0.0);
--- a/src/yuzu_cmd/default_ini.h
+++ b/src/yuzu_cmd/default_ini.h
@@ -114,6 +114,10 @@ frame_limit =
 # 0 (default): Off (fast), 1 : On (slow)
 use_accurate_gpu_emulation =

+# Whether to use asynchronous GPU emulation
+# 0 : Off (slow), 1 (default): On (fast)
+use_asynchronous_gpu_emulation =
+
 # The clear color for the renderer. What shows up on the sides of the bottom screen.
 # Must be in range of 0.0-1.0. Defaults to 1.0 for all.
 bg_red =
Author	SHA1	Message	Date
bunnei	c7f9124824	gpu_thread: Handle cache management before DMA commands.	2019-01-12 02:41:22 -05:00
bunnei	19ce7abf07	configure_graphics: Disallow changing use_asynchronous_gpu_emulation while running.	2019-01-12 01:36:47 -05:00
bunnei	0bad8394e6	gpu: Move flush and invalidate to GPU thread.	2019-01-12 01:36:47 -05:00
bunnei	9799dcdb7f	gl_rasterizer: Flush and invalidate when GPU thread is idle.	2019-01-12 01:36:46 -05:00
bunnei	1690ea9902	gpu: Move command processing to another thread.	2019-01-12 01:36:46 -05:00
bunnei	208c599463	gpu: Refactor command and swap buffers interface for asynch.	2019-01-12 01:36:46 -05:00
bunnei	85b2c3b051	gpu: Refactor to take RendererBase instead of RasterizerInterface.	2019-01-12 01:36:45 -05:00
bunnei	7b2041a32e	frontend: Refactor ScopeAcquireWindowContext out of renderer_opengl.	2019-01-12 01:36:45 -05:00
bunnei	5daa646d62	settings: Add new graphics setting for use_asynchronous_gpu_emulation.	2019-01-12 01:36:45 -05:00
bunnei	6e589d9d59	memory: Remove HLE lock on Read/Write.	2019-01-12 01:36:45 -05:00
bunnei	353d066264	core: Set is_powered_on before GPU is initialized.	2019-01-12 01:36:44 -05:00