Create python-publish.yml

Merge pull request #9318 from goldenx86/glsl-ftw
Replace GLASM with GLSL as the default OpenGL shader backend
2022-11-26 18:25:50 -06:00 · 2022-11-26 15:57:37 -08:00 · 2022-11-26 17:39:43 -03:00 · 2022-11-26 17:27:04 -03:00 · 2022-11-26 09:35:45 -05:00 · 2022-11-26 09:08:55 -05:00
31 changed files with 1972 additions and 121 deletions
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -0,0 +1,39 @@
+# This workflow will upload a Python Package using Twine when a release is created
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
+
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+
+name: Upload Python Package
+
+on:
+  release:
+    types: [published]
+
+permissions:
+  contents: read
+
+jobs:
+  deploy:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python
+      uses: actions/setup-python@v3
+      with:
+        python-version: '3.x'
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install build
+    - name: Build package
+      run: python -m build
+    - name: Publish package
+      uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
+      with:
+        user: __token__
+        password: ${{ secrets.PYPI_API_TOKEN }}
--- a/externals/CMakeLists.txt
+++ b/externals/CMakeLists.txt
@@ -92,10 +92,14 @@ endif()
 add_subdirectory(sirit)

 if (ENABLE_WEB_SERVICE)
-    find_package(OpenSSL 1.1)
-    if (OPENSSL_FOUND)
-        set(OPENSSL_LIBRARIES OpenSSL::SSL OpenSSL::Crypto)
-    else()
+    if (NOT WIN32)
+        find_package(OpenSSL 1.1)
+        if (OPENSSL_FOUND)
+            set(OPENSSL_LIBRARIES OpenSSL::SSL OpenSSL::Crypto)
+        endif()
+    endif()
+
+    if (WIN32 OR NOT OPENSSL_FOUND)
        # LibreSSL
        set(LIBRESSL_SKIP_INSTALL ON CACHE BOOL "")
        set(OPENSSLDIR "/etc/ssl/")
--- a/src/audio_core/renderer/command/command_buffer.cpp
+++ b/src/audio_core/renderer/command/command_buffer.cpp
@@ -460,21 +460,23 @@ void CommandBuffer::GenerateDeviceSinkCommand(const s32 node_id, const s16 buffe

    cmd.session_id = session_id;

+    cmd.input_count = parameter.input_count;
+    s16 max_input{0};
+    for (u32 i = 0; i < parameter.input_count; i++) {
+        cmd.inputs[i] = buffer_offset + parameter.inputs[i];
+        max_input = std::max(max_input, cmd.inputs[i]);
+    }
+
    if (state.upsampler_info != nullptr) {
        const auto size_{state.upsampler_info->sample_count * parameter.input_count};
        const auto size_bytes{size_ * sizeof(s32)};
        const auto addr{memory_pool->Translate(state.upsampler_info->samples_pos, size_bytes)};
        cmd.sample_buffer = {reinterpret_cast<s32*>(addr),
-                             parameter.input_count * state.upsampler_info->sample_count};
+                             (max_input + 1) * state.upsampler_info->sample_count};
    } else {
        cmd.sample_buffer = samples_buffer;
    }

-    cmd.input_count = parameter.input_count;
-    for (u32 i = 0; i < parameter.input_count; i++) {
-        cmd.inputs[i] = buffer_offset + parameter.inputs[i];
-    }
-
    GenerateEnd<DeviceSinkCommand>(cmd);
 }

--- a/src/common/settings.h
+++ b/src/common/settings.h
@@ -442,7 +442,7 @@ struct Values {
    SwitchableSetting<NvdecEmulation> nvdec_emulation{NvdecEmulation::GPU, "nvdec_emulation"};
    SwitchableSetting<bool> accelerate_astc{true, "accelerate_astc"};
    SwitchableSetting<bool> use_vsync{true, "use_vsync"};
-    SwitchableSetting<ShaderBackend, true> shader_backend{ShaderBackend::GLASM, ShaderBackend::GLSL,
+    SwitchableSetting<ShaderBackend, true> shader_backend{ShaderBackend::GLSL, ShaderBackend::GLSL,
                                                          ShaderBackend::SPIRV, "shader_backend"};
    SwitchableSetting<bool> use_asynchronous_shaders{false, "use_asynchronous_shaders"};
    SwitchableSetting<bool> use_fast_gpu_time{true, "use_fast_gpu_time"};
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -28,6 +28,10 @@ add_library(video_core STATIC
    dirty_flags.h
    dma_pusher.cpp
    dma_pusher.h
+    engines/sw_blitter/blitter.cpp
+    engines/sw_blitter/blitter.h
+    engines/sw_blitter/converter.cpp
+    engines/sw_blitter/converter.h
    engines/const_buffer_info.h
    engines/engine_interface.h
    engines/engine_upload.cpp
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -1742,12 +1742,12 @@ bool BufferCache<P>::InlineMemory(VAddr dest_address, size_t copy_size,
    SynchronizeBuffer(buffer, dest_address, static_cast<u32>(copy_size));

    if constexpr (USE_MEMORY_MAPS) {
+        auto upload_staging = runtime.UploadStagingBuffer(copy_size);
        std::array copies{BufferCopy{
-            .src_offset = 0,
+            .src_offset = upload_staging.offset,
            .dst_offset = buffer.Offset(dest_address),
            .size = copy_size,
        }};
-        auto upload_staging = runtime.UploadStagingBuffer(copy_size);
        u8* const src_pointer = upload_staging.mapped_span.data();
        std::memcpy(src_pointer, inlined_buffer.data(), copy_size);
        runtime.CopyBuffer(buffer, upload_staging.buffer, copies);
--- a/src/video_core/control/channel_state.cpp
+++ b/src/video_core/control/channel_state.cpp
@@ -20,7 +20,7 @@ void ChannelState::Init(Core::System& system, GPU& gpu) {
    ASSERT(memory_manager);
    dma_pusher = std::make_unique<Tegra::DmaPusher>(system, gpu, *memory_manager, *this);
    maxwell_3d = std::make_unique<Engines::Maxwell3D>(system, *memory_manager);
-    fermi_2d = std::make_unique<Engines::Fermi2D>();
+    fermi_2d = std::make_unique<Engines::Fermi2D>(*memory_manager);
    kepler_compute = std::make_unique<Engines::KeplerCompute>(system, *memory_manager);
    maxwell_dma = std::make_unique<Engines::MaxwellDMA>(system, *memory_manager);
    kepler_memory = std::make_unique<Engines::KeplerMemory>(system, *memory_manager);
--- a/src/video_core/engines/engine_upload.cpp
+++ b/src/video_core/engines/engine_upload.cpp
@@ -51,11 +51,11 @@ void State::ProcessData(std::span<const u8> read_buffer) {
        } else {
            for (u32 line = 0; line < regs.line_count; ++line) {
                const GPUVAddr dest_line = address + static_cast<size_t>(line) * regs.dest.pitch;
-                memory_manager.WriteBlockUnsafe(
-                    dest_line, read_buffer.data() + static_cast<size_t>(line) * regs.line_length_in,
-                    regs.line_length_in);
+                std::span<const u8> buffer(read_buffer.data() +
+                                               static_cast<size_t>(line) * regs.line_length_in,
+                                           regs.line_length_in);
+                rasterizer->AccelerateInlineToMemory(dest_line, regs.line_length_in, buffer);
            }
-            memory_manager.InvalidateRegion(address, regs.dest.pitch * regs.line_count);
        }
    } else {
        u32 width = regs.dest.width;
--- a/src/video_core/engines/fermi_2d.cpp
+++ b/src/video_core/engines/fermi_2d.cpp
@@ -3,17 +3,25 @@

 #include "common/assert.h"
 #include "common/logging/log.h"
+#include "common/microprofile.h"
 #include "video_core/engines/fermi_2d.h"
-#include "video_core/memory_manager.h"
+#include "video_core/engines/sw_blitter/blitter.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/surface.h"
+#include "video_core/textures/decoders.h"
+
+MICROPROFILE_DECLARE(GPU_BlitEngine);
+MICROPROFILE_DEFINE(GPU_BlitEngine, "GPU", "Blit Engine", MP_RGB(224, 224, 128));

 using VideoCore::Surface::BytesPerBlock;
 using VideoCore::Surface::PixelFormatFromRenderTargetFormat;

 namespace Tegra::Engines {

-Fermi2D::Fermi2D() {
+using namespace Texture;
+
+Fermi2D::Fermi2D(MemoryManager& memory_manager_) {
+    sw_blitter = std::make_unique<Blitter::SoftwareBlitEngine>(memory_manager_);
    // Nvidia's OpenGL driver seems to assume these values
    regs.src.depth = 1;
    regs.dst.depth = 1;
@@ -42,6 +50,7 @@ void Fermi2D::CallMultiMethod(u32 method, const u32* base_start, u32 amount, u32
 }

 void Fermi2D::Blit() {
+    MICROPROFILE_SCOPE(GPU_BlitEngine);
    LOG_DEBUG(HW_GPU, "called. source address=0x{:x}, destination address=0x{:x}",
              regs.src.Address(), regs.dst.Address());

@@ -52,9 +61,16 @@ void Fermi2D::Blit() {
    UNIMPLEMENTED_IF_MSG(regs.clip_enable != 0, "Clipped blit enabled");

    const auto& args = regs.pixels_from_memory;
+    constexpr s64 null_derivate = 1ULL << 32;
+    Surface src = regs.src;
+    const auto bytes_per_pixel = BytesPerBlock(PixelFormatFromRenderTargetFormat(src.format));
+    const bool delegate_to_gpu = src.width > 512 && src.height > 512 && bytes_per_pixel <= 8 &&
+                                 src.format != regs.dst.format;
    Config config{
        .operation = regs.operation,
        .filter = args.sample_mode.filter,
+        .must_accelerate =
+            args.du_dx != null_derivate || args.dv_dy != null_derivate || delegate_to_gpu,
        .dst_x0 = args.dst_x0,
        .dst_y0 = args.dst_y0,
        .dst_x1 = args.dst_x0 + args.dst_width,
@@ -64,8 +80,7 @@ void Fermi2D::Blit() {
        .src_x1 = static_cast<s32>((args.du_dx * args.dst_width + args.src_x0) >> 32),
        .src_y1 = static_cast<s32>((args.dv_dy * args.dst_height + args.src_y0) >> 32),
    };
-    Surface src = regs.src;
-    const auto bytes_per_pixel = BytesPerBlock(PixelFormatFromRenderTargetFormat(src.format));
+
    const auto need_align_to_pitch =
        src.linear == Tegra::Engines::Fermi2D::MemoryLayout::Pitch &&
        static_cast<s32>(src.width) == config.src_x1 &&
@@ -78,8 +93,9 @@ void Fermi2D::Blit() {
        config.src_x1 -= config.src_x0;
        config.src_x0 = 0;
    }
+
    if (!rasterizer->AccelerateSurfaceCopy(src, regs.dst, config)) {
-        UNIMPLEMENTED();
+        sw_blitter->Blit(src, regs.dst, config);
    }
 }

--- a/src/video_core/engines/fermi_2d.h
+++ b/src/video_core/engines/fermi_2d.h
@@ -5,6 +5,7 @@

 #include <array>
 #include <cstddef>
+#include <memory>
 #include "common/bit_field.h"
 #include "common/common_funcs.h"
 #include "common/common_types.h"
@@ -21,6 +22,10 @@ class RasterizerInterface;

 namespace Tegra::Engines {

+namespace Blitter {
+class SoftwareBlitEngine;
+}
+
 /**
 * This Engine is known as G80_2D. Documentation can be found in:
 * https://github.com/envytools/envytools/blob/master/rnndb/graph/g80_2d.xml
@@ -32,7 +37,7 @@ namespace Tegra::Engines {

 class Fermi2D final : public EngineInterface {
 public:
-    explicit Fermi2D();
+    explicit Fermi2D(MemoryManager& memory_manager_);
    ~Fermi2D() override;

    /// Binds a rasterizer to this engine.
@@ -286,6 +291,7 @@ public:
    struct Config {
        Operation operation;
        Filter filter;
+        bool must_accelerate;
        s32 dst_x0;
        s32 dst_y0;
        s32 dst_x1;
@@ -298,6 +304,7 @@ public:

 private:
    VideoCore::RasterizerInterface* rasterizer = nullptr;
+    std::unique_ptr<Blitter::SoftwareBlitEngine> sw_blitter;

    /// Performs the copy from the source surface to the destination surface as configured in the
    /// registers.
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -126,6 +126,7 @@ void Maxwell3D::InitializeRegisterDefaults() {
    draw_command[MAXWELL3D_REG_INDEX(draw_inline_index)] = true;
    draw_command[MAXWELL3D_REG_INDEX(inline_index_2x16.even)] = true;
    draw_command[MAXWELL3D_REG_INDEX(inline_index_4x8.index0)] = true;
+    draw_command[MAXWELL3D_REG_INDEX(draw.instance_id)] = true;
 }

 void Maxwell3D::ProcessMacro(u32 method, const u32* base_start, u32 amount, bool is_last_call) {
@@ -249,9 +250,6 @@ void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argume
        return;
    case MAXWELL3D_REG_INDEX(fragment_barrier):
        return rasterizer->FragmentBarrier();
-    case MAXWELL3D_REG_INDEX(invalidate_texture_data_cache):
-        rasterizer->InvalidateGPUCache();
-        return rasterizer->WaitForIdle();
    case MAXWELL3D_REG_INDEX(tiled_cache_barrier):
        return rasterizer->TiledCacheBarrier();
    }
@@ -288,31 +286,58 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
    ASSERT_MSG(method < Regs::NUM_REGS,
               "Invalid Maxwell3D register, increase the size of the Regs structure");

+    const u32 argument = ProcessShadowRam(method, method_argument);
+    ProcessDirtyRegisters(method, argument);
+
    if (draw_command[method]) {
        regs.reg_array[method] = method_argument;
        deferred_draw_method.push_back(method);
-        auto u32_to_u8 = [&](const u32 argument) {
-            inline_index_draw_indexes.push_back(static_cast<u8>(argument & 0x000000ff));
-            inline_index_draw_indexes.push_back(static_cast<u8>((argument & 0x0000ff00) >> 8));
-            inline_index_draw_indexes.push_back(static_cast<u8>((argument & 0x00ff0000) >> 16));
-            inline_index_draw_indexes.push_back(static_cast<u8>((argument & 0xff000000) >> 24));
+        auto update_inline_index = [&](const u32 index) {
+            inline_index_draw_indexes.push_back(static_cast<u8>(index & 0x000000ff));
+            inline_index_draw_indexes.push_back(static_cast<u8>((index & 0x0000ff00) >> 8));
+            inline_index_draw_indexes.push_back(static_cast<u8>((index & 0x00ff0000) >> 16));
+            inline_index_draw_indexes.push_back(static_cast<u8>((index & 0xff000000) >> 24));
+            draw_mode = DrawMode::InlineIndex;
        };
-        if (MAXWELL3D_REG_INDEX(draw_inline_index) == method) {
-            u32_to_u8(method_argument);
-        } else if (MAXWELL3D_REG_INDEX(inline_index_2x16.even) == method) {
-            u32_to_u8(regs.inline_index_2x16.even);
-            u32_to_u8(regs.inline_index_2x16.odd);
-        } else if (MAXWELL3D_REG_INDEX(inline_index_4x8.index0) == method) {
-            u32_to_u8(regs.inline_index_4x8.index0);
-            u32_to_u8(regs.inline_index_4x8.index1);
-            u32_to_u8(regs.inline_index_4x8.index2);
-            u32_to_u8(regs.inline_index_4x8.index3);
+        switch (method) {
+        case MAXWELL3D_REG_INDEX(draw.end):
+            switch (draw_mode) {
+            case DrawMode::General:
+                ProcessDraw(1);
+                break;
+            case DrawMode::InlineIndex:
+                regs.index_buffer.count = static_cast<u32>(inline_index_draw_indexes.size() / 4);
+                regs.index_buffer.format = Regs::IndexFormat::UnsignedInt;
+                ProcessDraw(1);
+                inline_index_draw_indexes.clear();
+                break;
+            case DrawMode::Instance:
+                break;
+            }
+            break;
+        case MAXWELL3D_REG_INDEX(draw_inline_index):
+            update_inline_index(method_argument);
+            break;
+        case MAXWELL3D_REG_INDEX(inline_index_2x16.even):
+            update_inline_index(regs.inline_index_2x16.even);
+            update_inline_index(regs.inline_index_2x16.odd);
+            break;
+        case MAXWELL3D_REG_INDEX(inline_index_4x8.index0):
+            update_inline_index(regs.inline_index_4x8.index0);
+            update_inline_index(regs.inline_index_4x8.index1);
+            update_inline_index(regs.inline_index_4x8.index2);
+            update_inline_index(regs.inline_index_4x8.index3);
+            break;
+        case MAXWELL3D_REG_INDEX(draw.instance_id):
+            draw_mode =
+                (regs.draw.instance_id == Maxwell3D::Regs::Draw::InstanceId::Subsequent) ||
+                        (regs.draw.instance_id == Maxwell3D::Regs::Draw::InstanceId::Unchanged)
+                    ? DrawMode::Instance
+                    : DrawMode::General;
+            break;
        }
    } else {
        ProcessDeferredDraw();
-
-        const u32 argument = ProcessShadowRam(method, method_argument);
-        ProcessDirtyRegisters(method, argument);
        ProcessMethodCall(method, argument, method_argument, is_last_call);
    }
 }
@@ -511,10 +536,7 @@ void Maxwell3D::ProcessCounterReset() {

 void Maxwell3D::ProcessSyncPoint() {
    const u32 sync_point = regs.sync_info.sync_point.Value();
-    const u32 cache_flush = regs.sync_info.clean_l2.Value();
-    if (cache_flush != 0) {
-        rasterizer->InvalidateGPUCache();
-    }
+    [[maybe_unused]] const u32 cache_flush = regs.sync_info.clean_l2.Value();
    rasterizer->SignalSyncPoint(sync_point);
 }

@@ -626,57 +648,27 @@ void Maxwell3D::ProcessDraw(u32 instance_count) {
 }

 void Maxwell3D::ProcessDeferredDraw() {
-    if (deferred_draw_method.empty()) {
+    if (draw_mode != DrawMode::Instance || deferred_draw_method.empty()) {
        return;
    }

-    enum class DrawMode {
-        Undefined,
-        General,
-        Instance,
-    };
-    DrawMode draw_mode{DrawMode::Undefined};
    u32 method_count = static_cast<u32>(deferred_draw_method.size());
-    u32 method = deferred_draw_method[method_count - 1];
-    if (MAXWELL3D_REG_INDEX(draw.end) != method) {
-        return;
-    }
-    draw_mode = (regs.draw.instance_id == Maxwell3D::Regs::Draw::InstanceId::Subsequent) ||
-                        (regs.draw.instance_id == Maxwell3D::Regs::Draw::InstanceId::Unchanged)
-                    ? DrawMode::Instance
-                    : DrawMode::General;
-    u32 instance_count = 0;
-    if (draw_mode == DrawMode::Instance) {
-        u32 vertex_buffer_count = 0;
-        u32 index_buffer_count = 0;
-        for (u32 index = 0; index < method_count; ++index) {
-            method = deferred_draw_method[index];
-            if (method == MAXWELL3D_REG_INDEX(vertex_buffer.count)) {
-                instance_count = ++vertex_buffer_count;
-            } else if (method == MAXWELL3D_REG_INDEX(index_buffer.count)) {
-                instance_count = ++index_buffer_count;
-            }
-        }
-        ASSERT_MSG(!(vertex_buffer_count && index_buffer_count),
-                   "Instance both indexed and direct?");
-    } else {
-        instance_count = 1;
-        for (u32 index = 0; index < method_count; ++index) {
-            method = deferred_draw_method[index];
-            if (MAXWELL3D_REG_INDEX(draw_inline_index) == method ||
-                MAXWELL3D_REG_INDEX(inline_index_2x16.even) == method ||
-                MAXWELL3D_REG_INDEX(inline_index_4x8.index0) == method) {
-                regs.index_buffer.count = static_cast<u32>(inline_index_draw_indexes.size() / 4);
-                regs.index_buffer.format = Regs::IndexFormat::UnsignedInt;
-                break;
-            }
+    u32 instance_count = 1;
+    u32 vertex_buffer_count = 0;
+    u32 index_buffer_count = 0;
+    for (u32 index = 0; index < method_count; ++index) {
+        u32 method = deferred_draw_method[index];
+        if (method == MAXWELL3D_REG_INDEX(vertex_buffer.count)) {
+            instance_count = ++vertex_buffer_count;
+        } else if (method == MAXWELL3D_REG_INDEX(index_buffer.count)) {
+            instance_count = ++index_buffer_count;
        }
    }
+    ASSERT_MSG(!(vertex_buffer_count && index_buffer_count), "Instance both indexed and direct?");

    ProcessDraw(instance_count);

    deferred_draw_method.clear();
-    inline_index_draw_indexes.clear();
 }

 } // namespace Tegra::Engines
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -3148,10 +3148,12 @@ private:
    /// Handles use of topology overrides (e.g., to avoid using a topology assigned from a macro)
    void ProcessTopologyOverride();

-    void ProcessDraw(u32 instance_count = 1);
-
+    /// Handles deferred draws (e.g., instanced draws).
    void ProcessDeferredDraw();

+    /// Handles a draw.
+    void ProcessDraw(u32 instance_count = 1);
+
    /// Returns a query's value or an empty object if the value will be deferred through a cache.
    std::optional<u64> GetQueryResult();

@@ -3178,6 +3180,8 @@ private:

    std::array<bool, Regs::NUM_REGS> draw_command{};
    std::vector<u32> deferred_draw_method;
+    enum class DrawMode : u32 { General = 0, Instance, InlineIndex };
+    DrawMode draw_mode{DrawMode::General};
 };

 #define ASSERT_REG_POSITION(field_name, position)                                                  \
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -62,7 +62,8 @@ void MaxwellDMA::Launch() {

        if (!is_src_pitch && !is_dst_pitch) {
            // If both the source and the destination are in block layout, assert.
-            UNIMPLEMENTED_MSG("Tiled->Tiled DMA transfers are not yet implemented");
+            CopyBlockLinearToBlockLinear();
+            ReleaseSemaphore();
            return;
        }

@@ -291,6 +292,70 @@ void MaxwellDMA::FastCopyBlockLinearToPitch() {
    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
 }

+void MaxwellDMA::CopyBlockLinearToBlockLinear() {
+    UNIMPLEMENTED_IF(regs.src_params.block_size.width != 0);
+
+    const bool is_remapping = regs.launch_dma.remap_enable != 0;
+
+    // Deswizzle the source rect into a linear buffer, then swizzle it into the destination.
+    const Parameters& src = regs.src_params;
+    const Parameters& dst = regs.dst_params;
+
+    const u32 num_remap_components = regs.remap_const.num_dst_components_minus_one + 1;
+    const u32 remap_components_size = regs.remap_const.component_size_minus_one + 1;
+
+    const u32 base_bpp = !is_remapping ? 1U : num_remap_components * remap_components_size;
+
+    u32 src_width = src.width;
+    u32 dst_width = dst.width;
+    u32 x_elements = regs.line_length_in;
+    u32 src_x_offset = src.origin.x;
+    u32 dst_x_offset = dst.origin.x;
+    u32 bpp_shift = 0U;
+    if (!is_remapping) {
+        bpp_shift = Common::FoldRight(
+            4U, [](u32 x, u32 y) { return std::min(x, static_cast<u32>(std::countr_zero(y))); },
+            src_width, dst_width, x_elements, src_x_offset, dst_x_offset,
+            static_cast<u32>(regs.offset_in), static_cast<u32>(regs.offset_out));
+        src_width >>= bpp_shift;
+        dst_width >>= bpp_shift;
+        x_elements >>= bpp_shift;
+        src_x_offset >>= bpp_shift;
+        dst_x_offset >>= bpp_shift;
+    }
+
+    const u32 bytes_per_pixel = base_bpp << bpp_shift;
+    const size_t src_size = CalculateSize(true, bytes_per_pixel, src_width, src.height, src.depth,
+                                          src.block_size.height, src.block_size.depth);
+    const size_t dst_size = CalculateSize(true, bytes_per_pixel, dst_width, dst.height, dst.depth,
+                                          dst.block_size.height, dst.block_size.depth);
+
+    const u32 pitch = x_elements * bytes_per_pixel;
+    const size_t mid_buffer_size = pitch * regs.line_count;
+
+    if (read_buffer.size() < src_size) {
+        read_buffer.resize(src_size);
+    }
+    if (write_buffer.size() < dst_size) {
+        write_buffer.resize(dst_size);
+    }
+
+    intermediate_buffer.resize(mid_buffer_size);
+
+    memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size);
+    memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size);
+
+    UnswizzleSubrect(intermediate_buffer, read_buffer, bytes_per_pixel, src_width, src.height,
+                     src.depth, src_x_offset, src.origin.y, x_elements, regs.line_count,
+                     src.block_size.height, src.block_size.depth, pitch);
+
+    SwizzleSubrect(write_buffer, intermediate_buffer, bytes_per_pixel, dst_width, dst.height,
+                   dst.depth, dst_x_offset, dst.origin.y, x_elements, regs.line_count,
+                   dst.block_size.height, dst.block_size.depth, pitch);
+
+    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
+}
+
 void MaxwellDMA::ReleaseSemaphore() {
    const auto type = regs.launch_dma.semaphore_type;
    const GPUVAddr address = regs.semaphore.address;
--- a/src/video_core/engines/maxwell_dma.h
+++ b/src/video_core/engines/maxwell_dma.h
@@ -223,6 +223,8 @@ private:

    void CopyPitchToBlockLinear();

+    void CopyBlockLinearToBlockLinear();
+
    void FastCopyBlockLinearToPitch();

    void ReleaseSemaphore();
@@ -234,6 +236,7 @@ private:

    std::vector<u8> read_buffer;
    std::vector<u8> write_buffer;
+    std::vector<u8> intermediate_buffer;

    static constexpr std::size_t NUM_REGS = 0x800;
    struct Regs {
--- a/src/video_core/engines/puller.cpp
+++ b/src/video_core/engines/puller.cpp
@@ -118,7 +118,7 @@ void Puller::ProcessSemaphoreRelease() {
    std::function<void()> operation([this, sequence_address, payload] {
        memory_manager.Write<u32>(sequence_address, payload);
    });
-    rasterizer->SyncOperation(std::move(operation));
+    rasterizer->SignalFence(std::move(operation));
 }

 void Puller::ProcessSemaphoreAcquire() {
@@ -151,8 +151,8 @@ void Puller::CallPullerMethod(const MethodCall& method_call) {
    case BufferMethods::SemaphoreAddressLow:
    case BufferMethods::SemaphoreSequencePayload:
    case BufferMethods::SyncpointPayload:
-        break;
    case BufferMethods::WrcacheFlush:
+        break;
    case BufferMethods::RefCnt:
        rasterizer->SignalReference();
        break;
--- a/src/video_core/engines/sw_blitter/blitter.cpp
+++ b/src/video_core/engines/sw_blitter/blitter.cpp
@@ -0,0 +1,238 @@
+// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include <algorithm>
+#include <cmath>
+#include <vector>
+
+#include "video_core/engines/sw_blitter/blitter.h"
+#include "video_core/engines/sw_blitter/converter.h"
+#include "video_core/memory_manager.h"
+#include "video_core/surface.h"
+#include "video_core/textures/decoders.h"
+
+namespace Tegra {
+class MemoryManager;
+}
+
+using VideoCore::Surface::BytesPerBlock;
+using VideoCore::Surface::PixelFormatFromRenderTargetFormat;
+
+namespace Tegra::Engines::Blitter {
+
+using namespace Texture;
+
+namespace {
+
+constexpr size_t ir_components = 4;
+
+// Nearest-neighbor resize of tightly packed texels that are `bpp` bytes wide. Source coordinates
+// advance in 32.32 fixed point, one step per destination texel.
+void NearestNeighbor(std::span<const u8> input, std::span<u8> output, u32 src_width, u32 src_height,
+                     u32 dst_width, u32 dst_height, size_t bpp) {
+    const size_t dx_du = std::llround((static_cast<f64>(src_width) / dst_width) * (1ULL << 32));
+    const size_t dy_dv = std::llround((static_cast<f64>(src_height) / dst_height) * (1ULL << 32));
+    size_t src_y = 0;
+    for (u32 y = 0; y < dst_height; y++, src_y += dy_dv) {
+        size_t src_x = 0;
+        for (u32 x = 0; x < dst_width; x++, src_x += dx_du) {
+            // Truncate each coordinate on its own so the row fraction cannot shift the column.
+            const size_t read_from = ((src_y >> 32) * src_width + (src_x >> 32)) * bpp;
+            const size_t write_to = (y * dst_width + x) * bpp;
+            std::memcpy(&output[write_to], &input[read_from], bpp);
+        }
+    }
+}
+
+// Nearest-neighbor resize of tightly packed texels of `ir_components` f32 channels each. Source
+// coordinates advance in 32.32 fixed point, one step per destination texel.
+void NearestNeighborFast(std::span<const f32> input, std::span<f32> output, u32 src_width,
+                         u32 src_height, u32 dst_width, u32 dst_height) {
+    const size_t dx_du = std::llround((static_cast<f64>(src_width) / dst_width) * (1ULL << 32));
+    const size_t dy_dv = std::llround((static_cast<f64>(src_height) / dst_height) * (1ULL << 32));
+    size_t src_y = 0;
+    for (u32 y = 0; y < dst_height; y++, src_y += dy_dv) {
+        size_t src_x = 0;
+        for (u32 x = 0; x < dst_width; x++, src_x += dx_du) {
+            // Truncate each coordinate on its own so the row fraction cannot shift the column.
+            const size_t read_from = ((src_y >> 32) * src_width + (src_x >> 32)) * ir_components;
+            const size_t write_to = (y * dst_width + x) * ir_components;
+            std::memcpy(&output[write_to], &input[read_from], sizeof(f32) * ir_components);
+        }
+    }
+}
+
+// Bilinear resize of tightly packed texels of `ir_components` f32 channels each. On both axes,
+// the first and last destination texels sample exactly the first and last source texels.
+void Bilinear(std::span<const f32> input, std::span<f32> output, size_t src_width,
+              size_t src_height, size_t dst_width, size_t dst_height) {
+    if (src_width == 0 || src_height == 0) {
+        return; // Nothing to sample from; also keeps `src_width - 1` below from wrapping.
+    }
+    const auto bilinear_sample = [](std::span<const f32> x0_y0, std::span<const f32> x1_y0,
+                                    std::span<const f32> x0_y1, std::span<const f32> x1_y1,
+                                    f32 weight_x, f32 weight_y) {
+        std::array<f32, ir_components> result{};
+        for (size_t i = 0; i < ir_components; i++) {
+            const f32 a = std::lerp(x0_y0[i], x1_y0[i], weight_x);
+            const f32 b = std::lerp(x0_y1[i], x1_y1[i], weight_x);
+            result[i] = std::lerp(a, b, weight_y);
+        }
+        return result;
+    };
+    // Texel lookup by whole-number source coordinates; rows are stored one after another.
+    const auto read_src = [&](f32 in_x, f32 in_y) {
+        const size_t read_from =
+            (static_cast<size_t>(in_y) * src_width + static_cast<size_t>(in_x)) * ir_components;
+        return std::span<const f32>(&input[read_from], ir_components);
+    };
+    const f32 max_x = static_cast<f32>(src_width - 1);
+    const f32 max_y = static_cast<f32>(src_height - 1);
+    // Source distance covered per destination texel (0 when that axis has a single texel).
+    const f32 dx_du = dst_width > 1 ? max_x / static_cast<f32>(dst_width - 1) : 0.f;
+    const f32 dy_dv = dst_height > 1 ? max_y / static_cast<f32>(dst_height - 1) : 0.f;
+    for (u32 y = 0; y < dst_height; y++) {
+        const f32 src_y = std::min(static_cast<f32>(y) * dy_dv, max_y);
+        const f32 y_low = std::floor(src_y);
+        const f32 y_high = std::ceil(src_y);
+        for (u32 x = 0; x < dst_width; x++) {
+            // Clamp: rounding in the step must not push the upper neighbor past the last texel.
+            const f32 src_x = std::min(static_cast<f32>(x) * dx_du, max_x);
+            const f32 x_low = std::floor(src_x);
+            const f32 x_high = std::ceil(src_x);
+            const auto result = bilinear_sample(read_src(x_low, y_low), read_src(x_high, y_low),
+                                                read_src(x_low, y_high), read_src(x_high, y_high),
+                                                src_x - x_low, src_y - y_low);
+            const size_t write_to = (y * dst_width + x) * ir_components;
+            std::memcpy(&output[write_to], &result, sizeof(f32) * ir_components);
+        }
+    }
+}
+
+} // namespace
+
+struct SoftwareBlitEngine::BlitEngineImpl {
+    std::vector<u8> tmp_buffer;
+    std::vector<u8> src_buffer;
+    std::vector<u8> dst_buffer;
+    std::vector<f32> intermediate_src;
+    std::vector<f32> intermediate_dst;
+    ConverterFactory converter_factory;
+};
+
+SoftwareBlitEngine::SoftwareBlitEngine(MemoryManager& memory_manager_)
+    : memory_manager{memory_manager_} {
+    impl = std::make_unique<BlitEngineImpl>();
+}
+
+SoftwareBlitEngine::~SoftwareBlitEngine() = default;
+
+/// Performs a 2D blit entirely through guest memory.
+///
+/// The whole source surface is read with MemoryManager::ReadBlock and the source rectangle from
+/// `config` is extracted into a tightly packed buffer. That buffer is converted and/or scaled to
+/// the destination rectangle, merged into a copy of the destination surface, and the surface is
+/// written back with MemoryManager::WriteBlock.
+///
+/// @param src    Source surface description.
+/// @param dst    Destination surface description.
+/// @param config Source/destination rectangles (x0, y0, x1, y1) and the scaling filter.
+/// @return Always true.
+bool SoftwareBlitEngine::Blit(Fermi2D::Surface& src, Fermi2D::Surface& dst,
+                              Fermi2D::Config& config) {
+    // Byte size of a whole surface as laid out in guest memory.
+    const auto get_surface_size = [](Fermi2D::Surface& surface, u32 bytes_per_pixel) {
+        if (surface.linear == Fermi2D::MemoryLayout::BlockLinear) {
+            return CalculateSize(true, bytes_per_pixel, surface.width, surface.height,
+                                 surface.depth, surface.block_height, surface.block_depth);
+        }
+        // Widen before multiplying so the product cannot wrap at 32 bits.
+        return static_cast<size_t>(surface.pitch) * static_cast<size_t>(surface.height);
+    };
+    // Copies a rectangle between a pitch-linear surface and a tightly packed buffer.
+    //   unpack == false: surface (input) -> packed rows (output)
+    //   unpack == true:  packed rows (input) -> surface (output)
+    const auto process_pitch_linear = [](bool unpack, std::span<const u8> input,
+                                         std::span<u8> output, u32 extent_x, u32 extent_y,
+                                         u32 pitch, u32 x0, u32 y0, size_t bpp) {
+        const size_t base_offset = x0 * bpp;
+        const size_t copy_size = extent_x * bpp;
+        // `y` is the row inside the rectangle; the matching surface row is y0 + y. Iterating
+        // from y0 up to extent_y would drop rows and misplace the rest whenever y0 != 0.
+        for (size_t y = 0; y < extent_y; y++) {
+            const size_t first_offset = (y + y0) * pitch + base_offset;
+            const size_t second_offset = y * extent_x * bpp;
+            u8* write_to = unpack ? &output[first_offset] : &output[second_offset];
+            const u8* read_from = unpack ? &input[second_offset] : &input[first_offset];
+            std::memcpy(write_to, read_from, copy_size);
+        }
+    };
+
+    const u32 src_extent_x = config.src_x1 - config.src_x0;
+    const u32 src_extent_y = config.src_y1 - config.src_y0;
+
+    const u32 dst_extent_x = config.dst_x1 - config.dst_x0;
+    const u32 dst_extent_y = config.dst_y1 - config.dst_y0;
+    const auto src_bytes_per_pixel = BytesPerBlock(PixelFormatFromRenderTargetFormat(src.format));
+    const auto dst_bytes_per_pixel = BytesPerBlock(PixelFormatFromRenderTargetFormat(dst.format));
+
+    // Pull the entire source surface out of guest memory.
+    const size_t src_size = get_surface_size(src, src_bytes_per_pixel);
+    impl->tmp_buffer.resize(src_size);
+    memory_manager.ReadBlock(src.Address(), impl->tmp_buffer.data(), src_size);
+
+    const size_t src_copy_size = src_extent_x * src_extent_y * src_bytes_per_pixel;
+
+    const size_t dst_copy_size = dst_extent_x * dst_extent_y * dst_bytes_per_pixel;
+
+    impl->src_buffer.resize(src_copy_size);
+
+    // A plain copy is only possible when neither the format nor the rectangle size changes.
+    const bool no_passthrough =
+        src.format != dst.format || src_extent_x != dst_extent_x || src_extent_y != dst_extent_y;
+
+    // Same format, different size: scale directly on the raw pixel bytes.
+    const auto conversion_phase_same_format = [&]() {
+        NearestNeighbor(impl->src_buffer, impl->dst_buffer, src_extent_x, src_extent_y,
+                        dst_extent_x, dst_extent_y, dst_bytes_per_pixel);
+    };
+
+    // Format change or bilinear filtering: go through the f32 intermediate representation
+    // (ir_components floats per pixel).
+    const auto conversion_phase_ir = [&]() {
+        auto* input_converter = impl->converter_factory.GetFormatConverter(src.format);
+        impl->intermediate_src.resize((src_copy_size / src_bytes_per_pixel) * ir_components);
+        impl->intermediate_dst.resize((dst_copy_size / dst_bytes_per_pixel) * ir_components);
+        input_converter->ConvertTo(impl->src_buffer, impl->intermediate_src);
+
+        if (config.filter != Fermi2D::Filter::Bilinear) {
+            NearestNeighborFast(impl->intermediate_src, impl->intermediate_dst, src_extent_x,
+                                src_extent_y, dst_extent_x, dst_extent_y);
+        } else {
+            Bilinear(impl->intermediate_src, impl->intermediate_dst, src_extent_x, src_extent_y,
+                     dst_extent_x, dst_extent_y);
+        }
+
+        auto* output_converter = impl->converter_factory.GetFormatConverter(dst.format);
+        output_converter->ConvertFrom(impl->intermediate_dst, impl->dst_buffer);
+    };
+
+    // Do the actual blit: first extract the source rectangle into src_buffer.
+
+    impl->dst_buffer.resize(dst_copy_size);
+    if (src.linear == Fermi2D::MemoryLayout::BlockLinear) {
+        UnswizzleSubrect(impl->src_buffer, impl->tmp_buffer, src_bytes_per_pixel, src.width,
+                         src.height, src.depth, config.src_x0, config.src_y0, src_extent_x,
+                         src_extent_y, src.block_height, src.block_depth,
+                         src_extent_x * src_bytes_per_pixel);
+    } else {
+        process_pitch_linear(false, impl->tmp_buffer, impl->src_buffer, src_extent_x, src_extent_y,
+                             src.pitch, config.src_x0, config.src_y0, src_bytes_per_pixel);
+    }
+
+    // Conversion Phase
+    if (no_passthrough) {
+        if (src.format != dst.format || config.filter == Fermi2D::Filter::Bilinear) {
+            conversion_phase_ir();
+        } else {
+            conversion_phase_same_format();
+        }
+    } else {
+        impl->dst_buffer.swap(impl->src_buffer);
+    }
+
+    // Read the destination surface so that pixels outside the rectangle are preserved.
+    const size_t dst_size = get_surface_size(dst, dst_bytes_per_pixel);
+    impl->tmp_buffer.resize(dst_size);
+    memory_manager.ReadBlock(dst.Address(), impl->tmp_buffer.data(), dst_size);
+
+    if (dst.linear == Fermi2D::MemoryLayout::BlockLinear) {
+        SwizzleSubrect(impl->tmp_buffer, impl->dst_buffer, dst_bytes_per_pixel, dst.width,
+                       dst.height, dst.depth, config.dst_x0, config.dst_y0, dst_extent_x,
+                       dst_extent_y, dst.block_height, dst.block_depth,
+                       dst_extent_x * dst_bytes_per_pixel);
+    } else {
+        process_pitch_linear(true, impl->dst_buffer, impl->tmp_buffer, dst_extent_x, dst_extent_y,
+                             dst.pitch, config.dst_x0, config.dst_y0,
+                             static_cast<size_t>(dst_bytes_per_pixel));
+    }
+    memory_manager.WriteBlock(dst.Address(), impl->tmp_buffer.data(), dst_size);
+    return true;
+}
+
+} // namespace Tegra::Engines::Blitter
--- a/src/video_core/engines/sw_blitter/blitter.h
+++ b/src/video_core/engines/sw_blitter/blitter.h
@@ -0,0 +1,27 @@
+// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#pragma once
+
+#include "video_core/engines/fermi_2d.h"
+
+namespace Tegra {
+class MemoryManager;
+}
+
+namespace Tegra::Engines::Blitter {
+
+class SoftwareBlitEngine { // Blits between Fermi2D surfaces via MemoryManager reads and writes
+public:
+    explicit SoftwareBlitEngine(MemoryManager& memory_manager_); // Keeps a reference, not a copy
+    ~SoftwareBlitEngine(); // Declared here because BlitEngineImpl is incomplete in this header
+
+    bool Blit(Fermi2D::Surface& src, Fermi2D::Surface& dst, Fermi2D::Config& copy_config);
+
+private:
+    MemoryManager& memory_manager; // Held by reference; the referenced object must outlive this
+    struct BlitEngineImpl; // Only forward-declared in this header
+    std::unique_ptr<BlitEngineImpl> impl; // Owns the private implementation state
+};
+
+} // namespace Tegra::Engines::Blitter
--- a/src/video_core/engines/sw_blitter/converter.cpp
+++ b/src/video_core/engines/sw_blitter/converter.cpp
--- a/src/video_core/engines/sw_blitter/converter.h
+++ b/src/video_core/engines/sw_blitter/converter.h
@@ -0,0 +1,36 @@
+// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#pragma once
+
+#include <memory>
+#include <span>
+
+#include "common/common_types.h"
+
+#include "video_core/gpu.h"
+
+namespace Tegra::Engines::Blitter {
+
+class Converter { // Interface translating between raw pixel bytes and f32 values
+public:
+    virtual void ConvertTo(std::span<const u8> input, std::span<f32> output) = 0; // u8 -> f32
+    virtual void ConvertFrom(std::span<const f32> input, std::span<u8> output) = 0; // f32 -> u8
+    virtual ~Converter() = default; // Virtual so deletion through Converter* is well-defined
+};
+
+class ConverterFactory { // Hands out Converter objects selected by RenderTargetFormat
+public:
+    ConverterFactory();
+    ~ConverterFactory();
+
+    Converter* GetFormatConverter(RenderTargetFormat format); // Returns a raw Converter pointer
+
+private:
+    Converter* BuildConverter(RenderTargetFormat format); // NOTE(review): body not visible here
+
+    struct ConverterFactoryImpl; // Only forward-declared in this header
+    std::unique_ptr<ConverterFactoryImpl> impl; // Owns the private implementation state
+};
+
+} // namespace Tegra::Engines::Blitter
--- a/src/video_core/engines/sw_blitter/generate_converters.py
+++ b/src/video_core/engines/sw_blitter/generate_converters.py
@@ -0,0 +1,136 @@
+# SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+import re
+
+class Format:
+    """One render-target format, parsed from a name such as ``A2B10G10R10_UNORM``.
+
+    The text before the first underscore lists the components (a character
+    followed by a decimal bit count); the text after it is the component type.
+    """
+
+    def __init__(self, string_value):
+        self.name = string_value
+        pieces = string_value.split('_')
+        self.component_type = pieces[1]
+        # Every match is one component: a leading character plus its bit width.
+        components = re.findall(r"\w\d+", pieces[0])
+        self.num_components = len(components)
+        self.sizes = [int(item[1:]) for item in components]
+        self.swizzle = [item[0] for item in components]
+
+    def build_component_type_array(self):
+        """Return a braced list that repeats the component type once per component."""
+        entries = ["ComponentType::" + self.component_type] * self.num_components
+        return "{ " + ", ".join(entries) + " }"
+
+    def build_component_sizes_array(self):
+        """Return a braced list of the component bit widths, in declaration order."""
+        entries = [str(self.sizes[index]) for index in range(self.num_components)]
+        return "{ " + ", ".join(entries) + " }"
+
+    def build_component_swizzle_array(self):
+        """Return a braced list of Swizzle enumerators; an ``X`` component maps to ``None``."""
+        entries = []
+        for index in range(self.num_components):
+            letter = self.swizzle[index]
+            entries.append("Swizzle::" + ("None" if letter == "X" else letter))
+        return "{ " + ", ".join(entries) + " }"
+
+    def print_declaration(self):
+        """Print the C++ traits struct for this format to stdout."""
+        member = "  static constexpr std::array<{}, num_components> {} = {};"
+        lines = [
+            "struct " + self.name + "Traits {",
+            "  static constexpr size_t num_components = " + str(self.num_components) + ";",
+            member.format("ComponentType", "component_types", self.build_component_type_array()),
+            member.format("size_t", "component_sizes", self.build_component_sizes_array()),
+            member.format("Swizzle", "component_swizzle", self.build_component_swizzle_array()),
+            "};\n",
+        ]
+        for line in lines:
+            print(line)
+
+    def print_case(self):
+        """Print the C++ switch case that creates and caches this format's converter."""
+        lines = [
+            "case RenderTargetFormat::" + self.name + ":",
+            "  return impl->converters_cache",
+            "    .emplace(format, std::make_unique<ConverterImpl<" + self.name + "Traits>>())",
+            "    .first->second.get();",
+            "  break;",
+        ]
+        for line in lines:
+            print(line)
+
+txt = """
+R32G32B32A32_FLOAT
+R32G32B32A32_SINT
+R32G32B32A32_UINT
+R32G32B32X32_FLOAT
+R32G32B32X32_SINT
+R32G32B32X32_UINT
+R16G16B16A16_UNORM
+R16G16B16A16_SNORM
+R16G16B16A16_SINT
+R16G16B16A16_UINT
+R16G16B16A16_FLOAT
+R32G32_FLOAT
+R32G32_SINT
+R32G32_UINT
+R16G16B16X16_FLOAT
+A8R8G8B8_UNORM
+A8R8G8B8_SRGB
+A2B10G10R10_UNORM
+A2B10G10R10_UINT
+A2R10G10B10_UNORM
+A8B8G8R8_UNORM
+A8B8G8R8_SRGB
+A8B8G8R8_SNORM
+A8B8G8R8_SINT
+A8B8G8R8_UINT
+R16G16_UNORM
+R16G16_SNORM
+R16G16_SINT
+R16G16_UINT
+R16G16_FLOAT
+B10G11R11_FLOAT
+R32_SINT
+R32_UINT
+R32_FLOAT
+X8R8G8B8_UNORM
+X8R8G8B8_SRGB
+R5G6B5_UNORM
+A1R5G5B5_UNORM
+R8G8_UNORM
+R8G8_SNORM
+R8G8_SINT
+R8G8_UINT
+R16_UNORM
+R16_SNORM
+R16_SINT
+R16_UINT
+R16_FLOAT
+R8_UNORM
+R8_SNORM
+R8_SINT
+R8_UINT
+X1R5G5B5_UNORM
+X8B8G8R8_UNORM
+X8B8G8R8_SRGB
+"""
+
+# Parse every whitespace-separated name in `txt`, then write the generated C++ to stdout:
+# all traits structs first, followed by all switch cases.
+formats = [Format(name) for name in txt.split()]
+
+for fmt in formats:
+  fmt.print_declaration()
+
+for fmt in formats:
+  fmt.print_case()
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -27,12 +27,12 @@ struct CommandList;
 // TODO: Implement the commented ones
 enum class RenderTargetFormat : u32 {
    NONE = 0x0,
-    R32B32G32A32_FLOAT = 0xC0,
+    R32G32B32A32_FLOAT = 0xC0,
    R32G32B32A32_SINT = 0xC1,
    R32G32B32A32_UINT = 0xC2,
-    // R32G32B32X32_FLOAT = 0xC3,
-    // R32G32B32X32_SINT = 0xC4,
-    // R32G32B32X32_UINT = 0xC5,
+    R32G32B32X32_FLOAT = 0xC3,
+    R32G32B32X32_SINT = 0xC4,
+    R32G32B32X32_UINT = 0xC5,
    R16G16B16A16_UNORM = 0xC6,
    R16G16B16A16_SNORM = 0xC7,
    R16G16B16A16_SINT = 0xC8,
@@ -56,13 +56,13 @@ enum class RenderTargetFormat : u32 {
    R16G16_SINT = 0xDC,
    R16G16_UINT = 0xDD,
    R16G16_FLOAT = 0xDE,
-    // A2R10G10B10_UNORM = 0xDF,
+    A2R10G10B10_UNORM = 0xDF,
    B10G11R11_FLOAT = 0xE0,
    R32_SINT = 0xE3,
    R32_UINT = 0xE4,
    R32_FLOAT = 0xE5,
-    // X8R8G8B8_UNORM = 0xE6,
-    // X8R8G8B8_SRGB = 0xE7,
+    X8R8G8B8_UNORM = 0xE6,
+    X8R8G8B8_SRGB = 0xE7,
    R5G6B5_UNORM = 0xE8,
    A1R5G5B5_UNORM = 0xE9,
    R8G8_UNORM = 0xEA,
@@ -79,11 +79,11 @@ enum class RenderTargetFormat : u32 {
    R8_SINT = 0xF5,
    R8_UINT = 0xF6,

-    /*
-    A8_UNORM = 0xF7,
+    // A8_UNORM = 0xF7,
    X1R5G5B5_UNORM = 0xF8,
    X8B8G8R8_UNORM = 0xF9,
    X8B8G8R8_SRGB = 0xFA,
+    /*
    Z1R5G5B5_UNORM = 0xFB,
    O1R5G5B5_UNORM = 0xFC,
    Z8R8G8B8_UNORM = 0xFD,
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -466,8 +466,7 @@ bool RasterizerOpenGL::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surf
                                             const Tegra::Engines::Fermi2D::Config& copy_config) {
    MICROPROFILE_SCOPE(OpenGL_Blits);
    std::scoped_lock lock{texture_cache.mutex};
-    texture_cache.BlitImage(dst, src, copy_config);
-    return true;
+    return texture_cache.BlitImage(dst, src, copy_config);
 }

 Tegra::Engines::AccelerateDMAInterface& RasterizerOpenGL::AccessAccelerateDMA() {
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -28,6 +28,7 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> FORMAT_TAB
    {GL_RGB5_A1, GL_BGRA, GL_UNSIGNED_SHORT_1_5_5_5_REV},             // A1R5G5B5_UNORM
    {GL_RGB10_A2, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV},           // A2B10G10R10_UNORM
    {GL_RGB10_A2UI, GL_RGBA_INTEGER, GL_UNSIGNED_INT_2_10_10_10_REV}, // A2B10G10R10_UINT
+    {GL_RGB10_A2, GL_BGRA, GL_UNSIGNED_INT_2_10_10_10_REV},           // A2R10G10B10_UNORM
    {GL_RGB5_A1, GL_RGBA, GL_UNSIGNED_SHORT_1_5_5_5_REV},             // A1B5G5R5_UNORM
    {GL_RGB5_A1, GL_RGBA, GL_UNSIGNED_SHORT_5_5_5_1},                 // A5B5G5R1_UNORM
    {GL_R8, GL_RED, GL_UNSIGNED_BYTE},                                // R8_UNORM
--- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
+++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp
@@ -125,6 +125,7 @@ struct FormatTuple {
    {VK_FORMAT_A1R5G5B5_UNORM_PACK16, Attachable},              // A1R5G5B5_UNORM
    {VK_FORMAT_A2B10G10R10_UNORM_PACK32, Attachable | Storage}, // A2B10G10R10_UNORM
    {VK_FORMAT_A2B10G10R10_UINT_PACK32, Attachable | Storage},  // A2B10G10R10_UINT
+    {VK_FORMAT_A2R10G10B10_UNORM_PACK32, Attachable | Storage}, // A2R10G10B10_UNORM
    {VK_FORMAT_A1R5G5B5_UNORM_PACK16, Attachable},         // A1B5G5R5_UNORM (flipped with swizzle)
    {VK_FORMAT_R5G5B5A1_UNORM_PACK16},                     // A5B5G5R1_UNORM (specially swizzled)
    {VK_FORMAT_R8_UNORM, Attachable | Storage},            // R8_UNORM
@@ -149,7 +150,7 @@ struct FormatTuple {
    {VK_FORMAT_BC6H_UFLOAT_BLOCK},                             // BC6H_UFLOAT
    {VK_FORMAT_BC6H_SFLOAT_BLOCK},                             // BC6H_SFLOAT
    {VK_FORMAT_ASTC_4x4_UNORM_BLOCK},                          // ASTC_2D_4X4_UNORM
-    {VK_FORMAT_B8G8R8A8_UNORM, Attachable},                    // B8G8R8A8_UNORM
+    {VK_FORMAT_B8G8R8A8_UNORM, Attachable | Storage},          // B8G8R8A8_UNORM
    {VK_FORMAT_R32G32B32A32_SFLOAT, Attachable | Storage},     // R32G32B32A32_FLOAT
    {VK_FORMAT_R32G32B32A32_SINT, Attachable | Storage},       // R32G32B32A32_SINT
    {VK_FORMAT_R32G32_SFLOAT, Attachable | Storage},           // R32G32_FLOAT
@@ -159,7 +160,7 @@ struct FormatTuple {
    {VK_FORMAT_R16_UNORM, Attachable | Storage},               // R16_UNORM
    {VK_FORMAT_R16_SNORM, Attachable | Storage},               // R16_SNORM
    {VK_FORMAT_R16_UINT, Attachable | Storage},                // R16_UINT
-    {VK_FORMAT_UNDEFINED},                                     // R16_SINT
+    {VK_FORMAT_R16_SINT, Attachable | Storage},                // R16_SINT
    {VK_FORMAT_R16G16_UNORM, Attachable | Storage},            // R16G16_UNORM
    {VK_FORMAT_R16G16_SFLOAT, Attachable | Storage},           // R16G16_FLOAT
    {VK_FORMAT_R16G16_UINT, Attachable | Storage},             // R16G16_UINT
@@ -183,7 +184,7 @@ struct FormatTuple {
    {VK_FORMAT_BC2_SRGB_BLOCK},                                // BC2_SRGB
    {VK_FORMAT_BC3_SRGB_BLOCK},                                // BC3_SRGB
    {VK_FORMAT_BC7_SRGB_BLOCK},                                // BC7_SRGB
-    {VK_FORMAT_R4G4B4A4_UNORM_PACK16, Attachable},             // A4B4G4R4_UNORM
+    {VK_FORMAT_R4G4B4A4_UNORM_PACK16},                         // A4B4G4R4_UNORM
    {VK_FORMAT_R4G4_UNORM_PACK8},                              // G4R4_UNORM
    {VK_FORMAT_ASTC_4x4_SRGB_BLOCK},                           // ASTC_2D_4X4_SRGB
    {VK_FORMAT_ASTC_8x8_SRGB_BLOCK},                           // ASTC_2D_8X8_SRGB
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -542,8 +542,7 @@ bool RasterizerVulkan::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surf
                                             const Tegra::Engines::Fermi2D::Surface& dst,
                                             const Tegra::Engines::Fermi2D::Config& copy_config) {
    std::scoped_lock lock{texture_cache.mutex};
-    texture_cache.BlitImage(dst, src, copy_config);
-    return true;
+    return texture_cache.BlitImage(dst, src, copy_config);
 }

 Tegra::Engines::AccelerateDMAInterface& RasterizerVulkan::AccessAccelerateDMA() {
--- a/src/video_core/surface.cpp
+++ b/src/video_core/surface.cpp
@@ -93,11 +93,14 @@ PixelFormat PixelFormatFromDepthFormat(Tegra::DepthFormat format) {

 PixelFormat PixelFormatFromRenderTargetFormat(Tegra::RenderTargetFormat format) {
    switch (format) {
-    case Tegra::RenderTargetFormat::R32B32G32A32_FLOAT:
+    case Tegra::RenderTargetFormat::R32G32B32A32_FLOAT:
+    case Tegra::RenderTargetFormat::R32G32B32X32_FLOAT:
        return PixelFormat::R32G32B32A32_FLOAT;
    case Tegra::RenderTargetFormat::R32G32B32A32_SINT:
+    case Tegra::RenderTargetFormat::R32G32B32X32_SINT:
        return PixelFormat::R32G32B32A32_SINT;
    case Tegra::RenderTargetFormat::R32G32B32A32_UINT:
+    case Tegra::RenderTargetFormat::R32G32B32X32_UINT:
        return PixelFormat::R32G32B32A32_UINT;
    case Tegra::RenderTargetFormat::R16G16B16A16_UNORM:
        return PixelFormat::R16G16B16A16_UNORM;
@@ -118,16 +121,22 @@ PixelFormat PixelFormatFromRenderTargetFormat(Tegra::RenderTargetFormat format)
    case Tegra::RenderTargetFormat::R16G16B16X16_FLOAT:
        return PixelFormat::R16G16B16X16_FLOAT;
    case Tegra::RenderTargetFormat::A8R8G8B8_UNORM:
+    case Tegra::RenderTargetFormat::X8R8G8B8_UNORM:
        return PixelFormat::B8G8R8A8_UNORM;
    case Tegra::RenderTargetFormat::A8R8G8B8_SRGB:
+    case Tegra::RenderTargetFormat::X8R8G8B8_SRGB:
        return PixelFormat::B8G8R8A8_SRGB;
    case Tegra::RenderTargetFormat::A2B10G10R10_UNORM:
        return PixelFormat::A2B10G10R10_UNORM;
    case Tegra::RenderTargetFormat::A2B10G10R10_UINT:
        return PixelFormat::A2B10G10R10_UINT;
+    case Tegra::RenderTargetFormat::A2R10G10B10_UNORM:
+        return PixelFormat::A2R10G10B10_UNORM;
    case Tegra::RenderTargetFormat::A8B8G8R8_UNORM:
+    case Tegra::RenderTargetFormat::X8B8G8R8_UNORM:
        return PixelFormat::A8B8G8R8_UNORM;
    case Tegra::RenderTargetFormat::A8B8G8R8_SRGB:
+    case Tegra::RenderTargetFormat::X8B8G8R8_SRGB:
        return PixelFormat::A8B8G8R8_SRGB;
    case Tegra::RenderTargetFormat::A8B8G8R8_SNORM:
        return PixelFormat::A8B8G8R8_SNORM;
@@ -156,6 +165,7 @@ PixelFormat PixelFormatFromRenderTargetFormat(Tegra::RenderTargetFormat format)
    case Tegra::RenderTargetFormat::R5G6B5_UNORM:
        return PixelFormat::R5G6B5_UNORM;
    case Tegra::RenderTargetFormat::A1R5G5B5_UNORM:
+    case Tegra::RenderTargetFormat::X1R5G5B5_UNORM:
        return PixelFormat::A1R5G5B5_UNORM;
    case Tegra::RenderTargetFormat::R8G8_UNORM:
        return PixelFormat::R8G8_UNORM;
--- a/src/video_core/surface.h
+++ b/src/video_core/surface.h
@@ -23,6 +23,7 @@ enum class PixelFormat {
    A1R5G5B5_UNORM,
    A2B10G10R10_UNORM,
    A2B10G10R10_UINT,
+    A2R10G10B10_UNORM,
    A1B5G5R5_UNORM,
    A5B5G5R1_UNORM,
    R8_UNORM,
@@ -159,6 +160,7 @@ constexpr std::array<u8, MaxPixelFormat> BLOCK_WIDTH_TABLE = {{
    1,  // A1R5G5B5_UNORM
    1,  // A2B10G10R10_UNORM
    1,  // A2B10G10R10_UINT
+    1,  // A2R10G10B10_UNORM
    1,  // A1B5G5R5_UNORM
    1,  // A5B5G5R1_UNORM
    1,  // R8_UNORM
@@ -264,6 +266,7 @@ constexpr std::array<u8, MaxPixelFormat> BLOCK_HEIGHT_TABLE = {{
    1,  // A1R5G5B5_UNORM
    1,  // A2B10G10R10_UNORM
    1,  // A2B10G10R10_UINT
+    1,  // A2R10G10B10_UNORM
    1,  // A1B5G5R5_UNORM
    1,  // A5B5G5R1_UNORM
    1,  // R8_UNORM
@@ -369,6 +372,7 @@ constexpr std::array<u8, MaxPixelFormat> BITS_PER_BLOCK_TABLE = {{
    16,  // A1R5G5B5_UNORM
    32,  // A2B10G10R10_UNORM
    32,  // A2B10G10R10_UINT
+    32,  // A2R10G10B10_UNORM
    16,  // A1B5G5R5_UNORM
    16,  // A5B5G5R1_UNORM
    8,   // R8_UNORM
--- a/src/video_core/texture_cache/formatter.h
+++ b/src/video_core/texture_cache/formatter.h
@@ -35,6 +35,8 @@ struct fmt::formatter<VideoCore::Surface::PixelFormat> : fmt::formatter<fmt::str
                return "A2B10G10R10_UNORM";
            case PixelFormat::A2B10G10R10_UINT:
                return "A2B10G10R10_UINT";
+            case PixelFormat::A2R10G10B10_UNORM:
+                return "A2R10G10B10_UNORM";
            case PixelFormat::A1B5G5R5_UNORM:
                return "A1B5G5R5_UNORM";
            case PixelFormat::A5B5G5R1_UNORM:
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -506,10 +506,14 @@ void TextureCache<P>::UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t siz
 }

 template <class P>
-void TextureCache<P>::BlitImage(const Tegra::Engines::Fermi2D::Surface& dst,
+bool TextureCache<P>::BlitImage(const Tegra::Engines::Fermi2D::Surface& dst,
                                const Tegra::Engines::Fermi2D::Surface& src,
                                const Tegra::Engines::Fermi2D::Config& copy) {
-    const BlitImages images = GetBlitImages(dst, src, copy);
+    const auto result = GetBlitImages(dst, src, copy);
+    if (!result) {
+        return false;
+    }
+    const BlitImages images = *result;
    const ImageId dst_id = images.dst_id;
    const ImageId src_id = images.src_id;

@@ -596,6 +600,7 @@ void TextureCache<P>::BlitImage(const Tegra::Engines::Fermi2D::Surface& dst,
        runtime.BlitImage(dst_framebuffer, dst_view, src_view, dst_region, src_region, copy.filter,
                          copy.operation);
    }
+    return true;
 }

 template <class P>
@@ -1133,7 +1138,7 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA
 }

 template <class P>
-typename TextureCache<P>::BlitImages TextureCache<P>::GetBlitImages(
+std::optional<typename TextureCache<P>::BlitImages> TextureCache<P>::GetBlitImages(
    const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Surface& src,
    const Tegra::Engines::Fermi2D::Config& copy) {

@@ -1154,6 +1159,20 @@ typename TextureCache<P>::BlitImages TextureCache<P>::GetBlitImages(
        has_deleted_images = false;
        src_id = FindImage(src_info, src_addr, try_options);
        dst_id = FindImage(dst_info, dst_addr, try_options);
+        if (!copy.must_accelerate) {
+            do {
+                if (!src_id && !dst_id) {
+                    return std::nullopt;
+                }
+                if (src_id && True(slot_images[src_id].flags & ImageFlagBits::GpuModified)) {
+                    break;
+                }
+                if (dst_id && True(slot_images[dst_id].flags & ImageFlagBits::GpuModified)) {
+                    break;
+                }
+                return std::nullopt;
+            } while (false);
+        }
        const ImageBase* const src_image = src_id ? &slot_images[src_id] : nullptr;
        if (src_image && src_image->info.num_samples > 1) {
            RelaxedOptions find_options{FIND_OPTIONS | RelaxedOptions::ForceBrokenViews};
@@ -1194,12 +1213,12 @@ typename TextureCache<P>::BlitImages TextureCache<P>::GetBlitImages(
            dst_id = FindOrInsertImage(dst_info, dst_addr, RelaxedOptions{});
        } while (has_deleted_images);
    }
-    return BlitImages{
+    return {BlitImages{
        .dst_id = dst_id,
        .src_id = src_id,
        .dst_format = dst_info.format,
        .src_format = src_info.format,
-    };
+    }};
 }

 template <class P>
--- a/src/video_core/texture_cache/texture_cache_base.h
+++ b/src/video_core/texture_cache/texture_cache_base.h
@@ -174,7 +174,7 @@ public:
    void UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t size);

    /// Blit an image with the given parameters
-    void BlitImage(const Tegra::Engines::Fermi2D::Surface& dst,
+    bool BlitImage(const Tegra::Engines::Fermi2D::Surface& dst,
                   const Tegra::Engines::Fermi2D::Surface& src,
                   const Tegra::Engines::Fermi2D::Config& copy);

@@ -285,9 +285,9 @@ private:
    [[nodiscard]] ImageId JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VAddr cpu_addr);

    /// Return a blit image pair from the given guest blit parameters
-    [[nodiscard]] BlitImages GetBlitImages(const Tegra::Engines::Fermi2D::Surface& dst,
-                                           const Tegra::Engines::Fermi2D::Surface& src,
-                                           const Tegra::Engines::Fermi2D::Config& copy);
+    [[nodiscard]] std::optional<BlitImages> GetBlitImages(
+        const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Surface& src,
+        const Tegra::Engines::Fermi2D::Config& copy);

    /// Find or create a sampler from a guest descriptor sampler
    [[nodiscard]] SamplerId FindSampler(const TSCEntry& config);
--- a/src/yuzu/main.ui
+++ b/src/yuzu/main.ui
@@ -231,6 +231,9 @@
   <property name="text">
    <string>Con&amp;figure...</string>
   </property>
+   <property name="menuRole">
+    <enum>QAction::PreferencesRole</enum>
+   </property>
  </action>
  <action name="action_Display_Dock_Widget_Headers">
   <property name="checkable">
@@ -363,6 +366,9 @@
   <property name="text">
    <string>&amp;Configure TAS...</string>
   </property>
+   <property name="menuRole">
+    <enum>QAction::NoRole</enum>
+   </property>
  </action>
  <action name="action_Configure_Current_Game">
   <property name="enabled">
@@ -371,6 +377,9 @@
   <property name="text">
    <string>Configure C&amp;urrent Game...</string>
   </property>
+   <property name="menuRole">
+    <enum>QAction::NoRole</enum>
+   </property>
  </action>
  <action name="action_TAS_Start">
   <property name="enabled">
Author	SHA1	Message	Date
Vedarius TopBAE1 Vincent A. Russell	a9ff3a232b	Create python-publish.yml	2022-11-26 18:25:50 -06:00
bunnei	eabe45346f	Merge pull request #9318 from goldenx86/glsl-ftw Replace GLSL as the default OpenGL shader backend	2022-11-26 15:57:37 -08:00
Matías Locatti	701ca96827	Oops	2022-11-26 17:39:43 -03:00
Matías Locatti	26211ac339	Replace GLSL as the default OpenGL shader backend GLASM is not very compatible with the latest games, and too many people have the special superpower to break their Vulkan support.	2022-11-26 17:27:04 -03:00
liamwhite	3e53d8138c	Merge pull request #9288 from vonchenplus/deferred_draw video_core: Fine tune maxwell drawing trigger mechanism	2022-11-26 09:35:45 -05:00
liamwhite	ddca512f3f	Merge pull request #9307 from Morph1984/not-used-correctly maxwell_to_vk: Fix format usage bits and add R16_SINT	2022-11-26 09:08:55 -05:00
liamwhite	e16d1b85f1	Merge pull request #9297 from Kelebek1/sink_oob [audio_core] Fix an OoB with sample sinking	2022-11-25 12:53:29 -05:00
bunnei	2572b0a5ea	Merge pull request #9302 from liamwhite/why-are-we-still-using-ado externals: always use LibreSSL on Windows	2022-11-25 00:39:16 -08:00
bunnei	e8cbc3b4c5	Merge pull request #9304 from liamwhite/menu-roll Qt: assign menuRole properties for actions	2022-11-25 00:38:50 -08:00
bunnei	64965cc658	Merge pull request #9305 from lioncash/request hle_ipc: Add helper function for determining element counts	2022-11-25 00:38:17 -08:00
liamwhite	20b62dbd30	Merge pull request #9194 from FernandoS27/yfc-fermi2d YFC - Fermi2D: Rework blit engine and add a software blitter.	2022-11-24 21:48:41 -05:00
Morph	9d081a8729	Merge pull request #9312 from FernandoS27/pokemomma GPU: Fix buffer cache issue, engine upload not inlining memory in multiple lines, etc	2022-11-24 18:24:07 -05:00
Fernando Sahmkow	826e0785bf	Fermi2D: Cleanup and address feedback.	2022-11-24 21:00:48 +01:00
Fernando Sahmkow	3b582d5fb2	GPU: Fix buffer cache issue, engine upload not inlining memory in multiline and pessismistic invalidation.	2022-11-24 20:57:16 +01:00
Fernando Sahmkow	7356ab1de6	GPU: Implement additional render target formats.	2022-11-24 20:35:44 +01:00
Fernando Sahmkow	daf2ef8f1c	MaxwellDMA: Implement BlockLinear to BlockLinear copies.	2022-11-24 20:35:44 +01:00
Fernando Sahmkow	5fbd6954ef	Fermi2D: Implement Bilinear software filtering and address feedback.	2022-11-24 20:35:44 +01:00
Fernando Sahmkow	957840be91	Fermi2D: Rework blit engine and add a software blitter.	2022-11-24 20:35:44 +01:00
Morph	852de7a771	maxwell_to_vk: Add R16_SINT This was somehow missed when the format was added to GL	2022-11-23 21:30:58 -05:00
Morph	ca154d466a	maxwell_to_vk: Fix format usage bits - VK_FORMAT_B8G8R8A8_UNORM supports the STORAGE_IMAGE_BIT - VK_FORMAT_R4G4B4A4_UNORM_PACK16 does not support the COLOR_ATTACHMENT_BIT	2022-11-23 21:29:43 -05:00
Liam	9abceaed61	Qt: assign menuRole properties for actions	2022-11-23 12:41:56 -05:00
Liam	cdb2e4eaff	externals: always use LibreSSL on Windows	2022-11-23 10:24:25 -05:00
Kelebek1	84d4da89a5	Use the maximum input index for samples buffer span size, not just the input count	2022-11-22 15:32:11 +00:00
FengChen	1d57851fc7	video_core: Optimize maxwell drawing trigger mechanism	2022-11-22 17:53:26 +08:00