Compare commits

..

24 Commits

Author SHA1 Message Date
Vedarius TopBAE1 Vincent A. Russell
a9ff3a232b Create python-publish.yml 2022-11-26 18:25:50 -06:00
bunnei
eabe45346f Merge pull request #9318 from goldenx86/glsl-ftw
Replace GLSL as the default OpenGL shader backend
2022-11-26 15:57:37 -08:00
Matías Locatti
701ca96827 Oops 2022-11-26 17:39:43 -03:00
Matías Locatti
26211ac339 Replace GLSL as the default OpenGL shader backend
GLASM is not very compatible with the latest games, and too many people have the special superpower to break their Vulkan support.
2022-11-26 17:27:04 -03:00
liamwhite
3e53d8138c Merge pull request #9288 from vonchenplus/deferred_draw
video_core: Fine tune maxwell drawing trigger mechanism
2022-11-26 09:35:45 -05:00
liamwhite
ddca512f3f Merge pull request #9307 from Morph1984/not-used-correctly
maxwell_to_vk: Fix format usage bits and add R16_SINT
2022-11-26 09:08:55 -05:00
liamwhite
e16d1b85f1 Merge pull request #9297 from Kelebek1/sink_oob
[audio_core] Fix an OoB with sample sinking
2022-11-25 12:53:29 -05:00
bunnei
2572b0a5ea Merge pull request #9302 from liamwhite/why-are-we-still-using-ado
externals: always use LibreSSL on Windows
2022-11-25 00:39:16 -08:00
bunnei
e8cbc3b4c5 Merge pull request #9304 from liamwhite/menu-roll
Qt: assign menuRole properties for actions
2022-11-25 00:38:50 -08:00
bunnei
64965cc658 Merge pull request #9305 from lioncash/request
hle_ipc: Add helper function for determining element counts
2022-11-25 00:38:17 -08:00
liamwhite
20b62dbd30 Merge pull request #9194 from FernandoS27/yfc-fermi2d
YFC - Fermi2D: Rework blit engine and add a software blitter.
2022-11-24 21:48:41 -05:00
Morph
9d081a8729 Merge pull request #9312 from FernandoS27/pokemomma
GPU: Fix buffer cache issue, engine upload not inlining memory in multiple lines, etc
2022-11-24 18:24:07 -05:00
Fernando Sahmkow
826e0785bf Fermi2D: Cleanup and address feedback. 2022-11-24 21:00:48 +01:00
Fernando Sahmkow
3b582d5fb2 GPU: Fix buffer cache issue, engine upload not inlining memory in multiline and pessismistic invalidation. 2022-11-24 20:57:16 +01:00
Fernando Sahmkow
7356ab1de6 GPU: Implement additional render target formats. 2022-11-24 20:35:44 +01:00
Fernando Sahmkow
daf2ef8f1c MaxwellDMA: Implement BlockLinear to BlockLinear copies. 2022-11-24 20:35:44 +01:00
Fernando Sahmkow
5fbd6954ef Fermi2D: Implement Bilinear software filtering and address feedback. 2022-11-24 20:35:44 +01:00
Fernando Sahmkow
957840be91 Fermi2D: Rework blit engine and add a software blitter. 2022-11-24 20:35:44 +01:00
Morph
852de7a771 maxwell_to_vk: Add R16_SINT
This was somehow missed when the format was added to GL
2022-11-23 21:30:58 -05:00
Morph
ca154d466a maxwell_to_vk: Fix format usage bits
- VK_FORMAT_B8G8R8A8_UNORM supports the STORAGE_IMAGE_BIT
- VK_FORMAT_R4G4B4A4_UNORM_PACK16 does not support the COLOR_ATTACHMENT_BIT
2022-11-23 21:29:43 -05:00
Liam
9abceaed61 Qt: assign menuRole properties for actions 2022-11-23 12:41:56 -05:00
Liam
cdb2e4eaff externals: always use LibreSSL on Windows 2022-11-23 10:24:25 -05:00
Kelebek1
84d4da89a5 Use the maximum input index for samples buffer span size, not just the input count 2022-11-22 15:32:11 +00:00
FengChen
1d57851fc7 video_core: Optimize maxwell drawing trigger mechanism 2022-11-22 17:53:26 +08:00
31 changed files with 1972 additions and 121 deletions

39
.github/workflows/python-publish.yml vendored Normal file
View File

@@ -0,0 +1,39 @@
# This workflow will upload a Python Package using Twine when a release is created
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.
name: Upload Python Package
on:
release:
types: [published]
permissions:
contents: read
jobs:
deploy:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v3
with:
python-version: '3.x'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install build
- name: Build package
run: python -m build
- name: Publish package
uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
with:
user: __token__
password: ${{ secrets.PYPI_API_TOKEN }}

View File

@@ -92,10 +92,14 @@ endif()
add_subdirectory(sirit)
if (ENABLE_WEB_SERVICE)
find_package(OpenSSL 1.1)
if (OPENSSL_FOUND)
set(OPENSSL_LIBRARIES OpenSSL::SSL OpenSSL::Crypto)
else()
if (NOT WIN32)
find_package(OpenSSL 1.1)
if (OPENSSL_FOUND)
set(OPENSSL_LIBRARIES OpenSSL::SSL OpenSSL::Crypto)
endif()
endif()
if (WIN32 OR NOT OPENSSL_FOUND)
# LibreSSL
set(LIBRESSL_SKIP_INSTALL ON CACHE BOOL "")
set(OPENSSLDIR "/etc/ssl/")

View File

@@ -460,21 +460,23 @@ void CommandBuffer::GenerateDeviceSinkCommand(const s32 node_id, const s16 buffe
cmd.session_id = session_id;
cmd.input_count = parameter.input_count;
s16 max_input{0};
for (u32 i = 0; i < parameter.input_count; i++) {
cmd.inputs[i] = buffer_offset + parameter.inputs[i];
max_input = std::max(max_input, cmd.inputs[i]);
}
if (state.upsampler_info != nullptr) {
const auto size_{state.upsampler_info->sample_count * parameter.input_count};
const auto size_bytes{size_ * sizeof(s32)};
const auto addr{memory_pool->Translate(state.upsampler_info->samples_pos, size_bytes)};
cmd.sample_buffer = {reinterpret_cast<s32*>(addr),
parameter.input_count * state.upsampler_info->sample_count};
(max_input + 1) * state.upsampler_info->sample_count};
} else {
cmd.sample_buffer = samples_buffer;
}
cmd.input_count = parameter.input_count;
for (u32 i = 0; i < parameter.input_count; i++) {
cmd.inputs[i] = buffer_offset + parameter.inputs[i];
}
GenerateEnd<DeviceSinkCommand>(cmd);
}

View File

@@ -442,7 +442,7 @@ struct Values {
SwitchableSetting<NvdecEmulation> nvdec_emulation{NvdecEmulation::GPU, "nvdec_emulation"};
SwitchableSetting<bool> accelerate_astc{true, "accelerate_astc"};
SwitchableSetting<bool> use_vsync{true, "use_vsync"};
SwitchableSetting<ShaderBackend, true> shader_backend{ShaderBackend::GLASM, ShaderBackend::GLSL,
SwitchableSetting<ShaderBackend, true> shader_backend{ShaderBackend::GLSL, ShaderBackend::GLSL,
ShaderBackend::SPIRV, "shader_backend"};
SwitchableSetting<bool> use_asynchronous_shaders{false, "use_asynchronous_shaders"};
SwitchableSetting<bool> use_fast_gpu_time{true, "use_fast_gpu_time"};

View File

@@ -28,6 +28,10 @@ add_library(video_core STATIC
dirty_flags.h
dma_pusher.cpp
dma_pusher.h
engines/sw_blitter/blitter.cpp
engines/sw_blitter/blitter.h
engines/sw_blitter/converter.cpp
engines/sw_blitter/converter.h
engines/const_buffer_info.h
engines/engine_interface.h
engines/engine_upload.cpp

View File

@@ -1742,12 +1742,12 @@ bool BufferCache<P>::InlineMemory(VAddr dest_address, size_t copy_size,
SynchronizeBuffer(buffer, dest_address, static_cast<u32>(copy_size));
if constexpr (USE_MEMORY_MAPS) {
auto upload_staging = runtime.UploadStagingBuffer(copy_size);
std::array copies{BufferCopy{
.src_offset = 0,
.src_offset = upload_staging.offset,
.dst_offset = buffer.Offset(dest_address),
.size = copy_size,
}};
auto upload_staging = runtime.UploadStagingBuffer(copy_size);
u8* const src_pointer = upload_staging.mapped_span.data();
std::memcpy(src_pointer, inlined_buffer.data(), copy_size);
runtime.CopyBuffer(buffer, upload_staging.buffer, copies);

View File

@@ -20,7 +20,7 @@ void ChannelState::Init(Core::System& system, GPU& gpu) {
ASSERT(memory_manager);
dma_pusher = std::make_unique<Tegra::DmaPusher>(system, gpu, *memory_manager, *this);
maxwell_3d = std::make_unique<Engines::Maxwell3D>(system, *memory_manager);
fermi_2d = std::make_unique<Engines::Fermi2D>();
fermi_2d = std::make_unique<Engines::Fermi2D>(*memory_manager);
kepler_compute = std::make_unique<Engines::KeplerCompute>(system, *memory_manager);
maxwell_dma = std::make_unique<Engines::MaxwellDMA>(system, *memory_manager);
kepler_memory = std::make_unique<Engines::KeplerMemory>(system, *memory_manager);

View File

@@ -51,11 +51,11 @@ void State::ProcessData(std::span<const u8> read_buffer) {
} else {
for (u32 line = 0; line < regs.line_count; ++line) {
const GPUVAddr dest_line = address + static_cast<size_t>(line) * regs.dest.pitch;
memory_manager.WriteBlockUnsafe(
dest_line, read_buffer.data() + static_cast<size_t>(line) * regs.line_length_in,
regs.line_length_in);
std::span<const u8> buffer(read_buffer.data() +
static_cast<size_t>(line) * regs.line_length_in,
regs.line_length_in);
rasterizer->AccelerateInlineToMemory(dest_line, regs.line_length_in, buffer);
}
memory_manager.InvalidateRegion(address, regs.dest.pitch * regs.line_count);
}
} else {
u32 width = regs.dest.width;

View File

@@ -3,17 +3,25 @@
#include "common/assert.h"
#include "common/logging/log.h"
#include "common/microprofile.h"
#include "video_core/engines/fermi_2d.h"
#include "video_core/memory_manager.h"
#include "video_core/engines/sw_blitter/blitter.h"
#include "video_core/rasterizer_interface.h"
#include "video_core/surface.h"
#include "video_core/textures/decoders.h"
MICROPROFILE_DECLARE(GPU_BlitEngine);
MICROPROFILE_DEFINE(GPU_BlitEngine, "GPU", "Blit Engine", MP_RGB(224, 224, 128));
using VideoCore::Surface::BytesPerBlock;
using VideoCore::Surface::PixelFormatFromRenderTargetFormat;
namespace Tegra::Engines {
Fermi2D::Fermi2D() {
using namespace Texture;
Fermi2D::Fermi2D(MemoryManager& memory_manager_) {
sw_blitter = std::make_unique<Blitter::SoftwareBlitEngine>(memory_manager_);
// Nvidia's OpenGL driver seems to assume these values
regs.src.depth = 1;
regs.dst.depth = 1;
@@ -42,6 +50,7 @@ void Fermi2D::CallMultiMethod(u32 method, const u32* base_start, u32 amount, u32
}
void Fermi2D::Blit() {
MICROPROFILE_SCOPE(GPU_BlitEngine);
LOG_DEBUG(HW_GPU, "called. source address=0x{:x}, destination address=0x{:x}",
regs.src.Address(), regs.dst.Address());
@@ -52,9 +61,16 @@ void Fermi2D::Blit() {
UNIMPLEMENTED_IF_MSG(regs.clip_enable != 0, "Clipped blit enabled");
const auto& args = regs.pixels_from_memory;
constexpr s64 null_derivate = 1ULL << 32;
Surface src = regs.src;
const auto bytes_per_pixel = BytesPerBlock(PixelFormatFromRenderTargetFormat(src.format));
const bool delegate_to_gpu = src.width > 512 && src.height > 512 && bytes_per_pixel <= 8 &&
src.format != regs.dst.format;
Config config{
.operation = regs.operation,
.filter = args.sample_mode.filter,
.must_accelerate =
args.du_dx != null_derivate || args.dv_dy != null_derivate || delegate_to_gpu,
.dst_x0 = args.dst_x0,
.dst_y0 = args.dst_y0,
.dst_x1 = args.dst_x0 + args.dst_width,
@@ -64,8 +80,7 @@ void Fermi2D::Blit() {
.src_x1 = static_cast<s32>((args.du_dx * args.dst_width + args.src_x0) >> 32),
.src_y1 = static_cast<s32>((args.dv_dy * args.dst_height + args.src_y0) >> 32),
};
Surface src = regs.src;
const auto bytes_per_pixel = BytesPerBlock(PixelFormatFromRenderTargetFormat(src.format));
const auto need_align_to_pitch =
src.linear == Tegra::Engines::Fermi2D::MemoryLayout::Pitch &&
static_cast<s32>(src.width) == config.src_x1 &&
@@ -78,8 +93,9 @@ void Fermi2D::Blit() {
config.src_x1 -= config.src_x0;
config.src_x0 = 0;
}
if (!rasterizer->AccelerateSurfaceCopy(src, regs.dst, config)) {
UNIMPLEMENTED();
sw_blitter->Blit(src, regs.dst, config);
}
}

View File

@@ -5,6 +5,7 @@
#include <array>
#include <cstddef>
#include <memory>
#include "common/bit_field.h"
#include "common/common_funcs.h"
#include "common/common_types.h"
@@ -21,6 +22,10 @@ class RasterizerInterface;
namespace Tegra::Engines {
namespace Blitter {
class SoftwareBlitEngine;
}
/**
* This Engine is known as G80_2D. Documentation can be found in:
* https://github.com/envytools/envytools/blob/master/rnndb/graph/g80_2d.xml
@@ -32,7 +37,7 @@ namespace Tegra::Engines {
class Fermi2D final : public EngineInterface {
public:
explicit Fermi2D();
explicit Fermi2D(MemoryManager& memory_manager_);
~Fermi2D() override;
/// Binds a rasterizer to this engine.
@@ -286,6 +291,7 @@ public:
struct Config {
Operation operation;
Filter filter;
bool must_accelerate;
s32 dst_x0;
s32 dst_y0;
s32 dst_x1;
@@ -298,6 +304,7 @@ public:
private:
VideoCore::RasterizerInterface* rasterizer = nullptr;
std::unique_ptr<Blitter::SoftwareBlitEngine> sw_blitter;
/// Performs the copy from the source surface to the destination surface as configured in the
/// registers.

View File

@@ -126,6 +126,7 @@ void Maxwell3D::InitializeRegisterDefaults() {
draw_command[MAXWELL3D_REG_INDEX(draw_inline_index)] = true;
draw_command[MAXWELL3D_REG_INDEX(inline_index_2x16.even)] = true;
draw_command[MAXWELL3D_REG_INDEX(inline_index_4x8.index0)] = true;
draw_command[MAXWELL3D_REG_INDEX(draw.instance_id)] = true;
}
void Maxwell3D::ProcessMacro(u32 method, const u32* base_start, u32 amount, bool is_last_call) {
@@ -249,9 +250,6 @@ void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argume
return;
case MAXWELL3D_REG_INDEX(fragment_barrier):
return rasterizer->FragmentBarrier();
case MAXWELL3D_REG_INDEX(invalidate_texture_data_cache):
rasterizer->InvalidateGPUCache();
return rasterizer->WaitForIdle();
case MAXWELL3D_REG_INDEX(tiled_cache_barrier):
return rasterizer->TiledCacheBarrier();
}
@@ -288,31 +286,58 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
ASSERT_MSG(method < Regs::NUM_REGS,
"Invalid Maxwell3D register, increase the size of the Regs structure");
const u32 argument = ProcessShadowRam(method, method_argument);
ProcessDirtyRegisters(method, argument);
if (draw_command[method]) {
regs.reg_array[method] = method_argument;
deferred_draw_method.push_back(method);
auto u32_to_u8 = [&](const u32 argument) {
inline_index_draw_indexes.push_back(static_cast<u8>(argument & 0x000000ff));
inline_index_draw_indexes.push_back(static_cast<u8>((argument & 0x0000ff00) >> 8));
inline_index_draw_indexes.push_back(static_cast<u8>((argument & 0x00ff0000) >> 16));
inline_index_draw_indexes.push_back(static_cast<u8>((argument & 0xff000000) >> 24));
auto update_inline_index = [&](const u32 index) {
inline_index_draw_indexes.push_back(static_cast<u8>(index & 0x000000ff));
inline_index_draw_indexes.push_back(static_cast<u8>((index & 0x0000ff00) >> 8));
inline_index_draw_indexes.push_back(static_cast<u8>((index & 0x00ff0000) >> 16));
inline_index_draw_indexes.push_back(static_cast<u8>((index & 0xff000000) >> 24));
draw_mode = DrawMode::InlineIndex;
};
if (MAXWELL3D_REG_INDEX(draw_inline_index) == method) {
u32_to_u8(method_argument);
} else if (MAXWELL3D_REG_INDEX(inline_index_2x16.even) == method) {
u32_to_u8(regs.inline_index_2x16.even);
u32_to_u8(regs.inline_index_2x16.odd);
} else if (MAXWELL3D_REG_INDEX(inline_index_4x8.index0) == method) {
u32_to_u8(regs.inline_index_4x8.index0);
u32_to_u8(regs.inline_index_4x8.index1);
u32_to_u8(regs.inline_index_4x8.index2);
u32_to_u8(regs.inline_index_4x8.index3);
switch (method) {
case MAXWELL3D_REG_INDEX(draw.end):
switch (draw_mode) {
case DrawMode::General:
ProcessDraw(1);
break;
case DrawMode::InlineIndex:
regs.index_buffer.count = static_cast<u32>(inline_index_draw_indexes.size() / 4);
regs.index_buffer.format = Regs::IndexFormat::UnsignedInt;
ProcessDraw(1);
inline_index_draw_indexes.clear();
break;
case DrawMode::Instance:
break;
}
break;
case MAXWELL3D_REG_INDEX(draw_inline_index):
update_inline_index(method_argument);
break;
case MAXWELL3D_REG_INDEX(inline_index_2x16.even):
update_inline_index(regs.inline_index_2x16.even);
update_inline_index(regs.inline_index_2x16.odd);
break;
case MAXWELL3D_REG_INDEX(inline_index_4x8.index0):
update_inline_index(regs.inline_index_4x8.index0);
update_inline_index(regs.inline_index_4x8.index1);
update_inline_index(regs.inline_index_4x8.index2);
update_inline_index(regs.inline_index_4x8.index3);
break;
case MAXWELL3D_REG_INDEX(draw.instance_id):
draw_mode =
(regs.draw.instance_id == Maxwell3D::Regs::Draw::InstanceId::Subsequent) ||
(regs.draw.instance_id == Maxwell3D::Regs::Draw::InstanceId::Unchanged)
? DrawMode::Instance
: DrawMode::General;
break;
}
} else {
ProcessDeferredDraw();
const u32 argument = ProcessShadowRam(method, method_argument);
ProcessDirtyRegisters(method, argument);
ProcessMethodCall(method, argument, method_argument, is_last_call);
}
}
@@ -511,10 +536,7 @@ void Maxwell3D::ProcessCounterReset() {
void Maxwell3D::ProcessSyncPoint() {
const u32 sync_point = regs.sync_info.sync_point.Value();
const u32 cache_flush = regs.sync_info.clean_l2.Value();
if (cache_flush != 0) {
rasterizer->InvalidateGPUCache();
}
[[maybe_unused]] const u32 cache_flush = regs.sync_info.clean_l2.Value();
rasterizer->SignalSyncPoint(sync_point);
}
@@ -626,57 +648,27 @@ void Maxwell3D::ProcessDraw(u32 instance_count) {
}
void Maxwell3D::ProcessDeferredDraw() {
if (deferred_draw_method.empty()) {
if (draw_mode != DrawMode::Instance || deferred_draw_method.empty()) {
return;
}
enum class DrawMode {
Undefined,
General,
Instance,
};
DrawMode draw_mode{DrawMode::Undefined};
u32 method_count = static_cast<u32>(deferred_draw_method.size());
u32 method = deferred_draw_method[method_count - 1];
if (MAXWELL3D_REG_INDEX(draw.end) != method) {
return;
}
draw_mode = (regs.draw.instance_id == Maxwell3D::Regs::Draw::InstanceId::Subsequent) ||
(regs.draw.instance_id == Maxwell3D::Regs::Draw::InstanceId::Unchanged)
? DrawMode::Instance
: DrawMode::General;
u32 instance_count = 0;
if (draw_mode == DrawMode::Instance) {
u32 vertex_buffer_count = 0;
u32 index_buffer_count = 0;
for (u32 index = 0; index < method_count; ++index) {
method = deferred_draw_method[index];
if (method == MAXWELL3D_REG_INDEX(vertex_buffer.count)) {
instance_count = ++vertex_buffer_count;
} else if (method == MAXWELL3D_REG_INDEX(index_buffer.count)) {
instance_count = ++index_buffer_count;
}
}
ASSERT_MSG(!(vertex_buffer_count && index_buffer_count),
"Instance both indexed and direct?");
} else {
instance_count = 1;
for (u32 index = 0; index < method_count; ++index) {
method = deferred_draw_method[index];
if (MAXWELL3D_REG_INDEX(draw_inline_index) == method ||
MAXWELL3D_REG_INDEX(inline_index_2x16.even) == method ||
MAXWELL3D_REG_INDEX(inline_index_4x8.index0) == method) {
regs.index_buffer.count = static_cast<u32>(inline_index_draw_indexes.size() / 4);
regs.index_buffer.format = Regs::IndexFormat::UnsignedInt;
break;
}
u32 instance_count = 1;
u32 vertex_buffer_count = 0;
u32 index_buffer_count = 0;
for (u32 index = 0; index < method_count; ++index) {
u32 method = deferred_draw_method[index];
if (method == MAXWELL3D_REG_INDEX(vertex_buffer.count)) {
instance_count = ++vertex_buffer_count;
} else if (method == MAXWELL3D_REG_INDEX(index_buffer.count)) {
instance_count = ++index_buffer_count;
}
}
ASSERT_MSG(!(vertex_buffer_count && index_buffer_count), "Instance both indexed and direct?");
ProcessDraw(instance_count);
deferred_draw_method.clear();
inline_index_draw_indexes.clear();
}
} // namespace Tegra::Engines

View File

@@ -3148,10 +3148,12 @@ private:
/// Handles use of topology overrides (e.g., to avoid using a topology assigned from a macro)
void ProcessTopologyOverride();
void ProcessDraw(u32 instance_count = 1);
/// Handles deferred draw(e.g., instance draw).
void ProcessDeferredDraw();
/// Handles a draw.
void ProcessDraw(u32 instance_count = 1);
/// Returns a query's value or an empty object if the value will be deferred through a cache.
std::optional<u64> GetQueryResult();
@@ -3178,6 +3180,8 @@ private:
std::array<bool, Regs::NUM_REGS> draw_command{};
std::vector<u32> deferred_draw_method;
enum class DrawMode : u32 { General = 0, Instance, InlineIndex };
DrawMode draw_mode{DrawMode::General};
};
#define ASSERT_REG_POSITION(field_name, position) \

View File

@@ -62,7 +62,8 @@ void MaxwellDMA::Launch() {
if (!is_src_pitch && !is_dst_pitch) {
// If both the source and the destination are in block layout, assert.
UNIMPLEMENTED_MSG("Tiled->Tiled DMA transfers are not yet implemented");
CopyBlockLinearToBlockLinear();
ReleaseSemaphore();
return;
}
@@ -291,6 +292,70 @@ void MaxwellDMA::FastCopyBlockLinearToPitch() {
memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
}
void MaxwellDMA::CopyBlockLinearToBlockLinear() {
UNIMPLEMENTED_IF(regs.src_params.block_size.width != 0);
const bool is_remapping = regs.launch_dma.remap_enable != 0;
// Deswizzle the input and copy it over.
const Parameters& src = regs.src_params;
const Parameters& dst = regs.dst_params;
const u32 num_remap_components = regs.remap_const.num_dst_components_minus_one + 1;
const u32 remap_components_size = regs.remap_const.component_size_minus_one + 1;
const u32 base_bpp = !is_remapping ? 1U : num_remap_components * remap_components_size;
u32 src_width = src.width;
u32 dst_width = dst.width;
u32 x_elements = regs.line_length_in;
u32 src_x_offset = src.origin.x;
u32 dst_x_offset = dst.origin.x;
u32 bpp_shift = 0U;
if (!is_remapping) {
bpp_shift = Common::FoldRight(
4U, [](u32 x, u32 y) { return std::min(x, static_cast<u32>(std::countr_zero(y))); },
src_width, dst_width, x_elements, src_x_offset, dst_x_offset,
static_cast<u32>(regs.offset_in), static_cast<u32>(regs.offset_out));
src_width >>= bpp_shift;
dst_width >>= bpp_shift;
x_elements >>= bpp_shift;
src_x_offset >>= bpp_shift;
dst_x_offset >>= bpp_shift;
}
const u32 bytes_per_pixel = base_bpp << bpp_shift;
const size_t src_size = CalculateSize(true, bytes_per_pixel, src_width, src.height, src.depth,
src.block_size.height, src.block_size.depth);
const size_t dst_size = CalculateSize(true, bytes_per_pixel, dst_width, dst.height, dst.depth,
dst.block_size.height, dst.block_size.depth);
const u32 pitch = x_elements * bytes_per_pixel;
const size_t mid_buffer_size = pitch * regs.line_count;
if (read_buffer.size() < src_size) {
read_buffer.resize(src_size);
}
if (write_buffer.size() < dst_size) {
write_buffer.resize(dst_size);
}
intermediate_buffer.resize(mid_buffer_size);
memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size);
memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size);
UnswizzleSubrect(intermediate_buffer, read_buffer, bytes_per_pixel, src_width, src.height,
src.depth, src_x_offset, src.origin.y, x_elements, regs.line_count,
src.block_size.height, src.block_size.depth, pitch);
SwizzleSubrect(write_buffer, intermediate_buffer, bytes_per_pixel, dst_width, dst.height,
dst.depth, dst_x_offset, dst.origin.y, x_elements, regs.line_count,
dst.block_size.height, dst.block_size.depth, pitch);
memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
}
void MaxwellDMA::ReleaseSemaphore() {
const auto type = regs.launch_dma.semaphore_type;
const GPUVAddr address = regs.semaphore.address;

View File

@@ -223,6 +223,8 @@ private:
void CopyPitchToBlockLinear();
void CopyBlockLinearToBlockLinear();
void FastCopyBlockLinearToPitch();
void ReleaseSemaphore();
@@ -234,6 +236,7 @@ private:
std::vector<u8> read_buffer;
std::vector<u8> write_buffer;
std::vector<u8> intermediate_buffer;
static constexpr std::size_t NUM_REGS = 0x800;
struct Regs {

View File

@@ -118,7 +118,7 @@ void Puller::ProcessSemaphoreRelease() {
std::function<void()> operation([this, sequence_address, payload] {
memory_manager.Write<u32>(sequence_address, payload);
});
rasterizer->SyncOperation(std::move(operation));
rasterizer->SignalFence(std::move(operation));
}
void Puller::ProcessSemaphoreAcquire() {
@@ -151,8 +151,8 @@ void Puller::CallPullerMethod(const MethodCall& method_call) {
case BufferMethods::SemaphoreAddressLow:
case BufferMethods::SemaphoreSequencePayload:
case BufferMethods::SyncpointPayload:
break;
case BufferMethods::WrcacheFlush:
break;
case BufferMethods::RefCnt:
rasterizer->SignalReference();
break;

View File

@@ -0,0 +1,238 @@
// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
#include <algorithm>
#include <cmath>
#include <vector>
#include "video_core/engines/sw_blitter/blitter.h"
#include "video_core/engines/sw_blitter/converter.h"
#include "video_core/memory_manager.h"
#include "video_core/surface.h"
#include "video_core/textures/decoders.h"
namespace Tegra {
class MemoryManager;
}
using VideoCore::Surface::BytesPerBlock;
using VideoCore::Surface::PixelFormatFromRenderTargetFormat;
namespace Tegra::Engines::Blitter {
using namespace Texture;
namespace {
constexpr size_t ir_components = 4;
void NearestNeighbor(std::span<const u8> input, std::span<u8> output, u32 src_width, u32 src_height,
u32 dst_width, u32 dst_height, size_t bpp) {
const size_t dx_du = std::llround((static_cast<f64>(src_width) / dst_width) * (1ULL << 32));
const size_t dy_dv = std::llround((static_cast<f64>(src_height) / dst_height) * (1ULL << 32));
size_t src_y = 0;
for (u32 y = 0; y < dst_height; y++) {
size_t src_x = 0;
for (u32 x = 0; x < dst_width; x++) {
const size_t read_from = ((src_y * src_width + src_x) >> 32) * bpp;
const size_t write_to = (y * dst_width + x) * bpp;
std::memcpy(&output[write_to], &input[read_from], bpp);
src_x += dx_du;
}
src_y += dy_dv;
}
}
void NearestNeighborFast(std::span<const f32> input, std::span<f32> output, u32 src_width,
u32 src_height, u32 dst_width, u32 dst_height) {
const size_t dx_du = std::llround((static_cast<f64>(src_width) / dst_width) * (1ULL << 32));
const size_t dy_dv = std::llround((static_cast<f64>(src_height) / dst_height) * (1ULL << 32));
size_t src_y = 0;
for (u32 y = 0; y < dst_height; y++) {
size_t src_x = 0;
for (u32 x = 0; x < dst_width; x++) {
const size_t read_from = ((src_y * src_width + src_x) >> 32) * ir_components;
const size_t write_to = (y * dst_width + x) * ir_components;
std::memcpy(&output[write_to], &input[read_from], sizeof(f32) * ir_components);
src_x += dx_du;
}
src_y += dy_dv;
}
}
void Bilinear(std::span<const f32> input, std::span<f32> output, size_t src_width,
size_t src_height, size_t dst_width, size_t dst_height) {
const auto bilinear_sample = [](std::span<const f32> x0_y0, std::span<const f32> x1_y0,
std::span<const f32> x0_y1, std::span<const f32> x1_y1,
f32 weight_x, f32 weight_y) {
std::array<f32, ir_components> result{};
for (size_t i = 0; i < ir_components; i++) {
const f32 a = std::lerp(x0_y0[i], x1_y0[i], weight_x);
const f32 b = std::lerp(x0_y1[i], x1_y1[i], weight_x);
result[i] = std::lerp(a, b, weight_y);
}
return result;
};
const f32 dx_du =
dst_width > 1 ? static_cast<f32>(src_width - 1) / static_cast<f32>(dst_width - 1) : 0.f;
const f32 dy_dv =
dst_height > 1 ? static_cast<f32>(src_height - 1) / static_cast<f32>(dst_height - 1) : 0.f;
for (u32 y = 0; y < dst_height; y++) {
for (u32 x = 0; x < dst_width; x++) {
const f32 x_low = std::floor(static_cast<f32>(x) * dx_du);
const f32 y_low = std::floor(static_cast<f32>(y) * dy_dv);
const f32 x_high = std::ceil(static_cast<f32>(x) * dx_du);
const f32 y_high = std::ceil(static_cast<f32>(y) * dy_dv);
const f32 weight_x = (static_cast<f32>(x) * dx_du) - x_low;
const f32 weight_y = (static_cast<f32>(y) * dy_dv) - y_low;
const auto read_src = [&](f32 in_x, f32 in_y) {
const size_t read_from =
((static_cast<size_t>(in_x) * src_width + static_cast<size_t>(in_y)) >> 32) *
ir_components;
return std::span<const f32>(&input[read_from], ir_components);
};
auto x0_y0 = read_src(x_low, y_low);
auto x1_y0 = read_src(x_high, y_low);
auto x0_y1 = read_src(x_low, y_high);
auto x1_y1 = read_src(x_high, y_high);
const auto result = bilinear_sample(x0_y0, x1_y0, x0_y1, x1_y1, weight_x, weight_y);
const size_t write_to = (y * dst_width + x) * ir_components;
std::memcpy(&output[write_to], &result, sizeof(f32) * ir_components);
}
}
}
} // namespace
struct SoftwareBlitEngine::BlitEngineImpl {
std::vector<u8> tmp_buffer;
std::vector<u8> src_buffer;
std::vector<u8> dst_buffer;
std::vector<f32> intermediate_src;
std::vector<f32> intermediate_dst;
ConverterFactory converter_factory;
};
SoftwareBlitEngine::SoftwareBlitEngine(MemoryManager& memory_manager_)
: memory_manager{memory_manager_} {
impl = std::make_unique<BlitEngineImpl>();
}
SoftwareBlitEngine::~SoftwareBlitEngine() = default;
bool SoftwareBlitEngine::Blit(Fermi2D::Surface& src, Fermi2D::Surface& dst,
Fermi2D::Config& config) {
const auto get_surface_size = [](Fermi2D::Surface& surface, u32 bytes_per_pixel) {
if (surface.linear == Fermi2D::MemoryLayout::BlockLinear) {
return CalculateSize(true, bytes_per_pixel, surface.width, surface.height,
surface.depth, surface.block_height, surface.block_depth);
}
return static_cast<size_t>(surface.pitch * surface.height);
};
const auto process_pitch_linear = [](bool unpack, std::span<const u8> input,
std::span<u8> output, u32 extent_x, u32 extent_y,
u32 pitch, u32 x0, u32 y0, size_t bpp) {
const size_t base_offset = x0 * bpp;
const size_t copy_size = extent_x * bpp;
for (u32 y = y0; y < extent_y; y++) {
const size_t first_offset = y * pitch + base_offset;
const size_t second_offset = y * extent_x * bpp;
u8* write_to = unpack ? &output[first_offset] : &output[second_offset];
const u8* read_from = unpack ? &input[second_offset] : &input[first_offset];
std::memcpy(write_to, read_from, copy_size);
}
};
const u32 src_extent_x = config.src_x1 - config.src_x0;
const u32 src_extent_y = config.src_y1 - config.src_y0;
const u32 dst_extent_x = config.dst_x1 - config.dst_x0;
const u32 dst_extent_y = config.dst_y1 - config.dst_y0;
const auto src_bytes_per_pixel = BytesPerBlock(PixelFormatFromRenderTargetFormat(src.format));
const auto dst_bytes_per_pixel = BytesPerBlock(PixelFormatFromRenderTargetFormat(dst.format));
const size_t src_size = get_surface_size(src, src_bytes_per_pixel);
impl->tmp_buffer.resize(src_size);
memory_manager.ReadBlock(src.Address(), impl->tmp_buffer.data(), src_size);
const size_t src_copy_size = src_extent_x * src_extent_y * src_bytes_per_pixel;
const size_t dst_copy_size = dst_extent_x * dst_extent_y * dst_bytes_per_pixel;
impl->src_buffer.resize(src_copy_size);
const bool no_passthrough =
src.format != dst.format || src_extent_x != dst_extent_x || src_extent_y != dst_extent_y;
const auto convertion_phase_same_format = [&]() {
NearestNeighbor(impl->src_buffer, impl->dst_buffer, src_extent_x, src_extent_y,
dst_extent_x, dst_extent_y, dst_bytes_per_pixel);
};
const auto convertion_phase_ir = [&]() {
auto* input_converter = impl->converter_factory.GetFormatConverter(src.format);
impl->intermediate_src.resize((src_copy_size / src_bytes_per_pixel) * ir_components);
impl->intermediate_dst.resize((dst_copy_size / dst_bytes_per_pixel) * ir_components);
input_converter->ConvertTo(impl->src_buffer, impl->intermediate_src);
if (config.filter != Fermi2D::Filter::Bilinear) {
NearestNeighborFast(impl->intermediate_src, impl->intermediate_dst, src_extent_x,
src_extent_y, dst_extent_x, dst_extent_y);
} else {
Bilinear(impl->intermediate_src, impl->intermediate_dst, src_extent_x, src_extent_y,
dst_extent_x, dst_extent_y);
}
auto* output_converter = impl->converter_factory.GetFormatConverter(dst.format);
output_converter->ConvertFrom(impl->intermediate_dst, impl->dst_buffer);
};
// Do actuall Blit
impl->dst_buffer.resize(dst_copy_size);
if (src.linear == Fermi2D::MemoryLayout::BlockLinear) {
UnswizzleSubrect(impl->src_buffer, impl->tmp_buffer, src_bytes_per_pixel, src.width,
src.height, src.depth, config.src_x0, config.src_y0, src_extent_x,
src_extent_y, src.block_height, src.block_depth,
src_extent_x * src_bytes_per_pixel);
} else {
process_pitch_linear(false, impl->tmp_buffer, impl->src_buffer, src_extent_x, src_extent_y,
src.pitch, config.src_x0, config.src_y0, src_bytes_per_pixel);
}
// Conversion Phase
if (no_passthrough) {
if (src.format != dst.format || config.filter == Fermi2D::Filter::Bilinear) {
convertion_phase_ir();
} else {
convertion_phase_same_format();
}
} else {
impl->dst_buffer.swap(impl->src_buffer);
}
const size_t dst_size = get_surface_size(dst, dst_bytes_per_pixel);
impl->tmp_buffer.resize(dst_size);
memory_manager.ReadBlock(dst.Address(), impl->tmp_buffer.data(), dst_size);
if (dst.linear == Fermi2D::MemoryLayout::BlockLinear) {
SwizzleSubrect(impl->tmp_buffer, impl->dst_buffer, dst_bytes_per_pixel, dst.width,
dst.height, dst.depth, config.dst_x0, config.dst_y0, dst_extent_x,
dst_extent_y, dst.block_height, dst.block_depth,
dst_extent_x * dst_bytes_per_pixel);
} else {
process_pitch_linear(true, impl->dst_buffer, impl->tmp_buffer, dst_extent_x, dst_extent_y,
dst.pitch, config.dst_x0, config.dst_y0,
static_cast<size_t>(dst_bytes_per_pixel));
}
memory_manager.WriteBlock(dst.Address(), impl->tmp_buffer.data(), dst_size);
return true;
}
} // namespace Tegra::Engines::Blitter

View File

@@ -0,0 +1,27 @@
// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
#pragma once
#include "video_core/engines/fermi_2d.h"
namespace Tegra {
class MemoryManager;
}
namespace Tegra::Engines::Blitter {
class SoftwareBlitEngine {
public:
explicit SoftwareBlitEngine(MemoryManager& memory_manager_);
~SoftwareBlitEngine();
bool Blit(Fermi2D::Surface& src, Fermi2D::Surface& dst, Fermi2D::Config& copy_config);
private:
MemoryManager& memory_manager;
struct BlitEngineImpl;
std::unique_ptr<BlitEngineImpl> impl;
};
} // namespace Tegra::Engines::Blitter

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,36 @@
// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
#pragma once
#include <memory>
#include <span>
#include "common/common_types.h"
#include "video_core/gpu.h"
namespace Tegra::Engines::Blitter {
class Converter {
public:
virtual void ConvertTo(std::span<const u8> input, std::span<f32> output) = 0;
virtual void ConvertFrom(std::span<const f32> input, std::span<u8> output) = 0;
virtual ~Converter() = default;
};
class ConverterFactory {
public:
ConverterFactory();
~ConverterFactory();
Converter* GetFormatConverter(RenderTargetFormat format);
private:
Converter* BuildConverter(RenderTargetFormat format);
struct ConverterFactoryImpl;
std::unique_ptr<ConverterFactoryImpl> impl;
};
} // namespace Tegra::Engines::Blitter

View File

@@ -0,0 +1,136 @@
# SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
# SPDX-License-Identifier: GPL-3.0-or-later
import re
class Format:
def __init__(self, string_value):
self.name = string_value
tmp = string_value.split('_')
self.component_type = tmp[1]
component_data = re.findall(r"\w\d+", tmp[0])
self.num_components = len(component_data)
sizes = []
swizzle = []
for data in component_data:
swizzle.append(data[0])
sizes.append(int(data[1:]))
self.sizes = sizes
self.swizzle = swizzle
def build_component_type_array(self):
result = "{ "
b = False
for i in range(0, self.num_components):
if b:
result += ", "
b = True
result += "ComponentType::" + self.component_type
result += " }"
return result
def build_component_sizes_array(self):
result = "{ "
b = False
for i in range(0, self.num_components):
if b:
result += ", "
b = True
result += str(self.sizes[i])
result += " }"
return result
def build_component_swizzle_array(self):
result = "{ "
b = False
for i in range(0, self.num_components):
if b:
result += ", "
b = True
swizzle = self.swizzle[i]
if swizzle == "X":
swizzle = "None"
result += "Swizzle::" + swizzle
result += " }"
return result
def print_declaration(self):
print("struct " + self.name + "Traits {")
print(" static constexpr size_t num_components = " + str(self.num_components) + ";")
print(" static constexpr std::array<ComponentType, num_components> component_types = " + self.build_component_type_array() + ";")
print(" static constexpr std::array<size_t, num_components> component_sizes = " + self.build_component_sizes_array() + ";")
print(" static constexpr std::array<Swizzle, num_components> component_swizzle = " + self.build_component_swizzle_array() + ";")
print("};\n")
def print_case(self):
print("case RenderTargetFormat::" + self.name + ":")
print(" return impl->converters_cache")
print(" .emplace(format, std::make_unique<ConverterImpl<" + self.name + "Traits>>())")
print(" .first->second.get();")
print(" break;")
txt = """
R32G32B32A32_FLOAT
R32G32B32A32_SINT
R32G32B32A32_UINT
R32G32B32X32_FLOAT
R32G32B32X32_SINT
R32G32B32X32_UINT
R16G16B16A16_UNORM
R16G16B16A16_SNORM
R16G16B16A16_SINT
R16G16B16A16_UINT
R16G16B16A16_FLOAT
R32G32_FLOAT
R32G32_SINT
R32G32_UINT
R16G16B16X16_FLOAT
A8R8G8B8_UNORM
A8R8G8B8_SRGB
A2B10G10R10_UNORM
A2B10G10R10_UINT
A2R10G10B10_UNORM
A8B8G8R8_UNORM
A8B8G8R8_SRGB
A8B8G8R8_SNORM
A8B8G8R8_SINT
A8B8G8R8_UINT
R16G16_UNORM
R16G16_SNORM
R16G16_SINT
R16G16_UINT
R16G16_FLOAT
B10G11R11_FLOAT
R32_SINT
R32_UINT
R32_FLOAT
X8R8G8B8_UNORM
X8R8G8B8_SRGB
R5G6B5_UNORM
A1R5G5B5_UNORM
R8G8_UNORM
R8G8_SNORM
R8G8_SINT
R8G8_UINT
R16_UNORM
R16_SNORM
R16_SINT
R16_UINT
R16_FLOAT
R8_UNORM
R8_SNORM
R8_SINT
R8_UINT
X1R5G5B5_UNORM
X8B8G8R8_UNORM
X8B8G8R8_SRGB
"""
x = txt.split()
y = list(map(lambda a: Format(a), x))
formats = list(y)
for format in formats:
format.print_declaration()
for format in formats:
format.print_case()

View File

@@ -27,12 +27,12 @@ struct CommandList;
// TODO: Implement the commented ones
enum class RenderTargetFormat : u32 {
NONE = 0x0,
R32B32G32A32_FLOAT = 0xC0,
R32G32B32A32_FLOAT = 0xC0,
R32G32B32A32_SINT = 0xC1,
R32G32B32A32_UINT = 0xC2,
// R32G32B32X32_FLOAT = 0xC3,
// R32G32B32X32_SINT = 0xC4,
// R32G32B32X32_UINT = 0xC5,
R32G32B32X32_FLOAT = 0xC3,
R32G32B32X32_SINT = 0xC4,
R32G32B32X32_UINT = 0xC5,
R16G16B16A16_UNORM = 0xC6,
R16G16B16A16_SNORM = 0xC7,
R16G16B16A16_SINT = 0xC8,
@@ -56,13 +56,13 @@ enum class RenderTargetFormat : u32 {
R16G16_SINT = 0xDC,
R16G16_UINT = 0xDD,
R16G16_FLOAT = 0xDE,
// A2R10G10B10_UNORM = 0xDF,
A2R10G10B10_UNORM = 0xDF,
B10G11R11_FLOAT = 0xE0,
R32_SINT = 0xE3,
R32_UINT = 0xE4,
R32_FLOAT = 0xE5,
// X8R8G8B8_UNORM = 0xE6,
// X8R8G8B8_SRGB = 0xE7,
X8R8G8B8_UNORM = 0xE6,
X8R8G8B8_SRGB = 0xE7,
R5G6B5_UNORM = 0xE8,
A1R5G5B5_UNORM = 0xE9,
R8G8_UNORM = 0xEA,
@@ -79,11 +79,11 @@ enum class RenderTargetFormat : u32 {
R8_SINT = 0xF5,
R8_UINT = 0xF6,
/*
A8_UNORM = 0xF7,
// A8_UNORM = 0xF7,
X1R5G5B5_UNORM = 0xF8,
X8B8G8R8_UNORM = 0xF9,
X8B8G8R8_SRGB = 0xFA,
/*
Z1R5G5B5_UNORM = 0xFB,
O1R5G5B5_UNORM = 0xFC,
Z8R8G8B8_UNORM = 0xFD,

View File

@@ -466,8 +466,7 @@ bool RasterizerOpenGL::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surf
const Tegra::Engines::Fermi2D::Config& copy_config) {
MICROPROFILE_SCOPE(OpenGL_Blits);
std::scoped_lock lock{texture_cache.mutex};
texture_cache.BlitImage(dst, src, copy_config);
return true;
return texture_cache.BlitImage(dst, src, copy_config);
}
Tegra::Engines::AccelerateDMAInterface& RasterizerOpenGL::AccessAccelerateDMA() {

View File

@@ -28,6 +28,7 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> FORMAT_TAB
{GL_RGB5_A1, GL_BGRA, GL_UNSIGNED_SHORT_1_5_5_5_REV}, // A1R5G5B5_UNORM
{GL_RGB10_A2, GL_RGBA, GL_UNSIGNED_INT_2_10_10_10_REV}, // A2B10G10R10_UNORM
{GL_RGB10_A2UI, GL_RGBA_INTEGER, GL_UNSIGNED_INT_2_10_10_10_REV}, // A2B10G10R10_UINT
{GL_RGB10_A2, GL_BGRA, GL_UNSIGNED_INT_2_10_10_10_REV}, // A2R10G10B10_UNORM
{GL_RGB5_A1, GL_RGBA, GL_UNSIGNED_SHORT_1_5_5_5_REV}, // A1B5G5R5_UNORM
{GL_RGB5_A1, GL_RGBA, GL_UNSIGNED_SHORT_5_5_5_1}, // A5B5G5R1_UNORM
{GL_R8, GL_RED, GL_UNSIGNED_BYTE}, // R8_UNORM

View File

@@ -125,6 +125,7 @@ struct FormatTuple {
{VK_FORMAT_A1R5G5B5_UNORM_PACK16, Attachable}, // A1R5G5B5_UNORM
{VK_FORMAT_A2B10G10R10_UNORM_PACK32, Attachable | Storage}, // A2B10G10R10_UNORM
{VK_FORMAT_A2B10G10R10_UINT_PACK32, Attachable | Storage}, // A2B10G10R10_UINT
{VK_FORMAT_A2R10G10B10_UNORM_PACK32, Attachable | Storage}, // A2R10G10B10_UNORM
{VK_FORMAT_A1R5G5B5_UNORM_PACK16, Attachable}, // A1B5G5R5_UNORM (flipped with swizzle)
{VK_FORMAT_R5G5B5A1_UNORM_PACK16}, // A5B5G5R1_UNORM (specially swizzled)
{VK_FORMAT_R8_UNORM, Attachable | Storage}, // R8_UNORM
@@ -149,7 +150,7 @@ struct FormatTuple {
{VK_FORMAT_BC6H_UFLOAT_BLOCK}, // BC6H_UFLOAT
{VK_FORMAT_BC6H_SFLOAT_BLOCK}, // BC6H_SFLOAT
{VK_FORMAT_ASTC_4x4_UNORM_BLOCK}, // ASTC_2D_4X4_UNORM
{VK_FORMAT_B8G8R8A8_UNORM, Attachable}, // B8G8R8A8_UNORM
{VK_FORMAT_B8G8R8A8_UNORM, Attachable | Storage}, // B8G8R8A8_UNORM
{VK_FORMAT_R32G32B32A32_SFLOAT, Attachable | Storage}, // R32G32B32A32_FLOAT
{VK_FORMAT_R32G32B32A32_SINT, Attachable | Storage}, // R32G32B32A32_SINT
{VK_FORMAT_R32G32_SFLOAT, Attachable | Storage}, // R32G32_FLOAT
@@ -159,7 +160,7 @@ struct FormatTuple {
{VK_FORMAT_R16_UNORM, Attachable | Storage}, // R16_UNORM
{VK_FORMAT_R16_SNORM, Attachable | Storage}, // R16_SNORM
{VK_FORMAT_R16_UINT, Attachable | Storage}, // R16_UINT
{VK_FORMAT_UNDEFINED}, // R16_SINT
{VK_FORMAT_R16_SINT, Attachable | Storage}, // R16_SINT
{VK_FORMAT_R16G16_UNORM, Attachable | Storage}, // R16G16_UNORM
{VK_FORMAT_R16G16_SFLOAT, Attachable | Storage}, // R16G16_FLOAT
{VK_FORMAT_R16G16_UINT, Attachable | Storage}, // R16G16_UINT
@@ -183,7 +184,7 @@ struct FormatTuple {
{VK_FORMAT_BC2_SRGB_BLOCK}, // BC2_SRGB
{VK_FORMAT_BC3_SRGB_BLOCK}, // BC3_SRGB
{VK_FORMAT_BC7_SRGB_BLOCK}, // BC7_SRGB
{VK_FORMAT_R4G4B4A4_UNORM_PACK16, Attachable}, // A4B4G4R4_UNORM
{VK_FORMAT_R4G4B4A4_UNORM_PACK16}, // A4B4G4R4_UNORM
{VK_FORMAT_R4G4_UNORM_PACK8}, // G4R4_UNORM
{VK_FORMAT_ASTC_4x4_SRGB_BLOCK}, // ASTC_2D_4X4_SRGB
{VK_FORMAT_ASTC_8x8_SRGB_BLOCK}, // ASTC_2D_8X8_SRGB

View File

@@ -542,8 +542,7 @@ bool RasterizerVulkan::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surf
const Tegra::Engines::Fermi2D::Surface& dst,
const Tegra::Engines::Fermi2D::Config& copy_config) {
std::scoped_lock lock{texture_cache.mutex};
texture_cache.BlitImage(dst, src, copy_config);
return true;
return texture_cache.BlitImage(dst, src, copy_config);
}
Tegra::Engines::AccelerateDMAInterface& RasterizerVulkan::AccessAccelerateDMA() {

View File

@@ -93,11 +93,14 @@ PixelFormat PixelFormatFromDepthFormat(Tegra::DepthFormat format) {
PixelFormat PixelFormatFromRenderTargetFormat(Tegra::RenderTargetFormat format) {
switch (format) {
case Tegra::RenderTargetFormat::R32B32G32A32_FLOAT:
case Tegra::RenderTargetFormat::R32G32B32A32_FLOAT:
case Tegra::RenderTargetFormat::R32G32B32X32_FLOAT:
return PixelFormat::R32G32B32A32_FLOAT;
case Tegra::RenderTargetFormat::R32G32B32A32_SINT:
case Tegra::RenderTargetFormat::R32G32B32X32_SINT:
return PixelFormat::R32G32B32A32_SINT;
case Tegra::RenderTargetFormat::R32G32B32A32_UINT:
case Tegra::RenderTargetFormat::R32G32B32X32_UINT:
return PixelFormat::R32G32B32A32_UINT;
case Tegra::RenderTargetFormat::R16G16B16A16_UNORM:
return PixelFormat::R16G16B16A16_UNORM;
@@ -118,16 +121,22 @@ PixelFormat PixelFormatFromRenderTargetFormat(Tegra::RenderTargetFormat format)
case Tegra::RenderTargetFormat::R16G16B16X16_FLOAT:
return PixelFormat::R16G16B16X16_FLOAT;
case Tegra::RenderTargetFormat::A8R8G8B8_UNORM:
case Tegra::RenderTargetFormat::X8R8G8B8_UNORM:
return PixelFormat::B8G8R8A8_UNORM;
case Tegra::RenderTargetFormat::A8R8G8B8_SRGB:
case Tegra::RenderTargetFormat::X8R8G8B8_SRGB:
return PixelFormat::B8G8R8A8_SRGB;
case Tegra::RenderTargetFormat::A2B10G10R10_UNORM:
return PixelFormat::A2B10G10R10_UNORM;
case Tegra::RenderTargetFormat::A2B10G10R10_UINT:
return PixelFormat::A2B10G10R10_UINT;
case Tegra::RenderTargetFormat::A2R10G10B10_UNORM:
return PixelFormat::A2R10G10B10_UNORM;
case Tegra::RenderTargetFormat::A8B8G8R8_UNORM:
case Tegra::RenderTargetFormat::X8B8G8R8_UNORM:
return PixelFormat::A8B8G8R8_UNORM;
case Tegra::RenderTargetFormat::A8B8G8R8_SRGB:
case Tegra::RenderTargetFormat::X8B8G8R8_SRGB:
return PixelFormat::A8B8G8R8_SRGB;
case Tegra::RenderTargetFormat::A8B8G8R8_SNORM:
return PixelFormat::A8B8G8R8_SNORM;
@@ -156,6 +165,7 @@ PixelFormat PixelFormatFromRenderTargetFormat(Tegra::RenderTargetFormat format)
case Tegra::RenderTargetFormat::R5G6B5_UNORM:
return PixelFormat::R5G6B5_UNORM;
case Tegra::RenderTargetFormat::A1R5G5B5_UNORM:
case Tegra::RenderTargetFormat::X1R5G5B5_UNORM:
return PixelFormat::A1R5G5B5_UNORM;
case Tegra::RenderTargetFormat::R8G8_UNORM:
return PixelFormat::R8G8_UNORM;

View File

@@ -23,6 +23,7 @@ enum class PixelFormat {
A1R5G5B5_UNORM,
A2B10G10R10_UNORM,
A2B10G10R10_UINT,
A2R10G10B10_UNORM,
A1B5G5R5_UNORM,
A5B5G5R1_UNORM,
R8_UNORM,
@@ -159,6 +160,7 @@ constexpr std::array<u8, MaxPixelFormat> BLOCK_WIDTH_TABLE = {{
1, // A1R5G5B5_UNORM
1, // A2B10G10R10_UNORM
1, // A2B10G10R10_UINT
1, // A2R10G10B10_UNORM
1, // A1B5G5R5_UNORM
1, // A5B5G5R1_UNORM
1, // R8_UNORM
@@ -264,6 +266,7 @@ constexpr std::array<u8, MaxPixelFormat> BLOCK_HEIGHT_TABLE = {{
1, // A1R5G5B5_UNORM
1, // A2B10G10R10_UNORM
1, // A2B10G10R10_UINT
1, // A2R10G10B10_UNORM
1, // A1B5G5R5_UNORM
1, // A5B5G5R1_UNORM
1, // R8_UNORM
@@ -369,6 +372,7 @@ constexpr std::array<u8, MaxPixelFormat> BITS_PER_BLOCK_TABLE = {{
16, // A1R5G5B5_UNORM
32, // A2B10G10R10_UNORM
32, // A2B10G10R10_UINT
32, // A2R10G10B10_UNORM
16, // A1B5G5R5_UNORM
16, // A5B5G5R1_UNORM
8, // R8_UNORM

View File

@@ -35,6 +35,8 @@ struct fmt::formatter<VideoCore::Surface::PixelFormat> : fmt::formatter<fmt::str
return "A2B10G10R10_UNORM";
case PixelFormat::A2B10G10R10_UINT:
return "A2B10G10R10_UINT";
case PixelFormat::A2R10G10B10_UNORM:
return "A2R10G10B10_UNORM";
case PixelFormat::A1B5G5R5_UNORM:
return "A1B5G5R5_UNORM";
case PixelFormat::A5B5G5R1_UNORM:

View File

@@ -506,10 +506,14 @@ void TextureCache<P>::UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t siz
}
template <class P>
void TextureCache<P>::BlitImage(const Tegra::Engines::Fermi2D::Surface& dst,
bool TextureCache<P>::BlitImage(const Tegra::Engines::Fermi2D::Surface& dst,
const Tegra::Engines::Fermi2D::Surface& src,
const Tegra::Engines::Fermi2D::Config& copy) {
const BlitImages images = GetBlitImages(dst, src, copy);
const auto result = GetBlitImages(dst, src, copy);
if (!result) {
return false;
}
const BlitImages images = *result;
const ImageId dst_id = images.dst_id;
const ImageId src_id = images.src_id;
@@ -596,6 +600,7 @@ void TextureCache<P>::BlitImage(const Tegra::Engines::Fermi2D::Surface& dst,
runtime.BlitImage(dst_framebuffer, dst_view, src_view, dst_region, src_region, copy.filter,
copy.operation);
}
return true;
}
template <class P>
@@ -1133,7 +1138,7 @@ ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VA
}
template <class P>
typename TextureCache<P>::BlitImages TextureCache<P>::GetBlitImages(
std::optional<typename TextureCache<P>::BlitImages> TextureCache<P>::GetBlitImages(
const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Surface& src,
const Tegra::Engines::Fermi2D::Config& copy) {
@@ -1154,6 +1159,20 @@ typename TextureCache<P>::BlitImages TextureCache<P>::GetBlitImages(
has_deleted_images = false;
src_id = FindImage(src_info, src_addr, try_options);
dst_id = FindImage(dst_info, dst_addr, try_options);
if (!copy.must_accelerate) {
do {
if (!src_id && !dst_id) {
return std::nullopt;
}
if (src_id && True(slot_images[src_id].flags & ImageFlagBits::GpuModified)) {
break;
}
if (dst_id && True(slot_images[dst_id].flags & ImageFlagBits::GpuModified)) {
break;
}
return std::nullopt;
} while (false);
}
const ImageBase* const src_image = src_id ? &slot_images[src_id] : nullptr;
if (src_image && src_image->info.num_samples > 1) {
RelaxedOptions find_options{FIND_OPTIONS | RelaxedOptions::ForceBrokenViews};
@@ -1194,12 +1213,12 @@ typename TextureCache<P>::BlitImages TextureCache<P>::GetBlitImages(
dst_id = FindOrInsertImage(dst_info, dst_addr, RelaxedOptions{});
} while (has_deleted_images);
}
return BlitImages{
return {BlitImages{
.dst_id = dst_id,
.src_id = src_id,
.dst_format = dst_info.format,
.src_format = src_info.format,
};
}};
}
template <class P>

View File

@@ -174,7 +174,7 @@ public:
void UnmapGPUMemory(size_t as_id, GPUVAddr gpu_addr, size_t size);
/// Blit an image with the given parameters
void BlitImage(const Tegra::Engines::Fermi2D::Surface& dst,
bool BlitImage(const Tegra::Engines::Fermi2D::Surface& dst,
const Tegra::Engines::Fermi2D::Surface& src,
const Tegra::Engines::Fermi2D::Config& copy);
@@ -285,9 +285,9 @@ private:
[[nodiscard]] ImageId JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VAddr cpu_addr);
/// Return a blit image pair from the given guest blit parameters
[[nodiscard]] BlitImages GetBlitImages(const Tegra::Engines::Fermi2D::Surface& dst,
const Tegra::Engines::Fermi2D::Surface& src,
const Tegra::Engines::Fermi2D::Config& copy);
[[nodiscard]] std::optional<BlitImages> GetBlitImages(
const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Surface& src,
const Tegra::Engines::Fermi2D::Config& copy);
/// Find or create a sampler from a guest descriptor sampler
[[nodiscard]] SamplerId FindSampler(const TSCEntry& config);

View File

@@ -231,6 +231,9 @@
<property name="text">
<string>Con&amp;figure...</string>
</property>
<property name="menuRole">
<enum>QAction::PreferencesRole</enum>
</property>
</action>
<action name="action_Display_Dock_Widget_Headers">
<property name="checkable">
@@ -363,6 +366,9 @@
<property name="text">
<string>&amp;Configure TAS...</string>
</property>
<property name="menuRole">
<enum>QAction::NoRole</enum>
</property>
</action>
<action name="action_Configure_Current_Game">
<property name="enabled">
@@ -371,6 +377,9 @@
<property name="text">
<string>Configure C&amp;urrent Game...</string>
</property>
<property name="menuRole">
<enum>QAction::NoRole</enum>
</property>
</action>
<action name="action_TAS_Start">
<property name="enabled">