Compare commits
15 Commits
| Author | SHA1 | Date |
|---|---|---|
| | f58ee3f15f | |
| | da8e0f6571 | |
| | 3fa9702952 | |
| | ae7da0b12d | |
| | 214b9fc9a7 | |
| | 951c61aeaa | |
| | eb7d361657 | |
| | af89f7683d | |
| | 3a89723d97 | |
| | 8c907c620d | |
| | b84d429c2e | |
| | 9f34be5a61 | |
| | 14ac0c2923 | |
| | 640fc1418b | |
| | f9e0681d59 | |
2
externals/dynarmic
vendored
Submodule externals/dynarmic updated: cce7e4ee5d...28714ee75a
@@ -597,6 +597,7 @@ struct Values {
     BasicSetting<std::string> program_args{std::string(), "program_args"};
     BasicSetting<bool> dump_exefs{false, "dump_exefs"};
     BasicSetting<bool> dump_nso{false, "dump_nso"};
+    BasicSetting<bool> dump_shaders{false, "dump_shaders"};
     BasicSetting<bool> enable_fs_access_log{false, "enable_fs_access_log"};
     BasicSetting<bool> reporting_services{false, "reporting_services"};
     BasicSetting<bool> quest_flag{false, "quest_flag"};
@@ -187,6 +187,8 @@ add_library(core STATIC
     hle/kernel/k_event.h
     hle/kernel/k_handle_table.cpp
     hle/kernel/k_handle_table.h
+    hle/kernel/k_interrupt_manager.cpp
+    hle/kernel/k_interrupt_manager.h
     hle/kernel/k_light_condition_variable.cpp
     hle/kernel/k_light_condition_variable.h
     hle/kernel/k_light_lock.cpp
@@ -9,6 +9,7 @@
 #include "core/hle/kernel/global_scheduler_context.h"
 #include "core/hle/kernel/k_scheduler.h"
 #include "core/hle/kernel/kernel.h"
+#include "core/hle/kernel/physical_core.h"

 namespace Kernel {
@@ -42,6 +43,11 @@ void GlobalSchedulerContext::PreemptThreads() {
     for (u32 core_id = 0; core_id < Core::Hardware::NUM_CPU_CORES; core_id++) {
         const u32 priority = preemption_priorities[core_id];
         kernel.Scheduler(core_id).RotateScheduledQueue(core_id, priority);
+
+        // Signal that an interrupt occurred. For core 3, this is a certainty, as preemption will
+        // result in the rotator thread being scheduled. For cores 0-2, this simulates any system
+        // interrupts that may have occurred.
+        kernel.PhysicalCore(core_id).Interrupt();
     }
 }
34 src/core/hle/kernel/k_interrupt_manager.cpp Normal file
@@ -0,0 +1,34 @@
+// Copyright 2021 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "core/hle/kernel/k_interrupt_manager.h"
+#include "core/hle/kernel/k_process.h"
+#include "core/hle/kernel/k_scheduler.h"
+#include "core/hle/kernel/k_thread.h"
+#include "core/hle/kernel/kernel.h"
+
+namespace Kernel::KInterruptManager {
+
+void HandleInterrupt(KernelCore& kernel, s32 core_id) {
+    auto* process = kernel.CurrentProcess();
+    if (!process) {
+        return;
+    }
+
+    auto& scheduler = kernel.Scheduler(core_id);
+    auto& current_thread = *scheduler.GetCurrentThread();
+
+    // If the user disable count is set, we may need to pin the current thread.
+    if (current_thread.GetUserDisableCount() && !process->GetPinnedThread(core_id)) {
+        KScopedSchedulerLock sl{kernel};
+
+        // Pin the current thread.
+        process->PinCurrentThread(core_id);
+
+        // Set the interrupt flag for the thread.
+        scheduler.GetCurrentThread()->SetInterruptFlag();
+    }
+}
+
+} // namespace Kernel::KInterruptManager
17 src/core/hle/kernel/k_interrupt_manager.h Normal file
@@ -0,0 +1,17 @@
+// Copyright 2021 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "common/common_types.h"
+
+namespace Kernel {
+
+class KernelCore;
+
+namespace KInterruptManager {
+void HandleInterrupt(KernelCore& kernel, s32 core_id);
+}
+
+} // namespace Kernel
@@ -220,30 +220,28 @@ bool KProcess::ReleaseUserException(KThread* thread) {
     }
 }

-void KProcess::PinCurrentThread() {
+void KProcess::PinCurrentThread(s32 core_id) {
     ASSERT(kernel.GlobalSchedulerContext().IsLocked());

     // Get the current thread.
-    const s32 core_id = GetCurrentCoreId(kernel);
-    KThread* cur_thread = GetCurrentThreadPointer(kernel);
+    KThread* cur_thread = kernel.Scheduler(static_cast<std::size_t>(core_id)).GetCurrentThread();

     // If the thread isn't terminated, pin it.
     if (!cur_thread->IsTerminationRequested()) {
         // Pin it.
         PinThread(core_id, cur_thread);
-        cur_thread->Pin();
+        cur_thread->Pin(core_id);

         // An update is needed.
         KScheduler::SetSchedulerUpdateNeeded(kernel);
     }
 }

-void KProcess::UnpinCurrentThread() {
+void KProcess::UnpinCurrentThread(s32 core_id) {
     ASSERT(kernel.GlobalSchedulerContext().IsLocked());

     // Get the current thread.
-    const s32 core_id = GetCurrentCoreId(kernel);
-    KThread* cur_thread = GetCurrentThreadPointer(kernel);
+    KThread* cur_thread = kernel.Scheduler(static_cast<std::size_t>(core_id)).GetCurrentThread();

     // Unpin it.
     cur_thread->Unpin();
@@ -345,8 +345,8 @@ public:
     bool IsSignaled() const override;

-    void PinCurrentThread();
-    void UnpinCurrentThread();
+    void PinCurrentThread(s32 core_id);
+    void UnpinCurrentThread(s32 core_id);
     void UnpinThread(KThread* thread);

     KLightLock& GetStateLock() {
@@ -15,6 +15,7 @@
 #include "core/core.h"
 #include "core/core_timing.h"
 #include "core/cpu_manager.h"
+#include "core/hle/kernel/k_interrupt_manager.h"
 #include "core/hle/kernel/k_process.h"
 #include "core/hle/kernel/k_scheduler.h"
 #include "core/hle/kernel/k_scoped_scheduler_lock_and_sleep.h"
@@ -53,6 +54,13 @@ void KScheduler::RescheduleCores(KernelCore& kernel, u64 cores_pending_reschedule) {
         }
         cores_pending_reschedule &= ~(1ULL << core);
     }
+
+    for (std::size_t core_id = 0; core_id < Core::Hardware::NUM_CPU_CORES; ++core_id) {
+        if (kernel.PhysicalCore(core_id).IsInterrupted()) {
+            KInterruptManager::HandleInterrupt(kernel, static_cast<s32>(core_id));
+        }
+    }
+
     if (must_context_switch) {
         auto core_scheduler = kernel.CurrentScheduler();
         kernel.ExitSVCProfile();
@@ -3,6 +3,7 @@
 // Refer to the license.txt file included.

 #include <algorithm>
+#include <atomic>
 #include <cinttypes>
 #include <optional>
 #include <vector>
@@ -33,6 +34,7 @@
 #include "core/hle/kernel/svc_results.h"
 #include "core/hle/kernel/time_manager.h"
 #include "core/hle/result.h"
+#include "core/memory.h"

 #ifdef ARCHITECTURE_x86_64
 #include "core/arm/dynarmic/arm_dynarmic_32.h"
@@ -63,6 +65,13 @@ namespace Kernel {

 namespace {

+struct ThreadLocalRegion {
+    static constexpr std::size_t MessageBufferSize = 0x100;
+    std::array<u32, MessageBufferSize / sizeof(u32)> message_buffer;
+    std::atomic_uint16_t disable_count;
+    std::atomic_uint16_t interrupt_flag;
+};
+
 class ThreadQueueImplForKThreadSleep final : public KThreadQueueWithoutEndWait {
 public:
     explicit ThreadQueueImplForKThreadSleep(KernelCore& kernel_)
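A quick standalone check of the layout above (a sketch; the struct is copied from the diff, and the +0x100/+0x102 offsets are what the TLS helpers further down rely on):

#include <array>
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Mirror of the guest ThreadLocalRegion added above (sketch).
struct ThreadLocalRegion {
    static constexpr std::size_t MessageBufferSize = 0x100;
    std::array<std::uint32_t, MessageBufferSize / sizeof(std::uint32_t)> message_buffer;
    std::atomic_uint16_t disable_count;
    std::atomic_uint16_t interrupt_flag;
};

int main() {
    // GetUserDisableCount()/SetInterruptFlag() address these fields via
    // offsetof, so they land at +0x100 and +0x102 from the TLS base.
    std::printf("disable_count at +0x%zx\n", offsetof(ThreadLocalRegion, disable_count));
    std::printf("interrupt_flag at +0x%zx\n", offsetof(ThreadLocalRegion, interrupt_flag));
}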
@@ -346,7 +355,7 @@ void KThread::StartTermination() {
     if (parent != nullptr) {
         parent->ReleaseUserException(this);
         if (parent->GetPinnedThread(GetCurrentCoreId(kernel)) == this) {
-            parent->UnpinCurrentThread();
+            parent->UnpinCurrentThread(core_id);
         }
     }
@@ -372,7 +381,7 @@ void KThread::StartTermination() {
     this->Close();
 }

-void KThread::Pin() {
+void KThread::Pin(s32 current_core) {
     ASSERT(kernel.GlobalSchedulerContext().IsLocked());

     // Set ourselves as pinned.
@@ -389,7 +398,6 @@ void KThread::Pin() {

     // Bind ourselves to this core.
     const s32 active_core = GetActiveCore();
-    const s32 current_core = GetCurrentCoreId(kernel);

     SetActiveCore(current_core);
     physical_ideal_core_id = current_core;
@@ -482,6 +490,36 @@ void KThread::Unpin() {
     }
 }

+u16 KThread::GetUserDisableCount() const {
+    if (!IsUserThread()) {
+        // We only emulate TLS for user threads
+        return {};
+    }
+
+    auto& memory = kernel.System().Memory();
+    return memory.Read16(tls_address + offsetof(ThreadLocalRegion, disable_count));
+}
+
+void KThread::SetInterruptFlag() {
+    if (!IsUserThread()) {
+        // We only emulate TLS for user threads
+        return;
+    }
+
+    auto& memory = kernel.System().Memory();
+    memory.Write16(tls_address + offsetof(ThreadLocalRegion, interrupt_flag), 1);
+}
+
+void KThread::ClearInterruptFlag() {
+    if (!IsUserThread()) {
+        // We only emulate TLS for user threads
+        return;
+    }
+
+    auto& memory = kernel.System().Memory();
+    memory.Write16(tls_address + offsetof(ThreadLocalRegion, interrupt_flag), 0);
+}
+
 ResultCode KThread::GetCoreMask(s32* out_ideal_core, u64* out_affinity_mask) {
     KScopedSchedulerLock sl{kernel};
@@ -307,6 +307,10 @@ public:
         return parent != nullptr;
     }

+    u16 GetUserDisableCount() const;
+    void SetInterruptFlag();
+    void ClearInterruptFlag();
+
     [[nodiscard]] KThread* GetLockOwner() const {
         return lock_owner;
     }
@@ -490,7 +494,7 @@ public:
         this->GetStackParameters().disable_count--;
     }

-    void Pin();
+    void Pin(s32 current_core);

     void Unpin();
@@ -2027,6 +2027,25 @@ static ResultCode SignalToAddress(Core::System& system, VAddr address, Svc::SignalType signal_type,
                                   count);
 }

+static void SynchronizePreemptionState(Core::System& system) {
+    auto& kernel = system.Kernel();
+
+    // Lock the scheduler.
+    KScopedSchedulerLock sl{kernel};
+
+    // If the current thread is pinned, unpin it.
+    KProcess* cur_process = system.Kernel().CurrentProcess();
+    const auto core_id = GetCurrentCoreId(kernel);
+
+    if (cur_process->GetPinnedThread(core_id) == GetCurrentThreadPointer(kernel)) {
+        // Clear the current thread's interrupt flag.
+        GetCurrentThread(kernel).ClearInterruptFlag();
+
+        // Unpin the current thread.
+        cur_process->UnpinCurrentThread(core_id);
+    }
+}
+
 static ResultCode SignalToAddress32(Core::System& system, u32 address, Svc::SignalType signal_type,
                                     s32 value, s32 count) {
     return SignalToAddress(system, address, signal_type, value, count);
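Read together with the KInterruptManager and KThread changes above, the new SVC closes a small state machine. A minimal sketch of that assumed flow, with plain bools standing in for kernel state:

#include <cstdio>

int main() {
    bool disable_count_set = true; // guest raised its TLS disable count
    bool pinned = false;
    bool interrupt_flag = false;

    // KInterruptManager::HandleInterrupt: pin and flag the interrupted thread.
    if (disable_count_set && !pinned) {
        pinned = true;
        interrupt_flag = true;
    }

    // svc SynchronizePreemptionState (0x36): guest acknowledges, kernel unpins.
    if (pinned) {
        interrupt_flag = false;
        pinned = false;
    }
    std::printf("pinned=%d interrupt_flag=%d\n", pinned, interrupt_flag);
}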
@@ -2797,7 +2816,7 @@ static const FunctionDef SVC_Table_64[] = {
     {0x33, SvcWrap64<GetThreadContext>, "GetThreadContext"},
     {0x34, SvcWrap64<WaitForAddress>, "WaitForAddress"},
     {0x35, SvcWrap64<SignalToAddress>, "SignalToAddress"},
-    {0x36, nullptr, "SynchronizePreemptionState"},
+    {0x36, SvcWrap64<SynchronizePreemptionState>, "SynchronizePreemptionState"},
     {0x37, nullptr, "Unknown"},
     {0x38, nullptr, "Unknown"},
     {0x39, nullptr, "Unknown"},
@@ -126,6 +126,22 @@ void EmitGetAttribute(EmitContext& ctx, IR::Inst& inst, IR::Attribute attr, ScalarU32 vertex) {
     }
 }

+void EmitGetAttributeU32(EmitContext& ctx, IR::Inst& inst, IR::Attribute attr, ScalarU32) {
+    switch (attr) {
+    case IR::Attribute::PrimitiveId:
+        ctx.Add("MOV.S {}.x,primitive.id;", inst);
+        break;
+    case IR::Attribute::InstanceId:
+        ctx.Add("MOV.S {}.x,{}.instance;", inst, ctx.attrib_name);
+        break;
+    case IR::Attribute::VertexId:
+        ctx.Add("MOV.S {}.x,{}.id;", inst, ctx.attrib_name);
+        break;
+    default:
+        throw NotImplementedException("Get U32 attribute {}", attr);
+    }
+}
+
 void EmitSetAttribute(EmitContext& ctx, IR::Attribute attr, ScalarF32 value,
                       [[maybe_unused]] ScalarU32 vertex) {
     const u32 element{static_cast<u32>(attr) % 4};
@@ -50,6 +50,7 @@ void EmitGetCbufU32(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding, ScalarU32 offset);
 void EmitGetCbufF32(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding, ScalarU32 offset);
 void EmitGetCbufU32x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding, ScalarU32 offset);
 void EmitGetAttribute(EmitContext& ctx, IR::Inst& inst, IR::Attribute attr, ScalarU32 vertex);
+void EmitGetAttributeU32(EmitContext& ctx, IR::Inst& inst, IR::Attribute attr, ScalarU32 vertex);
 void EmitSetAttribute(EmitContext& ctx, IR::Attribute attr, ScalarF32 value, ScalarU32 vertex);
 void EmitGetAttributeIndexed(EmitContext& ctx, IR::Inst& inst, ScalarS32 offset, ScalarU32 vertex);
 void EmitSetAttributeIndexed(EmitContext& ctx, ScalarU32 offset, ScalarF32 value, ScalarU32 vertex);
@@ -7,6 +7,7 @@
 #include "shader_recompiler/backend/glsl/emit_glsl_instructions.h"
 #include "shader_recompiler/backend/glsl/glsl_emit_context.h"
 #include "shader_recompiler/frontend/ir/value.h"
+#include "shader_recompiler/profile.h"

 namespace Shader::Backend::GLSL {
 namespace {
@@ -30,8 +31,9 @@ void EmitConditionRef(EmitContext& ctx, IR::Inst& inst, const IR::Value& value) {
     inst.DestructiveAddUsage(1);
     const auto ret{ctx.var_alloc.Define(inst, GlslVarType::U1)};
     const auto input{ctx.var_alloc.Consume(value)};
+    const auto suffix{ctx.profile.has_gl_bool_ref_bug ? "?true:false" : ""};
     if (ret != input) {
-        ctx.Add("{}={};", ret, input);
+        ctx.Add("{}={}{};", ret, input, suffix);
     }
 }
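The effect on the generated GLSL, as a sketch (the variable names b0/b1 are made up):

#include <cstdio>
#include <initializer_list>

int main() {
    // Prints the boolean copy emitted with and without has_gl_bool_ref_bug.
    for (const bool bug : {false, true}) {
        const char* suffix = bug ? "?true:false" : "";
        std::printf("bug=%d -> b0=b1%s;\n", bug, suffix);
    }
}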
@@ -102,39 +102,46 @@ void GetCbuf16(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding, const IR::Value& offset,

 void EmitGetCbufU8(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
                    const IR::Value& offset) {
-    GetCbuf8(ctx, inst, binding, offset, "ftou");
+    const auto cast{ctx.profile.has_gl_cbuf_ftou_bug ? "" : "ftou"};
+    GetCbuf8(ctx, inst, binding, offset, cast);
 }

 void EmitGetCbufS8(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
                    const IR::Value& offset) {
-    GetCbuf8(ctx, inst, binding, offset, "ftoi");
+    const auto cast{ctx.profile.has_gl_cbuf_ftou_bug ? "int" : "ftoi"};
+    GetCbuf8(ctx, inst, binding, offset, cast);
 }

 void EmitGetCbufU16(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
                     const IR::Value& offset) {
-    GetCbuf16(ctx, inst, binding, offset, "ftou");
+    const auto cast{ctx.profile.has_gl_cbuf_ftou_bug ? "" : "ftou"};
+    GetCbuf16(ctx, inst, binding, offset, cast);
 }

 void EmitGetCbufS16(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
                     const IR::Value& offset) {
-    GetCbuf16(ctx, inst, binding, offset, "ftoi");
+    const auto cast{ctx.profile.has_gl_cbuf_ftou_bug ? "int" : "ftoi"};
+    GetCbuf16(ctx, inst, binding, offset, cast);
 }

 void EmitGetCbufU32(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
                     const IR::Value& offset) {
     const auto ret{ctx.var_alloc.Define(inst, GlslVarType::U32)};
-    GetCbuf(ctx, ret, binding, offset, 32, "ftou");
+    const auto cast{ctx.profile.has_gl_cbuf_ftou_bug ? "" : "ftou"};
+    GetCbuf(ctx, ret, binding, offset, 32, cast);
 }

 void EmitGetCbufF32(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
                     const IR::Value& offset) {
     const auto ret{ctx.var_alloc.Define(inst, GlslVarType::F32)};
-    GetCbuf(ctx, ret, binding, offset, 32);
+    const auto cast{ctx.profile.has_gl_cbuf_ftou_bug ? "utof" : ""};
+    GetCbuf(ctx, ret, binding, offset, 32, cast);
 }

 void EmitGetCbufU32x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
                       const IR::Value& offset) {
     const auto cbuf{fmt::format("{}_cbuf{}", ctx.stage_name, binding.U32())};
+    const auto cast{ctx.profile.has_gl_cbuf_ftou_bug ? "" : "ftou"};
     if (offset.IsImmediate()) {
         static constexpr u32 cbuf_size{0x10000};
         const u32 u32_offset{offset.U32()};
@@ -145,26 +152,26 @@ void EmitGetCbufU32x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
             return;
         }
         if (u32_offset % 2 == 0) {
-            ctx.AddU32x2("{}=ftou({}[{}].{}{});", inst, cbuf, u32_offset / 16,
+            ctx.AddU32x2("{}={}({}[{}].{}{});", inst, cast, cbuf, u32_offset / 16,
                          OffsetSwizzle(u32_offset), OffsetSwizzle(u32_offset + 4));
         } else {
-            ctx.AddU32x2("{}=uvec2(ftou({}[{}].{}),ftou({}[{}].{}));", inst, cbuf, u32_offset / 16,
-                         OffsetSwizzle(u32_offset), cbuf, (u32_offset + 4) / 16,
-                         OffsetSwizzle(u32_offset + 4));
+            ctx.AddU32x2("{}=uvec2({}({}[{}].{}),{}({}[{}].{}));", inst, cast, cbuf,
+                         u32_offset / 16, OffsetSwizzle(u32_offset), cast, cbuf,
+                         (u32_offset + 4) / 16, OffsetSwizzle(u32_offset + 4));
         }
         return;
     }
     const auto offset_var{ctx.var_alloc.Consume(offset)};
     if (!ctx.profile.has_gl_component_indexing_bug) {
-        ctx.AddU32x2("{}=uvec2(ftou({}[{}>>4][({}>>2)%4]),ftou({}[({}+4)>>4][(({}+4)>>2)%4]));",
-                     inst, cbuf, offset_var, offset_var, cbuf, offset_var, offset_var);
+        ctx.AddU32x2("{}=uvec2({}({}[{}>>4][({}>>2)%4]),{}({}[({}+4)>>4][(({}+4)>>2)%4]));", inst,
+                     cast, cbuf, offset_var, offset_var, cast, cbuf, offset_var, offset_var);
         return;
     }
     const auto ret{ctx.var_alloc.Define(inst, GlslVarType::U32x2)};
     const auto cbuf_offset{fmt::format("{}>>2", offset_var)};
     for (u32 swizzle = 0; swizzle < 4; ++swizzle) {
-        ctx.Add("if(({}&3)=={}){}=uvec2(ftou({}[{}>>4].{}),ftou({}[({}+4)>>4].{}));", cbuf_offset,
-                swizzle, ret, cbuf, offset_var, "xyzw"[swizzle], cbuf, offset_var,
+        ctx.Add("if(({}&3)=={}){}=uvec2({}({}[{}>>4].{}),{}({}[({}+4)>>4].{}));", cbuf_offset,
+                swizzle, ret, cast, cbuf, offset_var, "xyzw"[swizzle], cast, cbuf, offset_var,
                 "xyzw"[(swizzle + 1) % 4]);
     }
 }
@@ -221,6 +228,22 @@ void EmitGetAttribute(EmitContext& ctx, IR::Inst& inst, IR::Attribute attr,
     }
 }

+void EmitGetAttributeU32(EmitContext& ctx, IR::Inst& inst, IR::Attribute attr, std::string_view) {
+    switch (attr) {
+    case IR::Attribute::PrimitiveId:
+        ctx.AddU32("{}=uint(gl_PrimitiveID);", inst);
+        break;
+    case IR::Attribute::InstanceId:
+        ctx.AddU32("{}=uint(gl_InstanceID);", inst);
+        break;
+    case IR::Attribute::VertexId:
+        ctx.AddU32("{}=uint(gl_VertexID);", inst);
+        break;
+    default:
+        throw NotImplementedException("Get U32 attribute {}", attr);
+    }
+}
+
 void EmitSetAttribute(EmitContext& ctx, IR::Attribute attr, std::string_view value,
                       [[maybe_unused]] std::string_view vertex) {
     if (IR::IsGeneric(attr)) {
@@ -125,11 +125,11 @@ void EmitFPNeg16([[maybe_unused]] EmitContext& ctx, [[maybe_unused]] IR::Inst& inst,
 }

 void EmitFPNeg32(EmitContext& ctx, IR::Inst& inst, std::string_view value) {
-    ctx.AddF32("{}=-({});", inst, value);
+    ctx.AddF32("{}=0.f-({});", inst, value);
 }

 void EmitFPNeg64(EmitContext& ctx, IR::Inst& inst, std::string_view value) {
-    ctx.AddF64("{}=-({});", inst, value);
+    ctx.AddF64("{}=double(0.)-({});", inst, value);
 }

 void EmitFPSin(EmitContext& ctx, IR::Inst& inst, std::string_view value) {
@@ -60,6 +60,8 @@
 void EmitGetCbufU32x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
                       const IR::Value& offset);
 void EmitGetAttribute(EmitContext& ctx, IR::Inst& inst, IR::Attribute attr,
                       std::string_view vertex);
+void EmitGetAttributeU32(EmitContext& ctx, IR::Inst& inst, IR::Attribute attr,
+                         std::string_view vertex);
 void EmitSetAttribute(EmitContext& ctx, IR::Attribute attr, std::string_view value,
                       std::string_view vertex);
 void EmitGetAttributeIndexed(EmitContext& ctx, IR::Inst& inst, std::string_view offset,
@@ -87,11 +87,11 @@ void EmitUDiv32(EmitContext& ctx, IR::Inst& inst, std::string_view a, std::string_view b) {
 }

 void EmitINeg32(EmitContext& ctx, IR::Inst& inst, std::string_view value) {
-    ctx.AddU32("{}=uint(-({}));", inst, value);
+    ctx.AddU32("{}=uint(int(0)-int({}));", inst, value);
 }

 void EmitINeg64(EmitContext& ctx, IR::Inst& inst, std::string_view value) {
-    ctx.AddU64("{}=-({});", inst, value);
+    ctx.AddU64("{}=uint64_t(int64_t(0)-int64_t({}));", inst, value);
 }

 void EmitIAbs32(EmitContext& ctx, IR::Inst& inst, std::string_view value) {
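These rewrites (like the EmitFPNeg change above) only respell negation for the driver's GLSL compiler; the resulting bit pattern is unchanged, which this sketch verifies for the 32-bit case:

#include <cstdint>
#include <cstdio>

int main() {
    const std::int32_t x = 5;
    const std::uint32_t neg = static_cast<std::uint32_t>(-x);    // old spelling: uint(-(x))
    const std::uint32_t sub = static_cast<std::uint32_t>(0 - x); // new spelling: uint(int(0)-int(x))
    std::printf("%08x %08x equal=%d\n", neg, sub, neg == sub);
}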
@@ -90,7 +90,9 @@ void EmitPhiMove(EmitContext& ctx, const IR::Value& phi_value, const IR::Value& value) {
     if (phi_reg == val_reg) {
         return;
     }
-    ctx.Add("{}={};", phi_reg, val_reg);
+    const bool needs_workaround{ctx.profile.has_gl_bool_ref_bug && phi_type == IR::Type::U1};
+    const auto suffix{needs_workaround ? "?true:false" : ""};
+    ctx.Add("{}={}{};", phi_reg, val_reg, suffix);
 }

 void EmitPrologue(EmitContext& ctx) {
@@ -428,9 +428,10 @@ void EmitContext::DefineConstantBuffers(Bindings& bindings) {
         return;
     }
     for (const auto& desc : info.constant_buffer_descriptors) {
-        header += fmt::format(
-            "layout(std140,binding={}) uniform {}_cbuf_{}{{vec4 {}_cbuf{}[{}];}};",
-            bindings.uniform_buffer, stage_name, desc.index, stage_name, desc.index, 4 * 1024);
+        const auto cbuf_type{profile.has_gl_cbuf_ftou_bug ? "uvec4" : "vec4"};
+        header += fmt::format("layout(std140,binding={}) uniform {}_cbuf_{}{{{} {}_cbuf{}[{}];}};",
+                              bindings.uniform_buffer, stage_name, desc.index, cbuf_type,
+                              stage_name, desc.index, 4 * 1024);
         bindings.uniform_buffer += desc.count;
     }
 }
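For illustration, the declaration this emits for each flag value (a sketch; the binding, stage name, and index are arbitrary):

#include <cstdio>
#include <initializer_list>

int main() {
    for (const bool has_gl_cbuf_ftou_bug : {false, true}) {
        const char* cbuf_type = has_gl_cbuf_ftou_bug ? "uvec4" : "vec4";
        std::printf("layout(std140,binding=0) uniform vs_cbuf_0{%s vs_cbuf0[%d];};\n",
                    cbuf_type, 4 * 1024);
    }
}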
@@ -355,6 +355,31 @@ Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, Id vertex) {
     }
 }

+Id EmitGetAttributeU32(EmitContext& ctx, IR::Attribute attr, Id) {
+    switch (attr) {
+    case IR::Attribute::PrimitiveId:
+        return ctx.OpLoad(ctx.U32[1], ctx.primitive_id);
+    case IR::Attribute::InstanceId:
+        if (ctx.profile.support_vertex_instance_id) {
+            return ctx.OpLoad(ctx.U32[1], ctx.instance_id);
+        } else {
+            const Id index{ctx.OpLoad(ctx.U32[1], ctx.instance_index)};
+            const Id base{ctx.OpLoad(ctx.U32[1], ctx.base_instance)};
+            return ctx.OpISub(ctx.U32[1], index, base);
+        }
+    case IR::Attribute::VertexId:
+        if (ctx.profile.support_vertex_instance_id) {
+            return ctx.OpLoad(ctx.U32[1], ctx.vertex_id);
+        } else {
+            const Id index{ctx.OpLoad(ctx.U32[1], ctx.vertex_index)};
+            const Id base{ctx.OpLoad(ctx.U32[1], ctx.base_vertex)};
+            return ctx.OpISub(ctx.U32[1], index, base);
+        }
+    default:
+        throw NotImplementedException("Read U32 attribute {}", attr);
+    }
+}
+
 void EmitSetAttribute(EmitContext& ctx, IR::Attribute attr, Id value, [[maybe_unused]] Id vertex) {
     const std::optional<OutAttr> output{OutputAttrPointer(ctx, attr)};
     if (!output) {
@@ -53,6 +53,7 @@ Id EmitGetCbufU32(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset);
 Id EmitGetCbufF32(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset);
 Id EmitGetCbufU32x2(EmitContext& ctx, const IR::Value& binding, const IR::Value& offset);
 Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, Id vertex);
+Id EmitGetAttributeU32(EmitContext& ctx, IR::Attribute attr, Id vertex);
 void EmitSetAttribute(EmitContext& ctx, IR::Attribute attr, Id value, Id vertex);
 Id EmitGetAttributeIndexed(EmitContext& ctx, Id offset, Id vertex);
 void EmitSetAttributeIndexed(EmitContext& ctx, Id offset, Id value, Id vertex);
@@ -31,6 +31,8 @@ public:

     [[nodiscard]] virtual std::array<u32, 3> WorkgroupSize() const = 0;

+    virtual void Dump(u64 hash) = 0;
+
     [[nodiscard]] const ProgramHeader& SPH() const noexcept {
         return sph;
     }
@@ -40,6 +40,7 @@ OPCODE(GetCbufU32, U32, U32, U32, )
 OPCODE(GetCbufF32, F32, U32, U32, )
 OPCODE(GetCbufU32x2, U32x2, U32, U32, )
 OPCODE(GetAttribute, F32, Attribute, U32, )
+OPCODE(GetAttributeU32, U32, Attribute, U32, )
 OPCODE(SetAttribute, Void, Attribute, F32, U32, )
 OPCODE(GetAttributeIndexed, F32, U32, U32, )
 OPCODE(SetAttributeIndexed, Void, U32, F32, U32, )
@@ -389,6 +389,7 @@ void VisitUsages(Info& info, IR::Inst& inst) {
         info.uses_demote_to_helper_invocation = true;
         break;
     case IR::Opcode::GetAttribute:
+    case IR::Opcode::GetAttributeU32:
         info.loads.mask[static_cast<size_t>(inst.Arg(0).Attribute())] = true;
         break;
     case IR::Opcode::SetAttribute:
@@ -505,6 +505,29 @@ void FoldBitCast(IR::Inst& inst, IR::Opcode reverse) {
             return;
         }
     }
+    if constexpr (op == IR::Opcode::BitCastU32F32) {
+        // Workaround for new NVIDIA driver bug, where:
+        //   uint attr = ftou(itof(gl_InstanceID));
+        // always returned 0.
+        // We can instead manually optimize this and work around the driver bug:
+        //   uint attr = uint(gl_InstanceID);
+        if (arg_inst->GetOpcode() == IR::Opcode::GetAttribute) {
+            const IR::Attribute attr{arg_inst->Arg(0).Attribute()};
+            switch (attr) {
+            case IR::Attribute::PrimitiveId:
+            case IR::Attribute::InstanceId:
+            case IR::Attribute::VertexId:
+                break;
+            default:
+                return;
+            }
+            // Replace the bitcasts with an integer attribute get
+            inst.ReplaceOpcode(IR::Opcode::GetAttributeU32);
+            inst.SetArg(0, arg_inst->Arg(0));
+            inst.SetArg(1, arg_inst->Arg(1));
+            return;
+        }
+    }
 }

 void FoldInverseFunc(IR::Inst& inst, IR::Opcode reverse) {
@@ -65,6 +65,10 @@
     bool has_gl_component_indexing_bug{};
     /// The precise type qualifier is broken in the fragment stage of some drivers
     bool has_gl_precise_bug{};
+    /// Some drivers do not properly support floatBitsToUint when used on cbufs
+    bool has_gl_cbuf_ftou_bug{};
+    /// Some drivers poorly optimize boolean variable references
+    bool has_gl_bool_ref_bug{};
     /// Ignores SPIR-V ordered vs unordered using GLSL semantics
     bool ignore_nan_fp_comparisons{};
@@ -73,12 +73,12 @@ void MemoryManager::Unmap(GPUVAddr gpu_addr, std::size_t size) {
     }
     const auto submapped_ranges = GetSubmappedRange(gpu_addr, size);

-    for (const auto& [map_addr, map_size] : submapped_ranges) {
+    for (const auto& map : submapped_ranges) {
         // Flush and invalidate through the GPU interface, to be asynchronous if possible.
-        const std::optional<VAddr> cpu_addr = GpuToCpuAddress(map_addr);
+        const std::optional<VAddr> cpu_addr = GpuToCpuAddress(map.first);
         ASSERT(cpu_addr);

-        rasterizer->UnmapMemory(*cpu_addr, map_size);
+        rasterizer->UnmapMemory(*cpu_addr, map.second);
     }

     UpdateRange(gpu_addr, PageEntry::State::Unmapped, size);
@@ -265,8 +265,7 @@ size_t MemoryManager::BytesToMapEnd(GPUVAddr gpu_addr) const noexcept {
     return it->second - (gpu_addr - it->first);
 }

-void MemoryManager::ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size,
-                                  bool is_safe) const {
+void MemoryManager::ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const {
     std::size_t remaining_size{size};
     std::size_t page_index{gpu_src_addr >> page_bits};
     std::size_t page_offset{gpu_src_addr & page_mask};
@@ -274,15 +273,35 @@ void MemoryManager::ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size,
     while (remaining_size > 0) {
         const std::size_t copy_amount{
             std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)};
-        const auto page_addr{GpuToCpuAddress(page_index << page_bits)};
-        if (page_addr && *page_addr != 0) {
+
+        if (const auto page_addr{GpuToCpuAddress(page_index << page_bits)}; page_addr) {
             const auto src_addr{*page_addr + page_offset};
-            if (is_safe) {
-                // Flush must happen on the rasterizer interface, such that memory is always
-                // synchronous when it is read (even when in asynchronous GPU mode).
-                // Fixes Dead Cells title menu.
-                rasterizer->FlushRegion(src_addr, copy_amount);
-            }
+
+            // Flush must happen on the rasterizer interface, such that memory is always synchronous
+            // when it is read (even when in asynchronous GPU mode). Fixes Dead Cells title menu.
+            rasterizer->FlushRegion(src_addr, copy_amount);
             system.Memory().ReadBlockUnsafe(src_addr, dest_buffer, copy_amount);
         }
+
+        page_index++;
+        page_offset = 0;
+        dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount;
+        remaining_size -= copy_amount;
+    }
+}
+
+void MemoryManager::ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer,
+                                    const std::size_t size) const {
+    std::size_t remaining_size{size};
+    std::size_t page_index{gpu_src_addr >> page_bits};
+    std::size_t page_offset{gpu_src_addr & page_mask};
+
+    while (remaining_size > 0) {
+        const std::size_t copy_amount{
+            std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)};
+
+        if (const auto page_addr{GpuToCpuAddress(page_index << page_bits)}; page_addr) {
+            const auto src_addr{*page_addr + page_offset};
+            system.Memory().ReadBlockUnsafe(src_addr, dest_buffer, copy_amount);
         } else {
             std::memset(dest_buffer, 0, copy_amount);
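Both implementations share the same page walk. A self-contained sketch of the chunking arithmetic (the 64 KiB page size is an assumption for illustration):

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
    constexpr std::size_t page_bits = 16; // assumed page size of 0x10000
    constexpr std::size_t page_size = std::size_t{1} << page_bits;
    constexpr std::size_t page_mask = page_size - 1;

    std::size_t gpu_addr = 0x1fff0;
    std::size_t remaining_size = 0x40;
    std::size_t page_index = gpu_addr >> page_bits;
    std::size_t page_offset = gpu_addr & page_mask;

    while (remaining_size > 0) {
        const std::size_t copy_amount = std::min(page_size - page_offset, remaining_size);
        std::printf("page=%zx offset=%zx copy=%zx\n", page_index, page_offset, copy_amount);
        ++page_index;    // advance to the next page...
        page_offset = 0; // ...which is read from its start
        remaining_size -= copy_amount;
    }
}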
@@ -295,17 +314,7 @@ void MemoryManager::ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size,
         }
     }

-void MemoryManager::ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const {
-    ReadBlockImpl(gpu_src_addr, dest_buffer, size, true);
-}
-
-void MemoryManager::ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer,
-                                    const std::size_t size) const {
-    ReadBlockImpl(gpu_src_addr, dest_buffer, size, false);
-}
-
-void MemoryManager::WriteBlockImpl(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size,
-                                   bool is_safe) {
+void MemoryManager::WriteBlock(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size) {
     std::size_t remaining_size{size};
     std::size_t page_index{gpu_dest_addr >> page_bits};
     std::size_t page_offset{gpu_dest_addr & page_mask};
@@ -313,15 +322,13 @@ void MemoryManager::WriteBlockImpl(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size,
     while (remaining_size > 0) {
         const std::size_t copy_amount{
             std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)};
-        const auto page_addr{GpuToCpuAddress(page_index << page_bits)};
-        if (page_addr && *page_addr != 0) {
+
+        if (const auto page_addr{GpuToCpuAddress(page_index << page_bits)}; page_addr) {
             const auto dest_addr{*page_addr + page_offset};
-            if (is_safe) {
-                // Invalidate must happen on the rasterizer interface, such that memory is always
-                // synchronous when it is written (even when in asynchronous GPU mode).
-                rasterizer->InvalidateRegion(dest_addr, copy_amount);
-            }
+
+            // Invalidate must happen on the rasterizer interface, such that memory is always
+            // synchronous when it is written (even when in asynchronous GPU mode).
+            rasterizer->InvalidateRegion(dest_addr, copy_amount);
             system.Memory().WriteBlockUnsafe(dest_addr, src_buffer, copy_amount);
         }
@@ -332,13 +339,26 @@ void MemoryManager::WriteBlockImpl(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size,
     }
 }

-void MemoryManager::WriteBlock(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size) {
-    WriteBlockImpl(gpu_dest_addr, src_buffer, size, true);
-}
-
 void MemoryManager::WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer,
                                      std::size_t size) {
-    WriteBlockImpl(gpu_dest_addr, src_buffer, size, false);
+    std::size_t remaining_size{size};
+    std::size_t page_index{gpu_dest_addr >> page_bits};
+    std::size_t page_offset{gpu_dest_addr & page_mask};
+
+    while (remaining_size > 0) {
+        const std::size_t copy_amount{
+            std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)};
+
+        if (const auto page_addr{GpuToCpuAddress(page_index << page_bits)}; page_addr) {
+            const auto dest_addr{*page_addr + page_offset};
+            system.Memory().WriteBlockUnsafe(dest_addr, src_buffer, copy_amount);
+        }
+
+        page_index++;
+        page_offset = 0;
+        src_buffer = static_cast<const u8*>(src_buffer) + copy_amount;
+        remaining_size -= copy_amount;
+    }
 }

 void MemoryManager::FlushRegion(GPUVAddr gpu_addr, size_t size) const {
@@ -415,15 +435,15 @@ std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
     size_t page_offset{gpu_addr & page_mask};
     std::optional<std::pair<GPUVAddr, std::size_t>> last_segment{};
     std::optional<VAddr> old_page_addr{};
-    const auto extend_size = [&last_segment, &page_index, &page_offset](std::size_t bytes) {
+    const auto extend_size = [this, &last_segment, &page_index](std::size_t bytes) {
         if (!last_segment) {
-            const GPUVAddr new_base_addr = (page_index << page_bits) + page_offset;
+            GPUVAddr new_base_addr = page_index << page_bits;
             last_segment = {new_base_addr, bytes};
         } else {
             last_segment->second += bytes;
         }
     };
-    const auto split = [&last_segment, &result] {
+    const auto split = [this, &last_segment, &result] {
         if (last_segment) {
             result.push_back(*last_segment);
             last_segment = std::nullopt;
@@ -432,7 +452,7 @@ std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
     while (remaining_size > 0) {
         const size_t num_bytes{std::min(page_size - page_offset, remaining_size)};
         const auto page_addr{GpuToCpuAddress(page_index << page_bits)};
-        if (!page_addr || *page_addr == 0) {
+        if (!page_addr) {
             split();
         } else if (old_page_addr) {
             if (*old_page_addr + page_size != *page_addr) {
@@ -155,11 +155,6 @@ private:

     void FlushRegion(GPUVAddr gpu_addr, size_t size) const;

-    void ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size,
-                       bool is_safe) const;
-    void WriteBlockImpl(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size,
-                        bool is_safe);
-
     [[nodiscard]] static constexpr std::size_t PageEntryIndex(GPUVAddr gpu_addr) {
         return (gpu_addr >> page_bits) & page_table_mask;
     }
@@ -182,17 +182,13 @@ Device::Device() {
         shader_backend = Settings::ShaderBackend::GLSL;
     }

-    if (shader_backend == Settings::ShaderBackend::GLSL && is_nvidia &&
-        !Settings::values.renderer_debug) {
+    if (shader_backend == Settings::ShaderBackend::GLSL && is_nvidia) {
         const std::string_view driver_version = version.substr(13);
         const int version_major =
             std::atoi(driver_version.substr(0, driver_version.find(".")).data());
         if (version_major >= 495) {
-            LOG_WARNING(Render_OpenGL, "NVIDIA drivers 495 and later causes significant problems "
-                                       "with yuzu. Forcing GLASM as a mitigation.");
-            shader_backend = Settings::ShaderBackend::GLASM;
-            use_assembly_shaders = true;
+            has_cbuf_ftou_bug = true;
+            has_bool_ref_bug = true;
         }
     }
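The substr(13) assumes GL_VERSION strings shaped like "4.6.0 NVIDIA <major>.<minor>". A standalone sketch of the parse (the sample string is illustrative; note atoi stops at the first '.', which is also why the non-null-terminated string_view data pointer is safe here):

#include <cstdio>
#include <cstdlib>
#include <string_view>

int main() {
    const std::string_view version{"4.6.0 NVIDIA 497.29"};
    const std::string_view driver_version = version.substr(13); // "497.29"
    const int version_major =
        std::atoi(driver_version.substr(0, driver_version.find('.')).data());
    std::printf("major=%d workarounds=%d\n", version_major, version_major >= 495);
}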
@@ -152,6 +152,14 @@ public:
         return need_fastmath_off;
     }

+    bool HasCbufFtouBug() const {
+        return has_cbuf_ftou_bug;
+    }
+
+    bool HasBoolRefBug() const {
+        return has_bool_ref_bug;
+    }
+
     Settings::ShaderBackend GetShaderBackend() const {
         return shader_backend;
     }
@@ -200,6 +208,8 @@ private:
     bool has_sparse_texture_2{};
     bool warp_size_potentially_larger_than_guest{};
     bool need_fastmath_off{};
+    bool has_cbuf_ftou_bug{};
+    bool has_bool_ref_bug{};

     std::string vendor_name;
 };
@@ -214,6 +214,8 @@ ShaderCache::ShaderCache(RasterizerOpenGL& rasterizer_, Core::Frontend::EmuWindow& emu_window_,
           .has_broken_fp16_float_controls = false,
           .has_gl_component_indexing_bug = device.HasComponentIndexingBug(),
           .has_gl_precise_bug = device.HasPreciseBug(),
+          .has_gl_cbuf_ftou_bug = device.HasCbufFtouBug(),
+          .has_gl_bool_ref_bug = device.HasBoolRefBug(),
           .ignore_nan_fp_comparisons = true,
           .gl_max_compute_smem_size = device.GetMaxComputeSharedMemorySize(),
       },
@@ -423,6 +425,11 @@ std::unique_ptr<GraphicsPipeline> ShaderCache::CreateGraphicsPipeline(

         const u32 cfg_offset{static_cast<u32>(env.StartAddress() + sizeof(Shader::ProgramHeader))};
         Shader::Maxwell::Flow::CFG cfg(env, pools.flow_block, cfg_offset, index == 0);
+
+        if (Settings::values.dump_shaders) {
+            env.Dump(key.unique_hashes[index]);
+        }
+
         if (!uses_vertex_a || index != 1) {
             // Normal path
             programs[index] = TranslateProgram(pools.inst, pools.block, env, cfg, host_info);
@@ -509,8 +516,12 @@ std::unique_ptr<ComputePipeline> ShaderCache::CreateComputePipeline(
     LOG_INFO(Render_OpenGL, "0x{:016x}", key.Hash());

     Shader::Maxwell::Flow::CFG cfg{env, pools.flow_block, env.StartAddress()};
-    auto program{TranslateProgram(pools.inst, pools.block, env, cfg, host_info)};

+    if (Settings::values.dump_shaders) {
+        env.Dump(key.Hash());
+    }
+
+    auto program{TranslateProgram(pools.inst, pools.block, env, cfg, host_info)};
     const u32 num_storage_buffers{Shader::NumDescriptors(program.info.storage_buffers_descriptors)};
     Shader::RuntimeInfo info;
     info.glasm_use_storage_buffers = num_storage_buffers <= device.GetMaxGLASMStorageBufferBlocks();
@@ -517,6 +517,9 @@ std::unique_ptr<GraphicsPipeline> PipelineCache::CreateGraphicsPipeline(

         const u32 cfg_offset{static_cast<u32>(env.StartAddress() + sizeof(Shader::ProgramHeader))};
         Shader::Maxwell::Flow::CFG cfg(env, pools.flow_block, cfg_offset, index == 0);
+        if (Settings::values.dump_shaders) {
+            env.Dump(key.unique_hashes[index]);
+        }
         if (!uses_vertex_a || index != 1) {
             // Normal path
             programs[index] = TranslateProgram(pools.inst, pools.block, env, cfg, host_info);
@@ -613,6 +616,12 @@ std::unique_ptr<ComputePipeline> PipelineCache::CreateComputePipeline(
     LOG_INFO(Render_Vulkan, "0x{:016x}", key.Hash());

     Shader::Maxwell::Flow::CFG cfg{env, pools.flow_block, env.StartAddress()};
+
+    // Dump it before error.
+    if (Settings::values.dump_shaders) {
+        env.Dump(key.Hash());
+    }
+
     auto program{TranslateProgram(pools.inst, pools.block, env, cfg, host_info)};
     const std::vector<u32> code{EmitSPIRV(profile, program)};
     device.SaveShader(code);
@@ -1344,7 +1344,6 @@ bool Image::ScaleUp(bool ignore) {
         return false;
     }
     has_scaled = true;
-    const auto& device = runtime->device;
     if (!scaled_image) {
         const bool is_2d = info.type == ImageType::e2D;
         const u32 scaled_width = resolution.ScaleUp(info.size.width);
@@ -1352,7 +1351,7 @@ bool Image::ScaleUp(bool ignore) {
         auto scaled_info = info;
         scaled_info.size.width = scaled_width;
         scaled_info.size.height = scaled_height;
-        scaled_image = MakeImage(device, scaled_info);
+        scaled_image = MakeImage(runtime->device, scaled_info);
         auto& allocator = runtime->memory_allocator;
         scaled_commit = MemoryCommit(allocator.Commit(scaled_image, MemoryUsage::DeviceLocal));
         ignore = false;
@@ -1361,18 +1360,13 @@ bool Image::ScaleUp(bool ignore) {
     if (ignore) {
         return true;
     }

     if (aspect_mask == 0) {
         aspect_mask = ImageAspectMask(info.format);
     }
-    static constexpr auto OPTIMAL_FORMAT = FormatType::Optimal;
-    const PixelFormat format = StorageFormat(info.format);
-    const auto vk_format = MaxwellToVK::SurfaceFormat(device, OPTIMAL_FORMAT, false, format).format;
-    const auto blit_usage = VK_FORMAT_FEATURE_BLIT_SRC_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT;
-    if (device.IsFormatSupported(vk_format, blit_usage, OPTIMAL_FORMAT)) {
-        BlitScale(*scheduler, *original_image, *scaled_image, info, aspect_mask, resolution);
+    if (NeedsScaleHelper()) {
+        return BlitScaleHelper(true);
     } else {
-        return BlitScaleHelper(true);
+        BlitScale(*scheduler, *original_image, *scaled_image, info, aspect_mask, resolution);
     }
     return true;
 }
@@ -1394,15 +1388,10 @@ bool Image::ScaleDown(bool ignore) {
     if (aspect_mask == 0) {
         aspect_mask = ImageAspectMask(info.format);
     }
-    static constexpr auto OPTIMAL_FORMAT = FormatType::Optimal;
-    const PixelFormat format = StorageFormat(info.format);
-    const auto& device = runtime->device;
-    const auto vk_format = MaxwellToVK::SurfaceFormat(device, OPTIMAL_FORMAT, false, format).format;
-    const auto blit_usage = VK_FORMAT_FEATURE_BLIT_SRC_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT;
-    if (device.IsFormatSupported(vk_format, blit_usage, OPTIMAL_FORMAT)) {
-        BlitScale(*scheduler, *scaled_image, *original_image, info, aspect_mask, resolution, false);
+    if (NeedsScaleHelper()) {
+        return BlitScaleHelper(false);
     } else {
-        return BlitScaleHelper(false);
+        BlitScale(*scheduler, *scaled_image, *original_image, info, aspect_mask, resolution, false);
     }
     return true;
 }
@@ -1470,6 +1459,20 @@ bool Image::BlitScaleHelper(bool scale_up) {
     return true;
 }

+bool Image::NeedsScaleHelper() const {
+    const auto& device = runtime->device;
+    const bool needs_msaa_helper = info.num_samples > 1 && device.CantBlitMSAA();
+    if (needs_msaa_helper) {
+        return true;
+    }
+    static constexpr auto OPTIMAL_FORMAT = FormatType::Optimal;
+    const PixelFormat format = StorageFormat(info.format);
+    const auto vk_format = MaxwellToVK::SurfaceFormat(device, OPTIMAL_FORMAT, false, format).format;
+    const auto blit_usage = VK_FORMAT_FEATURE_BLIT_SRC_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT;
+    const bool needs_blit_helper = !device.IsFormatSupported(vk_format, blit_usage, OPTIMAL_FORMAT);
+    return needs_blit_helper;
+}
+
 ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewInfo& info,
                      ImageId image_id_, Image& image)
     : VideoCommon::ImageViewBase{info, image.info, image_id_}, device{&runtime.device},
@@ -149,6 +149,8 @@ public:
 private:
     bool BlitScaleHelper(bool scale_up);

+    bool NeedsScaleHelper() const;
+
     VKScheduler* scheduler{};
     TextureCacheRuntime* runtime{};
@@ -3,6 +3,7 @@
 // Refer to the license.txt file included.

 #include <algorithm>
+#include <bit>
 #include <filesystem>
 #include <fstream>
 #include <memory>
@@ -14,6 +15,7 @@
 #include "common/common_types.h"
 #include "common/div_ceil.h"
+#include "common/fs/fs.h"
 #include "common/fs/path_util.h"
 #include "common/logging/log.h"
 #include "shader_recompiler/environment.h"
 #include "video_core/engines/kepler_compute.h"
@@ -57,6 +59,47 @@ static Shader::TextureType ConvertType(const Tegra::Texture::TICEntry& entry) {
     }
 }

+static std::string_view StageToPrefix(Shader::Stage stage) {
+    switch (stage) {
+    case Shader::Stage::VertexB:
+        return "VB";
+    case Shader::Stage::TessellationControl:
+        return "TC";
+    case Shader::Stage::TessellationEval:
+        return "TE";
+    case Shader::Stage::Geometry:
+        return "GS";
+    case Shader::Stage::Fragment:
+        return "FS";
+    case Shader::Stage::Compute:
+        return "CS";
+    case Shader::Stage::VertexA:
+        return "VA";
+    default:
+        return "UK";
+    }
+}
+
+static void DumpImpl(u64 hash, const u64* code, u32 read_highest, u32 read_lowest,
+                     u32 initial_offset, Shader::Stage stage) {
+    const auto shader_dir{Common::FS::GetYuzuPath(Common::FS::YuzuPath::DumpDir)};
+    const auto base_dir{shader_dir / "shaders"};
+    if (!Common::FS::CreateDir(shader_dir) || !Common::FS::CreateDir(base_dir)) {
+        LOG_ERROR(Common_Filesystem, "Failed to create shader dump directories");
+        return;
+    }
+    const auto prefix = StageToPrefix(stage);
+    const auto name{base_dir / fmt::format("{}{:016x}.ash", prefix, hash)};
+    const size_t real_size = read_highest - read_lowest + initial_offset;
+    const size_t padding_needed = ((32 - (real_size % 32)) % 32);
+    std::fstream shader_file(name, std::ios::out | std::ios::binary);
+    const size_t jump_index = initial_offset / sizeof(u64);
+    shader_file.write(reinterpret_cast<const char*>(code + jump_index), real_size);
+    for (size_t i = 0; i < padding_needed; i++) {
+        shader_file.put(0);
+    }
+}
+
 GenericEnvironment::GenericEnvironment(Tegra::MemoryManager& gpu_memory_, GPUVAddr program_base_,
                                        u32 start_address_)
     : gpu_memory{&gpu_memory_}, program_base{program_base_} {
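The dump pads each file up to a 32-byte multiple. A worked example of the padding arithmetic (the size is made up):

#include <cstddef>
#include <cstdio>

int main() {
    const std::size_t real_size = 0x1a4; // read_highest - read_lowest + initial_offset
    const std::size_t padding_needed = (32 - (real_size % 32)) % 32;
    std::printf("real=%zu pad=%zu total=%zu\n", real_size, padding_needed,
                real_size + padding_needed); // 420 + 28 = 448, a multiple of 32
}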
@@ -128,6 +171,10 @@ u64 GenericEnvironment::CalculateHash() const {
     return Common::CityHash64(data.get(), size);
 }

+void GenericEnvironment::Dump(u64 hash) {
+    DumpImpl(hash, code.data(), read_highest, read_lowest, initial_offset, stage);
+}
+
 void GenericEnvironment::Serialize(std::ofstream& file) const {
     const u64 code_size{static_cast<u64>(CachedSize())};
     const u64 num_texture_types{static_cast<u64>(texture_types.size())};
@@ -207,6 +254,7 @@ GraphicsEnvironment::GraphicsEnvironment(Tegra::Engines::Maxwell3D& maxwell3d_,
                                          u32 start_address_)
     : GenericEnvironment{gpu_memory_, program_base_, start_address_}, maxwell3d{&maxwell3d_} {
     gpu_memory->ReadBlock(program_base + start_address, &sph, sizeof(sph));
+    initial_offset = sizeof(sph);
     gp_passthrough_mask = maxwell3d->regs.gp_passthrough_mask;
     switch (program) {
     case Maxwell::ShaderProgram::VertexA:
@@ -323,14 +371,20 @@ void FileEnvironment::Deserialize(std::ifstream& file) {
     if (stage == Shader::Stage::Compute) {
         file.read(reinterpret_cast<char*>(&workgroup_size), sizeof(workgroup_size))
             .read(reinterpret_cast<char*>(&shared_memory_size), sizeof(shared_memory_size));
+        initial_offset = 0;
     } else {
         file.read(reinterpret_cast<char*>(&sph), sizeof(sph));
+        initial_offset = sizeof(sph);
         if (stage == Shader::Stage::Geometry) {
             file.read(reinterpret_cast<char*>(&gp_passthrough_mask), sizeof(gp_passthrough_mask));
         }
     }
 }

+void FileEnvironment::Dump(u64 hash) {
+    DumpImpl(hash, code.get(), read_highest, read_lowest, initial_offset, stage);
+}
+
 u64 FileEnvironment::ReadInstruction(u32 address) {
     if (address < read_lowest || address > read_highest) {
         throw Shader::LogicError("Out of bounds address {}", address);
@@ -57,6 +57,8 @@ public:

     [[nodiscard]] u64 CalculateHash() const;

+    void Dump(u64 hash) override;
+
     void Serialize(std::ofstream& file) const;

 protected:
@@ -82,6 +84,7 @@ protected:

     u32 cached_lowest = std::numeric_limits<u32>::max();
     u32 cached_highest = 0;
+    u32 initial_offset = 0;

     bool has_unbound_instructions = false;
 };
@@ -149,6 +152,8 @@ public:

     [[nodiscard]] std::array<u32, 3> WorkgroupSize() const override;

+    void Dump(u64 hash) override;
+
 private:
     std::unique_ptr<u64[]> code;
     std::unordered_map<u32, Shader::TextureType> texture_types;
|
||||
u32 texture_bound{};
|
||||
u32 read_lowest{};
|
||||
u32 read_highest{};
|
||||
u32 initial_offset{};
|
||||
};
|
||||
|
||||
void SerializePipeline(std::span<const char> key, std::span<const GenericEnvironment* const> envs,
|
||||
|
||||
@@ -1376,7 +1376,9 @@ void TextureCache<P>::ForEachSparseSegment(ImageBase& image, Func&& func) {
     using FuncReturn = typename std::invoke_result<Func, GPUVAddr, VAddr, size_t>::type;
     static constexpr bool RETURNS_BOOL = std::is_same_v<FuncReturn, bool>;
     const auto segments = gpu_memory.GetSubmappedRange(image.gpu_addr, image.guest_size_bytes);
-    for (const auto& [gpu_addr, size] : segments) {
+    for (auto& segment : segments) {
+        const auto gpu_addr = segment.first;
+        const auto size = segment.second;
         std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr);
         ASSERT(cpu_addr);
         if constexpr (RETURNS_BOOL) {
@@ -364,14 +364,14 @@ template <u32 GOB_EXTENT>
 [[nodiscard]] std::optional<SubresourceExtent> ResolveOverlapRightAddress2D(
     const ImageInfo& new_info, GPUVAddr gpu_addr, const ImageBase& overlap, bool strict_size) {
-    const u32 layer_stride = new_info.layer_stride;
-    const s32 new_size = layer_stride * new_info.resources.layers;
-    const s32 diff = static_cast<s32>(overlap.gpu_addr - gpu_addr);
+    const u64 layer_stride = new_info.layer_stride;
+    const u64 new_size = layer_stride * new_info.resources.layers;
+    const u64 diff = overlap.gpu_addr - gpu_addr;
     if (diff > new_size) {
         return std::nullopt;
     }
-    const s32 base_layer = diff / layer_stride;
-    const s32 mip_offset = diff % layer_stride;
+    const s32 base_layer = static_cast<s32>(diff / layer_stride);
+    const s32 mip_offset = static_cast<s32>(diff % layer_stride);
     const std::array offsets = CalculateMipLevelOffsets(new_info);
     const auto end = offsets.begin() + new_info.resources.levels;
     const auto it = std::find(offsets.begin(), end, static_cast<u32>(mip_offset));
|
||||
}
|
||||
}
|
||||
|
||||
if (ext_vertex_input_dynamic_state && driver_id == VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS) {
|
||||
const bool is_intel_windows = driver_id == VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS;
|
||||
if (ext_vertex_input_dynamic_state && is_intel_windows) {
|
||||
LOG_WARNING(Render_Vulkan, "Blacklisting Intel for VK_EXT_vertex_input_dynamic_state");
|
||||
ext_vertex_input_dynamic_state = false;
|
||||
}
|
||||
if (is_float16_supported && driver_id == VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS) {
|
||||
if (is_float16_supported && is_intel_windows) {
|
||||
// Intel's compiler crashes when using fp16 on Astral Chain, disable it for the time being.
|
||||
LOG_WARNING(Render_Vulkan, "Blacklisting Intel proprietary from float16 math");
|
||||
is_float16_supported = false;
|
||||
}
|
||||
if (is_intel_windows) {
|
||||
LOG_WARNING(Render_Vulkan, "Intel proprietary drivers do not support MSAA image blits");
|
||||
cant_blit_msaa = true;
|
||||
}
|
||||
|
||||
supports_d24_depth =
|
||||
IsFormatSupported(VK_FORMAT_D24_UNORM_S8_UINT,
|
||||
|
||||
@@ -350,6 +350,10 @@ public:
         return supports_d24_depth;
     }

+    bool CantBlitMSAA() const {
+        return cant_blit_msaa;
+    }
+
 private:
     /// Checks if the physical device is suitable.
     void CheckSuitability(bool requires_swapchain) const;
@@ -443,6 +447,7 @@ private:
     bool has_renderdoc{};       ///< Has RenderDoc attached
     bool has_nsight_graphics{}; ///< Has Nsight Graphics attached
     bool supports_d24_depth{};  ///< Supports D24 depth buffers.
+    bool cant_blit_msaa{};      ///< Does not support MSAA<->MSAA blitting.

     // Telemetry parameters
     std::string vendor_name; ///< Device's driver name.
@@ -51,6 +51,8 @@ void ConfigureDebug::SetConfiguration() {
     ui->enable_cpu_debugging->setChecked(Settings::values.cpu_debug_mode.GetValue());
     ui->enable_nsight_aftermath->setEnabled(runtime_lock);
     ui->enable_nsight_aftermath->setChecked(Settings::values.enable_nsight_aftermath.GetValue());
+    ui->dump_shaders->setEnabled(runtime_lock);
+    ui->dump_shaders->setChecked(Settings::values.dump_shaders.GetValue());
     ui->disable_macro_jit->setEnabled(runtime_lock);
     ui->disable_macro_jit->setChecked(Settings::values.disable_macro_jit.GetValue());
     ui->disable_loop_safety_checks->setEnabled(runtime_lock);
@@ -73,6 +75,7 @@ void ConfigureDebug::ApplyConfiguration() {
     Settings::values.renderer_shader_feedback = ui->enable_shader_feedback->isChecked();
     Settings::values.cpu_debug_mode = ui->enable_cpu_debugging->isChecked();
     Settings::values.enable_nsight_aftermath = ui->enable_nsight_aftermath->isChecked();
+    Settings::values.dump_shaders = ui->dump_shaders->isChecked();
     Settings::values.disable_shader_loop_safety_checks =
         ui->disable_loop_safety_checks->isChecked();
     Settings::values.disable_macro_jit = ui->disable_macro_jit->isChecked();
@@ -105,6 +105,19 @@
        </property>
       </widget>
      </item>
+     <item row="2" column="1">
+      <widget class="QCheckBox" name="dump_shaders">
+       <property name="enabled">
+        <bool>true</bool>
+       </property>
+       <property name="toolTip">
+        <string>When checked, it will dump all the original assembler shaders from the disk shader cache or game as found</string>
+       </property>
+       <property name="text">
+        <string>Dump Game Shaders</string>
+       </property>
+      </widget>
+     </item>
      <item row="0" column="1">
       <widget class="QCheckBox" name="disable_macro_jit">
        <property name="enabled">