GPU: Store uploaded GPU macros and keep track of the number of method parameters.

GPU: Macros are specific to the Maxwell3D engine, so handle them internally.
Merge pull request #245 from Subv/set_shader2
2018-03-18 11:51:46 -05:00 · 2018-03-18 11:51:45 -05:00 · 2018-03-17 21:19:39 -04:00 · 2018-03-17 18:32:57 -05:00 · 2018-03-17 18:32:57 -05:00 · 2018-03-17 18:32:56 -05:00
21 changed files with 475 additions and 103 deletions
--- a/src/core/arm/arm_interface.h
+++ b/src/core/arm/arm_interface.h
@@ -39,8 +39,12 @@ public:
        Run(1);
    }

+    /// Maps a backing memory region for the CPU
    virtual void MapBackingMemory(VAddr address, size_t size, u8* memory,
-                                  Kernel::VMAPermission perms) {}
+                                  Kernel::VMAPermission perms) = 0;
+
+    /// Unmaps a region of memory that was previously mapped using MapBackingMemory
+    virtual void UnmapMemory(VAddr address, size_t size) = 0;

    /// Clear all instruction cache
    virtual void ClearInstructionCache() = 0;
--- a/src/core/arm/dynarmic/arm_dynarmic.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic.cpp
@@ -136,6 +136,10 @@ void ARM_Dynarmic::MapBackingMemory(u64 address, size_t size, u8* memory,
    inner_unicorn.MapBackingMemory(address, size, memory, perms);
 }

+void ARM_Dynarmic::UnmapMemory(u64 address, size_t size) {
+    inner_unicorn.UnmapMemory(address, size);
+}
+
 void ARM_Dynarmic::SetPC(u64 pc) {
    jit->SetPC(pc);
 }
--- a/src/core/arm/dynarmic/arm_dynarmic.h
+++ b/src/core/arm/dynarmic/arm_dynarmic.h
@@ -19,7 +19,7 @@ public:

    void MapBackingMemory(VAddr address, size_t size, u8* memory,
                          Kernel::VMAPermission perms) override;
-
+    void UnmapMemory(u64 address, size_t size) override;
    void SetPC(u64 pc) override;
    u64 GetPC() const override;
    u64 GetReg(int index) const override;
--- a/src/core/arm/unicorn/arm_unicorn.cpp
+++ b/src/core/arm/unicorn/arm_unicorn.cpp
@@ -77,6 +77,10 @@ void ARM_Unicorn::MapBackingMemory(VAddr address, size_t size, u8* memory,
    CHECKED(uc_mem_map_ptr(uc, address, size, static_cast<u32>(perms), memory));
 }

+void ARM_Unicorn::UnmapMemory(VAddr address, size_t size) {
+    CHECKED(uc_mem_unmap(uc, address, size));
+}
+
 void ARM_Unicorn::SetPC(u64 pc) {
    CHECKED(uc_reg_write(uc, UC_ARM64_REG_PC, &pc));
 }
--- a/src/core/arm/unicorn/arm_unicorn.h
+++ b/src/core/arm/unicorn/arm_unicorn.h
@@ -14,6 +14,7 @@ public:
    ~ARM_Unicorn();
    void MapBackingMemory(VAddr address, size_t size, u8* memory,
                          Kernel::VMAPermission perms) override;
+    void UnmapMemory(VAddr address, size_t size) override;
    void SetPC(u64 pc) override;
    u64 GetPC() const override;
    u64 GetReg(int index) const override;
--- a/src/core/hle/kernel/kernel.h
+++ b/src/core/hle/kernel/kernel.h
@@ -33,10 +33,6 @@ enum class HandleType : u32 {
    ServerSession,
 };

-enum {
-    DEFAULT_STACK_SIZE = 0x10000,
-};
-
 enum class ResetType {
    OneShot,
    Sticky,
--- a/src/core/hle/kernel/process.cpp
+++ b/src/core/hle/kernel/process.cpp
@@ -117,11 +117,12 @@ void Process::ParseKernelCaps(const u32* kernel_caps, size_t len) {
 }

 void Process::Run(VAddr entry_point, s32 main_thread_priority, u32 stack_size) {
-    // Allocate and map stack
+    // Allocate and map the main thread stack
+    // TODO(bunnei): This is heap area that should be allocated by the kernel and not mapped as part
+    // of the user address space.
    vm_manager
-        .MapMemoryBlock(Memory::HEAP_VADDR_END - stack_size,
-                        std::make_shared<std::vector<u8>>(stack_size, 0), 0, stack_size,
-                        MemoryState::Heap)
+        .MapMemoryBlock(Memory::STACK_VADDR, std::make_shared<std::vector<u8>>(stack_size, 0), 0,
+                        stack_size, MemoryState::Mapped)
        .Unwrap();
    misc_memory_used += stack_size;
    memory_region->used += stack_size;
@@ -153,9 +154,9 @@ void Process::LoadModule(SharedPtr<CodeSet> module_, VAddr base_addr) {
    };

    // Map CodeSet segments
-    MapSegment(module_->code, VMAPermission::ReadExecute, MemoryState::Code);
-    MapSegment(module_->rodata, VMAPermission::Read, MemoryState::Static);
-    MapSegment(module_->data, VMAPermission::ReadWrite, MemoryState::Static);
+    MapSegment(module_->code, VMAPermission::ReadExecute, MemoryState::CodeStatic);
+    MapSegment(module_->rodata, VMAPermission::Read, MemoryState::CodeMutable);
+    MapSegment(module_->data, VMAPermission::ReadWrite, MemoryState::CodeMutable);
 }

 VAddr Process::GetLinearHeapAreaAddress() const {
@@ -182,6 +183,8 @@ ResultVal<VAddr> Process::HeapAllocate(VAddr target, u64 size, VMAPermission per
        // Initialize heap
        heap_memory = std::make_shared<std::vector<u8>>();
        heap_start = heap_end = target;
+    } else {
+        vm_manager.UnmapRange(heap_start, heap_end - heap_start);
    }

    // If necessary, expand backing vector to cover new heap extents.
@@ -201,7 +204,7 @@ ResultVal<VAddr> Process::HeapAllocate(VAddr target, u64 size, VMAPermission per
                                                       size, MemoryState::Heap));
    vm_manager.Reprotect(vma, perms);

-    heap_used += size;
+    heap_used = size;
    memory_region->used += size;

    return MakeResult<VAddr>(heap_end - size);
@@ -288,7 +291,7 @@ ResultCode Process::MirrorMemory(VAddr dst_addr, VAddr src_addr, u64 size) {

    CASCADE_RESULT(auto new_vma,
                   vm_manager.MapMemoryBlock(dst_addr, backing_block, backing_block_offset, size,
-                                             vma->second.meminfo_state));
+                                             MemoryState::Mapped));
    // Protect mirror with permissions from old region
    vm_manager.Reprotect(new_vma, vma->second.permissions);
    // Remove permissions from old region
--- a/src/core/hle/kernel/svc.cpp
+++ b/src/core/hle/kernel/svc.cpp
@@ -317,13 +317,13 @@ static ResultCode GetInfo(u64* result, u64 info_id, u64 handle, u64 info_sub_id)
        *result = Core::CurrentProcess()->allowed_thread_priority_mask;
        break;
    case GetInfoType::MapRegionBaseAddr:
-        *result = vm_manager.GetMapRegionBaseAddr();
+        *result = Memory::MAP_REGION_VADDR;
        break;
    case GetInfoType::MapRegionSize:
-        *result = vm_manager.GetAddressSpaceSize();
+        *result = Memory::MAP_REGION_SIZE;
        break;
    case GetInfoType::HeapRegionBaseAddr:
-        *result = vm_manager.GetNewMapRegionBaseAddr() + vm_manager.GetNewMapRegionSize();
+        *result = Memory::HEAP_VADDR;
        break;
    case GetInfoType::HeapRegionSize:
        *result = Memory::HEAP_SIZE;
@@ -347,10 +347,10 @@ static ResultCode GetInfo(u64* result, u64 info_id, u64 handle, u64 info_sub_id)
        *result = vm_manager.GetAddressSpaceSize();
        break;
    case GetInfoType::NewMapRegionBaseAddr:
-        *result = vm_manager.GetNewMapRegionBaseAddr();
+        *result = Memory::NEW_MAP_REGION_VADDR;
        break;
    case GetInfoType::NewMapRegionSize:
-        *result = vm_manager.GetNewMapRegionSize();
+        *result = Memory::NEW_MAP_REGION_SIZE;
        break;
    case GetInfoType::IsVirtualAddressMemoryEnabled:
        *result = Core::CurrentProcess()->is_virtual_address_memory_enabled;
@@ -468,7 +468,7 @@ static ResultCode QueryProcessMemory(MemoryInfo* memory_info, PageInfo* /*page_i
        memory_info->base_address = 0;
        memory_info->permission = static_cast<u32>(VMAPermission::None);
        memory_info->size = 0;
-        memory_info->type = static_cast<u32>(MemoryState::Free);
+        memory_info->type = static_cast<u32>(MemoryState::Unmapped);
    } else {
        memory_info->base_address = vma->second.base;
        memory_info->permission = static_cast<u32>(vma->second.permissions);
--- a/src/core/hle/kernel/thread.cpp
+++ b/src/core/hle/kernel/thread.cpp
@@ -314,7 +314,7 @@ ResultVal<SharedPtr<Thread>> Thread::Create(std::string name, VAddr entry_point,
        // TODO(Subv): Find the correct MemoryState for this region.
        vm_manager.MapMemoryBlock(Memory::TLS_AREA_VADDR + available_page * Memory::PAGE_SIZE,
                                  linheap_memory, offset, Memory::PAGE_SIZE,
-                                  MemoryState::ThreadLocalStorage);
+                                  MemoryState::ThreadLocal);
    }

    // Mark the slot as used
@@ -357,7 +357,7 @@ SharedPtr<Thread> SetupMainThread(VAddr entry_point, u32 priority,

    // Initialize new "main" thread
    auto thread_res = Thread::Create("main", entry_point, priority, 0, THREADPROCESSORID_0,
-                                     Memory::HEAP_VADDR_END, owner_process);
+                                     Memory::STACK_VADDR_END, owner_process);

    SharedPtr<Thread> thread = std::move(thread_res).Unwrap();

--- a/src/core/hle/kernel/vm_manager.cpp
+++ b/src/core/hle/kernel/vm_manager.cpp
@@ -18,8 +18,26 @@ namespace Kernel {

 static const char* GetMemoryStateName(MemoryState state) {
    static const char* names[] = {
-        "Free",   "Reserved",   "IO",      "Static", "Code",      "Private",
-        "Shared", "Continuous", "Aliased", "Alias",  "AliasCode", "Locked",
+        "Unmapped",
+        "Io",
+        "Normal",
+        "CodeStatic",
+        "CodeMutable",
+        "Heap",
+        "Shared",
+        "Unknown1", // 0x7: placeholder, MemoryState has no enumerator with this value
+        "ModuleCodeStatic",
+        "ModuleCodeMutable",
+        "IpcBuffer0",
+        "Mapped",
+        "ThreadLocal",
+        "TransferMemoryIsolated",
+        "TransferMemory",
+        "ProcessMemory",
+        "Unknown2", // 0x10: placeholder, MemoryState has no enumerator with this value
+        "IpcBuffer1",
+        "IpcBuffer3",
+        "KernelStack",
    };

    return names[(int)state];
@@ -142,7 +160,7 @@ VMManager::VMAIter VMManager::Unmap(VMAIter vma_handle) {
    VirtualMemoryArea& vma = vma_handle->second;
    vma.type = VMAType::Free;
    vma.permissions = VMAPermission::None;
-    vma.meminfo_state = MemoryState::Free;
+    vma.meminfo_state = MemoryState::Unmapped;

    vma.backing_block = nullptr;
    vma.offset = 0;
@@ -166,6 +184,9 @@ ResultCode VMManager::UnmapRange(VAddr target, u64 size) {
    }

    ASSERT(FindVMA(target)->second.size >= size);
+
+    Core::CPU().UnmapMemory(target, size);
+
    return RESULT_SUCCESS;
 }

@@ -377,19 +398,4 @@ u64 VMManager::GetAddressSpaceSize() {
    return MAX_ADDRESS;
 }

-VAddr VMManager::GetMapRegionBaseAddr() {
-    LOG_WARNING(Kernel, "(STUBBED) called");
-    return Memory::HEAP_VADDR;
-}
-
-VAddr VMManager::GetNewMapRegionBaseAddr() {
-    LOG_WARNING(Kernel, "(STUBBED) called");
-    return 0x8000000;
-}
-
-u64 VMManager::GetNewMapRegionSize() {
-    LOG_WARNING(Kernel, "(STUBBED) called");
-    return 0x8000000;
-}
-
 } // namespace Kernel
--- a/src/core/hle/kernel/vm_manager.h
+++ b/src/core/hle/kernel/vm_manager.h
@@ -41,15 +41,24 @@ enum class VMAPermission : u8 {

 /// Set of values returned in MemoryInfo.state by svcQueryMemory.
 enum class MemoryState : u32 {
-    Free = 0,
-    IO = 1,
-    Normal = 2,
-    Code = 3,
-    Static = 4,
-    Heap = 5,
-    Shared = 6,
-    Mapped = 6,
-    ThreadLocalStorage = 12,
+    Unmapped = 0x0,
+    Io = 0x1,
+    Normal = 0x2,
+    CodeStatic = 0x3,
+    CodeMutable = 0x4,
+    Heap = 0x5,
+    Shared = 0x6,
+    ModuleCodeStatic = 0x8,
+    ModuleCodeMutable = 0x9,
+    IpcBuffer0 = 0xA,
+    Mapped = 0xB,
+    ThreadLocal = 0xC,
+    TransferMemoryIsolated = 0xD,
+    TransferMemory = 0xE,
+    ProcessMemory = 0xF,
+    IpcBuffer1 = 0x11,
+    IpcBuffer3 = 0x12,
+    KernelStack = 0x13,
 };

 /**
@@ -66,7 +75,7 @@ struct VirtualMemoryArea {
    VMAType type = VMAType::Free;
    VMAPermission permissions = VMAPermission::None;
    /// Tag returned by svcQueryMemory. Not otherwise used.
-    MemoryState meminfo_state = MemoryState::Free;
+    MemoryState meminfo_state = MemoryState::Unmapped;

    // Settings for type = AllocatedMemoryBlock
    /// Memory block backing this VMA.
@@ -192,15 +201,6 @@ public:
    /// Gets the total address space address size, used by svcGetInfo
    u64 GetAddressSpaceSize();

-    /// Gets the map region base address, used by svcGetInfo
-    VAddr GetMapRegionBaseAddr();
-
-    /// Gets the base address for a new memory region, used by svcGetInfo
-    VAddr GetNewMapRegionBaseAddr();
-
-    /// Gets the size for a new memory region, used by svcGetInfo
-    u64 GetNewMapRegionSize();
-
    /// Each VMManager has its own page table, which is set as the main one when the owning process
    /// is scheduled.
    Memory::PageTable page_table;
--- a/src/core/hle/service/nvflinger/buffer_queue.cpp
+++ b/src/core/hle/service/nvflinger/buffer_queue.cpp
@@ -36,9 +36,7 @@ u32 BufferQueue::DequeueBuffer(u32 pixel_format, u32 width, u32 height) {
            return false;

        // Make sure that the parameters match.
-        auto& igbp_buffer = buffer.igbp_buffer;
-        return igbp_buffer.format == pixel_format && igbp_buffer.width == width &&
-               igbp_buffer.height == height;
+        return buffer.igbp_buffer.width == width && buffer.igbp_buffer.height == height;
    });
    if (itr == queue.end()) {
        LOG_CRITICAL(Service_NVDRV, "no free buffers for pixel_format=%d, width=%d, height=%d",
--- a/src/core/loader/elf.cpp
+++ b/src/core/loader/elf.cpp
@@ -414,7 +414,7 @@ ResultStatus AppLoader_ELF::Load(Kernel::SharedPtr<Kernel::Process>& process) {
    process->resource_limit =
        Kernel::ResourceLimit::GetForCategory(Kernel::ResourceLimitCategory::APPLICATION);

-    process->Run(codeset->entrypoint, 48, Kernel::DEFAULT_STACK_SIZE);
+    process->Run(codeset->entrypoint, 48, Memory::STACK_SIZE);

    is_loaded = true;
    return ResultStatus::Success;
--- a/src/core/loader/nro.cpp
+++ b/src/core/loader/nro.cpp
@@ -137,7 +137,7 @@ ResultStatus AppLoader_NRO::Load(Kernel::SharedPtr<Kernel::Process>& process) {
    process->address_mappings = default_address_mappings;
    process->resource_limit =
        Kernel::ResourceLimit::GetForCategory(Kernel::ResourceLimitCategory::APPLICATION);
-    process->Run(base_addr, 48, Kernel::DEFAULT_STACK_SIZE);
+    process->Run(base_addr, 48, Memory::STACK_SIZE);

    is_loaded = true;
    return ResultStatus::Success;
--- a/src/core/loader/nso.cpp
+++ b/src/core/loader/nso.cpp
@@ -165,7 +165,7 @@ ResultStatus AppLoader_NSO::Load(Kernel::SharedPtr<Kernel::Process>& process) {
    process->address_mappings = default_address_mappings;
    process->resource_limit =
        Kernel::ResourceLimit::GetForCategory(Kernel::ResourceLimitCategory::APPLICATION);
-    process->Run(Memory::PROCESS_IMAGE_VADDR, 48, Kernel::DEFAULT_STACK_SIZE);
+    process->Run(Memory::PROCESS_IMAGE_VADDR, 48, Memory::STACK_SIZE);

    is_loaded = true;
    return ResultStatus::Success;
--- a/src/core/memory.h
+++ b/src/core/memory.h
@@ -129,21 +129,6 @@ enum : VAddr {
    PROCESS_IMAGE_MAX_SIZE = 0x08000000,
    PROCESS_IMAGE_VADDR_END = PROCESS_IMAGE_VADDR + PROCESS_IMAGE_MAX_SIZE,

-    /// Area where IPC buffers are mapped onto.
-    IPC_MAPPING_VADDR = 0x04000000,
-    IPC_MAPPING_SIZE = 0x04000000,
-    IPC_MAPPING_VADDR_END = IPC_MAPPING_VADDR + IPC_MAPPING_SIZE,
-
-    /// Application heap (includes stack).
-    HEAP_VADDR = 0x108000000,
-    HEAP_SIZE = 0xF0000000,
-    HEAP_VADDR_END = HEAP_VADDR + HEAP_SIZE,
-
-    /// Area where shared memory buffers are mapped onto.
-    SHARED_MEMORY_VADDR = 0x10000000,
-    SHARED_MEMORY_SIZE = 0x04000000,
-    SHARED_MEMORY_VADDR_END = SHARED_MEMORY_VADDR + SHARED_MEMORY_SIZE,
-
    /// Maps 1:1 to an offset in FCRAM. Used for HW allocations that need to be linear in physical
    /// memory.
    LINEAR_HEAP_VADDR = 0x14000000,
@@ -176,14 +161,39 @@ enum : VAddr {
    SHARED_PAGE_SIZE = 0x00001000,
    SHARED_PAGE_VADDR_END = SHARED_PAGE_VADDR + SHARED_PAGE_SIZE,

-    /// Area where TLS (Thread-Local Storage) buffers are allocated.
-    TLS_AREA_VADDR = 0x228000000,
-    TLS_ENTRY_SIZE = 0x200,
-
    /// Equivalent to LINEAR_HEAP_VADDR, but expanded to cover the extra memory in the New 3DS.
    NEW_LINEAR_HEAP_VADDR = 0x30000000,
    NEW_LINEAR_HEAP_SIZE = 0x10000000,
    NEW_LINEAR_HEAP_VADDR_END = NEW_LINEAR_HEAP_VADDR + NEW_LINEAR_HEAP_SIZE,
+
+    /// Area where TLS (Thread-Local Storage) buffers are allocated.
+    TLS_AREA_VADDR = NEW_LINEAR_HEAP_VADDR_END,
+    TLS_ENTRY_SIZE = 0x200,
+    TLS_AREA_SIZE = 0x10000000,
+    TLS_ADREA_VADDR_END = TLS_AREA_VADDR + TLS_AREA_SIZE,
+
+    /// Application stack
+    STACK_VADDR = TLS_ADREA_VADDR_END,
+    STACK_SIZE = 0x10000,
+    STACK_VADDR_END = STACK_VADDR + STACK_SIZE,
+
+    /// Application heap
+    /// Size is confirmed to be a static value on fw 3.0.0
+    HEAP_VADDR = 0x108000000,
+    HEAP_SIZE = 0x180000000,
+    HEAP_VADDR_END = HEAP_VADDR + HEAP_SIZE,
+
+    /// New map region
+    /// Size is confirmed to be a static value on fw 3.0.0
+    NEW_MAP_REGION_VADDR = HEAP_VADDR_END,
+    NEW_MAP_REGION_SIZE = 0x80000000,
+    NEW_MAP_REGION_VADDR_END = NEW_MAP_REGION_VADDR + NEW_MAP_REGION_SIZE,
+
+    /// Map region
+    /// Size is confirmed to be a static value on fw 3.0.0
+    MAP_REGION_VADDR = NEW_MAP_REGION_VADDR_END,
+    MAP_REGION_SIZE = 0x1000000000,
+    MAP_REGION_VADDR_END = MAP_REGION_VADDR + MAP_REGION_SIZE,
 };

 /// Currently active page table
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -24,12 +24,37 @@ namespace Tegra {

 enum class BufferMethods {
    BindObject = 0,
+    SetGraphMacroCode = 0x45,
+    SetGraphMacroCodeArg = 0x46,
+    SetGraphMacroEntry = 0x47,
    CountBufferMethods = 0x100,
 };

-void GPU::WriteReg(u32 method, u32 subchannel, u32 value) {
-    LOG_WARNING(HW_GPU, "Processing method %08X on subchannel %u value %08X", method, subchannel,
-                value);
+void GPU::WriteReg(u32 method, u32 subchannel, u32 value, u32 remaining_params) {
+    LOG_WARNING(HW_GPU, "Processing method %08X on subchannel %u value %08X remaining params %u",
+                method, subchannel, value, remaining_params);
+
+    if (method == static_cast<u32>(BufferMethods::SetGraphMacroEntry)) {
+        // Prepare to upload a new macro, reset the upload counter.
+        LOG_DEBUG(HW_GPU, "Uploading GPU macro %08X", value);
+        current_macro_entry = value;
+        current_macro_code.clear();
+        return;
+    }
+
+    if (method == static_cast<u32>(BufferMethods::SetGraphMacroCodeArg)) {
+        // Append a new code word to the current macro.
+        current_macro_code.push_back(value);
+
+        // There are no more params remaining, submit the code to the 3D engine.
+        if (remaining_params == 0) {
+            maxwell_3d->SubmitMacroCode(current_macro_entry, std::move(current_macro_code));
+            current_macro_entry = InvalidGraphMacroEntry;
+            current_macro_code.clear();
+        }
+
+        return;
+    }

    if (method == static_cast<u32>(BufferMethods::BindObject)) {
        // Bind the current subchannel to the desired engine id.
@@ -54,7 +79,7 @@ void GPU::WriteReg(u32 method, u32 subchannel, u32 value) {
        fermi_2d->WriteReg(method, value);
        break;
    case EngineID::MAXWELL_B:
-        maxwell_3d->WriteReg(method, value);
+        maxwell_3d->WriteReg(method, value, remaining_params);
        break;
    case EngineID::MAXWELL_COMPUTE_B:
        maxwell_compute->WriteReg(method, value);
@@ -78,7 +103,8 @@ void GPU::ProcessCommandList(GPUVAddr address, u32 size) {
        case SubmissionMode::Increasing: {
            // Increase the method value with each argument.
            for (unsigned i = 0; i < header.arg_count; ++i) {
-                WriteReg(header.method + i, header.subchannel, Memory::Read32(current_addr));
+                WriteReg(header.method + i, header.subchannel, Memory::Read32(current_addr),
+                         header.arg_count - i - 1);
                current_addr += sizeof(u32);
            }
            break;
@@ -87,27 +113,31 @@ void GPU::ProcessCommandList(GPUVAddr address, u32 size) {
        case SubmissionMode::NonIncreasing: {
            // Use the same method value for all arguments.
            for (unsigned i = 0; i < header.arg_count; ++i) {
-                WriteReg(header.method, header.subchannel, Memory::Read32(current_addr));
+                WriteReg(header.method, header.subchannel, Memory::Read32(current_addr),
+                         header.arg_count - i - 1);
                current_addr += sizeof(u32);
            }
            break;
        }
        case SubmissionMode::IncreaseOnce: {
            ASSERT(header.arg_count.Value() >= 1);
+
            // Use the original method for the first argument and then the next method for all other
            // arguments.
-            WriteReg(header.method, header.subchannel, Memory::Read32(current_addr));
+            WriteReg(header.method, header.subchannel, Memory::Read32(current_addr),
+                     header.arg_count - 1);
            current_addr += sizeof(u32);
-            // Use the same method value for all arguments.
+
            for (unsigned i = 1; i < header.arg_count; ++i) {
-                WriteReg(header.method + 1, header.subchannel, Memory::Read32(current_addr));
+                WriteReg(header.method + 1, header.subchannel, Memory::Read32(current_addr),
+                         header.arg_count - i - 1);
                current_addr += sizeof(u32);
            }
            break;
        }
        case SubmissionMode::Inline: {
            // The register value is stored in the bits 16-28 as an immediate
-            WriteReg(header.method, header.subchannel, header.inline_data);
+            WriteReg(header.method, header.subchannel, header.inline_data, 0);
            break;
        }
        default:
--- a/src/video_core/command_processor.h
+++ b/src/video_core/command_processor.h
@@ -34,6 +34,4 @@ static_assert(std::is_standard_layout<CommandHeader>::value == true,
              "CommandHeader does not use standard layout");
 static_assert(sizeof(CommandHeader) == sizeof(u32), "CommandHeader has incorrect size!");

-void ProcessCommandList(VAddr address, u32 size);
-
 } // namespace Tegra
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -8,17 +8,101 @@
 namespace Tegra {
 namespace Engines {

+/// First register id that is actually a Macro call.
+constexpr u32 MacroRegistersStart = 0xE00;
+
+const std::unordered_map<u32, Maxwell3D::MethodInfo> Maxwell3D::method_handlers = {
+    {0xE24, {"SetShader", 5, &Maxwell3D::SetShader}},
+};
+
 Maxwell3D::Maxwell3D(MemoryManager& memory_manager) : memory_manager(memory_manager) {}

-void Maxwell3D::WriteReg(u32 method, u32 value) {
+void Maxwell3D::SubmitMacroCode(u32 entry, std::vector<u32> code) {
+    uploaded_macros[entry * 2 + MacroRegistersStart] = std::move(code);
+}
+
+void Maxwell3D::CallMacroMethod(u32 method, const std::vector<u32>& parameters) {
+    // TODO(Subv): Write an interpreter for the macros uploaded via registers 0x45 and 0x47
+
+    // The requested macro must have been uploaded already.
+    ASSERT_MSG(uploaded_macros.find(method) != uploaded_macros.end(), "Macro %08X was not uploaded",
+               method);
+
+    auto itr = method_handlers.find(method);
+    ASSERT_MSG(itr != method_handlers.end(), "Unhandled method call %08X", method);
+
+    ASSERT(itr->second.arguments == parameters.size());
+
+    (this->*itr->second.handler)(parameters);
+
+    // Reset the current macro and its parameters.
+    executing_macro = 0;
+    macro_params.clear();
+}
+
+void Maxwell3D::WriteReg(u32 method, u32 value, u32 remaining_params) {
    ASSERT_MSG(method < Regs::NUM_REGS,
               "Invalid Maxwell3D register, increase the size of the Regs structure");

+    // It is an error to write to a register other than the current macro's ARG register before it
+    // has finished execution.
+    if (executing_macro != 0) {
+        ASSERT(method == executing_macro + 1);
+    }
+
+    // Methods after 0xE00 are special, they're actually triggers for some microcode that was
+    // uploaded to the GPU during initialization.
+    if (method >= MacroRegistersStart) {
+        // We're trying to execute a macro
+        if (executing_macro == 0) {
+            // A macro call must begin by writing the macro method's register, not its argument.
+            ASSERT_MSG((method % 2) == 0,
+                       "Can't start macro execution by writing to the ARGS register");
+            executing_macro = method;
+        }
+
+        macro_params.push_back(value);
+
+        // Call the macro when there are no more parameters in the command buffer
+        if (remaining_params == 0) {
+            CallMacroMethod(executing_macro, macro_params);
+        }
+        return;
+    }
+
    regs.reg_array[method] = value;

 #define MAXWELL3D_REG_INDEX(field_name) (offsetof(Regs, field_name) / sizeof(u32))

    switch (method) {
+    case MAXWELL3D_REG_INDEX(code_address.code_address_high):
+    case MAXWELL3D_REG_INDEX(code_address.code_address_low): {
+        // Note: For some reason games (like Puyo Puyo Tetris) seem to write 0 to the CODE_ADDRESS
+        // register, we do not currently know if that's intended or a bug, so we assert it lest
+        // stuff breaks in other places (like the shader address calculation).
+        ASSERT_MSG(regs.code_address.CodeAddress() == 0, "Unexpected CODE_ADDRESS register value.");
+        break;
+    }
+    case MAXWELL3D_REG_INDEX(cb_bind[0].raw_config): {
+        ProcessCBBind(Regs::ShaderStage::Vertex);
+        break;
+    }
+    case MAXWELL3D_REG_INDEX(cb_bind[1].raw_config): {
+        ProcessCBBind(Regs::ShaderStage::TesselationControl);
+        break;
+    }
+    case MAXWELL3D_REG_INDEX(cb_bind[2].raw_config): {
+        ProcessCBBind(Regs::ShaderStage::TesselationEval);
+        break;
+    }
+    case MAXWELL3D_REG_INDEX(cb_bind[3].raw_config): {
+        ProcessCBBind(Regs::ShaderStage::Geometry);
+        break;
+    }
+    case MAXWELL3D_REG_INDEX(cb_bind[4].raw_config): {
+        ProcessCBBind(Regs::ShaderStage::Fragment);
+        break;
+    }
    case MAXWELL3D_REG_INDEX(draw.vertex_end_gl): {
        DrawArrays();
        break;
@@ -56,5 +140,59 @@ void Maxwell3D::DrawArrays() {
    LOG_WARNING(HW_GPU, "Game requested a DrawArrays, ignoring");
 }

+void Maxwell3D::SetShader(const std::vector<u32>& parameters) {
+    /**
+     * Parameters description:
+     * [0] = Shader Program.
+     * [1] = Unknown, presumably the shader id.
+     * [2] = Offset to the start of the shader, after the 0x30 bytes header.
+     * [3] = Shader Stage.
+     * [4] = Const Buffer Address >> 8 (a u32; it is widened before being shifted back).
+     */
+    auto shader_program = static_cast<Regs::ShaderProgram>(parameters[0]);
+    // TODO(Subv): This address is probably an offset from the CODE_ADDRESS register.
+    GPUVAddr address = parameters[2];
+    auto shader_stage = static_cast<Regs::ShaderStage>(parameters[3]);
+    GPUVAddr cb_address = static_cast<GPUVAddr>(parameters[4]) << 8;
+
+    auto& shader = state.shader_programs[static_cast<size_t>(shader_program)];
+    shader.program = shader_program;
+    shader.stage = shader_stage;
+    shader.address = address;
+
+    // Perform the same operations as the real macro code.
+    // TODO(Subv): Early exit if register 0xD1C + shader_program contains the same as params[1].
+    auto& shader_regs = regs.shader_config[static_cast<size_t>(shader_program)];
+    shader_regs.start_id = address;
+    // TODO(Subv): Write params[1] to register 0xD1C + shader_program.
+    // TODO(Subv): Write params[2] to register 0xD22 + shader_program.
+
+    // Note: This value is hardcoded in the macro's code.
+    static constexpr u32 DefaultCBSize = 0x10000;
+    regs.const_buffer.cb_size = DefaultCBSize;
+    regs.const_buffer.cb_address_high = cb_address >> 32;
+    regs.const_buffer.cb_address_low = cb_address & 0xFFFFFFFF;
+
+    // Write a hardcoded 0x11 to CB_BIND, this binds the current const buffer to buffer c1[] in the
+    // shader. It's likely that these are the constants for the shader.
+    regs.cb_bind[static_cast<size_t>(shader_stage)].valid.Assign(1);
+    regs.cb_bind[static_cast<size_t>(shader_stage)].index.Assign(1);
+
+    ProcessCBBind(shader_stage);
+}
+
+void Maxwell3D::ProcessCBBind(Regs::ShaderStage stage) {
+    // Bind the buffer currently in CB_ADDRESS to the specified index in the desired shader stage.
+    auto& shader = state.shader_stages[static_cast<size_t>(stage)];
+    auto& bind_data = regs.cb_bind[static_cast<size_t>(stage)];
+
+    auto& buffer = shader.const_buffers[bind_data.index];
+
+    buffer.enabled = bind_data.valid.Value() != 0;
+    buffer.index = bind_data.index;
+    buffer.address = regs.const_buffer.BufferAddress();
+    buffer.size = regs.const_buffer.cb_size;
+}
+
 } // namespace Engines
 } // namespace Tegra
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -4,6 +4,9 @@

 #pragma once

+#include <array>
+#include <unordered_map>
+#include <vector>
 #include "common/bit_field.h"
 #include "common/common_funcs.h"
 #include "common/common_types.h"
@@ -18,21 +21,58 @@ public:
    ~Maxwell3D() = default;

    /// Write the value to the register identified by method.
-    void WriteReg(u32 method, u32 value);
+    void WriteReg(u32 method, u32 value, u32 remaining_params);
+
+    /// Uploads the code for a GPU macro program associated with the specified entry.
+    void SubmitMacroCode(u32 entry, std::vector<u32> code);

    /// Register structure of the Maxwell3D engine.
    /// TODO(Subv): This structure will need to be made bigger as more registers are discovered.
    struct Regs {
        static constexpr size_t NUM_REGS = 0xE36;

+        static constexpr size_t NumCBData = 16;
+        static constexpr size_t NumVertexArrays = 32;
+        static constexpr size_t MaxShaderProgram = 6;
+        static constexpr size_t MaxShaderStage = 5;
+        // Maximum number of const buffers per shader stage.
+        static constexpr size_t MaxConstBuffers = 16;
+
        enum class QueryMode : u32 {
            Write = 0,
            Sync = 1,
        };

+        enum class ShaderProgram : u32 {
+            VertexA = 0,
+            VertexB = 1,
+            TesselationControl = 2,
+            TesselationEval = 3,
+            Geometry = 4,
+            Fragment = 5,
+        };
+
+        enum class ShaderStage : u32 {
+            Vertex = 0,
+            TesselationControl = 1,
+            TesselationEval = 2,
+            Geometry = 3,
+            Fragment = 4,
+        };
+
        union {
            struct {
-                INSERT_PADDING_WORDS(0x585);
+                INSERT_PADDING_WORDS(0x582);
+                struct {
+                    u32 code_address_high;
+                    u32 code_address_low;
+
+                    GPUVAddr CodeAddress() const {
+                        return static_cast<GPUVAddr>(
+                            (static_cast<GPUVAddr>(code_address_high) << 32) | code_address_low);
+                    }
+                } code_address;
+                INSERT_PADDING_WORDS(1);
                struct {
                    u32 vertex_end_gl;
                    u32 vertex_begin_gl;
@@ -54,7 +94,79 @@ public:
                            (static_cast<GPUVAddr>(query_address_high) << 32) | query_address_low);
                    }
                } query;
-                INSERT_PADDING_WORDS(0x772);
+
+                INSERT_PADDING_WORDS(0x3C);
+
+                struct {
+                    union {
+                        BitField<0, 12, u32> stride;
+                        BitField<12, 1, u32> enable;
+                    };
+                    u32 start_high;
+                    u32 start_low;
+                    u32 divisor;
+
+                    GPUVAddr StartAddress() const {
+                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(start_high) << 32) |
+                                                     start_low);
+                    }
+                } vertex_array[NumVertexArrays];
+
+                INSERT_PADDING_WORDS(0x40);
+
+                struct {
+                    u32 limit_high;
+                    u32 limit_low;
+
+                    GPUVAddr LimitAddress() const {
+                        return static_cast<GPUVAddr>((static_cast<GPUVAddr>(limit_high) << 32) |
+                                                     limit_low);
+                    }
+                } vertex_array_limit[NumVertexArrays];
+
+                struct {
+                    union {
+                        BitField<0, 1, u32> enable;
+                        BitField<4, 4, ShaderProgram> program;
+                    };
+                    u32 start_id;
+                    INSERT_PADDING_WORDS(1);
+                    u32 gpr_alloc;
+                    ShaderStage type;
+                    INSERT_PADDING_WORDS(9);
+                } shader_config[MaxShaderProgram];
+
+                INSERT_PADDING_WORDS(0x8C);
+
+                struct {
+                    u32 cb_size;
+                    u32 cb_address_high;
+                    u32 cb_address_low;
+                    u32 cb_pos;
+                    u32 cb_data[NumCBData];
+
+                    GPUVAddr BufferAddress() const {
+                        return static_cast<GPUVAddr>(
+                            (static_cast<GPUVAddr>(cb_address_high) << 32) | cb_address_low);
+                    }
+                } const_buffer;
+
+                INSERT_PADDING_WORDS(0x10);
+
+                struct {
+                    union {
+                        u32 raw_config;
+                        BitField<0, 1, u32> valid;
+                        BitField<4, 5, u32> index;
+                    };
+                    INSERT_PADDING_WORDS(7);
+                } cb_bind[MaxShaderStage];
+
+                INSERT_PADDING_WORDS(0x56);
+
+                u32 tex_cb_index;
+
+                INSERT_PADDING_WORDS(0x4B3);
            };
            std::array<u32, NUM_REGS> reg_array;
        };
@@ -62,21 +174,81 @@ public:

    static_assert(sizeof(Regs) == Regs::NUM_REGS * sizeof(u32), "Maxwell3D Regs has wrong size");

+    struct State {
+        struct ConstBufferInfo {
+            GPUVAddr address;
+            u32 index;
+            u32 size;
+            bool enabled;
+        };
+
+        struct ShaderProgramInfo {
+            Regs::ShaderStage stage;
+            Regs::ShaderProgram program;
+            GPUVAddr address;
+        };
+
+        struct ShaderStageInfo {
+            std::array<ConstBufferInfo, Regs::MaxConstBuffers> const_buffers;
+        };
+
+        std::array<ShaderStageInfo, Regs::MaxShaderStage> shader_stages;
+        std::array<ShaderProgramInfo, Regs::MaxShaderProgram> shader_programs;
+    };
+
+    State state{};
+
 private:
+    MemoryManager& memory_manager;
+
+    std::unordered_map<u32, std::vector<u32>> uploaded_macros;
+
+    /// Macro method that is currently being executed / being fed parameters.
+    u32 executing_macro = 0;
+    /// Parameters that have been submitted to the macro call so far.
+    std::vector<u32> macro_params;
+
+    /**
+     * Call a macro on this engine.
+     * @param method Method to call
+     * @param parameters Arguments to the method call
+     */
+    void CallMacroMethod(u32 method, const std::vector<u32>& parameters);
+
    /// Handles a write to the QUERY_GET register.
    void ProcessQueryGet();

+    /// Handles a write to the CB_BIND register.
+    void ProcessCBBind(Regs::ShaderStage stage);
+
    /// Handles a write to the VERTEX_END_GL register, triggering a draw.
    void DrawArrays();

-    MemoryManager& memory_manager;
+    /// Method call handlers
+    void SetShader(const std::vector<u32>& parameters);
+
+    struct MethodInfo {
+        const char* name;
+        u32 arguments;
+        void (Maxwell3D::*handler)(const std::vector<u32>& parameters);
+    };
+
+    static const std::unordered_map<u32, MethodInfo> method_handlers;
 };

 #define ASSERT_REG_POSITION(field_name, position)                                                  \
    static_assert(offsetof(Maxwell3D::Regs, field_name) == position * 4,                           \
                  "Field " #field_name " has invalid position")

+ASSERT_REG_POSITION(code_address, 0x582);
+ASSERT_REG_POSITION(draw, 0x585);
 ASSERT_REG_POSITION(query, 0x6C0);
+ASSERT_REG_POSITION(vertex_array[0], 0x700);
+ASSERT_REG_POSITION(vertex_array_limit[0], 0x7C0);
+ASSERT_REG_POSITION(shader_config[0], 0x800);
+ASSERT_REG_POSITION(const_buffer, 0x8E0);
+ASSERT_REG_POSITION(cb_bind[0], 0x904);
+ASSERT_REG_POSITION(tex_cb_index, 0x982);

 #undef ASSERT_REG_POSITION

--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -6,6 +6,7 @@

 #include <memory>
 #include <unordered_map>
+#include <vector>
 #include "common/common_types.h"
 #include "video_core/engines/fermi_2d.h"
 #include "video_core/engines/maxwell_3d.h"
@@ -38,8 +39,10 @@ public:
    std::unique_ptr<MemoryManager> memory_manager;

 private:
+    static constexpr u32 InvalidGraphMacroEntry = 0xFFFFFFFF;
+
    /// Writes a single register in the engine bound to the specified subchannel
-    void WriteReg(u32 method, u32 subchannel, u32 value);
+    void WriteReg(u32 method, u32 subchannel, u32 value, u32 remaining_params);

    /// Mapping of command subchannels to their bound engine ids.
    std::unordered_map<u32, EngineID> bound_engines;
@@ -50,6 +53,11 @@ private:
    std::unique_ptr<Engines::Fermi2D> fermi_2d;
    /// Compute engine
    std::unique_ptr<Engines::MaxwellCompute> maxwell_compute;
+
+    /// Entry of the macro that is currently being uploaded
+    u32 current_macro_entry = InvalidGraphMacroEntry;
+    /// Code being uploaded for the current macro
+    std::vector<u32> current_macro_code;
 };

 } // namespace Tegra
Author	SHA1	Message	Date
Subv	aa586fa268	GPU: Store uploaded GPU macros and keep track of the number of method parameters.	2018-03-18 11:51:46 -05:00
Subv	7ac8657432	GPU: Macros are specific to the Maxwell3D engine, so handle them internally.	2018-03-18 11:51:45 -05:00
bunnei	29981fa2eb	Merge pull request #245 from Subv/set_shader2 GPU: Store shader constbuffer bindings in the GPU state.	2018-03-17 21:19:39 -04:00
Subv	ccb8da1512	GPU: Renamed ShaderType to ShaderStage as that is less confusing.	2018-03-17 18:32:57 -05:00
Subv	88698c156f	GPU: Store shader constbuffer bindings in the GPU state.	2018-03-17 18:32:57 -05:00
Subv	66dae22790	GPU: Corrected some register offsets and removed superfluous macro registers.	2018-03-17 18:32:56 -05:00
Subv	1d9d9c16e8	GPU: Make the SetShader macro call do the same as the real macro's code. It'll now set the CB_SIZE, CB_ADDRESS and CB_BIND registers when it's called. Presumably this SetShader function is binding the constant shader uniforms to buffer 1 (c1[]).	2018-03-17 18:32:55 -05:00
Subv	579000e747	GPU: Corrected the parameter documentation for the SetShader macro call. Register 0xE24 is actually a macro that sets some shader parameters in the register structure. Macros are uploaded to the GPU at startup and have their own ISA, we'll probably write an interpreter for this in the future.	2018-03-17 13:55:42 -05:00
bunnei	516ef4f19f	Merge pull request #242 from Subv/set_shader GPU: Handle the SetShader method call (0xE24) and store the shader config.	2018-03-17 00:34:17 -04:00
bunnei	c286921739	Merge pull request #243 from Subv/vertex_buffer GPU: Added the vertex array registers.	2018-03-17 00:04:31 -04:00
Subv	f93d769a1c	GPU: Handle the SetShader method call (0xE24) and store the shader config.	2018-03-16 22:51:06 -05:00
Subv	d2888f7e90	GPU: Added the vertex array registers.	2018-03-16 22:47:45 -05:00
bunnei	cd4e8a989c	Merge pull request #241 from Subv/gpu_method_call GPU: Process command mode 5 (IncreaseOnce) differently from other commands	2018-03-16 22:28:22 -04:00
Subv	29feece4b8	GPU: Process command mode 5 (IncreaseOnce) differently from other commands. Accumulate all arguments before calling the desired method. Note: Maybe we should do the same for the NonIncreasing mode?	2018-03-16 20:32:44 -05:00
bunnei	0eff775264	Merge pull request #239 from Subv/shaders GPU: Added some shader-related registers.	2018-03-16 21:09:35 -04:00
bunnei	e453b09a61	Merge pull request #238 from bunnei/fix-buffer-check nvflinger: Remove superfluous buffer format check.	2018-03-16 21:04:39 -04:00
Subv	bf310a41b8	GPU: Assert that we get a 0 CODE_ADDRESS register in the 3D engine. Shader address calculation depends on this value to some extent, we do not currently know what it being 0 entails.	2018-03-16 19:24:41 -05:00
Subv	cbec739e7b	GPU: Added Maxwell registers for Shader Program control.	2018-03-16 19:23:11 -05:00
bunnei	494275fd38	nvflinger: Remove superfluous buffer format check.	2018-03-16 20:11:50 -04:00
bunnei	e7ba2a4447	Merge pull request #232 from bunnei/heap-fixes Various heap fixes for libtransistor	2018-03-16 20:06:27 -04:00
bunnei	cc6f22e0e4	process: MirrorMemory should use MemoryState::Mapped.	2018-03-16 19:24:54 -04:00
bunnei	e9a857ce82	process: Unmap previously allocated heap.	2018-03-16 18:32:25 -04:00
bunnei	403f8e79ea	arm_interface: Support unmapping previously mapped memory.	2018-03-16 18:32:24 -04:00
bunnei	34a29ad051	svc: Use more correct values for GetInfo MapRegion and NewMapRegion.	2018-03-16 18:32:23 -04:00
bunnei	8581404482	kernel: Move stack region outside of application heap.	2018-03-16 18:32:23 -04:00
bunnei	69ee9edd8d	memory: Add regions for map region, "new" map region, etc.	2018-03-16 18:32:22 -04:00
bunnei	3923b0f589	process: Fix stack memory state.	2018-03-16 18:32:21 -04:00
bunnei	8be7131033	MemoryState: Add additional memory states and improve naming.	2018-03-16 18:32:21 -04:00
bunnei	07ae1f972d	Merge pull request #237 from mailwl/nifm-module Service/NIFM: convert to module	2018-03-16 18:26:02 -04:00