diff --git a/UnleashedRecomp/CMakeLists.txt b/UnleashedRecomp/CMakeLists.txt index 66b60a4..7a6c277 100644 --- a/UnleashedRecomp/CMakeLists.txt +++ b/UnleashedRecomp/CMakeLists.txt @@ -310,7 +310,11 @@ endif() if (UNLEASHED_RECOMP_D3D12) find_package(directx-headers CONFIG REQUIRED) find_package(directx12-agility CONFIG REQUIRED) - target_compile_definitions(UnleashedRecomp PRIVATE UNLEASHED_RECOMP_D3D12) + target_compile_definitions(UnleashedRecomp PRIVATE + UNLEASHED_RECOMP_D3D12 + D3D12MA_USING_DIRECTX_HEADERS + D3D12MA_OPTIONS16_SUPPORTED + ) endif() if (CMAKE_SYSTEM_NAME MATCHES "Linux") diff --git a/UnleashedRecomp/gpu/rhi/plume_d3d12.cpp b/UnleashedRecomp/gpu/rhi/plume_d3d12.cpp index 395630c..073ea68 100644 --- a/UnleashedRecomp/gpu/rhi/plume_d3d12.cpp +++ b/UnleashedRecomp/gpu/rhi/plume_d3d12.cpp @@ -442,6 +442,8 @@ namespace plume { return D3D12_HEAP_TYPE_UPLOAD; case RenderHeapType::READBACK: return D3D12_HEAP_TYPE_READBACK; + case RenderHeapType::GPU_UPLOAD: + return D3D12_HEAP_TYPE_GPU_UPLOAD; default: assert(false && "Unknown heap type."); return D3D12_HEAP_TYPE_DEFAULT; @@ -2385,7 +2387,7 @@ namespace plume { range.End = readRange->end; } - void *outputData; + void *outputData = nullptr; d3d->Map(subresource, (readRange != nullptr) ? &range : nullptr, &outputData); return outputData; } @@ -2629,14 +2631,22 @@ namespace plume { // D3D12Pool - D3D12Pool::D3D12Pool(D3D12Device *device, const RenderPoolDesc &desc) { + D3D12Pool::D3D12Pool(D3D12Device *device, const RenderPoolDesc &desc, bool gpuUploadHeapFallback) { assert(device != nullptr); this->device = device; this->desc = desc; D3D12MA::POOL_DESC poolDesc = {}; - poolDesc.HeapProperties.Type = toD3D12(desc.heapType); + + // When using an UMA architecture without explicit support for GPU Upload heaps, we instead just make a custom heap with the same properties as Upload heaps. 
+ if ((desc.heapType == RenderHeapType::GPU_UPLOAD) && gpuUploadHeapFallback) { + poolDesc.HeapProperties = device->d3d->GetCustomHeapProperties(0, D3D12_HEAP_TYPE_UPLOAD); + } + else { + poolDesc.HeapProperties.Type = toD3D12(desc.heapType); + } + poolDesc.MinBlockCount = desc.minBlockCount; poolDesc.MaxBlockCount = desc.maxBlockCount; poolDesc.Flags |= desc.useLinearAlgorithm ? D3D12MA::POOL_FLAG_ALGORITHM_LINEAR : D3D12MA::POOL_FLAG_NONE; @@ -3390,13 +3400,15 @@ namespace plume { if (SUCCEEDED(res)) { triangleFanSupportOption = d3d12Options15.TriangleFanSupported; } - - // Check if dynamic depth bias is supported. + + // Check if dynamic depth bias and GPU upload heap are supported. bool dynamicDepthBiasOption = false; + bool gpuUploadHeapOption = false; D3D12_FEATURE_DATA_D3D12_OPTIONS16 d3d12Options16 = {}; res = deviceOption->CheckFeatureSupport(D3D12_FEATURE_D3D12_OPTIONS16, &d3d12Options16, sizeof(d3d12Options16)); if (SUCCEEDED(res)) { dynamicDepthBiasOption = d3d12Options16.DynamicDepthBiasSupported; + gpuUploadHeapOption = d3d12Options16.GPUUploadHeapSupported; } // Check if the architecture has UMA. @@ -3431,6 +3443,11 @@ namespace plume { capabilities.triangleFan = triangleFanSupportOption; capabilities.dynamicDepthBias = dynamicDepthBiasOption; capabilities.uma = uma; + + // Pretend GPU Upload heaps are supported if UMA is supported, as the backend has a workaround using a custom pool for it. 
+ capabilities.gpuUploadHeap = uma || gpuUploadHeapOption; + gpuUploadHeapFallback = uma && !gpuUploadHeapOption; + description.name = deviceName; description.dedicatedVideoMemory = adapterDesc.DedicatedVideoMemory; description.vendor = RenderDeviceVendor(adapterDesc.VendorId); @@ -3528,6 +3545,13 @@ namespace plume { colorTargetHeapAllocator = std::make_unique(this, TargetDescriptorHeapSize, D3D12_DESCRIPTOR_HEAP_TYPE_RTV); depthTargetHeapAllocator = std::make_unique(this, TargetDescriptorHeapSize, D3D12_DESCRIPTOR_HEAP_TYPE_DSV); + // Create the custom upload pool that will be used as the fallback when using an UMA architecture without explicit support for GPU Upload heaps. + if (gpuUploadHeapFallback) { + RenderPoolDesc poolDesc; + poolDesc.heapType = RenderHeapType::GPU_UPLOAD; + customUploadPool = std::make_unique(this, poolDesc, true); + } + // Create a command queue only for retrieving the timestamp frequency. Delete it immediately afterwards. std::unique_ptr timestampCommandQueue = std::make_unique(this, RenderCommandListType::DIRECT); res = timestampCommandQueue->d3d->GetTimestampFrequency(&timestampFrequency); @@ -3577,7 +3601,12 @@ namespace plume { } std::unique_ptr D3D12Device::createBuffer(const RenderBufferDesc &desc) { - return std::make_unique(this, nullptr, desc); + if ((desc.heapType == RenderHeapType::GPU_UPLOAD) && gpuUploadHeapFallback) { + return std::make_unique(this, customUploadPool.get(), desc); + } + else { + return std::make_unique(this, nullptr, desc); + } } std::unique_ptr D3D12Device::createTexture(const RenderTextureDesc &desc) { @@ -3589,7 +3618,7 @@ namespace plume { } std::unique_ptr D3D12Device::createPool(const RenderPoolDesc &desc) { - return std::make_unique(this, desc); + return std::make_unique(this, desc, gpuUploadHeapFallback); } std::unique_ptr D3D12Device::createPipelineLayout(const RenderPipelineLayoutDesc &desc) { diff --git a/UnleashedRecomp/gpu/rhi/plume_d3d12.h b/UnleashedRecomp/gpu/rhi/plume_d3d12.h index 
d4987fb..34461c0 100644 --- a/UnleashedRecomp/gpu/rhi/plume_d3d12.h +++ b/UnleashedRecomp/gpu/rhi/plume_d3d12.h @@ -329,7 +329,7 @@ namespace plume { D3D12Device *device = nullptr; RenderPoolDesc desc; - D3D12Pool(D3D12Device *device, const RenderPoolDesc &desc); + D3D12Pool(D3D12Device *device, const RenderPoolDesc &desc, bool gpuUploadHeapFallback); ~D3D12Pool() override; std::unique_ptr createBuffer(const RenderBufferDesc &desc) override; std::unique_ptr createTexture(const RenderTextureDesc &desc) override; @@ -430,9 +430,11 @@ namespace plume { std::unique_ptr samplerHeapAllocator; std::unique_ptr colorTargetHeapAllocator; std::unique_ptr depthTargetHeapAllocator; + std::unique_ptr customUploadPool; RenderDeviceCapabilities capabilities; RenderDeviceDescription description; uint64_t timestampFrequency = 1; + bool gpuUploadHeapFallback = false; D3D12Device(D3D12Interface *renderInterface, const std::string &preferredDeviceName); ~D3D12Device() override; diff --git a/UnleashedRecomp/gpu/rhi/plume_render_interface_types.h b/UnleashedRecomp/gpu/rhi/plume_render_interface_types.h index b0be159..568160a 100644 --- a/UnleashedRecomp/gpu/rhi/plume_render_interface_types.h +++ b/UnleashedRecomp/gpu/rhi/plume_render_interface_types.h @@ -351,7 +351,8 @@ namespace plume { UNKNOWN, DEFAULT, UPLOAD, - READBACK + READBACK, + GPU_UPLOAD }; enum class RenderTextureArrangement { @@ -1807,6 +1808,9 @@ namespace plume { // UMA. bool uma = false; + + // GPU Upload heap. 
+ bool gpuUploadHeap = false; }; struct RenderInterfaceCapabilities { diff --git a/UnleashedRecomp/gpu/rhi/plume_vulkan.cpp b/UnleashedRecomp/gpu/rhi/plume_vulkan.cpp index 477a431..94f91fa 100644 --- a/UnleashedRecomp/gpu/rhi/plume_vulkan.cpp +++ b/UnleashedRecomp/gpu/rhi/plume_vulkan.cpp @@ -808,6 +808,12 @@ namespace plume { bufferInfo.usage |= VK_BUFFER_USAGE_TRANSFER_DST_BIT; createInfo.flags |= VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT; break; + case RenderHeapType::GPU_UPLOAD: + bufferInfo.usage |= VK_BUFFER_USAGE_TRANSFER_SRC_BIT; + bufferInfo.usage |= VK_BUFFER_USAGE_TRANSFER_DST_BIT; + createInfo.flags |= VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT; + createInfo.requiredFlags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; + break; default: assert(false && "Unknown heap type."); break; @@ -833,7 +839,7 @@ namespace plume { } if (res != VK_SUCCESS) { - fprintf(stderr, "vkCreateBuffer failed with error code 0x%X.\n", res); + fprintf(stderr, "vmaCreateBuffer failed with error code 0x%X.\n", res); return; } } @@ -3887,6 +3893,15 @@ namespace plume { VkDeviceSize memoryHeapSize = 0; const VkPhysicalDeviceMemoryProperties *memoryProps = nullptr; vmaGetMemoryProperties(allocator, &memoryProps); + + constexpr VkMemoryPropertyFlags uploadHeapPropertyFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + bool hasHostVisibleDeviceLocalMemory = false; + for (uint32_t i = 0; i < memoryProps->memoryTypeCount; i++) { + if ((memoryProps->memoryTypes[i].propertyFlags & uploadHeapPropertyFlags) == uploadHeapPropertyFlags) { + hasHostVisibleDeviceLocalMemory = true; + } + } + for (uint32_t i = 0; i < memoryProps->memoryHeapCount; i++) { if (memoryProps->memoryHeaps[i].flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) { memoryHeapSize = std::max(memoryProps->memoryHeaps[i].size, memoryHeapSize); @@ -3907,6 +3922,8 @@ namespace plume { capabilities.preferHDR = memoryHeapSize > (512 * 1024 * 1024); 
capabilities.triangleFan = true; capabilities.dynamicDepthBias = true; + capabilities.uma = (description.type == RenderDeviceType::INTEGRATED) && hasHostVisibleDeviceLocalMemory; + capabilities.gpuUploadHeap = capabilities.uma; // Fill Vulkan-only capabilities. loadStoreOpNoneSupported = supportedOptionalExtensions.find(VK_EXT_LOAD_STORE_OP_NONE_EXTENSION_NAME) != supportedOptionalExtensions.end(); diff --git a/UnleashedRecomp/gpu/video.cpp b/UnleashedRecomp/gpu/video.cpp index 42b3921..19c7fb6 100644 --- a/UnleashedRecomp/gpu/video.cpp +++ b/UnleashedRecomp/gpu/video.cpp @@ -2114,40 +2114,54 @@ static void* LockVertexBuffer(GuestBuffer* buffer, uint32_t, uint32_t, uint32_t return LockBuffer(buffer, flags); } +static std::atomic g_bufferUploadCount = 0; + template static void UnlockBuffer(GuestBuffer* buffer, bool useCopyQueue) { - auto uploadBuffer = g_device->createBuffer(RenderBufferDesc::UploadBuffer(buffer->dataSize)); + auto copyBuffer = [&](T* dest) + { + auto src = reinterpret_cast(buffer->mappedMemory); - auto dest = reinterpret_cast(uploadBuffer->map()); - auto src = reinterpret_cast(buffer->mappedMemory); - - for (size_t i = 0; i < buffer->dataSize; i += sizeof(T)) - { - *dest = ByteSwap(*src); - ++dest; - ++src; - } - - uploadBuffer->unmap(); - - if (useCopyQueue) - { - ExecuteCopyCommandList([&] + for (size_t i = 0; i < buffer->dataSize; i += sizeof(T)) { - g_copyCommandList->copyBufferRegion(buffer->buffer->at(0), uploadBuffer->at(0), buffer->dataSize); - }); + *dest = ByteSwap(*src); + ++dest; + ++src; + } + }; + + if (useCopyQueue && g_capabilities.gpuUploadHeap) + { + copyBuffer(reinterpret_cast(buffer->buffer->map())); + buffer->buffer->unmap(); } else { - auto& commandList = g_commandLists[g_frame]; + auto uploadBuffer = g_device->createBuffer(RenderBufferDesc::UploadBuffer(buffer->dataSize)); + copyBuffer(reinterpret_cast(uploadBuffer->map())); + uploadBuffer->unmap(); - commandList->barriers(RenderBarrierStage::COPY, 
RenderBufferBarrier(buffer->buffer.get(), RenderBufferAccess::WRITE)); - commandList->copyBufferRegion(buffer->buffer->at(0), uploadBuffer->at(0), buffer->dataSize); - commandList->barriers(RenderBarrierStage::GRAPHICS, RenderBufferBarrier(buffer->buffer.get(), RenderBufferAccess::READ)); + if (useCopyQueue) + { + ExecuteCopyCommandList([&] + { + g_copyCommandList->copyBufferRegion(buffer->buffer->at(0), uploadBuffer->at(0), buffer->dataSize); + }); + } + else + { + auto& commandList = g_commandLists[g_frame]; - g_tempBuffers[g_frame].emplace_back(std::move(uploadBuffer)); + commandList->barriers(RenderBarrierStage::COPY, RenderBufferBarrier(buffer->buffer.get(), RenderBufferAccess::WRITE)); + commandList->copyBufferRegion(buffer->buffer->at(0), uploadBuffer->at(0), buffer->dataSize); + commandList->barriers(RenderBarrierStage::GRAPHICS, RenderBufferBarrier(buffer->buffer.get(), RenderBufferAccess::READ)); + + g_tempBuffers[g_frame].emplace_back(std::move(uploadBuffer)); + } } + + g_bufferUploadCount++; } template @@ -2325,10 +2339,11 @@ static void DrawProfiler() std::lock_guard lock(g_userHeap.physicalMutex); physicalDiagnostics = o1heapGetDiagnostics(g_userHeap.physicalHeap); } - + ImGui::Text("Heap Allocated: %d MB", int32_t(diagnostics.allocated / (1024 * 1024))); ImGui::Text("Physical Heap Allocated: %d MB", int32_t(physicalDiagnostics.allocated / (1024 * 1024))); ImGui::Text("GPU Waits: %d", int32_t(g_waitForGPUCount)); + ImGui::Text("Buffer Uploads: %d", int32_t(g_bufferUploadCount)); ImGui::NewLine(); ImGui::Text("Present Wait: %s", g_capabilities.presentWait ? "Supported" : "Unsupported"); @@ -2344,6 +2359,7 @@ static void DrawProfiler() ImGui::Text("Device Type: %s", DeviceTypeName(g_device->getDescription().type)); ImGui::Text("VRAM: %.2f MiB", (double)(g_device->getDescription().dedicatedVideoMemory) / (1024.0 * 1024.0)); ImGui::Text("UMA: %s", g_capabilities.uma ? 
"Supported" : "Unsupported"); + ImGui::Text("GPU Upload Heap: %s", g_capabilities.gpuUploadHeap ? "Supported" : "Unsupported"); const char* sdlVideoDriver = SDL_GetCurrentVideoDriver(); if (sdlVideoDriver != nullptr) @@ -3024,10 +3040,15 @@ static GuestTexture* CreateTexture(uint32_t width, uint32_t height, uint32_t dep return texture; } +static RenderHeapType GetBufferHeapType() +{ + return g_capabilities.gpuUploadHeap ? RenderHeapType::GPU_UPLOAD : RenderHeapType::DEFAULT; +} + static GuestBuffer* CreateVertexBuffer(uint32_t length) { auto buffer = g_userHeap.AllocPhysical(ResourceType::VertexBuffer); - buffer->buffer = g_device->createBuffer(RenderBufferDesc::VertexBuffer(length, RenderHeapType::DEFAULT, RenderBufferFlag::INDEX)); + buffer->buffer = g_device->createBuffer(RenderBufferDesc::VertexBuffer(length, GetBufferHeapType(), RenderBufferFlag::INDEX)); buffer->dataSize = length; #ifdef _DEBUG buffer->buffer->setName(fmt::format("Vertex Buffer {:X}", g_memory.MapVirtual(buffer))); @@ -3038,7 +3059,7 @@ static GuestBuffer* CreateVertexBuffer(uint32_t length) static GuestBuffer* CreateIndexBuffer(uint32_t length, uint32_t, uint32_t format) { auto buffer = g_userHeap.AllocPhysical(ResourceType::IndexBuffer); - buffer->buffer = g_device->createBuffer(RenderBufferDesc::IndexBuffer(length, RenderHeapType::DEFAULT)); + buffer->buffer = g_device->createBuffer(RenderBufferDesc::IndexBuffer(length, GetBufferHeapType())); buffer->dataSize = length; buffer->format = ConvertFormat(format); buffer->guestFormat = format;