From 96108e1759adedac227f204da2af325635680eff Mon Sep 17 00:00:00 2001 From: "Skyth (Asilkan)" <19259897+blueskythlikesclouds@users.noreply.github.com> Date: Wed, 5 Feb 2025 17:02:27 +0300 Subject: [PATCH] Use a separate upload buffer allocator for the main thread. (#281) --- UnleashedRecomp/gpu/video.cpp | 133 ++++++++++++++++++------ UnleashedRecomp/patches/fps_patches.cpp | 5 +- 2 files changed, 103 insertions(+), 35 deletions(-) diff --git a/UnleashedRecomp/gpu/video.cpp b/UnleashedRecomp/gpu/video.cpp index 3d4b291..cfa17cc 100644 --- a/UnleashedRecomp/gpu/video.cpp +++ b/UnleashedRecomp/gpu/video.cpp @@ -176,8 +176,8 @@ static RenderViewport g_viewport(0.0f, 0.0f, 1280.0f, 720.0f); static PipelineState g_pipelineState; static int32_t g_depthBias; static float g_slopeScaledDepthBias; -static UploadAllocation g_vertexShaderConstants; -static UploadAllocation g_pixelShaderConstants; +static uint32_t g_vertexShaderConstants[0x400]; +static uint32_t g_pixelShaderConstants[0x380]; static SharedConstants g_sharedConstants; static GuestTexture* g_textures[16]; static RenderSamplerDesc g_samplerDescs[16]; @@ -408,12 +408,9 @@ struct UploadAllocator std::vector buffers; uint32_t index = 0; uint32_t offset = 0; - Mutex mutex; UploadAllocation allocate(uint32_t size, uint32_t alignment) { - std::lock_guard lock(mutex); - assert(size <= UploadBuffer::SIZE); offset = (offset + alignment - 1) & ~(alignment - 1); @@ -474,6 +471,53 @@ struct UploadAllocator static UploadAllocator g_uploadAllocators[NUM_FRAMES]; +struct IntermediaryUploadAllocator +{ + static constexpr size_t SIZE = 16 * 1024 * 1024; + + std::vector> buffers; + uint32_t index = 0; + uint32_t offset = 0; + + uint8_t* allocate(uint32_t size) + { + assert(size <= SIZE); + + if (offset + size > SIZE) + { + ++index; + offset = 0; + } + + if (buffers.size() <= index) + buffers.resize(index + 1); + + auto& buffer = buffers[index]; + if (buffer == nullptr) + buffer = std::make_unique_for_overwrite(SIZE); + + auto result = buffer.get() + offset; + offset += ((size + 0xF) & ~0xF); + + return result; + } + + uint8_t* allocate(const void* memory, uint32_t size) + { + auto result = allocate(size); + memcpy(result, memory, size); + return result; + } + + void reset() + { + index = 0; + offset = 0; + } +}; + +static IntermediaryUploadAllocator g_intermediaryUploadAllocator; + static std::vector g_tempResources[NUM_FRAMES]; static std::vector> g_tempBuffers[NUM_FRAMES]; @@ -821,12 +865,16 @@ struct RenderCommand struct { - UploadAllocation allocation; + uint8_t* memory; + uint32_t index; + uint32_t size; } setVertexShaderConstants; struct { - UploadAllocation allocation; + uint8_t* memory; + uint32_t index; + uint32_t size; } setPixelShaderConstants; struct @@ -854,7 +902,8 @@ struct RenderCommand { uint32_t primitiveType; uint32_t primitiveCount; - UploadAllocation vertexStreamZeroData; + uint8_t* vertexStreamZeroData; + uint32_t vertexStreamZeroSize; uint32_t vertexStreamZeroStride; CsdFilterState csdFilterState; } drawPrimitiveUP; @@ -1485,9 +1534,6 @@ static void BeginCommandList() g_backBuffer->layout = RenderTextureLayout::UNKNOWN; - g_vertexShaderConstants = {}; - g_pixelShaderConstants = {}; - for (size_t i = 0; i < 16; i++) { g_sharedConstants.texture2DIndices[i] = TEXTURE_DESCRIPTOR_NULL_TEXTURE_2D; @@ -2498,6 +2544,7 @@ void Video::Present() g_dirtyStates = DirtyStates(true); g_uploadAllocators[g_frame].reset(); + g_intermediaryUploadAllocator.reset(); g_triangleFanIndexData.reset(); g_quadIndexData.reset(); @@ -2532,15 +2579,6 @@ void Video::Present() g_presentProfiler.Reset(); } -static void Present(GuestDevice* device) -{ - Video::Present(); - - // Invalidate vertex/pixel shader constants. - device->dirtyFlags[0] = ~0; - device->dirtyFlags[1] = ~0; -} - void Video::StartPipelinePrecompilation() { g_shouldPrecompilePipelines = true; @@ -3966,20 +4004,38 @@ static void FlushRenderStateForMainThread(GuestDevice* device, LocalRenderComman } } - if (device->dirtyFlags[0] != 0) + uint64_t dirtyFlags = device->dirtyFlags[0].get(); + if (dirtyFlags != 0) { + int startRegister = std::countl_zero(dirtyFlags); + int endRegister = 64 - std::countr_zero(dirtyFlags); + + uint32_t index = startRegister * 16; + uint32_t size = (endRegister - startRegister) * 64; + auto& cmd = queue.enqueue(); cmd.type = RenderCommandType::SetVertexShaderConstants; - cmd.setVertexShaderConstants.allocation = g_uploadAllocators[g_frame].allocate(device->vertexShaderFloatConstants, 0x1000, 0x100); + cmd.setVertexShaderConstants.memory = g_intermediaryUploadAllocator.allocate(&device->vertexShaderFloatConstants[index], size); + cmd.setVertexShaderConstants.index = index; + cmd.setVertexShaderConstants.size = size; device->dirtyFlags[0] = 0; } - if (device->dirtyFlags[1] != 0) + dirtyFlags = device->dirtyFlags[1].get(); + if (dirtyFlags != 0) { + int startRegister = std::countl_zero(dirtyFlags); + int endRegister = std::min(56, 64 - std::countr_zero(dirtyFlags)); + + uint32_t index = startRegister * 16; + uint32_t size = (endRegister - startRegister) * 64; + auto& cmd = queue.enqueue(); cmd.type = RenderCommandType::SetPixelShaderConstants; - cmd.setPixelShaderConstants.allocation = g_uploadAllocators[g_frame].allocate(device->pixelShaderFloatConstants, 0xE00, 0x100); + cmd.setPixelShaderConstants.memory = g_intermediaryUploadAllocator.allocate(&device->pixelShaderFloatConstants[index], size); + cmd.setPixelShaderConstants.index = index; + cmd.setPixelShaderConstants.size = size; device->dirtyFlags[1] = 0; } @@ -4040,13 +4096,19 @@ static void ProcSetSamplerState(const RenderCommand& cmd) static void ProcSetVertexShaderConstants(const RenderCommand& cmd) { - g_vertexShaderConstants = cmd.setVertexShaderConstants.allocation; + auto& args = cmd.setVertexShaderConstants; + assert((args.index * sizeof(uint32_t) + args.size) <= sizeof(g_vertexShaderConstants)); + + memcpy(&g_vertexShaderConstants[args.index], args.memory, args.size); g_dirtyStates.vertexShaderConstants = true; } static void ProcSetPixelShaderConstants(const RenderCommand& cmd) { - g_pixelShaderConstants = cmd.setPixelShaderConstants.allocation; + auto& args = cmd.setPixelShaderConstants; + assert((args.index * sizeof(uint32_t) + args.size) <= sizeof(g_pixelShaderConstants)); + + memcpy(&g_pixelShaderConstants[args.index], args.memory, args.size); g_dirtyStates.pixelShaderConstants = true; } @@ -4138,10 +4200,16 @@ static void FlushRenderStateForRenderThread() commandList->setDepthBias(g_depthBias, 0.0f, g_slopeScaledDepthBias); if (g_dirtyStates.vertexShaderConstants) - SetRootDescriptor(g_vertexShaderConstants, 0); + { + auto vertexShaderConstants = g_uploadAllocators[g_frame].allocate(g_vertexShaderConstants, sizeof(g_vertexShaderConstants), 0x100); + SetRootDescriptor(vertexShaderConstants, 0); + } if (g_dirtyStates.pixelShaderConstants) - SetRootDescriptor(g_pixelShaderConstants, 1); + { + auto pixelShaderConstants = g_uploadAllocators[g_frame].allocate(g_pixelShaderConstants, sizeof(g_pixelShaderConstants), 0x100); + SetRootDescriptor(pixelShaderConstants, 1); + } if (g_dirtyStates.sharedConstants) { @@ -4302,7 +4370,8 @@ static void DrawPrimitiveUP(GuestDevice* device, uint32_t primitiveType, uint32_ cmd.type = RenderCommandType::DrawPrimitiveUP; cmd.drawPrimitiveUP.primitiveType = primitiveType; cmd.drawPrimitiveUP.primitiveCount = primitiveCount; - cmd.drawPrimitiveUP.vertexStreamZeroData = g_uploadAllocators[g_frame].allocate(reinterpret_cast(vertexStreamZeroData), primitiveCount * vertexStreamZeroStride, 0x4); + cmd.drawPrimitiveUP.vertexStreamZeroData = g_intermediaryUploadAllocator.allocate(vertexStreamZeroData, primitiveCount * vertexStreamZeroStride); + cmd.drawPrimitiveUP.vertexStreamZeroSize = primitiveCount * vertexStreamZeroStride; cmd.drawPrimitiveUP.vertexStreamZeroStride = vertexStreamZeroStride; cmd.drawPrimitiveUP.csdFilterState = g_csdFilterState; @@ -4320,9 +4389,11 @@ static void ProcDrawPrimitiveUP(const RenderCommand& cmd) SetPrimitiveType(args.primitiveType); SetDirtyValue(g_dirtyStates.pipelineState, g_pipelineState.vertexStrides[0], uint8_t(args.vertexStreamZeroStride)); + auto allocation = g_uploadAllocators[g_frame].allocate(reinterpret_cast(args.vertexStreamZeroData), args.vertexStreamZeroSize, 0x4); + auto& vertexBufferView = g_vertexBufferViews[0]; vertexBufferView.size = args.primitiveCount * args.vertexStreamZeroStride; - vertexBufferView.buffer = args.vertexStreamZeroData.buffer->at(args.vertexStreamZeroData.offset); + vertexBufferView.buffer = allocation.buffer->at(allocation.offset); g_inputSlots[0].stride = args.vertexStreamZeroStride; g_dirtyStates.vertexStreamFirst = 0; @@ -7132,7 +7203,7 @@ GUEST_FUNCTION_HOOK(sub_82BE96F0, GetSurfaceDesc); GUEST_FUNCTION_HOOK(sub_82BE04B0, GetVertexDeclaration); GUEST_FUNCTION_HOOK(sub_82BE0530, HashVertexDeclaration); -GUEST_FUNCTION_HOOK(sub_82BDA8C0, Present); +GUEST_FUNCTION_HOOK(sub_82BDA8C0, Video::Present); GUEST_FUNCTION_HOOK(sub_82BDD330, GetBackBuffer); GUEST_FUNCTION_HOOK(sub_82BE9498, CreateTexture); diff --git a/UnleashedRecomp/patches/fps_patches.cpp b/UnleashedRecomp/patches/fps_patches.cpp index 41cc7e0..97f36b4 100644 --- a/UnleashedRecomp/patches/fps_patches.cpp +++ b/UnleashedRecomp/patches/fps_patches.cpp @@ -115,10 +115,7 @@ PPC_FUNC(sub_8312DBF8) constexpr auto INTERVAL = 1000000000ns / 60; auto next = now + (INTERVAL - now.time_since_epoch() % INTERVAL); - std::this_thread::sleep_for(std::chrono::floor(next - now - 1ms)); - - while (std::chrono::steady_clock::now() < next) - std::this_thread::yield(); + std::this_thread::sleep_until(next); } void WaitVsyncMidAsmHook()