diff --git a/src/d3d11/d3d11_context.cpp b/src/d3d11/d3d11_context.cpp index 16529a2179d..ae2edcc499f 100644 --- a/src/d3d11/d3d11_context.cpp +++ b/src/d3d11/d3d11_context.cpp @@ -1009,6 +1009,9 @@ namespace dxvk { if (!ctrBuf.defined()) return; + if (unlikely(HasDirtyGraphicsBindings())) + ApplyDirtyGraphicsBindings(); + // We bind the SO counter as an indirect count buffer, // so reset any tracking we may have been doing here. m_state.id.reset(); @@ -1035,6 +1038,9 @@ namespace dxvk { UINT StartVertexLocation) { D3D10DeviceLock lock = LockContext(); + if (unlikely(HasDirtyGraphicsBindings())) + ApplyDirtyGraphicsBindings(); + EmitCs([=] (DxvkContext* ctx) { ctx->draw( VertexCount, 1, @@ -1050,6 +1056,9 @@ namespace dxvk { INT BaseVertexLocation) { D3D10DeviceLock lock = LockContext(); + if (unlikely(HasDirtyGraphicsBindings())) + ApplyDirtyGraphicsBindings(); + EmitCs([=] (DxvkContext* ctx) { ctx->drawIndexed( IndexCount, 1, @@ -1067,6 +1076,9 @@ namespace dxvk { UINT StartInstanceLocation) { D3D10DeviceLock lock = LockContext(); + if (unlikely(HasDirtyGraphicsBindings())) + ApplyDirtyGraphicsBindings(); + EmitCs([=] (DxvkContext* ctx) { ctx->draw( VertexCountPerInstance, @@ -1086,6 +1098,9 @@ namespace dxvk { UINT StartInstanceLocation) { D3D10DeviceLock lock = LockContext(); + if (unlikely(HasDirtyGraphicsBindings())) + ApplyDirtyGraphicsBindings(); + EmitCs([=] (DxvkContext* ctx) { ctx->drawIndexed( IndexCountPerInstance, @@ -1107,6 +1122,9 @@ namespace dxvk { if (!ValidateDrawBufferSize(pBufferForArgs, AlignedByteOffsetForArgs, sizeof(VkDrawIndexedIndirectCommand))) return; + if (unlikely(HasDirtyGraphicsBindings())) + ApplyDirtyGraphicsBindings(); + // If possible, batch up multiple indirect draw calls of // the same type into one single multiDrawIndirect call auto cmdData = static_cast(m_cmdData); @@ -1142,6 +1160,9 @@ namespace dxvk { if (!ValidateDrawBufferSize(pBufferForArgs, AlignedByteOffsetForArgs, sizeof(VkDrawIndirectCommand))) return; + if 
(unlikely(HasDirtyGraphicsBindings())) + ApplyDirtyGraphicsBindings(); + // If possible, batch up multiple indirect draw calls of // the same type into one single multiDrawIndirect call auto cmdData = static_cast(m_cmdData); @@ -1174,6 +1195,9 @@ namespace dxvk { UINT ThreadGroupCountZ) { D3D10DeviceLock lock = LockContext(); + if (unlikely(HasDirtyComputeBindings())) + ApplyDirtyComputeBindings(); + EmitCs([=] (DxvkContext* ctx) { ctx->dispatch( ThreadGroupCountX, @@ -1193,6 +1217,9 @@ namespace dxvk { if (!ValidateDrawBufferSize(pBufferForArgs, AlignedByteOffsetForArgs, sizeof(VkDispatchIndirectCommand))) return; + if (unlikely(HasDirtyComputeBindings())) + ApplyDirtyComputeBindings(); + EmitCs([cOffset = AlignedByteOffsetForArgs] (DxvkContext* ctx) { ctx->dispatchIndirect(cOffset); @@ -2131,9 +2158,6 @@ namespace dxvk { return; // Unbind previously bound conflicting UAVs - uint32_t uavSlotId = computeUavBinding (DxbcProgramType::ComputeShader, 0); - uint32_t ctrSlotId = computeUavCounterBinding(DxbcProgramType::ComputeShader, 0); - int32_t uavId = m_state.uav.mask.findNext(0); while (uavId >= 0) { @@ -2145,9 +2169,8 @@ namespace dxvk { m_state.uav.views[uavId] = nullptr; m_state.uav.mask.clr(uavId); - BindUnorderedAccessView( - uavSlotId + uavId, nullptr, - ctrSlotId + uavId, ~0u); + if (!DirtyComputeUnorderedAccessView(uavId, true)) + BindUnorderedAccessView(DxbcProgramType::ComputeShader, uavId, nullptr); } } @@ -2162,13 +2185,15 @@ namespace dxvk { auto uav = static_cast(ppUnorderedAccessViews[i]); auto ctr = pUAVInitialCounts ? 
pUAVInitialCounts[i] : ~0u; - if (m_state.uav.views[StartSlot + i] != uav || ctr != ~0u) { + if (ctr != ~0u && uav && uav->HasCounter()) + UpdateUnorderedAccessViewCounter(uav, ctr); + + if (m_state.uav.views[StartSlot + i] != uav) { m_state.uav.views[StartSlot + i] = uav; m_state.uav.mask.set(StartSlot + i, uav != nullptr); - BindUnorderedAccessView( - uavSlotId + StartSlot + i, uav, - ctrSlotId + StartSlot + i, ctr); + if (!DirtyComputeUnorderedAccessView(StartSlot + i, !uav)) + BindUnorderedAccessView(DxbcProgramType::ComputeShader, StartSlot + i, uav); ResolveCsSrvHazards(uav); } @@ -3157,6 +3182,138 @@ namespace dxvk { } + template + void D3D11CommonContext::ApplyDirtyConstantBuffers( + DxbcProgramType Stage, + const DxbcBindingMask& BoundMask, + DxbcBindingMask& DirtyMask) { + uint32_t bindMask = BoundMask.cbvMask & DirtyMask.cbvMask; + + if (!bindMask) + return; + + // Need to clear dirty bits before binding + const auto& state = m_state.cbv[Stage]; + DirtyMask.cbvMask -= bindMask; + + for (uint32_t slot : bit::BitMask(bindMask)) { + const auto& cbv = state.buffers[slot]; + + BindConstantBuffer(Stage, slot, cbv.buffer.ptr(), + cbv.constantOffset, cbv.constantBound); + } + } + + + template + void D3D11CommonContext::ApplyDirtySamplers( + DxbcProgramType Stage, + const DxbcBindingMask& BoundMask, + DxbcBindingMask& DirtyMask) { + uint32_t bindMask = BoundMask.samplerMask & DirtyMask.samplerMask; + + if (!bindMask) + return; + + // Need to clear dirty bits before binding + const auto& state = m_state.samplers[Stage]; + DirtyMask.samplerMask -= bindMask; + + for (uint32_t slot : bit::BitMask(bindMask)) + BindSampler(Stage, slot, state.samplers[slot]); + } + + + template + void D3D11CommonContext::ApplyDirtyShaderResources( + DxbcProgramType Stage, + const DxbcBindingMask& BoundMask, + DxbcBindingMask& DirtyMask) { + const auto& state = m_state.srv[Stage]; + + for (uint32_t i = 0; i < state.maxCount; i += 64u) { + uint32_t maskIndex = i / 64u; + uint64_t bindMask 
= BoundMask.srvMask[maskIndex] & DirtyMask.srvMask[maskIndex]; + + if (!bindMask) + continue; + + // Need to clear dirty bits before binding + DirtyMask.srvMask[maskIndex] -= bindMask; + + for (uint32_t slot : bit::BitMask(bindMask)) + BindShaderResource(Stage, slot + i, state.views[slot + i].ptr()); + } + } + + + template + void D3D11CommonContext::ApplyDirtyUnorderedAccessViews( + DxbcProgramType Stage, + const DxbcBindingMask& BoundMask, + DxbcBindingMask& DirtyMask) { + uint64_t bindMask = BoundMask.uavMask & DirtyMask.uavMask; + + if (!bindMask) + return; + + const auto& views = Stage == DxbcProgramType::ComputeShader + ? m_state.uav.views + : m_state.om.uavs; + + // Need to clear dirty bits before binding + DirtyMask.uavMask -= bindMask; + + for (uint32_t slot : bit::BitMask(bindMask)) + BindUnorderedAccessView(Stage, slot, views[slot].ptr()); + } + + + template + void D3D11CommonContext::ApplyDirtyGraphicsBindings() { + auto dirtyMask = m_state.lazy.shadersDirty & m_state.lazy.shadersUsed; + dirtyMask.clr(DxbcProgramType::ComputeShader); + + if (unlikely(!(dirtyMask & m_state.lazy.graphicsUavShaders).isClear())) { + DxbcProgramType stage = DxbcProgramType::PixelShader; + + auto& boundMask = m_state.lazy.bindingsUsed[stage]; + auto& dirtyMask = m_state.lazy.bindingsDirty[stage]; + + ApplyDirtyUnorderedAccessViews(stage, boundMask, dirtyMask); + } + + for (uint32_t stageIndex : bit::BitMask(uint32_t(dirtyMask.raw()))) { + DxbcProgramType stage = DxbcProgramType(stageIndex); + + auto& boundMask = m_state.lazy.bindingsUsed[stage]; + auto& dirtyMask = m_state.lazy.bindingsDirty[stage]; + + ApplyDirtySamplers(stage, boundMask, dirtyMask); + ApplyDirtyConstantBuffers(stage, boundMask, dirtyMask); + ApplyDirtyShaderResources(stage, boundMask, dirtyMask); + + m_state.lazy.shadersDirty.clr(stage); + } + } + + + template + void D3D11CommonContext::ApplyDirtyComputeBindings() { + DxbcProgramType stage = DxbcProgramType::ComputeShader; + + auto& boundMask = 
m_state.lazy.bindingsUsed[stage]; + auto& dirtyMask = m_state.lazy.bindingsDirty[stage]; + + ApplyDirtySamplers(stage, boundMask, dirtyMask); + ApplyDirtyConstantBuffers(stage, boundMask, dirtyMask); + ApplyDirtyShaderResources(stage, boundMask, dirtyMask); + ApplyDirtyUnorderedAccessViews(stage, boundMask, dirtyMask); + + m_state.lazy.shadersDirty.clr(stage); + } + + template void D3D11CommonContext::ApplyInputLayout() { auto inputLayout = m_state.ia.inputLayout.prvRef(); @@ -3416,6 +3573,8 @@ namespace dxvk { template void D3D11CommonContext::BindShader( const D3D11CommonShader* pShaderModule) { + uint64_t oldUavMask = m_state.lazy.bindingsUsed[ShaderStage].uavMask; + if (pShaderModule) { auto buffer = pShaderModule->GetIcb(); auto shader = pShaderModule->GetShader(); @@ -3423,6 +3582,17 @@ namespace dxvk { if (unlikely(shader->needsLibraryCompile())) m_device->requestCompileShader(shader); + // If this shader activates any bindings that have not yet been applied, + // mark the shader stage as dirty so it gets applied on the next draw. + // Don't apply it right away since any dirty bindings are likely redundant. + m_state.lazy.shadersUsed.set(ShaderStage); + m_state.lazy.bindingsUsed[ShaderStage] = pShaderModule->GetBindingMask(); + + if (!m_state.lazy.shadersDirty.test(ShaderStage) && (DebugLazyBinding != Tristate::False)) { + if (!(m_state.lazy.bindingsDirty[ShaderStage] & m_state.lazy.bindingsUsed[ShaderStage]).empty()) + m_state.lazy.shadersDirty.set(ShaderStage); + } + EmitCs([ cBuffer = std::move(buffer), cShader = std::move(shader) @@ -3438,6 +3608,15 @@ namespace dxvk { Forwarder::move(cBuffer)); }); } else { + // Mark shader stage as inactive and clean since we'll have no active + // bindings. This works because if the app changes any binding at all + // for this stage, it will get flagged as dirty, and if another shader + // gets bound, it will check for any dirty bindings again. 
+ m_state.lazy.shadersUsed.clr(ShaderStage); + m_state.lazy.shadersDirty.clr(ShaderStage); + + m_state.lazy.bindingsUsed[ShaderStage].reset(); + EmitCs([] (DxvkContext* ctx) { constexpr VkShaderStageFlagBits stage = GetShaderStage(ShaderStage); @@ -3448,6 +3627,33 @@ namespace dxvk { ctx->bindUniformBuffer(stage, slotId, DxvkBufferSlice()); }); } + + // On graphics, UAVs are available to all stages, but we treat them as part + // of the pixel shader binding set. Re-compute the active UAV mask. We don't + // need to set the PS as active or dirty here though since the UAV update + // code will mark all other stages that access UAVs as dirty, too. + uint64_t newUavMask = m_state.lazy.bindingsUsed[ShaderStage].uavMask; + + if (ShaderStage != DxbcProgramType::ComputeShader && oldUavMask != newUavMask) { + constexpr DxbcProgramType ps = DxbcProgramType::PixelShader; + + // Since dirty UAVs are only tracked on the PS mask, we need to mark the + // stage as dirty if any of the used UAVs overlap with the dirty PS mask. + if (m_state.lazy.bindingsDirty[ps].uavMask & newUavMask) + m_state.lazy.shadersDirty.set(ShaderStage); + + // Accumulate graphics UAV mask and write it back to the pixel shader mask. 
+ m_state.lazy.graphicsUavShaders.clr(ShaderStage); + + for (uint32_t stageIndex : bit::BitMask(uint32_t(m_state.lazy.graphicsUavShaders.raw()))) + newUavMask |= m_state.lazy.bindingsUsed[DxbcProgramType(stageIndex)].uavMask; + + m_state.lazy.bindingsUsed[ps].uavMask = newUavMask; + + // Update bit mask of shaders actively accessing graphics UAVs + if (newUavMask) + m_state.lazy.graphicsUavShaders.set(ShaderStage); + } } @@ -3666,171 +3872,164 @@ namespace dxvk { template - template void D3D11CommonContext::BindConstantBuffer( + DxbcProgramType ShaderStage, UINT Slot, D3D11Buffer* pBuffer, UINT Offset, UINT Length) { + uint32_t slotId = computeConstantBufferBinding(ShaderStage, Slot); + if (pBuffer) { EmitCs([ - cSlotId = Slot, + cSlotId = slotId, + cStage = GetShaderStage(ShaderStage), cBufferSlice = pBuffer->GetBufferSlice(16 * Offset, 16 * Length) ] (DxvkContext* ctx) mutable { - VkShaderStageFlagBits stage = GetShaderStage(ShaderStage); - ctx->bindUniformBuffer(stage, cSlotId, + ctx->bindUniformBuffer(cStage, cSlotId, Forwarder::move(cBufferSlice)); }); } else { EmitCs([ - cSlotId = Slot + cSlotId = slotId, + cStage = GetShaderStage(ShaderStage) ] (DxvkContext* ctx) { - VkShaderStageFlagBits stage = GetShaderStage(ShaderStage); - ctx->bindUniformBuffer(stage, cSlotId, DxvkBufferSlice()); + ctx->bindUniformBuffer(cStage, cSlotId, DxvkBufferSlice()); }); } } template - template void D3D11CommonContext::BindConstantBufferRange( + DxbcProgramType ShaderStage, UINT Slot, UINT Offset, UINT Length) { + uint32_t slotId = computeConstantBufferBinding(ShaderStage, Slot); + EmitCs([ - cSlotId = Slot, - cOffset = 16 * Offset, - cLength = 16 * Length + cSlotId = slotId, + cStage = GetShaderStage(ShaderStage), + cOffset = 16u * Offset, + cLength = 16u * Length ] (DxvkContext* ctx) { - VkShaderStageFlagBits stage = GetShaderStage(ShaderStage); - ctx->bindUniformBufferRange(stage, cSlotId, cOffset, cLength); + ctx->bindUniformBufferRange(cStage, cSlotId, cOffset, cLength); }); 
} template - template void D3D11CommonContext::BindSampler( + DxbcProgramType ShaderStage, UINT Slot, D3D11SamplerState* pSampler) { + uint32_t slotId = computeSamplerBinding(ShaderStage, Slot); + if (pSampler) { EmitCs([ - cSlotId = Slot, + cSlotId = slotId, + cStage = GetShaderStage(ShaderStage), cSampler = pSampler->GetDXVKSampler() ] (DxvkContext* ctx) mutable { - VkShaderStageFlagBits stage = GetShaderStage(ShaderStage); - ctx->bindResourceSampler(stage, cSlotId, + ctx->bindResourceSampler(cStage, cSlotId, Forwarder::move(cSampler)); }); } else { EmitCs([ - cSlotId = Slot + cSlotId = slotId, + cStage = GetShaderStage(ShaderStage) ] (DxvkContext* ctx) { - VkShaderStageFlagBits stage = GetShaderStage(ShaderStage); - ctx->bindResourceSampler(stage, cSlotId, nullptr); + ctx->bindResourceSampler(cStage, cSlotId, nullptr); }); } } template - template void D3D11CommonContext::BindShaderResource( + DxbcProgramType ShaderStage, UINT Slot, D3D11ShaderResourceView* pResource) { + uint32_t slotId = computeSrvBinding(ShaderStage, Slot); + if (pResource) { if (pResource->GetViewInfo().Dimension != D3D11_RESOURCE_DIMENSION_BUFFER) { EmitCs([ - cSlotId = Slot, + cSlotId = slotId, + cStage = GetShaderStage(ShaderStage), cView = pResource->GetImageView() ] (DxvkContext* ctx) mutable { - VkShaderStageFlagBits stage = GetShaderStage(ShaderStage); - ctx->bindResourceImageView(stage, cSlotId, + ctx->bindResourceImageView(cStage, cSlotId, Forwarder::move(cView)); }); } else { EmitCs([ - cSlotId = Slot, + cSlotId = slotId, + cStage = GetShaderStage(ShaderStage), cView = pResource->GetBufferView() ] (DxvkContext* ctx) mutable { - VkShaderStageFlagBits stage = GetShaderStage(ShaderStage); - ctx->bindResourceBufferView(stage, cSlotId, + ctx->bindResourceBufferView(cStage, cSlotId, Forwarder::move(cView)); }); } } else { EmitCs([ - cSlotId = Slot + cSlotId = slotId, + cStage = GetShaderStage(ShaderStage) ] (DxvkContext* ctx) { - VkShaderStageFlagBits stage = GetShaderStage(ShaderStage); 
- ctx->bindResourceImageView(stage, cSlotId, nullptr); + ctx->bindResourceImageView(cStage, cSlotId, nullptr); }); } } template - template void D3D11CommonContext::BindUnorderedAccessView( - UINT UavSlot, - D3D11UnorderedAccessView* pUav, - UINT CtrSlot, - UINT Counter) { + DxbcProgramType ShaderStage, + UINT Slot, + D3D11UnorderedAccessView* pUav) { + uint32_t uavSlotId = computeUavBinding(ShaderStage, Slot); + uint32_t ctrSlotId = computeUavCounterBinding(ShaderStage, Slot); + + VkShaderStageFlags stages = ShaderStage == DxbcProgramType::ComputeShader + ? VK_SHADER_STAGE_COMPUTE_BIT + : VK_SHADER_STAGE_ALL_GRAPHICS; + if (pUav) { if (pUav->GetViewInfo().Dimension == D3D11_RESOURCE_DIMENSION_BUFFER) { EmitCs([ - cUavSlotId = UavSlot, - cCtrSlotId = CtrSlot, + cUavSlotId = uavSlotId, + cCtrSlotId = ctrSlotId, + cStages = stages, cBufferView = pUav->GetBufferView(), - cCounterView = pUav->GetCounterView(), - cCounterValue = Counter + cCounterView = pUav->GetCounterView() ] (DxvkContext* ctx) mutable { - VkShaderStageFlags stages = ShaderStage == DxbcProgramType::ComputeShader - ? VK_SHADER_STAGE_COMPUTE_BIT - : VK_SHADER_STAGE_ALL_GRAPHICS; - - if (cCounterView != nullptr && cCounterValue != ~0u) { - DxvkBufferSlice counterSlice(cCounterView); - - ctx->updateBuffer( - counterSlice.buffer(), - counterSlice.offset(), - sizeof(uint32_t), - &cCounterValue); - } - - ctx->bindResourceBufferView(stages, cUavSlotId, + ctx->bindResourceBufferView(cStages, cUavSlotId, Forwarder::move(cBufferView)); - ctx->bindResourceBufferView(stages, cCtrSlotId, + ctx->bindResourceBufferView(cStages, cCtrSlotId, Forwarder::move(cCounterView)); }); } else { EmitCs([ - cUavSlotId = UavSlot, - cCtrSlotId = CtrSlot, + cUavSlotId = uavSlotId, + cCtrSlotId = ctrSlotId, + cStages = stages, cImageView = pUav->GetImageView() ] (DxvkContext* ctx) mutable { - VkShaderStageFlags stages = ShaderStage == DxbcProgramType::ComputeShader - ? 
VK_SHADER_STAGE_COMPUTE_BIT - : VK_SHADER_STAGE_ALL_GRAPHICS; - - ctx->bindResourceImageView(stages, cUavSlotId, + ctx->bindResourceImageView(cStages, cUavSlotId, Forwarder::move(cImageView)); - ctx->bindResourceBufferView(stages, cCtrSlotId, nullptr); + ctx->bindResourceBufferView(cStages, cCtrSlotId, nullptr); }); } } else { EmitCs([ - cUavSlotId = UavSlot, - cCtrSlotId = CtrSlot + cUavSlotId = uavSlotId, + cCtrSlotId = ctrSlotId, + cStages = stages ] (DxvkContext* ctx) { - VkShaderStageFlags stages = ShaderStage == DxbcProgramType::ComputeShader - ? VK_SHADER_STAGE_COMPUTE_BIT - : VK_SHADER_STAGE_ALL_GRAPHICS; - - ctx->bindResourceImageView(stages, cUavSlotId, nullptr); - ctx->bindResourceBufferView(stages, cCtrSlotId, nullptr); + ctx->bindResourceImageView(cStages, cUavSlotId, nullptr); + ctx->bindResourceBufferView(cStages, cCtrSlotId, nullptr); }); } } @@ -4216,6 +4415,117 @@ namespace dxvk { } + template + template + bool D3D11CommonContext::DirtyBindingGeneric( + DxbcProgramType ShaderStage, + T BoundMask, + T& DirtyMask, + T DirtyBit, + bool IsNull) { + // Forward immediately if lazy binding is forced off + if (DebugLazyBinding == Tristate::False) + return false; + + if ((BoundMask & ~DirtyMask) & DirtyBit) { + // If we're binding a non-null resource to an active slot that has not been + // marked for lazy binding yet, forward the call immediately in order to + // avoid tracking overhead. This is by far the most common case. + if (likely(!IsNull && DebugLazyBinding != Tristate::True)) + return false; + + // If we are binding a null resource to an active slot, the app will likely + // either bind something else or bind a shader that does not use this slot. + // In that case, avoid likely redundant CS traffic and apply the binding on + // the next draw. + m_state.lazy.shadersDirty.set(ShaderStage); + } + + // Binding is either inactive or already dirty. 
In the inactive case, there + // is no need to mark the shader stage as dirty since binding a shader that + // activates the binding will implicitly do so. + DirtyMask |= DirtyBit; + return true; + } + + + template + bool D3D11CommonContext::DirtyConstantBuffer( + DxbcProgramType ShaderStage, + uint32_t Slot, + bool IsNull) { + return DirtyBindingGeneric(ShaderStage, + m_state.lazy.bindingsUsed[ShaderStage].cbvMask, + m_state.lazy.bindingsDirty[ShaderStage].cbvMask, + 1u << Slot, IsNull); + } + + + template + bool D3D11CommonContext::DirtySampler( + DxbcProgramType ShaderStage, + uint32_t Slot, + bool IsNull) { + return DirtyBindingGeneric(ShaderStage, + m_state.lazy.bindingsUsed[ShaderStage].samplerMask, + m_state.lazy.bindingsDirty[ShaderStage].samplerMask, + 1u << Slot, IsNull); + } + + + template + bool D3D11CommonContext::DirtyShaderResource( + DxbcProgramType ShaderStage, + uint32_t Slot, + bool IsNull) { + uint32_t idx = Slot / 64u; + // Each srvMask word tracks 64 slots, so the bit index is Slot modulo 64. + return DirtyBindingGeneric(ShaderStage, + m_state.lazy.bindingsUsed[ShaderStage].srvMask[idx], + m_state.lazy.bindingsDirty[ShaderStage].srvMask[idx], + uint64_t(1u) << (Slot % 64u), IsNull); + } + + + template + bool D3D11CommonContext::DirtyComputeUnorderedAccessView( + uint32_t Slot, + bool IsNull) { + constexpr DxbcProgramType ShaderStage = DxbcProgramType::ComputeShader; + + return DirtyBindingGeneric(ShaderStage, + m_state.lazy.bindingsUsed[ShaderStage].uavMask, + m_state.lazy.bindingsDirty[ShaderStage].uavMask, + uint64_t(1u) << Slot, IsNull); + } + + + template + bool D3D11CommonContext::DirtyGraphicsUnorderedAccessView( + uint32_t Slot) { + constexpr DxbcProgramType ShaderStage = DxbcProgramType::PixelShader; + + if (DebugLazyBinding == Tristate::False) + return false; + + // Use different logic here and always use lazy binding for graphics UAVs. + // Since graphics UAVs are generally bound together with render targets, + // looking at the active binding mask doesn't really help us here.
+ uint64_t dirtyBit = uint64_t(1u) << Slot; + + if (m_state.lazy.bindingsUsed[ShaderStage].uavMask & dirtyBit) { + // Need to mark all graphics stages that use UAVs as dirty here to + // make sure that bindings actually get reapplied properly. There + // may be no pixel shader bound in this case, even though we do + // all the tracking on the pixel shader bit mask. + m_state.lazy.shadersDirty.set(m_state.lazy.graphicsUavShaders); + } + + m_state.lazy.bindingsDirty[ShaderStage].uavMask |= dirtyBit; + return true; + } + + template void D3D11CommonContext::DiscardBuffer( ID3D11Resource* pResource) { @@ -4378,6 +4688,21 @@ namespace dxvk { } + template + bool D3D11CommonContext::HasDirtyComputeBindings() { + return m_state.lazy.shadersDirty.test(DxbcProgramType::ComputeShader); + } + + + template + bool D3D11CommonContext::HasDirtyGraphicsBindings() { + return (m_state.lazy.shadersDirty & m_state.lazy.shadersUsed).any( + DxbcProgramType::VertexShader, DxbcProgramType::GeometryShader, + DxbcProgramType::HullShader, DxbcProgramType::DomainShader, + DxbcProgramType::PixelShader); + } + + template void D3D11CommonContext::ResetCommandListState() { EmitCs([ @@ -4516,6 +4841,18 @@ namespace dxvk { m_state.srv.reset(); m_state.uav.reset(); m_state.samplers.reset(); + + // Reset dirty tracking + m_state.lazy.reset(); + } + + + template + void D3D11CommonContext::ResetDirtyTracking() { + // Must only be called when all bindings are guaranteed to get applied + // to the DXVK context before the next draw or dispatch command. 
+ m_state.lazy.bindingsDirty.reset(); + m_state.lazy.shadersDirty = 0u; } @@ -4530,8 +4867,6 @@ namespace dxvk { void D3D11CommonContext::ResolveSrvHazards( T* pView) { auto& bindings = m_state.srv[ShaderStage]; - - uint32_t slotId = computeSrvBinding(ShaderStage, 0); int32_t srvId = bindings.hazardous.findNext(0); while (srvId >= 0) { @@ -4544,7 +4879,8 @@ namespace dxvk { bindings.views[srvId] = nullptr; bindings.hazardous.clr(srvId); - BindShaderResource(slotId + srvId, nullptr); + if (!DirtyShaderResource(ShaderStage, srvId, true)) + BindShaderResource(ShaderStage, srvId, nullptr); } } else { // Avoid further redundant iterations @@ -4608,16 +4944,12 @@ namespace dxvk { if (!pView || !pView->HasBindFlag(D3D11_BIND_UNORDERED_ACCESS)) return; - uint32_t uavSlotId = computeUavBinding (DxbcProgramType::PixelShader, 0); - uint32_t ctrSlotId = computeUavCounterBinding(DxbcProgramType::PixelShader, 0); - for (uint32_t i = 0; i < m_state.om.maxUav; i++) { if (CheckViewOverlap(pView, m_state.om.uavs[i].ptr())) { m_state.om.uavs[i] = nullptr; - BindUnorderedAccessView( - uavSlotId + i, nullptr, - ctrSlotId + i, ~0u); + if (!DirtyGraphicsUnorderedAccessView(i)) + BindUnorderedAccessView(DxbcProgramType::PixelShader, i, nullptr); } } } @@ -4659,29 +4991,20 @@ namespace dxvk { for (uint32_t i = 0; i < m_state.so.targets.size(); i++) BindXfbBuffer(i, m_state.so.targets[i].buffer.ptr(), ~0u); - RestoreConstantBuffers(); - RestoreConstantBuffers(); - RestoreConstantBuffers(); - RestoreConstantBuffers(); - RestoreConstantBuffers(); - RestoreConstantBuffers(); - - RestoreShaderResources(); - RestoreShaderResources(); - RestoreShaderResources(); - RestoreShaderResources(); - RestoreShaderResources(); - RestoreShaderResources(); - - RestoreUnorderedAccessViews(); - RestoreUnorderedAccessViews(); - - RestoreSamplers(); - RestoreSamplers(); - RestoreSamplers(); - RestoreSamplers(); - RestoreSamplers(); - RestoreSamplers(); + // Reset dirty binding and shader masks before applying + 
// bindings to avoid implicit null binding overrides. + ResetDirtyTracking(); + + for (uint32_t i = 0; i < uint32_t(DxbcProgramType::Count); i++) { + auto stage = DxbcProgramType(i); + + RestoreConstantBuffers(stage); + RestoreShaderResources(stage); + RestoreSamplers(stage); + } + + RestoreUnorderedAccessViews(DxbcProgramType::PixelShader); + RestoreUnorderedAccessViews(DxbcProgramType::ComputeShader); // Draw buffer bindings aren't persistent at the API level, and // we can't meaningfully track them. Just reset this state here @@ -4691,43 +5014,40 @@ namespace dxvk { template - template - void D3D11CommonContext::RestoreConstantBuffers() { + void D3D11CommonContext::RestoreConstantBuffers( + DxbcProgramType Stage) { const auto& bindings = m_state.cbv[Stage]; - uint32_t slotId = computeConstantBufferBinding(Stage, 0); for (uint32_t i = 0; i < bindings.maxCount; i++) { - BindConstantBuffer(slotId + i, bindings.buffers[i].buffer.ptr(), + BindConstantBuffer(Stage, i, bindings.buffers[i].buffer.ptr(), bindings.buffers[i].constantOffset, bindings.buffers[i].constantBound); } } template - template - void D3D11CommonContext::RestoreSamplers() { + void D3D11CommonContext::RestoreSamplers( + DxbcProgramType Stage) { const auto& bindings = m_state.samplers[Stage]; - uint32_t slotId = computeSamplerBinding(Stage, 0); for (uint32_t i = 0; i < bindings.maxCount; i++) - BindSampler(slotId + i, bindings.samplers[i]); + BindSampler(Stage, i, bindings.samplers[i]); } template - template - void D3D11CommonContext::RestoreShaderResources() { + void D3D11CommonContext::RestoreShaderResources( + DxbcProgramType Stage) { const auto& bindings = m_state.srv[Stage]; - uint32_t slotId = computeSrvBinding(Stage, 0); for (uint32_t i = 0; i < bindings.maxCount; i++) - BindShaderResource(slotId + i, bindings.views[i].ptr()); + BindShaderResource(Stage, i, bindings.views[i].ptr()); } template - template - void D3D11CommonContext::RestoreUnorderedAccessViews() { + void
D3D11CommonContext::RestoreUnorderedAccessViews( + DxbcProgramType Stage) { const auto& views = Stage == DxbcProgramType::ComputeShader ? m_state.uav.views : m_state.om.uavs; @@ -4736,14 +5056,8 @@ namespace dxvk { ? m_state.uav.maxCount : m_state.om.maxUav; - uint32_t uavSlotId = computeUavBinding(Stage, 0); - uint32_t ctrSlotId = computeUavCounterBinding(Stage, 0); - - for (uint32_t i = 0; i < maxCount; i++) { - BindUnorderedAccessView( - uavSlotId + i, views[i].ptr(), - ctrSlotId + i, ~0u); - } + for (uint32_t i = 0; i < maxCount; i++) + BindUnorderedAccessView(Stage, i, views[i].ptr()); } @@ -4754,7 +5068,6 @@ namespace dxvk { UINT NumBuffers, ID3D11Buffer* const* ppConstantBuffers) { auto& bindings = m_state.cbv[ShaderStage]; - uint32_t slotId = computeConstantBufferBinding(ShaderStage, StartSlot); for (uint32_t i = 0; i < NumBuffers; i++) { auto newBuffer = static_cast(ppConstantBuffers[i]); @@ -4771,7 +5084,8 @@ namespace dxvk { bindings.buffers[StartSlot + i].constantCount = constantCount; bindings.buffers[StartSlot + i].constantBound = constantCount; - BindConstantBuffer(slotId + i, newBuffer, 0, constantCount); + if (!DirtyConstantBuffer(ShaderStage, StartSlot + i, !newBuffer)) + BindConstantBuffer(ShaderStage, StartSlot + i, newBuffer, 0, constantCount); } } @@ -4790,8 +5104,6 @@ namespace dxvk { const UINT* pNumConstants) { auto& bindings = m_state.cbv[ShaderStage]; - uint32_t slotId = computeConstantBufferBinding(ShaderStage, StartSlot); - for (uint32_t i = 0; i < NumBuffers; i++) { auto newBuffer = static_cast(ppConstantBuffers[i]); @@ -4830,14 +5142,16 @@ namespace dxvk { bindings.buffers[StartSlot + i].constantCount = constantCount; bindings.buffers[StartSlot + i].constantBound = constantBound; - BindConstantBuffer(slotId + i, newBuffer, constantOffset, constantBound); + if (!DirtyConstantBuffer(ShaderStage, StartSlot + i, !newBuffer)) + BindConstantBuffer(ShaderStage, StartSlot + i, newBuffer, constantOffset, constantBound); } else if 
(bindings.buffers[StartSlot + i].constantOffset != constantOffset || bindings.buffers[StartSlot + i].constantCount != constantCount) { bindings.buffers[StartSlot + i].constantOffset = constantOffset; bindings.buffers[StartSlot + i].constantCount = constantCount; bindings.buffers[StartSlot + i].constantBound = constantBound; - BindConstantBufferRange(slotId + i, constantOffset, constantBound); + if (!DirtyConstantBuffer(ShaderStage, StartSlot + i, !newBuffer)) + BindConstantBufferRange(ShaderStage, StartSlot + i, constantOffset, constantBound); } } @@ -4853,7 +5167,6 @@ namespace dxvk { UINT NumResources, ID3D11ShaderResourceView* const* ppResources) { auto& bindings = m_state.srv[ShaderStage]; - uint32_t slotId = computeSrvBinding(ShaderStage, StartSlot); for (uint32_t i = 0; i < NumResources; i++) { auto resView = static_cast(ppResources[i]); @@ -4872,7 +5185,9 @@ namespace dxvk { } bindings.views[StartSlot + i] = resView; - BindShaderResource(slotId + i, resView); + + if (!DirtyShaderResource(ShaderStage, StartSlot + i, !resView)) + BindShaderResource(ShaderStage, StartSlot + i, resView); } } @@ -4888,14 +5203,15 @@ namespace dxvk { UINT NumSamplers, ID3D11SamplerState* const* ppSamplers) { auto& bindings = m_state.samplers[ShaderStage]; - uint32_t slotId = computeSamplerBinding(ShaderStage, StartSlot); for (uint32_t i = 0; i < NumSamplers; i++) { auto sampler = static_cast(ppSamplers[i]); if (bindings.samplers[StartSlot + i] != sampler) { bindings.samplers[StartSlot + i] = sampler; - BindSampler(slotId + i, sampler); + + if (!DirtySampler(ShaderStage, StartSlot + i, !sampler)) + BindSampler(ShaderStage, StartSlot + i, sampler); } } @@ -4958,14 +5274,15 @@ namespace dxvk { } if (unlikely(NumUAVs || m_state.om.maxUav)) { - uint32_t uavSlotId = computeUavBinding (DxbcProgramType::PixelShader, 0); - uint32_t ctrSlotId = computeUavCounterBinding(DxbcProgramType::PixelShader, 0); - if (likely(NumUAVs != D3D11_KEEP_UNORDERED_ACCESS_VIEWS)) { - uint32_t newMaxUav = 
NumUAVs ? UAVStartSlot + NumUAVs : 0; + uint32_t newMinUav = NumUAVs ? UAVStartSlot : D3D11_1_UAV_SLOT_COUNT; + uint32_t newMaxUav = NumUAVs ? UAVStartSlot + NumUAVs : 0u; + + uint32_t oldMinUav = std::exchange(m_state.om.minUav, newMinUav); uint32_t oldMaxUav = std::exchange(m_state.om.maxUav, newMaxUav); - for (uint32_t i = 0; i < std::max(oldMaxUav, newMaxUav); i++) { + for (uint32_t i = std::min(oldMinUav, newMinUav); + i < std::max(oldMaxUav, newMaxUav); i++) { D3D11UnorderedAccessView* uav = nullptr; uint32_t ctr = ~0u; @@ -4974,12 +5291,14 @@ namespace dxvk { ctr = pUAVInitialCounts ? pUAVInitialCounts[i - UAVStartSlot] : ~0u; } - if (m_state.om.uavs[i] != uav || ctr != ~0u) { + if (ctr != ~0u && uav && uav->HasCounter()) + UpdateUnorderedAccessViewCounter(uav, ctr); + + if (m_state.om.uavs[i] != uav) { m_state.om.uavs[i] = uav; - BindUnorderedAccessView( - uavSlotId + i, uav, - ctrSlotId + i, ctr); + if (!DirtyGraphicsUnorderedAccessView(i)) + BindUnorderedAccessView(DxbcProgramType::PixelShader, i, uav); ResolveOmSrvHazards(uav); @@ -5375,6 +5694,20 @@ namespace dxvk { } + template + void D3D11CommonContext::UpdateUnorderedAccessViewCounter( + D3D11UnorderedAccessView* pUav, + uint32_t CounterValue) { + EmitCs([ + cView = pUav->GetCounterView(), + cCounter = CounterValue + ] (DxvkContext* ctx) { + ctx->updateBuffer(cView->buffer(), + cView->info().offset, sizeof(cCounter), &cCounter); + }); + } + + template bool D3D11CommonContext::ValidateRenderTargets( UINT NumViews, diff --git a/src/d3d11/d3d11_context.h b/src/d3d11/d3d11_context.h index a2d6c365982..d4b06e2d8e0 100644 --- a/src/d3d11/d3d11_context.h +++ b/src/d3d11/d3d11_context.h @@ -75,6 +75,11 @@ namespace dxvk { // Use a local staging buffer to handle tiny uploads, most // of the time we're fine with hitting the global allocator constexpr static VkDeviceSize StagingBufferSize = 256ull << 10; + + protected: + // Compile-time debug flag to force lazy binding on (True) or off (False) + constexpr 
static Tristate DebugLazyBinding = Tristate::Auto; + public: D3D11CommonContext( @@ -799,6 +804,30 @@ namespace dxvk { DxvkBufferSlice AllocStagingBuffer( VkDeviceSize Size); + void ApplyDirtyConstantBuffers( + DxbcProgramType Stage, + const DxbcBindingMask& BoundMask, + DxbcBindingMask& DirtyMask); + + void ApplyDirtySamplers( + DxbcProgramType Stage, + const DxbcBindingMask& BoundMask, + DxbcBindingMask& DirtyMask); + + void ApplyDirtyShaderResources( + DxbcProgramType Stage, + const DxbcBindingMask& BoundMask, + DxbcBindingMask& DirtyMask); + + void ApplyDirtyUnorderedAccessViews( + DxbcProgramType Stage, + const DxbcBindingMask& BoundMask, + DxbcBindingMask& DirtyMask); + + void ApplyDirtyGraphicsBindings(); + + void ApplyDirtyComputeBindings(); + void ApplyInputLayout(); void ApplyPrimitiveTopology(); @@ -854,35 +883,33 @@ namespace dxvk { D3D11Buffer* pBuffer, UINT Offset); - template void BindConstantBuffer( + DxbcProgramType ShaderStage, UINT Slot, D3D11Buffer* pBuffer, UINT Offset, UINT Length); - template void BindConstantBufferRange( + DxbcProgramType ShaderStage, UINT Slot, UINT Offset, UINT Length); - template void BindSampler( + DxbcProgramType ShaderStage, UINT Slot, D3D11SamplerState* pSampler); - template void BindShaderResource( + DxbcProgramType ShaderStage, UINT Slot, D3D11ShaderResourceView* pResource); - template void BindUnorderedAccessView( - UINT UavSlot, - D3D11UnorderedAccessView* pUav, - UINT CtrSlot, - UINT Counter); + DxbcProgramType ShaderStage, + UINT Slot, + D3D11UnorderedAccessView* pUav); VkClearValue ConvertColorValue( const FLOAT Color[4], @@ -911,6 +938,36 @@ namespace dxvk { DxvkBufferSlice BufferSlice, UINT Flags); + template + bool DirtyBindingGeneric( + DxbcProgramType ShaderStage, + T BoundMask, + T& DirtyMask, + T DirtyBit, + bool IsNull); + + bool DirtyConstantBuffer( + DxbcProgramType ShaderStage, + uint32_t Slot, + bool IsNull); + + bool DirtySampler( + DxbcProgramType ShaderStage, + uint32_t Slot, + bool IsNull); + + 
bool DirtyShaderResource( + DxbcProgramType ShaderStage, + uint32_t Slot, + bool IsNull); + + bool DirtyComputeUnorderedAccessView( + uint32_t Slot, + bool IsNull); + + bool DirtyGraphicsUnorderedAccessView( + uint32_t Slot); + void DiscardBuffer( ID3D11Resource* pResource); @@ -943,10 +1000,16 @@ namespace dxvk { D3D11MaxUsedBindings GetMaxUsedBindings(); + bool HasDirtyComputeBindings(); + + bool HasDirtyGraphicsBindings(); + void ResetCommandListState(); void ResetContextState(); + void ResetDirtyTracking(); + void ResetStagingBuffer(); template @@ -969,18 +1032,18 @@ namespace dxvk { void RestoreCommandListState(); - template - void RestoreConstantBuffers(); - - template - void RestoreSamplers(); - - template - void RestoreShaderResources(); - - template - void RestoreUnorderedAccessViews(); + void RestoreConstantBuffers( + DxbcProgramType Stage); + void RestoreSamplers( + DxbcProgramType Stage); + + void RestoreShaderResources( + DxbcProgramType Stage); + + void RestoreUnorderedAccessViews( + DxbcProgramType Stage); + template void SetConstantBuffers( UINT StartSlot, @@ -1063,6 +1126,10 @@ namespace dxvk { UINT SrcDepthPitch, UINT CopyFlags); + void UpdateUnorderedAccessViewCounter( + D3D11UnorderedAccessView* pUav, + uint32_t CounterValue); + bool ValidateRenderTargets( UINT NumViews, ID3D11RenderTargetView* const* ppRenderTargetViews, @@ -1083,7 +1150,7 @@ namespace dxvk { DxvkMultisampleState* pMsState, UINT SampleMask); - template + template void EmitCs(Cmd&& command) { m_cmdData = nullptr; @@ -1091,14 +1158,14 @@ namespace dxvk { GetTypedContext()->EmitCsChunk(std::move(m_csChunk)); m_csChunk = AllocCsChunk(); - if constexpr (AllowFlush) + if constexpr (!IsDeferred && AllowFlush) GetTypedContext()->ConsiderFlush(GpuFlushType::ImplicitWeakHint); m_csChunk->push(command); } } - template + template M* EmitCsCmd(Cmd&& command, Args&&... 
args) { M* data = m_csChunk->pushCmd( command, std::forward(args)...); @@ -1107,7 +1174,7 @@ namespace dxvk { GetTypedContext()->EmitCsChunk(std::move(m_csChunk)); m_csChunk = AllocCsChunk(); - if constexpr (AllowFlush) + if constexpr (!IsDeferred && AllowFlush) GetTypedContext()->ConsiderFlush(GpuFlushType::ImplicitWeakHint); // We must record this command after the potential diff --git a/src/d3d11/d3d11_context_ext.cpp b/src/d3d11/d3d11_context_ext.cpp index 5254f480f43..c933f571d80 100644 --- a/src/d3d11/d3d11_context_ext.cpp +++ b/src/d3d11/d3d11_context_ext.cpp @@ -48,6 +48,9 @@ namespace dxvk { D3D10DeviceLock lock = m_ctx->LockContext(); m_ctx->SetDrawBuffers(pBufferForArgs, nullptr); + if (unlikely(m_ctx->HasDirtyGraphicsBindings())) + m_ctx->ApplyDirtyGraphicsBindings(); + m_ctx->EmitCs([ cCount = DrawCount, cOffset = ByteOffsetForArgs, @@ -67,6 +70,9 @@ namespace dxvk { D3D10DeviceLock lock = m_ctx->LockContext(); m_ctx->SetDrawBuffers(pBufferForArgs, nullptr); + if (unlikely(m_ctx->HasDirtyGraphicsBindings())) + m_ctx->ApplyDirtyGraphicsBindings(); + m_ctx->EmitCs([ cCount = DrawCount, cOffset = ByteOffsetForArgs, @@ -88,6 +94,9 @@ namespace dxvk { D3D10DeviceLock lock = m_ctx->LockContext(); m_ctx->SetDrawBuffers(pBufferForArgs, pBufferForCount); + if (unlikely(m_ctx->HasDirtyGraphicsBindings())) + m_ctx->ApplyDirtyGraphicsBindings(); + m_ctx->EmitCs([ cMaxCount = MaxDrawCount, cArgOffset = ByteOffsetForArgs, @@ -110,6 +119,9 @@ namespace dxvk { D3D10DeviceLock lock = m_ctx->LockContext(); m_ctx->SetDrawBuffers(pBufferForArgs, pBufferForCount); + if (unlikely(m_ctx->HasDirtyGraphicsBindings())) + m_ctx->ApplyDirtyGraphicsBindings(); + m_ctx->EmitCs([ cMaxCount = MaxDrawCount, cArgOffset = ByteOffsetForArgs, diff --git a/src/d3d11/d3d11_context_imm.cpp b/src/d3d11/d3d11_context_imm.cpp index e22201afaf2..15005f40f01 100644 --- a/src/d3d11/d3d11_context_imm.cpp +++ b/src/d3d11/d3d11_context_imm.cpp @@ -218,7 +218,12 @@ namespace dxvk { D3D10DeviceLock 
lock = LockContext(); auto commandList = static_cast(pCommandList); - + + // Reset dirty binding tracking before submitting any CS chunks. + // This is needed so that any submission that might occur during + // this call does not disrupt bindings set by the deferred context. + ResetDirtyTracking(); + // Clear state so that the command list can't observe any // current context state. The command list itself will clean // up after execution to ensure that no state changes done @@ -754,7 +759,11 @@ namespace dxvk { if (!pState) return; - // Reset all state affected by the current context state + // Clear dirty tracking here since all context state will be + // re-applied anyway when the context state is swapped in again. + ResetDirtyTracking(); + + // Reset all state affected by the current context state. ResetCommandListState(); Com oldState = std::move(m_stateObject); @@ -975,8 +984,106 @@ namespace dxvk { } + void D3D11ImmediateContext::ApplyDirtyNullBindings() { + // At the end of a submission, set all bindings that have not been applied yet + // to null on the DXVK context. This way, we avoid keeping resources alive that + // are bound to the DXVK context but not to the immediate context. + // + // Note: This requires that all methods that may modify dirty bindings on the + // DXVK context also reset the corresponding dirty bits *before* performing the + // bind operation, or otherwise an implicit flush can potentially override them. 
+ auto& dirtyState = m_state.lazy.bindingsDirty; + + EmitCs([ + cDirtyState = dirtyState + ] (DxvkContext* ctx) { + for (uint32_t i = 0; i < uint32_t(DxbcProgramType::Count); i++) { + auto dxStage = DxbcProgramType(i); + auto vkStage = GetShaderStage(dxStage); + + // Unbind all dirty constant buffers + auto cbvSlot = computeConstantBufferBinding(dxStage, 0); + + for (uint32_t index : bit::BitMask(cDirtyState[dxStage].cbvMask)) + ctx->bindUniformBuffer(vkStage, cbvSlot + index, DxvkBufferSlice()); + + // Unbind all dirty samplers + auto samplerSlot = computeSamplerBinding(dxStage, 0); + + for (uint32_t index : bit::BitMask(cDirtyState[dxStage].samplerMask)) + ctx->bindResourceSampler(vkStage, samplerSlot + index, nullptr); + + // Unbind all dirty shader resource views + auto srvSlot = computeSrvBinding(dxStage, 0); + + for (uint32_t m = 0; m < cDirtyState[dxStage].srvMask.size(); m++) { + for (uint32_t index : bit::BitMask(cDirtyState[dxStage].srvMask[m])) + ctx->bindResourceImageView(vkStage, srvSlot + index + m * 64u, nullptr); + } + + // Unbind all dirty unordered access views + VkShaderStageFlags uavStages = 0u; + + if (dxStage == DxbcProgramType::ComputeShader) + uavStages = VK_SHADER_STAGE_COMPUTE_BIT; + else if (dxStage == DxbcProgramType::PixelShader) + uavStages = VK_SHADER_STAGE_ALL_GRAPHICS; + + if (uavStages) { + auto uavSlot = computeUavBinding(dxStage, 0); + auto ctrSlot = computeUavCounterBinding(dxStage, 0); + + for (uint32_t index : bit::BitMask(cDirtyState[dxStage].uavMask)) { + ctx->bindResourceImageView(vkStage, uavSlot + index, nullptr); + ctx->bindResourceBufferView(vkStage, ctrSlot + index, nullptr); + } + } + } + }); + + // Since we set the DXVK context bindings to null, any bindings that are null + // on the D3D context are no longer dirty, so we can clear the respective bits. 
+ for (uint32_t i = 0; i < uint32_t(DxbcProgramType::Count); i++) { + auto stage = DxbcProgramType(i); + + for (uint32_t index : bit::BitMask(dirtyState[stage].cbvMask)) { + if (!m_state.cbv[stage].buffers[index].buffer.ptr()) + dirtyState[stage].cbvMask &= ~(1u << index); + } + + for (uint32_t index : bit::BitMask(dirtyState[stage].samplerMask)) { + if (!m_state.samplers[stage].samplers[index]) + dirtyState[stage].samplerMask &= ~(1u << index); + } + + for (uint32_t m = 0; m < dirtyState[stage].srvMask.size(); m++) { + for (uint32_t index : bit::BitMask(dirtyState[stage].srvMask[m])) { + if (!m_state.srv[stage].views[index + m * 64u].ptr()) + dirtyState[stage].srvMask[m] &= ~(uint64_t(1u) << index); + } + } + + if (stage == DxbcProgramType::ComputeShader || stage == DxbcProgramType::PixelShader) { + auto& uavs = stage == DxbcProgramType::ComputeShader ? m_state.uav.views : m_state.om.uavs; + + for (uint32_t index : bit::BitMask(dirtyState[stage].uavMask)) { + if (!uavs[index].ptr()) + dirtyState[stage].uavMask &= ~(uint64_t(1u) << index); + } + } + + if (dirtyState[stage].empty()) + m_state.lazy.shadersDirty.clr(stage); + } + } + + void D3D11ImmediateContext::ConsiderFlush( GpuFlushType FlushType) { + // In stress test mode, behave as if this would always flush + if (DebugLazyBinding == Tristate::True) + ApplyDirtyNullBindings(); + uint64_t chunkId = GetCurrentSequenceNumber(); uint64_t submissionId = m_submissionFence->value(); @@ -998,6 +1105,9 @@ namespace dxvk { if (!GetPendingCsChunks() && !hEvent) return; + // Unbind unused resources + ApplyDirtyNullBindings(); + // Signal the submission fence and flush the command list uint64_t submissionId = ++m_submissionId; diff --git a/src/d3d11/d3d11_context_imm.h b/src/d3d11/d3d11_context_imm.h index a73dfe6d053..faf61842381 100644 --- a/src/d3d11/d3d11_context_imm.h +++ b/src/d3d11/d3d11_context_imm.h @@ -195,6 +195,8 @@ namespace dxvk { uint64_t GetPendingCsChunks(); + void ApplyDirtyNullBindings(); + void 
ConsiderFlush( GpuFlushType FlushType); diff --git a/src/d3d11/d3d11_context_state.h b/src/d3d11/d3d11_context_state.h index 9dea349375b..6886dd99d84 100644 --- a/src/d3d11/d3d11_context_state.h +++ b/src/d3d11/d3d11_context_state.h @@ -199,6 +199,7 @@ namespace dxvk { UINT stencilRef = D3D11_DEFAULT_STENCIL_REFERENCE; UINT maxRtv = 0u; + UINT minUav = 0u; UINT maxUav = 0u; void reset() { @@ -302,6 +303,32 @@ namespace dxvk { predicateValue = false; } }; + + + /** + * \brief Lazy binding state + * + * Keeps track of what state needs to be + * re-applied to the context. + */ + struct D3D11LazyBindings { + DxbcProgramTypeFlags shadersUsed = 0u; + DxbcProgramTypeFlags shadersDirty = 0u; + DxbcProgramTypeFlags graphicsUavShaders = 0u; + + D3D11ShaderStageState bindingsUsed; + D3D11ShaderStageState bindingsDirty; + + void reset() { + shadersUsed = 0u; + shadersDirty = 0u; + graphicsUavShaders = 0u; + + bindingsUsed.reset(); + bindingsDirty.reset(); + } + }; + /** * \brief Context state @@ -325,6 +352,8 @@ namespace dxvk { D3D11SrvBindings srv; D3D11UavBindings uav; D3D11SamplerBindings samplers; + + D3D11LazyBindings lazy; }; /** @@ -342,7 +371,7 @@ namespace dxvk { * \brief Maximum used binding numbers for all context state */ struct D3D11MaxUsedBindings { - std::array stages; + std::array stages; uint32_t vbCount; uint32_t soCount; }; diff --git a/src/d3d11/d3d11_shader.cpp b/src/d3d11/d3d11_shader.cpp index 9560eb131d2..332d9cffdea 100644 --- a/src/d3d11/d3d11_shader.cpp +++ b/src/d3d11/d3d11_shader.cpp @@ -79,6 +79,12 @@ namespace dxvk { } pDevice->GetDXVKDevice()->registerShader(m_shader); + + // Write back binding mask + auto bindings = module.bindings(); + + if (bindings) + m_bindings = *bindings; } diff --git a/src/d3d11/d3d11_shader.h b/src/d3d11/d3d11_shader.h index 44a3bcb984d..8be4353f7c5 100644 --- a/src/d3d11/d3d11_shader.h +++ b/src/d3d11/d3d11_shader.h @@ -18,7 +18,7 @@ namespace dxvk { class D3D11Device; - + /** * \brief Common shader object * @@ -52,12 
+52,18 @@ namespace dxvk { std::string GetName() const { return m_shader->debugName(); } - + + DxbcBindingMask GetBindingMask() const { + return m_bindings; + } + private: - + Rc m_shader; Rc m_buffer; - + + DxbcBindingMask m_bindings = { }; + }; diff --git a/src/d3d11/d3d11_util.h b/src/d3d11/d3d11_util.h index 8720bbbb328..10a4a8fc5ec 100644 --- a/src/d3d11/d3d11_util.h +++ b/src/d3d11/d3d11_util.h @@ -52,15 +52,15 @@ namespace dxvk { * \returns Corresponding Vulkan shader stage */ constexpr VkShaderStageFlagBits GetShaderStage(DxbcProgramType ProgramType) { - switch (ProgramType) { - case DxbcProgramType::VertexShader: return VK_SHADER_STAGE_VERTEX_BIT; - case DxbcProgramType::HullShader: return VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT; - case DxbcProgramType::DomainShader: return VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT; - case DxbcProgramType::GeometryShader: return VK_SHADER_STAGE_GEOMETRY_BIT; - case DxbcProgramType::PixelShader: return VK_SHADER_STAGE_FRAGMENT_BIT; - case DxbcProgramType::ComputeShader: return VK_SHADER_STAGE_COMPUTE_BIT; - default: return VkShaderStageFlagBits(0); - } + constexpr uint64_t lut + = (uint64_t(VK_SHADER_STAGE_VERTEX_BIT) << (8u * uint32_t(DxbcProgramType::VertexShader))) + | (uint64_t(VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT) << (8u * uint32_t(DxbcProgramType::HullShader))) + | (uint64_t(VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT) << (8u * uint32_t(DxbcProgramType::DomainShader))) + | (uint64_t(VK_SHADER_STAGE_GEOMETRY_BIT) << (8u * uint32_t(DxbcProgramType::GeometryShader))) + | (uint64_t(VK_SHADER_STAGE_FRAGMENT_BIT) << (8u * uint32_t(DxbcProgramType::PixelShader))) + | (uint64_t(VK_SHADER_STAGE_COMPUTE_BIT) << (8u * uint32_t(DxbcProgramType::ComputeShader))); + + return VkShaderStageFlagBits((lut >> (8u * uint32_t(ProgramType))) & 0xff); } -} \ No newline at end of file +} diff --git a/src/d3d11/d3d11_video.cpp b/src/d3d11/d3d11_video.cpp index 1be4802a8cb..a5b20351dec 100644 --- a/src/d3d11/d3d11_video.cpp +++ 
b/src/d3d11/d3d11_video.cpp @@ -1037,7 +1037,9 @@ namespace dxvk { continue; if (!hasStreamsEnabled) { + m_ctx->ResetDirtyTracking(); m_ctx->ResetCommandListState(); + BindOutputView(pOutputView); hasStreamsEnabled = true; } @@ -1047,6 +1049,7 @@ namespace dxvk { if (hasStreamsEnabled) { UnbindResources(); + m_ctx->RestoreCommandListState(); } diff --git a/src/d3d11/d3d11_view_uav.h b/src/d3d11/d3d11_view_uav.h index c86eba5fa88..8396fb8a1eb 100644 --- a/src/d3d11/d3d11_view_uav.h +++ b/src/d3d11/d3d11_view_uav.h @@ -43,6 +43,10 @@ namespace dxvk { return m_info.BindFlags & Flags; } + BOOL HasCounter() const { + return m_counterView != nullptr; + } + D3D11_RESOURCE_DIMENSION GetResourceType() const { D3D11_RESOURCE_DIMENSION type; m_resource->GetType(&type); diff --git a/src/dxbc/dxbc_analysis.cpp b/src/dxbc/dxbc_analysis.cpp index a5bf4fc194e..2a6747b0582 100644 --- a/src/dxbc/dxbc_analysis.cpp +++ b/src/dxbc/dxbc_analysis.cpp @@ -108,6 +108,47 @@ namespace dxvk { m_analysis->uavInfos[registerId].nonInvariantAccess = true; } break; + case DxbcInstClass::Declaration: { + switch (ins.op) { + case DxbcOpcode::DclConstantBuffer: { + uint32_t registerId = ins.dst[0].idx[0].offset; + + if (registerId < DxbcConstBufBindingCount) + m_analysis->bindings.cbvMask |= 1u << registerId; + } break; + + case DxbcOpcode::DclSampler: { + uint32_t registerId = ins.dst[0].idx[0].offset; + + if (registerId < DxbcSamplerBindingCount) + m_analysis->bindings.samplerMask |= 1u << registerId; + } break; + + case DxbcOpcode::DclResource: + case DxbcOpcode::DclResourceRaw: + case DxbcOpcode::DclResourceStructured: { + uint32_t registerId = ins.dst[0].idx[0].offset; + + uint32_t idx = registerId / 64u; + uint32_t bit = registerId % 64u; + + if (registerId < DxbcResourceBindingCount) + m_analysis->bindings.srvMask[idx] |= uint64_t(1u) << bit; + } break; + + case DxbcOpcode::DclUavTyped: + case DxbcOpcode::DclUavRaw: + case DxbcOpcode::DclUavStructured: { + uint32_t registerId = 
ins.dst[0].idx[0].offset; + + if (registerId < DxbcUavBindingCount) + m_analysis->bindings.uavMask |= uint64_t(1u) << registerId; + } break; + + default: ; + } + } break; + default: break; } diff --git a/src/dxbc/dxbc_analysis.h b/src/dxbc/dxbc_analysis.h index fa589f4fd2e..64e9870384f 100644 --- a/src/dxbc/dxbc_analysis.h +++ b/src/dxbc/dxbc_analysis.h @@ -53,6 +53,8 @@ namespace dxvk { DxbcClipCullInfo clipCullIn; DxbcClipCullInfo clipCullOut; + + DxbcBindingMask bindings = { }; bool usesDerivatives = false; bool usesKill = false; diff --git a/src/dxbc/dxbc_common.cpp b/src/dxbc/dxbc_common.cpp index d150c585b83..db3d715291e 100644 --- a/src/dxbc/dxbc_common.cpp +++ b/src/dxbc/dxbc_common.cpp @@ -10,9 +10,8 @@ namespace dxvk { case DxbcProgramType::HullShader : return VK_SHADER_STAGE_TESSELLATION_CONTROL_BIT; case DxbcProgramType::DomainShader : return VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT; case DxbcProgramType::ComputeShader : return VK_SHADER_STAGE_COMPUTE_BIT; + default: throw DxvkError("DxbcProgramInfo::shaderStage: Unsupported program type"); } - - throw DxvkError("DxbcProgramInfo::shaderStage: Unsupported program type"); } @@ -24,9 +23,8 @@ namespace dxvk { case DxbcProgramType::HullShader : return spv::ExecutionModelTessellationControl; case DxbcProgramType::DomainShader : return spv::ExecutionModelTessellationEvaluation; case DxbcProgramType::ComputeShader : return spv::ExecutionModelGLCompute; + default: throw DxvkError("DxbcProgramInfo::executionModel: Unsupported program type"); } - - throw DxvkError("DxbcProgramInfo::executionModel: Unsupported program type"); } -} \ No newline at end of file +} diff --git a/src/dxbc/dxbc_common.h b/src/dxbc/dxbc_common.h index 9c490d62c68..d9d4207673b 100644 --- a/src/dxbc/dxbc_common.h +++ b/src/dxbc/dxbc_common.h @@ -17,7 +17,11 @@ namespace dxvk { HullShader = 3, DomainShader = 4, ComputeShader = 5, + + Count }; + + using DxbcProgramTypeFlags = Flags; /** diff --git a/src/dxbc/dxbc_compiler.cpp 
b/src/dxbc/dxbc_compiler.cpp index f0f36233461..9e58e6497fa 100644 --- a/src/dxbc/dxbc_compiler.cpp +++ b/src/dxbc/dxbc_compiler.cpp @@ -237,6 +237,7 @@ namespace dxvk { case DxbcProgramType::GeometryShader: this->emitGsFinalize(); break; case DxbcProgramType::PixelShader: this->emitPsFinalize(); break; case DxbcProgramType::ComputeShader: this->emitCsFinalize(); break; + default: throw DxvkError("Invalid shader stage"); } // Emit float control mode if the extension is supported @@ -6138,7 +6139,7 @@ namespace dxvk { case DxbcProgramType::HullShader: emitHsSystemValueStore(sv, mask, value); break; case DxbcProgramType::DomainShader: emitDsSystemValueStore(sv, mask, value); break; case DxbcProgramType::PixelShader: emitPsSystemValueStore(sv, mask, value); break; - case DxbcProgramType::ComputeShader: break; + default: break; } } } @@ -6822,6 +6823,7 @@ namespace dxvk { case DxbcProgramType::GeometryShader: emitGsInit(); break; case DxbcProgramType::PixelShader: emitPsInit(); break; case DxbcProgramType::ComputeShader: emitCsInit(); break; + default: throw DxvkError("Invalid shader stage"); } } diff --git a/src/dxbc/dxbc_module.cpp b/src/dxbc/dxbc_module.cpp index d406bf2925a..16e37ebbbd4 100644 --- a/src/dxbc/dxbc_module.cpp +++ b/src/dxbc/dxbc_module.cpp @@ -42,7 +42,7 @@ namespace dxvk { Rc DxbcModule::compile( const DxbcModuleInfo& moduleInfo, - const std::string& fileName) const { + const std::string& fileName) { if (m_shexChunk == nullptr) throw DxvkError("DxbcModule::compile: No SHDR/SHEX chunk"); @@ -54,6 +54,8 @@ namespace dxvk { m_psgnChunk, analysisInfo); this->runAnalyzer(analyzer, m_shexChunk->slice()); + + m_bindings = std::make_optional(analysisInfo.bindings); DxbcCompiler compiler( fileName, moduleInfo, @@ -62,7 +64,7 @@ namespace dxvk { m_psgnChunk, analysisInfo); this->runCompiler(compiler, m_shexChunk->slice()); - + return compiler.finalize(); } diff --git a/src/dxbc/dxbc_module.h b/src/dxbc/dxbc_module.h index 7609e23225e..7d6088baedf 100644 --- 
a/src/dxbc/dxbc_module.h +++ b/src/dxbc/dxbc_module.h @@ -7,6 +7,7 @@ #include "dxbc_header.h" #include "dxbc_modinfo.h" #include "dxbc_reader.h" +#include "dxbc_util.h" // References used for figuring out DXBC: // - https://github.com/tgjones/slimshader-cpp @@ -41,7 +42,16 @@ namespace dxvk { return m_shexChunk->programInfo(); } - + + /** + * \brief Queries shader binding mask + * + * Only valid after successfully compiling the shader. + */ + std::optional bindings() const { + return m_bindings; + } + /** * \brief Input and output signature chunks * @@ -50,7 +60,7 @@ namespace dxvk { */ Rc isgn() const { return m_isgnChunk; } Rc osgn() const { return m_osgnChunk; } - + /** * \brief Compiles DXBC shader to SPIR-V module * @@ -61,7 +71,7 @@ namespace dxvk { */ Rc compile( const DxbcModuleInfo& moduleInfo, - const std::string& fileName) const; + const std::string& fileName); /** * \brief Compiles a pass-through geometry shader @@ -85,6 +95,8 @@ namespace dxvk { Rc m_osgnChunk; Rc m_psgnChunk; Rc m_shexChunk; + + std::optional m_bindings; void runAnalyzer( DxbcAnalyzer& analyzer, diff --git a/src/dxbc/dxbc_util.h b/src/dxbc/dxbc_util.h index 04bec752a33..53394c2a99d 100644 --- a/src/dxbc/dxbc_util.h +++ b/src/dxbc/dxbc_util.h @@ -33,6 +33,43 @@ namespace dxvk { }; + /** + * \brief Shader binding mask + * + * Stores a bit masks of resource bindings + * that are accessed by any given shader. 
+ */ + struct DxbcBindingMask { + uint32_t cbvMask = 0u; + uint32_t samplerMask = 0u; + uint64_t uavMask = 0u; + std::array srvMask = { }; + + void reset() { + cbvMask = 0u; + samplerMask = 0u; + uavMask = 0u; + srvMask = { }; + } + + bool empty() const { + uint64_t mask = (uint64_t(cbvMask) | uint64_t(samplerMask) << 32u) + | (uavMask | srvMask[0] | srvMask[1]); + return !mask; + } + + DxbcBindingMask operator & (const DxbcBindingMask& other) const { + DxbcBindingMask result = *this; + result.cbvMask &= other.cbvMask; + result.samplerMask &= other.samplerMask; + result.uavMask &= other.uavMask; + result.srvMask[0] &= other.srvMask[0]; + result.srvMask[1] &= other.srvMask[1]; + return result; + } + }; + + /** * \brief Computes first binding index for a given stage * @@ -124,4 +161,4 @@ namespace dxvk { uint32_t primitiveVertexCount( DxbcPrimitive primitive); -} \ No newline at end of file +} diff --git a/src/util/util_bit.h b/src/util/util_bit.h index 586ce83c986..6f672700503 100644 --- a/src/util/util_bit.h +++ b/src/util/util_bit.h @@ -129,6 +129,32 @@ namespace dxvk::bit { #endif } + inline uint32_t bsf(uint32_t n) { + #if (defined(__GNUC__) || defined(__clang__)) && !defined(__BMI__) && defined(DXVK_ARCH_X86) + uint32_t res; + asm ("tzcnt %1,%0" + : "=r" (res) + : "r" (n) + : "cc"); + return res; + #else + return tzcnt(n); + #endif + } + + inline uint32_t bsf(uint64_t n) { + #if (defined(__GNUC__) || defined(__clang__)) && !defined(__BMI__) && defined(DXVK_ARCH_X86_64) + uint64_t res; + asm ("tzcnt %1,%0" + : "=r" (res) + : "r" (n) + : "cc"); + return res; + #else + return tzcnt(n); + #endif + } + inline uint32_t lzcnt(uint32_t n) { #if (defined(_MSC_VER) && !defined(__clang__)) || defined(__LZCNT__) return _lzcnt_u32(n); @@ -490,6 +516,7 @@ namespace dxvk::bit { }; + template class BitMask { public: @@ -497,12 +524,12 @@ namespace dxvk::bit { class iterator { public: using iterator_category = std::input_iterator_tag; - using value_type = uint32_t; - using 
difference_type = uint32_t; - using pointer = const uint32_t*; - using reference = uint32_t; + using value_type = T; + using difference_type = T; + using pointer = const T*; + using reference = T; - explicit iterator(uint32_t flags) + explicit iterator(T flags) : m_mask(flags) { } iterator& operator ++ () { @@ -516,17 +543,8 @@ namespace dxvk::bit { return retval; } - uint32_t operator * () const { -#if (defined(__GNUC__) || defined(__clang__)) && !defined(__BMI__) && defined(DXVK_ARCH_X86) - uint32_t res; - asm ("tzcnt %1,%0" - : "=r" (res) - : "r" (m_mask) - : "cc"); - return res; -#else - return tzcnt(m_mask); -#endif + T operator * () const { + return bsf(m_mask); } bool operator == (iterator other) const { return m_mask == other.m_mask; } @@ -534,14 +552,14 @@ namespace dxvk::bit { private: - uint32_t m_mask; + T m_mask; }; BitMask() : m_mask(0) { } - BitMask(uint32_t n) + explicit BitMask(T n) : m_mask(n) { } iterator begin() { @@ -554,7 +572,7 @@ namespace dxvk::bit { private: - uint32_t m_mask; + T m_mask; };