Unity的Forward+ FPTL光照剔除解析(四)

news/发布时间2024/5/17 17:09:43

序言

看完上一节基本上HDRP的光照剔除数据的流程就写完了,这一节主要是解析适用于透明队列的Cluster光照剔除。
为了让Cluster分布得更均匀,Cluster的光照剔除同样借助了PreDepth的深度图。

RenderGraph Dispatch

这里的buildPerVoxelLightListKernel会根据是否开启BigTile Prepass、是否读取Depth(用来确定suggestedBase以划分Cluster)以及是否开启MSAA等条件,分成很多不同的变体。
这里主要解析的Kernel是TileLightListGen_DepthRT_SrcBigTile

// NOTE: this is an article excerpt of HDRP C# code; "..." marks code elided by the article.

// Tile size, in pixels, used when building the clustered light list.
public static int s_TileSizeClustered = 32;

// Number of clustered tiles along X: the screen width divided by the tile size, rounded up.
static int GetNumTileClusteredX(HDCamera hdCamera)
{
    return HDUtils.DivRoundUp((int) hdCamera.screenSize.x, LightDefinitions.s_TileSizeClustered);
}

// Fills the cluster-related fields of passData: shaders, kernel variant, tile counts and depth usage.
unsafe void PrepareBuildGPULightListPassData(
    RenderGraph renderGraph,
    RenderGraphBuilder builder,
    HDCamera hdCamera,
    TileAndClusterData tileAndClusterData,
    ref ShaderVariablesLightList constantBuffer,
    int totalLightCount,
    TextureHandle depthStencilBuffer,
    TextureHandle stencilBufferCopy,
    GBufferOutput gBuffer,
    BuildGPULightListPassData passData)
{
    ...
    // Cluster
    bool msaa = hdCamera.msaaEnabled;
    var clustPrepassSourceIdx = hdCamera.frameSettings.IsEnabled(FrameSettingsField.BigTilePrepass) ? ClusterPrepassSource.BigTile : ClusterPrepassSource.None;
    var clustDepthSourceIdx = ClusterDepthSource.NoDepth;
    if (tileAndClusterData.clusterNeedsDepth)
        clustDepthSourceIdx = msaa ? ClusterDepthSource.MSAA_Depth : ClusterDepthSource.Depth;

    passData.buildPerVoxelLightListShader = buildPerVoxelLightListShader;
    passData.clearClusterAtomicIndexShader = clearClusterAtomicIndexShader;
    // The kernel variant is picked from a table indexed by (prepass source, depth source);
    // the article compares this to selecting a shader permutation through macros.
    passData.buildPerVoxelLightListKernel = isProjectionOblique ? s_ClusterObliqueKernels[(int)clustPrepassSourceIdx, (int)clustDepthSourceIdx] : s_ClusterKernels[(int)clustPrepassSourceIdx, (int)clustDepthSourceIdx];
    passData.numTilesClusterX = GetNumTileClusteredX(hdCamera);
    passData.numTilesClusterY = GetNumTileClusteredY(hdCamera);
    passData.clusterNeedsDepth = tileAndClusterData.clusterNeedsDepth;
    ...
}

// Records the commands that clear the atomic allocation counter, bind all buffers
// and dispatch the per-voxel (cluster) light list kernel.
static void VoxelLightListGeneration(BuildGPULightListPassData data, CommandBuffer cmd)
{
    if (data.runLightList)
    {
        // clear atomic offset index
        cmd.SetComputeBufferParam(data.clearClusterAtomicIndexShader, s_ClearVoxelAtomicKernel, HDShaderIDs.g_LayeredSingleIdxBuffer, data.globalLightListAtomic);
        cmd.DispatchCompute(data.clearClusterAtomicIndexShader, s_ClearVoxelAtomicKernel, 1, 1, 1);

        cmd.SetComputeBufferParam(data.buildPerVoxelLightListShader, s_ClearVoxelAtomicKernel, HDShaderIDs.g_LayeredSingleIdxBuffer, data.globalLightListAtomic);
        cmd.SetComputeBufferParam(data.buildPerVoxelLightListShader, data.buildPerVoxelLightListKernel, HDShaderIDs.g_vLayeredLightList, data.output.perVoxelLightLists);
        cmd.SetComputeBufferParam(data.buildPerVoxelLightListShader, data.buildPerVoxelLightListKernel, HDShaderIDs.g_LayeredOffset, data.output.perVoxelOffset);
        cmd.SetComputeBufferParam(data.buildPerVoxelLightListShader, data.buildPerVoxelLightListKernel, HDShaderIDs.g_LayeredSingleIdxBuffer, data.globalLightListAtomic);

        // Bind the big-tile light list only when the big-tile prepass ran.
        if (data.runBigTilePrepass)
            cmd.SetComputeBufferParam(data.buildPerVoxelLightListShader, data.buildPerVoxelLightListKernel, HDShaderIDs.g_vBigTileLightList, data.output.bigTileLightList);

        // When the cluster pass needs depth, bind the depth texture and the per-tile log-base buffer.
        if (data.clusterNeedsDepth)
        {
            cmd.SetComputeTextureParam(data.buildPerVoxelLightListShader, data.buildPerVoxelLightListKernel, HDShaderIDs.g_depth_tex, data.depthBuffer);
            cmd.SetComputeBufferParam(data.buildPerVoxelLightListShader, data.buildPerVoxelLightListKernel, HDShaderIDs.g_logBaseBuffer, data.output.perTileLogBaseTweak);
        }

        cmd.SetComputeBufferParam(data.buildPerVoxelLightListShader, data.buildPerVoxelLightListKernel, HDShaderIDs.g_vBoundsBuffer, data.AABBBoundsBuffer);
        cmd.SetComputeBufferParam(data.buildPerVoxelLightListShader, data.buildPerVoxelLightListKernel, HDShaderIDs._LightVolumeData, data.lightVolumeDataBuffer);
        cmd.SetComputeBufferParam(data.buildPerVoxelLightListShader, data.buildPerVoxelLightListKernel, HDShaderIDs.g_data, data.convexBoundsBuffer);

        ConstantBuffer.Push(cmd, data.lightListCB, data.buildPerVoxelLightListShader, HDShaderIDs._ShaderVariablesLightList);

        // One thread group per clustered tile in X/Y, and one Z slice per view.
        cmd.DispatchCompute(data.buildPerVoxelLightListShader, data.buildPerVoxelLightListKernel, data.numTilesClusterX, data.numTilesClusterY, data.viewCount);
    }
}

Initialize

首先依旧是跟之前的TileLightListGen类似,计算当前线程的Tile的映射关系要用到的数据.(Tile的X/Y轴上的数量,当前线程组对应的TileID)

#define TILE_SIZE_CLUSTERED (32)
// When data.clusterNeedsDepth == true the TileLightListGen_DepthRT_SrcBigTile kernel is used
// (LIGHTLISTGEN=TileLightListGen_DepthRT_SrcBigTile together with ENABLE_DEPTH_TEXTURE_BACKPLANE),
// which is equivalent to the define below.
#define ENABLE_DEPTH_TEXTURE_BACKPLANE

// Group-wide counter of lights appended to the coarse list.
groupshared uint lightOffs;

#ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
// Group-wide maximum depth of the tile, stored as the uint bit pattern of a float.
groupshared uint ldsZMax;
#endif

// Article excerpt: initialisation part of the kernel; "..." marks code elided by the article.
[numthreads(NR_THREADS, 1, 1)]
void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    uint eyeIndex = u3GroupID.z;
    uint2 tileIDX = u3GroupID.xy;
    uint t=threadID;

    // firstbithigh(32) = 5
    const uint log2TileSize = firstbithigh(TILE_SIZE_CLUSTERED);
    uint nrTilesX = ((uint)g_screenSize.x +(TILE_SIZE_CLUSTERED-1))>>log2TileSize; // DivRoundUp(g_screenSize.x, 32)
    uint nrTilesY = ((uint)g_screenSize.y +(TILE_SIZE_CLUSTERED-1))>>log2TileSize; // DivRoundUp(g_screenSize.y, 32)

    // Screen space coordinates of clustered tile
    // LL corner of this tile in pixels.
    uint2 viTilLL = TILE_SIZE_CLUSTERED*tileIDX;
    // UR corner of this tile in pixels, clamped to the screen size.
    uint2 viTilUR = min( viTilLL+uint2(TILE_SIZE_CLUSTERED,TILE_SIZE_CLUSTERED), uint2(g_screenSize.x, g_screenSize.y) );       // not width and height minus 1 since viTilUR represents the end of the tile corner.

    // Thread 0 resets the group-shared counters (ldsZMax is reduced to the tile's max Z later on).
    if(t==0)
    {
        lightOffs = 0;
#ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
        ldsZMax = 0;
#endif
    }

#if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
#endif
    ...
}

ldsZMax

通过遍历Tile内的深度,得到linMaDist,最后再InterlockedMax Resolve得到ldsZMax(Tile内的Max Z)

#define TILE_SIZE_CLUSTERED (32)
#define VIEWPORT_SCALE_Z (1)

// Converts a depth-buffer value at a pixel position into linear depth using the inverse
// screen projection of the given eye (the article notes this mirrors lightlistbuild.compute).
float GetLinearDepth(float2 pixXY, float zDptBufSpace, uint eyeIndex) // 0 is near 1 is far
{
    float4x4 g_mInvScrProjection = g_mInvScrProjectionArr[eyeIndex];

#ifdef USE_OBLIQUE_MODE
    float2 res2 = mul(g_mInvScrProjection, float4(pixXY, zDptBufSpace, 1.0)).zw;
    return res2.x / res2.y;
#else
    // for perspective projection m22 is zero and m23 is +1/-1 (depends on left/right hand proj)
    // however this function must also work for orthographic projection so we keep it like this.
    float m22 = g_mInvScrProjection[2].z, m23 = g_mInvScrProjection[2].w;
    float m32 = g_mInvScrProjection[3].z, m33 = g_mInvScrProjection[3].w;
    return (m22 * zDptBufSpace + m23) / (m32 * zDptBufSpace + m33);
#endif
}

// Article excerpt: max-depth reduction part of the kernel; "..." marks code elided by the article.
[numthreads(NR_THREADS, 1, 1)]
void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    ...
#ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
    // establish max depth first
    float linMaDist = 0.0;
    // The threads of the group stride over the TILE_SIZE_CLUSTERED * TILE_SIZE_CLUSTERED (32 * 32) pixels of the tile.
    for (int idx = t; idx < (TILE_SIZE_CLUSTERED * TILE_SIZE_CLUSTERED); idx += NR_THREADS)
    {
        uint2 uPixCrd = min(uint2(viTilLL.x + (idx & (TILE_SIZE_CLUSTERED - 1)), viTilLL.y + (idx >> log2TileSize)), uint2(g_screenSize.x - 1, g_screenSize.y - 1));
//#ifdef MSAA_ENABLED
        //for(int i=0; i<g_iNumSamplesMSAA; i++)
        //{
        //const float fDpth = FetchDepthMSAA(uPixCrd, i);
        //const float2 fracSampleCoord = g_depth_tex.GetSamplePosition(i).xy;     // this is optimized away when USE_OBLIQUE_MODE is NOT set.
//#else
        const float fDpth = FetchDepth(uPixCrd);
        const float2 fracSampleCoord = float2(0.5, 0.5);
//#endif
        if (fDpth < VIEWPORT_SCALE_Z) // if not skydome
        {
            float linZ = GetLinearDepth(uPixCrd + fracSampleCoord, fDpth, eyeIndex);
#if USE_LEFT_HAND_CAMERA_SPACE
            float linDistZ = linZ;
#else
            float linDistZ = -linZ;
#endif
            // Keep the largest distance seen by this thread.
            linMaDist = max(linDistZ, linMaDist);
        }
//#ifdef MSAA_ENABLED
        //}
//#endif
    }

    // Reduce the per-thread maxima into ldsZMax. The value is clamped to >= 0 first, so comparing
    // the float bit patterns as uint gives the same ordering as comparing the floats.
    linMaDist = max(linMaDist, 0.0);
    InterlockedMax(ldsZMax, asuint(linMaDist));

    // Every thread's InterlockedMax has to land before ldsZMax is read back below.
    // The barrier is compiled in only when NR_THREADS > PLATFORM_LANE_COUNT
    // (presumably a group that fits in one wave needs no explicit sync - TODO confirm).
#if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
#endif
    linMaDist = asfloat(ldsZMax);

    // A result of 0 means no pixel passed the (fDpth < VIEWPORT_SCALE_Z) test above.
    if (linMaDist <= 0.0) linMaDist = g_fFarPlane; // assume sky pixel
#endif
    ...
}

Build coarse list,SphericalIntersectionTests

跟lightlistbuild.compute一样,这里也同样可以借用Big Tile的计算结果(g_vBigTileLightList),只遍历Big Tile内的灯光列表来Build coarseList
然后SphericalIntersectionTests Tile内的灯光,剔除掉并没有与Tile相交的灯光(DoesSphereOverlapTile)

// Article excerpt: coarse light list construction; "..." marks code elided by the article.
[numthreads(NR_THREADS, 1, 1)]
void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    ...
    // 'Normalized' coordinates of tile, for use with AABB bounds in g_vBoundsBuffer
    float2 vTileLL = float2(viTilLL.x / g_screenSize.x, viTilLL.y / g_screenSize.y);
    float2 vTileUR = float2(viTilUR.x / g_screenSize.x, viTilUR.y / g_screenSize.y);

    // build coarse list using AABB
#ifdef USE_TWO_PASS_TILED_LIGHTING
    // Map this clustered tile (tileIDX) to the big tile that contains it, then only walk that big tile's light list.
    const uint log2BigTileToClustTileRatio = firstbithigh(TILE_SIZE_BIG_TILE) - log2TileSize;

    int NrBigTilesX = (nrTilesX + ((1 << log2BigTileToClustTileRatio) - 1)) >> log2BigTileToClustTileRatio;
    int NrBigTilesY = (nrTilesY + ((1 << log2BigTileToClustTileRatio) - 1)) >> log2BigTileToClustTileRatio;
    const int bigTileBase = eyeIndex * NrBigTilesX * NrBigTilesY;
    const int bigTileIdx = bigTileBase + ((tileIDX.y >> log2BigTileToClustTileRatio) * NrBigTilesX) + (tileIDX.x >> log2BigTileToClustTileRatio); // map the idx to 64x64 tiles

    // Entry 0 of a big tile's list holds its light count; the light indices follow.
    int nrBigTileLights = g_vBigTileLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE * bigTileIdx + 0];
    for (int l0 = (int)t; l0 < (int)nrBigTileLights; l0 += NR_THREADS)
    {
        int l = g_vBigTileLightList[MAX_NR_BIG_TILE_LIGHTS_PLUS_ONE * bigTileIdx + l0 + 1];
#else
    for (int l = (int)t; l < (int)g_iNrVisibLights; l += NR_THREADS)
    {
#endif
        const ScreenSpaceBoundsIndices boundsIndices = GenerateScreenSpaceBoundsIndices(l, g_iNrVisibLights, eyeIndex);
        const float2 vMi = g_vBoundsBuffer[boundsIndices.min].xy;
        const float2 vMa = g_vBoundsBuffer[boundsIndices.max].xy;

        // The light's screen-space AABB overlaps this tile.
        if (all(vMa > vTileLL) && all(vMi < vTileUR))
        {
            unsigned int uInc = 1;
            unsigned int uIndex;
            InterlockedAdd(lightOffs, uInc, uIndex);
            if (uIndex < MAX_NR_COARSE_ENTRIES) coarseList[uIndex] = l; // add to light list
        }
    }

#if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
#endif

    // lightOffs may exceed the list capacity (entries past it were not stored), so clamp the count.
    int iNrCoarseLights = min(lightOffs,MAX_NR_COARSE_ENTRIES);

#ifdef PERFORM_SPHERICAL_INTERSECTION_TESTS
    iNrCoarseLights = SphericalIntersectionTests(t, iNrCoarseLights, float2(min(viTilLL.xy + uint2(TILE_SIZE_CLUSTERED / 2,TILE_SIZE_CLUSTERED / 2), uint2(g_screenSize.x - 1, g_screenSize.y - 1))),eyeIndex);
#endif
    ...
}

根据Tile内的linMaDist分割Cluster

以Tile内的linMaDist(fTileFarPlane)作为划分Cluster的依据,即根据Tile内最远的深度进行划分:fTileFarPlane离得越近,靠前的深度范围内分配到的Cluster Index就越多(提高Cluster的利用率)。

函数图像:SuggestLogBase50
float suggested_base = pow((1.0 + sqrt(max(0.0, 1.0 - 4.0 * rangeFittedDistance * (1.0 - rangeFittedDistance)))) / (2.0 * rangeFittedDistance), 2.0 / C);
令d=rangeFittedDistance。由于 \(1-4d(1-d)=(1-2d)^2\),根号项即为 \(|1-2d|\);再取C=64(g_iLog2NumClusters=6,指数 \(2/C=1/32\)),可以化简为:
\(\begin{cases} suggestedBase=(\frac{1}{d}-1)^{\frac{1}{32}} & \text{ if } d<0.5 \\suggestedBase=1 & \text{ if } d\ge0.5 \end{cases}\)
再经过max(g_fClustBase, suggested_base)(g_fClustBase=1.02f)之后,suggestedBase为(分界点由 \((\frac{1}{d}-1)^{\frac{1}{32}}=1.02\) 解得 \(d=\frac{1}{1+1.02^{32}}\approx0.3466\)):
\(\begin{cases} suggestedBase=(\frac{1}{d}-1)^{\frac{1}{32}} & \text{ if } d<0.3466 \\suggestedBase=1.02 & \text{ if } d\ge0.3466 \end{cases}\)
函数图像:SnapToClusterIdxFlex
f1(x,t)被限制在了[1.02,1.68]
1.68是代入rangeFittedDistance的最小值FLT_EPS计算得到的suggestedBase。
f2(x),f3(x)就是在演示suggestedBase在[1.02,1.68]之间滑动对SnapToClusterIdxFlex的影响。
可以看到当f1(x,t)从1.02变化到1.68的时候,由原本接近线性的分布,变成了类似log曲线的分布,使得更多的Index分布在了前面的深度。


// Logarithm of x in base b.
float LogBase(float x, float b)
{
    return log2(x) / log2(b);
}

// Maps a view-space z to a cluster index in [0, C - 1], C = 1 << g_iLog2NumClusters,
// using suggestedBase as the base of the logarithmic distribution.
// logBasePerTile is only referenced by the commented-out legacy path below.
int SnapToClusterIdxFlex(float z_in, float suggestedBase, bool logBasePerTile)
{
#if USE_LEFT_HAND_CAMERA_SPACE
    float z = z_in;
#else
    float z = -z_in;
#endif

    //float userscale = g_fClustScale;
    //if (logBasePerTile)
    //    userscale = GetScaleFromBase(suggestedBase);

    // using the inverse of the geometric series
    //const float dist = max(0, z - g_fNearPlane);
    //return (int)clamp(log2(dist * userscale * (suggestedBase - 1.0f) + 1) / log2(suggestedBase), 0.0, (float)((1 << g_iLog2NumClusters) - 1));

    const int C = 1 << g_iLog2NumClusters;
    // Distance from the near plane, normalised by the near-to-far range.
    const float rangeFittedDistance = max(0, z - g_fNearPlane) / (g_fFarPlane - g_fNearPlane);
    return (int)clamp( LogBase( lerp(1.0, PositivePow(suggestedBase, (float) C), rangeFittedDistance), suggestedBase), 0.0, (float)(C - 1));
}

// Wrapper that fixes logBasePerTile from ENABLE_DEPTH_TEXTURE_BACKPLANE.
int SnapToClusterIdx(float z_in, float suggestedBase)
{
#ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
    bool logBasePerTile = true;     // resolved compile time
#else
    bool logBasePerTile = false;
#endif

    return SnapToClusterIdxFlex(z_in, suggestedBase, logBasePerTile);
}

// generate a log-base value such that half of the clusters are consumed from near plane to max. opaque depth of tile.
float SuggestLogBase50(float tileFarPlane)
{
    const float C = (float)(1 << g_iLog2NumClusters);
    // Clamped to [FLT_EPS, 1.0], which keeps the division by rangeFittedDistance below away from zero.
    float rangeFittedDistance = clamp((tileFarPlane - g_fNearPlane) / (g_fFarPlane - g_fNearPlane), FLT_EPS, 1.0);
    float suggested_base = pow((1.0 + sqrt(max(0.0, 1.0 - 4.0 * rangeFittedDistance * (1.0 - rangeFittedDistance)))) / (2.0 * rangeFittedDistance), 2.0 / C);      //
    // Never return less than the global base (the article annotates g_fClustBase as 1.02f).
    return max(g_fClustBase, suggested_base);
}

#define MAX_NR_COARSE_ENTRIES       128
// The min/max cluster indices of two lights are packed into one uint, hence MAX_NR_COARSE_ENTRIES / 2 entries.
groupshared unsigned int clusterIdxs[MAX_NR_COARSE_ENTRIES / 2];

// Article excerpt: per-light cluster index range computation; "..." marks code elided by the article.
[numthreads(NR_THREADS, 1, 1)]
void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    ...
#ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
    float fTileFarPlane = linMaDist;
    float suggestedBase = SuggestLogBase50(fTileFarPlane);
#else // ENABLE_DEPTH_TEXTURE_BACKPLANE
    float fTileFarPlane = g_fFarPlane;
    float suggestedBase = g_fClustBase;
#endif

    // //#define EXACT_EDGE_TESTS  (the article notes EXACT_EDGE_TESTS is not enabled)
#ifdef EXACT_EDGE_TESTS
    iNrCoarseLights = CullByExactEdgeTests(t, iNrCoarseLights, viTilLL.xy, viTilUR.xy, fTileFarPlane, eyeIndex);
#endif

    // According to the article, the light index sort is the same bitonic sort described in its previous part.
    // sort lights (gives a more efficient execution in both deferred and tiled forward lighting).
#if NR_THREADS > PLATFORM_LANE_COUNT
    SORTLIST(coarseList, iNrCoarseLights, MAX_NR_COARSE_ENTRIES, t, NR_THREADS);
#endif

    //////////// cell specific code
    // Convert the .w of each light's min/max bounds entry (view-space z according to the article)
    // into cluster indices with SnapToClusterIdx and the suggestedBase computed above.
    {
        for (int l = (int)t; l < ((iNrCoarseLights + 1) >> 1); l += NR_THREADS)
        {
            // Two lights per iteration; with an odd count the last pair repeats the final light.
            const int l0 = coarseList[2 * l + 0], l1 = coarseList[min(2 * l + 1, iNrCoarseLights - 1)];
            const ScreenSpaceBoundsIndices l0Bounds = GenerateScreenSpaceBoundsIndices(l0, g_iNrVisibLights, eyeIndex);
            const ScreenSpaceBoundsIndices l1Bounds = GenerateScreenSpaceBoundsIndices(l1, g_iNrVisibLights, eyeIndex);

            const unsigned int clustIdxMi0 = (const unsigned int)min(255, SnapToClusterIdx(g_vBoundsBuffer[l0Bounds.min].w, suggestedBase));
            const unsigned int clustIdxMa0 = (const unsigned int)min(255, SnapToClusterIdx(g_vBoundsBuffer[l0Bounds.max].w, suggestedBase));
            const unsigned int clustIdxMi1 = (const unsigned int)min(255, SnapToClusterIdx(g_vBoundsBuffer[l1Bounds.min].w, suggestedBase));
            const unsigned int clustIdxMa1 = (const unsigned int)min(255, SnapToClusterIdx(g_vBoundsBuffer[l1Bounds.max].w, suggestedBase));

            // Pack both lights' ranges, 8 bits each: [max1 | min1 | max0 | min0].
            clusterIdxs[l] = (clustIdxMa1 << 24) | (clustIdxMi1 << 16) | (clustIdxMa0 << 8) | (clustIdxMi0 << 0);
        }
    }

    // Group-wide sync.
#if NR_THREADS > PLATFORM_LANE_COUNT
    GroupMemoryBarrierWithGroupSync();
#endif
    ...
}

统计各个Cluster内的灯光数量[iSpaceAvail]

上面只是简单地Test了灯光是否在Cluster内(CheckIntersectionBasic),这还不够精准,还需要检测构成Cluster的顶点是否与灯光Volume相交(CheckIntersection)
如果相交了,灯光的Index才最终加入到g_vLayeredLightList里。
相对应的lightCategory也要计数+1

#define NR_THREADS       64
#define LIGHTCATEGORY_COUNT       5

// Per-thread, per-category light counters. Each thread handles one cluster (see the kernel below),
// so this holds the light count of every cluster of the tile, split by category.
groupshared int categoryListCountScratch[NR_THREADS * LIGHTCATEGORY_COUNT];

// Per-thread copy of the per-category index shifts
// (_EnvLightIndexShift / _DecalIndexShift / _LocalVolumetricFogIndexShift).
// NOTE(review): the article's author considers this scratch copy redundant and suggests reading
// the constant-buffer values directly - not verified here.
groupshared int shiftIndexScratch[NR_THREADS * LIGHTCATEGORY_COUNT];

// 4 lights x 6 planes; the article describes each plane as float4(vN.xyz, -dot(vN, p0)).
groupshared float4 lightPlanes[4 * 6];// Each plane is defined by a float4. 6 planes per light, 4 lights (24 planes)

// Cheap test: is cluster index k inside the [min, max] cluster range stored for coarse light l?
bool CheckIntersectionBasic(int l, int k)
{
    // Two lights share one uint: pick this light's 16-bit half, low byte = min, high byte = max.
    unsigned int val = (clusterIdxs[l >> 1] >> (16 * (l & 1))) & 0xffff;
    return ((val >> 0) & 0xff) <= ((uint)k) && ((uint)k) <= ((val >> 8) & 0xff);
}

// Clears every category counter and shift entry owned by threadIdx.
void ZeroCategoryListCountAndShiftIndex(uint threadIdx)
{
    for (int i = 0; i < LIGHTCATEGORY_COUNT; ++i)
    {
        categoryListCountScratch[threadIdx * LIGHTCATEGORY_COUNT + i] = 0;
        shiftIndexScratch[threadIdx * LIGHTCATEGORY_COUNT + i] = 0;
    }
}

// Stores the index shift of one category for threadIdx.
void WriteShiftIndex(uint threadIdx, uint index, int value)
{
    shiftIndexScratch[threadIdx * LIGHTCATEGORY_COUNT + index] = value;
}

// Adds one to the counter of the given category for threadIdx.
void IncrementCategoryListCount(uint threadIdx, uint index)
{
    categoryListCountScratch[threadIdx * LIGHTCATEGORY_COUNT + index]++;
}

// Article excerpt: per-cluster allocation and light list fill; "..." marks code elided by the article.
[numthreads(NR_THREADS, 1, 1)]
void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    ...
    // The article notes g_iLog2NumClusters = 6, i.e. nrClusters = 64.
    int nrClusters = (1 << g_iLog2NumClusters);

    //////////////////////////////////////////////////////////

    uint start = 0;
    int i = (int)t;
    int iSpaceAvail = 0;
    int iSum = 0;
    if (i < nrClusters)
    {
        // Each thread checks it's respective cluster against all coarse lights for intersection.
        // At the end, 'iSum' represents the number of lights that intersect this cluster!
        for (int l = 0; l < iNrCoarseLights; l++)
        {
            iSum += (CheckIntersectionBasic(l, i) ? 1 : 0);
        }

        // We have a limit to the number of lights we will track in a cluster (128). This is how much memory we
        // want to allocate out of g_LayeredSingleIdxBuffer.
        iSpaceAvail = min(iSum,MAX_NR_COARSE_ENTRIES); // combined storage for both direct lights and reflection
        InterlockedAdd(g_LayeredSingleIdxBuffer[0], (uint)iSpaceAvail, start); // alloc list memory
        // g_LayeredSingleIdxBuffer[0] acts as a running allocation counter: InterlockedAdd returns its
        // previous value in 'start' (the first slot reserved for this cluster) and advances it by
        // iSpaceAvail (the number of slots reserved for this cluster).
    }

    // All our cull data are in the same list, but at render time envLights are separated so we need to shift the index
    // to make it work correctly
    ZeroCategoryListCountAndShiftIndex(t);

    WriteShiftIndex(t, LIGHTCATEGORY_ENV, _EnvLightIndexShift);
    WriteShiftIndex(t, LIGHTCATEGORY_DECAL, _DecalIndexShift);
    WriteShiftIndex(t, LIGHTCATEGORY_LOCAL_VOLUMETRIC_FOG, _LocalVolumetricFogIndexShift);

    uint offs = start;
    // Walk the tile's coarse light list four lights at a time and append every light that intersects
    // this thread's cluster. The article notes iNrCoarseLights = min(lightOffs, MAX_NR_COARSE_ENTRIES),
    // so it is at most 128.
    for (int ll = 0; ll < iNrCoarseLights; ll += 4)
    {
        // Only the first 24 threads fetch: thread i loads plane p = i >> 2 of light m = i & 3,
        // i.e. 6 planes for each of the 4 lights of this batch.
        int p = i >> 2;
        int m = i & 3;
        if (i < 24)
            lightPlanes[6 * m + p] = FetchPlane(min(iNrCoarseLights - 1, ll + m), p, eyeIndex);

        // Group-wide sync before lightPlanes is read by CheckIntersection.
#if NR_THREADS > PLATFORM_LANE_COUNT
        GroupMemoryBarrierWithGroupSync();
#endif

        // Test the lights whose planes were just fetched against this thread's cluster.
        for (int l = ll; l < min(iNrCoarseLights, (ll + 4)); l++)
        {
            if (offs < (start + iSpaceAvail) && i < nrClusters && CheckIntersection(l, i, viTilLL.xy, viTilUR.xy, suggestedBase, eyeIndex))
            {
                const int lightVolIndex = GenerateLightCullDataIndex(coarseList[l], g_iNrVisibLights, eyeIndex);
                uint lightCategory = _LightVolumeData[lightVolIndex].lightCategory;
                // Count this light in its category for this cluster; the per-category counts are
                // kept in the categoryListCountScratch LDS array.
                IncrementCategoryListCount(t, lightCategory);
                // Store the light index with its category's shift subtracted (the article notes this
                // matches the end of BuildPerTileLightList covered in its previous part).
                g_vLayeredLightList[offs++] = coarseList[l] - ReadShiftIndex(t, lightCategory);
            }
        }

#if NR_THREADS > PLATFORM_LANE_COUNT
        GroupMemoryBarrierWithGroupSync();
#endif
    }
    ...
}

Fetch Plane

这里的FetchPlane函数依旧是使用LightingConvexHullUtils.hlsl里面的,
[GetHullPlane]根据不同面序号返回对应的平面上的一点以及平面的法向
[GetHullPlaneEq]后续为了判断点与平面的朝向(ToLeftTest)预先构成float4(vN, -dot(vN,p0))这样的表示平面方式

_D9F6E79F-8A33-47ed-B15D-A01A967A5788_.png

图中标红Cube的是Hull的顶点,红线是Hull的平面法向
//LightingConvexHullUtils.hlsl

// Outputs a point p0 on side `sideIndex` of the light hull and that side's normal n0.
// The hull is described by its half axes boxX/boxY/boxZ, its center and an XY scale.
void GetHullPlane(out float3 p0, out float3 n0, const float3 boxX, const float3 boxY, const float3 boxZ, const float3 center, const float2 scaleXY, const int sideIndex)
{
    //const int iAbsSide = (sideIndex == 0 || sideIndex == 1) ? 0 : ((sideIndex == 2 || sideIndex == 3) ? 1 : 2);
    const int iAbsSide = min(sideIndex>>1, 2);
    // Odd side indices use +1, even ones -1.
    const float fS = (sideIndex & 1) != 0 ? 1 : (-1);

    float3 vA = fS*(iAbsSide == 0 ? boxX : (iAbsSide == 1 ? (-boxY) : boxZ));
    float3 vB = fS*(iAbsSide == 0 ? (-boxY) : (iAbsSide == 1 ? (-boxX) : (-boxY)));
    float3 vC = iAbsSide == 0 ? boxZ : (iAbsSide == 1 ? boxZ : (-boxX));

    // The article's author finds the top-quad condition hard to follow and suggests simulating it in C#.
    bool bIsTopQuad = iAbsSide == 2 && (sideIndex & 1) != 0;        // in this case all 4 verts get scaled.
    bool bIsSideQuad = (iAbsSide == 0 || iAbsSide == 1);        // if side quad only two verts get scaled (impacts q1 and q2)

    if (bIsTopQuad) { vB *= scaleXY.y; vC *= scaleXY.x; }

    float3 vA2 = vA;
    float3 vB2 = vB;

    if (bIsSideQuad) {vA2 *= (iAbsSide == 0 ? scaleXY.x : scaleXY.y); vB2 *= (iAbsSide == 0 ? scaleXY.y : scaleXY.x); }

    float3 vN = cross(vB2, 0.5 * (vA - vA2) - vC);  // +/- normal
    float3 v0 = vA + vB - vC;   // vector from center to p0
    p0 = center + v0;           // center + vA is center of face when scaleXY is 1.0
    // Flip vN when it points against v0, so the returned normal faces away from the center.
    n0 = dot(vN,v0) < 0.0 ? (-vN) : vN;
}

// Returns the plane of side `sideIndex` as float4(vN, -dot(vN, p0)), so that
// dot(plane, float4(P, 1.0)) equals dot(vN, P - p0).
float4 GetHullPlaneEq(const float3 boxX, const float3 boxY, const float3 boxZ, const float3 center, const float2 scaleXY, const int sideIndex)
{
    float3 p0, vN;
    GetHullPlane(p0, vN, boxX, boxY, boxZ, center, scaleXY, sideIndex);

    return float4(vN, -dot(vN,p0));
}

CheckIntersection

判断Cluster与灯光是否相交,有两个判断方式,
一个就是用前面计算的Cluster Index范围做简单的判断,
第二个就是用Cluster ID计算出构成Cluster的八个顶点与灯光平面的几何关系

// Returns the z of the near boundary of cluster k for the given suggestedBase (per the article,
// the per-tile base derived from ldsZMax). Calling it with k + 1 gives the far boundary of cluster k.
// logBasePerTile is only referenced by the commented-out legacy path below.
float ClusterIdxToZFlex(int k, float suggestedBase, bool logBasePerTile)
{
    float res;

    //float userscale = g_fClustScale;
    //if (logBasePerTile)
    //    userscale = GetScaleFromBase(suggestedBase);

    //float dist = (PositivePow(suggestedBase, (float)k) - 1.0) / (userscale * (suggestedBase - 1.0f));
    //res = dist + g_fNearPlane;

    const float C = (float)(1 << g_iLog2NumClusters);
    // Fraction of the near-to-far range at which cluster k starts: (base^k - 1) / (base^C - 1).
    float rangeFittedDistance = (PositivePow(suggestedBase, (float)k) - 1.0) / (PositivePow(suggestedBase, C) - 1.0);
    res = lerp(g_fNearPlane, g_fFarPlane, rangeFittedDistance);

#if USE_LEFT_HAND_CAMERA_SPACE
    return res;
#else
    return -res;
#endif
}

// Wrapper that fixes logBasePerTile from ENABLE_DEPTH_TEXTURE_BACKPLANE.
float ClusterIdxToZ(int k, float suggestedBase)
{
#ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
    bool logBasePerTile = true;     // resolved compile time
#else
    bool logBasePerTile = false;
#endif

    return ClusterIdxToZFlex(k, suggestedBase, logBasePerTile);
}

// Returns true when coarse light l may intersect cluster k of the tile spanning viTilLL..viTilUR.
bool CheckIntersection(int l, int k, uint2 viTilLL, uint2 viTilUR, float suggestedBase, uint eyeIndex)
{
    // Cheap rejection first: is k inside the light's packed [min, max] cluster range?
    // If this light's screen space depth bounds intersect this cluster...simple cluster test
    unsigned int val = (clusterIdxs[l >> 1] >> (16 * (l & 1))) & 0xffff;
    bool bIsHit = ((val >> 0) & 0xff) <= ((uint)k) && ((uint)k) <= ((val >> 8) & 0xff);
    if (bIsHit)
    {
#ifdef CONV_HULL_TEST_ENABLED
        float depthAtNearZ = ClusterIdxToZ(k, suggestedBase);
        float depthAtFarZ = ClusterIdxToZ(k + 1, suggestedBase);

        // If all 8 cluster corners are on the positive side of any one light plane,
        // the cluster does not intersect the light.
        for (int p = 0; p < 6; p++)
        {
            float4 plane = lightPlanes[6 * (l & 3) + p];

            bool bAllInvisib = true;

            for (int i = 0; i < 8; i++)
            {
                // Bits 0/1/2 of i select LL or UR in x, LL or UR in y, and near or far z.
                float x = (i & 1) == 0 ? viTilLL.x : viTilUR.x;
                float y = (i & 2) == 0 ? viTilLL.y : viTilUR.y;
                float z = (i & 4) == 0 ? depthAtNearZ : depthAtFarZ;
                // View-space position from the screen coordinate and linear depth.
                float3 vP = GetViewPosFromLinDepth(float2(x, y), z, eyeIndex);

                // With plane = float4(vN, -dot(vN, p0)), dot(plane, float4(vP, 1.0)) > 0
                // is equivalent to dot(vN, vP) > dot(vN, p0).
                // Test each corner of the cluster against the light bounding box planes
                bAllInvisib = bAllInvisib && dot(plane, float4(vP, 1.0)) > 0;
            }

            // One plane separates the cluster completely from the light.
            if (bAllInvisib) bIsHit = false;
        }
#endif
    }

    return bIsHit;
}

Final Resolve

上面的start值记录的是当前Cluster在g_vLayeredLightList记录LightData的起始Index
categoryListCountScratch也记录了Cluster不同的Category的LightData Count,
所以我们可以通过start以及对应的Category的LightData Count就可以在g_vLayeredLightList中寻址得到对应的Category LightData Index


// Index into the layered offset buffer for (eye, category, cluster, tile y, tile x).
uint GenerateLayeredOffsetBufferIndex(uint lightCategory, uint2 tileIndex, uint clusterIndex, uint numTilesX, uint numTilesY, int numClusters, uint eyeIndex)
{
    // Each eye is split into category, cluster, x, y
    uint eyeOffset = eyeIndex * LIGHTCATEGORY_COUNT * numClusters * numTilesX * numTilesY;
    int lightOffset = ((lightCategory * numClusters + clusterIndex) * numTilesY + tileIndex.y) * numTilesX + tileIndex.x;

    return (eyeOffset + lightOffset);
}

// 67108863 = (1 << 26) - 1
#define LIGHT_CLUSTER_PACKING_OFFSET_MASK (67108863)
#define LIGHT_CLUSTER_PACKING_COUNT_MASK (63)
#define LIGHT_CLUSTER_PACKING_OFFSET_BITS (26)

// Packs the offset into the low 26 bits and the count, clamped to 63, into the bits above.
uint PackClusterLayeredOffset(uint offset, uint count)
{
    return (offset & LIGHT_CLUSTER_PACKING_OFFSET_MASK) | (min(count, LIGHT_CLUSTER_PACKING_COUNT_MASK) << LIGHT_CLUSTER_PACKING_OFFSET_BITS);
}

// Per-thread, per-category light counters (the article notes a tile has 64 clusters, one per thread).
groupshared int categoryListCountScratch[NR_THREADS * LIGHTCATEGORY_COUNT];

// Adds one to the counter of the given category for threadIdx.
void IncrementCategoryListCount(uint threadIdx, uint index)
{
    categoryListCountScratch[threadIdx * LIGHTCATEGORY_COUNT + index]++;
}

// Reads the counter of the given category for threadIdx.
int ReadCategoryListCount(uint threadIdx, uint index)
{
    return categoryListCountScratch[threadIdx * LIGHTCATEGORY_COUNT + index];
}

// Index into the log-base buffer; suggestedBase is stored once per tile and per eye.
uint GenerateLogBaseBufferIndex(uint2 tileIndex, uint numTilesX, uint numTilesY, uint eyeIndex)
{
    uint eyeOffset = eyeIndex * numTilesX * numTilesY;
    return (eyeOffset + (tileIndex.y * numTilesX) + tileIndex.x);
}

// Article excerpt: allocation, list fill and final offset write; "..." marks code elided by the article.
[numthreads(NR_THREADS, 1, 1)]
void LIGHTLISTGEN(uint threadID : SV_GroupIndex, uint3 u3GroupID : SV_GroupID)
{
    ...
    uint start = 0;
    int i = (int)t;
    int iSpaceAvail = 0;
    int iSum = 0;
    if (i < nrClusters)
    {
        // Each thread checks it's respective cluster against all coarse lights for intersection.
        // At the end, 'iSum' represents the number of lights that intersect this cluster!
        for (int l = 0; l < iNrCoarseLights; l++)
        {
            iSum += (CheckIntersectionBasic(l, i) ? 1 : 0);
        }

        // We have a limit to the number of lights we will track in a cluster (128). This is how much memory we
        // want to allocate out of g_LayeredSingleIdxBuffer.
        iSpaceAvail = min(iSum,MAX_NR_COARSE_ENTRIES); // combined storage for both direct lights and reflection
        // Atomic equivalent of: start = g_LayeredSingleIdxBuffer[0]; g_LayeredSingleIdxBuffer[0] += iSpaceAvail;
        InterlockedAdd(g_LayeredSingleIdxBuffer[0], (uint)iSpaceAvail, start); // alloc list memory
        // g_LayeredSingleIdxBuffer[0] acts as a running allocation counter: 'start' receives its previous
        // value (the first slot reserved for this cluster) and iSpaceAvail slots are reserved after it.
    }
    ...
    // 'start' is also the offset that gets packed into g_LayeredOffset further down.
    uint offs = start;
    for (int ll = 0; ll < iNrCoarseLights; ll += 4)
    {
        int p = i >> 2;
        int m = i & 3;
        if (i < 24)
            lightPlanes[6 * m + p] = FetchPlane(min(iNrCoarseLights - 1, ll + m), p, eyeIndex);

#if NR_THREADS > PLATFORM_LANE_COUNT
        GroupMemoryBarrierWithGroupSync();
#endif

        for (int l = ll; l < min(iNrCoarseLights, (ll + 4)); l++)
        {
            // iSpaceAvail is the number of lights that passed CheckIntersectionBasic (capped at
            // MAX_NR_COARSE_ENTRIES), so this never writes past the slots reserved above.
            if (offs < (start + iSpaceAvail) && i < nrClusters && CheckIntersection(l, i, viTilLL.xy, viTilUR.xy, suggestedBase, eyeIndex))
            {
                // Per the article, the categories end up stored contiguously; the per-category counts
                // read later through ReadCategoryListCount are what separates them.
                const int lightVolIndex = GenerateLightCullDataIndex(coarseList[l], g_iNrVisibLights, eyeIndex);
                uint lightCategory = _LightVolumeData[lightVolIndex].lightCategory;
                IncrementCategoryListCount(t, lightCategory);
                g_vLayeredLightList[offs++] = coarseList[l] - ReadShiftIndex(t, lightCategory);
            }
        }

#if NR_THREADS > PLATFORM_LANE_COUNT
        GroupMemoryBarrierWithGroupSync();
#endif
    }
    ...
    uint localOffs = 0;
    // Start from the entry of category 0 for this cluster; each loop iteration then steps to the
    // next category by adding (nrClusters * nrTilesX * nrTilesY).
    offs = GenerateLayeredOffsetBufferIndex(0, tileIDX, i, nrTilesX, nrTilesY, nrClusters, eyeIndex);
    for (int category = 0; category < LIGHTCATEGORY_COUNT; category++)
    {
        // Number of lights of this category in this thread's cluster.
        int numLights = ReadCategoryListCount(t, category);
        if (i < nrClusters)
        {
            // g_vLayeredLightList holds the light indices; g_LayeredOffset stores, per category,
            // where that category's run starts (start + localOffs) and how many lights it has.
            g_LayeredOffset[offs] = PackClusterLayeredOffset((start + localOffs), (uint)numLights);
            offs += (nrClusters * nrTilesX * nrTilesY);
            localOffs += numLights; // use unclamped count for localOffs
        }
    }

    // Save the per-tile suggestedBase so the cluster index can be recomputed from it later
    // (the article notes the light loop needs it).
#ifdef ENABLE_DEPTH_TEXTURE_BACKPLANE
    const uint logBaseIndex = GenerateLogBaseBufferIndex(tileIDX, nrTilesX, nrTilesY, eyeIndex);
    if (threadID == 0) g_logBaseBuffer[logBaseIndex] = suggestedBase;
#endif
    ...
}

LightLoop

类似FPTL,Cluster这里也同样调用的是GetCountAndStart接口获取当前Cluster的lightData Count以及start的Index,
FetchIndex获取最终的LightData Index
最终的流程如下:
1.positionInput的TileCoord + Depth.z计算出当前片元归属的clusterIdx
2.clusterIdx + 当前要计算的category (GenerateLayeredOffsetBufferIndex)计算当前cluster在g_vLayeredOffsetsBuffer的idx
3.g_vLayeredOffsetsBuffer[idx]即为上面最后记录的用来跳转用的dataPair(PackClusterLayeredOffset)
4.UnpackClusterLayeredOffset得到start和lightCount
5.Lighting的时候从start开始FetchIndex就可以得到lightData的真正Index

//LightLoopDef.hlsl
// Article excerpt: "..." marks code elided by the article.
...
#elif defined(USE_CLUSTERED_LIGHTLIST)

#include "Packages/com.unity.render-pipelines.high-definition/Runtime/Lighting/LightLoop/ClusteredUtils.hlsl"

// Tile size used by the clustered light list.
uint GetTileSize()
{
    return TILE_SIZE_CLUSTERED;
}

// Cluster index of a sample at linearDepth inside the given tile. Uses the per-tile log base
// from g_logBaseBuffer when g_isLogBaseBufferEnabled is set, otherwise the global g_fClustBase.
uint GetLightClusterIndex(uint2 tileIndex, float linearDepth)
{
    float logBase = g_fClustBase;
    if (g_isLogBaseBufferEnabled)
    {
        const uint logBaseIndex = GenerateLogBaseBufferIndex(tileIndex, _NumTileClusteredX, _NumTileClusteredY, unity_StereoEyeIndex);
        logBase = g_logBaseBuffer[logBaseIndex];
    }

    return SnapToClusterIdxFlex(linearDepth, logBase, g_isLogBaseBufferEnabled != 0);
}

// Inverse of the packing: the low LIGHT_CLUSTER_PACKING_OFFSET_BITS bits hold the offset, the bits above hold the count.
void UnpackClusterLayeredOffset(uint packedValue, out uint offset, out uint count)
{
    offset = packedValue & LIGHT_CLUSTER_PACKING_OFFSET_MASK;
    count = packedValue >> LIGHT_CLUSTER_PACKING_OFFSET_BITS;
}

// Reads the packed (start, count) pair of one category for the given tile and cluster.
void GetCountAndStartCluster(uint2 tileIndex, uint clusterIndex, uint lightCategory, out uint start, out uint lightCount)
{
    int nrClusters = (1 << g_iLog2NumClusters);

    const int idx = GenerateLayeredOffsetBufferIndex(lightCategory, tileIndex, clusterIndex, _NumTileClusteredX, _NumTileClusteredY, nrClusters, unity_StereoEyeIndex);

    uint dataPair = g_vLayeredOffsetsBuffer[idx];
    UnpackClusterLayeredOffset(dataPair, start, lightCount);
}

// Overload that derives the tile and the cluster index from posInput.
void GetCountAndStartCluster(PositionInputs posInput, uint lightCategory, out uint start, out uint lightCount)
{
    // Note: XR depends on unity_StereoEyeIndex already being defined,
    // which means ShaderVariables.hlsl needs to be defined ahead of this!

    uint2 tileIndex    = posInput.tileCoord;
    uint  clusterIndex = GetLightClusterIndex(tileIndex, posInput.linearDepth);

    GetCountAndStartCluster(tileIndex, clusterIndex, lightCategory, start, lightCount);
}

// Entry point used by the light loop; in this clustered branch it forwards to GetCountAndStartCluster.
void GetCountAndStart(PositionInputs posInput, uint lightCategory, out uint start, out uint lightCount)
{
    GetCountAndStartCluster(posInput, lightCategory, start, lightCount);
}

// Light index stored at position lightStart + lightOffset of the cluster light list.
uint FetchIndex(uint lightStart, uint lightOffset)
{
    return g_vLightListCluster[lightStart + lightOffset];
}
...

本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:http://www.ulsteruni.cn/article/68741631.html

如若内容造成侵权/违法违规/事实不符,请联系编程大学网进行投诉反馈email:xxxxxxxx@qq.com,一经查实,立即删除!

相关文章

虚拟机+FunASR的安装与使用

写在前面:我是偶然了解到这个语音识别这个AI方向的,由于当时没有接触过其他的AI模型(ps:当时常见的AI大都是使用过的,但只是止于直接地使用,画个图,让AI帮我解决一些不知道的问题而已),所以FunASR算是我接触过的首个开源且我现有的设备能跑的动的AI项目。所以我对AI的了…

Swoole 源码分析之 Timer 定时器模块

原文首发链接:Swoole 源码分析之 Timer 定时器模块 大家好,我是码农先森。 引言 Swoole 中的毫秒精度的定时器。底层基于 epoll_wait 和 setitimer 实现,数据结构使用最小堆,可支持添加大量定时器。 在同步 IO 进程中使用 setitimer 和信号实现,如 Manager 和 TaskWorker …

手把手教你做阅读理解题-初中中考阅读理解解题技巧013-dearMars Project

PDF格式公众号回复关键字:ZKYD013阅读理解技巧,在帮助读者有效获取和理解文本信息方面发挥着重要作用,熟练掌握如下6个技巧,可快速突破阅读理解 1 预览文章结构 在开始深入阅读之前,快速浏览文章的标题、段落开头和结尾,可以迅速把握文章的主题、大致内容和结构 标题通常能…

Java登陆第四十天——Router路由守卫练习

需求 未登录无法访问除login页面 练习 1.使用vite创建项目,导入依赖 npm create vite 选择vue+js npm i 导入基本依赖 npm vue-router 导入路由依赖2. 创建组件,login.vue、home.vue、list.vue 仅展示home.vue组件,其他都一样。 <script setup></script><tem…