fixing discrepancies between OpenToNano/PointsToGrid constructions

sifakis · matthewdcong · commit 6f3f350dafc9 · 2025-09-10T16:10:28.000-07:00
This PR fixes a few discrepancies between grid construction via (a) PointsToGrid, and (b) OpenToNanoVDB. The discrepancies do not typically inhibit the functionality of either construction, but they result in grids that are not bit-identical when they would otherwise be expected to be. The changes are:

Removal of the PointsToGridData::flags member. In practice, this member only controlled the leaf-level flags (the interior-node and GridData flags were already set independently), and even there it incorrectly set the IsBreadthFirst bit (which only makes sense for GridData::mFlags, not for the leaf node flags). The includeBBox method had no effect as written: it was never called in any of the unit tests, and its effect was overwritten later in the code.

PointsToGrid has been corrected to flag HasBBox as active in every node, not just in the leaf nodes as was happening before (the flags of the upper and lower nodes were previously being set to zero).

When computing the world-space bounding box, OpenToNanoVDB increments the upper bound of the index-space bounding box prior to applying the transform (hence, a grid with a single voxel [0,0,0] would have a WS bounding box of [0,0,0]->[1,1,1]). PointsToGrid was not applying this increment, resulting in a smaller WS bounding box; it now applies the same increment before the transform.

Signed-off-by: Matthew Cong <mcong@nvidia.com>
diff --git a/nanovdb/nanovdb/tools/cuda/DistributedPointsToGrid.cuh b/nanovdb/nanovdb/tools/cuda/DistributedPointsToGrid.cuh
@@ -461,7 +461,6 @@ DistributedPointsToGrid<BuildT>::DistributedPointsToGrid(const nanovdb::cuda::De
     mTempDevicePools = new nanovdb::cuda::TempDevicePool[mDeviceMesh.deviceCount()];
 
     cudaCheck(cudaMallocManaged(&mData, sizeof(PointsToGridData<BuildT>)));
-    mData->flags.initMask({GridFlags::HasBBox, GridFlags::IsBreadthFirst});
     mData->map = map;
 
     mStripeCounts = nullptr;
@@ -906,7 +905,7 @@ template <typename BuildT>
 inline void DistributedPointsToGrid<BuildT>::processNodes()
 {
     // Parallel construction of upper, lower, and leaf nodes
-    const uint8_t flags = static_cast<uint8_t>(mData->flags.data());// mIncludeStats ? 16u : 0u;// 4th bit indicates stats
+    const uint8_t flags = (uint8_t) GridFlags::HasBBox;
 
     for (const auto& [deviceId, stream] : mDeviceMesh) {
         cudaCheck(cudaSetDevice(deviceId));
@@ -1002,58 +1001,56 @@ inline void DistributedPointsToGrid<BuildT>::processPoints(const PtrT, size_t)
 template <typename BuildT>
 inline void DistributedPointsToGrid<BuildT>::processBBox()
 {
-    if (mData->flags.isMaskOn(GridFlags::HasBBox)) {
-        // Compute and propagate bounding boxes for the upper nodes and their descendents belonging to each device in parallel.
-        std::vector<cudaEvent_t> propagateLowerBBoxEvents(mDeviceMesh.deviceCount());
-        for (const auto& [deviceId, stream] : mDeviceMesh) {
-            cudaCheck(cudaSetDevice(deviceId));
-            // reset bbox in lower nodes
-            if (deviceNodeCount(deviceId)[1]) {
-                util::cuda::offsetLambdaKernel<<<numBlocks(deviceNodeCount(deviceId)[1]), mNumThreads, 0, stream>>>(deviceNodeCount(deviceId)[1], deviceNodeOffset(deviceId)[1], ResetLowerNodeBBoxFunctor<BuildT>(), mData);
-                cudaCheckError();
-            }
-
-            // update and propagate bbox from leaf -> lower/parent nodes
-            if (deviceNodeCount(deviceId)[0]) {
-                util::cuda::offsetLambdaKernel<<<numBlocks(deviceNodeCount(deviceId)[0]), mNumThreads, 0, stream>>>(deviceNodeCount(deviceId)[0], deviceNodeOffset(deviceId)[0], UpdateAndPropagateLeafBBoxFunctor<BuildT>(), mData);
-                cudaCheckError();
-            }
-
-            // reset bbox in upper nodes
-            if (deviceNodeCount(deviceId)[2]) {
-                util::cuda::offsetLambdaKernel<<<numBlocks(deviceNodeCount(deviceId)[2]), mNumThreads, 0, stream>>>(deviceNodeCount(deviceId)[2], deviceNodeOffset(deviceId)[2], ResetUpperNodeBBoxFunctor<BuildT>(), mData);
-                cudaCheckError();
-            }
-
-            // propagate bbox from lower -> upper/parent node
-            if (deviceNodeCount(deviceId)[1]) {
-                util::cuda::offsetLambdaKernel<<<numBlocks(deviceNodeCount(deviceId)[1]), mNumThreads, 0, stream>>>(deviceNodeCount(deviceId)[1], deviceNodeOffset(deviceId)[1], PropagateLowerBBoxFunctor<BuildT>(), mData);
-                cudaCheckError();
-            }
+    // Compute and propagate bounding boxes for the upper nodes and their descendents belonging to each device in parallel.
+    std::vector<cudaEvent_t> propagateLowerBBoxEvents(mDeviceMesh.deviceCount());
+    for (const auto& [deviceId, stream] : mDeviceMesh) {
+        cudaCheck(cudaSetDevice(deviceId));
+        // reset bbox in lower nodes
+        if (deviceNodeCount(deviceId)[1]) {
+            util::cuda::offsetLambdaKernel<<<numBlocks(deviceNodeCount(deviceId)[1]), mNumThreads, 0, stream>>>(deviceNodeCount(deviceId)[1], deviceNodeOffset(deviceId)[1], ResetLowerNodeBBoxFunctor<BuildT>(), mData);
+            cudaCheckError();
+        }
 
-            cudaEventCreate(&propagateLowerBBoxEvents[deviceId]);
-            cudaEventRecord(propagateLowerBBoxEvents[deviceId], stream);
+        // update and propagate bbox from leaf -> lower/parent nodes
+        if (deviceNodeCount(deviceId)[0]) {
+            util::cuda::offsetLambdaKernel<<<numBlocks(deviceNodeCount(deviceId)[0]), mNumThreads, 0, stream>>>(deviceNodeCount(deviceId)[0], deviceNodeOffset(deviceId)[0], UpdateAndPropagateLeafBBoxFunctor<BuildT>(), mData);
+            cudaCheckError();
         }
 
-        // Wait until bounding boxes are computed for each upper node and then compute the root bounding box on the zeroth device
-        {
-            int deviceId = 0;
-            auto stream = mDeviceMesh[deviceId].stream;
-            cudaCheck(cudaSetDevice(deviceId));
-            for (const auto& propagateLowerBBoxEvent : propagateLowerBBoxEvents)
-            {
-                cudaStreamWaitEvent(stream, propagateLowerBBoxEvent);
-            }
-            // propagate bbox from upper -> root/parent node
-            util::cuda::lambdaKernel<<<numBlocks(mData->nodeCount[2]), mNumThreads, 0, stream>>>(mData->nodeCount[2], PropagateUpperBBoxFunctor<BuildT>(), mData);
+        // reset bbox in upper nodes
+        if (deviceNodeCount(deviceId)[2]) {
+            util::cuda::offsetLambdaKernel<<<numBlocks(deviceNodeCount(deviceId)[2]), mNumThreads, 0, stream>>>(deviceNodeCount(deviceId)[2], deviceNodeOffset(deviceId)[2], ResetUpperNodeBBoxFunctor<BuildT>(), mData);
             cudaCheckError();
+        }
 
-            // update the world-bbox in the root node
-            util::cuda::lambdaKernel<<<1, 1, 0, stream>>>(1, UpdateRootWorldBBoxFunctor<BuildT>(), mData);
+        // propagate bbox from lower -> upper/parent node
+        if (deviceNodeCount(deviceId)[1]) {
+            util::cuda::offsetLambdaKernel<<<numBlocks(deviceNodeCount(deviceId)[1]), mNumThreads, 0, stream>>>(deviceNodeCount(deviceId)[1], deviceNodeOffset(deviceId)[1], PropagateLowerBBoxFunctor<BuildT>(), mData);
             cudaCheckError();
+        }
 
-            cudaCheck(cudaEventDestroy(propagateLowerBBoxEvents[deviceId]));
+        cudaEventCreate(&propagateLowerBBoxEvents[deviceId]);
+        cudaEventRecord(propagateLowerBBoxEvents[deviceId], stream);
+    }
+
+    // Wait until bounding boxes are computed for each upper node and then compute the root bounding box on the zeroth device
+    {
+        int deviceId = 0;
+        auto stream = mDeviceMesh[deviceId].stream;
+        cudaCheck(cudaSetDevice(deviceId));
+        for (const auto& propagateLowerBBoxEvent : propagateLowerBBoxEvents)
+        {
+            cudaStreamWaitEvent(stream, propagateLowerBBoxEvent);
         }
+        // propagate bbox from upper -> root/parent node
+        util::cuda::lambdaKernel<<<numBlocks(mData->nodeCount[2]), mNumThreads, 0, stream>>>(mData->nodeCount[2], PropagateUpperBBoxFunctor<BuildT>(), mData);
+        cudaCheckError();
+
+        // update the world-bbox in the root node
+        util::cuda::lambdaKernel<<<1, 1, 0, stream>>>(1, UpdateRootWorldBBoxFunctor<BuildT>(), mData);
+        cudaCheckError();
+
+        cudaCheck(cudaEventDestroy(propagateLowerBBoxEvents[deviceId]));
     }
 
     // Explicitly synchronize so that move constructor in getHandle doesn't fail
diff --git a/nanovdb/nanovdb/tools/cuda/PointsToGrid.cuh b/nanovdb/nanovdb/tools/cuda/PointsToGrid.cuh
@@ -277,7 +277,6 @@ struct PointsToGridData {
     uint32_t *d_indx;// device pointer to point indices (or IDs)
     uint32_t  nodeCount[3], *pointsPerLeafPrefix, *pointsPerLeaf;// 0=leaf,1=lower, 2=upper
     uint32_t  voxelCount,  *pointsPerVoxelPrefix, *pointsPerVoxel;
-    BitFlags<16> flags;
     __hostdev__ NanoGrid<BuildT>&  getGrid() const {return *util::PtrAdd<NanoGrid<BuildT>>(d_bufferPtr, grid);}
     __hostdev__ NanoTree<BuildT>&  getTree() const {return *util::PtrAdd<NanoTree<BuildT>>(d_bufferPtr, tree);}
     __hostdev__ NanoRoot<BuildT>&  getRoot() const {return *util::PtrAdd<NanoRoot<BuildT>>(d_bufferPtr, root);}
@@ -303,7 +302,6 @@ public:
         , mPointType(util::is_same<BuildT,Point>::value ? PointType::Default : PointType::Disable)
     {
         mData.map = map;
-        mData.flags.initMask({GridFlags::HasBBox, GridFlags::IsBreadthFirst});
         mDeviceData = static_cast<PointsToGridData<BuildT>*>(ResourceT::allocateAsync(sizeof(PointsToGridData<BuildT>), ResourceT::DEFAULT_ALIGNMENT, mStream));
     }
 
@@ -335,10 +333,6 @@ public:
     /// @param mode Mode of checksum computation
     void setChecksum(CheckMode mode = CheckMode::Disable){mChecksum = mode;}
 
-    /// @brief Toggle on and off the computation of a bounding-box
-    /// @param on If true bbox will be computed
-    void includeBBox(bool on = true) { mData.flags.setMask(GridFlags::HasBBox, on); }
-
     /// @brief Set the name of the output grid
     /// @param name name of the output grid
     void setGridName(const std::string &name) {mGridName = name;}
@@ -901,6 +895,8 @@ struct BuildGridTreeRootFunctor
             default:
                 printf("Error in PointsToGrid<BuildT, ResourceT>::processGridTreeRoot: invalid pointType\n");
             }
+        } else if constexpr(BuildTraits<BuildT>::is_onindex) {
+            grid.mGridClass = GridClass::IndexGrid;
         } else if constexpr(BuildTraits<BuildT>::is_offindex) {
             grid.mData1 = 1u + 512u*d_data->nodeCount[0];
             grid.mGridClass = GridClass::IndexGrid;
@@ -946,7 +942,7 @@ struct BuildUpperNodesFunctor
 #endif
         root.tile(tid)->setChild(ijk, &upper, &root);
         upper.mBBox[0] = ijk;
-        upper.mFlags = 0;
+        upper.mFlags = (uint64_t) GridFlags::HasBBox;
         upper.mValueMask.setOff();
         upper.mChildMask.setOff();
         upper.mMinimum = upper.mMaximum = typename NanoLower<BuildT>::ValueType(0);
@@ -992,7 +988,7 @@ struct BuildLowerNodesFunctor
         auto &lower = d_data->getLower(tid);
         upper.setChild(upperOffset, &lower);
         lower.mBBox[0] = upper.offsetToGlobalCoord(upperOffset);
-        lower.mFlags = 0;
+        lower.mFlags = (uint64_t) GridFlags::HasBBox;
         lower.mValueMask.setOff();
         lower.mChildMask.setOff();
         lower.mMinimum = lower.mMaximum = typename NanoLower<BuildT>::ValueType(0);// background;
@@ -1093,7 +1089,7 @@ struct SetLeafInactiveVoxelValuesFunctor
 template<typename BuildT, typename ResourceT>
 inline void PointsToGrid<BuildT, ResourceT>::processLeafNodes(size_t pointCount)
 {
-    const uint8_t flags = static_cast<uint8_t>(mData.flags.data());// mIncludeStats ? 16u : 0u;// 4th bit indicates stats
+    const uint8_t flags = (uint8_t) GridFlags::HasBBox;
 
     if (mVerbose==2) mTimer.start("process leaf meta data");
     // loop over leaf nodes and add it to its parent node
@@ -1274,19 +1270,16 @@ struct UpdateRootWorldBBoxFunctor
 {
     __device__
     void operator()(size_t tid, PointsToGridData<BuildT> *d_data) {
-        d_data->getGrid().mWorldBBox = d_data->getRoot().mBBox.transform(d_data->map);
+        auto BBox = d_data->getRoot().mBBox;
+        BBox.max() += 1;
+        d_data->getGrid().mFlags.setMaskOn(GridFlags::HasBBox);
+        d_data->getGrid().mWorldBBox = BBox.transform(d_data->map);
     }
 };
 
 template<typename BuildT, typename ResourceT>
 inline void PointsToGrid<BuildT, ResourceT>::processBBox()
 {
-    if (mData.flags.isMaskOff(GridFlags::HasBBox)) {
-        ResourceT::deallocateAsync(mData.d_leaf_keys, mData.nodeCount[0]*sizeof(uint64_t), ResourceT::DEFAULT_ALIGNMENT, mStream);
-        ResourceT::deallocateAsync(mData.d_lower_keys, mData.nodeCount[1]*sizeof(uint64_t), ResourceT::DEFAULT_ALIGNMENT, mStream);
-        return;
-    }
-
     // reset bbox in lower nodes
     util::cuda::lambdaKernel<<<numBlocks(mData.nodeCount[1]), mNumThreads, 0, mStream>>>(mData.nodeCount[1], ResetLowerNodeBBoxFunctor<BuildT>(), mDeviceData);
     cudaCheckError();