Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/nanovdb.yml
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ jobs:
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
\'
- name: test
run: cd build && sudo ctest -V -E ".*cuda.*"
run: cd build && sudo ctest -V -E ".*cuda.*|.*mgpu.*"

windows-nanovdb:
if: |
Expand Down Expand Up @@ -137,7 +137,7 @@ jobs:
\'
- name: test
shell: bash
run: cd build && ctest -V -E ".*cuda.*"
run: cd build && ctest -V -E ".*cuda.*|.*mgpu.*"

macos-nanovdb:
if: |
Expand Down Expand Up @@ -166,7 +166,7 @@ jobs:
--components=core,nano,nanotest,nanoexam,nanobench,nanotool
--cargs=\'-DUSE_EXPLICIT_INSTANTIATION=OFF -DNANOVDB_USE_CUDA=OFF -DNANOVDB_USE_OPENVDB=ON\'
- name: test
run: cd build && ctest -V -E ".*cuda.*"
run: cd build && ctest -V -E ".*cuda.*|.*mgpu.*"

nanovdb-lite:
if: |
Expand Down
2 changes: 1 addition & 1 deletion nanovdb/nanovdb/GridHandle.h
Original file line number Diff line number Diff line change
Expand Up @@ -490,7 +490,7 @@ splitGrids(const GridHandle<BufferT> &handle, const BufferT* other = nullptr)
h = HandleT(std::move(buffer));
ptr = util::PtrAdd(ptr, src->mGridSize);
}
return std::move(handles);
return handles;
}// splitGrids

/// @brief Combines (or merges) multiple GridHandles into a single GridHandle containing all grids
Expand Down
7 changes: 4 additions & 3 deletions nanovdb/nanovdb/NanoVDB.h
Original file line number Diff line number Diff line change
Expand Up @@ -3213,18 +3213,19 @@ struct NANOVDB_ALIGN(NANOVDB_DATA_ALIGNMENT) InternalData
__hostdev__ const StatsT& average() const { return mAverage; }
__hostdev__ const StatsT& stdDeviation() const { return mStdDevi; }

// GCC 11 (and possibly prior versions) has a regression that results in invalid
// GCC 13 (and possibly prior versions) has a regression that results in invalid
// warnings when -Wstringop-overflow is turned on. For details, refer to
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101854
#if defined(__GNUC__) && (__GNUC__ < 12) && !defined(__APPLE__) && !defined(__llvm__)
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106757
#if defined(__GNUC__) && (__GNUC__ < 14) && !defined(__APPLE__) && !defined(__llvm__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstringop-overflow"
#endif
__hostdev__ void setMin(const ValueT& v) { mMinimum = v; }
__hostdev__ void setMax(const ValueT& v) { mMaximum = v; }
__hostdev__ void setAvg(const StatsT& v) { mAverage = v; }
__hostdev__ void setDev(const StatsT& v) { mStdDevi = v; }
#if defined(__GNUC__) && (__GNUC__ < 12) && !defined(__APPLE__) && !defined(__llvm__)
#if defined(__GNUC__) && (__GNUC__ < 14) && !defined(__APPLE__) && !defined(__llvm__)
#pragma GCC diagnostic pop
#endif

Expand Down
368 changes: 339 additions & 29 deletions nanovdb/nanovdb/PNanoVDB.h

Large diffs are not rendered by default.

12 changes: 0 additions & 12 deletions nanovdb/nanovdb/cuda/DeviceMesh.h
Original file line number Diff line number Diff line change
Expand Up @@ -222,18 +222,6 @@ inline size_t minDevicePageSize(const DeviceMesh& mesh)
return minGranularity;
}// minDevicePageSize

/// @brief Launches a function for each node in the device mesh in parallel and blocks until they have all completed
template<typename Func, typename... Args>
void parallelForEach(const nanovdb::cuda::DeviceMesh& mesh, Func func, Args... args)
{
std::vector<std::thread> threads;
for (const auto& [device, stream] : mesh) {
threads.emplace_back(func, device, stream, args...);
}
std::for_each(threads.begin(), threads.end(), [](std::thread& t) { t.join(); });
threads.clear();
}// parallelForEach

} // namespace cuda

} // namespace nanovdb
Expand Down
24 changes: 12 additions & 12 deletions nanovdb/nanovdb/cuda/GridHandle.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,14 @@ namespace nanovdb {

namespace cuda {

namespace {// anonymous namespace
__global__ void cpyGridHandleMeta(const GridData *d_data, GridHandleMetaData *d_meta)
namespace detail {

static __global__ void cpyGridHandleMeta(const GridData *d_data, GridHandleMetaData *d_meta)
{
nanovdb::cpyGridHandleMeta(d_data, d_meta);
}

__global__ void updateGridCount(GridData *d_data, uint32_t gridIndex, uint32_t gridCount, bool *d_dirty)
static __global__ void updateGridCount(GridData *d_data, uint32_t gridIndex, uint32_t gridCount, bool *d_dirty)
{
NANOVDB_ASSERT(gridIndex < gridCount);
*d_dirty = (d_data->mGridIndex != gridIndex) || (d_data->mGridCount != gridCount);
Expand All @@ -41,7 +42,8 @@ __global__ void updateGridCount(GridData *d_data, uint32_t gridIndex, uint32_t g
if (d_data->mChecksum.isEmpty()) *d_dirty = false;// no need to update checksum if it didn't already exist
}
}
}// anonymous namespace

}// namespace detail

template<typename BufferT, template <class, class...> class VectorT = std::vector>
inline typename util::enable_if<BufferTraits<BufferT>::hasDeviceDual, VectorT<GridHandle<BufferT>>>::type
Expand All @@ -52,14 +54,13 @@ splitGridHandles(const GridHandle<BufferT> &handle, const BufferT* other = nullp
VectorT<GridHandle<BufferT>> handles(handle.gridCount());
bool dirty, *d_dirty;// use this to check if the checksum needs to be recomputed
cudaCheck(util::cuda::mallocAsync((void**)&d_dirty, sizeof(bool), stream));
int device = 0;
cudaCheck(cudaGetDevice(&device));
int device = util::cuda::currentDevice();
for (uint32_t n=0; n<handle.gridCount(); ++n) {
auto buffer = BufferT::create(handle.gridSize(n), other, device, stream);
GridData *dst = reinterpret_cast<GridData*>(buffer.deviceData());
const GridData *src = reinterpret_cast<const GridData*>(ptr);
cudaCheck(cudaMemcpyAsync(dst, src, handle.gridSize(n), cudaMemcpyDeviceToDevice, stream));
updateGridCount<<<1, 1, 0, stream>>>(dst, 0u, 1u, d_dirty);
detail::updateGridCount<<<1, 1, 0, stream>>>(dst, 0u, 1u, d_dirty);
cudaCheckError();
cudaCheck(cudaMemcpyAsync(&dirty, d_dirty, sizeof(bool), cudaMemcpyDeviceToHost, stream));
cudaCheck(cudaStreamSynchronize(stream));
Expand All @@ -68,7 +69,7 @@ splitGridHandles(const GridHandle<BufferT> &handle, const BufferT* other = nullp
ptr = util::PtrAdd(ptr, handle.gridSize(n));
}
cudaCheck(util::cuda::freeAsync(d_dirty, stream));
return std::move(handles);
return handles;
}// cuda::splitGridHandles

template<typename BufferT, template <class, class...> class VectorT>
Expand All @@ -81,8 +82,7 @@ mergeGridHandles(const VectorT<GridHandle<BufferT>> &handles, const BufferT* oth
gridCount += h.gridCount();
for (uint32_t n=0; n<h.gridCount(); ++n) size += h.gridSize(n);
}
int device = 0;
cudaCheck(cudaGetDevice(&device));
int device = util::cuda::currentDevice();
auto buffer = BufferT::create(size, other, device, stream);
void *dst = buffer.deviceData();
bool dirty, *d_dirty;// use this to check if the checksum needs to be recomputed
Expand All @@ -92,7 +92,7 @@ mergeGridHandles(const VectorT<GridHandle<BufferT>> &handles, const BufferT* oth
for (uint32_t n=0; n<h.gridCount(); ++n) {
cudaCheck(cudaMemcpyAsync(dst, src, h.gridSize(n), cudaMemcpyDeviceToDevice, stream));
GridData *data = reinterpret_cast<GridData*>(dst);
updateGridCount<<<1, 1, 0, stream>>>(data, counter++, gridCount, d_dirty);
detail::updateGridCount<<<1, 1, 0, stream>>>(data, counter++, gridCount, d_dirty);
cudaCheckError();
cudaCheck(cudaMemcpyAsync(&dirty, d_dirty, sizeof(bool), cudaMemcpyDeviceToHost, stream));
cudaCheck(cudaStreamSynchronize(stream));
Expand Down Expand Up @@ -136,7 +136,7 @@ GridHandle<BufferT>::GridHandle(T&& buffer)
if (!tmp.isValid()) throw std::runtime_error("GridHandle was constructed with an invalid device buffer");
GridHandleMetaData *d_metaData;
cudaMalloc((void**)&d_metaData, tmp.mGridCount*sizeof(GridHandleMetaData));
cuda::cpyGridHandleMeta<<<1,1>>>(d_data, d_metaData);
cuda::detail::cpyGridHandleMeta<<<1,1>>>(d_data, d_metaData);
mMetaData.resize(tmp.mGridCount);
cudaCheck(cudaMemcpy(mMetaData.data(), d_metaData,tmp.mGridCount*sizeof(GridHandleMetaData), cudaMemcpyDeviceToHost));
cudaCheck(cudaFree(d_metaData));
Expand Down
2 changes: 2 additions & 0 deletions nanovdb/nanovdb/examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@ nanovdb_example(NAME "ex_bump_pool_buffer")
nanovdb_example(NAME "ex_collide_level_set")
nanovdb_example(NAME "ex_raytrace_fog_volume")
nanovdb_example(NAME "ex_raytrace_level_set")
nanovdb_example(NAME "ex_dilate_nanovdb_cuda" OPENVDB)
nanovdb_example(NAME "ex_merge_nanovdb_cuda" OPENVDB)

if(CUDAToolkit_FOUND)
nanovdb_example(NAME "ex_make_mgpu_nanovdb") # requires cuRAND
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
// Copyright Contributors to the OpenVDB Project
// SPDX-License-Identifier: Apache-2.0

// the following files are from OpenVDB
#include <openvdb/tools/Morphology.h>
#include <openvdb/util/CpuTimer.h>

// the following files are from NanoVDB
#include <nanovdb/NanoVDB.h>
#include <nanovdb/cuda/DeviceBuffer.h>
#include <nanovdb/tools/CreateNanoGrid.h>

template<typename BuildT>
void mainDilateGrid(
nanovdb::NanoGrid<BuildT> *deviceGridOriginal,
nanovdb::NanoGrid<BuildT> *deviceGridDilated,
nanovdb::NanoGrid<BuildT> *indexGridOriginal,
nanovdb::NanoGrid<BuildT> *indexGridDilated,
uint32_t nnType,
uint32_t benchmark_iters
);

/// @brief Prints topology and memory diagnostics for a host-side NanoVDB index grid.
/// @tparam GridType deduced NanoVDB grid type (e.g. nanovdb::NanoGrid<nanovdb::ValueOnIndex>)
/// @param header banner line printed before the statistics
/// @param grid   non-null pointer to a host-side (CPU) grid
template<typename GridType>
static void printIndexGridDiagnostics(const std::string& header, const GridType* grid)
{
    std::cout << header << std::endl;
    std::cout << "Allocated values [valueCount()] : " << grid->valueCount() << std::endl;
    std::cout << "Active voxels [activeVoxelCount()] : " << grid->activeVoxelCount() << std::endl;
    auto minCorner = grid->indexBBox().min(), maxCorner = grid->indexBBox().max();
    std::cout << "Index-space bounding box : [" << minCorner.x() << "," << minCorner.y() << "," << minCorner.z()
              << "] -> [" << maxCorner.x() << "," << maxCorner.y() << "," << maxCorner.z() << "]" << std::endl;
    std::cout << "Leaf nodes : " << grid->tree().nodeCount(0) << std::endl;
    std::cout << "Lower internal nodes : " << grid->tree().nodeCount(1) << std::endl;
    std::cout << "Upper internal nodes : " << grid->tree().nodeCount(2) << std::endl;
    // NanoVDB leaf nodes hold 8x8x8 = 512 voxels, hence the occupancy denominator.
    std::cout << "Leaf-level occupancy : "
        << 100.f * (float)(grid->activeVoxelCount())/(float)(grid->tree().nodeCount(0) * 512)
        << "%" << std::endl;
    std::cout << "Memory usage : " << grid->gridSize() << " bytes" << std::endl;
}

/// @brief This example depends on OpenVDB, NanoVDB, and CUDA.
///
/// Usage: <exe> input.vdb [<iterations>]
/// Reads the first FloatGrid from input.vdb, converts it to a NanoVDB index grid,
/// dilates the OpenVDB grid on the CPU, converts the dilated result to a second
/// index grid, uploads both grids to the GPU, and launches the GPU dilation
/// benchmark (mainDilateGrid) against them.
int main(int argc, char *argv[])
{
    using GridT = openvdb::FloatGrid;
    using BuildT = nanovdb::ValueOnIndex;

    // Select the type of dilation here. The NN_EDGE case supports leaf dilation too (currently)
    // openvdb::tools::NearestNeighbors nnType = openvdb::tools::NN_FACE_EDGE_VERTEX;
    openvdb::tools::NearestNeighbors nnType = openvdb::tools::NN_FACE;

    openvdb::util::CpuTimer cpuTimer;
    const bool printGridDiagnostics = true;

    try {

        if (argc<2) OPENVDB_THROW(openvdb::ValueError, "usage: "+std::string(argv[0])+" input.vdb [<iterations>]\n");
        int benchmark_iters = 10;
        // Validate the optional iteration count instead of silently ignoring a bad
        // or non-positive argument (the original ignored sscanf's return value).
        if (argc > 2 && (sscanf(argv[2], "%d", &benchmark_iters) != 1 || benchmark_iters < 1))
            OPENVDB_THROW(openvdb::ValueError, "invalid iteration count \""+std::string(argv[2])+"\"\n");

        // Read the initial level set from file

        cpuTimer.start("Read input VDB file");
        openvdb::initialize();
        openvdb::io::File inFile(argv[1]);
        inFile.open(false); // disable delayed loading
        auto baseGrids = inFile.getGrids();
        inFile.close();
        auto grid = openvdb::gridPtrCast<GridT>(baseGrids->at(0));
        // Check the cast result before any further use of the grid.
        if (!grid) OPENVDB_THROW(openvdb::ValueError, "First grid is not a FloatGrid\n");
        cpuTimer.stop();

        // Convert to indexGrid (original, un-dilated)
        cpuTimer.start("Converting openVDB input to indexGrid (original version)");
        auto handleOriginal = nanovdb::tools::openToIndexVDB<BuildT, nanovdb::cuda::DeviceBuffer>(
            grid,
            0u,    // Don't copy data channel
            false, // No stats
            false, // No tiles
            1      // Verbose mode
        );
        auto *indexGridOriginal = handleOriginal.grid<BuildT>();
        cpuTimer.stop();

        if (printGridDiagnostics)
            printIndexGridDiagnostics("============ Original Grid ===========", indexGridOriginal);

        // Dilation (CPU/OpenVDB version)
        cpuTimer.start("Dilating openVDB (on CPU)");
        openvdb::tools::dilateActiveValues(grid->tree(), 1, nnType);
        cpuTimer.stop();

        // Convert to indexGrid (dilated)
        cpuTimer.start("Converting openVDB input to indexGrid (dilated version)");
        auto handleDilated = nanovdb::tools::openToIndexVDB<BuildT, nanovdb::cuda::DeviceBuffer>(
            grid,
            0u,    // Don't copy data channel
            false, // No stats
            false, // No tiles
            1      // Verbose mode
        );
        cpuTimer.stop();

        auto *indexGridDilated = handleDilated.grid<BuildT>();

        if (printGridDiagnostics)
            printIndexGridDiagnostics("============ Dilated Grid ============", indexGridDilated);

        // Copy both NanoVDB grids to GPU
        handleOriginal.deviceUpload();
        handleDilated.deviceUpload();
        auto* deviceGridOriginal = handleOriginal.deviceGrid<BuildT>();
        auto* deviceGridDilated = handleDilated.deviceGrid<BuildT>();
        if (!deviceGridOriginal || !deviceGridDilated)
            OPENVDB_THROW(openvdb::RuntimeError, "Failure while uploading indexGrids to GPU");

        // Launch benchmark
        mainDilateGrid( deviceGridOriginal, deviceGridDilated, indexGridOriginal, indexGridDilated, nnType, benchmark_iters );

    }
    catch (const std::exception& e) {
        std::cerr << "An exception occurred: \"" << e.what() << "\"" << std::endl;
    }
    return 0;
}
Loading
Loading