
Commit

Add GPU direct communications
fluidnumerics-joe committed Sep 21, 2024
1 parent 34cc519 commit ca03665
Showing 11 changed files with 402 additions and 34 deletions.
2 changes: 1 addition & 1 deletion docs/GettingStarted/install.md
@@ -64,7 +64,7 @@ The Spectral Element Library in Fortran can be built provided the following depe

* [Cmake (v3.21 or greater)](https://cmake.org/resources/)
* Fortran 2008 compliant compiler ( `gfortran` recommended )
* MPI, e.g. [OpenMPI](https://www.open-mpi.org/)
* MPI, e.g. [OpenMPI](https://www.open-mpi.org/) with [GPU-Aware Support](./dependencies.md)
* [MAGMA](https://icl.utk.edu/magma/)
* [HDF5](https://www.hdfgroup.org/solutions/hdf5/)
* [FluidNumerics/feq-parse](https://github.com/FluidNumerics/feq-parse)
19 changes: 19 additions & 0 deletions docs/Learning/dependencies.md
@@ -0,0 +1,19 @@
# Dependencies

This documentation provides more detailed information about SELF's dependencies.


## GPU-Aware Message Passing Interface (MPI)
In order to support scaling of conservation law solvers to large supercomputers, SELF exposes additional parallelism through domain decomposition. This strategy divides a physical domain into multiple subdomains and gives ownership of each subdomain to a single process. These processes are often called "MPI ranks" or "MPI processes". So long as the SELF program you write enables domain decomposition during mesh creation, launching with `mpirun -np 4 /path/to/your/program` will provide you with domain decomposition across four MPI ranks.

With domain decomposition, each process has its own private memory. However, your conservation law likely demands that neighboring elements in your mesh participate in flux calculations together. When two neighboring elements reside on two separate processes, information must be shared. In SELF, this is achieved as part of the `SideExchange` methods for each data type when domain decomposition is enabled.

On CPU-only platforms, the `boundary` attributes of the `SELF_Mapped*` types are passed to `MPI_ISEND` (asynchronous send) to send data to neighboring processes. On the flip side, the `extboundary` attributes are passed to `MPI_IRECV` (asynchronous receive) to receive data from the neighboring processes. With each `MPI_ISEND`/`MPI_IRECV` pair, a unique tag for the message is calculated from the global edge/face ID and the variable ID. This ensures that messages are stored at the correct address locations when they are received.
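
Below is a minimal sketch of this pairing, using hypothetical names (`exchange_side`, `globalSideId`, `nGlobalSides`, `nPoints`) rather than SELF's actual internal variables; it only illustrates how the tag construction keeps each send matched with the correct receive.

```fortran
! Illustrative only -- not SELF's actual implementation.
! One non-blocking receive/send pair for a single shared side and variable.
subroutine exchange_side(boundary,extboundary,nPoints,globalSideId,ivar, &
                         nGlobalSides,neighborRank,mpiPrec,mpiComm, &
                         requests,msgCount)
  use mpi
  implicit none
  integer,intent(in)    :: nPoints              ! points on the shared side
  real(8),intent(in)    :: boundary(nPoints)    ! this rank's boundary data
  real(8),intent(inout) :: extboundary(nPoints) ! neighbor's data lands here
  integer,intent(in)    :: globalSideId,ivar,nGlobalSides,neighborRank
  integer,intent(in)    :: mpiPrec,mpiComm
  integer,intent(inout) :: requests(:)
  integer,intent(inout) :: msgCount
  integer :: tag,ierror

  ! Unique tag per (global side, variable) pair so messages match up
  tag = globalSideId+nGlobalSides*(ivar-1)

  msgCount = msgCount+1
  call MPI_IRECV(extboundary,nPoints,mpiPrec, &
                 neighborRank,tag,mpiComm,requests(msgCount),ierror)

  msgCount = msgCount+1
  call MPI_ISEND(boundary,nPoints,mpiPrec, &
                 neighborRank,tag,mpiComm,requests(msgCount),ierror)

endsubroutine exchange_side
```

A matching `MPI_WAITALL` over the accumulated requests must follow once all sides have been posted; in SELF this appears to be the role of `FinalizeMPIExchangeAsync`.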

On GPU and multi-GPU accelerated platforms, we assume that you are working with a GPU-aware installation of MPI. In this case, the GPU pointer for the `boundary` attribute (`boundary_gpu`) is passed to `MPI_ISEND`. Similarly, `extboundary_gpu` is passed to `MPI_IRECV`. If your installation of MPI is not GPU-aware and you are using a GPU-accelerated build of SELF, your program will halt during the initialization of the `DomainDecomposition` class. At this stage, we check the results of `MPIX_Query_rocm_support` and `MPIX_Query_cuda_support` to determine whether GPU-aware MPI is enabled at runtime.
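
The sketch below shows the same exchange when the buffers live in GPU memory, again with hypothetical names; `boundary_gpu` and `extboundary_gpu` stand in for device allocations (e.g. from `hipMalloc`), and `c_f_pointer` is used here only to hand MPI the device addresses. With GPU-aware MPI the data moves directly between device buffers; without it, passing device pointers like this would typically crash, which is why SELF checks for GPU-aware support at startup.

```fortran
! Illustrative only -- not SELF's actual implementation.
! With GPU-aware MPI, device buffers can be handed to MPI_ISEND/MPI_IRECV directly.
subroutine exchange_side_gpu(boundary_gpu,extboundary_gpu,nPoints,tag, &
                             neighborRank,mpiPrec,mpiComm,requests,msgCount)
  use mpi
  use iso_c_binding
  implicit none
  type(c_ptr),intent(in) :: boundary_gpu,extboundary_gpu ! device allocations
  integer,intent(in)     :: nPoints,tag,neighborRank,mpiPrec,mpiComm
  integer,intent(inout)  :: requests(:)
  integer,intent(inout)  :: msgCount
  ! Local
  real(8),pointer :: sendbuf(:),recvbuf(:)
  integer :: ierror

  ! Associate Fortran pointers with the device addresses; the data is never
  ! touched on the host, only its address is passed to MPI.
  call c_f_pointer(boundary_gpu,sendbuf,[nPoints])
  call c_f_pointer(extboundary_gpu,recvbuf,[nPoints])

  msgCount = msgCount+1
  call MPI_IRECV(recvbuf,nPoints,mpiPrec, &
                 neighborRank,tag,mpiComm,requests(msgCount),ierror)

  msgCount = msgCount+1
  call MPI_ISEND(sendbuf,nPoints,mpiPrec, &
                 neighborRank,tag,mpiComm,requests(msgCount),ierror)

endsubroutine exchange_side_gpu
```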

At the moment, we have only tested SELF with GPU-aware builds of OpenMPI on AMD and Nvidia GPU platforms. If you would like us to test other MPI flavors, [open an issue](https://github.com/FluidNumerics/SELF/issues/new/choose). To check whether your OpenMPI installation is built with GPU-aware support, run:
```shell
ompi_info --parsable --all | grep mpi_built_with_cuda_support:value
# expected output when CUDA (GPU-aware) support is built in:
mca:mpi:base:param:mpi_built_with_cuda_support:value:true
```
2 changes: 2 additions & 0 deletions mkdocs.yml
@@ -15,13 +15,15 @@ nav:
- Home: index.md
- Getting Started:
- Installation: GettingStarted/install.md

- Learning:
- Theory:
- Spectral Approximations: Learning/SpectralApproximations.md
- Differential Geometry: Learning/DifferentialGeometry.md
- Provable Stability: Learning/ProvableStability.md
- Code:
- Software Architecture: Learning/SoftwareArchitecture.md
- Dependencies: Learning/dependencies.md
- Contributing:
- Documentation: Contributing/Documentation.md

@@ -24,7 +24,7 @@
!
! //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// !

module SELF_DomainDecomposition
module SELF_DomainDecomposition_t

use SELF_Constants
use SELF_Lagrange
@@ -34,7 +34,7 @@ module SELF_DomainDecomposition

implicit none

type DomainDecomposition
type DomainDecomposition_t
logical :: mpiEnabled
integer :: mpiComm
integer :: mpiPrec
@@ -50,23 +50,23 @@ module SELF_DomainDecomposition

contains

procedure :: Init => Init_DomainDecomposition
procedure :: Free => Free_DomainDecomposition
procedure :: Init => Init_DomainDecomposition_t
procedure :: Free => Free_DomainDecomposition_t

procedure :: GenerateDecomposition => GenerateDecomposition_DomainDecomposition
procedure :: SetElemToRank => SetElemToRank_DomainDecomposition
procedure :: GenerateDecomposition => GenerateDecomposition_DomainDecomposition_t
procedure :: SetElemToRank => SetElemToRank_DomainDecomposition_t

procedure,public :: FinalizeMPIExchangeAsync

endtype DomainDecomposition
endtype DomainDecomposition_t

contains

subroutine Init_DomainDecomposition(this,enableMPI)
subroutine Init_DomainDecomposition_t(this,enableMPI)
#undef __FUNC__
#define __FUNC__ "Init_DomainDecomposition"
#define __FUNC__ "Init_DomainDecomposition_t"
implicit none
class(DomainDecomposition),intent(out) :: this
class(DomainDecomposition_t),intent(out) :: this
logical,intent(in) :: enableMPI
! Local
integer :: ierror
@@ -97,11 +97,11 @@ subroutine Init_DomainDecomposition(this,enableMPI)

allocate(this%offsetElem(1:this%nRanks+1))

endsubroutine Init_DomainDecomposition
endsubroutine Init_DomainDecomposition_t

subroutine Free_DomainDecomposition(this)
subroutine Free_DomainDecomposition_t(this)
implicit none
class(DomainDecomposition),intent(inout) :: this
class(DomainDecomposition_t),intent(inout) :: this
! Local
integer :: ierror

@@ -120,11 +120,11 @@ subroutine Free_DomainDecomposition(this)
call MPI_FINALIZE(ierror)
endif

endsubroutine Free_DomainDecomposition
endsubroutine Free_DomainDecomposition_t

subroutine GenerateDecomposition_DomainDecomposition(this,nGlobalElem,maxMsg)
subroutine GenerateDecomposition_DomainDecomposition_t(this,nGlobalElem,maxMsg)
implicit none
class(DomainDecomposition),intent(inout) :: this
class(DomainDecomposition_t),intent(inout) :: this
integer,intent(in) :: nGlobalElem
integer,intent(in) :: maxMsg

@@ -139,11 +139,11 @@ subroutine GenerateDecomposition_DomainDecomposition(this,nGlobalElem,maxMsg)
print*,__FILE__//" : Rank ",this%rankId+1," : n_elements = ", &
this%offSetElem(this%rankId+2)-this%offSetElem(this%rankId+1)

endsubroutine GenerateDecomposition_DomainDecomposition
endsubroutine GenerateDecomposition_DomainDecomposition_t

subroutine SetElemToRank_DomainDecomposition(this,nElem)
subroutine SetElemToRank_DomainDecomposition_t(this,nElem)
implicit none
class(DomainDecomposition),intent(inout) :: this
class(DomainDecomposition_t),intent(inout) :: this
integer,intent(in) :: nElem
! Local
integer :: iel
@@ -163,7 +163,7 @@ subroutine SetElemToRank_DomainDecomposition(this,nElem)
this%elemToRank(iel))
enddo

endsubroutine SetElemToRank_DomainDecomposition
endsubroutine SetElemToRank_DomainDecomposition_t

subroutine DomainDecomp(nElems,nDomains,offSetElem)
! From https://www.hopr-project.org/externals/Meshformat.pdf, Algorithm 4
@@ -225,7 +225,7 @@ subroutine ElemToRank(nDomains,offsetElem,elemID,domain)
endsubroutine ElemToRank

subroutine FinalizeMPIExchangeAsync(mpiHandler)
class(DomainDecomposition),intent(inout) :: mpiHandler
class(DomainDecomposition_t),intent(inout) :: mpiHandler
! Local
integer :: ierror
integer :: msgCount
@@ -241,7 +241,7 @@ subroutine FinalizeMPIExchangeAsync(mpiHandler)
endsubroutine FinalizeMPIExchangeAsync

! subroutine GlobalReduce_RealScalar(mpiHandler,sendBuf,recvBuf)
! class(DomainDecomposition),intent(in) :: mpiHandler
! class(DomainDecomposition_t),intent(in) :: mpiHandler
! real(prec),intent(in) :: sendBuf
! real(prec),intent(out) :: recvBuf
! ! Local
@@ -261,4 +261,4 @@

! endsubroutine GlobalReduce_RealScalar

endmodule SELF_DomainDecomposition
endmodule SELF_DomainDecomposition_t
37 changes: 37 additions & 0 deletions src/cpu/SELF_DomainDecomposition.f90
@@ -0,0 +1,37 @@
! //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// !
!
! Maintainers : [email protected]
! Official Repository : https://github.com/FluidNumerics/self/
!
! Copyright © 2024 Fluid Numerics LLC
!
! Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
!
! 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
!
! 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in
! the documentation and/or other materials provided with the distribution.
!
! 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from
! this software without specific prior written permission.
!
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
! HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
! LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
! THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
! THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
!
! //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// !

module SELF_DomainDecomposition

use SELF_DomainDecomposition_t

implicit none

type, extends(DomainDecomposition_t) :: DomainDecomposition
endtype DomainDecomposition


endmodule SELF_DomainDecomposition

22 changes: 22 additions & 0 deletions src/gpu/SELF_DomainDecomposition.cpp
@@ -0,0 +1,22 @@
#include <stdio.h>
#include <mpi.h>
#include <mpi-ext.h> /* Needed for ROCm-aware check */

extern "C"
{
int check_gpu_aware_support()
{
int gpuaware = 0;

#if defined(OMPI_HAVE_MPI_EXT_ROCM) && OMPI_HAVE_MPI_EXT_ROCM
gpuaware = (int) MPIX_Query_rocm_support();
#endif

#if defined(OMPI_HAVE_MPI_EXT_CUDA) && OMPI_HAVE_MPI_EXT_CUDA
gpuaware = (int) MPIX_Query_cuda_support();
#endif

return gpuaware;

}
}
150 changes: 150 additions & 0 deletions src/gpu/SELF_DomainDecomposition.f90
@@ -0,0 +1,150 @@
! //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// !
!
! Maintainers : [email protected]
! Official Repository : https://github.com/FluidNumerics/self/
!
! Copyright © 2024 Fluid Numerics LLC
!
! Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
!
! 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
!
! 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in
! the documentation and/or other materials provided with the distribution.
!
! 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from
! this software without specific prior written permission.
!
! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
! LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
! HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
! LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
! THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
! THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
!
! //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// !

module SELF_DomainDecomposition

use SELF_DomainDecomposition_t
use mpi
use iso_c_binding

implicit none

type, extends(DomainDecomposition_t) :: DomainDecomposition
type(c_ptr) :: elemToRank_gpu


contains

procedure :: Init => Init_DomainDecomposition
procedure :: Free => Free_DomainDecomposition

procedure :: SetElemToRank => SetElemToRank_DomainDecomposition

endtype DomainDecomposition

interface
function check_gpu_aware_support() bind(c,name="check_gpu_aware_support")
use iso_c_binding
integer(c_int) :: check_gpu_aware_support
end function check_gpu_aware_support
endinterface
contains

subroutine Init_DomainDecomposition(this,enableMPI)
implicit none
class(DomainDecomposition),intent(out) :: this
logical,intent(in) :: enableMPI
! Local
integer :: ierror
integer(c_int) :: gpuaware

this%mpiComm = 0
this%mpiPrec = prec
this%rankId = 0
this%nRanks = 1
this%nElem = 0
this%mpiEnabled = enableMPI

if(enableMPI) then
this%mpiComm = MPI_COMM_WORLD
print*,__FILE__," : Initializing MPI"
call MPI_INIT(ierror)

if( check_gpu_aware_support() == 0 )then
print*,__FILE__" : Error! GPU Aware support is not detected. Stopping."
call MPI_FINALIZE(ierror)
stop
endif

call MPI_COMM_RANK(this%mpiComm,this%rankId,ierror)
call MPI_COMM_SIZE(this%mpiComm,this%nRanks,ierror)
print*,__FILE__," : Rank ",this%rankId+1,"/",this%nRanks," checking in."
else
print*,__FILE__," : MPI not initialized. No domain decomposition used."
endif

if(prec == real32) then
this%mpiPrec = MPI_FLOAT
else
this%mpiPrec = MPI_DOUBLE
endif

allocate(this%offsetElem(1:this%nRanks+1))

endsubroutine Init_DomainDecomposition

subroutine Free_DomainDecomposition(this)
implicit none
class(DomainDecomposition),intent(inout) :: this
! Local
integer :: ierror

if(associated(this%offSetElem)) then
deallocate(this%offSetElem)
endif
if(associated(this%elemToRank)) then
deallocate(this%elemToRank)
call gpuCheck(hipFree(this%elemToRank_gpu))
endif

if(allocated(this%requests)) deallocate(this%requests)
if(allocated(this%stats)) deallocate(this%stats)

if(this%mpiEnabled) then
print*,__FILE__," : Rank ",this%rankId+1,"/",this%nRanks," checking out."
call MPI_FINALIZE(ierror)
endif

endsubroutine Free_DomainDecomposition

subroutine SetElemToRank_DomainDecomposition(this,nElem)
implicit none
class(DomainDecomposition),intent(inout) :: this
integer,intent(in) :: nElem
! Local
integer :: iel

this%nElem = nElem

allocate(this%elemToRank(1:nelem))
call gpuCheck(hipMalloc(this%elemToRank_gpu,sizeof(this%elemToRank)))

call DomainDecomp(nElem, &
this%nRanks, &
this%offSetElem)

do iel = 1,nElem
call ElemToRank(this%nRanks, &
this%offSetElem, &
iel, &
this%elemToRank(iel))
enddo
call gpuCheck(hipMemcpy(this%elemToRank_gpu,c_loc(this%elemToRank),sizeof(this%elemToRank),hipMemcpyHostToDevice))

endsubroutine SetElemToRank_DomainDecomposition


endmodule SELF_DomainDecomposition