/* BARRIER    set a barrier; for SPMD versions */
#if defined(MPI)
# define BARRIER call mpi_barrier(mpi_comm_hycom,mpierr)
#elif defined(SHMEM)
# define BARRIER call shmem_barrier_all()
#endif
/* BARRIER_MP halo synchronization; SHMEM only */
/* BARRIER_NP halo synchronization; SHMEM only */
#if defined(RINGB)
#define BARRIER_MP call xctbar(idproc(mproc-1,nproc),idproc(mproc+1,nproc))
#define BARRIER_NP call xctbar(idproc(mproc,nproc-1),idproc(mproc,nproc+1))
#else
#define BARRIER_MP BARRIER
#define BARRIER_NP BARRIER
#endif
#if defined(MPI)
/* #define MPISR */
/* MTYPE4     mpi type for real*4  */
/* MTYPER     mpi type for real    */
/* MTYPED     mpi type for real*8  */
/* MTYPEI     mpi type for integer */
/* MPI_SEND   either mpi_send  or mpi_ssend  */
/* MPI_ISEND  either mpi_isend or mpi_issend */
#if defined(NOMPIR8) /* LAM does not support mpi_real[48] */
#if defined(REAL4)
# define MTYPE4 mpi_real
# define MTYPER mpi_real
# define MTYPED mpi_double_precision
# define MTYPEI mpi_integer
#else /* REAL8 */
# define MTYPE4 mpi_real
# define MTYPER mpi_double_precision
# define MTYPED mpi_double_precision
# define MTYPEI mpi_integer
#endif
#else /* most MPI's allow mpi_real[48] */
#if defined(REAL4)
# define MTYPE4 mpi_real4
# define MTYPER mpi_real4
# define MTYPED mpi_real8
# define MTYPEI mpi_integer
#else /* REAL8 */
# define MTYPE4 mpi_real4
# define MTYPER mpi_real8
# define MTYPED mpi_real8
# define MTYPEI mpi_integer
#endif
#endif
#if defined(SSEND)
# define MPI_SEND  mpi_ssend
# define MPI_ISEND mpi_issend
#else
# define MPI_SEND  mpi_send
# define MPI_ISEND mpi_isend
#endif /* SSEND:else */
#endif /* MPI */
#if defined(SHMEM)
/* SHMEM_GET4 get real*4  variables */
/* SHMEM_GETR get real    variables */
/* SHMEM_GETD get real*8  variables */
/* SHMEM_GETI get integer variables */
/* SHMEM_MYPE return number of this PE (0...npes-1) */
/* SHMEM_NPES return number of PEs */
#if defined(REAL4)
# define SHMEM_GET4 shmem_get32
# define SHMEM_GETR shmem_get32
# define SHMEM_GETD shmem_get64
# define SHMEM_GETI shmem_integer_get
#else /* REAL8 */
# define SHMEM_GET4 shmem_get32
# define SHMEM_GETR shmem_get64
# define SHMEM_GETD shmem_get64
# define SHMEM_GETI shmem_integer_get
#endif
# define SHMEM_MYPE shmem_my_pe
# define SHMEM_NPES shmem_n_pes
#endif /* SHMEM */
c
c-----------------------------------------------------------------------
c
c     auxiliary routines that involve off-processor communication.
c     message passing version, contained in module mod_xc.
c
c     author:  Alan J. Wallcraft,  NRL.
c
c-----------------------------------------------------------------------
c
      subroutine xcaget(aa, a, mnflg)
      implicit none
c
      real,    intent(out) :: aa(itdm,jtdm)
      real,    intent(in)  :: a(1-nbdy:idm+nbdy,1-nbdy:jdm+nbdy)
      integer, intent(in)  :: mnflg
c
c**********
c*
c  1) convert an entire 2-D array from tiled to non-tiled layout.
c
c  2) mnflg selects which nodes must return the array
c        = 0; all nodes
c        = n; node number n (mnproc=n)
c
c  3) parameters:
c       name       type     usage     description
c    ----------  --------  -------  ----------------------------
c    aa          real      output   non-tiled target array
c    a           real      input    tiled source array
c    mnflg       integer   input    node return flag
c*
c**********
c
#if defined(MPI)
      include 'mpif.h'
      integer mpierr
#endif
      integer i,j,mp,np,mnp
#if defined(TIMER)
c
      if (nxc.eq.0) then
        call xctmr0( 1)
        nxc = 1
      endif
#endif
#if defined(RELO)
c
      if (.not.allocated(al)) then
        allocate( al(itdm,jdm) )
        call mem_stat_add( itdm*jdm )
        allocate( at(idm*jdm) )
        call mem_stat_add( idm*jdm )
      endif
#endif
c
c     gather each row of tiles onto the first tile in the row.
c
#if defined(MPI)
      if (mproc.eq.mpe_1(nproc)) then
        do j= 1,jj
          do i= 1,i0
            al(i,j) = vland
          enddo
          do i= 1,ii
            al(i0+i,j) = a(i,j)
          enddo
          do i= i0+ii+1,itdm
            al(i,j) = vland
          enddo
        enddo
        do mp= mpe_1(nproc)+1,mpe_e(nproc)
          call MPI_RECV(at,ii_pe(mp,nproc)*jj,MTYPER,
     &                  idproc(mp,nproc), 9941,
     &                  mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr)
          do j= 1,jj
            do i= 1,ii_pe(mp,nproc)
              al(i0_pe(mp,nproc)+i,j) = at(i+(j-1)*ii_pe(mp,nproc))
            enddo
          enddo
        enddo
      else   !mproc>1
        do j= 1,jj
          do i= 1,ii
            at(i+(j-1)*ii) = a(i,j)
          enddo
        enddo
        call MPI_SEND(at,ii*jj,MTYPER,
     &                idproc(mpe_1(nproc),nproc), 9941,
     &                mpi_comm_hycom, mpierr)
      endif
#elif defined(SHMEM)
      do j= 1,jj
        do i= 1,ii
          at(i+(j-1)*ii) = a(i,j)
        enddo
      enddo
      BARRIER
      if (mproc.eq.mpe_1(nproc)) then
        do j= 1,jj
          do i= 1,i0
            al(i,j) = vland
          enddo
          do i= 1,ii
            al(i0+i,j) = a(i,j)
          enddo
          do i= i0+ii+1,itdm
            al(i,j) = vland
          enddo
        enddo
        do mp= mpe_1(nproc)+1,mpe_e(nproc)
          call SHMEM_GETR(at,
     &                    at,ii_pe(mp,nproc)*jj, idproc(mp,nproc))
          do j= 1,jj
            do i= 1,ii_pe(mp,nproc)
              al(i0_pe(mp,nproc)+i,j) = at(i+(j-1)*ii_pe(mp,nproc))
            enddo
          enddo
        enddo
      endif
      BARRIER
#endif /* MPI:SHMEM */
c
c     gather each row of tiles onto the output array.
c
#if defined(MPI)
      mnp = max(mnflg,1)
c
      if     (mnproc.eq.mnp) then
        if (mproc.eq.mpe_1(nproc)) then
          do j= 1,jj
            do i= 1,itdm
              aa(i,j+j0) = al(i,j)
            enddo
          enddo
        endif
        do np= 1,jpr
          mp = mpe_1(np)
          if (idproc(mp,np).ne.idproc(mproc,nproc)) then
            call MPI_RECV(al,itdm*jj_pe(mp,np),MTYPER,
     &                    idproc(mp,np), 9942,
     &                    mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr)
            do j= 1,jj_pe(mp,np)
              do i= 1,itdm
                aa(i,j+j0_pe(mp,np)) = al(i,j)
              enddo
            enddo
          endif
        enddo
      elseif (mproc.eq.mpe_1(nproc)) then
        call MPI_SEND(al,itdm*jj,MTYPER,
     &                idproc1(mnp), 9942,
     &                mpi_comm_hycom, mpierr)
      endif
c
      if (mnflg.eq.0) then
        call mpi_bcast(aa,itdm*jtdm,MTYPER,
     &                 idproc1(1),mpi_comm_hycom,mpierr)
      endif
#elif defined(SHMEM)
      if (mnflg.eq.0 .or. mnproc.eq.mnflg) then
        if (mproc.eq.mpe_1(nproc)) then
          do j= 1,jj
            do i= 1,itdm
              aa(i,j+j0) = al(i,j)
            enddo
          enddo
        endif
        do np= 1,jpr
          mp = mpe_1(np)
          if (idproc(mp,np).ne.idproc(mproc,nproc)) then
            call SHMEM_GETR(al,
     &                      al,itdm*jj_pe(mp,np), idproc(mp,np))
            do j= 1,jj_pe(mp,np)
              do i= 1,itdm
                aa(i,j+j0_pe(mp,np)) = al(i,j)
              enddo
            enddo
          endif
        enddo
      endif
!     no barrier needed here because of double buffering (at and then al)
#endif /* MPI:SHMEM */
#if defined(TIMER)
c
      if (nxc.eq. 1) then
        call xctmr1( 1)
        nxc = 0
      endif
#endif
      return
      end subroutine xcaget
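c
c     minimal usage sketch for xcaget above (illustrative only, not
c     part of this module; aa and a are hypothetical caller arrays,
c     shown with a leading "!"):
c
!      real aa(itdm,jtdm)
!      real a(1-nbdy:idm+nbdy,1-nbdy:jdm+nbdy)
!      call xcaget(aa, a, 1)  !aa is valid on node 1 (mnproc=1) only
!      call xcaget(aa, a, 0)  !aa is valid on every node
c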
      subroutine xcaget4(aa, a, mnflg)
      implicit none
c
      real*4,  intent(out) :: aa(itdm,*)
      real*4,  intent(in)  :: a(ii,jj)
      integer, intent(in)  :: mnflg
c
c**********
c*
c  1) convert an entire 2-D array from tiled to non-tiled layout.
c
c  2) Special version for zaiord and zaiowr only.
c     arrays are real*4 and tiled array has no halo.
c
c  3) mnflg selects which nodes must return the array
c        =-1; first node in each row returns that row
c        = n; node number n (mnproc=n)
c
c  4) The size of aa depends on mnflg
c        =-1; aa needed on 1st node in each row, dimension (itdm,jdm)
c        = n; aa needed on nth node, dimension (itdm,jtdm)
c
c  5) parameters:
c       name       type     usage     description
c    ----------  --------  -------  ----------------------------
c    aa          real      output   non-tiled target array
c    a           real      input    tiled source array
c    mnflg       integer   input    node return flag
c*
c**********
c
#if defined(MPI)
      include 'mpif.h'
      integer mpierr
#endif
c
      integer i,j,mp,np,mnp
#if defined(TIMER)
c
      if (nxc.eq.0) then
        call xctmr0( 1)
        nxc = 1
      endif
#endif
#if defined(RELO)
c
      if (.not.allocated(at4)) then
        if (mproc.eq.mpe_1(nproc)) then
          allocate( al4(itdm,jdm) )
          allocate( alt4(idm*jdm,ipr) )
          call mem_stat_add( (itdm*jdm)/2 )      !real*4, so /2
          call mem_stat_add( (idm *jdm*ipr)/2 )  !real*4, so /2
        endif !first tile in the row
        allocate( at4(idm*jdm) )
        call mem_stat_add( (idm*jdm)/2 )         !real*4, so /2
      endif
#endif
c
c     gather each row of tiles onto the first tile in the row.
c
#if defined(MPI)
      if (mproc.eq.mpe_1(nproc)) then
        do j= 1,jj
          do i= 1,i0
            al4(i,j) = vland4
          enddo
          do i= 1,ii
            al4(i0+i,j) = a(i,j)
          enddo
          do i= i0+ii+1,itdm
            al4(i,j) = vland4
          enddo
        enddo
        do mp= mpe_1(nproc)+1,mpe_e(nproc)
          call MPI_RECV(at4,ii_pe(mp,nproc)*jj,MTYPE4,
     &                  idproc(mp,nproc), 9941,
     &                  mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr)
          do j= 1,jj
            do i= 1,ii_pe(mp,nproc)
              al4(i0_pe(mp,nproc)+i,j) = at4(i+(j-1)*ii_pe(mp,nproc))
            enddo
          enddo
        enddo
      else   !mproc>1
        call MPI_SEND(a,ii*jj,MTYPE4,
     &                idproc(mpe_1(nproc),nproc), 9941,
     &                mpi_comm_hycom, mpierr)
      endif
#elif defined(SHMEM)
      do j= 1,jj
        do i= 1,ii
          at4(i+(j-1)*ii) = a(i,j)
        enddo
      enddo
      BARRIER
      if (mproc.eq.mpe_1(nproc)) then
        do j= 1,jj
          do i= 1,i0
            al4(i,j) = vland4
          enddo
          do i= 1,ii
            al4(i0+i,j) = a(i,j)
          enddo
          do i= i0+ii+1,itdm
            al4(i,j) = vland4
          enddo
        enddo
        do mp= mpe_1(nproc)+1,mpe_e(nproc)
          call SHMEM_GET4(at4,
     &                    at4,ii_pe(mp,nproc)*jj, idproc(mp,nproc))
          do j= 1,jj
            do i= 1,ii_pe(mp,nproc)
              al4(i0_pe(mp,nproc)+i,j) = at4(i+(j-1)*ii_pe(mp,nproc))
            enddo
          enddo
        enddo
      endif
      BARRIER
#endif /* MPI:SHMEM */
      if (mnflg.eq.-1) then
c
c       we are essentially done.
c
        if (mproc.eq.mpe_1(nproc)) then
          do j= 1,jj
            do i= 1,itdm
              aa(i,j) = al4(i,j)
            enddo
          enddo
        endif
      else   !mnflg.ne.-1
c
c       gather each row of tiles onto the output array.
c
#if defined(MPI)
        mnp = max(mnflg,1)
c
        if     (mnproc.eq.mnp) then
          if (mproc.eq.mpe_1(nproc)) then
            do j= 1,jj
              do i= 1,itdm
                aa(i,j+j0) = al4(i,j)
              enddo
            enddo
          endif
          do np= 1,jpr
            mp = mpe_1(np)
            if (idproc(mp,np).ne.idproc(mproc,nproc)) then
              call MPI_RECV(al4,itdm*jj_pe(mp,np),MTYPE4,
     &                      idproc(mp,np), 9942,
     &                      mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr)
              do j= 1,jj_pe(mp,np)
                do i= 1,itdm
                  aa(i,j+j0_pe(mp,np)) = al4(i,j)
                enddo
              enddo
            endif
          enddo
        elseif (mproc.eq.mpe_1(nproc)) then
          call MPI_SEND(al4,itdm*jj,MTYPE4,
     &                  idproc1(mnp), 9942,
     &                  mpi_comm_hycom, mpierr)
        endif
#elif defined(SHMEM)
        if (mnproc.eq.mnflg) then
          if (mproc.eq.mpe_1(nproc)) then
            do j= 1,jj
              do i= 1,itdm
                aa(i,j+j0) = al4(i,j)
              enddo
            enddo
          endif
          do np= 1,jpr
            mp = mpe_1(np)
            if (idproc(mp,np).ne.idproc(mproc,nproc)) then
              call SHMEM_GET4(al4,
     &                        al4,itdm*jj_pe(mp,np), idproc(mp,np))
              do j= 1,jj_pe(mp,np)
                do i= 1,itdm
                  aa(i,j+j0_pe(mp,np)) = al4(i,j)
                enddo
              enddo
            endif
          enddo
        endif
!       no barrier needed here because of double buffering (at and then al)
#endif /* MPI:SHMEM */
      endif !mnflg.eq.-1:else
#if defined(TIMER)
c
      if (nxc.eq. 1) then
        call xctmr1( 1)
        nxc = 0
      endif
#endif
      return
      end subroutine xcaget4

      subroutine xcaput(aa, a, mnflg)
      implicit none
c
      real,    intent(inout) :: aa(itdm,jtdm)
      real,    intent(out)   :: a(1-nbdy:idm+nbdy,1-nbdy:jdm+nbdy)
      integer, intent(in)    :: mnflg
c
c**********
c*
c  1) convert an entire 2-D array from non-tiled to tiled layout.
c
c  2) mnflg selects which nodes must contain the non-tiled array
c        = 0; all nodes
c        = n; node number n (mnproc=n)
c     if mnflg.ne.0 the array aa may be broadcast to all nodes,
c     so aa must exist and be overwritable on all nodes.
c
c  3) parameters:
c       name       type     usage     description
c    ----------  --------  -------  ----------------------------
c    aa          real      input    non-tiled source array
c    a           real      output   tiled target array
c    mnflg       integer   input    node source flag
c*
c**********
c
#if defined(MPI)
      include 'mpif.h'
      integer mpierr
c
      integer mpireqa(jpr),mpireqb(ipr)
#endif
c
      integer i,j,mp,np,mnp
#if defined(TIMER)
c
      if (nxc.eq.0) then
        BARRIER
        call xctmr0( 4)
        nxc = 4
      endif
#endif
#if defined(RELO)
c
      if (.not.allocated(al)) then
        allocate( al(itdm,jdm) )
        call mem_stat_add( itdm*jdm )
        allocate( at(idm*jdm) )
        call mem_stat_add( idm*jdm )
      endif
#endif
*
*     if     (mnproc.eq.1) then
*       write(lp,'(a,g20.10)') 'xcaput - vland =',vland
*     endif
c
c     use xclput for now,
c     this is slow for mnflg.ne.0, but easy to implement.
c
      if (mnflg.ne.0) then
c       "broadcast" row sections of aa to all processors in the row.
        if (mnproc.ne.mnflg) then
          aa(:,:) = vland
        endif
#if defined(MPI)
        if     (mnproc.eq.mnflg) then
          j = 0
          do np= 1,jpr
            mp = mpe_1(np)
            if (np.eq.nproc .and. mp.eq.mproc) then
              al(:,1:jj) = aa(:,j0+1:j0+jj)
            else
              j = j + 1
              call MPI_ISEND(aa(1,j0_pe(mp,np)+1),
     &                       itdm*jj_pe(mp,np),MTYPER,
     &                       idproc(mp,np), 9951,
     &                       mpi_comm_hycom, mpireqa(j), mpierr)
            endif
          enddo
          call mpi_waitall( j, mpireqa, MPI_STATUSES_IGNORE, mpierr)
        elseif (mproc.eq.mpe_1(nproc)) then
          call MPI_RECV(al,itdm*jj,MTYPER,
     &                  idproc1(mnflg), 9951,
     &                  mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr)
        endif
c
        if (mproc.eq.mpe_1(nproc)) then
          i = 0
          do mp= mpe_1(nproc)+1,mpe_e(nproc)
            i = i + 1
            call MPI_ISEND(al,itdm*jj,MTYPER,
     &                     idproc(mp,nproc), 9952,
     &                     mpi_comm_hycom, mpireqb(i), mpierr)
          enddo
          call mpi_waitall( i, mpireqb, MPI_STATUSES_IGNORE, mpierr)
        else
          call MPI_RECV(al,itdm*jj,MTYPER,
     &                  idproc(mpe_1(nproc),nproc), 9952,
     &                  mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr)
        endif
c
        aa(:,j0+1:j0+jj) = al(:,1:jj)
#elif defined(SHMEM)
c       assume aa is in common.
        BARRIER
        if (mnproc.ne.mnflg) then
          do j= 1,jj
            call SHMEM_GETR(aa(1,j0+j),
     &                      aa(1,j0+j),itdm,idproc1(mnflg))
          enddo
        endif
        BARRIER
#endif
      endif
      do j= 1,jtdm
        call xclput(aa(1,j),itdm, a, 1,j,1,0)
      enddo
#if defined(TIMER)
c
      if (nxc.eq. 4) then
        call xctmr1( 4)
        nxc = 0
      endif
#endif
      return
      end subroutine xcaput
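c
c     minimal usage sketch for xcaput above (illustrative only; aa and
c     a are hypothetical caller arrays, shown with a leading "!"):
c
!      call xcaput(aa, a, 1)  !scatter aa, held on node 1, to all tiles
!      call xcaput(aa, a, 0)  !aa already identical on all nodes
c
c     note that for mnflg.ne.0 the copy of aa on the other nodes is
c     overwritten, as documented above.
c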
      subroutine xcaput4(aa, a, mnflg)
      implicit none
c
      real*4,  intent(in)  :: aa(itdm,*)
      real*4,  intent(out) :: a(ii,jj)
      integer, intent(in)  :: mnflg
c
c**********
c*
c  1) convert an entire 2-D array from non-tiled to tiled layout.
c
c  2) Special version for zaiord and zaiowr only.
c     arrays are real*4 and tiled array has no halo.
c
c  3) mnflg selects which nodes must contain the non-tiled array
c        =-1; first node in each row contains that row
c        = n; node number n (mnproc=n)
c     the array aa is unchanged on exit, and need not exist
c     on the nodes that do not reference it.
c
c  4) parameters:
c       name       type     usage     description
c    ----------  --------  -------  ----------------------------
c    aa          real      input    non-tiled source array
c    a           real      output   tiled target array
c    mnflg       integer   input    node source flag
c*
c**********
c
#if defined(MPI)
      include 'mpif.h'
      integer mpierr
c
      integer mpireqa(jpr),mpireqb(ipr)
#endif
c
      integer i,msg,j,mp,np,mnp
#if defined(TIMER)
c
      if (nxc.eq.0) then
        BARRIER
        call xctmr0( 4)
        nxc = 4
      endif
#endif
#if defined(RELO)
c
      if (.not.allocated(at4)) then
        if (mproc.eq.mpe_1(nproc)) then
          allocate( al4(itdm,jdm) )
          allocate( alt4(idm *jdm,ipr) )
          call mem_stat_add( (itdm*jdm)/2 )      !real*4, so /2
          call mem_stat_add( (idm *jdm*ipr)/2 )  !real*4, so /2
        endif !first tile in the row
        allocate( at4(idm*jdm) )
        call mem_stat_add( (idm*jdm)/2 )         !real*4, so /2
      endif
#endif
*
*     if     (mnproc.eq.1) then
*       write(lp,'(a,g20.10)') 'xcaput - vland =',vland
*     endif
c
      if     (mnflg.gt.0) then
#if defined(MPI)
c       "broadcast" row sections of aa to first processor in each row.
        if     (mnproc.eq.mnflg) then
          j = 0
          do np= 1,jpr
            mp = mpe_1(np)
            if (np.eq.nproc .and. mp.eq.mproc) then
              al4(:,1:jj) = aa(:,j0+1:j0+jj)
            else
              j = j + 1
              call MPI_ISEND(aa(1,j0_pe(mp,np)+1),
     &                       itdm*jj_pe(mp,np),MTYPE4,
     &                       idproc(mp,np), 9951,
     &                       mpi_comm_hycom, mpireqa(j), mpierr)
            endif
          enddo
          call mpi_waitall( j, mpireqa, MPI_STATUSES_IGNORE, mpierr)
        elseif (mproc.eq.mpe_1(nproc)) then
          call MPI_RECV(al4,itdm*jj,MTYPE4,
     &                  idproc1(mnflg), 9951,
     &                  mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr)
        endif
c
c       "broadcast" partial row sections to the rest of the row's processors
        if (mproc.eq.mpe_1(nproc)) then
          msg = 0
          do mp= mpe_1(nproc)+1,mpe_e(nproc)
            do j= 1,jj
              do i= 1,ii_pe(mp,nproc)
                alt4(i+(j-1)*ii_pe(mp,nproc),mp) =
     &            al4(i0_pe(mp,nproc)+i,j)
              enddo
            enddo
            msg = msg + 1
            call MPI_ISEND(alt4(1,mp),ii_pe(mp,nproc)*jj,MTYPE4,
     &                     idproc(mp,nproc), 9952,
     &                     mpi_comm_hycom, mpireqb(msg), mpierr)
          enddo !mp
c
          do j= 1,jj
            do i= 1,ii
              a(i,j) = al4(i0+i,j)
            enddo
          enddo
c
          call mpi_waitall( msg, mpireqb, MPI_STATUSES_IGNORE, mpierr)
        else
          call MPI_RECV(at4,ii*jj,MTYPE4,
     &                  idproc(mpe_1(nproc),nproc), 9952,
     &                  mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr)
          do j= 1,jj
            do i= 1,ii
              a(i,j) = at4(i+(j-1)*ii)
            enddo
          enddo
        endif
#elif defined(SHMEM)
c       assume aa is in common.
        BARRIER
c       "broadcast" row sections of aa to first processor in each row.
        if (mproc.eq.mpe_1(nproc)) then
          call SHMEM_GET4(al4(1, 1),
     &                    aa(1,j0+1),itdm*jj,idproc1(mnflg))
          do mp= mpe_1(nproc)+1,mpe_e(nproc)
            do j= 1,jj
              do i= 1,ii
                alt4(i+(j-1)*ii,mp) = al4(i0+i,j)
              enddo
            enddo
          enddo !mp
        endif
        BARRIER
c       "broadcast" partial row sections to the rest of the row's processors
        if (mproc.eq.mpe_1(nproc)) then
          do j= 1,jj
            do i= 1,ii
              a(i,j) = al4(i0+i,j)
            enddo
          enddo
        else
          call SHMEM_GET4( at4(1, 1),
     &                    alt4(1,mproc),ii*jj,idproc1(mpe_1(nproc)))
          do j= 1,jj
            do i= 1,ii
              a(i,j) = at4(i+(j-1)*ii)
            enddo
          enddo
        endif
        BARRIER
#endif
      elseif (mnflg.eq.-1) then
#if defined(MPI)
c       "broadcast" partial row sections of aa to all processors in the row.
        if (mproc.eq.mpe_1(nproc)) then
          msg = 0
          do mp= mpe_1(nproc)+1,mpe_e(nproc)
            do j= 1,jj
              do i= 1,ii_pe(mp,nproc)
                alt4(i+(j-1)*ii_pe(mp,nproc),mp) =
     &            aa(i0_pe(mp,nproc)+i,j)
              enddo
            enddo
            msg = msg + 1
            call MPI_ISEND(alt4(1,mp),ii_pe(mp,nproc)*jj,MTYPE4,
     &                     idproc(mp,nproc), 9953,
     &                     mpi_comm_hycom, mpireqb(msg), mpierr)
          enddo !mp
c
          do j= 1,jj
            do i= 1,ii
              a(i,j) = aa(i0+i,j)
            enddo
          enddo
c
          call mpi_waitall( msg, mpireqb, MPI_STATUSES_IGNORE, mpierr)
        else
          call MPI_RECV(at4,ii*jj,MTYPE4,
     &                  idproc(mpe_1(nproc),nproc), 9953,
     &                  mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr)
          do j= 1,jj
            do i= 1,ii
              a(i,j) = at4(i+(j-1)*ii)
            enddo
          enddo
        endif
#elif defined(SHMEM)
c       "broadcast" partial row sections to the rest of the row's processors
        if (mproc.eq.mpe_1(nproc)) then
          do mp= mpe_1(nproc)+1,mpe_e(nproc)
            do j= 1,jj
              do i= 1,ii_pe(mp,nproc)
                alt4(i+(j-1)*ii_pe(mp,nproc),mp) =
     &            aa(i0_pe(mp,nproc)+i,j)
              enddo
            enddo
          enddo !mp
        endif
        BARRIER
        if (mproc.eq.mpe_1(nproc)) then
          do j= 1,jj
            do i= 1,ii
              a(i,j) = aa(i0+i,j)
            enddo
          enddo
        else
          call SHMEM_GET4( at4(1, 1),
     &                    alt4(1,mproc),ii*jj,idproc1(mpe_1(nproc)))
          do j= 1,jj
            do i= 1,ii
              a(i,j) = at4(i+(j-1)*ii)
            enddo
          enddo
        endif
        BARRIER
#endif
      endif !mnflg.gt.0:mnflg.eq.-1
#if defined(TIMER)
c
      if (nxc.eq. 4) then
        call xctmr1( 4)
        nxc = 0
      endif
#endif
      return
      end subroutine xcaput4

      subroutine xcastr(a, mnflg)
      implicit none
c
      real,    intent(inout) :: a(:)
      integer, intent(in)    :: mnflg
c
c**********
c*
c  1) broadcast array a to all tiles.
c
c  2) parameters:
c       name       type     usage     description
c    ----------  --------  -------  ----------------------------
c    a           real      in/out   target array
c    mnflg       integer   input    node originator flag
c*
c**********
c
#if defined(MPI)
      include 'mpif.h'
      integer mpierr
#endif
c
      integer i,is0,isl,mn,n,nn
#if defined(TIMER)
c
      if (nxc.eq.0) then
        call xctmr0( 9)
        nxc = 9
      endif
#endif
c
c     stripmine a.
c
      n = size(a)
c
      do is0= 0,n-1,nmax
        isl = min(is0+nmax,n)
        nn  = isl - is0
        if (mnproc.eq.mnflg) then
          do i= 1,nn
            b(i) = a(is0+i)
          enddo
        endif
#if defined(MPI)
        call mpi_bcast(b,nn,MTYPER,
     &                 idproc1(mnflg),
     &                 mpi_comm_hycom,mpierr)
#elif defined(SHMEM)
        BARRIER
        if (mnproc.ne.mnflg) then
c         get from source processor
          call SHMEM_GETR(b,b,nn, idproc1(mnflg))
        endif
        BARRIER
#endif
        if (mnproc.ne.mnflg) then
          do i= 1,nn
            a(is0+i) = b(i)
          enddo
        endif
      enddo  ! stripmine loop
#if defined(TIMER)
c
      if (nxc.eq. 9) then
        call xctmr1( 9)
        nxc = 0
      endif
#endif
      return
      end subroutine xcastr
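c
c     minimal usage sketch for xcastr above (illustrative only; the
c     buffer and the three scalars are hypothetical, shown with a
c     leading "!"):
c
!      real abuf(3)
!      if (mnproc.eq.1) abuf(1:3) = (/ day1,dayf,dayi /)
!      call xcastr(abuf, 1)  !abuf now identical on every node
c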
      subroutine xceget(aelem, a, ia,ja, mnflg)
      implicit none
c
      real,    intent(out) :: aelem
      real,    intent(in)  :: a(1-nbdy:idm+nbdy,1-nbdy:jdm+nbdy)
      integer, intent(in)  :: ia,ja
      integer, intent(in), optional :: mnflg
c
c**********
c*
c  1) find the value of a(ia,ja) on the non-tiled 2-D grid.
c
c  2) mnflg selects which nodes must return the element
c        = 0; all nodes (default)
c        = n; node number n (mnproc=n)
c     note that aelem might, or might not, be returned on other nodes
c
c  3) parameters:
c       name       type     usage     description
c    ----------  --------  -------  ----------------------------
c    aelem       real      output   required element
c    a           real      input    tiled source array
c    ia          integer   input    1st non-tiled index into a
c    ja          integer   input    2nd non-tiled index into a
c    mnflg       integer   input    node return flag
c
c  4) the global variable vland is returned when outside active tiles.
c*
c**********
c
#if defined(MPI)
      include 'mpif.h'
      integer mpierr
#endif
c
c     double buffer to reduce the number of required barriers.
c
      real, save, dimension(0:1) :: elem
      integer, save :: kdb = 0
c
      integer i,j,lmnflg,mp,np
#if defined(TIMER)
c
*     if (nxc.eq.0) then
*       call xctmr0( 2)
*       nxc = 2
*     endif
#endif
c
      if (present(mnflg)) then
        lmnflg = mnflg
      else
        lmnflg = 0  ! default
      endif
c
      kdb = mod(kdb+1,2)  ! the least recently used of the two buffers
c
c     find the host tile.
c
      np = npe_j(max(0,min(jtdm+1,ja))   )
      mp = mpe_i(max(0,min(itdm+1,ia)),np)
c
      if     (mp.le.0) then
c
c       no tile.
c
        elem(kdb) = vland
      elseif (mp.eq.mproc .and. np.eq.nproc) then
c
c       this tile.
c
        i = ia - i0
        j = ja - j0
c
        elem(kdb) = a(i,j)
#if defined(MPI)
        if     (lmnflg.eq.0) then
          call mpi_bcast(elem(kdb),1,MTYPER,
     &                   idproc(mp,np),mpi_comm_hycom,mpierr)
        elseif (lmnflg.ne.mnproc) then
          call MPI_SEND(elem(kdb),1,MTYPER,
     &                  idproc1(lmnflg), 9932,
     &                  mpi_comm_hycom, mpierr)
        endif !lmnflg
#elif defined(SHMEM)
        BARRIER
#endif
      else
c
c       another tile.
c
#if defined(MPI)
        if     (lmnflg.eq.0) then
          call mpi_bcast(elem(kdb),1,MTYPER,
     &                   idproc(mp,np),mpi_comm_hycom,mpierr)
        elseif (lmnflg.eq.mnproc) then
          call MPI_RECV( elem(kdb),1,MTYPER,
     &                   idproc(mp,np), 9932,
     &                   mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr)
        endif !lmnflg
#elif defined(SHMEM)
        BARRIER
        call SHMEM_GETR(elem(kdb),
     &                  elem(kdb),1,idproc(mp,np))
!       no barrier needed here because of double buffering
#endif
      endif
      aelem = elem(kdb)
#if defined(TIMER)
c
*     if (nxc.eq. 2) then
*       call xctmr1( 2)
*       nxc = 0
*     endif
#endif
      return
      end subroutine xceget

      subroutine xceput(aelem, a, ia,ja, mnflg)
      implicit none
c
      integer, intent(in), optional :: mnflg
      integer, intent(in)    :: ia,ja
      real,    intent(inout) :: aelem
      real,    intent(inout) :: a(1-nbdy:idm+nbdy,1-nbdy:jdm+nbdy)
c
c**********
c*
c  1) fill a single element in the non-tiled 2-D grid.
c
c  2) mnflg selects which nodes hold the element on entry
c        = 0; all nodes (default)
c        = n; node number n (mnproc=n)
c     all nodes hold the element on exit
c
c  3) parameters:
c       name       type     usage     description
c    ----------  --------  -------  ----------------------------
c    aelem       real      in/out   element value
c    a           real      in/out   tiled target array
c    ia          integer   input    1st non-tiled index into a
c    ja          integer   input    2nd non-tiled index into a
c    mnflg       integer   input    node source flag
c*
c**********
c
#if defined(MPI)
      include 'mpif.h'
      integer mpierr
#endif
c
      real, save :: a1(1)
#if defined(TIMER)
c
*     if (nxc.eq.0) then
*       call xctmr0( 5)
*       nxc = 5
*     endif
#endif
c
      if (present(mnflg)) then
        if (mnflg.ne.0) then
          if (mnproc.eq.mnflg) then
            a1(1) = aelem
            call xcastr(a1, mnflg)  !broadcast the input element
          else
            call xcastr(a1, mnflg)  !broadcast the input element
            aelem = a1(1)
          endif !mnproc
        endif !mnflg>0
      endif !present
c
      if (i0.lt.ia .and. ia.le.i0+ii .and.
     &    j0.lt.ja .and. ja.le.j0+jj ) then
c
c       this tile.
c
        a(ia-i0,ja-j0) = aelem
      endif
#if defined(TIMER)
c
*     if (nxc.eq. 5) then
*       call xctmr1( 5)
*       nxc = 0
*     endif
#endif
      return
      end subroutine xceput

      subroutine xcgetc(iline)
      implicit none
c
      integer, intent(inout) :: iline(81)
c
c**********
c*
c  1) machine specific routine for broadcasting iline.
c
c  2) only used in zagetc (hence the name).
c*
c**********
c
#if defined(MPI)
      include 'mpif.h'
      integer mpierr
#endif
c
c     broadcast to all processors
c
#if defined(MPI)
      call mpi_bcast(iline,81,MTYPEI,
     &               idproc1(1),mpi_comm_hycom,mpierr)
#elif defined(SHMEM)
      BARRIER
      if (mnproc.ne.1) then
        call SHMEM_GETI(iline,
     &                  iline,81, idproc1(1))
      endif
!     no barrier needed here because zagetc is using two buffers
#endif
      return
      end subroutine xcgetc
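c
c     minimal usage sketch for the single-element accessors xceget and
c     xceput above (illustrative only; depths, itest and jtest are
c     hypothetical, shown with a leading "!"):
c
!      real dij
!      call xceget(dij, depths, itest,jtest)  !dij on all nodes (mnflg=0)
!      dij = dij + 1.0
!      call xceput(dij, depths, itest,jtest)  !update the owning tile only
c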
      subroutine xchalt(cerror)
      implicit none
c
      character*(*), intent(in) :: cerror
c
c**********
c*
c  1) stop all processes.
c
c  2) only one process need call this routine, i.e. it is for
c     emergency stops.  use 'xcstop' for ordinary stops called
c     by all processes.
c
c  3) parameters:
c       name       type       usage     description
c    ----------  ---------   -------  ----------------------------
c    cerror      char*(*)    input    error message
c*
c**********
c
#if defined(MPI)
      include 'mpif.h'
      integer mpierr
#endif
c
c     message passing version.
c
      if (cerror.ne.' ') then
        write(lp,*) '**************************************************'
        write(lp,*) cerror
      endif
      write(lp,*) '**************************************************'
      write(lp,*) 'XCHALT CALLED ON PROC = ',mnproc,mproc,nproc
      write(lp,*) '**************************************************'
      call flush(lp)
c
#if defined(MPI)
      call mpi_abort(mpi_comm_hycom,9)
#else
      call abort()
#endif
      stop '(xchalt)'
      end subroutine xchalt

      subroutine xciget(alist,nl, a, ia,ja, mnflg)
      implicit none
c
      integer, intent(in)  :: nl
      real,    intent(out) :: alist(nl)
      real,    intent(in)  :: a(1-nbdy:idm+nbdy,1-nbdy:jdm+nbdy)
      integer, intent(in)  :: ia(nl),ja(nl)
      integer, intent(in), optional :: mnflg
c
c**********
c*
c  1) find the value of a(ia(:),ja(:)) on the non-tiled 2-D grid.
c
c  2) mnflg selects which nodes must return the list
c        = 0; all nodes (default)
c        = n; node number n (mnproc=n)
c     note that alist might, or might not, be returned on other nodes
c
c  3) parameters:
c       name       type     usage     description
c    ----------  --------  -------  ----------------------------
c    alist       real      output   required elements
c    nl          integer   input    dimension of alist
c    a           real      input    tiled source array
c    ia          integer   input    1st non-tiled indexes into a
c    ja          integer   input    2nd non-tiled indexes into a
c    mnflg       integer   input    node return flag
c
c  4) the global variable vland is returned when outside active tiles.
c*
c**********
c
      integer i,lmnflg,nl_sm
c
      if (present(mnflg)) then
        lmnflg = mnflg
      else
        lmnflg = 0  ! default
      endif
c
c     stripmine alist in itdm+jtdm chunks
c
      do i= 1,nl,itdm+jtdm
        nl_sm = min( itdm+jtdm, nl+1-i )
        call xciget_sm(alist(i),nl_sm, a, ia(i),ja(i), lmnflg)
      enddo !i
      return
      end subroutine xciget
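c
c     minimal usage sketch for xciget above (illustrative only; the
c     point list is hypothetical, shown with a leading "!"):
c
!      integer ipt(3),jpt(3)
!      real    vals(3)
!      ipt = (/ 10, 20, 30 /)
!      jpt = (/  5,  5,  5 /)
!      call xciget(vals,3, a, ipt,jpt)  !vals on all nodes (default mnflg=0)
c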
c #if defined(RELO) real, save, allocatable, dimension(:,:) :: & als,alr #else real, save, dimension(-47:itdm+jtdm+48,0:1) :: & als,alr #endif integer, save, dimension(0:ijqr-1,0:1) :: nlp,nxp c integer, save :: kdb = 0 integer :: i,np,mp,mnp,nxpa #if defined(RELO) c if (.not.allocated(als)) then allocate( als(-47:itdm+jtdm+48,0:1), & alr(-47:itdm+jtdm+48,0:1) ) call mem_stat_add( 2*(itdm+jtdm+96)*2 ) endif #endif c kdb = mod(kdb+1,2) ! the least recently used of the two buffers c c --- calculate the gatherv sizes and offsets on all tiles c nlp(0:ijqr-1,kdb) = 0 do i= 1,nl np = npe_j(max(0,min(jtdm+1,ja(i))) ) mp = mpe_i(max(0,min(itdm+1,ia(i))),np) if (mp.gt.0) then mnp = idproc(mp,np) nlp(mnp,kdb) = nlp(mnp,kdb) + 1 if (mp.eq.mproc .and. np.eq.nproc) then c --- update the local send buffer als(nlp(mnp,kdb),kdb) = a(ia(i)-i0,ja(i)-j0) endif endif !mp enddo !i c nxp(0,kdb) = 0 do i= 1,ijqr-1 nxp(i,kdb) = nxp(i-1,kdb) + nlp(i-1,kdb) enddo nxpa = nxp(ijqr-1,kdb) + nlp(ijqr-1,kdb) c #if defined(MPI) if (mnflg.eq.0) then c --- gatherv onto the 1st mpi task, then broadcast the result mnp = idproc(mproc,nproc) call mpi_gatherv(als(1,kdb),nlp(mnp,kdb),MTYPER, & alr(1,kdb),nlp(0,kdb),nxp(0,kdb),MTYPER, & idproc1(1), & mpi_comm_hycom, mpierr) call mpi_bcast( alr(1,kdb),nxpa,MTYPER, & idproc1(1), & mpi_comm_hycom, mpierr) else mnp = idproc(mproc,nproc) call mpi_gatherv(als(1,kdb),nlp(mnp,kdb),MTYPER, & alr(1,kdb),nlp(0,kdb),nxp(0,kdb),MTYPER, & idproc1(mnflg), & mpi_comm_hycom, mpierr) endif !mnflg #elif defined(SHMEM) call xcstop('xciget: shmem not implemented') stop '(xciget)' #endif c if (mnflg.eq.0 .or. mnflg.eq.mnproc) then c --- repeat the gatherv size logic to unwind the recv buffer nlp(0:ijqr-1,kdb) = 0 do i= 1,nl np = npe_j(max(0,min(jtdm+1,ja(i))) ) mp = mpe_i(max(0,min(itdm+1,ia(i))),np) if (mp.le.0) then alist(i) = vland else mnp = idproc(mp,np) nlp(mnp,kdb) = nlp(mnp,kdb) + 1 alist(i) = alr(nlp(mnp,kdb)+nxp(mnp,kdb),kdb) endif !mp enddo !i endif !mnflg return end subroutine xciget_sm subroutine xciput(alist,nl, a, ia,ja, mnflg) implicit none c integer, intent(in) :: nl integer, intent(in), optional :: mnflg integer, intent(in) :: ia(nl),ja(nl) real, intent(inout) :: alist(nl) real, intent(inout) :: a(1-nbdy:idm+nbdy,1-nbdy:jdm+nbdy) c c********** c* c 1) fill a list of elements in the non-tiled 2-D grid. c also updates the halo. c c 2) mnflg selects which nodes hold the elements on entry c = 0; all nodes (default) c = n; node number n (mnproc=n) c all nodes hold the elements on exit c c 3) parameters: c name type usage description c ---------- ---------- ------- ---------------------------- c alist real in/out element values c nl integer input dimension of alist c a real in/out tiled target array c ia integer input 1st non-tiled indexes into a c ja integer input 2nd non-tiled indexes into a c mnflg integer input node source flag c* c********** c integer i,iai,jai #if defined(TIMER) c * if (nxc.eq.0) then * call xctmr0( 5) * nxc = 5 * endif #endif c if (present(mnflg)) then if (mnflg.ne.0) then call xcastr(alist, mnflg) !broadcast the input list endif endif c do i= 1,nl iai = ia(i)-i0 jai = ja(i)-j0 if (iai.ge.1-nbdy .and. iai.le.ii+nbdy .and. & jai.ge.1-nbdy .and. jai.le.jj+nbdy ) then c c on this tile with halo. c a(iai,jai) = alist(i) endif !on tile c if (nreg.ne.0 .and. ia(i).ge.itdm-nbdy+1) then ! periodic iai = ia(i)-itdm-i0 if (iai.ge.1-nbdy .and. iai.le.ii+nbdy .and. & jai.ge.1-nbdy .and. 
      subroutine xciput(alist,nl, a, ia,ja, mnflg)
      implicit none
c
      integer, intent(in)           :: nl
      integer, intent(in), optional :: mnflg
      integer, intent(in)           :: ia(nl),ja(nl)
      real,    intent(inout)        :: alist(nl)
      real,    intent(inout)        :: a(1-nbdy:idm+nbdy,1-nbdy:jdm+nbdy)
c
c**********
c*
c  1) fill a list of elements in the non-tiled 2-D grid.
c     also updates the halo.
c
c  2) mnflg selects which nodes hold the elements on entry
c        = 0; all nodes (default)
c        = n; node number n (mnproc=n)
c     all nodes hold the elements on exit
c
c  3) parameters:
c       name       type     usage     description
c    ----------  --------  -------  ----------------------------
c    alist       real      in/out   element values
c    nl          integer   input    dimension of alist
c    a           real      in/out   tiled target array
c    ia          integer   input    1st non-tiled indexes into a
c    ja          integer   input    2nd non-tiled indexes into a
c    mnflg       integer   input    node source flag
c*
c**********
c
      integer i,iai,jai
#if defined(TIMER)
c
*     if (nxc.eq.0) then
*       call xctmr0( 5)
*       nxc = 5
*     endif
#endif
c
      if (present(mnflg)) then
        if (mnflg.ne.0) then
          call xcastr(alist, mnflg)  !broadcast the input list
        endif
      endif
c
      do i= 1,nl
        iai = ia(i)-i0
        jai = ja(i)-j0
        if (iai.ge.1-nbdy .and. iai.le.ii+nbdy .and.
     &      jai.ge.1-nbdy .and. jai.le.jj+nbdy ) then
c
c         on this tile with halo.
c
          a(iai,jai) = alist(i)
        endif !on tile
c
        if (nreg.ne.0 .and. ia(i).ge.itdm-nbdy+1) then  ! periodic
          iai = ia(i)-itdm-i0
          if (iai.ge.1-nbdy .and. iai.le.ii+nbdy .and.
     &        jai.ge.1-nbdy .and. jai.le.jj+nbdy ) then
            a(iai,jai) = alist(i)
          endif !on tile
        endif !periodic, large ia(i)
        if (nreg.ne.0 .and. ia(i).le.nbdy) then  ! periodic
          iai = ia(i)+itdm-i0
          if (iai.ge.1-nbdy .and. iai.le.ii+nbdy .and.
     &        jai.ge.1-nbdy .and. jai.le.jj+nbdy ) then
            a(iai,jai) = alist(i)
          endif !on tile
        endif !periodic, small ia(i)
      enddo !i
#if defined(TIMER)
c
*     if (nxc.eq. 5) then
*       call xctmr1( 5)
*       nxc = 0
*     endif
#endif
      return
      end subroutine xciput
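c
c     minimal usage sketch for xciput above (illustrative only; reuses
c     the hypothetical point list from the xciget sketch, shown with a
c     leading "!"):
c
!      if (mnproc.eq.1) vals(1:3) = 0.0    !values held on node 1
!      call xciput(vals,3, a, ipt,jpt, 1)  !broadcast, then fill a
c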
      subroutine xclget(aline,nl, a, i1,j1,iinc,jinc, mnflg)
      implicit none
c
      integer, intent(in)  :: nl,i1,j1,iinc,jinc,mnflg
      real,    intent(out) :: aline(nl)
      real,    intent(in)  :: a(1-nbdy:idm+nbdy,1-nbdy:jdm+nbdy)
c
c**********
c*
c  1) extract a line of elements from the non-tiled 2-D grid.
c
c  2) aline(i) == aa(i1+iinc*(i-1),j1+jinc*(i-1)), for i=1...nl.
c     where aa is the non-tiled representation of a, and
c     iinc and jinc can each be -1, 0, or +1.
c
c  3) mnflg selects which nodes must return the line
c        = 0; all nodes
c        = n; node number n (mnproc=n)
c
c  4) parameters:
c       name       type     usage     description
c    ----------  --------  -------  ----------------------------
c    aline       real      output   required line of elements
c    nl          integer   input    dimension of aline
c    a           real      input    tiled source array
c    i1          integer   input    1st non-tiled index into a
c    j1          integer   input    2nd non-tiled index into a
c    iinc        integer   input    1st index increment
c    jinc        integer   input    2nd index increment
c    mnflg       integer   input    node return flag
c
c  5) the global variable vland is returned when outside active tiles.
c*
c**********
c
#if defined(MPI)
      include 'mpif.h'
      integer mpierr
#endif
c
c     pad al to guard against false sharing.
c     double buffer to reduce the number of required barriers.
c
#if defined(RELO)
      real, save, allocatable, dimension (:,:) :: al
#else
      real, save, dimension(-47:itdm+jtdm+48,0:1) :: al
#endif
c
      integer, save :: kdb = 0
      integer, allocatable :: ia(:),ja(:)
c
      integer i1n,iif,iil,j1n,jjf,jjl,l,lx0,nxl,mp,np
#if defined(TIMER)
c
      if (nxc.eq.0) then
        call xctmr0( 3)
        nxc = 3
      endif
#endif
#if defined(RELO)
c
      if (.not.allocated(al)) then
        allocate( al(-47:itdm+jtdm+48,0:1) )
        call mem_stat_add( (itdm+jtdm+96)*2 )
      endif
#endif
c
      kdb = mod(kdb+1,2)  ! the least recently used of the two buffers
c
      if     ((jinc.eq.0 .and. iinc.eq.1) .or. nl.eq.1) then
c
c ---   horizontal forward line.
c
        al(1:nl,kdb) = vland
c
        np = npe_j(j1)
        do mp= mpe_1(np),mpe_e(np)
          iif = max(i1,     i0_pe(mp,np)+1)
          iil = min(i1+nl-1,i0_pe(mp,np)+ii_pe(mp,np))
          lx0 = iif - i1
          nxl = iil - iif + 1
          if (nxl.le.0) then
            cycle  ! no elements from tile (mp,np)
          endif
c
          if (mp.eq.mproc .and. np.eq.nproc) then
c
c           this tile.
c
            do l= lx0+1,lx0+nxl
              al(l,kdb) = a(i1+l-1-i0,j1-j0)
            enddo
#if defined(MPI)
            if     (mnflg.eq.0) then
              call mpi_bcast(al(lx0+1,kdb),nxl,MTYPER,
     &                       idproc(mp,np),mpi_comm_hycom,mpierr)
            elseif (mnflg.ne.mnproc) then
              call MPI_SEND(al(lx0+1,kdb),nxl,MTYPER,
     &                      idproc1(mnflg), 9931,
     &                      mpi_comm_hycom, mpierr)
            endif
          else
c
c           another tile.
c
            if     (mnflg.eq.0) then
              call mpi_bcast(al(lx0+1,kdb),nxl,MTYPER,
     &                       idproc(mp,np),mpi_comm_hycom,mpierr)
            elseif (mnflg.eq.mnproc) then
              call MPI_RECV(al(lx0+1,kdb),nxl,MTYPER,
     &                      idproc(mp,np), 9931,
     &                      mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr)
            endif
#endif /* MPI */
          endif
c
          if (lx0+nxl.eq.nl) then
            exit
          endif
        enddo !mp
#if defined(SHMEM)
c
c       splitting process into two phases saves on barriers.
c
        BARRIER
        do mp= mpe_1(np),mpe_e(np)
          iif = max(i1,     i0_pe(mp,np)+1)
          iil = min(i1+nl-1,i0_pe(mp,np)+ii_pe(mp,np))
          lx0 = iif - i1
          nxl = iil - iif + 1
          if (nxl.le.0) then
            cycle  ! no elements from tile (mp,np)
          endif
c
          if (mp.eq.mproc .and. np.eq.nproc) then
c
c           nothing to do here (see 1st phase, above).
c
          else
c
c           another tile.
c
            if (mnflg.eq.0 .or. mnflg.eq.mnproc) then
              call SHMEM_GETR(al(lx0+1,kdb),
     &                        al(lx0+1,kdb),nxl,idproc(mp,np))
            endif
          endif
c
          if (lx0+nxl.eq.nl) then
            exit
          endif
        enddo !mp
!       no barrier needed here because of double buffering
#endif /* SHMEM */
c
        if (mnflg.eq.0 .or. mnflg.eq.mnproc) then
          aline(1:nl) = al(1:nl,kdb)
        endif
      elseif (iinc.eq.0 .and. jinc.eq.1) then
c
c ---   vertical forward line.
c
        al(1:nl,kdb) = vland
c
        do np= 1,jpr
          mp = mpe_i(i1,np)
          if (mp.le.0) then
            cycle  ! an all-land column
          endif
          jjf = max(j1,     j0_pe(mp,np)+1)
          jjl = min(j1+nl-1,j0_pe(mp,np)+jj_pe(mp,np))
          lx0 = jjf - j1
          nxl = jjl - jjf + 1
          if (nxl.le.0) then
            cycle  ! no elements from tile (mp,np)
          endif
c
          if (mp.eq.mproc .and. np.eq.nproc) then
c
c           this tile.
c
            do l= lx0+1,lx0+nxl
              al(l,kdb) = a(i1-i0,j1+l-1-j0)
            enddo
#if defined(MPI)
            if     (mnflg.eq.0) then
              call mpi_bcast(al(lx0+1,kdb),nxl,MTYPER,
     &                       idproc(mp,np),mpi_comm_hycom,mpierr)
            elseif (mnflg.ne.mnproc) then
              call MPI_SEND(al(lx0+1,kdb),nxl,MTYPER,
     &                      idproc1(mnflg), 9931,
     &                      mpi_comm_hycom, mpierr)
            endif
          else
c
c           another tile.
c
            if     (mnflg.eq.0) then
              call mpi_bcast(al(lx0+1,kdb),nxl,MTYPER,
     &                       idproc(mp,np),mpi_comm_hycom,mpierr)
            elseif (mnflg.eq.mnproc) then
              call MPI_RECV(al(lx0+1,kdb),nxl,MTYPER,
     &                      idproc(mp,np), 9931,
     &                      mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr)
            endif
#endif /* MPI */
          endif
c
          if (lx0+nxl.eq.nl) then
            exit
          endif
        enddo ! np=1,jpr
#if defined(SHMEM)
c
c       splitting process into two phases saves on barriers.
c
        BARRIER
        do np= 1,jpr
          mp = mpe_i(i1,np)
          if (mp.le.0) then
            cycle  ! an all-land column
          endif
          jjf = max(j1,     j0_pe(mp,np)+1)
          jjl = min(j1+nl-1,j0_pe(mp,np)+jj_pe(mp,np))
          lx0 = jjf - j1
          nxl = jjl - jjf + 1
          if (nxl.le.0) then
            cycle  ! no elements from tile (mp,np)
          endif
c
          if (mp.eq.mproc .and. np.eq.nproc) then
c
c           nothing to do here (see 1st phase, above).
c
          else
c
c           another tile.
c
            if (mnflg.eq.0 .or. mnflg.eq.mnproc) then
              call SHMEM_GETR(al(lx0+1,kdb),
     &                        al(lx0+1,kdb),nxl,idproc(mp,np))
            endif
          endif
c
          if (lx0+nxl.eq.nl) then
            exit
          endif
        enddo ! np=1,jpr
!       no barrier needed here because of double buffering
#endif /* SHMEM */
c
        if (mnflg.eq.0 .or. mnflg.eq.mnproc) then
          aline(1:nl) = al(1:nl,kdb)
        endif
      elseif (nl.eq.1) then
c
c       diagonal and reversing lines, single element
c
        call xceget(aline(1),a,i1,j1, mnflg)
      else
c
c       diagonal and reversing lines - call xciget.
c
        allocate(ia(nl),ja(nl))
        do l= 1,nl
          ia(l) = i1+iinc*(l-1)
          ja(l) = j1+jinc*(l-1)
        enddo !l
        call xciget(aline,nl, a, ia,ja, mnflg)
        deallocate(ia,ja)
      endif
#if defined(TIMER)
c
      if (nxc.eq. 3) then
        call xctmr1( 3)
        nxc = 0
      endif
#endif
      return
      end subroutine xclget
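c
c     minimal usage sketch for xclget above (illustrative only; jtest
c     is hypothetical, shown with a leading "!"): extract global row
c     j=jtest on every node,
c
!      real aline(itdm)
!      call xclget(aline,itdm, a, 1,jtest, 1,0, 0)  !iinc=1, jinc=0
c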
      subroutine xclput(aline,nl, a, i1,j1,iinc,jinc, mnflg)
      implicit none
c
      integer, intent(in), optional :: mnflg
      integer, intent(in)    :: nl,i1,j1,iinc,jinc
      real,    intent(inout) :: aline(nl)
      real,    intent(inout) :: a(1-nbdy:idm+nbdy,1-nbdy:jdm+nbdy)
c
c**********
c*
c  1) fill a line of elements in the non-tiled 2-D grid.
c
c  2) aline(i) == aa(i1+iinc*(i-1),j1+jinc*(i-1)), for i=1...nl.
c     where aa is the non-tiled representation of a, and
c     one of iinc and jinc must be 0, and the other must be 1.
c     also updates the halo.
c
c  3) mnflg selects which nodes hold the line on entry
c        = 0; all nodes (default)
c        = n; node number n (mnproc=n)
c     all nodes hold the line on exit
c
c  4) parameters:
c       name       type     usage     description
c    ----------  --------  -------  ----------------------------
c    aline       real      in/out   line of element values
c    nl          integer   input    dimension of aline
c    a           real      in/out   tiled target array
c    i1          integer   input    1st non-tiled index into a
c    j1          integer   input    2nd non-tiled index into a
c    iinc        integer   input    1st index increment
c    jinc        integer   input    2nd index increment
c    mnflg       integer   input    node source flag
c*
c**********
c
      integer i,j
#if defined(TIMER)
c
      if (nxc.eq.0) then
        call xctmr0( 5)
        nxc = 5
      endif
#endif
c
      if (present(mnflg)) then
        if (mnflg.ne.0) then
          call xcastr(aline, mnflg)  !broadcast the input line
        endif
      endif
c
      if     (jinc.eq.0) then
        j = j1-j0
        if (j.ge.1-nbdy .and. j.le.jj+nbdy) then
          do i= max(1-nbdy,i1-i0),min(i1+nl-1-i0,ii+nbdy)
            a(i,j) = aline(i+i0-i1+1)
          enddo
          if (nreg.ne.0 .and. i1+nl-1.ge.itdm-nbdy+1) then  ! periodic
            do i= max(1-nbdy,i1-itdm-i0),min(i1+nl-1-itdm-i0,0)
              a(i,j) = aline(i+itdm+i0-i1+1)
            enddo
          endif
          if (nreg.ne.0 .and. i1.le.nbdy) then  ! periodic
            do i= max(ii+1,i1+itdm-i0),min(i1+nl-1+itdm-i0,ii+nbdy)
              a(i,j) = aline(i-itdm+i0-i1+1)
            enddo
          endif
        endif !j
      elseif (iinc.eq.0) then
        i = i1-i0
        if (i.ge.1-nbdy .and. i.le.ii+nbdy) then
          do j= max(1-nbdy,j1-j0),min(j1-j0+nl-1,jj+nbdy)
            a(i,j) = aline(j+j0-j1+1)
          enddo
        endif !on tile
        if (nreg.ne.0 .and. i1.ge.itdm-nbdy+1) then  ! periodic
          i = i1-itdm-i0  !e.g. i1==itdm becomes 0-i0
          if (i.ge.1-nbdy .and. i.le.0) then
            do j= max(1-nbdy,j1-j0),min(j1-j0+nl-1,jj+nbdy)
              a(i,j) = aline(j+j0-j1+1)
            enddo
          endif !in halo
        endif !near itdm
        if (nreg.ne.0 .and. i1.le.nbdy) then  ! periodic
          i = i1+itdm-i0  !e.g. i1==1 becomes itdm+1-i0
          if (i.ge.ii+1 .and. i.le.ii+nbdy) then
            do j= max(1-nbdy,j1-j0),min(j1-j0+nl-1,jj+nbdy)
              a(i,j) = aline(j+j0-j1+1)
            enddo
          endif !in halo
        endif !near 1
      endif
#if defined(TIMER)
c
      if (nxc.eq. 5) then
        call xctmr1( 5)
        nxc = 0
      endif
#endif
      return
      end subroutine xclput

      subroutine xcmaxr_0(a, mnflg)
      implicit none
c
      real,    intent(inout) :: a
      integer, intent(in)    :: mnflg
c
c**********
c*
c  1) replace scalar a with its element-wise maximum over all tiles.
c
c  2) mnflg selects which nodes must return the maximum
c        = 0; all nodes
c        = n; node number n (mnproc=n)
c
c  3) parameters:
c       name       type     usage     description
c    ----------  --------  -------  ----------------------------
c    a           real      in/out   target variable
c    mnflg       integer   input    node return flag
c*
c**********
c
      real a1(1)
c
      a1(1) = a
      call xcmaxr_1(a1, mnflg)
      a = a1(1)
      return
      end subroutine xcmaxr_0
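c
c     the scalar entry above just wraps the array version via a
c     length-one buffer; assuming the module's generic interface
c     resolves xcmaxr to xcmaxr_0 here, a call is simply (illustrative,
c     with a leading "!"):
c
!      call xcmaxr(depmax, 0)  !global maximum of a scalar, on all nodes
c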
      subroutine xcmaxr_1(a, mnflg)
      implicit none
c
      real,    intent(inout) :: a(:)
      integer, intent(in)    :: mnflg
c
c**********
c*
c  1) replace array a with its element-wise maximum over all tiles.
c
c  2) mnflg selects which nodes must return the maximum
c        = 0; all nodes
c        = n; node number n (mnproc=n)
c
c  3) parameters:
c       name       type     usage     description
c    ----------  --------  -------  ----------------------------
c    a           real      in/out   target array
c    mnflg       integer   input    node return flag
c*
c**********
c
#if defined(MPI)
      include 'mpif.h'
      integer mpierr
#endif
c
      integer i,is0,isl,mn,mnp,n,nn
#if defined(TIMER)
c
      if (nxc.eq.0) then
        call xctmr0(10)
        nxc = 10
      endif
#endif
c
c     stripmine a.
c
      n = size(a)
c
      do is0= 0,n-1,nmax
        isl = min(is0+nmax,n)
        nn  = isl - is0
        do i= 1,nn
          b(i) = a(is0+i)
        enddo
c
#if defined(MPI)
        if (mnflg.eq.0) then
          call mpi_allreduce(b,c,nn,MTYPER,mpi_max,
     &                       mpi_comm_hycom,mpierr)
        else
          call mpi_reduce(   b,c,nn,MTYPER,mpi_max,
     &                       idproc1(mnflg),
     &                       mpi_comm_hycom,mpierr)
        endif
#elif defined(SHMEM)
        BARRIER
        mnp = max(1,mnflg)
        if     (mnproc.eq.mnp) then
c         form global maximum on one processor
          do i= 1,nn
            c(i) = b(i)
          enddo
          do mn= 1,ijpr
            if (mn.ne.mnp) then
              call SHMEM_GETR(b,b,nn, idproc1(mn))
              do i= 1,nn
                c(i) = max(c(i),b(i))
              enddo
            endif !.ne.mnp
          enddo
          BARRIER
        elseif (mnflg.eq.0) then
c         get global maximum from 1st processor
          BARRIER
          call SHMEM_GETR(c,c,nn, idproc1(mnp))
        endif
!       no barrier needed here because using two buffers (b and c)
#endif
        if (mnflg.eq.0 .or. mnflg.eq.mnproc) then
          do i= 1,nn
            a(is0+i) = c(i)
          enddo
        endif
      enddo  ! stripmine loop
#if defined(TIMER)
c
      if (nxc.eq.10) then
        call xctmr1(10)
        nxc = 0
      endif
#endif
      return
      end subroutine xcmaxr_1

      subroutine xcmaxr_0o(a)
      implicit none
c
      real, intent(inout) :: a
c
c**********
c*
c  1) replace scalar a with its element-wise maximum over all tiles.
c
c  2) parameters:
c       name       type     usage     description
c    ----------  --------  -------  ----------------------------
c    a           real      in/out   target variable
c*
c**********
c
      integer mnflg
      real    a1(1)
c
      mnflg = 0  !all nodes
      a1(1) = a
      call xcmaxr_1(a1, mnflg)
      a = a1(1)
      return
      end subroutine xcmaxr_0o

      subroutine xcmaxr_1o(a)
      implicit none
c
      real, intent(inout) :: a(:)
c
c**********
c*
c  1) replace array a with its element-wise maximum over all tiles.
c
c  2) parameters:
c       name       type     usage     description
c    ----------  --------  -------  ----------------------------
c    a           real      in/out   target array
c*
c**********
c
      integer mnflg
c
      mnflg = 0  !all nodes
      call xcmaxr_1(a, mnflg)
      return
      end subroutine xcmaxr_1o

      subroutine xcminr_0(a, mnflg)
      implicit none
c
      real,    intent(inout) :: a
      integer, intent(in)    :: mnflg
c
c**********
c*
c  1) replace scalar a with its element-wise minimum over all tiles.
c
c  2) mnflg selects which nodes must return the minimum
c        = 0; all nodes
c        = n; node number n (mnproc=n)
c
c  3) parameters:
c       name       type     usage     description
c    ----------  --------  -------  ----------------------------
c    a           real      in/out   target variable
c    mnflg       integer   input    node return flag
c*
c**********
c
      real a1(1)
c
      a1(1) = a
      call xcminr_1(a1, mnflg)
      a = a1(1)
      return
      end subroutine xcminr_0
      subroutine xcminr_1(a, mnflg)
      implicit none
c
      real,    intent(inout) :: a(:)
      integer, intent(in)    :: mnflg
c
c**********
c*
c  1) replace array a with its element-wise minimum over all tiles.
c
c  2) mnflg selects which nodes must return the minimum
c        = 0; all nodes
c        = n; node number n (mnproc=n)
c
c  3) parameters:
c       name       type     usage     description
c    ----------  --------  -------  ----------------------------
c    a           real      in/out   target array
c    mnflg       integer   input    node return flag
c*
c**********
c
#if defined(MPI)
      include 'mpif.h'
      integer mpierr
#endif
c
      integer i,is0,isl,mn,mnp,n,nn
#if defined(TIMER)
c
      if (nxc.eq.0) then
        call xctmr0(10)
        nxc = 10
      endif
#endif
c
c     stripmine a.
c
      n = size(a)
c
      do is0= 0,n-1,nmax
        isl = min(is0+nmax,n)
        nn  = isl - is0
        do i= 1,nn
          b(i) = a(is0+i)
        enddo
c
#if defined(MPI)
        if (mnflg.eq.0) then
          call mpi_allreduce(b,c,nn,MTYPER,mpi_min,
     &                       mpi_comm_hycom,mpierr)
        else
          call mpi_reduce(   b,c,nn,MTYPER,mpi_min,
     &                       idproc1(mnflg),
     &                       mpi_comm_hycom,mpierr)
        endif
#elif defined(SHMEM)
        BARRIER
        mnp = max(1,mnflg)
        if     (mnproc.eq.mnp) then
c         form global minimum on one processor
          do i= 1,nn
            c(i) = b(i)
          enddo
          do mn= 1,ijpr
            if (mn.ne.mnp) then
              call SHMEM_GETR(b,b,nn, idproc1(mn))
              do i= 1,nn
                c(i) = min(c(i),b(i))
              enddo
            endif !.ne.mnp
          enddo
          BARRIER
        elseif (mnflg.eq.0) then
c         get global minimum from 1st processor
          BARRIER
          call SHMEM_GETR(c,c,nn, idproc1(mnp))
        endif
!       no barrier needed here because using two buffers (b and c)
#endif
        if (mnflg.eq.0 .or. mnflg.eq.mnproc) then
          do i= 1,nn
            a(is0+i) = c(i)
          enddo
        endif
      enddo
#if defined(TIMER)
c
      if (nxc.eq.10) then
        call xctmr1(10)
        nxc = 0
      endif
#endif
      return
      end subroutine xcminr_1

      subroutine xcminr_0o(a)
      implicit none
c
      real, intent(inout) :: a
c
c**********
c*
c  1) replace scalar a with its element-wise minimum over all tiles.
c
c  2) parameters:
c       name       type     usage     description
c    ----------  --------  -------  ----------------------------
c    a           real      in/out   target variable
c*
c**********
c
      integer mnflg
      real    a1(1)
c
      mnflg = 0  !all nodes
      a1(1) = a
      call xcminr_1(a1, mnflg)
      a = a1(1)
      return
      end subroutine xcminr_0o

      subroutine xcminr_1o(a)
      implicit none
c
      real, intent(inout) :: a(:)
c
c**********
c*
c  1) replace array a with its element-wise minimum over all tiles.
c
c  2) parameters:
c       name       type     usage     description
c    ----------  --------  -------  ----------------------------
c    a           real      in/out   target array
c*
c**********
c
      integer mnflg
c
      mnflg = 0  !all nodes
      call xcminr_1(a, mnflg)
      return
      end subroutine xcminr_1o
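c
c     minimal reduction sketch (illustrative only; assumes the module's
c     generic xcminr/xcmaxr interfaces and a hypothetical scalar, shown
c     with a leading "!"):
c
!      call xcminr(dpmin)     !element-wise minimum, result on all nodes
!      call xcminr(dpmin, 1)  !result valid on node 1 (mnproc=1) only
c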
#if defined(OCEANS2)
      subroutine xcpipe(aa_master,what_master, aa_slave,what_slave)
      implicit none
c
      real,         intent(out) :: aa_master(itdm,jtdm)
      real,         intent(in)  :: aa_slave( itdm,jtdm)
      character*12, intent(out) :: what_master
      character*12, intent(in)  :: what_slave
c
c**********
c*
c  1) copy aa_slave on task idp1_2 to aa_master on task idp1_1
c     only called from pipe_compare
c
c  2) parameters:
c       name          type         usage     description
c    -----------   ------------   -------  ----------------------------
c    aa_master     real           output   non-tiled target array
c    aa_slave      real           input    non-tiled source array
c    what_master   character*12   output   non-tiled target array name
c    what_slave    character*12   input    non-tiled source array name
c*
c**********
c
      include 'mpif.h'
      integer mpierr
c
      integer iwhat(12)
      save    iwhat
c
      integer i
c
c     only the 1st task on master and slave are involved in the copy.
c     note the use of mpi_comm_world
c
      if (mnproc.eq.1) then
        if (nocean.eq.1) then
c ---     master
          call MPI_RECV(iwhat,12,MTYPEI,
     &                  idp1_2, 9943,
     &                  mpi_comm_world, MPI_STATUS_IGNORE, mpierr)
          do i= 1,12
            what_master(i:i) = char(iwhat(i))
          enddo !i
          call MPI_RECV(aa_master,itdm*jtdm,MTYPER,
     &                  idp1_2, 9944,
     &                  mpi_comm_world, MPI_STATUS_IGNORE, mpierr)
        else   !nocean==2
c ---     slave
          do i= 1,12
            iwhat(i) = ichar(what_slave(i:i))
          enddo !i
          call MPI_SEND(iwhat,12,MTYPEI,
     &                  idp1_1, 9943,
     &                  mpi_comm_world, mpierr)
          call MPI_SEND(aa_slave,itdm*jtdm,MTYPER,
     &                  idp1_1, 9944,
     &                  mpi_comm_world, mpierr)
        endif !master:slave
      endif !mnproc==1
      return
      end subroutine xcpipe
#endif /* OCEANS2 */

#if defined(USE_ESMF)
      subroutine xcspmd(mpi_comm_vm)
      implicit none
c
      integer mpi_comm_vm
#else
      subroutine xcspmd
#if defined(USE_CCSM3)
      use ccsm3
      use ccsm3_io
#endif
      implicit none
#endif
c
c**********
c*
c  1) initialize data structures that identify the tiles.
c
c  2) data structures (public):
c      ipr    - 1st 2-D node dimension (<=iqr)
c      jpr    - 2nd 2-D node dimension (<=jqr)
c      ijpr   - 1-D node dimension (ipr*jpr)
c      mproc  - 1st 2-D node index
c      nproc  - 2nd 2-D node index
c      mnproc - 1-D node index
c      mp_1st - 1st node in this row of 2-D nodes, mpe_1(nproc)
c      i0     - 1st dimension offset for this tile, i0_pe(mproc,nproc)
c      ii     - 1st dimension extent for this tile, ii_pe(mproc,nproc)
c      j0     - 2nd dimension offset for this tile, j0_pe(mproc,nproc)
c      jj     - 2nd dimension extent for this tile, jj_pe(mproc,nproc)
c      nreg   - region type
c      vland  - fill value for land (standard value 0.0)
c
c  3) data structures (private):
c      idproc  - 2-D node addresses, with periodic wrap
c      idproc1 - 1-D node addresses, with periodic wrap
c      idhalo  - left and right halo target nodes
c      i0_pe   - 1st dimension tile offsets
c      ii_pe   - 1st dimension tile extents (<=idm)
c      j0_pe   - 2nd dimension tile offsets
c      jj_pe   - 2nd dimension tile extents (<=jdm)
c      mpe_1   - 1st node in each row of 2-D nodes
c      mpe_e   - end node in each row of 2-D nodes
c      mpe_i   - mapping from 1st global dimension to 2-D nodes
c      npe_j   - mapping from 2nd global dimension to 2-D nodes
c      i1sum   - local index of 1st partial sum on each tile
c      iisum   - number of partial sums on each tile
c      m0_top  - tile offset:  top neighbors (0:jpr-1)
c      mm_top  - tile extent:  top neighbors (<=jpr)
c      i0_st   - halo offsets: send top neighbors
c      ii_st   - halo lengths: send top neighbors
c      i0_gt   - halo offsets: get  top neighbors
c      ii_gt   - halo lengths: get  top neighbors
c      m0_bot  - tile offset:  bot neighbors (0:jpr-1)
c      mm_bot  - tile extent:  bot neighbors (<=jpr)
c      i0_sb   - halo offsets: send bot neighbors
c      ii_sb   - halo lengths: send bot neighbors
c      i0_gb   - halo offsets: get  bot neighbors
c      ii_gb   - halo lengths: get  bot neighbors
c
c  4) all data structures are based on the processor number and
c     the patch distribution file, 'patch.input'.
c*
c**********
c
      integer i,idm_in,itdm_in,ios,ixsum,
     &        j,jdm_in,jtdm_in,l,m,mm,mn,mypei,n,npesi
c
      character*80 cfile
c
#if defined(MPI)
      include 'mpif.h'
      integer mpierr
c
c     standard mpi (message passing) version.
c
#if defined(USE_ESMF)
      call mpi_comm_dup(mpi_comm_vm,  mpi_comm_hycom, mpierr)
      lp = 6
#elif defined(USE_CCSM3)
      call mpi_comm_dup(mpi_comm_ocn, mpi_comm_hycom, mpierr)
      lp = 6
#elif defined(OCEANS2)
c --- two copies of HYCOM on distinct MPI tasks, master/slave via mod_pipe.
      call mpi_init(mpierr)
      call mpi_comm_rank(mpi_comm_world, mypei, mpierr)
      open(unit=uoff+99,file='./patch.input', iostat=ios)
      if (ios.ne.0) then
        if (mypei.eq.0) then
          write(6,'(a)') "xcspmd: can't open ./patch.input"
          call flush(6)
        endif !1st task
        call mpi_finalize(mpierr)
        stop '(xcspmd)'
      endif !ios!=0
      read(uoff+99,'(/i6/)',iostat=ios) ijpr
      if (ios.ne.0) then
        if (mypei.eq.0) then
          write(6,'(a)') "xcspmd: can't read ./patch.input"
          call flush(6)
        endif !1st task
        call mpi_finalize(mpierr)
        stop '(xcspmd)'
      endif !ios!=0
      close(uoff+99)
      idp1_1 = 0     !1st mpi task of 1st ocean
      idp1_2 = ijpr  !1st mpi task of 2nd ocean
      if (mypei.lt.idp1_2) then
        nocean = 1  !master ocean
        call mpi_comm_split(mpi_comm_world, nocean,mypei,
     &                      mpi_comm_hycom, mpierr)
        lp = 6
      else
        nocean = 2  !slave ocean
        call mpi_comm_split(mpi_comm_world, nocean,mypei,
     &                      mpi_comm_hycom, mpierr)
        lp = uoff+6
        write(cfile,"(a,i6.6,a)") "./OCEAN2/stdout",mypei+1,".log"
        open(unit=lp,file=trim(cfile),status='new')
      endif
#else
      call mpi_init(mpierr)
      call mpi_comm_dup(mpi_comm_world, mpi_comm_hycom, mpierr)
      lp = 6
#endif
c
      call mpi_comm_rank(mpi_comm_hycom, mypei, mpierr)
      call mpi_comm_size(mpi_comm_hycom, npesi, mpierr)
c
      mnproc = mypei + 1  ! mnproc counts from 1
#if defined(DEBUG_ALL)
      write(lp,'(a,i5)') 'mnproc =',mnproc
      call xcsync(flush_lp)
#endif
#elif defined(SHMEM)
      integer SHMEM_MYPE,SHMEM_NPES
c
c     shmem version.
c
      call start_pes(0)
c
      lp = 6
c
      mypei = SHMEM_MYPE()
      npesi = SHMEM_NPES()
c
      mnproc = mypei + 1  ! mnproc counts from 1
#else
      lp = 6
c
      write(lp,*)
      write(lp,*) '***** ERROR - UNDEFINED SPMD MACHINE TYPE *****'
      write(lp,*)
      call flush(lp)
      stop '(xcspmd)'
#endif
c
      vland  = 0.0
      vland4 = 0.0
c
#if defined(T3E)
      open(unit=lp,delim='none')  ! forces open on stdout
#endif
c
c     initialize timers early to allow xcstop to work.
c
      call xctmri
c
c     read in the tile locations and sizes.
c     patch distribution file on unit 21 (fort.21).
c
c     here is an example of a patch file, with a leading "!" added:
c
!      npes   npe   mpe   idm   jdm  ibig  jbig  nreg
!        16     4     4    57    52    19    15     0
!
!ispt( 1) =    22    35    42    49
!iipe( 1) =    13     7     7     7
!ispt( 2) =     1    15    25    34
!iipe( 2) =    14    10     9     9
!ispt( 3) =     2    21    29    38
!iipe( 3) =    19     8     9    10
!ispt( 4) =    18    28    35    42
!iipe( 4) =    10     7     7     9
!
!jspt( 1) =     1    15    26    38
!jjpe( 1) =    14    11    12    15
c
c     ispt(j) is the 1st i-point on each tile in the j-th row
c     iipe(j) is the i-extent  of each tile in the j-th row
c     jspt(1) is the 1st j-point on each tile in all columns
c     jjpe(1) is the j-extent  of each tile in all columns
c
c     note that each tile row can have a different tile layout,
c     but each tile column is identical.  So a tile can have at
c     most one neighbor in the E and W directions, but several
c     neighbors in the N and S directions.
c
c     iipe can be zero, indicating an empty tile (not included in the
c     active tile count, npes) and therefore at least a halo width's
c     land separation between active tiles to its east and west.  In
c     periodic cases, the last non-empty tile can wrap periodically to
c     the first tile in the row (i.e. trailing "empty" tiles can be
c     null tiles, rather than all-land tiles).
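c
c     for example, row 1 of the patch file above has ispt = 22 35 42 49
c     and iipe = 13 7 7 7, so after the "- 1" adjustment below the
c     first tile in that row gets i0_pe = 21 and ii_pe = 13, i.e. it
c     owns global points i = 22...34.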
c #if defined(USE_CCSM3) open(unit=uoff+99,file=trim(flnmptchd)//'patch.input', & iostat=ios) #elif defined(OCEANS2) if (nocean.eq.2) then open(unit=uoff+99,file="./OCEAN2/patch.input", & iostat=ios) else open(unit=uoff+99,file='./patch.input', & iostat=ios) endif #else open(unit=uoff+99,file='./patch.input', & iostat=ios) #endif if (ios.ne.0) then call xcstop('xcspmd: error opening patch.input') stop '(xcspmd)' endif read(uoff+99,'(/8i6/)',iostat=ios) ijpr,ipr,jpr, & itdm_in,jtdm_in,idm_in,jdm_in,nreg if (ios.ne.0) then call xcstop('xcspmd: error reading patch.input') stop '(xcspmd)' elseif (ijpr.gt.ijqr .or. ipr.gt.iqr .or. jpr.gt.jqr) then if (mnproc.eq.1) then write(lp,'(a,3i5)') 'input: ijpr,ipr,jpr =',ijpr,ipr,jpr write(lp,'(a,3i5)') 'param: ijqr,iqr,jqr =',ijqr,iqr,jqr call flush(lp) endif call xcstop('xcspmd: patch.input for wrong ipr,jpr,ijpr') stop '(xcspmd)' #if defined(ARCTIC) elseif (nreg.ne.3) then ! not arctic if (mnproc.eq.1) then write(lp,'(a,i5)') 'input: nreg =',nreg call flush(lp) endif call xcstop('xcspmd: patch.input must be for arctic') stop '(xcspmd)' #else elseif (nreg.lt.0 .or. nreg.gt.2) then ! not closed or periodic ! use TYPE=one/omp for f-plane if (mnproc.eq.1) then write(lp,'(a,i5)') 'input: nreg =',nreg call flush(lp) endif call xcstop('xcspmd: patch.input for wrong nreg') stop '(xcspmd)' #endif /* ARCTIC:else */ endif c #if defined(RELO) c --- region's horizontal dimensions are from patch.input: c itdm = itdm_in jtdm = jtdm_in idm = idm_in jdm = jdm_in #else if (itdm_in.ne.itdm .or. jtdm_in.ne.jtdm) then if (mnproc.eq.1) then write(lp,'(a,2i5)') 'input: itdm,jtdm =',itdm_in,jtdm_in write(lp,'(a,2i5)') 'param: itdm,jtdm =',itdm, jtdm call flush(lp) endif call xcstop('xcspmd: patch.input for wrong itdm,jtdm') stop '(xcspmd)' elseif (idm_in.gt.idm .or. jdm_in.gt.jdm) then if (mnproc.eq.1) then write(lp,'(a,2i5)') 'input: idm,jdm =',idm_in,jdm_in write(lp,'(a,2i5)') 'param: idm,jdm =',idm, jdm call flush(lp) endif call xcstop('xcspmd: patch.input for wrong idm,jdm') stop '(xcspmd)' endif #endif c c individual tile rows. c do n= 1,jpr read( uoff+99,'(12x,8i6)') (i0_pe(m,n),m=1,ipr) ! ispt=i0+1 read( uoff+99,'(12x,8i6)') (ii_pe(m,n),m=1,ipr) ! iipe if (maxval(ii_pe(1:ipr,n)).le.0) then call xcstop('xcspmd: patch.input has an empty row') stop '(xcspmd)' endif do m= 1,ipr i0_pe(m,n) = i0_pe(m,n) - 1 enddo enddo #if defined(ARCTIC) c c --- all arctic patch tiles must be the same size or empty, c --- and empty tiles must be "twinned" across the top boundary. c if (ipr.gt.1) then do m= 1,ipr if (ii_pe(m,jpr).eq.0) then if (ii_pe(ipr+1-m,jpr).ne.0) then if (mnproc.eq.1) then write(lp,'(a,i3,a,i3,a)') & 'error - tile',m,',jpr is empty but tile', & ipr+1-m,',jpr is not' call flush(lp) endif call xcstop('xcspmd: arctic empty tiles are not twins') stop '(xcspmd)' endif elseif (ii_pe(m,jpr).ne.itdm/ipr) then if (mnproc.eq.1) then write(lp,'(a,i5)') & 'error - arctic patch tiles should have ii =',itdm/ipr call flush(lp) endif call xcstop('xcspmd: arctic tiles are not the right size') stop '(xcspmd)' endif enddo !m endif #endif /* ARCTIC */ c c the generic tile column (must cover entire column). c read( uoff+99,*) read( uoff+99,'(12x,8i6)') (j0_pe(1,n),n=1,jpr) ! jspt = io+1 read( uoff+99,'(12x,8i6)') (jj_pe(1,n),n=1,jpr) ! 
jjpe if (j0_pe(1,1).ne.1) then call xcstop('xcspmd: patch.input for wrong jspt') stop '(xcspmd)' endif j0_pe(1,1) = 0 do n= 2,jpr j0_pe(1,n) = j0_pe(1,n) - 1 if (j0_pe(1,n).ne.j0_pe(1,n-1)+jj_pe(1,n-1)) then call xcstop('xcspmd: patch.input non-contiguous') stop '(xcspmd)' endif enddo if (j0_pe(1,jpr)+jj_pe(1,jpr).ne.jtdm) then call xcstop('xcspmd: patch.input for wrong jjpe') stop '(xcspmd)' endif do m= 2,ipr j0_pe(m,:) = j0_pe(1,:) jj_pe(m,:) = jj_pe(1,:) enddo close(uoff+99) c #if defined(RELO) allocate( mpe_i(0:itdm+1,0:jqr), npe_j(0:jtdm+1) ) call mem_stat_add( (itdm+2)*(jqr+1)+jtdm+2 ) #endif c #if defined(MPI) c c do we have the right number of pes? c if (npesi.ne.ijpr) then if (mnproc.eq.1) then write(lp,*) write(lp,*) '***** ERROR - WRONG MPI SIZE *****' write(lp,*) write(lp,*) 'NPES = ',npesi write(lp,*) ' IJPR = ', ijpr write(lp,*) 'IPR,JPR = ',ipr,jpr write(lp,*) call flush(lp) endif call xcstop('Error in xcspmd') stop endif c c mpi messages are sent and received by pe number (0:ijpr-1). c null_tile = mpi_proc_null c mn = 0 do n= 1,jpr mpe_1(n) = 0 do m= 1,ipr if (ii_pe(m,n).eq.0) then idproc(m,n) = null_tile else idproc1(mn+1) = mn idproc(m,n) = mn mn = mn + 1 if (mnproc.eq.mn) then mproc = m nproc = n endif mpe_e(n) = m if (mpe_1(n).eq.0) then mpe_1(n) = m endif endif enddo enddo c mp_1st = mpe_1(nproc) !1st node in this row (public) c c mpi-2 i/o group (public), see mod_za. c if (mproc.eq.mp_1st) then i = 1 call mpi_comm_split(mpi_comm_hycom, i,0, & group_1st_in_row, mpierr) else i = 0 call mpi_comm_split(mpi_comm_hycom, i,0, & group_1st_in_row, mpierr) call mpi_comm_free( group_1st_in_row, mpierr) endif #elif defined(SHMEM) c c do we have the right number of pes? c if (npesi.ne.ijpr) then if (mnproc.eq.1) then write(lp,*) write(lp,*) '***** ERROR - WRONG SHMEM SIZE *****' write(lp,*) write(lp,*) 'NPES = ',npesi write(lp,*) ' IJPR = ', ijpr write(lp,*) 'IPR,JPR = ',ipr,jpr write(lp,*) call flush(lp) endif call xcstop('Error in xcspmd') stop endif c c shmem messages are sent and received by pe number (0:ijpr-1). c null_tile = -1 c mn = 0 do n= 1,jpr mpe_1(n) = 0 do m= 1,ipr if (ii_pe(m,n).eq.0) then idproc(m,n) = null_tile else idproc1(mn+1) = mn idproc(m,n) = mn mn = mn + 1 if (mnproc.eq.mn) then mproc = m nproc = n endif mpe_e(n) = m if (mpe_1(n).eq.0) then mpe_1(n) = m endif endif enddo enddo c mp_1st = mpe_1(nproc) !1st node in this row (public) #endif c if (mn.ne.ijpr) then if (mnproc.eq.1) then write(lp,'(a,i5)') 'input: ijpr =',ijpr write(lp,'(a,i5)') 'calc: ijpr =',mn call flush(lp) endif call xcstop('xcspmd: wrong number of sea tiles') stop '(xcspmd)' endif c if (nreg.eq.0) then c c longitudinal tile dimension is closed (not periodic) c do n= 1,jpr idproc( 0,n) = null_tile idproc(ipr+1,n) = null_tile enddo else c c longitudinal tile dimension is potentially periodic. c do n= 1,jpr idproc( 0,n) = null_tile idproc(ipr+1,n) = null_tile c i = maxval((i0_pe(1:ipr,n)+ii_pe(1:ipr,n))) if (i0_pe(1,n).eq.0 .and. i.eq.itdm) then idproc( 0,n) = idproc(mpe_e(n),n) idproc(mpe_e(n)+1,n) = idproc( 1,n) endif enddo endif #if defined(ARCTIC) c c must have ipr even or 1 for arctic boundary case. c if (ipr.gt.1 .and. mod(ipr,2).ne.0) then call xcstop('Error in xcspmd (arctic) - ipr must be even') stop '(xcspmd)' endif c c latitudinal tile dimension is closed/arctic. 
c do m= 1,ipr idproc(m, 0) = null_tile idproc(m,jpr+1) = idproc(ipr+1-m,jpr) !arctic tile mapping enddo idproc( 0, 0) = null_tile idproc(ipr+1, 0) = null_tile idproc( 0,jpr+1) = idproc(ipr,jpr+1) idproc(ipr+1,jpr+1) = idproc(1, jpr+1) #else c c latitudinal tile dimension is closed c do m= 0,ipr+1 idproc(m, 0) = null_tile idproc(m,jpr+1) = null_tile enddo #endif /* ARCTIC:else */ c c 1-d tiling logic is easier if assumed periodic. c idproc1( 0) = idproc1(ijpr) idproc1(ijpr+1) = idproc1( 1) c c mapping from global i,j to mp,np. c ia,ja is on tile mpe_i(ia,npe_j(ja)),npe_j(ja), c or on no tile if mpe_i(ia,npe_j(ja)) is 0 or -1. c mpe_i( :,0) = -1 !for npe_j out of range mpe_i( 0,:) = -1 !for i out of range mpe_i(itdm+1,:) = -1 !for i out of range do n= 1,jpr mpe_i(1:itdm,n) = 0 ! default is an empty tile do m= 1,ipr ! i-map potentially varies with n if (ii_pe(m,n).gt.0) then do i= i0_pe(m,n)+1,i0_pe(m,n)+ii_pe(m,n) mpe_i(i,n) = m enddo if (m.ne.ipr) then if (ii_pe(m+1,n).gt.0) then do i= i0_pe(m,n)+ii_pe(m,n)+1,i0_pe(m+1,n) mpe_i(i,n) = -1 ! gap between tiles enddo endif endif endif enddo m = 1 ! only one j-map do j= j0_pe(m,n)+1,j0_pe(m,n)+jj_pe(m,n) npe_j(j) = n enddo npe_j( 0) = 0 !for j out of range npe_j(jtdm+1) = 0 !for j out of range enddo c c do each partial sum on the tile that owns its center point. c i1sum - local index of 1st partial sum on each tile c iisum - number of partial sums on each tile c see xcsum for how i1sum and iisum are used. c ixsum = 0 do n= 1,jpr do m= 1,ipr if (ii_pe(m,n).le.0) then i1sum(m,n) = 0 iisum(m,n) = 0 else c --- partial sum specific definition of idhalo idhalo(1) = idproc(m-1,n) idhalo(2) = idproc(m+1,n) if (idhalo(1).ne.null_tile .and. m.ne.1) then if (i0_pe(m,n).ne.i0_pe(m-1,n)+ii_pe(m-1,n)) then idhalo(1) = null_tile endif endif if (m.eq.mpe_e(n)) then idhalo(2) = null_tile elseif (idhalo(2).ne.null_tile) then if (i0_pe(m,n)+ii_pe(m,n).ne.i0_pe(m+1,n)) then idhalo(2) = null_tile endif endif i1sum(m,n) = -99 iisum(m,n) = 0 do i= 1+nbdy,itdm+nbdy,2*nbdy+1 if (i0_pe(m,n).lt.i .and. & i.le.i0_pe(m,n)+ii_pe(m,n)) then iisum(m,n) = iisum(m,n) + 1 if (iisum(m,n).eq.1) then i1sum(m,n) = i - nbdy - i0_pe(m,n) endif elseif (idhalo(1).eq.null_tile .and. & i.gt.i0_pe(m,n)-nbdy .and. & i.le.i0_pe(m,n) ) then iisum(m,n) = iisum(m,n) + 1 if (iisum(m,n).eq.1) then i1sum(m,n) = i - nbdy - i0_pe(m,n) endif elseif (idhalo(2).eq.null_tile .and. & i.gt.i0_pe(m,n)+ii_pe(m,n) .and. & i.le.i0_pe(m,n)+ii_pe(m,n)+nbdy ) then iisum(m,n) = iisum(m,n) + 1 if (iisum(m,n).eq.1) then i1sum(m,n) = i - nbdy - i0_pe(m,n) endif endif enddo endif ixsum = max( ixsum, iisum(m,n) ) enddo !m enddo !n c c local tile extents. c i0 = i0_pe(mproc,nproc) ii = ii_pe(mproc,nproc) j0 = j0_pe(mproc,nproc) jj = jj_pe(mproc,nproc) c c left and right halo targets c idhalo(1) = idproc(mproc-1,nproc) idhalo(2) = idproc(mproc+1,nproc) c if (idhalo(1).ne.null_tile .and. mproc.ne.1) then c c is the left tile touching this one? c if (i0.ne.i0_pe(mproc-1,nproc)+ii_pe(mproc-1,nproc)) then idhalo(1) = null_tile endif endif c if (idhalo(2).ne.null_tile .and. mproc.ne.mpe_e(nproc)) then c c is the right tile touching this one? 
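c --- (tiles in a row need not abut, since tile widths can shrink
c --- over land; when a neighbor's i-range is not contiguous with
c --- this tile's, the halo target is set to null_tile and that
c --- side takes no part in the east-west halo exchange.)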
c if (i0+ii.ne.i0_pe(mproc+1,nproc)) then idhalo(2) = null_tile endif endif c c local halo exchange data structures c c m0_top - tile offset: top neighbors c mm_top - tile extent: top neighbors (<=jpr) c i0_st - halo offsets: send top neighbors c ii_st - halo lengths: send top neighbors c i0_gt - halo offsets: get top neighbors c ii_gt - halo lengths: get top neighbors c m0_bot - tile offset: bot neighbors c mm_bot - tile extent: bot neighbors (<=jpr) c i0_sb - halo offsets: send bot neighbors c ii_sb - halo lengths: send bot neighbors c i0_gb - halo offsets: get bot neighbors c ii_gb - halo lengths: get bot neighbors c c note that send is also receive, and is w.r.t. the local tile. c similarly get is also put, and is w.r.t. the remote tile. c if (nproc.eq.jpr) then #if defined(ARCTIC) c single, same size, top arctic neighbor m0_top = mproc - 1 mm_top = 1 i0_st(1) = 0 i0_gt(1) = 0 ii_st(1) = ii ii_gt(1) = ii #else c no top neighbor (closed boundary) m0_top = 0 mm_top = 0 #endif /* ARCTIC:else */ else n = nproc + 1 m0_top = 0 mm_top = 0 m = 0 do i= 1,ii if (mpe_i(i0+i,n).ne.m) then if (mm_top.eq.0) then m0_top = mpe_i(i0+i,n) - 1 elseif (m.ne.-1) then ii_st(mm_top) = i-1 - i0_st(mm_top) ii_gt(mm_top) = ii_st(mm_top) endif m = mpe_i(i0+i,n) if (m.gt.0) then mm_top = mm_top + 1 i0_st(mm_top) = i-1 i0_gt(mm_top) = i-1 + i0-i0_pe(m,n) elseif (m.eq.0) then mm_top = mm_top + 1 i0_st(mm_top) = i-1 i0_gt(mm_top) = i0_gt(mm_top-1) + ii_gt(mm_top-1) * elseif (m.eq.-1) then !do nothing endif endif enddo if (mm_top.gt.0) then if (m.gt.0) then ii_st(mm_top) = ii - i0_st(mm_top) ii_gt(mm_top) = ii_st(mm_top) elseif (m.eq.0) then mm_top = mm_top-1 * elseif (m.eq.-1) then !do nothing endif endif endif !nproc.eq.jpr:else c if (nproc.eq.1) then c no bottom neighbor (closed boundary) m0_bot = 0 mm_bot = 0 else n = nproc - 1 m0_bot = 0 mm_bot = 0 m = 0 do i= 1,ii if (mpe_i(i0+i,n).ne.m) then if (mm_bot.eq.0) then m0_bot = mpe_i(i0+i,n) - 1 elseif (m.ne.-1) then ii_sb(mm_bot) = i-1 - i0_sb(mm_bot) ii_gb(mm_bot) = ii_sb(mm_bot) endif m = mpe_i(i0+i,n) if (m.gt.0) then mm_bot = mm_bot + 1 i0_sb(mm_bot) = i-1 i0_gb(mm_bot) = i-1 + i0-i0_pe(m,n) elseif (m.eq.0) then mm_bot = mm_bot + 1 i0_sb(mm_bot) = i-1 i0_gb(mm_bot) = i0_gb(mm_bot-1) + ii_gb(mm_bot-1) * elseif (m.eq.-1) then !do nothing endif endif enddo if (mm_bot.gt.0) then if (m.gt.0) then ii_sb(mm_bot) = ii - i0_sb(mm_bot) ii_gb(mm_bot) = ii_sb(mm_bot) elseif (m.eq.0) then mm_bot = mm_bot-1 * elseif (m.eq.-1) then !do nothing endif endif endif !nproc.eq.1:else c c print out the tile data structures. c if (mnproc.eq.1) then write(lp,'(/a)') & 'mnproc mproc nproc i0 ii j0 jj i1sum iisum' mn = 0 do n= 1,jpr do m= 1,ipr if (ii_pe(m,n).ne.0) then mn= mn + 1 write(lp,'(i6,2i6,i7,i5,i7,i5,i7,i6)') & mn,m,n, & i0_pe(m,n),ii_pe(m,n), & j0_pe(m,n),jj_pe(m,n), & i1sum(m,n),iisum(m,n) endif enddo !m enddo !n write(lp,*) #if defined(ARCTIC) write(lp,'(a)') & 'mnproc mproc nproc mnarct' mn = 0 do n= 1,jpr do m= 1,ipr if (ii_pe(m,n).ne.0) then mn= mn + 1 if (n.eq.jpr) then write(lp,'(i6,2i6,i7)') & mn,m,n,idproc(m,n+1) endif endif enddo !m enddo !n write(lp,*) #endif /* ARCTIC */ #if defined(DEBUG_ALL) do n= 1,jpr write(lp,*) 'mpe_1,mpe_e = ',mpe_1(n),mpe_e(n) enddo do n= 1,jpr write(lp,*) 'mpe_i = ',mpe_i(:,n) enddo write(lp,*) write(lp,*) 'npe_j = ',npe_j(:) write(lp,*) #endif endif call xcsync(flush_lp) c #if defined(DEBUG_ALL) do n= 1,jpr do m= 1,ipr if (mproc.eq.m .and.
nproc.eq.n) then write(lp,'(a,2i3,i4,i3,16(i5,i4))') & 'm,n,_top,_st = ', & m,n,m0_top,mm_top,(i0_st(l),ii_st(l), l= 1,mm_top) #if defined(SHMEM) write(lp,'(a,2i3,i4,i3,16(i5,i4))') & 'm,n,_top,_gt = ', & m,n,m0_top,mm_top,(i0_gt(l),ii_gt(l), l= 1,mm_top) #endif if (m.eq.ipr) then write(lp,*) !blank line endif endif call xcsync(flush_lp) enddo !m do m= 1,ipr if (mproc.eq.m .and. nproc.eq.n) then write(lp,'(a,2i3,i4,i3,16(i5,i4))') & 'm,n,_bot,_sb = ', & m,n,m0_bot,mm_bot,(i0_sb(l),ii_sb(l), l= 1,mm_bot) #if defined(SHMEM) write(lp,'(a,2i3,i4,i3,16(i5,i4))') & 'm,n,_bot,_gb = ', & m,n,m0_bot,mm_bot,(i0_gb(l),ii_gb(l), l= 1,mm_bot) #endif if (m.eq.ipr) then write(lp,*) !blank line endif endif call xcsync(flush_lp) enddo !m do m= 1,ipr if (mproc.eq.m .and. nproc.eq.n) then write(lp,'(a,2i3,3i5)') & 'm,n,id,idhalo = ', & m,n,idproc(m,n),idhalo if (m.eq.ipr) then write(lp,*) !blank line endif endif call xcsync(flush_lp) enddo !m enddo !n #endif /* DEBUG_ALL */ #if defined(USE_ESMF) c c --- ESMF tiling fills the global array (1:itdm,1:jtdm) c mn=0 do n=1,jpr do m=1,ipr if(ii_pe(m,n).ne.0)then mn= mn + 1 #if defined(ARCTIC) c --- don't include the last row (it is a copy of the previous one) deBlockList(2,1,mn) = j0_pe(m,n)+1 deBlockList(2,2,mn) = min( j0_pe(m,n)+jj_pe(m,n), jtdm-1 ) #else deBlockList(2,1,mn) = j0_pe(m,n)+1 deBlockList(2,2,mn) = j0_pe(m,n)+jj_pe(m,n) #endif /* ARCTIC:else */ if (m.eq.mpe_1(n)) then if (m.eq.mpe_e(n)) then c --- only tile in row deBlockList(1,1,mn) = 1 deBlockList(1,2,mn) = itdm else c --- 1st tile in row, right edge may be extended later deBlockList(1,1,mn) = 1 deBlockList(1,2,mn) = i0_pe(m,n)+ii_pe(m,n) endif elseif (m.eq.mpe_e(n)) then c --- last tile in row deBlockList(1,1,mn) = i0_pe(m,n)+1 deBlockList(1,2,mn) = itdm if (deBlockList(1,1,mn) .ne. & deBlockList(1,2,mn-1)+1 ) then c --- a hole in the HYCOM tiling, over land, is filled here deBlockList(1,2,mn-1) = (deBlockList(1,2,mn-1)+ & deBlockList(1,1,mn) )/2 deBlockList(1,1,mn) = deBlockList(1,2,mn-1)+1 endif else c --- middle tile in row, right edge may be extended later deBlockList(1,1,mn) = i0_pe(m,n)+1 deBlockList(1,2,mn) = i0_pe(m,n)+ii_pe(m,n) if (deBlockList(1,1,mn) .ne. & deBlockList(1,2,mn-1)+1 ) then c --- a hole in the HYCOM tiling, over land, is filled here deBlockList(1,2,mn-1) = (deBlockList(1,2,mn-1)+ & deBlockList(1,1,mn) )/2 deBlockList(1,1,mn) = deBlockList(1,2,mn-1)+1 endif endif endif enddo !m enddo !n c if (mnproc.eq.1) then write(lp,'(/a)') & 'mnproc mproc nproc BL11 BL21 BL12 BL22' mn = 0 do n= 1,jpr do m= 1,ipr if (ii_pe(m,n).ne.0) then mn=mn+1 write(lp,'(i6,2i6,4i7)') & mn,m,n, & deBlockList(1,1,mn), & deBlockList(2,1,mn), & deBlockList(1,2,mn), & deBlockList(2,2,mn) endif enddo !m enddo !n write(lp,*) endif !1st tile call xcsync(flush_lp) #endif c c mxsum large enough? c #if defined(RELO) mxsum = ((idm+4*nbdy)/(2*nbdy+1))*jdm c #endif if (ixsum.gt.mxsum) then if (mnproc.eq.1) then write(lp,'(a,2i5)') 'mxsum,ixsum =',mxsum,ixsum call flush(lp) endif call xcstop('Error in xcspmd - mxsum too small') stop '(xcspmd)' endif c #if defined(TIMER) c initialize timer names. 
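c --- (timers 1:32 are reserved for the message-passing routines,
c --- see xctmri below; only the entry points timed in this module
c --- are given names here.)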
c call xctmrn( 1,'xcaget') call xctmrn( 2,'xceget') call xctmrn( 3,'xclget') call xctmrn( 4,'xcaput') call xctmrn( 5,'xcXput') call xctmrn( 6,'xcsum ') call xctmrn( 9,'xcastr') call xctmrn(10,'xcmaxr') call xctmrn(12,'xctilr') #if defined(ARCTIC) call xctmrn(13,'xctila') #endif #endif return end subroutine xcspmd subroutine xcstop(cerror) implicit none c character*(*), intent(in) :: cerror c c********** c* c 1) stop all processes. c c 2) all processes must call this routine. c use 'xchalt' for emergency stops. c c 3) parameters: c name type usage description c ---------- ---------- ------- ---------------------------- c cerror char*(*) input error message c* c********** c #if defined(MPI) include 'mpif.h' integer mpierr #endif c c print active timers. c call xctmrp c c message passing version, set barrier and then stop everything. c use xcsync as the barrier so that stdout is flushed. c note that the system will hang unless all processes call xcstop. c call xcsync(flush_lp) if (mnproc.eq.1 .and. cerror.ne.' ') then write(lp,*) '**************************************************' write(lp,*) cerror write(lp,*) '**************************************************' write(lp,*) endif call xcsync(flush_lp) c #if defined(USE_ESMF) || defined(USE_CCSM3) || defined(OCEANS2) c c --- we may not be running on all processes, so call mpi_abort c if (mnproc.eq.1) then call mpi_abort(mpi_comm_hycom,9) endif call xcsync(flush_lp) #elif defined(MPI) write(lp,*) 'mpi_finalize called on processor ',mnproc call xcsync(flush_lp) call mpi_finalize(mpierr) c #endif #if defined(STOP2003) c --- Fortran 2003 STOP outputs many lines, replace with exit call exit(0) #else stop '(xcstop)' #endif end subroutine xcstop subroutine xcsum(sum, a,mask) implicit none c real*8, intent(out) :: sum real, intent(inout) :: a( 1-nbdy:idm+nbdy,1-nbdy:jdm+nbdy) integer, intent(in) :: mask(1-nbdy:idm+nbdy,1-nbdy:jdm+nbdy) c c********** c* c 1) sum a 2-d array, where mask==1. c c 2) parameters: c name type usage description c ---------- ---------- ------- ---------------------------- c sum real*8 output sum of a c a real input source array c mask integer input mask array c c 3) sum is bit for bit reproducible for the same halo size, nbdy. c* c********** c #if defined(MPI) include 'mpif.h' integer mpierr #endif c real*8 zero8 parameter (zero8=0.0) c real*8 sum8 real vsave integer i,i1,j,l,mp,np #if defined(TIMER) c if (nxc.eq.0) then call xctmr0( 6) nxc = 6 endif #endif #if defined(RELO) c if (.not.allocated(sum8t)) then allocate( sum8t(mxsum), sum8j(jdm) ) call mem_stat_add( mxsum + jdm ) endif #endif c c halo update so that 2*nbdy+1 wide strips are on chip. c vsave = vland vland = 0.0 call xctilr(a,1,1, nbdy,0, halo_ps) vland = vsave c c row sums in 2*nbdy+1 wide strips. c !$OMP PARALLEL DO PRIVATE(j,i1,i,l,sum8) !$OMP& SCHEDULE(STATIC,jblk) do j=1,jj do l= 1,iisum(mproc,nproc) i1 = i1sum(mproc,nproc) + (l-1)*(2*nbdy+1) sum8 = zero8 do i= max(i1,1-nbdy),min(i1+2*nbdy,ii+nbdy,itdm-i0) if (mask(i,j).eq.1) then sum8 = sum8 + a(i,j) endif enddo sum8t(l + (j-1)*iisum(mproc,nproc)) = sum8 enddo enddo !$OMP END PARALLEL DO c c complete row sums on first processor in each row. c #if defined(SHMEM) BARRIER #endif if (mproc.eq.mpe_1(nproc)) then do j=1,jj sum8j(j) = zero8 do l= 1,iisum(mproc,nproc) sum8j(j) = sum8j(j) + sum8t(l + (j-1)*iisum(mproc,nproc)) enddo * write(lp,'(a,i3,i5,f12.2)') 'xcsum: np,j,sum = ', * & 1,j,sum8j(j) enddo c c remote sums.
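c --- each strip is 2*nbdy+1 points wide and is summed on the tile
c --- that owns its center point, so the order of the additions is
c --- independent of the tile layout.  illustrative example (nbdy=6
c --- assumed): the strips span global i = 1:13, 14:26, 27:39, ...
c --- and sum8t holds one real*8 partial sum per strip and row.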
c do mp= mpe_1(nproc)+1,mpe_e(nproc) l = iisum(mp,nproc)*jj if (l.gt.0) then #if defined(MPI) call MPI_RECV(sum8t,l,MTYPED, & idproc(mp,nproc), 9900, & mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr) #elif defined(SHMEM) call SHMEM_GETD(sum8t, & sum8t,l,idproc(mp,nproc)) #endif do j=1,jj do l= 1,iisum(mp,nproc) sum8j(j) = sum8j(j) + sum8t(l + (j-1)*iisum(mp,nproc)) enddo * write(lp,'(a,i3,i5,f12.2)') 'xcsum: np,j,sum = ', * & mp,j,sum8j(j) enddo endif enddo #if defined(MPI) else l = iisum(mproc,nproc)*jj if (l.gt.0) then call MPI_SEND(sum8t,l,MTYPED, & idproc(mpe_1(nproc),nproc), 9900, & mpi_comm_hycom, mpierr) endif #endif endif c c sum of row sums, on first processor. c #if defined(SHMEM) BARRIER #endif if (mnproc.eq.1) then sum8 = zero8 do j= 1,jj sum8 = sum8 + sum8j(j) enddo * write(lp,'(a,i5,f12.2)') 'xcsum: jj,sum = ',jj,sum8 c do np= 2,jpr mp = mpe_1(np) #if defined(MPI) call MPI_RECV(sum8j,jj_pe(mp,np),MTYPED, & idproc(mp,np), 9901, & mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr) #elif defined(SHMEM) call SHMEM_GETD(sum8j, & sum8j,jj_pe(mp,np),idproc(mp,np)) #endif do j= 1,jj_pe(mp,np) sum8 = sum8 + sum8j(j) enddo * write(lp,'(a,i5,f12.2)') 'xcsum: jj,sum = ', * & jj_pe(mp,np),sum8 enddo sum8s = sum8 #if defined(MPI) elseif (mproc.eq.mpe_1(nproc)) then call MPI_SEND(sum8j,jj,MTYPED, & idproc1(1), 9901, & mpi_comm_hycom, mpierr) #endif endif c c broadcast result to all processors. c #if defined(MPI) call mpi_bcast(sum8s,1,MTYPED, & idproc1(1),mpi_comm_hycom,mpierr) #elif defined(SHMEM) BARRIER if (mnproc.ne.1) then call SHMEM_GETD(sum8s, & sum8s,1,idproc1(1)) endif #endif c sum = sum8s #if defined(TIMER) c if (nxc.eq. 6) then call xctmr1( 6) nxc = 0 endif #endif return end subroutine xcsum subroutine xcsumj(sumj, a,mask) implicit none c real*8, intent(out) :: sumj(jtdm) real, intent(inout) :: a( 1-nbdy:idm+nbdy,1-nbdy:jdm+nbdy) integer, intent(in) :: mask(1-nbdy:idm+nbdy,1-nbdy:jdm+nbdy) c c********** c* c 1) row-sum of a 2-d array, where mask==1, on first processor only. c c 2) parameters: c name type usage description c ---------- ---------- ------- ---------------------------- c sumj real*8 output row-sum of a c a real input source array c mask integer input mask array c c 3) sum is bit for bit reproducible for the same halo size, nbdy. c* c********** c #if defined(MPI) include 'mpif.h' integer mpierr #endif c real*8 zero8 parameter (zero8=0.0) c real*8 sum8 real vsave integer i,i1,j,l,mp,np #if defined(TIMER) c if (nxc.eq.0) then call xctmr0( 6) nxc = 6 endif #endif #if defined(RELO) c if (.not.allocated(sum8t)) then allocate( sum8t(mxsum), sum8j(jdm) ) call mem_stat_add( mxsum + jdm ) endif #endif c c halo update so that 2*nbdy+1 wide strips are on chip. c vsave = vland vland = 0.0 call xctilr(a,1,1, nbdy,0, halo_ps) vland = vsave c c row sums in 2*nbdy+1 wide strips. c !$OMP PARALLEL DO PRIVATE(j,i1,i,l,sum8) !$OMP& SCHEDULE(STATIC,jblk) do j=1,jj do l= 1,iisum(mproc,nproc) i1 = i1sum(mproc,nproc) + (l-1)*(2*nbdy+1) sum8 = zero8 do i= max(i1,1-nbdy),min(i1+2*nbdy,ii+nbdy,itdm-i0) if (mask(i,j).eq.1) then sum8 = sum8 + a(i,j) endif enddo sum8t(l + (j-1)*iisum(mproc,nproc)) = sum8 enddo enddo !$OMP END PARALLEL DO c c complete row sums on first processor in each row. c #if defined(SHMEM) BARRIER #endif if (mproc.eq.mpe_1(nproc)) then do j=1,jj sum8j(j) = zero8 do l= 1,iisum(mproc,nproc) sum8j(j) = sum8j(j) + sum8t(l + (j-1)*iisum(mproc,nproc)) enddo * write(lp,'(a,i3,i5,f12.2)') 'xcsum: np,j,sum = ', * & 1,j,sum8j(j) enddo c c remote sums.
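c --- (this gather mirrors xcsum exactly, except that the per-row
c --- sums are kept separate and only the first processor needs
c --- the result, so no broadcast follows.)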
c do mp= mpe_1(nproc)+1,mpe_e(nproc) l = iisum(mp,nproc)*jj if (l.gt.0) then #if defined(MPI) call MPI_RECV(sum8t,l,MTYPED, & idproc(mp,nproc), 9900, & mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr) #elif defined(SHMEM) call SHMEM_GETD(sum8t, & sum8t,l,idproc(mp,nproc)) #endif do j=1,jj do l= 1,iisum(mp,nproc) sum8j(j) = sum8j(j) + sum8t(l + (j-1)*iisum(mp,nproc)) enddo * write(lp,'(a,i3,i5,f12.2)') 'xcsum: np,j,sum = ', * & mp,j,sum8j(j) enddo endif enddo #if defined(MPI) else l = iisum(mproc,nproc)*jj if (l.gt.0) then call MPI_SEND(sum8t,l,MTYPED, & idproc(mpe_1(nproc),nproc), 9900, & mpi_comm_hycom, mpierr) endif #endif endif c c send row sums to first processor. c #if defined(SHMEM) BARRIER #endif if (mnproc.eq.1) then do j= 1,jj sumj(j) = sum8j(j) enddo c do np= 2,jpr mp = mpe_1(np) #if defined(MPI) call MPI_RECV(sum8j,jj_pe(mp,np),MTYPED, & idproc(mp,np), 9901, & mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr) #elif defined(SHMEM) call SHMEM_GETD(sum8j, & sum8j,jj_pe(mp,np),idproc(mp,np)) #endif do j= 1,jj_pe(1,np) sumj(j+j0_pe(1,np)) = sum8j(j) enddo enddo #if defined(MPI) elseif (mproc.eq.mpe_1(nproc)) then call MPI_SEND(sum8j,jj,MTYPED, & idproc1(1), 9901, & mpi_comm_hycom, mpierr) #endif endif #if defined(TIMER) c if (nxc.eq. 6) then call xctmr1( 6) nxc = 0 endif #endif return end subroutine xcsumj subroutine xcsync(lflush) implicit none c logical, intent(in) :: lflush c c********** c* c 1) barrier, no processor exits until all arrive (and flush stdout). c c 2) some MPI implementations only flush stdout as a collective c operation, and hence the lflush=.true. option to flush stdout. c c 3) typically this is just a wrapper to the "BARRIER" macro. c* c********** c #if defined(MPI) include 'mpif.h' integer mpierr #endif c if (lflush) then #if defined(MPI) && defined(AIX) && ! (defined(AIX_NOFL) || defined(USE_ESMF) || defined(USE_CCSM3) || defined(OCEANS2)) call mp_flush(1) ! flushes stdout, and implies a barrier #else call flush(lp) BARRIER #endif else BARRIER endif return end subroutine xcsync #if defined(SHMEM) && defined(RINGB) subroutine xctbar(ipe1,ipe2) implicit none c integer, intent(in) :: ipe1,ipe2 c c********** c* c 1) sync with processors ipe1 and ipe2. c c 2) this is a global collective operation, and the calls on ipe1 c and ipe2 must list this processor as one of the two targets. c c 3) this is used in place of a global barrier in halo operations, c but it only provides synchronization of one or two processors c with the local processor. c c 4) ipe1 and/or ipe2 can be null_tile, to indicate no processor. c* c********** c integer cache_line,ilarge parameter (cache_line=32, ilarge=2**30) c integer, save, dimension(cache_line,-1:ijqr-1) :: ibp c integer i c integer icount save icount data icount / -1 / c icount = mod(icount+1,ilarge) if (icount.eq.0) then call shmem_barrier_all() do i= -1,ijpr-1 ibp(1,i) = -1 enddo call shmem_barrier_all() endif c ibp(1,-1) = icount if (ipe1.ne.null_tile) then call shmem_integer_put(ibp(1,mnproc-1),icount,1,ipe1) endif if (ipe2.ne.null_tile) then call shmem_integer_put(ibp(1,mnproc-1),icount,1,ipe2) endif call shmem_fence c i = -1 do while (i.lt.icount) c this assignment statement must not be optimized away. cdir$ suppress i = min(ibp(1,ipe1),ibp(1,ipe2)) enddo return end subroutine xctbar #endif /* SHMEM && RINGB */ #if defined(ARCTIC) recursive subroutine xctila(a,l1,ld,itype) implicit none c integer, intent(in) :: l1,ld,itype real, intent(inout) :: a(1-nbdy:idm+nbdy,1-nbdy:jdm+nbdy,ld) c c********** c* c 1) update the top row of a real array.
c only needed when importing a tripole grid array. c c 2) parameters: c name type usage description c ---------- ---------- ------- ---------------------------- c a real in/out target array c l1 integer input 3rd dim. start index c ld integer input 3rd dimension of a c itype integer input grid and field type c c 3) itype selects both the grid and field type c itype= 1; p-grid, scalar field c itype=11; p-grid, vector field c c 4) this version for a global tripole (arctic bipolar patch) grid c* c********** c c halo buffer (persistent, for enhanced MPI safety). c #if defined(RELO) real, save, allocatable, dimension(:,:) :: ai #else integer, parameter :: ilen=idm*kdm real, save, dimension(ilen,2) :: ai #endif integer i,io,k,l,lg0,ls0,lm,m real sarc c #if defined(MPI) include 'mpif.h' integer mpierr c c persistent communication handles. c integer mpireqa(4*iqr),nreqa, & klnold save mpireqa, nreqa, & klnold c data nreqa,klnold / 0,0 / #endif /* MPI */ c #if defined(RELO) if (.not.allocated(ai)) then allocate( ai(idm*kdm,2) ) call mem_stat_add( idm*kdm*2 ) endif #endif c c --- split large requests into smaller pieces c if (ld-l1+1.gt.kdm) then do k= l1,ld,kdm l = min(k+kdm-1,ld) call xctila(a,k,l,itype) enddo return endif c #if defined(TIMER) c if (nxc.eq.0) then call xctmr0(13) nxc = 13 endif #endif c #if defined(SHMEM) c --- not needed if ipr.eq.1, but simpler to include it BARRIER #endif if (nproc.eq.jpr) then !tiles involved in arctic boundary if (itype.lt.10) then sarc = 1.0 else sarc = -1.0 endif c if (ipr.eq.1) then do k= l1,ld do i= 1,ii io = ii-mod(i-1,ii) if (a(io,jj-1,k).ne.vland) then a(i,jj,k) = sarc*a(io,jj-1,k) else a(i,jj,k) = vland endif enddo !i enddo !k else l = 0 do i= 1,ii ! outer loop to simplify multiple neighbor case io = ii+1-i !ii:1:-1 do k= l1,ld l = l + 1 if (a(io,jj-1,k).ne.vland) then ai(l,1) = sarc*a(io,jj-1,k) else ai(l,1) = vland endif enddo !k enddo !i * write(lp,'(a,5i6)') 'xctila - l1,ld,ii,l,mnproc = ', * & l1,ld,ii,l,mnproc c #if defined(MPI) if (klnold.ne.(ld-l1+1)) then !new mpi init needed #if defined(DEBUG_ALL) * if (mproc.eq.mp_1st) then * write(lp,'(a,3i6)') 'xctila - l1,ld,itype = ', * & l1,ld,itype * endif * call xcsync(flush_lp) #endif do i= 1,nreqa call mpi_request_free(mpireqa(i), mpierr) enddo klnold = ld-l1+1 c c loop through all neighboring tiles. c l = 0 do m= 1,mm_top l = l + 1 ls0 = i0_st(m)*(ld-l1+1) lm = ii_st(m)*(ld-l1+1) call mpi_send_init( & ai(ls0+1,1),lm,MTYPER, & idproc(m0_top+m,nproc+1), 99053, & mpi_comm_hycom, mpireqa(l), mpierr) enddo do m= 1,mm_top l = l + 1 ls0 = i0_st(m)*(ld-l1+1) lm = ii_st(m)*(ld-l1+1) call mpi_recv_init( & ai(ls0+1,2),lm,MTYPER, & idproc(m0_top+m,nproc+1), 99053, & mpi_comm_hycom, mpireqa(l), mpierr) enddo nreqa = l endif !new mpi_init if (nreqa.gt.0) then call mpi_startall(nreqa, mpireqa, mpierr) call mpi_waitall( nreqa, mpireqa, MPI_STATUSES_IGNORE, & mpierr) endif #elif defined(SHMEM) c c loop through all neighboring tiles, barrier issued above. c do m= 1,mm_top if (idproc(m0_top+m,nproc+1).ne.null_tile) then lg0 = i0_gt(m)*(ld-l1+1) ls0 = i0_st(m)*(ld-l1+1) lm = ii_st(m)*(ld-l1+1) if (nproc.eq.jpr) then call SHMEM_GETR(ai(ls0+1,2), & ai(lg0+1,1),lm, !buffer 1 & idproc(m0_top+m,nproc+1)) endif endif enddo #endif /* MPI:SHMEM */ c l = 0 do i= 1,ii !
outer loop to simplify multiple neighbor case do k= l1,ld l = l + 1 a(i,jj,k) = ai(l,2) enddo enddo endif !ipr.eq.1:else endif !nproc.eq.jpr c #if defined(TIMER) if (nxc.eq.13) then call xctmr1(13) nxc = 0 endif #endif return end subroutine xctila recursive subroutine xctilr(a,l1,ld,mh,nh,itype) implicit none c integer, intent(in) :: l1,ld,mh,nh,itype real, intent(inout) :: a(1-nbdy:idm+nbdy,1-nbdy:jdm+nbdy,ld) c c********** c* c 1) update the tile overlap halo of a real array. c c 2) parameters: c name type usage description c ---------- ---------- ------- ---------------------------- c a real in/out target array c l1 integer input 3rd dim. start index c ld integer input 3rd dimension of a c mh integer input 1st (EW) update halo size c nh integer input 2nd (NS) update halo size c itype integer input grid and field type c c 3) itype selects both the grid and field type c itype= 1; p-grid, scalar field c itype= 2; q-grid, scalar field c itype= 3; u-grid, scalar field c itype= 4; v-grid, scalar field c itype=11; p-grid, vector field c itype=12; q-grid, vector field c itype=13; u-grid, vector field c itype=14; v-grid, vector field c c 4) the global variable vland is returned by halos over land. c c 5) this version for a global tripole (arctic bipolar patch) grid c* c********** c c halo buffer c #if defined(RELO) real, save, allocatable, dimension(:,:) :: ai,aj,aia #else integer, parameter :: ilen= idm *kdm*nbdy+64, & jlen=(jdm+2*nbdy)*kdm*nbdy+64 real, save, dimension (ilen,4) :: ai real, save, dimension (jlen,4) :: aj real, save, dimension (kdm*nbdy+64,2) :: aia #endif c integer i,io,itynew,j,k,l,lg0,ls0,lm,m,mhl,nhl real sarc c #if defined(MPI) include 'mpif.h' integer mpierr c c persistent communication handles. c integer mpireqa(4*iqr),mpireqb(4),nreqa, & klmold,klnold,mhlold,nhlold,ityold save mpireqa,mpireqb,nreqa, & klmold,klnold,mhlold,nhlold,ityold c data nreqa,klmold,klnold,mhlold,nhlold,ityold / 0,0,0,0,0,0 / #endif /* MPI */ #if defined(RELO) c if (.not.allocated(ai)) then allocate( ai( idm *kdm*nbdy+64 ,4) ) call mem_stat_add( ( idm *kdm*nbdy+64)*4 ) allocate( aj((jdm+2*nbdy)*kdm*nbdy+64 ,4) ) call mem_stat_add( ((jdm+2*nbdy)*kdm*nbdy+64)*4 ) allocate( aia( kdm*nbdy+64, 2) ) call mem_stat_add( (kdm*nbdy+64)*2 ) endif #endif c c --- split large requests into smaller pieces c if (ld-l1+1.gt.kdm) then do k= l1,ld,kdm l = min(k+kdm-1,ld) call xctilr(a,k,l,mh,nh,itype) enddo return endif c #if defined(TIMER) c if (nxc.eq.0) then call xctmr0(12) nxc = 12 endif #endif c mhl = max(0,min(mh,nbdy)) nhl = max(0,min(nh,nbdy)) c if (itype.lt.10) then sarc = 1.0 else sarc = -1.0 endif c if (nhl.gt.0) then if (ipr.eq.1 .and. jpr.eq.1) then do k= l1,ld do j= 1,nhl do i= 1,ii a(i, 1-j,k) = vland enddo if (itype.eq.1 .or. itype.eq.11) then do i= 1,ii io = ii-mod(i-1,ii) if (a(io,jj-1-j,k).ne.vland) then a(i,jj+j,k) = sarc*a(io,jj-1-j,k) else a(i,jj+j,k) = vland endif enddo !i elseif (itype.eq.2 .or. itype.eq.12) then do i= 1,ii io = mod(ii-(i-1),ii)+1 if (a(io,jj-j,k).ne.vland) then a(i,jj+j,k) = sarc*a(io,jj-j,k) else a(i,jj+j,k) = vland endif enddo !i elseif (itype.eq.3 .or. itype.eq.13) then do i= 1,ii io = mod(ii-(i-1),ii)+1 if (a(io,jj-1-j,k).ne.vland) then a(i,jj+j,k) = sarc*a(io,jj-1-j,k) else a(i,jj+j,k) = vland endif enddo !i elseif (itype.eq.4 .or. itype.eq.14) then do i= 1,ii io = ii-mod(i-1,ii) if (a(io,jj-j,k).ne.vland) then a(i,jj+j,k) = sarc*a(io,jj-j,k) else a(i,jj+j,k) = vland endif enddo !i endif !itype enddo !j enddo !k else if (nproc.ne.jpr) then l = 0 do i= 1,ii ! 
outer loop to simplify multiple neighbor case do k= l1,ld do j= 1,nhl l = l + 1 ai(l,1) = a(i,jj+1-j,k) ai(l,2) = a(i, j,k) ai(l,3) = vland ai(l,4) = vland enddo !j enddo !k enddo !i elseif (itype.eq.1 .or. itype.eq.11) then !p-grid l = 0 do i= 1,ii ! outer loop to simplify multiple neighbor case io = ii+1-i !ii:1:-1 do k= l1,ld do j= 1,nhl l = l + 1 if (a(io,jj-1-j,k).ne.vland) then ai(l,1) = sarc*a(io,jj-1-j,k) else ai(l,1) = vland endif ai(l,2) = a(i,j,k) ai(l,3) = vland ai(l,4) = vland enddo !j enddo !k enddo !i elseif (itype.eq.3 .or. itype.eq.13) then !u-grid l = 0 do i= 1,ii ! outer loop to simplify multiple neighbor case io = ii+2-i !ii+1:2:-1 do k= l1,ld do j= 1,nhl l = l + 1 if (a(io,jj-1-j,k).ne.vland) then ai(l,1) = sarc*a(io,jj-1-j,k) else ai(l,1) = vland endif ai(l,2) = a(i,j,k) ai(l,3) = vland ai(l,4) = vland enddo !j enddo !k enddo !i l = 0 do k= l1,ld do j= 1,nhl l = l + 1 if (a(1,jj-1-j,k).ne.vland) then aia(l,1) = sarc*a(1,jj-1-j,k) else aia(l,1) = vland endif aia(l,2) = vland enddo !j enddo !k elseif (itype.eq.2 .or. itype.eq.12) then !q-grid l = 0 do i= 1,ii ! outer loop to simplify multiple neighbor case io = ii+2-i !ii+1:2:-1 do k= l1,ld do j= 1,nhl l = l + 1 if (a(io,jj-j,k).ne.vland) then ai(l,1) = sarc*a(io,jj-j,k) else ai(l,1) = vland endif ai(l,2) = a(i,j,k) ai(l,3) = vland ai(l,4) = vland enddo !j enddo !k enddo !i l = 0 do k= l1,ld do j= 1,nhl l = l + 1 if (a(1,jj-j,k).ne.vland) then aia(l,1) = sarc*a(1,jj-j,k) else aia(l,1) = vland endif aia(l,2) = vland enddo !j enddo !k else !v-grid l = 0 do i= 1,ii ! outer loop to simplify multiple neighbor case io = ii+1-i !ii:1:-1 do k= l1,ld do j= 1,nhl l = l + 1 if (a(io,jj-j,k).ne.vland) then ai(l,1) = sarc*a(io,jj-j,k) else ai(l,1) = vland endif ai(l,2) = a(i,j,k) ai(l,3) = vland ai(l,4) = vland enddo !j enddo !k enddo !i endif !itype * write(lp,'(a,6i6)') 'xctilr - nhl,l1,ld,ii,l,mnproc = ', * & nhl,l1,ld,ii,l,mnproc * call xcsync(flush_lp) c #if defined(MPI) if (itype.eq.2 .or. itype.eq.12 .or. !q-grid & itype.eq.3 .or. itype.eq.13 ) then !u-grid itynew = 2 else itynew = 1 endif if (klnold.ne.(ld-l1+1) .or. & nhlold.ne.nhl .or. & ityold.ne.itynew ) then !new mpi init needed #if defined(DEBUG_ALL) * if (mnproc.eq.1) then * write(lp,'(a,4i6)') 'xctilr - nhl,l1,ld,itype = ', * & nhl,l1,ld,itype * endif * call xcsync(flush_lp) #endif do i= 1,nreqa call mpi_request_free(mpireqa(i), mpierr) enddo c c loop through all neigboring tiles. c l = 0 do m= 1,mm_top l = l + 1 ls0 = i0_st(m)*nhl*(ld-l1+1) lm = ii_st(m)*nhl*(ld-l1+1) if (nproc.ne.jpr) then call mpi_send_init( & ai(ls0+1,1),lm,MTYPER, & idproc(m0_top+m,nproc+1), 9905, & mpi_comm_hycom, mpireqa(l), mpierr) else !arctic call mpi_send_init( & ai(ls0+1,1),lm,MTYPER, & idproc(m0_top+m,nproc+1), 99051, & mpi_comm_hycom, mpireqa(l), mpierr) endif enddo if (nproc.eq.jpr) then !arctic if (itype.eq.2 .or. itype.eq.12 .or. !q-grid & itype.eq.3 .or. 
itype.eq.13 ) then !u-grid l = l + 1 lm = nhl*(ld-l1+1) call mpi_send_init( & aia(1,1),lm,MTYPER, & idproc(mod(ipr+1-mproc,ipr)+1,nproc), 99052, & mpi_comm_hycom, mpireqa(l), mpierr) endif !q-grid,u-grid endif do m= 1,mm_bot l = l + 1 ls0 = i0_sb(m)*nhl*(ld-l1+1) lm = ii_sb(m)*nhl*(ld-l1+1) call mpi_send_init( & ai(ls0+1,2),lm,MTYPER,idproc(m0_bot+m,nproc-1), 9906, & mpi_comm_hycom, mpireqa(l), mpierr) enddo do m= 1,mm_top l = l + 1 ls0 = i0_st(m)*nhl*(ld-l1+1) lm = ii_st(m)*nhl*(ld-l1+1) if (nproc.ne.jpr) then call mpi_recv_init( & ai(ls0+1,4),lm,MTYPER, & idproc(m0_top+m,nproc+1), 9906, & mpi_comm_hycom, mpireqa(l), mpierr) else !arctic call mpi_recv_init( & ai(ls0+1,4),lm,MTYPER, & idproc(m0_top+m,nproc+1), 99051, & mpi_comm_hycom, mpireqa(l), mpierr) endif enddo if (nproc.eq.jpr) then !arctic if (itype.eq.2 .or. itype.eq.12 .or. !q-grid & itype.eq.3 .or. itype.eq.13 ) then !u-grid l = l + 1 lm = nhl*(ld-l1+1) call mpi_recv_init( & aia(1,2),lm,MTYPER, & idproc(mod(ipr+1-mproc,ipr)+1,nproc), 99052, & mpi_comm_hycom, mpireqa(l), mpierr) endif !q-grid,u-grid endif do m= 1,mm_bot l = l + 1 ls0 = i0_sb(m)*nhl*(ld-l1+1) lm = ii_sb(m)*nhl*(ld-l1+1) call mpi_recv_init( & ai(ls0+1,3),lm,MTYPER,idproc(m0_bot+m,nproc-1), 9905, & mpi_comm_hycom, mpireqa(l), mpierr) enddo nreqa = l endif if (nreqa.gt.0) then call mpi_startall(nreqa, mpireqa, mpierr) call mpi_waitall( nreqa, mpireqa, MPI_STATUSES_IGNORE, & mpierr) endif #elif defined(SHMEM) BARRIER c c loop through all neighboring tiles. c do m= 1,mm_top if (idproc(m0_top+m,nproc+1).ne.null_tile) then lg0 = i0_gt(m)*nhl*(ld-l1+1) ls0 = i0_st(m)*nhl*(ld-l1+1) lm = ii_st(m)*nhl*(ld-l1+1) if (nproc.ne.jpr) then call SHMEM_GETR(ai(ls0+1,4), & ai(lg0+1,2),lm, & idproc(m0_top+m,nproc+1)) else !arctic call SHMEM_GETR(ai(ls0+1,4), & ai(lg0+1,1),lm, !buffer 1 & idproc(m0_top+m,nproc+1)) endif endif enddo if (nproc.eq.jpr) then !arctic if (itype.eq.2 .or. itype.eq.12 .or. !q-grid & itype.eq.3 .or. itype.eq.13 ) then !u-grid lm = nhl*(ld-l1+1) call SHMEM_GETR(aia(1,2), & aia(1,1),lm, & idproc(mod(ipr+1-mproc,ipr)+1,nproc)) endif !q-grid,u-grid endif c do m= 1,mm_bot if (idproc(m0_bot+m,nproc-1).ne.null_tile) then lg0 = i0_gb(m)*nhl*(ld-l1+1) ls0 = i0_sb(m)*nhl*(ld-l1+1) lm = ii_sb(m)*nhl*(ld-l1+1) call SHMEM_GETR(ai(ls0+1,3), & ai(lg0+1,1),lm, idproc(m0_bot+m,nproc-1)) endif enddo #endif /* MPI:SHMEM */ c if (nproc.eq.jpr) then !arctic if (itype.eq.2 .or. itype.eq.12 .or. !q-grid & itype.eq.3 .or. itype.eq.13 ) then !u-grid l = 0 do k= l1,ld do j= 1,nhl l = l + 1 ai(l,4) = aia(l,2) enddo !j enddo !k endif !q-grid,u-grid endif !arctic l = 0 do i= 1,ii ! outer loop to simplify multiple neighbor case do k= l1,ld do j= 1,nhl l = l + 1 a(i, 1-j,k) = ai(l,3) a(i,jj+j,k) = ai(l,4) enddo enddo enddo endif ! jpr.eq.1:else endif !
nhl.gt.0 c if (mhl.gt.0) then if (ipr.eq.1) then if (nreg.eq.0) then do k= l1,ld do j= 1-nhl,jj+nhl do i= 1,mhl a( 1-i,j,k) = vland a(ii+i,j,k) = vland enddo enddo enddo else do k= l1,ld do j= 1-nhl,jj+nhl do i= 1,mhl a( 1-i,j,k) = a(ii+1-i,j,k) a(ii+i,j,k) = a( i,j,k) enddo enddo enddo endif else l = 0 do k= l1,ld do j= 1-nhl,jj+nhl do i= 1,mhl l = l + 1 aj(l,1) = a(ii+1-i,j,k) aj(l,2) = a( i,j,k) aj(l,3) = vland aj(l,4) = vland enddo enddo enddo * write(lp,'(a,6i6)') 'xctilr - mhl,l1,ld,jj,l,mnproc = ', * & mhl,l1,ld,jj,l,mnproc * call xcsync(flush_lp) #if defined(MPISR) call mpi_sendrecv( & aj(1,1),l,MTYPER,idhalo(2), 9907, & aj(1,4),l,MTYPER,idhalo(2), 9908, & mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr) call mpi_sendrecv( & aj(1,2),l,MTYPER,idhalo(1), 9908, & aj(1,3),l,MTYPER,idhalo(1), 9907, & mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr) #elif defined(MPI) if (klmold.ne.(ld-l1+1) .or. & mhlold.ne.mhl .or. & nhlold.ne.nhl ) then !new mpi init #if defined(DEBUG_ALL) * if (mnproc.eq.1) then * write(lp,'(a,3i6)') 'xctilr - mhl,l1,ld = ', * & mhl,l1,ld * endif * call xcsync(flush_lp) #endif if (klmold.ne.0) then call mpi_request_free(mpireqb(1), mpierr) call mpi_request_free(mpireqb(2), mpierr) call mpi_request_free(mpireqb(3), mpierr) call mpi_request_free(mpireqb(4), mpierr) endif call mpi_send_init( & aj(1,1),l,MTYPER,idhalo(2), 9907, & mpi_comm_hycom, mpireqb(1), mpierr) call mpi_send_init( & aj(1,2),l,MTYPER,idhalo(1), 9908, & mpi_comm_hycom, mpireqb(2), mpierr) call mpi_recv_init( & aj(1,3),l,MTYPER,idhalo(1), 9907, & mpi_comm_hycom, mpireqb(3), mpierr) call mpi_recv_init( & aj(1,4),l,MTYPER,idhalo(2), 9908, & mpi_comm_hycom, mpireqb(4), mpierr) endif call mpi_startall(4, mpireqb, mpierr) call mpi_waitall( 4, mpireqb, MPI_STATUSES_IGNORE, mpierr) #elif defined(SHMEM) BARRIER_MP if (idhalo(1).ne.null_tile) then call SHMEM_GETR(aj(1,3), & aj(1,1),l,idhalo(1)) endif if (idhalo(2).ne.null_tile) then call SHMEM_GETR(aj(1,4), & aj(1,2),l,idhalo(2)) endif BARRIER_MP #endif l = 0 do k= l1,ld do j= 1-nhl,jj+nhl do i= 1,mhl l = l + 1 a( 1-i,j,k) = aj(l,3) a(ii+i,j,k) = aj(l,4) enddo enddo enddo endif ! ipr.eq.1:else endif ! mhl.gt.0 c klnold = ld-l1+1 klmold = ld-l1+1 nhlold = nhl mhlold = mhl ityold = itynew #if defined(TIMER) c if (nxc.eq.12) then call xctmr1(12) nxc = 0 endif #endif return end subroutine xctilr #else /* !ARCTIC */ recursive subroutine xctilr(a,l1,ld,mh,nh,itype) implicit none c integer, intent(in) :: l1,ld,mh,nh,itype real, intent(inout) :: a(1-nbdy:idm+nbdy,1-nbdy:jdm+nbdy,ld) c c********** c* c 1) update the tile overlap halo of a real array. c c 2) parameters: c name type usage description c ---------- ---------- ------- ---------------------------- c a real in/out target array c l1 integer input 3rd dim. start index c ld integer input 3rd dimension of a c mh integer input 1st (EW) update halo size c nh integer input 2nd (NS) update halo size c itype integer input grid and field type c c 3) itype selects both the grid and field type c itype= 1; p-grid, scalar field c itype= 2; q-grid, scalar field c itype= 3; u-grid, scalar field c itype= 4; v-grid, scalar field c itype=11; p-grid, vector field c itype=12; q-grid, vector field c itype=13; u-grid, vector field c itype=14; v-grid, vector field c it is ignored here because all types are the same unless c the grid includes the arctic ocean c c 4) the global variable vland is returned by halos over land. c* c********** c c halo buffer. 
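c --- buffer layout, as used below: ai(:,1) packs the rows sent to
c --- the top neighbors and ai(:,2) the rows sent to the bottom
c --- neighbors, while ai(:,3) receives the bottom halo and ai(:,4)
c --- the top halo; aj(:,1:4) is the east-west analogue for the
c --- left/right (idhalo) exchange.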
c #if defined(RELO) real, save, allocatable, dimension (:,:) :: ai,aj #else integer, parameter :: ilen= idm *kdm*nbdy+64, & jlen=(jdm+2*nbdy)*kdm*nbdy+64 real, save, dimension(ilen,4) :: ai real, save, dimension(jlen,4) :: aj #endif c integer i,j,k,l,lg0,ls0,lm,m,mhl,nhl c #if defined(MPI) include 'mpif.h' integer mpierr c c persistent communication handles. c integer mpireqa(4*iqr),mpireqb(4), & ilold,jlold,nreqa save mpireqa,mpireqb, & ilold,jlold,nreqa c data ilold,jlold / 0,0 / #endif /* MPI */ #if defined(RELO) c if (.not.allocated(ai)) then allocate( ai( idm *kdm*nbdy+64 ,4) ) call mem_stat_add( ( idm *kdm*nbdy+64)*4 ) allocate( aj((jdm+2*nbdy)*kdm*nbdy+64 ,4) ) call mem_stat_add( ((jdm+2*nbdy)*kdm*nbdy+64)*4 ) endif #endif c c --- split large requests into smaller pieces c if (ld-l1+1.gt.kdm) then do k= l1,ld,kdm l = min(k+kdm-1,ld) call xctilr(a,k,l,mh,nh,itype) enddo return endif c #if defined(TIMER) c if (nxc.eq.0) then call xctmr0(12) nxc = 12 endif #endif c mhl = max(0,min(mh,nbdy)) nhl = max(0,min(nh,nbdy)) c if (nhl.gt.0) then if (jpr.eq.1) then do k= l1,ld do j= 1,nhl do i= 1,ii a(i, 1-j,k) = vland a(i,jj+j,k) = vland enddo enddo enddo else l = 0 do i= 1,ii ! outer loop to simplify multiple neighbor case do k= l1,ld do j= 1,nhl l = l + 1 ai(l,1) = a(i,jj+1-j,k) ai(l,2) = a(i, j,k) ai(l,3) = vland ai(l,4) = vland enddo enddo enddo * write(lp,'(a,6i6)') 'xctilr - nhl,l1,ld,ii,l,mnproc = ', * & nhl,l1,ld,ii,l,mnproc * call xcsync(flush_lp) c #if defined(MPI) if (jlold.ne.l) then if (jlold.ne.0) then do i= 1,nreqa call mpi_request_free(mpireqa(i), mpierr) enddo endif jlold = l c c loop through all neighboring tiles. c l = 0 do m= 1,mm_top l = l + 1 ls0 = i0_st(m)*nhl*(ld-l1+1) lm = ii_st(m)*nhl*(ld-l1+1) call mpi_send_init( & ai(ls0+1,1),lm,MTYPER,idproc(m0_top+m,nproc+1), 9905, & mpi_comm_hycom, mpireqa(l), mpierr) enddo do m= 1,mm_bot l = l + 1 ls0 = i0_sb(m)*nhl*(ld-l1+1) lm = ii_sb(m)*nhl*(ld-l1+1) call mpi_send_init( & ai(ls0+1,2),lm,MTYPER,idproc(m0_bot+m,nproc-1), 9906, & mpi_comm_hycom, mpireqa(l), mpierr) enddo do m= 1,mm_top l = l + 1 ls0 = i0_st(m)*nhl*(ld-l1+1) lm = ii_st(m)*nhl*(ld-l1+1) call mpi_recv_init( & ai(ls0+1,4),lm,MTYPER,idproc(m0_top+m,nproc+1), 9906, & mpi_comm_hycom, mpireqa(l), mpierr) enddo do m= 1,mm_bot l = l + 1 ls0 = i0_sb(m)*nhl*(ld-l1+1) lm = ii_sb(m)*nhl*(ld-l1+1) call mpi_recv_init( & ai(ls0+1,3),lm,MTYPER,idproc(m0_bot+m,nproc-1), 9905, & mpi_comm_hycom, mpireqa(l), mpierr) enddo nreqa = l endif if (nreqa.gt.0) then call mpi_startall(nreqa, mpireqa, mpierr) call mpi_waitall( nreqa, mpireqa, MPI_STATUSES_IGNORE, & mpierr) endif #elif defined(SHMEM) BARRIER c c loop through all neighboring tiles. c do m= 1,mm_top if (idproc(m0_top+m,nproc+1).ne.null_tile) then lg0 = i0_gt(m)*nhl*(ld-l1+1) ls0 = i0_st(m)*nhl*(ld-l1+1) lm = ii_st(m)*nhl*(ld-l1+1) call SHMEM_GETR(ai(ls0+1,4), & ai(lg0+1,2),lm, idproc(m0_top+m,nproc+1)) endif enddo c do m= 1,mm_bot if (idproc(m0_bot+m,nproc-1).ne.null_tile) then lg0 = i0_gb(m)*nhl*(ld-l1+1) ls0 = i0_sb(m)*nhl*(ld-l1+1) lm = ii_sb(m)*nhl*(ld-l1+1) call SHMEM_GETR(ai(ls0+1,3), & ai(lg0+1,1),lm, idproc(m0_bot+m,nproc-1)) endif enddo #endif /* MPI:SHMEM */ c l = 0 do i= 1,ii ! outer loop to simplify multiple neighbor case do k= l1,ld do j= 1,nhl l = l + 1 a(i, 1-j,k) = ai(l,3) a(i,jj+j,k) = ai(l,4) enddo enddo enddo endif ! jpr.eq.1:else endif !
nhl.gt.0 c if (mhl.gt.0) then if (ipr.eq.1) then if (nreg.eq.0) then do k= l1,ld do j= 1-nhl,jj+nhl do i= 1,mhl a( 1-i,j,k) = vland a(ii+i,j,k) = vland enddo enddo enddo else do k= l1,ld do j= 1-nhl,jj+nhl do i= 1,mhl a( 1-i,j,k) = a(ii+1-i,j,k) a(ii+i,j,k) = a( i,j,k) enddo enddo enddo endif else l = 0 do k= l1,ld do j= 1-nhl,jj+nhl do i= 1,mhl l = l + 1 aj(l,1) = a(ii+1-i,j,k) aj(l,2) = a( i,j,k) aj(l,3) = vland aj(l,4) = vland enddo enddo enddo * write(lp,'(a,6i6)') 'xctilr - mhl,l1,ld,jj,l,mnproc = ', * & mhl,l1,ld,jj,l,mnproc * call xcsync(flush_lp) #if defined(MPISR) call mpi_sendrecv( & aj(1,1),l,MTYPER,idhalo(2), 9907, & aj(1,4),l,MTYPER,idhalo(2), 9908, & mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr) call mpi_sendrecv( & aj(1,2),l,MTYPER,idhalo(1), 9908, & aj(1,3),l,MTYPER,idhalo(1), 9907, & mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr) #elif defined(MPI) if (ilold.ne.l) then if (ilold.ne.0) then call mpi_request_free(mpireqb(1), mpierr) call mpi_request_free(mpireqb(2), mpierr) call mpi_request_free(mpireqb(3), mpierr) call mpi_request_free(mpireqb(4), mpierr) endif ilold = l call mpi_send_init( & aj(1,1),l,MTYPER,idhalo(2), 9907, & mpi_comm_hycom, mpireqb(1), mpierr) call mpi_send_init( & aj(1,2),l,MTYPER,idhalo(1), 9908, & mpi_comm_hycom, mpireqb(2), mpierr) call mpi_recv_init( & aj(1,3),l,MTYPER,idhalo(1), 9907, & mpi_comm_hycom, mpireqb(3), mpierr) call mpi_recv_init( & aj(1,4),l,MTYPER,idhalo(2), 9908, & mpi_comm_hycom, mpireqb(4), mpierr) endif call mpi_startall(4, mpireqb, mpierr) call mpi_waitall( 4, mpireqb, MPI_STATUSES_IGNORE, mpierr) #elif defined(SHMEM) BARRIER_MP if (idhalo(1).ne.null_tile) then call SHMEM_GETR(aj(1,3), & aj(1,1),l,idhalo(1)) endif if (idhalo(2).ne.null_tile) then call SHMEM_GETR(aj(1,4), & aj(1,2),l,idhalo(2)) endif BARRIER_MP #endif l = 0 do k= l1,ld do j= 1-nhl,jj+nhl do i= 1,mhl l = l + 1 a( 1-i,j,k) = aj(l,3) a(ii+i,j,k) = aj(l,4) enddo enddo enddo endif ! ipr.eq.1:else endif ! mhl.gt.0 #if defined(TIMER) c if (nxc.eq.12) then call xctmr1(12) nxc = 0 endif #endif return end subroutine xctilr #endif /* ARCTIC:else */ subroutine xctmri implicit none c c********** c* c 1) initialize timers. c c 2) timers 1:32 are for message passing routines, c timers 33:80 are for general hycom routines, c timers 81:96 are for user selected routines. c timer 97 is the total time. c c 3) call xctmri to initialize timers (called in xcspmd), c call xctmr0(n) to start timer n, c call xctmr1(n) to stop timer n and add event to timer sum, c call xctmrn(n,cname) to register a name for timer n, c call xctmrp to print out timer statistics (called by xcstop). c c 4) time every 50-th event above 5,000. c* c********** c integer i c real*8 zero8 parameter (zero8=0.0) c nxc = 0 do i= 1,97 cc(i) = ' ' nc(i) = 0 tc(i) = zero8 enddo c call xctmrn(97,'total ') call xctmr0(97) return end subroutine xctmri subroutine xctmr0(n) implicit none c integer, intent(in) :: n c c********** c* c 1) start timer n. c c 2) parameters: c name type usage description c ---------- ---------- ------- ---------------------------- c n integer input timer number c c 3) time every 50-th event above 5,000. c* c********** c real*8 wtime c #if defined(DEBUG_TIMER_ALL) if ( cc(n).ne.' ') then write(lp,'(i5,2x,a,a)') mnproc,'call ',cc(n) call flush(lp) endif #endif #if defined(DEBUG_TIMER) if (n.gt.32 .and. cc(n).ne.' ') then if (mnproc.eq.1) then write(lp,*) 'call ',cc(n) call flush(lp) endif endif #endif if (timer_on) then if (mod(nc(n),50).eq.0 .or.
nc(n).le.5000) then t0(n) = wtime() endif endif !timer_on return end subroutine xctmr0 subroutine xctmr1(n) implicit none c integer, intent(in) :: n c c********** c* c 1) add time since call to xctmr0 to timer n. c c 2) parameters: c name type usage description c ---------- ---------- ------- ---------------------------- c n integer input timer number c c 3) time every 50-th event above 5,000. c* c********** c real*8 wtime c if (timer_on) then if (nc(n).gt.5000) then if (mod(nc(n),50).eq.0) then tc(n) = tc(n) + 50.0*(wtime() - t0(n)) endif else tc(n) = tc(n) + (wtime() - t0(n)) endif nc(n) = nc(n) + 1 endif !timer_on #if defined(DEBUG_TIMER_ALL) if ( cc(n).ne.' ') then write(lp,'(i5,2x,a,a)') mnproc,'exit ',cc(n) call flush(lp) endif #endif #if defined(DEBUG_TIMER) if (n.gt.32 .and. cc(n).ne.' ') then if (mnproc.eq.1) then write(lp,*) 'exit ',cc(n) call flush(lp) endif endif #endif return end subroutine xctmr1 subroutine xctmrn(n,cname) implicit none c character*6, intent(in) :: cname integer, intent(in) :: n c c********** c* c 1) register name of timer n. c c 2) parameters: c name type usage description c ---------- ---------- ------- ---------------------------- c n integer input timer number c cname char*(6) input timer name c* c********** c cc(n) = cname return end subroutine xctmrn #if defined(TIMER_ALLOUT) subroutine xctmrp implicit none c c********** c* c 1) print all active timers, on all processors. c c 2) on exit all timers are reset to zero. c* c********** c #if defined(MPI) include 'mpif.h' integer mpierr #endif c real*8 tci integer i,mn character*16 cmnproc c real*8 zero8 parameter (zero8=0.0) c c get total time. c call xctmr1(97) c call xcsync(flush_lp) !includes a barrier for shmem if (mnproc.ne.1) then #if defined(MPI) call MPI_SEND(tc,97,MTYPED, & idproc1(1), 9949, & mpi_comm_hycom, mpierr) #endif else !mnproc.eq.1 do mn= 1,ijpr if (mn.ne.1) then #if defined(MPI) call MPI_RECV(tc,97,MTYPED, & idproc1(mn), 9949, & mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr) #elif defined(SHMEM) call SHMEM_GETD(tc,tc,97,idproc1(mn)) #endif endif write(lp,6000) mn,ijpr tcxc(1) = zero8 do i= 1,32 if (nc(i).ne.0) then if (nc(i).le.5000) then tci = tc(i) else !correct for timing every 50th event tci = (tc(i)*nc(i))/(nc(i)-mod(nc(i),50)) endif if (cc(i).ne.' ') then write(lp,6100) cc(i),nc(i),tci,tci/nc(i) else write(lp,6150) i, nc(i),tci,tci/nc(i) endif if (cc(i)(1:2).eq.'xc') then tcxc(1) = tcxc(1) + tci !communication overhead endif endif !nc(i)>0 enddo !i write(lp,6100) 'xc****',1,tcxc(1),tcxc(1) do i= 33,97 if (nc(i).ne.0) then if (nc(i).le.5000) then tci = tc(i) else !correct for timing every 50th event tci = (tc(i)*nc(i))/(nc(i)-mod(nc(i),50)) endif if (cc(i).ne.' ') then write(lp,6100) cc(i),nc(i),tci,tci/nc(i) else write(lp,6150) i, nc(i),tci,tci/nc(i) endif endif enddo !i enddo !mn call mem_stat_print('processor 1:') endif !mnproc.ne.1:else call xcsync(flush_lp) !includes a barrier for shmem if (nproc.eq.2 .and. mproc.eq.mpe_1(nproc)) then write(cmnproc,'(a,i6,a)') 'processor',mnproc,':' call mem_stat_print(cmnproc) endif call xcsync(flush_lp) if (mnproc.eq.ijpr) then write(cmnproc,'(a,i6,a)') 'processor',mnproc,':' call mem_stat_print(cmnproc) write(lp,*) endif call xcsync(flush_lp) c c reset timers to zero. c do i= 1,97 nc(i) = 0 tc(i) = zero8 enddo tcxc(1) = zero8 c c start a new total time measurement.
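c --- (timer 97 is the overall total, registered in xctmri.)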
c call xctmr0(97) return c 6000 format(/ / & 3x,' timer statistics, processor',i5,' out of',i5 / & 3x,'-----------------------------------------------' /) 6100 format(3x,a6, & ' calls =',i9, & ' time =',f11.5, & ' time/call =',f14.8) 6150 format(3x,' #',i2, & ' calls =',i9, & ' time =',f11.5, & ' time/call =',f14.8) end subroutine xctmrp #else subroutine xctmrp implicit none c c********** c* c 1) print all active timers. c c 2) on exit all timers are reset to zero. c* c********** c #if defined(MPI) include 'mpif.h' integer mpierr #endif c real*8 tci integer i,mn,mnloc character*16 cmnproc c real*8 zero8 parameter (zero8=0.0) c c get total time. c call xctmr1(97) c c report time on the processor with the least communication overhead c tcxc(2) = mnproc tcxc(1) = zero8 do i= 1,32 if (nc(i).ne.0 .and. cc(i)(1:2).eq.'xc') then tcxc(1) = tcxc(1) + tc(i) !communication overhead endif enddo !i c if (ijpr.ne.1) then #if defined(MPI) tcxl(1) = tcxc(1) tcxl(2) = tcxc(2) call mpi_allreduce(tcxl,tcxc,1, & mpi_2double_precision,mpi_minloc, & mpi_comm_hycom,mpierr) mnloc = tcxc(2) !processor with the least comm. overhead if (mnproc.eq.1) then if (mnloc.ne.1) then call MPI_RECV(tc,97,MTYPED, & idproc1(mnloc), 9949, & mpi_comm_hycom, MPI_STATUS_IGNORE, mpierr) endif elseif (mnproc.eq.mnloc) then call MPI_SEND(tc,97,MTYPED, & idproc1(1), 9949, & mpi_comm_hycom, mpierr) endif #elif defined(SHMEM) BARRIER if (mnproc.eq.1) then mnloc = 1 do mn= 2,ijpr call SHMEM_GETD(tcxc(2),tcxc(1),1,idproc1(mn)) if (tcxc(2).gt.tcxc(1)) then tcxc(1) = tcxc(2) mnloc = mn endif enddo tcxc(2) = mnloc !processor with the least comm. overhead endif if (mnloc.ne.1) then call SHMEM_GETD(tc,tc,97,idproc1(mnloc)) endif BARRIER #endif endif c call xcsync(flush_lp) if (mnproc.eq.1) then write(lp,6000) mnloc,ijpr do i= 1,32 if (nc(i).ne.0) then if (nc(i).le.5000) then tci = tc(i) else !correct for timing every 50th event tci = (tc(i)*nc(i))/(nc(i)-mod(nc(i),50)) endif if (cc(i).ne.' ') then write(lp,6100) cc(i),nc(i),tci,tci/nc(i) else write(lp,6150) i, nc(i),tci,tci/nc(i) endif if (cc(i)(1:2).eq.'xc') then tcxc(1) = tcxc(1) + tci !communication overhead endif endif !nc(i)>0 enddo !i write(lp,6100) 'xc****',1,tcxc(1),tcxc(1) do i= 33,97 if (nc(i).ne.0) then if (nc(i).le.5000) then tci = tc(i) else !correct for timing every 50th event tci = (tc(i)*nc(i))/(nc(i)-mod(nc(i),50)) endif if (cc(i).ne.' ') then write(lp,6100) cc(i),nc(i),tci,tci/nc(i) else write(lp,6150) i, nc(i),tci,tci/nc(i) endif endif enddo !i call mem_stat_print('processor 1:') endif !mnproc.eq.1 call xcsync(flush_lp) if (nproc.eq.2 .and. mproc.eq.mpe_1(nproc)) then write(cmnproc,'(a,i6,a)') 'processor',mnproc,':' call mem_stat_print(cmnproc) endif call xcsync(flush_lp) if (mnproc.eq.ijpr) then write(cmnproc,'(a,i6,a)') 'processor',mnproc,':' call mem_stat_print(cmnproc) write(lp,*) endif call xcsync(flush_lp) c c reset timers to zero. c do i= 1,97 nc(i) = 0 tc(i) = zero8 enddo tcxc(1) = zero8 c c start a new total time measurement. c call xctmr0(97) return c 6000 format(/ / & 3x,' timer statistics, processor',i5,' out of',i5 / & 3x,'-----------------------------------------------' /) 6100 format(3x,a6, & ' calls =',i9, & ' time =',f11.5, & ' time/call =',f14.8) 6150 format(3x,' #',i2, & ' calls =',i9, & ' time =',f11.5, & ' time/call =',f14.8) end subroutine xctmrp #endif /* TIMER_ALLOUT:else */ c c Revision history: c c> Nov. 2009 - iisum bugfix for periodic domains c> Nov. 2011 - time every 50-th event above 5,000 (was 1,000). c> Mar. 
2012 - added optional mnflg to xclput c> Mar. 2012 - bugfix to periodic case in xclput c> Apr. 2012 - added optional mnflg to xceget and xceput c> Apr. 2012 - added xciget and xciput c> Nov. 2012 - added the OCEANS2 macro and xcpipe c> Dec. 2012 - iisum bugfix for xcsumj c> Jan. 2014 - replaced mpistat with MPI_STATUS[ES]_IGNORE c> Jan. 2014 - ARCTIC xctilr bugfix for when called after xcsum c> Feb. 2015 - reduced buffering memory requirements subroutine gdsum(x,work) real x,work #if defined(MPI) include 'mpif.h' integer mpierr call MPI_ALLREDUCE(x,work,1,MPI_DOUBLE_PRECISION,MPI_SUM, & mpi_comm_hycom, mpierr) #endif x = work return end subroutine gdsum
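c
c --- gdsum usage sketch (illustrative only; "psum" and "pwork" are
c --- hypothetical caller variables).  note that gdsum reduces with
c --- MPI_DOUBLE_PRECISION, so it assumes the default real has been
c --- promoted to real*8 (REAL8 builds):
c
c ---       real psum,pwork       !psum holds this tile's partial sum
c ---       call gdsum(psum,pwork)
c ---       !on exit, psum holds the sum across mpi_comm_hycom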