!-------------------------------------------------------------------------!
!                                                                         !
!           N  A  S     P A R A L L E L     B E N C H M A R K S  2.1      !
!                                                                         !
!                                   F T                                   !
!                                                                         !
!-------------------------------------------------------------------------!
!                                                                         !
!    This benchmark is part of the NAS Parallel Benchmark 2.1 suite.      !
!    It is described in NAS Technical Report 95-020.                      !
!                                                                         !
!    Permission to use, copy, distribute and modify this software         !
!    for any purpose with or without fee is hereby granted.  We           !
!    request, however, that all derived work reference the NAS            !
!    Parallel Benchmarks 2.1. This software is provided "as is"           !
!    without express or implied warranty.                                 !
!                                                                         !
!    Information on NPB 2.1, including the technical report, the          ! 
!    original specifications, source code, results and information        !
!    on how to submit new results, is available at:                       !
!                                                                         !
!           http://www.nas.nasa.gov/NAS/NPB/                              !
!                                                                         !
!    Send comments or suggestions to  npb@nas.nasa.gov                    !
!    Send bug reports to              npb-bugs@nas.nasa.gov               !
!                                                                         !
!          NAS Parallel Benchmarks Group                                  !
!          NASA Ames Research Center                                      !
!          Mail Stop: T27A-1                                              !
!          Moffett Field, CA   94035-1000                                 !
!                                                                         !
!          E-mail:  npb@nas.nasa.gov                                      !
!          Fax:     (415) 604-3957                                        !
!                                                                         !
!-------------------------------------------------------------------------!

c---------------------------------------------------------------------
c
c Author: D. Bailey
c
c---------------------------------------------------------------------

c---------------------------------------------------------------------

      program fftpde

c---------------------------------------------------------------------

c---------------------------------------------------------------------
c
c   This is the MPI version of the 3-D FFT PDE benchmark from the NPB.
c
c   In the following parameter statement, M1, M2 and M3 are the Log_2 of the
c   three dimensions of the 3-D input array.  Set MX = MAX (M1, M2, M3).
c   A is the multiplier of the random number generator (here set to 5^13),
c   and S is the initial seed.  AL is the value of alpha.  NT is the number
c   of iterations.  NPM is specified for a given system and problem size to
c   prevent exceeding the available memory on a single node.  NY and NY1 may 
c   be adjusted for efficiency on a given computer system.  See also NC and 
c   NC1 in routine TRANS below.
c
c---------------------------------------------------------------------

      implicit none

      include 'mpinpb.h'
      
      


c---------------------------------------------------------------------
c   Size of problem given by m1, m2, m3, which are the log of the 
c   number of grid points in each direction. 
c   npm is the minimum number of processors to run this size
c   nt is the number of iterations. All these paramters are defined 
c   in "npbparams.h"
c---------------------------------------------------------------------
      
      include 'npbparams.h'

      double complex st, st2, u, x0, x1, x2, y
      integer mm,n1,n2,n3,nn,nx,kn,np
      double precision a,al,pi,s,cl2
      double precision retmp, imtmp, epsilon
      parameter (mm = m1 + m2 + m3, 
     >  n1 = 2 ** m1, n2 = 2 ** m2, n3 = 2 ** m3, nn = 2 ** mm, 
     >  nx = 2 ** mx, a = 1220703125.d0, al = 1d-6,   
     >  pi = 3.141592653589793238d0, s = 314159265.d0,                  
     >  cl2 = 1.4426950408889633d0)

      integer ny, ny1
      parameter (ny = 16, ny1 = 18)

      double precision x3,tt0,tt1,t1,rn,ap,t2,t3,an,randlc,
     >  tm0,tm1,mflops
      dimension u(nx), x0(n3,n1,n2/npm), x1(n1,n2,n3/npm), 
     >  x2(n3,n1,n2/npm), x3(n3,n1,n2/npm), y(2*ny1*nx)

      integer ierr,nq,mq,np2,np3,n12,n22,n32,i,kk,kl,ik,k,k1,j,j1,jk,
     >  i1,kt,ii,jj,kp

      double complex st2store(20)
      character class
      logical verified

c---------------------------------------------------------------------
c   Sample size reference checksums
c---------------------------------------------------------------------
      double precision reclasss(6)
      data reclasss / 5.546087004964D+02,
     >                5.546385409189D+02,
     >                5.546148406171D+02,
     >                5.545423607415D+02,
     >                5.544255039624D+02,
     >                5.542683411902D+02 /

      double precision imclasss(6)
      data imclasss / 4.845363331978D+02,
     >                4.865304269511D+02,
     >                4.883910722336D+02,
     >                4.901273169046D+02,
     >                4.917475857993D+02,
     >                4.932597244941D+02 /

c---------------------------------------------------------------------
c   Class A size reference checksums
c---------------------------------------------------------------------
      double precision reclassa(6)
      data reclassa / 5.046735008193D+02,
     >                5.059412319734D+02,
     >                5.069376896287D+02,
     >                5.077892868474D+02,
     >                5.085233095391D+02,
     >                5.091487099959D+02 /
      
      double precision imclassa(6)
      data imclassa / 5.114047905510D+02,
     >                5.098809666433D+02,
     >                5.098144042213D+02,
     >                5.101336130759D+02,
     >                5.104914655194D+02,
     >                5.107917842803D+02 /
      
c---------------------------------------------------------------------
c   Class B size reference checksums
c---------------------------------------------------------------------
      double precision reclassb(20)
      data reclassb / 5.177643571579D+02,
     >                5.154521291263D+02,
     >                5.146409228649D+02,
     >                5.142378756213D+02,
     >                5.139626667737D+02,
     >                5.137423460082D+02,
     >                5.135547056878D+02,
     >                5.133910925466D+02,
     >                5.132470705390D+02,
     >                5.131197729984D+02,
     >                5.130070319283D+02,
     >                5.129070537032D+02,
     >                5.128182883502D+02,
     >                5.127393733383D+02,
     >                5.126691062020D+02,
     >                5.126064276004D+02,
     >                5.125504076570D+02,
     >                5.125002331720D+02,
     >                5.124551951846D+02,
     >                5.124146770029D+02 /
   
      double precision imclassb(20)
      data imclassb / 5.077803458597D+02,
     >                5.088249431599D+02,                  
     >                5.096208912659D+02,                     
     >                5.101023387619D+02,                  
     >                5.103976610617D+02,                  
     >                5.105948019802D+02,                  
     >                5.107404165783D+02,                  
     >                5.108576573661D+02,                  
     >                5.109577278523D+02,                  
     >                5.110460304483D+02,                  
     >                5.111252433800D+02,                  
     >                5.111968077718D+02,                  
     >                5.112616233064D+02,                  
     >                5.113203605551D+02,                  
     >                5.113735928093D+02,                  
     >                5.114218460548D+02,
     >                5.114656139760D+02,
     >                5.115053595966D+02,
     >                5.115415130407D+02,
     >                5.115744692211D+02 /

c---------------------------------------------------------------------
c   Class C size reference checksums
c---------------------------------------------------------------------
      double precision reclassc(20)
      data reclassc / 5.195078707457D+02,
     >                5.155422171134D+02,
     >                5.144678022222D+02,
     >                5.140150594328D+02,
     >                5.137550426810D+02,
     >                5.135811056728D+02,
     >                5.134569343165D+02,
     >                5.133651975661D+02,
     >                5.132955192805D+02,
     >                5.132410471738D+02,
     >                5.131971141679D+02,
     >                5.131605205716D+02,
     >                5.131290734194D+02,
     >                5.131012720314D+02,
     >                5.130760908195D+02,
     >                5.130528295923D+02,
     >                5.130310107773D+02,
     >                5.130103090133D+02,
     >                5.129905029333D+02,
     >                5.129714421109D+02 /

      double precision imclassc(20)
      data imclassc / 5.149019699238D+02,
     >                5.127578201997D+02,
     >                5.122251847514D+02,
     >                5.121090289018D+02,
     >                5.121143685824D+02,
     >                5.121496764568D+02,
     >                5.121870921893D+02,
     >                5.122193250322D+02,
     >                5.122454735794D+02,
     >                5.122663649603D+02,
     >                5.122830879827D+02,
     >                5.122965869718D+02,
     >                5.123075927445D+02,
     >                5.123166486553D+02,
     >                5.123241541685D+02,
     >                5.123304037599D+02,
     >                5.123356167976D+02,
     >                5.123399592211D+02,
     >                5.123435588985D+02,
     >                5.123465164008D+02 /

c---------------------------------------------------------------------
c   Initialize.
c---------------------------------------------------------------------
      call mpi_init (ierr)
      include 'mpifrag.f'
      call mpi_comm_rank (mpi_comm_world, kn, ierr)
      call mpi_comm_size (mpi_comm_world, np, ierr)
      if (np .lt. npm) then
        if (kn .eq. 0) write (*, 181) np, npm
 181    format ('Too Few Processors:',2I6)
        call mpi_finalize(ierr)
        stop
      endif

      if (kn .eq. 0) then
         write (*, 1000)
         write (*, 1001) n1, n2, n3
         write (*, 1002) nt
         write (*, 1003) np
 1000    format(//,' NAS Parallel Benchmarks 2.1 -- FT Benchmark ',/)
 1001    format(' Size: ', i3, 'x', i3, 'x', i3)
 1002    format(' Iterations: ', i3)
 1003    format(' Number of processes: ', i5)
       endif

      call mpi_barrier(mpi_comm_world, ierr)
      tt0 = mpi_wtime()
      call cfftz (0, mx, nx, ny, ny1, u, x0, y)
      nq = nn / np
      t1 = nq
      mq = cl2 * log (t1) + 1d-13
      np2 = n2 / np
      np3 = n3 / np
      rn = 1.d0 / nn
      ap = - 4.d0 * al * pi ** 2
      n12 = n1 / 2
      n22 = n2 / 2
      n32 = n3 / 2

c---------------------------------------------------------------------
c   Compute AN = A ^ (2 * NQ) (mod 2^46).
c---------------------------------------------------------------------
      t1 = a

      do i = 1, mq + 1
        t2 = randlc (t1, t1)
      enddo

      an = t1

c---------------------------------------------------------------------
c   Generate pseudorandom data.  Each processor generates one section.
c---------------------------------------------------------------------
      kk = kn
      kl = kk
      t1 = s
      t2 = an

c---------------------------------------------------------------------
c  Find starting seed T1 for this KK using the binary rule for exponentiation.
c---------------------------------------------------------------------
      do i = 1, 100
        ik = kk / 2
        if (2 * ik .ne. kk) t3 = randlc (t1, t2)
        if (ik .eq. 0) goto 120
        t3 = randlc (t2, t2)
        kk = ik
      enddo

c---------------------------------------------------------------------
c   Compute 2 * NQ pseudorandom numbers.
c---------------------------------------------------------------------
 120  call vranlc (2 * nq, t1, a, x1)

c---------------------------------------------------------------------
c   Perform a forward 3-D FFT on X1.
c---------------------------------------------------------------------
      call cfft3 (-1, m1, m2, m3, n1, n2, n3, np, ny, ny1, u, x1, x0, y)

c---------------------------------------------------------------------
c   Compute exponential terms.
c---------------------------------------------------------------------
      do kk = 1, np2
        k = kn * np2 + kk
        k1 = k - 1
        if (k .gt. n22) k1 = k1 - n2
        
        do j = 1, n1
          j1 = j - 1
          if (j .gt. n12) j1 = j1 - n1
          jk = j1 ** 2 + k1 ** 2

          do i = 1, n3
            i1 = i - 1
            if (i .gt. n32) i1 = i1 - n3
            x3(i,j,kk) = exp (ap * (i1 ** 2 + jk))
          enddo
        enddo
      enddo

c---------------------------------------------------------------------
c   Perform the following for KT = 1, ..., NT.
c---------------------------------------------------------------------
      do kt = 1, nt

c---------------------------------------------------------------------
c   Multiply by the exponential term raised to the KT power.
c---------------------------------------------------------------------
        do k = 1, np2
          do j = 1, n1
            do i = 1, n3
              x2(i,j,k) = x3(i,j,k) ** kt * x0(i,j,k)
            enddo
          enddo
        enddo

c---------------------------------------------------------------------
c   Compute inverse 3-D FFT.
c---------------------------------------------------------------------
        call cfft3(1, m1, m2, m3, n1, n2, n3, np, ny, ny1, u, x1, x2, y)

c---------------------------------------------------------------------
c   Normalize by 1 / (N1 * N2 * N3).
c---------------------------------------------------------------------
        do k = 1, np3
          do j = 1, n2
            do i = 1, n1
              x1(i,j,k) = rn * x1(i,j,k)
            enddo
          enddo
        enddo

c---------------------------------------------------------------------
c   Compute checksum.
c---------------------------------------------------------------------
        st = (0.d0, 0.d0)

        do i = 1, 1024
          i1 = i - 1
          ii = mod (i1, n1) + 1
          jj = mod (3 * i1, n2) + 1
          kk = mod (5 * i1, n3) + 1
          kp = (kk - 1) / np3
          ik = kk - kp * np3
          if (kn .eq. kp) then
            st = st + x1(ii,jj,ik)
          endif
        enddo

        call mpi_reduce (st, st2, 1, dc_type, mpi_sum, 0,
     >    mpi_comm_world, ierr)
        if (kn .eq. 0) then
            write (*, 30) kt, st2
 30         format (' T =',I5,5X,'Checksum =',1P2D22.12)
            st2store(kt) = st2
        endif
      enddo

      tt1 = mpi_wtime()
      tm0 = tt1 - tt0
      call mpi_reduce (tm0, tm1, 1, dp_type, mpi_max, 0,
     >    mpi_comm_world, ierr)

c---------------------------------------------------------------------
c   tolerance level
c---------------------------------------------------------------------
      epsilon = 1.0d-12

c---------------------------------------------------------------------
c   verification test for checksums
c---------------------------------------------------------------------
c     Sample size
c---------------------------------------------------------------------
      verified = .FALSE.
      class = 'U'
      if( m1 .eq. 6 .and. 
     >    m2 .eq. 6 .and. 
     >    m3 .eq. 6 .and. 
     >    nt .eq. 6 .and.
     >    kn .eq. 0 ) then

         class = 'S'
          do kt = 1,nt
              retmp = 
     >           abs( (dreal(st2store(kt))-reclasss(kt))/reclasss(kt) )
              imtmp = 
     >           abs( (dimag(st2store(kt))-imclasss(kt))/imclasss(kt) )
              if ( retmp .gt. epsilon .or. imtmp .gt. epsilon) then
                 verified = .false.
                 go to 100
              endif
          end do

          verified = .true.

  100    continue

c---------------------------------------------------------------------
c     Class A
c---------------------------------------------------------------------
      else if( m1 .eq. 8 .and. 
     >         m2 .eq. 8 .and. 
     >         m3 .eq. 7 .and. 
     >         nt .eq. 6 .and.
     >         kn .eq. 0 ) then

          class = 'A'
          do kt = 1,nt
              retmp = 
     >           abs( (dreal(st2store(kt))-reclassa(kt))/reclassa(kt) )
              imtmp = 
     >           abs( (dimag(st2store(kt))-imclassa(kt))/imclassa(kt) )
              if ( retmp .gt. epsilon .or. imtmp .gt. epsilon) then
                 verified = .false.
                 go to 200
              endif
          end do

          verified = .true.

  200    continue

c---------------------------------------------------------------------
c     Class B
c---------------------------------------------------------------------
      else if( m1 .eq. 9 .and. 
     >         m2 .eq. 8 .and. 
     >         m3 .eq. 8 .and. 
     >         nt .eq. 20 .and.
     >         kn .eq. 0 ) then

          class = 'B'
          do kt = 1,nt
              retmp = 
     >           abs((dreal( st2store(kt) )-reclassb(kt))/reclassb(kt))
              imtmp = 
     >           abs((dimag( st2store(kt) )-imclassb(kt))/imclassb(kt))
              if ( retmp .gt. epsilon .or. imtmp .gt. epsilon) then
                 verified = .false.
                 go to 300
              endif
          end do

          verified = .true.

  300     continue

c---------------------------------------------------------------------
c     Class C
c---------------------------------------------------------------------
      else if( m1 .eq. 9 .and. 
     >         m2 .eq. 9 .and. 
     >         m3 .eq. 9 .and. 
     >         nt .eq. 20 .and.
     >         kn .eq. 0 ) then

          class = 'C'
          do kt = 1,nt
              retmp = 
     >           abs( (dreal(st2store(kt) )-reclassc(kt))/reclassc(kt))
              imtmp = 
     >           abs( (dimag(st2store(kt) )-imclassc(kt))/imclassc(kt))
              if ( retmp .gt. epsilon .or. imtmp .gt. epsilon) then
                 verified = .false.
                 go to 400
              endif
          end do

          verified = .true.

  400     continue

      else

         verified = .false.
         if( kn .eq. 0 ) then
            write (*, 4000)
            write (*, 4001)
 4000       format(' Problem size unknown')
 4001       format(' NO VERIFICATION PERFORMED')
         endif

      endif

      if (np .ne. npm) then
         write(*, 4010) npm
         write(*, 4011)
         write(*, 4012)
c multiple statements because some Fortran compilers have
c problems with long strings. 
 4010    format( ' Warning: benchmark was compiled for ', i3, 
     >           'processors')
 4011    format( ' Must be run on this many processors for official',
     >           ' verification')
 4012    format( ' so memory access is repeatable')
         verified = .false.
      endif

      if (kn .eq. 0) then
         if (class .ne. 'U') then
            if (verified) then
               write(*,2000)
 2000          format(' VERIFICATION SUCCESSFUL')
            else
               write(*,2001)
 2001          format(' VERIFICATION FAILED')
            endif
         endif

         if( tm1 .ne. 0. ) then
            mflops = 1.0d-6*float(nn)*float(58+6*nt+5*(nt+1)*mm)/tm1
         else
            mflops = 0.0
         endif
         call print_results('FT', class, n1, n2, n3, nt, npm, np, tm1, 
     >                      mflops, verified, npbversion, compiletime,
     >                      cs1, cs2, cs3, cs4, cs5, cs6)

       endif

      call mpi_finalize(ierr)
      end

c---------------------------------------------------------------------
c---------------------------------------------------------------------

      double precision function randlc (x, a)

c---------------------------------------------------------------------
c---------------------------------------------------------------------

c---------------------------------------------------------------------
c
c   This routine returns a uniform pseudorandom double precision number in the
c   range (0, 1) by using the linear congruential generator
c
c   x_{k+1} = a x_k  (mod 2^46)
c
c   where 0 < x_k < 2^46 and 0 < a < 2^46.  This scheme generates 2^44 numbers
c   before repeating.  The argument A is the same as 'a' in the above formula,
c   and X is the same as x_0.  A and X must be odd double precision integers
c   in the range (1, 2^46).  The returned value RANDLC is normalized to be
c   between 0 and 1, i.e. RANDLC = 2^(-46) * x_1.  X is updated to contain
c   the new seed x_1, so that subsequent calls to RANDLC using the same
c   arguments will generate a continuous sequence.
c
c   This routine should produce the same results on any computer with at least
c   48 mantissa bits in double precision floating point data.  On 64 bit
c   systems, double precision should be disabled.
c
c   David H. Bailey     October 26, 1990
c
c---------------------------------------------------------------------

      implicit none

      double precision r23,r46,t23,t46,a,x,t1,t2,t3,t4,a1,a2,x1,x2,z
      parameter (r23 = 0.5d0 ** 23, r46 = r23 ** 2, t23 = 2.d0 ** 23,
     >  t46 = t23 ** 2)

c---------------------------------------------------------------------
c   Break A into two parts such that A = 2^23 * A1 + A2.
c---------------------------------------------------------------------
      t1 = r23 * a
      a1 = int (t1)
      a2 = a - t23 * a1

c---------------------------------------------------------------------
c   Break X into two parts such that X = 2^23 * X1 + X2, compute
c   Z = A1 * X2 + A2 * X1  (mod 2^23), and then
c   X = 2^23 * Z + A2 * X2  (mod 2^46).
c---------------------------------------------------------------------
      t1 = r23 * x
      x1 = int (t1)
      x2 = x - t23 * x1
      t1 = a1 * x2 + a2 * x1
      t2 = int (r23 * t1)
      z = t1 - t23 * t2
      t3 = t23 * z + a2 * x2
      t4 = int (r46 * t3)
      x = t3 - t46 * t4
      randlc = r46 * x

      return
      end

c---------------------------------------------------------------------
c---------------------------------------------------------------------

      subroutine vranlc (n, x, a, y)

c---------------------------------------------------------------------
c---------------------------------------------------------------------

c---------------------------------------------------------------------
c
c   This routine generates N uniform pseudorandom double precision numbers in
c   the range (0, 1) by using the linear congruential generator
c
c   x_{k+1} = a x_k  (mod 2^46)
c
c   where 0 < x_k < 2^46 and 0 < a < 2^46.  This scheme generates 2^44 numbers
c   before repeating.  The argument A is the same as 'a' in the above formula,
c   and X is the same as x_0.  A and X must be odd double precision integers
c   in the range (1, 2^46).  The N results are placed in Y and are normalized
c   to be between 0 and 1.  X is updated to contain the new seed, so that
c   subsequent calls to VRANLC using the same arguments will generate a
c   continuous sequence.  If N is zero, only initialization is performed, and
c   the variables X, A and Y are ignored.
c
c   This routine is the standard version designed for scalar or RISC systems.
c   However, it should produce the same results on any single processor
c   computer with at least 48 mantissa bits in double precision floating point
c   data.  On 64 bit systems, double precision should be disabled.
c
c---------------------------------------------------------------------

      implicit none

      integer i,n
      double precision y,r23,r46,t23,t46,a,x,t1,t2,t3,t4,a1,a2,x1,x2,z
      dimension y(n)
      parameter (r23 = 0.5d0 ** 23, r46 = r23 ** 2, t23 = 2.d0 ** 23,
     >  t46 = t23 ** 2)


c---------------------------------------------------------------------
c   Break A into two parts such that A = 2^23 * A1 + A2.
c---------------------------------------------------------------------
      t1 = r23 * a
      a1 = int (t1)
      a2 = a - t23 * a1

c---------------------------------------------------------------------
c   Generate N results.   This loop is not vectorizable.
c---------------------------------------------------------------------
      do i = 1, n

c---------------------------------------------------------------------
c   Break X into two parts such that X = 2^23 * X1 + X2, compute
c   Z = A1 * X2 + A2 * X1  (mod 2^23), and then
c   X = 2^23 * Z + A2 * X2  (mod 2^46).
c---------------------------------------------------------------------
        t1 = r23 * x
        x1 = int (t1)
        x2 = x - t23 * x1
        t1 = a1 * x2 + a2 * x1
        t2 = int (r23 * t1)
        z = t1 - t23 * t2
        t3 = t23 * z + a2 * x2
        t4 = int (r46 * t3)
        x = t3 - t46 * t4
        y(i) = r46 * x
      enddo

      return
      end

c---------------------------------------------------------------------
c---------------------------------------------------------------------

      subroutine cfft3 (is, m1, m2, m3, n1, n2, n3, np, ny, ny1, 
     >  u, x1, x2, y)

c---------------------------------------------------------------------
c---------------------------------------------------------------------

c---------------------------------------------------------------------
c   This performs a 3-D complex-to-complex FFT on the global array X1, which
c   is assumed to be DOUBLE COMPLEX and of dimensions (N1,N2,N3/NP) in each 
c   node.  It is assumed that N1 = 2^M1, N2 = 2^M2 and N3 = 2^M3.  IS is 
c   the sign of the transform, either -1 or 1.  U is the root of unity array, 
c   which must have been previously initialized by calling CFFTZ with 0 as the
c   first argument and MM = MAX (M1,M2,M3) as the second argument.  X2 is a 
c   DOUBLE COMPLEX scratch array of the same size as X1.  Y is a DOUBLE COMPLEX
c   scratch array of size 2 * NY1 * 2^MM.  NP is the number of processor nodes.
c---------------------------------------------------------------------

      implicit none

      integer is,m1,m2,m3,n1,n2,n3,np,ny,ny1
      double complex u, x1, x2, y
      dimension u(n1), x1(n1,n2,n3/np), x2(n3,n1,n2/np), y(2*ny1*n1)


      if (is .eq. -1) then
        call cffts1 (is, m1, n1, n2, n3, np, ny, ny1, u, x1, y)
        call cffts2 (is, m2, n1, n2, n3, np, ny, ny1, u, x1, y)
        call transx (n1 * n2, n3, np, x1, x2, x1, x2)
        call cffts1 (is, m3, n3, n1, n2, np, ny, ny1, u, x2, y)
      else
        call cffts1 (is, m3, n3, n1, n2, np, ny, ny1, u, x2, y)
        call transx (n3, n1 * n2, np, x2, x1, x2, x1)
        call cffts2 (is, m2, n1, n2, n3, np, ny, ny1, u, x1, y)
        call cffts1 (is, m1, n1, n2, n3, np, ny, ny1, u, x1, y)
      endif
      return
      end

c---------------------------------------------------------------------
c---------------------------------------------------------------------

      subroutine transx (n1, n2, np, x1, x2, x3, x4)

c---------------------------------------------------------------------
c---------------------------------------------------------------------

c---------------------------------------------------------------------
c   This transposes the DOUBLE COMPLEX matrix X1 into X4.  X1 is assumed to
c   have dimensions (N1,N2/NP) in each processor.  X2 and X3 are assumed to 
c   have dimensions (N2/NP,N1).  X4 is assumed to have dimensions (N2,N1/NP).
c   X3 and X4 may be the same arrays as X1 and X2, respectively, in the 
c   calling program.  NP is the number of processor nodes.
c---------------------------------------------------------------------

      implicit none

      include 'mpinpb.h'

      integer n1,n2,np,kn,ierr,np1,np2,knp,ln,i,j,k,j1,j2
      double complex x1, x2, x3, x4
      dimension x1(n1,n2/np), x2(n2/np,n1), x3(n2/np,n1), x4(n2,n1/np)


      call mpi_comm_rank (mpi_comm_world, kn, ierr)
      np1 = n1 / np
      np2 = n2 / np
      knp = kn * np1
      ln = np1 * np2
 
c---------------------------------------------------------------------
c   Transpose X1 to X2.
c---------------------------------------------------------------------
      call trans (n1, np2, x1, x2)

c---------------------------------------------------------------------
c   Perform complete exchange from X2 to X3.
c---------------------------------------------------------------------
      call mpi_alltoall (x2, ln, dc_type, x3, ln, 
     >  dc_type, mpi_comm_world, ierr)

c---------------------------------------------------------------------
c   Move data in X3 to X4.
c---------------------------------------------------------------------
      if (np2 .ge. 4) then
        do k = 1, np1
          do j = 0, np - 1
            j1 = j * np1
            j2 = j * np2

            do i = 1, np2
              x4(i+j2,k) = x3(i,k+j1)
            enddo
          enddo
        enddo
      else
        do k = 1, np1
          do i = 1, np2
            do j = 0, np - 1
              j1 = j * np1
              j2 = j * np2
              x4(i+j2,k) = x3(i,k+j1)
            enddo
          enddo
        enddo
      endif

      return
      end

c---------------------------------------------------------------------
c---------------------------------------------------------------------

      subroutine trans (n1, n2, x, y)

c---------------------------------------------------------------------
c---------------------------------------------------------------------

c---------------------------------------------------------------------
c   Performs an in-memory transpose of the N1 x N2 DOUBLE COMPLEX matrix X 
c   into Y.  NC and NC1 are parameters that can be adjusted for a given system.
c---------------------------------------------------------------------

      implicit none

      integer nc,nc1,n,n1,n2,i,j,ii,jj,i1,j1

      double complex x, y, z
      parameter (nc = 32, nc1 = 34)
      dimension x(n1,n2), y(n2,n1), z(nc1,nc)


      n = n1 * n2

      if (n1 .le. nc .or. n2 .le. nc) then
        if (n1 .ge. n2) then
          goto 110
        else
          goto 130
        endif
      else
        goto 150
      endif

c---------------------------------------------------------------------
c   Scheme 1:  Perform a simple transpose in the usual way.
c---------------------------------------------------------------------
 110  do j = 1, n2
        do i = 1, n1
          y(j,i) = x(i,j)
        enddo
      enddo

      goto 260

c---------------------------------------------------------------------
c   Scheme 2:  Perform a simple transpose with the loops reversed.
c---------------------------------------------------------------------
 130  do i = 1, n1
        do j = 1, n2
          y(j,i) = x(i,j)
        enddo
      enddo

      goto 260

c---------------------------------------------------------------------
c   Scheme 3:  Perform a transpose using the intermediate array Z.  This gives
c   better performance than schemes 1 and 2 on certain cache memory systems.
c---------------------------------------------------------------------
 150  do jj = 0, n2 - 1, nc
        do ii = 0, n1 - 1, nc
          do j = 1, nc
            j1 = j + jj

            do i = 1, nc
              z(j,i) = x(i+ii,j1)
            enddo
          enddo

          do i = 1, nc
            i1 = i + ii

            do j = 1, nc
              y(j+jj,i1) = z(j,i)
            enddo
          enddo
        enddo
      enddo

 260  return
      end

c---------------------------------------------------------------------
c---------------------------------------------------------------------

      subroutine cffts1 (is, m1, n1, n2, n3, np, ny, ny1, u, x, y)

c---------------------------------------------------------------------
c---------------------------------------------------------------------

c---------------------------------------------------------------------
c   This performs N2 * N3 simultaneous N1-point FFTs on the N1 x N2 x N3 
c   DOUBLE COMPLEX array X.  Here N1 = 2 ^ M1.  U is the root of unity array,
c   and Y is a DOUBLE COMPLEX scratch array.
c---------------------------------------------------------------------

      implicit none

      integer n1,n2,n3,np,is,m1,ny,ny1,np3,i,j,k,jj
      double complex u, x, y
      dimension u(n1), x(n1,n2,n3/np), y(ny1,n1,2)


      np3 = n3 / np

      do k = 1, np3
        do jj = 0, n2 - ny, ny
          do j = 1, ny
            do i = 1, n1
              y(j,i,1) = x(i,j+jj,k)
            enddo
          enddo

          call cfftz (is, m1, n1, ny, ny1, u, y, y(1,1,2))

          do j = 1, ny
            do i = 1, n1
              x(i,j+jj,k) = y(j,i,1)
            enddo
          enddo
        enddo
      enddo

      return
      end

c---------------------------------------------------------------------
c---------------------------------------------------------------------

      subroutine cffts2 (is, m2, n1, n2, n3, np, ny, ny1, u, x, y)

c---------------------------------------------------------------------
c---------------------------------------------------------------------

c---------------------------------------------------------------------
c   This performs N1 * N3 simultaneous N2-point FFTs on the N1 x N2 x N3 
c   DOUBLE COMPLEX array X.  Here N2 = 2 ^ M2.  U is the root of unity array,
c   and Y is a DOUBLE COMPLEX scratch array.  X is the input and the output 
c   array.
c---------------------------------------------------------------------

      implicit none

      integer n1,n2,n3,np,np3,ny,ny1,is,m2,i,j,k,ii
      double complex u, x, y
      dimension u(n2), x(n1,n2,n3/np), y(ny1,n2,2)


      np3 = n3 / np

      do k = 1, np3
        do ii = 0, n1 - ny, ny
          do j = 1, n2
            do i = 1, ny
              y(i,j,1) = x(i+ii,j,k)
            enddo
          enddo

          call cfftz (is, m2, n2, ny, ny1, u, y, y(1,1,2))

          do j = 1, n2
            do i = 1, ny
              x(i+ii,j,k) = y(i,j,1)
            enddo
          enddo
        enddo
      enddo

      return
      end

c---------------------------------------------------------------------
c---------------------------------------------------------------------

      subroutine cfftz (is, m, n, ny, ny1, u, x, y)

c---------------------------------------------------------------------
c---------------------------------------------------------------------

c---------------------------------------------------------------------
c   Computes NY N-point complex-to-complex FFTs of X using an algorithm due
c   to Swarztrauber.  X is both the input and the output array, while Y is a 
c   scratch array.  It is assumed that N = 2^M.  Before calling CFFTZ to 
c   perform FFTs, the array U must be initialized by calling CFFTZ with IS 
c   set to 0 and M set to MX, where MX is the maximum value of M for any 
c   subsequent call.
c---------------------------------------------------------------------

      implicit none

      integer is,m,n,ny,ny1,nu,ku,i,j,l,ln,mx
      double complex u, x, y
      double precision pi,t,ti
      parameter (pi = 3.141592653589793238d0)
      dimension u(n), x(ny1,n), y(ny1,n)


      if (is .eq. 0)  then

c---------------------------------------------------------------------
c   Initialize the U array with sines and cosines in a manner that permits
c   stride one access at each FFT iteration.
c---------------------------------------------------------------------
        nu = n
        u(1) = m
        ku = 2
        ln = 1

        do j = 1, m
          t = pi / ln

c---------------------------------------------------------------------
c   This loop is vectorizable.
c---------------------------------------------------------------------
          do i = 0, ln - 1
            ti = i * t
            u(i+ku) = dcmplx (cos (ti), sin(ti))
          enddo

          ku = ku + ln
          ln = 2 * ln
        enddo

        return
      endif

c---------------------------------------------------------------------
c   Check if input parameters are invalid.
c---------------------------------------------------------------------
      mx = u(1)
      if ((is .ne. 1 .and. is .ne. -1) .or. m .lt. 1 .or. m .gt. mx)    
     >  then
        write (*, 1)  is, m, mx
 1      format ('CFFTZ: Either U has not been initialized, or else'/    
     >    'one of the input parameters is invalid', 3I5)
        stop
      endif

c---------------------------------------------------------------------
c   Perform one variant of the Stockham FFT.
c---------------------------------------------------------------------
      do l = 1, m, 2
        call fftz2 (is, l, m, n, ny, ny1, u, x, y)
        if (l .eq. m) goto 160
        call fftz2 (is, l + 1, m, n, ny, ny1, u, y, x)
      enddo

      goto 180

c---------------------------------------------------------------------
c   Copy Y to X.
c---------------------------------------------------------------------
 160  do j = 1, n
        do i = 1, ny
          x(i,j) = y(i,j)
        enddo
      enddo

 180  continue

      return
      end

c---------------------------------------------------------------------
c---------------------------------------------------------------------

      subroutine fftz2 (is, l, m, n, ny, ny1, u, x, y)

c---------------------------------------------------------------------
c---------------------------------------------------------------------

c---------------------------------------------------------------------
c   Performs the L-th iteration of the second variant of the Stockham FFT.
c---------------------------------------------------------------------

      implicit none

      integer is,k,l,m,n,ny,ny1,nu,n1,li,lj,lk,ku,i,j,i11,i12,i21,i22
      double complex u,x,y,u1,x11,x21
      dimension u(n), x(ny1,n), y(ny1,n)


c---------------------------------------------------------------------
c   Set initial parameters.
c---------------------------------------------------------------------
      n = 2 ** m
      k = u(1)
      nu = k / 64
      n1 = n / 2
      lk = 2 ** (l - 1)
      li = 2 ** (m - l)
      lj = 2 * lk
      ku = li + 1

      do i = 0, li - 1
        i11 = i * lk + 1
        i12 = i11 + n1
        i21 = i * lj + 1
        i22 = i21 + lk
        if (is .ge. 1) then
          u1 = u(ku+i)
        else
          u1 = dconjg (u(ku+i))
        endif

c---------------------------------------------------------------------
c   This loop is vectorizable.
c---------------------------------------------------------------------
        do k = 0, lk - 1
          do j = 1, ny
            x11 = x(j,i11+k)
            x21 = x(j,i12+k)
            y(j,i21+k) = x11 + x21
            y(j,i22+k) = u1 * (x11 - x21)
          enddo
        enddo
      enddo

      return
      end

c---------------------------------------------------------------------

