C#######################################################################
C PSTSWM Version 1.0 (8/1/93)                                          #
C  A message-passing benchmark code and parallel algorithm testbed     #
C  that solves the nonlinear shallow water equations using the spectral#
C  transform method.                                                   #
C Written by:                                                          #
C  Patrick Worley of Oak Ridge National Laboratory                     #
C  Ian Foster of Argonne National Laboratory                           #
C Based on the sequential code STSWM 2.0 by James Hack and Ruediger    #
C  Jakob of the National Center for Atmospheric Research.              #
C Research and development funded by the Computer Hardware, Advanced   #
C  Mathematics, and Model Physics (CHAMMP) program of the U.S.         #
C  Department of Energy.                                               # 
C                                                                      #
C Questions and comments should be directed to worley@msr.epm.ornl.gov #
C Please notify and acknowledge the authors in any research or         #
C publications utilizing PSTSWM or any part of the code.               #
C                                                                      #
C NOTICE: Neither the institutions nor the authors make any            #
C representations about the suitability of this software for any       #
C purpose. This software is provided "as is", without express or       #
C implied warranty.                                                    #
C#######################################################################
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C parrft.f                                                             C
C                                                                      C
C The following routines are the driver routine used to invoke the     C
C forward and inverse distributed real Fourier transform routines and  C
C utility routines used in both the forward and inverse transforms.    C
C                                                                      C
C PARRFT - driver routine for both forward and inverse real Fourier    C
C          transform.                                                  C
C RFFT_INIT - routine that calculates swap partners and other related  C
C          information needed by distributed algorithms                C
C BUFCPY - a generic copy routine used in the transform algorithms     C
C CPYFIX - a special purpose copy routine used to reorder a complex    C
C          vector in preparation for the recovery of the real transformC
C          from the output of a complex transform.                     C
C                                                                      C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
      SUBROUTINE PARRFT(COMMOPT, PROTOPT, FORCETYPE, MAPSIZE, MAP,
     &                  MYINDEX, BASE1, BASE2, ISIGN, NLOCAL, MLOCAL,
     &                  JUMP, NFAX, NTRIGS, TRIGS, WS, X)
C
C This subroutine calculates either the forward Fourier transform of M
C real vectors, each of length N, or the inverse fourier transform of M
C complex vectors of length N/2+1, over a subset of MAPSIZE processors. 
C Here N=NLOCAL*MAPSIZE. It uses a distributed algorithm, i.e., an
C algorithm that moves the data and partial results only as they are
C needed. The MAP array defines the subset and the processor ordering to
C use. The routine RFTFAX must be called before calling PARRFT. 
C
C NOTE: The complex vector of length N/2+1 is packed into a vector of 
C length N/2. The first and last Fourier coefficients are real, and the 
C real part of the last coefficient is packed into the imaginary part
C of the first coefficient. This element should be "unpacked" on 
C processor 0 following a call to PARRFT to calculate the forward FFT using
C code like:
C
C     Unpack (NLOCAL/2+1)st "complex" coefficient from imaginary
C     part of first coefficient.
C     IF (MYINDEX .EQ. 0) THEN
C       DO I = 1,MLOCAL
C         X(NLOCAL+1,I) = X(2,I)
C         X(NLOCAL+2,I) = 0.0
C         X(2,I)        = 0.0
C       ENDDO
C     ENDIF
C
C Similarly, this element should be "packed" on processor 0 preceding a
C call to PARRFT to calculate the inverse FFT:
C 
C     Pack (NLOCAL/2+1)st "complex" coefficient into imaginary part of 
C     first coefficient. 
C     IF (MYINDEX .EQ. 0) THEN
C       DO I = 1,MLOCAL
C         X(2,I) = X(NLOCAL+1,I)
C       ENDDO
C     ENDIF
C
C Communication options (COMMOPT) for PARRFT include:
C  IF (COMMOPT .EQ. 0) simple swap: send/recv/sum
C  IF (COMMOPT .EQ. 1) ordered swap: [send/recv]|[recv/send]/sum
C  IF (COMMOPT .EQ. 2) simple swap with recv-ahead
C  IF (COMMOPT .EQ. 3) ordered swap with recv-ahead
C Communication protocol options (PROTOPT) for PARRFT include:
C  IF (PROTOPT .EQ. 1, 3, .OR. 5)     nonblocking send   
C  IF (PROTOPT .EQ. 2, 3, 4, .OR. 5)  nonblocking receive
C  IF (PROTOPT .EQ. 4 .OR. 5)         forcetype          
C  IF (PROTOPT .EQ. 6 .AND. COMMOPT .EQ. 1) synchronous  
C
C called by: RFTLON
C calls: FRFFT1, FRFFT2, IRFFT1, IRFFT2
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C communication algorithm option
      INTEGER COMMOPT
C communication protocol option 
      INTEGER PROTOPT
C forcetype message type offset
      INTEGER FORCETYPE
C number of processors in subset
      INTEGER MAPSIZE
C processor subset (and processor ordering)
      INTEGER MAP(0:MAPSIZE-1)
C index of "me" in MAP array
      INTEGER MYINDEX
C message type offsets to use in interprocessor communication
      INTEGER BASE1, BASE2
C direction of FFT (-1 is forward, +1 is inverse)
      INTEGER ISIGN
C the local length of each vector to be transformed
      INTEGER NLOCAL
C the number of vectors per processor to be transformed
      INTEGER MLOCAL
C the offset between the start of successive vectors
      INTEGER JUMP
C the factorization of N (= NLOCAL*MAPSIZE)
      INTEGER NFAX(13)
C number of trigonometric function values used by PARRFT 
      INTEGER NTRIGS
C trigonometric function values used by PARRFT 
      COMPLEX TRIGS(NTRIGS)
C
C     Input/Output
C
C For a forward transform, (ISIGN .EQ. -1), X must contain the real data
C to be transformed. The Mth vector to be transformed extends from
C X((M-1)*JUMP+1) to X(M*JUMP). 
C For an inverse transform, (ISIGN .EQ. +1), X must contain the
C half-length complex data. Thus X contains M complex vectors each of
C length N/2+1. The real and imaginary parts of one of these vectors are
C stored as consecutive elements of X (with the exception of the first 
C and last elements, as described above).  Hence the real and imaginary
C parts of the Ith component of the Jth complex vector are stored at
C X((J-1)*JUMP+2*I-1) and X((J-1)*JUMP+2*I). 
      REAL X(JUMP,MLOCAL)
C
C     Work Space
C
C communication buffers
      REAL WS(JUMP,MLOCAL)
C
C---- Local Variables --------------------------------------------------
C
C base 2 logarithm of the number of processors (also, approximate number
C of steps in the distributed phase of the Fourier transform algorithm)
      INTEGER LGP
C length and index offset into trigonometric function values array
C for forward and inverse transforms, respectively
      INTEGER ICFWD, NCFWD, ICINV, NCINV
C number of complex vectors in each block in the two block algorithms
      INTEGER MLOCH1, MLOCH2
C loop index
      INTEGER I
C
C---- External Functions -----------------------------------------------
C
C integer base 2 logarithm
      EXTERNAL LOG2
      INTEGER LOG2
C
C---- Executable Statements --------------------------------------------
C
C     Calculate lengths and index offsets for trigonometric data for
C     forward and inverse transforms. 
      LGP = LOG2(MAPSIZE)
      IF (2**LGP .NE. MAPSIZE) THEN
        WRITE(0,100) MAPSIZE
  100   FORMAT (/,' PSTSWM: FATAL ERROR IN SUBROUTINE PARRFT ',/,
     &          ' MAPSIZE MUST BE A POWER OF TWO TO USE THE ',/,
     &          ' DISTRIBUTED FOURIER TRANSFORM ALGORITHM ',/,
     &          ' MAPSIZE = ',I4)
        STOP
      ENDIF
C
      ICFWD = 1
      NCFWD = LGP + 3*NLOCAL/4
C
      ICINV = ICFWD + NCFWD
      NCINV = (3 + LGP)*NLOCAL/4
C
C     Call appropriate RFT routine.
      IF (ISIGN .EQ. -1) THEN
C       forward RFT
C
        IF ( MLOCAL .GE. 1 ) THEN
C         something to do
C
          IF (COMMOPT .LE. 1) THEN
C           one block (no overlap) algorithm
            CALL FRFFT1(COMMOPT, PROTOPT, FORCETYPE, MAPSIZE, MAP,
     &                  MYINDEX, BASE1, NLOCAL, MLOCAL, JUMP, NFAX,
     &                  NCFWD, TRIGS(ICFWD), WS, X)
          ELSE
C           two blocks (overlap) algorithm
            MLOCH1 = (MLOCAL+1)/2
            MLOCH2 = MLOCAL - MLOCH1
            CALL FRFFT2(COMMOPT-2, PROTOPT, FORCETYPE, MAPSIZE, 
     &                  MAP, MYINDEX, BASE1, BASE2,
     &                  NLOCAL, MLOCH1, MLOCH2, JUMP, NFAX,
     &                  NCFWD, TRIGS(ICFWD), WS, WS(1,MLOCH1+1), X,
     &                  X(1,MLOCH1+1)) 
          ENDIF
C
        ENDIF
C
      ELSEIF (ISIGN .EQ. 1) THEN
C       inverse RFT
C
        IF ( MLOCAL .GE. 1 ) THEN
C         something to do
C
          IF (COMMOPT .LE. 1) THEN
C           one block (no overlap) algorithm
            CALL IRFFT1(COMMOPT, PROTOPT, FORCETYPE, MAPSIZE, MAP,
     &                  MYINDEX, BASE1, NLOCAL, MLOCAL, JUMP, NFAX,
     &                  NCINV, TRIGS(ICINV), WS, X)
          ELSE
C           two blocks (overlap) algorithm
            MLOCH1 = (MLOCAL+1)/2
            MLOCH2 = MLOCAL - MLOCH1
            CALL IRFFT2(COMMOPT-2, PROTOPT, FORCETYPE, MAPSIZE, 
     &                  MAP, MYINDEX, BASE1, BASE2,
     &                  NLOCAL, MLOCH1, MLOCH2, JUMP, NFAX,
     &                  NCINV, TRIGS(ICINV), WS,
     &                  WS(1,MLOCH1+1), X, X(1,MLOCH1+1)) 
C
          ENDIF
C
        ENDIF
C
      ENDIF     
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE RFFT_INIT(MAPSIZE, MAP, MYINDEX, LGPROCSX, MAXSWAP,
     &                     SWAPNODE, ORDER, DIRN, TWINDEX, TWORDER) 
C
C This routine calculates swap partners and other information needed
C by the distributed Fourier transform algorithms.
C
C called by: FRFFT1, FRFFT2, IRFFT1, IRFFT2
C calls: LOG2
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C number of processors in subset
      INTEGER MAPSIZE
C processor subset (and processor ordering)
      INTEGER MAP(0:MAPSIZE-1)
C index of "me" in MAP array
      INTEGER MYINDEX
C declared length of the output arrays, at least as long as 
C LOG2(MAPSIZE)
      INTEGER LGPROCSX
C
C     Output
C
C number of "computational" swaps in distributed phase of transform
C algorithm. Total number of swaps includes one before sequential
C phase, and possibly one before real fix-up phase.
      INTEGER MAXSWAP
C arrays indicating the destination of the message sent during a
C given swap, and whether this processor sends or receives first
      INTEGER SWAPNODE(0:LGPROCSX)
      INTEGER ORDER(0:LGPROCSX)
C which half (0: low, 1: high) of the data buffer that is sent during a 
C given swap
      INTEGER DIRN(0:LGPROCSX)
C index of the processor "twin" whose data is needed in order to
C compute the real transform from the complex transform, and order
C in which swap with TWIN should be performed 
      INTEGER TWINDEX, TWORDER
C
C---- Local Variables --------------------------------------------------
C
C distance between MYINDEX and swap partner index
      INTEGER DISTANCE
C index of swap partner
      INTEGER DEST
C loop index
      INTEGER I
C temporaries used to compute TWINDEX
      INTEGER ILSHFT, IMASK, IPTIL
C
C---- External Functions -----------------------------------------------
C
C integer base 2 logarithm
      EXTERNAL LOG2
      INTEGER LOG2
C
C---- Executable Statements -------------------------------------------
C
      IF (MAPSIZE .GT. 1) THEN
C       a distributed Fourier transform
C
        MAXSWAP = 0
        DISTANCE = 1
C           
        DO WHILE (DISTANCE .LT. MAPSIZE)
C
C         Increment step index.
          MAXSWAP = MAXSWAP + 1
C
C         Calculate swap partners, swap order, and buffer pointers. The
C         order is chosen in such a way as to minimize collisions on a
C         bidirectional grid. 
          IF (MOD(MYINDEX,2*DISTANCE) .LT. DISTANCE) THEN
            DEST = MYINDEX + DISTANCE
            DIRN(MAXSWAP) = 0
            IF (MOD(MYINDEX, 2) .EQ. 0) THEN
              ORDER(MAXSWAP) = 1
            ELSE
              ORDER(MAXSWAP) = -1
            ENDIF
          ELSE
            DEST = MYINDEX - DISTANCE
            DIRN(MAXSWAP) = 1
            IF (MOD(DEST, 2) .EQ. 0) THEN
              ORDER(MAXSWAP) = -1
            ELSE
              ORDER(MAXSWAP) = 1
            ENDIF
          ENDIF
          SWAPNODE(MAXSWAP) = MAP(DEST)
C
C         Update distance.
          DISTANCE = 2*DISTANCE
C
        ENDDO
C
C       First and last (excepting swap with TWIN) swap partners and
C       order are the same.
        SWAPNODE(0) = SWAPNODE(MAXSWAP)
        DIRN(0) = DIRN(MAXSWAP)
        ORDER(0) = ORDER(MAXSWAP)
C
C       Calculate TWIN and associated order.
        IF (MYINDEX .NE. 0) THEN
C         circular left shift of MYINDEX
          ILSHFT  = 2*MOD(MYINDEX, (MAPSIZE/2)) + MYINDEX/(MAPSIZE/2)
C         largest power of two smaller than or equal to ILSHFT
          IMASK   = 2**LOG2(ILSHFT)
          IPTIL   = 2*IMASK - MOD(ILSHFT, IMASK) - 1
          TWINDEX = IPTIL/2 + MOD(IPTIL, 2)*(MAPSIZE/2)
        ELSE
          TWINDEX = MYINDEX
        ENDIF
C
        IF (MYINDEX .LT. TWINDEX) THEN
          IF (MOD(MYINDEX, 2) .EQ. 0) THEN
            TWORDER = 1
          ELSE
            TWORDER = -1
          ENDIF
        ELSE
          IF (MOD(TWINDEX, 2) .EQ. 0) THEN
            TWORDER = -1
          ELSE
            TWORDER = 1
          ENDIF
        ENDIF
C
      ELSE
C       a serial Fourier transform
C
        MAXSWAP = -1
        TWINDEX = MYINDEX
C
      ENDIF
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE BUFCPY(CPYLTH, INSET, OUTSET, INLTH, OUTLTH, MVECS, IN,
     &                  OUT)  
C
C BUFCPY copies CPYLTH elements of each real vector in IN into the
C corresponding real vector in OUT. The offset for the starting index in
C IN is INSET, and the offset for the starting index in OUT is OUTSET.
C
C called by: FRFFT1, FRFFT2, IRFFT1, IRFFT2
C calls: 
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C number of real elements to be copied per vector
      INTEGER CPYLTH
C index offset denoting what elements of IN should be used in the copy 
      INTEGER INSET
C index offset denoting what elements of OUT should be overwritten
      INTEGER OUTSET
C lengths of the real vectors in IN and OUT
      INTEGER INLTH, OUTLTH
C number of vectors to be copied
      INTEGER MVECS
C source array for the copy
      REAL IN(INLTH,MVECS)
C
C     Output
C
C destination array for the copy
      REAL OUT(OUTLTH,MVECS)
C
C---- Local Variables --------------------------------------------------
C
C loop indices
      INTEGER I, J
C                                                                             
C---- Executable Statements --------------------------------------------
C
      DO J=1,MVECS
        DO I=1,CPYLTH
          OUT(OUTSET+I,J) = IN(INSET+I,J)
        ENDDO
      ENDDO
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
C
      SUBROUTINE CPYFIX(MYINDEX, VECLTH, MVECS, CJUMP, Y, BUF)
C
C CPYFIX copies the even numbered elements of each complex vector in Y
C into the corresponding half-length vector in BUF. If (MYINDEX .NE. 0),
C then the elements are copied in order. Otherwise, the order of copying
C is more complicated. The ordering is that required after (or before) a
C complex forward (or inverse) Fourier transform calculated as part of a
C real Fourier transform.  
C
C called by: FRFFT1, FRFFT2, IRFFT1, IRFFT2
C calls: 
C
C---- Implicit None ----------------------------------------------------
C
      IMPLICIT NONE
C
C---- Arguments --------------------------------------------------------
C
C     Input
C
C index of "me" in processor subset collaborating in distributed real
C Fourier transform 
      INTEGER MYINDEX
C length of the complex vectors in Y
      INTEGER VECLTH
C number of vectors in Y and BUF
      INTEGER MVECS
C the offset between the start of successive vectors in Y
      INTEGER CJUMP
C the array of data vectors
      COMPLEX Y(CJUMP,MVECS)
C
C     Output
C
C the array of half-length output vectors
      COMPLEX BUF(VECLTH/2,MVECS)
C
C---- Local Variables --------------------------------------------------
C
C length of the vectors in BUF
      INTEGER HLFLTH
C loop bound for each step of the copy algorithm when (MYINDEX .EQ. 0)
      INTEGER IPOW
C index offsets used to move through the elements of the output and data 
C arrays, respectively, in the correct order
      INTEGER IC, OFFSET
C loop indices
      INTEGER I, J, K
C                                                                             
C---- Executable Statements --------------------------------------------
C
C     length of the vectors in BUF
      HLFLTH = VECLTH/2
C
      IF (MYINDEX .EQ. 0) THEN
C
        DO K=1,MVECS
C
          IC   = HLFLTH
          IPOW = 1
C
C         Copy first even element into last buffer location.
          BUF(IC,K) = Y(2,K)
C
C         Copy successively larger blocks of even numbered elements
C         into BUF in reverse order.
          DO WHILE (IPOW .LT. HLFLTH)
C
C           Copy a block of IPOW even numbered elements into BUF in
C           order.  
            OFFSET = 4*IPOW+2
            DO I=1,IPOW
              IC = IC - 1
              BUF(IC,K) = Y(OFFSET-2*I,K)
            ENDDO
C
C           Calculate size of next block to be copied.
            IPOW = 2*IPOW
C
          ENDDO
C
        ENDDO
C
      ELSE
C
        DO J=1,MVECS
          DO I=1,HLFLTH
            BUF(I,J) = Y(2*I,J)
          ENDDO
        ENDDO
C
      ENDIF
C
      RETURN
      END
C
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC
