ScaLAPACK  2.0.2
ScaLAPACK: Scalable Linear Algebra PACKage
pcpbtrsv.f
Go to the documentation of this file.
00001       SUBROUTINE PCPBTRSV( UPLO, TRANS, N, BW, NRHS, A, JA, DESCA, B,
00002      $                     IB, DESCB, AF, LAF, WORK, LWORK, INFO )
00003 *
00004 *  -- ScaLAPACK routine (version 2.0.2) --
00005 *     Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver
00006 *     May 1 2012
00007 *
00008 *     .. Scalar Arguments ..
00009       CHARACTER          TRANS, UPLO
00010       INTEGER            BW, IB, INFO, JA, LAF, LWORK, N, NRHS
00011 *     ..
00012 *     .. Array Arguments ..
00013       INTEGER            DESCA( * ), DESCB( * )
00014       COMPLEX            A( * ), AF( * ), B( * ), WORK( * )
00015 *     ..
00016 *
00017 *
00018 *  Purpose
00019 *  =======
00020 *
00021 *  PCPBTRSV solves a banded triangular system of linear equations
00022 *
00023 *                      A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS)
00024 *                                 or
00025 *                      A(1:N, JA:JA+N-1)^H * X = B(IB:IB+N-1, 1:NRHS)
00026 *
00027 *  where A(1:N, JA:JA+N-1) is a banded
00028 *  triangular matrix factor produced by the
00029 *  Cholesky factorization code PCPBTRF
00030 *  and is stored in A(1:N,JA:JA+N-1) and AF.
00031 *  The matrix stored in A(1:N, JA:JA+N-1) is either
00032 *  upper or lower triangular according to UPLO,
00033 *  and the choice of solving A(1:N, JA:JA+N-1) or A(1:N, JA:JA+N-1)^H
00034 *  is dictated by the user by the parameter TRANS.
00035 *
00036 *  Routine PCPBTRF MUST be called first.
00037 *
00038 *  =====================================================================
00039 *
00040 *  Arguments
00041 *  =========
00042 *
00043 *  UPLO    (global input) CHARACTER
00044 *          = 'U':  Upper triangle of A(1:N, JA:JA+N-1) is stored;
00045 *          = 'L':  Lower triangle of A(1:N, JA:JA+N-1) is stored.
00046 *
00047 *  TRANS   (global input) CHARACTER
00048 *          = 'N':  Solve with A(1:N, JA:JA+N-1);
00049 *          = 'C':  Solve with conjugate_transpose( A(1:N, JA:JA+N-1) );
00050 *
00051 *  N       (global input) INTEGER
00052 *          The number of rows and columns to be operated on, i.e. the
00053 *          order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
00054 *
00055 *  BW      (global input) INTEGER
00056 *          Number of subdiagonals in L or U. 0 <= BW <= N-1
00057 *
00058 *  NRHS    (global input) INTEGER
00059 *          The number of right hand sides, i.e., the number of columns
00060 *          of the distributed submatrix B(IB:IB+N-1, 1:NRHS).
00061 *          NRHS >= 0.
00062 *
00063 *  A       (local input/local output) COMPLEX pointer into
00064 *          local memory to an array with first dimension
00065 *          LLD_A >=(bw+1) (stored in DESCA).
00066 *          On entry, this array contains the local pieces of the
00067 *          N-by-N symmetric banded distributed Cholesky factor L or
00068 *          L^T A(1:N, JA:JA+N-1).
00069 *          This local portion is stored in the packed banded format
00070 *            used in LAPACK. Please see the Notes below and the
00071 *            ScaLAPACK manual for more detail on the format of
00072 *            distributed matrices.
00073 *
00074 *  JA      (global input) INTEGER
00075 *          The index in the global array A that points to the start of
00076 *          the matrix to be operated on (which may be either all of A
00077 *          or a submatrix of A).
00078 *
00079 *  DESCA   (global and local input) INTEGER array of dimension DLEN.
00080 *          if 1D type (DTYPE_A=501), DLEN >= 7;
00081 *          if 2D type (DTYPE_A=1), DLEN >= 9 .
00082 *          The array descriptor for the distributed matrix A.
00083 *          Contains information of mapping of A to memory. Please
00084 *          see NOTES below for full description and options.
00085 *
00086 *  B       (local input/local output) COMPLEX pointer into
00087 *          local memory to an array of local lead dimension lld_b>=NB.
00088 *          On entry, this array contains the
00089 *          the local pieces of the right hand sides
00090 *          B(IB:IB+N-1, 1:NRHS).
00091 *          On exit, this contains the local piece of the solutions
00092 *          distributed matrix X.
00093 *
00094 *  IB      (global input) INTEGER
00095 *          The row index in the global array B that points to the first
00096 *          row of the matrix to be operated on (which may be either
00097 *          all of B or a submatrix of B).
00098 *
00099 *  DESCB   (global and local input) INTEGER array of dimension DLEN.
00100 *          if 1D type (DTYPE_B=502), DLEN >=7;
00101 *          if 2D type (DTYPE_B=1), DLEN >= 9.
00102 *          The array descriptor for the distributed matrix B.
00103 *          Contains information of mapping of B to memory. Please
00104 *          see NOTES below for full description and options.
00105 *
00106 *  AF      (local output) COMPLEX array, dimension LAF.
00107 *          Auxiliary Fillin Space.
00108 *          Fillin is created during the factorization routine
00109 *          PCPBTRF and this is stored in AF. If a linear system
00110 *          is to be solved using PCPBTRS after the factorization
00111 *          routine, AF *must not be altered* after the factorization.
00112 *
00113 *  LAF     (local input) INTEGER
00114 *          Size of user-input Auxiliary Fillin space AF. Must be >=
00115 *          (NB+2*bw)*bw
00116 *          If LAF is not large enough, an error code will be returned
00117 *          and the minimum acceptable size will be returned in AF( 1 )
00118 *
00119 *  WORK    (local workspace/local output)
00120 *          COMPLEX temporary workspace. This space may
00121 *          be overwritten in between calls to routines. WORK must be
00122 *          the size given in LWORK.
00123 *          On exit, WORK( 1 ) contains the minimal LWORK.
00124 *
00125 *  LWORK   (local input or global input) INTEGER
00126 *          Size of user-input workspace WORK.
00127 *          If LWORK is too small, the minimal acceptable size will be
00128 *          returned in WORK(1) and an error code is returned. LWORK>=
00129 *          (bw*NRHS)
00130 *
00131 *  INFO    (global output) INTEGER
00132 *          = 0:  successful exit
00133 *          < 0:  If the i-th argument is an array and the j-entry had
00134 *                an illegal value, then INFO = -(i*100+j), if the i-th
00135 *                argument is a scalar and had an illegal value, then
00136 *                INFO = -i.
00137 *
00138 *  =====================================================================
00139 *
00140 *
00141 *  Restrictions
00142 *  ============
00143 *
00144 *  The following are restrictions on the input parameters. Some of these
00145 *    are temporary and will be removed in future releases, while others
00146 *    may reflect fundamental technical limitations.
00147 *
00148 *    Non-cyclic restriction: VERY IMPORTANT!
00149 *      P*NB>= mod(JA-1,NB)+N.
00150 *      The mapping for matrices must be blocked, reflecting the nature
00151 *      of the divide and conquer algorithm as a task-parallel algorithm.
00152 *      This formula in words is: no processor may have more than one
00153 *      chunk of the matrix.
00154 *
00155 *    Blocksize cannot be too small:
00156 *      If the matrix spans more than one processor, the following
00157 *      restriction on NB, the size of each block on each processor,
00158 *      must hold:
00159 *      NB >= 2*BW
00160 *      The bulk of parallel computation is done on the matrix of size
00161 *      O(NB) on each processor. If this is too small, divide and conquer
00162 *      is a poor choice of algorithm.
00163 *
00164 *    Submatrix reference:
00165 *      JA = IB
00166 *      Alignment restriction that prevents unnecessary communication.
00167 *
00168 *
00169 *  =====================================================================
00170 *
00171 *
00172 *  Notes
00173 *  =====
00174 *
00175 *  If the factorization routine and the solve routine are to be called
00176 *    separately (to solve various sets of righthand sides using the same
00177 *    coefficient matrix), the auxiliary space AF *must not be altered*
00178 *    between calls to the factorization routine and the solve routine.
00179 *
00180 *  The best algorithm for solving banded and tridiagonal linear systems
00181 *    depends on a variety of parameters, especially the bandwidth.
00182 *    Currently, only algorithms designed for the case N/P >> bw are
00183 *    implemented. These go by many names, including Divide and Conquer,
00184 *    Partitioning, domain decomposition-type, etc.
00185 *
00186 *  Algorithm description: Divide and Conquer
00187 *
00188 *    The Divide and Conqer algorithm assumes the matrix is narrowly
00189 *      banded compared with the number of equations. In this situation,
00190 *      it is best to distribute the input matrix A one-dimensionally,
00191 *      with columns atomic and rows divided amongst the processes.
00192 *      The basic algorithm divides the banded matrix up into
00193 *      P pieces with one stored on each processor,
00194 *      and then proceeds in 2 phases for the factorization or 3 for the
00195 *      solution of a linear system.
00196 *      1) Local Phase:
00197 *         The individual pieces are factored independently and in
00198 *         parallel. These factors are applied to the matrix creating
00199 *         fillin, which is stored in a non-inspectable way in auxiliary
00200 *         space AF. Mathematically, this is equivalent to reordering
00201 *         the matrix A as P A P^T and then factoring the principal
00202 *         leading submatrix of size equal to the sum of the sizes of
00203 *         the matrices factored on each processor. The factors of
00204 *         these submatrices overwrite the corresponding parts of A
00205 *         in memory.
00206 *      2) Reduced System Phase:
00207 *         A small (BW* (P-1)) system is formed representing
00208 *         interaction of the larger blocks, and is stored (as are its
00209 *         factors) in the space AF. A parallel Block Cyclic Reduction
00210 *         algorithm is used. For a linear system, a parallel front solve
00211 *         followed by an analagous backsolve, both using the structure
00212 *         of the factored matrix, are performed.
00213 *      3) Backsubsitution Phase:
00214 *         For a linear system, a local backsubstitution is performed on
00215 *         each processor in parallel.
00216 *
00217 *
00218 *  Descriptors
00219 *  ===========
00220 *
00221 *  Descriptors now have *types* and differ from ScaLAPACK 1.0.
00222 *
00223 *  Note: banded codes can use either the old two dimensional
00224 *    or new one-dimensional descriptors, though the processor grid in
00225 *    both cases *must be one-dimensional*. We describe both types below.
00226 *
00227 *  Each global data object is described by an associated description
00228 *  vector.  This vector stores the information required to establish
00229 *  the mapping between an object element and its corresponding process
00230 *  and memory location.
00231 *
00232 *  Let A be a generic term for any 2D block cyclicly distributed array.
00233 *  Such a global array has an associated description vector DESCA.
00234 *  In the following comments, the character _ should be read as
00235 *  "of the global array".
00236 *
00237 *  NOTATION        STORED IN      EXPLANATION
00238 *  --------------- -------------- --------------------------------------
00239 *  DTYPE_A(global) DESCA( DTYPE_ )The descriptor type.  In this case,
00240 *                                 DTYPE_A = 1.
00241 *  CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
00242 *                                 the BLACS process grid A is distribu-
00243 *                                 ted over. The context itself is glo-
00244 *                                 bal, but the handle (the integer
00245 *                                 value) may vary.
00246 *  M_A    (global) DESCA( M_ )    The number of rows in the global
00247 *                                 array A.
00248 *  N_A    (global) DESCA( N_ )    The number of columns in the global
00249 *                                 array A.
00250 *  MB_A   (global) DESCA( MB_ )   The blocking factor used to distribute
00251 *                                 the rows of the array.
00252 *  NB_A   (global) DESCA( NB_ )   The blocking factor used to distribute
00253 *                                 the columns of the array.
00254 *  RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
00255 *                                 row of the array A is distributed.
00256 *  CSRC_A (global) DESCA( CSRC_ ) The process column over which the
00257 *                                 first column of the array A is
00258 *                                 distributed.
00259 *  LLD_A  (local)  DESCA( LLD_ )  The leading dimension of the local
00260 *                                 array.  LLD_A >= MAX(1,LOCr(M_A)).
00261 *
00262 *  Let K be the number of rows or columns of a distributed matrix,
00263 *  and assume that its process grid has dimension p x q.
00264 *  LOCr( K ) denotes the number of elements of K that a process
00265 *  would receive if K were distributed over the p processes of its
00266 *  process column.
00267 *  Similarly, LOCc( K ) denotes the number of elements of K that a
00268 *  process would receive if K were distributed over the q processes of
00269 *  its process row.
00270 *  The values of LOCr() and LOCc() may be determined via a call to the
00271 *  ScaLAPACK tool function, NUMROC:
00272 *          LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
00273 *          LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
00274 *  An upper bound for these quantities may be computed by:
00275 *          LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
00276 *          LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
00277 *
00278 *
00279 *  One-dimensional descriptors:
00280 *
00281 *  One-dimensional descriptors are a new addition to ScaLAPACK since
00282 *    version 1.0. They simplify and shorten the descriptor for 1D
00283 *    arrays.
00284 *
00285 *  Since ScaLAPACK supports two-dimensional arrays as the fundamental
00286 *    object, we allow 1D arrays to be distributed either over the
00287 *    first dimension of the array (as if the grid were P-by-1) or the
00288 *    2nd dimension (as if the grid were 1-by-P). This choice is
00289 *    indicated by the descriptor type (501 or 502)
00290 *    as described below.
00291 *
00292 *    IMPORTANT NOTE: the actual BLACS grid represented by the
00293 *    CTXT entry in the descriptor may be *either*  P-by-1 or 1-by-P
00294 *    irrespective of which one-dimensional descriptor type
00295 *    (501 or 502) is input.
00296 *    This routine will interpret the grid properly either way.
00297 *    ScaLAPACK routines *do not support intercontext operations* so that
00298 *    the grid passed to a single ScaLAPACK routine *must be the same*
00299 *    for all array descriptors passed to that routine.
00300 *
00301 *    NOTE: In all cases where 1D descriptors are used, 2D descriptors
00302 *    may also be used, since a one-dimensional array is a special case
00303 *    of a two-dimensional array with one dimension of size unity.
00304 *    The two-dimensional array used in this case *must* be of the
00305 *    proper orientation:
00306 *      If the appropriate one-dimensional descriptor is DTYPEA=501
00307 *      (1 by P type), then the two dimensional descriptor must
00308 *      have a CTXT value that refers to a 1 by P BLACS grid;
00309 *      If the appropriate one-dimensional descriptor is DTYPEA=502
00310 *      (P by 1 type), then the two dimensional descriptor must
00311 *      have a CTXT value that refers to a P by 1 BLACS grid.
00312 *
00313 *
00314 *  Summary of allowed descriptors, types, and BLACS grids:
00315 *  DTYPE           501         502         1         1
00316 *  BLACS grid      1xP or Px1  1xP or Px1  1xP       Px1
00317 *  -----------------------------------------------------
00318 *  A               OK          NO          OK        NO
00319 *  B               NO          OK          NO        OK
00320 *
00321 *  Note that a consequence of this chart is that it is not possible
00322 *    for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead
00323 *    to opposite requirements for the orientation of the BLACS grid,
00324 *    and as noted before, the *same* BLACS context must be used in
00325 *    all descriptors in a single ScaLAPACK subroutine call.
00326 *
00327 *  Let A be a generic term for any 1D block cyclicly distributed array.
00328 *  Such a global array has an associated description vector DESCA.
00329 *  In the following comments, the character _ should be read as
00330 *  "of the global array".
00331 *
00332 *  NOTATION        STORED IN  EXPLANATION
00333 *  --------------- ---------- ------------------------------------------
00334 *  DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
00335 *                                TYPE_A = 501: 1-by-P grid.
00336 *                                TYPE_A = 502: P-by-1 grid.
00337 *  CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
00338 *                                the BLACS process grid A is distribu-
00339 *                                ted over. The context itself is glo-
00340 *                                bal, but the handle (the integer
00341 *                                value) may vary.
00342 *  N_A    (global) DESCA( 3 ) The size of the array dimension being
00343 *                                distributed.
00344 *  NB_A   (global) DESCA( 4 ) The blocking factor used to distribute
00345 *                                the distributed dimension of the array.
00346 *  SRC_A  (global) DESCA( 5 ) The process row or column over which the
00347 *                                first row or column of the array
00348 *                                is distributed.
00349 *  LLD_A  (local)  DESCA( 6 ) The leading dimension of the local array
00350 *                                storing the local blocks of the distri-
00351 *                                buted array A. Minimum value of LLD_A
00352 *                                depends on TYPE_A.
00353 *                                TYPE_A = 501: LLD_A >=
00354 *                                   size of undistributed dimension, 1.
00355 *                                TYPE_A = 502: LLD_A >=NB_A, 1.
00356 *  Reserved        DESCA( 7 ) Reserved for future use.
00357 *
00358 *
00359 *
00360 *  =====================================================================
00361 *
00362 *  Code Developer: Andrew J. Cleary, University of Tennessee.
00363 *    Current address: Lawrence Livermore National Labs.
00364 *  This version released: August, 2001.
00365 *
00366 *  =====================================================================
00367 *
00368 *     ..
00369 *     .. Parameters ..
00370       REAL               ONE, ZERO
00371       PARAMETER          ( ONE = 1.0E+0 )
00372       PARAMETER          ( ZERO = 0.0E+0 )
00373       COMPLEX            CONE, CZERO
00374       PARAMETER          ( CONE = ( 1.0E+0, 0.0E+0 ) )
00375       PARAMETER          ( CZERO = ( 0.0E+0, 0.0E+0 ) )
00376       INTEGER            INT_ONE
00377       PARAMETER          ( INT_ONE = 1 )
00378       INTEGER            DESCMULT, BIGNUM
00379       PARAMETER          (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT)
00380       INTEGER            BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
00381      $                   LLD_, MB_, M_, NB_, N_, RSRC_
00382       PARAMETER          ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1,
00383      $                     CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6,
00384      $                     RSRC_ = 7, CSRC_ = 8, LLD_ = 9 )
00385 *     ..
00386 *     .. Local Scalars ..
00387       INTEGER            CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE,
00388      $                   IDUM1, IDUM2, IDUM3, JA_NEW, LEVEL_DIST, LLDA,
00389      $                   LLDB, MBW2, MYCOL, MYROW, MY_NUM_COLS, NB, NP,
00390      $                   NPCOL, NPROW, NP_SAVE, ODD_SIZE, OFST,
00391      $                   PART_OFFSET, PART_SIZE, RETURN_CODE, STORE_M_B,
00392      $                   STORE_N_A, WORK_SIZE_MIN
00393 *     ..
00394 *     .. Local Arrays ..
00395       INTEGER            DESCA_1XP( 7 ), DESCB_PX1( 7 ),
00396      $                   PARAM_CHECK( 17, 3 )
00397 *     ..
00398 *     .. External Subroutines ..
00399       EXTERNAL           BLACS_GET, BLACS_GRIDEXIT, BLACS_GRIDINFO,
00400      $                   CGEMM, CGERV2D, CGESD2D, CLAMOV, CMATADD,
00401      $                   CTBTRS, CTRMM, CTRTRS, DESC_CONVERT, GLOBCHK,
00402      $                   PXERBLA, RESHAPE
00403 *     ..
00404 *     .. External Functions ..
00405       LOGICAL            LSAME
00406       INTEGER            NUMROC
00407       EXTERNAL           LSAME, NUMROC
00408 *     ..
00409 *     .. Intrinsic Functions ..
00410       INTRINSIC          ICHAR, MIN, MOD
00411 *     ..
00412 *     .. Executable Statements ..
00413 *
00414 *     Test the input parameters
00415 *
00416       INFO = 0
00417 *
00418 *     Convert descriptor into standard form for easy access to
00419 *        parameters, check that grid is of right shape.
00420 *
00421       DESCA_1XP( 1 ) = 501
00422       DESCB_PX1( 1 ) = 502
00423 *
00424       CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE )
00425 *
00426       IF( RETURN_CODE .NE. 0) THEN
00427          INFO = -( 8*100 + 2 )
00428       ENDIF
00429 *
00430       CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE )
00431 *
00432       IF( RETURN_CODE .NE. 0) THEN
00433          INFO = -( 11*100 + 2 )
00434       ENDIF
00435 *
00436 *     Consistency checks for DESCA and DESCB.
00437 *
00438 *     Context must be the same
00439       IF( DESCA_1XP( 2 ) .NE. DESCB_PX1( 2 ) ) THEN
00440          INFO = -( 11*100 + 2 )
00441       ENDIF
00442 *
00443 *        These are alignment restrictions that may or may not be removed
00444 *        in future releases. -Andy Cleary, April 14, 1996.
00445 *
00446 *     Block sizes must be the same
00447       IF( DESCA_1XP( 4 ) .NE. DESCB_PX1( 4 ) ) THEN
00448          INFO = -( 11*100 + 4 )
00449       ENDIF
00450 *
00451 *     Source processor must be the same
00452 *
00453       IF( DESCA_1XP( 5 ) .NE. DESCB_PX1( 5 ) ) THEN
00454          INFO = -( 11*100 + 5 )
00455       ENDIF
00456 *
00457 *     Get values out of descriptor for use in code.
00458 *
00459       ICTXT = DESCA_1XP( 2 )
00460       CSRC = DESCA_1XP( 5 )
00461       NB = DESCA_1XP( 4 )
00462       LLDA = DESCA_1XP( 6 )
00463       STORE_N_A = DESCA_1XP( 3 )
00464       LLDB = DESCB_PX1( 6 )
00465       STORE_M_B = DESCB_PX1( 3 )
00466 *
00467 *     Get grid parameters
00468 *
00469 *
00470 *     Pre-calculate bw^2
00471 *
00472       MBW2 = BW * BW
00473 *
00474       CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
00475       NP = NPROW * NPCOL
00476 *
00477 *
00478 *
00479       IF( LSAME( UPLO, 'U' ) ) THEN
00480          IDUM1 = ICHAR( 'U' )
00481       ELSE IF ( LSAME( UPLO, 'L' ) ) THEN
00482          IDUM1 = ICHAR( 'L' )
00483       ELSE
00484          INFO = -1
00485       END IF
00486 *
00487       IF( LSAME( TRANS, 'N' ) ) THEN
00488          IDUM2 = ICHAR( 'N' )
00489       ELSE IF ( LSAME( TRANS, 'C' ) ) THEN
00490          IDUM2 = ICHAR( 'C' )
00491       ELSE
00492          INFO = -2
00493       END IF
00494 *
00495       IF( LWORK .LT. -1) THEN
00496          INFO = -14
00497       ELSE IF ( LWORK .EQ. -1 ) THEN
00498          IDUM3 = -1
00499       ELSE
00500          IDUM3 = 1
00501       ENDIF
00502 *
00503       IF( N .LT. 0 ) THEN
00504          INFO = -3
00505       ENDIF
00506 *
00507       IF( N+JA-1 .GT. STORE_N_A ) THEN
00508          INFO = -( 8*100 + 6 )
00509       ENDIF
00510 *
00511       IF(( BW .GT. N-1 ) .OR.
00512      $   ( BW .LT. 0 ) ) THEN
00513          INFO = -4
00514       ENDIF
00515 *
00516       IF( LLDA .LT. (BW+1) ) THEN
00517          INFO = -( 8*100 + 6 )
00518       ENDIF
00519 *
00520       IF( NB .LE. 0 ) THEN
00521          INFO = -( 8*100 + 4 )
00522       ENDIF
00523 *
00524       IF( N+IB-1 .GT. STORE_M_B ) THEN
00525          INFO = -( 11*100 + 3 )
00526       ENDIF
00527 *
00528       IF( LLDB .LT. NB ) THEN
00529          INFO = -( 11*100 + 6 )
00530       ENDIF
00531 *
00532       IF( NRHS .LT. 0 ) THEN
00533          INFO = -5
00534       ENDIF
00535 *
00536 *     Current alignment restriction
00537 *
00538       IF( JA .NE. IB) THEN
00539          INFO = -7
00540       ENDIF
00541 *
00542 *     Argument checking that is specific to Divide & Conquer routine
00543 *
00544       IF( NPROW .NE. 1 ) THEN
00545          INFO = -( 8*100+2 )
00546       ENDIF
00547 *
00548       IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN
00549          INFO = -( 3 )
00550          CALL PXERBLA( ICTXT,
00551      $      'PCPBTRSV, D&C alg.: only 1 block per proc',
00552      $      -INFO )
00553          RETURN
00554       ENDIF
00555 *
00556       IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*BW )) THEN
00557          INFO = -( 8*100+4 )
00558          CALL PXERBLA( ICTXT,
00559      $      'PCPBTRSV, D&C alg.: NB too small',
00560      $      -INFO )
00561          RETURN
00562       ENDIF
00563 *
00564 *
00565       WORK_SIZE_MIN =
00566      $           BW*NRHS
00567 *
00568       WORK( 1 ) = WORK_SIZE_MIN
00569 *
00570       IF( LWORK .LT. WORK_SIZE_MIN ) THEN
00571          IF( LWORK .NE. -1 ) THEN
00572          INFO = -14
00573          CALL PXERBLA( ICTXT,
00574      $      'PCPBTRSV: worksize error',
00575      $      -INFO )
00576          ENDIF
00577          RETURN
00578       ENDIF
00579 *
00580 *     Pack params and positions into arrays for global consistency check
00581 *
00582       PARAM_CHECK( 17, 1 ) = DESCB(5)
00583       PARAM_CHECK( 16, 1 ) = DESCB(4)
00584       PARAM_CHECK( 15, 1 ) = DESCB(3)
00585       PARAM_CHECK( 14, 1 ) = DESCB(2)
00586       PARAM_CHECK( 13, 1 ) = DESCB(1)
00587       PARAM_CHECK( 12, 1 ) = IB
00588       PARAM_CHECK( 11, 1 ) = DESCA(5)
00589       PARAM_CHECK( 10, 1 ) = DESCA(4)
00590       PARAM_CHECK(  9, 1 ) = DESCA(3)
00591       PARAM_CHECK(  8, 1 ) = DESCA(1)
00592       PARAM_CHECK(  7, 1 ) = JA
00593       PARAM_CHECK(  6, 1 ) = NRHS
00594       PARAM_CHECK(  5, 1 ) = BW
00595       PARAM_CHECK(  4, 1 ) = N
00596       PARAM_CHECK(  3, 1 ) = IDUM3
00597       PARAM_CHECK(  2, 1 ) = IDUM2
00598       PARAM_CHECK(  1, 1 ) = IDUM1
00599 *
00600       PARAM_CHECK( 17, 2 ) = 1105
00601       PARAM_CHECK( 16, 2 ) = 1104
00602       PARAM_CHECK( 15, 2 ) = 1103
00603       PARAM_CHECK( 14, 2 ) = 1102
00604       PARAM_CHECK( 13, 2 ) = 1101
00605       PARAM_CHECK( 12, 2 ) = 10
00606       PARAM_CHECK( 11, 2 ) = 805
00607       PARAM_CHECK( 10, 2 ) = 804
00608       PARAM_CHECK(  9, 2 ) = 803
00609       PARAM_CHECK(  8, 2 ) = 801
00610       PARAM_CHECK(  7, 2 ) = 7
00611       PARAM_CHECK(  6, 2 ) = 5
00612       PARAM_CHECK(  5, 2 ) = 4
00613       PARAM_CHECK(  4, 2 ) = 3
00614       PARAM_CHECK(  3, 2 ) = 14
00615       PARAM_CHECK(  2, 2 ) = 2
00616       PARAM_CHECK(  1, 2 ) = 1
00617 *
00618 *     Want to find errors with MIN( ), so if no error, set it to a big
00619 *     number. If there already is an error, multiply by the the
00620 *     descriptor multiplier.
00621 *
00622       IF( INFO.GE.0 ) THEN
00623          INFO = BIGNUM
00624       ELSE IF( INFO.LT.-DESCMULT ) THEN
00625          INFO = -INFO
00626       ELSE
00627          INFO = -INFO * DESCMULT
00628       END IF
00629 *
00630 *     Check consistency across processors
00631 *
00632       CALL GLOBCHK( ICTXT, 17, PARAM_CHECK, 17,
00633      $              PARAM_CHECK( 1, 3 ), INFO )
00634 *
00635 *     Prepare output: set info = 0 if no error, and divide by DESCMULT
00636 *     if error is not in a descriptor entry.
00637 *
00638       IF( INFO.EQ.BIGNUM ) THEN
00639          INFO = 0
00640       ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN
00641          INFO = -INFO / DESCMULT
00642       ELSE
00643          INFO = -INFO
00644       END IF
00645 *
00646       IF( INFO.LT.0 ) THEN
00647          CALL PXERBLA( ICTXT, 'PCPBTRSV', -INFO )
00648          RETURN
00649       END IF
00650 *
00651 *     Quick return if possible
00652 *
00653       IF( N.EQ.0 )
00654      $   RETURN
00655 *
00656       IF( NRHS.EQ.0 )
00657      $   RETURN
00658 *
00659 *
00660 *     Adjust addressing into matrix space to properly get into
00661 *        the beginning part of the relevant data
00662 *
00663       PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) )
00664 *
00665       IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN
00666          PART_OFFSET = PART_OFFSET + NB
00667       ENDIF
00668 *
00669       IF ( MYCOL .LT. CSRC ) THEN
00670          PART_OFFSET = PART_OFFSET - NB
00671       ENDIF
00672 *
00673 *     Form a new BLACS grid (the "standard form" grid) with only procs
00674 *        holding part of the matrix, of size 1xNP where NP is adjusted,
00675 *        starting at csrc=0, with JA modified to reflect dropped procs.
00676 *
00677 *     First processor to hold part of the matrix:
00678 *
00679       FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL )
00680 *
00681 *     Calculate new JA one while dropping off unused processors.
00682 *
00683       JA_NEW = MOD( JA-1, NB ) + 1
00684 *
00685 *     Save and compute new value of NP
00686 *
00687       NP_SAVE = NP
00688       NP = ( JA_NEW+N-2 )/NB + 1
00689 *
00690 *     Call utility routine that forms "standard-form" grid
00691 *
00692       CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE,
00693      $              FIRST_PROC, INT_ONE, NP )
00694 *
00695 *     Use new context from standard grid as context.
00696 *
00697       ICTXT_SAVE = ICTXT
00698       ICTXT = ICTXT_NEW
00699       DESCA_1XP( 2 ) = ICTXT_NEW
00700       DESCB_PX1( 2 ) = ICTXT_NEW
00701 *
00702 *     Get information about new grid.
00703 *
00704       CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
00705 *
00706 *     Drop out processors that do not have part of the matrix.
00707 *
00708       IF( MYROW .LT. 0 ) THEN
00709          GOTO 1234
00710       ENDIF
00711 *
00712 *     ********************************
00713 *     Values reused throughout routine
00714 *
00715 *     User-input value of partition size
00716 *
00717       PART_SIZE = NB
00718 *
00719 *     Number of columns in each processor
00720 *
00721       MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL )
00722 *
00723 *     Offset in columns to beginning of main partition in each proc
00724 *
00725       IF ( MYCOL .EQ. 0 ) THEN
00726         PART_OFFSET = PART_OFFSET+MOD( JA_NEW-1, PART_SIZE )
00727         MY_NUM_COLS = MY_NUM_COLS - MOD(JA_NEW-1, PART_SIZE )
00728       ENDIF
00729 *
00730 *     Offset in elements
00731 *
00732       OFST = PART_OFFSET*LLDA
00733 *
00734 *     Size of main (or odd) partition in each processor
00735 *
00736       ODD_SIZE = MY_NUM_COLS
00737       IF ( MYCOL .LT. NP-1 ) THEN
00738          ODD_SIZE = ODD_SIZE - BW
00739       ENDIF
00740 *
00741 *
00742 *
00743 *     Begin main code
00744 *
00745       IF ( LSAME( UPLO, 'L' ) ) THEN
00746 *
00747       IF ( LSAME( TRANS, 'N' ) ) THEN
00748 *
00749 *        Frontsolve
00750 *
00751 *
00752 ******************************************
00753 *       Local computation phase
00754 ******************************************
00755 *
00756 *       Use main partition in each processor to solve locally
00757 *
00758         CALL CTBTRS( UPLO, 'N', 'N', ODD_SIZE,
00759      $                    BW, NRHS,
00760      $                    A( OFST+1 ), LLDA,
00761      $                    B( PART_OFFSET+1 ), LLDB, INFO )
00762 *
00763 *
00764         IF ( MYCOL .LT. NP-1 ) THEN
00765 *         Use factorization of odd-even connection block to modify
00766 *           locally stored portion of right hand side(s)
00767 *
00768 *
00769 *           First copy and multiply it into temporary storage,
00770 *             then use it on RHS
00771 *
00772             CALL CLAMOV( 'N', BW, NRHS,
00773      $                B( PART_OFFSET+ODD_SIZE-BW+1), LLDB,
00774      $                WORK( 1 ), BW )
00775 *
00776             CALL CTRMM( 'L', 'U', 'N', 'N', BW, NRHS, -CONE,
00777      $                  A(( OFST+(BW+1)+(ODD_SIZE-BW)*LLDA )), LLDA-1,
00778      $                  WORK( 1 ), BW )
00779 *
00780             CALL CMATADD( BW, NRHS, CONE, WORK( 1 ), BW,
00781      $                CONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB )
00782 *
00783         ENDIF
00784 *
00785 *
00786         IF ( MYCOL .NE. 0 ) THEN
00787 *         Use the "spike" fillin to calculate contribution to previous
00788 *           processor's righthand-side.
00789 *
00790             CALL CGEMM( 'C', 'N', BW, NRHS, ODD_SIZE, -CONE, AF( 1 ),
00791      $                  ODD_SIZE, B( PART_OFFSET+1 ), LLDB, CZERO,
00792      $                  WORK( 1+BW-BW ), BW )
00793         ENDIF
00794 *
00795 *
00796 ************************************************
00797 *       Formation and solution of reduced system
00798 ************************************************
00799 *
00800 *
00801 *       Send modifications to prior processor's right hand sides
00802 *
00803         IF( MYCOL .GT. 0) THEN
00804 *
00805           CALL CGESD2D( ICTXT, BW, NRHS,
00806      $                     WORK( 1 ), BW,
00807      $                     0, MYCOL - 1 )
00808 *
00809         ENDIF
00810 *
00811 *       Receive modifications to processor's right hand sides
00812 *
00813         IF( MYCOL .LT. NPCOL-1) THEN
00814 *
00815           CALL CGERV2D( ICTXT, BW, NRHS,
00816      $                     WORK( 1 ), BW,
00817      $                     0, MYCOL + 1 )
00818 *
00819 *         Combine contribution to locally stored right hand sides
00820 *
00821           CALL CMATADD( BW, NRHS, CONE,
00822      $                WORK( 1 ), BW, CONE,
00823      $                B( PART_OFFSET+ODD_SIZE + 1 ), LLDB )
00824 *
00825         ENDIF
00826 *
00827 *
00828 *       The last processor does not participate in the solution of the
00829 *       reduced system, having sent its contribution already.
00830         IF( MYCOL .EQ. NPCOL-1 ) THEN
00831           GOTO 14
00832         ENDIF
00833 *
00834 *
00835 *       *************************************
00836 *       Modification Loop
00837 *
00838 *       The distance for sending and receiving for each level starts
00839 *         at 1 for the first level.
00840         LEVEL_DIST = 1
00841 *
00842 *       Do until this proc is needed to modify other procs' equations
00843 *
00844    12   CONTINUE
00845         IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 11
00846 *
00847 *         Receive and add contribution to righthand sides from left
00848 *
00849           IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN
00850 *
00851             CALL CGERV2D( ICTXT, BW, NRHS,
00852      $                     WORK( 1 ),
00853      $                     BW, 0, MYCOL-LEVEL_DIST )
00854 *
00855             CALL CMATADD( BW, NRHS, CONE,
00856      $                WORK( 1 ), BW, CONE,
00857      $                B( PART_OFFSET+ODD_SIZE + 1 ), LLDB )
00858 *
00859           ENDIF
00860 *
00861 *         Receive and add contribution to righthand sides from right
00862 *
00863           IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN
00864 *
00865             CALL CGERV2D( ICTXT, BW, NRHS,
00866      $                     WORK( 1 ),
00867      $                     BW, 0, MYCOL+LEVEL_DIST )
00868 *
00869             CALL CMATADD( BW, NRHS, CONE,
00870      $                WORK( 1 ), BW, CONE,
00871      $                B( PART_OFFSET+ODD_SIZE + 1 ), LLDB )
00872 *
00873           ENDIF
00874 *
00875           LEVEL_DIST = LEVEL_DIST*2
00876 *
00877         GOTO 12
00878    11   CONTINUE
00879 *       [End of GOTO Loop]
00880 *
00881 *
00882 *
00883 *       *********************************
00884 *       Calculate and use this proc's blocks to modify other procs
00885 *
00886 *       Solve with diagonal block
00887 *
00888         CALL CTRTRS( 'L', 'N', 'N', BW, NRHS, AF( ODD_SIZE*BW+MBW2+1 ),
00889      $               BW, B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO )
00890 *
00891         IF( INFO.NE.0 ) THEN
00892           GO TO 1000
00893         ENDIF
00894 *
00895 *
00896 *
00897 *       *********
00898         IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN
00899 *
00900 *         Calculate contribution from this block to next diagonal block
00901 *
00902           CALL CGEMM( 'C', 'N', BW, NRHS, BW, -CONE,
00903      $                     AF( (ODD_SIZE)*BW+1 ),
00904      $                     BW,
00905      $                     B( PART_OFFSET+ODD_SIZE+1 ),
00906      $                     LLDB, CZERO,
00907      $                     WORK( 1 ),
00908      $                     BW )
00909 *
00910 *         Send contribution to diagonal block's owning processor.
00911 *
00912           CALL CGESD2D( ICTXT, BW, NRHS,
00913      $                     WORK( 1 ),
00914      $                     BW, 0, MYCOL+LEVEL_DIST )
00915 *
00916         ENDIF
00917 *       End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
00918 *
00919 *       ************
00920         IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND.
00921      $      ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN
00922 *
00923 *
00924 *         Use offdiagonal block to calculate modification to diag block
00925 *           of processor to the left
00926 *
00927           CALL CGEMM( 'N', 'N', BW, NRHS, BW, -CONE,
00928      $                     AF( ODD_SIZE*BW+2*MBW2+1 ),
00929      $                     BW,
00930      $                     B( PART_OFFSET+ODD_SIZE+1 ),
00931      $                     LLDB, CZERO,
00932      $                     WORK( 1 ),
00933      $                     BW )
00934 *
00935 *         Send contribution to diagonal block's owning processor.
00936 *
00937           CALL CGESD2D( ICTXT, BW, NRHS,
00938      $                     WORK( 1 ),
00939      $                     BW, 0, MYCOL-LEVEL_DIST )
00940 *
00941         ENDIF
00942 *       End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
00943 *
00944    14  CONTINUE
00945 *
00946       ELSE
00947 *
00948 ******************** BACKSOLVE *************************************
00949 *
00950 ********************************************************************
00951 *     .. Begin reduced system phase of algorithm ..
00952 ********************************************************************
00953 *
00954 *
00955 *
00956 *       The last processor does not participate in the solution of the
00957 *       reduced system and just waits to receive its solution.
00958         IF( MYCOL .EQ. NPCOL-1 ) THEN
00959           GOTO 24
00960         ENDIF
00961 *
00962 *       Determine number of steps in tree loop
00963 *
00964         LEVEL_DIST = 1
00965    27   CONTINUE
00966         IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 26
00967 *
00968           LEVEL_DIST = LEVEL_DIST*2
00969 *
00970         GOTO 27
00971    26   CONTINUE
00972 *
00973 *
00974         IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND.
00975      $      ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN
00976 *
00977 *         Receive solution from processor to left
00978 *
00979           CALL CGERV2D( ICTXT, BW, NRHS,
00980      $                     WORK( 1 ),
00981      $                     BW, 0, MYCOL-LEVEL_DIST )
00982 *
00983 *
00984 *         Use offdiagonal block to calculate modification to RHS stored
00985 *           on this processor
00986 *
00987           CALL CGEMM( 'C', 'N', BW, NRHS, BW, -CONE,
00988      $                     AF( ODD_SIZE*BW+2*MBW2+1 ),
00989      $                     BW,
00990      $                     WORK( 1 ),
00991      $                     BW, CONE,
00992      $                     B( PART_OFFSET+ODD_SIZE+1 ),
00993      $                     LLDB )
00994         ENDIF
00995 *       End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
00996 *
00997 *
00998         IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN
00999 *
01000 *         Receive solution from processor to right
01001 *
01002           CALL CGERV2D( ICTXT, BW, NRHS,
01003      $                     WORK( 1 ),
01004      $                     BW, 0, MYCOL+LEVEL_DIST )
01005 *
01006 *         Calculate contribution from this block to next diagonal block
01007 *
01008           CALL CGEMM( 'N', 'N', BW, NRHS, BW, -CONE,
01009      $                     AF( (ODD_SIZE)*BW+1 ),
01010      $                     BW,
01011      $                     WORK( 1 ),
01012      $                     BW, CONE,
01013      $                     B( PART_OFFSET+ODD_SIZE+1 ),
01014      $                     LLDB )
01015 *
01016         ENDIF
01017 *       End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
01018 *
01019 *
01020 *       Solve with diagonal block
01021 *
01022         CALL CTRTRS( 'L', 'C', 'N', BW, NRHS, AF( ODD_SIZE*BW+MBW2+1 ),
01023      $               BW, B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO )
01024 *
01025         IF( INFO.NE.0 ) THEN
01026           GO TO 1000
01027         ENDIF
01028 *
01029 *
01030 *
01031 ***Modification Loop *******
01032 *
01033    22   CONTINUE
01034         IF( LEVEL_DIST .EQ. 1 ) GOTO 21
01035 *
01036           LEVEL_DIST = LEVEL_DIST/2
01037 *
01038 *         Send solution to the right
01039 *
01040           IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN
01041 *
01042           CALL CGESD2D( ICTXT, BW, NRHS,
01043      $                     B( PART_OFFSET+ODD_SIZE+1 ),
01044      $                     LLDB, 0, MYCOL+LEVEL_DIST )
01045 *
01046           ENDIF
01047 *
01048 *         Send solution to left
01049 *
01050           IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN
01051 *
01052           CALL CGESD2D( ICTXT, BW, NRHS,
01053      $                     B( PART_OFFSET+ODD_SIZE+1 ),
01054      $                     LLDB, 0, MYCOL-LEVEL_DIST )
01055 *
01056           ENDIF
01057 *
01058         GOTO 22
01059    21   CONTINUE
01060 *       [End of GOTO Loop]
01061 *
01062    24   CONTINUE
01063 *          [Processor npcol - 1 jumped to here to await next stage]
01064 *
01065 *******************************
01066 *       Reduced system has been solved, communicate solutions to nearest
01067 *         neighbors in preparation for local computation phase.
01068 *
01069 *
01070 *       Send elements of solution to next proc
01071 *
01072         IF( MYCOL .LT. NPCOL-1) THEN
01073 *
01074           CALL CGESD2D( ICTXT, BW, NRHS,
01075      $                     B( PART_OFFSET+ODD_SIZE+1 ), LLDB,
01076      $                     0, MYCOL +1 )
01077 *
01078         ENDIF
01079 *
01080 *       Receive modifications to processor's right hand sides
01081 *
01082         IF( MYCOL .GT. 0) THEN
01083 *
01084           CALL CGERV2D( ICTXT, BW, NRHS,
01085      $                     WORK( 1 ), BW,
01086      $                     0, MYCOL - 1 )
01087 *
01088         ENDIF
01089 *
01090 *
01091 *
01092 **********************************************
01093 *       Local computation phase
01094 **********************************************
01095 *
01096         IF ( MYCOL .NE. 0 ) THEN
01097 *         Use the "spike" fillin to calculate contribution from previous
01098 *           processor's solution.
01099 *
01100           CALL CGEMM( 'N', 'N', ODD_SIZE, NRHS, BW, -CONE, AF( 1 ),
01101      $                ODD_SIZE, WORK( 1+BW-BW ), BW, CONE,
01102      $                B( PART_OFFSET+1 ), LLDB )
01103 *
01104         ENDIF
01105 *
01106 *
01107         IF ( MYCOL .LT. NP-1 ) THEN
01108 *         Use factorization of odd-even connection block to modify
01109 *           locally stored portion of right hand side(s)
01110 *
01111 *
01112 *         First copy and multiply it into temporary storage,
01113 *           then use it on RHS
01114 *
01115           CALL CLAMOV( 'N', BW, NRHS, B( PART_OFFSET+ODD_SIZE+1), LLDB,
01116      $                 WORK( 1+BW-BW ), BW )
01117 *
01118           CALL CTRMM( 'L', 'U', 'C', 'N', BW, NRHS, -CONE,
01119      $                A(( OFST+(BW+1)+(ODD_SIZE-BW)*LLDA )), LLDA-1,
01120      $                WORK( 1+BW-BW ), BW )
01121 *
01122           CALL CMATADD( BW, NRHS, CONE, WORK( 1+BW-BW ), BW, CONE,
01123      $                  B( PART_OFFSET+ODD_SIZE-BW+1 ), LLDB )
01124 *
01125         ENDIF
01126 *
01127 *       Use main partition in each processor to solve locally
01128 *
01129         CALL CTBTRS( UPLO, 'C', 'N', ODD_SIZE,
01130      $                    BW, NRHS,
01131      $                    A( OFST+1 ),
01132      $                    LLDA,  B( PART_OFFSET+1 ),
01133      $                    LLDB, INFO )
01134 *
01135       ENDIF
01136 *     End of "IF( LSAME( TRANS, 'N' ) )"...
01137 *
01138 *
01139       ELSE
01140 ***************************************************************
01141 *     CASE UPLO = 'U'                                         *
01142 ***************************************************************
01143       IF ( LSAME( TRANS, 'C' ) ) THEN
01144 *
01145 *        Frontsolve
01146 *
01147 *
01148 ******************************************
01149 *       Local computation phase
01150 ******************************************
01151 *
01152 *       Use main partition in each processor to solve locally
01153 *
01154         CALL CTBTRS( UPLO, 'C', 'N', ODD_SIZE,
01155      $                    BW, NRHS,
01156      $                    A( OFST+1 ), LLDA,
01157      $                    B( PART_OFFSET+1 ), LLDB, INFO )
01158 *
01159 *
01160         IF ( MYCOL .LT. NP-1 ) THEN
01161 *         Use factorization of odd-even connection block to modify
01162 *           locally stored portion of right hand side(s)
01163 *
01164 *
01165 *           First copy and multiply it into temporary storage,
01166 *             then use it on RHS
01167 *
01168             CALL CLAMOV( 'N', BW, NRHS,
01169      $                B( PART_OFFSET+ODD_SIZE-BW+1), LLDB,
01170      $                WORK( 1 ), BW )
01171 *
01172             CALL CTRMM( 'L', 'L', 'C', 'N', BW, NRHS, -CONE,
01173      $                  A(( OFST+1+ODD_SIZE*LLDA )), LLDA-1, WORK( 1 ),
01174      $                  BW )
01175 *
01176             CALL CMATADD( BW, NRHS, CONE, WORK( 1 ), BW,
01177      $                CONE, B( PART_OFFSET+ODD_SIZE+1 ), LLDB )
01178 *
01179         ENDIF
01180 *
01181 *
01182         IF ( MYCOL .NE. 0 ) THEN
01183 *         Use the "spike" fillin to calculate contribution to previous
01184 *           processor's righthand-side.
01185 *
01186             CALL CGEMM( 'C', 'N', BW, NRHS, ODD_SIZE, -CONE, AF( 1 ),
01187      $                  ODD_SIZE, B( PART_OFFSET+1 ), LLDB, CZERO,
01188      $                  WORK( 1+BW-BW ), BW )
01189         ENDIF
01190 *
01191 *
01192 ************************************************
01193 *       Formation and solution of reduced system
01194 ************************************************
01195 *
01196 *
01197 *       Send modifications to prior processor's right hand sides
01198 *
01199         IF( MYCOL .GT. 0) THEN
01200 *
01201           CALL CGESD2D( ICTXT, BW, NRHS,
01202      $                     WORK( 1 ), BW,
01203      $                     0, MYCOL - 1 )
01204 *
01205         ENDIF
01206 *
01207 *       Receive modifications to processor's right hand sides
01208 *
01209         IF( MYCOL .LT. NPCOL-1) THEN
01210 *
01211           CALL CGERV2D( ICTXT, BW, NRHS,
01212      $                     WORK( 1 ), BW,
01213      $                     0, MYCOL + 1 )
01214 *
01215 *         Combine contribution to locally stored right hand sides
01216 *
01217           CALL CMATADD( BW, NRHS, CONE,
01218      $                WORK( 1 ), BW, CONE,
01219      $                B( PART_OFFSET+ODD_SIZE + 1 ), LLDB )
01220 *
01221         ENDIF
01222 *
01223 *
01224 *       The last processor does not participate in the solution of the
01225 *       reduced system, having sent its contribution already.
01226         IF( MYCOL .EQ. NPCOL-1 ) THEN
01227           GOTO 44
01228         ENDIF
01229 *
01230 *
01231 *       *************************************
01232 *       Modification Loop
01233 *
01234 *       The distance for sending and receiving for each level starts
01235 *         at 1 for the first level.
01236         LEVEL_DIST = 1
01237 *
01238 *       Do until this proc is needed to modify other procs' equations
01239 *
01240    42   CONTINUE
01241         IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 41
01242 *
01243 *         Receive and add contribution to righthand sides from left
01244 *
01245           IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN
01246 *
01247             CALL CGERV2D( ICTXT, BW, NRHS,
01248      $                     WORK( 1 ),
01249      $                     BW, 0, MYCOL-LEVEL_DIST )
01250 *
01251             CALL CMATADD( BW, NRHS, CONE,
01252      $                WORK( 1 ), BW, CONE,
01253      $                B( PART_OFFSET+ODD_SIZE + 1 ), LLDB )
01254 *
01255           ENDIF
01256 *
01257 *         Receive and add contribution to righthand sides from right
01258 *
01259           IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN
01260 *
01261             CALL CGERV2D( ICTXT, BW, NRHS,
01262      $                     WORK( 1 ),
01263      $                     BW, 0, MYCOL+LEVEL_DIST )
01264 *
01265             CALL CMATADD( BW, NRHS, CONE,
01266      $                WORK( 1 ), BW, CONE,
01267      $                B( PART_OFFSET+ODD_SIZE + 1 ), LLDB )
01268 *
01269           ENDIF
01270 *
01271           LEVEL_DIST = LEVEL_DIST*2
01272 *
01273         GOTO 42
01274    41   CONTINUE
01275 *       [End of GOTO Loop]
01276 *
01277 *
01278 *
01279 *       *********************************
01280 *       Calculate and use this proc's blocks to modify other procs
01281 *
01282 *       Solve with diagonal block
01283 *
01284         CALL CTRTRS( 'L', 'N', 'N', BW, NRHS, AF( ODD_SIZE*BW+MBW2+1 ),
01285      $               BW, B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO )
01286 *
01287         IF( INFO.NE.0 ) THEN
01288           GO TO 1000
01289         ENDIF
01290 *
01291 *
01292 *
01293 *       *********
01294         IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN
01295 *
01296 *         Calculate contribution from this block to next diagonal block
01297 *
01298           CALL CGEMM( 'C', 'N', BW, NRHS, BW, -CONE,
01299      $                     AF( (ODD_SIZE)*BW+1 ),
01300      $                     BW,
01301      $                     B( PART_OFFSET+ODD_SIZE+1 ),
01302      $                     LLDB, CZERO,
01303      $                     WORK( 1 ),
01304      $                     BW )
01305 *
01306 *         Send contribution to diagonal block's owning processor.
01307 *
01308           CALL CGESD2D( ICTXT, BW, NRHS,
01309      $                     WORK( 1 ),
01310      $                     BW, 0, MYCOL+LEVEL_DIST )
01311 *
01312         ENDIF
01313 *       End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
01314 *
01315 *       ************
01316         IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND.
01317      $      ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN
01318 *
01319 *
01320 *         Use offdiagonal block to calculate modification to diag block
01321 *           of processor to the left
01322 *
01323           CALL CGEMM( 'N', 'N', BW, NRHS, BW, -CONE,
01324      $                     AF( ODD_SIZE*BW+2*MBW2+1 ),
01325      $                     BW,
01326      $                     B( PART_OFFSET+ODD_SIZE+1 ),
01327      $                     LLDB, CZERO,
01328      $                     WORK( 1 ),
01329      $                     BW )
01330 *
01331 *         Send contribution to diagonal block's owning processor.
01332 *
01333           CALL CGESD2D( ICTXT, BW, NRHS,
01334      $                     WORK( 1 ),
01335      $                     BW, 0, MYCOL-LEVEL_DIST )
01336 *
01337         ENDIF
01338 *       End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
01339 *
01340    44  CONTINUE
01341 *
01342       ELSE
01343 *
01344 ******************** BACKSOLVE *************************************
01345 *
01346 ********************************************************************
01347 *     .. Begin reduced system phase of algorithm ..
01348 ********************************************************************
01349 *
01350 *
01351 *
01352 *       The last processor does not participate in the solution of the
01353 *       reduced system and just waits to receive its solution.
01354         IF( MYCOL .EQ. NPCOL-1 ) THEN
01355           GOTO 54
01356         ENDIF
01357 *
01358 *       Determine number of steps in tree loop
01359 *
01360         LEVEL_DIST = 1
01361    57   CONTINUE
01362         IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 56
01363 *
01364           LEVEL_DIST = LEVEL_DIST*2
01365 *
01366         GOTO 57
01367    56   CONTINUE
01368 *
01369 *
01370         IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND.
01371      $      ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN
01372 *
01373 *         Receive solution from processor to left
01374 *
01375           CALL CGERV2D( ICTXT, BW, NRHS,
01376      $                     WORK( 1 ),
01377      $                     BW, 0, MYCOL-LEVEL_DIST )
01378 *
01379 *
01380 *         Use offdiagonal block to calculate modification to RHS stored
01381 *           on this processor
01382 *
01383           CALL CGEMM( 'C', 'N', BW, NRHS, BW, -CONE,
01384      $                     AF( ODD_SIZE*BW+2*MBW2+1 ),
01385      $                     BW,
01386      $                     WORK( 1 ),
01387      $                     BW, CONE,
01388      $                     B( PART_OFFSET+ODD_SIZE+1 ),
01389      $                     LLDB )
01390         ENDIF
01391 *       End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
01392 *
01393 *
01394         IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN
01395 *
01396 *         Receive solution from processor to right
01397 *
01398           CALL CGERV2D( ICTXT, BW, NRHS,
01399      $                     WORK( 1 ),
01400      $                     BW, 0, MYCOL+LEVEL_DIST )
01401 *
01402 *         Calculate contribution from this block to next diagonal block
01403 *
01404           CALL CGEMM( 'N', 'N', BW, NRHS, BW, -CONE,
01405      $                     AF( (ODD_SIZE)*BW+1 ),
01406      $                     BW,
01407      $                     WORK( 1 ),
01408      $                     BW, CONE,
01409      $                     B( PART_OFFSET+ODD_SIZE+1 ),
01410      $                     LLDB )
01411 *
01412         ENDIF
01413 *       End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
01414 *
01415 *
01416 *       Solve with diagonal block
01417 *
01418         CALL CTRTRS( 'L', 'C', 'N', BW, NRHS, AF( ODD_SIZE*BW+MBW2+1 ),
01419      $               BW, B( PART_OFFSET+ODD_SIZE+1 ), LLDB, INFO )
01420 *
01421         IF( INFO.NE.0 ) THEN
01422           GO TO 1000
01423         ENDIF
01424 *
01425 *
01426 *
01427 ***Modification Loop *******
01428 *
01429    52   CONTINUE
01430         IF( LEVEL_DIST .EQ. 1 ) GOTO 51
01431 *
01432           LEVEL_DIST = LEVEL_DIST/2
01433 *
01434 *         Send solution to the right
01435 *
01436           IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN
01437 *
01438           CALL CGESD2D( ICTXT, BW, NRHS,
01439      $                     B( PART_OFFSET+ODD_SIZE+1 ),
01440      $                     LLDB, 0, MYCOL+LEVEL_DIST )
01441 *
01442           ENDIF
01443 *
01444 *         Send solution to left
01445 *
01446           IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN
01447 *
01448           CALL CGESD2D( ICTXT, BW, NRHS,
01449      $                     B( PART_OFFSET+ODD_SIZE+1 ),
01450      $                     LLDB, 0, MYCOL-LEVEL_DIST )
01451 *
01452           ENDIF
01453 *
01454         GOTO 52
01455    51   CONTINUE
01456 *       [End of GOTO Loop]
01457 *
01458    54   CONTINUE
01459 *          [Processor npcol - 1 jumped to here to await next stage]
01460 *
01461 *******************************
01462 *       Reduced system has been solved, communicate solutions to nearest
01463 *         neighbors in preparation for local computation phase.
01464 *
01465 *
01466 *       Send elements of solution to next proc
01467 *
01468         IF( MYCOL .LT. NPCOL-1) THEN
01469 *
01470           CALL CGESD2D( ICTXT, BW, NRHS,
01471      $                     B( PART_OFFSET+ODD_SIZE+1 ), LLDB,
01472      $                     0, MYCOL +1 )
01473 *
01474         ENDIF
01475 *
01476 *       Receive modifications to processor's right hand sides
01477 *
01478         IF( MYCOL .GT. 0) THEN
01479 *
01480           CALL CGERV2D( ICTXT, BW, NRHS,
01481      $                     WORK( 1 ), BW,
01482      $                     0, MYCOL - 1 )
01483 *
01484         ENDIF
01485 *
01486 *
01487 *
01488 **********************************************
01489 *       Local computation phase
01490 **********************************************
01491 *
01492         IF ( MYCOL .NE. 0 ) THEN
01493 *         Use the "spike" fillin to calculate contribution from previous
01494 *           processor's solution.
01495 *
01496           CALL CGEMM( 'N', 'N', ODD_SIZE, NRHS, BW, -CONE, AF( 1 ),
01497      $                ODD_SIZE, WORK( 1+BW-BW ), BW, CONE,
01498      $                B( PART_OFFSET+1 ), LLDB )
01499 *
01500         ENDIF
01501 *
01502 *
01503         IF ( MYCOL .LT. NP-1 ) THEN
01504 *         Use factorization of odd-even connection block to modify
01505 *           locally stored portion of right hand side(s)
01506 *
01507 *
01508 *         First copy and multiply it into temporary storage,
01509 *           then use it on RHS
01510 *
01511           CALL CLAMOV( 'N', BW, NRHS, B( PART_OFFSET+ODD_SIZE+1), LLDB,
01512      $                 WORK( 1+BW-BW ), BW )
01513 *
01514           CALL CTRMM( 'L', 'L', 'N', 'N', BW, NRHS, -CONE,
01515      $                A(( OFST+1+ODD_SIZE*LLDA )), LLDA-1,
01516      $                WORK( 1+BW-BW ), BW )
01517 *
01518           CALL CMATADD( BW, NRHS, CONE, WORK( 1+BW-BW ), BW, CONE,
01519      $                  B( PART_OFFSET+ODD_SIZE-BW+1 ), LLDB )
01520 *
01521         ENDIF
01522 *
01523 *       Use main partition in each processor to solve locally
01524 *
01525         CALL CTBTRS( UPLO, 'N', 'N', ODD_SIZE,
01526      $                    BW, NRHS,
01527      $                    A( OFST+1 ),
01528      $                    LLDA,  B( PART_OFFSET+1 ),
01529      $                    LLDB, INFO )
01530 *
01531       ENDIF
01532 *     End of "IF( LSAME( TRANS, 'N' ) )"...
01533 *
01534 *
01535       ENDIF
01536 *     End of "IF( LSAME( UPLO, 'L' ) )"...
01537  1000 CONTINUE
01538 *
01539 *
01540 *     Free BLACS space used to hold standard-form grid.
01541 *
01542       IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN
01543          CALL BLACS_GRIDEXIT( ICTXT_NEW )
01544       ENDIF
01545 *
01546  1234 CONTINUE
01547 *
01548 *     Restore saved input parameters
01549 *
01550       ICTXT = ICTXT_SAVE
01551       NP = NP_SAVE
01552 *
01553 *     Output minimum worksize
01554 *
01555       WORK( 1 ) = WORK_SIZE_MIN
01556 *
01557 *
01558       RETURN
01559 *
01560 *     End of PCPBTRSV
01561 *
01562       END