ScaLAPACK  2.0.2
ScaLAPACK: Scalable Linear Algebra PACKage
pzgbtrs.f
Go to the documentation of this file.
00001       SUBROUTINE PZGBTRS( TRANS, N, BWL, BWU, NRHS, A, JA, DESCA, IPIV,
00002      $                    B, IB, DESCB, AF, LAF, WORK, LWORK, INFO )
00003 *
00004 *  -- ScaLAPACK routine (version 2.0.2) --
00005 *     Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver
00006 *     May 1 2012
00007 *
00008 *     .. Scalar Arguments ..
00009       CHARACTER          TRANS
00010       INTEGER            BWU, BWL, IB, INFO, JA, LAF, LWORK, N, NRHS
00011 *     ..
00012 *     .. Array Arguments ..
00013       INTEGER            DESCA( * ), DESCB( * ), IPIV(*)
00014       COMPLEX*16         A( * ), AF( * ), B( * ), WORK( * )
00015 *     ..
00016 *
00017 *
00018 *  Purpose
00019 *  =======
00020 *
00021 *  PZGBTRS solves a system of linear equations
00022 *
00023 *            A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS)
00024 *                                    or
00025 *            A(1:N, JA:JA+N-1)' * X = B(IB:IB+N-1, 1:NRHS)
00026 *
00027 *  where A(1:N, JA:JA+N-1) is the matrix used to produce the factors
00028 *  stored in A(1:N,JA:JA+N-1) and AF by PZGBTRF.
00029 *  A(1:N, JA:JA+N-1) is an N-by-N complex
00030 *  banded distributed
00031 *  matrix with bandwidth BWL, BWU.
00032 *
00033 *  Routine PZGBTRF MUST be called first.
00034 *
00035 *  =====================================================================
00036 *
00037 *  Arguments
00038 *  =========
00039 *
00040 *
00041 *  TRANS   (global input) CHARACTER
00042 *          = 'N':  Solve with A(1:N, JA:JA+N-1);
00043 *          = 'C':  Solve with conjugate_transpose( A(1:N, JA:JA+N-1) );
00044 *
00045 *  N       (global input) INTEGER
00046 *          The number of rows and columns to be operated on, i.e. the
00047 *          order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
00048 *
00049 *  BWL     (global input) INTEGER
00050 *          Number of subdiagonals. 0 <= BWL <= N-1
00051 *
00052 *  BWU     (global input) INTEGER
00053 *          Number of superdiagonals. 0 <= BWU <= N-1
00054 *
00055 *  NRHS    (global input) INTEGER
00056 *          The number of right hand sides, i.e., the number of columns
00057 *          of the distributed submatrix B(IB:IB+N-1, 1:NRHS).
00058 *          NRHS >= 0.
00059 *
00060 *  A       (local input/local output) COMPLEX*16 pointer into
00061 *          local memory to an array with first dimension
00062 *          LLD_A >=(2*bwl+2*bwu+1) (stored in DESCA).
00063 *          On entry, this array contains the local pieces of the
00064 *          N-by-N unsymmetric banded distributed Cholesky factor L or
00065 *          L^T A(1:N, JA:JA+N-1).
00066 *          This local portion is stored in the packed banded format
00067 *            used in LAPACK. Please see the Notes below and the
00068 *            ScaLAPACK manual for more detail on the format of
00069 *            distributed matrices.
00070 *
00071 *  JA      (global input) INTEGER
00072 *          The index in the global array A that points to the start of
00073 *          the matrix to be operated on (which may be either all of A
00074 *          or a submatrix of A).
00075 *
00076 *  DESCA   (global and local input) INTEGER array of dimension DLEN.
00077 *          if 1D type (DTYPE_A=501), DLEN >= 7;
00078 *          if 2D type (DTYPE_A=1), DLEN >= 9 .
00079 *          The array descriptor for the distributed matrix A.
00080 *          Contains information of mapping of A to memory. Please
00081 *          see NOTES below for full description and options.
00082 *
00083 *  IPIV    (local output) INTEGER array, dimension >= DESCA( NB ).
00084 *          Pivot indices for local factorizations.
00085 *          Users *should not* alter the contents between
00086 *          factorization and solve.
00087 *
00088 *  B       (local input/local output) COMPLEX*16 pointer into
00089 *          local memory to an array of local lead dimension lld_b>=NB.
00090 *          On entry, this array contains the
00091 *          the local pieces of the right hand sides
00092 *          B(IB:IB+N-1, 1:NRHS).
00093 *          On exit, this contains the local piece of the solutions
00094 *          distributed matrix X.
00095 *
00096 *  IB      (global input) INTEGER
00097 *          The row index in the global array B that points to the first
00098 *          row of the matrix to be operated on (which may be either
00099 *          all of B or a submatrix of B).
00100 *
00101 *  DESCB   (global and local input) INTEGER array of dimension DLEN.
00102 *          if 1D type (DTYPE_B=502), DLEN >=7;
00103 *          if 2D type (DTYPE_B=1), DLEN >= 9.
00104 *          The array descriptor for the distributed matrix B.
00105 *          Contains information of mapping of B to memory. Please
00106 *          see NOTES below for full description and options.
00107 *
00108 *  AF      (local output) COMPLEX*16 array, dimension LAF.
00109 *          Auxiliary Fillin Space.
00110 *          Fillin is created during the factorization routine
00111 *          PZGBTRF and this is stored in AF. If a linear system
00112 *          is to be solved using PZGBTRS after the factorization
00113 *          routine, AF *must not be altered* after the factorization.
00114 *
00115 *  LAF     (local input) INTEGER
00116 *          Size of user-input Auxiliary Fillin space AF. Must be >=
00117 *          (NB+bwu)*(bwl+bwu)+6*(bwl+bwu)*(bwl+2*bwu)
00118 *          If LAF is not large enough, an error code will be returned
00119 *          and the minimum acceptable size will be returned in AF( 1 )
00120 *
00121 *  WORK    (local workspace/local output)
00122 *          COMPLEX*16 temporary workspace. This space may
00123 *          be overwritten in between calls to routines. WORK must be
00124 *          the size given in LWORK.
00125 *          On exit, WORK( 1 ) contains the minimal LWORK.
00126 *
00127 *  LWORK   (local input or global input) INTEGER
00128 *          Size of user-input workspace WORK.
00129 *          If LWORK is too small, the minimal acceptable size will be
00130 *          returned in WORK(1) and an error code is returned. LWORK>=
00131 *          NRHS*(NB+2*bwl+4*bwu)
00132 *
00133 *  INFO    (global output) INTEGER
00134 *          = 0:  successful exit
00135 *          < 0:  If the i-th argument is an array and the j-entry had
00136 *                an illegal value, then INFO = -(i*100+j), if the i-th
00137 *                argument is a scalar and had an illegal value, then
00138 *                INFO = -i.
00139 *
00140 *  =====================================================================
00141 *
00142 *
00143 *  Restrictions
00144 *  ============
00145 *
00146 *  The following are restrictions on the input parameters. Some of these
00147 *    are temporary and will be removed in future releases, while others
00148 *    may reflect fundamental technical limitations.
00149 *
00150 *    Non-cyclic restriction: VERY IMPORTANT!
00151 *      P*NB>= mod(JA-1,NB)+N.
00152 *      The mapping for matrices must be blocked, reflecting the nature
00153 *      of the divide and conquer algorithm as a task-parallel algorithm.
00154 *      This formula in words is: no processor may have more than one
00155 *      chunk of the matrix.
00156 *
00157 *    Blocksize cannot be too small:
00158 *      If the matrix spans more than one processor, the following
00159 *      restriction on NB, the size of each block on each processor,
00160 *      must hold:
00161 *      NB >= (BWL+BWU)+1
00162 *      The bulk of parallel computation is done on the matrix of size
00163 *      O(NB) on each processor. If this is too small, divide and conquer
00164 *      is a poor choice of algorithm.
00165 *
00166 *    Submatrix reference:
00167 *      JA = IB
00168 *      Alignment restriction that prevents unnecessary communication.
00169 *
00170 *
00171 *  =====================================================================
00172 *
00173 *
00174 *  Notes
00175 *  =====
00176 *
00177 *  If the factorization routine and the solve routine are to be called
00178 *    separately (to solve various sets of righthand sides using the same
00179 *    coefficient matrix), the auxiliary space AF *must not be altered*
00180 *    between calls to the factorization routine and the solve routine.
00181 *
00182 *  The best algorithm for solving banded and tridiagonal linear systems
00183 *    depends on a variety of parameters, especially the bandwidth.
00184 *    Currently, only algorithms designed for the case N/P >> bw are
00185 *    implemented. These go by many names, including Divide and Conquer,
00186 *    Partitioning, domain decomposition-type, etc.
00187 *
00188 *  Algorithm description: Divide and Conquer
00189 *
00190 *    The Divide and Conqer algorithm assumes the matrix is narrowly
00191 *      banded compared with the number of equations. In this situation,
00192 *      it is best to distribute the input matrix A one-dimensionally,
00193 *      with columns atomic and rows divided amongst the processes.
00194 *      The basic algorithm divides the banded matrix up into
00195 *      P pieces with one stored on each processor,
00196 *      and then proceeds in 2 phases for the factorization or 3 for the
00197 *      solution of a linear system.
00198 *      1) Local Phase:
00199 *         The individual pieces are factored independently and in
00200 *         parallel. These factors are applied to the matrix creating
00201 *         fillin, which is stored in a non-inspectable way in auxiliary
00202 *         space AF. Mathematically, this is equivalent to reordering
00203 *         the matrix A as P A P^T and then factoring the principal
00204 *         leading submatrix of size equal to the sum of the sizes of
00205 *         the matrices factored on each processor. The factors of
00206 *         these submatrices overwrite the corresponding parts of A
00207 *         in memory.
00208 *      2) Reduced System Phase:
00209 *         A small (max(bwl,bwu)* (P-1)) system is formed representing
00210 *         interaction of the larger blocks, and is stored (as are its
00211 *         factors) in the space AF. A parallel Block Cyclic Reduction
00212 *         algorithm is used. For a linear system, a parallel front solve
00213 *         followed by an analagous backsolve, both using the structure
00214 *         of the factored matrix, are performed.
00215 *      3) Backsubsitution Phase:
00216 *         For a linear system, a local backsubstitution is performed on
00217 *         each processor in parallel.
00218 *
00219 *
00220 *  Descriptors
00221 *  ===========
00222 *
00223 *  Descriptors now have *types* and differ from ScaLAPACK 1.0.
00224 *
00225 *  Note: banded codes can use either the old two dimensional
00226 *    or new one-dimensional descriptors, though the processor grid in
00227 *    both cases *must be one-dimensional*. We describe both types below.
00228 *
00229 *  Each global data object is described by an associated description
00230 *  vector.  This vector stores the information required to establish
00231 *  the mapping between an object element and its corresponding process
00232 *  and memory location.
00233 *
00234 *  Let A be a generic term for any 2D block cyclicly distributed array.
00235 *  Such a global array has an associated description vector DESCA.
00236 *  In the following comments, the character _ should be read as
00237 *  "of the global array".
00238 *
00239 *  NOTATION        STORED IN      EXPLANATION
00240 *  --------------- -------------- --------------------------------------
00241 *  DTYPE_A(global) DESCA( DTYPE_ )The descriptor type.  In this case,
00242 *                                 DTYPE_A = 1.
00243 *  CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
00244 *                                 the BLACS process grid A is distribu-
00245 *                                 ted over. The context itself is glo-
00246 *                                 bal, but the handle (the integer
00247 *                                 value) may vary.
00248 *  M_A    (global) DESCA( M_ )    The number of rows in the global
00249 *                                 array A.
00250 *  N_A    (global) DESCA( N_ )    The number of columns in the global
00251 *                                 array A.
00252 *  MB_A   (global) DESCA( MB_ )   The blocking factor used to distribute
00253 *                                 the rows of the array.
00254 *  NB_A   (global) DESCA( NB_ )   The blocking factor used to distribute
00255 *                                 the columns of the array.
00256 *  RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
00257 *                                 row of the array A is distributed.
00258 *  CSRC_A (global) DESCA( CSRC_ ) The process column over which the
00259 *                                 first column of the array A is
00260 *                                 distributed.
00261 *  LLD_A  (local)  DESCA( LLD_ )  The leading dimension of the local
00262 *                                 array.  LLD_A >= MAX(1,LOCr(M_A)).
00263 *
00264 *  Let K be the number of rows or columns of a distributed matrix,
00265 *  and assume that its process grid has dimension p x q.
00266 *  LOCr( K ) denotes the number of elements of K that a process
00267 *  would receive if K were distributed over the p processes of its
00268 *  process column.
00269 *  Similarly, LOCc( K ) denotes the number of elements of K that a
00270 *  process would receive if K were distributed over the q processes of
00271 *  its process row.
00272 *  The values of LOCr() and LOCc() may be determined via a call to the
00273 *  ScaLAPACK tool function, NUMROC:
00274 *          LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
00275 *          LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
00276 *  An upper bound for these quantities may be computed by:
00277 *          LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
00278 *          LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
00279 *
00280 *
00281 *  One-dimensional descriptors:
00282 *
00283 *  One-dimensional descriptors are a new addition to ScaLAPACK since
00284 *    version 1.0. They simplify and shorten the descriptor for 1D
00285 *    arrays.
00286 *
00287 *  Since ScaLAPACK supports two-dimensional arrays as the fundamental
00288 *    object, we allow 1D arrays to be distributed either over the
00289 *    first dimension of the array (as if the grid were P-by-1) or the
00290 *    2nd dimension (as if the grid were 1-by-P). This choice is
00291 *    indicated by the descriptor type (501 or 502)
00292 *    as described below.
00293 *
00294 *    IMPORTANT NOTE: the actual BLACS grid represented by the
00295 *    CTXT entry in the descriptor may be *either*  P-by-1 or 1-by-P
00296 *    irrespective of which one-dimensional descriptor type
00297 *    (501 or 502) is input.
00298 *    This routine will interpret the grid properly either way.
00299 *    ScaLAPACK routines *do not support intercontext operations* so that
00300 *    the grid passed to a single ScaLAPACK routine *must be the same*
00301 *    for all array descriptors passed to that routine.
00302 *
00303 *    NOTE: In all cases where 1D descriptors are used, 2D descriptors
00304 *    may also be used, since a one-dimensional array is a special case
00305 *    of a two-dimensional array with one dimension of size unity.
00306 *    The two-dimensional array used in this case *must* be of the
00307 *    proper orientation:
00308 *      If the appropriate one-dimensional descriptor is DTYPEA=501
00309 *      (1 by P type), then the two dimensional descriptor must
00310 *      have a CTXT value that refers to a 1 by P BLACS grid;
00311 *      If the appropriate one-dimensional descriptor is DTYPEA=502
00312 *      (P by 1 type), then the two dimensional descriptor must
00313 *      have a CTXT value that refers to a P by 1 BLACS grid.
00314 *
00315 *
00316 *  Summary of allowed descriptors, types, and BLACS grids:
00317 *  DTYPE           501         502         1         1
00318 *  BLACS grid      1xP or Px1  1xP or Px1  1xP       Px1
00319 *  -----------------------------------------------------
00320 *  A               OK          NO          OK        NO
00321 *  B               NO          OK          NO        OK
00322 *
00323 *  Note that a consequence of this chart is that it is not possible
00324 *    for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead
00325 *    to opposite requirements for the orientation of the BLACS grid,
00326 *    and as noted before, the *same* BLACS context must be used in
00327 *    all descriptors in a single ScaLAPACK subroutine call.
00328 *
00329 *  Let A be a generic term for any 1D block cyclicly distributed array.
00330 *  Such a global array has an associated description vector DESCA.
00331 *  In the following comments, the character _ should be read as
00332 *  "of the global array".
00333 *
00334 *  NOTATION        STORED IN  EXPLANATION
00335 *  --------------- ---------- ------------------------------------------
00336 *  DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
00337 *                                TYPE_A = 501: 1-by-P grid.
00338 *                                TYPE_A = 502: P-by-1 grid.
00339 *  CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
00340 *                                the BLACS process grid A is distribu-
00341 *                                ted over. The context itself is glo-
00342 *                                bal, but the handle (the integer
00343 *                                value) may vary.
00344 *  N_A    (global) DESCA( 3 ) The size of the array dimension being
00345 *                                distributed.
00346 *  NB_A   (global) DESCA( 4 ) The blocking factor used to distribute
00347 *                                the distributed dimension of the array.
00348 *  SRC_A  (global) DESCA( 5 ) The process row or column over which the
00349 *                                first row or column of the array
00350 *                                is distributed.
00351 *  LLD_A  (local)  DESCA( 6 ) The leading dimension of the local array
00352 *                                storing the local blocks of the distri-
00353 *                                buted array A. Minimum value of LLD_A
00354 *                                depends on TYPE_A.
00355 *                                TYPE_A = 501: LLD_A >=
00356 *                                   size of undistributed dimension, 1.
00357 *                                TYPE_A = 502: LLD_A >=NB_A, 1.
00358 *  Reserved        DESCA( 7 ) Reserved for future use.
00359 *
00360 *
00361 *
00362 *  =====================================================================
00363 *
00364 *  Implemented for ScaLAPACK by:
00365 *     Andrew J. Cleary, Livermore National Lab and University of Tenn.,
00366 *     and Marbwus Hegland, Australian Natonal University. Feb., 1997.
00367 *  Based on code written by    : Peter Arbenz, ETH Zurich, 1996.
00368 *
00369 *  =====================================================================
00370 *
00371 *     .. Parameters ..
00372       DOUBLE PRECISION   ONE, ZERO
00373       PARAMETER          ( ONE = 1.0D+0 )
00374       PARAMETER          ( ZERO = 0.0D+0 )
00375       COMPLEX*16         CONE, CZERO
00376       PARAMETER          ( CONE = ( 1.0D+0, 0.0D+0 ) )
00377       PARAMETER          ( CZERO = ( 0.0D+0, 0.0D+0 ) )
00378       INTEGER            INT_ONE
00379       PARAMETER          ( INT_ONE = 1 )
00380       INTEGER            DESCMULT, BIGNUM
00381       PARAMETER          ( DESCMULT = 100, BIGNUM = DESCMULT*DESCMULT )
00382       INTEGER            BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
00383      $                   LLD_, MB_, M_, NB_, N_, RSRC_
00384       PARAMETER          ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1,
00385      $                     CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6,
00386      $                     RSRC_ = 7, CSRC_ = 8, LLD_ = 9 )
00387 *     ..
00388 *     .. Local Scalars ..
00389       INTEGER            APTR, BBPTR, BM, BMN, BN, BNN, BW, CSRC,
00390      $                   FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE,
00391      $                   IDUM2, IDUM3, J, JA_NEW, L, LBWL, LBWU, LDBB,
00392      $                   LDW, LLDA, LLDB, LM, LMJ, LN, LPTR, MYCOL,
00393      $                   MYROW, NB, NEICOL, NP, NPACT, NPCOL, NPROW,
00394      $                   NPSTR, NP_SAVE, ODD_SIZE, PART_OFFSET,
00395      $                   RECOVERY_VAL, RETURN_CODE, STORE_M_B,
00396      $                   STORE_N_A, WORK_SIZE_MIN, WPTR
00397 *     ..
00398 *     .. Local Arrays ..
00399       INTEGER            DESCA_1XP( 7 ), DESCB_PX1( 7 ),
00400      $                   PARAM_CHECK( 17, 3 )
00401 *     ..
00402 *     .. External Subroutines ..
00403       EXTERNAL           BLACS_GRIDINFO, DESC_CONVERT, GLOBCHK, PXERBLA,
00404      $                   RESHAPE
00405 *     ..
00406 *     .. External Functions ..
00407       LOGICAL            LSAME
00408       INTEGER            NUMROC
00409       EXTERNAL           LSAME
00410       EXTERNAL           NUMROC
00411 *     ..
00412 *     .. Intrinsic Functions ..
00413       INTRINSIC          ICHAR, MOD
00414 *     ..
00415 *     .. Executable Statements ..
00416 *
00417 *
00418 *     Test the input parameters
00419 *
00420       INFO = 0
00421 *
00422 *     Convert descriptor into standard form for easy access to
00423 *        parameters, check that grid is of right shape.
00424 *
00425       DESCA_1XP( 1 ) = 501
00426       DESCB_PX1( 1 ) = 502
00427 *
00428       CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE )
00429 *
00430       IF( RETURN_CODE .NE. 0) THEN
00431          INFO = -( 8*100 + 2 )
00432       ENDIF
00433 *
00434       CALL DESC_CONVERT( DESCB, DESCB_PX1, RETURN_CODE )
00435 *
00436       IF( RETURN_CODE .NE. 0) THEN
00437          INFO = -( 11*100 + 2 )
00438       ENDIF
00439 *
00440 *     Consistency checks for DESCA and DESCB.
00441 *
00442 *     Context must be the same
00443       IF( DESCA_1XP( 2 ) .NE. DESCB_PX1( 2 ) ) THEN
00444          INFO = -( 11*100 + 2 )
00445       ENDIF
00446 *
00447 *        These are alignment restrictions that may or may not be removed
00448 *        in future releases. -Andy Cleary, April 14, 1996.
00449 *
00450 *     Block sizes must be the same
00451       IF( DESCA_1XP( 4 ) .NE. DESCB_PX1( 4 ) ) THEN
00452          INFO = -( 11*100 + 4 )
00453       ENDIF
00454 *
00455 *     Source processor must be the same
00456 *
00457       IF( DESCA_1XP( 5 ) .NE. DESCB_PX1( 5 ) ) THEN
00458          INFO = -( 11*100 + 5 )
00459       ENDIF
00460 *
00461 *     Get values out of descriptor for use in code.
00462 *
00463       ICTXT = DESCA_1XP( 2 )
00464       CSRC = DESCA_1XP( 5 )
00465       NB = DESCA_1XP( 4 )
00466       LLDA = DESCA_1XP( 6 )
00467       STORE_N_A = DESCA_1XP( 3 )
00468       LLDB = DESCB_PX1( 6 )
00469       STORE_M_B = DESCB_PX1( 3 )
00470 *
00471 *     Get grid parameters
00472 *
00473 *
00474       CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
00475       NP = NPROW * NPCOL
00476 *
00477 *
00478 *
00479       IF( LSAME( TRANS, 'N' ) ) THEN
00480          IDUM2 = ICHAR( 'N' )
00481       ELSE IF ( LSAME( TRANS, 'C' ) ) THEN
00482          IDUM2 = ICHAR( 'C' )
00483       ELSE
00484          INFO = -1
00485       END IF
00486 *
00487       IF( LWORK .LT. -1) THEN
00488          INFO = -16
00489       ELSE IF ( LWORK .EQ. -1 ) THEN
00490          IDUM3 = -1
00491       ELSE
00492          IDUM3 = 1
00493       ENDIF
00494 *
00495       IF( N .LT. 0 ) THEN
00496          INFO = -2
00497       ENDIF
00498 *
00499       IF( N+JA-1 .GT. STORE_N_A ) THEN
00500          INFO = -( 8*100 + 6 )
00501       ENDIF
00502 *
00503       IF(( BWL .GT. N-1 ) .OR.
00504      $   ( BWL .LT. 0 ) ) THEN
00505          INFO = -3
00506       ENDIF
00507 *
00508       IF(( BWU .GT. N-1 ) .OR.
00509      $   ( BWU .LT. 0 ) ) THEN
00510          INFO = -4
00511       ENDIF
00512 *
00513       IF( LLDA .LT. (2*BWL+2*BWU+1) ) THEN
00514          INFO = -( 8*100 + 6 )
00515       ENDIF
00516 *
00517       IF( NB .LE. 0 ) THEN
00518          INFO = -( 8*100 + 4 )
00519       ENDIF
00520 *
00521       BW = BWU+BWL
00522 *
00523       IF( N+IB-1 .GT. STORE_M_B ) THEN
00524          INFO = -( 11*100 + 3 )
00525       ENDIF
00526 *
00527       IF( LLDB .LT. NB ) THEN
00528          INFO = -( 11*100 + 6 )
00529       ENDIF
00530 *
00531       IF( NRHS .LT. 0 ) THEN
00532          INFO = -5
00533       ENDIF
00534 *
00535 *     Current alignment restriction
00536 *
00537       IF( JA .NE. IB) THEN
00538          INFO = -7
00539       ENDIF
00540 *
00541 *     Argument checking that is specific to Divide & Conquer routine
00542 *
00543       IF( NPROW .NE. 1 ) THEN
00544          INFO = -( 8*100+2 )
00545       ENDIF
00546 *
00547       IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN
00548          INFO = -( 2 )
00549          CALL PXERBLA( ICTXT,
00550      $      'PZGBTRS, D&C alg.: only 1 block per proc',
00551      $      -INFO )
00552          RETURN
00553       ENDIF
00554 *
00555       IF((JA+N-1.GT.NB) .AND. ( NB.LT.(BWL+BWU+1) )) THEN
00556          INFO = -( 8*100+4 )
00557          CALL PXERBLA( ICTXT,
00558      $      'PZGBTRS, D&C alg.: NB too small',
00559      $      -INFO )
00560          RETURN
00561       ENDIF
00562 *
00563 *
00564 *     Check worksize
00565 *
00566       WORK_SIZE_MIN = NRHS*(NB+2*BWL+4*BWU)
00567 *
00568       WORK( 1 ) = WORK_SIZE_MIN
00569 *
00570       IF( LWORK .LT. WORK_SIZE_MIN ) THEN
00571          IF( LWORK .NE. -1 ) THEN
00572          INFO = -16
00573          CALL PXERBLA( ICTXT,
00574      $      'PZGBTRS: worksize error ',
00575      $      -INFO )
00576          ENDIF
00577          RETURN
00578       ENDIF
00579 *
00580 *     Pack params and positions into arrays for global consistency check
00581 *
00582       PARAM_CHECK( 17, 1 ) = DESCB(5)
00583       PARAM_CHECK( 16, 1 ) = DESCB(4)
00584       PARAM_CHECK( 15, 1 ) = DESCB(3)
00585       PARAM_CHECK( 14, 1 ) = DESCB(2)
00586       PARAM_CHECK( 13, 1 ) = DESCB(1)
00587       PARAM_CHECK( 12, 1 ) = IB
00588       PARAM_CHECK( 11, 1 ) = DESCA(5)
00589       PARAM_CHECK( 10, 1 ) = DESCA(4)
00590       PARAM_CHECK(  9, 1 ) = DESCA(3)
00591       PARAM_CHECK(  8, 1 ) = DESCA(1)
00592       PARAM_CHECK(  7, 1 ) = JA
00593       PARAM_CHECK(  6, 1 ) = NRHS
00594       PARAM_CHECK(  5, 1 ) = BWU
00595       PARAM_CHECK(  4, 1 ) = BWL
00596       PARAM_CHECK(  3, 1 ) = N
00597       PARAM_CHECK(  2, 1 ) = IDUM3
00598       PARAM_CHECK(  1, 1 ) = IDUM2
00599 *
00600       PARAM_CHECK( 17, 2 ) = 1105
00601       PARAM_CHECK( 16, 2 ) = 1104
00602       PARAM_CHECK( 15, 2 ) = 1103
00603       PARAM_CHECK( 14, 2 ) = 1102
00604       PARAM_CHECK( 13, 2 ) = 1101
00605       PARAM_CHECK( 12, 2 ) = 10
00606       PARAM_CHECK( 11, 2 ) = 805
00607       PARAM_CHECK( 10, 2 ) = 804
00608       PARAM_CHECK(  9, 2 ) = 803
00609       PARAM_CHECK(  8, 2 ) = 801
00610       PARAM_CHECK(  7, 2 ) = 7
00611       PARAM_CHECK(  6, 2 ) = 5
00612       PARAM_CHECK(  5, 2 ) = 4
00613       PARAM_CHECK(  4, 2 ) = 3
00614       PARAM_CHECK(  3, 2 ) = 2
00615       PARAM_CHECK(  2, 2 ) = 16
00616       PARAM_CHECK(  1, 2 ) = 1
00617 *
00618 *     Want to find errors with MIN( ), so if no error, set it to a big
00619 *     number. If there already is an error, multiply by the the
00620 *     descriptor multiplier.
00621 *
00622       IF( INFO.GE.0 ) THEN
00623          INFO = BIGNUM
00624       ELSE IF( INFO.LT.-DESCMULT ) THEN
00625          INFO = -INFO
00626       ELSE
00627          INFO = -INFO * DESCMULT
00628       END IF
00629 *
00630 *     Check consistency across processors
00631 *
00632       CALL GLOBCHK( ICTXT, 17, PARAM_CHECK, 17,
00633      $              PARAM_CHECK( 1, 3 ), INFO )
00634 *
00635 *     Prepare output: set info = 0 if no error, and divide by DESCMULT
00636 *     if error is not in a descriptor entry.
00637 *
00638       IF( INFO.EQ.BIGNUM ) THEN
00639          INFO = 0
00640       ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN
00641          INFO = -INFO / DESCMULT
00642       ELSE
00643          INFO = -INFO
00644       END IF
00645 *
00646       IF( INFO.LT.0 ) THEN
00647          CALL PXERBLA( ICTXT, 'PZGBTRS', -INFO )
00648          RETURN
00649       END IF
00650 *
00651 *     Quick return if possible
00652 *
00653       IF( N.EQ.0 )
00654      $   RETURN
00655 *
00656       IF( NRHS.EQ.0 )
00657      $   RETURN
00658 *
00659 *
00660 *     Adjust addressing into matrix space to properly get into
00661 *        the beginning part of the relevant data
00662 *
00663       PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) )
00664 *
00665       IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN
00666          PART_OFFSET = PART_OFFSET + NB
00667       ENDIF
00668 *
00669       IF ( MYCOL .LT. CSRC ) THEN
00670          PART_OFFSET = PART_OFFSET - NB
00671       ENDIF
00672 *
00673 *     Form a new BLACS grid (the "standard form" grid) with only procs
00674 *        holding part of the matrix, of size 1xNP where NP is adjusted,
00675 *        starting at csrc=0, with JA modified to reflect dropped procs.
00676 *
00677 *     First processor to hold part of the matrix:
00678 *
00679       FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL )
00680 *
00681 *     Calculate new JA one while dropping off unused processors.
00682 *
00683       JA_NEW = MOD( JA-1, NB ) + 1
00684 *
00685 *     Save and compute new value of NP
00686 *
00687       NP_SAVE = NP
00688       NP = ( JA_NEW+N-2 )/NB + 1
00689 *
00690 *     Call utility routine that forms "standard-form" grid
00691 *
00692       CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE,
00693      $              FIRST_PROC, INT_ONE, NP )
00694 *
00695 *     Use new context from standard grid as context.
00696 *
00697       ICTXT_SAVE = ICTXT
00698       ICTXT = ICTXT_NEW
00699       DESCA_1XP( 2 ) = ICTXT_NEW
00700       DESCB_PX1( 2 ) = ICTXT_NEW
00701 *
00702 *     Get information about new grid.
00703 *
00704       CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
00705 *
00706 *     Drop out processors that do not have part of the matrix.
00707 *
00708       IF( MYROW .LT. 0 ) THEN
00709          GOTO 1234
00710       ENDIF
00711 *
00712 *
00713 *
00714 *     Begin main code
00715 *
00716 *     Move data into workspace - communicate/copy (overlap)
00717 *
00718       IF (MYCOL .LT. NPCOL-1) THEN
00719          CALL ZGESD2D( ICTXT, BWU, NRHS, B(NB-BWU+1), LLDB,
00720      $        0, MYCOL + 1)
00721       ENDIF
00722 *
00723       IF (MYCOL .LT. NPCOL-1) THEN
00724          LM = NB-BWU
00725       ELSE
00726          LM = NB
00727       ENDIF
00728 *
00729       IF (MYCOL .GT. 0) THEN
00730          WPTR = BWU+1
00731       ELSE
00732          WPTR = 1
00733       ENDIF
00734 *
00735       LDW = NB+BWU + 2*BW+BWU
00736 *
00737       CALL ZLAMOV( 'G', LM, NRHS, B(1), LLDB, WORK( WPTR ), LDW )
00738 *
00739 *     Zero out rest of work
00740 *
00741       DO 1501 J=1, NRHS
00742         DO 1502 L=WPTR+LM, LDW
00743           WORK( (J-1)*LDW+L ) = CZERO
00744  1502   CONTINUE
00745  1501 CONTINUE
00746 *
00747       IF (MYCOL .GT. 0) THEN
00748          CALL ZGERV2D( ICTXT, BWU, NRHS, WORK(1), LDW,
00749      $        0, MYCOL-1)
00750       ENDIF
00751 *
00752 ********************************************************************
00753 *       PHASE 1: Local computation phase -- Solve L*X = B
00754 ********************************************************************
00755 *
00756 *     Size of main (or odd) partition in each processor
00757 *
00758       ODD_SIZE = NUMROC( N, NB, MYCOL, 0, NPCOL )
00759 *
00760       IF (MYCOL .NE. 0) THEN
00761          LBWL = BW
00762          LBWU = 0
00763          APTR = 1
00764       ELSE
00765          LBWL = BWL
00766          LBWU = BWU
00767          APTR = 1+BWU
00768       ENDIF
00769 *
00770       IF (MYCOL .NE. NPCOL-1) THEN
00771          LM = NB - LBWU
00772          LN = NB - BW
00773       ELSE IF (MYCOL .NE. 0) THEN
00774          LM = ODD_SIZE + BWU
00775          LN = MAX(ODD_SIZE-BW,0)
00776       ELSE
00777          LM = N
00778          LN = MAX( N-BW, 0 )
00779       ENDIF
00780 *
00781       DO 21 J = 1, LN
00782 *
00783          LMJ = MIN(LBWL,LM-J)
00784          L = IPIV( J )
00785 *
00786          IF( L.NE.J ) THEN
00787             CALL ZSWAP(NRHS, WORK(L), LDW, WORK(J), LDW)
00788          ENDIF
00789 *
00790          LPTR = BW+1 + (J-1)*LLDA + APTR
00791 *
00792          CALL ZGERU(LMJ,NRHS,-CONE, A(LPTR),1, WORK(J),LDW,
00793      $           WORK(J+1),LDW)
00794 *
00795    21 CONTINUE
00796 *
00797 ********************************************************************
00798 *       PHASE 2: Global computation phase -- Solve L*X = B
00799 ********************************************************************
00800 *
00801 *     Define the initial dimensions of the diagonal blocks
00802 *     The offdiagonal blocks (for MYCOL > 0) are of size BM by BW
00803 *
00804       IF (MYCOL .NE. NPCOL-1) THEN
00805          BM = BW - LBWU
00806          BN = BW
00807       ELSE
00808          BM = MIN(BW,ODD_SIZE) + BWU
00809          BN = MIN(BW,ODD_SIZE)
00810       ENDIF
00811 *
00812 *     Pointer to first element of block bidiagonal matrix in AF
00813 *     Leading dimension of block bidiagonal system
00814 *
00815       BBPTR = (NB+BWU)*BW + 1
00816       LDBB   = 2*BW + BWU
00817 *
00818       IF (NPCOL.EQ.1) THEN
00819 *
00820 *        In this case the loop over the levels will not be
00821 *        performed.
00822          CALL ZGETRS( 'N', N-LN, NRHS, AF(BBPTR+BW*LDBB), LDBB,
00823      $        IPIV(LN+1), WORK( LN+1 ), LDW, INFO)
00824 *
00825       ENDIF
00826 *
00827 * Loop over levels ...
00828 *
00829 *     The two integers NPACT (nu. of active processors) and NPSTR
00830 *     (stride between active processors) is used to control the
00831 *     loop.
00832 *
00833       NPACT = NPCOL
00834       NPSTR = 1
00835 *
00836 *     Begin loop over levels
00837   200 IF (NPACT .LE. 1) GOTO 300
00838 *
00839 *     Test if processor is active
00840           IF (MOD(MYCOL,NPSTR) .EQ. 0) THEN
00841 *
00842 *   Send/Receive blocks
00843 *
00844              IF (MOD(MYCOL,2*NPSTR) .EQ. 0) THEN
00845 *
00846                 NEICOL = MYCOL + NPSTR
00847 *
00848                 IF (NEICOL/NPSTR .LE. NPACT-1) THEN
00849 *
00850                    IF (NEICOL/NPSTR .LT. NPACT-1) THEN
00851                       BMN = BW
00852                    ELSE
00853                       BMN = MIN(BW,NUMROC(N, NB, NEICOL, 0, NPCOL))+BWU
00854                    ENDIF
00855 *
00856                    CALL ZGESD2D( ICTXT, BM, NRHS,
00857      $                  WORK(LN+1), LDW, 0, NEICOL )
00858 *
00859                    IF( NPACT .NE. 2 )THEN
00860 *
00861 *                     Receive answers back from partner processor
00862 *
00863                       CALL ZGERV2D(ICTXT, BM+BMN-BW, NRHS,
00864      $                   WORK( LN+1 ), LDW, 0, NEICOL )
00865 *
00866                       BM = BM+BMN-BW
00867 *
00868                    ENDIF
00869 *
00870                 ENDIF
00871 *
00872              ELSE
00873 *
00874                 NEICOL = MYCOL - NPSTR
00875 *
00876                 IF (NEICOL .EQ. 0) THEN
00877                    BMN = BW - BWU
00878                 ELSE
00879                    BMN = BW
00880                 ENDIF
00881 *
00882                 CALL ZLAMOV( 'G', BM, NRHS, WORK(LN+1), LDW,
00883      $               WORK(NB+BWU+BMN+1), LDW )
00884 *
00885                 CALL ZGERV2D( ICTXT, BMN, NRHS, WORK( NB+BWU+1 ),
00886      $                  LDW, 0, NEICOL )
00887 *
00888 *               and do the permutations and eliminations
00889 *
00890                 IF (NPACT .NE. 2) THEN
00891 *
00892 *                  Solve locally for BW variables
00893 *
00894                    CALL ZLASWP( NRHS, WORK(NB+BWU+1), LDW, 1, BW,
00895      $                  IPIV(LN+1), 1)
00896 *
00897                    CALL ZTRSM('L','L','N','U', BW, NRHS, CONE,
00898      $                  AF(BBPTR+BW*LDBB), LDBB, WORK(NB+BWU+1), LDW)
00899 *
00900 *                  Use soln just calculated to update RHS
00901 *
00902                    CALL ZGEMM( 'N', 'N', BM+BMN-BW, NRHS, BW,
00903      $                -CONE, AF(BBPTR+BW*LDBB+BW), LDBB,
00904      $                WORK(NB+BWU+1), LDW,
00905      $                CONE, WORK(NB+BWU+1+BW), LDW )
00906 *
00907 *                  Give answers back to partner processor
00908 *
00909                    CALL ZGESD2D( ICTXT, BM+BMN-BW, NRHS,
00910      $                WORK(NB+BWU+1+BW), LDW, 0, NEICOL )
00911 *
00912                 ELSE
00913 *
00914 *                  Finish up calculations for final level
00915 *
00916                    CALL ZLASWP( NRHS, WORK(NB+BWU+1), LDW, 1, BM+BMN,
00917      $                  IPIV(LN+1), 1)
00918 *
00919                    CALL ZTRSM('L','L','N','U', BM+BMN, NRHS, CONE,
00920      $                  AF(BBPTR+BW*LDBB), LDBB, WORK(NB+BWU+1), LDW)
00921                 ENDIF
00922 *
00923              ENDIF
00924 *
00925              NPACT = (NPACT + 1)/2
00926              NPSTR = NPSTR * 2
00927              GOTO 200
00928 *
00929          ENDIF
00930 *
00931   300 CONTINUE
00932 *
00933 *
00934 **************************************
00935 *     BACKSOLVE
00936 ********************************************************************
00937 *       PHASE 2: Global computation phase -- Solve U*Y = X
00938 ********************************************************************
00939 *
00940       IF (NPCOL.EQ.1) THEN
00941 *
00942 *        In this case the loop over the levels will not be
00943 *        performed.
00944 *        In fact, the backsolve portion was done in the call to
00945 *          ZGETRS in the frontsolve.
00946 *
00947       ENDIF
00948 *
00949 *     Compute variable needed to reverse loop structure in
00950 *        reduced system.
00951 *
00952       RECOVERY_VAL = NPACT*NPSTR - NPCOL
00953 *
00954 *     Loop over levels
00955 *      Terminal values of NPACT and NPSTR from frontsolve are used
00956 *
00957  2200 IF( NPACT .GE. NPCOL ) GOTO 2300
00958 *
00959          NPSTR = NPSTR/2
00960 *
00961          NPACT = NPACT*2
00962 *
00963 *        Have to adjust npact for non-power-of-2
00964 *
00965          NPACT = NPACT-MOD( (RECOVERY_VAL/NPSTR), 2 )
00966 *
00967 *        Find size of submatrix in this proc at this level
00968 *
00969          IF( MYCOL/NPSTR .LT. NPACT-1 ) THEN
00970             BN = BW
00971          ELSE
00972             BN = MIN(BW, NUMROC(N, NB, NPCOL-1, 0, NPCOL) )
00973          ENDIF
00974 *
00975 *        If this processor is even in this level...
00976 *
00977          IF( MOD( MYCOL, 2*NPSTR ) .EQ. 0 ) THEN
00978 *
00979             NEICOL = MYCOL+NPSTR
00980 *
00981             IF( NEICOL/NPSTR .LE. NPACT-1 ) THEN
00982 *
00983                IF( NEICOL/NPSTR .LT. NPACT-1 ) THEN
00984                   BMN = BW
00985                   BNN = BW
00986                ELSE
00987                   BMN = MIN(BW,NUMROC(N, NB, NEICOL, 0, NPCOL))+BWU
00988                   BNN = MIN(BW, NUMROC(N, NB, NEICOL, 0, NPCOL) )
00989                ENDIF
00990 *
00991                IF( NPACT .GT. 2 ) THEN
00992 *
00993                   CALL ZGESD2D( ICTXT, 2*BW, NRHS, WORK( LN+1 ),
00994      $                  LDW, 0, NEICOL )
00995 *
00996                   CALL ZGERV2D( ICTXT, BW, NRHS, WORK( LN+1 ),
00997      $                  LDW, 0, NEICOL )
00998 *
00999                ELSE
01000 *
01001                   CALL ZGERV2D( ICTXT, BW, NRHS, WORK( LN+1 ),
01002      $                  LDW, 0, NEICOL )
01003 *
01004                ENDIF
01005 *
01006             ENDIF
01007 *
01008          ELSE
01009 *           This processor is odd on this level
01010 *
01011             NEICOL = MYCOL - NPSTR
01012 *
01013             IF (NEICOL .EQ. 0) THEN
01014                BMN = BW - BWU
01015             ELSE
01016                BMN = BW
01017             ENDIF
01018 *
01019             IF( NEICOL .LT. NPCOL-1 ) THEN
01020                BNN = BW
01021             ELSE
01022                BNN = MIN(BW, NUMROC(N, NB, NEICOL, 0, NPCOL) )
01023             ENDIF
01024 *
01025             IF( NPACT .GT. 2 ) THEN
01026 *
01027 *              Move RHS to make room for received solutions
01028 *
01029                CALL ZLAMOV( 'G', BW, NRHS, WORK(NB+BWU+1),
01030      $               LDW, WORK(NB+BWU+BW+1), LDW )
01031 *
01032                CALL ZGERV2D( ICTXT, 2*BW, NRHS, WORK( LN+1 ),
01033      $                  LDW, 0, NEICOL )
01034 *
01035                CALL ZGEMM( 'N', 'N', BW, NRHS, BN,
01036      $                -CONE, AF(BBPTR), LDBB,
01037      $                WORK(LN+1), LDW,
01038      $                CONE, WORK(NB+BWU+BW+1), LDW )
01039 *
01040 *
01041                IF( MYCOL .GT. NPSTR ) THEN
01042 *
01043                   CALL ZGEMM( 'N', 'N', BW, NRHS, BW,
01044      $                -CONE, AF(BBPTR+2*BW*LDBB), LDBB,
01045      $                WORK(LN+BW+1), LDW,
01046      $                CONE, WORK(NB+BWU+BW+1), LDW )
01047 *
01048                ENDIF
01049 *
01050                CALL ZTRSM('L','U','N','N', BW, NRHS, CONE,
01051      $                AF(BBPTR+BW*LDBB), LDBB, WORK(NB+BWU+BW+1), LDW)
01052 *
01053 *              Send new solution to neighbor
01054 *
01055                CALL ZGESD2D( ICTXT, BW, NRHS,
01056      $                WORK( NB+BWU+BW+1 ), LDW, 0, NEICOL )
01057 *
01058 *              Copy new solution into expected place
01059 *
01060                CALL ZLAMOV( 'G', BW, NRHS, WORK(NB+BWU+1+BW),
01061      $               LDW, WORK(LN+BW+1), LDW )
01062 *
01063             ELSE
01064 *
01065 *              Solve with local diagonal block
01066 *
01067                CALL ZTRSM( 'L','U','N','N', BN+BNN, NRHS, CONE,
01068      $                  AF(BBPTR+BW*LDBB), LDBB, WORK(NB+BWU+1), LDW)
01069 *
01070 *              Send new solution to neighbor
01071 *
01072                CALL ZGESD2D( ICTXT, BW, NRHS,
01073      $                WORK(NB+BWU+1), LDW, 0, NEICOL )
01074 *
01075 *              Shift solutions into expected positions
01076 *
01077                CALL ZLAMOV( 'G', BNN+BN-BW, NRHS, WORK(NB+BWU+1+BW),
01078      $               LDW, WORK(LN+1), LDW )
01079 *
01080 *
01081                IF( (NB+BWU+1) .NE. (LN+1+BW) ) THEN
01082 *
01083 *                 Copy one row at a time since spaces may overlap
01084 *
01085                   DO 1064 J=1, BW
01086                      CALL ZCOPY( NRHS, WORK(NB+BWU+J), LDW,
01087      $                                      WORK(LN+BW+J), LDW )
01088  1064             CONTINUE
01089 *
01090                ENDIF
01091 *
01092             ENDIF
01093 *
01094          ENDIF
01095 *
01096       GOTO 2200
01097 *
01098  2300 CONTINUE
01099 *     End of loop over levels
01100 *
01101 ********************************************************************
01102 *       PHASE 1: (Almost) Local computation phase -- Solve U*Y = X
01103 ********************************************************************
01104 *
01105 *     Reset BM to value it had before reduced system frontsolve...
01106 *
01107       IF (MYCOL .NE. NPCOL-1) THEN
01108          BM = BW - LBWU
01109       ELSE
01110          BM = MIN(BW,ODD_SIZE) + BWU
01111       ENDIF
01112 *
01113 *     First metastep is to account for the fillin blocks AF
01114 *
01115       IF( MYCOL .LT. NPCOL-1 ) THEN
01116 *
01117          CALL ZGESD2D( ICTXT, BW, NRHS, WORK( NB-BW+1 ),
01118      $                  LDW, 0, MYCOL+1 )
01119 *
01120       ENDIF
01121 *
01122       IF( MYCOL .GT. 0 ) THEN
01123 *
01124          CALL ZGERV2D( ICTXT, BW, NRHS, WORK( NB+BWU+1 ),
01125      $                  LDW, 0, MYCOL-1 )
01126 *
01127 *        Modify local right hand sides with received rhs's
01128 *
01129          CALL ZGEMM( 'N', 'N', LM-BM, NRHS, BW, -CONE,
01130      $           AF( 1 ), LM, WORK( NB+BWU+1 ), LDW, CONE,
01131      $           WORK( 1 ), LDW )
01132 *
01133       ENDIF
01134 *
01135       DO 2021 J = LN, 1, -1
01136 *
01137          LMJ = MIN( BW, ODD_SIZE-1 )
01138 *
01139          LPTR = BW-1+J*LLDA+APTR
01140 *
01141 *        In the following, the TRANS=T option is used to reverse
01142 *           the order of multiplication, not as a true transpose
01143 *
01144          CALL ZGEMV( 'T', LMJ, NRHS, -CONE, WORK( J+1), LDW,
01145      $           A( LPTR ), LLDA-1, CONE, WORK( J ), LDW )
01146 *
01147 *        Divide by diagonal element
01148 *
01149          CALL ZSCAL( NRHS, CONE/A( LPTR-LLDA+1 ),
01150      $               WORK( J ), LDW )
01151  2021 CONTINUE
01152 *
01153 *
01154 *
01155       CALL ZLAMOV( 'G', ODD_SIZE, NRHS, WORK( 1 ), LDW,
01156      $             B( 1 ), LLDB )
01157 *
01158 *     Free BLACS space used to hold standard-form grid.
01159 *
01160       ICTXT = ICTXT_SAVE
01161       IF( ICTXT .NE. ICTXT_NEW ) THEN
01162          CALL BLACS_GRIDEXIT( ICTXT_NEW )
01163       ENDIF
01164 *
01165  1234 CONTINUE
01166 *
01167 *     Restore saved input parameters
01168 *
01169       NP = NP_SAVE
01170 *
01171 *     Output worksize
01172 *
01173       WORK( 1 ) = WORK_SIZE_MIN
01174 *
01175       RETURN
01176 *
01177 *     End of PZGBTRS
01178 *
01179       END