ScaLAPACK  2.0.2
ScaLAPACK: Scalable Linear Algebra PACKage
pcdbtrf.f
Go to the documentation of this file.
00001       SUBROUTINE PCDBTRF( N, BWL, BWU, A, JA, DESCA, AF, LAF, WORK,
00002      $                    LWORK, INFO )
00003 *
00004 *  -- ScaLAPACK routine (version 2.0.2) --
00005 *     Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver
00006 *     May 1 2012
00007 *
00008 *     .. Scalar Arguments ..
00009       INTEGER            BWL, BWU, INFO, JA, LAF, LWORK, N
00010 *     ..
00011 *     .. Array Arguments ..
00012       INTEGER            DESCA( * )
00013       COMPLEX            A( * ), AF( * ), WORK( * )
00014 *     ..
00015 *
00016 *
00017 *  Purpose
00018 *  =======
00019 *
00020 *  PCDBTRF computes a LU factorization
00021 *  of an N-by-N complex banded
00022 *  diagonally dominant-like distributed matrix
00023 *  with bandwidth BWL, BWU: A(1:N, JA:JA+N-1).
00024 *  Reordering is used to increase parallelism in the factorization.
00025 *  This reordering results in factors that are DIFFERENT from those
00026 *  produced by equivalent sequential codes. These factors cannot
00027 *  be used directly by users; however, they can be used in
00028 *  subsequent calls to PCDBTRS to solve linear systems.
00029 *
00030 *  The factorization has the form
00031 *
00032 *          P A(1:N, JA:JA+N-1) P^T = L U
00033 *
00034 *  where U is a banded upper triangular matrix and L is banded
00035 *  lower triangular, and P is a permutation matrix.
00036 *
00037 *  =====================================================================
00038 *
00039 *  Arguments
00040 *  =========
00041 *
00042 *
00043 *  N       (global input) INTEGER
00044 *          The number of rows and columns to be operated on, i.e. the
00045 *          order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
00046 *
00047 *  BWL     (global input) INTEGER
00048 *          Number of subdiagonals. 0 <= BWL <= N-1
00049 *
00050 *  BWU     (global input) INTEGER
00051 *          Number of superdiagonals. 0 <= BWU <= N-1
00052 *
00053 *  A       (local input/local output) COMPLEX pointer into
00054 *          local memory to an array with first dimension
00055 *          LLD_A >=(bwl+bwu+1) (stored in DESCA).
00056 *          On entry, this array contains the local pieces of the
00057 *          N-by-N unsymmetric banded distributed matrix
00058 *          A(1:N, JA:JA+N-1) to be factored.
00059 *          This local portion is stored in the packed banded format
00060 *            used in LAPACK. Please see the Notes below and the
00061 *            ScaLAPACK manual for more detail on the format of
00062 *            distributed matrices.
00063 *          On exit, this array contains information containing details
00064 *            of the factorization.
00065 *          Note that permutations are performed on the matrix, so that
00066 *            the factors returned are different from those returned
00067 *            by LAPACK.
00068 *
00069 *  JA      (global input) INTEGER
00070 *          The index in the global array A that points to the start of
00071 *          the matrix to be operated on (which may be either all of A
00072 *          or a submatrix of A).
00073 *
00074 *  DESCA   (global and local input) INTEGER array of dimension DLEN.
00075 *          if 1D type (DTYPE_A=501), DLEN >= 7;
00076 *          if 2D type (DTYPE_A=1), DLEN >= 9 .
00077 *          The array descriptor for the distributed matrix A.
00078 *          Contains information of mapping of A to memory. Please
00079 *          see NOTES below for full description and options.
00080 *
00081 *  AF      (local output) COMPLEX array, dimension LAF.
00082 *          Auxiliary Fillin Space.
00083 *          Fillin is created during the factorization routine
00084 *          PCDBTRF and this is stored in AF. If a linear system
00085 *          is to be solved using PCDBTRS after the factorization
00086 *          routine, AF *must not be altered* after the factorization.
00087 *
00088 *  LAF     (local input) INTEGER
00089 *          Size of user-input Auxiliary Fillin space AF. Must be >=
00090 *          NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu)
00091 *          If LAF is not large enough, an error code will be returned
00092 *          and the minimum acceptable size will be returned in AF( 1 )
00093 *
00094 *  WORK    (local workspace/local output)
00095 *          COMPLEX temporary workspace. This space may
00096 *          be overwritten in between calls to routines. WORK must be
00097 *          the size given in LWORK.
00098 *          On exit, WORK( 1 ) contains the minimal LWORK.
00099 *
00100 *  LWORK   (local input or global input) INTEGER
00101 *          Size of user-input workspace WORK.
00102 *          If LWORK is too small, the minimal acceptable size will be
00103 *          returned in WORK(1) and an error code is returned. LWORK>=
00104 *          max(bwl,bwu)*max(bwl,bwu)
00105 *
00106 *  INFO    (global output) INTEGER
00107 *          = 0:  successful exit
00108 *          < 0:  If the i-th argument is an array and the j-th entry had
00109 *                an illegal value, then INFO = -(i*100+j), if the i-th
00110 *                argument is a scalar and had an illegal value, then
00111 *                INFO = -i.
00112 *          > 0:  If INFO = K<=NPROCS, the submatrix stored on processor
00113 *                INFO and factored locally was not
00114 *                diagonally dominant-like,  and
00115 *                the factorization was not completed.
00116 *                If INFO = K>NPROCS, the submatrix stored on processor
00117 *                INFO-NPROCS representing interactions with other
00118 *                processors was not
00119 *                stably factorable without interchanges,
00120 *                and the factorization was not completed.
00121 *
00122 *  =====================================================================
00123 *
00124 *
00125 *  Restrictions
00126 *  ============
00127 *
00128 *  The following are restrictions on the input parameters. Some of these
00129 *    are temporary and will be removed in future releases, while others
00130 *    may reflect fundamental technical limitations.
00131 *
00132 *    Non-cyclic restriction: VERY IMPORTANT!
00133 *      P*NB>= mod(JA-1,NB)+N.
00134 *      The mapping for matrices must be blocked, reflecting the nature
00135 *      of the divide and conquer algorithm as a task-parallel algorithm.
00136 *      This formula in words is: no processor may have more than one
00137 *      chunk of the matrix.
00138 *
00139 *    Blocksize cannot be too small:
00140 *      If the matrix spans more than one processor, the following
00141 *      restriction on NB, the size of each block on each processor,
00142 *      must hold:
00143 *      NB >= 2*MAX(BWL,BWU)
00144 *      The bulk of parallel computation is done on the matrix of size
00145 *      O(NB) on each processor. If this is too small, divide and conquer
00146 *      is a poor choice of algorithm.
00147 *
00148 *    Submatrix reference:
00149 *      JA = IB
00150 *      Alignment restriction that prevents unnecessary communication.
00151 *
00152 *
00153 *  =====================================================================
00154 *
00155 *
00156 *  Notes
00157 *  =====
00158 *
00159 *  If the factorization routine and the solve routine are to be called
00160 *    separately (to solve various sets of righthand sides using the same
00161 *    coefficient matrix), the auxiliary space AF *must not be altered*
00162 *    between calls to the factorization routine and the solve routine.
00163 *
00164 *  The best algorithm for solving banded and tridiagonal linear systems
00165 *    depends on a variety of parameters, especially the bandwidth.
00166 *    Currently, only algorithms designed for the case N/P >> bw are
00167 *    implemented. These go by many names, including Divide and Conquer,
00168 *    Partitioning, domain decomposition-type, etc.
00169 *
00170 *  Algorithm description: Divide and Conquer
00171 *
00172 *    The Divide and Conquer algorithm assumes the matrix is narrowly
00173 *      banded compared with the number of equations. In this situation,
00174 *      it is best to distribute the input matrix A one-dimensionally,
00175 *      with columns atomic and rows divided amongst the processes.
00176 *      The basic algorithm divides the banded matrix up into
00177 *      P pieces with one stored on each processor,
00178 *      and then proceeds in 2 phases for the factorization or 3 for the
00179 *      solution of a linear system.
00180 *      1) Local Phase:
00181 *         The individual pieces are factored independently and in
00182 *         parallel. These factors are applied to the matrix creating
00183 *         fillin, which is stored in a non-inspectable way in auxiliary
00184 *         space AF. Mathematically, this is equivalent to reordering
00185 *         the matrix A as P A P^T and then factoring the principal
00186 *         leading submatrix of size equal to the sum of the sizes of
00187 *         the matrices factored on each processor. The factors of
00188 *         these submatrices overwrite the corresponding parts of A
00189 *         in memory.
00190 *      2) Reduced System Phase:
00191 *         A small (max(bwl,bwu)* (P-1)) system is formed representing
00192 *         interaction of the larger blocks, and is stored (as are its
00193 *         factors) in the space AF. A parallel Block Cyclic Reduction
00194 *         algorithm is used. For a linear system, a parallel front solve
00195 *         followed by an analogous backsolve, both using the structure
00196 *         of the factored matrix, are performed.
00197 *      3) Backsubstitution Phase:
00198 *         For a linear system, a local backsubstitution is performed on
00199 *         each processor in parallel.
00200 *
00201 *
00202 *  Descriptors
00203 *  ===========
00204 *
00205 *  Descriptors now have *types* and differ from ScaLAPACK 1.0.
00206 *
00207 *  Note: banded codes can use either the old two dimensional
00208 *    or new one-dimensional descriptors, though the processor grid in
00209 *    both cases *must be one-dimensional*. We describe both types below.
00210 *
00211 *  Each global data object is described by an associated description
00212 *  vector.  This vector stores the information required to establish
00213 *  the mapping between an object element and its corresponding process
00214 *  and memory location.
00215 *
00216 *  Let A be a generic term for any 2D block cyclically distributed array.
00217 *  Such a global array has an associated description vector DESCA.
00218 *  In the following comments, the character _ should be read as
00219 *  "of the global array".
00220 *
00221 *  NOTATION        STORED IN      EXPLANATION
00222 *  --------------- -------------- --------------------------------------
00223 *  DTYPE_A(global) DESCA( DTYPE_ )The descriptor type.  In this case,
00224 *                                 DTYPE_A = 1.
00225 *  CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
00226 *                                 the BLACS process grid A is distribu-
00227 *                                 ted over. The context itself is glo-
00228 *                                 bal, but the handle (the integer
00229 *                                 value) may vary.
00230 *  M_A    (global) DESCA( M_ )    The number of rows in the global
00231 *                                 array A.
00232 *  N_A    (global) DESCA( N_ )    The number of columns in the global
00233 *                                 array A.
00234 *  MB_A   (global) DESCA( MB_ )   The blocking factor used to distribute
00235 *                                 the rows of the array.
00236 *  NB_A   (global) DESCA( NB_ )   The blocking factor used to distribute
00237 *                                 the columns of the array.
00238 *  RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
00239 *                                 row of the array A is distributed.
00240 *  CSRC_A (global) DESCA( CSRC_ ) The process column over which the
00241 *                                 first column of the array A is
00242 *                                 distributed.
00243 *  LLD_A  (local)  DESCA( LLD_ )  The leading dimension of the local
00244 *                                 array.  LLD_A >= MAX(1,LOCr(M_A)).
00245 *
00246 *  Let K be the number of rows or columns of a distributed matrix,
00247 *  and assume that its process grid has dimension p x q.
00248 *  LOCr( K ) denotes the number of elements of K that a process
00249 *  would receive if K were distributed over the p processes of its
00250 *  process column.
00251 *  Similarly, LOCc( K ) denotes the number of elements of K that a
00252 *  process would receive if K were distributed over the q processes of
00253 *  its process row.
00254 *  The values of LOCr() and LOCc() may be determined via a call to the
00255 *  ScaLAPACK tool function, NUMROC:
00256 *          LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
00257 *          LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
00258 *  An upper bound for these quantities may be computed by:
00259 *          LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
00260 *          LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
00261 *
00262 *
00263 *  One-dimensional descriptors:
00264 *
00265 *  One-dimensional descriptors are a new addition to ScaLAPACK since
00266 *    version 1.0. They simplify and shorten the descriptor for 1D
00267 *    arrays.
00268 *
00269 *  Since ScaLAPACK supports two-dimensional arrays as the fundamental
00270 *    object, we allow 1D arrays to be distributed either over the
00271 *    first dimension of the array (as if the grid were P-by-1) or the
00272 *    2nd dimension (as if the grid were 1-by-P). This choice is
00273 *    indicated by the descriptor type (501 or 502)
00274 *    as described below.
00275 *
00276 *    IMPORTANT NOTE: the actual BLACS grid represented by the
00277 *    CTXT entry in the descriptor may be *either*  P-by-1 or 1-by-P
00278 *    irrespective of which one-dimensional descriptor type
00279 *    (501 or 502) is input.
00280 *    This routine will interpret the grid properly either way.
00281 *    ScaLAPACK routines *do not support intercontext operations* so that
00282 *    the grid passed to a single ScaLAPACK routine *must be the same*
00283 *    for all array descriptors passed to that routine.
00284 *
00285 *    NOTE: In all cases where 1D descriptors are used, 2D descriptors
00286 *    may also be used, since a one-dimensional array is a special case
00287 *    of a two-dimensional array with one dimension of size unity.
00288 *    The two-dimensional array used in this case *must* be of the
00289 *    proper orientation:
00290 *      If the appropriate one-dimensional descriptor is DTYPEA=501
00291 *      (1 by P type), then the two dimensional descriptor must
00292 *      have a CTXT value that refers to a 1 by P BLACS grid;
00293 *      If the appropriate one-dimensional descriptor is DTYPEA=502
00294 *      (P by 1 type), then the two dimensional descriptor must
00295 *      have a CTXT value that refers to a P by 1 BLACS grid.
00296 *
00297 *
00298 *  Summary of allowed descriptors, types, and BLACS grids:
00299 *  DTYPE           501         502         1         1
00300 *  BLACS grid      1xP or Px1  1xP or Px1  1xP       Px1
00301 *  -----------------------------------------------------
00302 *  A               OK          NO          OK        NO
00303 *  B               NO          OK          NO        OK
00304 *
00305 *  Let A be a generic term for any 1D block cyclically distributed array.
00306 *  Such a global array has an associated description vector DESCA.
00307 *  In the following comments, the character _ should be read as
00308 *  "of the global array".
00309 *
00310 *  NOTATION        STORED IN  EXPLANATION
00311 *  --------------- ---------- ------------------------------------------
00312 *  DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
00313 *                                TYPE_A = 501: 1-by-P grid.
00314 *                                TYPE_A = 502: P-by-1 grid.
00315 *  CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
00316 *                                the BLACS process grid A is distribu-
00317 *                                ted over. The context itself is glo-
00318 *                                bal, but the handle (the integer
00319 *                                value) may vary.
00320 *  N_A    (global) DESCA( 3 ) The size of the array dimension being
00321 *                                distributed.
00322 *  NB_A   (global) DESCA( 4 ) The blocking factor used to distribute
00323 *                                the distributed dimension of the array.
00324 *  SRC_A  (global) DESCA( 5 ) The process row or column over which the
00325 *                                first row or column of the array
00326 *                                is distributed.
00327 *  LLD_A  (local)  DESCA( 6 ) The leading dimension of the local array
00328 *                                storing the local blocks of the distri-
00329 *                                buted array A. Minimum value of LLD_A
00330 *                                depends on TYPE_A.
00331 *                                TYPE_A = 501: LLD_A >=
00332 *                                   size of undistributed dimension, 1.
00333 *                                TYPE_A = 502: LLD_A >=NB_A, 1.
00334 *  Reserved        DESCA( 7 ) Reserved for future use.
00335 *
00336 *
00337 *
00338 *  =====================================================================
00339 *
00340 *  Code Developer: Andrew J. Cleary, University of Tennessee.
00341 *    Current address: Lawrence Livermore National Labs.
00342 *  This version released: August, 2001.
00343 *
00344 *  =====================================================================
00345 *
00346 *     ..
00347 *     .. Parameters ..
00348       REAL               ONE, ZERO
00349       PARAMETER          ( ONE = 1.0E+0 )
00350       PARAMETER          ( ZERO = 0.0E+0 )
00351       COMPLEX            CONE, CZERO
00352       PARAMETER          ( CONE = ( 1.0E+0, 0.0E+0 ) )
00353       PARAMETER          ( CZERO = ( 0.0E+0, 0.0E+0 ) )
00354       INTEGER            INT_ONE
00355       PARAMETER          ( INT_ONE = 1 )
00356       INTEGER            DESCMULT, BIGNUM
00357       PARAMETER          (DESCMULT = 100, BIGNUM = DESCMULT * DESCMULT)
00358       INTEGER            BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
00359      $                   LLD_, MB_, M_, NB_, N_, RSRC_
00360       PARAMETER          ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1,
00361      $                     CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6,
00362      $                     RSRC_ = 7, CSRC_ = 8, LLD_ = 9 )
00363 *     ..
00364 *     .. Local Scalars ..
00365       INTEGER            COMM_PROC, CSRC, FIRST_PROC, I, ICTXT,
00366      $                   ICTXT_NEW, ICTXT_SAVE, IDUM3, JA_NEW, LAF_MIN,
00367      $                   LEVEL_DIST, LLDA, MAX_BW, MBW2, MYCOL, MYROW,
00368      $                   MY_NUM_COLS, NB, NEXT_TRI_SIZE_M,
00369      $                   NEXT_TRI_SIZE_N, NP, NPCOL, NPROW, NP_SAVE,
00370      $                   ODD_SIZE, OFST, PART_OFFSET, PART_SIZE,
00371      $                   PREV_TRI_SIZE_M, PREV_TRI_SIZE_N, RETURN_CODE,
00372      $                   STORE_N_A, UP_PREV_TRI_SIZE_M,
00373      $                   UP_PREV_TRI_SIZE_N, WORK_SIZE_MIN, WORK_U
00374 *     ..
00375 *     .. Local Arrays ..
00376       INTEGER            DESCA_1XP( 7 ), PARAM_CHECK( 9, 3 )
00377 *     ..
00378 *     .. External Subroutines ..
00379       EXTERNAL           BLACS_GET, BLACS_GRIDEXIT, BLACS_GRIDINFO,
00380      $                   CAXPY, CGEMM, CGERV2D, CGESD2D, CLAMOV,
00381      $                   CLATCPY, CPBTRF, CPOTRF, CSYRK, CTBTRS, CTRMM,
00382      $                   CTRRV2D, CTRSD2D, CTRSM, CTRTRS, DESC_CONVERT,
00383      $                   GLOBCHK, PXERBLA, RESHAPE
00384 *     ..
00385 *     .. External Functions ..
00386       LOGICAL            LSAME
00387       INTEGER            NUMROC
00388       EXTERNAL           LSAME, NUMROC
00389 *     ..
00390 *     .. Intrinsic Functions ..
00391       INTRINSIC          ICHAR, MIN, MOD
00392 *     ..
00393 *     .. Executable Statements ..
00394 *
00395 *     Test the input parameters
00396 *
00397       INFO = 0
00398 *
00399 *     Convert descriptor into standard form for easy access to
00400 *        parameters, check that grid is of right shape.
00401 *
00402       DESCA_1XP( 1 ) = 501
00403 *
00404       CALL DESC_CONVERT( DESCA, DESCA_1XP, RETURN_CODE )
00405 *
00406       IF( RETURN_CODE .NE. 0) THEN
00407          INFO = -( 6*100 + 2 )
00408       ENDIF
00409 *
00410 *     Get values out of descriptor for use in code.
00411 *
00412       ICTXT = DESCA_1XP( 2 )
00413       CSRC = DESCA_1XP( 5 )
00414       NB = DESCA_1XP( 4 )
00415       LLDA = DESCA_1XP( 6 )
00416       STORE_N_A = DESCA_1XP( 3 )
00417 *
00418 *     Get grid parameters
00419 *
00420 *
00421 *     Size of separator blocks is maximum of bandwidths
00422 *
00423       MAX_BW = MAX(BWL,BWU)
00424       MBW2 = MAX_BW * MAX_BW
00425 *
00426       CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
00427       NP = NPROW * NPCOL
00428 *
00429 *
00430 *
00431       IF( LWORK .LT. -1) THEN
00432          INFO = -10
00433       ELSE IF ( LWORK .EQ. -1 ) THEN
00434          IDUM3 = -1
00435       ELSE
00436          IDUM3 = 1
00437       ENDIF
00438 *
00439       IF( N .LT. 0 ) THEN
00440          INFO = -1
00441       ENDIF
00442 *
00443       IF( N+JA-1 .GT. STORE_N_A ) THEN
00444          INFO = -( 6*100 + 6 )
00445       ENDIF
00446 *
00447       IF(( BWL .GT. N-1 ) .OR.
00448      $   ( BWL .LT. 0 ) ) THEN
00449          INFO = -2
00450       ENDIF
00451 *
00452       IF(( BWU .GT. N-1 ) .OR.
00453      $   ( BWU .LT. 0 ) ) THEN
00454          INFO = -3
00455       ENDIF
00456 *
00457       IF( LLDA .LT. (BWL+BWU+1) ) THEN
00458          INFO = -( 6*100 + 6 )
00459       ENDIF
00460 *
00461       IF( NB .LE. 0 ) THEN
00462          INFO = -( 6*100 + 4 )
00463       ENDIF
00464 *
00465 *     Argument checking that is specific to Divide & Conquer routine
00466 *
00467       IF( NPROW .NE. 1 ) THEN
00468          INFO = -( 6*100+2 )
00469       ENDIF
00470 *
00471       IF( N .GT. NP*NB-MOD( JA-1, NB )) THEN
00472          INFO = -( 1 )
00473          CALL PXERBLA( ICTXT,
00474      $      'PCDBTRF, D&C alg.: only 1 block per proc',
00475      $      -INFO )
00476          RETURN
00477       ENDIF
00478 *
00479       IF((JA+N-1.GT.NB) .AND. ( NB.LT.2*MAX(BWL,BWU) )) THEN
00480          INFO = -( 6*100+4 )
00481          CALL PXERBLA( ICTXT,
00482      $      'PCDBTRF, D&C alg.: NB too small',
00483      $      -INFO )
00484          RETURN
00485       ENDIF
00486 *
00487 *
00488 *     Check auxiliary storage size
00489 *
00490       LAF_MIN = NB*(BWL+BWU)+6*MAX(BWL,BWU)*MAX(BWL,BWU)
00491 *
00492       IF( LAF .LT. LAF_MIN ) THEN
00493          INFO = -8
00494 *        put minimum value of laf into AF( 1 )
00495          AF( 1 ) = LAF_MIN
00496          CALL PXERBLA( ICTXT,
00497      $      'PCDBTRF: auxiliary storage error ',
00498      $      -INFO )
00499          RETURN
00500       ENDIF
00501 *
00502 *     Check worksize
00503 *
00504       WORK_SIZE_MIN = MAX(BWL,BWU)*MAX(BWL,BWU)
00505 *
00506       WORK( 1 ) = WORK_SIZE_MIN
00507 *
00508       IF( LWORK .LT. WORK_SIZE_MIN ) THEN
00509          IF( LWORK .NE. -1 ) THEN
00510            INFO = -10
00511            CALL PXERBLA( ICTXT,
00512      $      'PCDBTRF: worksize error ',
00513      $      -INFO )
00514          ENDIF
00515          RETURN
00516       ENDIF
00517 *
00518 *     Pack params and positions into arrays for global consistency check
00519 *
00520       PARAM_CHECK(  9, 1 ) = DESCA(5)
00521       PARAM_CHECK(  8, 1 ) = DESCA(4)
00522       PARAM_CHECK(  7, 1 ) = DESCA(3)
00523       PARAM_CHECK(  6, 1 ) = DESCA(1)
00524       PARAM_CHECK(  5, 1 ) = JA
00525       PARAM_CHECK(  4, 1 ) = BWU
00526       PARAM_CHECK(  3, 1 ) = BWL
00527       PARAM_CHECK(  2, 1 ) = N
00528       PARAM_CHECK(  1, 1 ) = IDUM3
00529 *
00530       PARAM_CHECK(  9, 2 ) = 605
00531       PARAM_CHECK(  8, 2 ) = 604
00532       PARAM_CHECK(  7, 2 ) = 603
00533       PARAM_CHECK(  6, 2 ) = 601
00534       PARAM_CHECK(  5, 2 ) = 5
00535       PARAM_CHECK(  4, 2 ) = 3
00536       PARAM_CHECK(  3, 2 ) = 2
00537       PARAM_CHECK(  2, 2 ) = 1
00538       PARAM_CHECK(  1, 2 ) = 10
00539 *
00540 *     Want to find errors with MIN( ), so if no error, set it to a big
00541 *     number. If there already is an error, multiply by the
00542 *     descriptor multiplier.
00543 *
00544       IF( INFO.GE.0 ) THEN
00545          INFO = BIGNUM
00546       ELSE IF( INFO.LT.-DESCMULT ) THEN
00547          INFO = -INFO
00548       ELSE
00549          INFO = -INFO * DESCMULT
00550       END IF
00551 *
00552 *     Check consistency across processors
00553 *
00554       CALL GLOBCHK( ICTXT, 9, PARAM_CHECK, 9,
00555      $              PARAM_CHECK( 1, 3 ), INFO )
00556 *
00557 *     Prepare output: set info = 0 if no error, and divide by DESCMULT
00558 *     if error is not in a descriptor entry.
00559 *
00560       IF( INFO.EQ.BIGNUM ) THEN
00561          INFO = 0
00562       ELSE IF( MOD( INFO, DESCMULT ) .EQ. 0 ) THEN
00563          INFO = -INFO / DESCMULT
00564       ELSE
00565          INFO = -INFO
00566       END IF
00567 *
00568       IF( INFO.LT.0 ) THEN
00569          CALL PXERBLA( ICTXT, 'PCDBTRF', -INFO )
00570          RETURN
00571       END IF
00572 *
00573 *     Quick return if possible
00574 *
00575       IF( N.EQ.0 )
00576      $   RETURN
00577 *
00578 *
00579 *     Adjust addressing into matrix space to properly get into
00580 *        the beginning part of the relevant data
00581 *
00582       PART_OFFSET = NB*( (JA-1)/(NPCOL*NB) )
00583 *
00584       IF ( (MYCOL-CSRC) .LT. (JA-PART_OFFSET-1)/NB ) THEN
00585          PART_OFFSET = PART_OFFSET + NB
00586       ENDIF
00587 *
00588       IF ( MYCOL .LT. CSRC ) THEN
00589          PART_OFFSET = PART_OFFSET - NB
00590       ENDIF
00591 *
00592 *     Form a new BLACS grid (the "standard form" grid) with only procs
00593 *        holding part of the matrix, of size 1xNP where NP is adjusted,
00594 *        starting at csrc=0, with JA modified to reflect dropped procs.
00595 *
00596 *     First processor to hold part of the matrix:
00597 *
00598       FIRST_PROC = MOD( ( JA-1 )/NB+CSRC, NPCOL )
00599 *
00600 *     Calculate the new value of JA while dropping off unused processors.
00601 *
00602       JA_NEW = MOD( JA-1, NB ) + 1
00603 *
00604 *     Save and compute new value of NP
00605 *
00606       NP_SAVE = NP
00607       NP = ( JA_NEW+N-2 )/NB + 1
00608 *
00609 *     Call utility routine that forms "standard-form" grid
00610 *
00611       CALL RESHAPE( ICTXT, INT_ONE, ICTXT_NEW, INT_ONE,
00612      $              FIRST_PROC, INT_ONE, NP )
00613 *
00614 *     Use new context from standard grid as context.
00615 *
00616       ICTXT_SAVE = ICTXT
00617       ICTXT = ICTXT_NEW
00618       DESCA_1XP( 2 ) = ICTXT_NEW
00619 *
00620 *     Get information about new grid.
00621 *
00622       CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
00623 *
00624 *     Drop out processors that do not have part of the matrix.
00625 *
00626       IF( MYROW .LT. 0 ) THEN
00627          GOTO 1234
00628       ENDIF
00629 *
00630 *     ********************************
00631 *     Values reused throughout routine
00632 *
00633 *     User-input value of partition size
00634 *
00635       PART_SIZE = NB
00636 *
00637 *     Number of columns in each processor
00638 *
00639       MY_NUM_COLS = NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL )
00640 *
00641 *     Offset in columns to beginning of main partition in each proc
00642 *
00643       IF ( MYCOL .EQ. 0 ) THEN
00644         PART_OFFSET = PART_OFFSET+MOD( JA_NEW-1, PART_SIZE )
00645         MY_NUM_COLS = MY_NUM_COLS - MOD(JA_NEW-1, PART_SIZE )
00646       ENDIF
00647 *
00648 *     Offset in elements
00649 *
00650       OFST = PART_OFFSET*LLDA
00651 *
00652 *     Size of main (or odd) partition in each processor
00653 *
00654       ODD_SIZE = MY_NUM_COLS
00655       IF ( MYCOL .LT. NP-1 ) THEN
00656          ODD_SIZE = ODD_SIZE - MAX_BW
00657       ENDIF
00658 *
00659 *     Offset to workspace for Upper triangular factor
00660 *
00661       WORK_U = BWU*ODD_SIZE + 3*MBW2
00662 *
00663 *
00664 *       Zero out space for fillin
00665 *
00666         DO 10  I=1, LAF_MIN
00667           AF( I ) = CZERO
00668    10   CONTINUE
00669 *
00670 *       Zero out space for work
00671 *
00672         DO 20  I=1, WORK_SIZE_MIN
00673           WORK( I ) = CZERO
00674    20   CONTINUE
00675 *
00676 *     Begin main code
00677 *
00678 *
00679 ********************************************************************
00680 *       PHASE 1: Local computation phase.
00681 ********************************************************************
00682 *
00683 *
00684 *       Sizes of the extra triangles communicated between processors
00685 *
00686         IF( MYCOL .GT. 0 ) THEN
00687           PREV_TRI_SIZE_M= MIN( BWL,
00688      $          NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) )
00689           PREV_TRI_SIZE_N=MIN( BWL,
00690      $          NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) )
00691         ENDIF
00692 *
00693         IF( MYCOL .GT. 0 ) THEN
00694           UP_PREV_TRI_SIZE_M= MIN( BWU,
00695      $          NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) )
00696           UP_PREV_TRI_SIZE_N=MIN( BWU,
00697      $          NUMROC( N, PART_SIZE, MYCOL-1, 0, NPCOL ) )
00698         ENDIF
00699 *
00700         IF( MYCOL .LT. NPCOL-1 ) THEN
00701           NEXT_TRI_SIZE_M=MIN( BWL,
00702      $          NUMROC( N, PART_SIZE, MYCOL+1, 0, NPCOL ) )
00703           NEXT_TRI_SIZE_N= MIN( BWL,
00704      $          NUMROC( N, PART_SIZE, MYCOL, 0, NPCOL ) )
00705         ENDIF
00706 *
00707         IF ( MYCOL .LT. NP-1 ) THEN
00708 *         Transfer last triangle D_i of local matrix to next processor
00709 *         which needs it to calculate fillin due to factorization of
00710 *         its main (odd) block A_i.
00711 *         Overlap the send with the factorization of A_i.
00712 *
00713           CALL CTRSD2D( ICTXT, 'U', 'N', NEXT_TRI_SIZE_M,
00714      $                  NEXT_TRI_SIZE_N,
00715      $                  A( OFST+(MY_NUM_COLS-BWL)*LLDA+(BWL+BWU+1) ),
00716      $                  LLDA-1, 0, MYCOL+1 )
00717 *
00718         ENDIF
00719 *
00720 *
00721 *       Factor main partition A_i = L_i {U_i} in each processor
00722 *
00723         CALL CDBTRF( ODD_SIZE, ODD_SIZE, BWL, BWU, A( OFST + 1),
00724      $                    LLDA, INFO )
00725 *
00726         IF( INFO.NE.0 ) THEN
00727            INFO = MYCOL+1
00728            GOTO 1500
00729         ENDIF
00730 *
00731 *
00732         IF ( MYCOL .LT. NP-1 ) THEN
00733 *
00734 *         Apply factorization to lower connection block BL_i
00735 *         conjugate transpose the connection block in preparation.
00736 *         Apply factorization to upper connection block BU_i
00737 *         Move the connection block in preparation.
00738 *
00739           CALL CLATCPY( 'U', BWL, BWL,
00740      $                  A(( OFST+(BWL+BWU+1)+(ODD_SIZE-BWL)*LLDA )),
00741      $                  LLDA-1, AF( ODD_SIZE*BWU+2*MBW2+1+MAX_BW-BWL ),
00742      $                  MAX_BW )
00743           CALL CLAMOV( 'L', BWU, BWU, A( ( OFST+1+ODD_SIZE*LLDA ) ),
00744      $                 LLDA-1,
00745      $                 AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1+MAX_BW-BWU ),
00746      $                 MAX_BW )
00747 *
00748 *         Perform the triangular system solve {L_i}{{BU'}_i} = {B_i}
00749 *
00750           CALL CTBTRS( 'L', 'N', 'U', BWU, BWL, BWU,
00751      $                 A( OFST+BWU+1+(ODD_SIZE-BWU )*LLDA ), LLDA,
00752      $                 AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1+MAX_BW-BWU ),
00753      $                 MAX_BW, INFO )
00754 *
00755 *         Perform the triangular solve {U_i}^C{BL'}_i^C = {BL_i}^C
00756 *
00757           CALL CTBTRS( 'U', 'C', 'N', BWL, BWU, BWL,
00758      $                 A( OFST+1+(ODD_SIZE-BWL)*LLDA ), LLDA,
00759      $                 AF( ODD_SIZE*BWU+2*MBW2+1+MAX_BW-BWL ), MAX_BW,
00760      $                 INFO )
00761 *
00762 *         conjugate transpose resulting block to its location
00763 *           in main storage.
00764 *
00765           CALL CLATCPY( 'L', BWL, BWL,
00766      $                  AF( ODD_SIZE*BWU+2*MBW2+1+MAX_BW-BWL ), MAX_BW,
00767      $                  A(( OFST+(BWL+BWU+1)+(ODD_SIZE-BWL)*LLDA )),
00768      $                  LLDA-1 )
00769 *
00770 *         Move the resulting block back to its location in main storage.
00771 *
00772           CALL CLAMOV( 'L', BWU, BWU,
00773      $                 AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1+MAX_BW-BWU ),
00774      $                 MAX_BW, A(( OFST+1+ODD_SIZE*LLDA )), LLDA-1 )
00775 *
00776 *
00777 *         Compute contribution to diagonal block(s) of reduced system.
00778 *          {C'}_i = {C_i}-{{BL'}_i}{{BU'}_i}
00779 *
00780 *         The following method uses more flops than necessary but
00781 *           does not necessitate the writing of a new BLAS routine.
00782 *
00783 *
00784           CALL CGEMM( 'C', 'N', MAX_BW, MAX_BW, MAX_BW, -CONE ,
00785      $         AF( ODD_SIZE*BWU+2*MBW2+1), MAX_BW,
00786      $         AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1), MAX_BW, CONE,
00787      $         A( OFST+ODD_SIZE*LLDA+1+BWU ), LLDA-1 )
00788 *
00789         ENDIF
00790 *       End of "if ( MYCOL .lt. NP-1 )..."
00791 *
00792 *
00793  1500   CONTINUE
00794 *       If the processor could not locally factor, it jumps here.
00795 *
00796         IF ( MYCOL .NE. 0 ) THEN
00797 *         Discard temporary matrix stored beginning in
00798 *           AF( (odd_size+2*bwl, bwu)*bwl, bwu+1 ) and use for
00799 *           off_diagonal block of reduced system.
00800 *
00801 *         Receive previously transmitted matrix section, which forms
00802 *         the right-hand-side for the triangular solve that calculates
00803 *         the "spike" fillin.
00804 *
00805 *
00806           CALL CTRRV2D( ICTXT, 'U', 'N', PREV_TRI_SIZE_M,
00807      $                  PREV_TRI_SIZE_N, AF( WORK_U+1 ), ODD_SIZE, 0,
00808      $                  MYCOL-1 )
00809 *
00810           IF (INFO.EQ.0) THEN
00811 *
00812 *         Calculate the "spike" fillin, ${L_i} {{GU}_i} = {DL_i}$ .
00813 *
00814           CALL CTBTRS( 'L', 'N', 'U', ODD_SIZE, BWL, BWL,
00815      $                 A( OFST + BWU+1 ), LLDA, AF( WORK_U+1 ),
00816      $                 ODD_SIZE, INFO )
00817 *
00818 *
00819 *         Calculate the "spike" fillin, ${U_i}^C {{GL}_i}^C = {DU_i}^C$
00820 *
00821 *
00822 *         Copy D block into AF storage for solve.
00823 *
00824           CALL CLATCPY( 'L', UP_PREV_TRI_SIZE_N, UP_PREV_TRI_SIZE_M,
00825      $                  A( OFST+1 ), LLDA-1, AF( 1 ), ODD_SIZE )
00826 *
00827           CALL CTBTRS( 'U', 'C', 'N', ODD_SIZE, BWU, BWU,
00828      $                    A( OFST + 1 ), LLDA,
00829      $                    AF( 1 ), ODD_SIZE, INFO )
00830 *
00831 *
00832 *         Calculate the update block for previous proc, E_i = GL_i{GU_i}
00833 *
00834 *
00835 *         Zero out space in case result is smaller than storage block
00836 *
00837         DO 30  I=1, MBW2
00838             AF( ODD_SIZE*BWU+2*MBW2+I ) = CZERO
00839    30   CONTINUE
00840 *
00841           CALL CGEMM( 'C', 'N', BWU, BWL, ODD_SIZE,
00842      $                    -CONE, AF( 1 ), ODD_SIZE,
00843      $                    AF( WORK_U+1 ), ODD_SIZE, CZERO,
00844      $                    AF( 1+MAX(0,BWL-BWU)+ODD_SIZE*BWU+
00845      $                      (2*MAX_BW+MAX(0,BWU-BWL))*MAX_BW),
00846      $                    MAX_BW )
00847 *
00848 *
00849 *         Initiate send of E_i to previous processor to overlap
00850 *           with next computation.
00851 *
00852           CALL CGESD2D( ICTXT, MAX_BW, MAX_BW,
00853      $                  AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW, 0,
00854      $                  MYCOL-1 )
00855 *
00856 *
00857           IF ( MYCOL .LT. NP-1 ) THEN
00858 *
00859 *           Calculate off-diagonal block(s) of reduced system.
00860 *           Note: for ease of use in solution of reduced system, store
00861 *           L's off-diagonal block in conjugate transpose form.
00862 *
00863 *           Copy matrix HU_i (the last bwl rows of GU_i) to AFL storage
00864 *             as per requirements of BLAS routine CTRMM.
00865 *             Since we have GU_i stored,
00866 *             conjugate transpose HU_i to HU_i^C.
00867 *
00868             CALL CLATCPY( 'N', BWL, BWL,
00869      $               AF( WORK_U+ODD_SIZE-BWL+1 ), ODD_SIZE,
00870      $        AF( (ODD_SIZE)*BWU+1+(MAX_BW-BWL) ),
00871      $               MAX_BW  )
00872 *
00873             CALL CTRMM( 'R', 'U', 'C', 'N', BWL, BWL, -CONE,
00874      $                  A( ( OFST+(BWL+BWU+1)+(ODD_SIZE-BWL)*LLDA ) ),
00875      $                  LLDA-1, AF( (ODD_SIZE)*BWU+1+(MAX_BW-BWL) ),
00876      $                  MAX_BW )
00877 *
00878 *
00879 *           Copy matrix HL_i (the last bwu rows of GL_i^C) to AFU store
00880 *             as per requirements of BLAS routine CTRMM.
00881 *             Since we have GL_i^C stored,
00882 *             conjugate transpose HL_i^C to HL_i.
00883 *
00884             CALL CLATCPY( 'N', BWU, BWU,
00885      $               AF( ODD_SIZE-BWU+1 ), ODD_SIZE,
00886      $               AF( WORK_U+(ODD_SIZE)*BWL+1+MAX_BW-BWU ),
00887      $               MAX_BW  )
00888 *
00889             CALL CTRMM( 'R', 'L', 'N', 'N', BWU, BWU, -CONE,
00890      $                  A( ( OFST+1+ODD_SIZE*LLDA ) ), LLDA-1,
00891      $                  AF( WORK_U+(ODD_SIZE)*BWL+1+MAX_BW-BWU ),
00892      $                  MAX_BW )
00893 *
00894           ENDIF
00895 *
00896         ENDIF
00897 *       End of "if (info.eq.0) then"
00898 *
00899         ENDIF
00900 *       End of "if ( MYCOL .ne. 0 )..."
00901 *
00902 *
00903 *       Check to make sure no processors have found errors
00904 *
00905         CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO,
00906      $              -1, 0, 0 )
00907 *
00908         IF( MYCOL.EQ.0 ) THEN
00909            CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 )
00910         ELSE
00911            CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 )
00912         ENDIF
00913 *
00914         IF ( INFO.NE.0 ) THEN
00915            GOTO 1000
00916         ENDIF
00917 *       No errors found, continue
00918 *
00919 *
00920 ********************************************************************
00921 *       PHASE 2: Formation and factorization of Reduced System.
00922 ********************************************************************
00923 *
00924 *       Gather up local sections of reduced system
00925 *
00926 *
00927 *     The last processor does not participate in the factorization of
00928 *       the reduced system, having sent its E_i already.
00929       IF( MYCOL .EQ. NPCOL-1 ) THEN
00930         GOTO 14
00931       ENDIF
00932 *
00933 *       Initiate send of off-diag block(s) to overlap with next part.
00934 *       Off-diagonal block needed on neighboring processor to start
00935 *       algorithm.
00936 *
00937         IF( (MOD( MYCOL+1, 2 ) .EQ. 0) .AND. ( MYCOL .GT. 0 ) ) THEN
00938 *
00939           CALL CGESD2D( ICTXT, MAX_BW, MAX_BW,
00940      $                       AF( ODD_SIZE*BWU+1 ),
00941      $                       MAX_BW, 0, MYCOL-1 )
00942 *
00943           CALL CGESD2D( ICTXT, MAX_BW, MAX_BW,
00944      $                       AF( WORK_U+ODD_SIZE*BWL+1 ),
00945      $                       MAX_BW, 0, MYCOL-1 )
00946 *
00947         ENDIF
00948 *
00949 *       Copy last diagonal block into AF storage for subsequent
00950 *         operations.
00951 *
00952         CALL CLAMOV( 'N', MAX_BW, MAX_BW,
00953      $                    A( OFST+ODD_SIZE*LLDA+BWU+1 ),
00954      $                    LLDA-1, AF( ODD_SIZE*BWU+MBW2+1 ),
00955      $                    MAX_BW )
00956 *
00957 *       Receive cont. to diagonal block that is stored on this proc.
00958 *
00959         IF( MYCOL.LT. NPCOL-1 ) THEN
00960 *
00961            CALL CGERV2D( ICTXT, MAX_BW, MAX_BW,
00962      $               AF( ODD_SIZE*BWU+2*MBW2+1 ),
00963      $               MAX_BW, 0,
00964      $               MYCOL+1 )
00965 *
00966 *          Add contribution to diagonal block
00967 *
00968            CALL CAXPY( MBW2, CONE,
00969      $               AF( ODD_SIZE*BWU+2*MBW2+1 ),
00970      $               1, AF( ODD_SIZE*BWU+MBW2+1 ), 1 )
00971 *
00972         ENDIF
00973 *
00974 *
00975 *       *************************************
00976 *       Modification Loop
00977 *
00978 *       The distance for sending and receiving for each level starts
00979 *         at 1 for the first level.
00980         LEVEL_DIST = 1
00981 *
00982 *       Do until this proc is needed to modify other procs' equations
00983 *
00984    12   CONTINUE
00985         IF( MOD( (MYCOL+1)/LEVEL_DIST, 2) .NE. 0 ) GOTO 11
00986 *
00987 *         Receive and add contribution to diagonal block from the left
00988 *
00989           IF( MYCOL-LEVEL_DIST .GE. 0 ) THEN
00990             CALL CGERV2D( ICTXT, MAX_BW, MAX_BW, WORK( 1 ),
00991      $                     MAX_BW, 0, MYCOL-LEVEL_DIST )
00992 *
00993             CALL CAXPY( MBW2, CONE, WORK( 1 ), 1,
00994      $               AF( ODD_SIZE*BWU+MBW2+1 ), 1 )
00995 *
00996           ENDIF
00997 *
00998 *         Receive and add contribution to diagonal block from the right
00999 *
01000           IF( MYCOL+LEVEL_DIST .LT. NPCOL-1 ) THEN
01001             CALL CGERV2D( ICTXT, MAX_BW, MAX_BW, WORK( 1 ),
01002      $                        MAX_BW, 0, MYCOL+LEVEL_DIST )
01003 *
01004             CALL CAXPY( MBW2, CONE, WORK( 1 ), 1,
01005      $               AF( ODD_SIZE*BWU+MBW2+1 ), 1 )
01006 *
01007           ENDIF
01008 *
01009           LEVEL_DIST = LEVEL_DIST*2
01010 *
01011         GOTO 12
01012    11   CONTINUE
01013 *       [End of GOTO Loop]
01014 *
01015 *
01016 *       *********************************
01017 *       Calculate and use this proc's blocks to modify other procs'...
01018 *
01019 *       Factor diagonal block
01020 *
01021        CALL CDBTRF( MAX_BW, MAX_BW, MIN( MAX_BW-1, BWL ),
01022      $              MIN( MAX_BW-1, BWU ), AF( ODD_SIZE*BWU+MBW2+1
01023      $              -( MIN( MAX_BW-1, BWU ))), MAX_BW+1, INFO )
01024 *       Flag failure of the reduced-system diagonal block factorization
01025         IF( INFO.NE.0 ) THEN
01026                INFO = NPCOL + MYCOL
01027         ENDIF
01028 *
01029 *       ****************************************************************
01030 *       Receive offdiagonal block from processor to right.
01031 *         If this is the first group of processors, the receive comes
01032 *         from a different processor than otherwise.
01033 *
01034         IF( LEVEL_DIST .EQ. 1 )THEN
01035           COMM_PROC = MYCOL + 1
01036 *
01037 *           Move block into place that it will be expected to be for
01038 *             calcs.
01039 *
01040           CALL CLAMOV( 'N', MAX_BW, MAX_BW, AF( ODD_SIZE*BWU+1 ),
01041      $                 MAX_BW, AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1 ),
01042      $                 MAX_BW )
01043 *
01044           CALL CLAMOV( 'N', MAX_BW, MAX_BW, AF( WORK_U+ODD_SIZE*BWL+1 ),
01045      $                 MAX_BW, AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW )
01046 *
01047         ELSE
01048           COMM_PROC = MYCOL + LEVEL_DIST/2
01049         ENDIF
01050 *
01051         IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 )THEN
01052 *
01053           CALL CGERV2D( ICTXT, MAX_BW, MAX_BW,
01054      $        AF( ODD_SIZE*BWU+1 ),
01055      $                       MAX_BW, 0, COMM_PROC )
01056 *
01057           CALL CGERV2D( ICTXT, MAX_BW, MAX_BW,
01058      $        AF( WORK_U+ODD_SIZE*BWL+1 ),
01059      $                       MAX_BW, 0, COMM_PROC )
01060 *
01061           IF( INFO .EQ. 0 ) THEN
01062 *
01063 *
01064 *         Modify upper off_diagonal block with diagonal block
01065 *
01066 *
01067           CALL CTBTRS(
01068      $     'L', 'N', 'U', BWU, MIN( BWL, BWU-1 ), BWU,
01069      $     AF( ODD_SIZE*BWU+
01070      $     MBW2+1+(MAX_BW+1)*(MAX_BW-BWU)), MAX_BW+1,
01071      $     AF( WORK_U+ODD_SIZE*BWL+1+MAX_BW-BWU), MAX_BW, INFO )
01072 *
01073 *         Modify lower off_diagonal block with diagonal block
01074 *
01075 *
01076           CALL CTBTRS(
01077      $     'U', 'C', 'N', BWL, MIN( BWU, BWL-1 ), BWL,
01078      $     AF( ODD_SIZE*BWU+
01079      $     MBW2+1-MIN( BWU, BWL-1 )+(MAX_BW+1)*(MAX_BW-BWL)), MAX_BW+1,
01080      $     AF( ODD_SIZE*BWU+1+MAX_BW-BWL), MAX_BW, INFO )
01081 *
01082           ENDIF
01083 *         End of "if ( info.eq.0 ) then"
01084 *
01085 *         Calculate contribution from this block to next diagonal block
01086 *
01087           CALL CGEMM( 'C', 'N', MAX_BW, MAX_BW, MAX_BW, -CONE,
01088      $                AF( (ODD_SIZE)*BWU+1 ), MAX_BW,
01089      $                AF( WORK_U+(ODD_SIZE)*BWL+1 ), MAX_BW, CZERO,
01090      $                WORK( 1 ), MAX_BW )
01091 *
01092 *         Send contribution to diagonal block's owning processor.
01093 *
01094           CALL CGESD2D( ICTXT, MAX_BW, MAX_BW, WORK( 1 ), MAX_BW,
01095      $                     0, MYCOL+LEVEL_DIST )
01096 *
01097         ENDIF
01098 *       End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
01099 *
01100 *
01101 *       ****************************************************************
01102 *       Receive off_diagonal block from left and use to finish with this
01103 *         processor.
01104 *
01105         IF( (MYCOL/LEVEL_DIST .GT. 0 ).AND.
01106      $      ( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-1 ) ) THEN
01107 *
01108           IF( LEVEL_DIST .GT. 1)THEN
01109 *
01110 *           Receive offdiagonal block(s) from proc level_dist/2 to the
01111 *           left
01112 *
01113            CALL CGERV2D( ICTXT, MAX_BW, MAX_BW,
01114      $            AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1 ),
01115      $            MAX_BW, 0, MYCOL-LEVEL_DIST/2 )
01116 *
01117 *           Receive offdiagonal block(s) from proc level_dist/2 to the
01118 *           left
01119 *
01120            CALL CGERV2D( ICTXT, MAX_BW, MAX_BW,
01121      $            AF( ODD_SIZE*BWU+2*MBW2+1 ),
01122      $            MAX_BW, 0, MYCOL-LEVEL_DIST/2 )
01123 *
01124           ENDIF
01125 *
01126 *
01127           IF( INFO.EQ.0 ) THEN
01128 *
01129 *         Use diagonal block(s) to modify this offdiagonal block
01130 *
01131 *
01132 *         Since CTBTRS has no "left-right" option, we must transpose
01133 *
01134           CALL CLATCPY( 'N', MAX_BW, MAX_BW, AF( WORK_U+ODD_SIZE*BWL+
01135      $      2*MBW2+1), MAX_BW, WORK( 1 ), MAX_BW )
01136 *
01137           CALL CTBTRS(
01138      $          'L', 'N', 'U', MAX_BW, MIN( BWL, MAX_BW-1 ), BWL,
01139      $          AF( ODD_SIZE*BWU+MBW2+1), MAX_BW+1,
01140      $          WORK( 1+MAX_BW*(MAX_BW-BWL) ), MAX_BW, INFO )
01141 *
01142 *         Transpose back
01143 *
01144           CALL CLATCPY(
01145      $         'N', MAX_BW, MAX_BW, WORK( 1 ), MAX_BW,
01146      $         AF( WORK_U+ODD_SIZE*BWL+
01147      $      2*MBW2+1), MAX_BW )
01148 *
01149 *
01150 *
01151 *         Since CTBTRS has no "left-right" option, we must transpose
01152 *
01153           CALL CLATCPY( 'N', MAX_BW, MAX_BW, AF( ODD_SIZE*BWU+
01154      $      2*MBW2+1), MAX_BW, WORK( 1 ), MAX_BW )
01155 *
01156           CALL CTBTRS(
01157      $          'U', 'C', 'N', MAX_BW, MIN( BWU, MAX_BW-1 ), BWU,
01158      $          AF( ODD_SIZE*BWU+MBW2+1-MIN( BWU, MAX_BW-1 )), MAX_BW+1,
01159      $          WORK( 1+MAX_BW*(MAX_BW-BWU) ), MAX_BW, INFO )
01160 *
01161 *         Transpose back
01162 *
01163           CALL CLATCPY(
01164      $         'N', MAX_BW, MAX_BW, WORK( 1 ), MAX_BW,
01165      $         AF( ODD_SIZE*BWU+
01166      $      2*MBW2+1), MAX_BW )
01167 *
01168 *
01169           ENDIF
01170 *         End of "if( info.eq.0 ) then"
01171 *
01172 *         Use offdiag block(s) to calculate modification to diag block
01173 *           of processor to the left
01174 *
01175           CALL CGEMM( 'N', 'C', MAX_BW, MAX_BW, MAX_BW, -CONE,
01176      $                AF( (ODD_SIZE)*BWU+2*MBW2+1 ), MAX_BW,
01177      $                AF( WORK_U+(ODD_SIZE)*BWL+2*MBW2+1 ), MAX_BW,
01178      $                CZERO, WORK( 1 ), MAX_BW )
01179 *
01180 *         Send contribution to diagonal block's owning processor.
01181 *
01182           CALL CGESD2D( ICTXT, MAX_BW, MAX_BW, WORK( 1 ), MAX_BW,
01183      $                     0, MYCOL-LEVEL_DIST )
01184 *
01185 *         *******************************************************
01186 *
01187           IF( MYCOL/LEVEL_DIST .LE. (NPCOL-1)/LEVEL_DIST-2 ) THEN
01188 *
01189 *           Decide which processor offdiagonal block(s) goes to
01190 *
01191             IF( ( MOD( MYCOL/( 2*LEVEL_DIST ),2 )) .EQ.0 ) THEN
01192               COMM_PROC = MYCOL + LEVEL_DIST
01193             ELSE
01194               COMM_PROC = MYCOL - LEVEL_DIST
01195             ENDIF
01196 *
01197 *           Use offdiagonal blocks to calculate offdiag
01198 *             block to send to neighboring processor. Depending
01199 *             on circumstances, may need to transpose the matrix.
01200 *
01201               CALL CGEMM( 'N', 'N', MAX_BW, MAX_BW, MAX_BW, -CONE,
01202      $                    AF( WORK_U+ODD_SIZE*BWL+2*MBW2+1 ), MAX_BW,
01203      $                    AF( ODD_SIZE*BWU+1 ), MAX_BW, CZERO,
01204      $                    WORK( 1 ), MAX_BW )
01205 *
01206 *           Send contribution to offdiagonal block's owning processor.
01207 *
01208             CALL CGESD2D( ICTXT, MAX_BW, MAX_BW, WORK( 1 ), MAX_BW,
01209      $                         0, COMM_PROC )
01210 *
01211               CALL CGEMM( 'N', 'N', MAX_BW, MAX_BW, MAX_BW, -CONE,
01212      $                    AF( ODD_SIZE*BWU+2*MBW2+1 ), MAX_BW,
01213      $                    AF( WORK_U+ODD_SIZE*BWL+1 ), MAX_BW, CZERO,
01214      $                    WORK( 1 ), MAX_BW )
01215 *
01216 *           Send contribution to offdiagonal block's owning processor.
01217 *
01218             CALL CGESD2D( ICTXT, MAX_BW, MAX_BW, WORK( 1 ), MAX_BW,
01219      $                         0, COMM_PROC )
01220 *
01221           ENDIF
01222 *
01223         ENDIF
01224 *       End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
01225 *
01226    14   CONTINUE
01227 *       Jump target: the last process column (MYCOL = NPCOL-1) skips
01228 *       the reduced-system phase and resumes here.
01229  1000 CONTINUE
01230 *       Jump target: early exit taken when an error was detected
01231 *       (nonzero INFO) during the local factorization phase.
01232 *     Free BLACS space used to hold standard-form grid.
01233 *
01234       IF( ICTXT_SAVE .NE. ICTXT_NEW ) THEN
01235          CALL BLACS_GRIDEXIT( ICTXT_NEW )
01236       ENDIF
01237 *
01238  1234 CONTINUE
01239 *
01240 *     Restore saved input parameters
01241 *
01242       ICTXT = ICTXT_SAVE
01243       NP = NP_SAVE
01244 *
01245 *     Output minimum worksize
01246 *
01247       WORK( 1 ) = WORK_SIZE_MIN
01248 *
01249 *         Make INFO consistent across processors
01250 *
01251           CALL IGAMX2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, INFO, INFO,
01252      $              -1, 0, 0 )
01253 *
01254           IF( MYCOL.EQ.0 ) THEN
01255              CALL IGEBS2D( ICTXT, 'A', ' ', 1, 1, INFO, 1 )
01256           ELSE
01257              CALL IGEBR2D( ICTXT, 'A', ' ', 1, 1, INFO, 1, 0, 0 )
01258           ENDIF
01259 *
01260 *
01261       RETURN
01262 *
01263 *     End of PCDBTRF
01264 *
01265       END