ScaLAPACK 2.1  2.1 ScaLAPACK: Scalable Linear Algebra PACKage
pcpbtrsv.f
Go to the documentation of this file.
1  SUBROUTINE pcpbtrsv( UPLO, TRANS, N, BW, NRHS, A, JA, DESCA, B,
2  \$ IB, DESCB, AF, LAF, WORK, LWORK, INFO )
3 *
4 * -- ScaLAPACK routine (version 2.0.2) --
5 * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver
6 * May 1 2012
7 *
8 * .. Scalar Arguments ..
9  CHARACTER TRANS, UPLO
10  INTEGER BW, IB, INFO, JA, LAF, LWORK, N, NRHS
11 * ..
12 * .. Array Arguments ..
13  INTEGER DESCA( * ), DESCB( * )
14  COMPLEX A( * ), AF( * ), B( * ), WORK( * )
15 * ..
16 *
17 *
18 * Purpose
19 * =======
20 *
21 * PCPBTRSV solves a banded triangular system of linear equations
22 *
23 * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS)
24 * or
25 * A(1:N, JA:JA+N-1)^H * X = B(IB:IB+N-1, 1:NRHS)
26 *
27 * where A(1:N, JA:JA+N-1) is a banded
28 * triangular matrix factor produced by the
29 * Cholesky factorization code PCPBTRF
30 * and is stored in A(1:N,JA:JA+N-1) and AF.
31 * The matrix stored in A(1:N, JA:JA+N-1) is either
32 * upper or lower triangular according to UPLO,
33 * and the choice of solving A(1:N, JA:JA+N-1) or A(1:N, JA:JA+N-1)^H
34 * is dictated by the user by the parameter TRANS.
35 *
36 * Routine PCPBTRF MUST be called first.
37 *
38 * =====================================================================
39 *
40 * Arguments
41 * =========
42 *
43 * UPLO (global input) CHARACTER
44 * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored;
45 * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored.
46 *
47 * TRANS (global input) CHARACTER
48 * = 'N': Solve with A(1:N, JA:JA+N-1);
49 * = 'C': Solve with conjugate_transpose( A(1:N, JA:JA+N-1) );
50 *
51 * N (global input) INTEGER
52 * The number of rows and columns to be operated on, i.e. the
53 * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
54 *
55 * BW (global input) INTEGER
56 * Number of subdiagonals in L or U. 0 <= BW <= N-1
57 *
58 * NRHS (global input) INTEGER
59 * The number of right hand sides, i.e., the number of columns
60 * of the distributed submatrix B(IB:IB+N-1, 1:NRHS).
61 * NRHS >= 0.
62 *
63 * A (local input/local output) COMPLEX pointer into
64 * local memory to an array with first dimension
65 * LLD_A >=(bw+1) (stored in DESCA).
66 * On entry, this array contains the local pieces of the
67 * N-by-N symmetric banded distributed Cholesky factor L or
68 * L^T A(1:N, JA:JA+N-1).
69 * This local portion is stored in the packed banded format
70 * used in LAPACK. Please see the Notes below and the
71 * ScaLAPACK manual for more detail on the format of
72 * distributed matrices.
73 *
74 * JA (global input) INTEGER
75 * The index in the global array A that points to the start of
76 * the matrix to be operated on (which may be either all of A
77 * or a submatrix of A).
78 *
79 * DESCA (global and local input) INTEGER array of dimension DLEN.
80 * if 1D type (DTYPE_A=501), DLEN >= 7;
81 * if 2D type (DTYPE_A=1), DLEN >= 9 .
82 * The array descriptor for the distributed matrix A.
83 * Contains information of mapping of A to memory. Please
84 * see NOTES below for full description and options.
85 *
86 * B (local input/local output) COMPLEX pointer into
87 * local memory to an array of local lead dimension lld_b>=NB.
88 * On entry, this array contains the
89 * the local pieces of the right hand sides
90 * B(IB:IB+N-1, 1:NRHS).
91 * On exit, this contains the local piece of the solutions
92 * distributed matrix X.
93 *
94 * IB (global input) INTEGER
95 * The row index in the global array B that points to the first
96 * row of the matrix to be operated on (which may be either
97 * all of B or a submatrix of B).
98 *
99 * DESCB (global and local input) INTEGER array of dimension DLEN.
100 * if 1D type (DTYPE_B=502), DLEN >=7;
101 * if 2D type (DTYPE_B=1), DLEN >= 9.
102 * The array descriptor for the distributed matrix B.
103 * Contains information of mapping of B to memory. Please
104 * see NOTES below for full description and options.
105 *
106 * AF (local output) COMPLEX array, dimension LAF.
107 * Auxiliary Fillin Space.
108 * Fillin is created during the factorization routine
109 * PCPBTRF and this is stored in AF. If a linear system
110 * is to be solved using PCPBTRS after the factorization
111 * routine, AF *must not be altered* after the factorization.
112 *
113 * LAF (local input) INTEGER
114 * Size of user-input Auxiliary Fillin space AF. Must be >=
115 * (NB+2*bw)*bw
116 * If LAF is not large enough, an error code will be returned
117 * and the minimum acceptable size will be returned in AF( 1 )
118 *
119 * WORK (local workspace/local output)
120 * COMPLEX temporary workspace. This space may
121 * be overwritten in between calls to routines. WORK must be
122 * the size given in LWORK.
123 * On exit, WORK( 1 ) contains the minimal LWORK.
124 *
125 * LWORK (local input or global input) INTEGER
126 * Size of user-input workspace WORK.
127 * If LWORK is too small, the minimal acceptable size will be
128 * returned in WORK(1) and an error code is returned. LWORK>=
129 * (bw*NRHS)
130 *
131 * INFO (global output) INTEGER
132 * = 0: successful exit
133 * < 0: If the i-th argument is an array and the j-entry had
134 * an illegal value, then INFO = -(i*100+j), if the i-th
135 * argument is a scalar and had an illegal value, then
136 * INFO = -i.
137 *
138 * =====================================================================
139 *
140 *
141 * Restrictions
142 * ============
143 *
144 * The following are restrictions on the input parameters. Some of these
145 * are temporary and will be removed in future releases, while others
146 * may reflect fundamental technical limitations.
147 *
148 * Non-cyclic restriction: VERY IMPORTANT!
149 * P*NB>= mod(JA-1,NB)+N.
150 * The mapping for matrices must be blocked, reflecting the nature
151 * of the divide and conquer algorithm as a task-parallel algorithm.
152 * This formula in words is: no processor may have more than one
153 * chunk of the matrix.
154 *
155 * Blocksize cannot be too small:
156 * If the matrix spans more than one processor, the following
157 * restriction on NB, the size of each block on each processor,
158 * must hold:
159 * NB >= 2*BW
160 * The bulk of parallel computation is done on the matrix of size
161 * O(NB) on each processor. If this is too small, divide and conquer
162 * is a poor choice of algorithm.
163 *
164 * Submatrix reference:
165 * JA = IB
166 * Alignment restriction that prevents unnecessary communication.
167 *
168 *
169 * =====================================================================
170 *
171 *
172 * Notes
173 * =====
174 *
175 * If the factorization routine and the solve routine are to be called
176 * separately (to solve various sets of righthand sides using the same
177 * coefficient matrix), the auxiliary space AF *must not be altered*
178 * between calls to the factorization routine and the solve routine.
179 *
180 * The best algorithm for solving banded and tridiagonal linear systems
181 * depends on a variety of parameters, especially the bandwidth.
182 * Currently, only algorithms designed for the case N/P >> bw are
183 * implemented. These go by many names, including Divide and Conquer,
184 * Partitioning, domain decomposition-type, etc.
185 *
186 * Algorithm description: Divide and Conquer
187 *
188 * The Divide and Conqer algorithm assumes the matrix is narrowly
189 * banded compared with the number of equations. In this situation,
190 * it is best to distribute the input matrix A one-dimensionally,
191 * with columns atomic and rows divided amongst the processes.
192 * The basic algorithm divides the banded matrix up into
193 * P pieces with one stored on each processor,
194 * and then proceeds in 2 phases for the factorization or 3 for the
195 * solution of a linear system.
196 * 1) Local Phase:
197 * The individual pieces are factored independently and in
198 * parallel. These factors are applied to the matrix creating
199 * fillin, which is stored in a non-inspectable way in auxiliary
200 * space AF. Mathematically, this is equivalent to reordering
201 * the matrix A as P A P^T and then factoring the principal
202 * leading submatrix of size equal to the sum of the sizes of
203 * the matrices factored on each processor. The factors of
204 * these submatrices overwrite the corresponding parts of A
205 * in memory.
206 * 2) Reduced System Phase:
207 * A small (BW* (P-1)) system is formed representing
208 * interaction of the larger blocks, and is stored (as are its
209 * factors) in the space AF. A parallel Block Cyclic Reduction
210 * algorithm is used. For a linear system, a parallel front solve
211 * followed by an analagous backsolve, both using the structure
212 * of the factored matrix, are performed.
213 * 3) Backsubsitution Phase:
214 * For a linear system, a local backsubstitution is performed on
215 * each processor in parallel.
216 *
217 *
218 * Descriptors
219 * ===========
220 *
221 * Descriptors now have *types* and differ from ScaLAPACK 1.0.
222 *
223 * Note: banded codes can use either the old two dimensional
224 * or new one-dimensional descriptors, though the processor grid in
225 * both cases *must be one-dimensional*. We describe both types below.
226 *
227 * Each global data object is described by an associated description
228 * vector. This vector stores the information required to establish
229 * the mapping between an object element and its corresponding process
230 * and memory location.
231 *
232 * Let A be a generic term for any 2D block cyclicly distributed array.
233 * Such a global array has an associated description vector DESCA.
234 * In the following comments, the character _ should be read as
235 * "of the global array".
236 *
237 * NOTATION STORED IN EXPLANATION
238 * --------------- -------------- --------------------------------------
239 * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case,
240 * DTYPE_A = 1.
241 * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
242 * the BLACS process grid A is distribu-
243 * ted over. The context itself is glo-
244 * bal, but the handle (the integer
245 * value) may vary.
246 * M_A (global) DESCA( M_ ) The number of rows in the global
247 * array A.
248 * N_A (global) DESCA( N_ ) The number of columns in the global
249 * array A.
250 * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute
251 * the rows of the array.
252 * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute
253 * the columns of the array.
254 * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
255 * row of the array A is distributed.
256 * CSRC_A (global) DESCA( CSRC_ ) The process column over which the
257 * first column of the array A is
258 * distributed.
259 * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local
260 * array. LLD_A >= MAX(1,LOCr(M_A)).
261 *
262 * Let K be the number of rows or columns of a distributed matrix,
263 * and assume that its process grid has dimension p x q.
264 * LOCr( K ) denotes the number of elements of K that a process
265 * would receive if K were distributed over the p processes of its
266 * process column.
267 * Similarly, LOCc( K ) denotes the number of elements of K that a
268 * process would receive if K were distributed over the q processes of
269 * its process row.
270 * The values of LOCr() and LOCc() may be determined via a call to the
271 * ScaLAPACK tool function, NUMROC:
272 * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
273 * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
274 * An upper bound for these quantities may be computed by:
275 * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
276 * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
277 *
278 *
279 * One-dimensional descriptors:
280 *
281 * One-dimensional descriptors are a new addition to ScaLAPACK since
282 * version 1.0. They simplify and shorten the descriptor for 1D
283 * arrays.
284 *
285 * Since ScaLAPACK supports two-dimensional arrays as the fundamental
286 * object, we allow 1D arrays to be distributed either over the
287 * first dimension of the array (as if the grid were P-by-1) or the
288 * 2nd dimension (as if the grid were 1-by-P). This choice is
289 * indicated by the descriptor type (501 or 502)
290 * as described below.
291 *
292 * IMPORTANT NOTE: the actual BLACS grid represented by the
293 * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P
294 * irrespective of which one-dimensional descriptor type
295 * (501 or 502) is input.
296 * This routine will interpret the grid properly either way.
297 * ScaLAPACK routines *do not support intercontext operations* so that
298 * the grid passed to a single ScaLAPACK routine *must be the same*
299 * for all array descriptors passed to that routine.
300 *
301 * NOTE: In all cases where 1D descriptors are used, 2D descriptors
302 * may also be used, since a one-dimensional array is a special case
303 * of a two-dimensional array with one dimension of size unity.
304 * The two-dimensional array used in this case *must* be of the
305 * proper orientation:
306 * If the appropriate one-dimensional descriptor is DTYPEA=501
307 * (1 by P type), then the two dimensional descriptor must
308 * have a CTXT value that refers to a 1 by P BLACS grid;
309 * If the appropriate one-dimensional descriptor is DTYPEA=502
310 * (P by 1 type), then the two dimensional descriptor must
311 * have a CTXT value that refers to a P by 1 BLACS grid.
312 *
313 *
314 * Summary of allowed descriptors, types, and BLACS grids:
315 * DTYPE 501 502 1 1
316 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1
317 * -----------------------------------------------------
318 * A OK NO OK NO
319 * B NO OK NO OK
320 *
321 * Note that a consequence of this chart is that it is not possible
322 * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead
323 * to opposite requirements for the orientation of the BLACS grid,
324 * and as noted before, the *same* BLACS context must be used in
325 * all descriptors in a single ScaLAPACK subroutine call.
326 *
327 * Let A be a generic term for any 1D block cyclicly distributed array.
328 * Such a global array has an associated description vector DESCA.
329 * In the following comments, the character _ should be read as
330 * "of the global array".
331 *
332 * NOTATION STORED IN EXPLANATION
333 * --------------- ---------- ------------------------------------------
334 * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
335 * TYPE_A = 501: 1-by-P grid.
336 * TYPE_A = 502: P-by-1 grid.
337 * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
338 * the BLACS process grid A is distribu-
339 * ted over. The context itself is glo-
340 * bal, but the handle (the integer
341 * value) may vary.
342 * N_A (global) DESCA( 3 ) The size of the array dimension being
343 * distributed.
344 * NB_A (global) DESCA( 4 ) The blocking factor used to distribute
345 * the distributed dimension of the array.
346 * SRC_A (global) DESCA( 5 ) The process row or column over which the
347 * first row or column of the array
348 * is distributed.
349 * LLD_A (local) DESCA( 6 ) The leading dimension of the local array
350 * storing the local blocks of the distri-
351 * buted array A. Minimum value of LLD_A
352 * depends on TYPE_A.
353 * TYPE_A = 501: LLD_A >=
354 * size of undistributed dimension, 1.
355 * TYPE_A = 502: LLD_A >=NB_A, 1.
356 * Reserved DESCA( 7 ) Reserved for future use.
357 *
358 *
359 *
360 * =====================================================================
361 *
362 * Code Developer: Andrew J. Cleary, University of Tennessee.
363 * Current address: Lawrence Livermore National Labs.
364 * This version released: August, 2001.
365 *
366 * =====================================================================
367 *
368 * ..
369 * .. Parameters ..
370  REAL ONE, ZERO
371  parameter( one = 1.0e+0 )
372  parameter( zero = 0.0e+0 )
373  COMPLEX CONE, CZERO
374  parameter( cone = ( 1.0e+0, 0.0e+0 ) )
375  parameter( czero = ( 0.0e+0, 0.0e+0 ) )
376  INTEGER INT_ONE
377  parameter( int_one = 1 )
378  INTEGER DESCMULT, BIGNUM
379  parameter(descmult = 100, bignum = descmult * descmult)
380  INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
381  \$ lld_, mb_, m_, nb_, n_, rsrc_
382  parameter( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,
383  \$ ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,
384  \$ rsrc_ = 7, csrc_ = 8, lld_ = 9 )
385 * ..
386 * .. Local Scalars ..
387  INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE,
388  \$ idum1, idum2, idum3, ja_new, level_dist, llda,
389  \$ lldb, mbw2, mycol, myrow, my_num_cols, nb, np,
390  \$ npcol, nprow, np_save, odd_size, ofst,
391  \$ part_offset, part_size, return_code, store_m_b,
392  \$ store_n_a, work_size_min
393 * ..
394 * .. Local Arrays ..
395  INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ),
396  \$ param_check( 17, 3 )
397 * ..
398 * .. External Subroutines ..
399  EXTERNAL blacs_get, blacs_gridexit, blacs_gridinfo,
400  \$ cgemm, cgerv2d, cgesd2d, clamov, cmatadd,
401  \$ ctbtrs, ctrmm, ctrtrs, desc_convert, globchk,
402  \$ pxerbla, reshape
403 * ..
404 * .. External Functions ..
405  LOGICAL LSAME
406  INTEGER NUMROC
407  EXTERNAL lsame, numroc
408 * ..
409 * .. Intrinsic Functions ..
410  INTRINSIC ichar, min, mod
411 * ..
412 * .. Executable Statements ..
413 *
414 * Test the input parameters
415 *
416  info = 0
417 *
419 * parameters, check that grid is of right shape.
420 *
421  desca_1xp( 1 ) = 501
422  descb_px1( 1 ) = 502
423 *
424  CALL desc_convert( desca, desca_1xp, return_code )
425 *
426  IF( return_code .NE. 0) THEN
427  info = -( 8*100 + 2 )
428  ENDIF
429 *
430  CALL desc_convert( descb, descb_px1, return_code )
431 *
432  IF( return_code .NE. 0) THEN
433  info = -( 11*100 + 2 )
434  ENDIF
435 *
436 * Consistency checks for DESCA and DESCB.
437 *
438 * Context must be the same
439  IF( desca_1xp( 2 ) .NE. descb_px1( 2 ) ) THEN
440  info = -( 11*100 + 2 )
441  ENDIF
442 *
443 * These are alignment restrictions that may or may not be removed
444 * in future releases. -Andy Cleary, April 14, 1996.
445 *
446 * Block sizes must be the same
447  IF( desca_1xp( 4 ) .NE. descb_px1( 4 ) ) THEN
448  info = -( 11*100 + 4 )
449  ENDIF
450 *
451 * Source processor must be the same
452 *
453  IF( desca_1xp( 5 ) .NE. descb_px1( 5 ) ) THEN
454  info = -( 11*100 + 5 )
455  ENDIF
456 *
457 * Get values out of descriptor for use in code.
458 *
459  ictxt = desca_1xp( 2 )
460  csrc = desca_1xp( 5 )
461  nb = desca_1xp( 4 )
462  llda = desca_1xp( 6 )
463  store_n_a = desca_1xp( 3 )
464  lldb = descb_px1( 6 )
465  store_m_b = descb_px1( 3 )
466 *
467 * Get grid parameters
468 *
469 *
470 * Pre-calculate bw^2
471 *
472  mbw2 = bw * bw
473 *
474  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
475  np = nprow * npcol
476 *
477 *
478 *
479  IF( lsame( uplo, 'U' ) ) THEN
480  idum1 = ichar( 'U' )
481  ELSE IF ( lsame( uplo, 'L' ) ) THEN
482  idum1 = ichar( 'L' )
483  ELSE
484  info = -1
485  END IF
486 *
487  IF( lsame( trans, 'N' ) ) THEN
488  idum2 = ichar( 'N' )
489  ELSE IF ( lsame( trans, 'C' ) ) THEN
490  idum2 = ichar( 'C' )
491  ELSE
492  info = -2
493  END IF
494 *
495  IF( lwork .LT. -1) THEN
496  info = -14
497  ELSE IF ( lwork .EQ. -1 ) THEN
498  idum3 = -1
499  ELSE
500  idum3 = 1
501  ENDIF
502 *
503  IF( n .LT. 0 ) THEN
504  info = -3
505  ENDIF
506 *
507  IF( n+ja-1 .GT. store_n_a ) THEN
508  info = -( 8*100 + 6 )
509  ENDIF
510 *
511  IF(( bw .GT. n-1 ) .OR.
512  \$ ( bw .LT. 0 ) ) THEN
513  info = -4
514  ENDIF
515 *
516  IF( llda .LT. (bw+1) ) THEN
517  info = -( 8*100 + 6 )
518  ENDIF
519 *
520  IF( nb .LE. 0 ) THEN
521  info = -( 8*100 + 4 )
522  ENDIF
523 *
524  IF( n+ib-1 .GT. store_m_b ) THEN
525  info = -( 11*100 + 3 )
526  ENDIF
527 *
528  IF( lldb .LT. nb ) THEN
529  info = -( 11*100 + 6 )
530  ENDIF
531 *
532  IF( nrhs .LT. 0 ) THEN
533  info = -5
534  ENDIF
535 *
536 * Current alignment restriction
537 *
538  IF( ja .NE. ib) THEN
539  info = -7
540  ENDIF
541 *
542 * Argument checking that is specific to Divide & Conquer routine
543 *
544  IF( nprow .NE. 1 ) THEN
545  info = -( 8*100+2 )
546  ENDIF
547 *
548  IF( n .GT. np*nb-mod( ja-1, nb )) THEN
549  info = -( 3 )
550  CALL pxerbla( ictxt,
551  \$ 'PCPBTRSV, D&C alg.: only 1 block per proc',
552  \$ -info )
553  RETURN
554  ENDIF
555 *
556  IF((ja+n-1.GT.nb) .AND. ( nb.LT.2*bw )) THEN
557  info = -( 8*100+4 )
558  CALL pxerbla( ictxt,
559  \$ 'PCPBTRSV, D&C alg.: NB too small',
560  \$ -info )
561  RETURN
562  ENDIF
563 *
564 *
565  work_size_min =
566  \$ bw*nrhs
567 *
568  work( 1 ) = work_size_min
569 *
570  IF( lwork .LT. work_size_min ) THEN
571  IF( lwork .NE. -1 ) THEN
572  info = -14
573  CALL pxerbla( ictxt,
574  \$ 'PCPBTRSV: worksize error',
575  \$ -info )
576  ENDIF
577  RETURN
578  ENDIF
579 *
580 * Pack params and positions into arrays for global consistency check
581 *
582  param_check( 17, 1 ) = descb(5)
583  param_check( 16, 1 ) = descb(4)
584  param_check( 15, 1 ) = descb(3)
585  param_check( 14, 1 ) = descb(2)
586  param_check( 13, 1 ) = descb(1)
587  param_check( 12, 1 ) = ib
588  param_check( 11, 1 ) = desca(5)
589  param_check( 10, 1 ) = desca(4)
590  param_check( 9, 1 ) = desca(3)
591  param_check( 8, 1 ) = desca(1)
592  param_check( 7, 1 ) = ja
593  param_check( 6, 1 ) = nrhs
594  param_check( 5, 1 ) = bw
595  param_check( 4, 1 ) = n
596  param_check( 3, 1 ) = idum3
597  param_check( 2, 1 ) = idum2
598  param_check( 1, 1 ) = idum1
599 *
600  param_check( 17, 2 ) = 1105
601  param_check( 16, 2 ) = 1104
602  param_check( 15, 2 ) = 1103
603  param_check( 14, 2 ) = 1102
604  param_check( 13, 2 ) = 1101
605  param_check( 12, 2 ) = 10
606  param_check( 11, 2 ) = 805
607  param_check( 10, 2 ) = 804
608  param_check( 9, 2 ) = 803
609  param_check( 8, 2 ) = 801
610  param_check( 7, 2 ) = 7
611  param_check( 6, 2 ) = 5
612  param_check( 5, 2 ) = 4
613  param_check( 4, 2 ) = 3
614  param_check( 3, 2 ) = 14
615  param_check( 2, 2 ) = 2
616  param_check( 1, 2 ) = 1
617 *
618 * Want to find errors with MIN( ), so if no error, set it to a big
619 * number. If there already is an error, multiply by the the
620 * descriptor multiplier.
621 *
622  IF( info.GE.0 ) THEN
623  info = bignum
624  ELSE IF( info.LT.-descmult ) THEN
625  info = -info
626  ELSE
627  info = -info * descmult
628  END IF
629 *
630 * Check consistency across processors
631 *
632  CALL globchk( ictxt, 17, param_check, 17,
633  \$ param_check( 1, 3 ), info )
634 *
635 * Prepare output: set info = 0 if no error, and divide by DESCMULT
636 * if error is not in a descriptor entry.
637 *
638  IF( info.EQ.bignum ) THEN
639  info = 0
640  ELSE IF( mod( info, descmult ) .EQ. 0 ) THEN
641  info = -info / descmult
642  ELSE
643  info = -info
644  END IF
645 *
646  IF( info.LT.0 ) THEN
647  CALL pxerbla( ictxt, 'PCPBTRSV', -info )
648  RETURN
649  END IF
650 *
651 * Quick return if possible
652 *
653  IF( n.EQ.0 )
654  \$ RETURN
655 *
656  IF( nrhs.EQ.0 )
657  \$ RETURN
658 *
659 *
661 * the beginning part of the relevant data
662 *
663  part_offset = nb*( (ja-1)/(npcol*nb) )
664 *
665  IF ( (mycol-csrc) .LT. (ja-part_offset-1)/nb ) THEN
666  part_offset = part_offset + nb
667  ENDIF
668 *
669  IF ( mycol .LT. csrc ) THEN
670  part_offset = part_offset - nb
671  ENDIF
672 *
673 * Form a new BLACS grid (the "standard form" grid) with only procs
674 * holding part of the matrix, of size 1xNP where NP is adjusted,
675 * starting at csrc=0, with JA modified to reflect dropped procs.
676 *
677 * First processor to hold part of the matrix:
678 *
679  first_proc = mod( ( ja-1 )/nb+csrc, npcol )
680 *
681 * Calculate new JA one while dropping off unused processors.
682 *
683  ja_new = mod( ja-1, nb ) + 1
684 *
685 * Save and compute new value of NP
686 *
687  np_save = np
688  np = ( ja_new+n-2 )/nb + 1
689 *
690 * Call utility routine that forms "standard-form" grid
691 *
692  CALL reshape( ictxt, int_one, ictxt_new, int_one,
693  \$ first_proc, int_one, np )
694 *
695 * Use new context from standard grid as context.
696 *
697  ictxt_save = ictxt
698  ictxt = ictxt_new
699  desca_1xp( 2 ) = ictxt_new
700  descb_px1( 2 ) = ictxt_new
701 *
702 * Get information about new grid.
703 *
704  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
705 *
706 * Drop out processors that do not have part of the matrix.
707 *
708  IF( myrow .LT. 0 ) THEN
709  GOTO 1234
710  ENDIF
711 *
712 * ********************************
713 * Values reused throughout routine
714 *
715 * User-input value of partition size
716 *
717  part_size = nb
718 *
719 * Number of columns in each processor
720 *
721  my_num_cols = numroc( n, part_size, mycol, 0, npcol )
722 *
723 * Offset in columns to beginning of main partition in each proc
724 *
725  IF ( mycol .EQ. 0 ) THEN
726  part_offset = part_offset+mod( ja_new-1, part_size )
727  my_num_cols = my_num_cols - mod(ja_new-1, part_size )
728  ENDIF
729 *
730 * Offset in elements
731 *
732  ofst = part_offset*llda
733 *
734 * Size of main (or odd) partition in each processor
735 *
736  odd_size = my_num_cols
737  IF ( mycol .LT. np-1 ) THEN
738  odd_size = odd_size - bw
739  ENDIF
740 *
741 *
742 *
743 * Begin main code
744 *
745  IF ( lsame( uplo, 'L' ) ) THEN
746 *
747  IF ( lsame( trans, 'N' ) ) THEN
748 *
749 * Frontsolve
750 *
751 *
752 ******************************************
753 * Local computation phase
754 ******************************************
755 *
756 * Use main partition in each processor to solve locally
757 *
758  CALL ctbtrs( uplo, 'N', 'N', odd_size,
759  \$ bw, nrhs,
760  \$ a( ofst+1 ), llda,
761  \$ b( part_offset+1 ), lldb, info )
762 *
763 *
764  IF ( mycol .LT. np-1 ) THEN
765 * Use factorization of odd-even connection block to modify
766 * locally stored portion of right hand side(s)
767 *
768 *
769 * First copy and multiply it into temporary storage,
770 * then use it on RHS
771 *
772  CALL clamov( 'N', bw, nrhs,
773  \$ b( part_offset+odd_size-bw+1), lldb,
774  \$ work( 1 ), bw )
775 *
776  CALL ctrmm( 'L', 'U', 'N', 'N', bw, nrhs, -cone,
777  \$ a(( ofst+(bw+1)+(odd_size-bw)*llda )), llda-1,
778  \$ work( 1 ), bw )
779 *
780  CALL cmatadd( bw, nrhs, cone, work( 1 ), bw,
781  \$ cone, b( part_offset+odd_size+1 ), lldb )
782 *
783  ENDIF
784 *
785 *
786  IF ( mycol .NE. 0 ) THEN
787 * Use the "spike" fillin to calculate contribution to previous
788 * processor's righthand-side.
789 *
790  CALL cgemm( 'C', 'N', bw, nrhs, odd_size, -cone, af( 1 ),
791  \$ odd_size, b( part_offset+1 ), lldb, czero,
792  \$ work( 1+bw-bw ), bw )
793  ENDIF
794 *
795 *
796 ************************************************
797 * Formation and solution of reduced system
798 ************************************************
799 *
800 *
801 * Send modifications to prior processor's right hand sides
802 *
803  IF( mycol .GT. 0) THEN
804 *
805  CALL cgesd2d( ictxt, bw, nrhs,
806  \$ work( 1 ), bw,
807  \$ 0, mycol - 1 )
808 *
809  ENDIF
810 *
811 * Receive modifications to processor's right hand sides
812 *
813  IF( mycol .LT. npcol-1) THEN
814 *
815  CALL cgerv2d( ictxt, bw, nrhs,
816  \$ work( 1 ), bw,
817  \$ 0, mycol + 1 )
818 *
819 * Combine contribution to locally stored right hand sides
820 *
821  CALL cmatadd( bw, nrhs, cone,
822  \$ work( 1 ), bw, cone,
823  \$ b( part_offset+odd_size + 1 ), lldb )
824 *
825  ENDIF
826 *
827 *
828 * The last processor does not participate in the solution of the
829 * reduced system, having sent its contribution already.
830  IF( mycol .EQ. npcol-1 ) THEN
831  GOTO 14
832  ENDIF
833 *
834 *
835 * *************************************
836 * Modification Loop
837 *
838 * The distance for sending and receiving for each level starts
839 * at 1 for the first level.
840  level_dist = 1
841 *
842 * Do until this proc is needed to modify other procs' equations
843 *
844  12 CONTINUE
845  IF( mod( (mycol+1)/level_dist, 2) .NE. 0 ) GOTO 11
846 *
848 *
849  IF( mycol-level_dist .GE. 0 ) THEN
850 *
851  CALL cgerv2d( ictxt, bw, nrhs,
852  \$ work( 1 ),
853  \$ bw, 0, mycol-level_dist )
854 *
855  CALL cmatadd( bw, nrhs, cone,
856  \$ work( 1 ), bw, cone,
857  \$ b( part_offset+odd_size + 1 ), lldb )
858 *
859  ENDIF
860 *
862 *
863  IF( mycol+level_dist .LT. npcol-1 ) THEN
864 *
865  CALL cgerv2d( ictxt, bw, nrhs,
866  \$ work( 1 ),
867  \$ bw, 0, mycol+level_dist )
868 *
869  CALL cmatadd( bw, nrhs, cone,
870  \$ work( 1 ), bw, cone,
871  \$ b( part_offset+odd_size + 1 ), lldb )
872 *
873  ENDIF
874 *
875  level_dist = level_dist*2
876 *
877  GOTO 12
878  11 CONTINUE
879 * [End of GOTO Loop]
880 *
881 *
882 *
883 * *********************************
884 * Calculate and use this proc's blocks to modify other procs
885 *
886 * Solve with diagonal block
887 *
888  CALL ctrtrs( 'L', 'N', 'N', bw, nrhs, af( odd_size*bw+mbw2+1 ),
889  \$ bw, b( part_offset+odd_size+1 ), lldb, info )
890 *
891  IF( info.NE.0 ) THEN
892  GO TO 1000
893  ENDIF
894 *
895 *
896 *
897 * *********
898  IF( mycol/level_dist .LE. (npcol-1)/level_dist-2 )THEN
899 *
900 * Calculate contribution from this block to next diagonal block
901 *
902  CALL cgemm( 'C', 'N', bw, nrhs, bw, -cone,
903  \$ af( (odd_size)*bw+1 ),
904  \$ bw,
905  \$ b( part_offset+odd_size+1 ),
906  \$ lldb, czero,
907  \$ work( 1 ),
908  \$ bw )
909 *
910 * Send contribution to diagonal block's owning processor.
911 *
912  CALL cgesd2d( ictxt, bw, nrhs,
913  \$ work( 1 ),
914  \$ bw, 0, mycol+level_dist )
915 *
916  ENDIF
917 * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
918 *
919 * ************
920  IF( (mycol/level_dist .GT. 0 ).AND.
921  \$ ( mycol/level_dist .LE. (npcol-1)/level_dist-1 ) ) THEN
922 *
923 *
924 * Use offdiagonal block to calculate modification to diag block
925 * of processor to the left
926 *
927  CALL cgemm( 'N', 'N', bw, nrhs, bw, -cone,
928  \$ af( odd_size*bw+2*mbw2+1 ),
929  \$ bw,
930  \$ b( part_offset+odd_size+1 ),
931  \$ lldb, czero,
932  \$ work( 1 ),
933  \$ bw )
934 *
935 * Send contribution to diagonal block's owning processor.
936 *
937  CALL cgesd2d( ictxt, bw, nrhs,
938  \$ work( 1 ),
939  \$ bw, 0, mycol-level_dist )
940 *
941  ENDIF
942 * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
943 *
944  14 CONTINUE
945 *
946  ELSE
947 *
948 ******************** BACKSOLVE *************************************
949 *
950 ********************************************************************
951 * .. Begin reduced system phase of algorithm ..
952 ********************************************************************
953 *
954 *
955 *
956 * The last processor does not participate in the solution of the
957 * reduced system and just waits to receive its solution.
958  IF( mycol .EQ. npcol-1 ) THEN
959  GOTO 24
960  ENDIF
961 *
962 * Determine number of steps in tree loop
963 *
964  level_dist = 1
965  27 CONTINUE
966  IF( mod( (mycol+1)/level_dist, 2) .NE. 0 ) GOTO 26
967 *
968  level_dist = level_dist*2
969 *
970  GOTO 27
971  26 CONTINUE
972 *
973 *
974  IF( (mycol/level_dist .GT. 0 ).AND.
975  \$ ( mycol/level_dist .LE. (npcol-1)/level_dist-1 ) ) THEN
976 *
977 * Receive solution from processor to left
978 *
979  CALL cgerv2d( ictxt, bw, nrhs,
980  \$ work( 1 ),
981  \$ bw, 0, mycol-level_dist )
982 *
983 *
984 * Use offdiagonal block to calculate modification to RHS stored
985 * on this processor
986 *
987  CALL cgemm( 'C', 'N', bw, nrhs, bw, -cone,
988  \$ af( odd_size*bw+2*mbw2+1 ),
989  \$ bw,
990  \$ work( 1 ),
991  \$ bw, cone,
992  \$ b( part_offset+odd_size+1 ),
993  \$ lldb )
994  ENDIF
995 * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
996 *
997 *
998  IF( mycol/level_dist .LE. (npcol-1)/level_dist-2 )THEN
999 *
1000 * Receive solution from processor to right
1001 *
1002  CALL cgerv2d( ictxt, bw, nrhs,
1003  \$ work( 1 ),
1004  \$ bw, 0, mycol+level_dist )
1005 *
1006 * Calculate contribution from this block to next diagonal block
1007 *
1008  CALL cgemm( 'N', 'N', bw, nrhs, bw, -cone,
1009  \$ af( (odd_size)*bw+1 ),
1010  \$ bw,
1011  \$ work( 1 ),
1012  \$ bw, cone,
1013  \$ b( part_offset+odd_size+1 ),
1014  \$ lldb )
1015 *
1016  ENDIF
1017 * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
1018 *
1019 *
1020 * Solve with diagonal block
1021 *
1022  CALL ctrtrs( 'L', 'C', 'N', bw, nrhs, af( odd_size*bw+mbw2+1 ),
1023  \$ bw, b( part_offset+odd_size+1 ), lldb, info )
1024 *
1025  IF( info.NE.0 ) THEN
1026  GO TO 1000
1027  ENDIF
1028 *
1029 *
1030 *
1031 ***Modification Loop *******
1032 *
1033  22 CONTINUE
1034  IF( level_dist .EQ. 1 ) GOTO 21
1035 *
1036  level_dist = level_dist/2
1037 *
1038 * Send solution to the right
1039 *
1040  IF( mycol+level_dist .LT. npcol-1 ) THEN
1041 *
1042  CALL cgesd2d( ictxt, bw, nrhs,
1043  \$ b( part_offset+odd_size+1 ),
1044  \$ lldb, 0, mycol+level_dist )
1045 *
1046  ENDIF
1047 *
1048 * Send solution to left
1049 *
1050  IF( mycol-level_dist .GE. 0 ) THEN
1051 *
1052  CALL cgesd2d( ictxt, bw, nrhs,
1053  \$ b( part_offset+odd_size+1 ),
1054  \$ lldb, 0, mycol-level_dist )
1055 *
1056  ENDIF
1057 *
1058  GOTO 22
1059  21 CONTINUE
1060 * [End of GOTO Loop]
1061 *
1062  24 CONTINUE
1063 * [Processor npcol - 1 jumped to here to await next stage]
1064 *
1065 *******************************
1066 * Reduced system has been solved, communicate solutions to nearest
1067 * neighbors in preparation for local computation phase.
1068 *
1069 *
1070 * Send elements of solution to next proc
1071 *
1072  IF( mycol .LT. npcol-1) THEN
1073 *
1074  CALL cgesd2d( ictxt, bw, nrhs,
1075  \$ b( part_offset+odd_size+1 ), lldb,
1076  \$ 0, mycol +1 )
1077 *
1078  ENDIF
1079 *
1080 * Receive modifications to processor's right hand sides
1081 *
1082  IF( mycol .GT. 0) THEN
1083 *
1084  CALL cgerv2d( ictxt, bw, nrhs,
1085  \$ work( 1 ), bw,
1086  \$ 0, mycol - 1 )
1087 *
1088  ENDIF
1089 *
1090 *
1091 *
1092 **********************************************
1093 * Local computation phase
1094 **********************************************
1095 *
1096  IF ( mycol .NE. 0 ) THEN
1097 * Use the "spike" fillin to calculate contribution from previous
1098 * processor's solution.
1099 *
1100  CALL cgemm( 'N', 'N', odd_size, nrhs, bw, -cone, af( 1 ),
1101  \$ odd_size, work( 1+bw-bw ), bw, cone,
1102  \$ b( part_offset+1 ), lldb )
1103 *
1104  ENDIF
1105 *
1106 *
1107  IF ( mycol .LT. np-1 ) THEN
1108 * Use factorization of odd-even connection block to modify
1109 * locally stored portion of right hand side(s)
1110 *
1111 *
1112 * First copy and multiply it into temporary storage,
1113 * then use it on RHS
1114 *
1115  CALL clamov( 'N', bw, nrhs, b( part_offset+odd_size+1), lldb,
1116  \$ work( 1+bw-bw ), bw )
1117 *
1118  CALL ctrmm( 'L', 'U', 'C', 'N', bw, nrhs, -cone,
1119  \$ a(( ofst+(bw+1)+(odd_size-bw)*llda )), llda-1,
1120  \$ work( 1+bw-bw ), bw )
1121 *
1122  CALL cmatadd( bw, nrhs, cone, work( 1+bw-bw ), bw, cone,
1123  \$ b( part_offset+odd_size-bw+1 ), lldb )
1124 *
1125  ENDIF
1126 *
1127 * Use main partition in each processor to solve locally
1128 *
1129  CALL ctbtrs( uplo, 'C', 'N', odd_size,
1130  \$ bw, nrhs,
1131  \$ a( ofst+1 ),
1132  \$ llda, b( part_offset+1 ),
1133  \$ lldb, info )
1134 *
1135  ENDIF
1136 * End of "IF( LSAME( TRANS, 'N' ) )"...
1137 *
1138 *
1139  ELSE
1140 ***************************************************************
1141 * CASE UPLO = 'U' *
1142 ***************************************************************
1143  IF ( lsame( trans, 'C' ) ) THEN
1144 *
1145 * Frontsolve
1146 *
1147 *
1148 ******************************************
1149 * Local computation phase
1150 ******************************************
1151 *
1152 * Use main partition in each processor to solve locally
1153 *
1154  CALL ctbtrs( uplo, 'C', 'N', odd_size,
1155  \$ bw, nrhs,
1156  \$ a( ofst+1 ), llda,
1157  \$ b( part_offset+1 ), lldb, info )
1158 *
1159 *
1160  IF ( mycol .LT. np-1 ) THEN
1161 * Use factorization of odd-even connection block to modify
1162 * locally stored portion of right hand side(s)
1163 *
1164 *
1165 * First copy and multiply it into temporary storage,
1166 * then use it on RHS
1167 *
1168  CALL clamov( 'N', bw, nrhs,
1169  \$ b( part_offset+odd_size-bw+1), lldb,
1170  \$ work( 1 ), bw )
1171 *
1172  CALL ctrmm( 'L', 'L', 'C', 'N', bw, nrhs, -cone,
1173  \$ a(( ofst+1+odd_size*llda )), llda-1, work( 1 ),
1174  \$ bw )
1175 *
1176  CALL cmatadd( bw, nrhs, cone, work( 1 ), bw,
1177  \$ cone, b( part_offset+odd_size+1 ), lldb )
1178 *
1179  ENDIF
1180 *
1181 *
1182  IF ( mycol .NE. 0 ) THEN
1183 * Use the "spike" fillin to calculate contribution to previous
1184 * processor's righthand-side.
1185 *
1186  CALL cgemm( 'C', 'N', bw, nrhs, odd_size, -cone, af( 1 ),
1187  \$ odd_size, b( part_offset+1 ), lldb, czero,
1188  \$ work( 1+bw-bw ), bw )
1189  ENDIF
1190 *
1191 *
1192 ************************************************
1193 * Formation and solution of reduced system
1194 ************************************************
1195 *
1196 *
1197 * Send modifications to prior processor's right hand sides
1198 *
1199  IF( mycol .GT. 0) THEN
1200 *
1201  CALL cgesd2d( ictxt, bw, nrhs,
1202  \$ work( 1 ), bw,
1203  \$ 0, mycol - 1 )
1204 *
1205  ENDIF
1206 *
1207 * Receive modifications to processor's right hand sides
1208 *
1209  IF( mycol .LT. npcol-1) THEN
1210 *
1211  CALL cgerv2d( ictxt, bw, nrhs,
1212  \$ work( 1 ), bw,
1213  \$ 0, mycol + 1 )
1214 *
1215 * Combine contribution to locally stored right hand sides
1216 *
1217  CALL cmatadd( bw, nrhs, cone,
1218  \$ work( 1 ), bw, cone,
1219  \$ b( part_offset+odd_size + 1 ), lldb )
1220 *
1221  ENDIF
1222 *
1223 *
1224 * The last processor does not participate in the solution of the
1225 * reduced system, having sent its contribution already.
1226  IF( mycol .EQ. npcol-1 ) THEN
1227  GOTO 44
1228  ENDIF
1229 *
1230 *
1231 * *************************************
1232 * Modification Loop
1233 *
1234 * The distance for sending and receiving for each level starts
1235 * at 1 for the first level.
1236  level_dist = 1
1237 *
1238 * Do until this proc is needed to modify other procs' equations
1239 *
1240  42 CONTINUE
1241  IF( mod( (mycol+1)/level_dist, 2) .NE. 0 ) GOTO 41
1242 *
1244 *
1245  IF( mycol-level_dist .GE. 0 ) THEN
1246 *
1247  CALL cgerv2d( ictxt, bw, nrhs,
1248  \$ work( 1 ),
1249  \$ bw, 0, mycol-level_dist )
1250 *
1251  CALL cmatadd( bw, nrhs, cone,
1252  \$ work( 1 ), bw, cone,
1253  \$ b( part_offset+odd_size + 1 ), lldb )
1254 *
1255  ENDIF
1256 *
1258 *
1259  IF( mycol+level_dist .LT. npcol-1 ) THEN
1260 *
1261  CALL cgerv2d( ictxt, bw, nrhs,
1262  \$ work( 1 ),
1263  \$ bw, 0, mycol+level_dist )
1264 *
1265  CALL cmatadd( bw, nrhs, cone,
1266  \$ work( 1 ), bw, cone,
1267  \$ b( part_offset+odd_size + 1 ), lldb )
1268 *
1269  ENDIF
1270 *
1271  level_dist = level_dist*2
1272 *
1273  GOTO 42
1274  41 CONTINUE
1275 * [End of GOTO Loop]
1276 *
1277 *
1278 *
1279 * *********************************
1280 * Calculate and use this proc's blocks to modify other procs
1281 *
1282 * Solve with diagonal block
1283 *
1284  CALL ctrtrs( 'L', 'N', 'N', bw, nrhs, af( odd_size*bw+mbw2+1 ),
1285  \$ bw, b( part_offset+odd_size+1 ), lldb, info )
1286 *
1287  IF( info.NE.0 ) THEN
1288  GO TO 1000
1289  ENDIF
1290 *
1291 *
1292 *
1293 * *********
1294  IF( mycol/level_dist .LE. (npcol-1)/level_dist-2 )THEN
1295 *
1296 * Calculate contribution from this block to next diagonal block
1297 *
1298  CALL cgemm( 'C', 'N', bw, nrhs, bw, -cone,
1299  \$ af( (odd_size)*bw+1 ),
1300  \$ bw,
1301  \$ b( part_offset+odd_size+1 ),
1302  \$ lldb, czero,
1303  \$ work( 1 ),
1304  \$ bw )
1305 *
1306 * Send contribution to diagonal block's owning processor.
1307 *
1308  CALL cgesd2d( ictxt, bw, nrhs,
1309  \$ work( 1 ),
1310  \$ bw, 0, mycol+level_dist )
1311 *
1312  ENDIF
1313 * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
1314 *
1315 * ************
1316  IF( (mycol/level_dist .GT. 0 ).AND.
1317  \$ ( mycol/level_dist .LE. (npcol-1)/level_dist-1 ) ) THEN
1318 *
1319 *
1320 * Use offdiagonal block to calculate modification to diag block
1321 * of processor to the left
1322 *
1323  CALL cgemm( 'N', 'N', bw, nrhs, bw, -cone,
1324  \$ af( odd_size*bw+2*mbw2+1 ),
1325  \$ bw,
1326  \$ b( part_offset+odd_size+1 ),
1327  \$ lldb, czero,
1328  \$ work( 1 ),
1329  \$ bw )
1330 *
1331 * Send contribution to diagonal block's owning processor.
1332 *
1333  CALL cgesd2d( ictxt, bw, nrhs,
1334  \$ work( 1 ),
1335  \$ bw, 0, mycol-level_dist )
1336 *
1337  ENDIF
1338 * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
1339 *
1340  44 CONTINUE
1341 *
1342  ELSE
1343 *
1344 ******************** BACKSOLVE *************************************
1345 *
1346 ********************************************************************
1347 * .. Begin reduced system phase of algorithm ..
1348 ********************************************************************
1349 *
1350 *
1351 *
1352 * The last processor does not participate in the solution of the
1353 * reduced system and just waits to receive its solution.
1354  IF( mycol .EQ. npcol-1 ) THEN
1355  GOTO 54
1356  ENDIF
1357 *
1358 * Determine number of steps in tree loop
1359 *
1360  level_dist = 1
1361  57 CONTINUE
1362  IF( mod( (mycol+1)/level_dist, 2) .NE. 0 ) GOTO 56
1363 *
1364  level_dist = level_dist*2
1365 *
1366  GOTO 57
1367  56 CONTINUE
1368 *
1369 *
1370  IF( (mycol/level_dist .GT. 0 ).AND.
1371  \$ ( mycol/level_dist .LE. (npcol-1)/level_dist-1 ) ) THEN
1372 *
1373 * Receive solution from processor to left
1374 *
1375  CALL cgerv2d( ictxt, bw, nrhs,
1376  \$ work( 1 ),
1377  \$ bw, 0, mycol-level_dist )
1378 *
1379 *
1380 * Use offdiagonal block to calculate modification to RHS stored
1381 * on this processor
1382 *
1383  CALL cgemm( 'C', 'N', bw, nrhs, bw, -cone,
1384  \$ af( odd_size*bw+2*mbw2+1 ),
1385  \$ bw,
1386  \$ work( 1 ),
1387  \$ bw, cone,
1388  \$ b( part_offset+odd_size+1 ),
1389  \$ lldb )
1390  ENDIF
1391 * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
1392 *
1393 *
1394  IF( mycol/level_dist .LE. (npcol-1)/level_dist-2 )THEN
1395 *
1396 * Receive solution from processor to right
1397 *
1398  CALL cgerv2d( ictxt, bw, nrhs,
1399  \$ work( 1 ),
1400  \$ bw, 0, mycol+level_dist )
1401 *
1402 * Calculate contribution from this block to next diagonal block
1403 *
1404  CALL cgemm( 'N', 'N', bw, nrhs, bw, -cone,
1405  \$ af( (odd_size)*bw+1 ),
1406  \$ bw,
1407  \$ work( 1 ),
1408  \$ bw, cone,
1409  \$ b( part_offset+odd_size+1 ),
1410  \$ lldb )
1411 *
1412  ENDIF
1413 * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
1414 *
1415 *
1416 * Solve with diagonal block
1417 *
1418  CALL ctrtrs( 'L', 'C', 'N', bw, nrhs, af( odd_size*bw+mbw2+1 ),
1419  \$ bw, b( part_offset+odd_size+1 ), lldb, info )
1420 *
1421  IF( info.NE.0 ) THEN
1422  GO TO 1000
1423  ENDIF
1424 *
1425 *
1426 *
1427 ***Modification Loop *******
1428 *
1429  52 CONTINUE
1430  IF( level_dist .EQ. 1 ) GOTO 51
1431 *
1432  level_dist = level_dist/2
1433 *
1434 * Send solution to the right
1435 *
1436  IF( mycol+level_dist .LT. npcol-1 ) THEN
1437 *
1438  CALL cgesd2d( ictxt, bw, nrhs,
1439  \$ b( part_offset+odd_size+1 ),
1440  \$ lldb, 0, mycol+level_dist )
1441 *
1442  ENDIF
1443 *
1444 * Send solution to left
1445 *
1446  IF( mycol-level_dist .GE. 0 ) THEN
1447 *
1448  CALL cgesd2d( ictxt, bw, nrhs,
1449  \$ b( part_offset+odd_size+1 ),
1450  \$ lldb, 0, mycol-level_dist )
1451 *
1452  ENDIF
1453 *
1454  GOTO 52
1455  51 CONTINUE
1456 * [End of GOTO Loop]
1457 *
1458  54 CONTINUE
1459 * [Processor npcol - 1 jumped to here to await next stage]
1460 *
1461 *******************************
1462 * Reduced system has been solved, communicate solutions to nearest
1463 * neighbors in preparation for local computation phase.
1464 *
1465 *
1466 * Send elements of solution to next proc
1467 *
1468  IF( mycol .LT. npcol-1) THEN
1469 *
1470  CALL cgesd2d( ictxt, bw, nrhs,
1471  \$ b( part_offset+odd_size+1 ), lldb,
1472  \$ 0, mycol +1 )
1473 *
1474  ENDIF
1475 *
1476 * Receive modifications to processor's right hand sides
1477 *
1478  IF( mycol .GT. 0) THEN
1479 *
1480  CALL cgerv2d( ictxt, bw, nrhs,
1481  \$ work( 1 ), bw,
1482  \$ 0, mycol - 1 )
1483 *
1484  ENDIF
1485 *
1486 *
1487 *
1488 **********************************************
1489 * Local computation phase
1490 **********************************************
1491 *
1492  IF ( mycol .NE. 0 ) THEN
1493 * Use the "spike" fillin to calculate contribution from previous
1494 * processor's solution.
1495 *
1496  CALL cgemm( 'N', 'N', odd_size, nrhs, bw, -cone, af( 1 ),
1497  \$ odd_size, work( 1+bw-bw ), bw, cone,
1498  \$ b( part_offset+1 ), lldb )
1499 *
1500  ENDIF
1501 *
1502 *
1503  IF ( mycol .LT. np-1 ) THEN
1504 * Use factorization of odd-even connection block to modify
1505 * locally stored portion of right hand side(s)
1506 *
1507 *
1508 * First copy and multiply it into temporary storage,
1509 * then use it on RHS
1510 *
1511  CALL clamov( 'N', bw, nrhs, b( part_offset+odd_size+1), lldb,
1512  \$ work( 1+bw-bw ), bw )
1513 *
1514  CALL ctrmm( 'L', 'L', 'N', 'N', bw, nrhs, -cone,
1515  \$ a(( ofst+1+odd_size*llda )), llda-1,
1516  \$ work( 1+bw-bw ), bw )
1517 *
1518  CALL cmatadd( bw, nrhs, cone, work( 1+bw-bw ), bw, cone,
1519  \$ b( part_offset+odd_size-bw+1 ), lldb )
1520 *
1521  ENDIF
1522 *
1523 * Use main partition in each processor to solve locally
1524 *
1525  CALL ctbtrs( uplo, 'N', 'N', odd_size,
1526  \$ bw, nrhs,
1527  \$ a( ofst+1 ),
1528  \$ llda, b( part_offset+1 ),
1529  \$ lldb, info )
1530 *
1531  ENDIF
1532 * End of "IF( LSAME( TRANS, 'N' ) )"...
1533 *
1534 *
1535  ENDIF
1536 * End of "IF( LSAME( UPLO, 'L' ) )"...
1537  1000 CONTINUE
1538 *
1539 *
1540 * Free BLACS space used to hold standard-form grid.
1541 *
1542  IF( ictxt_save .NE. ictxt_new ) THEN
1543  CALL blacs_gridexit( ictxt_new )
1544  ENDIF
1545 *
1546  1234 CONTINUE
1547 *
1548 * Restore saved input parameters
1549 *
1550  ictxt = ictxt_save
1551  np = np_save
1552 *
1553 * Output minimum worksize
1554 *
1555  work( 1 ) = work_size_min
1556 *
1557 *
1558  RETURN
1559 *
1560 * End of PCPBTRSV
1561 *
1562  END
globchk
subroutine globchk(ICTXT, N, X, LDX, IWORK, INFO)
Definition: pchkxmat.f:403
subroutine cmatadd(M, N, ALPHA, A, LDA, BETA, C, LDC)
reshape
void reshape(int *context_in, int *major_in, int *context_out, int *major_out, int *first_proc, int *nprow_new, int *npcol_new)
Definition: reshape.c:77
desc_convert
subroutine desc_convert(DESC_IN, DESC_OUT, INFO)
Definition: desc_convert.f:2
pxerbla
subroutine pxerbla(ICTXT, SRNAME, INFO)
Definition: pxerbla.f:2
pcpbtrsv
subroutine pcpbtrsv(UPLO, TRANS, N, BW, NRHS, A, JA, DESCA, B, IB, DESCB, AF, LAF, WORK, LWORK, INFO)
Definition: pcpbtrsv.f:3
min
#define min(A, B)
Definition: pcgemr.c:181