ScaLAPACK 2.1  2.1 ScaLAPACK: Scalable Linear Algebra PACKage
pcdbtrsv.f
Go to the documentation of this file.
1  SUBROUTINE pcdbtrsv( UPLO, TRANS, N, BWL, BWU, NRHS, A, JA, DESCA,
2  \$ B, IB, DESCB, AF, LAF, WORK, LWORK, INFO )
3 *
4 * -- ScaLAPACK routine (version 2.0.2) --
5 * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver
6 * May 1 2012
7 *
8 * .. Scalar Arguments ..
9  CHARACTER TRANS, UPLO
10  INTEGER BWL, BWU, IB, INFO, JA, LAF, LWORK, N, NRHS
11 * ..
12 * .. Array Arguments ..
13  INTEGER DESCA( * ), DESCB( * )
14  COMPLEX A( * ), AF( * ), B( * ), WORK( * )
15 * ..
16 *
17 *
18 * Purpose
19 * =======
20 *
21 * PCDBTRSV solves a banded triangular system of linear equations
22 *
23 * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS)
24 * or
25 * A(1:N, JA:JA+N-1)^H * X = B(IB:IB+N-1, 1:NRHS)
26 *
27 * where A(1:N, JA:JA+N-1) is a banded
28 * triangular matrix factor produced by the
29 * Gaussian elimination code PC@(dom_pre)BTRF
30 * and is stored in A(1:N,JA:JA+N-1) and AF.
31 * The matrix stored in A(1:N, JA:JA+N-1) is either
32 * upper or lower triangular according to UPLO,
33 * and the choice of solving A(1:N, JA:JA+N-1) or A(1:N, JA:JA+N-1)^H
34 * is dictated by the user by the parameter TRANS.
35 *
36 * Routine PCDBTRF MUST be called first.
37 *
38 * =====================================================================
39 *
40 * Arguments
41 * =========
42 *
43 * UPLO (global input) CHARACTER
44 * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored;
45 * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored.
46 *
47 * TRANS (global input) CHARACTER
48 * = 'N': Solve with A(1:N, JA:JA+N-1);
49 * = 'C': Solve with conjugate_transpose( A(1:N, JA:JA+N-1) );
50 *
51 * N (global input) INTEGER
52 * The number of rows and columns to be operated on, i.e. the
53 * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
54 *
55 * BWL (global input) INTEGER
56 * Number of subdiagonals. 0 <= BWL <= N-1
57 *
58 * BWU (global input) INTEGER
59 * Number of superdiagonals. 0 <= BWU <= N-1
60 *
61 * NRHS (global input) INTEGER
62 * The number of right hand sides, i.e., the number of columns
63 * of the distributed submatrix B(IB:IB+N-1, 1:NRHS).
64 * NRHS >= 0.
65 *
66 * A (local input/local output) COMPLEX pointer into
67 * local memory to an array with first dimension
68 * LLD_A >=(bwl+bwu+1) (stored in DESCA).
69 * On entry, this array contains the local pieces of the
70 * N-by-N unsymmetric banded distributed Cholesky factor L or
71 * L^T A(1:N, JA:JA+N-1).
72 * This local portion is stored in the packed banded format
73 * used in LAPACK. Please see the Notes below and the
74 * ScaLAPACK manual for more detail on the format of
75 * distributed matrices.
76 *
77 * JA (global input) INTEGER
78 * The index in the global array A that points to the start of
79 * the matrix to be operated on (which may be either all of A
80 * or a submatrix of A).
81 *
82 * DESCA (global and local input) INTEGER array of dimension DLEN.
83 * if 1D type (DTYPE_A=501), DLEN >= 7;
84 * if 2D type (DTYPE_A=1), DLEN >= 9 .
85 * The array descriptor for the distributed matrix A.
86 * Contains information of mapping of A to memory. Please
87 * see NOTES below for full description and options.
88 *
89 * B (local input/local output) COMPLEX pointer into
90 * local memory to an array of local lead dimension lld_b>=NB.
91 * On entry, this array contains the
92 * the local pieces of the right hand sides
93 * B(IB:IB+N-1, 1:NRHS).
94 * On exit, this contains the local piece of the solutions
95 * distributed matrix X.
96 *
97 * IB (global input) INTEGER
98 * The row index in the global array B that points to the first
99 * row of the matrix to be operated on (which may be either
100 * all of B or a submatrix of B).
101 *
102 * DESCB (global and local input) INTEGER array of dimension DLEN.
103 * if 1D type (DTYPE_B=502), DLEN >=7;
104 * if 2D type (DTYPE_B=1), DLEN >= 9.
105 * The array descriptor for the distributed matrix B.
106 * Contains information of mapping of B to memory. Please
107 * see NOTES below for full description and options.
108 *
109 * AF (local output) COMPLEX array, dimension LAF.
110 * Auxiliary Fillin Space.
111 * Fillin is created during the factorization routine
112 * PCDBTRF and this is stored in AF. If a linear system
113 * is to be solved using PCDBTRS after the factorization
114 * routine, AF *must not be altered* after the factorization.
115 *
116 * LAF (local input) INTEGER
117 * Size of user-input Auxiliary Fillin space AF. Must be >=
118 * NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu)
119 * If LAF is not large enough, an error code will be returned
120 * and the minimum acceptable size will be returned in AF( 1 )
121 *
122 * WORK (local workspace/local output)
123 * COMPLEX temporary workspace. This space may
124 * be overwritten in between calls to routines. WORK must be
125 * the size given in LWORK.
126 * On exit, WORK( 1 ) contains the minimal LWORK.
127 *
128 * LWORK (local input or global input) INTEGER
129 * Size of user-input workspace WORK.
130 * If LWORK is too small, the minimal acceptable size will be
131 * returned in WORK(1) and an error code is returned. LWORK>=
132 * (max(bwl,bwu)*NRHS)
133 *
134 * INFO (global output) INTEGER
135 * = 0: successful exit
136 * < 0: If the i-th argument is an array and the j-entry had
137 * an illegal value, then INFO = -(i*100+j), if the i-th
138 * argument is a scalar and had an illegal value, then
139 * INFO = -i.
140 *
141 * =====================================================================
142 *
143 *
144 * Restrictions
145 * ============
146 *
147 * The following are restrictions on the input parameters. Some of these
148 * are temporary and will be removed in future releases, while others
149 * may reflect fundamental technical limitations.
150 *
151 * Non-cyclic restriction: VERY IMPORTANT!
152 * P*NB>= mod(JA-1,NB)+N.
153 * The mapping for matrices must be blocked, reflecting the nature
154 * of the divide and conquer algorithm as a task-parallel algorithm.
155 * This formula in words is: no processor may have more than one
156 * chunk of the matrix.
157 *
158 * Blocksize cannot be too small:
159 * If the matrix spans more than one processor, the following
160 * restriction on NB, the size of each block on each processor,
161 * must hold:
162 * NB >= 2*MAX(BWL,BWU)
163 * The bulk of parallel computation is done on the matrix of size
164 * O(NB) on each processor. If this is too small, divide and conquer
165 * is a poor choice of algorithm.
166 *
167 * Submatrix reference:
168 * JA = IB
169 * Alignment restriction that prevents unnecessary communication.
170 *
171 *
172 * =====================================================================
173 *
174 *
175 * Notes
176 * =====
177 *
178 * If the factorization routine and the solve routine are to be called
179 * separately (to solve various sets of righthand sides using the same
180 * coefficient matrix), the auxiliary space AF *must not be altered*
181 * between calls to the factorization routine and the solve routine.
182 *
183 * The best algorithm for solving banded and tridiagonal linear systems
184 * depends on a variety of parameters, especially the bandwidth.
185 * Currently, only algorithms designed for the case N/P >> bw are
186 * implemented. These go by many names, including Divide and Conquer,
187 * Partitioning, domain decomposition-type, etc.
188 *
189 * Algorithm description: Divide and Conquer
190 *
191 * The Divide and Conqer algorithm assumes the matrix is narrowly
192 * banded compared with the number of equations. In this situation,
193 * it is best to distribute the input matrix A one-dimensionally,
194 * with columns atomic and rows divided amongst the processes.
195 * The basic algorithm divides the banded matrix up into
196 * P pieces with one stored on each processor,
197 * and then proceeds in 2 phases for the factorization or 3 for the
198 * solution of a linear system.
199 * 1) Local Phase:
200 * The individual pieces are factored independently and in
201 * parallel. These factors are applied to the matrix creating
202 * fillin, which is stored in a non-inspectable way in auxiliary
203 * space AF. Mathematically, this is equivalent to reordering
204 * the matrix A as P A P^T and then factoring the principal
205 * leading submatrix of size equal to the sum of the sizes of
206 * the matrices factored on each processor. The factors of
207 * these submatrices overwrite the corresponding parts of A
208 * in memory.
209 * 2) Reduced System Phase:
210 * A small (max(bwl,bwu)* (P-1)) system is formed representing
211 * interaction of the larger blocks, and is stored (as are its
212 * factors) in the space AF. A parallel Block Cyclic Reduction
213 * algorithm is used. For a linear system, a parallel front solve
214 * followed by an analagous backsolve, both using the structure
215 * of the factored matrix, are performed.
216 * 3) Backsubsitution Phase:
217 * For a linear system, a local backsubstitution is performed on
218 * each processor in parallel.
219 *
220 *
221 * Descriptors
222 * ===========
223 *
224 * Descriptors now have *types* and differ from ScaLAPACK 1.0.
225 *
226 * Note: banded codes can use either the old two dimensional
227 * or new one-dimensional descriptors, though the processor grid in
228 * both cases *must be one-dimensional*. We describe both types below.
229 *
230 * Each global data object is described by an associated description
231 * vector. This vector stores the information required to establish
232 * the mapping between an object element and its corresponding process
233 * and memory location.
234 *
235 * Let A be a generic term for any 2D block cyclicly distributed array.
236 * Such a global array has an associated description vector DESCA.
237 * In the following comments, the character _ should be read as
238 * "of the global array".
239 *
240 * NOTATION STORED IN EXPLANATION
241 * --------------- -------------- --------------------------------------
242 * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case,
243 * DTYPE_A = 1.
244 * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
245 * the BLACS process grid A is distribu-
246 * ted over. The context itself is glo-
247 * bal, but the handle (the integer
248 * value) may vary.
249 * M_A (global) DESCA( M_ ) The number of rows in the global
250 * array A.
251 * N_A (global) DESCA( N_ ) The number of columns in the global
252 * array A.
253 * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute
254 * the rows of the array.
255 * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute
256 * the columns of the array.
257 * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
258 * row of the array A is distributed.
259 * CSRC_A (global) DESCA( CSRC_ ) The process column over which the
260 * first column of the array A is
261 * distributed.
262 * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local
263 * array. LLD_A >= MAX(1,LOCr(M_A)).
264 *
265 * Let K be the number of rows or columns of a distributed matrix,
266 * and assume that its process grid has dimension p x q.
267 * LOCr( K ) denotes the number of elements of K that a process
268 * would receive if K were distributed over the p processes of its
269 * process column.
270 * Similarly, LOCc( K ) denotes the number of elements of K that a
271 * process would receive if K were distributed over the q processes of
272 * its process row.
273 * The values of LOCr() and LOCc() may be determined via a call to the
274 * ScaLAPACK tool function, NUMROC:
275 * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
276 * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
277 * An upper bound for these quantities may be computed by:
278 * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
279 * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
280 *
281 *
282 * One-dimensional descriptors:
283 *
284 * One-dimensional descriptors are a new addition to ScaLAPACK since
285 * version 1.0. They simplify and shorten the descriptor for 1D
286 * arrays.
287 *
288 * Since ScaLAPACK supports two-dimensional arrays as the fundamental
289 * object, we allow 1D arrays to be distributed either over the
290 * first dimension of the array (as if the grid were P-by-1) or the
291 * 2nd dimension (as if the grid were 1-by-P). This choice is
292 * indicated by the descriptor type (501 or 502)
293 * as described below.
294 *
295 * IMPORTANT NOTE: the actual BLACS grid represented by the
296 * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P
297 * irrespective of which one-dimensional descriptor type
298 * (501 or 502) is input.
299 * This routine will interpret the grid properly either way.
300 * ScaLAPACK routines *do not support intercontext operations* so that
301 * the grid passed to a single ScaLAPACK routine *must be the same*
302 * for all array descriptors passed to that routine.
303 *
304 * NOTE: In all cases where 1D descriptors are used, 2D descriptors
305 * may also be used, since a one-dimensional array is a special case
306 * of a two-dimensional array with one dimension of size unity.
307 * The two-dimensional array used in this case *must* be of the
308 * proper orientation:
309 * If the appropriate one-dimensional descriptor is DTYPEA=501
310 * (1 by P type), then the two dimensional descriptor must
311 * have a CTXT value that refers to a 1 by P BLACS grid;
312 * If the appropriate one-dimensional descriptor is DTYPEA=502
313 * (P by 1 type), then the two dimensional descriptor must
314 * have a CTXT value that refers to a P by 1 BLACS grid.
315 *
316 *
317 * Summary of allowed descriptors, types, and BLACS grids:
318 * DTYPE 501 502 1 1
319 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1
320 * -----------------------------------------------------
321 * A OK NO OK NO
322 * B NO OK NO OK
323 *
324 * Note that a consequence of this chart is that it is not possible
325 * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead
326 * to opposite requirements for the orientation of the BLACS grid,
327 * and as noted before, the *same* BLACS context must be used in
328 * all descriptors in a single ScaLAPACK subroutine call.
329 *
330 * Let A be a generic term for any 1D block cyclicly distributed array.
331 * Such a global array has an associated description vector DESCA.
332 * In the following comments, the character _ should be read as
333 * "of the global array".
334 *
335 * NOTATION STORED IN EXPLANATION
336 * --------------- ---------- ------------------------------------------
337 * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
338 * TYPE_A = 501: 1-by-P grid.
339 * TYPE_A = 502: P-by-1 grid.
340 * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
341 * the BLACS process grid A is distribu-
342 * ted over. The context itself is glo-
343 * bal, but the handle (the integer
344 * value) may vary.
345 * N_A (global) DESCA( 3 ) The size of the array dimension being
346 * distributed.
347 * NB_A (global) DESCA( 4 ) The blocking factor used to distribute
348 * the distributed dimension of the array.
349 * SRC_A (global) DESCA( 5 ) The process row or column over which the
350 * first row or column of the array
351 * is distributed.
352 * LLD_A (local) DESCA( 6 ) The leading dimension of the local array
353 * storing the local blocks of the distri-
354 * buted array A. Minimum value of LLD_A
355 * depends on TYPE_A.
356 * TYPE_A = 501: LLD_A >=
357 * size of undistributed dimension, 1.
358 * TYPE_A = 502: LLD_A >=NB_A, 1.
359 * Reserved DESCA( 7 ) Reserved for future use.
360 *
361 *
362 *
363 * =====================================================================
364 *
365 * Code Developer: Andrew J. Cleary, University of Tennessee.
366 * Current address: Lawrence Livermore National Labs.
367 * This version released: August, 2001.
368 *
369 * =====================================================================
370 *
371 * ..
372 * .. Parameters ..
373  REAL ONE, ZERO
374  parameter( one = 1.0e+0 )
375  parameter( zero = 0.0e+0 )
376  COMPLEX CONE, CZERO
377  parameter( cone = ( 1.0e+0, 0.0e+0 ) )
378  parameter( czero = ( 0.0e+0, 0.0e+0 ) )
379  INTEGER INT_ONE
380  parameter( int_one = 1 )
381  INTEGER DESCMULT, BIGNUM
382  parameter(descmult = 100, bignum = descmult * descmult)
383  INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
384  \$ lld_, mb_, m_, nb_, n_, rsrc_
385  parameter( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,
386  \$ ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,
387  \$ rsrc_ = 7, csrc_ = 8, lld_ = 9 )
388 * ..
389 * .. Local Scalars ..
390  INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE,
391  \$ idum1, idum2, idum3, ja_new, level_dist, llda,
392  \$ lldb, max_bw, mbw2, mycol, myrow, my_num_cols,
393  \$ nb, np, npcol, nprow, np_save, odd_size, ofst,
394  \$ part_offset, part_size, return_code, store_m_b,
395  \$ store_n_a, work_size_min, work_u
396 * ..
397 * .. Local Arrays ..
398  INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ),
399  \$ param_check( 18, 3 )
400 * ..
401 * .. External Subroutines ..
402  EXTERNAL blacs_get, blacs_gridexit, blacs_gridinfo,
403  \$ cgemm, cgerv2d, cgesd2d, clamov, cmatadd,
404  \$ ctbtrs, ctrmm, ctrtrs, desc_convert, globchk,
405  \$ pxerbla, reshape
406 * ..
407 * .. External Functions ..
408  LOGICAL LSAME
409  INTEGER NUMROC
410  EXTERNAL lsame, numroc
411 * ..
412 * .. Intrinsic Functions ..
413  INTRINSIC ichar, min, mod
414 * ..
415 * .. Executable Statements ..
416 *
417 * Test the input parameters
418 *
419  info = 0
420 *
422 * parameters, check that grid is of right shape.
423 *
424  desca_1xp( 1 ) = 501
425  descb_px1( 1 ) = 502
426 *
427  CALL desc_convert( desca, desca_1xp, return_code )
428 *
429  IF( return_code .NE. 0) THEN
430  info = -( 9*100 + 2 )
431  ENDIF
432 *
433  CALL desc_convert( descb, descb_px1, return_code )
434 *
435  IF( return_code .NE. 0) THEN
436  info = -( 12*100 + 2 )
437  ENDIF
438 *
439 * Consistency checks for DESCA and DESCB.
440 *
441 * Context must be the same
442  IF( desca_1xp( 2 ) .NE. descb_px1( 2 ) ) THEN
443  info = -( 12*100 + 2 )
444  ENDIF
445 *
446 * These are alignment restrictions that may or may not be removed
447 * in future releases. -Andy Cleary, April 14, 1996.
448 *
449 * Block sizes must be the same
450  IF( desca_1xp( 4 ) .NE. descb_px1( 4 ) ) THEN
451  info = -( 12*100 + 4 )
452  ENDIF
453 *
454 * Source processor must be the same
455 *
456  IF( desca_1xp( 5 ) .NE. descb_px1( 5 ) ) THEN
457  info = -( 12*100 + 5 )
458  ENDIF
459 *
460 * Get values out of descriptor for use in code.
461 *
462  ictxt = desca_1xp( 2 )
463  csrc = desca_1xp( 5 )
464  nb = desca_1xp( 4 )
465  llda = desca_1xp( 6 )
466  store_n_a = desca_1xp( 3 )
467  lldb = descb_px1( 6 )
468  store_m_b = descb_px1( 3 )
469 *
470 * Get grid parameters
471 *
472 *
473 * Size of separator blocks is maximum of bandwidths
474 *
475  max_bw = max(bwl,bwu)
476  mbw2 = max_bw * max_bw
477 *
478  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
479  np = nprow * npcol
480 *
481 *
482 *
483  IF( lsame( uplo, 'U' ) ) THEN
484  idum1 = ichar( 'U' )
485  ELSE IF ( lsame( uplo, 'L' ) ) THEN
486  idum1 = ichar( 'L' )
487  ELSE
488  info = -1
489  END IF
490 *
491  IF( lsame( trans, 'N' ) ) THEN
492  idum2 = ichar( 'N' )
493  ELSE IF ( lsame( trans, 'C' ) ) THEN
494  idum2 = ichar( 'C' )
495  ELSE
496  info = -2
497  END IF
498 *
499  IF( lwork .LT. -1) THEN
500  info = -16
501  ELSE IF ( lwork .EQ. -1 ) THEN
502  idum3 = -1
503  ELSE
504  idum3 = 1
505  ENDIF
506 *
507  IF( n .LT. 0 ) THEN
508  info = -3
509  ENDIF
510 *
511  IF( n+ja-1 .GT. store_n_a ) THEN
512  info = -( 9*100 + 6 )
513  ENDIF
514 *
515  IF(( bwl .GT. n-1 ) .OR.
516  \$ ( bwl .LT. 0 ) ) THEN
517  info = -4
518  ENDIF
519 *
520  IF(( bwu .GT. n-1 ) .OR.
521  \$ ( bwu .LT. 0 ) ) THEN
522  info = -5
523  ENDIF
524 *
525  IF( llda .LT. (bwl+bwu+1) ) THEN
526  info = -( 9*100 + 6 )
527  ENDIF
528 *
529  IF( nb .LE. 0 ) THEN
530  info = -( 9*100 + 4 )
531  ENDIF
532 *
533  IF( n+ib-1 .GT. store_m_b ) THEN
534  info = -( 12*100 + 3 )
535  ENDIF
536 *
537  IF( lldb .LT. nb ) THEN
538  info = -( 12*100 + 6 )
539  ENDIF
540 *
541  IF( nrhs .LT. 0 ) THEN
542  info = -6
543  ENDIF
544 *
545 * Current alignment restriction
546 *
547  IF( ja .NE. ib) THEN
548  info = -8
549  ENDIF
550 *
551 * Argument checking that is specific to Divide & Conquer routine
552 *
553  IF( nprow .NE. 1 ) THEN
554  info = -( 9*100+2 )
555  ENDIF
556 *
557  IF( n .GT. np*nb-mod( ja-1, nb )) THEN
558  info = -( 3 )
559  CALL pxerbla( ictxt,
560  \$ 'PCDBTRSV, D&C alg.: only 1 block per proc',
561  \$ -info )
562  RETURN
563  ENDIF
564 *
565  IF((ja+n-1.GT.nb) .AND. ( nb.LT.2*max(bwl,bwu) )) THEN
566  info = -( 9*100+4 )
567  CALL pxerbla( ictxt,
568  \$ 'PCDBTRSV, D&C alg.: NB too small',
569  \$ -info )
570  RETURN
571  ENDIF
572 *
573 *
574  work_size_min =
575  \$ max(bwl,bwu)*nrhs
576 *
577  work( 1 ) = work_size_min
578 *
579  IF( lwork .LT. work_size_min ) THEN
580  IF( lwork .NE. -1 ) THEN
581  info = -16
582  CALL pxerbla( ictxt,
583  \$ 'PCDBTRSV: worksize error',
584  \$ -info )
585  ENDIF
586  RETURN
587  ENDIF
588 *
589 * Pack params and positions into arrays for global consistency check
590 *
591  param_check( 18, 1 ) = descb(5)
592  param_check( 17, 1 ) = descb(4)
593  param_check( 16, 1 ) = descb(3)
594  param_check( 15, 1 ) = descb(2)
595  param_check( 14, 1 ) = descb(1)
596  param_check( 13, 1 ) = ib
597  param_check( 12, 1 ) = desca(5)
598  param_check( 11, 1 ) = desca(4)
599  param_check( 10, 1 ) = desca(3)
600  param_check( 9, 1 ) = desca(1)
601  param_check( 8, 1 ) = ja
602  param_check( 7, 1 ) = nrhs
603  param_check( 6, 1 ) = bwu
604  param_check( 5, 1 ) = bwl
605  param_check( 4, 1 ) = n
606  param_check( 3, 1 ) = idum3
607  param_check( 2, 1 ) = idum2
608  param_check( 1, 1 ) = idum1
609 *
610  param_check( 18, 2 ) = 1205
611  param_check( 17, 2 ) = 1204
612  param_check( 16, 2 ) = 1203
613  param_check( 15, 2 ) = 1202
614  param_check( 14, 2 ) = 1201
615  param_check( 13, 2 ) = 11
616  param_check( 12, 2 ) = 905
617  param_check( 11, 2 ) = 904
618  param_check( 10, 2 ) = 903
619  param_check( 9, 2 ) = 901
620  param_check( 8, 2 ) = 8
621  param_check( 7, 2 ) = 6
622  param_check( 6, 2 ) = 5
623  param_check( 5, 2 ) = 4
624  param_check( 4, 2 ) = 3
625  param_check( 3, 2 ) = 16
626  param_check( 2, 2 ) = 2
627  param_check( 1, 2 ) = 1
628 *
629 * Want to find errors with MIN( ), so if no error, set it to a big
630 * number. If there already is an error, multiply by the the
631 * descriptor multiplier.
632 *
633  IF( info.GE.0 ) THEN
634  info = bignum
635  ELSE IF( info.LT.-descmult ) THEN
636  info = -info
637  ELSE
638  info = -info * descmult
639  END IF
640 *
641 * Check consistency across processors
642 *
643  CALL globchk( ictxt, 18, param_check, 18,
644  \$ param_check( 1, 3 ), info )
645 *
646 * Prepare output: set info = 0 if no error, and divide by DESCMULT
647 * if error is not in a descriptor entry.
648 *
649  IF( info.EQ.bignum ) THEN
650  info = 0
651  ELSE IF( mod( info, descmult ) .EQ. 0 ) THEN
652  info = -info / descmult
653  ELSE
654  info = -info
655  END IF
656 *
657  IF( info.LT.0 ) THEN
658  CALL pxerbla( ictxt, 'PCDBTRSV', -info )
659  RETURN
660  END IF
661 *
662 * Quick return if possible
663 *
664  IF( n.EQ.0 )
665  \$ RETURN
666 *
667  IF( nrhs.EQ.0 )
668  \$ RETURN
669 *
670 *
672 * the beginning part of the relevant data
673 *
674  part_offset = nb*( (ja-1)/(npcol*nb) )
675 *
676  IF ( (mycol-csrc) .LT. (ja-part_offset-1)/nb ) THEN
677  part_offset = part_offset + nb
678  ENDIF
679 *
680  IF ( mycol .LT. csrc ) THEN
681  part_offset = part_offset - nb
682  ENDIF
683 *
684 * Form a new BLACS grid (the "standard form" grid) with only procs
685 * holding part of the matrix, of size 1xNP where NP is adjusted,
686 * starting at csrc=0, with JA modified to reflect dropped procs.
687 *
688 * First processor to hold part of the matrix:
689 *
690  first_proc = mod( ( ja-1 )/nb+csrc, npcol )
691 *
692 * Calculate new JA one while dropping off unused processors.
693 *
694  ja_new = mod( ja-1, nb ) + 1
695 *
696 * Save and compute new value of NP
697 *
698  np_save = np
699  np = ( ja_new+n-2 )/nb + 1
700 *
701 * Call utility routine that forms "standard-form" grid
702 *
703  CALL reshape( ictxt, int_one, ictxt_new, int_one,
704  \$ first_proc, int_one, np )
705 *
706 * Use new context from standard grid as context.
707 *
708  ictxt_save = ictxt
709  ictxt = ictxt_new
710  desca_1xp( 2 ) = ictxt_new
711  descb_px1( 2 ) = ictxt_new
712 *
713 * Get information about new grid.
714 *
715  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
716 *
717 * Drop out processors that do not have part of the matrix.
718 *
719  IF( myrow .LT. 0 ) THEN
720  GOTO 1234
721  ENDIF
722 *
723 * ********************************
724 * Values reused throughout routine
725 *
726 * User-input value of partition size
727 *
728  part_size = nb
729 *
730 * Number of columns in each processor
731 *
732  my_num_cols = numroc( n, part_size, mycol, 0, npcol )
733 *
734 * Offset in columns to beginning of main partition in each proc
735 *
736  IF ( mycol .EQ. 0 ) THEN
737  part_offset = part_offset+mod( ja_new-1, part_size )
738  my_num_cols = my_num_cols - mod(ja_new-1, part_size )
739  ENDIF
740 *
741 * Offset in elements
742 *
743  ofst = part_offset*llda
744 *
745 * Size of main (or odd) partition in each processor
746 *
747  odd_size = my_num_cols
748  IF ( mycol .LT. np-1 ) THEN
749  odd_size = odd_size - max_bw
750  ENDIF
751 *
752 * Offset to workspace for Upper triangular factor
753 *
754  work_u = bwu*odd_size + 3*mbw2
755 *
756 *
757 *
758 * Begin main code
759 *
760  IF ( lsame( uplo, 'L' ) ) THEN
761 *
762  IF ( lsame( trans, 'N' ) ) THEN
763 *
764 * Frontsolve
765 *
766 *
767 ******************************************
768 * Local computation phase
769 ******************************************
770 *
771 * Use main partition in each processor to solve locally
772 *
773  CALL ctbtrs( uplo, 'N', 'U', odd_size,
774  \$ bwl, nrhs,
775  \$ a( ofst+1+bwu ), llda,
776  \$ b( part_offset+1 ), lldb, info )
777 *
778 *
779  IF ( mycol .LT. np-1 ) THEN
780 * Use factorization of odd-even connection block to modify
781 * locally stored portion of right hand side(s)
782 *
783 *
784 * First copy and multiply it into temporary storage,
785 * then use it on RHS
786 *
787  CALL clamov( 'N', bwl, nrhs,
788  \$ b( part_offset+odd_size-bwl+1), lldb,
789  \$ work( 1 ), max_bw )
790 *
791  CALL ctrmm( 'L', 'U', 'N', 'N', bwl, nrhs, -cone,
792  \$ a(( ofst+(bwl+bwu+1)+(odd_size-bwl)*llda )),
793  \$ llda-1, work( 1 ), max_bw )
794 *
795  CALL cmatadd( bwl, nrhs, cone, work( 1 ), max_bw,
796  \$ cone, b( part_offset+odd_size+1 ), lldb )
797 *
798  ENDIF
799 *
800 * Clear garbage out of workspace block
801 *
802  DO 10 idum1=1, work_size_min
803  work( idum1 )=0.0
804  10 CONTINUE
805 *
806 *
807  IF ( mycol .NE. 0 ) THEN
808 * Use the "spike" fillin to calculate contribution to previous
809 * processor's righthand-side.
810 *
811  CALL cgemm( 'C', 'N', bwu, nrhs, odd_size, -cone, af( 1 ),
812  \$ odd_size, b( part_offset+1 ), lldb, czero,
813  \$ work( 1+max_bw-bwu ), max_bw )
814  ENDIF
815 *
816 *
817 ************************************************
818 * Formation and solution of reduced system
819 ************************************************
820 *
821 *
822 * Send modifications to prior processor's right hand sides
823 *
824  IF( mycol .GT. 0) THEN
825 *
826  CALL cgesd2d( ictxt, max_bw, nrhs,
827  \$ work( 1 ), max_bw,
828  \$ 0, mycol - 1 )
829 *
830  ENDIF
831 *
832 * Receive modifications to processor's right hand sides
833 *
834  IF( mycol .LT. npcol-1) THEN
835 *
836  CALL cgerv2d( ictxt, max_bw, nrhs,
837  \$ work( 1 ), max_bw,
838  \$ 0, mycol + 1 )
839 *
840 * Combine contribution to locally stored right hand sides
841 *
842  CALL cmatadd( max_bw, nrhs, cone,
843  \$ work( 1 ), max_bw, cone,
844  \$ b( part_offset+odd_size + 1 ), lldb )
845 *
846  ENDIF
847 *
848 *
849 * The last processor does not participate in the solution of the
850 * reduced system, having sent its contribution already.
851  IF( mycol .EQ. npcol-1 ) THEN
852  GOTO 14
853  ENDIF
854 *
855 *
856 * *************************************
857 * Modification Loop
858 *
859 * The distance for sending and receiving for each level starts
860 * at 1 for the first level.
861  level_dist = 1
862 *
863 * Do until this proc is needed to modify other procs' equations
864 *
865  12 CONTINUE
866  IF( mod( (mycol+1)/level_dist, 2) .NE. 0 ) GOTO 11
867 *
869 *
870  IF( mycol-level_dist .GE. 0 ) THEN
871 *
872  CALL cgerv2d( ictxt, max_bw, nrhs,
873  \$ work( 1 ),
874  \$ max_bw, 0, mycol-level_dist )
875 *
876  CALL cmatadd( max_bw, nrhs, cone,
877  \$ work( 1 ), max_bw, cone,
878  \$ b( part_offset+odd_size + 1 ), lldb )
879 *
880  ENDIF
881 *
883 *
884  IF( mycol+level_dist .LT. npcol-1 ) THEN
885 *
886  CALL cgerv2d( ictxt, max_bw, nrhs,
887  \$ work( 1 ),
888  \$ max_bw, 0, mycol+level_dist )
889 *
890  CALL cmatadd( max_bw, nrhs, cone,
891  \$ work( 1 ), max_bw, cone,
892  \$ b( part_offset+odd_size + 1 ), lldb )
893 *
894  ENDIF
895 *
896  level_dist = level_dist*2
897 *
898  GOTO 12
899  11 CONTINUE
900 * [End of GOTO Loop]
901 *
902 *
903 *
904 * *********************************
905 * Calculate and use this proc's blocks to modify other procs
906 *
907 * Solve with diagonal block
908 *
909  CALL ctbtrs( 'L', 'N', 'U', max_bw, min( bwl, max_bw-1 ), nrhs,
910  \$ af( odd_size*bwu+mbw2+1 ), max_bw+1,
911  \$ b( part_offset+odd_size+1 ), lldb, info )
912 *
913  IF( info.NE.0 ) THEN
914  GO TO 1000
915  ENDIF
916 *
917 *
918 *
919 * *********
920  IF( mycol/level_dist .LE. (npcol-1)/level_dist-2 )THEN
921 *
922 * Calculate contribution from this block to next diagonal block
923 *
924  CALL cgemm( 'C', 'N', max_bw, nrhs, max_bw, -cone,
925  \$ af( (odd_size)*bwu+1 ),
926  \$ max_bw,
927  \$ b( part_offset+odd_size+1 ),
928  \$ lldb, czero,
929  \$ work( 1 ),
930  \$ max_bw )
931 *
932 * Send contribution to diagonal block's owning processor.
933 *
934  CALL cgesd2d( ictxt, max_bw, nrhs,
935  \$ work( 1 ),
936  \$ max_bw, 0, mycol+level_dist )
937 *
938  ENDIF
939 * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
940 *
941 * ************
942  IF( (mycol/level_dist .GT. 0 ).AND.
943  \$ ( mycol/level_dist .LE. (npcol-1)/level_dist-1 ) ) THEN
944 *
945 *
946 * Use offdiagonal block to calculate modification to diag block
947 * of processor to the left
948 *
949  CALL cgemm( 'N', 'N', max_bw, nrhs, max_bw, -cone,
950  \$ af( odd_size*bwu+2*mbw2+1 ),
951  \$ max_bw,
952  \$ b( part_offset+odd_size+1 ),
953  \$ lldb, czero,
954  \$ work( 1 ),
955  \$ max_bw )
956 *
957 * Send contribution to diagonal block's owning processor.
958 *
959  CALL cgesd2d( ictxt, max_bw, nrhs,
960  \$ work( 1 ),
961  \$ max_bw, 0, mycol-level_dist )
962 *
963  ENDIF
964 * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
965 *
966  14 CONTINUE
967 *
968  ELSE
969 *
970 ******************** BACKSOLVE *************************************
971 *
972 ********************************************************************
973 * .. Begin reduced system phase of algorithm ..
974 ********************************************************************
975 *
976 *
977 *
978 * The last processor does not participate in the solution of the
979 * reduced system and just waits to receive its solution.
980  IF( mycol .EQ. npcol-1 ) THEN
981  GOTO 24
982  ENDIF
983 *
984 * Determine number of steps in tree loop
985 *
986  level_dist = 1
987  27 CONTINUE
988  IF( mod( (mycol+1)/level_dist, 2) .NE. 0 ) GOTO 26
989 *
990  level_dist = level_dist*2
991 *
992  GOTO 27
993  26 CONTINUE
994 *
995 *
996  IF( (mycol/level_dist .GT. 0 ).AND.
997  \$ ( mycol/level_dist .LE. (npcol-1)/level_dist-1 ) ) THEN
998 *
999 * Receive solution from processor to left
1000 *
1001  CALL cgerv2d( ictxt, max_bw, nrhs,
1002  \$ work( 1 ),
1003  \$ max_bw, 0, mycol-level_dist )
1004 *
1005 *
1006 * Use offdiagonal block to calculate modification to RHS stored
1007 * on this processor
1008 *
1009  CALL cgemm( 'C', 'N', max_bw, nrhs, max_bw, -cone,
1010  \$ af( odd_size*bwu+2*mbw2+1 ),
1011  \$ max_bw,
1012  \$ work( 1 ),
1013  \$ max_bw, cone,
1014  \$ b( part_offset+odd_size+1 ),
1015  \$ lldb )
1016  ENDIF
1017 * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
1018 *
1019 *
1020  IF( mycol/level_dist .LE. (npcol-1)/level_dist-2 )THEN
1021 *
1022 * Receive solution from processor to right
1023 *
1024  CALL cgerv2d( ictxt, max_bw, nrhs,
1025  \$ work( 1 ),
1026  \$ max_bw, 0, mycol+level_dist )
1027 *
1028 * Calculate contribution from this block to next diagonal block
1029 *
1030  CALL cgemm( 'N', 'N', max_bw, nrhs, max_bw, -cone,
1031  \$ af( (odd_size)*bwu+1 ),
1032  \$ max_bw,
1033  \$ work( 1 ),
1034  \$ max_bw, cone,
1035  \$ b( part_offset+odd_size+1 ),
1036  \$ lldb )
1037 *
1038  ENDIF
1039 * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
1040 *
1041 *
1042 * Solve with diagonal block
1043 *
1044  CALL ctbtrs( 'L', 'C', 'U', max_bw, min( bwl, max_bw-1 ), nrhs,
1045  \$ af( odd_size*bwu+mbw2+1 ), max_bw+1,
1046  \$ b( part_offset+odd_size+1 ), lldb, info )
1047 *
1048  IF( info.NE.0 ) THEN
1049  GO TO 1000
1050  ENDIF
1051 *
1052 *
1053 *
1054 ***Modification Loop *******
1055 *
1056  22 CONTINUE
1057  IF( level_dist .EQ. 1 ) GOTO 21
1058 *
1059  level_dist = level_dist/2
1060 *
1061 * Send solution to the right
1062 *
1063  IF( mycol+level_dist .LT. npcol-1 ) THEN
1064 *
1065  CALL cgesd2d( ictxt, max_bw, nrhs,
1066  \$ b( part_offset+odd_size+1 ),
1067  \$ lldb, 0, mycol+level_dist )
1068 *
1069  ENDIF
1070 *
1071 * Send solution to left
1072 *
1073  IF( mycol-level_dist .GE. 0 ) THEN
1074 *
1075  CALL cgesd2d( ictxt, max_bw, nrhs,
1076  \$ b( part_offset+odd_size+1 ),
1077  \$ lldb, 0, mycol-level_dist )
1078 *
1079  ENDIF
1080 *
1081  GOTO 22
1082  21 CONTINUE
1083 * [End of GOTO Loop]
1084 *
1085  24 CONTINUE
1086 * [Processor npcol - 1 jumped to here to await next stage]
1087 *
1088 *******************************
1089 * Reduced system has been solved, communicate solutions to nearest
1090 * neighbors in preparation for local computation phase.
1091 *
1092 *
1093 * Send elements of solution to next proc
1094 *
1095  IF( mycol .LT. npcol-1) THEN
1096 *
1097  CALL cgesd2d( ictxt, max_bw, nrhs,
1098  \$ b( part_offset+odd_size+1 ), lldb,
1099  \$ 0, mycol +1 )
1100 *
1101  ENDIF
1102 *
1103 * Receive modifications to processor's right hand sides
1104 *
1105  IF( mycol .GT. 0) THEN
1106 *
1107  CALL cgerv2d( ictxt, max_bw, nrhs,
1108  \$ work( 1 ), max_bw,
1109  \$ 0, mycol - 1 )
1110 *
1111  ENDIF
1112 *
1113 *
1114 *
1115 **********************************************
1116 * Local computation phase
1117 **********************************************
1118 *
1119  IF ( mycol .NE. 0 ) THEN
1120 * Use the "spike" fillin to calculate contribution from previous
1121 * processor's solution.
1122 *
1123  CALL cgemm( 'N', 'N', odd_size, nrhs, bwu, -cone, af( 1 ),
1124  \$ odd_size, work( 1+max_bw-bwu ), max_bw, cone,
1125  \$ b( part_offset+1 ), lldb )
1126 *
1127  ENDIF
1128 *
1129 *
1130  IF ( mycol .LT. np-1 ) THEN
1131 * Use factorization of odd-even connection block to modify
1132 * locally stored portion of right hand side(s)
1133 *
1134 *
1135 * First copy and multiply it into temporary storage,
1136 * then use it on RHS
1137 *
1138  CALL clamov( 'N', bwl, nrhs, b( part_offset+odd_size+1), lldb,
1139  \$ work( 1+max_bw-bwl ), max_bw )
1140 *
1141  CALL ctrmm( 'L', 'U', 'C', 'N', bwl, nrhs, -cone,
1142  \$ a(( ofst+(bwl+bwu+1)+(odd_size-bwl)*llda )),
1143  \$ llda-1, work( 1+max_bw-bwl ), max_bw )
1144 *
1145  CALL cmatadd( bwl, nrhs, cone, work( 1+max_bw-bwl ), max_bw,
1146  \$ cone, b( part_offset+odd_size-bwl+1 ), lldb )
1147 *
1148  ENDIF
1149 *
1150 * Use main partition in each processor to solve locally
1151 *
1152  CALL ctbtrs( uplo, 'C', 'U', odd_size,
1153  \$ bwl, nrhs,
1154  \$ a( ofst+1+bwu ),
1155  \$ llda, b( part_offset+1 ),
1156  \$ lldb, info )
1157 *
1158  ENDIF
1159 * End of "IF( LSAME( TRANS, 'N' ) )"...
1160 *
1161 *
1162  ELSE
1163 ***************************************************************
1164 * CASE UPLO = 'U' *
1165 ***************************************************************
1166  IF ( lsame( trans, 'C' ) ) THEN
1167 *
1168 * Frontsolve
1169 *
1170 *
1171 ******************************************
1172 * Local computation phase
1173 ******************************************
1174 *
1175 * Use main partition in each processor to solve locally
1176 *
1177  CALL ctbtrs( uplo, 'C', 'N', odd_size,
1178  \$ bwu, nrhs,
1179  \$ a( ofst+1 ), llda,
1180  \$ b( part_offset+1 ), lldb, info )
1181 *
1182 *
1183  IF ( mycol .LT. np-1 ) THEN
1184 * Use factorization of odd-even connection block to modify
1185 * locally stored portion of right hand side(s)
1186 *
1187 *
1188 * First copy and multiply it into temporary storage,
1189 * then use it on RHS
1190 *
1191  CALL clamov( 'N', bwu, nrhs,
1192  \$ b( part_offset+odd_size-bwu+1), lldb,
1193  \$ work( 1 ), max_bw )
1194 *
1195  CALL ctrmm( 'L', 'L', 'C', 'N', bwu, nrhs, -cone,
1196  \$ a(( ofst+1+odd_size*llda )), llda-1, work( 1 ),
1197  \$ max_bw )
1198 *
1199  CALL cmatadd( bwu, nrhs, cone, work( 1 ), max_bw,
1200  \$ cone, b( part_offset+odd_size+1 ), lldb )
1201 *
1202  ENDIF
1203 *
1204 * Clear garbage out of workspace block
1205 *
1206  DO 20 idum1=1, work_size_min
1207  work( idum1 )=0.0
1208  20 CONTINUE
1209 *
1210 *
1211  IF ( mycol .NE. 0 ) THEN
1212 * Use the "spike" fillin to calculate contribution to previous
1213 * processor's righthand-side.
1214 *
1215  CALL cgemm( 'C', 'N', bwl, nrhs, odd_size, -cone,
1216  \$ af( work_u+1 ), odd_size, b( part_offset+1 ),
1217  \$ lldb, czero, work( 1+max_bw-bwl ), max_bw )
1218  ENDIF
1219 *
1220 *
1221 ************************************************
1222 * Formation and solution of reduced system
1223 ************************************************
1224 *
1225 *
1226 * Send modifications to prior processor's right hand sides
1227 *
1228  IF( mycol .GT. 0) THEN
1229 *
1230  CALL cgesd2d( ictxt, max_bw, nrhs,
1231  \$ work( 1 ), max_bw,
1232  \$ 0, mycol - 1 )
1233 *
1234  ENDIF
1235 *
1236 * Receive modifications to processor's right hand sides
1237 *
1238  IF( mycol .LT. npcol-1) THEN
1239 *
1240  CALL cgerv2d( ictxt, max_bw, nrhs,
1241  \$ work( 1 ), max_bw,
1242  \$ 0, mycol + 1 )
1243 *
1244 * Combine contribution to locally stored right hand sides
1245 *
1246  CALL cmatadd( max_bw, nrhs, cone,
1247  \$ work( 1 ), max_bw, cone,
1248  \$ b( part_offset+odd_size + 1 ), lldb )
1249 *
1250  ENDIF
1251 *
1252 *
1253 * The last processor does not participate in the solution of the
1254 * reduced system, having sent its contribution already.
1255  IF( mycol .EQ. npcol-1 ) THEN
1256  GOTO 44
1257  ENDIF
1258 *
1259 *
1260 * *************************************
1261 * Modification Loop
1262 *
1263 * The distance for sending and receiving for each level starts
1264 * at 1 for the first level.
1265  level_dist = 1
1266 *
1267 * Do until this proc is needed to modify other procs' equations
1268 *
1269  42 CONTINUE
1270  IF( mod( (mycol+1)/level_dist, 2) .NE. 0 ) GOTO 41
1271 *
1273 *
1274  IF( mycol-level_dist .GE. 0 ) THEN
1275 *
1276  CALL cgerv2d( ictxt, max_bw, nrhs,
1277  \$ work( 1 ),
1278  \$ max_bw, 0, mycol-level_dist )
1279 *
1280  CALL cmatadd( max_bw, nrhs, cone,
1281  \$ work( 1 ), max_bw, cone,
1282  \$ b( part_offset+odd_size + 1 ), lldb )
1283 *
1284  ENDIF
1285 *
1287 *
1288  IF( mycol+level_dist .LT. npcol-1 ) THEN
1289 *
1290  CALL cgerv2d( ictxt, max_bw, nrhs,
1291  \$ work( 1 ),
1292  \$ max_bw, 0, mycol+level_dist )
1293 *
1294  CALL cmatadd( max_bw, nrhs, cone,
1295  \$ work( 1 ), max_bw, cone,
1296  \$ b( part_offset+odd_size + 1 ), lldb )
1297 *
1298  ENDIF
1299 *
1300  level_dist = level_dist*2
1301 *
1302  GOTO 42
1303  41 CONTINUE
1304 * [End of GOTO Loop]
1305 *
1306 *
1307 *
1308 * *********************************
1309 * Calculate and use this proc's blocks to modify other procs
1310 *
1311 * Solve with diagonal block
1312 *
1313  CALL ctbtrs( 'U', 'C', 'N', max_bw, min( bwu, max_bw-1 ), nrhs,
1314  \$ af( odd_size*bwu+mbw2+1-min( bwu, max_bw-1 ) ),
1315  \$ max_bw+1, b( part_offset+odd_size+1 ), lldb,
1316  \$ info )
1317 *
1318  IF( info.NE.0 ) THEN
1319  GO TO 1000
1320  ENDIF
1321 *
1322 *
1323 *
1324 * *********
1325  IF( mycol/level_dist .LE. (npcol-1)/level_dist-2 )THEN
1326 *
1327 * Calculate contribution from this block to next diagonal block
1328 *
1329  CALL cgemm( 'C', 'N', max_bw, nrhs, max_bw, -cone,
1330  \$ af( work_u+(odd_size)*bwl+1 ),
1331  \$ max_bw,
1332  \$ b( part_offset+odd_size+1 ),
1333  \$ lldb, czero,
1334  \$ work( 1 ),
1335  \$ max_bw )
1336 *
1337 * Send contribution to diagonal block's owning processor.
1338 *
1339  CALL cgesd2d( ictxt, max_bw, nrhs,
1340  \$ work( 1 ),
1341  \$ max_bw, 0, mycol+level_dist )
1342 *
1343  ENDIF
1344 * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
1345 *
1346 * ************
1347  IF( (mycol/level_dist .GT. 0 ).AND.
1348  \$ ( mycol/level_dist .LE. (npcol-1)/level_dist-1 ) ) THEN
1349 *
1350 *
1351 * Use offdiagonal block to calculate modification to diag block
1352 * of processor to the left
1353 *
1354  CALL cgemm( 'N', 'N', max_bw, nrhs, max_bw, -cone,
1355  \$ af( work_u+odd_size*bwl+2*mbw2+1 ),
1356  \$ max_bw,
1357  \$ b( part_offset+odd_size+1 ),
1358  \$ lldb, czero,
1359  \$ work( 1 ),
1360  \$ max_bw )
1361 *
1362 * Send contribution to diagonal block's owning processor.
1363 *
1364  CALL cgesd2d( ictxt, max_bw, nrhs,
1365  \$ work( 1 ),
1366  \$ max_bw, 0, mycol-level_dist )
1367 *
1368  ENDIF
1369 * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
1370 *
1371  44 CONTINUE
1372 *
1373  ELSE
1374 *
1375 ******************** BACKSOLVE *************************************
1376 *
1377 ********************************************************************
1378 * .. Begin reduced system phase of algorithm ..
1379 ********************************************************************
1380 *
1381 *
1382 *
1383 * The last processor does not participate in the solution of the
1384 * reduced system and just waits to receive its solution.
1385  IF( mycol .EQ. npcol-1 ) THEN
1386  GOTO 54
1387  ENDIF
1388 *
1389 * Determine number of steps in tree loop
1390 *
1391  level_dist = 1
1392  57 CONTINUE
1393  IF( mod( (mycol+1)/level_dist, 2) .NE. 0 ) GOTO 56
1394 *
1395  level_dist = level_dist*2
1396 *
1397  GOTO 57
1398  56 CONTINUE
1399 *
1400 *
1401  IF( (mycol/level_dist .GT. 0 ).AND.
1402  \$ ( mycol/level_dist .LE. (npcol-1)/level_dist-1 ) ) THEN
1403 *
1404 * Receive solution from processor to left
1405 *
1406  CALL cgerv2d( ictxt, max_bw, nrhs,
1407  \$ work( 1 ),
1408  \$ max_bw, 0, mycol-level_dist )
1409 *
1410 *
1411 * Use offdiagonal block to calculate modification to RHS stored
1412 * on this processor
1413 *
1414  CALL cgemm( 'C', 'N', max_bw, nrhs, max_bw, -cone,
1415  \$ af( work_u+odd_size*bwl+2*mbw2+1 ),
1416  \$ max_bw,
1417  \$ work( 1 ),
1418  \$ max_bw, cone,
1419  \$ b( part_offset+odd_size+1 ),
1420  \$ lldb )
1421  ENDIF
1422 * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
1423 *
1424 *
1425  IF( mycol/level_dist .LE. (npcol-1)/level_dist-2 )THEN
1426 *
1427 * Receive solution from processor to right
1428 *
1429  CALL cgerv2d( ictxt, max_bw, nrhs,
1430  \$ work( 1 ),
1431  \$ max_bw, 0, mycol+level_dist )
1432 *
1433 * Calculate contribution from this block to next diagonal block
1434 *
1435  CALL cgemm( 'N', 'N', max_bw, nrhs, max_bw, -cone,
1436  \$ af( work_u+(odd_size)*bwl+1 ),
1437  \$ max_bw,
1438  \$ work( 1 ),
1439  \$ max_bw, cone,
1440  \$ b( part_offset+odd_size+1 ),
1441  \$ lldb )
1442 *
1443  ENDIF
1444 * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
1445 *
1446 *
1447 * Solve with diagonal block
1448 *
1449  CALL ctbtrs( 'U', 'N', 'N', max_bw, min( bwu, max_bw-1 ), nrhs,
1450  \$ af( odd_size*bwu+mbw2+1-min( bwu, max_bw-1 ) ),
1451  \$ max_bw+1, b( part_offset+odd_size+1 ), lldb,
1452  \$ info )
1453 *
1454  IF( info.NE.0 ) THEN
1455  GO TO 1000
1456  ENDIF
1457 *
1458 *
1459 *
1460 ***Modification Loop *******
1461 *
1462  52 CONTINUE
1463  IF( level_dist .EQ. 1 ) GOTO 51
1464 *
1465  level_dist = level_dist/2
1466 *
1467 * Send solution to the right
1468 *
1469  IF( mycol+level_dist .LT. npcol-1 ) THEN
1470 *
1471  CALL cgesd2d( ictxt, max_bw, nrhs,
1472  \$ b( part_offset+odd_size+1 ),
1473  \$ lldb, 0, mycol+level_dist )
1474 *
1475  ENDIF
1476 *
1477 * Send solution to left
1478 *
1479  IF( mycol-level_dist .GE. 0 ) THEN
1480 *
1481  CALL cgesd2d( ictxt, max_bw, nrhs,
1482  \$ b( part_offset+odd_size+1 ),
1483  \$ lldb, 0, mycol-level_dist )
1484 *
1485  ENDIF
1486 *
1487  GOTO 52
1488  51 CONTINUE
1489 * [End of GOTO Loop]
1490 *
1491  54 CONTINUE
1492 * [Processor npcol - 1 jumped to here to await next stage]
1493 *
1494 *******************************
1495 * Reduced system has been solved, communicate solutions to nearest
1496 * neighbors in preparation for local computation phase.
1497 *
1498 *
1499 * Send elements of solution to next proc
1500 *
1501  IF( mycol .LT. npcol-1) THEN
1502 *
1503  CALL cgesd2d( ictxt, max_bw, nrhs,
1504  \$ b( part_offset+odd_size+1 ), lldb,
1505  \$ 0, mycol +1 )
1506 *
1507  ENDIF
1508 *
1509 * Receive modifications to processor's right hand sides
1510 *
1511  IF( mycol .GT. 0) THEN
1512 *
1513  CALL cgerv2d( ictxt, max_bw, nrhs,
1514  \$ work( 1 ), max_bw,
1515  \$ 0, mycol - 1 )
1516 *
1517  ENDIF
1518 *
1519 *
1520 *
1521 **********************************************
1522 * Local computation phase
1523 **********************************************
1524 *
1525  IF ( mycol .NE. 0 ) THEN
1526 * Use the "spike" fillin to calculate contribution from previous
1527 * processor's solution.
1528 *
1529  CALL cgemm( 'N', 'N', odd_size, nrhs, bwl, -cone,
1530  \$ af( work_u+1 ), odd_size, work( 1+max_bw-bwl ),
1531  \$ max_bw, cone, b( part_offset+1 ), lldb )
1532 *
1533  ENDIF
1534 *
1535 *
1536  IF ( mycol .LT. np-1 ) THEN
1537 * Use factorization of odd-even connection block to modify
1538 * locally stored portion of right hand side(s)
1539 *
1540 *
1541 * First copy and multiply it into temporary storage,
1542 * then use it on RHS
1543 *
1544  CALL clamov( 'N', bwu, nrhs, b( part_offset+odd_size+1), lldb,
1545  \$ work( 1+max_bw-bwu ), max_bw+bwl )
1546 *
1547  CALL ctrmm( 'L', 'L', 'N', 'N', bwu, nrhs, -cone,
1548  \$ a(( ofst+1+odd_size*llda )), llda-1,
1549  \$ work( 1+max_bw-bwu ), max_bw+bwl )
1550 *
1551  CALL cmatadd( bwu, nrhs, cone, work( 1+max_bw-bwu ),
1552  \$ max_bw+bwl, cone,
1553  \$ b( part_offset+odd_size-bwu+1 ), lldb )
1554 *
1555  ENDIF
1556 *
1557 * Use main partition in each processor to solve locally
1558 *
1559  CALL ctbtrs( uplo, 'N', 'N', odd_size,
1560  \$ bwu, nrhs,
1561  \$ a( ofst+1 ),
1562  \$ llda, b( part_offset+1 ),
1563  \$ lldb, info )
1564 *
1565  ENDIF
1566 * End of "IF( LSAME( TRANS, 'N' ) )"...
1567 *
1568 *
1569  ENDIF
1570 * End of "IF( LSAME( UPLO, 'L' ) )"...
1571  1000 CONTINUE
1572 *
1573 *
1574 * Free BLACS space used to hold standard-form grid.
1575 *
1576  IF( ictxt_save .NE. ictxt_new ) THEN
1577  CALL blacs_gridexit( ictxt_new )
1578  ENDIF
1579 *
1580  1234 CONTINUE
1581 *
1582 * Restore saved input parameters
1583 *
1584  ictxt = ictxt_save
1585  np = np_save
1586 *
1587 * Output minimum worksize
1588 *
1589  work( 1 ) = work_size_min
1590 *
1591 *
1592  RETURN
1593 *
1594 * End of PCDBTRSV
1595 *
1596  END
globchk
subroutine globchk(ICTXT, N, X, LDX, IWORK, INFO)
Definition: pchkxmat.f:403
max
#define max(A, B)
Definition: pcgemr.c:180
pcdbtrsv
subroutine pcdbtrsv(UPLO, TRANS, N, BWL, BWU, NRHS, A, JA, DESCA, B, IB, DESCB, AF, LAF, WORK, LWORK, INFO)
Definition: pcdbtrsv.f:3
subroutine cmatadd(M, N, ALPHA, A, LDA, BETA, C, LDC)