ScaLAPACK 2.1: Scalable Linear Algebra PACKage
psdbtrsv.f
1  SUBROUTINE psdbtrsv( UPLO, TRANS, N, BWL, BWU, NRHS, A, JA, DESCA,
2  $ B, IB, DESCB, AF, LAF, WORK, LWORK, INFO )
3 *
4 * -- ScaLAPACK routine (version 2.0.2) --
5 * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver
6 * May 1 2012
7 *
8 * .. Scalar Arguments ..
9  CHARACTER TRANS, UPLO
10  INTEGER BWL, BWU, IB, INFO, JA, LAF, LWORK, N, NRHS
11 * ..
12 * .. Array Arguments ..
13  INTEGER DESCA( * ), DESCB( * )
14  REAL A( * ), AF( * ), B( * ), WORK( * )
15 * ..
16 *
17 *
18 * Purpose
19 * =======
20 *
21 * PSDBTRSV solves a banded triangular system of linear equations
22 *
23 * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS)
24 * or
25 * A(1:N, JA:JA+N-1)^T * X = B(IB:IB+N-1, 1:NRHS)
26 *
27 * where A(1:N, JA:JA+N-1) is a banded
28 * triangular matrix factor produced by the
29 * Gaussian elimination code PSDBTRF
30 * and is stored in A(1:N,JA:JA+N-1) and AF.
31 * The matrix stored in A(1:N, JA:JA+N-1) is either
32 * upper or lower triangular according to UPLO,
33 * and the choice of solving A(1:N, JA:JA+N-1) or A(1:N, JA:JA+N-1)^T
34 * is dictated by the user via the parameter TRANS.
35 *
36 * Routine PSDBTRF MUST be called first.
37 *
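* A minimal calling sketch for a solve with the lower triangular
* factor (the array names and the PSDBTRF argument list shown here
* are illustrative assumptions; consult the PSDBTRF header for its
* authoritative interface):
*
*    CALL PSDBTRF( N, BWL, BWU, A, JA, DESCA, AF, LAF,
*   $              WORK, LWORK, INFO )
*    CALL PSDBTRSV( 'L', 'N', N, BWL, BWU, NRHS, A, JA, DESCA,
*   $               B, IB, DESCB, AF, LAF, WORK, LWORK, INFO )
*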
38 * =====================================================================
39 *
40 * Arguments
41 * =========
42 *
43 * UPLO (global input) CHARACTER
44 * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored;
45 * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored.
46 *
47 * TRANS (global input) CHARACTER
48 * = 'N': Solve with A(1:N, JA:JA+N-1);
49 * = 'T' or 'C': Solve with A(1:N, JA:JA+N-1)^T;
50 *
51 * N (global input) INTEGER
52 * The number of rows and columns to be operated on, i.e. the
53 * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
54 *
55 * BWL (global input) INTEGER
56 * Number of subdiagonals. 0 <= BWL <= N-1
57 *
58 * BWU (global input) INTEGER
59 * Number of superdiagonals. 0 <= BWU <= N-1
60 *
61 * NRHS (global input) INTEGER
62 * The number of right hand sides, i.e., the number of columns
63 * of the distributed submatrix B(IB:IB+N-1, 1:NRHS).
64 * NRHS >= 0.
65 *
66 * A (local input/local output) REAL pointer into
67 * local memory to an array with first dimension
68 * LLD_A >=(bwl+bwu+1) (stored in DESCA).
69 * On entry, this array contains the local pieces of the
70 * N-by-N unsymmetric banded triangular factors of
71 * A(1:N, JA:JA+N-1) produced by PSDBTRF.
72 * This local portion is stored in the packed banded format
73 * used in LAPACK. Please see the Notes below and the
74 * ScaLAPACK manual for more detail on the format of
75 * distributed matrices.
76 *
77 * JA (global input) INTEGER
78 * The index in the global array A that points to the start of
79 * the matrix to be operated on (which may be either all of A
80 * or a submatrix of A).
81 *
82 * DESCA (global and local input) INTEGER array of dimension DLEN.
83 * if 1D type (DTYPE_A=501), DLEN >= 7;
84 * if 2D type (DTYPE_A=1), DLEN >= 9 .
85 * The array descriptor for the distributed matrix A.
86 * Contains information of mapping of A to memory. Please
87 * see NOTES below for full description and options.
88 *
89 * B (local input/local output) REAL pointer into
90 * local memory to an array of local lead dimension lld_b>=NB.
91 * On entry, this array contains the
92 * local pieces of the right hand sides
93 * B(IB:IB+N-1, 1:NRHS).
94 * On exit, this contains the local piece of the distributed
95 * solution matrix X.
96 *
97 * IB (global input) INTEGER
98 * The row index in the global array B that points to the first
99 * row of the matrix to be operated on (which may be either
100 * all of B or a submatrix of B).
101 *
102 * DESCB (global and local input) INTEGER array of dimension DLEN.
103 * if 1D type (DTYPE_B=502), DLEN >=7;
104 * if 2D type (DTYPE_B=1), DLEN >= 9.
105 * The array descriptor for the distributed matrix B.
106 * Contains information of mapping of B to memory. Please
107 * see NOTES below for full description and options.
108 *
109 * AF (local output) REAL array, dimension LAF.
110 * Auxiliary Fillin Space.
111 * Fillin is created during the factorization routine
112 * PSDBTRF and this is stored in AF. If a linear system
113 * is to be solved using PSDBTRS after the factorization
114 * routine, AF *must not be altered* after the factorization.
115 *
116 * LAF (local input) INTEGER
117 * Size of user-input Auxiliary Fillin space AF. Must be >=
118 * NB*(bwl+bwu)+6*max(bwl,bwu)*max(bwl,bwu)
119 * If LAF is not large enough, an error code will be returned
120 * and the minimum acceptable size will be returned in AF( 1 )
121 *
122 * WORK (local workspace/local output)
123 * REAL temporary workspace. This space may
124 * be overwritten in between calls to routines. WORK must be
125 * the size given in LWORK.
126 * On exit, WORK( 1 ) contains the minimal LWORK.
127 *
128 * LWORK (local input or global input) INTEGER
129 * Size of user-input workspace WORK.
130 * If LWORK is too small, the minimal acceptable size will be
131 * returned in WORK(1) and an error code is returned. LWORK>=
132 * (max(bwl,bwu)*NRHS)
133 *
134 * INFO (global output) INTEGER
135 * = 0: successful exit
136 * < 0: If the i-th argument is an array and the j-entry had
137 * an illegal value, then INFO = -(i*100+j), if the i-th
138 * argument is a scalar and had an illegal value, then
139 * INFO = -i.
140 *
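* Workspace sizing sketch (values chosen purely for illustration):
* with NB = 64, BWL = 2, BWU = 3 and NRHS = 4, the formulas for LAF
* and LWORK above give
*
*    LAF   >= 64*(2+3) + 6*3*3 = 374
*    LWORK >= 3*4 = 12
*
* Calling the routine with LWORK = -1 returns the minimal LWORK in
* WORK( 1 ) without signalling an error.
*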
141 * =====================================================================
142 *
143 *
144 * Restrictions
145 * ============
146 *
147 * The following are restrictions on the input parameters. Some of these
148 * are temporary and will be removed in future releases, while others
149 * may reflect fundamental technical limitations.
150 *
151 * Non-cyclic restriction: VERY IMPORTANT!
152 * P*NB>= mod(JA-1,NB)+N.
153 * The mapping for matrices must be blocked, reflecting the nature
154 * of the divide and conquer algorithm as a task-parallel algorithm.
155 * This formula in words is: no processor may have more than one
156 * chunk of the matrix.
157 *
158 * Blocksize cannot be too small:
159 * If the matrix spans more than one processor, the following
160 * restriction on NB, the size of each block on each processor,
161 * must hold:
162 * NB >= 2*MAX(BWL,BWU)
163 * The bulk of parallel computation is done on the matrix of size
164 * O(NB) on each processor. If this is too small, divide and conquer
165 * is a poor choice of algorithm.
166 *
167 * Submatrix reference:
168 * JA = IB
169 * Alignment restriction that prevents unnecessary communication.
170 *
171 *
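* Example check (values chosen purely for illustration): with P = 4
* processes, NB = 64, BWL = 2, BWU = 3 and JA = IB = 1, the
* restrictions above require
*
*    N  <= P*NB - MOD( JA-1, NB ) = 256   (one chunk per process)
*    NB >= 2*MAX( BWL, BWU ) = 6          (blocksize large enough)
*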
172 * =====================================================================
173 *
174 *
175 * Notes
176 * =====
177 *
178 * If the factorization routine and the solve routine are to be called
179 * separately (to solve various sets of right hand sides using the same
180 * coefficient matrix), the auxiliary space AF *must not be altered*
181 * between calls to the factorization routine and the solve routine.
182 *
183 * The best algorithm for solving banded and tridiagonal linear systems
184 * depends on a variety of parameters, especially the bandwidth.
185 * Currently, only algorithms designed for the case N/P >> bw are
186 * implemented. These go by many names, including Divide and Conquer,
187 * Partitioning, domain decomposition-type, etc.
188 *
189 * Algorithm description: Divide and Conquer
190 *
191 * The Divide and Conquer algorithm assumes the matrix is narrowly
192 * banded compared with the number of equations. In this situation,
193 * it is best to distribute the input matrix A one-dimensionally,
194 * with columns atomic and rows divided amongst the processes.
195 * The basic algorithm divides the banded matrix up into
196 * P pieces with one stored on each processor,
197 * and then proceeds in 2 phases for the factorization or 3 for the
198 * solution of a linear system.
199 * 1) Local Phase:
200 * The individual pieces are factored independently and in
201 * parallel. These factors are applied to the matrix creating
202 * fillin, which is stored in a non-inspectable way in auxiliary
203 * space AF. Mathematically, this is equivalent to reordering
204 * the matrix A as P A P^T and then factoring the principal
205 * leading submatrix of size equal to the sum of the sizes of
206 * the matrices factored on each processor. The factors of
207 * these submatrices overwrite the corresponding parts of A
208 * in memory.
209 * 2) Reduced System Phase:
210 * A small (max(bwl,bwu)* (P-1)) system is formed representing
211 * interaction of the larger blocks, and is stored (as are its
212 * factors) in the space AF. A parallel Block Cyclic Reduction
213 * algorithm is used. For a linear system, a parallel front solve
214 * followed by an analogous backsolve, both using the structure
215 * of the factored matrix, are performed.
216 * 3) Backsubstitution Phase:
217 * For a linear system, a local backsubstitution is performed on
218 * each processor in parallel.
219 *
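* For example, on P = 4 processes with BWL = BWU = 2, the reduced
* system solved in phase 2 has order MAX( BWL, BWU )*( P-1 ) = 6.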
220 *
221 * Descriptors
222 * ===========
223 *
224 * Descriptors now have *types* and differ from ScaLAPACK 1.0.
225 *
226 * Note: banded codes can use either the old two dimensional
227 * or new one-dimensional descriptors, though the processor grid in
228 * both cases *must be one-dimensional*. We describe both types below.
229 *
230 * Each global data object is described by an associated description
231 * vector. This vector stores the information required to establish
232 * the mapping between an object element and its corresponding process
233 * and memory location.
234 *
235 * Let A be a generic term for any 2D block cyclically distributed array.
236 * Such a global array has an associated description vector DESCA.
237 * In the following comments, the character _ should be read as
238 * "of the global array".
239 *
240 * NOTATION STORED IN EXPLANATION
241 * --------------- -------------- --------------------------------------
242 * DTYPE_A(global) DESCA( DTYPE_ ) The descriptor type. In this case,
243 * DTYPE_A = 1.
244 * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
245 * the BLACS process grid A is distribu-
246 * ted over. The context itself is glo-
247 * bal, but the handle (the integer
248 * value) may vary.
249 * M_A (global) DESCA( M_ ) The number of rows in the global
250 * array A.
251 * N_A (global) DESCA( N_ ) The number of columns in the global
252 * array A.
253 * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute
254 * the rows of the array.
255 * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute
256 * the columns of the array.
257 * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
258 * row of the array A is distributed.
259 * CSRC_A (global) DESCA( CSRC_ ) The process column over which the
260 * first column of the array A is
261 * distributed.
262 * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local
263 * array. LLD_A >= MAX(1,LOCr(M_A)).
264 *
265 * Let K be the number of rows or columns of a distributed matrix,
266 * and assume that its process grid has dimension p x q.
267 * LOCr( K ) denotes the number of elements of K that a process
268 * would receive if K were distributed over the p processes of its
269 * process column.
270 * Similarly, LOCc( K ) denotes the number of elements of K that a
271 * process would receive if K were distributed over the q processes of
272 * its process row.
273 * The values of LOCr() and LOCc() may be determined via a call to the
274 * ScaLAPACK tool function, NUMROC:
275 * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
276 * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
277 * An upper bound for these quantities may be computed by:
278 * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
279 * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
280 *
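* For example, with M = 10 rows, MB_A = 3 and NPROW = 2, process
* row 0 owns NUMROC( 10, 3, 0, 0, 2 ) = 6 rows and process row 1
* owns 4, consistent with the bound ceil( ceil(10/3)/NPROW )*MB_A = 6.
*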
281 *
282 * One-dimensional descriptors:
283 *
284 * One-dimensional descriptors are a new addition to ScaLAPACK since
285 * version 1.0. They simplify and shorten the descriptor for 1D
286 * arrays.
287 *
288 * Since ScaLAPACK supports two-dimensional arrays as the fundamental
289 * object, we allow 1D arrays to be distributed either over the
290 * first dimension of the array (as if the grid were P-by-1) or the
291 * 2nd dimension (as if the grid were 1-by-P). This choice is
292 * indicated by the descriptor type (501 or 502)
293 * as described below.
294 *
295 * IMPORTANT NOTE: the actual BLACS grid represented by the
296 * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P
297 * irrespective of which one-dimensional descriptor type
298 * (501 or 502) is input.
299 * This routine will interpret the grid properly either way.
300 * ScaLAPACK routines *do not support intercontext operations* so that
301 * the grid passed to a single ScaLAPACK routine *must be the same*
302 * for all array descriptors passed to that routine.
303 *
304 * NOTE: In all cases where 1D descriptors are used, 2D descriptors
305 * may also be used, since a one-dimensional array is a special case
306 * of a two-dimensional array with one dimension of size unity.
307 * The two-dimensional array used in this case *must* be of the
308 * proper orientation:
309 * If the appropriate one-dimensional descriptor is DTYPE_A=501
310 * (1 by P type), then the two dimensional descriptor must
311 * have a CTXT value that refers to a 1 by P BLACS grid;
312 * If the appropriate one-dimensional descriptor is DTYPE_A=502
313 * (P by 1 type), then the two dimensional descriptor must
314 * have a CTXT value that refers to a P by 1 BLACS grid.
315 *
316 *
317 * Summary of allowed descriptors, types, and BLACS grids:
318 * DTYPE 501 502 1 1
319 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1
320 * -----------------------------------------------------
321 * A OK NO OK NO
322 * B NO OK NO OK
323 *
324 * Note that a consequence of this chart is that it is not possible
325 * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead
326 * to opposite requirements for the orientation of the BLACS grid,
327 * and as noted before, the *same* BLACS context must be used in
328 * all descriptors in a single ScaLAPACK subroutine call.
329 *
330 * Let A be a generic term for any 1D block cyclically distributed array.
331 * Such a global array has an associated description vector DESCA.
332 * In the following comments, the character _ should be read as
333 * "of the global array".
334 *
335 * NOTATION STORED IN EXPLANATION
336 * --------------- ---------- ------------------------------------------
337 * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
338 * TYPE_A = 501: 1-by-P grid.
339 * TYPE_A = 502: P-by-1 grid.
340 * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
341 * the BLACS process grid A is distribu-
342 * ted over. The context itself is glo-
343 * bal, but the handle (the integer
344 * value) may vary.
345 * N_A (global) DESCA( 3 ) The size of the array dimension being
346 * distributed.
347 * NB_A (global) DESCA( 4 ) The blocking factor used to distribute
348 * the distributed dimension of the array.
349 * SRC_A (global) DESCA( 5 ) The process row or column over which the
350 * first row or column of the array
351 * is distributed.
352 * LLD_A (local) DESCA( 6 ) The leading dimension of the local array
353 * storing the local blocks of the distri-
354 * buted array A. Minimum value of LLD_A
355 * depends on TYPE_A.
356 * TYPE_A = 501: LLD_A >=
357 * max( size of undistributed dimension, 1 ).
358 * TYPE_A = 502: LLD_A >= max( NB_A, 1 ).
359 * Reserved DESCA( 7 ) Reserved for future use.
360 *
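* An illustrative setting of the two one-dimensional descriptors
* used by this routine, assuming the first block resides on process
* 0 and that ICTXT and NB come from the BLACS grid setup and the
* chosen blocking factor (only the minimum legal LLD values are
* shown):
*
*    DESCA( 1 ) = 501            DESCB( 1 ) = 502
*    DESCA( 2 ) = ICTXT          DESCB( 2 ) = ICTXT
*    DESCA( 3 ) = N              DESCB( 3 ) = N
*    DESCA( 4 ) = NB             DESCB( 4 ) = NB
*    DESCA( 5 ) = 0              DESCB( 5 ) = 0
*    DESCA( 6 ) = BWL+BWU+1      DESCB( 6 ) = NB
*    DESCA( 7 ) = 0              DESCB( 7 ) = 0
*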
361 *
362 *
363 * =====================================================================
364 *
365 * Code Developer: Andrew J. Cleary, University of Tennessee.
366 * Current address: Lawrence Livermore National Labs.
367 * Last modified by: Peter Arbenz, Institute of Scientific Computing,
368 * ETH, Zurich.
369 *
370 * =====================================================================
371 *
372 * .. Parameters ..
373  REAL ONE
374  parameter( one = 1.0e+0 )
375  REAL ZERO
376  parameter( zero = 0.0e+0 )
377  INTEGER INT_ONE
378  parameter( int_one = 1 )
379  INTEGER DESCMULT, BIGNUM
380  parameter( descmult = 100, bignum = descmult*descmult )
381  INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
382  $ lld_, mb_, m_, nb_, n_, rsrc_
383  parameter( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,
384  $ ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,
385  $ rsrc_ = 7, csrc_ = 8, lld_ = 9 )
386 * ..
387 * .. Local Scalars ..
388  INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE,
389  $ idum1, idum2, idum3, ja_new, level_dist, llda,
390  $ lldb, max_bw, mbw2, mycol, myrow, my_num_cols,
391  $ nb, np, npcol, nprow, np_save, odd_size, ofst,
392  $ part_offset, part_size, return_code, store_m_b,
393  $ store_n_a, work_size_min, work_u
394 * ..
395 * .. Local Arrays ..
396  INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ),
397  $ param_check( 18, 3 )
398 * ..
399 * .. External Subroutines ..
400  EXTERNAL blacs_gridexit, blacs_gridinfo, desc_convert,
401  $ sgemm, sgerv2d, sgesd2d, slamov, smatadd,
402  $ stbtrs, strmm, globchk, pxerbla, reshape
403 * ..
404 * .. External Functions ..
405  LOGICAL LSAME
406  INTEGER NUMROC
407  EXTERNAL lsame, numroc
408 * ..
409 * .. Intrinsic Functions ..
410  INTRINSIC ichar, max, min, mod
411 * ..
412 * .. Executable Statements ..
413 *
414 * Test the input parameters
415 *
416  info = 0
417 *
418 * Convert descriptor into standard form for easy access to
419 * parameters, check that grid is of right shape.
420 *
421  desca_1xp( 1 ) = 501
422  descb_px1( 1 ) = 502
423 *
424  CALL desc_convert( desca, desca_1xp, return_code )
425 *
426  IF( return_code.NE.0 ) THEN
427  info = -( 9*100+2 )
428  END IF
429 *
430  CALL desc_convert( descb, descb_px1, return_code )
431 *
432  IF( return_code.NE.0 ) THEN
433  info = -( 12*100+2 )
434  END IF
435 *
436 * Consistency checks for DESCA and DESCB.
437 *
438 * Context must be the same
439  IF( desca_1xp( 2 ).NE.descb_px1( 2 ) ) THEN
440  info = -( 12*100+2 )
441  END IF
442 *
443 * These are alignment restrictions that may or may not be removed
444 * in future releases. -Andy Cleary, April 14, 1996.
445 *
446 * Block sizes must be the same
447  IF( desca_1xp( 4 ).NE.descb_px1( 4 ) ) THEN
448  info = -( 12*100+4 )
449  END IF
450 *
451 * Source processor must be the same
452 *
453  IF( desca_1xp( 5 ).NE.descb_px1( 5 ) ) THEN
454  info = -( 12*100+5 )
455  END IF
456 *
457 * Get values out of descriptor for use in code.
458 *
459  ictxt = desca_1xp( 2 )
460  csrc = desca_1xp( 5 )
461  nb = desca_1xp( 4 )
462  llda = desca_1xp( 6 )
463  store_n_a = desca_1xp( 3 )
464  lldb = descb_px1( 6 )
465  store_m_b = descb_px1( 3 )
466 *
467 * Get grid parameters
468 *
469 *
470 * Size of separator blocks is maximum of bandwidths
471 *
472  max_bw = max( bwl, bwu )
473  mbw2 = max_bw*max_bw
474 *
475  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
476  np = nprow*npcol
477 *
478 *
479 *
480  IF( lsame( uplo, 'U' ) ) THEN
481  idum1 = ichar( 'U' )
482  ELSE IF( lsame( uplo, 'L' ) ) THEN
483  idum1 = ichar( 'L' )
484  ELSE
485  info = -1
486  END IF
487 *
488  IF( lsame( trans, 'N' ) ) THEN
489  idum2 = ichar( 'N' )
490  ELSE IF( lsame( trans, 'T' ) ) THEN
491  idum2 = ichar( 'T' )
492  ELSE IF( lsame( trans, 'C' ) ) THEN
493  idum2 = ichar( 'T' )
494  ELSE
495  info = -2
496  END IF
497 *
498  IF( lwork.LT.-1 ) THEN
499  info = -16
500  ELSE IF( lwork.EQ.-1 ) THEN
501  idum3 = -1
502  ELSE
503  idum3 = 1
504  END IF
505 *
506  IF( n.LT.0 ) THEN
507  info = -3
508  END IF
509 *
510  IF( n+ja-1.GT.store_n_a ) THEN
511  info = -( 9*100+6 )
512  END IF
513 *
514  IF( ( bwl.GT.n-1 ) .OR. ( bwl.LT.0 ) ) THEN
515  info = -4
516  END IF
517 *
518  IF( ( bwu.GT.n-1 ) .OR. ( bwu.LT.0 ) ) THEN
519  info = -5
520  END IF
521 *
522  IF( llda.LT.( bwl+bwu+1 ) ) THEN
523  info = -( 9*100+6 )
524  END IF
525 *
526  IF( nb.LE.0 ) THEN
527  info = -( 9*100+4 )
528  END IF
529 *
530  IF( n+ib-1.GT.store_m_b ) THEN
531  info = -( 12*100+3 )
532  END IF
533 *
534  IF( lldb.LT.nb ) THEN
535  info = -( 12*100+6 )
536  END IF
537 *
538  IF( nrhs.LT.0 ) THEN
539  info = -6
540  END IF
541 *
542 * Current alignment restriction
543 *
544  IF( ja.NE.ib ) THEN
545  info = -8
546  END IF
547 *
548 * Argument checking that is specific to Divide & Conquer routine
549 *
550  IF( nprow.NE.1 ) THEN
551  info = -( 9*100+2 )
552  END IF
553 *
554  IF( n.GT.np*nb-mod( ja-1, nb ) ) THEN
555  info = -( 3 )
556  CALL pxerbla( ictxt,
557  $ 'PSDBTRSV, D&C alg.: only 1 block per proc',
558  $ -info )
559  RETURN
560  END IF
561 *
562  IF( ( ja+n-1.GT.nb ) .AND. ( nb.LT.2*max( bwl, bwu ) ) ) THEN
563  info = -( 9*100+4 )
564  CALL pxerbla( ictxt, 'PSDBTRSV, D&C alg.: NB too small',
565  $ -info )
566  RETURN
567  END IF
568 *
569 *
570  work_size_min = max( bwl, bwu )*nrhs
571 *
572  work( 1 ) = work_size_min
573 *
574  IF( lwork.LT.work_size_min ) THEN
575  IF( lwork.NE.-1 ) THEN
576  info = -16
577  CALL pxerbla( ictxt, 'PSDBTRSV: worksize error', -info )
578  END IF
579  RETURN
580  END IF
581 *
582 * Pack params and positions into arrays for global consistency check
583 *
584  param_check( 18, 1 ) = descb( 5 )
585  param_check( 17, 1 ) = descb( 4 )
586  param_check( 16, 1 ) = descb( 3 )
587  param_check( 15, 1 ) = descb( 2 )
588  param_check( 14, 1 ) = descb( 1 )
589  param_check( 13, 1 ) = ib
590  param_check( 12, 1 ) = desca( 5 )
591  param_check( 11, 1 ) = desca( 4 )
592  param_check( 10, 1 ) = desca( 3 )
593  param_check( 9, 1 ) = desca( 1 )
594  param_check( 8, 1 ) = ja
595  param_check( 7, 1 ) = nrhs
596  param_check( 6, 1 ) = bwu
597  param_check( 5, 1 ) = bwl
598  param_check( 4, 1 ) = n
599  param_check( 3, 1 ) = idum3
600  param_check( 2, 1 ) = idum2
601  param_check( 1, 1 ) = idum1
602 *
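* Positions are encoded with the same convention as INFO above:
* e.g. 1205 denotes descriptor argument 12, entry 5, while a plain
* one- or two-digit value denotes a scalar argument position.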
603  param_check( 18, 2 ) = 1205
604  param_check( 17, 2 ) = 1204
605  param_check( 16, 2 ) = 1203
606  param_check( 15, 2 ) = 1202
607  param_check( 14, 2 ) = 1201
608  param_check( 13, 2 ) = 11
609  param_check( 12, 2 ) = 905
610  param_check( 11, 2 ) = 904
611  param_check( 10, 2 ) = 903
612  param_check( 9, 2 ) = 901
613  param_check( 8, 2 ) = 8
614  param_check( 7, 2 ) = 6
615  param_check( 6, 2 ) = 5
616  param_check( 5, 2 ) = 4
617  param_check( 4, 2 ) = 3
618  param_check( 3, 2 ) = 16
619  param_check( 2, 2 ) = 2
620  param_check( 1, 2 ) = 1
621 *
622 * Want to find errors with MIN( ), so if no error, set it to a big
623 * number. If there already is an error, multiply by the
624 * descriptor multiplier.
625 *
626  IF( info.GE.0 ) THEN
627  info = bignum
628  ELSE IF( info.LT.-descmult ) THEN
629  info = -info
630  ELSE
631  info = -info*descmult
632  END IF
633 *
634 * Check consistency across processors
635 *
636  CALL globchk( ictxt, 18, param_check, 18, param_check( 1, 3 ),
637  $ info )
638 *
639 * Prepare output: set info = 0 if no error, and divide by DESCMULT
640 * if error is not in a descriptor entry.
641 *
642  IF( info.EQ.bignum ) THEN
643  info = 0
644  ELSE IF( mod( info, descmult ).EQ.0 ) THEN
645  info = -info / descmult
646  ELSE
647  info = -info
648  END IF
649 *
650  IF( info.LT.0 ) THEN
651  CALL pxerbla( ictxt, 'PSDBTRSV', -info )
652  RETURN
653  END IF
654 *
655 * Quick return if possible
656 *
657  IF( n.EQ.0 )
658  $ RETURN
659 *
660  IF( nrhs.EQ.0 )
661  $ RETURN
662 *
663 *
664 * Adjust addressing into matrix space to properly get into
665 * the beginning part of the relevant data
666 *
667  part_offset = nb*( ( ja-1 ) / ( npcol*nb ) )
668 *
669  IF( ( mycol-csrc ).LT.( ja-part_offset-1 ) / nb ) THEN
670  part_offset = part_offset + nb
671  END IF
672 *
673  IF( mycol.LT.csrc ) THEN
674  part_offset = part_offset - nb
675  END IF
676 *
677 * Form a new BLACS grid (the "standard form" grid) with only procs
678 * holding part of the matrix, of size 1xNP where NP is adjusted,
679 * starting at csrc=0, with JA modified to reflect dropped procs.
680 *
681 * First processor to hold part of the matrix:
682 *
683  first_proc = mod( ( ja-1 ) / nb+csrc, npcol )
684 *
685 * Calculate the new JA while dropping off unused processors.
686 *
687  ja_new = mod( ja-1, nb ) + 1
688 *
689 * Save and compute new value of NP
690 *
691  np_save = np
692  np = ( ja_new+n-2 ) / nb + 1
693 *
694 * Call utility routine that forms "standard-form" grid
695 *
696  CALL reshape( ictxt, int_one, ictxt_new, int_one, first_proc,
697  $ int_one, np )
698 *
699 * Use new context from standard grid as context.
700 *
701  ictxt_save = ictxt
702  ictxt = ictxt_new
703  desca_1xp( 2 ) = ictxt_new
704  descb_px1( 2 ) = ictxt_new
705 *
706 * Get information about new grid.
707 *
708  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
709 *
710 * Drop out processors that do not have part of the matrix.
711 *
712  IF( myrow.LT.0 ) THEN
713  GO TO 200
714  END IF
715 *
716 * ********************************
717 * Values reused throughout routine
718 *
719 * User-input value of partition size
720 *
721  part_size = nb
722 *
723 * Number of columns in each processor
724 *
725  my_num_cols = numroc( n, part_size, mycol, 0, npcol )
726 *
727 * Offset in columns to beginning of main partition in each proc
728 *
729  IF( mycol.EQ.0 ) THEN
730  part_offset = part_offset + mod( ja_new-1, part_size )
731  my_num_cols = my_num_cols - mod( ja_new-1, part_size )
732  END IF
733 *
734 * Offset in elements
735 *
736  ofst = part_offset*llda
737 *
738 * Size of main (or odd) partition in each processor
739 *
740  odd_size = my_num_cols
741  IF( mycol.LT.np-1 ) THEN
742  odd_size = odd_size - max_bw
743  END IF
744 *
745 * Offset to workspace for Upper triangular factor
746 *
747  work_u = bwu*odd_size + 3*mbw2
748 *
749 *
750 *
751 * Begin main code
752 *
753  IF( lsame( uplo, 'L' ) ) THEN
754 *
755  IF( lsame( trans, 'N' ) ) THEN
756 *
757 * Frontsolve
758 *
759 *
760 ******************************************
761 * Local computation phase
762 ******************************************
763 *
764 * Use main partition in each processor to solve locally
765 *
766  CALL stbtrs( uplo, 'N', 'U', odd_size, bwl, nrhs,
767  $ a( ofst+1+bwu ), llda, b( part_offset+1 ),
768  $ lldb, info )
769 *
770 *
771  IF( mycol.LT.np-1 ) THEN
772 * Use factorization of odd-even connection block to modify
773 * locally stored portion of right hand side(s)
774 *
775 *
776 * First copy and multiply it into temporary storage,
777 * then use it on RHS
778 *
779  CALL slamov( 'N', bwl, nrhs,
780  $ b( part_offset+odd_size-bwl+1 ), lldb,
781  $ work( 1 ), max_bw )
782 *
783  CALL strmm( 'L', 'U', 'N', 'N', bwl, nrhs, -one,
784  $ a( ( ofst+( bwl+bwu+1 )+( odd_size-bwl )*
785  $ llda ) ), llda-1, work( 1 ), max_bw )
786 *
787  CALL smatadd( bwl, nrhs, one, work( 1 ), max_bw, one,
788  $ b( part_offset+odd_size+1 ), lldb )
789 *
790  END IF
791 *
792 * Clear garbage out of workspace block
793 *
794  DO 10 idum1 = 1, work_size_min
795  work( idum1 ) = 0.0
796  10 CONTINUE
797 *
798 *
799  IF( mycol.NE.0 ) THEN
800 *
801 * Use the "spike" fillin to calculate contribution to previous
802 * processor's righthand-side.
803 *
804  CALL sgemm( 'N', 'N', bwu, nrhs, odd_size, -one, af( 1 ),
805  $ bwu, b( part_offset+1 ), lldb, zero,
806  $ work( 1+max_bw-bwu ), max_bw )
807 *
808  END IF
809 *
810 *
811 ************************************************
812 * Formation and solution of reduced system
813 ************************************************
814 *
815 *
816 * Send modifications to prior processor's right hand sides
817 *
818  IF( mycol.GT.0 ) THEN
819 *
820  CALL sgesd2d( ictxt, max_bw, nrhs, work( 1 ), max_bw, 0,
821  $ mycol-1 )
822 *
823  END IF
824 *
825 * Receive modifications to processor's right hand sides
826 *
827  IF( mycol.LT.npcol-1 ) THEN
828 *
829  CALL sgerv2d( ictxt, max_bw, nrhs, work( 1 ), max_bw, 0,
830  $ mycol+1 )
831 *
832 * Combine contribution to locally stored right hand sides
833 *
834  CALL smatadd( max_bw, nrhs, one, work( 1 ), max_bw, one,
835  $ b( part_offset+odd_size+1 ), lldb )
836 *
837  END IF
838 *
839 *
840 * The last processor does not participate in the solution of the
841 * reduced system, having sent its contribution already.
842  IF( mycol.EQ.npcol-1 ) THEN
843  GO TO 40
844  END IF
845 *
846 *
847 * *************************************
848 * Modification Loop
849 *
850 * The distance for sending and receiving for each level starts
851 * at 1 for the first level.
852  level_dist = 1
853 *
854 * Do until this proc is needed to modify other procs' equations
855 *
856  20 CONTINUE
857  IF( mod( ( mycol+1 ) / level_dist, 2 ).NE.0 )
858  $ GO TO 30
859 *
860 * Receive and add contribution to righthand sides from left
861 *
862  IF( mycol-level_dist.GE.0 ) THEN
863 *
864  CALL sgerv2d( ictxt, max_bw, nrhs, work( 1 ), max_bw, 0,
865  $ mycol-level_dist )
866 *
867  CALL smatadd( max_bw, nrhs, one, work( 1 ), max_bw, one,
868  $ b( part_offset+odd_size+1 ), lldb )
869 *
870  END IF
871 *
872 * Receive and add contribution to righthand sides from right
873 *
874  IF( mycol+level_dist.LT.npcol-1 ) THEN
875 *
876  CALL sgerv2d( ictxt, max_bw, nrhs, work( 1 ), max_bw, 0,
877  $ mycol+level_dist )
878 *
879  CALL smatadd( max_bw, nrhs, one, work( 1 ), max_bw, one,
880  $ b( part_offset+odd_size+1 ), lldb )
881 *
882  END IF
883 *
884  level_dist = level_dist*2
885 *
886  GO TO 20
887  30 CONTINUE
888 * [End of GOTO Loop]
889 *
890 *
891 *
892 * *********************************
893 * Calculate and use this proc's blocks to modify other procs
894 *
895 * Solve with diagonal block
896 *
897  CALL stbtrs( 'L', 'N', 'U', max_bw, min( bwl, max_bw-1 ),
898  $ nrhs, af( odd_size*bwu+mbw2+1 ), max_bw+1,
899  $ b( part_offset+odd_size+1 ), lldb, info )
900 *
901  IF( info.NE.0 ) THEN
902  GO TO 190
903  END IF
904 *
905 *
906 *
907 * *********
908  IF( mycol / level_dist.LE.( npcol-1 ) / level_dist-2 ) THEN
909 *
910 * Calculate contribution from this block to next diagonal block
911 *
912  CALL sgemm( 'T', 'N', max_bw, nrhs, max_bw, -one,
913  $ af( ( odd_size )*bwu+1 ), max_bw,
914  $ b( part_offset+odd_size+1 ), lldb, zero,
915  $ work( 1 ), max_bw )
916 *
917 * Send contribution to diagonal block's owning processor.
918 *
919  CALL sgesd2d( ictxt, max_bw, nrhs, work( 1 ), max_bw, 0,
920  $ mycol+level_dist )
921 *
922  END IF
923 * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
924 *
925 * ************
926  IF( ( mycol / level_dist.GT.0 ) .AND.
927  $ ( mycol / level_dist.LE.( npcol-1 ) / level_dist-1 ) )
928  $ THEN
929 *
930 *
931 * Use offdiagonal block to calculate modification to diag block
932 * of processor to the left
933 *
934  CALL sgemm( 'N', 'N', max_bw, nrhs, max_bw, -one,
935  $ af( odd_size*bwu+2*mbw2+1 ), max_bw,
936  $ b( part_offset+odd_size+1 ), lldb, zero,
937  $ work( 1 ), max_bw )
938 *
939 * Send contribution to diagonal block's owning processor.
940 *
941  CALL sgesd2d( ictxt, max_bw, nrhs, work( 1 ), max_bw, 0,
942  $ mycol-level_dist )
943 *
944  END IF
945 * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
946 *
947  40 CONTINUE
948 *
949  ELSE
950 *
951 ******************** BACKSOLVE *************************************
952 *
953 ********************************************************************
954 * .. Begin reduced system phase of algorithm ..
955 ********************************************************************
956 *
957 *
958 *
959 * The last processor does not participate in the solution of the
960 * reduced system and just waits to receive its solution.
961  IF( mycol.EQ.npcol-1 ) THEN
962  GO TO 90
963  END IF
964 *
965 * Determine number of steps in tree loop
966 *
967  level_dist = 1
968  50 CONTINUE
969  IF( mod( ( mycol+1 ) / level_dist, 2 ).NE.0 )
970  $ GO TO 60
971 *
972  level_dist = level_dist*2
973 *
974  GO TO 50
975  60 CONTINUE
976 *
977 *
978  IF( ( mycol / level_dist.GT.0 ) .AND.
979  $ ( mycol / level_dist.LE.( npcol-1 ) / level_dist-1 ) )
980  $ THEN
981 *
982 * Receive solution from processor to left
983 *
984  CALL sgerv2d( ictxt, max_bw, nrhs, work( 1 ), max_bw, 0,
985  $ mycol-level_dist )
986 *
987 *
988 * Use offdiagonal block to calculate modification to RHS stored
989 * on this processor
990 *
991  CALL sgemm( 'T', 'N', max_bw, nrhs, max_bw, -one,
992  $ af( odd_size*bwu+2*mbw2+1 ), max_bw,
993  $ work( 1 ), max_bw, one,
994  $ b( part_offset+odd_size+1 ), lldb )
995  END IF
996 * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
997 *
998 *
999  IF( mycol / level_dist.LE.( npcol-1 ) / level_dist-2 ) THEN
1000 *
1001 * Receive solution from processor to right
1002 *
1003  CALL sgerv2d( ictxt, max_bw, nrhs, work( 1 ), max_bw, 0,
1004  $ mycol+level_dist )
1005 *
1006 * Calculate contribution from this block to next diagonal block
1007 *
1008  CALL sgemm( 'N', 'N', max_bw, nrhs, max_bw, -one,
1009  $ af( ( odd_size )*bwu+1 ), max_bw, work( 1 ),
1010  $ max_bw, one, b( part_offset+odd_size+1 ),
1011  $ lldb )
1012 *
1013  END IF
1014 * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
1015 *
1016 *
1017 * Solve with diagonal block
1018 *
1019  CALL stbtrs( 'L', 'T', 'U', max_bw, min( bwl, max_bw-1 ),
1020  $ nrhs, af( odd_size*bwu+mbw2+1 ), max_bw+1,
1021  $ b( part_offset+odd_size+1 ), lldb, info )
1022 *
1023  IF( info.NE.0 ) THEN
1024  GO TO 190
1025  END IF
1026 *
1027 *
1028 *
1029 ***Modification Loop *******
1030 *
1031  70 CONTINUE
1032  IF( level_dist.EQ.1 )
1033  $ GO TO 80
1034 *
1035  level_dist = level_dist / 2
1036 *
1037 * Send solution to the right
1038 *
1039  IF( mycol+level_dist.LT.npcol-1 ) THEN
1040 *
1041  CALL sgesd2d( ictxt, max_bw, nrhs,
1042  $ b( part_offset+odd_size+1 ), lldb, 0,
1043  $ mycol+level_dist )
1044 *
1045  END IF
1046 *
1047 * Send solution to left
1048 *
1049  IF( mycol-level_dist.GE.0 ) THEN
1050 *
1051  CALL sgesd2d( ictxt, max_bw, nrhs,
1052  $ b( part_offset+odd_size+1 ), lldb, 0,
1053  $ mycol-level_dist )
1054 *
1055  END IF
1056 *
1057  GO TO 70
1058  80 CONTINUE
1059 * [End of GOTO Loop]
1060 *
1061  90 CONTINUE
1062 * [Processor npcol - 1 jumped to here to await next stage]
1063 *
1064 *******************************
1065 * Reduced system has been solved, communicate solutions to nearest
1066 * neighbors in preparation for local computation phase.
1067 *
1068 *
1069 * Send elements of solution to next proc
1070 *
1071  IF( mycol.LT.npcol-1 ) THEN
1072 *
1073  CALL sgesd2d( ictxt, max_bw, nrhs,
1074  $ b( part_offset+odd_size+1 ), lldb, 0,
1075  $ mycol+1 )
1076 *
1077  END IF
1078 *
1079 * Receive modifications to processor's right hand sides
1080 *
1081  IF( mycol.GT.0 ) THEN
1082 *
1083  CALL sgerv2d( ictxt, max_bw, nrhs, work( 1 ), max_bw, 0,
1084  $ mycol-1 )
1085 *
1086  END IF
1087 *
1088 *
1089 *
1090 **********************************************
1091 * Local computation phase
1092 **********************************************
1093 *
1094  IF( mycol.NE.0 ) THEN
1095 * Use the "spike" fillin to calculate contribution from previous
1096 * processor's solution.
1097 *
1098  CALL sgemm( 'T', 'N', odd_size, nrhs, bwu, -one, af( 1 ),
1099  $ bwu, work( 1+max_bw-bwu ), max_bw, one,
1100  $ b( part_offset+1 ), lldb )
1101 *
1102  END IF
1103 *
1104 *
1105  IF( mycol.LT.np-1 ) THEN
1106 * Use factorization of odd-even connection block to modify
1107 * locally stored portion of right hand side(s)
1108 *
1109 *
1110 * First copy and multiply it into temporary storage,
1111 * then use it on RHS
1112 *
1113  CALL slamov( 'N', bwl, nrhs, b( part_offset+odd_size+1 ),
1114  $ lldb, work( 1+max_bw-bwl ), max_bw )
1115 *
1116  CALL strmm( 'L', 'U', 'T', 'N', bwl, nrhs, -one,
1117  $ a( ( ofst+( bwl+bwu+1 )+( odd_size-bwl )*
1118  $ llda ) ), llda-1, work( 1+max_bw-bwl ),
1119  $ max_bw )
1120 *
1121  CALL smatadd( bwl, nrhs, one, work( 1+max_bw-bwl ),
1122  $ max_bw, one, b( part_offset+odd_size-bwl+
1123  $ 1 ), lldb )
1124 *
1125  END IF
1126 *
1127 * Use main partition in each processor to solve locally
1128 *
1129  CALL stbtrs( uplo, 'T', 'U', odd_size, bwl, nrhs,
1130  $ a( ofst+1+bwu ), llda, b( part_offset+1 ),
1131  $ lldb, info )
1132 *
1133  END IF
1134 * End of "IF( LSAME( TRANS, 'N' ) )"...
1135 *
1136 *
1137  ELSE
1138 ***************************************************************
1139 * CASE UPLO = 'U' *
1140 ***************************************************************
1141  IF( lsame( trans, 'T' ) ) THEN
1142 *
1143 * Frontsolve
1144 *
1145 *
1146 ******************************************
1147 * Local computation phase
1148 ******************************************
1149 *
1150 * Use main partition in each processor to solve locally
1151 *
1152  CALL stbtrs( uplo, 'T', 'N', odd_size, bwu, nrhs,
1153  $ a( ofst+1 ), llda, b( part_offset+1 ), lldb,
1154  $ info )
1155 *
1156 *
1157  IF( mycol.LT.np-1 ) THEN
1158 * Use factorization of odd-even connection block to modify
1159 * locally stored portion of right hand side(s)
1160 *
1161 *
1162 * First copy and multiply it into temporary storage,
1163 * then use it on RHS
1164 *
1165  CALL slamov( 'N', bwu, nrhs,
1166  $ b( part_offset+odd_size-bwu+1 ), lldb,
1167  $ work( 1 ), max_bw )
1168 *
1169  CALL strmm( 'L', 'L', 'T', 'N', bwu, nrhs, -one,
1170  $ a( ( ofst+1+odd_size*llda ) ), llda-1,
1171  $ work( 1 ), max_bw )
1172 *
1173  CALL smatadd( bwu, nrhs, one, work( 1 ), max_bw, one,
1174  $ b( part_offset+odd_size+1 ), lldb )
1175 *
1176  END IF
1177 *
1178 * Clear garbage out of workspace block
1179 *
1180  DO 100 idum1 = 1, work_size_min
1181  work( idum1 ) = 0.0
1182  100 CONTINUE
1183 *
1184 *
1185  IF( mycol.NE.0 ) THEN
1186 * Use the "spike" fillin to calculate contribution to previous
1187 * processor's righthand-side.
1188 *
1189  CALL sgemm( 'N', 'N', bwl, nrhs, odd_size, -one,
1190  $ af( work_u+1 ), bwl, b( part_offset+1 ),
1191  $ lldb, zero, work( 1+max_bw-bwl ), max_bw )
1192  END IF
1193 *
1194 *
1195 ************************************************
1196 * Formation and solution of reduced system
1197 ************************************************
1198 *
1199 *
1200 * Send modifications to prior processor's right hand sides
1201 *
1202  IF( mycol.GT.0 ) THEN
1203 *
1204  CALL sgesd2d( ictxt, max_bw, nrhs, work( 1 ), max_bw, 0,
1205  $ mycol-1 )
1206 *
1207  END IF
1208 *
1209 * Receive modifications to processor's right hand sides
1210 *
1211  IF( mycol.LT.npcol-1 ) THEN
1212 *
1213  CALL sgerv2d( ictxt, max_bw, nrhs, work( 1 ), max_bw, 0,
1214  $ mycol+1 )
1215 *
1216 * Combine contribution to locally stored right hand sides
1217 *
1218  CALL smatadd( max_bw, nrhs, one, work( 1 ), max_bw, one,
1219  $ b( part_offset+odd_size+1 ), lldb )
1220 *
1221  END IF
1222 *
1223 *
1224 * The last processor does not participate in the solution of the
1225 * reduced system, having sent its contribution already.
1226  IF( mycol.EQ.npcol-1 ) THEN
1227  GO TO 130
1228  END IF
1229 *
1230 *
1231 * *************************************
1232 * Modification Loop
1233 *
1234 * The distance for sending and receiving for each level starts
1235 * at 1 for the first level.
1236  level_dist = 1
1237 *
1238 * Do until this proc is needed to modify other procs' equations
1239 *
1240  110 CONTINUE
1241  IF( mod( ( mycol+1 ) / level_dist, 2 ).NE.0 )
1242  $ GO TO 120
1243 *
1244 * Receive and add contribution to righthand sides from left
1245 *
1246  IF( mycol-level_dist.GE.0 ) THEN
1247 *
1248  CALL sgerv2d( ictxt, max_bw, nrhs, work( 1 ), max_bw, 0,
1249  $ mycol-level_dist )
1250 *
1251  CALL smatadd( max_bw, nrhs, one, work( 1 ), max_bw, one,
1252  $ b( part_offset+odd_size+1 ), lldb )
1253 *
1254  END IF
1255 *
1256 * Receive and add contribution to righthand sides from right
1257 *
1258  IF( mycol+level_dist.LT.npcol-1 ) THEN
1259 *
1260  CALL sgerv2d( ictxt, max_bw, nrhs, work( 1 ), max_bw, 0,
1261  $ mycol+level_dist )
1262 *
1263  CALL smatadd( max_bw, nrhs, one, work( 1 ), max_bw, one,
1264  $ b( part_offset+odd_size+1 ), lldb )
1265 *
1266  END IF
1267 *
1268  level_dist = level_dist*2
1269 *
1270  GO TO 110
1271  120 CONTINUE
1272 * [End of GOTO Loop]
1273 *
1274 *
1275 *
1276 * *********************************
1277 * Calculate and use this proc's blocks to modify other procs
1278 *
1279 * Solve with diagonal block
1280 *
1281  CALL stbtrs( 'U', 'T', 'N', max_bw, min( bwu, max_bw-1 ),
1282  $ nrhs, af( odd_size*bwu+mbw2+1-min( bwu,
1283  $ max_bw-1 ) ), max_bw+1,
1284  $ b( part_offset+odd_size+1 ), lldb, info )
1285 *
1286  IF( info.NE.0 ) THEN
1287  GO TO 190
1288  END IF
1289 *
1290 *
1291 *
1292 * *********
1293  IF( mycol / level_dist.LE.( npcol-1 ) / level_dist-2 ) THEN
1294 *
1295 * Calculate contribution from this block to next diagonal block
1296 *
1297  CALL sgemm( 'T', 'N', max_bw, nrhs, max_bw, -one,
1298  $ af( work_u+( odd_size )*bwl+1 ), max_bw,
1299  $ b( part_offset+odd_size+1 ), lldb, zero,
1300  $ work( 1 ), max_bw )
1301 *
1302 * Send contribution to diagonal block's owning processor.
1303 *
1304  CALL sgesd2d( ictxt, max_bw, nrhs, work( 1 ), max_bw, 0,
1305  $ mycol+level_dist )
1306 *
1307  END IF
1308 * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
1309 *
1310 * ************
1311  IF( ( mycol / level_dist.GT.0 ) .AND.
1312  $ ( mycol / level_dist.LE.( npcol-1 ) / level_dist-1 ) )
1313  $ THEN
1314 *
1315 *
1316 * Use offdiagonal block to calculate modification to diag block
1317 * of processor to the left
1318 *
1319  CALL sgemm( 'N', 'N', max_bw, nrhs, max_bw, -one,
1320  $ af( work_u+odd_size*bwl+2*mbw2+1 ), max_bw,
1321  $ b( part_offset+odd_size+1 ), lldb, zero,
1322  $ work( 1 ), max_bw )
1323 *
1324 * Send contribution to diagonal block's owning processor.
1325 *
1326  CALL sgesd2d( ictxt, max_bw, nrhs, work( 1 ), max_bw, 0,
1327  $ mycol-level_dist )
1328 *
1329  END IF
1330 * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
1331 *
1332  130 CONTINUE
1333 *
1334  ELSE
1335 *
1336 ******************** BACKSOLVE *************************************
1337 *
1338 ********************************************************************
1339 * .. Begin reduced system phase of algorithm ..
1340 ********************************************************************
1341 *
1342 *
1343 *
1344 * The last processor does not participate in the solution of the
1345 * reduced system and just waits to receive its solution.
1346  IF( mycol.EQ.npcol-1 ) THEN
1347  GO TO 180
1348  END IF
1349 *
1350 * Determine number of steps in tree loop
1351 *
1352  level_dist = 1
1353  140 CONTINUE
1354  IF( mod( ( mycol+1 ) / level_dist, 2 ).NE.0 )
1355  $ GO TO 150
1356 *
1357  level_dist = level_dist*2
1358 *
1359  GO TO 140
1360  150 CONTINUE
1361 *
1362 *
1363  IF( ( mycol / level_dist.GT.0 ) .AND.
1364  $ ( mycol / level_dist.LE.( npcol-1 ) / level_dist-1 ) )
1365  $ THEN
1366 *
1367 * Receive solution from processor to left
1368 *
1369  CALL sgerv2d( ictxt, max_bw, nrhs, work( 1 ), max_bw, 0,
1370  $ mycol-level_dist )
1371 *
1372 *
1373 * Use offdiagonal block to calculate modification to RHS stored
1374 * on this processor
1375 *
1376  CALL sgemm( 'T', 'N', max_bw, nrhs, max_bw, -one,
1377  $ af( work_u+odd_size*bwl+2*mbw2+1 ), max_bw,
1378  $ work( 1 ), max_bw, one,
1379  $ b( part_offset+odd_size+1 ), lldb )
1380  END IF
1381 * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
1382 *
1383 *
1384  IF( mycol / level_dist.LE.( npcol-1 ) / level_dist-2 ) THEN
1385 *
1386 * Receive solution from processor to right
1387 *
1388  CALL sgerv2d( ictxt, max_bw, nrhs, work( 1 ), max_bw, 0,
1389  $ mycol+level_dist )
1390 *
1391 * Calculate contribution from this block to next diagonal block
1392 *
1393  CALL sgemm( 'N', 'N', max_bw, nrhs, max_bw, -one,
1394  $ af( work_u+( odd_size )*bwl+1 ), max_bw,
1395  $ work( 1 ), max_bw, one,
1396  $ b( part_offset+odd_size+1 ), lldb )
1397 *
1398  END IF
1399 * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
1400 *
1401 *
1402 * Solve with diagonal block
1403 *
1404  CALL stbtrs( 'U', 'N', 'N', max_bw, min( bwu, max_bw-1 ),
1405  $ nrhs, af( odd_size*bwu+mbw2+1-min( bwu,
1406  $ max_bw-1 ) ), max_bw+1,
1407  $ b( part_offset+odd_size+1 ), lldb, info )
1408 *
1409  IF( info.NE.0 ) THEN
1410  GO TO 190
1411  END IF
1412 *
1413 *
1414 *
1415 ***Modification Loop *******
1416 *
1417  160 CONTINUE
1418  IF( level_dist.EQ.1 )
1419  $ GO TO 170
1420 *
1421  level_dist = level_dist / 2
1422 *
1423 * Send solution to the right
1424 *
1425  IF( mycol+level_dist.LT.npcol-1 ) THEN
1426 *
1427  CALL sgesd2d( ictxt, max_bw, nrhs,
1428  $ b( part_offset+odd_size+1 ), lldb, 0,
1429  $ mycol+level_dist )
1430 *
1431  END IF
1432 *
1433 * Send solution to left
1434 *
1435  IF( mycol-level_dist.GE.0 ) THEN
1436 *
1437  CALL sgesd2d( ictxt, max_bw, nrhs,
1438  $ b( part_offset+odd_size+1 ), lldb, 0,
1439  $ mycol-level_dist )
1440 *
1441  END IF
1442 *
1443  GO TO 160
1444  170 CONTINUE
1445 * [End of GOTO Loop]
1446 *
1447  180 CONTINUE
1448 * [Processor npcol - 1 jumped to here to await next stage]
1449 *
1450 *******************************
1451 * Reduced system has been solved, communicate solutions to nearest
1452 * neighbors in preparation for local computation phase.
1453 *
1454 *
1455 * Send elements of solution to next proc
1456 *
1457  IF( mycol.LT.npcol-1 ) THEN
1458 *
1459  CALL sgesd2d( ictxt, max_bw, nrhs,
1460  $ b( part_offset+odd_size+1 ), lldb, 0,
1461  $ mycol+1 )
1462 *
1463  END IF
1464 *
1465 * Receive modifications to processor's right hand sides
1466 *
1467  IF( mycol.GT.0 ) THEN
1468 *
1469  CALL sgerv2d( ictxt, max_bw, nrhs, work( 1 ), max_bw, 0,
1470  $ mycol-1 )
1471 *
1472  END IF
1473 *
1474 *
1475 *
1476 **********************************************
1477 * Local computation phase
1478 **********************************************
1479 *
1480  IF( mycol.NE.0 ) THEN
1481 * Use the "spike" fillin to calculate contribution from previous
1482 * processor's solution.
1483 *
1484  CALL sgemm( 'T', 'N', odd_size, nrhs, bwl, -one,
1485  $ af( work_u+1 ), bwl, work( 1+max_bw-bwl ),
1486  $ max_bw, one, b( part_offset+1 ), lldb )
1487 *
1488  END IF
1489 *
1490 *
1491  IF( mycol.LT.np-1 ) THEN
1492 * Use factorization of odd-even connection block to modify
1493 * locally stored portion of right hand side(s)
1494 *
1495 *
1496 * First copy and multiply it into temporary storage,
1497 * then use it on RHS
1498 *
1499  CALL slamov( 'N', bwu, nrhs, b( part_offset+odd_size+1 ),
1500  $ lldb, work( 1+max_bw-bwu ), max_bw+bwl )
1501 *
1502  CALL strmm( 'L', 'L', 'N', 'N', bwu, nrhs, -one,
1503  $ a( ( ofst+1+odd_size*llda ) ), llda-1,
1504  $ work( 1+max_bw-bwu ), max_bw+bwl )
1505 *
1506  CALL smatadd( bwu, nrhs, one, work( 1+max_bw-bwu ),
1507  $ max_bw+bwl, one, b( part_offset+odd_size-
1508  $ bwu+1 ), lldb )
1509 *
1510  END IF
1511 *
1512 * Use main partition in each processor to solve locally
1513 *
1514  CALL stbtrs( uplo, 'N', 'N', odd_size, bwu, nrhs,
1515  $ a( ofst+1 ), llda, b( part_offset+1 ), lldb,
1516  $ info )
1517 *
1518  END IF
1519 * End of "IF( LSAME( TRANS, 'N' ) )"...
1520 *
1521 *
1522  END IF
1523 * End of "IF( LSAME( UPLO, 'L' ) )"...
1524  190 CONTINUE
1525 *
1526 *
1527 * Free BLACS space used to hold standard-form grid.
1528 *
1529  IF( ictxt_save.NE.ictxt_new ) THEN
1530  CALL blacs_gridexit( ictxt_new )
1531  END IF
1532 *
1533  200 CONTINUE
1534 *
1535 * Restore saved input parameters
1536 *
1537  ictxt = ictxt_save
1538  np = np_save
1539 *
1540 * Output minimum worksize
1541 *
1542  work( 1 ) = work_size_min
1543 *
1544 *
1545  RETURN
1546 *
1547 * End of PSDBTRSV
1548 *
1549  END