ScaLAPACK 2.1
ScaLAPACK: Scalable Linear Algebra PACKage
psdttrsv.f
1  SUBROUTINE psdttrsv( UPLO, TRANS, N, NRHS, DL, D, DU, JA, DESCA,
2  $ B, IB, DESCB, AF, LAF, WORK, LWORK, INFO )
3 *
4 * -- ScaLAPACK routine (version 1.7) --
5 * University of Tennessee, Knoxville, Oak Ridge National Laboratory,
6 * and University of California, Berkeley.
7 * April 3, 2000
8 *
9 * .. Scalar Arguments ..
10  CHARACTER TRANS, UPLO
11  INTEGER IB, INFO, JA, LAF, LWORK, N, NRHS
12 * ..
13 * .. Array Arguments ..
14  INTEGER DESCA( * ), DESCB( * )
15  REAL AF( * ), B( * ), D( * ), DL( * ), DU( * ),
16  $ work( * )
17 * ..
18 *
19 *
20 * Purpose
21 * =======
22 *
23 * PSDTTRSV solves a tridiagonal triangular system of linear equations
24 *
25 * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS)
26 * or
27 * A(1:N, JA:JA+N-1)^T * X = B(IB:IB+N-1, 1:NRHS)
28 *
29 * where A(1:N, JA:JA+N-1) is a tridiagonal
30 * triangular matrix factor produced by the
31 * Gaussian elimination code PSDTTRF
32 * and is stored in A(1:N,JA:JA+N-1) and AF.
33 * The matrix stored in A(1:N, JA:JA+N-1) is either
34 * upper or lower triangular according to UPLO,
35 * and the choice of solving A(1:N, JA:JA+N-1) or A(1:N, JA:JA+N-1)^T
36 * is dictated by the user by the parameter TRANS.
37 *
38 * Routine PSDTTRF MUST be called first.
39 *
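* As an illustrative sketch only (the exact PSDTTRF interface is
* documented with that routine, the driver PSDTTRS performs the
* complete solve, and WORK( LWORK ) is assumed sized large enough
* for both routines): factor once, then sweep through the lower and
* upper triangular factors with two calls to this routine:
*
*      CALL PSDTTRF( N, DL, D, DU, JA, DESCA, AF, LAF, WORK,
*     $              LWORK, INFO )
*      IF( INFO.EQ.0 ) THEN
*         CALL PSDTTRSV( 'L', 'N', N, NRHS, DL, D, DU, JA, DESCA,
*     $                  B, IB, DESCB, AF, LAF, WORK, LWORK, INFO )
*         CALL PSDTTRSV( 'U', 'N', N, NRHS, DL, D, DU, JA, DESCA,
*     $                  B, IB, DESCB, AF, LAF, WORK, LWORK, INFO )
*      END IF
*
* AF must not be altered between the factorization call and the
* triangular solves.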
40 * =====================================================================
41 *
42 * Arguments
43 * =========
44 *
45 * UPLO (global input) CHARACTER
46 * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored;
47 * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored.
48 *
49 * TRANS (global input) CHARACTER
50 * = 'N': Solve with A(1:N, JA:JA+N-1);
51 * = 'T' or 'C': Solve with A(1:N, JA:JA+N-1)^T;
52 *
53 * N (global input) INTEGER
54 * The number of rows and columns to be operated on, i.e. the
55 * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
56 *
57 * NRHS (global input) INTEGER
58 * The number of right hand sides, i.e., the number of columns
59 * of the distributed submatrix B(IB:IB+N-1, 1:NRHS).
60 * NRHS >= 0.
61 *
62 * DL (local input/local output) REAL pointer to local
63 * part of global vector storing the lower diagonal of the
64 * matrix. Globally, DL(1) is not referenced, and DL must be
65 * aligned with D.
66 * Must be of size >= DESCA( NB_ ).
67 * On exit, this array contains information on the
68 * factors of the matrix.
69 *
70 * D (local input/local output) REAL pointer to local
71 * part of global vector storing the main diagonal of the
72 * matrix.
73 * On exit, this array contains information on the
74 * factors of the matrix.
75 * Must be of size >= DESCA( NB_ ).
76 *
77 * DU (local input/local output) REAL pointer to local
78 * part of global vector storing the upper diagonal of the
79 * matrix. Globally, DU(n) is not referenced, and DU must be
80 * aligned with D.
81 * On exit, this array contains information on the
82 * factors of the matrix.
83 * Must be of size >= DESCA( NB_ ).
84 *
85 * JA (global input) INTEGER
86 * The index in the global array A that points to the start of
87 * the matrix to be operated on (which may be either all of A
88 * or a submatrix of A).
89 *
90 * DESCA (global and local input) INTEGER array of dimension DLEN.
91 * if 1D type (DTYPE_A=501 or 502), DLEN >= 7;
92 * if 2D type (DTYPE_A=1), DLEN >= 9.
93 * The array descriptor for the distributed matrix A.
94 * Contains information of mapping of A to memory. Please
95 * see NOTES below for full description and options.
96 *
97 * B (local input/local output) REAL pointer into
98 * local memory to an array of local lead dimension lld_b>=NB.
99 * On entry, this array contains the
100 * local pieces of the right hand sides
101 * B(IB:IB+N-1, 1:NRHS).
102 * On exit, this contains the local pieces of the
103 * distributed solution matrix X.
104 *
105 * IB (global input) INTEGER
106 * The row index in the global array B that points to the first
107 * row of the matrix to be operated on (which may be either
108 * all of B or a submatrix of B).
109 *
110 * DESCB (global and local input) INTEGER array of dimension DLEN.
111 * if 1D type (DTYPE_B=502), DLEN >=7;
112 * if 2D type (DTYPE_B=1), DLEN >= 9.
113 * The array descriptor for the distributed matrix B.
114 * Contains information of mapping of B to memory. Please
115 * see NOTES below for full description and options.
116 *
117 * AF (local output) REAL array, dimension LAF.
118 * Auxiliary Fillin Space.
119 * Fillin is created during the factorization routine
120 * PSDTTRF and this is stored in AF. If a linear system
121 * is to be solved using PSDTTRS after the factorization
122 * routine, AF *must not be altered* after the factorization.
123 *
124 * LAF (local input) INTEGER
125 * Size of user-input Auxiliary Fillin space AF. Must be >=
126 * 2*(NB+2)
127 * If LAF is not large enough, an error code will be returned
128 * and the minimum acceptable size will be returned in AF( 1 )
129 *
130 * WORK (local workspace/local output)
131 * REAL temporary workspace. This space may
132 * be overwritten in between calls to routines. WORK must be
133 * the size given in LWORK.
134 * On exit, WORK( 1 ) contains the minimal LWORK.
135 *
136 * LWORK (local input or global input) INTEGER
137 * Size of user-input workspace WORK.
138 * If LWORK is too small, the minimal acceptable size will be
139 * returned in WORK(1) and an error code is returned; see the
140 * workspace query example below. LWORK >= 10*NPCOL+4*NRHS
141 *
142 * INFO (local output) INTEGER
143 * = 0: successful exit
144 * < 0: If the i-th argument is an array and the j-th entry had
145 * an illegal value, then INFO = -(i*100+j); if the i-th
146 * argument is a scalar and had an illegal value, then
147 * INFO = -i.
148 *
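* Workspace query: a hedged sketch (WKOPT is an illustrative local
* name, not part of this interface). With LWORK = -1 the routine
* returns a minimal workspace size in WORK( 1 ) and exits without
* reporting an error, as handled in the LWORK checks in the code
* below:
*
*      REAL               WKOPT( 1 )
*      CALL PSDTTRSV( 'L', 'N', N, NRHS, DL, D, DU, JA, DESCA,
*     $               B, IB, DESCB, AF, LAF, WKOPT, -1, INFO )
*      LWORK = INT( WKOPT( 1 ) )
*
* WORK can then be allocated with at least LWORK elements before the
* solving call is repeated.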
149 * =====================================================================
150 *
151 *
152 * Restrictions
153 * ============
154 *
155 * The following are restrictions on the input parameters. Some of these
156 * are temporary and will be removed in future releases, while others
157 * may reflect fundamental technical limitations.
158 *
159 * Non-cyclic restriction: VERY IMPORTANT!
160 * P*NB>= mod(JA-1,NB)+N.
161 * The mapping for matrices must be blocked, reflecting the nature
162 * of the divide and conquer algorithm as a task-parallel algorithm.
163 * This formula in words is: no processor may have more than one
164 * chunk of the matrix.
165 *
166 * Blocksize cannot be too small:
167 * If the matrix spans more than one processor, the following
168 * restriction on NB, the size of each block on each processor,
169 * must hold:
170 * NB >= 2
171 * The bulk of parallel computation is done on the matrix of size
172 * O(NB) on each processor. If this is too small, divide and conquer
173 * is a poor choice of algorithm.
174 *
175 * Submatrix reference:
176 * JA = IB
177 * Alignment restriction that prevents unnecessary communication.
178 *
179 *
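* As an illustrative sketch (assuming a 1-by-NPCOL process grid, so
* that NPCOL plays the role of P, and the block size NB taken from
* DESCA( 4 ) of a 1D descriptor), the non-cyclic restriction can be
* verified on the calling side:
*
*      CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
*      NB = DESCA( 4 )
*      IF( NPCOL*NB.LT.MOD( JA-1, NB )+N ) THEN
*         WRITE( *, * ) 'Restriction violated: some process would',
*     $      ' need more than one block of the matrix'
*      END IF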
180 * =====================================================================
181 *
182 *
183 * Notes
184 * =====
185 *
186 * If the factorization routine and the solve routine are to be called
187 * separately (to solve various sets of righthand sides using the same
188 * coefficient matrix), the auxiliary space AF *must not be altered*
189 * between calls to the factorization routine and the solve routine.
190 *
191 * The best algorithm for solving banded and tridiagonal linear systems
192 * depends on a variety of parameters, especially the bandwidth.
193 * Currently, only algorithms designed for the case N/P >> bw are
194 * implemented. These go by many names, including Divide and Conquer,
195 * Partitioning, domain decomposition-type, etc.
196 * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C
197 * algorithms are the appropriate choice.
198 *
199 * Algorithm description: Divide and Conquer
200 *
201 * The Divide and Conquer algorithm assumes the matrix is narrowly
202 * banded compared with the number of equations. In this situation,
203 * it is best to distribute the input matrix A one-dimensionally,
204 * with columns atomic and rows divided amongst the processes.
205 * The basic algorithm divides the tridiagonal matrix up into
206 * P pieces with one stored on each processor,
207 * and then proceeds in 2 phases for the factorization or 3 for the
208 * solution of a linear system.
209 * 1) Local Phase:
210 * The individual pieces are factored independently and in
211 * parallel. These factors are applied to the matrix creating
212 * fillin, which is stored in a non-inspectable way in auxiliary
213 * space AF. Mathematically, this is equivalent to reordering
214 * the matrix A as P A P^T and then factoring the principal
215 * leading submatrix of size equal to the sum of the sizes of
216 * the matrices factored on each processor. The factors of
217 * these submatrices overwrite the corresponding parts of A
218 * in memory.
219 * 2) Reduced System Phase:
220 * A small system of size (P-1) is formed representing
221 * interaction of the larger blocks, and is stored (as are its
222 * factors) in the space AF. A parallel Block Cyclic Reduction
223 * algorithm is used. For a linear system, a parallel front solve
224 * followed by an analogous backsolve, both using the structure
225 * of the factored matrix, are performed.
226 * 3) Backsubstitution Phase:
227 * For a linear system, a local backsubstitution is performed on
228 * each processor in parallel.
229 *
230 *
231 * Descriptors
232 * ===========
233 *
234 * Descriptors now have *types* and differ from ScaLAPACK 1.0.
235 *
236 * Note: tridiagonal codes can use either the old two dimensional
237 * or new one-dimensional descriptors, though the processor grid in
238 * both cases *must be one-dimensional*. We describe both types below.
239 *
240 * Each global data object is described by an associated description
241 * vector. This vector stores the information required to establish
242 * the mapping between an object element and its corresponding process
243 * and memory location.
244 *
245 * Let A be a generic term for any 2D block cyclically distributed array.
246 * Such a global array has an associated description vector DESCA.
247 * In the following comments, the character _ should be read as
248 * "of the global array".
249 *
250 * NOTATION STORED IN EXPLANATION
251 * --------------- -------------- --------------------------------------
252 * DTYPE_A(global) DESCA( DTYPE_ ) The descriptor type. In this case,
253 * DTYPE_A = 1.
254 * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
255 * the BLACS process grid A is distribu-
256 * ted over. The context itself is glo-
257 * bal, but the handle (the integer
258 * value) may vary.
259 * M_A (global) DESCA( M_ ) The number of rows in the global
260 * array A.
261 * N_A (global) DESCA( N_ ) The number of columns in the global
262 * array A.
263 * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute
264 * the rows of the array.
265 * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute
266 * the columns of the array.
267 * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
268 * row of the array A is distributed.
269 * CSRC_A (global) DESCA( CSRC_ ) The process column over which the
270 * first column of the array A is
271 * distributed.
272 * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local
273 * array. LLD_A >= MAX(1,LOCr(M_A)).
274 *
275 * Let K be the number of rows or columns of a distributed matrix,
276 * and assume that its process grid has dimension p x q.
277 * LOCr( K ) denotes the number of elements of K that a process
278 * would receive if K were distributed over the p processes of its
279 * process column.
280 * Similarly, LOCc( K ) denotes the number of elements of K that a
281 * process would receive if K were distributed over the q processes of
282 * its process row.
283 * The values of LOCr() and LOCc() may be determined via a call to the
284 * ScaLAPACK tool function, NUMROC:
285 * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
286 * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
287 * An upper bound for these quantities may be computed by:
288 * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
289 * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
290 *
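* For illustration, a hedged sketch of sizing the local piece of a
* 2D block cyclically distributed M_A-by-N_A array with NUMROC
* (MYROW, MYCOL, NPROW, NPCOL as returned by BLACS_GRIDINFO):
*
*      INTEGER            NUMROC, LOCR, LOCC
*      EXTERNAL           NUMROC
*      CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
*      LOCR = NUMROC( M_A, MB_A, MYROW, RSRC_A, NPROW )
*      LOCC = NUMROC( N_A, NB_A, MYCOL, CSRC_A, NPCOL )
*
* The local piece then needs at least LLD_A*LOCC elements, with
* LLD_A >= MAX( 1, LOCR ).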
291 *
292 * One-dimensional descriptors:
293 *
294 * One-dimensional descriptors are a new addition to ScaLAPACK since
295 * version 1.0. They simplify and shorten the descriptor for 1D
296 * arrays.
297 *
298 * Since ScaLAPACK supports two-dimensional arrays as the fundamental
299 * object, we allow 1D arrays to be distributed either over the
300 * first dimension of the array (as if the grid were P-by-1) or the
301 * 2nd dimension (as if the grid were 1-by-P). This choice is
302 * indicated by the descriptor type (501 or 502)
303 * as described below.
304 * However, for tridiagonal matrices, since the objects being
305 * distributed are the individual vectors storing the diagonals, we
306 * have adopted the convention that both the P-by-1 descriptor and
307 * the 1-by-P descriptor are allowed and are equivalent for
308 * tridiagonal matrices. Thus, for tridiagonal matrices,
309 * DTYPE_A = 501 or 502 can be used interchangeably
310 * without any other change.
311 * We require that the distributed vectors storing the diagonals of a
312 * tridiagonal matrix be aligned with each other. Because of this, a
313 * single descriptor, DESCA, serves to describe the distribution of
314 * all diagonals simultaneously.
315 *
316 * IMPORTANT NOTE: the actual BLACS grid represented by the
317 * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P
318 * irrespective of which one-dimensional descriptor type
319 * (501 or 502) is input.
320 * This routine will interpret the grid properly either way.
321 * ScaLAPACK routines *do not support intercontext operations* so that
322 * the grid passed to a single ScaLAPACK routine *must be the same*
323 * for all array descriptors passed to that routine.
324 *
325 * NOTE: In all cases where 1D descriptors are used, 2D descriptors
326 * may also be used, since a one-dimensional array is a special case
327 * of a two-dimensional array with one dimension of size unity.
328 * The two-dimensional array used in this case *must* be of the
329 * proper orientation:
330 * If the appropriate one-dimensional descriptor is DTYPE_A=501
331 * (1 by P type), then the two dimensional descriptor must
332 * have a CTXT value that refers to a 1 by P BLACS grid;
333 * If the appropriate one-dimensional descriptor is DTYPE_A=502
334 * (P by 1 type), then the two dimensional descriptor must
335 * have a CTXT value that refers to a P by 1 BLACS grid.
336 *
337 *
338 * Summary of allowed descriptors, types, and BLACS grids:
339 * DTYPE 501 502 1 1
340 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1
341 * -----------------------------------------------------
342 * A OK OK OK NO
343 * B NO OK NO OK
344 *
345 * Note that a consequence of this chart is that it is not possible
346 * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead
347 * to opposite requirements for the orientation of the BLACS grid,
348 * and as noted before, the *same* BLACS context must be used in
349 * all descriptors in a single ScaLAPACK subroutine call.
350 *
351 * Let A be a generic term for any 1D block cyclically distributed array.
352 * Such a global array has an associated description vector DESCA.
353 * In the following comments, the character _ should be read as
354 * "of the global array".
355 *
356 * NOTATION STORED IN EXPLANATION
357 * --------------- ---------- ------------------------------------------
358 * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
359 * TYPE_A = 501: 1-by-P grid.
360 * TYPE_A = 502: P-by-1 grid.
361 * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
362 * the BLACS process grid A is distribu-
363 * ted over. The context itself is glo-
364 * bal, but the handle (the integer
365 * value) may vary.
366 * N_A (global) DESCA( 3 ) The size of the array dimension being
367 * distributed.
368 * NB_A (global) DESCA( 4 ) The blocking factor used to distribute
369 * the distributed dimension of the array.
370 * SRC_A (global) DESCA( 5 ) The process row or column over which the
371 * first row or column of the array
372 * is distributed.
373 * Ignored DESCA( 6 ) Ignored for tridiagonal matrices.
374 * Reserved DESCA( 7 ) Reserved for future use.
375 *
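* As an illustrative sketch (assuming JA = IB = 1, a BLACS context
* ICTXT for a 1-by-NPCOL grid, block size NB, and LLD_B >= NB the
* local leading dimension of B; entry 6 of DESCB is read as that
* leading dimension in the code below), the 1D descriptors accepted
* by this routine could be filled as:
*
*      INTEGER            DESCA( 7 ), DESCB( 7 )
*      DESCA( 1 ) = 501
*      DESCA( 2 ) = ICTXT
*      DESCA( 3 ) = N
*      DESCA( 4 ) = NB
*      DESCA( 5 ) = 0
*      DESCA( 6 ) = 0
*      DESCA( 7 ) = 0
*      DESCB( 1 ) = 502
*      DESCB( 2 ) = ICTXT
*      DESCB( 3 ) = N
*      DESCB( 4 ) = NB
*      DESCB( 5 ) = 0
*      DESCB( 6 ) = LLD_B
*      DESCB( 7 ) = 0
*
* Per the summary chart above, DESCA may be of type 501 or 502,
* while DESCB must describe the distributed row dimension (type 502).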
376 *
377 *
378 * =====================================================================
379 *
380 * Code Developer: Andrew J. Cleary, University of Tennessee.
381 * Current address: Lawrence Livermore National Labs.
382 *
383 * =====================================================================
384 *
385 * .. Parameters ..
386  REAL ONE
387  parameter( one = 1.0e+0 )
388  REAL ZERO
389  parameter( zero = 0.0e+0 )
390  INTEGER INT_ONE
391  parameter( int_one = 1 )
392  INTEGER DESCMULT, BIGNUM
393  parameter( descmult = 100, bignum = descmult*descmult )
394  INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
395  $ lld_, mb_, m_, nb_, n_, rsrc_
396  parameter( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,
397  $ ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,
398  $ rsrc_ = 7, csrc_ = 8, lld_ = 9 )
399 * ..
400 * .. Local Scalars ..
401  INTEGER CSRC, FIRST_PROC, ICTXT, ICTXT_NEW, ICTXT_SAVE,
402  $ idum1, idum2, idum3, ja_new, level_dist, llda,
403  $ lldb, mycol, myrow, my_num_cols, nb, np, npcol,
404  $ nprow, np_save, odd_size, part_offset,
405  $ part_size, return_code, store_m_b, store_n_a,
406  $ temp, work_size_min, work_u
407 * ..
408 * .. Local Arrays ..
409  INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ),
410  $ param_check( 16, 3 )
411 * ..
412 * .. External Subroutines ..
413  EXTERNAL blacs_gridexit, blacs_gridinfo, desc_convert,
414  $ globchk, pxerbla, reshape, saxpy, sdttrsv,
415  $ sgemm, sgerv2d, sgesd2d, smatadd, stbtrs
416 * ..
417 * .. External Functions ..
418  LOGICAL LSAME
419  INTEGER NUMROC
420  EXTERNAL lsame, numroc
421 * ..
422 * .. Intrinsic Functions ..
423  INTRINSIC ichar, min, mod
424 * ..
425 * .. Executable Statements ..
426 *
427 * Test the input parameters
428 *
429  info = 0
430 *
431 * Convert descriptor into standard form for easy access to
432 * parameters, check that grid is of right shape.
433 *
434  desca_1xp( 1 ) = 501
435  descb_px1( 1 ) = 502
436 *
437  temp = desca( dtype_ )
438  IF( temp.EQ.502 ) THEN
439 * Temporarily set the descriptor type to 1xP type
440  desca( dtype_ ) = 501
441  END IF
442 *
443  CALL desc_convert( desca, desca_1xp, return_code )
444 *
445  desca( dtype_ ) = temp
446 *
447  IF( return_code.NE.0 ) THEN
448  info = -( 9*100+2 )
449  END IF
450 *
451  CALL desc_convert( descb, descb_px1, return_code )
452 *
453  IF( return_code.NE.0 ) THEN
454  info = -( 12*100+2 )
455  END IF
456 *
457 * Consistency checks for DESCA and DESCB.
458 *
459 * Context must be the same
460  IF( desca_1xp( 2 ).NE.descb_px1( 2 ) ) THEN
461  info = -( 12*100+2 )
462  END IF
463 *
464 * These are alignment restrictions that may or may not be removed
465 * in future releases. -Andy Cleary, April 14, 1996.
466 *
467 * Block sizes must be the same
468  IF( desca_1xp( 4 ).NE.descb_px1( 4 ) ) THEN
469  info = -( 12*100+4 )
470  END IF
471 *
472 * Source processor must be the same
473 *
474  IF( desca_1xp( 5 ).NE.descb_px1( 5 ) ) THEN
475  info = -( 12*100+5 )
476  END IF
477 *
478 * Get values out of descriptor for use in code.
479 *
480  ictxt = desca_1xp( 2 )
481  csrc = desca_1xp( 5 )
482  nb = desca_1xp( 4 )
483  llda = desca_1xp( 6 )
484  store_n_a = desca_1xp( 3 )
485  lldb = descb_px1( 6 )
486  store_m_b = descb_px1( 3 )
487 *
488 * Get grid parameters
489 *
490 *
491  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
492  np = nprow*npcol
493 *
494 *
495 *
496  IF( lsame( uplo, 'U' ) ) THEN
497  idum1 = ichar( 'U' )
498  ELSE IF( lsame( uplo, 'L' ) ) THEN
499  idum1 = ichar( 'L' )
500  ELSE
501  info = -1
502  END IF
503 *
504  IF( lsame( trans, 'N' ) ) THEN
505  idum2 = ichar( 'N' )
506  ELSE IF( lsame( trans, 'T' ) ) THEN
507  idum2 = ichar( 'T' )
508  ELSE IF( lsame( trans, 'C' ) ) THEN
509  idum2 = ichar( 'T' )
510  ELSE
511  info = -2
512  END IF
513 *
514  IF( lwork.LT.-1 ) THEN
515  info = -16
516  ELSE IF( lwork.EQ.-1 ) THEN
517  idum3 = -1
518  ELSE
519  idum3 = 1
520  END IF
521 *
522  IF( n.LT.0 ) THEN
523  info = -3
524  END IF
525 *
526  IF( n+ja-1.GT.store_n_a ) THEN
527  info = -( 9*100+6 )
528  END IF
529 *
530  IF( n+ib-1.GT.store_m_b ) THEN
531  info = -( 12*100+3 )
532  END IF
533 *
534  IF( lldb.LT.nb ) THEN
535  info = -( 12*100+6 )
536  END IF
537 *
538  IF( nrhs.LT.0 ) THEN
539  info = -4
540  END IF
541 *
542 * Current alignment restriction
543 *
544  IF( ja.NE.ib ) THEN
545  info = -8
546  END IF
547 *
548 * Argument checking that is specific to Divide & Conquer routine
549 *
550  IF( nprow.NE.1 ) THEN
551  info = -( 9*100+2 )
552  END IF
553 *
554  IF( n.GT.np*nb-mod( ja-1, nb ) ) THEN
555  info = -( 3 )
556  CALL pxerbla( ictxt,
557  $ 'PSDTTRSV, D&C alg.: only 1 block per proc',
558  $ -info )
559  RETURN
560  END IF
561 *
562  IF( ( ja+n-1.GT.nb ) .AND. ( nb.LT.2*int_one ) ) THEN
563  info = -( 9*100+4 )
564  CALL pxerbla( ictxt, 'PSDTTRSV, D&C alg.: NB too small',
565  $ -info )
566  RETURN
567  END IF
568 *
569 *
570  work_size_min = int_one*nrhs
571 *
572  work( 1 ) = work_size_min
573 *
574  IF( lwork.LT.work_size_min ) THEN
575  IF( lwork.NE.-1 ) THEN
576  info = -16
577  CALL pxerbla( ictxt, 'PSDTTRSV: worksize error', -info )
578  END IF
579  RETURN
580  END IF
581 *
582 * Pack params and positions into arrays for global consistency check
583 *
584  param_check( 16, 1 ) = descb( 5 )
585  param_check( 15, 1 ) = descb( 4 )
586  param_check( 14, 1 ) = descb( 3 )
587  param_check( 13, 1 ) = descb( 2 )
588  param_check( 12, 1 ) = descb( 1 )
589  param_check( 11, 1 ) = ib
590  param_check( 10, 1 ) = desca( 5 )
591  param_check( 9, 1 ) = desca( 4 )
592  param_check( 8, 1 ) = desca( 3 )
593  param_check( 7, 1 ) = desca( 1 )
594  param_check( 6, 1 ) = ja
595  param_check( 5, 1 ) = nrhs
596  param_check( 4, 1 ) = n
597  param_check( 3, 1 ) = idum3
598  param_check( 2, 1 ) = idum2
599  param_check( 1, 1 ) = idum1
600 *
601  param_check( 16, 2 ) = 1205
602  param_check( 15, 2 ) = 1204
603  param_check( 14, 2 ) = 1203
604  param_check( 13, 2 ) = 1202
605  param_check( 12, 2 ) = 1201
606  param_check( 11, 2 ) = 11
607  param_check( 10, 2 ) = 905
608  param_check( 9, 2 ) = 904
609  param_check( 8, 2 ) = 903
610  param_check( 7, 2 ) = 901
611  param_check( 6, 2 ) = 8
612  param_check( 5, 2 ) = 4
613  param_check( 4, 2 ) = 3
614  param_check( 3, 2 ) = 16
615  param_check( 2, 2 ) = 2
616  param_check( 1, 2 ) = 1
617 *
618 * Want to find errors with MIN( ), so if no error, set it to a big
619 * number. If there already is an error, multiply by the
620 * descriptor multiplier.
621 *
622  IF( info.GE.0 ) THEN
623  info = bignum
624  ELSE IF( info.LT.-descmult ) THEN
625  info = -info
626  ELSE
627  info = -info*descmult
628  END IF
629 *
630 * Check consistency across processors
631 *
632  CALL globchk( ictxt, 16, param_check, 16, param_check( 1, 3 ),
633  $ info )
634 *
635 * Prepare output: set info = 0 if no error, and divide by DESCMULT
636 * if error is not in a descriptor entry.
637 *
638  IF( info.EQ.bignum ) THEN
639  info = 0
640  ELSE IF( mod( info, descmult ).EQ.0 ) THEN
641  info = -info / descmult
642  ELSE
643  info = -info
644  END IF
645 *
646  IF( info.LT.0 ) THEN
647  CALL pxerbla( ictxt, 'PSDTTRSV', -info )
648  RETURN
649  END IF
650 *
651 * Quick return if possible
652 *
653  IF( n.EQ.0 )
654  $ RETURN
655 *
656  IF( nrhs.EQ.0 )
657  $ RETURN
658 *
659 *
660 * Adjust addressing into matrix space to properly get into
661 * the beginning part of the relevant data
662 *
663  part_offset = nb*( ( ja-1 ) / ( npcol*nb ) )
664 *
665  IF( ( mycol-csrc ).LT.( ja-part_offset-1 ) / nb ) THEN
666  part_offset = part_offset + nb
667  END IF
668 *
669  IF( mycol.LT.csrc ) THEN
670  part_offset = part_offset - nb
671  END IF
672 *
673 * Form a new BLACS grid (the "standard form" grid) with only procs
674 * holding part of the matrix, of size 1xNP where NP is adjusted,
675 * starting at csrc=0, with JA modified to reflect dropped procs.
676 *
677 * First processor to hold part of the matrix:
678 *
679  first_proc = mod( ( ja-1 ) / nb+csrc, npcol )
680 *
681 * Calculate the new JA while dropping off unused processors.
682 *
683  ja_new = mod( ja-1, nb ) + 1
684 *
685 * Save and compute new value of NP
686 *
687  np_save = np
688  np = ( ja_new+n-2 ) / nb + 1
689 *
690 * Call utility routine that forms "standard-form" grid
691 *
692  CALL reshape( ictxt, int_one, ictxt_new, int_one, first_proc,
693  $ int_one, np )
694 *
695 * Use new context from standard grid as context.
696 *
697  ictxt_save = ictxt
698  ictxt = ictxt_new
699  desca_1xp( 2 ) = ictxt_new
700  descb_px1( 2 ) = ictxt_new
701 *
702 * Get information about new grid.
703 *
704  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
705 *
706 * Drop out processors that do not have part of the matrix.
707 *
708  IF( myrow.LT.0 ) THEN
709  GO TO 180
710  END IF
711 *
712 * ********************************
713 * Values reused throughout routine
714 *
715 * User-input value of partition size
716 *
717  part_size = nb
718 *
719 * Number of columns in each processor
720 *
721  my_num_cols = numroc( n, part_size, mycol, 0, npcol )
722 *
723 * Offset in columns to beginning of main partition in each proc
724 *
725  IF( mycol.EQ.0 ) THEN
726  part_offset = part_offset + mod( ja_new-1, part_size )
727  my_num_cols = my_num_cols - mod( ja_new-1, part_size )
728  END IF
729 *
730 * Size of main (or odd) partition in each processor
731 *
732  odd_size = my_num_cols
733  IF( mycol.LT.np-1 ) THEN
734  odd_size = odd_size - int_one
735  END IF
736 *
737 * Offset to workspace for Upper triangular factor
738 *
739  work_u = int_one*odd_size + 3
740 *
741 *
742 *
743 * Begin main code
744 *
745  IF( lsame( uplo, 'L' ) ) THEN
746 *
747  IF( lsame( trans, 'N' ) ) THEN
748 *
749 * Frontsolve
750 *
751 *
752 ******************************************
753 * Local computation phase
754 ******************************************
755 *
756 * Use main partition in each processor to solve locally
757 *
758  CALL sdttrsv( uplo, 'N', odd_size, nrhs,
759  $ dl( part_offset+2 ), d( part_offset+1 ),
760  $ du( part_offset+1 ), b( part_offset+1 ), lldb,
761  $ info )
762 *
763 *
764  IF( mycol.LT.np-1 ) THEN
765 * Use factorization of odd-even connection block to modify
766 * locally stored portion of right hand side(s)
767 *
768  CALL saxpy( nrhs, -dl( part_offset+odd_size+1 ),
769  $ b( part_offset+odd_size ), lldb,
770  $ b( part_offset+odd_size+1 ), lldb )
771 *
772  END IF
773 *
774 *
775  IF( mycol.NE.0 ) THEN
776 * Use the "spike" fillin to calculate contribution to previous
777 * processor's righthand-side.
778 *
779  CALL sgemm( 'T', 'N', int_one, nrhs, odd_size, -one,
780  $ af( 1 ), odd_size, b( part_offset+1 ), lldb,
781  $ zero, work( 1+int_one-int_one ), int_one )
782  END IF
783 *
784 *
785 ************************************************
786 * Formation and solution of reduced system
787 ************************************************
788 *
789 *
790 * Send modifications to prior processor's right hand sides
791 *
792  IF( mycol.GT.0 ) THEN
793 *
794  CALL sgesd2d( ictxt, int_one, nrhs, work( 1 ), int_one,
795  $ 0, mycol-1 )
796 *
797  END IF
798 *
799 * Receive modifications to processor's right hand sides
800 *
801  IF( mycol.LT.npcol-1 ) THEN
802 *
803  CALL sgerv2d( ictxt, int_one, nrhs, work( 1 ), int_one,
804  $ 0, mycol+1 )
805 *
806 * Combine contribution to locally stored right hand sides
807 *
808  CALL smatadd( int_one, nrhs, one, work( 1 ), int_one,
809  $ one, b( part_offset+odd_size+1 ), lldb )
810 *
811  END IF
812 *
813 *
814 * The last processor does not participate in the solution of the
815 * reduced system, having sent its contribution already.
816  IF( mycol.EQ.npcol-1 ) THEN
817  GO TO 30
818  END IF
819 *
820 *
821 * *************************************
822 * Modification Loop
823 *
824 * The distance for sending and receiving for each level starts
825 * at 1 for the first level.
826  level_dist = 1
827 *
828 * Do until this proc is needed to modify other procs' equations
829 *
830  10 CONTINUE
831  IF( mod( ( mycol+1 ) / level_dist, 2 ).NE.0 )
832  $ GO TO 20
833 *
834 * Receive and add contribution to righthand sides from left
835 *
836  IF( mycol-level_dist.GE.0 ) THEN
837 *
838  CALL sgerv2d( ictxt, int_one, nrhs, work( 1 ), int_one,
839  $ 0, mycol-level_dist )
840 *
841  CALL smatadd( int_one, nrhs, one, work( 1 ), int_one,
842  $ one, b( part_offset+odd_size+1 ), lldb )
843 *
844  END IF
845 *
846 * Receive and add contribution to righthand sides from right
847 *
848  IF( mycol+level_dist.LT.npcol-1 ) THEN
849 *
850  CALL sgerv2d( ictxt, int_one, nrhs, work( 1 ), int_one,
851  $ 0, mycol+level_dist )
852 *
853  CALL smatadd( int_one, nrhs, one, work( 1 ), int_one,
854  $ one, b( part_offset+odd_size+1 ), lldb )
855 *
856  END IF
857 *
858  level_dist = level_dist*2
859 *
860  GO TO 10
861  20 CONTINUE
862 * [End of GOTO Loop]
863 *
864 *
865 *
866 * *********************************
867 * Calculate and use this proc's blocks to modify other procs
868 *
869 * Solve with diagonal block
870 *
871  CALL stbtrs( 'L', 'N', 'U', int_one,
872  $ min( int_one, int_one-1 ), nrhs,
873  $ af( odd_size+2 ), int_one+1,
874  $ b( part_offset+odd_size+1 ), lldb, info )
875 *
876  IF( info.NE.0 ) THEN
877  GO TO 170
878  END IF
879 *
880 *
881 *
882 * *********
883  IF( mycol / level_dist.LE.( npcol-1 ) / level_dist-2 ) THEN
884 *
885 * Calculate contribution from this block to next diagonal block
886 *
887  CALL sgemm( 'T', 'N', int_one, nrhs, int_one, -one,
888  $ af( ( odd_size )*int_one+1 ), int_one,
889  $ b( part_offset+odd_size+1 ), lldb, zero,
890  $ work( 1 ), int_one )
891 *
892 * Send contribution to diagonal block's owning processor.
893 *
894  CALL sgesd2d( ictxt, int_one, nrhs, work( 1 ), int_one,
895  $ 0, mycol+level_dist )
896 *
897  END IF
898 * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
899 *
900 * ************
901  IF( ( mycol / level_dist.GT.0 ) .AND.
902  $ ( mycol / level_dist.LE.( npcol-1 ) / level_dist-1 ) )
903  $ THEN
904 *
905 *
906 * Use offdiagonal block to calculate modification to diag block
907 * of processor to the left
908 *
909  CALL sgemm( 'N', 'N', int_one, nrhs, int_one, -one,
910  $ af( odd_size*int_one+2+1 ), int_one,
911  $ b( part_offset+odd_size+1 ), lldb, zero,
912  $ work( 1 ), int_one )
913 *
914 * Send contribution to diagonal block's owning processor.
915 *
916  CALL sgesd2d( ictxt, int_one, nrhs, work( 1 ), int_one,
917  $ 0, mycol-level_dist )
918 *
919  END IF
920 * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
921 *
922  30 CONTINUE
923 *
924  ELSE
925 *
926 ******************** BACKSOLVE *************************************
927 *
928 ********************************************************************
929 * .. Begin reduced system phase of algorithm ..
930 ********************************************************************
931 *
932 *
933 *
934 * The last processor does not participate in the solution of the
935 * reduced system and just waits to receive its solution.
936  IF( mycol.EQ.npcol-1 ) THEN
937  GO TO 80
938  END IF
939 *
940 * Determine number of steps in tree loop
941 *
942  level_dist = 1
943  40 CONTINUE
944  IF( mod( ( mycol+1 ) / level_dist, 2 ).NE.0 )
945  $ GO TO 50
946 *
947  level_dist = level_dist*2
948 *
949  GO TO 40
950  50 CONTINUE
951 *
952 *
953  IF( ( mycol / level_dist.GT.0 ) .AND.
954  $ ( mycol / level_dist.LE.( npcol-1 ) / level_dist-1 ) )
955  $ THEN
956 *
957 * Receive solution from processor to left
958 *
959  CALL sgerv2d( ictxt, int_one, nrhs, work( 1 ), int_one,
960  $ 0, mycol-level_dist )
961 *
962 *
963 * Use offdiagonal block to calculate modification to RHS stored
964 * on this processor
965 *
966  CALL sgemm( 'T', 'N', int_one, nrhs, int_one, -one,
967  $ af( odd_size*int_one+2+1 ), int_one,
968  $ work( 1 ), int_one, one,
969  $ b( part_offset+odd_size+1 ), lldb )
970  END IF
971 * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
972 *
973 *
974  IF( mycol / level_dist.LE.( npcol-1 ) / level_dist-2 ) THEN
975 *
976 * Receive solution from processor to right
977 *
978  CALL sgerv2d( ictxt, int_one, nrhs, work( 1 ), int_one,
979  $ 0, mycol+level_dist )
980 *
981 * Calculate contribution from this block to next diagonal block
982 *
983  CALL sgemm( 'N', 'N', int_one, nrhs, int_one, -one,
984  $ af( ( odd_size )*int_one+1 ), int_one,
985  $ work( 1 ), int_one, one,
986  $ b( part_offset+odd_size+1 ), lldb )
987 *
988  END IF
989 * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
990 *
991 *
992 * Solve with diagonal block
993 *
994  CALL stbtrs( 'L', 'T', 'U', int_one,
995  $ min( int_one, int_one-1 ), nrhs,
996  $ af( odd_size+2 ), int_one+1,
997  $ b( part_offset+odd_size+1 ), lldb, info )
998 *
999  IF( info.NE.0 ) THEN
1000  GO TO 170
1001  END IF
1002 *
1003 *
1004 *
1005 ***Modification Loop *******
1006 *
1007  60 CONTINUE
1008  IF( level_dist.EQ.1 )
1009  $ GO TO 70
1010 *
1011  level_dist = level_dist / 2
1012 *
1013 * Send solution to the right
1014 *
1015  IF( mycol+level_dist.LT.npcol-1 ) THEN
1016 *
1017  CALL sgesd2d( ictxt, int_one, nrhs,
1018  $ b( part_offset+odd_size+1 ), lldb, 0,
1019  $ mycol+level_dist )
1020 *
1021  END IF
1022 *
1023 * Send solution to left
1024 *
1025  IF( mycol-level_dist.GE.0 ) THEN
1026 *
1027  CALL sgesd2d( ictxt, int_one, nrhs,
1028  $ b( part_offset+odd_size+1 ), lldb, 0,
1029  $ mycol-level_dist )
1030 *
1031  END IF
1032 *
1033  GO TO 60
1034  70 CONTINUE
1035 * [End of GOTO Loop]
1036 *
1037  80 CONTINUE
1038 * [Processor npcol - 1 jumped to here to await next stage]
1039 *
1040 *******************************
1041 * Reduced system has been solved, communicate solutions to nearest
1042 * neighbors in preparation for local computation phase.
1043 *
1044 *
1045 * Send elements of solution to next proc
1046 *
1047  IF( mycol.LT.npcol-1 ) THEN
1048 *
1049  CALL sgesd2d( ictxt, int_one, nrhs,
1050  $ b( part_offset+odd_size+1 ), lldb, 0,
1051  $ mycol+1 )
1052 *
1053  END IF
1054 *
1055 * Receive modifications to processor's right hand sides
1056 *
1057  IF( mycol.GT.0 ) THEN
1058 *
1059  CALL sgerv2d( ictxt, int_one, nrhs, work( 1 ), int_one,
1060  $ 0, mycol-1 )
1061 *
1062  END IF
1063 *
1064 *
1065 *
1066 **********************************************
1067 * Local computation phase
1068 **********************************************
1069 *
1070  IF( mycol.NE.0 ) THEN
1071 * Use the "spike" fillin to calculate contribution from previous
1072 * processor's solution.
1073 *
1074  CALL sgemm( 'N', 'N', odd_size, nrhs, int_one, -one,
1075  $ af( 1 ), odd_size, work( 1+int_one-int_one ),
1076  $ int_one, one, b( part_offset+1 ), lldb )
1077 *
1078  END IF
1079 *
1080 *
1081  IF( mycol.LT.np-1 ) THEN
1082 * Use factorization of odd-even connection block to modify
1083 * locally stored portion of right hand side(s)
1084 *
1085  CALL saxpy( nrhs, -( dl( part_offset+odd_size+1 ) ),
1086  $ b( part_offset+odd_size+1 ), lldb,
1087  $ b( part_offset+odd_size ), lldb )
1088 *
1089  END IF
1090 *
1091 * Use main partition in each processor to solve locally
1092 *
1093  CALL sdttrsv( uplo, 'T', odd_size, nrhs,
1094  $ dl( part_offset+2 ), d( part_offset+1 ),
1095  $ du( part_offset+1 ), b( part_offset+1 ), lldb,
1096  $ info )
1097 *
1098  END IF
1099 * End of "IF( LSAME( TRANS, 'N' ) )"...
1100 *
1101 *
1102  ELSE
1103 ***************************************************************
1104 * CASE UPLO = 'U' *
1105 ***************************************************************
1106  IF( lsame( trans, 'T' ) ) THEN
1107 *
1108 * Frontsolve
1109 *
1110 *
1111 ******************************************
1112 * Local computation phase
1113 ******************************************
1114 *
1115 * Use main partition in each processor to solve locally
1116 *
1117  CALL sdttrsv( uplo, 'T', odd_size, nrhs,
1118  $ dl( part_offset+2 ), d( part_offset+1 ),
1119  $ du( part_offset+1 ), b( part_offset+1 ), lldb,
1120  $ info )
1121 *
1122 *
1123  IF( mycol.LT.np-1 ) THEN
1124 * Use factorization of odd-even connection block to modify
1125 * locally stored portion of right hand side(s)
1126 *
1127  CALL saxpy( nrhs, -( du( part_offset+odd_size ) ),
1128  $ b( part_offset+odd_size ), lldb,
1129  $ b( part_offset+odd_size+1 ), lldb )
1130 *
1131  END IF
1132 *
1133 *
1134  IF( mycol.NE.0 ) THEN
1135 * Use the "spike" fillin to calculate contribution to previous
1136 * processor's righthand-side.
1137 *
1138  CALL sgemm( 'T', 'N', int_one, nrhs, odd_size, -one,
1139  $ af( work_u+1 ), odd_size, b( part_offset+1 ),
1140  $ lldb, zero, work( 1+int_one-int_one ),
1141  $ int_one )
1142  END IF
1143 *
1144 *
1145 ************************************************
1146 * Formation and solution of reduced system
1147 ************************************************
1148 *
1149 *
1150 * Send modifications to prior processor's right hand sides
1151 *
1152  IF( mycol.GT.0 ) THEN
1153 *
1154  CALL sgesd2d( ictxt, int_one, nrhs, work( 1 ), int_one,
1155  $ 0, mycol-1 )
1156 *
1157  END IF
1158 *
1159 * Receive modifications to processor's right hand sides
1160 *
1161  IF( mycol.LT.npcol-1 ) THEN
1162 *
1163  CALL sgerv2d( ictxt, int_one, nrhs, work( 1 ), int_one,
1164  $ 0, mycol+1 )
1165 *
1166 * Combine contribution to locally stored right hand sides
1167 *
1168  CALL smatadd( int_one, nrhs, one, work( 1 ), int_one,
1169  $ one, b( part_offset+odd_size+1 ), lldb )
1170 *
1171  END IF
1172 *
1173 *
1174 * The last processor does not participate in the solution of the
1175 * reduced system, having sent its contribution already.
1176  IF( mycol.EQ.npcol-1 ) THEN
1177  GO TO 110
1178  END IF
1179 *
1180 *
1181 * *************************************
1182 * Modification Loop
1183 *
1184 * The distance for sending and receiving for each level starts
1185 * at 1 for the first level.
1186  level_dist = 1
1187 *
1188 * Do until this proc is needed to modify other procs' equations
1189 *
1190  90 CONTINUE
1191  IF( mod( ( mycol+1 ) / level_dist, 2 ).NE.0 )
1192  $ GO TO 100
1193 *
1194 * Receive and add contribution to righthand sides from left
1195 *
1196  IF( mycol-level_dist.GE.0 ) THEN
1197 *
1198  CALL sgerv2d( ictxt, int_one, nrhs, work( 1 ), int_one,
1199  $ 0, mycol-level_dist )
1200 *
1201  CALL smatadd( int_one, nrhs, one, work( 1 ), int_one,
1202  $ one, b( part_offset+odd_size+1 ), lldb )
1203 *
1204  END IF
1205 *
1206 * Receive and add contribution to righthand sides from right
1207 *
1208  IF( mycol+level_dist.LT.npcol-1 ) THEN
1209 *
1210  CALL sgerv2d( ictxt, int_one, nrhs, work( 1 ), int_one,
1211  $ 0, mycol+level_dist )
1212 *
1213  CALL smatadd( int_one, nrhs, one, work( 1 ), int_one,
1214  $ one, b( part_offset+odd_size+1 ), lldb )
1215 *
1216  END IF
1217 *
1218  level_dist = level_dist*2
1219 *
1220  GO TO 90
1221  100 CONTINUE
1222 * [End of GOTO Loop]
1223 *
1224 *
1225 *
1226 * *********************************
1227 * Calculate and use this proc's blocks to modify other procs
1228 *
1229 * Solve with diagonal block
1230 *
1231  CALL stbtrs( 'U', 'T', 'N', int_one,
1232  $ min( int_one, int_one-1 ), nrhs,
1233  $ af( odd_size+2 ), int_one+1,
1234  $ b( part_offset+odd_size+1 ), lldb, info )
1235 *
1236  IF( info.NE.0 ) THEN
1237  GO TO 170
1238  END IF
1239 *
1240 *
1241 *
1242 * *********
1243  IF( mycol / level_dist.LE.( npcol-1 ) / level_dist-2 ) THEN
1244 *
1245 * Calculate contribution from this block to next diagonal block
1246 *
1247  CALL sgemm( 'T', 'N', int_one, nrhs, int_one, -one,
1248  $ af( work_u+( odd_size )*int_one+1 ), int_one,
1249  $ b( part_offset+odd_size+1 ), lldb, zero,
1250  $ work( 1 ), int_one )
1251 *
1252 * Send contribution to diagonal block's owning processor.
1253 *
1254  CALL sgesd2d( ictxt, int_one, nrhs, work( 1 ), int_one,
1255  $ 0, mycol+level_dist )
1256 *
1257  END IF
1258 * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
1259 *
1260 * ************
1261  IF( ( mycol / level_dist.GT.0 ) .AND.
1262  $ ( mycol / level_dist.LE.( npcol-1 ) / level_dist-1 ) )
1263  $ THEN
1264 *
1265 *
1266 * Use offdiagonal block to calculate modification to diag block
1267 * of processor to the left
1268 *
1269  CALL sgemm( 'N', 'N', int_one, nrhs, int_one, -one,
1270  $ af( work_u+odd_size*int_one+2+1 ), int_one,
1271  $ b( part_offset+odd_size+1 ), lldb, zero,
1272  $ work( 1 ), int_one )
1273 *
1274 * Send contribution to diagonal block's owning processor.
1275 *
1276  CALL sgesd2d( ictxt, int_one, nrhs, work( 1 ), int_one,
1277  $ 0, mycol-level_dist )
1278 *
1279  END IF
1280 * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
1281 *
1282  110 CONTINUE
1283 *
1284  ELSE
1285 *
1286 ******************** BACKSOLVE *************************************
1287 *
1288 ********************************************************************
1289 * .. Begin reduced system phase of algorithm ..
1290 ********************************************************************
1291 *
1292 *
1293 *
1294 * The last processor does not participate in the solution of the
1295 * reduced system and just waits to receive its solution.
1296  IF( mycol.EQ.npcol-1 ) THEN
1297  GO TO 160
1298  END IF
1299 *
1300 * Determine number of steps in tree loop
1301 *
1302  level_dist = 1
1303  120 CONTINUE
1304  IF( mod( ( mycol+1 ) / level_dist, 2 ).NE.0 )
1305  $ GO TO 130
1306 *
1307  level_dist = level_dist*2
1308 *
1309  GO TO 120
1310  130 CONTINUE
1311 *
1312 *
1313  IF( ( mycol / level_dist.GT.0 ) .AND.
1314  $ ( mycol / level_dist.LE.( npcol-1 ) / level_dist-1 ) )
1315  $ THEN
1316 *
1317 * Receive solution from processor to left
1318 *
1319  CALL sgerv2d( ictxt, int_one, nrhs, work( 1 ), int_one,
1320  $ 0, mycol-level_dist )
1321 *
1322 *
1323 * Use offdiagonal block to calculate modification to RHS stored
1324 * on this processor
1325 *
1326  CALL sgemm( 'T', 'N', int_one, nrhs, int_one, -one,
1327  $ af( work_u+odd_size*int_one+2+1 ), int_one,
1328  $ work( 1 ), int_one, one,
1329  $ b( part_offset+odd_size+1 ), lldb )
1330  END IF
1331 * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
1332 *
1333 *
1334  IF( mycol / level_dist.LE.( npcol-1 ) / level_dist-2 ) THEN
1335 *
1336 * Receive solution from processor to right
1337 *
1338  CALL sgerv2d( ictxt, int_one, nrhs, work( 1 ), int_one,
1339  $ 0, mycol+level_dist )
1340 *
1341 * Calculate contribution from this block to next diagonal block
1342 *
1343  CALL sgemm( 'N', 'N', int_one, nrhs, int_one, -one,
1344  $ af( work_u+( odd_size )*int_one+1 ), int_one,
1345  $ work( 1 ), int_one, one,
1346  $ b( part_offset+odd_size+1 ), lldb )
1347 *
1348  END IF
1349 * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
1350 *
1351 *
1352 * Solve with diagonal block
1353 *
1354  CALL stbtrs( 'U', 'N', 'N', int_one,
1355  $ min( int_one, int_one-1 ), nrhs,
1356  $ af( odd_size+2 ), int_one+1,
1357  $ b( part_offset+odd_size+1 ), lldb, info )
1358 *
1359  IF( info.NE.0 ) THEN
1360  GO TO 170
1361  END IF
1362 *
1363 *
1364 *
1365 ***Modification Loop *******
1366 *
1367  140 CONTINUE
1368  IF( level_dist.EQ.1 )
1369  $ GO TO 150
1370 *
1371  level_dist = level_dist / 2
1372 *
1373 * Send solution to the right
1374 *
1375  IF( mycol+level_dist.LT.npcol-1 ) THEN
1376 *
1377  CALL sgesd2d( ictxt, int_one, nrhs,
1378  $ b( part_offset+odd_size+1 ), lldb, 0,
1379  $ mycol+level_dist )
1380 *
1381  END IF
1382 *
1383 * Send solution to left
1384 *
1385  IF( mycol-level_dist.GE.0 ) THEN
1386 *
1387  CALL sgesd2d( ictxt, int_one, nrhs,
1388  $ b( part_offset+odd_size+1 ), lldb, 0,
1389  $ mycol-level_dist )
1390 *
1391  END IF
1392 *
1393  GO TO 140
1394  150 CONTINUE
1395 * [End of GOTO Loop]
1396 *
1397  160 CONTINUE
1398 * [Processor npcol - 1 jumped to here to await next stage]
1399 *
1400 *******************************
1401 * Reduced system has been solved, communicate solutions to nearest
1402 * neighbors in preparation for local computation phase.
1403 *
1404 *
1405 * Send elements of solution to next proc
1406 *
1407  IF( mycol.LT.npcol-1 ) THEN
1408 *
1409  CALL sgesd2d( ictxt, int_one, nrhs,
1410  $ b( part_offset+odd_size+1 ), lldb, 0,
1411  $ mycol+1 )
1412 *
1413  END IF
1414 *
1415 * Receive modifications to processor's right hand sides
1416 *
1417  IF( mycol.GT.0 ) THEN
1418 *
1419  CALL sgerv2d( ictxt, int_one, nrhs, work( 1 ), int_one,
1420  $ 0, mycol-1 )
1421 *
1422  END IF
1423 *
1424 *
1425 *
1426 **********************************************
1427 * Local computation phase
1428 **********************************************
1429 *
1430  IF( mycol.NE.0 ) THEN
1431 * Use the "spike" fillin to calculate contribution from previous
1432 * processor's solution.
1433 *
1434  CALL sgemm( 'N', 'N', odd_size, nrhs, int_one, -one,
1435  $ af( work_u+1 ), odd_size,
1436  $ work( 1+int_one-int_one ), int_one, one,
1437  $ b( part_offset+1 ), lldb )
1438 *
1439  END IF
1440 *
1441 *
1442  IF( mycol.LT.np-1 ) THEN
1443 * Use factorization of odd-even connection block to modify
1444 * locally stored portion of right hand side(s)
1445 *
1446  CALL saxpy( nrhs, -( du( part_offset+odd_size ) ),
1447  $ b( part_offset+odd_size+1 ), lldb,
1448  $ b( part_offset+odd_size ), lldb )
1449 *
1450  END IF
1451 *
1452 * Use main partition in each processor to solve locally
1453 *
1454  CALL sdttrsv( uplo, 'N', odd_size, nrhs,
1455  $ du( part_offset+2 ), d( part_offset+1 ),
1456  $ du( part_offset+1 ), b( part_offset+1 ), lldb,
1457  $ info )
1458 *
1459  END IF
1460 * End of "IF( LSAME( TRANS, 'N' ) )"...
1461 *
1462 *
1463  END IF
1464 * End of "IF( LSAME( UPLO, 'L' ) )"...
1465  170 CONTINUE
1466 *
1467 *
1468 * Free BLACS space used to hold standard-form grid.
1469 *
1470  IF( ictxt_save.NE.ictxt_new ) THEN
1471  CALL blacs_gridexit( ictxt_new )
1472  END IF
1473 *
1474  180 CONTINUE
1475 *
1476 * Restore saved input parameters
1477 *
1478  ictxt = ictxt_save
1479  np = np_save
1480 *
1481 * Output minimum worksize
1482 *
1483  work( 1 ) = work_size_min
1484 *
1485 *
1486  RETURN
1487 *
1488 * End of PSDTTRSV
1489 *
1490  END