ScaLAPACK 2.1
ScaLAPACK: Scalable Linear Algebra PACKage
psgbtrs.f
1  SUBROUTINE psgbtrs( TRANS, N, BWL, BWU, NRHS, A, JA, DESCA, IPIV,
2  $ B, IB, DESCB, AF, LAF, WORK, LWORK, INFO )
3 *
4 * -- ScaLAPACK routine (version 2.0.2) --
5 * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver
6 * May 1 2012
7 *
8 * .. Scalar Arguments ..
9  CHARACTER TRANS
10  INTEGER BWL, BWU, IB, INFO, JA, LAF, LWORK, N, NRHS
11 * ..
12 * .. Array Arguments ..
13  INTEGER DESCA( * ), DESCB( * ), IPIV( * )
14  REAL A( * ), AF( * ), B( * ), WORK( * )
15 * ..
16 *
17 * Purpose
18 * =======
19 *
20 * PSGBTRS solves a system of linear equations
21 *
22 * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS)
23 * or
24 * A(1:N, JA:JA+N-1)' * X = B(IB:IB+N-1, 1:NRHS)
25 *
26 * where A(1:N, JA:JA+N-1) is the matrix used to produce the factors
27 * stored in A(1:N,JA:JA+N-1) and AF by PSGBTRF.
 28 * A(1:N, JA:JA+N-1) is an N-by-N real banded
 29 * distributed matrix with lower bandwidth BWL and
 30 * upper bandwidth BWU.
31 *
32 * Routine PSGBTRF MUST be called first.
33 *
34 * =====================================================================
35 *
36 * Arguments
37 * =========
38 *
39 *
40 * TRANS (global input) CHARACTER
41 * = 'N': Solve with A(1:N, JA:JA+N-1);
42 * = 'T' or 'C': Solve with A(1:N, JA:JA+N-1)^T;
43 *
44 * N (global input) INTEGER
45 * The number of rows and columns to be operated on, i.e. the
46 * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
47 *
48 * BWL (global input) INTEGER
49 * Number of subdiagonals. 0 <= BWL <= N-1
50 *
51 * BWU (global input) INTEGER
52 * Number of superdiagonals. 0 <= BWU <= N-1
53 *
54 * NRHS (global input) INTEGER
55 * The number of right hand sides, i.e., the number of columns
56 * of the distributed submatrix B(IB:IB+N-1, 1:NRHS).
57 * NRHS >= 0.
58 *
59 * A (local input/local output) REAL pointer into
60 * local memory to an array with first dimension
61 * LLD_A >=(2*bwl+2*bwu+1) (stored in DESCA).
 62 * On entry, this array contains the local pieces of the
 63 * LU factors of the N-by-N unsymmetric banded distributed
 64 * matrix A(1:N, JA:JA+N-1), as computed by PSGBTRF.
65 * This local portion is stored in the packed banded format
66 * used in LAPACK. Please see the Notes below and the
67 * ScaLAPACK manual for more detail on the format of
68 * distributed matrices.
69 *
70 * JA (global input) INTEGER
71 * The index in the global array A that points to the start of
72 * the matrix to be operated on (which may be either all of A
73 * or a submatrix of A).
74 *
75 * DESCA (global and local input) INTEGER array of dimension DLEN.
76 * if 1D type (DTYPE_A=501), DLEN >= 7;
77 * if 2D type (DTYPE_A=1), DLEN >= 9 .
78 * The array descriptor for the distributed matrix A.
79 * Contains information of mapping of A to memory. Please
80 * see NOTES below for full description and options.
81 *
 82 * IPIV (local input) INTEGER array, dimension >= DESCA( NB ).
 83 * Pivot indices from the local factorizations in PSGBTRF.
84 * Users *should not* alter the contents between
85 * factorization and solve.
86 *
87 * B (local input/local output) REAL pointer into
88 * local memory to an array of local lead dimension lld_b>=NB.
 89 * On entry, this array contains the local pieces of the
 90 * right hand sides
 91 * B(IB:IB+N-1, 1:NRHS).
 92 * On exit, this contains the local pieces of the
 93 * distributed solution matrix X.
94 *
95 * IB (global input) INTEGER
96 * The row index in the global array B that points to the first
97 * row of the matrix to be operated on (which may be either
98 * all of B or a submatrix of B).
99 *
100 * DESCB (global and local input) INTEGER array of dimension DLEN.
101 * if 1D type (DTYPE_B=502), DLEN >=7;
102 * if 2D type (DTYPE_B=1), DLEN >= 9.
103 * The array descriptor for the distributed matrix B.
104 * Contains information of mapping of B to memory. Please
105 * see NOTES below for full description and options.
106 *
 107 * AF (local input) REAL array, dimension LAF.
108 * Auxiliary Fillin Space.
109 * Fillin is created during the factorization routine
110 * PSGBTRF and this is stored in AF. If a linear system
111 * is to be solved using PSGBTRS after the factorization
112 * routine, AF *must not be altered* after the factorization.
113 *
114 * LAF (local input) INTEGER
115 * Size of user-input Auxiliary Fillin space AF. Must be >=
116 * (NB+bwu)*(bwl+bwu)+6*(bwl+bwu)*(bwl+2*bwu)
117 * If LAF is not large enough, an error code will be returned
118 * and the minimum acceptable size will be returned in AF( 1 )
119 *
120 * WORK (local workspace/local output)
121 * REAL temporary workspace. This space may
122 * be overwritten in between calls to routines. WORK must be
123 * the size given in LWORK.
124 * On exit, WORK( 1 ) contains the minimal LWORK.
125 *
126 * LWORK (local input or global input) INTEGER
127 * Size of user-input workspace WORK.
128 * If LWORK is too small, the minimal acceptable size will be
129 * returned in WORK(1) and an error code is returned. LWORK>=
130 * NRHS*(NB+2*bwl+4*bwu)
131 *
132 * INFO (global output) INTEGER
133 * = 0: successful exit
 134 * < 0: If the i-th argument is an array and the j-th entry had
 135 * an illegal value, then INFO = -(i*100+j); if the i-th
136 * argument is a scalar and had an illegal value, then
137 * INFO = -i.
138 *
139 * =====================================================================
140 *
141 * Restrictions
142 * ============
143 *
144 * The following are restrictions on the input parameters. Some of these
145 * are temporary and will be removed in future releases, while others
146 * may reflect fundamental technical limitations.
147 *
148 * Non-cyclic restriction: VERY IMPORTANT!
149 * P*NB>= mod(JA-1,NB)+N.
150 * The mapping for matrices must be blocked, reflecting the nature
151 * of the divide and conquer algorithm as a task-parallel algorithm.
152 * This formula in words is: no processor may have more than one
153 * chunk of the matrix.
154 *
155 * Blocksize cannot be too small:
156 * If the matrix spans more than one processor, the following
157 * restriction on NB, the size of each block on each processor,
158 * must hold:
159 * NB >= (BWL+BWU)+1
160 * The bulk of parallel computation is done on the matrix of size
161 * O(NB) on each processor. If this is too small, divide and conquer
162 * is a poor choice of algorithm.
163 *
164 * Submatrix reference:
165 * JA = IB
166 * Alignment restriction that prevents unnecessary communication.
167 *
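*
* As an illustrative check (a sketch only; P below denotes the number
* of process columns in the 1-by-P grid and LEGAL is a hypothetical
* LOGICAL variable, neither being part of this routine's interface):
*
*     LEGAL = ( P*NB .GE. MOD( JA-1, NB )+N ) .AND. ( JA .EQ. IB )
*     IF( JA+N-1 .GT. NB )
*    $   LEGAL = LEGAL .AND. ( NB .GE. (BWL+BWU)+1 )
*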
168 * =====================================================================
169 *
170 * Notes
171 * =====
172 *
173 * If the factorization routine and the solve routine are to be called
 174 * separately (to solve various sets of right-hand sides using the same
175 * coefficient matrix), the auxiliary space AF *must not be altered*
176 * between calls to the factorization routine and the solve routine.
177 *
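* As an illustrative sketch (not part of the original source), a
* typical factor-then-solve sequence might look as follows, assuming
* the distributed matrix A, the right-hand sides B and the descriptors
* DESCA and DESCB have been set up as described in the Descriptors
* notes below. The workspace sizes use the formulas documented in the
* Arguments section; PSGBTRF documents its own minimum workspace,
* which may differ.
*
*     LAF   = (NB+BWU)*(BWL+BWU) + 6*(BWL+BWU)*(BWL+2*BWU)
*     LWORK = NRHS*(NB+2*BWL+4*BWU)
*
*     CALL PSGBTRF( N, BWL, BWU, A, JA, DESCA, IPIV, AF, LAF,
*    $              WORK, LWORK, INFO )
*     IF( INFO.EQ.0 )
*    $   CALL PSGBTRS( 'N', N, BWL, BWU, NRHS, A, JA, DESCA, IPIV,
*    $                 B, IB, DESCB, AF, LAF, WORK, LWORK, INFO )
*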
178 * The best algorithm for solving banded and tridiagonal linear systems
179 * depends on a variety of parameters, especially the bandwidth.
180 * Currently, only algorithms designed for the case N/P >> bw are
181 * implemented. These go by many names, including Divide and Conquer,
182 * Partitioning, domain decomposition-type, etc.
183 *
184 * Algorithm description: Divide and Conquer
185 *
 186 * The Divide and Conquer algorithm assumes the matrix is narrowly
187 * banded compared with the number of equations. In this situation,
188 * it is best to distribute the input matrix A one-dimensionally,
189 * with columns atomic and rows divided amongst the processes.
190 * The basic algorithm divides the banded matrix up into
191 * P pieces with one stored on each processor,
192 * and then proceeds in 2 phases for the factorization or 3 for the
193 * solution of a linear system.
194 * 1) Local Phase:
195 * The individual pieces are factored independently and in
196 * parallel. These factors are applied to the matrix creating
197 * fillin, which is stored in a non-inspectable way in auxiliary
198 * space AF. Mathematically, this is equivalent to reordering
199 * the matrix A as P A P^T and then factoring the principal
200 * leading submatrix of size equal to the sum of the sizes of
201 * the matrices factored on each processor. The factors of
202 * these submatrices overwrite the corresponding parts of A
203 * in memory.
204 * 2) Reduced System Phase:
205 * A small (max(bwl,bwu)* (P-1)) system is formed representing
206 * interaction of the larger blocks, and is stored (as are its
207 * factors) in the space AF. A parallel Block Cyclic Reduction
208 * algorithm is used. For a linear system, a parallel front solve
 209 * followed by an analogous backsolve, both using the structure
210 * of the factored matrix, are performed.
 211 * 3) Backsubstitution Phase:
212 * For a linear system, a local backsubstitution is performed on
213 * each processor in parallel.
214 *
215 *
216 * Descriptors
217 * ===========
218 *
219 * Descriptors now have *types* and differ from ScaLAPACK 1.0.
220 *
221 * Note: banded codes can use either the old two dimensional
222 * or new one-dimensional descriptors, though the processor grid in
223 * both cases *must be one-dimensional*. We describe both types below.
224 *
225 * Each global data object is described by an associated description
226 * vector. This vector stores the information required to establish
227 * the mapping between an object element and its corresponding process
228 * and memory location.
229 *
 230 * Let A be a generic term for any 2D block cyclically distributed array.
231 * Such a global array has an associated description vector DESCA.
232 * In the following comments, the character _ should be read as
233 * "of the global array".
234 *
235 * NOTATION STORED IN EXPLANATION
236 * --------------- -------------- --------------------------------------
 237 * DTYPE_A(global) DESCA( DTYPE_ ) The descriptor type. In this case,
238 * DTYPE_A = 1.
239 * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
240 * the BLACS process grid A is distribu-
241 * ted over. The context itself is glo-
242 * bal, but the handle (the integer
243 * value) may vary.
244 * M_A (global) DESCA( M_ ) The number of rows in the global
245 * array A.
246 * N_A (global) DESCA( N_ ) The number of columns in the global
247 * array A.
248 * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute
249 * the rows of the array.
250 * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute
251 * the columns of the array.
252 * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
253 * row of the array A is distributed.
254 * CSRC_A (global) DESCA( CSRC_ ) The process column over which the
255 * first column of the array A is
256 * distributed.
257 * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local
258 * array. LLD_A >= MAX(1,LOCr(M_A)).
259 *
260 * Let K be the number of rows or columns of a distributed matrix,
261 * and assume that its process grid has dimension p x q.
262 * LOCr( K ) denotes the number of elements of K that a process
263 * would receive if K were distributed over the p processes of its
264 * process column.
265 * Similarly, LOCc( K ) denotes the number of elements of K that a
266 * process would receive if K were distributed over the q processes of
267 * its process row.
268 * The values of LOCr() and LOCc() may be determined via a call to the
269 * ScaLAPACK tool function, NUMROC:
270 * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
271 * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
272 * An upper bound for these quantities may be computed by:
273 * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
274 * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
275 *
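* For the 1-by-P layout used by this routine, for example, the number
* of matrix columns stored locally could be computed as follows
* (a sketch; NP_LOCAL is just an illustrative name, and MYCOL and
* NPCOL are obtained from BLACS_GRIDINFO):
*
*     INTEGER  NUMROC, NP_LOCAL
*     EXTERNAL NUMROC
*     NP_LOCAL = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL )
*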
276 *
277 * One-dimensional descriptors:
278 *
279 * One-dimensional descriptors are a new addition to ScaLAPACK since
280 * version 1.0. They simplify and shorten the descriptor for 1D
281 * arrays.
282 *
283 * Since ScaLAPACK supports two-dimensional arrays as the fundamental
284 * object, we allow 1D arrays to be distributed either over the
285 * first dimension of the array (as if the grid were P-by-1) or the
286 * 2nd dimension (as if the grid were 1-by-P). This choice is
287 * indicated by the descriptor type (501 or 502)
288 * as described below.
289 *
290 * IMPORTANT NOTE: the actual BLACS grid represented by the
291 * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P
292 * irrespective of which one-dimensional descriptor type
293 * (501 or 502) is input.
294 * This routine will interpret the grid properly either way.
295 * ScaLAPACK routines *do not support intercontext operations* so that
296 * the grid passed to a single ScaLAPACK routine *must be the same*
297 * for all array descriptors passed to that routine.
298 *
299 * NOTE: In all cases where 1D descriptors are used, 2D descriptors
300 * may also be used, since a one-dimensional array is a special case
301 * of a two-dimensional array with one dimension of size unity.
302 * The two-dimensional array used in this case *must* be of the
303 * proper orientation:
304 * If the appropriate one-dimensional descriptor is DTYPEA=501
305 * (1 by P type), then the two dimensional descriptor must
306 * have a CTXT value that refers to a 1 by P BLACS grid;
307 * If the appropriate one-dimensional descriptor is DTYPEA=502
308 * (P by 1 type), then the two dimensional descriptor must
309 * have a CTXT value that refers to a P by 1 BLACS grid.
310 *
311 *
312 * Summary of allowed descriptors, types, and BLACS grids:
313 * DTYPE 501 502 1 1
314 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1
315 * -----------------------------------------------------
316 * A OK NO OK NO
317 * B NO OK NO OK
318 *
319 * Note that a consequence of this chart is that it is not possible
320 * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead
321 * to opposite requirements for the orientation of the BLACS grid,
322 * and as noted before, the *same* BLACS context must be used in
323 * all descriptors in a single ScaLAPACK subroutine call.
324 *
 325 * Let A be a generic term for any 1D block cyclically distributed array.
326 * Such a global array has an associated description vector DESCA.
327 * In the following comments, the character _ should be read as
328 * "of the global array".
329 *
330 * NOTATION STORED IN EXPLANATION
331 * --------------- ---------- ------------------------------------------
332 * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
333 * TYPE_A = 501: 1-by-P grid.
334 * TYPE_A = 502: P-by-1 grid.
335 * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
336 * the BLACS process grid A is distribu-
337 * ted over. The context itself is glo-
338 * bal, but the handle (the integer
339 * value) may vary.
340 * N_A (global) DESCA( 3 ) The size of the array dimension being
341 * distributed.
342 * NB_A (global) DESCA( 4 ) The blocking factor used to distribute
343 * the distributed dimension of the array.
344 * SRC_A (global) DESCA( 5 ) The process row or column over which the
345 * first row or column of the array
346 * is distributed.
347 * LLD_A (local) DESCA( 6 ) The leading dimension of the local array
348 * storing the local blocks of the distri-
349 * buted array A. Minimum value of LLD_A
350 * depends on TYPE_A.
 351 * TYPE_A = 501: LLD_A >=
 352 * MAX( size of undistributed dimension, 1 ).
 353 * TYPE_A = 502: LLD_A >= MAX( NB_A, 1 ).
354 * Reserved DESCA( 7 ) Reserved for future use.
355 *
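*
* For example, descriptors accepted by this routine could be filled
* in as follows (an illustrative sketch assuming JA = IB = 1; ICTXT,
* NB, LLDA >= 2*BWL+2*BWU+1 and LLDB >= NB are assumed to have been
* chosen as described above):
*
*     DESCA( 1 ) = 501
*     DESCA( 2 ) = ICTXT
*     DESCA( 3 ) = N
*     DESCA( 4 ) = NB
*     DESCA( 5 ) = 0
*     DESCA( 6 ) = LLDA
*     DESCA( 7 ) = 0
*
*     DESCB( 1 ) = 502
*     DESCB( 2 ) = ICTXT
*     DESCB( 3 ) = N
*     DESCB( 4 ) = NB
*     DESCB( 5 ) = 0
*     DESCB( 6 ) = LLDB
*     DESCB( 7 ) = 0
*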
356 * =====================================================================
357 *
358 * Implemented for ScaLAPACK by:
359 * Andrew J. Cleary, Livermore National Lab and University of Tenn.,
360 * and Markus Hegland, Australian National University. Feb., 1997.
361 * Based on code written by : Peter Arbenz, ETH Zurich, 1996.
362 * Last modified by: Peter Arbenz, Institute of Scientific Computing,
363 * ETH, Zurich.
364 *
365 * =====================================================================
366 *
367 * .. Parameters ..
368  REAL ONE
369  parameter( one = 1.0e+0 )
370  REAL ZERO
371  parameter( zero = 0.0e+0 )
372  INTEGER INT_ONE
373  parameter( int_one = 1 )
374  INTEGER DESCMULT, BIGNUM
375  parameter( descmult = 100, bignum = descmult*descmult )
376  INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
377  $ lld_, mb_, m_, nb_, n_, rsrc_
378  parameter( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,
379  $ ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,
380  $ rsrc_ = 7, csrc_ = 8, lld_ = 9 )
381 * ..
382 * .. Local Scalars ..
383  INTEGER APTR, BBPTR, BM, BMN, BN, BNN, BW, CSRC,
384  $ first_proc, ictxt, ictxt_new, ictxt_save,
385  $ idum2, idum3, j, ja_new, l, lbwl, lbwu, ldbb,
386  $ ldw, llda, lldb, lm, lmj, ln, lptr, mycol,
387  $ myrow, nb, neicol, np, npact, npcol, nprow,
388  $ npstr, np_save, odd_size, part_offset,
389  $ recovery_val, return_code, store_m_b,
390  $ store_n_a, work_size_min, wptr
391 * ..
392 * .. Local Arrays ..
393  INTEGER DESCA_1XP( 7 ), DESCB_PX1( 7 ),
394  $ param_check( 17, 3 )
395 * ..
396 * .. External Subroutines ..
397  EXTERNAL blacs_gridexit, blacs_gridinfo, scopy,
398  $ desc_convert, sgemm, sgemv, sger, sgerv2d,
399  $ sgesd2d, sgetrs, slamov, slaswp, sscal, sswap,
400  $ strsm, globchk, pxerbla, reshape
401 * ..
402 * .. External Functions ..
403  LOGICAL LSAME
404  INTEGER NUMROC
405  EXTERNAL lsame, numroc
406 * ..
407 * .. Intrinsic Functions ..
408  INTRINSIC ichar, max, min, mod
409 * ..
410 * .. Executable Statements ..
411 *
412 *
413 * Test the input parameters
414 *
415  info = 0
416 *
417 * Convert descriptor into standard form for easy access to
418 * parameters, check that grid is of right shape.
419 *
420  desca_1xp( 1 ) = 501
421  descb_px1( 1 ) = 502
422 *
423  CALL desc_convert( desca, desca_1xp, return_code )
424 *
425  IF( return_code.NE.0 ) THEN
426  info = -( 8*100+2 )
427  END IF
428 *
429  CALL desc_convert( descb, descb_px1, return_code )
430 *
431  IF( return_code.NE.0 ) THEN
432  info = -( 11*100+2 )
433  END IF
434 *
435 * Consistency checks for DESCA and DESCB.
436 *
437 * Context must be the same
438  IF( desca_1xp( 2 ).NE.descb_px1( 2 ) ) THEN
439  info = -( 11*100+2 )
440  END IF
441 *
442 * These are alignment restrictions that may or may not be removed
443 * in future releases. -Andy Cleary, April 14, 1996.
444 *
445 * Block sizes must be the same
446  IF( desca_1xp( 4 ).NE.descb_px1( 4 ) ) THEN
447  info = -( 11*100+4 )
448  END IF
449 *
450 * Source processor must be the same
451 *
452  IF( desca_1xp( 5 ).NE.descb_px1( 5 ) ) THEN
453  info = -( 11*100+5 )
454  END IF
455 *
456 * Get values out of descriptor for use in code.
457 *
458  ictxt = desca_1xp( 2 )
459  csrc = desca_1xp( 5 )
460  nb = desca_1xp( 4 )
461  llda = desca_1xp( 6 )
462  store_n_a = desca_1xp( 3 )
463  lldb = descb_px1( 6 )
464  store_m_b = descb_px1( 3 )
465 *
466 * Get grid parameters
467 *
468 *
469  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
470  np = nprow*npcol
471 *
472 *
473 *
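* Map TRANS onto an integer token for the global parameter
* consistency check (GLOBCHK) below.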
474  IF( lsame( trans, 'N' ) ) THEN
475  idum2 = ichar( 'N' )
476  ELSE IF( lsame( trans, 'T' ) ) THEN
477  idum2 = ichar( 'T' )
478  ELSE IF( lsame( trans, 'C' ) ) THEN
479  idum2 = ichar( 'T' )
480  ELSE
481  info = -1
482  END IF
483 *
484  IF( lwork.LT.-1 ) THEN
485  info = -16
486  ELSE IF( lwork.EQ.-1 ) THEN
487  idum3 = -1
488  ELSE
489  idum3 = 1
490  END IF
491 *
492  IF( n.LT.0 ) THEN
493  info = -2
494  END IF
495 *
496  IF( n+ja-1.GT.store_n_a ) THEN
497  info = -( 8*100+6 )
498  END IF
499 *
500  IF( ( bwl.GT.n-1 ) .OR. ( bwl.LT.0 ) ) THEN
501  info = -3
502  END IF
503 *
504  IF( ( bwu.GT.n-1 ) .OR. ( bwu.LT.0 ) ) THEN
505  info = -4
506  END IF
507 *
508  IF( llda.LT.( 2*bwl+2*bwu+1 ) ) THEN
509  info = -( 8*100+6 )
510  END IF
511 *
512  IF( nb.LE.0 ) THEN
513  info = -( 8*100+4 )
514  END IF
515 *
516  bw = bwu + bwl
517 *
518  IF( n+ib-1.GT.store_m_b ) THEN
519  info = -( 11*100+3 )
520  END IF
521 *
522  IF( lldb.LT.nb ) THEN
523  info = -( 11*100+6 )
524  END IF
525 *
526  IF( nrhs.LT.0 ) THEN
527  info = -5
528  END IF
529 *
530 * Current alignment restriction
531 *
532  IF( ja.NE.ib ) THEN
533  info = -7
534  END IF
535 *
536 * Argument checking that is specific to Divide & Conquer routine
537 *
538  IF( nprow.NE.1 ) THEN
539  info = -( 8*100+2 )
540  END IF
541 *
542  IF( n.GT.np*nb-mod( ja-1, nb ) ) THEN
543  info = -( 2 )
544  CALL pxerbla( ictxt, 'PSGBTRS, D&C alg.: only 1 block per proc'
545  $ , -info )
546  RETURN
547  END IF
548 *
549  IF( ( ja+n-1.GT.nb ) .AND. ( nb.LT.( bwl+bwu+1 ) ) ) THEN
550  info = -( 8*100+4 )
551  CALL pxerbla( ictxt, 'PSGBTRS, D&C alg.: NB too small', -info )
552  RETURN
553  END IF
554 *
555 *
556 * Check worksize
557 *
558  work_size_min = nrhs*( nb+2*bwl+4*bwu )
559 *
560  work( 1 ) = work_size_min
561 *
562  IF( lwork.LT.work_size_min ) THEN
563  IF( lwork.NE.-1 ) THEN
564  info = -16
565  CALL pxerbla( ictxt, 'PSGBTRS: worksize error ', -info )
566  END IF
567  RETURN
568  END IF
569 *
570 * Pack params and positions into arrays for global consistency check
571 *
572  param_check( 17, 1 ) = descb( 5 )
573  param_check( 16, 1 ) = descb( 4 )
574  param_check( 15, 1 ) = descb( 3 )
575  param_check( 14, 1 ) = descb( 2 )
576  param_check( 13, 1 ) = descb( 1 )
577  param_check( 12, 1 ) = ib
578  param_check( 11, 1 ) = desca( 5 )
579  param_check( 10, 1 ) = desca( 4 )
580  param_check( 9, 1 ) = desca( 3 )
581  param_check( 8, 1 ) = desca( 1 )
582  param_check( 7, 1 ) = ja
583  param_check( 6, 1 ) = nrhs
584  param_check( 5, 1 ) = bwu
585  param_check( 4, 1 ) = bwl
586  param_check( 3, 1 ) = n
587  param_check( 2, 1 ) = idum3
588  param_check( 1, 1 ) = idum2
589 *
590  param_check( 17, 2 ) = 1105
591  param_check( 16, 2 ) = 1104
592  param_check( 15, 2 ) = 1103
593  param_check( 14, 2 ) = 1102
594  param_check( 13, 2 ) = 1101
595  param_check( 12, 2 ) = 10
596  param_check( 11, 2 ) = 805
597  param_check( 10, 2 ) = 804
598  param_check( 9, 2 ) = 803
599  param_check( 8, 2 ) = 801
600  param_check( 7, 2 ) = 7
601  param_check( 6, 2 ) = 5
602  param_check( 5, 2 ) = 4
603  param_check( 4, 2 ) = 3
604  param_check( 3, 2 ) = 2
605  param_check( 2, 2 ) = 16
606  param_check( 1, 2 ) = 1
607 *
608 * Want to find errors with MIN( ), so if no error, set it to a big
 609 * number. If there already is an error, multiply by the
 610 * descriptor multiplier.
611 *
612  IF( info.GE.0 ) THEN
613  info = bignum
614  ELSE IF( info.LT.-descmult ) THEN
615  info = -info
616  ELSE
617  info = -info*descmult
618  END IF
619 *
620 * Check consistency across processors
621 *
622  CALL globchk( ictxt, 17, param_check, 17, param_check( 1, 3 ),
623  $ info )
624 *
625 * Prepare output: set info = 0 if no error, and divide by DESCMULT
626 * if error is not in a descriptor entry.
627 *
628  IF( info.EQ.bignum ) THEN
629  info = 0
630  ELSE IF( mod( info, descmult ).EQ.0 ) THEN
631  info = -info / descmult
632  ELSE
633  info = -info
634  END IF
635 *
636  IF( info.LT.0 ) THEN
637  CALL pxerbla( ictxt, 'PSGBTRS', -info )
638  RETURN
639  END IF
640 *
641 * Quick return if possible
642 *
643  IF( n.EQ.0 )
644  $ RETURN
645 *
646  IF( nrhs.EQ.0 )
647  $ RETURN
648 *
649 *
650 * Adjust addressing into matrix space to properly get into
651 * the beginning part of the relevant data
652 *
653  part_offset = nb*( ( ja-1 ) / ( npcol*nb ) )
654 *
655  IF( ( mycol-csrc ).LT.( ja-part_offset-1 ) / nb ) THEN
656  part_offset = part_offset + nb
657  END IF
658 *
659  IF( mycol.LT.csrc ) THEN
660  part_offset = part_offset - nb
661  END IF
662 *
663 * Form a new BLACS grid (the "standard form" grid) with only procs
664 * holding part of the matrix, of size 1xNP where NP is adjusted,
665 * starting at csrc=0, with JA modified to reflect dropped procs.
666 *
667 * First processor to hold part of the matrix:
668 *
669  first_proc = mod( ( ja-1 ) / nb+csrc, npcol )
670 *
 671 * Calculate the new JA while dropping off unused processors.
672 *
673  ja_new = mod( ja-1, nb ) + 1
674 *
675 * Save and compute new value of NP
676 *
677  np_save = np
678  np = ( ja_new+n-2 ) / nb + 1
679 *
680 * Call utility routine that forms "standard-form" grid
681 *
682  CALL reshape( ictxt, int_one, ictxt_new, int_one, first_proc,
683  $ int_one, np )
684 *
685 * Use new context from standard grid as context.
686 *
687  ictxt_save = ictxt
688  ictxt = ictxt_new
689  desca_1xp( 2 ) = ictxt_new
690  descb_px1( 2 ) = ictxt_new
691 *
692 * Get information about new grid.
693 *
694  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
695 *
696 * Drop out processors that do not have part of the matrix.
697 *
698  IF( myrow.LT.0 ) THEN
699  GO TO 100
700  END IF
701 *
702 *
703 *
704 * Begin main code
705 *
706 * Move data into workspace - communicate/copy (overlap)
707 *
708  IF( mycol.LT.npcol-1 ) THEN
709  CALL sgesd2d( ictxt, bwu, nrhs, b( nb-bwu+1 ), lldb, 0,
710  $ mycol+1 )
711  END IF
712 *
713  IF( mycol.LT.npcol-1 ) THEN
714  lm = nb - bwu
715  ELSE
716  lm = nb
717  END IF
718 *
719  IF( mycol.GT.0 ) THEN
720  wptr = bwu + 1
721  ELSE
722  wptr = 1
723  END IF
724 *
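* Leading dimension of the workspace copy of the local right-hand
* sides; the extra rows hold blocks exchanged with neighboring
* processes.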
725  ldw = nb + bwu + 2*bw + bwu
726 *
727  CALL slamov( 'G', lm, nrhs, b( 1 ), lldb, work( wptr ), ldw )
728 *
729 * Zero out rest of work
730 *
731  DO 20 j = 1, nrhs
732  DO 10 l = wptr + lm, ldw
733  work( ( j-1 )*ldw+l ) = zero
734  10 CONTINUE
735  20 CONTINUE
736 *
737  IF( mycol.GT.0 ) THEN
738  CALL sgerv2d( ictxt, bwu, nrhs, work( 1 ), ldw, 0, mycol-1 )
739  END IF
740 *
741 ********************************************************************
742 * PHASE 1: Local computation phase -- Solve L*X = B
743 ********************************************************************
744 *
745 * Size of main (or odd) partition in each processor
746 *
747  odd_size = numroc( n, nb, mycol, 0, npcol )
748 *
749  IF( mycol.NE.0 ) THEN
750  lbwl = bw
751  lbwu = 0
752  aptr = 1
753  ELSE
754  lbwl = bwl
755  lbwu = bwu
756  aptr = 1 + bwu
757  END IF
758 *
759  IF( mycol.NE.npcol-1 ) THEN
760  lm = nb - lbwu
761  ln = nb - bw
762  ELSE IF( mycol.NE.0 ) THEN
763  lm = odd_size + bwu
764  ln = max( odd_size-bw, 0 )
765  ELSE
766  lm = n
767  ln = max( n-bw, 0 )
768  END IF
769 *
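* Forward-eliminate the local right-hand sides: apply the row
* interchanges recorded in IPIV and subtract multiples of the
* pivot row from the rows below it (rank-1 update via SGER).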
770  DO 30 j = 1, ln
771 *
772  lmj = min( lbwl, lm-j )
773  l = ipiv( j )
774 *
775  IF( l.NE.j ) THEN
776  CALL sswap( nrhs, work( l ), ldw, work( j ), ldw )
777  END IF
778 *
779  lptr = bw + 1 + ( j-1 )*llda + aptr
780 *
781  CALL sger( lmj, nrhs, -one, a( lptr ), 1, work( j ), ldw,
782  $ work( j+1 ), ldw )
783 *
784  30 CONTINUE
785 *
786 ********************************************************************
787 * PHASE 2: Global computation phase -- Solve L*X = B
788 ********************************************************************
789 *
790 * Define the initial dimensions of the diagonal blocks
791 * The offdiagonal blocks (for MYCOL > 0) are of size BM by BW
792 *
793  IF( mycol.NE.npcol-1 ) THEN
794  bm = bw - lbwu
795  bn = bw
796  ELSE
797  bm = min( bw, odd_size ) + bwu
798  bn = min( bw, odd_size )
799  END IF
800 *
801 * Pointer to first element of block bidiagonal matrix in AF
802 * Leading dimension of block bidiagonal system
803 *
804  bbptr = ( nb+bwu )*bw + 1
805  ldbb = 2*bw + bwu
806 *
807  IF( npcol.EQ.1 ) THEN
808 *
809 * In this case the loop over the levels will not be
810 * performed.
811  CALL sgetrs( 'N', n-ln, nrhs, af( bbptr+bw*ldbb ), ldbb,
812  $ ipiv( ln+1 ), work( ln+1 ), ldw, info )
813 *
814  END IF
815 *
816 * Loop over levels ...
817 *
 818 * The two integers NPACT (number of active processors) and NPSTR
 819 * (stride between active processors) are used to control the
 820 * loop.
821 *
822  npact = npcol
823  npstr = 1
824 *
825 * Begin loop over levels
826  40 CONTINUE
827  IF( npact.LE.1 )
828  $ GO TO 50
829 *
830 * Test if processor is active
831  IF( mod( mycol, npstr ).EQ.0 ) THEN
832 *
833 * Send/Receive blocks
834 *
835  IF( mod( mycol, 2*npstr ).EQ.0 ) THEN
836 *
837  neicol = mycol + npstr
838 *
839  IF( neicol / npstr.LE.npact-1 ) THEN
840 *
841  IF( neicol / npstr.LT.npact-1 ) THEN
842  bmn = bw
843  ELSE
844  bmn = min( bw, numroc( n, nb, neicol, 0, npcol ) ) +
845  $ bwu
846  END IF
847 *
848  CALL sgesd2d( ictxt, bm, nrhs, work( ln+1 ), ldw, 0,
849  $ neicol )
850 *
851  IF( npact.NE.2 ) THEN
852 *
853 * Receive answers back from partner processor
854 *
855  CALL sgerv2d( ictxt, bm+bmn-bw, nrhs, work( ln+1 ),
856  $ ldw, 0, neicol )
857 *
858  bm = bm + bmn - bw
859 *
860  END IF
861 *
862  END IF
863 *
864  ELSE
865 *
866  neicol = mycol - npstr
867 *
868  IF( neicol.EQ.0 ) THEN
869  bmn = bw - bwu
870  ELSE
871  bmn = bw
872  END IF
873 *
874  CALL slamov( 'G', bm, nrhs, work( ln+1 ), ldw,
875  $ work( nb+bwu+bmn+1 ), ldw )
876 *
877  CALL sgerv2d( ictxt, bmn, nrhs, work( nb+bwu+1 ), ldw, 0,
878  $ neicol )
879 *
880 * and do the permutations and eliminations
881 *
882  IF( npact.NE.2 ) THEN
883 *
884 * Solve locally for BW variables
885 *
886  CALL slaswp( nrhs, work( nb+bwu+1 ), ldw, 1, bw,
887  $ ipiv( ln+1 ), 1 )
888 *
889  CALL strsm( 'L', 'L', 'N', 'U', bw, nrhs, one,
890  $ af( bbptr+bw*ldbb ), ldbb, work( nb+bwu+1 ),
891  $ ldw )
892 *
893 * Use soln just calculated to update RHS
894 *
895  CALL sgemm( 'N', 'N', bm+bmn-bw, nrhs, bw, -one,
896  $ af( bbptr+bw*ldbb+bw ), ldbb,
897  $ work( nb+bwu+1 ), ldw, one,
898  $ work( nb+bwu+1+bw ), ldw )
899 *
900 * Give answers back to partner processor
901 *
902  CALL sgesd2d( ictxt, bm+bmn-bw, nrhs,
903  $ work( nb+bwu+1+bw ), ldw, 0, neicol )
904 *
905  ELSE
906 *
907 * Finish up calculations for final level
908 *
909  CALL slaswp( nrhs, work( nb+bwu+1 ), ldw, 1, bm+bmn,
910  $ ipiv( ln+1 ), 1 )
911 *
912  CALL strsm( 'L', 'L', 'N', 'U', bm+bmn, nrhs, one,
913  $ af( bbptr+bw*ldbb ), ldbb, work( nb+bwu+1 ),
914  $ ldw )
915  END IF
916 *
917  END IF
918 *
919  npact = ( npact+1 ) / 2
920  npstr = npstr*2
921  GO TO 40
922 *
923  END IF
924 *
925  50 CONTINUE
926 *
927 *
928 **************************************
929 * BACKSOLVE
930 ********************************************************************
931 * PHASE 2: Global computation phase -- Solve U*Y = X
932 ********************************************************************
933 *
934  IF( npcol.EQ.1 ) THEN
935 *
936 * In this case the loop over the levels will not be
937 * performed.
938 * In fact, the backsolve portion was done in the call to
939 * SGETRS in the frontsolve.
940 *
941  END IF
942 *
943 * Compute variable needed to reverse loop structure in
944 * reduced system.
945 *
946  recovery_val = npact*npstr - npcol
947 *
948 * Loop over levels
949 * Terminal values of NPACT and NPSTR from frontsolve are used
950 *
951  60 CONTINUE
952  IF( npact.GE.npcol )
953  $ GO TO 80
954 *
955  npstr = npstr / 2
956 *
957  npact = npact*2
958 *
959 * Have to adjust npact for non-power-of-2
960 *
961  npact = npact - mod( ( recovery_val / npstr ), 2 )
962 *
963 * Find size of submatrix in this proc at this level
964 *
965  IF( mycol / npstr.LT.npact-1 ) THEN
966  bn = bw
967  ELSE
968  bn = min( bw, numroc( n, nb, npcol-1, 0, npcol ) )
969  END IF
970 *
971 * If this processor is even in this level...
972 *
973  IF( mod( mycol, 2*npstr ).EQ.0 ) THEN
974 *
975  neicol = mycol + npstr
976 *
977  IF( neicol / npstr.LE.npact-1 ) THEN
978 *
979  IF( neicol / npstr.LT.npact-1 ) THEN
980  bmn = bw
981  bnn = bw
982  ELSE
983  bmn = min( bw, numroc( n, nb, neicol, 0, npcol ) ) + bwu
984  bnn = min( bw, numroc( n, nb, neicol, 0, npcol ) )
985  END IF
986 *
987  IF( npact.GT.2 ) THEN
988 *
989  CALL sgesd2d( ictxt, 2*bw, nrhs, work( ln+1 ), ldw, 0,
990  $ neicol )
991 *
992  CALL sgerv2d( ictxt, bw, nrhs, work( ln+1 ), ldw, 0,
993  $ neicol )
994 *
995  ELSE
996 *
997  CALL sgerv2d( ictxt, bw, nrhs, work( ln+1 ), ldw, 0,
998  $ neicol )
999 *
1000  END IF
1001 *
1002  END IF
1003 *
1004  ELSE
1005 * This processor is odd on this level
1006 *
1007  neicol = mycol - npstr
1008 *
1009  IF( neicol.EQ.0 ) THEN
1010  bmn = bw - bwu
1011  ELSE
1012  bmn = bw
1013  END IF
1014 *
1015  IF( neicol.LT.npcol-1 ) THEN
1016  bnn = bw
1017  ELSE
1018  bnn = min( bw, numroc( n, nb, neicol, 0, npcol ) )
1019  END IF
1020 *
1021  IF( npact.GT.2 ) THEN
1022 *
1023 * Move RHS to make room for received solutions
1024 *
1025  CALL slamov( 'G', bw, nrhs, work( nb+bwu+1 ), ldw,
1026  $ work( nb+bwu+bw+1 ), ldw )
1027 *
1028  CALL sgerv2d( ictxt, 2*bw, nrhs, work( ln+1 ), ldw, 0,
1029  $ neicol )
1030 *
1031  CALL sgemm( 'N', 'N', bw, nrhs, bn, -one, af( bbptr ), ldbb,
1032  $ work( ln+1 ), ldw, one, work( nb+bwu+bw+1 ),
1033  $ ldw )
1034 *
1035 *
1036  IF( mycol.GT.npstr ) THEN
1037 *
1038  CALL sgemm( 'N', 'N', bw, nrhs, bw, -one,
1039  $ af( bbptr+2*bw*ldbb ), ldbb, work( ln+bw+1 ),
1040  $ ldw, one, work( nb+bwu+bw+1 ), ldw )
1041 *
1042  END IF
1043 *
1044  CALL strsm( 'L', 'U', 'N', 'N', bw, nrhs, one,
1045  $ af( bbptr+bw*ldbb ), ldbb, work( nb+bwu+bw+1 ),
1046  $ ldw )
1047 *
1048 * Send new solution to neighbor
1049 *
1050  CALL sgesd2d( ictxt, bw, nrhs, work( nb+bwu+bw+1 ), ldw, 0,
1051  $ neicol )
1052 *
1053 * Copy new solution into expected place
1054 *
1055  CALL slamov( 'G', bw, nrhs, work( nb+bwu+1+bw ), ldw,
1056  $ work( ln+bw+1 ), ldw )
1057 *
1058  ELSE
1059 *
1060 * Solve with local diagonal block
1061 *
1062  CALL strsm( 'L', 'U', 'N', 'N', bn+bnn, nrhs, one,
1063  $ af( bbptr+bw*ldbb ), ldbb, work( nb+bwu+1 ),
1064  $ ldw )
1065 *
1066 * Send new solution to neighbor
1067 *
1068  CALL sgesd2d( ictxt, bw, nrhs, work( nb+bwu+1 ), ldw, 0,
1069  $ neicol )
1070 *
1071 * Shift solutions into expected positions
1072 *
1073  CALL slamov( 'G', bnn+bn-bw, nrhs, work( nb+bwu+1+bw ), ldw,
1074  $ work( ln+1 ), ldw )
1075 *
1076 *
1077  IF( ( nb+bwu+1 ).NE.( ln+1+bw ) ) THEN
1078 *
1079 * Copy one row at a time since spaces may overlap
1080 *
1081  DO 70 j = 1, bw
1082  CALL scopy( nrhs, work( nb+bwu+j ), ldw,
1083  $ work( ln+bw+j ), ldw )
1084  70 CONTINUE
1085 *
1086  END IF
1087 *
1088  END IF
1089 *
1090  END IF
1091 *
1092  GO TO 60
1093 *
1094  80 CONTINUE
1095 * End of loop over levels
1096 *
1097 ********************************************************************
1098 * PHASE 1: (Almost) Local computation phase -- Solve U*Y = X
1099 ********************************************************************
1100 *
1101 * Reset BM to value it had before reduced system frontsolve...
1102 *
1103  IF( mycol.NE.npcol-1 ) THEN
1104  bm = bw - lbwu
1105  ELSE
1106  bm = min( bw, odd_size ) + bwu
1107  END IF
1108 *
1109 * First metastep is to account for the fillin blocks AF
1110 *
1111  IF( mycol.LT.npcol-1 ) THEN
1112 *
1113  CALL sgesd2d( ictxt, bw, nrhs, work( nb-bw+1 ), ldw, 0,
1114  $ mycol+1 )
1115 *
1116  END IF
1117 *
1118  IF( mycol.GT.0 ) THEN
1119 *
1120  CALL sgerv2d( ictxt, bw, nrhs, work( nb+bwu+1 ), ldw, 0,
1121  $ mycol-1 )
1122 *
1123 * Modify local right hand sides with received rhs's
1124 *
1125  CALL sgemm( 'T', 'N', lm-bm, nrhs, bw, -one, af( 1 ), bw,
1126  $ work( nb+bwu+1 ), ldw, one, work( 1 ), ldw )
1127 *
1128  END IF
1129 *
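* Backsubstitute with the local upper triangular factor: subtract
* the contributions of already-computed unknowns (SGEMV) and divide
* by the diagonal element (SSCAL).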
1130  DO 90 j = ln, 1, -1
1131 *
1132  lmj = min( bw, odd_size-1 )
1133 *
1134  lptr = bw - 1 + j*llda + aptr
1135 *
1136 * In the following, the TRANS=T option is used to reverse
1137 * the order of multiplication, not as a true transpose
1138 *
1139  CALL sgemv( 'T', lmj, nrhs, -one, work( j+1 ), ldw, a( lptr ),
1140  $ llda-1, one, work( j ), ldw )
1141 *
1142 * Divide by diagonal element
1143 *
1144  CALL sscal( nrhs, one / a( lptr-llda+1 ), work( j ), ldw )
1145  90 CONTINUE
1146 *
1147 *
1148 *
1149  CALL slamov( 'G', odd_size, nrhs, work( 1 ), ldw, b( 1 ), lldb )
1150 *
1151 * Free BLACS space used to hold standard-form grid.
1152 *
1153  ictxt = ictxt_save
1154  IF( ictxt.NE.ictxt_new ) THEN
1155  CALL blacs_gridexit( ictxt_new )
1156  END IF
1157 *
1158  100 CONTINUE
1159 *
1160 * Restore saved input parameters
1161 *
1162  np = np_save
1163 *
1164 * Output worksize
1165 *
1166  work( 1 ) = work_size_min
1167 *
1168  RETURN
1169 *
1170 * End of PSGBTRS
1171 *
1172  END