ScaLAPACK: Scalable Linear Algebra PACKage
pzpbtrf.f
1  SUBROUTINE pzpbtrf( UPLO, N, BW, A, JA, DESCA, AF, LAF, WORK,
2  $ LWORK, INFO )
3 *
4 * -- ScaLAPACK routine (version 2.0.2) --
5 * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver
6 * May 1 2012
7 *
8 * .. Scalar Arguments ..
9  CHARACTER UPLO
10  INTEGER BW, INFO, JA, LAF, LWORK, N
11 * ..
12 * .. Array Arguments ..
13  INTEGER DESCA( * )
14  COMPLEX*16 A( * ), AF( * ), WORK( * )
15 * ..
16 *
17 *
18 * Purpose
19 * =======
20 *
21 * PZPBTRF computes a Cholesky factorization
22 * of an N-by-N complex banded
23 * Hermitian positive definite distributed matrix
24 * with bandwidth BW: A(1:N, JA:JA+N-1).
25 * Reordering is used to increase parallelism in the factorization.
26 * This reordering results in factors that are DIFFERENT from those
27 * produced by equivalent sequential codes. These factors cannot
28 * be used directly by users; however, they can be used in
29 * subsequent calls to PZPBTRS to solve linear systems.
30 *
31 * The factorization has the form
32 *
33 * P A(1:N, JA:JA+N-1) P^T = U' U , if UPLO = 'U', or
34 *
35 * P A(1:N, JA:JA+N-1) P^T = L L', if UPLO = 'L'
36 *
37 * where U is a banded upper triangular matrix, L is banded lower
38 * triangular, P is a permutation matrix, and ' denotes conjugate transpose.
39 *
40 * =====================================================================
41 *
42 * Arguments
43 * =========
44 *
45 * UPLO (global input) CHARACTER
46 * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored;
47 * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored.
48 *
49 * N (global input) INTEGER
50 * The number of rows and columns to be operated on, i.e. the
51 * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
52 *
53 * BW (global input) INTEGER
54 * Number of subdiagonals of L (or superdiagonals of U). 0 <= BW <= N-1
55 *
56 * A (local input/local output) COMPLEX*16 pointer into
57 * local memory to an array with first dimension
58 * LLD_A >=(bw+1) (stored in DESCA).
59 * On entry, this array contains the local pieces of the
60 * N-by-N Hermitian banded distributed matrix
61 * A(1:N, JA:JA+N-1) to be factored.
62 * This local portion is stored in the packed banded format
63 * used in LAPACK. Please see the Notes below and the
64 * ScaLAPACK manual for more detail on the format of
65 * distributed matrices.
66 * On exit, this array contains information containing details
67 * of the factorization.
68 * Note that permutations are performed on the matrix, so that
69 * the factors returned are different from those returned
70 * by LAPACK.
71 *
72 * JA (global input) INTEGER
73 * The index in the global array A that points to the start of
74 * the matrix to be operated on (which may be either all of A
75 * or a submatrix of A).
76 *
77 * DESCA (global and local input) INTEGER array of dimension DLEN.
78 * if 1D type (DTYPE_A=501), DLEN >= 7;
79 * if 2D type (DTYPE_A=1), DLEN >= 9 .
80 * The array descriptor for the distributed matrix A.
81 * Contains information of mapping of A to memory. Please
82 * see NOTES below for full description and options.
83 *
84 * AF (local output) COMPLEX*16 array, dimension LAF.
85 * Auxiliary Fillin Space.
86 * Fillin is created during the factorization routine
87 * PZPBTRF and this is stored in AF. If a linear system
88 * is to be solved using PZPBTRS after the factorization
89 * routine, AF *must not be altered* after the factorization.
90 *
91 * LAF (local input) INTEGER
92 * Size of user-input Auxiliary Fillin space AF. Must be >=
93 * (NB+2*bw)*bw
94 * If LAF is not large enough, an error code will be returned
95 * and the minimum acceptable size will be returned in AF( 1 )
96 *
97 * WORK (local workspace/local output)
98 * COMPLEX*16 temporary workspace. This space may
99 * be overwritten in between calls to routines. WORK must be
100 * the size given in LWORK.
101 * On exit, WORK( 1 ) contains the minimal LWORK.
102 *
103 * LWORK (local input or global input) INTEGER
104 * Size of user-input workspace WORK.
105 * If LWORK is too small, the minimal acceptable size will be
106 * returned in WORK(1) and an error code is returned. LWORK>=
107 * bw*bw
108 *
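* (Editor's illustration, not part of the original interface text)
* Minimum sizes under assumed values NB = 64 and BW = 8:
* LAF   >= (NB+2*BW)*BW = (64+16)*8 = 640
* LWORK >= BW*BW        = 8*8       = 64
*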
109 * INFO (global output) INTEGER
110 * = 0: successful exit
111 * < 0: If the i-th argument is an array and the j-th entry had
112 * an illegal value, then INFO = -(i*100+j), if the i-th
113 * argument is a scalar and had an illegal value, then
114 * INFO = -i.
115 * > 0: If INFO = K<=NPROCS, the submatrix stored on processor
116 * INFO and factored locally was not
117 * positive definite, and
118 * the factorization was not completed.
119 * If INFO = K>NPROCS, the submatrix stored on processor
120 * INFO-NPROCS representing interactions with other
121 * processors was not
122 * positive definite,
123 * and the factorization was not completed.
124 *
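* (Editor's illustration) Decoding a negative INFO: the checks below
* report, for example, INFO = -(6*100+4) = -604 when entry 4 of
* argument 6 (NB_A in DESCA) is invalid, and INFO = -10 when the
* scalar argument LWORK is invalid.
*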
125 * =====================================================================
126 *
127 *
128 * Restrictions
129 * ============
130 *
131 * The following are restrictions on the input parameters. Some of these
132 * are temporary and will be removed in future releases, while others
133 * may reflect fundamental technical limitations.
134 *
135 * Non-cyclic restriction: VERY IMPORTANT!
136 * P*NB >= mod(JA-1,NB)+N.
137 * The mapping for matrices must be blocked, reflecting the nature
138 * of the divide and conquer algorithm as a task-parallel algorithm.
139 * This formula in words is: no processor may have more than one
140 * chunk of the matrix.
141 *
142 * Blocksize cannot be too small:
143 * If the matrix spans more than one processor, the following
144 * restriction on NB, the size of each block on each processor,
145 * must hold:
146 * NB >= 2*BW
147 * The bulk of parallel computation is done on the matrix of size
148 * O(NB) on each processor. If this is too small, divide and conquer
149 * is a poor choice of algorithm.
150 *
151 * Submatrix reference:
152 * JA = IB
153 * Alignment restriction that prevents unnecessary communication.
154 *
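* (Editor's illustration of the restrictions above; the numbers are
* assumptions for the example only.) With N = 1000, BW = 8, P = 4
* processes and JA = 1:
* P*NB >= mod(JA-1,NB)+N requires 4*NB >= 1000, i.e. NB >= 250;
* NB >= 2*BW requires NB >= 16;
* so any NB >= 250, e.g. NB = 256, satisfies both restrictions.
*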
155 *
156 * =====================================================================
157 *
158 *
159 * Notes
160 * =====
161 *
162 * If the factorization routine and the solve routine are to be called
163 * separately (to solve various sets of right-hand sides using the same
164 * coefficient matrix), the auxiliary space AF *must not be altered*
165 * between calls to the factorization routine and the solve routine.
166 *
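* A minimal factor/solve calling sketch (editor's illustration; the
* PZPBTRS argument list shown here is assumed from the ScaLAPACK
* reference and should be checked against pzpbtrs.f, and A, B, the
* descriptors, AF, LAF, WORK and LWORK are assumed to be set up as
* described in this header):
*
* CALL PZPBTRF( 'L', N, BW, A, JA, DESCA, AF, LAF, WORK,
* $ LWORK, INFO )
* IF( INFO.EQ.0 ) THEN
* CALL PZPBTRS( 'L', N, BW, NRHS, A, JA, DESCA, B, IB,
* $ DESCB, AF, LAF, WORK, LWORK, INFO )
* END IF
*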
167 * The best algorithm for solving banded and tridiagonal linear systems
168 * depends on a variety of parameters, especially the bandwidth.
169 * Currently, only algorithms designed for the case N/P >> bw are
170 * implemented. These go by many names, including Divide and Conquer,
171 * Partitioning, domain decomposition-type, etc.
172 *
173 * Algorithm description: Divide and Conquer
174 *
175 * The Divide and Conquer algorithm assumes the matrix is narrowly
176 * banded compared with the number of equations. In this situation,
177 * it is best to distribute the input matrix A one-dimensionally,
178 * with columns atomic and rows divided amongst the processes.
179 * The basic algorithm divides the banded matrix up into
180 * P pieces with one stored on each processor,
181 * and then proceeds in 2 phases for the factorization or 3 for the
182 * solution of a linear system.
183 * 1) Local Phase:
184 * The individual pieces are factored independently and in
185 * parallel. These factors are applied to the matrix creating
186 * fillin, which is stored in a non-inspectable way in auxiliary
187 * space AF. Mathematically, this is equivalent to reordering
188 * the matrix A as P A P^T and then factoring the principal
189 * leading submatrix of size equal to the sum of the sizes of
190 * the matrices factored on each processor. The factors of
191 * these submatrices overwrite the corresponding parts of A
192 * in memory.
193 * 2) Reduced System Phase:
194 * A small (BW* (P-1)) system is formed representing
195 * interaction of the larger blocks, and is stored (as are its
196 * factors) in the space AF. A parallel Block Cyclic Reduction
197 * algorithm is used. For a linear system, a parallel front solve
198 * followed by an analogous backsolve, both using the structure
199 * of the factored matrix, are performed.
200 * 3) Backsubstitution Phase:
201 * For a linear system, a local backsubstitution is performed on
202 * each processor in parallel.
203 *
204 *
205 * Descriptors
206 * ===========
207 *
208 * Descriptors now have *types* and differ from ScaLAPACK 1.0.
209 *
210 * Note: banded codes can use either the old two dimensional
211 * or new one-dimensional descriptors, though the processor grid in
212 * both cases *must be one-dimensional*. We describe both types below.
213 *
214 * Each global data object is described by an associated description
215 * vector. This vector stores the information required to establish
216 * the mapping between an object element and its corresponding process
217 * and memory location.
218 *
219 * Let A be a generic term for any 2D block cyclically distributed array.
220 * Such a global array has an associated description vector DESCA.
221 * In the following comments, the character _ should be read as
222 * "of the global array".
223 *
224 * NOTATION STORED IN EXPLANATION
225 * --------------- -------------- --------------------------------------
226 * DTYPE_A(global) DESCA( DTYPE_ ) The descriptor type. In this case,
227 * DTYPE_A = 1.
228 * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
229 * the BLACS process grid A is distribu-
230 * ted over. The context itself is glo-
231 * bal, but the handle (the integer
232 * value) may vary.
233 * M_A (global) DESCA( M_ ) The number of rows in the global
234 * array A.
235 * N_A (global) DESCA( N_ ) The number of columns in the global
236 * array A.
237 * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute
238 * the rows of the array.
239 * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute
240 * the columns of the array.
241 * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
242 * row of the array A is distributed.
243 * CSRC_A (global) DESCA( CSRC_ ) The process column over which the
244 * first column of the array A is
245 * distributed.
246 * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local
247 * array. LLD_A >= MAX(1,LOCr(M_A)).
248 *
249 * Let K be the number of rows or columns of a distributed matrix,
250 * and assume that its process grid has dimension p x q.
251 * LOCr( K ) denotes the number of elements of K that a process
252 * would receive if K were distributed over the p processes of its
253 * process column.
254 * Similarly, LOCc( K ) denotes the number of elements of K that a
255 * process would receive if K were distributed over the q processes of
256 * its process row.
257 * The values of LOCr() and LOCc() may be determined via a call to the
258 * ScaLAPACK tool function, NUMROC:
259 * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
260 * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
261 * An upper bound for these quantities may be computed by:
262 * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
263 * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
264 *
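* (Editor's illustration) MYROW and MYCOL above come from the BLACS,
* and LOCC_N is a local INTEGER introduced only for this example:
* CALL BLACS_GRIDINFO( DESCA( CTXT_ ), NPROW, NPCOL, MYROW, MYCOL )
* LOCC_N = NUMROC( N, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), NPCOL )
*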
265 *
266 * One-dimensional descriptors:
267 *
268 * One-dimensional descriptors are a new addition to ScaLAPACK since
269 * version 1.0. They simplify and shorten the descriptor for 1D
270 * arrays.
271 *
272 * Since ScaLAPACK supports two-dimensional arrays as the fundamental
273 * object, we allow 1D arrays to be distributed either over the
274 * first dimension of the array (as if the grid were P-by-1) or the
275 * 2nd dimension (as if the grid were 1-by-P). This choice is
276 * indicated by the descriptor type (501 or 502)
277 * as described below.
278 *
279 * IMPORTANT NOTE: the actual BLACS grid represented by the
280 * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P
281 * irrespective of which one-dimensional descriptor type
282 * (501 or 502) is input.
283 * This routine will interpret the grid properly either way.
284 * ScaLAPACK routines *do not support intercontext operations* so that
285 * the grid passed to a single ScaLAPACK routine *must be the same*
286 * for all array descriptors passed to that routine.
287 *
288 * NOTE: In all cases where 1D descriptors are used, 2D descriptors
289 * may also be used, since a one-dimensional array is a special case
290 * of a two-dimensional array with one dimension of size unity.
291 * The two-dimensional array used in this case *must* be of the
292 * proper orientation:
293 * If the appropriate one-dimensional descriptor is DTYPE_A=501
294 * (1 by P type), then the two dimensional descriptor must
295 * have a CTXT value that refers to a 1 by P BLACS grid;
296 * If the appropriate one-dimensional descriptor is DTYPE_A=502
297 * (P by 1 type), then the two dimensional descriptor must
298 * have a CTXT value that refers to a P by 1 BLACS grid.
299 *
300 *
301 * Summary of allowed descriptors, types, and BLACS grids:
302 * DTYPE        501         502         1         1
303 * BLACS grid   1xP or Px1  1xP or Px1  1xP       Px1
304 * -----------------------------------------------------
305 * A            OK          NO          OK        NO
306 * B            NO          OK          NO        OK
307 *
308 * Let A be a generic term for any 1D block cyclically distributed array.
309 * Such a global array has an associated description vector DESCA.
310 * In the following comments, the character _ should be read as
311 * "of the global array".
312 *
313 * NOTATION STORED IN EXPLANATION
314 * --------------- ---------- ------------------------------------------
315 * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
316 * TYPE_A = 501: 1-by-P grid.
317 * TYPE_A = 502: P-by-1 grid.
318 * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
319 * the BLACS process grid A is distribu-
320 * ted over. The context itself is glo-
321 * bal, but the handle (the integer
322 * value) may vary.
323 * N_A (global) DESCA( 3 ) The size of the array dimension being
324 * distributed.
325 * NB_A (global) DESCA( 4 ) The blocking factor used to distribute
326 * the distributed dimension of the array.
327 * SRC_A (global) DESCA( 5 ) The process row or column over which the
328 * first row or column of the array
329 * is distributed.
330 * LLD_A (local) DESCA( 6 ) The leading dimension of the local array
331 * storing the local blocks of the distri-
332 * buted array A. Minimum value of LLD_A
333 * depends on TYPE_A.
334 * TYPE_A = 501: LLD_A >=
335 * max( size of undistributed dimension, 1 ).
336 * TYPE_A = 502: LLD_A >= max( NB_A, 1 ).
337 * Reserved DESCA( 7 ) Reserved for future use.
338 *
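* (Editor's illustration) A typical type-501 descriptor for the
* banded matrix A of this routine, assuming the caller has chosen
* ICTXT, N, NB and an LLDA >= BW+1 as required by argument A above:
*
* DESCA( 1 ) = 501
* DESCA( 2 ) = ICTXT
* DESCA( 3 ) = N
* DESCA( 4 ) = NB
* DESCA( 5 ) = 0
* DESCA( 6 ) = LLDA
* DESCA( 7 ) = 0
*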
339 *
340 *
341 * =====================================================================
342 *
343 * Code Developer: Andrew J. Cleary, University of Tennessee.
344 * Current address: Lawrence Livermore National Labs.
345 * This version released: August, 2001.
346 *
347 * =====================================================================
348 *
349 * ..
350 * .. Parameters ..
351  DOUBLE PRECISION ONE, ZERO
352  parameter( one = 1.0d+0 )
353  parameter( zero = 0.0d+0 )
354  COMPLEX*16 CONE, CZERO
355  parameter( cone = ( 1.0d+0, 0.0d+0 ) )
356  parameter( czero = ( 0.0d+0, 0.0d+0 ) )
357  INTEGER INT_ONE
358  parameter( int_one = 1 )
359  INTEGER DESCMULT, BIGNUM
360  parameter(descmult = 100, bignum = descmult * descmult)
361  INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
362  $ lld_, mb_, m_, nb_, n_, rsrc_
363  parameter( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,
364  $ ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,
365  $ rsrc_ = 7, csrc_ = 8, lld_ = 9 )
366 * ..
367 * .. Local Scalars ..
368  INTEGER COMM_PROC, CSRC, FIRST_PROC, I, ICTXT,
369  $ ictxt_new, ictxt_save, idum1, idum3, ja_new,
370  $ laf_min, level_dist, llda, mbw2, mycol, myrow,
371  $ my_num_cols, nb, next_tri_size_m,
372  $ next_tri_size_n, np, npcol, nprow, np_save,
373  $ odd_size, ofst, part_offset, part_size,
374  $ prev_tri_size_m, prev_tri_size_n, return_code,
375  $ store_n_a, work_size_min
376 * ..
377 * .. Local Arrays ..
378  INTEGER DESCA_1XP( 7 ), PARAM_CHECK( 9, 3 )
379 * ..
380 * .. External Subroutines ..
381  EXTERNAL blacs_get, blacs_gridexit, blacs_gridinfo,
382  $ desc_convert, globchk, pxerbla, reshape, zaxpy,
383  $ zgemm, zgerv2d, zgesd2d, zlamov, zlatcpy,
384  $ zpbtrf, zpotrf, zherk, ztbtrs, ztrmm, ztrrv2d,
385  $ ztrsd2d, ztrsm, ztrtrs
386 * ..
387 * .. External Functions ..
388  LOGICAL LSAME
389  INTEGER NUMROC
390  EXTERNAL lsame, numroc
391 * ..
392 * .. Intrinsic Functions ..
393  INTRINSIC ichar, min, mod
394 * ..
395 * .. Executable Statements ..
396 *
397 * Test the input parameters
398 *
399  info = 0
400 *
401 * Convert descriptor into standard form for easy access to
402 * parameters, check that grid is of right shape.
403 *
404  desca_1xp( 1 ) = 501
405 *
406  CALL desc_convert( desca, desca_1xp, return_code )
407 *
408  IF( return_code .NE. 0) THEN
409  info = -( 6*100 + 2 )
410  ENDIF
411 *
412 * Get values out of descriptor for use in code.
413 *
414  ictxt = desca_1xp( 2 )
415  csrc = desca_1xp( 5 )
416  nb = desca_1xp( 4 )
417  llda = desca_1xp( 6 )
418  store_n_a = desca_1xp( 3 )
419 *
420 * Get grid parameters
421 *
422 *
423 * Pre-calculate bw^2
424 *
425  mbw2 = bw * bw
426 *
427  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
428  np = nprow * npcol
429 *
430 *
431 *
432  IF( lsame( uplo, 'U' ) ) THEN
433  idum1 = ichar( 'U' )
434  ELSE IF ( lsame( uplo, 'L' ) ) THEN
435  idum1 = ichar( 'L' )
436  ELSE
437  info = -1
438  END IF
439 *
440  IF( lwork .LT. -1) THEN
441  info = -10
442  ELSE IF ( lwork .EQ. -1 ) THEN
443  idum3 = -1
444  ELSE
445  idum3 = 1
446  ENDIF
447 *
448  IF( n .LT. 0 ) THEN
449  info = -2
450  ENDIF
451 *
452  IF( n+ja-1 .GT. store_n_a ) THEN
453  info = -( 6*100 + 6 )
454  ENDIF
455 *
456  IF(( bw .GT. n-1 ) .OR.
457  $ ( bw .LT. 0 ) ) THEN
458  info = -3
459  ENDIF
460 *
461  IF( llda .LT. (bw+1) ) THEN
462  info = -( 6*100 + 6 )
463  ENDIF
464 *
465  IF( nb .LE. 0 ) THEN
466  info = -( 6*100 + 4 )
467  ENDIF
468 *
469 * Argument checking that is specific to Divide & Conquer routine
470 *
471  IF( nprow .NE. 1 ) THEN
472  info = -( 6*100+2 )
473  ENDIF
474 *
475  IF( n .GT. np*nb-mod( ja-1, nb )) THEN
476  info = -( 2 )
477  CALL pxerbla( ictxt,
478  $ 'PZPBTRF, D&C alg.: only 1 block per proc',
479  $ -info )
480  RETURN
481  ENDIF
482 *
483  IF((ja+n-1.GT.nb) .AND. ( nb.LT.2*bw )) THEN
484  info = -( 6*100+4 )
485  CALL pxerbla( ictxt,
486  $ 'PZPBTRF, D&C alg.: NB too small',
487  $ -info )
488  RETURN
489  ENDIF
490 *
491 *
492 * Check auxiliary storage size
493 *
494  laf_min = (nb+2*bw)*bw
495 *
496  IF( laf .LT. laf_min ) THEN
497  info = -8
498 * put minimum value of laf into AF( 1 )
499  af( 1 ) = laf_min
500  CALL pxerbla( ictxt,
501  $ 'PZPBTRF: auxiliary storage error ',
502  $ -info )
503  RETURN
504  ENDIF
505 *
506 * Check worksize
507 *
508  work_size_min = bw*bw
509 *
510  work( 1 ) = work_size_min
511 *
512  IF( lwork .LT. work_size_min ) THEN
513  IF( lwork .NE. -1 ) THEN
514  info = -10
515  CALL pxerbla( ictxt,
516  $ 'PZPBTRF: worksize error ',
517  $ -info )
518  ENDIF
519  RETURN
520  ENDIF
521 *
522 * Pack params and positions into arrays for global consistency check
523 *
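* (Editor's note) Column 1 holds the local values to compare and
* column 2 the corresponding argument/entry position codes (e.g.
* 604 denotes entry 4 of argument 6, i.e. NB_A in DESCA), so an
* inconsistent value can be flagged through INFO by GLOBCHK.
*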
524  param_check( 9, 1 ) = desca(5)
525  param_check( 8, 1 ) = desca(4)
526  param_check( 7, 1 ) = desca(3)
527  param_check( 6, 1 ) = desca(1)
528  param_check( 5, 1 ) = ja
529  param_check( 4, 1 ) = bw
530  param_check( 3, 1 ) = n
531  param_check( 2, 1 ) = idum3
532  param_check( 1, 1 ) = idum1
533 *
534  param_check( 9, 2 ) = 605
535  param_check( 8, 2 ) = 604
536  param_check( 7, 2 ) = 603
537  param_check( 6, 2 ) = 601
538  param_check( 5, 2 ) = 5
539  param_check( 4, 2 ) = 3
540  param_check( 3, 2 ) = 2
541  param_check( 2, 2 ) = 10
542  param_check( 1, 2 ) = 1
543 *
544 * Want to find errors with MIN( ), so if no error, set it to a big
545 * number. If there already is an error, multiply by the
546 * descriptor multiplier.
547 *
548  IF( info.GE.0 ) THEN
549  info = bignum
550  ELSE IF( info.LT.-descmult ) THEN
551  info = -info
552  ELSE
553  info = -info * descmult
554  END IF
555 *
556 * Check consistency across processors
557 *
558  CALL globchk( ictxt, 9, param_check, 9,
559  $ param_check( 1, 3 ), info )
560 *
561 * Prepare output: set info = 0 if no error, and divide by DESCMULT
562 * if error is not in a descriptor entry.
563 *
564  IF( info.EQ.bignum ) THEN
565  info = 0
566  ELSE IF( mod( info, descmult ) .EQ. 0 ) THEN
567  info = -info / descmult
568  ELSE
569  info = -info
570  END IF
571 *
572  IF( info.LT.0 ) THEN
573  CALL pxerbla( ictxt, 'PZPBTRF', -info )
574  RETURN
575  END IF
576 *
577 * Quick return if possible
578 *
579  IF( n.EQ.0 )
580  $ RETURN
581 *
582 *
583 * Adjust addressing into matrix space to properly get into
584 * the beginning part of the relevant data
585 *
586  part_offset = nb*( (ja-1)/(npcol*nb) )
587 *
588  IF ( (mycol-csrc) .LT. (ja-part_offset-1)/nb ) THEN
589  part_offset = part_offset + nb
590  ENDIF
591 *
592  IF ( mycol .LT. csrc ) THEN
593  part_offset = part_offset - nb
594  ENDIF
595 *
596 * Form a new BLACS grid (the "standard form" grid) with only procs
597 * holding part of the matrix, of size 1xNP where NP is adjusted,
598 * starting at csrc=0, with JA modified to reflect dropped procs.
599 *
600 * First processor to hold part of the matrix:
601 *
602  first_proc = mod( ( ja-1 )/nb+csrc, npcol )
603 *
604 * Calculate the new JA while dropping off unused processors.
605 *
606  ja_new = mod( ja-1, nb ) + 1
607 *
608 * Save and compute new value of NP
609 *
610  np_save = np
611  np = ( ja_new+n-2 )/nb + 1
612 *
613 * Call utility routine that forms "standard-form" grid
614 *
615  CALL reshape( ictxt, int_one, ictxt_new, int_one,
616  $ first_proc, int_one, np )
617 *
618 * Use new context from standard grid as context.
619 *
620  ictxt_save = ictxt
621  ictxt = ictxt_new
622  desca_1xp( 2 ) = ictxt_new
623 *
624 * Get information about new grid.
625 *
626  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
627 *
628 * Drop out processors that do not have part of the matrix.
629 *
630  IF( myrow .LT. 0 ) THEN
631  GOTO 1234
632  ENDIF
633 *
634 * ********************************
635 * Values reused throughout routine
636 *
637 * User-input value of partition size
638 *
639  part_size = nb
640 *
641 * Number of columns in each processor
642 *
643  my_num_cols = numroc( n, part_size, mycol, 0, npcol )
644 *
645 * Offset in columns to beginning of main partition in each proc
646 *
647  IF ( mycol .EQ. 0 ) THEN
648  part_offset = part_offset+mod( ja_new-1, part_size )
649  my_num_cols = my_num_cols - mod(ja_new-1, part_size )
650  ENDIF
651 *
652 * Offset in elements
653 *
654  ofst = part_offset*llda
655 *
656 * Size of main (or odd) partition in each processor
657 *
658  odd_size = my_num_cols
659  IF ( mycol .LT. np-1 ) THEN
660  odd_size = odd_size - bw
661  ENDIF
662 *
663 *
664 * Zero out space for fillin
665 *
666  DO 10 i=1, laf_min
667  af( i ) = czero
668  10 CONTINUE
669 *
670 * Zero out space for work
671 *
672  DO 20 i=1, work_size_min
673  work( i ) = czero
674  20 CONTINUE
675 *
676 * Begin main code
677 *
678  IF ( lsame( uplo, 'L' ) ) THEN
679 *
680 ********************************************************************
681 * PHASE 1: Local computation phase.
682 ********************************************************************
683 *
684 *
685 * Sizes of the extra triangles communicated between processors
686 *
687  IF( mycol .GT. 0 ) THEN
688  prev_tri_size_m= min( bw,
689  $ numroc( n, part_size, mycol, 0, npcol ) )
690  prev_tri_size_n=min( bw,
691  $ numroc( n, part_size, mycol-1, 0, npcol ) )
692  ENDIF
693 *
694  IF( mycol .LT. npcol-1 ) THEN
695  next_tri_size_m=min( bw,
696  $ numroc( n, part_size, mycol+1, 0, npcol ) )
697  next_tri_size_n= min( bw,
698  $ numroc( n, part_size, mycol, 0, npcol ) )
699  ENDIF
700 *
701  IF ( mycol .LT. np-1 ) THEN
702 * Transfer last triangle D_i of local matrix to next processor
703 * which needs it to calculate fillin due to factorization of
704 * its main (odd) block A_i.
705 * Overlap the send with the factorization of A_i.
706 *
707  CALL ztrsd2d( ictxt, 'U', 'N', next_tri_size_m,
708  $ next_tri_size_n, a( ofst+odd_size*llda+(bw+1) ),
709  $ llda-1, 0, mycol+1 )
710 *
711  ENDIF
712 *
713 *
714 * Factor main partition A_i = L_i {L_i}^C in each processor
715 *
716  CALL zpbtrf( uplo, odd_size, bw, a( ofst + 1),
717  $ llda, info )
718 *
719  IF( info.NE.0 ) THEN
720  info = mycol+1
721  GOTO 1500
722  ENDIF
723 *
724 *
725  IF ( mycol .LT. np-1 ) THEN
726 * Apply factorization to odd-even connection block B_i
727 *
728 * conjugate transpose the connection block in preparation.
729 *
730  CALL zlatcpy( 'U', bw, bw,
731  $ a(( ofst+(bw+1)+(odd_size-bw)*llda )), llda-1,
732  $ af( odd_size*bw+2*mbw2+1+bw-bw ), bw )
733 *
734 * Perform the triangular system solve {L_i}{{B'}_i}^C = {B_i}^C
735 *
736  CALL ztrtrs( 'L', 'N', 'N', bw, bw,
737  $ a( ofst+1+(odd_size-bw)*llda ), llda-1,
738  $ af( odd_size*bw+2*mbw2+1 ), bw, info )
739 *
740 *
741 * conjugate transpose resulting block to its location
742 * in main storage.
743 *
744  CALL zlatcpy( 'L', bw, bw, af( odd_size*bw+2*mbw2+1+bw-bw ),
745  $ bw, a(( ofst+(bw+1)+(odd_size-bw)*llda )),
746  $ llda-1 )
747 *
748 *
749 * Compute contribution to diagonal block(s) of reduced system.
750 * {C'}_i = {C_i}-{{B'}_i}{{B'}_i}^C
751 *
752 * The following method uses more flops than necessary but
753 * does not necessitate the writing of a new BLAS routine.
754 *
755 *
756  CALL zherk( uplo, 'C', bw, bw, -one,
757  $ af( odd_size*bw+2*mbw2+1 ), bw, one,
758  $ a( ofst+1+odd_size*llda ), llda-1 )
759 *
760  ENDIF
761 * End of "if ( MYCOL .lt. NP-1 )..." loop
762 *
763 *
764  1500 CONTINUE
765 * If the processor could not locally factor, it jumps here.
766 *
767  IF ( mycol .NE. 0 ) THEN
768 * Discard temporary matrix stored beginning in
769 * AF( (odd_size+2*bw)*bw+1 ) and use for
770 * off_diagonal block of reduced system.
771 *
772 * Receive previously transmitted matrix section, which forms
773 * the right-hand-side for the triangular solve that calculates
774 * the "spike" fillin.
775 *
776 *
777  CALL ztrrv2d( ictxt, 'U', 'N', prev_tri_size_m,
778  $ prev_tri_size_n, af( 1 ), odd_size, 0,
779  $ mycol-1 )
780 *
781  IF (info.EQ.0) THEN
782 *
783 * Calculate the "spike" fillin, ${L_i} {{G}_i}^C = {D_i}$ .
784 *
785  CALL ztbtrs( 'L', 'N', 'N', odd_size, bw, bw, a( ofst + 1 ),
786  $ llda, af( 1 ), odd_size, info )
787 *
788 *
789 * Calculate the update block for previous proc, E_i = G_i{G_i}^C
790 *
791  CALL zherk( 'L', 'C', bw, odd_size,
792  $ -one, af( 1 ), odd_size, zero,
793  $ af( 1 + (odd_size+2*bw)*bw), bw )
794 *
795 *
796 * Initiate send of E_i to previous processor to overlap
797 * with next computation.
798 *
799  CALL zgesd2d( ictxt, bw, bw, af( odd_size*bw+2*mbw2+1 ), bw,
800  $ 0, mycol-1 )
801 *
802 *
803  IF ( mycol .LT. np-1 ) THEN
804 *
805 * Calculate off-diagonal block(s) of reduced system.
806 * Note: for ease of use in solution of reduced system, store
807 * L's off-diagonal block in conjugate transpose form.
808 * {F_i}^C = {H_i}{{B'}_i}^C
809 *
810 * Copy matrix H_i (the last bw cols of G_i) to AF storage
811 * as per requirements of BLAS routine ZTRMM.
812 * Since we have G_i^C stored, conjugate transpose
813 * H_i^C to H_i.
814 *
815  CALL zlatcpy( 'N', bw, bw,
816  $ af( odd_size-bw+1 ), odd_size,
817  $ af( (odd_size)*bw+1), bw )
818 *
819  CALL ztrmm( 'R', 'U', 'C', 'N', bw, bw, -cone,
820  $ a( ( ofst+(bw+1)+(odd_size-bw)*llda ) ), llda-1,
821  $ af( (odd_size)*bw+1 ), bw )
822 *
823 *
824  ENDIF
825 *
826  ENDIF
827 * End of "if ( info.eq.0 ) then"
828 *
829  ENDIF
830 * End of "if ( MYCOL .ne. 0 )..."
831 *
832 *
833 * Check to make sure no processors have found errors
834 *
835  CALL igamx2d( ictxt, 'A', ' ', 1, 1, info, 1, info, info,
836  $ -1, 0, 0 )
837 *
838  IF( mycol.EQ.0 ) THEN
839  CALL igebs2d( ictxt, 'A', ' ', 1, 1, info, 1 )
840  ELSE
841  CALL igebr2d( ictxt, 'A', ' ', 1, 1, info, 1, 0, 0 )
842  ENDIF
843 *
844  IF ( info.NE.0 ) THEN
845  GOTO 1000
846  ENDIF
847 * No errors found, continue
848 *
849 *
850 ********************************************************************
851 * PHASE 2: Formation and factorization of Reduced System.
852 ********************************************************************
853 *
854 * Gather up local sections of reduced system
855 *
856 *
857 * The last processor does not participate in the factorization of
858 * the reduced system, having sent its E_i already.
859  IF( mycol .EQ. npcol-1 ) THEN
860  GOTO 14
861  ENDIF
862 *
863 * Initiate send of off-diag block(s) to overlap with next part.
864 * Off-diagonal block needed on neighboring processor to start
865 * algorithm.
866 *
867  IF( (mod( mycol+1, 2 ) .EQ. 0) .AND. ( mycol .GT. 0 ) ) THEN
868 *
869  CALL zgesd2d( ictxt, bw, bw,
870  $ af( odd_size*bw+1 ),
871  $ bw, 0, mycol-1 )
872 *
873  ENDIF
874 *
875 * Copy last diagonal block into AF storage for subsequent
876 * operations.
877 *
878  CALL zlamov( 'N', bw, bw,
879  $ a( ofst+odd_size*llda+1 ),
880  $ llda-1, af( odd_size*bw+mbw2+1 ),
881  $ bw )
882 *
883 * Receive contribution to the diagonal block that is stored on this proc.
884 *
885  IF( mycol.LT. npcol-1 ) THEN
886 *
887  CALL zgerv2d( ictxt, bw, bw,
888  $ af( odd_size*bw+2*mbw2+1 ),
889  $ bw, 0,
890  $ mycol+1 )
891 *
892 * Add contribution to diagonal block
893 *
894  CALL zaxpy( mbw2, cone,
895  $ af( odd_size*bw+2*mbw2+1 ),
896  $ 1, af( odd_size*bw+mbw2+1 ), 1 )
897 *
898  ENDIF
899 *
900 *
901 * *************************************
902 * Modification Loop
903 *
904 * The distance for sending and receiving for each level starts
905 * at 1 for the first level.
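* (Editor's note) The loop below is the accumulation half of the
* block cyclic reduction: with LEVEL_DIST = 1 the process columns
* with even MYCOL exit immediately and factor their reduced-system
* block, while the others double LEVEL_DIST and keep receiving
* contributions until it is their turn.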
906  level_dist = 1
907 *
908 * Do until this proc is needed to modify other procs' equations
909 *
910  12 CONTINUE
911  IF( mod( (mycol+1)/level_dist, 2) .NE. 0 ) GOTO 11
912 *
913 * Receive and add contribution to diagonal block from the left
914 *
915  IF( mycol-level_dist .GE. 0 ) THEN
916  CALL zgerv2d( ictxt, bw, bw, work( 1 ),
917  $ bw, 0, mycol-level_dist )
918 *
919  CALL zaxpy( mbw2, cone, work( 1 ), 1,
920  $ af( odd_size*bw+mbw2+1 ), 1 )
921 *
922  ENDIF
923 *
924 * Receive and add contribution to diagonal block from the right
925 *
926  IF( mycol+level_dist .LT. npcol-1 ) THEN
927  CALL zgerv2d( ictxt, bw, bw, work( 1 ),
928  $ bw, 0, mycol+level_dist )
929 *
930  CALL zaxpy( mbw2, cone, work( 1 ), 1,
931  $ af( odd_size*bw+mbw2+1 ), 1 )
932 *
933  ENDIF
934 *
935  level_dist = level_dist*2
936 *
937  GOTO 12
938  11 CONTINUE
939 * [End of GOTO Loop]
940 *
941 *
942 * *********************************
943 * Calculate and use this proc's blocks to modify other procs'...
944 *
945 * Factor diagonal block
946 *
947  CALL zpotrf( 'L', bw, af( odd_size*bw+mbw2+1 ),
948  $ bw, info )
949 *
950  IF( info.NE.0 ) THEN
951  info = npcol + mycol
952  ENDIF
953 *
954 * ****************************************************************
955 * Receive offdiagonal block from processor to right.
956 * If this is the first group of processors, the receive comes
957 * from a different processor than otherwise.
958 *
959  IF( level_dist .EQ. 1 )THEN
960  comm_proc = mycol + 1
961 *
962 * Move block into the place where it will be expected for
963 * later calculations.
964 *
965  CALL zlamov( 'N', bw, bw, af( odd_size*bw+1 ), bw,
966  $ af( odd_size*bw+2*mbw2+1 ), bw )
967 *
968  ELSE
969  comm_proc = mycol + level_dist/2
970  ENDIF
971 *
972  IF( mycol/level_dist .LE. (npcol-1)/level_dist-2 )THEN
973 *
974  CALL zgerv2d( ictxt, bw, bw,
975  $ af( odd_size*bw+1 ),
976  $ bw, 0, comm_proc )
977 *
978  IF( info .EQ. 0 ) THEN
979 *
980 *
981 * Modify upper off_diagonal block with diagonal block
982 *
983 *
984  CALL ztrsm( 'L', 'L', 'N', 'N', bw, bw, cone,
985  $ af( odd_size*bw+mbw2+1 ), bw,
986  $ af( odd_size*bw+1 ), bw )
987 *
988  ENDIF
989 * End of "if ( info.eq.0 ) then"
990 *
991 * Calculate contribution from this block to next diagonal block
992 *
993  CALL zherk( 'L', 'C', bw, bw, -one,
994  $ af( (odd_size)*bw+1 ), bw, zero,
995  $ work( 1 ), bw )
996 *
997 * Send contribution to diagonal block's owning processor.
998 *
999  CALL zgesd2d( ictxt, bw, bw, work( 1 ), bw,
1000  $ 0, mycol+level_dist )
1001 *
1002  ENDIF
1003 * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
1004 *
1005 *
1006 * ****************************************************************
1007 * Receive off_diagonal block from left and use to finish with this
1008 * processor.
1009 *
1010  IF( (mycol/level_dist .GT. 0 ).AND.
1011  $ ( mycol/level_dist .LE. (npcol-1)/level_dist-1 ) ) THEN
1012 *
1013  IF( level_dist .GT. 1)THEN
1014 *
1015 * Receive offdiagonal block(s) from proc level_dist/2 to the
1016 * left
1017 *
1018  CALL zgerv2d( ictxt, bw, bw,
1019  $ af( odd_size*bw+2*mbw2+1 ),
1020  $ bw, 0, mycol-level_dist/2 )
1021 *
1022  ENDIF
1023 *
1024 *
1025  IF( info.EQ.0 ) THEN
1026 *
1027 * Use diagonal block(s) to modify this offdiagonal block
1028 *
1029  CALL ztrsm( 'R', 'L', 'C', 'N', bw, bw, cone,
1030  $ af( odd_size*bw+mbw2+1 ), bw,
1031  $ af( odd_size*bw+2*mbw2+1 ), bw )
1032 *
1033  ENDIF
1034 * End of "if( info.eq.0 ) then"
1035 *
1036 * Use offdiag block(s) to calculate modification to diag block
1037 * of processor to the left
1038 *
1039  CALL zherk( 'L', 'N', bw, bw, -one,
1040  $ af( (odd_size+2*bw)*bw+1 ), bw, zero,
1041  $ work( 1 ), bw )
1042 *
1043 * Send contribution to diagonal block's owning processor.
1044 *
1045  CALL zgesd2d( ictxt, bw, bw, work( 1 ), bw,
1046  $ 0, mycol-level_dist )
1047 *
1048 * *******************************************************
1049 *
1050  IF( mycol/level_dist .LE. (npcol-1)/level_dist-2 ) THEN
1051 *
1052 * Decide which processor offdiagonal block(s) goes to
1053 *
1054  IF( ( mod( mycol/( 2*level_dist ),2 )) .EQ.0 ) THEN
1055  comm_proc = mycol + level_dist
1056  ELSE
1057  comm_proc = mycol - level_dist
1058  ENDIF
1059 *
1060 * Use offdiagonal blocks to calculate offdiag
1061 * block to send to neighboring processor. Depending
1062 * on circumstances, may need to transpose the matrix.
1063 *
1064  CALL zgemm( 'N', 'N', bw, bw, bw, -cone,
1065  $ af( odd_size*bw+2*mbw2+1 ), bw,
1066  $ af( odd_size*bw+1 ), bw, czero, work( 1 ),
1067  $ bw )
1068 *
1069 * Send contribution to offdiagonal block's owning processor.
1070 *
1071  CALL zgesd2d( ictxt, bw, bw, work( 1 ), bw,
1072  $ 0, comm_proc )
1073 *
1074  ENDIF
1075 *
1076  ENDIF
1077 * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
1078 *
1079  14 CONTINUE
1080 *
1081  ELSE
1082 *
1083 * CASE UPLO = 'U'
1084 *
1085 ********************************************************************
1086 * PHASE 1: Local computation phase.
1087 ********************************************************************
1088 *
1089 *
1090 * Sizes of the extra triangles communicated between processors
1091 *
1092  IF( mycol .GT. 0 ) THEN
1093  prev_tri_size_m= min( bw,
1094  $ numroc( n, part_size, mycol, 0, npcol ) )
1095  prev_tri_size_n=min( bw,
1096  $ numroc( n, part_size, mycol-1, 0, npcol ) )
1097  ENDIF
1098 *
1099  IF( mycol .LT. npcol-1 ) THEN
1100  next_tri_size_m=min( bw,
1101  $ numroc( n, part_size, mycol+1, 0, npcol ) )
1102  next_tri_size_n= min( bw,
1103  $ numroc( n, part_size, mycol, 0, npcol ) )
1104  ENDIF
1105 *
1106 *
1107 *
1108 * Factor main partition A_i = {U_i}^C U_i in each processor
1109 *
1110  CALL zpbtrf( uplo, odd_size, bw, a( ofst + 1),
1111  $ llda, info )
1112 *
1113  IF( info.NE.0 ) THEN
1114  info = mycol+1
1115  GOTO 1600
1116  ENDIF
1117 *
1118 *
1119  IF ( mycol .LT. np-1 ) THEN
1120 * Apply factorization to odd-even connection block B_i
1121 *
1122 * Move the connection block in preparation.
1123 *
1124  CALL zlamov( 'L', bw, bw, a( ( ofst+1+odd_size*llda ) ),
1125  $ llda-1, af( odd_size*bw+2*mbw2+1+bw-bw ), bw )
1126 *
1127 *
1128 * Perform the triangular solve {U_i}^C {{B'}_i} = {B_i}
1129 *
1130  CALL ztrtrs( 'U', 'C', 'N', bw, bw,
1131  $ a( ofst+bw+1+(odd_size-bw)*llda ), llda-1,
1132  $ af( odd_size*bw+2*mbw2+1 ), bw, info )
1133 *
1134 * Move the resulting block back to its location in main storage.
1135 *
1136  CALL zlamov( 'L', bw, bw, af( odd_size*bw+2*mbw2+1+bw-bw ),
1137  $ bw, a(( ofst+1+odd_size*llda )), llda-1 )
1138 *
1139 *
1140 * Compute contribution to diagonal block(s) of reduced system.
1141 * {C'}_i^C = {C_i}^C-{{B'}_i}^C{{B'}_i}
1142 *
1143 * The following method uses more flops than necessary but
1144 * does not necessitate the writing of a new BLAS routine.
1145 *
1146 *
1147  CALL zherk( uplo, 'C', bw, bw, -one,
1148  $ af( odd_size*bw+2*mbw2+1 ), bw, one,
1149  $ a( ofst+bw+1+odd_size*llda ), llda-1 )
1150 *
1151  ENDIF
1152 * End of "if ( MYCOL .lt. NP-1 )..." loop
1153 *
1154 *
1155  1600 CONTINUE
1156 * If the processor could not locally factor, it jumps here.
1157 *
1158  IF ( mycol .NE. 0 ) THEN
1159 * Discard temporary matrix stored beginning in
1160 * AF( (odd_size+2*bw)*bw+1 ) and use for
1161 * off_diagonal block of reduced system.
1162 *
1163 * Calculate the "spike" fillin, ${L_i} {{G}_i}^C = {D_i}$ .
1164 *
1165 *
1166 * Copy D block into AF storage for solve.
1167 *
1168  CALL zlatcpy( 'L', prev_tri_size_n, prev_tri_size_m,
1169  $ a( ofst+1 ), llda-1, af( 1 ), odd_size )
1170 *
1171  IF ( info.EQ.0 ) THEN
1172 *
1173  CALL ztbtrs( 'U', 'C', 'N', odd_size, bw, bw,
1174  $ a( ofst + 1 ), llda,
1175  $ af( 1 ), odd_size, info )
1176 *
1177 *
1178 * Calculate the update block for previous proc, E_i = G_i{G_i}^C
1179 *
1180  CALL zherk( 'L', 'C', bw, odd_size,
1181  $ -one, af( 1 ), odd_size, zero,
1182  $ af( 1 + (odd_size+2*bw)*bw), bw )
1183 *
1184 *
1185 * Initiate send of E_i to previous processor to overlap
1186 * with next computation.
1187 *
1188  CALL zgesd2d( ictxt, bw, bw, af( odd_size*bw+2*mbw2+1 ), bw,
1189  $ 0, mycol-1 )
1190 *
1191 *
1192  IF ( mycol .LT. np-1 ) THEN
1193 *
1194 * Calculate off-diagonal block(s) of reduced system.
1195 * Note: for ease of use in solution of reduced system, store
1196 * L's off-diagonal block in conjugate transpose form.
1197 * {F_i}^C = {H_i}{{B'}_i}^C
1198 *
1199 * Copy matrix H_i (the last bw cols of G_i) to AF storage
1200 * as per requirements of BLAS routine ZTRMM.
1201 * Since we have G_i^C stored, conjugate transpose
1202 * H_i^C to H_i.
1203 *
1204  CALL zlatcpy( 'N', bw, bw,
1205  $ af( odd_size-bw+1 ), odd_size,
1206  $ af( (odd_size)*bw+1), bw )
1207 *
1208  CALL ztrmm( 'R', 'L', 'N', 'N', bw, bw, -cone,
1209  $ a( ( ofst+1+odd_size*llda ) ), llda-1,
1210  $ af( (odd_size)*bw+1 ), bw )
1211 *
1212  ENDIF
1213 *
1214  ENDIF
1215 * End of "if ( info.eq.0 ) then"
1216 *
1217  ENDIF
1218 * End of "if ( MYCOL .ne. 0 )..."
1219 *
1220 *
1221 * Check to make sure no processors have found errors
1222 *
1223  CALL igamx2d( ictxt, 'A', ' ', 1, 1, info, 1, info, info,
1224  $ -1, 0, 0 )
1225 *
1226  IF( mycol.EQ.0 ) THEN
1227  CALL igebs2d( ictxt, 'A', ' ', 1, 1, info, 1 )
1228  ELSE
1229  CALL igebr2d( ictxt, 'A', ' ', 1, 1, info, 1, 0, 0 )
1230  ENDIF
1231 *
1232  IF ( info.NE.0 ) THEN
1233  GOTO 1000
1234  ENDIF
1235 * No errors found, continue
1236 *
1237 *
1238 ********************************************************************
1239 * PHASE 2: Formation and factorization of Reduced System.
1240 ********************************************************************
1241 *
1242 * Gather up local sections of reduced system
1243 *
1244 *
1245 * The last processor does not participate in the factorization of
1246 * the reduced system, having sent its E_i already.
1247  IF( mycol .EQ. npcol-1 ) THEN
1248  GOTO 24
1249  ENDIF
1250 *
1251 * Initiate send of off-diag block(s) to overlap with next part.
1252 * Off-diagonal block needed on neighboring processor to start
1253 * algorithm.
1254 *
1255  IF( (mod( mycol+1, 2 ) .EQ. 0) .AND. ( mycol .GT. 0 ) ) THEN
1256 *
1257  CALL zgesd2d( ictxt, bw, bw,
1258  $ af( odd_size*bw+1 ),
1259  $ bw, 0, mycol-1 )
1260 *
1261  ENDIF
1262 *
1263 * Transpose last diagonal block into AF storage for subsequent
1264 * operations.
1265 *
1266  CALL zlatcpy( 'U', bw, bw,
1267  $ a( ofst+ odd_size*llda+1+bw ),
1268  $ llda-1, af( odd_size*bw+mbw2+1 ),
1269  $ bw )
1270 *
1271 * Receive contribution to the diagonal block that is stored on this proc.
1272 *
1273  IF( mycol.LT. npcol-1 ) THEN
1274 *
1275  CALL zgerv2d( ictxt, bw, bw,
1276  $ af( odd_size*bw+2*mbw2+1 ),
1277  $ bw, 0,
1278  $ mycol+1 )
1279 *
1280 * Add contribution to diagonal block
1281 *
1282  CALL zaxpy( mbw2, cone,
1283  $ af( odd_size*bw+2*mbw2+1 ),
1284  $ 1, af( odd_size*bw+mbw2+1 ), 1 )
1285 *
1286  ENDIF
1287 *
1288 *
1289 * *************************************
1290 * Modification Loop
1291 *
1292 * The distance for sending and receiving for each level starts
1293 * at 1 for the first level.
1294  level_dist = 1
1295 *
1296 * Do until this proc is needed to modify other procs' equations
1297 *
1298  22 CONTINUE
1299  IF( mod( (mycol+1)/level_dist, 2) .NE. 0 ) GOTO 21
1300 *
1301 * Receive and add contribution to diagonal block from the left
1302 *
1303  IF( mycol-level_dist .GE. 0 ) THEN
1304  CALL zgerv2d( ictxt, bw, bw, work( 1 ),
1305  $ bw, 0, mycol-level_dist )
1306 *
1307  CALL zaxpy( mbw2, cone, work( 1 ), 1,
1308  $ af( odd_size*bw+mbw2+1 ), 1 )
1309 *
1310  ENDIF
1311 *
1312 * Receive and add contribution to diagonal block from the right
1313 *
1314  IF( mycol+level_dist .LT. npcol-1 ) THEN
1315  CALL zgerv2d( ictxt, bw, bw, work( 1 ),
1316  $ bw, 0, mycol+level_dist )
1317 *
1318  CALL zaxpy( mbw2, cone, work( 1 ), 1,
1319  $ af( odd_size*bw+mbw2+1 ), 1 )
1320 *
1321  ENDIF
1322 *
1323  level_dist = level_dist*2
1324 *
1325  GOTO 22
1326  21 CONTINUE
1327 * [End of GOTO Loop]
1328 *
1329 *
1330 * *********************************
1331 * Calculate and use this proc's blocks to modify other procs'...
1332 *
1333 * Factor diagonal block
1334 *
1335  CALL zpotrf( 'L', bw, af( odd_size*bw+mbw2+1 ),
1336  $ bw, info )
1337 *
1338  IF( info.NE.0 ) THEN
1339  info = npcol + mycol
1340  ENDIF
1341 *
1342 * ****************************************************************
1343 * Receive offdiagonal block from processor to right.
1344 * If this is the first group of processors, the receive comes
1345 * from a different processor than otherwise.
1346 *
1347  IF( level_dist .EQ. 1 )THEN
1348  comm_proc = mycol + 1
1349 *
1350 * Move block into the place where it will be expected for
1351 * later calculations.
1352 *
1353  CALL zlamov( 'N', bw, bw, af( odd_size*bw+1 ), bw,
1354  $ af( odd_size*bw+2*mbw2+1 ), bw )
1355 *
1356  ELSE
1357  comm_proc = mycol + level_dist/2
1358  ENDIF
1359 *
1360  IF( mycol/level_dist .LE. (npcol-1)/level_dist-2 )THEN
1361 *
1362  CALL zgerv2d( ictxt, bw, bw,
1363  $ af( odd_size*bw+1 ),
1364  $ bw, 0, comm_proc )
1365 *
1366  IF( info .EQ. 0 ) THEN
1367 *
1368 *
1369 * Modify upper off_diagonal block with diagonal block
1370 *
1371 *
1372  CALL ztrsm( 'L', 'L', 'N', 'N', bw, bw, cone,
1373  $ af( odd_size*bw+mbw2+1 ), bw,
1374  $ af( odd_size*bw+1 ), bw )
1375 *
1376  ENDIF
1377 * End of "if ( info.eq.0 ) then"
1378 *
1379 * Calculate contribution from this block to next diagonal block
1380 *
1381  CALL zherk( 'L', 'C', bw, bw, -one,
1382  $ af( (odd_size)*bw+1 ), bw, zero,
1383  $ work( 1 ), bw )
1384 *
1385 * Send contribution to diagonal block's owning processor.
1386 *
1387  CALL zgesd2d( ictxt, bw, bw, work( 1 ), bw,
1388  $ 0, mycol+level_dist )
1389 *
1390  ENDIF
1391 * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
1392 *
1393 *
1394 * ****************************************************************
1395 * Receive off_diagonal block from left and use to finish with this
1396 * processor.
1397 *
1398  IF( (mycol/level_dist .GT. 0 ).AND.
1399  $ ( mycol/level_dist .LE. (npcol-1)/level_dist-1 ) ) THEN
1400 *
1401  IF( level_dist .GT. 1)THEN
1402 *
1403 * Receive offdiagonal block(s) from proc level_dist/2 to the
1404 * left
1405 *
1406  CALL zgerv2d( ictxt, bw, bw,
1407  $ af( odd_size*bw+2*mbw2+1 ),
1408  $ bw, 0, mycol-level_dist/2 )
1409 *
1410  ENDIF
1411 *
1412 *
1413  IF( info.EQ.0 ) THEN
1414 *
1415 * Use diagonal block(s) to modify this offdiagonal block
1416 *
1417  CALL ztrsm( 'R', 'L', 'C', 'N', bw, bw, cone,
1418  $ af( odd_size*bw+mbw2+1 ), bw,
1419  $ af( odd_size*bw+2*mbw2+1 ), bw )
1420 *
1421  ENDIF
1422 * End of "if( info.eq.0 ) then"
1423 *
1424 * Use offdiag block(s) to calculate modification to diag block
1425 * of processor to the left
1426 *
1427  CALL zherk( 'L', 'N', bw, bw, -one,
1428  $ af( (odd_size+2*bw)*bw+1 ), bw, zero,
1429  $ work( 1 ), bw )
1430 *
1431 * Send contribution to diagonal block's owning processor.
1432 *
1433  CALL zgesd2d( ictxt, bw, bw, work( 1 ), bw,
1434  $ 0, mycol-level_dist )
1435 *
1436 * *******************************************************
1437 *
1438  IF( mycol/level_dist .LE. (npcol-1)/level_dist-2 ) THEN
1439 *
1440 * Decide which processor offdiagonal block(s) goes to
1441 *
1442  IF( ( mod( mycol/( 2*level_dist ),2 )) .EQ.0 ) THEN
1443  comm_proc = mycol + level_dist
1444  ELSE
1445  comm_proc = mycol - level_dist
1446  ENDIF
1447 *
1448 * Use offdiagonal blocks to calculate offdiag
1449 * block to send to neighboring processor. Depending
1450 * on circumstances, may need to transpose the matrix.
1451 *
1452  CALL zgemm( 'N', 'N', bw, bw, bw, -cone,
1453  $ af( odd_size*bw+2*mbw2+1 ), bw,
1454  $ af( odd_size*bw+1 ), bw, czero, work( 1 ),
1455  $ bw )
1456 *
1457 * Send contribution to offdiagonal block's owning processor.
1458 *
1459  CALL zgesd2d( ictxt, bw, bw, work( 1 ), bw,
1460  $ 0, comm_proc )
1461 *
1462  ENDIF
1463 *
1464  ENDIF
1465 * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
1466 *
1467  24 CONTINUE
1468 *
1469  ENDIF
1470 *
1471  1000 CONTINUE
1472 *
1473 *
1474 * Free BLACS space used to hold standard-form grid.
1475 *
1476  IF( ictxt_save .NE. ictxt_new ) THEN
1477  CALL blacs_gridexit( ictxt_new )
1478  ENDIF
1479 *
1480  1234 CONTINUE
1481 *
1482 * Restore saved input parameters
1483 *
1484  ictxt = ictxt_save
1485  np = np_save
1486 *
1487 * Output minimum worksize
1488 *
1489  work( 1 ) = work_size_min
1490 *
1491 * Make INFO consistent across processors
1492 *
1493  CALL igamx2d( ictxt, 'A', ' ', 1, 1, info, 1, info, info,
1494  $ -1, 0, 0 )
1495 *
1496  IF( mycol.EQ.0 ) THEN
1497  CALL igebs2d( ictxt, 'A', ' ', 1, 1, info, 1 )
1498  ELSE
1499  CALL igebr2d( ictxt, 'A', ' ', 1, 1, info, 1, 0, 0 )
1500  ENDIF
1501 *
1502 *
1503  RETURN
1504 *
1505 * End of PZPBTRF
1506 *
1507  END