ScaLAPACK 2.1
ScaLAPACK: Scalable Linear Algebra PACKage
pdpbtrf.f
1  SUBROUTINE pdpbtrf( UPLO, N, BW, A, JA, DESCA, AF, LAF, WORK,
2  $ LWORK, INFO )
3 *
4 * -- ScaLAPACK routine (version 2.0.2) --
5 * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver
6 * May 1 2012
7 *
8 * .. Scalar Arguments ..
9  CHARACTER UPLO
10  INTEGER BW, INFO, JA, LAF, LWORK, N
11 * ..
12 * .. Array Arguments ..
13  INTEGER DESCA( * )
14  DOUBLE PRECISION A( * ), AF( * ), WORK( * )
15 * ..
16 *
17 *
18 * Purpose
19 * =======
20 *
21 * PDPBTRF computes a Cholesky factorization
22 * of an N-by-N real banded
23 * symmetric positive definite distributed matrix
24 * with bandwidth BW: A(1:N, JA:JA+N-1).
25 * Reordering is used to increase parallelism in the factorization.
26 * This reordering results in factors that are DIFFERENT from those
27 * produced by equivalent sequential codes. These factors cannot
28 * be used directly by users; however, they can be used in
29 * subsequent calls to PDPBTRS to solve linear systems.
30 *
31 * The factorization has the form
32 *
33 * P A(1:N, JA:JA+N-1) P^T = U' U , if UPLO = 'U', or
34 *
35 * P A(1:N, JA:JA+N-1) P^T = L L', if UPLO = 'L'
36 *
37 * where U is a banded upper triangular matrix and L is banded
38 * lower triangular, and P is a permutation matrix.
39 *
40 * =====================================================================
41 *
42 * Arguments
43 * =========
44 *
45 * UPLO (global input) CHARACTER
46 * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored;
47 * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored.
48 *
49 * N (global input) INTEGER
50 * The number of rows and columns to be operated on, i.e. the
51 * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
52 *
53 * BW (global input) INTEGER
54 * Number of subdiagonals of L (superdiagonals of U). 0 <= BW <= N-1
55 *
56 * A (local input/local output) DOUBLE PRECISION pointer into
57 * local memory to an array with first dimension
58 * LLD_A >=(bw+1) (stored in DESCA).
59 * On entry, this array contains the local pieces of the
60 * N-by-N symmetric banded distributed matrix
61 * A(1:N, JA:JA+N-1) to be factored.
62 * This local portion is stored in the packed banded format
63 * used in LAPACK. Please see the Notes below and the
64 * ScaLAPACK manual for more detail on the format of
65 * distributed matrices.
66 * On exit, this array contains details
67 * of the factorization.
68 * Note that permutations are performed on the matrix, so that
69 * the factors returned are different from those returned
70 * by LAPACK.
71 *
72 * JA (global input) INTEGER
73 * The index in the global array A that points to the start of
74 * the matrix to be operated on (which may be either all of A
75 * or a submatrix of A).
76 *
77 * DESCA (global and local input) INTEGER array of dimension DLEN.
78 * if 1D type (DTYPE_A=501), DLEN >= 7;
79 * if 2D type (DTYPE_A=1), DLEN >= 9 .
80 * The array descriptor for the distributed matrix A.
81 * Contains information on the mapping of A to memory. Please
82 * see NOTES below for full description and options.
83 *
84 * AF (local output) DOUBLE PRECISION array, dimension LAF.
85 * Auxiliary Fillin Space.
86 * Fillin is created during the factorization routine
87 * PDPBTRF and this is stored in AF. If a linear system
88 * is to be solved using PDPBTRS after the factorization
89 * routine, AF *must not be altered* after the factorization.
90 *
91 * LAF (local input) INTEGER
92 * Size of user-input Auxiliary Fillin space AF. Must be >=
93 * (NB+2*bw)*bw
94 * If LAF is not large enough, an error code will be returned
95 * and the minimum acceptable size will be returned in AF( 1 )
96 *
97 * WORK (local workspace/local output)
98 * DOUBLE PRECISION temporary workspace. This space may
99 * be overwritten in between calls to routines. WORK must be
100 * the size given in LWORK.
101 * On exit, WORK( 1 ) contains the minimal LWORK.
102 *
103 * LWORK (local input or global input) INTEGER
104 * Size of user-input workspace WORK.
105 * If LWORK is too small, the minimal acceptable size will be
106 * returned in WORK(1) and an error code is returned. LWORK >=
107 * bw*bw. A sizing sketch appears at the end of this argument list.
108 *
109 * INFO (global output) INTEGER
110 * = 0: successful exit
111 * < 0: If the i-th argument is an array and the j-th entry had
112 * an illegal value, then INFO = -(i*100+j); if the i-th
113 * argument is a scalar and had an illegal value, then
114 * INFO = -i.
115 * > 0: If INFO = K<=NPROCS, the submatrix stored on processor
116 * INFO and factored locally was not
117 * positive definite, and
118 * the factorization was not completed.
119 * If INFO = K>NPROCS, the submatrix stored on processor
120 * INFO-NPROCS representing interactions with other
121 * processors was not
122 * positive definite,
123 * and the factorization was not completed.
124 *
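*  Workspace sizing sketch (illustration only, not part of the original
*  interface description): the minimum sizes stated above for AF and
*  WORK can be computed on the calling side before allocation.  NB is
*  read from entry 4 of a type-501 descriptor; NB, BW, LAF and LWORK
*  are hypothetical caller-side variables.
*
*       INTEGER            NB, BW, LAF, LWORK
*       NB    = DESCA( 4 )
*       LAF   = ( NB + 2*BW ) * BW
*       LWORK = BW * BW
*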
125 * =====================================================================
126 *
127 *
128 * Restrictions
129 * ============
130 *
131 * The following are restrictions on the input parameters. Some of these
132 * are temporary and will be removed in future releases, while others
133 * may reflect fundamental technical limitations.
134 *
135 * Non-cyclic restriction: VERY IMPORTANT!
136 * P*NB>= mod(JA-1,NB)+N.
137 * The mapping for matrices must be blocked, reflecting the nature
138 * of the divide and conquer algorithm as a task-parallel algorithm.
139 * This formula in words is: no processor may have more than one
140 * chunk of the matrix.
141 *
142 * Blocksize cannot be too small:
143 * If the matrix spans more than one processor, the following
144 * restriction on NB, the size of each block on each processor,
145 * must hold:
146 * NB >= 2*BW
147 * The bulk of parallel computation is done on the matrix of size
148 * O(NB) on each processor. If this is too small, divide and conquer
149 * is a poor choice of algorithm.
150 *
151 * Submatrix reference:
152 * JA = IB
153 * Alignment restriction that prevents unnecessary communication.
154 *
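*  Restriction-check sketch (illustration only): a caller can verify
*  the two key restrictions above before the call.  NPCOL, NB, BW, JA
*  and N are hypothetical caller-side values; NPCOL plays the role of
*  P on a 1-by-P grid, and RESTRICT_OK is a hypothetical flag.
*
*       LOGICAL            RESTRICT_OK
*       RESTRICT_OK = .TRUE.
*       IF( NPCOL*NB.LT.MOD( JA-1, NB )+N ) RESTRICT_OK = .FALSE.
*       IF( JA+N-1.GT.NB .AND. NB.LT.2*BW ) RESTRICT_OK = .FALSE.
*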
155 *
156 * =====================================================================
157 *
158 *
159 * Notes
160 * =====
161 *
162 * If the factorization routine and the solve routine are to be called
163 * separately (to solve various sets of right-hand sides using the same
164 * coefficient matrix), the auxiliary space AF *must not be altered*
165 * between calls to the factorization routine and the solve routine.
166 *
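*  Calling-sequence sketch (illustration only): the factorization and
*  solve are typically paired as below, with AF and LAF passed
*  unchanged to the solver.  The PDPBTRS argument list shown is an
*  assumption based on the companion solver; see pdpbtrs.f for the
*  authoritative interface, and size B, DESCB and WORK accordingly.
*
*       CALL PDPBTRF( UPLO, N, BW, A, JA, DESCA, AF, LAF, WORK,
*      $              LWORK, INFO )
*       IF( INFO.EQ.0 ) THEN
*          CALL PDPBTRS( UPLO, N, BW, NRHS, A, JA, DESCA, B, IB,
*      $                 DESCB, AF, LAF, WORK, LWORK, INFO )
*       END IF
*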
167 * The best algorithm for solving banded and tridiagonal linear systems
168 * depends on a variety of parameters, especially the bandwidth.
169 * Currently, only algorithms designed for the case N/P >> bw are
170 * implemented. These go by many names, including Divide and Conquer,
171 * Partitioning, domain decomposition-type, etc.
172 *
173 * Algorithm description: Divide and Conquer
174 *
175 * The Divide and Conquer algorithm assumes the matrix is narrowly
176 * banded compared with the number of equations. In this situation,
177 * it is best to distribute the input matrix A one-dimensionally,
178 * with columns atomic and rows divided amongst the processes.
179 * The basic algorithm divides the banded matrix up into
180 * P pieces with one stored on each processor,
181 * and then proceeds in 2 phases for the factorization or 3 for the
182 * solution of a linear system.
183 * 1) Local Phase:
184 * The individual pieces are factored independently and in
185 * parallel. These factors are applied to the matrix creating
186 * fillin, which is stored in a non-inspectable way in auxiliary
187 * space AF. Mathematically, this is equivalent to reordering
188 * the matrix A as P A P^T and then factoring the principal
189 * leading submatrix of size equal to the sum of the sizes of
190 * the matrices factored on each processor. The factors of
191 * these submatrices overwrite the corresponding parts of A
192 * in memory.
193 * 2) Reduced System Phase:
194 * A small (BW*(P-1)) system is formed representing
195 * interaction of the larger blocks, and is stored (as are its
196 * factors) in the space AF. A parallel Block Cyclic Reduction
197 * algorithm is used. For a linear system, a parallel front solve
198 * followed by an analogous backsolve, both using the structure
199 * of the factored matrix, are performed.
200 * 3) Backsubstitution Phase:
201 * For a linear system, a local backsubstitution is performed on
202 * each processor in parallel.
203 *
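*  As a rough illustration of the sizes involved (hypothetical values):
*  with P = 4 processes, N = 4000, BW = 10 and NB = 1000, each process
*  factors a local band of order about NB - BW = 990 in the Local
*  Phase, while the reduced system treated in the second phase has
*  order only BW*(P-1) = 30.
*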
204 *
205 * Descriptors
206 * ===========
207 *
208 * Descriptors now have *types* and differ from ScaLAPACK 1.0.
209 *
210 * Note: banded codes can use either the old two dimensional
211 * or new one-dimensional descriptors, though the processor grid in
212 * both cases *must be one-dimensional*. We describe both types below.
213 *
214 * Each global data object is described by an associated description
215 * vector. This vector stores the information required to establish
216 * the mapping between an object element and its corresponding process
217 * and memory location.
218 *
219 * Let A be a generic term for any 2D block cyclically distributed array.
220 * Such a global array has an associated description vector DESCA.
221 * In the following comments, the character _ should be read as
222 * "of the global array".
223 *
224 * NOTATION STORED IN EXPLANATION
225 * --------------- -------------- --------------------------------------
226 * DTYPE_A(global) DESCA( DTYPE_ ) The descriptor type. In this case,
227 * DTYPE_A = 1.
228 * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
229 * the BLACS process grid A is distribu-
230 * ted over. The context itself is glo-
231 * bal, but the handle (the integer
232 * value) may vary.
233 * M_A (global) DESCA( M_ ) The number of rows in the global
234 * array A.
235 * N_A (global) DESCA( N_ ) The number of columns in the global
236 * array A.
237 * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute
238 * the rows of the array.
239 * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute
240 * the columns of the array.
241 * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
242 * row of the array A is distributed.
243 * CSRC_A (global) DESCA( CSRC_ ) The process column over which the
244 * first column of the array A is
245 * distributed.
246 * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local
247 * array. LLD_A >= MAX(1,LOCr(M_A)).
248 *
249 * Let K be the number of rows or columns of a distributed matrix,
250 * and assume that its process grid has dimension p x q.
251 * LOCr( K ) denotes the number of elements of K that a process
252 * would receive if K were distributed over the p processes of its
253 * process column.
254 * Similarly, LOCc( K ) denotes the number of elements of K that a
255 * process would receive if K were distributed over the q processes of
256 * its process row.
257 * The values of LOCr() and LOCc() may be determined via a call to the
258 * ScaLAPACK tool function, NUMROC:
259 * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
260 * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
261 * An upper bound for these quantities may be computed by:
262 * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
263 * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
264 *
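*  NUMROC usage sketch (illustration only): the local extents above can
*  be obtained directly from the tool function.  MYROW, MYCOL, NPROW
*  and NPCOL are assumed to come from BLACS_GRIDINFO, the remaining
*  arguments are the descriptor entries named above, and LOCR, LOCC
*  are hypothetical caller-side variables.
*
*       INTEGER            NUMROC, LOCR, LOCC
*       EXTERNAL           NUMROC
*       LOCR = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW )
*       LOCC = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL )
*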
265 *
266 * One-dimensional descriptors:
267 *
268 * One-dimensional descriptors are a new addition to ScaLAPACK since
269 * version 1.0. They simplify and shorten the descriptor for 1D
270 * arrays.
271 *
272 * Since ScaLAPACK supports two-dimensional arrays as the fundamental
273 * object, we allow 1D arrays to be distributed either over the
274 * first dimension of the array (as if the grid were P-by-1) or the
275 * 2nd dimension (as if the grid were 1-by-P). This choice is
276 * indicated by the descriptor type (501 or 502)
277 * as described below.
278 *
279 * IMPORTANT NOTE: the actual BLACS grid represented by the
280 * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P
281 * irrespective of which one-dimensional descriptor type
282 * (501 or 502) is input.
283 * This routine will interpret the grid properly either way.
284 * ScaLAPACK routines *do not support intercontext operations* so that
285 * the grid passed to a single ScaLAPACK routine *must be the same*
286 * for all array descriptors passed to that routine.
287 *
288 * NOTE: In all cases where 1D descriptors are used, 2D descriptors
289 * may also be used, since a one-dimensional array is a special case
290 * of a two-dimensional array with one dimension of size unity.
291 * The two-dimensional array used in this case *must* be of the
292 * proper orientation:
293 * If the appropriate one-dimensional descriptor is DTYPE_A=501
294 * (1 by P type), then the two dimensional descriptor must
295 * have a CTXT value that refers to a 1 by P BLACS grid;
296 * If the appropriate one-dimensional descriptor is DTYPE_A=502
297 * (P by 1 type), then the two dimensional descriptor must
298 * have a CTXT value that refers to a P by 1 BLACS grid.
299 *
300 *
301 * Summary of allowed descriptors, types, and BLACS grids:
302 * DTYPE 501 502 1 1
303 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1
304 * -----------------------------------------------------
305 * A OK NO OK NO
306 * B NO OK NO OK
307 *
308 * Let A be a generic term for any 1D block cyclically distributed array.
309 * Such a global array has an associated description vector DESCA.
310 * In the following comments, the character _ should be read as
311 * "of the global array".
312 *
313 * NOTATION STORED IN EXPLANATION
314 * --------------- ---------- ------------------------------------------
315 * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
316 * TYPE_A = 501: 1-by-P grid.
317 * TYPE_A = 502: P-by-1 grid.
318 * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
319 * the BLACS process grid A is distribu-
320 * ted over. The context itself is glo-
321 * bal, but the handle (the integer
322 * value) may vary.
323 * N_A (global) DESCA( 3 ) The size of the array dimension being
324 * distributed.
325 * NB_A (global) DESCA( 4 ) The blocking factor used to distribute
326 * the distributed dimension of the array.
327 * SRC_A (global) DESCA( 5 ) The process row or column over which the
328 * first row or column of the array
329 * is distributed.
330 * LLD_A (local) DESCA( 6 ) The leading dimension of the local array
331 * storing the local blocks of the distri-
332 * buted array A. Minimum value of LLD_A
333 * depends on TYPE_A.
334 * TYPE_A = 501: LLD_A >= max( size of
335 * undistributed dimension, 1 ).
336 * TYPE_A = 502: LLD_A >= max( NB_A, 1 ).
337 * Reserved DESCA( 7 ) Reserved for future use.
338 *
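*  Descriptor-setup sketch (illustration only): a caller on a 1-by-P
*  BLACS grid might fill a type-501 descriptor and invoke this routine
*  as below.  ICTXT, N, BW, NB, LLDA, LAF, LWORK and the array sizes
*  are hypothetical caller-side values; LLDA must be at least BW+1 for
*  the banded storage described above.
*
*       INTEGER            DESCA( 7 ), INFO
*       DESCA( 1 ) = 501
*       DESCA( 2 ) = ICTXT
*       DESCA( 3 ) = N
*       DESCA( 4 ) = NB
*       DESCA( 5 ) = 0
*       DESCA( 6 ) = LLDA
*       DESCA( 7 ) = 0
*       CALL PDPBTRF( 'L', N, BW, A, 1, DESCA, AF, LAF, WORK,
*      $              LWORK, INFO )
*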
339 *
340 *
341 * =====================================================================
342 *
343 * Code Developer: Andrew J. Cleary, University of Tennessee.
344 * Current address: Lawrence Livermore National Labs.
345 *
346 * =====================================================================
347 *
348 * .. Parameters ..
349  DOUBLE PRECISION ONE
350  parameter( one = 1.0d+0 )
351  DOUBLE PRECISION ZERO
352  parameter( zero = 0.0d+0 )
353  INTEGER INT_ONE
354  parameter( int_one = 1 )
355  INTEGER DESCMULT, BIGNUM
356  parameter( descmult = 100, bignum = descmult*descmult )
357  INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
358  $ lld_, mb_, m_, nb_, n_, rsrc_
359  parameter( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,
360  $ ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,
361  $ rsrc_ = 7, csrc_ = 8, lld_ = 9 )
362 * ..
363 * .. Local Scalars ..
364  INTEGER COMM_PROC, CSRC, FIRST_PROC, I, ICTXT,
365  $ ictxt_new, ictxt_save, idum1, idum3, ja_new,
366  $ laf_min, level_dist, llda, mbw2, mycol, myrow,
367  $ my_num_cols, nb, next_tri_size_m,
368  $ next_tri_size_n, np, npcol, nprow, np_save,
369  $ odd_size, ofst, part_offset, part_size,
370  $ prev_tri_size_m, prev_tri_size_n, return_code,
371  $ store_n_a, work_size_min
372 * ..
373 * .. Local Arrays ..
374  INTEGER DESCA_1XP( 7 ), PARAM_CHECK( 9, 3 )
375 * ..
376 * .. External Subroutines ..
377  EXTERNAL blacs_gridexit, blacs_gridinfo, daxpy,
378  $ desc_convert, dgemm, dgerv2d, dgesd2d, dlamov,
379  $ dlatcpy, dpbtrf, dpotrf, dsyrk, dtbtrs, dtrmm,
380  $ dtrrv2d, dtrsd2d, dtrsm, dtrtrs, globchk,
381  $ igamx2d, igebr2d, igebs2d, pxerbla, reshape
382 * ..
383 * .. External Functions ..
384  LOGICAL LSAME
385  INTEGER NUMROC
386  EXTERNAL lsame, numroc
387 * ..
388 * .. Intrinsic Functions ..
389  INTRINSIC ichar, min, mod
390 * ..
391 * .. Executable Statements ..
392 *
393 * Test the input parameters
394 *
395  info = 0
396 *
397 * Convert descriptor into standard form for easy access to
398 * parameters, check that grid is of right shape.
399 *
400  desca_1xp( 1 ) = 501
401 *
402  CALL desc_convert( desca, desca_1xp, return_code )
403 *
404  IF( return_code.NE.0 ) THEN
405  info = -( 6*100+2 )
406  END IF
407 *
408 * Get values out of descriptor for use in code.
409 *
410  ictxt = desca_1xp( 2 )
411  csrc = desca_1xp( 5 )
412  nb = desca_1xp( 4 )
413  llda = desca_1xp( 6 )
414  store_n_a = desca_1xp( 3 )
415 *
416 * Get grid parameters
417 *
418 *
419 * Pre-calculate bw^2
420 *
421  mbw2 = bw*bw
422 *
423  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
424  np = nprow*npcol
425 *
426 *
427 *
428  IF( lsame( uplo, 'U' ) ) THEN
429  idum1 = ichar( 'U' )
430  ELSE IF( lsame( uplo, 'L' ) ) THEN
431  idum1 = ichar( 'L' )
432  ELSE
433  info = -1
434  END IF
435 *
436  IF( lwork.LT.-1 ) THEN
437  info = -10
438  ELSE IF( lwork.EQ.-1 ) THEN
439  idum3 = -1
440  ELSE
441  idum3 = 1
442  END IF
443 *
444  IF( n.LT.0 ) THEN
445  info = -2
446  END IF
447 *
448  IF( n+ja-1.GT.store_n_a ) THEN
449  info = -( 6*100+6 )
450  END IF
451 *
452  IF( ( bw.GT.n-1 ) .OR. ( bw.LT.0 ) ) THEN
453  info = -3
454  END IF
455 *
456  IF( llda.LT.( bw+1 ) ) THEN
457  info = -( 6*100+6 )
458  END IF
459 *
460  IF( nb.LE.0 ) THEN
461  info = -( 6*100+4 )
462  END IF
463 *
464 * Argument checking that is specific to Divide & Conquer routine
465 *
466  IF( nprow.NE.1 ) THEN
467  info = -( 6*100+2 )
468  END IF
469 *
470  IF( n.GT.np*nb-mod( ja-1, nb ) ) THEN
471  info = -( 2 )
472  CALL pxerbla( ictxt, 'PDPBTRF, D&C alg.: only 1 block per proc'
473  $ , -info )
474  RETURN
475  END IF
476 *
477  IF( ( ja+n-1.GT.nb ) .AND. ( nb.LT.2*bw ) ) THEN
478  info = -( 6*100+4 )
479  CALL pxerbla( ictxt, 'PDPBTRF, D&C alg.: NB too small', -info )
480  RETURN
481  END IF
482 *
483 *
484 * Check auxiliary storage size
485 *
486  laf_min = ( nb+2*bw )*bw
487 *
488  IF( laf.LT.laf_min ) THEN
489  info = -8
490 * put minimum value of laf into AF( 1 )
491  af( 1 ) = laf_min
492  CALL pxerbla( ictxt, 'PDPBTRF: auxiliary storage error ',
493  $ -info )
494  RETURN
495  END IF
496 *
497 * Check worksize
498 *
499  work_size_min = bw*bw
500 *
501  work( 1 ) = work_size_min
502 *
503  IF( lwork.LT.work_size_min ) THEN
504  IF( lwork.NE.-1 ) THEN
505  info = -10
506  CALL pxerbla( ictxt, 'PDPBTRF: worksize error ', -info )
507  END IF
508  RETURN
509  END IF
510 *
511 * Pack params and positions into arrays for global consistency check
512 *
513  param_check( 9, 1 ) = desca( 5 )
514  param_check( 8, 1 ) = desca( 4 )
515  param_check( 7, 1 ) = desca( 3 )
516  param_check( 6, 1 ) = desca( 1 )
517  param_check( 5, 1 ) = ja
518  param_check( 4, 1 ) = bw
519  param_check( 3, 1 ) = n
520  param_check( 2, 1 ) = idum3
521  param_check( 1, 1 ) = idum1
522 *
523  param_check( 9, 2 ) = 605
524  param_check( 8, 2 ) = 604
525  param_check( 7, 2 ) = 603
526  param_check( 6, 2 ) = 601
527  param_check( 5, 2 ) = 5
528  param_check( 4, 2 ) = 3
529  param_check( 3, 2 ) = 2
530  param_check( 2, 2 ) = 10
531  param_check( 1, 2 ) = 1
532 *
533 * Want to find errors with MIN( ), so if no error, set it to a big
534 * number. If there already is an error, multiply by the
535 * descriptor multiplier.
536 *
537  IF( info.GE.0 ) THEN
538  info = bignum
539  ELSE IF( info.LT.-descmult ) THEN
540  info = -info
541  ELSE
542  info = -info*descmult
543  END IF
544 *
545 * Check consistency across processors
546 *
547  CALL globchk( ictxt, 9, param_check, 9, param_check( 1, 3 ),
548  $ info )
549 *
550 * Prepare output: set info = 0 if no error, and divide by DESCMULT
551 * if error is not in a descriptor entry.
552 *
553  IF( info.EQ.bignum ) THEN
554  info = 0
555  ELSE IF( mod( info, descmult ).EQ.0 ) THEN
556  info = -info / descmult
557  ELSE
558  info = -info
559  END IF
560 *
561  IF( info.LT.0 ) THEN
562  CALL pxerbla( ictxt, 'PDPBTRF', -info )
563  RETURN
564  END IF
565 *
566 * Quick return if possible
567 *
568  IF( n.EQ.0 )
569  $ RETURN
570 *
571 *
572 * Adjust addressing into matrix space to properly get into
573 * the beginning part of the relevant data
574 *
575  part_offset = nb*( ( ja-1 ) / ( npcol*nb ) )
576 *
577  IF( ( mycol-csrc ).LT.( ja-part_offset-1 ) / nb ) THEN
578  part_offset = part_offset + nb
579  END IF
580 *
581  IF( mycol.LT.csrc ) THEN
582  part_offset = part_offset - nb
583  END IF
584 *
585 * Form a new BLACS grid (the "standard form" grid) with only procs
586 * holding part of the matrix, of size 1xNP where NP is adjusted,
587 * starting at csrc=0, with JA modified to reflect dropped procs.
588 *
589 * First processor to hold part of the matrix:
590 *
591  first_proc = mod( ( ja-1 ) / nb+csrc, npcol )
592 *
593 * Calculate the new JA while dropping unused processors.
594 *
595  ja_new = mod( ja-1, nb ) + 1
596 *
597 * Save and compute new value of NP
598 *
599  np_save = np
600  np = ( ja_new+n-2 ) / nb + 1
601 *
602 * Call utility routine that forms "standard-form" grid
603 *
604  CALL reshape( ictxt, int_one, ictxt_new, int_one, first_proc,
605  $ int_one, np )
606 *
607 * Use new context from standard grid as context.
608 *
609  ictxt_save = ictxt
610  ictxt = ictxt_new
611  desca_1xp( 2 ) = ictxt_new
612 *
613 * Get information about new grid.
614 *
615  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
616 *
617 * Drop out processors that do not have part of the matrix.
618 *
619  IF( myrow.LT.0 ) THEN
620  GO TO 120
621  END IF
622 *
623 * ********************************
624 * Values reused throughout routine
625 *
626 * User-input value of partition size
627 *
628  part_size = nb
629 *
630 * Number of columns in each processor
631 *
632  my_num_cols = numroc( n, part_size, mycol, 0, npcol )
633 *
634 * Offset in columns to beginning of main partition in each proc
635 *
636  IF( mycol.EQ.0 ) THEN
637  part_offset = part_offset + mod( ja_new-1, part_size )
638  my_num_cols = my_num_cols - mod( ja_new-1, part_size )
639  END IF
640 *
641 * Offset in elements
642 *
643  ofst = part_offset*llda
644 *
645 * Size of main (or odd) partition in each processor
646 *
647  odd_size = my_num_cols
648  IF( mycol.LT.np-1 ) THEN
649  odd_size = odd_size - bw
650  END IF
651 *
652 *
653 * Zero out space for fillin
654 *
655  DO 10 i = 1, laf_min
656  af( i ) = zero
657  10 CONTINUE
658 *
659 * Zero out space for work
660 *
661  DO 20 i = 1, work_size_min
662  work( i ) = zero
663  20 CONTINUE
664 *
665 * Begin main code
666 *
667  IF( lsame( uplo, 'L' ) ) THEN
668 *
669 ********************************************************************
670 * PHASE 1: Local computation phase.
671 ********************************************************************
672 *
673 *
674 * Sizes of the extra triangles communicated between processors
675 *
676  IF( mycol.GT.0 ) THEN
677  prev_tri_size_m = min( bw, numroc( n, part_size, mycol, 0,
678  $ npcol ) )
679  prev_tri_size_n = min( bw, numroc( n, part_size, mycol-1, 0,
680  $ npcol ) )
681  END IF
682 *
683  IF( mycol.LT.npcol-1 ) THEN
684  next_tri_size_m = min( bw, numroc( n, part_size, mycol+1, 0,
685  $ npcol ) )
686  next_tri_size_n = min( bw, numroc( n, part_size, mycol, 0,
687  $ npcol ) )
688  END IF
689 *
690  IF( mycol.LT.np-1 ) THEN
691 * Transfer last triangle D_i of local matrix to next processor
692 * which needs it to calculate fillin due to factorization of
693 * its main (odd) block A_i.
694 * Overlap the send with the factorization of A_i.
695 *
696  CALL dtrsd2d( ictxt, 'U', 'N', next_tri_size_m,
697  $ next_tri_size_n, a( ofst+odd_size*llda+( bw+
698  $ 1 ) ), llda-1, 0, mycol+1 )
699 *
700  END IF
701 *
702 *
703 * Factor main partition A_i = L_i {L_i}^T in each processor
704 *
705  CALL dpbtrf( uplo, odd_size, bw, a( ofst+1 ), llda, info )
706 *
707  IF( info.NE.0 ) THEN
708  info = mycol + 1
709  GO TO 30
710  END IF
711 *
712 *
713  IF( mycol.LT.np-1 ) THEN
714 * Apply factorization to odd-even connection block B_i
715 *
716 * transpose the connection block in preparation.
717 *
718  CALL dlatcpy( 'U', bw, bw, a( ( ofst+( bw+1 )+( odd_size-
719  $ bw )*llda ) ), llda-1,
720  $ af( odd_size*bw+2*mbw2+1+bw-bw ), bw )
721 *
722 * Perform the triangular system solve {L_i}{{B'}_i}^T = {B_i}^T
723 *
724  CALL dtrtrs( 'L', 'N', 'N', bw, bw,
725  $ a( ofst+1+( odd_size-bw )*llda ), llda-1,
726  $ af( odd_size*bw+2*mbw2+1 ), bw, info )
727 *
728 *
729 * transpose resulting block to its location
730 * in main storage.
731 *
732  CALL dlatcpy( 'L', bw, bw, af( odd_size*bw+2*mbw2+1+bw-bw ),
733  $ bw, a( ( ofst+( bw+1 )+( odd_size-bw )*
734  $ llda ) ), llda-1 )
735 *
736 *
737 * Compute contribution to diagonal block(s) of reduced system.
738 * {C'}_i = {C_i}-{{B'}_i}{{B'}_i}^T
739 *
740 * The following method uses more flops than necessary but
741 * does not necessitate the writing of a new BLAS routine.
742 *
743 *
744  CALL dsyrk( uplo, 'T', bw, bw, -one,
745  $ af( odd_size*bw+2*mbw2+1 ), bw, one,
746  $ a( ofst+1+odd_size*llda ), llda-1 )
747 *
748  END IF
749 * End of "if ( MYCOL .lt. NP-1 )..." loop
750 *
751 *
752  30 CONTINUE
753 * If the processor could not locally factor, it jumps here.
754 *
755  IF( mycol.NE.0 ) THEN
756 * Discard temporary matrix stored beginning in
757 * AF( (odd_size+2*bw)*bw+1 ) and use for
758 * off_diagonal block of reduced system.
759 *
760 * Receive previously transmitted matrix section, which forms
761 * the right-hand-side for the triangular solve that calculates
762 * the "spike" fillin.
763 *
764 *
765  CALL dtrrv2d( ictxt, 'U', 'N', prev_tri_size_m,
766  $ prev_tri_size_n, af( 1 ), odd_size, 0,
767  $ mycol-1 )
768 *
769  IF( info.EQ.0 ) THEN
770 *
771 * Calculate the "spike" fillin, ${L_i} {{G}_i}^T = {D_i}$ .
772 *
773  CALL dtbtrs( 'L', 'N', 'N', odd_size, bw, bw,
774  $ a( ofst+1 ), llda, af( 1 ), odd_size, info )
775 *
776 *
777 * Calculate the update block for previous proc, E_i = G_i{G_i}^T
778 *
779  CALL dsyrk( 'L', 'T', bw, odd_size, -one, af( 1 ),
780  $ odd_size, zero, af( 1+( odd_size+2*bw )*bw ),
781  $ bw )
782 *
783 *
784 * Initiate send of E_i to previous processor to overlap
785 * with next computation.
786 *
787  CALL dgesd2d( ictxt, bw, bw, af( odd_size*bw+2*mbw2+1 ),
788  $ bw, 0, mycol-1 )
789 *
790 *
791  IF( mycol.LT.np-1 ) THEN
792 *
793 * Calculate off-diagonal block(s) of reduced system.
794 * Note: for ease of use in solution of reduced system, store
795 * L's off-diagonal block in transpose form.
796 * {F_i}^T = {H_i}{{B'}_i}^T
797 *
798 * Copy matrix H_i (the last bw cols of G_i) to AF storage
799 * as per requirements of BLAS routine DTRMM.
800 * Since we have G_i^T stored, transpose
801 * H_i^T to H_i.
802 *
803  CALL dlatcpy( 'N', bw, bw, af( odd_size-bw+1 ),
804  $ odd_size, af( ( odd_size )*bw+1 ), bw )
805 *
806  CALL dtrmm( 'R', 'U', 'T', 'N', bw, bw, -one,
807  $ a( ( ofst+( bw+1 )+( odd_size-bw )*
808  $ llda ) ), llda-1, af( ( odd_size )*bw+1 ),
809  $ bw )
810 *
811 *
812  END IF
813 *
814  END IF
815 * End of "if ( MYCOL .ne. 0 )..."
816 *
817  END IF
818 * End of "if (info.eq.0) then"
819 *
820 *
821 * Check to make sure no processors have found errors
822 *
823  CALL igamx2d( ictxt, 'A', ' ', 1, 1, info, 1, info, info, -1,
824  $ 0, 0 )
825 *
826  IF( mycol.EQ.0 ) THEN
827  CALL igebs2d( ictxt, 'A', ' ', 1, 1, info, 1 )
828  ELSE
829  CALL igebr2d( ictxt, 'A', ' ', 1, 1, info, 1, 0, 0 )
830  END IF
831 *
832  IF( info.NE.0 ) THEN
833  GO TO 110
834  END IF
835 * No errors found, continue
836 *
837 *
838 ********************************************************************
839 * PHASE 2: Formation and factorization of Reduced System.
840 ********************************************************************
841 *
842 * Gather up local sections of reduced system
843 *
844 *
845 * The last processor does not participate in the factorization of
846 * the reduced system, having sent its E_i already.
847  IF( mycol.EQ.npcol-1 ) THEN
848  GO TO 60
849  END IF
850 *
851 * Initiate send of off-diag block(s) to overlap with next part.
852 * Off-diagonal block needed on neighboring processor to start
853 * algorithm.
854 *
855  IF( ( mod( mycol+1, 2 ).EQ.0 ) .AND. ( mycol.GT.0 ) ) THEN
856 *
857  CALL dgesd2d( ictxt, bw, bw, af( odd_size*bw+1 ), bw, 0,
858  $ mycol-1 )
859 *
860  END IF
861 *
862 * Copy last diagonal block into AF storage for subsequent
863 * operations.
864 *
865  CALL dlamov( 'N', bw, bw, a( ofst+odd_size*llda+1 ), llda-1,
866  $ af( odd_size*bw+mbw2+1 ), bw )
867 *
868 * Receive contribution to diagonal block that is stored on this proc.
869 *
870  IF( mycol.LT.npcol-1 ) THEN
871 *
872  CALL dgerv2d( ictxt, bw, bw, af( odd_size*bw+2*mbw2+1 ), bw,
873  $ 0, mycol+1 )
874 *
875 * Add contribution to diagonal block
876 *
877  CALL daxpy( mbw2, one, af( odd_size*bw+2*mbw2+1 ), 1,
878  $ af( odd_size*bw+mbw2+1 ), 1 )
879 *
880  END IF
881 *
882 *
883 * *************************************
884 * Modification Loop
885 *
886 * The distance for sending and receiving for each level starts
887 * at 1 for the first level.
888  level_dist = 1
889 *
890 * Do until this proc is needed to modify other procs' equations
891 *
892  40 CONTINUE
893  IF( mod( ( mycol+1 ) / level_dist, 2 ).NE.0 )
894  $ GO TO 50
895 *
896 * Receive and add contribution to diagonal block from the left
897 *
898  IF( mycol-level_dist.GE.0 ) THEN
899  CALL dgerv2d( ictxt, bw, bw, work( 1 ), bw, 0,
900  $ mycol-level_dist )
901 *
902  CALL daxpy( mbw2, one, work( 1 ), 1,
903  $ af( odd_size*bw+mbw2+1 ), 1 )
904 *
905  END IF
906 *
907 * Receive and add contribution to diagonal block from the right
908 *
909  IF( mycol+level_dist.LT.npcol-1 ) THEN
910  CALL dgerv2d( ictxt, bw, bw, work( 1 ), bw, 0,
911  $ mycol+level_dist )
912 *
913  CALL daxpy( mbw2, one, work( 1 ), 1,
914  $ af( odd_size*bw+mbw2+1 ), 1 )
915 *
916  END IF
917 *
918  level_dist = level_dist*2
919 *
920  GO TO 40
921  50 CONTINUE
922 * [End of GOTO Loop]
923 *
924 *
925 * *********************************
926 * Calculate and use this proc's blocks to modify other procs'...
927 *
928 * Factor diagonal block
929 *
930  CALL dpotrf( 'L', bw, af( odd_size*bw+mbw2+1 ), bw, info )
931 *
932  IF( info.NE.0 ) THEN
933  info = npcol + mycol
934  END IF
935 *
936 * ****************************************************************
937 * Receive offdiagonal block from processor to right.
938 * If this is the first group of processors, the receive comes
939 * from a different processor than otherwise.
940 *
941  IF( level_dist.EQ.1 ) THEN
942  comm_proc = mycol + 1
943 *
944 * Move block into the place where it will be expected for
945 * later calculations.
946 *
947  CALL dlamov( 'N', bw, bw, af( odd_size*bw+1 ), bw,
948  $ af( odd_size*bw+2*mbw2+1 ), bw )
949 *
950  ELSE
951  comm_proc = mycol + level_dist / 2
952  END IF
953 *
954  IF( mycol / level_dist.LE.( npcol-1 ) / level_dist-2 ) THEN
955 *
956  CALL dgerv2d( ictxt, bw, bw, af( odd_size*bw+1 ), bw, 0,
957  $ comm_proc )
958 *
959  IF( info.EQ.0 ) THEN
960 *
961 *
962 * Modify upper off_diagonal block with diagonal block
963 *
964 *
965  CALL dtrsm( 'L', 'L', 'N', 'N', bw, bw, one,
966  $ af( odd_size*bw+mbw2+1 ), bw,
967  $ af( odd_size*bw+1 ), bw )
968 *
969  END IF
970 * End of "if ( info.eq.0 ) then"
971 *
972 * Calculate contribution from this block to next diagonal block
973 *
974  CALL dsyrk( 'L', 'T', bw, bw, -one, af( ( odd_size )*bw+1 ),
975  $ bw, zero, work( 1 ), bw )
976 *
977 * Send contribution to diagonal block's owning processor.
978 *
979  CALL dgesd2d( ictxt, bw, bw, work( 1 ), bw, 0,
980  $ mycol+level_dist )
981 *
982  END IF
983 * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
984 *
985 *
986 * ****************************************************************
987 * Receive off_diagonal block from left and use to finish with this
988 * processor.
989 *
990  IF( ( mycol / level_dist.GT.0 ) .AND.
991  $ ( mycol / level_dist.LE.( npcol-1 ) / level_dist-1 ) ) THEN
992 *
993  IF( level_dist.GT.1 ) THEN
994 *
995 * Receive offdiagonal block(s) from proc level_dist/2 to the
996 * left
997 *
998  CALL dgerv2d( ictxt, bw, bw, af( odd_size*bw+2*mbw2+1 ),
999  $ bw, 0, mycol-level_dist / 2 )
1000 *
1001  END IF
1002 *
1003 *
1004  IF( info.EQ.0 ) THEN
1005 *
1006 * Use diagonal block(s) to modify this offdiagonal block
1007 *
1008  CALL dtrsm( 'R', 'L', 'T', 'N', bw, bw, one,
1009  $ af( odd_size*bw+mbw2+1 ), bw,
1010  $ af( odd_size*bw+2*mbw2+1 ), bw )
1011 *
1012  END IF
1013 * End of "if( info.eq.0 ) then"
1014 *
1015 * Use offdiag block(s) to calculate modification to diag block
1016 * of processor to the left
1017 *
1018  CALL dsyrk( 'L', 'N', bw, bw, -one,
1019  $ af( ( odd_size+2*bw )*bw+1 ), bw, zero,
1020  $ work( 1 ), bw )
1021 *
1022 * Send contribution to diagonal block's owning processor.
1023 *
1024  CALL dgesd2d( ictxt, bw, bw, work( 1 ), bw, 0,
1025  $ mycol-level_dist )
1026 *
1027 * *******************************************************
1028 *
1029  IF( mycol / level_dist.LE.( npcol-1 ) / level_dist-2 ) THEN
1030 *
1031 * Decide which processor offdiagonal block(s) goes to
1032 *
1033  IF( ( mod( mycol / ( 2*level_dist ), 2 ) ).EQ.0 ) THEN
1034  comm_proc = mycol + level_dist
1035  ELSE
1036  comm_proc = mycol - level_dist
1037  END IF
1038 *
1039 * Use offdiagonal blocks to calculate offdiag
1040 * block to send to neighboring processor. Depending
1041 * on circumstances, may need to transpose the matrix.
1042 *
1043  CALL dgemm( 'N', 'N', bw, bw, bw, -one,
1044  $ af( odd_size*bw+2*mbw2+1 ), bw,
1045  $ af( odd_size*bw+1 ), bw, zero, work( 1 ),
1046  $ bw )
1047 *
1048 * Send contribution to offdiagonal block's owning processor.
1049 *
1050  CALL dgesd2d( ictxt, bw, bw, work( 1 ), bw, 0,
1051  $ comm_proc )
1052 *
1053  END IF
1054 *
1055  END IF
1056 * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
1057 *
1058  60 CONTINUE
1059 *
1060  ELSE
1061 *
1062 * CASE UPLO = 'U'
1063 *
1064 ********************************************************************
1065 * PHASE 1: Local computation phase.
1066 ********************************************************************
1067 *
1068 *
1069 * Sizes of the extra triangles communicated between processors
1070 *
1071  IF( mycol.GT.0 ) THEN
1072  prev_tri_size_m = min( bw, numroc( n, part_size, mycol, 0,
1073  $ npcol ) )
1074  prev_tri_size_n = min( bw, numroc( n, part_size, mycol-1, 0,
1075  $ npcol ) )
1076  END IF
1077 *
1078  IF( mycol.LT.npcol-1 ) THEN
1079  next_tri_size_m = min( bw, numroc( n, part_size, mycol+1, 0,
1080  $ npcol ) )
1081  next_tri_size_n = min( bw, numroc( n, part_size, mycol, 0,
1082  $ npcol ) )
1083  END IF
1084 *
1085 *
1086 *
1087 * Factor main partition A_i^T = U_i {U_i}^T in each processor
1088 *
1089  CALL dpbtrf( uplo, odd_size, bw, a( ofst+1 ), llda, info )
1090 *
1091  IF( info.NE.0 ) THEN
1092  info = mycol + 1
1093  GO TO 70
1094  END IF
1095 *
1096 *
1097  IF( mycol.LT.np-1 ) THEN
1098 * Apply factorization to odd-even connection block B_i
1099 *
1100 * Move the connection block in preparation.
1101 *
1102  CALL dlamov( 'L', bw, bw, a( ( ofst+1+odd_size*llda ) ),
1103  $ llda-1, af( odd_size*bw+2*mbw2+1+bw-bw ), bw )
1104 *
1105 *
1106 * Perform the triangular solve {L_i}{{B'}_i}^T = {B_i}^T
1107 *
1108  CALL dtrtrs( 'U', 'T', 'N', bw, bw,
1109  $ a( ofst+bw+1+( odd_size-bw )*llda ), llda-1,
1110  $ af( odd_size*bw+2*mbw2+1 ), bw, info )
1111 *
1112 * Move the resulting block back to its location in main storage.
1113 *
1114  CALL dlamov( 'L', bw, bw, af( odd_size*bw+2*mbw2+1+bw-bw ),
1115  $ bw, a( ( ofst+1+odd_size*llda ) ), llda-1 )
1116 *
1117 *
1118 * Compute contribution to diagonal block(s) of reduced system.
1119 * {C'}_i^T = {C_i}^T-{{B'}_i}^T{{B'}_i}
1120 *
1121 * The following method uses more flops than necessary but
1122 * does not necessitate the writing of a new BLAS routine.
1123 *
1124 *
1125  CALL dsyrk( uplo, 'T', bw, bw, -one,
1126  $ af( odd_size*bw+2*mbw2+1 ), bw, one,
1127  $ a( ofst+bw+1+odd_size*llda ), llda-1 )
1128 *
1129  END IF
1130 * End of "if ( MYCOL .lt. NP-1 )..." loop
1131 *
1132 *
1133  70 CONTINUE
1134 * If the processor could not locally factor, it jumps here.
1135 *
1136  IF( mycol.NE.0 ) THEN
1137 * Discard temporary matrix stored beginning in
1138 * AF( (odd_size+2*bw)*bw+1 ) and use for
1139 * off_diagonal block of reduced system.
1140 *
1141 * Calculate the "spike" fillin, ${L_i} {{G}_i}^T = {D_i}$ .
1142 *
1143 *
1144 * Copy D block into AF storage for solve.
1145 *
1146  CALL dlatcpy( 'L', prev_tri_size_n, prev_tri_size_m,
1147  $ a( ofst+1 ), llda-1, af( 1 ), odd_size )
1148 *
1149  IF( info.EQ.0 ) THEN
1150 *
1151  CALL dtbtrs( 'U', 'T', 'N', odd_size, bw, bw,
1152  $ a( ofst+1 ), llda, af( 1 ), odd_size, info )
1153 *
1154 *
1155 * Calculate the update block for previous proc, E_i = G_i{G_i}^T
1156 *
1157  CALL dsyrk( 'L', 'T', bw, odd_size, -one, af( 1 ),
1158  $ odd_size, zero, af( 1+( odd_size+2*bw )*bw ),
1159  $ bw )
1160 *
1161 *
1162 * Initiate send of E_i to previous processor to overlap
1163 * with next computation.
1164 *
1165  CALL dgesd2d( ictxt, bw, bw, af( odd_size*bw+2*mbw2+1 ),
1166  $ bw, 0, mycol-1 )
1167 *
1168 *
1169  IF( mycol.LT.np-1 ) THEN
1170 *
1171 * Calculate off-diagonal block(s) of reduced system.
1172 * Note: for ease of use in solution of reduced system, store
1173 * L's off-diagonal block in transpose form.
1174 * {F_i}^T = {H_i}{{B'}_i}^T
1175 *
1176 * Copy matrix H_i (the last bw cols of G_i) to AF storage
1177 * as per requirements of BLAS routine DTRMM.
1178 * Since we have G_i^T stored, transpose
1179 * H_i^T to H_i.
1180 *
1181  CALL dlatcpy( 'N', bw, bw, af( odd_size-bw+1 ),
1182  $ odd_size, af( ( odd_size )*bw+1 ), bw )
1183 *
1184  CALL dtrmm( 'R', 'L', 'N', 'N', bw, bw, -one,
1185  $ a( ( ofst+1+odd_size*llda ) ), llda-1,
1186  $ af( ( odd_size )*bw+1 ), bw )
1187 *
1188  END IF
1189 *
1190  END IF
1191 * End of "if ( MYCOL .ne. 0 )..."
1192 *
1193  END IF
1194 * End of "if (info.eq.0) then"
1195 *
1196 *
1197 * Check to make sure no processors have found errors
1198 *
1199  CALL igamx2d( ictxt, 'A', ' ', 1, 1, info, 1, info, info, -1,
1200  $ 0, 0 )
1201 *
1202  IF( mycol.EQ.0 ) THEN
1203  CALL igebs2d( ictxt, 'A', ' ', 1, 1, info, 1 )
1204  ELSE
1205  CALL igebr2d( ictxt, 'A', ' ', 1, 1, info, 1, 0, 0 )
1206  END IF
1207 *
1208  IF( info.NE.0 ) THEN
1209  GO TO 110
1210  END IF
1211 * No errors found, continue
1212 *
1213 *
1214 ********************************************************************
1215 * PHASE 2: Formation and factorization of Reduced System.
1216 ********************************************************************
1217 *
1218 * Gather up local sections of reduced system
1219 *
1220 *
1221 * The last processor does not participate in the factorization of
1222 * the reduced system, having sent its E_i already.
1223  IF( mycol.EQ.npcol-1 ) THEN
1224  GO TO 100
1225  END IF
1226 *
1227 * Initiate send of off-diag block(s) to overlap with next part.
1228 * Off-diagonal block needed on neighboring processor to start
1229 * algorithm.
1230 *
1231  IF( ( mod( mycol+1, 2 ).EQ.0 ) .AND. ( mycol.GT.0 ) ) THEN
1232 *
1233  CALL dgesd2d( ictxt, bw, bw, af( odd_size*bw+1 ), bw, 0,
1234  $ mycol-1 )
1235 *
1236  END IF
1237 *
1238 * Transpose last diagonal block into AF storage for subsequent
1239 * operations.
1240 *
1241  CALL dlatcpy( 'U', bw, bw, a( ofst+odd_size*llda+1+bw ),
1242  $ llda-1, af( odd_size*bw+mbw2+1 ), bw )
1243 *
1244 * Receive contribution to diagonal block that is stored on this proc.
1245 *
1246  IF( mycol.LT.npcol-1 ) THEN
1247 *
1248  CALL dgerv2d( ictxt, bw, bw, af( odd_size*bw+2*mbw2+1 ), bw,
1249  $ 0, mycol+1 )
1250 *
1251 * Add contribution to diagonal block
1252 *
1253  CALL daxpy( mbw2, one, af( odd_size*bw+2*mbw2+1 ), 1,
1254  $ af( odd_size*bw+mbw2+1 ), 1 )
1255 *
1256  END IF
1257 *
1258 *
1259 * *************************************
1260 * Modification Loop
1261 *
1262 * The distance for sending and receiving for each level starts
1263 * at 1 for the first level.
1264  level_dist = 1
1265 *
1266 * Do until this proc is needed to modify other procs' equations
1267 *
1268  80 CONTINUE
1269  IF( mod( ( mycol+1 ) / level_dist, 2 ).NE.0 )
1270  $ GO TO 90
1271 *
1272 * Receive and add contribution to diagonal block from the left
1273 *
1274  IF( mycol-level_dist.GE.0 ) THEN
1275  CALL dgerv2d( ictxt, bw, bw, work( 1 ), bw, 0,
1276  $ mycol-level_dist )
1277 *
1278  CALL daxpy( mbw2, one, work( 1 ), 1,
1279  $ af( odd_size*bw+mbw2+1 ), 1 )
1280 *
1281  END IF
1282 *
1283 * Receive and add contribution to diagonal block from the right
1284 *
1285  IF( mycol+level_dist.LT.npcol-1 ) THEN
1286  CALL dgerv2d( ictxt, bw, bw, work( 1 ), bw, 0,
1287  $ mycol+level_dist )
1288 *
1289  CALL daxpy( mbw2, one, work( 1 ), 1,
1290  $ af( odd_size*bw+mbw2+1 ), 1 )
1291 *
1292  END IF
1293 *
1294  level_dist = level_dist*2
1295 *
1296  GO TO 80
1297  90 CONTINUE
1298 * [End of GOTO Loop]
1299 *
1300 *
1301 * *********************************
1302 * Calculate and use this proc's blocks to modify other procs'...
1303 *
1304 * Factor diagonal block
1305 *
1306  CALL dpotrf( 'L', bw, af( odd_size*bw+mbw2+1 ), bw, info )
1307 *
1308  IF( info.NE.0 ) THEN
1309  info = npcol + mycol
1310  END IF
1311 *
1312 * ****************************************************************
1313 * Receive offdiagonal block from processor to right.
1314 * If this is the first group of processors, the receive comes
1315 * from a different processor than otherwise.
1316 *
1317  IF( level_dist.EQ.1 ) THEN
1318  comm_proc = mycol + 1
1319 *
1320 * Move block into the place where it will be expected for
1321 * later calculations.
1322 *
1323  CALL dlamov( 'N', bw, bw, af( odd_size*bw+1 ), bw,
1324  $ af( odd_size*bw+2*mbw2+1 ), bw )
1325 *
1326  ELSE
1327  comm_proc = mycol + level_dist / 2
1328  END IF
1329 *
1330  IF( mycol / level_dist.LE.( npcol-1 ) / level_dist-2 ) THEN
1331 *
1332  CALL dgerv2d( ictxt, bw, bw, af( odd_size*bw+1 ), bw, 0,
1333  $ comm_proc )
1334 *
1335  IF( info.EQ.0 ) THEN
1336 *
1337 *
1338 * Modify upper off_diagonal block with diagonal block
1339 *
1340 *
1341  CALL dtrsm( 'L', 'L', 'N', 'N', bw, bw, one,
1342  $ af( odd_size*bw+mbw2+1 ), bw,
1343  $ af( odd_size*bw+1 ), bw )
1344 *
1345  END IF
1346 * End of "if ( info.eq.0 ) then"
1347 *
1348 * Calculate contribution from this block to next diagonal block
1349 *
1350  CALL dsyrk( 'L', 'T', bw, bw, -one, af( ( odd_size )*bw+1 ),
1351  $ bw, zero, work( 1 ), bw )
1352 *
1353 * Send contribution to diagonal block's owning processor.
1354 *
1355  CALL dgesd2d( ictxt, bw, bw, work( 1 ), bw, 0,
1356  $ mycol+level_dist )
1357 *
1358  END IF
1359 * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
1360 *
1361 *
1362 * ****************************************************************
1363 * Receive off_diagonal block from left and use to finish with this
1364 * processor.
1365 *
1366  IF( ( mycol / level_dist.GT.0 ) .AND.
1367  $ ( mycol / level_dist.LE.( npcol-1 ) / level_dist-1 ) ) THEN
1368 *
1369  IF( level_dist.GT.1 ) THEN
1370 *
1371 * Receive offdiagonal block(s) from proc level_dist/2 to the
1372 * left
1373 *
1374  CALL dgerv2d( ictxt, bw, bw, af( odd_size*bw+2*mbw2+1 ),
1375  $ bw, 0, mycol-level_dist / 2 )
1376 *
1377  END IF
1378 *
1379 *
1380  IF( info.EQ.0 ) THEN
1381 *
1382 * Use diagonal block(s) to modify this offdiagonal block
1383 *
1384  CALL dtrsm( 'R', 'L', 'T', 'N', bw, bw, one,
1385  $ af( odd_size*bw+mbw2+1 ), bw,
1386  $ af( odd_size*bw+2*mbw2+1 ), bw )
1387 *
1388  END IF
1389 * End of "if( info.eq.0 ) then"
1390 *
1391 * Use offdiag block(s) to calculate modification to diag block
1392 * of processor to the left
1393 *
1394  CALL dsyrk( 'L', 'N', bw, bw, -one,
1395  $ af( ( odd_size+2*bw )*bw+1 ), bw, zero,
1396  $ work( 1 ), bw )
1397 *
1398 * Send contribution to diagonal block's owning processor.
1399 *
1400  CALL dgesd2d( ictxt, bw, bw, work( 1 ), bw, 0,
1401  $ mycol-level_dist )
1402 *
1403 * *******************************************************
1404 *
1405  IF( mycol / level_dist.LE.( npcol-1 ) / level_dist-2 ) THEN
1406 *
1407 * Decide which processor offdiagonal block(s) goes to
1408 *
1409  IF( ( mod( mycol / ( 2*level_dist ), 2 ) ).EQ.0 ) THEN
1410  comm_proc = mycol + level_dist
1411  ELSE
1412  comm_proc = mycol - level_dist
1413  END IF
1414 *
1415 * Use offdiagonal blocks to calculate offdiag
1416 * block to send to neighboring processor. Depending
1417 * on circumstances, may need to transpose the matrix.
1418 *
1419  CALL dgemm( 'N', 'N', bw, bw, bw, -one,
1420  $ af( odd_size*bw+2*mbw2+1 ), bw,
1421  $ af( odd_size*bw+1 ), bw, zero, work( 1 ),
1422  $ bw )
1423 *
1424 * Send contribution to offdiagonal block's owning processor.
1425 *
1426  CALL dgesd2d( ictxt, bw, bw, work( 1 ), bw, 0,
1427  $ comm_proc )
1428 *
1429  END IF
1430 *
1431  END IF
1432 * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
1433 *
1434  100 CONTINUE
1435 *
1436  END IF
1437 *
1438  110 CONTINUE
1439 *
1440 *
1441 * Free BLACS space used to hold standard-form grid.
1442 *
1443  IF( ictxt_save.NE.ictxt_new ) THEN
1444  CALL blacs_gridexit( ictxt_new )
1445  END IF
1446 *
1447  120 CONTINUE
1448 *
1449 * Restore saved input parameters
1450 *
1451  ictxt = ictxt_save
1452  np = np_save
1453 *
1454 * Output minimum worksize
1455 *
1456  work( 1 ) = work_size_min
1457 *
1458 * Make INFO consistent across processors
1459 *
1460  CALL igamx2d( ictxt, 'A', ' ', 1, 1, info, 1, info, info, -1, 0,
1461  $ 0 )
1462 *
1463  IF( mycol.EQ.0 ) THEN
1464  CALL igebs2d( ictxt, 'A', ' ', 1, 1, info, 1 )
1465  ELSE
1466  CALL igebr2d( ictxt, 'A', ' ', 1, 1, info, 1, 0, 0 )
1467  END IF
1468 *
1469 *
1470  RETURN
1471 *
1472 * End of PDPBTRF
1473 *
1474  END