ScaLAPACK: Scalable Linear Algebra PACKage
pcdttrf.f
1  SUBROUTINE pcdttrf( N, DL, D, DU, JA, DESCA, AF, LAF, WORK, LWORK,
2  $ INFO )
3 *
4 * -- ScaLAPACK routine (version 2.0.2) --
5 * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver
6 * May 1 2012
7 *
8 * .. Scalar Arguments ..
9  INTEGER INFO, JA, LAF, LWORK, N
10 * ..
11 * .. Array Arguments ..
12  INTEGER DESCA( * )
13  COMPLEX AF( * ), D( * ), DL( * ), DU( * ), WORK( * )
14 * ..
15 *
16 *
17 * Purpose
18 * =======
19 *
20 * PCDTTRF computes an LU factorization
21 * of an N-by-N complex tridiagonal
22 * diagonally dominant-like distributed matrix
23 * A(1:N, JA:JA+N-1).
24 * Reordering is used to increase parallelism in the factorization.
25 * This reordering results in factors that are DIFFERENT from those
26 * produced by equivalent sequential codes. These factors cannot
27 * be used directly by users; however, they can be used in
28 * subsequent calls to PCDTTRS to solve linear systems.
29 *
30 * The factorization has the form
31 *
32 * P A(1:N, JA:JA+N-1) P^T = L U
33 *
34 * where U is a tridiagonal upper triangular matrix and L is tridiagonal
35 * lower triangular, and P is a permutation matrix.
36 *
37 * =====================================================================
38 *
39 * Arguments
40 * =========
41 *
42 *
43 * N (global input) INTEGER
44 * The number of rows and columns to be operated on, i.e. the
45 * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
46 *
47 * DL (local input/local output) COMPLEX pointer to local
48 * part of global vector storing the lower diagonal of the
49 * matrix. Globally, DL(1) is not referenced, and DL must be
50 * aligned with D.
51 * Must be of size >= DESCA( NB_ ).
52 * On exit, this array contains information on the
53 * factors of the matrix.
54 *
55 * D (local input/local output) COMPLEX pointer to local
56 * part of global vector storing the main diagonal of the
57 * matrix.
58 * On exit, this array contains information on the
59 * factors of the matrix.
60 * Must be of size >= DESCA( NB_ ).
61 *
62 * DU (local input/local output) COMPLEX pointer to local
63 * part of global vector storing the upper diagonal of the
64 * matrix. Globally, DU(n) is not referenced, and DU must be
65 * aligned with D.
66 * On exit, this array contains information on the
67 * factors of the matrix.
68 * Must be of size >= DESCA( NB_ ).
69 *
70 * JA (global input) INTEGER
71 * The index in the global array A that points to the start of
72 * the matrix to be operated on (which may be either all of A
73 * or a submatrix of A).
74 *
75 * DESCA (global and local input) INTEGER array of dimension DLEN.
76 * if 1D type (DTYPE_A=501 or 502), DLEN >= 7;
77 * if 2D type (DTYPE_A=1), DLEN >= 9.
78 * The array descriptor for the distributed matrix A.
79 * Contains information of mapping of A to memory. Please
80 * see NOTES below for full description and options.
81 *
82 * AF (local output) COMPLEX array, dimension LAF.
83 * Auxiliary Fillin Space.
84 * Fillin is created during the factorization routine
85 * PCDTTRF and this is stored in AF. If a linear system
86 * is to be solved using PCDTTRS after the factorization
87 * routine, AF *must not be altered* after the factorization.
88 *
89 * LAF (local input) INTEGER
90 * Size of user-input Auxiliary Fillin space AF. Must be >=
91 * 2*(NB+2)
92 * If LAF is not large enough, an error code will be returned
93 * and the minimum acceptable size will be returned in AF( 1 )
94 *
95 * WORK (local workspace/local output)
96 * COMPLEX temporary workspace. This space may
97 * be overwritten in between calls to routines. WORK must be
98 * the size given in LWORK.
99 * On exit, WORK( 1 ) contains the minimal LWORK.
100 *
101 * LWORK (local input or global input) INTEGER
102 * Size of user-input workspace WORK.
103 * If LWORK is too small, the minimal acceptable size will be
104 * returned in WORK(1) and an error code is returned. LWORK>=
105 * 8*NPCOL
106 *
107 * INFO (local output) INTEGER
108 * = 0: successful exit
109 * < 0: If the i-th argument is an array and the j-th entry had
110 * an illegal value, then INFO = -(i*100+j), if the i-th
111 * argument is a scalar and had an illegal value, then
112 * INFO = -i.
113 * > 0: If INFO = K<=NPROCS, the submatrix stored on processor
114 * INFO and factored locally was not diagonally
115 * dominant-like, and the factorization was not
116 * completed.
117 * If INFO = K>NPROCS, the submatrix stored on processor
118 * INFO-NPROCS representing interactions with other
119 * processors was not stably factorable without
120 * interchanges, and the factorization was not
121 * completed.
122 *
123 * =====================================================================
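*
* Calling sketch (an illustrative addition, not part of the original
* documentation): one possible factor/solve sequence on a 1-by-P
* process grid using a type-501 descriptor. The solver PCDTTRS is
* the companion routine named above; its right-hand-side arguments
* B, IB, DESCB and NRHS, the values chosen here for NB, and the
* assumption that the same workspace sizes also satisfy PCDTTRS are
* all assumptions of this sketch. LAF follows the bound enforced in
* the code below (12*NPCOL+3*NB).
*
*     DESCA( 1 ) = 501
*     DESCA( 2 ) = ICTXT
*     DESCA( 3 ) = N
*     DESCA( 4 ) = NB
*     DESCA( 5 ) = 0
*     DESCA( 6 ) = 0
*     DESCA( 7 ) = 0
*     LAF   = 12*NPCOL + 3*NB
*     LWORK = 8*NPCOL
*     CALL PCDTTRF( N, DL, D, DU, 1, DESCA, AF, LAF, WORK, LWORK,
*    $              INFO )
*     IF( INFO.EQ.0 )
*    $   CALL PCDTTRS( 'N', N, NRHS, DL, D, DU, 1, DESCA, B, 1,
*    $                 DESCB, AF, LAF, WORK, LWORK, INFO )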
124 *
125 *
126 * Restrictions
127 * ============
128 *
129 * The following are restrictions on the input parameters. Some of these
130 * are temporary and will be removed in future releases, while others
131 * may reflect fundamental technical limitations.
132 *
133 * Non-cyclic restriction: VERY IMPORTANT!
134 * P*NB>= mod(JA-1,NB)+N.
135 * The mapping for matrices must be blocked, reflecting the nature
136 * of the divide and conquer algorithm as a task-parallel algorithm.
137 * This formula in words is: no processor may have more than one
138 * chunk of the matrix.
139 *
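* Worked check of this restriction (values chosen for illustration
* only, not taken from the original text): with P = 4, NB = 300,
* JA = 1 and N = 1000 we have P*NB = 1200 >= mod(JA-1,NB)+N = 1000,
* so no process holds more than one chunk.
*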
140 * Blocksize cannot be too small:
141 * If the matrix spans more than one processor, the following
142 * restriction on NB, the size of each block on each processor,
143 * must hold:
144 * NB >= 2
145 * The bulk of parallel computation is done on the matrix of size
146 * O(NB) on each processor. If this is too small, divide and conquer
147 * is a poor choice of algorithm.
148 *
149 * Submatrix reference:
150 * JA = IB
151 * Alignment restriction that prevents unnecessary communication.
152 *
153 *
154 * =====================================================================
155 *
156 *
157 * Notes
158 * =====
159 *
160 * If the factorization routine and the solve routine are to be called
161 * separately (to solve various sets of righthand sides using the same
162 * coefficient matrix), the auxiliary space AF *must not be altered*
163 * between calls to the factorization routine and the solve routine.
164 *
165 * The best algorithm for solving banded and tridiagonal linear systems
166 * depends on a variety of parameters, especially the bandwidth.
167 * Currently, only algorithms designed for the case N/P >> bw are
168 * implemented. These go by many names, including Divide and Conquer,
169 * Partitioning, domain decomposition-type, etc.
170 * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C
171 * algorithms are the appropriate choice.
172 *
173 * Algorithm description: Divide and Conquer
174 *
175 * The Divide and Conquer algorithm assumes the matrix is narrowly
176 * banded compared with the number of equations. In this situation,
177 * it is best to distribute the input matrix A one-dimensionally,
178 * with columns atomic and rows divided amongst the processes.
179 * The basic algorithm divides the tridiagonal matrix up into
180 * P pieces with one stored on each processor,
181 * and then proceeds in 2 phases for the factorization or 3 for the
182 * solution of a linear system.
183 * 1) Local Phase:
184 * The individual pieces are factored independently and in
185 * parallel. These factors are applied to the matrix creating
186 * fillin, which is stored in a non-inspectable way in auxiliary
187 * space AF. Mathematically, this is equivalent to reordering
188 * the matrix A as P A P^T and then factoring the principal
189 * leading submatrix of size equal to the sum of the sizes of
190 * the matrices factored on each processor. The factors of
191 * these submatrices overwrite the corresponding parts of A
192 * in memory.
193 * 2) Reduced System Phase:
194 * A small (P-1)-by-(P-1) system is formed representing
195 * interaction of the larger blocks, and is stored (as are its
196 * factors) in the space AF. A parallel Block Cyclic Reduction
197 * algorithm is used. For a linear system, a parallel front solve
198 * followed by an analogous backsolve, both using the structure
199 * of the factored matrix, are performed.
200 * 3) Backsubstitution Phase:
201 * For a linear system, a local backsubstitution is performed on
202 * each processor in parallel.
203 *
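* Worked example of the phase sizes (illustrative values only, not
* from the original text): with P = 4 processes and NB = 1000 rows
* per process, the local phase factors four blocks of roughly NB
* equations each in parallel, the reduced system coupling them has
* P-1 = 3 equations and is factored by block cyclic reduction in
* about log2(P) = 2 levels, and the backsubstitution phase is again
* purely local.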
204 *
205 * Descriptors
206 * ===========
207 *
208 * Descriptors now have *types* and differ from ScaLAPACK 1.0.
209 *
210 * Note: tridiagonal codes can use either the old two dimensional
211 * or new one-dimensional descriptors, though the processor grid in
212 * both cases *must be one-dimensional*. We describe both types below.
213 *
214 * Each global data object is described by an associated description
215 * vector. This vector stores the information required to establish
216 * the mapping between an object element and its corresponding process
217 * and memory location.
218 *
219 * Let A be a generic term for any 2D block cyclically distributed array.
220 * Such a global array has an associated description vector DESCA.
221 * In the following comments, the character _ should be read as
222 * "of the global array".
223 *
224 * NOTATION STORED IN EXPLANATION
225 * --------------- -------------- --------------------------------------
226 * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. In this case,
227 * DTYPE_A = 1.
228 * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
229 * the BLACS process grid A is distribu-
230 * ted over. The context itself is glo-
231 * bal, but the handle (the integer
232 * value) may vary.
233 * M_A (global) DESCA( M_ ) The number of rows in the global
234 * array A.
235 * N_A (global) DESCA( N_ ) The number of columns in the global
236 * array A.
237 * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute
238 * the rows of the array.
239 * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute
240 * the columns of the array.
241 * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
242 * row of the array A is distributed.
243 * CSRC_A (global) DESCA( CSRC_ ) The process column over which the
244 * first column of the array A is
245 * distributed.
246 * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local
247 * array. LLD_A >= MAX(1,LOCr(M_A)).
248 *
249 * Let K be the number of rows or columns of a distributed matrix,
250 * and assume that its process grid has dimension p x q.
251 * LOCr( K ) denotes the number of elements of K that a process
252 * would receive if K were distributed over the p processes of its
253 * process column.
254 * Similarly, LOCc( K ) denotes the number of elements of K that a
255 * process would receive if K were distributed over the q processes of
256 * its process row.
257 * The values of LOCr() and LOCc() may be determined via a call to the
258 * ScaLAPACK tool function, NUMROC:
259 * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
260 * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
261 * An upper bound for these quantities may be computed by:
262 * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
263 * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
264 *
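* Worked example (illustrative values only): for M = 9, MB_A = 2,
* RSRC_A = 0 and NPROW = 2, process row 0 owns global rows 1, 2, 5,
* 6 and 9, so LOCr( 9 ) = 5, consistent with the bound
* ceil( ceil(9/2)/2 )*2 = 6.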
265 *
266 * One-dimensional descriptors:
267 *
268 * One-dimensional descriptors are a new addition to ScaLAPACK since
269 * version 1.0. They simplify and shorten the descriptor for 1D
270 * arrays.
271 *
272 * Since ScaLAPACK supports two-dimensional arrays as the fundamental
273 * object, we allow 1D arrays to be distributed either over the
274 * first dimension of the array (as if the grid were P-by-1) or the
275 * 2nd dimension (as if the grid were 1-by-P). This choice is
276 * indicated by the descriptor type (501 or 502)
277 * as described below.
278 * However, for tridiagonal matrices, since the objects being
279 * distributed are the individual vectors storing the diagonals, we
280 * have adopted the convention that both the P-by-1 descriptor and
281 * the 1-by-P descriptor are allowed and are equivalent for
282 * tridiagonal matrices. Thus, for tridiagonal matrices,
283 * DTYPE_A = 501 or 502 can be used interchangeably
284 * without any other change.
285 * We require that the distributed vectors storing the diagonals of a
286 * tridiagonal matrix be aligned with each other. Because of this, a
287 * single descriptor, DESCA, serves to describe the distribution of
288 * all diagonals simultaneously.
289 *
290 * IMPORTANT NOTE: the actual BLACS grid represented by the
291 * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P
292 * irrespective of which one-dimensional descriptor type
293 * (501 or 502) is input.
294 * This routine will interpret the grid properly either way.
295 * ScaLAPACK routines *do not support intercontext operations* so that
296 * the grid passed to a single ScaLAPACK routine *must be the same*
297 * for all array descriptors passed to that routine.
298 *
299 * NOTE: In all cases where 1D descriptors are used, 2D descriptors
300 * may also be used, since a one-dimensional array is a special case
301 * of a two-dimensional array with one dimension of size unity.
302 * The two-dimensional array used in this case *must* be of the
303 * proper orientation:
304 * If the appropriate one-dimensional descriptor is DTYPE_A=501
305 * (1 by P type), then the two dimensional descriptor must
306 * have a CTXT value that refers to a 1 by P BLACS grid;
307 * If the appropriate one-dimensional descriptor is DTYPE_A=502
308 * (P by 1 type), then the two dimensional descriptor must
309 * have a CTXT value that refers to a P by 1 BLACS grid.
310 *
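* Sketch of a matching grid setup (an illustrative addition; the
* variables NPROCS and ICTXT are assumed to be supplied by the
* caller): a 1-by-NPROCS grid that is valid with either DTYPE_A =
* 501 or 502:
*
*     CALL BLACS_GET( -1, 0, ICTXT )
*     CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS )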
311 *
312 * Summary of allowed descriptors, types, and BLACS grids:
313 * DTYPE           501         502         1        1
314 * BLACS grid      1xP or Px1  1xP or Px1  1xP      Px1
315 * -----------------------------------------------------
316 * A               OK          OK          OK       NO
317 * B               NO          OK          NO       OK
318 *
319 * Let A be a generic term for any 1D block cyclically distributed array.
320 * Such a global array has an associated description vector DESCA.
321 * In the following comments, the character _ should be read as
322 * "of the global array".
323 *
324 * NOTATION STORED IN EXPLANATION
325 * --------------- ---------- ------------------------------------------
326 * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
327 * TYPE_A = 501: 1-by-P grid.
328 * TYPE_A = 502: P-by-1 grid.
329 * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
330 * the BLACS process grid A is distribu-
331 * ted over. The context itself is glo-
332 * bal, but the handle (the integer
333 * value) may vary.
334 * N_A (global) DESCA( 3 ) The size of the array dimension being
335 * distributed.
336 * NB_A (global) DESCA( 4 ) The blocking factor used to distribute
337 * the distributed dimension of the array.
338 * SRC_A (global) DESCA( 5 ) The process row or column over which the
339 * first row or column of the array
340 * is distributed.
341 * Ignored DESCA( 6 ) Ignored for tridiagonal matrices.
342 * Reserved DESCA( 7 ) Reserved for future use.
343 *
344 *
345 *
346 * =====================================================================
347 *
348 * Code Developer: Andrew J. Cleary, University of Tennessee.
349 * Current address: Lawrence Livermore National Labs.
350 * This version released: August, 2001.
351 *
352 * =====================================================================
353 *
354 * ..
355 * .. Parameters ..
356  REAL ONE, ZERO
357  parameter( one = 1.0e+0 )
358  parameter( zero = 0.0e+0 )
359  COMPLEX CONE, CZERO
360  parameter( cone = ( 1.0e+0, 0.0e+0 ) )
361  parameter( czero = ( 0.0e+0, 0.0e+0 ) )
362  INTEGER INT_ONE
363  parameter( int_one = 1 )
364  INTEGER DESCMULT, BIGNUM
365  parameter(descmult = 100, bignum = descmult * descmult)
366  INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
367  $ lld_, mb_, m_, nb_, n_, rsrc_
368  parameter( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,
369  $ ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,
370  $ rsrc_ = 7, csrc_ = 8, lld_ = 9 )
371 * ..
372 * .. Local Scalars ..
373  INTEGER COMM_PROC, CSRC, FIRST_PROC, I, ICTXT,
374  $ ictxt_new, ictxt_save, idum3, ja_new, laf_min,
375  $ level_dist, llda, mycol, myrow, my_num_cols,
376  $ nb, np, npcol, nprow, np_save, odd_size,
377  $ part_offset, part_size, return_code, store_n_a,
378  $ temp, work_size_min, work_u
379 * ..
380 * .. Local Arrays ..
381  INTEGER DESCA_1XP( 7 ), PARAM_CHECK( 7, 3 )
382 * ..
383 * .. External Subroutines ..
384  EXTERNAL blacs_get, blacs_gridexit, blacs_gridinfo,
385  $ caxpy, cgemm, cgerv2d, cgesd2d, clamov,
386  $ clatcpy, cpbtrf, cpotrf, csyrk, ctbtrs, ctrmm,
387  $ ctrrv2d, ctrsd2d, ctrsm, ctrtrs, desc_convert,
388  $ globchk, pxerbla, reshape
389 * ..
390 * .. External Functions ..
391  LOGICAL LSAME
392  INTEGER NUMROC
393  COMPLEX CDOTC
394  EXTERNAL cdotc, lsame, numroc
395 * ..
396 * .. Intrinsic Functions ..
397  INTRINSIC ichar, min, mod
398 * ..
399 * .. Executable Statements ..
400 *
401 * Test the input parameters
402 *
403  info = 0
404 *
405 * Convert descriptor into standard form for easy access to
406 * parameters, check that grid is of right shape.
407 *
408  desca_1xp( 1 ) = 501
409 *
410  temp = desca( dtype_ )
411  IF( temp .EQ. 502 ) THEN
412 * Temporarily set the descriptor type to 1xP type
413  desca( dtype_ ) = 501
414  ENDIF
415 *
416  CALL desc_convert( desca, desca_1xp, return_code )
417 *
418  desca( dtype_ ) = temp
419 *
420  IF( return_code .NE. 0) THEN
421  info = -( 6*100 + 2 )
422  ENDIF
423 *
424 * Get values out of descriptor for use in code.
425 *
426  ictxt = desca_1xp( 2 )
427  csrc = desca_1xp( 5 )
428  nb = desca_1xp( 4 )
429  llda = desca_1xp( 6 )
430  store_n_a = desca_1xp( 3 )
431 *
432 * Get grid parameters
433 *
434 *
435  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
436  np = nprow * npcol
437 *
438 *
439 *
440  IF( lwork .LT. -1) THEN
441  info = -10
442  ELSE IF ( lwork .EQ. -1 ) THEN
443  idum3 = -1
444  ELSE
445  idum3 = 1
446  ENDIF
447 *
448  IF( n .LT. 0 ) THEN
449  info = -1
450  ENDIF
451 *
452  IF( n+ja-1 .GT. store_n_a ) THEN
453  info = -( 6*100 + 6 )
454  ENDIF
455 *
456 * Argument checking that is specific to Divide & Conquer routine
457 *
458  IF( nprow .NE. 1 ) THEN
459  info = -( 6*100+2 )
460  ENDIF
461 *
462  IF( n .GT. np*nb-mod( ja-1, nb )) THEN
463  info = -( 1 )
464  CALL pxerbla( ictxt,
465  $ 'PCDTTRF, D&C alg.: only 1 block per proc',
466  $ -info )
467  RETURN
468  ENDIF
469 *
470  IF((ja+n-1.GT.nb) .AND. ( nb.LT.2*int_one )) THEN
471  info = -( 6*100+4 )
472  CALL pxerbla( ictxt,
473  $ 'PCDTTRF, D&C alg.: NB too small',
474  $ -info )
475  RETURN
476  ENDIF
477 *
478 *
479 * Check auxiliary storage size
480 *
481  laf_min = (12*npcol+3*nb)
482 *
483  IF( laf .LT. laf_min ) THEN
484  info = -8
485 * put minimum value of laf into AF( 1 )
486  af( 1 ) = laf_min
487  CALL pxerbla( ictxt,
488  $ 'PCDTTRF: auxiliary storage error ',
489  $ -info )
490  RETURN
491  ENDIF
492 *
493 * Check worksize
494 *
495  work_size_min = 8*npcol
496 *
497  work( 1 ) = work_size_min
498 *
499  IF( lwork .LT. work_size_min ) THEN
500  IF( lwork .NE. -1 ) THEN
501  info = -10
502  CALL pxerbla( ictxt,
503  $ 'PCDTTRF: worksize error ',
504  $ -info )
505  ENDIF
506  RETURN
507  ENDIF
508 *
509 * Pack params and positions into arrays for global consistency check
510 *
511  param_check( 7, 1 ) = desca(5)
512  param_check( 6, 1 ) = desca(4)
513  param_check( 5, 1 ) = desca(3)
514  param_check( 4, 1 ) = desca(1)
515  param_check( 3, 1 ) = ja
516  param_check( 2, 1 ) = n
517  param_check( 1, 1 ) = idum3
518 *
519  param_check( 7, 2 ) = 605
520  param_check( 6, 2 ) = 604
521  param_check( 5, 2 ) = 603
522  param_check( 4, 2 ) = 601
523  param_check( 3, 2 ) = 5
524  param_check( 2, 2 ) = 1
525  param_check( 1, 2 ) = 10
526 *
527 * Want to find errors with MIN( ), so if no error, set it to a big
528 * number. If there already is an error, multiply by the
529 * descriptor multiplier.
530 *
531  IF( info.GE.0 ) THEN
532  info = bignum
533  ELSE IF( info.LT.-descmult ) THEN
534  info = -info
535  ELSE
536  info = -info * descmult
537  END IF
538 *
539 * Check consistency across processors
540 *
541  CALL globchk( ictxt, 7, param_check, 7,
542  $ param_check( 1, 3 ), info )
543 *
544 * Prepare output: set info = 0 if no error, and divide by DESCMULT
545 * if error is not in a descriptor entry.
546 *
547  IF( info.EQ.bignum ) THEN
548  info = 0
549  ELSE IF( mod( info, descmult ) .EQ. 0 ) THEN
550  info = -info / descmult
551  ELSE
552  info = -info
553  END IF
554 *
555  IF( info.LT.0 ) THEN
556  CALL pxerbla( ictxt, 'PCDTTRF', -info )
557  RETURN
558  END IF
559 *
560 * Quick return if possible
561 *
562  IF( n.EQ.0 )
563  $ RETURN
564 *
565 *
566 * Adjust addressing into matrix space to properly get into
567 * the beginning part of the relevant data
568 *
569  part_offset = nb*( (ja-1)/(npcol*nb) )
570 *
571  IF ( (mycol-csrc) .LT. (ja-part_offset-1)/nb ) THEN
572  part_offset = part_offset + nb
573  ENDIF
574 *
575  IF ( mycol .LT. csrc ) THEN
576  part_offset = part_offset - nb
577  ENDIF
578 *
579 * Form a new BLACS grid (the "standard form" grid) with only procs
580 * holding part of the matrix, of size 1xNP where NP is adjusted,
581 * starting at csrc=0, with JA modified to reflect dropped procs.
582 *
583 * First processor to hold part of the matrix:
584 *
585  first_proc = mod( ( ja-1 )/nb+csrc, npcol )
586 *
587 * Calculate the new JA while dropping off unused processors.
588 *
589  ja_new = mod( ja-1, nb ) + 1
590 *
591 * Save and compute new value of NP
592 *
593  np_save = np
594  np = ( ja_new+n-2 )/nb + 1
595 *
596 * Call utility routine that forms "standard-form" grid
597 *
598  CALL reshape( ictxt, int_one, ictxt_new, int_one,
599  $ first_proc, int_one, np )
600 *
601 * Use new context from standard grid as context.
602 *
603  ictxt_save = ictxt
604  ictxt = ictxt_new
605  desca_1xp( 2 ) = ictxt_new
606 *
607 * Get information about new grid.
608 *
609  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
610 *
611 * Drop out processors that do not have part of the matrix.
612 *
613  IF( myrow .LT. 0 ) THEN
614  GOTO 1234
615  ENDIF
616 *
617 * ********************************
618 * Values reused throughout routine
619 *
620 * User-input value of partition size
621 *
622  part_size = nb
623 *
624 * Number of columns in each processor
625 *
626  my_num_cols = numroc( n, part_size, mycol, 0, npcol )
627 *
628 * Offset in columns to beginning of main partition in each proc
629 *
630  IF ( mycol .EQ. 0 ) THEN
631  part_offset = part_offset+mod( ja_new-1, part_size )
632  my_num_cols = my_num_cols - mod(ja_new-1, part_size )
633  ENDIF
634 *
635 * Size of main (or odd) partition in each processor
636 *
637  odd_size = my_num_cols
638  IF ( mycol .LT. np-1 ) THEN
639  odd_size = odd_size - int_one
640  ENDIF
641 *
642 * Offset to workspace for Upper triangular factor
643 *
644  work_u = int_one*odd_size + 3
645 *
646 *
647 * Zero out space for fillin
648 *
649  DO 10 i=1, laf_min
650  af( i ) = czero
651  10 CONTINUE
652 *
653 * Begin main code
654 *
655 *
656 ********************************************************************
657 * PHASE 1: Local computation phase.
658 ********************************************************************
659 *
660 *
661  IF ( mycol .LT. np-1 ) THEN
662 * Transfer last triangle D_i of local matrix to next processor
663 * which needs it to calculate fillin due to factorization of
664 * its main (odd) block A_i.
665 * Overlap the send with the factorization of A_i.
666 *
667  CALL ctrsd2d( ictxt, 'U', 'N', 1, 1,
668  $ du( part_offset+odd_size+1 ), llda-1, 0,
669  $ mycol+1 )
670 *
671  ENDIF
672 *
673 *
674 * Factor main partition A_i = L_i {U_i} in each processor
675 *
676  CALL cdttrf( odd_size, dl( part_offset+2 ), d( part_offset+1 ),
677  $ du( part_offset+1 ), info )
678 *
679  IF( info.NE.0 ) THEN
680  info = mycol+1
681  GOTO 1500
682  ENDIF
683 *
684 *
685  IF ( mycol .LT. np-1 ) THEN
686 *
687 * Apply factorization to lower connection block BL_i
688 * Apply factorization to upper connection block BU_i
689 *
690 *
691 * Perform the triangular solve {U_i}^C{BL'}_i^C = {BL_i}^C
692 *
693 *
694  dl( part_offset+odd_size+1 ) =
695  $ ( dl( part_offset+odd_size+1 ) )
696  $ / ( d( part_offset+odd_size ) )
697 *
698 *
699 * Compute contribution to diagonal block(s) of reduced system.
700 * {C'}_i = {C_i}-{{BL'}_i}{{BU'}_i}
701 *
702 *
703  d( part_offset+odd_size+1 ) = d( part_offset+odd_size+1 )-
704  $ dl( part_offset+odd_size+1 )*du( part_offset+odd_size )
705 *
706  ENDIF
707 * End of "if ( MYCOL .lt. NP-1 )..." block
708 *
709 *
710  1500 CONTINUE
711 * If the processor could not locally factor, it jumps here.
712 *
713  IF ( mycol .NE. 0 ) THEN
714 *
715 * Move entry that causes spike to auxiliary storage
716 *
717  af( work_u+1 ) = ( dl( part_offset+1 ) )
718 *
719  IF (info.EQ.0) THEN
720 *
721 * Calculate the "spike" fillin, ${L_i} {{GU}_i} = {DL_i}$ .
722 *
723  CALL cdttrsv( 'L', 'N', odd_size, int_one,
724  $ dl( part_offset+2 ), d( part_offset+1 ),
725  $ du( part_offset+1 ), af( work_u+1 ), odd_size,
726  $ info )
727 *
728 *
729 * Calculate the "spike" fillin, ${U_i}^C {{GL}_i}^C = {DU_i}^C$
730 *
731  CALL ctrrv2d( ictxt, 'U', 'N', 1, 1, af( 1 ), odd_size, 0,
732  $ mycol-1 )
733 *
734  af( 1 ) = conjg( af( 1 ) )
735 *
736  CALL cdttrsv( 'U', 'C', odd_size, int_one,
737  $ dl( part_offset+2 ), d( part_offset+1 ),
738  $ du( part_offset+1 ),
739  $ af( 1 ), odd_size, info )
740 *
741 *
742 * Calculate the update block for previous proc, E_i = GL_i{GU_i}
743 *
744  af( odd_size+3 ) = -cone *
745  $ cdotc( odd_size, af( 1 ), 1, af( work_u+1 ), 1 )
746 *
747 *
748 * Initiate send of E_i to previous processor to overlap
749 * with next computation.
750 *
751  CALL cgesd2d( ictxt, int_one, int_one, af( odd_size+3 ),
752  $ int_one, 0, mycol-1 )
753 *
754 *
755  IF ( mycol .LT. np-1 ) THEN
756 *
757 * Calculate off-diagonal block(s) of reduced system.
758 * Note: for ease of use in solution of reduced system, store
759 * L's off-diagonal block in conjugate transpose form.
760 *
761  af( odd_size+1 ) = -cone
762  $ * conjg( dl( part_offset+odd_size+1 )
763  $ * af( work_u+odd_size ) )
764 *
765 *
766  af(work_u+(odd_size)+1 ) = -cone
767  $ * du( part_offset+odd_size )
768  $ * conjg( af( odd_size ) )
769 *
770  ENDIF
771 *
772  ENDIF
773 * End of "if (info.eq.0) then"
774 *
775  ENDIF
776 * End of "if ( MYCOL .ne. 0 )..."
777 *
778 *
779 * Check to make sure no processors have found errors
780 *
781  CALL igamx2d( ictxt, 'A', ' ', 1, 1, info, 1, info, info,
782  $ -1, 0, 0 )
783 *
784  IF( mycol.EQ.0 ) THEN
785  CALL igebs2d( ictxt, 'A', ' ', 1, 1, info, 1 )
786  ELSE
787  CALL igebr2d( ictxt, 'A', ' ', 1, 1, info, 1, 0, 0 )
788  ENDIF
789 *
790  IF ( info.NE.0 ) THEN
791  GOTO 1000
792  ENDIF
793 * No errors found, continue
794 *
795 *
796 ********************************************************************
797 * PHASE 2: Formation and factorization of Reduced System.
798 ********************************************************************
799 *
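* Note (added annotation, not part of the original comments): in
* this phase AF( ODD_SIZE+2 ) holds this process' diagonal entry of
* the reduced system, AF( ODD_SIZE+1 ) and AF( WORK_U+ODD_SIZE+1 )
* hold its lower and upper off-diagonal entries, and WORK( 1 ) is
* used only to pass single-entry contributions between processes.
*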
800 * Gather up local sections of reduced system
801 *
802 *
803 * The last processor does not participate in the factorization of
804 * the reduced system, having sent its E_i already.
805  IF( mycol .EQ. npcol-1 ) THEN
806  GOTO 14
807  ENDIF
808 *
809 * Initiate send of off-diag block(s) to overlap with next part.
810 * Off-diagonal block needed on neighboring processor to start
811 * algorithm.
812 *
813  IF( (mod( mycol+1, 2 ) .EQ. 0) .AND. ( mycol .GT. 0 ) ) THEN
814 *
815  CALL cgesd2d( ictxt, int_one, int_one,
816  $ af( odd_size+1 ),
817  $ int_one, 0, mycol-1 )
818 *
819  CALL cgesd2d( ictxt, int_one, int_one,
820  $ af( work_u+odd_size+1 ),
821  $ int_one, 0, mycol-1 )
822 *
823  ENDIF
824 *
825 * Copy last diagonal block into AF storage for subsequent
826 * operations.
827 *
828  af( odd_size+2 ) =
829  $ cmplx( d( part_offset+odd_size+1 ) )
830 *
831 * Receive cont. to diagonal block that is stored on this proc.
832 *
833  IF( mycol.LT. npcol-1 ) THEN
834 *
835  CALL cgerv2d( ictxt, int_one, int_one,
836  $ af( odd_size+2+1 ),
837  $ int_one, 0,
838  $ mycol+1 )
839 *
840 * Add contribution to diagonal block
841 *
842  af( odd_size+2 ) = af( odd_size+2 )+af( odd_size+3 )
843 *
844  ENDIF
845 *
846 *
847 * *************************************
848 * Modification Loop
849 *
850 * The distance for sending and receiving for each level starts
851 * at 1 for the first level.
852  level_dist = 1
853 *
854 * Do until this proc is needed to modify other procs' equations
855 *
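* Illustration of this loop (added annotation, not part of the
* original comments), for NPCOL = 8: process MYCOL = 0 leaves the
* loop immediately with LEVEL_DIST = 1; MYCOL = 1 receives the
* contributions from distance 1 and leaves with LEVEL_DIST = 2;
* MYCOL = 3 receives at distances 1 and 2 and leaves with
* LEVEL_DIST = 4, mirroring the binary tree of block cyclic
* reduction.
*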
856  12 CONTINUE
857  IF( mod( (mycol+1)/level_dist, 2) .NE. 0 ) GOTO 11
858 *
859 * Receive and add contribution to diagonal block from the left
860 *
861  IF( mycol-level_dist .GE. 0 ) THEN
862  CALL cgerv2d( ictxt, int_one, int_one, work( 1 ),
863  $ int_one, 0, mycol-level_dist )
864 *
865  af( odd_size+2 ) = af( odd_size+2 )+work( 1 )
866 *
867  ENDIF
868 *
869 * Receive and add contribution to diagonal block from the right
870 *
871  IF( mycol+level_dist .LT. npcol-1 ) THEN
872  CALL cgerv2d( ictxt, int_one, int_one, work( 1 ),
873  $ int_one, 0, mycol+level_dist )
874 *
875  af( odd_size+2 ) = af( odd_size+2 )+work( 1 )
876 *
877  ENDIF
878 *
879  level_dist = level_dist*2
880 *
881  GOTO 12
882  11 CONTINUE
883 * [End of GOTO Loop]
884 *
885 *
886 * *********************************
887 * Calculate and use this proc's blocks to modify other procs'...
888  IF( af( odd_size+2 ) .EQ. czero ) THEN
889  info = npcol + mycol
890  ENDIF
891 *
892 * ****************************************************************
893 * Receive offdiagonal block from processor to right.
894 * If this is the first group of processors, the receive comes
895 * from a different processor than otherwise.
896 *
897  IF( level_dist .EQ. 1 )THEN
898  comm_proc = mycol + 1
899 *
900 * Move block into the place where it will be expected for
901 * later calculations.
902 *
903  af( work_u+odd_size+3 ) = af( odd_size+1 )
904 *
905  af( odd_size+3 ) = af( work_u+odd_size+1 )
906 *
907  ELSE
908  comm_proc = mycol + level_dist/2
909  ENDIF
910 *
911  IF( mycol/level_dist .LE. (npcol-1)/level_dist-2 )THEN
912 *
913  CALL cgerv2d( ictxt, int_one, int_one,
914  $ af( odd_size+1 ),
915  $ int_one, 0, comm_proc )
916 *
917  CALL cgerv2d( ictxt, int_one, int_one,
918  $ af( work_u+odd_size+1 ),
919  $ int_one, 0, comm_proc )
920 *
921  IF( info .EQ. 0 ) THEN
922 *
923 *
924 * Modify lower off_diagonal block with diagonal block
925 *
926 *
927  af( odd_size+1 ) = af( odd_size+1 )
928  $ / conjg( af( odd_size+2 ) )
929 *
930  ENDIF
931 * End of "if ( info.eq.0 ) then"
932 *
933 * Calculate contribution from this block to next diagonal block
934 *
935  work( 1 ) = -one*conjg( af( odd_size+1 ) )*
936  $ af( work_u+(odd_size)+1 )
937 *
938 * Send contribution to diagonal block's owning processor.
939 *
940  CALL cgesd2d( ictxt, int_one, int_one, work( 1 ), int_one,
941  $ 0, mycol+level_dist )
942 *
943  ENDIF
944 * End of "if( mycol/level_dist .le. (npcol-1)/level_dist-2 )..."
945 *
946 *
947 * ****************************************************************
948 * Receive off_diagonal block from left and use to finish with this
949 * processor.
950 *
951  IF( (mycol/level_dist .GT. 0 ).AND.
952  $ ( mycol/level_dist .LE. (npcol-1)/level_dist-1 ) ) THEN
953 *
954  IF( level_dist .GT. 1)THEN
955 *
956 * Receive offdiagonal block(s) from proc level_dist/2 to the
957 * left
958 *
959  CALL cgerv2d( ictxt, int_one, int_one,
960  $ af( work_u+odd_size+2+1 ),
961  $ int_one, 0, mycol-level_dist/2 )
962 *
963 * Receive offdiagonal block(s) from proc level_dist/2 to the
964 * left
965 *
966  CALL cgerv2d( ictxt, int_one, int_one,
967  $ af( odd_size+2+1 ),
968  $ int_one, 0, mycol-level_dist/2 )
969 *
970  ENDIF
971 *
972 *
973  IF( info.EQ.0 ) THEN
974 *
975 * Use diagonal block(s) to modify this offdiagonal block
976 *
977  af( odd_size+3 ) = af( odd_size+3 )
978  $ / ( af( odd_size+2 ) )
979 *
980 *
981  ENDIF
982 * End of "if( info.eq.0 ) then"
983 *
984 * Use offdiag block(s) to calculate modification to diag block
985 * of processor to the left
986 *
987  work( 1 ) = -one*af( odd_size+3 )
988  $ *conjg( af( work_u+odd_size+3 ) )
989 *
990 * Send contribution to diagonal block's owning processor.
991 *
992  CALL cgesd2d( ictxt, int_one, int_one, work( 1 ), int_one,
993  $ 0, mycol-level_dist )
994 *
995 * *******************************************************
996 *
997  IF( mycol/level_dist .LE. (npcol-1)/level_dist-2 ) THEN
998 *
999 * Decide which processor offdiagonal block(s) goes to
1000 *
1001  IF( ( mod( mycol/( 2*level_dist ),2 )) .EQ.0 ) THEN
1002  comm_proc = mycol + level_dist
1003  ELSE
1004  comm_proc = mycol - level_dist
1005  ENDIF
1006 *
1007 * Use offdiagonal blocks to calculate offdiag
1008 * block to send to neighboring processor. Depending
1009 * on circumstances, may need to transpose the matrix.
1010 *
1011  work( 1 ) = -one*af( work_u+odd_size+3 )
1012  $ * af( odd_size+1 )
1013 *
1014 * Send contribution to offdiagonal block's owning processor.
1015 *
1016  CALL cgesd2d( ictxt, int_one, int_one, work( 1 ), int_one,
1017  $ 0, comm_proc )
1018 *
1019  work( 1 ) = -one*af( odd_size+3 )
1020  $ * af( work_u+odd_size+1 )
1021 *
1022 * Send contribution to offdiagonal block's owning processor.
1023 *
1024  CALL cgesd2d( ictxt, int_one, int_one, work( 1 ), int_one,
1025  $ 0, comm_proc )
1026 *
1027  ENDIF
1028 *
1029  ENDIF
1030 * End of "if( mycol/level_dist.le. (npcol-1)/level_dist -1 )..."
1031 *
1032  14 CONTINUE
1033 *
1034 *
1035  1000 CONTINUE
1036 *
1037 *
1038 * Free BLACS space used to hold standard-form grid.
1039 *
1040  IF( ictxt_save .NE. ictxt_new ) THEN
1041  CALL blacs_gridexit( ictxt_new )
1042  ENDIF
1043 *
1044  1234 CONTINUE
1045 *
1046 * Restore saved input parameters
1047 *
1048  ictxt = ictxt_save
1049  np = np_save
1050 *
1051 * Output minimum worksize
1052 *
1053  work( 1 ) = work_size_min
1054 *
1055 * Make INFO consistent across processors
1056 *
1057  CALL igamx2d( ictxt, 'A', ' ', 1, 1, info, 1, info, info,
1058  $ -1, 0, 0 )
1059 *
1060  IF( mycol.EQ.0 ) THEN
1061  CALL igebs2d( ictxt, 'A', ' ', 1, 1, info, 1 )
1062  ELSE
1063  CALL igebr2d( ictxt, 'A', ' ', 1, 1, info, 1, 0, 0 )
1064  ENDIF
1065 *
1066 *
1067  RETURN
1068 *
1069 * End of PCDTTRF
1070 *
1071  END
globchk
subroutine globchk(ICTXT, N, X, LDX, IWORK, INFO)
Definition: pchkxmat.f:403
cdttrf
subroutine cdttrf(N, DL, D, DU, INFO)
Definition: cdttrf.f:2
pcdttrf
subroutine pcdttrf(N, DL, D, DU, JA, DESCA, AF, LAF, WORK, LWORK, INFO)
Definition: pcdttrf.f:3
reshape
void reshape(int *context_in, int *major_in, int *context_out, int *major_out, int *first_proc, int *nprow_new, int *npcol_new)
Definition: reshape.c:77
clatcpy
subroutine clatcpy(UPLO, M, N, A, LDA, B, LDB)
Definition: clatcpy.f:2
desc_convert
subroutine desc_convert(DESC_IN, DESC_OUT, INFO)
Definition: desc_convert.f:2
cdttrsv
subroutine cdttrsv(UPLO, TRANS, N, NRHS, DL, D, DU, B, LDB, INFO)
Definition: cdttrsv.f:3
pxerbla
subroutine pxerbla(ICTXT, SRNAME, INFO)
Definition: pxerbla.f:2