SCALAPACK 2.2.2

◆ pzgbtrf()

subroutine pzgbtrf ( integer  n,
integer  bwl,
integer  bwu,
complex*16, dimension( * )  a,
integer  ja,
integer, dimension( * )  desca,
integer, dimension( * )  ipiv,
complex*16, dimension( * )  af,
integer  laf,
complex*16, dimension( * )  work,
integer  lwork,
integer  info 
)

Definition at line 1 of file pzgbtrf.f.

3*
4* -- ScaLAPACK routine (version 2.0.2) --
5* Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver
6* May 1 2012
7*
8* .. Scalar Arguments ..
9 INTEGER BWL, BWU, INFO, JA, LAF, LWORK, N
10* ..
11* .. Array Arguments ..
12 INTEGER DESCA( * ), IPIV( * )
13 COMPLEX*16 A( * ), AF( * ), WORK( * )
14* ..
15*
16*
17* Purpose
18* =======
19*
20* PZGBTRF computes an LU factorization
21* of an N-by-N complex banded
22* distributed matrix with lower bandwidth BWL and
23* upper bandwidth BWU: A(1:N, JA:JA+N-1).
24* Reordering is used to increase parallelism in the factorization.
25* This reordering results in factors that are DIFFERENT from those
26* produced by equivalent sequential codes. These factors cannot
27* be used directly by users; however, they can be used in
28* subsequent calls to PZGBTRS to solve linear systems.
29*
30* The factorization has the form
31*
32* P A(1:N, JA:JA+N-1) Q = L U
33*
34* where U is a banded upper triangular matrix and L is banded
35* lower triangular, and P and Q are permutation matrices.
36* The matrix Q represents reordering of columns
37* for parallelism's sake, while P represents
38* reordering of rows for numerical stability using
39* classic partial pivoting.
40*
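*  Editorial illustration (a sketch, not part of the library source):
*  a typical factor/solve sequence, assuming the distributed arrays
*  A, B and the descriptors DESCA, DESCB are set up as described in
*  the NOTES below, with NB = DESCA( 4 ) and NRHS right-hand sides:
*
*     LAF = (NB+BWU)*(BWL+BWU) + 6*(BWL+BWU)*(BWL+2*BWU)
*     CALL PZGBTRF( N, BWL, BWU, A, 1, DESCA, IPIV, AF, LAF,
*    $              WORK, LWORK, INFO )
*     IF( INFO.EQ.0 ) THEN
*        CALL PZGBTRS( 'N', N, BWL, BWU, NRHS, A, 1, DESCA, IPIV,
*    $                 B, 1, DESCB, AF, LAF, WORK, LWORK, INFO )
*     END IF
*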
41* =====================================================================
42*
43* Arguments
44* =========
45*
46*
47* N (global input) INTEGER
48* The number of rows and columns to be operated on, i.e. the
49* order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
50*
51* BWL (global input) INTEGER
52* Number of subdiagonals. 0 <= BWL <= N-1
53*
54* BWU (global input) INTEGER
55* Number of superdiagonals. 0 <= BWU <= N-1
56*
57* A (local input/local output) COMPLEX*16 pointer into
58* local memory to an array with first dimension
59* LLD_A >=(2*bwl+2*bwu+1) (stored in DESCA).
60* On entry, this array contains the local pieces of the
61* N-by-N unsymmetric banded distributed matrix
62* A(1:N, JA:JA+N-1) to be factored.
63* This local portion is stored in the packed banded format
64* used in LAPACK. Please see the Notes below and the
65* ScaLAPACK manual for more detail on the format of
66* distributed matrices.
67* On exit, this array contains details
68* of the factorization.
69* Note that permutations are performed on the matrix, so that
70* the factors returned are different from those returned
71* by LAPACK.
72*
73* JA (global input) INTEGER
74* The index in the global array A that points to the start of
75* the matrix to be operated on (which may be either all of A
76* or a submatrix of A).
77*
78* DESCA (global and local input) INTEGER array of dimension DLEN.
79* If 1D type (DTYPE_A=501), DLEN >= 7;
80* if 2D type (DTYPE_A=1), DLEN >= 9.
81* The array descriptor for the distributed matrix A.
82* Contains information on the mapping of A to memory. Please
83* see NOTES below for full description and options.
84*
85* IPIV (local output) INTEGER array, dimension >= DESCA( NB ).
86* Pivot indices for local factorizations.
87* Users *should not* alter the contents between
88* factorization and solve.
89*
90* AF (local output) COMPLEX*16 array, dimension LAF.
91* Auxiliary Fillin Space.
92* Fillin is created during the factorization routine
93* PZGBTRF and this is stored in AF. If a linear system
94* is to be solved using PZGBTRS after the factorization
95* routine, AF *must not be altered* after the factorization.
96*
97* LAF (local input) INTEGER
98* Size of user-input Auxiliary Fillin space AF. Must be >=
99* (NB+bwu)*(bwl+bwu)+6*(bwl+bwu)*(bwl+2*bwu)
100* If LAF is not large enough, an error code will be returned
101* and the minimum acceptable size will be returned in AF( 1 ).
102*
103* WORK (local workspace/local output)
104* COMPLEX*16 temporary workspace. This space may
105* be overwritten in between calls to routines. WORK must be
106* the size given in LWORK.
107* On exit, WORK( 1 ) contains the minimal LWORK.
108*
109* LWORK (local input or global input) INTEGER
110* Size of user-input workspace WORK.
111* If LWORK is too small, the minimal acceptable size will be
112* returned in WORK(1) and an error code is returned. LWORK>=
113* 1
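*
*  Editorial note: as the executable checks below show, LWORK = -1
*  is accepted as a workspace query; the routine then returns the
*  minimal LWORK in WORK( 1 ) without factoring. A sketch, with
*  MINWORK a hypothetical INTEGER in the caller:
*
*     CALL PZGBTRF( N, BWL, BWU, A, 1, DESCA, IPIV, AF, LAF,
*    $              WORK, -1, INFO )
*     MINWORK = INT( WORK( 1 ) )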
114*
115* INFO (global output) INTEGER
116* = 0: successful exit
117* < 0: If the i-th argument is an array and the j-th entry had
118* an illegal value, then INFO = -(i*100+j), if the i-th
119* argument is a scalar and had an illegal value, then
120* INFO = -i.
121* > 0: If INFO = K <= NPROCS, the submatrix stored on
122* processor INFO and factored locally was singular,
123* and the factorization was not completed.
124* If INFO = K > NPROCS, the submatrix stored on
125* processor INFO-NPROCS representing interactions
126* with other processors was singular, and the
127* factorization was not completed.
128*
129*
130*
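*  For example (an editorial note): INFO = -602 reports an illegal
*  value in entry 2 of the sixth argument, DESCA; compare the
*  info = -( 6*100 + 2 ) codes set in the executable checks below.
*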
131* =====================================================================
132*
133*
134* Restrictions
135* ============
136*
137* The following are restrictions on the input parameters. Some of these
138* are temporary and will be removed in future releases, while others
139* may reflect fundamental technical limitations.
140*
141* Non-cyclic restriction: VERY IMPORTANT!
142* P*NB >= mod(JA-1,NB)+N.
143* The mapping for matrices must be blocked, reflecting the nature
144* of the divide and conquer algorithm as a task-parallel algorithm.
145* This formula in words is: no processor may have more than one
146* chunk of the matrix.
147*
148* Blocksize cannot be too small:
149* If the matrix spans more than one processor, the following
150* restriction on NB, the size of each block on each processor,
151* must hold:
152* NB >= (BWL+BWU)+1
153* The bulk of parallel computation is done on the matrix of size
154* O(NB) on each processor. If this is too small, divide and conquer
155* is a poor choice of algorithm.
156*
157* Submatrix reference:
158* JA = IB (with IB the global row index of the right-hand
159* side B in the subsequent call to PZGBTRS). This alignment
160* restriction prevents unnecessary communication.
161*
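*  Editorial sketch (not part of the library source): the three
*  restrictions above can be verified in the caller before the call,
*  with NPCOL playing the role of P and OK a hypothetical LOGICAL:
*
*     OK = ( NPCOL*NB .GE. MOD( JA-1, NB ) + N ) .AND.
*    $     ( NPCOL .EQ. 1 .OR. NB .GE. BWL+BWU+1 ) .AND.
*    $     ( JA .EQ. IB )
*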
162* =====================================================================
163*
164*
165* Notes
166* =====
167*
168* If the factorization routine and the solve routine are to be called
169* separately (to solve various sets of righthand sides using the same
170* coefficient matrix), the auxiliary space AF *must not be altered*
171* between calls to the factorization routine and the solve routine.
172*
173* The best algorithm for solving banded and tridiagonal linear systems
174* depends on a variety of parameters, especially the bandwidth.
175* Currently, only algorithms designed for the case N/P >> bw are
176* implemented. These go by many names, including Divide and Conquer,
177* Partitioning, domain decomposition-type, etc.
178*
179* Algorithm description: Divide and Conquer
180*
181* The Divide and Conquer algorithm assumes the matrix is narrowly
182* banded compared with the number of equations. In this situation,
183* it is best to distribute the input matrix A one-dimensionally,
184* with columns atomic and rows divided amongst the processes.
185* The basic algorithm divides the banded matrix up into
186* P pieces with one stored on each processor,
187* and then proceeds in 2 phases for the factorization or 3 for the
188* solution of a linear system.
189* 1) Local Phase:
190* The individual pieces are factored independently and in
191* parallel. These factors are applied to the matrix creating
192* fillin, which is stored in a non-inspectable way in auxiliary
193* space AF. Mathematically, this is equivalent to reordering
194* the matrix A as P A P^T and then factoring the principal
195* leading submatrix of size equal to the sum of the sizes of
196* the matrices factored on each processor. The factors of
197* these submatrices overwrite the corresponding parts of A
198* in memory.
199* 2) Reduced System Phase:
200* A small (max(bwl,bwu)*(P-1)) system is formed representing
201* interaction of the larger blocks, and is stored (as are its
202* factors) in the space AF. A parallel Block Cyclic Reduction
203* algorithm is used. For a linear system, a parallel front solve
204* followed by an analogous backsolve, both using the structure
205* of the factored matrix, are performed.
206* 3) Backsubstitution Phase:
207* For a linear system, a local backsubstitution is performed on
208* each processor in parallel.
209*
210*
211* Descriptors
212* ===========
213*
214* Descriptors now have *types* and differ from ScaLAPACK 1.0.
215*
216* Note: banded codes can use either the old two dimensional
217* or new one-dimensional descriptors, though the processor grid in
218* both cases *must be one-dimensional*. We describe both types below.
219*
220* Each global data object is described by an associated description
221* vector. This vector stores the information required to establish
222* the mapping between an object element and its corresponding process
223* and memory location.
224*
225* Let A be a generic term for any 2D block cyclically distributed array.
226* Such a global array has an associated description vector DESCA.
227* In the following comments, the character _ should be read as
228* "of the global array".
229*
230* NOTATION STORED IN EXPLANATION
231* --------------- -------------- --------------------------------------
232* DTYPE_A(global) DESCA( DTYPE_ ) The descriptor type. In this case,
233* DTYPE_A = 1.
234* CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
235* the BLACS process grid A is distribu-
236* ted over. The context itself is glo-
237* bal, but the handle (the integer
238* value) may vary.
239* M_A (global) DESCA( M_ ) The number of rows in the global
240* array A.
241* N_A (global) DESCA( N_ ) The number of columns in the global
242* array A.
243* MB_A (global) DESCA( MB_ ) The blocking factor used to distribute
244* the rows of the array.
245* NB_A (global) DESCA( NB_ ) The blocking factor used to distribute
246* the columns of the array.
247* RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
248* row of the array A is distributed.
249* CSRC_A (global) DESCA( CSRC_ ) The process column over which the
250* first column of the array A is
251* distributed.
252* LLD_A (local) DESCA( LLD_ ) The leading dimension of the local
253* array. LLD_A >= MAX(1,LOCr(M_A)).
254*
255* Let K be the number of rows or columns of a distributed matrix,
256* and assume that its process grid has dimension p x q.
257* LOCr( K ) denotes the number of elements of K that a process
258* would receive if K were distributed over the p processes of its
259* process column.
260* Similarly, LOCc( K ) denotes the number of elements of K that a
261* process would receive if K were distributed over the q processes of
262* its process row.
263* The values of LOCr() and LOCc() may be determined via a call to the
264* ScaLAPACK tool function, NUMROC:
265* LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
266* LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
267* An upper bound for these quantities may be computed by:
268* LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
269* LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
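*
*  For example (an editorial illustration): with M = 10, MB_A = 2 and
*  NPROW = 3, the bound gives
*  LOCr( 10 ) <= ceil( ceil(10/2)/3 )*2 = ceil(5/3)*2 = 4
*  rows on each process row.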
270*
271*
272* One-dimensional descriptors:
273*
274* One-dimensional descriptors are a new addition to ScaLAPACK since
275* version 1.0. They simplify and shorten the descriptor for 1D
276* arrays.
277*
278* Since ScaLAPACK supports two-dimensional arrays as the fundamental
279* object, we allow 1D arrays to be distributed either over the
280* first dimension of the array (as if the grid were P-by-1) or the
281* 2nd dimension (as if the grid were 1-by-P). This choice is
282* indicated by the descriptor type (501 or 502)
283* as described below.
284*
285* IMPORTANT NOTE: the actual BLACS grid represented by the
286* CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P
287* irrespective of which one-dimensional descriptor type
288* (501 or 502) is input.
289* This routine will interpret the grid properly either way.
290* ScaLAPACK routines *do not support intercontext operations* so that
291* the grid passed to a single ScaLAPACK routine *must be the same*
292* for all array descriptors passed to that routine.
293*
294* NOTE: In all cases where 1D descriptors are used, 2D descriptors
295* may also be used, since a one-dimensional array is a special case
296* of a two-dimensional array with one dimension of size unity.
297* The two-dimensional array used in this case *must* be of the
298* proper orientation:
299* If the appropriate one-dimensional descriptor is DTYPE_A=501
300* (1 by P type), then the two dimensional descriptor must
301* have a CTXT value that refers to a 1 by P BLACS grid;
302* If the appropriate one-dimensional descriptor is DTYPE_A=502
303* (P by 1 type), then the two dimensional descriptor must
304* have a CTXT value that refers to a P by 1 BLACS grid.
305*
306*
307* Summary of allowed descriptors, types, and BLACS grids:
308* DTYPE        501          502          1      1
309* BLACS grid   1xP or Px1   1xP or Px1   1xP    Px1
310* ----------------------------------------------------
311* A            OK           NO           OK     NO
312* B            NO           OK           NO     OK
313*
314* Let A be a generic term for any 1D block cyclically distributed array.
315* Such a global array has an associated description vector DESCA.
316* In the following comments, the character _ should be read as
317* "of the global array".
318*
319* NOTATION STORED IN EXPLANATION
320* --------------- ---------- ------------------------------------------
321* DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
322* TYPE_A = 501: 1-by-P grid.
323* TYPE_A = 502: P-by-1 grid.
324* CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
325* the BLACS process grid A is distribu-
326* ted over. The context itself is glo-
327* bal, but the handle (the integer
328* value) may vary.
329* N_A (global) DESCA( 3 ) The size of the array dimension being
330* distributed.
331* NB_A (global) DESCA( 4 ) The blocking factor used to distribute
332* the distributed dimension of the array.
333* SRC_A (global) DESCA( 5 ) The process row or column over which the
334* first row or column of the array
335* is distributed.
336* LLD_A (local) DESCA( 6 ) The leading dimension of the local array
337* storing the local blocks of the distri-
338* buted array A. Minimum value of LLD_A
339* depends on TYPE_A.
340* TYPE_A = 501: LLD_A >=
341* max( size of undistributed dimension, 1 ).
342* TYPE_A = 502: LLD_A >= max( NB_A, 1 ).
343* Reserved DESCA( 7 ) Reserved for future use.
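*
*  Editorial illustration (a sketch, not in the library source):
*  filling a type-501 descriptor for the banded matrix passed to
*  PZGBTRF, assuming ICTXT holds a 1-by-NPCOL BLACS grid handle:
*
*     DESCA( 1 ) = 501
*     DESCA( 2 ) = ICTXT
*     DESCA( 3 ) = N
*     DESCA( 4 ) = NB
*     DESCA( 5 ) = 0
*     DESCA( 6 ) = 2*BWL + 2*BWU + 1
*     DESCA( 7 ) = 0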
344*
345*
346*
347* =====================================================================
348*
349* Implemented for ScaLAPACK by:
350* Andrew J. Cleary, Livermore National Lab and University of Tenn.,
351* and Markus Hegland, Australian National University. Feb., 1997.
352* Based on code written by : Peter Arbenz, ETH Zurich, 1996.
353*
354* =====================================================================
355*
356* .. Parameters ..
357 DOUBLE PRECISION ONE, ZERO
358 parameter( one = 1.0d+0 )
359 parameter( zero = 0.0d+0 )
360 COMPLEX*16 CONE, CZERO
361 parameter( cone = ( 1.0d+0, 0.0d+0 ) )
362 parameter( czero = ( 0.0d+0, 0.0d+0 ) )
363 INTEGER INT_ONE
364 parameter( int_one = 1 )
365 INTEGER DESCMULT, BIGNUM
366 parameter( descmult = 100, bignum = descmult*descmult )
367 INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
368 $ LLD_, MB_, M_, NB_, N_, RSRC_
369 parameter( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,
370 $ ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,
371 $ rsrc_ = 7, csrc_ = 8, lld_ = 9 )
372* ..
373* .. Local Scalars ..
374 INTEGER APTR, BBPTR, BIPTR, BM, BM1, BM2, BMN, BN, BW,
375 $ CSRC, DBPTR, FIRST_PROC, I, ICTXT, ICTXT_NEW,
376 $ ICTXT_SAVE, IDUM3, J, JA_NEW, JPTR, L, LAF_MIN,
377 $ LBWL, LBWU, LDB, LDBB, LLDA, LM, LMJ, LN, LNJ,
378 $ LPTR, MYCOL, MYROW, MY_NUM_COLS, NB, NEICOL,
379 $ NP, NPACT, NPCOL, NPROW, NPSTR, NP_SAVE, NRHS,
380 $ ODD_N, ODD_SIZE, ODPTR, OFST, PART_OFFSET,
381 $ PART_SIZE, RETURN_CODE, STORE_N_A,
382 $ WORK_SIZE_MIN
383* ..
384* .. Local Arrays ..
385 INTEGER DESCA_1XP( 7 ), PARAM_CHECK( 9, 3 )
386* ..
387* .. External Subroutines ..
388 EXTERNAL blacs_gridexit, blacs_gridinfo, desc_convert,
389 $ globchk, pxerbla, reshape, zaxpy, zgemm,
390 $ zgerv2d, zgesd2d, zlamov, zlatcpy, zpbtrf,
391 $ zpotrf, zsyrk, ztbtrs, ztrmm, ztrrv2d, ztrsd2d,
392 $ ztrsm, ztrtrs
393* ..
394* .. External Functions ..
395 LOGICAL LSAME
396 INTEGER NUMROC
397 EXTERNAL lsame, numroc
398* ..
399* .. Intrinsic Functions ..
400 INTRINSIC ichar, min, mod
401* ..
402* .. Executable Statements ..
403*
404*
405* Test the input parameters
406*
407 info = 0
408*
409* Convert descriptor into standard form for easy access to
410* parameters, check that grid is of right shape.
411*
412 desca_1xp( 1 ) = 501
413*
414 CALL desc_convert( desca, desca_1xp, return_code )
415*
416 IF( return_code .NE. 0) THEN
417 info = -( 6*100 + 2 )
418 ENDIF
419*
420* Get values out of descriptor for use in code.
421*
422 ictxt = desca_1xp( 2 )
423 csrc = desca_1xp( 5 )
424 nb = desca_1xp( 4 )
425 llda = desca_1xp( 6 )
426 store_n_a = desca_1xp( 3 )
427*
428* Get grid parameters
429*
430*
431 CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
432 np = nprow * npcol
433*
434*
435*
436 IF( lwork .LT. -1) THEN
437 info = -11
438 ELSE IF ( lwork .EQ. -1 ) THEN
439 idum3 = -1
440 ELSE
441 idum3 = 1
442 ENDIF
443*
444 IF( n .LT. 0 ) THEN
445 info = -1
446 ENDIF
447*
448 IF( n+ja-1 .GT. store_n_a ) THEN
449 info = -( 6*100 + 6 )
450 ENDIF
451*
452 IF(( bwl .GT. n-1 ) .OR.
453 $ ( bwl .LT. 0 ) ) THEN
454 info = -2
455 ENDIF
456*
457 IF(( bwu .GT. n-1 ) .OR.
458 $ ( bwu .LT. 0 ) ) THEN
459 info = -3
460 ENDIF
461*
462 IF( llda .LT. (2*bwl+2*bwu+1) ) THEN
463 info = -( 6*100 + 6 )
464 ENDIF
465*
466 IF( nb .LE. 0 ) THEN
467 info = -( 6*100 + 4 )
468 ENDIF
469*
470 bw = bwu+bwl
471*
472* Argument checking that is specific to Divide & Conquer routine
473*
474 IF( nprow .NE. 1 ) THEN
475 info = -( 6*100+2 )
476 ENDIF
477*
478 IF( n .GT. np*nb-mod( ja-1, nb )) THEN
479 info = -( 1 )
480 CALL pxerbla( ictxt,
481 $ 'PZGBTRF, D&C alg.: only 1 block per proc',
482 $ -info )
483 RETURN
484 ENDIF
485*
486 IF((ja+n-1.GT.nb) .AND. ( nb.LT.(bwl+bwu+1) )) THEN
487 info = -( 6*100+4 )
488 CALL pxerbla( ictxt,
489 $ 'PZGBTRF, D&C alg.: NB too small',
490 $ -info )
491 RETURN
492 ENDIF
493*
494*
495* Check auxiliary storage size
496*
497 laf_min = (nb+bwu)*(bwl+bwu)+6*(bwl+bwu)*(bwl+2*bwu)
498*
499 IF( laf .LT. laf_min ) THEN
500 info = -9
501* put minimum value of laf into AF( 1 )
502 af( 1 ) = laf_min
503 CALL pxerbla( ictxt,
504 $ 'PZGBTRF: auxiliary storage error ',
505 $ -info )
506 RETURN
507 ENDIF
508*
509* Check worksize
510*
511 work_size_min = 1
512*
513 work( 1 ) = work_size_min
514*
515 IF( lwork .LT. work_size_min ) THEN
516 IF( lwork .NE. -1 ) THEN
517 info = -11
518* put minimum value of work into work( 1 )
519 work( 1 ) = work_size_min
520 CALL pxerbla( ictxt,
521 $ 'PZGBTRF: worksize error ',
522 $ -info )
523 ENDIF
524 RETURN
525 ENDIF
526*
527* Pack params and positions into arrays for global consistency check
528*
529 param_check( 9, 1 ) = desca(5)
530 param_check( 8, 1 ) = desca(4)
531 param_check( 7, 1 ) = desca(3)
532 param_check( 6, 1 ) = desca(1)
533 param_check( 5, 1 ) = ja
534 param_check( 4, 1 ) = bwu
535 param_check( 3, 1 ) = bwl
536 param_check( 2, 1 ) = n
537 param_check( 1, 1 ) = idum3
538*
539 param_check( 9, 2 ) = 605
540 param_check( 8, 2 ) = 604
541 param_check( 7, 2 ) = 603
542 param_check( 6, 2 ) = 601
543 param_check( 5, 2 ) = 5
544 param_check( 4, 2 ) = 3
545 param_check( 3, 2 ) = 2
546 param_check( 2, 2 ) = 1
547 param_check( 1, 2 ) = 11
548*
549* Want to find errors with MIN( ), so if no error, set it to a big
550* number. If there already is an error, multiply by the
551* descriptor multiplier.
552*
553 IF( info.GE.0 ) THEN
554 info = bignum
555 ELSE IF( info.LT.-descmult ) THEN
556 info = -info
557 ELSE
558 info = -info * descmult
559 END IF
560*
561* Check consistency across processors
562*
563 CALL globchk( ictxt, 9, param_check, 9,
564 $ param_check( 1, 3 ), info )
565*
566* Prepare output: set info = 0 if no error, and divide by DESCMULT
567* if error is not in a descriptor entry.
568*
569 IF( info.EQ.bignum ) THEN
570 info = 0
571 ELSE IF( mod( info, descmult ) .EQ. 0 ) THEN
572 info = -info / descmult
573 ELSE
574 info = -info
575 END IF
576*
577 IF( info.LT.0 ) THEN
578 CALL pxerbla( ictxt, 'PZGBTRF', -info )
579 RETURN
580 END IF
581*
582* Quick return if possible
583*
584 IF( n.EQ.0 )
585 $ RETURN
586*
587*
588* Adjust addressing into matrix space to properly get into
589* the beginning part of the relevant data
590*
591 part_offset = nb*( (ja-1)/(npcol*nb) )
592*
593 IF ( (mycol-csrc) .LT. (ja-part_offset-1)/nb ) THEN
594 part_offset = part_offset + nb
595 ENDIF
596*
597 IF ( mycol .LT. csrc ) THEN
598 part_offset = part_offset - nb
599 ENDIF
600*
601* Form a new BLACS grid (the "standard form" grid) with only procs
602* holding part of the matrix, of size 1xNP where NP is adjusted,
603* starting at csrc=0, with JA modified to reflect dropped procs.
604*
605* First processor to hold part of the matrix:
606*
607 first_proc = mod( ( ja-1 )/nb+csrc, npcol )
608*
609* Calculate the new JA while dropping off unused processors.
610*
611 ja_new = mod( ja-1, nb ) + 1
612*
613* Save and compute new value of NP
614*
615 np_save = np
616 np = ( ja_new+n-2 )/nb + 1
617*
618* Call utility routine that forms "standard-form" grid
619*
620 CALL reshape( ictxt, int_one, ictxt_new, int_one,
621 $ first_proc, int_one, np )
622*
623* Use new context from standard grid as context.
624*
625 ictxt_save = ictxt
626 ictxt = ictxt_new
627 desca_1xp( 2 ) = ictxt_new
628*
629* Get information about new grid.
630*
631 CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
632*
633* Drop out processors that do not have part of the matrix.
634*
635 IF( myrow .LT. 0 ) THEN
636 GOTO 1234
637 ENDIF
638*
639* ********************************
640* Values reused throughout routine
641*
642* User-input value of partition size
643*
644 part_size = nb
645*
646* Number of columns in each processor
647*
648 my_num_cols = numroc( n, part_size, mycol, 0, npcol )
649*
650* Offset in columns to beginning of main partition in each proc
651*
652 IF ( mycol .EQ. 0 ) THEN
653 part_offset = part_offset+mod( ja_new-1, part_size )
654 my_num_cols = my_num_cols - mod(ja_new-1, part_size )
655 ENDIF
656*
657* Offset in elements
658*
659 ofst = part_offset*llda
660*
661* Size of main (or odd) partition in each processor
662*
663 odd_size = numroc( n, part_size, mycol, 0, npcol )
664*
665*
666* Zero out space for fillin
667*
668 DO 10 i = 1, laf_min
669 af( i ) = czero
670 10 CONTINUE
671*
672 DO 9 j = 1, odd_size
673 DO 8 i = 1, bw
674 a( i+(j-1)*llda ) = czero
675 8 CONTINUE
676 9 CONTINUE
677*
678* Begin main code
679*
680********************************************************************
681* PHASE 1: Local computation phase.
682********************************************************************
683*
684*
685* Transfer triangle B_i of local matrix to next processor
686* for fillin. Overlap the send with the factorization of A_i.
687*
688 IF (mycol .LE. npcol-2) THEN
689*
690* The last processor does not need to send anything.
691* BIPTR = location of triangle B_i in memory
692 biptr = (nb-bw)*llda + 2*bw+1
693*
694 CALL ztrsd2d( ictxt, 'U', 'N',
695 $ min( bw, bwu+numroc( n, nb, mycol+1, 0, npcol ) ),
696 $ bw, a(biptr), llda-1, 0, mycol+1)
697*
698 ENDIF
699*
700* Factor main partition P_i A_i = L_i U_i on each processor
701*
702* LBWL, LBWU: lower and upper bandwidth of local solver
703* Note that for MYCOL > 0 one has lower triangular blocks!
704* LM is the number of rows, which is usually NB, except for
705* MYCOL = 0, where it is BWU less, and for MYCOL = NPCOL-1, where
706* it is NR+BWU, with NR the number of columns on the last processor.
707* Finally, APTR is the pointer to the first element of A. As LAPACK
708* uses a slightly different matrix format than ScaLAPACK, the
709* pointer has to be adjusted on processor MYCOL = 0.
710*
711 IF (mycol .NE. 0) THEN
712 lbwl = bw
713 lbwu = 0
714 aptr = 1
715 ELSE
716 lbwl = bwl
717 lbwu = bwu
718 aptr = 1+bwu
719 ENDIF
720*
721 IF (mycol .NE. npcol-1) THEN
722 lm = nb - lbwu
723 ln = nb - bw
724 ELSE IF (mycol .NE. 0) THEN
725 lm = odd_size + bwu
726 ln = max(odd_size-bw,0)
727 ELSE
728 lm = n
729 ln = max( n-bw, 0 )
730 ENDIF
731*
732 IF (ln .GT. 0) THEN
733*
734 CALL zgbtrf(lm,ln, lbwl,lbwu, a(aptr),llda, ipiv, info)
735*
736 IF( info.NE.0 ) THEN
737 info = info + nb*mycol
738 GO TO 90
739 END IF
740*
741 nrhs = bw
742 ldb = llda-1
743*
744* Update the last BW columns of A_i (code modified from ZGBTRS)
745*
746* Only the eliminations of unknowns > LN-BW have an effect on
747* the last BW columns. Loop over them...
748*
749 DO 23 j = max(ln-bw+1,1), ln
750*
751 lmj = min( lbwl, lm-j )
752 lnj = min( bw, j+bw-ln+aptr-1 )
753*
754 l = ipiv( j )
755*
756 jptr = j-(ln+1)+2*bw+1-lbwl + ln*llda
757*
758 IF( l.NE.j ) THEN
759*
760* Element (L,LN+1) is swapped with element (J,LN+1) etc
761* Furthermore, the elements in the same row are LDB=LLDA-1 apart
762* The complicated formulas are to cope with the banded
763* data format:
764*
765 lptr = l-(ln+1)+2*bw+1-lbwl + ln*llda
766*
767 CALL zswap( lnj, a(lptr),ldb, a(jptr), ldb )
768*
769 ENDIF
770*
771* LPTR is the pointer to the beginning of the
772* coefficients of L
773*
774 lptr = bw+1+aptr + (j-1)*llda
775*
776 CALL zgeru(lmj,lnj,-cone, a(lptr),1, a(jptr),ldb,
777 $ a(jptr+1),ldb)
778 23 CONTINUE
779*
780 ENDIF
781*
782* Compute spike fill-in, L_i F_i = P_i B_{i-1}
783*
784* Receive triangle B_{i-1} from previous processor
785*
786 IF (mycol .GT. 0) THEN
787*
788 CALL ztrrv2d( ictxt, 'U', 'N', min(bw, lm), bw, af( 1 ),
789 $ lm, 0, mycol-1)
790*
791*
792* Permutation and forward elimination (triang. solve)
793*
794 DO 24 j = 1, ln
795*
796 lmj = min( lbwl, lm-j )
797 l = ipiv( j )
798*
799 IF( l .NE. j ) THEN
800*
801 CALL zswap(nrhs, af(l), lm, af(j), lm )
802 ENDIF
803*
804 lptr = bw+1+aptr + (j-1)*llda
805*
806 CALL zgeru( lmj,nrhs, -cone, a(lptr),1,
807 $ af(j), lm, af(j+1), lm)
808*
809 24 CONTINUE
810*
811 ENDIF
812*
813 90 CONTINUE
814*
815********************************************************************
816* PHASE 2: Formation and factorization of Reduced System.
817********************************************************************
818*
819* Define the initial dimensions of the diagonal blocks
820* The offdiagonal blocks (for MYCOL > 0) are of size BM by BW
821*
822 IF (mycol .NE. npcol-1) THEN
823 bm = bw - lbwu
824 bn = bw
825 ELSE
826 bm = min(bw,odd_size) + bwu
827 bn = min(bw,odd_size)
828 ENDIF
829*
830* Pointer to first element of block bidiagonal matrix in AF
831* Leading dimension of block bidiagonal system
832*
833 bbptr = (nb+bwu)*bw + 1
834 ldbb = 2*bw + bwu
835*
836* Copy from A and AF into block bidiagonal matrix (tail of AF)
837*
838* DBPTR = Pointer to diagonal blocks in A
839 dbptr = bw+1 + lbwu + ln*llda
840*
841 CALL zlamov('G',bm,bn, a(dbptr),llda-1,
842 $ af(bbptr + bw*ldbb),ldbb)
843*
844* Zero out any junk entries that were copied
845*
846 DO 870 j=1, bm
847 DO 880 i=j+lbwl, bm-1
848 af( bbptr+bw*ldbb+(j-1)*ldbb+i ) = czero
849 880 CONTINUE
850 870 CONTINUE
851*
852 IF (mycol .NE. 0) THEN
853*
854* ODPTR = Pointer to offdiagonal blocks in A
855*
856 odptr = lm-bm+1
857 CALL zlamov('G',bm,bw, af(odptr),lm,
858 $ af(bbptr +2*bw*ldbb),ldbb)
859 ENDIF
860*
861 IF (npcol.EQ.1) THEN
862*
863* In this case the loop over the levels will not be
864* performed.
865 CALL zgetrf( n-ln, n-ln, af(bbptr+bw*ldbb), ldbb,
866 $ ipiv(ln+1), info)
867*
868 ENDIF
869*
870* Loop over levels ... only occurs if npcol > 1
871*
872* The two integers NPACT (number of active processors) and NPSTR
873* (stride between active processors) are used to control the
874* loop.
875*
876 npact = npcol
877 npstr = 1
878*
879* Begin loop over levels
880*
881 200 IF (npact .LE. 1) GOTO 300
882*
883* Test if processor is active
884*
885 IF (mod(mycol,npstr) .EQ. 0) THEN
886*
887* Send/Receive blocks
888*
889*
890 IF (mod(mycol,2*npstr) .EQ. 0) THEN
891*
892* This node will potentially do more work later
893*
894 neicol = mycol + npstr
895*
896 IF (neicol/npstr .LT. npact-1) THEN
897 bmn = bw
898 ELSE IF (neicol/npstr .EQ. npact-1) THEN
899 odd_n = numroc(n, nb, npcol-1, 0, npcol)
900 bmn = min(bw,odd_n) + bwu
901 ELSE
902*
903* Last processor skips to next level
904 GOTO 250
905 ENDIF
906*
907* BM1 = M for the 1st block of the proc pair, BM2 for the 2nd block
908*
909 bm1 = bm
910 bm2 = bmn
911*
912 IF (neicol/npstr .LE. npact-1 )THEN
913*
914 CALL zgesd2d( ictxt, bm, 2*bw, af(bbptr+bw*ldbb),
915 $ ldbb, 0, neicol )
916*
917 CALL zgerv2d( ictxt, bmn, 2*bw, af(bbptr+bm),
918 $ ldbb, 0, neicol)
919*
920 IF( npact .EQ. 2 ) THEN
921*
922* Copy diagonal block to align whole system
923*
924 CALL zlamov( 'G', bmn, bw, af( bbptr+bm ),
925 $ ldbb, af( bbptr+2*bw*ldbb+bm ), ldbb )
926 ENDIF
927*
928 ENDIF
929*
930 ELSE
931*
932* This node stops work after this stage -- an extra copy
933* is required to make the odd and even frontal matrices
934* look identical
935*
936 neicol = mycol - npstr
937*
938 IF (neicol .EQ. 0) THEN
939 bmn = bw - bwu
940 ELSE
941 bmn = bw
942 ENDIF
943*
944 bm1 = bmn
945 bm2 = bm
946*
947 CALL zgesd2d( ictxt, bm, 2*bw, af(bbptr+bw*ldbb),
948 $ ldbb, 0, neicol )
949*
950 CALL zlamov('G',bm, 2*bw, af(bbptr+bw*ldbb),ldbb,
951 $ af(bbptr+bmn),ldbb)
952*
953 DO 31 j=bbptr+2*bw*ldbb, bbptr+3*bw*ldbb-1, ldbb
954 DO 32 i=0, ldbb-1
955 af(i+j) = czero
956 32 CONTINUE
957 31 CONTINUE
958*
959 CALL zgerv2d( ictxt, bmn, 2*bw, af(bbptr+bw*ldbb),
960 $ ldbb, 0, neicol)
961*
962 IF( npact .EQ. 2 ) THEN
963*
964* Copy diagonal block to align whole system
965*
966 CALL zlamov( 'G', bm, bw, af( bbptr+bmn ),
967 $ ldbb, af( bbptr+2*bw*ldbb+bmn ), ldbb )
968 ENDIF
969*
970 ENDIF
971*
972* LU factorization with partial pivoting
973*
974 IF (npact .NE. 2) THEN
975*
976 CALL zgetrf(bm+bmn, bw, af(bbptr+bw*ldbb), ldbb,
977 $ ipiv(ln+1), info)
978*
979* Backsolve left side
980*
981 DO 301 j=bbptr,bbptr+bw*ldbb-1, ldbb
982 DO 302 i=0, bm1-1
983 af(i+j) = czero
984 302 CONTINUE
985 301 CONTINUE
986*
987 CALL zlaswp(bw, af(bbptr), ldbb, 1, bw,
988 $ ipiv(ln+1), 1)
989*
990 CALL ztrsm('L','L','N','U', bw, bw, cone,
991 $ af(bbptr+bw*ldbb), ldbb, af(bbptr), ldbb)
992*
993* Use partial factors to update remainder
994*
995 CALL zgemm( 'N', 'N', bm+bmn-bw, bw, bw,
996 $ -cone, af(bbptr+bw*ldbb+bw), ldbb,
997 $ af( bbptr ), ldbb, cone,
998 $ af( bbptr+bw ), ldbb )
999*
1000* Backsolve right side
1001*
1002 nrhs = bw
1003*
1004 CALL zlaswp(nrhs, af(bbptr+2*bw*ldbb), ldbb, 1, bw,
1005 $ ipiv(ln+1), 1)
1006*
1007 CALL ztrsm('L','L','N','U', bw, nrhs, cone,
1008 $ af(bbptr+bw*ldbb), ldbb, af(bbptr+2*bw*ldbb), ldbb)
1009*
1010* Use partial factors to update remainder
1011*
1012 CALL zgemm( 'N', 'N', bm+bmn-bw, nrhs, bw,
1013 $ -cone, af(bbptr+bw*ldbb+bw), ldbb,
1014 $ af( bbptr+2*bw*ldbb ), ldbb, cone,
1015 $ af( bbptr+2*bw*ldbb+bw ), ldbb )
1016*
1017*
1018* Test if processor is active in next round
1019*
1020 IF (mod(mycol,2*npstr) .EQ. 0) THEN
1021*
1022* Reset BM
1023*
1024 bm = bm1+bm2-bw
1025*
1026* Local copying in the block bidiagonal area
1027*
1028*
1029 CALL zlamov('G',bm,bw,
1030 $ af(bbptr+bw),
1031 $ ldbb, af(bbptr+bw*ldbb), ldbb)
1032 CALL zlamov('G',bm,bw,
1033 $ af(bbptr+2*bw*ldbb+bw),
1034 $ ldbb, af(bbptr+2*bw*ldbb), ldbb)
1035*
1036* Zero out space that held original copy
1037*
1038 DO 1020 j=0, bw-1
1039 DO 1021 i=0, bm-1
1040 af(bbptr+2*bw*ldbb+bw+j*ldbb+i) = czero
1041 1021 CONTINUE
1042 1020 CONTINUE
1043*
1044 ENDIF
1045*
1046 ELSE
1047*
1048* Factor the final 2 by 2 block matrix
1049*
1050 CALL zgetrf(bm+bmn,bm+bmn, af(bbptr+bw*ldbb), ldbb,
1051 $ ipiv(ln+1), info)
1052 ENDIF
1053*
1054 ENDIF
1055*
1056* Last processor in an odd-sized NPACT skips to here
1057*
1058 250 CONTINUE
1059*
1060 npact = (npact + 1)/2
1061 npstr = npstr * 2
1062 GOTO 200
1063*
1064 300 CONTINUE
1065* End loop over levels
1066*
1067 1000 CONTINUE
1068* If an error was found in Phase 1, processors jump here.
1069*
1070* Free BLACS space used to hold standard-form grid.
1071*
1072 ictxt = ictxt_save
1073 IF( ictxt.NE.ictxt_new ) THEN
1074 CALL blacs_gridexit( ictxt_new )
1075 END IF
1076*
1077 1234 CONTINUE
1078* If this processor did not hold part of the grid, it
1079* jumps here.
1080*
1081* Restore saved input parameters
1082*
1083 ictxt = ictxt_save
1084 np = np_save
1085*
1086* Output worksize
1087*
1088 work( 1 ) = work_size_min
1089*
1090* Make INFO consistent across processors
1091*
1092 CALL igamx2d( ictxt, 'A', ' ', 1, 1, info, 1, info, info,
1093 $ -1, 0, 0 )
1094*
1095 IF( mycol.EQ.0 ) THEN
1096 CALL igebs2d( ictxt, 'A', ' ', 1, 1, info, 1 )
1097 ELSE
1098 CALL igebr2d( ictxt, 'A', ' ', 1, 1, info, 1, 0, 0 )
1099 ENDIF
1100*
1101*
1102 RETURN
1103*
1104* End of PZGBTRF
1105*