dc/d66/pslaqr0_8f_source.html

      RECURSIVE SUBROUTINE pslaqr0( WANTT, WANTZ, N, ILO, IHI, H,

     $     DESCH, WR, WI, ILOZ, IHIZ, Z, DESCZ, WORK, LWORK,

     $     IWORK, LIWORK, INFO, RECLEVEL )

*

*     Contribution from the Department of Computing Science and HPC2N,

*     Umea University, Sweden

*

*  -- ScaLAPACK auxiliary routine (version 2.0.1) --

*     University of Tennessee, Knoxville, Oak Ridge National Laboratory,

*     Univ. of Colorado Denver and University of California, Berkeley.

*     January, 2012

*

      IMPLICIT NONE

*

*     .. Scalar Arguments ..

      INTEGER            ihi, ihiz, ilo, iloz, info, liwork, lwork, n,

     $                   reclevel

      LOGICAL            wantt, wantz

*     ..

*     .. Array Arguments ..

      INTEGER            desch( * ), descz( * ), iwork( * )

      REAL               h( * ), wi( n ), work( * ), wr( n ),

     $                   z( * )

*     ..

*

*  Purpose

*  =======

*

*  PSLAQR0 computes the eigenvalues of a Hessenberg matrix H

*  and, optionally, the matrices T and Z from the Schur decomposition

*  H = Z*T*Z**T, where T is an upper quasi-triangular matrix (the

*  Schur form), and Z is the orthogonal matrix of Schur vectors.

*

*  Optionally Z may be postmultiplied into an input orthogonal

*  matrix Q so that this routine can give the Schur factorization

*  of a matrix A which has been reduced to the Hessenberg form H

*  by the orthogonal matrix Q:

*       A = Q * H * Q**T = (QZ) * T * (QZ)**T.

*

*  Notes

*  =====

*

*  Each global data object is described by an associated description

*  vector.  This vector stores the information required to establish

*  the mapping between an object element and its corresponding process

*  and memory location.

*

*  Let A be a generic term for any 2D block cyclically distributed array.

*  Such a global array has an associated description vector DESCA.

*  In the following comments, the character _ should be read as

*  "of the global array".

*

*  NOTATION        STORED IN      EXPLANATION

*  --------------- -------------- --------------------------------------

*  DTYPE_A(global) DESCA( DTYPE_ )The descriptor type.  In this case,

*                                 DTYPE_A = 1.

*  CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating

*                                 the BLACS process grid A is distribu-

*                                 ted over. The context itself is glo-

*                                 bal, but the handle (the integer

*                                 value) may vary.

*  M_A    (global) DESCA( M_ )    The number of rows in the global

*                                 array A.

*  N_A    (global) DESCA( N_ )    The number of columns in the global

*                                 array A.

*  MB_A   (global) DESCA( MB_ )   The blocking factor used to distribute

*                                 the rows of the array.

*  NB_A   (global) DESCA( NB_ )   The blocking factor used to distribute

*                                 the columns of the array.

*  RSRC_A (global) DESCA( RSRC_ ) The process row over which the first

*                                 row of the array A is distributed.

*  CSRC_A (global) DESCA( CSRC_ ) The process column over which the

*                                 first column of the array A is

*                                 distributed.

*  LLD_A  (local)  DESCA( LLD_ )  The leading dimension of the local

*                                 array.  LLD_A >= MAX(1,LOCr(M_A)).

*

*  Let K be the number of rows or columns of a distributed matrix,

*  and assume that its process grid has dimension p x q.

*  LOCr( K ) denotes the number of elements of K that a process

*  would receive if K were distributed over the p processes of its

*  process column.

*  Similarly, LOCc( K ) denotes the number of elements of K that a

*  process would receive if K were distributed over the q processes of

*  its process row.

*  The values of LOCr() and LOCc() may be determined via a call to the

*  ScaLAPACK tool function, NUMROC:

*          LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),

*          LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).

*  An upper bound for these quantities may be computed by:

*          LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A

*          LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A

*

*  Arguments

*  =========

*

*  WANTT   (global input) LOGICAL

*          = .TRUE. : the full Schur form T is required;

*          = .FALSE.: only eigenvalues are required.

*

*  WANTZ   (global input) LOGICAL

*          = .TRUE. : the matrix of Schur vectors Z is required;

*          = .FALSE.: Schur vectors are not required.

*

*  N       (global input) INTEGER

*          The order of the Hessenberg matrix H (and Z if WANTZ).

*          N >= 0.

*

*  ILO     (global input) INTEGER

*  IHI     (global input) INTEGER

*          It is assumed that H is already upper triangular in rows

*          and columns 1:ILO-1 and IHI+1:N. ILO and IHI are normally

*          set by a previous call to PSGEBAL, and then passed to PSGEHRD

*          when the matrix output by PSGEBAL is reduced to Hessenberg

*          form. Otherwise ILO and IHI should be set to 1 and N

*          respectively.  If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N.

*          If N = 0, then ILO = 1 and IHI = 0.

*

*  H       (global input/output) REAL             array, dimension

*          (DESCH(LLD_),*)

*          On entry, the upper Hessenberg matrix H.

*          On exit, if WANTT is .TRUE., H is upper quasi-triangular in

*          rows and columns ILO:IHI, with 1-by-1 and 2-by-2 blocks on

*          the main diagonal.  The 2-by-2 diagonal blocks (corresponding

*          to complex conjugate pairs of eigenvalues) are returned in

*          standard form, with H(i,i) = H(i+1,i+1) and

*          H(i+1,i)*H(i,i+1).LT.0. If INFO = 0 and WANTT is .FALSE., the

*          contents of H are unspecified on exit.

*

*  DESCH   (global and local input) INTEGER array of dimension DLEN_.

*          The array descriptor for the distributed matrix H.

*

*  WR      (global output) REAL             array, dimension (N)

*  WI      (global output) REAL             array, dimension (N)

*          The real and imaginary parts, respectively, of the computed

*          eigenvalues ILO to IHI are stored in the corresponding

*          elements of WR and WI. If two eigenvalues are computed as a

*          complex conjugate pair, they are stored in consecutive

*          elements of WR and WI, say the i-th and (i+1)th, with

*          WI(i) > 0 and WI(i+1) < 0. If WANTT is .TRUE., the

*          eigenvalues are stored in the same order as on the diagonal

*          of the Schur form returned in H.

*

*  Z       (global input/output) REAL             array.

*          If WANTZ is .TRUE., on entry Z must contain the current

*          matrix Z of accumulated transformations from, e.g., PSGEHRD,

*          and on exit Z has been updated; transformations are applied

*          only to the submatrix Z(ILO:IHI,ILO:IHI).

*          If WANTZ is .FALSE., Z is not referenced.

*          If COMPZ = 'I', on entry Z need not be set and on exit,

*          if INFO = 0, Z contains the orthogonal matrix Z of the Schur

*           vectors of H.

*

*  DESCZ   (global and local input) INTEGER array of dimension DLEN_.

*          The array descriptor for the distributed matrix Z.

*

*  WORK    (local workspace) REAL             array, dimension(DWORK)

*

*  LWORK   (local input) INTEGER

*          The length of the workspace array WORK.

*

*  IWORK   (local workspace) INTEGER array, dimension (LIWORK)

*

*  LIWORK  (local input) INTEGER

*          The length of the workspace array IWORK.

*

*  INFO    (output) INTEGER

*          =    0:  successful exit

*          .LT. 0:  if INFO = -i, the i-th argument had an illegal

*                   value

*          .GT. 0:  if INFO = i, PSLAQR0 failed to compute all of

*                   the eigenvalues.  Elements 1:ilo-1 and i+1:n of WR

*                   and WI contain those eigenvalues which have been

*                   successfully computed.  (Failures are rare.)

*

*                If INFO .GT. 0 and JOB = 'E', then on exit, the

*                remaining unconverged eigenvalues are the eigen-

*                values of the upper Hessenberg matrix rows and

*                columns ILO through INFO of the final, output

*                value of H.

*

*                If INFO .GT. 0 and JOB   = 'S', then on exit

*

*           (*)  (initial value of H)*U  = U*(final value of H)

*

*                where U is an orthogonal matrix.  The final

*                value of H is upper Hessenberg and quasi-triangular

*                in rows and columns INFO+1 through IHI.

*

*                If INFO .GT. 0 and COMPZ = 'V', then on exit

*

*                  (final value of Z)  =  (initial value of Z)*U

*

*                where U is the orthogonal matrix in (*) (regard-

*                less of the value of JOB.)

*

*                If INFO .GT. 0 and COMPZ = 'I', then on exit

*                      (final value of Z)  = U

*                where U is the orthogonal matrix in (*) (regard-

*                less of the value of JOB.)

*

*                If INFO .GT. 0 and COMPZ = 'N', then Z is not

*                accessed.

*

*     ================================================================

*     Based on contributions by

*        Robert Granat, Department of Computing Science and HPC2N,

*        Umea University, Sweden.

*     ================================================================

*

*     Restrictions: The block size in H and Z must be square and larger

*     than or equal to six (6) due to restrictions in PSLAQR1, PSLAQR5

*     and SLAQR6. Moreover, H and Z need to be distributed identically

*     with the same context.

*

*     ================================================================

*     References:

*       K. Braman, R. Byers, and R. Mathias,

*       The Multi-Shift QR Algorithm Part I: Maintaining Well Focused

*       Shifts, and Level 3 Performance.

*       SIAM J. Matrix Anal. Appl., 23(4):929--947, 2002.

*

*       K. Braman, R. Byers, and R. Mathias,

*       The Multi-Shift QR Algorithm Part II: Aggressive Early

*       Deflation.

*       SIAM J. Matrix Anal. Appl., 23(4):948--973, 2002.

*

*       R. Granat, B. Kagstrom, and D. Kressner,

*       A Novel Parallel QR Algorithm for Hybrid Distributed Memory HPC

*       Systems.

*       SIAM J. Sci. Comput., 32(4):2345--2378, 2010.

*

*     ================================================================

*

*     .. Parameters ..

*

*     ==== Exceptional deflation windows:  try to cure rare

*     .    slow convergence by increasing the size of the

*     .    deflation window after KEXNW iterations. =====

*

*     ==== Exceptional shifts: try to cure rare slow convergence

*     .    with ad-hoc exceptional shifts every KEXSH iterations.

*     .    The constants WILK1 and WILK2 are used to form the

*     .    exceptional shifts. ====

*

      INTEGER            block_cyclic_2d, csrc_, ctxt_, dlen_, dtype_,

     $                   lld_, mb_, m_, nb_, n_, rsrc_

      INTEGER            recmax

      PARAMETER          ( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,

     $                     ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,

     $                     rsrc_ = 7, csrc_ = 8, lld_ = 9, recmax = 3 )

      INTEGER            ntiny

      PARAMETER          ( ntiny = 11 )

      INTEGER            kexnw, kexsh

      parameter( kexnw = 5, kexsh = 6 )

      REAL               wilk1, wilk2

      parameter( wilk1 = 0.75e0, wilk2 = -0.4375e0 )

      REAL               zero, one

      parameter( zero = 0.0e0, one = 1.0e0 )

*     ..

*     .. Local Scalars ..

      REAL               aa, bb, cc, cs, dd, sn, ss, swap, elem, t0,

     $                   elem1, elem2, elem3, alpha, sdsum, stamp

      INTEGER            i, j, inf, it, itmax, k, kacc22, kbot, kdu, ks,

     $                   kt, ktop, ku, kv, kwh, kwtop, kwv, ld, ls,

     $                   lwkopt, ndfl, nh, nho, nibble, nmin, ns, nsmax,

     $                   nsr, nve, nw, nwmax, nwr, lldh, lldz, ii, jj,

     $                   ictxt, nprow, npcol, myrow, mycol, ipv, ipt,

     $                   ipw, ipwrk, vrows, vcols, trows, tcols, wrows,

     $                   wcols, hrsrc, hcsrc, nb, is, ie, nprocs, kk,

     $                   iroffh, icoffh, hrsrc3, hcsrc3, nwin, totit,

     $                   sweep, jw, totns, liwkopt, npmin, ictxt_new,

     $                   myrow_new, mycol_new

      LOGICAL            nwinc, sorted, lquery, recursion

      CHARACTER          jbcmpz*2

*     ..

*     .. External Functions ..

      INTEGER            pilaenvx, numroc, indxg2p, iceil, blacs_pnum

      EXTERNAL           pilaenvx, numroc, indxg2p, iceil, blacs_pnum

*     ..

*     .. Local Arrays ..

      INTEGER            descv( dlen_ ), desct( dlen_ ), descw( dlen_ ),

     $                   pmap( 64*64 )

      REAL               zdum( 1, 1 )

*     ..

*     .. External Subroutines ..

      EXTERNAL           pslacpy, pslaqr1, slanv2, pslaqr3, pslaqr5,

     $                   pselget, slaqr0, slaset, psgemr2d

*     ..

*     .. Intrinsic Functions ..

      INTRINSIC          abs, float, int, max, min, mod

*     ..

*     .. Executable Statements ..

      info = 0

      ictxt = desch( ctxt_ )

      CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )

      nprocs = nprow*npcol

      recursion = reclevel .LT. recmax

*

*     Quick return for N = 0: nothing to do.

*

      IF( n.EQ.0 ) THEN

         work( 1 ) = one

         iwork( 1 ) = 1

         RETURN

      END IF

*

*     Set up job flags for PILAENV.

*

      IF( wantt ) THEN

         jbcmpz( 1: 1 ) = 'S'

      ELSE

         jbcmpz( 1: 1 ) = 'E'

      END IF

      IF( wantz ) THEN

         jbcmpz( 2: 2 ) = 'V'

      ELSE

         jbcmpz( 2: 2 ) = 'N'

      END IF

*

*     Check if workspace query

*

      lquery = lwork.EQ.-1 .OR. liwork.EQ.-1

*

*     Extract local leading dimensions and block factors of matrices

*     H and Z

*

      lldh = desch( lld_ )

      lldz = descz( lld_ )

      nb = desch( mb_ )

*

*     Tiny (sub-) matrices must use PSLAQR1. (Stops recursion)

*

      IF( n.LE.ntiny ) THEN

*

*     Estimate optimal workspace.

*

         CALL pslaqr1( wantt, wantz, n, ilo, ihi, h, desch, wr, wi,

     $        iloz, ihiz, z, descz, work, lwork, iwork, liwork, info )

         lwkopt = int( work(1) )

         liwkopt = iwork(1)

*

*     Completely local matrices use LAPACK. (Stops recursion)

*

      ELSEIF( n.LE.nb ) THEN

         IF( myrow.EQ.desch(rsrc_) .AND. mycol.EQ.desch(csrc_) ) THEN

            CALL slaqr0( wantt, wantz, n, ilo, ihi, h, desch(lld_),

     $           wr, wi, iloz, ihiz, z, descz(lld_), work, lwork, info )

            IF( n.GT.2 )

     $         CALL slaset( 'L', n-2, n-2, zero, zero, h(3),

     $              desch(lld_) )

            lwkopt = int( work(1) )

            liwkopt = 1

         ELSE

            lwkopt = 1

            liwkopt = 1

         END IF

*

*     Do one more step of recursion

*

      ELSE

*

*        Zero out iteration and sweep counters for debugging purposes

*

         totit = 0

         sweep = 0

         totns = 0

*

*        Use small bulge multi-shift QR with aggressive early

*        deflation on larger-than-tiny matrices.

*

*        Hope for the best.

*

         info = 0

*

*        NWR = recommended deflation window size.  At this

*        point,  N .GT. NTINY = 11, so there is enough

*        subdiagonal workspace for NWR.GE.2 as required.

*        (In fact, there is enough subdiagonal space for

*        NWR.GE.3.)

*

         nwr = pilaenvx( ictxt, 13, 'PSLAQR0', jbcmpz, n, ilo, ihi,

     $        lwork )

         nwr = max( 2, nwr )

         nwr = min( ihi-ilo+1, nwr )

         nw = nwr

*

*        NSR = recommended number of simultaneous shifts.

*        At this point N .GT. NTINY = 11, so there is

*        enough subdiagonal workspace for NSR to be even

*        and greater than or equal to two as required.

*

         nwin = pilaenvx( ictxt, 19, 'PSLAQR0', jbcmpz, n, nb, nb, nb )

         nsr = pilaenvx( ictxt, 15, 'PSLAQR0', jbcmpz, n, ilo, ihi,

     $        max(nwin,nb) )

         nsr = min( nsr, ihi-ilo )

         nsr = max( 2, nsr-mod( nsr, 2 ) )

*

*        Estimate optimal workspace

*

         lwkopt = 3*iceil(nwr,nprow)*iceil(nwr,npcol)

*

*        Workspace query call to PSLAQR3

*

         CALL pslaqr3( wantt, wantz, n, ilo, ihi, nwr+1, h,

     $        desch, iloz, ihiz, z, descz, ls, ld, wr, wi, h,

     $        desch, n, h, desch, n, h, desch, work, -1, iwork,

     $        liwork, reclevel )

         lwkopt = lwkopt + int( work( 1 ) )

         liwkopt = iwork( 1 )

*

*        Workspace query call to PSLAQR5

*

         CALL pslaqr5( wantt, wantz, 2, n, 1, n, n, wr, wi, h,

     $        desch, iloz, ihiz, z, descz, work, -1, iwork,

     $        liwork )

*

*        Optimal workspace = MAX(PSLAQR3, PSLAQR5)

*

         lwkopt = max( lwkopt, int( work( 1 ) ) )

         liwkopt = max( liwkopt, iwork( 1 ) )

*

*        Quick return in case of workspace query.

*

         IF( lquery ) THEN

            work( 1 ) = float( lwkopt )

            iwork( 1 ) = liwkopt

            RETURN

         END IF

*

*        PSLAQR1/PSLAQR0 crossover point.

*

         nmin = pilaenvx( ictxt, 12, 'PSLAQR0', jbcmpz, n, ilo, ihi,

     $        lwork )

         nmin = max( ntiny, nmin )

*

*        Nibble crossover point.

*

         nibble = pilaenvx( ictxt, 14, 'PSLAQR0', jbcmpz, n, ilo, ihi,

     $        lwork )

         nibble = max( 0, nibble )

*

*        Accumulate reflections during ttswp?  Use block

*        2-by-2 structure during matrix-matrix multiply?

*

         kacc22 = pilaenvx( ictxt, 16, 'PSLAQR0', jbcmpz, n, ilo, ihi,

     $        lwork )

         kacc22 = max( 1, kacc22 )

         kacc22 = min( 2, kacc22 )

*

*        NWMAX = the largest possible deflation window for

*        which there is sufficient workspace.

*

         nwmax = min( ( n-1 ) / 3, lwork / 2 )

*

*        NSMAX = the Largest number of simultaneous shifts

*        for which there is sufficient workspace.

*

         nsmax = min( ( n+6 ) / 9, lwork - lwork/3 )

         nsmax = nsmax - mod( nsmax, 2 )

*

*        NDFL: an iteration count restarted at deflation.

*

         ndfl = 1

*

*        ITMAX = iteration limit

*

         itmax = max( 30, 2*kexsh )*max( 10, ( ihi-ilo+1 ) )

*

*        Last row and column in the active block.

*

         kbot = ihi

*

*        Main Loop.

*

         DO 110 it = 1, itmax

            totit = totit + 1

*

*           Done when KBOT falls below ILO.

*

            IF( kbot.LT.ilo )

     $         GO TO 120

*

*           Locate active block.

*

            DO 10 k = kbot, ilo + 1, -1

               CALL infog2l( k, k-1, desch, nprow, npcol, myrow, mycol,

     $              ii, jj, hrsrc, hcsrc )

               IF( myrow.EQ.hrsrc .AND. mycol.EQ.hcsrc ) THEN

                  IF( h( ii + (jj-1)*lldh ).EQ.zero )

     $               GO TO 20

               END IF

 10         CONTINUE

            k = ilo

 20         CONTINUE

            ktop = k

            IF( nprocs.GT.1 )

     $         CALL igamx2d( ictxt, 'All', '1-Tree', 1, 1, ktop, 1,

     $              -1, -1, -1, -1, -1 )

*

*           Select deflation window size.

*

            nh = kbot - ktop + 1

            IF( nh.LE.ntiny ) THEN

               nw = nh

            ELSEIF( ndfl.LT.kexnw .OR. nh.LT.nw ) THEN

*

*              Typical deflation window.  If possible and

*              advisable, nibble the entire active block.

*              If not, use size NWR or NWR+1 depending upon

*              which has the smaller corresponding subdiagonal

*              entry (a heuristic).

*

               nwinc = .true.

               IF( nh.LE.min( nmin, nwmax ) ) THEN

                  nw = nh

               ELSE

                  nw = min( nwr, nh, nwmax )

                  IF( nw.LT.nwmax ) THEN

                     IF( nw.GE.nh-1 ) THEN

                        nw = nh

                     ELSE

                        kwtop = kbot - nw + 1

                        CALL pselget( 'All', '1-Tree', elem1, h, kwtop,

     $                       kwtop-1, desch )

                        CALL pselget( 'All', '1-Tree', elem2, h,

     $                       kwtop-1, kwtop-2, desch )

                        IF( abs( elem1 ).GT.abs( elem2 ) ) nw = nw + 1

                     END IF

                  END IF

               END IF

            ELSE

*

*              Exceptional deflation window.  If there have

*              been no deflations in KEXNW or more iterations,

*              then vary the deflation window size.   At first,

*              because larger windows are, in general, more

*              powerful than smaller ones, rapidly increase the

*              window up to the maximum reasonable and possible.

*              Then maybe try a slightly smaller window.

*

               IF( nwinc .AND. nw.LT.min( nwmax, nh ) ) THEN

                  nw = min( nwmax, nh, 2*nw )

               ELSE

                  nwinc = .false.

                  IF( nw.EQ.nh .AND. nh.GT.2 )

     $               nw = nh - 1

               END IF

            END IF

*

*           Aggressive early deflation:

*           split workspace into

*             - an NW-by-NW work array V for orthogonal matrix

*             - an NW-by-at-least-NW-but-more-is-better

*               (NW-by-NHO) horizontal work array for Schur factor

*             - an at-least-NW-but-more-is-better (NVE-by-NW)

*               vertical work array for matrix multiplications

*             - align T, V and W with the deflation window

*

            kv = n - nw + 1

            kt = nw + 1

            nho = ( n-nw-1 ) - kt + 1

            kwv = nw + 2

            nve = ( n-nw ) - kwv + 1

*

            jw = min( nw, kbot-ktop+1 )

            kwtop = kbot - jw + 1

            iroffh = mod( kwtop - 1, nb )

            icoffh = iroffh

            hrsrc = indxg2p( kwtop, nb, myrow, desch(rsrc_), nprow )

            hcsrc = indxg2p( kwtop, nb, mycol, desch(csrc_), npcol )

            vrows = numroc( jw+iroffh, nb, myrow, hrsrc, nprow )

            vcols = numroc( jw+icoffh, nb, mycol, hcsrc, npcol )

            CALL descinit( descv, jw+iroffh, jw+icoffh, nb, nb,

     $           hrsrc, hcsrc, ictxt, max(1, vrows), info )

*

            trows = numroc( jw+iroffh, nb, myrow, hrsrc, nprow )

            tcols = numroc( jw+icoffh, nb, mycol, hcsrc, npcol )

            CALL descinit( desct, jw+iroffh, jw+icoffh, nb, nb,

     $           hrsrc, hcsrc, ictxt, max(1, trows), info )

            wrows = numroc( jw+iroffh, nb, myrow, hrsrc, nprow )

            wcols = numroc( jw+icoffh, nb, mycol, hcsrc, npcol )

            CALL descinit( descw, jw+iroffh, jw+icoffh, nb, nb,

     $           hrsrc, hcsrc, ictxt, max(1, wrows), info )

*

            ipv   = 1

            ipt   = ipv + descv( lld_ ) * vcols

            ipw   = ipt + desct( lld_ ) * tcols

            ipwrk = ipw + descw( lld_ ) * wcols

*

*           Aggressive early deflation

*

            iwork(1) = it

            CALL pslaqr3( wantt, wantz, n, ktop, kbot, nw, h,

     $           desch, iloz, ihiz, z, descz, ls, ld, wr, wi,

     $           work(ipv), descv, nho, work(ipt), desct, nve,

     $           work(ipw), descw, work(ipwrk), lwork-ipwrk+1,

     $           iwork, liwork, reclevel )

*

*           Adjust KBOT accounting for new deflations.

*

            kbot = kbot - ld

*

*           KS points to the shifts.

*

            ks = kbot - ls + 1

*

*           Skip an expensive QR sweep if there is a (partly

*           heuristic) reason to expect that many eigenvalues

*           will deflate without it.  Here, the QR sweep is

*           skipped if many eigenvalues have just been deflated

*           or if the remaining active block is small.

*

            IF( ( ld.EQ.0 ) .OR. ( ( 100*ld.LE.nw*nibble ) .AND. ( kbot-

     $           ktop+1.GT.min( nmin, nwmax ) ) ) ) THEN

*

*              NS = nominal number of simultaneous shifts.

*              This may be lowered (slightly) if PSLAQR3

*              did not provide that many shifts.

*

               ns = min( nsmax, nsr, max( 2, kbot-ktop ) )

               ns = ns - mod( ns, 2 )

*

*              If there have been no deflations

*              in a multiple of KEXSH iterations,

*              then try exceptional shifts.

*              Otherwise use shifts provided by

*              PSLAQR3 above or from the eigenvalues

*              of a trailing principal submatrix.

*

               IF( mod( ndfl, kexsh ).EQ.0 ) THEN

                  ks = kbot - ns + 1

                  DO 30 i = kbot, max( ks+1, ktop+2 ), -2

                     CALL pselget( 'All', '1-Tree', elem1, h, i, i-1,

     $                    desch )

                     CALL pselget( 'All', '1-Tree', elem2, h, i-1, i-2,

     $                    desch )

                     CALL pselget( 'All', '1-Tree', elem3, h, i, i,

     $                    desch )

                     ss = abs( elem1 ) + abs( elem2 )

                     aa = wilk1*ss + elem3

                     bb = ss

                     cc = wilk2*ss

                     dd = aa

                     CALL slanv2( aa, bb, cc, dd, wr( i-1 ), wi( i-1 ),

     $                    wr( i ), wi( i ), cs, sn )

 30               CONTINUE

                  IF( ks.EQ.ktop ) THEN

                     CALL pselget( 'All', '1-Tree', elem1, h, ks+1,

     $                    ks+1, desch )

                     wr( ks+1 ) = elem1

                     wi( ks+1 ) = zero

                     wr( ks ) = wr( ks+1 )

                     wi( ks ) = wi( ks+1 )

                  END IF

               ELSE

*

*                 Got NS/2 or fewer shifts? Use PSLAQR0 or

*                 PSLAQR1 on a trailing principal submatrix to

*                 get more.

*

                  IF( kbot-ks+1.LE.ns / 2 ) THEN

                     ks = kbot - ns + 1

                     kt = n - ns + 1

                     npmin = pilaenvx( ictxt, 23, 'PSLAQR0', 'EN', ns,

     $                    nb, nprow, npcol )

c

c   Temporarily force NPMIN <= 8 since only PSLAQR1 is used.

c

                     npmin = min(npmin, 8)

                     IF( min(nprow, npcol).LE.npmin+1 .OR.

     $                    reclevel.GE.1 ) THEN

*

*                       The window is large enough. Compute the Schur

*                       decomposition with all processors.

*

                        iroffh = mod( ks - 1, nb )

                        icoffh = iroffh

                        IF( ns.GT.nmin ) THEN

                           hrsrc = indxg2p( ks, nb, myrow, desch(rsrc_),

     $                          nprow )

                           hcsrc = indxg2p( ks, nb, myrow, desch(csrc_),

     $                          npcol )

                        ELSE

                           hrsrc = 0

                           hcsrc = 0

                        END IF

                        trows = numroc( ns+iroffh, nb, myrow, hrsrc,

     $                       nprow )

                        tcols = numroc( ns+icoffh, nb, mycol, hcsrc,

     $                       npcol )

                        CALL descinit( desct, ns+iroffh, ns+icoffh, nb,

     $                       nb, hrsrc, hcsrc, ictxt, max(1, trows),

     $                       info )

                        ipt = 1

                        ipwrk = ipt + desct(lld_) * tcols

*

                        IF( ns.GT.nmin .AND. recursion ) THEN

                           CALL pslacpy( 'All', ns, ns, h, ks, ks,

     $                          desch, work(ipt), 1+iroffh, 1+icoffh,

     $                          desct )

                           CALL pslaqr0( .false., .false., iroffh+ns,

     $                          1+iroffh, iroffh+ns, work(ipt),

     $                          desct, wr( ks-iroffh ),

     $                          wi( ks-iroffh ), 1, 1, zdum,

     $                          descz, work( ipwrk ),

     $                          lwork-ipwrk+1, iwork, liwork,

     $                          inf, reclevel+1 )

                        ELSE

                           CALL pslamve( 'All', ns, ns, h, ks, ks,

     $                          desch, work(ipt), 1+iroffh, 1+icoffh,

     $                          desct, work(ipwrk) )

                           CALL pslaqr1( .false., .false., iroffh+ns,

     $                          1+iroffh, iroffh+ns, work(ipt),

     $                          desct, wr( ks-iroffh ),

     $                          wi( ks-iroffh ), 1+iroffh, iroffh+ns,

     $                          zdum, descz, work( ipwrk ),

     $                          lwork-ipwrk+1, iwork, liwork, inf )

                        END IF

                     ELSE

*

*                       The window is too small. Redistribute the AED

*                       window to a subgrid and do the computation on

*                       the subgrid.

*

                        ictxt_new = ictxt

                        DO 50 i = 0, npmin-1

                           DO 40 j = 0, npmin-1

                              pmap( j+1+i*npmin ) =

     $                             blacs_pnum( ictxt, i, j )

 40                        CONTINUE

 50                     CONTINUE

                        CALL blacs_gridmap( ictxt_new, pmap, npmin,

     $                       npmin, npmin )

                        CALL blacs_gridinfo( ictxt_new, npmin, npmin,

     $                       myrow_new, mycol_new )

                        IF( myrow.GE.npmin .OR. mycol.GE.npmin )

     $                     ictxt_new = -1

*

                        IF( ictxt_new.GE.0 ) THEN

                           trows = numroc( ns, nb, myrow_new, 0, npmin )

                           tcols = numroc( ns, nb, mycol_new, 0, npmin )

                           CALL descinit( desct, ns, ns, nb, nb, 0, 0,

     $                          ictxt_new, max(1,trows), info )

                           ipt = 1

                           ipwrk = ipt + desct(lld_) * tcols

                        ELSE

                           ipt = 1

                           ipwrk = 2

                           desct( ctxt_ ) = -1

                           inf = 0

                        END IF

                        CALL psgemr2d( ns, ns, h, ks, ks, desch,

     $                       work(ipt), 1, 1, desct, ictxt )

*

c

c   This part is still not perfect.

c   Either PSLAQR0 or PSLAQR1 can work, but not both.

c

c                        NMIN = PILAENVX( ICTXT_NEW, 12, 'PSLAQR0',

c     $                       'EN', NS, 1, NS, LWORK )

                        IF( ictxt_new.GE.0 ) THEN

c                           IF( NS.GT.NMIN .AND. RECLEVEL.LT.1 ) THEN

c                              CALL PSLAQR0( .FALSE., .FALSE., NS, 1,

c     $                             NS, WORK(IPT), DESCT, WR( KS ),

c     $                             WI( KS ), 1, 1, ZDUM, DESCT,

c     $                             WORK( IPWRK ), LWORK-IPWRK+1, IWORK,

c     $                             LIWORK, INF, RECLEVEL+1 )

c                           ELSE

                              CALL pslaqr1( .false., .false., ns, 1,

     $                             ns, work(ipt), desct, wr( ks ),

     $                             wi( ks ), 1, ns, zdum, desct,

     $                             work( ipwrk ), lwork-ipwrk+1, iwork,

     $                             liwork, inf )

c                           END IF

                           CALL blacs_gridexit( ictxt_new )

                        END IF

                        IF( myrow+mycol.GT.0 ) THEN

                           DO 60 j = 0, ns-1

                              wr( ks+j ) = zero

                              wi( ks+j ) = zero

 60                        CONTINUE

                        END IF

                        CALL igamn2d( ictxt, 'All', '1-Tree', 1, 1, inf,

     $                       1, -1, -1, -1, -1, -1 )

                        CALL sgsum2d( ictxt, 'All', ' ', ns, 1, wr(ks),

     $                       ns, -1, -1 )

                        CALL sgsum2d( ictxt, 'All', ' ', ns, 1, wi(ks),

     $                       ns, -1, -1 )

                     END IF

                     ks = ks + inf

*

*                    In case of a rare QR failure use

*                    eigenvalues of the trailing 2-by-2

*                    principal submatrix.

*

                     IF( ks.GE.kbot ) THEN

                        CALL pselget( 'All', '1-Tree', aa, h, kbot-1,

     $                       kbot-1, desch )

                        CALL pselget( 'All', '1-Tree', cc, h, kbot,

     $                       kbot-1, desch )

                        CALL pselget( 'All', '1-Tree', bb, h, kbot-1,

     $                       kbot, desch )

                        CALL pselget( 'All', '1-Tree', dd, h, kbot,

     $                       kbot, desch )

                        CALL slanv2( aa, bb, cc, dd, wr( kbot-1 ),

     $                       wi( kbot-1 ), wr( kbot ),

     $                       wi( kbot ), cs, sn )

                        ks = kbot - 1

                     END IF

                  END IF

*

                  IF( kbot-ks+1.GT.ns ) THEN

*

*                    Sort the shifts (helps a little)

*                    Bubble sort keeps complex conjugate

*                    pairs together.

*

                     sorted = .false.

                     DO 80 k = kbot, ks + 1, -1

                        IF( sorted )

     $                     GO TO 90

                        sorted = .true.

                        DO 70 i = ks, k - 1

                           IF( abs( wr( i ) )+abs( wi( i ) ).LT.

     $                          abs( wr( i+1 ) )+abs( wi( i+1 ) ) ) THEN

                              sorted = .false.

*

                              swap = wr( i )

                              wr( i ) = wr( i+1 )

                              wr( i+1 ) = swap

*

                              swap = wi( i )

                              wi( i ) = wi( i+1 )

                              wi( i+1 ) = swap

                           END IF

 70                     CONTINUE

 80                  CONTINUE

 90                  CONTINUE

                  END IF

*

*                 Shuffle shifts into pairs of real shifts

*                 and pairs of complex conjugate shifts

*                 assuming complex conjugate shifts are

*                 already adjacent to one another. (Yes,

*                 they are.)

*

                  DO 100 i = kbot, ks + 2, -2

                     IF( wi( i ).NE.-wi( i-1 ) ) THEN

*

                        swap = wr( i )

                        wr( i ) = wr( i-1 )

                        wr( i-1 ) = wr( i-2 )

                        wr( i-2 ) = swap

*

                        swap = wi( i )

                        wi( i ) = wi( i-1 )

                        wi( i-1 ) = wi( i-2 )

                        wi( i-2 ) = swap

                     END IF

 100              CONTINUE

               END IF

*

*              If there are only two shifts and both are

*              real, then use only one.

*

               IF( kbot-ks+1.EQ.2 ) THEN

                  IF( wi( kbot ).EQ.zero ) THEN

                     CALL pselget( 'All', '1-Tree', elem, h, kbot,

     $                    kbot, desch )

                     IF( abs( wr( kbot )-elem ).LT.

     $                    abs( wr( kbot-1 )-elem ) ) THEN

                        wr( kbot-1 ) = wr( kbot )

                     ELSE

                        wr( kbot ) = wr( kbot-1 )

                     END IF

                  END IF

               END IF

*

*              Use up to NS of the smallest magnitude

*              shifts.  If there aren't NS shifts available,

*              then use them all, possibly dropping one to

*              make the number of shifts even.

*

               ns = min( ns, kbot-ks+1 )

               ns = ns - mod( ns, 2 )

               ks = kbot - ns + 1

*

*              Small-bulge multi-shift QR sweep.

*

               totns = totns + ns

               sweep = sweep + 1

               CALL pslaqr5( wantt, wantz, kacc22, n, ktop, kbot,

     $              ns, wr( ks ), wi( ks ), h, desch, iloz, ihiz, z,

     $              descz, work, lwork, iwork, liwork )

            END IF

*

*           Note progress (or the lack of it).

*

            IF( ld.GT.0 ) THEN

               ndfl = 1

            ELSE

               ndfl = ndfl + 1

            END IF

*

*           End of main loop.

 110     CONTINUE

*

*        Iteration limit exceeded.  Set INFO to show where

*        the problem occurred and exit.

*

         info = kbot

 120     CONTINUE

      END IF

*

*     Return the optimal value of LWORK.

*

      work( 1 ) = float( lwkopt )

      iwork( 1 ) = liwkopt

      IF( .NOT. lquery ) THEN

         iwork( 1 ) = totit

         iwork( 2 ) = sweep

         iwork( 3 ) = totns

      END IF

      RETURN

*

*     End of PSLAQR0

*

      END