◆ zhetrd_hb2st()

subroutine zhetrd_hb2st	(	character	stage1,
		character	vect,
		character	uplo,
		integer	n,
		integer	kd,
		complex*16, dimension( ldab, * )	ab,
		integer	ldab,
		double precision, dimension( * )	d,
		double precision, dimension( * )	e,
		complex*16, dimension( * )	hous,
		integer	lhous,
		complex*16, dimension( * )	work,
		integer	lwork,
		integer	info )

ZHETRD_HB2ST reduces a complex Hermitian band matrix A to real symmetric tridiagonal form T

Download ZHETRD_HB2ST + dependencies [TGZ] [ZIP] [TXT]

Purpose:

!>
!> ZHETRD_HB2ST reduces a complex Hermitian band matrix A to real symmetric
!> tridiagonal form T by a unitary similarity transformation:
!> Q**H * A * Q = T.
!>

Parameters

[in]	STAGE1	!> STAGE1 is CHARACTER*1 !> = 'N': "No": indicates that stage 1 of the reduction !> from dense to band using the zhetrd_he2hb routine !> was not called before this routine to produce AB. !> In other words, this routine is called standalone. !> = 'Y': "Yes": indicates that stage 1 of the !> reduction from dense to band using the zhetrd_he2hb !> routine has been called to produce AB (i.e., AB is !> the output of zhetrd_he2hb). !>
[in]	VECT	!> VECT is CHARACTER*1 !> = 'N': No need for the Householder representation, !> and thus LHOUS is of size max(1, 4*N); !> = 'V': the Householder representation is needed to !> either generate or to apply Q later on, !> then LHOUS is to be queried and computed. !> (NOT AVAILABLE IN THIS RELEASE). !>
[in]	UPLO	!> UPLO is CHARACTER*1 !> = 'U': Upper triangle of A is stored; !> = 'L': Lower triangle of A is stored. !>
[in]	N	!> N is INTEGER !> The order of the matrix A. N >= 0. !>
[in]	KD	!> KD is INTEGER !> The number of superdiagonals of the matrix A if UPLO = 'U', !> or the number of subdiagonals if UPLO = 'L'. KD >= 0. !>
[in,out]	AB	!> AB is COMPLEX*16 array, dimension (LDAB,N) !> On entry, the upper or lower triangle of the Hermitian band !> matrix A, stored in the first KD+1 rows of the array. The !> j-th column of A is stored in the j-th column of the array AB !> as follows: !> if UPLO = 'U', AB(kd+1+i-j,j) = A(i,j) for max(1,j-kd)<=i<=j; !> if UPLO = 'L', AB(1+i-j,j) = A(i,j) for j<=i<=min(n,j+kd). !> On exit, the diagonal elements of AB are overwritten by the !> diagonal elements of the tridiagonal matrix T; if KD > 0, the !> elements on the first superdiagonal (if UPLO = 'U') or the !> first subdiagonal (if UPLO = 'L') are overwritten by the !> off-diagonal elements of T; the rest of AB is overwritten by !> values generated during the reduction. !>
[in]	LDAB	!> LDAB is INTEGER !> The leading dimension of the array AB. LDAB >= KD+1. !>
[out]	D	!> D is DOUBLE PRECISION array, dimension (N) !> The diagonal elements of the tridiagonal matrix T. !>
[out]	E	!> E is DOUBLE PRECISION array, dimension (N-1) !> The off-diagonal elements of the tridiagonal matrix T: !> E(i) = T(i,i+1) if UPLO = 'U'; E(i) = T(i+1,i) if UPLO = 'L'. !>
[out]	HOUS	!> HOUS is COMPLEX*16 array, dimension (MAX(1,LHOUS)) !> Stores the Householder representation. !>
[in]	LHOUS	!> LHOUS is INTEGER !> The dimension of the array HOUS. !> If N = 0 or KD <= 1, LHOUS >= 1, else LHOUS = MAX(1, dimension). !> !> If LWORK = -1, or LHOUS = -1, !> then a query is assumed; the routine !> only calculates the optimal size of the HOUS array, returns !> this value as the first entry of the HOUS array, and no error !> message related to LHOUS is issued by XERBLA. !> LHOUS = MAX(1, dimension) where !> dimension = 4*N if VECT='N' !> not available now if VECT='V' !>
[out]	WORK	!> WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)). !> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. !>
[in]	LWORK	!> LWORK is INTEGER !> The dimension of the array WORK. !> If N = 0 or KD <= 1, LWORK >= 1, else LWORK = MAX(1, dimension). !> !> If LWORK = -1, or LHOUS = -1, !> then a workspace query is assumed; the routine !> only calculates the optimal size of the WORK array, returns !> this value as the first entry of the WORK array, and no error !> message related to LWORK is issued by XERBLA. !> LWORK = MAX(1, dimension) where !> dimension = (2KD+1)*N + KD*NTHREADS !> where KD is the blocking size of the reduction, !> FACTOPTNB is the blocking used by the QR or LQ !> algorithm, usually FACTOPTNB=128 is a good choice !> NTHREADS is the number of threads used when !> openMP compilation is enabled, otherwise =1. !>
[out]	INFO	!> INFO is INTEGER !> = 0: successful exit !> < 0: if INFO = -i, the i-th argument had an illegal value !>

Author: Univ. of Tennessee; Univ. of California Berkeley; Univ. of Colorado Denver; NAG Ltd.

Further Details:

!>
!>  Implemented by Azzam Haidar.
!>
!>  All details are available on technical report, SC11, SC13 papers.
!>
!>  Azzam Haidar, Hatem Ltaief, and Jack Dongarra.
!>  Parallel reduction to condensed forms for symmetric eigenvalue problems
!>  using aggregated fine-grained and memory-aware kernels. In Proceedings
!>  of 2011 International Conference for High Performance Computing,
!>  Networking, Storage and Analysis (SC '11), New York, NY, USA,
!>  Article 8 , 11 pages.
!>  http://doi.acm.org/10.1145/2063384.2063394
!>
!>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
!>  An improved parallel singular value algorithm and its implementation
!>  for multicore hardware, In Proceedings of 2013 International Conference
!>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
!>  Denver, Colorado, USA, 2013.
!>  Article 90, 12 pages.
!>  http://doi.acm.org/10.1145/2503210.2503292
!>
!>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
!>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
!>  calculations based on fine-grained memory aware tasks.
!>  International Journal of High Performance Computing Applications.
!>  Volume 28 Issue 2, Pages 196-209, May 2014.
!>  http://hpc.sagepub.com/content/28/2/196
!>
!>

Definition at line 231 of file zhetrd_hb2st.F.

*
*     Body of ZHETRD_HB2ST: reduces the Hermitian band matrix held in
*     AB to real symmetric tridiagonal form, returning the diagonal in
*     D and the off-diagonal in E. KD = 0 and KD = 1 are handled
*     directly on AB; for KD > 1 the band is copied into WORK and
*     reduced by repeated calls to ZHB2ST_KERNELS (bulge chasing).
*
#if defined(_OPENMP)
      use omp_lib
#endif
*
      IMPLICIT NONE
*
*  -- LAPACK computational routine --
*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
*
*     .. Scalar Arguments ..
      CHARACTER          STAGE1, UPLO, VECT
      INTEGER            N, KD, LDAB, LHOUS, LWORK, INFO
*     ..
*     .. Array Arguments ..
      DOUBLE PRECISION   D( * ), E( * )
      COMPLEX*16         AB( LDAB, * ), HOUS( * ), WORK( * )
*     ..
*
*  =====================================================================
*
*     .. Parameters ..
      DOUBLE PRECISION   RZERO
      COMPLEX*16         ZERO, ONE
      parameter( rzero = 0.0d+0,
     $                   zero = ( 0.0d+0, 0.0d+0 ),
     $                   one  = ( 1.0d+0, 0.0d+0 ) )
*     ..
*     .. Local Scalars ..
      LOGICAL            LQUERY, WANTQ, UPPER, AFTERS1
      INTEGER            I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST,
     $                   ED, STIND, EDIND, BLKLASTIND, COLPT, THED,
     $                   STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID,
     $                   NBTILES, TTYPE, TID, NTHREADS,
     $                   ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS,
     $                   INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU,
     $                   SIZEV, SIZETAU, LDV, LHMIN, LWMIN
      DOUBLE PRECISION   ABSTMP
      COMPLEX*16         TMP
*     ..
*     .. External Subroutines ..
      EXTERNAL           zhb2st_kernels, zlacpy,
     $                   zlaset, xerbla
*     ..
*     .. Intrinsic Functions ..
      INTRINSIC          min, max, ceiling, dble, real
*     ..
*     .. External Functions ..
      LOGICAL            LSAME
      INTEGER            ILAENV2STAGE
      EXTERNAL           lsame, ilaenv2stage
*     ..
*     .. Executable Statements ..
*
*     Determine the minimal workspace size required.
*     Test the input parameters
*
*     A query is requested when either LWORK or LHOUS equals -1.
*
      info    = 0
      afters1 = lsame( stage1, 'Y' )
      wantq   = lsame( vect, 'V' )
      upper   = lsame( uplo, 'U' )
      lquery  = ( lwork.EQ.-1 ) .OR. ( lhous.EQ.-1 )
*
*     Determine the block size, the workspace size and the hous size.
*     All three values are obtained from ILAENV2STAGE (ISPEC = 2, 3
*     and 4 respectively). When N = 0 or KD <= 1 no workspace beyond
*     a single entry is required.
*
      ib       = ilaenv2stage( 2, 'ZHETRD_HB2ST', vect, n, kd,
     $                         -1, -1 )
      IF( n.EQ.0 .OR. kd.LE.1 ) THEN
         lhmin = 1
         lwmin = 1
      ELSE
         lhmin = ilaenv2stage( 3, 'ZHETRD_HB2ST', vect, n, kd, ib,
     $                         -1 )
         lwmin = ilaenv2stage( 4, 'ZHETRD_HB2ST', vect, n, kd, ib,
     $                         -1 )
      END IF
*
*     Argument checks. INFO = -i reports the i-th argument of the
*     routine. Only VECT = 'N' is accepted here: any other value,
*     including 'V', is rejected with INFO = -2. The size checks on
*     LHOUS and LWORK are skipped for a workspace query.
*
      IF( .NOT.afters1 .AND. .NOT.lsame( stage1, 'N' ) ) THEN
         info = -1
      ELSE IF( .NOT.lsame( vect, 'N' ) ) THEN
         info = -2
      ELSE IF( .NOT.upper .AND. .NOT.lsame( uplo, 'L' ) ) THEN
         info = -3
      ELSE IF( n.LT.0 ) THEN
         info = -4
      ELSE IF( kd.LT.0 ) THEN
         info = -5
      ELSE IF( ldab.LT.(kd+1) ) THEN
         info = -7
      ELSE IF( lhous.LT.lhmin .AND. .NOT.lquery ) THEN
         info = -11
      ELSE IF( lwork.LT.lwmin .AND. .NOT.lquery ) THEN
         info = -13
      END IF
*
*     On valid input, report the required sizes in the first entries
*     of HOUS and WORK (this is the whole result of a query call).
*
      IF( info.EQ.0 ) THEN
         hous( 1 ) = lhmin
         work( 1 ) = lwmin
      END IF
*
      IF( info.NE.0 ) THEN
         CALL xerbla( 'ZHETRD_HB2ST', -info )
         RETURN
      ELSE IF( lquery ) THEN
         RETURN
      END IF
*
*     Quick return if possible
*
      IF( n.EQ.0 ) THEN
          hous( 1 ) = 1
          work( 1 ) = 1
          RETURN
      END IF
*
*     Determine pointer position
*
*     HOUS : the segment passed to the kernels as HOUS( INDTAU )
*            starts at 1 and is SIZETAU = 2*N entries long; the
*            segment passed as HOUS( INDV ) starts right after it.
*     WORK : a working copy of the band, with leading dimension
*            LDA = 2*KD+1 and N columns, starts at INDA; the scratch
*            area handed to the kernels starts at INDW, right after.
*
      ldv      = kd + ib
      sizetau  = 2 * n
      sizev    = 2 * n
      indtau   = 1
      indv     = indtau + sizetau
      lda      = 2 * kd + 1
      sizea    = lda * n
      inda     = 1
      indw     = inda + sizea
      nthreads = 1
      tid      = 0
*
*     Positions inside one column of the WORK band copy (APOS, AWPOS,
*     DPOS, OFDPOS) and row indices inside AB (ABDPOS, ABOFDPOS).
*     Upper: the KD+1 rows of AB land in rows KD+1..2*KD+1 of each
*            WORK column (APOS), rows 1..KD (AWPOS) are zeroed later,
*            and the diagonal is the last row (DPOS).
*     Lower: the KD+1 rows of AB land in rows 1..KD+1 (APOS), rows
*            KD+2..2*KD+1 (AWPOS) are zeroed later, and the diagonal
*            is the first row (DPOS).
*
      IF( upper ) THEN
          apos     = inda + kd
          awpos    = inda
          dpos     = apos + kd
          ofdpos   = dpos - 1
          abdpos   = kd + 1
          abofdpos = kd
      ELSE
          apos     = inda
          awpos    = inda + kd + 1
          dpos     = apos
          ofdpos   = dpos + 1
          abdpos   = 1
          abofdpos = 2
 
      ENDIF
*
*     Case KD=0:
*     The matrix is diagonal. We just copy it (convert to "real" for
*     complex because D is double and the imaginary part should be 0)
*     and store it in D. A sequential code here is better or
*     in a parallel environment it might need two cores for D and E
*
      IF( kd.EQ.0 ) THEN
          DO 30 i = 1, n
              d( i ) = dble( ab( abdpos, i ) )
   30     CONTINUE
          DO 40 i = 1, n-1
              e( i ) = rzero
   40     CONTINUE
*
          hous( 1 ) = 1
          work( 1 ) = 1
          RETURN
      END IF
*
*     Case KD=1:
*     The matrix is already tridiagonal. We have to make the diagonal
*     and off-diagonal elements real, and store them in D and E.
*     For that, for real precision just copy the diag and offdiag
*     to D and E while for the COMPLEX case the bulge chasing is
*     performed to convert the Hermitian tridiagonal to symmetric
*     tridiagonal. A simpler conversion formula might be used, but then
*     updating the Q matrix will be required and, depending on whether
*     Q is generated or not, this might complicate the story.
*
      IF( kd.EQ.1 ) THEN
          DO 50 i = 1, n
              d( i ) = dble( ab( abdpos, i ) )
   50     CONTINUE
*
*         make off-diagonal elements real and copy them to E
*
*         Each off-diagonal entry is replaced by its modulus; its
*         unit-modulus phase factor TMP (ONE when the entry is zero)
*         is multiplied into the next off-diagonal entry before that
*         one is processed.
*
          IF( upper ) THEN
              DO 60 i = 1, n - 1
                  tmp = ab( abofdpos, i+1 )
                  abstmp = abs( tmp )
                  ab( abofdpos, i+1 ) = abstmp
                  e( i ) = abstmp
                  IF( abstmp.NE.rzero ) THEN
                     tmp = tmp / abstmp
                  ELSE
                     tmp = one
                  END IF
                  IF( i.LT.n-1 )
     $               ab( abofdpos, i+2 ) = ab( abofdpos, i+2 )*tmp
C                  IF( WANTZ ) THEN
C                     CALL ZSCAL( N, DCONJG( TMP ), Q( 1, I+1 ), 1 )
C                  END IF
   60         CONTINUE
          ELSE
              DO 70 i = 1, n - 1
                 tmp = ab( abofdpos, i )
                 abstmp = abs( tmp )
                 ab( abofdpos, i ) = abstmp
                 e( i ) = abstmp
                 IF( abstmp.NE.rzero ) THEN
                    tmp = tmp / abstmp
                 ELSE
                    tmp = one
                 END IF
                 IF( i.LT.n-1 )
     $              ab( abofdpos, i+1 ) = ab( abofdpos, i+1 )*tmp
C                 IF( WANTQ ) THEN
C                    CALL ZSCAL( N, TMP, Q( 1, I+1 ), 1 )
C                 END IF
   70         CONTINUE
          ENDIF
*
          hous( 1 ) = 1
          work( 1 ) = 1
          RETURN
      END IF
*
*     Main code starts here.
*     Reduce the Hermitian band of A to a tridiagonal matrix.
*
*     Scheduling parameters. With GRSIZ = 1 and SHIFT = 3, STEPERCOL
*     evaluates to 3. With THGRSIZ = N, THGRNB evaluates to 1 when
*     N >= 2 and to 0 when N = 1 (the bulge chasing loop is then
*     skipped entirely).
*
      thgrsiz   = n
      grsiz     = 1
      shift     = 3
      nbtiles   = ceiling( real(n)/real(kd) )
      stepercol = ceiling( real(shift)/real(grsiz) )
      thgrnb    = ceiling( real(n-1)/real(thgrsiz) )
*
*     Copy the KD+1 stored rows of AB into the WORK band copy and
*     zero the remaining KD rows of every column of that copy.
*
      CALL zlacpy( "A", kd+1, n, ab, ldab, work( apos ), lda )
      CALL zlaset( "A", kd,   n, zero, zero, work( awpos ), lda )
*
*
*     OpenMP parallelisation starts here
*
*     The loop nest below is executed inside a MASTER construct;
*     when task dependencies are available the kernel calls are
*     issued as tasks from there.
*
#if defined(_OPENMP)
!$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND )
!$OMP$         PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID )
!$OMP$         PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND )
!$OMP$         SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK)
!$OMP$         SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA )
!$OMP$         SHARED ( STEPERCOL, THGRNB, THGRSIZ, GRSIZ, SHIFT )
!$OMP MASTER
#endif
*
*     main bulge chasing loop
*
      DO 100 thgrid = 1, thgrnb
          stt  = (thgrid-1)*thgrsiz+1
          thed = min( (stt + thgrsiz -1), (n-1))
          DO 110 i = stt, n-1
              ed = min( i, thed )
              IF( stt.GT.ed ) EXIT
              DO 120 m = 1, stepercol
                  st = stt
                  DO 130 sweepid = st, ed
                      DO 140 k = 1, grsiz
*
*                         MYID numbers the task within sweep SWEEPID.
*                         TTYPE is 1 when MYID = 1, otherwise 2 for
*                         even MYID and 3 for odd MYID.
*
                          myid  = (i-sweepid)*(stepercol*grsiz)
     $                           + (m-1)*grsiz + k
                          IF ( myid.EQ.1 ) THEN
                              ttype = 1
                          ELSE
                              ttype = mod( myid, 2 ) + 2
                          ENDIF
 
*
*                         STIND..EDIND is the index range handed to
*                         the kernel (EDIND is clipped to N).
*                         BLKLASTIND is tested after the call to
*                         decide whether sweep STT is finished.
*
                          IF( ttype.EQ.2 ) THEN
                              colpt      = (myid/2)*kd + sweepid
                              stind      = colpt-kd+1
                              edind      = min(colpt,n)
                              blklastind = colpt
                          ELSE
                              colpt      = ((myid+1)/2)*kd + sweepid
                              stind      = colpt-kd+1
                              edind      = min(colpt,n)
                              IF( ( stind.GE.edind-1 ).AND.
     $                            ( edind.EQ.n ) ) THEN
                                  blklastind = n
                              ELSE
                                  blklastind = 0
                              ENDIF
                          ENDIF
*
*                         Call the kernel
*
*                         With task dependencies (_OPENMP >= 201307)
*                         each call is a task whose DEPEND clauses
*                         name WORK entries indexed by MYID.
*                         NOTE(review): those entries appear to act
*                         only as ordering tokens between tasks --
*                         confirm. Each task uses the KD-long scratch
*                         slice of WORK selected by the number of the
*                         thread executing it. Otherwise the kernel
*                         is called directly with scratch at INDW.
*
#if defined(_OPENMP) &&  _OPENMP >= 201307
 
                          IF( ttype.NE.1 ) THEN
!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
!$OMP$     DEPEND(in:WORK(MYID-1))
!$OMP$     DEPEND(out:WORK(MYID))
                              tid      = omp_get_thread_num()
                              CALL zhb2st_kernels(
     $                             uplo, wantq, ttype,
     $                             stind, edind, sweepid, n, kd, ib,
     $                             work( inda ), lda,
     $                             hous( indv ), hous( indtau ), ldv,
     $                             work( indw + tid*kd ) )
!$OMP END TASK
                          ELSE
!$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
!$OMP$     DEPEND(out:WORK(MYID))
                              tid      = omp_get_thread_num()
                              CALL zhb2st_kernels(
     $                             uplo, wantq, ttype,
     $                             stind, edind, sweepid, n, kd, ib,
     $                             work( inda ), lda,
     $                             hous( indv ), hous( indtau ), ldv,
     $                             work( indw + tid*kd ) )
!$OMP END TASK
                          ENDIF
#else
                          CALL zhb2st_kernels(
     $                         uplo, wantq, ttype,
     $                         stind, edind, sweepid, n, kd, ib,
     $                         work( inda ), lda,
     $                         hous( indv ), hous( indtau ), ldv,
     $                         work( indw ) )
#endif
*
*                         When BLKLASTIND >= N-1, advance the first
*                         active sweep STT and leave the K loop.
*
                          IF ( blklastind.GE.(n-1) ) THEN
                              stt = stt + 1
                              EXIT
                          ENDIF
  140                 CONTINUE
  130             CONTINUE
  120         CONTINUE
  110     CONTINUE
  100 CONTINUE
*
#if defined(_OPENMP)
!$OMP END MASTER
!$OMP END PARALLEL
#endif
*
*     Copy the diagonal from A to D. Note that D is REAL thus only
*     the Real part is needed, the imaginary part should be zero.
*     (The diagonal of column I of the WORK band copy is located at
*     DPOS + (I-1)*LDA.)
*
      DO 150 i = 1, n
          d( i ) = dble( work( dpos+(i-1)*lda ) )
  150 CONTINUE
*
*     Copy the off diagonal from A to E. Note that E is REAL thus only
*     the Real part is needed, the imaginary part should be zero.
*     For upper storage E(I) is read from column I+1 of the WORK band
*     copy, for lower storage from column I.
*
      IF( upper ) THEN
          DO 160 i = 1, n-1
             e( i ) = dble( work( ofdpos+i*lda ) )
  160     CONTINUE
      ELSE
          DO 170 i = 1, n-1
             e( i ) = dble( work( ofdpos+(i-1)*lda ) )
  170     CONTINUE
      ENDIF
*
*     Report LWMIN in WORK(1) on exit.
*
      work( 1 ) = lwmin
      RETURN
*
*     End of ZHETRD_HB2ST
*

Here is the call graph for this function:

Here is the caller graph for this function: