C ALGORITHM 826, COLLECTED ALGORITHMS FROM ACM. C THIS WORK PUBLISHED IN TRANSACTIONS ON MATHEMATICAL SOFTWARE, C VOL. 29,NO. 3, September, 2003, P. 326--336. #! /bin/sh # This is a shell archive, meaning: # 1. Remove everything above the #! /bin/sh line. # 2. Save the resulting text in a file. # 3. Execute the file with /bin/sh (not csh) to create the files: # SLmake.inc # Src/ # Src/Makefile # Src/README # Src/install # Src/pblas.h # Src/pzaxpy.c # Src/pzdotc.c # Src/pzdotu.c # Src/pzlaconsb.f # Src/pzlacp3.f # Src/pzlahqr.f # Src/pzlasmsub.f # Src/pzlatrs.f # Src/pzlawil.f # Src/pzrot.c # Src/pztrevc.f # Src/zlahqr.f # Src/zlamsh.f # Src/zlanv2.f # Src/zlaref.f # Testing/ # Testing/EVC.dat # Testing/Makefile # Testing/NEP.dat # Testing/README # Testing/pzevcdriver.f # Testing/pzevcinfo.f # Testing/pzget22.f # Testing/pznepdriver.f # Testing/pznepfchk.f # Testing/pznepinfo.f # This archive created: Wed Apr 16 10:13:06 2003 export PATH; PATH=/bin:$PATH if test -f 'SLmake.inc' then echo shar: will not over-write existing file "'SLmake.inc'" else cat << "SHAR_EOF" > 'SLmake.inc' FORTRAN = f90 OPTS = -O3 DRVOPTS = $(OPTS) NOOPT = LOADER = f90 LOADOPTS = CC = cc CCFLAGS = $(OPTS) CDEFS = -DAdd_ $(USEMPI) ARCH = ar ARCHFLAGS= cr RANLIB = echo MACH = origin PLAT = _irix65 SCALAPACK = /usr/local/usp/PETtools/lib/libscalapackn32.a BLACSMPI = /usr/local/usp/PETtools/lib/libmpiblacsn32.a BLASLIB = -lblas LAPACKLIB = $(HOME)/LAPACK/lapack$(PLAT).a MATGENLIB = $(HOME)/LAPACK/tmglib$(PLAT).a PXLAHQRLIB = $(HOME)/$(MACH)/ScaLAPACK/pxlahqrlib.a SERLIBS = $(MATGENLIB) $(LAPACKLIB) $(BLASLIB) PARLIBS = $(PXLAHQRLIB) $(SCALAPACK) $(BLACSMPI) $(SERLIBS) -lscs -lmpi SHAR_EOF fi # end of overwriting check if test ! -d 'Src' then mkdir 'Src' fi cd 'Src' if test -f 'Makefile' then echo shar: will not over-write existing file "'Makefile'" else cat << "SHAR_EOF" > 'Makefile' include ../../SLmake.inc ZQRSRC = pzlahqr.o pzlaconsb.o pzlasmsub.o pzlacp3.o pzlawil.o pzrot.o \ zlamsh.o zlaref.o zlanv2.o zlahqr.o ZVCSRC = pztrevc.o #pzlatrs.o pzaxpy.o pzdotc.o pzdotu.o all: complex16 complex16: $(ZQRSRC) $(ZVCSRC) $(ARCH) $(ARCHFLAGS) $(PXLAHQRLIB) $(ZQRSRC) $(ZVCSRC) $(RANLIB) $(PXLAHQRLIB) $(ZQRSRC): $(FRC) $(ZVCSRC): $(FRC) FRC: @FRC=$(FRC) clean: rm -f *.o .c.o: $(CC) $(CDEFS) $(CCFLAGS) -c $*.c .f.o: $(FORTRAN) $(OPTS) -c $*.f SHAR_EOF fi # end of overwriting check if test -f 'README' then echo shar: will not over-write existing file "'README'" else cat << "SHAR_EOF" > 'README' Files included in this distribution =================================== Makefile README pblas.h pzaxpy.c pzdotu.c pzdotc.c pzlaconsb.f pzlacp3.f pzlahqr.f pzlatrs.f pzlasmsub.f pzlawil.f pzrot.c pztrevc.f zlahqr.f zlamsh.f zlanv2.f zlaref.f List of computational routines with dependencies ================================================ PZLAHQR PZLACONSB PZLACP3 PZLASMSUB PZLAWIL PZROT ZLAHQR ZLAMSH ZLANV2 ZLAREF PZTREVC PZLATRS (optional) PZAXPY (optional) PZDOTC (optional) PZDOTU (optional) Notes ===== The routine ZLAHQR included in this distribution is not the same as found in LAPACK 1.0. This routine uses a multiple double-shift strategy as is found in DLAHQR. This is included for completeness, but is not needed. It can be removed from the list of codes in the Makefile so that the default LAPACK ZLAHQR is used instead. PZLATRS is an updated version of the routine included in ScaLAPACK 1.0. It has been updated to use scaling. It is, however, very slow in comparision to PZTRSV which theoretically does the same computation. Although slow, it controls scaling for ill-conditioned problems. PZLATRS calls some PBLAS and BLAS routines. I found that some of the PBLAS routines (ScaLAPACK version 1.0) each have an undocumented bug in them. They are PZAXPY line 363 &desc_Y[LLD_] -> incy PZDOTU line 472 ixcol -> iycol PZDOTC line 471 ixcol -> iycol Thus, I have included these three update level 1 PBLAS routines with this distribution. SHAR_EOF fi # end of overwriting check if test -f 'install' then echo shar: will not over-write existing file "'install'" else cat << "SHAR_EOF" > 'install' %\section{Installation} % \label{sec:install} %The SBR toolbox comes with a {\tt UNIX} makefile for easy installation and a %testing program for validation and performance tuning. % %In this section we briefly describe the ``standard'' installation procedure. %Detailed information about the installation process may be found in the %\NAME{README} file distributed with the software. % %The installation consists of the following steps. %\begin{enumerate} % \item Get the SBR toolbox from the TOMS repository at \texttt{netlib} % and unpack it with % \COMMAND{zcat sbr.tar.Z | tar xf -}% % This command % puts all the SBR software into a new directory \texttt{sbr}. % \item Edit the file \NAME{make.inc} to match your system setup (e.g., the % location of the LAPACK library, if the latter is installed on your % machine). % \item Type % \COMMAND{make library}% % This command will build the library (called \NAME{libSBR.a} if you % did not change the name). % \item (Optional.) Fine-tune the performance of the algorithms (cf.\ the % \NAME{README} file). % \item (Recommended.) Run the validation tests by typing % \COMMAND{make checks}% % and have a look at the output files \NAME{DOUTCHK} and \NAME{SOUTCHK}. % They should report no ``skipped'' or ``failed'' tests. % \item (Optional.) Run the additional timings (cf.\ the \NAME{README} file). % \item (Optional.) Move the SBR library to a directory searched by % the linker. %\end{enumerate} % %The testing programs provide the residuals $|| U \tilde A - A U ||_{F}$, %the orthogonality errors $|| U^{T} U - I ||_{F}$, and the timings for the %reduction of a symmetric (full or banded) matrix $A$ to a (narrower) banded %or tridiagonal matrix $\tilde A$ for matrices of different orders $n$ and %different semibandwidths. %If the orthogonal matrix $U$ is not accumulated then the above error %measures are not available. %In this case the deviation of the eigenvalues, %$|| \mbox{spec}( A ) - \mbox{spec}( \tilde A ) ||_{2}$, is computed instead, %where $\mbox{spec}( A )$ and $\mbox{spec}( \tilde A )$ are the eigenvalues %of $A$ and $\tilde A$, respectively, in ascending order. SHAR_EOF fi # end of overwriting check if test -f 'pblas.h' then echo shar: will not over-write existing file "'pblas.h'" else cat << "SHAR_EOF" > 'pblas.h' /* --------------------------------------------------------------------- * * -- ScaLAPACK routine (version 1.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 17, 1996 * * --------------------------------------------------------------------- */ /* * This file includes the standard C libraries, as well as system * dependent include files. All PBLAS routines include this file. */ /* * ======================================================================== * Machine Specific PBLAS macros * ======================================================================== */ #define _HAL_ 0 #define _T3D_ 1 #ifdef T3D #define _MACH_ _T3D_ #endif #ifndef _MACH_ #define _MACH_ _HAL_ #endif /* * ======================================================================== * Include files * ======================================================================== */ #include #include #if( _MACH_ == _T3D_ ) #include #endif /* * ======================================================================== * FORTRAN <-> C interface * ======================================================================== * * These macros define how the PBLAS will be called. _F2C_ADD_ assumes * that they will be called by FORTRAN, which expects C routines to have * an underscore postfixed to the name (Suns, and Intel machines expect * this). _F2C_NOCHANGE indicates that FORTRAN will be calling, and that * it expects the name called by FORTRAN to be identical to that compiled * by the C (RS6K's do this). _F2C_UPCASE says it expects C routines * called by FORTRAN to be in all upcase (CRAY wants this). * _F2C_F77ISF2C indicates that the fortran "compiler" in use is * actually f2c, a FORTRAN to C converter. */ #define _F2C_ADD_ 0 #define _F2C_NOCHANGE 1 #define _F2C_UPCASE 2 #define _F2C_F77ISF2C 3 #ifdef UpCase #define _F2C_CALL_ _F2C_UPCASE #endif #ifdef NoChange #define _F2C_CALL_ _F2C_NOCHANGE #endif #ifdef Add_ #define _F2C_CALL_ _F2C_ADD_ #endif #ifdef f77IsF2C #define _F2C_CALL_ _F2C_F77ISF2C #endif #ifndef _F2C_CALL_ #define _F2C_CALL_ _F2C_ADD_ #endif /* * ======================================================================== * TYPE DEFINITIONS AND CONVERSION UTILITIES * ======================================================================== */ typedef struct { float re, im; } complex; typedef struct { double re, im; } complex16; #if( _MACH_ == _T3D_ ) #define float double /* Type of character argument in a FORTRAN call */ #define F_CHAR _fcd /* Character conversion utilities */ #define F2C_CHAR(a) ( _fcdtocp( (a) ) ) #define C2F_CHAR(a) ( _cptofcd( (a), 1 ) ) /* Type of FORTRAN functions */ #define F_VOID_FCT void fortran /* Subroutine */ #define F_INTG_FCT int fortran /* INTEGER function */ #define F_DBLE_FCT double fortran /* DOUBLE PRECISION function */ #else /* Type of character argument in a FORTRAN call */ typedef char * F_CHAR; /* Character conversion utilities */ #define F2C_CHAR(a) (a) #define C2F_CHAR(a) (a) /* Type of FORTRAN functions */ #define F_VOID_FCT void /* Subroutine */ #define F_INTG_FCT int /* INTEGER function */ #define F_DBLE_FCT double /* DOUBLE PRECISION function */ #endif /* * ======================================================================== * #DEFINE MACRO CONSTANTS * ======================================================================== */ #define DLEN_ 9 /* Length of a descriptor */ #define DT_ 0 /* Descriptor Type */ #define CTXT_ 1 /* BLACS context */ #define M_ 2 /* Global Number of Rows */ #define N_ 3 /* Global Number of Columns */ #define MB_ 4 /* Row Blocking Size */ #define NB_ 5 /* Column Blocking Size */ #define RSRC_ 6 /* Starting Processor Row */ #define CSRC_ 7 /* Starting Processor Column */ #define LLD_ 8 /* Local Leading Dimension */ /* * Descriptor types */ #define BLOCK_CYCLIC_2D 1 #define BLOCK_CYCLIC_INB_2D 2 #define BROADCAST "B" /* Blacs operation definitions */ #define COMBINE "C" #define ALL "A" /* Scope definitions */ #define COLUMN "C" #define ROW "R" #define TOPDEF " " /* Default BLACS topology, PB-BLAS routines */ #define CTOPDEF ' ' #define TOPGET "!" #define YES "Y" #define NO "N" #define MULLENFAC 2 #define ONE 1.0 #define ZERO 0.0 /* * ======================================================================== * PREPROCESSOR MACRO FUNCTIONS USED FOR OPTIMIZATION & CONVENIENCE * ======================================================================== */ #define ABS(a) (((a) < 0) ? -(a) : (a)) #define MIN(a,b) (((a) < (b)) ? (a) : (b)) #define MAX(a,b) (((a) > (b)) ? (a) : (b)) #define CEIL(a,b) ( ((a)+(b)-1) / (b) ) #define Mlowcase(C) ( ((C) > 64 && (C) < 91) ? (C) | 32 : (C) ) #define Mupcase(C) ( ((C) > 96 && (C) < 123) ? (C) & 0xDF : (C) ) #define INDXG2L( iglob, nb, iproc, isrcproc, nprocs )\ ( (nb) * ( ( (iglob)-1) / ( (nb) * (nprocs) ) ) +\ ( ( (iglob) - 1 ) % (nb) ) + 1 ) #define INDXL2G( iloc, nb, iproc, isrcproc, nprocs )\ ( (nprocs) * (nb) * ( ( (iloc) - 1 ) / (nb) ) +\ ( ( (iloc) - 1 ) % (nb) ) +\ ( ( (nprocs) + (iproc) - (isrcproc) ) % (nprocs) ) * (nb) + 1 ) #define INDXG2P( iglob, nb, iproc, isrcproc, nprocs ) \ ( ( (isrcproc) + ( (iglob) - 1 ) / (nb) ) % (nprocs) ) #define MYROC0( nblocks, n, nb, nprocs )\ ( ( (nblocks) % (nprocs) ) ? ( ( (nblocks) / (nprocs) ) * (nb) + (nb) )\ : ( ( (nblocks) / (nprocs) )* (nb) + ( (n) % (nb) ) ) ) #if( _F2C_CALL_ == _F2C_ADD_ ) /* * These defines set up the naming scheme required to have a FORTRAN * routine call a C routine (which is what the PBLAS are written in). * No redefinition necessary to have following FORTRAN to C interface: * FORTRAN CALL C DECLARATION * call pdgemm(...) void pdgemm_(...) * * This is the default. */ #endif #if( _F2C_CALL_ == _F2C_UPCASE ) /* * These defines set up the naming scheme required to have a FORTRAN * routine call a C routine (which is what the PBLAS are written in) * following FORTRAN to C interface: * FORTRAN CALL C DECLARATION * call pdgemm(...) void PDGEMM(...) */ /* TOOLS */ #define ilcm_ ILCM #define infog2l_ INFOG2L #define numroc_ NUMROC #define pstreecomb_ PSTREECOMB #define pdtreecomb_ PDTREECOMB #define pctreecomb_ PCTREECOMB #define pztreecomb_ PZTREECOMB #define scombamax_ SCOMBAMAX #define dcombamax_ DCOMBAMAX #define ccombamax_ CCOMBAMAX #define zcombamax_ ZCOMBAMAX #define scombnrm2_ SCOMBNRM2 #define dcombnrm2_ DCOMBNRM2 /* BLACS */ #define blacs_abort_ BLACS_ABORT #define blacs_gridinfo_ BLACS_GRIDINFO #define igesd2d_ IGESD2D #define igebs2d_ IGEBS2D #define itrsd2d_ ITRSD2D #define itrbs2d_ ITRBS2D #define igerv2d_ IGERV2D #define igebr2d_ IGEBR2D #define itrrv2d_ ITRRV2D #define itrbr2d_ ITRBR2D #define igamx2d_ IGAMX2D #define igamn2d_ IGAMN2D #define igsum2d_ IGSUM2D #define sgesd2d_ SGESD2D #define sgebs2d_ SGEBS2D #define strsd2d_ STRSD2D #define strbs2d_ STRBS2D #define sgerv2d_ SGERV2D #define sgebr2d_ SGEBR2D #define strrv2d_ STRRV2D #define strbr2d_ STRBR2D #define sgamx2d_ SGAMX2D #define sgamn2d_ SGAMN2D #define sgsum2d_ SGSUM2D #define dgesd2d_ DGESD2D #define dgebs2d_ DGEBS2D #define dtrsd2d_ DTRSD2D #define dtrbs2d_ DTRBS2D #define dgerv2d_ DGERV2D #define dgebr2d_ DGEBR2D #define dtrrv2d_ DTRRV2D #define dtrbr2d_ DTRBR2D #define dgamx2d_ DGAMX2D #define dgamn2d_ DGAMN2D #define dgsum2d_ DGSUM2D #define cgesd2d_ CGESD2D #define cgebs2d_ CGEBS2D #define ctrsd2d_ CTRSD2D #define ctrbs2d_ CTRBS2D #define cgerv2d_ CGERV2D #define cgebr2d_ CGEBR2D #define ctrrv2d_ CTRRV2D #define ctrbr2d_ CTRBR2D #define cgamx2d_ CGAMX2D #define cgamn2d_ CGAMN2D #define cgsum2d_ CGSUM2D #define zgesd2d_ ZGESD2D #define zgebs2d_ ZGEBS2D #define ztrsd2d_ ZTRSD2D #define ztrbs2d_ ZTRBS2D #define zgerv2d_ ZGERV2D #define zgebr2d_ ZGEBR2D #define ztrrv2d_ ZTRRV2D #define ztrbr2d_ ZTRBR2D #define zgamx2d_ ZGAMX2D #define zgamn2d_ ZGAMN2D #define zgsum2d_ ZGSUM2D /* Level-1 BLAS */ #define srotg_ SROTG #define srotmg_ SROTMG #define srot_ SROT #define srotm_ SROTM #define sswap_ SSWAP #define sscal_ SSCAL #define scopy_ SCOPY #define saxpy_ SAXPY #define ssdot_ SSDOT #define isamax_ ISAMAX #define drotg_ DROTG #define drotmg_ DROTMG #define drot_ DROT #define drotm_ DROTM #define dswap_ DSWAP #define dscal_ DSCAL #define dcopy_ DCOPY #define daxpy_ DAXPY #define dddot_ DDDOT #define dnrm2_ DNRM2 #define dsnrm2_ DSNRM2 #define dasum_ DASUM #define dsasum_ DSASUM #define idamax_ IDAMAX #define cswap_ CSWAP #define cscal_ CSCAL #define csscal_ CSSCAL #define ccopy_ CCOPY #define caxpy_ CAXPY #define ccdotu_ CCDOTU #define ccdotc_ CCDOTC #define icamax_ ICAMAX #define zswap_ ZSWAP #define zscal_ ZSCAL #define zdscal_ ZDSCAL #define zcopy_ ZCOPY #define zaxpy_ ZAXPY #define zzdotu_ ZZDOTU #define zzdotc_ ZZDOTC #define dscnrm2_ DSCNRM2 #define dznrm2_ DZNRM2 #define dscasum_ DSCASUM #define dzasum_ DZASUM #define izamax_ IZAMAX /* Level-2 BLAS */ #define sgemv_ SGEMV #define ssymv_ SSYMV #define strmv_ STRMV #define strsv_ STRSV #define sger_ SGER #define ssyr_ SSYR #define ssyr2_ SSYR2 #define dgemv_ DGEMV #define dsymv_ DSYMV #define dtrmv_ DTRMV #define dtrsv_ DTRSV #define dger_ DGER #define dsyr_ DSYR #define dsyr2_ DSYR2 #define cgemv_ CGEMV #define chemv_ CHEMV #define ctrmv_ CTRMV #define ctrsv_ CTRSV #define cgeru_ CGERU #define cgerc_ CGERC #define cher_ CHER #define cher2_ CHER2 #define zgemv_ ZGEMV #define zhemv_ ZHEMV #define ztrmv_ ZTRMV #define ztrsv_ ZTRSV #define zgeru_ ZGERU #define zgerc_ ZGERC #define zher_ ZHER #define zher2_ ZHER2 /* Level-3 BLAS */ #define sgemm_ SGEMM #define ssymm_ SSYMM #define ssyrk_ SSYRK #define ssyr2k_ SSYR2K #define strmm_ STRMM #define strsm_ STRSM #define dgemm_ DGEMM #define dsymm_ DSYMM #define dsyrk_ DSYRK #define dsyr2k_ DSYR2K #define dtrmm_ DTRMM #define dtrsm_ DTRSM #define cgemm_ CGEMM #define chemm_ CHEMM #define csymm_ CSYMM #define csyrk_ CSYRK #define cherk_ CHERK #define csyr2k_ CSYR2K #define cher2k_ CHER2K #define ctrmm_ CTRMM #define ctrsm_ CTRSM #define zgemm_ ZGEMM #define zhemm_ ZHEMM #define zsymm_ ZSYMM #define zsyrk_ ZSYRK #define zherk_ ZHERK #define zsyr2k_ ZSYR2K #define zher2k_ ZHER2K #define ztrmm_ ZTRMM #define ztrsm_ ZTRSM /* absolute value auxiliary PBLAS */ #define psatrmv_ PSATRMV #define pdatrmv_ PDATRMV #define pcatrmv_ PCATRMV #define pzatrmv_ PZATRMV #define psagemv_ PSAGEMV #define pdagemv_ PDAGEMV #define pcagemv_ PCAGEMV #define pzagemv_ PZAGEMV #define psasymv_ PSASYMV #define pdasymv_ PDASYMV #define pcahemv_ PCAHEMV #define pzahemv_ PZAHEMV /* Auxiliary PB-BLAS */ #define pbcmatadd_ PBCMATADD #define pbdmatadd_ PBDMATADD #define pbsmatadd_ PBSMATADD #define pbzmatadd_ PBZMATADD /* Level-2 PBBLAS */ #define pbcgemv_ PBCGEMV #define pbcgeru_ PBCGERU #define pbcgerc_ PBCGERC #define pbchemv_ PBCHEMV #define pbcher_ PBCHER #define pbcher2_ PBCHER2 #define pbctrmv_ PBCTRMV #define pbctrnv_ PBCTRNV #define pbctrsv_ PBCTRSV #define pbdgemv_ PBDGEMV #define pbdger_ PBDGER #define pbdsymv_ PBDSYMV #define pbdsyr_ PBDSYR #define pbdsyr2_ PBDSYR2 #define pbdtrmv_ PBDTRMV #define pbdtrnv_ PBDTRNV #define pbdtrsv_ PBDTRSV #define pbsgemv_ PBSGEMV #define pbsger_ PBSGER #define pbssymv_ PBSSYMV #define pbssyr_ PBSSYR #define pbssyr2_ PBSSYR2 #define pbstrmv_ PBSTRMV #define pbstrnv_ PBSTRNV #define pbstrsv_ PBSTRSV #define pbzgemv_ PBZGEMV #define pbzgeru_ PBZGERU #define pbzgerc_ PBZGERC #define pbzhemv_ PBZHEMV #define pbzher_ PBZHER #define pbzher2_ PBZHER2 #define pbztrmv_ PBZTRMV #define pbztrnv_ PBZTRNV #define pbztrsv_ PBZTRSV /* Level-3 PBBLAS */ #define pbcgemm_ PBCGEMM #define pbchemm_ PBCHEMM #define pbcher2k_ PBCHER2K #define pbcherk_ PBCHERK #define pbcsymm_ PBCSYMM #define pbcsyr2k_ PBCSYR2K #define pbcsyrk_ PBCSYRK #define pbctrmm_ PBCTRMM #define pbctrsm_ PBCTRSM #define pbctran_ PBCTRAN #define pbdgemm_ PBDGEMM #define pbdsymm_ PBDSYMM #define pbdsyr2k_ PBDSYR2K #define pbdsyrk_ PBDSYRK #define pbdtrmm_ PBDTRMM #define pbdtrsm_ PBDTRSM #define pbdtran_ PBDTRAN #define pbsgemm_ PBSGEMM #define pbssymm_ PBSSYMM #define pbssyr2k_ PBSSYR2K #define pbssyrk_ PBSSYRK #define pbstrmm_ PBSTRMM #define pbstrsm_ PBSTRSM #define pbstran_ PBSTRAN #define pbzgemm_ PBZGEMM #define pbzhemm_ PBZHEMM #define pbzher2k_ PBZHER2K #define pbzherk_ PBZHERK #define pbzsymm_ PBZSYMM #define pbzsyr2k_ PBZSYR2K #define pbzsyrk_ PBZSYRK #define pbztrmm_ PBZTRMM #define pbztrsm_ PBZTRSM #define pbztran_ PBZTRAN /* Auxilliary PBLAS */ #define pberror_ PBERROR #define pbfreebuf_ PBFREEBUF #define ptopget_ PTOPGET #define ptopset_ PTOPSET /* Level-1 PBLAS */ #define psrotg_ PSROTG #define psrotmg_ PSROTMG #define psrot_ PSROT #define psrotm_ PSROTM #define psswap_ PSSWAP #define psscal_ PSSCAL #define pscopy_ PSCOPY #define psaxpy_ PSAXPY #define psdot_ PSDOT #define psnrm2_ PSNRM2 #define psasum_ PSASUM #define psamax_ PSAMAX #define pdrotg_ PDROTG #define pdrotmg_ PDROTMG #define pdrot_ PDROT #define pdrotm_ PDROTM #define pdswap_ PDSWAP #define pdscal_ PDSCAL #define pdcopy_ PDCOPY #define pdaxpy_ PDAXPY #define pddot_ PDDOT #define pdnrm2_ PDNRM2 #define pdasum_ PDASUM #define pdamax_ PDAMAX #define pcswap_ PCSWAP #define pcscal_ PCSCAL #define pcsscal_ PCSSCAL #define pccopy_ PCCOPY #define pcaxpy_ PCAXPY #define pcdotu_ PCDOTU #define pcdotc_ PCDOTC #define pscnrm2_ PSCNRM2 #define pscasum_ PSCASUM #define pcamax_ PCAMAX #define pcrot_ PCROT #define pzswap_ PZSWAP #define pzscal_ PZSCAL #define pzdscal_ PZDSCAL #define pzcopy_ PZCOPY #define pzaxpy_ PZAXPY #define pzdotu_ PZDOTU #define pzdotc_ PZDOTC #define pdznrm2_ PDZNRM2 #define pdzasum_ PDZASUM #define pzamax_ PZAMAX #define pzrot_ PZROT /* Level-2 PBLAS */ #define pcgemv_ PCGEMV #define pcgeru_ PCGERU #define pcgerc_ PCGERC #define pchemv_ PCHEMV #define pcher_ PCHER #define pcher2_ PCHER2 #define pctrmv_ PCTRMV #define pctrsv_ PCTRSV #define pdgemv_ PDGEMV #define pdger_ PDGER #define pdsymv_ PDSYMV #define pdsyr_ PDSYR #define pdsyr2_ PDSYR2 #define pdtrmv_ PDTRMV #define pdtrsv_ PDTRSV #define psgemv_ PSGEMV #define psger_ PSGER #define pssymv_ PSSYMV #define pssyr_ PSSYR #define pssyr2_ PSSYR2 #define pstrmv_ PSTRMV #define pstrsv_ PSTRSV #define pzgemv_ PZGEMV #define pzgeru_ PZGERU #define pzgerc_ PZGERC #define pzhemv_ PZHEMV #define pzher_ PZHER #define pzher2_ PZHER2 #define pztrmv_ PZTRMV #define pztrsv_ PZTRSV /* Level-3 PBLAS */ #define pcgemm_ PCGEMM #define pchemm_ PCHEMM #define pcher2k_ PCHER2K #define pcherk_ PCHERK #define pcsymm_ PCSYMM #define pcsyr2k_ PCSYR2K #define pcsyrk_ PCSYRK #define pctrmm_ PCTRMM #define pctrsm_ PCTRSM #define pctranu_ PCTRANU #define pctranc_ PCTRANC #define pdgemm_ PDGEMM #define pdsymm_ PDSYMM #define pdsyr2k_ PDSYR2K #define pdsyrk_ PDSYRK #define pdtrmm_ PDTRMM #define pdtrsm_ PDTRSM #define pdtran_ PDTRAN #define psgemm_ PSGEMM #define pssymm_ PSSYMM #define pssyr2k_ PSSYR2K #define pssyrk_ PSSYRK #define pstrmm_ PSTRMM #define pstrsm_ PSTRSM #define pstran_ PSTRAN #define pzgemm_ PZGEMM #define pzhemm_ PZHEMM #define pzher2k_ PZHER2K #define pzherk_ PZHERK #define pzsymm_ PZSYMM #define pzsyr2k_ PZSYR2K #define pzsyrk_ PZSYRK #define pztrmm_ PZTRMM #define pztrsm_ PZTRSM #define pztranu_ PZTRANU #define pztranc_ PZTRANC #endif #if( _F2C_CALL_ == _F2C_NOCHANGE ) /* * These defines set up the naming scheme required to have a FORTRAN * routine call a C routine (which is what the PBLAS are written in) * for following FORTRAN to C interface: * FORTRAN CALL C DECLARATION * call pdgemm(...) void pdgemm(...) */ /* TOOLS */ #define ilcm_ ilcm #define infog2l_ infog2l #define numroc_ numroc #define pstreecomb_ pstreecomb #define pdtreecomb_ pdtreecomb #define pctreecomb_ pctreecomb #define pztreecomb_ pztreecomb #define scombamax_ scombamax #define dcombamax_ dcombamax #define ccombamax_ ccombamax #define zcombamax_ zcombamax #define scombnrm2_ scombnrm2 #define dcombnrm2_ dcombnrm2 /* BLACS */ #define blacs_abort_ blacs_abort #define blacs_gridinfo_ blacs_gridinfo #define igesd2d_ igesd2d #define igebs2d_ igebs2d #define itrsd2d_ itrsd2d #define itrbs2d_ itrbs2d #define igerv2d_ igerv2d #define igebr2d_ igebr2d #define itrrv2d_ itrrv2d #define itrbr2d_ itrbr2d #define igamx2d_ igamx2d #define igamn2d_ igamn2d #define igsum2d_ igsum2d #define sgesd2d_ sgesd2d #define sgebs2d_ sgebs2d #define strsd2d_ strsd2d #define strbs2d_ strbs2d #define sgerv2d_ sgerv2d #define sgebr2d_ sgebr2d #define strrv2d_ strrv2d #define strbr2d_ strbr2d #define sgamx2d_ sgamx2d #define sgamn2d_ sgamn2d #define sgsum2d_ sgsum2d #define dgesd2d_ dgesd2d #define dgebs2d_ dgebs2d #define dtrsd2d_ dtrsd2d #define dtrbs2d_ dtrbs2d #define dgerv2d_ dgerv2d #define dgebr2d_ dgebr2d #define dtrrv2d_ dtrrv2d #define dtrbr2d_ dtrbr2d #define dgamx2d_ dgamx2d #define dgamn2d_ dgamn2d #define dgsum2d_ dgsum2d #define cgesd2d_ cgesd2d #define cgebs2d_ cgebs2d #define ctrsd2d_ ctrsd2d #define ctrbs2d_ ctrbs2d #define cgerv2d_ cgerv2d #define cgebr2d_ cgebr2d #define ctrrv2d_ ctrrv2d #define ctrbr2d_ ctrbr2d #define cgamx2d_ cgamx2d #define cgamn2d_ cgamn2d #define cgsum2d_ cgsum2d #define zgesd2d_ zgesd2d #define zgebs2d_ zgebs2d #define ztrsd2d_ ztrsd2d #define ztrbs2d_ ztrbs2d #define zgerv2d_ zgerv2d #define zgebr2d_ zgebr2d #define ztrrv2d_ ztrrv2d #define ztrbr2d_ ztrbr2d #define zgamx2d_ zgamx2d #define zgamn2d_ zgamn2d #define zgsum2d_ zgsum2d /* Level-1 BLAS */ #define srotg_ srotg #define srotmg_ srotmg #define srot_ srot #define srotm_ srotm #define sswap_ sswap #define sscal_ sscal #define scopy_ scopy #define saxpy_ saxpy #define ssdot_ ssdot #define isamax_ isamax #define drotg_ drotg #define drotmg_ drotmg #define drot_ drot #define drotm_ drotm #define dswap_ dswap #define dscal_ dscal #define dcopy_ dcopy #define daxpy_ daxpy #define dddot_ dddot #define dnrm2_ dnrm2 #define dsnrm2_ dsnrm2 #define dasum_ dasum #define dsasum_ dsasum #define idamax_ idamax #define cswap_ cswap #define cscal_ cscal #define csscal_ csscal #define ccopy_ ccopy #define caxpy_ caxpy #define ccdotu_ ccdotu #define ccdotc_ ccdotc #define icamax_ icamax #define zswap_ zswap #define zscal_ zscal #define zdscal_ zdscal #define zcopy_ zcopy #define zaxpy_ zaxpy #define zzdotu_ zzdotu #define zzdotc_ zzdotc #define dscnrm2_ dscnrm2 #define dznrm2_ dznrm2 #define dscasum_ dscasum #define dzasum_ dzasum #define izamax_ izamax /* Level-2 BLAS */ #define sgemv_ sgemv #define ssymv_ ssymv #define strmv_ strmv #define strsv_ strsv #define sger_ sger #define ssyr_ ssyr #define ssyr2_ ssyr2 #define dgemv_ dgemv #define dsymv_ dsymv #define dtrmv_ dtrmv #define dtrsv_ dtrsv #define dger_ dger #define dsyr_ dsyr #define dsyr2_ dsyr2 #define cgemv_ cgemv #define chemv_ chemv #define ctrmv_ ctrmv #define ctrsv_ ctrsv #define cgeru_ cgeru #define cgerc_ cgerc #define cher_ cher #define cher2_ cher2 #define zgemv_ zgemv #define zhemv_ zhemv #define ztrmv_ ztrmv #define ztrsv_ ztrsv #define zgeru_ zgeru #define zgerc_ zgerc #define zher_ zher #define zher2_ zher2 /* Level-3 BLAS */ #define sgemm_ sgemm #define ssymm_ ssymm #define ssyrk_ ssyrk #define ssyr2k_ ssyr2k #define strmm_ strmm #define strsm_ strsm #define dgemm_ dgemm #define dsymm_ dsymm #define dsyrk_ dsyrk #define dsyr2k_ dsyr2k #define dtrmm_ dtrmm #define dtrsm_ dtrsm #define cgemm_ cgemm #define chemm_ chemm #define csymm_ csymm #define csyrk_ csyrk #define cherk_ cherk #define csyr2k_ csyr2k #define cher2k_ cher2k #define ctrmm_ ctrmm #define ctrsm_ ctrsm #define zgemm_ zgemm #define zhemm_ zhemm #define zsymm_ zsymm #define zsyrk_ zsyrk #define zherk_ zherk #define zsyr2k_ zsyr2k #define zher2k_ zher2k #define ztrmm_ ztrmm #define ztrsm_ ztrsm /* absolute value auxiliary PBLAS */ #define psatrmv_ psatrmv #define pdatrmv_ pdatrmv #define pcatrmv_ pcatrmv #define pzatrmv_ pzatrmv #define psagemv_ psagemv #define pdagemv_ pdagemv #define pcagemv_ pcagemv #define pzagemv_ pzagemv #define psasymv_ psasymv #define pdasymv_ pdasymv #define pcahemv_ pcahemv #define pzahemv_ pzahemv /* Auxiliary PB-BLAS */ #define pbcmatadd_ pbcmatadd #define pbdmatadd_ pbdmatadd #define pbsmatadd_ pbsmatadd #define pbzmatadd_ pbzmatadd /* Level-2 PBBLAS */ #define pbcgemv_ pbcgemv #define pbcgeru_ pbcgeru #define pbcgerc_ pbcgerc #define pbchemv_ pbchemv #define pbcher_ pbcher #define pbcher2_ pbcher2 #define pbctrmv_ pbctrmv #define pbctrnv_ pbctrnv #define pbctrsv_ pbctrsv #define pbdgemv_ pbdgemv #define pbdger_ pbdger #define pbdsymv_ pbdsymv #define pbdsyr_ pbdsyr #define pbdsyr2_ pbdsyr2 #define pbdtrmv_ pbdtrmv #define pbdtrnv_ pbdtrnv #define pbdtrsv_ pbdtrsv #define pbsgemv_ pbsgemv #define pbsger_ pbsger #define pbssymv_ pbssymv #define pbssyr_ pbssyr #define pbssyr2_ pbssyr2 #define pbstrmv_ pbstrmv #define pbstrnv_ pbstrnv #define pbstrsv_ pbstrsv #define pbzgemv_ pbzgemv #define pbzgeru_ pbzgeru #define pbzgerc_ pbzgerc #define pbzhemv_ pbzhemv #define pbzher_ pbzher #define pbzher2_ pbzher2 #define pbztrmv_ pbztrmv #define pbztrnv_ pbztrnv #define pbztrsv_ pbztrsv /* Level-3 PBBLAS */ #define pbcgemm_ pbcgemm #define pbchemm_ pbchemm #define pbcher2k_ pbcher2k #define pbcherk_ pbcherk #define pbcsymm_ pbcsymm #define pbcsyr2k_ pbcsyr2k #define pbcsyrk_ pbcsyrk #define pbctrmm_ pbctrmm #define pbctrsm_ pbctrsm #define pbctran_ pbctran #define pbdgemm_ pbdgemm #define pbdsymm_ pbdsymm #define pbdsyr2k_ pbdsyr2k #define pbdsyrk_ pbdsyrk #define pbdtrmm_ pbdtrmm #define pbdtrsm_ pbdtrsm #define pbdtran_ pbdtran #define pbsgemm_ pbsgemm #define pbssymm_ pbssymm #define pbssyr2k_ pbssyr2k #define pbssyrk_ pbssyrk #define pbstrmm_ pbstrmm #define pbstrsm_ pbstrsm #define pbstran_ pbstran #define pbzgemm_ pbzgemm #define pbzhemm_ pbzhemm #define pbzher2k_ pbzher2k #define pbzherk_ pbzherk #define pbzsymm_ pbzsymm #define pbzsyr2k_ pbzsyr2k #define pbzsyrk_ pbzsyrk #define pbztrmm_ pbztrmm #define pbztrsm_ pbztrsm #define pbztran_ pbztran /* Auxilliary PBLAS */ #define pberror_ pberror #define pbfreebuf_ pbfreebuf #define ptopget_ ptopget #define ptopset_ ptopset /* Level-1 PBLAS */ #define psrotg_ psrotg #define psrotmg_ psrotmg #define psrot_ psrot #define psrotm_ psrotm #define psswap_ psswap #define psscal_ psscal #define pscopy_ pscopy #define psaxpy_ psaxpy #define psdot_ psdot #define psnrm2_ psnrm2 #define psasum_ psasum #define psamax_ psamax #define pdrotg_ pdrotg #define pdrotmg_ pdrotmg #define pdrot_ pdrot #define pdrotm_ pdrotm #define pdswap_ pdswap #define pdscal_ pdscal #define pdcopy_ pdcopy #define pdaxpy_ pdaxpy #define pddot_ pddot #define pdnrm2_ pdnrm2 #define pdasum_ pdasum #define pdamax_ pdamax #define pcswap_ pcswap #define pcscal_ pcscal #define pcsscal_ pcsscal #define pccopy_ pccopy #define pcaxpy_ pcaxpy #define pcdotu_ pcdotu #define pcdotc_ pcdotc #define pscnrm2_ pscnrm2 #define pscasum_ pscasum #define pcamax_ pcamax #define pcrot_ pcrot #define pzswap_ pzswap #define pzscal_ pzscal #define pzdscal_ pzdscal #define pzcopy_ pzcopy #define pzaxpy_ pzaxpy #define pzdotu_ pzdotu #define pzdotc_ pzdotc #define pdznrm2_ pdznrm2 #define pdzasum_ pdzasum #define pzamax_ pzamax #define pzrot_ pzrot /* Level-2 PBLAS */ #define pcgemv_ pcgemv #define pcgeru_ pcgeru #define pcgerc_ pcgerc #define pchemv_ pchemv #define pcher_ pcher #define pcher2_ pcher2 #define pctrmv_ pctrmv #define pctrsv_ pctrsv #define pdgemv_ pdgemv #define pdger_ pdger #define pdsymv_ pdsymv #define pdsyr_ pdsyr #define pdsyr2_ pdsyr2 #define pdtrmv_ pdtrmv #define pdtrsv_ pdtrsv #define psgemv_ psgemv #define psger_ psger #define pssymv_ pssymv #define pssyr_ pssyr #define pssyr2_ pssyr2 #define pstrmv_ pstrmv #define pstrsv_ pstrsv #define pzgemv_ pzgemv #define pzgeru_ pzgeru #define pzgerc_ pzgerc #define pzhemv_ pzhemv #define pzher_ pzher #define pzher2_ pzher2 #define pztrmv_ pztrmv #define pztrsv_ pztrsv /* Level-3 PBLAS */ #define pcgemm_ pcgemm #define pchemm_ pchemm #define pcher2k_ pcher2k #define pcherk_ pcherk #define pcsymm_ pcsymm #define pcsyr2k_ pcsyr2k #define pcsyrk_ pcsyrk #define pctrmm_ pctrmm #define pctrsm_ pctrsm #define pctranu_ pctranu #define pctranc_ pctranc #define pdgemm_ pdgemm #define pdsymm_ pdsymm #define pdsyr2k_ pdsyr2k #define pdsyrk_ pdsyrk #define pdtrmm_ pdtrmm #define pdtrsm_ pdtrsm #define pdtran_ pdtran #define psgemm_ psgemm #define pssymm_ pssymm #define pssyr2k_ pssyr2k #define pssyrk_ pssyrk #define pstrmm_ pstrmm #define pstrsm_ pstrsm #define pstran_ pstran #define pzgemm_ pzgemm #define pzhemm_ pzhemm #define pzher2k_ pzher2k #define pzherk_ pzherk #define pzsymm_ pzsymm #define pzsyr2k_ pzsyr2k #define pzsyrk_ pzsyrk #define pztrmm_ pztrmm #define pztrsm_ pztrsm #define pztranu_ pztranu #define pztranc_ pztranc #endif #if( _F2C_CALL_ == _F2C_F77ISF2C ) /* * These defines set up the naming scheme required to have a FORTRAN * routine call a C routine (which is what the PBLAS are written in) * for systems where the fortran "compiler" is actually f2c (a Fortran * to C conversion utility). */ /* * Initialization routines */ #define blacs_pinfo_ blacs_pinfo__ #define blacs_setup_ blacs_setup__ #define blacs_set_ blacs_set__ #define blacs_get_ blacs_get__ #define blacs_gridinit_ blacs_gridinit__ #define blacs_gridmap_ blacs_gridmap__ /* * Destruction routines */ #define blacs_freebuff_ blacs_freebuff__ #define blacs_gridexit_ blacs_gridexit__ #define blacs_abort_ blacs_abort__ #define blacs_exit_ blacs_exit__ /* * Informational & misc. */ #define blacs_gridinfo_ blacs_gridinfo__ #define blacs_pnum_ blacs_pnum__ #define blacs_pcoord_ blacs_pcoord__ #define blacs_barrier_ blacs_barrier__ #endif SHAR_EOF fi # end of overwriting check if test -f 'pzaxpy.c' then echo shar: will not over-write existing file "'pzaxpy.c'" else cat << "SHAR_EOF" > 'pzaxpy.c' /* --------------------------------------------------------------------- * * Mark R. Fahey * August 2000 * This is a slightly modified version of pzaxpy_ from ScaLAPACK 1.0 * which fixes a bug in the incx=1 and incy=1 case. * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" void pzaxpy_( n, alpha, X, ix, jx, desc_X, incx, Y, iy, jy, desc_Y, incy ) /* * .. Scalar Arguments .. */ int * incx, * incy, * ix, * iy, * jx, * jy, * n; complex16 * alpha; /* .. * .. Array Arguments .. */ int desc_X[], desc_Y[]; complex16 X[], Y[]; { /* * Purpose * ======= * * PZAXPY adds one distributed vector to another, * * sub( Y ) := sub( Y ) + alpha * sub( X ) * * where sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector descA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DT_A (global) descA[ DT_ ] The descriptor type. In this case, * DT_A = 1. * CTXT_A (global) descA[ CTXT_ ] The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) descA[ M_ ] The number of rows in the global * array A. * N_A (global) descA[ N_ ] The number of columns in the global * array A. * MB_A (global) descA[ MB_ ] The blocking factor used to distribu- * te the rows of the array. * NB_A (global) descA[ NB_ ] The blocking factor used to distribu- * te the columns of the array. * RSRC_A (global) descA[ RSRC_ ] The process row over which the first * row of the array A is distributed. * CSRC_A (global) descA[ CSRC_ ] The process column over which the * first column of the array A is * distributed. * LLD_A (local) descA[ LLD_ ] The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be seen as particular matrices, a distributed * vector is considered to be a distributed matrix. * * If INCX = M_X and INCY = M_Y, NB_X must be equal to NB_Y, and the * process column having the first entries of sub( Y ) must also contain * the first entries of sub( X ). Moreover, the quantity * MOD( JX-1, NB_X ) must be equal to MOD( JY-1, NB_Y ). * * If INCX = M_X, INCY = 1 and INCY <> M_Y, NB_X must be equal to MB_Y. * Moreover, the quantity MOD( JX-1, NB_X ) must be equal to * MOD( IY-1, MB_Y ). * * If INCX = 1, INCX <> M_X and INCY = M_Y, MB_X must be equal to NB_Y. * Moreover, the quantity MOD( IX-1, MB_X ) must be equal to * MOD( JY-1, NB_Y ). * * If INCX = 1, INCX <> M_X, INCY = 1 and INCY <> M_Y, MB_X must be * equal to MB_Y, and the process row having the first entries of * sub( Y ) must also contain the first entries of sub( X ). Moreover, * the quantity MOD( IX-1, MB_X ) must be equal to MOD( IY-1, MB_Y ). * * Parameters * ========== * * N (global input) pointer to INTEGER. * The length of the distributed vectors to be added. N >= 0. * * ALPHA (global input) pointer to COMPLEX*16 * The scalar used to multiply each component of sub( X ). * * X (local input) COMPLEX*16 array containing the local * pieces of a distributed matrix of dimension of at least * ( (JX-1)*M_X + IX + ( N - 1 )*abs( INCX ) ) * This array contains the entries of the distributed vector * sub( X ). * * IX (global input) pointer to INTEGER * The global row index of the submatrix of the distributed * matrix X to operate on. * * JX (global input) pointer to INTEGER * The global column index of the submatrix of the distributed * matrix X to operate on. * * DESCX (global and local input) INTEGER array of dimension 8. * The array descriptor of the distributed matrix X. * * INCX (global input) pointer to INTEGER * The global increment for the elements of X. Only two values * of INCX are supported in this version, namely 1 and M_X. * * Y (local input/local output) COMPLEX*16 array * containing the local pieces of a distributed matrix of * dimension of at least * ( (JY-1)*M_Y + IY + ( N - 1 )*abs( INCY ) ) * This array contains the entries of the distributed vector * sub( Y ). * On exit sub( Y ) is overwritten by sub( Y ) + alpha*sub( X ). * * IY (global input) pointer to INTEGER * The global row index of the submatrix of the distributed * matrix Y to operate on. * * JY (global input) pointer to INTEGER * The global column index of the submatrix of the distributed * matrix Y to operate on. * * DESCY (global and local input) INTEGER array of dimension 8. * The array descriptor of the distributed matrix Y. * * INCY (global input) pointer to INTEGER * The global increment for the elements of Y. Only two values * of INCY are supported in this version, namely 1 and M_Y. * * ===================================================================== * * .. Local Scalars .. */ int ictxt, info, iix, iiy, ixcol, ixrow, iycol, iyrow, jjx, jjy, lcm, lcmp, lcmq, mycol, myrow, nn, np, np0, nprow, npcol, nq, nq0, nz, ione=1, tmp1, wksz; complex16 one, tmp, zero; /* .. * .. PBLAS Buffer .. */ complex16 * buff; /* .. * .. External Functions .. */ void blacs_gridinfo_(); void zgerv2d_(); void zgesd2d_(); void pbchkvect(); void pberror_(); char * getpbbuf(); F_VOID_FCT zaxpy_(); F_VOID_FCT zcopy_(); F_VOID_FCT pbztrnv_(); F_INTG_FCT ilcm_(); F_INTG_FCT numroc_(); /* .. * .. Executable Statements .. * * Get grid parameters */ ictxt = desc_X[CTXT_]; blacs_gridinfo_( &ictxt, &nprow, &npcol, &myrow, &mycol ); /* * Test the input parameters */ info = 0; if( nprow == -1 ) info = -(600+CTXT_+1); else { pbchkvect( *n, 1, *ix, *jx, desc_X, *incx, 6, &iix, &jjx, &ixrow, &ixcol, nprow, npcol, myrow, mycol, &info ); pbchkvect( *n, 1, *iy, *jy, desc_Y, *incy, 11, &iiy, &jjy, &iyrow, &iycol, nprow, npcol, myrow, mycol, &info ); if( info == 0 ) { if( *n != 1 ) { if( *incx == desc_X[M_] ) { /* X is distributed along a process row */ if( *incy == desc_Y[M_] ) { /* Y is distributed over a process row */ if( ( ixcol != iycol ) || ( ( (*jx-1) % desc_X[NB_] ) != ( (*jy-1) % desc_Y[NB_] ) ) ) info = -10; else if( desc_Y[NB_] != desc_X[NB_] ) info = -(1100+NB_+1); } else if( ( *incy == 1 ) && ( *incy != desc_Y[M_] ) ) { /* Y is distributed over a process column */ if( ( (*jx-1) % desc_X[NB_] ) != ( (*iy-1) % desc_Y[MB_] ) ) info = -9; else if( desc_Y[MB_] != desc_X[NB_] ) info = -(1100+MB_+1); } else { info = -12; } } else if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) ) { /* X is distributed along a process column */ if( *incy == desc_Y[M_] ) { /* Y is distributed over a process row */ if( ( (*ix-1) % desc_X[MB_] ) != ( (*jy-1) % desc_Y[NB_] ) ) info = -10; else if( desc_Y[NB_] != desc_X[MB_] ) info = -(1100+NB_+1); } else if( ( *incy == 1 ) && ( *incy != desc_Y[M_] ) ) { /* Y is distributed over a process column */ if( ( ixrow != iyrow ) || ( ( (*ix-1) % desc_X[MB_] ) != ( (*iy-1) % desc_Y[MB_] ) ) ) info = -9; else if( desc_Y[MB_] != desc_X[MB_] ) info = -(1100+MB_+1); } else { info = -12; } } else { info = -7; } } if( ictxt != desc_Y[CTXT_] ) info = -(1100+CTXT_+1); } } if( info ) { pberror_( &ictxt, "PZAXPY", &info ); return; } /* * Quick return if possible. */ if( *n == 0 ) return; /* * y <- y + alpha * x */ if( *n == 1 ) { if( ( myrow == iyrow ) && ( mycol == iycol ) ) { if( ( myrow != ixrow ) || ( mycol != ixcol ) ) zgerv2d_( &ictxt, n, n, &tmp, n, &ixrow, &ixcol ); else tmp = X[iix-1+(jjx-1)*desc_X[LLD_]]; zaxpy_( n, alpha, &tmp, n, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], n ); } else if( ( myrow == ixrow ) && ( mycol == ixcol ) ) zgesd2d_( &ictxt, n, n, &X[iix-1+(jjx-1)*desc_X[LLD_]], n, &iyrow, &iycol ); return; } one.re = ONE; one.im = ZERO; zero.re = ZERO; zero.im = ZERO; if( ( *incx == desc_X[M_] ) && ( *incy == desc_Y[M_] ) ) { /* X and Y are both distributed over a process row */ nz = (*jx-1) % desc_Y[NB_]; nn = *n + nz; nq = numroc_( &nn, &desc_X[NB_], &mycol, &ixcol, &npcol ); if( mycol == ixcol ) nq -= nz; if( ixrow == iyrow ) { if( myrow == ixrow ) zaxpy_( &nq, alpha, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_] ); } else { if( myrow == ixrow ) zgesd2d_( &ictxt, &ione, &nq, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &iyrow, &mycol ); else if( myrow == iyrow ) { buff = (complex16 *)getpbbuf( "PZAXPY", nq*sizeof(complex16) ); zgerv2d_( &ictxt, &nq, &ione, buff, &ione, &ixrow, &mycol ); zaxpy_( &nq, alpha, buff, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_] ); } } } else if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) && ( *incy == 1 ) && ( *incy != desc_Y[M_] ) ) { /* X and Y are both distributed over a process column */ nz = (*ix-1) % desc_X[MB_]; nn = *n + nz; np = numroc_( &nn, &desc_X[MB_], &myrow, &ixrow, &nprow ); if( myrow == ixrow ) np -= nz; if( ixcol == iycol ) { if( mycol == ixcol ) zaxpy_( &np, alpha, &X[iix-1+(jjx-1)*desc_X[LLD_]], incx, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy ); } else { if( mycol == ixcol ) zgesd2d_( &ictxt, &np, &ione, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &myrow, &iycol ); else if( mycol == iycol ) { buff = (complex16 *)getpbbuf( "PZAXPY", np*sizeof(complex16) ); zgerv2d_( &ictxt, &np, &ione, buff, &ione, &myrow, &ixcol ); zaxpy_( &np, alpha, buff, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy ); } } } else /* X and Y are not distributed along the same direction */ { lcm = ilcm_( &nprow, &npcol ); if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) ) { /* X is distributed over a process column */ lcmq = lcm / npcol; nz = (*ix-1) % desc_X[MB_]; nn = *n + nz; np = numroc_( &nn, &desc_X[MB_], &myrow, &ixrow, &nprow ); nz = (*jy-1) % desc_Y[NB_]; nn = *n + nz; tmp1 = nn / desc_Y[NB_]; nq0 = MYROC0( tmp1, nn, desc_Y[NB_], npcol ); tmp1 = nq0 / desc_Y[NB_]; wksz = np + MYROC0( tmp1, nq0, desc_Y[NB_], lcmq ); buff = (complex16 *)getpbbuf( "PZAXPY", wksz*sizeof(complex16) ); if( myrow == ixrow ) np -= nz; if( mycol == ixcol ) { zcopy_( &np, &X[iix-1+(jjx-1)*desc_X[LLD_]], incx, buff, incx ); zscal_( &np, alpha, buff, incx ); } pbztrnv_( &ictxt, C2F_CHAR( "C" ), C2F_CHAR( "T" ), n, &desc_X[MB_], &nz, buff, incx, &one, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], &ixrow, &ixcol, &iyrow, &iycol, buff+np ); } else /* Y is distributed over a process column */ { lcmp = lcm / nprow; nz = (*iy-1) % desc_Y[MB_]; nn = *n + nz; tmp1 = nn / desc_Y[MB_]; np = numroc_( &nn, &desc_Y[MB_], &myrow, &iyrow, &nprow ); np0 = MYROC0( tmp1, nn, desc_Y[MB_], nprow ); tmp1 = np0 / desc_Y[MB_]; wksz = MYROC0( tmp1, np0, desc_Y[MB_], lcmp ); wksz = np + wksz; buff = (complex16 *)getpbbuf( "PZAXPY", wksz*sizeof(complex16) ); pbztrnv_( &ictxt, C2F_CHAR( "R" ), C2F_CHAR( "T" ), n, &desc_X[NB_], &nz, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &zero, buff, &ione, &ixrow, &ixcol, &iyrow, &iycol, buff+np ); if( mycol == iycol ) { if( myrow == iyrow ) np -= nz; zaxpy_( &np, alpha, buff, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy ); } } } } SHAR_EOF fi # end of overwriting check if test -f 'pzdotc.c' then echo shar: will not over-write existing file "'pzdotc.c'" else cat << "SHAR_EOF" > 'pzdotc.c' /* --------------------------------------------------------------------- * * Mark R. Fahey * August 2000 * This is a slightly modified version of pzaxpy_ from ScaLAPACK 1.0 * which fixes a bug in the incx=1 and incy=1 case. * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" void pzdotc_( n, dotc, X, ix, jx, desc_X, incx, Y, iy, jy, desc_Y, incy ) /* * .. Scalar Arguments .. */ int * incx, * incy, * ix, * iy, * jx, * jy, * n; complex16 * dotc; /* .. * .. Array Arguments .. */ int desc_X[], desc_Y[]; complex16 X[], Y[]; { /* * Purpose * ======= * * PZDOTC forms the dot product of two distributed vectors, * * dotc := sub( X )**H * sub( Y ) * * where sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector descA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DT_A (global) descA[ DT_ ] The descriptor type. In this case, * DT_A = 1. * CTXT_A (global) descA[ CTXT_ ] The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) descA[ M_ ] The number of rows in the global * array A. * N_A (global) descA[ N_ ] The number of columns in the global * array A. * MB_A (global) descA[ MB_ ] The blocking factor used to distribu- * te the rows of the array. * NB_A (global) descA[ NB_ ] The blocking factor used to distribu- * te the columns of the array. * RSRC_A (global) descA[ RSRC_ ] The process row over which the first * row of the array A is distributed. * CSRC_A (global) descA[ CSRC_ ] The process column over which the * first column of the array A is * distributed. * LLD_A (local) descA[ LLD_ ] The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be seen as particular matrices, a distributed * vector is considered to be a distributed matrix. * * If INCX = M_X and INCY = M_Y, NB_X must be equal to NB_Y, and the * process column having the first entries of sub( Y ) must also contain * the first entries of sub( X ). Moreover, the quantity * MOD( JX-1, NB_X ) must be equal to MOD( JY-1, NB_Y ). * * If INCX = M_X, INCY = 1 and INCY <> M_Y, NB_X must be equal to MB_Y. * Moreover, the quantity MOD( JX-1, NB_X ) must be equal to * MOD( IY-1, MB_Y ). * * If INCX = 1, INCX <> M_X and INCY = M_Y, MB_X must be equal to NB_Y. * Moreover, the quantity MOD( IX-1, MB_X ) must be equal to * MOD( JY-1, NB_Y ). * * If INCX = 1, INCX <> M_X, INCY = 1 and INCY <> M_Y, MB_X must be * equal to MB_Y, and the process row having the first entries of * sub( Y ) must also contain the first entries of sub( X ). Moreover, * the quantity MOD( IX-1, MB_X ) must be equal to MOD( IY-1, MB_Y ). * * * Parameters * ========== * * N (global input) pointer to INTEGER * The length of the distributed vectors to be multiplied. * N >= 0. * * DOTC (local output) pointer to COMPLEX*16 * The dot product of sub( X ) and sub( Y ) only in their scope. * * X (local input) COMPLEX*16 array containing the local * pieces of a distributed matrix of dimension of at least * ( (JX-1)*M_X + IX + ( N - 1 )*abs( INCX ) ) * This array contains the entries of the distributed vector * sub( X ). * * IX (global input) pointer to INTEGER * The global row index of the submatrix of the distributed * matrix X to operate on. * * JX (global input) pointer to INTEGER * The global column index of the submatrix of the distributed * matrix X to operate on. * * DESCX (global and local input) INTEGER array of dimension 8. * The array descriptor of the distributed matrix X. * * INCX (global input) pointer to INTEGER * The global increment for the elements of X. Only two values * of INCX are supported in this version, namely 1 and M_X. * * Y (local input) COMPLEX*16 array containing the local * pieces of a distributed matrix of dimension of at least * ( (JY-1)*M_Y + IY + ( N - 1 )*abs( INCY ) ) * This array contains the entries of the distributed vector * sub( Y ). * * IY (global input) pointer to INTEGER * The global row index of the submatrix of the distributed * matrix Y to operate on. * * JY (global input) pointer to INTEGER * The global column index of the submatrix of the distributed * matrix Y to operate on. * * DESCY (global and local input) INTEGER array of dimension 8. * The array descriptor of the distributed matrix Y. * * INCY (global input) pointer to INTEGER * The global increment for the elements of Y. Only two values * of INCY are supported in this version, namely 1 and M_Y. * * ===================================================================== * * .. Local Scalars .. */ char * cbtop, * cctop, * rbtop, * rctop; int ictxt, iix, iiy, info, ixcol, ixrow, iycol, iyrow, jjx, jjy, lcm, lcmp, mone=-1, mycol, myrow, nn, np, np0, nprow, npcol, nq, nz, ione=1, tmp1, wksz; complex16 xwork[1], ywork[1], zero; /* .. * .. PBLAS Buffer .. */ complex16 * buff; /* .. * .. External Functions .. */ void blacs_gridinfo_(); void zgebr2d_(); void zgebs2d_(); void zgerv2d_(); void zgesd2d_(); void zgsum2d_(); void pbchkvect(); void pberror_(); char * getpbbuf(); char * ptop(); F_VOID_FCT pbztrnv_(); F_VOID_FCT zzdotc_(); F_INTG_FCT ilcm_(); /* .. * .. Executable Statements .. * * Get grid parameters */ ictxt = desc_X[CTXT_]; blacs_gridinfo_( &ictxt, &nprow, &npcol, &myrow, &mycol ); /* * Test the input parameters */ info = 0; if( nprow == -1 ) info = -(600+CTXT_+1); else { pbchkvect( *n, 1, *ix, *jx, desc_X, *incx, 6, &iix, &jjx, &ixrow, &ixcol, nprow, npcol, myrow, mycol, &info ); pbchkvect( *n, 1, *iy, *jy, desc_Y, *incy, 11, &iiy, &jjy, &iyrow, &iycol, nprow, npcol, myrow, mycol, &info ); if( info == 0 ) { if( *n != 1 ) { if( *incx == desc_X[M_] ) { /* X is distributed along a process row */ if( *incy == desc_Y[M_] ) { /* Y is distributed over a process row */ if( ( ixcol != iycol ) || ( ( (*jx-1) % desc_X[NB_] ) != ( (*jy-1) % desc_Y[NB_] ) ) ) info = -10; else if( desc_Y[NB_] != desc_X[NB_] ) info = -(1100+NB_+1); } else if( ( *incy == 1 ) && ( *incy != desc_Y[M_] ) ) { /* Y is distributed over a process column */ if( ( (*jx-1) % desc_X[NB_] ) != ( (*iy-1) % desc_Y[MB_] ) ) info = -9; else if( desc_Y[MB_] != desc_X[NB_] ) info = -(1100+MB_+1); } else { info = -12; } } else if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) ) { /* X is distributed along a process column */ if( *incy == desc_Y[M_] ) { /* Y is distributed over a process row */ if( ( (*ix-1) % desc_X[MB_] ) != ( (*jy-1) % desc_Y[NB_] ) ) info = -10; else if( desc_Y[NB_] != desc_X[MB_] ) info = -(1100+NB_+1); } else if( ( *incy == 1 ) && ( *incy != desc_Y[M_] ) ) { /* Y is distributed over a process column */ if( ( ixrow != iyrow ) || ( ( (*ix-1) % desc_X[MB_] ) != ( (*iy-1) % desc_Y[MB_] ) ) ) info = -9; else if( desc_Y[MB_] != desc_X[MB_] ) info = -(1100+MB_+1); } else { info = -12; } } else { info = -7; } } if( ictxt != desc_Y[CTXT_] ) info = -(1100+CTXT_+1); } } if( info ) { pberror_( &ictxt, "PZDOTC", &info ); return; } /* * Quick return if possible. */ dotc->re = ZERO; dotc->im = ZERO; zero.re = ZERO; zero.im = ZERO; if( *n == 0 ) return; /* * dot <- x^{h} * y */ if( *n == 1 ) { if( ( myrow == ixrow ) && ( mycol == ixcol ) ) { buff = &X[iix-1+(jjx-1)*desc_X[LLD_]]; if( ( myrow != iyrow ) || ( mycol != iycol ) ) { zgesd2d_( &ictxt, n, n, buff, n, &iyrow, &iycol ); zgerv2d_( &ictxt, n, n, ywork, n, &iyrow, &iycol ); } else *ywork = Y[iiy-1+(jjy-1)*desc_Y[LLD_]]; zzdotc_( n, dotc, buff, n, ywork, n ); } else if( ( myrow == iyrow ) && ( mycol == iycol ) ) { zgesd2d_( &ictxt, n, n, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], n, &ixrow, &ixcol ); zgerv2d_( &ictxt, n, n, xwork, n, &ixrow, &ixcol ); zzdotc_( n, dotc, xwork, n, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], n ); } if( ( *incx == desc_X[M_] ) && ( desc_X[M_] != 1 ) ) { if( myrow == ixrow ) { rbtop = ptop( BROADCAST, ROW, TOPGET ); if( mycol == ixcol ) { zgebs2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotc, &ione ); } else { zgebr2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotc, &ione, &myrow, &ixcol ); } } } else if( ( *incx == 1 ) && ( desc_X[M_] != 1 ) ) { if( mycol == ixcol ) { cbtop = ptop( BROADCAST, COLUMN, TOPGET ); if( myrow == ixrow ) { zgebs2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cbtop ), &ione, &ione, dotc, &ione ); } else { zgebr2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cbtop ), &ione, &ione, dotc, &ione, &ixrow, &mycol ); } } } if( ( *incy == desc_Y[M_] ) && ( desc_Y[M_] != 1 ) ) { if( myrow == iyrow ) { rbtop = ptop( BROADCAST, ROW, TOPGET ); if( mycol == iycol ) { zgebs2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotc, &ione ); } else { zgebr2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotc, &ione, &myrow, &iycol ); } } } else if( ( *incy == 1 ) && ( desc_Y[M_] != 1 ) ) { if( mycol == iycol ) { cbtop = ptop( BROADCAST, COLUMN, TOPGET ); if( myrow == iyrow ) { zgebs2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cbtop ), &ione, &ione, dotc, &ione ); } else { zgebr2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cbtop ), &ione, &ione, dotc, &ione, &iyrow, &mycol ); } } } return; } if( ( *incx == desc_X[M_] ) && ( *incy == desc_Y[M_] ) ) { /* X and Y are both distributed over a process row */ nz = (*jx-1) % desc_Y[NB_]; nn = *n + nz; nq = numroc_( &nn, &desc_X[NB_], &mycol, &ixcol, &npcol ); if( mycol == ixcol ) nq -= nz; if( ixrow == iyrow ) { if( myrow == ixrow ) { rctop = ptop( COMBINE, ROW, TOPGET ); zzdotc_( &nq, dotc, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_] ); zgsum2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rctop ), &ione, &ione, dotc, &ione, &mone, &mycol ); } } else { if( myrow == ixrow ) { rctop = ptop( COMBINE, ROW, TOPGET ); zgesd2d_( &ictxt, &ione, &nq, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &iyrow, &mycol ); buff = (complex16 *)getpbbuf( "PZDOTC", nq*sizeof(complex16) ); zgerv2d_( &ictxt, &nq, &ione, buff, &ione, &ixrow, &mycol ); zzdotc_( &nq, dotc, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], buff, &ione ); zgsum2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rctop ), &ione, &ione, dotc, &ione, &mone, &mycol ); } else if( myrow == iyrow ) { rctop = ptop( COMBINE, ROW, TOPGET ); zgesd2d_( &ictxt, &ione, &nq, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], &ixrow, &mycol ); buff = (complex16 *)getpbbuf( "PZDOTC", nq*sizeof(complex16) ); zgerv2d_( &ictxt, &nq, &ione, buff, &ione, &ixrow, &mycol ); zzdotc_( &nq, dotc, buff, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_] ); zgsum2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rctop ), &ione, &ione, dotc, &ione, &mone, &mycol ); } } } else if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) && ( *incy == 1 ) && ( *incy != desc_Y[M_] ) ) { /* X and Y are both distributed over a process column */ nz = (*ix-1) % desc_X[MB_]; nn = *n + nz; np = numroc_( &nn, &desc_X[MB_], &myrow, &ixrow, &nprow ); if( myrow == ixrow ) np -= nz; if( ixcol == iycol ) { if( mycol == ixcol ) { cctop = ptop( COMBINE, COLUMN, TOPGET ); zzdotc_( &np, dotc, &X[iix-1+(jjx-1)*desc_X[LLD_]], incx, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy ); zgsum2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cctop ), &ione, &ione, dotc, &ione, &mone, &mycol ); } } else { if( mycol == ixcol ) { cctop = ptop( COMBINE, COLUMN, TOPGET ); zgesd2d_( &ictxt, &np, &ione, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &myrow, &iycol ); buff = (complex16 *)getpbbuf( "PZDOTC", np*sizeof(complex16) ); zgerv2d_( &ictxt, &np, &ione, buff, &ione, &myrow, &iycol ); zzdotc_( &np, dotc, &X[iix-1+(jjx-1)*desc_X[LLD_]], incx, buff, &ione ); zgsum2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cctop ), &ione, &ione, dotc, &ione, &mone, &mycol ); } else if( mycol == iycol ) { cctop = ptop( COMBINE, COLUMN, TOPGET ); buff = (complex16 *)getpbbuf( "PZDOTC", np*sizeof(complex16) ); zgerv2d_( &ictxt, &np, &ione, buff, &ione, &myrow, &ixcol ); zgesd2d_( &ictxt, &np, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], &myrow, &ixcol ); zzdotc_( &np, dotc, buff, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy ); zgsum2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cctop ), &ione, &ione, dotc, &ione, &mone, &mycol ); } } } else /* X and Y are not distributed along the same direction */ { lcm = ilcm_( &nprow, &npcol ); if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) ) { /* X is distributed over a process column */ lcmp = lcm / nprow; nz = (*jy-1) % desc_Y[NB_]; nn = *n + nz; tmp1 = nn / desc_Y[MB_]; np = numroc_( &nn, &desc_X[MB_], &myrow, &ixrow, &nprow ); np0 = MYROC0( tmp1, nn, desc_X[MB_], nprow ); tmp1 = np0 / desc_X[MB_]; wksz = MYROC0( tmp1, np0, desc_X[MB_], lcmp ); wksz = np + wksz; buff = (complex16 *)getpbbuf( "PZDOTC", wksz*sizeof(complex16) ); if( mycol == iycol ) jjy -= nz; if( myrow == ixrow ) np -= nz; pbztrnv_( &ictxt, C2F_CHAR( "R" ), C2F_CHAR( "T" ), n, &desc_Y[NB_], &nz, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], &zero, buff, &ione, &iyrow, &iycol, &ixrow, &ixcol, buff+np ); if( mycol == ixcol ) { cctop = ptop( COMBINE, COLUMN, TOPGET ); zzdotc_( &np, dotc, &X[iix-1+(jjx-1)*desc_X[LLD_]], incx, buff, &ione ); zgsum2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cctop ), &ione, &ione, dotc, &ione, &mone, &mycol ); } if( myrow == iyrow ) { rbtop = ptop( BROADCAST, ROW, TOPGET ); if( mycol == ixcol ) zgebs2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotc, &ione ); else zgebr2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotc, &ione, &myrow, &ixcol ); } } else /* Y is distributed over a process column */ { lcmp = lcm / nprow; nz = (*jx-1) % desc_X[NB_]; nn = *n + nz; tmp1 = nn / desc_X[MB_]; np = numroc_( &nn, desc_Y+MB_, &myrow, &iyrow, &nprow ); np0 = MYROC0( tmp1, nn, desc_Y[MB_], nprow ); tmp1 = np0 / desc_Y[MB_]; wksz = MYROC0( tmp1, np0, desc_Y[MB_], lcmp ); wksz = np + wksz; buff = (complex16 *)getpbbuf( "PZDOTC", wksz*sizeof(complex16) ); if( myrow == iyrow ) np -= nz; pbztrnv_( &ictxt, C2F_CHAR( "R" ), C2F_CHAR( "T" ), n, &desc_X[NB_], &nz, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &zero, buff, &ione, &ixrow, &ixcol, &iyrow, &iycol, buff+np ); if( mycol == iycol ) { cctop = ptop( COMBINE, COLUMN, TOPGET ); zzdotc_( &np, dotc, buff, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy ); zgsum2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cctop ), &ione, &ione, dotc, &ione, &mone, &mycol ); } if( myrow == ixrow ) { rbtop = ptop( BROADCAST, ROW, TOPGET ); if( mycol == iycol ) zgebs2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotc, &ione ); else zgebr2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotc, &ione, &myrow, &iycol ); } } } } SHAR_EOF fi # end of overwriting check if test -f 'pzdotu.c' then echo shar: will not over-write existing file "'pzdotu.c'" else cat << "SHAR_EOF" > 'pzdotu.c' /* --------------------------------------------------------------------- * * Mark R. Fahey * August 2000 * This is a slightly modified version of pzaxpy_ from ScaLAPACK 1.0 * which fixes a bug in the incx=1 and incy=1 case. * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" void pzdotu_( n, dotu, X, ix, jx, desc_X, incx, Y, iy, jy, desc_Y, incy ) /* * .. Scalar Arguments .. */ int * incx, * incy, * ix, * iy, * jx, * jy, * n; complex16 * dotu; /* .. * .. Array Arguments .. */ int desc_X[], desc_Y[]; complex16 X[], Y[]; { /* * Purpose * ======= * * PZDOTU forms the dot product of two distributed vectors, * * dotu := sub( X )**T * sub( Y ) * * where sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector descA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DT_A (global) descA[ DT_ ] The descriptor type. In this case, * DT_A = 1. * CTXT_A (global) descA[ CTXT_ ] The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) descA[ M_ ] The number of rows in the global * array A. * N_A (global) descA[ N_ ] The number of columns in the global * array A. * MB_A (global) descA[ MB_ ] The blocking factor used to distribu- * te the rows of the array. * NB_A (global) descA[ NB_ ] The blocking factor used to distribu- * te the columns of the array. * RSRC_A (global) descA[ RSRC_ ] The process row over which the first * row of the array A is distributed. * CSRC_A (global) descA[ CSRC_ ] The process column over which the * first column of the array A is * distributed. * LLD_A (local) descA[ LLD_ ] The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be seen as particular matrices, a distributed * vector is considered to be a distributed matrix. * * If INCX = M_X and INCY = M_Y, NB_X must be equal to NB_Y, and the * process column having the first entries of sub( Y ) must also contain * the first entries of sub( X ). Moreover, the quantity * MOD( JX-1, NB_X ) must be equal to MOD( JY-1, NB_Y ). * * If INCX = M_X, INCY = 1 and INCY <> M_Y, NB_X must be equal to MB_Y. * Moreover, the quantity MOD( JX-1, NB_X ) must be equal to * MOD( IY-1, MB_Y ). * * If INCX = 1, INCX <> M_X and INCY = M_Y, MB_X must be equal to NB_Y. * Moreover, the quantity MOD( IX-1, MB_X ) must be equal to * MOD( JY-1, NB_Y ). * * If INCX = 1, INCX <> M_X, INCY = 1 and INCY <> M_Y, MB_X must be * equal to MB_Y, and the process row having the first entries of * sub( Y ) must also contain the first entries of sub( X ). Moreover, * the quantity MOD( IX-1, MB_X ) must be equal to MOD( IY-1, MB_Y ). * * * Parameters * ========== * * N (global input) pointer to INTEGER * The length of the distributed vectors to be multiplied. * N >= 0. * * DOTU (local output) pointer to COMPLEX*16 * The dot product of sub( X ) and sub( Y ) only in their scope. * * X (local input) COMPLEX*16 array containing the local * pieces of a distributed matrix of dimension of at least * ( (JX-1)*M_X + IX + ( N - 1 )*abs( INCX ) ) * This array contains the entries of the distributed vector * sub( X ). * * IX (global input) pointer to INTEGER * The global row index of the submatrix of the distributed * matrix X to operate on. * * JX (global input) pointer to INTEGER * The global column index of the submatrix of the distributed * matrix X to operate on. * * DESCX (global and local input) INTEGER array of dimension 8. * The array descriptor of the distributed matrix X. * * INCX (global input) pointer to INTEGER * The global increment for the elements of X. Only two values * of INCX are supported in this version, namely 1 and M_X. * * Y (local input) COMPLEX*16 array containing the local * pieces of a distributed matrix of dimension of at least * ( (JY-1)*M_Y + IY + ( N - 1 )*abs( INCY ) ) * This array contains the entries of the distributed vector * sub( Y ). * * IY (global input) pointer to INTEGER * The global row index of the submatrix of the distributed * matrix Y to operate on. * * JY (global input) pointer to INTEGER * The global column index of the submatrix of the distributed * matrix Y to operate on. * * DESCY (global and local input) INTEGER array of dimension 8. * The array descriptor of the distributed matrix Y. * * INCY (global input) pointer to INTEGER * The global increment for the elements of Y. Only two values * of INCY are supported in this version, namely 1 and M_Y. * * ===================================================================== * * .. Local Scalars .. */ char * cbtop, * cctop, * rbtop, * rctop; int ictxt, iix, iiy, info, ixcol, ixrow, iycol, iyrow, jjx, jjy, lcm, lcmp, mone=-1, mycol, myrow, nn, np, np0, nprow, npcol, nq, nz, ione=1, tmp1, wksz; complex16 xwork[1], ywork[1], zero; /* .. * .. PBLAS Buffer .. */ complex16 * buff; /* .. * .. External Functions .. */ void blacs_gridinfo_(); void zgebr2d_(); void zgebs2d_(); void zgerv2d_(); void zgesd2d_(); void zgsum2d_(); void pbchkvect(); void pberror_(); char * getpbbuf(); char * ptop(); F_VOID_FCT pbztrnv_(); F_VOID_FCT zzdotu_(); F_INTG_FCT ilcm_(); /* .. * .. Executable Statements .. * * Get grid parameters */ ictxt = desc_X[CTXT_]; blacs_gridinfo_( &ictxt, &nprow, &npcol, &myrow, &mycol ); /* * Test the input parameters */ info = 0; if( nprow == -1 ) info = -(600+CTXT_+1); else { pbchkvect( *n, 1, *ix, *jx, desc_X, *incx, 6, &iix, &jjx, &ixrow, &ixcol, nprow, npcol, myrow, mycol, &info ); pbchkvect( *n, 1, *iy, *jy, desc_Y, *incy, 11, &iiy, &jjy, &iyrow, &iycol, nprow, npcol, myrow, mycol, &info ); if( info == 0 ) { if( *n != 1 ) { if( *incx == desc_X[M_] ) { /* X is distributed along a process row */ if( *incy == desc_Y[M_] ) { /* Y is distributed over a process row */ if( ( ixcol != iycol ) || ( ( (*jx-1) % desc_X[NB_] ) != ( (*jy-1) % desc_Y[NB_] ) ) ) info = -10; else if( desc_Y[NB_] != desc_X[NB_] ) info = -(1100+NB_+1); } else if( ( *incy == 1 ) && ( *incy != desc_Y[M_] ) ) { /* Y is distributed over a process column */ if( ( (*jx-1) % desc_X[NB_] ) != ( (*iy-1) % desc_Y[MB_] ) ) info = -9; else if( desc_Y[MB_] != desc_X[NB_] ) info = -(1100+MB_+1); } else { info = -12; } } else if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) ) { /* X is distributed along a process column */ if( *incy == desc_Y[M_] ) { /* Y is distributed over a process row */ if( ( (*ix-1) % desc_X[MB_] ) != ( (*jy-1) % desc_Y[NB_] ) ) info = -10; else if( desc_Y[NB_] != desc_X[MB_] ) info = -(1100+NB_+1); } else if( ( *incy == 1 ) && ( *incy != desc_Y[M_] ) ) { /* Y is distributed over a process column */ if( ( ixrow != iyrow ) || ( ( (*ix-1) % desc_X[MB_] ) != ( (*iy-1) % desc_Y[MB_] ) ) ) info = -9; else if( desc_Y[MB_] != desc_X[MB_] ) info = -(1100+MB_+1); } else { info = -12; } } else { info = -7; } } if( ictxt != desc_Y[CTXT_] ) info = -(1100+CTXT_+1); } } if( info ) { pberror_( &ictxt, "PZDOTU", &info ); return; } /* * Quick return if possible. */ dotu->re = ZERO; dotu->im = ZERO; zero.re = ZERO; zero.im = ZERO; if( *n == 0 ) return; /* * dot <- x^{t} * y */ if( *n == 1 ) { if( ( myrow == ixrow ) && ( mycol == ixcol ) ) { buff = &X[iix-1+(jjx-1)*desc_X[LLD_]]; if( ( myrow != iyrow ) || ( mycol != iycol ) ) { zgesd2d_( &ictxt, n, n, buff, n, &iyrow, &iycol ); zgerv2d_( &ictxt, n, n, ywork, n, &iyrow, &iycol ); } else *ywork = Y[iiy-1+(jjy-1)*desc_Y[LLD_]]; zzdotu_( n, dotu, buff, n, ywork, n ); } else if( ( myrow == iyrow ) && ( mycol == iycol ) ) { zgesd2d_( &ictxt, n, n, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], n, &ixrow, &ixcol ); zgerv2d_( &ictxt, n, n, xwork, n, &ixrow, &ixcol ); zzdotu_( n, dotu, xwork, n, &Y[iiy-1+(jjx-1)*desc_X[LLD_]], n ); } if( ( *incx == desc_X[M_] ) && ( desc_X[M_] != 1 ) ) { if( myrow == ixrow ) { rbtop = ptop( BROADCAST, ROW, TOPGET ); if( mycol == ixcol ) { zgebs2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotu, &ione ); } else { zgebr2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotu, &ione, &myrow, &ixcol ); } } } else if( ( *incx == 1 ) && ( desc_X[M_] != 1 ) ) { if( mycol == ixcol ) { cbtop = ptop( BROADCAST, COLUMN, TOPGET ); if( myrow == ixrow ) { zgebs2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cbtop ), &ione, &ione, dotu, &ione ); } else { zgebr2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cbtop ), &ione, &ione, dotu, &ione, &ixrow, &mycol ); } } } if( ( *incy == desc_Y[M_] ) && ( desc_Y[M_] != 1 ) ) { if( myrow == iyrow ) { rbtop = ptop( BROADCAST, ROW, TOPGET ); if( mycol == iycol ) { zgebs2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotu, &ione ); } else { zgebr2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotu, &ione, &myrow, &iycol ); } } } else if( ( *incy == 1 ) && ( desc_Y[M_] != 1 ) ) { if( mycol == iycol ) { cbtop = ptop( BROADCAST, COLUMN, TOPGET ); if( myrow == iyrow ) { zgebs2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cbtop ), &ione, &ione, dotu, &ione ); } else { zgebr2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cbtop ), &ione, &ione, dotu, &ione, &iyrow, &mycol ); } } } return; } if( ( *incx == desc_X[M_] ) && ( *incy == desc_Y[M_] ) ) { /* X and Y are both distributed over a process row */ nz = (*jx-1) % desc_Y[NB_]; nn = *n + nz; nq = numroc_( &nn, &desc_X[NB_], &mycol, &ixcol, &npcol ); if( mycol == ixcol ) nq -= nz; if( ixrow == iyrow ) { if( myrow == ixrow ) { rctop = ptop( COMBINE, ROW, TOPGET ); zzdotu_( &nq, dotu, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_] ); zgsum2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rctop ), &ione, &ione, dotu, &ione, &mone, &mycol ); } } else { if( myrow == ixrow ) { rctop = ptop( COMBINE, ROW, TOPGET ); zgesd2d_( &ictxt, &ione, &nq, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &iyrow, &mycol ); buff = (complex16 *)getpbbuf( "PZDOTU", nq*sizeof(complex16) ); zgerv2d_( &ictxt, &nq, &ione, buff, &ione, &ixrow, &mycol ); zzdotu_( &nq, dotu, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], buff, &ione ); zgsum2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rctop ), &ione, &ione, dotu, &ione, &mone, &mycol ); } else if( myrow == iyrow ) { rctop = ptop( COMBINE, ROW, TOPGET ); zgesd2d_( &ictxt, &ione, &nq, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], &ixrow, &mycol ); buff = (complex16 *)getpbbuf( "PZDOTU", nq*sizeof(complex16) ); zgerv2d_( &ictxt, &nq, &ione, buff, &ione, &ixrow, &mycol ); zzdotu_( &nq, dotu, buff, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_] ); zgsum2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rctop ), &ione, &ione, dotu, &ione, &mone, &mycol ); } } } else if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) && ( *incy == 1 ) && ( *incy != desc_Y[M_] ) ) { /* X and Y are both distributed over a process column */ nz = (*ix-1) % desc_X[MB_]; nn = *n + nz; np = numroc_( &nn, &desc_X[MB_], &myrow, &ixrow, &nprow ); if( myrow == ixrow ) np -= nz; if( ixcol == iycol ) { if( mycol == ixcol ) { cctop = ptop( COMBINE, COLUMN, TOPGET ); zzdotu_( &np, dotu, &X[iix-1+(jjx-1)*desc_X[LLD_]], incx, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy ); zgsum2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cctop ), &ione, &ione, dotu, &ione, &mone, &mycol ); } } else { if( mycol == ixcol ) { cctop = ptop( COMBINE, COLUMN, TOPGET ); zgesd2d_( &ictxt, &np, &ione, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &myrow, &iycol ); buff = (complex16 *)getpbbuf( "PZDOTU", np*sizeof(complex16) ); zgerv2d_( &ictxt, &np, &ione, buff, &ione, &myrow, &iycol ); zzdotu_( &np, dotu, &X[iix-1+(jjx-1)*desc_X[LLD_]], incx, buff, &ione ); zgsum2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cctop ), &ione, &ione, dotu, &ione, &mone, &mycol ); } else if( mycol == iycol ) { cctop = ptop( COMBINE, COLUMN, TOPGET ); zgesd2d_( &ictxt, &np, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], &myrow, &ixcol ); buff = (complex16 *)getpbbuf( "PZDOTU", np*sizeof(complex16) ); zgerv2d_( &ictxt, &np, &ione, buff, &ione, &myrow, &ixcol ); zzdotu_( &np, dotu, buff, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy ); zgsum2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cctop ), &ione, &ione, dotu, &ione, &mone, &mycol ); } } } else /* X and Y are not distributed along the same direction */ { lcm = ilcm_( &nprow, &npcol ); if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) ) { /* X is distributed over a process column */ lcmp = lcm / nprow; nz = (*jy-1) % desc_Y[NB_]; nn = *n + nz; tmp1 = nn / desc_Y[MB_]; np = numroc_( &nn, &desc_X[MB_], &myrow, &ixrow, &nprow ); np0 = MYROC0( tmp1, nn, desc_X[MB_], nprow ); tmp1 = np0 / desc_X[MB_]; wksz = MYROC0( tmp1, np0, desc_X[MB_], lcmp ); wksz = np + wksz; buff = (complex16 *)getpbbuf( "PZDOTU", wksz*sizeof(complex16) ); if( mycol == iycol ) jjy -= nz; if( myrow == ixrow ) np -= nz; pbztrnv_( &ictxt, C2F_CHAR( "R" ), C2F_CHAR( "T" ), n, &desc_Y[NB_], &nz, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], &zero, buff, &ione, &iyrow, &iycol, &ixrow, &ixcol, buff+np ); if( mycol == ixcol ) { cctop = ptop( COMBINE, COLUMN, TOPGET ); zzdotu_( &np, dotu, &X[iix-1+(jjx-1)*desc_X[LLD_]], incx, buff, &ione ); zgsum2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cctop ), &ione, &ione, dotu, &ione, &mone, &mycol ); } if( myrow == iyrow ) { rbtop = ptop( BROADCAST, ROW, TOPGET ); if( mycol == ixcol ) zgebs2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotu, &ione ); else zgebr2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotu, &ione, &myrow, &ixcol ); } } else /* Y is distributed over a process column */ { lcmp = lcm / nprow; nz = (*jx-1) % desc_X[NB_]; nn = *n + nz; tmp1 = nn / desc_X[MB_]; np = numroc_( &nn, desc_Y+MB_, &myrow, &iyrow, &nprow ); np0 = MYROC0( tmp1, nn, desc_Y[MB_], nprow ); tmp1 = np0 / desc_Y[MB_]; wksz = MYROC0( tmp1, np0, desc_Y[MB_], lcmp ); wksz = np + wksz; buff = (complex16 *)getpbbuf( "PZDOTU", wksz*sizeof(complex16) ); if( myrow == iyrow ) np -= nz; pbztrnv_( &ictxt, C2F_CHAR( "R" ), C2F_CHAR( "T" ), n, &desc_X[NB_], &nz, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &zero, buff, &ione, &ixrow, &ixcol, &iyrow, &iycol, buff+np ); if( mycol == iycol ) { cctop = ptop( COMBINE, COLUMN, TOPGET ); zzdotu_( &np, dotu, buff, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy ); zgsum2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cctop ), &ione, &ione, dotu, &ione, &mone, &mycol ); } if( myrow == ixrow ) { rbtop = ptop( BROADCAST, ROW, TOPGET ); if( mycol == iycol ) zgebs2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotu, &ione ); else zgebr2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotu, &ione, &myrow, &iycol ); } } } } SHAR_EOF fi # end of overwriting check if test -f 'pzlaconsb.f' then echo shar: will not over-write existing file "'pzlaconsb.f'" else cat << "SHAR_EOF" > 'pzlaconsb.f' SUBROUTINE PZLACONSB( A, DESCA, I, L, M, H44, H33, H43H34, BUF, $ LWORK ) * * Mark R. Fahey * May 28, 1999 * * * .. Scalar Arguments .. INTEGER I, L, LWORK, M COMPLEX*16 H33, H43H34, H44 * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), BUF( * ) * .. * * Purpose * ======= * * PZLACONSB looks for two consecutive small subdiagonal elements by * seeing the effect of starting a double shift QR iteration * given by H44, H33, & H43H34 and see if this would make a * subdiagonal negligible. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * A (global input) COMPLEX*16 array, dimension * (DESCA(LLD_),*) * On entry, the Hessenberg matrix whose tridiagonal part is * being scanned. * Unchanged on exit. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * I (global input) INTEGER * The global location of the bottom of the unreduced * submatrix of A. * Unchanged on exit. * * L (global input) INTEGER * The global location of the top of the unreduced submatrix * of A. * Unchanged on exit. * * M (global output) INTEGER * On exit, this yields the starting location of the QR double * shift. This will satisfy: L <= M <= I-2. * * H44 * H33 * H43H34 (global input) COMPLEX*16 * These three values are for the double shift QR iteration. * * BUF (local output) COMPLEX*16 array of size LWORK. * * LWORK (global input) INTEGER * On exit, LWORK is the size of the work buffer. * This must be at least 7*Ceil( Ceil( (I-L)/HBL ) / * LCM(NPROW,NPCOL) ) * Here LCM is least common multiple, and NPROWxNPCOL is the * logical grid size. * * Logic: * ====== * * Two consecutive small subdiagonal elements will stall * convergence of a double shift if their product is small * relatively even if each is not very small. Thus it is * necessary to scan the "tridiagonal portion of the matrix." In * the LAPACK algorithm ZLAHQR, a loop of M goes from I-2 down to * L and examines * H(m,m),H(m+1,m+1),H(m+1,m),H(m,m+1),H(m-1,m-1),H(m,m-1), and * H(m+2,m-1). Since these elements may be on separate * processors, the first major loop (10) goes over the tridiagonal * and has each node store whatever values of the 7 it has that * the node owning H(m,m) does not. This will occur on a border * and can happen in no more than 3 locations per block assuming * square blocks. There are 5 buffers that each node stores these * values: a buffer to send diagonally down and right, a buffer * to send up, a buffer to send left, a buffer to send diagonally * up and left and a buffer to send right. Each of these buffers * is actually stored in one buffer BUF where BUF(ISTR1+1) starts * the first buffer, BUF(ISTR2+1) starts the second, etc.. After * the values are stored, if there are any values that a node * needs, they will be sent and received. Then the next major * loop passes over the data and searches for two consecutive * small subdiagonals. * * Notes: * * This routine does a global maximum and must be called by all * processes. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CONTXT, DOWN, HBL, IBUF1, IBUF2, IBUF3, IBUF4, $ IBUF5, ICOL1, II, IRCV1, IRCV2, IRCV3, IRCV4, $ IRCV5, IROW1, ISRC, ISTR1, ISTR2, ISTR3, ISTR4, $ ISTR5, JJ, JSRC, LDA, LEFT, MODKM1, MYCOL, $ MYROW, NPCOL, NPROW, NUM, RIGHT, UP DOUBLE PRECISION S, TST1, ULP COMPLEX*16 CDUM, H00, H10, H11, H12, H21, H22, H33S, H44S, $ V1, V2, V3 * .. * .. External Functions .. INTEGER ILCM DOUBLE PRECISION PDLAMCH EXTERNAL ILCM, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGAMX2D, INFOG2L, PXERBLA, $ ZGERV2D, ZGESD2D * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DIMAG, MOD * .. * .. Statement Functions .. DOUBLE PRECISION CABS1 * .. * .. Statement Function definitions .. CABS1( CDUM ) = ABS( DBLE( CDUM ) ) + ABS( DIMAG( CDUM ) ) * .. * .. Executable Statements .. * HBL = DESCA( MB_ ) CONTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) ULP = PDLAMCH( CONTXT, 'PRECISION' ) CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) LEFT = MOD( MYCOL+NPCOL-1, NPCOL ) RIGHT = MOD( MYCOL+1, NPCOL ) UP = MOD( MYROW+NPROW-1, NPROW ) DOWN = MOD( MYROW+1, NPROW ) NUM = NPROW*NPCOL * * BUFFER1 starts at BUF(ISTR1+1) and will contain IBUF1 elements * BUFFER2 starts at BUF(ISTR2+1) and will contain IBUF2 elements * BUFFER3 starts at BUF(ISTR3+1) and will contain IBUF3 elements * BUFFER4 starts at BUF(ISTR4+1) and will contain IBUF4 elements * BUFFER5 starts at BUF(ISTR5+1) and will contain IBUF5 elements * ISTR1 = 0 ISTR2 = ( ( I-L-1 ) / HBL ) IF( ISTR2*HBL.LT.( I-L-1 ) ) $ ISTR2 = ISTR2 + 1 II = ISTR2 / ILCM( NPROW, NPCOL ) IF( II*ILCM( NPROW, NPCOL ).LT.ISTR2 ) THEN ISTR2 = II + 1 ELSE ISTR2 = II END IF IF( LWORK.LT.7*ISTR2 ) THEN CALL PXERBLA( CONTXT, 'PZLACONSB', 10 ) RETURN END IF ISTR3 = 3*ISTR2 ISTR4 = ISTR3 + ISTR2 ISTR5 = ISTR3 + ISTR3 CALL INFOG2L( I-2, I-2, DESCA, NPROW, NPCOL, MYROW, MYCOL, IROW1, $ ICOL1, II, JJ ) MODKM1 = MOD( I-3+HBL, HBL ) * * Copy our relevant pieces of triadiagonal that we owe into * 5 buffers to send to whomever owns H(M,M) as M moves diagonally * up the tridiagonal * IBUF1 = 0 IBUF2 = 0 IBUF3 = 0 IBUF4 = 0 IBUF5 = 0 IRCV1 = 0 IRCV2 = 0 IRCV3 = 0 IRCV4 = 0 IRCV5 = 0 DO 10 M = I - 2, L, -1 IF( ( MODKM1.EQ.0 ) .AND. ( DOWN.EQ.II ) .AND. $ ( RIGHT.EQ.JJ ) .AND. ( M.GT.L ) ) THEN * * We must pack H(M-1,M-1) and send it diagonal down * IF( ( DOWN.NE.MYROW ) .OR. ( RIGHT.NE.MYCOL ) ) THEN CALL INFOG2L( M-1, M-1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW1, ICOL1, ISRC, JSRC ) IBUF1 = IBUF1 + 1 BUF( ISTR1+IBUF1 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF IF( ( MODKM1.EQ.0 ) .AND. ( MYROW.EQ.II ) .AND. $ ( RIGHT.EQ.JJ ) .AND. ( M.GT.L ) ) THEN * * We must pack H(M ,M-1) and send it right * IF( NPCOL.GT.1 ) THEN CALL INFOG2L( M, M-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ISRC, JSRC ) IBUF5 = IBUF5 + 1 BUF( ISTR5+IBUF5 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF IF( ( MODKM1.EQ.HBL-1 ) .AND. ( UP.EQ.II ) .AND. $ ( MYCOL.EQ.JJ ) ) THEN * * We must pack H(M+1,M) and send it up * IF( NPROW.GT.1 ) THEN CALL INFOG2L( M+1, M, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ISRC, JSRC ) IBUF2 = IBUF2 + 1 BUF( ISTR2+IBUF2 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF IF( ( MODKM1.EQ.HBL-1 ) .AND. ( MYROW.EQ.II ) .AND. $ ( LEFT.EQ.JJ ) ) THEN * * We must pack H(M ,M+1) and send it left * IF( NPCOL.GT.1 ) THEN CALL INFOG2L( M, M+1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ISRC, JSRC ) IBUF3 = IBUF3 + 1 BUF( ISTR3+IBUF3 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF IF( ( MODKM1.EQ.HBL-1 ) .AND. ( UP.EQ.II ) .AND. $ ( LEFT.EQ.JJ ) ) THEN * * We must pack H(M+1,M+1) & H(M+2,M+1) and send it * diagonally up * IF( ( UP.NE.MYROW ) .OR. ( LEFT.NE.MYCOL ) ) THEN CALL INFOG2L( M+1, M+1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW1, ICOL1, ISRC, JSRC ) IBUF4 = IBUF4 + 2 BUF( ISTR4+IBUF4-1 ) = A( ( ICOL1-1 )*LDA+IROW1 ) BUF( ISTR4+IBUF4 ) = A( ( ICOL1-1 )*LDA+IROW1+1 ) END IF END IF IF( ( MODKM1.EQ.HBL-2 ) .AND. ( UP.EQ.II ) .AND. $ ( MYCOL.EQ.JJ ) ) THEN * * We must pack H(M+2,M+1) and send it up * IF( NPROW.GT.1 ) THEN CALL INFOG2L( M+2, M+1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW1, ICOL1, ISRC, JSRC ) IBUF2 = IBUF2 + 1 BUF( ISTR2+IBUF2 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF * * Add up the receives * IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN IF( ( MODKM1.EQ.0 ) .AND. ( M.GT.L ) .AND. $ ( ( NPROW.GT.1 ) .OR. ( NPCOL.GT.1 ) ) ) THEN * * We must receive H(M-1,M-1) from diagonal up * IRCV1 = IRCV1 + 1 END IF IF( ( MODKM1.EQ.0 ) .AND. ( NPCOL.GT.1 ) .AND. ( M.GT.L ) ) $ THEN * * We must receive H(M ,M-1) from left * IRCV5 = IRCV5 + 1 END IF IF( ( MODKM1.EQ.HBL-1 ) .AND. ( NPROW.GT.1 ) ) THEN * * We must receive H(M+1,M ) from down * IRCV2 = IRCV2 + 1 END IF IF( ( MODKM1.EQ.HBL-1 ) .AND. ( NPCOL.GT.1 ) ) THEN * * We must receive H(M ,M+1) from right * IRCV3 = IRCV3 + 1 END IF IF( ( MODKM1.EQ.HBL-1 ) .AND. $ ( ( NPROW.GT.1 ) .OR. ( NPCOL.GT.1 ) ) ) THEN * * We must receive H(M+1:M+2,M+1) from diagonal down * IRCV4 = IRCV4 + 2 END IF IF( ( MODKM1.EQ.HBL-2 ) .AND. ( NPROW.GT.1 ) ) THEN * * We must receive H(M+2,M+1) from down * IRCV2 = IRCV2 + 1 END IF END IF * * Possibly change owners (occurs only when MOD(M-1,HBL) = 0) * IF( MODKM1.EQ.0 ) THEN II = II - 1 JJ = JJ - 1 IF( II.LT.0 ) $ II = NPROW - 1 IF( JJ.LT.0 ) $ JJ = NPCOL - 1 END IF MODKM1 = MODKM1 - 1 IF( MODKM1.LT.0 ) $ MODKM1 = HBL - 1 10 CONTINUE * * * Send data on to the appropriate node if there is any data to send * IF( IBUF1.GT.0 ) THEN CALL ZGESD2D( CONTXT, IBUF1, 1, BUF( ISTR1+1 ), IBUF1, DOWN, $ RIGHT ) END IF IF( IBUF2.GT.0 ) THEN CALL ZGESD2D( CONTXT, IBUF2, 1, BUF( ISTR2+1 ), IBUF2, UP, $ MYCOL ) END IF IF( IBUF3.GT.0 ) THEN CALL ZGESD2D( CONTXT, IBUF3, 1, BUF( ISTR3+1 ), IBUF3, MYROW, $ LEFT ) END IF IF( IBUF4.GT.0 ) THEN CALL ZGESD2D( CONTXT, IBUF4, 1, BUF( ISTR4+1 ), IBUF4, UP, $ LEFT ) END IF IF( IBUF5.GT.0 ) THEN CALL ZGESD2D( CONTXT, IBUF5, 1, BUF( ISTR5+1 ), IBUF5, MYROW, $ RIGHT ) END IF * * Receive appropriate data if there is any * IF( IRCV1.GT.0 ) THEN CALL ZGERV2D( CONTXT, IRCV1, 1, BUF( ISTR1+1 ), IRCV1, UP, $ LEFT ) END IF IF( IRCV2.GT.0 ) THEN CALL ZGERV2D( CONTXT, IRCV2, 1, BUF( ISTR2+1 ), IRCV2, DOWN, $ MYCOL ) END IF IF( IRCV3.GT.0 ) THEN CALL ZGERV2D( CONTXT, IRCV3, 1, BUF( ISTR3+1 ), IRCV3, MYROW, $ RIGHT ) END IF IF( IRCV4.GT.0 ) THEN CALL ZGERV2D( CONTXT, IRCV4, 1, BUF( ISTR4+1 ), IRCV4, DOWN, $ RIGHT ) END IF IF( IRCV5.GT.0 ) THEN CALL ZGERV2D( CONTXT, IRCV5, 1, BUF( ISTR5+1 ), IRCV5, MYROW, $ LEFT ) END IF * * Start main loop * IBUF1 = 0 IBUF2 = 0 IBUF3 = 0 IBUF4 = 0 IBUF5 = 0 CALL INFOG2L( I-2, I-2, DESCA, NPROW, NPCOL, MYROW, MYCOL, IROW1, $ ICOL1, II, JJ ) MODKM1 = MOD( I-3+HBL, HBL ) IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) .AND. $ ( MODKM1.NE.HBL-1 ) ) THEN CALL INFOG2L( I-2, I-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ISRC, JSRC ) END IF * * Look for two consecutive small subdiagonal elements. * DO 20 M = I - 2, L, -1 * * Determine the effect of starting the double-shift QR * iteration at row M, and see if this would make H(M,M-1) * negligible. * IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN IF( MODKM1.EQ.0 ) THEN H22 = A( ( ICOL1-1 )*LDA+IROW1+1 ) H11 = A( ( ICOL1-2 )*LDA+IROW1 ) V3 = A( ( ICOL1-1 )*LDA+IROW1+2 ) H21 = A( ( ICOL1-2 )*LDA+IROW1+1 ) H12 = A( ( ICOL1-1 )*LDA+IROW1 ) IF( M.GT.L ) THEN IF( NUM.GT.1 ) THEN IBUF1 = IBUF1 + 1 H00 = BUF( ISTR1+IBUF1 ) ELSE H00 = A( ( ICOL1-3 )*LDA+IROW1-1 ) END IF IF( NPCOL.GT.1 ) THEN IBUF5 = IBUF5 + 1 H10 = BUF( ISTR5+IBUF5 ) ELSE H10 = A( ( ICOL1-3 )*LDA+IROW1 ) END IF END IF END IF IF( MODKM1.EQ.HBL-1 ) THEN CALL INFOG2L( M, M, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ISRC, JSRC ) H11 = A( ( ICOL1-1 )*LDA+IROW1 ) IF( NUM.GT.1 ) THEN IBUF4 = IBUF4 + 2 H22 = BUF( ISTR4+IBUF4-1 ) V3 = BUF( ISTR4+IBUF4 ) ELSE H22 = A( ICOL1*LDA+IROW1+1 ) V3 = A( ( ICOL1+1 )*LDA+IROW1+1 ) END IF IF( NPROW.GT.1 ) THEN IBUF2 = IBUF2 + 1 H21 = BUF( ISTR2+IBUF2 ) ELSE H21 = A( ( ICOL1-1 )*LDA+IROW1+1 ) END IF IF( NPCOL.GT.1 ) THEN IBUF3 = IBUF3 + 1 H12 = BUF( ISTR3+IBUF3 ) ELSE H12 = A( ICOL1*LDA+IROW1 ) END IF IF( M.GT.L ) THEN H00 = A( ( ICOL1-2 )*LDA+IROW1-1 ) H10 = A( ( ICOL1-2 )*LDA+IROW1 ) END IF * * Adjust ICOL1 for next iteration where MODKM1=HBL-2 * ICOL1 = ICOL1 + 1 END IF IF( MODKM1.EQ.HBL-2 ) THEN H22 = A( ( ICOL1-1 )*LDA+IROW1+1 ) H11 = A( ( ICOL1-2 )*LDA+IROW1 ) IF( NPROW.GT.1 ) THEN IBUF2 = IBUF2 + 1 V3 = BUF( ISTR2+IBUF2 ) ELSE V3 = A( ( ICOL1-1 )*LDA+IROW1+2 ) END IF H21 = A( ( ICOL1-2 )*LDA+IROW1+1 ) H12 = A( ( ICOL1-1 )*LDA+IROW1 ) IF( M.GT.L ) THEN H00 = A( ( ICOL1-3 )*LDA+IROW1-1 ) H10 = A( ( ICOL1-3 )*LDA+IROW1 ) END IF END IF IF( ( MODKM1.LT.HBL-2 ) .AND. ( MODKM1.GT.0 ) ) THEN H22 = A( ( ICOL1-1 )*LDA+IROW1+1 ) H11 = A( ( ICOL1-2 )*LDA+IROW1 ) V3 = A( ( ICOL1-1 )*LDA+IROW1+2 ) H21 = A( ( ICOL1-2 )*LDA+IROW1+1 ) H12 = A( ( ICOL1-1 )*LDA+IROW1 ) IF( M.GT.L ) THEN H00 = A( ( ICOL1-3 )*LDA+IROW1-1 ) H10 = A( ( ICOL1-3 )*LDA+IROW1 ) END IF END IF H44S = H44 - H11 H33S = H33 - H11 V1 = ( H33S*H44S-H43H34 ) / H21 + H12 V2 = H22 - H11 - H33S - H44S S = CABS1( V1 ) + CABS1( V2 ) + CABS1( V3 ) V1 = V1 / S V2 = V2 / S V3 = V3 / S IF( M.EQ.L ) $ GO TO 30 TST1 = CABS1( V1 )*( CABS1( H00 )+CABS1( H11 )+ $ CABS1( H22 ) ) IF( CABS1( H10 )*( CABS1( V2 )+CABS1( V3 ) ).LE.ULP*TST1 ) $ GO TO 30 * * Slide indices diagonally up one for next iteration * IROW1 = IROW1 - 1 ICOL1 = ICOL1 - 1 END IF IF( M.EQ.L ) THEN * * Stop regardless of which node we are * GO TO 30 END IF * * Possibly change owners if on border * IF( MODKM1.EQ.0 ) THEN II = II - 1 JJ = JJ - 1 IF( II.LT.0 ) $ II = NPROW - 1 IF( JJ.LT.0 ) $ JJ = NPCOL - 1 END IF MODKM1 = MODKM1 - 1 IF( MODKM1.LT.0 ) $ MODKM1 = HBL - 1 20 CONTINUE 30 CONTINUE * CALL IGAMX2D( CONTXT, 'ALL', ' ', 1, 1, M, 1, L, L, -1, -1, -1 ) * RETURN * * End of PZLACONSB * END SHAR_EOF fi # end of overwriting check if test -f 'pzlacp3.f' then echo shar: will not over-write existing file "'pzlacp3.f'" else cat << "SHAR_EOF" > 'pzlacp3.f' SUBROUTINE PZLACP3( M, I, A, DESCA, B, LDB, II, JJ, REV ) * * Mark R. Fahey * May 28, 1999 * * .. Scalar Arguments .. INTEGER I, II, JJ, LDB, M, REV * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), B( LDB, * ) * .. * * Purpose * ======= * * PZLACP3 is an auxiliary routine that copies from a global parallel * array into a local replicated array or vise versa. Notice that * the entire submatrix that is copied gets placed on one node or * more. The receiving node can be specified precisely, or all nodes * can receive, or just one row or column of nodes. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * M is the order of the square submatrix that is copied. * M >= 0. * Unchanged on exit * * I (global input) INTEGER * A(I,I) is the global location that the copying starts from. * Unchanged on exit. * * A (global input/output) COMPLEX*16 array, dimension * (DESCA(LLD_),*) * On entry, the parallel matrix to be copied into or from. * On exit, if REV=1, the copied data. * Unchanged on exit if REV=0. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input/output) COMPLEX*16 array of size (LDB,M) * If REV=0, this is the global portion of the array * A(I:I+M-1,I:I+M-1). * If REV=1, this is the unchanged on exit. * * LDB (local input) INTEGER * The leading dimension of B. * * II (global input) INTEGER * By using REV 0 & 1, data can be sent out and returned again. * If REV=0, then II is destination row index for the node(s) * receiving the replicated B. * If II>=0,JJ>=0, then node (II,JJ) receives the data * If II=-1,JJ>=0, then all rows in column JJ receive the data * If II>=0,JJ=-1, then all cols in row II receive the data * If II=-1,JJ=-1, then all nodes receive the data * If REV<>0, then II is the source row index for the node(s) * sending the replicated B. * * JJ (global input) INTEGER * Similar description as II above * * REV (global input) INTEGER * Use REV = 0 to send global A into locally replicated B * (on node (II,JJ)). * Use REV <> 0 to send locally replicated B from node (II,JJ) * to its owner (which changes depending on its location in A) * into the global A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER COL, CONTXT, HBL, ICOL1, ICOL2, IDI, IDJ, IFIN, $ III, IROW1, IROW2, ISTOP, ISTOPI, ISTOPJ, ITMP, $ JJJ, LDA, MYCOL, MYROW, NPCOL, NPROW, ROW * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG1L, ZGEBR2D, ZGEBS2D, $ ZGERV2D, ZGESD2D * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * IF( M.LE.0 ) $ RETURN * HBL = DESCA( MB_ ) CONTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) * CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( REV.EQ.0 ) THEN DO 20 IDI = 1, M DO 10 IDJ = 1, M B( IDI, IDJ ) = ZERO 10 CONTINUE 20 CONTINUE END IF * IFIN = I + M - 1 * IF( MOD( I+HBL, HBL ).NE.0 ) THEN ISTOP = MIN( I+HBL-MOD( I+HBL, HBL ), IFIN ) ELSE ISTOP = I END IF IDJ = I ISTOPJ = ISTOP IF( IDJ.LE.IFIN ) THEN 30 CONTINUE IDI = I ISTOPI = ISTOP IF( IDI.LE.IFIN ) THEN 40 CONTINUE ROW = MOD( ( IDI-1 ) / HBL, NPROW ) COL = MOD( ( IDJ-1 ) / HBL, NPCOL ) CALL INFOG1L( IDI, HBL, NPROW, ROW, 0, IROW1, ITMP ) IROW2 = NUMROC( ISTOPI, HBL, ROW, 0, NPROW ) CALL INFOG1L( IDJ, HBL, NPCOL, COL, 0, ICOL1, ITMP ) ICOL2 = NUMROC( ISTOPJ, HBL, COL, 0, NPCOL ) IF( ( MYROW.EQ.ROW ) .AND. ( MYCOL.EQ.COL ) ) THEN IF( ( II.EQ.-1 ) .AND. ( JJ.EQ.-1 ) ) THEN * * Send the message to everyone * IF( REV.EQ.0 ) THEN CALL ZGEBS2D( CONTXT, 'All', ' ', IROW2-IROW1+1, $ ICOL2-ICOL1+1, A( ( ICOL1-1 )*LDA+ $ IROW1 ), LDA ) END IF END IF IF( ( II.EQ.-1 ) .AND. ( JJ.NE.-1 ) ) THEN * * Send the message to Column MYCOL which better be JJ * IF( REV.EQ.0 ) THEN CALL ZGEBS2D( CONTXT, 'Col', ' ', IROW2-IROW1+1, $ ICOL2-ICOL1+1, A( ( ICOL1-1 )*LDA+ $ IROW1 ), LDA ) END IF END IF IF( ( II.NE.-1 ) .AND. ( JJ.EQ.-1 ) ) THEN * * Send the message to Row MYROW which better be II * IF( REV.EQ.0 ) THEN CALL ZGEBS2D( CONTXT, 'Row', ' ', IROW2-IROW1+1, $ ICOL2-ICOL1+1, A( ( ICOL1-1 )*LDA+ $ IROW1 ), LDA ) END IF END IF IF( ( II.NE.-1 ) .AND. ( JJ.NE.-1 ) .AND. $ ( ( MYROW.NE.II ) .OR. ( MYCOL.NE.JJ ) ) ) THEN * * Recv/Send the message to (II,JJ) * IF( REV.EQ.0 ) THEN CALL ZGESD2D( CONTXT, IROW2-IROW1+1, ICOL2-ICOL1+1, $ A( ( ICOL1-1 )*LDA+IROW1 ), LDA, II, $ JJ ) ELSE CALL ZGERV2D( CONTXT, IROW2-IROW1+1, ICOL2-ICOL1+1, $ B( IDI-I+1, IDJ-I+1 ), LDB, II, JJ ) END IF END IF IF( REV.EQ.0 ) THEN DO 60 JJJ = ICOL1, ICOL2 DO 50 III = IROW1, IROW2 B( IDI+III-IROW1+1-I, IDJ+JJJ-ICOL1+1-I ) $ = A( ( JJJ-1 )*LDA+III ) 50 CONTINUE 60 CONTINUE ELSE DO 80 JJJ = ICOL1, ICOL2 DO 70 III = IROW1, IROW2 A( ( JJJ-1 )*LDA+III ) = B( IDI+III-IROW1+1-I, $ IDJ+JJJ-ICOL1+1-I ) 70 CONTINUE 80 CONTINUE END IF ELSE IF( ( II.EQ.-1 ) .AND. ( JJ.EQ.-1 ) ) THEN IF( REV.EQ.0 ) THEN CALL ZGEBR2D( CONTXT, 'All', ' ', IROW2-IROW1+1, $ ICOL2-ICOL1+1, B( IDI-I+1, IDJ-I+1 ), $ LDB, ROW, COL ) END IF END IF IF( ( II.EQ.-1 ) .AND. ( JJ.EQ.MYCOL ) ) THEN IF( REV.EQ.0 ) THEN CALL ZGEBR2D( CONTXT, 'Col', ' ', IROW2-IROW1+1, $ ICOL2-ICOL1+1, B( IDI-I+1, IDJ-I+1 ), $ LDB, ROW, COL ) END IF END IF IF( ( II.EQ.MYROW ) .AND. ( JJ.EQ.-1 ) ) THEN IF( REV.EQ.0 ) THEN CALL ZGEBR2D( CONTXT, 'Row', ' ', IROW2-IROW1+1, $ ICOL2-ICOL1+1, B( IDI-I+1, IDJ-I+1 ), $ LDB, ROW, COL ) END IF END IF IF( ( II.EQ.MYROW ) .AND. ( JJ.EQ.MYCOL ) ) THEN IF( REV.EQ.0 ) THEN CALL ZGERV2D( CONTXT, IROW2-IROW1+1, ICOL2-ICOL1+1, $ B( IDI-I+1, IDJ-I+1 ), LDB, ROW, $ COL ) ELSE CALL ZGESD2D( CONTXT, IROW2-IROW1+1, ICOL2-ICOL1+1, $ B( IDI-I+1, IDJ-I+1 ), LDB, ROW, $ COL ) * CALL ZGESD2D(CONTXT, IROW2-IROW1+1, ICOL2-ICOL1+1, * $ A((ICOL1-1)*LDA+IROW1),LDA, ROW, COL) END IF END IF END IF IDI = ISTOPI + 1 ISTOPI = MIN( ISTOPI+HBL, IFIN ) IF( IDI.LE.IFIN ) $ GO TO 40 END IF IDJ = ISTOPJ + 1 ISTOPJ = MIN( ISTOPJ+HBL, IFIN ) IF( IDJ.LE.IFIN ) $ GO TO 30 END IF RETURN * * End of PZLACP3 * END SHAR_EOF fi # end of overwriting check if test -f 'pzlahqr.f' then echo shar: will not over-write existing file "'pzlahqr.f'" else cat << "SHAR_EOF" > 'pzlahqr.f' SUBROUTINE PZLAHQR( WANTT, WANTZ, N, ILO, IHI, A, DESCA, W, ILOZ, $ IHIZ, Z, DESCZ, WORK, LWORK, IWORK, ILWORK, $ INFO ) * * Mark R. Fahey * June 22, 2000 * * .. Scalar Arguments .. LOGICAL WANTT, WANTZ INTEGER IHI, IHIZ, ILO, ILOZ, ILWORK, INFO, LWORK, N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCZ( * ), IWORK( * ) COMPLEX*16 A( * ), W( * ), WORK( * ), Z( * ) * .. * * Purpose * ======= * * PZLAHQR is an auxiliary routine used to find the Schur decomposition * and or eigenvalues of a matrix already in Hessenberg form from * cols ILO to IHI. * If Z = I, and WANTT=WANTZ=.TRUE., H gets replaced with Z'HZ, * with Z'Z=I, and H in Schur form. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCp(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCp( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCq( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCp() and LOCq() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCp( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCq( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCp( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCq( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * WANTT (global input) LOGICAL * = .TRUE. : the full Schur form T is required; * = .FALSE.: only eigenvalues are required. * * WANTZ (global input) LOGICAL * = .TRUE. : the matrix of Schur vectors Z is required; * = .FALSE.: Schur vectors are not required. * * N (global input) INTEGER * The order of the Hessenberg matrix A (and Z if WANTZ). * N >= 0. * * ILO (global input) INTEGER * IHI (global input) INTEGER * It is assumed that A is already upper quasi-triangular in * rows and columns IHI+1:N, and that A(ILO,ILO-1) = 0 (unless * ILO = 1). PZLAHQR works primarily with the Hessenberg * submatrix in rows and columns ILO to IHI, but applies * transformations to all of H if WANTT is .TRUE.. * 1 <= ILO <= max(1,IHI); IHI <= N. * * A (global input/output) COMPLEX*16 array, dimension * (DESCA(LLD_),*) * On entry, the upper Hessenberg matrix A. * On exit, if WANTT is .TRUE., A is upper triangular in rows * and columns ILO:IHI. If WANTT is .FALSE., the contents of * A are unspecified on exit. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * W (global replicated output) COMPLEX*16 array, dimension (N) * The computed eigenvalues ILO to IHI are stored in the * corresponding elements of W. If WANTT is .TRUE., the * eigenvalues are stored in the same order as on the diagonal * of the Schur form returned in A. A may be returned with * larger diagonal blocks until the next release. * * ILOZ (global input) INTEGER * IHIZ (global input) INTEGER * Specify the rows of Z to which transformations must be * applied if WANTZ is .TRUE.. * 1 <= ILOZ <= ILO; IHI <= IHIZ <= N. * * Z (global input/output) COMPLEX*16 array. * If WANTZ is .TRUE., on entry Z must contain the current * matrix Z of transformations accumulated by PZHSEQR, and on * exit Z has been updated; transformations are applied only to * the submatrix Z(ILOZ:IHIZ,ILO:IHI). * If WANTZ is .FALSE., Z is not referenced. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * WORK (local output) COMPLEX*16 array of size LWORK * (Unless LWORK=-1, in which case WORK must be at least size 1) * * LWORK (local input) INTEGER * WORK(LWORK) is a local array and LWORK is assumed big enough * so that LWORK >= 3*N + * MAX( 2*MAX(DESCZ(LLD_),DESCA(LLD_)) + 2*LOCq(N), * 7*Ceil(N/HBL)/LCM(NPROW,NPCOL)) + * MAX( 2*N, (8*LCM(NPROW,NPCOL)+2)**2 ) * If LWORK=-1, then WORK(1) gets set to the above number and * the code returns immediately. * * IWORK (global and local input) INTEGER array of size ILWORK * This will hold some of the IBLK integer arrays. * This is held as a place holder for a future release. * Currently unreferenced. * * ILWORK (local input) INTEGER * This will hold the size of the IWORK array. * This is held as a place holder for a future release. * Currently unreferenced. * * INFO (global output) INTEGER * < 0: parameter number -INFO incorrect or inconsistent * = 0: successful exit * > 0: PZLAHQR failed to compute all the eigenvalues ILO to IHI * in a total of 30*(IHI-ILO+1) iterations; if INFO = i, * elements i+1:ihi of W contains those eigenvalues * which have been successfully computed. * * Logic: * This algorithm is very similar to DLAHQR. Unlike DLAHQR, * instead of sending one double shift through the largest * unreduced submatrix, this algorithm sends multiple double shifts * and spaces them apart so that there can be parallelism across * several processor row/columns. Another critical difference is * that this algorithm aggregrates multiple transforms together in * order to apply them in a block fashion. * * Important Local Variables: * IBLK = The maximum number of bulges that can be computed. * Currently fixed. Future releases this won't be fixed. * HBL = The square block size (HBL=DESCA(MB_)=DESCA(NB_)) * ROTN = The number of transforms to block together * NBULGE = The number of bulges that will be attempted on the * current submatrix. * IBULGE = The current number of bulges started. * K1(*),K2(*) = The current bulge loops from K1(*) to K2(*). * * Subroutines: * From LAPACK, this routine calls: * ZLAHQR -> Serial QR used to determine shifts and eigenvalues * ZLARFG -> Determine the Householder transforms * * This ScaLAPACK, this routine calls: * PZLACONSB -> To determine where to start each iteration * ZLAMSH -> Sends multiple shifts through a small submatrix to * see how the consecutive subdiagonals change (if * PZLACONSB indicates we can start a run in the middle) * PZLAWIL -> Given the shift, get the transformation * PZLACP3 -> Parallel array to local replicated array copy & back. * ZLAREF -> Row/column reflector applier. Core routine here. * PZLASMSUB -> Finds negligible subdiagonal elements. * * Current Notes and/or Restrictions: * 1.) This code requires the distributed block size to be square * and at least six (6); unlike simpler codes like LU, this * algorithm is extremely sensitive to block size. Unwise * choices of too small a block size can lead to bad * performance. * 2.) This code requires A and Z to be distributed identically * and have identical contxts. A future version may allow Z to * have a different contxt to 1D row map it to all nodes (so no * communication on Z is necessary.) * 3.) This code does not currently block the initial transforms * so that none of the rows or columns for any bulge are * completed until all are started. To offset pipeline * start-up it is recommended that at least 2*LCM(NPROW,NPCOL) * bulges are used (if possible) * 4.) The maximum number of bulges currently supported is fixed at * 32. In future versions this will be limited only by the * incoming WORK and IWORK array. * 5.) The matrix A must be in upper Hessenberg form. If elements * below the subdiagonal are nonzero, the resulting transforms * may be nonsimilar. This is also true with the LAPACK * routine ZLAHQR. * 6.) For this release, this code has only been tested for * RSRC_=CSRC_=0, but it has been written for the general case. * 7.) Currently, all the eigenvalues are distributed to all the * nodes. Future releases will probably distribute the * eigenvalues by the column partitioning. * 8.) The internals of this routine are subject to change. * 9.) To optimize this for your architecture, try tuning ZLAREF. * 10.) This code has only been tested for WANTZ = .TRUE. and may * behave unpredictably for WANTZ set to .FALSE. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DT_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DT_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION RZERO, RONE, HALF PARAMETER ( RZERO = 0.0D+0, RONE = 1.0D+0, HALF = 0.5D+0) COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ), $ ONE = ( 1.0D+0, 0.0D+0 ) ) DOUBLE PRECISION CONST PARAMETER ( CONST = 1.50D+0 ) INTEGER IBLK PARAMETER ( IBLK = 32 ) * .. * .. Local Scalars .. LOGICAL SKIP INTEGER CONTXT, DOWN, HBL, I, I1, I2, IAFIRST, $ IBULGE, ICBUF, ICOL, ICOL1, ICOL2, IDIA, $ IERR, II, IRBUF, IROW, IROW1, IROW2, ISPEC, $ ISTART, ISTARTCOL, ISTARTROW, ISTOP, ISUB, $ ISUP, ITERMAX, ITMP1, ITMP2, ITN, ITS, $ IZBUF, J, JAFIRST, JBLK, JJ, K, KI, L, LCMRC, $ LDA, LDZ, LEFT, LIHIH, LIHIZ, LILOH, LILOZ, $ LOCALI1, LOCALI2, LOCALK, LOCALM, M, MODKM1, $ MYCOL, MYROW, NBULGE, NH, NODE, NPCOL, $ NPROW, NQ, NR, NUM, NZ, RIGHT, ROTN, UP, $ VECSIDX DOUBLE PRECISION CS, OVFL, S, SMLNUM, ULP, UNFL COMPLEX*16 CDUM, H00, H10, H11, H12, H21, H22, H33, H44, $ H43H34, SN, SUM, T1, T1COPY, T2, T3, V1SAVE, $ V2, V2SAVE, V3, V3SAVE * .. * .. Local Arrays .. INTEGER ICURCOL( IBLK ), ICURROW( IBLK ), K1( IBLK ), $ K2( IBLK ), KCOL( IBLK ), KP2COL( IBLK ), $ KP2ROW( IBLK ), KROW( IBLK ) COMPLEX*16 S1( 2*IBLK, 2*IBLK ), SMALLA( 6, 6, IBLK ), $ VCOPY( 3 ) * .. * .. External Functions .. INTEGER ILCM, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL ILCM, NUMROC, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGAMN2D, IGEBR2D, IGEBS2D, $ INFOG1L, INFOG2L, PDLABAD, PXERBLA, PZLACONSB, $ PZLACP3, PZLASMSUB, PZLAWIL, ZCOPY, ZGEBR2D, $ ZGEBS2D, ZGERV2D, ZGESD2D, ZGSUM2D, ZLAHQR, $ ZLAMSH, ZLAREF, ZLARFG * .. * .. Intrinsic Functions .. * INTRINSIC ABS, DBLE, DCONJG, DIMAG, MAX, MIN, MOD, SQRT * .. * .. Statement Functions .. DOUBLE PRECISION CABS1 * .. * .. Statement Function definitions .. CABS1( CDUM ) = ABS( DBLE( CDUM ) ) + ABS( DIMAG( CDUM ) ) * .. * .. Executable Statements .. * INFO = 0 * ITERMAX = 30*( IHI-ILO+1 ) IF( N.EQ.0 ) $ RETURN * * NODE (IAFIRST,JAFIRST) OWNS A(1,1) * HBL = DESCA( MB_ ) CONTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) IAFIRST = DESCA( RSRC_ ) JAFIRST = DESCA( CSRC_ ) LDZ = DESCZ( LLD_ ) CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) NODE = MYROW*NPCOL + MYCOL NUM = NPROW*NPCOL LEFT = MOD( MYCOL+NPCOL-1, NPCOL ) RIGHT = MOD( MYCOL+1, NPCOL ) UP = MOD( MYROW+NPROW-1, NPROW ) DOWN = MOD( MYROW+1, NPROW ) LCMRC = ILCM( NPROW, NPCOL ) IF( ( NPROW.LE.3 ) .OR. ( NPCOL.LE.3 ) ) THEN SKIP = .TRUE. ELSE SKIP = .FALSE. END IF * * Determine the number of columns we have so we can check workspace * NQ = NUMROC( N, HBL, MYCOL, JAFIRST, NPCOL ) JJ = N / HBL IF( JJ*HBL.LT.N ) $ JJ = JJ + 1 JJ = 7*JJ / LCMRC JJ = 3*N+MAX( 2*MAX( LDA, LDZ )+2*NQ, JJ ) JJ = JJ + MAX( 2*N, (8*LCMRC+2)**2 ) IF ( LWORK .EQ. -1 ) THEN WORK(1) = JJ RETURN END IF IF ( LWORK .LT. JJ ) THEN INFO = -14 END IF IF( DESCZ( CTXT_ ).NE.DESCA( CTXT_ ) ) THEN INFO = -( 1300+CTXT_ ) END IF IF( DESCA( MB_ ).NE.DESCA( NB_ ) ) THEN INFO = -( 700+NB_ ) END IF IF( DESCZ( MB_ ).NE.DESCZ( NB_ ) ) THEN INFO = -( 1300+NB_ ) END IF IF( DESCA( MB_ ).NE.DESCZ( MB_ ) ) THEN INFO = -( 1300+MB_ ) END IF IF( ( DESCA( RSRC_ ).NE.0 ) .OR. ( DESCA( CSRC_ ).NE.0 ) ) THEN INFO = -( 700+RSRC_ ) END IF IF( ( DESCZ( RSRC_ ).NE.0 ) .OR. ( DESCZ( CSRC_ ).NE.0 ) ) THEN INFO = -( 1300+RSRC_ ) END IF IF( ( ILO.GT.N ) .OR. ( ILO.LT.1 ) ) THEN INFO = -4 END IF IF( ( IHI.GT.N ) .OR. ( IHI.LT.1 ) ) THEN INFO = -5 END IF IF( HBL.LT.5 ) THEN INFO = -( 700+MB_ ) END IF CALL IGAMN2D( CONTXT, 'ALL', ' ', 1, 1, INFO, 1, ITMP1, ITMP2, -1, $ -1, -1 ) IF( INFO.LT.0 ) THEN CALL PXERBLA( CONTXT, 'PZLAHQR', -INFO ) RETURN END IF * * Set work array indices * VECSIDX = 0 IDIA = 3*N ISUB = 3*N ISUP = 3*N IRBUF = 3*N ICBUF = 3*N IZBUF = 5*N * * Find a value for ROTN * ROTN = HBL / 3 ROTN = MIN( ROTN, HBL-2 ) ROTN = MAX( ROTN, 1 ) * IF( ILO.EQ.IHI ) THEN CALL INFOG2L( ILO, ILO, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, II, JJ ) IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN W( ILO ) = A( ( ICOL-1 )*LDA+IROW ) ELSE W( ILO ) = ZERO END IF RETURN END IF * NH = IHI - ILO + 1 NZ = IHIZ - ILOZ + 1 * CALL INFOG1L( ILOZ, HBL, NPROW, MYROW, IAFIRST, LILOZ, LIHIZ ) LIHIZ = NUMROC( IHIZ, HBL, MYROW, IAFIRST, NPROW ) * * Set machine-dependent constants for the stopping criterion. * If NORM(H) <= SQRT(OVFL), overflow should not occur. * UNFL = PDLAMCH( CONTXT, 'SAFE MINIMUM' ) OVFL = RONE / UNFL CALL PDLABAD( CONTXT, UNFL, OVFL ) ULP = PDLAMCH( CONTXT, 'PRECISION' ) SMLNUM = UNFL*( NH / ULP ) * * I1 and I2 are the indices of the first row and last column of H * to which transformations must be applied. If eigenvalues only are * being computed, I1 and I2 are set inside the main loop. * IF( WANTT ) THEN I1 = 1 I2 = N END IF * * ITN is the total number of QR iterations allowed. * ITN = ITERMAX * * The main loop begins here. I is the loop index and decreases from * IHI to ILO in steps of our schur block size (<=2*IBLK). Each * iteration of the loop works with the active submatrix in rows * and columns L to I. Eigenvalues I+1 to IHI have already * converged. Either L = ILO or the global A(L,L-1) is negligible * so that the matrix splits. * I = IHI 10 CONTINUE L = ILO IF( I.LT.ILO ) $ GO TO 670 * * Perform QR iterations on rows and columns ILO to I until a * submatrix of order 1 or 2 splits off at the bottom because a * subdiagonal element has become negligible. * DO 640 ITS = 0, ITN * * Look for a single small subdiagonal element. * CALL PZLASMSUB( A, DESCA, I, L, K, SMLNUM, WORK( IRBUF+1 ), $ LWORK-IRBUF ) L = K * IF( L.GT.ILO ) THEN * * H(L,L-1) is negligible * CALL INFOG2L( L, L-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN A( ( ICOL-1 )*LDA+IROW ) = ZERO END IF WORK( ISUB+L-1 ) = ZERO END IF * * Exit from loop if a submatrix of order 1 or 2 has split off. * IF ( WANTT ) THEN * For Schur form, use 2x2 blocks IF ( L .GE. I-1 ) THEN GO TO 650 END IF ELSE * If we don't want the Schur form, use bigger blocks. IF ( L .GE. I-( 2*IBLK-1) ) THEN GO TO 650 END IF END IF * * Now the active submatrix is in rows and columns L to I. If * eigenvalues only are being computed, only the active submatrix * need be transformed. * IF( .NOT.WANTT ) THEN I1 = L I2 = I END IF * * Copy submatrix of size 2*JBLK and prepare to do generalized * Wilkinson shift or an exceptional shift * JBLK = MIN( IBLK, ( ( I-L+1 ) / 2 )-1 ) IF( JBLK.GT.LCMRC ) THEN * * Make sure it's divisible by LCM (we want even workloads!) * JBLK = JBLK - MOD( JBLK, LCMRC ) END IF JBLK = MIN( JBLK, 2*LCMRC ) JBLK = MAX( JBLK, 1 ) * CALL PZLACP3( 2*JBLK, I-2*JBLK+1, A, DESCA, S1, 2*IBLK, -1, -1, $ 0 ) IF( (ITS.EQ.20 .OR. ITS.EQ.40) .AND. (JBLK.GT.1) ) THEN * * Exceptional shift. * DO 20 II = 2*JBLK, 1, -1 S1( II, II ) = CONST*( CABS1( S1( II, II ) )+ $ CABS1( S1( II, II-1 ) ) ) S1( II, II-1 ) = ZERO S1( II-1, II ) = ZERO 20 CONTINUE ELSE CALL ZLAHQR( .FALSE., .FALSE., 2*JBLK, 1, 2*JBLK, S1, 2* $ IBLK, WORK( IRBUF+1 ), 1, 2*JBLK, Z, LDZ, IERR ) * * Prepare to use Wilkinson's double shift * H44 = S1( 2*JBLK, 2*JBLK ) H33 = S1( 2*JBLK-1, 2*JBLK-1 ) H43H34 = S1( 2*JBLK-1, 2*JBLK )*S1( 2*JBLK, 2*JBLK-1 ) * END IF * * Look for two consecutive small subdiagonal elements: * PZLACONSB is the routine that does this. * CALL PZLACONSB( A, DESCA, I, L, M, H44, H33, H43H34, $ WORK( IRBUF+1 ), LWORK-IRBUF ) * * Double-shift QR step * * NBULGE is the number of bulges that will be attempted * ISTOP = MIN( M+ROTN-1-MOD( M-( M / HBL )*HBL-1, ROTN ), I-2 ) ISTOP = MIN( ISTOP, M+HBL-3-MOD( M-1, HBL ) ) ISTOP = MIN( ISTOP, I2-2 ) ISTOP = MAX( ISTOP, M ) NBULGE = ( I-1-ISTOP ) / HBL * * Do not exceed maximum determined. * NBULGE = MIN( NBULGE, JBLK ) IF( NBULGE.GT.LCMRC ) THEN * * Make sure it's divisible by LCM (we want even workloads!) * NBULGE = NBULGE - MOD( NBULGE, LCMRC ) END IF NBULGE = MAX( NBULGE, 1 ) * * If we are starting in the middle because of consecutive small * subdiagonal elements, we need to see how many bulges we * can send through without breaking the consecutive small * subdiagonal property. * IF( ( NBULGE.GT.1 ) .AND. ( M.GT.L ) ) THEN * * Copy a chunk of elements from global A(M-1:,M-1:) * CALL INFOG2L( M+2, M+2, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ITMP1, ITMP2 ) II = MIN( 4*NBULGE+2, N-M+2 ) CALL PZLACP3( II, M-1, A, DESCA, WORK( IRBUF+1 ), II, ITMP1, $ ITMP2, 0 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN * * Find a new NBULGE based on the bulges we have. * CALL ZLAMSH( S1, 2*IBLK, NBULGE, JBLK, WORK( IRBUF+1 ), $ II, II, ULP ) IF( NUM.GT.1 ) THEN CALL IGEBS2D( CONTXT, 'ALL', ' ', 1, 1, NBULGE, 1 ) END IF ELSE * * Everyone needs to receive the new NBULGE * CALL IGEBR2D( CONTXT, 'ALL', ' ', 1, 1, NBULGE, 1, ITMP1, $ ITMP2 ) END IF END IF * * IBULGE is the number of bulges going so far * IBULGE = 1 * * "A" row defs : main row transforms from LOCALK to LOCALI2 * CALL INFOG1L( M, HBL, NPCOL, MYCOL, JAFIRST, ITMP1, LOCALK ) LOCALK = NQ CALL INFOG1L( 1, HBL, NPCOL, MYCOL, JAFIRST, ICOL1, LOCALI2 ) LOCALI2 = NUMROC( I2, HBL, MYCOL, JAFIRST, NPCOL ) * * "A" col defs : main col transforms from LOCALI1 to LOCALM * CALL INFOG1L( I1, HBL, NPROW, MYROW, IAFIRST, LOCALI1, $ ICOL1 ) CALL INFOG1L( 1, HBL, NPROW, MYROW, IAFIRST, LOCALM, ICOL1 ) ICOL1 = NUMROC( MIN( M+3, I ), HBL, MYROW, IAFIRST, NPROW ) * * Which row & column will start the bulges * ISTARTROW = MOD( ( M+1 ) / HBL, NPROW ) + IAFIRST ISTARTCOL = MOD( ( M+1 ) / HBL, NPCOL ) + JAFIRST * CALL INFOG1L( M, HBL, NPROW, MYROW, IAFIRST, II, ITMP2 ) CALL INFOG1L( M, HBL, NPCOL, MYCOL, JAFIRST, JJ, ITMP2 ) CALL INFOG1L( 1, HBL, NPROW, MYROW, IAFIRST, ISTOP, $ KP2ROW( 1 ) ) KP2ROW( 1 ) = NUMROC( M+2, HBL, MYROW, IAFIRST, NPROW ) CALL INFOG1L( 1, HBL, NPCOL, MYCOL, JAFIRST, ISTOP, $ KP2COL( 1 ) ) KP2COL( 1 ) = NUMROC( M+2, HBL, MYCOL, JAFIRST, NPCOL ) * * Set all values for bulges. All bulges are stored in * intermediate steps as loops over KI. Their current "task" * over the global M to I-1 values is always K1(KI) to K2(KI). * However, because there are many bulges, K1(KI) & K2(KI) might * go past that range while later bulges (KI+1,KI+2,etc..) are * finishing up. Even if ROTN=1, in order to minimize border * communication sometimes K1(KI)=HBL-2 & K2(KI)=HBL-1 so both * border messages can be handled at once. * * Rules: * If MOD(K1(KI)-1,HBL) < HBL-2 then MOD(K2(KI)-1,HBL) 'pzlasmsub.f' SUBROUTINE PZLASMSUB( A, DESCA, I, L, K, SMLNUM, BUF, LWORK ) * * Mark R. Fahey * May 28, 1999 * * .. Scalar Arguments .. INTEGER I, K, L, LWORK DOUBLE PRECISION SMLNUM * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), BUF( * ) * .. * * Purpose * ======= * * PZLASMSUB looks for a small subdiagonal element from the bottom * of the matrix that it can safely set to zero. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * A (global input) COMPLEX*16 array, dimension (DESCA(LLD_),*) * On entry, the Hessenberg matrix whose tridiagonal part is * being scanned. * Unchanged on exit. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * I (global input) INTEGER * The global location of the bottom of the unreduced * submatrix of A. * Unchanged on exit. * * L (global input) INTEGER * The global location of the top of the unreduced submatrix * of A. * Unchanged on exit. * * K (global output) INTEGER * On exit, this yields the bottom portion of the unreduced * submatrix. This will satisfy: L <= M <= I-1. * * SMLNUM (global input) DOUBLE PRECISION * On entry, a "small number" for the given matrix. * Unchanged on exit. * * BUF (local output) COMPLEX*16 array of size LWORK. * * LWORK (global input) INTEGER * On exit, LWORK is the size of the work buffer. * This must be at least 2*Ceil( Ceil( (I-L)/HBL ) / * LCM(NPROW,NPCOL) ) * Here LCM is least common multiple, and NPROWxNPCOL is the * logical grid size. * * Notes: * * This routine does a global maximum and must be called by all * processes. * * This code is basically a parallelization of the following snip * of LAPACK code from ZLAHQR: * * Look for a single small subdiagonal element. * * DO 20 K = I, L + 1, -1 * TST1 = CABS1( H( K-1, K-1 ) ) + CABS1( H( K, K ) ) * IF( TST1.EQ.ZERO ) * $ TST1 = ZLANHS( '1', I-L+1, H( L, L ), LDH, WORK ) * IF( CABS1( H( K, K-1 ) ).LE.MAX( ULP*TST1, SMLNUM ) ) * $ GO TO 30 * 20 CONTINUE * 30 CONTINUE * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO PARAMETER ( ZERO = 0.0D+0 ) * .. * .. Local Scalars .. INTEGER CONTXT, DOWN, HBL, IBUF1, IBUF2, ICOL1, ICOL2, $ II, III, IRCV1, IRCV2, IROW1, IROW2, ISRC, $ ISTR1, ISTR2, ITMP1, ITMP2, JJ, JJJ, JSRC, LDA, $ LEFT, MODKM1, MYCOL, MYROW, NPCOL, NPROW, NUM, $ RIGHT, UP DOUBLE PRECISION TST1, ULP COMPLEX*16 CDUM, H10, H11, H22 * .. * .. External Functions .. INTEGER ILCM, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL ILCM, NUMROC, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGAMX2D, INFOG1L, INFOG2L, $ ZGERV2D, ZGESD2D * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DIMAG, MAX, MOD * .. * .. Statement Functions .. DOUBLE PRECISION CABS1 * .. * .. Statement Function definitions .. CABS1( CDUM ) = ABS( DBLE( CDUM ) ) + ABS( DIMAG( CDUM ) ) * .. * .. Executable Statements .. * HBL = DESCA( MB_ ) CONTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) ULP = PDLAMCH( CONTXT, 'PRECISION' ) CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) LEFT = MOD( MYCOL+NPCOL-1, NPCOL ) RIGHT = MOD( MYCOL+1, NPCOL ) UP = MOD( MYROW+NPROW-1, NPROW ) DOWN = MOD( MYROW+1, NPROW ) NUM = NPROW*NPCOL * * BUFFER1 STARTS AT BUF(ISTR1+1) AND WILL CONTAINS IBUF1 ELEMENTS * BUFFER2 STARTS AT BUF(ISTR2+1) AND WILL CONTAINS IBUF2 ELEMENTS * ISTR1 = 0 ISTR2 = ( ( I-L ) / HBL ) IF( ISTR2*HBL.LT.( I-L ) ) $ ISTR2 = ISTR2 + 1 II = ISTR2 / ILCM( NPROW, NPCOL ) IF( II*ILCM( NPROW, NPCOL ).LT.ISTR2 ) THEN ISTR2 = II + 1 ELSE ISTR2 = II END IF IF( LWORK.LT.2*ISTR2 ) THEN * * Error! * RETURN END IF CALL INFOG2L( I, I, DESCA, NPROW, NPCOL, MYROW, MYCOL, IROW1, $ ICOL1, II, JJ ) MODKM1 = MOD( I-1+HBL, HBL ) * * COPY OUR RELEVANT PIECES OF TRIADIAGONAL THAT WE OWE INTO * 2 BUFFERS TO SEND TO WHOMEVER OWNS H(K,K) AS K MOVES DIAGONALLY * UP THE TRIDIAGONAL * IBUF1 = 0 IBUF2 = 0 IRCV1 = 0 IRCV2 = 0 DO 10 K = I, L + 1, -1 IF( ( MODKM1.EQ.0 ) .AND. ( DOWN.EQ.II ) .AND. $ ( RIGHT.EQ.JJ ) ) THEN * * WE MUST PACK H(K-1,K-1) AND SEND IT DIAGONAL DOWN * IF( ( DOWN.NE.MYROW ) .OR. ( RIGHT.NE.MYCOL ) ) THEN CALL INFOG2L( K-1, K-1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW1, ICOL1, ISRC, JSRC ) IBUF1 = IBUF1 + 1 BUF( ISTR1+IBUF1 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF IF( ( MODKM1.EQ.0 ) .AND. ( MYROW.EQ.II ) .AND. $ ( RIGHT.EQ.JJ ) ) THEN * * WE MUST PACK H(K ,K-1) AND SEND IT RIGHT * IF( NPCOL.GT.1 ) THEN CALL INFOG2L( K, K-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ISRC, JSRC ) IBUF2 = IBUF2 + 1 BUF( ISTR2+IBUF2 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF * * ADD UP THE RECEIVES * IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN IF( ( MODKM1.EQ.0 ) .AND. ( ( NPROW.GT.1 ) .OR. ( NPCOL.GT. $ 1 ) ) ) THEN * * WE MUST RECEIVE H(K-1,K-1) FROM DIAGONAL UP * IRCV1 = IRCV1 + 1 END IF IF( ( MODKM1.EQ.0 ) .AND. ( NPCOL.GT.1 ) ) THEN * * WE MUST RECEIVE H(K ,K-1) FROM LEFT * IRCV2 = IRCV2 + 1 END IF END IF * * POSSIBLY CHANGE OWNERS (OCCURS ONLY WHEN MOD(K-1,HBL) = 0) * IF( MODKM1.EQ.0 ) THEN II = II - 1 JJ = JJ - 1 IF( II.LT.0 ) $ II = NPROW - 1 IF( JJ.LT.0 ) $ JJ = NPCOL - 1 END IF MODKM1 = MODKM1 - 1 IF( MODKM1.LT.0 ) $ MODKM1 = HBL - 1 10 CONTINUE * * SEND DATA ON TO THE APPROPRIATE NODE IF THERE IS ANY DATA TO SEND * IF( IBUF1.GT.0 ) THEN CALL ZGESD2D( CONTXT, IBUF1, 1, BUF( ISTR1+1 ), IBUF1, DOWN, $ RIGHT ) END IF IF( IBUF2.GT.0 ) THEN CALL ZGESD2D( CONTXT, IBUF2, 1, BUF( ISTR2+1 ), IBUF2, MYROW, $ RIGHT ) END IF * * RECEIVE APPROPRIATE DATA IF THERE IS ANY * IF( IRCV1.GT.0 ) THEN CALL ZGERV2D( CONTXT, IRCV1, 1, BUF( ISTR1+1 ), IRCV1, UP, $ LEFT ) END IF IF( IRCV2.GT.0 ) THEN CALL ZGERV2D( CONTXT, IRCV2, 1, BUF( ISTR2+1 ), IRCV2, MYROW, $ LEFT ) END IF * * START MAIN LOOP * IBUF1 = 0 IBUF2 = 0 CALL INFOG2L( I, I, DESCA, NPROW, NPCOL, MYROW, MYCOL, IROW1, $ ICOL1, II, JJ ) MODKM1 = MOD( I-1+HBL, HBL ) * * LOOK FOR A SINGLE SMALL SUBDIAGONAL ELEMENT. * * Start loop for subdiagonal search * DO 40 K = I, L + 1, -1 IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN IF( MODKM1.EQ.0 ) THEN * * Grab information from WORK array * IF( NUM.GT.1 ) THEN IBUF1 = IBUF1 + 1 H11 = BUF( ISTR1+IBUF1 ) ELSE H11 = A( ( ICOL1-2 )*LDA+IROW1-1 ) END IF IF( NPCOL.GT.1 ) THEN IBUF2 = IBUF2 + 1 H10 = BUF( ISTR2+IBUF2 ) ELSE H10 = A( ( ICOL1-2 )*LDA+IROW1 ) END IF ELSE * * Information is local * H11 = A( ( ICOL1-2 )*LDA+IROW1-1 ) H10 = A( ( ICOL1-2 )*LDA+IROW1 ) END IF H22 = A( ( ICOL1-1 )*LDA+IROW1 ) TST1 = CABS1( H11 ) + CABS1( H22 ) IF( TST1.EQ.ZERO ) THEN * * FIND SOME NORM OF THE LOCAL H(L:I,L:I) * CALL INFOG1L( L, HBL, NPROW, MYROW, 0, IROW1, III ) IROW2 = NUMROC( I, HBL, MYROW, 0, NPROW ) CALL INFOG1L( L, HBL, NPCOL, MYCOL, 0, ICOL1, III ) ICOL2 = NUMROC( I, HBL, MYCOL, 0, NPCOL ) DO 30 III = IROW1, IROW2 DO 20 JJJ = ICOL1, ICOL2 TST1 = TST1 + CABS1( A( ( JJJ-1 )*LDA+III ) ) 20 CONTINUE 30 CONTINUE END IF IF( CABS1( H10 ).LE.MAX( ULP*TST1, SMLNUM ) ) $ GO TO 50 IROW1 = IROW1 - 1 ICOL1 = ICOL1 - 1 END IF MODKM1 = MODKM1 - 1 IF( MODKM1.LT.0 ) $ MODKM1 = HBL - 1 IF( ( MODKM1.EQ.HBL-1 ) .AND. ( K.GT.2 ) ) THEN II = MOD( II+NPROW-1, NPROW ) JJ = MOD( JJ+NPCOL-1, NPCOL ) CALL INFOG2L( K-1, K-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ITMP1, ITMP2 ) END IF 40 CONTINUE 50 CONTINUE CALL IGAMX2D( CONTXT, 'ALL', ' ', 1, 1, K, 1, ITMP1, ITMP2, -1, $ -1, -1 ) RETURN * * End of PZLASMSUB * END SHAR_EOF fi # end of overwriting check if test -f 'pzlatrs.f' then echo shar: will not over-write existing file "'pzlatrs.f'" else cat << "SHAR_EOF" > 'pzlatrs.f' SUBROUTINE PZLATRS( UPLO, TRANS, DIAG, NORMIN, N, A, IA, JA, $ DESCA, X, IX, JX, DESCX, SCALE, CNORM, INFO ) * * Mark R. Fahey * August 2000 * This is very slow relative to PZTRSV. This should only be used * when scaling is necessary to control overflow, or when it is modified * to scale better. * * .. Scalar Arguments .. CHARACTER DIAG, NORMIN, TRANS, UPLO INTEGER IA, INFO, IX, JA, JX, N DOUBLE PRECISION SCALE * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCX( * ) DOUBLE PRECISION CNORM( * ) COMPLEX*16 A( * ), X( * ) * .. * * Purpose * ======= * * PZLATRS solves one of the triangular systems * * A * x = s*b, A**T * x = s*b, or A**H * x = s*b, * * with scaling to prevent overflow. Here A is an upper or lower * triangular matrix, A**T denotes the transpose of A, A**H denotes the * conjugate transpose of A, x and b are n-element vectors, and s is a * scaling factor, usually less than or equal to 1, chosen so that the * components of x will be less than the overflow threshold. If the * unscaled problem will not cause overflow, the Level 2 PBLAS routine * PZTRSV is called. If the matrix A is singular (A(j,j) = 0 for some j), * then s is set to 0 and a non-trivial solution to A*x = 0 is returned. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension r x c. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the r processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the c processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * UPLO (global input) CHARACTER*1 * Specifies whether the matrix A is upper or lower triangular. * = 'U': Upper triangular * = 'L': Lower triangular * * TRANS (global input) CHARACTER*1 * Specifies the operation applied to A. * = 'N': Solve A * x = s*b (No transpose) * = 'T': Solve A**T * x = s*b (Transpose) * = 'C': Solve A**H * x = s*b (Conjugate transpose) * * DIAG (global input) CHARACTER*1 * Specifies whether or not the matrix A is unit triangular. * = 'N': Non-unit triangular * = 'U': Unit triangular * * NORMIN (global input) CHARACTER*1 * Specifies whether CNORM has been set or not. * = 'Y': CNORM contains the column norms on entry * = 'N': CNORM is not set on entry. On exit, the norms will * be computed and stored in CNORM. * * N (global input) INTEGER * The order of the matrix A. N >= 0. * * A (local input) COMPLEX*16 array, dimension (DESCA(LLD_),*) * The triangular matrix A. If UPLO = 'U', the leading n by n * upper triangular part of the array A contains the upper * triangular matrix, and the strictly lower triangular part of * A is not referenced. If UPLO = 'L', the leading n by n lower * triangular part of the array A contains the lower triangular * matrix, and the strictly upper triangular part of A is not * referenced. If DIAG = 'U', the diagonal elements of A are * also not referenced and are assumed to be 1. * * IA (global input) pointer to INTEGER * The global row index of the submatrix of the distributed * matrix A to operate on. * * JA (global input) pointer to INTEGER * The global column index of the submatrix of the distributed * matrix A to operate on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * X (local input/output) COMPLEX*16 array, * dimension (DESCX(LLD_),*) * On entry, the right hand side b of the triangular system. * On exit, X is overwritten by the solution vector x. * * IX (global input) pointer to INTEGER * The global row index of the submatrix of the distributed * matrix X to operate on. * * JX (global input) pointer to INTEGER * The global column index of the submatrix of the distributed * matrix X to operate on. * * DESCX (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix X. * * SCALE (global output) DOUBLE PRECISION * The scaling factor s for the triangular system * A * x = s*b, A**T * x = s*b, or A**H * x = s*b. * If SCALE = 0, the matrix A is singular or badly scaled, and * the vector x is an exact or approximate solution to A*x = 0. * * CNORM (global input or global output) DOUBLE PRECISION array, * dimension (N) * If NORMIN = 'Y', CNORM is an input argument and CNORM(j) * contains the norm of the off-diagonal part of the j-th column * of A. If TRANS = 'N', CNORM(j) must be greater than or equal * to the infinity-norm, and if TRANS = 'T' or 'C', CNORM(j) * must be greater than or equal to the 1-norm. * * If NORMIN = 'N', CNORM is an output argument and CNORM(j) * returns the 1-norm of the offdiagonal part of the j-th column * of A. * * INFO (global output) INTEGER * = 0: successful exit * < 0: if INFO = -k, the k-th argument had an illegal value * * Further Details * ======= ======= * * A rough bound on x is computed; if that is less than overflow, PZTRSV * is called, otherwise, specific code is used which checks for possible * overflow or divide-by-zero at every operation. * * A columnwise scheme is used for solving A*x = b. The basic algorithm * if A is lower triangular is * * x[1:n] := b[1:n] * for j = 1, ..., n * x(j) := x(j) / A(j,j) * x[j+1:n] := x[j+1:n] - x(j) * A[j+1:n,j] * end * * Define bounds on the components of x after j iterations of the loop: * M(j) = bound on x[1:j] * G(j) = bound on x[j+1:n] * Initially, let M(0) = 0 and G(0) = max{x(i), i=1,...,n}. * * Then for iteration j+1 we have * M(j+1) <= G(j) / | A(j+1,j+1) | * G(j+1) <= G(j) + M(j+1) * | A[j+2:n,j+1] | * <= G(j) ( 1 + CNORM(j+1) / | A(j+1,j+1) | ) * * where CNORM(j+1) is greater than or equal to the infinity-norm of * column j+1 of A, not counting the diagonal. Hence * * G(j) <= G(0) product ( 1 + CNORM(i) / | A(i,i) | ) * 1<=i<=j * and * * |x(j)| <= ( G(0) / |A(j,j)| ) product ( 1 + CNORM(i) / |A(i,i)| ) * 1<=i< j * * Since |x(j)| <= M(j), we use the Level 2 PBLAS routine PZTRSV if the * reciprocal of the largest M(j), j=1,..,n, is larger than * max(underflow, 1/overflow). * * The bound on x(j) is also used to determine when a step in the * columnwise method can be performed without fear of overflow. If * the computed bound is greater than a large constant, x is scaled to * prevent overflow, but if the bound overflows, x is set to 0, x(j) to * 1, and scale to 0, and a non-trivial solution to A*x = 0 is found. * * Similarly, a row-wise scheme is used to solve A**T *x = b or * A**H *x = b. The basic algorithm for A upper triangular is * * for j = 1, ..., n * x(j) := ( b(j) - A[1:j-1,j]' * x[1:j-1] ) / A(j,j) * end * * We simultaneously compute two bounds * G(j) = bound on ( b(i) - A[1:i-1,i]' * x[1:i-1] ), 1<=i<=j * M(j) = bound on x(i), 1<=i<=j * * The initial values are G(0) = 0, M(0) = max{b(i), i=1,..,n}, and we * add the constraint G(j) >= G(j-1) and M(j) >= M(j-1) for j >= 1. * Then the bound on x(j) is * * M(j) <= M(j-1) * ( 1 + CNORM(j) ) / | A(j,j) | * * <= M(0) * product ( ( 1 + CNORM(i) ) / |A(i,i)| ) * 1<=i<=j * * and we can safely call PZTRSV if 1/M(n) and 1/G(n) are both greater * than max(underflow, 1/overflow). * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ZERO, HALF, ONE, TWO PARAMETER ( ZERO = 0.0D+0, HALF = 0.5D+0, ONE = 1.0D+0, $ TWO = 2.0D+0 ) COMPLEX*16 CZERO, CONE PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ), $ CONE = ( 1.0D+0, 0.0D+0 ) ) INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL NOTRAN, NOUNIT, UPPER INTEGER CONTXT, CSRC, I, ICOL, ICOLX, IMAX, IROW, IROWX, $ ITMP1, ITMP1X, ITMP2, ITMP2X, J, JFIRST, JINC, $ JLAST, LDA, LDX, MB, MYROW, MYCOL, NB, NPCOL, $ NPROW, RSRC DOUBLE PRECISION BIGNUM, GROW, REC, SMLNUM, TJJ, TMAX, TSCAL, $ XBND, XJ, XMAX COMPLEX*16 CSUMJ, TJJS, USCAL, XJTMP, ZDUM * .. * .. External Functions .. LOGICAL LSAME INTEGER IDAMAX DOUBLE PRECISION PDLAMCH, PDZASUM COMPLEX*16 ZLADIV EXTERNAL IDAMAX, LSAME, PDLAMCH, PDZASUM, ZLADIV * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DSCAL, DGSUM2D, INFOG2L, $ PDLABAD, PXERBLA, PZAMAX, PZAXPY, PZDOTC, $ PZDOTU, PZDSCAL, PZLASET, PZSCAL, PZTRSV, $ ZGEBS2D, ZGEBR2D * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCMPLX, DCONJG, DIMAG, MAX, MIN * .. * .. Statement Functions .. DOUBLE PRECISION CABS1, CABS2 * .. * .. Statement Function definitions .. CABS1( ZDUM ) = ABS( DBLE( ZDUM ) ) + ABS( DIMAG( ZDUM ) ) CABS2( ZDUM ) = ABS( DBLE( ZDUM ) / 2.D0 ) + $ ABS( DIMAG( ZDUM ) / 2.D0 ) * .. * .. Executable Statements .. * INFO = 0 UPPER = LSAME( UPLO, 'U' ) NOTRAN = LSAME( TRANS, 'N' ) NOUNIT = LSAME( DIAG, 'N' ) * CONTXT = DESCA( CTXT_ ) RSRC = DESCA( RSRC_ ) CSRC = DESCA( CSRC_ ) MB = DESCA( MB_ ) NB = DESCA( NB_ ) LDA = DESCA( LLD_ ) LDX = DESCX( LLD_ ) * * Test the input parameters. * IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN INFO = -1 ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT. $ LSAME( TRANS, 'C' ) ) THEN INFO = -2 ELSE IF( .NOT.NOUNIT .AND. .NOT.LSAME( DIAG, 'U' ) ) THEN INFO = -3 ELSE IF( .NOT.LSAME( NORMIN, 'Y' ) .AND. .NOT. $ LSAME( NORMIN, 'N' ) ) THEN INFO = -4 ELSE IF( N.LT.0 ) THEN INFO = -5 END IF * CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( INFO.NE.0 ) THEN CALL PXERBLA( CONTXT, 'PZLATRS', -INFO ) RETURN END IF * * Quick return if possible * IF( N.EQ.0 ) $ RETURN * * Determine machine dependent parameters to control overflow. * SMLNUM = PDLAMCH( CONTXT, 'Safe minimum' ) BIGNUM = ONE / SMLNUM CALL PDLABAD( CONTXT, SMLNUM, BIGNUM ) SMLNUM = SMLNUM / PDLAMCH( CONTXT, 'Precision' ) BIGNUM = ONE / SMLNUM SCALE = ONE * IF( LSAME( NORMIN, 'N' ) ) THEN * * Compute the 1-norm of each column, not including the diagonal. * IF( UPPER ) THEN * * A is upper triangular. * CNORM( 1 ) = ZERO DO 10 J = 2, N CNORM( J ) = PDZASUM( J-1, A, IA, JA+J-1, DESCA, 1 ) 10 CONTINUE ELSE * * A is lower triangular. * DO 20 J = 1, N - 1 CNORM( J ) = PDZASUM( N-J, A, IA+J, JA+J-1, DESCA, 1 ) 20 CONTINUE CNORM( N ) = ZERO END IF CALL DGSUM2D( CONTXT, 'Row', ' ', N, 1, CNORM, 1, -1, -1 ) END IF * * Scale the column norms by TSCAL if the maximum element in CNORM is * greater than BIGNUM/2. * IMAX = IDAMAX( N, CNORM, 1 ) TMAX = CNORM( IMAX ) IF( TMAX.LE.BIGNUM*HALF ) THEN TSCAL = ONE ELSE TSCAL = HALF / ( SMLNUM*TMAX ) CALL DSCAL( N, TSCAL, CNORM, 1 ) END IF * * Compute a bound on the computed solution vector to see if the * Level 2 PBLAS routine PZTRSV can be used. * XMAX = ZERO CALL PZAMAX( N, ZDUM, IMAX, X, IX, JX, DESCX, 1 ) XMAX = CABS2( ZDUM ) CALL DGSUM2D( CONTXT, 'Row', ' ', 1, 1, XMAX, 1, -1, -1 ) XBND = XMAX * IF( NOTRAN ) THEN * * Compute the growth in A * x = b. * IF( UPPER ) THEN JFIRST = N JLAST = 1 JINC = -1 ELSE JFIRST = 1 JLAST = N JINC = 1 END IF * IF( TSCAL.NE.ONE ) THEN GROW = ZERO GO TO 60 END IF * IF( NOUNIT ) THEN * * A is non-unit triangular. * * Compute GROW = 1/G(j) and XBND = 1/M(j). * Initially, G(0) = max{x(i), i=1,...,n}. * GROW = HALF / MAX( XBND, SMLNUM ) XBND = GROW DO 40 J = JFIRST, JLAST, JINC * * Exit the loop if the growth factor is too small. * IF( GROW.LE.SMLNUM ) $ GO TO 60 * C TJJS = A( J, J ) CALL INFOG2L( IA+J-1, JA+J-1, DESCA, NPROW, NPCOL, $ MYROW, MYCOL, IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN TJJS = A( (ICOL-1)*LDA+IROW ) CALL ZGEBS2D( CONTXT, 'All', ' ', 1, 1, TJJS, 1 ) ELSE CALL ZGEBR2D( CONTXT, 'All', ' ', 1, 1, TJJS, 1, $ ITMP1, ITMP2 ) END IF TJJ = CABS1( TJJS ) * IF( TJJ.GE.SMLNUM ) THEN * * M(j) = G(j-1) / abs(A(j,j)) * XBND = MIN( XBND, MIN( ONE, TJJ )*GROW ) ELSE * * M(j) could overflow, set XBND to 0. * XBND = ZERO END IF * IF( TJJ+CNORM( J ).GE.SMLNUM ) THEN * * G(j) = G(j-1)*( 1 + CNORM(j) / abs(A(j,j)) ) * GROW = GROW*( TJJ / ( TJJ+CNORM( J ) ) ) ELSE * * G(j) could overflow, set GROW to 0. * GROW = ZERO END IF 40 CONTINUE GROW = XBND ELSE * * A is unit triangular. * * Compute GROW = 1/G(j), where G(0) = max{x(i), i=1,...,n}. * GROW = MIN( ONE, HALF / MAX( XBND, SMLNUM ) ) DO 50 J = JFIRST, JLAST, JINC * * Exit the loop if the growth factor is too small. * IF( GROW.LE.SMLNUM ) $ GO TO 60 * * G(j) = G(j-1)*( 1 + CNORM(j) ) * GROW = GROW*( ONE / ( ONE+CNORM( J ) ) ) 50 CONTINUE END IF 60 CONTINUE * ELSE * * Compute the growth in A**T * x = b or A**H * x = b. * IF( UPPER ) THEN JFIRST = 1 JLAST = N JINC = 1 ELSE JFIRST = N JLAST = 1 JINC = -1 END IF * IF( TSCAL.NE.ONE ) THEN GROW = ZERO GO TO 90 END IF * IF( NOUNIT ) THEN * * A is non-unit triangular. * * Compute GROW = 1/G(j) and XBND = 1/M(j). * Initially, M(0) = max{x(i), i=1,...,n}. * GROW = HALF / MAX( XBND, SMLNUM ) XBND = GROW DO 70 J = JFIRST, JLAST, JINC * * Exit the loop if the growth factor is too small. * IF( GROW.LE.SMLNUM ) $ GO TO 90 * * G(j) = max( G(j-1), M(j-1)*( 1 + CNORM(j) ) ) * XJ = ONE + CNORM( J ) GROW = MIN( GROW, XBND / XJ ) * C TJJS = A( J, J ) CALL INFOG2L( IA+J-1, JA+J-1, DESCA, NPROW, NPCOL, $ MYROW, MYCOL, IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN TJJS = A( (ICOL-1)*LDA+IROW ) CALL ZGEBS2D( CONTXT, 'All', ' ', 1, 1, TJJS, 1 ) ELSE CALL ZGEBR2D( CONTXT, 'All', ' ', 1, 1, TJJS, 1, $ ITMP1, ITMP2 ) END IF TJJ = CABS1( TJJS ) * IF( TJJ.GE.SMLNUM ) THEN * * M(j) = M(j-1)*( 1 + CNORM(j) ) / abs(A(j,j)) * IF( XJ.GT.TJJ ) $ XBND = XBND*( TJJ / XJ ) ELSE * * M(j) could overflow, set XBND to 0. * XBND = ZERO END IF 70 CONTINUE GROW = MIN( GROW, XBND ) ELSE * * A is unit triangular. * * Compute GROW = 1/G(j), where G(0) = max{x(i), i=1,...,n}. * GROW = MIN( ONE, HALF / MAX( XBND, SMLNUM ) ) DO 80 J = JFIRST, JLAST, JINC * * Exit the loop if the growth factor is too small. * IF( GROW.LE.SMLNUM ) $ GO TO 90 * * G(j) = ( 1 + CNORM(j) )*G(j-1) * XJ = ONE + CNORM( J ) GROW = GROW / XJ 80 CONTINUE END IF 90 CONTINUE END IF * IF( ( GROW*TSCAL ).GT.SMLNUM ) THEN * * Use the Level 2 PBLAS solve if the reciprocal of the bound on * elements of X is not too small. * CALL PZTRSV( UPLO, TRANS, DIAG, N, A, IA, JA, DESCA, X, IX, $ JX, DESCX, 1 ) ELSE * * Use a Level 1 PBLAS solve, scaling intermediate results. * IF( XMAX.GT.BIGNUM*HALF ) THEN * * Scale X so that its components are less than or equal to * BIGNUM in absolute value. * SCALE = ( BIGNUM*HALF ) / XMAX CALL PZDSCAL( N, SCALE, X, IX, JX, DESCX, 1 ) XMAX = BIGNUM ELSE XMAX = XMAX*TWO END IF * IF( NOTRAN ) THEN * * Solve A * x = b * DO 120 J = JFIRST, JLAST, JINC * * Compute x(j) = b(j) / A(j,j), scaling x if necessary. * C XJ = CABS1( X( J ) ) CALL INFOG2L( IX+J-1, JX, DESCX, NPROW, NPCOL, MYROW, $ MYCOL, IROWX, ICOLX, ITMP1X, ITMP2X ) IF(( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X )) THEN XJTMP = X( IROWX ) CALL ZGEBS2D( CONTXT, 'All', ' ', 1, 1, XJTMP, 1 ) ELSE CALL ZGEBR2D( CONTXT, 'All', ' ', 1, 1, XJTMP, 1, $ ITMP1X, ITMP2X ) END IF XJ = CABS1( XJTMP ) IF( NOUNIT ) THEN C TJJS = A( J, J )*TSCAL CALL INFOG2L( IA+J-1, JA+J-1, DESCA, NPROW, NPCOL, $ MYROW, MYCOL, IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) $ THEN TJJS = A( (ICOL-1)*LDA+IROW )*TSCAL CALL ZGEBS2D( CONTXT, 'All', ' ', 1, 1, TJJS, 1 ) ELSE CALL ZGEBR2D( CONTXT, 'All', ' ', 1, 1, TJJS, 1, $ ITMP1, ITMP2 ) END IF ELSE TJJS = TSCAL IF( TSCAL.EQ.ONE ) $ GO TO 110 END IF TJJ = CABS1( TJJS ) IF( TJJ.GT.SMLNUM ) THEN * * abs(A(j,j)) > SMLNUM: * IF( TJJ.LT.ONE ) THEN IF( XJ.GT.TJJ*BIGNUM ) THEN * * Scale x by 1/b(j). * REC = ONE / XJ CALL PZDSCAL( N, REC, X, IX, JX, DESCX, 1 ) XJTMP = XJTMP*REC SCALE = SCALE*REC XMAX = XMAX*REC END IF END IF C X( J ) = ZLADIV( X( J ), TJJS ) C XJ = CABS1( X( J ) ) XJTMP = ZLADIV( XJTMP, TJJS ) XJ = CABS1( XJTMP ) IF(( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X )) THEN X( IROWX ) = XJTMP END IF ELSE IF( TJJ.GT.ZERO ) THEN * * 0 < abs(A(j,j)) <= SMLNUM: * IF( XJ.GT.TJJ*BIGNUM ) THEN * * Scale x by (1/abs(x(j)))*abs(A(j,j))*BIGNUM * to avoid overflow when dividing by A(j,j). * REC = ( TJJ*BIGNUM ) / XJ IF( CNORM( J ).GT.ONE ) THEN * * Scale by 1/CNORM(j) to avoid overflow when * multiplying x(j) times column j. * REC = REC / CNORM( J ) END IF CALL PZDSCAL( N, REC, X, IX, JX, DESCX, 1 ) XJTMP = XJTMP*REC SCALE = SCALE*REC XMAX = XMAX*REC END IF C X( J ) = ZLADIV( X( J ), TJJS ) C XJ = CABS1( X( J ) ) XJTMP = ZLADIV( XJTMP, TJJS ) XJ = CABS1( XJTMP ) IF(( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X )) THEN X( IROWX ) = XJTMP END IF ELSE * * A(j,j) = 0: Set x(1:n) = 0, x(j) = 1, and * scale = 0, and compute a solution to A*x = 0. * CALL PZLASET( ' ', N, 1, CZERO, CZERO, X, IX, JX, $ DESCX ) IF(( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X )) THEN X( IROWX ) = CONE ENDIF XJTMP = CONE XJ = ONE SCALE = ZERO XMAX = ZERO END IF 110 CONTINUE * * Scale x if necessary to avoid overflow when adding a * multiple of column j of A. * IF( XJ.GT.ONE ) THEN REC = ONE / XJ IF( CNORM( J ).GT.( BIGNUM-XMAX )*REC ) THEN * * Scale x by 1/(2*abs(x(j))). * REC = REC*HALF CALL PZDSCAL( N, REC, X, IX, JX, DESCX, 1 ) XJTMP = XJTMP*REC SCALE = SCALE*REC END IF ELSE IF( XJ*CNORM( J ).GT.( BIGNUM-XMAX ) ) THEN * * Scale x by 1/2. * CALL PZDSCAL( N, HALF, X, IX, JX, DESCX, 1 ) XJTMP = XJTMP*HALF SCALE = SCALE*HALF END IF * IF( UPPER ) THEN IF( J.GT.1 ) THEN * * Compute the update * x(1:j-1) := x(1:j-1) - x(j) * A(1:j-1,j) * ZDUM = -XJTMP*TSCAL CALL PZAXPY( J-1, ZDUM, A, IA, JA+J-1, DESCA, 1, X, $ IX, JX, DESCX, 1 ) CALL PZAMAX( J-1, ZDUM, IMAX, X, IX, JX, DESCX, 1 ) XMAX = CABS1( ZDUM ) CALL DGSUM2D( CONTXT, 'Row', ' ', 1, 1, XMAX, 1, $ -1, -1 ) END IF ELSE IF( J.LT.N ) THEN * * Compute the update * x(j+1:n) := x(j+1:n) - x(j) * A(j+1:n,j) * ZDUM = -XJTMP*TSCAL CALL PZAXPY( N-J, ZDUM, A, IA+J, JA+J-1, DESCA, 1, $ X, IX+J, JX, DESCX, 1 ) CALL PZAMAX( N-J, ZDUM, I, X, IX+J, JX, DESCX, 1 ) XMAX = CABS1( ZDUM ) CALL DGSUM2D( CONTXT, 'Row', ' ', 1, 1, XMAX, 1, $ -1, -1 ) END IF END IF 120 CONTINUE * ELSE IF( LSAME( TRANS, 'T' ) ) THEN * * Solve A**T * x = b * DO 170 J = JFIRST, JLAST, JINC * * Compute x(j) = b(j) - sum A(k,j)*x(k). * k<>j * C XJ = CABS1( X( J ) ) CALL INFOG2L( IX+J-1, JX, DESCX, NPROW, NPCOL, MYROW, $ MYCOL, IROWX, ICOLX, ITMP1X, ITMP2X ) IF(( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X )) THEN XJTMP = X( IROWX ) CALL ZGEBS2D( CONTXT, 'All', ' ', 1, 1, XJTMP, 1 ) ELSE CALL ZGEBR2D( CONTXT, 'All', ' ', 1, 1, XJTMP, 1, $ ITMP1X, ITMP2X ) END IF XJ = CABS1( XJTMP ) USCAL = DCMPLX( TSCAL ) REC = ONE / MAX( XMAX, ONE ) IF( CNORM( J ).GT.( BIGNUM-XJ )*REC ) THEN * * If x(j) could overflow, scale x by 1/(2*XMAX). * REC = REC*HALF IF( NOUNIT ) THEN C TJJS = A( J, J )*TSCAL CALL INFOG2L( IA+J-1, JA+J-1, DESCA, NPROW, NPCOL, $ MYROW, MYCOL, IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) $ THEN TJJS = A( (ICOL-1)*LDA+IROW )*TSCAL CALL ZGEBS2D( CONTXT, 'All', ' ', 1, 1,TJJS,1) ELSE CALL ZGEBR2D( CONTXT, 'All', ' ', 1, 1,TJJS,1, $ ITMP1, ITMP2 ) END IF ELSE TJJS = TSCAL END IF TJJ = CABS1( TJJS ) IF( TJJ.GT.ONE ) THEN * * Divide by A(j,j) when scaling x if A(j,j) > 1. * REC = MIN( ONE, REC*TJJ ) USCAL = ZLADIV( USCAL, TJJS ) END IF IF( REC.LT.ONE ) THEN CALL PZDSCAL( N, REC, X, IX, JX, DESCX, 1 ) XJTMP = XJTMP*REC SCALE = SCALE*REC XMAX = XMAX*REC END IF END IF * CSUMJ = CZERO IF( USCAL.EQ.CONE ) THEN * * If the scaling needed for A in the dot product is 1, * call PZDOTU to perform the dot product. * IF( UPPER ) THEN CALL PZDOTU( J-1, CSUMJ, A, IA, JA+J-1, DESCA, 1, $ X, IX, JX, DESCX, 1 ) ELSE IF( J.LT.N ) THEN CALL PZDOTU( N-J, CSUMJ, A, IA+J, JA+J-1, DESCA, 1, $ X, IX+J, JX, DESCX, 1 ) END IF IF( MYCOL.EQ.ITMP2X ) THEN CALL ZGEBS2D( CONTXT, 'Row', ' ', 1, 1, CSUMJ, 1 ) ELSE CALL ZGEBR2D( CONTXT, 'Row', ' ', 1, 1, CSUMJ, 1, $ MYROW, ITMP2X ) END IF ELSE * * Otherwise, scale column of A by USCAL before dot product. * Below is not the best way to do it. * IF( UPPER ) THEN C DO 130 I = 1, J - 1 C CSUMJ = CSUMJ + ( A( I, J )*USCAL )*X( I ) C 130 CONTINUE ZDUM = DCONJG( USCAL ) CALL PZSCAL( J-1, ZDUM, A, IA, JA+J-1, DESCA, 1 ) CALL PZDOTU( J-1, CSUMJ, A, IA, JA+J-1, DESCA, 1, $ X, IX, JX, DESCX, 1 ) ZDUM = ZLADIV( ZDUM, USCAL ) CALL PZSCAL( J-1, ZDUM, A, IA, JA+J-1, DESCA, 1 ) ELSE IF( J.LT.N ) THEN C DO 140 I = J + 1, N C CSUMJ = CSUMJ + ( A( I, J )*USCAL )*X( I ) C 140 CONTINUE ZDUM = DCONJG( USCAL ) CALL PZSCAL( N-J, ZDUM, A, IA+J, JA+J-1, DESCA, 1) CALL PZDOTU( N-J, CSUMJ, A, IA+J, JA+J-1, DESCA, 1, $ X, IX+J, JX, DESCX, 1 ) ZDUM = ZLADIV( ZDUM, USCAL ) CALL PZSCAL( N-J, ZDUM, A, IA+J, JA+J-1, DESCA, 1 ) END IF IF( MYCOL.EQ.ITMP2X ) THEN CALL ZGEBS2D( CONTXT, 'Row', ' ', 1, 1, CSUMJ, 1 ) ELSE CALL ZGEBR2D( CONTXT, 'Row', ' ', 1, 1, CSUMJ, 1, $ MYROW, ITMP2X ) END IF END IF * IF( USCAL.EQ.DCMPLX( TSCAL ) ) THEN * * Compute x(j) := ( x(j) - CSUMJ ) / A(j,j) if 1/A(j,j) * was not used to scale the dotproduct. * C X( J ) = X( J ) - CSUMJ C XJ = CABS1( X( J ) ) XJTMP = XJTMP - CSUMJ XJ = CABS1( XJTMP ) C IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) C $ X( IROWX ) = XJTMP IF( NOUNIT ) THEN C TJJS = A( J, J )*TSCAL CALL INFOG2L( IA+J-1, JA+J-1, DESCA, NPROW, NPCOL, $ MYROW, MYCOL, IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) $ THEN TJJS = A( (ICOL-1)*LDA+IROW )*TSCAL CALL ZGEBS2D( CONTXT, 'All', ' ', 1, 1,TJJS, 1) ELSE CALL ZGEBR2D( CONTXT, 'All', ' ', 1, 1,TJJS, 1, $ ITMP1, ITMP2 ) END IF ELSE TJJS = TSCAL IF( TSCAL.EQ.ONE ) $ GO TO 160 END IF * * Compute x(j) = x(j) / A(j,j), scaling if necessary. * TJJ = CABS1( TJJS ) IF( TJJ.GT.SMLNUM ) THEN * * abs(A(j,j)) > SMLNUM: * IF( TJJ.LT.ONE ) THEN IF( XJ.GT.TJJ*BIGNUM ) THEN * * Scale X by 1/abs(x(j)). * REC = ONE / XJ CALL PZDSCAL( N, REC, X, IX, JX, DESCX, 1 ) XJTMP = XJTMP*REC SCALE = SCALE*REC XMAX = XMAX*REC END IF END IF C X( J ) = ZLADIV( X( J ), TJJS ) XJTMP = ZLADIV( XJTMP, TJJS ) IF(( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X )) $ THEN X( IROWX ) = XJTMP END IF ELSE IF( TJJ.GT.ZERO ) THEN * * 0 < abs(A(j,j)) <= SMLNUM: * IF( XJ.GT.TJJ*BIGNUM ) THEN * * Scale x by (1/abs(x(j)))*abs(A(j,j))*BIGNUM. * REC = ( TJJ*BIGNUM ) / XJ CALL PZDSCAL( N, REC, X, IX, JX, DESCX, 1 ) XJTMP = XJTMP*REC SCALE = SCALE*REC XMAX = XMAX*REC END IF C X( J ) = ZLADIV( X( J ), TJJS ) XJTMP = ZLADIV( XJTMP, TJJS ) IF(( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X )) $ THEN X( IROWX ) = XJTMP END IF ELSE * * A(j,j) = 0: Set x(1:n) = 0, x(j) = 1, and * scale = 0 and compute a solution to A**T *x = 0. * CALL PZLASET( ' ', N, 1, CZERO, CZERO, X, IX, JX, $ DESCX ) IF(( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X )) $ THEN X( IROWX ) = CONE ENDIF XJTMP = CONE SCALE = ZERO XMAX = ZERO END IF 160 CONTINUE ELSE * * Compute x(j) := x(j) / A(j,j) - CSUMJ if the dot * product has already been divided by 1/A(j,j). * C X( J ) = ZLADIV( X( J ), TJJS ) - CSUMJ XJTMP = ZLADIV( XJTMP, TJJS ) - CSUMJ IF(( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X )) $ THEN X( IROWX ) = XJTMP END IF END IF XMAX = MAX( XMAX, CABS1( XJTMP ) ) 170 CONTINUE * ELSE * * Solve A**H * x = b * DO 220 J = JFIRST, JLAST, JINC * * Compute x(j) = b(j) - sum A(k,j)*x(k). * k<>j * CALL INFOG2L( IX+J-1, JX, DESCX, NPROW, NPCOL, MYROW, $ MYCOL, IROWX, ICOLX, ITMP1X, ITMP2X ) IF(( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X )) THEN XJTMP = X( IROWX ) CALL ZGEBS2D( CONTXT, 'All', ' ', 1, 1, XJTMP, 1 ) ELSE CALL ZGEBR2D( CONTXT, 'All', ' ', 1, 1, XJTMP, 1, $ ITMP1X, ITMP2X ) END IF XJ = CABS1( XJTMP ) USCAL = TSCAL REC = ONE / MAX( XMAX, ONE ) IF( CNORM( J ).GT.( BIGNUM-XJ )*REC ) THEN * * If x(j) could overflow, scale x by 1/(2*XMAX). * REC = REC*HALF IF( NOUNIT ) THEN C TJJS = DCONJG( A( J, J ) )*TSCAL CALL INFOG2L( IA+J-1, JA+J-1, DESCA, NPROW, NPCOL, $ MYROW, MYCOL, IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) $ THEN TJJS = DCONJG( A( (ICOL-1)*LDA+IROW ) )*TSCAL CALL ZGEBS2D( CONTXT, 'All', ' ', 1, 1, TJJS,1) ELSE CALL ZGEBR2D( CONTXT, 'All', ' ', 1, 1, TJJS,1, $ ITMP1, ITMP2 ) END IF ELSE TJJS = TSCAL END IF TJJ = CABS1( TJJS ) IF( TJJ.GT.ONE ) THEN * * Divide by A(j,j) when scaling x if A(j,j) > 1. * REC = MIN( ONE, REC*TJJ ) USCAL = ZLADIV( USCAL, TJJS ) END IF IF( REC.LT.ONE ) THEN CALL PZDSCAL( N, REC, X, IX, JX, DESCX, 1 ) XJTMP = XJTMP*REC SCALE = SCALE*REC XMAX = XMAX*REC END IF END IF * CSUMJ = CZERO IF( USCAL.EQ.CONE ) THEN * * If the scaling needed for A in the dot product is 1, * call PZDOTC to perform the dot product. * IF( UPPER ) THEN CALL PZDOTC( J-1, CSUMJ, A, IA, JA+J-1, DESCA, 1, $ X, IX, JX, DESCX, 1 ) ELSE IF( J.LT.N ) THEN CALL PZDOTC( N-J, CSUMJ, A, IA+J, JA+J-1, DESCA, 1, $ X, IX+J, JX, DESCX, 1 ) END IF IF( MYCOL.EQ.ITMP2X ) THEN CALL ZGEBS2D( CONTXT, 'Row', ' ', 1, 1, CSUMJ, 1 ) ELSE CALL ZGEBR2D( CONTXT, 'Row', ' ', 1, 1, CSUMJ, 1, $ MYROW, ITMP2X ) END IF ELSE * * Otherwise, scale column of A by USCAL before dot product. * Below is not the best way to do it. * IF( UPPER ) THEN C DO 180 I = 1, J - 1 C CSUMJ = CSUMJ + ( DCONJG( A( I, J ) )*USCAL )* C $ X( I ) C 180 CONTINUE ZDUM = DCONJG( USCAL ) CALL PZSCAL( J-1, ZDUM, A, IA, JA+J-1, DESCA, 1 ) CALL PZDOTC( J-1, CSUMJ, A, IA, JA+J-1, DESCA, 1, $ X, IX, JX, DESCX, 1 ) ZDUM = ZLADIV( CONE, ZDUM ) CALL PZSCAL( J-1, ZDUM, A, IA, JA+J-1, DESCA, 1 ) ELSE IF( J.LT.N ) THEN C DO 190 I = J + 1, N C CSUMJ = CSUMJ + ( DCONJG( A( I, J ) )*USCAL )* C $ X( I ) C 190 CONTINUE ZDUM = DCONJG( USCAL ) CALL PZSCAL( N-J, ZDUM, A, IA+J, JA+J-1, DESCA, 1) CALL PZDOTC( N-J, CSUMJ, A, IA+J, JA+J-1, DESCA, 1, $ X, IX+J, JX, DESCX, 1 ) ZDUM = ZLADIV( CONE, ZDUM ) CALL PZSCAL( N-J, ZDUM, A, IA+J, JA+J-1, DESCA, 1 ) END IF IF( MYCOL.EQ.ITMP2X ) THEN CALL ZGEBS2D( CONTXT, 'Row', ' ', 1, 1, CSUMJ, 1 ) ELSE CALL ZGEBR2D( CONTXT, 'Row', ' ', 1, 1, CSUMJ, 1, $ MYROW, ITMP2X ) END IF END IF * IF( USCAL.EQ.DCMPLX( TSCAL ) ) THEN * * Compute x(j) := ( x(j) - CSUMJ ) / A(j,j) if 1/A(j,j) * was not used to scale the dotproduct. * C X( J ) = X( J ) - CSUMJ C XJ = CABS1( X( J ) ) XJTMP = XJTMP - CSUMJ XJ = CABS1( XJTMP ) C IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) C $ X( IROWX ) = XJTMP IF( NOUNIT ) THEN C TJJS = DCONJG( A( J, J ) )*TSCAL CALL INFOG2L( IA+J-1, JA+J-1, DESCA, NPROW, NPCOL, $ MYROW, MYCOL, IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) $ THEN TJJS = DCONJG( A( (ICOL-1)*LDA+IROW ) )*TSCAL CALL ZGEBS2D( CONTXT, 'All', ' ', 1, 1, TJJS,1) ELSE CALL ZGEBR2D( CONTXT, 'All', ' ', 1, 1, TJJS,1, $ ITMP1, ITMP2 ) END IF ELSE TJJS = TSCAL IF( TSCAL.EQ.ONE ) $ GO TO 210 END IF * * Compute x(j) = x(j) / A(j,j), scaling if necessary. * TJJ = CABS1( TJJS ) IF( TJJ.GT.SMLNUM ) THEN * * abs(A(j,j)) > SMLNUM: * IF( TJJ.LT.ONE ) THEN IF( XJ.GT.TJJ*BIGNUM ) THEN * * Scale X by 1/abs(x(j)). * REC = ONE / XJ CALL PZDSCAL( N, REC, X, IX, JX, DESCX, 1 ) XJTMP = XJTMP*REC SCALE = SCALE*REC XMAX = XMAX*REC END IF END IF C X( J ) = ZLADIV( X( J ), TJJS ) XJTMP = ZLADIV( XJTMP, TJJS ) IF(( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X )) $ X( IROWX ) = XJTMP ELSE IF( TJJ.GT.ZERO ) THEN * * 0 < abs(A(j,j)) <= SMLNUM: * IF( XJ.GT.TJJ*BIGNUM ) THEN * * Scale x by (1/abs(x(j)))*abs(A(j,j))*BIGNUM. * REC = ( TJJ*BIGNUM ) / XJ CALL PZDSCAL( N, REC, X, IX, JX, DESCX, 1 ) XJTMP = XJTMP*REC SCALE = SCALE*REC XMAX = XMAX*REC END IF C X( J ) = ZLADIV( X( J ), TJJS ) XJTMP = ZLADIV( XJTMP, TJJS ) IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) $ X( IROWX ) = XJTMP ELSE * * A(j,j) = 0: Set x(1:n) = 0, x(j) = 1, and * scale = 0 and compute a solution to A**H *x = 0. * CALL PZLASET( ' ', N, 1, CZERO, CZERO, X, IX, JX, $ DESCX ) IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) $ X( IROWX ) = CONE XJTMP = CONE SCALE = ZERO XMAX = ZERO END IF 210 CONTINUE ELSE * * Compute x(j) := x(j) / A(j,j) - CSUMJ if the dot * product has already been divided by 1/A(j,j). * C X( J ) = ZLADIV( X( J ), TJJS ) - CSUMJ XJTMP = ZLADIV( XJTMP, TJJS ) - CSUMJ IF( ( MYROW.EQ.ITMP1X ) .AND. ( MYCOL.EQ.ITMP2X ) ) $ X( IROWX ) = XJTMP END IF XMAX = MAX( XMAX, CABS1( XJTMP ) ) 220 CONTINUE END IF SCALE = SCALE / TSCAL END IF * * Scale the column norms by 1/TSCAL for return. * IF( TSCAL.NE.ONE ) THEN CALL DSCAL( N, ONE / TSCAL, CNORM, 1 ) END IF * RETURN * * End of PZLATRS * END SHAR_EOF fi # end of overwriting check if test -f 'pzlawil.f' then echo shar: will not over-write existing file "'pzlawil.f'" else cat << "SHAR_EOF" > 'pzlawil.f' SUBROUTINE PZLAWIL( II, JJ, M, A, DESCA, H44, H33, H43H34, V ) * * Mark R. Fahey * May 28, 1999 * * .. Scalar Arguments .. INTEGER II, JJ, M COMPLEX*16 H33, H43H34, H44 * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), V( * ) * .. * * Purpose * ======= * * PZLAWIL gets the transform given by H44,H33, & H43H34 into V * starting at row M. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * II (global input) INTEGER * Row owner of H(M+2,M+2) * * JJ (global input) INTEGER * Column owner of H(M+2,M+2) * * M (global input) INTEGER * On entry, this is where the transform starts (row M.) * Unchanged on exit. * * A (global input) COMPLEX*16 array, dimension * (DESCA(LLD_),*) * On entry, the Hessenberg matrix. * Unchanged on exit. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * Unchanged on exit. * * H44 * H33 * H43H34 (global input) COMPLEX*16 * These three values are for the double shift QR iteration. * Unchanged on exit. * * V (global output) COMPLEX*16 array of size 3. * Contains the transform on ouput. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CONTXT, DOWN, HBL, ICOL, IROW, JSRC, LDA, LEFT, $ MODKM1, MYCOL, MYROW, NPCOL, NPROW, NUM, RIGHT, $ RSRC, UP DOUBLE PRECISION S COMPLEX*16 CDUM, H11, H12, H21, H22, H33S, H44S, V1, V2, $ V3 * .. * .. Local Arrays .. COMPLEX*16 BUF( 4 ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG2L, ZGERV2D, ZGESD2D * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DIMAG, MOD * .. * .. Statement Functions .. DOUBLE PRECISION CABS1 * .. * .. Statement Function definitions .. CABS1( CDUM ) = ABS( DBLE( CDUM ) ) + ABS( DIMAG( CDUM ) ) * .. * .. Executable Statements .. * HBL = DESCA( MB_ ) CONTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) LEFT = MOD( MYCOL+NPCOL-1, NPCOL ) RIGHT = MOD( MYCOL+1, NPCOL ) UP = MOD( MYROW+NPROW-1, NPROW ) DOWN = MOD( MYROW+1, NPROW ) NUM = NPROW*NPCOL * * On node (II,JJ) collect all DIA,SUP,SUB info from M, M+1 * MODKM1 = MOD( M+1, HBL ) IF( MODKM1.EQ.0 ) THEN IF( ( MYROW.EQ.II ) .AND. ( RIGHT.EQ.JJ ) .AND. $ ( NPCOL.GT.1 ) ) THEN CALL INFOG2L( M+2, M+1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, RSRC, JSRC ) BUF( 1 ) = A( ( ICOL-1 )*LDA+IROW ) CALL ZGESD2D( CONTXT, 1, 1, BUF, 1, II, JJ ) END IF IF( ( DOWN.EQ.II ) .AND. ( RIGHT.EQ.JJ ) .AND. ( NUM.GT.1 ) ) $ THEN CALL INFOG2L( M, M, DESCA, NPROW, NPCOL, MYROW, MYCOL, IROW, $ ICOL, RSRC, JSRC ) BUF( 1 ) = A( ( ICOL-1 )*LDA+IROW ) BUF( 2 ) = A( ( ICOL-1 )*LDA+IROW+1 ) BUF( 3 ) = A( ICOL*LDA+IROW ) BUF( 4 ) = A( ICOL*LDA+IROW+1 ) CALL ZGESD2D( CONTXT, 4, 1, BUF, 4, II, JJ ) END IF IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN CALL INFOG2L( M+2, M+2, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, RSRC, JSRC ) IF( NPCOL.GT.1 ) THEN CALL ZGERV2D( CONTXT, 1, 1, V3, 1, MYROW, LEFT ) ELSE V3 = A( ( ICOL-2 )*LDA+IROW ) END IF IF( NUM.GT.1 ) THEN CALL ZGERV2D( CONTXT, 4, 1, BUF, 4, UP, LEFT ) H11 = BUF( 1 ) H21 = BUF( 2 ) H12 = BUF( 3 ) H22 = BUF( 4 ) ELSE H11 = A( ( ICOL-3 )*LDA+IROW-2 ) H21 = A( ( ICOL-3 )*LDA+IROW-1 ) H12 = A( ( ICOL-2 )*LDA+IROW-2 ) H22 = A( ( ICOL-2 )*LDA+IROW-1 ) END IF END IF END IF IF( MODKM1.EQ.1 ) THEN IF( ( DOWN.EQ.II ) .AND. ( RIGHT.EQ.JJ ) .AND. ( NUM.GT.1 ) ) $ THEN CALL INFOG2L( M, M, DESCA, NPROW, NPCOL, MYROW, MYCOL, IROW, $ ICOL, RSRC, JSRC ) CALL ZGESD2D( CONTXT, 1, 1, A( ( ICOL-1 )*LDA+IROW ), 1, II, $ JJ ) END IF IF( ( DOWN.EQ.II ) .AND. ( MYCOL.EQ.JJ ) .AND. ( NPROW.GT.1 ) ) $ THEN CALL INFOG2L( M, M+1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, RSRC, JSRC ) CALL ZGESD2D( CONTXT, 1, 1, A( ( ICOL-1 )*LDA+IROW ), 1, II, $ JJ ) END IF IF( ( MYROW.EQ.II ) .AND. ( RIGHT.EQ.JJ ) .AND. $ ( NPCOL.GT.1 ) ) THEN CALL INFOG2L( M+1, M, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, RSRC, JSRC ) CALL ZGESD2D( CONTXT, 1, 1, A( ( ICOL-1 )*LDA+IROW ), 1, II, $ JJ ) END IF IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN CALL INFOG2L( M+2, M+2, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, RSRC, JSRC ) IF( NUM.GT.1 ) THEN CALL ZGERV2D( CONTXT, 1, 1, H11, 1, UP, LEFT ) ELSE H11 = A( ( ICOL-3 )*LDA+IROW-2 ) END IF IF( NPROW.GT.1 ) THEN CALL ZGERV2D( CONTXT, 1, 1, H12, 1, UP, MYCOL ) ELSE H12 = A( ( ICOL-2 )*LDA+IROW-2 ) END IF IF( NPCOL.GT.1 ) THEN CALL ZGERV2D( CONTXT, 1, 1, H21, 1, MYROW, LEFT ) ELSE H21 = A( ( ICOL-3 )*LDA+IROW-1 ) END IF H22 = A( ( ICOL-2 )*LDA+IROW-1 ) V3 = A( ( ICOL-2 )*LDA+IROW ) END IF END IF IF( ( MYROW.NE.II ) .OR. ( MYCOL.NE.JJ ) ) $ RETURN * IF( MODKM1.GT.1 ) THEN CALL INFOG2L( M+2, M+2, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, RSRC, JSRC ) H11 = A( ( ICOL-3 )*LDA+IROW-2 ) H21 = A( ( ICOL-3 )*LDA+IROW-1 ) H12 = A( ( ICOL-2 )*LDA+IROW-2 ) H22 = A( ( ICOL-2 )*LDA+IROW-1 ) V3 = A( ( ICOL-2 )*LDA+IROW ) END IF * H44S = H44 - H11 H33S = H33 - H11 V1 = ( H33S*H44S-H43H34 ) / H21 + H12 V2 = H22 - H11 - H33S - H44S S = CABS1( V1 ) + CABS1( V2 ) + CABS1( V3 ) V1 = V1 / S V2 = V2 / S V3 = V3 / S V( 1 ) = V1 V( 2 ) = V2 V( 3 ) = V3 * RETURN * * End of PZLAWIL * END SHAR_EOF fi # end of overwriting check if test -f 'pzrot.c' then echo shar: will not over-write existing file "'pzrot.c'" else cat << "SHAR_EOF" > 'pzrot.c' /* --------------------------------------------------------------------- * * -- Mark R. Fahey * June 28, 2000 * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" void pzrot_( n, X, ix, jx, desc_X, incx, Y, iy, jy, desc_Y, incy, c, s ) /* * Mark Fahey * June 22, 2000 */ /* * .. Scalar Arguments .. */ int * incx, * incy, * ix, * iy, * jx, * jy, * n; double * c; complex16 * s; /* * .. * .. Array Arguments .. */ int desc_X[], desc_Y[]; complex16 X[], Y[]; { /* * Purpose * ======= * * PZROT applies a plane rotation, where the cos (C) is real and the * sin (S) is complex, and the vectors CX and CY are complex, i.e., * * [ sub( X ) ] := [ C S ] [ sub( X ) ] * [ sub( Y ) ] := [ -conjg(S) C ] [ sub( Y ) ] * * where sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y, * * and where C*C + S*CONJG(S) = 1.0. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DT_A (global) descA[ DT_ ] The descriptor type. In this case, * DT_A = 1. * CTXT_A (global) descA[ CTXT_ ] The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) descA[ M_ ] The number of rows in the global * array A. * N_A (global) descA[ N_ ] The number of columns in the global * array A. * MB_A (global) descA[ MB_ ] The blocking factor used to distribu- * te the rows of the array. * NB_A (global) descA[ NB_ ] The blocking factor used to distribu- * te the columns of the array. * RSRC_A (global) descA[ RSRC_ ] The process row over which the first * row of the array A is distributed. * CSRC_A (global) descA[ CSRC_ ] The process column over which the * first column of the array A is * distributed. * LLD_A (local) descA[ LLD_ ] The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be seen as particular matrices, a distributed * vector is considered to be a distributed matrix. * * If INCX = M_X and INCY = M_Y, NB_X must be equal to NB_Y, and the * process column having the first entries of sub( Y ) must also contain * the first entries of sub( X ). Moreover, the quantity * MOD( JX-1, NB_X ) must be equal to MOD( JY-1, NB_Y ). * * If INCX = M_X, INCY = 1 and INCY <> M_Y, NB_X must be equal to MB_Y. * Moreover, the quantity MOD( JX-1, NB_X ) must be equal to * MOD( IY-1, MB_Y ). * * If INCX = 1, INCX <> M_X and INCY = M_Y, MB_X must be equal to NB_Y. * Moreover, the quantity MOD( IX-1, MB_X ) must be equal to * MOD( JY-1, NB_Y ). * * If INCX = 1, INCX <> M_X, INCY = 1 and INCY <> M_Y, MB_X must be * equal to MB_Y, and the process row having the first entries of * sub( Y ) must also contain the first entries of sub( X ). Moreover, * the quantity MOD( IX-1, MB_X ) must be equal to MOD( IY-1, MB_Y ). * * Arguments * ========= * * N (input) INTEGER * The number of elements in the vectors CX and CY. * * X (local input) COMPLEX array containing the local * pieces of a distributed matrix of dimension of at least * ( (JX-1)*M_X + IX + ( N - 1 )*abs( INCX ) ) * This array contains the entries of the distributed vector * sub( X ). * On output, CX is overwritten with C*X + S*Y. * * IX (global input) pointer to INTEGER * The global row index of the submatrix of the distributed * matrix X to operate on. * * JX (global input) pointer to INTEGER * The global column index of the submatrix of the distributed * matrix X to operate on. * * DESCX (global and local input) INTEGER array of dimension 8. * The array descriptor of the distributed matrix X. * * INCX (global input) pointer to INTEGER * The global increment for the elements of X. Only two values * of INCX are supported in this version, namely 1 and M_X. * * Y (local input) COMPLEX array containing the local * pieces of a distributed matrix of dimension of at least * ( (JY-1)*M_Y + IY + ( N - 1 )*abs( INCY ) ) * This array contains the entries of the distributed vector * sub( Y ). * On output, CY is overwritten with -CONJG(S)*X + C*Y. * * IY (global input) pointer to INTEGER * The global row index of the submatrix of the distributed * matrix Y to operate on. * * JY (global input) pointer to INTEGER * The global column index of the submatrix of the distributed * matrix Y to operate on. * * DESCY (global and local input) INTEGER array of dimension 8. * The array descriptor of the distributed matrix Y. * * INCY (global input) pointer to INTEGER * The global increment for the elements of Y. Only two values * of INCY are supported in this version, namely 1 and M_Y. * * C (input) pointer to DOUBLE * S (input) pointer COMPLEX * C and S define a rotation * [ C S ] * [ -conjg(S) C ] * where C*C + S*CONJG(S) = 1.0. * * ===================================================================== * * .. Local Scalars .. */ int ictxt, iix, iiy, info, ixcol, ixrow, iycol, iyrow, jjx, jjy, lcm, lcmp, mycol, myrow, nn, np, np0, nprow, npcol, nq, nz, ione=1, tmp1, wksz; complex16 xwork[1], ywork[1], zero; /* .. * .. PBLAS Buffer .. */ complex16 * buff; /* .. * .. External Functions .. */ void blacs_gridinfo_(); void zgerv2d_(); void zgesd2d_(); void pbchkvect(); void pbchkmat(); void pberror_(); char * getpbbuf(); F_INTG_FCT pbztrnv_(); F_INTG_FCT zrot_(); F_INTG_FCT ilcm_(); /* .. * .. Executable Statements .. * * Get grid parameters */ ictxt = desc_X[CTXT_]; blacs_gridinfo_( &ictxt, &nprow, &npcol, &myrow, &mycol ); /* * Test the input parameters */ info = 0; if( nprow == -1 ) info = -(500+CTXT_+1); else { pbchkvect( *n, 1, *ix, *jx, desc_X, *incx, 5, &iix, &jjx, &ixrow, &ixcol, nprow, npcol, myrow, mycol, &info ); pbchkvect( *n, 1, *iy, *jy, desc_Y, *incy, 10, &iiy, &jjy, &iyrow, &iycol, nprow, npcol, myrow, mycol, &info ); if( info == 0 ) { if( *n != 1 ) { if( *incx == desc_X[M_] ) { /* X is distributed along a process row */ if( *incy == desc_Y[M_] ) { /* Y is distributed over a process row */ if( ( ixcol != iycol ) || ( ( (*jx-1) % desc_X[NB_] ) != ( (*jy-1) % desc_Y[NB_] ) ) ) info = -9; else if( desc_Y[NB_] != desc_X[NB_] ) info = -(1000+NB_+1); } else if( ( *incy == 1 ) && ( *incy != desc_Y[M_] ) ) { /* Y is distributed over a process column */ if( ( (*jx-1) % desc_X[NB_] ) != ( (*iy-1) % desc_Y[MB_] ) ) info = -8; else if( desc_Y[MB_] != desc_X[NB_] ) info = -(1000+MB_+1); } else { info = -11; } } else if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) ) { /* X is distributed along a process column */ if( *incy == desc_Y[M_] ) { /* Y is distributed over a process row */ if( ( (*ix-1) % desc_X[MB_] ) != ( (*jy-1) % desc_Y[NB_] ) ) info = -9; else if( desc_Y[NB_] != desc_X[MB_] ) info = -(1000+NB_+1); } else if( ( *incy == 1 ) && ( *incy != desc_Y[M_] ) ) { /* Y is distributed over a process column */ if( ( ixrow != iyrow ) || ( ( (*ix-1) % desc_X[MB_] ) != ( (*iy-1) % desc_Y[MB_] ) ) ) info = -8; else if( desc_Y[MB_] != desc_X[MB_] ) info = -(1000+MB_+1); } else { info = -11; } } else { info = -6; } } if( ictxt != desc_Y[CTXT_] ) info = -(1000+CTXT_+1); } } if( info ) { pberror_( &ictxt, "PZROT", &info ); return; } /* * Quick return if possible. */ zero.re = ZERO; zero.im = ZERO; if( *n == 0 ) return; /* * rotation */ if( *n == 1 ) { if( ( myrow == ixrow ) && ( mycol == ixcol ) ) { buff = &X[iix-1+(jjx-1)*desc_X[LLD_]]; if( ( myrow != iyrow ) || ( mycol != iycol ) ) { zgesd2d_( &ictxt, n, n, buff, n, &iyrow, &iycol ); zgerv2d_( &ictxt, n, n, ywork, n, &iyrow, &iycol ); } else *ywork = Y[iiy-1+(jjy-1)*desc_Y[LLD_]]; zrot_( n, buff, n, ywork, n, c, s ); X[iix-1+(jjx-1)*desc_X[LLD_]] = *buff; if( ( myrow == iyrow ) && ( mycol == iycol ) ) Y[iiy-1+(jjy-1)*desc_Y[LLD_]] = *ywork; } else if( ( myrow == iyrow ) && ( mycol == iycol ) ) { zgesd2d_( &ictxt, n, n, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], n, &ixrow, &ixcol ); zgerv2d_( &ictxt, n, n, xwork, n, &ixrow, &ixcol ); zrot_( n, xwork, n, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], n, c, s ); } return; } if( ( *incx == desc_X[M_] ) && ( *incy == desc_Y[M_] ) ) { /* X and Y are both distributed over a process row */ nz = (*jx-1) % desc_Y[NB_]; nn = *n + nz; nq = numroc_( &nn, &desc_X[NB_], &mycol, &ixcol, &npcol ); if( mycol == ixcol ) nq -= nz; if( ixrow == iyrow ) { if( myrow == ixrow ) { zrot_( &nq, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], c, s ); } } else { if( myrow == ixrow ) { zgesd2d_( &ictxt, &ione, &nq, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &iyrow, &mycol ); buff = (complex16 *)getpbbuf( "PZROT", nq*sizeof(complex16) ); zgerv2d_( &ictxt, &nq, &ione, buff, &ione, &iyrow, &mycol ); zrot_( &nq, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], buff, &ione, c, s ); } else if( myrow == iyrow ) { zgesd2d_( &ictxt, &ione, &nq, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], &ixrow, &mycol ); buff = (complex16 *)getpbbuf( "PZROT", nq*sizeof(complex16) ); zgerv2d_( &ictxt, &nq, &ione, buff, &ione, &ixrow, &mycol ); zrot_( &nq, buff, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], c, s ); } } } else if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) && ( *incy == 1 ) && ( *incy != desc_Y[M_] ) ) { /* X and Y are both distributed over a process column */ nz = (*ix-1) % desc_X[MB_]; nn = *n + nz; np = numroc_( &nn, &desc_X[MB_], &myrow, &ixrow, &nprow ); if( myrow == ixrow ) np -= nz; if( ixcol == iycol ) { if( mycol == ixcol ) { zrot_( &np, &X[iix-1+(jjx-1)*desc_X[LLD_]], incx, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy, c, s ); } } else { if( mycol == ixcol ) { zgesd2d_( &ictxt, &np, &ione, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &myrow, &iycol ); buff = (complex16 *)getpbbuf( "PZROT", np*sizeof(complex16) ); zgerv2d_( &ictxt, &np, &ione, buff, &ione, &myrow, &iycol ); zrot_( &np, &X[iix-1+(jjx-1)*desc_X[LLD_]], incx, buff, &ione, c, s ); } else if( mycol == iycol ) { zgesd2d_( &ictxt, &np, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], &myrow, &ixcol ); buff = (complex16 *)getpbbuf( "PZROT", np*sizeof(complex16) ); zgerv2d_( &ictxt, &np, &ione, buff, &ione, &myrow, &ixcol ); zrot_( &np, buff, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy, c, s ); } } } else /* X and Y are not distributed along the same direction */ { lcm = ilcm_( &nprow, &npcol ); if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) ) { /* X is distributed over a process column */ lcmp = lcm / nprow; nz = (*jy-1) % desc_Y[NB_]; nn = *n + nz; tmp1 = nn / desc_Y[MB_]; np = numroc_( &nn, &desc_X[MB_], &myrow, &ixrow, &nprow ); np0 = MYROC0( tmp1, nn, desc_X[MB_], nprow ); tmp1 = np0 / desc_X[MB_]; wksz = MYROC0( tmp1, np0, desc_X[MB_], lcmp ); wksz = np + wksz; buff = (complex16 *)getpbbuf( "PZROT", wksz*sizeof(complex16) ); if( mycol == iycol ) jjy -= nz; if( myrow == ixrow ) np -= nz; pbztrnv_( &ictxt, C2F_CHAR( "R" ), C2F_CHAR( "T" ), n, &desc_Y[NB_], &nz, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], &zero, buff, &ione, &iyrow, &iycol, &ixrow, &ixcol, buff+np ); if( mycol == ixcol ) { zrot_( &np, &X[iix-1+(jjx-1)*desc_X[LLD_]], incx, buff, &ione, c, s ); } pbztrnv_( &ictxt, C2F_CHAR( "R" ), C2F_CHAR( "T" ), n, &desc_Y[NB_], &nz, buff, &ione, &zero, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], &ixrow, &ixcol, &iyrow, &iycol, buff+np ); } else /* Y is distributed over a process column */ { lcmp = lcm / nprow; nz = (*jx-1) % desc_X[NB_]; nn = *n + nz; tmp1 = nn / desc_X[MB_]; np = numroc_( &nn, desc_Y+MB_, &myrow, &iyrow, &nprow ); np0 = MYROC0( tmp1, nn, desc_Y[MB_], nprow ); tmp1 = np0 / desc_Y[MB_]; wksz = MYROC0( tmp1, np0, desc_Y[MB_], lcmp ); wksz = np + wksz; buff = (complex16 *)getpbbuf( "PZROT", wksz*sizeof(complex16) ); if( myrow == iyrow ) np -= nz; pbztrnv_( &ictxt, C2F_CHAR( "R" ), C2F_CHAR( "T" ), n, &desc_X[NB_], &nz, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &zero, buff, &ione, &ixrow, &ixcol, &iyrow, &iycol, buff+np ); if( mycol == iycol ) { zrot_( &np, buff, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy, c, s ); } pbztrnv_( &ictxt, C2F_CHAR( "R" ), C2F_CHAR( "T" ), n, &desc_X[NB_], &nz, buff, &ione, &zero, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &iyrow, &iycol, &ixrow, &ixcol, buff+np ); } } } SHAR_EOF fi # end of overwriting check if test -f 'pztrevc.f' then echo shar: will not over-write existing file "'pztrevc.f'" else cat << "SHAR_EOF" > 'pztrevc.f' SUBROUTINE PZTREVC( SIDE, HOWMNY, SELECT, N, T, DESCT, VL, DESCVL, $ VR, DESCVR, MM, M, WORK, RWORK, INFO ) * * Mark R. Fahey * June 20, 2000 * * .. Scalar Arguments .. CHARACTER HOWMNY, SIDE INTEGER INFO, M, MM, N * .. * .. Array Arguments .. LOGICAL SELECT( * ) INTEGER DESCT( * ), DESCVL( * ), DESCVR( * ) DOUBLE PRECISION RWORK( * ) COMPLEX*16 T( * ), VL( * ), VR( * ), WORK( * ) * .. * * Purpose * ======= * * PZTREVC computes some or all of the right and/or left eigenvectors of * a complex upper triangular matrix T in parallel. * * The right eigenvector x and the left eigenvector y of T corresponding * to an eigenvalue w are defined by: * * T*x = w*x, y'*T = w*y' * * where y' denotes the conjugate transpose of the vector y. * * If all eigenvectors are requested, the routine may either return the * matrices X and/or Y of right or left eigenvectors of T, or the * products Q*X and/or Q*Y, where Q is an input unitary * matrix. If T was obtained from the Schur factorization of an * original matrix A = Q*T*Q', then Q*X and Q*Y are the matrices of * right or left eigenvectors of A. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension r x c. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the r processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the c processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * SIDE (global input) CHARACTER*1 * = 'R': compute right eigenvectors only; * = 'L': compute left eigenvectors only; * = 'B': compute both right and left eigenvectors. * * HOWMNY (global input) CHARACTER*1 * = 'A': compute all right and/or left eigenvectors; * = 'B': compute all right and/or left eigenvectors, * and backtransform them using the input matrices * supplied in VR and/or VL; * = 'S': compute selected right and/or left eigenvectors, * specified by the logical array SELECT. * * SELECT (global input) LOGICAL array, dimension (N) * If HOWMNY = 'S', SELECT specifies the eigenvectors to be * computed. * If HOWMNY = 'A' or 'B', SELECT is not referenced. * To select the eigenvector corresponding to the j-th * eigenvalue, SELECT(j) must be set to .TRUE.. * * N (global input) INTEGER * The order of the matrix T. N >= 0. * * T (global input/output) COMPLEX*16 array, dimension * (DESCT(LLD_),*) * The upper triangular matrix T. T is modified, but restored * on exit. * * DESCT (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix T. * * VL (global input/output) COMPLEX*16 array, dimension * (DESCVL(LLD_),MM) * On entry, if SIDE = 'L' or 'B' and HOWMNY = 'B', VL must * contain an N-by-N matrix Q (usually the unitary matrix Q of * Schur vectors returned by ZHSEQR). * On exit, if SIDE = 'L' or 'B', VL contains: * if HOWMNY = 'A', the matrix Y of left eigenvectors of T; * if HOWMNY = 'B', the matrix Q*Y; * if HOWMNY = 'S', the left eigenvectors of T specified by * SELECT, stored consecutively in the columns * of VL, in the same order as their * eigenvalues. * If SIDE = 'R', VL is not referenced. * * DESCVL (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix VL. * * VR (global input/output) COMPLEX*16 array, dimension * (DESCVR(LLD_),MM) * On entry, if SIDE = 'R' or 'B' and HOWMNY = 'B', VR must * contain an N-by-N matrix Q (usually the unitary matrix Q of * Schur vectors returned by ZHSEQR). * On exit, if SIDE = 'R' or 'B', VR contains: * if HOWMNY = 'A', the matrix X of right eigenvectors of T; * if HOWMNY = 'B', the matrix Q*X; * if HOWMNY = 'S', the right eigenvectors of T specified by * SELECT, stored consecutively in the columns * of VR, in the same order as their * eigenvalues. * If SIDE = 'L', VR is not referenced. * * DESCVR (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix VR. * * MM (global input) INTEGER * The number of columns in the arrays VL and/or VR. MM >= M. * * M (global output) INTEGER * The number of columns in the arrays VL and/or VR actually * used to store the eigenvectors. If HOWMNY = 'A' or 'B', M * is set to N. Each selected eigenvector occupies one * column. * * WORK (local workspace) COMPLEX*16 array, dimension ( 2*DESCT(LLD_) ) * Additional workspace may be required if PZLATRS is updated * to use WORK. * * RWORK (local workspace) DOUBLE PRECISION array, * dimension ( DESCT(LLD_) ) * * INFO (global output) INTEGER * = 0: successful exit * < 0: if INFO = -i, the i-th argument had an illegal value * * Further Details * =============== * * The algorithm used in this program is basically backward (forward) * substitution. It is the hope that scaling would be used to make the * the code robust against possible overflow. But scaling has not yet * been implemented in PZLATRS which is called by this routine to solve * the triangular systems. PZLATRS just calls PZTRSV. * * Each eigenvector is normalized so that the element of largest * magnitude has magnitude 1; here the magnitude of a complex number * (x,y) is taken to be |x| + |y|. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) COMPLEX*16 CZERO, CONE PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ), $ CONE = ( 1.0D+0, 0.0D+0 ) ) INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. LOGICAL ALLV, BOTHV, LEFTV, OVER, RIGHTV, SOMEV INTEGER CONTXT, CSRC, I, II, IS, J, K, KI, LDT, LDW, $ MB, NB, NPCOL, NPROW, RSRC, LDVL, LDVR DOUBLE PRECISION OVFL, REMAXD, SCALE, SMIN, SMLNUM, ULP, UNFL COMPLEX*16 CDUM, REMAXC, SHIFT, TEMP * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ) * .. * .. External Functions .. LOGICAL LSAME INTEGER ICEIL, NUMROC DOUBLE PRECISION PDLAMCH EXTERNAL ICEIL, LSAME, NUMROC, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGAMN2D, INFOG2L, DGSUM2D, $ PDLABAD, PDZASUM, PXERBLA, PZAMAX, PZCOPY, $ PZDSCAL, PZGEMV, PZLASET, PZLATRS, ZGSUM2D * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCMPLX, DCONJG, DIMAG, MAX, MIN, MOD * .. * .. Statement Functions .. DOUBLE PRECISION CABS1, pzlantr * .. * .. Statement Function definitions .. CABS1( CDUM ) = ABS( DBLE( CDUM ) ) + ABS( DIMAG( CDUM ) ) * .. * .. Executable Statements .. * * This is just to keep ftnchek happy IF( BLOCK_CYCLIC_2D*CSRC_*CTXT_*DLEN_*DTYPE_*LLD_*MB_*M_*NB_*N_* $ RSRC_.LT.0 )RETURN * CONTXT = DESCT( CTXT_ ) RSRC = DESCT( RSRC_ ) CSRC = DESCT( CSRC_ ) MB = DESCT( MB_ ) NB = DESCT( NB_ ) LDT = DESCT( LLD_ ) LDW = LDT LDVR = DESCVR( LLD_ ) LDVL = DESCVL( LLD_ ) * CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) SELF = MYROW*NPCOL + MYCOL * * Decode and test the input parameters * BOTHV = LSAME( SIDE, 'B' ) RIGHTV = LSAME( SIDE, 'R' ) .OR. BOTHV LEFTV = LSAME( SIDE, 'L' ) .OR. BOTHV * ALLV = LSAME( HOWMNY, 'A' ) OVER = LSAME( HOWMNY, 'B' ) .OR. LSAME( HOWMNY, 'O' ) SOMEV = LSAME( HOWMNY, 'S' ) * * Set M to the number of columns required to store the selected * eigenvectors. * IF( SOMEV ) THEN M = 0 DO 10 J = 1, N IF( SELECT( J ) ) $ M = M + 1 10 CONTINUE ELSE M = N END IF * INFO = 0 IF( .NOT.RIGHTV .AND. .NOT.LEFTV ) THEN INFO = -1 ELSE IF( .NOT.ALLV .AND. .NOT.OVER .AND. .NOT.SOMEV ) THEN INFO = -2 ELSE IF( N.LT.0 ) THEN INFO = -4 ELSE IF( MM.LT.M ) THEN INFO = -11 END IF CALL IGAMN2D( CONTXT, 'ALL', ' ', 1, 1, INFO, 1, ITMP1, ITMP2, -1, $ -1, -1 ) IF( INFO.LT.0 ) THEN CALL PXERBLA( CONTXT, 'PZTREVC', -INFO ) RETURN END IF * * Quick return if possible. * IF( N.EQ.0 ) $ RETURN * * Set the constants to control overflow. * UNFL = PDLAMCH( CONTXT, 'Safe minimum' ) OVFL = ONE / UNFL CALL PDLABAD( CONTXT, UNFL, OVFL ) ULP = PDLAMCH( CONTXT, 'Precision' ) SMLNUM = UNFL*( N / ULP ) * * Store the diagonal elements of T in working array WORK( LDW+1 ). * DO 20 I = 1, N CALL INFOG2L( I, I, DESCT, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN WORK( LDW+IROW ) = T( (ICOL-1)*LDT+IROW ) END IF 20 CONTINUE * * Compute 1-norm of each column of strictly upper triangular * part of T to control overflow in triangular solver. Computed, * but not used. For use in PZLATRS. * RWORK( 1 ) = ZERO DO 30 J = 2, N CALL PDZASUM( J-1, RWORK( J ), T, 1, J, DESCT, 1 ) 30 CONTINUE * I replicate the norms in RWORK. Should they be distributed * over the process rows? CALL DGSUM2D( CONTXT, 'Row', ' ', N, 1, RWORK, 1, -1, -1 ) * IF( RIGHTV ) THEN * * Compute right eigenvectors. * * Need to set the distribution pattern of WORK * CALL DESCINIT( DESCW, N, 1, NB, 1, RSRC, CSRC, CONTXT, LDW, $ INFO ) * IS = M DO 80 KI = N, 1, -1 * IF( SOMEV ) THEN IF( .NOT.SELECT( KI ) ) $ GO TO 80 END IF * SMIN = ZERO SHIFT = CZERO CALL INFOG2L( KI, KI, DESCT, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN SHIFT = T( (ICOL-1)*LDT+IROW ) SMIN = MAX( ULP*( CABS1( SHIFT ) ), SMLNUM ) END IF CALL DGSUM2D( CONTXT, 'ALL', ' ', 1, 1, SMIN, 1, -1, -1 ) CALL ZGSUM2D( CONTXT, 'ALL', ' ', 1, 1, SHIFT, 1, -1, -1 ) * CALL INFOG2L( 1, 1, DESCW, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN WORK( 1 ) = CONE END IF * * Form right-hand side. Distribute rhs onto first column * of processor grid. * IF( KI.GT.1 ) THEN CALL PZCOPY( KI-1, T, 1, KI, DESCT, 1, WORK, 1, $ 1, DESCW, 1 ) ENDIF DO 40 K = 1, KI-1 CALL INFOG2L( K, 1, DESCW, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( MYROW.EQ.ITMP1 .AND. MYCOL.EQ.ITMP2 ) THEN WORK( IROW ) = -WORK( IROW ) END IF 40 CONTINUE * * Solve the triangular system: * (T(1:KI-1,1:KI-1) - T(KI,KI))*X = SCALE*WORK. * DO 50 K = 1, KI - 1 CALL INFOG2L( K, K, DESCT, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN T( (ICOL-1)*LDT+IROW ) = T( (ICOL-1)*LDT+IROW ) - $ SHIFT IF( CABS1( T( (ICOL-1)*LDT+IROW ) ).LT.SMIN ) THEN T( (ICOL-1)*LDT+IROW ) = DCMPLX( SMIN ) END IF END IF 50 CONTINUE * IF( KI.GT.1 ) THEN CALL PZLATRS( 'Upper', 'No transpose', 'Non-unit', 'Y', $ KI-1, T, 1, 1, DESCT, WORK, 1, 1, DESCW, $ SCALE, RWORK, INFO ) CALL INFOG2L( KI, 1, DESCW, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( MYROW.EQ.ITMP1 .AND. MYCOL.EQ.ITMP2 ) THEN WORK( IROW ) = DCMPLX( SCALE ) END IF END IF * * Copy the vector x or Q*x to VR and normalize. * IF( .NOT.OVER ) THEN CALL PZCOPY( KI, WORK, 1, 1, DESCW, 1, VR, 1, IS, $ DESCVR, 1 ) * CALL PZAMAX( KI, REMAXC, II, VR, 1, IS, DESCVR, 1 ) REMAXD = ONE / MAX( CABS1( REMAXC ), UNFL ) CALL PZDSCAL( KI, REMAXD, VR, 1, IS, DESCVR, 1 ) * CALL PZLASET( ' ', N-KI, 1, CZERO, CZERO, VR, KI+1, $ IS, DESCVR ) ELSE IF( KI.GT.1 ) $ CALL PZGEMV( 'N', N, KI-1, CONE, VR, 1, 1, DESCVR, $ WORK, 1, 1, DESCW, 1, DCMPLX( SCALE ), $ VR, 1, KI, DESCVR, 1 ) * CALL PZAMAX( N, REMAXC, II, VR, 1, KI, DESCVR, 1 ) REMAXD = ONE / MAX( CABS1( REMAXC ), UNFL ) CALL PZDSCAL( N, REMAXD, VR, 1, KI, DESCVR, 1 ) END IF * * Set back the original diagonal elements of T. * DO 70 K = 1, KI - 1 CALL INFOG2L( K, K, DESCT, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN T( (ICOL-1)*LDT+IROW ) = WORK( LDW+IROW ) END IF 70 CONTINUE * IS = IS - 1 80 CONTINUE END IF * IF( LEFTV ) THEN * * Compute left eigenvectors. * * Need to set the distribution pattern of WORK * CALL DESCINIT( DESCW, N, 1, MB, 1, RSRC, CSRC, CONTXT, LDW, $ INFO ) * IS = 1 DO 130 KI = 1, N * IF( SOMEV ) THEN IF( .NOT.SELECT( KI ) ) $ GO TO 130 END IF * SMIN = ZERO SHIFT = CZERO CALL INFOG2L( KI, KI, DESCT, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN SHIFT = T( (ICOL-1)*LDT+IROW ) SMIN = MAX( ULP*( CABS1( SHIFT ) ), SMLNUM ) END IF CALL DGSUM2D( CONTXT, 'ALL', ' ', 1, 1, SMIN, 1, -1, -1 ) CALL ZGSUM2D( CONTXT, 'ALL', ' ', 1, 1, SHIFT, 1, -1, -1 ) * CALL INFOG2L( N, 1, DESCW, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN WORK( IROW ) = CONE END IF * * Form right-hand side. * IF( KI.LT.N ) THEN CALL PZCOPY( N-KI, T, KI, KI+1, DESCT, N, WORK, KI+1, $ 1, DESCW, 1 ) ENDIF DO 90 K = KI + 1, N CALL INFOG2L( K, 1, DESCW, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( MYROW.EQ.ITMP1 .AND. MYCOL.EQ.ITMP2 ) THEN WORK( IROW ) = -DCONJG( WORK( IROW ) ) END IF 90 CONTINUE * * Solve the triangular system: * (T(KI+1:N,KI+1:N) - T(KI,KI))'*X = SCALE*WORK. * DO 100 K = KI + 1, N CALL INFOG2L( K, K, DESCT, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN T( (ICOL-1)*LDT+IROW ) = T( (ICOL-1)*LDT+IROW ) - $ SHIFT IF( CABS1( T( (ICOL-1)*LDT+IROW ) ).LT.SMIN ) $ T( (ICOL-1)*LDT+IROW ) = DCMPLX( SMIN ) END IF 100 CONTINUE * IF( KI.LT.N ) THEN CALL PZLATRS( 'Upper', 'Conjugate transpose', 'Nonunit', $ 'Y', N-KI, T, KI+1, KI+1, DESCT, WORK, KI+1, $ 1, DESCW, SCALE, RWORK, INFO ) CALL INFOG2L( KI, 1, DESCW, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( MYROW.EQ.ITMP1 .AND. MYCOL.EQ.ITMP2 ) THEN WORK( IROW ) = DCMPLX( SCALE ) END IF END IF * * Copy the vector x or Q*x to VL and normalize. * IF( .NOT.OVER ) THEN CALL PZCOPY( N-KI+1, WORK, KI, 1, DESCW, 1, VL, KI, $ IS, DESCVL, 1 ) * CALL PZAMAX( N-KI+1, REMAXC, II, VL, KI, IS, DESCVL, 1 ) REMAXD = ONE / MAX( CABS1( REMAXC ), UNFL ) CALL PZDSCAL( N-KI+1, REMAXD, VL, KI, IS, DESCVL, 1 ) * CALL PZLASET( ' ', KI-1, 1, CZERO, CZERO, VL, 1, $ IS, DESCVL ) ELSE IF( KI.LT.N ) $ CALL PZGEMV( 'N', N, N-KI, CONE, VL, 1, KI+1, $ DESCVL, WORK, KI+1, 1, DESCW, 1, $ DCMPLX( SCALE ), VL, 1, KI, DESCVL, 1 ) * CALL PZAMAX( N, REMAXC, II, VL, 1, KI, DESCVL, 1 ) REMAXD = ONE / MAX( CABS1( REMAXC ), UNFL ) CALL PZDSCAL( N, REMAXD, VL, 1, KI, DESCVL, 1 ) END IF * * Set back the original diagonal elements of T. * DO 120 K = KI + 1, N CALL INFOG2L( K, K, DESCT, NPROW, NPCOL, MYROW, MYCOL, $ IROW, ICOL, ITMP1, ITMP2 ) IF( ( MYROW.EQ.ITMP1 ) .AND. ( MYCOL.EQ.ITMP2 ) ) THEN T( (ICOL-1)*LDT+IROW ) = WORK( LDW+IROW ) END IF 120 CONTINUE * IS = IS + 1 130 CONTINUE END IF * RETURN * * End of PZTREVC * END SHAR_EOF fi # end of overwriting check if test -f 'zlahqr.f' then echo shar: will not over-write existing file "'zlahqr.f'" else cat << "SHAR_EOF" > 'zlahqr.f' SUBROUTINE ZLAHQR( WANTT, WANTZ, N, ILO, IHI, H, LDH, W, ILOZ, $ IHIZ, Z, LDZ, INFO ) * * Mark R. Fahey * June 22, 2000 * * .. Scalar Arguments .. LOGICAL WANTT, WANTZ INTEGER IHI, IHIZ, ILO, ILOZ, INFO, LDH, LDZ, N * .. * .. Array Arguments .. COMPLEX*16 H( LDH, * ), W( * ), Z( LDZ, * ) * .. * * Purpose * ======= * * ZLAHQR is an auxiliary routine called by ZHSEQR to update the * eigenvalues and Schur decomposition already computed by ZHSEQR, by * dealing with the Hessenberg submatrix in rows and columns ILO to IHI. * This version of ZLAHQR (not the standard LAPACK version) uses a * double-shift algorithm (like LAPACK's DLAHQR). * Unlike the standard LAPACK convention, this does not assume the * subdiagonal is real, nor does it work to preserve this quality if * given. * * Arguments * ========= * * WANTT (input) LOGICAL * = .TRUE. : the full Schur form T is required; * = .FALSE.: only eigenvalues are required. * * WANTZ (input) LOGICAL * = .TRUE. : the matrix of Schur vectors Z is required; * = .FALSE.: Schur vectors are not required. * * N (input) INTEGER * The order of the matrix H. N >= 0. * * ILO (input) INTEGER * IHI (input) INTEGER * It is assumed that H is already upper triangular in rows and * columns IHI+1:N, and that H(ILO,ILO-1) = 0 (unless ILO = 1). * ZLAHQR works primarily with the Hessenberg submatrix in rows * and columns ILO to IHI, but applies transformations to all of * H if WANTT is .TRUE.. * 1 <= ILO <= max(1,IHI); IHI <= N. * * H (input/output) COMPLEX*16 array, dimension (LDH,N) * On entry, the upper Hessenberg matrix H. * On exit, if WANTT is .TRUE., H is upper triangular in rows * and columns ILO:IHI. If WANTT is .FALSE., the contents of H * are unspecified on exit. * * LDH (input) INTEGER * The leading dimension of the array H. LDH >= max(1,N). * * W (output) COMPLEX*16 array, dimension (N) * The computed eigenvalues ILO to IHI are stored in the * corresponding elements of W. If WANTT is .TRUE., the * eigenvalues are stored in the same order as on the diagonal * of the Schur form returned in H, with W(i) = H(i,i). * * ILOZ (input) INTEGER * IHIZ (input) INTEGER * Specify the rows of Z to which transformations must be * applied if WANTZ is .TRUE.. * 1 <= ILOZ <= ILO; IHI <= IHIZ <= N. * * Z (input/output) COMPLEX*16 array, dimension (LDZ,N) * If WANTZ is .TRUE., on entry Z must contain the current * matrix Z of transformations, and on exit Z has been updated; * transformations are applied only to the submatrix * Z(ILOZ:IHIZ,ILO:IHI). If WANTZ is .FALSE., Z is not * referenced. * * LDZ (input) INTEGER * The leading dimension of the array Z. LDZ >= max(1,N). * * INFO (output) INTEGER * = 0: successful exit * > 0: if INFO = i, ZLAHQR failed to compute all the * eigenvalues ILO to IHI in a total of 30*(IHI-ILO+1) * iterations; elements i+1:ihi of W contain those * eigenvalues which have been successfully computed. * * ===================================================================== * * .. Parameters .. COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ), $ ONE = ( 1.0D+0, 0.0D+0 ) ) DOUBLE PRECISION RZERO, RONE, HALF PARAMETER ( RZERO = 0.0D+0, RONE = 1.0D+0, $ HALF = 0.5D+0 ) DOUBLE PRECISION DAT1, DAT2 PARAMETER ( DAT1 = 0.75D+0, DAT2 = -0.4375D+0 ) * .. * .. Local Scalars .. INTEGER I, I1, I2, ITN, ITS, J, K, L, M, NH, NR, NZ DOUBLE PRECISION CS, OVFL, RTEMP, S, SMLNUM, TST1, ULP, UNFL COMPLEX*16 CDUM, H00, H10, H11, H11S, H12, H21, H22, H33, $ H33S, H43H34, H44, H44S, SN, SUM, TEMP, T1, T2, $ T3, V1, V2, V3 * .. * .. Local Arrays .. DOUBLE PRECISION RWORK( 1 ) COMPLEX*16 V( 3 ) * .. * .. External Functions .. DOUBLE PRECISION DLAMCH, DLAPY2, ZLANHS COMPLEX*16 ZLADIV EXTERNAL DLAMCH, DLAPY2, ZLANHS, ZLADIV * .. * .. External Subroutines .. EXTERNAL DLABAD, ZCOPY, ZLANV2, ZLARFG, ZSCAL * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCONJG, DIMAG, MAX, MIN, SQRT * .. * .. Statement Functions .. DOUBLE PRECISION CABS1 * .. * .. Statement Function definitions .. CABS1( CDUM ) = ABS( DBLE( CDUM ) ) + ABS( DIMAG( CDUM ) ) * .. * .. Executable Statements .. * INFO = 0 * * Quick return if possible * IF( N.EQ.0 ) $ RETURN IF( ILO.EQ.IHI ) THEN W( ILO ) = H( ILO, ILO ) RETURN END IF * NH = IHI - ILO + 1 NZ = IHIZ - ILOZ + 1 * * Set machine-dependent constants for the stopping criterion. * If norm(H) <= sqrt(OVFL), overflow should not occur. * UNFL = DLAMCH( 'Safe minimum' ) OVFL = RONE / UNFL CALL DLABAD( UNFL, OVFL ) ULP = DLAMCH( 'Precision' ) SMLNUM = UNFL*( NH / ULP ) * * I1 and I2 are the indices of the first row and last column of H * to which transformations must be applied. If eigenvalues only are * being computed, I1 and I2 are set inside the main loop. * IF( WANTT ) THEN I1 = 1 I2 = N END IF * * ITN is the total number of QR iterations allowed. * ITN = 30*NH * * The main loop begins here. I is the loop index and decreases from * IHI to ILO in steps of 1 or 2. Each iteration of the loop works * with the active submatrix in rows and columns L to I. * Eigenvalues I+1 to IHI have already converged. Either L = ILO, or * H(L,L-1) is negligible so that the matrix splits. * I = IHI 10 CONTINUE L = ILO IF( I.LT.ILO ) $ GO TO 130 * * Perform QR iterations on rows and columns ILO to I until a * submatrix of order 1 or 2 splits off at the bottom because a * subdiagonal element has become negligible. * DO 110 ITS = 0, ITN * * Look for a single small subdiagonal element. * DO 20 K = I, L + 1, -1 TST1 = CABS1( H( K-1, K-1 ) ) + CABS1( H( K, K ) ) IF( TST1.EQ.RZERO ) $ TST1 = ZLANHS( '1', I-L+1, H( L, L ), LDH, RWORK ) IF( CABS1( H( K, K-1 ) ).LE.MAX( ULP*TST1, SMLNUM ) ) $ GO TO 30 20 CONTINUE 30 CONTINUE L = K IF( L.GT.ILO ) THEN * * H(L,L-1) is negligible * H( L, L-1 ) = ZERO END IF * * Exit from loop if a submatrix of order 1 or 2 has split off. * IF( L.GE.I-1 ) $ GO TO 120 * * Now the active submatrix is in rows and columns L to I. If * eigenvalues only are being computed, only the active submatrix * need be transformed. * IF( .NOT.WANTT ) THEN I1 = L I2 = I END IF * IF( ITS.EQ.10 .OR. ITS.EQ.20 ) THEN * * Exceptional shift. * C S = ABS( DBLE( H( I,I-1 ) ) ) + ABS( DBLE( H( I-1,I-2 ) ) ) S = CABS1( H( I,I-1 ) ) + CABS1( H( I-1,I-2 ) ) H44 = DAT1*S H33 = H44 H43H34 = DAT2*S*S ELSE * * Prepare to use Wilkinson's shift. * H44 = H( I, I ) H33 = H( I-1, I-1 ) H43H34 = H( I, I-1 )*H( I-1, I ) END IF * * Look for two consecutive small subdiagonal elements. * DO 40 M = I - 2, L, -1 * * Determine the effect of starting the double-shift QR * iteration at row M, and see if this would make H(M,M-1) * negligible. * H11 = H( M, M ) H22 = H( M+1, M+1 ) H21 = H( M+1, M ) H12 = H( M, M+1 ) H44S = H44 - H11 H33S = H33 - H11 V1 = ( H33S*H44S-H43H34 ) / H21 + H12 V2 = H22 - H11 - H33S - H44S V3 = H( M+2, M+1 ) S = CABS1( V1 ) + CABS1( V2 ) + ABS( V3 ) V1 = V1 / S V2 = V2 / S V3 = V3 / S V( 1 ) = V1 V( 2 ) = V2 V( 3 ) = V3 IF( M.EQ.L ) $ GO TO 50 H00 = H( M-1, M-1 ) H10 = H( M, M-1 ) TST1 = CABS1( V1 )*( CABS1( H00 )+CABS1( H11 )+ $ CABS1( H22 ) ) IF( CABS1( H10 )*( CABS1( V2 )+CABS1( V3 ) ).LE.ULP*TST1 ) $ GO TO 50 40 CONTINUE 50 CONTINUE * * Double-shift QR step * DO 100 K = M, I - 1 * * The first iteration of this loop determines a reflection G * from the vector V and applies it from left and right to H, * thus creating a nonzero bulge below the subdiagonal. * * Each subsequent iteration determines a reflection G to * restore the Hessenberg form in the (K-1)th column, and thus * chases the bulge one step toward the bottom of the active * submatrix. NR is the order of G * NR = MIN( 3, I-K+1 ) IF( K.GT.M ) $ CALL ZCOPY( NR, H( K, K-1 ), 1, V, 1 ) CALL ZLARFG( NR, V( 1 ), V( 2 ), 1, T1 ) IF( K.GT.M ) THEN H( K, K-1 ) = V( 1 ) H( K+1, K-1 ) = ZERO IF( K.LT.I-1 ) $ H( K+2, K-1 ) = ZERO ELSE IF( M.GT.L ) THEN * The real double-shift code uses H( K, K-1 ) = -H( K, K-1 ) * instead of the following. H( K, K-1 ) = H( K, K-1 ) - DCONJG( T1 )*H( K, K-1 ) END IF V2 = V( 2 ) T2 = T1*V2 IF( NR.EQ.3 ) THEN V3 = V( 3 ) T3 = T1*V3 * * Apply G from the left to transform the rows of the matrix * in columns K to I2. * DO 60 J = K, I2 SUM = DCONJG( T1 )*H( K, J ) + $ DCONJG( T2 )*H( K+1, J ) + DCONJG( T3 )*H( K+2, J ) H( K, J ) = H( K, J ) - SUM H( K+1, J ) = H( K+1, J ) - SUM*V2 H( K+2, J ) = H( K+2, J ) - SUM*V3 60 CONTINUE * * Apply G from the right to transform the columns of the * matrix in rows I1 to min(K+3,I). * DO 70 J = I1, MIN( K+3, I ) SUM = T1*H( J, K ) + T2*H( J, K+1 ) + T3*H( J, K+2 ) H( J, K ) = H( J, K ) - SUM H( J, K+1 ) = H( J, K+1 ) - SUM*DCONJG( V2 ) H( J, K+2 ) = H( J, K+2 ) - SUM*DCONJG( V3 ) 70 CONTINUE * IF( WANTZ ) THEN * * Accumulate transformations in the matrix Z * DO 80 J = ILOZ, IHIZ SUM = T1*Z( J, K ) + T2*Z( J, K+1 ) + $ T3*Z( J, K+2 ) Z( J, K ) = Z( J, K ) - SUM Z( J, K+1 ) = Z( J, K+1 ) - SUM*DCONJG( V2 ) Z( J, K+2 ) = Z( J, K+2 ) - SUM*DCONJG( V3 ) 80 CONTINUE END IF ELSE IF( NR.EQ.2 ) THEN * * Apply G from the left to transform the rows of the matrix * in columns K to I2. * DO 61 J = K, I2 SUM = DCONJG( T1 )*H( K, J ) + DCONJG(T2)*H( K+1, J ) H( K, J ) = H( K, J ) - SUM H( K+1, J ) = H( K+1, J ) - SUM*V2 61 CONTINUE * * Apply G from the right to transform the columns of the * matrix in rows I1 to min(K+2,I). * DO 71 J = I1, MIN( K+2, I ) SUM = T1*H( J, K ) + T2*H( J, K+1 ) H( J, K ) = H( J, K ) - SUM H( J, K+1 ) = H( J, K+1 ) - SUM*DCONJG( V2 ) 71 CONTINUE * IF( WANTZ ) THEN * * Accumulate transformations in the matrix Z * DO 81 J = ILOZ, IHIZ SUM = T1*Z( J, K ) + T2*Z( J, K+1 ) Z( J, K ) = Z( J, K ) - SUM Z( J, K+1 ) = Z( J, K+1 ) - SUM*DCONJG( V2 ) 81 CONTINUE END IF END IF * * Since at the start of the QR step we have for M > L * H( K, K-1 ) = H( K, K-1 ) - DCONJG( T1 )*H( K, K-1 ) * then we don't need to do the following * IF( K.EQ.M .AND. M.GT.L ) THEN * If the QR step was started at row M > L because two * consecutive small subdiagonals were found, then H(M,M-1) * must also be updated by a factor of (1-T1). * TEMP = ONE - T1 * H( m, m-1 ) = H( m, m-1 )*DCONJG( TEMP ) * END IF 100 CONTINUE * * Ensure that H(I,I-1) is real. * 110 CONTINUE * * Failure to converge in remaining number of iterations * INFO = I RETURN * 120 CONTINUE * IF( L.EQ.I ) THEN * * H(I,I-1) is negligible: one eigenvalue has converged. * W( I ) = H( I, I ) * ELSE IF( L.EQ.I-1 ) THEN * * H(I-1,I-2) is negligible: a pair of eigenvalues have converged. * * Transform the 2-by-2 submatrix to standard Schur form, * and compute and store the eigenvalues. * CALL ZLANV2( H( I-1, I-1 ), H( I-1, I ), H( I, I-1 ), $ H( I, I ), W( I-1 ), W( I ), CS, SN ) * IF( WANTT ) THEN * * Apply the transformation to the rest of H. * IF( I2.GT.I ) $ CALL ZROT( I2-I, H( I-1, I+1 ), LDH, H( I, I+1 ), LDH, $ CS, SN ) CALL ZROT( I-I1-1, H( I1, I-1 ), 1, H( I1, I ), 1, CS, $ DCONJG(SN) ) END IF IF( WANTZ ) THEN * * Apply the transformation to Z. * CALL ZROT( NZ, Z( ILOZ, I-1 ), 1, Z( ILOZ, I ), 1, CS, $ DCONJG(SN) ) END IF * END IF * * Decrement number of remaining iterations, and return to start of * the main loop with new value of I. * ITN = ITN - ITS I = L - 1 GO TO 10 * 130 CONTINUE RETURN * * End of ZLAHQR * END SHAR_EOF fi # end of overwriting check if test -f 'zlamsh.f' then echo shar: will not over-write existing file "'zlamsh.f'" else cat << "SHAR_EOF" > 'zlamsh.f' SUBROUTINE ZLAMSH ( S, LDS, NBULGE, JBLK, H, LDH, N, ULP ) * * Mark R. Fahey * May 28, 1999 * * .. Scalar Arguments .. INTEGER LDS, NBULGE, JBLK, LDH, N DOUBLE PRECISION ULP * .. * .. Array Arguments .. COMPLEX*16 S( LDS,* ), H( LDH,* ) * .. * * Purpose * ======= * * ZLAMSH sends multiple shifts through a small (single node) matrix to * see how consecutive small subdiagonal elements are modified by * subsequent shifts in an effort to maximize the number of bulges * that can be sent through. * ZLAMSH should only be called when there are multiple shifts/bulges * (NBULGE > 1) and the first shift is starting in the middle of an * unreduced Hessenberg matrix because of two or more consecutive small * subdiagonal elements. * * Arguments * ========= * * S (local input/output) COMPLEX*16 array, ( LDS,* ) * On entry, the matrix of shifts. Only the 2x2 diagonal of S is * referenced. It is assumed that S has JBLK double shifts * (size 2). * On exit, the data is rearranged in the best order for * applying. * * LDS (local input) INTEGER * On entry, the leading dimension of S. Unchanged on exit. * 1 < NBULGE <= JBLK <= LDS/2 * * NBULGE (local input/output) INTEGER * On entry, the number of bulges to send through H ( >1 ). * NBULGE should be less than the maximum determined (JBLK). * 1 < NBULGE <= JBLK <= LDS/2 * On exit, the maximum number of bulges that can be sent * through. * * JBLK (local input) INTEGER * On entry, the number of shifts determined for S. * Unchanged on exit. * * H (local input/output) COMPLEX*16 array ( LDH,N ) * On entry, the local matrix to apply the shifts on. * H should be aligned so that the starting row is 2. * On exit, the data is destroyed. * * LDH (local input) INTEGER * On entry, the leading dimension of H. Unchanged on exit. * * N (local input) INTEGER * On entry, the size of H. If all the bulges are expected to * go through, N should be at least 4*NBULGE+2. * Otherwise, NBULGE may be reduced by this routine. * * ULP (local input) DOUBLE PRECISION * On entry, machine precision * Unchanged on exit. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION RONE, TEN PARAMETER ( RONE = 1.0D+0, TEN = 10.0D+0 ) COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ), $ ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, IBULGE, IVAL, J, K, M, NR DOUBLE PRECISION DVAL, S1, TST1 COMPLEX*16 CDUM, H44, H33, H43H34, H00, H10, H11, H12, H21, $ H22, H44S, H33S, SUM, T1, T2, T3, V1, V2, V3 * .. * .. Local Arrays .. COMPLEX*16 V( 3 ) * .. * .. External Subroutines .. EXTERNAL ZCOPY, ZLARFG * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCONJG, DIMAG, MAX * .. * .. Statement Functions .. DOUBLE PRECISION CABS1 * .. * .. Statement Function definitions .. CABS1( CDUM ) = ABS( DBLE( CDUM ) ) + ABS( DIMAG( CDUM ) ) * .. * .. Executable Statements .. * M = 2 DO 10 IBULGE = 1, NBULGE H44 = S(2*JBLK-2*IBULGE+2, 2*JBLK-2*IBULGE+2) H33 = S(2*JBLK-2*IBULGE+1,2*JBLK-2*IBULGE+1) H43H34 = S(2*JBLK-2*IBULGE+1,2*JBLK-2*IBULGE+2)* $ S(2*JBLK-2*IBULGE+2, 2*JBLK-2*IBULGE+1) H11 = H( M, M ) H22 = H( M+1, M+1 ) H21 = H( M+1, M ) H12 = H( M, M+1 ) H44S = H44 - H11 H33S = H33 - H11 V1 = ( H33S*H44S-H43H34 ) / H21 + H12 V2 = H22 - H11 - H33S - H44S V3 = H( M+2, M+1 ) S1 = CABS1( V1 ) + CABS1( V2 ) + CABS1( V3 ) V1 = V1 / S1 V2 = V2 / S1 V3 = V3 / S1 V( 1 ) = V1 V( 2 ) = V2 V( 3 ) = V3 H00 = H( M-1, M-1 ) H10 = H( M, M-1 ) TST1 = CABS1( V1 )*( CABS1( H00 )+CABS1( H11 )+CABS1( H22 ) ) IF( CABS1( H10 )*( CABS1( V2 )+CABS1( V3 ) ).GT.ULP*TST1 ) THEN * Find minimum DVAL = (CABS1(H10)*(CABS1(V2)+CABS1(V3))) / (ULP*TST1) IVAL = IBULGE DO 15 I = IBULGE+1, NBULGE H44 = S(2*JBLK-2*I+2, 2*JBLK-2*I+2) H33 = S(2*JBLK-2*I+1,2*JBLK-2*I+1) H43H34 = S(2*JBLK-2*I+1,2*JBLK-2*I+2)* $ S(2*JBLK-2*I+2, 2*JBLK-2*I+1) H11 = H( M, M ) H22 = H( M+1, M+1 ) H21 = H( M+1, M ) H12 = H( M, M+1 ) H44S = H44 - H11 H33S = H33 - H11 V1 = ( H33S*H44S-H43H34 ) / H21 + H12 V2 = H22 - H11 - H33S - H44S V3 = H( M+2, M+1 ) S1 = CABS1( V1 ) + CABS1( V2 ) + CABS1( V3 ) V1 = V1 / S1 V2 = V2 / S1 V3 = V3 / S1 V( 1 ) = V1 V( 2 ) = V2 V( 3 ) = V3 H00 = H( M-1, M-1 ) H10 = H( M, M-1 ) TST1 = CABS1( V1 )*( CABS1( H00 )+CABS1( H11 )+ $ CABS1( H22 ) ) IF ( (DVAL.GT.(CABS1(H10)*(CABS1(V2)+CABS1(V3)))/ $ (ULP*TST1)) .AND. ( DVAL .GT. RONE ) ) THEN DVAL = (CABS1(H10)*(CABS1(V2)+CABS1(V3)))/(ULP*TST1) IVAL = I END IF 15 CONTINUE IF ( (DVAL .LT. TEN) .AND. (IVAL .NE. IBULGE) ) THEN H44 = S(2*JBLK-2*IVAL+2, 2*JBLK-2*IVAL+2) H33 = S(2*JBLK-2*IVAL+1,2*JBLK-2*IVAL+1) H43H34 = S(2*JBLK-2*IVAL+1,2*JBLK-2*IVAL+2) H10 = S(2*JBLK-2*IVAL+2, 2*JBLK-2*IVAL+1) S(2*JBLK-2*IVAL+2,2*JBLK-2*IVAL+2) = $ S(2*JBLK-2*IBULGE+2,2*JBLK-2*IBULGE+2) S(2*JBLK-2*IVAL+1,2*JBLK-2*IVAL+1) = $ S(2*JBLK-2*IBULGE+1,2*JBLK-2*IBULGE+1) S(2*JBLK-2*IVAL+1,2*JBLK-2*IVAL+2) = $ S(2*JBLK-2*IBULGE+1,2*JBLK-2*IBULGE+2) S(2*JBLK-2*IVAL+2, 2*JBLK-2*IVAL+1) = $ S(2*JBLK-2*IBULGE+2, 2*JBLK-2*IBULGE+1) S(2*JBLK-2*IBULGE+2, 2*JBLK-2*IBULGE+2) = H44 S(2*JBLK-2*IBULGE+1,2*JBLK-2*IBULGE+1) = H33 S(2*JBLK-2*IBULGE+1,2*JBLK-2*IBULGE+2) = H43H34 S(2*JBLK-2*IBULGE+2, 2*JBLK-2*IBULGE+1) = H10 END IF H44 = S(2*JBLK-2*IBULGE+2, 2*JBLK-2*IBULGE+2) H33 = S(2*JBLK-2*IBULGE+1,2*JBLK-2*IBULGE+1) H43H34 = S(2*JBLK-2*IBULGE+1,2*JBLK-2*IBULGE+2)* $ S(2*JBLK-2*IBULGE+2, 2*JBLK-2*IBULGE+1) H11 = H( M, M ) H22 = H( M+1, M+1 ) H21 = H( M+1, M ) H12 = H( M, M+1 ) H44S = H44 - H11 H33S = H33 - H11 V1 = ( H33S*H44S-H43H34 ) / H21 + H12 V2 = H22 - H11 - H33S - H44S V3 = H( M+2, M+1 ) S1 = CABS1( V1 ) + CABS1( V2 ) + CABS1( V3 ) V1 = V1 / S1 V2 = V2 / S1 V3 = V3 / S1 V( 1 ) = V1 V( 2 ) = V2 V( 3 ) = V3 H00 = H( M-1, M-1 ) H10 = H( M, M-1 ) TST1 = CABS1( V1 )*( CABS1( H00 )+CABS1( H11 )+ $ CABS1( H22 ) ) END IF IF( CABS1( H10 )*( CABS1( V2 )+CABS1(V3) ).GT.TEN*ULP*TST1 ) $ THEN * IBULGE better not be 1 here or we have a bug! NBULGE = MAX(IBULGE -1,1) RETURN END IF DO 120 K = M, N - 1 NR = MIN( 3, N-K+1 ) IF( K.GT.M ) $ CALL ZCOPY( NR, H( K, K-1 ), 1, V, 1 ) CALL ZLARFG( NR, V( 1 ), V( 2 ), 1, T1 ) IF( K.GT.M ) THEN H( K, K-1 ) = V( 1 ) H( K+1, K-1 ) = ZERO IF( K.LT.N-1 ) $ H( K+2, K-1 ) = ZERO ELSE * H(m,m-1) must be updated, * H( K, K-1 ) = H( K, K-1 ) - DCONJG( T1 )*H( K, K-1 ) END IF V2 = V( 2 ) T2 = T1*V2 IF( NR.EQ.3 ) THEN V3 = V( 3 ) T3 = T1*V3 DO 60 J = K, N SUM = DCONJG(T1)*H( K, J ) + DCONJG(T2)*H( K+1, J ) $ + DCONJG( T3 )*H( K+2, J ) H( K, J ) = H( K, J ) - SUM H( K+1, J ) = H( K+1, J ) - SUM*V2 H( K+2, J ) = H( K+2, J ) - SUM*V3 60 CONTINUE DO 70 J = 1, MIN( K+3, N ) SUM = T1*H( J, K ) + T2*H( J, K+1 ) + T3*H( J, K+2 ) H( J, K ) = H( J, K ) - SUM H( J, K+1 ) = H( J, K+1 ) - SUM*DCONJG( V2 ) H( J, K+2 ) = H( J, K+2 ) - SUM*DCONJG( V3 ) 70 CONTINUE END IF 120 CONTINUE 10 CONTINUE * RETURN END SHAR_EOF fi # end of overwriting check if test -f 'zlanv2.f' then echo shar: will not over-write existing file "'zlanv2.f'" else cat << "SHAR_EOF" > 'zlanv2.f' SUBROUTINE ZLANV2( A, B, C, D, RT1, RT2, CS, SN ) * * Mark R. Fahey * May 28, 1999 * * .. Scalar Arguments .. DOUBLE PRECISION CS COMPLEX*16 A, B, C, D, RT1, RT2, SN * .. * * Purpose * ======= * * ZLANV2 computes the Schur factorization of a complex 2-by-2 nonhermitian * matrix in standard form: * * [ A B ] = [ CS -SN ] [ AA BB ] [ CS SN ] * [ C D ] [ SN CS ] [ 0 DD ] [-SN CS ] * * Arguments * ========= * * A (input/output) COMPLEX*16 * B (input/output) COMPLEX*16 * C (input/output) COMPLEX*16 * D (input/output) COMPLEX*16 * On entry, the elements of the input matrix. * On exit, they are overwritten by the elements of the * standardised Schur form. * * RT1 (output) COMPLEX*16 * RT2 (output) COMPLEX*16 * The two eigenvalues. * * CS (output) DOUBLE PRECISION * SN (output) COMPLEX*16 * Parameters of the rotation matrix. * * ===================================================================== * * .. Parameters .. DOUBLE PRECISION RZERO, HALF, RONE PARAMETER ( RZERO = 0.0D+0, HALF = 0.5D+0, $ RONE = 1.0D+0 ) COMPLEX*16 ZERO, ONE PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ), $ ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. COMPLEX*16 AA, BB, DD, TEMP, T, U, X, Y * .. * .. External Functions .. COMPLEX*16 ZLADIV EXTERNAL ZLADIV * .. * .. External Subroutines .. EXTERNAL ZLARTG * .. * .. Intrinsic Functions .. INTRINSIC DBLE, DCONJG, DIMAG, SQRT * .. * .. Executable Statements .. * * Initialize CS and SN * CS = RONE SN = ZERO * IF( C.EQ.ZERO ) THEN GO TO 10 * ELSE IF( B.EQ.ZERO ) THEN * * Swap rows and columns * CS = RZERO SN = ONE TEMP = D D = A A = TEMP B = -C C = ZERO GO TO 10 ELSE IF( (A-D).EQ.ZERO ) THEN TEMP = SQRT( B*C ) A = A + TEMP D = D - TEMP IF( (B+C).EQ.ZERO ) THEN CS = SQRT( HALF ) SN = ( RZERO, RONE )*CS ELSE TEMP = SQRT( B + C ) CS = ZLADIV( SQRT( B ), TEMP ) SN = ZLADIV( SQRT( C ), TEMP ) END IF B = B - C C = ZERO GO TO 10 ELSE * * Compute eigenvalue closest to D * T = D U = B*C X = HALF*( A-T ) Y = SQRT( X*X+U ) IF( DBLE( X )*DBLE( Y )+DIMAG( X )*DIMAG( Y ).LT.RZERO ) $ Y = -Y T = T - ZLADIV( U, ( X+Y ) ) * * Do one QR step with exact shift T - resulting 2 x 2 in * triangular form. * CALL ZLARTG( A-T, C, CS, SN, AA ) * D = D - T BB = CS*B + SN*D DD = -DCONJG( SN )*B + CS*D A = AA*CS + BB*DCONJG( SN ) + T B = -AA*SN + BB*CS C = ZERO D = T * END IF * 10 CONTINUE * * Store eigenvalues in RT1 and RT2. * RT1 = A RT2 = D RETURN * * End of ZLANV2 * END SHAR_EOF fi # end of overwriting check if test -f 'zlaref.f' then echo shar: will not over-write existing file "'zlaref.f'" else cat << "SHAR_EOF" > 'zlaref.f' SUBROUTINE ZLAREF( TYPE, A, LDA, WANTZ, Z, LDZ, BLOCK, IROW1, $ ICOL1, ISTART, ISTOP, ITMP1, ITMP2, LILOZ, $ LIHIZ, VECS, V2, V3, T1, T2, T3 ) * * Mark R Fahey * May 28, 1999 * * .. Scalar Arguments .. LOGICAL BLOCK, WANTZ CHARACTER TYPE INTEGER ICOL1, IROW1, ISTART, ISTOP, ITMP1, ITMP2, LDA, $ LDZ, LIHIZ, LILOZ COMPLEX*16 T1, T2, T3, V2, V3 * .. * .. Array Arguments .. COMPLEX*16 A( LDA, * ), VECS( * ), Z( LDZ, * ) * .. * * Purpose * ======= * * ZLAREF applies one or several Householder reflectors of size 3 * to one or two matrices (if column is specified) on either their * rows or columns. * * Arguments * ========= * * TYPE (global input) CHARACTER*1 * If 'R': Apply reflectors to the rows of the matrix * (apply from left) * Otherwise: Apply reflectors to the columns of the matrix * Unchanged on exit. * * A (global input/output) COMPLEX*16 array, (LDA,*) * On entry, the matrix to receive the reflections. * The updated matrix on exit. * * LDA (local input) INTEGER * On entry, the leading dimension of A. Unchanged on exit. * * WANTZ (global input) LOGICAL * If .TRUE., then apply any column reflections to Z as well. * If .FALSE., then do no additional work on Z. * * Z (global input/output) COMPLEX*16 array, (LDZ,*) * On entry, the second matrix to receive column reflections. * This is changed only if WANTZ is set. * * LDZ (local input) INTEGER * On entry, the leading dimension of Z. Unchanged on exit. * * BLOCK (global input) LOGICAL * If .TRUE., then apply several reflectors at once and read * their data from the VECS array. * If .FALSE., apply the single reflector given by V2, V3, * T1, T2, and T3. * * IROW1 (local input/output) INTEGER * On entry, the local row element of A. * Undefined on output. * * * ICOL1 (local input/output) INTEGER * On entry, the local column element of A. * Undefined on output. * * ISTART (global input) INTEGER * Specifies the "number" of the first reflector. This is * used as an index into VECS if BLOCK is set. * ISTART is ignored if BLOCK is .FALSE.. * * ISTOP (global input) INTEGER * Specifies the "number" of the last reflector. This is * used as an index into VECS if BLOCK is set. * ISTOP is ignored if BLOCK is .FALSE.. * * ITMP1 (local input) INTEGER * Starting range into A. For rows, this is the local * first column. For columns, this is the local first row. * * ITMP2 (local input) INTEGER * Ending range into A. For rows, this is the local last * column. For columns, this is the local last row. * * LILOZ * LIHIZ (local input) INTEGER * These serve the same purpose as ITMP1,ITMP2 but for Z * when WANTZ is set. * * VECS (global input) COMPLEX*16 array of size 3*N (matrix size) * This holds the size 3 reflectors one after another and this * is only accessed when BLOCK is .TRUE. * * V2 * V3 * T1 * T2 * T3 (global input/output) COMPLEX*16 * This holds information on a single size 3 Householder * reflector and is read when BLOCK is .FALSE., and * overwritten when BLOCK is .TRUE. * * ===================================================================== * * .. Local Scalars .. INTEGER J, K COMPLEX*16 A1, A11, A2, A22, A3, A4, A5, B1, B2, B3, B4, $ B5, H11, H22, SUM, SUM1, SUM2, SUM3, T12, T13, $ T22, T23, T32, T33, TMP1, TMP2, TMP3, V22, V23, $ V32, V33 * .. * .. External Functions .. LOGICAL LSAME EXTERNAL LSAME * .. * .. Intrinsic Functions .. INTRINSIC DCONJG, MOD * .. * .. Executable Statements .. * IF( LSAME( TYPE, 'R' ) ) THEN IF( BLOCK ) THEN DO 30 K = ISTART, ISTOP - MOD( ISTOP-ISTART+1, 3 ), 3 V2 = VECS( ( K-1 )*3+1 ) V3 = VECS( ( K-1 )*3+2 ) T1 = VECS( ( K-1 )*3+3 ) V22 = VECS( ( K-1 )*3+4 ) V32 = VECS( ( K-1 )*3+5 ) T12 = VECS( ( K-1 )*3+6 ) V23 = VECS( ( K-1 )*3+7 ) V33 = VECS( ( K-1 )*3+8 ) T13 = VECS( ( K-1 )*3+9 ) T2 = T1*V2 T3 = T1*V3 T22 = T12*V22 T32 = T12*V32 T23 = T13*V23 T33 = T13*V33 DO 10 J = ITMP1, ITMP2-MOD(ITMP2-ITMP1+1,2), 2 A1 = A ( IROW1 , J ) A2 = A ( IROW1+1, J ) A3 = A ( IROW1+2, J ) A4 = A ( IROW1+3, J ) A5 = A ( IROW1+4, J ) B1 = A ( IROW1 , J+1 ) B2 = A ( IROW1+1, J+1 ) B3 = A ( IROW1+2, J+1 ) B4 = A ( IROW1+3, J+1 ) B5 = A ( IROW1+4, J+1 ) SUM1 = DCONJG( T1 )*A1 + DCONJG( T2 )*A2 + $ DCONJG( T3 )*A3 A( IROW1 , J ) = A1 - SUM1 H11 = A2 - SUM1 * V2 H22 = A3 - SUM1 * V3 TMP1 = DCONJG( T1 )*B1 + DCONJG( T2 )*B2 + $ DCONJG( T3 )*B3 A( IROW1 , J+1 ) = B1 - TMP1 A11 = B2 - TMP1 * V2 A22 = B3 - TMP1 * V3 SUM2 = DCONJG( T12 )*H11 + DCONJG( T22 )*H22 + $ DCONJG( T32 )*A4 A( IROW1+1, J ) = H11 - SUM2 H11 = H22 - SUM2 * V22 H22 = A4 - SUM2 * V32 TMP2 = DCONJG( T12 )*A11 + DCONJG( T22 )*A22 + $ DCONJG( T32 )*B4 A( IROW1+1, J+1 ) = A11 - TMP2 A11 = A22 - TMP2 * V22 A22 = B4 - TMP2 * V32 SUM3 = DCONJG( T13 )*H11 + DCONJG( T23 )*H22 + $ DCONJG( T33 )*A5 A( IROW1+2, J ) = H11 - SUM3 A( IROW1+3, J ) = H22 - SUM3 * V23 A( IROW1+4, J ) = A5 - SUM3 * V33 TMP3 = DCONJG( T13 )*A11 + DCONJG( T23 )*A22 + $ DCONJG( T33 )*B5 A( IROW1+2, J+1 ) = A11 - TMP3 A( IROW1+3, J+1 ) = A22 - TMP3 * V23 A( IROW1+4, J+1 ) = B5 - TMP3 * V33 10 CONTINUE DO 20 J = ITMP2-MOD(ITMP2-ITMP1+1,2)+1, ITMP2 SUM = DCONJG( T1 )*A( IROW1, J ) + DCONJG( T2 )* $ A( IROW1+1, J ) + DCONJG( T3 )*A( IROW1+2, J ) A( IROW1, J ) = A( IROW1, J ) - SUM H11 = A( IROW1+1, J ) - SUM*V2 H22 = A( IROW1+2, J ) - SUM*V3 SUM = DCONJG( T12 )*H11 + DCONJG( T22 )*H22 + $ DCONJG( T32 )*A( IROW1+3, J ) A( IROW1+1, J ) = H11 - SUM H11 = H22 - SUM*V22 H22 = A( IROW1+3, J ) - SUM*V32 SUM = DCONJG( T13 )*H11 + DCONJG( T23 )*H22 + $ DCONJG( T33 )*A( IROW1+4, J ) A( IROW1+2, J ) = H11 - SUM A( IROW1+3, J ) = H22 - SUM*V23 A( IROW1+4, J ) = A( IROW1+4, J ) - SUM*V33 20 CONTINUE IROW1 = IROW1 + 3 30 CONTINUE DO 50 K = ISTOP - MOD( ISTOP-ISTART+1, 3 ) + 1, ISTOP V2 = VECS( ( K-1 )*3+1 ) V3 = VECS( ( K-1 )*3+2 ) T1 = VECS( ( K-1 )*3+3 ) T2 = T1*V2 T3 = T1*V3 DO 40 J = ITMP1, ITMP2 SUM = DCONJG( T1 )*A( IROW1, J ) + DCONJG( T2 )* $ A( IROW1+1, J ) + DCONJG( T3 )*A( IROW1+2, J ) A( IROW1, J ) = A( IROW1, J ) - SUM A( IROW1+1, J ) = A( IROW1+1, J ) - SUM*V2 A( IROW1+2, J ) = A( IROW1+2, J ) - SUM*V3 40 CONTINUE IROW1 = IROW1 + 1 50 CONTINUE ELSE DO 60 J = ITMP1, ITMP2 SUM = DCONJG( T1 )*A( IROW1, J ) + DCONJG( T2 )* $ A( IROW1+1, J ) + DCONJG( T3 )*A( IROW1+2, J ) A( IROW1, J ) = A( IROW1, J ) - SUM A( IROW1+1, J ) = A( IROW1+1, J ) - SUM*V2 A( IROW1+2, J ) = A( IROW1+2, J ) - SUM*V3 60 CONTINUE END IF ELSE * * Do column transforms * IF( BLOCK ) THEN DO 90 K = ISTART, ISTOP - MOD( ISTOP-ISTART+1, 3 ), 3 V2 = VECS( ( K-1 )*3+1 ) V3 = VECS( ( K-1 )*3+2 ) T1 = VECS( ( K-1 )*3+3 ) V22 = VECS( ( K-1 )*3+4 ) V32 = VECS( ( K-1 )*3+5 ) T12 = VECS( ( K-1 )*3+6 ) V23 = VECS( ( K-1 )*3+7 ) V33 = VECS( ( K-1 )*3+8 ) T13 = VECS( ( K-1 )*3+9 ) T2 = T1*V2 T3 = T1*V3 T22 = T12*V22 T32 = T12*V32 T23 = T13*V23 T33 = T13*V33 DO 70 J = ITMP1, ITMP2 SUM = T1*A( J, ICOL1 ) + T2*A( J, ICOL1+1 ) + $ T3*A( J, ICOL1+2 ) A( J, ICOL1 ) = A( J, ICOL1 ) - SUM H11 = A( J, ICOL1+1 ) - SUM*DCONJG( V2 ) H22 = A( J, ICOL1+2 ) - SUM*DCONJG( V3 ) SUM = T12*H11 + T22*H22 + T32*A( J, ICOL1+3 ) A( J, ICOL1+1 ) = H11 - SUM H11 = H22 - SUM*DCONJG( V22 ) H22 = A( J, ICOL1+3 ) - SUM*DCONJG( V32 ) SUM = T13*H11 + T23*H22 + T33*A( J, ICOL1+4 ) A( J, ICOL1+2 ) = H11 - SUM A( J, ICOL1+3 ) = H22 - SUM*DCONJG( V23 ) A( J, ICOL1+4 ) = A( J, ICOL1+4 ) - SUM*DCONJG( V33 ) 70 CONTINUE IF( WANTZ ) THEN DO 80 J = LILOZ, LIHIZ SUM = T1*Z( J, ICOL1 ) + T2*Z( J, ICOL1+1 ) + $ T3*Z( J, ICOL1+2 ) Z( J, ICOL1 ) = Z( J, ICOL1 ) - SUM H11 = Z( J, ICOL1+1 ) - SUM*DCONJG( V2 ) H22 = Z( J, ICOL1+2 ) - SUM*DCONJG( V3 ) SUM = T12*H11 + T22*H22 + T32*Z( J, ICOL1+3 ) Z( J, ICOL1+1 ) = H11 - SUM H11 = H22 - SUM*DCONJG( V22 ) H22 = Z( J, ICOL1+3 ) - SUM*DCONJG( V32 ) SUM = T13*H11 + T23*H22 + T33*Z( J, ICOL1+4 ) Z( J, ICOL1+2 ) = H11 - SUM Z( J, ICOL1+3 ) = H22 - SUM*DCONJG( V23 ) Z( J, ICOL1+4 ) = Z( J, ICOL1+4 ) - SUM*DCONJG(V33) 80 CONTINUE END IF ICOL1 = ICOL1 + 3 90 CONTINUE DO 120 K = ISTOP - MOD( ISTOP-ISTART+1, 3 ) + 1, ISTOP V2 = VECS( ( K-1 )*3+1 ) V3 = VECS( ( K-1 )*3+2 ) T1 = VECS( ( K-1 )*3+3 ) T2 = T1*V2 T3 = T1*V3 DO 100 J = ITMP1, ITMP2 SUM = T1*A( J, ICOL1 ) + T2*A( J, ICOL1+1 ) + $ T3*A( J, ICOL1+2 ) A( J, ICOL1 ) = A( J, ICOL1 ) - SUM A( J, ICOL1+1 ) = A( J, ICOL1+1 ) - SUM*DCONJG( V2 ) A( J, ICOL1+2 ) = A( J, ICOL1+2 ) - SUM*DCONJG( V3 ) 100 CONTINUE IF( WANTZ ) THEN DO 110 J = LILOZ, LIHIZ SUM = T1*Z( J, ICOL1 ) + T2*Z( J, ICOL1+1 ) + $ T3*Z( J, ICOL1+2 ) Z( J, ICOL1 ) = Z( J, ICOL1 ) - SUM Z( J, ICOL1+1 ) = Z( J, ICOL1+1 ) - SUM*DCONJG(V2) Z( J, ICOL1+2 ) = Z( J, ICOL1+2 ) - SUM*DCONJG(V3) 110 CONTINUE END IF ICOL1 = ICOL1 + 1 120 CONTINUE ELSE DO 130 J = ITMP1, ITMP2 SUM = T1*A( J, ICOL1 ) + T2*A( J, ICOL1+1 ) + $ T3*A( J, ICOL1+2 ) A( J, ICOL1 ) = A( J, ICOL1 ) - SUM A( J, ICOL1+1 ) = A( J, ICOL1+1 ) - SUM*DCONJG( V2 ) A( J, ICOL1+2 ) = A( J, ICOL1+2 ) - SUM*DCONJG( V3 ) 130 CONTINUE END IF END IF RETURN * * End of ZLAREF * END SHAR_EOF fi # end of overwriting check cd .. if test ! -d 'Testing' then mkdir 'Testing' fi cd 'Testing' if test -f 'EVC.dat' then echo shar: will not over-write existing file "'EVC.dat'" else cat << "SHAR_EOF" > 'EVC.dat' 'SCALAPACK NEP (Nonsymmetric Eigenvalue Problem) input file' 'MPI Machine' 'EVC.out' output file name (if any) 6 device out 4 number of problems sizes 200 400 600 800 Problem sizes, if >1500, may need to use new PZLATRS 1 number of NB's 64 values of NB 1 number of process grids (ordered pairs of P & Q) 3 2 2 1 4 2 1 values of P 3 2 1 4 1 4 8 values of Q 20.0 threshold SHAR_EOF fi # end of overwriting check if test -f 'Makefile' then echo shar: will not over-write existing file "'Makefile'" else cat << "SHAR_EOF" > 'Makefile' ############################################################################ # # Program: ScaLAPACK # # Module: Makefile # # Purpose: Eigenroutine Testing Makefile # # Creation date: March 20, 1995 # # Modified: # # Send bug reports, comments or suggestions to scalapack@cs.utk.edu # ############################################################################ include ../../SLmake.inc F77 = f90 F77LOADER = f90 znepexe = xznep zevcexe = xzevc zmatgen = pzmatgen.o pmatgeninc.o znep = pznepdriver.o pznepinfo.o pznepfchk.o $(zmatgen) zevc = pzevcdriver.o pzevcinfo.o pzget22.o $(zmatgen) all : complex16 complex: $(cnepexe) $(cevcexe) complex16: $(znepexe) $(zevcexe) $(cnepexe) : $(cnep) $(F77LOADER) $(F77LOADFLAGS) -o $(cnepexe) $(cnep) $(PARLIBS) $(cevcexe) : $(cevc) $(F77LOADER) $(F77LOADFLAGS) -o $(cevcexe) $(cevc) $(PARLIBS) $(znepexe) : $(znep) $(F77LOADER) $(F77LOADFLAGS) -o $(znepexe) $(znep) $(PARLIBS) $(zevcexe) : $(zevc) $(F77LOADER) -g $(F77LOADFLAGS) -o $(zevcexe) $(zevc) $(PARLIBS) $(cnep): $(FRC) $(cevc): $(FRC) $(znep): $(FRC) $(zevc): $(FRC) FRC: @FRC=$(FRC) clean : rm -f *.o .c.o: $(CC) $(CDEFS) $(OPTS) -c $*.c .f.o: $(F77) -c $(OPTS) $*.f SHAR_EOF fi # end of overwriting check if test -f 'NEP.dat' then echo shar: will not over-write existing file "'NEP.dat'" else cat << "SHAR_EOF" > 'NEP.dat' 'SCALAPACK NEP (Nonsymmetric Eigenvalue Problem) input file' 'MPI Machine' 'NEP.out' output file name (if any) 6 device out 4 number of problems sizes 200 400 600 1000 Problem sizes 1 number of NB's 100 values of NB 1 number of process grids (ordered pairs of P & Q) 1 2 1 1 4 2 1 values of P 1 2 1 4 1 4 8 values of Q 20.0 threshold SHAR_EOF fi # end of overwriting check if test -f 'README' then echo shar: will not over-write existing file "'README'" else cat << "SHAR_EOF" > 'README' Files in this directory ======================= EVC.dat NEP.dat pzevcdriver.f pzget22.f pznepfchk.f Makefile README pzevcinfo.f pznepdriver.f pznepinfo.f Notes ===== When using pzevcdriver to test large matrices (>1500), one should use the pzlatrs routine provided in the SRC directory. This will control scaling problems when pztrevc is calculating eigenvectors. It is, however, slow in comparison to PZTRSV which it is replacing. Note also that pzlatrs uses pzdotu, pzdotc, and pzaxpy. I found a bug in each of these routines that were included in ScaLAPACK 1.0. Thus, up-to-date versions are included in the SRC directory. See the README file in the SRC directory for more information. SHAR_EOF fi # end of overwriting check if test -f 'pzevcdriver.f' then echo shar: will not over-write existing file "'pzevcdriver.f'" else cat << "SHAR_EOF" > 'pzevcdriver.f' PROGRAM PZEVCDRIVER * * Mark R. Fahey * June 2000 * * Purpose * ======= * * PZEVCDRIVER is the main test program for the COMPLEX*16 * SCALAPACK PZTREVC routine. This test driver performs a right and * left eigenvector calculation of a triangular matrix followed by * a residual checks of the calcuated eigenvectors. * * The program must be driven by a short data file and uses the same * input file as the PZNEPDRIVER. An annotated example of a data file * can be obtained by deleting the first 3 characters from the following * 18 lines: * 'SCALAPACK, Version 1.4, NEP (Nonsymmetric EigenProblem) input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'NEP.out' output file name (if any) * 6 device out * 8 number of problems sizes * 1 2 3 4 6 10 100 200 vales of N * 3 number of NB's * 6 20 40 values of NB * 4 number of process grids (ordered pairs of P & Q) * 1 2 1 4 values of P * 1 2 4 1 values of Q * 20.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 4000000 * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * DBLESZ INTEGER, default = 8 bytes. * DBLESZ indicate the length in bytes on the given platform * for a double precision real. * MEM COMPLEX*16 array, dimension ( TOTMEM / DBLESZ ) * * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DT_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DT_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER DBLESZ, TOTMEM, MEMSIZ, NTESTS COMPLEX*16 PADVAL, ZERO, ONE PARAMETER ( DBLESZ = 16, TOTMEM = 220000000, $ MEMSIZ = TOTMEM / DBLESZ, NTESTS = 20 ) PARAMETER ( PADVAL = ( -9923.0D+0, -9923.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ), $ ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER I, IAM, IASEED, ICTXT, III, IMIDPAD, INFO, IPA, $ IPOSTPAD, IPREPAD, IPW, IPWR, IPZ, IPVL, J, K, $ KFAIL, KPASS, KSKIP, KTESTS, LDA, LDZ, LWORK, $ M,MYCOL, MYROW, N, NB, NGRIDS, NMAT, NNB, NOUT, $ NP, NPCOL, NPROCS, NPROW, NQ, WORKSIZ REAL THRESH DOUBLE PRECISION ANORM, FRESID, NOPS, QRESID, TMFLOPS, ZNORM * .. * .. Local Arrays .. LOGICAL SELECT ( 1 ) INTEGER DESCA( DLEN_ ), DESCZ( DLEN_ ), IERR( 2 ), $ IDUM( 1 ), NBVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 2 ), WTIME( 2 ), RWORK( 5000 ), $ RESULT( 2 ) COMPLEX*16 MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PZCHEKPAD, $ PZFILLPAD, PZGEMM, PZLAHQR, PZLASET, PZMATGEN, $ PZGET22, PZEVCINFO, SLBOOT, SLCOMBINE, $ SLTIMER * .. * .. External Functions .. INTEGER ILCM, NUMROC DOUBLE PRECISION PDLAMCH, PZLANGE, PZLANHS EXTERNAL ILCM, NUMROC, PDLAMCH, PZLANGE, PZLANHS * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN * .. * .. Data statements .. DATA KFAIL, KPASS, KSKIP, KTESTS / 4*0 / * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 CALL PZEVCINFO( OUTFILE, NOUT, NMAT, NVAL, NTESTS, NNB, NBVAL, $ NTESTS, NGRIDS, PVAL, NTESTS, QVAL, NTESTS, $ THRESH, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 30 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 )NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'grid' KSKIP = KSKIP + 1 GO TO 30 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 30 * DO 20 J = 1, NMAT * N = NVAL( J ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'matrix' KSKIP = KSKIP + 1 GO TO 20 END IF * DO 10 K = 1, NNB * NB = NBVAL( K ) * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.6 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'NB' KSKIP = KSKIP + 1 GO TO 10 END IF * * Padding constants * NP = NUMROC( N, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) IPREPAD = IPREPAD + 1000 IMIDPAD = IMIDPAD + 1000 IPOSTPAD = IPOSTPAD + 1000 ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, NP )+IMIDPAD, IERR( 1 ) ) * * Initialize the array descriptor for the matrix Z * CALL DESCINIT( DESCZ, N, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, NP )+IMIDPAD, IERR( 2 ) ) * LDA = DESCA( LLD_ ) LDZ = DESCZ( LLD_ ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 2, 1, IERR, 2, -1, 0 ) * IF( IERR( 1 ).LT.0 .OR. IERR( 2 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'descriptor' KSKIP = KSKIP + 1 GO TO 10 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * IPA = IPREPAD + 1 IPZ = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD IPWR = IPZ + DESCZ( LLD_ )*NQ + IPOSTPAD + IPREPAD IPVL = IPWR + N + IPOSTPAD + IPREPAD IPVR = IPVL + DESCZ( LLD_ )*NQ + IPOSTPAD + IPREPAD IPW = IPVR + DESCZ( LLD_ )*NQ + IPOSTPAD + IPREPAD III = N / NB IF( III*NB.LT.N ) $ III = III + 1 III = 7*III / ILCM( NPROW, NPCOL ) * * LWORK = 3*N + MAX( 2*MAX( LDA, LDZ )+2*NQ, III ) LWORK = LWORK + MAX(2*N, (8*ILCM(NPROW,NPCOL)+2)**2 ) * IF( CHECK ) THEN * * Figure the amount of workspace required by the * checking routines PZEVCFCHK and PZLANHS * WORKSIZ = LWORK + MAX( NP*DESCA( NB_ ), $ DESCA( MB_ )*NQ ) + IPOSTPAD * ELSE * WORKSIZ = LWORK + IPOSTPAD * END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'Schur reduction', $ ( IPW+WORKSIZ )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * * Generate matrix Z = In * CALL PZLASET( 'All', N, N, ZERO, ONE, MEM( IPZ ), 1, 1, $ DESCZ ) CALL PZLASET( 'All', N, N, ZERO, ZERO, MEM( IPVR ),1,1, $ DESCZ ) CALL PZLASET( 'All', N, N, ZERO, ZERO, MEM( IPVL ),1,1, $ DESCZ ) * * Generate matrix A upper Hessenberg * CALL PZMATGEN( ICTXT, 'No transpose', 'No transpose', $ DESCA( M_ ), DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), MEM( IPA ), DESCA( LLD_ ), $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, 0, $ NP, 0, NQ, MYROW, MYCOL, NPROW, NPCOL ) CALL PZLASET( 'Lower', MAX( 0, N-1 ), MAX( 0, N-1 ), $ ZERO, ZERO, MEM( IPA ), MIN( N, 2 ), 1, $ DESCA ) * * Calculate inf-norm of A for residual error-checking * IF( CHECK ) THEN CALL PZFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZFILLPAD( ICTXT, NP, NQ, MEM( IPVR-IPREPAD ), $ DESCZ( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZFILLPAD( ICTXT, NP, NQ, MEM( IPVL-IPREPAD ), $ DESCZ( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZFILLPAD( ICTXT, NP, NQ, MEM( IPZ-IPREPAD ), $ DESCZ( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PZLANHS( 'I', N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) CALL PZCHEKPAD( ICTXT, 'PZLANHS', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZLANHS', WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * CALL PZFILLPAD( ICTXT, N, 1, MEM( IPWR-IPREPAD ), N, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZFILLPAD( ICTXT, LWORK, 1, MEM( IPW-IPREPAD ), $ LWORK, IPREPAD, IPOSTPAD, PADVAL ) * END IF * * Set eigenvalues from diagonal * DO JJJ = 1, N CALL INFOG2L( JJJ, JJJ, DESCZ, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, II, JJ ) IF( MYROW.EQ.II .AND. MYCOL.EQ.JJ ) THEN MEM( IPWR-1+JJJ ) = MEM( IPA-1+(ICOL-1)*LDA+IROW ) ELSE MEM( IPWR-1+JJJ ) = ZERO ENDIF END DO CALL ZGSUM2D( CONTXT, 'All', ' ', N, 1, MEM( IPWR ), 1, $ -1, -1 ) * CALL SLBOOT( ) CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 1 ) * * Perform eigenvector calculation * CALL PZTREVC( 'B', 'A', SELECT, N, MEM( IPA ), DESCA, $ MEM( IPVL ), DESCZ, MEM( IPVR ), DESCZ, N, $ M, MEM( IPW ), RWORK, INFO ) * CALL SLTIMER( 1 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * )'PZTREVC INFO=', INFO KFAIL = KFAIL + 1 GO TO 10 END IF * IF( CHECK ) THEN * * Check for memory overwrite in NEP factorization * CALL PZCHEKPAD( ICTXT, 'PZTREVC (A)', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZTREVC (VR)', NP, NQ, $ MEM( IPVR-IPREPAD ), DESCZ( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZTREVC (VL)', NP, NQ, $ MEM( IPVL-IPREPAD ), DESCZ( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZTREVC (WR)', N, 1, $ MEM( IPWR-IPREPAD ), N, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZTREVC (WORK)', LWORK, 1, $ MEM( IPW-IPREPAD ), LWORK, IPREPAD, $ IPOSTPAD, PADVAL ) * CALL PZFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * Compute || T * Z - Z * D || / ( N*|| T ||*EPS ) * FRESID = 0.0D+0 QRESID = 0.0D+0 CALL PZGET22( 'N', 'N', 'N', N, MEM( IPA ), DESCA, $ MEM( IPVR ), DESCZ, MEM( IPWR ), $ MEM( IPZ ), DESCZ, RWORK, RESULT ) FRESID = RESULT( 1 ) QRESID = RESULT( 2 ) * * Compute || T^H * L - L * D^H || / ( N*|| T ||*EPS ) * CALL PZGET22( 'C', 'N', 'C', N, MEM( IPA ), DESCA, $ MEM( IPVL ), DESCZ, MEM( IPWR ), $ MEM( IPZ ), DESCZ, RWORK, RESULT ) FRESID = MAX( FRESID, RESULT( 1 ) ) QRESID = MAX( QRESID, RESULT( 2 ) ) * CALL PZCHEKPAD( ICTXT, 'PZGET22 (A)', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGET22 (VR)', NP, NQ, $ MEM( IPVR-IPREPAD ), DESCZ( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGET22 (VL)', NP, NQ, $ MEM( IPVL-IPREPAD ), DESCZ( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZGET22 (Z)', NP, NQ, $ MEM( IPZ-IPREPAD ), DESCZ( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) * * Test residual and detect NaN result * IF( ( FRESID.LE.THRESH ) .AND. $ ( ( FRESID-FRESID ).EQ.0.0D+0 ) .AND. $ ( QRESID.LE.THRESH ) .AND. $ ( ( QRESID-QRESID ).EQ.0.0D+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9986 )FRESID WRITE( NOUT, FMT = 9985 )QRESID END IF END IF * ELSE * * Don't perform the checking, only timing * KPASS = KPASS + 1 FRESID = FRESID - FRESID QRESID = QRESID - QRESID PASSED = 'BYPASS' * END IF * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 1, 1, WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 1, 1, CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * * 2 N^2 flops for PxTREVC * NOPS = 2.0D+0*DBLE( N )**2 * * Calculate total megaflops -- eigenvector calc only, * -- for WALL and CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( WTIME( 1 )*1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF IF( WTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 )'WALL', N, NB, NPROW, $ NPCOL, WTIME( 1 ), TMFLOPS, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( CTIME( 1 )*1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 )'CPU ', N, NB, NPROW, $ NPCOL, CTIME( 1 ), TMFLOPS, PASSED END IF * 10 CONTINUE * 20 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 30 CONTINUE * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 )KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 )KPASS WRITE( NOUT, FMT = 9989 )KFAIL ELSE WRITE( NOUT, FMT = 9990 )KPASS END IF WRITE( NOUT, FMT = 9988 )KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME N NB P Q NEP Time MFLOPS CHECK' ) 9994 FORMAT( '---- ----- --- ---- ---- -------- -------- ------' ) 9993 FORMAT( A4, 1X, I5, 1X, I3, 1X, I4, 1X, I4, 1X, F8.2, 1X, F8.2, $ 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||H*Z - Z*D|| / (||T|| * N * eps) = ', G25.7 ) 9985 FORMAT( 'max_j(max|Z(j)| - 1) / ( N * eps ) ', G25.7 ) * STOP * * End of PZEVCDRIVER * END SHAR_EOF fi # end of overwriting check if test -f 'pzevcinfo.f' then echo shar: will not over-write existing file "'pzevcinfo.f'" else cat << "SHAR_EOF" > 'pzevcinfo.f' SUBROUTINE PZEVCINFO( SUMMRY, NOUT, NMAT, NVAL, LDNVAL, NNB, $ NBVAL, LDNBVAL, NGRIDS, PVAL, LDPVAL, QVAL, $ LDQVAL, THRESH, WORK, IAM, NPROCS ) * * Mark R. Fahey * March 2000 * * .. Scalar Arguments .. CHARACTER*( * ) SUMMRY INTEGER IAM, LDNBVAL, LDNVAL, LDPVAL, LDQVAL, NGRIDS, $ NMAT, NNB, NOUT, NPROCS REAL THRESH * .. * .. Array Arguments .. INTEGER NBVAL( LDNBVAL ), NVAL( LDNVAL ), $ PVAL( LDPVAL ), QVAL( LDQVAL ), WORK( * ) * .. * * Purpose * ======= * * PZEVCINFO gets needed startup information for PZTREVC driver * and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (the order of the matrix) to run the code * with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * The values of P (number of process rows) to run the code * with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * < 0 : Perform no error checking * > 0 : report all residuals greater than THRESH * * WORK (local workspace) INTEGER array of dimension >= * MAX( 3, LDNVAL+LDNBVAL+LDPVAL+LDQVAL ), used to pack all * input arrays in order to send info in one message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * * Implemented by: M. Fahey, June 2000 * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a real single * precision. * * ====================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER I, ICTXT DOUBLE PRECISION EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. DOUBLE PRECISION PDLAMCH EXTERNAL PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE = 'EVC.dat', STATUS = 'OLD' ) READ( NIN, FMT = * )SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 )USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get number of matrices and their dimensions * READ( NIN, FMT = * )NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 )'N', LDNVAL GO TO 30 END IF READ( NIN, FMT = * )( NVAL( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * )NNB IF( NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 )'NB', LDNBVAL GO TO 30 END IF READ( NIN, FMT = * )( NBVAL( I ), I = 1, NNB ) * DO 10 I = 1, NNB IF( NBVAL( I ).LT.6 ) THEN WRITE( NOUT, FMT = 9992 )NBVAL( I ) GO TO 30 END IF 10 CONTINUE * * Get number of grids * READ( NIN, FMT = * )NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 )'Grids', LDPVAL GO TO 30 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 )'Grids', LDQVAL GO TO 30 END IF * * Get values of P and Q * READ( NIN, FMT = * )( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * )( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * )THRESH * * Close input file * CLOSE ( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 20 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 20 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) * WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', 3, 1, WORK, 3 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'ScaLAPACK nonsymmetric eigenvector calculation.' WRITE( NOUT, FMT = 9999 )USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 )'Tests of the parallel ' // $ 'complex double precision eigenvector calculation.' WRITE( NOUT, FMT = 9999 )'The following scaled residual ' // $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Residual = max( ||T*R-R*D||/(||H||*eps*N)' // $ ' , ||T^H*L-L*D^H||/(||H||*eps*N) )' WRITE( NOUT, FMT = 9999 ) $ ' Normalization residual = max(max_j(max|R(j)|-1),' // $ ' max_j(max|L(j)|-1))/(eps*N)' WRITE( NOUT, FMT = 9999 )'The matrix A is randomly ' // $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 )'An explanation of the input/output ' $ // 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or ' // $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'N : The number of columns in the ' // 'matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks the' // $ ' matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than' // $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'NEP time : Time in seconds to decompose the ' // ' matrix' WRITE( NOUT, FMT = 9999 )'MFLOPS : Rate of execution ' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9996 )'N ', $ ( NVAL( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 )( NVAL( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 )'NB ', $ ( NBVAL( I ), I = 1, MIN( NNB, 10 ) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 )( NBVAL( I ), I = 11, NNB ) WRITE( NOUT, FMT = 9996 )'P ', $ ( PVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 )( PVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 )'Q ', $ ( QVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 )( QVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 )EPS WRITE( NOUT, FMT = 9998 )THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 3, 1, WORK, 3, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NGRIDS = WORK( 3 ) * I = NMAT + NNB + 2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 30 CONTINUE WRITE( NOUT, FMT = 9993 ) CLOSE ( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ' : ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ', 5A, $ ' is less than 1 or greater ', 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ', 40A, '. Aborting run.' ) 9992 FORMAT( ' Blocking size too small at ', I2, ' must be >=6.' ) * * End of PZEVCINFO * END SHAR_EOF fi # end of overwriting check if test -f 'pzget22.f' then echo shar: will not over-write existing file "'pzget22.f'" else cat << "SHAR_EOF" > 'pzget22.f' SUBROUTINE PZGET22( TRANSA, TRANSE, TRANSW, N, A, DESCA, E, $ DESCE, W, WORK, DESCW, RWORK, RESULT ) * * Mark R. Fahey * June 23, 2000 * * .. Scalar Arguments .. CHARACTER TRANSA, TRANSE, TRANSW INTEGER N * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCE( * ), DESCW( * ) DOUBLE PRECISION RESULT( 2 ), RWORK( * ) COMPLEX*16 A( * ), E( * ), W( * ), WORK( * ) * .. * * Purpose * ======= * * PZGET22 does an eigenvector check. * * The basic test is: * * RESULT(1) = | A E - E W | / ( |A| |E| ulp ) * * using the 1-norm. It also tests the normalization of E: * * RESULT(2) = max | m-norm(E(j)) - 1 | / ( n ulp ) * j * * where E(j) is the j-th eigenvector, and m-norm is the max-norm of a * vector. The max-norm of a complex n-vector x in this case is the * maximum of |re(x(i)| + |im(x(i)| over i = 1, ..., n. * * Arguments * ========== * * TRANSA (input) CHARACTER*1 * Specifies whether or not A is transposed. * = 'N': No transpose * = 'T': Transpose * = 'C': Conjugate transpose * * TRANSE (input) CHARACTER*1 * Specifies whether or not E is transposed. * = 'N': No transpose, eigenvectors are in columns of E * = 'T': Transpose, eigenvectors are in rows of E * = 'C': Conjugate transpose, eigenvectors are in rows of E * * TRANSW (input) CHARACTER*1 * Specifies whether or not W is transposed. * = 'N': No transpose * = 'T': Transpose, same as TRANSW = 'N' * = 'C': Conjugate transpose, use -WI(j) instead of WI(j) * * N (input) INTEGER * The order of the matrix A. N >= 0. * * A (input) COMPLEX*16 array, dimension (*) * The matrix whose eigenvectors are in E. * * DESCA (input) INTEGER array, dimension(*) * * E (input) COMPLEX*16 array, dimension (*) * The matrix of eigenvectors. If TRANSE = 'N', the eigenvectors * are stored in the columns of E, if TRANSE = 'T' or 'C', the * eigenvectors are stored in the rows of E. * * DESCE (input) INTEGER array, dimension(*) * * W (input) COMPLEX*16 array, dimension (N) * The eigenvalues of A. * * WORK (workspace) COMPLEX*16 array, dimension (*) * DESCW (input) INTEGER array, dimension(*) * * RWORK (workspace) DOUBLE PRECISION array, dimension (N) * * RESULT (output) DOUBLE PRECISION array, dimension (2) * RESULT(1) = | A E - E W | / ( |A| |E| ulp ) * RESULT(2) = max | m-norm(E(j)) - 1 | / ( n ulp ) * j * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, DLEN_, DTYPE_, CTXT_, M_, N_, $ MB_, NB_, RSRC_, CSRC_, LLD_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) COMPLEX*16 CZERO, CONE PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ), $ CONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. CHARACTER NORMA, NORME INTEGER ITRNSE, ITRNSW, J, JCOL, JROW, JVEC DOUBLE PRECISION ANORM, ENORM, ENRMAX, ENRMIN, ERRNRM, TEMP1, $ TEMP2, ULP, UNFL COMPLEX*16 CDUM, WTEMP * .. * .. External Functions .. LOGICAL LSAME DOUBLE PRECISION PDLAMCH, PZLANGE EXTERNAL LSAME, PDLAMCH, PZLANGE * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCINIT, DGAMX2D, DGAMN2D, $ PZAXPY, PZGEMM, PZLASET * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DCONJG, DIMAG, MAX, MIN * .. * .. Statement Functions .. DOUBLE PRECISION CABS1 * .. * .. Statement Function definitions .. CABS1( CDUM ) = ABS( DBLE( CDUM ) ) + ABS( DIMAG( CDUM ) ) * .. * .. Executable Statements .. * * Initialize RESULT (in case N=0) * RESULT( 1 ) = ZERO RESULT( 2 ) = ZERO IF( N.LE.0 ) $ RETURN * CONTXT = DESCA( CTXT_ ) RSRC = DESCA( RSRC_ ) CSRC = DESCA( CSRC_ ) NB = DESCA( NB_ ) MB = DESCA( MB_ ) LDA = DESCA( LLD_ ) LDE = DESCE( LLD_ ) LDW = DESCW( LLD_ ) CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) * UNFL = PDLAMCH( CONTXT, 'Safe minimum' ) ULP = PDLAMCH( CONTXT, 'Precision' ) * ITRNSE = 0 ITRNSW = 0 NORMA = 'O' NORME = 'O' * IF( LSAME( TRANSA, 'T' ) .OR. LSAME( TRANSA, 'C' ) ) THEN NORMA = 'I' END IF * IF( LSAME( TRANSE, 'T' ) ) THEN ITRNSE = 1 NORME = 'I' ELSE IF( LSAME( TRANSE, 'C' ) ) THEN ITRNSE = 2 NORME = 'I' END IF * IF( LSAME( TRANSW, 'C' ) ) THEN ITRNSW = 1 END IF * * Normalization of E: * ENRMIN = ONE / ULP ENRMAX = ZERO IF( ITRNSE.EQ.0 ) THEN DO 20 JVEC = 1, N TEMP1 = ZERO DO 10 J = 1, N CALL INFOG2L( J, JVEC, DESCE, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, II, JJ ) IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN TEMP1 = MAX( TEMP1, CABS1( E( (ICOL-1)*LDE+IROW ))) ENDIF 10 CONTINUE IF( MYCOL.EQ.JJ ) THEN CALL DGAMX2D( CONTXT, 'Col', ' ', 1, 1, TEMP1, 1, RA, CA, $ -1, -1, -1 ) ENRMIN = MIN( ENRMIN, TEMP1 ) ENRMAX = MAX( ENRMAX, TEMP1 ) ENDIF 20 CONTINUE CALL DGAMX2D( CONTXT, 'Row', ' ', 1, 1, ENRMAX, 1, RA, CA, $ -1, -1, -1 ) CALL DGAMN2D( CONTXT, 'Row', ' ', 1, 1, ENRMIN, 1, RA, CA, $ -1, -1, -1 ) ELSE DO 50 J = 1, N TEMP1 = ZERO DO 40 JVEC = 1, N CALL INFOG2L( J, JVEC, DESCE, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, II, JJ ) IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN TEMP1 = MAX( TEMP1, CABS1( E( (ICOL-1)*LDE+IROW ))) ENDIF 40 CONTINUE IF( MYROW.EQ.II ) THEN CALL DGAMX2D( CONTXT, 'Row', ' ', 1, 1, TEMP1, 1, RA, CA, $ -1, -1, -1 ) ENRMIN = MIN( ENRMIN, TEMP1 ) ENRMAX = MAX( ENRMAX, TEMP1 ) ENDIF 50 CONTINUE CALL DGAMX2D( CONTXT, 'Row', ' ', 1, 1, ENRMAX, 1, RA, CA, $ -1, -1, -1 ) CALL DGAMN2D( CONTXT, 'Row', ' ', 1, 1, ENRMIN, 1, RA, CA, $ -1, -1, -1 ) END IF * * Norm of A: * ANORM = MAX( PZLANGE( NORMA, N, N, A, 1, 1, DESCA, RWORK ), $ UNFL ) * * Norm of E: * ENORM = MAX( PZLANGE( NORME, N, N, E, 1, 1, DESCE, RWORK ), $ ULP ) * * Norm of error: * * Error = AE - EW * CALL PZLASET( 'Full', N, N, CZERO, CZERO, WORK, 1, 1, DESCW ) * DO 100 JCOL = 1, N IF( ITRNSW.EQ.0 ) THEN WTEMP = W( JCOL ) ELSE WTEMP = DCONJG( W( JCOL ) ) END IF * IF( ITRNSE.EQ.0 ) THEN CALL PZAXPY( N, WTEMP, E, 1, JCOL, DESCE, 1, WORK, 1, $ JCOL, DESCW, 1 ) ELSE IF( ITRNSE.EQ.1 ) THEN CALL PZAXPY( N, WTEMP, E, JCOL, 1, DESCE, N, WORK, 1, $ JCOL, DESCW, 1 ) ELSE CALL PZAXPY( N, DCONJG( WTEMP ), E, JCOL, 1, DESCE, N, $ WORK, 1, JCOL, DESCW, 1 ) DO JROW = 1, N CALL INFOG2L( JROW, JCOL, DESCW, NPROW, NPCOL, MYROW, $ MYCOL, IROW, ICOL, II, JJ ) IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN WORK( (JCOL-1)*LDW+JROW ) = DCONJG( WORK( (JCOL-1) $ *LDW+JROW ) ) ENDIF END DO END IF 100 CONTINUE * CALL PZGEMM( TRANSA, TRANSE, N, N, N, CONE, A, 1, 1, DESCA, E, $ 1, 1, DESCE, -CONE, WORK, 1, 1, DESCW ) * ERRNRM = PZLANGE( 'One', N, N, WORK, 1, 1, DESCW, RWORK ) $ / ENORM * * Compute RESULT(1) (avoiding under/overflow) * IF( ANORM.GT.ERRNRM ) THEN RESULT( 1 ) = ( ERRNRM / ANORM ) / ULP ELSE IF( ANORM.LT.ONE ) THEN RESULT( 1 ) = ( MIN( ERRNRM, ANORM ) / ANORM ) / ULP ELSE RESULT( 1 ) = MIN( ERRNRM / ANORM, ONE ) / ULP END IF END IF * * Compute RESULT(2) : the normalization error in E. * RESULT( 2 ) = MAX( ABS( ENRMAX-ONE ), ABS( ENRMIN-ONE ) ) / $ ( DBLE( N )*ULP ) * RETURN * * End of PZGET22 * END SHAR_EOF fi # end of overwriting check if test -f 'pznepdriver.f' then echo shar: will not over-write existing file "'pznepdriver.f'" else cat << "SHAR_EOF" > 'pznepdriver.f' PROGRAM PZNEPDRIVER * * Mark R. Fahey * March 2000 * * Purpose * ======= * * PZNEPDRIVER is the main test program for the COMPLEX*16 * SCALAPACK NEP routines. This test driver performs a Schur * decomposition followed by residual check of a Hessenberg matrix. * * The program must be driven by a short data file. An annotated * example of a data file can be obtained by deleting the first 3 * characters from the following 18 lines: * 'SCALAPACK, Version 1.4, NEP (Nonsymmetric EigenProblem) input file' * 'Intel iPSC/860 hypercube, gamma model.' * 'NEP.out' output file name (if any) * 6 device out * 8 number of problems sizes * 1 2 3 4 6 10 100 200 vales of N * 3 number of NB's * 6 20 40 values of NB * 4 number of process grids (ordered pairs of P & Q) * 1 2 1 4 values of P * 1 2 4 1 values of Q * 20.0 threshold * * Internal Parameters * =================== * * TOTMEM INTEGER, default = 4000000 * TOTMEM is a machine-specific parameter indicating the * maximum amount of available memory in bytes. * The user should customize TOTMEM to his platform. Remember * to leave room in memory for the operating system, the BLACS * buffer, etc. For example, on a system with 8 MB of memory * per process (e.g., one processor on an Intel iPSC/860), the * parameters we use are TOTMEM=6200000 (leaving 1.8 MB for OS, * code, BLACS buffer, etc). However, for PVM, we usually set * TOTMEM = 2000000. Some experimenting with the maximum value * of TOTMEM may be required. * * DBLESZ INTEGER, default = 8 bytes. * DBLESZ indicate the length in bytes on the given platform * for a double precision real. * MEM COMPLEX*16 array, dimension ( TOTMEM / DBLESZ ) * * All arrays used by SCALAPACK routines are allocated from * this array and referenced by pointers. The integer IPA, * for example, is a pointer to the starting element of MEM for * the matrix A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DT_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DT_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) INTEGER DBLESZ, TOTMEM, MEMSIZ, NTESTS COMPLEX*16 PADVAL, ZERO, ONE PARAMETER ( DBLESZ = 16, TOTMEM = 200000000, $ MEMSIZ = TOTMEM / DBLESZ, NTESTS = 20 ) PARAMETER ( PADVAL = ( -9923.0D+0, -9923.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ), $ ONE = ( 1.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. LOGICAL CHECK CHARACTER*6 PASSED CHARACTER*80 OUTFILE INTEGER I, IAM, IASEED, ICTXT, III, IMIDPAD, INFO, IPA, $ IPOSTPAD, IPREPAD, IPW, IPWR, IPZ, J, K, $ KFAIL, KPASS, KSKIP, KTESTS, LDA, LDZ, LWORK, $ MYCOL, MYROW, N, NB, NGRIDS, NMAT, NNB, NOUT, $ NP, NPCOL, NPROCS, NPROW, NQ, WORKSIZ REAL THRESH DOUBLE PRECISION ANORM, FRESID, NOPS, QRESID, TMFLOPS, ZNORM * .. * .. Local Arrays .. INTEGER DESCA( DLEN_ ), DESCZ( DLEN_ ), IERR( 2 ), $ IDUM( 1 ), NBVAL( NTESTS ), NVAL( NTESTS ), $ PVAL( NTESTS ), QVAL( NTESTS ) DOUBLE PRECISION CTIME( 2 ), WTIME( 2 ), RWORK( 5000 ) COMPLEX*16 MEM( MEMSIZ ) * .. * .. External Subroutines .. EXTERNAL BLACS_BARRIER, BLACS_EXIT, BLACS_GET, $ BLACS_GRIDEXIT, BLACS_GRIDINFO, BLACS_GRIDINIT, $ BLACS_PINFO, DESCINIT, IGSUM2D, PZCHEKPAD, $ PZFILLPAD, PZGEMM, PZLAHQR, PZLASET, PZMATGEN, $ PZNEPFCHK, PZNEPINFO, SLBOOT, SLCOMBINE, $ SLTIMER * .. * .. External Functions .. INTEGER ILCM, NUMROC DOUBLE PRECISION PDLAMCH, PZLANGE, PZLANHS EXTERNAL ILCM, NUMROC, PDLAMCH, PZLANGE, PZLANHS * .. * .. Intrinsic Functions .. INTRINSIC DBLE, MAX, MIN * .. * .. Data statements .. DATA KFAIL, KPASS, KSKIP, KTESTS / 4*0 / * .. * .. Executable Statements .. * * Get starting information * CALL BLACS_PINFO( IAM, NPROCS ) IASEED = 100 CALL PZNEPINFO( OUTFILE, NOUT, NMAT, NVAL, NTESTS, NNB, NBVAL, $ NTESTS, NGRIDS, PVAL, NTESTS, QVAL, NTESTS, $ THRESH, MEM, IAM, NPROCS ) CHECK = ( THRESH.GE.0.0E+0 ) * * Print headings * IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 ) WRITE( NOUT, FMT = 9994 ) WRITE( NOUT, FMT = * ) END IF * * Loop over different process grids * DO 30 I = 1, NGRIDS * NPROW = PVAL( I ) NPCOL = QVAL( I ) * * Make sure grid information is correct * IERR( 1 ) = 0 IF( NPROW.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'GRID', 'nprow', NPROW IERR( 1 ) = 1 ELSE IF( NPCOL.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'GRID', 'npcol', NPCOL IERR( 1 ) = 1 ELSE IF( NPROW*NPCOL.GT.NPROCS ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9998 )NPROW*NPCOL, NPROCS IERR( 1 ) = 1 END IF * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'grid' KSKIP = KSKIP + 1 GO TO 30 END IF * * Define process grid * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', NPROW, NPCOL ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) * * Go to bottom of process grid loop if this case doesn't use my * process * IF( MYROW.GE.NPROW .OR. MYCOL.GE.NPCOL ) $ GO TO 30 * DO 20 J = 1, NMAT * N = NVAL( J ) * * Make sure matrix information is correct * IERR( 1 ) = 0 IF( N.LT.1 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'MATRIX', 'N', N IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'matrix' KSKIP = KSKIP + 1 GO TO 20 END IF * DO 10 K = 1, NNB * NB = NBVAL( K ) * * Make sure nb is legal * IERR( 1 ) = 0 IF( NB.LT.6 ) THEN IERR( 1 ) = 1 IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9999 )'NB', 'NB', NB END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'NB' KSKIP = KSKIP + 1 GO TO 10 END IF * * Padding constants * NP = NUMROC( N, NB, MYROW, 0, NPROW ) NQ = NUMROC( N, NB, MYCOL, 0, NPCOL ) IF( CHECK ) THEN IPREPAD = MAX( NB, NP ) IMIDPAD = NB IPOSTPAD = MAX( NB, NQ ) IPREPAD = IPREPAD + 1000 IMIDPAD = IMIDPAD + 1000 IPOSTPAD = IPOSTPAD + 1000 ELSE IPREPAD = 0 IMIDPAD = 0 IPOSTPAD = 0 END IF * * Initialize the array descriptor for the matrix A * CALL DESCINIT( DESCA, N, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, NP )+IMIDPAD, IERR( 1 ) ) * * Initialize the array descriptor for the matrix Z * CALL DESCINIT( DESCZ, N, N, NB, NB, 0, 0, ICTXT, $ MAX( 1, NP )+IMIDPAD, IERR( 2 ) ) * LDA = DESCA( LLD_ ) LDZ = DESCZ( LLD_ ) LDWORK = DESCZ( LLD_ ) * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 2, 1, IERR, 2, -1, 0 ) * IF( IERR( 1 ).LT.0 .OR. IERR( 2 ).LT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'descriptor' KSKIP = KSKIP + 1 GO TO 10 END IF * * Assign pointers into MEM for SCALAPACK arrays, A is * allocated starting at position MEM( IPREPAD+1 ) * IPA = IPREPAD + 1 IPZ = IPA + DESCA( LLD_ )*NQ + IPOSTPAD + IPREPAD IPWR = IPZ + DESCZ( LLD_ )*NQ + IPOSTPAD + IPREPAD IPW = IPWR + DESCZ( LLD_ )*NQ + IPOSTPAD + IPREPAD III = N / NB IF( III*NB.LT.N ) $ III = III + 1 III = 7*III / ILCM( NPROW, NPCOL ) * * LWORK = 3*N + MAX( 2*MAX( LDA, LDZ )+2*NQ, III ) LWORK = LWORK + MAX(2*N, (8*ILCM(NPROW,NPCOL)+2)**2 ) * IF( CHECK ) THEN * * Figure the amount of workspace required by the * checking routines PZNEPFCHK and PZLANHS * WORKSIZ = LWORK + MAX( NP*DESCA( NB_ ), $ DESCA( MB_ )*NQ ) + IPOSTPAD * ELSE * WORKSIZ = LWORK + IPOSTPAD * END IF * * Check for adequate memory for problem size * IERR( 1 ) = 0 IF( IPW+WORKSIZ.GT.MEMSIZ ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9996 )'Schur reduction', $ ( IPW+WORKSIZ )*DBLESZ IERR( 1 ) = 1 END IF * * Check all processes for an error * CALL IGSUM2D( ICTXT, 'All', ' ', 1, 1, IERR, 1, -1, 0 ) * IF( IERR( 1 ).GT.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = 9997 )'MEMORY' KSKIP = KSKIP + 1 GO TO 10 END IF * * Generate matrix Z = In * CALL PZLASET( 'All', N, N, ZERO, ONE, MEM( IPZ ), 1, 1, $ DESCZ ) * * Generate matrix A upper Hessenberg * CALL PZMATGEN( ICTXT, 'No transpose', 'No transpose', $ DESCA( M_ ), DESCA( N_ ), DESCA( MB_ ), $ DESCA( NB_ ), MEM( IPA ), DESCA( LLD_ ), $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, 0, $ NP, 0, NQ, MYROW, MYCOL, NPROW, NPCOL ) CALL PZLASET( 'Lower', MAX( 0, N-2 ), MAX( 0, N-2 ), $ ZERO, ZERO, MEM( IPA ), MIN( N, 3 ), 1, $ DESCA ) * * Calculate inf-norm of A for residual error-checking * IF( CHECK ) THEN CALL PZFILLPAD( ICTXT, NP, NQ, MEM( IPA-IPREPAD ), $ DESCA( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZFILLPAD( ICTXT, NP, NQ, MEM( IPZ-IPREPAD ), $ DESCZ( LLD_ ), IPREPAD, IPOSTPAD, $ PADVAL ) CALL PZFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) ANORM = PZLANHS( 'I', N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) CALL PZCHEKPAD( ICTXT, 'PZLANHS', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZLANHS', WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * CALL PZFILLPAD( ICTXT, N, 1, MEM( IPWR-IPREPAD ), N, $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZFILLPAD( ICTXT, LWORK, 1, MEM( IPW-IPREPAD ), $ LWORK, IPREPAD, IPOSTPAD, PADVAL ) * END IF * CALL SLBOOT( ) CALL BLACS_BARRIER( ICTXT, 'All' ) CALL SLTIMER( 1 ) * * Perform NEP factorization * CALL PZLAHQR( .TRUE., .TRUE., N, 1, N, MEM( IPA ), DESCA, $ MEM( IPWR ), 1, N, MEM( IPZ ), $ DESCZ, MEM( IPW ), LWORK, IDUM, 0, INFO ) * CALL SLTIMER( 1 ) * IF( INFO.NE.0 ) THEN IF( IAM.EQ.0 ) $ WRITE( NOUT, FMT = * )'PZLAHQR INFO=', INFO KFAIL = KFAIL + 1 GO TO 10 END IF * IF( CHECK ) THEN * * Check for memory overwrite in NEP factorization * CALL PZCHEKPAD( ICTXT, 'PZLAHQR (A)', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZLAHQR (Z)', NP, NQ, $ MEM( IPZ-IPREPAD ), DESCZ( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZLAHQR (WR)', N, 1, $ MEM( IPWR-IPREPAD ), N, IPREPAD, $ IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZLAHQR (WORK)', LWORK, 1, $ MEM( IPW-IPREPAD ), LWORK, IPREPAD, $ IPOSTPAD, PADVAL ) * CALL PZFILLPAD( ICTXT, WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * Compute || Z * H * Z**T - H0 || / ( N*|| H0 ||*EPS ) * CALL PZNEPFCHK( N, MEM( IPA ), 1, 1, DESCA, IASEED, $ MEM( IPZ ), 1, 1, DESCZ, ANORM, $ FRESID, MEM( IPW ) ) * CALL PZCHEKPAD( ICTXT, 'PZNEPFCHK (A)', NP, NQ, $ MEM( IPA-IPREPAD ), DESCA( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZNEPFCHK (Z)', NP, NQ, $ MEM( IPZ-IPREPAD ), DESCZ( LLD_ ), $ IPREPAD, IPOSTPAD, PADVAL ) CALL PZCHEKPAD( ICTXT, 'PZNEPFCHK (WORK)', $ WORKSIZ-IPOSTPAD, 1, $ MEM( IPW-IPREPAD ), WORKSIZ-IPOSTPAD, $ IPREPAD, IPOSTPAD, PADVAL ) * * Compute || (Z**T)*Z - In ||_1 * CALL PZLASET( 'All', N, N, ZERO, ONE, MEM( IPA ), 1, $ 1, DESCA ) CALL PZGEMM( 'Cong Tran', 'No transpose', N, N, N, $ -ONE, MEM( IPZ ), 1, 1, DESCZ, $ MEM( IPZ ), 1, 1, DESCZ, ONE, MEM( IPA ), $ 1, 1, DESCA ) ZNORM = PZLANGE( '1', N, N, MEM( IPA ), 1, 1, DESCA, $ MEM( IPW ) ) QRESID = ZNORM / ( DBLE( N )*PDLAMCH( ICTXT, 'P' ) ) * * Test residual and detect NaN result * IF( ( FRESID.LE.THRESH ) .AND. $ ( ( FRESID-FRESID ).EQ.0.0D+0 ) .AND. $ ( QRESID.LE.THRESH ) .AND. $ ( ( QRESID-QRESID ).EQ.0.0D+0 ) ) THEN KPASS = KPASS + 1 PASSED = 'PASSED' ELSE KFAIL = KFAIL + 1 PASSED = 'FAILED' IF( IAM.EQ.0 ) THEN WRITE( NOUT, FMT = 9986 )FRESID WRITE( NOUT, FMT = 9985 )QRESID END IF END IF * ELSE * * Don't perform the checking, only timing * KPASS = KPASS + 1 FRESID = FRESID - FRESID QRESID = QRESID - QRESID PASSED = 'BYPASS' * END IF * * Gather maximum of all CPU and WALL clock timings * CALL SLCOMBINE( ICTXT, 'All', '>', 'W', 1, 1, WTIME ) CALL SLCOMBINE( ICTXT, 'All', '>', 'C', 1, 1, CTIME ) * * Print results * IF( MYROW.EQ.0 .AND. MYCOL.EQ.0 ) THEN * * 18 N^3 flops for PxLAHQR * NOPS = 18.0D+0*DBLE( N )**3 * * Calculate total megaflops -- factorization only, * -- for WALL and CPU time, and print output * * Print WALL time if machine supports it * IF( WTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( WTIME( 1 )*1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF IF( WTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 )'WALL', N, NB, NPROW, $ NPCOL, WTIME( 1 ), TMFLOPS, PASSED * * Print CPU time if machine supports it * IF( CTIME( 1 ).GT.0.0D+0 ) THEN TMFLOPS = NOPS / ( CTIME( 1 )*1.0D+6 ) ELSE TMFLOPS = 0.0D+0 END IF * IF( CTIME( 1 ).GE.0.0D+0 ) $ WRITE( NOUT, FMT = 9993 )'CPU ', N, NB, NPROW, $ NPCOL, CTIME( 1 ), TMFLOPS, PASSED END IF * 10 CONTINUE * 20 CONTINUE * CALL BLACS_GRIDEXIT( ICTXT ) * 30 CONTINUE * * Print ending messages and close output file * IF( IAM.EQ.0 ) THEN KTESTS = KPASS + KFAIL + KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9992 )KTESTS IF( CHECK ) THEN WRITE( NOUT, FMT = 9991 )KPASS WRITE( NOUT, FMT = 9989 )KFAIL ELSE WRITE( NOUT, FMT = 9990 )KPASS END IF WRITE( NOUT, FMT = 9988 )KSKIP WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9987 ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) END IF * CALL BLACS_EXIT( 0 ) * 9999 FORMAT( 'ILLEGAL ', A6, ': ', A5, ' = ', I3, $ '; It should be at least 1' ) 9998 FORMAT( 'ILLEGAL GRID: nprow*npcol = ', I4, '. It can be at most', $ I4 ) 9997 FORMAT( 'Bad ', A6, ' parameters: going on to next test case.' ) 9996 FORMAT( 'Unable to perform ', A, ': need TOTMEM of at least', $ I11 ) 9995 FORMAT( 'TIME N NB P Q NEP Time MFLOPS CHECK' ) 9994 FORMAT( '---- ----- --- ---- ---- -------- -------- ------' ) 9993 FORMAT( A4, 1X, I5, 1X, I3, 1X, I4, 1X, I4, 1X, F8.2, 1X, F8.2, $ 1X, A6 ) 9992 FORMAT( 'Finished ', I6, ' tests, with the following results:' ) 9991 FORMAT( I5, ' tests completed and passed residual checks.' ) 9990 FORMAT( I5, ' tests completed without checking.' ) 9989 FORMAT( I5, ' tests completed and failed residual checks.' ) 9988 FORMAT( I5, ' tests skipped because of illegal input values.' ) 9987 FORMAT( 'END OF TESTS.' ) 9986 FORMAT( '||H - Q*S*Q^T|| / (||H|| * N * eps) = ', G25.7 ) 9985 FORMAT( '||Q^T*Q - I|| / ( N * eps ) ', G25.7 ) * STOP * * End of PZNEPDRIVER * END SHAR_EOF fi # end of overwriting check if test -f 'pznepfchk.f' then echo shar: will not over-write existing file "'pznepfchk.f'" else cat << "SHAR_EOF" > 'pznepfchk.f' SUBROUTINE PZNEPFCHK( N, A, IA, JA, DESCA, IASEED, Z, IZ, JZ, $ DESCZ, ANORM, FRESID, WORK ) * * Mark R. Fahey * March 2000 * * .. Scalar Arguments .. INTEGER IA, IASEED, IZ, JA, JZ, N DOUBLE PRECISION ANORM, FRESID * .. * .. Array Arguments .. INTEGER DESCA( * ), DESCZ( * ) COMPLEX*16 A( * ), WORK( * ), Z( * ) * .. * * Purpose * ======= * * PZNEPFCHK computes the residual * || sub(Z)*sub( A )*sub(Z)**T - sub( Ao ) || / (||sub( Ao )||*eps*N), * where Ao will be regenerated by the parallel random matrix generator, * sub( A ) = A(IA:IA+M-1,JA:JA+N-1), sub( Z ) = Z(IZ:IZ+N-1,JZ:JZ+N-1) * and ||.|| stands for the infinity norm. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * N (global input) INTEGER * The order of sub( A ) and sub( Z ). N >= 0. * * A (local input/local output) COMPLEX*16 pointer into the * local memory to an array of dimension (LLD_A,LOCc(JA+N-1)). * On entry, this array contains the local pieces of the N-by-N * distributed matrix sub( A ) to be checked. On exit, this * array contains the local pieces of the difference * sub(Z)*sub( A )*sub(Z)**T - sub( Ao ). * * IA (global input) INTEGER * A's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JA (global input) INTEGER * A's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * IASEED (global input) INTEGER * The seed number to generate the original matrix Ao. * * Z (local input) COMPLEX*16 pointer into the local memory * to an array of dimension (LLD_Z,LOCc(JZ+N-1)). On entry, this * array contains the local pieces of the N-by-N distributed * matrix sub( Z ). * * IZ (global input) INTEGER * Z's global row index, which points to the beginning of the * submatrix which is to be operated on. * * JZ (global input) INTEGER * Z's global column index, which points to the beginning of * the submatrix which is to be operated on. * * DESCZ (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix Z. * * ANORM (global input) DOUBLE PRECISION * The Infinity norm of sub( A ). * * FRESID (global output) DOUBLE PRECISION * The maximum (worst) factorizational error. * * WORK (local workspace) COMPLEX*16 array, dimension (LWORK). * LWORK >= MAX( NpA0 * NB_A, MB_A * NqA0 ) where * * IROFFA = MOD( IA-1, MB_A ), * ICOFFA = MOD( JA-1, NB_A ), * IAROW = INDXG2P( IA, MB_A, MYROW, RSRC_A, NPROW ), * IACOL = INDXG2P( JA, NB_A, MYCOL, CSRC_A, NPCOL ), * NpA0 = NUMROC( N+IROFFA, MB_A, MYROW, IAROW, NPROW ), * NqA0 = NUMROC( N+ICOFFA, NB_A, MYCOL, IACOL, NPCOL ), * * WORK is used to store a block of rows and a block of columns * of sub( A ). * INDXG2P and NUMROC are ScaLAPACK tool functions; MYROW, * MYCOL, NPROW and NPCOL can be determined by calling the * subroutine BLACS_GRIDINFO. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ONE, ZERO PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), $ ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER I, IACOL, IAROW, IB, ICTXT, IIA, IOFFA, IROFF, $ IW, J, JB, JJA, JN, LDA, LDW, MYCOL, MYROW, NP, $ NPCOL, NPROW DOUBLE PRECISION EPS * .. * .. Local Arrays .. INTEGER DESCW( DLEN_ ) * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, DESCSET, INFOG2L, PZGEMM, $ PZLACPY, PZLASET, PZMATGEN, ZMATADD * .. * .. External Functions .. INTEGER ICEIL, NUMROC DOUBLE PRECISION PDLAMCH, PZLANGE EXTERNAL ICEIL, NUMROC, PDLAMCH, PZLANGE * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN, MOD * .. * .. Executable Statements .. * ICTXT = DESCA( CTXT_ ) CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL ) EPS = PDLAMCH( ICTXT, 'eps' ) * CALL INFOG2L( IA, JA, DESCA, NPROW, NPCOL, MYROW, MYCOL, IIA, JJA, $ IAROW, IACOL ) IROFF = MOD( IA-1, DESCA( MB_ ) ) NP = NUMROC( N+IROFF, DESCA( MB_ ), MYROW, IAROW, NPROW ) IF( MYROW.EQ.IAROW ) $ NP = NP - IROFF LDW = MAX( 1, NP ) * * First compute H <- H * Z**T * CALL DESCSET( DESCW, DESCA( MB_ ), N, DESCA( MB_ ), DESCA( NB_ ), $ IAROW, IACOL, ICTXT, DESCA( MB_ ) ) * DO 10 I = IA, IA + N - 1, DESCA( MB_ ) IB = MIN( IA+N-I, DESCA( MB_ ) ) * CALL PZLACPY( 'All', IB, N, A, I, JA, DESCA, WORK, 1, 1, $ DESCW ) CALL PZGEMM( 'No transpose', 'Cong Tran', IB, N, N, ONE, WORK, $ 1, 1, DESCW, Z, IZ, JZ, DESCZ, ZERO, A, I, JA, $ DESCA ) * DESCW( RSRC_ ) = MOD( DESCW( RSRC_ )+1, NPROW ) * 10 CONTINUE * * Then compute H <- Z * H = Z * H0 * Z**T * CALL DESCSET( DESCW, N, DESCA( NB_ ), DESCA( MB_ ), DESCA( NB_ ), $ IAROW, IACOL, ICTXT, LDW ) * DO 20 J = JA, JA + N - 1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * CALL PZLACPY( 'All', N, JB, A, IA, J, DESCA, WORK, 1, 1, $ DESCW ) CALL PZGEMM( 'No transpose', 'No transpose', N, JB, N, ONE, Z, $ IZ, JZ, DESCZ, WORK, 1, 1, DESCW, ZERO, A, IA, J, $ DESCA ) * DESCW( CSRC_ ) = MOD( DESCW( CSRC_ )+1, NPCOL ) * 20 CONTINUE * * Compute H - H0 * JN = MIN( ICEIL( JA, DESCA( NB_ ) )*DESCA( NB_ ), JA+N-1 ) LDA = DESCA( LLD_ ) IOFFA = IIA + ( JJA-1 )*LDA IW = 1 JB = JN - JA + 1 DESCW( CSRC_ ) = IACOL * * Handle first block of columns separately * IF( MYCOL.EQ.DESCW( CSRC_ ) ) THEN CALL PZMATGEN( ICTXT, 'N', 'N', DESCA( M_ ), DESCA( N_ ), $ DESCA( MB_ ), DESCA( NB_ ), WORK, LDW, $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, IIA-1, $ NP, JJA-1, JB, MYROW, MYCOL, NPROW, NPCOL ) CALL PZLASET( 'Lower', MAX( 0, N-2 ), JB, ZERO, ZERO, WORK, $ MIN( IW+2, N ), 1, DESCW ) CALL ZMATADD( NP, JB, -ONE, WORK, LDW, ONE, A( IOFFA ), LDA ) JJA = JJA + JB IOFFA = IOFFA + JB*LDA END IF * IW = IW + DESCA( MB_ ) DESCW( CSRC_ ) = MOD( DESCW( CSRC_ )+1, NPCOL ) * DO 30 J = JN + 1, JA + N - 1, DESCA( NB_ ) JB = MIN( JA+N-J, DESCA( NB_ ) ) * IF( MYCOL.EQ.DESCW( CSRC_ ) ) THEN CALL PZMATGEN( ICTXT, 'N', 'N', DESCA( M_ ), DESCA( N_ ), $ DESCA( MB_ ), DESCA( NB_ ), WORK, LDW, $ DESCA( RSRC_ ), DESCA( CSRC_ ), IASEED, $ IIA-1, NP, JJA-1, JB, MYROW, MYCOL, NPROW, $ NPCOL ) CALL PZLASET( 'Lower', MAX( 0, N-IW-1 ), JB, ZERO, ZERO, $ WORK, MIN( N, IW+2 ), 1, DESCW ) CALL ZMATADD( NP, JB, -ONE, WORK, LDW, ONE, A( IOFFA ), $ LDA ) JJA = JJA + JB IOFFA = IOFFA + JB*LDA END IF IW = IW + DESCA( MB_ ) DESCW( CSRC_ ) = MOD( DESCW( CSRC_ )+1, NPCOL ) 30 CONTINUE * * Calculate factor residual * FRESID = PZLANGE( 'I', N, N, A, IA, JA, DESCA, WORK ) / $ ( N*EPS*ANORM ) * RETURN * * End PZNEPFCHK * END SHAR_EOF fi # end of overwriting check if test -f 'pznepinfo.f' then echo shar: will not over-write existing file "'pznepinfo.f'" else cat << "SHAR_EOF" > 'pznepinfo.f' SUBROUTINE PZNEPINFO( SUMMRY, NOUT, NMAT, NVAL, LDNVAL, NNB, $ NBVAL, LDNBVAL, NGRIDS, PVAL, LDPVAL, QVAL, $ LDQVAL, THRESH, WORK, IAM, NPROCS ) * * Mark R. Fahey * March 2000 * * .. Scalar Arguments .. CHARACTER*( * ) SUMMRY INTEGER IAM, LDNBVAL, LDNVAL, LDPVAL, LDQVAL, NGRIDS, $ NMAT, NNB, NOUT, NPROCS REAL THRESH * .. * .. Array Arguments .. INTEGER NBVAL( LDNBVAL ), NVAL( LDNVAL ), $ PVAL( LDPVAL ), QVAL( LDQVAL ), WORK( * ) * .. * * Purpose * ======= * * PZNEPINFO gets needed startup information for PZHSEQR drivers * and transmits it to all processes. * * Arguments * ========= * * SUMMRY (global output) CHARACTER*(*) * Name of output (summary) file (if any). Only defined for * process 0. * * NOUT (global output) INTEGER * The unit number for output file. NOUT = 6, ouput to screen, * NOUT = 0, output to stderr. Only defined for process 0. * * NMAT (global output) INTEGER * The number of different values that can be used for N. * * NVAL (global output) INTEGER array, dimension (LDNVAL) * The values of N (the order of the matrix) to run the code * with. * * LDNVAL (global input) INTEGER * The maximum number of different values that can be used for * N, LDNVAL > = NMAT. * * NNB (global output) INTEGER * The number of different values that can be used for NB. * * NBVAL (global output) INTEGER array, dimension (LDNBVAL) * The values of NB (blocksize) to run the code with. * * LDNBVAL (global input) INTEGER * The maximum number of different values that can be used for * NB, LDNBVAL >= NNB. * * NGRIDS (global output) INTEGER * The number of different values that can be used for P & Q. * * PVAL (global output) INTEGER array, dimension (LDPVAL) * The values of P (number of process rows) to run the code * with. * * LDPVAL (global input) INTEGER * The maximum number of different values that can be used for * P, LDPVAL >= NGRIDS. * * QVAL (global output) INTEGER array, dimension (LDQVAL) * The values of Q (number of process columns) to run the code * with. * * LDQVAL (global input) INTEGER * The maximum number of different values that can be used for * Q, LDQVAL >= NGRIDS. * * THRESH (global output) REAL * Indicates what error checks shall be run and printed out: * < 0 : Perform no error checking * > 0 : report all residuals greater than THRESH * * WORK (local workspace) INTEGER array of dimension >= * MAX( 3, LDNVAL+LDNBVAL+LDPVAL+LDQVAL ), used to pack all * input arrays in order to send info in one message. * * IAM (local input) INTEGER * My process number. * * NPROCS (global input) INTEGER * The total number of processes. * * * Implemented by: M. Fahey, June 2000 * * ====================================================================== * * Note: For packing the information we assumed that the length in bytes * ===== of an integer is equal to the length in bytes of a complex single * precision. * * ====================================================================== * * .. Parameters .. INTEGER NIN PARAMETER ( NIN = 11 ) * .. * .. Local Scalars .. CHARACTER*79 USRINFO INTEGER I, ICTXT DOUBLE PRECISION EPS * .. * .. External Subroutines .. EXTERNAL BLACS_ABORT, BLACS_GET, BLACS_GRIDEXIT, $ BLACS_GRIDINIT, BLACS_SETUP, ICOPY, IGEBR2D, $ IGEBS2D, SGEBR2D, SGEBS2D * .. * .. External Functions .. DOUBLE PRECISION PDLAMCH EXTERNAL PDLAMCH * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN * .. * .. Executable Statements .. * * Process 0 reads the input data, broadcasts to other processes and * writes needed information to NOUT * IF( IAM.EQ.0 ) THEN * * Open file and skip data file header * OPEN( NIN, FILE = 'NEP.dat', STATUS = 'OLD' ) READ( NIN, FMT = * )SUMMRY SUMMRY = ' ' * * Read in user-supplied info about machine type, compiler, etc. * READ( NIN, FMT = 9999 )USRINFO * * Read name and unit number for summary output file * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT IF( NOUT.NE.0 .AND. NOUT.NE.6 ) $ OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) * * Read and check the parameter values for the tests. * * Get number of matrices and their dimensions * READ( NIN, FMT = * )NMAT IF( NMAT.LT.1 .OR. NMAT.GT.LDNVAL ) THEN WRITE( NOUT, FMT = 9994 )'N', LDNVAL GO TO 30 END IF READ( NIN, FMT = * )( NVAL( I ), I = 1, NMAT ) * * Get values of NB * READ( NIN, FMT = * )NNB IF( NNB.GT.LDNBVAL ) THEN WRITE( NOUT, FMT = 9994 )'NB', LDNBVAL GO TO 30 END IF READ( NIN, FMT = * )( NBVAL( I ), I = 1, NNB ) * DO 10 I = 1, NNB IF( NBVAL( I ).LT.6 ) THEN WRITE( NOUT, FMT = 9992 )NBVAL( I ) GO TO 30 END IF 10 CONTINUE * * Get number of grids * READ( NIN, FMT = * )NGRIDS IF( NGRIDS.LT.1 .OR. NGRIDS.GT.LDPVAL ) THEN WRITE( NOUT, FMT = 9994 )'Grids', LDPVAL GO TO 30 ELSE IF( NGRIDS.GT.LDQVAL ) THEN WRITE( NOUT, FMT = 9994 )'Grids', LDQVAL GO TO 30 END IF * * Get values of P and Q * READ( NIN, FMT = * )( PVAL( I ), I = 1, NGRIDS ) READ( NIN, FMT = * )( QVAL( I ), I = 1, NGRIDS ) * * Get level of checking * READ( NIN, FMT = * )THRESH * * Close input file * CLOSE ( NIN ) * * For pvm only: if virtual machine not set up, allocate it and * spawn the correct number of processes. * IF( NPROCS.LT.1 ) THEN NPROCS = 0 DO 20 I = 1, NGRIDS NPROCS = MAX( NPROCS, PVAL( I )*QVAL( I ) ) 20 CONTINUE CALL BLACS_SETUP( IAM, NPROCS ) END IF * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * * Pack information arrays and broadcast * CALL SGEBS2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1 ) * WORK( 1 ) = NMAT WORK( 2 ) = NNB WORK( 3 ) = NGRIDS CALL IGEBS2D( ICTXT, 'All', ' ', 3, 1, WORK, 3 ) * I = 1 CALL ICOPY( NMAT, NVAL, 1, WORK( I ), 1 ) I = I + NMAT CALL ICOPY( NNB, NBVAL, 1, WORK( I ), 1 ) I = I + NNB CALL ICOPY( NGRIDS, PVAL, 1, WORK( I ), 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, QVAL, 1, WORK( I ), 1 ) I = I + NGRIDS - 1 CALL IGEBS2D( ICTXT, 'All', ' ', I, 1, WORK, I ) * * regurgitate input * WRITE( NOUT, FMT = 9999 ) $ 'ScaLAPACK QSQ^H by Schur Decomposition.' WRITE( NOUT, FMT = 9999 )USRINFO WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 )'Tests of the parallel ' // $ 'complex double precision Schur decomposition.' WRITE( NOUT, FMT = 9999 )'The following scaled residual ' // $ 'checks will be computed:' WRITE( NOUT, FMT = 9999 ) $ ' Residual = ||H-QSQ^H|| / ' // $ '(||H|| * eps * N )' WRITE( NOUT, FMT = 9999 ) $ ' Orthogonality residual = ||I - Q^HQ|| / ' // '( eps * N )' WRITE( NOUT, FMT = 9999 )'The matrix A is randomly ' // $ 'generated for each test.' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 )'An explanation of the input/output ' $ // 'parameters follows:' WRITE( NOUT, FMT = 9999 ) $ 'TIME : Indicates whether WALL or ' // $ 'CPU time was used.' * WRITE( NOUT, FMT = 9999 ) $ 'N : The number of columns in the ' // 'matrix A.' WRITE( NOUT, FMT = 9999 ) $ 'NB : The size of the square blocks the' // $ ' matrix A is split into.' WRITE( NOUT, FMT = 9999 ) $ 'P : The number of process rows.' WRITE( NOUT, FMT = 9999 ) $ 'Q : The number of process columns.' WRITE( NOUT, FMT = 9999 ) $ 'THRESH : If a residual value is less than' // $ ' THRESH, CHECK is flagged as PASSED' WRITE( NOUT, FMT = 9999 ) $ 'NEP time : Time in seconds to decompose the ' // ' matrix' WRITE( NOUT, FMT = 9999 )'MFLOPS : Rate of execution ' WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9999 ) $ 'The following parameter values will be used:' WRITE( NOUT, FMT = 9996 )'N ', $ ( NVAL( I ), I = 1, MIN( NMAT, 10 ) ) IF( NMAT.GT.10 ) $ WRITE( NOUT, FMT = 9997 )( NVAL( I ), I = 11, NMAT ) WRITE( NOUT, FMT = 9996 )'NB ', $ ( NBVAL( I ), I = 1, MIN( NNB, 10 ) ) IF( NNB.GT.10 ) $ WRITE( NOUT, FMT = 9997 )( NBVAL( I ), I = 11, NNB ) WRITE( NOUT, FMT = 9996 )'P ', $ ( PVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 )( PVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = 9996 )'Q ', $ ( QVAL( I ), I = 1, MIN( NGRIDS, 10 ) ) IF( NGRIDS.GT.10 ) $ WRITE( NOUT, FMT = 9997 )( QVAL( I ), I = 11, NGRIDS ) WRITE( NOUT, FMT = * ) WRITE( NOUT, FMT = 9995 )EPS WRITE( NOUT, FMT = 9998 )THRESH * ELSE * * If in pvm, must participate setting up virtual machine * IF( NPROCS.LT.1 ) $ CALL BLACS_SETUP( IAM, NPROCS ) * * Temporarily define blacs grid to include all processes so * information can be broadcast to all processes * CALL BLACS_GET( -1, 0, ICTXT ) CALL BLACS_GRIDINIT( ICTXT, 'Row-major', 1, NPROCS ) * * Compute machine epsilon * EPS = PDLAMCH( ICTXT, 'eps' ) * CALL SGEBR2D( ICTXT, 'All', ' ', 1, 1, THRESH, 1, 0, 0 ) CALL IGEBR2D( ICTXT, 'All', ' ', 3, 1, WORK, 3, 0, 0 ) NMAT = WORK( 1 ) NNB = WORK( 2 ) NGRIDS = WORK( 3 ) * I = NMAT + NNB + 2*NGRIDS CALL IGEBR2D( ICTXT, 'All', ' ', I, 1, WORK, I, 0, 0 ) I = 1 CALL ICOPY( NMAT, WORK( I ), 1, NVAL, 1 ) I = I + NMAT CALL ICOPY( NNB, WORK( I ), 1, NBVAL, 1 ) I = I + NNB CALL ICOPY( NGRIDS, WORK( I ), 1, PVAL, 1 ) I = I + NGRIDS CALL ICOPY( NGRIDS, WORK( I ), 1, QVAL, 1 ) * END IF * CALL BLACS_GRIDEXIT( ICTXT ) * RETURN * 30 CONTINUE WRITE( NOUT, FMT = 9993 ) CLOSE ( NIN ) IF( NOUT.NE.6 .AND. NOUT.NE.0 ) $ CLOSE ( NOUT ) CALL BLACS_ABORT( ICTXT, 1 ) * STOP * 9999 FORMAT( A ) 9998 FORMAT( 'Routines pass computational tests if scaled residual ', $ 'is less than ', G12.5 ) 9997 FORMAT( ' ', 10I6 ) 9996 FORMAT( 2X, A5, ' : ', 10I6 ) 9995 FORMAT( 'Relative machine precision (eps) is taken to be ', $ E18.6 ) 9994 FORMAT( ' Number of values of ', 5A, $ ' is less than 1 or greater ', 'than ', I2 ) 9993 FORMAT( ' Illegal input in file ', 40A, '. Aborting run.' ) 9992 FORMAT( ' Blocking size too small at ', I2, ' must be >=6.' ) * * End of PZNEPINFO * END SHAR_EOF fi # end of overwriting check cd .. # End of shell archive exit 0