C ALGORITHM 826, COLLECTED ALGORITHMS FROM ACM. C THIS WORK PUBLISHED IN TRANSACTIONS ON MATHEMATICAL SOFTWARE, C VOL. 29,NO. 3, September, 2003, P. 326--336. #! /bin/sh # This is a shell archive, meaning: # 1. Remove everything above the #! /bin/sh line. # 2. Save the resulting text in a file. # 3. Execute the file with /bin/sh (not csh) to create the files: # SLmake.inc # Src/ # Src/Makefile # Src/README # Src/install # Src/pblas.h # Src/pzaxpy.c # Src/pzdotc.c # Src/pzdotu.c # Src/pzlaconsb.f # Src/pzlacp3.f # Src/pzlahqr.f # Src/pzlasmsub.f # Src/pzlatrs.f # Src/pzlawil.f # Src/pzrot.c # Src/pztrevc.f # Src/zlahqr.f # Src/zlamsh.f # Src/zlanv2.f # Src/zlaref.f # Testing/ # Testing/EVC.dat # Testing/Makefile # Testing/NEP.dat # Testing/README # Testing/pzevcdriver.f # Testing/pzevcinfo.f # Testing/pzget22.f # Testing/pznepdriver.f # Testing/pznepfchk.f # Testing/pznepinfo.f # This archive created: Wed Apr 16 10:13:06 2003 export PATH; PATH=/bin:$PATH if test -f 'SLmake.inc' then echo shar: will not over-write existing file "'SLmake.inc'" else cat << "SHAR_EOF" > 'SLmake.inc' FORTRAN = f90 OPTS = -O3 DRVOPTS = $(OPTS) NOOPT = LOADER = f90 LOADOPTS = CC = cc CCFLAGS = $(OPTS) CDEFS = -DAdd_ $(USEMPI) ARCH = ar ARCHFLAGS= cr RANLIB = echo MACH = origin PLAT = _irix65 SCALAPACK = /usr/local/usp/PETtools/lib/libscalapackn32.a BLACSMPI = /usr/local/usp/PETtools/lib/libmpiblacsn32.a BLASLIB = -lblas LAPACKLIB = $(HOME)/LAPACK/lapack$(PLAT).a MATGENLIB = $(HOME)/LAPACK/tmglib$(PLAT).a PXLAHQRLIB = $(HOME)/$(MACH)/ScaLAPACK/pxlahqrlib.a SERLIBS = $(MATGENLIB) $(LAPACKLIB) $(BLASLIB) PARLIBS = $(PXLAHQRLIB) $(SCALAPACK) $(BLACSMPI) $(SERLIBS) -lscs -lmpi SHAR_EOF fi # end of overwriting check if test ! -d 'Src' then mkdir 'Src' fi cd 'Src' if test -f 'Makefile' then echo shar: will not over-write existing file "'Makefile'" else cat << "SHAR_EOF" > 'Makefile' include ../../SLmake.inc ZQRSRC = pzlahqr.o pzlaconsb.o pzlasmsub.o pzlacp3.o pzlawil.o pzrot.o \ zlamsh.o zlaref.o zlanv2.o zlahqr.o ZVCSRC = pztrevc.o #pzlatrs.o pzaxpy.o pzdotc.o pzdotu.o all: complex16 complex16: $(ZQRSRC) $(ZVCSRC) $(ARCH) $(ARCHFLAGS) $(PXLAHQRLIB) $(ZQRSRC) $(ZVCSRC) $(RANLIB) $(PXLAHQRLIB) $(ZQRSRC): $(FRC) $(ZVCSRC): $(FRC) FRC: @FRC=$(FRC) clean: rm -f *.o .c.o: $(CC) $(CDEFS) $(CCFLAGS) -c $*.c .f.o: $(FORTRAN) $(OPTS) -c $*.f SHAR_EOF fi # end of overwriting check if test -f 'README' then echo shar: will not over-write existing file "'README'" else cat << "SHAR_EOF" > 'README' Files included in this distribution =================================== Makefile README pblas.h pzaxpy.c pzdotu.c pzdotc.c pzlaconsb.f pzlacp3.f pzlahqr.f pzlatrs.f pzlasmsub.f pzlawil.f pzrot.c pztrevc.f zlahqr.f zlamsh.f zlanv2.f zlaref.f List of computational routines with dependencies ================================================ PZLAHQR PZLACONSB PZLACP3 PZLASMSUB PZLAWIL PZROT ZLAHQR ZLAMSH ZLANV2 ZLAREF PZTREVC PZLATRS (optional) PZAXPY (optional) PZDOTC (optional) PZDOTU (optional) Notes ===== The routine ZLAHQR included in this distribution is not the same as found in LAPACK 1.0. This routine uses a multiple double-shift strategy as is found in DLAHQR. This is included for completeness, but is not needed. It can be removed from the list of codes in the Makefile so that the default LAPACK ZLAHQR is used instead. PZLATRS is an updated version of the routine included in ScaLAPACK 1.0. It has been updated to use scaling. It is, however, very slow in comparision to PZTRSV which theoretically does the same computation. Although slow, it controls scaling for ill-conditioned problems. PZLATRS calls some PBLAS and BLAS routines. I found that some of the PBLAS routines (ScaLAPACK version 1.0) each have an undocumented bug in them. They are PZAXPY line 363 &desc_Y[LLD_] -> incy PZDOTU line 472 ixcol -> iycol PZDOTC line 471 ixcol -> iycol Thus, I have included these three update level 1 PBLAS routines with this distribution. SHAR_EOF fi # end of overwriting check if test -f 'install' then echo shar: will not over-write existing file "'install'" else cat << "SHAR_EOF" > 'install' %\section{Installation} % \label{sec:install} %The SBR toolbox comes with a {\tt UNIX} makefile for easy installation and a %testing program for validation and performance tuning. % %In this section we briefly describe the ``standard'' installation procedure. %Detailed information about the installation process may be found in the %\NAME{README} file distributed with the software. % %The installation consists of the following steps. %\begin{enumerate} % \item Get the SBR toolbox from the TOMS repository at \texttt{netlib} % and unpack it with % \COMMAND{zcat sbr.tar.Z | tar xf -}% % This command % puts all the SBR software into a new directory \texttt{sbr}. % \item Edit the file \NAME{make.inc} to match your system setup (e.g., the % location of the LAPACK library, if the latter is installed on your % machine). % \item Type % \COMMAND{make library}% % This command will build the library (called \NAME{libSBR.a} if you % did not change the name). % \item (Optional.) Fine-tune the performance of the algorithms (cf.\ the % \NAME{README} file). % \item (Recommended.) Run the validation tests by typing % \COMMAND{make checks}% % and have a look at the output files \NAME{DOUTCHK} and \NAME{SOUTCHK}. % They should report no ``skipped'' or ``failed'' tests. % \item (Optional.) Run the additional timings (cf.\ the \NAME{README} file). % \item (Optional.) Move the SBR library to a directory searched by % the linker. %\end{enumerate} % %The testing programs provide the residuals $|| U \tilde A - A U ||_{F}$, %the orthogonality errors $|| U^{T} U - I ||_{F}$, and the timings for the %reduction of a symmetric (full or banded) matrix $A$ to a (narrower) banded %or tridiagonal matrix $\tilde A$ for matrices of different orders $n$ and %different semibandwidths. %If the orthogonal matrix $U$ is not accumulated then the above error %measures are not available. %In this case the deviation of the eigenvalues, %$|| \mbox{spec}( A ) - \mbox{spec}( \tilde A ) ||_{2}$, is computed instead, %where $\mbox{spec}( A )$ and $\mbox{spec}( \tilde A )$ are the eigenvalues %of $A$ and $\tilde A$, respectively, in ascending order. SHAR_EOF fi # end of overwriting check if test -f 'pblas.h' then echo shar: will not over-write existing file "'pblas.h'" else cat << "SHAR_EOF" > 'pblas.h' /* --------------------------------------------------------------------- * * -- ScaLAPACK routine (version 1.0) -- * University of Tennessee, Knoxville, Oak Ridge National Laboratory, * and University of California, Berkeley. * November 17, 1996 * * --------------------------------------------------------------------- */ /* * This file includes the standard C libraries, as well as system * dependent include files. All PBLAS routines include this file. */ /* * ======================================================================== * Machine Specific PBLAS macros * ======================================================================== */ #define _HAL_ 0 #define _T3D_ 1 #ifdef T3D #define _MACH_ _T3D_ #endif #ifndef _MACH_ #define _MACH_ _HAL_ #endif /* * ======================================================================== * Include files * ======================================================================== */ #include #include #if( _MACH_ == _T3D_ ) #include #endif /* * ======================================================================== * FORTRAN <-> C interface * ======================================================================== * * These macros define how the PBLAS will be called. _F2C_ADD_ assumes * that they will be called by FORTRAN, which expects C routines to have * an underscore postfixed to the name (Suns, and Intel machines expect * this). _F2C_NOCHANGE indicates that FORTRAN will be calling, and that * it expects the name called by FORTRAN to be identical to that compiled * by the C (RS6K's do this). _F2C_UPCASE says it expects C routines * called by FORTRAN to be in all upcase (CRAY wants this). * _F2C_F77ISF2C indicates that the fortran "compiler" in use is * actually f2c, a FORTRAN to C converter. */ #define _F2C_ADD_ 0 #define _F2C_NOCHANGE 1 #define _F2C_UPCASE 2 #define _F2C_F77ISF2C 3 #ifdef UpCase #define _F2C_CALL_ _F2C_UPCASE #endif #ifdef NoChange #define _F2C_CALL_ _F2C_NOCHANGE #endif #ifdef Add_ #define _F2C_CALL_ _F2C_ADD_ #endif #ifdef f77IsF2C #define _F2C_CALL_ _F2C_F77ISF2C #endif #ifndef _F2C_CALL_ #define _F2C_CALL_ _F2C_ADD_ #endif /* * ======================================================================== * TYPE DEFINITIONS AND CONVERSION UTILITIES * ======================================================================== */ typedef struct { float re, im; } complex; typedef struct { double re, im; } complex16; #if( _MACH_ == _T3D_ ) #define float double /* Type of character argument in a FORTRAN call */ #define F_CHAR _fcd /* Character conversion utilities */ #define F2C_CHAR(a) ( _fcdtocp( (a) ) ) #define C2F_CHAR(a) ( _cptofcd( (a), 1 ) ) /* Type of FORTRAN functions */ #define F_VOID_FCT void fortran /* Subroutine */ #define F_INTG_FCT int fortran /* INTEGER function */ #define F_DBLE_FCT double fortran /* DOUBLE PRECISION function */ #else /* Type of character argument in a FORTRAN call */ typedef char * F_CHAR; /* Character conversion utilities */ #define F2C_CHAR(a) (a) #define C2F_CHAR(a) (a) /* Type of FORTRAN functions */ #define F_VOID_FCT void /* Subroutine */ #define F_INTG_FCT int /* INTEGER function */ #define F_DBLE_FCT double /* DOUBLE PRECISION function */ #endif /* * ======================================================================== * #DEFINE MACRO CONSTANTS * ======================================================================== */ #define DLEN_ 9 /* Length of a descriptor */ #define DT_ 0 /* Descriptor Type */ #define CTXT_ 1 /* BLACS context */ #define M_ 2 /* Global Number of Rows */ #define N_ 3 /* Global Number of Columns */ #define MB_ 4 /* Row Blocking Size */ #define NB_ 5 /* Column Blocking Size */ #define RSRC_ 6 /* Starting Processor Row */ #define CSRC_ 7 /* Starting Processor Column */ #define LLD_ 8 /* Local Leading Dimension */ /* * Descriptor types */ #define BLOCK_CYCLIC_2D 1 #define BLOCK_CYCLIC_INB_2D 2 #define BROADCAST "B" /* Blacs operation definitions */ #define COMBINE "C" #define ALL "A" /* Scope definitions */ #define COLUMN "C" #define ROW "R" #define TOPDEF " " /* Default BLACS topology, PB-BLAS routines */ #define CTOPDEF ' ' #define TOPGET "!" #define YES "Y" #define NO "N" #define MULLENFAC 2 #define ONE 1.0 #define ZERO 0.0 /* * ======================================================================== * PREPROCESSOR MACRO FUNCTIONS USED FOR OPTIMIZATION & CONVENIENCE * ======================================================================== */ #define ABS(a) (((a) < 0) ? -(a) : (a)) #define MIN(a,b) (((a) < (b)) ? (a) : (b)) #define MAX(a,b) (((a) > (b)) ? (a) : (b)) #define CEIL(a,b) ( ((a)+(b)-1) / (b) ) #define Mlowcase(C) ( ((C) > 64 && (C) < 91) ? (C) | 32 : (C) ) #define Mupcase(C) ( ((C) > 96 && (C) < 123) ? (C) & 0xDF : (C) ) #define INDXG2L( iglob, nb, iproc, isrcproc, nprocs )\ ( (nb) * ( ( (iglob)-1) / ( (nb) * (nprocs) ) ) +\ ( ( (iglob) - 1 ) % (nb) ) + 1 ) #define INDXL2G( iloc, nb, iproc, isrcproc, nprocs )\ ( (nprocs) * (nb) * ( ( (iloc) - 1 ) / (nb) ) +\ ( ( (iloc) - 1 ) % (nb) ) +\ ( ( (nprocs) + (iproc) - (isrcproc) ) % (nprocs) ) * (nb) + 1 ) #define INDXG2P( iglob, nb, iproc, isrcproc, nprocs ) \ ( ( (isrcproc) + ( (iglob) - 1 ) / (nb) ) % (nprocs) ) #define MYROC0( nblocks, n, nb, nprocs )\ ( ( (nblocks) % (nprocs) ) ? ( ( (nblocks) / (nprocs) ) * (nb) + (nb) )\ : ( ( (nblocks) / (nprocs) )* (nb) + ( (n) % (nb) ) ) ) #if( _F2C_CALL_ == _F2C_ADD_ ) /* * These defines set up the naming scheme required to have a FORTRAN * routine call a C routine (which is what the PBLAS are written in). * No redefinition necessary to have following FORTRAN to C interface: * FORTRAN CALL C DECLARATION * call pdgemm(...) void pdgemm_(...) * * This is the default. */ #endif #if( _F2C_CALL_ == _F2C_UPCASE ) /* * These defines set up the naming scheme required to have a FORTRAN * routine call a C routine (which is what the PBLAS are written in) * following FORTRAN to C interface: * FORTRAN CALL C DECLARATION * call pdgemm(...) void PDGEMM(...) */ /* TOOLS */ #define ilcm_ ILCM #define infog2l_ INFOG2L #define numroc_ NUMROC #define pstreecomb_ PSTREECOMB #define pdtreecomb_ PDTREECOMB #define pctreecomb_ PCTREECOMB #define pztreecomb_ PZTREECOMB #define scombamax_ SCOMBAMAX #define dcombamax_ DCOMBAMAX #define ccombamax_ CCOMBAMAX #define zcombamax_ ZCOMBAMAX #define scombnrm2_ SCOMBNRM2 #define dcombnrm2_ DCOMBNRM2 /* BLACS */ #define blacs_abort_ BLACS_ABORT #define blacs_gridinfo_ BLACS_GRIDINFO #define igesd2d_ IGESD2D #define igebs2d_ IGEBS2D #define itrsd2d_ ITRSD2D #define itrbs2d_ ITRBS2D #define igerv2d_ IGERV2D #define igebr2d_ IGEBR2D #define itrrv2d_ ITRRV2D #define itrbr2d_ ITRBR2D #define igamx2d_ IGAMX2D #define igamn2d_ IGAMN2D #define igsum2d_ IGSUM2D #define sgesd2d_ SGESD2D #define sgebs2d_ SGEBS2D #define strsd2d_ STRSD2D #define strbs2d_ STRBS2D #define sgerv2d_ SGERV2D #define sgebr2d_ SGEBR2D #define strrv2d_ STRRV2D #define strbr2d_ STRBR2D #define sgamx2d_ SGAMX2D #define sgamn2d_ SGAMN2D #define sgsum2d_ SGSUM2D #define dgesd2d_ DGESD2D #define dgebs2d_ DGEBS2D #define dtrsd2d_ DTRSD2D #define dtrbs2d_ DTRBS2D #define dgerv2d_ DGERV2D #define dgebr2d_ DGEBR2D #define dtrrv2d_ DTRRV2D #define dtrbr2d_ DTRBR2D #define dgamx2d_ DGAMX2D #define dgamn2d_ DGAMN2D #define dgsum2d_ DGSUM2D #define cgesd2d_ CGESD2D #define cgebs2d_ CGEBS2D #define ctrsd2d_ CTRSD2D #define ctrbs2d_ CTRBS2D #define cgerv2d_ CGERV2D #define cgebr2d_ CGEBR2D #define ctrrv2d_ CTRRV2D #define ctrbr2d_ CTRBR2D #define cgamx2d_ CGAMX2D #define cgamn2d_ CGAMN2D #define cgsum2d_ CGSUM2D #define zgesd2d_ ZGESD2D #define zgebs2d_ ZGEBS2D #define ztrsd2d_ ZTRSD2D #define ztrbs2d_ ZTRBS2D #define zgerv2d_ ZGERV2D #define zgebr2d_ ZGEBR2D #define ztrrv2d_ ZTRRV2D #define ztrbr2d_ ZTRBR2D #define zgamx2d_ ZGAMX2D #define zgamn2d_ ZGAMN2D #define zgsum2d_ ZGSUM2D /* Level-1 BLAS */ #define srotg_ SROTG #define srotmg_ SROTMG #define srot_ SROT #define srotm_ SROTM #define sswap_ SSWAP #define sscal_ SSCAL #define scopy_ SCOPY #define saxpy_ SAXPY #define ssdot_ SSDOT #define isamax_ ISAMAX #define drotg_ DROTG #define drotmg_ DROTMG #define drot_ DROT #define drotm_ DROTM #define dswap_ DSWAP #define dscal_ DSCAL #define dcopy_ DCOPY #define daxpy_ DAXPY #define dddot_ DDDOT #define dnrm2_ DNRM2 #define dsnrm2_ DSNRM2 #define dasum_ DASUM #define dsasum_ DSASUM #define idamax_ IDAMAX #define cswap_ CSWAP #define cscal_ CSCAL #define csscal_ CSSCAL #define ccopy_ CCOPY #define caxpy_ CAXPY #define ccdotu_ CCDOTU #define ccdotc_ CCDOTC #define icamax_ ICAMAX #define zswap_ ZSWAP #define zscal_ ZSCAL #define zdscal_ ZDSCAL #define zcopy_ ZCOPY #define zaxpy_ ZAXPY #define zzdotu_ ZZDOTU #define zzdotc_ ZZDOTC #define dscnrm2_ DSCNRM2 #define dznrm2_ DZNRM2 #define dscasum_ DSCASUM #define dzasum_ DZASUM #define izamax_ IZAMAX /* Level-2 BLAS */ #define sgemv_ SGEMV #define ssymv_ SSYMV #define strmv_ STRMV #define strsv_ STRSV #define sger_ SGER #define ssyr_ SSYR #define ssyr2_ SSYR2 #define dgemv_ DGEMV #define dsymv_ DSYMV #define dtrmv_ DTRMV #define dtrsv_ DTRSV #define dger_ DGER #define dsyr_ DSYR #define dsyr2_ DSYR2 #define cgemv_ CGEMV #define chemv_ CHEMV #define ctrmv_ CTRMV #define ctrsv_ CTRSV #define cgeru_ CGERU #define cgerc_ CGERC #define cher_ CHER #define cher2_ CHER2 #define zgemv_ ZGEMV #define zhemv_ ZHEMV #define ztrmv_ ZTRMV #define ztrsv_ ZTRSV #define zgeru_ ZGERU #define zgerc_ ZGERC #define zher_ ZHER #define zher2_ ZHER2 /* Level-3 BLAS */ #define sgemm_ SGEMM #define ssymm_ SSYMM #define ssyrk_ SSYRK #define ssyr2k_ SSYR2K #define strmm_ STRMM #define strsm_ STRSM #define dgemm_ DGEMM #define dsymm_ DSYMM #define dsyrk_ DSYRK #define dsyr2k_ DSYR2K #define dtrmm_ DTRMM #define dtrsm_ DTRSM #define cgemm_ CGEMM #define chemm_ CHEMM #define csymm_ CSYMM #define csyrk_ CSYRK #define cherk_ CHERK #define csyr2k_ CSYR2K #define cher2k_ CHER2K #define ctrmm_ CTRMM #define ctrsm_ CTRSM #define zgemm_ ZGEMM #define zhemm_ ZHEMM #define zsymm_ ZSYMM #define zsyrk_ ZSYRK #define zherk_ ZHERK #define zsyr2k_ ZSYR2K #define zher2k_ ZHER2K #define ztrmm_ ZTRMM #define ztrsm_ ZTRSM /* absolute value auxiliary PBLAS */ #define psatrmv_ PSATRMV #define pdatrmv_ PDATRMV #define pcatrmv_ PCATRMV #define pzatrmv_ PZATRMV #define psagemv_ PSAGEMV #define pdagemv_ PDAGEMV #define pcagemv_ PCAGEMV #define pzagemv_ PZAGEMV #define psasymv_ PSASYMV #define pdasymv_ PDASYMV #define pcahemv_ PCAHEMV #define pzahemv_ PZAHEMV /* Auxiliary PB-BLAS */ #define pbcmatadd_ PBCMATADD #define pbdmatadd_ PBDMATADD #define pbsmatadd_ PBSMATADD #define pbzmatadd_ PBZMATADD /* Level-2 PBBLAS */ #define pbcgemv_ PBCGEMV #define pbcgeru_ PBCGERU #define pbcgerc_ PBCGERC #define pbchemv_ PBCHEMV #define pbcher_ PBCHER #define pbcher2_ PBCHER2 #define pbctrmv_ PBCTRMV #define pbctrnv_ PBCTRNV #define pbctrsv_ PBCTRSV #define pbdgemv_ PBDGEMV #define pbdger_ PBDGER #define pbdsymv_ PBDSYMV #define pbdsyr_ PBDSYR #define pbdsyr2_ PBDSYR2 #define pbdtrmv_ PBDTRMV #define pbdtrnv_ PBDTRNV #define pbdtrsv_ PBDTRSV #define pbsgemv_ PBSGEMV #define pbsger_ PBSGER #define pbssymv_ PBSSYMV #define pbssyr_ PBSSYR #define pbssyr2_ PBSSYR2 #define pbstrmv_ PBSTRMV #define pbstrnv_ PBSTRNV #define pbstrsv_ PBSTRSV #define pbzgemv_ PBZGEMV #define pbzgeru_ PBZGERU #define pbzgerc_ PBZGERC #define pbzhemv_ PBZHEMV #define pbzher_ PBZHER #define pbzher2_ PBZHER2 #define pbztrmv_ PBZTRMV #define pbztrnv_ PBZTRNV #define pbztrsv_ PBZTRSV /* Level-3 PBBLAS */ #define pbcgemm_ PBCGEMM #define pbchemm_ PBCHEMM #define pbcher2k_ PBCHER2K #define pbcherk_ PBCHERK #define pbcsymm_ PBCSYMM #define pbcsyr2k_ PBCSYR2K #define pbcsyrk_ PBCSYRK #define pbctrmm_ PBCTRMM #define pbctrsm_ PBCTRSM #define pbctran_ PBCTRAN #define pbdgemm_ PBDGEMM #define pbdsymm_ PBDSYMM #define pbdsyr2k_ PBDSYR2K #define pbdsyrk_ PBDSYRK #define pbdtrmm_ PBDTRMM #define pbdtrsm_ PBDTRSM #define pbdtran_ PBDTRAN #define pbsgemm_ PBSGEMM #define pbssymm_ PBSSYMM #define pbssyr2k_ PBSSYR2K #define pbssyrk_ PBSSYRK #define pbstrmm_ PBSTRMM #define pbstrsm_ PBSTRSM #define pbstran_ PBSTRAN #define pbzgemm_ PBZGEMM #define pbzhemm_ PBZHEMM #define pbzher2k_ PBZHER2K #define pbzherk_ PBZHERK #define pbzsymm_ PBZSYMM #define pbzsyr2k_ PBZSYR2K #define pbzsyrk_ PBZSYRK #define pbztrmm_ PBZTRMM #define pbztrsm_ PBZTRSM #define pbztran_ PBZTRAN /* Auxilliary PBLAS */ #define pberror_ PBERROR #define pbfreebuf_ PBFREEBUF #define ptopget_ PTOPGET #define ptopset_ PTOPSET /* Level-1 PBLAS */ #define psrotg_ PSROTG #define psrotmg_ PSROTMG #define psrot_ PSROT #define psrotm_ PSROTM #define psswap_ PSSWAP #define psscal_ PSSCAL #define pscopy_ PSCOPY #define psaxpy_ PSAXPY #define psdot_ PSDOT #define psnrm2_ PSNRM2 #define psasum_ PSASUM #define psamax_ PSAMAX #define pdrotg_ PDROTG #define pdrotmg_ PDROTMG #define pdrot_ PDROT #define pdrotm_ PDROTM #define pdswap_ PDSWAP #define pdscal_ PDSCAL #define pdcopy_ PDCOPY #define pdaxpy_ PDAXPY #define pddot_ PDDOT #define pdnrm2_ PDNRM2 #define pdasum_ PDASUM #define pdamax_ PDAMAX #define pcswap_ PCSWAP #define pcscal_ PCSCAL #define pcsscal_ PCSSCAL #define pccopy_ PCCOPY #define pcaxpy_ PCAXPY #define pcdotu_ PCDOTU #define pcdotc_ PCDOTC #define pscnrm2_ PSCNRM2 #define pscasum_ PSCASUM #define pcamax_ PCAMAX #define pcrot_ PCROT #define pzswap_ PZSWAP #define pzscal_ PZSCAL #define pzdscal_ PZDSCAL #define pzcopy_ PZCOPY #define pzaxpy_ PZAXPY #define pzdotu_ PZDOTU #define pzdotc_ PZDOTC #define pdznrm2_ PDZNRM2 #define pdzasum_ PDZASUM #define pzamax_ PZAMAX #define pzrot_ PZROT /* Level-2 PBLAS */ #define pcgemv_ PCGEMV #define pcgeru_ PCGERU #define pcgerc_ PCGERC #define pchemv_ PCHEMV #define pcher_ PCHER #define pcher2_ PCHER2 #define pctrmv_ PCTRMV #define pctrsv_ PCTRSV #define pdgemv_ PDGEMV #define pdger_ PDGER #define pdsymv_ PDSYMV #define pdsyr_ PDSYR #define pdsyr2_ PDSYR2 #define pdtrmv_ PDTRMV #define pdtrsv_ PDTRSV #define psgemv_ PSGEMV #define psger_ PSGER #define pssymv_ PSSYMV #define pssyr_ PSSYR #define pssyr2_ PSSYR2 #define pstrmv_ PSTRMV #define pstrsv_ PSTRSV #define pzgemv_ PZGEMV #define pzgeru_ PZGERU #define pzgerc_ PZGERC #define pzhemv_ PZHEMV #define pzher_ PZHER #define pzher2_ PZHER2 #define pztrmv_ PZTRMV #define pztrsv_ PZTRSV /* Level-3 PBLAS */ #define pcgemm_ PCGEMM #define pchemm_ PCHEMM #define pcher2k_ PCHER2K #define pcherk_ PCHERK #define pcsymm_ PCSYMM #define pcsyr2k_ PCSYR2K #define pcsyrk_ PCSYRK #define pctrmm_ PCTRMM #define pctrsm_ PCTRSM #define pctranu_ PCTRANU #define pctranc_ PCTRANC #define pdgemm_ PDGEMM #define pdsymm_ PDSYMM #define pdsyr2k_ PDSYR2K #define pdsyrk_ PDSYRK #define pdtrmm_ PDTRMM #define pdtrsm_ PDTRSM #define pdtran_ PDTRAN #define psgemm_ PSGEMM #define pssymm_ PSSYMM #define pssyr2k_ PSSYR2K #define pssyrk_ PSSYRK #define pstrmm_ PSTRMM #define pstrsm_ PSTRSM #define pstran_ PSTRAN #define pzgemm_ PZGEMM #define pzhemm_ PZHEMM #define pzher2k_ PZHER2K #define pzherk_ PZHERK #define pzsymm_ PZSYMM #define pzsyr2k_ PZSYR2K #define pzsyrk_ PZSYRK #define pztrmm_ PZTRMM #define pztrsm_ PZTRSM #define pztranu_ PZTRANU #define pztranc_ PZTRANC #endif #if( _F2C_CALL_ == _F2C_NOCHANGE ) /* * These defines set up the naming scheme required to have a FORTRAN * routine call a C routine (which is what the PBLAS are written in) * for following FORTRAN to C interface: * FORTRAN CALL C DECLARATION * call pdgemm(...) void pdgemm(...) */ /* TOOLS */ #define ilcm_ ilcm #define infog2l_ infog2l #define numroc_ numroc #define pstreecomb_ pstreecomb #define pdtreecomb_ pdtreecomb #define pctreecomb_ pctreecomb #define pztreecomb_ pztreecomb #define scombamax_ scombamax #define dcombamax_ dcombamax #define ccombamax_ ccombamax #define zcombamax_ zcombamax #define scombnrm2_ scombnrm2 #define dcombnrm2_ dcombnrm2 /* BLACS */ #define blacs_abort_ blacs_abort #define blacs_gridinfo_ blacs_gridinfo #define igesd2d_ igesd2d #define igebs2d_ igebs2d #define itrsd2d_ itrsd2d #define itrbs2d_ itrbs2d #define igerv2d_ igerv2d #define igebr2d_ igebr2d #define itrrv2d_ itrrv2d #define itrbr2d_ itrbr2d #define igamx2d_ igamx2d #define igamn2d_ igamn2d #define igsum2d_ igsum2d #define sgesd2d_ sgesd2d #define sgebs2d_ sgebs2d #define strsd2d_ strsd2d #define strbs2d_ strbs2d #define sgerv2d_ sgerv2d #define sgebr2d_ sgebr2d #define strrv2d_ strrv2d #define strbr2d_ strbr2d #define sgamx2d_ sgamx2d #define sgamn2d_ sgamn2d #define sgsum2d_ sgsum2d #define dgesd2d_ dgesd2d #define dgebs2d_ dgebs2d #define dtrsd2d_ dtrsd2d #define dtrbs2d_ dtrbs2d #define dgerv2d_ dgerv2d #define dgebr2d_ dgebr2d #define dtrrv2d_ dtrrv2d #define dtrbr2d_ dtrbr2d #define dgamx2d_ dgamx2d #define dgamn2d_ dgamn2d #define dgsum2d_ dgsum2d #define cgesd2d_ cgesd2d #define cgebs2d_ cgebs2d #define ctrsd2d_ ctrsd2d #define ctrbs2d_ ctrbs2d #define cgerv2d_ cgerv2d #define cgebr2d_ cgebr2d #define ctrrv2d_ ctrrv2d #define ctrbr2d_ ctrbr2d #define cgamx2d_ cgamx2d #define cgamn2d_ cgamn2d #define cgsum2d_ cgsum2d #define zgesd2d_ zgesd2d #define zgebs2d_ zgebs2d #define ztrsd2d_ ztrsd2d #define ztrbs2d_ ztrbs2d #define zgerv2d_ zgerv2d #define zgebr2d_ zgebr2d #define ztrrv2d_ ztrrv2d #define ztrbr2d_ ztrbr2d #define zgamx2d_ zgamx2d #define zgamn2d_ zgamn2d #define zgsum2d_ zgsum2d /* Level-1 BLAS */ #define srotg_ srotg #define srotmg_ srotmg #define srot_ srot #define srotm_ srotm #define sswap_ sswap #define sscal_ sscal #define scopy_ scopy #define saxpy_ saxpy #define ssdot_ ssdot #define isamax_ isamax #define drotg_ drotg #define drotmg_ drotmg #define drot_ drot #define drotm_ drotm #define dswap_ dswap #define dscal_ dscal #define dcopy_ dcopy #define daxpy_ daxpy #define dddot_ dddot #define dnrm2_ dnrm2 #define dsnrm2_ dsnrm2 #define dasum_ dasum #define dsasum_ dsasum #define idamax_ idamax #define cswap_ cswap #define cscal_ cscal #define csscal_ csscal #define ccopy_ ccopy #define caxpy_ caxpy #define ccdotu_ ccdotu #define ccdotc_ ccdotc #define icamax_ icamax #define zswap_ zswap #define zscal_ zscal #define zdscal_ zdscal #define zcopy_ zcopy #define zaxpy_ zaxpy #define zzdotu_ zzdotu #define zzdotc_ zzdotc #define dscnrm2_ dscnrm2 #define dznrm2_ dznrm2 #define dscasum_ dscasum #define dzasum_ dzasum #define izamax_ izamax /* Level-2 BLAS */ #define sgemv_ sgemv #define ssymv_ ssymv #define strmv_ strmv #define strsv_ strsv #define sger_ sger #define ssyr_ ssyr #define ssyr2_ ssyr2 #define dgemv_ dgemv #define dsymv_ dsymv #define dtrmv_ dtrmv #define dtrsv_ dtrsv #define dger_ dger #define dsyr_ dsyr #define dsyr2_ dsyr2 #define cgemv_ cgemv #define chemv_ chemv #define ctrmv_ ctrmv #define ctrsv_ ctrsv #define cgeru_ cgeru #define cgerc_ cgerc #define cher_ cher #define cher2_ cher2 #define zgemv_ zgemv #define zhemv_ zhemv #define ztrmv_ ztrmv #define ztrsv_ ztrsv #define zgeru_ zgeru #define zgerc_ zgerc #define zher_ zher #define zher2_ zher2 /* Level-3 BLAS */ #define sgemm_ sgemm #define ssymm_ ssymm #define ssyrk_ ssyrk #define ssyr2k_ ssyr2k #define strmm_ strmm #define strsm_ strsm #define dgemm_ dgemm #define dsymm_ dsymm #define dsyrk_ dsyrk #define dsyr2k_ dsyr2k #define dtrmm_ dtrmm #define dtrsm_ dtrsm #define cgemm_ cgemm #define chemm_ chemm #define csymm_ csymm #define csyrk_ csyrk #define cherk_ cherk #define csyr2k_ csyr2k #define cher2k_ cher2k #define ctrmm_ ctrmm #define ctrsm_ ctrsm #define zgemm_ zgemm #define zhemm_ zhemm #define zsymm_ zsymm #define zsyrk_ zsyrk #define zherk_ zherk #define zsyr2k_ zsyr2k #define zher2k_ zher2k #define ztrmm_ ztrmm #define ztrsm_ ztrsm /* absolute value auxiliary PBLAS */ #define psatrmv_ psatrmv #define pdatrmv_ pdatrmv #define pcatrmv_ pcatrmv #define pzatrmv_ pzatrmv #define psagemv_ psagemv #define pdagemv_ pdagemv #define pcagemv_ pcagemv #define pzagemv_ pzagemv #define psasymv_ psasymv #define pdasymv_ pdasymv #define pcahemv_ pcahemv #define pzahemv_ pzahemv /* Auxiliary PB-BLAS */ #define pbcmatadd_ pbcmatadd #define pbdmatadd_ pbdmatadd #define pbsmatadd_ pbsmatadd #define pbzmatadd_ pbzmatadd /* Level-2 PBBLAS */ #define pbcgemv_ pbcgemv #define pbcgeru_ pbcgeru #define pbcgerc_ pbcgerc #define pbchemv_ pbchemv #define pbcher_ pbcher #define pbcher2_ pbcher2 #define pbctrmv_ pbctrmv #define pbctrnv_ pbctrnv #define pbctrsv_ pbctrsv #define pbdgemv_ pbdgemv #define pbdger_ pbdger #define pbdsymv_ pbdsymv #define pbdsyr_ pbdsyr #define pbdsyr2_ pbdsyr2 #define pbdtrmv_ pbdtrmv #define pbdtrnv_ pbdtrnv #define pbdtrsv_ pbdtrsv #define pbsgemv_ pbsgemv #define pbsger_ pbsger #define pbssymv_ pbssymv #define pbssyr_ pbssyr #define pbssyr2_ pbssyr2 #define pbstrmv_ pbstrmv #define pbstrnv_ pbstrnv #define pbstrsv_ pbstrsv #define pbzgemv_ pbzgemv #define pbzgeru_ pbzgeru #define pbzgerc_ pbzgerc #define pbzhemv_ pbzhemv #define pbzher_ pbzher #define pbzher2_ pbzher2 #define pbztrmv_ pbztrmv #define pbztrnv_ pbztrnv #define pbztrsv_ pbztrsv /* Level-3 PBBLAS */ #define pbcgemm_ pbcgemm #define pbchemm_ pbchemm #define pbcher2k_ pbcher2k #define pbcherk_ pbcherk #define pbcsymm_ pbcsymm #define pbcsyr2k_ pbcsyr2k #define pbcsyrk_ pbcsyrk #define pbctrmm_ pbctrmm #define pbctrsm_ pbctrsm #define pbctran_ pbctran #define pbdgemm_ pbdgemm #define pbdsymm_ pbdsymm #define pbdsyr2k_ pbdsyr2k #define pbdsyrk_ pbdsyrk #define pbdtrmm_ pbdtrmm #define pbdtrsm_ pbdtrsm #define pbdtran_ pbdtran #define pbsgemm_ pbsgemm #define pbssymm_ pbssymm #define pbssyr2k_ pbssyr2k #define pbssyrk_ pbssyrk #define pbstrmm_ pbstrmm #define pbstrsm_ pbstrsm #define pbstran_ pbstran #define pbzgemm_ pbzgemm #define pbzhemm_ pbzhemm #define pbzher2k_ pbzher2k #define pbzherk_ pbzherk #define pbzsymm_ pbzsymm #define pbzsyr2k_ pbzsyr2k #define pbzsyrk_ pbzsyrk #define pbztrmm_ pbztrmm #define pbztrsm_ pbztrsm #define pbztran_ pbztran /* Auxilliary PBLAS */ #define pberror_ pberror #define pbfreebuf_ pbfreebuf #define ptopget_ ptopget #define ptopset_ ptopset /* Level-1 PBLAS */ #define psrotg_ psrotg #define psrotmg_ psrotmg #define psrot_ psrot #define psrotm_ psrotm #define psswap_ psswap #define psscal_ psscal #define pscopy_ pscopy #define psaxpy_ psaxpy #define psdot_ psdot #define psnrm2_ psnrm2 #define psasum_ psasum #define psamax_ psamax #define pdrotg_ pdrotg #define pdrotmg_ pdrotmg #define pdrot_ pdrot #define pdrotm_ pdrotm #define pdswap_ pdswap #define pdscal_ pdscal #define pdcopy_ pdcopy #define pdaxpy_ pdaxpy #define pddot_ pddot #define pdnrm2_ pdnrm2 #define pdasum_ pdasum #define pdamax_ pdamax #define pcswap_ pcswap #define pcscal_ pcscal #define pcsscal_ pcsscal #define pccopy_ pccopy #define pcaxpy_ pcaxpy #define pcdotu_ pcdotu #define pcdotc_ pcdotc #define pscnrm2_ pscnrm2 #define pscasum_ pscasum #define pcamax_ pcamax #define pcrot_ pcrot #define pzswap_ pzswap #define pzscal_ pzscal #define pzdscal_ pzdscal #define pzcopy_ pzcopy #define pzaxpy_ pzaxpy #define pzdotu_ pzdotu #define pzdotc_ pzdotc #define pdznrm2_ pdznrm2 #define pdzasum_ pdzasum #define pzamax_ pzamax #define pzrot_ pzrot /* Level-2 PBLAS */ #define pcgemv_ pcgemv #define pcgeru_ pcgeru #define pcgerc_ pcgerc #define pchemv_ pchemv #define pcher_ pcher #define pcher2_ pcher2 #define pctrmv_ pctrmv #define pctrsv_ pctrsv #define pdgemv_ pdgemv #define pdger_ pdger #define pdsymv_ pdsymv #define pdsyr_ pdsyr #define pdsyr2_ pdsyr2 #define pdtrmv_ pdtrmv #define pdtrsv_ pdtrsv #define psgemv_ psgemv #define psger_ psger #define pssymv_ pssymv #define pssyr_ pssyr #define pssyr2_ pssyr2 #define pstrmv_ pstrmv #define pstrsv_ pstrsv #define pzgemv_ pzgemv #define pzgeru_ pzgeru #define pzgerc_ pzgerc #define pzhemv_ pzhemv #define pzher_ pzher #define pzher2_ pzher2 #define pztrmv_ pztrmv #define pztrsv_ pztrsv /* Level-3 PBLAS */ #define pcgemm_ pcgemm #define pchemm_ pchemm #define pcher2k_ pcher2k #define pcherk_ pcherk #define pcsymm_ pcsymm #define pcsyr2k_ pcsyr2k #define pcsyrk_ pcsyrk #define pctrmm_ pctrmm #define pctrsm_ pctrsm #define pctranu_ pctranu #define pctranc_ pctranc #define pdgemm_ pdgemm #define pdsymm_ pdsymm #define pdsyr2k_ pdsyr2k #define pdsyrk_ pdsyrk #define pdtrmm_ pdtrmm #define pdtrsm_ pdtrsm #define pdtran_ pdtran #define psgemm_ psgemm #define pssymm_ pssymm #define pssyr2k_ pssyr2k #define pssyrk_ pssyrk #define pstrmm_ pstrmm #define pstrsm_ pstrsm #define pstran_ pstran #define pzgemm_ pzgemm #define pzhemm_ pzhemm #define pzher2k_ pzher2k #define pzherk_ pzherk #define pzsymm_ pzsymm #define pzsyr2k_ pzsyr2k #define pzsyrk_ pzsyrk #define pztrmm_ pztrmm #define pztrsm_ pztrsm #define pztranu_ pztranu #define pztranc_ pztranc #endif #if( _F2C_CALL_ == _F2C_F77ISF2C ) /* * These defines set up the naming scheme required to have a FORTRAN * routine call a C routine (which is what the PBLAS are written in) * for systems where the fortran "compiler" is actually f2c (a Fortran * to C conversion utility). */ /* * Initialization routines */ #define blacs_pinfo_ blacs_pinfo__ #define blacs_setup_ blacs_setup__ #define blacs_set_ blacs_set__ #define blacs_get_ blacs_get__ #define blacs_gridinit_ blacs_gridinit__ #define blacs_gridmap_ blacs_gridmap__ /* * Destruction routines */ #define blacs_freebuff_ blacs_freebuff__ #define blacs_gridexit_ blacs_gridexit__ #define blacs_abort_ blacs_abort__ #define blacs_exit_ blacs_exit__ /* * Informational & misc. */ #define blacs_gridinfo_ blacs_gridinfo__ #define blacs_pnum_ blacs_pnum__ #define blacs_pcoord_ blacs_pcoord__ #define blacs_barrier_ blacs_barrier__ #endif SHAR_EOF fi # end of overwriting check if test -f 'pzaxpy.c' then echo shar: will not over-write existing file "'pzaxpy.c'" else cat << "SHAR_EOF" > 'pzaxpy.c' /* --------------------------------------------------------------------- * * Mark R. Fahey * August 2000 * This is a slightly modified version of pzaxpy_ from ScaLAPACK 1.0 * which fixes a bug in the incx=1 and incy=1 case. * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" void pzaxpy_( n, alpha, X, ix, jx, desc_X, incx, Y, iy, jy, desc_Y, incy ) /* * .. Scalar Arguments .. */ int * incx, * incy, * ix, * iy, * jx, * jy, * n; complex16 * alpha; /* .. * .. Array Arguments .. */ int desc_X[], desc_Y[]; complex16 X[], Y[]; { /* * Purpose * ======= * * PZAXPY adds one distributed vector to another, * * sub( Y ) := sub( Y ) + alpha * sub( X ) * * where sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector descA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DT_A (global) descA[ DT_ ] The descriptor type. In this case, * DT_A = 1. * CTXT_A (global) descA[ CTXT_ ] The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) descA[ M_ ] The number of rows in the global * array A. * N_A (global) descA[ N_ ] The number of columns in the global * array A. * MB_A (global) descA[ MB_ ] The blocking factor used to distribu- * te the rows of the array. * NB_A (global) descA[ NB_ ] The blocking factor used to distribu- * te the columns of the array. * RSRC_A (global) descA[ RSRC_ ] The process row over which the first * row of the array A is distributed. * CSRC_A (global) descA[ CSRC_ ] The process column over which the * first column of the array A is * distributed. * LLD_A (local) descA[ LLD_ ] The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be seen as particular matrices, a distributed * vector is considered to be a distributed matrix. * * If INCX = M_X and INCY = M_Y, NB_X must be equal to NB_Y, and the * process column having the first entries of sub( Y ) must also contain * the first entries of sub( X ). Moreover, the quantity * MOD( JX-1, NB_X ) must be equal to MOD( JY-1, NB_Y ). * * If INCX = M_X, INCY = 1 and INCY <> M_Y, NB_X must be equal to MB_Y. * Moreover, the quantity MOD( JX-1, NB_X ) must be equal to * MOD( IY-1, MB_Y ). * * If INCX = 1, INCX <> M_X and INCY = M_Y, MB_X must be equal to NB_Y. * Moreover, the quantity MOD( IX-1, MB_X ) must be equal to * MOD( JY-1, NB_Y ). * * If INCX = 1, INCX <> M_X, INCY = 1 and INCY <> M_Y, MB_X must be * equal to MB_Y, and the process row having the first entries of * sub( Y ) must also contain the first entries of sub( X ). Moreover, * the quantity MOD( IX-1, MB_X ) must be equal to MOD( IY-1, MB_Y ). * * Parameters * ========== * * N (global input) pointer to INTEGER. * The length of the distributed vectors to be added. N >= 0. * * ALPHA (global input) pointer to COMPLEX*16 * The scalar used to multiply each component of sub( X ). * * X (local input) COMPLEX*16 array containing the local * pieces of a distributed matrix of dimension of at least * ( (JX-1)*M_X + IX + ( N - 1 )*abs( INCX ) ) * This array contains the entries of the distributed vector * sub( X ). * * IX (global input) pointer to INTEGER * The global row index of the submatrix of the distributed * matrix X to operate on. * * JX (global input) pointer to INTEGER * The global column index of the submatrix of the distributed * matrix X to operate on. * * DESCX (global and local input) INTEGER array of dimension 8. * The array descriptor of the distributed matrix X. * * INCX (global input) pointer to INTEGER * The global increment for the elements of X. Only two values * of INCX are supported in this version, namely 1 and M_X. * * Y (local input/local output) COMPLEX*16 array * containing the local pieces of a distributed matrix of * dimension of at least * ( (JY-1)*M_Y + IY + ( N - 1 )*abs( INCY ) ) * This array contains the entries of the distributed vector * sub( Y ). * On exit sub( Y ) is overwritten by sub( Y ) + alpha*sub( X ). * * IY (global input) pointer to INTEGER * The global row index of the submatrix of the distributed * matrix Y to operate on. * * JY (global input) pointer to INTEGER * The global column index of the submatrix of the distributed * matrix Y to operate on. * * DESCY (global and local input) INTEGER array of dimension 8. * The array descriptor of the distributed matrix Y. * * INCY (global input) pointer to INTEGER * The global increment for the elements of Y. Only two values * of INCY are supported in this version, namely 1 and M_Y. * * ===================================================================== * * .. Local Scalars .. */ int ictxt, info, iix, iiy, ixcol, ixrow, iycol, iyrow, jjx, jjy, lcm, lcmp, lcmq, mycol, myrow, nn, np, np0, nprow, npcol, nq, nq0, nz, ione=1, tmp1, wksz; complex16 one, tmp, zero; /* .. * .. PBLAS Buffer .. */ complex16 * buff; /* .. * .. External Functions .. */ void blacs_gridinfo_(); void zgerv2d_(); void zgesd2d_(); void pbchkvect(); void pberror_(); char * getpbbuf(); F_VOID_FCT zaxpy_(); F_VOID_FCT zcopy_(); F_VOID_FCT pbztrnv_(); F_INTG_FCT ilcm_(); F_INTG_FCT numroc_(); /* .. * .. Executable Statements .. * * Get grid parameters */ ictxt = desc_X[CTXT_]; blacs_gridinfo_( &ictxt, &nprow, &npcol, &myrow, &mycol ); /* * Test the input parameters */ info = 0; if( nprow == -1 ) info = -(600+CTXT_+1); else { pbchkvect( *n, 1, *ix, *jx, desc_X, *incx, 6, &iix, &jjx, &ixrow, &ixcol, nprow, npcol, myrow, mycol, &info ); pbchkvect( *n, 1, *iy, *jy, desc_Y, *incy, 11, &iiy, &jjy, &iyrow, &iycol, nprow, npcol, myrow, mycol, &info ); if( info == 0 ) { if( *n != 1 ) { if( *incx == desc_X[M_] ) { /* X is distributed along a process row */ if( *incy == desc_Y[M_] ) { /* Y is distributed over a process row */ if( ( ixcol != iycol ) || ( ( (*jx-1) % desc_X[NB_] ) != ( (*jy-1) % desc_Y[NB_] ) ) ) info = -10; else if( desc_Y[NB_] != desc_X[NB_] ) info = -(1100+NB_+1); } else if( ( *incy == 1 ) && ( *incy != desc_Y[M_] ) ) { /* Y is distributed over a process column */ if( ( (*jx-1) % desc_X[NB_] ) != ( (*iy-1) % desc_Y[MB_] ) ) info = -9; else if( desc_Y[MB_] != desc_X[NB_] ) info = -(1100+MB_+1); } else { info = -12; } } else if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) ) { /* X is distributed along a process column */ if( *incy == desc_Y[M_] ) { /* Y is distributed over a process row */ if( ( (*ix-1) % desc_X[MB_] ) != ( (*jy-1) % desc_Y[NB_] ) ) info = -10; else if( desc_Y[NB_] != desc_X[MB_] ) info = -(1100+NB_+1); } else if( ( *incy == 1 ) && ( *incy != desc_Y[M_] ) ) { /* Y is distributed over a process column */ if( ( ixrow != iyrow ) || ( ( (*ix-1) % desc_X[MB_] ) != ( (*iy-1) % desc_Y[MB_] ) ) ) info = -9; else if( desc_Y[MB_] != desc_X[MB_] ) info = -(1100+MB_+1); } else { info = -12; } } else { info = -7; } } if( ictxt != desc_Y[CTXT_] ) info = -(1100+CTXT_+1); } } if( info ) { pberror_( &ictxt, "PZAXPY", &info ); return; } /* * Quick return if possible. */ if( *n == 0 ) return; /* * y <- y + alpha * x */ if( *n == 1 ) { if( ( myrow == iyrow ) && ( mycol == iycol ) ) { if( ( myrow != ixrow ) || ( mycol != ixcol ) ) zgerv2d_( &ictxt, n, n, &tmp, n, &ixrow, &ixcol ); else tmp = X[iix-1+(jjx-1)*desc_X[LLD_]]; zaxpy_( n, alpha, &tmp, n, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], n ); } else if( ( myrow == ixrow ) && ( mycol == ixcol ) ) zgesd2d_( &ictxt, n, n, &X[iix-1+(jjx-1)*desc_X[LLD_]], n, &iyrow, &iycol ); return; } one.re = ONE; one.im = ZERO; zero.re = ZERO; zero.im = ZERO; if( ( *incx == desc_X[M_] ) && ( *incy == desc_Y[M_] ) ) { /* X and Y are both distributed over a process row */ nz = (*jx-1) % desc_Y[NB_]; nn = *n + nz; nq = numroc_( &nn, &desc_X[NB_], &mycol, &ixcol, &npcol ); if( mycol == ixcol ) nq -= nz; if( ixrow == iyrow ) { if( myrow == ixrow ) zaxpy_( &nq, alpha, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_] ); } else { if( myrow == ixrow ) zgesd2d_( &ictxt, &ione, &nq, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &iyrow, &mycol ); else if( myrow == iyrow ) { buff = (complex16 *)getpbbuf( "PZAXPY", nq*sizeof(complex16) ); zgerv2d_( &ictxt, &nq, &ione, buff, &ione, &ixrow, &mycol ); zaxpy_( &nq, alpha, buff, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_] ); } } } else if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) && ( *incy == 1 ) && ( *incy != desc_Y[M_] ) ) { /* X and Y are both distributed over a process column */ nz = (*ix-1) % desc_X[MB_]; nn = *n + nz; np = numroc_( &nn, &desc_X[MB_], &myrow, &ixrow, &nprow ); if( myrow == ixrow ) np -= nz; if( ixcol == iycol ) { if( mycol == ixcol ) zaxpy_( &np, alpha, &X[iix-1+(jjx-1)*desc_X[LLD_]], incx, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy ); } else { if( mycol == ixcol ) zgesd2d_( &ictxt, &np, &ione, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &myrow, &iycol ); else if( mycol == iycol ) { buff = (complex16 *)getpbbuf( "PZAXPY", np*sizeof(complex16) ); zgerv2d_( &ictxt, &np, &ione, buff, &ione, &myrow, &ixcol ); zaxpy_( &np, alpha, buff, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy ); } } } else /* X and Y are not distributed along the same direction */ { lcm = ilcm_( &nprow, &npcol ); if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) ) { /* X is distributed over a process column */ lcmq = lcm / npcol; nz = (*ix-1) % desc_X[MB_]; nn = *n + nz; np = numroc_( &nn, &desc_X[MB_], &myrow, &ixrow, &nprow ); nz = (*jy-1) % desc_Y[NB_]; nn = *n + nz; tmp1 = nn / desc_Y[NB_]; nq0 = MYROC0( tmp1, nn, desc_Y[NB_], npcol ); tmp1 = nq0 / desc_Y[NB_]; wksz = np + MYROC0( tmp1, nq0, desc_Y[NB_], lcmq ); buff = (complex16 *)getpbbuf( "PZAXPY", wksz*sizeof(complex16) ); if( myrow == ixrow ) np -= nz; if( mycol == ixcol ) { zcopy_( &np, &X[iix-1+(jjx-1)*desc_X[LLD_]], incx, buff, incx ); zscal_( &np, alpha, buff, incx ); } pbztrnv_( &ictxt, C2F_CHAR( "C" ), C2F_CHAR( "T" ), n, &desc_X[MB_], &nz, buff, incx, &one, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], &ixrow, &ixcol, &iyrow, &iycol, buff+np ); } else /* Y is distributed over a process column */ { lcmp = lcm / nprow; nz = (*iy-1) % desc_Y[MB_]; nn = *n + nz; tmp1 = nn / desc_Y[MB_]; np = numroc_( &nn, &desc_Y[MB_], &myrow, &iyrow, &nprow ); np0 = MYROC0( tmp1, nn, desc_Y[MB_], nprow ); tmp1 = np0 / desc_Y[MB_]; wksz = MYROC0( tmp1, np0, desc_Y[MB_], lcmp ); wksz = np + wksz; buff = (complex16 *)getpbbuf( "PZAXPY", wksz*sizeof(complex16) ); pbztrnv_( &ictxt, C2F_CHAR( "R" ), C2F_CHAR( "T" ), n, &desc_X[NB_], &nz, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &zero, buff, &ione, &ixrow, &ixcol, &iyrow, &iycol, buff+np ); if( mycol == iycol ) { if( myrow == iyrow ) np -= nz; zaxpy_( &np, alpha, buff, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy ); } } } } SHAR_EOF fi # end of overwriting check if test -f 'pzdotc.c' then echo shar: will not over-write existing file "'pzdotc.c'" else cat << "SHAR_EOF" > 'pzdotc.c' /* --------------------------------------------------------------------- * * Mark R. Fahey * August 2000 * This is a slightly modified version of pzaxpy_ from ScaLAPACK 1.0 * which fixes a bug in the incx=1 and incy=1 case. * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" void pzdotc_( n, dotc, X, ix, jx, desc_X, incx, Y, iy, jy, desc_Y, incy ) /* * .. Scalar Arguments .. */ int * incx, * incy, * ix, * iy, * jx, * jy, * n; complex16 * dotc; /* .. * .. Array Arguments .. */ int desc_X[], desc_Y[]; complex16 X[], Y[]; { /* * Purpose * ======= * * PZDOTC forms the dot product of two distributed vectors, * * dotc := sub( X )**H * sub( Y ) * * where sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector descA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DT_A (global) descA[ DT_ ] The descriptor type. In this case, * DT_A = 1. * CTXT_A (global) descA[ CTXT_ ] The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) descA[ M_ ] The number of rows in the global * array A. * N_A (global) descA[ N_ ] The number of columns in the global * array A. * MB_A (global) descA[ MB_ ] The blocking factor used to distribu- * te the rows of the array. * NB_A (global) descA[ NB_ ] The blocking factor used to distribu- * te the columns of the array. * RSRC_A (global) descA[ RSRC_ ] The process row over which the first * row of the array A is distributed. * CSRC_A (global) descA[ CSRC_ ] The process column over which the * first column of the array A is * distributed. * LLD_A (local) descA[ LLD_ ] The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be seen as particular matrices, a distributed * vector is considered to be a distributed matrix. * * If INCX = M_X and INCY = M_Y, NB_X must be equal to NB_Y, and the * process column having the first entries of sub( Y ) must also contain * the first entries of sub( X ). Moreover, the quantity * MOD( JX-1, NB_X ) must be equal to MOD( JY-1, NB_Y ). * * If INCX = M_X, INCY = 1 and INCY <> M_Y, NB_X must be equal to MB_Y. * Moreover, the quantity MOD( JX-1, NB_X ) must be equal to * MOD( IY-1, MB_Y ). * * If INCX = 1, INCX <> M_X and INCY = M_Y, MB_X must be equal to NB_Y. * Moreover, the quantity MOD( IX-1, MB_X ) must be equal to * MOD( JY-1, NB_Y ). * * If INCX = 1, INCX <> M_X, INCY = 1 and INCY <> M_Y, MB_X must be * equal to MB_Y, and the process row having the first entries of * sub( Y ) must also contain the first entries of sub( X ). Moreover, * the quantity MOD( IX-1, MB_X ) must be equal to MOD( IY-1, MB_Y ). * * * Parameters * ========== * * N (global input) pointer to INTEGER * The length of the distributed vectors to be multiplied. * N >= 0. * * DOTC (local output) pointer to COMPLEX*16 * The dot product of sub( X ) and sub( Y ) only in their scope. * * X (local input) COMPLEX*16 array containing the local * pieces of a distributed matrix of dimension of at least * ( (JX-1)*M_X + IX + ( N - 1 )*abs( INCX ) ) * This array contains the entries of the distributed vector * sub( X ). * * IX (global input) pointer to INTEGER * The global row index of the submatrix of the distributed * matrix X to operate on. * * JX (global input) pointer to INTEGER * The global column index of the submatrix of the distributed * matrix X to operate on. * * DESCX (global and local input) INTEGER array of dimension 8. * The array descriptor of the distributed matrix X. * * INCX (global input) pointer to INTEGER * The global increment for the elements of X. Only two values * of INCX are supported in this version, namely 1 and M_X. * * Y (local input) COMPLEX*16 array containing the local * pieces of a distributed matrix of dimension of at least * ( (JY-1)*M_Y + IY + ( N - 1 )*abs( INCY ) ) * This array contains the entries of the distributed vector * sub( Y ). * * IY (global input) pointer to INTEGER * The global row index of the submatrix of the distributed * matrix Y to operate on. * * JY (global input) pointer to INTEGER * The global column index of the submatrix of the distributed * matrix Y to operate on. * * DESCY (global and local input) INTEGER array of dimension 8. * The array descriptor of the distributed matrix Y. * * INCY (global input) pointer to INTEGER * The global increment for the elements of Y. Only two values * of INCY are supported in this version, namely 1 and M_Y. * * ===================================================================== * * .. Local Scalars .. */ char * cbtop, * cctop, * rbtop, * rctop; int ictxt, iix, iiy, info, ixcol, ixrow, iycol, iyrow, jjx, jjy, lcm, lcmp, mone=-1, mycol, myrow, nn, np, np0, nprow, npcol, nq, nz, ione=1, tmp1, wksz; complex16 xwork[1], ywork[1], zero; /* .. * .. PBLAS Buffer .. */ complex16 * buff; /* .. * .. External Functions .. */ void blacs_gridinfo_(); void zgebr2d_(); void zgebs2d_(); void zgerv2d_(); void zgesd2d_(); void zgsum2d_(); void pbchkvect(); void pberror_(); char * getpbbuf(); char * ptop(); F_VOID_FCT pbztrnv_(); F_VOID_FCT zzdotc_(); F_INTG_FCT ilcm_(); /* .. * .. Executable Statements .. * * Get grid parameters */ ictxt = desc_X[CTXT_]; blacs_gridinfo_( &ictxt, &nprow, &npcol, &myrow, &mycol ); /* * Test the input parameters */ info = 0; if( nprow == -1 ) info = -(600+CTXT_+1); else { pbchkvect( *n, 1, *ix, *jx, desc_X, *incx, 6, &iix, &jjx, &ixrow, &ixcol, nprow, npcol, myrow, mycol, &info ); pbchkvect( *n, 1, *iy, *jy, desc_Y, *incy, 11, &iiy, &jjy, &iyrow, &iycol, nprow, npcol, myrow, mycol, &info ); if( info == 0 ) { if( *n != 1 ) { if( *incx == desc_X[M_] ) { /* X is distributed along a process row */ if( *incy == desc_Y[M_] ) { /* Y is distributed over a process row */ if( ( ixcol != iycol ) || ( ( (*jx-1) % desc_X[NB_] ) != ( (*jy-1) % desc_Y[NB_] ) ) ) info = -10; else if( desc_Y[NB_] != desc_X[NB_] ) info = -(1100+NB_+1); } else if( ( *incy == 1 ) && ( *incy != desc_Y[M_] ) ) { /* Y is distributed over a process column */ if( ( (*jx-1) % desc_X[NB_] ) != ( (*iy-1) % desc_Y[MB_] ) ) info = -9; else if( desc_Y[MB_] != desc_X[NB_] ) info = -(1100+MB_+1); } else { info = -12; } } else if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) ) { /* X is distributed along a process column */ if( *incy == desc_Y[M_] ) { /* Y is distributed over a process row */ if( ( (*ix-1) % desc_X[MB_] ) != ( (*jy-1) % desc_Y[NB_] ) ) info = -10; else if( desc_Y[NB_] != desc_X[MB_] ) info = -(1100+NB_+1); } else if( ( *incy == 1 ) && ( *incy != desc_Y[M_] ) ) { /* Y is distributed over a process column */ if( ( ixrow != iyrow ) || ( ( (*ix-1) % desc_X[MB_] ) != ( (*iy-1) % desc_Y[MB_] ) ) ) info = -9; else if( desc_Y[MB_] != desc_X[MB_] ) info = -(1100+MB_+1); } else { info = -12; } } else { info = -7; } } if( ictxt != desc_Y[CTXT_] ) info = -(1100+CTXT_+1); } } if( info ) { pberror_( &ictxt, "PZDOTC", &info ); return; } /* * Quick return if possible. */ dotc->re = ZERO; dotc->im = ZERO; zero.re = ZERO; zero.im = ZERO; if( *n == 0 ) return; /* * dot <- x^{h} * y */ if( *n == 1 ) { if( ( myrow == ixrow ) && ( mycol == ixcol ) ) { buff = &X[iix-1+(jjx-1)*desc_X[LLD_]]; if( ( myrow != iyrow ) || ( mycol != iycol ) ) { zgesd2d_( &ictxt, n, n, buff, n, &iyrow, &iycol ); zgerv2d_( &ictxt, n, n, ywork, n, &iyrow, &iycol ); } else *ywork = Y[iiy-1+(jjy-1)*desc_Y[LLD_]]; zzdotc_( n, dotc, buff, n, ywork, n ); } else if( ( myrow == iyrow ) && ( mycol == iycol ) ) { zgesd2d_( &ictxt, n, n, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], n, &ixrow, &ixcol ); zgerv2d_( &ictxt, n, n, xwork, n, &ixrow, &ixcol ); zzdotc_( n, dotc, xwork, n, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], n ); } if( ( *incx == desc_X[M_] ) && ( desc_X[M_] != 1 ) ) { if( myrow == ixrow ) { rbtop = ptop( BROADCAST, ROW, TOPGET ); if( mycol == ixcol ) { zgebs2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotc, &ione ); } else { zgebr2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotc, &ione, &myrow, &ixcol ); } } } else if( ( *incx == 1 ) && ( desc_X[M_] != 1 ) ) { if( mycol == ixcol ) { cbtop = ptop( BROADCAST, COLUMN, TOPGET ); if( myrow == ixrow ) { zgebs2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cbtop ), &ione, &ione, dotc, &ione ); } else { zgebr2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cbtop ), &ione, &ione, dotc, &ione, &ixrow, &mycol ); } } } if( ( *incy == desc_Y[M_] ) && ( desc_Y[M_] != 1 ) ) { if( myrow == iyrow ) { rbtop = ptop( BROADCAST, ROW, TOPGET ); if( mycol == iycol ) { zgebs2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotc, &ione ); } else { zgebr2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotc, &ione, &myrow, &iycol ); } } } else if( ( *incy == 1 ) && ( desc_Y[M_] != 1 ) ) { if( mycol == iycol ) { cbtop = ptop( BROADCAST, COLUMN, TOPGET ); if( myrow == iyrow ) { zgebs2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cbtop ), &ione, &ione, dotc, &ione ); } else { zgebr2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cbtop ), &ione, &ione, dotc, &ione, &iyrow, &mycol ); } } } return; } if( ( *incx == desc_X[M_] ) && ( *incy == desc_Y[M_] ) ) { /* X and Y are both distributed over a process row */ nz = (*jx-1) % desc_Y[NB_]; nn = *n + nz; nq = numroc_( &nn, &desc_X[NB_], &mycol, &ixcol, &npcol ); if( mycol == ixcol ) nq -= nz; if( ixrow == iyrow ) { if( myrow == ixrow ) { rctop = ptop( COMBINE, ROW, TOPGET ); zzdotc_( &nq, dotc, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_] ); zgsum2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rctop ), &ione, &ione, dotc, &ione, &mone, &mycol ); } } else { if( myrow == ixrow ) { rctop = ptop( COMBINE, ROW, TOPGET ); zgesd2d_( &ictxt, &ione, &nq, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &iyrow, &mycol ); buff = (complex16 *)getpbbuf( "PZDOTC", nq*sizeof(complex16) ); zgerv2d_( &ictxt, &nq, &ione, buff, &ione, &ixrow, &mycol ); zzdotc_( &nq, dotc, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], buff, &ione ); zgsum2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rctop ), &ione, &ione, dotc, &ione, &mone, &mycol ); } else if( myrow == iyrow ) { rctop = ptop( COMBINE, ROW, TOPGET ); zgesd2d_( &ictxt, &ione, &nq, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], &ixrow, &mycol ); buff = (complex16 *)getpbbuf( "PZDOTC", nq*sizeof(complex16) ); zgerv2d_( &ictxt, &nq, &ione, buff, &ione, &ixrow, &mycol ); zzdotc_( &nq, dotc, buff, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_] ); zgsum2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rctop ), &ione, &ione, dotc, &ione, &mone, &mycol ); } } } else if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) && ( *incy == 1 ) && ( *incy != desc_Y[M_] ) ) { /* X and Y are both distributed over a process column */ nz = (*ix-1) % desc_X[MB_]; nn = *n + nz; np = numroc_( &nn, &desc_X[MB_], &myrow, &ixrow, &nprow ); if( myrow == ixrow ) np -= nz; if( ixcol == iycol ) { if( mycol == ixcol ) { cctop = ptop( COMBINE, COLUMN, TOPGET ); zzdotc_( &np, dotc, &X[iix-1+(jjx-1)*desc_X[LLD_]], incx, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy ); zgsum2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cctop ), &ione, &ione, dotc, &ione, &mone, &mycol ); } } else { if( mycol == ixcol ) { cctop = ptop( COMBINE, COLUMN, TOPGET ); zgesd2d_( &ictxt, &np, &ione, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &myrow, &iycol ); buff = (complex16 *)getpbbuf( "PZDOTC", np*sizeof(complex16) ); zgerv2d_( &ictxt, &np, &ione, buff, &ione, &myrow, &iycol ); zzdotc_( &np, dotc, &X[iix-1+(jjx-1)*desc_X[LLD_]], incx, buff, &ione ); zgsum2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cctop ), &ione, &ione, dotc, &ione, &mone, &mycol ); } else if( mycol == iycol ) { cctop = ptop( COMBINE, COLUMN, TOPGET ); buff = (complex16 *)getpbbuf( "PZDOTC", np*sizeof(complex16) ); zgerv2d_( &ictxt, &np, &ione, buff, &ione, &myrow, &ixcol ); zgesd2d_( &ictxt, &np, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], &myrow, &ixcol ); zzdotc_( &np, dotc, buff, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy ); zgsum2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cctop ), &ione, &ione, dotc, &ione, &mone, &mycol ); } } } else /* X and Y are not distributed along the same direction */ { lcm = ilcm_( &nprow, &npcol ); if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) ) { /* X is distributed over a process column */ lcmp = lcm / nprow; nz = (*jy-1) % desc_Y[NB_]; nn = *n + nz; tmp1 = nn / desc_Y[MB_]; np = numroc_( &nn, &desc_X[MB_], &myrow, &ixrow, &nprow ); np0 = MYROC0( tmp1, nn, desc_X[MB_], nprow ); tmp1 = np0 / desc_X[MB_]; wksz = MYROC0( tmp1, np0, desc_X[MB_], lcmp ); wksz = np + wksz; buff = (complex16 *)getpbbuf( "PZDOTC", wksz*sizeof(complex16) ); if( mycol == iycol ) jjy -= nz; if( myrow == ixrow ) np -= nz; pbztrnv_( &ictxt, C2F_CHAR( "R" ), C2F_CHAR( "T" ), n, &desc_Y[NB_], &nz, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], &zero, buff, &ione, &iyrow, &iycol, &ixrow, &ixcol, buff+np ); if( mycol == ixcol ) { cctop = ptop( COMBINE, COLUMN, TOPGET ); zzdotc_( &np, dotc, &X[iix-1+(jjx-1)*desc_X[LLD_]], incx, buff, &ione ); zgsum2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cctop ), &ione, &ione, dotc, &ione, &mone, &mycol ); } if( myrow == iyrow ) { rbtop = ptop( BROADCAST, ROW, TOPGET ); if( mycol == ixcol ) zgebs2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotc, &ione ); else zgebr2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotc, &ione, &myrow, &ixcol ); } } else /* Y is distributed over a process column */ { lcmp = lcm / nprow; nz = (*jx-1) % desc_X[NB_]; nn = *n + nz; tmp1 = nn / desc_X[MB_]; np = numroc_( &nn, desc_Y+MB_, &myrow, &iyrow, &nprow ); np0 = MYROC0( tmp1, nn, desc_Y[MB_], nprow ); tmp1 = np0 / desc_Y[MB_]; wksz = MYROC0( tmp1, np0, desc_Y[MB_], lcmp ); wksz = np + wksz; buff = (complex16 *)getpbbuf( "PZDOTC", wksz*sizeof(complex16) ); if( myrow == iyrow ) np -= nz; pbztrnv_( &ictxt, C2F_CHAR( "R" ), C2F_CHAR( "T" ), n, &desc_X[NB_], &nz, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &zero, buff, &ione, &ixrow, &ixcol, &iyrow, &iycol, buff+np ); if( mycol == iycol ) { cctop = ptop( COMBINE, COLUMN, TOPGET ); zzdotc_( &np, dotc, buff, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy ); zgsum2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cctop ), &ione, &ione, dotc, &ione, &mone, &mycol ); } if( myrow == ixrow ) { rbtop = ptop( BROADCAST, ROW, TOPGET ); if( mycol == iycol ) zgebs2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotc, &ione ); else zgebr2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotc, &ione, &myrow, &iycol ); } } } } SHAR_EOF fi # end of overwriting check if test -f 'pzdotu.c' then echo shar: will not over-write existing file "'pzdotu.c'" else cat << "SHAR_EOF" > 'pzdotu.c' /* --------------------------------------------------------------------- * * Mark R. Fahey * August 2000 * This is a slightly modified version of pzaxpy_ from ScaLAPACK 1.0 * which fixes a bug in the incx=1 and incy=1 case. * * --------------------------------------------------------------------- */ /* * Include files */ #include "pblas.h" void pzdotu_( n, dotu, X, ix, jx, desc_X, incx, Y, iy, jy, desc_Y, incy ) /* * .. Scalar Arguments .. */ int * incx, * incy, * ix, * iy, * jx, * jy, * n; complex16 * dotu; /* .. * .. Array Arguments .. */ int desc_X[], desc_Y[]; complex16 X[], Y[]; { /* * Purpose * ======= * * PZDOTU forms the dot product of two distributed vectors, * * dotu := sub( X )**T * sub( Y ) * * where sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, * * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector descA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DT_A (global) descA[ DT_ ] The descriptor type. In this case, * DT_A = 1. * CTXT_A (global) descA[ CTXT_ ] The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) descA[ M_ ] The number of rows in the global * array A. * N_A (global) descA[ N_ ] The number of columns in the global * array A. * MB_A (global) descA[ MB_ ] The blocking factor used to distribu- * te the rows of the array. * NB_A (global) descA[ NB_ ] The blocking factor used to distribu- * te the columns of the array. * RSRC_A (global) descA[ RSRC_ ] The process row over which the first * row of the array A is distributed. * CSRC_A (global) descA[ CSRC_ ] The process column over which the * first column of the array A is * distributed. * LLD_A (local) descA[ LLD_ ] The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Because vectors may be seen as particular matrices, a distributed * vector is considered to be a distributed matrix. * * If INCX = M_X and INCY = M_Y, NB_X must be equal to NB_Y, and the * process column having the first entries of sub( Y ) must also contain * the first entries of sub( X ). Moreover, the quantity * MOD( JX-1, NB_X ) must be equal to MOD( JY-1, NB_Y ). * * If INCX = M_X, INCY = 1 and INCY <> M_Y, NB_X must be equal to MB_Y. * Moreover, the quantity MOD( JX-1, NB_X ) must be equal to * MOD( IY-1, MB_Y ). * * If INCX = 1, INCX <> M_X and INCY = M_Y, MB_X must be equal to NB_Y. * Moreover, the quantity MOD( IX-1, MB_X ) must be equal to * MOD( JY-1, NB_Y ). * * If INCX = 1, INCX <> M_X, INCY = 1 and INCY <> M_Y, MB_X must be * equal to MB_Y, and the process row having the first entries of * sub( Y ) must also contain the first entries of sub( X ). Moreover, * the quantity MOD( IX-1, MB_X ) must be equal to MOD( IY-1, MB_Y ). * * * Parameters * ========== * * N (global input) pointer to INTEGER * The length of the distributed vectors to be multiplied. * N >= 0. * * DOTU (local output) pointer to COMPLEX*16 * The dot product of sub( X ) and sub( Y ) only in their scope. * * X (local input) COMPLEX*16 array containing the local * pieces of a distributed matrix of dimension of at least * ( (JX-1)*M_X + IX + ( N - 1 )*abs( INCX ) ) * This array contains the entries of the distributed vector * sub( X ). * * IX (global input) pointer to INTEGER * The global row index of the submatrix of the distributed * matrix X to operate on. * * JX (global input) pointer to INTEGER * The global column index of the submatrix of the distributed * matrix X to operate on. * * DESCX (global and local input) INTEGER array of dimension 8. * The array descriptor of the distributed matrix X. * * INCX (global input) pointer to INTEGER * The global increment for the elements of X. Only two values * of INCX are supported in this version, namely 1 and M_X. * * Y (local input) COMPLEX*16 array containing the local * pieces of a distributed matrix of dimension of at least * ( (JY-1)*M_Y + IY + ( N - 1 )*abs( INCY ) ) * This array contains the entries of the distributed vector * sub( Y ). * * IY (global input) pointer to INTEGER * The global row index of the submatrix of the distributed * matrix Y to operate on. * * JY (global input) pointer to INTEGER * The global column index of the submatrix of the distributed * matrix Y to operate on. * * DESCY (global and local input) INTEGER array of dimension 8. * The array descriptor of the distributed matrix Y. * * INCY (global input) pointer to INTEGER * The global increment for the elements of Y. Only two values * of INCY are supported in this version, namely 1 and M_Y. * * ===================================================================== * * .. Local Scalars .. */ char * cbtop, * cctop, * rbtop, * rctop; int ictxt, iix, iiy, info, ixcol, ixrow, iycol, iyrow, jjx, jjy, lcm, lcmp, mone=-1, mycol, myrow, nn, np, np0, nprow, npcol, nq, nz, ione=1, tmp1, wksz; complex16 xwork[1], ywork[1], zero; /* .. * .. PBLAS Buffer .. */ complex16 * buff; /* .. * .. External Functions .. */ void blacs_gridinfo_(); void zgebr2d_(); void zgebs2d_(); void zgerv2d_(); void zgesd2d_(); void zgsum2d_(); void pbchkvect(); void pberror_(); char * getpbbuf(); char * ptop(); F_VOID_FCT pbztrnv_(); F_VOID_FCT zzdotu_(); F_INTG_FCT ilcm_(); /* .. * .. Executable Statements .. * * Get grid parameters */ ictxt = desc_X[CTXT_]; blacs_gridinfo_( &ictxt, &nprow, &npcol, &myrow, &mycol ); /* * Test the input parameters */ info = 0; if( nprow == -1 ) info = -(600+CTXT_+1); else { pbchkvect( *n, 1, *ix, *jx, desc_X, *incx, 6, &iix, &jjx, &ixrow, &ixcol, nprow, npcol, myrow, mycol, &info ); pbchkvect( *n, 1, *iy, *jy, desc_Y, *incy, 11, &iiy, &jjy, &iyrow, &iycol, nprow, npcol, myrow, mycol, &info ); if( info == 0 ) { if( *n != 1 ) { if( *incx == desc_X[M_] ) { /* X is distributed along a process row */ if( *incy == desc_Y[M_] ) { /* Y is distributed over a process row */ if( ( ixcol != iycol ) || ( ( (*jx-1) % desc_X[NB_] ) != ( (*jy-1) % desc_Y[NB_] ) ) ) info = -10; else if( desc_Y[NB_] != desc_X[NB_] ) info = -(1100+NB_+1); } else if( ( *incy == 1 ) && ( *incy != desc_Y[M_] ) ) { /* Y is distributed over a process column */ if( ( (*jx-1) % desc_X[NB_] ) != ( (*iy-1) % desc_Y[MB_] ) ) info = -9; else if( desc_Y[MB_] != desc_X[NB_] ) info = -(1100+MB_+1); } else { info = -12; } } else if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) ) { /* X is distributed along a process column */ if( *incy == desc_Y[M_] ) { /* Y is distributed over a process row */ if( ( (*ix-1) % desc_X[MB_] ) != ( (*jy-1) % desc_Y[NB_] ) ) info = -10; else if( desc_Y[NB_] != desc_X[MB_] ) info = -(1100+NB_+1); } else if( ( *incy == 1 ) && ( *incy != desc_Y[M_] ) ) { /* Y is distributed over a process column */ if( ( ixrow != iyrow ) || ( ( (*ix-1) % desc_X[MB_] ) != ( (*iy-1) % desc_Y[MB_] ) ) ) info = -9; else if( desc_Y[MB_] != desc_X[MB_] ) info = -(1100+MB_+1); } else { info = -12; } } else { info = -7; } } if( ictxt != desc_Y[CTXT_] ) info = -(1100+CTXT_+1); } } if( info ) { pberror_( &ictxt, "PZDOTU", &info ); return; } /* * Quick return if possible. */ dotu->re = ZERO; dotu->im = ZERO; zero.re = ZERO; zero.im = ZERO; if( *n == 0 ) return; /* * dot <- x^{t} * y */ if( *n == 1 ) { if( ( myrow == ixrow ) && ( mycol == ixcol ) ) { buff = &X[iix-1+(jjx-1)*desc_X[LLD_]]; if( ( myrow != iyrow ) || ( mycol != iycol ) ) { zgesd2d_( &ictxt, n, n, buff, n, &iyrow, &iycol ); zgerv2d_( &ictxt, n, n, ywork, n, &iyrow, &iycol ); } else *ywork = Y[iiy-1+(jjy-1)*desc_Y[LLD_]]; zzdotu_( n, dotu, buff, n, ywork, n ); } else if( ( myrow == iyrow ) && ( mycol == iycol ) ) { zgesd2d_( &ictxt, n, n, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], n, &ixrow, &ixcol ); zgerv2d_( &ictxt, n, n, xwork, n, &ixrow, &ixcol ); zzdotu_( n, dotu, xwork, n, &Y[iiy-1+(jjx-1)*desc_X[LLD_]], n ); } if( ( *incx == desc_X[M_] ) && ( desc_X[M_] != 1 ) ) { if( myrow == ixrow ) { rbtop = ptop( BROADCAST, ROW, TOPGET ); if( mycol == ixcol ) { zgebs2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotu, &ione ); } else { zgebr2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotu, &ione, &myrow, &ixcol ); } } } else if( ( *incx == 1 ) && ( desc_X[M_] != 1 ) ) { if( mycol == ixcol ) { cbtop = ptop( BROADCAST, COLUMN, TOPGET ); if( myrow == ixrow ) { zgebs2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cbtop ), &ione, &ione, dotu, &ione ); } else { zgebr2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cbtop ), &ione, &ione, dotu, &ione, &ixrow, &mycol ); } } } if( ( *incy == desc_Y[M_] ) && ( desc_Y[M_] != 1 ) ) { if( myrow == iyrow ) { rbtop = ptop( BROADCAST, ROW, TOPGET ); if( mycol == iycol ) { zgebs2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotu, &ione ); } else { zgebr2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotu, &ione, &myrow, &iycol ); } } } else if( ( *incy == 1 ) && ( desc_Y[M_] != 1 ) ) { if( mycol == iycol ) { cbtop = ptop( BROADCAST, COLUMN, TOPGET ); if( myrow == iyrow ) { zgebs2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cbtop ), &ione, &ione, dotu, &ione ); } else { zgebr2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cbtop ), &ione, &ione, dotu, &ione, &iyrow, &mycol ); } } } return; } if( ( *incx == desc_X[M_] ) && ( *incy == desc_Y[M_] ) ) { /* X and Y are both distributed over a process row */ nz = (*jx-1) % desc_Y[NB_]; nn = *n + nz; nq = numroc_( &nn, &desc_X[NB_], &mycol, &ixcol, &npcol ); if( mycol == ixcol ) nq -= nz; if( ixrow == iyrow ) { if( myrow == ixrow ) { rctop = ptop( COMBINE, ROW, TOPGET ); zzdotu_( &nq, dotu, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_] ); zgsum2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rctop ), &ione, &ione, dotu, &ione, &mone, &mycol ); } } else { if( myrow == ixrow ) { rctop = ptop( COMBINE, ROW, TOPGET ); zgesd2d_( &ictxt, &ione, &nq, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &iyrow, &mycol ); buff = (complex16 *)getpbbuf( "PZDOTU", nq*sizeof(complex16) ); zgerv2d_( &ictxt, &nq, &ione, buff, &ione, &ixrow, &mycol ); zzdotu_( &nq, dotu, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], buff, &ione ); zgsum2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rctop ), &ione, &ione, dotu, &ione, &mone, &mycol ); } else if( myrow == iyrow ) { rctop = ptop( COMBINE, ROW, TOPGET ); zgesd2d_( &ictxt, &ione, &nq, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], &ixrow, &mycol ); buff = (complex16 *)getpbbuf( "PZDOTU", nq*sizeof(complex16) ); zgerv2d_( &ictxt, &nq, &ione, buff, &ione, &ixrow, &mycol ); zzdotu_( &nq, dotu, buff, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_] ); zgsum2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rctop ), &ione, &ione, dotu, &ione, &mone, &mycol ); } } } else if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) && ( *incy == 1 ) && ( *incy != desc_Y[M_] ) ) { /* X and Y are both distributed over a process column */ nz = (*ix-1) % desc_X[MB_]; nn = *n + nz; np = numroc_( &nn, &desc_X[MB_], &myrow, &ixrow, &nprow ); if( myrow == ixrow ) np -= nz; if( ixcol == iycol ) { if( mycol == ixcol ) { cctop = ptop( COMBINE, COLUMN, TOPGET ); zzdotu_( &np, dotu, &X[iix-1+(jjx-1)*desc_X[LLD_]], incx, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy ); zgsum2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cctop ), &ione, &ione, dotu, &ione, &mone, &mycol ); } } else { if( mycol == ixcol ) { cctop = ptop( COMBINE, COLUMN, TOPGET ); zgesd2d_( &ictxt, &np, &ione, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &myrow, &iycol ); buff = (complex16 *)getpbbuf( "PZDOTU", np*sizeof(complex16) ); zgerv2d_( &ictxt, &np, &ione, buff, &ione, &myrow, &iycol ); zzdotu_( &np, dotu, &X[iix-1+(jjx-1)*desc_X[LLD_]], incx, buff, &ione ); zgsum2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cctop ), &ione, &ione, dotu, &ione, &mone, &mycol ); } else if( mycol == iycol ) { cctop = ptop( COMBINE, COLUMN, TOPGET ); zgesd2d_( &ictxt, &np, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], &myrow, &ixcol ); buff = (complex16 *)getpbbuf( "PZDOTU", np*sizeof(complex16) ); zgerv2d_( &ictxt, &np, &ione, buff, &ione, &myrow, &ixcol ); zzdotu_( &np, dotu, buff, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy ); zgsum2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cctop ), &ione, &ione, dotu, &ione, &mone, &mycol ); } } } else /* X and Y are not distributed along the same direction */ { lcm = ilcm_( &nprow, &npcol ); if( ( *incx == 1 ) && ( *incx != desc_X[M_] ) ) { /* X is distributed over a process column */ lcmp = lcm / nprow; nz = (*jy-1) % desc_Y[NB_]; nn = *n + nz; tmp1 = nn / desc_Y[MB_]; np = numroc_( &nn, &desc_X[MB_], &myrow, &ixrow, &nprow ); np0 = MYROC0( tmp1, nn, desc_X[MB_], nprow ); tmp1 = np0 / desc_X[MB_]; wksz = MYROC0( tmp1, np0, desc_X[MB_], lcmp ); wksz = np + wksz; buff = (complex16 *)getpbbuf( "PZDOTU", wksz*sizeof(complex16) ); if( mycol == iycol ) jjy -= nz; if( myrow == ixrow ) np -= nz; pbztrnv_( &ictxt, C2F_CHAR( "R" ), C2F_CHAR( "T" ), n, &desc_Y[NB_], &nz, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], &desc_Y[LLD_], &zero, buff, &ione, &iyrow, &iycol, &ixrow, &ixcol, buff+np ); if( mycol == ixcol ) { cctop = ptop( COMBINE, COLUMN, TOPGET ); zzdotu_( &np, dotu, &X[iix-1+(jjx-1)*desc_X[LLD_]], incx, buff, &ione ); zgsum2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cctop ), &ione, &ione, dotu, &ione, &mone, &mycol ); } if( myrow == iyrow ) { rbtop = ptop( BROADCAST, ROW, TOPGET ); if( mycol == ixcol ) zgebs2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotu, &ione ); else zgebr2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotu, &ione, &myrow, &ixcol ); } } else /* Y is distributed over a process column */ { lcmp = lcm / nprow; nz = (*jx-1) % desc_X[NB_]; nn = *n + nz; tmp1 = nn / desc_X[MB_]; np = numroc_( &nn, desc_Y+MB_, &myrow, &iyrow, &nprow ); np0 = MYROC0( tmp1, nn, desc_Y[MB_], nprow ); tmp1 = np0 / desc_Y[MB_]; wksz = MYROC0( tmp1, np0, desc_Y[MB_], lcmp ); wksz = np + wksz; buff = (complex16 *)getpbbuf( "PZDOTU", wksz*sizeof(complex16) ); if( myrow == iyrow ) np -= nz; pbztrnv_( &ictxt, C2F_CHAR( "R" ), C2F_CHAR( "T" ), n, &desc_X[NB_], &nz, &X[iix-1+(jjx-1)*desc_X[LLD_]], &desc_X[LLD_], &zero, buff, &ione, &ixrow, &ixcol, &iyrow, &iycol, buff+np ); if( mycol == iycol ) { cctop = ptop( COMBINE, COLUMN, TOPGET ); zzdotu_( &np, dotu, buff, &ione, &Y[iiy-1+(jjy-1)*desc_Y[LLD_]], incy ); zgsum2d_( &ictxt, C2F_CHAR( COLUMN ), C2F_CHAR( cctop ), &ione, &ione, dotu, &ione, &mone, &mycol ); } if( myrow == ixrow ) { rbtop = ptop( BROADCAST, ROW, TOPGET ); if( mycol == iycol ) zgebs2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotu, &ione ); else zgebr2d_( &ictxt, C2F_CHAR( ROW ), C2F_CHAR( rbtop ), &ione, &ione, dotu, &ione, &myrow, &iycol ); } } } } SHAR_EOF fi # end of overwriting check if test -f 'pzlaconsb.f' then echo shar: will not over-write existing file "'pzlaconsb.f'" else cat << "SHAR_EOF" > 'pzlaconsb.f' SUBROUTINE PZLACONSB( A, DESCA, I, L, M, H44, H33, H43H34, BUF, $ LWORK ) * * Mark R. Fahey * May 28, 1999 * * * .. Scalar Arguments .. INTEGER I, L, LWORK, M COMPLEX*16 H33, H43H34, H44 * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), BUF( * ) * .. * * Purpose * ======= * * PZLACONSB looks for two consecutive small subdiagonal elements by * seeing the effect of starting a double shift QR iteration * given by H44, H33, & H43H34 and see if this would make a * subdiagonal negligible. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * A (global input) COMPLEX*16 array, dimension * (DESCA(LLD_),*) * On entry, the Hessenberg matrix whose tridiagonal part is * being scanned. * Unchanged on exit. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * I (global input) INTEGER * The global location of the bottom of the unreduced * submatrix of A. * Unchanged on exit. * * L (global input) INTEGER * The global location of the top of the unreduced submatrix * of A. * Unchanged on exit. * * M (global output) INTEGER * On exit, this yields the starting location of the QR double * shift. This will satisfy: L <= M <= I-2. * * H44 * H33 * H43H34 (global input) COMPLEX*16 * These three values are for the double shift QR iteration. * * BUF (local output) COMPLEX*16 array of size LWORK. * * LWORK (global input) INTEGER * On exit, LWORK is the size of the work buffer. * This must be at least 7*Ceil( Ceil( (I-L)/HBL ) / * LCM(NPROW,NPCOL) ) * Here LCM is least common multiple, and NPROWxNPCOL is the * logical grid size. * * Logic: * ====== * * Two consecutive small subdiagonal elements will stall * convergence of a double shift if their product is small * relatively even if each is not very small. Thus it is * necessary to scan the "tridiagonal portion of the matrix." In * the LAPACK algorithm ZLAHQR, a loop of M goes from I-2 down to * L and examines * H(m,m),H(m+1,m+1),H(m+1,m),H(m,m+1),H(m-1,m-1),H(m,m-1), and * H(m+2,m-1). Since these elements may be on separate * processors, the first major loop (10) goes over the tridiagonal * and has each node store whatever values of the 7 it has that * the node owning H(m,m) does not. This will occur on a border * and can happen in no more than 3 locations per block assuming * square blocks. There are 5 buffers that each node stores these * values: a buffer to send diagonally down and right, a buffer * to send up, a buffer to send left, a buffer to send diagonally * up and left and a buffer to send right. Each of these buffers * is actually stored in one buffer BUF where BUF(ISTR1+1) starts * the first buffer, BUF(ISTR2+1) starts the second, etc.. After * the values are stored, if there are any values that a node * needs, they will be sent and received. Then the next major * loop passes over the data and searches for two consecutive * small subdiagonals. * * Notes: * * This routine does a global maximum and must be called by all * processes. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) * .. * .. Local Scalars .. INTEGER CONTXT, DOWN, HBL, IBUF1, IBUF2, IBUF3, IBUF4, $ IBUF5, ICOL1, II, IRCV1, IRCV2, IRCV3, IRCV4, $ IRCV5, IROW1, ISRC, ISTR1, ISTR2, ISTR3, ISTR4, $ ISTR5, JJ, JSRC, LDA, LEFT, MODKM1, MYCOL, $ MYROW, NPCOL, NPROW, NUM, RIGHT, UP DOUBLE PRECISION S, TST1, ULP COMPLEX*16 CDUM, H00, H10, H11, H12, H21, H22, H33S, H44S, $ V1, V2, V3 * .. * .. External Functions .. INTEGER ILCM DOUBLE PRECISION PDLAMCH EXTERNAL ILCM, PDLAMCH * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, IGAMX2D, INFOG2L, PXERBLA, $ ZGERV2D, ZGESD2D * .. * .. Intrinsic Functions .. INTRINSIC ABS, DBLE, DIMAG, MOD * .. * .. Statement Functions .. DOUBLE PRECISION CABS1 * .. * .. Statement Function definitions .. CABS1( CDUM ) = ABS( DBLE( CDUM ) ) + ABS( DIMAG( CDUM ) ) * .. * .. Executable Statements .. * HBL = DESCA( MB_ ) CONTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) ULP = PDLAMCH( CONTXT, 'PRECISION' ) CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) LEFT = MOD( MYCOL+NPCOL-1, NPCOL ) RIGHT = MOD( MYCOL+1, NPCOL ) UP = MOD( MYROW+NPROW-1, NPROW ) DOWN = MOD( MYROW+1, NPROW ) NUM = NPROW*NPCOL * * BUFFER1 starts at BUF(ISTR1+1) and will contain IBUF1 elements * BUFFER2 starts at BUF(ISTR2+1) and will contain IBUF2 elements * BUFFER3 starts at BUF(ISTR3+1) and will contain IBUF3 elements * BUFFER4 starts at BUF(ISTR4+1) and will contain IBUF4 elements * BUFFER5 starts at BUF(ISTR5+1) and will contain IBUF5 elements * ISTR1 = 0 ISTR2 = ( ( I-L-1 ) / HBL ) IF( ISTR2*HBL.LT.( I-L-1 ) ) $ ISTR2 = ISTR2 + 1 II = ISTR2 / ILCM( NPROW, NPCOL ) IF( II*ILCM( NPROW, NPCOL ).LT.ISTR2 ) THEN ISTR2 = II + 1 ELSE ISTR2 = II END IF IF( LWORK.LT.7*ISTR2 ) THEN CALL PXERBLA( CONTXT, 'PZLACONSB', 10 ) RETURN END IF ISTR3 = 3*ISTR2 ISTR4 = ISTR3 + ISTR2 ISTR5 = ISTR3 + ISTR3 CALL INFOG2L( I-2, I-2, DESCA, NPROW, NPCOL, MYROW, MYCOL, IROW1, $ ICOL1, II, JJ ) MODKM1 = MOD( I-3+HBL, HBL ) * * Copy our relevant pieces of triadiagonal that we owe into * 5 buffers to send to whomever owns H(M,M) as M moves diagonally * up the tridiagonal * IBUF1 = 0 IBUF2 = 0 IBUF3 = 0 IBUF4 = 0 IBUF5 = 0 IRCV1 = 0 IRCV2 = 0 IRCV3 = 0 IRCV4 = 0 IRCV5 = 0 DO 10 M = I - 2, L, -1 IF( ( MODKM1.EQ.0 ) .AND. ( DOWN.EQ.II ) .AND. $ ( RIGHT.EQ.JJ ) .AND. ( M.GT.L ) ) THEN * * We must pack H(M-1,M-1) and send it diagonal down * IF( ( DOWN.NE.MYROW ) .OR. ( RIGHT.NE.MYCOL ) ) THEN CALL INFOG2L( M-1, M-1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW1, ICOL1, ISRC, JSRC ) IBUF1 = IBUF1 + 1 BUF( ISTR1+IBUF1 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF IF( ( MODKM1.EQ.0 ) .AND. ( MYROW.EQ.II ) .AND. $ ( RIGHT.EQ.JJ ) .AND. ( M.GT.L ) ) THEN * * We must pack H(M ,M-1) and send it right * IF( NPCOL.GT.1 ) THEN CALL INFOG2L( M, M-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ISRC, JSRC ) IBUF5 = IBUF5 + 1 BUF( ISTR5+IBUF5 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF IF( ( MODKM1.EQ.HBL-1 ) .AND. ( UP.EQ.II ) .AND. $ ( MYCOL.EQ.JJ ) ) THEN * * We must pack H(M+1,M) and send it up * IF( NPROW.GT.1 ) THEN CALL INFOG2L( M+1, M, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ISRC, JSRC ) IBUF2 = IBUF2 + 1 BUF( ISTR2+IBUF2 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF IF( ( MODKM1.EQ.HBL-1 ) .AND. ( MYROW.EQ.II ) .AND. $ ( LEFT.EQ.JJ ) ) THEN * * We must pack H(M ,M+1) and send it left * IF( NPCOL.GT.1 ) THEN CALL INFOG2L( M, M+1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ISRC, JSRC ) IBUF3 = IBUF3 + 1 BUF( ISTR3+IBUF3 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF IF( ( MODKM1.EQ.HBL-1 ) .AND. ( UP.EQ.II ) .AND. $ ( LEFT.EQ.JJ ) ) THEN * * We must pack H(M+1,M+1) & H(M+2,M+1) and send it * diagonally up * IF( ( UP.NE.MYROW ) .OR. ( LEFT.NE.MYCOL ) ) THEN CALL INFOG2L( M+1, M+1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW1, ICOL1, ISRC, JSRC ) IBUF4 = IBUF4 + 2 BUF( ISTR4+IBUF4-1 ) = A( ( ICOL1-1 )*LDA+IROW1 ) BUF( ISTR4+IBUF4 ) = A( ( ICOL1-1 )*LDA+IROW1+1 ) END IF END IF IF( ( MODKM1.EQ.HBL-2 ) .AND. ( UP.EQ.II ) .AND. $ ( MYCOL.EQ.JJ ) ) THEN * * We must pack H(M+2,M+1) and send it up * IF( NPROW.GT.1 ) THEN CALL INFOG2L( M+2, M+1, DESCA, NPROW, NPCOL, MYROW, $ MYCOL, IROW1, ICOL1, ISRC, JSRC ) IBUF2 = IBUF2 + 1 BUF( ISTR2+IBUF2 ) = A( ( ICOL1-1 )*LDA+IROW1 ) END IF END IF * * Add up the receives * IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN IF( ( MODKM1.EQ.0 ) .AND. ( M.GT.L ) .AND. $ ( ( NPROW.GT.1 ) .OR. ( NPCOL.GT.1 ) ) ) THEN * * We must receive H(M-1,M-1) from diagonal up * IRCV1 = IRCV1 + 1 END IF IF( ( MODKM1.EQ.0 ) .AND. ( NPCOL.GT.1 ) .AND. ( M.GT.L ) ) $ THEN * * We must receive H(M ,M-1) from left * IRCV5 = IRCV5 + 1 END IF IF( ( MODKM1.EQ.HBL-1 ) .AND. ( NPROW.GT.1 ) ) THEN * * We must receive H(M+1,M ) from down * IRCV2 = IRCV2 + 1 END IF IF( ( MODKM1.EQ.HBL-1 ) .AND. ( NPCOL.GT.1 ) ) THEN * * We must receive H(M ,M+1) from right * IRCV3 = IRCV3 + 1 END IF IF( ( MODKM1.EQ.HBL-1 ) .AND. $ ( ( NPROW.GT.1 ) .OR. ( NPCOL.GT.1 ) ) ) THEN * * We must receive H(M+1:M+2,M+1) from diagonal down * IRCV4 = IRCV4 + 2 END IF IF( ( MODKM1.EQ.HBL-2 ) .AND. ( NPROW.GT.1 ) ) THEN * * We must receive H(M+2,M+1) from down * IRCV2 = IRCV2 + 1 END IF END IF * * Possibly change owners (occurs only when MOD(M-1,HBL) = 0) * IF( MODKM1.EQ.0 ) THEN II = II - 1 JJ = JJ - 1 IF( II.LT.0 ) $ II = NPROW - 1 IF( JJ.LT.0 ) $ JJ = NPCOL - 1 END IF MODKM1 = MODKM1 - 1 IF( MODKM1.LT.0 ) $ MODKM1 = HBL - 1 10 CONTINUE * * * Send data on to the appropriate node if there is any data to send * IF( IBUF1.GT.0 ) THEN CALL ZGESD2D( CONTXT, IBUF1, 1, BUF( ISTR1+1 ), IBUF1, DOWN, $ RIGHT ) END IF IF( IBUF2.GT.0 ) THEN CALL ZGESD2D( CONTXT, IBUF2, 1, BUF( ISTR2+1 ), IBUF2, UP, $ MYCOL ) END IF IF( IBUF3.GT.0 ) THEN CALL ZGESD2D( CONTXT, IBUF3, 1, BUF( ISTR3+1 ), IBUF3, MYROW, $ LEFT ) END IF IF( IBUF4.GT.0 ) THEN CALL ZGESD2D( CONTXT, IBUF4, 1, BUF( ISTR4+1 ), IBUF4, UP, $ LEFT ) END IF IF( IBUF5.GT.0 ) THEN CALL ZGESD2D( CONTXT, IBUF5, 1, BUF( ISTR5+1 ), IBUF5, MYROW, $ RIGHT ) END IF * * Receive appropriate data if there is any * IF( IRCV1.GT.0 ) THEN CALL ZGERV2D( CONTXT, IRCV1, 1, BUF( ISTR1+1 ), IRCV1, UP, $ LEFT ) END IF IF( IRCV2.GT.0 ) THEN CALL ZGERV2D( CONTXT, IRCV2, 1, BUF( ISTR2+1 ), IRCV2, DOWN, $ MYCOL ) END IF IF( IRCV3.GT.0 ) THEN CALL ZGERV2D( CONTXT, IRCV3, 1, BUF( ISTR3+1 ), IRCV3, MYROW, $ RIGHT ) END IF IF( IRCV4.GT.0 ) THEN CALL ZGERV2D( CONTXT, IRCV4, 1, BUF( ISTR4+1 ), IRCV4, DOWN, $ RIGHT ) END IF IF( IRCV5.GT.0 ) THEN CALL ZGERV2D( CONTXT, IRCV5, 1, BUF( ISTR5+1 ), IRCV5, MYROW, $ LEFT ) END IF * * Start main loop * IBUF1 = 0 IBUF2 = 0 IBUF3 = 0 IBUF4 = 0 IBUF5 = 0 CALL INFOG2L( I-2, I-2, DESCA, NPROW, NPCOL, MYROW, MYCOL, IROW1, $ ICOL1, II, JJ ) MODKM1 = MOD( I-3+HBL, HBL ) IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) .AND. $ ( MODKM1.NE.HBL-1 ) ) THEN CALL INFOG2L( I-2, I-1, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ISRC, JSRC ) END IF * * Look for two consecutive small subdiagonal elements. * DO 20 M = I - 2, L, -1 * * Determine the effect of starting the double-shift QR * iteration at row M, and see if this would make H(M,M-1) * negligible. * IF( ( MYROW.EQ.II ) .AND. ( MYCOL.EQ.JJ ) ) THEN IF( MODKM1.EQ.0 ) THEN H22 = A( ( ICOL1-1 )*LDA+IROW1+1 ) H11 = A( ( ICOL1-2 )*LDA+IROW1 ) V3 = A( ( ICOL1-1 )*LDA+IROW1+2 ) H21 = A( ( ICOL1-2 )*LDA+IROW1+1 ) H12 = A( ( ICOL1-1 )*LDA+IROW1 ) IF( M.GT.L ) THEN IF( NUM.GT.1 ) THEN IBUF1 = IBUF1 + 1 H00 = BUF( ISTR1+IBUF1 ) ELSE H00 = A( ( ICOL1-3 )*LDA+IROW1-1 ) END IF IF( NPCOL.GT.1 ) THEN IBUF5 = IBUF5 + 1 H10 = BUF( ISTR5+IBUF5 ) ELSE H10 = A( ( ICOL1-3 )*LDA+IROW1 ) END IF END IF END IF IF( MODKM1.EQ.HBL-1 ) THEN CALL INFOG2L( M, M, DESCA, NPROW, NPCOL, MYROW, MYCOL, $ IROW1, ICOL1, ISRC, JSRC ) H11 = A( ( ICOL1-1 )*LDA+IROW1 ) IF( NUM.GT.1 ) THEN IBUF4 = IBUF4 + 2 H22 = BUF( ISTR4+IBUF4-1 ) V3 = BUF( ISTR4+IBUF4 ) ELSE H22 = A( ICOL1*LDA+IROW1+1 ) V3 = A( ( ICOL1+1 )*LDA+IROW1+1 ) END IF IF( NPROW.GT.1 ) THEN IBUF2 = IBUF2 + 1 H21 = BUF( ISTR2+IBUF2 ) ELSE H21 = A( ( ICOL1-1 )*LDA+IROW1+1 ) END IF IF( NPCOL.GT.1 ) THEN IBUF3 = IBUF3 + 1 H12 = BUF( ISTR3+IBUF3 ) ELSE H12 = A( ICOL1*LDA+IROW1 ) END IF IF( M.GT.L ) THEN H00 = A( ( ICOL1-2 )*LDA+IROW1-1 ) H10 = A( ( ICOL1-2 )*LDA+IROW1 ) END IF * * Adjust ICOL1 for next iteration where MODKM1=HBL-2 * ICOL1 = ICOL1 + 1 END IF IF( MODKM1.EQ.HBL-2 ) THEN H22 = A( ( ICOL1-1 )*LDA+IROW1+1 ) H11 = A( ( ICOL1-2 )*LDA+IROW1 ) IF( NPROW.GT.1 ) THEN IBUF2 = IBUF2 + 1 V3 = BUF( ISTR2+IBUF2 ) ELSE V3 = A( ( ICOL1-1 )*LDA+IROW1+2 ) END IF H21 = A( ( ICOL1-2 )*LDA+IROW1+1 ) H12 = A( ( ICOL1-1 )*LDA+IROW1 ) IF( M.GT.L ) THEN H00 = A( ( ICOL1-3 )*LDA+IROW1-1 ) H10 = A( ( ICOL1-3 )*LDA+IROW1 ) END IF END IF IF( ( MODKM1.LT.HBL-2 ) .AND. ( MODKM1.GT.0 ) ) THEN H22 = A( ( ICOL1-1 )*LDA+IROW1+1 ) H11 = A( ( ICOL1-2 )*LDA+IROW1 ) V3 = A( ( ICOL1-1 )*LDA+IROW1+2 ) H21 = A( ( ICOL1-2 )*LDA+IROW1+1 ) H12 = A( ( ICOL1-1 )*LDA+IROW1 ) IF( M.GT.L ) THEN H00 = A( ( ICOL1-3 )*LDA+IROW1-1 ) H10 = A( ( ICOL1-3 )*LDA+IROW1 ) END IF END IF H44S = H44 - H11 H33S = H33 - H11 V1 = ( H33S*H44S-H43H34 ) / H21 + H12 V2 = H22 - H11 - H33S - H44S S = CABS1( V1 ) + CABS1( V2 ) + CABS1( V3 ) V1 = V1 / S V2 = V2 / S V3 = V3 / S IF( M.EQ.L ) $ GO TO 30 TST1 = CABS1( V1 )*( CABS1( H00 )+CABS1( H11 )+ $ CABS1( H22 ) ) IF( CABS1( H10 )*( CABS1( V2 )+CABS1( V3 ) ).LE.ULP*TST1 ) $ GO TO 30 * * Slide indices diagonally up one for next iteration * IROW1 = IROW1 - 1 ICOL1 = ICOL1 - 1 END IF IF( M.EQ.L ) THEN * * Stop regardless of which node we are * GO TO 30 END IF * * Possibly change owners if on border * IF( MODKM1.EQ.0 ) THEN II = II - 1 JJ = JJ - 1 IF( II.LT.0 ) $ II = NPROW - 1 IF( JJ.LT.0 ) $ JJ = NPCOL - 1 END IF MODKM1 = MODKM1 - 1 IF( MODKM1.LT.0 ) $ MODKM1 = HBL - 1 20 CONTINUE 30 CONTINUE * CALL IGAMX2D( CONTXT, 'ALL', ' ', 1, 1, M, 1, L, L, -1, -1, -1 ) * RETURN * * End of PZLACONSB * END SHAR_EOF fi # end of overwriting check if test -f 'pzlacp3.f' then echo shar: will not over-write existing file "'pzlacp3.f'" else cat << "SHAR_EOF" > 'pzlacp3.f' SUBROUTINE PZLACP3( M, I, A, DESCA, B, LDB, II, JJ, REV ) * * Mark R. Fahey * May 28, 1999 * * .. Scalar Arguments .. INTEGER I, II, JJ, LDB, M, REV * .. * .. Array Arguments .. INTEGER DESCA( * ) COMPLEX*16 A( * ), B( LDB, * ) * .. * * Purpose * ======= * * PZLACP3 is an auxiliary routine that copies from a global parallel * array into a local replicated array or vise versa. Notice that * the entire submatrix that is copied gets placed on one node or * more. The receiving node can be specified precisely, or all nodes * can receive, or just one row or column of nodes. * * Notes * ===== * * Each global data object is described by an associated description * vector. This vector stores the information required to establish * the mapping between an object element and its corresponding process * and memory location. * * Let A be a generic term for any 2D block cyclicly distributed array. * Such a global array has an associated description vector DESCA. * In the following comments, the character _ should be read as * "of the global array". * * NOTATION STORED IN EXPLANATION * --------------- -------------- -------------------------------------- * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case, * DTYPE_A = 1. * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating * the BLACS process grid A is distribu- * ted over. The context itself is glo- * bal, but the handle (the integer * value) may vary. * M_A (global) DESCA( M_ ) The number of rows in the global * array A. * N_A (global) DESCA( N_ ) The number of columns in the global * array A. * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute * the rows of the array. * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute * the columns of the array. * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first * row of the array A is distributed. * CSRC_A (global) DESCA( CSRC_ ) The process column over which the * first column of the array A is * distributed. * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local * array. LLD_A >= MAX(1,LOCr(M_A)). * * Let K be the number of rows or columns of a distributed matrix, * and assume that its process grid has dimension p x q. * LOCr( K ) denotes the number of elements of K that a process * would receive if K were distributed over the p processes of its * process column. * Similarly, LOCc( K ) denotes the number of elements of K that a * process would receive if K were distributed over the q processes of * its process row. * The values of LOCr() and LOCc() may be determined via a call to the * ScaLAPACK tool function, NUMROC: * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ), * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ). * An upper bound for these quantities may be computed by: * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A * * Arguments * ========= * * M (global input) INTEGER * M is the order of the square submatrix that is copied. * M >= 0. * Unchanged on exit * * I (global input) INTEGER * A(I,I) is the global location that the copying starts from. * Unchanged on exit. * * A (global input/output) COMPLEX*16 array, dimension * (DESCA(LLD_),*) * On entry, the parallel matrix to be copied into or from. * On exit, if REV=1, the copied data. * Unchanged on exit if REV=0. * * DESCA (global and local input) INTEGER array of dimension DLEN_. * The array descriptor for the distributed matrix A. * * B (local input/output) COMPLEX*16 array of size (LDB,M) * If REV=0, this is the global portion of the array * A(I:I+M-1,I:I+M-1). * If REV=1, this is the unchanged on exit. * * LDB (local input) INTEGER * The leading dimension of B. * * II (global input) INTEGER * By using REV 0 & 1, data can be sent out and returned again. * If REV=0, then II is destination row index for the node(s) * receiving the replicated B. * If II>=0,JJ>=0, then node (II,JJ) receives the data * If II=-1,JJ>=0, then all rows in column JJ receive the data * If II>=0,JJ=-1, then all cols in row II receive the data * If II=-1,JJ=-1, then all nodes receive the data * If REV<>0, then II is the source row index for the node(s) * sending the replicated B. * * JJ (global input) INTEGER * Similar description as II above * * REV (global input) INTEGER * Use REV = 0 to send global A into locally replicated B * (on node (II,JJ)). * Use REV <> 0 to send locally replicated B from node (II,JJ) * to its owner (which changes depending on its location in A) * into the global A. * * ===================================================================== * * .. Parameters .. INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_, $ LLD_, MB_, M_, NB_, N_, RSRC_ PARAMETER ( BLOCK_CYCLIC_2D = 1, DLEN_ = 9, DTYPE_ = 1, $ CTXT_ = 2, M_ = 3, N_ = 4, MB_ = 5, NB_ = 6, $ RSRC_ = 7, CSRC_ = 8, LLD_ = 9 ) COMPLEX*16 ZERO PARAMETER ( ZERO = ( 0.0D+0, 0.0D+0 ) ) * .. * .. Local Scalars .. INTEGER COL, CONTXT, HBL, ICOL1, ICOL2, IDI, IDJ, IFIN, $ III, IROW1, IROW2, ISTOP, ISTOPI, ISTOPJ, ITMP, $ JJJ, LDA, MYCOL, MYROW, NPCOL, NPROW, ROW * .. * .. External Functions .. INTEGER NUMROC EXTERNAL NUMROC * .. * .. External Subroutines .. EXTERNAL BLACS_GRIDINFO, INFOG1L, ZGEBR2D, ZGEBS2D, $ ZGERV2D, ZGESD2D * .. * .. Intrinsic Functions .. INTRINSIC MIN, MOD * .. * .. Executable Statements .. * IF( M.LE.0 ) $ RETURN * HBL = DESCA( MB_ ) CONTXT = DESCA( CTXT_ ) LDA = DESCA( LLD_ ) * CALL BLACS_GRIDINFO( CONTXT, NPROW, NPCOL, MYROW, MYCOL ) * IF( REV.EQ.0 ) THEN DO 20 IDI = 1, M DO 10 IDJ = 1, M B( IDI, IDJ ) = ZERO 10 CONTINUE 20 CONTINUE END IF * IFIN = I + M - 1 * IF( MOD( I+HBL, HBL ).NE.0 ) THEN ISTOP = MIN( I+HBL-MOD( I+HBL, HBL ), IFIN ) ELSE ISTOP = I END IF IDJ = I ISTOPJ = ISTOP IF( IDJ.LE.IFIN ) THEN 30 CONTINUE IDI = I ISTOPI = ISTOP IF( IDI.LE.IFIN ) THEN 40 CONTINUE ROW = MOD( ( IDI-1 ) / HBL, NPROW ) COL = MOD( ( IDJ-1 ) / HBL, NPCOL ) CALL INFOG1L( IDI, HBL, NPROW, ROW, 0, IROW1, ITMP ) IROW2 = NUMROC( ISTOPI, HBL, ROW, 0, NPROW ) CALL INFOG1L( IDJ, HBL, NPCOL, COL, 0, ICOL1, ITMP ) ICOL2 = NUMROC( ISTOPJ, HBL, COL, 0, NPCOL ) IF( ( MYROW.EQ.ROW ) .AND. ( MYCOL.EQ.COL ) ) THEN IF( ( II.EQ.-1 ) .AND. ( JJ.EQ.-1 ) ) THEN * * Send the message to everyone * IF( REV.EQ.0 ) THEN CALL ZGEBS2D( CONTXT, 'All', ' ', IROW2-IROW1+1, $ ICOL2-ICOL1+1, A( ( ICOL1-1 )*LDA+ $ IROW1 ), LDA ) END IF END IF IF( ( II.EQ.-1 ) .AND. ( JJ.NE.-1 ) ) THEN * * Send the message to Column MYCOL which better be JJ * IF( REV.EQ.0 ) THEN CALL ZGEBS2D( CONTXT,