Index: Make.atlas =================================================================== RCS file: Make.atlas diff -N Make.atlas --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ Make.atlas 20 Aug 2008 03:57:53 -0000 1.2 @@ -0,0 +1,66 @@ +################################################################## +# (C) Copyright IBM Corporation 2008 +# +################################################################## + +# Platform + +ARCH := atlas + +# Tools + +SHELL := /bin/sh +CD := cd +CP := cp +LN_S := ln -s +MKDIR := mkdir +TOUCH := touch + +CC := mpicc +LINKER := mpicc +ARCHIVER := /usr/bin/ar +RANLIB := echo + +# Directories + +INCdir := $(TOPdir)/include +BINdir := $(TOPdir)/bin/$(ARCH) + +# HPL library + +HPLlib := $(TOPdir)/lib/$(ARCH)/libhpl.a + +# MPI package + +MPdir := +MPinc := +MPlib := + +# Linear Algebra Library package -- Atlas + +LAdir := /usr/local/atlas +LAinc := -I$(LAdir)/include +LAlib := -L$(LAdir)/lib -lf77blas -latlas -lgfortran + +# F2C options + +F2CDEFS := -DAdd__ -DF77_INTEGER=int -DStringSunStyle + +# HPL options + +HPL_INCS := -I$(INCdir) -I$(INCdir)/$(ARCH) $(LAinc) $(MPinc) $(CSinc) +HPL_DEFS := $(F2CDEFS) $(HPL_OPTS) $(HPL_INCS) +HPL_DEFS += -DHPL_USE_HUGE_PAGES=1 + +ifdef TIMING +HPL_DEFS += -DHPL_DETAILED_TIMING +endif + +HPL_LIBS := $(HPLlib) $(LAlib) $(MPlib) $(CSlib) + +CCNOOPT := -m64 -Wall $(HPL_DEFS) +CCFLAGS := $(CCNOOPT) -O3 -fomit-frame-pointer -funroll-loops +#CCFLAGS := $(CCNOOPT) -O0 -ggdb -g3 +LINKFLAGS := $(CCFLAGS) +ARFLAGS := -r + Index: Make.qs22 =================================================================== RCS file: Make.qs22 diff -N Make.qs22 --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ Make.qs22 20 Aug 2008 03:57:53 -0000 1.7 @@ -0,0 +1,74 @@ +################################################################## +# (C) Copyright IBM Corporation 2008 +# +################################################################## + +# Platform + +ARCH := qs22 + +# Tools + +SHELL := /bin/sh +CD := cd +CP := cp +LN_S := ln -s +MKDIR 
:= mkdir +TOUCH := touch + +CC := mpicc +LINKER := mpicc +ARCHIVER := /usr/bin/ar +RANLIB := echo + +# Directories + +INCdir := $(TOPdir)/include +BINdir := $(TOPdir)/bin/$(ARCH) + +# HPL library + +HPLlib := $(TOPdir)/lib/$(ARCH)/libhpl.a +ACLlib := $(TOPdir)/accel/lib/libhpl_accel_ppu.a + +# MPI package + +MPdir := +MPinc := +MPlib := + +# Linear Algebra Library package -- Atlas + +LAdir := /usr/local/atlas +LAinc := -I$(LAdir)/include +LAlib := -L$(LAdir)/lib -lf77blas -latlas -lgfortran + +# Cell SDK + +CSdir := /opt/cell/sdk/prototype +CSinc := -I$(CSdir)/usr/include +CSlib := -L$(CSdir)/usr/lib64 -lstdc++ -lpthread -lrt -lspe2 -lnuma + +# F2C options + +F2CDEFS := -DAdd__ -DF77_INTEGER=int -DStringSunStyle + +# HPL options + +HPL_INCS := -I$(INCdir) -I$(INCdir)/$(ARCH) $(LAinc) $(MPinc) $(CSinc) +HPL_DEFS := $(F2CDEFS) $(HPL_OPTS) $(HPL_INCS) +HPL_DEFS += -DHPL_USE_HUGE_PAGES=1 + +ifdef TIMING +HPL_DEFS += -DHPL_DETAILED_TIMING +endif + +HPL_LIBS := $(HPLlib) $(LAlib) $(MPlib) $(CSlib) $(ACLlib) + +CCNOOPT := -m64 -Wall $(HPL_DEFS) +CCNOOPT += -DHPL_CALL_ACCEL +CCFLAGS := $(CCNOOPT) -O3 -fomit-frame-pointer -funroll-loops +#CCFLAGS := $(CCNOOPT) -O0 -ggdb3 +LINKFLAGS := $(CCFLAGS) +ARFLAGS := -r + Index: Make.qs22_sdkblas =================================================================== RCS file: Make.qs22_sdkblas diff -N Make.qs22_sdkblas --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ Make.qs22_sdkblas 7 Aug 2008 13:07:08 -0000 1.4 @@ -0,0 +1,78 @@ +################################################################## +# Licensed Materials - Property of IBM. +# (C) Copyright IBM Corporation 2007 +# All Rights Reserved. +# +# US Government Users Restricted Rights - +# Use, duplication or disclosure restricted by +# GSA ADP Schedule Contract with IBM Corporation. 
+ +################################################################## + +# Platform + +ARCH := qs22_sdkblas + +# Tools + +SHELL := /bin/sh +CD := cd +CP := cp +LN_S := ln -s +MKDIR := mkdir +TOUCH := touch + +CC := mpicc +LINKER := mpicc +ARCHIVER := /usr/bin/ar +RANLIB := echo + +# Directories + +INCdir := $(TOPdir)/include +BINdir := $(TOPdir)/bin/$(ARCH) + +# HPL library + +HPLlib := $(TOPdir)/lib/$(ARCH)/libhpl.a + +# MPI package + +MPdir := +MPinc := +MPlib := + +# Linear Algebra Library package + +LAdir := /usr +LAinc := -I$(LAdir)/include +LAlib := -L$(LAdir)/lib64 -lblas + +# Cell SDK + +CSdir := /opt/cell/sdk/prototype +CSinc := -I$(CSdir)/usr/include +CSlib := -L$(CSdir)/usr/lib64 -lstdc++ -lpthread -lrt -lspe2 -lnuma + +# F2C options + +F2CDEFS := -DAdd__ -DF77_INTEGER=int -DStringSunStyle + +# HPL options + +HPL_INCS := -I$(INCdir) -I$(INCdir)/$(ARCH) $(LAinc) $(MPinc) $(CSinc) +HPL_DEFS := $(F2CDEFS) $(HPL_OPTS) $(HPL_INCS) +HPL_DEFS += -DHPL_USE_HUGE_PAGES=1 + +ifdef TIMING +HPL_DEFS += -DHPL_DETAILED_TIMING +endif + +HPL_LIBS := $(HPLlib) $(LAlib) $(MPlib) $(CSlib) + +CCNOOPT := -m64 -Wall $(HPL_DEFS) +CCFLAGS := $(CCNOOPT) -O3 -fomit-frame-pointer -funroll-loops +#CCFLAGS := $(CCNOOPT) -O0 -ggdb -g3 +LINKFLAGS := $(CCFLAGS) +ARFLAGS := -r + Index: Make.top =================================================================== RCS file: /cvsroot/hpl_qs22/Make.top,v retrieving revision 1.1 retrieving revision 1.4 diff -u -r1.1 -r1.4 --- Make.top 10 Feb 2008 21:45:50 -0000 1.1 +++ Make.top 26 Aug 2008 13:24:26 -0000 1.4 @@ -43,6 +43,8 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# ###################################################################### +# Modifications (C) Copyright IBM Corporation 2008 +# ###################################################################### # arch = UNKNOWN # @@ -51,6 +53,7 @@ ## build ############################################################### # build_src : + ( $(CD) src/accel/$(arch); $(MAKE) ) ( $(CD) src/auxil/$(arch); $(MAKE) ) ( $(CD) src/blas/$(arch); $(MAKE) ) ( $(CD) src/comm/$(arch); $(MAKE) ) @@ -78,6 +81,7 @@ - $(MKDIR) bin/$(arch) # startup_src : + - $(MAKE) -f Make.top leaf le=src/accel arch=$(arch) - $(MAKE) -f Make.top leaf le=src/auxil arch=$(arch) - $(MAKE) -f Make.top leaf le=src/blas arch=$(arch) - $(MAKE) -f Make.top leaf le=src/comm arch=$(arch) @@ -98,6 +102,7 @@ ## refresh ############################################################# # refresh_src : + - $(CP) makes/Make.accel src/accel/$(arch)/Makefile - $(CP) makes/Make.auxil src/auxil/$(arch)/Makefile - $(CP) makes/Make.blas src/blas/$(arch)/Makefile - $(CP) makes/Make.comm src/comm/$(arch)/Makefile @@ -118,6 +123,7 @@ ## clean ############################################################### # clean_src : + - ( $(CD) src/accel/$(arch); $(MAKE) clean ) - ( $(CD) src/auxil/$(arch); $(MAKE) clean ) - ( $(CD) src/blas/$(arch); $(MAKE) clean ) - ( $(CD) src/comm/$(arch); $(MAKE) clean ) @@ -138,6 +144,7 @@ ## clean_arch ########################################################## # clean_arch_src : + - $(RM) -r src/accel/$(arch) - $(RM) -r src/auxil/$(arch) - $(RM) -r src/blas/$(arch) - $(RM) -r src/comm/$(arch) @@ -165,6 +172,7 @@ ## clean_guard ######################################################### # clean_guard_src : + - ( $(CD) src/accel/$(arch); $(RM) *.grd ) - ( $(CD) src/auxil/$(arch); $(RM) *.grd ) - ( $(CD) src/blas/$(arch); $(RM) *.grd ) - ( $(CD) src/comm/$(arch); $(RM) *.grd ) Index: Makefile =================================================================== RCS file: /cvsroot/hpl_qs22/Makefile,v retrieving revision 
1.1 retrieving revision 1.4 diff -u -r1.1 -r1.4 --- Makefile 10 Feb 2008 21:45:50 -0000 1.1 +++ Makefile 26 Aug 2008 13:24:26 -0000 1.4 @@ -43,12 +43,16 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ###################################################################### +# Modifications (C) Copyright IBM Corporation 2008 +# ###################################################################### # # SHELL = /bin/sh # arch = UNKNOWN # +export TOPdir = $(shell pwd) +# ## Targets ############################################################# # all : install @@ -70,10 +74,12 @@ # build : $(MAKE) -f Make.top build_src arch=$(arch) + $(MAKE) -C accel arch=$(arch) $(MAKE) -f Make.top build_tst arch=$(arch) # clean : $(MAKE) -f Make.top clean_src arch=$(arch) + $(MAKE) -C accel clean arch=$(arch) $(MAKE) -f Make.top clean_tst arch=$(arch) # clean_arch : Index: accel/Makefile =================================================================== RCS file: accel/Makefile diff -N accel/Makefile --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/Makefile 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,25 @@ +# --------------------------------------------------------------- +# (C) Copyright IBM Corporation 2007,2008 +# +# All Rights Reserved. 
+# --------------------------------------------------------------- + +ifeq ($(arch),qs22) + +######################################################################## +# Target +######################################################################## + +DIRS = lib + +######################################################################## +# make.footer +######################################################################## + +include $(CELL_TOP)/buildutils/make.footer + +else + +all clean : + +endif Index: accel/lib/Makefile =================================================================== RCS file: accel/lib/Makefile diff -N accel/lib/Makefile --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/Makefile 20 Aug 2008 03:57:53 -0000 1.5 @@ -0,0 +1,39 @@ +# --------------------------------------------------------------- +# (C) Copyright IBM Corporation 2007,2008 +# +# --------------------------------------------------------------- + +######################################################################## +# Subdirectories +######################################################################## + +DIRS = spu + +######################################################################## +# Target +######################################################################## + +TARGET_PROCESSOR = ppu64 +LIBRARY = libhpl_accel_ppu.a + +#CC_OPT_LEVEL = -g + +CPPFLAGS = -DNDEBUG +#CPPFLAGS += -DACCEL_LITTLE_ENDIAN +CPPFLAGS += -DVALIDATE_4GB_CROSSING +CPPFLAGS += -DMATRIX_4GB_CROSSING +#CPPFLAGS += -DPANEL_4GB_CROSSING + +######################################################################## +# Local Defines +######################################################################## + +SYS_LIBS += -lspe2 -lpthread -lm + +IMPORTS = spu/hpl_accel_spu-embed64.o + +######################################################################## +# make.footer +######################################################################## + +include $(CELL_TOP)/buildutils/make.footer Index: 
accel/lib/hpl_accel.h =================================================================== RCS file: accel/lib/hpl_accel.h diff -N accel/lib/hpl_accel.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/hpl_accel.h 20 Aug 2008 03:57:53 -0000 1.13 @@ -0,0 +1,758 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#ifndef _HPL_ACCEL_H_ +#define _HPL_ACCEL_H_ + +#define M_SUB (64) /* Size of sub-blocks - M_SUB x M_SUB */ + +/* ---------------------------------------------------------------- */ +/* Inline functions for addressing matrix storage of various formats*/ +/* ---------------------------------------------------------------- */ + +/* The following inline functions compute an array index for each + * of the supported formats - column ordered, row ordered, and blocked + * (column ordered blocks, whose blocks are row ordered). + * The inputs are the row (row), the column (col), and the leading dimension + * (ld). The index is computed and returned as unsigned int, so a result that does not fit in unsigned int wraps around. + */ + +/* Column-ordered index: col*ld + row. ld is the number of elements from column n to column n+1 + */ +static inline unsigned int INDEX_COL(unsigned int row, unsigned int col, unsigned int ld) { + return (col*ld + row); +} + +/* Row-ordered index: row*ld + col. ld is the number of elements from row n to row n+1 + */ +static inline unsigned int INDEX_ROW(unsigned int row, unsigned int col, unsigned int ld) { + return (row*ld + col); +} + +/* Blocked index: (col / M_SUB)*ld selects the block column and INDEX_ROW places (row, col % M_SUB) in rows of M_SUB elements. ld is the number of elements from block column n to block column n+1. + * This can also be described as the number of elements between column + * n and column n+M_SUB + */ +static inline unsigned int INDEX_BLK(unsigned int row, unsigned int col, unsigned int ld) { + return ((col / M_SUB)*ld + INDEX_ROW( row, (col % M_SUB), M_SUB )); +} + + +/* NOTE 1: + * + * The following defines can be used to configure the code for handling + * 4GB crossings.
They include: + * + * MATRIX_4GB_CROSSING If defined then all block ordered matrices can cross a 4GB + * address boundary. However, the crossing can only occur on a + * block boundary, never within a matrix block. In addition, + * the block leading dimension must be no larger than 2^28 - 1. + * If not defined, then a matrix can not cross a 4GB + * address boundary. + * + * PANEL_4GB_CROSSING If defined then all row or column order panels (this includes + * U panels, L panels, and row buffers) may cross at most 1 4GB + * address boundary, but only on a row/column boundary. In addition, + * the leading dimension must not exceed 2^28 - 1. + * If not defined, then a panel can not cross a 4GB address boundary. + * + * VALIDATE_4GB_CROSSING If defined, then include code to validate the specified + * boundary constraints. This define is intended for debug + * purposes only. + */ + +#ifdef __PPU__ + +#include <ppu_intrinsics.h> + +/* hpl_accel_byte_swap + * ------------------- + * Convert a double from little-endian format to big-endian format. This + * function is not optimal. Recommend using hpl_accel_byte_swap_load and + * hpl_accel_byte_swap_store instead. If ACCEL_LITTLE_ENDIAN is not defined, d is returned unchanged. + */ +static inline double hpl_accel_byte_swap(double d) { +#ifdef ACCEL_LITTLE_ENDIAN + union { + unsigned long long ull; + double d; + } in, out; + + in.d = d; + out.ull = __ldbrx(&in.ull); + return (out.d); +#else + return (d); +#endif +} + + +/* hpl_accel_byte_swap_load + * ------------------------ + * Load a little endian byte ordered, double word value. If ACCEL_LITTLE_ENDIAN is not defined, *ptr is read as a double without swapping. + */ +static inline double hpl_accel_byte_swap_load(unsigned long long *ptr) +{ +#ifdef ACCEL_LITTLE_ENDIAN + union { + unsigned long long ull; + double d; + } x; + + x.ull = __ldbrx(ptr); + return (x.d); +#else + return (*((double *)ptr)); +#endif +} + + +/* hpl_accel_byte_swap_store + * ------------------------- + * Store a double word value in little endian byte ordering.
+ */ +static inline void hpl_accel_byte_swap_store(unsigned long long *ptr, double d) +{ +#ifdef ACCEL_LITTLE_ENDIAN + union { + unsigned long long ull; + double d; + } x; + + x.d = d; + __stdbrx(ptr, x.ull); +#else + *((double *)ptr) = d; +#endif +} + + +/* hpl_accel_init + * -------------- + * Initialize the HPL accelerator. If the accelerator is successfully + * initialized, then HPL_ACCEL_INIT_SUCCESS is returned, otherwise + * HPL_ACCEL_INIT_FAIL is returned. + */ + +#define HPL_ACCEL_INIT_SUCCESS 0 +#define HPL_ACCEL_INIT_FAIL (-1) + +extern int hpl_accel_init(void); + +/* hpl_accel_fini + * -------------- + * Finalize the HPL accelerator. If the accelerator successfully + * finishes, then HPL_ACCEL_FINI_SUCCESS is returned, otherwise + * HPL_ACCEL_FINI_FAIL is returned. + */ +#define HPL_ACCEL_FINI_SUCCESS 0 +#define HPL_ACCEL_FINI_FAIL (-1) + +extern int hpl_accel_fini(void); + + +/* hpl_accel_dgemm_CL_R_B_CL + * hpl_accel_dgemm_CL_B_B_CL + * ------------------------- + * Specialized accelerated DGEMM. The DGEMM computes: + * + * [c] -= [a]*[b] + * + * If a panel is specified, then the output is placed in [panel]: + * + * [panel] = [c] - [a]*[b]; + * + * m Number of rows in [a], [c], and [panel]. + * n Number of cols in [b], [c], and [panel]. + * k Number of cols in [a] and rows in [b]. + * a Column-ordered, little-endian, matrix of m rows and k columns. + * lda Leading dimension of matrix [a]. + * b Big endian matrix of k rows and n columns. This is either row ordered, + * in the case of hpl_accel_dgemm_CL_R_B_CL, or block formatted, in the + * case of hpl_accel_dgemm_CL_B_B_CL. + * ldb Leading dimension of matrix [b]. For a block formatted [b] matrix, + * this is the number of doubles to advance b from block column n to + * column n+1. + * c Block-formatted, big-endian, matrix of m rows and n columns. + * The block contents are row-ordered with the individual blocks + * that are column-ordered. Blocks are 64x64. + * ldc Leading block dimension of matrix [c].
The number of doubles + * to advance c from block column n to column n+1. + * blk_row Starting block matrix row offset. This offset is applied only to the + * [c] matrix. + * blk_col Starting block matrix column offset. This offset is applied to the [c] + * matrix and [b] matrix when it is block formatted (i.e. for + * hpl_accel_dgemm_CL_B_B_CL). + * panel Column ordered, little endian DGEMM result matrix of m rows and n columns. + * If NULL, the result is returned in [c]. + * ldp Leading dimension of [panel]. If [panel] is NULL, this must be 0. + * incomplete Pointer system variable that is first initialized to non-zero + * and asynchronously cleared when the requested operation has completed. + * If NULL, no completion notification is performed. + * + * FUNCTIONAL RESTRICTIONS: + * a Buffer may not straddle 4GB boundary (See Note 1). + * b Buffer may not straddle 4GB boundary (See Note 1). + * c Buffer may not straddle 4GB boundary (See Note 1). + * panel Buffer may not straddle 4GB boundary (See Note 1). + * + * ADDITIONAL PERFORMANCE RESTRICTIONS: + * k Must be 128 to be accelerated. + * m Optimal if a multiple of 64. Integral multiples of 64 may be accelerated. + * n Optimal if a multiple of 64. Integral multiples of 64 may be accelerated. + * a Optimal if cacheline aligned. Accelerated if [a] is quadword aligned. + * b Optimal if cacheline aligned. Accelerated if [b] is quadword aligned. + * c Optimal if cacheline aligned. Accelerated if [c] is quadword aligned. + * panel Optimal if cacheline aligned. Accelerated if [panel] is quadword aligned. + * lda Optimal if a multiple of 16. Accelerated if lda is even. + * ldb Optimal if a multiple of 16. Accelerated if ldb is even. + * ldc Optimal if a multiple of 16. Accelerated if ldc is even. + * ldp Optimal if a multiple of 16. Accelerated if ldp is even. + * blk_row Must be a multiple of M_SUB in order to be SPE accelerated. + * blk_col Must be a multiple of M_SUB in order to be SPE accelerated.
+ */ + +extern void hpl_accel_dgemm_CL_R_B_CL(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned int blk_row, unsigned int blk_col, + double *panel, int ldp, + unsigned long long *incomplete); + +extern void hpl_accel_dgemm_CL_B_B_CL(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned int blk_row, unsigned int blk_col, + double *panel, int ldp, + unsigned long long *incomplete); + + +/* hpl_accel_dgemm_C_C_C + * ------------------------- + * Specialized accelerated DGEMM. The DGEMM computes: + * + * [c] -= [a]*[b] + * + * m Number of rows in [a] and [c]. + * n Number of cols in [b] and [c]. + * k Number of cols in [a] and rows in [b]. + * a Column-ordered, big-endian, matrix of m rows and k columns. + * lda Leading dimension of matrix [a]. + * b Column-ordered, big endian matrix of k rows and n columns. + * ldb Leading dimension of matrix [b]. + * c Column-ordered, big-endian, matrix of m rows and n columns. + * ldc Leading block dimension of matrix [c]. + * incomplete Pointer system variable that is first initialized to non-zero + * and asynchronously cleared when the requested operation has completed. + * If NULL, no completion notification is performed. + * + * FUNCTIONAL RESTRICTIONS: + * a Buffer may not straddle 4GB boundary (See Note 1). + * c Buffer may not straddle 4GB boundary (See Note 1). + * + * ADDITIONAL PERFORMANCE RESTRICTIONS: + * k Must be a multiple of 4 and no bigger than 64 to be accelerated. + * m Optimal if a multiple of 16. Integral multiples of 8 may be accelerated. + * n Optimal if a multiple of 4. + * a Optimal if cacheline aligned. Accelerated if [a] is quadword aligned. + * b Optimal if cacheline aligned. Accelerated if [b] is quadword aligned. + * c Optimal if cacheline aligned. Accelerated if [c] is quadword aligned. + * lda Optimal if a multiple of 16. Accelerated if lda is even. + * ldb Optimal if a multiple of 16. 
Accelerated if ldb is even. + * ldc Optimal if a multiple of 16. Accelerated if ldc is even. + */ + +extern void hpl_accel_dgemm_C_C_C(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned long long *incomplete); + +/* hpl_accel_dtrsm_CL_R_B + * ---------------------- + * Specialized accelerated DTRSM. The DTRSM solves for [x] the matrix equation + * + * [a]*[x] = [b] + * + * where [a] is a unit lower triangular matrix. The solution is returned in [b] unless + * [c] is non-NULL, in which case the solution is returned in [c]. + * + * m Number of rows in [b], number of columns in [a]. + * n Number of columns in [b]. + * a Column-ordered, little-endian, unit lower triangle matrix of + * dimension lda rows by m columns. + * lda Leading dimension of matrix [a]. + * b Row-order, big-endian, matrix of m rows and n columns. On entry + * contains the right-hand side matrix and is overwritten by the + * solution matrix [x]. + * ldb Leading dimension of matrix [b]. + * c Block-formatted, big-endian, matrix. The block contents are + * row-ordered with the individual blocks that are column-ordered. + * Blocks are 64x64. If non-NULL, the solution is returned in the + * row of blocks in [c] instead of [b]. This must point to the start + * of a matrix block. + * ldc Leading block dimension of matrix [c]. The number of doubles + * to advance c from block column to the next block column. If [c] + * is NULL, then ldc should also be 0. + * blk_row Starting [c] block matrix row offset. If [c] is NULL, then blk_row + * must also be 0. + * blk_col Starting [c] block matrix column offset. If [c] is NULL, then blk_col + * must also be 0. + * incomplete Pointer system variable that is first initialized to non-zero + * and asynchronously cleared when the requested operation has completed. + * If NULL, no notification is performed. + * + * FUNCTIONAL RESTRICTIONS: + * a Buffer may not straddle 4GB boundary (See Note 1).
+ * b Buffer may not straddle 4GB boundary (See Note 1). + * c Buffer may not straddle 4GB boundary (See Note 1). + * + * ADDITIONAL PERFORMANCE RESTRICTIONS: + * m Must be 128 to be accelerated. + * n Optimal if a multiple of 16. Integral multiples of 16 may be accelerated. + * a Optimal if cacheline aligned. Accelerated if [a] is quadword aligned. + * b Optimal if cacheline aligned. Accelerated if [b] is quadword aligned. + * c Optimal if cacheline aligned. Accelerated if [c] is quadword aligned. + * lda Optimal if a multiple of 16. Accelerated if lda is even. + * ldb Optimal if a multiple of 16. Accelerated if ldb is even. Memory throughput + * is maximized if ldb is NOT an integral multiple of 256. + * ldc Optimal if a multiple of 16. Accelerated if ldc is even. + * blk_col Must be a multiple of 16 in order to be SPE accelerated. This is a current + * implementation restriction. + */ + +extern void hpl_accel_dtrsm_CL_R_B(int m, int n, + const double *a, int lda, + double *b, int ldb, + double *c, int ldc, + unsigned int blk_row, unsigned int blk_col, + unsigned long long *incomplete); + + +/* hpl_accel_dtrsm_CL_B + * -------------------- + * Specialized accelerated DTRSM. The DTRSM solves for [x] the matrix equation + * + * [a]*[x] = [b] + * + * where a is unit lower triangle matrix. The solution is returned in [b]. + * + * m Number of rows in [b], number of column in [a]. + * n Number of columns in [b]. + * a Column-ordered, little-endian, unit lower triangle matrix of + * dimension lda rows by m columns. + * lda Leading dimension of matrix [a]. + * b Block-formatted, big-endian, matrix of m rows and n columns. + * The block contents are row-ordered with the individual blocks + * that are column-ordered. Blocks are 64x64. On entry contains + * the right-hand side matrix and is overwritten by the + * solution matrix [x]. This must point to the start + * of a matrix block. + * ldb Leading dimension of matrix [b]. 
The number of doubles + * to advance b from block column to the next block column. + * blk_row Starting [b] block matrix row offset. + * blk_col Starting [b] block matrix column offset. + * incomplete Pointer system variable that is first initialized to non-zero + * and asynchronously cleared when the requested operation has completed. + * If NULL, no notification is performed. + * + * FUNCTIONAL RESTRICTIONS: + * a Buffer may not straddle 4GB boundary (See Note 1). + * b Buffer may not straddle 4GB boundary (See Note 1). + * + * ADDITIONAL PERFORMANCE RESTRICTIONS: + * m Must be 128 to be accelerated. + * n Optimal if a multiple of 16. Integral multiples of 16 may be accelerated. + * a Optimal if cacheline aligned. Accelerated if [a] is quadword aligned. + * b Optimal if cacheline aligned. Accelerated if [b] is quadword aligned. + * lda Optimal if a multiple of 16. Accelerated if lda is even. + * ldb Optimal if a multiple of 16. Accelerated if ldb is even. + * blk_col Must be a multiple of 16 in order to be SPE accelerated. This is a current + * implementation restriction. + */ + +extern void hpl_accel_dtrsm_CL_B(int m, int n, + const double *a, int lda, + double *b, int ldb, + unsigned int blk_row, unsigned int blk_col, + unsigned long long *incomplete); + + + +/* hpl_accel_reform_panel_CL_to_B + * ------------------------------- + * Copy and reformat the L panel from the panel buffer pointed to by panel into matrix [a]. + * The input L panel is assumed to be column-order, little endian with a leading dimension of ldp. + * The matrix is assumed to be constructed in 64x64 element, row-ordered, big-endian blocks. The blocks + * are assumed to be column ordered. + * + * m Number of rows of panel to copy to [a] + * n Number of columns of panel to copy to [a] + * a Block formatted matrix. a points to the location within [a] to receive the + * data being copied and reformatted from panel + * lda Leading dimension of matrix [a].
This contains the number of doubles to + * advance a from block column n to column n+1. + * panel Pointer to the L panel containing the data to be reformatted and copied to + * matrix [a]. The [panel] is column-ordered, little endian. + * ldp Leading dimension of the panel. This is the number of doubles between + * column n and column n+1 + * incomplete Pointer system variable that is first initialized to non-zero + * and asynchronously cleared when the requested operation has completed. + * If NULL, no completion notification is performed. + * + * FUNCTIONAL RESTRICTIONS: + * + * ADDITIONAL PERFORMANCE RESTRICTIONS: + * + */ + +void hpl_ref_reform_panel_CL_to_B(int m, int n, + double *a, int lda, + double *panel, int ldp, + unsigned long long *incomplete); + + +/* hpl_accel_reform_matrix_CL_to_B + * ------------------------------- + * Reformat the matrix [a] in place from column-ordered, little-endian to blocked, big-endian format. The blocked + * format is 64x64, row-ordered blocks with the blocks being column ordered. The pad between the columns of + * blocks is zero filled. + * + * m Number of rows in [a]. If m is not a multiple of 64, then the additional rows needed to + * pad [a] to a multiple of 64 rows are zeroed. + * n Number of cols in [a]. + * a Column-ordered, little-endian, matrix of m rows and n columns. + * lda Leading dimension of matrix [a]. + * scratch Scratch buffer used to assist the reformatting of [a]. The scratch buffer + * must be at least 64*roundup(m,64) elements. + * size The size (number of elements) of the scratch buffer. The scratch buffer + * must be at least approximately 64*m elements. In general, better performance + * is achieved if the scratch buffer is larger and more SPEs can be deployed + * to the problem. + * incomplete Pointer system variable that is first initialized to non-zero + * and asynchronously cleared when the requested operation has completed. + * If NULL, no completion notification is performed.
+ * + * FUNCTIONAL RESTRICTIONS: + * n Must be an integral multiple of 64. + * a Must be quadword aligned and buffer may not straddle 4GB boundary (See Note 1). + * lda Must be even and at least roundup(m,64). + * scratch Must be quadword aligned and must not straddle 4GB boundary. + * size Must be at least 64*roundup(m,64). + * + * ADDITIONAL PERFORMANCE RESTRICTIONS: + * a Must be cacheline aligned. + * lda Must be a multiple of 16. + * scratch Must be cacheline aligned. + * size Must be at least 4*64*m for optimal performance. + * + * Note: For 4GB crossing support, the matrix a is considered to be a block "matrix". + */ +extern void hpl_accel_reform_matrix_CL_to_B(int m, int n, + double *a, int lda, + double *scratch, int size, + unsigned long long *incomplete); + + + +/* hpl_accel_reform_panel_B_to_CL + * ------------------------------- + * Copy and reformat the L panel from matrix [a] into the panel buffer pointed to by panel. + * The matrix is assumed to be constructed in 64x64 element, row-ordered, big-endian blocks. The blocks + * are assumed to be column ordered. The output L panel is assumed to be column-order, little endian + * with a leading dimension of ldp. + * + * m Number of rows of [a] to copy to panel + * n Number of columns of [a] to copy to panel + * panel Pointer to the L panel extracted and reformatted from matrix [a]. The + * [panel] is column-ordered, little-endian. + * ldp Leading dimension of the panel. + * a Block formatted matrix. a points to the start of the panel to be reformatted + * and copied into [panel]. + * lda Leading dimension of matrix [a]. This contains the number of doubles to + * advance a from block column n to column n+1. + * incomplete Pointer system variable that is first initialized to non-zero + * and asynchronously cleared when the requested operation has completed. + * If NULL, no completion notification is performed. + * + * FUNCTIONAL RESTRICTIONS: + * m Must be a multiple of 64.
+ * panel Must be quadword aligned and the buffer may not straddle 4GB boundary (See Note 1). + * ldp Must be even and at least m. + * a Must be quadword aligned and may not straddle a 4GB boundary (See Note 1). + * lda Must be even and at least m*M_SUB. + * + * ADDITIONAL PERFORMANCE RESTRICTIONS: + * panel Must be cacheline aligned. + * ldp Must be a multiple of 16. + * a Must be cacheline aligned. + * lda Must be a multiple of 16. + */ +extern void hpl_accel_reform_panel_B_to_CL(int m, int n, + double *panel, int ldp, + double *a, int lda, + unsigned long long *incomplete); + + + +/* hpl_accel_reform_panel_R_to_B + * ------------------------------- + * Copy and reformat a U panel from a row buffer pointed to by panel into matrix [a]. + * The input U panel is assumed to be row-order, big endian with a leading dimension of ldp. + * The matrix is assumed to be constructed in 64x64 element, row-ordered, big-endian blocks. + * The blocks are assumed to be column ordered. + * + * m Number of rows of panel to copy to [a] + * n Number of columns of panel to copy to [a] + * a Block formatted matrix. a points to the location within [a] to receive the + * data being copied and reformatted from panel + * lda Leading dimension of matrix [a]. This contains the number of doubles to + * advance a from block column n to column n+1. + * panel Pointer to the U panel containing the data to be reformatted and copied to + * matrix [a]. The [panel] is row-ordered, big-endian. + * ldp Leading dimension of the panel. This is the number of doubles between + * row n and row n+1 + * incomplete Pointer system variable that is first initialized to non-zero + * and asynchronously cleared when the requested operation has completed. + * If NULL, no completion notification is performed. + * + * FUNCTIONAL RESTRICTIONS: + * m None + * panel Must be quadword aligned and the buffer may not straddle 4GB boundary (See Note 1). + * ldp Must be even and at least n.
+ * a Must be quadword aligned and may not straddle a 4GB boundary (See Note 1). + * lda Must be even and at least m*M_SUB. + * + * ADDITIONAL PERFORMANCE RESTRICTIONS: + * panel Must be cacheline aligned. + * ldp Must be a multiple of 16. + * a Must be cacheline aligned. + * lda Must be a multiple of 16. + */ +extern void hpl_accel_reform_panel_R_to_B(int m, int n, + double *a, int lda, + double *panel, int ldp, + unsigned long long *incomplete); + + +/* hpl_accel_reform_rows_R_to_B + * hpl_accel_reform_rows_B_to_R + * ---------------------------- + * Copy and reformat a set of rows between row ordered and block ordered formats. + * hpl_accel_reform_rows_R_to_B reformats rows into blocks and hpl_accel_reform_rows_B_to_R + * reformats blocks into rows. These functions are expected to be used to gather/scatter winners + * and losers when pivoting so that rows are coalesced into large DMAs for efficient transfer. + * No endian swapping is performed on the data. Block data is assumed to be ordered in 64x64, + * row ordered elements. The blocks themselves are column ordered. + * + * m Number of rows to copy. Specifies the number of entries in the blk_rows array. + * n Number of values (doubles) per row to copy. + * rows Pointer to the data rows to be reformatted and copied to/from matrix [a]. + * ldr Leading dimension of the row buffer. This is the number of doubles between + * rows of the [rows] buffer. + * a Block formatted matrix. + * lda Leading dimension of matrix [a]. This contains the number of doubles to + * advance a from block column n to column n+1. + * blk_rows Array of row indices. blk_rows specifies starting [a] block matrix row offset + * for each of the m rows. + * blk_col Starting [a] block matrix column offset. + * incomplete Pointer system variable that is first initialized to non-zero + * and asynchronously cleared when the requested operation has completed. + * If NULL, no completion notification is performed. 
+ * + * FUNCTIONAL RESTRICTIONS: + * rows Buffer must not straddle 4GB boundary. + * a Buffer may not straddle 4GB boundary (See Note 1). + * + * ADDITIONAL PERFORMANCE RESTRICTIONS: + * rows Optimal if cacheline aligned. Accelerated if [rows] is quadword aligned. + * ldr Optimal if a multiple of 16. Accelerated if ldr is even. + * a Optimal if cacheline aligned. Accelerated if [a] is quadword aligned. + * lda Optimal if a multiple of 16. Accelerated if lda is even. + * blk_col Optimal if a multiple of 16. Accelerated if blk_col is even. + */ + +extern void hpl_accel_reform_rows_R_to_B(int m, int n, + double *rows, int ldr, + double *a, int lda, + int *blk_rows, int blk_col, + unsigned long long *incomplete); + +extern void hpl_accel_reform_rows_B_to_R(int m, int n, + double *rows, int ldr, + double *a, int lda, + int *blk_rows, int blk_col, + unsigned long long *incomplete); + +/* hpl_accel_swap_rows_B_to_B + * ---------------------------- + * Swap a set of rows in block ordered format. + * hpl_accel_swap_rows_B_to_B swaps a set of rows pairwise in a block-formatted matrix. + * No endian swapping is performed on the data. Block data is assumed to be ordered in 64x64, + * row ordered elements. The blocks themselves are column ordered. + * + * m Number of rows to swap. Specifies the number of entries in the blk_rows array. + * n Number of values (doubles) per row to copy. + * a Block formatted matrix. + * lda Leading dimension of matrix [a]. This contains the number of doubles to + * advance a from block column n to column n+1. + * blk_rows Array of row indices. blk_rows specifies starting [a] block matrix row offset + * for each of the m rows. + * blk_col Starting [a] block matrix column offset. + * incomplete Pointer system variable that is first initialized to non-zero + * and asynchronously cleared when the requested operation has completed. + * If NULL, no completion notification is performed. 
+ * + * FUNCTIONAL RESTRICTIONS: + * a Buffer may not straddle 4GB boundary (See Note 1). + * + * ADDITIONAL PERFORMANCE RESTRICTIONS: + * a Optimal if cacheline aligned. Accelerated if [a] is quadword aligned. + * lda Optimal if a multiple of 16. Accelerated if lda is even. + * blk_col Optimal if a multiple of 16. Accelerated if blk_col is even. + */ + +extern void hpl_accel_swap_rows_B_to_B(int m, int n, + double *a, int lda, + int *blk_rows, int blk_col, + unsigned long long *incomplete); + +/* hpl_accel_copy_rows_R_to_R + * ---------------------------- + * Copy a set of rows in row ordered format. + * hpl_accel_copy_rows_R_to_R copies a set of rows from row-oriented matrix a to + * row-oriented matrix b. + * No endian swapping is performed on the data. + * + * m Number of rows to copy. Specifies the number of entries in the rows array. + * n Number of values (doubles) per row to copy. + * a Pointer to the source data rows to be copied to row-ordered matrix b. + * lda Leading dimension of the row-ordered source matrix a. + * b Pointer to the row-ordered destination matrix. + * ldb Leading dimension of the row-ordered destination matrix b. + * rows Array of row indices. rows specifies the destination row address in row-ordered + * matrix b to receive source row from matrix a. + * incomplete Pointer system variable that is first initialized to non-zero + * and asynchronously cleared when the requested operation has completed. + * If NULL, no completion notification is performed. + * + * FUNCTIONAL RESTRICTIONS: + * + * ADDITIONAL PERFORMANCE RESTRICTIONS: + * a Optimal if cacheline aligned. Accelerated if [a] is quadword aligned. + * lda Optimal if a multiple of 16. Accelerated if lda is even. + * b Accelerated if [b] is quadword aligned. + * ldb Accelerated if ldb is even. + */ + +extern void hpl_accel_copy_rows_R_to_R(int m, int n, + double *a, int lda, + double *b, int ldb, + int *rows, + unsigned long long *incomplete); + +/* REFERENCE FUNCTIONS. + * + * These functions are non-accelerated implementations that run on the PPU.
+ * + * They may not place the same functional and performance restrictions as the + * SPU accelerated functions. + */ + +extern int hpl_ref_init(); + +extern void hpl_ref_dgemm_CL_R_B(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned long long *incomplete); + +extern void hpl_ref_dgemm_CL_B_B(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned long long *incomplete); + +extern void hpl_ref_dgemm_CL_R_B_CL(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned int blk_row, unsigned int blk_col, + double *panel, int ldp, + unsigned long long *incomplete); + +extern void hpl_ref_dgemm_CL_B_B_CL(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned int blk_row, unsigned int blk_col, + double *panel, int ldp, + unsigned long long *incomplete); + +extern void hpl_ref_dgemm_C_C_C(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned long long *incomplete); + +extern void hpl_ref_dtrsm_CL_R(int m, int n, + const double *a, int lda, + double *b, int ldb, + unsigned long long *incomplete); + +extern void hpl_ref_dtrsm_CL_B(int m, int n, + const double *a, int lda, + double *b, int ldb, + unsigned int blk_row, unsigned int blk_col, + unsigned long long *incomplete); + +extern void hpl_ref_dtrsm_CL_R_B(int m, int n, + const double *a, int lda, + double *b, int ldb, + double *c, int ldc, + unsigned int blk_row, unsigned int blk_col, + unsigned long long *incomplete); + +extern void hpl_ref_reform_matrix_CL_to_B(int m, int n, + double *a, int lda, + double *scratch, int size, + unsigned long long *incomplete); + +extern void hpl_ref_reform_panel_B_to_CL(int m, int n, + double *panel, int ldp, + double *a, int lda, + unsigned long long *incomplete); + +extern void hpl_ref_reform_panel_R_to_B(int m, 
int n, + double *a, int lda, + double *panel, int ldp, + unsigned long long *incomplete); + +extern void hpl_ref_reform_rows_R_to_B(int m, int n, + double *rows, int ldr, + double *a, int lda, + int *blk_rows, int blk_col, + unsigned long long *incomplete); + +extern void hpl_ref_reform_rows_B_to_R(int m, int n, + double *rows, int ldr, + double *a, int lda, + int *blk_rows, int blk_col, + unsigned long long *incomplete); + +extern void hpl_ref_swap_rows_B_to_B(int m, int n, + double *a, int lda, + int *blk_rows, int blk_col, + unsigned long long *incomplete); + +extern void hpl_ref_copy_rows_R_to_R(int m, int n, + double *a, int lda, + double *b, int ldb, + int *rows, + unsigned long long *incomplete); + +#endif /* __PPU__ */ + +#endif /* _HPL_ACCEL_H_ */ Index: accel/lib/hpl_accel_copy.c =================================================================== RCS file: accel/lib/hpl_accel_copy.c diff -N accel/lib/hpl_accel_copy.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/hpl_accel_copy.c 20 Aug 2008 03:57:53 -0000 1.4 @@ -0,0 +1,98 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include "hpl_accel.h" +#include "hpl_accel_spu.h" +#include "ppu_intrinsics.h" + +/* General purpose, reference, reformating facilities. 
+ */ + +/* hpl_accel_copy_rows_R_to_R + * -------------------------- + * Copy row i of row-ordered matrix [a] to row rows[i] of row-ordered matrix [b], + * for i in 0..m-1, n doubles per row. + * + * If lda and ldb are even, a and b are quadword aligned, n > 1 and m > 0, the first + * (n & ~1) columns are queued for the SPEs, at most sizeof(parms->rows)/sizeof(int) + * rows per command, and [incomplete] is attached to the last command only. + * Otherwise [incomplete] (when non-NULL) is cleared before returning. Any columns + * not queued for the SPEs are copied by the loop at the end of this function. + */ +void hpl_accel_copy_rows_R_to_R(int m, int n, + double *a, int lda, + double *b, int ldb, + int *rows, + unsigned long long *incomplete) +{ + unsigned int non_aligned; + int n0 = 0; + + non_aligned = (((unsigned int)(lda | ldb) & 1) | + (((unsigned int)((uintptr_t)a) | (uintptr_t)b) & (16-1))); + + /* Require m > 0: with no rows, no command is queued below, so a completion + * variable armed by init_incomplete() would have nothing to clear it. + */ + if ((non_aligned == 0) && (n > 1) && (m > 0)) { + int m_start, m_left, m_per_cmd; + unsigned int idx; + volatile hpl_accel_copy_rows_parms_t *parms; + int i; + + init_incomplete(incomplete, HPL_ACCEL_SPES); + + n0 = n & ~1; + + idx = hpl_accel_cmd_idx; + + m_start = 0; + m_left = m; + + /* Generate multiple command requests if the number of rows + * is greater than what will fit in a single command request. + */ + m_per_cmd = (int)(sizeof(parms->rows) / sizeof(int)); + + while (m_left > 0) { + + parms = (volatile hpl_accel_copy_rows_parms_t *)(&hpl_accel_cmd_queue[idx]); + + parms->m = (m_left < m_per_cmd) ? m_left : m_per_cmd; + parms->n = n0; + parms->lda = lda * sizeof(double); + parms->ldb = ldb * sizeof(double); + + parms->a = a + m_start * lda; + parms->b = b; + + /* Only the final command carries the completion variable. */ + parms->incomplete = (parms->m < m_left) ? NULL : incomplete; + + for (i=0; i < parms->m; i++) parms->rows[i] = rows[m_start+i]; + + /* Perform a sync in order to ensure that the parameters are written + * to memory before writing to the mailbox command queue. + */ + __sync(); + + /* Send the command to each of the SPEs. + */ + send_cmd_to_spes(HPL_ACCEL_CMD_COPY_ROWS_R_TO_R, idx, HPL_ACCEL_SPES); + + idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES; + + m_start += parms->m; + m_left -= parms->m; + } + + /* Record the next free queue entry so that later commands do not reuse + * the entries just handed to the SPEs. + */ + hpl_accel_cmd_idx = idx; + } else { + if (incomplete) *incomplete = 0; + } + + /* Cleanup portions of the rows not implemented by the SPEs above.
+ */ + if (n0 < n) { + unsigned int y1, y2, x; + double *src, *dst; + + /* For each of the rows */ + for (y1=0; y1<(unsigned int)m; y1++) { + y2 = rows[y1]; /* New location for row y1 */ + src = a + (y1 * lda); + dst = b + (y2 * ldb); + for (x=n0; x<(unsigned int)n; x++) { + dst[x] = src[x]; + } + } + } +} + Index: accel/lib/hpl_accel_dgemm.c =================================================================== RCS file: accel/lib/hpl_accel_dgemm.c diff -N accel/lib/hpl_accel_dgemm.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/hpl_accel_dgemm.c 20 Aug 2008 03:57:53 -0000 1.12 @@ -0,0 +1,495 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ +#include +#include +#include "hpl_accel.h" +#include "hpl_accel_spu.h" +#include "hpl_accel_global.h" + +#include + +static void _dgemm_CL_R_B_CL(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned int blk_row, unsigned int blk_col, + double *panel, int ldp) +{ + unsigned int i, x, y; + unsigned long long *a_ptr; + double a_val, b_val, *p; + + if (panel) { + /* Write the result into the panel buffer. We first perform the computation, + * placing the result into [panel]. Then byte swap panel.
+ */ + p = panel; + for (x=0; x<(unsigned int)n; x++, p += ldp-m) { + a_ptr = (unsigned long long *)a; + b_val = b[INDEX_ROW(0,x,ldb)]; + + for (y=0; y<(unsigned int)m; y++) { + a_val = hpl_accel_byte_swap_load(a_ptr++); + *p++ = c[INDEX_BLK(y+blk_row,x+blk_col,ldc)] - a_val * b_val; + } + } + + a += lda; + for (i=1; i<(unsigned int)k; i++, a+=lda) { + p = panel; + for (x=0; x<(unsigned int)n; x++, p += ldp-m) { + a_ptr = (unsigned long long *)a; + b_val = b[INDEX_ROW(i,x,ldb)]; + + for (y=0; y<(unsigned int)m; y++) { + a_val = hpl_accel_byte_swap_load(a_ptr++); + *p++ -= a_val * b_val; + } + } + } +#ifdef ACCEL_LITTLE_ENDIAN + /* Byte swap panel buffer + */ + unsigned long long *p_ptr = (unsigned long long *)panel; + for (x=0; x<(unsigned int)n; x++, p_ptr += ldp-m) { + for (y=0; y<(unsigned int)m; y++, p_ptr++) { + __stdbrx(p_ptr, *p_ptr); + } + } +#endif + } else { + /* Write the result into the c matrix. + */ + for (i=0; i<(unsigned int)k; i++, a+=lda) { + a_ptr = (unsigned long long *)a; + for (y=0; y<(unsigned int)m; y++) { + a_val = hpl_accel_byte_swap_load(a_ptr++); + for (x=0; x<(unsigned int)n; x++) { + c[INDEX_BLK(y+blk_row,x+blk_col,ldc)] -= a_val * b[INDEX_ROW(i,x,ldb)]; + } + } + } + } +} + + +/* hpl_accel_dgemm_CL_R_B_CL + * ------------------------- + * Subtract the product of [a] and row-indexed [b] from block-indexed [c]. + * When [panel] is non-NULL the HPL_ACCEL_CMD_DGEMM_PANEL command is issued and the + * PPU cleanup writes its results to [panel]; otherwise HPL_ACCEL_CMD_DGEMM is issued + * and the cleanup updates [c] in place. + * + * The SPEs are used only when m >= M_SUB, n >= M_SUB, k == M and the alignment test + * below passes; they are given the leading m0 x n0 region (multiples of M_SUB). + * The remaining columns and rows are computed on the PPU by _dgemm_CL_R_B_CL. + * If the SPEs are not used, [incomplete] (when non-NULL) is cleared here. + */ +void hpl_accel_dgemm_CL_R_B_CL(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned int blk_row, unsigned int blk_col, + double *panel, int ldp, + unsigned long long *incomplete) +{ + int n0; + int m0 = 0; + unsigned int cmd; + unsigned int idx; + unsigned int aligned, bc, br; + volatile hpl_accel_dgemm_parms_t *parms; + + /* Do as much of the dgemm as possible using the blocked dgemm SPU specialist. + * This specialist assumes: + * m is at least M_SUB + * n is at least M_SUB + * k is equal to M + * a is quadword aligned. A multiple of 16 for optimal DMA performance + * b is quadword aligned. A multiple of 16 for optimal DMA performance + * c is quadword aligned.
A multiple of 16 for optimal DMA performance + * panel is quadword aligned. A multiple of 16 for optimal DMA performance + * lda is even (qword aligned rows). A multiple of 16 for optimal DMA + * performance + * ldb is even (qword aligned rows). A multiple of 16 for optimal DMA + * performance + * ldc is even (qword aligned rows). A multiple of 16 for optimal DMA + * performance + * ldp is even (qword aligned rows). A multiple of 16 for optimal DMA + * performance + * blk_col is a multiple of M_SUB + * blk_row is a multiple of M_SUB + */ + bc = blk_col/M_SUB; + br = blk_row/M_SUB; + + c += (ldc * bc) + br*(M_SUB*M_SUB); + + blk_col %= M_SUB; + blk_row %= M_SUB; + + aligned = (blk_row | blk_col | + ((unsigned int)(lda | ldb | ldc | ldp) & 1) | + ((unsigned int)((uintptr_t)a | (uintptr_t)b | (uintptr_t)c | (uintptr_t)panel) & (16-1))); + + + if ((m >= M_SUB) && (n >= M_SUB) && (k == M) && (aligned == 0)) { + /* Either all or a portion of the computation can be done by the SPE accelerators. + */ + m0 = (m/M_SUB) * M_SUB; + n0 = (n/M_SUB) * M_SUB; + + /* Verify 4GB boundary expectation.
+ */ + VALIDATE_PANEL_4GB_CROSSING(a, k, lda); + VALIDATE_PANEL_4GB_CROSSING(b, k, ldb); + VALIDATE_MATRIX_4GB_CROSSING(c, m0, n0, ldc); + VALIDATE_PANEL_4GB_CROSSING(panel, n0, ldp); + + idx = hpl_accel_cmd_idx; + + parms = (volatile hpl_accel_dgemm_parms_t *)(&hpl_accel_cmd_queue[idx]); + + /* Place the parameters into a command queue buffer + */ + parms->a = a; + parms->b = b; + parms->c = c; + parms->lda = lda * sizeof(double); + parms->ldb = ldb * sizeof(double); + parms->ldc = ldc * sizeof(double); + parms->n = n0 / M_SUB; + parms->m = m0 / M_SUB; + parms->b_blk = 0; + parms->incomplete = incomplete; + COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->a_count, a, lda, M); + COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->b_count, b, ldb, M); + COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->p_count, panel, ldp, n); + + init_incomplete(incomplete, HPL_ACCEL_SPES); + + /* Send the command to each of the SPEs. + */ + hpl_accel_cmd_idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES; + + if (panel) { + parms->p = panel; + parms->ldp = ldp * sizeof(double); + cmd = HPL_ACCEL_CMD_DGEMM_PANEL; + } else { + cmd = HPL_ACCEL_CMD_DGEMM; + } + + /* Perform a sync in order to ensure that the parameters are written to + * memory before writing to the mailbox command queue. + */ + __sync(); + + send_cmd_to_spes(cmd, idx, HPL_ACCEL_SPES); + + /* Complete any remaining portion on the right side. That is when n is not a multiple + * of M_SUB. Only offset [panel] when one was supplied: offsetting a NULL panel + * could yield a non-NULL pointer and make the helper take its panel path. + */ + if (n0 < n) { + _dgemm_CL_R_B_CL(m0, n-n0, k, a, lda, b+n0, ldb, c, ldc, blk_row, blk_col+n0, ((panel) ? panel + n0*ldp : panel), ldp); + } + } else { + /* This function is completely synchronous, therefore, clear incomplete. + */ + if (incomplete) *incomplete = 0; + } + + /* Cleanup any remaining portion of the matrix that was not handled above. + */ + if (m0 < m) { + _dgemm_CL_R_B_CL(m-m0, n, k, a+m0, lda, b, ldb, c, ldc, blk_row+m0, blk_col, ((panel) ?
panel + m0 : panel), ldp); + } +} + + + +static void _dgemm_CL_B_B_CL(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned int blk_row, unsigned int blk_col, + double *panel, int ldp) +{ + unsigned int i, x, y; + unsigned long long *a_ptr; + double a_val, b_val, *p; + + if (panel) { + /* Write the result into the panel buffer. We first perform the computation, + * placing the result into [panel]. Then byte swap panel. + */ + p = panel; + for (x=0; x<(unsigned int)n; x++, p += ldp-m) { + a_ptr = (unsigned long long *)a; + b_val = b[INDEX_BLK(0,x+blk_col,ldb)]; + + for (y=0; y<(unsigned int)m; y++) { + a_val = hpl_accel_byte_swap_load(a_ptr++); + *p++ = c[INDEX_BLK(y+blk_row,x+blk_col,ldc)] - a_val * b_val; + } + } + + a += lda; + for (i=1; i<(unsigned int)k; i++, a+=lda) { + p = panel; + for (x=0; x<(unsigned int)n; x++, p += ldp-m) { + a_ptr = (unsigned long long *)a; + b_val = b[INDEX_BLK(i,x+blk_col,ldb)]; + + for (y=0; y<(unsigned int)m; y++) { + a_val = hpl_accel_byte_swap_load(a_ptr++); + *p++ -= a_val * b_val; + } + } + } +#ifdef ACCEL_LITTLE_ENDIAN + /* Byte swap panel buffer + */ + unsigned long long *p_ptr = (unsigned long long *)panel; + for (x=0; x<(unsigned int)n; x++, p_ptr += ldp-m) { + for (y=0; y<(unsigned int)m; y++, p_ptr++) { + __stdbrx(p_ptr, *p_ptr); + } + } +#endif + } else { + /* Write the result into the c matrix.
+ */ + for (i=0; i<(unsigned int)k; i++, a+=lda) { + a_ptr = (unsigned long long *)a; + for (y=0; y<(unsigned int)m; y++) { + a_val = hpl_accel_byte_swap_load(a_ptr++); + for (x=0; x<(unsigned int)n; x++) { + c[INDEX_BLK(y+blk_row,x+blk_col,ldc)] -= a_val * b[INDEX_BLK(i,x+blk_col,ldb)]; + } + } + } + } +} + + +/* hpl_accel_dgemm_CL_B_B_CL + * ------------------------- + * Same operation as the row-indexed [b] variant, except that [b] is block-indexed + * (INDEX_BLK) and is advanced by ldb * (blk_col / M_SUB) before use. + * When [panel] is non-NULL the HPL_ACCEL_CMD_DGEMM_PANEL command is issued and the + * PPU cleanup writes its results to [panel]; otherwise HPL_ACCEL_CMD_DGEMM is issued + * and the cleanup updates [c] in place. + * + * The SPEs are used only when m >= M_SUB, n >= M_SUB, k == M and the alignment test + * below passes; they are given the leading m0 x n0 region (multiples of M_SUB). + * The remaining columns and rows are computed on the PPU by _dgemm_CL_B_B_CL. + * If the SPEs are not used, [incomplete] (when non-NULL) is cleared here. + */ +void hpl_accel_dgemm_CL_B_B_CL(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned int blk_row, unsigned int blk_col, + double *panel, int ldp, + unsigned long long *incomplete) +{ + int n0; + int m0 = 0; + unsigned int cmd; + unsigned int idx; + unsigned int aligned, bc, br; + volatile hpl_accel_dgemm_parms_t *parms; + + /* Do as much of the dgemm as possible using the blocked dgemm SPU specialist. + * This specialist assumes: + * m is at least M_SUB + * n is at least M_SUB + * k is equal to M + * a is quadword aligned. A multiple of 16 for optimal DMA performance + * b is quadword aligned. A multiple of 16 for optimal DMA performance + * c is quadword aligned. A multiple of 16 for optimal DMA performance + * panel is quadword aligned. A multiple of 16 for optimal DMA performance + * lda is even (qword aligned rows). A multiple of 16 for optimal DMA + * performance. + * ldb is even (qword aligned rows). A multiple of 16 for optimal DMA + * performance. + * ldc is even (qword aligned rows). A multiple of 16 for optimal DMA + * performance. + * ldp is even (qword aligned rows). A multiple of 16 for optimal DMA + * performance.
+ * blk_col is a multiple of M_SUB + * blk_row is a multiple of M_SUB + */ + + bc = blk_col/M_SUB; + br = blk_row/M_SUB; + + c += (ldc * bc) + br*(M_SUB*M_SUB); + b += (ldb * bc); + + blk_col %= M_SUB; + blk_row %= M_SUB; + + aligned = (blk_row | blk_col | + ((unsigned int)(lda | ldb | ldc | ldp) & 1) | + ((unsigned int)((uintptr_t)a | (uintptr_t)b | (uintptr_t)c | (uintptr_t)panel) & (16-1))); + + + if ((m >= M_SUB) && (n >= M_SUB) && (k == M) && (aligned == 0)) { + /* Either all or a portion of the computation can be done by the SPE accelerators. + */ + m0 = (m/M_SUB) * M_SUB; + n0 = (n/M_SUB) * M_SUB; + + /* Verify 4GB boundary expectation. + */ + VALIDATE_PANEL_4GB_CROSSING(a, k, lda); + VALIDATE_MATRIX_4GB_CROSSING(b, k, n0, ldb); + VALIDATE_MATRIX_4GB_CROSSING(c, m0, n0, ldc); + VALIDATE_PANEL_4GB_CROSSING(panel, n0, ldp); + + idx = hpl_accel_cmd_idx; + + parms = (volatile hpl_accel_dgemm_parms_t *)(&hpl_accel_cmd_queue[idx]); + + /* Place the parameters into a command queue buffer + */ + parms->a = a; + parms->b = b; + parms->c = c; + parms->lda = lda * sizeof(double); + parms->ldb = ldb * sizeof(double); + parms->ldc = ldc * sizeof(double); + parms->n = n0 / M_SUB; + parms->m = m0 / M_SUB; + parms->b_blk = -1; + parms->incomplete = incomplete; + COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->a_count, a, lda, M); + COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->p_count, panel, ldp, n); + + init_incomplete(incomplete, HPL_ACCEL_SPES); + + /* Send the command to each of the SPEs. + */ + hpl_accel_cmd_idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES; + + if (panel) { + parms->p = panel; + parms->ldp = ldp * sizeof(double); + cmd = HPL_ACCEL_CMD_DGEMM_PANEL; + } else { + cmd = HPL_ACCEL_CMD_DGEMM; + } + + /* Perform a sync in order to ensure that the parameters are written to + * memory before writing to the mailbox command queue. + */ + __sync(); + + send_cmd_to_spes(cmd, idx, HPL_ACCEL_SPES); + + /* Complete any remaining portion on the right side.
That is when n is not a multiple + * of M_SUB. Only offset [panel] when one was supplied: offsetting a NULL panel + * could yield a non-NULL pointer and make the helper take its panel path. + */ + if (n0 < n) { + _dgemm_CL_B_B_CL(m0, n-n0, k, a, lda, b, ldb, c, ldc, blk_row, blk_col+n0, ((panel) ? panel + n0*ldp : panel), ldp); + } + } else { + /* This function is completely synchronous, therefore, clear incomplete. + */ + if (incomplete) *incomplete = 0; + } + + /* Cleanup any remaining portion of the matrix that was not handled above. + */ + if (m0 < m) { + _dgemm_CL_B_B_CL(m-m0, n, k, a+m0, lda, b, ldb, c, ldc, blk_row+m0, blk_col, ((panel) ? panel + m0 : panel), ldp); + } +} + + +/* _dgemm_C_C_C + * PPU implementation of c -= a * b, with a, b and c all addressed through INDEX_COL. + */ +void _dgemm_C_C_C(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc) +{ + unsigned int i; + unsigned int x, y; + double a_val; + + for (i=0; i<(unsigned int)k; i++) { + for (y=0; y<(unsigned int)m; y++) { + a_val = a[INDEX_COL(y,i,lda)]; + for (x=0; x<(unsigned int)n; x++) { + c[INDEX_COL(y,x,ldc)] -= a_val * b[INDEX_COL(i,x,ldb)]; + } + } + } +} + +void hpl_accel_dgemm_C_C_C(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned long long *incomplete) +{ + int m0 = 0; + int spes; + unsigned int cmd, idx, aligned; + volatile hpl_accel_dgemm_parms_t *parms; + + /* Do as much of the dgemm as possible using the column-ordered dgemm SPU specialist. + * This specialist assumes: + * k is a multiple of 4 and less than or equal to 64 + * m is a multiple of 8 + * n is a multiple of 4 + * a is quadword aligned. A multiple of 16 for optimal DMA performance + * b is quadword aligned. A multiple of 16 for optimal DMA performance + * c is quadword aligned. A multiple of 16 for optimal DMA performance + * lda is even (qword aligned cols). A multiple of 16 for optimal DMA + * performance. + * ldb is even (qword aligned cols). A multiple of 16 for optimal DMA + * performance. + * ldc is even (qword aligned cols). A multiple of 16 for optimal DMA + * performance.
+ */ + + /* Non-zero if any leading dimension is odd or any of a, b, c is not 16-byte aligned. */ + aligned = (((unsigned int)(lda | ldb | ldc) & 1) | + ((unsigned int)((uintptr_t)a | (uintptr_t)b | (uintptr_t)c) & (16-1))); + + if ((m >= 8) && (k <= 64) && (((k & (4-1)) | (n & (4-1))) == 0) && (aligned == 0)) { + /* Either all or a portion of the computation can be done by the SPE accelerators. + * m0 is m rounded down to a multiple of 8; rows m0..m-1 are left for the + * PPU cleanup call at the end of this function. + */ + m0 = (m/8) * 8; + + /* Verify 4GB boundary expectation. + */ + VALIDATE_PANEL_4GB_CROSSING(a, k, lda); + VALIDATE_PANEL_4GB_CROSSING(c, n, ldc); + + idx = hpl_accel_cmd_idx; + parms = (volatile hpl_accel_dgemm_parms_t *)(&hpl_accel_cmd_queue[idx]); + + /* Place the parameters into a command queue buffer. + * Leading dimensions are converted from doubles to bytes. + */ + parms->a = a; + parms->b = b; + parms->c = c; + parms->lda = lda * sizeof(double); + parms->ldb = ldb * sizeof(double); + parms->ldc = ldc * sizeof(double); + parms->n = n; + parms->m = m0; + parms->k = k; + parms->incomplete = incomplete; + + /* Compute the number of SPES to deploy. Each SPE will need to compute + * at least one M_SUB high block. + */ + spes = (m + (M_SUB-1)) / M_SUB; + if (spes > HPL_ACCEL_SPES) spes = HPL_ACCEL_SPES; + + init_incomplete(incomplete, spes); + + /* Advance the shared command queue index (wrapping at HPL_ACCEL_CMD_ENTRIES) + * and select the command to send to each of the SPEs. + */ + hpl_accel_cmd_idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES; + cmd = HPL_ACCEL_CMD_DGEMM_C_C_C; + + /* Perform a sync in order to ensure that the parameters are written to + * memory before writing to the mailbox command queue. + */ + __sync(); + + send_cmd_to_spes(cmd, idx, spes); + + } else { + /* This function is completely synchronous, therefore, clear incomplete. + */ + if (incomplete) *incomplete = 0; + } + + /* Cleanup any remaining portion of the matrix that was not handled above.
+ */ + if (m0 < m) { + /* m0 is still 0 when the SPEs were not used, so this call then covers all m rows. */ + _dgemm_C_C_C(m-m0, n, k, a+m0, lda, b, ldb, c+m0, ldc); + } +} Index: accel/lib/hpl_accel_dtrsm.c =================================================================== RCS file: accel/lib/hpl_accel_dtrsm.c diff -N accel/lib/hpl_accel_dtrsm.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/hpl_accel_dtrsm.c 20 Aug 2008 03:57:53 -0000 1.5 @@ -0,0 +1,250 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ +#include +#include +#include "hpl_accel.h" +#include "hpl_accel_spu.h" +#include "hpl_accel_global.h" + +#include + + + +/* hpl_accel_dtrsm_CL_R_B + * ---------------------- + * Triangular solve using [a] against the row-indexed matrix [b]. When [c] is + * non-NULL the result is written to the block-indexed matrix [c] at + * (blk_row, blk_col); when [c] is NULL the PPU cleanup updates [b] in place. + * Columns in whole groups of 16 are queued for the SPEs when m == M, n > 15 and + * the alignment test passes; the remaining columns are solved on the PPU. + */ +void hpl_accel_dtrsm_CL_R_B(int m, int n, + const double *a, int lda, + double *b, int ldb, + double *c, int ldc, + unsigned int blk_row, unsigned int blk_col, + unsigned long long *incomplete) +{ + int spes; + int spans; + int n0 = 0; + unsigned int idx; + unsigned int aligned; + unsigned int cmd; + volatile hpl_accel_dtrsm_parms_t *parms; + + /* Do as much of the dtrsm as possible using the dtrsm SPU specialist. + * This specialist assumes: + * m is at 128. + * n is a multiple of 16. + * a is quadword aligned. A multiple of 16 for optimal DMA performance + * b is quadword aligned. A multiple of 16 for optimal DMA performance + * c is quadword aligned. A multiple of 16 for optimal DMA performance + * lda is even (qword aligned rows). A multiple of 16 for optimal DMA + * performance. + * ldb is even (qword aligned rows). A multiple of 16 for optimal DMA + * performance. + * ldc is even (qword aligned rows). A multiple of 16 for optimal DMA + * performance.
+ */ + /* Offset [c] to the addressed block only when a destination matrix was supplied. + * Offsetting a NULL [c] could yield a non-NULL pointer, which would defeat the + * (c == NULL) command selection and the if (c) cleanup test below. + */ + if (c) c += (blk_row * M_SUB) + ldc*(blk_col / M_SUB); + blk_col %= M_SUB; + + aligned = (((unsigned int)(lda | ldb | ldc) & 1) | (blk_col & 15) | + ((unsigned int)((uintptr_t)a | (uintptr_t)b | (uintptr_t)c) & (16-1))); + + + if ((m == M) && (n > 15) && (aligned == 0)) { + /* Either all or a portion of the computation can be done by the SPE accelerators. + */ + spans = n/16; + n0 = spans * 16; + + /* Verify 4GB boundary expectation. + */ + VALIDATE_PANEL_4GB_CROSSING(a, m, lda); + VALIDATE_PANEL_4GB_CROSSING(b, m, ldb); + VALIDATE_MATRIX_4GB_CROSSING(c, m, n0, ldc); + + idx = hpl_accel_cmd_idx; + + parms = (volatile hpl_accel_dtrsm_parms_t *)(&hpl_accel_cmd_queue[idx]); + + /* Place the parameters into a command queue buffer + */ + parms->a = a; + parms->b = b; + parms->c = c; + parms->lda = lda * sizeof(double); + parms->ldb = ldb * sizeof(double); + parms->ldc = ldc * sizeof(double); + parms->n = n0; + parms->m = m / M; + parms->blk_col = blk_col / 16; + parms->incomplete = incomplete; + COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->a_count, a, lda, M); + COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->b_count, b, ldb, M); + + /* One SPE per 16-column span, up to HPL_ACCEL_SPES. */ + spes = (spans < HPL_ACCEL_SPES) ? spans : HPL_ACCEL_SPES; + + init_incomplete(incomplete, spes); + + /* Perform a sync in order to ensure that the parameters are written to + * memory before writing to the mailbox command queue. + */ + __sync(); + + /* Send the command to each of the SPEs. + */ + hpl_accel_cmd_idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES; + + cmd = (c == NULL) ? HPL_ACCEL_CMD_DTRSM : HPL_ACCEL_CMD_DTRSM_PANEL; + + send_cmd_to_spes(cmd, idx, spes); + } else { + /* This function is completely synchronous, therefore, clear incomplete. + */ + if (incomplete) *incomplete = 0; + } + + /* Cleanup any remaining portion of the matrix that was not handled above.
+ */ + if (n0 < n) { + unsigned int i, x, y; + unsigned long long *a_ptr; + double a_val; + double *b_next; + + a_ptr = (unsigned long long *)a; + if (c) { + /* Perform DTRSM cleanup into a block format matrix row. + */ + for (x=n0; x<(unsigned int)n; x++) { + c[INDEX_BLK(0, x+blk_col, ldc)] = b[INDEX_ROW(0, x, ldb)]; + } + /* y == 1 */ + a_ptr++; + for (i=1; i<(unsigned int)m; i++) { + a_val = hpl_accel_byte_swap_load(a_ptr++); + for (x=n0; x<(unsigned int)n; x++) { + c[INDEX_BLK(i, x+blk_col, ldc)] = b[INDEX_ROW(i, x, ldb)] - b[INDEX_ROW(0, x, ldb)] * a_val; + } + } + a_ptr += (lda - m); + + /* y > 1 + */ + for (y=2; y<(unsigned int)m; y++) { + a_ptr += y; + for (i=y; i<(unsigned int)m; i++) { + a_val = hpl_accel_byte_swap_load(a_ptr++); + for (x=n0; x<(unsigned int)n; x++) { + c[INDEX_BLK(i, x+blk_col, ldc)] -= c[INDEX_BLK(y-1, x+blk_col, ldc)] * a_val; + } + } + a_ptr += (lda - m); + } + } else { + /* Perform DTRSM cleanup into [b] + */ + for (y=1; y<(unsigned int)m; y++, b+=ldb) { + a_ptr += y; + b_next = b+ldb; + for (i=y; i<(unsigned int)m; i++) { + a_val = hpl_accel_byte_swap_load(a_ptr++); + for (x=n0; x<(unsigned int)n; x++) { + b_next[x] -= b[x] * a_val; + } + b_next += ldb; + } + a_ptr += (lda - m); + } + } + } +} + + + +/* hpl_accel_dtrsm_CL_B + * -------------------- + * Triangular solve using [a], performed in place on the block-indexed matrix [b] + * at (blk_row, blk_col). Columns in whole groups of 16 are queued for the SPEs + * when m == M, n > 15 and the alignment test passes; the remaining columns are + * solved on the PPU. + */ +void hpl_accel_dtrsm_CL_B(int m, int n, + const double *a, int lda, + double *b, int ldb, + unsigned int blk_row, unsigned int blk_col, + unsigned long long *incomplete) +{ + int spes; + int spans; + int n0 = 0; + unsigned int i, x, y; + unsigned int idx; + unsigned int aligned; + volatile hpl_accel_dtrsm_parms_t *parms; + + /* Do as much of the dtrsm as possible using the dtrsm SPU specialist. + * This specialist assumes: + * m is at 128. + * n is a multiple of 16. + * a is quadword aligned. A multiple of 16 for optimal DMA performance + * b is quadword aligned. A multiple of 16 for optimal DMA performance + * lda is even (qword aligned rows). A multiple of 16 for optimal DMA + * performance. + * ldb is even (qword aligned rows).
A multiple of 16 for optimal DMA + * performance. + */ + /* Advance [b] to the addressed block and keep only the column offset within it. */ + b += (blk_row * M_SUB) + ldb*(blk_col / M_SUB); + blk_col %= M_SUB; + + /* Non-zero if lda or ldb is odd, blk_col is not a multiple of 16, or a or b is + * not 16-byte aligned. + */ + aligned = (((unsigned int)(lda | ldb) & 1) | (blk_col & 15) | + ((unsigned int)((uintptr_t)a | (uintptr_t)b) & (16-1))); + + + if ((m == M) && (n > 15) && (aligned == 0)) { + /* Either all or a portion of the computation can be done by the SPE accelerators. + * n0 is n rounded down to a multiple of 16; columns n0..n-1 are left for the + * PPU loop at the end of this function. + */ + spans = n/16; + n0 = spans * 16; + + /* Verify 4GB boundary expectation. + */ + VALIDATE_PANEL_4GB_CROSSING(a, m, lda); + VALIDATE_MATRIX_4GB_CROSSING(b, m, n0, ldb); + + idx = hpl_accel_cmd_idx; + + parms = (volatile hpl_accel_dtrsm_parms_t *)(&hpl_accel_cmd_queue[idx]); + + /* Place the parameters into a command queue buffer. + * Leading dimensions are converted from doubles to bytes. + */ + parms->a = a; + parms->b = b; + parms->lda = lda * sizeof(double); + parms->ldb = ldb * sizeof(double); + parms->n = n0; + parms->m = m / M; + parms->blk_col = blk_col / 16; + parms->incomplete = incomplete; + COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->a_count, a, lda, M); + + /* One SPE per 16-column span, up to HPL_ACCEL_SPES. */ + spes = (spans < HPL_ACCEL_SPES) ? spans : HPL_ACCEL_SPES; + + init_incomplete(incomplete, spes); + + /* Perform a sync in order to ensure that the parameters are written to + * memory before writing to the mailbox command queue. + */ + __sync(); + + /* Advance the shared command queue index (wrapping at HPL_ACCEL_CMD_ENTRIES), + * then send the command to each of the SPEs. + */ + hpl_accel_cmd_idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES; + + send_cmd_to_spes(HPL_ACCEL_CMD_DTRSM_CL_B, idx, spes); + } else { + /* This function is completely synchronous, therefore, clear incomplete. + */ + if (incomplete) *incomplete = 0; + } + + /* Cleanup any remaining portion of the matrix that was not handled above.
+ */ + for (x=n0; x<(unsigned int)n; x++) { + for (y=1; y<(unsigned int)m; y++) { + for (i=y; i<(unsigned int)m; i++) { + b[INDEX_BLK(i, x+blk_col, ldb)] -= b[INDEX_BLK(y-1, x+blk_col, ldb)] * hpl_accel_byte_swap(a[INDEX_COL(i, y-1, lda)]); + } + } + } +} Index: accel/lib/hpl_accel_global.c =================================================================== RCS file: accel/lib/hpl_accel_global.c diff -N accel/lib/hpl_accel_global.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/hpl_accel_global.c 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,19 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ +#include "hpl_accel_spu.h" +#include "hpl_accel_global.h" + + +/* SPE Thread Info + */ +int hpl_accel_initialized = 0; +hpl_accel_thread_info_t hpl_accel_threads[HPL_ACCEL_SPES]; + + +/* SPE Command Queue + */ +unsigned int hpl_accel_cmd_idx = 0; +hpl_accel_cmd_entry_t hpl_accel_cmd_queue[HPL_ACCEL_CMD_ENTRIES]; + Index: accel/lib/hpl_accel_global.h =================================================================== RCS file: accel/lib/hpl_accel_global.h diff -N accel/lib/hpl_accel_global.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/hpl_accel_global.h 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,34 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ +#include +#include +#include "hpl_accel_spu.h" + +#ifndef _HPL_ACCEL_GLOBAL_H_ +#define _HPL_ACCEL_GLOBAL_H_ + +#define HPL_ACCEL_CMD_ENTRIES 8 /* number of command queue entries */ + + +typedef struct hpl_accel_thread_info { + spe_context_ptr_t id; + pthread_t pthread; + spe_spu_control_area_t *ctl_area; // pointer to control ps area + int in_cnt; // inbound mailbox available element count + struct hpl_accel_init_parms
*init_parms; +} hpl_accel_thread_info_t; + + +typedef struct hpl_accel_cmd_entry { + unsigned char parms[128] __attribute__ ((aligned (128))); +} hpl_accel_cmd_entry_t; + + +extern int hpl_accel_initialized; +extern hpl_accel_thread_info_t hpl_accel_threads[HPL_ACCEL_SPES]; +extern unsigned int hpl_accel_cmd_idx; +extern hpl_accel_cmd_entry_t hpl_accel_cmd_queue[HPL_ACCEL_CMD_ENTRIES]; + +#endif /* _HPL_ACCEL_GLOBAL_H_ */ Index: accel/lib/hpl_accel_init.c =================================================================== RCS file: accel/lib/hpl_accel_init.c diff -N accel/lib/hpl_accel_init.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/hpl_accel_init.c 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,112 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel.h" +#include "hpl_accel_global.h" +#include "hpl_accel_spu.h" + +static hpl_accel_init_parms_t init_parms[HPL_ACCEL_SPES]; + +static void *ppu_pthread_function(void *arg) { + hpl_accel_thread_info_t *info; + unsigned int entry = SPE_DEFAULT_ENTRY; + + info = (hpl_accel_thread_info_t *)arg; + + if (spe_context_run(info->id, &entry, 0, (void *)(info->init_parms), NULL, NULL) < 0) { + perror("Failed running context"); + exit (1); + } + pthread_exit(NULL); +} + +extern spe_program_handle_t hpl_accel_spu; + + +int hpl_accel_init() +{ + int i; + + if (!hpl_accel_initialized) { + + /* Create each of the SPU threads + */ + for (i=0; i +#include +#include "hpl_accel.h" +#include "hpl_accel_spu.h" +#include "ppu_intrinsics.h" + +/* General purpose, reference, reformating facilities. 
+ */ +void hpl_accel_reform_panel_CL_to_B(int m, int n, + double *a, int lda, + double *panel, int ldp, + unsigned long long *incomplete) +{ + hpl_ref_reform_panel_CL_to_B(m, n, a, lda, panel, ldp, incomplete); +} + + +void hpl_accel_reform_matrix_CL_to_B(int m, int n, + double *a, int lda, + double *scratch, int size, + unsigned long long *incomplete) + +{ + unsigned int idx; + int spes; + int m_padded; + volatile hpl_accel_reform_matrix_CL_to_B_parms_t *parms; + + m_padded = ((m + M_SUB-1)/M_SUB)*M_SUB; + + /* Assert that the parameter restrictions are not violated. + * n Must be an integral multiple of 64. + * a Must be quadword aligned. + * lda Must be even and at least roundup(m,64). + * scratch Must be quadword aligned and must not straddle 4GB boundary. + * size Must be at least 64*roundup(m,64). + */ + assert((n % M_SUB) == 0); + assert(lda >= m_padded); + assert(size >= (m_padded-4)*M_SUB); + + /* Assert that the parameters conform also to the desired performance restrictions: + * a Must be cacheline aligned. + * lda Must be a multiple of 16. + * scratch Must be cacheline aligned. + * size Must be at least 4*64*m for optimal performance. + */ + assert(((uintptr_t)a & (uintptr_t)127) == (uintptr_t)0); + assert(((uintptr_t)scratch & (uintptr_t)127) == (uintptr_t)0); + assert((lda & 15) == 0); + + + /* Verify 4GB boundary expectation.
+ */ + VALIDATE_PANEL_4GB_CROSSING(scratch, 1, size); + VALIDATE_MATRIX_4GB_CROSSING(a, m, n, lda*M_SUB); + + idx = hpl_accel_cmd_idx; + + parms = (volatile hpl_accel_reform_matrix_CL_to_B_parms_t *)(&hpl_accel_cmd_queue[idx]); + + /* Compute the number of SPEs to deploy + */ + spes = size / ((m_padded-4) * M_SUB); + if (spes > HPL_ACCEL_SPES) spes = HPL_ACCEL_SPES; + + /* Place the parameters into a command queue buffer + */ + parms->a = a; + parms->scratch = scratch; + parms->lda = lda * sizeof(double); + parms->n = n; + parms->m = m; + parms->spes = spes; + parms->incomplete = incomplete; + + init_incomplete(incomplete, spes); + + /* Perform a sync in order to ensure that the parameters are written + * to memory before writing to the mailbox command queue. + */ + __sync(); + + /* Send the command to each of the SPEs. + */ + hpl_accel_cmd_idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES; + + send_cmd_to_spes(HPL_ACCEL_CMD_REFORM_MATRIX_CL_TO_B, idx, spes); +} + + + + + +void hpl_accel_reform_panel_B_to_CL(int m, int n, + double *panel, int ldp, + double *a, int lda, + unsigned long long *incomplete) +{ + unsigned int idx; + volatile hpl_accel_reform_panel_parms_t *parms; + + /* Assert that the parameter restrictions are not violated. + * m Must be an integral multiple of 64. + * n Must be at least 1. + * panel Must be quadword aligned and buffer may not straddle 4GB boundary. + * ldp Must be at least m. + * a Must be quadword aligned and buffer may not straddle 4GB boundary. + * lda Must be at least m*M_SUB. + */ + assert((m % M_SUB) == 0); + assert(n > 0); + assert(ldp >= m); + assert(lda >= m*M_SUB); + + VALIDATE_PANEL_4GB_CROSSING(panel, n, ldp); + VALIDATE_MATRIX_4GB_CROSSING(a, m, n, lda); + + /* Assert that the parameters conform also to the desired performance restrictions: + * a Must be cacheline aligned. + * lda Must be a multiple of 16. + * panel Must be cacheline aligned. + * ldp Must be a multiple of 16.
+ */ + assert(((uintptr_t)a & (uintptr_t)127) == (uintptr_t)0); + assert(((uintptr_t)panel & (uintptr_t)127) == (uintptr_t)0); + assert((lda & 15) == 0); + assert((ldp & 15) == 0); + + idx = hpl_accel_cmd_idx; + + parms = (volatile hpl_accel_reform_panel_parms_t *)(&hpl_accel_cmd_queue[idx]); + + /* Place the parameters into a command queue buffer + */ + parms->n = n; + parms->m = m; + parms->a = a; + parms->lda = lda * sizeof(double); + parms->panel = panel; + parms->ldp = ldp * sizeof(double); + parms->incomplete = incomplete; + COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->p_count, panel, ldp, n); + + init_incomplete(incomplete, HPL_ACCEL_REFORM_SPES); + + /* Perform a sync in order to ensure that the parameters are written + * to memory before writing to the mailbox command queue. + */ + __sync(); + + /* Send the command to each of the SPEs. + */ + hpl_accel_cmd_idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES; + + send_cmd_to_spes(HPL_ACCEL_CMD_REFORM_PANEL_B_TO_CL, idx, HPL_ACCEL_REFORM_SPES); +} + + +void hpl_accel_reform_panel_R_to_B(int m, int n, + double *a, int lda, + double *panel, int ldp, + unsigned long long *incomplete) +{ + unsigned int idx; + volatile hpl_accel_reform_panel_parms_t *parms; + + /* Assert that the parameter restrictions are not violated. + * panel Must be quadword aligned and buffer may not straddle 4GB boundary. + * ldp Must be even and at least n. + * a Must be quadword aligned and buffer may not straddle 4GB boundary. + * lda Must be at least m*M_SUB. + */ + assert((ldp & 1) == 0); + assert(ldp >= n); + assert(lda >= m*M_SUB); + + VALIDATE_PANEL_4GB_CROSSING(panel, m, ldp); + VALIDATE_MATRIX_4GB_CROSSING(a, m, n, lda); + + /* Assert that the parameters conform also to the desired performance restrictions: + * a Must be cacheline aligned. + * lda Must be a multiple of 16. + * panel Must be cacheline aligned. + * ldp Must be a multiple of 16.
+ */ + assert(((uintptr_t)a & (uintptr_t)127) == (uintptr_t)0); + assert(((uintptr_t)panel & (uintptr_t)127) == (uintptr_t)0); + assert((lda & 15) == 0); + assert((ldp & 15) == 0); + + idx = hpl_accel_cmd_idx; + + parms = (volatile hpl_accel_reform_panel_parms_t *)(&hpl_accel_cmd_queue[idx]); + + /* Place the parameters into a command queue buffer + */ + parms->n = n; + parms->m = m; + parms->a = a; + parms->lda = lda * sizeof(double); + parms->panel = panel; + parms->ldp = ldp * sizeof(double); + parms->incomplete = incomplete; + COMPUTE_PANEL_4GB_CROSSING_COUNT(parms->p_count, panel, ldp, n); + + init_incomplete(incomplete, HPL_ACCEL_SPES); + + /* Perform a sync in order to ensure that the parameters are written + * to memory before writing to the mailbox command queue. + */ + __sync(); + + /* Send the command to each of the SPEs. + */ + hpl_accel_cmd_idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES; + + send_cmd_to_spes(HPL_ACCEL_CMD_REFORM_PANEL_R_TO_B, idx, HPL_ACCEL_SPES); +} + + +void hpl_accel_reform_rows_R_to_B(int m, int n, + double *rows, int ldr, + double *a, int lda, + int *blk_rows, int blk_col, + unsigned long long *incomplete) +{ + int i; + unsigned int non_aligned; + int n0 = 0; + + + a += (blk_col/M_SUB) * lda; + blk_col %= M_SUB; + + non_aligned = (((unsigned int)(blk_col | lda | ldr) & 1) | + (((unsigned int)(uintptr_t)a | (uintptr_t)rows) & (16-1))); + + if ((non_aligned == 0) && (n > 1)) { + int m_left; + int rows_per_block; + int *blk_row_ptr; + double *rows_ptr; + unsigned int idx; + volatile hpl_accel_reform_rows_parms_t *parms; + + /* Assert that we won't span a 4G boundary crossing + */ + assert((((uintptr_t)rows) >> 32) == ((uintptr_t)(rows + m*ldr - 1) >> 32)); + + VALIDATE_MATRIX_4GB_CROSSING(a, ((lda/M_SUB) & ~(M_SUB-1)), n, lda); + + n0 = n & ~1; + + idx = hpl_accel_cmd_idx; + + m_left = m; + blk_row_ptr = blk_rows; + rows_ptr = rows; + + /* Generate multiple command requests if the number of rows + * is greater than what will fit in a single
command request. + */ + rows_per_block = (int)(sizeof(parms->blk_rows) / sizeof(int)); + + while (m_left > rows_per_block) { + parms = (volatile hpl_accel_reform_rows_parms_t *)(&hpl_accel_cmd_queue[idx]); + + parms->m = rows_per_block; + parms->n = n0; + parms->rows = rows_ptr; + parms->ldr = ldr * sizeof(double); + parms->a = a; + parms->lda = lda * sizeof(double); + parms->blk_col = blk_col; + + parms->incomplete = NULL; + for (i=0; iblk_rows[i] = blk_row_ptr[i]; + + /* Perform a sync in order to ensure that the parameters are written + * to memory before writing to the mailbox command queue. + */ + __sync(); + + /* Send the command to each of the SPEs. + */ + send_cmd_to_spes(HPL_ACCEL_CMD_REFORM_ROWS_R_TO_B, idx, HPL_ACCEL_SPES); + + idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES; + + m_left -= rows_per_block; + rows_ptr += rows_per_block * ldr; + blk_row_ptr += rows_per_block; + } + + if (m_left > 0) { + parms = (volatile hpl_accel_reform_rows_parms_t *)(&hpl_accel_cmd_queue[idx]); + + parms->m = m_left; + parms->n = n0; + parms->rows = rows_ptr; + parms->ldr = ldr * sizeof(double); + parms->a = a; + parms->lda = lda * sizeof(double); + parms->blk_col = blk_col; + + parms->incomplete = incomplete; + for (i=0; iblk_rows[i] = blk_row_ptr[i]; + + init_incomplete(incomplete, HPL_ACCEL_SPES); + + /* Perform a sync in order to ensure that the parameters are written + * to memory before writing to the mailbox command queue. + */ + __sync(); + + /* Send the command to each of the SPEs. + */ + send_cmd_to_spes(HPL_ACCEL_CMD_REFORM_ROWS_R_TO_B, idx, HPL_ACCEL_SPES); + + idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES; + } else { + if (incomplete) *incomplete = 0; + } + hpl_accel_cmd_idx = idx; + } else { + if (incomplete) *incomplete = 0; + } + + /* Cleanup portions of the rows not implemented by the SPEs above. 
+ */ + if (n0 < n) { + unsigned int x, y, row; + int first_span, span, left; + double *src, *dst; + + blk_col += n0; + rows += n0; + n -= n0; + + a += (blk_col/M_SUB) * lda; + blk_col %= M_SUB; + + first_span = M_SUB - blk_col; + if (first_span > n) first_span = n; + + /* For each of the rows */ + for (y=0; y<(unsigned int)m; y++) { + row = blk_rows[y]; + left = n; + dst = a + (row * M_SUB); + span = first_span; + left = n - first_span; + + /* For each of the destination buffer block spans + */ + src = rows; + + for (x=0; x<(unsigned int)span; x++) dst[x+blk_col] = src[x]; + while (left) { + dst += lda; + src += span; + span = (left > M_SUB) ? M_SUB : left; + for (x=0; x<(unsigned int)span; x++) dst[x] = src[x]; + left -= span; + } + rows += ldr; + } + } +} + + +void hpl_accel_reform_rows_B_to_R(int m, int n, + double *rows, int ldr, + double *a, int lda, + int *blk_rows, int blk_col, + unsigned long long *incomplete) +{ + int i; + unsigned int non_aligned; + int n0 = 0; + + a += (blk_col/M_SUB) * lda; + blk_col %= M_SUB; + + non_aligned = (((unsigned int)(blk_col | lda | ldr) & 1) | + (((unsigned int)(uintptr_t)a | (uintptr_t)rows) & (16-1))); + + if ((non_aligned == 0) && (n > 1)) { + int m_left; + int rows_per_block; + int *blk_row_ptr; + double *rows_ptr; + unsigned int idx; + volatile hpl_accel_reform_rows_parms_t *parms; + + /* Assert that we won't span a 4G boundary crossing + */ + assert((((uintptr_t)rows) >> 32) == ((uintptr_t)(rows + m*ldr - 1) >> 32)); + + VALIDATE_MATRIX_4GB_CROSSING(a, ((lda/M_SUB) & ~(M_SUB-1)), n, lda); + + n0 = n & ~1; + + idx = hpl_accel_cmd_idx; + + m_left = m; + blk_row_ptr = blk_rows; + rows_ptr = rows; + + /* Generate multiple command requests if the number of rows + * is greater than what will fit in a single command request. 
+ */ + rows_per_block = (int)(sizeof(parms->blk_rows) / sizeof(int)); + + while (m_left > rows_per_block) { + parms = (volatile hpl_accel_reform_rows_parms_t *)(&hpl_accel_cmd_queue[idx]); + + parms->m = rows_per_block; + parms->n = n0; + parms->rows = rows_ptr; + parms->ldr = ldr * sizeof(double); + parms->a = a; + parms->lda = lda * sizeof(double); + parms->blk_col = blk_col; + + parms->incomplete = NULL; + for (i=0; iblk_rows[i] = blk_row_ptr[i]; + + /* Perform a sync in order to ensure that the parameters are written + * to memory before writing to the mailbox command queue. + */ + __sync(); + + /* Send the command to each of the SPEs. + */ + send_cmd_to_spes(HPL_ACCEL_CMD_REFORM_ROWS_B_TO_R, idx, HPL_ACCEL_SPES); + + idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES; + + m_left -= rows_per_block; + rows_ptr += rows_per_block * ldr; + blk_row_ptr += rows_per_block; + } + + if (m_left > 0) { + parms = (volatile hpl_accel_reform_rows_parms_t *)(&hpl_accel_cmd_queue[idx]); + + parms->m = m_left; + parms->n = n0; + parms->rows = rows_ptr; + parms->ldr = ldr * sizeof(double); + parms->a = a; + parms->lda = lda * sizeof(double); + parms->blk_col = blk_col; + + parms->incomplete = incomplete; + for (i=0; iblk_rows[i] = blk_row_ptr[i]; + + init_incomplete(incomplete, HPL_ACCEL_SPES); + + /* Perform a sync in order to ensure that the parameters are written + * to memory before writing to the mailbox command queue. + */ + __sync(); + + /* Send the command to each of the SPEs. + */ + send_cmd_to_spes(HPL_ACCEL_CMD_REFORM_ROWS_B_TO_R, idx, HPL_ACCEL_SPES); + + idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES; + } else { + if (incomplete) *incomplete = 0; + } + hpl_accel_cmd_idx = idx; + } else { + if (incomplete) *incomplete = 0; + } + + /* Cleanup portions of the rows not implemented by the SPEs above. 
+ */ + if (n0 < n) { + unsigned int x, y, row; + int first_span, span, left; + double *src, *dst; + + blk_col += n0; + rows += n0; + n -= n0; + + a += (blk_col/M_SUB) * lda; + blk_col %= M_SUB; + + first_span = M_SUB - blk_col; + if (first_span > n) first_span = n; + + /* For each of the rows */ + for (y=0; y<(unsigned int)m; y++) { + row = (unsigned int)blk_rows[y]; + left = n; + src = a + (row * M_SUB); + span = first_span; + left = n - first_span; + + /* For each of the destination buffer block spans + */ + dst = rows; + + for (x=0; x<(unsigned int)span; x++) dst[x] = src[x+(unsigned int)blk_col]; + while (left) { + src += lda; + dst += span; + span = (left > M_SUB) ? M_SUB : left; + for (x=0; x<(unsigned int)span; x++) dst[x] = src[x]; + left -= span; + } + rows += ldr; + } + } +} + Index: accel/lib/hpl_accel_spu.h =================================================================== RCS file: accel/lib/hpl_accel_spu.h diff -N accel/lib/hpl_accel_spu.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/hpl_accel_spu.h 23 Oct 2008 21:20:24 -0000 1.12 @@ -0,0 +1,417 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +/* This file contains definitions shared between the PPE and SPE + */ + +#ifndef _HPL_ACCEL_SPU_H_ +#define _HPL_ACCEL_SPU_H_ + +#include "hpl_accel.h" +#include + +#define SUB (2) /* Number of sub-blocks per block (1 dim)*/ +#define M (SUB*M_SUB) /* Size of the matrix block - M x M */ +#define SUB_SUB (SUB*SUB) /* The number of sub-blocks per block */ + + +/* SPE Commands + */ +#define HPL_ACCEL_CMD_DGEMM 0 +#define HPL_ACCEL_CMD_DTRSM 1 +#define HPL_ACCEL_CMD_REFORM_MATRIX_CL_TO_B 2 +#define HPL_ACCEL_CMD_REFORM_PANEL_B_TO_CL 3 +#define HPL_ACCEL_CMD_REFORM_PANEL_R_TO_B 4 +#define HPL_ACCEL_CMD_DGEMM_PANEL 5 +#define HPL_ACCEL_CMD_REFORM_ROWS_R_TO_B 6 +#define HPL_ACCEL_CMD_REFORM_ROWS_B_TO_R 7 
+#define HPL_ACCEL_CMD_FINI 8 +#define HPL_ACCEL_CMD_DTRSM_CL_B 9 +#define HPL_ACCEL_CMD_DTRSM_PANEL 10 +#define HPL_ACCEL_CMD_DGEMM_C_C_C 11 +#define HPL_ACCEL_CMD_SWAP_ROWS_B_TO_B 12 +#define HPL_ACCEL_CMD_COPY_ROWS_R_TO_R 13 + + +#define HPL_ACCEL_CMD_MASK 0x7F + +#define HPL_ACCEL_SPES 8 /* # of SPEs to use per accelerator */ +#define HPL_ACCEL_REFORM_SPES 4 /* # of SPEs to use during some reformat */ +#define HPL_ACCEL_PARM_TAG 31 + +/* Function parameters */ + +#ifdef __SPU__ +#include + +typedef struct hpl_accel_init_parms { + unsigned int id __attribute__ ((aligned (16))); + unsigned long long cmd_base __attribute__ ((aligned (16)));; +} hpl_accel_init_parms_t; + +typedef struct hpl_accel_dgemm_parms { + unsigned long long a __attribute__ ((aligned (16))); + unsigned long long b __attribute__ ((aligned (16))); + unsigned long long c __attribute__ ((aligned (16))); + unsigned long long p __attribute__ ((aligned (16))); + vec_uint4 ld; /* lda, ldb, ldc, ldp */ + vec_uint4 dim; /* n, m, k, pad */ + vec_uint4 flags; /* b_blk, a_count, b_count, p_count */ + unsigned long long incomplete __attribute__ ((aligned (16))); +} hpl_accel_dgemm_parms_t; + +typedef struct hpl_accel_dtrsm_parms { + unsigned long long a __attribute__ ((aligned (16))); + unsigned long long b __attribute__ ((aligned (16))); + unsigned long long c __attribute__ ((aligned (16))); + vec_uint4 ld; /* lda, ldb, ldc, pad */ + vec_uint4 dim; /* n, m, a_count, b_count */ + vec_uint4 blk_col; + unsigned long long incomplete __attribute__ ((aligned (16))); +} hpl_accel_dtrsm_parms_t; + +typedef struct hpl_accel_reform_matrix_CL_to_B_parms { + unsigned long long a __attribute__ ((aligned (16))); + unsigned long long scratch __attribute__ ((aligned (16))); + int lda __attribute__ ((aligned (16))); + int n __attribute__ ((aligned (16))); + int m __attribute__ ((aligned (16))); + int spes __attribute__ ((aligned (16))); + unsigned long long incomplete __attribute__ ((aligned (16))); +} 
hpl_accel_reform_matrix_CL_to_B_parms_t; + +typedef struct hpl_accel_reform_panel_parms { + unsigned long long a __attribute__ ((aligned (16))); + unsigned long long panel __attribute__ ((aligned (16))); + int lda __attribute__ ((aligned (16))); + int ldp __attribute__ ((aligned (16))); + int n __attribute__ ((aligned (16))); + int m __attribute__ ((aligned (16))); + int p_count __attribute__ ((aligned (16))); + unsigned long long incomplete __attribute__ ((aligned (16))); +} hpl_accel_reform_panel_parms_t; + +typedef struct hpl_accel_reform_rows_parms { + vector signed int m_n_ldr_lda; + vector unsigned long long rows_a; + vector unsigned long long incomplete_blk_col; + int blk_rows[5*4]; +} hpl_accel_reform_rows_parms_t; + +typedef struct hpl_accel_swap_rows_parms { + vector signed int m_n_lda_blk_col __attribute__ ((aligned (16))); + vector unsigned long long a_incomplete __attribute__ ((aligned (16))); + int blk_rows[6*4]; +} hpl_accel_swap_rows_parms_t; + +typedef struct hpl_accel_copy_rows_parms { + vector signed int m_n_lda_ldb __attribute__ ((aligned (16))); + vector unsigned long long a_b __attribute__ ((aligned (16))); + vector unsigned long long incomplete_pad __attribute__ ((aligned (16))); + int rows[4*4]; +} hpl_accel_copy_rows_parms_t; + +#else + +typedef struct hpl_accel_init_parms { + unsigned int id __attribute__ ((aligned (16))); + void *cmd_base __attribute__ ((aligned (16))); + void *signotify1[HPL_ACCEL_SPES] __attribute__ ((aligned (16))); +} hpl_accel_init_parms_t; + +typedef struct hpl_accel_dgemm_parms { + const double *a __attribute__ ((aligned (16))); + const double *b __attribute__ ((aligned (16))); + double *c __attribute__ ((aligned (16))); + double *p __attribute__ ((aligned (16))); + int lda __attribute__ ((aligned (16))); + int ldb; + int ldc; + int ldp; + int n __attribute__ ((aligned (16))); + int m; + int k; + int b_blk __attribute__ ((aligned (16))); + int a_count; + int b_count; + int p_count; + unsigned long long *incomplete 
__attribute__ ((aligned (16))); +} hpl_accel_dgemm_parms_t; + + +typedef struct hpl_accel_dtrsm_parms { + const double *a __attribute__ ((aligned (16))); + double *b __attribute__ ((aligned (16))); + double *c __attribute__ ((aligned (16))); + int lda __attribute__ ((aligned (16))); + int ldb; + int ldc; + int n __attribute__ ((aligned (16))); + int m; + int a_count; + int b_count; + unsigned int blk_col __attribute__ ((aligned (16))); + unsigned long long *incomplete __attribute__ ((aligned (16))); +} hpl_accel_dtrsm_parms_t; + +typedef struct hpl_accel_reform_matrix_CL_to_B_parms { + double *a __attribute__ ((aligned (16))); + double *scratch __attribute__ ((aligned (16))); + int lda __attribute__ ((aligned (16))); + int n __attribute__ ((aligned (16))); + int m __attribute__ ((aligned (16)));; + int spes __attribute__ ((aligned (16))); + unsigned long long *incomplete __attribute__ ((aligned (16))); +} hpl_accel_reform_matrix_CL_to_B_parms_t; + +typedef struct hpl_accel_reform_panel_parms { + double *a __attribute__ ((aligned (16))); + double *panel __attribute__ ((aligned (16))); + int lda __attribute__ ((aligned (16))); + int ldp __attribute__ ((aligned (16))); + int n __attribute__ ((aligned (16))); + int m __attribute__ ((aligned (16))); + int p_count __attribute__ ((aligned (16))); + unsigned long long *incomplete __attribute__ ((aligned (16))); +} hpl_accel_reform_panel_parms_t; + +typedef struct hpl_accel_reform_rows_parms { + int m, n, ldr, lda; + double *rows, *a; + unsigned long long *incomplete; + int blk_col, pad; + int blk_rows[5*4]; +} hpl_accel_reform_rows_parms_t; + +typedef struct hpl_accel_swap_rows_parms { + int m, n, lda, blk_col; + double *a; + unsigned long long *incomplete; + int blk_rows[6*4]; +} hpl_accel_swap_rows_parms_t; + +typedef struct hpl_accel_copy_rows_parms { + int m, n, lda, ldb; + double *a; + double *b; + unsigned long long *incomplete; + unsigned long long pad; + int rows[4*4]; +} hpl_accel_copy_rows_parms_t; + +#endif + + 
+/* Inline support functions. + */ +#ifdef __PPU__ + +#include +#include "hpl_accel_global.h" + + +/* init_incomplete + * --------------- + * Initialize the asynchronous completion notification variable according + * to the specified number of participants. The number of participants must + * be between 1 and 8 (the shift below is not range-checked); each byte in the unsigned long long variable + * is a flag for one of the participants. The bytes are assigned as follows: + * + * msb lsb + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * | SPE 0 | SPE 1 | SPE 2 | SPE 3 | SPE 4 | SPE 5 | SPE 6 | SPE 7 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * Parameters: + * incomplete Pointer to the asynchronous completion variable; nothing is written when it is NULL. + * + * participants Number of participants that will participate in the + * command that need to acknowledge completion status. + * + */ +static inline void init_incomplete(unsigned long long *incomplete, int participants) +{ + if (incomplete) { + *incomplete = 0xFFFFFFFFFFFFFFFFULL << (8*(8-participants)); + } +} + + +/* send_cmd_to_spes + * ---------------- + * Send the command with the index to the parameter buffer to the specified + * number of SPE participants. The command and index are combined into a + * 32-bit message that is placed in the inbound SPE mailbox. The 7 least + * significant bits of the message contain the command id. The 25 most + * significant bits are an offset from the cmd_base to the cacheline containing + * the command parameters. + * + * Parameters: + * idx Command buffer index that contains the parameters for + * this command. + * + * participants Number of participants that will participate in the command. + */ + +static inline void send_cmd_to_spes(unsigned int cmd, int idx, int participants) +{ + int i; + int cnt; + volatile spe_spu_control_area_t *ctl; + + /* Construct cmd message to be sent to each of the SPEs via the + * inbound mailbox.
+ */ + cmd |= (unsigned int)(idx * sizeof(struct hpl_accel_cmd_entry)); + + for (i=0; iSPU_Mbox_Stat >> 8) & 0xFF; + } + + /* Place the command into the inbound mailbox. + */ + ctl->SPU_In_Mbox = cmd; + hpl_accel_threads[i].in_cnt = cnt-1; + } +} +#endif + + +#ifdef PANEL_4GB_CROSSING +#define COMPUTE_PANEL_4GB_CROSSING_COUNT(_count, _panel, _ld, _max) { \ + int _cnt; \ + /* Calculate the number of rows/columns to the 4GB crossing and clamp \ + * the result to max. \ + */ \ + _cnt = (0x20000000 - ((unsigned int)(uintptr_t)_panel) / sizeof(double)) / _ld; \ + _count = (_cnt > _max) ? _max : _cnt; \ +} +#else /* !PANEL_4GB_CROSSING */ +#define COMPUTE_PANEL_4GB_CROSSING_COUNT(_count, _panel, _ld, _max) +#endif /* PANEL_4GB_CROSSING */ + +#define COUNT_PANEL_4GB_CROSSINGS(_p, _n, _ld) \ + /* return the number of 4GB crossings in panel _p */ \ + (((uintptr_t)(_p + _n*_ld - 1) >> 32) - ((uintptr_t)_p >> 32)) + +#ifdef VALIDATE_4GB_CROSSING +#ifdef PANEL_4GB_CROSSING + +#define VALIDATE_PANEL_4GB_CROSSING(_p, _n, _ld) { \ + /* Verify that if the panel crosses a 4GB boundary. It does so only on a row \ + * boundary, and only once. \ + */ \ + if (_p) { \ + unsigned int _crossings; \ + unsigned int _bytes_til_crossing; \ + \ + _crossings = COUNT_PANEL_4GB_CROSSINGS(_p, _n, _ld); \ + switch (_crossings) { \ + case 0: \ + break; \ + case 1: \ + _bytes_til_crossing = ((uintptr_t)_p ^ (-1)) + 1; \ + if ((_bytes_til_crossing % (_ld * sizeof(double))) != 0) { \ + fprintf(stderr, "%s %d - Panel crosses 4GB boundary within a row/col. Parameters p=%p n=%d ld=%d\n",\ + __PRETTY_FUNCTION__, __LINE__, _p, _n, _ld); \ + abort(); \ + } \ + break; \ + default: \ + fprintf(stderr, "%s %d - Panel crosses %d 4GB boundary. Parameters p=%p n=%d ld=%d\n", \ + __PRETTY_FUNCTION__, __LINE__, _crossings, _p, _n, _ld); \ + abort(); \ + break; \ + } \ + if (_ld > 0x0FFFFFFF) { \ + fprintf(stderr, "%s %d - Panel leading dimension too big. 
Parameters p=%p n=%d ld=%d\n", \ + __PRETTY_FUNCTION__, __LINE__, _p, _n, _ld); \ + abort(); \ + } \ + } \ +} + +#else /* ! PANEL_4GB_CROSSING */ + +#define VALIDATE_PANEL_4GB_CROSSING(_p, _n, _ld) { \ + /* Verify that the panel does not cross a 4GB boundary */ \ + if (_p) { \ + if ( COUNT_PANEL_4GB_CROSSINGS(_p, _n, _ld) != 0 ) { \ + fprintf(stderr, "%s %d - Panel crosses 4GB boundary unexpectedly. Parameters p=%p n=%d ld=%d\n", \ + __PRETTY_FUNCTION__, __LINE__, _p, _n, _ld); \ + abort(); \ + } \ + if (_ld > 0x0FFFFFFF) { \ + fprintf(stderr, "%s %d - Panel leading dimension too big. Parameters p=%p n=%d ld=%d\n", \ + __PRETTY_FUNCTION__, __LINE__, _p, _n, _ld); \ + abort(); \ + } \ + } \ +} +#endif + +#ifdef MATRIX_4GB_CROSSING + +#define VALIDATE_MATRIX_4GB_CROSSING(_p, _m, _n, _ld) { \ + if (_p) { \ + int _i; \ + double *_start, *_end; \ + unsigned int _blks_per_col, _dbls_to_crossing; \ + \ + if (_ld > 0x0FFFFFFF) { \ + fprintf(stderr, "%s %d - Matrix leading dimension too big. Parameters p=%p m=%d n=%d ld=%d\n", \ + __PRETTY_FUNCTION__, __LINE__, _p, _m, _n, _ld); \ + abort(); \ + } \ + /* For each column of blocks */ \ + _blks_per_col = (_m + (M_SUB-1))/M_SUB; \ + for (_i=0, _start=(double *)_p; _i<_n; _i+=M_SUB) { \ + _end = _start + _ld; \ + if (((uintptr_t)(_end) >> 32) > ((uintptr_t)(_start) >> 32)) { \ + /* This column crosses a 4GB boundary. Check to see that it occurs only on a block boundary */ \ + _dbls_to_crossing = 0x20000000 - ((unsigned int)(uintptr_t)_start) / sizeof(double); \ + if (((M_SUB*M_SUB)*_blks_per_col > _dbls_to_crossing) && \ + ((_dbls_to_crossing % (M_SUB*M_SUB)) != 0)) { \ + fprintf(stderr, "%s %d - Matrix block straddles 4GB boundary. 
Parameters p=%p m=%d n=%d ld=%d\n",\ + __PRETTY_FUNCTION__, __LINE__, _p, _m, _n, _ld); \ + abort(); \ + } \ + } \ + _start = _end; \ + } \ + } \ +} + +#else /* !MATRIX_4GB_CROSSING */ + +#define VALIDATE_MATRIX_4GB_CROSSING(_p, _m, _n, _ld) { \ + if (_p) { \ + if ((((uintptr_t)_p) >> 32) != ((uintptr_t)(_p + _ld * (((_n+M_SUB-1)/M_SUB)-1) + ((_m+M_SUB-1)/M_SUB)*M_SUB*M_SUB-1) >> 32)) { \ + fprintf(stderr, "%s %d - Matrix crosses 4GB boundary unexpectedly. Parameters p=%p m=%d n=%d ld=%d\n", \ + __PRETTY_FUNCTION__, __LINE__, _p, _m, _n, _ld); \ + abort(); \ + } \ + if (_ld > 0x0FFFFFFF) { \ + fprintf(stderr, "%s %d - Matrix leading dimension too big. Parameters p=%p m=%d n=%d ld=%d\n", \ + __PRETTY_FUNCTION__, __LINE__, _p, _m, _n, _ld); \ + abort(); \ + } \ + } \ +} +#endif + +#else /* VALIDATE_4GB_CROSSING */ +#define VALIDATE_PANEL_4GB_CROSSING(_p, _n, _ld) +#define VALIDATE_MATRIX_4GB_CROSSING(_p, _m, _n, _ld) +#endif /* VALIDATE_4GB_CROSSING */ + +#endif /* _HPL_ACCEL_SPU_H_ */ Index: accel/lib/hpl_accel_swap.c =================================================================== RCS file: accel/lib/hpl_accel_swap.c diff -N accel/lib/hpl_accel_swap.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/hpl_accel_swap.c 20 Aug 2008 03:57:53 -0000 1.4 @@ -0,0 +1,150 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include "hpl_accel.h" +#include "hpl_accel_spu.h" +#include "ppu_intrinsics.h" + +/* General purpose, reference, reformating facilities. 
+ */ + +void hpl_accel_swap_rows_B_to_B(int m, int n, + double *a, int lda, + int *blk_rows, int blk_col, + unsigned long long *incomplete) +{ + int i; + unsigned int non_aligned; + int n0 = 0; + + a += (blk_col/M_SUB) * lda; + blk_col %= M_SUB; + + non_aligned = (((unsigned int)(blk_col | lda) & 1) | + (((unsigned int)(uintptr_t)a) & (16-1))); + + if ((non_aligned == 0) && (n > 1)) { + int m_start, m_left; + int rows_per_block; + unsigned int idx; + volatile hpl_accel_swap_rows_parms_t *parms; + + /* Assert that we won't span a 4G boundary crossing + */ + VALIDATE_MATRIX_4GB_CROSSING(a, ((lda/M_SUB) & ~(M_SUB-1)), n, lda); + + n0 = n & ~1; + + idx = hpl_accel_cmd_idx; + + m_start = 0; + m_left = m; + + /* Generate multiple command requests if the number of rows + * is greater than what will fit in a single command request. + */ + rows_per_block = (int)(sizeof(parms->blk_rows) / sizeof(int)); + + while (m_left > rows_per_block) { + parms = (volatile hpl_accel_swap_rows_parms_t *)(&hpl_accel_cmd_queue[idx]); + + parms->m = rows_per_block; + parms->n = n0; + parms->lda = lda * sizeof(double); + parms->blk_col = blk_col; + + parms->a = a + INDEX_BLK(m_start,0,lda); + parms->incomplete = NULL; + + for (i=0; iblk_rows[i] = blk_rows[m_start+i]-m_start; + + /* Perform a sync in order to ensure that the parameters are written + * to memory before writing to the mailbox command queue. + */ + __sync(); + + /* Send the command to each of the SPEs. 
+ */ + send_cmd_to_spes(HPL_ACCEL_CMD_SWAP_ROWS_B_TO_B, idx, HPL_ACCEL_SPES); + + idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES; + + m_start += rows_per_block; + m_left -= rows_per_block; + } + + if (m_left > 0) { + parms = (volatile hpl_accel_swap_rows_parms_t *)(&hpl_accel_cmd_queue[idx]); + + parms->m = m_left; + parms->n = n0; + parms->lda = lda * sizeof(double); + parms->blk_col = blk_col; + + parms->a = a + INDEX_BLK(m_start,0,lda); + parms->incomplete = incomplete; + + for (i=0; iblk_rows[i] = blk_rows[m_start+i]-m_start; + + init_incomplete(incomplete, HPL_ACCEL_SPES); + + /* Perform a sync in order to ensure that the parameters are written + * to memory before writing to the mailbox command queue. + */ + __sync(); + + /* Send the command to each of the SPEs. + */ + send_cmd_to_spes(HPL_ACCEL_CMD_SWAP_ROWS_B_TO_B, idx, HPL_ACCEL_SPES); + + idx = (idx+1) % HPL_ACCEL_CMD_ENTRIES; + } else { + if (incomplete) *incomplete = 0; + } + hpl_accel_cmd_idx = idx; + } else { + if (incomplete) *incomplete = 0; + } + + /* Cleanup portions of the rows not implemented by the SPEs above. + */ + if (n0 < n) { + unsigned int y1, y2, x; + int first_span, span, left; + double tmp, *src, *dst; + + blk_col += n0; + n -= n0; + + a += (blk_col/M_SUB) * lda; + blk_col %= M_SUB; + + first_span = M_SUB - blk_col; + if (first_span > n) first_span = n; + + /* For each of the rows */ + for (y1=0; y1<(unsigned int)m; y1++) { + y2 = blk_rows[y1]; /* New location for row y1 */ + if (y1 != y2) { + dst = a + (y1 * M_SUB); + src = a + (y2 * M_SUB); + for (x=0; x<(unsigned int)first_span; x++) + {tmp = dst[x+blk_col]; dst[x+blk_col] = src[x+blk_col]; src[x+blk_col] = tmp;} + left = n - first_span; + while (left) { + dst += lda; + src += lda; + span = (left > M_SUB) ? 
M_SUB : left; + for (x=0; x<(unsigned int)span; x++) + {tmp = dst[x]; dst[x] = src[x]; src[x] = tmp;} + left -= span; + } + } + } + } +} + Index: accel/lib/hpl_ref.c =================================================================== RCS file: accel/lib/hpl_ref.c diff -N accel/lib/hpl_ref.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/hpl_ref.c 20 Aug 2008 03:57:53 -0000 1.11 @@ -0,0 +1,419 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include "hpl_accel.h" +#include "hpl_accel_spu.h" + +#include + + +int hpl_ref_init() +{ + return HPL_ACCEL_INIT_SUCCESS; +} + + +void hpl_ref_dgemm_CL_R_B(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned long long *incomplete) +{ + unsigned int i; + unsigned int x, y; + double a_val; + + for (i=0; i<(unsigned int)k; i++) { + for (y=0; y<(unsigned int)m; y++) { + a_val = hpl_accel_byte_swap(a[INDEX_COL(y,i,lda)]); + for (x=0; x<(unsigned int)n; x++) { + c[INDEX_BLK(y,x,ldc)] -= a_val * b[INDEX_ROW(i,x,ldb)]; + } + } + } + + if (incomplete) *incomplete = 0; +} + + +void hpl_ref_dgemm_CL_R_B_CL(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned int blk_row, unsigned int blk_col, + double *p, int ldp, + unsigned long long *incomplete) +{ + unsigned int i; + unsigned int x, y; + double a_val; + + if (p) { + /* Copy c into p */ + for (y=0; y<(unsigned int)m; y++) { + for (x=0; x<(unsigned int)n; x++) { + p[INDEX_COL(y,x,ldp)] = c[INDEX_BLK(y+blk_row, x+blk_col, ldc)]; + } + } + /* Perform DGEMM on p */ + for (i=0; i<(unsigned int)k; i++) { + for (y=0; y<(unsigned int)m; y++) { + a_val = hpl_accel_byte_swap(a[INDEX_COL(y,i,lda)]); + for (x=0; x<(unsigned int)n; x++) { + p[INDEX_COL(y,x,ldp)] -= a_val * b[INDEX_ROW(i,x,ldb)]; + } + } + } + /* Byte 
swap the result */ + for (y=0; y<(unsigned int)m; y++) { + for (x=0; x<(unsigned int)n; x++) { + p[INDEX_COL(y,x,ldp)] = hpl_accel_byte_swap(p[INDEX_COL(y,x,ldp)]); + } + } + } else { + for (i=0; i<(unsigned int)k; i++) { + for (y=0; y<(unsigned int)m; y++) { + a_val = hpl_accel_byte_swap(a[INDEX_COL(y,i,lda)]); + for (x=0; x<(unsigned int)n; x++) { + c[INDEX_BLK(y+blk_row, x+blk_col, ldc)] -= a_val * b[INDEX_ROW(i,x,ldb)]; + } + } + } + } + if (incomplete) *incomplete = 0; +} + + +void hpl_ref_dgemm_CL_B_B_CL(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned int blk_row, unsigned int blk_col, + double *p, int ldp, + unsigned long long *incomplete) +{ + unsigned int i; + unsigned int x, y; + double a_val; + + if (p) { + /* Copy c into p */ + for (y=0; y<(unsigned int)m; y++) { + for (x=0; x<(unsigned int)n; x++) { + p[INDEX_COL(y,x,ldp)] = c[INDEX_BLK(y+blk_row, x+blk_col, ldc)]; + } + } + /* Perform DGEMM on P */ + for (i=0; i<(unsigned int)k; i++) { + for (y=0; y<(unsigned int)m; y++) { + a_val = hpl_accel_byte_swap(a[INDEX_COL(y,i,lda)]); + for (x=0; x<(unsigned int)n; x++) { + p[INDEX_COL(y,x,ldp)] -= a_val * b[INDEX_BLK(i,x+blk_col,ldb)]; + } + } + } + /* Byte swap the result */ + for (y=0; y<(unsigned int)m; y++) { + for (x=0; x<(unsigned int)n; x++) { + p[INDEX_COL(y,x,ldp)] = hpl_accel_byte_swap(p[INDEX_COL(y,x,ldp)]); + } + } + } else { + for (i=0; i<(unsigned int)k; i++) { + for (y=0; y<(unsigned int)m; y++) { + a_val = hpl_accel_byte_swap(a[INDEX_COL(y,i,lda)]); + for (x=0; x<(unsigned int)n; x++) { + c[INDEX_BLK(y+blk_row, x+blk_col, ldc)] -= a_val * b[INDEX_BLK(i,x+blk_col,ldb)]; + } + } + } + } + if (incomplete) *incomplete = 0; +} + + + +void hpl_ref_dgemm_CL_B_B(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned long long *incomplete) +{ + unsigned int i; + unsigned int x, y; + double a_val; + + for (i=0; i<(unsigned int)k; i++) { + 
for (y=0; y<(unsigned int)m; y++) { + a_val = hpl_accel_byte_swap(a[INDEX_COL(y,i,lda)]); + for (x=0; x<(unsigned int)n; x++) { + c[INDEX_BLK(y,x,ldc)] -= a_val * b[INDEX_BLK(i,x,ldb)]; + } + } + } + + if (incomplete) *incomplete = 0; +} + + +extern void hpl_ref_dgemm_C_C_C(int m, int n, int k, + const double *a, int lda, + const double *b, int ldb, + double *c, int ldc, + unsigned long long *incomplete) +{ + unsigned int i; + unsigned int x, y; + double a_val, c_val; + + for (i=0; i<(unsigned int)k; i++) { + for (y=0; y<(unsigned int)m; y++) { + a_val = a[INDEX_COL(y,i,lda)]; + for (x=0; x<(unsigned int)n; x++) { + c_val = c[INDEX_COL(y,x,ldc)]; + c_val -= a_val * b[INDEX_COL(i,x,ldb)]; + c[INDEX_COL(y,x,ldc)] = c_val; + } + } + } + + if (incomplete) *incomplete = 0; +} + + + +void hpl_ref_dtrsm_CL_R(int m, int n, + const double *a, int lda, + double *b, int ldb, + unsigned long long *incomplete) +{ + unsigned int i, x, y; + + for (x=0; x<(unsigned int)n; x++) { + for (y=1; y<(unsigned int)m; y++) { + for (i=y; i<(unsigned int)m; i++) { + b[INDEX_ROW(i, x, ldb)] -= b[INDEX_ROW(y-1, x, ldb)] * hpl_accel_byte_swap(a[INDEX_COL(i, y-1, lda)]); + } + } + } + if (incomplete) *incomplete = 0; +} + + +void hpl_ref_dtrsm_CL_B(int m, int n, + const double *a, int lda, + double *b, int ldb, + unsigned int blk_row, unsigned int blk_col, + unsigned long long *incomplete) +{ + unsigned int i, x, y; + + for (x=0; x<(unsigned int)n; x++) { + for (y=1; y<(unsigned int)m; y++) { + for (i=y; i<(unsigned int)m; i++) { + b[INDEX_BLK(i+blk_row, x+blk_col, ldb)] -= b[INDEX_BLK(y-1+blk_row, x+blk_col, ldb)] * hpl_accel_byte_swap(a[INDEX_COL(i, y-1, lda)]); + } + } + } + if (incomplete) *incomplete = 0; +} + + +void hpl_ref_dtrsm_CL_R_B(int m, int n, + const double *a, int lda, + double *b, int ldb, + double *c, int ldc, + unsigned int blk_row, unsigned int blk_col, + unsigned long long *incomplete) +{ + unsigned int i, x, y; + + if (c) { + for (x=0; x<(unsigned int)n; x++) { + + for 
(i=0; i<(unsigned int)m; i++) c[INDEX_BLK(i+blk_row, x+blk_col, ldc)] = b[INDEX_ROW(i, x, ldb)]; /* Copy the column of b into c */ + for (y=1; y<(unsigned int)m; y++) { + for (i=y; i<(unsigned int)m; i++) { + c[INDEX_BLK(i+blk_row, x+blk_col, ldc)] -= c[INDEX_BLK(y-1+blk_row, x+blk_col, ldc)] * hpl_accel_byte_swap(a[INDEX_COL(i, y-1, lda)]); + } + } + } + } else { + for (x=0; x<(unsigned int)n; x++) { + for (y=1; y<(unsigned int)m; y++) { + for (i=y; i<(unsigned int)m; i++) { + b[INDEX_ROW(i, x, ldb)] -= b[INDEX_ROW(y-1, x, ldb)] * hpl_accel_byte_swap(a[INDEX_COL(i, y-1, lda)]); + } + } + } + } + if (incomplete) *incomplete = 0; +} + + + + +/* General purpose, reference, reformatting facilities. + */ +void hpl_ref_reform_panel_CL_to_B(int m, int n, + double *a, int lda, + double *panel, int ldp, + unsigned long long *incomplete) +{ + unsigned int x, y; + + for (x=0; x<(unsigned int)n; x++) { + for (y=0; y<(unsigned int)m; y++) { + a[INDEX_BLK(y,x,lda)] = hpl_accel_byte_swap(panel[INDEX_COL(y,x,ldp)]); + } + } + + if (incomplete) *incomplete = 0; +} + + +void hpl_ref_reform_matrix_CL_to_B(int m, int n, + double *a, int lda, + double *scratch, + int size __attribute__ ((unused)) , + unsigned long long *incomplete) + +{ + unsigned int i; + unsigned int x, y; + unsigned int col; + + /* Reformat the matrix [a] from column-order, little-endian to blocked, + * big-endian format.
+ */ + + /* For each column of blocks */ + for (col=0; col<(unsigned int)n; col+=M_SUB) { + /* Reformat this column of blocks into the scratch buffer */ + for (x=0; x<(unsigned int)M_SUB; x++) { + for (y=0; y<(unsigned int)m; y++) { + scratch[INDEX_ROW(y,x,M_SUB)] = hpl_accel_byte_swap(a[INDEX_COL(y,x,lda)]); + } + } + /* Copy the reformatted data back into a */ + memcpy(a, scratch, sizeof(double)*M_SUB*m); + + /* Zero the M_SUB*(lda-m) elements that trail this block column's data */ + a += M_SUB*m; + for (i=0; i<(unsigned int)M_SUB*(lda-m); i++) *a++ = 0.0; + } + if (incomplete) *incomplete = 0; +} + + + + +void hpl_ref_reform_panel_B_to_CL(int m, int n, + double *panel, int ldp, + double *a, int lda, + unsigned long long *incomplete) +{ + unsigned int x, y; + + for (x=0; x<(unsigned int)n; x++) { + for (y=0; y<(unsigned int)m; y++) { + panel[INDEX_COL(y,x,ldp)] = hpl_accel_byte_swap(a[INDEX_BLK(y,x,lda)]); + } + } + if (incomplete) *incomplete = 0; +} + + + +void hpl_ref_reform_panel_R_to_B(int m, int n, + double *a, int lda, + double *panel, int ldp, + unsigned long long *incomplete) +{ + unsigned int x, y; + + for (x=0; x<(unsigned int)n; x++) { + for (y=0; y<(unsigned int)m; y++) { + a[INDEX_BLK(y,x,lda)] = panel[INDEX_ROW(y,x,ldp)]; + } + } + if (incomplete) *incomplete = 0; +} + + +void hpl_ref_reform_rows_R_to_B(int m, int n, + double *rows, int ldr, + double *a, int lda, + int *blk_rows, int blk_col, + unsigned long long *incomplete) +{ + unsigned int x, y; + + for (y=0; y<(unsigned int)m; y++) { + for (x=0; x<(unsigned int)n; x++) { + a[INDEX_BLK((unsigned int)blk_rows[y], blk_col+x, lda)] = rows[INDEX_ROW(y, x, ldr)]; + } + } + if (incomplete) *incomplete = 0; +} + + + +void hpl_ref_reform_rows_B_to_R(int m, int n, + double *rows, int ldr, + double *a, int lda, + int *blk_rows, int blk_col, + unsigned long long *incomplete) + +{ + unsigned int x, y; + + for (y=0; y<(unsigned int)m; y++) { + for (x=0; x<(unsigned int)n; x++) { + rows[INDEX_ROW(y, x, ldr)] = a[INDEX_BLK((unsigned 
int)blk_rows[y], blk_col+x, lda)]; + } + } + if (incomplete) *incomplete = 0; +} + +void hpl_ref_swap_rows_B_to_B(int m, int n, + double *a, int lda, + int *blk_rows, int blk_col, + unsigned long long *incomplete) +{ + unsigned int y1, y2, x; + + for (y1=0; y1<(unsigned int)m; y1++) { + y2 = blk_rows[y1]; /* New location for row y1 */ + if (y1 != y2) { + /* Swap rows y1 and y2 */ + for (x=0; x<(unsigned int)n; x++) { + double tmp = a[INDEX_BLK(y1, x+blk_col, lda)]; + a[INDEX_BLK(y1, x+blk_col, lda)] = a[INDEX_BLK(y2, x+blk_col, lda)]; + a[INDEX_BLK(y2, x+blk_col, lda)] = tmp; + } + } + } + if (incomplete) *incomplete = 0; +} + +void hpl_ref_copy_rows_R_to_R(int m, int n, + double *a, int lda, + double *b, int ldb, + int *rows, + unsigned long long *incomplete) +{ + unsigned int y1, y2, x; + + for (y1=0; y1<(unsigned int)m; y1++) { + y2 = rows[y1]; /* New location for row y1 */ + /* Copy row a[y1] to b[y2] */ + for (x=0; x<(unsigned int)n; x++) { + b[INDEX_ROW(y2, x, ldb)] = a[INDEX_ROW(y1, x, lda)]; + } + } + if (incomplete) *incomplete = 0; +} Index: accel/lib/spu/Makefile =================================================================== RCS file: accel/lib/spu/Makefile diff -N accel/lib/spu/Makefile --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/Makefile 20 Aug 2008 03:57:53 -0000 1.9 @@ -0,0 +1,57 @@ +# --------------------------------------------------------------- +# (C) Copyright IBM Corporation 2007,2008 +# +# --------------------------------------------------------------- + +######################################################################## +# Target +######################################################################## + +PROGRAM_spu := hpl_accel_spu + +LIBRARY_embed64 = libhpl_accel_spu.a + +OBJS = hpl_accel_spu.o \ + accel_dgemm.o \ + accel_dgemm_panel.o \ + accel_dgemm_C.o \ + accel_dtrsm.o \ + accel_dtrsm_panel.o \ + accel_dtrsm_CL_B.o \ + accel_reform_matrix_CL_to_B.o \ + accel_reform_panel_B_to_CL.o \ + 
accel_reform_panel_R_to_B.o \ + accel_reform_rows_B_to_R.o \ + accel_reform_rows_R_to_B.o \ + accel_swap_rows_B_to_B.o \ + accel_copy_rows_R_to_R.o \ + accel_buffers.o \ + accel_mm_dp_64Cx64.o \ + accel_dtrsm_dp_128Cx16.o \ + accel_mm_dp.o + + +######################################################################## +# Local Defines +######################################################################## + +# CC_OPT_LEVEL = -g + +#CPPFLAGS = -DACCEL_LITTLE_ENDIAN +CPPFLAGS += -DMATRIX_4GB_CROSSING + +# THE SPU CODE DOES NOT YET SUPPORT 4GB PANEL CROSSING +#CPPFLAGS += -DPANEL_4GB_CROSSING + +CFLAGS_gcc = -march=celledp -mtune=celledp +CFLAGS_xlc = -qarch=edp -qtune=edp + +INCLUDE = -I.. + + + +######################################################################## +# make.footer +######################################################################## + +include $(CELL_TOP)/buildutils/make.footer Index: accel/lib/spu/accel_buffers.S =================================================================== RCS file: accel/lib/spu/accel_buffers.S diff -N accel/lib/spu/accel_buffers.S --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_buffers.S 23 Oct 2008 21:20:24 -0000 1.3 @@ -0,0 +1,24 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + + .data + .align 7 + .global bufA +bufA: + .global bufA_128x128 +bufA_128x128: + .skip 2*64*64*8 + .global bufB +bufB: .skip 2*64*64*8 + + .global bufC +bufC: + .global bufB_128x16 +bufB_128x16: + .skip 2*128*16*8 + .global bufB_list +bufB_list: + .skip 64*64*8 + Index: accel/lib/spu/accel_buffers.h =================================================================== RCS file: accel/lib/spu/accel_buffers.h diff -N accel/lib/spu/accel_buffers.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_buffers.h 20 Aug 2008 03:57:53 -0000 1.2 @@ -0,0 +1,24 @@ +/* 
---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007 */ +/* */ +/* ---------------------------------------------------------------- */ + +#ifndef _ACCEL_BUFFERS_H_ +#define _ACCEL_BUFFERS_H_ + +#include + +/* The local store buffers are carved up uniquely for each acceleration function. + */ + +/* DGEMM buffer set */ +extern vec_double2 bufA[2][64*64/2]; +extern vec_double2 bufB[2][64*64/2]; +extern vec_double2 bufC[2][64*64/2]; + +/* DTRSM buffer set */ +extern vec_double2 bufA_128x128[128*128/2]; +extern vec_double2 bufB_128x16[2][128*16/2]; +extern vec_uint4 bufB_list[8][128/2]; + +#endif /* _ACCEL_BUFFERS_H_ */ Index: accel/lib/spu/accel_copy_rows_R_to_R.c =================================================================== RCS file: accel/lib/spu/accel_copy_rows_R_to_R.c diff -N accel/lib/spu/accel_copy_rows_R_to_R.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_copy_rows_R_to_R.c 20 Aug 2008 03:57:53 -0000 1.4 @@ -0,0 +1,127 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel_spu.h" +#include "accel_buffers.h" +#include "accel_utils.h" +#include "accel_reform.h" + + +static inline void row_R_to_R(unsigned int src_hi, unsigned int src_lo, + unsigned int dst_hi, unsigned int dst_lo, + unsigned int left, unsigned int *tag) +{ + void *buf[2]; + unsigned int size; + + buf[0] = bufA; + buf[1] = bufB; + + size = 16*1024; + if (size > left) size = left; + + spu_mfcdma64(buf[*tag], src_hi, src_lo, size, *tag, MFC_GETB_CMD); + left -= size; + + while (left) { + + spu_mfcdma64(buf[*tag], dst_hi, dst_lo, size, *tag, MFC_PUTB_CMD); + + *tag ^= 1; + + /* increment src_hi, src_lo, dst_hi, dst_lo */ + MATRIX_EA_UADD32(src_hi, src_lo, size); + MATRIX_EA_UADD32(dst_hi, dst_lo, size); + + size = 
16*1024; + if (size > left) size = left; + + spu_mfcdma64(buf[*tag], src_hi, src_lo, size, *tag, MFC_GETB_CMD); + left -= size; + } + + spu_mfcdma64(buf[*tag], dst_hi, dst_lo, size, *tag, MFC_PUTB_CMD); +} + + +void accel_copy_rows_R_to_R(hpl_accel_init_parms_t *parms, + volatile hpl_accel_copy_rows_parms_t *cmd_parms) +{ + int m, n, lda, ldb; + unsigned int src, dst; + unsigned int id; + unsigned int a_hi, a_lo; + unsigned int b_hi, b_lo; + unsigned int src_hi, src_lo; + unsigned int dst_hi, dst_lo; + unsigned int row_size; + unsigned int tag; + unsigned int rows_per_spe, extra_rows, start_row, end_row; + vector signed int m_n_lda_ldb; + vector unsigned long long a_b, incomplete_pad; + + id = parms->id; + + /* Wait for the transfer of the parameters to complete + */ + DMA_WAIT_RECEIVE(); + DMA_WAIT_REQUEST(-1); + + /* Fetch the parameters + */ + m_n_lda_ldb = cmd_parms->m_n_lda_ldb; + a_b = cmd_parms->a_b; + incomplete_pad = cmd_parms->incomplete_pad; + + m = spu_extract(m_n_lda_ldb, 0); + n = spu_extract(m_n_lda_ldb, 1); + lda = spu_extract(m_n_lda_ldb, 2); + ldb = spu_extract(m_n_lda_ldb, 3); + + a_hi = spu_extract((vector unsigned int)a_b, 0); + a_lo = spu_extract((vector unsigned int)a_b, 1); + + b_hi = spu_extract((vector unsigned int)a_b, 2); + b_lo = spu_extract((vector unsigned int)a_b, 3); + + /* Process rows by assigning each row to one SPE. + */ + row_size = n*sizeof(double); + rows_per_spe = m / HPL_ACCEL_SPES; + extra_rows = m % HPL_ACCEL_SPES; + + start_row = id * rows_per_spe + ((id > extra_rows) ? extra_rows : id); + end_row = start_row + rows_per_spe - spu_extract(spu_cmpgt(spu_promote(extra_rows, 0), spu_promote(id, 0)), 0); + + /* Before starting, make sure all previous DMA transfers are completed so + * that all the LS buffers are known to be available. 
+ */ + DMA_WAIT_RECEIVE(); + + tag = 0; + for (src=start_row; srcrows[src]; + + src_hi = a_hi; src_lo = a_lo; + MATRIX_EA_UADD32(src_hi, src_lo, src*lda); + + dst_hi = b_hi; dst_lo = b_lo; + MATRIX_EA_UADD32(dst_hi, dst_lo, dst*ldb); + + row_R_to_R(src_hi, src_lo, dst_hi, dst_lo, row_size, &tag); + + tag ^= 1; + } + + DMA_WAIT(1<id; + + /* Wait for the transfer of the parameters to complete + */ + DMA_WAIT_RECEIVE(); + + /* Fetch the command parameters + */ + a = cmd_parms->a; + b = cmd_parms->b; + c = cmd_parms->c; + + a_hi = mfc_ea2h(a); + a_lo = mfc_ea2l(a); + + b_hi = mfc_ea2h(b); + b_lo = mfc_ea2l(b); + + c_hi = mfc_ea2h(c); + c_lo = mfc_ea2l(c); + + ld = cmd_parms->ld; + + lda = spu_extract(ld, 0); + ldb = spu_extract(ld, 1); + + dim = cmd_parms->dim; + + flags = cmd_parms->flags; + + b_blk = spu_maskw(spu_extract(flags, 0)); + + /* Computation of [C] -= [A][B] is performed in a serpentine pattern + * through the various sub-blocks of C. Below is a graphical attempt + * to explain the partitioning and order of the computation. For this + * example, consider the matrix-matrix multiply of a 5x5 (128x128 block) + * result after panel factorization of block 0,0 (bx,by). In this case, + * we must compute 128x128 block multiplies as follows: + * + * for (x=1; x<5; x++) { + * for (y=1; y<5; y++) { + * C(x,y) -= A(bx,y)*B(x,by); + * } + * } + * + * Assuming this computation is performed by 3 SPEs, the 16 blocks + * are subdivided as: + * + * SPE 0 : C(1,1), C(1,2), C(1,3), C(1,4), C(2,1), C(2,2) + * SPE 1 : C(2,3), C(2,4), C(3,1), C(3,2), C(3,3), C(3,4) + * SPE 2 : C(4,1), C(4,2), C(4,3), C(4,4) + * + * Therefore, SPE 1 will compute the resulting sub-blocks of C in the + * alphabetic order (a through x) as marked below.
+ * + * X + * 0 1 2 3 4 + * +---B---+---+---+---+ + * 0 | | U row | + * | | | + * A---C---+---+---+---+ + * 1 | | | |i x| | + * | | | |j w| | + * + L +---+---+---+---+ + * Y 2 | | | |k v| | + * | p | | |l u| | + * + a +---+---+---+---+ + * 3 | n | |a h|m t| | + * | e | |b g|n s| | + * + l +---+---+---+---+ + * 4 | | |c f|o r| | + * | | |d e|p q| | + * +---+---+---+---+---+ + * + * Using 128x128 block partitioning amongst the SPEs results in non-optimal + * load balancing of the SPEs. This is shown by the above example in which + * SPEs 0 and 1 compute 24 64x64 multiplies, while SPE 2 only computes + * 16 64x64 multiplies. In addition, the corner turn between sub-blocks + * 'h' and 'i' will incur extra DMAs. + * + * A more computationally and transfer-efficient load balance would be + * to allocate computation on the 64 sub-blocks. This would allocate + * 22,22,20 sub-block multiplies to each of the SPEs and the corner + * turn becomes efficient. The sub-block computation (alphabetically + * ordered) for SPE 1 becomes: + * + * X + * 0 1 2 3 4 + * +---B---+---+---+---+ + * 0 | | U row | + * | | | + * A---C---+---+---+---+ + * 1 | | | j|k | | + * | | | i|l | | + * + L +---+---+---+---+ + * Y 2 | | | h|m | | + * | p | | g|n | | + * + a +---+---+---+---+ + * 3 | n | | f|o v| | + * | e | | e|p u| | + * + l +---+---+---+---+ + * 4 | | |a d|q t| | + * | | |b c|r s| | + * +---+---+---+---+---+ + * + * This more efficient method is employed in the following code.
+ */ + + w_sub = spu_extract(dim, 0); + h_sub_v = spu_shuffle(dim, dim, splat_1); + h_sub = spu_extract(h_sub_v, 0); + + h_sub2_v = spu_sl(h_sub_v, 1); + + sub_blocks = w_sub * h_sub; + sub_blocks_per_spe = (sub_blocks + HPL_ACCEL_SPES-1) / HPL_ACCEL_SPES; + + start_sub = ((unsigned short)id) * sub_blocks_per_spe; + end_sub = start_sub + sub_blocks_per_spe; + if (end_sub > sub_blocks) end_sub = sub_blocks; + + sub_blocks = end_sub - start_sub; + + if (LIKELY((int)sub_blocks > 0)) { + /* This SPE has some work to do + */ + DMA_WAIT_REQUEST(-1); + + /* Compute vectors for stepping the effective address matrix pointers. + * The pictograms below show 64x64 blocks within the 128x128 blocks. + * + * A (L panel) B (U panel) C matrix + * ++===+===++ ++---+---++---+---++ ++===+===++===+===++ + * || 1 | 2 || || 1 | 4 || 5 | || || 1 | || | || + * ++---+---++ ++---+---++---+---++ ++---+---++---+---++ + * || 3 | || || 2 | 3 || | || || 2 | || | || + * ++===+===++ ++---+---++---+---++ ++===+===++===+===++ + * || | || || | || | || + * ++---+---++ ++---+---++---+---++ + * || | || || 3 | 4 || | || + * ++===+===++ ++===+===++===+===++ + * + * a_step = {1 to 2, 2 to 3, 1 to 2, 2 to 3} + * b_step = {1 to 2, 2 to 3, 3 to 4, 4 to 5} + * c_stepv= {1 to 2, 1 to 2, 1 to 2, 1 to 2} + * c_steph= {3 to 4, 3 to 4, 3 to 4, 3 to 4} + */ + + a_step = spu_promote(lda * M_SUB, 0); + a_step = spu_shuffle(a_step, spu_sub(step_sub, a_step), shuf_0404); + + c_stepv = spu_splats(M_SUB*M_SUB*sizeof(double)); + c_steph = spu_shuffle(ld, ld, splat_2); + + b_step = spu_sel(spu_promote(ldb * M_SUB, 0), c_stepv, b_blk); + b_step = spu_shuffle(b_step, spu_sub(0, b_step), shuf_0044); + b_step = spu_sel(b_step, spu_sel(step_sub, spu_shuffle(ld, ld, splat_1), b_blk), mask_0101); + + ldb = spu_extract(spu_sel(spu_promote(ldb, 0), step_sub, b_blk), 0); + + /* Determine the following: + * 1) Starting sub-block - x_sub, y_sub + * 2) Number of sub-block multiplies before a corner turn - corner. 
+ */ + x_sub = start_sub / h_sub; + y_sub = start_sub - h_sub * x_sub; + + start_x = x_sub / SUB; + y_sub = start_sub - h_sub*SUB*start_x; + + /* rotate = 4; + * + * if (x_sub & 1) { + * y_sub = h_sub - 1 - y_sub; + * a_step = spu_sub(0, a_step); + * c_stepv = spu_sub(0, c_stepv); + * rotate = -rotate; + * corner = 2*y_sub + 2 + * } else { + * corner = 2 * (h_sub-y_sub) + * } + */ + odd = x_sub & 1; + + down = spu_cmpeq(spu_splats(odd), 0); + + y_sub = spu_extract(spu_sel(spu_sub(h_sub2_v, spu_promote(y_sub + 1, 0)), + spu_promote(y_sub, 0), + down), 0); + + y_sub2_v = spu_splats(2*y_sub); + + corner = spu_sel(spu_add(y_sub2_v, 2), spu_sub(h_sub2_v, y_sub2_v), down); + + /* Compute the initial EA buffer pointers. + */ + a_addend = y_sub * spu_extract(step_sub, 0) + spu_extract(spu_andc(a_step, down), 0); + b_addend = spu_extract(spu_andc(b_step, down), 0); + c_addend = y_sub * spu_extract(c_stepv, 0); + + a_lo += a_addend; + + MATRIX_EA_UADD32(b_hi, b_lo, b_addend); + MATRIX_EA_UMADD32(b_hi, b_lo, x_sub, spu_extract(b_step, 1)); + MATRIX_EA_UADD32(c_hi, c_lo, c_addend); + MATRIX_EA_UMADD32(c_hi, c_lo, x_sub, spu_extract(c_steph, 0)); + + /* Adjust the pointer steps according to the initial direction. + */ + a_step = spu_sel(spu_sub(0, a_step), a_step, down); + b_step = spu_rlqwbyte(b_step, 8 & ~spu_extract(down, 0)); + c_stepv = spu_sel(spu_sub(0, c_stepv), c_stepv, down); + rotate = ((-4) ^ spu_extract(down, 0)) - spu_extract(down, 0); + + /* Before starting, make sure all previous DMA transfers are completed so + * that all the LS buffers are known to be available. + */ + DMA_WAIT_RECEIVE(); + + /* Download 3 blocks to get the process started. After that, each + * 64x64 block multiple requires 2 block transfers. 
+ */ + dma_block_getl(&bufA[0][0], a_hi, a_lo, 0, lda); + + dma_block_getl(&bufB[0][0], b_hi, b_lo, 0, ldb); + + dma_block(&bufC[0][0], c_hi, c_lo, 0, MFC_GET_CMD); + + DMA_WAIT_REQUEST(1<<0); + + c_lo_prev = c_lo; + + a_lo += spu_extract(a_step, 0); + MATRIX_EA_ADD32(b_hi, b_lo, spu_extract(b_step, 0)); + + dma_block_getl(&bufA[1][0], a_hi, a_lo, 1, lda); + + dma_block_getl(&bufB[1][0], b_hi, b_lo, 1, ldb); + + phase = 0; + + i1 = 0; + a_idx = 0; + + /* For each C block, we perform 2 block computations + */ + for (i=0; i<(int)sub_blocks-1; i++) { + /* First block computation + */ + DMA_WAIT_RECEIVE(); + DMA_WAIT_REQUEST((1<<1)|(1<<2)); + + mm_dp_64Cx64(&bufC[i1][0], &bufA[a_idx][0], &bufB[0][0]); + + a_step = spu_rlqwbyte(a_step, rotate); + + c_idx = i1 ^ 1; + + corner_eq2 = spu_cmpeq(corner, 2); + + /* if (corner == 2) { + * rotate = -rotate; + * a_step = 0-a_step; + * } else { + * a_lo += a_step; + * } + */ + rotate = (rotate ^ spu_extract(corner_eq2, 0)) - spu_extract(corner_eq2, 0); + a_lo += spu_extract(spu_andc(a_step, corner_eq2), 0); + a_step = spu_sel(a_step, spu_sub(0, a_step), corner_eq2); + + /* if corner != 2 then fetch next A buffer + * else "corner turn" fetch next B buffer + */ + b_step = spu_rlqwbyte(b_step, 4 & spu_extract(corner_eq2, 0)); + MATRIX_EA_ADD32(b_hi, b_lo, spu_extract(spu_and(b_step, corner_eq2), 0)); + + idx = spu_extract(spu_andc(spu_promote(a_idx, 0), corner_eq2), 0); + buf = spu_extract(spu_sel(spu_promote((unsigned int)bufA, 0), + spu_promote((unsigned int)bufB, 0), + corner_eq2), 0); + hi = spu_extract(spu_sel(spu_promote(a_hi, 0), + spu_promote(b_hi, 0), + corner_eq2), 0); + lo = spu_extract(spu_sel(spu_promote(a_lo, 0), + spu_promote(b_lo, 0), + corner_eq2), 0); + stride = spu_extract(spu_sel(spu_promote(lda, 0), + spu_promote(ldb, 0), + corner_eq2), 0); + + buf += idx * (unsigned int)(sizeof(bufA)/2); + +#ifdef __GNUC__ + /* The following lnop was added to keep gcc from unscheduling the + * series of add,stqd instruction 
pairs used to build the DMA list in + * dma_block_getl. + */ + si_lnop(); +#endif + + dma_block_getl((vec_double2 *)buf, hi, lo, 0, stride); + + /* if (corner == 2) { + * c_lo += c_steph; + * c_stepv = -c_stepv; + * } else { + * c_lo += c_stepv; + * } + */ +#ifdef MATRIX_4GB_CROSSING + c_hi_prev = c_hi; +#endif + c_lo_prev = c_lo; + c_addend = spu_extract(spu_sel(c_stepv, c_steph, corner_eq2), 0); + MATRIX_EA_ADD32(c_hi, c_lo, c_addend); + c_stepv = spu_sel(c_stepv, spu_sub(0, c_stepv), corner_eq2); + + /* Before getting another C buffer, we must wait for the previous + * one to be stored. + */ + DMA_WAIT_RECEIVE(); + dma_block(&bufC[c_idx][0], c_hi, c_lo, 0, MFC_GET_CMD); + + DMA_WAIT_REQUEST(1<<0); + + a_idx = phase^1; + + /* Second block computation + */ + c_ptr = &bufC[i1][0]; + + mm_dp_64Cx64(c_ptr, &bufA[a_idx][0], &bufB[1][0]); + + a_step = spu_rlqwbyte(a_step, rotate); + a_lo += spu_extract(a_step, 0); + + /* if corner != 2 then fetch next A buffer + * else "corner turn" fetch next B buffer + */ + b_step = spu_rlqwbyte(b_step, 4 & spu_extract(corner_eq2, 0)); + MATRIX_EA_ADD32(b_hi, b_lo, spu_extract(spu_and(b_step, corner_eq2), 0)); + + idx = spu_extract(spu_sel(spu_promote(a_idx, 0), vone, corner_eq2), 0); + buf = spu_extract(spu_sel(spu_promote((unsigned int)bufA, 0), + spu_promote((unsigned int)bufB, 0), + corner_eq2), 0); + hi = spu_extract(spu_sel(spu_promote(a_hi, 0), + spu_promote(b_hi, 0), + corner_eq2), 0); + lo = spu_extract(spu_sel(spu_promote(a_lo, 0), + spu_promote(b_lo, 0), + corner_eq2), 0); + stride = spu_extract(spu_sel(spu_promote(lda, 0), + spu_promote(ldb, 0), + corner_eq2), 0); + + buf += idx * (unsigned int)(sizeof(bufA)/2); + dma_block_getl((vec_double2 *)buf, hi, lo, 1, stride); + +#ifdef MATRIX_4GB_CROSSING + dma_block(c_ptr, c_hi_prev, c_lo_prev, 2, MFC_PUT_CMD); +#else + dma_block(c_ptr, c_hi, c_lo_prev, 2, MFC_PUT_CMD); +#endif + + corner = spu_sel(spu_add(corner, -2), h_sub2_v, corner_eq2); + phase ^= spu_extract(corner_eq2, 0) 
& 1; + + i1 ^= 1; + a_idx = phase; + } + + /* Finish the last sub-block */ + DMA_WAIT_RECEIVE(); + DMA_WAIT_REQUEST((1<<1)|(1<<2)); + + mm_dp_64Cx64(&bufC[i1][0], &bufA[a_idx][0], &bufB[0][0]); + + DMA_WAIT_RECEIVE(); + + mm_dp_64Cx64(&bufC[i1][0], &bufA[a_idx^1][0], &bufB[1][0]); + + dma_block(&bufC[i1][0], c_hi, c_lo, 1, MFC_PUT_CMD); + } + + /* Report completion status if requested. + */ + report_completion(id, cmd_parms->incomplete, 1); +} + + + Index: accel/lib/spu/accel_dgemm.h =================================================================== RCS file: accel/lib/spu/accel_dgemm.h diff -N accel/lib/spu/accel_dgemm.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_dgemm.h 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,164 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#ifndef _ACCEL_DGEMM_H_ +#define _ACCEL_DGEMM_H_ 1 + +#include +#include +#include +#include +#include "hpl_accel_spu.h" +#include "accel_buffers.h" +#include "accel_utils.h" + +extern hpl_accel_init_parms_t parms; + +extern void mm_dp_64Cx64(vec_double2 *blkC, vec_double2 *blkA, vec_double2 *blkB); + +static inline void dma_block(vec_double2 *ls, unsigned int hi, unsigned int lo, unsigned int tag, unsigned int cmd) +{ + spu_mfcdma64(ls, hi, lo, 16384, tag, cmd); + spu_mfcdma64(ls+(16384/16), hi, lo+16384, 16384, tag, cmd); +} + +static inline void dma_block_getl(vec_double2 *ls, unsigned int hi, unsigned int lo, unsigned int tag, unsigned int stride) +{ + vec_uint4 *list; + vec_uint4 e0, e1, e2; + vec_uint4 stride2, stride4, stride6; + + /* Place the list at the end of the target LS buffer. 
+ */ + list = (vec_uint4 *)ls + (((M_SUB*M_SUB*sizeof(double)) - (M_SUB*8)) / sizeof(vec_uint4)); + + /* Construct e0, e1, e2 and stride6 to contain + * + * e0 = {row size, lo+0*stride, row size, lo+1*stride} + * e1 = {row size, lo+2*stride, row size, lo+3*stride} + * e2 = {row size, lo+4*stride, row size, lo+5*stride} + * + * stride6 = {0, 6*stride, 0, 6*stride} + */ + + e0 = spu_add(spu_shuffle(spu_splats((unsigned int)(M_SUB*sizeof(double))), + spu_promote(lo, 0), + ((vec_uchar16){0,1,2,3, 16,17,18,19, 0,1,2,3, 16,17,18,19})), + spu_rlmaskqwbyte(spu_promote(stride, 0), -12)); + + + stride2 = spu_sl(spu_shuffle(spu_promote(stride, 0), spu_promote(stride, 0), + ((vec_uchar16){128,128,128,128, 0,1,2,3, 128,128,128,128, 0,1,2,3})), 1); + stride4 = spu_add(stride2, stride2); + stride6 = spu_add(stride2, stride4); + + e1 = spu_add(e0, stride2); + + e2 = spu_add(e0, stride4); list[0] = e0; + e0 = spu_add(e0, stride6); list[1] = e1; + e1 = spu_add(e1, stride6); list[2] = e2; + e2 = spu_add(e2, stride6); list[3] = e0; + e0 = spu_add(e0, stride6); list[4] = e1; + e1 = spu_add(e1, stride6); list[5] = e2; + e2 = spu_add(e2, stride6); list[6] = e0; + e0 = spu_add(e0, stride6); list[7] = e1; + e1 = spu_add(e1, stride6); list[8] = e2; + e2 = spu_add(e2, stride6); list[9] = e0; + e0 = spu_add(e0, stride6); list[10] = e1; + e1 = spu_add(e1, stride6); list[11] = e2; + e2 = spu_add(e2, stride6); list[12] = e0; + e0 = spu_add(e0, stride6); list[13] = e1; + e1 = spu_add(e1, stride6); list[14] = e2; + e2 = spu_add(e2, stride6); list[15] = e0; + e0 = spu_add(e0, stride6); list[16] = e1; + e1 = spu_add(e1, stride6); list[17] = e2; + e2 = spu_add(e2, stride6); list[18] = e0; + e0 = spu_add(e0, stride6); list[19] = e1; + e1 = spu_add(e1, stride6); list[20] = e2; + e2 = spu_add(e2, stride6); list[21] = e0; + e0 = spu_add(e0, stride6); list[22] = e1; + e1 = spu_add(e1, stride6); list[23] = e2; + e2 = spu_add(e2, stride6); list[24] = e0; + e0 = spu_add(e0, stride6); list[25] = e1; + e1 = 
spu_add(e1, stride6); list[26] = e2; + e2 = spu_add(e2, stride6); list[27] = e0; + e0 = spu_add(e0, stride6); list[28] = e1; + e1 = spu_add(e1, stride6); list[29] = e2; + list[30] = e0; + list[31] = e1; + + /* Initiate the DMA transfer + */ + spu_mfcdma64(ls, hi, (unsigned int)list, 8*M_SUB, tag, MFC_GETL_CMD); +} + +static inline void dma_block_putl(vec_uint4 *list, vec_double2 *ls, unsigned int hi, unsigned int lo, unsigned int tag, unsigned int stride) +{ + vec_uint4 e0, e1, e2; + vec_uint4 stride2, stride4, stride6; + + /* Construct e0, e1, e2 and stride6 to contain + * + * e0 = {row size, lo+0*stride, row size, lo+1*stride} + * e1 = {row size, lo+2*stride, row size, lo+3*stride} + * e2 = {row size, lo+4*stride, row size, lo+5*stride} + * + * stride6 = {0, 6*stride, 0, 6*stride} + */ + + e0 = spu_add(spu_shuffle(spu_splats((unsigned int)(M_SUB*sizeof(double))), + spu_promote(lo, 0), + ((vec_uchar16){0,1,2,3, 16,17,18,19, 0,1,2,3, 16,17,18,19})), + spu_rlmaskqwbyte(spu_promote(stride, 0), -12)); + + + stride2 = spu_sl(spu_shuffle(spu_promote(stride, 0), spu_promote(stride, 0), + ((vec_uchar16){128,128,128,128, 0,1,2,3, 128,128,128,128, 0,1,2,3})), 1); + stride4 = spu_add(stride2, stride2); + stride6 = spu_add(stride2, stride4); + + e1 = spu_add(e0, stride2); + + e2 = spu_add(e0, stride4); list[0] = e0; + e0 = spu_add(e0, stride6); list[1] = e1; + e1 = spu_add(e1, stride6); list[2] = e2; + e2 = spu_add(e2, stride6); list[3] = e0; + e0 = spu_add(e0, stride6); list[4] = e1; + e1 = spu_add(e1, stride6); list[5] = e2; + e2 = spu_add(e2, stride6); list[6] = e0; + e0 = spu_add(e0, stride6); list[7] = e1; + e1 = spu_add(e1, stride6); list[8] = e2; + e2 = spu_add(e2, stride6); list[9] = e0; + e0 = spu_add(e0, stride6); list[10] = e1; + e1 = spu_add(e1, stride6); list[11] = e2; + e2 = spu_add(e2, stride6); list[12] = e0; + e0 = spu_add(e0, stride6); list[13] = e1; + e1 = spu_add(e1, stride6); list[14] = e2; + e2 = spu_add(e2, stride6); list[15] = e0; + e0 = spu_add(e0, 
stride6); list[16] = e1; + e1 = spu_add(e1, stride6); list[17] = e2; + e2 = spu_add(e2, stride6); list[18] = e0; + e0 = spu_add(e0, stride6); list[19] = e1; + e1 = spu_add(e1, stride6); list[20] = e2; + e2 = spu_add(e2, stride6); list[21] = e0; + e0 = spu_add(e0, stride6); list[22] = e1; + e1 = spu_add(e1, stride6); list[23] = e2; + e2 = spu_add(e2, stride6); list[24] = e0; + e0 = spu_add(e0, stride6); list[25] = e1; + e1 = spu_add(e1, stride6); list[26] = e2; + e2 = spu_add(e2, stride6); list[27] = e0; + e0 = spu_add(e0, stride6); list[28] = e1; + e1 = spu_add(e1, stride6); list[29] = e2; + list[30] = e0; + list[31] = e1; + + /* Initiate the DMA transfer + */ + spu_mfcdma64(ls, hi, (unsigned int)list, 8*M_SUB, tag, MFC_PUTL_CMD); +} + + + +#endif /* _ACCEL_DGEMM_H_ */ Index: accel/lib/spu/accel_dgemm_C.c =================================================================== RCS file: accel/lib/spu/accel_dgemm_C.c diff -N accel/lib/spu/accel_dgemm_C.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_dgemm_C.c 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,229 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel_spu.h" +#include "accel_buffers.h" +#include "accel_utils.h" + + +extern void mm_dp(int k, int m, int n, vector double *c, vector double *a, vector double *b); + + +/* Construct a DMA list assuming that there are 64 columns. If it is less, then they don't get used. 
+ */ +static inline void construct_list(vec_uint4 *list, unsigned int lo, unsigned int stride, unsigned int elementsize) +{ + vec_uint4 e0, e1, e2; + vec_uint4 stride2, stride4, stride6; + + /* Construct e0, e1, e2 and stride6 to contain + * + * e0 = {row size, lo+0*stride, row size, lo+1*stride} + * e1 = {row size, lo+2*stride, row size, lo+3*stride} + * e2 = {row size, lo+4*stride, row size, lo+5*stride} + * + * stride6 = {0, 6*stride, 0, 6*stride} + */ + + e0 = spu_add(spu_shuffle(spu_promote(elementsize, 0), spu_promote(lo, 0), + ((vec_uchar16){0,1,2,3, 16,17,18,19, 0,1,2,3, 16,17,18,19})), + spu_rlmaskqwbyte(spu_promote(stride, 0), -12)); + + + stride2 = spu_sl(spu_shuffle(spu_promote(stride, 0), spu_promote(stride, 0), + ((vec_uchar16){128,128,128,128, 0,1,2,3, 128,128,128,128, 0,1,2,3})), 1); + stride4 = spu_add(stride2, stride2); + stride6 = spu_add(stride2, stride4); + + e1 = spu_add(e0, stride2); + + e2 = spu_add(e0, stride4); list[0] = e0; + e0 = spu_add(e0, stride6); list[1] = e1; + e1 = spu_add(e1, stride6); list[2] = e2; + e2 = spu_add(e2, stride6); list[3] = e0; + e0 = spu_add(e0, stride6); list[4] = e1; + e1 = spu_add(e1, stride6); list[5] = e2; + e2 = spu_add(e2, stride6); list[6] = e0; + e0 = spu_add(e0, stride6); list[7] = e1; + e1 = spu_add(e1, stride6); list[8] = e2; + e2 = spu_add(e2, stride6); list[9] = e0; + e0 = spu_add(e0, stride6); list[10] = e1; + e1 = spu_add(e1, stride6); list[11] = e2; + e2 = spu_add(e2, stride6); list[12] = e0; + e0 = spu_add(e0, stride6); list[13] = e1; + e1 = spu_add(e1, stride6); list[14] = e2; + e2 = spu_add(e2, stride6); list[15] = e0; + e0 = spu_add(e0, stride6); list[16] = e1; + e1 = spu_add(e1, stride6); list[17] = e2; + e2 = spu_add(e2, stride6); list[18] = e0; + e0 = spu_add(e0, stride6); list[19] = e1; + e1 = spu_add(e1, stride6); list[20] = e2; + e2 = spu_add(e2, stride6); list[21] = e0; + e0 = spu_add(e0, stride6); list[22] = e1; + e1 = spu_add(e1, stride6); list[23] = e2; + e2 = spu_add(e2, stride6); 
list[24] = e0; + e0 = spu_add(e0, stride6); list[25] = e1; + e1 = spu_add(e1, stride6); list[26] = e2; + e2 = spu_add(e2, stride6); list[27] = e0; + e0 = spu_add(e0, stride6); list[28] = e1; + e1 = spu_add(e1, stride6); list[29] = e2; + list[30] = e0; + list[31] = e1; +} + + + +/* Double precision DGEMM matrix-matrix multiply for column-ordered + * matrices. + */ +void accel_dgemm_C_C_C(hpl_accel_init_parms_t *parms, + volatile hpl_accel_dgemm_parms_t *cmd_parms) +{ + int rows, next_rows; + unsigned int id, i, k, m, m_start, m_next, n; + unsigned int elementsize, idx, tag; + unsigned int blks, blks_per_spe, extra_blks; + unsigned long long a, b, c; /* ea pointers */ + unsigned int a_hi, a_lo; + unsigned int b_hi, b_lo; + unsigned int c_hi, c_lo; + unsigned int lda, ldb, ldc; + vec_uint4 ld, dim, *list, *c_list, *c_list_next; + vec_double2 *A, *B, *C; + void *ptrB; + + id = parms->id; + + /* Wait for the transfer of the parameters to complete + */ + DMA_WAIT_RECEIVE(); + DMA_WAIT_REQUEST(-1); + + /* Fetch the command parameters + */ + a = cmd_parms->a; + b = cmd_parms->b; + c = cmd_parms->c; + + a_hi = mfc_ea2h(a); + a_lo = mfc_ea2l(a); + + b_hi = mfc_ea2h(b); + b_lo = mfc_ea2l(b); + + c_hi = mfc_ea2h(c); + c_lo = mfc_ea2l(c); + + ld = cmd_parms->ld; + + lda = spu_extract(ld, 0); + ldb = spu_extract(ld, 1); + ldc = spu_extract(ld, 2); + + dim = cmd_parms->dim; + + n = spu_extract(dim, 0); + m = spu_extract(dim, 1); + k = spu_extract(dim, 2); + + /* Get a copy of B + */ + B = (void *)&bufB[0][0]; + ptrB = B; + DMA_WAIT_RECEIVE(); + for (i=0; i m) rows = m - m_start; + + a_lo += m_start * sizeof(double); + c_lo += m_start * sizeof(double); + + /* Fetch a block of A and C + */ + m = (rows > M_SUB) ? 
M_SUB : rows; + + elementsize = m * sizeof(double); + A = (void *)&bufA[0][0]; + list = (vec_uint4 *)A + (((M_SUB*M_SUB*sizeof(double)) - (M_SUB*8)) / sizeof(vec_uint4)); + construct_list(list, a_lo, lda, elementsize); + spu_mfcdma64(A, a_hi, (unsigned int)list, 8*k, 0, MFC_GETL_CMD); + + c_list = (vec_uint4 *)&bufB[1][0]; + construct_list(c_list, c_lo, ldc, elementsize); + spu_mfcdma64((vec_double2 *)&bufC[0][0], c_hi, (unsigned int)c_list, 8*n, 0, MFC_GETL_CMD); + + DMA_WAIT_REQUEST(1); + + tag = 1; + idx = 1; + next_rows = rows - M_SUB; + + while (next_rows > 0) { + /* Fetch the next block of A and C */ + a_lo += elementsize; + c_lo += elementsize; + + m_next = (next_rows > M_SUB) ? M_SUB : next_rows; + elementsize = m_next * sizeof(double); + A = (void *)&bufA[tag][0]; + list = (vec_uint4 *)A + (((M_SUB*M_SUB*sizeof(double)) - (M_SUB*8)) / sizeof(vec_uint4)); + construct_list(list, a_lo, lda, elementsize); + spu_mfcdma64(A, a_hi, (unsigned int)list, 8*k, tag, MFC_GETLB_CMD); + + c_list_next = (vec_uint4 *)&bufB[1][idx*M_SUB]; + construct_list(c_list_next, c_lo, ldc, elementsize); + spu_mfcdma64((vec_double2 *)&bufC[tag][0], c_hi, (unsigned int)c_list_next, 8*n, tag, MFC_GETL_CMD); + + /* Compute a block */ + DMA_WAIT_RECEIVE(); + DMA_WAIT_REQUEST(1<incomplete, tag); +} Index: accel/lib/spu/accel_dgemm_CL.c =================================================================== RCS file: accel/lib/spu/accel_dgemm_CL.c diff -N accel/lib/spu/accel_dgemm_CL.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_dgemm_CL.c 14 May 2008 21:35:00 -0000 1.6 @@ -0,0 +1,231 @@ +/* -------------------------------------------------------------- */ +/* (C)Copyright 2007 */ +/* International Business Machines Corporation, */ +/* */ +/* All Rights Reserved. 
*/ +/* -------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel_spu.h" +#include "accel_buffers.h" +#include "accel_utils.h" + + +extern void mm_dp(int k, int m, int n, vector double *c, vector double *a, vector double *b); + + +/* Construct a DMA list assuming that there are 64 columns. If it is less, then they don't get used. + */ +static inline void construct_list(vec_uint4 *list, unsigned int lo, unsigned int stride, unsigned int elementsize) +{ + vec_uint4 e0, e1, e2; + vec_uint4 stride2, stride4, stride6; + + /* Construct e0, e1, e2 and stride6 to contain + * + * e0 = {row size, lo+0*stride, row size, lo+1*stride} + * e1 = {row size, lo+2*stride, row size, lo+3*stride} + * e2 = {row size, lo+4*stride, row size, lo+5*stride} + * + * stride6 = {0, 6*stride, 0, 6*stride} + */ + + e0 = spu_add(spu_shuffle(spu_promote(elementsize, 0), spu_promote(lo, 0), + ((vec_uchar16){0,1,2,3, 16,17,18,19, 0,1,2,3, 16,17,18,19})), + spu_rlmaskqwbyte(spu_promote(stride, 0), -12)); + + + stride2 = spu_sl(spu_shuffle(spu_promote(stride, 0), spu_promote(stride, 0), + ((vec_uchar16){128,128,128,128, 0,1,2,3, 128,128,128,128, 0,1,2,3})), 1); + stride4 = spu_add(stride2, stride2); + stride6 = spu_add(stride2, stride4); + + e1 = spu_add(e0, stride2); + + e2 = spu_add(e0, stride4); list[0] = e0; + e0 = spu_add(e0, stride6); list[1] = e1; + e1 = spu_add(e1, stride6); list[2] = e2; + e2 = spu_add(e2, stride6); list[3] = e0; + e0 = spu_add(e0, stride6); list[4] = e1; + e1 = spu_add(e1, stride6); list[5] = e2; + e2 = spu_add(e2, stride6); list[6] = e0; + e0 = spu_add(e0, stride6); list[7] = e1; + e1 = spu_add(e1, stride6); list[8] = e2; + e2 = spu_add(e2, stride6); list[9] = e0; + e0 = spu_add(e0, stride6); list[10] = e1; + e1 = spu_add(e1, stride6); list[11] = e2; + e2 = spu_add(e2, stride6); list[12] = e0; + e0 = spu_add(e0, stride6); list[13] = e1; + e1 = spu_add(e1, stride6); list[14] = e2; + e2 = spu_add(e2, 
stride6); list[15] = e0; + e0 = spu_add(e0, stride6); list[16] = e1; + e1 = spu_add(e1, stride6); list[17] = e2; + e2 = spu_add(e2, stride6); list[18] = e0; + e0 = spu_add(e0, stride6); list[19] = e1; + e1 = spu_add(e1, stride6); list[20] = e2; + e2 = spu_add(e2, stride6); list[21] = e0; + e0 = spu_add(e0, stride6); list[22] = e1; + e1 = spu_add(e1, stride6); list[23] = e2; + e2 = spu_add(e2, stride6); list[24] = e0; + e0 = spu_add(e0, stride6); list[25] = e1; + e1 = spu_add(e1, stride6); list[26] = e2; + e2 = spu_add(e2, stride6); list[27] = e0; + e0 = spu_add(e0, stride6); list[28] = e1; + e1 = spu_add(e1, stride6); list[29] = e2; + list[30] = e0; + list[31] = e1; +} + + + +/* Double precision DGEMM matrix-matrix multiply for column-ordered + * matrices. + */ +void accel_dgemm_CL_C_C(hpl_accel_init_parms_t *parms, + volatile hpl_accel_dgemm_parms_t *cmd_parms) +{ + int rows, next_rows; + unsigned int id, i, k, m, m_start, m_next, n; + unsigned int elementsize, idx, tag; + unsigned int blks, blks_per_spe, extra_blks; + unsigned long long a, b, c; /* ea pointers */ + unsigned int a_hi, a_lo; + unsigned int b_hi, b_lo; + unsigned int c_hi, c_lo; + unsigned int lda, ldb, ldc; + vec_uint4 ld, dim, *list, *c_list, *c_list_next; + vec_double2 *A, *B, *C; + void *ptrB; + + id = parms->id; + + /* Wait for the transfer of the parameters to complete + */ + DMA_WAIT_RECEIVE(); + DMA_WAIT_REQUEST(-1); + + /* Fetch the command parameters + */ + a = cmd_parms->a; + b = cmd_parms->b; + c = cmd_parms->c; + + a_hi = mfc_ea2h(a); + a_lo = mfc_ea2l(a); + + b_hi = mfc_ea2h(b); + b_lo = mfc_ea2l(b); + + c_hi = mfc_ea2h(c); + c_lo = mfc_ea2l(c); + + ld = cmd_parms->ld; + + lda = spu_extract(ld, 0); + ldb = spu_extract(ld, 1); + ldc = spu_extract(ld, 2); + + dim = cmd_parms->dim; + + n = spu_extract(dim, 0); + m = spu_extract(dim, 1); + k = spu_extract(dim, 2); + + /* Get a copy of B + */ + B = (void *)&bufB[0][0]; + ptrB = B; + DMA_WAIT_RECEIVE(); + for (i=0; i m) rows = m - m_start; + 
+ a_lo += m_start * sizeof(double); + c_lo += m_start * sizeof(double); + + /* Fetch a block of A and C + */ + m = (rows > M_SUB) ? M_SUB : rows; + + elementsize = m * sizeof(double); + A = (void *)&bufA[0][0]; + list = (vec_uint4 *)A + (((M_SUB*M_SUB*sizeof(double)) - (M_SUB*8)) / sizeof(vec_uint4)); + construct_list(list, a_lo, lda, elementsize); + spu_mfcdma64(A, a_hi, (unsigned int)list, 8*k, 0, MFC_GETL_CMD); + + c_list = (vec_uint4 *)&bufB[1][0]; + construct_list(c_list, c_lo, ldc, elementsize); + spu_mfcdma64((vec_double2 *)&bufC[0][0], c_hi, (unsigned int)c_list, 8*n, 0, MFC_GETL_CMD); + + DMA_WAIT_REQUEST(1); + + tag = 1; + idx = 1; + next_rows = rows - M_SUB; + + while (next_rows > 0) { + /* Fetch the next block of A and C */ + a_lo += elementsize; + c_lo += elementsize; + + m_next = (next_rows > M_SUB) ? M_SUB : next_rows; + elementsize = m_next * sizeof(double); + A = (void *)&bufA[tag][0]; + list = (vec_uint4 *)A + (((M_SUB*M_SUB*sizeof(double)) - (M_SUB*8)) / sizeof(vec_uint4)); + construct_list(list, a_lo, lda, elementsize); + spu_mfcdma64(A, a_hi, (unsigned int)list, 8*k, tag, MFC_GETLB_CMD); + + c_list_next = (vec_uint4 *)&bufB[1][idx*M_SUB]; + construct_list(c_list_next, c_lo, ldc, elementsize); + spu_mfcdma64((vec_double2 *)&bufC[tag][0], c_hi, (unsigned int)c_list_next, 8*n, tag, MFC_GETL_CMD); + + /* Compute a block */ + DMA_WAIT_RECEIVE(); + DMA_WAIT_REQUEST(1<incomplete, tag); +} Index: accel/lib/spu/accel_dgemm_panel.c =================================================================== RCS file: accel/lib/spu/accel_dgemm_panel.c diff -N accel/lib/spu/accel_dgemm_panel.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_dgemm_panel.c 23 Oct 2008 21:20:24 -0000 1.5 @@ -0,0 +1,585 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include "accel_dgemm.h" + +/* transpose_and_swap 
+ * ------------------ + * For a 64x64 matrix m, inplace transpose the matrix and byte swap the contents. + */ + +static void transpose_and_swap(vec_double2 m[]) +{ + int i, j; + vec_double2 *row, *col; +#ifdef ACCEL_LITTLE_ENDIAN + vec_uchar16 pat_even = (vec_uchar16){7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16}; +#else + vec_uchar16 pat_even = (vec_uchar16){0,1,2,3,4,5,6,7, 16,17,18,19,20,21,22,23}; +#endif + vec_uchar16 pat_odd; + vec_double2 r00, r01, r10, r11, r20, r21, r30, r31; + vec_double2 c00, c01, c10, c11, c20, c21, c30, c31; + + pat_odd = spu_or(pat_even, 8); + + /* Perform transpose and swap on 4x4 micro blocks + */ + for (i=0; i<64; i+=4) { + /* Transpose and swap the micro block on the diagonal. For example, consider + * the 16x16 matrix consisting of the following 16 micro blocks. The following + * code transposes the micro block along the diagonal, as marked by the "X". + * + * +---+---+---+---+ + * | X | | | | + * +---+---+---+---+ + * | | X | | | + * +---+---+---+---+ + * | | | X | | + * +---+---+---+---+ + * | | | | X | + * +---+---+---+---+ + */ + r00 = m[0*32+0]; + r01 = m[0*32+1]; + r10 = m[1*32+0]; + r11 = m[1*32+1]; + r20 = m[2*32+0]; + r21 = m[2*32+1]; + r30 = m[3*32+0]; + r31 = m[3*32+1]; + + m[0*32+0] = spu_shuffle(r00, r10, pat_even); + m[0*32+1] = spu_shuffle(r20, r30, pat_even); + m[1*32+0] = spu_shuffle(r00, r10, pat_odd); + m[1*32+1] = spu_shuffle(r20, r30, pat_odd); + m[2*32+0] = spu_shuffle(r01, r11, pat_even); + m[2*32+1] = spu_shuffle(r21, r31, pat_even); + m[3*32+0] = spu_shuffle(r01, r11, pat_odd); + m[3*32+1] = spu_shuffle(r21, r31, pat_odd); + + row = m + 2; + col = m + 4*32; + + for (j=i+4; j<64; j+=4) { + /* Tranpose and swap the micro blocks across the diagonal. For example, consider + * the 16x16 matrix consisting of the following 16 micro blocks. For each row + * of micro blocks, the row blocks to the right of the diagonal are transposed + * and swap with the column blocks below the diagonal. 
In our example, the first + * row, row block A is transposed and swap with column block 'a'. Likewise for + * 'B' and 'b'; and 'C' and 'c'. + * + * +---+---+---+---+ + * | | A | B | C | + * +---+---+---+---+ + * | a | | D | E | + * +---+---+---+---+ + * | b | d | | F | + * +---+---+---+---+ + * | c | e | f | | + * +---+---+---+---+ + */ + r00 = row[0*32+0]; + r01 = row[0*32+1]; + r10 = row[1*32+0]; + r11 = row[1*32+1]; + r20 = row[2*32+0]; + r21 = row[2*32+1]; + r30 = row[3*32+0]; + r31 = row[3*32+1]; + + c00 = col[0*32+0]; + c01 = col[0*32+1]; + c10 = col[1*32+0]; + c11 = col[1*32+1]; + c20 = col[2*32+0]; + c21 = col[2*32+1]; + c30 = col[3*32+0]; + c31 = col[3*32+1]; + + row[0*32+0] = spu_shuffle(c00, c10, pat_even); + row[0*32+1] = spu_shuffle(c20, c30, pat_even); + row[1*32+0] = spu_shuffle(c00, c10, pat_odd); + row[1*32+1] = spu_shuffle(c20, c30, pat_odd); + + col[0*32+0] = spu_shuffle(r00, r10, pat_even); + col[0*32+1] = spu_shuffle(r20, r30, pat_even); + col[1*32+0] = spu_shuffle(r00, r10, pat_odd); + col[1*32+1] = spu_shuffle(r20, r30, pat_odd); + + row[2*32+0] = spu_shuffle(c01, c11, pat_even); + row[2*32+1] = spu_shuffle(c21, c31, pat_even); + row[3*32+0] = spu_shuffle(c01, c11, pat_odd); + row[3*32+1] = spu_shuffle(c21, c31, pat_odd); + + col[2*32+0] = spu_shuffle(r01, r11, pat_even); + col[2*32+1] = spu_shuffle(r21, r31, pat_even); + col[3*32+0] = spu_shuffle(r01, r11, pat_odd); + col[3*32+1] = spu_shuffle(r21, r31, pat_odd); + + row += 2; /* Advance pointer to next row micro block */ + col += 4*32; /* Advance pointer to next column micro block */ + } + + m += 4*32+2; + } +} + +void accel_dgemm_panel(hpl_accel_init_parms_t *parms, + volatile hpl_accel_dgemm_parms_t *cmd_parms) +{ + int i; + int rotate; + unsigned int id; + unsigned int idx, a_idx, c_idx; + unsigned int i1, phase; + unsigned long long a, b, c, p; /* ea pointers */ + unsigned int hi, lo; + unsigned int a_hi, a_lo; + unsigned int b_hi, b_lo; + unsigned int c_hi, c_lo; + unsigned int p_hi, 
p_lo; + unsigned int sub_blocks, sub_blocks_per_spe; + unsigned int start_x, start_sub, end_sub; + unsigned int odd, buf; + unsigned int x_sub, y_sub; + unsigned int w_sub, h_sub; /* width & height in sub_blocks */ + unsigned int lda, ldb, ldp, stride; + unsigned int a_addend, b_addend, c_addend, p_addend; + vec_uint4 vone = (vec_uint4){1, 1, 1, 1}; + vec_uint4 ld, flags, b_blk; + vec_uint4 a_step, b_step, c_stepv, c_steph, p_stepv, p_steph; + vec_uint4 dim; + vec_uint4 h_sub_v, h_sub2_v, y_sub2_v; + vec_uint4 down, corner; + vec_uint4 corner_eq2; + vec_uint4 step_sub = spu_splats(M_SUB*sizeof(double)); + vec_uint4 mask_0101 = (vec_uint4){0,-1,0,-1}; + vec_uint4 list[2][M_SUB/2]; + vec_uchar16 splat_1 = (vec_uchar16)spu_splats((unsigned int)0x04050607); + vec_uchar16 splat_2 = (vec_uchar16)spu_splats((unsigned int)0x08090A0B); + vec_uchar16 shuf_0404 = (vec_uchar16){0,1,2,3, 16,17,18,19, 0,1,2,3, 16,17,18,19}; + vec_uchar16 shuf_0044 = (vec_uchar16){0,1,2,3, 0,1,2,3, 16,17,18,19, 16,17,18,19}; + vec_double2 *c_ptr; + + id = parms->id; + + /* Wait for the transfer of the parameters to complete + */ + DMA_WAIT_RECEIVE(); + + /* Fetch the command parameters + */ + a = cmd_parms->a; + b = cmd_parms->b; + c = cmd_parms->c; + p = cmd_parms->p; + + a_hi = mfc_ea2h(a); + a_lo = mfc_ea2l(a); + + b_hi = mfc_ea2h(b); + b_lo = mfc_ea2l(b); + + c_hi = mfc_ea2h(c); + c_lo = mfc_ea2l(c); + + p_hi = mfc_ea2h(p); + p_lo = mfc_ea2l(p); + + ld = cmd_parms->ld; + + lda = spu_extract(ld, 0); + ldb = spu_extract(ld, 1); + ldp = spu_extract(ld, 3); + + dim = cmd_parms->dim; + + flags = cmd_parms->flags; + + b_blk = spu_maskw(spu_extract(flags, 0)); + + /* Computation of [C] -= [A][B] is performed in a surpetine pattern + * through the various sub-blocks of C. Below is a graphical attempt + * to explain the partitioning and order of the computation. For this + * example, consider the matrix-matrix multiply of a 5x5 (128x128 block) + * result after panel factorization of block 0,0 (bx,by). 
In this case, + * we must compute 128x128 blocks multiplies as follows: + * + * for (x=1; x<5; x++) { + * for (y=1; y<5; y++) { + * C(x,y) -= A(bx,y)*B(x,by); + * } + * } + * + * Assuming this computation is performed by 3 SPEs, the 16 blocks + * are subdivided as: + * + * SPE 0 : C(1,1), C(1,2), C(1,3), C(1,4), C(2,1), C(2,2) + * SPE 1 : C(2,3), C(2,4), C(3,1), C(3,2), C(3,3), C(3,4) + * SPE 2 : C(4,1), C(4,2), C(4,3), C(4,4) + * + * Therefore, SPE 1 will compute the resulting sub-blocks of C in the + * alphabetic order (a thru z) as marked below. + * + * X + * 0 1 2 3 4 + * +---B---+---+---+---+ + * 0 | | U row | + * | | | + * A---C---+---+---+---+ + * 1 | | | |i x| | + * | | | |j w| | + * + L +---+---+---+---+ + * Y 2 | | | |k v| | + * | p | | |l u| | + * + a +---+---+---+---+ + * 3 | n | |a h|m t| | + * | e | |b g|n s| | + * + l +---+---+---+---+ + * 4 | | |c f|o r| | + * | | |d e|p q| | + * +---+---+---+---+---+ + * + * Using 128x128 block partitioning amongst the SPEs results non-optimal + * load balancing of the SPEs. This is shown by the above example in which + * SPEs 0 and 1 compute 24 64x64 multiplies, while SPE 2 only computes + * 16 64x64 multiplies. In addition, the corner turn between sub-blocks + * 'h' and 'i' will incur extra DMAs. + * + * A more computational and transfer efficient load balance would be + * to allocate computation on the 64 sub-blocks. This would allocate + * 22,22,20 sub-block multiplies to each of the SPEs and the corner + * turn becomes efficient. 
The sub-block, computation (alphabetically + * ordered) for SPE 1 becomes: + * + * X + * 0 1 2 3 4 + * +---B---+---+---+---+ + * 0 | | U row | + * | | | + * A---C---+---+---+---+ + * 1 | | | j|k | | + * | | | i|l | | + * + L +---+---+---+---+ + * Y 2 | | | h|m | | + * | p | | g|n | | + * + a +---+---+---+---+ + * 3 | n | | f|o v| | + * | e | | e|p u| | + * + l +---+---+---+---+ + * 4 | | |a d|q t| | + * | | |b c|r s| | + * +---+---+---+---+---+ + * + * This more efficient method is employed in the following code. + */ + + w_sub = spu_extract(dim, 0); + h_sub_v = spu_shuffle(dim, dim, splat_1); + h_sub = spu_extract(h_sub_v, 0); + + h_sub2_v = spu_sl(h_sub_v, 1); + + sub_blocks = w_sub * h_sub; + sub_blocks_per_spe = (sub_blocks + HPL_ACCEL_SPES-1) / HPL_ACCEL_SPES; + + start_sub = ((unsigned short)id) * sub_blocks_per_spe; + end_sub = start_sub + sub_blocks_per_spe; + if (end_sub > sub_blocks) end_sub = sub_blocks; + + sub_blocks = end_sub - start_sub; + + if (LIKELY((int)sub_blocks > 0)) { + /* This SPE has some work to do + */ + DMA_WAIT_REQUEST(-1); + + /* Compute vectors for stepping the effective address matrix pointers. + * The pictograms below show 64x64 blocks within the 128x128 blocks. 
+ * + * A (L panel) B (U panel) C matrix + * ++===+===++ ++---+---++---+---++ ++===+===++===+===++ + * || 1 | 2 || || 1 | 4 || 5 | || || 1 | || | || + * ++---+---++ ++---+---++---+---++ ++---+---++---+---++ + * || 3 | || || 2 | 3 || | || || 2 | || | || + * ++===+===++ ++---+---++---+---++ ++===+===++===+===++ + * || | || || | || | || + * ++---+---++ ++---+---++---+---++ + * || | || || 3 | 4 || | || + * ++===+===++ ++===+===++===+===++ + * + * + * P (output matrix) + * ++===+===++===+===++ + * || 1 | || | || + * ++---+---++---+---++ + * || 2 | || | || + * ++===+===++===+===++ + * || | || | || + * ++---+---++---+---++ + * || 3 | 4 || | || + * ++===+===++===+===++ + * a_step = {1 to 2, 2 to 3, 1 to 2, 2 to 3} + * b_step = {1 to 2, 2 to 3, 3 to 4, 4 to 5} + * c_stepv= {1 to 2, 1 to 2, 1 to 2, 1 to 2} + * c_steph= {3 to 4, 3 to 4, 3 to 4, 3 to 4} + * p_stepv= {1 to 2, 1 to 2, 1 to 2, 1 to 2} + * p_steph= {3 to 4, 3 to 4, 3 to 4, 3 to 4} + */ + + a_step = spu_promote(lda * M_SUB, 0); + a_step = spu_shuffle(a_step, spu_sub(step_sub, a_step), shuf_0404); + + c_stepv = spu_splats(M_SUB*M_SUB*sizeof(double)); + c_steph = spu_shuffle(ld, ld, splat_2); + + p_stepv = step_sub; + p_steph = spu_promote(ldp * M_SUB, 0); + + b_step = spu_sel(spu_promote(ldb * M_SUB, 0), c_stepv, b_blk); + b_step = spu_shuffle(b_step, spu_sub(0, b_step), shuf_0044); + b_step = spu_sel(b_step, spu_sel(step_sub, spu_shuffle(ld, ld, splat_1), b_blk), mask_0101); + + ldb = spu_extract(spu_sel(spu_promote(ldb, 0), step_sub, b_blk), 0); + + /* Determine the following: + * 1) Starting sub-block - x_sub, y_sub + * 2) Number of sub-block multiplies before a corner turn - corner. 
+ */ + x_sub = start_sub / h_sub; + y_sub = start_sub - h_sub * x_sub; + + start_x = x_sub / SUB; + y_sub = start_sub - h_sub*SUB*start_x; + + /* rotate = 4; + * + * if (x_sub & 1) { + * y_sub = h_sub - 1 - y_sub; + * a_step = spu_sub(0, a_step); + * c_stepv = spu_sub(0, c_stepv); + * p_stepv = spu_sub(0, p_stepv); + * rotate = -rotate; + * corner = 2*y_sub + 2 + * } else { + * corner = 2 * (h_sub-y_sub) + * } + */ + odd = x_sub & 1; + + down = spu_cmpeq(spu_splats(odd), 0); + + y_sub = spu_extract(spu_sel(spu_sub(h_sub2_v, spu_promote(y_sub + 1, 0)), + spu_promote(y_sub, 0), + down), 0); + + y_sub2_v = spu_splats(2*y_sub); + + corner = spu_sel(spu_add(y_sub2_v, 2), spu_sub(h_sub2_v, y_sub2_v), down); + + /* Compute the initial EA buffer pointers. + */ + a_addend = y_sub * spu_extract(step_sub, 0) + spu_extract(spu_andc(a_step, down), 0); + b_addend = spu_extract(spu_andc(b_step, down), 0); + c_addend = y_sub * spu_extract(c_stepv, 0); + p_addend = y_sub * spu_extract(p_stepv, 0) + x_sub * spu_extract(p_steph, 0); + + a_lo += a_addend; + p_lo += p_addend; + MATRIX_EA_UADD32(b_hi, b_lo, b_addend); + MATRIX_EA_UMADD32(b_hi, b_lo, x_sub, spu_extract(b_step, 1)); + MATRIX_EA_UADD32(c_hi, c_lo, c_addend); + MATRIX_EA_UMADD32(c_hi, c_lo, x_sub, spu_extract(c_steph, 0)); + + /* Adjust the pointer steps according to the initial direction. + */ + a_step = spu_sel(spu_sub(0, a_step), a_step, down); + b_step = spu_rlqwbyte(b_step, 8 & ~spu_extract(down, 0)); + c_stepv = spu_sel(spu_sub(0, c_stepv), c_stepv, down); + p_stepv = spu_sel(spu_sub(0, p_stepv), p_stepv, down); + rotate = ((-4) ^ spu_extract(down, 0)) - spu_extract(down, 0); + + /* Before starting, make sure all previous DMA transfers are completed so + * that all the LS buffers are known to be available. + */ + DMA_WAIT_RECEIVE(); + + /* Download 3 blocks to get the process started. After that, each + * 64x64 block multiple requires 2 block transfers. 
+ */ + dma_block_getl(&bufA[0][0], a_hi, a_lo, 0, lda); + + dma_block_getl(&bufB[0][0], b_hi, b_lo, 0, ldb); + + dma_block(&bufC[0][0], c_hi, c_lo, 0, MFC_GET_CMD); + + DMA_WAIT_REQUEST(1<<0); + + a_lo += spu_extract(a_step, 0); + MATRIX_EA_ADD32(b_hi, b_lo, spu_extract(b_step, 0)); + + dma_block_getl(&bufA[1][0], a_hi, a_lo, 1, lda); + + dma_block_getl(&bufB[1][0], b_hi, b_lo, 1, ldb); + + phase = 0; + + i1 = 0; + a_idx = 0; + + for (i=0; i<(int)sub_blocks-1; i++) { + /* First block computation + */ + DMA_WAIT_RECEIVE(); + DMA_WAIT_REQUEST((1<<1)|(1<<2)); + + mm_dp_64Cx64(&bufC[i1][0], &bufA[a_idx][0], &bufB[0][0]); + + a_step = spu_rlqwbyte(a_step, rotate); + + c_idx = i1 ^ 1; + + corner_eq2 = spu_cmpeq(corner, 2); + + /* if (corner == 2) { + * rotate = -rotate; + * a_step = 0-a_step; + * } else { + * a_lo += a_step; + * } + */ + rotate = (rotate ^ spu_extract(corner_eq2, 0)) - spu_extract(corner_eq2, 0); + a_lo += spu_extract(spu_andc(a_step, corner_eq2), 0); + a_step = spu_sel(a_step, spu_sub(0, a_step), corner_eq2); + + /* if corner != 2 then fetch next A buffer + * else "corner turn" fetch next B buffer + */ + b_step = spu_rlqwbyte(b_step, 4 & spu_extract(corner_eq2, 0)); + MATRIX_EA_ADD32(b_hi, b_lo, spu_extract(spu_and(b_step, corner_eq2), 0)); + + idx = spu_extract(spu_andc(spu_promote(a_idx, 0), corner_eq2), 0); + buf = spu_extract(spu_sel(spu_promote((unsigned int)bufA, 0), + spu_promote((unsigned int)bufB, 0), + corner_eq2), 0); + hi = spu_extract(spu_sel(spu_promote(a_hi, 0), + spu_promote(b_hi, 0), + corner_eq2), 0); + lo = spu_extract(spu_sel(spu_promote(a_lo, 0), + spu_promote(b_lo, 0), + corner_eq2), 0); + stride = spu_extract(spu_sel(spu_promote(lda, 0), + spu_promote(ldb, 0), + corner_eq2), 0); + + buf += idx * (unsigned int)(sizeof(bufA)/2); + +#ifdef __GNUC__ + /* The following lnop was added to keep gcc from unscheduling the + * series of add,stqd instruction pairs used to build the DMA list in + * dma_block_getl. 
+ */ + si_lnop(); +#endif + + dma_block_getl((vec_double2 *)buf, hi, lo, 0, stride); + + /* if (corner == 2) { + * c_lo += c_steph; + * c_stepv = -c_stepv; + * } else { + * c_lo += c_stepv; + * } + */ + c_addend = spu_extract(spu_sel(c_stepv, c_steph, corner_eq2), 0); + MATRIX_EA_ADD32(c_hi, c_lo, c_addend); + c_stepv = spu_sel(c_stepv, spu_sub(0, c_stepv), corner_eq2); + + /* Before getting another C buffer, we must wait for the previous + * one to be stored. + */ + DMA_WAIT_RECEIVE(); + dma_block(&bufC[c_idx][0], c_hi, c_lo, 0, MFC_GET_CMD); + + DMA_WAIT_REQUEST(1<<0); + + a_idx = phase^1; + + /* Second block computation + */ + c_ptr = &bufC[i1][0]; + + mm_dp_64Cx64(c_ptr, &bufA[a_idx][0], &bufB[1][0]); + + a_step = spu_rlqwbyte(a_step, rotate); + a_lo += spu_extract(a_step, 0); + + /* if corner != 2 then fetch next A buffer + * else "corner turn" fetch next B buffer + */ + b_step = spu_rlqwbyte(b_step, 4 & spu_extract(corner_eq2, 0)); + MATRIX_EA_ADD32(b_hi, b_lo, spu_extract(spu_and(b_step, corner_eq2), 0)); + + idx = spu_extract(spu_sel(spu_promote(a_idx, 0), vone, corner_eq2), 0); + buf = spu_extract(spu_sel(spu_promote((unsigned int)bufA, 0), + spu_promote((unsigned int)bufB, 0), + corner_eq2), 0); + hi = spu_extract(spu_sel(spu_promote(a_hi, 0), + spu_promote(b_hi, 0), + corner_eq2), 0); + lo = spu_extract(spu_sel(spu_promote(a_lo, 0), + spu_promote(b_lo, 0), + corner_eq2), 0); + stride = spu_extract(spu_sel(spu_promote(lda, 0), + spu_promote(ldb, 0), + corner_eq2), 0); + + buf += idx * (unsigned int)(sizeof(bufA)/2); + dma_block_getl((vec_double2 *)buf, hi, lo, 1, stride); + + /* Transpose and swap the resulting block + * + * if (corner == 2) { + * p_lo += p_steph; + * p_stepv = -p_stepv; + * } else { + * p_lo += p_stepv; + * } + */ + transpose_and_swap(&bufC[i1][0]); + + dma_block_putl(&list[i1][0], c_ptr, p_hi, p_lo, 2, ldp); + + p_lo += spu_extract(spu_sel(p_stepv, p_steph, corner_eq2), 0); + p_stepv = spu_sel(p_stepv, spu_sub(0, p_stepv), corner_eq2); 
+ + corner = spu_sel(spu_add(corner, -2), h_sub2_v, corner_eq2); + phase ^= spu_extract(corner_eq2, 0) & 1; + + i1 ^= 1; + a_idx = phase; + } + + /* Finish the last sub-block */ + DMA_WAIT_RECEIVE(); + DMA_WAIT_REQUEST((1<<1)|(1<<2)); + + mm_dp_64Cx64(&bufC[i1][0], &bufA[a_idx][0], &bufB[0][0]); + + DMA_WAIT_RECEIVE(); + + mm_dp_64Cx64(&bufC[i1][0], &bufA[a_idx^1][0], &bufB[1][0]); + + /* Transpose and swap the resulting block + */ + transpose_and_swap(&bufC[i1][0]); + dma_block_putl(&list[i1][0], &bufC[i1][0], p_hi, p_lo, 1, ldp); + } + + /* Report completion status if requested. + */ + report_completion(id, cmd_parms->incomplete, 1); +} Index: accel/lib/spu/accel_dtrsm.c =================================================================== RCS file: accel/lib/spu/accel_dtrsm.c diff -N accel/lib/spu/accel_dtrsm.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_dtrsm.c 20 Aug 2008 03:57:53 -0000 1.4 @@ -0,0 +1,154 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel_spu.h" +#include "accel_buffers.h" +#include "accel_utils.h" +#include "accel_dtrsm.h" + + +void accel_dtrsm(hpl_accel_init_parms_t *parms, + volatile hpl_accel_dtrsm_parms_t *cmd_parms) +{ + int i; + unsigned int idx, tag, next_tag; + unsigned int size, lda, stride; + unsigned int id; + unsigned long long a, b; + unsigned int a_hi, a_lo; + unsigned int b_hi, b_lo; + unsigned int list; + unsigned int n; + vec_uint4 ld; + vec_uint4 element, stride2, stride4, stride6; + volatile void *lsa; + + id = parms->id; + + /* Wait for the transfer of the parameters to complete + */ + DMA_WAIT_RECEIVE(); + DMA_WAIT_REQUEST(-1); + + /* Fetch the parameters + */ + a = cmd_parms->a; + b = cmd_parms->b; + ld = cmd_parms->ld; + + lda = spu_extract(ld, 0); + + /* DMA the entire 128x128 unit lower 
triangle into the LS. To reduce startup + * time, we will download only the necessary data columns in groups of 16 + * while preserving the cacheline alignment. The download will be done + * starting from the smallest column to the largest. + */ + a_hi = mfc_ea2h(a); + a_lo = mfc_ea2l(a); + + lsa = (volatile void *)(&bufA_128x128[0]); + size = 128*sizeof(double); + + /* Before starting, make sure all previous DMA transfers are completed so + * that all the LS buffers are known to be available. + */ + DMA_WAIT_RECEIVE(); + + for (i=0; i<127; i++) { + unsigned int adjust; + + spu_mfcdma64(lsa, a_hi, a_lo, size, 0, MFC_GET_CMD); + + a_lo += lda; + lsa += 128*sizeof(double); + + /* Compute the next DMA parameters + */ + adjust = spu_extract(spu_and(spu_cmpeq(spu_promote((i & 15), 0), 14), 16*sizeof(double)), 0); + + a_lo += adjust; + lsa += adjust; + size -= adjust; + } + + n = spu_extract(cmd_parms->dim, 0) / 16; + b_hi = mfc_ea2h(b); + b_lo = mfc_ea2l(b); + + b_lo += 16 * sizeof(double) * id; + + /* Download the initial set of 16 B columns + */ + stride = spu_extract(ld, 1); + + element = spu_add(spu_shuffle(spu_splats((unsigned int)(16*sizeof(double))), + spu_promote(b_lo, 0), + ((vec_uchar16){0,1,2,3, 16,17,18,19, 0,1,2,3, 16,17,18,19})), + spu_rlmaskqwbyte(spu_promote(stride, 0), -12)); + + stride2 = spu_sl(spu_shuffle(ld, ld, ((vec_uchar16){128,128,128,128, 4,5,6,7, 128,128,128,128, 4,5,6,7})), 1); + stride4 = spu_add(stride2, stride2); + stride6 = spu_add(stride2, stride4); + + fill_dma_list(&bufB_list[0][0], element, stride2, stride4, stride6); + spu_mfcdma64(&bufB_128x16[0][0], b_hi, (unsigned int)(&bufB_list[0][0]), 128*8, 0, MFC_GETL_CMD); + + + idx = 1; + next_tag = 0; + tag = 1; + + DMA_WAIT_REQUEST(1<<0); + + for (i=id+HPL_ACCEL_SPES; i<(int)n; i+=HPL_ACCEL_SPES) { + /* Fetch the next buffer + */ + element = spu_add(element, ((vec_uint4){0, HPL_ACCEL_SPES*16*sizeof(double), 0, HPL_ACCEL_SPES*16*sizeof(double)})); + + fill_dma_list(&bufB_list[idx][0], 
element, stride2, stride4, stride6); + spu_mfcdma64(&bufB_128x16[tag][0], b_hi, (unsigned int)(&bufB_list[idx][0]), 128*8, tag, MFC_GETLB_CMD); + tag ^= 1; + + /* Wait for the previous get to complete */ + DMA_WAIT_RECEIVE(); + + /* Perform the dtrsm. + */ + dtrsm_dp_128Cx16(&bufA[0][0], &bufB_128x16[tag][0]); + + idx = (idx + 1) & 3; + + list = (unsigned int)&bufB_list[idx^2][0]; + spu_mfcdma64(&bufB_128x16[tag][0], b_hi, list, 128*8, tag, MFC_PUTL_CMD); + + next_tag = tag ^ 1; + + DMA_WAIT_REQUEST(1<incomplete, tag); +} + Index: accel/lib/spu/accel_dtrsm.h =================================================================== RCS file: accel/lib/spu/accel_dtrsm.h diff -N accel/lib/spu/accel_dtrsm.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_dtrsm.h 20 Aug 2008 03:57:53 -0000 1.2 @@ -0,0 +1,83 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#ifndef _ACCEL_DTRSM_H_ +#define _ACCEL_DTRSM_H_ 1 + + +extern void dtrsm_dp_128Cx16(vec_double2 *bufA, vec_double2 *bufB); + /* fill_dma_list - write 64 consecutive vec_uint4 entries, list[0..63]. + * + * For k = 0..20: list[3k] = e0 + k*stride6, + * list[3k+1] = e0 + stride2 + k*stride6, + * list[3k+2] = e0 + stride4 + k*stride6, + * and finally list[63] = e0 + 21*stride6. + * When stride4 = 2*stride2 and stride6 = 3*stride2 (what the callers visible in + * this patch pass) this reduces to list[i] = e0 + i*stride2. + * + * Those callers issue the result as a 128*8-byte MFC list, so each quadword + * carries two 8-byte list elements, and they keep the size words of the + * strides zero so that only the address words advance. That layout is a + * caller convention; nothing in this function checks it. + * + * NOTE(review): the three rotating accumulators (e0, e1, e2) and the add/store + * interleaving look hand-scheduled for the SPU pipelines - keep the statement + * order unless that is confirmed not to matter. */ +static inline void fill_dma_list(volatile vec_uint4 *list, vec_uint4 e0, vec_uint4 stride2, vec_uint4 stride4, vec_uint4 stride6) +{ + vec_uint4 e1, e2; + + e1 = spu_add(e0, stride2); + e2 = spu_add(e0, stride4); list[0] = e0; + e0 = spu_add(e0, stride6); list[1] = e1; + e1 = spu_add(e1, stride6); list[2] = e2; + e2 = spu_add(e2, stride6); list[3] = e0; + e0 = spu_add(e0, stride6); list[4] = e1; + e1 = spu_add(e1, stride6); list[5] = e2; + e2 = spu_add(e2, stride6); list[6] = e0; + e0 = spu_add(e0, stride6); list[7] = e1; + e1 = spu_add(e1, stride6); list[8] = e2; + e2 = spu_add(e2, stride6); list[9] = e0; + e0 = spu_add(e0, stride6); list[10] = e1; + e1 = spu_add(e1, stride6); list[11] = e2; + e2 = spu_add(e2, stride6); list[12] = e0; + e0 = spu_add(e0, stride6); list[13] = e1; + e1 = spu_add(e1, stride6); list[14] = e2; + e2 = 
spu_add(e2, stride6); list[15] = e0; + e0 = spu_add(e0, stride6); list[16] = e1; + e1 = spu_add(e1, stride6); list[17] = e2; + e2 = spu_add(e2, stride6); list[18] = e0; + e0 = spu_add(e0, stride6); list[19] = e1; + e1 = spu_add(e1, stride6); list[20] = e2; + e2 = spu_add(e2, stride6); list[21] = e0; + e0 = spu_add(e0, stride6); list[22] = e1; + e1 = spu_add(e1, stride6); list[23] = e2; + e2 = spu_add(e2, stride6); list[24] = e0; + e0 = spu_add(e0, stride6); list[25] = e1; + e1 = spu_add(e1, stride6); list[26] = e2; + e2 = spu_add(e2, stride6); list[27] = e0; + e0 = spu_add(e0, stride6); list[28] = e1; + e1 = spu_add(e1, stride6); list[29] = e2; + e2 = spu_add(e2, stride6); list[30] = e0; + e0 = spu_add(e0, stride6); list[31] = e1; + e1 = spu_add(e1, stride6); list[32] = e2; + e2 = spu_add(e2, stride6); list[33] = e0; + e0 = spu_add(e0, stride6); list[34] = e1; + e1 = spu_add(e1, stride6); list[35] = e2; + e2 = spu_add(e2, stride6); list[36] = e0; + e0 = spu_add(e0, stride6); list[37] = e1; + e1 = spu_add(e1, stride6); list[38] = e2; + e2 = spu_add(e2, stride6); list[39] = e0; + e0 = spu_add(e0, stride6); list[40] = e1; + e1 = spu_add(e1, stride6); list[41] = e2; + e2 = spu_add(e2, stride6); list[42] = e0; + e0 = spu_add(e0, stride6); list[43] = e1; + e1 = spu_add(e1, stride6); list[44] = e2; + e2 = spu_add(e2, stride6); list[45] = e0; + e0 = spu_add(e0, stride6); list[46] = e1; + e1 = spu_add(e1, stride6); list[47] = e2; + e2 = spu_add(e2, stride6); list[48] = e0; + e0 = spu_add(e0, stride6); list[49] = e1; + e1 = spu_add(e1, stride6); list[50] = e2; + e2 = spu_add(e2, stride6); list[51] = e0; + e0 = spu_add(e0, stride6); list[52] = e1; + e1 = spu_add(e1, stride6); list[53] = e2; + e2 = spu_add(e2, stride6); list[54] = e0; + e0 = spu_add(e0, stride6); list[55] = e1; + e1 = spu_add(e1, stride6); list[56] = e2; + e2 = spu_add(e2, stride6); list[57] = e0; + e0 = spu_add(e0, stride6); list[58] = e1; + e1 = spu_add(e1, stride6); list[59] = e2; + e2 = spu_add(e2, 
stride6); list[60] = e0; + e0 = spu_add(e0, stride6); list[61] = e1; + list[62] = e2; + list[63] = e0; +} + +#endif /* _ACCEL_DTRSM_H_ */ Index: accel/lib/spu/accel_dtrsm_CL_B.c =================================================================== RCS file: accel/lib/spu/accel_dtrsm_CL_B.c diff -N accel/lib/spu/accel_dtrsm_CL_B.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_dtrsm_CL_B.c 22 Oct 2008 03:28:08 -0000 1.4 @@ -0,0 +1,249 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel_spu.h" +#include "accel_buffers.h" +#include "accel_utils.h" +#include "accel_dtrsm.h" + + +void accel_dtrsm_CL_B(hpl_accel_init_parms_t *parms, + volatile hpl_accel_dtrsm_parms_t *cmd_parms) +{ + int i; + unsigned int idx, tag, next_tag; + unsigned int size, lda, stride; + unsigned int id; + unsigned long long a, b; + unsigned int a_hi, a_lo; + unsigned int b_hi, b_lo; + unsigned int list; + unsigned int n; + unsigned int span; + vec_uint4 ld; + vec_uint4 element, stride2, stride4, stride6, next; + volatile void *lsa; +#ifdef MATRIX_4GB_CROSSING + unsigned int list_size, hi; + vec_uint4 sizes[4]; + vec_uint4 b_his[4]; +#endif +#if (HPL_ACCEL_SPES & 3) != 0 + unsigned int stride0, stride1; + vec_uint4 blk_idx, next0, next1; +#endif + + id = parms->id; + + stride2 = ((vec_uint4){0, 2*64*sizeof(double), 0, 2*64*sizeof(double)}); + stride4 = ((vec_uint4){0, 4*64*sizeof(double), 0, 4*64*sizeof(double)}); + stride6 = ((vec_uint4){0, 6*64*sizeof(double), 0, 6*64*sizeof(double)}); + + /* Wait for the transfer of the parameters to complete + */ + DMA_WAIT_RECEIVE(); + DMA_WAIT_REQUEST(-1); + + /* Fetch the parameters + */ + a = cmd_parms->a; + b = cmd_parms->b; + ld = cmd_parms->ld; + + lda = spu_extract(ld, 0); + + /* DMA the entire 128x128 unit lower triangle into the 
LS. To reduce startup + * time, we will download only the necessary data columns in groups of 16 + * while preserving the cacheline alignment. The download will be done + * starting from the smallest column to the largest. + */ + a_hi = mfc_ea2h(a); + a_lo = mfc_ea2l(a); + + lsa = (volatile void *)(&bufA_128x128[0]); + size = 128*sizeof(double); + + /* Before starting, make sure all previous DMA transfers are completed so + * that all the LS buffers are known to be available. + */ + DMA_WAIT_RECEIVE(); + + for (i=0; i<127; i++) { + unsigned int adjust; + + spu_mfcdma64(lsa, a_hi, a_lo, size, 0, MFC_GET_CMD); + + a_lo += lda; + lsa += 128*sizeof(double); + + /* Compute the next DMA parameters + */ + adjust = spu_extract(spu_and(spu_cmpeq(spu_promote((i & 15), 0), 14), 16*sizeof(double)), 0); + + a_lo += adjust; + lsa += adjust; + size -= adjust; + } + + n = spu_extract(cmd_parms->dim, 0) / 16; + b_hi = mfc_ea2h(b); + b_lo = mfc_ea2l(b); + + /* Download the initial set of 16 B columns + */ + span = spu_extract(cmd_parms->blk_col, 0) + id; + stride = spu_extract(ld, 1); + b_lo += (span & 3) * 16 * sizeof(double); + MATRIX_EA_UMADD32(b_hi, b_lo, (span/4), stride); + element = spu_add(spu_shuffle(spu_splats((unsigned int)(16*sizeof(double))), spu_promote(b_lo, 0), + ((vec_uchar16){0,1,2,3, 16,17,18,19, 0,1,2,3, 16,17,18,19})), + ((vec_uint4){0, 0, 0, 64*sizeof(double)})); + + +#if (HPL_ACCEL_SPES & 3) != 0 + blk_idx = spu_splats(span & 3); + + stride0 = stride * (HPL_ACCEL_SPES / 4); + stride1 = stride * (1 + HPL_ACCEL_SPES / 4); + stride0 += ( HPL_ACCEL_SPES & 3)*16*(int)sizeof(double); + stride1 -= (-HPL_ACCEL_SPES & 3)*16*(int)sizeof(double); + + next0 = spu_shuffle(spu_promote(stride0, 0), spu_promote(stride0, 0), + ((vec_uchar16){128,128,128,128, 0,1,2,3, 128,128,128,128, 0,1,2,3})); + next1 = spu_shuffle(spu_promote(stride1, 0), spu_promote(stride1, 0), + ((vec_uchar16){128,128,128,128, 0,1,2,3, 128,128,128,128, 0,1,2,3})); +#else + stride *= HPL_ACCEL_SPES / 4; + 
next = spu_shuffle(spu_promote(stride, 0), + spu_promote(stride, 0), + ((vec_uchar16){128,128,128,128, 0,1,2,3, 128,128,128,128, 0,1,2,3})); +#endif + + + list = (unsigned int)&bufB_list[0][0]; + fill_dma_list((volatile vec_uint4 *)list, element, stride2, stride4, stride6); + +#if (HPL_ACCEL_SPES & 3) != 0 + blk_idx = spu_add(blk_idx, (HPL_ACCEL_SPES & 3)); + next = spu_sel(next0, next1, spu_cmpgt(blk_idx, 3)); + blk_idx = spu_and(blk_idx, 3); +#endif + +#ifdef MATRIX_4GB_CROSSING + /* The list 4GB crossing can only occur at block boundary. Therefore, halfway through + * the list. + */ + list_size = (spu_extract(element, 1) > (0xFFFFFFFF - M_SUB*M_SUB*8)) ? (M_SUB*8) : (M*8); + + spu_mfcdma64(&bufB_128x16[0][0], b_hi, list, list_size, 0, MFC_GETL_CMD); + spu_mfcdma64(&bufB_128x16[0][list_size], b_hi+1, list+(M_SUB*8), M*8-list_size, 0, MFC_GETL_CMD); + + sizes[0] = spu_promote(list_size, 0); + b_his[0] = spu_promote(b_hi, 0); + + b_hi += spu_extract(spu_genc(element, next), 1); +#else + spu_mfcdma64(&bufB_128x16[0][0], b_hi, list, 128*8, 0, MFC_GETL_CMD); +#endif + element = spu_add(element, next); + + idx = 1; + next_tag = 0; + tag = 1; + + DMA_WAIT_REQUEST(1<<0); + + for (i=id+HPL_ACCEL_SPES; i<(int)n; i+=HPL_ACCEL_SPES) { + /* Fetch the next buffer + */ + list = (unsigned int)&bufB_list[idx][0]; + fill_dma_list((volatile vec_uint4 *)list, element, stride2, stride4, stride6); + +#if (HPL_ACCEL_SPES & 3) != 0 + blk_idx = spu_add(blk_idx, (HPL_ACCEL_SPES & 3)); + next = spu_sel(next0, next1, spu_cmpgt(blk_idx, 3)); + blk_idx = spu_and(blk_idx, 3); +#endif + +#ifdef MATRIX_4GB_CROSSING + /* The list 4GB crossing can only occur at block boundary. Therefore, halfway through + * the list. + */ + list_size = (spu_extract(element, 1) > (0xFFFFFFFF - M_SUB*M_SUB*8)) ? 
(M_SUB*8) : (M*8); + + spu_mfcdma64(&bufB_128x16[tag][0], b_hi, list, list_size, tag, MFC_GETLB_CMD); + spu_mfcdma64(&bufB_128x16[tag][list_size], b_hi+1, list+(M_SUB*8), M*8-list_size, tag, MFC_GETL_CMD); + + sizes[idx] = spu_promote(list_size, 0); + b_his[idx] = spu_promote(b_hi, 0); + + b_hi += spu_extract(spu_genc(element, next), 1); +#else + spu_mfcdma64(&bufB_128x16[tag][0], b_hi, list, 128*8, tag, MFC_GETLB_CMD); +#endif + element = spu_add(element, next); + + tag ^= 1; + + /* Wait for the previous get to complete */ + DMA_WAIT_RECEIVE(); + + /* Perform the dtrsm. + */ + dtrsm_dp_128Cx16(&bufA[0][0], &bufB_128x16[tag][0]); + + idx = (idx + 1) & 3; + + /* Store the update matrix columns back to memory + */ + list = (unsigned int)&bufB_list[idx^2][0]; +#ifdef MATRIX_4GB_CROSSING + list_size = spu_extract(sizes[idx^2], 0); + + hi = spu_extract(b_his[idx^2], 0); + + spu_mfcdma64(&bufB_128x16[tag][0], hi, list, list_size, tag, MFC_PUTL_CMD); + spu_mfcdma64(&bufB_128x16[tag][list_size], hi+1, list+(M_SUB*8), M*8-list_size, tag, MFC_PUTL_CMD); +#else + spu_mfcdma64(&bufB_128x16[tag][0], b_hi, list, 128*8, tag, MFC_PUTL_CMD); +#endif + + next_tag = tag ^ 1; + + DMA_WAIT_REQUEST(1<incomplete, tag); +} + Index: accel/lib/spu/accel_dtrsm_dp_128Cx16.S =================================================================== RCS file: accel/lib/spu/accel_dtrsm_dp_128Cx16.S diff -N accel/lib/spu/accel_dtrsm_dp_128Cx16.S --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_dtrsm_dp_128Cx16.S 23 Oct 2008 21:20:24 -0000 1.3 @@ -0,0 +1,2270 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +/* + * SYNOPSIS: + * void dtrsm_dp_128Cx16(vec_double2 *bufA, vec_double2 *bufB) + * + * DESCRIPTION: + * This file contains a specialized DTRSM function that solves + * the matrix equation for [x]. 
+ * + * [a]*[x] = [b] + * + * where: + * [a] is a unit lower, column ordered, double precision, little endian triangle + * matrix of 128 rows by 128 columns. + * [b] is a row ordered, double precision, matrix of 128 rows and 16 columns. + * The solution [x] is returned in [b]. + * + * This implementation is a highly optimized solution that mimics the following + * scalar design that processes 4 rows of b at a time: + * + * for (i=0; i<128; i+=4) { # iloop + * for (x=0; x +#include +#include +#include +#include "hpl_accel_spu.h" +#include "accel_buffers.h" +#include "accel_utils.h" +#include "accel_dtrsm.h" + +void accel_dtrsm_panel(hpl_accel_init_parms_t *parms, + volatile hpl_accel_dtrsm_parms_t *cmd_parms) +{ + int i; + unsigned int idx, tag, next_tag; + unsigned int size, lda, stride; + unsigned int id; + unsigned long long a, b, c; + unsigned int a_hi, a_lo; + unsigned int b_hi, b_lo; + unsigned int c_hi, c_lo; + unsigned int list; + unsigned int n; + unsigned int span; + vec_uint4 ld; + vec_uint4 element, stride2, stride4, stride6; + vec_uint4 elementc, nextc, stride2c, stride4c, stride6c; + volatile void *lsa; +#ifdef MATRIX_4GB_CROSSING + unsigned int list_size; +#endif +#if (HPL_ACCEL_SPES & 3) != 0 + unsigned int stride0c, stride1c; + vec_uint4 blk_idx, next0c, next1c; +#endif + + id = parms->id; + + stride2c = ((vec_uint4){0, 2*64*sizeof(double), 0, 2*64*sizeof(double)}); + stride4c = ((vec_uint4){0, 4*64*sizeof(double), 0, 4*64*sizeof(double)}); + stride6c = ((vec_uint4){0, 6*64*sizeof(double), 0, 6*64*sizeof(double)}); + + elementc = (vec_uint4){0}; /* included just to eliminate a warning */ + nextc = (vec_uint4){0}; /* included just to eliminate a warning */ + + /* Wait for the transfer of the parameters to complete + */ + DMA_WAIT_RECEIVE(); + DMA_WAIT_REQUEST(-1); + + /* Fetch the parameters + */ + a = cmd_parms->a; + b = cmd_parms->b; + c = cmd_parms->c; + ld = cmd_parms->ld; + + lda = spu_extract(ld, 0); + + /* DMA the entire 128x128 unit lower 
triangle into the LS. To reduce startup + * time, we will download only the necessary data columns in groups of 16 + * while preserving the cacheline alignment. The download will be done + * starting from the smallest column to the largest. + */ + a_hi = mfc_ea2h(a); + a_lo = mfc_ea2l(a); + + lsa = (volatile void *)(&bufA_128x128[0]); + size = 128*sizeof(double); + + /* Before starting, make sure all previous DMA transfers are completed so + * that all the LS buffers are known to be available. + */ + DMA_WAIT_RECEIVE(); + + for (i=0; i<127; i++) { + unsigned int adjust; + + spu_mfcdma64(lsa, a_hi, a_lo, size, 0, MFC_GET_CMD); + + a_lo += lda; + lsa += 128*sizeof(double); + + /* Compute the next DMA parameters + */ + adjust = spu_extract(spu_and(spu_cmpeq(spu_promote((i & 15), 0), 14), 16*sizeof(double)), 0); + + a_lo += adjust; + lsa += adjust; + size -= adjust; + } + + n = spu_extract(cmd_parms->dim, 0) / 16; + b_hi = mfc_ea2h(b); + b_lo = mfc_ea2l(b); + + b_lo += 16 * sizeof(double) * id; + + /* Download the initial set of 16 B columns + */ + stride = spu_extract(ld, 1); + + element = spu_add(spu_shuffle(spu_splats((unsigned int)(16*sizeof(double))), + spu_promote(b_lo, 0), + ((vec_uchar16){0,1,2,3, 16,17,18,19, 0,1,2,3, 16,17,18,19})), + spu_rlmaskqwbyte(spu_promote(stride, 0), -12)); + + stride2 = spu_sl(spu_shuffle(ld, ld, ((vec_uchar16){128,128,128,128, 4,5,6,7, 128,128,128,128, 4,5,6,7})), 1); + stride4 = spu_add(stride2, stride2); + stride6 = spu_add(stride2, stride4); + + fill_dma_list(&bufB_list[0][0], element, stride2, stride4, stride6); + spu_mfcdma64(&bufB_128x16[0][0], b_hi, (unsigned int)(&bufB_list[0][0]), 128*8, 0, MFC_GETL_CMD); + + + c_hi = mfc_ea2h(c); + c_lo = mfc_ea2l(c); + + span = spu_extract(cmd_parms->blk_col, 0) + id; + stride = spu_extract(ld, 2); + c_lo += (span & 3) * 16 * sizeof(double); + MATRIX_EA_UMADD32(c_hi, c_lo, (span/4), stride); + elementc = spu_add(spu_shuffle(element, spu_promote(c_lo, 0), + ((vec_uchar16){0,1,2,3, 
16,17,18,19, 0,1,2,3, 16,17,18,19})), + ((vec_uint4){0, 0, 0, 64*sizeof(double)})); + +#if (HPL_ACCEL_SPES & 3) != 0 + blk_idx = spu_splats(span & 3); + + stride0c = stride * (HPL_ACCEL_SPES / 4); + stride1c = stride * (1 + HPL_ACCEL_SPES / 4); + stride0c += ( HPL_ACCEL_SPES & 3)*16*(int)sizeof(double); + stride1c -= (-HPL_ACCEL_SPES & 3)*16*(int)sizeof(double); + + next0c = spu_shuffle(spu_promote(stride0c, 0), spu_promote(stride0c, 0), + ((vec_uchar16){128,128,128,128, 0,1,2,3, 128,128,128,128, 0,1,2,3})); + next1c = spu_shuffle(spu_promote(stride1c, 0), spu_promote(stride1c, 0), + ((vec_uchar16){128,128,128,128, 0,1,2,3, 128,128,128,128, 0,1,2,3})); +#else + stride *= HPL_ACCEL_SPES / 4; + nextc = spu_shuffle(spu_promote(stride, 0), + spu_promote(stride, 0), + ((vec_uchar16){128,128,128,128, 0,1,2,3, 128,128,128,128, 0,1,2,3})); +#endif + + idx = 1; + next_tag = 0; + tag = 1; + + DMA_WAIT_REQUEST(1<<0); + + for (i=id+HPL_ACCEL_SPES; i<(int)n; i+=HPL_ACCEL_SPES) { + /* Fetch the next buffer + */ + element = spu_add(element, ((vec_uint4){0, HPL_ACCEL_SPES*16*sizeof(double), 0, HPL_ACCEL_SPES*16*sizeof(double)})); + + fill_dma_list(&bufB_list[idx][0], element, stride2, stride4, stride6); + spu_mfcdma64(&bufB_128x16[tag][0], b_hi, (unsigned int)(&bufB_list[idx][0]), 128*8, tag, MFC_GETLB_CMD); + tag ^= 1; + + /* Wait for the previous get to complete */ + DMA_WAIT_RECEIVE(); + + /* Perform the dtrsm. + */ + dtrsm_dp_128Cx16(&bufA[0][0], &bufB_128x16[tag][0]); + + idx = (idx + 1) & 3; + + /* Store the results back to system memory in c + * Construct the display list to store to the blocked formated C matrix. 
+ */ + list = (unsigned int)(&bufB_list[idx+4][0]); + fill_dma_list((volatile vec_uint4 *)list, elementc, stride2c, stride4c, stride6c); + +#if (HPL_ACCEL_SPES & 3) != 0 + blk_idx = spu_add(blk_idx, (HPL_ACCEL_SPES & 3)); + nextc = spu_sel(next0c, next1c, spu_cmpgt(blk_idx, 3)); + blk_idx = spu_and(blk_idx, 3); +#endif + +#ifdef MATRIX_4GB_CROSSING + /* The list 4GB crossing can only occur at block boundary. Therefore, halfway through + * the list. + */ + list_size = (spu_extract(elementc, 1) > (0xFFFFFFFF - M_SUB*M_SUB*8)) ? (M_SUB*8) : (M*8); + + spu_mfcdma64(&bufB_128x16[tag][0], c_hi, list, list_size, tag, MFC_PUTL_CMD); + spu_mfcdma64(&bufB_128x16[tag][list_size], c_hi+1, list+(M_SUB*8), M*8-list_size, tag, MFC_PUTL_CMD); + + c_hi += spu_extract(spu_genc(elementc, nextc), 1); +#else + spu_mfcdma64(&bufB_128x16[tag][0], c_hi, list, 128*8, tag, MFC_PUTL_CMD); +#endif + elementc = spu_add(elementc, nextc); + + next_tag = tag ^ 1; + + DMA_WAIT_REQUEST(1< (0xFFFFFFFF - M_SUB*M_SUB*8)) ? (M_SUB*8) : (M*8); + + spu_mfcdma64(&bufB_128x16[next_tag][0], c_hi, list, list_size, tag, MFC_PUTL_CMD); + spu_mfcdma64(&bufB_128x16[next_tag][list_size], c_hi+1, list+(M_SUB*8), M*8-list_size, tag, MFC_PUTL_CMD); + + c_hi += spu_extract(spu_genc(elementc, nextc), 1); +#else + spu_mfcdma64(&bufB_128x16[next_tag][0], c_hi, list, 128*8, tag, MFC_PUTL_CMD); +#endif + + elementc = spu_add(elementc, nextc); + + /* Report completion status if requested. 
+ */ + report_completion(id, cmd_parms->incomplete, tag); +} + Index: accel/lib/spu/accel_mm_dp.c =================================================================== RCS file: accel/lib/spu/accel_mm_dp.c diff -N accel/lib/spu/accel_mm_dp.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_mm_dp.c 20 Aug 2008 03:57:53 -0000 1.8 @@ -0,0 +1,289 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include + + +/* Compute generalized matrix multiply of the form + * + * [C] -= [A] * [B] + * + * where + * C is a row ordered matrix of dimension n by m (width by height) elements + * with leading dimension n. + * A is a row ordered matrix of dimension k by m elements with leading + * dimension k. + * B is a row ordered matrix of dimension n by k elements with leading + * dimension n. + * + * The computation is performed by computing the result using sub-blocks of + * size 8x4 for B and C, and 4x4 for A. + * + * This blocking mandates that k and m must be integral multiples of 4 and + * n must be an integral multiple of 8. + * + * NOTE: The leading dimensions are a double stride, not a vector stride. 
+ */ + +void mm_dp(int k, int m, int n, vector double *c, vector double *a, vector double *b) +{ + int i, x, y; + vector unsigned int pA, pB, pC; + vector unsigned int pA_start, pA_row, pB_start, pC_start; + vector unsigned int n1, n32, k1, k32; + vector double *pA0, *pA1, *pA2, *pA3; + vector double *pB0, *pB1, *pB2, *pB3; + vector double *pC0, *pC1, *pC2, *pC3; + vector double A00, A01, A10, A11, A20, A21, A30, A31; + vector double A00_0, A10_0, A20_0, A30_0; + vector double A00_1, A10_1, A20_1, A30_1; + vector double A01_0, A11_0, A21_0, A31_0; + vector double A01_1, A11_1, A21_1, A31_1; + vector double B00, B01, B02, B03; + vector double B10, B11, B12, B13; + vector double B20, B21, B22, B23; + vector double B30, B31, B32, B33; + vector double C00, C01, C02, C03; + vector double C10, C11, C12, C13; + vector double C20, C21, C22, C23; + vector double C30, C31, C32, C33; + vector unsigned int v_0123 = (vector unsigned int){0, 8, 16, 24}; + vector unsigned int n_0123, k_0123; + vector unsigned char pat0 = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7}; + vector unsigned char pat1; + + pat1 = spu_or(pat0, 8); + + /* Precompute 4 local store pointers for each of the buffer pointers + * + * pA_start = a+0*k, a+1*k, a+2*k, a+3*k + * pB_start = b+0*n, b+1*n, b+2*n, b+3*n + * pC_start = c+0*n, c+1*n, c+2*n, c+3*n + * + * where a, b, c are double pointers. 
+ */ + k1 = spu_splats((unsigned int)k); + n1 = spu_splats((unsigned int)n); + + k_0123 = spu_mulo((vector unsigned short)k1, (vector unsigned short)v_0123); + n_0123 = spu_mulo((vector unsigned short)n1, (vector unsigned short)v_0123); + pA_start = spu_add(spu_splats((unsigned int)a), k_0123); + pB_start = spu_add(spu_splats((unsigned int)b), n_0123); + pC_start = spu_add(spu_splats((unsigned int)c), n_0123); + + n32 = spu_sl(n1, 5); + k32 = spu_sl(k1, 5); + + for (x=0; x +#include +#include +#include +#include "hpl_accel_spu.h" +#include "accel_buffers.h" +#include "accel_utils.h" +#include "accel_reform.h" + +void accel_reform_matrix_CL_to_B(hpl_accel_init_parms_t *parms, + volatile hpl_accel_reform_matrix_CL_to_B_parms_t *cmd_parms) +{ + int i; + unsigned int x, y; + unsigned int id; + unsigned long long a, scratch; + unsigned int a_hi, a_lo, out_hi, out_lo; + unsigned int scratch_hi, scratch_lo, lo; + unsigned int n, nb, m, mb, m_pad, lda, spes, trailing, left; + unsigned int dst_idx; + unsigned int tag, next_tag; + unsigned int retained; /* Number of buffers kept in local store instead of the scratch buffer */ + vec_uint4 next_col_blk, next_row_blk; + vec_uint4 element0, element1, element2, element3, element_next; + volatile vec_uint4 *list; + vec_uint4 mask_0101 = (vec_uint4){0,-1,0,-1}; +#ifdef ACCEL_LITTLE_ENDIAN + vec_uchar16 pat_even = (vec_uchar16){7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16}; +#else + vec_uchar16 pat_even = (vec_uchar16){0,1,2,3,4,5,6,7, 16,17,18,19,20,21,22,23}; +#endif + vec_uchar16 pat_odd = spu_or(pat_even, 8); + vec_uchar16 pat_z0z0 = (vec_uchar16){128,128,128,128, 0,1,2,3, 128,128,128,128, 0,1,2,3}; + vec_uchar16 pat_zzzz = (vec_uchar16){128,128,128,128, 128,128,128,128, 128,128,128,128, 128,128,128,128}; + vec_double2 *srcTop, *srcBot, *dst, *buf; + vec_double2 a0, a1, a2, a3, a4, a5, a6, a7; +#ifdef MATRIX_4GB_CROSSING + unsigned int in_hi; + vec_uint4 carry; +#endif + + + id = parms->id; + + /* Wait for the transfer of the 
parameters to complete + */ + DMA_WAIT_RECEIVE(); + + /* Fetch the parameters + */ + a = cmd_parms->a; + scratch = cmd_parms->scratch; + lda = cmd_parms->lda; + m = cmd_parms->m; + n = cmd_parms->n; + spes = cmd_parms->spes; + + /* Pad m and n to the nearest block and compute the number of blocks to be + * reformatted. Rows are padded with 0.0. Columns are filled in with don't-care + * values. + */ + m_pad = (m % M_SUB) - 1; + mb = (m + M_SUB-1)/M_SUB; + + nb = (n + M_SUB-1)/M_SUB; + + /* Compute the amount of trailing data to zero after the blocked data. + */ + trailing = (lda - mb*M_SUB*sizeof(double))*M_SUB; + + + a_hi = mfc_ea2h(a); + a_lo = mfc_ea2l(a); + + MATRIX_EA_UMADD32(a_hi, a_lo, lda, id*M_SUB); + +#ifdef MATRIX_4GB_CROSSING + in_hi = a_hi; +#endif + + scratch_hi = mfc_ea2h(scratch); + scratch_lo = mfc_ea2l(scratch); + + scratch_lo += id*(mb-4)*M_SUB*M_SUB*sizeof(double); + + /* Compute all the working variables needed to generate the DMA lists. + * + * element0 = {M_SUB*sizeof(double), a_lo + 0*lda, M_SUB*sizeof(double), a_lo + 1*lda} + * element1 = {M_SUB*sizeof(double), a_lo + 2*lda, M_SUB*sizeof(double), a_lo + 3*lda} + * element2 = {M_SUB*sizeof(double), a_lo + 4*lda, M_SUB*sizeof(double), a_lo + 5*lda} + * element3 = {M_SUB*sizeof(double), a_lo + 6*lda, M_SUB*sizeof(double), a_lo + 7*lda} + * element_next = { 0, 8*lda, 0, 8*lda} + */ + next_col_blk = spu_splats((unsigned int)(M_SUB*sizeof(double))); + next_row_blk = spu_and(spu_splats(lda*M_SUB*spes - mb*M_SUB*sizeof(double)), mask_0101); + + element_next = spu_promote(8*lda, 0); + element_next = spu_shuffle(element_next, element_next, pat_z0z0); + + element0 = spu_add(spu_rlmaskqwbyte(spu_rlmask(element_next, -3), -8), + spu_sel(spu_splats((unsigned int)(M_SUB*sizeof(double))), spu_splats(a_lo), mask_0101)); + + next_col_blk = spu_and(next_col_blk, mask_0101); + + element1 = spu_rlmask(element_next, -2); + element2 = spu_rlmask(element_next, -1); + element3 = spu_add(spu_add(element1, element2), 
element0); + element1 = spu_add(element1, element0); + element2 = spu_add(element2, element0); + + /* Before starting, make sure all previous DMA transfers are completed so + * that all the LS buffers are known to be available. + */ + DMA_WAIT(-1); + + /* Reformat the blocks + */ + tag = 0; + + for (x=id; x 1) { + if (y != mb-2) { + /* If this is the next to last block of the column, then do not put into + * the scratch buffer. + */ + dst = &bufB[dst_idx][0]; + spu_mfcdma64(dst, scratch_hi, lo, 16384, tag, MFC_PUT_CMD); + spu_mfcdma64(dst + 1024, scratch_hi, lo+16384, 16384, tag, MFC_PUT_CMD); + lo += M_SUB*M_SUB*sizeof(double); + } + dst_idx ^= 1; + } else { + dst_idx++; + retained++; + } + + tag = next_tag; + } + + /* Wait for the final block get before putting reformated blocks back into the + * matrix. + */ + DMA_WAIT(1< 3) { + srcTop = &bufB[dst_idx^1][0]; + spu_mfcdma64(srcTop, out_hi, out_lo, 16384, tag^1, MFC_PUT_CMD); + spu_mfcdma64(srcTop + 1024, out_hi, out_lo+16384, 16384, tag^1, MFC_PUT_CMD); + MATRIX_EA_UADD32(out_hi, out_lo, 32768); + } + + + /* Finish reformating the last block. The last block contains special handling + * code to zeros out the pad rows. 
+ */ + srcTop = &bufA[tag][16*M_SUB/2]; + srcBot = &bufA[tag][48*M_SUB/2]; + dst = &bufB[dst_idx][0]; + + for (i=0; i<64; i+=2) { + vec_uchar16 pat_e, pat_o; + + pat_e = spu_sel(pat_even, pat_zzzz, + spu_maskb(spu_extract(spu_cmpgt(spu_promote((unsigned int)i, 0), spu_promote(m_pad, 0)), 0))); + pat_o = spu_sel(pat_odd, pat_zzzz, + spu_maskb(spu_extract(spu_cmpgt(spu_promote((unsigned int)(i+1), 0), spu_promote(m_pad, 0)), 0))); + REFORM_8(dst, srcTop, -16, 0, pat_e, pat_o); + REFORM_8(dst, srcTop, -8, 4, pat_e, pat_o); + REFORM_8(dst, srcTop, 0, 8, pat_e, pat_o); + REFORM_8(dst, srcTop, 8, 12, pat_e, pat_o); + + REFORM_8(dst, srcBot, -16, 16, pat_e, pat_o); + REFORM_8(dst, srcBot, -8, 20, pat_e, pat_o); + REFORM_8(dst, srcBot, 0, 24, pat_e, pat_o); + REFORM_8(dst, srcBot, 8, 28, pat_e, pat_o); + + srcTop += 1; + srcBot += 1; + dst += 2*M_SUB/2; + } + + /* Store the final block back into the matrix. + */ + spu_mfcdma64(&bufB[dst_idx][0], out_hi, out_lo, 16384, tag, MFC_PUT_CMD); + spu_mfcdma64(&bufB[dst_idx][1024], out_hi, out_lo+16384, 16384, tag, MFC_PUT_CMD); + + /* Zero out final trailing data resulting from lda striding. + */ + MATRIX_EA_UADD32(out_hi, out_lo, 32768); + + left = trailing; + while (left) { + unsigned int size; + +#ifndef MFC_SDCRZ_CMD +#define MFC_SDCRZ_CMD 0x0089 /* SPU Only */ +#endif /* MFC_SDCRZ_CMD */ + + size = (left > 16384) ? 16384 : left; + spu_mfcdma64(0, out_hi, out_lo, size, tag, MFC_SDCRZ_CMD); + + MATRIX_EA_UADD32(out_hi, out_lo, size); + left -= size; + } + + /* Advance pointers to next column to be processed. 
+ */ +#ifdef MATRIX_4GB_CROSSING + in_hi += spu_extract(spu_genc(element0, next_row_blk), 1); +#endif + element0 = spu_add(element0, next_row_blk); + element1 = spu_add(element1, next_row_blk); + element2 = spu_add(element2, next_row_blk); + element3 = spu_add(element3, next_row_blk); + + MATRIX_EA_UMADD32(a_hi, a_lo, lda, spes*M_SUB); + + /* Wait for all the transfers except the final block to complete + */ + DMA_WAIT(1<<(tag^1)); + } + + /* Report completion status if requested. + */ + report_completion(id, cmd_parms->incomplete, tag); +} + Index: accel/lib/spu/accel_reform_panel_B_to_CL.c =================================================================== RCS file: accel/lib/spu/accel_reform_panel_B_to_CL.c diff -N accel/lib/spu/accel_reform_panel_B_to_CL.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_reform_panel_B_to_CL.c 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,247 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel_spu.h" +#include "accel_buffers.h" +#include "accel_utils.h" +#include "accel_reform.h" + + +void accel_reform_panel_B_to_CL(hpl_accel_init_parms_t *parms, + volatile hpl_accel_reform_panel_parms_t *cmd_parms) +{ + int i; + unsigned int id; + int x, y, columns, rows; + int dma_size1, dma_size2; + unsigned long long a, panel; + unsigned int a_hi, a_lo, hi, lo; + unsigned int panel_hi, panel_lo; + unsigned int lda, ldp; + unsigned int n, m, mb; + unsigned int tag, next_tag; + unsigned int addend; +#ifdef ACCEL_LITTLE_ENDIAN + vec_uchar16 pat_even = (vec_uchar16){7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16}; +#else + vec_uchar16 pat_even = (vec_uchar16){0,1,2,3,4,5,6,7, 16,17,18,19,20,21,22,23}; +#endif + vec_uchar16 pat_odd = spu_or(pat_even, 8); + vec_uchar16 pat_z0z0 = (vec_uchar16){128,128,128,128, 0,1,2,3, 
128,128,128,128, 0,1,2,3}; + vec_uint4 e0, e1, e2, e3, esize; + vec_uint4 element0, element1, element2, element3, element_next; + vec_uint4 next_col_blk, next_row_blk; + vec_uint4 mask_0101 = (vec_uint4){0,-1,0,-1}; + vec_double2 a0, a1, a2, a3, a4, a5, a6, a7; + vec_double2 *srcTop, *srcBot, *dst; + volatile vec_uint4 *list; + + id = parms->id; + + /* Wait for the transfer of the parameters to complete + */ + DMA_WAIT_RECEIVE(); + DMA_WAIT_REQUEST(-1); + + /* Fetch the parameters + */ + a = cmd_parms->a; + panel = cmd_parms->panel; + lda = cmd_parms->lda; + ldp = cmd_parms->ldp; + m = cmd_parms->m; + n = cmd_parms->n; + + mb = (m + (M_SUB-1)) / M_SUB; + + a_hi = mfc_ea2h(a); + a_lo = mfc_ea2l(a); + + addend = id * (M_SUB * M_SUB * sizeof(double)); + MATRIX_EA_UADD32(a_hi, a_lo, addend); + + panel_hi = mfc_ea2h(panel); + panel_lo = mfc_ea2l(panel); + + panel_lo += id * M_SUB * sizeof(double); + + /* Compute all the working variables needed to generate the DMA lists. + * + * element0 = {M_SUB*sizeof(double), panel_lo + 0*ldp, M_SUB*sizeof(double), panel_lo + 1*ldp} + * element1 = {M_SUB*sizeof(double), panel_lo + 2*ldp, M_SUB*sizeof(double), panel_lo + 3*ldp} + * element2 = {M_SUB*sizeof(double), panel_lo + 4*ldp, M_SUB*sizeof(double), panel_lo + 5*ldp} + * element3 = {M_SUB*sizeof(double), panel_lo + 6*ldp, M_SUB*sizeof(double), panel_lo + 7*ldp} + * element_next = { 0, 8*ldp, 0, 8*ldp} + */ + next_col_blk = spu_and(spu_splats(ldp*M_SUB), mask_0101); + next_row_blk = spu_and(spu_splats(HPL_ACCEL_REFORM_SPES*M_SUB*sizeof(double)), mask_0101); + + element_next = spu_promote(8*ldp, 0); + element_next = spu_shuffle(element_next, element_next, pat_z0z0); + + element0 = spu_add(spu_rlmaskqwbyte(spu_rlmask(element_next, -3), -8), + spu_sel(spu_splats((unsigned int)(M_SUB*sizeof(double))), spu_splats(panel_lo), mask_0101)); + + element1 = spu_rlmask(element_next, -2); + element2 = spu_rlmask(element_next, -1); + element3 = spu_add(spu_add(element1, element2), element0); + 
element1 = spu_add(element1, element0); + element2 = spu_add(element2, element0); + + /* Before starting, make sure all previous DMA transfers are completed so + * that all the LS buffers are known to be available. + */ + DMA_WAIT_RECEIVE(); + + /* Reformat the blocks + */ + tag = 0; + + /* Fetch the first block + */ + if (id < mb) { + dma_size1 = (int)m - id*M_SUB; + dma_size2 = dma_size1-32; + dma_size1 = clamp_0_32(dma_size1); + dma_size2 = clamp_0_32(dma_size2); + rows = (dma_size1 + dma_size2) / 2; + esize = spu_promote(rows * sizeof(vec_double2), 0); + + spu_mfcdma64(&bufA[0][0], a_hi, a_lo, (unsigned int)dma_size1*M_SUB*sizeof(double), 0, MFC_GET_CMD); + spu_mfcdma64(&bufA[0][1024], a_hi, a_lo+16384, (unsigned int)dma_size2*M_SUB*sizeof(double), 0, MFC_GET_CMD); + } + + /* For each of the row of blocks. + */ + for (y=id; y<(int)mb; ) { + hi = a_hi; + lo = a_lo; + MATRIX_EA_UADD32(hi, lo, lda); + + e0 = element0; + e1 = element1; + e2 = element2; + e3 = element3; + + for (x=0; x<(int)n-M_SUB; x+=M_SUB) { + next_tag = tag ^ 1; + + /* Fetch the next block. 
+ */ + spu_mfcdma64(&bufA[next_tag][0], hi, lo, 16384, next_tag, MFC_GET_CMD); + spu_mfcdma64(&bufA[next_tag][1024], hi, lo+16384, 16384, next_tag, MFC_GET_CMD); + MATRIX_EA_UADD32(hi, lo, lda); + + DMA_WAIT(1<incomplete, tag^1); +} Index: accel/lib/spu/accel_reform_panel_R_to_B.c =================================================================== RCS file: accel/lib/spu/accel_reform_panel_R_to_B.c diff -N accel/lib/spu/accel_reform_panel_R_to_B.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_reform_panel_R_to_B.c 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,153 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel_spu.h" +#include "accel_buffers.h" +#include "accel_utils.h" +#include "accel_reform.h" + + +void accel_reform_panel_R_to_B(hpl_accel_init_parms_t *parms, + volatile hpl_accel_reform_panel_parms_t *cmd_parms) +{ + int i, x, y; + unsigned int id; + unsigned int idx; + unsigned int a_hi, a_lo, hi, lo; + unsigned long long a, panel; + unsigned int panel_hi, panel_lo, p_lo; + unsigned int tag; + unsigned int lda, ldp; + unsigned int n, m, row_len, size, left, esize, extra; + unsigned int *list, list_offset; + unsigned int addend; + vec_double2 *buf; +#ifdef MATRIX_4GB_CROSSING + unsigned int carry; +#endif + + id = parms->id; + + /* Wait for the transfer of the parameters to complete + */ + DMA_WAIT_RECEIVE(); + + /* Fetch the parameters + */ + a = cmd_parms->a; + panel = cmd_parms->panel; + lda = cmd_parms->lda; + ldp = cmd_parms->ldp; + m = cmd_parms->m; + n = cmd_parms->n; + + a_hi = mfc_ea2h(a); + a_lo = mfc_ea2l(a); + + panel_hi = mfc_ea2h(panel); + panel_lo = mfc_ea2l(panel); + + addend = id * (M_SUB * sizeof(double)); + + MATRIX_EA_UADD32(a_hi, a_lo, addend); + panel_lo += id * ldp; + + row_len = (n&~1)*sizeof(double); + extra = 
(n&1)*sizeof(double); + + tag = 0; + list_offset = 0; + size = 0; + + /* Before starting, make sure all previous DMA transfers are completed so + * that all the LS buffers are known to be available. + */ + DMA_WAIT(-1); + + /* For each row + */ + for (y=(int)id; y<(int)m; y+=HPL_ACCEL_SPES) { + /* For each portion of the row in 16K chunks + */ + hi = a_hi; + lo = a_lo; + esize = lda; + + p_lo = panel_lo; + + for (x=0; x<(int)row_len; x+=(int)size) { + + left = row_len - (unsigned int)x; + size = (left < 16384) ? left : 16384; + + buf = &bufA[tag][0]; + spu_mfcdma64(buf, panel_hi, p_lo, size, tag, MFC_GET_CMD); + + p_lo += size; + + /* Construct a list for the placement into blocked format. + */ + list = (unsigned int *)(&bufB[0][0] + list_offset); + for (i=0, idx=0; i<(int)size; i+=(int)M_SUB*sizeof(double)) { + esize = size - i; + if (esize > M_SUB*sizeof(double)) esize = M_SUB*sizeof(double); + list[idx+0] = esize; + list[idx+1] = lo; + idx += 2; +#ifdef MATRIX_4GB_CROSSING + carry = spu_extract(spu_genc(spu_promote(lo, 0), spu_promote(lda, 0)), 0); + /* If we cross a 4GB boundary, flush the list and start a new one. 
+ */ + if (carry) { + spu_mfcdma64(buf, hi, (unsigned int)list, 4*idx, tag, MFC_PUTLB_CMD); + buf += (M_SUB/4)*idx;; + list += idx; + idx = 0; + hi += carry; + } +#endif + lo += lda; + } + spu_mfcdma64(buf, hi, (unsigned int)list, 4*idx, tag, MFC_PUTLB_CMD); + spu_mfcdma32(0, 0, 0, tag, MFC_BARRIER_CMD); + + /* Advance pointers to next row or buffer + */ + list_offset = (list_offset + 16) % (128*16); /* accommodate up to 128 enqueued DMAs */ + tag ^= 1; + } + + /* Handle the final odd column values + */ + if (extra) { + buf = &bufA[tag][0]; + if (size & (M_SUB*sizeof(double)-1)) { + addend = esize - lda; + MATRIX_EA_ADD32(hi, lo, addend); + } + spu_mfcdma64(buf, panel_hi, p_lo, extra, tag, MFC_GET_CMD); + spu_mfcdma64(buf, hi, lo, extra, tag, MFC_PUTB_CMD); + tag ^= 1; + } + + /* Advance pointers to the next row */ + addend = M_SUB*sizeof(double)*HPL_ACCEL_SPES; + MATRIX_EA_UADD32(a_hi, a_lo, addend); + panel_lo += ldp * HPL_ACCEL_SPES; + + } + /* Wait for next to last DMA to complete before posting completion. 
+ */ + DMA_WAIT(1<incomplete, tag^1); +} + Index: accel/lib/spu/accel_reform_rows_B_to_R.c =================================================================== RCS file: accel/lib/spu/accel_reform_rows_B_to_R.c diff -N accel/lib/spu/accel_reform_rows_B_to_R.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_reform_rows_B_to_R.c 22 Oct 2008 03:28:08 -0000 1.3 @@ -0,0 +1,166 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel_spu.h" +#include "accel_buffers.h" +#include "accel_utils.h" +#include "accel_reform.h" + + +static inline void row_B_to_R(unsigned int src_hi, unsigned int src_lo, int ld_src, + unsigned int dst_hi, unsigned int dst_lo, + unsigned int skip, unsigned int left, void *buf) +{ + unsigned int src_size, dst_size, size; + void *ptr; + + dst_size = 16*1024 - skip; + src_size = (M_SUB*sizeof(double)) - skip; + if (dst_size > left) dst_size = left; + if (src_size > left) src_size = left; + + while (left) { + /* Fetch (up to) 16KB buffer of M_SUB spans */ + spu_mfcdma64(buf, src_hi, src_lo+skip, src_size, 0, MFC_GETB_CMD); + ptr = buf; + + skip = 0; + size = dst_size; + left -= dst_size; + + while ((size -= src_size)) { + ptr += src_size; + MATRIX_EA_UADD32(src_hi, src_lo, ld_src); + src_size = (size > (M_SUB*sizeof(double))) ? M_SUB*sizeof(double) : size; + + spu_mfcdma64(ptr, src_hi, src_lo, src_size, 0, MFC_GET_CMD); + } + + /* Store the 16KB span into the row buffer */ + spu_mfcdma64(buf, dst_hi, dst_lo, dst_size, 0, MFC_PUTB_CMD); + + MATRIX_EA_UADD32(src_hi, src_lo, ld_src); + dst_lo += dst_size; + dst_size = (left > 16*1024) ? 16*1024 : left; + src_size = (dst_size > (M_SUB*sizeof(double))) ? 
M_SUB*sizeof(double) : dst_size; + } +} + + + +void accel_reform_rows_B_to_R(hpl_accel_init_parms_t *parms, + volatile hpl_accel_reform_rows_parms_t *cmd_parms) +{ + int i; + int m, n, ldr, lda; + int row; + unsigned int id; + unsigned int a_hi, a_lo, r_hi, r_lo; + unsigned int blk_col, skip, mask; + unsigned int spans, spans_per_spe, extra_spans, start_span, end_span; + unsigned int start_col, end_col, max_end_col; + unsigned int row_size; + vector signed int m_n_ldr_lda; + vector unsigned long long rows_a, incomplete_blk_col; + void *buf; + + id = parms->id; + + /* Wait for the transfer of the parameters to complete + */ + DMA_WAIT_RECEIVE(); + DMA_WAIT_REQUEST(-1); + + /* Fetch the parameters + */ + m_n_ldr_lda = cmd_parms->m_n_ldr_lda; + rows_a = cmd_parms->rows_a; + incomplete_blk_col = cmd_parms->incomplete_blk_col; + + m = spu_extract(m_n_ldr_lda, 0); + n = spu_extract(m_n_ldr_lda, 1); + ldr = spu_extract(m_n_ldr_lda, 2); + lda = spu_extract(m_n_ldr_lda, 3); + + blk_col = spu_extract((vector unsigned int)incomplete_blk_col, 2); + + r_hi = spu_extract((vector unsigned int)rows_a, 0); + r_lo = spu_extract((vector unsigned int)rows_a, 1); + + a_hi = spu_extract((vector unsigned int)rows_a, 2); + a_lo = spu_extract((vector unsigned int)rows_a, 3); + + buf = bufA; + + skip = (blk_col % M_SUB) * sizeof(double); + + blk_col /= M_SUB; + MATRIX_EA_UMADD32(a_hi, a_lo, blk_col, lda); + + /* Equally assign complete rows to each of the SPEs. + */ + row_size = n*sizeof(double); + + /* Process remaining rows by assigning each row to groups of HPL_ACCEL_SPES SPEs. + * Compute the spanning parameters assigned to this SPE. + */ + spans = (row_size + skip + (M_SUB-1)*sizeof(double)) / (M_SUB * sizeof(double)); + spans_per_spe = spans / HPL_ACCEL_SPES; + extra_spans = spans % HPL_ACCEL_SPES; + + start_span = id * spans_per_spe + ((id > extra_spans) ? 
extra_spans : id); + end_span = start_span + spans_per_spe - spu_extract(spu_cmpgt(spu_promote(extra_spans, 0), spu_promote(id, 0)), 0); + + if (end_span > start_span) { + start_col = start_span * (M_SUB * sizeof(double)); + end_col = end_span * (M_SUB * sizeof(double)); + + max_end_col = skip + row_size; + + mask = spu_extract(spu_cmpeq(spu_promote(id, 0), 0), 0); + r_lo += start_col - (skip & ~mask); + MATRIX_EA_UMADD32(a_hi, a_lo, start_span, lda); + + skip &= mask; + + start_col += skip; + end_col = (end_col > max_end_col) ? max_end_col : end_col; + + row_size = end_col - start_col; + + /* Before starting, make sure all previous DMA transfers are completed so + * that all the LS buffers are known to be available. + */ + DMA_WAIT_RECEIVE(); + for (i=0; iblk_rows[i]; + + hi = a_hi; + lo = a_lo; + EA_UADD64(hi, lo, (unsigned int)row >> (32-9), (unsigned int)row << 9); + row_B_to_R(hi, lo, lda, r_hi, r_lo + (i*ldr), skip, row_size, buf); +#else + row = cmd_parms->blk_rows[i]; + row_B_to_R(a_hi, a_lo + (row * (M_SUB * sizeof(double))), lda, r_hi, r_lo + (i*ldr), skip, row_size, buf); +#endif + } + } else { + DMA_WAIT_RECEIVE(); + } + + /* Report completion status if requested. 
+ */ + report_completion(id, spu_extract(incomplete_blk_col, 0), 0); +} + + + Index: accel/lib/spu/accel_reform_rows_R_to_B.c =================================================================== RCS file: accel/lib/spu/accel_reform_rows_R_to_B.c diff -N accel/lib/spu/accel_reform_rows_R_to_B.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_reform_rows_R_to_B.c 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,164 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel_spu.h" +#include "accel_buffers.h" +#include "accel_utils.h" +#include "accel_reform.h" + + +static inline void row_R_to_B(unsigned int src_hi, unsigned int src_lo, + unsigned int dst_hi, unsigned int dst_lo, int ld_dst, + unsigned int skip, unsigned int left, void *buf) +{ + unsigned int src_size, dst_size; + void *ptr; + + + src_size = 16*1024 - skip; + dst_size = (M_SUB*sizeof(double)) - skip; + if (src_size > left) src_size = left; + if (dst_size > left) dst_size = left; + + while (left) { + /* Fetch a big (16KB) span from the row buffer */ + + spu_mfcdma64(buf, src_hi, src_lo, src_size, 0, MFC_GETB_CMD); + + left -= src_size; + src_lo += src_size; + + /* Store the big span into the matrix in M_SUB element spans */ + spu_mfcdma64(buf, dst_hi, dst_lo+skip, dst_size, 0, MFC_PUTB_CMD); + ptr = buf; + skip = 0; + + while ((src_size -= dst_size)) { + ptr += dst_size; + MATRIX_EA_UADD32(dst_hi, dst_lo, ld_dst); + dst_size = (src_size > (M_SUB*sizeof(double))) ? M_SUB*sizeof(double) : src_size; + + spu_mfcdma64(ptr, dst_hi, dst_lo, dst_size, 0, MFC_PUT_CMD); + } + MATRIX_EA_UADD32(dst_hi, dst_lo, ld_dst); + src_size = (left > 16*1024) ? 16*1024 : left; + dst_size = (src_size > (M_SUB*sizeof(double))) ? 
M_SUB*sizeof(double) : src_size; + } +} + + + +void accel_reform_rows_R_to_B(hpl_accel_init_parms_t *parms, + volatile hpl_accel_reform_rows_parms_t *cmd_parms) +{ + int i; + int m, n, ldr, lda; + int row; + unsigned int id; + unsigned int a_hi, a_lo, r_hi, r_lo; + unsigned int blk_col, skip, mask; + unsigned int spans, spans_per_spe, extra_spans, start_span, end_span; + unsigned int start_col, end_col, max_end_col; + unsigned int row_size; + vector signed int m_n_ldr_lda; + vector unsigned long long rows_a, incomplete_blk_col; + void *buf; + + id = parms->id; + + /* Wait for the transfer of the parameters to complete + */ + DMA_WAIT_RECEIVE(); + DMA_WAIT_REQUEST(-1); + + /* Fetch the parameters + */ + m_n_ldr_lda = cmd_parms->m_n_ldr_lda; + rows_a = cmd_parms->rows_a; + incomplete_blk_col = cmd_parms->incomplete_blk_col; + + m = spu_extract(m_n_ldr_lda, 0); + n = spu_extract(m_n_ldr_lda, 1); + ldr = spu_extract(m_n_ldr_lda, 2); + lda = spu_extract(m_n_ldr_lda, 3); + + blk_col = spu_extract((vector unsigned int)incomplete_blk_col, 2); + + r_hi = spu_extract((vector unsigned int)rows_a, 0); + r_lo = spu_extract((vector unsigned int)rows_a, 1); + + a_hi = spu_extract((vector unsigned int)rows_a, 2); + a_lo = spu_extract((vector unsigned int)rows_a, 3); + + buf = bufA; + + skip = (blk_col % M_SUB) * sizeof(double); + + blk_col /= M_SUB; + MATRIX_EA_UMADD32(a_hi, a_lo, blk_col, lda); + + /* Equally assign complete rows to each of the SPEs. + */ + row_size = n*sizeof(double); + + /* Process rows by splitting the M_SUB-wide spans of each row across a group of 8 SPEs (the count is hard-coded below; accel_reform_rows_B_to_R uses HPL_ACCEL_SPES instead -- TODO confirm the two agree). + * Compute the spanning parameters assigned to this SPE. + */ + spans = (row_size + skip + (M_SUB-1)*sizeof(double)) / (M_SUB * sizeof(double)); + spans_per_spe = spans / 8; + extra_spans = spans % 8; + + start_span = id * spans_per_spe + ((id > extra_spans) ? 
extra_spans : id); + end_span = start_span + spans_per_spe - spu_extract(spu_cmpgt(spu_promote(extra_spans, 0), spu_promote(id, 0)), 0); + + if (end_span > start_span) { + start_col = start_span * (M_SUB * sizeof(double)); + end_col = end_span * (M_SUB * sizeof(double)); + + max_end_col = skip + row_size; + + mask = spu_extract(spu_cmpeq(spu_promote(id, 0), 0), 0); + r_lo += start_col - (skip & ~mask); + MATRIX_EA_UMADD32(a_hi, a_lo, start_span, lda); + + skip &= mask; + + start_col += skip; + end_col = (end_col > max_end_col) ? max_end_col : end_col; + + row_size = end_col - start_col; + + /* Before starting, make sure all previous DMA transfers are completed so + * that all the LS buffers are known to be available. + */ + DMA_WAIT_RECEIVE(); + + for (i=0; iblk_rows[i]; + + hi = a_hi; + lo = a_lo; + EA_UADD64(hi, lo, (unsigned int)row >> (32-9), (unsigned int)row << 9); + row_R_to_B(r_hi, r_lo + (i*ldr), hi, lo, lda, skip, row_size, buf); +#else + row = cmd_parms->blk_rows[i]; + row_R_to_B(r_hi, r_lo + (i*ldr), a_hi, a_lo + (row * (M_SUB * sizeof(double))), lda, skip, row_size, buf); +#endif + } + } else { + DMA_WAIT_RECEIVE(); + } + + /* Report completion status if requested. 
+ */ + report_completion(id, spu_extract(incomplete_blk_col, 0), 0); +} Index: accel/lib/spu/accel_spu.h =================================================================== RCS file: accel/lib/spu/accel_spu.h diff -N accel/lib/spu/accel_spu.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_spu.h 20 Aug 2008 03:57:53 -0000 1.7 @@ -0,0 +1,49 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#ifndef _ACCEL_SPU_H_ +#define _ACCEL_SPU_H_ 1 + +typedef void (*accel_specialist_t)(hpl_accel_init_parms_t *, volatile void *); + +/* Accelerator specialists and dispatch table + */ +extern void accel_dgemm(hpl_accel_init_parms_t *, volatile void *); +extern void accel_dgemm_C_C_C(hpl_accel_init_parms_t *, volatile void *); +extern void accel_dgemm_panel(hpl_accel_init_parms_t *, volatile void *); +extern void accel_dtrsm(hpl_accel_init_parms_t *, volatile void *); +extern void accel_dtrsm_panel(hpl_accel_init_parms_t *, volatile void *); +extern void accel_reform_matrix_CL_to_B(hpl_accel_init_parms_t *, volatile void *); +extern void accel_reform_panel_B_to_CL(hpl_accel_init_parms_t *, volatile void *); +extern void accel_reform_panel_R_to_B(hpl_accel_init_parms_t *, volatile void *); +extern void accel_reform_rows_R_to_B(hpl_accel_init_parms_t *, volatile void *); +extern void accel_reform_rows_B_to_R(hpl_accel_init_parms_t *, volatile void *); +extern void accel_fini(hpl_accel_init_parms_t *, volatile void *); +extern void accel_dtrsm_CL_B(hpl_accel_init_parms_t *, volatile void *); +extern void accel_swap_rows_B_to_B(hpl_accel_init_parms_t *, volatile void *); +extern void accel_copy_rows_R_to_R(hpl_accel_init_parms_t *, volatile void *); + + +accel_specialist_t dispatch[] = { + &accel_dgemm, + &accel_dtrsm, + &accel_reform_matrix_CL_to_B, + &accel_reform_panel_B_to_CL, + &accel_reform_panel_R_to_B, 
&accel_dgemm_panel, + &accel_reform_rows_R_to_B, + &accel_reform_rows_B_to_R, + &accel_fini, + &accel_dtrsm_CL_B, + &accel_dtrsm_panel, + &accel_dgemm_C_C_C, + &accel_swap_rows_B_to_B, + &accel_copy_rows_R_to_R +}; + +#endif /* _ACCEL_SPU_H_ */ + + + Index: accel/lib/spu/accel_swap_rows_B_to_B.c =================================================================== RCS file: accel/lib/spu/accel_swap_rows_B_to_B.c diff -N accel/lib/spu/accel_swap_rows_B_to_B.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_swap_rows_B_to_B.c 20 Aug 2008 03:57:53 -0000 1.5 @@ -0,0 +1,186 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel_spu.h" +#include "accel_buffers.h" +#include "accel_utils.h" +#include "accel_reform.h" + + +static inline void row_B_to_B(unsigned int src_hi, unsigned int src_lo, + unsigned int dst_hi, unsigned int dst_lo, + int ld, unsigned int skip, unsigned int left) +{ + unsigned int size, blk_size; + + if (skip>0) { + size = (M_SUB*sizeof(double)) - skip; + if (size > left) size = left; + /* NOTE(review): skip is a byte offset (it is also added to the effective address below), but other routines in this patch index bufA/bufB as 2-D arrays, so bufA+skip may scale by a whole row rather than by bytes -- confirm against the declarations in accel_buffers.h. */ + spu_mfcdma64(bufA+skip, src_hi, src_lo+skip, size, 0, MFC_GET_CMD); + spu_mfcdma64(bufB+skip, dst_hi, dst_lo+skip, size, 0, MFC_GET_CMD); + spu_mfcdma64(bufA+skip, dst_hi, dst_lo+skip, size, 0, MFC_PUTB_CMD); + spu_mfcdma64(bufB+skip, src_hi, src_lo+skip, size, 0, MFC_PUT_CMD); + + MATRIX_EA_UADD32(src_hi, src_lo, ld); + MATRIX_EA_UADD32(dst_hi, dst_lo, ld); + left -= size; + } + + while (left) { + void *ptrA, *ptrB; + unsigned int get_size, put_size; + unsigned int save_src_hi = src_hi, save_src_lo = src_lo; + unsigned int save_dst_hi = dst_hi, save_dst_lo = dst_lo; + + /* size is the number of bytes swapped in this iteration of the loop */ + size = 16*1024; + if (size > left) size = left; + + /* Barrier to ensure all prior transfers are complete */ 
+ spu_mfcdma64(0, 0, 0, 0, 0, MFC_BARRIER_CMD); + + /* Fetch (up to) 16KB of src and dst rows into separate buffers */ + ptrA = bufA; + ptrB = bufB; + get_size = size; + while (get_size) { + blk_size = (M_SUB*sizeof(double)); + if (blk_size > get_size) blk_size = get_size; + spu_mfcdma64(ptrA, src_hi, src_lo, blk_size, 0, MFC_GET_CMD); + spu_mfcdma64(ptrB, dst_hi, dst_lo, blk_size, 0, MFC_GET_CMD); + ptrA += blk_size; + ptrB += blk_size; + MATRIX_EA_UADD32(src_hi, src_lo, ld); + MATRIX_EA_UADD32(dst_hi, dst_lo, ld); + get_size -= blk_size; + } + + /* Barrier to ensure all gets are complete */ + spu_mfcdma64(0, 0, 0, 0, 0, MFC_BARRIER_CMD); + + src_hi = save_src_hi; src_lo = save_src_lo; + dst_hi = save_dst_hi; dst_lo = save_dst_lo; + + /* Store the fetched bytes back into the src and dst rows */ + ptrA = bufA; + ptrB = bufB; + put_size = size; + while (put_size) { + blk_size = (M_SUB*sizeof(double)); + if (blk_size > put_size) blk_size = put_size; + spu_mfcdma64(ptrB, src_hi, src_lo, blk_size, 0, MFC_PUT_CMD); + spu_mfcdma64(ptrA, dst_hi, dst_lo, blk_size, 0, MFC_PUT_CMD); + ptrA += blk_size; + ptrB += blk_size; + MATRIX_EA_UADD32(src_hi, src_lo, ld); + MATRIX_EA_UADD32(dst_hi, dst_lo, ld); + put_size -= blk_size; + } + + left -= size; + } + + DMA_WAIT(-1); +} + + +void accel_swap_rows_B_to_B(hpl_accel_init_parms_t *parms, + volatile hpl_accel_swap_rows_parms_t *cmd_parms) +{ + int m, n, lda; + int src, dst; + unsigned int id; + unsigned int a_hi, a_lo; + unsigned int blk_col, row_size, skip, mask; + unsigned int spans, spans_per_spe, extra_spans, start_span, end_span; + unsigned int start_col, end_col, max_end_col; + vector signed int m_n_lda_blk_col; + vector unsigned long long a_incomplete; + + id = parms->id; + + /* Wait for the transfer of the parameters to complete + */ + DMA_WAIT_RECEIVE(); + DMA_WAIT_REQUEST(-1); + + /* Fetch the parameters + */ + m_n_lda_blk_col = cmd_parms->m_n_lda_blk_col; + a_incomplete = cmd_parms->a_incomplete; + + m = 
spu_extract(m_n_lda_blk_col, 0); + n = spu_extract(m_n_lda_blk_col, 1); + lda = spu_extract(m_n_lda_blk_col, 2); + blk_col = spu_extract(m_n_lda_blk_col, 3); + + a_hi = spu_extract((vector unsigned int)a_incomplete, 0); + a_lo = spu_extract((vector unsigned int)a_incomplete, 1); + + skip = (blk_col % M_SUB) * sizeof(double); + blk_col /= M_SUB; + + MATRIX_EA_UMADD32(a_hi, a_lo, blk_col, lda); + + /* Process rows by assigning each row to a group of 8 SPEs. + * Compute the spanning parameters assigned to this SPE. + */ + row_size = n*sizeof(double); + spans = (row_size + skip + (M_SUB-1)*sizeof(double)) / (M_SUB * sizeof(double)); + spans_per_spe = spans / 8; + extra_spans = spans % 8; + + start_span = id * spans_per_spe + ((id > extra_spans) ? extra_spans : id); + end_span = start_span + spans_per_spe - spu_extract(spu_cmpgt(spu_promote(extra_spans, 0), spu_promote(id, 0)), 0); + + if (end_span > start_span) { + start_col = start_span * (M_SUB * sizeof(double)); + end_col = end_span * (M_SUB * sizeof(double)); + + max_end_col = skip + row_size; + + mask = spu_extract(spu_cmpeq(spu_promote(id, 0), 0), 0); + MATRIX_EA_UMADD32(a_hi, a_lo, start_span, lda); + + skip &= mask; + + start_col += skip; + end_col = (end_col > max_end_col) ? max_end_col : end_col; + + row_size = end_col - start_col; + + /* Before starting, make sure all previous DMA transfers are completed so + * that all the LS buffers are known to be available. 
+ */ + DMA_WAIT_RECEIVE(); + + for (src=0; srcblk_rows[src]; + if ( src != dst ) { +#ifdef MATRIX_4GB_CROSSING + unsigned int src_hi = a_hi, src_lo = a_lo, dst_hi = a_hi, dst_lo = a_lo; + EA_UADD64(src_hi, src_lo, (unsigned int)src >> (32-9), (unsigned int)src << 9); + EA_UADD64(dst_hi, dst_lo, (unsigned int)dst >> (32-9), (unsigned int)dst << 9); + row_B_to_B(src_hi, src_lo, dst_hi, dst_lo, lda, skip, row_size); +#else + row_B_to_B(a_hi, a_lo + (src * (M_SUB * sizeof(double))), + a_hi, a_lo + (dst * (M_SUB * sizeof(double))), + lda, skip, row_size); +#endif + } + } + } else { + DMA_WAIT_RECEIVE(); + } + + /* Report completion status if requested. + */ + report_completion(id, spu_extract(a_incomplete, 1), 0); +} Index: accel/lib/spu/accel_utils.h =================================================================== RCS file: accel/lib/spu/accel_utils.h diff -N accel/lib/spu/accel_utils.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/accel_utils.h 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,173 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#ifndef _ACCEL_UTILS_H_ +#define _ACCEL_UTILS_H_ + +#include +#include + +#define LIKELY(COND) __builtin_expect(COND, 1) +#define UNLIKELY(COND) __builtin_expect(COND, 0) + +/* The waiting for DMA has been broken into two parts. + * 1) DMA_WAIT_REQUEST - Make a channel request for the wait by + * setting the tag mask and writing to the + * tag update channel. + * 2) DMA_WAIT_RECEIVE - Reading the tag status. + * + * The two parts need to be separated by 36 cycles to avoid + * stalling even when no DMAs are still in flight. 
+ */ +#define DMA_WAIT_REQUEST(_mask) spu_writech(MFC_WrTagMask, _mask); \ + spu_writech(MFC_WrTagUpdate, MFC_TAG_UPDATE_ALL); +#define DMA_WAIT_RECEIVE() (void)spu_readch(MFC_RdTagStat); + + +/* The simplified DMA_WAIT is used by the reformatting routines since + * these are not computation bound and do not benefit from splitting + * the wait into two parts. Note that it expands to several statements, so it must not be used as the unbraced body of an if/else or loop. + */ +#define DMA_WAIT(_mask) DMA_WAIT_REQUEST(_mask); \ + DMA_WAIT_RECEIVE(); + + + +/* Add the unsigned 32-bit _addend to the 64 bit effective address _eah,_eal. + */ +#define EA_UADD32(_eah, _eal, _addend) { \ + unsigned int _a; \ + \ + _a = _addend; \ + _eah += spu_extract(spu_genc(spu_promote(_eal, 0), \ + spu_promote(_a, 0)), 0); \ + _eal += _a; \ +} + +/* Add the signed 32-bit _addend to the 64 bit effective address _eah,_eal. + */ +#define EA_ADD32(_eah, _eal, _addend) { \ + vec_uint4 _va; \ + \ + _va = spu_promote((unsigned int)_addend, 0); \ + _eah = spu_extract(spu_addx(spu_promote(_eah, 0), \ + spu_rlmaska(_va, -31), \ + spu_genc(spu_promote(_eal, 0), _va)), 0); \ + _eal += spu_extract(_va, 0); \ +} + + + +/* Add the unsigned 64-bit addend specified by _ah,_al to the 64 bit effective + * address _eah,_eal. + */ +#define EA_UADD64(_eah, _eal, _ah, _al) { \ + vec_uint4 _vah, _val; \ + \ + _vah = spu_promote((unsigned int)_ah, 0); \ + _val = spu_promote((unsigned int)_al, 0); \ + _eah = spu_extract(spu_addx(spu_promote(_eah, 0), \ + _vah, \ + spu_genc(spu_promote(_eal, 0), _val)), 0);\ + _eal += spu_extract(_val, 0); \ +} + + + +/* Multiply two unsigned 32-bit values, _m1 and _m2, and return the 64-bit product + * in _ph,_pl. 
+ */ +#define EA_UMUL32(_ph, _pl, _m1, _m2) \ +{ \ + vec_uint4 _vll, _vlh, _vhl, _vh, _vl, _v0, _v1; \ + vec_ushort8 _va, _vb, _vb2; \ + \ + _va = (vec_ushort8)spu_promote(_m1, 0); \ + _vb = (vec_ushort8)spu_promote(_m2, 0); \ + _vb2 = (vec_ushort8)spu_rl((vec_uint4)_vb, 16); \ + \ + _vll = spu_mulo(_va, _vb); \ + _vlh = spu_mulo(_va, _vb2); \ + _vhl = spu_mule(_va, _vb2); \ + \ + _vh = spu_mhhadd(_va, _vb, spu_add(spu_rlmask(_vhl, -16), spu_rlmask(_vlh, -16))); \ + \ + _v0 = spu_sl(_vhl, 16); \ + _v1 = spu_sl(_vlh, 16); \ + \ + _vh = spu_add(_vh, spu_genc(_v1, _v0)); \ + _vl = spu_add(_v1, _v0); \ + _vh = spu_add(_vh, spu_genc(_vl, _vll)); \ + _vl = spu_add(_vl, _vll); \ + \ + _ph = spu_extract(_vh, 0); \ + _pl = spu_extract(_vl, 0); \ +} + +/* Multiply two unsigned 32-bit values, _m1 and _m2, and add the 64-bit product to + * the 64-bit effective address. + */ +#define EA_UMADD32(_eah, _eal, _m1, _m2) { \ + unsigned int _ph, _pl; \ + EA_UMUL32(_ph, _pl, _m1, _m2); \ + EA_UADD64(_eah, _eal, _ph, _pl); \ +} + + +#ifdef PANEL_4GB_CROSSING +#define PANEL_EA_ADD32(_eah, _eal, _addend) EA_ADD32(_eah, _eal, _addend) +#define PANEL_EA_UADD32(_eah, _eal, _addend) EA_UADD32(_eah, _eal, _addend) +#else +#define PANEL_EA_ADD32(_eah, _eal, _addend) _eal += _addend; +#define PANEL_EA_UADD32(_eah, _eal, _addend) _eal += _addend; +#endif + +#ifdef MATRIX_4GB_CROSSING +#define MATRIX_EA_ADD32(_eah, _eal, _addend) EA_ADD32(_eah, _eal, _addend) +#define MATRIX_EA_UADD32(_eah, _eal, _addend) EA_UADD32(_eah, _eal, _addend) +#define MATRIX_EA_UMADD32(_eah, _eal, _m1, _m2) EA_UMADD32(_eah, _eal, _m1, _m2) +#else +#define MATRIX_EA_ADD32(_eah, _eal, _addend) _eal += _addend; +#define MATRIX_EA_UADD32(_eah, _eal, _addend) _eal += _addend; +#define MATRIX_EA_UMADD32(_eah, _eal, _m1, _m2) _eal += _m1 * _m2; +#endif + + +/* report_completion + * ----------------- + * Write a byte to system memory to report that the requested operation + * has been completed by the specified SPE. 
The DMA put is fenced using + * the specified tag ID so that the writeback is ordered with respect + * to the results posted to system memory. Callers MUST ensure that the + * tag ID is the same as the one used by the DMA for the results. + */ +static vec_uchar16 completion_writeback = (vec_uchar16){0}; + +static inline void report_completion(int id, + unsigned long long incomplete_ea, + unsigned int tag) +{ + unsigned int incomplete_hi, incomplete_lo; + unsigned int size; + void *lsa; + + incomplete_lo = mfc_ea2l(incomplete_ea); + incomplete_hi = mfc_ea2h(incomplete_ea); + + size = 1 & ~(spu_extract(spu_cmpeq(spu_or(spu_promote(incomplete_hi, 0), + spu_promote(incomplete_lo, 0)), 0), 0)); + + incomplete_lo += id; + + lsa = ((void *)&completion_writeback) + (incomplete_lo & 0xF);; + + spu_mfcdma64(lsa, incomplete_hi, incomplete_lo, size, tag, MFC_PUTF_CMD); +} + + +#endif /* _ACCEL_UTILS_H_ */ + + + Index: accel/lib/spu/hpl_accel_spu.c =================================================================== RCS file: accel/lib/spu/hpl_accel_spu.c diff -N accel/lib/spu/hpl_accel_spu.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/spu/hpl_accel_spu.c 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,60 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel_spu.h" +#include "accel_utils.h" +#include "accel_spu.h" + +volatile hpl_accel_init_parms_t parms; + +volatile unsigned char cmd_parms[128] __attribute__ ((aligned (128))); + + +int main(unsigned long long speid __attribute__ ((unused)), + unsigned long long parms_ea) +{ + unsigned int cmd; + uint64_t cmd_queue; + + /* Fetch the global parameters + */ + + mfc_get(&parms, parms_ea, sizeof(parms), HPL_ACCEL_PARM_TAG, 0, 0); + DMA_WAIT(1 << HPL_ACCEL_PARM_TAG); + + while (1) { + cmd = spu_readch(SPU_RdInMbox); + + /* Fetch the 
command parameters + */ + cmd_queue = parms.cmd_base + (cmd & ~HPL_ACCEL_CMD_MASK); + + mfc_get((volatile void *)cmd_parms, cmd_queue, 128, HPL_ACCEL_PARM_TAG, 0, 0); + + DMA_WAIT_REQUEST(1< +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" + +int norepeat_rand_row(int max, int *rows, int cnt) +{ + int i; + int new_row; + int unique; + + do { + new_row = (max * (rand() & 0xFFFF)) >> 16; + unique = 1; + + for (i=0; i=0; i--) { + if (B1[i] != B2[i]) { + errors++; + if (errors < 20) printf("B1<->B2 %d expected=%f got=%f\n", i, B1[i], B2[i]); + } + } + + printf("Errors = %d\n", errors); + + hpl_accel_fini(); + + return ((errors) ? 1 : 0); +} Index: accel/lib/tests/dgemm.c =================================================================== RCS file: accel/lib/tests/dgemm.c diff -N accel/lib/tests/dgemm.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/tests/dgemm.c 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,113 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" + + +#define EPSILON 0.0000001 + + +/* dgemm + */ +int main(int argc, char *argv[]) +{ + int i; + int errors; + int lda = 0; + int ldb = 0; + int ldc = 0; + int k=128; + int n=128; + int m=128; + volatile unsigned long long incomplete; + double *A, *B, *C1, *C2; + + switch (argc) { + case 6: + ldc = atoi(argv[5]); + case 5: + ldb = atoi(argv[4]); + case 4: + lda = atoi(argv[3]); + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + break; + case 1: + /* No parameters, randomly select a parameter set. 
+ */ + srand((unsigned int)__mftb()); + m = rand() % 1280; + n = rand() % 1280; + lda = rand() % 2048; + ldb = rand() % 2048; + ldc = rand() % (64*2048); + break; + default: + printf("Usage: %s [m [n [lda [ldb [ldc]]]]]\n", argv[0]); + return 1; + break; + } + + /* Apply functional constraints to the parameter set. + */ + m = m & ~(64-1); + if (m < 64) m = 64; + + n = n & ~(64-1); + if (n < 64) n = 64; + + lda = lda & ~(15); + if (lda < m) lda = m; + + ldb = ldb & ~(15); + if (ldb < n) ldb = n; + + ldc = ldc & ~(15); + if (ldc < 64*m) ldc = 64*m; + + printf("Performing dgemm test with m=%d n=%d k=%d lda=%d ldb=%d ldc=%d\n", m, n, k, lda, ldb, ldc); + + /* Allocate and initialize the arrays + */ + A = (double *)allocate_panel(k, lda, 128); + B = (double *)allocate_panel(k, ldb, 128); + C1 = (double *)allocate_matrix(n/64, ldc, 128); + C2 = (double *)allocate_matrix(n/64, ldc, 128); + + for (i=0; i=0; i--) { + if (fabs(C1[i] - C2[i]) > EPSILON) { + errors++; + printf(" %d expected=%f got=%f\n", i, C1[i], C2[i]); + } + } + printf("Errors = %d\n", errors); + + return ((errors) ? 
1 : 0); +} Index: accel/lib/tests/dgemm_CL_B_B.c =================================================================== RCS file: accel/lib/tests/dgemm_CL_B_B.c diff -N accel/lib/tests/dgemm_CL_B_B.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/tests/dgemm_CL_B_B.c 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,113 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" + + +#define EPSILON 0.0000001 + + +/* dgemm + */ +int main(int argc, char *argv[]) +{ + int i; + int errors; + int lda = 0; + int ldb = 0; + int ldc = 0; + int k=128; + int n=128; + int m=128; + volatile unsigned long long incomplete; + double *A, *B, *C1, *C2; + + switch (argc) { + case 6: + ldc = atoi(argv[5]); + case 5: + ldb = atoi(argv[4]); + case 4: + lda = atoi(argv[3]); + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + break; + case 1: + /* No parameters, randomly select a parameter set. + */ + srand((unsigned int)__mftb()); + m = rand() % 1280; + n = rand() % 1280; + lda = rand() % 2048; + ldb = rand() % 16384; + ldc = rand() % (64*2048); + break; + default: + printf("Usage: %s [m [n [lda [ldb [ldc]]]]]\n", argv[0]); + return 1; + break; + } + + /* Apply functional constraints to the parameter set. 
+ */ + m = m & ~(64-1); + if (m < 64) m = 64; + + n = n & ~(64-1); + if (n < 64) n = 64; + + lda = lda & ~(15); + if (lda < m) lda = m; + + ldb = ldb & ~(15); + if (ldb < 64*k) ldb = 64*k; + + ldc = ldc & ~(15); + if (ldc < 64*m) ldc = 64*m; + + printf("Performing dgemm test with m=%d n=%d k=%d lda=%d ldb=%d ldc=%d\n", m, n, k, lda, ldb, ldc); + + /* Allocate and initialize the arrays + */ + A = (double *)allocate_panel(k, lda, 128); + B = (double *)allocate_matrix(n/64, ldb, 128); + C1 = (double *)allocate_matrix(n/64, ldc, 128); + C2 = (double *)allocate_matrix(n/64, ldc, 128); + + for (i=0; i=0; i--) { + if (fabs(C1[i] - C2[i]) > EPSILON) { + errors++; + printf(" %d expected=%f got=%f\n", i, C1[i], C2[i]); + } + } + printf("Errors = %d\n", errors); + + hpl_accel_fini(); + + return ((errors) ? 1 : 0); +} Index: accel/lib/tests/dgemm_CL_B_B_CL.c =================================================================== RCS file: accel/lib/tests/dgemm_CL_B_B_CL.c diff -N accel/lib/tests/dgemm_CL_B_B_CL.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/tests/dgemm_CL_B_B_CL.c 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,179 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" + + +#define EPSILON 0.0000001 + +unsigned long long dab(double d) +{ + union { + unsigned long long ull; + double d; + } x; + x.d = d; + return (x.ull); +} + + +/* dgemm + */ +int main(int argc, char *argv[]) +{ + int i; + int errors; + int lda = 0; + int ldb = 0; + int ldc = 0; + int ldp = 0; + int c_col = 0; + int c_row = 0; + int c_cols, c_rows; + int k=128; + int n=128; + int m=128; + int csize, psize, bsize; + volatile unsigned long long incomplete; + double *A, *B, *C1, *C2; + double *P1 = NULL; + double *P2 = NULL; + + switch (argc) { + case 9: + c_col = 
atoi(argv[8]); + case 8: + c_row = atoi(argv[7]); + case 7: + ldp = atoi(argv[6]); + case 6: + ldc = atoi(argv[5]); + case 5: + ldb = atoi(argv[4]); + case 4: + lda = atoi(argv[3]); + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + break; + case 1: + /* No parameters, randomly select a parameter set. + */ + srand((unsigned int)__mftb()); + m = rand() % 1024; + n = rand() % 1024; + lda = rand() % 1200; + ldb = rand() % 1200; + ldc = rand() % 1200; + if (rand() & 1) ldp = rand() % 1200; + c_row = rand() % 150; + c_col = rand() % 150; + + if (rand() & 1) { + lda &= ~1; + ldb &= ~1; + ldc &= ~1; + ldp &= ~1; + c_row = 0; + c_col = 0; + } + + break; + default: + printf("Usage: %s [m [n [lda [ldb [ldc [ldp [c_col [c_row]]]]]]]]\n", argv[0]); + return 1; + break; + } + + /* Apply functional constraints to the parameter set. + */ + if (lda < m) lda = m; + lda = (lda + 15) & ~(15); + + if (ldb < M_SUB*k) ldb = M_SUB*k; + ldb = (ldb + 15) & ~(15); + + if (ldc < m) ldc = m; + ldc = (ldc + 15) & ~(15); + + if (ldp) { + if (ldp < m) ldp = m; + ldp = (ldp + 15) & ~(15); + } + + c_cols = c_col + n; + c_rows = c_row + m; + + if (ldc < c_rows*64) ldc = c_rows*64; + ldc = ((ldc + 63) & ~63); + c_cols = (c_cols + 63) & ~63; + + csize = ldc*c_cols/64; + psize = ldp*n; + + bsize = ldb*(n+M_SUB-1)/M_SUB; + + printf("Performing dgemm test with m=%d n=%d lda=%d ldb=%d ldc=%d ldp=%d c_row=%d c_col=%d\n", m, n, lda, ldb, ldc, ldp, c_row, c_col); + + /* Allocate and initialize the arrays + */ + A = (double *)allocate_panel(k, lda, 128); + B = (double *)allocate_matrix(n+63, ldb, 128); + if (ldp) { + C1 = C2 = (double *)allocate_matrix(c_cols, ldc, 128); + P1 = (double *)allocate_panel(n, ldp, 128); + P2 = (double *)allocate_panel(n, ldp, 128); + for (i=0; i=0; i--) { + double p1, p2; + + p1 = byte_swap(P1[i]); + p2 = byte_swap(P2[i]); + + if (fabs(p1 - p2) > EPSILON) { + if (errors++ < 20) printf(" %d expected=%f got=%f\n", i, p1, p2); + } + } + } else { + for (i=csize, 
errors=0; i>=0; i--) { + if (fabs(C1[i] - C2[i]) > EPSILON) { + if (errors++ < 20) printf(" %d expected=%f got=%f\n", i, C1[i], C2[i]); + } + } + } + printf("Errors = %d\n", errors); + + hpl_accel_fini(); + + return ((errors) ? 1 : 0); +} Index: accel/lib/tests/dgemm_CL_C_C.c =================================================================== RCS file: accel/lib/tests/dgemm_CL_C_C.c diff -N accel/lib/tests/dgemm_CL_C_C.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/tests/dgemm_CL_C_C.c 14 May 2008 21:35:01 -0000 1.3 @@ -0,0 +1,126 @@ +/* ------------------------------------------------------------------ */ +/* (C) Copyright 2007 */ +/* International Business Machines Corporation, */ +/* */ +/* All Rights Reserved. */ +/* ------------------------------------------------------------------ */ + +#include +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" + + +#define EPSILON 0.0000001 + + +/* dgemm_CL_C_C + */ +int main(int argc, char *argv[]) +{ + int i; + int errors; + int lda = 0; + int ldb = 0; + int ldc = 0; + int k=64; + int n=64; + int m=128; + volatile unsigned long long incomplete; + double *A, *B, *C1, *C2; + + switch (argc) { + case 7: + ldc = atoi(argv[6]); + case 6: + ldb = atoi(argv[5]); + case 5: + lda = atoi(argv[4]); + case 4: + k = atoi(argv[3]); + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + break; + case 1: + /* No parameters, randomly select a parameter set. 
+ */ + srand((unsigned int)__mftb()); + m = rand() % 2000; + n = rand() % 70; + k = rand() % 70; + lda = rand() % 2000; + ldb = rand() % 200; + ldc = rand() % 2000; + + /* Force all parameter within constraints */ + if ((rand() & 1) == 0) { + k &= ~(4-1); + m &= ~(8-1); + n &= ~(4-1); + + if (k < 4) k = 4; + if (k > 64) k = 64; + if (m < 8) m = 8; + if (n < 4) n = 4; + if (n > 64) n = 64; + + lda &= ~1; + ldb &= ~1; + ldc &= ~1; + } + break; + default: + printf("Usage: %s [m [n [k [lda [ldb [ldc]]]]]]\n", argv[0]); + return 1; + break; + } + + /* Apply functional constraints to the parameter set. + */ + if (m == 0) m = 1; + if (n == 0) n = 1; + if (k == 0) k = 1; + + if (lda < m) lda = m; + if (ldb < k) ldb = k; + if (ldc < m) ldc = m; + + printf("Performing dgemm_CL_C_C test with m=%d n=%d k=%d lda=%d ldb=%d ldc=%d\n", m, n, k, lda, ldb, ldc); + + /* Allocate and initialize the arrays + */ + A = (double *)allocate_panel(k, lda, 128); + B = (double *)allocate_panel(n, ldb, 128); + C1 = (double *)allocate_panel(n, ldc, 128); + C2 = (double *)allocate_panel(n, ldc, 128); + + for (i=0; i=0; i--) { + if (fabs(C1[i] - C2[i]) > EPSILON) { + errors++; + printf(" %d expected=%f got=%f\n", i, C1[i], C2[i]); + } + } + printf("Errors = %d\n", errors); + + return ((errors) ? 
1 : 0); +} Index: accel/lib/tests/dgemm_CL_R_B_CL.c =================================================================== RCS file: accel/lib/tests/dgemm_CL_R_B_CL.c diff -N accel/lib/tests/dgemm_CL_R_B_CL.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/tests/dgemm_CL_R_B_CL.c 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,177 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" + + +#define EPSILON 0.0000001 + +unsigned long long dab(double d) +{ + union { + unsigned long long ull; + double d; + } x; + x.d = d; + return (x.ull); +} + + +/* dgemm + */ +int main(int argc, char *argv[]) +{ + int i; + int errors; + int lda = 0; + int ldb = 0; + int ldc = 0; + int ldp = 0; + int c_col = 0; + int c_row = 0; + int c_cols, c_rows; + int k=128; + int n=128; + int m=128; + int csize, psize; + volatile unsigned long long incomplete; + double *A, *B, *C1, *C2; + double *P1 = NULL; + double *P2 = NULL; + + switch (argc) { + case 9: + c_col = atoi(argv[8]); + case 8: + c_row = atoi(argv[7]); + case 7: + ldp = atoi(argv[6]); + case 6: + ldc = atoi(argv[5]); + case 5: + ldb = atoi(argv[4]); + case 4: + lda = atoi(argv[3]); + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + break; + case 1: + /* No parameters, randomly select a parameter set. 
+ */ + srand((unsigned int)__mftb()); + m = rand() % 1024; + n = rand() % 1024; + lda = rand() % 1536; + ldb = rand() % 1536; + ldc = rand() % 1536; + if (rand() & 1) ldp = rand() % 1536; + c_row = rand() % 256; + c_col = rand() % 256; + + if (rand() & 1) { + lda &= ~1; + ldb &= ~1; + ldc &= ~1; + ldp &= ~1; + c_row = 0; + c_col = 0; + } + + break; + default: + printf("Usage: %s [m [n [lda [ldb [ldc [ldp [c_col [c_row]]]]]]]]\n", argv[0]); + return 1; + break; + } + + /* Apply functional constraints to the parameter set. + */ + if (lda < m) lda = m; + lda = (lda + 15) & ~(15); + + if (ldb < n) ldb = n; + ldb = (ldb + 15) & ~(15); + + if (ldc < m) ldc = m; + ldc = (ldc + 15) & ~(15); + + if (ldp) { + if (ldp < m) ldp = m; + ldp = (ldp + 15) & ~(15); + } + + c_cols = c_col + n; + c_rows = c_row + m; + + if (ldc < c_rows*64) ldc = c_rows*64; + ldc = ((ldc + 63) & ~63); + c_cols = (c_cols + 63) & ~63; + + csize = ldc*c_cols/64; + psize = ldp*n; + + printf("Performing dgemm test with m=%d n=%d lda=%d ldb=%d ldc=%d ldp=%d c_row=%d c_col=%d\n", m, n, lda, ldb, ldc, ldp, c_row, c_col); + + /* Allocate and initialize the arrays + */ + A = (double *)allocate_panel(k, lda, 128); + B = (double *)allocate_panel(k, ldb, 128); + if (ldp) { + C1 = C2 = (double *)allocate_matrix(c_cols, ldc, 128); + P1 = (double *)allocate_panel(n, ldp, 128); + P2 = (double *)allocate_panel(n, ldp, 128); + for (i=0; i=0; i--) { + double p1, p2; + + p1 = byte_swap(P1[i]); + p2 = byte_swap(P2[i]); + + if (fabs(p1 - p2) > EPSILON) { + if (errors++ < 20) printf(" %d expected=%f got=%f\n", i, p1, p2); + } + } + } else { + for (i=csize, errors=0; i>=0; i--) { + if (fabs(C1[i] - C2[i]) > EPSILON) { + if (errors++ < 20) printf(" %d expected=%f got=%f\n", i, C1[i], C2[i]); + } + } + } + printf("Errors = %d\n", errors); + + hpl_accel_fini(); + + return ((errors) ? 
1 : 0); +} Index: accel/lib/tests/dgemm_C_C_C.c =================================================================== RCS file: accel/lib/tests/dgemm_C_C_C.c diff -N accel/lib/tests/dgemm_C_C_C.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/tests/dgemm_C_C_C.c 20 Aug 2008 03:57:53 -0000 1.4 @@ -0,0 +1,126 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" + + +#define EPSILON 0.0000001 + + +/* dgemm_C_C_C + */ +int main(int argc, char *argv[]) +{ + int i; + int errors; + int lda = 0; + int ldb = 0; + int ldc = 0; + int k=64; + int n=64; + int m=128; + volatile unsigned long long incomplete; + double *A, *B, *C1, *C2; + + switch (argc) { + case 7: + ldc = atoi(argv[6]); + case 6: + ldb = atoi(argv[5]); + case 5: + lda = atoi(argv[4]); + case 4: + k = atoi(argv[3]); + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + break; + case 1: + /* No parameters, randomly select a parameter set. + */ + srand((unsigned int)__mftb()); + m = rand() % 2000; + n = rand() % 70; + k = rand() % 70; + lda = rand() % 2000; + ldb = rand() % 200; + ldc = rand() % 2000; + + /* Force all parameter within constraints */ + if ((rand() & 1) == 0) { + k &= ~(4-1); + m &= ~(8-1); + n &= ~(4-1); + + if (k < 4) k = 4; + if (k > 64) k = 64; + if (m < 8) m = 8; + if (n < 4) n = 4; + if (n > 64) n = 64; + + lda &= ~1; + ldb &= ~1; + ldc &= ~1; + } + break; + default: + printf("Usage: %s [m [n [k [lda [ldb [ldc]]]]]]\n", argv[0]); + return 1; + break; + } + + /* Apply functional constraints to the parameter set. 
+ */ + if (m == 0) m = 1; + if (n == 0) n = 1; + if (k == 0) k = 1; + + if (lda < m) lda = m; + if (ldb < k) ldb = k; + if (ldc < m) ldc = m; + + printf("Performing dgemm_C_C_C test with m=%d n=%d k=%d lda=%d ldb=%d ldc=%d\n", m, n, k, lda, ldb, ldc); + + /* Allocate and initialize the arrays + */ + A = (double *)allocate_panel(k, lda, 128); + B = (double *)allocate_panel(n, ldb, 128); + C1 = (double *)allocate_panel(n, ldc, 128); + C2 = (double *)allocate_panel(n, ldc, 128); + + for (i=0; i=0; i--) { + if (fabs(C1[i] - C2[i]) > EPSILON) { + errors++; + printf(" %d expected=%f got=%f\n", i, C1[i], C2[i]); + } + } + printf("Errors = %d\n", errors); + + hpl_accel_fini(); + + return ((errors) ? 1 : 0); +} Index: accel/lib/tests/dtrsm.c =================================================================== RCS file: accel/lib/tests/dtrsm.c diff -N accel/lib/tests/dtrsm.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/tests/dtrsm.c 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,147 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" + +#define EPSILON 0.0000001 + + +/* dtrsm + */ +int main(int argc, char *argv[]) +{ + int i; + int errors; + int lda = 0; + int ldb = 0; + int ldc = 0; + int n=128; + int n_padded, nb, m_padded; + int m=128; + unsigned int blk_col = 0; + unsigned int blk_row = 0; + volatile unsigned long long incomplete; + double *A, *B1, *B2, *C1, *C2; + + switch (argc) { + case 8: + blk_col = atoi(argv[7]); + case 7: + blk_row = atoi(argv[6]); + case 6: + ldc = atoi(argv[5]); + case 5: + ldb = atoi(argv[4]); + case 4: + lda = atoi(argv[3]); + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + break; + case 1: + /* No parameters, randomly select a parameter set. 
+ */ + srand((unsigned int)__mftb()); + m = ((rand() & 3) == 0) ? (rand() % 1024) : 128; + n = rand() % 1024; + lda = rand() % 1536; + ldb = rand() % 1536; + ldc = ((rand() & 3) == 0) ? (rand() % 1536) : 0; + if ((rand() & 7) == 0) { + blk_row = rand() & 127; + blk_col = rand() & 127; + } + break; + default: + printf("Usage: %s [m [n [lda [ldb [ldc [blk_row [blk_col]]]]]]]\n", argv[0]); + return 1; + break; + } + + /* Apply functional constraints to the parameter set. + */ + if ((rand() & 7) != 0) lda &= ~1; + if (lda < m) lda = m; + + if ((rand() & 7) != 0) ldb &= ~1; + if (ldb < n) ldb = n; + + + if (ldc) { + ldc = (ldc + 1) & ~(1); + m_padded = blk_row + m; + if (ldc < 64*m_padded) ldc = 64*m_padded; + } else { + blk_row = blk_col = 0; + } + + n_padded = (n + blk_col + 63) & ~63; + + printf("Performing dtrsm test with m=%d n=%d lda=%d ldb=%d ldc=%d blk_row=%d blk_col=%d\n", m, n, lda, ldb, ldc, blk_row, blk_col); + + /* Allocate and initialize the arrays + */ + + hpl_ref_init(); + hpl_accel_init(); + + /* First test the DRTSM without copy into the C matrix. + */ + A = (double *)allocate_panel(m, lda, 128); + B1 = (double *)allocate_panel(m, ldb, 128); + + for (i=0; i=0; i--) { + if (fabs(C1[i] - C2[i]) > EPSILON) { + errors++; + if (errors < 20) printf(" %d expected=%f got=%f\n", i, C1[i], C2[i]); + } + } + printf("Errors (with copy) = %d\n", errors); + } else { + for (i=ldb*m-1, errors=0; i>=0; i--) { + if (fabs(B1[i] - B2[i]) > EPSILON) { + errors++; + if (errors < 20) printf(" %d expected=%f got=%f\n", i, B1[i], B2[i]); + } + } + printf("Errors (without copy) = %d\n", errors); + if (errors) return 1; + } + + hpl_accel_fini(); + + return ((errors) ? 
1 : 0); +} Index: accel/lib/tests/dtrsm_CL_B.c =================================================================== RCS file: accel/lib/tests/dtrsm_CL_B.c diff -N accel/lib/tests/dtrsm_CL_B.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/tests/dtrsm_CL_B.c 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,121 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" + +#define EPSILON 0.0000001 + + +/* dtrsm + */ +int main(int argc, char *argv[]) +{ + int i; + int errors; + int lda = 0; + int ldb = 0; + int n=128; + int n_padded, nb, m_padded; + int m=128; + unsigned int blk_col = 0; + unsigned int blk_row = 0; + volatile unsigned long long incomplete; + double *A, *B1, *B2; + + switch (argc) { + case 7: + blk_col = atoi(argv[6]); + case 6: + blk_row = atoi(argv[5]); + case 5: + ldb = atoi(argv[4]); + case 4: + lda = atoi(argv[3]); + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + break; + case 1: + /* No parameters, randomly select a parameter set. + */ + srand((unsigned int)__mftb()); + m = ((rand() & 3) == 0) ? (rand() % 1024) : 128; + n = rand() % 1024; + lda = rand() % 1536; + ldb = rand() % 1536; + if ((rand() & 7) == 0) { + blk_row = rand() & 127; + blk_col = rand() & 127; + } + break; + default: + printf("Usage: %s [m [n [lda [ldb [blk_row [blk_col]]]]]]]\n", argv[0]); + return 1; + break; + } + + /* Apply functional constraints to the parameter set. 
+ */ + if ((rand() & 7) != 0) lda &= ~1; + if (lda < m) lda = m; + + if ((rand() & 7) != 0) ldb &= ~1; + if (ldb < n) ldb = n; + + + ldb = (ldb + 1) & ~(1); + m_padded = (m + blk_row + 63) & ~63; + + if (ldb < 64*m_padded) ldb = 64*m_padded; + + n_padded = (n + blk_col + 63) & ~63; + + printf("Performing dtrsm_CL_B test with m=%d n=%d lda=%d ldb=%d blk_row=%d blk_col=%d\n", m, n, lda, ldb, blk_row, blk_col); + + /* Allocate and initialize the arrays + */ + + hpl_ref_init(); + hpl_accel_init(); + + /* First test the DRTSM without copy into the C matrix. + */ + A = (double *)allocate_panel(m, lda, 128); + + for (i=0; i=0; i--) { + if (fabs(B1[i] - B2[i]) > EPSILON) { + errors++; + if (errors < 20) printf(" %d expected=%f got=%f\n", i, B1[i], B2[i]); + } + } + printf("Errors = %d\n", errors); + + hpl_accel_fini(); + + return ((errors) ? 1 : 0); +} Index: accel/lib/tests/perf_dgemm.c =================================================================== RCS file: accel/lib/tests/perf_dgemm.c diff -N accel/lib/tests/perf_dgemm.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/tests/perf_dgemm.c 20 Aug 2008 03:57:53 -0000 1.2 @@ -0,0 +1,182 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" +#include +#include +#include +#include +#include + +/* dgemm performance test + */ +int main(int argc, char *argv[]) +{ + int i; + int m=64; + int n=64; + int k=128; + int m_padded, n_padded; + int iterations = 1; + int ldp = 0; + volatile unsigned long long incomplete; + void *ptr; + char *env; + double *A, *B, *C, *P = NULL; + double tbfreq, gflops; + unsigned long long *ticks; + unsigned long long total; + double min, max, mean, std, delta; + + switch (argc) { + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + case 1: + break; + 
default: + printf("Usage: %s [n]\n", argv[0]); + return 1; + break; + } + + m_padded = (m + 63) & ~63; + n_padded = (n + 63) & ~63; + + if ((env = getenv("ITERATIONS"))) + iterations = atoi(env); + ticks = (unsigned long long *)malloc(iterations * sizeof(unsigned long long)); + + /* Allocate and initialize the arrays + */ + if (getenv("HUGE_TLBFS")) { + size_t memsize = 4*128 + (128*(m_padded+n_padded) + ((size_t)m_padded*(size_t)n_padded)) * sizeof(double); + size_t hugepagesize = 16*1024*1024; + int fd; + void *mem = NULL; + char filename[100]; + + if (getenv("PANEL")) { + memsize += 128 + ((size_t)m_padded * (size_t)n_padded)*sizeof(double); + } + + sprintf(filename, "/huge/perf_dgemm_%d.dat", getpid()); + + if ((fd = open (filename, O_CREAT | O_RDWR, 0755)) == -1) { + printf("open for huge page file %s failed (errno=%d): %s\n", filename, errno, strerror(errno)); + exit(1); + } else { + /* Delete file so that huge pages will get freed on program termination. */ + remove(filename); + + memsize = ( memsize + hugepagesize-1) & ~(hugepagesize-1); + + mem = mmap((void *)(0x100000000ULL), memsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (mem == MAP_FAILED) { + printf("mmap for %lld bytes in huge page file %s failed (errno=%d): %s\n", + (unsigned long long)memsize, filename, errno, strerror(errno)); + exit(1); + } + + A = (double *)ALIGN128(mem); + B = (double *)ALIGN128(A + 128*m_padded); + C = (double *)ALIGN128(B + 128*n_padded); + if (getenv("PANEL")) { + ldp = m_padded; + P = (double *)ALIGN128(C + m_padded*n_padded); + } + + /* Closing the file descriptor does not unmap the region, so let's just take care of that right away */ + close (fd); + } + } else { + if (posix_memalign(&ptr, 128, 128*m_padded*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + A = (double *)ptr; + } + if (posix_memalign(&ptr, 128, 128*n_padded*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + B = (double *)ptr; + } + if 
(posix_memalign(&ptr, 128, m_padded*n_padded*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + C = (double *)ptr; + } + if (getenv("PANEL")) { + ldp = m_padded; + if (posix_memalign(&ptr, 128, m_padded*n_padded*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + P = (double *)ptr; + } + } + } + + for (i=0; i<128*m_padded; i++) { + A[i] = 0.0f; + __dcbf(&A[i]); + } + for (i=0; i<128*n_padded; i++) { + B[i] = 0.0f; + __dcbf(&B[i]); + } + for (i=0; i max) max = ticks[i]; + } + std = sqrt(std/(double)(iterations)); + + + printf("DGEMM m=%d n=%d MIN=%f MAX=%f MEAN=%f ticks Std Dev=%f Variance=%f%% %f Gflops/sec\n", m, n, + min, max, mean, std, 100.0*((double)(max-min))/((double)mean), gflops); + + + return 0; +} Index: accel/lib/tests/perf_dgemm_C.c =================================================================== RCS file: accel/lib/tests/perf_dgemm_C.c diff -N accel/lib/tests/perf_dgemm_C.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/tests/perf_dgemm_C.c 20 Aug 2008 03:57:53 -0000 1.2 @@ -0,0 +1,161 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" +#include +#include +#include +#include +#include + +/* dgemm_C_C_C performance test + */ +int main(int argc, char *argv[]) +{ + int i; + int m=64; + int n=64; + int k=64; + int m_padded, k_padded; + int iterations = 1; + volatile unsigned long long incomplete; + void *ptr; + char *env; + double *A, *B, *C; + double tbfreq, gflops; + unsigned long long *ticks; + unsigned long long total; + double min, max, mean, std, delta; + + switch (argc) { + case 4: + k = atoi(argv[3]); + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + case 1: + break; + default: + printf("Usage: %s [m [n [k]]]\n", argv[0]); + return 1; + 
break; + } + + k_padded = (k + 15) & ~15; + m_padded = (m + 15) & ~15; + + if ((env = getenv("ITERATIONS"))) + iterations = atoi(env); + ticks = (unsigned long long *)malloc(iterations * sizeof(unsigned long long)); + + /* Allocate and initialize the arrays + */ + if (getenv("HUGE_TLBFS")) { + size_t memsize = (m_padded*(k+n) + k_padded*n) * sizeof(double); + size_t hugepagesize = 16*1024*1024; + int fd; + void *mem = NULL; + char filename[100]; + + sprintf(filename, "/huge/perf_dgemm_C_%d.dat", getpid()); + + if ((fd = open (filename, O_CREAT | O_RDWR, 0755)) == -1) { + printf("open for huge page file %s failed (errno=%d): %s\n", filename, errno, strerror(errno)); + exit(1); + } else { + /* Delete file so that huge pages will get freed on program termination. */ + remove(filename); + + memsize = ( memsize + hugepagesize-1) & ~(hugepagesize-1); + + mem = mmap((void *)(0x100000000ULL), memsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (mem == MAP_FAILED) { + printf("mmap for %lld bytes in huge page file %s failed (errno=%d): %s\n", + (unsigned long long)memsize, filename, errno, strerror(errno)); + exit(1); + } + + A = (double *)ALIGN128(mem); + B = (double *)ALIGN128(A + m_padded*k); + C = (double *)ALIGN128(B + k_padded*n); + + /* Closing the file descriptor does not unmap the region, so let's just take care of that right away */ + close (fd); + } + } else { + if (posix_memalign(&ptr, 128, m_padded*k*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + A = (double *)ptr; + } + if (posix_memalign(&ptr, 128, k_padded*n*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + B = (double *)ptr; + } + if (posix_memalign(&ptr, 128, m_padded*n*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + C = (double *)ptr; + } + } + + for (i=0; i max) max = ticks[i]; + } + std = sqrt(std/(double)(iterations)); + + + printf("DGEMM_CL m=%d n=%d k=%d MIN=%f MAX=%f MEAN=%f ticks Std Dev=%f Variance=%f%% 
%f Gflops/sec\n", m, n, k, + min, max, mean, std, 100.0*((double)(max-min))/((double)mean), gflops); + + + return 0; +} Index: accel/lib/tests/perf_dgemm_CL.c =================================================================== RCS file: accel/lib/tests/perf_dgemm_CL.c diff -N accel/lib/tests/perf_dgemm_CL.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/tests/perf_dgemm_CL.c 14 May 2008 21:35:01 -0000 1.2 @@ -0,0 +1,163 @@ +/* ------------------------------------------------------------------ */ +/* (C) Copyright 2007 */ +/* International Business Machines Corporation, */ +/* */ +/* All Rights Reserved. */ +/* ------------------------------------------------------------------ */ + +#include +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" +#include +#include +#include +#include +#include + +/* dgemm performance test + */ +int main(int argc, char *argv[]) +{ + int i; + int m=64; + int n=64; + int k=64; + int m_padded, k_padded; + int iterations = 1; + volatile unsigned long long incomplete; + void *ptr; + char *env; + double *A, *B, *C; + double tbfreq, gflops; + unsigned long long *ticks; + unsigned long long total; + double min, max, mean, std, delta; + + switch (argc) { + case 4: + k = atoi(argv[3]); + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + case 1: + break; + default: + printf("Usage: %s [m [n [k]]]\n", argv[0]); + return 1; + break; + } + + k_padded = (k + 15) & ~15; + m_padded = (m + 15) & ~15; + + if ((env = getenv("ITERATIONS"))) + iterations = atoi(env); + ticks = (unsigned long long *)malloc(iterations * sizeof(unsigned long long)); + + /* Allocate and initialize the arrays + */ + if (getenv("HUGE_TLBFS")) { + size_t memsize = (m_padded*(k+n) + k_padded*n) * sizeof(double); + size_t hugepagesize = 16*1024*1024; + int fd; + void *mem = NULL; + char filename[100]; + + sprintf(filename, "/huge/perf_dgemm_CL_%d.dat", getpid()); + + if ((fd = open (filename, O_CREAT | O_RDWR, 0755)) == -1) { + 
printf("open for huge page file %s failed (errno=%d): %s\n", filename, errno, strerror(errno)); + exit(1); + } else { + /* Delete file so that huge pages will get freed on program termination. */ + remove(filename); + + memsize = ( memsize + hugepagesize-1) & ~(hugepagesize-1); + + mem = mmap((void *)(0x100000000ULL), memsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (mem == MAP_FAILED) { + printf("mmap for %lld bytes in huge page file %s failed (errno=%d): %s\n", + (unsigned long long)memsize, filename, errno, strerror(errno)); + exit(1); + } + + A = (double *)ALIGN128(mem); + B = (double *)ALIGN128(A + m_padded*k); + C = (double *)ALIGN128(B + k_padded*n); + + /* Closing the file descriptor does not unmap the region, so let's just take care of that right away */ + close (fd); + } + } else { + if (posix_memalign(&ptr, 128, m_padded*k*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + A = (double *)ptr; + } + if (posix_memalign(&ptr, 128, k_padded*n*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + B = (double *)ptr; + } + if (posix_memalign(&ptr, 128, m_padded*n*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + C = (double *)ptr; + } + } + + for (i=0; i max) max = ticks[i]; + } + std = sqrt(std/(double)(iterations)); + + + printf("DGEMM_CL m=%d n=%d k=%d MIN=%f MAX=%f MEAN=%f ticks Std Dev=%f Variance=%f%% %f Gflops/sec\n", m, n, k, + min, max, mean, std, 100.0*((double)(max-min))/((double)mean), gflops); + + + return 0; +} Index: accel/lib/tests/perf_dtrsm.c =================================================================== RCS file: accel/lib/tests/perf_dtrsm.c diff -N accel/lib/tests/perf_dtrsm.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/tests/perf_dtrsm.c 20 Aug 2008 03:57:53 -0000 1.2 @@ -0,0 +1,159 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007 */ +/* */ +/* 
---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" +#include +#include +#include +#include +#include + + +/* dtrsm performance test + */ +int main(int argc, char *argv[]) +{ + int i; + int m=128; + int n=128; + int n_padded; + volatile unsigned long long incomplete; + void *ptr; + double *A, *B, *C; + double ops, bytes, tbfreq, gflops, grate; + int iterations = 1; + char *env; + unsigned long long *ticks; + unsigned long long total; + double min, max, mean, std, delta; + + switch (argc) { + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + case 1: + break; + default: + printf("Usage: %s [m [n]]\n", argv[0]); + return 1; + break; + } + + if ((env = getenv("ITERATIONS"))) + iterations = atoi(env); + ticks = (unsigned long long *)malloc(iterations * sizeof(unsigned long long)); + + /* Allocate and initialize the arrays + */ + n_padded = (n | 128) & ~(128-1); + if (getenv("HUGE_TLBFS")) { + size_t memsize = (m*m + m*n_padded + m*n)*sizeof(double) + 3*128; + size_t hugepagesize = 16*1024*1024; + int fd; + void *mem = NULL; + char filename[100]; + + sprintf(filename, "/huge/perf_dtrsm_%d.dat", getpid()); + + if ((fd = open (filename, O_CREAT | O_RDWR, 0755)) == -1) { + printf("open for huge page file %s failed (errno=%d): %s\n", filename, errno, strerror(errno)); + exit(1); + } else { + /* Delete file so that huge pages will get freed on program termination. 
*/ + remove(filename); + + memsize = ( memsize + hugepagesize-1) & ~(hugepagesize-1); + + mem = mmap((void *)(0x100000000ULL), memsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (mem == MAP_FAILED) { + printf("mmap for %lld bytes in huge page file %s failed (errno=%d): %s\n", + (unsigned long long)memsize, filename, errno, strerror(errno)); + exit(1); + } + + A = (double *)ALIGN128(mem); + B = (double *)ALIGN128(A + m*m); + C = (double *)ALIGN128(B + m*n_padded); + + /* Closing the file descriptor does not unmap the region, so let's just take care of that right away */ + close (fd); + } + } else { + if (posix_memalign(&ptr, 128, m*m*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + A = (double *)ptr; + } + + /* Pad n to an off multiple of 128 for bank utilization performance reasons. + */ + if (posix_memalign(&ptr, 128, m*n_padded*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + B = (double *)ptr; + } + if (posix_memalign(&ptr, 128, m*n*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + C = (double *)ptr; + } + } + + for (i=0; i max) max = ticks[i]; + } + std = sqrt(std/(double)(iterations)); + + printf("m=%d n=%d MIN=%f MAX=%f MEAN=%f ticks Std Dev=%f Variance=%f%% %f Gflops/sec %f Gbytes/sec\n", + m, n, min, max, mean, std, + 100.0*((double)(max-min))/((double)mean), + gflops, grate); + + return 0; +} Index: accel/lib/tests/perf_reform_lpanel.c =================================================================== RCS file: accel/lib/tests/perf_reform_lpanel.c diff -N accel/lib/tests/perf_reform_lpanel.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/tests/perf_reform_lpanel.c 20 Aug 2008 03:57:53 -0000 1.2 @@ -0,0 +1,86 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include 
"hpl_accel.h" +#include "test_utils.h" + +/* reform l panel performance test + */ +int main(int argc, char *argv[]) +{ + int i; + int m=64; + int n=128; + int n_padded; + volatile unsigned long long incomplete; + void *ptr; + double *A, *panel; + double tbfreq; + unsigned long long ticks; + + + switch (argc) { + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + break; + case 1: + break; + default: + printf("Usage: %s [m [n]]\n", argv[0]); + return 1; + break; + } + + m = m & ~(64-1); + if (m < 64) m = 64; + + if (n < 1) n = 1; + n_padded = (n + 63) & ~(63); + + /* Allocate and initialize the arrays + */ + if (posix_memalign(&ptr, 128, m*n_padded*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + A = (double *)ptr; + } + + if (posix_memalign(&ptr, 128, m*n*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + panel = (double *)ptr; + } + + for (i=0; i +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" + +/* reform matrix performance test + */ +int main(int argc, char *argv[]) +{ + int i; + int m=64; + int n=128; + int mb, nb; + int spes=1; + int size; + volatile unsigned long long incomplete; + void *ptr; + double *A, *scratch; + double tbfreq, blocks_xfer; + unsigned long long ticks; + + + switch (argc) { + case 4: + spes = atoi(argv[3]); + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + break; + case 1: + break; + default: + printf("Usage: %s [m [n [spes]]]\n", argv[0]); + return 1; + break; + } + + m = m & ~(64-1); + if (m < 64) m = 64; + + n = n & ~(64-1); + if (n < 64) n = 64; + + size = spes*64*(m-4); + + /* Allocate and initialize the arrays + */ + if (posix_memalign(&ptr, 128, m*n*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + A = (double *)ptr; + } + + if (posix_memalign(&ptr, 128, size*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + scratch = (double *)ptr; + } + + for (i=0; i +#include 
+#include +#include +#include "hpl_accel.h" +#include "test_utils.h" +#include +#include +#include +#include +#include + + +/* reform rows performance test + */ +int main(int argc, char *argv[]) +{ + int i; + int m=1; + int n=128; + int height=256; + int blk_col=0; + int n_padded; + int iterations=1; + int *rows; + volatile unsigned long long incomplete; + void *ptr; + double *A, *R; + double tbfreq; + char *env; + unsigned long long *ticks; + unsigned long long total; + double min, max, mean, std, delta; + + switch (argc) { + case 5: + blk_col = atoi(argv[4]); + case 4: + height = atoi(argv[3]); + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + break; + case 1: + break; + default: + printf("Usage: %s [m [n [lda [blk_col]]]\n", argv[0]); + return 1; + break; + } + + blk_col &= (64-1); + + if (n < 1) n = 1; + n_padded = (n + blk_col + 63) & ~(63); + + if (height < m) height = m; + height = (height + 63) & ~63; + + if ((env = getenv("ITERATIONS"))) + iterations = atoi(env); + ticks = (unsigned long long *)malloc(iterations * sizeof(unsigned long long)); + + if (getenv("HUGE_TLBFS")) { + size_t memsize = 2*128 + m*sizeof(int) + (size_t)(height+m)*n_padded*sizeof(double); + size_t hugepagesize = 16*1024*1024; + int fd; + void *mem = NULL; + char filename[100]; + + sprintf(filename, "/huge/perf_reform_rows_%d.dat", getpid()); + + if ((fd = open (filename, O_CREAT | O_RDWR, 0755)) == -1) { + printf("open for huge page file %s failed (errno=%d): %s\n", filename, errno, strerror(errno)); + exit(1); + } else { + /* Delete file so that huge pages will get freed on program termination. 
*/ + remove(filename); + + memsize = ( memsize + hugepagesize-1) & ~(hugepagesize-1); + + mem = mmap((void *)(0x100000000ULL), memsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (mem == MAP_FAILED) { + printf("mmap for %lld bytes in huge page file %s failed (errno=%d): %s\n", + (unsigned long long)memsize, filename, errno, strerror(errno)); + exit(1); + } + + A = (double *)ALIGN128(mem); + R = (double *)ALIGN128(A + height*n_padded); + rows = (int *)(R + m*n_padded); + + /* Closing the file descriptor does not unmap the region, so let's just take care of that right away */ + close (fd); + } + } else { + /* Allocate and initialize the arrays + */ + if (posix_memalign(&ptr, 128, height*n_padded*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + A = (double *)ptr; + } + + if (posix_memalign(&ptr, 128, m*n_padded*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + R = (double *)ptr; + } + rows = (int *)malloc(m*sizeof(int)); + } + + for (i=0; i max) max = ticks[i]; + } + std = sqrt(std/(double)(iterations)); + + printf("REFORM ROW (R_to_B) m=%d n=%d height=%d blk_col=%d MIN=%f MAX=%f MEAN=%f ticks Std Dev=%f Variance=%f%% rate=%f Gbytes/sec xfer=%f Gbytes/sec\n", m, n, height, blk_col, + min, max, mean, std, 100.0*((double)(max-min))/((double)mean), + (double)iterations * (double)m * (double)n * tbfreq * (double)sizeof(double) / ((double)total * 1.0e9), + (double)iterations * (double)m * (double)(2*n) * tbfreq * (double)sizeof(double) / ((double)total * 1.0e9)); + + /* Test BLOCK to ROW copy */ + + /* Perform 1 iteration first to pre-charge the PTEs + */ + hpl_accel_reform_rows_B_to_R(m, n, R, n_padded, A, M_SUB*height, rows, blk_col, (unsigned long long *)&incomplete); + while (incomplete); + + total = 0; + for (i=0; i max) max = ticks[i]; + } + std = sqrt(std/(double)(iterations)); + + printf("REFORM ROW (B_to_R) m=%d n=%d height=%d blk_col=%d MIN=%f MAX=%f MEAN=%f ticks Std Dev=%f Variance=%f%% rate=%f 
Gbytes/sec xfer=%f Gbytes/sec\n", m, n, height, blk_col, + min, max, mean, std, 100.0*((double)(max-min))/((double)mean), + (double)iterations * (double)m * (double)n * tbfreq * (double)sizeof(double) / ((double)total * 1.0e9), + (double)iterations * (double)m * (double)(2*n) * tbfreq * (double)sizeof(double) / ((double)total * 1.0e9)); + + + return 0; +} Index: accel/lib/tests/perf_reform_upanel.c =================================================================== RCS file: accel/lib/tests/perf_reform_upanel.c diff -N accel/lib/tests/perf_reform_upanel.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/tests/perf_reform_upanel.c 20 Aug 2008 03:57:53 -0000 1.2 @@ -0,0 +1,80 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" + +/* reform u panel performance test + */ +int main(int argc, char *argv[]) +{ + int i; + int m=64; + int n=128; + int n_padded; + volatile unsigned long long incomplete; + void *ptr; + double *A, *panel; + double tbfreq; + unsigned long long ticks; + + + switch (argc) { + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + break; + case 1: + break; + default: + printf("Usage: %s [m [n]]\n", argv[0]); + return 1; + break; + } + + n_padded = (n + 15) & ~(15); + + /* Allocate and initialize the arrays + */ + if (posix_memalign(&ptr, 128, m*n_padded*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + A = (double *)ptr; + } + + if (posix_memalign(&ptr, 128, m*n*sizeof(double))) { + perror("posix_memalign failed"); + exit(1); + } else { + panel = (double *)ptr; + } + + for (i=0; i +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" + +#define EPSILON 0.0000001 + + +/* reform + */ +int main(int argc, char *argv[]) +{ + int i, j; + int col; + int 
errors; + int lda = 0; + int ldp = 0; + int n=128; + int m=128; + volatile unsigned long long incomplete; + double *A1, *A2, *panel, *scratch; + + switch (argc) { + case 5: + ldp = atoi(argv[3]); + case 4: + lda = atoi(argv[3]); + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + break; + case 1: + /* No parameters, randomly select a parameter set. + */ + srand((unsigned int)__mftb()); + m = rand() % 1280; + n = rand() % 1280; + lda = rand() % 2048; + ldp = rand() % 2048; + break; + default: + printf("Usage: %s [m [n [lda [ldp]]]]\n", argv[0]); + return 1; + break; + } + + /* Apply functional constraints to the parameter set. + */ + m = m & ~(128-1); + if (m < 128) m = 128; + + n = n & ~(128-1); + if (n < 128) n = 128; + + lda = lda & ~(15); + if (lda < m) lda = m; + + ldp = ldp & ~(15); + if (ldp < m) ldp = m; + + printf("Performing reform test with m=%d n=%d lda=%d ldp=%d\n", m, n, lda, ldp); + + /* Allocate and initialize the arrays + */ + A1 = (double *)allocate_matrix(n/64, lda*M_SUB, 128); + A2 = (double *)allocate_matrix(n/64, lda*M_SUB, 128); + scratch = (double *)allocate_panel(1, 128*ldp, 128); /* allocate 1 row so that no 4GB crossings occur */ + panel = (double *)allocate_panel(128, ldp, 128); + + for (i=0; i +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" + +/* reformat L panel + */ +int main(int argc, char *argv[]) +{ + int i; + int errors; + int ldp = 0; + int lda = 0; + int n=64; + int m=64; + int n_padded; + volatile unsigned long long incomplete; + double *A, *P1, *P2; + + switch (argc) { + case 5: + ldp = atoi(argv[4]); + case 4: + lda = atoi(argv[3]); + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + break; + case 1: + /* No parameters, randomly select a parameter set. 
+ */ + srand((unsigned int)__mftb()); + m = rand() % 768; + n = rand() % 768; + lda = M_SUB * (rand() % 1536); + ldp = rand() % 1536; + break; + default: + printf("Usage: %s [m [n [lda [ldp]]]]\n", argv[0]); + return 1; + break; + } + + /* Apply functional constraints to the parameter set. + */ + m = m & ~(64-1); + + ldp = (ldp + 15) & ~(15); + if (ldp < m) ldp = m; + if (ldp < 16) ldp = 16; + if (lda < m*M_SUB) lda = m*M_SUB; + + printf("Performing reform_lpanel test with m=%d n=%d lda=%d ldp=%d\n", m, n, lda, ldp); + + /* Allocate and initialize the arrays + */ + n_padded = (n + M_SUB-1) & ~(M_SUB-1); + + A = (double *)allocate_matrix(n_padded/M_SUB, lda, 128); + P1 = (double *)allocate_panel(n, ldp, 128); + P2 = (double *)allocate_panel(n, ldp, 128); + + if ((A == NULL) || (P1 == NULL) || (P2 == NULL)) { + printf("Failed to allocate buffers. Total allocation is %f MB. %p %p %p\n", (2.0*ldp*n + (double)lda*n_padded)*sizeof(double)/(1024.0*1024.0), A, P1, P2); + return 0; + } + + for (i=0; i=0; i--) { + if (P1[i] != P2[i]) { + errors++; + if (errors < 20) printf(" %d expected=%f got=%f\n", i, P1[i], P2[i]); + } + } + printf("Errors = %d\n", errors); + + hpl_accel_fini(); + + return ((errors) ? 
1 : 0); +} Index: accel/lib/tests/reform_matrix.c =================================================================== RCS file: accel/lib/tests/reform_matrix.c diff -N accel/lib/tests/reform_matrix.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/tests/reform_matrix.c 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,97 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" + +/* reformat matrix + */ +int main(int argc, char *argv[]) +{ + int i; + int errors; + int lda = 0; + int n=64; + int m=64; + int m_pad; + int size; + volatile unsigned long long incomplete; + double *A1, *A2, *scratch; + + switch (argc) { + case 5: + size = atoi(argv[4]); + case 4: + lda = atoi(argv[3]); + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + break; + case 1: + /* No parameters, randomly select a parameter set. + */ + srand((unsigned int)__mftb()); + m = rand() % 1280; + n = rand() % 1280; + lda = rand() % 2048; + size = rand() % (8*m*64); + break; + default: + printf("Usage: %s [m [n [lda [size]]]]\n", argv[0]); + return 1; + break; + } + + /* Apply functional constraints to the parameter set. 
+ */ + n = n & ~(64-1); + if (n < 64) n = 64; + + if (m < 1) m = 1; + m_pad = (m + 63) & ~63; + + lda = lda & ~(64-1); + if (lda < m_pad) lda = m_pad; + + if (size < m_pad*64) size = m_pad*64; + size = (size + (128-1)) & ~(128-1); /* Pad the scratch buffer to a cacheline */ + + printf("Performing reform_matrix test with m=%d n=%d lda=%d size=%d\n", m, n, lda, size); + + /* Allocate and initialize the arrays + */ + A1 = (double *)allocate_matrix(n/M_SUB, lda*M_SUB, 128); + A2 = (double *)allocate_matrix(n/M_SUB, lda*M_SUB, 128); + scratch = (double *)allocate_panel(1, size, 128); + + for (i=0; i=0; i--) { + if (A1[i] != A2[i]) { + errors++; + if (errors < 20) printf(" %d expected=%f got=%f\n", i, A1[i], A2[i]); + } + } + printf("Errors = %d\n", errors); + + hpl_accel_fini(); + + return ((errors) ? 1 : 0); +} Index: accel/lib/tests/reform_rows.c =================================================================== RCS file: accel/lib/tests/reform_rows.c diff -N accel/lib/tests/reform_rows.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/tests/reform_rows.c 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,158 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" + +int rand_row(int max, int *rows, int cnt) +{ + int i; + int new_row; + int unique; + + do { + new_row = (max * (rand() & 0xFFFF)) >> 16; + unique = 1; + + for (i=0; i=0; i--) { + if (A1[i] != A2[i]) { + errors++; + if (errors < 20) printf("R->B %d expected=%f got=%f\n", i, A1[i], A2[i]); + } + } + + /* Test BLOCK to ROW copy */ + for (i=0; i=0; i--) { + if (R1[i] != R2[i]) { + errors++; + if (errors < 20) printf("B->R %d expected=%f got=%f\n", i, R1[i], R2[i]); + } + } + + printf("Errors = %d\n", errors); + + hpl_accel_fini(); + + return ((errors) ? 
1 : 0); +} Index: accel/lib/tests/reform_upanel.c =================================================================== RCS file: accel/lib/tests/reform_upanel.c diff -N accel/lib/tests/reform_upanel.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ accel/lib/tests/reform_upanel.c 20 Aug 2008 03:57:53 -0000 1.3 @@ -0,0 +1,101 @@ +/* ---------------------------------------------------------------- */ +/* (C) Copyright IBM Corporation 2007,2008 */ +/* */ +/* ---------------------------------------------------------------- */ + +#include +#include +#include +#include +#include "hpl_accel.h" +#include "test_utils.h" + + + +/* reformat L panel + */ +int main(int argc, char *argv[]) +{ + int i; + int errors; + int ldp = 0; + int lda = 0; + int n=64; + int m=64; + int m_padded, n_padded; + volatile unsigned long long incomplete; + double *A1, *A2, *P; + + switch (argc) { + case 5: + ldp = atoi(argv[4]); + case 4: + lda = atoi(argv[3]); + case 3: + n = atoi(argv[2]); + case 2: + m = atoi(argv[1]); + break; + case 1: + /* No parameters, randomly select a parameter set. + */ + srand((unsigned int)__mftb()); + m = rand() % 700; + n = rand() % 700; + lda = M_SUB * (rand() % 1280); + ldp = rand() % 1280; + break; + default: + printf("Usage: %s [m [n [lda [ldp]]]]\n", argv[0]); + return 1; + break; + } + + /* Apply functional constraints to the parameter set. + */ + m_padded = (m + M_SUB-1) & ~(M_SUB-1); + if (ldp < n) ldp = n; + ldp = (ldp + 15) & ~(15); + if (lda < m_padded*M_SUB) lda = m_padded*M_SUB; + lda = (lda + 15) & ~(15); + + printf("Performing reform_upanel test with m=%d n=%d lda=%d ldp=%d\n", m, n, lda, ldp); + + /* Allocate and initialize the arrays + */ + n_padded = (n + M_SUB-1) & ~(M_SUB-1); + + A1 = (double *)allocate_matrix(n_padded/M_SUB, lda, 128); + A2 = (double *)allocate_matrix(n_padded/M_SUB, lda, 128); + P = (double *)allocate_panel(m, ldp, 128); + + if ((A1 == NULL) || (A2 == NULL) || (P == NULL)) { + printf("Failed to allocate buffers. 
Total allocation is %f MB. %p %p %p\n", (2.0*lda*n_padded + (double)ldp*m)*sizeof(double)/(1024.0*1024.0), A1, A2, P); + return 0; + } + + for