|
ScaLAPACK
2.0.2
ScaLAPACK: Scalable Linear Algebra PACKage
|
00001 /* --------------------------------------------------------------------- 00002 * 00003 * -- PBLAS auxiliary routine (version 2.0) -- 00004 * University of Tennessee, Knoxville, Oak Ridge National Laboratory, 00005 * and University of California, Berkeley. 00006 * April 1, 1998 00007 * 00008 * --------------------------------------------------------------------- 00009 */ 00010 /* 00011 * Include files 00012 */ 00013 #include "../pblas.h" 00014 #include "../PBpblas.h" 00015 #include "../PBtools.h" 00016 #include "../PBblacs.h" 00017 #include "../PBblas.h" 00018 00019 #ifdef __STDC__ 00020 void PB_CpdotNN( PBTYP_T * TYPE, int N, char * DOT, 00021 char * X, int IX, int JX, int * DESCX, int INCX, 00022 char * Y, int IY, int JY, int * DESCY, int INCY, 00023 VVDOT_T FDOT ) 00024 #else 00025 void PB_CpdotNN( TYPE, N, DOT, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, 00026 INCY, FDOT ) 00027 /* 00028 * .. Scalar Arguments .. 00029 */ 00030 int INCX, INCY, IX, IY, JX, JY, N; 00031 char * DOT; 00032 PBTYP_T * TYPE; 00033 VVDOT_T FDOT; 00034 /* 00035 * .. Array Arguments .. 00036 */ 00037 int * DESCX, * DESCY; 00038 char * X, * Y; 00039 #endif 00040 { 00041 /* 00042 * Purpose 00043 * ======= 00044 * 00045 * PB_CpdotNN forms the dot product of two subvectors, 00046 * 00047 * DOT := sub( X )**T * sub( Y ) or DOT := sub( X )**H * sub( Y ), 00048 * 00049 * where 00050 * 00051 * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, 00052 * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, 00053 * 00054 * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, 00055 * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. 00056 * 00057 * Both subvectors are assumed to be not distributed. 00058 * 00059 * Notes 00060 * ===== 00061 * 00062 * A description vector is associated with each 2D block-cyclicly dis- 00063 * tributed matrix. This vector stores the information required to 00064 * establish the mapping between a matrix entry and its corresponding 00065 * process and memory location. 00066 * 00067 * In the following comments, the character _ should be read as 00068 * "of the distributed matrix". Let A be a generic term for any 2D 00069 * block cyclicly distributed matrix. Its description vector is DESC_A: 00070 * 00071 * NOTATION STORED IN EXPLANATION 00072 * ---------------- --------------- ------------------------------------ 00073 * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. 00074 * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating 00075 * the NPROW x NPCOL BLACS process grid 00076 * A is distributed over. The context 00077 * itself is global, but the handle 00078 * (the integer value) may vary. 00079 * M_A (global) DESCA[ M_ ] The number of rows in the distribu- 00080 * ted matrix A, M_A >= 0. 00081 * N_A (global) DESCA[ N_ ] The number of columns in the distri- 00082 * buted matrix A, N_A >= 0. 00083 * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left 00084 * block of the matrix A, IMB_A > 0. 00085 * INB_A (global) DESCA[ INB_ ] The number of columns of the upper 00086 * left block of the matrix A, 00087 * INB_A > 0. 00088 * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- 00089 * bute the last M_A-IMB_A rows of A, 00090 * MB_A > 0. 00091 * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- 00092 * bute the last N_A-INB_A columns of 00093 * A, NB_A > 0. 00094 * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first 00095 * row of the matrix A is distributed, 00096 * NPROW > RSRC_A >= 0. 00097 * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the 00098 * first column of A is distributed. 00099 * NPCOL > CSRC_A >= 0. 00100 * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local 00101 * array storing the local blocks of 00102 * the distributed matrix A, 00103 * IF( Lc( 1, N_A ) > 0 ) 00104 * LLD_A >= MAX( 1, Lr( 1, M_A ) ) 00105 * ELSE 00106 * LLD_A >= 1. 00107 * 00108 * Let K be the number of rows of a matrix A starting at the global in- 00109 * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows 00110 * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would 00111 * receive if these K rows were distributed over NPROW processes. If K 00112 * is the number of columns of a matrix A starting at the global index 00113 * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- 00114 * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if 00115 * these K columns were distributed over NPCOL processes. 00116 * 00117 * The values of Lr() and Lc() may be determined via a call to the func- 00118 * tion PB_Cnumroc: 00119 * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) 00120 * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) 00121 * 00122 * Arguments 00123 * ========= 00124 * 00125 * TYPE (local input) pointer to a PBTYP_T structure 00126 * On entry, TYPE is a pointer to a structure of type PBTYP_T, 00127 * that contains type information (See pblas.h). 00128 * 00129 * N (global input) INTEGER 00130 * On entry, N specifies the length of the subvectors to be 00131 * multiplied. N must be at least zero. 00132 * 00133 * DOT (local output) pointer to CHAR 00134 * On exit, DOT specifies the dot product of the two subvectors 00135 * sub( X ) and sub( Y ) only in their scope (See below for fur- 00136 * ther details). 00137 * 00138 * X (local input) pointer to CHAR 00139 * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X 00140 * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and 00141 * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least 00142 * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. 00143 * Before entry, this array contains the local entries of the 00144 * matrix X. 00145 * 00146 * IX (global input) INTEGER 00147 * On entry, IX specifies X's global row index, which points to 00148 * the beginning of the submatrix sub( X ). 00149 * 00150 * JX (global input) INTEGER 00151 * On entry, JX specifies X's global column index, which points 00152 * to the beginning of the submatrix sub( X ). 00153 * 00154 * DESCX (global and local input) INTEGER array 00155 * On entry, DESCX is an integer array of dimension DLEN_. This 00156 * is the array descriptor for the matrix X. 00157 * 00158 * INCX (global input) INTEGER 00159 * On entry, INCX specifies the global increment for the 00160 * elements of X. Only two values of INCX are supported in 00161 * this version, namely 1 and M_X. INCX must not be zero. 00162 * 00163 * Y (local input) pointer to CHAR 00164 * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y 00165 * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and 00166 * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least 00167 * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. 00168 * Before entry, this array contains the local entries of the 00169 * matrix Y. 00170 * 00171 * IY (global input) INTEGER 00172 * On entry, IY specifies Y's global row index, which points to 00173 * the beginning of the submatrix sub( Y ). 00174 * 00175 * JY (global input) INTEGER 00176 * On entry, JY specifies Y's global column index, which points 00177 * to the beginning of the submatrix sub( Y ). 00178 * 00179 * DESCY (global and local input) INTEGER array 00180 * On entry, DESCY is an integer array of dimension DLEN_. This 00181 * is the array descriptor for the matrix Y. 00182 * 00183 * INCY (global input) INTEGER 00184 * On entry, INCY specifies the global increment for the 00185 * elements of Y. Only two values of INCY are supported in 00186 * this version, namely 1 and M_Y. INCY must not be zero. 00187 * 00188 * FDOT (local input) pointer to a function of type VVDOT 00189 * On entry, FDOT points to a subroutine that computes the local 00190 * dot product of two vectors. 00191 * 00192 * Further Details 00193 * =============== 00194 * 00195 * When the result of a vector-oriented PBLAS call is a scalar, this 00196 * scalar is set only within the process scope which owns the vector(s) 00197 * being operated on. Let sub( X ) be a generic term for the input vec- 00198 * tor(s). Then, the processes owning the correct the answer is determi- 00199 * ned as follows: if an operation involves more than one vector, the 00200 * processes receiving the result will be the union of the following set 00201 * of processes for each vector: 00202 * 00203 * If N = 1, M_X = 1 and INCX = 1, then one cannot determine if a pro- 00204 * cess row or process column owns the vector operand, therefore only 00205 * the process owning sub( X ) receives the correct result; 00206 * 00207 * If INCX = M_X, then sub( X ) is a vector distributed over a process 00208 * row. Each process in this row receives the result; 00209 * 00210 * If INCX = 1, then sub( X ) is a vector distributed over a process 00211 * column. Each process in this column receives the result; 00212 * 00213 * -- Written on April 1, 1998 by 00214 * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. 00215 * 00216 * --------------------------------------------------------------------- 00217 */ 00218 /* 00219 * .. Local Scalars .. 00220 */ 00221 char Xscope, Yscope, * top; 00222 int RRorCC, Xcol, Xii, XisR, XisRow, Xjj, Xld, Xlinc, XmyprocD, 00223 XmyprocR, XnprocsR, XprocR, Xrow, Ycol, Yii, YisR, YisRow, 00224 Yjj, Yld, Ylinc, YmyprocD, YmyprocR, YnprocsR, YprocR, Yrow, 00225 csrc, ctxt, ione=1, mycol, myrow, npcol, nprow, rsrc, size; 00226 /* 00227 * .. Local Arrays .. 00228 */ 00229 char * buf = NULL; 00230 /* .. 00231 * .. Executable Statements .. 00232 * 00233 */ 00234 /* 00235 * Retrieve process grid information 00236 */ 00237 Cblacs_gridinfo( ( ctxt = DESCX[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); 00238 /* 00239 * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol ... 00240 */ 00241 PB_Cinfog2l( IX, JX, DESCX, nprow, npcol, myrow, mycol, &Xii, &Xjj, 00242 &Xrow, &Xcol ); 00243 if( ( XisRow = ( INCX == DESCX[M_] ) ) != 0 ) 00244 { 00245 Xld = DESCX[LLD_]; Xlinc = Xld; 00246 XmyprocD = mycol; XprocR = Xrow; XmyprocR = myrow; XnprocsR = nprow; 00247 XisR = ( ( Xrow == -1 ) || ( XnprocsR == 1 ) ); 00248 } 00249 else 00250 { 00251 Xld = DESCX[LLD_]; Xlinc = 1; 00252 XmyprocD = myrow; XprocR = Xcol; XmyprocR = mycol; XnprocsR = npcol; 00253 XisR = ( ( Xcol == -1 ) || ( XnprocsR == 1 ) ); 00254 } 00255 /* 00256 * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol ... 00257 */ 00258 PB_Cinfog2l( IY, JY, DESCY, nprow, npcol, myrow, mycol, &Yii, &Yjj, 00259 &Yrow, &Ycol ); 00260 if( ( YisRow = ( INCY == DESCY[M_] ) ) != 0 ) 00261 { 00262 Yld = DESCY[LLD_]; Ylinc = Yld; 00263 YmyprocD = mycol; YprocR = Yrow; YmyprocR = myrow; YnprocsR = nprow; 00264 YisR = ( ( Yrow == -1 ) || ( YnprocsR == 1 ) ); 00265 } 00266 else 00267 { 00268 Yld = DESCY[LLD_]; Ylinc = 1; 00269 YmyprocD = myrow; YprocR = Ycol; YmyprocR = mycol; YnprocsR = npcol; 00270 YisR = ( ( Ycol == -1 ) || ( YnprocsR == 1 ) ); 00271 } 00272 /* 00273 * Are sub( X ) and sub( Y ) both row or column vectors ? 00274 */ 00275 RRorCC = ( ( XisRow && YisRow ) || ( !( XisRow ) && !( YisRow ) ) ); 00276 /* 00277 * Neither sub( X ) nor sub( Y ) are distributed 00278 */ 00279 if( !XisR ) 00280 { 00281 /* 00282 * sub( X ) is not replicated 00283 */ 00284 if( !( YisR ) ) 00285 { 00286 /* 00287 * sub( Y ) is not replicated 00288 */ 00289 if( ( XmyprocR != XprocR ) && ( YmyprocR != YprocR ) ) 00290 /* 00291 * If I am not in XprocR or YprocR, then return immediately 00292 */ 00293 return; 00294 00295 size = TYPE->size; 00296 00297 if( RRorCC ) 00298 { 00299 /* 00300 * sub( X ) and sub( Y ) are both row or column vectors 00301 */ 00302 if( XprocR == YprocR ) 00303 { 00304 /* 00305 * sub( X ) and sub( Y ) are in the same process row or column 00306 */ 00307 FDOT( &N, DOT, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc, Mptr( Y, 00308 Yii, Yjj, Yld, size ), &Ylinc ); 00309 } 00310 else 00311 { 00312 /* 00313 * sub( X ) and sub( Y ) are in a different process row or column 00314 */ 00315 if( XmyprocR == XprocR ) 00316 { 00317 buf = PB_Cmalloc( N * size ); 00318 /* 00319 * Send sub( X ) to where sub( Y ) resides, and receive sub( Y ) from the same 00320 * location. 00321 */ 00322 if( XisRow ) 00323 { 00324 TYPE->Cgesd2d( ctxt, 1, N, Mptr( X, Xii, Xjj, Xld, size ), 00325 Xld, YprocR, XmyprocD ); 00326 TYPE->Cgerv2d( ctxt, 1, N, buf, 1, YprocR, XmyprocD ); 00327 } 00328 else 00329 { 00330 TYPE->Cgesd2d( ctxt, N, 1, Mptr( X, Xii, Xjj, Xld, size ), 00331 Xld, XmyprocD, YprocR ); 00332 TYPE->Cgerv2d( ctxt, N, 1, buf, N, XmyprocD, YprocR ); 00333 } 00334 FDOT( &N, DOT, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc, buf, 00335 &ione ); 00336 if( buf ) free( buf ); 00337 } 00338 00339 if( YmyprocR == YprocR ) 00340 { 00341 buf = PB_Cmalloc( N * size ); 00342 /* 00343 * Send sub( Y ) to where sub( X ) resides, and receive sub( X ) from the same 00344 * location. 00345 */ 00346 if( YisRow ) 00347 { 00348 TYPE->Cgesd2d( ctxt, 1, N, Mptr( Y, Yii, Yjj, Yld, size ), 00349 Yld, XprocR, YmyprocD ); 00350 TYPE->Cgerv2d( ctxt, 1, N, buf, 1, XprocR, YmyprocD ); 00351 } 00352 else 00353 { 00354 TYPE->Cgesd2d( ctxt, N, 1, Mptr( Y, Yii, Yjj, Yld, size ), 00355 Yld, YmyprocD, XprocR ); 00356 TYPE->Cgerv2d( ctxt, N, 1, buf, N, YmyprocD, XprocR ); 00357 } 00358 FDOT( &N, DOT, buf, &ione, Mptr( Y, Yii, Yjj, Yld, size ), 00359 &Ylinc ); 00360 if( buf ) free( buf ); 00361 } 00362 } 00363 } 00364 else 00365 { 00366 /* 00367 * sub( X ) and sub( Y ) are not both row or column vectors 00368 */ 00369 if( ( XmyprocR == XprocR ) && ( YmyprocR == YprocR ) ) 00370 { 00371 /* 00372 * If I am at the intersection of the process row and column, then compute the 00373 * dot product and broadcast it in my process row and column. 00374 */ 00375 FDOT( &N, DOT, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc, Mptr( Y, 00376 Yii, Yjj, Yld, size ), &Ylinc ); 00377 top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); 00378 TYPE->Cgebs2d( ctxt, ROW, top, 1, 1, DOT, 1 ); 00379 top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); 00380 TYPE->Cgebs2d( ctxt, COLUMN, top, 1, 1, DOT, 1 ); 00381 } 00382 else if( XmyprocR == XprocR ) 00383 { 00384 if( XisRow ) { Xscope = CROW; rsrc = XprocR; csrc = YprocR; } 00385 else { Xscope = CCOLUMN; rsrc = YprocR; csrc = XprocR; } 00386 top = PB_Ctop( &ctxt, BCAST, &Xscope, TOP_GET ); 00387 TYPE->Cgebr2d( ctxt, &Xscope, top, 1, 1, DOT, 1, rsrc, csrc ); 00388 } 00389 else if( YmyprocR == YprocR ) 00390 { 00391 if( YisRow ) { Yscope = CROW; rsrc = YprocR; csrc = XprocR; } 00392 else { Yscope = CCOLUMN; rsrc = XprocR; csrc = YprocR; } 00393 top = PB_Ctop( &ctxt, BCAST, &Yscope, TOP_GET ); 00394 TYPE->Cgebr2d( ctxt, &Yscope, top, 1, 1, DOT, 1, rsrc, csrc ); 00395 } 00396 } 00397 } 00398 else 00399 { 00400 /* 00401 * sub( Y ) is replicated 00402 */ 00403 if( XmyprocR == XprocR ) 00404 { 00405 /* 00406 * If I am in the process row (resp. column) owning sub( X ), then compute the 00407 * dot product and broadcast in my column (resp. row). 00408 */ 00409 size = TYPE->size; 00410 00411 FDOT( &N, DOT, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc, Mptr( Y, Yii, 00412 Yjj, Yld, size ), &Ylinc ); 00413 00414 if( XisRow ) 00415 { 00416 top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); 00417 TYPE->Cgebs2d( ctxt, COLUMN, top, 1, 1, DOT, 1 ); 00418 } 00419 else 00420 { 00421 top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); 00422 TYPE->Cgebs2d( ctxt, ROW, top, 1, 1, DOT, 1 ); 00423 } 00424 } 00425 else 00426 { 00427 /* 00428 * Otherwise, receive the dot product 00429 */ 00430 if( XisRow ) 00431 { 00432 top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); 00433 TYPE->Cgebr2d( ctxt, COLUMN, top, 1, 1, DOT, 1, XprocR, 00434 XmyprocD ); 00435 } 00436 else 00437 { 00438 top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); 00439 TYPE->Cgebr2d( ctxt, ROW, top, 1, 1, DOT, 1, XmyprocD, 00440 XprocR ); 00441 } 00442 } 00443 } 00444 } 00445 else 00446 { 00447 /* 00448 * sub( X ) is replicated 00449 */ 00450 if( YisR || ( YmyprocR == YprocR ) ) 00451 { 00452 /* 00453 * If I own a piece of sub( Y ), then compute the dot product 00454 */ 00455 size = TYPE->size; 00456 00457 FDOT( &N, DOT, Mptr( X, Xii, Xjj, Xld, size ), &Xlinc, Mptr( Y, Yii, 00458 Yjj, Yld, size ), &Ylinc ); 00459 } 00460 00461 if( !YisR ) 00462 { 00463 /* 00464 * If sub( Y ) is not replicated, then broadcast the result to the other 00465 * processes that own a piece of sub( X ), but were not involved in the above 00466 * dot-product computation. 00467 */ 00468 if( YisRow ) 00469 { 00470 top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); 00471 if( YmyprocR == YprocR ) 00472 TYPE->Cgebs2d( ctxt, COLUMN, top, 1, 1, DOT, 1 ); 00473 else 00474 TYPE->Cgebr2d( ctxt, COLUMN, top, 1, 1, DOT, 1, YprocR, 00475 YmyprocD ); 00476 } 00477 else 00478 { 00479 top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); 00480 if( YmyprocR == YprocR ) 00481 TYPE->Cgebs2d( ctxt, ROW, top, 1, 1, DOT, 1 ); 00482 else 00483 TYPE->Cgebr2d( ctxt, ROW, top, 1, 1, DOT, 1, YmyprocD, YprocR ); 00484 } 00485 } 00486 } 00487 /* 00488 * End of PB_CpdotNN 00489 */ 00490 }