|
ScaLAPACK
2.0.2
ScaLAPACK: Scalable Linear Algebra PACKage
|
00001 /* --------------------------------------------------------------------- 00002 * 00003 * -- PBLAS routine (version 2.0) -- 00004 * University of Tennessee, Knoxville, Oak Ridge National Laboratory, 00005 * and University of California, Berkeley. 00006 * April 1, 1998 00007 * 00008 * --------------------------------------------------------------------- 00009 */ 00010 /* 00011 * Include files 00012 */ 00013 #include "pblas.h" 00014 #include "PBpblas.h" 00015 #include "PBtools.h" 00016 #include "PBblacs.h" 00017 #include "PBblas.h" 00018 00019 #ifdef __STDC__ 00020 void pcdotc_( int * N, 00021 float * DOT, 00022 float * X, int * IX, int * JX, int * DESCX, int * INCX, 00023 float * Y, int * IY, int * JY, int * DESCY, int * INCY ) 00024 #else 00025 void pcdotc_( N, DOT, X, IX, JX, DESCX, INCX, Y, IY, JY, DESCY, INCY ) 00026 /* 00027 * .. Scalar Arguments .. 00028 */ 00029 int * INCX, * INCY, * IX, * IY, * JX, * JY, * N; 00030 float * DOT; 00031 /* 00032 * .. Array Arguments .. 00033 */ 00034 int * DESCX, * DESCY; 00035 float * X, * Y; 00036 #endif 00037 { 00038 /* 00039 * Purpose 00040 * ======= 00041 * 00042 * PCDOTC forms the dot product of two subvectors, 00043 * 00044 * DOT := sub( X )**H * sub( Y ), 00045 * 00046 * where 00047 * 00048 * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, 00049 * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X, and, 00050 * 00051 * sub( Y ) denotes Y(IY,JY:JY+N-1) if INCY = M_Y, 00052 * Y(IY:IY+N-1,JY) if INCY = 1 and INCY <> M_Y. 00053 * 00054 * Notes 00055 * ===== 00056 * 00057 * A description vector is associated with each 2D block-cyclicly dis- 00058 * tributed matrix. This vector stores the information required to 00059 * establish the mapping between a matrix entry and its corresponding 00060 * process and memory location. 00061 * 00062 * In the following comments, the character _ should be read as 00063 * "of the distributed matrix". Let A be a generic term for any 2D 00064 * block cyclicly distributed matrix. Its description vector is DESC_A: 00065 * 00066 * NOTATION STORED IN EXPLANATION 00067 * ---------------- --------------- ------------------------------------ 00068 * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. 00069 * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating 00070 * the NPROW x NPCOL BLACS process grid 00071 * A is distributed over. The context 00072 * itself is global, but the handle 00073 * (the integer value) may vary. 00074 * M_A (global) DESCA[ M_ ] The number of rows in the distribu- 00075 * ted matrix A, M_A >= 0. 00076 * N_A (global) DESCA[ N_ ] The number of columns in the distri- 00077 * buted matrix A, N_A >= 0. 00078 * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left 00079 * block of the matrix A, IMB_A > 0. 00080 * INB_A (global) DESCA[ INB_ ] The number of columns of the upper 00081 * left block of the matrix A, 00082 * INB_A > 0. 00083 * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- 00084 * bute the last M_A-IMB_A rows of A, 00085 * MB_A > 0. 00086 * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- 00087 * bute the last N_A-INB_A columns of 00088 * A, NB_A > 0. 00089 * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first 00090 * row of the matrix A is distributed, 00091 * NPROW > RSRC_A >= 0. 00092 * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the 00093 * first column of A is distributed. 00094 * NPCOL > CSRC_A >= 0. 00095 * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local 00096 * array storing the local blocks of 00097 * the distributed matrix A, 00098 * IF( Lc( 1, N_A ) > 0 ) 00099 * LLD_A >= MAX( 1, Lr( 1, M_A ) ) 00100 * ELSE 00101 * LLD_A >= 1. 00102 * 00103 * Let K be the number of rows of a matrix A starting at the global in- 00104 * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows 00105 * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would 00106 * receive if these K rows were distributed over NPROW processes. If K 00107 * is the number of columns of a matrix A starting at the global index 00108 * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- 00109 * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if 00110 * these K columns were distributed over NPCOL processes. 00111 * 00112 * The values of Lr() and Lc() may be determined via a call to the func- 00113 * tion PB_Cnumroc: 00114 * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) 00115 * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) 00116 * 00117 * Arguments 00118 * ========= 00119 * 00120 * N (global input) INTEGER 00121 * On entry, N specifies the length of the subvectors to be 00122 * multiplied. N must be at least zero. 00123 * 00124 * DOT (local output) COMPLEX array 00125 * On exit, DOT specifies the dot product of the two subvectors 00126 * sub( X ) and sub( Y ) only in their scope (See below for fur- 00127 * ther details). 00128 * 00129 * X (local input) COMPLEX array 00130 * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X 00131 * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and 00132 * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least 00133 * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. 00134 * Before entry, this array contains the local entries of the 00135 * matrix X. 00136 * 00137 * IX (global input) INTEGER 00138 * On entry, IX specifies X's global row index, which points to 00139 * the beginning of the submatrix sub( X ). 00140 * 00141 * JX (global input) INTEGER 00142 * On entry, JX specifies X's global column index, which points 00143 * to the beginning of the submatrix sub( X ). 00144 * 00145 * DESCX (global and local input) INTEGER array 00146 * On entry, DESCX is an integer array of dimension DLEN_. This 00147 * is the array descriptor for the matrix X. 00148 * 00149 * INCX (global input) INTEGER 00150 * On entry, INCX specifies the global increment for the 00151 * elements of X. Only two values of INCX are supported in 00152 * this version, namely 1 and M_X. INCX must not be zero. 00153 * 00154 * Y (local input) COMPLEX array 00155 * On entry, Y is an array of dimension (LLD_Y, Ky), where LLD_Y 00156 * is at least MAX( 1, Lr( 1, IY ) ) when INCY = M_Y and 00157 * MAX( 1, Lr( 1, IY+N-1 ) ) otherwise, and, Ky is at least 00158 * Lc( 1, JY+N-1 ) when INCY = M_Y and Lc( 1, JY ) otherwise. 00159 * Before entry, this array contains the local entries of the 00160 * matrix Y. 00161 * 00162 * IY (global input) INTEGER 00163 * On entry, IY specifies Y's global row index, which points to 00164 * the beginning of the submatrix sub( Y ). 00165 * 00166 * JY (global input) INTEGER 00167 * On entry, JY specifies Y's global column index, which points 00168 * to the beginning of the submatrix sub( Y ). 00169 * 00170 * DESCY (global and local input) INTEGER array 00171 * On entry, DESCY is an integer array of dimension DLEN_. This 00172 * is the array descriptor for the matrix Y. 00173 * 00174 * INCY (global input) INTEGER 00175 * On entry, INCY specifies the global increment for the 00176 * elements of Y. Only two values of INCY are supported in 00177 * this version, namely 1 and M_Y. INCY must not be zero. 00178 * 00179 * Further Details 00180 * =============== 00181 * 00182 * When the result of a vector-oriented PBLAS call is a scalar, this 00183 * scalar is set only within the process scope which owns the vector(s) 00184 * being operated on. Let sub( X ) be a generic term for the input vec- 00185 * tor(s). Then, the processes owning the correct the answer is determi- 00186 * ned as follows: if an operation involves more than one vector, the 00187 * processes receiving the result will be the union of the following set 00188 * of processes for each vector: 00189 * 00190 * If N = 1, M_X = 1 and INCX = 1, then one cannot determine if a pro- 00191 * cess row or process column owns the vector operand, therefore only 00192 * the process owning sub( X ) receives the correct result; 00193 * 00194 * If INCX = M_X, then sub( X ) is a vector distributed over a process 00195 * row. Each process in this row receives the result; 00196 * 00197 * If INCX = 1, then sub( X ) is a vector distributed over a process 00198 * column. Each process in this column receives the result; 00199 * 00200 * -- Written on April 1, 1998 by 00201 * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. 00202 * 00203 * --------------------------------------------------------------------- 00204 */ 00205 /* 00206 * .. Local Scalars .. 00207 */ 00208 char scope, * top; 00209 int OneBlock, OneDgrid, RRorCC, Square, Xcol, Xi, Xii, XinbD, 00210 Xinb1D, XisD, XisR, XisRow, Xj, Xjj, Xld, Xlinc, XmyprocD, 00211 XmyprocR, XnbD, XnpD, XnprocsD, XnprocsR, XprocD, XprocR, 00212 Xrow, Ycol, Yi, Yii, YinbD, Yinb1D, YisD, YisR, YisRow, Yj, 00213 Yjj, Yld, Ylinc, YmyprocD, YmyprocR, YnbD, YnpD, YnprocsD, 00214 YnprocsR, YprocD, YprocR, Yrow, cdst, csrc, ctxt, dst, info, 00215 ione=1, mycol, myrow, npcol, nprow, rdst, rsrc, size, src; 00216 PBTYP_T * type; 00217 VVDOT_T dot; 00218 /* 00219 * .. Local Arrays .. 00220 */ 00221 char * buf = NULL; 00222 int Xd[DLEN_], Yd[DLEN_], dbuf[ DLEN_ ]; 00223 /* .. 00224 * .. Executable Statements .. 00225 * 00226 */ 00227 PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); 00228 PB_CargFtoC( *IY, *JY, DESCY, &Yi, &Yj, Yd ); 00229 #ifndef NO_ARGCHK 00230 /* 00231 * Test the input parameters 00232 */ 00233 Cblacs_gridinfo( ( ctxt = Xd[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); 00234 if( !( info = ( ( nprow == -1 ) ? -( 601 + CTXT_ ) : 0 ) ) ) 00235 { 00236 PB_Cchkvec( ctxt, "PCDOTC", "X", *N, 1, Xi, Xj, Xd, *INCX, 6, &info ); 00237 PB_Cchkvec( ctxt, "PCDOTC", "Y", *N, 1, Yi, Yj, Yd, *INCY, 11, &info ); 00238 } 00239 if( info ) { PB_Cabort( ctxt, "PCDOTC", info ); return; } 00240 #endif 00241 DOT[REAL_PART] = ZERO; 00242 DOT[IMAG_PART] = ZERO; 00243 /* 00244 * Quick return if possible 00245 */ 00246 if( *N == 0 ) return; 00247 /* 00248 * Handle degenerate case 00249 */ 00250 if( ( *N == 1 ) && ( ( Xd[ M_ ] == 1 ) || ( Yd[ M_ ] == 1 ) ) ) 00251 { 00252 type = PB_Cctypeset(); 00253 PB_Cpdot11( type, *N, ((char *) DOT), ((char *) X), Xi, Xj, Xd, *INCX, 00254 ((char *) Y), Yi, Yj, Yd, *INCY, type->Fvvdotc ); 00255 return; 00256 } 00257 /* 00258 * Start the operations 00259 */ 00260 #ifdef NO_ARGCHK 00261 Cblacs_gridinfo( ( ctxt = Xd[ CTXT_ ] ), &nprow, &npcol, &myrow, &mycol ); 00262 #endif 00263 /* 00264 * Determine if sub( X ) is distributed or not 00265 */ 00266 if( ( XisRow = ( *INCX == Xd[M_] ) ) != 0 ) 00267 XisD = ( ( Xd[CSRC_] >= 0 ) && ( ( XnprocsD = npcol ) > 1 ) ); 00268 else 00269 XisD = ( ( Xd[RSRC_] >= 0 ) && ( ( XnprocsD = nprow ) > 1 ) ); 00270 /* 00271 * Determine if sub( Y ) is distributed or not 00272 */ 00273 if( ( YisRow = ( *INCY == Yd[M_] ) ) != 0 ) 00274 YisD = ( ( Yd[CSRC_] >= 0 ) && ( ( YnprocsD = npcol ) > 1 ) ); 00275 else 00276 YisD = ( ( Yd[RSRC_] >= 0 ) && ( ( YnprocsD = nprow ) > 1 ) ); 00277 /* 00278 * Are sub( X ) and sub( Y ) both row or column vectors ? 00279 */ 00280 RRorCC = ( ( XisRow && YisRow ) || ( !( XisRow ) && !( YisRow ) ) ); 00281 /* 00282 * XisD && YisD <=> both vector operands are indeed distributed 00283 */ 00284 if( XisD && YisD ) 00285 { 00286 /* 00287 * Retrieve sub( X )'s local information: Xii, Xjj, Xrow, Xcol 00288 */ 00289 PB_Cinfog2l( Xi, Xj, Xd, nprow, npcol, myrow, mycol, &Xii, &Xjj, 00290 &Xrow, &Xcol ); 00291 if( XisRow ) 00292 { 00293 XinbD = Xd[INB_]; XnbD = Xd[NB_]; 00294 Xld = Xd[LLD_]; Xlinc = Xld; 00295 XprocD = Xcol; XmyprocD = mycol; 00296 XprocR = Xrow; XmyprocR = myrow; XnprocsR = nprow; 00297 XisR = ( ( Xrow == -1 ) || ( XnprocsR == 1 ) ); 00298 Mfirstnb( Xinb1D, *N, Xj, XinbD, XnbD ); 00299 } 00300 else 00301 { 00302 XinbD = Xd[IMB_]; XnbD = Xd[MB_]; 00303 Xld = Xd[LLD_]; Xlinc = 1; 00304 XprocD = Xrow; XmyprocD = myrow; 00305 XprocR = Xcol; XmyprocR = mycol; XnprocsR = npcol; 00306 XisR = ( ( Xcol == -1 ) || ( XnprocsR == 1 ) ); 00307 Mfirstnb( Xinb1D, *N, Xi, XinbD, XnbD ); 00308 } 00309 /* 00310 * Retrieve sub( Y )'s local information: Yii, Yjj, Yrow, Ycol 00311 */ 00312 PB_Cinfog2l( Yi, Yj, Yd, nprow, npcol, myrow, mycol, &Yii, &Yjj, 00313 &Yrow, &Ycol ); 00314 if( YisRow ) 00315 { 00316 YinbD = Yd[INB_]; YnbD = Yd[NB_]; 00317 Yld = Yd[LLD_]; Ylinc = Yld; 00318 YprocD = Ycol; YmyprocD = mycol; 00319 YprocR = Yrow; YmyprocR = myrow; YnprocsR = nprow; 00320 YisR = ( ( Yrow == -1 ) || ( YnprocsR == 1 ) ); 00321 Mfirstnb( Yinb1D, *N, Yj, YinbD, YnbD ); 00322 } 00323 else 00324 { 00325 YinbD = Yd[IMB_]; YnbD = Yd[MB_]; 00326 Yld = Yd[LLD_]; Ylinc = 1; 00327 YprocD = Yrow; YmyprocD = myrow; 00328 YprocR = Ycol; YmyprocR = mycol; YnprocsR = npcol; 00329 YisR = ( ( Ycol == -1 ) || ( YnprocsR == 1 ) ); 00330 Mfirstnb( Yinb1D, *N, Yi, YinbD, YnbD ); 00331 } 00332 /* 00333 * Do sub( X ) and sub( Y ) span more than one process ? 00334 */ 00335 OneDgrid = ( ( XnprocsD == 1 ) && ( YnprocsD == 1 ) ); 00336 OneBlock = ( ( Xinb1D >= *N ) && ( Yinb1D >= *N ) ); 00337 /* 00338 * Are sub( X ) and sub( Y ) distributed in the same manner ? 00339 */ 00340 Square = ( ( Xinb1D == Yinb1D ) && ( XnbD == YnbD ) && 00341 ( XnprocsD == YnprocsD ) ); 00342 00343 if( !( XisR ) ) 00344 { 00345 /* 00346 * sub( X ) is not replicated 00347 */ 00348 if( YisR ) 00349 { 00350 /* 00351 * If sub( X ) is not replicated, but sub( Y ) is, a process row or column 00352 * YprocR need to be selected. It will contain the non-replicated vector used 00353 * to perform the dot product computation. 00354 */ 00355 if( RRorCC ) 00356 { 00357 /* 00358 * sub( X ) and sub( Y ) are both row or column vectors 00359 */ 00360 if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) ) 00361 { 00362 /* 00363 * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD. 00364 * Enforce a purely local operation by choosing YprocR to be equal to XprocR. 00365 */ 00366 YprocR = XprocR; 00367 } 00368 else 00369 { 00370 /* 00371 * Otherwise, communication has to occur, so choose the next process row or 00372 * column for YprocR to maximize the number of links, i.e reduce contention. 00373 */ 00374 YprocR = MModAdd1( XprocR, XnprocsR ); 00375 } 00376 } 00377 else 00378 { 00379 /* 00380 * sub( X ) and sub( Y ) are distributed in orthogonal directions, what is 00381 * chosen for YprocR does not really matter. Select the process origin. 00382 */ 00383 YprocR = XprocD; 00384 } 00385 } 00386 else 00387 { 00388 /* 00389 * Neither sub( X ) nor sub( Y ) are replicated. If I am not in process row or 00390 * column XprocR and not in process row or column YprocR, then quick return. 00391 */ 00392 if( ( XmyprocR != XprocR ) && ( YmyprocR != YprocR ) ) 00393 return; 00394 } 00395 } 00396 else 00397 { 00398 /* 00399 * sub( X ) is distributed and replicated (so no quick return possible) 00400 */ 00401 if( YisR ) 00402 { 00403 /* 00404 * sub( Y ) is distributed and replicated as well 00405 */ 00406 if( RRorCC ) 00407 { 00408 /* 00409 * sub( X ) and sub( Y ) are both row or column vectors 00410 */ 00411 if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) ) 00412 { 00413 /* 00414 * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD. 00415 * Enforce a purely local operation by choosing XprocR and YprocR to be equal 00416 * to zero. 00417 */ 00418 XprocR = YprocR = 0; 00419 } 00420 else 00421 { 00422 /* 00423 * Otherwise, communication has to occur, so select YprocR to be zero and the 00424 * next process row or column for XprocR in order to maximize the number of 00425 * used links, i.e reduce contention. 00426 */ 00427 YprocR = 0; 00428 XprocR = MModAdd1( YprocR, YnprocsR ); 00429 } 00430 } 00431 else 00432 { 00433 /* 00434 * sub( X ) and sub( Y ) are distributed in orthogonal directions, select the 00435 * origin processes. 00436 */ 00437 XprocR = YprocD; 00438 YprocR = XprocD; 00439 } 00440 } 00441 else 00442 { 00443 /* 00444 * sub( Y ) is distributed, but not replicated 00445 */ 00446 if( RRorCC ) 00447 { 00448 /* 00449 * sub( X ) and sub( Y ) are both row or column vectors 00450 */ 00451 if( ( OneDgrid || OneBlock || Square ) && ( XprocD == YprocD ) ) 00452 { 00453 /* 00454 * sub( X ) and sub( Y ) start in the same process row or column XprocD=YprocD. 00455 * Enforce a purely local operation by choosing XprocR to be equal to YprocR. 00456 */ 00457 XprocR = YprocR; 00458 } 00459 else 00460 { 00461 /* 00462 * Otherwise, communication has to occur, so choose the next process row or 00463 * column for XprocR to maximize the number of links, i.e reduce contention. 00464 */ 00465 XprocR = MModAdd1( YprocR, YnprocsR ); 00466 } 00467 } 00468 else 00469 { 00470 /* 00471 * sub( X ) and sub( Y ) are distributed in orthogonal directions, what is 00472 * chosen for XprocR does not really matter. Select the origin process. 00473 */ 00474 XprocR = YprocD; 00475 } 00476 } 00477 } 00478 /* 00479 * Even if sub( X ) and/or sub( Y ) are replicated, only two process row or 00480 * column are active, namely XprocR and YprocR. If any of those operands is 00481 * replicated, broadcast will occur (unless there is an easy way out). 00482 */ 00483 type = PB_Cctypeset(); size = type->size; dot = type->Fvvdotc; 00484 /* 00485 * A purely operation occurs iff the operands start in the same process and if 00486 * either the grid is mono-dimensional or there is a single local block to be 00487 * operated with or if both operands are aligned. 00488 */ 00489 if( ( ( RRorCC && ( XprocD == YprocD ) && ( XprocR == YprocR ) ) || 00490 ( !( RRorCC ) && ( XprocD == YprocR ) && ( XprocR == YprocD ) ) ) && 00491 ( OneDgrid || OneBlock || ( RRorCC && Square ) ) ) 00492 { 00493 if( ( !XisR && ( XmyprocR == XprocR ) && 00494 !YisR && ( YmyprocR == YprocR ) ) || 00495 ( !XisR && YisR && ( YmyprocR == YprocR ) ) || 00496 ( !YisR && XisR && ( XmyprocR == XprocR ) ) || 00497 ( XisR && YisR ) ) 00498 { 00499 XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, 00500 XnprocsD ); 00501 YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD, 00502 YnprocsD ); 00503 if( ( XnpD > 0 ) && ( YnpD > 0 ) ) 00504 { 00505 dot( &XnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld, 00506 size ), &Xlinc, Mptr( ((char *) Y), Yii, Yjj, Yld, size ), 00507 &Ylinc ); 00508 } 00509 } 00510 /* 00511 * Combine the local results in sub( X )'s scope 00512 */ 00513 if( ( XisR && YisR ) || ( XmyprocR == XprocR ) ) 00514 { 00515 scope = ( XisRow ? CROW : CCOLUMN ); 00516 top = PB_Ctop( &ctxt, COMBINE, &scope, TOP_GET ); 00517 Ccgsum2d( ctxt, &scope, top, 1, 1, ((char *) DOT), 1, -1, 0 ); 00518 } 00519 if( RRorCC && XisR && YisR ) return; 00520 } 00521 else if( ( RRorCC && OneDgrid ) || OneBlock || Square ) 00522 { 00523 /* 00524 * Otherwise, it may be possible to compute the desired dot-product in a single 00525 * message exchange iff the grid is mono-dimensional and the operands are 00526 * distributed in the same direction, or there is just one block to be exchanged 00527 * or if both operands are similarly distributed in their respective direction. 00528 */ 00529 if( ( YmyprocR == YprocR ) ) 00530 { 00531 /* 00532 * The processes owning a piece of sub( Y ) send it to the corresponding 00533 * process owning s piece of sub ( X ). 00534 */ 00535 YnpD = PB_Cnumroc( *N, 0, Yinb1D, YnbD, YmyprocD, YprocD, 00536 YnprocsD ); 00537 if( YnpD > 0 ) 00538 { 00539 dst = XprocD + MModSub( YmyprocD, YprocD, YnprocsD ); 00540 dst = MPosMod( dst, XnprocsD ); 00541 if( XisRow ) { rdst = XprocR; cdst = dst; } 00542 else { rdst = dst; cdst = XprocR; } 00543 00544 if( ( myrow == rdst ) && ( mycol == cdst ) ) 00545 { 00546 dot( &YnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld, 00547 size ), &Xlinc, Mptr( ((char *) Y), Yii, Yjj, Yld, 00548 size ), &Ylinc ); 00549 } 00550 else 00551 { 00552 if( YisRow ) 00553 Ccgesd2d( ctxt, 1, YnpD, Mptr( ((char *) Y), Yii, Yjj, 00554 Yld, size ), Yld, rdst, cdst ); 00555 else 00556 Ccgesd2d( ctxt, YnpD, 1, Mptr( ((char *) Y), Yii, Yjj, 00557 Yld, size ), Yld, rdst, cdst ); 00558 } 00559 } 00560 } 00561 if( XmyprocR == XprocR ) 00562 { 00563 /* 00564 * The processes owning a piece of sub( X ) receive the corresponding local 00565 * piece of sub( Y ), compute the local dot product and combine the results 00566 * within sub( X )'s scope. 00567 */ 00568 XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, 00569 XnprocsD ); 00570 if( XnpD > 0 ) 00571 { 00572 src = YprocD + MModSub( XmyprocD, XprocD, XnprocsD ); 00573 src = MPosMod( src, YnprocsD ); 00574 if( YisRow ) { rsrc = YprocR; csrc = src; } 00575 else { rsrc = src; csrc = YprocR; } 00576 if( ( myrow != rsrc ) || ( mycol != csrc ) ) 00577 { 00578 buf = PB_Cmalloc( XnpD * size ); 00579 if( YisRow ) 00580 Ccgerv2d( ctxt, 1, XnpD, buf, 1, rsrc, csrc ); 00581 else 00582 Ccgerv2d( ctxt, XnpD, 1, buf, XnpD, rsrc, csrc ); 00583 dot( &XnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld, 00584 size ), &Xlinc, buf, &ione ); 00585 if( buf ) free( buf ); 00586 } 00587 } 00588 if( XisRow ) 00589 { 00590 top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); 00591 Ccgsum2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, -1, 0 ); 00592 } 00593 else 00594 { 00595 top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); 00596 Ccgsum2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, -1, 0 ); 00597 } 00598 } 00599 } 00600 else 00601 { 00602 /* 00603 * General case, copy sub( Y ) within sub( X )'s scope, compute the local 00604 * results and combine them within sub( X )'s scope. 00605 */ 00606 XnpD = PB_Cnumroc( *N, 0, Xinb1D, XnbD, XmyprocD, XprocD, XnprocsD ); 00607 00608 if( XisRow ) 00609 { 00610 PB_Cdescset( dbuf, 1, *N, 1, Xinb1D, 1, XnbD, XprocR, XprocD, ctxt, 00611 1 ); 00612 } 00613 else 00614 { 00615 PB_Cdescset( dbuf, *N, 1, Xinb1D, 1, XnbD, 1, XprocD, XprocR, ctxt, 00616 MAX( 1, XnpD ) ); 00617 } 00618 if( ( XmyprocR == XprocR ) && ( XnpD > 0 ) ) 00619 buf = PB_Cmalloc( XnpD * size ); 00620 00621 if( YisRow ) 00622 { 00623 PB_Cpaxpby( type, NOCONJG, 1, *N, type->one, ((char *) Y), Yi, Yj, 00624 Yd, ROW, type->zero, buf, 0, 0, dbuf, ( XisRow ? ROW : 00625 COLUMN ) ); 00626 } 00627 else 00628 { 00629 PB_Cpaxpby( type, NOCONJG, *N, 1, type->one, ((char *) Y), Yi, Yj, 00630 Yd, COLUMN, type->zero, buf, 0, 0, dbuf, ( XisRow ? 00631 ROW : COLUMN ) ); 00632 } 00633 00634 if( XmyprocR == XprocR ) 00635 { 00636 if( XnpD > 0 ) 00637 { 00638 dot( &XnpD, ((char *) DOT), Mptr( ((char *) X), Xii, Xjj, Xld, 00639 size ), &Xlinc, buf, &ione ); 00640 if( buf ) free( buf ); 00641 } 00642 if( XisRow ) 00643 { 00644 top = PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); 00645 Ccgsum2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, -1, 0 ); 00646 } 00647 else 00648 { 00649 top = PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); 00650 Ccgsum2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, -1, 0 ); 00651 } 00652 } 00653 } 00654 /* 00655 * Send the DOT product result within sub( Y )'s scope 00656 */ 00657 if( XisR || YisR ) 00658 { 00659 /* 00660 * Either sub( X ) or sub( Y ) are replicated, so that every process should have 00661 * the result -> broadcast it orthogonally from sub( X )'s direction. 00662 */ 00663 if( XisRow ) 00664 { 00665 top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); 00666 if( XmyprocR == XprocR ) 00667 Ccgebs2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1 ); 00668 else 00669 Ccgebr2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, XprocR, 00670 XmyprocD ); 00671 } 00672 else 00673 { 00674 top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); 00675 if( XmyprocR == XprocR ) 00676 Ccgebs2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1 ); 00677 else 00678 Ccgebr2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, XmyprocD, 00679 XprocR ); 00680 } 00681 } 00682 else 00683 { 00684 /* 00685 * Neither sub( X ) nor sub( Y ) are replicated 00686 */ 00687 if( RRorCC ) 00688 { 00689 /* 00690 * Both sub( X ) are distributed in the same direction -> the process row or 00691 * column XprocR sends the result to the process row or column YprocR. 00692 */ 00693 if( XprocR != YprocR ) 00694 { 00695 if( XmyprocR == XprocR ) 00696 { 00697 if( XisRow ) 00698 Ccgesd2d( ctxt, 1, 1, ((char *) DOT), 1, YprocR, 00699 YmyprocD ); 00700 else 00701 Ccgesd2d( ctxt, 1, 1, ((char *) DOT), 1, YmyprocD, 00702 YprocR ); 00703 } 00704 else if( YmyprocR == YprocR ) 00705 { 00706 if( XisRow ) 00707 Ccgerv2d( ctxt, 1, 1, ((char *) DOT), 1, XprocR, 00708 XmyprocD ); 00709 else 00710 Ccgerv2d( ctxt, 1, 1, ((char *) DOT), 1, XmyprocD, 00711 XprocR ); 00712 } 00713 } 00714 } 00715 else 00716 { 00717 /* 00718 * Otherwise, the process at the intersection of sub( X )'s and sub( Y )'s 00719 * scope, broadcast the result within sub( Y )'s scope. 00720 */ 00721 if( YmyprocR == YprocR ) 00722 { 00723 if( YisRow ) 00724 { 00725 top = PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); 00726 if( YmyprocD == XprocR ) 00727 Ccgebs2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1 ); 00728 else 00729 Ccgebr2d( ctxt, ROW, top, 1, 1, ((char*)DOT), 1, 00730 YprocR, XprocR ); 00731 } 00732 else 00733 { 00734 top = PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); 00735 if( YmyprocD == XprocR ) 00736 Ccgebs2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1 ); 00737 else 00738 Ccgebr2d( ctxt, COLUMN, top, 1, 1, ((char*)DOT), 1, 00739 XprocR, YprocR ); 00740 } 00741 } 00742 } 00743 } 00744 } 00745 else if( !( XisD ) && YisD ) 00746 { 00747 /* 00748 * sub( X ) is not distributed and sub( Y ) is distributed. 00749 */ 00750 type = PB_Cctypeset(); 00751 PB_CpdotND( type, *N, ((char *) DOT), ((char *) X), Xi, Xj, Xd, *INCX, 00752 ((char *) Y), Yi, Yj, Yd, *INCY, type->Fvvdotc ); 00753 } 00754 else if( XisD && !( YisD ) ) 00755 { 00756 /* 00757 * sub( X ) is distributed and sub( Y ) is not distributed. 00758 */ 00759 type = PB_Cctypeset(); 00760 /* 00761 * Compute DOT := sub( Y )**H * sub( X ) 00762 */ 00763 PB_CpdotND( type, *N, ((char *) DOT), ((char *) Y), Yi, Yj, Yd, *INCY, 00764 ((char *) X), Xi, Xj, Xd, *INCX, type->Fvvdotc ); 00765 /* 00766 * Conjugate the result 00767 */ 00768 DOT[IMAG_PART] = -DOT[IMAG_PART]; 00769 } 00770 else 00771 { 00772 /* 00773 * Neither sub( X ) nor sub( Y ) are distributed 00774 */ 00775 type = PB_Cctypeset(); 00776 PB_CpdotNN( type, *N, ((char *) DOT), ((char *) X), Xi, Xj, Xd, *INCX, 00777 ((char *) Y), Yi, Yj, Yd, *INCY, type->Fvvdotc ); 00778 } 00779 /* 00780 * End of PCDOTC 00781 */ 00782 }