|
ScaLAPACK
2.0.2
ScaLAPACK: Scalable Linear Algebra PACKage
|
00001 /* --------------------------------------------------------------------- 00002 * 00003 * -- PBLAS routine (version 2.0) -- 00004 * University of Tennessee, Knoxville, Oak Ridge National Laboratory, 00005 * and University of California, Berkeley. 00006 * April 1, 1998 00007 * 00008 * --------------------------------------------------------------------- 00009 */ 00010 /* 00011 * Include files 00012 */ 00013 #include "pblas.h" 00014 #include "PBpblas.h" 00015 #include "PBtools.h" 00016 #include "PBblacs.h" 00017 #include "PBblas.h" 00018 00019 #ifdef __STDC__ 00020 void pztrsv_( F_CHAR_T UPLO, F_CHAR_T TRANS, F_CHAR_T DIAG, int * N, 00021 double * A, int * IA, int * JA, int * DESCA, 00022 double * X, int * IX, int * JX, int * DESCX, 00023 int * INCX ) 00024 #else 00025 void pztrsv_( UPLO, TRANS, DIAG, N, A, IA, JA, DESCA, X, IX, JX, 00026 DESCX, INCX ) 00027 /* 00028 * .. Scalar Arguments .. 00029 */ 00030 F_CHAR_T DIAG, TRANS, UPLO; 00031 int * IA, * INCX, * IX, * JA, * JX, * N; 00032 /* 00033 * .. Array Arguments .. 00034 */ 00035 int * DESCA, * DESCX; 00036 double * A, * X; 00037 #endif 00038 { 00039 /* 00040 * Purpose 00041 * ======= 00042 * 00043 * PZTRSV solves one of the systems of equations 00044 * 00045 * sub( A )*sub( X ) = b, or sub( A )'*sub( X ) = b, or 00046 * 00047 * conjg( sub( A )' )*sub( X ) = b, 00048 * 00049 * where 00050 * 00051 * sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1), and, 00052 * 00053 * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X, 00054 * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X. 00055 * 00056 * b and sub( X ) are n element subvectors and sub( A ) is an n by n 00057 * unit, or non-unit, upper or lower triangular submatrix. 00058 * 00059 * No test for singularity or near-singularity is included in this 00060 * routine. Such tests must be performed before calling this routine. 00061 * 00062 * Notes 00063 * ===== 00064 * 00065 * A description vector is associated with each 2D block-cyclicly dis- 00066 * tributed matrix. This vector stores the information required to 00067 * establish the mapping between a matrix entry and its corresponding 00068 * process and memory location. 00069 * 00070 * In the following comments, the character _ should be read as 00071 * "of the distributed matrix". Let A be a generic term for any 2D 00072 * block cyclicly distributed matrix. Its description vector is DESC_A: 00073 * 00074 * NOTATION STORED IN EXPLANATION 00075 * ---------------- --------------- ------------------------------------ 00076 * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type. 00077 * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating 00078 * the NPROW x NPCOL BLACS process grid 00079 * A is distributed over. The context 00080 * itself is global, but the handle 00081 * (the integer value) may vary. 00082 * M_A (global) DESCA[ M_ ] The number of rows in the distribu- 00083 * ted matrix A, M_A >= 0. 00084 * N_A (global) DESCA[ N_ ] The number of columns in the distri- 00085 * buted matrix A, N_A >= 0. 00086 * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left 00087 * block of the matrix A, IMB_A > 0. 00088 * INB_A (global) DESCA[ INB_ ] The number of columns of the upper 00089 * left block of the matrix A, 00090 * INB_A > 0. 00091 * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri- 00092 * bute the last M_A-IMB_A rows of A, 00093 * MB_A > 0. 00094 * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri- 00095 * bute the last N_A-INB_A columns of 00096 * A, NB_A > 0. 00097 * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first 00098 * row of the matrix A is distributed, 00099 * NPROW > RSRC_A >= 0. 00100 * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the 00101 * first column of A is distributed. 00102 * NPCOL > CSRC_A >= 0. 00103 * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local 00104 * array storing the local blocks of 00105 * the distributed matrix A, 00106 * IF( Lc( 1, N_A ) > 0 ) 00107 * LLD_A >= MAX( 1, Lr( 1, M_A ) ) 00108 * ELSE 00109 * LLD_A >= 1. 00110 * 00111 * Let K be the number of rows of a matrix A starting at the global in- 00112 * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows 00113 * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would 00114 * receive if these K rows were distributed over NPROW processes. If K 00115 * is the number of columns of a matrix A starting at the global index 00116 * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co- 00117 * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if 00118 * these K columns were distributed over NPCOL processes. 00119 * 00120 * The values of Lr() and Lc() may be determined via a call to the func- 00121 * tion PB_Cnumroc: 00122 * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW ) 00123 * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL ) 00124 * 00125 * Arguments 00126 * ========= 00127 * 00128 * UPLO (global input) CHARACTER*1 00129 * On entry, UPLO specifies whether the submatrix sub( A ) is 00130 * an upper or lower triangular submatrix as follows: 00131 * 00132 * UPLO = 'U' or 'u' sub( A ) is an upper triangular 00133 * submatrix, 00134 * 00135 * UPLO = 'L' or 'l' sub( A ) is a lower triangular 00136 * submatrix. 00137 * 00138 * TRANS (global input) CHARACTER*1 00139 * On entry, TRANS specifies the operation to be performed as 00140 * follows: 00141 * 00142 * TRANS = 'N' or 'n' sub( A ) * sub( X ) = b. 00143 * 00144 * TRANS = 'T' or 't' sub( A )' * sub( X ) = b. 00145 * 00146 * TRANS = 'C' or 'c' conjg( sub( A )' ) * sub( X ) = b. 00147 * 00148 * DIAG (global input) CHARACTER*1 00149 * On entry, DIAG specifies whether or not sub( A ) is unit 00150 * triangular as follows: 00151 * 00152 * DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian- 00153 * gular, 00154 * 00155 * DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri- 00156 * angular. 00157 * 00158 * N (global input) INTEGER 00159 * On entry, N specifies the order of the submatrix sub( A ). 00160 * N must be at least zero. 00161 * 00162 * A (local input) COMPLEX*16 array 00163 * On entry, A is an array of dimension (LLD_A, Ka), where Ka is 00164 * at least Lc( 1, JA+N-1 ). Before entry, this array contains 00165 * the local entries of the matrix A. 00166 * Before entry with UPLO = 'U' or 'u', this array contains the 00167 * local entries corresponding to the entries of the upper tri- 00168 * angular submatrix sub( A ), and the local entries correspon- 00169 * ding to the entries of the strictly lower triangular part of 00170 * the submatrix sub( A ) are not referenced. 00171 * Before entry with UPLO = 'L' or 'l', this array contains the 00172 * local entries corresponding to the entries of the lower tri- 00173 * angular submatrix sub( A ), and the local entries correspon- 00174 * ding to the entries of the strictly upper triangular part of 00175 * the submatrix sub( A ) are not referenced. 00176 * Note that when DIAG = 'U' or 'u', the local entries corres- 00177 * ponding to the diagonal elements of the submatrix sub( A ) 00178 * are not referenced either, but are assumed to be unity. 00179 * 00180 * IA (global input) INTEGER 00181 * On entry, IA specifies A's global row index, which points to 00182 * the beginning of the submatrix sub( A ). 00183 * 00184 * JA (global input) INTEGER 00185 * On entry, JA specifies A's global column index, which points 00186 * to the beginning of the submatrix sub( A ). 00187 * 00188 * DESCA (global and local input) INTEGER array 00189 * On entry, DESCA is an integer array of dimension DLEN_. This 00190 * is the array descriptor for the matrix A. 00191 * 00192 * X (local input/local output) COMPLEX*16 array 00193 * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X 00194 * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and 00195 * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least 00196 * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise. 00197 * Before entry, this array contains the local entries of the 00198 * matrix X. On entry, sub( X ) is the n element right-hand side 00199 * b. On exit, sub( X ) is overwritten with the solution subvec- 00200 * tor. 00201 * 00202 * IX (global input) INTEGER 00203 * On entry, IX specifies X's global row index, which points to 00204 * the beginning of the submatrix sub( X ). 00205 * 00206 * JX (global input) INTEGER 00207 * On entry, JX specifies X's global column index, which points 00208 * to the beginning of the submatrix sub( X ). 00209 * 00210 * DESCX (global and local input) INTEGER array 00211 * On entry, DESCX is an integer array of dimension DLEN_. This 00212 * is the array descriptor for the matrix X. 00213 * 00214 * INCX (global input) INTEGER 00215 * On entry, INCX specifies the global increment for the 00216 * elements of X. Only two values of INCX are supported in 00217 * this version, namely 1 and M_X. INCX must not be zero. 00218 * 00219 * -- Written on April 1, 1998 by 00220 * Antoine Petitet, University of Tennessee, Knoxville 37996, USA. 00221 * 00222 * --------------------------------------------------------------------- 00223 */ 00224 /* 00225 * .. Local Scalars .. 00226 */ 00227 char DiagA, TranOp, UploA, Xroc, btop, ctop, * negone, * one, 00228 * zero; 00229 int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Akp, Akq, Ald, Amb, Anb, 00230 Anp, Anp0, Anq, Anq0, Arow, Asrc, XACapbX, XACfr, XACld, 00231 XACsum, XARapbX, XARfr, XARld, XARsum, Xi, Xj, ctxt, info, 00232 ione=1, k, kb, kbnext, kbprev, ktmp, mycol, myrow, nb, notran, 00233 nounit, npcol, nprow, size, upper; 00234 PBTYP_T * type; 00235 /* 00236 * .. Local Arrays .. 00237 */ 00238 int Ad[DLEN_], Ad0[DLEN_], XACd[DLEN_], XARd[DLEN_], Xd[DLEN_]; 00239 char * Aptr = NULL, * XAC = NULL, * XAR = NULL; 00240 /* .. 00241 * .. Executable Statements .. 00242 * 00243 */ 00244 upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER ); 00245 notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN ); 00246 nounit = ( ( DiagA = Mupcase( F2C_CHAR( DIAG )[0] ) ) == CNOUNIT ); 00247 PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad ); 00248 PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd ); 00249 #ifndef NO_ARGCHK 00250 /* 00251 * Test the input parameters 00252 */ 00253 Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); 00254 if( !( info = ( ( nprow == -1 ) ? -( 801 + CTXT_ ) : 0 ) ) ) 00255 { 00256 if( ( !upper ) && ( UploA != CLOWER ) ) 00257 { 00258 PB_Cwarn( ctxt, __LINE__, "PZTRSV", "Illegal UPLO = %c\n", UploA ); 00259 info = -1; 00260 } 00261 else if( ( !notran ) && ( TranOp != CTRAN ) && ( TranOp != CCOTRAN ) ) 00262 { 00263 PB_Cwarn( ctxt, __LINE__, "PZTRSV", "Illegal TRANS = %c\n", TranOp ); 00264 info = -2; 00265 } 00266 else if( ( !nounit ) && ( DiagA != CUNIT ) ) 00267 { 00268 PB_Cwarn( ctxt, __LINE__, "PZTRSV", "Illegal DIAG = %c\n", DiagA ); 00269 info = -3; 00270 } 00271 PB_Cchkmat( ctxt, "PZTRSV", "A", *N, 4, *N, 4, Ai, Aj, Ad, 8, &info ); 00272 PB_Cchkvec( ctxt, "PZTRSV", "X", *N, 4, Xi, Xj, Xd, *INCX, 12, &info ); 00273 } 00274 if( info ) { PB_Cabort( ctxt, "PZTRSV", info ); return; } 00275 #endif 00276 /* 00277 * Quick return if possible 00278 */ 00279 if( *N == 0 ) return; 00280 /* 00281 * Retrieve process grid information 00282 */ 00283 #ifdef NO_ARGCHK 00284 Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol ); 00285 #endif 00286 /* 00287 * Get type structure 00288 */ 00289 type = PB_Cztypeset(); 00290 size = type->size; one = type->one; zero = type->zero; negone = type->negone; 00291 /* 00292 * Compute descriptor Ad0 for sub( A ) 00293 */ 00294 PB_Cdescribe( *N, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj, 00295 &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 ); 00296 /* 00297 * Computational partitioning size is computed as the product of the logical 00298 * value returned by pilaenv_ and 2 * lcm( nprow, npcol ) 00299 */ 00300 nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &type->type ) ) * 00301 PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) ); 00302 00303 Xroc = ( *INCX == Xd[M_] ? CROW : CCOLUMN ); 00304 00305 if( notran ) 00306 { 00307 if( upper ) 00308 { 00309 /* 00310 * Save current and enforce ring BLACS topologies 00311 */ 00312 btop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); 00313 ctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); 00314 (void) PB_Ctop( &ctxt, BCAST, COLUMN, TOP_DRING ); 00315 (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_DRING ); 00316 /* 00317 * Remove next line when BLACS combine operations support ring topologies. 00318 */ 00319 (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); 00320 /* 00321 * Reuse sub( X ) and/or create vector XAC in process column owning the last 00322 * column of sub( A ) 00323 */ 00324 PB_CInOutV2( type, NOCONJG, COLUMN, *N, *N, *N-1, Ad0, 1, 00325 ((char *) X), Xi, Xj, Xd, &Xroc, &XAC, XACd, 00326 &XACfr, &XACsum, &XACapbX ); 00327 /* 00328 * Create vector XAR in process rows spanned by sub( A ) 00329 */ 00330 PB_COutV( type, ROW, INIT, *N, *N, Ad0, 1, &XAR, XARd, &XARfr, 00331 &XARsum ); 00332 /* 00333 * Retrieve local quantities related to Ad0 -> sub( A ) 00334 */ 00335 Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; 00336 Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ]; 00337 Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_]; 00338 Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); 00339 Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); 00340 if( ( Anp > 0 ) && ( Anq > 0 ) ) 00341 Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); 00342 00343 XACld = XACd[LLD_]; XARld = XARd[LLD_]; 00344 00345 for( k = ( ( *N - 1 ) / nb ) * nb; k >= 0; k -= nb ) 00346 { 00347 ktmp = *N - k; kb = MIN( ktmp, nb ); 00348 /* 00349 * Solve logical diagonal block, XAC contains the solution scattered in multiple 00350 * process columns and XAR contains the solution replicated in the process rows. 00351 */ 00352 Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); 00353 Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); 00354 PB_Cptrsv( type, XARsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k, 00355 Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0, 00356 Akq, XARld, size ), XARld ); 00357 /* 00358 * Update: only the part of sub( X ) to be solved at the next step is locally 00359 * updated and combined, the remaining part of the vector to be solved later is 00360 * only locally updated. 00361 */ 00362 if( Akp > 0 ) 00363 { 00364 Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); 00365 if( XACsum ) 00366 { 00367 kbprev = MIN( k, nb ); 00368 ktmp = PB_Cnumroc( kbprev, k-kbprev, Aimb1, Amb, myrow, 00369 Arow, nprow ); 00370 Akp -= ktmp; 00371 00372 if( ktmp > 0 ) 00373 { 00374 if( Anq0 > 0 ) 00375 zgemv_( TRANS, &ktmp, &Anq0, negone, 00376 Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, 00377 Mptr( XAR, 0, Akq, XARld, size ), &XARld, one, 00378 Mptr( XAC, Akp, 0, XACld, size ), &ione ); 00379 Asrc = PB_Cindxg2p( k-1, Ainb1, Anb, Acol, Acol, npcol ); 00380 Czgsum2d( ctxt, ROW, &ctop, ktmp, 1, Mptr( XAC, Akp, 00381 0, XACld, size ), XACld, myrow, Asrc ); 00382 if( mycol != Asrc ) 00383 zset_( &ktmp, zero, Mptr( XAC, Akp, 0, XACld, 00384 size ), &ione ); 00385 } 00386 if( Akp > 0 && Anq0 > 0 ) 00387 zgemv_( TRANS, &Akp, &Anq0, negone, 00388 Mptr( Aptr, 0, Akq, Ald, size ), &Ald, 00389 Mptr( XAR, 0, Akq, XARld, size ), &XARld, one, 00390 XAC, &ione ); 00391 } 00392 else 00393 { 00394 if( Anq0 > 0 ) 00395 zgemv_( TRANS, &Akp, &Anq0, negone, 00396 Mptr( Aptr, 0, Akq, Ald, size ), &Ald, 00397 Mptr( XAR, 0, Akq, XARld, size ), &XARld, one, 00398 XAC, &ione ); 00399 } 00400 } 00401 } 00402 /* 00403 * Combine the scattered resulting vector XAC 00404 */ 00405 if( XACsum && ( Anp > 0 ) ) 00406 { 00407 Czgsum2d( ctxt, ROW, &ctop, Anp, 1, XAC, XACld, myrow, 00408 XACd[CSRC_] ); 00409 } 00410 /* 00411 * sub( X ) := XAC (if necessary) 00412 */ 00413 if( XACapbX ) 00414 { 00415 PB_Cpaxpby( type, NOCONJG, *N, 1, one, XAC, 0, 0, XACd, COLUMN, 00416 zero, ((char *) X), Xi, Xj, Xd, &Xroc ); 00417 } 00418 /* 00419 * Restore BLACS topologies 00420 */ 00421 (void) PB_Ctop( &ctxt, BCAST, COLUMN, &btop ); 00422 (void) PB_Ctop( &ctxt, COMBINE, ROW, &ctop ); 00423 } 00424 else 00425 { 00426 /* 00427 * Save current and enforce ring BLACS topologies 00428 */ 00429 btop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET ); 00430 ctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET ); 00431 (void) PB_Ctop( &ctxt, BCAST, COLUMN, TOP_IRING ); 00432 (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_IRING ); 00433 /* 00434 * Remove next line when BLACS combine operations support ring topologies. 00435 */ 00436 (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT ); 00437 /* 00438 * Reuse sub( X ) and/or create vector XAC in process column owning the first 00439 * column of sub( A ) 00440 */ 00441 PB_CInOutV2( type, NOCONJG, COLUMN, *N, *N, 0, Ad0, 1, 00442 ((char *) X), Xi, Xj, Xd, &Xroc, &XAC, XACd, 00443 &XACfr, &XACsum, &XACapbX ); 00444 /* 00445 * Create vector XAR in process rows spanned by sub( A ) 00446 */ 00447 PB_COutV( type, ROW, INIT, *N, *N, Ad0, 1, &XAR, XARd, &XARfr, 00448 &XARsum ); 00449 /* 00450 * Retrieve local quantities related to Ad0 -> sub( A ) 00451 */ 00452 Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; 00453 Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ]; 00454 Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_]; 00455 Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); 00456 Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); 00457 if( ( Anp > 0 ) && ( Anq > 0 ) ) 00458 Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); 00459 00460 XACld = XACd[LLD_]; XARld = XARd[LLD_]; 00461 00462 for( k = 0; k < *N; k += nb ) 00463 { 00464 ktmp = *N - k; kb = MIN( ktmp, nb ); 00465 /* 00466 * Solve logical diagonal block, XAC contains the solution scattered in multiple 00467 * process columns and XAR contains the solution replicated in the process rows. 00468 */ 00469 Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); 00470 Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); 00471 PB_Cptrsv( type, XARsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k, 00472 Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0, 00473 Akq, XARld, size ), XARld ); 00474 /* 00475 * Update: only the part of sub( X ) to be solved at the next step is locally 00476 * updated and combined, the remaining part of the vector to be solved later is 00477 * only locally updated. 00478 */ 00479 Akp = PB_Cnumroc( k+kb, 0, Aimb1, Amb, myrow, Arow, nprow ); 00480 if( ( Anp0 = Anp - Akp ) > 0 ) 00481 { 00482 Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol ); 00483 if( XACsum ) 00484 { 00485 kbnext = ktmp - kb; kbnext = MIN( kbnext, nb ); 00486 ktmp = PB_Cnumroc( kbnext, k+kb, Aimb1, Amb, myrow, Arow, 00487 nprow ); 00488 Anp0 -= ktmp; 00489 00490 if( ktmp > 0 ) 00491 { 00492 if( Anq0 > 0 ) 00493 zgemv_( TRANS, &ktmp, &Anq0, negone, 00494 Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, 00495 Mptr( XAR, 0, Akq, XARld, size ), &XARld, one, 00496 Mptr( XAC, Akp, 0, XACld, size ), &ione ); 00497 Asrc = PB_Cindxg2p( k+kb, Ainb1, Anb, Acol, Acol, npcol ); 00498 Czgsum2d( ctxt, ROW, &ctop, ktmp, 1, Mptr( XAC, Akp, 00499 0, XACld, size ), XACld, myrow, Asrc ); 00500 if( mycol != Asrc ) 00501 zset_( &ktmp, zero, Mptr( XAC, Akp, 0, XACld, 00502 size ), &ione ); 00503 } 00504 if( Anp0 > 0 && Anq0 > 0 ) 00505 zgemv_( TRANS, &Anp0, &Anq0, negone, 00506 Mptr( Aptr, Akp+ktmp, Akq, Ald, size ), &Ald, 00507 Mptr( XAR, 0, Akq, XARld, size ), &XARld, 00508 one, 00509 Mptr( XAC, Akp+ktmp, 0, XACld, size ), &ione ); 00510 } 00511 else 00512 { 00513 if( Anq0 > 0 ) 00514 zgemv_( TRANS, &Anp0, &Anq0, negone, 00515 Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, 00516 Mptr( XAR, 0, Akq, XARld, size ), &XARld, 00517 one, 00518 Mptr( XAC, Akp, 0, XACld, size ), &ione ); 00519 } 00520 } 00521 } 00522 /* 00523 * Combine the scattered resulting vector XAC 00524 */ 00525 if( XACsum && ( Anp > 0 ) ) 00526 { 00527 Czgsum2d( ctxt, ROW, &ctop, Anp, 1, XAC, XACld, myrow, 00528 XACd[CSRC_] ); 00529 } 00530 /* 00531 * sub( X ) := XAC (if necessary) 00532 */ 00533 if( XACapbX ) 00534 { 00535 PB_Cpaxpby( type, NOCONJG, *N, 1, one, XAC, 0, 0, XACd, 00536 COLUMN, zero, ((char *) X), Xi, Xj, Xd, &Xroc ); 00537 } 00538 /* 00539 * Restore BLACS topologies 00540 */ 00541 (void) PB_Ctop( &ctxt, BCAST, COLUMN, &btop ); 00542 (void) PB_Ctop( &ctxt, COMBINE, ROW, &ctop ); 00543 } 00544 } 00545 else 00546 { 00547 if( upper ) 00548 { 00549 /* 00550 * Save current and enforce ring BLACS topologies 00551 */ 00552 btop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); 00553 ctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); 00554 (void) PB_Ctop( &ctxt, BCAST, ROW, TOP_IRING ); 00555 (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_IRING ); 00556 /* 00557 * Remove next line when BLACS combine operations support ring topologies. 00558 */ 00559 (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); 00560 /* 00561 * Reuse sub( X ) and/or create vector XAR in process row owning the first row 00562 * of sub( A ) 00563 */ 00564 PB_CInOutV2( type, NOCONJG, ROW, *N, *N, 0, Ad0, 1, 00565 ((char *) X), Xi, Xj, Xd, &Xroc, &XAR, XARd, 00566 &XARfr, &XARsum, &XARapbX ); 00567 /* 00568 * Create vector XAC in process columns spanned by sub( A ) 00569 */ 00570 PB_COutV( type, COLUMN, INIT, *N, *N, Ad0, 1, &XAC, XACd, &XACfr, 00571 &XACsum ); 00572 /* 00573 * Retrieve local quantities related to Ad0 -> sub( A ) 00574 */ 00575 Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; 00576 Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ]; 00577 Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_]; 00578 Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); 00579 Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); 00580 if( ( Anp > 0 ) && ( Anq > 0 ) ) 00581 Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); 00582 00583 XACld = XACd[LLD_]; XARld = XARd[LLD_]; 00584 00585 for( k = 0; k < *N; k += nb ) 00586 { 00587 ktmp = *N - k; kb = MIN( ktmp, nb ); 00588 /* 00589 * Solve logical diagonal block, XAR contains the solution scattered in multiple 00590 * process rows and XAC contains the solution replicated in the process columns. 00591 */ 00592 Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); 00593 Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); 00594 PB_Cptrsv( type, XACsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k, 00595 Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0, 00596 Akq, XARld, size ), XARld ); 00597 /* 00598 * Update: only the part of sub( X ) to be solved at the next step is locally 00599 * updated and combined, the remaining part of the vector to be solved later is 00600 * only locally updated. 00601 */ 00602 Akq = PB_Cnumroc( k+kb, 0, Ainb1, Anb, mycol, Acol, npcol ); 00603 if( ( Anq0 = Anq - Akq ) > 0 ) 00604 { 00605 Anp0 = PB_Cnumroc( kb, k, Aimb1, Amb, myrow, Arow, nprow ); 00606 if( XARsum ) 00607 { 00608 kbnext = ktmp - kb; kbnext = MIN( kbnext, nb ); 00609 ktmp = PB_Cnumroc( kbnext, k+kb, Ainb1, Anb, mycol, Acol, 00610 npcol ); 00611 Anq0 -= ktmp; 00612 00613 if( ktmp > 0 ) 00614 { 00615 if( Anp0 > 0 ) 00616 zgemv_( TRANS, &Anp0, &ktmp, negone, 00617 Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, 00618 Mptr( XAC, Akp, 0, XACld, size ), &ione, one, 00619 Mptr( XAR, 0, Akq, XARld, size ), &XARld ); 00620 Asrc = PB_Cindxg2p( k+kb, Aimb1, Amb, Arow, Arow, nprow ); 00621 Czgsum2d( ctxt, COLUMN, &ctop, 1, ktmp, Mptr( XAR, 0, 00622 Akq, XARld, size ), XARld, Asrc, mycol ); 00623 if( myrow != Asrc ) 00624 zset_( &ktmp, zero, Mptr( XAR, 0, Akq, XARld, 00625 size ), &XARld ); 00626 } 00627 if( Anp0 > 0 && Anq0 > 0 ) 00628 zgemv_( TRANS, &Anp0, &Anq0, negone, 00629 Mptr( Aptr, Akp, Akq+ktmp, Ald, size ), &Ald, 00630 Mptr( XAC, Akp, 0, XACld, size ), &ione, one, 00631 Mptr( XAR, 0, Akq+ktmp, XARld, size ), &XARld ); 00632 } 00633 else 00634 { 00635 if( Anp0 > 0 ) 00636 zgemv_( TRANS, &Anp0, &Anq0, negone, 00637 Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, 00638 Mptr( XAC, Akp, 0, XACld, size ), &ione, one, 00639 Mptr( XAR, 0, Akq, XARld, size ), &XARld ); 00640 } 00641 } 00642 } 00643 /* 00644 * Combine the scattered resulting vector XAR 00645 */ 00646 if( XARsum && ( Anq > 0 ) ) 00647 { 00648 Czgsum2d( ctxt, COLUMN, &ctop, 1, Anq, XAR, XARld, XARd[RSRC_], 00649 mycol ); 00650 } 00651 /* 00652 * sub( X ) := XAR (if necessary) 00653 */ 00654 if( XARapbX ) 00655 { 00656 PB_Cpaxpby( type, NOCONJG, 1, *N, one, XAR, 0, 0, XARd, ROW, zero, 00657 ((char *) X), Xi, Xj, Xd, &Xroc ); 00658 } 00659 /* 00660 * Restore BLACS topologies 00661 */ 00662 (void) PB_Ctop( &ctxt, BCAST, ROW, &btop ); 00663 (void) PB_Ctop( &ctxt, COMBINE, COLUMN, &ctop ); 00664 } 00665 else 00666 { 00667 /* 00668 * Save current and enforce ring BLACS topologies 00669 */ 00670 btop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET ); 00671 ctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET ); 00672 (void) PB_Ctop( &ctxt, BCAST, ROW, TOP_DRING ); 00673 (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DRING ); 00674 /* 00675 * Remove next line when BLACS combine operations support ring topologies. 00676 */ 00677 (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT ); 00678 /* 00679 * Reuse sub( X ) and/or create vector XAC in process row owning the last row 00680 * of sub( A ) 00681 */ 00682 PB_CInOutV2( type, NOCONJG, ROW, *N, *N, *N-1, Ad0, 1, 00683 ((char *) X), Xi, Xj, Xd, &Xroc, &XAR, XARd, 00684 &XARfr, &XARsum, &XARapbX ); 00685 /* 00686 * Create vector XAC in process columns spanned by sub( A ) 00687 */ 00688 PB_COutV( type, COLUMN, INIT, *N, *N, Ad0, 1, &XAC, XACd, &XACfr, 00689 &XACsum ); 00690 /* 00691 * Retrieve local quantities related to Ad0 -> sub( A ) 00692 */ 00693 Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ]; 00694 Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ]; 00695 Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_]; 00696 Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow ); 00697 Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol ); 00698 if( ( Anp > 0 ) && ( Anq > 0 ) ) 00699 Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size ); 00700 00701 XACld = XACd[LLD_]; XARld = XARd[LLD_]; 00702 00703 for( k = ( ( *N - 1 ) / nb ) * nb; k >= 0; k -= nb ) 00704 { 00705 ktmp = *N - k; kb = MIN( ktmp, nb ); 00706 /* 00707 * Solve logical diagonal block, XAR contains the solution scattered in multiple 00708 * process rows and XAC contains the solution replicated in the process columns. 00709 */ 00710 Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow ); 00711 Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol ); 00712 PB_Cptrsv( type, XACsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k, 00713 Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0, 00714 Akq, XARld, size ), XARld ); 00715 /* 00716 * Update: only the part of sub( X ) to be solved at the next step is locally 00717 * updated and combined, the remaining part of the vector to be solved later 00718 * is only locally updated. 00719 */ 00720 if( Akq > 0 ) 00721 { 00722 Anp0 = PB_Cnumroc( kb, k, Aimb1, Amb, myrow, Arow, nprow ); 00723 if( XARsum ) 00724 { 00725 kbprev = MIN( k, nb ); 00726 ktmp = PB_Cnumroc( kbprev, k-kbprev, Ainb1, Anb, mycol, 00727 Acol, npcol ); 00728 Akq -= ktmp; 00729 00730 if( ktmp > 0 ) 00731 { 00732 if( Anp0 > 0 ) 00733 zgemv_( TRANS, &Anp0, &ktmp, negone, 00734 Mptr( Aptr, Akp, Akq, Ald, size ), &Ald, 00735 Mptr( XAC, Akp, 0, XACld, size ), &ione, one, 00736 Mptr( XAR, 0, Akq, XARld, size ), &XARld ); 00737 Asrc = PB_Cindxg2p( k-1, Aimb1, Amb, Arow, Arow, nprow ); 00738 Czgsum2d( ctxt, COLUMN, &ctop, 1, ktmp, Mptr( XAR, 0, 00739 Akq, XARld, size ), XARld, Asrc, mycol ); 00740 if( myrow != Asrc ) 00741 zset_( &ktmp, zero, Mptr( XAR, 0, Akq, XARld, 00742 size ), &XARld ); 00743 } 00744 if( Anp0 > 0 && Akq > 0 ) 00745 zgemv_( TRANS, &Anp0, &Akq, negone, 00746 Mptr( Aptr, Akp, 0, Ald, size ), &Ald, 00747 Mptr( XAC, Akp, 0, XACld, size ), &ione, 00748 one, XAR, &XARld ); 00749 } 00750 else 00751 { 00752 if( Anp0 > 0 ) 00753 zgemv_( TRANS, &Anp0, &Akq, negone, 00754 Mptr( Aptr, Akp, 0, Ald, size ), &Ald, 00755 Mptr( XAC, Akp, 0, XACld, size ), &ione, 00756 one, XAR, &XARld ); 00757 } 00758 } 00759 } 00760 /* 00761 * Combine the scattered resulting vector XAR 00762 */ 00763 if( XARsum && ( Anq > 0 ) ) 00764 { 00765 Czgsum2d( ctxt, COLUMN, &ctop, 1, Anq, XAR, XARld, XARd[RSRC_], 00766 mycol ); 00767 } 00768 /* 00769 * sub( X ) := XAR (if necessary) 00770 */ 00771 if( XARapbX ) 00772 { 00773 PB_Cpaxpby( type, NOCONJG, 1, *N, one, XAR, 0, 0, XARd, ROW, zero, 00774 ((char *) X), Xi, Xj, Xd, &Xroc ); 00775 } 00776 /* 00777 * Restore BLACS topologies 00778 */ 00779 (void) PB_Ctop( &ctxt, BCAST, ROW, &btop ); 00780 (void) PB_Ctop( &ctxt, COMBINE, COLUMN, &ctop ); 00781 } 00782 } 00783 if( XACfr ) free( XAC ); 00784 if( XARfr ) free( XAR ); 00785 /* 00786 * End of PZTRSV 00787 */ 00788 }