ScaLAPACK 2.1  2.1 ScaLAPACK: Scalable Linear Algebra PACKage
pztrsv_.c
Go to the documentation of this file.
1 /* ---------------------------------------------------------------------
2 *
3 * -- PBLAS routine (version 2.0) --
4 * University of Tennessee, Knoxville, Oak Ridge National Laboratory,
5 * and University of California, Berkeley.
6 * April 1, 1998
7 *
8 * ---------------------------------------------------------------------
9 */
10 /*
11 * Include files
12 */
13 #include "pblas.h"
14 #include "PBpblas.h"
15 #include "PBtools.h"
16 #include "PBblacs.h"
17 #include "PBblas.h"
18
19 #ifdef __STDC__
20 void pztrsv_( F_CHAR_T UPLO, F_CHAR_T TRANS, F_CHAR_T DIAG, int * N,
21  double * A, int * IA, int * JA, int * DESCA,
22  double * X, int * IX, int * JX, int * DESCX,
23  int * INCX )
24 #else
25 void pztrsv_( UPLO, TRANS, DIAG, N, A, IA, JA, DESCA, X, IX, JX,
26  DESCX, INCX )
27 /*
28 * .. Scalar Arguments ..
29 */
30  F_CHAR_T DIAG, TRANS, UPLO;
31  int * IA, * INCX, * IX, * JA, * JX, * N;
32 /*
33 * .. Array Arguments ..
34 */
35  int * DESCA, * DESCX;
36  double * A, * X;
37 #endif
38 {
39 /*
40 * Purpose
41 * =======
42 *
43 * PZTRSV solves one of the systems of equations
44 *
45 * sub( A )*sub( X ) = b, or sub( A )'*sub( X ) = b, or
46 *
47 * conjg( sub( A )' )*sub( X ) = b,
48 *
49 * where
50 *
51 * sub( A ) denotes A(IA:IA+N-1,JA:JA+N-1), and,
52 *
53 * sub( X ) denotes X(IX,JX:JX+N-1) if INCX = M_X,
54 * X(IX:IX+N-1,JX) if INCX = 1 and INCX <> M_X.
55 *
56 * b and sub( X ) are n element subvectors and sub( A ) is an n by n
57 * unit, or non-unit, upper or lower triangular submatrix.
58 *
59 * No test for singularity or near-singularity is included in this
60 * routine. Such tests must be performed before calling this routine.
61 *
62 * Notes
63 * =====
64 *
65 * A description vector is associated with each 2D block-cyclicly dis-
66 * tributed matrix. This vector stores the information required to
67 * establish the mapping between a matrix entry and its corresponding
68 * process and memory location.
69 *
70 * In the following comments, the character _ should be read as
71 * "of the distributed matrix". Let A be a generic term for any 2D
72 * block cyclicly distributed matrix. Its description vector is DESC_A:
73 *
74 * NOTATION STORED IN EXPLANATION
75 * ---------------- --------------- ------------------------------------
76 * DTYPE_A (global) DESCA[ DTYPE_ ] The descriptor type.
77 * CTXT_A (global) DESCA[ CTXT_ ] The BLACS context handle, indicating
78 * the NPROW x NPCOL BLACS process grid
79 * A is distributed over. The context
80 * itself is global, but the handle
81 * (the integer value) may vary.
82 * M_A (global) DESCA[ M_ ] The number of rows in the distribu-
83 * ted matrix A, M_A >= 0.
84 * N_A (global) DESCA[ N_ ] The number of columns in the distri-
85 * buted matrix A, N_A >= 0.
86 * IMB_A (global) DESCA[ IMB_ ] The number of rows of the upper left
87 * block of the matrix A, IMB_A > 0.
88 * INB_A (global) DESCA[ INB_ ] The number of columns of the upper
89 * left block of the matrix A,
90 * INB_A > 0.
91 * MB_A (global) DESCA[ MB_ ] The blocking factor used to distri-
92 * bute the last M_A-IMB_A rows of A,
93 * MB_A > 0.
94 * NB_A (global) DESCA[ NB_ ] The blocking factor used to distri-
95 * bute the last N_A-INB_A columns of
96 * A, NB_A > 0.
97 * RSRC_A (global) DESCA[ RSRC_ ] The process row over which the first
98 * row of the matrix A is distributed,
99 * NPROW > RSRC_A >= 0.
100 * CSRC_A (global) DESCA[ CSRC_ ] The process column over which the
101 * first column of A is distributed.
102 * NPCOL > CSRC_A >= 0.
103 * LLD_A (local) DESCA[ LLD_ ] The leading dimension of the local
104 * array storing the local blocks of
105 * the distributed matrix A,
106 * IF( Lc( 1, N_A ) > 0 )
107 * LLD_A >= MAX( 1, Lr( 1, M_A ) )
108 * ELSE
109 * LLD_A >= 1.
110 *
111 * Let K be the number of rows of a matrix A starting at the global in-
112 * dex IA,i.e, A( IA:IA+K-1, : ). Lr( IA, K ) denotes the number of rows
113 * that the process of row coordinate MYROW ( 0 <= MYROW < NPROW ) would
114 * receive if these K rows were distributed over NPROW processes. If K
115 * is the number of columns of a matrix A starting at the global index
116 * JA, i.e, A( :, JA:JA+K-1, : ), Lc( JA, K ) denotes the number of co-
117 * lumns that the process MYCOL ( 0 <= MYCOL < NPCOL ) would receive if
118 * these K columns were distributed over NPCOL processes.
119 *
120 * The values of Lr() and Lc() may be determined via a call to the func-
121 * tion PB_Cnumroc:
122 * Lr( IA, K ) = PB_Cnumroc( K, IA, IMB_A, MB_A, MYROW, RSRC_A, NPROW )
123 * Lc( JA, K ) = PB_Cnumroc( K, JA, INB_A, NB_A, MYCOL, CSRC_A, NPCOL )
124 *
125 * Arguments
126 * =========
127 *
128 * UPLO (global input) CHARACTER*1
129 * On entry, UPLO specifies whether the submatrix sub( A ) is
130 * an upper or lower triangular submatrix as follows:
131 *
132 * UPLO = 'U' or 'u' sub( A ) is an upper triangular
133 * submatrix,
134 *
135 * UPLO = 'L' or 'l' sub( A ) is a lower triangular
136 * submatrix.
137 *
138 * TRANS (global input) CHARACTER*1
139 * On entry, TRANS specifies the operation to be performed as
140 * follows:
141 *
142 * TRANS = 'N' or 'n' sub( A ) * sub( X ) = b.
143 *
144 * TRANS = 'T' or 't' sub( A )' * sub( X ) = b.
145 *
146 * TRANS = 'C' or 'c' conjg( sub( A )' ) * sub( X ) = b.
147 *
148 * DIAG (global input) CHARACTER*1
149 * On entry, DIAG specifies whether or not sub( A ) is unit
150 * triangular as follows:
151 *
152 * DIAG = 'U' or 'u' sub( A ) is assumed to be unit trian-
153 * gular,
154 *
155 * DIAG = 'N' or 'n' sub( A ) is not assumed to be unit tri-
156 * angular.
157 *
158 * N (global input) INTEGER
159 * On entry, N specifies the order of the submatrix sub( A ).
160 * N must be at least zero.
161 *
162 * A (local input) COMPLEX*16 array
163 * On entry, A is an array of dimension (LLD_A, Ka), where Ka is
164 * at least Lc( 1, JA+N-1 ). Before entry, this array contains
165 * the local entries of the matrix A.
166 * Before entry with UPLO = 'U' or 'u', this array contains the
167 * local entries corresponding to the entries of the upper tri-
168 * angular submatrix sub( A ), and the local entries correspon-
169 * ding to the entries of the strictly lower triangular part of
170 * the submatrix sub( A ) are not referenced.
171 * Before entry with UPLO = 'L' or 'l', this array contains the
172 * local entries corresponding to the entries of the lower tri-
173 * angular submatrix sub( A ), and the local entries correspon-
174 * ding to the entries of the strictly upper triangular part of
175 * the submatrix sub( A ) are not referenced.
176 * Note that when DIAG = 'U' or 'u', the local entries corres-
177 * ponding to the diagonal elements of the submatrix sub( A )
178 * are not referenced either, but are assumed to be unity.
179 *
180 * IA (global input) INTEGER
181 * On entry, IA specifies A's global row index, which points to
182 * the beginning of the submatrix sub( A ).
183 *
184 * JA (global input) INTEGER
185 * On entry, JA specifies A's global column index, which points
186 * to the beginning of the submatrix sub( A ).
187 *
188 * DESCA (global and local input) INTEGER array
189 * On entry, DESCA is an integer array of dimension DLEN_. This
190 * is the array descriptor for the matrix A.
191 *
192 * X (local input/local output) COMPLEX*16 array
193 * On entry, X is an array of dimension (LLD_X, Kx), where LLD_X
194 * is at least MAX( 1, Lr( 1, IX ) ) when INCX = M_X and
195 * MAX( 1, Lr( 1, IX+N-1 ) ) otherwise, and, Kx is at least
196 * Lc( 1, JX+N-1 ) when INCX = M_X and Lc( 1, JX ) otherwise.
197 * Before entry, this array contains the local entries of the
198 * matrix X. On entry, sub( X ) is the n element right-hand side
199 * b. On exit, sub( X ) is overwritten with the solution subvec-
200 * tor.
201 *
202 * IX (global input) INTEGER
203 * On entry, IX specifies X's global row index, which points to
204 * the beginning of the submatrix sub( X ).
205 *
206 * JX (global input) INTEGER
207 * On entry, JX specifies X's global column index, which points
208 * to the beginning of the submatrix sub( X ).
209 *
210 * DESCX (global and local input) INTEGER array
211 * On entry, DESCX is an integer array of dimension DLEN_. This
212 * is the array descriptor for the matrix X.
213 *
214 * INCX (global input) INTEGER
215 * On entry, INCX specifies the global increment for the
216 * elements of X. Only two values of INCX are supported in
217 * this version, namely 1 and M_X. INCX must not be zero.
218 *
219 * -- Written on April 1, 1998 by
220 * Antoine Petitet, University of Tennessee, Knoxville 37996, USA.
221 *
222 * ---------------------------------------------------------------------
223 */
224 /*
225 * .. Local Scalars ..
226 */
227  char DiagA, TranOp, UploA, Xroc, btop, ctop, * negone, * one,
228  * zero;
229  int Acol, Ai, Aii, Aimb1, Ainb1, Aj, Ajj, Akp, Akq, Ald, Amb, Anb,
230  Anp, Anp0, Anq, Anq0, Arow, Asrc, XACapbX, XACfr, XACld,
231  XACsum, XARapbX, XARfr, XARld, XARsum, Xi, Xj, ctxt, info,
232  ione=1, k, kb, kbnext, kbprev, ktmp, mycol, myrow, nb, notran,
233  nounit, npcol, nprow, size, upper;
234  PBTYP_T * type;
235 /*
236 * .. Local Arrays ..
237 */
238  int Ad[DLEN_], Ad0[DLEN_], XACd[DLEN_], XARd[DLEN_], Xd[DLEN_];
239  char * Aptr = NULL, * XAC = NULL, * XAR = NULL;
240 /* ..
241 * .. Executable Statements ..
242 *
243 */
244  upper = ( ( UploA = Mupcase( F2C_CHAR( UPLO )[0] ) ) == CUPPER );
245  notran = ( ( TranOp = Mupcase( F2C_CHAR( TRANS )[0] ) ) == CNOTRAN );
246  nounit = ( ( DiagA = Mupcase( F2C_CHAR( DIAG )[0] ) ) == CNOUNIT );
247  PB_CargFtoC( *IA, *JA, DESCA, &Ai, &Aj, Ad );
248  PB_CargFtoC( *IX, *JX, DESCX, &Xi, &Xj, Xd );
249 #ifndef NO_ARGCHK
250 /*
251 * Test the input parameters
252 */
253  Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
254  if( !( info = ( ( nprow == -1 ) ? -( 801 + CTXT_ ) : 0 ) ) )
255  {
256  if( ( !upper ) && ( UploA != CLOWER ) )
257  {
258  PB_Cwarn( ctxt, __LINE__, "PZTRSV", "Illegal UPLO = %c\n", UploA );
259  info = -1;
260  }
261  else if( ( !notran ) && ( TranOp != CTRAN ) && ( TranOp != CCOTRAN ) )
262  {
263  PB_Cwarn( ctxt, __LINE__, "PZTRSV", "Illegal TRANS = %c\n", TranOp );
264  info = -2;
265  }
266  else if( ( !nounit ) && ( DiagA != CUNIT ) )
267  {
268  PB_Cwarn( ctxt, __LINE__, "PZTRSV", "Illegal DIAG = %c\n", DiagA );
269  info = -3;
270  }
271  PB_Cchkmat( ctxt, "PZTRSV", "A", *N, 4, *N, 4, Ai, Aj, Ad, 8, &info );
272  PB_Cchkvec( ctxt, "PZTRSV", "X", *N, 4, Xi, Xj, Xd, *INCX, 12, &info );
273  }
274  if( info ) { PB_Cabort( ctxt, "PZTRSV", info ); return; }
275 #endif
276 /*
277 * Quick return if possible
278 */
279  if( *N == 0 ) return;
280 /*
281 * Retrieve process grid information
282 */
283 #ifdef NO_ARGCHK
284  Cblacs_gridinfo( ( ctxt = Ad[CTXT_] ), &nprow, &npcol, &myrow, &mycol );
285 #endif
286 /*
287 * Get type structure
288 */
289  type = PB_Cztypeset();
290  size = type->size; one = type->one; zero = type->zero; negone = type->negone;
291 /*
292 * Compute descriptor Ad0 for sub( A )
293 */
294  PB_Cdescribe( *N, *N, Ai, Aj, Ad, nprow, npcol, myrow, mycol, &Aii, &Ajj,
295  &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 );
296 /*
297 * Computational partitioning size is computed as the product of the logical
298 * value returned by pilaenv_ and 2 * lcm( nprow, npcol )
299 */
300  nb = 2 * pilaenv_( &ctxt, C2F_CHAR( &type->type ) ) *
301  PB_Clcm( ( Arow >= 0 ? nprow : 1 ), ( Acol >= 0 ? npcol : 1 ) );
302
303  Xroc = ( *INCX == Xd[M_] ? CROW : CCOLUMN );
304
305  if( notran )
306  {
307  if( upper )
308  {
309 /*
310 * Save current and enforce ring BLACS topologies
311 */
312  btop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
313  ctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
314  (void) PB_Ctop( &ctxt, BCAST, COLUMN, TOP_DRING );
315  (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_DRING );
316 /*
317 * Remove next line when BLACS combine operations support ring topologies.
318 */
319  (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT );
320 /*
321 * Reuse sub( X ) and/or create vector XAC in process column owning the last
322 * column of sub( A )
323 */
324  PB_CInOutV2( type, NOCONJG, COLUMN, *N, *N, *N-1, Ad0, 1,
325  ((char *) X), Xi, Xj, Xd, &Xroc, &XAC, XACd,
326  &XACfr, &XACsum, &XACapbX );
327 /*
328 * Create vector XAR in process rows spanned by sub( A )
329 */
330  PB_COutV( type, ROW, INIT, *N, *N, Ad0, 1, &XAR, XARd, &XARfr,
331  &XARsum );
332 /*
333 * Retrieve local quantities related to Ad0 -> sub( A )
334 */
335  Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ];
336  Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ];
337  Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_];
338  Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow );
339  Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol );
340  if( ( Anp > 0 ) && ( Anq > 0 ) )
341  Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size );
342
343  XACld = XACd[LLD_]; XARld = XARd[LLD_];
344
345  for( k = ( ( *N - 1 ) / nb ) * nb; k >= 0; k -= nb )
346  {
347  ktmp = *N - k; kb = MIN( ktmp, nb );
348 /*
349 * Solve logical diagonal block, XAC contains the solution scattered in multiple
350 * process columns and XAR contains the solution replicated in the process rows.
351 */
352  Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow );
353  Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol );
354  PB_Cptrsv( type, XARsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k,
355  Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0,
356  Akq, XARld, size ), XARld );
357 /*
358 * Update: only the part of sub( X ) to be solved at the next step is locally
359 * updated and combined, the remaining part of the vector to be solved later is
360 * only locally updated.
361 */
362  if( Akp > 0 )
363  {
364  Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol );
365  if( XACsum )
366  {
367  kbprev = MIN( k, nb );
368  ktmp = PB_Cnumroc( kbprev, k-kbprev, Aimb1, Amb, myrow,
369  Arow, nprow );
370  Akp -= ktmp;
371
372  if( ktmp > 0 )
373  {
374  if( Anq0 > 0 )
375  zgemv_( TRANS, &ktmp, &Anq0, negone,
376  Mptr( Aptr, Akp, Akq, Ald, size ), &Ald,
377  Mptr( XAR, 0, Akq, XARld, size ), &XARld, one,
378  Mptr( XAC, Akp, 0, XACld, size ), &ione );
379  Asrc = PB_Cindxg2p( k-1, Ainb1, Anb, Acol, Acol, npcol );
380  Czgsum2d( ctxt, ROW, &ctop, ktmp, 1, Mptr( XAC, Akp,
381  0, XACld, size ), XACld, myrow, Asrc );
382  if( mycol != Asrc )
383  zset_( &ktmp, zero, Mptr( XAC, Akp, 0, XACld,
384  size ), &ione );
385  }
386  if( Akp > 0 && Anq0 > 0 )
387  zgemv_( TRANS, &Akp, &Anq0, negone,
388  Mptr( Aptr, 0, Akq, Ald, size ), &Ald,
389  Mptr( XAR, 0, Akq, XARld, size ), &XARld, one,
390  XAC, &ione );
391  }
392  else
393  {
394  if( Anq0 > 0 )
395  zgemv_( TRANS, &Akp, &Anq0, negone,
396  Mptr( Aptr, 0, Akq, Ald, size ), &Ald,
397  Mptr( XAR, 0, Akq, XARld, size ), &XARld, one,
398  XAC, &ione );
399  }
400  }
401  }
402 /*
403 * Combine the scattered resulting vector XAC
404 */
405  if( XACsum && ( Anp > 0 ) )
406  {
407  Czgsum2d( ctxt, ROW, &ctop, Anp, 1, XAC, XACld, myrow,
408  XACd[CSRC_] );
409  }
410 /*
411 * sub( X ) := XAC (if necessary)
412 */
413  if( XACapbX )
414  {
415  PB_Cpaxpby( type, NOCONJG, *N, 1, one, XAC, 0, 0, XACd, COLUMN,
416  zero, ((char *) X), Xi, Xj, Xd, &Xroc );
417  }
418 /*
419 * Restore BLACS topologies
420 */
421  (void) PB_Ctop( &ctxt, BCAST, COLUMN, &btop );
422  (void) PB_Ctop( &ctxt, COMBINE, ROW, &ctop );
423  }
424  else
425  {
426 /*
427 * Save current and enforce ring BLACS topologies
428 */
429  btop = *PB_Ctop( &ctxt, BCAST, COLUMN, TOP_GET );
430  ctop = *PB_Ctop( &ctxt, COMBINE, ROW, TOP_GET );
431  (void) PB_Ctop( &ctxt, BCAST, COLUMN, TOP_IRING );
432  (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_IRING );
433 /*
434 * Remove next line when BLACS combine operations support ring topologies.
435 */
436  (void) PB_Ctop( &ctxt, COMBINE, ROW, TOP_DEFAULT );
437 /*
438 * Reuse sub( X ) and/or create vector XAC in process column owning the first
439 * column of sub( A )
440 */
441  PB_CInOutV2( type, NOCONJG, COLUMN, *N, *N, 0, Ad0, 1,
442  ((char *) X), Xi, Xj, Xd, &Xroc, &XAC, XACd,
443  &XACfr, &XACsum, &XACapbX );
444 /*
445 * Create vector XAR in process rows spanned by sub( A )
446 */
447  PB_COutV( type, ROW, INIT, *N, *N, Ad0, 1, &XAR, XARd, &XARfr,
448  &XARsum );
449 /*
450 * Retrieve local quantities related to Ad0 -> sub( A )
451 */
452  Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ];
453  Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ];
454  Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_];
455  Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow );
456  Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol );
457  if( ( Anp > 0 ) && ( Anq > 0 ) )
458  Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size );
459
460  XACld = XACd[LLD_]; XARld = XARd[LLD_];
461
462  for( k = 0; k < *N; k += nb )
463  {
464  ktmp = *N - k; kb = MIN( ktmp, nb );
465 /*
466 * Solve logical diagonal block, XAC contains the solution scattered in multiple
467 * process columns and XAR contains the solution replicated in the process rows.
468 */
469  Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow );
470  Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol );
471  PB_Cptrsv( type, XARsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k,
472  Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0,
473  Akq, XARld, size ), XARld );
474 /*
475 * Update: only the part of sub( X ) to be solved at the next step is locally
476 * updated and combined, the remaining part of the vector to be solved later is
477 * only locally updated.
478 */
479  Akp = PB_Cnumroc( k+kb, 0, Aimb1, Amb, myrow, Arow, nprow );
480  if( ( Anp0 = Anp - Akp ) > 0 )
481  {
482  Anq0 = PB_Cnumroc( kb, k, Ainb1, Anb, mycol, Acol, npcol );
483  if( XACsum )
484  {
485  kbnext = ktmp - kb; kbnext = MIN( kbnext, nb );
486  ktmp = PB_Cnumroc( kbnext, k+kb, Aimb1, Amb, myrow, Arow,
487  nprow );
488  Anp0 -= ktmp;
489
490  if( ktmp > 0 )
491  {
492  if( Anq0 > 0 )
493  zgemv_( TRANS, &ktmp, &Anq0, negone,
494  Mptr( Aptr, Akp, Akq, Ald, size ), &Ald,
495  Mptr( XAR, 0, Akq, XARld, size ), &XARld, one,
496  Mptr( XAC, Akp, 0, XACld, size ), &ione );
497  Asrc = PB_Cindxg2p( k+kb, Ainb1, Anb, Acol, Acol, npcol );
498  Czgsum2d( ctxt, ROW, &ctop, ktmp, 1, Mptr( XAC, Akp,
499  0, XACld, size ), XACld, myrow, Asrc );
500  if( mycol != Asrc )
501  zset_( &ktmp, zero, Mptr( XAC, Akp, 0, XACld,
502  size ), &ione );
503  }
504  if( Anp0 > 0 && Anq0 > 0 )
505  zgemv_( TRANS, &Anp0, &Anq0, negone,
506  Mptr( Aptr, Akp+ktmp, Akq, Ald, size ), &Ald,
507  Mptr( XAR, 0, Akq, XARld, size ), &XARld,
508  one,
509  Mptr( XAC, Akp+ktmp, 0, XACld, size ), &ione );
510  }
511  else
512  {
513  if( Anq0 > 0 )
514  zgemv_( TRANS, &Anp0, &Anq0, negone,
515  Mptr( Aptr, Akp, Akq, Ald, size ), &Ald,
516  Mptr( XAR, 0, Akq, XARld, size ), &XARld,
517  one,
518  Mptr( XAC, Akp, 0, XACld, size ), &ione );
519  }
520  }
521  }
522 /*
523 * Combine the scattered resulting vector XAC
524 */
525  if( XACsum && ( Anp > 0 ) )
526  {
527  Czgsum2d( ctxt, ROW, &ctop, Anp, 1, XAC, XACld, myrow,
528  XACd[CSRC_] );
529  }
530 /*
531 * sub( X ) := XAC (if necessary)
532 */
533  if( XACapbX )
534  {
535  PB_Cpaxpby( type, NOCONJG, *N, 1, one, XAC, 0, 0, XACd,
536  COLUMN, zero, ((char *) X), Xi, Xj, Xd, &Xroc );
537  }
538 /*
539 * Restore BLACS topologies
540 */
541  (void) PB_Ctop( &ctxt, BCAST, COLUMN, &btop );
542  (void) PB_Ctop( &ctxt, COMBINE, ROW, &ctop );
543  }
544  }
545  else
546  {
547  if( upper )
548  {
549 /*
550 * Save current and enforce ring BLACS topologies
551 */
552  btop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
553  ctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
554  (void) PB_Ctop( &ctxt, BCAST, ROW, TOP_IRING );
555  (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_IRING );
556 /*
557 * Remove next line when BLACS combine operations support ring topologies.
558 */
559  (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT );
560 /*
561 * Reuse sub( X ) and/or create vector XAR in process row owning the first row
562 * of sub( A )
563 */
564  PB_CInOutV2( type, NOCONJG, ROW, *N, *N, 0, Ad0, 1,
565  ((char *) X), Xi, Xj, Xd, &Xroc, &XAR, XARd,
566  &XARfr, &XARsum, &XARapbX );
567 /*
568 * Create vector XAC in process columns spanned by sub( A )
569 */
570  PB_COutV( type, COLUMN, INIT, *N, *N, Ad0, 1, &XAC, XACd, &XACfr,
571  &XACsum );
572 /*
573 * Retrieve local quantities related to Ad0 -> sub( A )
574 */
575  Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ];
576  Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ];
577  Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_];
578  Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow );
579  Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol );
580  if( ( Anp > 0 ) && ( Anq > 0 ) )
581  Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size );
582
583  XACld = XACd[LLD_]; XARld = XARd[LLD_];
584
585  for( k = 0; k < *N; k += nb )
586  {
587  ktmp = *N - k; kb = MIN( ktmp, nb );
588 /*
589 * Solve logical diagonal block, XAR contains the solution scattered in multiple
590 * process rows and XAC contains the solution replicated in the process columns.
591 */
592  Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow );
593  Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol );
594  PB_Cptrsv( type, XACsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k,
595  Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0,
596  Akq, XARld, size ), XARld );
597 /*
598 * Update: only the part of sub( X ) to be solved at the next step is locally
599 * updated and combined, the remaining part of the vector to be solved later is
600 * only locally updated.
601 */
602  Akq = PB_Cnumroc( k+kb, 0, Ainb1, Anb, mycol, Acol, npcol );
603  if( ( Anq0 = Anq - Akq ) > 0 )
604  {
605  Anp0 = PB_Cnumroc( kb, k, Aimb1, Amb, myrow, Arow, nprow );
606  if( XARsum )
607  {
608  kbnext = ktmp - kb; kbnext = MIN( kbnext, nb );
609  ktmp = PB_Cnumroc( kbnext, k+kb, Ainb1, Anb, mycol, Acol,
610  npcol );
611  Anq0 -= ktmp;
612
613  if( ktmp > 0 )
614  {
615  if( Anp0 > 0 )
616  zgemv_( TRANS, &Anp0, &ktmp, negone,
617  Mptr( Aptr, Akp, Akq, Ald, size ), &Ald,
618  Mptr( XAC, Akp, 0, XACld, size ), &ione, one,
619  Mptr( XAR, 0, Akq, XARld, size ), &XARld );
620  Asrc = PB_Cindxg2p( k+kb, Aimb1, Amb, Arow, Arow, nprow );
621  Czgsum2d( ctxt, COLUMN, &ctop, 1, ktmp, Mptr( XAR, 0,
622  Akq, XARld, size ), XARld, Asrc, mycol );
623  if( myrow != Asrc )
624  zset_( &ktmp, zero, Mptr( XAR, 0, Akq, XARld,
625  size ), &XARld );
626  }
627  if( Anp0 > 0 && Anq0 > 0 )
628  zgemv_( TRANS, &Anp0, &Anq0, negone,
629  Mptr( Aptr, Akp, Akq+ktmp, Ald, size ), &Ald,
630  Mptr( XAC, Akp, 0, XACld, size ), &ione, one,
631  Mptr( XAR, 0, Akq+ktmp, XARld, size ), &XARld );
632  }
633  else
634  {
635  if( Anp0 > 0 )
636  zgemv_( TRANS, &Anp0, &Anq0, negone,
637  Mptr( Aptr, Akp, Akq, Ald, size ), &Ald,
638  Mptr( XAC, Akp, 0, XACld, size ), &ione, one,
639  Mptr( XAR, 0, Akq, XARld, size ), &XARld );
640  }
641  }
642  }
643 /*
644 * Combine the scattered resulting vector XAR
645 */
646  if( XARsum && ( Anq > 0 ) )
647  {
648  Czgsum2d( ctxt, COLUMN, &ctop, 1, Anq, XAR, XARld, XARd[RSRC_],
649  mycol );
650  }
651 /*
652 * sub( X ) := XAR (if necessary)
653 */
654  if( XARapbX )
655  {
656  PB_Cpaxpby( type, NOCONJG, 1, *N, one, XAR, 0, 0, XARd, ROW, zero,
657  ((char *) X), Xi, Xj, Xd, &Xroc );
658  }
659 /*
660 * Restore BLACS topologies
661 */
662  (void) PB_Ctop( &ctxt, BCAST, ROW, &btop );
663  (void) PB_Ctop( &ctxt, COMBINE, COLUMN, &ctop );
664  }
665  else
666  {
667 /*
668 * Save current and enforce ring BLACS topologies
669 */
670  btop = *PB_Ctop( &ctxt, BCAST, ROW, TOP_GET );
671  ctop = *PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_GET );
672  (void) PB_Ctop( &ctxt, BCAST, ROW, TOP_DRING );
673  (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DRING );
674 /*
675 * Remove next line when BLACS combine operations support ring topologies.
676 */
677  (void) PB_Ctop( &ctxt, COMBINE, COLUMN, TOP_DEFAULT );
678 /*
679 * Reuse sub( X ) and/or create vector XAC in process row owning the last row
680 * of sub( A )
681 */
682  PB_CInOutV2( type, NOCONJG, ROW, *N, *N, *N-1, Ad0, 1,
683  ((char *) X), Xi, Xj, Xd, &Xroc, &XAR, XARd,
684  &XARfr, &XARsum, &XARapbX );
685 /*
686 * Create vector XAC in process columns spanned by sub( A )
687 */
688  PB_COutV( type, COLUMN, INIT, *N, *N, Ad0, 1, &XAC, XACd, &XACfr,
689  &XACsum );
690 /*
691 * Retrieve local quantities related to Ad0 -> sub( A )
692 */
693  Aimb1 = Ad0[IMB_ ]; Ainb1 = Ad0[INB_ ];
694  Amb = Ad0[MB_ ]; Anb = Ad0[NB_ ];
695  Arow = Ad0[RSRC_]; Acol = Ad0[CSRC_]; Ald = Ad0[LLD_];
696  Anp = PB_Cnumroc( *N, 0, Aimb1, Amb, myrow, Arow, nprow );
697  Anq = PB_Cnumroc( *N, 0, Ainb1, Anb, mycol, Acol, npcol );
698  if( ( Anp > 0 ) && ( Anq > 0 ) )
699  Aptr = Mptr( ((char *) A), Aii, Ajj, Ald, size );
700
701  XACld = XACd[LLD_]; XARld = XARd[LLD_];
702
703  for( k = ( ( *N - 1 ) / nb ) * nb; k >= 0; k -= nb )
704  {
705  ktmp = *N - k; kb = MIN( ktmp, nb );
706 /*
707 * Solve logical diagonal block, XAR contains the solution scattered in multiple
708 * process rows and XAC contains the solution replicated in the process columns.
709 */
710  Akp = PB_Cnumroc( k, 0, Aimb1, Amb, myrow, Arow, nprow );
711  Akq = PB_Cnumroc( k, 0, Ainb1, Anb, mycol, Acol, npcol );
712  PB_Cptrsv( type, XACsum, &UploA, &TranOp, &DiagA, kb, Aptr, k, k,
713  Ad0, Mptr( XAC, Akp, 0, XACld, size ), 1, Mptr( XAR, 0,
714  Akq, XARld, size ), XARld );
715 /*
716 * Update: only the part of sub( X ) to be solved at the next step is locally
717 * updated and combined, the remaining part of the vector to be solved later
718 * is only locally updated.
719 */
720  if( Akq > 0 )
721  {
722  Anp0 = PB_Cnumroc( kb, k, Aimb1, Amb, myrow, Arow, nprow );
723  if( XARsum )
724  {
725  kbprev = MIN( k, nb );
726  ktmp = PB_Cnumroc( kbprev, k-kbprev, Ainb1, Anb, mycol,
727  Acol, npcol );
728  Akq -= ktmp;
729
730  if( ktmp > 0 )
731  {
732  if( Anp0 > 0 )
733  zgemv_( TRANS, &Anp0, &ktmp, negone,
734  Mptr( Aptr, Akp, Akq, Ald, size ), &Ald,
735  Mptr( XAC, Akp, 0, XACld, size ), &ione, one,
736  Mptr( XAR, 0, Akq, XARld, size ), &XARld );
737  Asrc = PB_Cindxg2p( k-1, Aimb1, Amb, Arow, Arow, nprow );
738  Czgsum2d( ctxt, COLUMN, &ctop, 1, ktmp, Mptr( XAR, 0,
739  Akq, XARld, size ), XARld, Asrc, mycol );
740  if( myrow != Asrc )
741  zset_( &ktmp, zero, Mptr( XAR, 0, Akq, XARld,
742  size ), &XARld );
743  }
744  if( Anp0 > 0 && Akq > 0 )
745  zgemv_( TRANS, &Anp0, &Akq, negone,
746  Mptr( Aptr, Akp, 0, Ald, size ), &Ald,
747  Mptr( XAC, Akp, 0, XACld, size ), &ione,
748  one, XAR, &XARld );
749  }
750  else
751  {
752  if( Anp0 > 0 )
753  zgemv_( TRANS, &Anp0, &Akq, negone,
754  Mptr( Aptr, Akp, 0, Ald, size ), &Ald,
755  Mptr( XAC, Akp, 0, XACld, size ), &ione,
756  one, XAR, &XARld );
757  }
758  }
759  }
760 /*
761 * Combine the scattered resulting vector XAR
762 */
763  if( XARsum && ( Anq > 0 ) )
764  {
765  Czgsum2d( ctxt, COLUMN, &ctop, 1, Anq, XAR, XARld, XARd[RSRC_],
766  mycol );
767  }
768 /*
769 * sub( X ) := XAR (if necessary)
770 */
771  if( XARapbX )
772  {
773  PB_Cpaxpby( type, NOCONJG, 1, *N, one, XAR, 0, 0, XARd, ROW, zero,
774  ((char *) X), Xi, Xj, Xd, &Xroc );
775  }
776 /*
777 * Restore BLACS topologies
778 */
779  (void) PB_Ctop( &ctxt, BCAST, ROW, &btop );
780  (void) PB_Ctop( &ctxt, COMBINE, COLUMN, &ctop );
781  }
782  }
783  if( XACfr ) free( XAC );
784  if( XARfr ) free( XAR );
785 /*
786 * End of PZTRSV
787 */
788 }
M_
#define M_
Definition: PBtools.h:39
ROW
#define ROW
Definition: PBblacs.h:46
MB_
#define MB_
Definition: PBtools.h:43
TOP_DEFAULT
#define TOP_DEFAULT
Definition: PBblacs.h:51
PB_Cpaxpby
void PB_Cpaxpby()
PB_Cwarn
void PB_Cwarn()
NB_
#define NB_
Definition: PBtools.h:44
COLUMN
#define COLUMN
Definition: PBblacs.h:45
CSRC_
#define CSRC_
Definition: PBtools.h:46
PBblacs.h
TOP_DRING
#define TOP_DRING
Definition: PBblacs.h:53
PBtools.h
PBblas.h
CCOTRAN
#define CCOTRAN
Definition: PBblas.h:22
NOCONJG
#define NOCONJG
Definition: PBblas.h:45
PBTYP_T::type
char type
Definition: pblas.h:327
PBpblas.h
PB_Cztypeset
PBTYP_T * PB_Cztypeset()
Definition: PB_Cztypeset.c:19
DLEN_
#define DLEN_
Definition: PBtools.h:48
CUNIT
#define CUNIT
Definition: PBblas.h:32
LLD_
#define LLD_
Definition: PBtools.h:47
PB_Cdescribe
void PB_Cdescribe()
PB_Cptrsv
void PB_Cptrsv()
F_CHAR_T
char * F_CHAR_T
Definition: pblas.h:118
CROW
#define CROW
Definition: PBblacs.h:21
PB_Cchkvec
void PB_Cchkvec()
IMB_
#define IMB_
Definition: PBtools.h:41
TOP_IRING
#define TOP_IRING
Definition: PBblacs.h:52
pilaenv_
int pilaenv_()
INIT
#define INIT
Definition: PBblas.h:61
PB_Cabort
void PB_Cabort()
CLOWER
#define CLOWER
Definition: PBblas.h:25
PB_Cindxg2p
int PB_Cindxg2p()
F2C_CHAR
#define F2C_CHAR(a)
Definition: pblas.h:120
PBTYP_T::negone
char * negone
Definition: pblas.h:331
TOP_GET
#define TOP_GET
Definition: PBblacs.h:50
PB_Ctop
char * PB_Ctop()
RSRC_
#define RSRC_
Definition: PBtools.h:45
CNOTRAN
#define CNOTRAN
Definition: PBblas.h:18
PBTYP_T::one
char * one
Definition: pblas.h:331
PB_CargFtoC
void PB_CargFtoC()
BCAST
#define BCAST
Definition: PBblacs.h:48
COMBINE
#define COMBINE
Definition: PBblacs.h:49
PBTYP_T::size
int size
Definition: pblas.h:329
PB_Cchkmat
void PB_Cchkmat()
PB_Cnumroc
int PB_Cnumroc()
zset_
F_VOID_FCT zset_()
Czgsum2d
void Czgsum2d()
MIN
#define MIN(a_, b_)
Definition: PBtools.h:76
CCOLUMN
#define CCOLUMN
Definition: PBblacs.h:20
INB_
#define INB_
Definition: PBtools.h:42
PB_COutV
void PB_COutV()
C2F_CHAR
#define C2F_CHAR(a)
Definition: pblas.h:121
PB_CInOutV2
void PB_CInOutV2()
Cblacs_gridinfo
void Cblacs_gridinfo()
PBTYP_T
Definition: pblas.h:325
Mupcase
#define Mupcase(C)
Definition: PBtools.h:83
zgemv_
F_VOID_FCT zgemv_()
pblas.h
CUPPER
#define CUPPER
Definition: PBblas.h:26
CTRAN
#define CTRAN
Definition: PBblas.h:20
Mptr
#define Mptr(a_, i_, j_, lda_, siz_)
Definition: PBtools.h:132
pztrsv_
void pztrsv_(F_CHAR_T UPLO, F_CHAR_T TRANS, F_CHAR_T DIAG, int *N, double *A, int *IA, int *JA, int *DESCA, double *X, int *IX, int *JX, int *DESCX, int *INCX)
Definition: pztrsv_.c:25
CTXT_
#define CTXT_
Definition: PBtools.h:38
PBTYP_T::zero
char * zero
Definition: pblas.h:331
CNOUNIT
#define CNOUNIT
Definition: PBblas.h:33
PB_Clcm
int PB_Clcm()