ScaLAPACK 2.1: Scalable Linear Algebra PACKage
pdptsv.f
1  SUBROUTINE pdptsv( N, NRHS, D, E, JA, DESCA, B, IB, DESCB, WORK,
2  $ LWORK, INFO )
3 *
4 *
5 *
6 * -- ScaLAPACK routine (version 1.7) --
7 * University of Tennessee, Knoxville, Oak Ridge National Laboratory,
8 * and University of California, Berkeley.
9 * November 15, 1997
10 *
11 * .. Scalar Arguments ..
12  INTEGER IB, INFO, JA, LWORK, N, NRHS
13 * ..
14 * .. Array Arguments ..
15  INTEGER DESCA( * ), DESCB( * )
16  DOUBLE PRECISION B( * ), D( * ), E( * ), WORK( * )
17 * ..
18 *
19 *
20 * Purpose
21 * =======
22 *
23 * PDPTSV solves a system of linear equations
24 *
25 * A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS)
26 *
27 * where A(1:N, JA:JA+N-1) is an N-by-N real
28 * tridiagonal symmetric positive definite distributed
29 * matrix.
30 *
31 * Cholesky factorization is used to factor a reordering of
32 * the matrix into L L'.
33 *
34 * See PDPTTRF and PDPTTRS for details.
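*
* A minimal sketch of the call itself (all arguments are described in
* the Arguments section below):
*
*    CALL PDPTSV( N, NRHS, D, E, JA, DESCA, B, IB, DESCB, WORK,
*   $             LWORK, INFO )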
35 *
36 * =====================================================================
37 *
38 * Arguments
39 * =========
40 *
41 *
42 * N (global input) INTEGER
43 * The number of rows and columns to be operated on, i.e. the
44 * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
45 *
46 * NRHS (global input) INTEGER
47 * The number of right hand sides, i.e., the number of columns
48 * of the distributed submatrix B(IB:IB+N-1, 1:NRHS).
49 * NRHS >= 0.
50 *
51 * D (local input/local output) DOUBLE PRECISION pointer to local
52 * part of global vector storing the main diagonal of the
53 * matrix.
 54 * On exit, this array contains information on the
55 * factors of the matrix.
56 * Must be of size >= DESCA( NB_ ).
57 *
58 * E (local input/local output) DOUBLE PRECISION pointer to local
59 * part of global vector storing the upper diagonal of the
 60 * matrix. Globally, E(n) is not referenced, and E must be
 61 * aligned with D.
 62 * On exit, this array contains information on the
63 * factors of the matrix.
64 * Must be of size >= DESCA( NB_ ).
65 *
66 * JA (global input) INTEGER
67 * The index in the global array A that points to the start of
68 * the matrix to be operated on (which may be either all of A
69 * or a submatrix of A).
70 *
71 * DESCA (global and local input) INTEGER array of dimension DLEN.
72 * if 1D type (DTYPE_A=501 or 502), DLEN >= 7;
73 * if 2D type (DTYPE_A=1), DLEN >= 9.
74 * The array descriptor for the distributed matrix A.
 75 * Contains information on the mapping of A to memory. Please
76 * see NOTES below for full description and options.
77 *
78 * B (local input/local output) DOUBLE PRECISION pointer into
79 * local memory to an array of local lead dimension lld_b>=NB.
80 * On entry, this array contains the
 81 * local pieces of the right hand sides
 82 * B(IB:IB+N-1, 1:NRHS).
 83 * On exit, this contains the local piece of the solution
84 * distributed matrix X.
85 *
86 * IB (global input) INTEGER
87 * The row index in the global array B that points to the first
88 * row of the matrix to be operated on (which may be either
89 * all of B or a submatrix of B).
90 *
91 * DESCB (global and local input) INTEGER array of dimension DLEN.
92 * if 1D type (DTYPE_B=502), DLEN >=7;
93 * if 2D type (DTYPE_B=1), DLEN >= 9.
94 * The array descriptor for the distributed matrix B.
 95 * Contains information on the mapping of B to memory. Please
96 * see NOTES below for full description and options.
97 *
98 * WORK (local workspace/local output)
99 * DOUBLE PRECISION temporary workspace. This space may
100 * be overwritten in between calls to routines. WORK must be
101 * the size given in LWORK.
102 * On exit, WORK( 1 ) contains the minimal LWORK.
103 *
104 * LWORK (local input or global input) INTEGER
105 * Size of user-input workspace WORK.
106 * If LWORK is too small, the minimal acceptable size will be
107 * returned in WORK(1) and an error code is returned. LWORK>=
108 * (12*NPCOL + 3*NB)
109 * +max((10+2*min(100,NRHS))*NPCOL+4*NRHS, 8*NPCOL)
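*
* A minimal sketch, in the calling program, of evaluating this
* bound (MINLW is a local INTEGER introduced here only for
* illustration; NPCOL and NB are obtained from BLACS_GRIDINFO and
* the descriptor, as in the code below):
*
*    MINLW = ( 12*NPCOL + 3*NB ) +
*   $        MAX( ( 10 + 2*MIN( 100, NRHS ) )*NPCOL + 4*NRHS,
*   $             8*NPCOL )
*    LWORK = MINLW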
110 *
111 * INFO (local output) INTEGER
112 * = 0: successful exit
 113 * < 0: If the i-th argument is an array and the j-th entry had
114 * an illegal value, then INFO = -(i*100+j), if the i-th
115 * argument is a scalar and had an illegal value, then
116 * INFO = -i.
117 * > 0: If INFO = K<=NPROCS, the submatrix stored on processor
118 * INFO and factored locally was not
119 * positive definite, and
120 * the factorization was not completed.
121 * If INFO = K>NPROCS, the submatrix stored on processor
122 * INFO-NPROCS representing interactions with other
123 * processors was not
124 * positive definite,
125 * and the factorization was not completed.
126 *
127 * =====================================================================
128 *
129 *
130 * Restrictions
131 * ============
132 *
133 * The following are restrictions on the input parameters. Some of these
134 * are temporary and will be removed in future releases, while others
135 * may reflect fundamental technical limitations.
136 *
137 * Non-cyclic restriction: VERY IMPORTANT!
138 * P*NB>= mod(JA-1,NB)+N.
139 * The mapping for matrices must be blocked, reflecting the nature
140 * of the divide and conquer algorithm as a task-parallel algorithm.
141 * This formula in words is: no processor may have more than one
142 * chunk of the matrix.
143 *
144 * Blocksize cannot be too small:
145 * If the matrix spans more than one processor, the following
146 * restriction on NB, the size of each block on each processor,
147 * must hold:
148 * NB >= 2
149 * The bulk of parallel computation is done on the matrix of size
150 * O(NB) on each processor. If this is too small, divide and conquer
151 * is a poor choice of algorithm.
152 *
153 * Submatrix reference:
154 * JA = IB
155 * Alignment restriction that prevents unnecessary communication.
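*
* A minimal sketch, in the calling program, of checking these
* restrictions before the call (P and the LOGICAL OK are introduced
* here only for illustration; the grid is one-dimensional, so
* P = NPROW*NPCOL):
*
*    CALL BLACS_GRIDINFO( ICTXT, NPROW, NPCOL, MYROW, MYCOL )
*    P  = NPROW*NPCOL
*    OK = ( P*NB .GE. MOD( JA-1, NB ) + N ) .AND.
*   $     ( P.EQ.1 .OR. NB.GE.2 ) .AND. ( JA.EQ.IB )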
156 *
157 *
158 * =====================================================================
159 *
160 *
161 * Notes
162 * =====
163 *
164 * If the factorization routine and the solve routine are to be called
 165 * separately (to solve various sets of right-hand sides using the same
166 * coefficient matrix), the auxiliary space AF *must not be altered*
167 * between calls to the factorization routine and the solve routine.
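*
* For example, a minimal sketch of such a split (NRHS1, B1, NRHS2 and
* B2 denote two hypothetical sets of right-hand sides aligned with the
* same DESCB; AF and LAF are as described in PDPTTRF and PDPTTRS):
*
*    CALL PDPTTRF( N, D, E, JA, DESCA, AF, LAF, WORK, LWORK, INFO )
*    CALL PDPTTRS( N, NRHS1, D, E, JA, DESCA, B1, IB, DESCB, AF, LAF,
*   $              WORK, LWORK, INFO )
*    CALL PDPTTRS( N, NRHS2, D, E, JA, DESCA, B2, IB, DESCB, AF, LAF,
*   $              WORK, LWORK, INFO )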
168 *
169 * The best algorithm for solving banded and tridiagonal linear systems
170 * depends on a variety of parameters, especially the bandwidth.
171 * Currently, only algorithms designed for the case N/P >> bw are
172 * implemented. These go by many names, including Divide and Conquer,
173 * Partitioning, domain decomposition-type, etc.
174 * For tridiagonal matrices, it is obvious: N/P >> bw(=1), and so D&C
175 * algorithms are the appropriate choice.
176 *
177 * Algorithm description: Divide and Conquer
178 *
 179 * The Divide and Conquer algorithm assumes the matrix is narrowly
180 * banded compared with the number of equations. In this situation,
181 * it is best to distribute the input matrix A one-dimensionally,
182 * with columns atomic and rows divided amongst the processes.
183 * The basic algorithm divides the tridiagonal matrix up into
184 * P pieces with one stored on each processor,
185 * and then proceeds in 2 phases for the factorization or 3 for the
186 * solution of a linear system.
187 * 1) Local Phase:
188 * The individual pieces are factored independently and in
189 * parallel. These factors are applied to the matrix creating
 190 * fill-in, which is stored in a non-inspectable way in auxiliary
191 * space AF. Mathematically, this is equivalent to reordering
192 * the matrix A as P A P^T and then factoring the principal
193 * leading submatrix of size equal to the sum of the sizes of
194 * the matrices factored on each processor. The factors of
195 * these submatrices overwrite the corresponding parts of A
196 * in memory.
197 * 2) Reduced System Phase:
 198 * A small (P-1)-by-(P-1) system is formed representing
199 * interaction of the larger blocks, and is stored (as are its
200 * factors) in the space AF. A parallel Block Cyclic Reduction
201 * algorithm is used. For a linear system, a parallel front solve
 202 * followed by an analogous backsolve, both using the structure
 203 * of the factored matrix, are performed.
 204 * 3) Backsubstitution Phase:
205 * For a linear system, a local backsubstitution is performed on
206 * each processor in parallel.
207 *
208 *
209 * Descriptors
210 * ===========
211 *
212 * Descriptors now have *types* and differ from ScaLAPACK 1.0.
213 *
214 * Note: tridiagonal codes can use either the old two dimensional
215 * or new one-dimensional descriptors, though the processor grid in
216 * both cases *must be one-dimensional*. We describe both types below.
217 *
218 * Each global data object is described by an associated description
219 * vector. This vector stores the information required to establish
220 * the mapping between an object element and its corresponding process
221 * and memory location.
222 *
 223 * Let A be a generic term for any 2D block cyclically distributed array.
224 * Such a global array has an associated description vector DESCA.
225 * In the following comments, the character _ should be read as
226 * "of the global array".
227 *
228 * NOTATION STORED IN EXPLANATION
229 * --------------- -------------- --------------------------------------
 230 * DTYPE_A(global) DESCA( DTYPE_ ) The descriptor type. In this case,
231 * DTYPE_A = 1.
232 * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
233 * the BLACS process grid A is distribu-
234 * ted over. The context itself is glo-
235 * bal, but the handle (the integer
236 * value) may vary.
237 * M_A (global) DESCA( M_ ) The number of rows in the global
238 * array A.
239 * N_A (global) DESCA( N_ ) The number of columns in the global
240 * array A.
241 * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute
242 * the rows of the array.
243 * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute
244 * the columns of the array.
245 * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
246 * row of the array A is distributed.
247 * CSRC_A (global) DESCA( CSRC_ ) The process column over which the
248 * first column of the array A is
249 * distributed.
250 * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local
251 * array. LLD_A >= MAX(1,LOCr(M_A)).
252 *
253 * Let K be the number of rows or columns of a distributed matrix,
254 * and assume that its process grid has dimension p x q.
255 * LOCr( K ) denotes the number of elements of K that a process
256 * would receive if K were distributed over the p processes of its
257 * process column.
258 * Similarly, LOCc( K ) denotes the number of elements of K that a
259 * process would receive if K were distributed over the q processes of
260 * its process row.
261 * The values of LOCr() and LOCc() may be determined via a call to the
262 * ScaLAPACK tool function, NUMROC:
263 * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
264 * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
265 * An upper bound for these quantities may be computed by:
266 * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
267 * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
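*
* A minimal sketch of the corresponding calls in the calling program,
* shown for the two-dimensional DTYPE_A = 1 descriptor (LOCM and LOCN
* are local INTEGERs introduced here only for illustration; NUMROC is
* the ScaLAPACK tool function named above):
*
*    INTEGER            LOCM, LOCN, NUMROC
*    EXTERNAL           NUMROC
*    LOCM = NUMROC( M, DESCA( MB_ ), MYROW, DESCA( RSRC_ ), NPROW )
*    LOCN = NUMROC( N, DESCA( NB_ ), MYCOL, DESCA( CSRC_ ), NPCOL )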
268 *
269 *
270 * One-dimensional descriptors:
271 *
272 * One-dimensional descriptors are a new addition to ScaLAPACK since
273 * version 1.0. They simplify and shorten the descriptor for 1D
274 * arrays.
275 *
276 * Since ScaLAPACK supports two-dimensional arrays as the fundamental
277 * object, we allow 1D arrays to be distributed either over the
278 * first dimension of the array (as if the grid were P-by-1) or the
279 * 2nd dimension (as if the grid were 1-by-P). This choice is
280 * indicated by the descriptor type (501 or 502)
281 * as described below.
282 * However, for tridiagonal matrices, since the objects being
283 * distributed are the individual vectors storing the diagonals, we
284 * have adopted the convention that both the P-by-1 descriptor and
285 * the 1-by-P descriptor are allowed and are equivalent for
286 * tridiagonal matrices. Thus, for tridiagonal matrices,
287 * DTYPE_A = 501 or 502 can be used interchangeably
288 * without any other change.
289 * We require that the distributed vectors storing the diagonals of a
290 * tridiagonal matrix be aligned with each other. Because of this, a
291 * single descriptor, DESCA, serves to describe the distribution of
 292 * all diagonals simultaneously.
293 *
294 * IMPORTANT NOTE: the actual BLACS grid represented by the
295 * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P
296 * irrespective of which one-dimensional descriptor type
297 * (501 or 502) is input.
298 * This routine will interpret the grid properly either way.
299 * ScaLAPACK routines *do not support intercontext operations* so that
300 * the grid passed to a single ScaLAPACK routine *must be the same*
301 * for all array descriptors passed to that routine.
302 *
303 * NOTE: In all cases where 1D descriptors are used, 2D descriptors
304 * may also be used, since a one-dimensional array is a special case
305 * of a two-dimensional array with one dimension of size unity.
306 * The two-dimensional array used in this case *must* be of the
307 * proper orientation:
308 * If the appropriate one-dimensional descriptor is DTYPEA=501
309 * (1 by P type), then the two dimensional descriptor must
310 * have a CTXT value that refers to a 1 by P BLACS grid;
311 * If the appropriate one-dimensional descriptor is DTYPEA=502
312 * (P by 1 type), then the two dimensional descriptor must
313 * have a CTXT value that refers to a P by 1 BLACS grid.
314 *
315 *
316 * Summary of allowed descriptors, types, and BLACS grids:
 317 * DTYPE          501         502          1          1
 318 * BLACS grid     1xP or Px1  1xP or Px1   1xP        Px1
 319 * -----------------------------------------------------
 320 * A              OK          OK           OK         NO
 321 * B              NO          OK           NO         OK
322 *
323 * Note that a consequence of this chart is that it is not possible
324 * for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead
325 * to opposite requirements for the orientation of the BLACS grid,
326 * and as noted before, the *same* BLACS context must be used in
327 * all descriptors in a single ScaLAPACK subroutine call.
328 *
 329 * Let A be a generic term for any 1D block cyclically distributed array.
330 * Such a global array has an associated description vector DESCA.
331 * In the following comments, the character _ should be read as
332 * "of the global array".
333 *
334 * NOTATION STORED IN EXPLANATION
335 * --------------- ---------- ------------------------------------------
336 * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
337 * TYPE_A = 501: 1-by-P grid.
338 * TYPE_A = 502: P-by-1 grid.
339 * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
340 * the BLACS process grid A is distribu-
341 * ted over. The context itself is glo-
342 * bal, but the handle (the integer
343 * value) may vary.
344 * N_A (global) DESCA( 3 ) The size of the array dimension being
345 * distributed.
346 * NB_A (global) DESCA( 4 ) The blocking factor used to distribute
347 * the distributed dimension of the array.
348 * SRC_A (global) DESCA( 5 ) The process row or column over which the
349 * first row or column of the array
350 * is distributed.
351 * Ignored DESCA( 6 ) Ignored for tridiagonal matrices.
352 * Reserved DESCA( 7 ) Reserved for future use.
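*
* A minimal sketch of filling a type-501 descriptor for the
* tridiagonal matrix according to the table above (ICTXT is the BLACS
* context of a one-dimensional grid; the first process is assumed to
* hold the first block, and the ignored/reserved entries are simply
* zeroed here):
*
*    DESCA( 1 ) = 501
*    DESCA( 2 ) = ICTXT
*    DESCA( 3 ) = N
*    DESCA( 4 ) = NB
*    DESCA( 5 ) = 0
*    DESCA( 6 ) = 0
*    DESCA( 7 ) = 0
*
* DESCB is filled analogously with DTYPE_B = 502 (see the summary of
* allowed descriptors above and the description of DESCB in PDPTTRS).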
353 *
354 *
355 *
356 * =====================================================================
357 *
358 * Code Developer: Andrew J. Cleary, University of Tennessee.
359 * Current address: Lawrence Livermore National Labs.
360 * This version released: August, 2001.
361 *
362 * =====================================================================
363 *
364 * ..
365 * .. Parameters ..
366  DOUBLE PRECISION ONE, ZERO
367  parameter( one = 1.0d+0 )
368  parameter( zero = 0.0d+0 )
369  INTEGER INT_ONE
370  parameter( int_one = 1 )
371  INTEGER DESCMULT, BIGNUM
372  parameter(descmult = 100, bignum = descmult * descmult)
373  INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
374  $ lld_, mb_, m_, nb_, n_, rsrc_
375  parameter( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,
376  $ ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,
377  $ rsrc_ = 7, csrc_ = 8, lld_ = 9 )
378 * ..
379 * .. Local Scalars ..
380  INTEGER ICTXT, MYCOL, MYROW, NB, NPCOL, NPROW,
381  $ ws_factor
382 * ..
383 * .. External Subroutines ..
 384  EXTERNAL blacs_gridinfo, pdpttrf, pdpttrs, pxerbla
385 * ..
386 * .. Executable Statements ..
387 *
388 * Note: to avoid duplication, most error checking is not performed
389 * in this routine and is left to routines
390 * PDPTTRF and PDPTTRS.
391 *
392 * Begin main code
393 *
394  info = 0
395 *
396 * Get block size to calculate workspace requirements
397 *
398  IF( desca( dtype_ ) .EQ. block_cyclic_2d ) THEN
399  nb = desca( nb_ )
400  ictxt = desca( ctxt_ )
401  ELSEIF( desca( dtype_ ) .EQ. 501 ) THEN
402  nb = desca( 4 )
403  ictxt = desca( 2 )
404  ELSEIF( desca( dtype_ ) .EQ. 502 ) THEN
405  nb = desca( 4 )
406  ictxt = desca( 2 )
407  ELSE
 408  info = -( 6*100 + dtype_ )
409  CALL pxerbla( ictxt,
410  $ 'PDPTSV',
411  $ -info )
412  RETURN
413  ENDIF
414 *
415  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
416 *
417 *
418 * Size needed for AF in factorization
419 *
420  ws_factor = (12*npcol + 3*nb)
421 *
422 * Factor the matrix
423 *
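 * The leading WS_FACTOR entries of WORK are used as the auxiliary
 * fill-in space AF; the remainder of WORK is passed on as scratch
 * workspace.
 *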
424  CALL pdpttrf( n, d, e, ja, desca, work, min( lwork, ws_factor ),
425  $ work( 1+ws_factor ), lwork-ws_factor, info )
426 *
427 * Check info for error conditions
428 *
429  IF( info.NE.0 ) THEN
430  IF( info .LT. 0 ) THEN
431  CALL pxerbla( ictxt, 'PDPTSV', -info )
432  ENDIF
433  RETURN
434  END IF
435 *
436 * Solve the system using the factorization
437 *
438  CALL pdpttrs( n, nrhs, d, e, ja, desca, b, ib, descb, work,
439  $ min( lwork, ws_factor ), work( 1+ws_factor),
440  $ lwork-ws_factor, info )
441 *
442 * Check info for error conditions
443 *
444  IF( info.NE.0 ) THEN
445  CALL pxerbla( ictxt, 'PDPTSV', -info )
446  RETURN
447  END IF
448 *
449  RETURN
450 *
451 * End of PDPTSV
452 *
453  END