ScaLAPACK 2.1: Scalable Linear Algebra PACKage
pdpbmv1.f
1  SUBROUTINE pdpbdcmv( LDBW, BW, UPLO, N, A, JA, DESCA, NRHS, B, IB,
2  $ DESCB, X, WORK, LWORK, INFO )
3 *
4 *
5 *
6 * -- ScaLAPACK routine (version 1.7) --
7 * University of Tennessee, Knoxville, Oak Ridge National Laboratory,
8 * and University of California, Berkeley.
9 * November 15, 1997
10 *
11 * .. Scalar Arguments ..
12  CHARACTER UPLO
13  INTEGER BW, IB, INFO, JA, LDBW, LWORK, N, NRHS
14 * ..
15 * .. Array Arguments ..
16  INTEGER DESCA( * ), DESCB( * )
17  DOUBLE PRECISION A( * ), B( * ), WORK( * ), X( * )
18 * ..
19 *
20 *
21 * Purpose
22 * =======
23 *
24 * PDPBDCMV computes the product of the distributed real symmetric
* banded matrix A(1:N, JA:JA+N-1), of bandwidth BW, with a distributed
* vector held in B, storing the result in X.
*
25 * =====================================================================
26 *
27 * Arguments
28 * =========
29 *
30 * UPLO (global input) CHARACTER
31 * = 'U': Upper triangle of A(1:N, JA:JA+N-1) is stored;
32 * = 'L': Lower triangle of A(1:N, JA:JA+N-1) is stored.
33 *
34 * N (global input) INTEGER
35 * The number of rows and columns to be operated on, i.e. the
36 * order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
37 *
38 * BW (global input) INTEGER
39 * Number of subdiagonals of the symmetric banded matrix A.
* 0 <= BW <= N-1
40 *
41 * A (local input/local output) DOUBLE PRECISION pointer into
42 * local memory to an array with first dimension
43 * LLD_A >= (bw+1) (stored in DESCA).
44 * On entry, this array contains the local pieces of the
* distributed matrix A(1:N, JA:JA+N-1).
45 * This local portion is stored in the packed banded format
46 * used in LAPACK. Please see the Notes below and the
47 * ScaLAPACK manual for more detail on the format of
48 * distributed matrices.
49 *
50 * JA (global input) INTEGER
51 * The index in the global array A that points to the start of
52 * the matrix to be operated on (which may be either all of A
53 * or a submatrix of A).
54 *
55 * DESCA (global and local input) INTEGER array of dimension DLEN.
56 * if 1D type (DTYPE_A=501), DLEN >= 7;
57 * if 2D type (DTYPE_A=1), DLEN >= 9 .
58 * The array descriptor for the distributed matrix A.
59 * Contains information of mapping of A to memory. Please
60 * see NOTES below for full description and options.
61 *
62 * AF (local output) DOUBLE PRECISION array, dimension LAF.
63 * Auxiliary Fillin Space.
64 * Fillin is created during the factorization routine
65 * PDPBTRF and this is stored in AF. If a linear system
66 * is to be solved using PDPBTRS after the factorization
67 * routine, AF *must not be altered* after the factorization.
68 *
69 * LAF (local input) INTEGER
70 * Size of user-input Auxiliary Fillin space AF. Must be >=
71 * (NB+2*bw)*bw
72 * If LAF is not large enough, an error code will be returned
73 * and the minimum acceptable size will be returned in AF( 1 )
74 *
75 * WORK (local workspace/local output)
76 * DOUBLE PRECISION temporary workspace. This space may
77 * be overwritten in between calls to routines. WORK must be
78 * the size given in LWORK.
79 * On exit, WORK( 1 ) contains the minimal LWORK.
80 *
81 * LWORK (local input or global input) INTEGER
82 * Size of user-input workspace WORK.
83 * If LWORK is too small, the minimal acceptable size will be
84 * returned in WORK(1) and an error code is returned. LWORK>=
85 *
86 * INFO (global output) INTEGER
87 * = 0: successful exit
88 * < 0: If the i-th argument is an array and the j-th entry had
89 * an illegal value, then INFO = -(i*100+j); if the i-th
90 * argument is a scalar and had an illegal value, then
91 * INFO = -i.
92 *
93 * =====================================================================
94 *
95 *
96 * Restrictions
97 * ============
98 *
99 * The following are restrictions on the input parameters. Some of these
100 * are temporary and will be removed in future releases, while others
101 * may reflect fundamental technical limitations.
102 *
103 * Non-cyclic restriction: VERY IMPORTANT!
104 * P*NB>= mod(JA-1,NB)+N.
105 * The mapping for matrices must be blocked, reflecting the nature
106 * of the divide and conquer algorithm as a task-parallel algorithm.
107 * This formula in words is: no processor may have more than one
108 * chunk of the matrix.
109 *
110 * Blocksize cannot be too small:
111 * If the matrix spans more than one processor, the following
112 * restriction on NB, the size of each block on each processor,
113 * must hold:
114 * NB >= 2*BW
115 * The bulk of parallel computation is done on the matrix of size
116 * O(NB) on each processor. If this is too small, divide and conquer
117 * is a poor choice of algorithm.
118 *
119 * Submatrix reference:
120 * JA = IB
121 * Alignment restriction that prevents unnecessary communication.
122 *
123 *
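*   The three restrictions above can be restated as checks of the
*   following form. This is an illustrative sketch only: OK is a
*   hypothetical LOGICAL variable, and P stands for the number of
*   processes over which the matrix is distributed.
*
*      OK = ( P*NB .GE. MOD( JA-1, NB ) + N )
*      IF( MOD( JA-1, NB ) + N .GT. NB )
*     $   OK = OK .AND. ( NB .GE. 2*BW )
*      OK = OK .AND. ( JA .EQ. IB )
*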
124 * =====================================================================
125 *
126 *
127 * Notes
128 * =====
129 *
130 * If the factorization routine and the solve routine are to be called
131 * separately (to solve various sets of righthand sides using the same
132 * coefficient matrix), the auxiliary space AF *must not be altered*
133 * between calls to the factorization routine and the solve routine.
134 *
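*   For example, a factor/solve sequence sketched along the lines of
*   the companion routines named above (argument lists should be
*   verified against the PDPBTRF and PDPBTRS headers) keeps AF and
*   LAF untouched between the two calls:
*
*      CALL PDPBTRF( UPLO, N, BW, A, JA, DESCA, AF, LAF,
*     $              WORK, LWORK, INFO )
*      ...                                  (AF must not be altered)
*      CALL PDPBTRS( UPLO, N, BW, NRHS, A, JA, DESCA, B, IB, DESCB,
*     $              AF, LAF, WORK, LWORK, INFO )
*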
135 * The best algorithm for solving banded and tridiagonal linear systems
136 * depends on a variety of parameters, especially the bandwidth.
137 * Currently, only algorithms designed for the case N/P >> bw are
138 * implemented. These go by many names, including Divide and Conquer,
139 * Partitioning, domain decomposition-type, etc.
140 *
141 * Algorithm description: Divide and Conquer
142 *
143 * The Divide and Conquer algorithm assumes the matrix is narrowly
144 * banded compared with the number of equations. In this situation,
145 * it is best to distribute the input matrix A one-dimensionally,
146 * with columns atomic and rows divided amongst the processes.
147 * The basic algorithm divides the banded matrix up into
148 * P pieces with one stored on each processor,
149 * and then proceeds in 2 phases for the factorization or 3 for the
150 * solution of a linear system.
151 * 1) Local Phase:
152 * The individual pieces are factored independently and in
153 * parallel. These factors are applied to the matrix creating
154 * fillin, which is stored in a non-inspectable way in auxiliary
155 * space AF. Mathematically, this is equivalent to reordering
156 * the matrix A as P A P^T and then factoring the principal
157 * leading submatrix of size equal to the sum of the sizes of
158 * the matrices factored on each processor. The factors of
159 * these submatrices overwrite the corresponding parts of A
160 * in memory.
161 * 2) Reduced System Phase:
162 * A small (BW* (P-1)) system is formed representing
163 * interaction of the larger blocks, and is stored (as are its
164 * factors) in the space AF. A parallel Block Cyclic Reduction
165 * algorithm is used. For a linear system, a parallel front solve
166 * followed by an analogous backsolve, both using the structure
167 * of the factored matrix, are performed.
168 * 3) Backsubstitution Phase:
169 * For a linear system, a local backsubstitution is performed on
170 * each processor in parallel.
171 *
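*   Worked size example (hypothetical numbers, for illustration only):
*   with P = 4 processes, N = 1000, BW = 10 and NB = 250, each process
*   factors a roughly 250-by-250 band block in the local phase, while
*   the reduced system of phase 2 has order BW*(P-1) = 30, which is
*   small compared with N.
*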
172 *
173 * Descriptors
174 * ===========
175 *
176 * Descriptors now have *types* and differ from ScaLAPACK 1.0.
177 *
178 * Note: banded codes can use either the old two dimensional
179 * or new one-dimensional descriptors, though the processor grid in
180 * both cases *must be one-dimensional*. We describe both types below.
181 *
182 * Each global data object is described by an associated description
183 * vector. This vector stores the information required to establish
184 * the mapping between an object element and its corresponding process
185 * and memory location.
186 *
187 * Let A be a generic term for any 2D block cyclically distributed array.
188 * Such a global array has an associated description vector DESCA.
189 * In the following comments, the character _ should be read as
190 * "of the global array".
191 *
192 * NOTATION STORED IN EXPLANATION
193 * --------------- -------------- --------------------------------------
194 * DTYPE_A(global) DESCA( DTYPE_ )The descriptor type. In this case,
195 * DTYPE_A = 1.
196 * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
197 * the BLACS process grid A is distribu-
198 * ted over. The context itself is glo-
199 * bal, but the handle (the integer
200 * value) may vary.
201 * M_A (global) DESCA( M_ ) The number of rows in the global
202 * array A.
203 * N_A (global) DESCA( N_ ) The number of columns in the global
204 * array A.
205 * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute
206 * the rows of the array.
207 * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute
208 * the columns of the array.
209 * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
210 * row of the array A is distributed.
211 * CSRC_A (global) DESCA( CSRC_ ) The process column over which the
212 * first column of the array A is
213 * distributed.
214 * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local
215 * array. LLD_A >= MAX(1,LOCr(M_A)).
216 *
217 * Let K be the number of rows or columns of a distributed matrix,
218 * and assume that its process grid has dimension p x q.
219 * LOCr( K ) denotes the number of elements of K that a process
220 * would receive if K were distributed over the p processes of its
221 * process column.
222 * Similarly, LOCc( K ) denotes the number of elements of K that a
223 * process would receive if K were distributed over the q processes of
224 * its process row.
225 * The values of LOCr() and LOCc() may be determined via a call to the
226 * ScaLAPACK tool function, NUMROC:
227 * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
228 * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
229 * An upper bound for these quantities may be computed by:
230 * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
231 * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
232 *
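*   As an illustrative sketch (LOCROWS and LOCCOLS are hypothetical
*   names; MYROW, MYCOL, NPROW and NPCOL are obtained from
*   BLACS_GRIDINFO), the local extents could be queried as:
*
*      INTEGER            LOCROWS, LOCCOLS
*      INTEGER            NUMROC
*      EXTERNAL           NUMROC
*      LOCROWS = NUMROC( M_A, MB_A, MYROW, RSRC_A, NPROW )
*      LOCCOLS = NUMROC( N_A, NB_A, MYCOL, CSRC_A, NPCOL )
*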
233 *
234 * One-dimensional descriptors:
235 *
236 * One-dimensional descriptors are a new addition to ScaLAPACK since
237 * version 1.0. They simplify and shorten the descriptor for 1D
238 * arrays.
239 *
240 * Since ScaLAPACK supports two-dimensional arrays as the fundamental
241 * object, we allow 1D arrays to be distributed either over the
242 * first dimension of the array (as if the grid were P-by-1) or the
243 * 2nd dimension (as if the grid were 1-by-P). This choice is
244 * indicated by the descriptor type (501 or 502)
245 * as described below.
246 *
247 * IMPORTANT NOTE: the actual BLACS grid represented by the
248 * CTXT entry in the descriptor may be *either* P-by-1 or 1-by-P
249 * irrespective of which one-dimensional descriptor type
250 * (501 or 502) is input.
251 * This routine will interpret the grid properly either way.
252 * ScaLAPACK routines *do not support intercontext operations* so that
253 * the grid passed to a single ScaLAPACK routine *must be the same*
254 * for all array descriptors passed to that routine.
255 *
256 * NOTE: In all cases where 1D descriptors are used, 2D descriptors
257 * may also be used, since a one-dimensional array is a special case
258 * of a two-dimensional array with one dimension of size unity.
259 * The two-dimensional array used in this case *must* be of the
260 * proper orientation:
261 * If the appropriate one-dimensional descriptor is DTYPEA=501
262 * (1 by P type), then the two dimensional descriptor must
263 * have a CTXT value that refers to a 1 by P BLACS grid;
264 * If the appropriate one-dimensional descriptor is DTYPEA=502
265 * (P by 1 type), then the two dimensional descriptor must
266 * have a CTXT value that refers to a P by 1 BLACS grid.
267 *
268 *
269 * Summary of allowed descriptors, types, and BLACS grids:
270 * DTYPE 501 502 1 1
271 * BLACS grid 1xP or Px1 1xP or Px1 1xP Px1
272 * -----------------------------------------------------
273 * A OK NO OK NO
274 * B NO OK NO OK
275 *
276 * Let A be a generic term for any 1D block cyclically distributed array.
277 * Such a global array has an associated description vector DESCA.
278 * In the following comments, the character _ should be read as
279 * "of the global array".
280 *
281 * NOTATION STORED IN EXPLANATION
282 * --------------- ---------- ------------------------------------------
283 * DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
284 * TYPE_A = 501: 1-by-P grid.
285 * TYPE_A = 502: P-by-1 grid.
286 * CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
287 * the BLACS process grid A is distribu-
288 * ted over. The context itself is glo-
289 * bal, but the handle (the integer
290 * value) may vary.
291 * N_A (global) DESCA( 3 ) The size of the array dimension being
292 * distributed.
293 * NB_A (global) DESCA( 4 ) The blocking factor used to distribute
294 * the distributed dimension of the array.
295 * SRC_A (global) DESCA( 5 ) The process row or column over which the
296 * first row or column of the array
297 * is distributed.
298 * LLD_A (local) DESCA( 6 ) The leading dimension of the local array
299 * storing the local blocks of the distri-
300 * buted array A. Minimum value of LLD_A
301 * depends on TYPE_A.
302 * TYPE_A = 501: LLD_A >=
303 * size of undistributed dimension, 1.
304 * TYPE_A = 502: LLD_A >=NB_A, 1.
305 * Reserved DESCA( 7 ) Reserved for future use.
306 *
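*   As an illustrative sketch (the values shown are assumptions chosen
*   to be consistent with the rules above, not taken from this
*   routine), a type-501 descriptor for the banded matrix A could be
*   set up as follows:
*
*      INTEGER            DESCA( 7 )
*      DESCA( 1 ) = 501          ! 1-by-P descriptor type
*      DESCA( 2 ) = ICTXT        ! BLACS context handle
*      DESCA( 3 ) = N            ! size of the distributed dimension
*      DESCA( 4 ) = NB           ! column blocking factor
*      DESCA( 5 ) = 0            ! process column of the first column
*      DESCA( 6 ) = BW + 1       ! local leading dimension, >= BW+1
*      DESCA( 7 ) = 0            ! reserved
*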
307 *
308 *
309 * =====================================================================
310 *
311 * Code Developer: Andrew J. Cleary, University of Tennessee.
312 * Current address: Lawrence Livermore National Labs.
313 * This version released: August, 2001.
314 *
315 * =====================================================================
316 *
317 * ..
318 * .. Parameters ..
319  DOUBLE PRECISION ONE, ZERO
320  parameter( one = 1.0d+0 )
321  parameter( zero = 0.0d+0 )
322  INTEGER INT_ONE
323  parameter( int_one = 1 )
324  INTEGER DESCMULT, BIGNUM
325  parameter(descmult = 100, bignum = descmult * descmult)
326  INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
327  $ lld_, mb_, m_, nb_, n_, rsrc_
328  parameter( block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,
329  $ ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,
330  $ rsrc_ = 7, csrc_ = 8, lld_ = 9 )
331 * ..
332 * .. Local Scalars ..
333  INTEGER CSRC, DL_N_M, DL_N_N, DL_P_M, DL_P_N,
334  $ first_proc, i, ictxt, ictxt_new, ictxt_save,
335  $ idum1, idum3, j, ja_new, llda, lldb, mycol,
336  $ myrow, my_num_cols, nb, np, npcol, nprow,
337  $ np_save, odd_size, ofst, part_offset,
338  $ part_size, store_m_b, store_n_a
339  INTEGER NUMROC_SIZE
340 * ..
341 * .. Local Arrays ..
342  INTEGER PARAM_CHECK( 16, 3 )
343 * ..
344 * .. External Subroutines ..
345  EXTERNAL blacs_gridinfo, pxerbla, reshape
346 * ..
347 * .. External Functions ..
348  LOGICAL LSAME
349  INTEGER NUMROC
350  EXTERNAL lsame, numroc
351 * ..
352 * .. Intrinsic Functions ..
353  INTRINSIC ichar, min, mod
354 * ..
355 * .. Executable Statements ..
356 *
357 * Test the input parameters
358 *
359  info = 0
360 *
361  ictxt = desca( ctxt_ )
362  csrc = desca( csrc_ )
363  nb = desca( nb_ )
364  llda = desca( lld_ )
365  store_n_a = desca( n_ )
366  lldb = descb( lld_ )
367  store_m_b = descb( m_ )
368 *
369 *
370 * Get process grid information
371 *
372 *
373  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
374  np = nprow * npcol
375 *
376 *
377 *
378  IF( lsame( uplo, 'U' ) ) THEN
379  idum1 = ichar( 'U' )
380  ELSE IF ( lsame( uplo, 'L' ) ) THEN
381  idum1 = ichar( 'L' )
382  ELSE
383  info = -1
384  END IF
385 *
386  IF( lwork .LT. -1) THEN
387  info = -14
388  ELSE IF ( lwork .EQ. -1 ) THEN
389  idum3 = -1
390  ELSE
391  idum3 = 1
392  ENDIF
393 *
394  IF( n .LT. 0 ) THEN
395  info = -2
396  ENDIF
397 *
398  IF( n+ja-1 .GT. store_n_a ) THEN
399  info = -( 7*100 + 6 )
400  ENDIF
401 *
402  IF(( bw .GT. n-1 ) .OR.
403  $ ( bw .LT. 0 ) ) THEN
404  info = -3
405  ENDIF
406 *
407  IF( llda .LT. (bw+1) ) THEN
408  info = -( 7*100 + 6 )
409  ENDIF
410 *
411  IF( nb .LE. 0 ) THEN
412  info = -( 7*100 + 4 )
413  ENDIF
414 *
415 * Argument checking that is specific to Divide & Conquer routine
416 *
417  IF( nprow .NE. 1 ) THEN
418  info = -( 7*100+2 )
419  ENDIF
420 *
421  IF( n .GT. np*nb-mod( ja-1, nb )) THEN
422  info = -( 2 )
423  CALL pxerbla( ictxt,
424  $ 'PDPBDCMV, D&C alg.: only 1 block per proc',
425  $ -info )
426  RETURN
427  ENDIF
428 *
429  IF((ja+n-1.GT.nb) .AND. ( nb.LT.2*bw )) THEN
430  info = -( 7*100+4 )
431  CALL pxerbla( ictxt,
432  $ 'PDPBDCMV, D&C alg.: NB too small',
433  $ -info )
434  RETURN
435  ENDIF
436 *
437 *
438 * Pack params and positions into arrays for global consistency check
439 *
440  param_check( 16, 1 ) = descb(5)
441  param_check( 15, 1 ) = descb(4)
442  param_check( 14, 1 ) = descb(3)
443  param_check( 13, 1 ) = descb(2)
444  param_check( 12, 1 ) = descb(1)
445  param_check( 11, 1 ) = ib
446  param_check( 10, 1 ) = desca(5)
447  param_check( 9, 1 ) = desca(4)
448  param_check( 8, 1 ) = desca(3)
449  param_check( 7, 1 ) = desca(1)
450  param_check( 6, 1 ) = ja
451  param_check( 5, 1 ) = nrhs
452  param_check( 4, 1 ) = bw
453  param_check( 3, 1 ) = n
454  param_check( 2, 1 ) = idum3
455  param_check( 1, 1 ) = idum1
456 *
457  param_check( 16, 2 ) = 1005
458  param_check( 15, 2 ) = 1004
459  param_check( 14, 2 ) = 1003
460  param_check( 13, 2 ) = 1002
461  param_check( 12, 2 ) = 1001
462  param_check( 11, 2 ) = 9
463  param_check( 10, 2 ) = 705
464  param_check( 9, 2 ) = 704
465  param_check( 8, 2 ) = 703
466  param_check( 7, 2 ) = 701
467  param_check( 6, 2 ) = 6
468  param_check( 5, 2 ) = 4
469  param_check( 4, 2 ) = 3
470  param_check( 3, 2 ) = 2
471  param_check( 2, 2 ) = 14
472  param_check( 1, 2 ) = 1
473 *
474 * Want to find errors with MIN( ), so if no error, set it to a big
475 * number. If there already is an error, multiply by the
476 * descriptor multiplier.
477 *
478  IF( info.GE.0 ) THEN
479  info = bignum
480  ELSE IF( info.LT.-descmult ) THEN
481  info = -info
482  ELSE
483  info = -info * descmult
484  END IF
485 *
486 * Check consistency across processors
487 *
488  CALL globchk( ictxt, 16, param_check, 16,
489  $ param_check( 1, 3 ), info )
490 *
491 * Prepare output: set info = 0 if no error, and divide by DESCMULT
492 * if error is not in a descriptor entry.
493 *
494  IF( info.EQ.bignum ) THEN
495  info = 0
496  ELSE IF( mod( info, descmult ) .EQ. 0 ) THEN
497  info = -info / descmult
498  ELSE
499  info = -info
500  END IF
501 *
502  IF( info.LT.0 ) THEN
503  CALL pxerbla( ictxt, 'PDPBDCMV', -info )
504  RETURN
505  END IF
506 *
507 * Quick return if possible
508 *
509  IF( n.EQ.0 )
510  $ RETURN
511 *
512 *
513 * Adjust addressing into matrix space to properly get into
514 * the beginning part of the relevant data
515 *
516  part_offset = nb*( (ja-1)/(npcol*nb) )
517 *
518  IF ( (mycol-csrc) .LT. (ja-part_offset-1)/nb ) THEN
519  part_offset = part_offset + nb
520  ENDIF
521 *
522  IF ( mycol .LT. csrc ) THEN
523  part_offset = part_offset - nb
524  ENDIF
525 *
526 * Form a new BLACS grid (the "standard form" grid) with only procs
527 * holding part of the matrix, of size 1xNP where NP is adjusted,
528 * starting at csrc=0, with JA modified to reflect dropped procs.
529 *
530 * First processor to hold part of the matrix:
531 *
532  first_proc = mod( ( ja-1 )/nb+csrc, npcol )
533 *
534 * Calculate the new JA while dropping off unused processors.
535 *
536  ja_new = mod( ja-1, nb ) + 1
537 *
538 * Save and compute new value of NP
539 *
540  np_save = np
541  np = ( ja_new+n-2 )/nb + 1
542 *
543 * Call utility routine that forms "standard-form" grid
544 *
545  CALL reshape( ictxt, int_one, ictxt_new, int_one,
546  $ first_proc, int_one, np )
547 *
548 * Use new context from standard grid as context.
549 *
550  ictxt_save = ictxt
551  ictxt = ictxt_new
552 *
553 * Get information about new grid.
554 *
555  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
556 *
557 * Drop out processors that do not have part of the matrix.
558 *
559  IF( myrow .LT. 0 ) THEN
560  GOTO 1234
561  ENDIF
562 *
563 * ********************************
564 * Values reused throughout routine
565 *
566 * User-input value of partition size
567 *
568  part_size = nb
569 *
570 * Number of columns in each processor
571 *
572  my_num_cols = numroc( n, part_size, mycol, 0, npcol )
573 *
574 * Offset in columns to beginning of main partition in each proc
575 *
576  IF ( mycol .EQ. 0 ) THEN
577  part_offset = part_offset+mod( ja_new-1, part_size )
578  my_num_cols = my_num_cols - mod(ja_new-1, part_size )
579  ENDIF
580 *
581 * Offset in elements
582 *
583  ofst = part_offset*llda
584 *
585 * Size of main (or odd) partition in each processor
586 *
587  odd_size = my_num_cols
588  IF ( mycol .LT. np-1 ) THEN
589  odd_size = odd_size - bw
590  ENDIF
591 *
592 *
593 *
594 * Zero out solution to use to accumulate answer
595 *
596  numroc_size =
597  $ numroc( n, part_size, mycol, 0, npcol)
598 *
599  DO 2279 j=1,nrhs
600  DO 4502 i=1,numroc_size
601  x( (j-1)*lldb + i ) = zero
602  4502 CONTINUE
603  2279 CONTINUE
604 *
605  DO 5642 i=1, (bw+2)*bw
606  work( i ) = zero
607  5642 CONTINUE
608 *
609 * Begin main code
610 *
611 *
612 **************************************
613 *
614  IF ( lsame( uplo, 'L' ) ) THEN
615 *
616 * Sizes of the extra triangles communicated between processors
617 *
618  IF( mycol .GT. 0 ) THEN
619 *
620  dl_p_m= min( bw,
621  $ numroc( n, part_size, mycol, 0, npcol ) )
622  dl_p_n= min( bw,
623  $ numroc( n, part_size, mycol-1, 0, npcol ) )
624  ENDIF
625 *
626  IF( mycol .LT. npcol-1 ) THEN
627 *
628  dl_n_m= min( bw,
629  $ numroc( n, part_size, mycol+1, 0, npcol ) )
630  dl_n_n= min( bw,
631  $ numroc( n, part_size, mycol, 0, npcol ) )
632  ENDIF
633 *
634 *
635  IF( mycol .LT. npcol-1 ) THEN
636 * ...must send triangle in upper half of matrix to right
637 *
638 * Transpose triangle in preparation for sending
639 *
640  CALL dlatcpy( 'U', bw, bw,
641  $ a( llda*( numroc_size-bw )+1+bw ),
642  $ llda-1, work( 1 ), bw )
643 *
644 * Send the triangle to neighboring processor to right
645 *
646  CALL dtrsd2d(ictxt, 'L', 'N',
647  $ bw, bw,
648  $ work( 1 ),
649  $ bw, myrow, mycol+1 )
650 *
651  ENDIF
652 *
653 * Use main partition in each processor to multiply locally
654 *
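* (DSBMV computes x := 1.0*A*b + 0.0*x on the locally stored band
* beginning at offset OFST, i.e. the purely local part of the
* product; the contributions coupling neighboring processes are
* handled by the triangle sends/receives around this call.)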
655  CALL dsbmv( 'L', numroc_size, bw, one, a( ofst+1 ), llda,
656  $ b(part_offset+1), 1, zero, x( part_offset+1 ), 1 )
657 *
658 *
659 *
660  IF ( mycol .LT. npcol-1 ) THEN
661 *
662 * Do the multiplication of the triangle in the lower half
663 *
664  CALL dcopy( dl_n_n,
665  $ b( numroc_size-dl_n_n+1 ),
666  $ 1, work( bw*bw+1+bw-dl_n_n ), 1 )
667 *
668  CALL dtrmv( 'U', 'N', 'N', bw,
669  $ a( llda*( numroc_size-bw )+1+bw ), llda-1,
670  $ work( bw*bw+1 ), 1)
671 *
672 * Zero out extraneous elements caused by TRMV if any
673 *
674  IF( dl_n_m .GT. dl_n_n ) THEN
675  DO 10 i = dl_n_m-dl_n_n, dl_n_m
676  work( bw*bw+i ) = 0
677  10 CONTINUE
678  ENDIF
679 *
680 * Send the result to the neighbor
681 *
682  CALL dgesd2d( ictxt, bw, 1,
683  $ work( bw*bw+1 ), bw, myrow, mycol+1 )
684 *
685  ENDIF
686 *
687  IF ( mycol .GT. 0 ) THEN
688 *
689  DO 20 i=1, bw*( bw+2 )
690  work( i ) = zero
691  20 CONTINUE
692 *
693 * Do the multiplication of the triangle in the upper half
694 *
695 * Copy vector to workspace
696 *
697  CALL dcopy( dl_p_m, b( 1 ), 1,
698  $ work( bw*bw+1 ), 1)
699 *
700 * Receive the triangle prior to multiplying by it.
701 *
702  CALL dtrrv2d(ictxt, 'L', 'N',
703  $ bw, bw,
704  $ work( 1 ), bw, myrow, mycol-1 )
705 *
706  CALL dtrmv(
707  $ 'L',
708  $ 'N',
709  $ 'N', bw,
710  $ work( 1 ), bw,
711  $ work( bw*bw+1 ), 1 )
712 *
713 * Zero out extraneous results from TRMV if any
714 *
715  IF( dl_p_m .GT. dl_p_n ) THEN
716  DO 30 i=1, dl_p_m-dl_p_n
717  work( bw*bw+i ) = 0
718  30 CONTINUE
719  ENDIF
720 *
721 * Send result back
722 *
723  CALL dgesd2d( ictxt, bw, 1, work(bw*bw+1 ),
724  $ bw, myrow, mycol-1 )
725 *
726 * Receive vector result from neighboring processor
727 *
728  CALL dgerv2d( ictxt, bw, 1, work( bw*bw+1 ),
729  $ bw, myrow, mycol-1 )
730 *
731 * Do addition of received vector
732 *
733  CALL daxpy( bw, one,
734  $ work( bw*bw+1 ), 1,
735  $ x( 1 ), 1 )
736 *
737  ENDIF
738 *
739 *
740 *
741  IF( mycol .LT. npcol-1 ) THEN
742 *
743 * Receive returned result
744 *
745  CALL dgerv2d( ictxt, bw, 1, work( bw*bw+1 ),
746  $ bw, myrow, mycol+1 )
747 *
748 * Do addition of received vector
749 *
750  CALL daxpy( bw, one,
751  $ work( bw*bw+1 ), 1,
752  $ x( numroc_size-bw+1 ), 1)
753 *
754  ENDIF
755 *
756 *
757  ENDIF
758 *
759 * End of LSAME if
760 *
761 **************************************
762 *
763  IF ( lsame( uplo, 'U' ) ) THEN
764 *
765 * Sizes of the extra triangles communicated between processors
766 *
767  IF( mycol .GT. 0 ) THEN
768 *
769  dl_p_m= min( bw,
770  $ numroc( n, part_size, mycol, 0, npcol ) )
771  dl_p_n= min( bw,
772  $ numroc( n, part_size, mycol-1, 0, npcol ) )
773  ENDIF
774 *
775  IF( mycol .LT. npcol-1 ) THEN
776 *
777  dl_n_m= min( bw,
778  $ numroc( n, part_size, mycol+1, 0, npcol ) )
779  dl_n_n= min( bw,
780  $ numroc( n, part_size, mycol, 0, npcol ) )
781  ENDIF
782 *
783 *
784  IF( mycol .GT. 0 ) THEN
785 * ...must send triangle in lower half of matrix to left
786 *
787 * Transpose triangle in preparation for sending
788 *
789  CALL dlatcpy( 'L', bw, bw, a( ofst+1 ),
790  $ llda-1, work( 1 ), bw )
791 *
792 * Send the triangle to neighboring processor to left
793 *
794  CALL dtrsd2d(ictxt, 'U', 'N',
795  $ bw, bw,
796  $ work( 1 ),
797  $ bw, myrow, mycol-1 )
798 *
799  ENDIF
800 *
801 * Use main partition in each processor to multiply locally
802 *
803  CALL dsbmv( 'U', numroc_size, bw, one, a( ofst+1 ), llda,
804  $ b(part_offset+1), 1, zero, x( part_offset+1 ), 1 )
805 *
806 *
807 *
808  IF ( mycol .LT. npcol-1 ) THEN
809 *
810 * Do the multiplication of the triangle in the lower half
811 *
812  CALL dcopy( dl_n_n,
813  $ b( numroc_size-dl_n_n+1 ),
814  $ 1, work( bw*bw+1+bw-dl_n_n ), 1 )
815 *
816 * Receive the triangle prior to multiplying by it.
817 *
818  CALL dtrrv2d(ictxt, 'U', 'N',
819  $ bw, bw,
820  $ work( 1 ), bw, myrow, mycol+1 )
821 *
822  CALL dtrmv( 'U', 'N', 'N', bw,
823  $ work( 1 ), bw,
824  $ work( bw*bw+1 ), 1)
825 *
826 * Zero out extraneous elements caused by TRMV if any
827 *
828  IF( dl_n_m .GT. dl_n_n ) THEN
829  DO 40 i = dl_n_m-dl_n_n, dl_n_m
830  work( bw*bw+i ) = 0
831  40 CONTINUE
832  ENDIF
833 *
834 * Send the result to the neighbor
835 *
836  CALL dgesd2d( ictxt, bw, 1,
837  $ work( bw*bw+1 ), bw, myrow, mycol+1 )
838 *
839  ENDIF
840 *
841  IF ( mycol .GT. 0 ) THEN
842 *
843  DO 50 i=1, bw*( bw+2 )
844  work( i ) = zero
845  50 CONTINUE
846 *
847 * Do the multiplication of the triangle in the upper half
848 *
849 * Copy vector to workspace
850 *
851  CALL dcopy( dl_p_m, b( 1 ), 1,
852  $ work( bw*bw+1 ), 1)
853 *
854  CALL dtrmv(
855  $ 'L',
856  $ 'N',
857  $ 'N', bw,
858  $ a( 1 ), llda-1,
859  $ work( bw*bw+1 ), 1 )
860 *
861 * Zero out extraneous results from TRMV if any
862 *
863  IF( dl_p_m .GT. dl_p_n ) THEN
864  DO 60 i=1, dl_p_m-dl_p_n
865  work( bw*bw+i ) = 0
866  60 CONTINUE
867  ENDIF
868 *
869 * Send result back
870 *
871  CALL dgesd2d( ictxt, bw, 1, work(bw*bw+1 ),
872  $ bw, myrow, mycol-1 )
873 *
874 * Receive vector result from neighboring processor
875 *
876  CALL dgerv2d( ictxt, bw, 1, work( bw*bw+1 ),
877  $ bw, myrow, mycol-1 )
878 *
879 * Do addition of received vector
880 *
881  CALL daxpy( bw, one,
882  $ work( bw*bw+1 ), 1,
883  $ x( 1 ), 1 )
884 *
885  ENDIF
886 *
887 *
888 *
889  IF( mycol .LT. npcol-1 ) THEN
890 *
891 * Receive returned result
892 *
893  CALL dgerv2d( ictxt, bw, 1, work( bw*bw+1 ),
894  $ bw, myrow, mycol+1 )
895 *
896 * Do addition of received vector
897 *
898  CALL daxpy( bw, one,
899  $ work( bw*bw+1 ), 1,
900  $ x( numroc_size-bw+1 ), 1)
901 *
902  ENDIF
903 *
904 *
905  ENDIF
906 *
907 * End of LSAME if
908 *
909 *
910 * Free BLACS space used to hold standard-form grid.
911 *
912  IF( ictxt_save .NE. ictxt_new ) THEN
913  CALL blacs_gridexit( ictxt_new )
914  ENDIF
915 *
916  1234 CONTINUE
917 *
918 * Restore saved input parameters
919 *
920  ictxt = ictxt_save
921  np = np_save
922 *
923 *
924  RETURN
925 *
926 * End of PDPBDCMV
927 *
928  END