ScaLAPACK 2.1
ScaLAPACK: Scalable Linear Algebra PACKage
pstrord.f
1  SUBROUTINE pstrord( COMPQ, SELECT, PARA, N, T, IT, JT,
2  $ DESCT, Q, IQ, JQ, DESCQ, WR, WI, M, WORK, LWORK,
3  $ IWORK, LIWORK, INFO )
4 *
5 * Contribution from the Department of Computing Science and HPC2N,
6 * Umea University, Sweden
7 *
8 * -- ScaLAPACK computational routine (version 2.0.2) --
9 * Univ. of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver
10 * May 1 2012
11 *
12  IMPLICIT NONE
13 *
14 * .. Scalar Arguments ..
15  CHARACTER COMPQ
16  INTEGER INFO, LIWORK, LWORK, M, N,
17  $ it, jt, iq, jq
18 * ..
19 * .. Array Arguments ..
20  INTEGER SELECT( * )
21  INTEGER PARA( 6 ), DESCT( * ), DESCQ( * ), IWORK( * )
22  REAL Q( * ), T( * ), WI( * ), WORK( * ), WR( * )
23 * ..
24 *
25 * Purpose
26 * =======
27 *
28 * PSTRORD reorders the real Schur factorization of a real matrix
29 * A = Q*T*Q**T, so that a selected cluster of eigenvalues appears
30 * in the leading diagonal blocks of the upper quasi-triangular matrix
31 * T, and the leading columns of Q form an orthonormal basis of the
32 * corresponding right invariant subspace.
33 *
34 * T must be in Schur form (as returned by PSLAHQR), that is, block
35 * upper triangular with 1-by-1 and 2-by-2 diagonal blocks.
36 *
37 * This subroutine uses a delay and accumulate procedure for performing
38 * the off-diagonal updates (see references for details).
39 *
40 * Notes
41 * =====
42 *
43 * Each global data object is described by an associated description
44 * vector. This vector stores the information required to establish
45 * the mapping between an object element and its corresponding process
46 * and memory location.
47 *
48 * Let A be a generic term for any 2D block cyclically distributed array.
49 * Such a global array has an associated description vector DESCA.
50 * In the following comments, the character _ should be read as
51 * "of the global array".
52 *
53 * NOTATION STORED IN EXPLANATION
54 * --------------- -------------- --------------------------------------
55 * DTYPE_A (global) DESCA( DTYPE_ ) The descriptor type. In this case,
56 * DTYPE_A = 1.
57 * CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
58 * the BLACS process grid A is distribu-
59 * ted over. The context itself is glo-
60 * bal, but the handle (the integer
61 * value) may vary.
62 * M_A (global) DESCA( M_ ) The number of rows in the global
63 * array A.
64 * N_A (global) DESCA( N_ ) The number of columns in the global
65 * array A.
66 * MB_A (global) DESCA( MB_ ) The blocking factor used to distribute
67 * the rows of the array.
68 * NB_A (global) DESCA( NB_ ) The blocking factor used to distribute
69 * the columns of the array.
70 * RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
71 * row of the array A is distributed.
72 * CSRC_A (global) DESCA( CSRC_ ) The process column over which the
73 * first column of the array A is
74 * distributed.
75 * LLD_A (local) DESCA( LLD_ ) The leading dimension of the local
76 * array. LLD_A >= MAX(1,LOCr(M_A)).
77 *
78 * Let K be the number of rows or columns of a distributed matrix,
79 * and assume that its process grid has dimension p x q.
80 * LOCr( K ) denotes the number of elements of K that a process
81 * would receive if K were distributed over the p processes of its
82 * process column.
83 * Similarly, LOCc( K ) denotes the number of elements of K that a
84 * process would receive if K were distributed over the q processes of
85 * its process row.
86 * The values of LOCr() and LOCc() may be determined via a call to the
87 * ScaLAPACK tool function, NUMROC:
88 * LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
89 * LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
90 * An upper bound for these quantities may be computed by:
91 * LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
92 * LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
93 *
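* As a hedged, caller-side sketch (not part of this routine), the local
* storage for the distributed N-by-N matrices T and Q could be sized
* via NUMROC; the names TROWS, TCOLS and LLDT below are illustrative,
* and the matrices are assumed to be distributed from process (0,0):
*
*    TROWS = NUMROC( N, NB, MYROW, 0, NPROW )
*    TCOLS = NUMROC( N, NB, MYCOL, 0, NPCOL )
*    LLDT  = MAX( 1, TROWS )
*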
94 * Arguments
95 * =========
96 *
97 *
98 * COMPQ (global input) CHARACTER*1
99 * = 'V': update the matrix Q of Schur vectors;
100 * = 'N': do not update Q.
101 *
102 * SELECT (global input/output) INTEGER array, dimension (N)
103 * SELECT specifies the eigenvalues in the selected cluster. To
104 * select a real eigenvalue w(j), SELECT(j) must be set to 1.
105 * To select a complex conjugate pair of eigenvalues
106 * w(j) and w(j+1), corresponding to a 2-by-2 diagonal block,
107 * either SELECT(j) or SELECT(j+1) or both must be set to 1;
108 * a complex conjugate pair of eigenvalues must be
109 * either both included in the cluster or both excluded.
110 * On output, SELECT reflects the (partial) reordering.
111 *
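* As an illustration only (not part of this routine), a caller holding
* the eigenvalues WR, WI computed by PSLAHQR could, for example, select
* all eigenvalues with negative real part as follows; the consistency
* check at the beginning of PSTRORD then completes any half-selected
* complex conjugate pair:
*
*    DO 5 J = 1, N
*       IF( WR( J ).LT.0.0E0 ) THEN
*          SELECT( J ) = 1
*       ELSE
*          SELECT( J ) = 0
*       END IF
*  5 CONTINUE
*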
112 * PARA (global input) INTEGER array, dimension (6)
113 * Block parameters (some should be replaced by calls to
114 * PILAENV and others by meaningful default values):
115 * PARA(1) = maximum number of concurrent computational windows
116 * allowed in the algorithm;
117 * 0 < PARA(1) <= min(NPROW,NPCOL) must hold;
118 * PARA(2) = number of eigenvalues in each window;
119 * 0 < PARA(2) < PARA(3) must hold;
120 * PARA(3) = window size; PARA(2) < PARA(3) < DESCT(MB_)
121 * must hold;
122 * PARA(4) = minimal percentage of flops required for
123 * performing matrix-matrix multiplications instead
124 * of pipelined orthogonal transformations;
125 * 0 <= PARA(4) <= 100 must hold;
126 * PARA(5) = width of block column slabs for row-wise
127 * application of pipelined orthogonal
128 * transformations in their factorized form;
129 * 0 < PARA(5) <= DESCT(MB_) must hold.
130 * PARA(6) = the maximum number of eigenvalues moved together
131 * over a process border; in practice, this will be
132 * approximately half of the cross border window size
133 * 0 < PARA(6) <= PARA(2) must hold;
134 *
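* A hedged example of PARA settings (illustrative values, not tuned
* defaults) for a 2-by-2 process grid and a distribution block size
* NB = 64; see reference [3] below for recommended settings:
*
*    PARA( 1 ) = 2     concurrent windows, <= MIN(NPROW,NPCOL)
*    PARA( 2 ) = 16    eigenvalues per window, < PARA(3)
*    PARA( 3 ) = 32    window size, < NB
*    PARA( 4 ) = 50    percentage threshold for matrix-matrix updates
*    PARA( 5 ) = 32    width of block column slabs, <= NB
*    PARA( 6 ) = 16    eigenvalues moved together over a border,
*                      <= PARA(2)
*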
135 * N (global input) INTEGER
136 * The order of the globally distributed matrix T. N >= 0.
137 *
138 * T (local input/output) REAL array,
139 * dimension (LLD_T,LOCc(N)).
140 * On entry, the local pieces of the global distributed
141 * upper quasi-triangular matrix T, in Schur form. On exit, T is
142 * overwritten by the local pieces of the reordered matrix T,
143 * again in Schur form, with the selected eigenvalues in the
144 * globally leading diagonal blocks.
145 *
146 * IT (global input) INTEGER
147 * JT (global input) INTEGER
148 * The row and column index in the global array T indicating the
149 * first column of sub( T ). IT = JT = 1 must hold.
150 *
151 * DESCT (global and local input) INTEGER array of dimension DLEN_.
152 * The array descriptor for the global distributed matrix T.
153 *
154 * Q (local input/output) REAL array,
155 * dimension (LLD_Q,LOCc(N)).
156 * On entry, if COMPQ = 'V', the local pieces of the global
157 * distributed matrix Q of Schur vectors.
158 * On exit, if COMPQ = 'V', Q has been postmultiplied by the
159 * global orthogonal transformation matrix which reorders T; the
160 * leading M columns of Q form an orthonormal basis for the
161 * specified invariant subspace.
162 * If COMPQ = 'N', Q is not referenced.
163 *
164 * IQ (global input) INTEGER
165 * JQ (global input) INTEGER
166 * The row and column index in the global array Q indicating the
167 * first column of sub( Q ). IQ = JQ = 1 must hold.
168 *
169 * DESCQ (global and local input) INTEGER array of dimension DLEN_.
170 * The array descriptor for the global distributed matrix Q.
171 *
172 * WR (global output) REAL array, dimension (N)
173 * WI (global output) REAL array, dimension (N)
174 * The real and imaginary parts, respectively, of the reordered
175 * eigenvalues of T. The eigenvalues are in principle stored in
176 * the same order as on the diagonal of T, with WR(i) = T(i,i)
177 * and, if T(i:i+1,i:i+1) is a 2-by-2 diagonal block, WI(i) > 0
178 * and WI(i+1) = -WI(i).
179 * Note also that if a complex eigenvalue is sufficiently
180 * ill-conditioned, then its value may differ significantly
181 * from its value before reordering.
182 *
183 * M (global output) INTEGER
184 * The dimension of the specified invariant subspace.
185 * 0 <= M <= N.
186 *
187 * WORK (local workspace/output) REAL array,
188 * dimension (LWORK)
189 * On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
190 *
191 * LWORK (local input) INTEGER
192 * The dimension of the array WORK.
193 *
194 * If LWORK = -1, then a workspace query is assumed; the routine
195 * only calculates the optimal size of the WORK array, returns
196 * this value as the first entry of the WORK array, and no error
197 * message related to LWORK is issued by PXERBLA.
198 *
199 * IWORK (local workspace/output) INTEGER array, dimension (LIWORK)
200 *
201 * LIWORK (local input) INTEGER
202 * The dimension of the array IWORK.
203 *
204 * If LIWORK = -1, then a workspace query is assumed; the
205 * routine only calculates the optimal size of the IWORK array,
206 * returns this value as the first entry of the IWORK array, and
207 * no error message related to LIWORK is issued by PXERBLA.
208 *
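* A hedged sketch of the usual two-phase calling sequence (caller side
* only; the scalar names are illustrative): first query the workspace
* sizes, then call again with allocated arrays:
*
*    CALL PSTRORD( 'V', SELECT, PARA, N, T, 1, 1, DESCT, Q, 1, 1,
*   $              DESCQ, WR, WI, M, WORK, -1, IWORK, -1, INFO )
*    LWORK  = INT( WORK( 1 ) )
*    LIWORK = IWORK( 1 )
*
* followed by the same call with WORK( LWORK ), IWORK( LIWORK ) and the
* actual LWORK and LIWORK values.
*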
209 * INFO (global output) INTEGER
210 * = 0: successful exit
211 * < 0: if INFO = -i, the i-th argument had an illegal value.
212 * If the i-th argument is an array and the j-th entry had
213 * an illegal value, then INFO = -(i*1000+j); if the i-th
214 * argument is a scalar and had an illegal value, then INFO = -i.
215 * > 0: here we have several possibilities:
216 * *) Reordering of T failed because some eigenvalues are too
217 * close to separate (the problem is very ill-conditioned);
218 * T may have been partially reordered, and WR and WI
219 * contain the eigenvalues in the same order as in T.
220 * On exit, INFO = {the index of T where the swap failed}.
221 * *) A 2-by-2 block to be reordered split into two 1-by-1
222 * blocks and the second block failed to swap with an
223 * adjacent block.
224 * On exit, INFO = {the index of T where the swap failed}.
225 * *) If INFO = N+1, there is no valid BLACS context (see the
226 * BLACS documentation for details).
227 * In a future release this subroutine may distinguish between
228 * cases 1 and 2 above.
229 *
230 * Additional requirements
231 * =======================
232 *
233 * The following alignment requirements must hold:
234 * (a) DESCT( MB_ ) = DESCT( NB_ ) = DESCQ( MB_ ) = DESCQ( NB_ )
235 * (b) DESCT( RSRC_ ) = DESCQ( RSRC_ )
236 * (c) DESCT( CSRC_ ) = DESCQ( CSRC_ )
237 *
238 * All matrices must be blocked by a block factor larger than or
239 * equal to three (3). This is to simplify reordering across processor
240 * borders in the presence of 2-by-2 blocks.
241 *
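* A hedged illustration of descriptors satisfying (a)-(c) and the
* minimum block factor, assuming square blocks NB >= 3, a common origin
* process (0,0), and local leading dimensions LLDT, LLDQ obtained via
* NUMROC as in the Notes section:
*
*    CALL DESCINIT( DESCT, N, N, NB, NB, 0, 0, ICTXT, LLDT, INFO )
*    CALL DESCINIT( DESCQ, N, N, NB, NB, 0, 0, ICTXT, LLDQ, INFO )
*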
242 * Limitations
243 * ===========
244 *
245 * This algorithm cannot work on submatrices of T and Q, i.e.,
246 * IT = JT = IQ = JQ = 1 must hold. This is, however, not a limitation
247 * since PSLAHQR does not compute Schur forms of submatrices anyway.
248 *
249 * References
250 * ==========
251 *
252 * [1] Z. Bai and J. W. Demmel; On swapping diagonal blocks in real
253 * Schur form, Linear Algebra Appl., 186:73--95, 1993. Also as
254 * LAPACK Working Note 54.
255 *
256 * [2] D. Kressner; Block algorithms for reordering standard and
257 * generalized Schur forms, ACM TOMS, 32(4):521-532, 2006.
258 * Also LAPACK Working Note 171.
259 *
260 * [3] R. Granat, B. Kagstrom, and D. Kressner; Parallel eigenvalue
261 * reordering in real Schur form, Concurrency and Computation:
262 * Practice and Experience, 21(9):1225-1250, 2009. Also as
263 * LAPACK Working Note 192.
264 *
265 * Parallel execution recommendations
266 * ==================================
267 *
268 * Use a square grid, if possible, for maximum performance. The block
269 * parameters in PARA should be kept well below the data distribution
270 * block size. In particular, see [3] for recommended settings for
271 * these parameters.
272 *
273 * In general, the parallel algorithm strives to perform as much work
274 * as possible without crossing the block borders on the main block
275 * diagonal.
276 *
277 * Contributors
278 * ============
279 *
280 * Implemented by Robert Granat, Dept. of Computing Science and HPC2N,
281 * Umea University, Sweden, March 2007,
282 * in collaboration with Bo Kagstrom and Daniel Kressner.
283 * Modified by Meiyue Shao, October 2011.
284 *
285 * Revisions
286 * =========
287 *
288 * Please send bug-reports to granat@cs.umu.se
289 *
290 * Keywords
291 * ========
292 *
293 * Real Schur form, eigenvalue reordering
294 *
295 * =====================================================================
296 * ..
297 * .. Parameters ..
298  CHARACTER TOP
299  INTEGER BLOCK_CYCLIC_2D, CSRC_, CTXT_, DLEN_, DTYPE_,
300  $ lld_, mb_, m_, nb_, n_, rsrc_
301  REAL ZERO, ONE
302  PARAMETER ( TOP = '1-Tree',
303  $ block_cyclic_2d = 1, dlen_ = 9, dtype_ = 1,
304  $ ctxt_ = 2, m_ = 3, n_ = 4, mb_ = 5, nb_ = 6,
305  $ rsrc_ = 7, csrc_ = 8, lld_ = 9,
306  $ zero = 0.0, one = 1.0 )
307 * ..
308 * .. Local Scalars ..
309  LOGICAL LQUERY, PAIR, SWAP, WANTQ,
310  $ ISHH, FIRST, SKIP1CR, BORDER, LASTWAIT
311  INTEGER NPROW, NPCOL, MYROW, MYCOL, NB, NPROCS,
312  $ IERR, DIM1, INDX, LLDT, TRSRC, TCSRC, ILOC1,
313  $ jloc1, myierr, ictxt,
314  $ rsrc1, csrc1, iloc3, jloc3, trsrc3,
315  $ tcsrc3, iloc, jloc, trsrc4, tcsrc4,
316  $ flops, i, ilo, ihi, j, k, kk, kks,
317  $ ks, liwmin, lwmin, mmult, n1, n2,
318  $ ncb, ndtraf, nitraf, nwin, numwin, pdtraf,
319  $ pitraf, pdw, wineig, winsiz, lldq,
320  $ rsrc, csrc, ililo, ilihi, ilsel, irsrc,
321  $ icsrc, ipiw, ipw1, ipw2, ipw3, tihi, tilo,
322  $ lihi, window, lilo, lsel, buffer,
323  $ nmwin2, bufflen, lrows, lcols, iloc2, jloc2,
324  $ wneicr, window0, rsrc4, csrc4, lihi4, rsrc3,
325  $ csrc3, rsrc2, csrc2, lihic, lihi1, ilen4,
326  $ seli4, ilen1, dim4, ipw4, qrows, trows,
327  $ tcols, ipw5, ipw6, ipw7, ipw8, jloc4,
328  $ east, west, iloc4, south, north, indxs,
329  $ itt, jtt, ilen, dlen, indxe, trsrc1, tcsrc1,
330  $ trsrc2, tcsrc2, ilos, dir, tlihi, tlilo, tlsel,
331  $ round, last, win0s, win0e, wine, mmax, mmin
332  REAL ELEM, ELEM1, ELEM2, ELEM3, ELEM4, SN, CS, TMP,
333  $ ELEM5
334 * ..
335 * .. Local Arrays ..
336  INTEGER IBUFF( 8 ), IDUM1( 1 ), IDUM2( 1 )
337 * ..
338 * .. External Functions ..
339  LOGICAL LSAME
340  INTEGER NUMROC, INDXG2P, INDXG2L
341  EXTERNAL lsame, numroc, indxg2p, indxg2l
342 * ..
343 * .. External Subroutines ..
344  EXTERNAL pslacpy, pxerbla, pchk1mat, pchk2mat,
345  $ sgemm, slamov, ilacpy, chk1mat,
346  $ infog2l, dgsum2d, sgesd2d, sgerv2d, sgebs2d,
347  $ sgebr2d, igsum2d, blacs_gridinfo, igebs2d,
348  $ igebr2d, igamx2d, igamn2d, bslaapp, bdtrexc
349 * ..
350 * .. Intrinsic Functions ..
351  INTRINSIC abs, max, sqrt, min
352 * ..
353 * .. Local Functions ..
354  INTEGER ICEIL
355 * ..
356 * .. Executable Statements ..
357 *
358 * Get grid parameters.
359 *
360  ictxt = desct( ctxt_ )
361  CALL blacs_gridinfo( ictxt, nprow, npcol, myrow, mycol )
362  nprocs = nprow*npcol
363 *
364 * Test if grid is O.K., i.e., the context is valid.
365 *
366  info = 0
367  IF( nprow.EQ.-1 ) THEN
368  info = n+1
369  END IF
370 *
371 * Check if workspace query.
372 *
373  lquery = lwork.EQ.-1 .OR. liwork.EQ.-1
374 *
375 * Test dimensions for local sanity.
376 *
377  IF( info.EQ.0 ) THEN
378  CALL chk1mat( n, 5, n, 5, it, jt, desct, 9, info )
379  END IF
380  IF( info.EQ.0 ) THEN
381  CALL chk1mat( n, 5, n, 5, iq, jq, descq, 13, info )
382  END IF
383 *
384 * Check the blocking sizes for alignment requirements.
385 *
386  IF( info.EQ.0 ) THEN
387  IF( desct( mb_ ).NE.desct( nb_ ) ) info = -(1000*9 + mb_)
388  END IF
389  IF( info.EQ.0 ) THEN
390  IF( descq( mb_ ).NE.descq( nb_ ) ) info = -(1000*13 + mb_)
391  END IF
392  IF( info.EQ.0 ) THEN
393  IF( desct( mb_ ).NE.descq( mb_ ) ) info = -(1000*9 + mb_)
394  END IF
395 *
396 * Check the blocking sizes for minimum sizes.
397 *
398  IF( info.EQ.0 ) THEN
399  IF( n.NE.desct( mb_ ) .AND. desct( mb_ ).LT.3 )
400  $ info = -(1000*9 + mb_)
401  IF( n.NE.descq( mb_ ) .AND. descq( mb_ ).LT.3 )
402  $ info = -(1000*13 + mb_)
403  END IF
404 *
405 * Check parameters in PARA.
406 *
407  nb = desct( mb_ )
408  IF( info.EQ.0 ) THEN
409  IF( para(1).LT.1 .OR. para(1).GT.min(nprow,npcol) )
410  $ info = -(1000 * 4 + 1)
411  IF( para(2).LT.1 .OR. para(2).GE.para(3) )
412  $ info = -(1000 * 4 + 2)
413  IF( para(3).LT.1 .OR. para(3).GT.nb )
414  $ info = -(1000 * 4 + 3)
415  IF( para(4).LT.0 .OR. para(4).GT.100 )
416  $ info = -(1000 * 4 + 4)
417  IF( para(5).LT.1 .OR. para(5).GT.nb )
418  $ info = -(1000 * 4 + 5)
419  IF( para(6).LT.1 .OR. para(6).GT.para(2) )
420  $ info = -(1000 * 4 + 6)
421  END IF
422 *
423 * Check requirements on IT, JT, IQ and JQ.
424 *
425  IF( info.EQ.0 ) THEN
426  IF( it.NE.1 ) info = -6
427  IF( jt.NE.it ) info = -7
428  IF( iq.NE.1 ) info = -10
429  IF( jq.NE.iq ) info = -11
430  END IF
431 *
432 * Test input parameters for global sanity.
433 *
434  IF( info.EQ.0 ) THEN
435  CALL pchk1mat( n, 5, n, 5, it, jt, desct, 9, 0, idum1,
436  $ idum2, info )
437  END IF
438  IF( info.EQ.0 ) THEN
439  CALL pchk1mat( n, 5, n, 5, iq, jq, descq, 13, 0, idum1,
440  $ idum2, info )
441  END IF
442  IF( info.EQ.0 ) THEN
443  CALL pchk2mat( n, 5, n, 5, it, jt, desct, 9, n, 5, n, 5,
444  $ iq, jq, descq, 13, 0, idum1, idum2, info )
445  END IF
446 *
447 * Decode and test the input parameters.
448 *
449  IF( info.EQ.0 .OR. lquery ) THEN
450 *
451  wantq = lsame( compq, 'V' )
452  IF( n.LT.0 ) THEN
453  info = -4
454  ELSE
455 *
456 * Extract local leading dimension.
457 *
458  lldt = desct( lld_ )
459  lldq = descq( lld_ )
460 *
461 * Check the SELECT vector for consistency and set M to the
462 * dimension of the specified invariant subspace.
463 *
464  m = 0
465  DO 10 k = 1, n
466  IF( k.LT.n ) THEN
467  CALL infog2l( k+1, k, desct, nprow, npcol,
468  $ myrow, mycol, itt, jtt, trsrc, tcsrc )
469  IF( myrow.EQ.trsrc .AND. mycol.EQ.tcsrc ) THEN
470  elem = t( (jtt-1)*lldt + itt )
471  IF( elem.NE.zero ) THEN
472  IF( SELECT(k).NE.0 .AND.
473  $ SELECT(k+1).EQ.0 ) THEN
474 * INFO = -2
475  SELECT(k+1) = 1
476  ELSEIF( SELECT(k).EQ.0 .AND.
477  $ SELECT(k+1).NE.0 ) THEN
478 * INFO = -2
479  SELECT(k) = 1
480  END IF
481  END IF
482  END IF
483  END IF
484  IF( SELECT(k).NE.0 ) m = m + 1
485  10 CONTINUE
486  mmax = m
487  mmin = m
488  IF( nprocs.GT.1 )
489  $ CALL igamx2d( ictxt, 'All', top, 1, 1, mmax, 1, -1,
490  $ -1, -1, -1, -1 )
491  IF( nprocs.GT.1 )
492  $ CALL igamn2d( ictxt, 'All', top, 1, 1, mmin, 1, -1,
493  $ -1, -1, -1, -1 )
494  IF( mmax.GT.mmin ) THEN
495  m = mmax
496  IF( nprocs.GT.1 )
497  $ CALL igamx2d( ictxt, 'All', top, n, 1, SELECT, n,
498  $ -1, -1, -1, -1, -1 )
499  END IF
500 *
501 * Compute needed workspace.
502 *
503  n1 = m
504  n2 = n - m
505 *
506  trows = numroc( n, nb, myrow, desct(rsrc_), nprow )
507  tcols = numroc( n, nb, mycol, desct(csrc_), npcol )
508  lwmin = n + 7*nb**2 + 2*trows*para( 3 ) + tcols*para( 3 ) +
509  $ max( trows*para( 3 ), tcols*para( 3 ) )
510  liwmin = 5*para( 1 ) + para( 2 )*para( 3 ) -
511  $ para( 2 ) * ( para( 2 ) + 1 ) / 2
512 *
513  IF( lwork.LT.lwmin .AND. .NOT.lquery ) THEN
514  info = -17
515  ELSE IF( liwork.LT.liwmin .AND. .NOT.lquery ) THEN
516  info = -19
517  END IF
518  END IF
519  END IF
520 *
521 * Global maximum on info.
522 *
523  IF( nprocs.GT.1 )
524  $ CALL igamx2d( ictxt, 'All', top, 1, 1, info, 1, -1, -1, -1,
525  $ -1, -1 )
526 *
527 * Return if some argument is incorrect.
528 *
529  IF( info.NE.0 .AND. .NOT.lquery ) THEN
530  m = 0
531  CALL pxerbla( ictxt, 'PSTRORD', -info )
532  RETURN
533  ELSEIF( lquery ) THEN
534  work( 1 ) = float(lwmin)
535  iwork( 1 ) = liwmin
536  RETURN
537  END IF
538 *
539 * Quick return if possible.
540 *
541  IF( m.EQ.n .OR. m.EQ.0 ) GO TO 545
542 *
543 * Set parameters.
544 *
545  numwin = para( 1 )
546  wineig = max( para( 2 ), 2 )
547  winsiz = min( max( para( 3 ), para( 2 )*2 ), nb )
548  mmult = para( 4 )
549  ncb = para( 5 )
550  wneicr = para( 6 )
551 *
552 * Insert some pointers into INTEGER workspace.
553 *
554 * Information about all the active windows is stored
555 * in IWORK( 1:5*NUMWIN ). Each processor has a copy.
556 * LILO: start position
557 * LIHI: stop position
558 * LSEL: number of selected eigenvalues
559 * RSRC: processor id (row)
560 * CSRC: processor id (col)
561 * IWORK( IPIW+ ) contain information of orthogonal transformations.
562 *
563  ililo = 1
564  ilihi = ililo + numwin
565  ilsel = ilihi + numwin
566  irsrc = ilsel + numwin
567  icsrc = irsrc + numwin
568  ipiw = icsrc + numwin
569 *
570 * Insert some pointers into REAL workspace - for now we
571 * only need two pointers.
572 *
573  ipw1 = 1
574  ipw2 = ipw1 + nb
575 *
576 * Collect the selected blocks at the top-left corner of T.
577 *
578 * Globally: ignore eigenvalues that are already in order.
579 * ILO is a global variable and is kept updated to be consistent
580 * throughout the process mesh.
581 *
582  ilo = 0
583  40 CONTINUE
584  ilo = ilo + 1
585  IF( ilo.LE.n ) THEN
586  IF( SELECT(ilo).NE.0 ) GO TO 40
587  END IF
588 *
589 * Globally: start the collection at the top of the matrix. Here,
590 * IHI is a global variable and is kept updated to be consistent
591 * throughout the process mesh.
592 *
593  ihi = n
594 *
595 * Globally: While ( ILO <= M ) do
596  50 CONTINUE
597 *
598  IF( ilo.LE.m ) THEN
599 *
600 * Depending on the value of ILO, find the diagonal block index J,
601 * such that T(1+(J-1)*NB:1+J*NB,1+(J-1)*NB:1+J*NB) contains the
602 * first unsorted eigenvalue. Check that J does not point to a
603 * block with only one selected eigenvalue in the last position
604 * which belongs to a split 2-by-2 block.
605 *
606  ilos = ilo - 1
607  52 CONTINUE
608  ilos = ilos + 1
609  IF( SELECT(ilos).EQ.0 ) GO TO 52
610  IF( ilos.LT.n ) THEN
611  IF( SELECT(ilos+1).NE.0 .AND. mod(ilos,nb).EQ.0 ) THEN
612  CALL pselget( 'All', top, elem, t, ilos+1, ilos, desct )
613  IF( elem.NE.zero ) GO TO 52
614  END IF
615  END IF
616  j = iceil(ilos,nb)
617 *
618 * Globally: Set start values of LILO and LIHI for all processes.
619 * Choose also the number of selected eigenvalues at top of each
620 * diagonal block such that the number of eigenvalues which remain
621 * to be reordered is an integer multiple of WINEIG.
622 *
623 * All the information is saved into the INTEGER workspace such
624 * that all processors are aware of each other's operations.
625 *
626 * Compute the number of concurrent windows.
627 *
628  nmwin2 = (iceil(ihi,nb)*nb - (ilo-mod(ilo,nb)+1)+1) / nb
629  nmwin2 = min( min( numwin, nmwin2 ), iceil(n,nb) - j + 1 )
630 *
631 * For all windows, set LSEL = 0 and find a proper start value of
632 * LILO such that LILO points at the first non-selected entry in
633 * the corresponding diagonal block of T.
634 *
635  DO 80 k = 1, nmwin2
636  iwork( ilsel+k-1) = 0
637  iwork( ililo+k-1) = max( ilo, (j-1)*nb+(k-1)*nb+1 )
638  lilo = iwork( ililo+k-1 )
639  82 CONTINUE
640  IF( SELECT(lilo).NE.0 .AND. lilo.LT.(j+k-1)*nb ) THEN
641  lilo = lilo + 1
642  IF( lilo.LE.n ) GO TO 82
643  END IF
644  iwork( ililo+k-1 ) = lilo
645 *
646 * Fix each LILO to ensure that no 2-by-2 block is cut at the top
647 * of the submatrix (LILO:LIHI,LILO:LIHI).
648 *
649  lilo = iwork(ililo+k-1)
650  IF( lilo.GT.nb ) THEN
651  CALL pselget( 'All', top, elem, t, lilo, lilo-1, desct )
652  IF( elem.NE.zero ) THEN
653  IF( lilo.LT.(j+k-1)*nb ) THEN
654  iwork(ililo+k-1) = iwork(ililo+k-1) + 1
655  ELSE
656  iwork(ililo+k-1) = iwork(ililo+k-1) - 1
657  END IF
658  END IF
659  END IF
660 *
661 * Set a proper LIHI value for each window. Also find the
662 * processes that own the corresponding windows.
663 *
664  iwork( ilihi+k-1 ) = iwork( ililo+k-1 )
665  iwork( irsrc+k-1 ) = indxg2p( iwork(ililo+k-1), nb, myrow,
666  $ desct( rsrc_ ), nprow )
667  iwork( icsrc+k-1 ) = indxg2p( iwork(ililo+k-1), nb, mycol,
668  $ desct( csrc_ ), npcol )
669  tilo = iwork(ililo+k-1)
670  tihi = min( n, iceil( tilo, nb ) * nb )
671  DO 90 kk = tihi, tilo, -1
672  IF( SELECT(kk).NE.0 ) THEN
673  iwork(ilihi+k-1) = max(iwork(ilihi+k-1) , kk )
674  iwork(ilsel+k-1) = iwork(ilsel+k-1) + 1
675  IF( iwork(ilsel+k-1).GT.wineig ) THEN
676  iwork(ilihi+k-1) = kk
677  iwork(ilsel+k-1) = 1
678  END IF
679  END IF
680  90 CONTINUE
681 *
682 * Fix each LIHI so that the bottom of the window does not cut a
683 * 2-by-2 block. We exclude such a block if it is located on a block
684 * (process) border and on a window border, or if including it would
685 * violate the maximum number of eigenvalues to reorder inside each
686 * window. If it is only on a window border, we include it.
687 * The excluded block is included automatically later when a
688 * subcluster is reordered into the block from South-East.
689 *
690  lihi = iwork(ilihi+k-1)
691  IF( lihi.LT.n ) THEN
692  CALL pselget( 'All', top, elem, t, lihi+1, lihi, desct )
693  IF( elem.NE.zero ) THEN
694  IF( iceil( lihi, nb ) .NE. iceil( lihi+1, nb ) .OR.
695  $ iwork( ilsel+k-1 ).EQ.wineig ) THEN
696  iwork( ilihi+k-1 ) = iwork( ilihi+k-1 ) - 1
697  IF( iwork( ilsel+k-1 ).GT.2 )
698  $ iwork( ilsel+k-1 ) = iwork( ilsel+k-1 ) - 1
699  ELSE
700  iwork( ilihi+k-1 ) = iwork( ilihi+k-1 ) + 1
701  IF( SELECT(lihi+1).NE.0 )
702  $ iwork( ilsel+k-1 ) = iwork( ilsel+k-1 ) + 1
703  END IF
704  END IF
705  END IF
706  80 CONTINUE
707 *
708 * Fix the special cases of LSEL = 0 and LILO = LIHI for each
709 * window by ensuring that the stop condition for local reordering
710 * is fulfilled directly. Do this by setting LIHI = start position
711 * for the corresponding block and LILO = LIHI + 1.
712 *
713  DO 85 k = 1, nmwin2
714  lilo = iwork( ililo + k - 1 )
715  lihi = iwork( ilihi + k - 1 )
716  lsel = iwork( ilsel + k - 1 )
717  IF( lsel.EQ.0 .OR. lilo.EQ.lihi ) THEN
718  lihi = iwork( ilihi + k - 1 )
719  iwork( ilihi + k - 1 ) = (iceil(lihi,nb)-1)*nb + 1
720  iwork( ililo + k - 1 ) = iwork( ilihi + k - 1 ) + 1
721  END IF
722  85 CONTINUE
723 *
724 * Associate all processors with the first computational window
725 * that should be activated, if possible.
726 *
727  lilo = ihi
728  lihi = ilo
729  lsel = m
730  first = .true.
731  DO 95 window = 1, nmwin2
732  rsrc = iwork(irsrc+window-1)
733  csrc = iwork(icsrc+window-1)
734  IF( myrow.EQ.rsrc .OR. mycol.EQ.csrc ) THEN
735  tlilo = iwork( ililo + window - 1 )
736  tlihi = iwork( ilihi + window - 1 )
737  tlsel = iwork( ilsel + window - 1 )
738  IF( (.NOT. ( lihi .GE. lilo + lsel ) ) .AND.
739  $ ( (tlihi .GE. tlilo + tlsel) .OR. first ) ) THEN
740  IF( first ) first = .false.
741  lilo = tlilo
742  lihi = tlihi
743  lsel = tlsel
744  GO TO 97
745  END IF
746  END IF
747  95 CONTINUE
748  97 CONTINUE
749 *
750 * Exclude all processors that are not involved in any
751 * computational window right now.
752 *
753  ierr = 0
754  IF( lilo.EQ.ihi .AND. lihi.EQ.ilo .AND. lsel.EQ.m )
755  $ GO TO 114
756 *
757 * Make sure all processors associated with a computational window
758 * enter the local reordering the first time.
759 *
760  first = .true.
761 *
762 * Globally for all computational windows:
763 * While ( LIHI >= LILO + LSEL ) do
764  round = 1
765  130 CONTINUE
766  IF( first .OR. ( lihi .GE. lilo + lsel ) ) THEN
767 *
768 * Perform computations in parallel: loop through all
769 * computational windows, do local reordering and accumulate
770 * transformations, broadcast them in the corresponding block
771 * row and columns and compute the corresponding updates.
772 *
773  DO 110 window = 1, nmwin2
774  rsrc = iwork(irsrc+window-1)
775  csrc = iwork(icsrc+window-1)
776 *
777 * The process on the block diagonal computes the
778 * reordering.
779 *
780  IF( myrow.EQ.rsrc .AND. mycol.EQ.csrc ) THEN
781  lilo = iwork(ililo+window-1)
782  lihi = iwork(ilihi+window-1)
783  lsel = iwork(ilsel+window-1)
784 *
785 * Compute the local value of I -- start position.
786 *
787  i = max( lilo, lihi - winsiz + 1 )
788 *
789 * Fix my I so that the top of the window does not cut a
790 * 2-by-2 block.
791 *
792  IF( i.GT.lilo ) THEN
793  CALL infog2l( i, i-1, desct, nprow, npcol, myrow,
794  $ mycol, iloc, jloc, rsrc, csrc )
795  IF( t( lldt*(jloc-1) + iloc ).NE.zero )
796  $ i = i + 1
797  END IF
798 *
799 * Compute local indices for the submatrix to operate on.
800 *
801  CALL infog2l( i, i, desct, nprow, npcol,
802  $ myrow, mycol, iloc1, jloc1, rsrc, csrc )
803 *
804 * The active window is ( I:LIHI, I:LIHI ). Reorder
805 * eigenvalues within this window and pipeline
806 * transformations.
807 *
808  nwin = lihi - i + 1
809  ks = 0
810  pitraf = ipiw
811  pdtraf = ipw2
812 *
813  pair = .false.
814  DO 140 k = i, lihi
815  IF( pair ) THEN
816  pair = .false.
817  ELSE
818  swap = SELECT( k ).NE.0
819  IF( k.LT.lihi ) THEN
820  CALL infog2l( k+1, k, desct, nprow, npcol,
821  $ myrow, mycol, iloc, jloc, rsrc, csrc )
822  IF( t( lldt*(jloc-1) + iloc ).NE.zero )
823  $ pair = .true.
824  END IF
825  IF( swap ) THEN
826  ks = ks + 1
827 *
828 * Swap the K-th block to position I+KS-1.
829 *
830  ierr = 0
831  kk = k - i + 1
832  kks = ks
833  IF( kk.NE.ks ) THEN
834  nitraf = liwork - pitraf + 1
835  ndtraf = lwork - pdtraf + 1
836  CALL bstrexc( nwin,
837  $ t(lldt*(jloc1-1) + iloc1), lldt, kk,
838  $ kks, nitraf, iwork( pitraf ), ndtraf,
839  $ work( pdtraf ), work(ipw1), ierr )
840  pitraf = pitraf + nitraf
841  pdtraf = pdtraf + ndtraf
842 *
843 * Update array SELECT.
844 *
845  IF ( pair ) THEN
846  DO 150 j = i+kk-1, i+kks, -1
847  SELECT(j+1) = SELECT(j-1)
848  150 CONTINUE
849  SELECT(i+kks-1) = 1
850  SELECT(i+kks) = 1
851  ELSE
852  DO 160 j = i+kk-1, i+kks, -1
853  SELECT(j) = SELECT(j-1)
854  160 CONTINUE
855  SELECT(i+kks-1) = 1
856  END IF
857 *
858  IF ( ierr.EQ.1 .OR. ierr.EQ.2 ) THEN
859 *
860 * Some blocks are too close to swap:
861 * prepare to leave in a clean fashion. If
862 * IERR.EQ.2, we must update SELECT to
863 * account for the fact that the 2 by 2
864 * block to be reordered did split and the
865 * first part of this block is already
866 * reordered.
867 *
868  IF ( ierr.EQ.2 ) THEN
869  SELECT( i+kks-3 ) = 1
870  SELECT( i+kks-1 ) = 0
871  kks = kks + 1
872  END IF
873 *
874 * Update off-diagonal blocks immediately.
875 *
876  GO TO 170
877  END IF
878  ks = kks
879  END IF
880  IF( pair )
881  $ ks = ks + 1
882  END IF
883  END IF
884  140 CONTINUE
885  END IF
886  110 CONTINUE
887  170 CONTINUE
888 *
889 * The on-diagonal processes save their information from the
890 * local reordering in the integer buffer. This buffer is
891 * broadcast to the updating processors; see below.
892 *
893  DO 175 window = 1, nmwin2
894  rsrc = iwork(irsrc+window-1)
895  csrc = iwork(icsrc+window-1)
896  IF( myrow.EQ.rsrc .AND. mycol.EQ.csrc ) THEN
897  ibuff( 1 ) = i
898  ibuff( 2 ) = nwin
899  ibuff( 3 ) = pitraf
900  ibuff( 4 ) = ks
901  ibuff( 5 ) = pdtraf
902  ibuff( 6 ) = ndtraf
903  ilen = pitraf - ipiw
904  dlen = pdtraf - ipw2
905  ibuff( 7 ) = ilen
906  ibuff( 8 ) = dlen
907  END IF
908  175 CONTINUE
909 *
910 * For the updates with respect to the local reordering, we
911 * organize the updates in two phases where the update
912 * "direction" (controlled by the DIR variable below) is first
913 * chosen to be the corresponding rows, then the corresponding
914 * columns.
915 *
916  DO 1111 dir = 1, 2
917 *
918 * Broadcast information about the reordering and the
919 * accumulated transformations: I, NWIN, PITRAF, NITRAF,
920 * PDTRAF, NDTRAF. If no broadcast is performed, use an
921 * artificial value of KS to prevent updating indices for
922 * windows already finished (use KS = -1).
923 *
924  DO 111 window = 1, nmwin2
925  rsrc = iwork(irsrc+window-1)
926  csrc = iwork(icsrc+window-1)
927  IF( myrow.EQ.rsrc .OR. mycol.EQ.csrc ) THEN
928  lilo = iwork(ililo+window-1)
929  lihi = iwork(ilihi+window-1)
930  lsel = iwork(ilsel+window-1)
931  END IF
932  IF( myrow.EQ.rsrc .AND. mycol.EQ.csrc ) THEN
933  IF( npcol.GT.1 .AND. dir.EQ.1 )
934  $ CALL igebs2d( ictxt, 'Row', top, 8, 1, ibuff, 8 )
935  IF( nprow.GT.1 .AND. dir.EQ.2 )
936  $ CALL igebs2d( ictxt, 'Col', top, 8, 1, ibuff, 8 )
937  ELSEIF( myrow.EQ.rsrc .OR. mycol.EQ.csrc ) THEN
938  IF( npcol.GT.1 .AND. dir.EQ.1 .AND. myrow.EQ.rsrc )
939  $ THEN
940  IF( first .OR. (lihi .GE. lilo + lsel) ) THEN
941  CALL igebr2d( ictxt, 'Row', top, 8, 1, ibuff, 8,
942  $ rsrc, csrc )
943  i = ibuff( 1 )
944  nwin = ibuff( 2 )
945  pitraf = ibuff( 3 )
946  ks = ibuff( 4 )
947  pdtraf = ibuff( 5 )
948  ndtraf = ibuff( 6 )
949  ilen = ibuff( 7 )
950  dlen = ibuff( 8 )
951  ELSE
952  ilen = 0
953  dlen = 0
954  ks = -1
955  END IF
956  END IF
957  IF( nprow.GT.1 .AND. dir.EQ.2 .AND. mycol.EQ.csrc )
958  $ THEN
959  IF( first .OR. (lihi .GE. lilo + lsel) ) THEN
960  CALL igebr2d( ictxt, 'Col', top, 8, 1, ibuff, 8,
961  $ rsrc, csrc )
962  i = ibuff( 1 )
963  nwin = ibuff( 2 )
964  pitraf = ibuff( 3 )
965  ks = ibuff( 4 )
966  pdtraf = ibuff( 5 )
967  ndtraf = ibuff( 6 )
968  ilen = ibuff( 7 )
969  dlen = ibuff( 8 )
970  ELSE
971  ilen = 0
972  dlen = 0
973  ks = -1
974  END IF
975  END IF
976  END IF
977 *
978 * Broadcast the accumulated transformations - copy all
979 * information from IWORK(IPIW:PITRAF-1) and
980 * WORK(IPW2:PDTRAF-1) to a buffer and broadcast this
981 * buffer in the corresponding block row and column. On
982 * arrival, copy the information back to the correct part of
983 * the workspace. This step is avoided if no computations
984 * were performed at the diagonal processor, i.e.,
985 * BUFFLEN = 0.
986 *
987  IF( myrow.EQ.rsrc .AND. mycol.EQ.csrc ) THEN
988  buffer = pdtraf
989  bufflen = dlen + ilen
990  IF( bufflen.NE.0 ) THEN
991  DO 180 indx = 1, ilen
992  work( buffer+indx-1 ) =
993  $ float( iwork(ipiw+indx-1) )
994  180 CONTINUE
995  CALL slamov( 'All', dlen, 1, work( ipw2 ),
996  $ dlen, work(buffer+ilen), dlen )
997  IF( npcol.GT.1 .AND. dir.EQ.1 ) THEN
998  CALL sgebs2d( ictxt, 'Row', top, bufflen, 1,
999  $ work(buffer), bufflen )
1000  END IF
1001  IF( nprow.GT.1 .AND. dir.EQ.2 ) THEN
1002  CALL sgebs2d( ictxt, 'Col', top, bufflen, 1,
1003  $ work(buffer), bufflen )
1004  END IF
1005  END IF
1006  ELSEIF( myrow.EQ.rsrc .OR. mycol.EQ.csrc ) THEN
1007  IF( npcol.GT.1 .AND. dir.EQ.1 .AND. myrow.EQ.rsrc )
1008  $ THEN
1009  buffer = pdtraf
1010  bufflen = dlen + ilen
1011  IF( bufflen.NE.0 ) THEN
1012  CALL sgebr2d( ictxt, 'Row', top, bufflen, 1,
1013  $ work(buffer), bufflen, rsrc, csrc )
1014  END IF
1015  END IF
1016  IF( nprow.GT.1 .AND. dir.EQ.2 .AND. mycol.EQ.csrc )
1017  $ THEN
1018  buffer = pdtraf
1019  bufflen = dlen + ilen
1020  IF( bufflen.NE.0 ) THEN
1021  CALL sgebr2d( ictxt, 'Col', top, bufflen, 1,
1022  $ work(buffer), bufflen, rsrc, csrc )
1023  END IF
1024  END IF
1025  IF((npcol.GT.1.AND.dir.EQ.1.AND.myrow.EQ.rsrc).OR.
1026  $ (nprow.GT.1.AND.dir.EQ.2.AND.mycol.EQ.csrc ) )
1027  $ THEN
1028  IF( bufflen.NE.0 ) THEN
1029  DO 190 indx = 1, ilen
1030  iwork(ipiw+indx-1) =
1031  $ int(work( buffer+indx-1 ))
1032  190 CONTINUE
1033  CALL slamov( 'All', dlen, 1,
1034  $ work( buffer+ilen ), dlen,
1035  $ work( ipw2 ), dlen )
1036  END IF
1037  END IF
1038  END IF
1039  111 CONTINUE
1040 *
1041 * Now really perform the updates by applying the orthogonal
1042 * transformations to the out-of-window parts of T and Q. This
1043 * step is avoided if no reordering was performed by the on-
1044 * diagonal processor from the beginning, i.e., BUFFLEN = 0.
1045 *
1046 * Count number of operations to decide whether to use
1047 * matrix-matrix multiplications for updating off-diagonal
1048 * parts or not.
1049 *
1050  DO 112 window = 1, nmwin2
1051  rsrc = iwork(irsrc+window-1)
1052  csrc = iwork(icsrc+window-1)
1053 *
1054  IF( (myrow.EQ.rsrc .AND. dir.EQ.1 ).OR.
1055  $ (mycol.EQ.csrc .AND. dir.EQ.2 ) ) THEN
1056  lilo = iwork(ililo+window-1)
1057  lihi = iwork(ilihi+window-1)
1058  lsel = iwork(ilsel+window-1)
1059 *
1060 * Skip update part for current WINDOW if BUFFLEN = 0.
1061 *
1062  IF( bufflen.EQ.0 ) GO TO 295
1063 *
1064  nitraf = pitraf - ipiw
1065  ishh = .false.
1066  flops = 0
1067  DO 200 k = 1, nitraf
1068  IF( iwork( ipiw + k - 1 ).LE.nwin ) THEN
1069  flops = flops + 6
1070  ELSE
1071  flops = flops + 11
1072  ishh = .true.
1073  END IF
1074  200 CONTINUE
1075 *
1076 * Compute amount of work space necessary for performing
1077 * matrix-matrix multiplications.
1078 *
1079  pdw = buffer
1080  ipw3 = pdw + nwin*nwin
1081  ELSE
1082  flops = 0
1083  END IF
1084 *
1085  IF( flops.NE.0 .AND.
1086  $ ( flops*100 ) / ( 2*nwin*nwin ) .GE. mmult ) THEN
1087 *
1088 * Update off-diagonal blocks and Q using matrix-matrix
1089 * multiplications; if there are no Householder
1090 * reflectors it is preferable to take the triangular
1091 * block structure of the transformation matrix into
1092 * account.
1093 *
1094  CALL slaset( 'All', nwin, nwin, zero, one,
1095  $ work( pdw ), nwin )
1096  CALL bslaapp( 1, nwin, nwin, ncb, work( pdw ), nwin,
1097  $ nitraf, iwork(ipiw), work( ipw2 ), work(ipw3) )
1098 *
1099  IF( ishh ) THEN
1100 *
1101 * Loop through the local blocks of the distributed
1102 * matrices T and Q and update them according to the
1103 * performed reordering.
1104 *
1105 * Update the columns of T and Q affected by the
1106 * reordering.
1107 *
1108  IF( dir.EQ.2 ) THEN
1109  DO 210 indx = 1, i-1, nb
1110  CALL infog2l( indx, i, desct, nprow, npcol,
1111  $ myrow, mycol, iloc, jloc, rsrc1, csrc1 )
1112  IF( myrow.EQ.rsrc1 .AND. mycol.EQ.csrc1 )
1113  $ THEN
1114  lrows = min(nb,i-indx)
1115  CALL sgemm( 'No transpose',
1116  $ 'No transpose', lrows, nwin, nwin,
1117  $ one, t((jloc-1)*lldt+iloc), lldt,
1118  $ work( pdw ), nwin, zero,
1119  $ work(ipw3), lrows )
1120  CALL slamov( 'All', lrows, nwin,
1121  $ work(ipw3), lrows,
1122  $ t((jloc-1)*lldt+iloc), lldt )
1123  END IF
1124  210 CONTINUE
1125  IF( wantq ) THEN
1126  DO 220 indx = 1, n, nb
1127  CALL infog2l( indx, i, descq, nprow,
1128  $ npcol, myrow, mycol, iloc, jloc,
1129  $ rsrc1, csrc1 )
1130  IF( myrow.EQ.rsrc1.AND.mycol.EQ.csrc1 )
1131  $ THEN
1132  lrows = min(nb,n-indx+1)
1133  CALL sgemm( 'No transpose',
1134  $ 'No transpose', lrows, nwin, nwin,
1135  $ one, q((jloc-1)*lldq+iloc), lldq,
1136  $ work( pdw ), nwin, zero,
1137  $ work(ipw3), lrows )
1138  CALL slamov( 'All', lrows, nwin,
1139  $ work(ipw3), lrows,
1140  $ q((jloc-1)*lldq+iloc), lldq )
1141  END IF
1142  220 CONTINUE
1143  END IF
1144  END IF
1145 *
1146 * Update the rows of T affected by the reordering
1147 *
1148  IF( dir.EQ.1 ) THEN
1149  IF( lihi.LT.n ) THEN
1150  IF( mod(lihi,nb).GT.0 ) THEN
1151  indx = lihi + 1
1152  CALL infog2l( i, indx, desct, nprow,
1153  $ npcol, myrow, mycol, iloc, jloc,
1154  $ rsrc1, csrc1 )
1155  IF( myrow.EQ.rsrc1.AND.mycol.EQ.csrc1 )
1156  $ THEN
1157  lcols = mod( min( nb-mod(lihi,nb),
1158  $ n-lihi ), nb )
1159  CALL sgemm( 'Transpose',
1160  $ 'No Transpose', nwin, lcols, nwin,
1161  $ one, work( pdw ), nwin,
1162  $ t((jloc-1)*lldt+iloc), lldt, zero,
1163  $ work(ipw3), nwin )
1164  CALL slamov( 'All', nwin, lcols,
1165  $ work(ipw3), nwin,
1166  $ t((jloc-1)*lldt+iloc), lldt )
1167  END IF
1168  END IF
1169  indxs = iceil(lihi,nb)*nb + 1
1170  DO 230 indx = indxs, n, nb
1171  CALL infog2l( i, indx, desct, nprow,
1172  $ npcol, myrow, mycol, iloc, jloc,
1173  $ rsrc1, csrc1 )
1174  IF( myrow.EQ.rsrc1.AND.mycol.EQ.csrc1 )
1175  $ THEN
1176  lcols = min( nb, n-indx+1 )
1177  CALL sgemm( 'Transpose',
1178  $ 'No Transpose', nwin, lcols, nwin,
1179  $ one, work( pdw ), nwin,
1180  $ t((jloc-1)*lldt+iloc), lldt, zero,
1181  $ work(ipw3), nwin )
1182  CALL slamov( 'All', nwin, lcols,
1183  $ work(ipw3), nwin,
1184  $ t((jloc-1)*lldt+iloc), lldt )
1185  END IF
1186  230 CONTINUE
1187  END IF
1188  END IF
1189  ELSE
1190 *
1191 * The NWIN-by-NWIN matrix U containing the
1192 * accumulated orthogonal transformations has the
1193 * following structure:
1194 *
1195 *                   [ U11  U12 ]
1196 *               U = [            ],
1197 *                   [ U21  U22 ]
1198 *
1199 * where U21 is KS-by-KS upper triangular and U12 is
1200 * (NWIN-KS)-by-(NWIN-KS) lower triangular.
1201 *
1202 * Update the columns of T and Q affected by the
1203 * reordering.
1204 *
1205 * Compute T2*U21 + T1*U11 in workspace.
1206 *
1207  IF( dir.EQ.2 ) THEN
1208  DO 240 indx = 1, i-1, nb
1209  CALL infog2l( indx, i, desct, nprow, npcol,
1210  $ myrow, mycol, iloc, jloc, rsrc1, csrc1 )
1211  IF( myrow.EQ.rsrc1 .AND. mycol.EQ.csrc1 )
1212  $ THEN
1213  jloc1 = indxg2l( i+nwin-ks, nb, mycol,
1214  $ desct( csrc_ ), npcol )
1215  lrows = min(nb,i-indx)
1216  CALL slamov( 'All', lrows, ks,
1217  $ t((jloc1-1)*lldt+iloc ), lldt,
1218  $ work(ipw3), lrows )
1219  CALL strmm( 'Right', 'Upper',
1220  $ 'No transpose',
1221  $ 'Non-unit', lrows, ks, one,
1222  $ work( pdw+nwin-ks ), nwin,
1223  $ work(ipw3), lrows )
1224  CALL sgemm( 'No transpose',
1225  $ 'No transpose', lrows, ks, nwin-ks,
1226  $ one, t((jloc-1)*lldt+iloc), lldt,
1227  $ work( pdw ), nwin, one, work(ipw3),
1228  $ lrows )
1229 *
1230 * Compute T1*U12 + T2*U22 in workspace.
1231 *
1232  CALL slamov( 'All', lrows, nwin-ks,
1233  $ t((jloc-1)*lldt+iloc), lldt,
1234  $ work( ipw3+ks*lrows ), lrows )
1235  CALL strmm( 'Right', 'Lower',
1236  $ 'No transpose', 'Non-unit',
1237  $ lrows, nwin-ks, one,
1238  $ work( pdw+nwin*ks ), nwin,
1239  $ work( ipw3+ks*lrows ), lrows )
1240  CALL sgemm( 'No transpose',
1241  $ 'No transpose', lrows, nwin-ks, ks,
1242  $ one, t((jloc1-1)*lldt+iloc), lldt,
1243  $ work( pdw+nwin*ks+nwin-ks ), nwin,
1244  $ one, work( ipw3+ks*lrows ), lrows )
1245 *
1246 * Copy workspace to T.
1247 *
1248  CALL slamov( 'All', lrows, nwin,
1249  $ work(ipw3), lrows,
1250  $ t((jloc-1)*lldt+iloc), lldt )
1251  END IF
1252  240 CONTINUE
1253  IF( wantq ) THEN
1254 *
1255 * Compute Q2*U21 + Q1*U11 in workspace.
1256 *
1257  DO 250 indx = 1, n, nb
1258  CALL infog2l( indx, i, descq, nprow,
1259  $ npcol, myrow, mycol, iloc, jloc,
1260  $ rsrc1, csrc1 )
1261  IF( myrow.EQ.rsrc1.AND.mycol.EQ.csrc1 )
1262  $ THEN
1263  jloc1 = indxg2l( i+nwin-ks, nb,
1264  $ mycol, descq( csrc_ ), npcol )
1265  lrows = min(nb,n-indx+1)
1266  CALL slamov( 'All', lrows, ks,
1267  $ q((jloc1-1)*lldq+iloc ), lldq,
1268  $ work(ipw3), lrows )
1269  CALL strmm( 'Right', 'Upper',
1270  $ 'No transpose', 'Non-unit',
1271  $ lrows, ks, one,
1272  $ work( pdw+nwin-ks ), nwin,
1273  $ work(ipw3), lrows )
1274  CALL sgemm( 'No transpose',
1275  $ 'No transpose', lrows, ks,
1276  $ nwin-ks, one,
1277  $ q((jloc-1)*lldq+iloc), lldq,
1278  $ work( pdw ), nwin, one,
1279  $ work(ipw3), lrows )
1280 *
1281 * Compute Q1*U12 + Q2*U22 in workspace.
1282 *
1283  CALL slamov( 'All', lrows, nwin-ks,
1284  $ q((jloc-1)*lldq+iloc), lldq,
1285  $ work( ipw3+ks*lrows ), lrows)
1286  CALL strmm( 'Right', 'Lower',
1287  $ 'No transpose', 'Non-unit',
1288  $ lrows, nwin-ks, one,
1289  $ work( pdw+nwin*ks ), nwin,
1290  $ work( ipw3+ks*lrows ), lrows)
1291  CALL sgemm( 'No transpose',
1292  $ 'No transpose', lrows, nwin-ks,
1293  $ ks, one, q((jloc1-1)*lldq+iloc),
1294  $ lldq, work(pdw+nwin*ks+nwin-ks),
1295  $ nwin, one, work( ipw3+ks*lrows ),
1296  $ lrows )
1297 *
1298 * Copy workspace to Q.
1299 *
1300  CALL slamov( 'All', lrows, nwin,
1301  $ work(ipw3), lrows,
1302  $ q((jloc-1)*lldq+iloc), lldq )
1303  END IF
1304  250 CONTINUE
1305  END IF
1306  END IF
1307 *
1308  IF( dir.EQ.1 ) THEN
1309  IF ( lihi.LT.n ) THEN
1310 *
1311 * Compute U21**T*T2 + U11**T*T1 in workspace.
1312 *
1313  IF( mod(lihi,nb).GT.0 ) THEN
1314  indx = lihi + 1
1315  CALL infog2l( i, indx, desct, nprow,
1316  $ npcol, myrow, mycol, iloc, jloc,
1317  $ rsrc1, csrc1 )
1318  IF( myrow.EQ.rsrc1.AND.mycol.EQ.csrc1 )
1319  $ THEN
1320  iloc1 = indxg2l( i+nwin-ks, nb, myrow,
1321  $ desct( rsrc_ ), nprow )
1322  lcols = mod( min( nb-mod(lihi,nb),
1323  $ n-lihi ), nb )
1324  CALL slamov( 'All', ks, lcols,
1325  $ t((jloc-1)*lldt+iloc1), lldt,
1326  $ work(ipw3), nwin )
1327  CALL strmm( 'Left', 'Upper',
1328  $ 'Transpose', 'Non-unit', ks,
1329  $ lcols, one, work( pdw+nwin-ks ),
1330  $ nwin, work(ipw3), nwin )
1331  CALL sgemm( 'Transpose',
1332  $ 'No transpose', ks, lcols,
1333  $ nwin-ks, one, work(pdw), nwin,
1334  $ t((jloc-1)*lldt+iloc), lldt, one,
1335  $ work(ipw3), nwin )
1336 *
1337 * Compute U12**T*T1 + U22**T*T2 in
1338 * workspace.
1339 *
1340  CALL slamov( 'All', nwin-ks, lcols,
1341  $ t((jloc-1)*lldt+iloc), lldt,
1342  $ work( ipw3+ks ), nwin )
1343  CALL strmm( 'Left', 'Lower',
1344  $ 'Transpose', 'Non-unit',
1345  $ nwin-ks, lcols, one,
1346  $ work( pdw+nwin*ks ), nwin,
1347  $ work( ipw3+ks ), nwin )
1348  CALL sgemm( 'Transpose',
1349  $ 'No Transpose', nwin-ks, lcols,
1350  $ ks, one,
1351  $ work( pdw+nwin*ks+nwin-ks ),
1352  $ nwin, t((jloc-1)*lldt+iloc1),
1353  $ lldt, one, work( ipw3+ks ),
1354  $ nwin )
1355 *
1356 * Copy workspace to T.
1357 *
1358  CALL slamov( 'All', nwin, lcols,
1359  $ work(ipw3), nwin,
1360  $ t((jloc-1)*lldt+iloc), lldt )
1361  END IF
1362  END IF
1363  indxs = iceil(lihi,nb)*nb + 1
1364  DO 260 indx = indxs, n, nb
1365  CALL infog2l( i, indx, desct, nprow,
1366  $ npcol, myrow, mycol, iloc, jloc,
1367  $ rsrc1, csrc1 )
1368  IF( myrow.EQ.rsrc1.AND.mycol.EQ.csrc1 )
1369  $ THEN
1370 *
1371 * Compute U21**T*T2 + U11**T*T1 in
1372 * workspace.
1373 *
1374  iloc1 = indxg2l( i+nwin-ks, nb,
1375  $ myrow, desct( rsrc_ ), nprow )
1376  lcols = min( nb, n-indx+1 )
1377  CALL slamov( 'All', ks, lcols,
1378  $ t((jloc-1)*lldt+iloc1), lldt,
1379  $ work(ipw3), nwin )
1380  CALL strmm( 'Left', 'Upper',
1381  $ 'Transpose', 'Non-unit', ks,
1382  $ lcols, one,
1383  $ work( pdw+nwin-ks ), nwin,
1384  $ work(ipw3), nwin )
1385  CALL sgemm( 'Transpose',
1386  $ 'No transpose', ks, lcols,
1387  $ nwin-ks, one, work(pdw), nwin,
1388  $ t((jloc-1)*lldt+iloc), lldt, one,
1389  $ work(ipw3), nwin )
1390 *
1391 * Compute U12**T*T1 + U22**T*T2 in
1392 * workspace.
1393 *
1394  CALL slamov( 'All', nwin-ks, lcols,
1395  $ t((jloc-1)*lldt+iloc), lldt,
1396  $ work( ipw3+ks ), nwin )
1397  CALL strmm( 'Left', 'Lower',
1398  $ 'Transpose', 'Non-unit',
1399  $ nwin-ks, lcols, one,
1400  $ work( pdw+nwin*ks ), nwin,
1401  $ work( ipw3+ks ), nwin )
1402  CALL sgemm( 'Transpose',
1403  $ 'No Transpose', nwin-ks, lcols,
1404  $ ks, one,
1405  $ work( pdw+nwin*ks+nwin-ks ),
1406  $ nwin, t((jloc-1)*lldt+iloc1),
1407  $ lldt, one, work(ipw3+ks), nwin )
1408 *
1409 * Copy workspace to T.
1410 *
1411  CALL slamov( 'All', nwin, lcols,
1412  $ work(ipw3), nwin,
1413  $ t((jloc-1)*lldt+iloc), lldt )
1414  END IF
1415  260 CONTINUE
1416  END IF
1417  END IF
1418  END IF
1419  ELSEIF( flops.NE.0 ) THEN
1420 *
1421 * Update off-diagonal blocks and Q using the pipelined
1422 * elementary transformations.
1423 *
1424  IF( dir.EQ.2 ) THEN
1425  DO 270 indx = 1, i-1, nb
1426  CALL infog2l( indx, i, desct, nprow, npcol,
1427  $ myrow, mycol, iloc, jloc, rsrc1, csrc1 )
1428  IF( myrow.EQ.rsrc1 .AND. mycol.EQ.csrc1 ) THEN
1429  lrows = min(nb,i-indx)
1430  CALL bslaapp( 1, lrows, nwin, ncb,
1431  $ t((jloc-1)*lldt+iloc ), lldt, nitraf,
1432  $ iwork(ipiw), work( ipw2 ),
1433  $ work(ipw3) )
1434  END IF
1435  270 CONTINUE
1436  IF( wantq ) THEN
1437  DO 280 indx = 1, n, nb
1438  CALL infog2l( indx, i, descq, nprow, npcol,
1439  $ myrow, mycol, iloc, jloc, rsrc1, csrc1 )
1440  IF( myrow.EQ.rsrc1 .AND. mycol.EQ.csrc1 )
1441  $ THEN
1442  lrows = min(nb,n-indx+1)
1443  CALL bslaapp( 1, lrows, nwin, ncb,
1444  $ q((jloc-1)*lldq+iloc), lldq, nitraf,
1445  $ iwork(ipiw), work( ipw2 ),
1446  $ work(ipw3) )
1447  END IF
1448  280 CONTINUE
1449  END IF
1450  END IF
1451  IF( dir.EQ.1 ) THEN
1452  IF( lihi.LT.n ) THEN
1453  IF( mod(lihi,nb).GT.0 ) THEN
1454  indx = lihi + 1
1455  CALL infog2l( i, indx, desct, nprow, npcol,
1456  $ myrow, mycol, iloc, jloc, rsrc1, csrc1 )
1457  IF( myrow.EQ.rsrc1 .AND. mycol.EQ.csrc1 )
1458  $ THEN
1459  lcols = mod( min( nb-mod(lihi,nb),
1460  $ n-lihi ), nb )
1461  CALL bslaapp( 0, nwin, lcols, ncb,
1462  $ t((jloc-1)*lldt+iloc), lldt, nitraf,
1463  $ iwork(ipiw), work( ipw2 ),
1464  $ work(ipw3) )
1465  END IF
1466  END IF
1467  indxs = iceil(lihi,nb)*nb + 1
1468  DO 290 indx = indxs, n, nb
1469  CALL infog2l( i, indx, desct, nprow, npcol,
1470  $ myrow, mycol, iloc, jloc, rsrc1, csrc1 )
1471  IF( myrow.EQ.rsrc1 .AND. mycol.EQ.csrc1 )
1472  $ THEN
1473  lcols = min( nb, n-indx+1 )
1474  CALL bslaapp( 0, nwin, lcols, ncb,
1475  $ t((jloc-1)*lldt+iloc), lldt, nitraf,
1476  $ iwork(ipiw), work( ipw2 ),
1477  $ work(ipw3) )
1478  END IF
1479  290 CONTINUE
1480  END IF
1481  END IF
1482  END IF
1483 *
1484 * If I was not involved in the updates for the current
1485 * window or the window was fully processed, I go here and
1486 * try again for the next window.
1487 *
1488  295 CONTINUE
1489 *
1490 * Update LILO and LIHI depending on the number of
1491 * eigenvalues really moved - for on-diagonal processes we
1492 * do this update only once since each on-diagonal process
1493 * is only involved with one window at one time. The
1494 * indices are updated in three cases:
1495 * 1) When some reordering was really performed
1496 * -- indicated by BUFFLEN > 0.
1497 * 2) When no selected eigenvalues were found in the
1498 * current window -- indicated by KS = 0.
1499 * 3) When some selected eigenvalues were found in the
1500 * current window but none of them were moved
1501 * (KS > 0 and BUFFLEN = 0).
1502 * False index updating is avoided by sometimes setting
1503 * KS = -1. This will affect processors involved in more
1504 * than one window, where the first one ends up with
1505 * KS = 0 and the second one is already done.
1506 *
1507  IF( myrow.EQ.rsrc.AND.mycol.EQ.csrc ) THEN
1508  IF( dir.EQ.2 ) THEN
1509  IF( bufflen.NE.0 .OR. ks.EQ.0 .OR.
1510  $ ( bufflen.EQ.0 .AND. ks.GT.0 ) )
1511  $ lihi = i + ks - 1
1512  iwork( ilihi+window-1 ) = lihi
1513  IF( .NOT. lihi.GE.lilo+lsel ) THEN
1514  lilo = lilo + lsel
1515  iwork( ililo+window-1 ) = lilo
1516  END IF
1517  END IF
1518  ELSEIF( myrow.EQ.rsrc .AND. dir.EQ.1 ) THEN
1519  IF( bufflen.NE.0 .OR. ks.EQ.0 .OR.
1520  $ ( bufflen.EQ.0 .AND. ks.GT.0 ) )
1521  $ lihi = i + ks - 1
1522  iwork( ilihi+window-1 ) = lihi
1523  IF( .NOT. lihi.GE.lilo+lsel ) THEN
1524  lilo = lilo + lsel
1525  iwork( ililo+window-1 ) = lilo
1526  END IF
1527  ELSEIF( mycol.EQ.csrc .AND. dir.EQ.2 ) THEN
1528  IF( bufflen.NE.0 .OR. ks.EQ.0 .OR.
1529  $ ( bufflen.EQ.0 .AND. ks.GT.0 ) )
1530  $ lihi = i + ks - 1
1531  iwork( ilihi+window-1 ) = lihi
1532  IF( .NOT. lihi.GE.lilo+lsel ) THEN
1533  lilo = lilo + lsel
1534  iwork( ililo+window-1 ) = lilo
1535  END IF
1536  END IF
1537 *
1538  112 CONTINUE
1539 *
1540 * End of direction loop for updates with respect to local
1541 * reordering.
1542 *
1543  1111 CONTINUE
1544 *
1545 * Associate each process with one of the corresponding
1546 * computational windows such that the test for another round
1547 * of local reordering is carried out properly. Since the
1548 * column updates were computed after the row updates, it is
1549 * sufficient to test for changing the association to the
1550 * window in the corresponding process row.
1551 *
1552  DO 113 window = 1, nmwin2
1553  rsrc = iwork( irsrc + window - 1 )
1554  IF( myrow.EQ.rsrc .AND. (.NOT. lihi.GE.lilo+lsel ) ) THEN
1555  lilo = iwork( ililo + window - 1 )
1556  lihi = iwork( ilihi + window - 1 )
1557  lsel = iwork( ilsel + window - 1 )
1558  END IF
1559  113 CONTINUE
1560 *
1561 * End While ( LIHI >= LILO + LSEL )
1562  round = round + 1
1563  IF( first ) first = .false.
1564  GO TO 130
1565  END IF
1566 *
1567 * All processors excluded from the local reordering go here.
1568 *
1569  114 CONTINUE
1570 *
1571 * Barrier to collect the processes before proceeding.
1572 *
1573  CALL blacs_barrier( ictxt, 'All' )
1574 *
1575 * Compute global maximum of IERR so that we know if some process
1576 * experienced a failure in the reordering.
1577 *
1578  myierr = ierr
1579  IF( nprocs.GT.1 )
1580  $ CALL igamx2d( ictxt, 'All', top, 1, 1, ierr, 1, -1,
1581  $ -1, -1, -1, -1 )
1582 *
1583  IF( ierr.NE.0 ) THEN
1584 *
1585 * When calling BSTREXC, the block at position I+KKS-1 failed
1586 * to swap.
1587 *
1588  IF( myierr.NE.0 ) info = max(1,i+kks-1)
1589  IF( nprocs.GT.1 )
1590  $ CALL igamx2d( ictxt, 'All', top, 1, 1, info, 1, -1,
1591  $ -1, -1, -1, -1 )
1592  GO TO 300
1593  END IF
1594 *
1595 * Now, for each computational window, move the selected
1596 * eigenvalues across the process border. Do this by forming the
1597 * processors into groups of four working together to bring the
1598 * window over the border. The processes are numbered as follows
1599 *
1600 * 1 | 2
1601 * --+--
1602 * 3 | 4
1603 *
1604 * where '|' and '-' denote the process (and block) borders.
1605 * This implies that the cluster to be reordered over the border
1606 * is held by process 4, process 1 will receive the cluster after
1607 * the reordering, process 3 holds the local (2,1)th element of a
1608 * 2-by-2 diagonal block located on the block border and process 2
1609 * holds the closest off-diagonal part of the window that is
1610 * affected by the cross-border reordering.
1611 *
1612 * The active window is now ( I : LIHI[4], I : LIHI[4] ), where
1613 * I = MAX( ILO, LIHI - 2*MOD(LIHI,NB) ). If this active window is
1614 * too large compared to the value of PARA( 6 ), it will be
1615 * truncated in both ends such that a maximum of PARA( 6 )
1616 * eigenvalues is reordered across the border this time.
1617 *
1618 * The active window will be collected and built in workspace at
1619 * processes 1 and 4, which both compute the reordering and return
1620 * the updated parts to the corresponding processes 2-3. Next, the
1621 * accumulated transformations are broadcast for updates in the
1622 * block rows and columns that correspond to the process rows and
1623 * columns where processes 1 and 4 reside.
1624 *
1625 * The off-diagonal blocks are updated by the processes receiving
1626 * from the broadcasts of the orthogonal transformations. Since
1627 * the active window is split over the process borders, the
1628 * updates of T and Q require that stripes of block rows or
1629 * columns are exchanged between neighboring processes in the
1630 * corresponding process rows and columns.
1631 *
1632 * First, form each group of processors involved in the
1633 * crossborder reordering. Do this in two (or three) phases:
1634 * 1) Reorder each odd window over the border.
1635 * 2) Reorder each even window over the border.
1636 * 3) Reorder the last odd window over the border, if it was not
1637 * processed in the first phase.
1638 *
1639 * When reordering the odd windows over the border, we must make
1640 * sure that no process row or column is involved in both the
1641 * first and the last window at the same time. This happens when
1642 * the total number of windows is odd, greater than one and equal
1643 * to the minimum process mesh dimension. Therefore the last odd
1644 * window may be reordered over the border last.
1645 *
1646  lastwait = nmwin2.GT.1 .AND. mod(nmwin2,2).EQ.1 .AND.
1647  $ nmwin2.EQ.min(nprow,npcol)
1648 *
1649  last = 0
1650  308 CONTINUE
1651  IF( lastwait ) THEN
1652  IF( last.EQ.0 ) THEN
1653  win0s = 1
1654  win0e = 2
1655  wine = nmwin2 - 1
1656  ELSE
1657  win0s = nmwin2
1658  win0e = nmwin2
1659  wine = nmwin2
1660  END IF
1661  ELSE
1662  win0s = 1
1663  win0e = 2
1664  wine = nmwin2
1665  END IF
1666  DO 310 window0 = win0s, win0e
1667  DO 320 window = window0, wine, 2
1668 *
1669 * Define the process holding the down-right part of the
1670 * window.
1671 *
1672  rsrc4 = iwork(irsrc+window-1)
1673  csrc4 = iwork(icsrc+window-1)
1674 *
1675 * Define the other processes in the group of four.
1676 *
1677  rsrc3 = rsrc4
1678  csrc3 = mod( csrc4 - 1 + npcol, npcol )
1679  rsrc2 = mod( rsrc4 - 1 + nprow, nprow )
1680  csrc2 = csrc4
1681  rsrc1 = rsrc2
1682  csrc1 = csrc3
1683  IF( ( myrow.EQ.rsrc1 .AND. mycol.EQ.csrc1 ) .OR.
1684  $ ( myrow.EQ.rsrc2 .AND. mycol.EQ.csrc2 ) .OR.
1685  $ ( myrow.EQ.rsrc3 .AND. mycol.EQ.csrc3 ) .OR.
1686  $ ( myrow.EQ.rsrc4 .AND. mycol.EQ.csrc4 ) ) THEN
1687 *
1688 * Compute the correct active window - for reordering
1689 * into a block that has not been active at all before,
1690 * we try to reorder as many of our eigenvalues over the
1691 * border as possible without knowing of the situation on
1692 * the other side - this may cause very few eigenvalues
1693 * to be reordered over the border this time (perhaps not
1694 * any) but this should only be an initial problem. Anyway,
1695 * the bottom-right position of the block will be at
1696 * position LIHIC.
1697 *
1698  IF( myrow.EQ.rsrc4 .AND. mycol.EQ.csrc4 ) THEN
1699  lihi4 = ( iwork( ililo + window - 1 ) +
1700  $ iwork( ilihi + window - 1 ) ) / 2
1701  lihic = min(lihi4,(iceil(lihi4,nb)-1)*nb+wneicr)
1702 *
1703 * Fix LIHIC so that the bottom of the window does not
1704 * cut a 2-by-2 block, and make sure all processors in
1705 * the group know about the correct value.
1706 *
1707  IF( (.NOT. lihic.LE.nb) .AND. lihic.LT.n ) THEN
1708  iloc = indxg2l( lihic+1, nb, myrow,
1709  $ desct( rsrc_ ), nprow )
1710  jloc = indxg2l( lihic, nb, mycol,
1711  $ desct( csrc_ ), npcol )
1712  IF( t( (jloc-1)*lldt+iloc ).NE.zero ) THEN
1713  IF( mod( lihic, nb ).EQ.1 .OR.
1714  $ ( mod( lihic, nb ).EQ.2 .AND.
1715  $ SELECT(lihic-2).EQ.0 ) )
1716  $ THEN
1717  lihic = lihic + 1
1718  ELSE
1719  lihic = lihic - 1
1720  END IF
1721  END IF
1722  END IF
1723  IF( rsrc4.NE.rsrc1 .OR. csrc4.NE.csrc1 )
1724  $ CALL igesd2d( ictxt, 1, 1, lihic, 1, rsrc1,
1725  $ csrc1 )
1726  IF( rsrc4.NE.rsrc2 .OR. csrc4.NE.csrc2 )
1727  $ CALL igesd2d( ictxt, 1, 1, lihic, 1, rsrc2,
1728  $ csrc2 )
1729  IF( rsrc4.NE.rsrc3 .OR. csrc4.NE.csrc3 )
1730  $ CALL igesd2d( ictxt, 1, 1, lihic, 1, rsrc3,
1731  $ csrc3 )
1732  END IF
1733  IF( myrow.EQ.rsrc1 .AND. mycol.EQ.csrc1 ) THEN
1734  IF( rsrc4.NE.rsrc1 .OR. csrc4.NE.csrc1 )
1735  $ CALL igerv2d( ictxt, 1, 1, lihic, 1, rsrc4,
1736  $ csrc4 )
1737  END IF
1738  IF( myrow.EQ.rsrc2 .AND. mycol.EQ.csrc2 ) THEN
1739  IF( rsrc4.NE.rsrc2 .OR. csrc4.NE.csrc2 )
1740  $ CALL igerv2d( ictxt, 1, 1, lihic, 1, rsrc4,
1741  $ csrc4 )
1742  END IF
1743  IF( myrow.EQ.rsrc3 .AND. mycol.EQ.csrc3 ) THEN
1744  IF( rsrc4.NE.rsrc3 .OR. csrc4.NE.csrc3 )
1745  $ CALL igerv2d( ictxt, 1, 1, lihic, 1, rsrc4,
1746  $ csrc4 )
1747  END IF
1748 *
1749 * Avoid going over the border with the first window if
1750 * it resides in the block where the last global position
1751 * T(ILO,ILO) is, or if ILO has been updated to point to
1752 * a position to the right of T(LIHIC,LIHIC).
1753 *
1754  skip1cr = window.EQ.1 .AND.
1755  $ iceil(lihic,nb).LE.iceil(ilo,nb)
1756 *
1757 * Decide I, where to put the top of the window, such
1758 * that the top of the window does not cut a 2-by-2
1759 * block. Make sure that we do not end up in a situation
1760 * where a 2-by-2 block split on the border is left in
1761 * its original place -- this can cause infinite loops.
1762 * Remedy: make sure that the part of the window that
1763 * resides left of the border is at least of dimension
1764 * two (2) in case we have 2-by-2 blocks at the top of
1765 * the cross border window.
1766 *
1767 * Also make sure all processors in the group know about
1768 * the correct value of I. When skipping the crossborder
1769 * reordering, just set I = LIHIC.
1770 *
1771  IF( .NOT. skip1cr ) THEN
1772  IF( myrow.EQ.rsrc1 .AND. mycol.EQ.csrc1 ) THEN
1773  IF( window.EQ.1 ) THEN
1774  lihi1 = ilo
1775  ELSE
1776  lihi1 = iwork( ilihi + window - 2 )
1777  END IF
1778  i = max( lihi1,
1779  $ min( lihic-2*mod(lihic,nb) + 1,
1780  $ (iceil(lihic,nb)-1)*nb - 1 ) )
1781  iloc = indxg2l( i, nb, myrow, desct( rsrc_ ),
1782  $ nprow )
1783  jloc = indxg2l( i-1, nb, mycol, desct( csrc_ ),
1784  $ npcol )
1785  IF( t( (jloc-1)*lldt+iloc ).NE.zero )
1786  $ i = i - 1
1787  IF( rsrc1.NE.rsrc4 .OR. csrc1.NE.csrc4 )
1788  $ CALL igesd2d( ictxt, 1, 1, i, 1, rsrc4,
1789  $ csrc4 )
1790  IF( rsrc1.NE.rsrc2 .OR. csrc1.NE.csrc2 )
1791  $ CALL igesd2d( ictxt, 1, 1, i, 1, rsrc2,
1792  $ csrc2 )
1793  IF( rsrc1.NE.rsrc3 .OR. csrc1.NE.csrc3 )
1794  $ CALL igesd2d( ictxt, 1, 1, i, 1, rsrc3,
1795  $ csrc3 )
1796  END IF
1797  IF( myrow.EQ.rsrc2 .AND. mycol.EQ.csrc2 ) THEN
1798  IF( rsrc1.NE.rsrc2 .OR. csrc1.NE.csrc2 )
1799  $ CALL igerv2d( ictxt, 1, 1, i, 1, rsrc1,
1800  $ csrc1 )
1801  END IF
1802  IF( myrow.EQ.rsrc3 .AND. mycol.EQ.csrc3 ) THEN
1803  IF( rsrc1.NE.rsrc3 .OR. csrc1.NE.csrc3 )
1804  $ CALL igerv2d( ictxt, 1, 1, i, 1, rsrc1,
1805  $ csrc1 )
1806  END IF
1807  IF( myrow.EQ.rsrc4 .AND. mycol.EQ.csrc4 ) THEN
1808  IF( rsrc1.NE.rsrc4 .OR. csrc1.NE.csrc4 )
1809  $ CALL igerv2d( ictxt, 1, 1, i, 1, rsrc1,
1810  $ csrc1 )
1811  END IF
1812  ELSE
1813  i = lihic
1814  END IF
1815 *
1816 * Finalize computation of window size: active window is
1817 * now (I:LIHIC,I:LIHIC).
1818 *
1819  nwin = lihic - i + 1
1820  ks = 0
1821 *
1822 * Skip rest of this part if appropriate.
1823 *
1824  IF( skip1cr ) GO TO 360
1825 *
1826 * Divide workspace -- put active window in
1827 * WORK(IPW2:IPW2+NWIN**2-1) and orthogonal
1828 * transformations in WORK(IPW3:...).
1829 *
1830  CALL slaset( 'All', nwin, nwin, zero, zero,
1831  $ work( ipw2 ), nwin )
1832 *
1833  pitraf = ipiw
1834  ipw3 = ipw2 + nwin*nwin
1835  pdtraf = ipw3
1836 *
1837 * Exchange the current view of SELECT for the active
1838 * window between processes 1 and 4 to make sure that
1839 * exactly the same job is performed by both processes.
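* ILEN1 entries of SELECT on the left side of the border
* (starting at global index I) and ILEN4 entries on the right
* side (starting at SELI4) are exchanged.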
1840 *
1841  IF( rsrc1.NE.rsrc4 .OR. csrc1.NE.csrc4 ) THEN
1842  ilen4 = mod(lihic,nb)
1843  seli4 = iceil(i,nb)*nb+1
1844  ilen1 = nwin - ilen4
1845  IF( myrow.EQ.rsrc1 .AND. mycol.EQ.csrc1 ) THEN
1846  CALL igesd2d( ictxt, ilen1, 1, SELECT(i),
1847  $ ilen1, rsrc4, csrc4 )
1848  CALL igerv2d( ictxt, ilen4, 1, SELECT(seli4),
1849  $ ilen4, rsrc4, csrc4 )
1850  END IF
1851  IF( myrow.EQ.rsrc4 .AND. mycol.EQ.csrc4 ) THEN
1852  CALL igesd2d( ictxt, ilen4, 1, SELECT(seli4),
1853  $ ilen4, rsrc1, csrc1 )
1854  CALL igerv2d( ictxt, ilen1, 1, SELECT(i),
1855  $ ilen1, rsrc1, csrc1 )
1856  END IF
1857  END IF
1858 *
1859 * Form the active window by a series of point-to-point
1860 * sends and receives.
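* DIM1 is the order of the part of the window held by process 1
* (above and to the left of the border); DIM4 = NWIN - DIM1 is
* the order of the part held by process 4.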
1861 *
1862  dim1 = nb - mod(i-1,nb)
1863  dim4 = nwin - dim1
1864  IF( myrow.EQ.rsrc1 .AND. mycol.EQ.csrc1 ) THEN
1865  iloc = indxg2l( i, nb, myrow, desct( rsrc_ ),
1866  $ nprow )
1867  jloc = indxg2l( i, nb, mycol, desct( csrc_ ),
1868  $ npcol )
1869  CALL slamov( 'All', dim1, dim1,
1870  $ t((jloc-1)*lldt+iloc), lldt, work(ipw2),
1871  $ nwin )
1872  IF( rsrc1.NE.rsrc4 .OR. csrc1.NE.csrc4 ) THEN
1873  CALL sgesd2d( ictxt, dim1, dim1,
1874  $ work(ipw2), nwin, rsrc4, csrc4 )
1875  CALL sgerv2d( ictxt, dim4, dim4,
1876  $ work(ipw2+dim1*nwin+dim1), nwin, rsrc4,
1877  $ csrc4 )
1878  END IF
1879  END IF
1880  IF( myrow.EQ.rsrc4 .AND. mycol.EQ.csrc4 ) THEN
1881  iloc = indxg2l( i+dim1, nb, myrow, desct( rsrc_ ),
1882  $ nprow )
1883  jloc = indxg2l( i+dim1, nb, mycol, desct( csrc_ ),
1884  $ npcol )
1885  CALL slamov( 'All', dim4, dim4,
1886  $ t((jloc-1)*lldt+iloc), lldt,
1887  $ work(ipw2+dim1*nwin+dim1), nwin )
1888  IF( rsrc4.NE.rsrc1 .OR. csrc4.NE.csrc1 ) THEN
1889  CALL sgesd2d( ictxt, dim4, dim4,
1890  $ work(ipw2+dim1*nwin+dim1), nwin, rsrc1,
1891  $ csrc1 )
1892  CALL sgerv2d( ictxt, dim1, dim1,
1893  $ work(ipw2), nwin, rsrc1, csrc1 )
1894  END IF
1895  END IF
1896  IF( myrow.EQ.rsrc2 .AND. mycol.EQ.csrc2 ) THEN
1897  iloc = indxg2l( i, nb, myrow, desct( rsrc_ ),
1898  $ nprow )
1899  jloc = indxg2l( i+dim1, nb, mycol, desct( csrc_ ),
1900  $ npcol )
1901  CALL slamov( 'All', dim1, dim4,
1902  $ t((jloc-1)*lldt+iloc), lldt,
1903  $ work(ipw2+dim1*nwin), nwin )
1904  IF( rsrc2.NE.rsrc1 .OR. csrc2.NE.csrc1 ) THEN
1905  CALL sgesd2d( ictxt, dim1, dim4,
1906  $ work(ipw2+dim1*nwin), nwin, rsrc1, csrc1 )
1907  END IF
1908  END IF
1909  IF( myrow.EQ.rsrc2 .AND. mycol.EQ.csrc2 ) THEN
1910  IF( rsrc2.NE.rsrc4 .OR. csrc2.NE.csrc4 ) THEN
1911  CALL sgesd2d( ictxt, dim1, dim4,
1912  $ work(ipw2+dim1*nwin), nwin, rsrc4, csrc4 )
1913  END IF
1914  END IF
1915  IF( myrow.EQ.rsrc3 .AND. mycol.EQ.csrc3 ) THEN
1916  iloc = indxg2l( i+dim1, nb, myrow, desct( rsrc_ ),
1917  $ nprow )
1918  jloc = indxg2l( i+dim1-1, nb, mycol,
1919  $ desct( csrc_ ), npcol )
1920  CALL slamov( 'All', 1, 1,
1921  $ t((jloc-1)*lldt+iloc), lldt,
1922  $ work(ipw2+(dim1-1)*nwin+dim1), nwin )
1923  IF( rsrc3.NE.rsrc1 .OR. csrc3.NE.csrc1 ) THEN
1924  CALL sgesd2d( ictxt, 1, 1,
1925  $ work(ipw2+(dim1-1)*nwin+dim1), nwin,
1926  $ rsrc1, csrc1 )
1927  END IF
1928  END IF
1929  IF( myrow.EQ.rsrc3 .AND. mycol.EQ.csrc3 ) THEN
1930  IF( rsrc3.NE.rsrc4 .OR. csrc3.NE.csrc4 ) THEN
1931  CALL sgesd2d( ictxt, 1, 1,
1932  $ work(ipw2+(dim1-1)*nwin+dim1), nwin,
1933  $ rsrc4, csrc4 )
1934  END IF
1935  END IF
1936  IF( myrow.EQ.rsrc1 .AND. mycol.EQ.csrc1 ) THEN
1937  IF( rsrc1.NE.rsrc2 .OR. csrc1.NE.csrc2 ) THEN
1938  CALL sgerv2d( ictxt, dim1, dim4,
1939  $ work(ipw2+dim1*nwin), nwin, rsrc2,
1940  $ csrc2 )
1941  END IF
1942  IF( rsrc1.NE.rsrc3 .OR. csrc1.NE.csrc3 ) THEN
1943  CALL sgerv2d( ictxt, 1, 1,
1944  $ work(ipw2+(dim1-1)*nwin+dim1), nwin,
1945  $ rsrc3, csrc3 )
1946  END IF
1947  END IF
1948  IF( myrow.EQ.rsrc4 .AND. mycol.EQ.csrc4 ) THEN
1949  IF( rsrc4.NE.rsrc2 .OR. csrc4.NE.csrc2 ) THEN
1950  CALL sgerv2d( ictxt, dim1, dim4,
1951  $ work(ipw2+dim1*nwin), nwin, rsrc2,
1952  $ csrc2 )
1953  END IF
1954  IF( rsrc4.NE.rsrc3 .OR. csrc4.NE.csrc3 ) THEN
1955  CALL sgerv2d( ictxt, 1, 1,
1956  $ work(ipw2+(dim1-1)*nwin+dim1), nwin,
1957  $ rsrc3, csrc3 )
1958  END IF
1959  END IF
1960 *
1961 * Compute the reordering (just as in the total local
1962 * case) and accumulate the transformations (ONLY
1963 * ON-DIAGONAL PROCESSES).
1964 *
1965  IF( ( myrow.EQ.rsrc1 .AND. mycol.EQ.csrc1 ) .OR.
1966  $ ( myrow.EQ.rsrc4 .AND. mycol.EQ.csrc4 ) ) THEN
1967  pair = .false.
1968  DO 330 k = i, lihic
1969  IF( pair ) THEN
1970  pair = .false.
1971  ELSE
1972  swap = SELECT( k ).NE.0
1973  IF( k.LT.lihic ) THEN
1974  elem = work(ipw2+(k-i)*nwin+k-i+1)
1975  IF( elem.NE.zero )
1976  $ pair = .true.
1977  END IF
1978  IF( swap ) THEN
1979  ks = ks + 1
1980 *
1981 * Swap the K-th block to position I+KS-1.
1982 *
1983  ierr = 0
1984  kk = k - i + 1
1985  kks = ks
1986  IF( kk.NE.ks ) THEN
1987  nitraf = liwork - pitraf + 1
1988  ndtraf = lwork - pdtraf + 1
1989  CALL bstrexc( nwin, work(ipw2), nwin,
1990  $ kk, kks, nitraf, iwork( pitraf ),
1991  $ ndtraf, work( pdtraf ),
1992  $ work(ipw1), ierr )
1993  pitraf = pitraf + nitraf
1994  pdtraf = pdtraf + ndtraf
1995 *
1996 * Update array SELECT.
1997 *
1998  IF ( pair ) THEN
1999  DO 340 j = i+kk-1, i+kks, -1
2000  SELECT(j+1) = SELECT(j-1)
2001  340 CONTINUE
2002  SELECT(i+kks-1) = 1
2003  SELECT(i+kks) = 1
2004  ELSE
2005  DO 350 j = i+kk-1, i+kks, -1
2006  SELECT(j) = SELECT(j-1)
2007  350 CONTINUE
2008  SELECT(i+kks-1) = 1
2009  END IF
2010 *
2011  IF ( ierr.EQ.1 .OR. ierr.EQ.2 ) THEN
2012 *
2013  IF ( ierr.EQ.2 ) THEN
2014  SELECT( i+kks-3 ) = 1
2015  SELECT( i+kks-1 ) = 0
2016  kks = kks + 1
2017  END IF
2018 *
2019  GO TO 360
2020  END IF
2021  ks = kks
2022  END IF
2023  IF( pair )
2024  $ ks = ks + 1
2025  END IF
2026  END IF
2027  330 CONTINUE
2028  END IF
2029  360 CONTINUE
2030 *
2031 * Save information about the reordering.
2032 *
2033  IF( ( myrow.EQ.rsrc1 .AND. mycol.EQ.csrc1 ) .OR.
2034  $ ( myrow.EQ.rsrc4 .AND. mycol.EQ.csrc4 ) ) THEN
2035  ibuff( 1 ) = i
2036  ibuff( 2 ) = nwin
2037  ibuff( 3 ) = pitraf
2038  ibuff( 4 ) = ks
2039  ibuff( 5 ) = pdtraf
2040  ibuff( 6 ) = ndtraf
2041  ilen = pitraf - ipiw + 1
2042  dlen = pdtraf - ipw3 + 1
2043  ibuff( 7 ) = ilen
2044  ibuff( 8 ) = dlen
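* IBUFF(1:8) = ( I, NWIN, PITRAF, KS, PDTRAF, NDTRAF, ILEN,
* DLEN ) is broadcast below to the process rows and columns
* that take part in the crossborder updates.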
2045 *
2046 * Put reordered data back into global matrix if a
2047 * reordering took place.
2048 *
2049  IF( .NOT. skip1cr ) THEN
2050  IF( myrow.EQ.rsrc1 .AND. mycol.EQ.csrc1 ) THEN
2051  iloc = indxg2l( i, nb, myrow, desct( rsrc_ ),
2052  $ nprow )
2053  jloc = indxg2l( i, nb, mycol, desct( csrc_ ),
2054  $ npcol )
2055  CALL slamov( 'All', dim1, dim1, work(ipw2),
2056  $ nwin, t((jloc-1)*lldt+iloc), lldt )
2057  END IF
2058  IF( myrow.EQ.rsrc4 .AND. mycol.EQ.csrc4 ) THEN
2059  iloc = indxg2l( i+dim1, nb, myrow,
2060  $ desct( rsrc_ ), nprow )
2061  jloc = indxg2l( i+dim1, nb, mycol,
2062  $ desct( csrc_ ), npcol )
2063  CALL slamov( 'All', dim4, dim4,
2064  $ work(ipw2+dim1*nwin+dim1), nwin,
2065  $ t((jloc-1)*lldt+iloc), lldt )
2066  END IF
2067  END IF
2068  END IF
2069 *
2070 * Break if appropriate -- IBUFF(3:8) may now contain
2071 * nonsense, but that's no problem. The processors outside
2072 * the cross border group only need to know about I and
2073 * NWIN to get a correct value of SKIP1CR (see below) and
2074 * to skip the cross border updates if necessary.
2075 *
2076  IF( window.EQ.1 .AND. skip1cr ) GO TO 325
2077 *
2078 * Return reordered data to processes 2 and 3.
2079 *
2080  IF( myrow.EQ.rsrc1 .AND. mycol.EQ.csrc1 ) THEN
2081  IF( rsrc1.NE.rsrc3 .OR. csrc1.NE.csrc3 ) THEN
2082  CALL sgesd2d( ictxt, 1, 1,
2083  $ work( ipw2+(dim1-1)*nwin+dim1 ), nwin,
2084  $ rsrc3, csrc3 )
2085  END IF
2086  END IF
2087  IF( myrow.EQ.rsrc4 .AND. mycol.EQ.csrc4 ) THEN
2088  IF( rsrc4.NE.rsrc2 .OR. csrc4.NE.csrc2 ) THEN
2089  CALL sgesd2d( ictxt, dim1, dim4,
2090  $ work( ipw2+dim1*nwin), nwin, rsrc2,
2091  $ csrc2 )
2092  END IF
2093  END IF
2094  IF( myrow.EQ.rsrc2 .AND. mycol.EQ.csrc2 ) THEN
2095  iloc = indxg2l( i, nb, myrow, desct( rsrc_ ),
2096  $ nprow )
2097  jloc = indxg2l( i+dim1, nb, mycol,
2098  $ desct( csrc_ ), npcol )
2099  IF( rsrc2.NE.rsrc4 .OR. csrc2.NE.csrc4 ) THEN
2100  CALL sgerv2d( ictxt, dim1, dim4,
2101  $ work(ipw2+dim1*nwin), nwin, rsrc4, csrc4 )
2102  END IF
2103  CALL slamov( 'All', dim1, dim4,
2104  $ work( ipw2+dim1*nwin ), nwin,
2105  $ t((jloc-1)*lldt+iloc), lldt )
2106  END IF
2107  IF( myrow.EQ.rsrc3 .AND. mycol.EQ.csrc3 ) THEN
2108  iloc = indxg2l( i+dim1, nb, myrow,
2109  $ desct( rsrc_ ), nprow )
2110  jloc = indxg2l( i+dim1-1, nb, mycol,
2111  $ desct( csrc_ ), npcol )
2112  IF( rsrc3.NE.rsrc1 .OR. csrc3.NE.csrc1 ) THEN
2113  CALL sgerv2d( ictxt, 1, 1,
2114  $ work( ipw2+(dim1-1)*nwin+dim1 ), nwin,
2115  $ rsrc1, csrc1 )
2116  END IF
2117  t((jloc-1)*lldt+iloc) =
2118  $ work( ipw2+(dim1-1)*nwin+dim1 )
2119  END IF
2120  END IF
2121 *
2122  325 CONTINUE
2123 *
2124  320 CONTINUE
2125 *
2126 * For the crossborder updates, we use the same directions as
2127 * in the local reordering case above.
2128 *
2129  DO 2222 dir = 1, 2
2130 *
2131 * Broadcast information about the reordering.
2132 *
2133  DO 321 window = window0, wine, 2
2134  rsrc4 = iwork(irsrc+window-1)
2135  csrc4 = iwork(icsrc+window-1)
2136  rsrc1 = mod( rsrc4 - 1 + nprow, nprow )
2137  csrc1 = mod( csrc4 - 1 + npcol, npcol )
2138  IF( myrow.EQ.rsrc1 .AND. mycol.EQ.csrc1 ) THEN
2139  IF( npcol.GT.1 .AND. dir.EQ.1 )
2140  $ CALL igebs2d( ictxt, 'Row', top, 8, 1,
2141  $ ibuff, 8 )
2142  IF( nprow.GT.1 .AND. dir.EQ.2 )
2143  $ CALL igebs2d( ictxt, 'Col', top, 8, 1,
2144  $ ibuff, 8 )
2145  skip1cr = window.EQ.1 .AND.
2146  $ iceil(lihic,nb).LE.iceil(ilo,nb)
2147  ELSEIF( myrow.EQ.rsrc1 .OR. mycol.EQ.csrc1 ) THEN
2148  IF( npcol.GT.1 .AND. dir.EQ.1 .AND.
2149  $ myrow.EQ.rsrc1 ) THEN
2150  CALL igebr2d( ictxt, 'Row', top, 8, 1,
2151  $ ibuff, 8, rsrc1, csrc1 )
2152  i = ibuff( 1 )
2153  nwin = ibuff( 2 )
2154  pitraf = ibuff( 3 )
2155  ks = ibuff( 4 )
2156  pdtraf = ibuff( 5 )
2157  ndtraf = ibuff( 6 )
2158  ilen = ibuff( 7 )
2159  dlen = ibuff( 8 )
2160  bufflen = ilen + dlen
2161  ipw3 = ipw2 + nwin*nwin
2162  dim1 = nb - mod(i-1,nb)
2163  dim4 = nwin - dim1
2164  lihic = nwin + i - 1
2165  skip1cr = window.EQ.1 .AND.
2166  $ iceil(lihic,nb).LE.iceil(ilo,nb)
2167  END IF
2168  IF( nprow.GT.1 .AND. dir.EQ.2 .AND.
2169  $ mycol.EQ.csrc1 ) THEN
2170  CALL igebr2d( ictxt, 'Col', top, 8, 1,
2171  $ ibuff, 8, rsrc1, csrc1 )
2172  i = ibuff( 1 )
2173  nwin = ibuff( 2 )
2174  pitraf = ibuff( 3 )
2175  ks = ibuff( 4 )
2176  pdtraf = ibuff( 5 )
2177  ndtraf = ibuff( 6 )
2178  ilen = ibuff( 7 )
2179  dlen = ibuff( 8 )
2180  bufflen = ilen + dlen
2181  ipw3 = ipw2 + nwin*nwin
2182  dim1 = nb - mod(i-1,nb)
2183  dim4 = nwin - dim1
2184  lihic = nwin + i - 1
2185  skip1cr = window.EQ.1 .AND.
2186  $ iceil(lihic,nb).LE.iceil(ilo,nb)
2187  END IF
2188  END IF
2189  IF( rsrc1.NE.rsrc4 ) THEN
2190  IF( myrow.EQ.rsrc4 .AND. mycol.EQ.csrc4 ) THEN
2191  IF( npcol.GT.1 .AND. dir.EQ.1 )
2192  $ CALL igebs2d( ictxt, 'Row', top, 8, 1,
2193  $ ibuff, 8 )
2194  skip1cr = window.EQ.1 .AND.
2195  $ iceil(lihic,nb).LE.iceil(ilo,nb)
2196  ELSEIF( myrow.EQ.rsrc4 ) THEN
2197  IF( npcol.GT.1 .AND. dir.EQ.1 ) THEN
2198  CALL igebr2d( ictxt, 'Row', top, 8, 1,
2199  $ ibuff, 8, rsrc4, csrc4 )
2200  i = ibuff( 1 )
2201  nwin = ibuff( 2 )
2202  pitraf = ibuff( 3 )
2203  ks = ibuff( 4 )
2204  pdtraf = ibuff( 5 )
2205  ndtraf = ibuff( 6 )
2206  ilen = ibuff( 7 )
2207  dlen = ibuff( 8 )
2208  bufflen = ilen + dlen
2209  ipw3 = ipw2 + nwin*nwin
2210  dim1 = nb - mod(i-1,nb)
2211  dim4 = nwin - dim1
2212  lihic = nwin + i - 1
2213  skip1cr = window.EQ.1 .AND.
2214  $ iceil(lihic,nb).LE.iceil(ilo,nb)
2215  END IF
2216  END IF
2217  END IF
2218  IF( csrc1.NE.csrc4 ) THEN
2219  IF( myrow.EQ.rsrc4 .AND. mycol.EQ.csrc4 ) THEN
2220  IF( nprow.GT.1 .AND. dir.EQ.2 )
2221  $ CALL igebs2d( ictxt, 'Col', top, 8, 1,
2222  $ ibuff, 8 )
2223  skip1cr = window.EQ.1 .AND.
2224  $ iceil(lihic,nb).LE.iceil(ilo,nb)
2225  ELSEIF( mycol.EQ.csrc4 ) THEN
2226  IF( nprow.GT.1 .AND. dir.EQ.2 ) THEN
2227  CALL igebr2d( ictxt, 'Col', top, 8, 1,
2228  $ ibuff, 8, rsrc4, csrc4 )
2229  i = ibuff( 1 )
2230  nwin = ibuff( 2 )
2231  pitraf = ibuff( 3 )
2232  ks = ibuff( 4 )
2233  pdtraf = ibuff( 5 )
2234  ndtraf = ibuff( 6 )
2235  ilen = ibuff( 7 )
2236  dlen = ibuff( 8 )
2237  bufflen = ilen + dlen
2238  ipw3 = ipw2 + nwin*nwin
2239  dim1 = nb - mod(i-1,nb)
2240  dim4 = nwin - dim1
2241  lihic = nwin + i - 1
2242  skip1cr = window.EQ.1 .AND.
2243  $ iceil(lihic,nb).LE.iceil(ilo,nb)
2244  END IF
2245  END IF
2246  END IF
2247 *
2248 * Skip rest of broadcasts and updates if appropriate.
2249 *
2250  IF( skip1cr ) GO TO 326
2251 *
2252 * Broadcast the orthogonal transformations.
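* The ILEN integer entries of the transformation list in IWORK
* (starting at IPIW) are cast to REAL and packed in front of
* the DLEN accumulated transformation values, so that both can
* be sent in a single broadcast of length BUFFLEN = ILEN + DLEN.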
2253 *
2254  IF( myrow.EQ.rsrc1 .AND. mycol.EQ.csrc1 ) THEN
2255  buffer = pdtraf
2256  bufflen = dlen + ilen
2257  IF( (nprow.GT.1 .AND. dir.EQ.2) .OR.
2258  $ (npcol.GT.1 .AND. dir.EQ.1) ) THEN
2259  DO 370 indx = 1, ilen
2260  work( buffer+indx-1 ) =
2261  $ float( iwork(ipiw+indx-1) )
2262  370 CONTINUE
2263  CALL slamov( 'All', dlen, 1, work( ipw3 ),
2264  $ dlen, work(buffer+ilen), dlen )
2265  END IF
2266  IF( npcol.GT.1 .AND. dir.EQ.1 ) THEN
2267  CALL sgebs2d( ictxt, 'Row', top, bufflen, 1,
2268  $ work(buffer), bufflen )
2269  END IF
2270  IF( nprow.GT.1 .AND. dir.EQ.2 ) THEN
2271  CALL sgebs2d( ictxt, 'Col', top, bufflen, 1,
2272  $ work(buffer), bufflen )
2273  END IF
2274  ELSEIF( myrow.EQ.rsrc1 .OR. mycol.EQ.csrc1 ) THEN
2275  IF( npcol.GT.1 .AND. dir.EQ.1 .AND.
2276  $ myrow.EQ.rsrc1 ) THEN
2277  buffer = pdtraf
2278  bufflen = dlen + ilen
2279  CALL sgebr2d( ictxt, 'Row', top, bufflen, 1,
2280  $ work(buffer), bufflen, rsrc1, csrc1 )
2281  END IF
2282  IF( nprow.GT.1 .AND. dir.EQ.2 .AND.
2283  $ mycol.EQ.csrc1 ) THEN
2284  buffer = pdtraf
2285  bufflen = dlen + ilen
2286  CALL sgebr2d( ictxt, 'Col', top, bufflen, 1,
2287  $ work(buffer), bufflen, rsrc1, csrc1 )
2288  END IF
2289  IF( (npcol.GT.1.AND.dir.EQ.1.AND.myrow.EQ.rsrc1)
2290  $ .OR. (nprow.GT.1.AND.dir.EQ.2.AND.
2291  $ mycol.EQ.csrc1) ) THEN
2292  DO 380 indx = 1, ilen
2293  iwork(ipiw+indx-1) =
2294  $ int( work( buffer+indx-1 ) )
2295  380 CONTINUE
2296  CALL slamov( 'All', dlen, 1,
2297  $ work( buffer+ilen ), dlen,
2298  $ work( ipw3 ), dlen )
2299  END IF
2300  END IF
2301  IF( rsrc1.NE.rsrc4 ) THEN
2302  IF( myrow.EQ.rsrc4 .AND. mycol.EQ.csrc4 ) THEN
2303  buffer = pdtraf
2304  bufflen = dlen + ilen
2305  IF( npcol.GT.1 .AND. dir.EQ.1 ) THEN
2306  DO 390 indx = 1, ilen
2307  work( buffer+indx-1 ) =
2308  $ float( iwork(ipiw+indx-1) )
2309  390 CONTINUE
2310  CALL slamov( 'All', dlen, 1, work( ipw3 ),
2311  $ dlen, work(buffer+ilen), dlen )
2312  CALL sgebs2d( ictxt, 'Row', top, bufflen,
2313  $ 1, work(buffer), bufflen )
2314  END IF
2315  ELSEIF( myrow.EQ.rsrc4 .AND. dir.EQ.1 .AND.
2316  $ npcol.GT.1 ) THEN
2317  buffer = pdtraf
2318  bufflen = dlen + ilen
2319  CALL sgebr2d( ictxt, 'Row', top, bufflen,
2320  $ 1, work(buffer), bufflen, rsrc4, csrc4 )
2321  DO 400 indx = 1, ilen
2322  iwork(ipiw+indx-1) =
2323  $ int( work( buffer+indx-1 ) )
2324  400 CONTINUE
2325  CALL slamov( 'All', dlen, 1,
2326  $ work( buffer+ilen ), dlen,
2327  $ work( ipw3 ), dlen )
2328  END IF
2329  END IF
2330  IF( csrc1.NE.csrc4 ) THEN
2331  IF( myrow.EQ.rsrc4 .AND. mycol.EQ.csrc4 ) THEN
2332  buffer = pdtraf
2333  bufflen = dlen + ilen
2334  IF( nprow.GT.1 .AND. dir.EQ.2 ) THEN
2335  DO 395 indx = 1, ilen
2336  work( buffer+indx-1 ) =
2337  $ float( iwork(ipiw+indx-1) )
2338  395 CONTINUE
2339  CALL slamov( 'All', dlen, 1, work( ipw3 ),
2340  $ dlen, work(buffer+ilen), dlen )
2341  CALL sgebs2d( ictxt, 'Col', top, bufflen,
2342  $ 1, work(buffer), bufflen )
2343  END IF
2344  ELSEIF( mycol.EQ.csrc4 .AND. dir.EQ.2 .AND.
2345  $ nprow.GT.1 ) THEN
2346  buffer = pdtraf
2347  bufflen = dlen + ilen
2348  CALL sgebr2d( ictxt, 'Col', top, bufflen, 1,
2349  $ work(buffer), bufflen, rsrc4, csrc4 )
2350  DO 402 indx = 1, ilen
2351  iwork(ipiw+indx-1) =
2352  $ int( work( buffer+indx-1 ) )
2353  402 CONTINUE
2354  CALL slamov( 'All', dlen, 1,
2355  $ work( buffer+ilen ), dlen,
2356  $ work( ipw3 ), dlen )
2357  END IF
2358  END IF
2359 *
2360  326 CONTINUE
2361 *
2362  321 CONTINUE
2363 *
2364 * Compute crossborder updates.
2365 *
2366  DO 322 window = window0, wine, 2
2367  IF( window.EQ.1 .AND. skip1cr ) GO TO 327
2368  rsrc4 = iwork(irsrc+window-1)
2369  csrc4 = iwork(icsrc+window-1)
2370  rsrc1 = mod( rsrc4 - 1 + nprow, nprow )
2371  csrc1 = mod( csrc4 - 1 + npcol, npcol )
2372 *
2373 * Prepare workspaces for updates:
2374 * IPW3 now holds the orthogonal transformations
2375 * IPW4 holds the explicit orthogonal matrix, if formed
2376 * IPW5 holds the crossborder block column of T
2377 * IPW6 holds the crossborder block row of T
2378 * IPW7 holds the crossborder block column of Q
2379 * (if WANTQ=.TRUE.)
2380 * IPW8 points to the leftover workspace used to hold the
2381 * results of the matrix multiplications
2382 *
2383  IF( ((mycol.EQ.csrc1.OR.mycol.EQ.csrc4).AND.dir.EQ.2)
2384  $ .OR. ((myrow.EQ.rsrc1.OR.myrow.EQ.rsrc4).AND.
2385  $ dir.EQ.1)) THEN
2386  ipw4 = buffer
2387  IF( dir.EQ.2 ) THEN
2388  IF( wantq ) THEN
2389  qrows = numroc( n, nb, myrow, descq( rsrc_ ),
2390  $ nprow )
2391  ELSE
2392  qrows = 0
2393  END IF
2394  trows = numroc( i-1, nb, myrow, desct( rsrc_ ),
2395  $ nprow )
2396  ELSE
2397  qrows = 0
2398  trows = 0
2399  END IF
2400  IF( dir.EQ.1 ) THEN
2401  tcols = numroc( n - (i+dim1-1), nb, mycol,
2402  $ csrc4, npcol )
2403  IF( mycol.EQ.csrc4 ) tcols = tcols - dim4
2404  ELSE
2405  tcols = 0
2406  END IF
2407  ipw5 = ipw4 + nwin*nwin
2408  ipw6 = ipw5 + trows * nwin
2409  IF( wantq ) THEN
2410  ipw7 = ipw6 + nwin * tcols
2411  ipw8 = ipw7 + qrows * nwin
2412  ELSE
2413  ipw8 = ipw6 + nwin * tcols
2414  END IF
2415  END IF
2416 *
2417 * Let each process row and column involved in the updates
2418 * exchange data in T and Q with their neighbours.
2419 *
2420  IF( dir.EQ.2 ) THEN
2421  IF( mycol.EQ.csrc1 .OR. mycol.EQ.csrc4 ) THEN
2422  DO 410 indx = 1, nprow
2423  IF( mycol.EQ.csrc1 ) THEN
2424  CALL infog2l( 1+(indx-1)*nb, i, desct,
2425  $ nprow, npcol, myrow, mycol, iloc,
2426  $ jloc1, rsrc, csrc1 )
2427  IF( myrow.EQ.rsrc ) THEN
2428  CALL slamov( 'All', trows, dim1,
2429  $ t((jloc1-1)*lldt+iloc), lldt,
2430  $ work(ipw5), trows )
2431  IF( npcol.GT.1 ) THEN
2432  east = mod( mycol + 1, npcol )
2433  CALL sgesd2d( ictxt, trows, dim1,
2434  $ work(ipw5), trows, rsrc,
2435  $ east )
2436  CALL sgerv2d( ictxt, trows, dim4,
2437  $ work(ipw5+trows*dim1), trows,
2438  $ rsrc, east )
2439  END IF
2440  END IF
2441  END IF
2442  IF( mycol.EQ.csrc4 ) THEN
2443  CALL infog2l( 1+(indx-1)*nb, i+dim1,
2444  $ desct, nprow, npcol, myrow, mycol,
2445  $ iloc, jloc4, rsrc, csrc4 )
2446  IF( myrow.EQ.rsrc ) THEN
2447  CALL slamov( 'All', trows, dim4,
2448  $ t((jloc4-1)*lldt+iloc), lldt,
2449  $ work(ipw5+trows*dim1), trows )
2450  IF( npcol.GT.1 ) THEN
2451  west = mod( mycol-1+npcol, npcol )
2452  CALL sgesd2d( ictxt, trows, dim4,
2453  $ work(ipw5+trows*dim1), trows,
2454  $ rsrc, west )
2455  CALL sgerv2d( ictxt, trows, dim1,
2456  $ work(ipw5), trows, rsrc,
2457  $ west )
2458  END IF
2459  END IF
2460  END IF
2461  410 CONTINUE
2462  END IF
2463  END IF
2464 *
2465  IF( dir.EQ.1 ) THEN
2466  IF( myrow.EQ.rsrc1 .OR. myrow.EQ.rsrc4 ) THEN
2467  DO 420 indx = 1, npcol
2468  IF( myrow.EQ.rsrc1 ) THEN
2469  IF( indx.EQ.1 ) THEN
2470  CALL infog2l( i, lihic+1, desct, nprow,
2471  $ npcol, myrow, mycol, iloc1, jloc,
2472  $ rsrc1, csrc )
2473  ELSE
2474  CALL infog2l( i,
2475  $ (iceil(lihic,nb)+(indx-2))*nb+1,
2476  $ desct, nprow, npcol, myrow, mycol,
2477  $ iloc1, jloc, rsrc1, csrc )
2478  END IF
2479  IF( mycol.EQ.csrc ) THEN
2480  CALL slamov( 'All', dim1, tcols,
2481  $ t((jloc-1)*lldt+iloc1), lldt,
2482  $ work(ipw6), nwin )
2483  IF( nprow.GT.1 ) THEN
2484  south = mod( myrow + 1, nprow )
2485  CALL sgesd2d( ictxt, dim1, tcols,
2486  $ work(ipw6), nwin, south,
2487  $ csrc )
2488  CALL sgerv2d( ictxt, dim4, tcols,
2489  $ work(ipw6+dim1), nwin, south,
2490  $ csrc )
2491  END IF
2492  END IF
2493  END IF
2494  IF( myrow.EQ.rsrc4 ) THEN
2495  IF( indx.EQ.1 ) THEN
2496  CALL infog2l( i+dim1, lihic+1, desct,
2497  $ nprow, npcol, myrow, mycol, iloc4,
2498  $ jloc, rsrc4, csrc )
2499  ELSE
2500  CALL infog2l( i+dim1,
2501  $ (iceil(lihic,nb)+(indx-2))*nb+1,
2502  $ desct, nprow, npcol, myrow, mycol,
2503  $ iloc4, jloc, rsrc4, csrc )
2504  END IF
2505  IF( mycol.EQ.csrc ) THEN
2506  CALL slamov( 'All', dim4, tcols,
2507  $ t((jloc-1)*lldt+iloc4), lldt,
2508  $ work(ipw6+dim1), nwin )
2509  IF( nprow.GT.1 ) THEN
2510  north = mod( myrow-1+nprow, nprow )
2511  CALL sgesd2d( ictxt, dim4, tcols,
2512  $ work(ipw6+dim1), nwin, north,
2513  $ csrc )
2514  CALL sgerv2d( ictxt, dim1, tcols,
2515  $ work(ipw6), nwin, north,
2516  $ csrc )
2517  END IF
2518  END IF
2519  END IF
2520  420 CONTINUE
2521  END IF
2522  END IF
2523 *
2524  IF( dir.EQ.2 ) THEN
2525  IF( wantq ) THEN
2526  IF( mycol.EQ.csrc1 .OR. mycol.EQ.csrc4 ) THEN
2527  DO 430 indx = 1, nprow
2528  IF( mycol.EQ.csrc1 ) THEN
2529  CALL infog2l( 1+(indx-1)*nb, i, descq,
2530  $ nprow, npcol, myrow, mycol, iloc,
2531  $ jloc1, rsrc, csrc1 )
2532  IF( myrow.EQ.rsrc ) THEN
2533  CALL slamov( 'All', qrows, dim1,
2534  $ q((jloc1-1)*lldq+iloc), lldq,
2535  $ work(ipw7), qrows )
2536  IF( npcol.GT.1 ) THEN
2537  east = mod( mycol + 1, npcol )
2538  CALL sgesd2d( ictxt, qrows, dim1,
2539  $ work(ipw7), qrows, rsrc,
2540  $ east )
2541  CALL sgerv2d( ictxt, qrows, dim4,
2542  $ work(ipw7+qrows*dim1),
2543  $ qrows, rsrc, east )
2544  END IF
2545  END IF
2546  END IF
2547  IF( mycol.EQ.csrc4 ) THEN
2548  CALL infog2l( 1+(indx-1)*nb, i+dim1,
2549  $ descq, nprow, npcol, myrow, mycol,
2550  $ iloc, jloc4, rsrc, csrc4 )
2551  IF( myrow.EQ.rsrc ) THEN
2552  CALL slamov( 'All', qrows, dim4,
2553  $ q((jloc4-1)*lldq+iloc), lldq,
2554  $ work(ipw7+qrows*dim1), qrows )
2555  IF( npcol.GT.1 ) THEN
2556  west = mod( mycol-1+npcol,
2557  $ npcol )
2558  CALL sgesd2d( ictxt, qrows, dim4,
2559  $ work(ipw7+qrows*dim1),
2560  $ qrows, rsrc, west )
2561  CALL sgerv2d( ictxt, qrows, dim1,
2562  $ work(ipw7), qrows, rsrc,
2563  $ west )
2564  END IF
2565  END IF
2566  END IF
2567  430 CONTINUE
2568  END IF
2569  END IF
2570  END IF
2571 *
2572  327 CONTINUE
2573 *
2574  322 CONTINUE
2575 *
2576  DO 323 window = window0, wine, 2
2577  rsrc4 = iwork(irsrc+window-1)
2578  csrc4 = iwork(icsrc+window-1)
2579  rsrc1 = mod( rsrc4 - 1 + nprow, nprow )
2580  csrc1 = mod( csrc4 - 1 + npcol, npcol )
2581  flops = 0
2582  IF( ((mycol.EQ.csrc1.OR.mycol.EQ.csrc4).AND.dir.EQ.2)
2583  $ .OR. ((myrow.EQ.rsrc1.OR.myrow.EQ.rsrc4).AND.
2584  $ dir.EQ.1) ) THEN
2585 *
2586 * Skip this part of the updates if appropriate.
2587 *
2588  IF( window.EQ.1 .AND. skip1cr ) GO TO 328
2589 *
2590 * Count number of operations to decide whether to use
2591 * matrix-matrix multiplications for updating
2592 * off-diagonal parts or not.
2593 *
2594  nitraf = pitraf - ipiw
2595  ishh = .false.
2596  DO 405 k = 1, nitraf
2597  IF( iwork( ipiw + k - 1 ).LE.nwin ) THEN
2598  flops = flops + 6
2599  ELSE
2600  flops = flops + 11
2601  ishh = .true.
2602  END IF
2603  405 CONTINUE
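* FLOPS now estimates the cost per column of applying the
* NITRAF transformations one by one (6 for a rotation acting
* inside the window, 11 for a larger transformation, which
* also sets ISHH). Below, this cost relative to that of a
* multiplication with the full NWIN-by-NWIN matrix is
* compared, as a percentage, with the threshold MMULT.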
2604 *
2605 * Perform updates in parallel.
2606 *
2607  IF( flops.NE.0 .AND.
2608  $ ( 2*flops*100 )/( 2*nwin*nwin ) .GE. mmult )
2609  $ THEN
2610 *
2611  CALL slaset( 'All', nwin, nwin, zero, one,
2612  $ work( ipw4 ), nwin )
2613  work(ipw8) = float(myrow)
2614  work(ipw8+1) = float(mycol)
2615  CALL bslaapp( 1, nwin, nwin, ncb, work( ipw4 ),
2616  $ nwin, nitraf, iwork(ipiw), work( ipw3 ),
2617  $ work(ipw8) )
2618 *
2619 * Test if sparsity structure of orthogonal matrix
2620 * can be exploited (see below).
2621 *
2622  IF( ishh .OR. dim1.NE.ks .OR. dim4.NE.ks ) THEN
2623 *
2624 * Update the columns of T and Q affected by the
2625 * reordering.
2626 *
2627  IF( dir.EQ.2 ) THEN
2628  DO 440 indx = 1, min(i-1,1+(nprow-1)*nb),
2629  $ nb
2630  IF( mycol.EQ.csrc1 ) THEN
2631  CALL infog2l( indx, i, desct, nprow,
2632  $ npcol, myrow, mycol, iloc,
2633  $ jloc, rsrc, csrc1 )
2634  IF( myrow.EQ.rsrc ) THEN
2635  CALL sgemm( 'No transpose',
2636  $ 'No transpose', trows, dim1,
2637  $ nwin, one, work( ipw5 ),
2638  $ trows, work( ipw4 ), nwin,
2639  $ zero, work(ipw8), trows )
2640  CALL slamov( 'All', trows, dim1,
2641  $ work(ipw8), trows,
2642  $ t((jloc-1)*lldt+iloc),
2643  $ lldt )
2644  END IF
2645  END IF
2646  IF( mycol.EQ.csrc4 ) THEN
2647  CALL infog2l( indx, i+dim1, desct,
2648  $ nprow, npcol, myrow, mycol,
2649  $ iloc, jloc, rsrc, csrc4 )
2650  IF( myrow.EQ.rsrc ) THEN
2651  CALL sgemm( 'No transpose',
2652  $ 'No transpose', trows, dim4,
2653  $ nwin, one, work( ipw5 ),
2654  $ trows,
2655  $ work( ipw4+nwin*dim1 ),
2656  $ nwin, zero, work(ipw8),
2657  $ trows )
2658  CALL slamov( 'All', trows, dim4,
2659  $ work(ipw8), trows,
2660  $ t((jloc-1)*lldt+iloc),
2661  $ lldt )
2662  END IF
2663  END IF
2664  440 CONTINUE
2665 *
2666  IF( wantq ) THEN
2667  DO 450 indx = 1, min(n,1+(nprow-1)*nb),
2668  $ nb
2669  IF( mycol.EQ.csrc1 ) THEN
2670  CALL infog2l( indx, i, descq,
2671  $ nprow, npcol, myrow, mycol,
2672  $ iloc, jloc, rsrc, csrc1 )
2673  IF( myrow.EQ.rsrc ) THEN
2674  CALL sgemm( 'No transpose',
2675  $ 'No transpose', qrows,
2676  $ dim1, nwin, one,
2677  $ work( ipw7 ), qrows,
2678  $ work( ipw4 ), nwin,
2679  $ zero, work(ipw8),
2680  $ qrows )
2681  CALL slamov( 'All', qrows,
2682  $ dim1, work(ipw8), qrows,
2683  $ q((jloc-1)*lldq+iloc),
2684  $ lldq )
2685  END IF
2686  END IF
2687  IF( mycol.EQ.csrc4 ) THEN
2688  CALL infog2l( indx, i+dim1,
2689  $ descq, nprow, npcol, myrow,
2690  $ mycol, iloc, jloc, rsrc,
2691  $ csrc4 )
2692  IF( myrow.EQ.rsrc ) THEN
2693  CALL sgemm( 'No transpose',
2694  $ 'No transpose', qrows,
2695  $ dim4, nwin, one,
2696  $ work( ipw7 ), qrows,
2697  $ work( ipw4+nwin*dim1 ),
2698  $ nwin, zero, work(ipw8),
2699  $ qrows )
2700  CALL slamov( 'All', qrows,
2701  $ dim4, work(ipw8), qrows,
2702  $ q((jloc-1)*lldq+iloc),
2703  $ lldq )
2704  END IF
2705  END IF
2706  450 CONTINUE
2707  END IF
2708  END IF
2709 *
2710 * Update the rows of T affected by the
2711 * reordering.
2712 *
2713  IF( dir.EQ.1 ) THEN
2714  IF ( lihic.LT.n ) THEN
2715  IF( myrow.EQ.rsrc1.AND.mycol.EQ.csrc4
2716  $ .AND.mod(lihic,nb).NE.0 ) THEN
2717  indx = lihic + 1
2718  CALL infog2l( i, indx, desct, nprow,
2719  $ npcol, myrow, mycol, iloc,
2720  $ jloc, rsrc1, csrc4 )
2721  CALL sgemm( 'Transpose',
2722  $ 'No Transpose', dim1, tcols,
2723  $ nwin, one, work(ipw4), nwin,
2724  $ work( ipw6 ), nwin, zero,
2725  $ work(ipw8), dim1 )
2726  CALL slamov( 'All', dim1, tcols,
2727  $ work(ipw8), dim1,
2728  $ t((jloc-1)*lldt+iloc), lldt )
2729  END IF
2730  IF( myrow.EQ.rsrc4.AND.mycol.EQ.csrc4
2731  $ .AND.mod(lihic,nb).NE.0 ) THEN
2732  indx = lihic + 1
2733  CALL infog2l( i+dim1, indx, desct,
2734  $ nprow, npcol, myrow, mycol,
2735  $ iloc, jloc, rsrc4, csrc4 )
2736  CALL sgemm( 'Transpose',
2737  $ 'No Transpose', dim4, tcols,
2738  $ nwin, one,
2739  $ work( ipw4+dim1*nwin ), nwin,
2740  $ work( ipw6), nwin, zero,
2741  $ work(ipw8), dim4 )
2742  CALL slamov( 'All', dim4, tcols,
2743  $ work(ipw8), dim4,
2744  $ t((jloc-1)*lldt+iloc), lldt )
2745  END IF
2746  indxs = iceil(lihic,nb)*nb + 1
2747  indxe = min(n,indxs+(npcol-2)*nb)
2748  DO 460 indx = indxs, indxe, nb
2749  IF( myrow.EQ.rsrc1 ) THEN
2750  CALL infog2l( i, indx, desct,
2751  $ nprow, npcol, myrow, mycol,
2752  $ iloc, jloc, rsrc1, csrc )
2753  IF( mycol.EQ.csrc ) THEN
2754  CALL sgemm( 'Transpose',
2755  $ 'No Transpose', dim1,
2756  $ tcols, nwin, one,
2757  $ work( ipw4 ), nwin,
2758  $ work( ipw6 ), nwin,
2759  $ zero, work(ipw8), dim1 )
2760  CALL slamov( 'All', dim1,
2761  $ tcols, work(ipw8), dim1,
2762  $ t((jloc-1)*lldt+iloc),
2763  $ lldt )
2764  END IF
2765  END IF
2766  IF( myrow.EQ.rsrc4 ) THEN
2767  CALL infog2l( i+dim1, indx,
2768  $ desct, nprow, npcol, myrow,
2769  $ mycol, iloc, jloc, rsrc4,
2770  $ csrc )
2771  IF( mycol.EQ.csrc ) THEN
2772  CALL sgemm( 'Transpose',
2773  $ 'No Transpose', dim4,
2774  $ tcols, nwin, one,
2775  $ work( ipw4+nwin*dim1 ),
2776  $ nwin, work( ipw6 ),
2777  $ nwin, zero, work(ipw8),
2778  $ dim4 )
2779  CALL slamov( 'All', dim4,
2780  $ tcols, work(ipw8), dim4,
2781  $ t((jloc-1)*lldt+iloc),
2782  $ lldt )
2783  END IF
2784  END IF
2785  460 CONTINUE
2786  END IF
2787  END IF
2788  ELSE
2789 *
2790 * The NWIN-by-NWIN matrix U containing the
2791 * accumulated orthogonal transformations has
2792 * the following structure:
2793 *
2794 * [ U11 U12 ]
2795 * U = [ ],
2796 * [ U21 U22 ]
2797 *
2798 * where U21 is KS-by-KS upper triangular and
2799 * U12 is (NWIN-KS)-by-(NWIN-KS) lower
2800 * triangular. For reordering over the border
2801 * the structure is only exploited when the
2802 * border cuts the columns of U conformally with
2803 * the structure itself. This happens exactly
2804 * when all eigenvalues in the subcluster were
2805 * moved to the other side of the border and
2806 * fit perfectly in their new positions, i.e.,
2807 * the reordering stops when the last eigenvalue
2808 * to cross the border is reordered to the
2809 * position closest to the border. Tested by
2810 * checking if KS = DIM1 = DIM4 (see above).
2811 * This should hold quite often. But this branch
2812 * is entered only if all involved eigenvalues
2813 * are real.
2814 *
2815 * Update the columns of T and Q affected by the
2816 * reordering.
2817 *
2818 * Compute T2*U21 + T1*U11 on the left side of
2819 * the border.
2820 *
2821  IF( dir.EQ.2 ) THEN
2822  indxe = min(i-1,1+(nprow-1)*nb)
2823  DO 470 indx = 1, indxe, nb
2824  IF( mycol.EQ.csrc1 ) THEN
2825  CALL infog2l( indx, i, desct, nprow,
2826  $ npcol, myrow, mycol, iloc,
2827  $ jloc, rsrc, csrc1 )
2828  IF( myrow.EQ.rsrc ) THEN
2829  CALL slamov( 'All', trows, ks,
2830  $ work( ipw5+trows*dim4),
2831  $ trows, work(ipw8), trows )
2832  CALL strmm( 'Right', 'Upper',
2833  $ 'No transpose',
2834  $ 'Non-unit', trows, ks,
2835  $ one, work( ipw4+dim4 ),
2836  $ nwin, work(ipw8), trows )
2837  CALL sgemm( 'No transpose',
2838  $ 'No transpose', trows, ks,
2839  $ dim4, one, work( ipw5 ),
2840  $ trows, work( ipw4 ), nwin,
2841  $ one, work(ipw8), trows )
2842  CALL slamov( 'All', trows, ks,
2843  $ work(ipw8), trows,
2844  $ t((jloc-1)*lldt+iloc),
2845  $ lldt )
2846  END IF
2847  END IF
2848 *
2849 * Compute T1*U12 + T2*U22 on the right
2850 * side of the border.
2851 *
2852  IF( mycol.EQ.csrc4 ) THEN
2853  CALL infog2l( indx, i+dim1, desct,
2854  $ nprow, npcol, myrow, mycol,
2855  $ iloc, jloc, rsrc, csrc4 )
2856  IF( myrow.EQ.rsrc ) THEN
2857  CALL slamov( 'All', trows, dim4,
2858  $ work(ipw5), trows,
2859  $ work( ipw8 ), trows )
2860  CALL strmm( 'Right', 'Lower',
2861  $ 'No transpose',
2862  $ 'Non-unit', trows, dim4,
2863  $ one, work( ipw4+nwin*ks ),
2864  $ nwin, work( ipw8 ), trows )
2865  CALL sgemm( 'No transpose',
2866  $ 'No transpose', trows, dim4,
2867  $ ks, one,
2868  $ work( ipw5+trows*dim4),
2869  $ trows,
2870  $ work( ipw4+nwin*ks+dim4 ),
2871  $ nwin, one, work( ipw8 ),
2872  $ trows )
2873  CALL slamov( 'All', trows, dim4,
2874  $ work(ipw8), trows,
2875  $ t((jloc-1)*lldt+iloc),
2876  $ lldt )
2877  END IF
2878  END IF
2879  470 CONTINUE
2880  IF( wantq ) THEN
2881 *
2882 * Compute Q2*U21 + Q1*U11 on the left
2883 * side of border.
2884 *
2885  indxe = min(n,1+(nprow-1)*nb)
2886  DO 480 indx = 1, indxe, nb
2887  IF( mycol.EQ.csrc1 ) THEN
2888  CALL infog2l( indx, i, descq,
2889  $ nprow, npcol, myrow, mycol,
2890  $ iloc, jloc, rsrc, csrc1 )
2891  IF( myrow.EQ.rsrc ) THEN
2892  CALL slamov( 'All', qrows, ks,
2893  $ work( ipw7+qrows*dim4),
2894  $ qrows, work(ipw8),
2895  $ qrows )
2896  CALL strmm( 'Right', 'Upper',
2897  $ 'No transpose',
2898  $ 'Non-unit', qrows,
2899  $ ks, one,
2900  $ work( ipw4+dim4 ), nwin,
2901  $ work(ipw8), qrows )
2902  CALL sgemm( 'No transpose',
2903  $ 'No transpose', qrows,
2904  $ ks, dim4, one,
2905  $ work( ipw7 ), qrows,
2906  $ work( ipw4 ), nwin, one,
2907  $ work(ipw8), qrows )
2908  CALL slamov( 'All', qrows, ks,
2909  $ work(ipw8), qrows,
2910  $ q((jloc-1)*lldq+iloc),
2911  $ lldq )
2912  END IF
2913  END IF
2914 *
2915 * Compute Q1*U12 + Q2*U22 on the right
2916 * side of border.
2917 *
2918  IF( mycol.EQ.csrc4 ) THEN
2919  CALL infog2l( indx, i+dim1,
2920  $ descq, nprow, npcol, myrow,
2921  $ mycol, iloc, jloc, rsrc,
2922  $ csrc4 )
2923  IF( myrow.EQ.rsrc ) THEN
2924  CALL slamov( 'All', qrows,
2925  $ dim4, work(ipw7), qrows,
2926  $ work( ipw8 ), qrows )
2927  CALL strmm( 'Right', 'Lower',
2928  $ 'No transpose',
2929  $ 'Non-unit', qrows,
2930  $ dim4, one,
2931  $ work( ipw4+nwin*ks ),
2932  $ nwin, work( ipw8 ),
2933  $ qrows )
2934  CALL sgemm( 'No transpose',
2935  $ 'No transpose', qrows,
2936  $ dim4, ks, one,
2937  $ work(ipw7+qrows*(dim4)),
2938  $ qrows,
2939  $ work(ipw4+nwin*ks+dim4),
2940  $ nwin, one, work( ipw8 ),
2941  $ qrows )
2942  CALL slamov( 'All', qrows,
2943  $ dim4, work(ipw8), qrows,
2944  $ q((jloc-1)*lldq+iloc),
2945  $ lldq )
2946  END IF
2947  END IF
2948  480 CONTINUE
2949  END IF
2950  END IF
2951 *
2952  IF( dir.EQ.1 ) THEN
2953  IF ( lihic.LT.n ) THEN
2954 *
2955 * Compute U21**T*T2 + U11**T*T1 on the
2956 * upper side of the border.
2957 *
2958  IF( myrow.EQ.rsrc1.AND.mycol.EQ.csrc4
2959  $ .AND.mod(lihic,nb).NE.0 ) THEN
2960  indx = lihic + 1
2961  CALL infog2l( i, indx, desct, nprow,
2962  $ npcol, myrow, mycol, iloc,
2963  $ jloc, rsrc1, csrc4 )
2964  CALL slamov( 'All', ks, tcols,
2965  $ work( ipw6+dim4 ), nwin,
2966  $ work(ipw8), ks )
2967  CALL strmm( 'Left', 'Upper',
2968  $ 'Transpose', 'Non-unit',
2969  $ ks, tcols, one,
2970  $ work( ipw4+dim4 ), nwin,
2971  $ work(ipw8), ks )
2972  CALL sgemm( 'Transpose',
2973  $ 'No transpose', ks, tcols,
2974  $ dim4, one, work(ipw4), nwin,
2975  $ work(ipw6), nwin, one,
2976  $ work(ipw8), ks )
2977  CALL slamov( 'All', ks, tcols,
2978  $ work(ipw8), ks,
2979  $ t((jloc-1)*lldt+iloc), lldt )
2980  END IF
2981 *
2982 * Compute U12**T*T1 + U22**T*T2 on the
2983 * lower side of the border.
2984 *
2985  IF( myrow.EQ.rsrc4.AND.mycol.EQ.csrc4
2986  $ .AND.mod(lihic,nb).NE.0 ) THEN
2987  indx = lihic + 1
2988  CALL infog2l( i+dim1, indx, desct,
2989  $ nprow, npcol, myrow, mycol,
2990  $ iloc, jloc, rsrc4, csrc4 )
2991  CALL slamov( 'All', dim4, tcols,
2992  $ work( ipw6 ), nwin,
2993  $ work( ipw8 ), dim4 )
2994  CALL strmm( 'Left', 'Lower',
2995  $ 'Transpose', 'Non-unit',
2996  $ dim4, tcols, one,
2997  $ work( ipw4+nwin*ks ), nwin,
2998  $ work( ipw8 ), dim4 )
2999  CALL sgemm( 'Transpose',
3000  $ 'No Transpose', dim4, tcols,
3001  $ ks, one,
3002  $ work( ipw4+nwin*ks+dim4 ),
3003  $ nwin, work( ipw6+dim1 ), nwin,
3004  $ one, work( ipw8), dim4 )
3005  CALL slamov( 'All', dim4, tcols,
3006  $ work(ipw8), dim4,
3007  $ t((jloc-1)*lldt+iloc), lldt )
3008  END IF
3009 *
3010 * Compute U21**T*T2 + U11**T*T1 on upper
3011 * side on border.
3012 *
3013  indxs = iceil(lihic,nb)*nb+1
3014  indxe = min(n,indxs+(npcol-2)*nb)
3015  DO 490 indx = indxs, indxe, nb
3016  IF( myrow.EQ.rsrc1 ) THEN
3017  CALL infog2l( i, indx, desct,
3018  $ nprow, npcol, myrow, mycol,
3019  $ iloc, jloc, rsrc1, csrc )
3020  IF( mycol.EQ.csrc ) THEN
3021  CALL slamov( 'All', ks, tcols,
3022  $ work( ipw6+dim4 ), nwin,
3023  $ work(ipw8), ks )
3024  CALL strmm( 'Left', 'Upper',
3025  $ 'Transpose',
3026  $ 'Non-unit', ks,
3027  $ tcols, one,
3028  $ work( ipw4+dim4 ), nwin,
3029  $ work(ipw8), ks )
3030  CALL sgemm( 'Transpose',
3031  $ 'No transpose', ks,
3032  $ tcols, dim4, one,
3033  $ work(ipw4), nwin,
3034  $ work(ipw6), nwin, one,
3035  $ work(ipw8), ks )
3036  CALL slamov( 'All', ks, tcols,
3037  $ work(ipw8), ks,
3038  $ t((jloc-1)*lldt+iloc),
3039  $ lldt )
3040  END IF
3041  END IF
3042 *
3043 * Compute U12**T*T1 + U22**T*T2 on
3044 * lower side of border.
3045 *
3046  IF( myrow.EQ.rsrc4 ) THEN
3047  CALL infog2l( i+dim1, indx,
3048  $ desct, nprow, npcol, myrow,
3049  $ mycol, iloc, jloc, rsrc4,
3050  $ csrc )
3051  IF( mycol.EQ.csrc ) THEN
3052  CALL slamov( 'All', dim4,
3053  $ tcols, work( ipw6 ),
3054  $ nwin, work( ipw8 ),
3055  $ dim4 )
3056  CALL strmm( 'Left', 'Lower',
3057  $ 'Transpose',
3058  $ 'Non-unit', dim4,
3059  $ tcols, one,
3060  $ work( ipw4+nwin*ks ),
3061  $ nwin, work( ipw8 ),
3062  $ dim4 )
3063  CALL sgemm( 'Transpose',
3064  $ 'No Transpose', dim4,
3065  $ tcols, ks, one,
3066  $ work(ipw4+nwin*ks+dim4),
3067  $ nwin, work( ipw6+dim1 ),
3068  $ nwin, one, work( ipw8),
3069  $ dim4 )
3070  CALL slamov( 'All', dim4,
3071  $ tcols, work(ipw8), dim4,
3072  $ t((jloc-1)*lldt+iloc),
3073  $ lldt )
3074  END IF
3075  END IF
3076  490 CONTINUE
3077  END IF
3078  END IF
3079  END IF
3080  ELSEIF( flops.NE.0 ) THEN
3081 *
3082 * Update off-diagonal blocks and Q using the
3083 * pipelined elementary transformations. Now we
3084 * have a delicate problem: how to do this without
3085 * redundant work? For now, we let the processes
3086 * involved compute the whole crossborder block
3087 * rows and columns, saving only the part belonging
3088 * to the corresponding side of the border. To make
3089 * this a realistic alternative, we have modified
3090 * the ratio r_flops (see Reference [2] above) to
3091 * give more favor to the ordinary matrix
3092 * multiplication.
3093 *
3094  IF( dir.EQ.2 ) THEN
3095  indxe = min(i-1,1+(nprow-1)*nb)
3096  DO 500 indx = 1, indxe, nb
3097  CALL infog2l( indx, i, desct, nprow,
3098  $ npcol, myrow, mycol, iloc, jloc,
3099  $ rsrc, csrc )
3100  IF( myrow.EQ.rsrc .AND. mycol.EQ.csrc )
3101  $ THEN
3102  CALL bslaapp( 1, trows, nwin, ncb,
3103  $ work(ipw5), trows, nitraf,
3104  $ iwork(ipiw), work( ipw3 ),
3105  $ work(ipw8) )
3106  CALL slamov( 'All', trows, dim1,
3107  $ work(ipw5), trows,
3108  $ t((jloc-1)*lldt+iloc ), lldt )
3109  END IF
3110  CALL infog2l( indx, i+dim1, desct, nprow,
3111  $ npcol, myrow, mycol, iloc, jloc,
3112  $ rsrc, csrc )
3113  IF( myrow.EQ.rsrc .AND. mycol.EQ.csrc )
3114  $ THEN
3115  IF( npcol.GT.1 )
3116  $ CALL bslaapp( 1, trows, nwin, ncb,
3117  $ work(ipw5), trows, nitraf,
3118  $ iwork(ipiw), work( ipw3 ),
3119  $ work(ipw8) )
3120  CALL slamov( 'All', trows, dim4,
3121  $ work(ipw5+trows*dim1), trows,
3122  $ t((jloc-1)*lldt+iloc ), lldt )
3123  END IF
3124  500 CONTINUE
3125  IF( wantq ) THEN
3126  indxe = min(n,1+(nprow-1)*nb)
3127  DO 510 indx = 1, indxe, nb
3128  CALL infog2l( indx, i, descq, nprow,
3129  $ npcol, myrow, mycol, iloc, jloc,
3130  $ rsrc, csrc )
3131  IF( myrow.EQ.rsrc .AND. mycol.EQ.csrc )
3132  $ THEN
3133  CALL bslaapp( 1, qrows, nwin, ncb,
3134  $ work(ipw7), qrows, nitraf,
3135  $ iwork(ipiw), work( ipw3 ),
3136  $ work(ipw8) )
3137  CALL slamov( 'All', qrows, dim1,
3138  $ work(ipw7), qrows,
3139  $ q((jloc-1)*lldq+iloc ), lldq )
3140  END IF
3141  CALL infog2l( indx, i+dim1, descq,
3142  $ nprow, npcol, myrow, mycol, iloc,
3143  $ jloc, rsrc, csrc )
3144  IF( myrow.EQ.rsrc .AND. mycol.EQ.csrc )
3145  $ THEN
3146  IF( npcol.GT.1 )
3147  $ CALL bslaapp( 1, qrows, nwin,
3148  $ ncb, work(ipw7), qrows,
3149  $ nitraf, iwork(ipiw),
3150  $ work( ipw3 ), work(ipw8) )
3151  CALL slamov( 'All', qrows, dim4,
3152  $ work(ipw7+qrows*dim1), qrows,
3153  $ q((jloc-1)*lldq+iloc ), lldq )
3154  END IF
3155  510 CONTINUE
3156  END IF
3157  END IF
3158 *
3159  IF( dir.EQ.1 ) THEN
3160  IF( lihic.LT.n ) THEN
3161  indx = lihic + 1
3162  CALL infog2l( i, indx, desct, nprow,
3163  $ npcol, myrow, mycol, iloc, jloc,
3164  $ rsrc, csrc )
3165  IF( myrow.EQ.rsrc .AND. mycol.EQ.csrc.AND.
3166  $ mod(lihic,nb).NE.0 ) THEN
3167  CALL bslaapp( 0, nwin, tcols, ncb,
3168  $ work( ipw6 ), nwin, nitraf,
3169  $ iwork(ipiw), work( ipw3 ),
3170  $ work(ipw8) )
3171  CALL slamov( 'All', dim1, tcols,
3172  $ work( ipw6 ), nwin,
3173  $ t((jloc-1)*lldt+iloc), lldt )
3174  END IF
3175  CALL infog2l( i+dim1, indx, desct, nprow,
3176  $ npcol, myrow, mycol, iloc, jloc,
3177  $ rsrc, csrc )
3178  IF( myrow.EQ.rsrc .AND. mycol.EQ.csrc.AND.
3179  $ mod(lihic,nb).NE.0 ) THEN
3180  IF( nprow.GT.1 )
3181  $ CALL bslaapp( 0, nwin, tcols, ncb,
3182  $ work( ipw6 ), nwin, nitraf,
3183  $ iwork(ipiw), work( ipw3 ),
3184  $ work(ipw8) )
3185  CALL slamov( 'All', dim4, tcols,
3186  $ work( ipw6+dim1 ), nwin,
3187  $ t((jloc-1)*lldt+iloc), lldt )
3188  END IF
3189  indxs = iceil(lihic,nb)*nb + 1
3190  indxe = min(n,indxs+(npcol-2)*nb)
3191  DO 520 indx = indxs, indxe, nb
3192  CALL infog2l( i, indx, desct, nprow,
3193  $ npcol, myrow, mycol, iloc, jloc,
3194  $ rsrc, csrc )
3195  IF( myrow.EQ.rsrc .AND. mycol.EQ.csrc )
3196  $ THEN
3197  CALL bslaapp( 0, nwin, tcols, ncb,
3198  $ work(ipw6), nwin, nitraf,
3199  $ iwork(ipiw), work( ipw3 ),
3200  $ work(ipw8) )
3201  CALL slamov( 'All', dim1, tcols,
3202  $ work( ipw6 ), nwin,
3203  $ t((jloc-1)*lldt+iloc), lldt )
3204  END IF
3205  CALL infog2l( i+dim1, indx, desct,
3206  $ nprow, npcol, myrow, mycol, iloc,
3207  $ jloc, rsrc, csrc )
3208  IF( myrow.EQ.rsrc .AND. mycol.EQ.csrc )
3209  $ THEN
3210  IF( nprow.GT.1 )
3211  $ CALL bslaapp( 0, nwin, tcols,
3212  $ ncb, work(ipw6), nwin, nitraf,
3213  $ iwork(ipiw), work( ipw3 ),
3214  $ work(ipw8) )
3215  CALL slamov( 'All', dim4, tcols,
3216  $ work( ipw6+dim1 ), nwin,
3217  $ t((jloc-1)*lldt+iloc), lldt )
3218  END IF
3219  520 CONTINUE
3220  END IF
3221  END IF
3222  END IF
3223  END IF
3224 *
3225  328 CONTINUE
3226 *
3227  323 CONTINUE
3228 *
3229 * End of loops over directions (DIR).
3230 *
3231  2222 CONTINUE
3232 *
3233 * End of loops over diagonal blocks for reordering over the
3234 * block diagonal.
3235 *
3236  310 CONTINUE
3237  last = last + 1
3238  IF( lastwait .AND. last.LT.2 ) GO TO 308
3239 *
3240 * Barrier to collect the processes before proceeding.
3241 *
3242  CALL blacs_barrier( ictxt, 'All' )
3243 *
3244 * Compute global maximum of IERR so that we know if some process
3245 * experienced a failure in the reordering.
3246 *
3247  myierr = ierr
3248  IF( nprocs.GT.1 )
3249  $ CALL igamx2d( ictxt, 'All', top, 1, 1, ierr, 1, -1,
3250  $ -1, -1, -1, -1 )
3251 *
3252  IF( ierr.NE.0 ) THEN
3253 *
3254 * When calling BSTREXC, the block at position I+KKS-1 failed
3255 * to swap.
3256 *
3257  IF( myierr.NE.0 ) info = max(1,i+kks-1)
3258  IF( nprocs.GT.1 )
3259  $ CALL igamx2d( ictxt, 'All', top, 1, 1, info, 1, -1,
3260  $ -1, -1, -1, -1 )
3261  GO TO 300
3262  END IF
3263 *
3264 * Do a global update of the SELECT vector.
3265 *
3266  DO 530 k = 1, n
3267  rsrc = indxg2p( k, nb, myrow, desct( rsrc_ ), nprow )
3268  csrc = indxg2p( k, nb, mycol, desct( csrc_ ), npcol )
3269  IF( myrow.NE.rsrc .OR. mycol.NE.csrc )
3270  $ SELECT( k ) = 0
3271  530 CONTINUE
3272  IF( nprocs.GT.1 )
3273  $ CALL igsum2d( ictxt, 'All', top, n, 1, SELECT, n, -1, -1 )
3274 *
3275 * Find the global minimum of ILO and IHI.
3276 *
3277  ilo = ilo - 1
3278  523 CONTINUE
3279  ilo = ilo + 1
3280  IF( ilo.LE.n ) THEN
3281  IF( SELECT(ilo).NE.0 ) GO TO 523
3282  END IF
3283  ihi = ihi + 1
3284  527 CONTINUE
3285  ihi = ihi - 1
3286  IF( ihi.GE.1 ) THEN
3287  IF( SELECT(ihi).EQ.0 ) GO TO 527
3288  END IF
3289 *
3290 * End While ( ILO <= M )
3291  GO TO 50
3292  END IF
3293 *
3294  300 CONTINUE
3295 *
3296 * In case an error occurred, do an additional global update of
3297 * SELECT.
3298 *
3299  IF( info.NE.0 ) THEN
3300  DO 540 k = 1, n
3301  rsrc = indxg2p( k, nb, myrow, desct( rsrc_ ), nprow )
3302  csrc = indxg2p( k, nb, mycol, desct( csrc_ ), npcol )
3303  IF( myrow.NE.rsrc .OR. mycol.NE.csrc )
3304  $ SELECT( k ) = 0
3305  540 CONTINUE
3306  IF( nprocs.GT.1 )
3307  $ CALL igsum2d( ictxt, 'All', top, n, 1, SELECT, n, -1, -1 )
3308  END IF
3309 *
3310  545 CONTINUE
3311 *
3312 * Store the output eigenvalues in WR and WI: first let all the
3313 * processes compute the eigenvalues inside their diagonal blocks in
3314 * parallel, except for the eigenvalues located next to a block
3315 * border. After that, compute all eigenvalues located next to the
3316 * block borders. Finally, do a global summation over WR and WI so
3317 * that all processors receive the result. Notice: real eigenvalues
3318 * extracted from a non-canonical 2-by-2 block are not stored in
3319 * any particular order.
3320 *
3321  DO 550 k = 1, n
3322  wr( k ) = zero
3323  wi( k ) = zero
3324  550 CONTINUE
3325 *
3326 * Loop 560: extract eigenvalues from the blocks which are not laid
3327 * out across a border of the processor mesh, except for those 1x1
3328 * blocks on the border.
3329 *
3330  pair = .false.
3331  DO 560 k = 1, n
3332  IF( .NOT. pair ) THEN
3333  border = ( k.NE.n .AND. mod( k, nb ).EQ.0 ) .OR.
3334  $ ( k.NE.1 .AND. mod( k, nb ).EQ.1 )
3335  IF( .NOT. border ) THEN
3336  CALL infog2l( k, k, desct, nprow, npcol, myrow, mycol,
3337  $ iloc1, jloc1, trsrc1, tcsrc1 )
3338  IF( myrow.EQ.trsrc1 .AND. mycol.EQ.tcsrc1 ) THEN
3339  elem1 = t((jloc1-1)*lldt+iloc1)
3340  IF( k.LT.n ) THEN
3341  elem3 = t((jloc1-1)*lldt+iloc1+1)
3342  ELSE
3343  elem3 = zero
3344  END IF
3345  IF( elem3.NE.zero ) THEN
3346  elem2 = t((jloc1)*lldt+iloc1)
3347  elem4 = t((jloc1)*lldt+iloc1+1)
3348  CALL slanv2( elem1, elem2, elem3, elem4,
3349  $ wr( k ), wi( k ), wr( k+1 ), wi( k+1 ), sn,
3350  $ cs )
3351  pair = .true.
3352  ELSE
3353  IF( k.GT.1 ) THEN
3354  tmp = t((jloc1-2)*lldt+iloc1)
3355  IF( tmp.NE.zero ) THEN
3356  elem1 = t((jloc1-2)*lldt+iloc1-1)
3357  elem2 = t((jloc1-1)*lldt+iloc1-1)
3358  elem3 = t((jloc1-2)*lldt+iloc1)
3359  elem4 = t((jloc1-1)*lldt+iloc1)
3360  CALL slanv2( elem1, elem2, elem3, elem4,
3361  $ wr( k-1 ), wi( k-1 ), wr( k ),
3362  $ wi( k ), sn, cs )
3363  ELSE
3364  wr( k ) = elem1
3365  END IF
3366  ELSE
3367  wr( k ) = elem1
3368  END IF
3369  END IF
3370  END IF
3371  END IF
3372  ELSE
3373  pair = .false.
3374  END IF
3375  560 CONTINUE
3376 *
3377 * Loop 570: extract eigenvalues from the blocks which are laid
3378 * out across a border of the processor mesh. The processors are
3379 * numbered as below:
3380 *
3381 * 1 | 2
3382 * --+--
3383 * 3 | 4
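* Processes 2, 3 and 4 send their elements of the crossborder
* 2-by-2 block to process 1, which computes the eigenvalues
* (using SLANV2 for a true 2-by-2 block) and stores them in WR
* and WI.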
3384 *
3385  DO 570 k = nb, n-1, nb
3386  CALL infog2l( k, k, desct, nprow, npcol, myrow, mycol,
3387  $ iloc1, jloc1, trsrc1, tcsrc1 )
3388  CALL infog2l( k, k+1, desct, nprow, npcol, myrow, mycol,
3389  $ iloc2, jloc2, trsrc2, tcsrc2 )
3390  CALL infog2l( k+1, k, desct, nprow, npcol, myrow, mycol,
3391  $ iloc3, jloc3, trsrc3, tcsrc3 )
3392  CALL infog2l( k+1, k+1, desct, nprow, npcol, myrow, mycol,
3393  $ iloc4, jloc4, trsrc4, tcsrc4 )
3394  IF( myrow.EQ.trsrc2 .AND. mycol.EQ.tcsrc2 ) THEN
3395  elem2 = t((jloc2-1)*lldt+iloc2)
3396  IF( trsrc1.NE.trsrc2 .OR. tcsrc1.NE.tcsrc2 )
3397  $ CALL sgesd2d( ictxt, 1, 1, elem2, 1, trsrc1, tcsrc1 )
3398  END IF
3399  IF( myrow.EQ.trsrc3 .AND. mycol.EQ.tcsrc3 ) THEN
3400  elem3 = t((jloc3-1)*lldt+iloc3)
3401  IF( trsrc1.NE.trsrc3 .OR. tcsrc1.NE.tcsrc3 )
3402  $ CALL sgesd2d( ictxt, 1, 1, elem3, 1, trsrc1, tcsrc1 )
3403  END IF
3404  IF( myrow.EQ.trsrc4 .AND. mycol.EQ.tcsrc4 ) THEN
3405  work(1) = t((jloc4-1)*lldt+iloc4)
3406  IF( k+1.LT.n ) THEN
3407  work(2) = t((jloc4-1)*lldt+iloc4+1)
3408  ELSE
3409  work(2) = zero
3410  END IF
3411  IF( trsrc1.NE.trsrc4 .OR. tcsrc1.NE.tcsrc4 )
3412  $ CALL sgesd2d( ictxt, 2, 1, work, 2, trsrc1, tcsrc1 )
3413  END IF
3414  IF( myrow.EQ.trsrc1 .AND. mycol.EQ.tcsrc1 ) THEN
3415  elem1 = t((jloc1-1)*lldt+iloc1)
3416  IF( trsrc1.NE.trsrc2 .OR. tcsrc1.NE.tcsrc2 )
3417  $ CALL sgerv2d( ictxt, 1, 1, elem2, 1, trsrc2, tcsrc2 )
3418  IF( trsrc1.NE.trsrc3 .OR. tcsrc1.NE.tcsrc3 )
3419  $ CALL sgerv2d( ictxt, 1, 1, elem3, 1, trsrc3, tcsrc3 )
3420  IF( trsrc1.NE.trsrc4 .OR. tcsrc1.NE.tcsrc4 )
3421  $ CALL sgerv2d( ictxt, 2, 1, work, 2, trsrc4, tcsrc4 )
3422  elem4 = work(1)
3423  elem5 = work(2)
3424  IF( elem5.EQ.zero ) THEN
3425  IF( wr( k ).EQ.zero .AND. wi( k ).EQ.zero ) THEN
3426  CALL slanv2( elem1, elem2, elem3, elem4, wr( k ),
3427  $ wi( k ), wr( k+1 ), wi( k+1 ), sn, cs )
3428  ELSEIF( wr( k+1 ).EQ.zero .AND. wi( k+1 ).EQ.zero ) THEN
3429  wr( k+1 ) = elem4
3430  END IF
3431  ELSEIF( wr( k ).EQ.zero .AND. wi( k ).EQ.zero ) THEN
3432  wr( k ) = elem1
3433  END IF
3434  END IF
3435  570 CONTINUE
3436 *
3437  IF( nprocs.GT.1 ) THEN
3438  CALL sgsum2d( ictxt, 'All', top, n, 1, wr, n, -1, -1 )
3439  CALL sgsum2d( ictxt, 'All', top, n, 1, wi, n, -1, -1 )
3440  END IF
3441 *
3442 * Store storage requirements in workspaces.
3443 *
3444  work( 1 ) = float(lwmin)
3445  iwork( 1 ) = liwmin
3446 *
3447 * Return to calling program.
3448 *
3449  RETURN
3450 *
3451 * End of PSTRORD
3452 *
3453  END
3454 *