|
ScaLAPACK
2.0.2
ScaLAPACK: Scalable Linear Algebra PACKage
|
00001 #include "redist.h" 00158 #define static2 static 00159 #if defined(Add_) || defined(f77IsF2C) 00160 #define fortran_mr2d pctrmr2do_ 00161 #define fortran_mr2dnew pctrmr2d_ 00162 #elif defined(UpCase) 00163 #define fortran_mr2dnew PCTRMR2D 00164 #define fortran_mr2d PCTRMR2DO 00165 #define ccopy_ CCOPY 00166 #define clacpy_ CLACPY 00167 #else 00168 #define fortran_mr2d pctrmr2do 00169 #define fortran_mr2dnew pctrmr2d 00170 #define ccopy_ ccopy 00171 #define clacpy_ clacpy 00172 #endif 00173 #define Clacpy Cctrlacpy 00174 void Clacpy(); 00175 typedef struct { 00176 float r, i; 00177 } complex; 00178 typedef struct { 00179 int desctype; 00180 int ctxt; 00181 int m; 00182 int n; 00183 int nbrow; 00184 int nbcol; 00185 int sprow; 00186 int spcol; 00187 int lda; 00188 } MDESC; 00189 #define BLOCK_CYCLIC_2D 1 00190 typedef struct { 00191 int gstart; 00192 int len; 00193 } IDESC; 00194 #define SHIFT(row,sprow,nbrow) ((row)-(sprow)+ ((row) >= (sprow) ? 0 : (nbrow))) 00195 #define max(A,B) ((A)>(B)?(A):(B)) 00196 #define min(A,B) ((A)>(B)?(B):(A)) 00197 #define DIVUP(a,b) ( ((a)-1) /(b)+1) 00198 #define ROUNDUP(a,b) (DIVUP(a,b)*(b)) 00199 #ifdef MALLOCDEBUG 00200 #define malloc mymalloc 00201 #define free myfree 00202 #define realloc myrealloc 00203 #endif 00204 /* Cblacs */ 00205 extern void Cblacs_pcoord(); 00206 extern int Cblacs_pnum(); 00207 extern void Csetpvmtids(); 00208 extern void Cblacs_get(); 00209 extern void Cblacs_pinfo(); 00210 extern void Cblacs_gridinfo(); 00211 extern void Cblacs_gridinit(); 00212 extern void Cblacs_exit(); 00213 extern void Cblacs_gridexit(); 00214 extern void Cblacs_setup(); 00215 extern void Cigebs2d(); 00216 extern void Cigebr2d(); 00217 extern void Cigesd2d(); 00218 extern void Cigerv2d(); 00219 extern void Cigsum2d(); 00220 extern void Cigamn2d(); 00221 extern void Cigamx2d(); 00222 extern void Ccgesd2d(); 00223 extern void Ccgerv2d(); 00224 /* lapack */ 00225 void clacpy_(); 00226 /* aux fonctions */ 00227 extern int localindice(); 00228 extern void *mr2d_malloc(); 00229 extern int ppcm(); 00230 extern int localsize(); 00231 extern int memoryblocksize(); 00232 extern int changeorigin(); 00233 extern void paramcheck(); 00234 /* tools and others function */ 00235 #define scanD0 ctrscanD0 00236 #define dispmat ctrdispmat 00237 #define setmemory ctrsetmemory 00238 #define freememory ctrfreememory 00239 #define scan_intervals ctrscan_intervals 00240 extern void scanD0(); 00241 extern void dispmat(); 00242 extern void setmemory(); 00243 extern void freememory(); 00244 extern int scan_intervals(); 00245 extern void Cpctrmr2do(); 00246 extern void Cpctrmr2d(); 00247 /* some defines for Cpctrmr2do */ 00248 #define SENDBUFF 0 00249 #define RECVBUFF 1 00250 #define SIZEBUFF 2 00251 #if 0 00252 #define DEBUG 00253 #endif 00254 #ifndef DEBUG 00255 #define NDEBUG 00256 #endif 00257 #include <stdio.h> 00258 #include <stdlib.h> 00259 #include <assert.h> 00260 #define DESCLEN 9 00261 void 00262 fortran_mr2d(uplo, diag, m, n, A, ia, ja, desc_A, 00263 B, ib, jb, desc_B) 00264 char *uplo, *diag; 00265 int *ia, *ib, *ja, *jb, *m, *n; 00266 int desc_A[DESCLEN], desc_B[DESCLEN]; 00267 complex *A, *B; 00268 { 00269 Cpctrmr2do(uplo, diag, *m, *n, A, *ia, *ja, (MDESC *) desc_A, 00270 B, *ib, *jb, (MDESC *) desc_B); 00271 return; 00272 } 00273 void 00274 fortran_mr2dnew(uplo, diag, m, n, A, ia, ja, desc_A, 00275 B, ib, jb, desc_B, gcontext) 00276 char *uplo, *diag; 00277 int *ia, *ib, *ja, *jb, *m, *n; 00278 int desc_A[DESCLEN], desc_B[DESCLEN]; 00279 complex *A, *B; 00280 int *gcontext; 00281 { 00282 Cpctrmr2d(uplo, diag, *m, *n, A, *ia, *ja, (MDESC *) desc_A, 00283 B, *ib, *jb, (MDESC *) desc_B, *gcontext); 00284 return; 00285 } 00286 static2 void init_chenille(); 00287 static2 int inter_len(); 00288 static2 int block2buff(); 00289 static2 void buff2block(); 00290 static2 void gridreshape(); 00291 void 00292 Cpctrmr2do(uplo, diag, m, n, 00293 ptrmyblock, ia, ja, ma, 00294 ptrmynewblock, ib, jb, mb) 00295 char *uplo, *diag; 00296 complex *ptrmyblock, *ptrmynewblock; 00297 /* pointers to the memory location of the matrix and the redistributed matrix */ 00298 MDESC *ma; 00299 MDESC *mb; 00300 int ia, ja, ib, jb, m, n; 00301 { 00302 int dummy, nprocs; 00303 int gcontext; 00304 /* first we initialize a global grid which serve as a reference to 00305 * communicate from grid a to grid b */ 00306 Cblacs_pinfo(&dummy, &nprocs); 00307 Cblacs_get(0, 0, &gcontext); 00308 Cblacs_gridinit(&gcontext, "R", 1, nprocs); 00309 Cpctrmr2d(uplo, diag, m, n, ptrmyblock, ia, ja, ma, 00310 ptrmynewblock, ib, jb, mb, gcontext); 00311 Cblacs_gridexit(gcontext); 00312 } 00313 #define NBPARAM 20 /* p0,q0,p1,q1, puis ma,na,mba,nba,rowa,cola puis 00314 * idem B puis ia,ja puis ib,jb */ 00315 #define MAGIC_MAX 100000000 00316 void 00317 Cpctrmr2d(uplo, diag, m, n, 00318 ptrmyblock, ia, ja, ma, 00319 ptrmynewblock, ib, jb, mb, globcontext) 00320 char *uplo, *diag; 00321 complex *ptrmyblock, *ptrmynewblock; 00322 /* pointers to the memory location of the matrix and the redistributed matrix */ 00323 MDESC *ma; 00324 MDESC *mb; 00325 int ia, ja, ib, jb, m, n, globcontext; 00326 { 00327 complex *ptrsendbuff, *ptrrecvbuff, *ptrNULL = 0; 00328 complex *recvptr; 00329 MDESC newa, newb; 00330 int *proc0, *proc1, *param; 00331 int mypnum, myprow0, mypcol0, myprow1, mypcol1, nprocs; 00332 int i, j; 00333 int nprow, npcol, gcontext; 00334 int recvsize, sendsize; 00335 IDESC *h_inter; /* to store the horizontal intersections */ 00336 IDESC *v_inter; /* to store the vertical intersections */ 00337 int hinter_nb, vinter_nb; /* number of intrsections in both directions */ 00338 int dummy; 00339 int p0, q0, p1, q1; 00340 int *ra, *ca; 00341 /* end of variables */ 00342 /* To simplify further calcul we change the matrix indexation from 00343 * 1..m,1..n (fortran) to 0..m-1,0..n-1 */ 00344 if (m == 0 || n == 0) 00345 return; 00346 ia -= 1; 00347 ja -= 1; 00348 ib -= 1; 00349 jb -= 1; 00350 Cblacs_gridinfo(globcontext, &nprow, &npcol, &dummy, &mypnum); 00351 gcontext = globcontext; 00352 nprocs = nprow * npcol; 00353 /* if the global context that is given to us has not the shape of a line 00354 * (nprow != 1), create a new context. TODO: to be optimal, we should 00355 * avoid this because it is an uncessary synchronisation */ 00356 if (nprow != 1) { 00357 gridreshape(&gcontext); 00358 Cblacs_gridinfo(gcontext, &dummy, &dummy, &dummy, &mypnum); 00359 } 00360 Cblacs_gridinfo(ma->ctxt, &p0, &q0, &myprow0, &mypcol0); 00361 /* compatibility T3D, must check myprow and mypcol are within bounds */ 00362 if (myprow0 >= p0 || mypcol0 >= q0) 00363 myprow0 = mypcol0 = -1; 00364 assert((myprow0 < p0 && mypcol0 < q0) || (myprow0 == -1 && mypcol0 == -1)); 00365 Cblacs_gridinfo(mb->ctxt, &p1, &q1, &myprow1, &mypcol1); 00366 if (myprow1 >= p1 || mypcol1 >= q1) 00367 myprow1 = mypcol1 = -1; 00368 assert((myprow1 < p1 && mypcol1 < q1) || (myprow1 == -1 && mypcol1 == -1)); 00369 /* exchange the missing parameters among the processors: shape of grids and 00370 * location of the processors */ 00371 param = (int *) mr2d_malloc(3 * (nprocs * 2 + NBPARAM) * sizeof(int)); 00372 ra = param + nprocs * 2 + NBPARAM; 00373 ca = param + (nprocs * 2 + NBPARAM) * 2; 00374 for (i = 0; i < nprocs * 2 + NBPARAM; i++) 00375 param[i] = MAGIC_MAX; 00376 proc0 = param + NBPARAM; 00377 proc1 = param + NBPARAM + nprocs; 00378 /* we calulate proc0 and proc1 that will give the number of a proc in 00379 * respectively a or b in the global context */ 00380 if (myprow0 >= 0) { 00381 proc0[myprow0 * q0 + mypcol0] = mypnum; 00382 param[0] = p0; 00383 param[1] = q0; 00384 param[4] = ma->m; 00385 param[5] = ma->n; 00386 param[6] = ma->nbrow; 00387 param[7] = ma->nbcol; 00388 param[8] = ma->sprow; 00389 param[9] = ma->spcol; 00390 param[10] = ia; 00391 param[11] = ja; 00392 } 00393 if (myprow1 >= 0) { 00394 proc1[myprow1 * q1 + mypcol1] = mypnum; 00395 param[2] = p1; 00396 param[3] = q1; 00397 param[12] = mb->m; 00398 param[13] = mb->n; 00399 param[14] = mb->nbrow; 00400 param[15] = mb->nbcol; 00401 param[16] = mb->sprow; 00402 param[17] = mb->spcol; 00403 param[18] = ib; 00404 param[19] = jb; 00405 } 00406 Cigamn2d(gcontext, "All", "H", 2 * nprocs + NBPARAM, 1, param, 2 * nprocs + NBPARAM, 00407 ra, ca, 2 * nprocs + NBPARAM, -1, -1); 00408 newa = *ma; 00409 newb = *mb; 00410 ma = &newa; 00411 mb = &newb; 00412 if (myprow0 == -1) { 00413 p0 = param[0]; 00414 q0 = param[1]; 00415 ma->m = param[4]; 00416 ma->n = param[5]; 00417 ma->nbrow = param[6]; 00418 ma->nbcol = param[7]; 00419 ma->sprow = param[8]; 00420 ma->spcol = param[9]; 00421 ia = param[10]; 00422 ja = param[11]; 00423 } 00424 if (myprow1 == -1) { 00425 p1 = param[2]; 00426 q1 = param[3]; 00427 mb->m = param[12]; 00428 mb->n = param[13]; 00429 mb->nbrow = param[14]; 00430 mb->nbcol = param[15]; 00431 mb->sprow = param[16]; 00432 mb->spcol = param[17]; 00433 ib = param[18]; 00434 jb = param[19]; 00435 } 00436 for (i = 0; i < NBPARAM; i++) { 00437 if (param[i] == MAGIC_MAX) { 00438 fprintf(stderr, "xxGEMR2D:something wrong in the parameters\n"); 00439 exit(1); 00440 } 00441 } 00442 #ifndef NDEBUG 00443 for (i = 0; i < p0 * q0; i++) 00444 assert(proc0[i] >= 0 && proc0[i] < nprocs); 00445 for (i = 0; i < p1 * q1; i++) 00446 assert(proc1[i] >= 0 && proc1[i] < nprocs); 00447 #endif 00448 /* check the validity of the parameters */ 00449 paramcheck(ma, ia, ja, m, n, p0, q0, gcontext); 00450 paramcheck(mb, ib, jb, m, n, p1, q1, gcontext); 00451 /* we change the problem so that ia < a->nbrow ... andia + m = a->m ... */ 00452 { 00453 int decal; 00454 ia = changeorigin(myprow0, ma->sprow, p0, 00455 ma->nbrow, ia, &decal, &ma->sprow); 00456 ptrmyblock += decal; 00457 ja = changeorigin(mypcol0, ma->spcol, q0, 00458 ma->nbcol, ja, &decal, &ma->spcol); 00459 ptrmyblock += decal * ma->lda; 00460 ma->m = ia + m; 00461 ma->n = ja + n; 00462 ib = changeorigin(myprow1, mb->sprow, p1, 00463 mb->nbrow, ib, &decal, &mb->sprow); 00464 ptrmynewblock += decal; 00465 jb = changeorigin(mypcol1, mb->spcol, q1, 00466 mb->nbcol, jb, &decal, &mb->spcol); 00467 ptrmynewblock += decal * mb->lda; 00468 mb->m = ib + m; 00469 mb->n = jb + n; 00470 if (p0 == 1) 00471 ma->nbrow = ma->m; 00472 if (q0 == 1) 00473 ma->nbcol = ma->n; 00474 if (p1 == 1) 00475 mb->nbrow = mb->m; 00476 if (q1 == 1) 00477 mb->nbcol = mb->n; 00478 #ifndef NDEBUG 00479 paramcheck(ma, ia, ja, m, n, p0, q0, gcontext); 00480 paramcheck(mb, ib, jb, m, n, p1, q1, gcontext); 00481 #endif 00482 } 00483 /* We compute the size of the memory buffer ( we choose the worst case, 00484 * when the buffer sizes == the memory block sizes). */ 00485 if (myprow0 >= 0 && mypcol0 >= 0) { 00486 /* Initialize pointer variables */ 00487 setmemory(&ptrsendbuff, memoryblocksize(ma)); 00488 }; /* if (mypnum < p0 * q0) */ 00489 if (myprow1 >= 0 && mypcol1 >= 0) { 00490 /* Initialize pointer variables */ 00491 setmemory(&ptrrecvbuff, memoryblocksize(mb)); 00492 }; /* if (mypnum < p1 * q1) */ 00493 /* allocing room for the tabs, alloc for the worst case,local_n or local_m 00494 * intervals, in fact the worst case should be less, perhaps half that,I 00495 * should think of that one day. */ 00496 h_inter = (IDESC *) mr2d_malloc(DIVUP(ma->n, q0 * ma->nbcol) * 00497 ma->nbcol * sizeof(IDESC)); 00498 v_inter = (IDESC *) mr2d_malloc(DIVUP(ma->m, p0 * ma->nbrow) 00499 * ma->nbrow * sizeof(IDESC)); 00500 /* We go for the scanning of indices. For each processor including mypnum, 00501 * we fill the sendbuff buffer (scanD0(SENDBUFF)) and when it is done send 00502 * it. Then for each processor, we compute the size of message to be 00503 * receive scanD0(SIZEBUFF)), post a receive and then allocate the elements 00504 * of recvbuff the right place (scanD)(RECVBUFF)) */ 00505 recvptr = ptrrecvbuff; 00506 { 00507 int tot, myrang, step, sens; 00508 int *sender, *recver; 00509 int mesending, merecving; 00510 tot = max(p0 * q0, p1 * q1); 00511 init_chenille(mypnum, nprocs, p0 * q0, proc0, p1 * q1, proc1, 00512 &sender, &recver, &myrang); 00513 if (myrang == -1) 00514 goto after_comm; 00515 mesending = myprow0 >= 0; 00516 assert(sender[myrang] >= 0 || !mesending); 00517 assert(!mesending || proc0[sender[myrang]] == mypnum); 00518 merecving = myprow1 >= 0; 00519 assert(recver[myrang] >= 0 || !merecving); 00520 assert(!merecving || proc1[recver[myrang]] == mypnum); 00521 step = tot - 1 - myrang; 00522 do { 00523 for (sens = 0; sens < 2; sens++) { 00524 /* be careful here, when we communicating with ourselves, we must 00525 * send first (myrang > step == 0) */ 00526 if (mesending && recver[step] >= 0 && 00527 (sens == 0)) { 00528 i = recver[step] / q1; 00529 j = recver[step] % q1; 00530 vinter_nb = scan_intervals('r', ia, ib, m, ma, mb, p0, p1, myprow0, i, 00531 v_inter); 00532 hinter_nb = scan_intervals('c', ja, jb, n, ma, mb, q0, q1, mypcol0, j, 00533 h_inter); 00534 scanD0(uplo, diag, SENDBUFF, ptrsendbuff, &sendsize, 00535 m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1, 00536 v_inter, vinter_nb, h_inter, hinter_nb, 00537 ptrmyblock); 00538 } /* if (mesending...) { */ 00539 if (mesending && recver[step] >= 0 && 00540 (sens == myrang > step)) { 00541 i = recver[step] / q1; 00542 j = recver[step] % q1; 00543 if (sendsize > 0 00544 && (step != myrang || !merecving) 00545 ) { 00546 Ccgesd2d(gcontext, sendsize, 1, ptrsendbuff, sendsize, 00547 0, proc1[i * q1 + j]); 00548 } /* sendsize > 0 */ 00549 } /* if (mesending ... */ 00550 if (merecving && sender[step] >= 0 && 00551 (sens == myrang <= step)) { 00552 i = sender[step] / q0; 00553 j = sender[step] % q0; 00554 vinter_nb = scan_intervals('r', ib, ia, m, mb, ma, p1, p0, myprow1, i, 00555 v_inter); 00556 hinter_nb = scan_intervals('c', jb, ja, n, mb, ma, q1, q0, mypcol1, j, 00557 h_inter); 00558 scanD0(uplo, diag, SIZEBUFF, ptrNULL, &recvsize, 00559 m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1, 00560 v_inter, vinter_nb, h_inter, hinter_nb, ptrNULL); 00561 if (recvsize > 0) { 00562 if (step == myrang && mesending) { 00563 Clacpy(recvsize, 1, 00564 ptrsendbuff, recvsize, 00565 ptrrecvbuff, recvsize); 00566 } else { 00567 Ccgerv2d(gcontext, recvsize, 1, ptrrecvbuff, recvsize, 00568 0, proc0[i * q0 + j]); 00569 } 00570 } /* recvsize > 0 */ 00571 } /* if (merecving ...) */ 00572 if (merecving && sender[step] >= 0 && sens == 1) { 00573 scanD0(uplo, diag, RECVBUFF, ptrrecvbuff, &recvsize, 00574 m, n, ma, ia, ja, p0, q0, mb, ib, jb, p1, q1, 00575 v_inter, vinter_nb, h_inter, hinter_nb, ptrmynewblock); 00576 } /* if (merecving...) */ 00577 } /* for (sens = 0) */ 00578 step -= 1; 00579 if (step < 0) 00580 step = tot - 1; 00581 } while (step != tot - 1 - myrang); 00582 after_comm: 00583 free(sender); 00584 } /* { int tot,nr,ns ...} */ 00585 /* don't forget to clean up things! */ 00586 if (myprow1 >= 0 && mypcol1 >= 0) { 00587 freememory((char *) ptrrecvbuff); 00588 }; 00589 if (myprow0 >= 0 && mypcol0 >= 0) { 00590 freememory((char *) ptrsendbuff); 00591 }; 00592 if (nprow != 1) 00593 Cblacs_gridexit(gcontext); 00594 free(v_inter); 00595 free(h_inter); 00596 free(param); 00597 }/* distrib */ 00598 static2 void 00599 init_chenille(mypnum, nprocs, n0, proc0, n1, proc1, psend, precv, myrang) 00600 int nprocs, mypnum, n0, n1; 00601 int *proc0, *proc1, **psend, **precv, *myrang; 00602 { 00603 int ns, nr, i, tot; 00604 int *sender, *recver, *g0, *g1; 00605 tot = max(n0, n1); 00606 sender = (int *) mr2d_malloc((nprocs + tot) * sizeof(int) * 2); 00607 recver = sender + tot; 00608 *psend = sender; 00609 *precv = recver; 00610 g0 = recver + tot; 00611 g1 = g0 + nprocs; 00612 for (i = 0; i < nprocs; i++) { 00613 g0[i] = -1; 00614 g1[i] = -1; 00615 } 00616 for (i = 0; i < tot; i++) { 00617 sender[i] = -1; 00618 recver[i] = -1; 00619 } 00620 for (i = 0; i < n0; i++) 00621 g0[proc0[i]] = i; 00622 for (i = 0; i < n1; i++) 00623 g1[proc1[i]] = i; 00624 ns = 0; 00625 nr = 0; 00626 *myrang = -1; 00627 for (i = 0; i < nprocs; i++) 00628 if (g0[i] >= 0 && g1[i] >= 0) { 00629 if (i == mypnum) 00630 *myrang = nr; 00631 sender[ns] = g0[i]; 00632 ns += 1; 00633 recver[nr] = g1[i]; 00634 nr += 1; 00635 assert(ns <= n0 && nr <= n1 && nr == ns); 00636 } 00637 for (i = 0; i < nprocs; i++) 00638 if (g0[i] >= 0 && g1[i] < 0) { 00639 if (i == mypnum) 00640 *myrang = ns; 00641 sender[ns] = g0[i]; 00642 ns += 1; 00643 assert(ns <= n0); 00644 } 00645 for (i = 0; i < nprocs; i++) 00646 if (g1[i] >= 0 && g0[i] < 0) { 00647 if (i == mypnum) 00648 *myrang = nr; 00649 recver[nr] = g1[i]; 00650 nr += 1; 00651 assert(nr <= n1); 00652 } 00653 } 00654 void 00655 Clacpy(m, n, a, lda, b, ldb) 00656 complex *a, *b; 00657 int m, n, lda, ldb; 00658 { 00659 int i, j; 00660 lda -= m; 00661 ldb -= m; 00662 assert(lda >= 0 && ldb >= 0); 00663 for (j = 0; j < n; j++) { 00664 for (i = 0; i < m; i++) 00665 *b++ = *a++; 00666 b += ldb; 00667 a += lda; 00668 } 00669 } 00670 static2 void 00671 gridreshape(ctxtp) 00672 int *ctxtp; 00673 { 00674 int ori, final; /* original context, and new context created, with 00675 * line form */ 00676 int nprow, npcol, myrow, mycol; 00677 int *usermap; 00678 int i, j; 00679 ori = *ctxtp; 00680 Cblacs_gridinfo(ori, &nprow, &npcol, &myrow, &mycol); 00681 usermap = mr2d_malloc(sizeof(int) * nprow * npcol); 00682 for (i = 0; i < nprow; i++) 00683 for (j = 0; j < npcol; j++) { 00684 usermap[i + j * nprow] = Cblacs_pnum(ori, i, j); 00685 } 00686 /* Cblacs_get(0, 0, &final); */ 00687 Cblacs_get(ori, 10, &final); 00688 Cblacs_gridmap(&final, usermap, 1, 1, nprow * npcol); 00689 *ctxtp = final; 00690 free(usermap); 00691 }