41{
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282 char GemmTa, GemmTb, cctop, * one, rctop, * talphaCR, * talphaRC,
283 * tbeta, * zero;
284 Int Acol, Aii, Aimb1, Ainb1, Ajj, Alcmb, Ald, Alp, Alp0, Alq,
285 Alq0, Amb, Amp, An, Anb, Anq, Arow, BCfwd, BCmyprocD,
286 BCmyprocR, BCnD, BCnR, BCnprocsD, BCnprocsR, Bbufld, BcurrocR,
287 Bfr, BiD, BiR, BiiD, BiiR, BinbD, BinbR, Binb1D, Binb1R, BisR,
288 Bkk, Bld, BnbD, BnbR, BnpD, BnpR, Boff, BrocD, BrocR, BsrcR,
289 Cfr, CiD, CiR, CiiD, CiiR, CinbD, CinbR, Cinb1D, Cinb1R, Ckk,
290 CnbD, CnbR, CnpD, CnpR, Coff, CrocD, CrocR, CsrcR, Cbufld,
291 CcurrocR, CisR, Cld, WBCfr, WBCld, WBRfr, WBRld, WCCfr, WCCld,
292 WCCsum, WCRfr, WCRld, WCRsum, conjg, ctxt, l, lb, lcmb, lside,
293 ltmp, maxp, maxpm1, maxq, mycol, myrow, n, nb, nbb, ncpq,
294 npcol, npq=0, nprow, nrpq, p=0, q=0, size, tmp, upper;
298
299
300
304 char * Aptr = NULL, * Bbuf = NULL, * Cbuf = NULL, * WBC = NULL,
305 * WBR = NULL, * WCC = NULL, * WCR = NULL;
306
307
308
309
311
316
318 gemm =
TYPE->Fgemm; gsum2d =
TYPE->Cgsum2d;
320
321
322
323 if( lside )
324 {
325 BCnD = An = M; BCnR = N;
326 BCmyprocD = myrow; BCnprocsD = nprow;
327 BCmyprocR = mycol; BCnprocsR = npcol;
328
329 BiD = IB; BiR = JB;
330 BinbD = DESCB[
IMB_ ]; BinbR = DESCB[
INB_];
331 BnbD = DESCB[
MB_ ]; BnbR = DESCB[
NB_ ];
333 PB_Cinfog2l( IB, JB, DESCB, BCnprocsD, BCnprocsR, BCmyprocD, BCmyprocR,
334 &BiiD, &BiiR, &BrocD, &BrocR );
335
336 CiD = IC; CiR = JC;
337 CinbD = DESCC[
IMB_ ]; CinbR = DESCC[
INB_];
338 CnbD = DESCC[
MB_ ]; CnbR = DESCC[
NB_ ];
340 PB_Cinfog2l( IC, JC, DESCC, BCnprocsD, BCnprocsR, BCmyprocD, BCmyprocR,
341 &CiiD, &CiiR, &CrocD, &CrocR );
342 }
343 else
344 {
345 BCnD = An = N; BCnR = M;
346 BCmyprocD = mycol; BCnprocsD = npcol;
347 BCmyprocR = myrow; BCnprocsR = nprow;
348
349 BiD = JB; BiR = IB;
350 BinbR = DESCB[
IMB_ ]; BinbD = DESCB[
INB_];
351 BnbR = DESCB[
MB_ ]; BnbD = DESCB[
NB_ ];
353 PB_Cinfog2l( IB, JB, DESCB, BCnprocsR, BCnprocsD, BCmyprocR, BCmyprocD,
354 &BiiR, &BiiD, &BrocR, &BrocD );
355
356 CiD = JC; CiR = IC;
357 CinbR = DESCC[
IMB_ ]; CinbD = DESCC[
INB_];
358 CnbR = DESCC[
MB_ ]; CnbD = DESCC[
NB_ ];
360 PB_Cinfog2l( IC, JC, DESCC, BCnprocsR, BCnprocsD, BCmyprocR, BCmyprocD,
361 &CiiR, &CiiD, &CrocR, &CrocD );
362 }
363
365 BnpD =
PB_Cnumroc( BCnD, 0, Binb1D, BnbD, BCmyprocD, BrocD, BCnprocsD );
367 BisR = ( ( BsrcR < 0 ) || ( BCnprocsR == 1 ) );
368
370 CnpD =
PB_Cnumroc( BCnD, 0, Cinb1D, CnbD, BCmyprocD, CrocD, BCnprocsD );
372 CisR = ( ( CsrcR < 0 ) || ( BCnprocsR == 1 ) );
373
374
375
376 PB_Cdescribe( An, An, IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj,
377 &Ald, &Aimb1, &Ainb1, &Amb, &Anb, &Arow, &Acol, Ad0 );
378
379 Amp =
PB_Cnumroc( An, 0, Aimb1, Amb, myrow, Arow, nprow );
380 Anq =
PB_Cnumroc( An, 0, Ainb1, Anb, mycol, Acol, npcol );
381 if( ( Amp > 0 ) && ( Anq > 0 ) ) Aptr =
Mptr( A, Aii, Ajj, Ald, size );
382
383
384
385
386
389
390 if( conjg )
391 {
393 if( lside )
394 {
398 }
399 else
400 {
404 }
405 }
406 else
407 {
408 tzsymm =
PB_Ctzsymm; talphaCR = talphaRC = ALPHA;
410 }
411
412
413
414
415 Alcmb = 2 * nb *
PB_Clcm( ( Arow >= 0 ? nprow : 1 ),
416 ( Acol >= 0 ? npcol : 1 ) );
417
418
419
420
421 if( !( BisR ) && !( BCfwd ) )
422 {
423 tmp =
PB_Cindxg2p( BCnR - 1, Binb1R, BnbR, BrocR, BrocR, BCnprocsR );
424 q =
MModSub( tmp, BrocR, BCnprocsR );
425 }
426
427
428
429
430 if( !( CisR ) && !( BCfwd ) )
431 {
432 tmp =
PB_Cindxg2p( BCnR - 1, Cinb1R, CnbR, CrocR, CrocR, BCnprocsR );
433 p =
MModSub( tmp, CrocR, BCnprocsR );
434 }
435
436
437
438
439 lcmb =
PB_Clcm( ( maxp = ( CisR ? 1 : BCnprocsR ) ) * CnbR,
440 ( maxq = ( BisR ? 1 : BCnprocsR ) ) * BnbR );
441 n = BCnR;
442 maxpm1 = maxp - 1;
443
444 while( n > 0 )
445 {
446
447
448
449 BcurrocR = ( BisR ? -1 :
MModAdd( BrocR, q, BCnprocsR ) );
450 Bkk =
PB_Cg2lrem( BiR, BinbR, BnbR, BcurrocR, BsrcR, BCnprocsR );
451 BnpR =
PB_Cnumroc( BCnR, 0, Binb1R, BnbR, BcurrocR, BrocR, BCnprocsR );
452
453 CcurrocR = ( CisR ? -1 :
MModAdd( CrocR, p, BCnprocsR ) );
454 Ckk =
PB_Cg2lrem( CiR, CinbR, CnbR, CcurrocR, CsrcR, BCnprocsR );
455 CnpR =
PB_Cnumroc( BCnR, 0, Cinb1R, CnbR, CcurrocR, CrocR, BCnprocsR );
456
457 PB_CVMinit( &VM, 0, CnpR, BnpR, Cinb1R, Binb1R, CnbR, BnbR, p, q,
458 maxp, maxq, lcmb );
459
460
461
463
464 n -= npq;
465
466
467
468
469 if( npq ) nbb = npq / ( ( npq - 1 ) / nb + 1 );
470
471 while( npq )
472 {
473 nbb =
MIN( nbb, npq );
474
475
476
478
479 if( lside )
480 {
481
482
483
484
485 if( ( Bfr = ( ncpq < nbb ) ) != 0 )
486 {
487
488
489
490
491 Bbufld =
MAX( 1, BnpD );
492 if( BisR || ( BCmyprocR == BcurrocR ) )
493 {
496 BnpD, one,
Mptr( B, BiiD, Bkk, Bld, size ), Bld,
497 zero, Bbuf, Bbufld );
498 }
499 }
500 else
501 {
502
503
504
505 Bbufld = Bld;
506 if( BisR || ( BCmyprocR == BcurrocR ) )
507 Bbuf =
Mptr( B, BiiD, Bkk+Boff, Bld, size );
508 }
509 PB_Cdescset( DBUFB, BCnD, nbb, Binb1D, nbb, BnbD, nbb, BrocD,
510 BcurrocR, ctxt, Bbufld );
511
512
513
514
515 PB_CInV(
TYPE,
NOCONJG,
COLUMN, An, An, Ad0, nbb, Bbuf, 0, 0,
516 DBUFB,
COLUMN, &WBC, WBCd, &WBCfr );
517 PB_CInV(
TYPE,
NOCONJG,
ROW, An, An, Ad0, nbb, WBC, 0, 0,
518 WBCd,
COLUMN, &WBR, WBRd, &WBRfr );
519 }
520 else
521 {
522
523
524
525
526 if( ( Bfr = ( ncpq < nbb ) ) != 0 )
527 {
528
529
530
531
532 Bbufld = nbb;
533 if( BisR || ( BCmyprocR == BcurrocR ) )
534 {
537 BnpD, one,
Mptr( B, Bkk, BiiD, Bld, size ), Bld,
538 zero, Bbuf, Bbufld );
539 }
540 }
541 else
542 {
543
544
545
546 Bbufld = Bld;
547 if( BisR || ( BCmyprocR == BcurrocR ) )
548 Bbuf =
Mptr( B, Bkk+Boff, BiiD, Bld, size );
549 }
550 PB_Cdescset( DBUFB, nbb, BCnD, nbb, Binb1D, nbb, BnbD, BcurrocR,
551 BrocD, ctxt, Bbufld );
552
553
554
555
556 PB_CInV(
TYPE,
NOCONJG,
ROW, An, An, Ad0, nbb, Bbuf, 0, 0,
557 DBUFB,
ROW, &WBR, WBRd, &WBRfr );
558 PB_CInV(
TYPE,
NOCONJG,
COLUMN, An, An, Ad0, nbb, WBR, 0, 0,
559 WBRd,
ROW, &WBC, WBCd, &WBCfr );
560 }
561
562
563
564 PB_COutV(
TYPE,
COLUMN,
INIT, An, An, Ad0, nbb, &WCC, WCCd, &WCCfr,
565 &WCCsum );
566 PB_COutV(
TYPE,
ROW,
INIT, An, An, Ad0, nbb, &WCR, WCRd, &WCRfr,
567 &WCRsum );
568
569
570
571 WBCld = WBCd[
LLD_]; WBRld = WBRd[
LLD_];
572 WCCld = WCCd[
LLD_]; WCRld = WCRd[
LLD_];
573
574 if( ( Amp > 0 ) && ( Anq > 0 ) )
575 {
576 if( upper )
577 {
578
579
580
581 for( l = 0; l < An; l += Alcmb )
582 {
583 lb = An - l; lb =
MIN( lb, Alcmb );
584 Alp =
PB_Cnumroc( l, 0, Aimb1, Amb, myrow, Arow, nprow );
585 Alq =
PB_Cnumroc( l, 0, Ainb1, Anb, mycol, Acol, npcol );
586 Alq0 =
PB_Cnumroc( lb, l, Ainb1, Anb, mycol, Acol, npcol );
587 if( Alp > 0 && Alq0 > 0 )
588 {
590 &Alq0, talphaRC,
Mptr( Aptr, 0, Alq, Ald, size ),
591 &Ald,
Mptr( WBR, 0, Alq, WBRld, size ), &WBRld, one,
592 WCC, &WCCld );
594 &Alp, talphaCR, WBC, &WBCld,
Mptr( Aptr, 0, Alq, Ald,
595 size ), &Ald, one,
Mptr( WCR, 0, Alq, WCRld, size ),
596 &WCRld );
597 }
598 PB_Cpsym(
TYPE,
TYPE, SIDE,
UPPER, lb, nbb, ALPHA, Aptr, l, l,
599 Ad0,
Mptr( WBC, Alp, 0, WBCld, size ), WBCld,
600 Mptr( WBR, 0, Alq, WBRld, size ), WBRld,
Mptr( WCC,
601 Alp, 0, WCCld, size ), WCCld,
Mptr( WCR, 0, Alq,
602 WCRld, size ), WCRld, tzsymm );
603 }
604 }
605 else
606 {
607
608
609
610 for( l = 0; l < An; l += Alcmb )
611 {
612 lb = An - l; ltmp = l + ( lb =
MIN( lb, Alcmb ) );
613 Alp =
PB_Cnumroc( l, 0, Aimb1, Amb, myrow, Arow, nprow );
614 Alq =
PB_Cnumroc( l, 0, Ainb1, Anb, mycol, Acol, npcol );
615 PB_Cpsym(
TYPE,
TYPE, SIDE,
LOWER, lb, nbb, ALPHA, Aptr, l, l,
616 Ad0,
Mptr( WBC, Alp, 0, WBCld, size ), WBCld,
617 Mptr( WBR, 0, Alq, WBRld, size ), WBRld,
Mptr( WCC,
618 Alp, 0, WCCld, size ), WCCld,
Mptr( WCR, 0, Alq,
619 WCRld, size ), WCRld, tzsymm );
620 Alp =
PB_Cnumroc( ltmp, 0, Aimb1, Amb, myrow, Arow, nprow );
621 Alp0 = Amp - Alp;
622 Alq0 =
PB_Cnumroc( lb, l, Ainb1, Anb, mycol, Acol, npcol );
623 if( Alp0 > 0 && Alq0 > 0 )
624 {
626 &Alq0, talphaRC,
Mptr( Aptr, Alp, Alq, Ald, size ),
627 &Ald,
Mptr( WBR, 0, Alq, WBRld, size ), &WBRld, one,
628 Mptr( WCC, Alp, 0, WCCld, size ), &WCCld );
630 &Alp0, talphaCR,
Mptr( WBC, Alp, 0, WBCld, size ),
631 &WBCld,
Mptr( Aptr, Alp, Alq, Ald, size ), &Ald, one,
632 Mptr( WCR, 0, Alq, WCRld, size ), &WCRld );
633 }
634 }
635 }
636 }
637 if( WBCfr ) free( WBC );
638 if( WBRfr ) free( WBR );
639
640 if( Bfr && ( BisR || ( BCmyprocR == BcurrocR ) ) )
641 if( Bbuf ) free( Bbuf );
642
643 if( lside )
644 {
645
646
647
648 if( WCCsum )
649 {
650 WCCd[
CSRC_] = CcurrocR;
651 if( Amp > 0 )
652 gsum2d( ctxt,
ROW, &rctop, Amp, nbb, WCC, WCCld, myrow,
654 }
655
656 if( WCRsum )
657 {
659 if( Anq > 0 )
660 gsum2d( ctxt,
COLUMN, &cctop, nbb, Anq, WCR, WCRld,
661 WCRd[
RSRC_], mycol );
662 }
663
664
665
666 PB_Cpaxpby(
TYPE, CONJUG, nbb, An, one, WCR, 0, 0, WCRd,
ROW, one,
667 WCC, 0, 0, WCCd,
COLUMN );
668 if( WCRfr ) free( WCR );
669
670
671
672
673 if( ( Cfr = ( nrpq < nbb ) ) != 0 )
674 {
675
676
677
678 Cbufld =
MAX( 1, CnpD ); tbeta = zero;
679 if( CisR || ( BCmyprocR == CcurrocR ) )
681 }
682 else
683 {
684
685
686
687 Cbufld = Cld; tbeta = BETA;
688 if( CisR || ( BCmyprocR == CcurrocR ) )
689 Cbuf =
Mptr( C, CiiD, Ckk+Coff, Cld, size );
690 }
691 PB_Cdescset( DBUFC, BCnD, nbb, Cinb1D, nbb, CnbD, nbb, CrocD,
692 CcurrocR, ctxt, Cbufld );
693
694
695
696 PB_Cpaxpby(
TYPE,
NOCONJG, An, nbb, one, WCC, 0, 0, WCCd,
COLUMN,
697 tbeta, Cbuf, 0, 0, DBUFC,
COLUMN );
698 if( WCCfr ) free( WCC );
699
700
701
702 if( Cfr && ( CisR || ( BCmyprocR == CcurrocR ) ) )
703 {
705 CnpD, BETA,
Mptr( C, CiiD, Ckk, Cld, size ), Cld,
706 one, Cbuf, Cbufld );
707 if( Cbuf ) free( Cbuf );
708 }
709 }
710 else
711 {
712
713
714
715 if( WCCsum )
716 {
718 if( Amp > 0 )
719 gsum2d( ctxt,
ROW, &rctop, Amp, nbb, WCC, WCCld, myrow,
721 }
722
723 if( WCRsum )
724 {
725 WCRd[
RSRC_] = CcurrocR;
726 if( Anq > 0 )
727 gsum2d( ctxt,
COLUMN, &cctop, nbb, Anq, WCR, WCRld,
728 WCRd[
RSRC_], mycol );
729 }
730
731
732
733 PB_Cpaxpby(
TYPE, CONJUG, An, nbb, one, WCC, 0, 0, WCCd,
COLUMN,
734 one, WCR, 0, 0, WCRd,
ROW );
735 if( WCCfr ) free( WCC );
736
737
738
739
740 if( ( Cfr = ( nrpq < nbb ) ) != 0 )
741 {
742
743
744
745 Cbufld = nbb; tbeta = zero;
746 if( CisR || ( BCmyprocR == CcurrocR ) )
748 }
749 else
750 {
751
752
753
754 Cbufld = Cld; tbeta = BETA;
755 if( CisR || ( BCmyprocR == CcurrocR ) )
756 Cbuf =
Mptr( C, Ckk+Coff, CiiD, Cld, size );
757 }
758 PB_Cdescset( DBUFC, nbb, BCnD, nbb, Cinb1D, nbb, CnbD, CcurrocR,
759 CrocD, ctxt, Cbufld );
760
761
762
763 PB_Cpaxpby(
TYPE,
NOCONJG, nbb, An, one, WCR, 0, 0, WCRd,
ROW,
764 tbeta, Cbuf, 0, 0, DBUFC,
ROW );
765
766 if( WCRfr ) free( WCR );
767
768
769
770 if( Cfr && ( CisR || ( BCmyprocR == CcurrocR ) ) )
771 {
773 CnpD, BETA,
Mptr( C, Ckk, CiiD, Cld, size ), Cld,
774 one, Cbuf, Cbufld );
775 if( Cbuf ) free( Cbuf );
776 }
777 }
778
779
780
782
783 npq -= nbb;
784 }
785
786
787
788 if( ( BCfwd && ( p == maxpm1 ) ) ||
789 ( !( BCfwd ) && ( p == 0 ) ) )
792 }
793
794 if( conjg ) free( ( lside ? talphaCR : talphaRC ) );
795
796
797
798}