41{
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319 char GatherDir, ScatterDir, * one, top, * talpha, * tbeta, tran,
320 * zero;
321 Int ABm, ABn, Acol, Acurcol, Acurrow, Acurimb1, Acurinb1, Afr,
322 Aii, Aimb, Aimb1, Ainb, Ainb1, AisD, AisR, Ajj, Ald, Amb,
323 Amp, Amp0, Anb, Anq, Anq0, Arow, Aspan, Bcol, Bcurcol,
324 Bcurrow, Bcurimb1, Bcurinb1, Bfr, Bii, Bimb, Bimb1, Binb,
325 Binb1, BisD, BisR, Bjj, Bld, Bmb, Bmp, Bmp0, Bnb, Bnq, Bnq0,
326 Brow, Bspan, Ccsrc, Cimb, Cinb, Cmb, Cnb, Crsrc, WAfr, WACfr,
327 WACld, WACreuse, WACsum, WBfr, WBCfr, WBCld, WBCsum, conjg,
328 ctxt, fwd, k, kb, kbb, kend, kstart, kstep, ktmp, mycol,
329 myrow, notran, npcol, nprow, size, upper;
332
333
334
335 char * Aptr = NULL, * Aptr0 = NULL, * Bptr = NULL, * Bptr0 = NULL,
336 * WA = NULL, * WB = NULL, * WAC = NULL, *WBC = NULL;
339
340
341
342
343
344
345
347
348
349
351
356
358 gsum2d =
TYPE->Cgsum2d; gemm =
TYPE->Fgemm;
359
360
361
363 if( fwd )
364 {
365 kstart = 0; kend = ( ( N - 1 ) / kb + 1 ) * kb; kstep = kb;
367 }
368 else
369 {
370 kstart = ( ( N - 1 ) / kb ) * kb; kend = kstep = -kb;
372 }
373
374
375
376 if( conjg )
377 {
380 }
381 else { tran =
CTRAN; talpha = ALPHA; }
382
383
384
385 if( notran ) { ABm = N; ABn = K; } else { ABm = K; ABn = N; }
386 PB_Cinfog2l( IA, JA, DESCA, nprow, npcol, myrow, mycol, &Aii, &Ajj,
387 &Arow, &Acol );
388 Aimb = DESCA[
IMB_]; Ainb = DESCA[
INB_];
389 Amb = DESCA[
MB_ ]; Anb = DESCA[
NB_ ]; Ald = DESCA[
LLD_];
391 Amp0 =
PB_Cnumroc( ABm, 0, Aimb1, Amb, myrow, Arow, nprow );
393 Anq0 =
PB_Cnumroc( ABn, 0, Ainb1, Anb, mycol, Acol, npcol );
394 if( ( Amp0 > 0 ) && ( Anq0 > 0 ) ) Aptr0 =
Mptr( A, Aii, Ajj, Ald, size );
395
396 PB_Cinfog2l( IB, JB, DESCB, nprow, npcol, myrow, mycol, &Bii, &Bjj,
397 &Brow, &Bcol );
398 Bimb = DESCB[
IMB_]; Binb = DESCB[
INB_];
399 Bmb = DESCB[
MB_ ]; Bnb = DESCB[
NB_ ]; Bld = DESCB[
LLD_];
401 Bmp0 =
PB_Cnumroc( ABm, 0, Bimb1, Bmb, myrow, Brow, nprow );
403 Bnq0 =
PB_Cnumroc( ABn, 0, Binb1, Bnb, mycol, Bcol, npcol );
404 if( ( Bmp0 > 0 ) && ( Bnq0 > 0 ) ) Bptr0 =
Mptr( B, Bii, Bjj, Bld, size );
405
406 if( notran )
407 {
409 Cinb = DESCC[
INB_]; Cnb = DESCC[
NB_]; Ccsrc = DESCC[
CSRC_];
410
411
412
413
414 AisR = ( ( Acol < 0 ) || ( npcol == 1 ) );
415 BisR = ( ( Bcol < 0 ) || ( npcol == 1 ) );
416
417 if( !( AisR ) && !( BisR ) )
418 {
419
420
421
422
423
424
425 Aspan =
PB_Cspan( ABn, 0, Ainb1, Anb, Acol, npcol );
426 Bspan =
PB_Cspan( ABn, 0, Binb1, Bnb, Bcol, npcol );
427 WACreuse = ( Aspan ||
428 ( !( Aspan ) && !( Bspan ) && ( Acol == Bcol ) ) );
429 }
430 else
431 {
432
433
434
435
436 WACreuse = ( AisR && BisR );
437 }
438
439
440
441
442 AisD = ( ( Arow >= 0 ) && ( nprow > 1 ) );
443 BisD = ( ( Brow >= 0 ) && ( nprow > 1 ) );
444 WACreuse = ( WACreuse &&
445 ( ( !AisD && !BisD ) || ( ( AisD && BisD ) &&
446 ( ( Arow == Brow ) &&
447 ( ( ( Aimb1 >= ABm ) && ( Bimb1 >= ABm ) ) ||
448 ( ( Aimb1 == Bimb1 ) && ( Amb == Bmb ) ) ) ) ) ) );
449 tbeta = ( WACreuse ? one : zero );
450
451 if( upper )
452 {
453 for( k = kstart; k != kend; k += kstep )
454 {
455 kbb = N - k; kbb =
MIN( kbb, kb ); ktmp = k + kbb;
456
457
458
460 ROW, &Bptr, DBUFB, &Bfr );
461
462
463
464 PB_Cdescset( Ad0, ktmp, ABn, Aimb1, Ainb1, Amb, Anb, Arow, Acol,
465 ctxt, Ald );
466 PB_CInV(
TYPE,
NOCONJG,
ROW, ktmp, ABn, Ad0, kbb, Bptr, 0, 0, DBUFB,
467 ROW, &WB, WBd, &WBfr );
468
469
470
472 &WACfr, &WACsum );
474 Amp =
PB_Cnumroc( ktmp, 0, Aimb1, Amb, myrow, Arow, nprow );
475 if( ( Amp > 0 ) && ( Anq0 > 0 ) )
477 ALPHA, Aptr0, &Ald, WB, &WBd[
LLD_], zero, WAC, &WACld );
478 if( WBfr ) free( WB );
479 if( Bfr ) free( Bptr );
480
481
482
484 ROW, &Aptr, DBUFA, &Afr );
485
486
487
488 PB_Cdescset( Bd0, ktmp, ABn, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
489 ctxt, Bld );
490 PB_CInV(
TYPE,
NOCONJG,
ROW, ktmp, ABn, Bd0, kbb, Aptr, 0, 0, DBUFA,
491 ROW, &WA, WAd, &WAfr );
492
493
494
495 if( WACreuse )
496 { WBC = WAC;
MDescCopy( WACd, WBCd ); WBCfr = 0; WBCsum = WACsum; }
497 else
498 {
500 &WBCfr, &WBCsum );
501 }
503 Bmp =
PB_Cnumroc( ktmp, 0, Bimb1, Bmb, myrow, Brow, nprow );
504 if( ( Bmp > 0 ) && ( Bnq0 > 0 ) )
506 talpha, Bptr0, &Bld, WA, &WAd[
LLD_], tbeta, WBC, &WBCld );
507 if( WAfr ) free( WA );
508 if( Afr ) free( Aptr );
509
510
511
512 if( WACsum )
513 {
515 Cnb, Ccsrc, Ccsrc, npcol );
516 if( Amp > 0 )
517 gsum2d( ctxt,
ROW, &top, Amp, kbb, WAC, WACld, myrow,
519 }
520
521
522
523 if( conjg )
525 k, 0, WACd );
526 else if( kbb > 1 )
528 k+1, 0, WACd );
529
530
531
532 if( !( WACreuse ) )
533 {
534 if( WBCsum )
535 {
537 else
538 {
540 Cinb, Cnb, Ccsrc, Ccsrc,
541 npcol );
542 }
543 if( Bmp > 0 )
544 gsum2d( ctxt,
ROW, &top, Bmp, kbb, WBC, WBCld, myrow,
546 }
547
548
549
550 if( conjg )
552 WBC, k, 0, WBCd );
553 else if( kbb > 1 )
555 WBC, k+1, 0, WBCd );
556 }
557
558
559
561 one, C, IC, JC+k, DESCC,
COLUMN );
562 if( WACfr ) free( WAC );
563
564
565
566 if( !( WACreuse ) )
567 {
570 if( WBCfr ) free( WBC );
571 }
572 }
573 }
574 else
575 {
576 for( k = kstart; k != kend; k += kstep )
577 {
578 ktmp = N - k; kbb =
MIN( ktmp, kb );
579
580
581
583 ROW, &Bptr, DBUFB, &Bfr );
584
585
586
588 Acurrow =
PB_Cindxg2p( k, Aimb1, Amb, Arow, Arow, nprow );
589 PB_Cdescset( Ad0, ktmp, ABn, Acurimb1, Ainb1, Amb, Anb, Acurrow,
590 Acol, ctxt, Ald );
591 PB_CInV(
TYPE,
NOCONJG,
ROW, ktmp, ABn, Ad0, kbb, Bptr, 0, 0, DBUFB,
592 ROW, &WB, WBd, &WBfr );
593
594
595
597 &WACfr, &WACsum );
599 Amp =
PB_Cnumroc( ktmp, k, Aimb1, Amb, myrow, Arow, nprow );
600 if( ( Amp > 0 ) && ( Anq0 > 0 ) )
602 ALPHA,
Mptr( Aptr0, Amp0-Amp, 0, Ald, size ), &Ald, WB,
603 &WBd[
LLD_], zero, WAC, &WACld );
604 if( WBfr ) free( WB );
605 if( Bfr ) free( Bptr );
606
607
608
610 ROW, &Aptr, DBUFA, &Afr );
611
612
613
615 Bcurrow =
PB_Cindxg2p( k, Bimb1, Bmb, Brow, Brow, nprow );
616 PB_Cdescset( Bd0, ktmp, ABn, Bcurimb1, Binb1, Bmb, Bnb, Bcurrow,
617 Bcol, ctxt, Bld );
618 PB_CInV(
TYPE,
NOCONJG,
ROW, ktmp, ABn, Bd0, kbb, Aptr, 0, 0, DBUFA,
619 ROW, &WA, WAd, &WAfr );
620
621
622
623 if( WACreuse )
624 { WBC = WAC;
MDescCopy( WACd, WBCd ); WBCfr = 0; WBCsum = WACsum; }
625 else
626 {
628 &WBCfr, &WBCsum );
629 }
631 Bmp =
PB_Cnumroc( ktmp, k, Bimb1, Bmb, myrow, Brow, nprow );
632 if( ( Bmp > 0 ) && ( Bnq0 > 0 ) )
634 talpha,
Mptr( Bptr0, Bmp0-Bmp, 0, Bld, size ), &Bld, WA,
635 &WAd[
LLD_], tbeta, WBC, &WBCld );
636 if( WAfr ) free( WA );
637 if( Afr ) free( Aptr );
638
639
640
641 if( WACsum )
642 {
644 Cnb, Ccsrc, Ccsrc, npcol );
645 if( Amp > 0 )
646 gsum2d( ctxt,
ROW, &top, Amp, kbb, WAC, WACld, myrow,
648 }
649
650
651
652 if( conjg )
654 0, 0, WACd );
655 else if( kbb > 1 )
657 0, 1, WACd );
658
659
660
661 if( !( WACreuse ) )
662 {
663 if( WBCsum )
664 {
666 else
667 {
669 Cinb, Cnb, Ccsrc, Ccsrc,
670 npcol );
671 }
672 if( Bmp > 0 )
673 gsum2d( ctxt,
ROW, &top, Bmp, kbb, WBC, WBCld, myrow,
675 }
676
677
678
679 if( conjg )
681 WBC, 0, 0, WBCd );
682 else if( kbb > 1 )
684 WBC, 0, 1, WBCd );
685 }
686
687
688
690 one, C, IC+k, JC+k, DESCC,
COLUMN );
691 if( WACfr ) free( WAC );
692
693
694
695 if( !( WACreuse ) )
696 {
699 if( WBCfr ) free( WBC );
700 }
701 }
702 }
703 }
704 else
705 {
707 Cimb = DESCC[
IMB_]; Cmb = DESCC[
MB_]; Crsrc = DESCC[
RSRC_];
708
709
710
711
712 AisR = ( ( Arow < 0 ) || ( nprow == 1 ) );
713 BisR = ( ( Brow < 0 ) || ( nprow == 1 ) );
714
715
716
717
718
719
720 if( !( AisR ) && !( BisR ) )
721 {
722 Aspan =
PB_Cspan( ABm, 0, Aimb1, Amb, Arow, nprow );
723 Bspan =
PB_Cspan( ABm, 0, Bimb1, Bmb, Brow, nprow );
724 WACreuse = ( Aspan ||
725 ( !( Aspan ) && !( Bspan ) && ( Arow == Brow ) ) );
726 }
727 else
728 {
729
730
731
732
733 WACreuse = ( AisR && BisR );
734 }
735
736
737
738
739 AisD = ( ( Acol >= 0 ) && ( npcol > 1 ) );
740 BisD = ( ( Bcol >= 0 ) && ( npcol > 1 ) );
741 WACreuse = ( WACreuse &&
742 ( ( !AisD && !BisD ) || ( ( AisD && BisD ) &&
743 ( ( Acol == Bcol ) &&
744 ( ( ( Ainb1 >= ABn ) && ( Binb1 >= ABn ) ) ||
745 ( ( Ainb1 == Binb1 ) && ( Anb == Bnb ) ) ) ) ) ) );
746 tbeta = ( WACreuse ? one : zero );
747
748 if( upper )
749 {
750 for( k = kstart; k != kend; k += kstep )
751 {
752 ktmp = N - k; kbb =
MIN( ktmp, kb );
753
754
755
757 COLUMN, &Bptr, DBUFB, &Bfr );
758
759
760
762 Acurcol =
PB_Cindxg2p( k, Ainb1, Anb, Acol, Acol, npcol );
763 PB_Cdescset( Ad0, ABm, ktmp, Aimb1, Acurinb1, Amb, Anb, Arow,
764 Acurcol, ctxt, Ald );
765 PB_CInV(
TYPE,
NOCONJG,
COLUMN, ABm, ktmp, Ad0, kbb, Bptr, 0, 0,
766 DBUFB,
COLUMN, &WB, WBd, &WBfr );
767
768
769
770 PB_COutV(
TYPE,
ROW,
INIT, ABm, ktmp, Ad0, kbb, &WAC, WACd, &WACfr,
771 &WACsum );
773 Anq =
PB_Cnumroc( ktmp, k, Ainb1, Anb, mycol, Acol, npcol );
774 if( ( Anq > 0 ) && ( Amp0 > 0 ) )
776 talpha, WB, &WBd[
LLD_],
Mptr( Aptr0, 0, Anq0-Anq, Ald,
777 size ), &Ald, zero, WAC, &WACld );
778 if( WBfr ) free( WB );
779 if( Bfr ) free( Bptr );
780
781
782
784 COLUMN, &Aptr, DBUFA, &Afr );
785
786
787
789 Bcurcol =
PB_Cindxg2p( k, Binb1, Bnb, Bcol, Bcol, npcol );
790 PB_Cdescset( Bd0, ABm, ktmp, Bimb1, Bcurinb1, Bmb, Bnb, Brow,
791 Bcurcol, ctxt, Bld );
792 PB_CInV(
TYPE,
NOCONJG,
COLUMN, ABm, ktmp, Bd0, kbb, Aptr, 0, 0,
793 DBUFA,
COLUMN, &WA, WAd, &WAfr );
794
795
796
797 if( WACreuse )
798 { WBC = WAC;
MDescCopy( WACd, WBCd ); WBCfr = 0; WBCsum = WACsum; }
799 else
800 {
802 &WBCfr, &WBCsum );
803 }
805 Bnq =
PB_Cnumroc( ktmp, k, Binb1, Bnb, mycol, Bcol, npcol );
806 if( ( Bnq > 0 ) && ( Bmp0 > 0 ) )
808 ALPHA, WA, &WAd[
LLD_],
Mptr( Bptr0, 0, Bnq0-Bnq, Bld,
809 size ), &Bld, tbeta, WBC, &WBCld );
810 if( WAfr ) free( WA );
811 if( Afr ) free( Aptr );
812
813
814
815 if( WACsum )
816 {
818 Cmb, Crsrc, Crsrc, nprow );
819 if( Anq > 0 )
820 gsum2d( ctxt,
COLUMN, &top, kbb, Anq, WAC, WACld, WACd[
RSRC_],
821 mycol );
822 }
823
824
825
826 if( conjg )
828 0, 0, WACd );
829 else if( kbb > 1 )
831 1, 0, WACd );
832
833
834
835 if( !( WACreuse ) )
836 {
837 if( WBCsum )
838 {
840 else
841 {
843 Cimb, Cmb, Crsrc, Crsrc,
844 nprow );
845 }
846 if( Bnq > 0 )
847 gsum2d( ctxt,
COLUMN, &top, kbb, Bnq, WBC, WBCld,
848 WBCd[
RSRC_], mycol );
849 }
850
851
852
853 if( conjg )
855 WBC, 0, 0, WBCd );
856 else if( kbb > 1 )
858 WBC, 1, 0, WBCd );
859 }
860
861
862
864 one, C, IC+k, JC+k, DESCC,
ROW );
865 if( WACfr ) free( WAC );
866
867
868
869 if( !( WACreuse ) )
870 {
872 one, C, IC+k, JC+k, DESCC,
ROW );
873 if( WBCfr ) free( WBC );
874 }
875 }
876 }
877 else
878 {
879 for( k = kstart; k != kend; k += kstep )
880 {
881 kbb = N - k; kbb =
MIN( kbb, kb ); ktmp = k + kbb;
882
883
884
886 COLUMN, &Bptr, DBUFB, &Bfr );
887
888
889
890 PB_Cdescset( Ad0, ABm, ktmp, Aimb1, Ainb1, Amb, Anb, Arow, Acol,
891 ctxt, Ald );
892 PB_CInV(
TYPE,
NOCONJG,
COLUMN, ABm, ktmp, Ad0, kbb, Bptr, 0, 0,
893 DBUFB,
COLUMN, &WB, WBd, &WBfr );
894
895
896
897 PB_COutV(
TYPE,
ROW,
INIT, ABm, ktmp, Ad0, kbb, &WAC, WACd, &WACfr,
898 &WACsum );
900 Anq =
PB_Cnumroc( ktmp, 0, Ainb1, Anb, mycol, Acol, npcol );
901 if( ( Anq > 0 ) && ( Amp0 > 0 ) )
903 talpha, WB, &WBd[
LLD_], Aptr0, &Ald, zero, WAC, &WACld );
904 if( WBfr ) free( WB );
905 if( Bfr ) free( Bptr );
906
907
908
910 COLUMN, &Aptr, DBUFA, &Afr );
911
912
913
914 PB_Cdescset( Bd0, ABm, ktmp, Bimb1, Binb1, Bmb, Bnb, Brow, Bcol,
915 ctxt, Bld );
916 PB_CInV(
TYPE,
NOCONJG,
COLUMN, ABm, ktmp, Bd0, kbb, Aptr, 0, 0,
917 DBUFA,
COLUMN, &WA, WAd, &WAfr );
918
919
920
921 if( WACreuse )
922 { WBC = WAC;
MDescCopy( WACd, WBCd ); WBCfr = 0; WBCsum = WACsum; }
923 else
924 {
926 &WBCfr, &WBCsum );
927 }
929 Bnq =
PB_Cnumroc( ktmp, 0, Binb1, Bnb, mycol, Bcol, npcol );
930 if( ( Bnq > 0 ) && ( Bmp0 > 0 ) )
932 ALPHA, WA, &WAd[
LLD_], Bptr0, &Bld, tbeta, WBC, &WBCld );
933 if( WAfr ) free( WA );
934 if( Afr ) free( Aptr );
935
936
937
938 if( WACsum )
939 {
941 Cmb, Crsrc, Crsrc, nprow );
942 if( Anq > 0 )
943 gsum2d( ctxt,
COLUMN, &top, kbb, Anq, WAC, WACld, WACd[
RSRC_],
944 mycol );
945 }
946
947
948
949 if( conjg )
951 0, k, WACd );
952 else if( kbb > 1 )
954 0, k+1, WACd );
955
956
957
958 if( !( WACreuse ) )
959 {
960 if( WBCsum )
961 {
963 else
964 {
966 Cimb, Cmb, Crsrc, Crsrc,
967 nprow );
968 }
969 if( Bnq > 0 )
970 gsum2d( ctxt,
COLUMN, &top, kbb, Bnq, WBC, WBCld,
971 WBCd[
RSRC_], mycol );
972 }
973
974
975
976 if( conjg )
978 WBC, 0, k, WBCd );
979 else if( kbb > 1 )
981 WBC, 0, k+1, WBCd );
982 }
983
984
985
987 one, C, IC+k, JC, DESCC,
ROW );
988 if( WACfr ) free( WAC );
989
990
991
992 if( !( WACreuse ) )
993 {
995 one, C, IC+k, JC, DESCC,
ROW );
996 if( WBCfr ) free( WBC );
997 }
998 }
999 }
1000 }
1001 if( conjg ) free( talpha );
1002
1003
1004
1005}