#include "atlas_misc.h"

void ATL_USERMM
   (const int M, const int N, const int K, const TYPE alpha, const TYPE *A, const int lda, const TYPE *B, const int ldb, const TYPE beta, TYPE *C, const int ldc)
/*
 * Generated matmul kernel: TA=T, TB=N, runtime M/N/K and lda/ldb/ldc,
 * register tile mu=8 x nu=8, K unrolling ku=1.
 *
 * Each pass of the M-loop updates one 8x8 tile of C:
 *    C(i,j) = beta_term + sum_{k<K} A[i*lda + k] * B[j*ldb + k]
 * so both A and B are walked with unit stride along K, and lda / ldb separate
 * consecutive rows / columns of the product.
 *
 * How the old contents of C enter the sum is chosen at compile time:
 *    BETA0 defined : C is not read, accumulators start at ATL_rzero
 *    BETAX defined : C is loaded and multiplied by beta
 *    otherwise     : C is loaded unscaled
 *
 * When TREAL is not defined, C is accessed with an element stride of 2 while
 * A and B are still read with stride 1.
 * NOTE(review): presumably SHIFT doubles ldc for interleaved complex storage
 * and A/B are already split into real blocks by the caller -- confirm against
 * atlas_misc.h and the calling code.
 *
 * alpha is never referenced by this kernel.
 *
 * Only the leading Mb = 8*(M/8) rows and Nb = 8*(N/8) columns are processed;
 * there is no cleanup code for remainders.  If there is no complete 8x8 tile
 * (Mb == 0 or Nb == 0) the routine returns without touching A, B or C.
 */
{
   const int Mb = (M>>3)<<3;   /* M rounded down to a multiple of 8 */
   const int Nb = (N>>3)<<3;   /* N rounded down to a multiple of 8 */
   #define Kb K
   const TYPE *stM = A + (lda*Mb);   /* pA0 value that ends the M-loop */
   const TYPE *stN = B + (ldb*Nb);   /* pB0 value that ends the N-loop */
   #define incAk 1
   /*
    * After the K-loop every A pointer has advanced by K: incAm moves it to the
    * start of the next group of 8 rows, incAn rewinds it to the top of A.
    */
   const int incAm = ((((lda) << 3)) - Kb), incAn = -(Mb*lda);
   #define incBk 1
   /*
    * incBm rewinds B to the start of the current 8 columns after a K sweep,
    * incBn steps to the next 8 columns.
    */
   const int incBm = -(Kb), incBn = (((ldb) << 3));
   #ifdef TREAL
      #define incCm  8
   #else
      #define incCm 16
   #endif
   const int incCn = ((((ldc) << 3)) - (Mb))SHIFT, ldc2 = ldc SHIFT;
   TYPE *pC0=C, *pC1=pC0+(ldc2), *pC2=pC1+(ldc2), *pC3=pC2+(ldc2), 
        *pC4=pC3+(ldc2), *pC5=pC4+(ldc2), *pC6=pC5+(ldc2), *pC7=pC6+(ldc2);
   const TYPE *pA0=A, *pA1=pA0+(lda), *pA2=pA1+(lda), *pA3=pA2+(lda), 
              *pA4=pA3+(lda), *pA5=pA4+(lda), *pA6=pA5+(lda), *pA7=pA6+(lda);
   const TYPE *pB0=B, *pB1=pB0+(ldb), *pB2=pB1+(ldb), *pB3=pB2+(ldb), 
              *pB4=pB3+(ldb), *pB5=pB4+(ldb), *pB6=pB5+(ldb), *pB7=pB6+(ldb);
   const TYPE *bp = &beta;   /* only dereferenced in the BETAX path */
   register int k;
   register TYPE rA0, rA1, rA2, rA3, rA4, rA5, rA6, rA7;
   register TYPE rB0, rB1, rB2, rB3, rB4, rB5, rB6, rB7;
   /* rC<i>_<j> accumulates the tile element stored at pC<j>[i] */
   register TYPE rC0_0, rC1_0, rC2_0, rC3_0, rC4_0, rC5_0, rC6_0, rC7_0, rC0_1, rC1_1, rC2_1, rC3_1, rC4_1, rC5_1, rC6_1, rC7_1, rC0_2, rC1_2, rC2_2, rC3_2, rC4_2, rC5_2, rC6_2, rC7_2, rC0_3, rC1_3, rC2_3, rC3_3, rC4_3, rC5_3, rC6_3, rC7_3, rC0_4, rC1_4, rC2_4, rC3_4, rC4_4, rC5_4, rC6_4, rC7_4, rC0_5, rC1_5, rC2_5, rC3_5, rC4_5, rC5_5, rC6_5, rC7_5, rC0_6, rC1_6, rC2_6, rC3_6, rC4_6, rC5_6, rC6_6, rC7_6, rC0_7, rC1_7, rC2_7, rC3_7, rC4_7, rC5_7, rC6_7, rC7_7;

   /*
    * Both loops below are do-while and therefore run their body at least
    * once.  With no complete tile the exit tests (pA0 != stM, pB0 != stN)
    * could never be met on the first pass and the kernel would run off the
    * ends of A, B and C, so stop here instead.
    */
   if (Mb <= 0 || Nb <= 0)
      return;

   do /* N-loop */
   {
      do /* M-loop */
      {
         /* Initialise the 64 accumulators according to the beta variant */
         #ifdef BETA0
            rC0_0 = rC1_0 = rC2_0 = rC3_0 = rC4_0 = rC5_0 = rC6_0 = rC7_0 =
            rC0_1 = rC1_1 = rC2_1 = rC3_1 = rC4_1 = rC5_1 = rC6_1 = rC7_1 =
            rC0_2 = rC1_2 = rC2_2 = rC3_2 = rC4_2 = rC5_2 = rC6_2 = rC7_2 =
            rC0_3 = rC1_3 = rC2_3 = rC3_3 = rC4_3 = rC5_3 = rC6_3 = rC7_3 =
            rC0_4 = rC1_4 = rC2_4 = rC3_4 = rC4_4 = rC5_4 = rC6_4 = rC7_4 =
            rC0_5 = rC1_5 = rC2_5 = rC3_5 = rC4_5 = rC5_5 = rC6_5 = rC7_5 =
            rC0_6 = rC1_6 = rC2_6 = rC3_6 = rC4_6 = rC5_6 = rC6_6 = rC7_6 =
            rC0_7 = rC1_7 = rC2_7 = rC3_7 = rC4_7 = rC5_7 = rC6_7 = rC7_7 =
                    ATL_rzero;
         #else
            #ifdef TREAL
               rC0_0 = *pC0; rC1_0 = pC0[1]; rC2_0 = pC0[2]; rC3_0 = pC0[3];
               rC4_0 = pC0[4]; rC5_0 = pC0[5]; rC6_0 = pC0[6]; rC7_0 = pC0[7];
               rC0_1 = *pC1; rC1_1 = pC1[1]; rC2_1 = pC1[2]; rC3_1 = pC1[3];
               rC4_1 = pC1[4]; rC5_1 = pC1[5]; rC6_1 = pC1[6]; rC7_1 = pC1[7];
               rC0_2 = *pC2; rC1_2 = pC2[1]; rC2_2 = pC2[2]; rC3_2 = pC2[3];
               rC4_2 = pC2[4]; rC5_2 = pC2[5]; rC6_2 = pC2[6]; rC7_2 = pC2[7];
               rC0_3 = *pC3; rC1_3 = pC3[1]; rC2_3 = pC3[2]; rC3_3 = pC3[3];
               rC4_3 = pC3[4]; rC5_3 = pC3[5]; rC6_3 = pC3[6]; rC7_3 = pC3[7];
               rC0_4 = *pC4; rC1_4 = pC4[1]; rC2_4 = pC4[2]; rC3_4 = pC4[3];
               rC4_4 = pC4[4]; rC5_4 = pC4[5]; rC6_4 = pC4[6]; rC7_4 = pC4[7];
               rC0_5 = *pC5; rC1_5 = pC5[1]; rC2_5 = pC5[2]; rC3_5 = pC5[3];
               rC4_5 = pC5[4]; rC5_5 = pC5[5]; rC6_5 = pC5[6]; rC7_5 = pC5[7];
               rC0_6 = *pC6; rC1_6 = pC6[1]; rC2_6 = pC6[2]; rC3_6 = pC6[3];
               rC4_6 = pC6[4]; rC5_6 = pC6[5]; rC6_6 = pC6[6]; rC7_6 = pC6[7];
               rC0_7 = *pC7; rC1_7 = pC7[1]; rC2_7 = pC7[2]; rC3_7 = pC7[3];
               rC4_7 = pC7[4]; rC5_7 = pC7[5]; rC6_7 = pC7[6]; rC7_7 = pC7[7];
            #else
            /* non-TREAL build: consecutive rows of C are 2 elements apart */
            rC0_0 = *pC0;   rC1_0 = pC0[ 2]; rC2_0 = pC0[ 4]; rC3_0 = pC0[ 6];
            rC4_0 = pC0[8]; rC5_0 = pC0[10]; rC6_0 = pC0[12]; rC7_0 = pC0[14];
            rC0_1 = *pC1;   rC1_1 = pC1[ 2]; rC2_1 = pC1[ 4]; rC3_1 = pC1[ 6];
            rC4_1 = pC1[8]; rC5_1 = pC1[10]; rC6_1 = pC1[12]; rC7_1 = pC1[14];
            rC0_2 = *pC2;   rC1_2 = pC2[ 2]; rC2_2 = pC2[ 4]; rC3_2 = pC2[ 6];
            rC4_2 = pC2[8]; rC5_2 = pC2[10]; rC6_2 = pC2[12]; rC7_2 = pC2[14];
            rC0_3 = *pC3;   rC1_3 = pC3[ 2]; rC2_3 = pC3[ 4]; rC3_3 = pC3[ 6];
            rC4_3 = pC3[8]; rC5_3 = pC3[10]; rC6_3 = pC3[12]; rC7_3 = pC3[14];
            rC0_4 = *pC4;   rC1_4 = pC4[ 2]; rC2_4 = pC4[ 4]; rC3_4 = pC4[ 6];
            rC4_4 = pC4[8]; rC5_4 = pC4[10]; rC6_4 = pC4[12]; rC7_4 = pC4[14];
            rC0_5 = *pC5;   rC1_5 = pC5[ 2]; rC2_5 = pC5[ 4]; rC3_5 = pC5[ 6];
            rC4_5 = pC5[8]; rC5_5 = pC5[10]; rC6_5 = pC5[12]; rC7_5 = pC5[14];
            rC0_6 = *pC6;   rC1_6 = pC6[ 2]; rC2_6 = pC6[ 4]; rC3_6 = pC6[ 6];
            rC4_6 = pC6[8]; rC5_6 = pC6[10]; rC6_6 = pC6[12]; rC7_6 = pC6[14];
            rC0_7 = *pC7;   rC1_7 = pC7[ 2]; rC2_7 = pC7[ 4]; rC3_7 = pC7[ 6];
            rC4_7 = pC7[8]; rC5_7 = pC7[10]; rC6_7 = pC7[12]; rC7_7 = pC7[14];
            #endif
            #ifdef BETAX
               /* rA7 is free until the K-loop, so it temporarily holds beta */
               rA7 = *bp;
               rC0_0 *= rA7; rC1_0 *= rA7; rC2_0 *= rA7; rC3_0 *= rA7;
               rC4_0 *= rA7; rC5_0 *= rA7; rC6_0 *= rA7; rC7_0 *= rA7;
               rC0_1 *= rA7; rC1_1 *= rA7; rC2_1 *= rA7; rC3_1 *= rA7;
               rC4_1 *= rA7; rC5_1 *= rA7; rC6_1 *= rA7; rC7_1 *= rA7;
               rC0_2 *= rA7; rC1_2 *= rA7; rC2_2 *= rA7; rC3_2 *= rA7;
               rC4_2 *= rA7; rC5_2 *= rA7; rC6_2 *= rA7; rC7_2 *= rA7;
               rC0_3 *= rA7; rC1_3 *= rA7; rC2_3 *= rA7; rC3_3 *= rA7;
               rC4_3 *= rA7; rC5_3 *= rA7; rC6_3 *= rA7; rC7_3 *= rA7;
               rC0_4 *= rA7; rC1_4 *= rA7; rC2_4 *= rA7; rC3_4 *= rA7;
               rC4_4 *= rA7; rC5_4 *= rA7; rC6_4 *= rA7; rC7_4 *= rA7;
               rC0_5 *= rA7; rC1_5 *= rA7; rC2_5 *= rA7; rC3_5 *= rA7;
               rC4_5 *= rA7; rC5_5 *= rA7; rC6_5 *= rA7; rC7_5 *= rA7;
               rC0_6 *= rA7; rC1_6 *= rA7; rC2_6 *= rA7; rC3_6 *= rA7;
               rC4_6 *= rA7; rC5_6 *= rA7; rC6_6 *= rA7; rC7_6 *= rA7;
               rC0_7 *= rA7; rC1_7 *= rA7; rC2_7 *= rA7; rC3_7 *= rA7;
               rC4_7 *= rA7; rC5_7 *= rA7; rC6_7 *= rA7; rC7_7 *= rA7;
            #endif
         #endif
         /* Rank-1 update of the 8x8 tile for each k; K == 0 skips the loop */
         for (k=K; k; k--) /* easy loop to unroll */
         {
            rA0 = *pA0;
            rB0 = *pB0;
            rA1 = *pA1;
            rA2 = *pA2;
            rA3 = *pA3;
            rA4 = *pA4;
            rA5 = *pA5;
            rA6 = *pA6;
            rA7 = *pA7;
            rB1 = *pB1;
            rB2 = *pB2;
            rB3 = *pB3;
            rB4 = *pB4;
            rB5 = *pB5;
            rB6 = *pB6;
            rB7 = *pB7;
            rC0_0 += rA0 * rB0;
            rC1_0 += rA1 * rB0;
            rC2_0 += rA2 * rB0;
            rC3_0 += rA3 * rB0;
            rC4_0 += rA4 * rB0;
            rC5_0 += rA5 * rB0;
            rC6_0 += rA6 * rB0;
            rC7_0 += rA7 * rB0;
            rC0_1 += rA0 * rB1;
            rC1_1 += rA1 * rB1;
            rC2_1 += rA2 * rB1;
            rC3_1 += rA3 * rB1;
            rC4_1 += rA4 * rB1;
            rC5_1 += rA5 * rB1;
            rC6_1 += rA6 * rB1;
            rC7_1 += rA7 * rB1;
            rC0_2 += rA0 * rB2;
            rC1_2 += rA1 * rB2;
            rC2_2 += rA2 * rB2;
            rC3_2 += rA3 * rB2;
            rC4_2 += rA4 * rB2;
            rC5_2 += rA5 * rB2;
            rC6_2 += rA6 * rB2;
            rC7_2 += rA7 * rB2;
            rC0_3 += rA0 * rB3;
            rC1_3 += rA1 * rB3;
            rC2_3 += rA2 * rB3;
            rC3_3 += rA3 * rB3;
            rC4_3 += rA4 * rB3;
            rC5_3 += rA5 * rB3;
            rC6_3 += rA6 * rB3;
            rC7_3 += rA7 * rB3;
            rC0_4 += rA0 * rB4;
            rC1_4 += rA1 * rB4;
            rC2_4 += rA2 * rB4;
            rC3_4 += rA3 * rB4;
            rC4_4 += rA4 * rB4;
            rC5_4 += rA5 * rB4;
            rC6_4 += rA6 * rB4;
            rC7_4 += rA7 * rB4;
            rC0_5 += rA0 * rB5;
            rC1_5 += rA1 * rB5;
            rC2_5 += rA2 * rB5;
            rC3_5 += rA3 * rB5;
            rC4_5 += rA4 * rB5;
            rC5_5 += rA5 * rB5;
            rC6_5 += rA6 * rB5;
            rC7_5 += rA7 * rB5;
            rC0_6 += rA0 * rB6;
            rC1_6 += rA1 * rB6;
            rC2_6 += rA2 * rB6;
            rC3_6 += rA3 * rB6;
            rC4_6 += rA4 * rB6;
            rC5_6 += rA5 * rB6;
            rC6_6 += rA6 * rB6;
            rC7_6 += rA7 * rB6;
            rC0_7 += rA0 * rB7;
            rC1_7 += rA1 * rB7;
            rC2_7 += rA2 * rB7;
            rC3_7 += rA3 * rB7;
            rC4_7 += rA4 * rB7;
            rC5_7 += rA5 * rB7;
            rC6_7 += rA6 * rB7;
            rC7_7 += rA7 * rB7;
            pA0 += incAk;
            pA1 += incAk;
            pA2 += incAk;
            pA3 += incAk;
            pA4 += incAk;
            pA5 += incAk;
            pA6 += incAk;
            pA7 += incAk;
            pB0 += incBk;
            pB1 += incBk;
            pB2 += incBk;
            pB3 += incBk;
            pB4 += incBk;
            pB5 += incBk;
            pB6 += incBk;
            pB7 += incBk;
         }
         /* Write the finished tile back to C */
         #ifdef TREAL
            *pC0   = rC0_0; pC0[1] = rC1_0; pC0[2] = rC2_0; pC0[3] = rC3_0;
            pC0[4] = rC4_0; pC0[5] = rC5_0; pC0[6] = rC6_0; pC0[7] = rC7_0;
            *pC1   = rC0_1; pC1[1] = rC1_1; pC1[2] = rC2_1; pC1[3] = rC3_1;
            pC1[4] = rC4_1; pC1[5] = rC5_1; pC1[6] = rC6_1; pC1[7] = rC7_1;
            *pC2   = rC0_2; pC2[1] = rC1_2; pC2[2] = rC2_2; pC2[3] = rC3_2;
            pC2[4] = rC4_2; pC2[5] = rC5_2; pC2[6] = rC6_2; pC2[7] = rC7_2;
            *pC3   = rC0_3; pC3[1] = rC1_3; pC3[2] = rC2_3; pC3[3] = rC3_3;
            pC3[4] = rC4_3; pC3[5] = rC5_3; pC3[6] = rC6_3; pC3[7] = rC7_3;
            *pC4   = rC0_4; pC4[1] = rC1_4; pC4[2] = rC2_4; pC4[3] = rC3_4;
            pC4[4] = rC4_4; pC4[5] = rC5_4; pC4[6] = rC6_4; pC4[7] = rC7_4;
            *pC5   = rC0_5; pC5[1] = rC1_5; pC5[2] = rC2_5; pC5[3] = rC3_5;
            pC5[4] = rC4_5; pC5[5] = rC5_5; pC5[6] = rC6_5; pC5[7] = rC7_5;
            *pC6   = rC0_6; pC6[1] = rC1_6; pC6[2] = rC2_6; pC6[3] = rC3_6;
            pC6[4] = rC4_6; pC6[5] = rC5_6; pC6[6] = rC6_6; pC6[7] = rC7_6;
            *pC7   = rC0_7; pC7[1] = rC1_7; pC7[2] = rC2_7; pC7[3] = rC3_7;
            pC7[4] = rC4_7; pC7[5] = rC5_7; pC7[6] = rC6_7; pC7[7] = rC7_7;
         #else
            *pC0   = rC0_0; pC0[ 2] = rC1_0; pC0[ 4] = rC2_0; pC0[ 6] = rC3_0;
            pC0[8] = rC4_0; pC0[10] = rC5_0; pC0[12] = rC6_0; pC0[14] = rC7_0;
            *pC1   = rC0_1; pC1[ 2] = rC1_1; pC1[ 4] = rC2_1; pC1[ 6] = rC3_1;
            pC1[8] = rC4_1; pC1[10] = rC5_1; pC1[12] = rC6_1; pC1[14] = rC7_1;
            *pC2   = rC0_2; pC2[ 2] = rC1_2; pC2[ 4] = rC2_2; pC2[ 6] = rC3_2;
            pC2[8] = rC4_2; pC2[10] = rC5_2; pC2[12] = rC6_2; pC2[14] = rC7_2;
            *pC3   = rC0_3; pC3[ 2] = rC1_3; pC3[ 4] = rC2_3; pC3[ 6] = rC3_3;
            pC3[8] = rC4_3; pC3[10] = rC5_3; pC3[12] = rC6_3; pC3[14] = rC7_3;
            *pC4   = rC0_4; pC4[ 2] = rC1_4; pC4[ 4] = rC2_4; pC4[ 6] = rC3_4;
            pC4[8] = rC4_4; pC4[10] = rC5_4; pC4[12] = rC6_4; pC4[14] = rC7_4;
            *pC5   = rC0_5; pC5[ 2] = rC1_5; pC5[ 4] = rC2_5; pC5[ 6] = rC3_5;
            pC5[8] = rC4_5; pC5[10] = rC5_5; pC5[12] = rC6_5; pC5[14] = rC7_5;
            *pC6   = rC0_6; pC6[ 2] = rC1_6; pC6[ 4] = rC2_6; pC6[ 6] = rC3_6;
            pC6[8] = rC4_6; pC6[10] = rC5_6; pC6[12] = rC6_6; pC6[14] = rC7_6;
            *pC7   = rC0_7; pC7[ 2] = rC1_7; pC7[ 4] = rC2_7; pC7[ 6] = rC3_7;
            pC7[8] = rC4_7; pC7[10] = rC5_7; pC7[12] = rC6_7; pC7[14] = rC7_7;
         #endif
         /* Next tile down: C and A move on 8 rows, B rewinds by K */
         pC0 += incCm; pC1 += incCm; pC2 += incCm; pC3 += incCm;
         pC4 += incCm; pC5 += incCm; pC6 += incCm; pC7 += incCm;
         pA0 += incAm; pA1 += incAm; pA2 += incAm; pA3 += incAm;
         pA4 += incAm; pA5 += incAm; pA6 += incAm; pA7 += incAm;
         pB0 += incBm; pB1 += incBm; pB2 += incBm; pB3 += incBm;
         pB4 += incBm; pB5 += incBm; pB6 += incBm; pB7 += incBm;
      }
      while(pA0 != stM);
      /* Next 8 columns: C and B step right, A rewinds to its first row */
      pC0 += incCn; pC1 += incCn; pC2 += incCn; pC3 += incCn;
      pC4 += incCn; pC5 += incCn; pC6 += incCn; pC7 += incCn;
      pA0 += incAn; pA1 += incAn; pA2 += incAn; pA3 += incAn;
      pA4 += incAn; pA5 += incAn; pA6 += incAn; pA7 += incAn;
      pB0 += incBn; pB1 += incBn; pB2 += incBn; pB3 += incBn;
      pB4 += incBn; pB5 += incBn; pB6 += incBn; pB7 += incBn;
   }
   while(pB0 != stN);
}
/*
 * Drop every helper name the kernel above may have introduced as a macro so
 * that nothing leaks into code that follows.  #undef is ignored for a name
 * that is not currently a macro, so no #ifdef guards are required.
 */
#undef incAm
#undef incAn
#undef incAk
#undef incBm
#undef incBn
#undef incBk
#undef incCm
#undef incCn
#undef incCk
#undef Mb
#undef Nb
#undef Kb
