/*
 *             Automatically Tuned Linear Algebra Software v3.5.0
 *                    (C) Copyright 2002 R. Clint Whaley
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions, and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *   3. The name of the ATLAS group or the names of its contributers may
 *      not be used to endorse or promote products derived from this
 *      software without specific written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 * Build-time requirements: the file must be assembled by gas for x86-32,
 * and the blocking factor NB must be the compile-time constant 72 (the
 * K loop below is unrolled for exactly that length, 6 rows at a time).
 */
#if !defined(ATL_GAS_x8632)
   #error "This kernel requires gas x86-32 assembler!"
#endif
#if !defined(NB) || (NB == 0)
   #error "NB must be a compile-time constant!"
#endif
#if (NB != 72)
   #error "NB must be 72!"
#endif
#if (NB % 6) != 0
   #error "NB must be multiple of 6!"
#endif
#
#  Integer register usage shown by these defines
#
/* Integer registers: roles as assigned and used by ATL_USERMM in this file */
#define pC      %esi	/* current write position in C                      */
#define pA      %ecx	/* current 6-row panel of A                         */
#define pB      %edi	/* current column of B                              */
#define incCn   %eax	/* bytes added to pC after each pass of the M loop  */
#define stM     %edx	/* M-loop sentinel: loop exits when pA == stM       */
#define stN     %ebx	/* N-loop sentinel: loop exits when pB == stN       */
#define pfA     %ebp	/* byte offset used as index by the pref2 prefetches */

/* SSE2 registers: six packed accumulators plus two operand/scratch regs */
#define rC0	%xmm0
#define rC1	%xmm1
#define rC2	%xmm2
#define rC3	%xmm3
#define rC4	%xmm4
#define rC5	%xmm5
/* rA0 receives loads from A (and beta under BETAX); also reduction scratch */
#define rA0	%xmm6
/* rB0 receives loads from B; also reduction scratch */
#define rB0	%xmm7

/*
 * Byte strides.  NBso is NB elements of 8 bytes each; NB<k>so is k such
 * strides and NBNBso is NB of them (one full NB x NB block).
 */
#define NBso	(NB*8)
#define NBNBso  (NB*NBso)
#define NB2so   (2*NBso)
#define NB3so   (3*NBso)
#define NB4so   (4*NBso)
#define NB5so   (5*NBso)
#define NB6so   (6*NBso)
#define NB7so   (7*NBso)
#define NB8so   (8*NBso)
#define NB9so   (9*NBso)
#define NB10so   (10*NBso)
#define NB11so   (11*NBso)

/*
 * Prefetch wrappers.  Each takes a complete memory operand (parenthesised
 * when it contains a comma).  Change the condition below to 0 to compile
 * every prefetch out of the kernel.
 */
#if 1
#define pref2(addr) prefetcht1	addr
#define prefB(addr) prefetcht0	addr
#define prefC(addr) prefetcht0	addr
#else
#define pref2(addr)
#define prefB(addr)
#define prefC(addr)
#endif
/*
 * void ATL_USERMM(const int M, const int N, const int K, const TYPE alpha,
 *                 const TYPE *A, const int lda, const TYPE *B, const int ldb,
 *                 const TYPE beta, TYPE *C, const int ldc)
 *
 * Fixed-size double precision block multiply for NB = 72 (x86-32, SSE2).
 *
 * What the code reads, relative to the stack pointer at entry:
 *    24: A      32: B      40: beta (only when BETAX)      48: C      52: ldc
 * M, N, K, alpha, lda and ldb are never read: all loop bounds and strides
 * come from the compile-time constant NB, and no scaling by alpha is done.
 *
 * Operation, with i stepping over NB row-strips of A and j over NB
 * column-strips of B (each strip is NB contiguous doubles, NBso bytes
 * apart):
 *    C[i + j*ldc] = beta' * C[i + j*ldc] + sum_k A[k + i*NB] * B[k + j*NB]
 * where beta' is 0 under BETA0 (C is not read), the beta argument under
 * BETAX, and 1 otherwise.  Under DCPLX consecutive C elements are 16 bytes
 * apart instead of 8 and ldc is scaled by 16 instead of 8.
 *
 * A and B are loaded with movapd at offsets that are multiples of 16, so
 * both base pointers must be 16-byte aligned.  C is accessed with
 * movsd/movlpd/movhpd/movupd and needs no special alignment.
 *
 * Callee-saved ebx, esi, edi and ebp are preserved; eax, ecx, edx and
 * xmm0-xmm7 are clobbered.
 */

/*
 * KSTEP6(off): one packed step (two k values) of the six dot products.
 * Loads the B pair at byte offset off, multiplies it by the matching pair
 * from each of the six A strips, and accumulates into rC0..rC5.
 * Clobbers rA0 and rB0.  Statements are joined with the gas ';' separator
 * because a cpp macro expands onto a single line.
 */
#define KSTEP6(o_) \
	movapd	o_(pB), rB0 ; \
	movapd	o_(pA), rA0 ; \
	mulpd	rB0, rA0 ; \
	addpd	rA0, rC0 ; \
	movapd	o_+NBso(pA), rA0 ; \
	mulpd	rB0, rA0 ; \
	addpd	rA0, rC1 ; \
	movapd	o_+NB2so(pA), rA0 ; \
	mulpd	rB0, rA0 ; \
	addpd	rA0, rC2 ; \
	movapd	o_+NB3so(pA), rA0 ; \
	mulpd	rB0, rA0 ; \
	addpd	rA0, rC3 ; \
	movapd	o_+NB4so(pA), rA0 ; \
	mulpd	rB0, rA0 ; \
	addpd	rA0, rC4 ; \
	movapd	o_+NB5so(pA), rA0 ; \
	mulpd	rB0, rA0 ; \
	addpd	rA0, rC5

	.text
.global ATL_USERMM
	.type	ATL_USERMM,@function
ATL_USERMM:
#
#	Keep the entry stack pointer in eax: the arguments are addressed
#	through it once esp has been moved (and, under BETAX, realigned).
#
	movl	%esp, %eax
   #ifdef BETAX
#
#	Frame layout after 16-byte alignment of esp:
#	   0..15 beta pair, 16 entry esp, 20 edi, 24 esi, 28 ebx, 32 ebp
#
      #define BETAOFF 0
	subl	$36, %esp
	andl	$-16, %esp
	movl	%ebp, 32(%esp)
	movl	%ebx, 28(%esp)
	movl	%esi, 24(%esp)
	movl	%edi, 20(%esp)
	movl	%eax, 16(%esp)
	movlpd	40(%eax), rC0
	unpcklpd	rC0, rC0
	movapd	rC0, BETAOFF(%esp)
   #else
	subl	$16, %esp
	movl	%ebp, 12(%esp)
	movl	%ebx,  8(%esp)
	movl	%esi,  4(%esp)
	movl	%edi,   (%esp)
   #endif
#
#	Initialize pA = A;  pB = B; pC = C;
#
	movl	24(%eax), pA
	movl	32(%eax), pB
	movl	48(%eax), pC
#
#	stM = pA + NB*NB;  stN = pB + NB*NB;  pfA = 0
#	(pfA is a byte offset applied past stM and stN by the prefetches)
#
	movl	$NBNBso, stM
	addl	pA, stM
	xorl	pfA, pfA
	movl	$NBNBso, stN
	addl	pB, stN
#
#	Set incCn = (ldc - NB)*sizeof.  This overwrites eax, so every
#	stack argument has to be fetched before this point.  NB, not MB, is
#	used because NB is the only blocking factor this file validates and
#	it is what bounds the M loop below.
#
	movl	52(%eax), incCn
	subl	$NB, incCn
   #ifdef DCPLX
	shl	$4, incCn
   #else
	shl	$3, incCn
   #endif
NLOOP:
MLOOP:
#
#	Seed the six accumulators: zero, C, or beta*C.  Only the low half
#	carries the C value; movsd from memory zeroes the high half.
#
#ifdef BETA0
	xorpd	rC0, rC0
	xorpd	rC1, rC1
	xorpd	rC2, rC2
	xorpd	rC3, rC3
	xorpd	rC4, rC4
	xorpd	rC5, rC5
#else
   #ifdef DCPLX
	movsd	(pC), rC0
	movsd	16(pC), rC1
	movsd	32(pC), rC2
	movsd	48(pC), rC3
	movsd	64(pC), rC4
	movsd	80(pC), rC5
   #else
	movsd	(pC), rC0
	movsd	8(pC), rC1
	movsd	16(pC), rC2
	movsd	24(pC), rC3
	movsd	32(pC), rC4
	movsd	40(pC), rC5
   #endif
   #ifdef BETAX
	movlpd	BETAOFF(%esp), rA0
	mulsd	rA0, rC0
	mulsd	rA0, rC1
	mulsd	rA0, rC2
	mulsd	rA0, rC3
	mulsd	rA0, rC4
	mulsd	rA0, rC5
   #endif
#endif
#
#	Fully unrolled K loop: 36 packed steps of 16 bytes = 72 doubles
#
	KSTEP6(0)
	KSTEP6(16)
	KSTEP6(32)
	KSTEP6(48)
	KSTEP6(64)
	KSTEP6(80)
	KSTEP6(96)
	KSTEP6(112)
	KSTEP6(128)
	KSTEP6(144)
	KSTEP6(160)
	KSTEP6(176)
	KSTEP6(192)
	KSTEP6(208)
	KSTEP6(224)
	KSTEP6(240)
	KSTEP6(256)
	KSTEP6(272)
	KSTEP6(288)
	KSTEP6(304)
	KSTEP6(320)
	KSTEP6(336)
	KSTEP6(352)
	KSTEP6(368)
	KSTEP6(384)
#
#	Prefetch past the end of the B block
#
				pref2((stN,pfA))
	KSTEP6(400)
	KSTEP6(416)
	KSTEP6(432)
	KSTEP6(448)
	KSTEP6(464)
	KSTEP6(480)
	KSTEP6(496)
	KSTEP6(512)
	KSTEP6(528)
#
#	Prefetch past the end of the A block, then advance the offset.
#	48 bytes per pass * 12 passes per column * 72 columns = NBNBso,
#	so one call sweeps one block-sized region past each of A and B.
#
				pref2((stM,pfA))
				addl	$48, pfA
	KSTEP6(544)
	KSTEP6(560)

#
#       Horizontal reduction: add the two halves of each accumulator and
#       pack the six sums pairwise into rC0, rC2 and rC4
#
                                        # rC0 = c0a  c0b
                                        # rC1 = c1a  c1b
                                        # rC2 = c2a  c2b
                                        # rC3 = c3a  c3b
#
                                        # rC4 = c4a  c4b
                                        # rC5 = c5a  c5b
        movapd          rC0, rA0
        movapd          rC4, rB0
        unpcklpd        rC1, rC0        # rC0 = c0a  c1a
        unpcklpd        rC5, rC4        # rC4 = c4a  c5a
        unpckhpd        rC1, rA0        # rA0 = c0b  c1b
        unpckhpd        rC5, rB0        # rB0 = c4b  c5b
        addpd           rA0, rC0        # rC0 = c0ab c1ab
        addpd           rB0, rC4        # rC4 = c4ab c5ab
        movapd          rC2, rA0
        unpcklpd        rC3, rC2        # rC2 = c2a  c3a
        unpckhpd        rC3, rA0        # rA0 = c2b  c3b
        addpd           rA0, rC2        # rC2 = c2ab c3ab
#
#
#	Write results back to C
#
   #ifdef DCPLX
	movlpd	rC0, (pC)
	movhpd	rC0, 16(pC)
	movlpd	rC2, 32(pC)
	movhpd	rC2, 48(pC)
	movlpd	rC4, 64(pC)
	movhpd	rC4, 80(pC)
   #else
	movupd	rC0, (pC)
	movupd	rC2, 16(pC)
	movupd	rC4, 32(pC)
   #endif
#
#	pC += 6;   pA += 6*NB
#
   #ifdef DCPLX
	addl	$96, pC
   #else
	addl	$48, pC
   #endif
	addl	$NB6so, pA
#
#       while (pA != stM);
#
	cmp	pA, stM
	jne	MLOOP

#
#	pC += incCn;  pA -= NBNB;  pB += NB;
#
	addl	incCn, pC
	subl	$NBNBso, pA
	addl	$NBso, pB
#
#	while (pB != stN);
#
	cmp	pB, stN
	jne	NLOOP

#
#	Restore callee-saved iregs (and, under BETAX, the entry esp)
#
   #ifdef BETAX
	movl	32(%esp), %ebp
	movl	28(%esp), %ebx
	movl	24(%esp), %esi
	movl	20(%esp), %edi
	movl	16(%esp), %esp
   #else
	movl	12(%esp), %ebp
	movl	 8(%esp), %ebx
	movl	 4(%esp), %esi
	movl	  (%esp), %edi
	addl	$16, %esp
   #endif
	ret
