/*
 * Fixed-size matrix-multiply kernel: for every column j in [0,NB) and every
 * row i in [0,MB) it computes
 *
 *    C[i + j*ldc] = (beta-scaled C[i + j*ldc]) + sum_{k<KB} Arow_i[k] * Bcol_j[k]
 *
 * Two rows of C are produced per inner iteration and the K loop is unrolled
 * by 8, which is why KB must be a multiple of 8 and MB a multiple of 2.
 *
 * Layout, as implied by the indexing below:
 *   - B: column j starts at B + j*KB and is read contiguously.
 *   - A: the row pair (i, i+1) starts at A + (i/2)*KB2; row i is read at
 *        offsets [0,KB) and row i+1 at offsets [KB,2*KB).
 *        NOTE(review): KB2 is defined outside this block; presumably 2*KB.
 *   - C: element (i,j) lives at C[i + j*ldc].
 *
 * Handling of the previous contents of C is chosen at compile time:
 *   BETA0 defined -> C is overwritten (accumulators start at 0.0)
 *   BETA1 defined -> C is accumulated into unchanged
 *   otherwise     -> C is multiplied by beta first
 *
 * Parameters:
 *   M, N, K   not read here; the loop bounds are the macros MB, NB and KB.
 *   alpha     not read here; no alpha scaling is applied.
 *   A, B      input operands (see layout above).
 *   lda, ldb  not read here; the strides are taken from KB / KB2.
 *   beta      scale factor for C, used only when neither BETA0 nor BETA1
 *             is defined.
 *   C, ldc    output matrix and its leading dimension.
 */
void ATL_USERMM
   (const int M, const int N, const int K, const double alpha, const double *A, const int lda, const double *B, const int ldb, const double beta, double *C, const int ldc)
{
   int i, j, k;
   register double c00, c10, b0;
   const double *pA0, *pB=B;

/*
 * Reject block sizes that the 8-way K unrolling / 2-way M unrolling below
 * cannot handle, with a readable diagnostic.
 */
#if ( (KB / 8)*8 != KB ) || ( (MB / 2)*2 != MB )
   #error "ATL_USERMM: KB must be a multiple of 8 and MB a multiple of 2"
#endif
   for (j=0; j < NB; j++, pB += KB)   /* pB walks the columns of B */
   {
      pA0 = A;
      for (i=0; i < MB; i += 2, pA0 += KB2)   /* pA0 walks row pairs of A */
      {
         /* Initialise the two accumulators according to the beta case. */
         #ifdef BETA0
            c00 = c10 = 0.0;
         #elif defined(BETA1)
            c00 = C[i+j*ldc];
            c10 = C[i+1+j*ldc];
         #else
            c00 = beta*C[i+j*ldc];
            c10 = beta*C[i+1+j*ldc];
         #endif
         /*
          * Dot products of rows i and i+1 of A with column j of B.
          * Each loaded B element is reused for both rows.  The order of
          * the accumulations is kept exactly as written so results stay
          * bit-identical.
          */
         for (k=0; k < KB; k += 8)
         {
            b0 = pB[k];
            c00 += pA0[k] * b0;
            c10 += pA0[KB+k] * b0;
            b0 = pB[k+1];
            c00 += pA0[k+1] * b0;
            c10 += pA0[KB+k+1] * b0;
            b0 = pB[k+2];
            c00 += pA0[k+2] * b0;
            c10 += pA0[KB+k+2] * b0;
            b0 = pB[k+3];
            c00 += pA0[k+3] * b0;
            c10 += pA0[KB+k+3] * b0;
            b0 = pB[k+4];
            c00 += pA0[k+4] * b0;
            c10 += pA0[KB+k+4] * b0;
            b0 = pB[k+5];
            c00 += pA0[k+5] * b0;
            c10 += pA0[KB+k+5] * b0;
            b0 = pB[k+6];
            c00 += pA0[k+6] * b0;
            c10 += pA0[KB+k+6] * b0;
            b0 = pB[k+7];
            c00 += pA0[k+7] * b0;
            c10 += pA0[KB+k+7] * b0;
         }
         /* Write the finished pair of results back to C. */
         C[i+j*ldc] = c00;
         C[i+1+j*ldc] = c10;
      }
   }
}
