Hi,
How many 64-bit registers can I use inside an Intel i7 CPU for storage purposes, so that I can feed them into XMM registers later? I currently use only XMM0-15, MM0-7 and R8-15. I know I can use RAX, RBX, RCX, RDX and the eight registers inside the FPU (ST0-ST7), but which others can I use? Can I use the stack registers? Thanks in advance.
I have attached my application code in case it is needed.
///////////////////////////////////////////
pipe_line_math.h
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
/*
 * Multiply 96 consecutive 16-bit elements of `data` by the corresponding
 * elements of `rands`, in place.  Only the low 16 bits of each product are
 * kept (PMULLW), so the result is data[i] = (data[i] * rands[i]) mod 65536.
 *
 * data  - in/out, 96 elements (0xC0 bytes); must be 16-byte aligned (MOVDQA).
 * rands - input,  96 elements (0xC0 bytes); must be 16-byte aligned (MOVDQA).
 *
 * x86-64 only: the routine uses xmm8-xmm15 and r8-r15.  All operands are
 * loaded into registers first (XMM, MMX, and r8-r15 as a parking area) and
 * the arithmetic is done afterwards.
 *
 * The MMX registers share storage with the x87 register stack, so any x87
 * value that is live across this call is destroyed; EMMS is issued before
 * returning so that x87 code executed afterwards works normally.
 */
void pipe_mult_ushort(ushort *data,ushort *rands)
{
__asm__ __volatile__(".intel_syntax noprefix\n\t"
// Pull as much data as possible into the CPU up front.
// Pointers are 64-bit, so address through rdi/rsi (edi/esi would drop the
// upper half of the address).
"movdqa xmm0,[rdi]\n\t" // bytes 0x00-0x7F -> xmm0..xmm15
"movdqa xmm1,[rsi]\n\t"
"movdqa xmm2,[rdi+0x10]\n\t"
"movdqa xmm3,[rsi+0x10]\n\t"
"movdqa xmm4,[rdi+0x20]\n\t"
"movdqa xmm5,[rsi+0x20]\n\t"
"movdqa xmm6,[rdi+0x30]\n\t"
"movdqa xmm7,[rsi+0x30]\n\t"
"movdqa xmm8,[rdi+0x40]\n\t"
"movdqa xmm9,[rsi+0x40]\n\t"
"movdqa xmm10,[rdi+0x50]\n\t"
"movdqa xmm11,[rsi+0x50]\n\t"
"movdqa xmm12,[rdi+0x60]\n\t"
"movdqa xmm13,[rsi+0x60]\n\t"
"movdqa xmm14,[rdi+0x70]\n\t"
"movdqa xmm15,[rsi+0x70]\n\t"
"movq mm0,QWORD PTR [rdi+0x80]\n\t" // bytes 0x80-0x9F -> mm0..mm7
"movq mm1,QWORD PTR [rsi+0x80]\n\t"
"movq mm2,QWORD PTR [rdi+0x88]\n\t"
"movq mm3,QWORD PTR [rsi+0x88]\n\t"
"movq mm4,QWORD PTR [rdi+0x90]\n\t"
"movq mm5,QWORD PTR [rsi+0x90]\n\t"
"movq mm6,QWORD PTR [rdi+0x98]\n\t"
"movq mm7,QWORD PTR [rsi+0x98]\n\t"
"mov r8,QWORD PTR [rdi+0xA0]\n\t" // bytes 0xA0-0xBF parked in r8..r15
"mov r9,QWORD PTR [rsi+0xA0]\n\t"
"mov r10,QWORD PTR [rdi+0xA8]\n\t"
"mov r11,QWORD PTR [rsi+0xA8]\n\t"
"mov r12,QWORD PTR [rdi+0xB0]\n\t"
"mov r13,QWORD PTR [rsi+0xB0]\n\t"
"mov r14,QWORD PTR [rdi+0xB8]\n\t"
"mov r15,QWORD PTR [rsi+0xB8]\n\t"
// Every register used for storage is now filled; do the arithmetic.
// XMM products first (even register *= odd register).
"pmullw xmm0,xmm1\n\t"
"pmullw xmm2,xmm3\n\t"
"pmullw xmm4,xmm5\n\t"
"pmullw xmm6,xmm7\n\t"
"pmullw xmm8,xmm9\n\t"
"pmullw xmm10,xmm11\n\t"
"pmullw xmm12,xmm13\n\t"
"pmullw xmm14,xmm15\n\t"
// MMX products second.
"pmullw mm0,mm1\n\t"
"pmullw mm2,mm3\n\t"
"pmullw mm4,mm5\n\t"
"pmullw mm6,mm7\n\t"
// Store the XMM results.
"movdqa [rdi],xmm0\n\t"
"movdqa [rdi+0x10],xmm2\n\t"
"movdqa [rdi+0x20],xmm4\n\t"
"movdqa [rdi+0x30],xmm6\n\t"
"movdqa [rdi+0x40],xmm8\n\t"
"movdqa [rdi+0x50],xmm10\n\t"
"movdqa [rdi+0x60],xmm12\n\t"
"movdqa [rdi+0x70],xmm14\n\t"
// Store the MMX results.
"movq QWORD PTR [rdi+0x80],mm0\n\t"
"movq QWORD PTR [rdi+0x88],mm2\n\t"
"movq QWORD PTR [rdi+0x90],mm4\n\t"
"movq QWORD PTR [rdi+0x98],mm6\n\t"
// The MMX registers are free again: refill them from the parked values.
"movq mm0,r8\n\t"
"movq mm1,r9\n\t"
"movq mm2,r10\n\t"
"movq mm3,r11\n\t"
"movq mm4,r12\n\t"
"movq mm5,r13\n\t"
"movq mm6,r14\n\t"
"movq mm7,r15\n\t"
// Multiply the parked values.
"pmullw mm0,mm1\n\t"
"pmullw mm2,mm3\n\t"
"pmullw mm4,mm5\n\t"
"pmullw mm6,mm7\n\t"
// Store the last MMX results.
"movq QWORD PTR [rdi+0xA0],mm0\n\t"
"movq QWORD PTR [rdi+0xA8],mm2\n\t"
"movq QWORD PTR [rdi+0xB0],mm4\n\t"
"movq QWORD PTR [rdi+0xB8],mm6\n\t"
// Leave MMX state so that later x87 instructions see an empty stack.
"emms\n\t"
// Hand the assembler back in the dialect the compiler itself emits:
// AT&T by default, Intel when building with -masm=intel.
"{.att_syntax prefix|.intel_syntax noprefix}\n\t"
:
: "D" (data) ,"S" (rands)
: "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7",
"xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15",
"mm0","mm1","mm2","mm3","mm4","mm5","mm6","mm7",
"r8","r9","r10","r11","r12","r13","r14","r15",
"memory");
}
///////////////////////////////////////////
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "pipe_line_math.h"
/* Number of 16-bit elements in each benchmark array.  Parenthesised so the
   macro expands correctly inside larger expressions (e.g. x % _ARRAY_SIZE_). */
#define _ARRAY_SIZE_ (256*256*256*24)
/* Elements handled by one pipe_mult_ushort() call: it reads and writes bytes
   0x00..0xBF of each buffer, i.e. 0xC0 / sizeof(ushort) = 96 elements. */
#define _ELTS_PER_PIPE_ 96
/* Multipliers; 16-byte aligned because pipe_mult_ushort() loads with MOVDQA. */
ushort __attribute__ ((aligned (16))) rands[_ARRAY_SIZE_];
/* Values multiplied in place; 16-byte aligned for the same reason. */
ushort __attribute__ ((aligned (16))) data[_ARRAY_SIZE_];
/* Timestamps taken immediately before and after the timed loops. */
struct timespec tspec1;
struct timespec tspec2;
/*
 * Benchmark driver: fills both arrays with arbitrary values, runs
 * pipe_mult_ushort() over every complete chunk of the arrays twice, then
 * prints the two timestamps, the elapsed time and the first 64 results.
 * Because `data` is multiplied in place, the second pass multiplies the
 * results of the first pass by `rands` again.
 *
 * Returns 0 on success, EXIT_FAILURE if the clock cannot be read.
 */
int main(void) {
size_t i;
double diff;
for (i=0;i<_ARRAY_SIZE_;i++) { /* fill with any data */
rands[i]=(ushort)(i%4);
data[i]=(ushort)(i*2+i);
}
if (clock_gettime(CLOCK_REALTIME,&tspec1)!=0) {
perror("clock_gettime");
return EXIT_FAILURE;
}
/* i is an element index, so bound it by the element count; stop before a
   chunk that would run past the end of the arrays. */
for (i=0;i+_ELTS_PER_PIPE_<=_ARRAY_SIZE_;i=i+_ELTS_PER_PIPE_) pipe_mult_ushort(&data[i],&rands[i]);
for (i=0;i+_ELTS_PER_PIPE_<=_ARRAY_SIZE_;i=i+_ELTS_PER_PIPE_) pipe_mult_ushort(&data[i],&rands[i]); /* one more time */
if (clock_gettime(CLOCK_REALTIME,&tspec2)!=0) {
perror("clock_gettime");
return EXIT_FAILURE;
}
/* Subtract the fields before converting to double: the absolute timestamps
   are too large for a double to keep nanosecond resolution. */
diff=(double)(tspec2.tv_sec-tspec1.tv_sec)+(double)(tspec2.tv_nsec-tspec1.tv_nsec)/1000000000.0;
/* tv_sec and tv_nsec are not int; cast and print them as long. */
printf("time pipeline multiply:\nstart: %ld:%ld\n end: %ld:%ld ; total diff: %f\n",(long)tspec1.tv_sec,(long)tspec1.tv_nsec,(long)tspec2.tv_sec,(long)tspec2.tv_nsec,diff);
printf("sample data:\n");
for (i=0;i<64;i++) {
printf("%d,",data[i]);
if (!((i+1)%16)) printf("\n"); /* 16 values per output line */
}
return 0;
}