24,854
社区成员
发帖
与我相关
我的任务
分享
#include <intrin.h>
#include <emmintrin.h>
#include <mmintrin.h>
#include <stdio.h>
extern "C" {
#define _CRT_RAND_S
#include <stdlib.h>
}
#pragma auto_inline(on)
#pragma inline_recursion(on)
template <unsigned LEN>
inline void MemCopy(void *pDest, const void* pSrc)
{
if(LEN >= 16)
{
__m128i temp;
for(unsigned i=0; i < LEN / 16; i++)
{
temp = _mm_load_si128(&((const __m128i*)pSrc)[i]);
_mm_store_si128(&((__m128i*)pDest)[i], temp);
}
MemCopy<LEN - (LEN / 16) * 16>(
&((unsigned char*)pDest)[(LEN / 16) * 16],
&((const unsigned char*)pSrc)[(LEN / 16) * 16]
);
}
else if(LEN >= 8)
{
*(__m64*)pDest = *(const __m64*)pSrc;
_mm_empty();
MemCopy<LEN - (LEN / 8) * 8>(
&((unsigned char*)pDest)[(LEN / 8) * 8],
&((const unsigned char*)pSrc)[(LEN / 8) * 8]
);
}
else if(LEN >= 4)
{
*(unsigned*)pDest = *(const unsigned*)pSrc;
MemCopy<LEN - (LEN / 4) * 4>(
&((unsigned char*)pDest)[(LEN / 4) * 4],
&((const unsigned char*)pSrc)[(LEN / 4) * 4]
);
}
else if(LEN >= 2)
{
*(unsigned short*)pDest = *(const unsigned short*)pSrc;
MemCopy<LEN - (LEN / 2) * 2>(
&((unsigned char*)pDest)[(LEN / 2) * 2],
&((const unsigned char*)pSrc)[(LEN / 2) * 2]
);
}
else if(LEN == 1)
{
*(unsigned char*)pDest = *(const unsigned char*)pSrc;
}
}
// Source and destination buffers for the copy benchmark. Both are 1024 bytes
// and declared 16-byte aligned, matching the aligned 128-bit loads and stores
// MemCopy issues on them.
__declspec(align(16)) unsigned char src[1024];
__declspec(align(16)) unsigned char dst[1024];
// Prepares the benchmark buffers: fills all 1024 bytes of src with random
// data, one 32-bit word at a time, and clears dst to zero.
static void init(void)
{
    unsigned *const srcWords = (unsigned*)src;
    unsigned *const dstWords = (unsigned*)dst;
    unsigned v;

    for(int word = 0; word != 1024 / 4; ++word)
    {
        // rand_s is an MSVC-specific "secure" random number function;
        // v = rand(); can be used in its place.
        rand_s(&v);
        srcWords[word] = v;
        dstWords[word] = 0;
    }
}
// Checks that the first 1023 bytes of dst match src. Prints the index of the
// first mismatch and stops, or prints a success message when all bytes agree.
static void validate(void)
{
    int pos = 0;

    while(pos < 1023)
    {
        if(src[pos] == dst[pos])
        {
            ++pos;
            continue;
        }
        printf("Error @: %d\n", pos);
        return;
    }
    puts("Succedded!");
}
int main(void)
{
unsigned __int64 beginTicks, endTicks;
beginTicks = __rdtsc();
for(int i=0; i<10; i++)
{
MemCopy<1023>(dst, src);
__asm nop //为了避免循环被优化
}
endTicks = __rdtsc();
validate();
printf("The number of cycles used to copy is: %I64u\n", endTicks - beginTicks);
}
beginTicks = __rdtsc();
00401006 rdtsc
00401008 push ebx
00401009 push ebp
0040100A push esi
0040100B push edi
0040100C mov ebx,eax
0040100E mov ebp,edx
00401010 mov esi,0Ah
for(int i=0; i<10; i++)
{
MemCopy<1023>(dst, src);
00401015 call MemCopy<1023> (401080h)
__asm nop //为了避免循环被优化
0040101A nop
0040101B sub esi,1
0040101E jne main+15h (401015h)
}
endTicks = __rdtsc();
#pragma auto_inline(on)
#pragma inline_recursion(on)
template <unsigned LEN>
inline void MemCopy(void *pDest, const void* pSrc)
{
if(LEN >= 16)
{
__m128i temp;
for(unsigned i=0; i < LEN / 16; i++)
00401080 xor eax,eax
00401082 jmp MemCopy<1023>+10h (401090h)
00401084 lea esp,[esp]
0040108B jmp MemCopy<1023>+10h (401090h)
0040108D lea ecx,[ecx]
{
temp = _mm_load_si128(&((const __m128i*)pSrc)[i]);
00401090 movdqa xmm0,xmmword ptr src (40E6F0h)[eax]
_mm_store_si128(&((__m128i*)pDest)[i], temp);
00401098 movdqa xmmword ptr dst (40E2F0h)[eax],xmm0
004010A0 add eax,10h
004010A3 cmp eax,3F0h
004010A8 jb MemCopy<1023>+10h (401090h)
}
MemCopy<LEN - (LEN / 16) * 16>(
&((unsigned char*)pDest)[(LEN / 16) * 16],
&((const unsigned char*)pSrc)[(LEN / 16) * 16]
);
004010AA movq mm0,mmword ptr [src+3F0h (40EAE0h)]
004010B1 movq mmword ptr [dst+3F0h (40E6E0h)],mm0
004010B8 emms
004010BA mov eax,dword ptr [src+3F8h (40EAE8h)]
004010BF mov cx,word ptr [src+3FCh (40EAECh)]
004010C6 mov dl,byte ptr [src+3FEh (40EAEEh)]
004010CC mov dword ptr [dst+3F8h (40E6E8h)],eax
004010D1 mov word ptr [dst+3FCh (40E6ECh)],cx
004010D8 mov byte ptr [dst+3FEh (40E6EEh)],dl
#include <stdio.h>
#include <stdlib.h>
// GCC inline-assembly wrappers (x86 only, AT&T syntax).
//
// *_load(reg, mem)  reads `mem` into the named register; `mem` is an input
//                   operand ("m") and the register is listed as clobbered.
// *_store(mem, reg) writes the named register to `mem`, an output ("=m").
//
// Every statement is `volatile` so the compiler does not delete or merge
// them. The value travels between a load and its matching store in a
// hard-coded register the compiler is not tracking, so keep each load/store
// pair directly adjacent with no other code in between.
#define SSE_MOVDQA_store(memDst, xmmSrc) __asm__ volatile("movdqa %%" #xmmSrc ", %0" : "=m"(memDst))
#define SSE_MOVDQA_load(xmmDst, memSrc) __asm__ volatile("movdqa %0, %%" #xmmDst : : "m"(memSrc) : #xmmDst)
#define MMX_MOVQ_store(memDst, mmxSrc) __asm__ volatile("movq %%" #mmxSrc ", %0" : "=m"(memDst))
#define MMX_MOVQ_load(mmxDst, memSrc) __asm__ volatile("movq %0, %%" #mmxDst : : "m"(memSrc) : #mmxDst)
// Leaves MMX state so the x87 FPU can be used again.
#define MMX_EMMS() __asm__ volatile("emms")
// Reads the time-stamp counter into the 64-bit lvalue `tick`. rdtsc returns
// the value split across EDX:EAX; the halves are captured separately because
// the "A" constraint only denotes that register pair on 32-bit x86.
#define INTRIN_RDTSC(tick) do { \
    unsigned tscLo_, tscHi_; \
    __asm__ volatile("rdtsc" : "=a"(tscLo_), "=d"(tscHi_)); \
    (tick) = ((unsigned long long)tscHi_ << 32) | tscLo_; \
} while (0)
// 128-bit and 64-bit vector types used only to give the memory operands above
// the right size.
typedef int __attribute__ ((vector_size (16))) __m128i;
typedef int __attribute__ ((vector_size (8))) __m64;
template <unsigned LEN>
inline void MemCopy(void *pDest, const void* pSrc)
{
if(LEN >= 16)
{
for(unsigned i=0; i < LEN / 16; i++)
{
SSE_MOVDQA_load(xmm0, ((__m128i*)pSrc)[i]);
SSE_MOVDQA_store(((__m128i*)pDest)[i], xmm0);
}
MemCopy<LEN - (LEN / 16) * 16>(
&((unsigned char*)pDest)[(LEN / 16) * 16],
&((const unsigned char*)pSrc)[(LEN / 16) * 16]
);
}
else if(LEN >= 8)
{
MMX_MOVQ_load(mm0, *(__m64*)pSrc);
MMX_MOVQ_store(*(__m64*)pDest, mm0);
MMX_EMMS();
MemCopy<LEN - (LEN / 8) * 8>(
&((unsigned char*)pDest)[(LEN / 8) * 8],
&((const unsigned char*)pSrc)[(LEN / 8) * 8]
);
}
else if(LEN >= 4)
{
*(unsigned*)pDest = *(const unsigned*)pSrc;
MemCopy<LEN - (LEN / 4) * 4>(
&((unsigned char*)pDest)[(LEN / 4) * 4],
&((const unsigned char*)pSrc)[(LEN / 4) * 4]
);
}
else if(LEN >= 2)
{
*(unsigned short*)pDest = *(const unsigned short*)pSrc;
MemCopy<LEN - (LEN / 2) * 2>(
&((unsigned char*)pDest)[(LEN / 2) * 2],
&((const unsigned char*)pSrc)[(LEN / 2) * 2]
);
}
else if(LEN == 1)
{
*(unsigned char*)pDest = *(const unsigned char*)pSrc;
}
}
// Source and destination buffers for the copy benchmark. Both are 1024 bytes
// and declared 16-byte aligned, matching the aligned movdqa accesses MemCopy
// issues on them.
unsigned char __attribute__ ((aligned (16))) src[1024];
unsigned char __attribute__ ((aligned (16))) dst[1024];
// Prepares the benchmark buffers: writes one rand() value into each 32-bit
// word of src and clears the matching word of dst, covering all 1024 bytes.
static void init(void)
{
    unsigned *const srcWords = (unsigned*)src;
    unsigned *const dstWords = (unsigned*)dst;

    for(int word = 0; word != 1024 / 4; ++word)
    {
        const unsigned value = rand();
        srcWords[word] = value;
        dstWords[word] = 0;
    }
}
// Checks that the first 1023 bytes of dst match src. Reports the index of the
// first differing byte and returns, or prints a success message if none
// differ.
static void validate(void)
{
    const unsigned char *expected = src;
    const unsigned char *actual = dst;

    for(int offset = 0; offset != 1023; ++offset)
    {
        if(expected[offset] == actual[offset])
            continue;

        printf("Error @: %d\n", offset);
        return;
    }
    puts("Succedded!");
}
int main(void)
{
unsigned long long beginTicks, endTicks;
INTRIN_RDTSC(beginTicks);
for(int i=0; i<10; i++)
{
MemCopy<1023>(dst, src);
__asm__ volatile("mov %eax, %eax"); //为了避免循环被优化
}
INTRIN_RDTSC(endTicks);
validate();
printf("The number of cycles used to copy is: %u\n", (unsigned)(endTicks - beginTicks));
}