Has anyone heard of this? I have an AMD Athlon 2400+, and I gather the memcpy function is slow on these computers. I found a function that optimizes the copy somehow. Here it is, but I'm not sure how to get it into my program. I tried adding the file to the project and then using the memcpy_amd( ) function like the memcpy one, but the compiler said it was an "undeclared identifier". Then I tried copying and pasting the whole .cpp file into my C++ source file, and that gave errors in the assembly code itself. The Readme file that came with it didn't say anything about how to use it. Here is the code:
Can anyone help? Supposedly it produces significant gains.
[code]
// Very optimized memcpy() routine for all AMD Athlon and Duron family.
// This code uses any of FOUR different basic copy methods, depending
// on the transfer size.
// NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
// "Streaming Store"), and also uses the software prefetchnta instructions,
// be sure you're running on Athlon/Duron or other recent CPU before calling!
// Size thresholds (in bytes unless noted) that select the copy method.
// Every expression-valued macro is parenthesized so that it expands safely
// inside a larger expression such as `n / IN_CACHE_COPY`.

// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop".
#define TINY_BLOCK_COPY 64 // upper limit for movsd type copy

// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
// also using the "unrolled loop" optimization. This code uses
// the software prefetch instruction to get the data into the cache.
#define IN_CACHE_COPY (64 * 1024) // upper limit for movq/movq copy w/SW prefetch

// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"
#define UNCACHED_COPY (197 * 1024) // upper limit for movq/movntq w/SW prefetch

// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations. Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch. The technique is great for
// getting maximum read bandwidth, especially in DDR memory systems.
// NOTE: BLOCK_PREFETCH_COPY is documentation only in the code visible here;
// `infinity` is not defined, so the macro must not be expanded in an expression.
#define BLOCK_PREFETCH_COPY infinity // no limit for movq/movntq w/block prefetch
// Written in C notation (0x80 == 80h == 128) so the value is valid both in
// C/C++ expressions and inside __asm blocks.
#define CACHEBLOCK 0x80 // number of 64-byte blocks (cache lines) for block prefetch
// Inline assembly syntax for use with Visual C++
void * memcpy_amd(void *dest, const void *src, size_t n)
{
__asm {
mov ecx, [n] ; number of bytes to copy
mov edi, [dest] ; destination
mov esi, [src] ; source
mov ebx, ecx ; keep a copy of count
cld
cmp ecx, TINY_BLOCK_COPY
jb $memcpy_ic_3 ; tiny? skip mmx copy
cmp ecx, 32*1024 ; don't align between 32k-64k because
jbe $memcpy_do_align ; it appears to be slower
cmp ecx, 64*1024
jbe $memcpy_align_done
$memcpy_do_align:
mov ecx, 8 ; a trick that's faster than rep movsb...
sub ecx, edi ; align destination to qword
and ecx, 111b ; get the low bits
sub ebx, ecx ; update copy count
neg ecx ; set up to jump into the array
add ecx, offset $memcpy_align_done
jmp ecx ; jump to array of movsb's
align 4
movsb
movsb
movsb
movsb
movsb
movsb
movsb
movsb
$memcpy_align_done: ; destination is dword aligned
mov ecx, ebx ; number of bytes left to copy
shr ecx, 6 ; get 64-byte block count
jz $memcpy_ic_2 ; finish the last few bytes
cmp ecx, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy
jae $memcpy_uc_test
// This is small block copy that uses the MMX registers to copy 8 bytes
// at a time. It uses the "unrolled loop" optimization, and also uses
// the software prefetch instruction to get the data into the cache.
align 16
$memcpy_ic_1: ; 64-byte block copies, in-cache copy
prefetchnta [esi + (200*64/34+192)] ; start reading ahead
movq mm0, [esi+0] ; read 64 bits
movq mm1, [esi+8]
movq [edi+0], mm0 ; write 64 bits
movq [edi+8], mm1 ; note: the normal movq writes the
movq mm2, [esi+16] ; data to cache; a cache line will be
movq mm3, [esi+24] ; allocated as needed, to store the data
movq [edi+16], mm2
movq [edi+24], mm3
movq mm0, [esi+32]
movq mm1, [esi+40]
movq [edi+32], mm0
movq [edi+40], mm1
movq mm2, [esi+48]
movq mm3, [esi+56]
movq [edi+48], mm2
movq [edi+56], mm3
add esi, 64 ; update source pointer
add edi, 64 ; update destination pointer
dec ecx ; count down
jnz $memcpy_ic_1 ; last 64-byte block?
$memcpy_ic_2:
mov ecx, ebx ; has valid low 6 bits of the byte count
$memcpy_ic_3:
shr ecx, 2 ; dword count
and ecx, 1111b ; only look at the "remainder" bits
neg ecx ; set up to jump into the array
add ecx, offset $memcpy_last_few
jmp ecx ; jump to array of movsd's
$memcpy_uc_test:
cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy
jae $memcpy_bp_1
$memcpy_64_test:
or ecx, ecx ; tail end of block prefetch will jump here
jz $memcpy_ic_2 ; no more 64-byte blocks left
// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
align 16
$memcpy_uc_1: ; 64-byte blocks, uncached copy
prefetchnta [esi + (200*64/34+192)] ; start reading ahead
movq mm0,[esi+0] ; read 64 bits
add edi,64 ; update destination pointer
movq mm1,[esi+8]
add esi,64 ; update source pointer
movq mm2,[esi-48]
movntq [edi-64], mm0 ; write 64 bits, bypassing the cache
movq mm0,[esi-40] ; note: movntq also prevents the CPU
movntq [edi-56], mm1 ; from READING the destination address
movq mm1,[esi-32] ; into the cache, only to be over-written
movntq [edi-48], mm2 ; so that also helps performance
movq mm2,[esi-24]
movntq [edi-40], mm0
movq mm0,[esi-16]
movntq [edi-32], mm1
movq mm1,[esi-8]
movntq [edi-24], mm2
movntq [edi-16], mm0
dec ecx
movntq [edi-8], mm1
jnz $memcpy_uc_1 ; last 64-byte block?
jmp $memcpy_ic_2 ; almost done
// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations. Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch, in this case.
// The technique is great for getting maximum read bandwidth,
// especially in DDR memory systems.
$memcpy_bp_1: ; large blocks, block prefetch copy
cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop?
jl $memcpy_64_test ; no, back to regular uncached copy
mov eax, CACHEBLOCK / 2 ; block prefetch loop, unrolled 2X
add esi, CACHEBLOCK * 64 ; move to the top of the block
align 16
$memcpy_bp_2:
mov edx, [esi-64] ; grab one address per cache line
mov edx, [esi-128] ; grab one address per cache line
sub esi, 128 ; go reverse order
dec eax ; count down the cache lines
jnz $memcpy_bp_2 ; keep grabbing more lines into cache
mov eax, CACHEBLOCK ; now that it's in cache, do the copy
align 16
$memcpy_bp_3:
movq mm0, [esi ] ; read 64 bits
movq mm1, [esi+ 8]
movq mm2, [esi+16]
movq mm3, [esi+24]
movq mm4, [esi+32]
movq mm5, [esi+40]
movq mm6, [esi+48]
movq mm7, [esi+56]
add esi, 64 ; update source pointer
movntq [edi ], mm0 ; write 64 bits, bypassing cache
movntq [edi+ 8], mm1 ; note: movntq also prevents the CPU
movntq [edi+16], mm2 ; from READING the destination address
movntq [edi+24], mm3 ; into the cache, only to be over-written,
movntq [edi+32], mm4 ; so that also helps performance
movntq [edi+40], mm5
movntq [edi+48], mm6
movntq [edi+56], mm7
add edi, 64 ; update dest pointer
dec eax ; count down
jnz $memcpy_bp_3 ; keep copying
sub ecx, CACHEBLOCK ; update the 64-byte block count
jmp $memcpy_bp_1 ; keep processing chunks
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop". Then it handles the last few bytes.
align 4
movsd
movsd ; perform last 1-15 dword copies
movsd
movsd
movsd
movsd
movsd
movsd
movsd
movsd ; perform last 1-7 dword copies
movsd
movsd
movsd
movsd
movsd
movsd
$memcpy_last_few: ; dword aligned from before movsd's
mov ecx, ebx ; has valid low 2 bits of the byte count
and ecx, 11b ; the last few cows must come home
jz $memcpy_final ; no more, let's leave
rep movsb ; the last 1, 2, or 3 bytes
$memcpy_final:
emms ; clean up the MMX state
sfence ; flush the write buffer
mov eax, [dest] ; ret value = destination pointer
}
}
[/code]
Basically, what do I need to include in my files. Please be specific because I don't know very much c++ .
Comments
: Basically, what do I need to include in my files. Please be specific because I don't know very much c++ .
:
It compiles OK with the Microsoft VC++ 6.0 compiler. If you use a different compiler you might have to change __asm to something else. See your compiler's documentation about how to use embedded assembly language.
: : Basically, what do I need to include in my files. Please be specific because I don't know very much c++ .
: :
:
: It compiles OK with the Microsoft VC++ 6.0 compiler. If you use a different compiler you might have to change __asm to something else. See your compiler's documentation about how to use embedded assembly language.
:
It compiles with Microsoft VC++ 6.0? That is what I'm using. What includes do you use? Do you just add it to your project and compile the whole project? When I compiled it, it had a problem with the [ ] in the .cpp file. I give up. How much more speed do you think it will give me? I use memcpy about 4 times, and every one of those 4 times it copies a 400-element array. Why couldn't they make it a header file or something? I can use those.
It gives me errors and says this:
Compiling...
memcpy.cpp
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(84) : error C2400: inline assembler syntax error in 'opcode'; found '['
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(132) : error C2400: inline assembler syntax error in 'opcode'; found '['
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(139) : error C2400: inline assembler syntax error in 'opcode'; found '['
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(141) : error C2400: inline assembler syntax error in 'opcode'; found '['
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(143) : error C2400: inline assembler syntax error in 'opcode'; found '['
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(145) : error C2400: inline assembler syntax error in 'opcode'; found '['
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(147) : error C2400: inline assembler syntax error in 'opcode'; found '['
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(149) : error C2400: inline assembler syntax error in 'opcode'; found '['
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(150) : error C2400: inline assembler syntax error in 'opcode'; found '['
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(152) : error C2400: inline assembler syntax error in 'opcode'; found '['
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(190) : error C2400: inline assembler syntax error in 'opcode'; found '['
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(191) : error C2400: inline assembler syntax error in 'opcode'; found '['
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(192) : error C2400: inline assembler syntax error in 'opcode'; found '['
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(193) : error C2400: inline assembler syntax error in 'opcode'; found '['
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(194) : error C2400: inline assembler syntax error in 'opcode'; found '['
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(195) : error C2400: inline assembler syntax error in 'opcode'; found '['
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(196) : error C2400: inline assembler syntax error in 'opcode'; found '['
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(197) : error C2400: inline assembler syntax error in 'opcode'; found '['
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(235) : error C2400: inline assembler syntax error in 'opcode'; found 'newline'
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(238) : warning C4035: 'memcpy_amd' : no return value
Error executing cl.exe.
memcpy.obj - 19 error(s), 1 warning(s)
Anyone help ?
I don't think it's worth the effort anyway. Microsoft's implementations of memcpy(), strcpy() and a whole lot of other functions are already written in assembly language.
Have you installed the Processor Pack with that 6.0 version of MSVC++? I don't think I have. If I do have it, how would I know? Maybe that is why it won't compile. I've got an AMD Athlon 2400+, so it shouldn't be the CPU.
:
: Have you installed the processor pack with that 6.0 version of MSVC++ . I don't think I have. If I do have it, how do I know . Maybe that is why it won't compile. I've got an AMD Athlon 2400+ so it shouldn't be that.
:
I have VC++ 6.0 Pro edition. I don't know anything about a processor pack. I have Service Pack #5 installed. But I thought in-line assembly was a standard feature of the compiler.
OK. I downloaded the Processor Pack and it helped, but I still get two errors and three warnings:
Compiling...
memcpy_amd2.cpp
c:\program files\microsoft visual studio\memcpy_amd2.cpp(146) : warning C4035: 'get_cpu_type' : no return value
c:\program files\microsoft visual studio\memcpy_amd2.cpp(527) : warning C4035: 'memcpy_optimized' : no return value
c:\program files\microsoft visual studio\memcpy_amd2.cpp(764) : warning C4035: 'memset_optimized' : no return value
c:\program files\microsoft visual studio\memcpy_amd2.cpp(767) : error C2556: 'void __stdcall memzero_optimized(void *,unsigned int)' : overloaded function differs only by return type from 'void *__stdcall memzero_optimized(void *,unsigned int)'
c:\program files\microsoft visual studio\memcpy_amd.h(39) : see declaration of 'memzero_optimized'
c:\program files\microsoft visual studio\memcpy_amd2.cpp(767) : error C2040: 'memzero_optimized' : 'void (void *,unsigned int)' differs in levels of indirection from 'void *(void *,unsigned int)'
Error executing cl.exe.
memcpy_amd2.obj - 2 error(s), 3 warning(s)
I did have about 70 so we are getting there. But maybe this is going to be where you get rid of one error only to get 20 more later.