C and C++

Moderators: None (Apply to moderate this forum)
Number of threads: 28691
Number of posts: 94711

This Forum Only
Post New Thread
Single Post View       Linear View       Threaded View      f

Report
memcpy_amd( ) How to use it Posted by theredpill99 on 19 Aug 2004 at 5:02 PM
Anyone heard of this? I have an AMD Athlon 2400+, and I gather the memcpy function is slow on these computers. I found a function that optimizes the copy somehow. Here it is, but I'm not sure how to use it in my program. I tried adding the file to the project and then using the memcpy_amd( ) function like the memcpy one, but it said it was an "undeclared identifier". Then I tried copying and pasting the whole .cpp file into my C++ source file, and that gave errors with the assembly code itself. The Readme file that came with it didn't say anything about how to use it. Here is the code:
Can anyone help? Supposedly it produces significant gains.

// Very optimized memcpy() routine for all AMD Athlon and Duron family.
// This code uses any of FOUR different basic copy methods, depending
// on the transfer size.
// NOTE:  Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
// "Streaming Store"), and also uses the software prefetchnta instructions,
// be sure you're running on Athlon/Duron or other recent CPU before calling!

// Size thresholds (in bytes) that select the copy method used by memcpy_amd().
// Every expression macro is fully parenthesized so that it expands safely
// inside a larger expression (e.g. "x / IN_CACHE_COPY" or "IN_CACHE_COPY/64").

#define TINY_BLOCK_COPY 64       // upper limit for movsd type copy
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop".

#define IN_CACHE_COPY (64 * 1024)  // upper limit for movq/movq copy w/SW prefetch
// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
// also using the "unrolled loop" optimization.   This code uses
// the software prefetch instruction to get the data into the cache.

#define UNCACHED_COPY (197 * 1024) // upper limit for movq/movntq w/SW prefetch
// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ.   This write instruction
// bypasses the cache and writes straight to main memory.  This code also
// uses the software prefetch instruction to pre-read the data.
// USE (64 * 1024) FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"

// Documentation-only placeholder: block prefetch has no upper size limit.
// The name is never referenced by the code, so "infinity" is never expanded.
#define BLOCK_PREFETCH_COPY  infinity // no limit for movq/movntq w/block prefetch 
// 128 (80h) cache lines = 8 KB handled per block-prefetch pass.  Written in
// decimal so the macro is valid both in C/C++ expressions and in inline asm.
#define CACHEBLOCK 128 // number of 64-byte blocks (cache lines) for block prefetch
// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations.   Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch.  The technique is great for
// getting maximum read bandwidth, especially in DDR memory systems.

// Inline assembly syntax for use with Visual C++

void * memcpy_amd(void *dest, const void *src, size_t n)
{
  __asm {

	mov		ecx, [n]		; number of bytes to copy
	mov		edi, [dest]		; destination
	mov		esi, [src]		; source
	mov		ebx, ecx		; keep a copy of count

	cld
	cmp		ecx, TINY_BLOCK_COPY
	jb		$memcpy_ic_3	; tiny? skip mmx copy

	cmp		ecx, 32*1024		; don't align between 32k-64k because
	jbe		$memcpy_do_align	;  it appears to be slower
	cmp		ecx, 64*1024
	jbe		$memcpy_align_done
$memcpy_do_align:
	mov		ecx, 8			; a trick that's faster than rep movsb...
	sub		ecx, edi		; align destination to qword
	and		ecx, 111b		; get the low bits
	sub		ebx, ecx		; update copy count
	neg		ecx				; set up to jump into the array
	add		ecx, offset $memcpy_align_done
	jmp		ecx				; jump to array of movsb's

align 4
	movsb
	movsb
	movsb
	movsb
	movsb
	movsb
	movsb
	movsb

$memcpy_align_done:			; destination is dword aligned
	mov		ecx, ebx		; number of bytes left to copy
	shr		ecx, 6			; get 64-byte block count
	jz		$memcpy_ic_2	; finish the last few bytes

	cmp		ecx, IN_CACHE_COPY/64	; too big 4 cache? use uncached copy
	jae		$memcpy_uc_test

// This is small block copy that uses the MMX registers to copy 8 bytes
// at a time.  It uses the "unrolled loop" optimization, and also uses
// the software prefetch instruction to get the data into the cache.
align 16
$memcpy_ic_1:			; 64-byte block copies, in-cache copy

	prefetchnta [esi + (200*64/34+192)]		; start reading ahead

	movq	mm0, [esi+0]	; read 64 bits
	movq	mm1, [esi+8]
	movq	[edi+0], mm0	; write 64 bits
	movq	[edi+8], mm1	;    note:  the normal movq writes the
	movq	mm2, [esi+16]	;    data to cache; a cache line will be
	movq	mm3, [esi+24]	;    allocated as needed, to store the data
	movq	[edi+16], mm2
	movq	[edi+24], mm3
	movq	mm0, [esi+32]
	movq	mm1, [esi+40]
	movq	[edi+32], mm0
	movq	[edi+40], mm1
	movq	mm2, [esi+48]
	movq	mm3, [esi+56]
	movq	[edi+48], mm2
	movq	[edi+56], mm3

	add		esi, 64			; update source pointer
	add		edi, 64			; update destination pointer
	dec		ecx				; count down
	jnz		$memcpy_ic_1	; last 64-byte block?

$memcpy_ic_2:
	mov		ecx, ebx		; has valid low 6 bits of the byte count
$memcpy_ic_3:
	shr		ecx, 2			; dword count
	and		ecx, 1111b		; only look at the "remainder" bits
	neg		ecx				; set up to jump into the array
	add		ecx, offset $memcpy_last_few
	jmp		ecx				; jump to array of movsd's

$memcpy_uc_test:
	cmp		ecx, UNCACHED_COPY/64	; big enough? use block prefetch copy
	jae		$memcpy_bp_1

$memcpy_64_test:
	or		ecx, ecx		; tail end of block prefetch will jump here
	jz		$memcpy_ic_2	; no more 64-byte blocks left

// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ.   This write instruction
// bypasses the cache and writes straight to main memory.  This code also
// uses the software prefetch instruction to pre-read the data.
align 16
$memcpy_uc_1:				; 64-byte blocks, uncached copy

	prefetchnta [esi + (200*64/34+192)]		; start reading ahead

	movq	mm0,[esi+0]		; read 64 bits
	add		edi,64			; update destination pointer
	movq	mm1,[esi+8]
	add		esi,64			; update source pointer
	movq	mm2,[esi-48]
	movntq	[edi-64], mm0	; write 64 bits, bypassing the cache
	movq	mm0,[esi-40]	;    note: movntq also prevents the CPU
	movntq	[edi-56], mm1	;    from READING the destination address
	movq	mm1,[esi-32]	;    into the cache, only to be over-written
	movntq	[edi-48], mm2	;    so that also helps performance
	movq	mm2,[esi-24]
	movntq	[edi-40], mm0
	movq	mm0,[esi-16]
	movntq	[edi-32], mm1
	movq	mm1,[esi-8]
	movntq	[edi-24], mm2
	movntq	[edi-16], mm0
	dec		ecx
	movntq	[edi-8], mm1
	jnz		$memcpy_uc_1	; last 64-byte block?

	jmp		$memcpy_ic_2		; almost done

// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations.   Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch, in this case.
// The technique is great for getting maximum read bandwidth,
// especially in DDR memory systems.
$memcpy_bp_1:			; large blocks, block prefetch copy

	cmp		ecx, CACHEBLOCK			; big enough to run another prefetch loop?
	jl		$memcpy_64_test			; no, back to regular uncached copy

	mov		eax, CACHEBLOCK / 2		; block prefetch loop, unrolled 2X
	add		esi, CACHEBLOCK * 64	; move to the top of the block
align 16
$memcpy_bp_2:
	mov		edx, [esi-64]		; grab one address per cache line
	mov		edx, [esi-128]		; grab one address per cache line
	sub		esi, 128			; go reverse order
	dec		eax					; count down the cache lines
	jnz		$memcpy_bp_2		; keep grabbing more lines into cache

	mov		eax, CACHEBLOCK		; now that it's in cache, do the copy
align 16
$memcpy_bp_3:
	movq	mm0, [esi   ]		; read 64 bits
	movq	mm1, [esi+ 8]
	movq	mm2, [esi+16]
	movq	mm3, [esi+24]
	movq	mm4, [esi+32]
	movq	mm5, [esi+40]
	movq	mm6, [esi+48]
	movq	mm7, [esi+56]
	add		esi, 64				; update source pointer
	movntq	[edi   ], mm0		; write 64 bits, bypassing cache
	movntq	[edi+ 8], mm1		;    note: movntq also prevents the CPU
	movntq	[edi+16], mm2		;    from READING the destination address 
	movntq	[edi+24], mm3		;    into the cache, only to be over-written,
	movntq	[edi+32], mm4		;    so that also helps performance
	movntq	[edi+40], mm5
	movntq	[edi+48], mm6
	movntq	[edi+56], mm7
	add		edi, 64				; update dest pointer

	dec		eax					; count down

	jnz		$memcpy_bp_3		; keep copying
	sub		ecx, CACHEBLOCK		; update the 64-byte block count
	jmp		$memcpy_bp_1		; keep processing chunks

// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop".   Then it handles the last few bytes.
align 4
	movsd
	movsd			; perform last 1-15 dword copies
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd			; perform last 1-7 dword copies
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd

$memcpy_last_few:		; dword aligned from before movsd's
	mov		ecx, ebx	; has valid low 2 bits of the byte count
	and		ecx, 11b	; the last few cows must come home
	jz		$memcpy_final	; no more, let's leave
	rep		movsb		; the last 1, 2, or 3 bytes

$memcpy_final: 
	emms				; clean up the MMX state
	sfence				; flush the write buffer
	mov		eax, [dest]	; ret value = destination pointer

    }
}


Basically, what do I need to include in my files. Please be specific because I don't know very much c++ .
Report
Re: memcpy_amd( ) How to use it Posted by stober on 19 Aug 2004 at 5:15 PM
:
: Basically, what do I need to include in my files. Please be specific because I don't know very much c++ .
:

It compiles OK with the Microsoft VC++ 6.0 compiler. If you use a different compiler you might have to change _asm to something else. See your compiler's documentation about how to use embedded assembly language.
Report
Re: memcpy_amd( ) How to use it Posted by theredpill99 on 20 Aug 2004 at 1:26 PM
: :
: : Basically, what do I need to include in my files. Please be specific because I don't know very much c++ .
: :
:
: It compiles OK with the Microsoft VC++ 6.0 compiler. If you use a different compiler you might have to change _asm to something else. See your compiler's documentation about how to use embedded assembly language.
:
It compiles with Microsoft VC++ 6.0? That is what I'm using. What includes do you use? Do you just add it to your project and compile the whole project? When I compiled it, it had a problem with the [ ] in the .cpp file. I give up. How much more speed do you think it will give me? I use memcpy about 4 times, and every one of those 4 times it copies a 400-element array. Why couldn't they make it a header file or something? I can use those.
Report
This is what it says Posted by theredpill99 on 20 Aug 2004 at 1:30 PM
Ok, copied and pasted it into a new project and new c++ file.

It gives me errors and says this:

Compiling...
memcpy.cpp
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(84) : error C2400: inline assembler syntax error in 'opcode'; found '['
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(132) : error C2400: inline assembler syntax error in 'opcode'; found '['
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(139) : error C2400: inline assembler syntax error in 'opcode'; found '['
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(141) : error C2400: inline assembler syntax error in 'opcode'; found '['
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(143) : error C2400: inline assembler syntax error in 'opcode'; found '['
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(145) : error C2400: inline assembler syntax error in 'opcode'; found '['
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(147) : error C2400: inline assembler syntax error in 'opcode'; found '['
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(149) : error C2400: inline assembler syntax error in 'opcode'; found '['
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(150) : error C2400: inline assembler syntax error in 'opcode'; found '['
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(152) : error C2400: inline assembler syntax error in 'opcode'; found '['
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(190) : error C2400: inline assembler syntax error in 'opcode'; found '['
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(191) : error C2400: inline assembler syntax error in 'opcode'; found '['
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(192) : error C2400: inline assembler syntax error in 'opcode'; found '['
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(193) : error C2400: inline assembler syntax error in 'opcode'; found '['
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(194) : error C2400: inline assembler syntax error in 'opcode'; found '['
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(195) : error C2400: inline assembler syntax error in 'opcode'; found '['
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(196) : error C2400: inline assembler syntax error in 'opcode'; found '['
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(197) : error C2400: inline assembler syntax error in 'opcode'; found '['
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(235) : error C2400: inline assembler syntax error in 'opcode'; found 'newline'
c:\program files\microsoft visual studio\myprojects\memcpyamd\memcpy.cpp(238) : warning C4035: 'memcpy_amd' : no return value
Error executing cl.exe.

memcpy.obj - 19 error(s), 1 warning(s)


Anyone help ?
Report
Re: memcpy_amd( ) How to use it Posted by stober on 20 Aug 2004 at 1:41 PM
Have you changed the code in your original post? Now it won't compile for me because of many unknown assembly instructions such as "movntq" and "prefetchnta". These instructions don't exist in the 80x86 family of microprocessors.

I don't think it's worth the effort anyway. Microsoft's implementations of memcpy(), strcpy() and a whole lot of other functions are already written in assembly language.
Report
MSVC++ 6.0 Posted by theredpill99 on 20 Aug 2004 at 3:45 PM
Stober,

Have you installed the processor pack with that 6.0 version of MSVC++ . I don't think I have. If I do have it, how do I know . Maybe that is why it won't compile. I've got an AMD Athlon 2400+ so it shouldn't be that.
Report
Re: MSVC++ 6.0 Posted by stober on 20 Aug 2004 at 4:35 PM
: Stober,
:
: Have you installed the processor pack with that 6.0 version of MSVC++ . I don't think I have. If I do have it, how do I know . Maybe that is why it won't compile. I've got an AMD Athlon 2400+ so it shouldn't be that.
:


I have VC++ 6.0 Pro edition. I don't know anything about a processor pack. I have Service Pack #5 installed. But I thought in-line assembly was a standard feature of the compiler.
Report
Re: MSVC++ 6.0 Posted by theredpill99 on 20 Aug 2004 at 4:50 PM
This message was edited by theredpill99 at 2004-8-20 16:53:2

OK. I downloaded the Processor Pack and it helped, but I still get two errors and three warnings:

Compiling...
memcpy_amd2.cpp
c:\program files\microsoft visual studio\memcpy_amd2.cpp(146) : warning C4035: 'get_cpu_type' : no return value
c:\program files\microsoft visual studio\memcpy_amd2.cpp(527) : warning C4035: 'memcpy_optimized' : no return value
c:\program files\microsoft visual studio\memcpy_amd2.cpp(764) : warning C4035: 'memset_optimized' : no return value
c:\program files\microsoft visual studio\memcpy_amd2.cpp(767) : error C2556: 'void __stdcall memzero_optimized(void *,unsigned int)' : overloaded function differs only by return type from 'void *__stdcall memzero_optimized(void *,unsigned int)'
c:\program files\microsoft visual studio\memcpy_amd.h(39) : see declaration of 'memzero_optimized'
c:\program files\microsoft visual studio\memcpy_amd2.cpp(767) : error C2040: 'memzero_optimized' : 'void (void *,unsigned int)' differs in levels of indirection from 'void *(void *,unsigned int)'
Error executing cl.exe.

memcpy_amd2.obj - 2 error(s), 3 warning(s)


I did have about 70 so we are getting there. But maybe this is going to be where you get rid of one error only to get 20 more later.


Report
Re: MSVC++ 6.0 Posted by theredpill99 on 20 Aug 2004 at 5:09 PM
You're right. It's having a problem with prefetchnta now. It doesn't support that even though I'm running an Athlon computer. Anything that I can do about this or am I wasting my time?



 

Recent Jobs

Official Programmer's Heaven Blogs
Web Hosting | Browser and Social Games | Gadgets

Popular resources on Programmersheaven.com
Assembly | Basic | C | C# | C++ | Delphi | Flash | Java | JavaScript | Pascal | Perl | PHP | Python | Ruby | Visual Basic
© Copyright 2011 Programmersheaven.com - All rights reserved.
Reproduction in whole or in part, in any form or medium without express written permission is prohibited.
Violators of this policy may be subject to legal action. Please read our Terms Of Use and Privacy Statement for more information.
Operated by CommunityHeaven, a BootstrapLabs company.