C vs ASM

Rebel · Post by **Rebel** » Tue Mar 05, 2013 3:01 pm

Following the heated discussion (C vs ASM), here is a small example showing that ASM can still pay off. You can compile (and run) the program below yourself. At program start it creates 6000 random 32-bit numbers, and then we bubble-sort them the classic way.

My Digital Mars compiler takes 22.1 secs to complete.
GCC 4.6.1 (32 bit) takes 22.2 secs
GCC 4.6.1 (64-bit) takes 22.0 secs

The hand tuned ASM version (BUBBLES=1) takes 14.7 seconds.

I compiled with GCC using the "-Ofast" option. Perhaps that's not the right one, as I am new to GCC.

Code: Select all

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define MAX_ENTRIES     6000

#define BUBBLES         0           // 0=C | 1=ASM

static int time1,time2;

static unsigned int key1&#91;MAX_ENTRIES+3&#93;;
static unsigned int key2&#91;MAX_ENTRIES+3&#93;;
static unsigned char byte1&#91;MAX_ENTRIES+3&#93;;
static unsigned char byte2&#91;MAX_ENTRIES+3&#93;;

//      Create random 32 bit keys
//      =========================

static unsigned long x_random&#91;55&#93; = &#123;
 1410651636UL, 3012776752UL, 3497475623UL, 2892145026UL, 1571949714UL,
 3253082284UL, 3489895018UL, 387949491UL,  2597396737UL, 1981903553UL,
 3160251843UL, 129444464UL,  1851443344UL, 4156445905UL, 224604922UL,
 1455067070UL, 3953493484UL, 1460937157UL, 2528362617UL, 317430674UL,
 3229354360UL, 117491133UL,  832845075UL,  1961600170UL, 1321557429UL,
 747750121UL,  545747446UL,  810476036UL,  503334515UL,  4088144633UL,
 2824216555UL, 3738252341UL, 3493754131UL, 3672533954UL, 29494241UL,
 1180928407UL, 4213624418UL, 33062851UL,   3221315737UL, 1145213552UL,
 2957984897UL, 4078668503UL, 2262661702UL, 65478801UL,   2527208841UL,
 1960622036UL, 315685891UL,  1196037864UL, 804614524UL,  1421733266UL,
 2017105031UL, 3882325900UL, 810735053UL,  384606609UL,  2393861397UL &#125;;

static int init_random = 1;
static unsigned long y_random&#91;55&#93;;
static int j_random, k_random;
static unsigned long ul_random;

void zobrist&#40;void&#41;;
void bubbles&#40;void&#41;;

int main &#40;int argc,char *argv&#91;&#93;)

&#123;       int x;

        printf&#40;"Creating %d Zobrist keys\n\n",MAX_ENTRIES&#41;;

        zobrist&#40;);                       // make 6.000 random 32-bit keys

        printf&#40;"Bubbling now...  ");

        time1=clock&#40;);

        bubbles&#40;);

        time2=clock&#40;);

        printf&#40;"Done in %d\n\n",time2-time1&#41;;

//      for &#40;x=0; x<20; x++) printf&#40;"%8x\t %8x\t (%2x&#41; (%2x&#41;\n",    // debug ASM code
//       key1&#91;x&#93;,key2&#91;x&#93;,byte1&#91;x&#93;,byte2&#91;x&#93;);

&#125;

void zobrist&#40;)

//      Create random 32-bit integers
//      =============================

&#123;       int x;

        if &#40;init_random&#41;
         &#123; int i;

           init_random = 0;
           for &#40;i = 0; i < 55; i++) y_random&#91;i&#93; = x_random&#91;i&#93;;
           j_random = 24 - 1;
           k_random = 55 - 1; &#125;

        for &#40;x=0; x<MAX_ENTRIES; x++)
         &#123; ul_random = &#40;y_random&#91;k_random&#93; += y_random&#91;j_random&#93;);
           if (--j_random < 0&#41; j_random = 55 - 1;
           if (--k_random < 0&#41; k_random = 55 - 1;
           key1&#91;x&#93;=0; key1&#91;x&#93;=key1&#91;x&#93; ^ ul_random;  // key1
           key2&#91;x&#93;=key1&#91;x&#93;;                         // key2 &#40;copy to check proper swapping&#41;
           byte1&#91;x&#93;=key1&#91;x&#93;;                        // byte1 &#40;lower 8 bits to check proper swapping&#41;
           byte2&#91;x&#93;=key1&#91;x&#93;>>24;                    // byte1 &#40;first 8 bits to check proper swapping&#41;

           &#125;

//      for &#40;x=0; x<20; x++) printf&#40;"%8x\t %8x\t (%2x&#41; (%2x&#41;\n",    // debug ASM code
//       key1&#91;x&#93;,key2&#91;x&#93;,byte1&#91;x&#93;,byte2&#91;x&#93;);
//      printf&#40;"\n");


&#125;


        #if BUBBLES==0

void bubbles&#40;)

&#123;       unsigned int zz; unsigned char ch; int r1,r2;

sort05&#58; r1=0; r2=-1;
sort10&#58; r1++; r2++;
        if &#40;r1>=MAX_ENTRIES&#41; return;
        if &#40;key1&#91;r2&#93; <= key1&#91;r1&#93;) goto sort10;
        zz=key1&#91;r1&#93;; key1&#91;r1&#93;=key1&#91;r2&#93;; key1&#91;r2&#93;=zz;    // swap
        zz=key2&#91;r1&#93;; key2&#91;r1&#93;=key2&#91;r2&#93;; key2&#91;r2&#93;=zz;
        ch=byte1&#91;r1&#93;; byte1&#91;r1&#93;=byte1&#91;r2&#93;; byte1&#91;r2&#93;=ch;
        ch=byte2&#91;r1&#93;; byte2&#91;r1&#93;=byte2&#91;r2&#93;; byte2&#91;r2&#93;=ch;
        goto sort05;                                    // again
&#125;
        #endif


        #if BUBBLES==1

void bubbles&#40;)

&#123;       asm &#123;

sort05&#58; xor EDX,EDX                         // r1=0
        mov EBX,0FFFFFFFFh                  // r2=-1

sort10&#58; add EDX,1                           // r1++
        add EBX,1                           // r2++

        cmp EDX,MAX_ENTRIES-1               // if &#40;r1 >= MAX_ENTRIES&#41; return
        jge done

        mov EAX,dword ptr key1&#91;EBX*4&#93;       // eax=key1&#91;r2&#93;
        cmp EAX,dword ptr key1&#91;EDX*4&#93;
        jbe sort10

        mov ECX,dword ptr key1&#91;EDX*4&#93;       // ecx=key1&#91;r1&#93;
        mov dword ptr key1&#91;EDX*4&#93;,EAX
        mov dword ptr key1&#91;EBX*4&#93;,ECX       // swap key1

        mov EDI,dword ptr key2&#91;EBX*4&#93;       // swap key2
        mov ESI,dword ptr key2&#91;EDX*4&#93;
        mov dword ptr key2&#91;EBX*4&#93;,ESI
        mov dword ptr key2&#91;EDX*4&#93;,EDI

        mov AL,byte ptr byte1&#91;EDX&#93;          // swap byte 1 & 2
        mov AH,byte ptr byte1&#91;EBX&#93;
        mov CL,byte ptr byte2&#91;EDX&#93;
        mov CH,byte ptr byte2&#91;EBX&#93;

        mov byte ptr byte1&#91;EDX&#93;,AH
        mov byte ptr byte1&#91;EBX&#93;,AL
        mov byte ptr byte2&#91;EDX&#93;,CH
        mov byte ptr byte2&#91;EBX&#93;,CL

        jmp sort05

            &#125;       // end of asm

done&#58;   return;

&#125;

        #endif

velmarin · Post by **velmarin** » Tue Mar 05, 2013 3:19 pm

C was little, almost nothing of ASM. Admittedly.

In all Engines codes I've seen,
many indeed, I have seen several that contained some ASM,
almost everyone was talking about not finding significant advantage.

The most recent one I remember is SlowChess: the version that comes with the GUI has parts in ASM, which the author later dropped for the next (latest) version. Admittedly, it uses a different board representation (not bitboard) and therefore has no intrinsics.

Evert · Post by **Evert** » Tue Mar 05, 2013 3:43 pm

Hmm.

Well, this is what I get from gcc (4.6.3) under Linux:

Code: Select all

$gcc -march=native -O3 bubble.c -o bubble_asm
bubble.c&#58; In function 'bubbles'&#58;
bubble.c&#58;119&#58;13&#58; error&#58; expected '(' before '&#123;' token
bubble.c&#58;121&#58;9&#58; error&#58; a label can only be part of a statement and a declaration is not a statement
bubble.c&#58;121&#58;9&#58; error&#58; unknown type name 'xor'
bubble.c&#58;122&#58;9&#58; error&#58; expected '=', ',', ';', 'asm' or '__attribute__' before 'mov'
bubble.c&#58;122&#58;17&#58; error&#58; invalid suffix "FFFFFFFFh" on integer constant
bubble.c&#58;159&#58;1&#58; error&#58; expected declaration or statement at end of input

Using Apple's archaic gcc (4.2.1) that comes with Mountain Lion:

Code: Select all

$gcc -O3 -march=core2 -msse4.2 -fasm-blocks bubble.c -o bubble_asm
bubble.c&#58;134&#58;junk `(%rip&#41;' after expression
bubble.c&#58;134&#58;32-bit absolute addressing is not supported for x86-64
bubble.c&#58;134&#58;cannot do signed 4 byte relocation
bubble.c&#58;137&#58;junk `(%rip&#41;' after expression
bubble.c&#58;137&#58;32-bit absolute addressing is not supported for x86-64
bubble.c&#58;137&#58;cannot do signed 4 byte relocation
bubble.c&#58;138&#58;junk `(%rip&#41;' after expression
bubble.c&#58;138&#58;32-bit absolute addressing is not supported for x86-64
bubble.c&#58;138&#58;cannot do signed 4 byte relocation
bubble.c&#58;141&#58;junk `(%rip&#41;' after expression
bubble.c&#58;141&#58;32-bit absolute addressing is not supported for x86-64
bubble.c&#58;141&#58;cannot do signed 4 byte relocation
bubble.c&#58;142&#58;junk `(%rip&#41;' after expression
bubble.c&#58;142&#58;32-bit absolute addressing is not supported for x86-64
bubble.c&#58;142&#58;cannot do signed 4 byte relocation
bubble.c&#58;142&#58;junk `(%rip&#41;' after expression
bubble.c&#58;142&#58;32-bit absolute addressing is not supported for x86-64
bubble.c&#58;142&#58;cannot do signed 4 byte relocation
bubble.c&#58;143&#58;junk `(%rip&#41;' after expression
bubble.c&#58;143&#58;32-bit absolute addressing is not supported for x86-64
bubble.c&#58;143&#58;cannot do signed 4 byte relocation
bubble.c&#58;146&#58;junk `(%rip&#41;' after expression
bubble.c&#58;146&#58;32-bit absolute addressing is not supported for x86-64
bubble.c&#58;146&#58;cannot do signed 4 byte relocation
bubble.c&#58;147&#58;junk `(%rip&#41;' after expression
bubble.c&#58;147&#58;32-bit absolute addressing is not supported for x86-64
bubble.c&#58;147&#58;cannot do signed 4 byte relocation
bubble.c&#58;147&#58;junk `(%rip&#41;' after expression
bubble.c&#58;148&#58;junk `(%rip&#41;' after expression
bubble.c&#58;149&#58;junk `(%rip&#41;' after expression
bubble.c&#58;150&#58;junk `(%rip&#41;' after expression
bubble.c&#58;154&#58;junk `(%rip&#41;' after expression
bubble.c&#58;155&#58;junk `(%rip&#41;' after expression
bubble.c&#58;156&#58;junk `(%rip&#41;' after expression
bubble.c&#58;157&#58;junk `(%rip&#41;' after expression

Using the gcc from MacPorts:

Code: Select all

$gcc -O3 -march=core2 -msse4.2 bubble.c -o bubble_asm
bubble.c&#58; In function 'bubbles'&#58;
bubble.c&#58;119&#58;13&#58; error&#58; expected '(' before '&#123;' token
bubble.c&#58;121&#58;9&#58; error&#58; 'xor' undeclared &#40;first use in this function&#41;
bubble.c&#58;121&#58;9&#58; note&#58; each undeclared identifier is reported only once for each function it appears in
bubble.c&#58;121&#58;13&#58; error&#58; expected ';' before 'EDX'
bubble.c&#58;122&#58;17&#58; error&#58; invalid suffix "FFFFFFFFh" on integer constant

mar · Post by **mar** » Tue Mar 05, 2013 4:00 pm

gcc expects AT&T assembly syntax.
This is Intel syntax, which I personally prefer.
I used to write a lot of modules in netwide assembler (NASM) and then linked with my C code.

Rein Halbersma · Post by **Rein Halbersma** » Tue Mar 05, 2013 5:00 pm

I think assembly programming is like soldering your lights directly to the ceiling. It can be done but why would you unless you are an electrician?

I like to go the other direction: adding abstraction for programming convenience. As a benchmark case, I decided to see if I could write an STL iterator interface over 64-bit bitboards, without an abstraction penalty. Here's the C++11 code http://liveworkspace.org/code/41EaZl$165 It allows you to write:

Code: Select all

 
    // initialize with pieces on 3 squares
    bitset x &#123;17, 31, 61&#125;;

    auto sum = 0; 
    for &#40;auto it = x.begin&#40;); it != x.end&#40;); ++it&#41; 
        sum += *it; 
    return sum;

Here, the increment on the iterator hides the usual (mask &= mask - 1) and the dereference of the iterator hides the gcc compiler intrinsic __builtin_ctzll(). Similarly, you can run any non-modifying STL algorithm on such bitsets, and also do the usual pattern-based masking through named functions such as set_intersection(), set_union() etc. Thanks to Gerd Isenberg's help on interpreting assembly output of the online compiler/assembler http://gcc.godbolt.org/, I believe that this achieves bitboard iteration with 0% overhead compared to the usual approach.

I'm currently trying to streamline this code (const-correctness, reverse iteration, larger boards for other games such as Go, etc.)

hgm · Post by **hgm** » Tue Mar 05, 2013 5:06 pm

This is a bit fishy. Because the critical loop does something completely obvious, where even the most advanced optimization should not be able to gain on a rather stupid 1-to-1 translation. All you do is memory moves.

When I feed your C code to my gcc, and let it produce assembly (gcc -O3 -S), I get this.

Code: Select all

.globl bubbles
	.type	bubbles, @function
bubbles&#58;
.L15&#58;
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%ebx
	.p2align 4,,7
	.p2align 3
.L19&#58;
	movl	key1, %ecx
	movl	$1, %eax
	jmp	.L16
	.p2align 4,,7
	.p2align 3
.L17&#58;
	addl	$1, %eax
	cmpl	$6000, %eax
	je	.L20
	movl	%edx, %ecx
.L16&#58;
	movl	key1&#40;,%eax,4&#41;, %edx
	leal	-1&#40;%eax&#41;, %ebx
	cmpl	%ecx, %edx
	jae	.L17
	movl	%ecx, key1&#40;,%eax,4&#41;
	movl	key2&#40;,%ebx,4&#41;, %ecx
	movl	%edx, key1&#40;,%ebx,4&#41;
	movl	key2&#40;,%eax,4&#41;, %edx
	movl	%ecx, key2&#40;,%eax,4&#41;
	movzbl	byte1&#40;%ebx&#41;, %ecx
	movl	%edx, key2&#40;,%ebx,4&#41;
	movzbl	byte1&#40;%eax&#41;, %edx
	movb	%cl, byte1&#40;%eax&#41;
	movzbl	byte2&#40;%ebx&#41;, %ecx
	movb	%dl, byte1&#40;%ebx&#41;
	movzbl	byte2&#40;%eax&#41;, %edx
	movb	%cl, byte2&#40;%eax&#41;
	movb	%dl, byte2&#40;%ebx&#41;
	jmp	.L19
	.p2align 4,,7
	.p2align 3
.L20&#58;
	popl	%ebx
	popl	%ebp
	ret
	.size	bubbles, .-bubbles

This looks an awful lot like your hand-written assembly code; it just moves the memory items to be swapped into registers, and then moves them back swapped. So how come your ASM code is faster? Basically this is just 8 loads and 8 stores, with some house-keeping that should be executed in parallel.

Can you show us the code your compiler produces for bubbles()?

Gerd Isenberg · Post by **Gerd Isenberg** » Tue Mar 05, 2013 6:13 pm

hgm wrote: Can you show us the code your compiler produces for bubbles()?

The online GCC explorer:

http://gcc.godbolt.org/

Left side, C source

Code: Select all

#define MAX_ENTRIES     6000

static unsigned int key1&#91;MAX_ENTRIES+3&#93;;
static unsigned int key2&#91;MAX_ENTRIES+3&#93;;
static unsigned char byte1&#91;MAX_ENTRIES+3&#93;;
static unsigned char byte2&#91;MAX_ENTRIES+3&#93;;

// Copy of the C sort routine as fed to the online compiler; the listing that
// follows in the post is the output for exactly this text.
//
// Orders key1[0 .. MAX_ENTRIES-1] ascending (unsigned compare). Each exchange
// of two neighbouring keys also exchanges the entries at the same positions
// of key2, byte1 and byte2. After an exchange the scan restarts from the
// first pair; the function returns when a pass reaches the end of the array.
void bubbles&#40;)

&#123;       unsigned int zz; unsigned char ch; int r1,r2;

// sort05: (re)start a pass; r1/r2 become 1/0 after the increment below
sort05&#58; r1=0; r2=-1;
// sort10: advance to the next neighbouring pair (r2 = r1 - 1)
sort10&#58; r1++; r2++;
        // ran off the end without finding a pair out of order: done
        if &#40;r1>=MAX_ENTRIES&#41; return;
        // pair already in order: look at the next one
        if &#40;key1&#91;r2&#93; <= key1&#91;r1&#93;) goto sort10;
        zz=key1&#91;r1&#93;; key1&#91;r1&#93;=key1&#91;r2&#93;; key1&#91;r2&#93;=zz;    // swap
        zz=key2&#91;r1&#93;; key2&#91;r1&#93;=key2&#91;r2&#93;; key2&#91;r2&#93;=zz;
        ch=byte1&#91;r1&#93;; byte1&#91;r1&#93;=byte1&#91;r2&#93;; byte1&#91;r2&#93;=ch;
        ch=byte2&#91;r1&#93;; byte2&#91;r1&#93;=byte2&#91;r2&#93;; byte2&#91;r2&#93;=ch;
        goto sort05;                                    // again
&#125;

produces with g++ 4.7
-std=c++11 -O3 -march=native -fverbose-asm
Intel Syntax

Code: Select all

bubbles&#40;)&#58;
.L6&#58;
	mov	ecx, DWORD PTR key1&#91;rip&#93;	# key1_I_lsm0.5, key1
	xor	eax, eax	# ivtmp.7
	jmp	.L3	#
.L4&#58;
	add	rax, 1	# ivtmp.7,
	mov	ecx, edx	# key1_I_lsm0.5, key1_I_lsm0.5
	cmp	rax, 5999	# ivtmp.7,
	je	.L1	#,
.L3&#58;
	mov	edx, DWORD PTR key1&#91;4+rax*4&#93;	# key1_I_lsm0.5, MEM&#91;symbol&#58; key1, index&#58; ivtmp.7_38, step&#58; 4, offset&#58; 4B&#93;
	lea	edi, &#91;rax+1&#93;	# r1,
	movsx	rsi, eax	#, ivtmp.7
	cmp	edx, ecx	# key1_I_lsm0.5, key1_I_lsm0.5
	jae	.L4	#,
	movsx	rdi, edi	# r1, r1
	mov	eax, DWORD PTR key2&#91;0+rdi*4&#93;	# zz, key2
	mov	DWORD PTR key1&#91;0+rdi*4&#93;, ecx	# key1, key1_I_lsm0.5
	mov	DWORD PTR key1&#91;0+rsi*4&#93;, edx	# key1, key1_I_lsm0.5
	mov	edx, DWORD PTR key2&#91;0+rsi*4&#93;	# D.2073, key2
	mov	DWORD PTR key2&#91;0+rdi*4&#93;, edx	# key2, D.2073
	movzx	edx, BYTE PTR byte1&#91;rsi&#93;	# D.2074, byte1
	mov	DWORD PTR key2&#91;0+rsi*4&#93;, eax	# key2, zz
	movzx	eax, BYTE PTR byte1&#91;rdi&#93;	# ch, byte1
	mov	BYTE PTR byte1&#91;rdi&#93;, dl	# byte1, D.2074
	movzx	edx, BYTE PTR byte2&#91;rsi&#93;	# D.2075, byte2
	mov	BYTE PTR byte1&#91;rsi&#93;, al	# byte1, ch
	movzx	eax, BYTE PTR byte2&#91;rdi&#93;	# ch, byte2
	mov	BYTE PTR byte2&#91;rdi&#93;, dl	# byte2, D.2075
	mov	BYTE PTR byte2&#91;rsi&#93;, al	# byte2, ch
	jmp	.L6	#
.L1&#58;
	rep
	ret

Joost Buijs · Post by **Joost Buijs** » Tue Mar 05, 2013 6:32 pm

Rebel wrote:Following the heated discussion (C vs ASM) a small example that ASM still can pay off. You can compile (and run) the below program yourself. At program start it creates 6000 random 32-bit numbers and then we are going to bubble the classic way.

My Digital Mars compiler takes 22.1 secs to complete.
GCC 4.6.1 (32 bit) takes 22.2 secs
GCC 4.6.1 (64-bit) takes 22.0 secs

The hand tuned ASM version (BUBBLES=1) takes 14.7 seconds.

I compiled GCC with the "-Ofast" option. Perhaps that's not the right one as I am new to GCC.

I took your code (unmodified) and ran it under MSVC-2012 and Intel C++ v13.0. The only thing I had to replace was 'asm {' with '__asm {'.
I just used basic settings for both compilers, nothing fancy.

My timings are totally different:
MSVC 12.126 sec.
Intel 12.075 sec.
ASM 18.034 sec.

So on my machine (Sandy-Bridge) your ASM code is actually a lot slower.

hgm · Post by **hgm** » Tue Mar 05, 2013 6:48 pm

That is also crazy. They should be the same... So far the compiler outputs that have been posted here are virtually identical to Ed's ASM code.

Joost Buijs · Post by **Joost Buijs** » Tue Mar 05, 2013 7:24 pm

hgm wrote:That is also crazy. They should be the same... So far the compiler outputs that have been posted here are virtually identical to Ed's ASM code.

This is what MSVC makes of it.

Code: Select all

_TEXT	SEGMENT
?bubbles@@YAXXZ PROC					; bubbles, COMDAT
; File d&#58;\test\test.cpp
; Line 123
	push	esi
$sort05$12&#58;
; Line 125
	xor	eax, eax
	or	esi, -1
$sort10$13&#58;
; Line 126
	inc	eax
	lea	esi, DWORD PTR &#91;esi+1&#93;
; Line 127
	cmp	eax, 6000				; 00001770H
	jge	SHORT $LN8@bubbles
; Line 128
	mov	ecx, DWORD PTR _key1&#91;eax*4-4&#93;
	cmp	ecx, DWORD PTR _key1&#91;eax*4&#93;
	jbe	SHORT $sort10$13
; Line 129
	mov	ecx, DWORD PTR _key1&#91;esi*4&#93;
	mov	edx, DWORD PTR _key1&#91;eax*4&#93;
	mov	DWORD PTR _key1&#91;eax*4&#93;, ecx
; Line 130
	mov	ecx, DWORD PTR _key2&#91;esi*4&#93;
	mov	DWORD PTR _key1&#91;esi*4&#93;, edx
	mov	edx, DWORD PTR _key2&#91;eax*4&#93;
	mov	DWORD PTR _key2&#91;eax*4&#93;, ecx
; Line 131
	movzx	ecx, BYTE PTR _byte1&#91;esi&#93;
	mov	DWORD PTR _key2&#91;esi*4&#93;, edx
	mov	dl, BYTE PTR _byte1&#91;eax&#93;
	mov	BYTE PTR _byte1&#91;eax&#93;, cl
; Line 132
	movzx	ecx, BYTE PTR _byte2&#91;esi&#93;
	mov	BYTE PTR _byte1&#91;esi&#93;, dl
	mov	dl, BYTE PTR _byte2&#91;eax&#93;
	mov	BYTE PTR _byte2&#91;eax&#93;, cl
	mov	BYTE PTR _byte2&#91;esi&#93;, dl
; Line 133
	jmp	$sort05$12
$LN8@bubbles&#58;
	pop	esi
; Line 134
	ret	0
?bubbles@@YAXXZ ENDP					; bubbles
_TEXT	ENDS

C vs ASM

C vs ASM

Re: C vs ASM

Re: C vs ASM

Re: C vs ASM

Re: C vs ASM

Re: C vs ASM

Re: C vs ASM

Re: C vs ASM

Re: C vs ASM

Re: C vs ASM