Code Optimization
Compilation Process
*.c C Code
Preprocessor
gcc
Compiler
gcc
as
ld
Binary
Preprocessing
#include <stdio.h>
#define MESSAGE "Hello, World!\n"

/* Hello-world input used to illustrate the preprocessing stage:
 * MESSAGE is a macro that expands to the string literal passed to printf. */
int main ()
{
    printf (MESSAGE);
    return 0;
}
# 1 "hello.c"
# 1 "<built-in>"
# 1 "<command line>"
# 1 "hello.c"
... <snip> ... extern void funlockfile (FILE *__stream) ; # 831 "/usr/include/stdio.h" 3 4 # 2 "hello.c" 2 int main () { printf ("Hello, World!\n"); return 0; }
Compilation
#include <stdio.h>
#define MESSAGE "Hello, World!\n"

/* Hello-world input used to illustrate the compilation stage:
 * MESSAGE is a macro that expands to the string literal passed to printf. */
int main ()
{
    printf (MESSAGE);
    return 0;
}
        .file   "hello.c"
        .section        .rodata
.LC0:
        .string "Hello World!\n"
        .text
.globl main
        .type   main, @function
main:
        pushl   %ebp
        movl    %esp, %ebp
        subl    $8, %esp
        andl    $-16, %esp
        movl    $0, %eax
        subl    %eax, %esp
        movl    $.LC0, (%esp)
        call    printf
        movl    $0, %eax
        leave
        ret
        .size   main, .-main
        .section        .note.GNU-stack,"",@progbits
        .ident  "GCC: (GNU) 3.3.5 (Debian 1:3.3.5-1)"
Assembly
        .file   "hello.c"
        .section        .rodata
.LC0:
        .string "Hello World!\n"
        .text
.globl main
        .type   main, @function
main:
        pushl   %ebp
        movl    %esp, %ebp
        subl    $8, %esp
        andl    $-16, %esp
        movl    $0, %eax
        subl    %eax, %esp
        movl    $.LC0, (%esp)
        call    printf
        movl    $0, %eax
        leave
        ret
        .size   main, .-main
        .section        .note.GNU-stack,"",@progbits
        .ident  "GCC: (GNU) 3.3.5 (Debian 1:3.3.5-1)"
00000000: 00000010: 00000020: 00000030: 00000040: 00000050: 00000060: 00000070: 00000080: 00000090: 000000a0: 000000b0: 000000c0: 000000d0: 000000e0: 000000f0: 00000100: 00000110: 00000120: 00000130: 00000140: 00000150: 00000160: 00000170: 00000180: 00000190: 000001a0: 000001b0: 000001c0: 000001d0: 000001e0: 000001f0: 00000200: 00000210: 00000220: 00000230: 00000240: 00000250: 00000260: 00000270: 00000280: 00000290: 000002a0: 000002b0: 000002c0: 000002d0: 000002e0: 000002f0: 00000300: 00000310: 00000320: 00000330: 00000340: 00000350: 7f45 0100 dc00 0b00 0000 b800 726c 2920 313a 7461 7472 2e64 7461 636b 0000 0000 0000 0000 0000 0900 1000 0800 0000 0000 0800 0000 0000 0000 0000 0100 0000 0000 0000 0000 0300 5100 0000 0000 0800 0300 1500 0000 0000 0400 0300 0300 0300 0300 0300 0300 1200 1000 6e00 0105 4c46 0300 0000 0800 29c4 0000 6421 332e 332e 6200 7461 6174 002e 002e 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 f1ff 0100 0300 0400 0500 0600 0700 0100 0000 7072 0000 0101 0100 0000 5589 c704 00c9 0a00 332e 332e 2e73 6200 6100 6e6f 636f 0000 0000 1f00 3400 0400 0000 0900 2500 5800 0400 0300 0000 3000 5800 0100 0000 0000 4800 6600 0100 0000 0000 0100 9402 0400 0000 0000 0000 0100 0000 0000 0000 0000 0000 0000 0900 0e00 0068 696e 1800 0100 0000 0000 e583 2400 c300 0047 3520 352d 7472 2e72 2e62 7465 6d6d 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 656c 7466 0000 0000 0000 3400 ec08 0000 4865 4343 2844 3129 7461 656c 7373 2e47 656e 0000 0000 0100 2300 0000 0000 0100 0100 0000 0000 0000 0000 0100 0e00 0000 0000 0000 0100 2500 0000 0000 0000 0200 a000 1000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 6c6f 0000 0209 0000 0000 0000 83e4 00e8 6c6c 3a20 6562 0000 6200 2e74 002e 4e55 7400 0000 0000 0000 0000 0000 0000 0000 0000 
0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 2e63 0000 0000 0000 0000 0000 f0b8 fcff 6f20 2847 6961 2e73 2e73 6578 726f 2d73 0000 0000 0000 0600 0000 1b00 4c03 0400 0300 0000 2b00 5800 0400 0200 0000 3800 6600 0100 0000 0000 1100 8b00 0100 0000 0a00 0900 3403 0100 0000 0000 0000 0000 0000 0000 0000 0000 2300 0000 006d 1300 0000 0000 2800 0000 ffff 776f 4e55 6e20 796d 6873 7400 6461 7461 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 0000 6169 0000 .ELF............ ................ ........4.....(. ....U........... ..)...$......... ........Hello wo rld!...GCC: (GNU ) 3.3.5 (Debian 1:3.3.5-1)...sym tab..strtab..shs trtab..rel.text. .data..bss..roda ta..note.GNU-sta ck..comment..... ................ ................ ................ ....4...#....... ................ ............L... ................ ....%........... ....X........... ............+... ............X... ................ ....0........... ....X........... ............8... ............f... ................ ....H........... ....f...%....... ................ ................ Q............... ................ ................ ................ ............4... ................ ................ ................ ................ ................ ................ ................ ................ ................ ............#... ................ .....hello.c.mai n.printf........ ............
Linking
Command: ld -o hello -dynamic-linker /lib/ld-linux.so.2 /usr/lib/crt1.o /usr/lib/crti.o /usr/lib/gcc-lib/i686/3.3.1/crtbegin.o -L/usr/lib/gcc-lib/i686/3.3.1 hello.o -lgcc -lgcc_eh -lc -lgcc -lgcc_eh /usr/lib/gcc-lib/i686/3.3.1/crtend.o /usr/lib/crtn.o
Optimization Levels
-O0: No optimization. -O1: Reduce code size and execution time. -O2: Maximum optimization without increasing size (no loop unrolling or inlining). -O3: Function inlining plus some more aggressive optimizations. -Os: Reduce the size of the executable as much as possible.
B
Optimization Levels
Without -O, the compiler's goal is to reduce the cost of compilation and to make debugging produce the expected results. In other words, statements are independent.
If you stop the program with a breakpoint between statements, you can then assign a new value to any variable or change the program counter to any other statement in the function and get exactly the results you would expect from the source code.
N
-funroll-loops -fomit-frame-pointer -finline-functions -fmerge-constants -fexpensive-optimizations -foptimize-register-move ... read the f**king manual ...
10
11
12
13
-O1 Vs. -O
#include <stdio.h>

/* Demo program for comparing optimization levels: stores two constants in
 * locals and prints them with printf. Returns 0. */
int main()
{
    int i, j;

    i = 5;
    j = 6;
    printf("%i %i\n", i, j);
    return 0;
}
1+
-O1 Vs. -O
Compiled with: -O1 -fomit-frame-pointer
#include <stdio.h>

/* Demo program for comparing optimization levels: stores two constants in
 * locals and prints them with printf. Returns 0. */
int main()
{
    int i, j;

    i = 5;
    j = 6;
    printf("%i %i\n", i, j);
    return 0;
}
.LC0: .string "%i %i\n" .text main: pushl %ebp movl %esp, %ebp subl $24, %esp andl $-16, %esp movl $6, 8(%esp) movl $5, 4(%esp) movl $.LC0, (%esp) call printf movl $0, %eax movl %ebp, %esp popl %ebp ret
1E
-O1 Vs. -O
Compiled with: -O2 -fomit-frame-pointer
#include <stdio.h>

/* Demo program for comparing optimization levels: stores two constants in
 * locals and prints them with printf. Returns 0. */
int main()
{
    int i, j;

    i = 5;
    j = 6;
    printf("%i %i\n", i, j);
    return 0;
}
.LC0: .string .text main: pushl movl movl subl movl andl movl movl movl call movl xorl popl ret "%i %i\n" %ebp $6, %edx %esp, %ebp $24, %esp $5, %eax $-16, %esp %edx, 8(%esp) %eax, 4(%esp) $.LC0, (%esp) printf %ebp, %esp %eax, %eax %ebp 1H
-O1 Vs. -O
.LC0: .string "%i %i\n" .text main: pushl %ebp movl %esp, %ebp subl $24, %esp andl $-16, %esp movl $6, 8(%esp) movl $5, 4(%esp) movl $.LC0, (%esp) call printf movl $0, %eax movl %ebp, %esp popl %ebp ret .LC0: .string .text main: pushl movl movl subl movl andl movl movl movl call movl xorl popl ret "%i %i\n" %ebp $6, %edx %esp, %ebp $24, %esp $5, %eax $-16, %esp %edx, 8(%esp) %eax, 4(%esp) $.LC0, (%esp) printf %ebp, %esp %eax, %eax %ebp
1L
-O
Vs. -O!
#include <stdio.h>

/* Returns the sum 0 + 1 + ... + (n - 1); returns 0 when n <= 0. */
int foo(int n)
{
    int i, result = 0;

    for (i = 0; i < n; i++) {
        result = result + i;
    }
    return result;
}

/* Calls foo with the constant argument 3 and prints the result. */
int main()
{
    printf("Result = %i\n", foo(3));
    return 0;
}
1B
-O
foo: movl xorl xorl cmpl jge .L6: addl incl cmpl jl .L8: ret
Vs. -O!
main: pushl movl subl andl movl call movl movl call movl xorl popl ret
-O
main: pushl xorl movl subl xorl andl .L6: addl incl cmpl jl movl movl call movl xorl popl ret
Vs. -O!
Compilers guarantee semantically equivalent code (except in case of bugs). You never know exactly what assembly code the compiler is producing out of your code. A compiler is a helpful tool, but it shouldn't be trusted in matters of extreme optimization. When it comes to extreme optimization, you have to look at the assembly code (whatever language or processor you are using).
21
Code Optimization
22
Using gprof
Execute the software: ./my_program Run gprof to analyse the file gmon.out: gprof -b ./my_program
23
Flat profile: Each sample counts as 0.01 seconds. % cumulative self self time seconds seconds calls ms/call 58.83 0.58 0.58 27.38 0.85 0.27 1 271.11 7.10 0.92 0.07 1 70.29 7.10 0.99 0.07 1 70.29 total ms/call 271.11 70.29 70.29
2+
%time: Percentage of the total running time of the program used by this function. Cumulative seconds: Running sum of the total number of seconds. Self seconds: Number of seconds accounted for by this function. Calls: Number of times this function was invoked. self ms/call: Average number of milliseconds spent in this function per call. total ms/call: Average number of milliseconds spent in this function and its descendants per call.
2E
gpro' O$tp$t ( )
index % time name <spontaneous> [1] 100.0 0.58 0.41 main [1] 0.28 0.00 1/1 foo0 [2] 0.07 0.00 1/1 foo1 [3] 0.06 0.00 1/1 foo2 [4] ------------------------------------------0.28 0.00 1/1 main [1] [2] 28.3 0.28 0.00 1 foo0 [2] ------------------------------------------0.07 0.00 1/1 main [1] [3] 7.1 0.07 0.00 1 foo1 [3] ------------------------------------------0.06 0.00 1/1 main [1] [4] 6.1 0.06 0.00 1 foo2 [4] ------------------------------------------2H
self
children
called
gpro' O$tp$t ( )
Index: Unique number given to each element of the table.
%Time: Percentage of the 'total' time that was spent in this function and its children.
Self: Total amount of time spent in this function.
Children: Total amount of time propagated into this function by its children.
gpro' O$tp$t ( )
For the function's parents, the fields have the following meanings:
Self:
this parent.
Children: Called:
total number of times the function was called. Recursive calls to the function are not included in the number after the '/'.
Name:
gpro' O$tp$t ( )
For the function's children, the fields have the following meanings:
Self:
function.
Children:
the function.
Called:
Number of times the function called this child '/' the total
number of times the child was called. Recursive calls by the child are not listed in the number after the '/'.
Name:
after it. If the child is a member of a cycle, the cycle number is printed between the name and the index number.
2N
Matrix Copy
30
int a[12];
1L E +N 1B BH 3 E2 H B LB 2 E a[0] a[11]
a[i] = *(a+i)
%ase address inde"
31
*+o-dimensions Array
KeclarationD
Array a
a[0][0]
int a[4][12];
1 E +N E BH E E2 E BH E E2 E L E +N E BH E E2 E BH E E2 E 1L E +N E BH E E2 E BH E E2 E 2E E +N E BH E E2 E BH E E2 E a[3][11]
*+o-dimensions Array ( )
In fact, the matrix is represented as a single line in memory.
1 E +N E BH E E2 E BH E E2 E L E +N E BH E E2 E BH E E2 E 1L E +N E BH E E2 E BH E E2 E 2E E +N E BH E E2 E BH E E2 E
33
copy1
/*
 * copy1 - copy every element of src into dest.
 *
 * The outer loop walks the second index and the inner loop walks the first
 * index, so successive assignments touch elements that are one full row
 * (SIZE_Y floats) apart. Always returns 0.
 */
int copy1(float src[SIZE_X][SIZE_Y], float dest[SIZE_X][SIZE_Y])
{
    int col = 0;

    while (col < SIZE_Y) {
        int row = 0;

        while (row < SIZE_X) {
            dest[row][col] = src[row][col];
            row++;
        }
        col++;
    }
    return 0;
}
3+
copy1
copy1: pushl xorl pushl pushl movl movl .L11: movl movl .L10: movl movl addl decl jns incl cmpl jle popl xorl popl popl ret %edi %edi, %edi %esi %ebx 16(%esp), %esi 20(%esp), %ebx %edi, %edx $1999, %ecx (%esi,%edx,4), %eax %eax, (%ebx,%edx,4) $75, %edx %ecx .L10 %edi $74, %edi .L11 %ebx %eax, %eax %esi %edi
3E
copy
/*
 * copy2 - copy every element of src into dest.
 *
 * The outer loop walks the first index and the inner loop walks the second
 * index, so successive assignments touch adjacent floats. Always returns 0.
 */
int copy2(float src[SIZE_X][SIZE_Y], float dest[SIZE_X][SIZE_Y])
{
    int row = 0;

    while (row < SIZE_X) {
        int col = 0;

        while (col < SIZE_Y) {
            dest[row][col] = src[row][col];
            col++;
        }
        row++;
    }
    return 0;
}
3H
copy
copy2: pushl xorl pushl pushl pushl movl xorl movl .L26: xorl .L25: leal movl incl cmpl movl jle incl addl cmpl jle popl xorl popl popl popl ret (%ecx,%ebx), %edx (%edi,%edx,4), %eax %ecx $74, %ecx %eax, (%esi,%edx,4) .L25 %ebp $75, %ebx $1999, %ebp .L26 %ebx %eax, %eax %esi %edi %ebp %ecx, %ecx %ebp %ebp, %ebp %edi %esi %ebx 20(%esp), %edi %ebx, %ebx 24(%esp), %esi
3L
copy!
/*
 * copy3 - copy SIZE_X*SIZE_Y floats from src to dest through two moving
 * pointers, counting the remaining elements down to zero.
 * Always returns 0.
 */
int copy3(float* src, float* dest)
{
    int remaining = (SIZE_X*SIZE_Y);

    while (remaining != 0) {
        *dest = *src;
        dest++;
        src++;
        remaining--;
    }
    return 0;
}
3B
copy!
copy3: pushl %ebx movl 8(%esp), %ecx movl $150000, %ebx movl 12(%esp), %edx .p2align 4,,15 .L36: movl addl movl addl decl jne popl xorl ret (%ecx), %eax $4, %ecx %eax, (%edx) $4, %edx %ebx .L36 %ebx %eax, %eax
3N
copy,
+0
copy,
copy4: subl movl movl movl movl movl testl je movl addl movl addl movl .L40: cld movl shrl rep movsl xorl movl movl addl ret %eax, %ecx $2, %ecx %eax, %eax (%esp), %esi 4(%esp), %edi $8, %esp $8, %esp $600000, %eax %edi, 4(%esp) 16(%esp), %edi %esi, (%esp) 12(%esp), %esi $4, %edi .L40 (%esi), %eax $4, %esi %eax, (%edi) $4, %edi $599996, %eax
+1
Performances (SIZE_Y)
SIZE_X = 75
350 325 300 275 250 225
ms/call
200 175 150 125 100 75 50 25 0 1000 2500 5000 7500 10000
SIZE_Y
+3
Performances (SIZE_Y)
SIZE_X = 75
70 65 60 55 50 45
ms/call
SIZE_Y
++
Performances (SIZE_X)
SIZE_Y = 75
500 450 400 350
ms/call
300 250 200 150 100 50 0 750 1000 2500 5000 7500 10000
SIZE_X
+E
Performances (SIZE_X)
SIZE_Y = 75
70 65 60 55 50 45 40 35 30 25 20 15 10 5 0
ms/call
750
1000
2500
5000
7500
10000
SIZE_X
+H
Code Optimization
0+itc#
int switch(int input) { int output = 0; switch (input) case 0: output break; case 1: output break; ... etc ... case 15: output break; } return output; }
+N
{ = 1; = 2;
= 16;
i'...elsei'... Vs s+itc#
ifelseif: movl xorl cmpl je cmpl je cmpl je .L3: ret .L9: movl jmp .L8: movl ret .L7: movl ret 4(%esp), %edx %eax, %eax $0, %edx .L7 $1, %edx .L8 $2, %edx .L9 switch: movl xorl cmpl je jg cmpl je ret .L12: movl ret .L17: cmpl je ret .L13: movl ret .L14: movl ret 4(%esp), %edx %eax, %eax $1, %edx .L14 .L17 $0, %edx .L12 $1, %eax $3, %edx .L13 $2, %eax $2, %eax
E0
i'...elsei'... Vs s+itc#
/*
 * Benchmark driver: seeds rand() with the same value before each loop, then
 * calls ifelseif() and the switch-based variant ITERATIONS times each with
 * (unsigned int)rand()>>24 as input. The result is stored in output and
 * never read.
 * NOTE(review): `switch` is a reserved keyword in C, so a function cannot
 * actually be called by that name; the name here is presentation shorthand.
 */
int main() { int i, output; srand(123456); for (i=0; i<ITERATIONS; i++) output = ifelseif((unsigned int)rand()>>24); srand(123456); for (i=0; i<ITERATIONS; i++) output = switch((unsigned int)rand()>>24); return 0; }
E1
The switch is 1.5 times more efficient than the if ... else if ... chain.
E2
Ordered 0+itc#
int switch(int input) { int output = 0; switch (input) case 15: output break; case 14: output break; ... etc ... case 0: output break; } return output; }
E3
{ = 1; = 2;
= 16;
s+itc# Vs ordereds+itc#
int main() { int i, output; srand(123456); for (i=0; i<ITERATIONS; i++) output = switch(((unsigned (unsigned (unsigned (unsigned
srand(123456); for (i=0; i<ITERATIONS; i++) output = orderedswitch(((unsigned (unsigned (unsigned (unsigned return 0; }
E+
EE
Code Optimization
(oop Jnrolling
EH
1itvector Or (1)
/*
 * bv_or1 - store the word-wise OR of Y and Z into X.
 *
 * Runs only when size_(X) is positive and bits_(X) equals both bits_(Y) and
 * bits_(Z); otherwise nothing is written. After the copy loop X points one
 * past the last word, so it is stepped back before the final masking.
 * Always returns 0.
 *
 * The original wrote `*(--X) &= mask_(X);`, which modifies X and reads X in
 * the same expression without sequencing (undefined behavior). The
 * decrement is now a separate statement, so mask_() receives the pointer to
 * the last word, the same pointer bv_or2 passes.
 * NOTE(review): bits_/size_/mask_ are defined elsewhere; confirm mask_()
 * is meant to receive the last-word pointer rather than the base pointer.
 */
int bv_or1(long *X, long *Y, long *Z)
{
    long bits = bits_(X);
    long size = size_(X);

    if ((size > 0) && (bits == bits_(Y)) && (bits == bits_(Z))) {
        while (size-- > 0)
            *X++ = *Y++ | *Z++;
        --X;
        *X &= mask_(X);
    }
    return 0;
}
EL
1itvector Or ( )
/*
 * bv_or2 - store the word-wise OR of Y and Z into X.
 *
 * Does nothing unless size_(X) is positive and bits_(X) equals both
 * bits_(Y) and bits_(Z). The first word is handled before the loop; the
 * loop pre-increments the three pointers, so X is left on the last word,
 * which is then masked with mask_(X). Always returns 0.
 */
int bv_or2(long *X, long *Y, long *Z)
{
    long bits = bits_(X);
    long size = size_(X);

    if (size <= 0 || bits != bits_(Y) || bits != bits_(Z))
        return 0;

    *X = *Y | *Z;
    for (size--; size != 0; size--) {
        ++X;
        ++Y;
        ++Z;
        *X = *Y | *Z;
    }
    *X &= mask_(X);
    return 0;
}
EB
Code Optimization
(engt' of an &rray
EN
/*
 * length1 - index of the first NULL entry in array.
 *
 * Advances i while i < SIZE and array[i] is non-NULL. Returns i when the
 * loop stopped on a NULL entry, or -1 when the first SIZE entries are all
 * non-NULL. The bound test comes first in the condition, so array[i] is
 * only read for i < SIZE.
 */
int length1(char **array) { int i = 0; while ((i<SIZE) && (array[i] != NULL)) i++; return ((i<SIZE) ? i : -1); }
/*
 * length2 - index of the first NULL entry in array.
 *
 * Same result as testing the bound first, but here array[i] is compared
 * with NULL before i is compared with SIZE. Returns i when a NULL entry is
 * found below SIZE, otherwise -1.
 * NOTE(review): because array[i] is read before the bound test, the loop
 * evaluates array[SIZE] when the first SIZE entries are all non-NULL; for
 * an array of exactly SIZE elements that is a read one past the end.
 */
int length2(char **array) { int i = 0; while ((array[i] != NULL) && (i<SIZE)) i++; return ((i<SIZE) ? i : -1); }
Comparing 1 2
length1: movl xorl movl testl je .L6: incl movl cmpl jg movl testl jne .L10: movl .L8: movl ret 4(%esp), %ecx %eax, %eax (%ecx), %edx %edx, %edx .L10 %eax $-1, %edx $SIZE, %eax .L8 (%ecx,%eax,4), %edx %edx, %edx .L6 %eax, %edx %edx, %eax length2: movl xorl movl testl je .L17: incl movl testl je cmpl jle .L18: movl .L19: ret .L14: cmpl jle jmp 4(%esp), %edx %eax, %eax (%edx), %ecx %ecx, %ecx .L19 %eax (%edx,%eax,4), %ecx %ecx, %ecx .L14 $SIZE, %eax .L17 $-1, %eax
4(%esp), %edx %eax, %eax (%edx,%eax,4), %ecx %ecx, %ecx .L23 %eax $SIZE, %eax .L29 $-1, %eax
HH
Comparing 1 2 !
length1: movl xorl movl testl je .L6: incl movl cmpl jg movl testl jne .L10: movl .L8: movl ret 4(%esp), %ecx %eax, %eax (%ecx), %edx %edx, %edx .L10 %eax $-1, %edx $SIZE, %eax .L8 (%ecx,%eax,4), %edx %edx, %edx .L6 %eax, %edx %edx, %eax
length3: movl xorl .L29: movl testl je incl cmpl jle movl .L23: ret
4(%esp), %edx %eax, %eax (%edx,%eax,4), %ecx %ecx, %ecx .L23 %eax $SIZE, %eax .L29 $-1, %eax
HL
Comparing
length2: movl xorl movl testl je .L17: incl movl testl je cmpl jle .L18: movl .L19: ret .L14: cmpl jle jmp 4(%esp), %edx %eax, %eax (%edx), %ecx %ecx, %ecx .L19 %eax (%edx,%eax,4), %ecx %ecx, %ecx .L14 $SIZE, %eax .L17 $-1, %eax
2 !
length3: movl xorl .L29: movl testl je incl cmpl jle movl .L23: ret
4(%esp), %edx %eax, %eax (%edx,%eax,4), %ecx %ecx, %ecx .L23 %eax $SIZE, %eax .L29 $-1, %eax
Comparing 13
/*
 * Benchmark driver for length1/length2/length3.
 *
 * For each i in [0, SIZE) the first SIZE-i slots are filled with a non-NULL
 * pointer and slot SIZE-i is set to NULL, then each length function is
 * called 1000 times on the array.
 *
 * For i == 0 the terminator index would be SIZE, one past the end of the
 * array, so that store is skipped; this leaves the array completely full
 * for the first round (the case in which no NULL is found).
 */
int main()
{
    char* array[SIZE];
    char* string = "a";
    int i, j;

    for (i = 0; i < SIZE; i++) {
        for (j = 0; j < SIZE - i; j++)
            array[j] = (char*) string;
        if (i > 0)
            array[SIZE - i] = NULL;   /* never index array[SIZE] */
        for (j = 0; j < 1000; j++) {
            length1(array);
            length2(array);
            length3(array);
        }
    }
    return 0;
}
2 !
HN
us/call
SIZE
L0
n o r
6 g
L2
Code Optimization
Me?erse an &rray
L3
"#at8s +rong %
Code Optimization
(oop Fusion
LH
Loop F$sion
#define SIZE 1000000

/*
 * loop1 - fill a[0..SIZE-1] with 1 and b[0..SIZE-1] with 2 using two
 * separate loops, one per array.
 */
void loop1(int a[], int b[])
{
    int i;

    for (i = 0; i < SIZE; i++)
        a[i] = 1;
    for (i = 0; i < SIZE; i++)
        b[i] = 2;
}

/*
 * loop2 - same stores as loop1, but both arrays are written in a single
 * fused loop.
 */
void loop2(int a[], int b[])
{
    int i;

    for (i = 0; i < SIZE; i++) {
        a[i] = 1;
        b[i] = 2;
    }
}
LL
Loop F$sion
loop1: movl xorl movl .L6: movl incl cmpl jle xorl .L11: movl incl cmpl jle ret 4(%esp), %edx %eax, %eax 8(%esp), %ecx $1, (%edx,%eax,4) %eax $999999, %eax .L6 %eax, %eax $2, (%ecx,%eax,4) %eax $999999, %eax .L11
loop2: movl xorl movl .L21: movl movl incl cmpl jle ret
4(%esp), %ecx %eax, %eax 8(%esp), %edx $1, (%ecx,%eax,4) $2, (%edx,%eax,4) %eax $999999, %eax .L21
LB
Loop F$sion
Flat profile: Each sample counts as 0.01 seconds. % self self total time seconds calls ms/call ms/call 61.59 0.19 1 190.94 190.94 38.90 0.12 1 120.59 120.59
Code Optimization
B0
Comparing 0 2 1
#define SIZE 4000000

/*
 * Driver comparing reverse0 and reverse1: fills a SIZE-byte local array
 * with the repeating sequence 'a'..'z', then calls each function once on it.
 */
int main()
{
    int i;
    char array[SIZE];

    for (i = 0; i < SIZE; i++)
        array[i] = (char) ((int) 'a' + i % 26);
    reverse0(array);
    reverse1(array);
    return 0;
}
B3
gpro' O$tp$t
Flat profile: Each sample counts as 0.01 seconds. % self self total time seconds calls ms/call ms/call 58.83 0.58 27.38 0.27 1 271.11 271.11 7.10 0.07 1 70.29 70.29
7everse an Array ( )
#define SIZE 8000000

/*
 * reverse2 - reverse the first SIZE bytes of array in place.
 *
 * i walks up from the start and j walks down from the end; the two
 * elements are swapped for the first SIZE/2 positions, so the middle
 * element of an odd-sized array is left untouched.
 */
void reverse2(char array[])
{
    int i, j;
    char tmp;

    for (i = 0, j = SIZE - 1; i < (SIZE >> 1); i++, j--) {
        tmp = array[i];
        array[i] = array[j];
        array[j] = tmp;
    }
}
BH
Comparing 1 2
#define SIZE 3

/*
 * Driver comparing reverse1 and reverse2 on a tiny array: fills SIZE bytes
 * with letters starting at 'a', then calls both functions 100000000 times.
 */
int main()
{
    int i;
    char array[SIZE];

    for (i = 0; i < SIZE; i++)
        array[i] = (char) ((int) 'a' + i % 26);
    for (i = 0; i < 100000000; i++) {
        reverse1(array);
        reverse2(array);
    }
    return 0;
}
BL
gpro' O$tp$t
Flat profile: Each sample counts as 0.01 seconds. % self self total time seconds calls ns/call ns/call 47.06 18.35 100000000 183.48 183.48 27.71 10.81 100000000 108.06 108.06
reverse 1 2
reverse1: pushl movl pushl movl xorl .L21: movzbl movzbl movb incl movb decl cmpl jl popl popl ret %esi $2, %ecx %ebx 12(%esp), %esi %ebx, %ebx (%ebx,%esi), %edx (%ecx,%esi), %eax %al, (%ebx,%esi) %ebx %dl, (%ecx,%esi) %ecx %ecx, %ebx .L21 %ebx %esi
(Assembler)
4(%esp), %eax (%eax), %ecx 2(%eax), %edx %cl, 2(%eax) %dl, (%eax)
BN
Comparing 1 2
#define SIZE 4 int main() { int i; char array[SIZE];
(Again)
for (i=0; i<SIZE; i++) array[i] = (char) ((int) 'a' + i%26); for (i=0; i<1000000000; i++) { reverse1(array); reverse2(array); } return 0; }
N0
7everse an Array ( )
#define SIZE 8000000

/*
 * reverse2 - reverse the first SIZE bytes of array in place.
 *
 * i walks up from the start and j walks down from the end; the two
 * elements are swapped for the first SIZE/2 positions, so the middle
 * element of an odd-sized array is left untouched.
 */
void reverse2(char array[])
{
    int i, j;
    char tmp;

    for (i = 0, j = SIZE - 1; i < (SIZE >> 1); i++, j--) {
        tmp = array[i];
        array[i] = array[j];
        array[j] = tmp;
    }
}
reverse2: subl xorl movl movl movl movl .L29: movzbl movzbl movb incl movb decl cmpl jle movl movl addl ret $8, %esp %ecx, %ecx %esi, 4(%esp) 12(%esp), %esi %ebx, (%esp) $3, %ebx (%ebx,%esi), %eax (%ecx,%esi), %edx %al, (%ecx,%esi) %ecx %dl, (%ebx,%esi) %ebx $1, %ecx .L29 (%esp), %ebx 4(%esp), %esi $8, %esp
N1
gpro' O$tp$t
Flat profile: Each sample counts as 0.01 seconds. % self self total time seconds calls ns/call ns/call name 47.26 218.76 1000000000 218.76 218.76 reverse1 43.68 202.19 1000000000 202.19 202.19 reverse2
reverse 1 2
reverse1: pushl movl pushl movl xorl .L21: movzbl movzbl movb incl movb decl cmpl jl popl popl ret %esi $2, %ecx %ebx 12(%esp), %esi %ebx, %ebx (%ebx,%esi), %edx (%ecx,%esi), %eax %al, (%ebx,%esi) %ebx %dl, (%ecx,%esi) %ecx %ecx, %ebx .L21 %ebx %esi
(Assembler)
reverse2: subl xorl movl movl movl movl .L29: movzbl movzbl movb incl movb decl cmpl jle movl movl addl ret $8, %esp %ecx, %ecx %esi, 4(%esp) 12(%esp), %esi %ebx, (%esp) $3, %ebx (%ebx,%esi), %eax (%ecx,%esi), %edx %al, (%ecx,%esi) %ecx %dl, (%ebx,%esi) %ebx $1, %ecx .L29 (%esp), %ebx 4(%esp), %esi $8, %esp
N3
reverse 1 2
(Assembler)
$8, %esp %ecx, %ecx %esi, 4(%esp) 12(%esp), %esi %ebx, (%esp) $3, %ebx (%ebx,%esi), %eax (%ecx,%esi), %edx %al, (%ecx,%esi) %ecx %dl, (%ebx,%esi) %ebx $1, %ecx .L29 (%esp), %ebx 4(%esp), %esi $8, %esp
reverse2: reverse1: subl pushl %esi xorl movl $2, %ecx movl pushl %ebx movl movl 12(%esp), %esi movl xorl %ebx, %ebx movl .L21: .L29: movzbl (%ebx,%esi), %edx movzbl movzbl (%ecx,%esi), %eax movzbl movb %al, (%ebx,%esi) movb incl %ebx incl movb %dl, (%ecx,%esi) movb decl %ecx decl cmpl %ecx, %ebx cmpl jl .L21 jle popl %ebx movl popl %esi movl ret Branc' rediction addl ret
N+
Code Optimization
NE
ro%lemD
append0
/*
 * append0 - link `new` after the last node of the list starting at head.
 *
 * Walks the list until a node whose next pointer is NULL is found, stores
 * `new` in that node's next field and stops. Always returns 0.
 * NOTE(review): when head is NULL the function only assigns `new` to its
 * own by-value copy of head, so the caller's pointer is not changed and
 * the element is not recorded anywhere.
 */
int append0(struct list *head, struct list *new){ if (head != NULL) { while (head != NULL) { if (head->next != NULL) head = head->next; else { head->next = new; break; } } } else { head = new; } return 0; }
NL
append1
/*
 * append1 - link `new` after the last node of the list starting at head.
 *
 * Uses a do/while that remembers the previously visited node in tmp; when
 * head runs off the end, tmp is the last node and its next field receives
 * `new`. Always returns 0.
 * NOTE(review): when head is NULL the function only assigns `new` to its
 * own by-value copy of head, so the caller's pointer is not changed and
 * the element is not recorded anywhere.
 */
int append1(struct list *head, struct list *new){ struct list *tmp = NULL; if (head != NULL) { do { tmp = head; head = head->next; } while (head != NULL); tmp->next = new; } else { head = new; } return 0; }
NB
NN
*esting Per'ormances
/*
 * Benchmark driver for append0 and append1: each of the two loops runs
 * LOOPS times, resets head and the next pointers of elt1..elt3, then makes
 * four append calls with &elt1..&elt4.
 * NOTE(review): head is reset to NULL and passed by value, so every call
 * receives a NULL head; as written, both append functions then only assign
 * to their local parameter, meaning no list traversal is being measured.
 * Confirm this is the intended workload.
 */
int main() { struct list *head = NULL; struct list elt1, elt2, elt3, elt4; int i; elt1.index = 1; elt2.index = 2; elt3.index = 3; elt4.index = 4; elt4.next = NULL; for (i=0; i<LOOPS; i++) { head = NULL; elt1.next = NULL; elt2.next = NULL; elt3.next = NULL; append0(head, &elt1); append0(head, &elt2); append0(head, &elt3); append0(head, &elt4); } for (i=0; i<LOOPS; i++) { head = NULL; elt1.next = NULL; elt2.next = NULL; elt3.next = NULL; append1(head, &elt1); append1(head, &elt2); append1(head, &elt3); append1(head, &elt4); } return 0; }
100
gpro' O$tp$t
% time 31.62 26.65 self self seconds calls ns/call 11.79 400000000 29.48 9.94 400000000 24.85 total ns/call 29.48 24.85 name append0 append1
Code Optimization
102
How many instructions/CPU cycles does it take to compute one hash? Given a typical sample of data, how many collisions are observed?
Case 0t$dy
ro%lemD
'as'1 'as'2
#as#1
/*
 * hash1 - toy string hash.
 *
 * Characters at even positions are added to the running value and
 * characters at odd positions multiply it. The length is measured once
 * with strlen before the loop. Returns 0 for the empty string.
 */
unsigned long hash1 (char *string)
{
    unsigned long acc = 0;
    int n = strlen(string);
    int pos;

    for (pos = 0; pos < n; pos++) {
        int ch = (int) string[pos];

        if (pos % 2 == 0)
            acc = acc + ch;
        else
            acc = acc * ch;
    }
    return acc;
}
10E
#as#1 (Assembler)
hash1: pushl pushl subl xorl movl movl call xorl movl cmpl jge .L8: testb je movsbl imull .L4: incl cmpl jl .L10: popl movl popl popl ret .L6: movsbl addl jmp %esi %ebx $4, %esp %ebx, %ebx 16(%esp), %esi %esi, (%esp) strlen %edx, %edx %eax, %ecx %eax, %ebx .L10 $1, %dl .L6 (%edx,%esi),%eax %eax, %ebx %edx %ecx, %edx .L8 %edx %ebx, %eax %ebx %esi (%edx,%esi),%eax %eax, %ebx .L4
/*
 * hash1 (variant) - same toy hash: even positions are added, odd positions
 * multiply the running value.
 * Unlike the version that stores the length in a local first, strlen(string)
 * appears directly in the loop condition, so as written it is evaluated for
 * every iteration test. i is an int and is compared against strlen's
 * unsigned result.
 */
unsigned long hash1 (char *string){ unsigned long hash = 0; int i = 0; for (i=0; i<strlen(string); i++){ if (i % 2) hash=hash*((int) string[i]); else hash=hash+((int) string[i]); } return hash; }
10H
#as#
/*
 * hash2 - toy string hash.
 *
 * For every character c, from first to last:
 *     hash = (hash + (c << 4) + (c >> 4)) * 11
 * The loop counts len down from strlen(string) while len > 0.
 * Returns 0 for the empty string.
 *
 * The character is read once into a local before being used twice. The
 * original read *string and *string++ in the same expression, which is an
 * unsequenced read and modification of `string` (undefined behavior).
 */
unsigned long hash2(char *string)
{
    unsigned long hash = 0;
    int len;

    for (len = strlen(string); len > 0; len--) {
        int c = *string++;

        hash = (hash + (c << 4) + (c >> 4)) * 11;
    }
    return (unsigned long) hash;
}
10L
#as#
(Assembler)
hash2: pushl xorl pushl subl movl movl call testl movl jle .L16: movzbl decl incl movsbl sall sarb leal movsbl addl leal testl leal jg .L18: popl movl popl popl ret %esi %esi, %esi %ebx $4, %esp 16(%esp), %ebx %ebx, (%esp) strlen %eax, %eax %eax, %ecx .L18 (%ebx), %edx %ecx %ebx %dl,%eax $4, %eax $4, %dl (%eax,%esi), %eax %dl,%edx %edx, %eax (%eax,%eax,4), %edx %ecx, %ecx (%eax,%edx,2), %esi .L16 %ecx %esi, %eax %ebx %esi
/*
 * hash2 - toy string hash.
 *
 * For every character c, from first to last:
 *     hash = (hash + (c << 4) + (c >> 4)) * 11
 * The loop counts len down from strlen(string) while len > 0.
 * Returns 0 for the empty string.
 *
 * The character is read once into a local before being used twice. The
 * original read *string and *string++ in the same expression, which is an
 * unsequenced read and modification of `string` (undefined behavior).
 */
unsigned long hash2(char *string)
{
    unsigned long hash = 0;
    int len;

    for (len = strlen(string); len > 0; len--) {
        int c = *string++;

        hash = (hash + (c << 4) + (c >> 4)) * 11;
    }
    return (unsigned long) hash;
}
10B
10N
#as#!
/*
 * hash3 - toy string hash, identical to hash2 except that the loop tests
 * `len` for non-zero instead of `len > 0`.
 *
 * For every character c, from first to last:
 *     hash = (hash + (c << 4) + (c >> 4)) * 11
 * Returns 0 for the empty string.
 *
 * The character is read once into a local before being used twice. The
 * original read *string and *string++ in the same expression, which is an
 * unsequenced read and modification of `string` (undefined behavior).
 */
unsigned long hash3(char *string)
{
    unsigned long hash = 0;
    int len;

    for (len = strlen(string); len; len--) {
        int c = *string++;

        hash = (hash + (c << 4) + (c >> 4)) * 11;
    }
    return (unsigned long) hash;
}
110
#as#
hash2: pushl xorl pushl subl movl movl call testl movl jle .L16: movzbl decl incl movsbl sall sarb leal movsbl addl leal testl leal jg .L18: popl movl popl popl ret %esi %esi, %esi %ebx $4, %esp 16(%esp), %ebx %ebx, (%esp) strlen %eax, %eax %eax, %ecx .L18 (%ebx), %edx %ecx %ebx %dl,%eax $4, %eax $4, %dl (%eax,%esi), %eax %dl,%edx %edx, %eax (%eax,%eax,4), %edx %ecx, %ecx (%eax,%edx,2), %esi .L16 %ecx %esi, %eax %ebx %esi
Vs. #as#!
hash3: pushl xorl pushl subl movl movl call testl movl je .L24: movzbl incl movsbl sall sarb leal movsbl addl leal decl leal jne .L26: popl movl popl popl ret %esi %esi, %esi %ebx $4, %esp 16(%esp), %ebx %ebx, (%esp) strlen %eax, %eax %eax, %ecx .L26 (%ebx), %edx %ebx %dl,%eax $4, %eax $4, %dl (%eax,%esi), %eax %dl,%edx %edx, %eax (%eax,%eax,4), %edx %ecx (%eax,%edx,2), %esi .L24 %ebx %esi, %eax %ebx %esi
111
#as#
hash2: pushl xorl pushl subl movl movl call testl movl jle .L16: movzbl decl incl movsbl sall sarb leal movsbl addl leal testl leal jg .L18: popl movl popl popl ret %esi %esi, %esi %ebx $4, %esp 16(%esp), %ebx %ebx, (%esp) strlen %eax, %eax %eax, %ecx .L18 (%ebx), %edx %ecx %ebx %dl,%eax $4, %eax $4, %dl (%eax,%esi), %eax %dl,%edx %edx, %eax (%eax,%eax,4), %edx %ecx, %ecx (%eax,%edx,2), %esi .L16 %ecx %esi, %eax %ebx %esi
Vs. #as#!
hash3: pushl xorl pushl subl movl movl call testl movl je .L24: movzbl incl movsbl sall sarb leal movsbl addl leal decl leal jne .L26: popl movl popl popl ret %esi %esi, %esi %ebx $4, %esp 16(%esp), %ebx %ebx, (%esp) strlen %eax, %eax %eax, %ecx .L26 (%ebx), %edx %ebx %dl,%eax $4, %eax $4, %dl (%eax,%esi), %eax %dl,%edx %edx, %eax (%eax,%eax,4), %edx %ecx (%eax,%edx,2), %esi .L24 %ebx %esi, %eax %ebx %esi
Memo?ed
112
Pro'iling #as#ing
/*
 * Profiling driver: reads ./list.txt line by line and writes the hash1,
 * hash2 and hash3 value of every line to ./hash.txt.
 *
 * Returns 1 (after printing a message) when either file cannot be opened,
 * 0 otherwise.
 *
 * Changes from the original:
 *  - the hash functions return unsigned long, so the values are printed
 *    with %lu instead of %i (mismatched conversion specifier);
 *  - the input file is closed when opening the output file fails.
 */
int main()
{
    char string[LINE_MAX];
    FILE *fp_input, *fp_output;

    if ((fp_input = fopen("./list.txt", "r")) == NULL) {
        printf("File list.txt not found or cannot create hash.txt");
        return 1;
    }
    if ((fp_output = fopen("./hash.txt", "w")) == NULL) {
        fclose(fp_input);   /* do not leak the already opened input */
        printf("File list.txt not found or cannot create hash.txt");
        return 1;
    }

    while ((fgets(string, LINE_MAX, fp_input)) != NULL) {
        fprintf(fp_output, "hash1: %lu\n", hash1(string));
        fprintf(fp_output, "hash2: %lu\n", hash2(string));
        fprintf(fp_output, "hash3: %lu\n", hash3(string));
    }
    fclose(fp_input);
    fclose(fp_output);
    return 0;
}
/* slide 113 */
gpro' O$tp$t
Hashing my home directory paths:
% time 38.07 36.33 25.95 self seconds 0.22 0.21 0.15 calls 47468 47468 47468 self us/call 4.65 4.44 3.17 total us/call 4.65 4.44 3.17 name hash1 hash2 hash3
Concl$sion
When should code optimization be done? What is a "safe" optimization? What method should I use?
11H
Start to think about optimization when your code is working. NOT BEFORE!!!
11L
It doesn't affect the logic of the function. It is done in a high-level language (no assembly in-lining). It doesn't obfuscate the code.
11B
Computers are extremely complex. Processor architectures are very different. When you think that some modification might improve the program... CHECK IT!
11N
(theorizing on performances has some limits, check reality from time to time)
120
Code Optimization
Questions?
121
*O4O
Using gcov (big part missing here) Compare access to array vs linked-list Split in three lectures (parts) on:
"Profiling Tools"
gprof, gcov, oprofile
"Memory Optimization"
Influence of data-structure on programs, impact of malloc and free, data locality and alignment of data
"Code Optimization"
Influence of code on programs
122