This is the fourth lab for my SPO600 class. For this lab we have to write a program that fills two arrays, each with 1000 elements, with random numbers in the range -1000 to +1000. The two arrays are added together, element by element, into a third array, and that third array is then summed. We will compile this program and use objdump to analyze the assembly code of our original program, and then of our new program once we rearrange the code to allow auto-vectorization.
Original code for the C program: https://pastebin.com/4Rtghd4p
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/*
 * Single-loop version of the lab program.
 *
 * Fills two arrays with pseudo-random values, adds them element by element
 * into a third array and prints the sum of that third array.  All of the
 * work is deliberately done inside ONE loop, so every iteration contains
 * two calls to rand().
 *
 * Returns 0 on completion.
 */
int main(void)
{
    enum {
        COUNT = 1000,     /* number of elements in each array */
        MIN_VALUE = -1000, /* smallest value that can be generated */
        MAX_VALUE = 1000   /* exclusive upper bound (see note below) */
    };

    int arr1[COUNT];
    int arr2[COUNT];
    int arr3[COUNT];
    time_t t;
    int sum = 0;
    int i;

    /* Seed the generator from the wall clock so each run differs. */
    srand((unsigned int)time(&t));

    for (i = 0; i < COUNT; i++) {
        /*
         * rand() % 2000 is 0..1999, so after adding MIN_VALUE each element
         * lies in [-1000, 999]; MAX_VALUE itself is never produced.
         */
        arr1[i] = rand() % (MAX_VALUE - MIN_VALUE) + MIN_VALUE;
        arr2[i] = rand() % (MAX_VALUE - MIN_VALUE) + MIN_VALUE;
        arr3[i] = arr1[i] + arr2[i];
        sum += arr3[i];
    }

    printf("The sum of the third array is %d \n", sum);
    return 0;
}
0000000000400560 <main>:
  /* Prologue: set up the stack frame, save callee-saved registers, load constants */
  400560: a9bb7bfd stp x29, x30, [sp, #-80]! // pre-decrement sp by 80 bytes, then save the frame pointer (x29) and link register (x30) at [sp]
  400564: 910003fd mov x29, sp // x29 = sp: establish this function's frame pointer
  400568: a9025bf5 stp x21, x22, [sp, #32] // save x21 and x22 at sp+32 (they are restored before ret)
  40056c: 5289ba75 mov w21, #0x4dd3 // #19923 // low 16 bits of the constant built in w21
  400570: a90153f3 stp x19, x20, [sp, #16] // save x19 and x20 at sp+16
  400574: 72a20c55 movk w21, #0x1062, lsl #16 // insert the high 16 bits, keeping the low half: w21 = 0x10624dd3 (about 2^39 / 2000), used below to divide by 2000 with a multiply and shift
  400578: f9001bf7 str x23, [sp, #48] // save x23 at sp+48
  40057c: 52807d13 mov w19, #0x3e8 // #1000 // w19 = 1000: loop counter, counted down by the subs at 4005a4
  400580: 5280fa14 mov w20, #0x7d0 // #2000 // w20 = 2000: the modulus used by the msub instructions
  400584: 910123a0 add x0, x29, #0x48 // x0 = x29 + 72: address of a stack slot passed to time() (the &t argument)
  400588: 52800017 mov w23, #0x0 // #0 // w23 = 0: running sum

  /* Seed the random number generator: srand(time(&t)) */
  40058c: 97ffffd9 bl 4004f0 <time@plt> // call time(); result comes back in x0
  400590: 97ffffec bl 400540 <srand@plt> // call srand() with that result still in w0

  /* The loop: 1000 iterations, two rand() calls each. Note that no array is ever stored to memory here. */
  400594: 97ffffdf bl 400510 <rand@plt> // loop top: first random number
  400598: 2a0003f6 mov w22, w0 // keep the first random number in w22 across the next call
  40059c: 97ffffdd bl 400510 <rand@plt> // second random number, left in w0
  4005a0: 9b357c03 smull x3, w0, w21 // x3 = second * 0x10624dd3 (signed 32x32 -> 64-bit multiply)
  4005a4: 71000673 subs w19, w19, #0x1 // decrement the loop counter and set the flags tested by b.ne at 4005d0
  4005a8: 9b357ec2 smull x2, w22, w21 // x2 = first * 0x10624dd3
  4005ac: 9367fc63 asr x3, x3, #39 // arithmetic shift right by 39: x3 = second / 2000 before sign correction
  4005b0: 4b807c63 sub w3, w3, w0, asr #31 // subtract the sign of w0 (w0 >> 31 is -1 if negative, else 0) so the quotient rounds toward zero; flags are not changed
  4005b4: 9367fc42 asr x2, x2, #39 // x2 = first / 2000 before sign correction
  4005b8: 4b967c42 sub w2, w2, w22, asr #31 // same sign correction for the first quotient
  4005bc: 1b148060 msub w0, w3, w20, w0 // w0 = w0 - (w3 * w20) = second % 2000
  4005c0: 1b14d842 msub w2, w2, w20, w22 // w2 = w22 - (w2 * w20) = first % 2000
  4005c4: 0b000040 add w0, w2, w0 // w0 = w2 + w0: sum of the two remainders
  4005c8: 511f4000 sub w0, w0, #0x7d0 // subtract 2000: the two "- 1000" offsets combined into one instruction
  4005cc: 0b0002f7 add w23, w23, w0 // w23 = w23 + w0: add this element to the running sum
  4005d0: 54fffe21 b.ne 400594 <main+0x34> // b.any // branch back to the loop top while the counter in w19 is not zero

  /* Print the sum */
  4005d4: 2a1703e1 mov w1, w23 // second printf argument = the sum
  4005d8: 90000000 adrp x0, 400000 <_init-0x4b8> // x0 = address of the 4 KB page at 0x400000
  4005dc: 911ee000 add x0, x0, #0x7b8 // x0 = 0x4007b8: first printf argument (the format string)
  4005e0: 97ffffdc bl 400550 <printf@plt> // call printf()

  /* Epilogue: restore saved registers and return */
  4005e4: a94153f3 ldp x19, x20, [sp, #16] // restore x19 and x20
  4005e8: a9425bf5 ldp x21, x22, [sp, #32] // restore x21 and x22
  4005ec: f9401bf7 ldr x23, [sp, #48] // restore x23
  4005f0: a8c57bfd ldp x29, x30, [sp], #80 // restore frame pointer and link register, then release the 80-byte frame
  4005f4: d65f03c0 ret // return to the caller via x30
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/*
 * Three-loop version of the lab program.
 *
 * Does the same work as the single-loop version, but split into three
 * separate passes: fill the two input arrays, add them into a third array,
 * then sum the third array.  The passes are kept separate on purpose: the
 * second and third loops contain no function calls, only array arithmetic,
 * which gives the compiler loops it can consider for auto-vectorization.
 *
 * Returns 0 on completion.
 */
int main(void)
{
    enum {
        COUNT = 1000,     /* number of elements in each array */
        MIN_VALUE = -1000, /* smallest value that can be generated */
        MAX_VALUE = 1000   /* exclusive upper bound (see note below) */
    };

    int arr1[COUNT];
    int arr2[COUNT];
    int arr3[COUNT];
    time_t t;
    int sum = 0;
    int i;

    /* Seed the generator from the wall clock so each run differs. */
    srand((unsigned int)time(&t));

    /*
     * Pass 1: fill the inputs.  rand() % 2000 is 0..1999, so after adding
     * MIN_VALUE each element lies in [-1000, 999]; MAX_VALUE itself is
     * never produced.
     */
    for (i = 0; i < COUNT; i++) {
        arr1[i] = rand() % (MAX_VALUE - MIN_VALUE) + MIN_VALUE;
        arr2[i] = rand() % (MAX_VALUE - MIN_VALUE) + MIN_VALUE;
    }

    /* Pass 2: element-wise addition into the third array. */
    for (i = 0; i < COUNT; i++) {
        arr3[i] = arr1[i] + arr2[i];
    }

    /* Pass 3: reduce the third array to a single sum. */
    for (i = 0; i < COUNT; i++) {
        sum += arr3[i];
    }

    printf("The sum of the third array is %d \n", sum);
    return 0;
}
The assembly code for the new C code after compiling it with the command gcc -O3 -ftree-vectorize -o array2 array.c:
0000000000400560 <main>:
  /* Prologue: allocate a 12080-byte frame (three 4000-byte int arrays plus 80 bytes for saved registers and locals) */
  400560: d285e610 mov x16, #0x2f30 // #12080 // frame size; too large for an immediate, so it is loaded into x16 first
  400564: cb3063ff sub sp, sp, x16 // sp -= 12080
  400568: a9007bfd stp x29, x30, [sp] // save frame pointer and link register at the bottom of the frame
  40056c: 910003fd mov x29, sp // x29 = sp: establish the frame pointer
  400570: 910123a0 add x0, x29, #0x48 // x0 = x29 + 72: address of the stack slot passed to time() (the &t argument)
  400574: a90153f3 stp x19, x20, [sp, #16] // save x19 and x20
  400578: 5289ba74 mov w20, #0x4dd3 // #19923 // low 16 bits of the constant built in w20
  40057c: a9025bf5 stp x21, x22, [sp, #32] // save x21 and x22
  400580: 72a20c54 movk w20, #0x1062, lsl #16 // w20 = 0x10624dd3 (about 2^39 / 2000), used to divide by 2000 with a multiply and shift
  400584: f9001bf7 str x23, [sp, #48] // save x23
  400588: 910143b6 add x22, x29, #0x50 // x22 = x29 + 80: base address of the first array
  40058c: 913fc3b5 add x21, x29, #0xff0 // x21 = x29 + 4080: base address of the second array (80 + 4000)
  400590: 5280fa13 mov w19, #0x7d0 // #2000 // w19 = 2000: the modulus used by msub
  400594: d2800017 mov x23, #0x0 // #0 // x23 = 0: byte offset into both arrays for the fill loop

  /* Seed the random number generator: srand(time(&t)) */
  400598: 97ffffd6 bl 4004f0 <time@plt> // call time(); result comes back in x0
  40059c: 97ffffe9 bl 400540 <srand@plt> // call srand() with that result still in w0

  /* Loop 1 (scalar): fill both arrays, one element of each per iteration */
  4005a0: 97ffffdc bl 400510 <rand@plt> // loop top: random number for the first array
  4005a4: 9b347c01 smull x1, w0, w20 // x1 = w0 * 0x10624dd3 (signed 32x32 -> 64-bit multiply)
  4005a8: 9367fc21 asr x1, x1, #39 // x1 = w0 / 2000 before sign correction
  4005ac: 4b807c21 sub w1, w1, w0, asr #31 // subtract the sign of w0 so the quotient rounds toward zero
  4005b0: 1b138020 msub w0, w1, w19, w0 // w0 = w0 - (w1 * 2000) = rand() % 2000
  4005b4: 510fa000 sub w0, w0, #0x3e8 // subtract 1000
  4005b8: b8376ac0 str w0, [x22, x23] // store the value into the first array at byte offset x23
  4005bc: 97ffffd5 bl 400510 <rand@plt> // random number for the second array
  4005c0: 9b347c01 smull x1, w0, w20 // same divide-by-2000 sequence as above
  4005c4: 9367fc21 asr x1, x1, #39
  4005c8: 4b807c21 sub w1, w1, w0, asr #31
  4005cc: 1b138020 msub w0, w1, w19, w0 // w0 = rand() % 2000
  4005d0: 510fa000 sub w0, w0, #0x3e8 // subtract 1000
  4005d4: b8376aa0 str w0, [x21, x23] // store the value into the second array at byte offset x23
  4005d8: 910012f7 add x23, x23, #0x4 // advance the offset by one 4-byte int
  4005dc: f13e82ff cmp x23, #0xfa0 // reached 4000 bytes (1000 ints)?
  4005e0: 54fffe01 b.ne 4005a0 <main+0x40> // b.any // if not, repeat the fill loop

  /* Loop 2 (vectorized): third array = first + second, four ints per iteration */
  4005e4: d283f202 mov x2, #0x1f90 // #8080 // offset of the third array within the frame (80 + 8000)
  4005e8: 8b0203a1 add x1, x29, x2 // x1 = base address of the third array
  4005ec: d2800000 mov x0, #0x0 // #0 // x0 = 0: byte offset for this loop
  4005f0: 3ce06ac0 ldr q0, [x22, x0] // load 128 bits (four ints) from the first array into vector register q0
  4005f4: 3ce06aa1 ldr q1, [x21, x0] // load four ints from the second array into q1
  4005f8: 4ea18400 add v0.4s, v0.4s, v1.4s // four 32-bit additions in a single instruction
  4005fc: 3ca06820 str q0, [x1, x0] // store the four results into the third array
  400600: 91004000 add x0, x0, #0x10 // advance the offset by 16 bytes (four ints)
  400604: f13e801f cmp x0, #0xfa0 // reached 4000 bytes?
  400608: 54ffff41 b.ne 4005f0 <main+0x90> // b.any // if not, repeat: 250 iterations instead of 1000

  /* Loop 3 (vectorized): sum the third array using four partial sums */
  40060c: 4f000400 movi v0.4s, #0x0 // clear the four 32-bit accumulator lanes of v0
  400610: aa0103e0 mov x0, x1 // x0 = pointer to the start of the third array
  400614: d285e601 mov x1, #0x2f30 // #12080
  400618: 8b0103a1 add x1, x29, x1 // x1 = x29 + 12080: address one past the end of the third array (8080 + 4000)
  40061c: 3cc10401 ldr q1, [x0], #16 // load four ints, then post-increment the pointer by 16 bytes
  400620: 4ea18400 add v0.4s, v0.4s, v1.4s // add them lane by lane into the four partial sums
  400624: eb01001f cmp x0, x1 // reached the end of the array?
  400628: 54ffffa1 b.ne 40061c <main+0xbc> // b.any // if not, repeat
  40062c: 4eb1b800 addv s0, v0.4s // add the four lanes together: s0 = total sum

  /* Print the sum */
  400630: 90000000 adrp x0, 400000 <_init-0x4b8> // x0 = address of the 4 KB page at 0x400000
  400634: 91208000 add x0, x0, #0x820 // x0 = 0x400820: first printf argument (the format string)
  400638: 0e043c01 mov w1, v0.s[0] // move the sum from vector lane 0 to w1: second printf argument
  40063c: 97ffffc5 bl 400550 <printf@plt> // call printf()

  /* Epilogue: restore saved registers, release the frame and return */
  400640: f9401bf7 ldr x23, [sp, #48] // restore x23
  400644: a94153f3 ldp x19, x20, [sp, #16] // restore x19 and x20
  400648: d285e610 mov x16, #0x2f30 // #12080 // frame size again, for the sp adjustment below
  40064c: a9425bf5 ldp x21, x22, [sp, #32] // restore x21 and x22
  400650: a9407bfd ldp x29, x30, [sp] // restore frame pointer and link register
  400654: 8b3063ff add sp, sp, x16 // sp += 12080: release the frame
  400658: d65f03c0 ret // return to the caller via x30
  40065c: 00000000 .inst 0x00000000 ; undefined // word after ret, never executed; presumably alignment padding
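The assembly code has grown since making the changes to the C code. In addition, we now see our first evidence of auto-vectorization, in the instructions shown in bold: the loads and stores that use the 128-bit q registers (for example ldr q0, [x22, x0]), the add v0.4s, v0.4s, v1.4s instructions, and the final addv s0, v0.4s. The v in the operand names is evidence of the vector registers being accessed, each one holding four 32-bit integers (.4s), so vectorization is finally taking place.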
My experience with this lab was interesting. I almost had to reverse engineer my code to allow the compiler to use auto-vectorization, by changing the program from doing all of its work in one for loop to doing it in separate loops. Auto-vectorization is mainly an optimization. I assume the reason the compiler did not use vectors for the first version of my C program is that it figured that using vectors there was not the most optimal way to compile it; a likely factor is that every iteration of that single loop calls rand(), a library function that has to be called once for each value, so the additions cannot be grouped four at a time. However, it does use vectorization for the second C program with the 3 for loops (the 3 loops on their own are less optimal than simply doing the operations in one loop), so it does actually help with optimization.