Homework Assignment 11 Advanced Computer Architecture CS 350c Unique Number: 51160 Spring, 2016 Given: April 12, 2016 Due: April 21, 2016 This homework is designed to make you aware of what an optimizing compiler does. Remember, a compiler is effectively a part of an architecture; it makes it possible for users to port existing code to a new architecture. Consider the C code below: void copy( long int *s, long int *d, long int c ) { while( c-- ) *d++ = *s++; } Simply put, this C-language code copies data from one place to another. Below is the x86 assembler generated from the Clang/LLVM C-language compiler, Apple LLVM version 7.3.0 (clang-703.0.29), as produced on an Apple Macintosh laptop with a "-O1" (optimization) flag. Explain each line of the x86 assembler code below. .globl _copy .align 4, 0x90 _copy: ## @copy .cfi_startproc ## BB#0: pushq %rbp Ltmp0: .cfi_def_cfa_offset 16 Ltmp1: .cfi_offset %rbp, -16 movq %rsp, %rbp Ltmp2: .cfi_def_cfa_register %rbp testq %rdx, %rdx je LBB0_2 .align 4, 0x90 LBB0_1: ## %.lr.ph ## =>This Inner Loop Header: Depth=1 decq %rdx movq (%rdi), %rax leaq 8(%rdi), %rdi movq %rax, (%rsi) leaq 8(%rsi), %rsi jne LBB0_1 LBB0_2: ## %._crit_edge popq %rbp retq .cfi_endproc Below is the x86 assembler generated from the Clang/LLVM C compiler as produced on the same Apple Macintosh laptop with a "-O2" (optimization) flag. Wow! The amount of code exploded! Study the code below, and carefully explain what happened. 
.globl _copy .align 4, 0x90 _copy: ## @copy .cfi_startproc ## BB#0: pushq %rbp Ltmp0: .cfi_def_cfa_offset 16 Ltmp1: .cfi_offset %rbp, -16 movq %rsp, %rbp Ltmp2: .cfi_def_cfa_register %rbp testq %rdx, %rdx je LBB0_18 ## BB#1: ## %.lr.ph.preheader cmpq $4, %rdx jb LBB0_13 ## BB#2: ## %min.iters.checked movq %rdx, %r8 andq $-4, %r8 je LBB0_13 ## BB#3: ## %vector.memcheck leaq -8(%rdi,%rdx,8), %rax cmpq %rsi, %rax jb LBB0_5 ## BB#4: ## %vector.memcheck leaq -8(%rsi,%rdx,8), %rax cmpq %rdi, %rax jae LBB0_13 LBB0_5: ## %vector.body.preheader leaq -4(%rdx), %r9 movl %r9d, %eax shrl $2, %eax incl %eax xorl %ecx, %ecx testb $3, %al je LBB0_8 ## BB#6: ## %vector.body.prol.preheader leal -4(%rdx), %eax shrl $2, %eax incl %eax andl $3, %eax negq %rax xorl %ecx, %ecx .align 4, 0x90 LBB0_7: ## %vector.body.prol ## =>This Inner Loop Header: Depth=1 movups (%rdi,%rcx,8), %xmm0 movups 16(%rdi,%rcx,8), %xmm1 movups %xmm0, (%rsi,%rcx,8) movups %xmm1, 16(%rsi,%rcx,8) addq $4, %rcx incq %rax jne LBB0_7 LBB0_8: ## %vector.body.preheader.split cmpq $12, %r9 jb LBB0_11 ## BB#9: ## %vector.body.preheader.split.split movq %r8, %r9 subq %rcx, %r9 leaq 112(%rdi,%rcx,8), %rax leaq 112(%rsi,%rcx,8), %rcx .align 4, 0x90 LBB0_10: ## %vector.body ## =>This Inner Loop Header: Depth=1 movups -112(%rax), %xmm0 movups -96(%rax), %xmm1 movups %xmm0, -112(%rcx) movups %xmm1, -96(%rcx) movups -80(%rax), %xmm0 movups -64(%rax), %xmm1 movups %xmm0, -80(%rcx) movups %xmm1, -64(%rcx) movups -48(%rax), %xmm0 movups -32(%rax), %xmm1 movups %xmm0, -48(%rcx) movups %xmm1, -32(%rcx) movups -16(%rax), %xmm0 movups (%rax), %xmm1 movups %xmm0, -16(%rcx) movups %xmm1, (%rcx) subq $-128, %rax subq $-128, %rcx addq $-16, %r9 jne LBB0_10 LBB0_11: ## %middle.block cmpq %rdx, %r8 je LBB0_18 ## BB#12: subq %r8, %rdx leaq (%rsi,%r8,8), %rsi leaq (%rdi,%r8,8), %rdi LBB0_13: ## %.lr.ph.preheader32 leaq -1(%rdx), %r8 testb $7, %dl je LBB0_16 ## BB#14: ## %.lr.ph.prol.preheader movl %edx, %ecx andl $7, %ecx negq %rcx .align 4, 
0x90 LBB0_15: ## %.lr.ph.prol ## =>This Inner Loop Header: Depth=1 decq %rdx movq (%rdi), %rax addq $8, %rdi movq %rax, (%rsi) addq $8, %rsi incq %rcx jne LBB0_15 LBB0_16: ## %.lr.ph.preheader32.split cmpq $7, %r8 jb LBB0_18 .align 4, 0x90 LBB0_17: ## %.lr.ph ## =>This Inner Loop Header: Depth=1 movq (%rdi), %rax movq %rax, (%rsi) movq 8(%rdi), %rax movq %rax, 8(%rsi) movq 16(%rdi), %rax movq %rax, 16(%rsi) movq 24(%rdi), %rax movq %rax, 24(%rsi) movq 32(%rdi), %rax movq %rax, 32(%rsi) movq 40(%rdi), %rax movq %rax, 40(%rsi) movq 48(%rdi), %rax movq %rax, 48(%rsi) addq $-8, %rdx movq 56(%rdi), %rax leaq 64(%rdi), %rdi movq %rax, 56(%rsi) leaq 64(%rsi), %rsi jne LBB0_17 LBB0_18: ## %._crit_edge popq %rbp retq .cfi_endproc Finally, write both a simple copy program and an optimized (e.g., a 4-way loop-unrolled) copy program for our class SM (simple microprocessor), and execute them on SM -- just like you did for Laboratory 2. How much faster is your optimized version than your simple copy program?