Homework Assignment 11 Advanced Computer Architecture CS 350c Unique Number: 51160 Spring, 2016 Given: April 12, 2016 Due: April 21, 2016 This homework is designed to make you aware of what an optimizing compiler does. Remember, a compiler is effectively a part of an architecture; it makes it possible for users to port existing code to a new architecture. Consider the C code below: void copy( long int *s, long int *d, long int c ) { while( c-- ) *d++ = *s++; } Simply put, this C-language code copies data from one place to another. Below is the x86 assembler generated from the Clang/LLVM C-language compiler, Apple LLVM version 7.3.0 (clang-703.0.29), as produced on an Apple Macintosh laptop with a "-O1" (optimization) flag. Explain each line of the x86 assembler code below. .globl _copy .align 4, 0x90 _copy: ## @copy .cfi_startproc ## BB#0: pushq %rbp Ltmp0: .cfi_def_cfa_offset 16 Ltmp1: .cfi_offset %rbp, -16 movq %rsp, %rbp Ltmp2: .cfi_def_cfa_register %rbp testq %rdx, %rdx je LBB0_2 .align 4, 0x90 LBB0_1: ## %.lr.ph ## =>This Inner Loop Header: Depth=1 decq %rdx movq (%rdi), %rax leaq 8(%rdi), %rdi movq %rax, (%rsi) leaq 8(%rsi), %rsi jne LBB0_1 LBB0_2: ## %._crit_edge popq %rbp retq .cfi_endproc Below is the x86 assembler generated from the Clang/LLVM C compiler as produced on the same Apple Macintosh laptop with a "-O2" (optimization) flag. Wow! The amount of code exploded! Study the code below, and carefully explain what happened. 
.globl _copy .align 4, 0x90 _copy: ## @copy .cfi_startproc ## BB#0: pushq %rbp Ltmp0: .cfi_def_cfa_offset 16 Ltmp1: .cfi_offset %rbp, -16 movq %rsp, %rbp Ltmp2: .cfi_def_cfa_register %rbp testq %rdx, %rdx je LBB0_18 ## BB#1: ## %.lr.ph.preheader cmpq $4, %rdx jb LBB0_13 ## BB#2: ## %min.iters.checked movq %rdx, %r8 andq $-4, %r8 je LBB0_13 ## BB#3: ## %vector.memcheck leaq -8(%rdi,%rdx,8), %rax cmpq %rsi, %rax jb LBB0_5 ## BB#4: ## %vector.memcheck leaq -8(%rsi,%rdx,8), %rax cmpq %rdi, %rax jae LBB0_13 LBB0_5: ## %vector.body.preheader leaq -4(%rdx), %r9 movl %r9d, %eax shrl $2, %eax incl %eax xorl %ecx, %ecx testb $3, %al je LBB0_8 ## BB#6: ## %vector.body.prol.preheader leal -4(%rdx), %eax shrl $2, %eax incl %eax andl $3, %eax negq %rax xorl %ecx, %ecx .align 4, 0x90 LBB0_7: ## %vector.body.prol ## =>This Inner Loop Header: Depth=1 movups (%rdi,%rcx,8), %xmm0 movups 16(%rdi,%rcx,8), %xmm1 movups %xmm0, (%rsi,%rcx,8) movups %xmm1, 16(%rsi,%rcx,8) addq $4, %rcx incq %rax jne LBB0_7 LBB0_8: ## %vector.body.preheader.split cmpq $12, %r9 jb LBB0_11 ## BB#9: ## %vector.body.preheader.split.split movq %r8, %r9 subq %rcx, %r9 leaq 112(%rdi,%rcx,8), %rax leaq 112(%rsi,%rcx,8), %rcx .align 4, 0x90 LBB0_10: ## %vector.body ## =>This Inner Loop Header: Depth=1 movups -112(%rax), %xmm0 movups -96(%rax), %xmm1 movups %xmm0, -112(%rcx) movups %xmm1, -96(%rcx) movups -80(%rax), %xmm0 movups -64(%rax), %xmm1 movups %xmm0, -80(%rcx) movups %xmm1, -64(%rcx) movups -48(%rax), %xmm0 movups -32(%rax), %xmm1 movups %xmm0, -48(%rcx) movups %xmm1, -32(%rcx) movups -16(%rax), %xmm0 movups (%rax), %xmm1 movups %xmm0, -16(%rcx) movups %xmm1, (%rcx) subq $-128, %rax subq $-128, %rcx addq $-16, %r9 jne LBB0_10 LBB0_11: ## %middle.block cmpq %rdx, %r8 je LBB0_18 ## BB#12: subq %r8, %rdx leaq (%rsi,%r8,8), %rsi leaq (%rdi,%r8,8), %rdi LBB0_13: ## %.lr.ph.preheader32 leaq -1(%rdx), %r8 testb $7, %dl je LBB0_16 ## BB#14: ## %.lr.ph.prol.preheader movl %edx, %ecx andl $7, %ecx negq %rcx .align 4, 
0x90 LBB0_15: ## %.lr.ph.prol ## =>This Inner Loop Header: Depth=1 decq %rdx movq (%rdi), %rax addq $8, %rdi movq %rax, (%rsi) addq $8, %rsi incq %rcx jne LBB0_15 LBB0_16: ## %.lr.ph.preheader32.split cmpq $7, %r8 jb LBB0_18 .align 4, 0x90 LBB0_17: ## %.lr.ph ## =>This Inner Loop Header: Depth=1 movq (%rdi), %rax movq %rax, (%rsi) movq 8(%rdi), %rax movq %rax, 8(%rsi) movq 16(%rdi), %rax movq %rax, 16(%rsi) movq 24(%rdi), %rax movq %rax, 24(%rsi) movq 32(%rdi), %rax movq %rax, 32(%rsi) movq 40(%rdi), %rax movq %rax, 40(%rsi) movq 48(%rdi), %rax movq %rax, 48(%rsi) addq $-8, %rdx movq 56(%rdi), %rax leaq 64(%rdi), %rdi movq %rax, 56(%rsi) leaq 64(%rsi), %rsi jne LBB0_17 LBB0_18: ## %._crit_edge popq %rbp retq .cfi_endproc Finally, write both a simple copy program and an optimized (e.g., a 4-way loop-unrolled) copy program for our class SM (simple microprocessor), and execute them on SM -- just like you did for Laboratory 2. How much faster is your optimized version than your simple copy program?