% Goto and van de Geijn, level-3 BLAS paper in ACM TOMS 35(1).
% NOTE(review): year is taken from the citation key because the entry had
% none (required for article entries); the entry is still flagged as
% upcoming, so confirm the year and add pages/month once the issue is out.
@Article{Goto:2008:HPI,
  author   = "Goto, Kazushige and van de Geijn, Robert",
  title    = "High Performance Implementation of the Level-3 {BLAS}",
  journal  = "{ACM} Transactions on Mathematical Software",
  volume   = "35",
  number   = "1",
  year     = "2008",
  accepted = "28 October 2007",
  upcoming = "true",
  abstract = "A simple but highly effective approach for transforming high-performance implementations on cache-based architectures of matrix-matrix multiplication into implementations of other commonly used matrix-matrix computations (the level-3 BLAS) is presented. Exceptional performance is demonstrated on various architectures.",
}