% Technical report 2017-023 (October 2017), Department of Information Technology,
% Uppsala University: software out-of-order execution (SWOOP).
% NOTE(review): `department` is not a standard BibTeX field; standard styles
% are expected to ignore it -- confirm against the style in use.
@TechReport{it:2017-023,
  author      = {Kim-Anh Tran and Alexandra Jimborean and Trevor E. Carlson and Magnus Sj{\"a}lander and Konstantinos Koukos and Stefanos Kaxiras},
  title       = {Transcending Hardware Limits with Software Out-Of-Order Execution},
  institution = {Department of Information Technology, Uppsala University},
  department  = {Division of Computer Systems},
  number      = {2017-023},
  year        = {2017},
  month       = oct,
  abstract    = {Reducing the widening gap between processor and memory speed has been steering processors' design over the last decade, as memory accesses became the main performance bottleneck. Out-of-order architectures attempt to hide memory latency by dynamically reordering instructions, while in-order architectures are restricted to static instruction schedules. We propose a software-hardware co-design to break out of the hardware limits of existing architectures and attain increased memory and instruction level parallelism by orchestrating coarse-grain out-of-program-order execution in software (SWOOP). On in-order architectures, SWOOP acts as a virtual reorder buffer (ROB) while out-of-order architectures are endowed with the ability to jump ahead to independent code, far beyond the reach of the ROB. We build upon the decoupled access-execute model, however, executed in a single superscalar pipeline and within a single thread of control. The compiler generates the Access and Execute code slices and orchestrates their execution out-of-order, with the support of frugal microarchitectural enhancements to maximize efficiency. SWOOP significantly improves the performance of memory-bound applications by 42\% on in-order cores, and by 43\% on out-of-order architectures. Furthermore, not only is SWOOP competitive with out-of-order cores which benefit from double-sized reorder buffers, but it is also considerably more energy efficient. }
}