@TechReport{it:2005-041,
  author      = {Zeffer, H{\aa}kan and Radovic, Zoran and Karlsson, Martin and Hagersten, Erik},
  title       = {{TMA}: A Trap-Based Memory Architecture},
  institution = {Department of Information Technology, Uppsala University},
  department  = {Division of Computer Systems},
  year        = {2005},
  number      = {2005-041},
  month       = dec,
  note        = {Revised version of Technical Report 2005-015},
  abstract    = {The advances in semiconductor technology have set the
                 shared-memory server trend towards processors with multiple
                 cores per die and multiple threads per core. We believe that
                 this technology shift forces a reevaluation of how to
                 interconnect multiple such chips to form larger systems. This
                 paper argues that by adding support for
                 \textit{coherence traps} in future chip multiprocessors,
                 large-scale server systems can be formed at a much lower
                 cost. This is due to shorter design time, verification and
                 time to market when compared to its traditional all-hardware
                 counterpart. In the proposed
                 \textit{trap-based memory architecture} (TMA), software trap
                 handlers are responsible for obtaining read/write permission,
                 whereas the coherence trap hardware is responsible for the
                 actual permission check. In this paper we evaluate a TMA
                 implementation (called \textit{TMA Lite}) with a minimal
                 amount of hardware extensions, all contained within the
                 processor. The proposed mechanisms for coherence trap
                 processing should not affect the critical path and have a
                 negligible cost in terms of area and power for most processor
                 designs. Our evaluation is based on detailed full system
                 simulation using out-of-order processors with one or two
                 dual-threaded cores per die as processing nodes. The results
                 show that a TMA based distributed shared memory system can on
                 average perform within 1 percent of a highly optimized
                 hardware based design.}
}