% Technical report 2004-006 (February 2004), Department of Information
% Technology, Uppsala University. The abstract is kept on a single line so
% that its text is unchanged.
@TechReport{it:2004-006,
  author      = {Henrik L{\"o}f and Markus Nord{\'e}n and Sverker Holmgren},
  title       = {Improving Geographical Locality of Data for Shared Memory Implementations of {PDE} Solvers},
  institution = {Department of Information Technology, Uppsala University},
  department  = {Division of Scientific Computing},
  year        = {2004},
  number      = {2004-006},
  month       = feb,
  abstract    = {On cc-NUMA multi-processors, the non-uniformity of main memory latencies motivates the need for co-location of threads and data. We call this special form of data locality, \emph{geographical locality}, as the non-uniformity is a consequence of the physical distance between the cc-NUMA nodes. In this article, we compare the well established method of exploiting the first-touch strategy using parallel initialization of data to an application-initiated page migration strategy as means of increasing the geographical locality for a set of important scientific applications. Four PDE solvers parallelized using OpenMP are studied; two standard NAS NPB3.0-OMP benchmarks and two kernels from industrial applications. The solvers employ both structured and unstructured computational grids. The main conclusions of the study are: (1) that geographical locality is important for the performance of the applications, (2) that application-initiated migration outperforms the first-touch scheme in almost all cases, and in some cases even results in performance which is close to what is obtained if all threads and data are allocated on a single node. We also suggest that such an application-initiated migration could be made fully transparent by letting the OpenMP compiler invoke it automatically.}
}