| author | Faith Ekstrand <faith.ekstrand@collabora.com> | 2025-07-24 19:47:33 +0000 |
|---|---|---|
| committer | Marge Bot <marge-bot@fdo.invalid> | 2025-10-16 01:19:44 +0000 |
| commit | 555881e57499bc38f098dd5859edecdf6bdad2a2 (patch) | |
| tree | 9f719a6e083794847be5660e32a070c680fc003a | |
| parent | 1dea86f7730931726476c4e0831e7aff0558c37a (diff) | |
util/cache_ops: Add some cache flush helpers
The x86 implementation was shamelessly stolen from intel_mem.c and the
aarch64 implementation was based on the code in Turnip.
Reviewed-by: Boris Brezillon <boris.brezillon@collabora.com>
Tested-by: Boris Brezillon <boris.brezillon@collabora.com>
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37803>
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | src/util/cache_ops.h | 115 |
| -rw-r--r-- | src/util/cache_ops_aarch64.c | 228 |
| -rw-r--r-- | src/util/cache_ops_null.c | 70 |
| -rw-r--r-- | src/util/cache_ops_x86.c | 129 |
| -rw-r--r-- | src/util/cache_ops_x86_clflushopt.c | 46 |
| -rw-r--r-- | src/util/meson.build | 24 |
6 files changed, 611 insertions, 1 deletion
diff --git a/src/util/cache_ops.h b/src/util/cache_ops.h
new file mode 100644
index 00000000000..2f9dce03946
--- /dev/null
+++ b/src/util/cache_ops.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright © 2025 Collabora Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef UTIL_CACHE_OPS_H
+#define UTIL_CACHE_OPS_H
+
+#include <stdbool.h>
+#include <stddef.h>
+
+#include "detect_arch.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** Returns true if we have cache operations available */
+static inline bool
+util_has_cache_ops(void)
+{
+   /* TODO: Port to MSVC if and when we have Windows hardware drivers that
+    * need cache flushing ops.
+    */
+#if defined(_MSC_VER)
+   return false;
+#endif
+
+   return DETECT_ARCH_X86 || DETECT_ARCH_X86_64 || DETECT_ARCH_AARCH64;
+}
+
+/** Returns the cache granularity
+ *
+ * This is the maximum number of bytes that may be overwritten as the result
+ * of a cache flush or cache line eviction. On big.LITTLE platforms, the
+ * cache flush helpers may sometimes operate at a smaller granularity but may
+ * also round up to at most util_cache_granularity().
+ *
+ * Vulkan drivers should return this as nonCoherentAtomSize.
+ */
+size_t util_cache_granularity(void);
+
+/** Flushes a range to main memory */
+void util_flush_range(void *start, size_t size);
+
+/** Flushes a range to main memory and invalidates those cache lines */
+void util_flush_inval_range(void *start, size_t size);
+
+/** Flushes a range to main memory without fencing
+ *
+ * This is for the case where you have a lot of ranges to flush and want to
+ * avoid unnecessary fencing. In this case, call
+ *
+ *    util_pre_flush_fence()
+ *    util_flush_range_no_fence()
+ *    util_flush_range_no_fence()
+ *    util_post_flush_fence()
+ */
+void util_flush_range_no_fence(void *start, size_t size);
+
+/** Flushes a range to main memory and invalidates those cache lines without
+ * fencing
+ *
+ * This is for the case where you have a lot of ranges to flush and invalidate
+ * and want to avoid unnecessary fencing. In this case, call
+ *
+ *    util_pre_flush_fence()
+ *    util_flush_inval_range_no_fence()
+ *    util_flush_range_no_fence()
+ *    util_flush_inval_range_no_fence()
+ *    util_post_flush_inval_fence()
+ */
+void util_flush_inval_range_no_fence(void *start, size_t size);
+
+/** Fence between memory access and cache flush operations
+ *
+ * see util_flush_range_no_fence()
+ */
+void util_pre_flush_fence(void);
+
+/** Fence between cache flush operations and memory access
+ *
+ * see util_flush_range_no_fence()
+ */
+void util_post_flush_fence(void);
+
+/** Fence between cache invalidate operations and memory access
+ *
+ * see util_flush_inval_range_no_fence()
+ */
+void util_post_flush_inval_fence(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* UTIL_CACHE_OPS_H */
diff --git a/src/util/cache_ops_aarch64.c b/src/util/cache_ops_aarch64.c
new file mode 100644
index 00000000000..d56ab8a9ece
--- /dev/null
+++ b/src/util/cache_ops_aarch64.c
@@ -0,0 +1,228 @@
+/*
+ * Copyright © 2025 Collabora Ltd. and Igalia S.L.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "cache_ops.h"
+
+#include "util/macros.h"
+#include "util/u_atomic.h"
+
+static uint32_t
+get_ctr_el0(void)
+{
+   uint32_t ctr_el0;
+   __asm("mrs\t%x0, ctr_el0" : "=r"(ctr_el0));
+   return ctr_el0;
+}
+
+static uint32_t
+get_ctr_cwg(void)
+{
+   return (get_ctr_el0() >> 24) & 0xf;
+}
+
+size_t
+util_cache_granularity(void)
+{
+   static uint32_t cached_size = 0;
+   uint32_t size = p_atomic_read(&cached_size);
+   if (likely(size > 0))
+      return size;
+
+   /* We use CTR_EL0.CWG as the cache granularity. According to Arm:
+    *
+    *    "CWG, [27:24]
+    *
+    *    Cache write-back granule. Log2 of the number of words of the maximum
+    *    size of memory that can be overwritten as a result of the eviction of
+    *    a cache entry that has had a memory location in it modified"
+    *
+    * On big.LITTLE CPUs, Linux will trap on fetching CTR_EL0 and take the
+    * maximum across all CPU cores so this should really be the maximum that
+    * drivers and clients can assume.
+    */
+   size = 4 << ((get_ctr_el0() >> 24) & 0xf);
+
+   p_atomic_set(&cached_size, size);
+   return size;
+}
+
+static size_t
+get_dmin_line(void)
+{
+   static uint32_t cached_size = 0;
+   uint32_t size = p_atomic_read(&cached_size);
+   if (likely(size > 0))
+      return size;
+
+   /* For walking cache lines, we want to use CTR_EL0.DminLine as the step
+    * size. According to Arm:
+    *
+    *    "DminLine, [19:16]
+    *
+    *    Log2 of the number of words in the smallest cache line of all the
+    *    data and unified caches that the core controls"
+    *
+    * On big.LITTLE CPUs, Linux will trap on fetching CTR_EL0 and take the
+    * minimum across all CPU cores so this should be safe no matter what core
+    * we happen to be living on.
+    */
+   size = 4 << ((get_ctr_el0() >> 16) & 0xf);
+
+   p_atomic_set(&cached_size, size);
+   return size;
+}
+
+static void
+flush_l1_cacheline(UNUSED void *p)
+{
+   /* Clean data cache. */
+   __asm volatile("dc cvac, %0" : : "r" (p) : "memory");
+}
+
+static void
+flush_inval_l1_cacheline(UNUSED void *p)
+{
+   /* Clean and Invalidate data cache, there is no separate Invalidate. */
+   __asm volatile("dc civac, %0" : : "r" (p) : "memory");
+}
+
+static void
+data_sync_bar(void)
+{
+   __asm volatile("dsb sy");
+}
+
+void
+util_flush_range_no_fence(void *start, size_t size)
+{
+   uintptr_t l1_cacheline_size = get_dmin_line();
+   char *p = (char *) (((uintptr_t) start) & ~(l1_cacheline_size - 1));
+   char *end = ((char *) start) + size;
+
+   while (p < end) {
+      flush_l1_cacheline(p);
+      p += l1_cacheline_size;
+   }
+}
+
+void
+util_flush_inval_range_no_fence(void *start, size_t size)
+{
+   uintptr_t l1_cacheline_size = get_dmin_line();
+   char *p = (char *) (((uintptr_t) start) & ~(l1_cacheline_size - 1));
+   char *end = ((char *) start) + size;
+
+   while (p < end) {
+      flush_inval_l1_cacheline(p);
+      p += l1_cacheline_size;
+   }
+}
+
+void
+util_flush_range(void *p, size_t size)
+{
+   if (size == 0)
+      return;
+
+   util_pre_flush_fence();
+   util_flush_range_no_fence(p, size);
+   util_post_flush_fence();
+}
+
+void
+util_flush_inval_range(void *p, size_t size)
+{
+   if (size == 0)
+      return;
+
+   util_pre_flush_fence();
+   util_flush_inval_range_no_fence(p, size);
+   util_post_flush_inval_fence();
+}
+
+void
+util_pre_flush_fence(void)
+{
+   /* From the Arm® Architecture Reference Manual (revision L.b):
+    *
+    *    "All data cache instructions, other than DC ZVA, DC GVA, and DC GZVA
+    *    that specify an address: [...] Execute in program order relative to
+    *    other data cache instructions, other than DC ZVA, DC GVA, and DC GZVA
+    *    that specify an address within the same cache line of minimum size,
+    *    as indicated by CTR_EL0.DMinLine."
+    *
+    * So cache flush operations are properly ordered against memory accesses
+    * and there's nothing we need to do to ensure that prior writes land
+    * before the cache flush operations flush the data.
+    *
+    * In the case where this pre_flush_fence() is called before a flush/inval
+    * used for a GPU -> CPU barrier, there is also nothing to do because it's
+    * the responsibility of the GPU to ensure that all memory writes have
+    * landed before we see this on the CPU side.
+    */
+}
+
+void
+util_post_flush_fence(void)
+{
+   /* From the Arm® Architecture Reference Manual (revision L.b):
+    *
+    *    "A cache maintenance instruction can complete at any time after it is
+    *    executed, but is only guaranteed to be complete, and its effects
+    *    visible to other observers, following a DSB instruction executed by
+    *    the PE that executed the cache maintenance instruction."
+    *
+    * In order to ensure that the GPU sees data flushed by prior cache flushes,
+    * we need to execute a DSB to ensure the flushes land.
+    */
+   data_sync_bar();
+}
+
+void
+util_post_flush_inval_fence(void)
+{
+   /* From the Arm® Architecture Reference Manual (revision L.b):
+    *
+    *    "All data cache instructions, other than DC ZVA, DC GVA, and DC GZVA
+    *    that specify an address: [...] Execute in program order relative to
+    *    other data cache instructions, other than DC ZVA, DC GVA, and DC GZVA
+    *    that specify an address within the same cache line of minimum size,
+    *    as indicated by CTR_EL0.DMinLine."
+    *
+    * This seems to imply that memory access that happens after the cache
+    * flush/invalidate operation would be properly ordered with respect to it.
+    * However, the manual also says:
+    *
+    *    "A cache maintenance instruction can complete at any time after it is
+    *    executed, but is only guaranteed to be complete, and its effects
+    *    visible to other observers, following a DSB instruction executed by
+    *    the PE that executed the cache maintenance instruction."
+    *
+    * In practice, it appears that the ordering guarantees only really apply
+    * to the queue order in the data cache and not the order in which
+    * operations complete. In other words, a read which is queued after the
+    * invalidate may still use the stale cache line unless we explicitly
+    * insert a DSB between them.
+    */
+   data_sync_bar();
+}
diff --git a/src/util/cache_ops_null.c b/src/util/cache_ops_null.c
new file mode 100644
index 00000000000..4bd5e370416
--- /dev/null
+++ b/src/util/cache_ops_null.c
@@ -0,0 +1,70 @@
+/*
+ * Copyright © 2025 Collabora Ltd.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "cache_ops.h"
+#include "util/macros.h"
+
+size_t
+util_cache_granularity()
+{
+   return 0;
+}
+
+void
+util_flush_range(void *start, size_t size)
+{
+   UNREACHABLE("Cache ops are not implemented on this platform");
+}
+
+void
+util_flush_inval_range(void *start, size_t size)
+{
+   UNREACHABLE("Cache ops are not implemented on this platform");
+}
+
+void
+util_flush_range_no_fence(void *start, size_t size)
+{
+   UNREACHABLE("Cache ops are not implemented on this platform");
+}
+
+void
+util_flush_inval_range_no_fence(void *start, size_t size)
+{
+   UNREACHABLE("Cache ops are not implemented on this platform");
+}
+
+void util_pre_flush_fence(void)
+{
+   UNREACHABLE("Cache ops are not implemented on this platform");
+}
+
+void util_post_flush_fence(void)
+{
+   UNREACHABLE("Cache ops are not implemented on this platform");
+}
+
+void util_post_flush_inval_fence(void)
+{
+   UNREACHABLE("Cache ops are not implemented on this platform");
+}
diff --git a/src/util/cache_ops_x86.c b/src/util/cache_ops_x86.c
new file mode 100644
index 00000000000..370296cc3b2
--- /dev/null
+++ b/src/util/cache_ops_x86.c
@@ -0,0 +1,129 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "cache_ops.h"
+#include "u_cpu_detect.h"
+
+#define CACHELINE_SIZE 64
+#define CACHELINE_MASK 63
+
+size_t
+util_cache_granularity(void)
+{
+   return util_get_cpu_caps()->cacheline;
+}
+
+/* Defined in cache_ops_x86_clflushopt.c */
+#ifdef HAVE___BUILTIN_IA32_CLFLUSHOPT
+void util_clflushopt_range(void *start, size_t size);
+#endif
+
+static void
+util_clflush_range(void *start, size_t size)
+{
+   char *p = (char *) (((uintptr_t) start) & ~CACHELINE_MASK);
+   char *end = ((char *) start) + size;
+
+   while (p < end) {
+      __builtin_ia32_clflush(p);
+      p += CACHELINE_SIZE;
+   }
+}
+
+void
+util_flush_range_no_fence(void *start, size_t size)
+{
+#ifdef HAVE___BUILTIN_IA32_CLFLUSHOPT
+   if (util_get_cpu_caps()->has_clflushopt) {
+      util_clflushopt_range(start, size);
+      return;
+   }
+#endif
+   util_clflush_range(start, size);
+}
+
+void
+util_flush_range(void *start, size_t size)
+{
+   __builtin_ia32_mfence();
+   util_clflush_range(start, size);
+#ifdef HAVE___BUILTIN_IA32_CLFLUSHOPT
+   /* clflushopt doesn't include an mfence like clflush */
+   if (util_get_cpu_caps()->has_clflushopt)
+      __builtin_ia32_mfence();
+#endif
+}
+
+void
+util_flush_inval_range_no_fence(void *start, size_t size)
+{
+   if (size == 0)
+      return;
+
+   util_flush_range_no_fence(start, size);
+
+   /* Modern Atom CPUs (Baytrail+) have issues with clflush serialization,
+    * where mfence is not a sufficient synchronization barrier. We must
+    * double clflush the last cacheline. This guarantees it will be ordered
+    * after the preceding clflushes, and then the mfence guards against
+    * prefetches crossing the clflush boundary.
+    *
+    * See kernel commit 396f5d62d1a5fd99421855a08ffdef8edb43c76e
+    * ("drm: Restore double clflush on the last partial cacheline")
+    * and https://bugs.freedesktop.org/show_bug.cgi?id=92845.
+    */
+#ifdef HAVE___BUILTIN_IA32_CLFLUSHOPT
+   if (util_get_cpu_caps()->has_clflushopt) {
+      /* clflushopt doesn't include an mfence like clflush */
+      __builtin_ia32_mfence();
+      util_clflushopt_range((char *)start + size - 1, 1);
+      return;
+   }
+#endif
+   __builtin_ia32_clflush((char *)start + size - 1);
+}
+
+void
+util_flush_inval_range(void *start, size_t size)
+{
+   util_flush_inval_range_no_fence(start, size);
+   __builtin_ia32_mfence();
+}
+
+void
+util_pre_flush_fence(void)
+{
+   __builtin_ia32_mfence();
+}
+
+void
+util_post_flush_fence(void)
+{
+   __builtin_ia32_mfence();
+}
+
+void
+util_post_flush_inval_fence(void)
+{
+   __builtin_ia32_mfence();
+}
diff --git a/src/util/cache_ops_x86_clflushopt.c b/src/util/cache_ops_x86_clflushopt.c
new file mode 100644
index 00000000000..ca97b9000f6
--- /dev/null
+++ b/src/util/cache_ops_x86_clflushopt.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "util/u_cpu_detect.h"
+
+#ifndef HAVE___BUILTIN_IA32_CLFLUSHOPT
+#error "Compiler doesn't support clflushopt!"
+#endif
+
+void util_clflushopt_range(void *start, size_t size);
+
+void
+util_clflushopt_range(void *start, size_t size)
+{
+   const struct util_cpu_caps_t *cpu_caps = util_get_cpu_caps();
+   assert(cpu_caps->has_clflushopt);
+   assert(cpu_caps->cacheline > 0);
+   void *p = (void *) (((uintptr_t) start) &
+                       ~((uintptr_t)cpu_caps->cacheline - 1));
+   void *end = start + size;
+
+   while (p < end) {
+      __builtin_ia32_clflushopt(p);
+      p += cpu_caps->cacheline;
+   }
+}
diff --git a/src/util/meson.build b/src/util/meson.build
index a10b987e23a..5c3ac1a9b29 100644
--- a/src/util/meson.build
+++ b/src/util/meson.build
@@ -25,6 +25,7 @@ files_mesa_util = files(
   'box.h',
   'build_id.c',
   'build_id.h',
+  'cache_ops.h',
   'cnd_monotonic.c',
   'cnd_monotonic.h',
   'compiler.h',
@@ -182,6 +183,26 @@ files_mesa_util = files(
   'mesa_cache_db_multipart.h',
 )
 
+libmesa_util_links = []
+
+if host_machine.cpu_family() == 'aarch64' and cc.get_id() != 'msvc'
+  files_mesa_util += files('cache_ops_aarch64.c')
+elif host_machine.cpu_family() in ['x86', 'x86_64'] and cc.get_id() != 'msvc'
+  files_mesa_util += files('cache_ops_x86.c')
+  if with_clflushopt
+    libmesa_util_clflushopt = static_library(
+      'mesa_util_clflushopt',
+      ['cache_ops_x86_clflushopt.c'],
+      include_directories : [inc_util],
+      c_args : [no_override_init_args] + clflushopt_args,
+      gnu_symbol_visibility : 'hidden',
+    )
+    libmesa_util_links += libmesa_util_clflushopt
+  endif
+else
+  files_mesa_util += files('cache_ops_null.c')
+endif
+
 files_drirc = files('00-mesa-defaults.conf')
 
 if with_amd_vk
@@ -304,13 +325,14 @@ libmesa_util_simd = static_library(
   gnu_symbol_visibility : 'hidden',
   build_by_default : false,
 )
+libmesa_util_links += libmesa_util_simd
 
 _libmesa_util = static_library(
   'mesa_util',
   [files_mesa_util, files_debug_stack, format_srgb],
   include_directories : [inc_util, include_directories('format')],
   dependencies : deps_for_libmesa_util,
-  link_with: [libmesa_util_simd],
+  link_with: libmesa_util_links,
   c_args : [c_msvc_compat_args],
   gnu_symbol_visibility : 'hidden',
   build_by_default : false
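For reference, the batched no-fence pattern documented in cache_ops.h would be used by a caller roughly as in the sketch below. This is an illustration only, not part of the commit: `struct range`, `ranges`, `num_ranges`, and `flush_ranges_for_gpu()` are hypothetical names; only the `util_*` helpers come from this patch.

```c
#include "util/cache_ops.h"

/* Hypothetical range descriptor, used only for this example. */
struct range {
   void *ptr;
   size_t size;
};

/* Flush several CPU-written ranges before handing them to a non-coherent GPU,
 * paying for a single fence pair instead of one fence per range.
 */
static void
flush_ranges_for_gpu(const struct range *ranges, unsigned num_ranges)
{
   if (!util_has_cache_ops())
      return;

   util_pre_flush_fence();
   for (unsigned i = 0; i < num_ranges; i++)
      util_flush_range_no_fence(ranges[i].ptr, ranges[i].size);
   util_post_flush_fence();
}
```

The same shape works for the invalidate path by swapping in util_flush_inval_range_no_fence() and util_post_flush_inval_fence(), per the comments in cache_ops.h.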