| OLD | NEW |
| (Empty) |
| 1 // Copyright 2010 the V8 project authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 // This file is an internal atomic implementation, use atomicops.h instead. | |
| 6 // | |
| 7 // LinuxKernelCmpxchg and Barrier_AtomicIncrement are from Google Gears. | |
| 8 | |
| 9 #ifndef V8_BASE_ATOMICOPS_INTERNALS_ARM_GCC_H_ | |
| 10 #define V8_BASE_ATOMICOPS_INTERNALS_ARM_GCC_H_ | |
| 11 | |
| 12 #if defined(__QNXNTO__) | |
| 13 #include <sys/cpuinline.h> | |
| 14 #endif | |
| 15 | |
| 16 namespace v8 { | |
| 17 namespace base { | |
| 18 | |
| 19 // Memory barriers on ARM are funky, but the kernel is here to help: | |
| 20 // | |
| 21 // * ARMv5 didn't support SMP, there is no memory barrier instruction at | |
| 22 // all on this architecture, or when targeting its machine code. | |
| 23 // | |
| 24 // * Some ARMv6 CPUs support SMP. A full memory barrier can be produced by | |
| 25 // writing a random value to a very specific coprocessor register. | |
| 26 // | |
| 27 // * On ARMv7, the "dmb" instruction is used to perform a full memory | |
| 28 // barrier (though writing to the co-processor will still work). | |
| 29 // However, on single core devices (e.g. Nexus One, or Nexus S), | |
| 30 // this instruction will take up to 200 ns, which is huge, even though | |
| 31 // it's completely un-needed on these devices. | |
| 32 // | |
| 33 // * There is no easy way to determine at runtime if the device is | |
| 34 // single or multi-core. However, the kernel provides a useful helper | |
| 35 // function at a fixed memory address (0xffff0fa0), which will always | |
| 36 // perform a memory barrier in the most efficient way. I.e. on single | |
| 37 // core devices, this is an empty function that exits immediately. | |
| 38 // On multi-core devices, it implements a full memory barrier. | |
| 39 // | |
| 40 // * This source could be compiled to ARMv5 machine code that runs on a | |
| 41 // multi-core ARMv6 or ARMv7 device. In this case, memory barriers | |
| 42 // are needed for correct execution. Always call the kernel helper, even | |
| 43 // when targeting ARMv5TE. | |
| 44 // | |
| 45 | |
// Issues a full memory barrier using the mechanism selected at compile time:
// the kernel helper at 0xffff0fa0 on Android, __cpu_membarrier() on QNX, and
// the GCC __sync_synchronize() builtin on every other target.
inline void MemoryBarrier() {
#if defined(__ANDROID__)
  // This is a real function call, which doubles as an implicit compiler
  // barrier.
  typedef void (*BarrierHelper)();
  BarrierHelper kernel_barrier = reinterpret_cast<BarrierHelper>(0xffff0fa0);
  kernel_barrier();
#elif defined(__QNXNTO__)
  __cpu_membarrier();
#else
  // No platform-specific helper: fall back to the GCC built-in.
  __sync_synchronize();
#endif
}
| 58 | |
| 59 // An ARM toolchain would only define one of these depending on which | |
| 60 // variant of the target architecture is being used. This tests against | |
| 61 // any known ARMv6 or ARMv7 variant, where it is possible to directly | |
| 62 // use ldrex/strex instructions to implement fast atomic operations. | |
| 63 #if defined(__ARM_ARCH_8A__) || \ | |
| 64 defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \ | |
| 65 defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || \ | |
| 66 defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \ | |
| 67 defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || \ | |
| 68 defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) | |
| 69 | |
| 70 inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr, | |
| 71 Atomic32 old_value, | |
| 72 Atomic32 new_value) { | |
| 73 Atomic32 prev_value; | |
| 74 int reloop; | |
| 75 do { | |
| 76 // The following is equivalent to: | |
| 77 // | |
| 78 // prev_value = LDREX(ptr) | |
| 79 // reloop = 0 | |
| 80 // if (prev_value != old_value) | |
| 81 // reloop = STREX(ptr, new_value) | |
| 82 __asm__ __volatile__(" ldrex %0, [%3]\n" | |
| 83 " mov %1, #0\n" | |
| 84 " cmp %0, %4\n" | |
| 85 #ifdef __thumb2__ | |
| 86 " it eq\n" | |
| 87 #endif | |
| 88 " strexeq %1, %5, [%3]\n" | |
| 89 : "=&r"(prev_value), "=&r"(reloop), "+m"(*ptr) | |
| 90 : "r"(ptr), "r"(old_value), "r"(new_value) | |
| 91 : "cc", "memory"); | |
| 92 } while (reloop != 0); | |
| 93 return prev_value; | |
| 94 } | |
| 95 | |
| 96 inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr, | |
| 97 Atomic32 old_value, | |
| 98 Atomic32 new_value) { | |
| 99 Atomic32 result = NoBarrier_CompareAndSwap(ptr, old_value, new_value); | |
| 100 MemoryBarrier(); | |
| 101 return result; | |
| 102 } | |
| 103 | |
| 104 inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr, | |
| 105 Atomic32 old_value, | |
| 106 Atomic32 new_value) { | |
| 107 MemoryBarrier(); | |
| 108 return NoBarrier_CompareAndSwap(ptr, old_value, new_value); | |
| 109 } | |
| 110 | |
| 111 inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr, | |
| 112 Atomic32 increment) { | |
| 113 Atomic32 value; | |
| 114 int reloop; | |
| 115 do { | |
| 116 // Equivalent to: | |
| 117 // | |
| 118 // value = LDREX(ptr) | |
| 119 // value += increment | |
| 120 // reloop = STREX(ptr, value) | |
| 121 // | |
| 122 __asm__ __volatile__(" ldrex %0, [%3]\n" | |
| 123 " add %0, %0, %4\n" | |
| 124 " strex %1, %0, [%3]\n" | |
| 125 : "=&r"(value), "=&r"(reloop), "+m"(*ptr) | |
| 126 : "r"(ptr), "r"(increment) | |
| 127 : "cc", "memory"); | |
| 128 } while (reloop); | |
| 129 return value; | |
| 130 } | |
| 131 | |
| 132 inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr, | |
| 133 Atomic32 increment) { | |
| 134 // TODO(digit): Investigate if it's possible to implement this with | |
| 135 // a single MemoryBarrier() operation between the LDREX and STREX. | |
| 136 // See http://crbug.com/246514 | |
| 137 MemoryBarrier(); | |
| 138 Atomic32 result = NoBarrier_AtomicIncrement(ptr, increment); | |
| 139 MemoryBarrier(); | |
| 140 return result; | |
| 141 } | |
| 142 | |
| 143 inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr, | |
| 144 Atomic32 new_value) { | |
| 145 Atomic32 old_value; | |
| 146 int reloop; | |
| 147 do { | |
| 148 // old_value = LDREX(ptr) | |
| 149 // reloop = STREX(ptr, new_value) | |
| 150 __asm__ __volatile__(" ldrex %0, [%3]\n" | |
| 151 " strex %1, %4, [%3]\n" | |
| 152 : "=&r"(old_value), "=&r"(reloop), "+m"(*ptr) | |
| 153 : "r"(ptr), "r"(new_value) | |
| 154 : "cc", "memory"); | |
| 155 } while (reloop != 0); | |
| 156 return old_value; | |
| 157 } | |
| 158 | |
| 159 // This tests against any known ARMv5 variant. | |
| 160 #elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || \ | |
| 161 defined(__ARM_ARCH_5TE__) || defined(__ARM_ARCH_5TEJ__) | |
| 162 | |
| 163 // The kernel also provides a helper function to perform an atomic | |
| 164 // compare-and-swap operation at the hard-wired address 0xffff0fc0. | |
| 165 // On ARMv5, this is implemented by a special code path that the kernel | |
| 166 // detects and treats specially when thread pre-emption happens. | |
| 167 // On ARMv6 and higher, it uses LDREX/STREX instructions instead. | |
| 168 // | |
| 169 // Note that this always performs a full memory barrier, so there is no | |
| 170 // need to add calls to MemoryBarrier() before or after it. It also | |
| 171 // returns 0 on success, and a non-zero value on failure. | |
| 172 // | |
| 173 // Available and reliable since Linux 2.6.24. Both Android and ChromeOS | |
| 174 // use newer kernel revisions, so this should not be a concern. | |
| 175 namespace { | |
| 176 | |
| 177 inline int LinuxKernelCmpxchg(Atomic32 old_value, | |
| 178 Atomic32 new_value, | |
| 179 volatile Atomic32* ptr) { | |
| 180 typedef int (*KernelCmpxchgFunc)(Atomic32, Atomic32, volatile Atomic32*); | |
| 181 return ((KernelCmpxchgFunc)0xffff0fc0)(old_value, new_value, ptr); | |
| 182 } | |
| 183 | |
| 184 } // namespace | |
| 185 | |
| 186 inline Atomic32 NoBarrier_CompareAndSwap(volatile Atomic32* ptr, | |
| 187 Atomic32 old_value, | |
| 188 Atomic32 new_value) { | |
| 189 Atomic32 prev_value; | |
| 190 for (;;) { | |
| 191 prev_value = *ptr; | |
| 192 if (prev_value != old_value) | |
| 193 return prev_value; | |
| 194 if (!LinuxKernelCmpxchg(old_value, new_value, ptr)) | |
| 195 return old_value; | |
| 196 } | |
| 197 } | |
| 198 | |
| 199 inline Atomic32 NoBarrier_AtomicExchange(volatile Atomic32* ptr, | |
| 200 Atomic32 new_value) { | |
| 201 Atomic32 old_value; | |
| 202 do { | |
| 203 old_value = *ptr; | |
| 204 } while (LinuxKernelCmpxchg(old_value, new_value, ptr)); | |
| 205 return old_value; | |
| 206 } | |
| 207 | |
| 208 inline Atomic32 NoBarrier_AtomicIncrement(volatile Atomic32* ptr, | |
| 209 Atomic32 increment) { | |
| 210 return Barrier_AtomicIncrement(ptr, increment); | |
| 211 } | |
| 212 | |
| 213 inline Atomic32 Barrier_AtomicIncrement(volatile Atomic32* ptr, | |
| 214 Atomic32 increment) { | |
| 215 for (;;) { | |
| 216 // Atomic exchange the old value with an incremented one. | |
| 217 Atomic32 old_value = *ptr; | |
| 218 Atomic32 new_value = old_value + increment; | |
| 219 if (!LinuxKernelCmpxchg(old_value, new_value, ptr)) { | |
| 220 // The exchange took place as expected. | |
| 221 return new_value; | |
| 222 } | |
| 223 // Otherwise, *ptr changed mid-loop and we need to retry. | |
| 224 } | |
| 225 } | |
| 226 | |
| 227 inline Atomic32 Acquire_CompareAndSwap(volatile Atomic32* ptr, | |
| 228 Atomic32 old_value, | |
| 229 Atomic32 new_value) { | |
| 230 Atomic32 prev_value; | |
| 231 for (;;) { | |
| 232 prev_value = *ptr; | |
| 233 if (prev_value != old_value) { | |
| 234 // Always ensure acquire semantics. | |
| 235 MemoryBarrier(); | |
| 236 return prev_value; | |
| 237 } | |
| 238 if (!LinuxKernelCmpxchg(old_value, new_value, ptr)) | |
| 239 return old_value; | |
| 240 } | |
| 241 } | |
| 242 | |
| 243 inline Atomic32 Release_CompareAndSwap(volatile Atomic32* ptr, | |
| 244 Atomic32 old_value, | |
| 245 Atomic32 new_value) { | |
| 246 // This could be implemented as: | |
| 247 // MemoryBarrier(); | |
| 248 // return NoBarrier_CompareAndSwap(); | |
| 249 // | |
| 250 // But would use 3 barriers per succesful CAS. To save performance, | |
| 251 // use Acquire_CompareAndSwap(). Its implementation guarantees that: | |
| 252 // - A succesful swap uses only 2 barriers (in the kernel helper). | |
| 253 // - An early return due to (prev_value != old_value) performs | |
| 254 // a memory barrier with no store, which is equivalent to the | |
| 255 // generic implementation above. | |
| 256 return Acquire_CompareAndSwap(ptr, old_value, new_value); | |
| 257 } | |
| 258 | |
| 259 #else | |
| 260 # error "Your CPU's ARM architecture is not supported yet" | |
| 261 #endif | |
| 262 | |
| 263 // NOTE: Atomicity of the following load and store operations is only | |
| 264 // guaranteed in case of 32-bit alignment of |ptr| values. | |
| 265 | |
| 266 inline void NoBarrier_Store(volatile Atomic32* ptr, Atomic32 value) { | |
| 267 *ptr = value; | |
| 268 } | |
| 269 | |
| 270 inline void Acquire_Store(volatile Atomic32* ptr, Atomic32 value) { | |
| 271 *ptr = value; | |
| 272 MemoryBarrier(); | |
| 273 } | |
| 274 | |
| 275 inline void Release_Store(volatile Atomic32* ptr, Atomic32 value) { | |
| 276 MemoryBarrier(); | |
| 277 *ptr = value; | |
| 278 } | |
| 279 | |
| 280 inline Atomic32 NoBarrier_Load(volatile const Atomic32* ptr) { return *ptr; } | |
| 281 | |
| 282 inline Atomic32 Acquire_Load(volatile const Atomic32* ptr) { | |
| 283 Atomic32 value = *ptr; | |
| 284 MemoryBarrier(); | |
| 285 return value; | |
| 286 } | |
| 287 | |
| 288 inline Atomic32 Release_Load(volatile const Atomic32* ptr) { | |
| 289 MemoryBarrier(); | |
| 290 return *ptr; | |
| 291 } | |
| 292 | |
| 293 // Byte accessors. | |
| 294 | |
| 295 inline void NoBarrier_Store(volatile Atomic8* ptr, Atomic8 value) { | |
| 296 *ptr = value; | |
| 297 } | |
| 298 | |
| 299 inline Atomic8 NoBarrier_Load(volatile const Atomic8* ptr) { return *ptr; } | |
| 300 | |
| 301 } // namespace base | |
| 302 } // namespace v8 | |
| 303 | |
| 304 #endif // V8_BASE_ATOMICOPS_INTERNALS_ARM_GCC_H_ | |
| OLD | NEW |