| OLD | NEW |
| 1 ; Test that loads/stores don't move across a nacl.atomic.fence.all. | 1 ; Test that loads/stores don't move across a nacl.atomic.fence.all. |
| 2 ; This should apply to both atomic and non-atomic loads/stores | 2 ; This should apply to both atomic and non-atomic loads/stores |
| 3 ; (unlike the non-"all" variety of nacl.atomic.fence, which only | 3 ; (unlike the non-"all" variety of nacl.atomic.fence, which only |
| 4 ; applies to atomic load/stores). | 4 ; applies to atomic load/stores). |
| 5 ; | 5 ; |
| 6 ; RUN: %p2i -i %s --args -O2 --verbose none \ | 6 ; RUN: %p2i -i %s --assemble --disassemble --args -O2 --verbose none \ |
| 7 ; RUN: | llvm-mc -triple=i686-none-nacl -filetype=obj \ | 7 ; RUN: | FileCheck %s |
| 8 ; RUN: | llvm-objdump -d -r -symbolize -x86-asm-syntax=intel - | FileCheck %s | |
| 9 | 8 |
| 10 ; TODO(jvoung): llvm-objdump doesn't symbolize global symbols well, so we | 9 ; TODO(jvoung): llvm-objdump doesn't symbolize global symbols well, so we |
| 11 ; have 0 == g32_a, 4 == g32_b, 8 == g32_c, 12 == g32_d | 10 ; have 0 == g32_a, 4 == g32_b, 8 == g32_c, 12 == g32_d |
| 12 | 11 |
| 13 declare void @llvm.nacl.atomic.fence.all() | 12 declare void @llvm.nacl.atomic.fence.all() |
| 14 declare i32 @llvm.nacl.atomic.load.i32(i32*, i32) | 13 declare i32 @llvm.nacl.atomic.load.i32(i32*, i32) |
| 15 declare void @llvm.nacl.atomic.store.i32(i32, i32*, i32) | 14 declare void @llvm.nacl.atomic.store.i32(i32, i32*, i32) |
| 16 | 15 |
| 17 @g32_a = internal global [4 x i8] zeroinitializer, align 4 | 16 @g32_a = internal global [4 x i8] zeroinitializer, align 4 |
| 18 @g32_b = internal global [4 x i8] zeroinitializer, align 4 | 17 @g32_b = internal global [4 x i8] zeroinitializer, align 4 |
| (...skipping 19 matching lines...) |
| 38 %p_c = bitcast [4 x i8]* @g32_c to i32* | 37 %p_c = bitcast [4 x i8]* @g32_c to i32* |
| 39 %l_c = load i32* %p_c, align 1 | 38 %l_c = load i32* %p_c, align 1 |
| 40 %l_c2 = add i32 %l_c, 1 | 39 %l_c2 = add i32 %l_c, 1 |
| 41 call void @llvm.nacl.atomic.fence.all() | 40 call void @llvm.nacl.atomic.fence.all() |
| 42 store i32 %l_c2, i32* %p_c, align 1 | 41 store i32 %l_c2, i32* %p_c, align 1 |
| 43 | 42 |
| 44 ret i32 %l_c2 | 43 ret i32 %l_c2 |
| 45 } | 44 } |
| 46 ; CHECK-LABEL: test_fused_load_add_a | 45 ; CHECK-LABEL: test_fused_load_add_a |
| 47 ; alloca store | 46 ; alloca store |
| 48 ; CHECK: mov {{.*}}, esp | 47 ; CHECK: mov {{.*}},esp |
| 49 ; CHECK: mov dword ptr {{.*}}, 999 | 48 ; CHECK: mov DWORD PTR {{.*}},0x3e7 |
| 50 ; atomic store (w/ its own mfence) | 49 ; atomic store (w/ its own mfence) |
| 51 ; The load + add are optimized into one everywhere. | 50 ; The load + add are optimized into one everywhere. |
| 52 ; CHECK: add {{.*}}, dword ptr [.bss] | 51 ; CHECK: add {{.*}},DWORD PTR [.bss] |
| 53 ; CHECK-NEXT: R_386_32 | 52 ; CHECK-NEXT: R_386_32 |
| 54 ; CHECK: mov dword ptr | 53 ; CHECK: mov DWORD PTR |
| 55 ; CHECK: mfence | 54 ; CHECK: mfence |
| 56 ; CHECK: add {{.*}}, dword ptr [.bss] | 55 ; CHECK: add {{.*}},DWORD PTR [.bss] |
| 57 ; CHECK-NEXT: R_386_32 | 56 ; CHECK-NEXT: R_386_32 |
| 58 ; CHECK: mov dword ptr | 57 ; CHECK: mov DWORD PTR |
| 59 ; CHECK: add {{.*}}, dword ptr [.bss] | 58 ; CHECK: add {{.*}},DWORD PTR [.bss] |
| 60 ; CHECK-NEXT: R_386_32 | 59 ; CHECK-NEXT: R_386_32 |
| 61 ; CHECK: mfence | 60 ; CHECK: mfence |
| 62 ; CHECK: mov dword ptr | 61 ; CHECK: mov DWORD PTR |
| 63 | 62 |
| 64 ; Test with the fence moved up a bit. | 63 ; Test with the fence moved up a bit. |
| 65 define i32 @test_fused_load_add_b() { | 64 define i32 @test_fused_load_add_b() { |
| 66 entry: | 65 entry: |
| 67 %p_alloca = alloca i8, i32 4, align 4 | 66 %p_alloca = alloca i8, i32 4, align 4 |
| 68 %p_alloca_bc = bitcast i8* %p_alloca to i32* | 67 %p_alloca_bc = bitcast i8* %p_alloca to i32* |
| 69 store i32 999, i32* %p_alloca_bc, align 1 | 68 store i32 999, i32* %p_alloca_bc, align 1 |
| 70 | 69 |
| 71 %p_a = bitcast [4 x i8]* @g32_a to i32* | 70 %p_a = bitcast [4 x i8]* @g32_a to i32* |
| 72 %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6) | 71 %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6) |
| 73 %l_a2 = add i32 %l_a, 1 | 72 %l_a2 = add i32 %l_a, 1 |
| 74 call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6) | 73 call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6) |
| 75 | 74 |
| 76 %p_b = bitcast [4 x i8]* @g32_b to i32* | 75 %p_b = bitcast [4 x i8]* @g32_b to i32* |
| 77 %l_b = load i32* %p_b, align 1 | 76 %l_b = load i32* %p_b, align 1 |
| 78 %l_b2 = add i32 %l_b, 1 | 77 %l_b2 = add i32 %l_b, 1 |
| 79 store i32 %l_b2, i32* %p_b, align 1 | 78 store i32 %l_b2, i32* %p_b, align 1 |
| 80 | 79 |
| 81 %p_c = bitcast [4 x i8]* @g32_c to i32* | 80 %p_c = bitcast [4 x i8]* @g32_c to i32* |
| 82 call void @llvm.nacl.atomic.fence.all() | 81 call void @llvm.nacl.atomic.fence.all() |
| 83 %l_c = load i32* %p_c, align 1 | 82 %l_c = load i32* %p_c, align 1 |
| 84 %l_c2 = add i32 %l_c, 1 | 83 %l_c2 = add i32 %l_c, 1 |
| 85 store i32 %l_c2, i32* %p_c, align 1 | 84 store i32 %l_c2, i32* %p_c, align 1 |
| 86 | 85 |
| 87 ret i32 %l_c2 | 86 ret i32 %l_c2 |
| 88 } | 87 } |
| 89 ; CHECK-LABEL: test_fused_load_add_b | 88 ; CHECK-LABEL: test_fused_load_add_b |
| 90 ; alloca store | 89 ; alloca store |
| 91 ; CHECK: mov {{.*}}, esp | 90 ; CHECK: mov {{.*}},esp |
| 92 ; CHECK: mov dword ptr {{.*}}, 999 | 91 ; CHECK: mov DWORD PTR {{.*}},0x3e7 |
| 93 ; atomic store (w/ its own mfence) | 92 ; atomic store (w/ its own mfence) |
| 94 ; CHECK: add {{.*}}, dword ptr [.bss] | 93 ; CHECK: add {{.*}},DWORD PTR [.bss] |
| 95 ; CHECK-NEXT: R_386_32 | 94 ; CHECK-NEXT: R_386_32 |
| 96 ; CHECK: mov dword ptr | 95 ; CHECK: mov DWORD PTR |
| 97 ; CHECK: mfence | 96 ; CHECK: mfence |
| 98 ; CHECK: add {{.*}}, dword ptr [.bss] | 97 ; CHECK: add {{.*}},DWORD PTR [.bss] |
| 99 ; CHECK-NEXT: R_386_32 | 98 ; CHECK-NEXT: R_386_32 |
| 100 ; CHECK: mov dword ptr | 99 ; CHECK: mov DWORD PTR |
| 101 ; CHECK: mfence | 100 ; CHECK: mfence |
| 102 ; Load + add can still be optimized into one instruction | 101 ; Load + add can still be optimized into one instruction |
| 103 ; because it is not separated by a fence. | 102 ; because it is not separated by a fence. |
| 104 ; CHECK: add {{.*}}, dword ptr [.bss] | 103 ; CHECK: add {{.*}},DWORD PTR [.bss] |
| 105 ; CHECK-NEXT: R_386_32 | 104 ; CHECK-NEXT: R_386_32 |
| 106 ; CHECK: mov dword ptr | 105 ; CHECK: mov DWORD PTR |
| 107 | 106 |
| 108 ; Test with the fence splitting a load/add. | 107 ; Test with the fence splitting a load/add. |
| 109 define i32 @test_fused_load_add_c() { | 108 define i32 @test_fused_load_add_c() { |
| 110 entry: | 109 entry: |
| 111 %p_alloca = alloca i8, i32 4, align 4 | 110 %p_alloca = alloca i8, i32 4, align 4 |
| 112 %p_alloca_bc = bitcast i8* %p_alloca to i32* | 111 %p_alloca_bc = bitcast i8* %p_alloca to i32* |
| 113 store i32 999, i32* %p_alloca_bc, align 1 | 112 store i32 999, i32* %p_alloca_bc, align 1 |
| 114 | 113 |
| 115 %p_a = bitcast [4 x i8]* @g32_a to i32* | 114 %p_a = bitcast [4 x i8]* @g32_a to i32* |
| 116 %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6) | 115 %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6) |
| 117 %l_a2 = add i32 %l_a, 1 | 116 %l_a2 = add i32 %l_a, 1 |
| 118 call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6) | 117 call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6) |
| 119 | 118 |
| 120 %p_b = bitcast [4 x i8]* @g32_b to i32* | 119 %p_b = bitcast [4 x i8]* @g32_b to i32* |
| 121 %l_b = load i32* %p_b, align 1 | 120 %l_b = load i32* %p_b, align 1 |
| 122 call void @llvm.nacl.atomic.fence.all() | 121 call void @llvm.nacl.atomic.fence.all() |
| 123 %l_b2 = add i32 %l_b, 1 | 122 %l_b2 = add i32 %l_b, 1 |
| 124 store i32 %l_b2, i32* %p_b, align 1 | 123 store i32 %l_b2, i32* %p_b, align 1 |
| 125 | 124 |
| 126 %p_c = bitcast [4 x i8]* @g32_c to i32* | 125 %p_c = bitcast [4 x i8]* @g32_c to i32* |
| 127 %l_c = load i32* %p_c, align 1 | 126 %l_c = load i32* %p_c, align 1 |
| 128 %l_c2 = add i32 %l_c, 1 | 127 %l_c2 = add i32 %l_c, 1 |
| 129 store i32 %l_c2, i32* %p_c, align 1 | 128 store i32 %l_c2, i32* %p_c, align 1 |
| 130 | 129 |
| 131 ret i32 %l_c2 | 130 ret i32 %l_c2 |
| 132 } | 131 } |
| 133 ; CHECK-LABEL: test_fused_load_add_c | 132 ; CHECK-LABEL: test_fused_load_add_c |
| 134 ; alloca store | 133 ; alloca store |
| 135 ; CHECK: mov {{.*}}, esp | 134 ; CHECK: mov {{.*}},esp |
| 136 ; CHECK: mov dword ptr {{.*}}, 999 | 135 ; CHECK: mov DWORD PTR {{.*}},0x3e7 |
| 137 ; atomic store (w/ its own mfence) | 136 ; atomic store (w/ its own mfence) |
| 138 ; CHECK: add {{.*}}, dword ptr [.bss] | 137 ; CHECK: add {{.*}},DWORD PTR [.bss] |
| 139 ; CHECK-NEXT: R_386_32 | 138 ; CHECK-NEXT: R_386_32 |
| 140 ; CHECK: mov dword ptr | 139 ; CHECK: mov DWORD PTR |
| 141 ; CHECK: mfence | 140 ; CHECK: mfence |
| 142 ; This load + add are no longer optimized into one, | 141 ; This load + add are no longer optimized into one, |
| 143 ; though perhaps it should be legal as long as | 142 ; though perhaps it should be legal as long as |
| 144 ; the load stays on the same side of the fence. | 143 ; the load stays on the same side of the fence. |
| 145 ; CHECK: mov {{.*}}, dword ptr [.bss] | 144 ; CHECK: mov {{.*}},DWORD PTR [.bss] |
| 146 ; CHECK-NEXT: R_386_32 | 145 ; CHECK-NEXT: R_386_32 |
| 147 ; CHECK: mfence | 146 ; CHECK: mfence |
| 148 ; CHECK: add {{.*}}, 1 | 147 ; CHECK: add {{.*}},0x1 |
| 149 ; CHECK: mov dword ptr | 148 ; CHECK: mov DWORD PTR |
| 150 ; CHECK: add {{.*}}, dword ptr [.bss] | 149 ; CHECK: add {{.*}},DWORD PTR [.bss] |
| 151 ; CHECK-NEXT: R_386_32 | 150 ; CHECK-NEXT: R_386_32 |
| 152 ; CHECK: mov dword ptr | 151 ; CHECK: mov DWORD PTR |
| 153 | 152 |
| 154 | 153 |
| 155 ; Test where a bunch of i8 loads could have been fused into one | 154 ; Test where a bunch of i8 loads could have been fused into one |
| 156 ; i32 load, but a fence blocks that. | 155 ; i32 load, but a fence blocks that. |
| 157 define i32 @could_have_fused_loads() { | 156 define i32 @could_have_fused_loads() { |
| 158 entry: | 157 entry: |
| 159 %ptr1 = bitcast [4 x i8]* @g32_d to i8* | 158 %ptr1 = bitcast [4 x i8]* @g32_d to i8* |
| 160 %b1 = load i8* %ptr1, align 1 | 159 %b1 = load i8* %ptr1, align 1 |
| 161 | 160 |
| 162 %int_ptr2 = ptrtoint [4 x i8]* @g32_d to i32 | 161 %int_ptr2 = ptrtoint [4 x i8]* @g32_d to i32 |
| (...skipping 17 matching lines...) |
| 180 %b12 = or i32 %b1.ext, %b2.shift | 179 %b12 = or i32 %b1.ext, %b2.shift |
| 181 %b3.ext = zext i8 %b3 to i32 | 180 %b3.ext = zext i8 %b3 to i32 |
| 182 %b3.shift = shl i32 %b3.ext, 16 | 181 %b3.shift = shl i32 %b3.ext, 16 |
| 183 %b123 = or i32 %b12, %b3.shift | 182 %b123 = or i32 %b12, %b3.shift |
| 184 %b4.ext = zext i8 %b4 to i32 | 183 %b4.ext = zext i8 %b4 to i32 |
| 185 %b4.shift = shl i32 %b4.ext, 24 | 184 %b4.shift = shl i32 %b4.ext, 24 |
| 186 %b1234 = or i32 %b123, %b4.shift | 185 %b1234 = or i32 %b123, %b4.shift |
| 187 ret i32 %b1234 | 186 ret i32 %b1234 |
| 188 } | 187 } |
| 189 ; CHECK-LABEL: could_have_fused_loads | 188 ; CHECK-LABEL: could_have_fused_loads |
| 190 ; CHECK: mov {{.*}}, byte ptr | 189 ; CHECK: mov {{.*}},BYTE PTR |
| 191 ; CHECK-NEXT: R_386_32 | 190 ; CHECK-NEXT: R_386_32 |
| 192 ; CHECK: mov {{.*}}, byte ptr | 191 ; CHECK: mov {{.*}},BYTE PTR |
| 193 ; CHECK: mov {{.*}}, byte ptr | 192 ; CHECK: mov {{.*}},BYTE PTR |
| 194 ; CHECK: mfence | 193 ; CHECK: mfence |
| 195 ; CHECK: mov {{.*}}, byte ptr | 194 ; CHECK: mov {{.*}},BYTE PTR |
| 196 | 195 |
| 197 | 196 |
| 198 ; Test where an identical load from two branches could have been hoisted | 197 ; Test where an identical load from two branches could have been hoisted |
| 199 ; up, and then the code merged, but a fence prevents it. | 198 ; up, and then the code merged, but a fence prevents it. |
| 200 define i32 @could_have_hoisted_loads(i32 %x) { | 199 define i32 @could_have_hoisted_loads(i32 %x) { |
| 201 entry: | 200 entry: |
| 202 %ptr = bitcast [4 x i8]* @g32_d to i32* | 201 %ptr = bitcast [4 x i8]* @g32_d to i32* |
| 203 %cmp = icmp eq i32 %x, 1 | 202 %cmp = icmp eq i32 %x, 1 |
| 204 br i1 %cmp, label %branch1, label %branch2 | 203 br i1 %cmp, label %branch1, label %branch2 |
| 205 branch1: | 204 branch1: |
| 206 %y = load i32* %ptr, align 1 | 205 %y = load i32* %ptr, align 1 |
| 207 ret i32 %y | 206 ret i32 %y |
| 208 branch2: | 207 branch2: |
| 209 call void @llvm.nacl.atomic.fence.all() | 208 call void @llvm.nacl.atomic.fence.all() |
| 210 %z = load i32* %ptr, align 1 | 209 %z = load i32* %ptr, align 1 |
| 211 ret i32 %z | 210 ret i32 %z |
| 212 } | 211 } |
| 213 ; CHECK-LABEL: could_have_hoisted_loads | 212 ; CHECK-LABEL: could_have_hoisted_loads |
| 214 ; CHECK: jne {{.*}} | 213 ; CHECK: jne {{.*}} |
| 215 ; CHECK: mov {{.*}}, dword ptr [.bss] | 214 ; CHECK: mov {{.*}},DWORD PTR [.bss] |
| 216 ; CHECK-NEXT: R_386_32 | 215 ; CHECK-NEXT: R_386_32 |
| 217 ; CHECK: ret | 216 ; CHECK: ret |
| 218 ; CHECK: mfence | 217 ; CHECK: mfence |
| 219 ; CHECK: mov {{.*}}, dword ptr [.bss] | 218 ; CHECK: mov {{.*}},DWORD PTR [.bss] |
| 220 ; CHECK-NEXT: R_386_32 | 219 ; CHECK-NEXT: R_386_32 |
| 221 ; CHECK: ret | 220 ; CHECK: ret |
| OLD | NEW |