OLD | NEW |
1 ; Test that loads/stores don't move across a nacl.atomic.fence.all. | 1 ; Test that loads/stores don't move across a nacl.atomic.fence.all. |
2 ; This should apply to both atomic and non-atomic loads/stores | 2 ; This should apply to both atomic and non-atomic loads/stores |
3 ; (unlike the non-"all" variety of nacl.atomic.fence, which only | 3 ; (unlike the non-"all" variety of nacl.atomic.fence, which only |
4 ; applies to atomic loads/stores). | 4 ; applies to atomic loads/stores). |
5 ; | 5 ; |
6 ; TODO(kschimpf) Find out why lc2i is needed. | 6 ; TODO(kschimpf) Find out why lc2i is needed. |
7 ; RUN: %lc2i -i %s --args -O2 --verbose none \ | 7 ; RUN: %lc2i -i %s --args -O2 --verbose none \ |
8 ; RUN: | llvm-mc -triple=i686-none-nacl -x86-asm-syntax=intel -filetype=obj \ | 8 ; RUN: | llvm-mc -triple=i686-none-nacl -x86-asm-syntax=intel -filetype=obj \ |
9 ; RUN: | llvm-objdump -d -symbolize -x86-asm-syntax=intel - | FileCheck %s | 9 ; RUN: | llvm-objdump -d -r -symbolize -x86-asm-syntax=intel - | FileCheck %s |
10 | 10 |
11 ; TODO(jvoung): llvm-objdump doesn't symbolize global symbols well, so we | 11 ; TODO(jvoung): llvm-objdump doesn't symbolize global symbols well, so we |
12 ; have [0] == g32_a, [4] == g32_b, [8] == g32_c. | 12 ; have 0 == g32_a, 4 == g32_b, 8 == g32_c. |
13 ; g32_d is also [0] because it's in the .data section instead of .bss. | 13 ; g32_d is also 0 because it's in the .data section instead of .bss. |
14 | 14 |
15 declare void @llvm.nacl.atomic.fence.all() | 15 declare void @llvm.nacl.atomic.fence.all() |
16 declare i32 @llvm.nacl.atomic.load.i32(i32*, i32) | 16 declare i32 @llvm.nacl.atomic.load.i32(i32*, i32) |
17 declare void @llvm.nacl.atomic.store.i32(i32, i32*, i32) | 17 declare void @llvm.nacl.atomic.store.i32(i32, i32*, i32) |
18 | 18 |
19 @g32_a = internal global [4 x i8] zeroinitializer, align 4 | 19 @g32_a = internal global [4 x i8] zeroinitializer, align 4 |
20 @g32_b = internal global [4 x i8] zeroinitializer, align 4 | 20 @g32_b = internal global [4 x i8] zeroinitializer, align 4 |
21 @g32_c = internal global [4 x i8] zeroinitializer, align 4 | 21 @g32_c = internal global [4 x i8] zeroinitializer, align 4 |
22 @g32_d = internal global [4 x i8] c"\02\00\00\00", align 4 | 22 @g32_d = internal global [4 x i8] c"\02\00\00\00", align 4 |
23 | 23 |
(...skipping 19 matching lines...) |
43 call void @llvm.nacl.atomic.fence.all() | 43 call void @llvm.nacl.atomic.fence.all() |
44 store i32 %l_c2, i32* %p_c, align 1 | 44 store i32 %l_c2, i32* %p_c, align 1 |
45 | 45 |
46 ret i32 %l_c2 | 46 ret i32 %l_c2 |
47 } | 47 } |
48 ; CHECK-LABEL: test_fused_load_add_a | 48 ; CHECK-LABEL: test_fused_load_add_a |
49 ; alloca store | 49 ; alloca store |
50 ; CHECK: mov {{.*}}, esp | 50 ; CHECK: mov {{.*}}, esp |
51 ; CHECK: mov dword ptr {{.*}}, 999 | 51 ; CHECK: mov dword ptr {{.*}}, 999 |
52 ; atomic store (w/ its own mfence) | 52 ; atomic store (w/ its own mfence) |
53 ; CHECK: dword ptr [0] | 53 ; CHECK: mov {{.*}}, 0 |
| 54 ; CHECK-NEXT: R_386_32 |
54 ; The load + add are optimized into one everywhere. | 55 ; The load + add are optimized into one everywhere. |
55 ; CHECK: add {{.*}}, dword ptr | 56 ; CHECK: add {{.*}}, dword ptr |
56 ; CHECK: mov dword ptr | 57 ; CHECK: mov dword ptr |
57 ; CHECK: mfence | 58 ; CHECK: mfence |
58 ; CHECK: dword ptr [4] | 59 ; CHECK: mov {{.*}}, 4 |
| 60 ; CHECK-NEXT: R_386_32 |
59 ; CHECK: add {{.*}}, dword ptr | 61 ; CHECK: add {{.*}}, dword ptr |
60 ; CHECK: mov dword ptr | 62 ; CHECK: mov dword ptr |
61 ; CHECK: dword ptr [8] | 63 ; CHECK: mov {{.*}}, 8 |
| 64 ; CHECK-NEXT: R_386_32 |
62 ; CHECK: add {{.*}}, dword ptr | 65 ; CHECK: add {{.*}}, dword ptr |
63 ; CHECK: mfence | 66 ; CHECK: mfence |
64 ; CHECK: mov dword ptr | 67 ; CHECK: mov dword ptr |
65 | 68 |
66 ; Test with the fence moved up a bit. | 69 ; Test with the fence moved up a bit. |
67 define i32 @test_fused_load_add_b() { | 70 define i32 @test_fused_load_add_b() { |
68 entry: | 71 entry: |
69 %p_alloca = alloca i8, i32 4, align 4 | 72 %p_alloca = alloca i8, i32 4, align 4 |
70 %p_alloca_bc = bitcast i8* %p_alloca to i32* | 73 %p_alloca_bc = bitcast i8* %p_alloca to i32* |
71 store i32 999, i32* %p_alloca_bc, align 1 | 74 store i32 999, i32* %p_alloca_bc, align 1 |
(...skipping 14 matching lines...) |
86 %l_c2 = add i32 %l_c, 1 | 89 %l_c2 = add i32 %l_c, 1 |
87 store i32 %l_c2, i32* %p_c, align 1 | 90 store i32 %l_c2, i32* %p_c, align 1 |
88 | 91 |
89 ret i32 %l_c2 | 92 ret i32 %l_c2 |
90 } | 93 } |
91 ; CHECK-LABEL: test_fused_load_add_b | 94 ; CHECK-LABEL: test_fused_load_add_b |
92 ; alloca store | 95 ; alloca store |
93 ; CHECK: mov {{.*}}, esp | 96 ; CHECK: mov {{.*}}, esp |
94 ; CHECK: mov dword ptr {{.*}}, 999 | 97 ; CHECK: mov dword ptr {{.*}}, 999 |
95 ; atomic store (w/ its own mfence) | 98 ; atomic store (w/ its own mfence) |
96 ; CHECK: dword ptr [0] | 99 ; CHECK: mov {{.*}}, 0 |
| 100 ; CHECK-NEXT: R_386_32 |
97 ; CHECK: add {{.*}}, dword ptr | 101 ; CHECK: add {{.*}}, dword ptr |
98 ; CHECK: mov dword ptr | 102 ; CHECK: mov dword ptr |
99 ; CHECK: mfence | 103 ; CHECK: mfence |
100 ; CHECK: dword ptr [4] | 104 ; CHECK: mov {{.*}}, 4 |
| 105 ; CHECK-NEXT: R_386_32 |
101 ; CHECK: add {{.*}}, dword ptr | 106 ; CHECK: add {{.*}}, dword ptr |
102 ; CHECK: mov dword ptr | 107 ; CHECK: mov dword ptr |
103 ; CHECK: dword ptr [8] | 108 ; CHECK: mov {{.*}}, 8 |
| 109 ; CHECK-NEXT: R_386_32 |
104 ; CHECK: mfence | 110 ; CHECK: mfence |
105 ; Load + add can still be optimized into one instruction | 111 ; Load + add can still be optimized into one instruction |
106 ; because it is not separated by a fence. | 112 ; because it is not separated by a fence. |
107 ; CHECK: add {{.*}}, dword ptr | 113 ; CHECK: add {{.*}}, dword ptr |
108 ; CHECK: mov dword ptr | 114 ; CHECK: mov dword ptr |
109 | 115 |
110 ; Test with the fence splitting a load/add. | 116 ; Test with the fence splitting a load/add. |
111 define i32 @test_fused_load_add_c() { | 117 define i32 @test_fused_load_add_c() { |
112 entry: | 118 entry: |
113 %p_alloca = alloca i8, i32 4, align 4 | 119 %p_alloca = alloca i8, i32 4, align 4 |
(...skipping 16 matching lines...) |
130 %l_c2 = add i32 %l_c, 1 | 136 %l_c2 = add i32 %l_c, 1 |
131 store i32 %l_c2, i32* %p_c, align 1 | 137 store i32 %l_c2, i32* %p_c, align 1 |
132 | 138 |
133 ret i32 %l_c2 | 139 ret i32 %l_c2 |
134 } | 140 } |
135 ; CHECK-LABEL: test_fused_load_add_c | 141 ; CHECK-LABEL: test_fused_load_add_c |
136 ; alloca store | 142 ; alloca store |
137 ; CHECK: mov {{.*}}, esp | 143 ; CHECK: mov {{.*}}, esp |
138 ; CHECK: mov dword ptr {{.*}}, 999 | 144 ; CHECK: mov dword ptr {{.*}}, 999 |
139 ; atomic store (w/ its own mfence) | 145 ; atomic store (w/ its own mfence) |
140 ; CHECK: dword ptr [0] | 146 ; CHECK: mov {{.*}}, 0 |
| 147 ; CHECK-NEXT: R_386_32 |
141 ; CHECK: add {{.*}}, dword ptr | 148 ; CHECK: add {{.*}}, dword ptr |
142 ; CHECK: mov dword ptr | 149 ; CHECK: mov dword ptr |
143 ; CHECK: mfence | 150 ; CHECK: mfence |
144 ; CHECK: dword ptr [4] | 151 ; CHECK: mov {{.*}}, 4 |
| 152 ; CHECK-NEXT: R_386_32 |
145 ; This load + add is no longer optimized into one, | 153 ; This load + add is no longer optimized into one, |
146 ; though perhaps it should be legal as long as | 154 ; though perhaps it should be legal as long as |
147 ; the load stays on the same side of the fence. | 155 ; the load stays on the same side of the fence. |
148 ; CHECK: mov {{.*}}, dword ptr | 156 ; CHECK: mov {{.*}}, dword ptr |
149 ; CHECK: mfence | 157 ; CHECK: mfence |
150 ; CHECK: add {{.*}}, 1 | 158 ; CHECK: add {{.*}}, 1 |
151 ; CHECK: mov dword ptr | 159 ; CHECK: mov dword ptr |
152 ; CHECK: dword ptr [8] | 160 ; CHECK: mov {{.*}}, 8 |
| 161 ; CHECK-NEXT: R_386_32 |
153 ; CHECK: add {{.*}}, dword ptr | 162 ; CHECK: add {{.*}}, dword ptr |
154 ; CHECK: mov dword ptr | 163 ; CHECK: mov dword ptr |
155 | 164 |
156 | 165 |
157 ; Test where a bunch of i8 loads could have been fused into one | 166 ; Test where a bunch of i8 loads could have been fused into one |
158 ; i32 load, but a fence blocks that. | 167 ; i32 load, but a fence blocks that. |
159 define i32 @could_have_fused_loads() { | 168 define i32 @could_have_fused_loads() { |
160 entry: | 169 entry: |
161 %ptr1 = bitcast [4 x i8]* @g32_d to i8* | 170 %ptr1 = bitcast [4 x i8]* @g32_d to i8* |
162 %b1 = load i8* %ptr1 | 171 %b1 = load i8* %ptr1 |
(...skipping 19 matching lines...) |
182 %b12 = or i32 %b1.ext, %b2.shift | 191 %b12 = or i32 %b1.ext, %b2.shift |
183 %b3.ext = zext i8 %b3 to i32 | 192 %b3.ext = zext i8 %b3 to i32 |
184 %b3.shift = shl i32 %b3.ext, 16 | 193 %b3.shift = shl i32 %b3.ext, 16 |
185 %b123 = or i32 %b12, %b3.shift | 194 %b123 = or i32 %b12, %b3.shift |
186 %b4.ext = zext i8 %b4 to i32 | 195 %b4.ext = zext i8 %b4 to i32 |
187 %b4.shift = shl i32 %b4.ext, 24 | 196 %b4.shift = shl i32 %b4.ext, 24 |
188 %b1234 = or i32 %b123, %b4.shift | 197 %b1234 = or i32 %b123, %b4.shift |
189 ret i32 %b1234 | 198 ret i32 %b1234 |
190 } | 199 } |
191 ; CHECK-LABEL: could_have_fused_loads | 200 ; CHECK-LABEL: could_have_fused_loads |
192 ; CHECK: dword ptr [0] | 201 ; CHECK: mov {{.*}}, 0 |
| 202 ; CHECK-NEXT: R_386_32 |
193 ; CHECK: mov {{.*}}, byte ptr | 203 ; CHECK: mov {{.*}}, byte ptr |
194 ; CHECK: mov {{.*}}, byte ptr | 204 ; CHECK: mov {{.*}}, byte ptr |
195 ; CHECK: mov {{.*}}, byte ptr | 205 ; CHECK: mov {{.*}}, byte ptr |
196 ; CHECK: mfence | 206 ; CHECK: mfence |
197 ; CHECK: mov {{.*}}, byte ptr | 207 ; CHECK: mov {{.*}}, byte ptr |
198 | 208 |
199 | 209 |
200 ; Test where an identical load from two branches could have been hoisted | 210 ; Test where an identical load from two branches could have been hoisted |
201 ; up, and then the code merged, but a fence prevents it. | 211 ; up, and then the code merged, but a fence prevents it. |
202 define i32 @could_have_hoisted_loads(i32 %x) { | 212 define i32 @could_have_hoisted_loads(i32 %x) { |
203 entry: | 213 entry: |
204 %ptr = bitcast [4 x i8]* @g32_d to i32* | 214 %ptr = bitcast [4 x i8]* @g32_d to i32* |
205 %cmp = icmp eq i32 %x, 1 | 215 %cmp = icmp eq i32 %x, 1 |
206 br i1 %cmp, label %branch1, label %branch2 | 216 br i1 %cmp, label %branch1, label %branch2 |
207 branch1: | 217 branch1: |
208 %y = load i32* %ptr | 218 %y = load i32* %ptr |
209 ret i32 %y | 219 ret i32 %y |
210 branch2: | 220 branch2: |
211 call void @llvm.nacl.atomic.fence.all() | 221 call void @llvm.nacl.atomic.fence.all() |
212 %z = load i32* %ptr | 222 %z = load i32* %ptr |
213 ret i32 %z | 223 ret i32 %z |
214 } | 224 } |
215 ; CHECK-LABEL: could_have_hoisted_loads | 225 ; CHECK-LABEL: could_have_hoisted_loads |
216 ; CHECK: dword ptr [0] | 226 ; CHECK: mov {{.*}}, 0 |
| 227 ; CHECK-NEXT: R_386_32 |
217 ; CHECK: jne {{.*}} | 228 ; CHECK: jne {{.*}} |
218 ; CHECK: mov {{.*}}, dword ptr | 229 ; CHECK: mov {{.*}}, dword ptr |
219 ; CHECK: ret | 230 ; CHECK: ret |
220 ; CHECK: mfence | 231 ; CHECK: mfence |
221 ; CHECK: mov {{.*}}, dword ptr | 232 ; CHECK: mov {{.*}}, dword ptr |
222 ; CHECK: ret | 233 ; CHECK: ret |
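
; Note on the updated checks: they assume llvm-objdump's -r flag interleaves
; relocation records with the disassembly, so the instruction that materializes
; a global's address (matched by "mov {{.*}}, 0", "mov {{.*}}, 4", etc.) is
; followed on the next output line by its R_386_32 entry. A minimal sketch of
; the kind of output this is matching against (the register, offsets, encodings,
; and symbol column below are illustrative assumptions, not copied from an
; actual llvm-objdump run; the exact layout varies by llvm-objdump version):
;
;        8: b8 00 00 00 00                        mov eax, 0
;              00000009:  R_386_32 .bss
;        d: 8b 00                                 mov eax, dword ptr [eax]
;
; Because g32_a, g32_b, and g32_c are internal and symbolization falls back to
; section offsets, only the immediate (0, 4, or 8) plus the relocation line
; distinguishes which global each access targets, which is what the updated
; patterns key on.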