OLD | NEW |
1 ; Test that loads/stores don't move across a nacl.atomic.fence.all. | 1 ; Test that loads/stores don't move across a nacl.atomic.fence.all. |
2 ; This should apply to both atomic and non-atomic loads/stores | 2 ; This should apply to both atomic and non-atomic loads/stores |
3 ; (unlike the non-"all" variety of nacl.atomic.fence, which only | 3 ; (unlike the non-"all" variety of nacl.atomic.fence, which only |
4 ; applies to atomic load/stores). | 4 ; applies to atomic load/stores). |
5 ; | 5 ; |
6 ; TODO(kschimpf) Find out why lc2i is needed. | 6 ; RUN: %p2i -i %s --args -O2 --verbose none \ |
7 ; RUN: %lc2i -i %s --args -O2 --verbose none \ | |
8 ; RUN: | llvm-mc -triple=i686-none-nacl -x86-asm-syntax=intel -filetype=obj \ | 7 ; RUN: | llvm-mc -triple=i686-none-nacl -x86-asm-syntax=intel -filetype=obj \ |
9 ; RUN: | llvm-objdump -d -r -symbolize -x86-asm-syntax=intel - | FileCheck %s | 8 ; RUN: | llvm-objdump -d -r -symbolize -x86-asm-syntax=intel - | FileCheck %s |
10 | 9 |
11 ; TODO(jvoung): llvm-objdump doesn't symbolize global symbols well, so we | 10 ; TODO(jvoung): llvm-objdump doesn't symbolize global symbols well, so we |
12 ; have 0 == g32_a, 4 == g32_b, 8 == g32_c. | 11 ; have 0 == g32_a, 4 == g32_b, 8 == g32_c, 12 == g32_d |
13 ; g32_d is also 0 because it's in the .data section instead of .bss. | |
14 | 12 |
15 declare void @llvm.nacl.atomic.fence.all() | 13 declare void @llvm.nacl.atomic.fence.all() |
16 declare i32 @llvm.nacl.atomic.load.i32(i32*, i32) | 14 declare i32 @llvm.nacl.atomic.load.i32(i32*, i32) |
17 declare void @llvm.nacl.atomic.store.i32(i32, i32*, i32) | 15 declare void @llvm.nacl.atomic.store.i32(i32, i32*, i32) |
18 | 16 |
19 @g32_a = internal global [4 x i8] zeroinitializer, align 4 | 17 @g32_a = internal global [4 x i8] zeroinitializer, align 4 |
20 @g32_b = internal global [4 x i8] zeroinitializer, align 4 | 18 @g32_b = internal global [4 x i8] zeroinitializer, align 4 |
21 @g32_c = internal global [4 x i8] zeroinitializer, align 4 | 19 @g32_c = internal global [4 x i8] zeroinitializer, align 4 |
22 @g32_d = internal global [4 x i8] c"\02\00\00\00", align 4 | 20 @g32_d = internal global [4 x i8] zeroinitializer, align 4 |
23 | 21 |
24 define i32 @test_fused_load_add_a() { | 22 define i32 @test_fused_load_add_a() { |
25 entry: | 23 entry: |
26 %p_alloca = alloca i8, i32 4, align 4 | 24 %p_alloca = alloca i8, i32 4, align 4 |
27 %p_alloca_bc = bitcast i8* %p_alloca to i32* | 25 %p_alloca_bc = bitcast i8* %p_alloca to i32* |
28 store i32 999, i32* %p_alloca_bc, align 1 | 26 store i32 999, i32* %p_alloca_bc, align 1 |
29 | 27 |
30 %p_a = bitcast [4 x i8]* @g32_a to i32* | 28 %p_a = bitcast [4 x i8]* @g32_a to i32* |
31 %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6) | 29 %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6) |
32 %l_a2 = add i32 %l_a, 1 | 30 %l_a2 = add i32 %l_a, 1 |
33 call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6) | 31 call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6) |
34 | 32 |
35 %p_b = bitcast [4 x i8]* @g32_b to i32* | 33 %p_b = bitcast [4 x i8]* @g32_b to i32* |
36 %l_b = load i32* %p_b | 34 %l_b = load i32* %p_b, align 1 |
37 %l_b2 = add i32 %l_b, 1 | 35 %l_b2 = add i32 %l_b, 1 |
38 store i32 %l_b2, i32* %p_b, align 1 | 36 store i32 %l_b2, i32* %p_b, align 1 |
39 | 37 |
40 %p_c = bitcast [4 x i8]* @g32_c to i32* | 38 %p_c = bitcast [4 x i8]* @g32_c to i32* |
41 %l_c = load i32* %p_c | 39 %l_c = load i32* %p_c, align 1 |
42 %l_c2 = add i32 %l_c, 1 | 40 %l_c2 = add i32 %l_c, 1 |
43 call void @llvm.nacl.atomic.fence.all() | 41 call void @llvm.nacl.atomic.fence.all() |
44 store i32 %l_c2, i32* %p_c, align 1 | 42 store i32 %l_c2, i32* %p_c, align 1 |
45 | 43 |
46 ret i32 %l_c2 | 44 ret i32 %l_c2 |
47 } | 45 } |
48 ; CHECK-LABEL: test_fused_load_add_a | 46 ; CHECK-LABEL: test_fused_load_add_a |
49 ; alloca store | 47 ; alloca store |
50 ; CHECK: mov {{.*}}, esp | 48 ; CHECK: mov {{.*}}, esp |
51 ; CHECK: mov dword ptr {{.*}}, 999 | 49 ; CHECK: mov dword ptr {{.*}}, 999 |
52 ; atomic store (w/ its own mfence) | 50 ; atomic store (w/ its own mfence) |
53 ; CHECK: mov {{.*}}, 0 | 51 ; The load + add are optimized into one everywhere. |
| 52 ; CHECK: add {{.*}}, dword ptr [0] |
54 ; CHECK-NEXT: R_386_32 | 53 ; CHECK-NEXT: R_386_32 |
55 ; The load + add are optimized into one everywhere. | |
56 ; CHECK: add {{.*}}, dword ptr | |
57 ; CHECK: mov dword ptr | 54 ; CHECK: mov dword ptr |
58 ; CHECK: mfence | 55 ; CHECK: mfence |
59 ; CHECK: mov {{.*}}, 4 | 56 ; CHECK: add {{.*}}, dword ptr [4] |
60 ; CHECK-NEXT: R_386_32 | 57 ; CHECK-NEXT: R_386_32 |
61 ; CHECK: add {{.*}}, dword ptr | |
62 ; CHECK: mov dword ptr | 58 ; CHECK: mov dword ptr |
63 ; CHECK: mov {{.*}}, 8 | 59 ; CHECK: add {{.*}}, dword ptr [8] |
64 ; CHECK-NEXT: R_386_32 | 60 ; CHECK-NEXT: R_386_32 |
65 ; CHECK: add {{.*}}, dword ptr | |
66 ; CHECK: mfence | 61 ; CHECK: mfence |
67 ; CHECK: mov dword ptr | 62 ; CHECK: mov dword ptr |
68 | 63 |
69 ; Test with the fence moved up a bit. | 64 ; Test with the fence moved up a bit. |
70 define i32 @test_fused_load_add_b() { | 65 define i32 @test_fused_load_add_b() { |
71 entry: | 66 entry: |
72 %p_alloca = alloca i8, i32 4, align 4 | 67 %p_alloca = alloca i8, i32 4, align 4 |
73 %p_alloca_bc = bitcast i8* %p_alloca to i32* | 68 %p_alloca_bc = bitcast i8* %p_alloca to i32* |
74 store i32 999, i32* %p_alloca_bc, align 1 | 69 store i32 999, i32* %p_alloca_bc, align 1 |
75 | 70 |
76 %p_a = bitcast [4 x i8]* @g32_a to i32* | 71 %p_a = bitcast [4 x i8]* @g32_a to i32* |
77 %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6) | 72 %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6) |
78 %l_a2 = add i32 %l_a, 1 | 73 %l_a2 = add i32 %l_a, 1 |
79 call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6) | 74 call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6) |
80 | 75 |
81 %p_b = bitcast [4 x i8]* @g32_b to i32* | 76 %p_b = bitcast [4 x i8]* @g32_b to i32* |
82 %l_b = load i32* %p_b | 77 %l_b = load i32* %p_b, align 1 |
83 %l_b2 = add i32 %l_b, 1 | 78 %l_b2 = add i32 %l_b, 1 |
84 store i32 %l_b2, i32* %p_b, align 1 | 79 store i32 %l_b2, i32* %p_b, align 1 |
85 | 80 |
86 %p_c = bitcast [4 x i8]* @g32_c to i32* | 81 %p_c = bitcast [4 x i8]* @g32_c to i32* |
87 call void @llvm.nacl.atomic.fence.all() | 82 call void @llvm.nacl.atomic.fence.all() |
88 %l_c = load i32* %p_c | 83 %l_c = load i32* %p_c, align 1 |
89 %l_c2 = add i32 %l_c, 1 | 84 %l_c2 = add i32 %l_c, 1 |
90 store i32 %l_c2, i32* %p_c, align 1 | 85 store i32 %l_c2, i32* %p_c, align 1 |
91 | 86 |
92 ret i32 %l_c2 | 87 ret i32 %l_c2 |
93 } | 88 } |
94 ; CHECK-LABEL: test_fused_load_add_b | 89 ; CHECK-LABEL: test_fused_load_add_b |
95 ; alloca store | 90 ; alloca store |
96 ; CHECK: mov {{.*}}, esp | 91 ; CHECK: mov {{.*}}, esp |
97 ; CHECK: mov dword ptr {{.*}}, 999 | 92 ; CHECK: mov dword ptr {{.*}}, 999 |
98 ; atomic store (w/ its own mfence) | 93 ; atomic store (w/ its own mfence) |
99 ; CHECK: mov {{.*}}, 0 | 94 ; CHECK: add {{.*}}, dword ptr [0] |
100 ; CHECK-NEXT: R_386_32 | 95 ; CHECK-NEXT: R_386_32 |
101 ; CHECK: add {{.*}}, dword ptr | |
102 ; CHECK: mov dword ptr | 96 ; CHECK: mov dword ptr |
103 ; CHECK: mfence | 97 ; CHECK: mfence |
104 ; CHECK: mov {{.*}}, 4 | 98 ; CHECK: add {{.*}}, dword ptr [4] |
105 ; CHECK-NEXT: R_386_32 | 99 ; CHECK-NEXT: R_386_32 |
106 ; CHECK: add {{.*}}, dword ptr | |
107 ; CHECK: mov dword ptr | 100 ; CHECK: mov dword ptr |
108 ; CHECK: mov {{.*}}, 8 | |
109 ; CHECK-NEXT: R_386_32 | |
110 ; CHECK: mfence | 101 ; CHECK: mfence |
111 ; Load + add can still be optimized into one instruction | 102 ; Load + add can still be optimized into one instruction |
112 ; because it is not separated by a fence. | 103 ; because it is not separated by a fence. |
113 ; CHECK: add {{.*}}, dword ptr | 104 ; CHECK: add {{.*}}, dword ptr [8] |
| 105 ; CHECK-NEXT: R_386_32 |
114 ; CHECK: mov dword ptr | 106 ; CHECK: mov dword ptr |
115 | 107 |
116 ; Test with the fence splitting a load/add. | 108 ; Test with the fence splitting a load/add. |
117 define i32 @test_fused_load_add_c() { | 109 define i32 @test_fused_load_add_c() { |
118 entry: | 110 entry: |
119 %p_alloca = alloca i8, i32 4, align 4 | 111 %p_alloca = alloca i8, i32 4, align 4 |
120 %p_alloca_bc = bitcast i8* %p_alloca to i32* | 112 %p_alloca_bc = bitcast i8* %p_alloca to i32* |
121 store i32 999, i32* %p_alloca_bc, align 1 | 113 store i32 999, i32* %p_alloca_bc, align 1 |
122 | 114 |
123 %p_a = bitcast [4 x i8]* @g32_a to i32* | 115 %p_a = bitcast [4 x i8]* @g32_a to i32* |
124 %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6) | 116 %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6) |
125 %l_a2 = add i32 %l_a, 1 | 117 %l_a2 = add i32 %l_a, 1 |
126 call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6) | 118 call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6) |
127 | 119 |
128 %p_b = bitcast [4 x i8]* @g32_b to i32* | 120 %p_b = bitcast [4 x i8]* @g32_b to i32* |
129 %l_b = load i32* %p_b | 121 %l_b = load i32* %p_b, align 1 |
130 call void @llvm.nacl.atomic.fence.all() | 122 call void @llvm.nacl.atomic.fence.all() |
131 %l_b2 = add i32 %l_b, 1 | 123 %l_b2 = add i32 %l_b, 1 |
132 store i32 %l_b2, i32* %p_b, align 1 | 124 store i32 %l_b2, i32* %p_b, align 1 |
133 | 125 |
134 %p_c = bitcast [4 x i8]* @g32_c to i32* | 126 %p_c = bitcast [4 x i8]* @g32_c to i32* |
135 %l_c = load i32* %p_c | 127 %l_c = load i32* %p_c, align 1 |
136 %l_c2 = add i32 %l_c, 1 | 128 %l_c2 = add i32 %l_c, 1 |
137 store i32 %l_c2, i32* %p_c, align 1 | 129 store i32 %l_c2, i32* %p_c, align 1 |
138 | 130 |
139 ret i32 %l_c2 | 131 ret i32 %l_c2 |
140 } | 132 } |
141 ; CHECK-LABEL: test_fused_load_add_c | 133 ; CHECK-LABEL: test_fused_load_add_c |
142 ; alloca store | 134 ; alloca store |
143 ; CHECK: mov {{.*}}, esp | 135 ; CHECK: mov {{.*}}, esp |
144 ; CHECK: mov dword ptr {{.*}}, 999 | 136 ; CHECK: mov dword ptr {{.*}}, 999 |
145 ; atomic store (w/ its own mfence) | 137 ; atomic store (w/ its own mfence) |
146 ; CHECK: mov {{.*}}, 0 | 138 ; CHECK: add {{.*}}, dword ptr [0] |
147 ; CHECK-NEXT: R_386_32 | 139 ; CHECK-NEXT: R_386_32 |
148 ; CHECK: add {{.*}}, dword ptr | |
149 ; CHECK: mov dword ptr | 140 ; CHECK: mov dword ptr |
150 ; CHECK: mfence | 141 ; CHECK: mfence |
151 ; CHECK: mov {{.*}}, 4 | |
152 ; CHECK-NEXT: R_386_32 | |
153 ; This load + add are no longer optimized into one, | 142 ; This load + add are no longer optimized into one, |
154 ; though perhaps it should be legal as long as | 143 ; though perhaps it should be legal as long as |
155 ; the load stays on the same side of the fence. | 144 ; the load stays on the same side of the fence. |
156 ; CHECK: mov {{.*}}, dword ptr | 145 ; CHECK: mov {{.*}}, dword ptr [4] |
| 146 ; CHECK-NEXT: R_386_32 |
157 ; CHECK: mfence | 147 ; CHECK: mfence |
158 ; CHECK: add {{.*}}, 1 | 148 ; CHECK: add {{.*}}, 1 |
159 ; CHECK: mov dword ptr | 149 ; CHECK: mov dword ptr |
160 ; CHECK: mov {{.*}}, 8 | 150 ; CHECK: add {{.*}}, dword ptr [8] |
161 ; CHECK-NEXT: R_386_32 | 151 ; CHECK-NEXT: R_386_32 |
162 ; CHECK: add {{.*}}, dword ptr | |
163 ; CHECK: mov dword ptr | 152 ; CHECK: mov dword ptr |
164 | 153 |
165 | 154 |
166 ; Test where a bunch of i8 loads could have been fused into one | 155 ; Test where a bunch of i8 loads could have been fused into one |
167 ; i32 load, but a fence blocks that. | 156 ; i32 load, but a fence blocks that. |
168 define i32 @could_have_fused_loads() { | 157 define i32 @could_have_fused_loads() { |
169 entry: | 158 entry: |
170 %ptr1 = bitcast [4 x i8]* @g32_d to i8* | 159 %ptr1 = bitcast [4 x i8]* @g32_d to i8* |
171 %b1 = load i8* %ptr1 | 160 %b1 = load i8* %ptr1, align 1 |
172 | 161 |
173 %int_ptr2 = ptrtoint [4 x i8]* @g32_d to i32 | 162 %int_ptr2 = ptrtoint [4 x i8]* @g32_d to i32 |
174 %int_ptr_bump2 = add i32 %int_ptr2, 1 | 163 %int_ptr_bump2 = add i32 %int_ptr2, 1 |
175 %ptr2 = inttoptr i32 %int_ptr_bump2 to i8* | 164 %ptr2 = inttoptr i32 %int_ptr_bump2 to i8* |
176 %b2 = load i8* %ptr2 | 165 %b2 = load i8* %ptr2, align 1 |
177 | 166 |
178 %int_ptr_bump3 = add i32 %int_ptr2, 2 | 167 %int_ptr_bump3 = add i32 %int_ptr2, 2 |
179 %ptr3 = inttoptr i32 %int_ptr_bump3 to i8* | 168 %ptr3 = inttoptr i32 %int_ptr_bump3 to i8* |
180 %b3 = load i8* %ptr3 | 169 %b3 = load i8* %ptr3, align 1 |
181 | 170 |
182 call void @llvm.nacl.atomic.fence.all() | 171 call void @llvm.nacl.atomic.fence.all() |
183 | 172 |
184 %int_ptr_bump4 = add i32 %int_ptr2, 3 | 173 %int_ptr_bump4 = add i32 %int_ptr2, 3 |
185 %ptr4 = inttoptr i32 %int_ptr_bump4 to i8* | 174 %ptr4 = inttoptr i32 %int_ptr_bump4 to i8* |
186 %b4 = load i8* %ptr4 | 175 %b4 = load i8* %ptr4, align 1 |
187 | 176 |
188 %b1.ext = zext i8 %b1 to i32 | 177 %b1.ext = zext i8 %b1 to i32 |
189 %b2.ext = zext i8 %b2 to i32 | 178 %b2.ext = zext i8 %b2 to i32 |
190 %b2.shift = shl i32 %b2.ext, 8 | 179 %b2.shift = shl i32 %b2.ext, 8 |
191 %b12 = or i32 %b1.ext, %b2.shift | 180 %b12 = or i32 %b1.ext, %b2.shift |
192 %b3.ext = zext i8 %b3 to i32 | 181 %b3.ext = zext i8 %b3 to i32 |
193 %b3.shift = shl i32 %b3.ext, 16 | 182 %b3.shift = shl i32 %b3.ext, 16 |
194 %b123 = or i32 %b12, %b3.shift | 183 %b123 = or i32 %b12, %b3.shift |
195 %b4.ext = zext i8 %b4 to i32 | 184 %b4.ext = zext i8 %b4 to i32 |
196 %b4.shift = shl i32 %b4.ext, 24 | 185 %b4.shift = shl i32 %b4.ext, 24 |
197 %b1234 = or i32 %b123, %b4.shift | 186 %b1234 = or i32 %b123, %b4.shift |
198 ret i32 %b1234 | 187 ret i32 %b1234 |
199 } | 188 } |
200 ; CHECK-LABEL: could_have_fused_loads | 189 ; CHECK-LABEL: could_have_fused_loads |
201 ; CHECK: mov {{.*}}, 0 | 190 ; CHECK: mov {{.*}}, byte ptr [12] |
202 ; CHECK-NEXT: R_386_32 | 191 ; CHECK-NEXT: R_386_32 |
203 ; CHECK: mov {{.*}}, byte ptr | 192 ; CHECK: mov {{.*}}, byte ptr |
204 ; CHECK: mov {{.*}}, byte ptr | 193 ; CHECK: mov {{.*}}, byte ptr |
205 ; CHECK: mov {{.*}}, byte ptr | |
206 ; CHECK: mfence | 194 ; CHECK: mfence |
207 ; CHECK: mov {{.*}}, byte ptr | 195 ; CHECK: mov {{.*}}, byte ptr |
208 | 196 |
209 | 197 |
210 ; Test where an identical load from two branches could have been hoisted | 198 ; Test where an identical load from two branches could have been hoisted |
211 ; up, and then the code merged, but a fence prevents it. | 199 ; up, and then the code merged, but a fence prevents it. |
212 define i32 @could_have_hoisted_loads(i32 %x) { | 200 define i32 @could_have_hoisted_loads(i32 %x) { |
213 entry: | 201 entry: |
214 %ptr = bitcast [4 x i8]* @g32_d to i32* | 202 %ptr = bitcast [4 x i8]* @g32_d to i32* |
215 %cmp = icmp eq i32 %x, 1 | 203 %cmp = icmp eq i32 %x, 1 |
216 br i1 %cmp, label %branch1, label %branch2 | 204 br i1 %cmp, label %branch1, label %branch2 |
217 branch1: | 205 branch1: |
218 %y = load i32* %ptr | 206 %y = load i32* %ptr, align 1 |
219 ret i32 %y | 207 ret i32 %y |
220 branch2: | 208 branch2: |
221 call void @llvm.nacl.atomic.fence.all() | 209 call void @llvm.nacl.atomic.fence.all() |
222 %z = load i32* %ptr | 210 %z = load i32* %ptr, align 1 |
223 ret i32 %z | 211 ret i32 %z |
224 } | 212 } |
225 ; CHECK-LABEL: could_have_hoisted_loads | 213 ; CHECK-LABEL: could_have_hoisted_loads |
226 ; CHECK: mov {{.*}}, 0 | 214 ; CHECK: jne {{.*}} |
| 215 ; CHECK: mov {{.*}}, dword ptr [12] |
227 ; CHECK-NEXT: R_386_32 | 216 ; CHECK-NEXT: R_386_32 |
228 ; CHECK: jne {{.*}} | |
229 ; CHECK: mov {{.*}}, dword ptr | |
230 ; CHECK: ret | 217 ; CHECK: ret |
231 ; CHECK: mfence | 218 ; CHECK: mfence |
232 ; CHECK: mov {{.*}}, dword ptr | 219 ; CHECK: mov {{.*}}, dword ptr [12] |
| 220 ; CHECK-NEXT: R_386_32 |
233 ; CHECK: ret | 221 ; CHECK: ret |
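
The header comment of this test distinguishes @llvm.nacl.atomic.fence.all() from the plain @llvm.nacl.atomic.fence intrinsic, which only orders atomic accesses. For contrast, below is a minimal hypothetical sketch (not part of the change above) of what the non-"all" fence looks like in the same style of IR. It assumes the PNaCl declaration "declare void @llvm.nacl.atomic.fence(i32)" and reuses the ordering constant 6 (seq_cst) that the test already passes to the atomic load/store intrinsics; the function name is made up for illustration. Because this fence constrains only atomic operations, a backend is in principle still free to fuse or move the ordinary load/store around it, which is exactly what the fence.all checks above forbid.

; Hypothetical sketch: plain (non-"all") fence, ordering atomic accesses only.
declare void @llvm.nacl.atomic.fence(i32)

define i32 @sketch_plain_fence(i32* %p) {
entry:
  %v = load i32* %p, align 1                ; non-atomic load
  call void @llvm.nacl.atomic.fence(i32 6)  ; seq_cst fence over atomics only
  %v2 = add i32 %v, 1
  store i32 %v2, i32* %p, align 1           ; non-atomic store; may still be fused/moved across
  ret i32 %v2
}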