OLD | NEW |
1 ; Test that loads/stores don't move across a nacl.atomic.fence.all. | 1 ; Test that loads/stores don't move across a nacl.atomic.fence.all. |
2 ; This should apply to both atomic and non-atomic loads/stores | 2 ; This should apply to both atomic and non-atomic loads/stores |
3 ; (unlike the non-"all" variety of nacl.atomic.fence, which only | 3 ; (unlike the non-"all" variety of nacl.atomic.fence, which only |
4 ; applies to atomic loads/stores). | 4 ; applies to atomic loads/stores). |
5 ; | 5 ; |
6 ; RUN: %p2i -i %s --args -O2 --verbose none \ | 6 ; RUN: %p2i -i %s --assemble --disassemble --args -O2 --verbose none \ |
7 ; RUN: | llvm-mc -triple=i686-none-nacl -filetype=obj \ | 7 ; RUN: | FileCheck %s |
8 ; RUN: | llvm-objdump -d -r -symbolize -x86-asm-syntax=intel - | FileCheck %s | |
9 | |
10 ; TODO(jvoung): llvm-objdump doesn't symbolize global symbols well, so we | |
11 ; have 0 == g32_a, 4 == g32_b, 8 == g32_c, 12 == g32_d | |
12 | 8 |
13 declare void @llvm.nacl.atomic.fence.all() | 9 declare void @llvm.nacl.atomic.fence.all() |
14 declare i32 @llvm.nacl.atomic.load.i32(i32*, i32) | 10 declare i32 @llvm.nacl.atomic.load.i32(i32*, i32) |
15 declare void @llvm.nacl.atomic.store.i32(i32, i32*, i32) | 11 declare void @llvm.nacl.atomic.store.i32(i32, i32*, i32) |
16 | 12 |
17 @g32_a = internal global [4 x i8] zeroinitializer, align 4 | 13 @g32_a = internal global [4 x i8] zeroinitializer, align 4 |
18 @g32_b = internal global [4 x i8] zeroinitializer, align 4 | 14 @g32_b = internal global [4 x i8] zeroinitializer, align 4 |
19 @g32_c = internal global [4 x i8] zeroinitializer, align 4 | 15 @g32_c = internal global [4 x i8] zeroinitializer, align 4 |
20 @g32_d = internal global [4 x i8] zeroinitializer, align 4 | 16 @g32_d = internal global [4 x i8] zeroinitializer, align 4 |
21 | 17 |
(...skipping 16 matching lines...) |
38 %p_c = bitcast [4 x i8]* @g32_c to i32* | 34 %p_c = bitcast [4 x i8]* @g32_c to i32* |
39 %l_c = load i32* %p_c, align 1 | 35 %l_c = load i32* %p_c, align 1 |
40 %l_c2 = add i32 %l_c, 1 | 36 %l_c2 = add i32 %l_c, 1 |
41 call void @llvm.nacl.atomic.fence.all() | 37 call void @llvm.nacl.atomic.fence.all() |
42 store i32 %l_c2, i32* %p_c, align 1 | 38 store i32 %l_c2, i32* %p_c, align 1 |
43 | 39 |
44 ret i32 %l_c2 | 40 ret i32 %l_c2 |
45 } | 41 } |
46 ; CHECK-LABEL: test_fused_load_add_a | 42 ; CHECK-LABEL: test_fused_load_add_a |
47 ; alloca store | 43 ; alloca store |
48 ; CHECK: mov {{.*}}, esp | 44 ; CHECK: mov {{.*}},esp |
49 ; CHECK: mov dword ptr {{.*}}, 999 | 45 ; CHECK: mov DWORD PTR {{.*}},0x3e7 |
50 ; atomic store (w/ its own mfence) | 46 ; atomic store (w/ its own mfence) |
51 ; The load + add are optimized into one instruction everywhere. | 47 ; The load + add are optimized into one instruction everywhere. |
52 ; CHECK: add {{.*}}, dword ptr [.bss] | 48 ; CHECK: add {{.*}},DWORD PTR {{.*}}.bss |
53 ; CHECK-NEXT: R_386_32 | 49 ; CHECK: mov DWORD PTR |
54 ; CHECK: mov dword ptr | |
55 ; CHECK: mfence | 50 ; CHECK: mfence |
56 ; CHECK: add {{.*}}, dword ptr [.bss] | 51 ; CHECK: add {{.*}},DWORD PTR {{.*}}.bss |
57 ; CHECK-NEXT: R_386_32 | 52 ; CHECK: mov DWORD PTR |
58 ; CHECK: mov dword ptr | 53 ; CHECK: add {{.*}},DWORD PTR {{.*}}.bss |
59 ; CHECK: add {{.*}}, dword ptr [.bss] | |
60 ; CHECK-NEXT: R_386_32 | |
61 ; CHECK: mfence | 54 ; CHECK: mfence |
62 ; CHECK: mov dword ptr | 55 ; CHECK: mov DWORD PTR |
63 | 56 |
64 ; Test with the fence moved up a bit. | 57 ; Test with the fence moved up a bit. |
65 define i32 @test_fused_load_add_b() { | 58 define i32 @test_fused_load_add_b() { |
66 entry: | 59 entry: |
67 %p_alloca = alloca i8, i32 4, align 4 | 60 %p_alloca = alloca i8, i32 4, align 4 |
68 %p_alloca_bc = bitcast i8* %p_alloca to i32* | 61 %p_alloca_bc = bitcast i8* %p_alloca to i32* |
69 store i32 999, i32* %p_alloca_bc, align 1 | 62 store i32 999, i32* %p_alloca_bc, align 1 |
70 | 63 |
71 %p_a = bitcast [4 x i8]* @g32_a to i32* | 64 %p_a = bitcast [4 x i8]* @g32_a to i32* |
72 %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6) | 65 %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6) |
73 %l_a2 = add i32 %l_a, 1 | 66 %l_a2 = add i32 %l_a, 1 |
74 call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6) | 67 call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6) |
75 | 68 |
76 %p_b = bitcast [4 x i8]* @g32_b to i32* | 69 %p_b = bitcast [4 x i8]* @g32_b to i32* |
77 %l_b = load i32* %p_b, align 1 | 70 %l_b = load i32* %p_b, align 1 |
78 %l_b2 = add i32 %l_b, 1 | 71 %l_b2 = add i32 %l_b, 1 |
79 store i32 %l_b2, i32* %p_b, align 1 | 72 store i32 %l_b2, i32* %p_b, align 1 |
80 | 73 |
81 %p_c = bitcast [4 x i8]* @g32_c to i32* | 74 %p_c = bitcast [4 x i8]* @g32_c to i32* |
82 call void @llvm.nacl.atomic.fence.all() | 75 call void @llvm.nacl.atomic.fence.all() |
83 %l_c = load i32* %p_c, align 1 | 76 %l_c = load i32* %p_c, align 1 |
84 %l_c2 = add i32 %l_c, 1 | 77 %l_c2 = add i32 %l_c, 1 |
85 store i32 %l_c2, i32* %p_c, align 1 | 78 store i32 %l_c2, i32* %p_c, align 1 |
86 | 79 |
87 ret i32 %l_c2 | 80 ret i32 %l_c2 |
88 } | 81 } |
89 ; CHECK-LABEL: test_fused_load_add_b | 82 ; CHECK-LABEL: test_fused_load_add_b |
90 ; alloca store | 83 ; alloca store |
91 ; CHECK: mov {{.*}}, esp | 84 ; CHECK: mov {{.*}},esp |
92 ; CHECK: mov dword ptr {{.*}}, 999 | 85 ; CHECK: mov DWORD PTR {{.*}},0x3e7 |
93 ; atomic store (w/ its own mfence) | 86 ; atomic store (w/ its own mfence) |
94 ; CHECK: add {{.*}}, dword ptr [.bss] | 87 ; CHECK: add {{.*}},DWORD PTR {{.*}}.bss |
95 ; CHECK-NEXT: R_386_32 | 88 ; CHECK: mov DWORD PTR |
96 ; CHECK: mov dword ptr | |
97 ; CHECK: mfence | 89 ; CHECK: mfence |
98 ; CHECK: add {{.*}}, dword ptr [.bss] | 90 ; CHECK: add {{.*}},DWORD PTR {{.*}}.bss |
99 ; CHECK-NEXT: R_386_32 | 91 ; CHECK: mov DWORD PTR |
100 ; CHECK: mov dword ptr | |
101 ; CHECK: mfence | 92 ; CHECK: mfence |
102 ; Load + add can still be optimized into one instruction | 93 ; Load + add can still be optimized into one instruction |
103 ; because they are not separated by a fence. | 94 ; because they are not separated by a fence. |
104 ; CHECK: add {{.*}}, dword ptr [.bss] | 95 ; CHECK: add {{.*}},DWORD PTR {{.*}}.bss |
105 ; CHECK-NEXT: R_386_32 | 96 ; CHECK: mov DWORD PTR |
106 ; CHECK: mov dword ptr | |
107 | 97 |
108 ; Test with the fence splitting a load/add. | 98 ; Test with the fence splitting a load/add. |
109 define i32 @test_fused_load_add_c() { | 99 define i32 @test_fused_load_add_c() { |
110 entry: | 100 entry: |
111 %p_alloca = alloca i8, i32 4, align 4 | 101 %p_alloca = alloca i8, i32 4, align 4 |
112 %p_alloca_bc = bitcast i8* %p_alloca to i32* | 102 %p_alloca_bc = bitcast i8* %p_alloca to i32* |
113 store i32 999, i32* %p_alloca_bc, align 1 | 103 store i32 999, i32* %p_alloca_bc, align 1 |
114 | 104 |
115 %p_a = bitcast [4 x i8]* @g32_a to i32* | 105 %p_a = bitcast [4 x i8]* @g32_a to i32* |
116 %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6) | 106 %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6) |
117 %l_a2 = add i32 %l_a, 1 | 107 %l_a2 = add i32 %l_a, 1 |
118 call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6) | 108 call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6) |
119 | 109 |
120 %p_b = bitcast [4 x i8]* @g32_b to i32* | 110 %p_b = bitcast [4 x i8]* @g32_b to i32* |
121 %l_b = load i32* %p_b, align 1 | 111 %l_b = load i32* %p_b, align 1 |
122 call void @llvm.nacl.atomic.fence.all() | 112 call void @llvm.nacl.atomic.fence.all() |
123 %l_b2 = add i32 %l_b, 1 | 113 %l_b2 = add i32 %l_b, 1 |
124 store i32 %l_b2, i32* %p_b, align 1 | 114 store i32 %l_b2, i32* %p_b, align 1 |
125 | 115 |
126 %p_c = bitcast [4 x i8]* @g32_c to i32* | 116 %p_c = bitcast [4 x i8]* @g32_c to i32* |
127 %l_c = load i32* %p_c, align 1 | 117 %l_c = load i32* %p_c, align 1 |
128 %l_c2 = add i32 %l_c, 1 | 118 %l_c2 = add i32 %l_c, 1 |
129 store i32 %l_c2, i32* %p_c, align 1 | 119 store i32 %l_c2, i32* %p_c, align 1 |
130 | 120 |
131 ret i32 %l_c2 | 121 ret i32 %l_c2 |
132 } | 122 } |
133 ; CHECK-LABEL: test_fused_load_add_c | 123 ; CHECK-LABEL: test_fused_load_add_c |
134 ; alloca store | 124 ; alloca store |
135 ; CHECK: mov {{.*}}, esp | 125 ; CHECK: mov {{.*}},esp |
136 ; CHECK: mov dword ptr {{.*}}, 999 | 126 ; CHECK: mov DWORD PTR {{.*}},0x3e7 |
137 ; atomic store (w/ its own mfence) | 127 ; atomic store (w/ its own mfence) |
138 ; CHECK: add {{.*}}, dword ptr [.bss] | 128 ; CHECK: add {{.*}},DWORD PTR {{.*}}.bss |
139 ; CHECK-NEXT: R_386_32 | 129 ; CHECK: mov DWORD PTR |
140 ; CHECK: mov dword ptr | |
141 ; CHECK: mfence | 130 ; CHECK: mfence |
142 ; This load + add is no longer optimized into one, | 131 ; This load + add is no longer optimized into one, |
143 ; though perhaps fusing them should still be legal as long as | 132 ; though perhaps fusing them should still be legal as long as |
144 ; the load stays on the same side of the fence. | 133 ; the load stays on the same side of the fence. |
145 ; CHECK: mov {{.*}}, dword ptr [.bss] | 134 ; CHECK: mov {{.*}},DWORD PTR {{.*}}.bss |
146 ; CHECK-NEXT: R_386_32 | |
147 ; CHECK: mfence | 135 ; CHECK: mfence |
148 ; CHECK: add {{.*}}, 1 | 136 ; CHECK: add {{.*}},0x1 |
149 ; CHECK: mov dword ptr | 137 ; CHECK: mov DWORD PTR |
150 ; CHECK: add {{.*}}, dword ptr [.bss] | 138 ; CHECK: add {{.*}},DWORD PTR {{.*}}.bss |
151 ; CHECK-NEXT: R_386_32 | 139 ; CHECK: mov DWORD PTR |
152 ; CHECK: mov dword ptr | |
153 | 140 |
154 | 141 |
155 ; Test where a bunch of i8 loads could have been fused into one | 142 ; Test where a bunch of i8 loads could have been fused into one |
156 ; i32 load, but a fence blocks that. | 143 ; i32 load, but a fence blocks that. |
157 define i32 @could_have_fused_loads() { | 144 define i32 @could_have_fused_loads() { |
158 entry: | 145 entry: |
159 %ptr1 = bitcast [4 x i8]* @g32_d to i8* | 146 %ptr1 = bitcast [4 x i8]* @g32_d to i8* |
160 %b1 = load i8* %ptr1, align 1 | 147 %b1 = load i8* %ptr1, align 1 |
161 | 148 |
162 %int_ptr2 = ptrtoint [4 x i8]* @g32_d to i32 | 149 %int_ptr2 = ptrtoint [4 x i8]* @g32_d to i32 |
(...skipping 17 matching lines...) |
180 %b12 = or i32 %b1.ext, %b2.shift | 167 %b12 = or i32 %b1.ext, %b2.shift |
181 %b3.ext = zext i8 %b3 to i32 | 168 %b3.ext = zext i8 %b3 to i32 |
182 %b3.shift = shl i32 %b3.ext, 16 | 169 %b3.shift = shl i32 %b3.ext, 16 |
183 %b123 = or i32 %b12, %b3.shift | 170 %b123 = or i32 %b12, %b3.shift |
184 %b4.ext = zext i8 %b4 to i32 | 171 %b4.ext = zext i8 %b4 to i32 |
185 %b4.shift = shl i32 %b4.ext, 24 | 172 %b4.shift = shl i32 %b4.ext, 24 |
186 %b1234 = or i32 %b123, %b4.shift | 173 %b1234 = or i32 %b123, %b4.shift |
187 ret i32 %b1234 | 174 ret i32 %b1234 |
188 } | 175 } |
189 ; CHECK-LABEL: could_have_fused_loads | 176 ; CHECK-LABEL: could_have_fused_loads |
190 ; CHECK: mov {{.*}}, byte ptr | 177 ; CHECK: mov {{.*}},BYTE PTR |
191 ; CHECK-NEXT: R_386_32 | 178 ; CHECK: mov {{.*}},BYTE PTR |
192 ; CHECK: mov {{.*}}, byte ptr | 179 ; CHECK: mov {{.*}},BYTE PTR |
193 ; CHECK: mov {{.*}}, byte ptr | |
194 ; CHECK: mfence | 180 ; CHECK: mfence |
195 ; CHECK: mov {{.*}}, byte ptr | 181 ; CHECK: mov {{.*}},BYTE PTR |
196 | 182 |
197 | 183 |
198 ; Test where an identical load from two branches could have been hoisted | 184 ; Test where an identical load from two branches could have been hoisted |
199 ; up, and then the code merged, but a fence prevents it. | 185 ; up, and then the code merged, but a fence prevents it. |
200 define i32 @could_have_hoisted_loads(i32 %x) { | 186 define i32 @could_have_hoisted_loads(i32 %x) { |
201 entry: | 187 entry: |
202 %ptr = bitcast [4 x i8]* @g32_d to i32* | 188 %ptr = bitcast [4 x i8]* @g32_d to i32* |
203 %cmp = icmp eq i32 %x, 1 | 189 %cmp = icmp eq i32 %x, 1 |
204 br i1 %cmp, label %branch1, label %branch2 | 190 br i1 %cmp, label %branch1, label %branch2 |
205 branch1: | 191 branch1: |
206 %y = load i32* %ptr, align 1 | 192 %y = load i32* %ptr, align 1 |
207 ret i32 %y | 193 ret i32 %y |
208 branch2: | 194 branch2: |
209 call void @llvm.nacl.atomic.fence.all() | 195 call void @llvm.nacl.atomic.fence.all() |
210 %z = load i32* %ptr, align 1 | 196 %z = load i32* %ptr, align 1 |
211 ret i32 %z | 197 ret i32 %z |
212 } | 198 } |
213 ; CHECK-LABEL: could_have_hoisted_loads | 199 ; CHECK-LABEL: could_have_hoisted_loads |
214 ; CHECK: jne {{.*}} | 200 ; CHECK: jne {{.*}} |
215 ; CHECK: mov {{.*}}, dword ptr [.bss] | 201 ; CHECK: mov {{.*}},DWORD PTR {{.*}}.bss |
216 ; CHECK-NEXT: R_386_32 | |
217 ; CHECK: ret | 202 ; CHECK: ret |
218 ; CHECK: mfence | 203 ; CHECK: mfence |
219 ; CHECK: mov {{.*}}, dword ptr [.bss] | 204 ; CHECK: mov {{.*}},DWORD PTR {{.*}}.bss |
220 ; CHECK-NEXT: R_386_32 | |
221 ; CHECK: ret | 205 ; CHECK: ret |
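
; For contrast with the header comment above: a minimal sketch of the
; non-"all" fence, which only orders the atomic loads/stores around it.
; This is illustrative only, not part of the test. It assumes the PNaCl
; intrinsic signature @llvm.nacl.atomic.fence(i32) and the memory-order
; encoding used elsewhere in this file (6 == seq_cst).

declare void @llvm.nacl.atomic.fence(i32)

define void @fence_atomics_only() {
entry:
  %p = bitcast [4 x i8]* @g32_a to i32*
  ; A non-atomic store: the compiler would still be free to move this
  ; across the plain fence below, unlike with nacl.atomic.fence.all.
  store i32 1, i32* %p, align 1
  ; Fence ordering only atomic accesses (6 == seq_cst).
  call void @llvm.nacl.atomic.fence(i32 6)
  ret void
}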