OLD | NEW |
1 ; Test that loads/stores don't move across a nacl.atomic.fence.all. | 1 ; Test that loads/stores don't move across a nacl.atomic.fence.all. |
2 ; This should apply to both atomic and non-atomic loads/stores | 2 ; This should apply to both atomic and non-atomic loads/stores |
3 ; (unlike the non-"all" variety of nacl.atomic.fence, which only | 3 ; (unlike the non-"all" variety of nacl.atomic.fence, which only |
4 ; applies to atomic load/stores). | 4 ; applies to atomic load/stores). |
5 ; | 5 ; |
6 ; RUN: %p2i -i %s --filetype=obj --disassemble --args -O2 | FileCheck %s | 6 ; RUN: %p2i -i %s --filetype=obj --disassemble --args -O2 | FileCheck %s |
7 | 7 |
8 declare void @llvm.nacl.atomic.fence.all() | 8 declare void @llvm.nacl.atomic.fence.all() |
9 declare i32 @llvm.nacl.atomic.load.i32(i32*, i32) | 9 declare i32 @llvm.nacl.atomic.load.i32(i32*, i32) |
10 declare void @llvm.nacl.atomic.store.i32(i32, i32*, i32) | 10 declare void @llvm.nacl.atomic.store.i32(i32, i32*, i32) |
11 | 11 |
12 @g32_a = internal global [4 x i8] zeroinitializer, align 4 | 12 @g32_a = internal global [4 x i8] zeroinitializer, align 4 |
13 @g32_b = internal global [4 x i8] zeroinitializer, align 4 | 13 @g32_b = internal global [4 x i8] zeroinitializer, align 4 |
14 @g32_c = internal global [4 x i8] zeroinitializer, align 4 | 14 @g32_c = internal global [4 x i8] zeroinitializer, align 4 |
15 @g32_d = internal global [4 x i8] zeroinitializer, align 4 | 15 @g32_d = internal global [4 x i8] zeroinitializer, align 4 |
16 | 16 |
17 define i32 @test_fused_load_add_a() { | 17 define i32 @test_fused_load_sub_a() { |
18 entry: | 18 entry: |
19 %p_alloca = alloca i8, i32 4, align 4 | 19 %p_alloca = alloca i8, i32 4, align 4 |
20 %p_alloca_bc = bitcast i8* %p_alloca to i32* | 20 %p_alloca_bc = bitcast i8* %p_alloca to i32* |
21 store i32 999, i32* %p_alloca_bc, align 1 | 21 store i32 999, i32* %p_alloca_bc, align 1 |
22 | 22 |
23 %p_a = bitcast [4 x i8]* @g32_a to i32* | 23 %p_a = bitcast [4 x i8]* @g32_a to i32* |
24 %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6) | 24 %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6) |
25 %l_a2 = add i32 %l_a, 1 | 25 %l_a2 = sub i32 1, %l_a |
26 call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6) | 26 call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6) |
27 | 27 |
28 %p_b = bitcast [4 x i8]* @g32_b to i32* | 28 %p_b = bitcast [4 x i8]* @g32_b to i32* |
29 %l_b = load i32, i32* %p_b, align 1 | 29 %l_b = load i32, i32* %p_b, align 1 |
30 %l_b2 = add i32 %l_b, 1 | 30 %l_b2 = sub i32 1, %l_b |
31 store i32 %l_b2, i32* %p_b, align 1 | 31 store i32 %l_b2, i32* %p_b, align 1 |
32 | 32 |
33 %p_c = bitcast [4 x i8]* @g32_c to i32* | 33 %p_c = bitcast [4 x i8]* @g32_c to i32* |
34 %l_c = load i32, i32* %p_c, align 1 | 34 %l_c = load i32, i32* %p_c, align 1 |
35 %l_c2 = add i32 %l_c, 1 | 35 %l_c2 = sub i32 1, %l_c |
36 call void @llvm.nacl.atomic.fence.all() | 36 call void @llvm.nacl.atomic.fence.all() |
37 store i32 %l_c2, i32* %p_c, align 1 | 37 store i32 %l_c2, i32* %p_c, align 1 |
38 | 38 |
39 ret i32 %l_c2 | 39 ret i32 %l_c2 |
40 } | 40 } |
41 ; CHECK-LABEL: test_fused_load_add_a | 41 ; CHECK-LABEL: test_fused_load_sub_a |
42 ; alloca store | 42 ; alloca store |
43 ; CHECK: mov {{.*}},esp | 43 ; CHECK: mov {{.*}},esp |
44 ; CHECK: mov DWORD PTR {{.*}},0x3e7 | 44 ; CHECK: mov DWORD PTR {{.*}},0x3e7 |
45 ; atomic store (w/ its own mfence) | 45 ; atomic store (w/ its own mfence) |
46 ; The load + add are optimized into one everywhere. | 46 ; The load + sub are optimized into one everywhere. |
47 ; CHECK: add {{.*}},DWORD PTR {{.*}}g32_a | 47 ; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_a |
48 ; CHECK: mov DWORD PTR | 48 ; CHECK: mov DWORD PTR |
49 ; CHECK: mfence | 49 ; CHECK: mfence |
50 ; CHECK: add {{.*}},DWORD PTR {{.*}}g32_b | 50 ; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_b |
51 ; CHECK: mov DWORD PTR | 51 ; CHECK: mov DWORD PTR |
52 ; CHECK: add {{.*}},DWORD PTR {{.*}}g32_c | 52 ; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_c |
53 ; CHECK: mfence | 53 ; CHECK: mfence |
54 ; CHECK: mov DWORD PTR | 54 ; CHECK: mov DWORD PTR |
55 | 55 |
56 ; Test with the fence moved up a bit. | 56 ; Test with the fence moved up a bit. |
57 define i32 @test_fused_load_add_b() { | 57 define i32 @test_fused_load_sub_b() { |
58 entry: | 58 entry: |
59 %p_alloca = alloca i8, i32 4, align 4 | 59 %p_alloca = alloca i8, i32 4, align 4 |
60 %p_alloca_bc = bitcast i8* %p_alloca to i32* | 60 %p_alloca_bc = bitcast i8* %p_alloca to i32* |
61 store i32 999, i32* %p_alloca_bc, align 1 | 61 store i32 999, i32* %p_alloca_bc, align 1 |
62 | 62 |
63 %p_a = bitcast [4 x i8]* @g32_a to i32* | 63 %p_a = bitcast [4 x i8]* @g32_a to i32* |
64 %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6) | 64 %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6) |
65 %l_a2 = add i32 %l_a, 1 | 65 %l_a2 = sub i32 1, %l_a |
66 call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6) | 66 call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6) |
67 | 67 |
68 %p_b = bitcast [4 x i8]* @g32_b to i32* | 68 %p_b = bitcast [4 x i8]* @g32_b to i32* |
69 %l_b = load i32, i32* %p_b, align 1 | 69 %l_b = load i32, i32* %p_b, align 1 |
70 %l_b2 = add i32 %l_b, 1 | 70 %l_b2 = sub i32 1, %l_b |
71 store i32 %l_b2, i32* %p_b, align 1 | 71 store i32 %l_b2, i32* %p_b, align 1 |
72 | 72 |
73 %p_c = bitcast [4 x i8]* @g32_c to i32* | 73 %p_c = bitcast [4 x i8]* @g32_c to i32* |
74 call void @llvm.nacl.atomic.fence.all() | 74 call void @llvm.nacl.atomic.fence.all() |
75 %l_c = load i32, i32* %p_c, align 1 | 75 %l_c = load i32, i32* %p_c, align 1 |
76 %l_c2 = add i32 %l_c, 1 | 76 %l_c2 = sub i32 1, %l_c |
77 store i32 %l_c2, i32* %p_c, align 1 | 77 store i32 %l_c2, i32* %p_c, align 1 |
78 | 78 |
79 ret i32 %l_c2 | 79 ret i32 %l_c2 |
80 } | 80 } |
81 ; CHECK-LABEL: test_fused_load_add_b | 81 ; CHECK-LABEL: test_fused_load_sub_b |
82 ; alloca store | 82 ; alloca store |
83 ; CHECK: mov {{.*}},esp | 83 ; CHECK: mov {{.*}},esp |
84 ; CHECK: mov DWORD PTR {{.*}},0x3e7 | 84 ; CHECK: mov DWORD PTR {{.*}},0x3e7 |
85 ; atomic store (w/ its own mfence) | 85 ; atomic store (w/ its own mfence) |
86 ; CHECK: add {{.*}},DWORD PTR {{.*}}g32_a | 86 ; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_a |
87 ; CHECK: mov DWORD PTR | 87 ; CHECK: mov DWORD PTR |
88 ; CHECK: mfence | 88 ; CHECK: mfence |
89 ; CHECK: add {{.*}},DWORD PTR {{.*}}g32_b | 89 ; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_b |
90 ; CHECK: mov DWORD PTR | 90 ; CHECK: mov DWORD PTR |
91 ; CHECK: mfence | 91 ; CHECK: mfence |
92 ; Load + add can still be optimized into one instruction | 92 ; Load + sub can still be optimized into one instruction |
93 ; because it is not separated by a fence. | 93 ; because it is not separated by a fence. |
94 ; CHECK: add {{.*}},DWORD PTR {{.*}}g32_c | 94 ; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_c |
95 ; CHECK: mov DWORD PTR | 95 ; CHECK: mov DWORD PTR |
96 | 96 |
97 ; Test with the fence splitting a load/add. | 97 ; Test with the fence splitting a load/sub. |
98 define i32 @test_fused_load_add_c() { | 98 define i32 @test_fused_load_sub_c() { |
99 entry: | 99 entry: |
100 %p_alloca = alloca i8, i32 4, align 4 | 100 %p_alloca = alloca i8, i32 4, align 4 |
101 %p_alloca_bc = bitcast i8* %p_alloca to i32* | 101 %p_alloca_bc = bitcast i8* %p_alloca to i32* |
102 store i32 999, i32* %p_alloca_bc, align 1 | 102 store i32 999, i32* %p_alloca_bc, align 1 |
103 | 103 |
104 %p_a = bitcast [4 x i8]* @g32_a to i32* | 104 %p_a = bitcast [4 x i8]* @g32_a to i32* |
105 %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6) | 105 %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6) |
106 %l_a2 = add i32 %l_a, 1 | 106 %l_a2 = sub i32 1, %l_a |
107 call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6) | 107 call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6) |
108 | 108 |
109 %p_b = bitcast [4 x i8]* @g32_b to i32* | 109 %p_b = bitcast [4 x i8]* @g32_b to i32* |
110 %l_b = load i32, i32* %p_b, align 1 | 110 %l_b = load i32, i32* %p_b, align 1 |
111 call void @llvm.nacl.atomic.fence.all() | 111 call void @llvm.nacl.atomic.fence.all() |
112 %l_b2 = add i32 %l_b, 1 | 112 %l_b2 = sub i32 1, %l_b |
113 store i32 %l_b2, i32* %p_b, align 1 | 113 store i32 %l_b2, i32* %p_b, align 1 |
114 | 114 |
115 %p_c = bitcast [4 x i8]* @g32_c to i32* | 115 %p_c = bitcast [4 x i8]* @g32_c to i32* |
116 %l_c = load i32, i32* %p_c, align 1 | 116 %l_c = load i32, i32* %p_c, align 1 |
117 %l_c2 = add i32 %l_c, 1 | 117 %l_c2 = sub i32 1, %l_c |
118 store i32 %l_c2, i32* %p_c, align 1 | 118 store i32 %l_c2, i32* %p_c, align 1 |
119 | 119 |
120 ret i32 %l_c2 | 120 ret i32 %l_c2 |
121 } | 121 } |
122 ; CHECK-LABEL: test_fused_load_add_c | 122 ; CHECK-LABEL: test_fused_load_sub_c |
123 ; alloca store | 123 ; alloca store |
124 ; CHECK: mov {{.*}},esp | 124 ; CHECK: mov {{.*}},esp |
125 ; CHECK: mov DWORD PTR {{.*}},0x3e7 | 125 ; CHECK: mov DWORD PTR {{.*}},0x3e7 |
126 ; atomic store (w/ its own mfence) | 126 ; atomic store (w/ its own mfence) |
127 ; CHECK: add {{.*}},DWORD PTR {{.*}}g32_a | 127 ; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_a |
128 ; CHECK: mov DWORD PTR | 128 ; CHECK: mov DWORD PTR |
129 ; CHECK: mfence | 129 ; CHECK: mfence |
130 ; This load + add are no longer optimized into one, | 130 ; This load + sub are no longer optimized into one, |
131 ; though perhaps it should be legal as long as | 131 ; though perhaps it should be legal as long as |
132 ; the load stays on the same side of the fence. | 132 ; the load stays on the same side of the fence. |
133 ; CHECK: mov {{.*}},DWORD PTR {{.*}}g32_b | 133 ; CHECK: mov {{.*}},DWORD PTR {{.*}}g32_b |
134 ; CHECK: mfence | 134 ; CHECK: mfence |
135 ; CHECK: add {{.*}},0x1 | 135 ; CHECK: mov {{.*}},0x1 |
| 136 ; CHECK: sub |
136 ; CHECK: mov DWORD PTR | 137 ; CHECK: mov DWORD PTR |
137 ; CHECK: add {{.*}},DWORD PTR {{.*}}g32_c | 138 ; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_c |
138 ; CHECK: mov DWORD PTR | 139 ; CHECK: mov DWORD PTR |
139 | 140 |
140 | 141 |
141 ; Test where a bunch of i8 loads could have been fused into one | 142 ; Test where a bunch of i8 loads could have been fused into one |
142 ; i32 load, but a fence blocks that. | 143 ; i32 load, but a fence blocks that. |
143 define i32 @could_have_fused_loads() { | 144 define i32 @could_have_fused_loads() { |
144 entry: | 145 entry: |
145 %ptr1 = bitcast [4 x i8]* @g32_d to i8* | 146 %ptr1 = bitcast [4 x i8]* @g32_d to i8* |
146 %b1 = load i8, i8* %ptr1, align 1 | 147 %b1 = load i8, i8* %ptr1, align 1 |
147 | 148 |
(...skipping 47 matching lines...)
195 %z = load i32, i32* %ptr, align 1 | 196 %z = load i32, i32* %ptr, align 1 |
196 ret i32 %z | 197 ret i32 %z |
197 } | 198 } |
198 ; CHECK-LABEL: could_have_hoisted_loads | 199 ; CHECK-LABEL: could_have_hoisted_loads |
199 ; CHECK: jne {{.*}} | 200 ; CHECK: jne {{.*}} |
200 ; CHECK: mov {{.*}},DWORD PTR {{.*}}g32_d | 201 ; CHECK: mov {{.*}},DWORD PTR {{.*}}g32_d |
201 ; CHECK: ret | 202 ; CHECK: ret |
202 ; CHECK: mfence | 203 ; CHECK: mfence |
203 ; CHECK: mov {{.*}},DWORD PTR {{.*}}g32_d | 204 ; CHECK: mov {{.*}},DWORD PTR {{.*}}g32_d |
204 ; CHECK: ret | 205 ; CHECK: ret |
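
; A minimal standalone sketch, assuming the PNaCl intrinsic signatures (notably
; that the plain fence takes an i32 memory-order argument and that 6 encodes
; seq_cst, as in the test above), of the contrast the header comment describes:
; @llvm.nacl.atomic.fence only orders atomic loads/stores, while
; @llvm.nacl.atomic.fence.all also orders ordinary loads/stores.
declare void @llvm.nacl.atomic.store.i32(i32, i32*, i32)
declare void @llvm.nacl.atomic.fence(i32)      ; assumed: i32 memory-order argument
declare void @llvm.nacl.atomic.fence.all()

define void @fence_sketch(i32* %p_atomic, i32* %p_plain) {
entry:
  ; Atomic store, then a fence that only constrains other atomic accesses.
  call void @llvm.nacl.atomic.store.i32(i32 1, i32* %p_atomic, i32 6)
  call void @llvm.nacl.atomic.fence(i32 6)     ; 6 assumed to mean seq_cst
  ; A plain store like this one is not ordered by the fence above...
  store i32 2, i32* %p_plain, align 1
  ; ...but fence.all below orders plain loads/stores as well, so the
  ; following store may not be hoisted above it.
  call void @llvm.nacl.atomic.fence.all()
  store i32 3, i32* %p_plain, align 1
  ret void
}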