| OLD | NEW |
| 1 ; Test that loads/stores don't move across a nacl.atomic.fence.all. | 1 ; Test that loads/stores don't move across a nacl.atomic.fence.all. |
| 2 ; This should apply to both atomic and non-atomic loads/stores | 2 ; This should apply to both atomic and non-atomic loads/stores |
| 3 ; (unlike the non-"all" variety of nacl.atomic.fence, which only | 3 ; (unlike the non-"all" variety of nacl.atomic.fence, which only |
| 4 ; applies to atomic load/stores). | 4 ; applies to atomic load/stores). |
| 5 ; | 5 ; |
| 6 ; RUN: %p2i -i %s --filetype=obj --disassemble --args -O2 | FileCheck %s | 6 ; RUN: %p2i -i %s --filetype=obj --disassemble --args -O2 | FileCheck %s |
| 7 | 7 |
| 8 declare void @llvm.nacl.atomic.fence.all() | 8 declare void @llvm.nacl.atomic.fence.all() |
| 9 declare i32 @llvm.nacl.atomic.load.i32(i32*, i32) | 9 declare i32 @llvm.nacl.atomic.load.i32(i32*, i32) |
| 10 declare void @llvm.nacl.atomic.store.i32(i32, i32*, i32) | 10 declare void @llvm.nacl.atomic.store.i32(i32, i32*, i32) |
| 11 | 11 |
| 12 @g32_a = internal global [4 x i8] zeroinitializer, align 4 | 12 @g32_a = internal global [4 x i8] zeroinitializer, align 4 |
| 13 @g32_b = internal global [4 x i8] zeroinitializer, align 4 | 13 @g32_b = internal global [4 x i8] zeroinitializer, align 4 |
| 14 @g32_c = internal global [4 x i8] zeroinitializer, align 4 | 14 @g32_c = internal global [4 x i8] zeroinitializer, align 4 |
| 15 @g32_d = internal global [4 x i8] zeroinitializer, align 4 | 15 @g32_d = internal global [4 x i8] zeroinitializer, align 4 |
| 16 | 16 |
| 17 define i32 @test_fused_load_add_a() { | 17 define i32 @test_fused_load_add_a() { |
| 18 entry: | 18 entry: |
| 19 %p_alloca = alloca i8, i32 4, align 4 | 19 %p_alloca = alloca i8, i32 4, align 4 |
| 20 %p_alloca_bc = bitcast i8* %p_alloca to i32* | 20 %p_alloca_bc = bitcast i8* %p_alloca to i32* |
| 21 store i32 999, i32* %p_alloca_bc, align 1 | 21 store i32 999, i32* %p_alloca_bc, align 1 |
| 22 | 22 |
| 23 %p_a = bitcast [4 x i8]* @g32_a to i32* | 23 %p_a = bitcast [4 x i8]* @g32_a to i32* |
| 24 %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6) | 24 %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6) |
| 25 %l_a2 = add i32 %l_a, 1 | 25 %l_a2 = add i32 %l_a, 1 |
| 26 call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6) | 26 call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6) |
| 27 | 27 |
| 28 %p_b = bitcast [4 x i8]* @g32_b to i32* | 28 %p_b = bitcast [4 x i8]* @g32_b to i32* |
| 29 %l_b = load i32* %p_b, align 1 | 29 %l_b = load i32, i32* %p_b, align 1 |
| 30 %l_b2 = add i32 %l_b, 1 | 30 %l_b2 = add i32 %l_b, 1 |
| 31 store i32 %l_b2, i32* %p_b, align 1 | 31 store i32 %l_b2, i32* %p_b, align 1 |
| 32 | 32 |
| 33 %p_c = bitcast [4 x i8]* @g32_c to i32* | 33 %p_c = bitcast [4 x i8]* @g32_c to i32* |
| 34 %l_c = load i32* %p_c, align 1 | 34 %l_c = load i32, i32* %p_c, align 1 |
| 35 %l_c2 = add i32 %l_c, 1 | 35 %l_c2 = add i32 %l_c, 1 |
| 36 call void @llvm.nacl.atomic.fence.all() | 36 call void @llvm.nacl.atomic.fence.all() |
| 37 store i32 %l_c2, i32* %p_c, align 1 | 37 store i32 %l_c2, i32* %p_c, align 1 |
| 38 | 38 |
| 39 ret i32 %l_c2 | 39 ret i32 %l_c2 |
| 40 } | 40 } |
| 41 ; CHECK-LABEL: test_fused_load_add_a | 41 ; CHECK-LABEL: test_fused_load_add_a |
| 42 ; alloca store | 42 ; alloca store |
| 43 ; CHECK: mov {{.*}},esp | 43 ; CHECK: mov {{.*}},esp |
| 44 ; CHECK: mov DWORD PTR {{.*}},0x3e7 | 44 ; CHECK: mov DWORD PTR {{.*}},0x3e7 |
| (...skipping 14 matching lines...) |
| 59 %p_alloca = alloca i8, i32 4, align 4 | 59 %p_alloca = alloca i8, i32 4, align 4 |
| 60 %p_alloca_bc = bitcast i8* %p_alloca to i32* | 60 %p_alloca_bc = bitcast i8* %p_alloca to i32* |
| 61 store i32 999, i32* %p_alloca_bc, align 1 | 61 store i32 999, i32* %p_alloca_bc, align 1 |
| 62 | 62 |
| 63 %p_a = bitcast [4 x i8]* @g32_a to i32* | 63 %p_a = bitcast [4 x i8]* @g32_a to i32* |
| 64 %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6) | 64 %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6) |
| 65 %l_a2 = add i32 %l_a, 1 | 65 %l_a2 = add i32 %l_a, 1 |
| 66 call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6) | 66 call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6) |
| 67 | 67 |
| 68 %p_b = bitcast [4 x i8]* @g32_b to i32* | 68 %p_b = bitcast [4 x i8]* @g32_b to i32* |
| 69 %l_b = load i32* %p_b, align 1 | 69 %l_b = load i32, i32* %p_b, align 1 |
| 70 %l_b2 = add i32 %l_b, 1 | 70 %l_b2 = add i32 %l_b, 1 |
| 71 store i32 %l_b2, i32* %p_b, align 1 | 71 store i32 %l_b2, i32* %p_b, align 1 |
| 72 | 72 |
| 73 %p_c = bitcast [4 x i8]* @g32_c to i32* | 73 %p_c = bitcast [4 x i8]* @g32_c to i32* |
| 74 call void @llvm.nacl.atomic.fence.all() | 74 call void @llvm.nacl.atomic.fence.all() |
| 75 %l_c = load i32* %p_c, align 1 | 75 %l_c = load i32, i32* %p_c, align 1 |
| 76 %l_c2 = add i32 %l_c, 1 | 76 %l_c2 = add i32 %l_c, 1 |
| 77 store i32 %l_c2, i32* %p_c, align 1 | 77 store i32 %l_c2, i32* %p_c, align 1 |
| 78 | 78 |
| 79 ret i32 %l_c2 | 79 ret i32 %l_c2 |
| 80 } | 80 } |
| 81 ; CHECK-LABEL: test_fused_load_add_b | 81 ; CHECK-LABEL: test_fused_load_add_b |
| 82 ; alloca store | 82 ; alloca store |
| 83 ; CHECK: mov {{.*}},esp | 83 ; CHECK: mov {{.*}},esp |
| 84 ; CHECK: mov DWORD PTR {{.*}},0x3e7 | 84 ; CHECK: mov DWORD PTR {{.*}},0x3e7 |
| 85 ; atomic store (w/ its own mfence) | 85 ; atomic store (w/ its own mfence) |
| (...skipping 14 matching lines...) |
| 100 %p_alloca = alloca i8, i32 4, align 4 | 100 %p_alloca = alloca i8, i32 4, align 4 |
| 101 %p_alloca_bc = bitcast i8* %p_alloca to i32* | 101 %p_alloca_bc = bitcast i8* %p_alloca to i32* |
| 102 store i32 999, i32* %p_alloca_bc, align 1 | 102 store i32 999, i32* %p_alloca_bc, align 1 |
| 103 | 103 |
| 104 %p_a = bitcast [4 x i8]* @g32_a to i32* | 104 %p_a = bitcast [4 x i8]* @g32_a to i32* |
| 105 %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6) | 105 %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6) |
| 106 %l_a2 = add i32 %l_a, 1 | 106 %l_a2 = add i32 %l_a, 1 |
| 107 call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6) | 107 call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6) |
| 108 | 108 |
| 109 %p_b = bitcast [4 x i8]* @g32_b to i32* | 109 %p_b = bitcast [4 x i8]* @g32_b to i32* |
| 110 %l_b = load i32* %p_b, align 1 | 110 %l_b = load i32, i32* %p_b, align 1 |
| 111 call void @llvm.nacl.atomic.fence.all() | 111 call void @llvm.nacl.atomic.fence.all() |
| 112 %l_b2 = add i32 %l_b, 1 | 112 %l_b2 = add i32 %l_b, 1 |
| 113 store i32 %l_b2, i32* %p_b, align 1 | 113 store i32 %l_b2, i32* %p_b, align 1 |
| 114 | 114 |
| 115 %p_c = bitcast [4 x i8]* @g32_c to i32* | 115 %p_c = bitcast [4 x i8]* @g32_c to i32* |
| 116 %l_c = load i32* %p_c, align 1 | 116 %l_c = load i32, i32* %p_c, align 1 |
| 117 %l_c2 = add i32 %l_c, 1 | 117 %l_c2 = add i32 %l_c, 1 |
| 118 store i32 %l_c2, i32* %p_c, align 1 | 118 store i32 %l_c2, i32* %p_c, align 1 |
| 119 | 119 |
| 120 ret i32 %l_c2 | 120 ret i32 %l_c2 |
| 121 } | 121 } |
| 122 ; CHECK-LABEL: test_fused_load_add_c | 122 ; CHECK-LABEL: test_fused_load_add_c |
| 123 ; alloca store | 123 ; alloca store |
| 124 ; CHECK: mov {{.*}},esp | 124 ; CHECK: mov {{.*}},esp |
| 125 ; CHECK: mov DWORD PTR {{.*}},0x3e7 | 125 ; CHECK: mov DWORD PTR {{.*}},0x3e7 |
| 126 ; atomic store (w/ its own mfence) | 126 ; atomic store (w/ its own mfence) |
| 127 ; CHECK: add {{.*}},DWORD PTR {{.*}}g32_a | 127 ; CHECK: add {{.*}},DWORD PTR {{.*}}g32_a |
| 128 ; CHECK: mov DWORD PTR | 128 ; CHECK: mov DWORD PTR |
| 129 ; CHECK: mfence | 129 ; CHECK: mfence |
| 130 ; This load + add are no longer optimized into one, | 130 ; This load + add are no longer optimized into one, |
| 131 ; though perhaps it should be legal as long as | 131 ; though perhaps it should be legal as long as |
| 132 ; the load stays on the same side of the fence. | 132 ; the load stays on the same side of the fence. |
| 133 ; CHECK: mov {{.*}},DWORD PTR {{.*}}g32_b | 133 ; CHECK: mov {{.*}},DWORD PTR {{.*}}g32_b |
| 134 ; CHECK: mfence | 134 ; CHECK: mfence |
| 135 ; CHECK: add {{.*}},0x1 | 135 ; CHECK: add {{.*}},0x1 |
| 136 ; CHECK: mov DWORD PTR | 136 ; CHECK: mov DWORD PTR |
| 137 ; CHECK: add {{.*}},DWORD PTR {{.*}}g32_c | 137 ; CHECK: add {{.*}},DWORD PTR {{.*}}g32_c |
| 138 ; CHECK: mov DWORD PTR | 138 ; CHECK: mov DWORD PTR |
| 139 | 139 |
| 140 | 140 |
| 141 ; Test where a bunch of i8 loads could have been fused into one | 141 ; Test where a bunch of i8 loads could have been fused into one |
| 142 ; i32 load, but a fence blocks that. | 142 ; i32 load, but a fence blocks that. |
| 143 define i32 @could_have_fused_loads() { | 143 define i32 @could_have_fused_loads() { |
| 144 entry: | 144 entry: |
| 145 %ptr1 = bitcast [4 x i8]* @g32_d to i8* | 145 %ptr1 = bitcast [4 x i8]* @g32_d to i8* |
| 146 %b1 = load i8* %ptr1, align 1 | 146 %b1 = load i8, i8* %ptr1, align 1 |
| 147 | 147 |
| 148 %int_ptr2 = ptrtoint [4 x i8]* @g32_d to i32 | 148 %int_ptr2 = ptrtoint [4 x i8]* @g32_d to i32 |
| 149 %int_ptr_bump2 = add i32 %int_ptr2, 1 | 149 %int_ptr_bump2 = add i32 %int_ptr2, 1 |
| 150 %ptr2 = inttoptr i32 %int_ptr_bump2 to i8* | 150 %ptr2 = inttoptr i32 %int_ptr_bump2 to i8* |
| 151 %b2 = load i8* %ptr2, align 1 | 151 %b2 = load i8, i8* %ptr2, align 1 |
| 152 | 152 |
| 153 %int_ptr_bump3 = add i32 %int_ptr2, 2 | 153 %int_ptr_bump3 = add i32 %int_ptr2, 2 |
| 154 %ptr3 = inttoptr i32 %int_ptr_bump3 to i8* | 154 %ptr3 = inttoptr i32 %int_ptr_bump3 to i8* |
| 155 %b3 = load i8* %ptr3, align 1 | 155 %b3 = load i8, i8* %ptr3, align 1 |
| 156 | 156 |
| 157 call void @llvm.nacl.atomic.fence.all() | 157 call void @llvm.nacl.atomic.fence.all() |
| 158 | 158 |
| 159 %int_ptr_bump4 = add i32 %int_ptr2, 3 | 159 %int_ptr_bump4 = add i32 %int_ptr2, 3 |
| 160 %ptr4 = inttoptr i32 %int_ptr_bump4 to i8* | 160 %ptr4 = inttoptr i32 %int_ptr_bump4 to i8* |
| 161 %b4 = load i8* %ptr4, align 1 | 161 %b4 = load i8, i8* %ptr4, align 1 |
| 162 | 162 |
| 163 %b1.ext = zext i8 %b1 to i32 | 163 %b1.ext = zext i8 %b1 to i32 |
| 164 %b2.ext = zext i8 %b2 to i32 | 164 %b2.ext = zext i8 %b2 to i32 |
| 165 %b2.shift = shl i32 %b2.ext, 8 | 165 %b2.shift = shl i32 %b2.ext, 8 |
| 166 %b12 = or i32 %b1.ext, %b2.shift | 166 %b12 = or i32 %b1.ext, %b2.shift |
| 167 %b3.ext = zext i8 %b3 to i32 | 167 %b3.ext = zext i8 %b3 to i32 |
| 168 %b3.shift = shl i32 %b3.ext, 16 | 168 %b3.shift = shl i32 %b3.ext, 16 |
| 169 %b123 = or i32 %b12, %b3.shift | 169 %b123 = or i32 %b12, %b3.shift |
| 170 %b4.ext = zext i8 %b4 to i32 | 170 %b4.ext = zext i8 %b4 to i32 |
| 171 %b4.shift = shl i32 %b4.ext, 24 | 171 %b4.shift = shl i32 %b4.ext, 24 |
| 172 %b1234 = or i32 %b123, %b4.shift | 172 %b1234 = or i32 %b123, %b4.shift |
| 173 ret i32 %b1234 | 173 ret i32 %b1234 |
| 174 } | 174 } |
| 175 ; CHECK-LABEL: could_have_fused_loads | 175 ; CHECK-LABEL: could_have_fused_loads |
| 176 ; CHECK: mov {{.*}},BYTE PTR | 176 ; CHECK: mov {{.*}},BYTE PTR |
| 177 ; CHECK: mov {{.*}},BYTE PTR | 177 ; CHECK: mov {{.*}},BYTE PTR |
| 178 ; CHECK: mov {{.*}},BYTE PTR | 178 ; CHECK: mov {{.*}},BYTE PTR |
| 179 ; CHECK: mfence | 179 ; CHECK: mfence |
| 180 ; CHECK: mov {{.*}},BYTE PTR | 180 ; CHECK: mov {{.*}},BYTE PTR |
| 181 | 181 |
| 182 | 182 |
| 183 ; Test where an identical load from two branches could have been hoisted | 183 ; Test where an identical load from two branches could have been hoisted |
| 184 ; up, and then the code merged, but a fence prevents it. | 184 ; up, and then the code merged, but a fence prevents it. |
| 185 define i32 @could_have_hoisted_loads(i32 %x) { | 185 define i32 @could_have_hoisted_loads(i32 %x) { |
| 186 entry: | 186 entry: |
| 187 %ptr = bitcast [4 x i8]* @g32_d to i32* | 187 %ptr = bitcast [4 x i8]* @g32_d to i32* |
| 188 %cmp = icmp eq i32 %x, 1 | 188 %cmp = icmp eq i32 %x, 1 |
| 189 br i1 %cmp, label %branch1, label %branch2 | 189 br i1 %cmp, label %branch1, label %branch2 |
| 190 branch1: | 190 branch1: |
| 191 %y = load i32* %ptr, align 1 | 191 %y = load i32, i32* %ptr, align 1 |
| 192 ret i32 %y | 192 ret i32 %y |
| 193 branch2: | 193 branch2: |
| 194 call void @llvm.nacl.atomic.fence.all() | 194 call void @llvm.nacl.atomic.fence.all() |
| 195 %z = load i32* %ptr, align 1 | 195 %z = load i32, i32* %ptr, align 1 |
| 196 ret i32 %z | 196 ret i32 %z |
| 197 } | 197 } |
| 198 ; CHECK-LABEL: could_have_hoisted_loads | 198 ; CHECK-LABEL: could_have_hoisted_loads |
| 199 ; CHECK: jne {{.*}} | 199 ; CHECK: jne {{.*}} |
| 200 ; CHECK: mov {{.*}},DWORD PTR {{.*}}g32_d | 200 ; CHECK: mov {{.*}},DWORD PTR {{.*}}g32_d |
| 201 ; CHECK: ret | 201 ; CHECK: ret |
| 202 ; CHECK: mfence | 202 ; CHECK: mfence |
| 203 ; CHECK: mov {{.*}},DWORD PTR {{.*}}g32_d | 203 ; CHECK: mov {{.*}},DWORD PTR {{.*}}g32_d |
| 204 ; CHECK: ret | 204 ; CHECK: ret |
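
The only difference between the OLD and NEW columns above is LLVM's explicit-type load syntax, required since LLVM 3.7 in preparation for opaque pointers: the result type is now written out rather than inferred from the pointer operand. A minimal before/after sketch, assuming a valid i32* %p (the names here are illustrative, not from the test):

  %v = load i32* %p, align 1       ; old: result type inferred from the pointer type
  %v = load i32, i32* %p, align 1  ; new: result type spelled out, then the pointer operand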
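
For contrast with fence.all as exercised above, a minimal sketch of the non-"all" variety that the test's header comment mentions, assuming the standard PNaCl declaration (the i32 argument is the memory order; 6 = seq_cst, matching the atomic load/store calls in the tests):

  declare void @llvm.nacl.atomic.fence(i32)

  define void @fence_atomic_only(i32* %p_plain, i32 %v) {
  entry:
    ; Orders only atomic accesses: unlike fence.all, the non-atomic
    ; store below may still be scheduled across this fence.
    call void @llvm.nacl.atomic.fence(i32 6)
    store i32 %v, i32* %p_plain, align 1
    ret void
  }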