// compiler_builtins/mem/x86_64.rs

// On most modern Intel and AMD processors, "rep movsq" and "rep stosq" have
// been enhanced to perform better than a simple qword loop, making them ideal
// for implementing memcpy/memset. Note that "rep cmps" has received no such
// enhancement, so it is not used to implement memcmp.
//
// On certain recent Intel processors, "rep movsb" and "rep stosb" have been
// further enhanced to automatically select the best microarchitectural
// implementation based on length and alignment. See the following features from
// the "Intel® 64 and IA-32 Architectures Optimization Reference Manual":
//  - ERMSB - Enhanced REP MOVSB and STOSB (Ivy Bridge and later)
//  - FSRM - Fast Short REP MOV (Ice Lake and later)
//  - Fast Zero-Length MOVSB (On no current hardware)
//  - Fast Short STOSB (On no current hardware)
//
// To simplify things, we switch to using the byte-based variants if the "ermsb"
// feature is present at compile-time. We don't bother detecting other features.
// Note that ERMSB does not enhance the backwards (DF=1) "rep movsb".
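//
// Illustration (not part of the original file): the dispatch below is purely a
// compile-time choice, so the same split could be observed from ordinary Rust
// via `cfg!`, assuming the crate is built with the feature enabled (e.g. with
// `RUSTFLAGS="-C target-feature=+ermsb"`):
//
//     if cfg!(target_feature = "ermsb") {
//         // byte-wise "rep movsb" / "rep stosb" path
//     } else {
//         // qword-wise "rep movsq" / "rep stosq" path with alignment fix-ups
//     }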

use core::arch::asm;
use core::{intrinsics, mem};

#[inline(always)]
#[cfg(target_feature = "ermsb")]
pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
    // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
    core::arch::asm!(
        "repe movsb (%rsi), (%rdi)",
        inout("rcx") count => _,
        inout("rdi") dest => _,
        inout("rsi") src => _,
        options(att_syntax, nostack, preserves_flags)
    );
}

#[inline(always)]
#[cfg(not(target_feature = "ermsb"))]
pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, count: usize) {
    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
    // Separating the blocks gives the compiler more freedom to reorder instructions.
    asm!(
        "rep movsb",
        inout("ecx") pre_byte_count => _,
        inout("rdi") dest => dest,
        inout("rsi") src => src,
        options(att_syntax, nostack, preserves_flags)
    );
    asm!(
        "rep movsq",
        inout("rcx") qword_count => _,
        inout("rdi") dest => dest,
        inout("rsi") src => src,
        options(att_syntax, nostack, preserves_flags)
    );
    asm!(
        "rep movsb",
        inout("ecx") byte_count => _,
        inout("rdi") dest => _,
        inout("rsi") src => _,
        options(att_syntax, nostack, preserves_flags)
    );
}

#[inline(always)]
pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) {
    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
    // We can't separate this block due to std/cld
    asm!(
        "std",
        "rep movsb",
        "sub $7, %rsi",
        "sub $7, %rdi",
        "mov {qword_count}, %rcx",
        "rep movsq",
        "test {pre_byte_count:e}, {pre_byte_count:e}",
        "add $7, %rsi",
        "add $7, %rdi",
        "mov {pre_byte_count:e}, %ecx",
        "rep movsb",
        "cld",
        pre_byte_count = in(reg) pre_byte_count,
        qword_count = in(reg) qword_count,
        inout("ecx") byte_count => _,
        inout("rdi") dest.add(count - 1) => _,
        inout("rsi") src.add(count - 1) => _,
        // We modify flags, but we restore them afterwards
        options(att_syntax, nostack, preserves_flags)
    );
}
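
// A minimal sketch (not part of the original crate's test setup) of how
// `copy_forward` and `copy_backward` could be exercised, assuming a host test
// environment where `std` and the test harness are available.
#[cfg(test)]
mod copy_tests {
    use super::{copy_backward, copy_forward};

    #[test]
    fn overlapping_copies() {
        // Forward copy: the source region starts after the destination region.
        let mut buf = *b"..hello";
        unsafe { copy_forward(buf.as_mut_ptr(), buf.as_ptr().add(2), 5) };
        assert_eq!(&buf[..5], b"hello");

        // Backward copy: the destination region starts after the source region.
        let mut buf = *b"hello..";
        unsafe { copy_backward(buf.as_mut_ptr().add(2), buf.as_ptr(), 5) };
        assert_eq!(&buf[2..], b"hello");
    }
}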

#[inline(always)]
#[cfg(target_feature = "ermsb")]
pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
    // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
    core::arch::asm!(
        "repe stosb %al, (%rdi)",
        inout("rcx") count => _,
        inout("rdi") dest => _,
        inout("al") c => _,
        options(att_syntax, nostack, preserves_flags)
    )
}

#[inline(always)]
#[cfg(not(target_feature = "ermsb"))]
pub unsafe fn set_bytes(mut dest: *mut u8, c: u8, count: usize) {
    let c = c as u64 * 0x0101_0101_0101_0101;
    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
    // Separating the blocks gives the compiler more freedom to reorder instructions.
    asm!(
        "rep stosb",
        inout("ecx") pre_byte_count => _,
        inout("rdi") dest => dest,
        in("rax") c,
        options(att_syntax, nostack, preserves_flags)
    );
    asm!(
        "rep stosq",
        inout("rcx") qword_count => _,
        inout("rdi") dest => dest,
        in("rax") c,
        options(att_syntax, nostack, preserves_flags)
    );
    asm!(
        "rep stosb",
        inout("ecx") byte_count => _,
        inout("rdi") dest => _,
        in("rax") c,
        options(att_syntax, nostack, preserves_flags)
    );
}
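
// A minimal sketch (illustrative only, assuming a host test environment) of
// how `set_bytes` could be exercised; the interior qwords are written by
// `rep stosq` with `c` broadcast into every byte of `rax` via the
// 0x0101_0101_0101_0101 multiplication above.
#[cfg(test)]
mod set_tests {
    use super::set_bytes;

    #[test]
    fn fills_every_byte() {
        // A length that is deliberately not a multiple of 8, so all three
        // `rep stos` blocks (pre-alignment, qwords, tail) can come into play.
        let mut buf = [0u8; 37];
        unsafe { set_bytes(buf.as_mut_ptr(), 0xAB, buf.len()) };
        assert!(buf.iter().all(|&b| b == 0xAB));
    }
}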

#[inline(always)]
pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
    #[inline(always)]
    unsafe fn cmp<T, U, F>(mut a: *const T, mut b: *const T, n: usize, f: F) -> i32
    where
        T: Clone + Copy + Eq,
        U: Clone + Copy + Eq,
        F: FnOnce(*const U, *const U, usize) -> i32,
    {
        // Ensure T is not a ZST.
        const { assert!(mem::size_of::<T>() != 0) };

        let end = a.add(intrinsics::unchecked_div(n, mem::size_of::<T>()));
        while a != end {
            if a.read_unaligned() != b.read_unaligned() {
                return f(a.cast(), b.cast(), mem::size_of::<T>());
            }
            a = a.add(1);
            b = b.add(1);
        }
        f(
            a.cast(),
            b.cast(),
            intrinsics::unchecked_rem(n, mem::size_of::<T>()),
        )
    }
    let c1 = |mut a: *const u8, mut b: *const u8, n| {
        for _ in 0..n {
            if a.read() != b.read() {
                return i32::from(a.read()) - i32::from(b.read());
            }
            a = a.add(1);
            b = b.add(1);
        }
        0
    };
    let c2 = |a: *const u16, b, n| cmp(a, b, n, c1);
    let c4 = |a: *const u32, b, n| cmp(a, b, n, c2);
    let c8 = |a: *const u64, b, n| cmp(a, b, n, c4);
    let c16 = |a: *const u128, b, n| cmp(a, b, n, c8);
    c16(a.cast(), b.cast(), n)
}
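
// A minimal sketch (illustrative only, assuming a host test environment) of
// the memcmp-style contract of `compare_bytes`: zero for equal buffers, and a
// result whose sign matches the ordering of the first differing byte.
#[cfg(test)]
mod compare_tests {
    use super::compare_bytes;

    #[test]
    fn sign_of_first_difference() {
        let a = *b"abcdefghijklmnopqrstuvwxyz0";
        let mut b = a;
        assert_eq!(unsafe { compare_bytes(a.as_ptr(), b.as_ptr(), a.len()) }, 0);
        // 'A' < 'u', so `a` compares greater than `b` from byte 20 onwards.
        b[20] = b'A';
        assert!(unsafe { compare_bytes(a.as_ptr(), b.as_ptr(), a.len()) } > 0);
        assert!(unsafe { compare_bytes(b.as_ptr(), a.as_ptr(), a.len()) } < 0);
    }
}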

// In order to process more than one byte simultaneously when executing strlen,
// two things must be considered:
// * An n-byte read with an n-byte aligned address will never cross
//   a page boundary and will always succeed. Any smaller alignment
//   may result in a read that will cross a page boundary, which may
//   trigger an access violation.
// * Surface Rust considers any kind of out-of-bounds read as undefined
//   behaviour. To dodge this, memory access operations are written
//   using inline assembly.

#[cfg(target_feature = "sse2")]
#[inline(always)]
pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
    use core::arch::x86_64::{__m128i, _mm_cmpeq_epi8, _mm_movemask_epi8, _mm_set1_epi8};

    let mut n = 0;

    // The use of _mm_movemask_epi8 and company allows for speedups,
    // but those intrinsics aren't cheap by themselves. Thus, possibly small
    // strings are handled in simple loops.

    for _ in 0..4 {
        if *s == 0 {
            return n;
        }

        n += 1;
        s = s.add(1);
    }

    // Shave off the least significant bits to align the address to a 16-byte
    // boundary. The shaved-off bits are used to correct the first iteration.

    let align = s as usize & 15;
    let mut s = ((s as usize) - align) as *const __m128i;
    let zero = _mm_set1_epi8(0);

    let x = {
        let r;
        asm!(
            "movdqa ({addr}), {dest}",
            addr = in(reg) s,
            dest = out(xmm_reg) r,
            options(att_syntax, nostack),
        );
        r
    };
    let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) >> align;

    if cmp != 0 {
        return n + cmp.trailing_zeros() as usize;
    }

    n += 16 - align;
    s = s.add(1);

    loop {
        let x = {
            let r;
            asm!(
                "movdqa ({addr}), {dest}",
                addr = in(reg) s,
                dest = out(xmm_reg) r,
                options(att_syntax, nostack),
            );
            r
        };
        let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) as u32;
        if cmp == 0 {
            n += 16;
            s = s.add(1);
        } else {
            return n + cmp.trailing_zeros() as usize;
        }
    }
}
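
// A minimal sketch (illustrative only) of the movemask trick used above:
// comparing a 16-byte block against zero and counting the mask's trailing
// zeros yields the offset of the first NUL byte within that block.
#[cfg(all(test, target_feature = "sse2"))]
mod movemask_sketch {
    use core::arch::x86_64::{_mm_cmpeq_epi8, _mm_loadu_si128, _mm_movemask_epi8, _mm_set1_epi8};

    #[test]
    fn first_nul_offset_in_block() {
        let block = *b"hello world\0....";
        let mask = unsafe {
            let x = _mm_loadu_si128(block.as_ptr().cast());
            _mm_movemask_epi8(_mm_cmpeq_epi8(x, _mm_set1_epi8(0)))
        };
        // The NUL terminator sits at index 11 of the block.
        assert_eq!(mask.trailing_zeros(), 11);
    }
}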

// Provided for scenarios like kernel development, where SSE might not
// be available.
#[cfg(not(target_feature = "sse2"))]
#[inline(always)]
pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
    let mut n = 0;

    // Check bytes in steps of one until either a zero byte is discovered
    // or the pointer is aligned to an eight-byte boundary.

    while s as usize & 7 != 0 {
        if *s == 0 {
            return n;
        }
        n += 1;
        s = s.add(1);
    }

    // Check bytes in steps of eight until a zero
    // byte is discovered.

    let mut s = s as *const u64;

    loop {
        let mut cs = {
            let r: u64;
            asm!(
                "mov ({addr}), {dest}",
                addr = in(reg) s,
                dest = out(reg) r,
                options(att_syntax, nostack),
            );
            r
        };
        // Detect if a word has a zero byte, taken from
        // https://graphics.stanford.edu/~seander/bithacks.html
        if (cs.wrapping_sub(0x0101010101010101) & !cs & 0x8080808080808080) != 0 {
            loop {
                if cs & 255 == 0 {
                    return n;
                } else {
                    cs >>= 8;
                    n += 1;
                }
            }
        } else {
            n += 8;
            s = s.add(1);
        }
    }
}
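
// A minimal sketch (illustrative only) of the zero-byte bithack used above:
// `(w - 0x0101..01) & !w & 0x8080..80` is non-zero exactly when some byte of
// `w` is zero, which is what lets the fallback loop test eight bytes at once.
#[cfg(test)]
mod zero_byte_bithack {
    fn has_zero_byte(w: u64) -> bool {
        (w.wrapping_sub(0x0101_0101_0101_0101) & !w & 0x8080_8080_8080_8080) != 0
    }

    #[test]
    fn detects_zero_bytes() {
        assert!(has_zero_byte(u64::from_le_bytes(*b"abc\0defg")));
        assert!(has_zero_byte(0)); // the all-zero word
        assert!(!has_zero_byte(u64::from_le_bytes(*b"abcdefgh")));
    }
}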

/// Determine optimal parameters for a `rep` instruction.
fn rep_param(dest: *mut u8, mut count: usize) -> (usize, usize, usize) {
    // Unaligned writes are still slow on modern processors, so align the destination address.
    let pre_byte_count = ((8 - (dest as usize & 0b111)) & 0b111).min(count);
    count -= pre_byte_count;
    let qword_count = count >> 3;
    let byte_count = count & 0b111;
    (pre_byte_count, qword_count, byte_count)
}
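
// A minimal sketch (illustrative only, assuming a host test environment) of
// the split `rep_param` produces: bytes up to the next 8-byte boundary, then
// whole qwords, then the remaining tail bytes, always summing to `count`.
#[cfg(test)]
mod rep_param_tests {
    use super::rep_param;

    #[test]
    fn splits_around_alignment() {
        // A destination 3 bytes past an 8-byte boundary with count 20:
        // 5 bytes to reach alignment, 1 qword (8 bytes), 7 tail bytes.
        assert_eq!(rep_param(0x1003usize as *mut u8, 20), (5, 1, 7));
        // An already-aligned destination needs no pre-alignment bytes.
        assert_eq!(rep_param(0x1000usize as *mut u8, 16), (0, 2, 0));
        // The pre-alignment byte count never exceeds `count`.
        assert_eq!(rep_param(0x1007usize as *mut u8, 0), (0, 0, 0));
    }
}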