compiler_builtins/int/specialized_div_rem/mod.rs
// TODO: when `unsafe_block_in_unsafe_fn` is stabilized, remove this
#![allow(unused_unsafe)]
// The functions are complex with many branches, and explicit
// `return`s make it clear where function exit points are
#![allow(clippy::needless_return)]
#![allow(clippy::comparison_chain)]
// Clippy is confused by the complex configuration
#![allow(clippy::if_same_then_else)]
#![allow(clippy::needless_bool)]

//! This `specialized_div_rem` module is originally from version 1.0.0 of the
//! `specialized-div-rem` crate. Note that `for` loops with ranges are not used in this
//! module, since unoptimized compilation may generate references to `memcpy`.
//!
//! The purpose of these macros is to easily change both the division algorithm used
//! for a given integer size and the half division used by that algorithm. The way
//! functions call each other is also constructed such that linkers will find the chain of
//! software and hardware divisions needed for every size of signed and unsigned division.
//! For example, most target compilations do the following:
//!
//! - Many 128 bit division functions like `u128::wrapping_div` use
//!   `std::intrinsics::unchecked_div`, which gets replaced by `__udivti3` because there
//!   is not a 128 bit by 128 bit hardware division function in most architectures.
//!   `__udivti3` uses `u128_div_rem` (this extra level of function calls exists because
//!   `__umodti3` and `__udivmodti4` also exist, and `specialized_div_rem` supplies just
//!   one function to calculate both the quotient and remainder). If configuration flags
//!   enable it, `impl_trifecta!` defines `u128_div_rem` to use the trifecta algorithm,
//!   which requires the half sized division `u64_by_u64_div_rem`. If the architecture
//!   supplies a 64 bit hardware division instruction, `u64_by_u64_div_rem` will be
//!   reduced to those instructions. Note that we do not specify the half size division
//!   directly to be `__udivdi3`, because then hardware division would never be introduced.
//! - If the architecture does not supply a 64 bit hardware division instruction, u64
//!   divisions will use functions such as `__udivdi3`. This will call `u64_div_rem`
//!   which is defined by `impl_delegate!`. The half division for this algorithm is
//!   `u32_by_u32_div_rem`, which in turn becomes hardware division instructions or more
//!   software division algorithms.
//! - If the architecture does not supply a 32 bit hardware division instruction, linkers
//!   will look for `__udivsi3`. `impl_binary_long!` is used, but this algorithm uses no
//!   half division, so the chain of calls ends here.
//!
//! On some architectures like x86_64, an asymmetrically sized division is supplied, in
//! which 128 bit numbers can be divided by 64 bit numbers. `impl_asymmetric!` is used to
//! extend the 128 by 64 bit division to a full 128 by 128 bit division.
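//!
//! As a rough illustration of the chain described above (the exact functions involved
//! vary with the target and the configuration flags below), a 64 bit target with
//! hardware division resolves a 128 bit division roughly like this:
//!
//! ```text
//! u128::wrapping_div
//!   -> std::intrinsics::unchecked_div
//!   -> __udivti3
//!   -> u128_div_rem          (e.g. the trifecta algorithm)
//!   -> u64_by_u64_div_rem    (lowered to the hardware division instruction)
//! ```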

// `allow(dead_code)` is used in various places, because the configuration code would otherwise be
// ridiculously complex

#[macro_use]
mod norm_shift;

#[macro_use]
mod binary_long;

#[macro_use]
mod delegate;

// used on SPARC
#[allow(unused_imports)]
#[cfg(not(feature = "unstable-public-internals"))]
pub(crate) use self::delegate::u128_divide_sparc;
#[cfg(feature = "unstable-public-internals")]
pub use self::delegate::u128_divide_sparc;

#[macro_use]
mod trifecta;

#[macro_use]
mod asymmetric;

/// The behavior of all divisions by zero is controlled by this function. This function should be
/// impossible to reach by Rust users, unless `compiler-builtins` public division functions or
/// `core/std::unchecked_div/rem` are directly used without a zero check in front.
fn zero_div_fn() -> ! {
    // Calling the intrinsic directly, to avoid the `assert_unsafe_precondition` that cannot be used
    // here because it involves non-`inline` functions
    // (https://github.com/rust-lang/compiler-builtins/issues/491).
    unsafe { core::intrinsics::unreachable() }
}

const USE_LZ: bool = {
    if cfg!(target_arch = "arm") {
        if cfg!(target_feature = "thumb-mode") {
            // ARM thumb targets have CLZ instructions if the instruction set of ARMv6T2 is
            // supported. This is needed to successfully differentiate between targets like
            // `thumbv8m.base` and `thumbv8m.main`.
            cfg!(target_feature = "v6t2")
        } else {
            // Regular ARM targets have CLZ instructions if the ARMv5TE instruction set is
            // supported. Technically, ARMv5T was the first to have CLZ, but the "v5t" target
            // feature does not seem to work.
            cfg!(target_feature = "v5te")
        }
    } else if cfg!(any(target_arch = "sparc", target_arch = "sparc64")) {
        // LZD or LZCNT on SPARC only exists for the VIS 3 extension and later.
        cfg!(target_feature = "vis3")
    } else if cfg!(any(target_arch = "riscv32", target_arch = "riscv64")) {
        // The 'Zbb' Basic Bit-Manipulation extension on RISC-V
        // determines if a CLZ assembly instruction exists.
        cfg!(target_feature = "zbb")
    } else {
        // All other common targets Rust supports should have CLZ instructions.
        true
    }
};

impl_normalization_shift!(
    u32_normalization_shift,
    USE_LZ,
    32,
    u32,
    i32,
    allow(dead_code)
);
impl_normalization_shift!(
    u64_normalization_shift,
    USE_LZ,
    64,
    u64,
    i64,
    allow(dead_code)
);
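
// As a rough illustration of what the `USE_LZ` path of the generated normalization shift
// computes (a minimal sketch with a hypothetical name, assuming `duo >= div > 0`; the
// generated functions also have a non-CLZ fallback and options this sketch omits):
#[allow(dead_code)]
fn normalization_shift_sketch(duo: u32, div: u32) -> u32 {
    // the number of bits `div` can be shifted left before its most significant set bit
    // would pass that of `duo`
    div.leading_zeros() - duo.leading_zeros()
}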

/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
/// `checked_div` and `checked_rem` are used to avoid bringing in panic function
/// dependencies.
#[inline]
fn u64_by_u64_div_rem(duo: u64, div: u64) -> (u64, u64) {
    if let Some(quo) = duo.checked_div(div) {
        if let Some(rem) = duo.checked_rem(div) {
            return (quo, rem);
        }
    }
    zero_div_fn()
}
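
// A minimal sanity check of the contract documented above (a hypothetical test added for
// illustration; it is not part of the original module):
#[cfg(test)]
mod u64_by_u64_div_rem_sanity {
    #[test]
    fn quotient_and_remainder() {
        // 7 = 2 * 3 + 1, so the (quotient, remainder) tuple is (2, 1)
        assert_eq!(super::u64_by_u64_div_rem(7, 3), (2, 1));
    }
}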

// Whether `trifecta` or `delegate` is faster for 128 bit division depends on the speed at which a
// microarchitecture can multiply and divide. We decide to be optimistic and assume `trifecta` is
// faster if the target pointer width is at least 64. Note that this
// implementation is additionally included on WebAssembly despite the typical
// pointer width there being 32 because it's typically run on a 64-bit machine
// that has access to faster 64-bit operations.
#[cfg(all(
    any(
        target_family = "wasm",
        not(any(target_pointer_width = "16", target_pointer_width = "32")),
    ),
    not(all(not(feature = "no-asm"), target_arch = "x86_64")),
    not(any(target_arch = "sparc", target_arch = "sparc64"))
))]
impl_trifecta!(
    u128_div_rem,
    zero_div_fn,
    u64_by_u64_div_rem,
    32,
    u32,
    u64,
    u128
);
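
// The fast 64 to 128 bit widening multiplication that `trifecta` leans on is the
// operation sketched below (illustrative only, with a hypothetical name; the macro's
// generated code performs this through ordinary `u128` arithmetic rather than a helper):
#[allow(dead_code)]
fn widening_mul_sketch(a: u64, b: u64) -> (u64, u64) {
    // a full 64 x 64 -> 128 bit product, split into (low, high) halves
    let wide = (a as u128) * (b as u128);
    (wide as u64, (wide >> 64) as u64)
}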

// If the pointer width is less than 64 and this isn't wasm, then the target
// architecture almost certainly does not have the fast 64 to 128 bit widening
// multiplication needed for `trifecta` to be faster.
#[cfg(all(
    not(any(
        target_family = "wasm",
        not(any(target_pointer_width = "16", target_pointer_width = "32")),
    )),
    not(all(not(feature = "no-asm"), target_arch = "x86_64")),
    not(any(target_arch = "sparc", target_arch = "sparc64"))
))]
impl_delegate!(
    u128_div_rem,
    zero_div_fn,
    u64_normalization_shift,
    u64_by_u64_div_rem,
    32,
    u32,
    u64,
    u128,
    i128
);

/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
///
/// # Safety
///
/// If the quotient does not fit in a `u64`, a floating point exception occurs.
/// If `div == 0`, then a division by zero exception occurs.
#[cfg(all(not(feature = "no-asm"), target_arch = "x86_64"))]
#[inline]
unsafe fn u128_by_u64_div_rem(duo: u128, div: u64) -> (u64, u64) {
    let duo_lo = duo as u64;
    let duo_hi = (duo >> 64) as u64;
    let quo: u64;
    let rem: u64;
    unsafe {
        // divides the combined registers rdx:rax (`duo` is split into two 64 bit parts to do this)
        // by `div`. The quotient is stored in rax and the remainder in rdx.
        // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
        core::arch::asm!(
            "div {0}",
            in(reg) div,
            inlateout("rax") duo_lo => quo,
            inlateout("rdx") duo_hi => rem,
            options(att_syntax, pure, nomem, nostack)
        );
    }
    (quo, rem)
}
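
// The safety condition above can be made concrete with a checked wrapper (a hypothetical
// helper for illustration only; the real callers in `impl_asymmetric!` establish the
// precondition by other means):
#[cfg(all(not(feature = "no-asm"), target_arch = "x86_64"))]
#[allow(dead_code)]
fn u128_by_u64_div_rem_checked(duo: u128, div: u64) -> Option<(u64, u64)> {
    // the quotient fits in a `u64` exactly when the upper 64 bits of `duo` are less
    // than `div`, since then `duo < div * 2^64`
    if div != 0 && (duo >> 64) < div as u128 {
        // SAFETY: `div` is nonzero and the quotient fits in a `u64`
        Some(unsafe { u128_by_u64_div_rem(duo, div) })
    } else {
        None
    }
}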

// use `asymmetric` instead of `trifecta` on x86_64
#[cfg(all(not(feature = "no-asm"), target_arch = "x86_64"))]
impl_asymmetric!(
    u128_div_rem,
    zero_div_fn,
    u64_by_u64_div_rem,
    u128_by_u64_div_rem,
    32,
    u32,
    u64,
    u128
);

/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
/// `checked_div` and `checked_rem` are used to avoid bringing in panic function
/// dependencies.
#[inline]
#[allow(dead_code)]
fn u32_by_u32_div_rem(duo: u32, div: u32) -> (u32, u32) {
    if let Some(quo) = duo.checked_div(div) {
        if let Some(rem) = duo.checked_rem(div) {
            return (quo, rem);
        }
    }
    zero_div_fn()
}

// When not on x86 and the pointer width is not 64, use `delegate`, since the division size is
// larger than the register size.
#[cfg(all(
    not(all(not(feature = "no-asm"), target_arch = "x86")),
    not(target_pointer_width = "64")
))]
impl_delegate!(
    u64_div_rem,
    zero_div_fn,
    u32_normalization_shift,
    u32_by_u32_div_rem,
    16,
    u16,
    u32,
    u64,
    i64
);

// When not on x86 and the pointer width is 64, use `binary_long`.
#[cfg(all(
    not(all(not(feature = "no-asm"), target_arch = "x86")),
    target_pointer_width = "64"
))]
impl_binary_long!(
    u64_div_rem,
    zero_div_fn,
    u64_normalization_shift,
    64,
    u64,
    i64
);
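
// A minimal sketch of restoring binary long division, the shape of algorithm that
// `impl_binary_long!` expands to (illustrative only, with a hypothetical name; the
// generated code is considerably more optimized):
#[allow(dead_code)]
fn binary_long_sketch(mut duo: u64, div: u64) -> (u64, u64) {
    assert!(div != 0, "division by zero");
    // align `div`'s most significant set bit with `duo`'s
    let mut shift = (div.leading_zeros() as i32 - duo.leading_zeros() as i32).max(0);
    let mut sub = div << shift;
    let mut quo = 0u64;
    // one subtract-or-skip step per bit position, from `shift` down to 0
    loop {
        if duo >= sub {
            duo -= sub;
            quo |= 1 << shift;
        }
        if shift == 0 {
            return (quo, duo);
        }
        shift -= 1;
        sub >>= 1;
    }
}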

/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
///
/// # Safety
///
/// If the quotient does not fit in a `u32`, a floating point exception occurs.
/// If `div == 0`, then a division by zero exception occurs.
#[cfg(all(not(feature = "no-asm"), target_arch = "x86"))]
#[inline]
unsafe fn u64_by_u32_div_rem(duo: u64, div: u32) -> (u32, u32) {
    let duo_lo = duo as u32;
    let duo_hi = (duo >> 32) as u32;
    let quo: u32;
    let rem: u32;
    unsafe {
        // divides the combined registers edx:eax (`duo` is split into two 32 bit parts to do this)
        // by `div`. The quotient is stored in eax and the remainder in edx.
        // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
        core::arch::asm!(
            "div {0}",
            in(reg) div,
            inlateout("eax") duo_lo => quo,
            inlateout("edx") duo_hi => rem,
            options(att_syntax, pure, nomem, nostack)
        );
    }
    (quo, rem)
}

// use `asymmetric` instead of `delegate` on x86
#[cfg(all(not(feature = "no-asm"), target_arch = "x86"))]
impl_asymmetric!(
    u64_div_rem,
    zero_div_fn,
    u32_by_u32_div_rem,
    u64_by_u32_div_rem,
    16,
    u16,
    u32,
    u64
);

// 32 bits is the smallest division used by `compiler-builtins`, so we end with binary long division
impl_binary_long!(
    u32_div_rem,
    zero_div_fn,
    u32_normalization_shift,
    32,
    u32,
    i32,
    allow(dead_code)
);