compiler_builtins/int/specialized_div_rem/
binary_long.rs

1/// Creates an unsigned division function that uses binary long division, designed for
2/// computer architectures without division instructions. These functions have good performance for
3/// microarchitectures with large branch miss penalties and architectures without the ability to
4/// predicate instructions. For architectures with predicated instructions, one of the algorithms
5/// described in the documentation of these functions probably has higher performance, and a custom
6/// assembly routine should be used instead.
7#[allow(unused_macros)]
8macro_rules! impl_binary_long {
9    (
10        $fn:ident, // name of the unsigned division function
11        $zero_div_fn:ident, // function called when division by zero is attempted
12        $normalization_shift:ident, // function for finding the normalization shift
13        $n:tt, // the number of bits in a $iX or $uX
14        $uX:ident, // unsigned integer type for the inputs and outputs of `$fn`
15        $iX:ident // signed integer type with same bitwidth as `$uX`
16        $(, $fun_attr:meta)* // attributes for the function
17    ) => {
18        /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a
19        /// tuple.
20        $(
21            #[$fun_attr]
22        )*
23        pub fn $fn(duo: $uX, div: $uX) -> ($uX, $uX) {
24            let mut duo = duo;
25            // handle edge cases before calling `$normalization_shift`
26            if div == 0 {
27                $zero_div_fn()
28            }
29            if duo < div {
30                return (0, duo);
31            }
32
33            // There are many variations of binary division algorithm that could be used. This
34            // documentation gives a tour of different methods so that future readers wanting to
35            // optimize further do not have to painstakingly derive them. The SWAR variation is
36            // especially hard to understand without reading the less convoluted methods first.
37
38            // You may notice that a `duo < div_original` check is included in many these
39            // algorithms. A critical optimization that many algorithms miss is handling of
40            // quotients that will turn out to have many trailing zeros or many leading zeros. This
41            // happens in cases of exact or close-to-exact divisions, divisions by power of two, and
42            // in cases where the quotient is small. The `duo < div_original` check handles these
43            // cases of early returns and ends up replacing other kinds of mundane checks that
44            // normally terminate a binary division algorithm.
45            //
46            // Something you may see in other algorithms that is not special-cased here is checks
47            // for division by powers of two. The `duo < div_original` check handles this case and
48            // more, however it can be checked up front before the bisection using the
49            // `((div > 0) && ((div & (div - 1)) == 0))` trick. This is not special-cased because
50            // compilers should handle most cases where divisions by power of two occur, and we do
51            // not want to add on a few cycles for every division operation just to save a few
52            // cycles rarely.
53
54            // The following example is the most straightforward translation from the way binary
55            // long division is typically visualized:
56            // Dividing 178u8 (0b10110010) by 6u8 (0b110). `div` is shifted left by 5, according to
57            // the result from `$normalization_shift(duo, div, false)`.
58            //
59            // Step 0: `sub` is negative, so there is not full normalization, so no `quo` bit is set
60            // and `duo` is kept unchanged.
61            // duo:10110010, div_shifted:11000000, sub:11110010, quo:00000000, shl:5
62            //
63            // Step 1: `sub` is positive, set a `quo` bit and update `duo` for next step.
64            // duo:10110010, div_shifted:01100000, sub:01010010, quo:00010000, shl:4
65            //
66            // Step 2: Continue based on `sub`. The `quo` bits start accumulating.
67            // duo:01010010, div_shifted:00110000, sub:00100010, quo:00011000, shl:3
68            // duo:00100010, div_shifted:00011000, sub:00001010, quo:00011100, shl:2
69            // duo:00001010, div_shifted:00001100, sub:11111110, quo:00011100, shl:1
70            // duo:00001010, div_shifted:00000110, sub:00000100, quo:00011100, shl:0
71            // The `duo < div_original` check terminates the algorithm with the correct quotient of
72            // 29u8 and remainder of 4u8
73            /*
74            let div_original = div;
75            let mut shl = $normalization_shift(duo, div, false);
76            let mut quo = 0;
77            loop {
78                let div_shifted = div << shl;
79                let sub = duo.wrapping_sub(div_shifted);
80                // it is recommended to use `println!`s like this if functionality is unclear
81                /*
82                println!("duo:{:08b}, div_shifted:{:08b}, sub:{:08b}, quo:{:08b}, shl:{}",
83                    duo,
84                    div_shifted,
85                    sub,
86                    quo,
87                    shl
88                );
89                */
90                if 0 <= (sub as $iX) {
91                    duo = sub;
92                    quo += 1 << shl;
93                    if duo < div_original {
94                        // this branch is optional
95                        return (quo, duo)
96                    }
97                }
98                if shl == 0 {
99                    return (quo, duo)
100                }
101                shl -= 1;
102            }
103            */
104
105            // This restoring binary long division algorithm reduces the number of operations
106            // overall via:
107            // - `pow` can be shifted right instead of recalculating from `shl`
108            // - starting `div` shifted left and shifting it right for each step instead of
109            //   recalculating from `shl`
110            // - The `duo < div_original` branch is used to terminate the algorithm instead of the
111            //   `shl == 0` branch. This check is strong enough to prevent set bits of `pow` and
112            //   `div` from being shifted off the end. This check also only occurs on half of steps
113            //   on average, since it is behind the `(sub as $iX) >= 0` branch.
114            // - `shl` is now not needed by any aspect of of the loop and thus only 3 variables are
115            //   being updated between steps
116            //
117            // There are many variations of this algorithm, but this encompases the largest number
118            // of architectures and does not rely on carry flags, add-with-carry, or SWAR
119            // complications to be decently fast.
120            /*
121            let div_original = div;
122            let shl = $normalization_shift(duo, div, false);
123            let mut div: $uX = div << shl;
124            let mut pow: $uX = 1 << shl;
125            let mut quo: $uX = 0;
126            loop {
127                let sub = duo.wrapping_sub(div);
128                if 0 <= (sub as $iX) {
129                    duo = sub;
130                    quo |= pow;
131                    if duo < div_original {
132                        return (quo, duo)
133                    }
134                }
135                div >>= 1;
136                pow >>= 1;
137            }
138            */
139
140            // If the architecture has flags and predicated arithmetic instructions, it is possible
141            // to do binary long division without branching and in only 3 or 4 instructions. This is
142            // a variation of a 3 instruction central loop from
143            // http://www.chiark.greenend.org.uk/~theom/riscos/docs/ultimate/a252div.txt.
144            //
145            // What allows doing division in only 3 instructions is realizing that instead of
146            // keeping `duo` in place and shifting `div` right to align bits, `div` can be kept in
147            // place and `duo` can be shifted left. This means `div` does not have to be updated,
148            // but causes edge case problems and makes `duo < div_original` tests harder. Some
149            // architectures have an option to shift an argument in an arithmetic operation, which
150            // means `duo` can be shifted left and subtracted from in one instruction. The other two
151            // instructions are updating `quo` and undoing the subtraction if it turns out things
152            // were not normalized.
153
154            /*
155            // Perform one binary long division step on the already normalized arguments, because
156            // the main. Note that this does a full normalization since the central loop needs
157            // `duo.leading_zeros()` to be at least 1 more than `div.leading_zeros()`. The original
158            // variation only did normalization to the nearest 4 steps, but this makes handling edge
159            // cases much harder. We do a full normalization and perform a binary long division
160            // step. In the edge case where the msbs of `duo` and `div` are set, it clears the msb
161            // of `duo`, then the edge case handler shifts `div` right and does another long
162            // division step to always insure `duo.leading_zeros() + 1 >= div.leading_zeros()`.
163            let div_original = div;
164            let mut shl = $normalization_shift(duo, div, true);
165            let mut div: $uX = (div << shl);
166            let mut quo: $uX = 1;
167            duo = duo.wrapping_sub(div);
168            if duo < div_original {
169                return (1 << shl, duo);
170            }
171            let div_neg: $uX;
172            if (div as $iX) < 0 {
173                // A very ugly edge case where the most significant bit of `div` is set (after
174                // shifting to match `duo` when its most significant bit is at the sign bit), which
175                // leads to the sign bit of `div_neg` being cut off and carries not happening when
176                // they should. This branch performs a long division step that keeps `duo` in place
177                // and shifts `div` down.
178                div >>= 1;
179                div_neg = div.wrapping_neg();
180                let (sub, carry) = duo.overflowing_add(div_neg);
181                duo = sub;
182                quo = quo.wrapping_add(quo).wrapping_add(carry as $uX);
183                if !carry {
184                    duo = duo.wrapping_add(div);
185                }
186                shl -= 1;
187            } else {
188                div_neg = div.wrapping_neg();
189            }
190            // The add-with-carry that updates `quo` needs to have the carry set when a normalized
191            // subtract happens. Using `duo.wrapping_shl(1).overflowing_sub(div)` to do the
192            // subtraction generates a carry when an unnormalized subtract happens, which is the
193            // opposite of what we want. Instead, we use
194            // `duo.wrapping_shl(1).overflowing_add(div_neg)`, where `div_neg` is negative `div`.
195            let mut i = shl;
196            loop {
197                if i == 0 {
198                    break;
199                }
200                i -= 1;
201                // `ADDS duo, div, duo, LSL #1`
202                // (add `div` to `duo << 1` and set flags)
203                let (sub, carry) = duo.wrapping_shl(1).overflowing_add(div_neg);
204                duo = sub;
205                // `ADC quo, quo, quo`
206                // (add with carry). Effectively shifts `quo` left by 1 and sets the least
207                // significant bit to the carry.
208                quo = quo.wrapping_add(quo).wrapping_add(carry as $uX);
209                // `ADDCC duo, duo, div`
210                // (add if carry clear). Undoes the subtraction if no carry was generated.
211                if !carry {
212                    duo = duo.wrapping_add(div);
213                }
214            }
215            return (quo, duo >> shl);
216            */
217
218            // This is the SWAR (SIMD within in a register) restoring division algorithm.
219            // This combines several ideas of the above algorithms:
220            //  - If `duo` is shifted left instead of shifting `div` right like in the 3 instruction
221            //    restoring division algorithm, some architectures can do the shifting and
222            //    subtraction step in one instruction.
223            //  - `quo` can be constructed by adding powers-of-two to it or shifting it left by one
224            //    and adding one.
225            //  - Every time `duo` is shifted left, there is another unused 0 bit shifted into the
226            //    LSB, so what if we use those bits to store `quo`?
227            // Through a complex setup, it is possible to manage `duo` and `quo` in the same
228            // register, and perform one step with 2 or 3 instructions. The only major downsides are
229            // that there is significant setup (it is only saves instructions if `shl` is
230            // approximately more than 4), `duo < div_original` checks are impractical once SWAR is
231            // initiated, and the number of division steps taken has to be exact (we cannot do more
232            // division steps than `shl`, because it introduces edge cases where quotient bits in
233            // `duo` start to collide with the real part of `div`.
234            /*
235            // first step. The quotient bit is stored in `quo` for now
236            let div_original = div;
237            let mut shl = $normalization_shift(duo, div, true);
238            let mut div: $uX = (div << shl);
239            duo = duo.wrapping_sub(div);
240            let mut quo: $uX = 1 << shl;
241            if duo < div_original {
242                return (quo, duo);
243            }
244
245            let mask: $uX;
246            if (div as $iX) < 0 {
247                // deal with same edge case as the 3 instruction restoring division algorithm, but
248                // the quotient bit from this step also has to be stored in `quo`
249                div >>= 1;
250                shl -= 1;
251                let tmp = 1 << shl;
252                mask = tmp - 1;
253                let sub = duo.wrapping_sub(div);
254                if (sub as $iX) >= 0 {
255                    // restore
256                    duo = sub;
257                    quo |= tmp;
258                }
259                if duo < div_original {
260                    return (quo, duo);
261                }
262            } else {
263                mask = quo - 1;
264            }
265            // There is now room for quotient bits in `duo`.
266
267            // Note that `div` is already shifted left and has `shl` unset bits. We subtract 1 from
268            // `div` and end up with the subset of `shl` bits being all being set. This subset acts
269            // just like a two's complement negative one. The subset of `div` containing the divisor
270            // had 1 subtracted from it, but a carry will always be generated from the `shl` subset
271            // as long as the quotient stays positive.
272            //
273            // When the modified `div` is subtracted from `duo.wrapping_shl(1)`, the `shl` subset
274            // adds a quotient bit to the least significant bit.
275            // For example, 89 (0b01011001) divided by 3 (0b11):
276            //
277            // shl:4, div:0b00110000
278            // first step:
279            //       duo:0b01011001
280            // + div_neg:0b11010000
281            // ____________________
282            //           0b00101001
283            // quo is set to 0b00010000 and mask is set to 0b00001111 for later
284            //
285            // 1 is subtracted from `div`. I will differentiate the `shl` part of `div` and the
286            // quotient part of `duo` with `^`s.
287            // chars.
288            //     div:0b00110000
289            //               ^^^^
290            //   +     0b11111111
291            //   ________________
292            //         0b00101111
293            //               ^^^^
294            // div_neg:0b11010001
295            //
296            // first SWAR step:
297            //  duo_shl1:0b01010010
298            //                    ^
299            // + div_neg:0b11010001
300            // ____________________
301            //           0b00100011
302            //                    ^
303            // second:
304            //  duo_shl1:0b01000110
305            //                   ^^
306            // + div_neg:0b11010001
307            // ____________________
308            //           0b00010111
309            //                   ^^
310            // third:
311            //  duo_shl1:0b00101110
312            //                  ^^^
313            // + div_neg:0b11010001
314            // ____________________
315            //           0b11111111
316            //                  ^^^
317            // 3 steps resulted in the quotient with 3 set bits as expected, but currently the real
318            // part of `duo` is negative and the third step was an unnormalized step. The restore
319            // branch then restores `duo`. Note that the restore branch does not shift `duo` left.
320            //
321            //   duo:0b11111111
322            //              ^^^
323            // + div:0b00101111
324            //             ^^^^
325            // ________________
326            //       0b00101110
327            //              ^^^
328            // `duo` is now back in the `duo_shl1` state it was at in the the third step, with an
329            // unset quotient bit.
330            //
331            // final step (`shl` was 4, so exactly 4 steps must be taken)
332            //  duo_shl1:0b01011100
333            //                 ^^^^
334            // + div_neg:0b11010001
335            // ____________________
336            //           0b00101101
337            //                 ^^^^
338            // The quotient includes the `^` bits added with the `quo` bits from the beginning that
339            // contained the first step and potential edge case step,
340            // `quo:0b00010000 + (duo:0b00101101 & mask:0b00001111) == 0b00011101 == 29u8`.
341            // The remainder is the bits remaining in `duo` that are not part of the quotient bits,
342            // `duo:0b00101101 >> shl == 0b0010 == 2u8`.
343            let div: $uX = div.wrapping_sub(1);
344            let mut i = shl;
345            loop {
346                if i == 0 {
347                    break;
348                }
349                i -= 1;
350                duo = duo.wrapping_shl(1).wrapping_sub(div);
351                if (duo as $iX) < 0 {
352                    // restore
353                    duo = duo.wrapping_add(div);
354                }
355            }
356            // unpack the results of SWAR
357            return ((duo & mask) | quo, duo >> shl);
358            */
359
360            // The problem with the conditional restoring SWAR algorithm above is that, in practice,
361            // it requires assembly code to bring out its full unrolled potential (It seems that
362            // LLVM can't use unrolled conditionals optimally and ends up erasing all the benefit
363            // that my algorithm intends. On architectures without predicated instructions, the code
364            // gen is especially bad. We need a default software division algorithm that is
365            // guaranteed to get decent code gen for the central loop.
366
367            // For non-SWAR algorithms, there is a way to do binary long division without
368            // predication or even branching. This involves creating a mask from the sign bit and
369            // performing different kinds of steps using that.
370            /*
371            let shl = $normalization_shift(duo, div, true);
372            let mut div: $uX = div << shl;
373            let mut pow: $uX = 1 << shl;
374            let mut quo: $uX = 0;
375            loop {
376                let sub = duo.wrapping_sub(div);
377                let sign_mask = !((sub as $iX).wrapping_shr($n - 1) as $uX);
378                duo -= div & sign_mask;
379                quo |= pow & sign_mask;
380                div >>= 1;
381                pow >>= 1;
382                if pow == 0 {
383                    break;
384                }
385            }
386            return (quo, duo);
387            */
388            // However, it requires about 4 extra operations (smearing the sign bit, negating the
389            // mask, and applying the mask twice) on top of the operations done by the actual
390            // algorithm. With SWAR however, just 2 extra operations are needed, making it
391            // practical and even the most optimal algorithm for some architectures.
392
393            // What we do is use custom assembly for predicated architectures that need software
394            // division, and for the default algorithm use a mask based restoring SWAR algorithm
395            // without conditionals or branches. On almost all architectures, this Rust code is
396            // guaranteed to compile down to 5 assembly instructions or less for each step, and LLVM
397            // will unroll it in a decent way.
398
399            // standard opening for SWAR algorithm with first step and edge case handling
400            let div_original = div;
401            let mut shl = $normalization_shift(duo, div, true);
402            let mut div: $uX = (div << shl);
403            duo = duo.wrapping_sub(div);
404            let mut quo: $uX = 1 << shl;
405            if duo < div_original {
406                return (quo, duo);
407            }
408            let mask: $uX;
409            if (div as $iX) < 0 {
410                div >>= 1;
411                shl -= 1;
412                let tmp = 1 << shl;
413                mask = tmp - 1;
414                let sub = duo.wrapping_sub(div);
415                if (sub as $iX) >= 0 {
416                    duo = sub;
417                    quo |= tmp;
418                }
419                if duo < div_original {
420                    return (quo, duo);
421                }
422            } else {
423                mask = quo - 1;
424            }
425
426            // central loop
427            div = div.wrapping_sub(1);
428            let mut i = shl;
429            loop {
430                if i == 0 {
431                    break;
432                }
433                i -= 1;
434                // shift left 1 and subtract
435                duo = duo.wrapping_shl(1).wrapping_sub(div);
436                // create mask
437                let mask = (duo as $iX).wrapping_shr($n - 1) as $uX;
438                // restore
439                duo = duo.wrapping_add(div & mask);
440            }
441            // unpack
442            return ((duo & mask) | quo, duo >> shl);
443
444            // miscellanious binary long division algorithms that might be better for specific
445            // architectures
446
447            // Another kind of long division uses an interesting fact that `div` and `pow` can be
448            // negated when `duo` is negative to perform a "negated" division step that works in
449            // place of any normalization mechanism. This is a non-restoring division algorithm that
450            // is very similar to the non-restoring division algorithms that can be found on the
451            // internet, except there is only one test for `duo < 0`. The subtraction from `quo` can
452            // be viewed as shifting the least significant set bit right (e.x. if we enter a series
453            // of negated binary long division steps starting with `quo == 0b1011_0000` and
454            // `pow == 0b0000_1000`, `quo` will progress like this: 0b1010_1000, 0b1010_0100,
455            // 0b1010_0010, 0b1010_0001).
456            /*
457            let div_original = div;
458            let shl = $normalization_shift(duo, div, true);
459            let mut div: $uX = (div << shl);
460            let mut pow: $uX = 1 << shl;
461            let mut quo: $uX = pow;
462            duo = duo.wrapping_sub(div);
463            if duo < div_original {
464                return (quo, duo);
465            }
466            div >>= 1;
467            pow >>= 1;
468            loop {
469                if (duo as $iX) < 0 {
470                    // Negated binary long division step.
471                    duo = duo.wrapping_add(div);
472                    quo = quo.wrapping_sub(pow);
473                } else {
474                    // Normal long division step.
475                    if duo < div_original {
476                        return (quo, duo)
477                    }
478                    duo = duo.wrapping_sub(div);
479                    quo = quo.wrapping_add(pow);
480                }
481                pow >>= 1;
482                div >>= 1;
483            }
484            */
485
486            // This is the Nonrestoring SWAR algorithm, combining the nonrestoring algorithm with
487            // SWAR techniques that makes the only difference between steps be negation of `div`.
488            // If there was an architecture with an instruction that negated inputs to an adder
489            // based on conditionals, and in place shifting (or a three input addition operation
490            // that can have `duo` as two of the inputs to effectively shift it left by 1), then a
491            // single instruction central loop is possible. Microarchitectures often have inputs to
492            // their ALU that can invert the arguments and carry in of adders, but the architectures
493            // unfortunately do not have an instruction to dynamically invert this input based on
494            // conditionals.
495            /*
496            // SWAR opening
497            let div_original = div;
498            let mut shl = $normalization_shift(duo, div, true);
499            let mut div: $uX = (div << shl);
500            duo = duo.wrapping_sub(div);
501            let mut quo: $uX = 1 << shl;
502            if duo < div_original {
503                return (quo, duo);
504            }
505            let mask: $uX;
506            if (div as $iX) < 0 {
507                div >>= 1;
508                shl -= 1;
509                let tmp = 1 << shl;
510                let sub = duo.wrapping_sub(div);
511                if (sub as $iX) >= 0 {
512                    // restore
513                    duo = sub;
514                    quo |= tmp;
515                }
516                if duo < div_original {
517                    return (quo, duo);
518                }
519                mask = tmp - 1;
520            } else {
521                mask = quo - 1;
522            }
523
524            // central loop
525            let div: $uX = div.wrapping_sub(1);
526            let mut i = shl;
527            loop {
528                if i == 0 {
529                    break;
530                }
531                i -= 1;
532                // note: the `wrapping_shl(1)` can be factored out, but would require another
533                // restoring division step to prevent `(duo as $iX)` from overflowing
534                if (duo as $iX) < 0 {
535                    // Negated binary long division step.
536                    duo = duo.wrapping_shl(1).wrapping_add(div);
537                } else {
538                    // Normal long division step.
539                    duo = duo.wrapping_shl(1).wrapping_sub(div);
540                }
541            }
542            if (duo as $iX) < 0 {
543                // Restore. This was not needed in the original nonrestoring algorithm because of
544                // the `duo < div_original` checks.
545                duo = duo.wrapping_add(div);
546            }
547            // unpack
548            return ((duo & mask) | quo, duo >> shl);
549            */
550        }
551    };
552}