Revisiting 32-bit integer division on AVR 8-bit processors, I tried to code unsigned non-restoring division. Not thinking it looks too bad, there is one problem:
With a divisor with highest bit set, it does not readily result in a quotient 0 where necessary.
分解为8轨,然后是一些校准法:
#define bits __zero_reg__
#define R r25
#define D r22
#define NQ r24
ldi NQ, 25
ldi r22, 129
__udivmodqi4NR: ; 19 instructions NQ, R = divmod(NQ, D)
clr R
ldi bits, 8
lsl NQ
_udivmodqi4_pos:
rol R ; 1
sub R, D ; 2 diminish remainder
brcs _udivmodqi4_nep ; 4/3 remainder got < 0, was < divisor
sec ; 4 How to obviate this?
_udivmodqi4_ep:
rol NQ ; 5 rotate 1 quotient bit into dendQuot
dec bits ; 6 decrement loop counter
brne _udivmodqi4_pos ; 8/7
ret
_udivmodqi4_neg:
rol R ; 1
add R, D ; 2 increase remainder
brcs _udivmodqi4_ep ; 4/3 remainder restored to >= 0
_udivmodqi4_nep:
lsl NQ ; 4/5 shift 0 quotient bit into dendQuot
dec bits ; 5/6 decrement loop counter
brne _udivmodqi4_neg ; 7/6/8
add R, D
ret
sleep
; libgcc (or close enough), 12 instructions, worst case not significantly slower:
#define r_rem r25 /* remainder */
#define r_arg1 r24 /* dividend, quotient */
#define r_arg2 r22 /* divisor */
#define q_cnt r23 /* loop count */
__udivmodqi4: ; r24, r25 = divmod(r24, r22)
sub r_rem, r_rem ; clear remainder and carry
ldi q_cnt, 8 ; init loop counter
lsl r_arg1
__udivmodqi4_loop:
rol r_rem ; 1 shift dividend into remainder
cp r_rem, r_arg2 ; 2 compare remainder & divisor
brcs __udivmodqi4_ep ; 4/3 remainder <= divisor
sub r_rem, r_arg2 ; 4 restore remainder
__udivmodqi4_ep:
rol r_arg1 ; 5 shift dividend (with CARRY)
dec q_cnt ; 6 decrement loop counter
brne __udivmodqi4_loop;8/7
com r_arg1 ; 67 complement result
; because C flag was complemented in loop
ret
sleep
#define quotient __tmp_reg__
#define byteCnt __zero_reg__ /* loop count (0 after the loop!) */
#define dividend r_arg1HH
zeroOne:; 11(movw)/7(mov) + 2 instructions just to handle divisor MSB == 1
mov_l r_remL , r_arg1L ; remainder = dividend, part one
mov_h r_remH , r_arg1H
mov_l r_arg1L, r_remHL ; quotient = 0
mov_h r_arg1H, r_remHH
mov_l r_remHL , r_arg1HL ; part two
mov_h r_remHH , r_arg1HH
mov_l r_arg1HL, r_arg1L
mov_h r_arg1HH, r_arg1H
ldi byteCnt, 1 ; initialise loop control variables
ldi quotient, 0x80 ; for a single/final iteration
rjmp subtract
__udivmodsi4NonRestoring: ; arg2, arg1 = divmod(arg1, arg2)
clr r_remHH
clr r_remHL
cpi r_arg2HH, 128 ; sbrc r_arg2HH, 7 rjmp
brsh zeroOne ; how to avoid special casing divisor MSB set?
mov_l r_remL , r_remHL
mov_h r_remH , r_remHH
ldi byteCnt, 4
ldi quotient, 1 ; takes NBBY(8) rol instructions to appear as carry
_udivmodsi4_pos:
lsl dividend ; 1
rol r_remL ; 2 shift dividend into remainder
rol r_remH ; 3
rol r_remHL ; 4
rol r_remHH ; 5
subtract:
sub r_remL, r_arg2L ; 6 diminish remainder
sbc r_remH, r_arg2H ; 7
sbc r_remHL, r_arg2HL ; 8
sbc r_remHH, r_arg2HH ; 9
brcs _udivmodsi4_nep ; 11/10 remainder got < 0, was < divisor
sec ; 11
_udivmodsi4_ep:
rol quotient ; 12 rotate 1 into quotient
brcc _udivmodsi4_pos ; 14/13
dec byteCnt
breq bitsDone
byteLoop:
mov r_arg1HH, r_arg1HL ; shift by bytes
mov r_arg1HL, r_arg1H
mov r_arg1H , r_arg1L
mov r_arg1L , quotient
ldi quotient, 1
sbrc r_arg1L, 0
rjmp _udivmodsi4_pos
_udivmodsi4_neg:
lsl dividend ; 1
rol r_remL ; 2 shift dividend into remainder
rol r_remH ; 3
rol r_remHL ; 4
rol r_remHH ; 5
add r_remL, r_arg2L ; 6 raise remainder
adc r_remH, r_arg2H ; 7
adc r_remHL, r_arg2HL ; 8
adc r_remHH, r_arg2HH ; 9
brcs _udivmodsi4_ep ; 11/10 remainder restored to >= 0
_udivmodsi4_nep:
lsl quotient ; 12/11 shift 0 into quotient
brcc _udivmodsi4_neg ; 14/13/12
dec byteCnt
brne byteLoop
add r_remL , r_arg2L ; restore remainder
adc r_remH , r_arg2H
adc r_remHL, r_arg2HL
adc r_remHH, r_arg2HH
bitsDone:
mov r_arg2L , quotient
mov r_arg2H , r_arg1L
mov r_arg2HL, r_arg1H
mov r_arg2HH, r_arg1HL
mov_l r_arg1L , r_remL
mov_h r_arg1H , r_remH
mov_l r_arg1HL, r_remHL
mov_h r_arg1HH, r_remHH
ret ; < 500 cycles worst case
sleep
How can I keep the speed of non-restoring division without that many additional instructions for divisors with MSB 1? (As is, their number increases with operand size.)