// file kernel/n/x86-64/div_n2.S: O(n^2) division of natural integers
/*-----------------------------------------------------------------------+
 |  Copyright 2005-2006, Michel Quercia (michel.quercia@prepas.org)      |
 |                                                                       |
 |  This file is part of Numerix. Numerix is free software; you can      |
 |  redistribute it and/or modify it under the terms of the GNU Lesser   |
 |  General Public License as published by the Free Software Foundation; |
 |  either version 2.1 of the License, or (at your option) any later     |
 |  version.                                                             |
 |                                                                       |
 |  The Numerix Library is distributed in the hope that it will be       |
 |  useful, but WITHOUT ANY WARRANTY; without even the implied warranty  |
 |  of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU  |
 |  Lesser General Public License for more details.                      |
 |                                                                       |
 |  You should have received a copy of the GNU Lesser General Public     |
 |  License along with the GNU MP Library; see the file COPYING. If not, |
 |  write to the Free Software Foundation, Inc., 59 Temple Place -       |
 |  Suite 330, Boston, MA 02111-1307, USA.                               |
 +-----------------------------------------------------------------------+
 |                                                                       |
 |                         Division quadratique                          |
 |                                                                       |
 +-----------------------------------------------------------------------*/

                         # +-------------------------+
                         # |  Single-digit division  |
                         # +-------------------------+


# unsigned long xn(div_1)(chiffre *a, long la, unsigned long b, chiffre *c)
#
# input:
# a = natural number of length la >= 0
# b = long > 0
# c = natural number of length la, may be the same buffer as a
#
# output:
# c <- floor(a/b)
# returns a mod b
        
#ifdef assembly_sn_div_1
#undef L
#define L(x) .Lsn_div_1_##x
QUICKENTER(sn_div_1)

        # Register roles inside the loop:
        #   rdi = &a, r8 = &c, r9 = divisor b
        #   rcx = number of digits still to process
        #   rdx = running remainder (high half of each dividend)
        movq   %rcx,    %r8             # r8  <- &c (saved before rcx is reused)
        movq   %rsi,    %rcx            # rcx <- la
        movq   %rdx,    %r9             # r9  <- b
        xorl   %edx,    %edx            # remainder <- 0 (32-bit write clears all of rdx)
        testq  %rcx,    %rcx
        jz     L(done)                  # la = 0: nothing to divide, return 0
        ALIGN(8)

L(loop):
        movq   -8(%rdi,%rcx,8), %rax    # rdx:rax <- remainder:a[rcx-1]
        divq   %r9                      # rax <- quotient digit, rdx <- new remainder
        movq   %rax,   -8(%r8,%rcx,8)   # c[rcx-1] <- quotient digit
        decq   %rcx
        jnz    L(loop)                  # walk from the top digit down to a[0]

L(done):
        movq   %rdx,    %rax            # return the final remainder
        ret

#endif /* assembly_sn_div_1 */
        
# unsigned long xn(mod_1)(chiffre *a, long la, unsigned long b)
#
# input:
# a = natural number of length la >= 0
# b = long > 0
#
# output:
# returns a mod b
        
#ifdef assembly_sn_mod_1
#undef L
#define L(x) .Lsn_mod_1_##x
QUICKENTER(sn_mod_1)

        # Register roles inside the loop:
        #   rdi = &a, r9 = divisor b
        #   rcx = number of digits still to process
        #   rdx = running remainder (high half of each dividend)
        movq   %rsi,    %rcx            # rcx <- la
        movq   %rdx,    %r9             # r9  <- b
        xorl   %edx,    %edx            # remainder <- 0 (32-bit write clears all of rdx)
        testq  %rcx,    %rcx
        jz     L(done)                  # la = 0: nothing to divide, return 0
        ALIGN(8)

L(loop):
        movq   -8(%rdi,%rcx,8), %rax    # rdx:rax <- remainder:a[rcx-1]
        divq   %r9                      # only the remainder in rdx is kept
        decq   %rcx
        jnz    L(loop)                  # walk from the top digit down to a[0]

L(done):
        movq   %rdx,    %rax            # return the final remainder
        ret


#endif /* assembly_sn_mod_1 */
        

                         # +------------------------+
                         # |  Division quadratique  |
                         # +------------------------+

# input:
#   a = natural number of length la     rsi = &a, rdx = la-lb
#   b = natural number of length lb     rbx = &b, rcx = lb
#   c = natural number of length la-lb  rdi = &c
#
# constraints:
# two <= lb < la, the most significant bit of b is set,
# a < BASE^(la-lb)*b
# a, b, c are distinct buffers (not aliased)
#
# output:
# a <- a mod b
# c <- floor(a/b)
#
# clobbered registers: all


#ifdef assembly_sn_div_n2
#undef L
#define L(x) .Lsn_fdiv_n2_##x
        ALIGN(32)
#ifdef debug_div_n2
.Lsn_fdiv_n2_buggy:
#else
.Lsn_fdiv_n2:
#endif

	# Digit-by-digit division. Each pass of the main loop estimates one
	# quotient digit q from the two top digits of a and the top digit of b,
	# subtracts q*b from a, and then lowers q (adding b back) for as long as
	# the subtraction left a negative value.
	#
	# set up the registers
	movq   %rdi,   %r11             # r11 <- &c
	movq   %rdx,   %r12             # r12 <- la-lb (quotient digits left to produce)
        leaq -8(%rsi,%rdx,8), %rsi      # rsi <- &a[la-lb-1]
        leaq   (%rsi,%rcx,8), %rdi      # rdi <- &a[la-1]
        leaq   (%rbx,%rcx,8), %rsi      # rsi <- &b[lb]
	movq   (%rbx), %r14             # r14 <- b[0]
	movq -8(%rsi), %r15             # r15 <- b[lb-1]
        xorq   %rbx,   %rbx             # rbx <- 0 (used as a zero operand for carries)
        negq   %rcx
	incq   %rcx
	movq   %rcx,   %r10             # r10 <- 1-lb

	# precompute the entry point into the unrolled inner loop
	# (.Lsn_mul_sub_loop is defined outside this section; the 23 bytes per
	# step used below is presumably the size of one unrolled step -- TODO
	# confirm against that routine)
	leaq   .Lsn_mul_sub_loop(%rip), %r13
	andq   $7,     %rcx             # rcx <- (1-lb) mod 8
	leaq   (%rcx,%rcx,2), %rdx      # rdx <- 3*rcx
	leaq   (%r13,%rdx,8), %r13      # r13 += 24*rcx
	subq   %rcx,   %r13             # r13 -= rcx, net effect r13 += 23*rcx

	# perform the division digit by digit
        ALIGN(8)
L(loop):

        # compute the approximate quotient, too large by at most 2 units
        # q <- min(floor(a[la-1]:a[la-2]/b[lb-1]), BASE-1)
	movq   (%rdi),  %rdx            # rdx <- a[la-1]
	leaq -1(%rbx),  %rax            # rax <- BASE-1 (rbx is 0 here)
	cmpq    %r15,   %rdx            # top digits equal (jnb also takes >): q <- BASE-1
	leaq -8(%rdi),  %rdi            # rdi <- &a[la-2] (lea keeps the flags of cmpq)
	jnb    1f                       # skip divq, which would overflow in that case
	movq   (%rdi),  %rax            # rdx:rax <- a[la-1]:a[la-2]
	divq    %r15                    # divide by b[lb-1]
1:	
	movq    %rax,   %rbp            # q <- quotient (kept in rbp)

        # a <- a - q*b
	mulq   %r14                     # rdx:rax <- q*b[0]
	movq   %r10,    %rcx            # rcx <- -8*ceil((lb-1)/8)
	andq   $-8,     %rcx
	movq   %rax,    %r8             # initialise the carries with the low word
	movq   %rax,    %r9
        call   *%r13                    # a -= q*b (external unrolled loop; its register contract is not visible here)
	subq   %r9,    (%rdi)           # subtract the last product
	adcq   %rbx,    %rdx            # fold the borrow into the high word (rbx = 0)
	setc   %bl                      # bl <- CF; original note: leaves rbx = 0
	subq   %rdx,  8(%rdi)           # subtract the high word from the next digit up
        jnb    L(next)                  # no borrow: q is final

        # when < 0, decrement q and add b back
L(add):
        movq   %r10,    %rcx            # rcx <- 1-lb
        decq   %rbp                     # q--
        clc                             # start the addition chain with CF = 0
        ALIGN(8)
1:
        movq -8(%rsi,%rcx,8), %rax      # rax <- b[lb-1+rcx]
        adcq   %rax, (%rdi,%rcx,8)      # add it, with carry, into the matching digit of a
        incq   %rcx                     # incq leaves CF untouched
        jle    1b                       # loop while rcx <= 0, i.e. over the lb digits of b
        adcq   %rbx, 8(%rdi)            # propagate the last carry (rbx = 0)
        jnb    L(add)                   # no carry out: still negative, add b once more

        # end of the main loop
        ALIGN(8)
L(next):
        decq   %r12                     # la--
	movq   %rbp, (%r11,%r12,8)      # c[la-lb-1] <- q (movq keeps the flags of decq)
        jne    L(loop)                  # more quotient digits to produce
        ret
        
        
                              # +---------------+
                              # |  Interface C  |
                              # +---------------+
        

# void xn(div_n2)(chiffre *a, long lc, chiffre *b, long lb, chiffre *c)
#
# input:
# a = natural number of length lc+lb
# b = natural number of length lb
# c = natural number of length lc
#
# constraints:
# lb >= 2, lc > 0, the most significant bit of b is set,
# a < BASE^lc*b
# a, b, c are distinct buffers (not aliased)
#
# output:
# a <- a mod b
# c <- floor(a/b)

#ifdef debug_div_n2
ENTER(sn_div_n2_buggy)
#else
ENTER(sn_div_n2)
#endif

        # C entry point: move the C arguments into the register convention of
        # the internal routine (rsi = &a, rdx = la-lb, rbx = &b, rcx = lb,
        # rdi = &c). rcx already holds lb and is left alone. The order of the
        # moves matters: each source is read before a later move overwrites it.
        # NOTE(review): ENTER and RETURN_WITH_SP are macros defined elsewhere;
        # presumably they save and restore the callee-saved registers that the
        # internal routine clobbers -- confirm.
        movq   %rdx,    %rbx            # rbx <- &b
        movq   %rsi,    %rdx            # rdx <- la-lb (the C argument lc)
        movq   %rdi,    %rsi            # rsi <- &a
        movq   %r8,     %rdi            # rdi <- &c
#ifdef debug_div_n2
        call   .Lsn_fdiv_n2_buggy       # perform the division
#else
        call   .Lsn_fdiv_n2             # perform the division
#endif
        RETURN_WITH_SP
        
#endif /* assembly_sn_div_n2 */

        # case where the assembly version is disabled or being debugged:
        # sn_fdiv_n2 forwards to the C version
        
#if !defined(assembly_sn_div_n2) || defined(debug_div_n2)
        ALIGN(32)
.Lsn_fdiv_n2:

	# Trampoline: callers use the internal register convention (rsi = &a,
	# rdx = la-lb, rbx = &b, rcx = lb, rdi = &c). Move those values back
	# into C argument order (a, lc, b, lb, c) and jump to the C routine.
	# The order of the moves matters: each source is read before a later
	# move overwrites it.
	movq   %rdi,  %r8               # r8  <- &c
	movq   %rsi,  %rdi              # rdi <- &a
	movq   %rdx,  %rsi              # rsi <- la-lb (the C argument lc)
	movq   %rbx,  %rdx              # rdx <- &b
        jmp    SUBR(sn_div_n2)          # tail jump; rcx still holds lb
        
#endif /* !defined(assembly_sn_div_n2) || defined(debug_div_n2) */

