// file kernel/n/ppc32/toom.S: Toom multiplication of natural integers
/*-----------------------------------------------------------------------+
 |  Copyright 2005-2006, Michel Quercia (michel.quercia@prepas.org)      |
 |                                                                       |
 |  This file is part of Numerix. Numerix is free software; you can      |
 |  redistribute it and/or modify it under the terms of the GNU Lesser   |
 |  General Public License as published by the Free Software Foundation; |
 |  either version 2.1 of the License, or (at your option) any later     |
 |  version.                                                             |
 |                                                                       |
 |  The Numerix Library is distributed in the hope that it will be       |
 |  useful, but WITHOUT ANY WARRANTY; without even the implied warranty  |
 |  of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU  |
 |  Lesser General Public License for more details.                      |
 |                                                                       |
 |  You should have received a copy of the GNU Lesser General Public     |
 |  License along with the GNU MP Library; see the file COPYING. If not, |
 |  write to the Free Software Foundation, Inc., 59 Temple Place -       |
 |  Suite 330, Boston, MA 02111-1307, USA.                               |
 +-----------------------------------------------------------------------+
 |                                                                       |
 |                          Multiplication de Toom                       |
 |                                                                       |
 +-----------------------------------------------------------------------*/

#if defined(assembly_sn_toommul) || defined(assembly_sn_toomsqr)
        
                         ; +------------------------+
                         ; |  Addition/subtraction  |
                         ; +------------------------+

; chiffre xn(add_sub3)(chiffre *a, chiffre *b, long p, long q)
;
; Helper used by the Toom routines of this file: a is viewed as three
; slices a0 = a[0..p-1], a1 = a[p..2p-1], a2 = a[2p..2p+q-1], and both
; a0+a1+a2 and |a0-a1+a2| are produced in one call.
;
; input (r3 = a, r4 = b, r5 = p, r6 = q):
;   a = natural number of length 2p+q
;   b = natural number of length 2p+2
; constraints: 0 < q <= p
;
; output:
;   b[0..p]      <-  a[0..p-1] + a[p..2p-1] + a[2p..2p+q-1]
;   b[p+1..2p+1] <- |a[0..p-1] - a[p..2p-1] + a[2p..2p+q-1]|
;   r2           <- sign of a[0..p-1] - a[p..2p-1] + a[2p..2p+q-1]
;
; remark:
;   this function is not implemented in C
;
; Lsn_add, Lsn_asub and Lsn_inc are defined elsewhere. As called here they
; receive r3/r4 = first operand and its length, r5/r6 = second operand and
; its length and, for Lsn_add and Lsn_asub, r7 = destination.
; NOTE(review): the sign left in r2 is presumably produced by Lsn_asub;
; confirm against its definition.

#define L(x) Lsn_add_sub3_##x
Lsn_add_sub3:

	; register aliases for the locals of this routine
	#define _a_  r31
	#define _b_  r30
	#define _p_  r29
	#define _ra_ r28

	stmw   _ra_,   4(r1)	; save r28-r31 (the registers modified here) at 4(r1)
	mflr   _ra_		; ra <- return address; parameters are saved next
	mr     _b_,   r4
	mr     _p_,   r5

	; first call: r3 still holds &a0 and r6 still holds q
	slwi   r7,    r5,    2	; r7 <- 4p (byte size of one slice)
	add    _a_,   r3,   r7  ; a  <- &a1
	add    r5,    _a_,  r7  ; r5 <- &a2
	mr     r4,    _p_
	mr     r7,    _b_
	bl     Lsn_add		; b0 <- a0 + a2
	
	slwi   r7,    _p_,  2	; r7 <- &b0[p]
	add    r7,    _b_,  r7
	stw    r3,   0(r7)	; store the carry of a0 + a2 as digit b[p]
	addi   r7,    r7,    4	; r7 <- &b1 = &b[p+1]
	mr     r3,    _b_
	addi   r4,    _p_,   1
	mr     r5,    _a_	; r5 <- &a1
	mr     r6,    _p_
	bl     Lsn_asub		; b1 <- |a0-a1+a2|

	mr     r3,    _b_
	addi   r4,    _p_,   1
	mr     r5,    _a_	; r5 <- &a1
	mr     r6,    _p_
	mr     _a_,   r2	; keep r2 (the sign) in r31 across the next call
        bl     Lsn_inc          ; b0 <- a0+a1+a2
	mr     r2,    _a_	; restore r2 so the caller receives the sign

	mtlr   _ra_
	lmw    _ra_, 4(r1)	; restore r28-r31
	blr

	; note: _r_ is not defined by this routine; undefining it is harmless
	#undef _a_
	#undef _b_
	#undef _p_
	#undef _r_
	#undef _ra_

#undef L
        
        
                        ; +-----------------------+
                        ; |  Addition with shift  |
                        ; +-----------------------+

; void xn(add_base)(chiffre *a, chiffre *b, long p, long q)
;
; Helper used by the Toom routines of this file: a is viewed as three
; slices a0 = a[0..p-1], a1 = a[p..2p-1], a2 = a[2p..2p+q-1], and the
; slices are added with a shift of one digit between each of them.
;
; input (r3 = a, r4 = b, r5 = p, r6 = q):
;   a = natural number of length 2p+q
;   b = natural number of length p+3
; constraints: 0 < q <= p, p > 2
;
; output:
;   b <-  a[0..p-1] + BASE*a[p..2p-1] + BASE^2*a[2p..2p+q-1]
;
; remark:
;   this function is not implemented in C
;
; NOTE(review): the code below also stores a zero in b[p+3], which is one
; digit beyond the p+3 digits documented above; callers must leave room
; for it or overwrite it afterwards.

#define L(x) Lsn_add_base_##x
Lsn_add_base:

	; register aliases for the locals of this routine
	#define _a_  r31
	#define _b_  r30
	#define _p_  r29
	#define _q_  r28
	#define _ra_ r27

	stmw   _ra_,   4(r1)	; save r27-r31 (the registers modified here) at 4(r1)
	mflr   _ra_		; ra <- return address; parameters are saved next
	mr     _b_,   r4
	mr     _p_,   r5
	mr     _q_,   r6

        ; b <- a0 + BASE*a1
        ; the lowest digit of a0 is copied as is, the remaining p-1 digits
        ; of a0 are added to a1
	lwz   r7,    0(r3)	; b[0] <- a[0]
	stw   r7,    0(r4)
	addi  r7,    r4,    4	; r7 <- &b[1]
	slwi  r8,    _p_,   2	; r8 <- 4p
	addi  r5,    r3,    4	; r5 <- &a[1]
	add   r3,    r3,   r8	; r3 <- &a1
	add   _a_,   r3,   r8	; a  <- &a2
	mr    r4,    _p_
	subi  r6,    _p_,   1
	bl    Lsn_add		; b[1..p] <- a1[0..p-1] + a0[1..p-1], carry in r3

	; b <- b + BASE^2*a2
	slwi  r7,   _p_,    2	; r7 <- &b[p]
	add   r7,   _b_,   r7
	stw   r3,   4(r7)	; b[p+1] <- carry of a0 + BASE*a1
	li    r3,   0
	stw   r3,   8(r7)	; b[p+2] <- 0
	stw   r3,  12(r7)	; b[p+3] <- 0
	addi  r3,   _b_,    8	; r3 <- &b[2]
	addi  r4,   _p_,    1
	mr    r5,   _a_
	mr    r6,   _q_
	bl    Lsn_inc		; b[2..p+2] += a2

	mtlr   _ra_
	lmw    _ra_, 4(r1)	; restore r27-r31
	blr

	#undef _a_
	#undef _b_
	#undef _p_
	#undef _q_
	#undef _ra_
        
#undef L
#endif /* defined(assembly_sn_toommul) || defined(assembly_sn_toomsqr) */

                            ; +------------------+
                            ; |  Multiplication  |
                            ; +------------------+
        
;  void xn(toommul)(chiffre *a, long la, chiffre *b, long lb, chiffre *c)
;
;  input (r3 = a, r4 = la, r5 = b, r6 = lb, r7 = c):
;  a = natural number of length la
;  b = natural number of length lb
;  c = natural number of length la+lb, not overlapping a or b
;  constraints: 0 < lb <= la
;
;  output:
;  c <- a*b
;
;  method, with p = ceil(la/3):
;  - lb <= toommul_lim : jump to Lsn_karamul
;  - lb <= 2p          : a is cut into slices of lb digits (slicing path)
;  - otherwise         : a = a0:a1:a2 and b = b0:b1:b2 are cut into slices
;    of p, p, q = la-2p and p, p, r = lb-2p digits, and the product
;    c0 + X*c1 + X^2*c2 + X^3*c3 + X^4*c4 is rebuilt from five recursive
;    products: d (X = 1), e (X = -1, absolute value), f (X = BASE),
;    c0 = a0*b0 and c4 = a2*b2. The recombination is done at Lsn_toom_aux.
;
;  stack frame of the three-way path (after the frame is pushed):
;    32(r1)      saved return address
;    36(r1)      d, 2p+1 digits
;    40+8p(r1)   e, 2p+1 digits
;    44+16p(r1)  f
;  4(r1)..31(r1) of each frame is left free for the stmw done by callees.
        
#ifdef assembly_sn_toommul
#ifdef debug_toommul
.globl _sn_toommul_buggy
_sn_toommul_buggy:
#else
.globl _sn_toommul
_sn_toommul:
Lsn_toommul:
#endif

#define L(x) Lsn_toommul_##x

        ; small multiplication => Karatsuba algorithm
	cmpwi  cr0,   r6,   toommul_lim
	ble    Lsn_karamul

	li     r8,   3
	addi   r9,   r4,   2
	divwu  r8,   r9,   r8	; r8  <- p = ceil(la/3)
	add    r9,   r8,   r8	; r9  <- 2p
	subf.  r10,  r9,   r6	; r10 <- r = lb - 2p
	ble    L(tranches)	; if lb <= 2p, cut a into slices

	; here lb > 2*ceil(la/3) : split into three parts
        ; local variables
	#define _d_  36(r1)
        #define _a_  r31
        #define _b_  r30
        #define _c_  r29
        #define _p_  r28
        #define _q_  r27
        #define _r_  r26
        #define _x_  r25
	#define _ra_ 32(r1)

	stmw   r25,  4(r1)	; save r25-r31 at 4(r1), before pushing the frame
	mulli  r11,  r9,   12	; reserve 6p+10 digits + lr + stack frame
	addi   r11,  r11,  88
	clrrwi r11,  r11,  4	; round down to a multiple of 16 bytes
	neg    r11,  r11
	stwux  r1,   r1,  r11	; push the frame, back chain stored at 0(r1)
	mflr   r0
	stw    r0,   _ra_

	mr     _a_,  r3		; save the parameters
	mr     _b_,  r5
	mr     _c_,  r7
	addi   _p_,  r8,  1	; p <- p+1 (the register holds p+1 from now on)
	subf   _q_,  r9,  r4	; q <- la - 2p
	mr     _r_,  r10	; r <- lb - 2p

        ; c[0..p] <- a0 + a1 + a2, c[p+1..2p+1] <- |a0 - a1 + a2|
        ; (r3 still holds a, hence the commented-out move)
/*	mr     r3,   _a_ */
	mr     r4,   _c_
	subi   r5,   _p_,  1	; r5 <- p
	mr     r6,   _q_
	bl     Lsn_add_sub3
        mr     _x_,  r2		; x <- sign of a0 - a1 + a2

        ; c[2p+2..3p+2] <- b0 + b1 + b2, c[3p+3..4p+3] <- |b0 - b1 + b2|
	mr     r3,   _b_
	slwi   r4,   _p_,   3	; r4 <- &c[2p+2]
	add    r4,   r4,   _c_
	subi   r5,   _p_,  1	; r5 <- p
	mr     r6,   _r_
	bl     Lsn_add_sub3
	xor    _x_,  _x_,  r2   ; x <- sgn((a0-a1+a2)*(b0-b1+b2))

        ; d <- (a0 + a1 + a2)(b0 + b1 + b2) = c0 + c1 + c2 + c3 + c4
        ; default operands: r3/r4 = b-sum with p+1 digits, r5/r6 = a-sum
        ; with its top digit dropped when it is zero
	mr     r5,   _c_
	mr     r4,   _p_	; r4 <- p+1
	slwi   r8,   _p_,   2	; r8 <- 4p+4
	add    r3,   r8,   _c_	; r3 <- &c[p+1]
	lwz    r7,   -4(r3)	; r7 <- c[p], top digit of a0+a1+a2
	addic  r7,   r7,   -1	; CA <- 1 if that digit is not zero
	addme  r6,   _p_	; r6 <- p + CA = lg(a0+a1+a2)
	add    r3,   r8,   r3	; r3 <- &c[2p+2]
	subi   r8,   r8,   4	; r8 <- 4p
	lwzx   r2,   r3,   r8	; r2 <- c[3p+2] : b0+b1+b2 >= BASE^p ?
	and.   r2,   r2,   r2
	la     r7,   _d_	; r7 <- &d (destination)
	bne    1f
	mr     r5,   r3		; otherwise shorten lg(b0+b1+b2) to p and swap
	mr     r3,   _c_	; the operands, so that the longer one is first
	mr     r4,   r6
	subi   r6,   _p_,   1
	slwi   r8,   r8,    1	; d[2p] <- 0 (r2 is zero on this path)
	stwx   r2,   r7,   r8
1:
        bl     Lsn_toommul

        ; e <- |a0 - a1 + a2|*|b0 - b1 + b2| = |c0 - c1 + c2 - c3 + c4|
        ; same operand selection as for d, on the absolute differences
	mr     r4,   _p_	; r4 <- p+1
	slwi   r8,   _p_,   2	; r8 <- 4p+4
	add    r5,   _c_,  r8	; r5 <- &c[p+1]
	add    r3,   r5,   r8	; r3 <- &c[2p+2]
	lwz    r7,   -4(r3)	; r7 <- c[2p+1], top digit of |a0-a1+a2|
	addic  r7,   r7,   -1	; CA <- 1 if that digit is not zero
	addme  r6,   _p_	; r6 <- p + CA = lg(|a0-a1+a2|)
	add    r3,   r8,   r3	; r3 <- &c[3p+3]
	subi   r9,   r8,   4	; r9 <- 4p
	lwzx   r2,   r3,   r9	; r2 <- c[4p+3] : |b0-b1+b2| >= BASE^p ?
	and.   r2,   r2,   r2
	slwi   r9,   r9,   1	; r9 <- 8p
	la     r7,   40(r1)	; r7 <- &e = 40(r1) + 8p
	add    r7,   r7,   r9
	bne    1f
	mr     r5,   r3		; otherwise shorten lg(|b0-b1+b2|) to p and swap
	add    r3,   _c_,  r8	; the operands (r3 <- &c[p+1])
	mr     r4,   r6
	subi   r6,   _p_,   1
	stwx   r2,   r7,   r9	; e[2p] <- 0 (r2 is zero on this path)
1:
        bl     Lsn_toommul
	
        ; c[0..p+2] <- a0 + BASE*a1 + BASE^2*a2
	mr     r3,   _a_
	mr     r4,   _c_
	subi   r5,   _p_,   1	; r5 <- p
	mr     r6,   _q_
        bl     Lsn_add_base
        
        ; c[p+3..2p+5] <- b0 + BASE*b1 + BASE^2*b2
	mr     r3,   _b_
	slwi   r4,   _p_,   2	; r4 <- &c[p+3]
	addi   r4,   r4,    8
	add    r4,   r4,   _c_
	subi   r5,   _p_,   1	; r5 <- p
	mr     r6,   _r_
        bl     Lsn_add_base

        ; f <- (a0 + BASE*a1 + BASE^2*a2)*(b0 + BASE*b1 + BASE^2*b2)
        ;    = c0 + BASE*c1 + BASE^2*c2 + BASE^3*c3 + BASE^4*c4
        ; each length is p+1 + (top digit) + (1 if one of the two top
        ; digits is not zero); this relies on the top digit being 0 or 1
	mr     r5,   _c_
	slwi   r7,   _p_,   2	; r7 <- 4p+12
	addi   r7,   r7,    8
	add    r3,   r7,   _c_	; r3 <- &c[p+3]
	lwz    r8,  -8(r3)	; r8 <- c[p+1], r6 <- c[p+2]
	lwz    r6,  -4(r3)
	or     r8,   r8,   r6
	addic  r8,   r8,  -1	; CA <- 1 if c[p+1] or c[p+2] is not zero
	adde   r6,   _p_,  r6	; r6 <- lg(a0+B*a1+B^2*a2)
	add    r4,   r3,   r7	; r4 <- &c[2p+6]
	lwz    r8,  -8(r4)	; r8 <- c[2p+4], r4 <- c[2p+5]
	lwz    r4,  -4(r4)
	or     r8,   r8,   r4
	addic  r8,   r8,  -1	; CA <- 1 if c[2p+4] or c[2p+5] is not zero
	adde   r4,   _p_,  r4	; r4 <- lg(b0+B*b1+B^2*b2)
	cmpl   cr0,  r6,   r4	; if r4 < r6, swap the operands
	ble    1f
	mr     r8,   r4
	mr     r4,   r6
	mr     r6,   r8
	mr     r5,   r3
	mr     r3,   _c_
1:
	la     r7,   _d_	; r7 <- &f = &d + 16p + 8
	slwi   r8,   _p_,  4
	add    r7,   r7,  r8
	subi   r7,   r7,   8
	bl     Lsn_toommul

        ; c[0..2p-1] <- a0*b0 = c0
	mr     r3,   _a_
	subi   r4,   _p_,  1	; r4 <- p
	mr     r5,   _b_
	subi   r6,   _p_,  1	; r6 <- p
	mr     r7,   _c_
	bl     Lsn_toommul

        ; c[4*p..4p+q+r-1] <- a2*b2 = c4
	subi   r8,   _p_,  1	; r8 <- 8p
	slwi   r8,   r8,   3
	add    r3,   _a_,  r8	; r3 <- &a2
	add    r5,   _b_,  r8	; r5 <- &b2
	add    r7,   _c_,  r8	; r7 <- &c4
	add    r7,   r7,   r8
	mr     r4,   _q_
	mr     r6,   _r_
	bl     Lsn_toommul
	add    _r_,  _q_,  _r_	; r <- q+r

        ; entry point shared with toomsqr (toomsqr jumps here)
        ;
        ; state expected on entry, as set up by both callers:
        ;   c (r29) = result, with c[0..2p-1] = c0 and c[4p..] = c4
        ;   p (r28) = p+1, q (r27) = q, r (r26) = lg(c4) = q+r
        ;   x (r25) = not zero if the product stored in e stands for a
        ;             negative value (always zero for a square)
        ;   d, e, f = the three evaluated products, in the stack frame
        ; a (r31) and b (r30) are reused below as scratch registers.
Lsn_toom_aux:

	subi   _p_,  _p_,  1	; p <- 2p
	slwi   _p_,  _p_,  1
	slwi   _b_,  _p_,  2	; b <- 8p (byte size of 2p digits)

        ; c[2p..4p] <- (d+e)/2 = c0 + c2 + c4, d <- (d-e)/2 = c1 + c3
        ; the result takes 2p+1 digits and so overwrites c[4p], the lowest
        ; digit of c4, which is kept in x meanwhile
	la     r3,   _d_
	la     r6,   _d_
	add    r4,   r3,   _b_	; r4 <- &e
	addi   r4,   r4,    4
	add    r5,   _c_,  _b_	; r5 <- &c[2p]
	and.   _x_,  _x_,  _x_	; if e < 0, swap the two destinations
	lwzx   _x_,  r5,   _b_	; save c[4p] in x (lwzx leaves cr0 untouched)
	beq    1f
	mr     r6,   r5
	la     r5,   _d_
1:
	addi   r7,   _p_,  1	; r7 <- 2p+1
	bl     Lsn_half_add_sub

        ; c[2p..4p] <- c[2p..4p] - c0 - c4 = c2
	mr     r5,   _c_
	mr     r4,   _p_	; r4 <- 2p
	mr     r6,   _p_	; r6 <- 2p
	add    r3,   _c_,  _b_	; r3 <- &c[2p]
	lwzx   r7,   r3,   _b_	; restore c[4p]
	stwx   _x_,  r3,   _b_
	mr     _x_,  r7		; x <- last digit of (d+e)/2
	bl     Lsn_dec		; c[2p..4p-1] -= c0
	subf   _x_,  r3,   _x_	; x -= borrow
	add    r3,   _c_,  _b_	; r3 <- &c[2p]
	add    r5,   r3,   _b_	; r5 <- &c[4p]
	mr     r4,   _p_	; r4 <- 2p
	mr     r6,   _r_	; r6 <- q+r
	bl     Lsn_dec		; c[2p..4p-1] -= c4
	subf   _x_,  r3,   _x_	; x -= borrow

        ; f <- -f + c0 + BASE^2*c2 + BASE^4*c4 = -BASE*c1 - BASE^3*c3
        ;
        ; remark 1: f has 2p+6 digits but it is only used to compute
        ; BASE*c3, which fits in p+q+2 digits -> the digits of rank 2p+2
        ; to 2p+5 can be ignored (they may not even have been computed)
        ;
        ; remark 2: f and c0 have the same units digit, so the subtraction
        ; can start at rank 1. There is no need to force the first digit
        ; to zero either, it will not be used
	la     r5,   _d_	; r5 <- &f[1] = &d + 16p + 12
	add    r5,   r5,   _b_
	add    r5,   r5,   _b_
	addi   r5,   r5,   12
	mr     _a_,  r5		; a  <- &f[1]
	mr     r7,   r5		; r7 <- &f[1]
	subi   r4,   _p_,  1	; r4 <- 2p-1
	addi   r3,   _c_,  4	; r3 <- &c0[1]
	subi   r6,   _p_,  1	; r6 <- 2p-1
	bl     Lsn_sub		; f <- c0 - f
        ; NOTE(review): when assembly_sn_sub is defined, r12 and CA are
        ; used below without being set here; presumably the assembly
        ; version of Lsn_sub leaves them in that state - confirm.
        ; Otherwise they are rebuilt from the borrow returned in r3.
#ifndef assembly_sn_sub
	add    r12,  _a_,  _b_	; r12 <- &f[2p]
	subi   r12,  r12,  4
	li     r4,   0
	subfc  r4,   r3,   r4	; CA <- 1 - borrow
#endif
	lwz    r3, 0(r12)	; propagate the borrow through the last two
	subfze r3,  r3		; digits: each becomes 0 - digit - borrow
	stw    r3, 0(r12)
	lwz    r3, 4(r12)
	subfze r3,  r3
	stw    r3, 4(r12)

	addi   r3,   _a_,  4	; r3 <- &f[2]
	mr     r4,   _p_
	add    r5,   _c_,  _b_	; r5 <- &c2
	mr     r6,   _p_
	bl     Lsn_inc		; f += BASE^2*c2
	addi   r3,   _a_,  12	; r3 <- &f[4]
        ; r4 = 2p-2 could be used here since only the first 2p+2 digits
        ; of f are wanted, but sn_inc would fail when q+r > 2p-2.
	mr     r4,   _p_	; r4 <- 2p
	add    r5,   _c_,  _b_	; r5 <- &c4
	add    r5,   r5,   _b_
	mr     r6,   _r_	; r6 <- q+r
	bl     Lsn_inc		; f += BASE^4*c4

        ; f <- f + BASE*d = BASE*(1 - BASE^2)*c3
	mr     r3,   _a_	; r3 <- &f[1]
	addi   r4,   _p_,   1	; r4 <- 2p+1
	la     r5,   _d_
	addi   r6,   _p_,   1	; r6 <- 2p+1
	bl     Lsn_inc		; f += BASE*d

        ; f <- f/(1 - BASE^2) = BASE*c3
        ; done in place: source and destination overlap with a shift of
        ; two digits
	addi   r3,   _a_,  8	; r3 <- &f[3]
	subi   r4,   _p_,  1	; r4 <- 2p-1
	mr     r5,   _a_	; r5 <- &f[1]
	subi   r6,   _p_,  1	; r6 <- 2p-1
	bl     Lsn_inc		; divide by 1 - BASE^2

	; e[2..2p+1] <- d - f/BASE = c1
	la     r3,   _d_
	mr     r5,   _a_	; r5 <- &f[1]
	subf   r7,   _b_,  _a_	; r7 <- &e[2]
	mr     r4,   _p_	; r4 <- 2p
	mr     r6,   _p_	; r6 <- 2p
	bl     Lsn_sub
        ; NOTE(review): as above, when assembly_sn_sub is defined r10, r11,
        ; r12 and CA are presumably left by the assembly Lsn_sub - confirm.
#ifndef assembly_sn_sub
	mr     r12,  _a_	; r12 <- &f[1]
	add    r11,  _a_,  _b_	; r11 <- &f[2p+1]
	la     r10,  _d_	; r10 <- &d[2p]
	add    r10,  r10,  _b_
	li     r4,   0
	subfc  r4,   r3,   r4	; CA <- 1 - borrow
#endif
	lwz    r3, 0(r10)	; last digit: d[2p] - f[2p+1] - borrow
	lwz    r4, 0(r11)
	subfe  r3,   r4,   r3
	lwz    r4, 0(r12)	; add it to f[1..2p+1]
	addc   r3,   r3,   r4
	stw    r3, 0(r12)
	subfe. r3,   r3,   r3	; r3 <- CA - 1 : zero only if a carry came out
	bne    2f		; no carry => done
1:
	lwz    r4, 4(r12)	; propagate the carry to the next digit
	addze. r4,   r4
	stwu   r4, 4(r12)
	beq    1b		; the digit wrapped to zero => carry goes on
2:

	; f[p+1..2p+1] += x
	add    r12,  _a_,  _p_	; r12 <- &f[p+1] (p holds 2p, added twice)
	add    r12,  r12,  _p_
	lwz    r3, 0(r12)	; add x to f[p+1..2p+1]
	addc   r3,   r3,   _x_
	stw    r3, 0(r12)
	subfe. r3,   r3,   r3	; r3 <- CA - 1 : zero only if a carry came out
	bne    2f		; no carry => done
1:
	lwz    r4, 4(r12)	; propagate the carry to the next digit
	addze. r4,   r4
	stwu   r4, 4(r12)
	beq    1b		; the digit wrapped to zero => carry goes on
2:

        ; c[p..4p+q+r-1] += c3:c1
        ; c1 sits in e[2..2p+1] and c3 follows it from f[1] on, so both
        ; are added by a single call
	add    r3,  _c_,  _p_	; r3 <- &c[p] (p holds 2p, added twice)
	add    r3,  r3,   _p_
	subf   r5,  _b_, _a_	; r5 <- &e[2]
	srwi   r4,  _p_,  1	; r4 <- 3p+q+r, r6 <- 3p+q+1
	add    r4,  r4,  _p_
	add    r6,  r4,  _q_
	addi   r6,  r6,   1
	add    r4,  r4,  _r_
	bl     Lsn_inc
	
        ; done
	lwz   r0,    _ra_	; fetch the return address
	mtlr  r0
	lwz   r1,    0(r1)	; pop the frame through the back chain
	lmw   r25,   4(r1)	; restore r25-r31
	blr

	#undef _d_
        #undef _a_
        #undef _b_
        #undef _c_
        #undef _p_
        #undef _q_
        #undef _r_
        #undef _x_
	#undef _ra_

        ; here lb <= 2*ceil(la/3) : a is cut into slices of lb digits
        ; The code below is copied verbatim in karamul.
        ; Remember to carry over any update!
        ;
        ; on entry r3 = a, r4 = la, r5 = b, r6 = lb, r7 = c, unchanged
        ; since the entry of toommul. A first slice of l = la mod lb
        ; digits (lb digits when the remainder is zero) is multiplied by
        ; b, then each further slice of lb digits is multiplied by b and
        ; accumulated into c. The frame holds d, a buffer of lb digits
        ; starting at 32(r1).
L(tranches):

        # variables locales
        #define _d_  32(r1)
        #define _a_  r31
        #define _b_  r30
        #define _c_  r29
        #define _la_ r28
        #define _lb_ r27
        #define _ra_ r26

	stmw   r26, 4(r1)	; save r26-r31 at 4(r1), before pushing the frame
	mflr   _ra_		; ra <- return address

	slwi   r8,   r6,   2	; reserve lb digits + stack frame, rounded
	addi   r8,   r8,  44	; down to a multiple of 16 bytes
	clrrwi r8,   r8,   4
	neg    r8,   r8
	stwux  r1,   r1,  r8	; push the frame, back chain stored at 0(r1)
	mr     _a_,  r3
	mr     _b_,  r5
	mr     _c_,  r7
	mr     _la_, r4
	mr     _lb_, r6

        ; first multiplication : c <- a[0..(la % lb)-1]*b
        ; b is passed first because it is the longer operand here
	mr     r4,   _lb_
	divwu  r8,   _la_, _lb_	; r6 <- l = la % lb
	mullw  r8,   r8,   _lb_
	subf.  r6,   r8,   _la_
	bne    1f		; if la is a multiple of lb, r6 <- lb
	mr     r6,   _lb_
1:
	mr     r3,   _b_
	mr     r5,   _a_
        ; (r7 still holds c, hence the commented-out move)
/*	mr     r7,   _c_ */
	slwi   r8,   r6,   2
	add    _a_,  _a_,  r8	; a  += l
	add    _c_,  _c_,  r8	; c  += l
	subf   _la_, r6,   _la_	; la -= l
	bl     Lsn_toommul

        ; following multiplications
        ; NOTE(review): the loop body runs before la is tested, so la must
        ; still be at least lb here; presumably guaranteed by lb <= 2p
        ; together with lb > toommul_lim - confirm.
L(loop):
	mr     r4,   _lb_
	slwi   r8,   _lb_,  2	; r8 <- 4*lb
	mr     r3,   _a_
	add    _a_,  _a_,  r8	; a += lb
	mr     r5,   _b_
	mr     r6,   _lb_
	mr     r7,   _c_
	subi   r10,  _c_,   4	; r10 <- &c[-1]
	la     r11,  28(r1)	; r11 <- &d[-1]
	mtctr  r4		; ctr <- lb
1:
	lwzu   r8,   4(r10)	; d <- c[0..lb-1], the high part of the previous
	stwu   r8,   4(r11)	; product, which the next product overwrites
	bdnz   1b
	bl     Lsn_toommul	;  c[0..2lb-1] <- a[0..lb-1]*b

	mr     r3,   _c_
	la     r5,   _d_
	mr     r6,   _lb_
	slwi   r4,   r6,   1	; r4 <- 2*lb
	add    _c_,  _c_,  r4	; c += lb (4*lb bytes, added in two steps)
	add    _c_,  _c_,  r4
	bl     Lsn_inc		; c <- c + d
	subf.  _la_, _lb_, _la_ ;  la -= lb
        bne    L(loop)

        ; done
	mtlr  _ra_
	lwz   r1,    0(r1)	; pop the frame through the back chain
	lmw   r26,   4(r1)	; restore r26-r31
	blr

        #undef  _a_
        #undef  _b_
        #undef  _c_
        #undef  _d_
        #undef  _la_
        #undef  _lb_
	#undef  _ra_

#undef L
#endif /* assembly_sn_toommul */
#if !defined(assembly_sn_toommul) || defined(debug_toommul)
REPLACE(sn_toommul)
#endif

                                 ; +----------+
                                 ; |  Square  |
                                 ; +----------+

; input:
;   a = natural number of length la     r3 = &a, r4 = la
;   c = natural number of length 2*la   r5 = &c
; constraints: 0 < la
;
; output:
;   c <- a^2
;
; NOTE(review): the original header named x86-64 registers (rsi, rdx,
; rdi) and listed all registers as modified; the code below reads a, la
; and c from r3, r4 and r5, and r25-r31 are saved here and restored at
; the end of Lsn_toom_aux.
;
; method: same scheme as toommul with b = a. With p = ceil(la/3) and
; q = la - 2p, a is cut into a0:a1:a2 of p, p and q digits; the squares
; d, e, f, c0 and c4 are computed here, then control jumps to
; Lsn_toom_aux, which rebuilds c and returns to the caller.
; Lsn_toom_aux is only assembled when assembly_sn_toommul is defined.


#ifdef assembly_sn_toomsqr
#ifdef debug_toommul
.globl _sn_toomsqr_buggy
_sn_toomsqr_buggy:
#else
.globl _sn_toomsqr
_sn_toomsqr:
Lsn_toomsqr:
#endif

#define L(x) Lsn_toomsqr_##x

        ; small square => Karatsuba algorithm
	cmpwi  cr0,   r4,   toomsqr_lim
	ble    Lsn_karasqr

	li     r8,   3
	addi   r9,   r4,   2
	divwu  r8,   r9,   r8	; r8  <- p = ceil(la/3)
	add    r9,   r8,   r8	; r9  <- 2p

        ; local variables (same frame layout as in toommul)
	#define _d_  36(r1)
        #define _a_  r31
        #define _b_  r30
        #define _c_  r29
        #define _p_  r28
        #define _q_  r27
        #define _r_  r26
        #define _x_  r25
	#define _ra_ 32(r1)

	stmw   r25,  4(r1)	; save r25-r31 at 4(r1), before pushing the frame
	mulli  r11,  r9,   12	; reserve 6p+10 digits + lr + stack frame
	addi   r11,  r11,  88
	clrrwi r11,  r11,  4	; round down to a multiple of 16 bytes
	neg    r11,  r11
	stwux  r1,   r1,  r11	; push the frame, back chain stored at 0(r1)
	mflr   r0
	stw    r0,   _ra_

	mr     _a_,  r3		; save the parameters
	mr     _c_,  r5
	addi   _p_,  r8,  1	; p <- p+1 (the register holds p+1 from now on)
	subf   _q_,  r9,  r4	; q <- la - 2p
	slwi   _r_,  _q_, 1	; r <- 2q = lg(c4), as Lsn_toom_aux expects
	slwi   r8,   r8,  3	; r8 <- 8p
	la     r7,   40(r1)
	add    r7,   r7,   r8	; r7 <- &e
	li     r4,   0		; d[2p] <- 0, e[2p] <- 0
	stw    r4, -4(r7)
	stwx   r4,  r7,   r8

        ; c[0..p] <- a0 + a1 + a2, c[p+1..2p+1] <- |a0 - a1 + a2|
        ; (r3 still holds a, hence the commented-out move)
/*	mr     r3,   _a_ */
	mr     r4,   _c_
	subi   r5,   _p_,  1	; r5 <- p
	mr     r6,   _q_
	bl     Lsn_add_sub3
        li     _x_,  0		; x <- 0 : a square is never negative

        ; d <- (a0 + a1 + a2)^2 = c0 + c1 + c2 + c3 + c4
	mr     r3,   _c_
	slwi   r8,   _p_,   2	; r8 <- 4p+4
	add    r5,   r8,   _c_	; r5 <- &c[p+1]
	lwz    r7,   -4(r5)	; r7 <- c[p], top digit of a0+a1+a2
	addic  r7,   r7,   -1	; CA <- 1 if that digit is not zero
	addme  r4,   _p_	; r4 <- p + CA = lg(a0+a1+a2)
	la     r5,   _d_
        bl     Lsn_toomsqr

        ; e <- |a0 - a1 + a2|^2 = c0 - c1 + c2 - c3 + c4
	slwi   r8,   _p_,   2	; r8 <- 4p+4
	add    r3,   _c_,  r8	; r3 <- &c[p+1]
	subi   r8,   r8,    4	; r8 <- 4p
	lwzx   r7,   r3,   r8	; r7 <- c[2p+1], top digit of |a0-a1+a2|
	addic  r7,   r7,   -1	; CA <- 1 if that digit is not zero
	addme  r4,   _p_	; r4 <- p + CA = lg(|a0-a1+a2|)
	la     r5,   40(r1)	; r5 <- &e = 40(r1) + 8p
	add    r5,   r5,   r8
	add    r5,   r5,   r8
        bl     Lsn_toomsqr
	
        ; c[0..p+2] <- a0 + BASE*a1 + BASE^2*a2
	mr     r3,   _a_
	mr     r4,   _c_
	subi   r5,   _p_,   1	; r5 <- p
	mr     r6,   _q_
        bl     Lsn_add_base
        
        ; f <- (a0 + BASE*a1 + BASE^2*a2)^2
        ;    = c0 + BASE*c1 + BASE^2*c2 + BASE^3*c3 + BASE^4*c4
        ; the length is p+1 + (top digit) + (1 if one of the two top
        ; digits is not zero); this relies on the top digit being 0 or 1
	mr     r3,   _c_
	slwi   r7,   _p_,   2	; r7 <- 4p+4
	add    r5,   r7,   _c_	; r5 <- &c[p+1]
	lwz    r8, 0(r5)	; r8 <- c[p+1], r4 <- c[p+2]
	lwz    r4, 4(r5)
	or     r8,   r8,   r4
	addic  r8,   r8,  -1	; CA <- 1 if c[p+1] or c[p+2] is not zero
	adde   r4,   _p_,  r4	; r4 <- lg(a0+B*a1+B^2*a2)
	la     r5,   _d_	; r5 <- &f = &d + 16p + 8
	slwi   r8,   _p_,  4
	add    r5,   r5,  r8
	subi   r5,   r5,   8
	bl     Lsn_toomsqr

        ; c[0..2p-1] <- a0^2 = c0
	mr     r3,   _a_
	subi   r4,   _p_,  1	; r4 <- p
	mr     r5,   _c_
	bl     Lsn_toomsqr

        ; c[4*p..4p+2q-1] <- a2^2 = c4
	subi   r8,   _p_,  1	; r8 <- 8p
	slwi   r8,   r8,   3
	add    r3,   _a_,  r8	; r3 <- &a2
	add    r5,   _c_,  r8	; r5 <- &c4
	add    r5,   r5,   r8
	mr     r4,   _q_
	bl     Lsn_toomsqr
        b      Lsn_toom_aux	; continue with the end of toommul, which
				; rebuilds c, pops the frame and returns

	#undef _d_
        #undef _a_
        #undef _b_
        #undef _c_
        #undef _p_
        #undef _q_
        #undef _r_
        #undef _x_
	#undef _ra_

#undef L
#endif /* assembly_sn_toomsqr */
#if !defined(assembly_sn_toomsqr) || defined(debug_toommul)
REPLACE(sn_toomsqr)
#endif
