// file kernel/n/x86-64/mmod.S: operations on residues modulo BASE^n + 1
/*-----------------------------------------------------------------------+
 |  Copyright 2005-2006, Michel Quercia (michel.quercia@prepas.org)      |
 |                                                                       |
 |  This file is part of Numerix. Numerix is free software; you can      |
 |  redistribute it and/or modify it under the terms of the GNU Lesser   |
 |  General Public License as published by the Free Software Foundation; |
 |  either version 2.1 of the License, or (at your option) any later     |
 |  version.                                                             |
 |                                                                       |
 |  The Numerix Library is distributed in the hope that it will be       |
 |  useful, but WITHOUT ANY WARRANTY; without even the implied warranty  |
 |  of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU  |
 |  Lesser General Public License for more details.                      |
 |                                                                       |
 |  You should have received a copy of the GNU Lesser General Public     |
 |  License along with the GNU MP Library; see the file COPYING. If not, |
 |  write to the Free Software Foundation, Inc., 59 Temple Place -       |
 |  Suite 330, Boston, MA 02111-1307, USA.                               |
 +-----------------------------------------------------------------------+
 |                                                                       |
 |                  Arithmetic modulo BASE^n + 1                         |
 |                                                                       |
 +-----------------------------------------------------------------------*/

        # +----------------------------------------------------------+
        # |  Reduction modulo BASE^p + 1 and BASE^(2p) - BASE^p + 1  |
        # +----------------------------------------------------------+

# input:
#  a = natural number of length 3p       rsi = &a, rcx = p
#  b = natural number of length 3p+1     rdi = &b
#  c = natural number of length p        rbx = &c
#
# constraints:
#  p > 0, c disjoint from a and b. &a == &b is allowed
#
# output:
#  b[0..2p-1] <- a mod BASE^(2p) - BASE^p + 1
#  b[2p..3p]  <- a mod BASE^p + 1
#  c <- undefined
#
# modified registers:
#  rax,rbx,rcx,rdx,rsi,rdi,rbp,r8,r9,r10,r11,r12 <- undefined
        
#if defined(assembly_sn_mmul) || defined(assembly_sn_msqr)
#undef L
#define L(x) .Lsn_fmred_##x
        ALIGN(32)
        # Internal helper: splits a (3p digits, a = a2:a1:a0 in p-digit
        # chunks) into its residue modulo BASE^(2p) - BASE^p + 1 (stored in
        # b0,b1) and its residue modulo BASE^p + 1 (stored in b2, p+1 digits).
        # Register interface: rsi = &a, rdi = &b, rbx = &c (p-digit scratch),
        # rcx = p.
        # r8 collects the saved carries: after the two rcl below, bit 0 is the
        # carry of a1 + a2 and bit 1 is the borrow of a0 - a1.
        # NOTE(review): the add/sub helpers are defined elsewhere; this code
        # presumes they return with CF = carry/borrow, rcx = 0 and their
        # pointer registers advanced past the processed digits - confirm.
.Lsn_fmred:

        movq   %rsi,   %r12             # save &a
        movq   %rdi,   %r11             # save &b
        movq   %rbx,   %r10             # save &c
        movq   %rcx,   %r9              # save p
	
        # c <- a0 - a1
        movq   %rbx,    %rdi            # rdi <- &c
        leaq   (%rsi,%rcx,8), %rbx      # rbx <- &a1
        call   .Lsn_fsub_1
        rclq   $1,     %r8              # save the borrow (CF -> bit 0 of r8)
        
        # b1 <- a1 + a2
        movq   %r9,     %rcx
        movq   %r11,    %rdi
        leaq   (%rdi,%r9,8), %rdi       # rdi <- &b1
        call   .Lsn_fadd_1
        rclq   $1,     %r8              # save the carry (bit 0; borrow moves to bit 1)

        # b0 <- a0 - a2
        movq   %rsi,    %rbx            # rbx <- &a2
        movq   %r12,    %rsi
        movq   %r11,    %rdi
        movq   %r9,     %rcx
        call   .Lsn_fsub_1
        
        # propagate the borrow leaving b0
        movq   %r9,     %rcx            # rcx <- p
        leaq   (%rdi,%r9,8), %rbx       # rbx <- &b2
        not    %rcx;  incq %rcx         # rcx <- -p (not and inc leave CF untouched)
        jnb    2f                       # CF still is the borrow of a0 - a2
        movq   %rcx,    %rdx
1:
        sbbq   $0, (%rbx,%rdx,8)        # b1--
        jnb    2f
        incq   %rdx
        jne    1b
        jmp    L(noret)                 # when the borrow crosses all of b1 then
        ALIGN(8)                        # a1+a2=BASE^p and nothing is left to do
2:

        # recycle the carry leaving b1
        bt     $0,      %r8
        jnc    L(noret)

        # here carry = 1 -> add BASE^p - 1
        # note: b1 <= BASE^p - 2 holds, so no new carry
        # can come out
        movq   %rcx,    %rdx            # rdx <- -p
1:
        subq   $1, (%rdi,%rdx,8)        # b0--
        jnb    2f
        incq   %rdx
        jne    1b
        jmp    L(noret)                 # borrow out of b0 cancels the b1++
        ALIGN(8)
2:
        movq   %rcx,    %rdx            # rdx <- -p
3:
        incq   (%rbx,%rdx,8)            # b1++
        jne    L(noret)
        incq   %rdx
        jmp    3b                       # unbounded: relies on the note above
        ALIGN(8)
L(noret):

        # b2 <- a2 + (a0 - a1)
        movq   %r9,     %rcx            # rcx <- p
        movq   %rbx,    %rdi            # rdi <- &b2
        movq   %r10,    %rbx            # rbx <- &c (holds a0 - a1)
        leaq   (%rsi,%r9,8), %rsi       # rsi <- &a2
        call   .Lsn_fadd_1
        
        # recycle the carry leaving b2
        setc   %cl                      # cl <- carry of the addition
        bt     $1,      %r8             # CF <- saved borrow of a0 - a1
        sbbq   $0,      %rcx            # rcx <- carry (-1,0,1)
        jnb    2f
        movq   %r9,     %rdx
        negq   %rdx                     # rdx <- -p
        incq   %rcx                     # rcx <- 0
1:
        incq   (%rdi,%rdx,8)            # when -1, b2++
        jne    2f
        incq   %rdx
        jne    1b
        incq   %rcx                     # the increment ran through all p digits
2:      
        movq   %rcx,   (%rdi)           # b2[p] <- final carry
        ret

#endif /* defined(assembly_sn_mmul) || defined(assembly_sn_msqr) */
        

                   # +------------------------------------+
                   # |  Multiplication modulo BASE^n + 1  |
                   # +------------------------------------+

#  void xn(mmul)(chiffre *a, chiffre *b, long n)
#
#  input:
#  a = natural number of length n+1
#  b = natural number of length n+1, distinct from a
#
#  constraint: n > 0
#
#  output:
#  a <- (a*b) mod (BASE^n + 1), the most significant digit is 0 or 1
#  b <- b mod (BASE^n + 1)

#ifdef assembly_sn_mmul
#undef L
#define L(x) .Lsn_mmul_##x
        # Modular product: a <- (a*b) mod (BASE^n + 1).
        # Register interface as used below: rdi = &a, rsi = &b, rdx = n
        # (a and b hold n+1 digits each).
        # Three paths: a or b congruent to BASE^n (the product is a
        # negation), n small or not a multiple of 3 (plain product then
        # folding of the high half), n = 3p (work modulo BASE^p + 1 and
        # modulo BASE^(2p) - BASE^p + 1, then recombine).
        # The labels .Lsn_mmul_aux_simple and .Lsn_mmul_aux_trois are also
        # entered from the squaring routine, with the same stack layout.
        # NOTE(review): the helpers called here are defined elsewhere; the
        # code relies on them returning CF = carry/borrow, rcx = 0 and
        # advanced pointers - confirm against their definitions.
        # NOTE(review): the normalisation loops count n-2 digits, which
        # presumes n >= 3 - confirm callers never pass a smaller n.

#ifdef debug_mmul
ENTER(sn_mmul_buggy)
#else
        ALIGN(32)
        .globl SUBR(sn_mmul)
#if __ELF__
        .type  SUBR(sn_mmul),@function
#endif
SUBR(sn_mmul):
.Lsn_mmul:
        pushq  %rbp
        movq   %rsp,%rbp
        pushq  %rbx
        pushq  %r12
        pushq  %r13
        pushq  %r14
        pushq  %r15
#endif

        # normalise a
        movq   (%rdi,%rdx,8), %rax      # rax <- a[n]
        subq   %rax,    (%rdi)          # a[0..1] -= a[n]
        sbbq   $0,     8(%rdi)
        jnb    L(a_ok)                  # no borrow -> done
        leaq   -2(%rdx), %rcx           # rcx <- n-2
        leaq  16(%rdi), %rbx            # rbx <- &a[2]
1:
        sbbq   $0,      (%rbx)          # propagate the borrow
        jnb    L(a_ok)
        leaq   8(%rbx), %rbx
        loop   1b
        movq   %rdx,    %rcx            # when it comes out, add BASE^n + 1
        movq   %rdi,    %rbx
2:
        incq   (%rbx)
        jnz    L(a_ok)
        leaq   8(%rbx), %rbx
        loop   2b
        
        # here a = BASE^n, so the product is -b mod BASE^n + 1
        # rbx points at a[n] after the loop above, so b[n] has to be read
        # through rsi, which still holds &b (the former load through rbx
        # fetched a[2n], outside of a)
        movq   (%rsi,%rdx,8), %rax      # rax <- b[n]
        movq   %rsi,    %rbx            # rbx <- &b
        jmp    L(neg_b)                 # a <- -b mod BASE^n + 1

        # normalise b
        ALIGN(8)
L(a_ok):
        movq   (%rsi,%rdx,8), %rax
        movq   $0,      (%rsi,%rdx,8)   # b[n] <- 0
        subq   %rax,    (%rsi)          # b[0..1] -= b[n]
        sbbq   $0,     8(%rsi)
        jnb    L(b_ok)                  # no borrow -> done
        leaq   -2(%rdx), %rcx           # rcx <- n-2
        leaq  16(%rsi), %rbx            # rbx <- &b[2]
1:
        sbbq   $0,      (%rbx)          # propagate the borrow
        jnb    L(b_ok)
        leaq   8(%rbx), %rbx
        loop   1b
        movq   %rdx,    %rcx            # when it comes out, add BASE^n + 1
        movq   %rsi,    %rbx
2:
        incq   (%rbx)
        jnz    L(b_ok)
        leaq   8(%rbx), %rbx
        loop   2b
        movq   $1,      (%rbx)          # b[n] <- 1
        movq   %rdi,    %rbx            # rbx <- &a

        # here b = BASE^n, so the product is -a mod BASE^n + 1
        movq   %rcx, %rax               # rax <- a[n] (= 0)
        # common negation: a <- rax - x[0..n-1], with rbx = &x (x is b or a)
L(neg_b):
        leaq   (%rdi,%rdx,8), %rsi      # rsi <- &a[n]
        leaq   (%rbx,%rdx,8), %rbx      # rbx <- &b[n]
        movq   %rdx,    %rcx
        negq   %rcx                     # rcx <- -n
        xorq   %rdx,    %rdx            # rdx <- 0, also clears CF for the first sbb
        ALIGN(8)
1:
        sbbq   (%rbx,%rcx,8),  %rax     # a <- b[n] - b[0..n-1]
        movq   %rax,    (%rsi,%rcx,8)
        movq   %rdx,    %rax            # next digit starts from 0
        incq   %rcx                     # inc keeps CF for the next sbb
        jne    1b
        movq   %rax,    (%rsi)          # a[n] <- 0
        jnb    3f
2:
        incq   (%rdi)                   # on borrow, add BASE^n + 1
        leaq   8(%rdi), %rdi
        jz     2b
3:
        RETURN_WITH_SP

        # here a and b are normalised and fit in n digits
        # is n divisible by 3 and large enough ?
        ALIGN(8)
L(b_ok):
        cmpq   $mmul_lim, %rdx
        jbe    L(simple_mul)
        movq   %rdx,    %rax            # rax <- n
        xorq   %rdx,    %rdx            # rdx:rax <- n
        movq   $3,      %rcx
        divq   %rcx                     # rax <- n/3, rdx <- n%3
        testq  %rdx,    %rdx
        jz     L(trois)
	leaq   (%rax,%rax,2), %rax
        addq   %rax,    %rdx            # rdx <- n

        # case n small or not divisible by 3: multiplication in N
L(simple_mul):
        movq   %rdx,    %rcx            # rcx <- n
	movq   %rdx,    %rax
	shlq   $4,      %rax
	ALLOCA                          # reserve 2n digits on the stack
        movq   %rdi,    %rbx            # rbx <- &a
        movq   %rsp,    %rdi            # rdi <- &c
        pushq  %rcx                     # save n
        pushq  %rbx                     # save &a
        call   .Lsn_ftoommul            # c <- a*b
        
        # entry point for msqr (stack: &a, n, then the 2n digits of c)
.Lsn_mmul_aux_simple:
        
        popq   %rdi                     # rdi <- &a
        movq   (%rsp),  %rcx            # rcx <- n
        movq   %rcx,    %rdx            # rdx <- n
        leaq   8(%rsp), %rsi            # rsi <- &c
        leaq   (%rsi,%rcx,8), %rbx      # rbx <- &c[n]
        call   .Lsn_fsub_1              # a[0..n-1] <- c[0..n-1] - c[n..2n-1]
        popq   %rdx                     # rdx <- n
        leaq   (%rsp,%rdx,8), %rsp      # clean up the stack (lea keeps CF)
        leaq   (%rsp,%rdx,8), %rsp
        movq   %rcx,    (%rdi)          # a[n] <- 0
        jnb    2f                       # no borrow: done
        not    %rdx                     # rdx <- -n-1
1:
        incq   %rdx
        incq   (%rdi,%rdx,8)            # otherwise, add BASE^n + 1
        jz     1b
2:
        RETURN_WITH_SP
        
        # case n divisible by 3: multiply modulo BASE^p + 1
        # and modulo BASE^(2p) - BASE^p + 1
        
L(trois):

        # local variables
        #undef  _a_
        #undef  _b_
        #undef  _c_
        #undef  _p_
        #define _c_   24(%rsp)
        #define _a_   16(%rsp)
        #define _b_    8(%rsp)
        #define _p_     (%rsp)

        movq   %rsi,    %rbx            # rbx <- &b
        movq   %rdi,    %rsi            # rsi <- &a
	movq   %rax,    %rcx
        shlq   $4,      %rax
        leaq   (%rax,%rax,2), %rax      # rax <- 48p (bytes, ie. 6p digits)
	ALLOCA                          # reserve 6p digits on the stack
        pushq  %rsi                     # save &a
        pushq  %rbx                     # save &b
        pushq  %rcx                     # save p

        # split a and b modulo BASE^(2p) - BASE^p + 1 and BASE^p + 1
        movq   %rsi,   %rdi             # a is reduced in place
        leaq   _c_,    %rbx             # scratch <- c[0..p-1]
        call   .Lsn_fmred
        movq   _b_,    %rsi
        movq   _p_,    %rcx
        leaq   _c_,    %rdi             # reduced b goes to c[0..3p]
        leaq   (%rdi,%rcx,8), %rbx
        leaq   (%rbx,%rcx,8), %rbx      # scratch <- c[2p..3p-1]
        call   .Lsn_fmred

        # a[2p..3p] <- (a*b) mod BASE^p + 1
        movq   _a_,     %rdi
        leaq   _c_,     %rsi
        movq   _p_,     %rdx
        leaq   (%rsi,%rdx,8), %rsi      # rsi <- &c[2p]
        leaq   (%rsi,%rdx,8), %rsi
        leaq   (%rdi,%rdx,8), %rdi      # rdi <- &a[2p]
        leaq   (%rdi,%rdx,8), %rdi
        call   .Lsn_mmul                # recursive call on p digits

        # c[2p..6p-1] <- (a*b) mod (BASE^(2p) - BASE^p + 1), unreduced
        movq   _a_,     %rsi
        leaq   _c_,     %rbx
        movq   _p_,     %rcx
        leaq   (,%rcx,2), %rcx          # rcx <- 2p
        movq   %rcx,    %rdx            # rdx <- 2p
        leaq   (%rbx,%rcx,8), %rdi      # rdi <- &c[2p]
        call   .Lsn_ftoommul
        
        # entry point for msqr (stack: p, &b, &a, then the 6p digits of c)
.Lsn_mmul_aux_trois:

	# reassign the local variables to registers
	movq    _a_, %r10
	movq    _p_, %r11
	leaq    (,%r11,2), %r12         # r12 <- 2p
	
	#undef  _a_
	#undef  _p_
	#undef  _2p_
	#define _a_  %r10
	#define _p_  %r11
	#define _2p_ %r12

        # a[0..2p-1] <- (a*b) mod (BASE^(2p) - BASE^p + 1)
        # c0..c3 below denote the four p-digit chunks of the product at c[2p]
        movq   _a_,     %rdi
        leaq   _c_,     %rsi
        movq   _p_,     %rcx
        leaq   (%rsi,_2p_,8), %rsi      # rsi <- &c[2p]
        leaq   (%rsi,_2p_,8), %rbx      # rbx <- &c2
        call   .Lsn_fsub_1              # a0 <- c0 - c2
        setc   %cl                      # r9 <- borrow onto a1
        movq   %rcx,    %r9
        movq   _p_,     %rcx
        leaq   (%rsi,_p_,8), %rbx       # rbx <- &c2
        call   .Lsn_fadd_1              # a1 <- c1 + c2
        setc   %cl
        movq   %rcx,    %r8             # r8 <- carry onto a2
        movq   _a_,     %rsi
	movq   _a_,     %rdi
        movq   _p_,     %rcx
        call   .Lsn_fsub_1              # a0 -= c3

        sbbq   %r9,    (%rsi)           # accumulate the borrows onto a1
        jnb    2f                       # and propagate them
        movq   _p_,     %rcx
        leaq   (%rsi,_p_,8), %rbx
        decq   %rcx
        negq   %rcx                     # rcx <- -(p-1)
1:
        subq   $1,      (%rbx,%rcx,8)
        jnb    2f
        incq   %rcx                     # inc keeps CF
        jne    1b
        ALIGN(8)
2:
        sbbq   $0,      %r8            # r8 <- carry onto a2 (-1,0,1)
        jz     7f
        movq   _p_,     %rcx
        not    %rcx
        incq   %rcx                     # rcx <- -p, CF still from the sbb
        jb     5f
3:                                      # when > 0, a0--, a1++
        subq   $1,      (%rsi,%rcx,8)
        jnb    4f
        incq   %rcx
        jne    3b
        jmp    7f                       # borrow out of a0 cancels the a1++
        ALIGN(8)
4:
        incq   (%rsi)
        leaq   8(%rsi), %rsi
        jz     4b
        jmp    7f
        ALIGN(8)
5:                                      # when < 0, a0++, a1--
        incq   (%rsi,%rcx,8)
        jne    6f
        incq   %rcx
        jne    5b
        jmp    7f                       # carry out of a0 cancels the a1--
        ALIGN(8)
6:
        subq   $1,      (%rsi)
        leaq   8(%rsi), %rsi
        jb     6b
7:

        # a[2p..3p] <- (a[0..2p-1] - a[2p..3p]) mod (BASE^p + 1), normalised
        movq   _p_,     %rcx
        movq   _a_,     %rsi
        leaq   (_a_,_2p_,8), %rbx       # rbx <- &a2
        movq   %rbx,    %rdi
        call   .Lsn_fsub_1              # a2 <- a0 - a2
        adcq   %rcx,    (%rbx)          # a2[p] += borrow
        movq   _p_,     %rcx
        movq   %rsi,    %rbx            # rbx <- &a1
        leaq   (%rsi,_p_,8), %rsi       # rsi <- &a2
	movq   %rsi,    %rdi
        call   .Lsn_fsub_1              # a2 -= a1
        xchgq  %rcx,    (%rsi)          # a2[p] <- 0
        adcq   $0,      %rcx            # rcx <- carry
        movq   _p_,     %rdx
        negq   %rdx
        addq   %rcx,    (%rsi,%rdx,8)   # re-inject the carry
        jnc    2f
1:
        incq   %rdx
        incq   (%rsi,%rdx,8)
        jz     1b
2:

# algorithm for the division by -3 modulo BASE^p + 1
# --------------------------------------------------
#
# Write a for a2 and B for BASE. We have 0 <= a <= B^p and we look for
# q such that a = -3q mod (B^p+1) with 0 <= q <= 2B^p - 1 (ie. the most
# significant digit of q is 0 or 1). q exists because -3 is coprime to B.
#
# Let m = (B-1)/3 (an integer). The algorithm computes m*a and divides
# it by 1-B (division by increasing powers of B).
# This yields b between 0 and B^p-1 and a signed integer x such that:
#
#     m*a = (1-B)*b + B^p*x
#
# The first two terms are divisible by m, hence so is the third one.
# Let x = m*y, whence a = -3b + B^p*y. Since 0 <= a <= B^p and 0 <= b < B^p,
# we get 0 <= y <= 3.
#
# Case y = 0: (then a=0) q = b (= 0).
# Case y = 1: a = -3b +  B^p = -3(b + 1 +  m*(1+B+..+B^(p-1))) mod (B^p+1)
# Case y = 2: a = -3b + 2B^p = -3(b + 2 + 2m*(1+B+..+B^(p-1))) mod (B^p+1)
# Case y = 3: a = -3b + 3B^p = -3(b+1) mod (B^p+1).
#
# Computing b and y: write a = sum(a.i*B^i), b = sum(b.i*B^i) and
# compute the numbers b.i,c.i,d.i step by step with:
#
#     d.0 = 0
#     m*(a.i + d.i) = b.i + B*c.i    (Euclidean division)
#     d.(i+1) = (b.i + c.i)/m        (an integer)
#
# Hence m*(a+d) = b + B*c + m*B^p*(a.p+d.p) and m*d = B*(b+c),
# whence m*a = (1-B)*b + B^p*m*(a.p+d.p) as announced, and y = a.p+d.p.
#
# Getting b.i and c.i from a.i and d.i is immediate. As for d.i,
# one shows by induction that d.i <= 3 and since m = 1 mod 4,
# it follows that d.i = (b.i + c.i) mod 4.

        # a[2p..3p] <- -a[2p..3p]/3 mod BASE^p + 1
        movq   _p_,     %rcx
        negq   %rcx
        movq   $0x55555555, %rbp        # rbp <- (BASE-1)/3
	shlq   $32,     %rbp
	addq   $0x55555555, %rbp
        xorq   %rax,    %rax            # init remainder (and CF <- 0)
        ALIGN(8)
1:
        adcq   (%rsi,%rcx,8), %rax      # rax += a[i]
        rclq   $1,      %rbx            # save the carry in bit 0 of rbx
        mulq   %rbp                     # divide by -3 (product by (BASE-1)/3)
        movq   %rax,    (%rsi,%rcx,8)   # a[i] <- quotient
        addq   %rdx,    %rax
        andq   $3,      %rax            # rax <- remainder
        bt     $0,      %rbx            # get the carry back
        incq   %rcx
        jne    1b
        adcq   (%rsi),  %rax            # add the last digit of a2
        movq   %rcx,    (%rsi)          # a2[p] <- 0
        jz     5f                       # no remainder -> done
        
        movq   _p_,     %rcx
        negq   %rcx
        cmpq   $2,      %rax
        ja     4f                       # rem = 3 -> a2++
        jne    2f
        shlq   $1,      %rbp            # rem = 1 or 2 => rbp <- rem*(BASE-1)/3
2:
        addq   %rbp,    %rax            # rax <- rem*(BASE+2)/3
        addq   %rax,    (%rsi,%rcx,8)   # lowest digit of a2 += rax
        incq   %rcx                     # inc keeps CF for the adc chain
        ALIGN(8)
3:
        adcq   %rbp,    (%rsi,%rcx,8)   # other digits += rem*(BASE-1)/3
        incq   %rcx
        jne    3b
        adcq   %rcx,    (%rsi)          # a2[p] += last carry
        jmp    5f
        ALIGN(8)
4:
        incq   %rcx
        incq  -8(%rsi,%rcx,8)           # when rem = 3, a2++
        jz     4b
5:
     
        # a <- a - (BASE^p - 1)*a[2p..3p]
        movq   _a_,     %rsi
        leaq   1(_p_),  %rcx            # rcx <- p+1
        leaq   (_a_,_2p_,8), %rbx       # rbx <- &a2
        movq   _2p_,    %rdx            # rdx <- 2p
        call   .Lsn_finc                # a1:a0 += a2
        setc   %cl                      # save the carry of the addition
	movq   %rcx,    %r9

        leaq   (_a_,_p_,8),  %rsi       # rsi <- &a1
        leaq   (_a_,_2p_,8), %rbx       # rbx <- &a2
        leaq   1(_p_),  %rcx            # rcx <- p+1
        leaq   1(_2p_), %rdx            # rdx <- 2p+1
        setc   %r9b                     # same carry again (lea left CF untouched)
        call   .Lsn_fdec                # a2:a1 -= a2

        bt     $0,      %r9             # propagate the carry of the addition
        jnc    2f
1:
	incq  -8(%rsi)
	leaq   8(%rsi),%rsi
	jz     1b
2:
        # done
        leaq   3(_2p_,_2p_,2), %rax     # rax <- 6p+3
        leaq   (%rsp,%rax,8),  %rsp     # clean up the stack
        RETURN_WITH_SP

#endif /* assembly_sn_mmul */

        # case where the assembly version is disabled or being debugged:
        # .Lsn_mmul forwards to the C version

#if !defined(assembly_sn_mmul) || defined(debug_mmul)
        ALIGN(32)
        # Trampoline: gives the file-local label used by the internal calls a
        # definition that simply jumps to the exported sn_mmul symbol.
.Lsn_mmul:
        jmp   SUBR(sn_mmul)
#endif /* !defined(assembly_sn_mmul) || defined(debug_mmul) */



                        # +----------------------------+
                        # |  Square modulo BASE^n + 1  |
                        # +----------------------------+

#  void xn(msqr)(chiffre *a, long n)
#
#  input:
#  a = natural number of length n+1
#
#  constraint: n > 0
#
#  output:
#  a <- a^2 mod (BASE^n + 1), the most significant digit is 0 or 1

#ifdef assembly_sn_msqr
#undef L
#define L(x) .Lsn_msqr_##x
        # Modular square: a <- a^2 mod (BASE^n + 1).
        # Register interface as used below: rdi = &a, rsi = n (a holds n+1
        # digits).
        # After normalising a, the routine builds the same stack layout as
        # the product routine and jumps into its tail (.Lsn_mmul_aux_simple
        # or .Lsn_mmul_aux_trois) to finish the reduction.
        # NOTE(review): the normalisation loop counts n-2 digits, which
        # presumes n >= 3 - confirm callers never pass a smaller n.
#ifdef debug_mmul
ENTER(sn_msqr_buggy)
#else
        ALIGN(32)
        .globl SUBR(sn_msqr)
#if __ELF__
        .type  SUBR(sn_msqr),@function
#endif
SUBR(sn_msqr):
.Lsn_msqr:
        pushq  %rbp
        movq   %rsp,%rbp
        pushq  %rbx
        pushq  %r12
        pushq  %r13
        pushq  %r14
        pushq  %r15
#endif

        # normalise a
        movq   (%rdi,%rsi,8), %rax      # rax <- a[n]
        subq   %rax,    (%rdi)          # a[0..1] -= a[n]
        sbbq   $0,     8(%rdi)
        jnb    L(a_ok)                  # no borrow -> done
        leaq  -2(%rsi), %rcx            # rcx <- n-2
        leaq  16(%rdi), %rbx            # rbx <- &a[2]
1:
        sbbq   $0,      (%rbx)          # propagate the borrow
        jnb    L(a_ok)
        leaq   8(%rbx), %rbx
        loop   1b
        movq   %rsi,    %rcx            # when it comes out, add BASE^n + 1
        movq   %rdi,    %rbx
2:
        incq   (%rbx)
        jnz    L(a_ok)
        leaq   8(%rbx), %rbx
        loop   2b
        
        # here a = BASE^n, so the square is 1
        movq   $0,      (%rbx)          # a[n] <- 0 (rbx ends on &a[n])
        movq   $1,      (%rdi)          # a[0] <- 1
        RETURN_WITH_SP

        # here a is normalised and fits in n digits
        # is n divisible by 3 and large enough ?
        ALIGN(8)
L(a_ok):
        cmpq   $msqr_lim, %rsi
        jbe    L(simple_sqr)
        movq   %rsi,    %rax            # rax <- n
        xorq   %rdx,    %rdx            # rdx:rax <- n
        movq   $3,      %rcx
        divq   %rcx                     # rax <- n/3, rdx <- n%3
        testq  %rdx,    %rdx
        jz     L(trois)

        # case n small or not divisible by 3: square in N
L(simple_sqr):

        movq   %rsi,    %rdx            # rdx <- n
        movq   %rsi,    %rax
        shlq   $4,      %rax
	ALLOCA                          # reserve 2n digits on the stack
	movq   %rdi,    %rsi            # rsi <- &a
        movq   %rsp,    %rdi            # rdi <- &c
        pushq  %rdx                     # save n
        pushq  %rsi                     # save &a
        call   .Lsn_ftoomsqr            # c <- a^2
        jmp    .Lsn_mmul_aux_simple     # continue with mmul

        # case n divisible by 3: multiply modulo BASE^p + 1
        # and modulo BASE^(2p) - BASE^p + 1

L(trois):

        # local variables
        #undef  _a_
        #undef  _b_
        #undef  _c_
        #undef  _p_
        #define _c_   24(%rsp)
        #define _a_   16(%rsp)
        #define _b_    8(%rsp)
        #define _p_     (%rsp)

	movq   %rax,    %rcx
        shlq   $4,      %rax
        leaq   (%rax,%rax,2), %rax      # rax <- 48p (bytes, ie. 6p digits)
	ALLOCA                          # reserve 6p digits on the stack
        pushq  %rdi                     # save &a
        pushq  %rdi                     # save &b (= &a)
        pushq  %rcx                     # save p

        # split a modulo BASE^(2p) - BASE^p + 1 and BASE^p + 1
	movq   %rdi,   %rsi
        leaq   _c_,    %rbx
        call   .Lsn_fmred

        # a[2p..3p] <- a^2 mod BASE^p + 1
        movq   _a_,     %rdi
        movq   _p_,     %rsi
        leaq   (%rdi,%rsi,8), %rdi      # rdi <- &a[2p]
        leaq   (%rdi,%rsi,8), %rdi
        call   .Lsn_msqr                # recursive call on p digits
        
        # c[2p..6p-1] <- a^2 mod (BASE^(2p) - BASE^p + 1), unreduced
        movq   _a_,     %rsi
        leaq   _c_,     %rdi
        movq   _p_,     %rdx
        leaq   (,%rdx,2), %rdx          # rdx <- 2p
        leaq   (%rdi,%rdx,8), %rdi      # rdi <- &c[2p]
        call   .Lsn_ftoomsqr

        jmp    .Lsn_mmul_aux_trois      # continue with mmul
        
#endif /* assembly_sn_msqr */

        # case where the assembly version is disabled or being debugged:
        # .Lsn_msqr forwards to the C version

#if !defined(assembly_sn_msqr) || defined(debug_mmul)
        ALIGN(32)
        # Trampoline: gives the file-local label used by the internal calls a
        # definition that simply jumps to the exported sn_msqr symbol.
.Lsn_msqr:
        jmp   SUBR(sn_msqr)
#endif /* !defined(assembly_sn_msqr) || defined(debug_mmul) */

                      # +------------------------------+
                      # |  Papillon modulo BASE^n + 1  |
                      # +------------------------------+

# void xn(butterfly1)(chiffre *a, chiffre *b, long n, long q, int s)
#
#  input:
#  a = natural number of length n+1
#  b = natural number of length n+1, distinct from a
#  q = non-negative integer
#  s = 0 or 1
#
#  constraints: n >= 3, and when q is odd, n must be even
#
#  output:
#  a <- a + (-1)^s * b * 2^(q/2) mod (BASE^n + 1)
#  b <- a - (-1)^s * b * 2^(q/2) mod (BASE^n + 1)
#
#  remark: 2^(1/2) = BASE^(3n/4)*(BASE^(n/2) + 1) mod (BASE^n + 1)

#ifdef assembly_sn_butterfly
#undef L
#define L(x) .Lsn_butterfly_##x
        # Butterfly step modulo BASE^n + 1: builds x = b * 2^(q/2) on the
        # stack, then stores a + (-1)^s * x into a and a - (-1)^s * x into b.
        # Incoming registers as read below: rdi = &a, rsi = &b, rdx = n,
        # rcx = q, r8 = s. They are copied to r9..r13 for the whole routine.
        # Only bit 0 of s is ever tested; every sign flip is an increment.
        # NOTE(review): the add/sub helpers are defined elsewhere; the code
        # relies on them returning CF = carry/borrow, rcx = 0 and advanced
        # pointers - confirm against their definitions.
#ifdef debug_butterfly
ENTER(sn_butterfly_buggy)
#else
ENTER(sn_butterfly)
#endif
	# local variables
	#undef  _a_
	#undef  _b_
	#undef  _n_
	#undef  _q_
	#undef  _s_
	#define _a_ %r9
	#define _b_ %r10
	#define _n_ %r11
	#define _q_ %r12
	#define _s_ %r13

	movq    %rdi, _a_
	movq    %rsi, _b_
	movq    %rdx, _n_
	movq    %rcx, _q_
	movq    %r8,  _s_

        # force 2 <= a[n] <= BASE-3 to absorb the carries
        movq   (%rdi,%rdx,8), %rax      # rax <- a[n]
        addq   $2,      %rax
        jnc    2f                       # a[n] < BASE - 2 ?
        subq   $2,      (%rdi,%rdx,8)   # when not, subtract 2*(BASE^n + 1)
        subq   $2,      (%rdi)
        jnb    4f
1:
        leaq   8(%rdi), %rdi            # propagate the borrow
        sbbq   $0,      (%rdi)
        jb     1b
        jmp    4f
        ALIGN(8)
2:
        cmpq   $4,      %rax            # a[n] >= 2 ?
        jnb    4f
        movq   %rax,    (%rdi,%rdx,8)   # when not, add 2*(BASE^n + 1)
        addq   $2,      (%rdi)
        jnc    4f
3:
        leaq   8(%rdi), %rdi            # propagate the carry
        incq   (%rdi)
        jz     3b
4:
        shrq   $1,      _q_             # q <- q/2
        jnc    L(sqrt_2_done)

        # q is odd, b must be multiplied by (BASE^(n/2) + 1)
        leaq   (_n_,_n_,2),   %rdx      # q <- q/2 + 48*n
	shlq   $4,      %rdx
	addq   %rdx,    _q_
	leaq   2(_n_),  %rcx            # rcx <- n/2 + 1
        shrq   $1,      %rcx
        leaq   (,%rcx,8), %rax
	ALLOCA                          # reserve n/2 + 1 digits on the stack
        movq   %rsp,    %rdi            # rdi <- &x
        leaq   -8(_b_,%rcx,8), %rsi     # rsi <- &b[n/2]
        cld;   rep movsq                # x <- b[n/2..n]

        # high <- high + low
        movq   _n_,     %rcx
        shrq   $1,      %rcx
        leaq   (_b_,%rcx,8), %rsi       # rsi <- &b[n/2]
        movq   _b_,     %rbx            # rbx <- &b[0]
        movq   %rsi,    %rdi
        call   .Lsn_fadd_1              # b[n/2..n-1] += b[0..n/2-1]
        adcq   %rcx,    (%rsi)          # b[n] += carry
        jnc    2f
        decq   (%rsi)                   # on carry, b[n] <- BASE-1
        movq   _b_,     %rsi            # ... and b <- b - 1
1:
        subq   $1,      (%rsi)
        leaq   8(%rsi), %rsi
        jb     1b
2:
        
        # low <- low - high
        movq   _b_,     %rsi            # rsi <- &b
        movq   %rsp,    %rbx            # rbx <- &x
	leaq   2(_n_),  %rcx            # rcx <- n/2 + 1
        shrq   $1,      %rcx
	movq   %rsi,    %rdi
        call   .Lsn_fsub_1              # b[0..n/2] -= x
        movq   %rbx,    %rsp            # clean up the stack (rbx presumably ends just past x)
1:
        sbbq   $0,      (%rsi)          # propagate the borrow
        leaq   8(%rsi), %rsi
        jb     1b
        
        ALIGN(8)
L(sqrt_2_done):
        
        # split the shift into a digit count and a bit count
	movq   _q_,     %rcx
        andq   $63,     %rcx            # rcx <- q % 64
	shrq   $6,      _q_             # q <- q/64
1:
	subq   _n_,     _q_             # reduce modulo n
	incq   _s_                      # one sign flip per pass (inc keeps CF)
	jnb    1b                       # loop until the subtraction borrows
	addq   _n_,     _q_             # undo the last subtraction

        # b <- b*2^k mod (BASE^n + 1), normalised
        movq   %rcx,    %rdx            # rdx <- k
        jrcxz  L(norm)                  # when k = 0, normalise b
        
        # shift by k bits, two digits per iteration
        leaq   1(_n_),  %rdx            # rdx <- n+1
        movq   (_b_,_n_,8), %rbx        # rbx <- b[n]
        movq   %rbx,    %rax            # rax <- b[n]
        movq   %rbx,    %rsi            # rsi <- b[n]
        btrq   $0,      %rdx            # make rdx even, CF <- parity of n+1
        jnc    2f
        ALIGN(8)
1:
        movq  -8(_b_,%rdx,8), %rbx      # rbx <- b[2i-1]
        shldq  %cl, %rbx, %rax          # rax <- (carry:b[2i-1]) << k mod BASE
        movq   %rax, (_b_,%rdx,8)       # store in b[2i]
2:
        movq -16(_b_,%rdx,8), %rax      # rax <- b[2i-2]
        shldq  %cl, %rax, %rbx          # rbx <- (carry:b[2i-2]) << k mod BASE
        movq   %rbx, -8(_b_,%rdx,8)     # store in b[2i-1]
        subq   $2,     %rdx
        jne    1b
        shlq   %cl,     %rax            # rax <- b[0] << k mod BASE
        movq   %rax,    (_b_)           # store in b[0]
        shldq  %cl, %rsi, %rdx          # rdx <- bits shifted out of the old b[n]
        
        # normalise b (direct entry when k = 0, with rdx = 0)
L(norm):
        leaq   (_b_,_n_,8), %rsi        # rsi <- &b[n]
	movq   $2,      %rcx            # rcx <- 2 - n
	subq   _n_,     %rcx
        movq   (%rsi),  %rax            # rax <- b[n]
        subq   %rax,    (_b_)           # subtract from b[1]:b[0]
        sbbq   %rdx,    8(_b_)
        jnb    L(b_ok)
1:
        sbbq   $0,      (%rsi,%rcx,8)   # propagate the borrow
        jnb    L(b_ok)
        incq   %rcx
        jnz    1b
        movq   _n_,     %rcx            # when it comes out, add BASE^n + 1
        movq   _b_,     %rsi
2:
        incq   (%rsi)
        jnz    L(b_ok)
        leaq   8(%rsi), %rsi
        loop   2b

        # case b = BASE^n
        movq   _a_,     %rsi            # rsi <- &a
        movq   _b_,     %rdi            # rdi <- &b
        leaq   1(_n_),  %rcx            # rcx <- n+1
        movq   _q_,     %rdx
        subq   %rcx,    %rdx            # rdx <- m - n - 1
        cld;   rep movsq                # b <- a
        leaq   (%rsi,%rdx,8), %rsi      # rsi <- &a[m]
        leaq   (%rdi,%rdx,8), %rdi      # rdi <- &b[m]
        bt     $0,      _s_             # pointers are swapped when bit 0 of s is clear
        jc     1f
        xchgq  %rsi,    %rdi
1:                                      # a -= BASE^m
        subq   $1,      (%rsi)
        leaq   8(%rsi), %rsi
        jb     1b
2:                                      # b += BASE^m
        incq   (%rdi)
        leaq   8(%rdi), %rdi
        jz     2b
        RETURN_WITH_SP

        # here b is known to fit in n digits
        # x <- b*BASE^m mod (BASE^n + 1)
        ALIGN(8)
L(b_ok):
        cld
        movq   _b_,     %rsi            # rsi <- &b
        movq   _n_,     %rcx            # rcx <- n
        movq   _q_,     %rdx            # rdx <- m
        pushq  $0                       # x[n] <- 0
        leaq   (,%rcx,8), %rax
	ALLOCA                          # reserve n more digits on the stack
        subq   %rdx,    %rcx            # rcx <- n-m
        cmpq   %rdx,    %rcx            # when m > n-m, change the sign so
        jb     L(reverse)               # that the copy done by rep movsq
                                        # covers the longer part

        # here m <= n-m : x[m..n-1] <- b[0..n-m-1]
        leaq   (%rsp,%rdx,8), %rdi      # rdi <- &x[m]
        rep    movsq
	
        # x[0..m-1] <- 1 - b[n-m..n-1]
        xchgq  %rdx,    %rcx            # rcx <- m, rdx <- 0
        jrcxz  L(x_ok)
        movq   $1,      %rax            # init carry
        movq   %rax,    (%rdi)          # x[n] <- 1 to absorb the borrow
        leaq   (%rsp,%rcx,8), %rdi      # rdi <- &x[m]
        leaq   (%rsi,%rcx,8), %rsi      # rsi <- &b[n]
        negq   %rcx
        clc
        ALIGN(8)
1:
        sbbq   (%rsi,%rcx,8), %rax      # rax -= b[i+n-m]
        movq   %rax,    (%rdi,%rcx,8)   # store in x[i]
        movq   %rdx,    %rax            # reset to 0 for the next digit
        incq   %rcx
        jne    1b
2:
        sbbq   %rcx,   (%rdi)           # propagate the borrow
        leaq   8(%rdi), %rdi
        jc     2b
        jmp    L(x_ok)

        # here m > n-m > 0
        ALIGN(8)
L(reverse):
        incq   _s_                      # s <- 1 - s

        # x[0..m-1] <- b[n-m..n-1]
        leaq   (%rsi,%rcx,8), %rsi      # rsi <- &b[n-m]
        movq   %rsp,    %rdi            # rdi <- &x
        xchgq  %rcx,    %rdx            # rcx <- m, rdx <- n-m
        rep    movsq

        # x[m..n-1] <- -b[0..n-m-1]
        xchgq  %rcx,    %rdx            # rcx <- n-m, rdx <- 0
        leaq   (_b_,%rcx,8), %rsi       # rsi <- &b[n-m]
        leaq   (%rdi,%rcx,8), %rdi      # rdi <- &x[n]
        negq   %rcx
        clc
        ALIGN(8)
1:
        movq   %rdx,    %rax            # rax <- 0
        sbbq   (%rsi,%rcx,8), %rax      # rax <- 0 - b[i] - borrow
        movq   %rax,    (%rdi,%rcx,8)
        incq   %rcx
        jne    1b
        movq   %rsp,    %rdi            # rdi <- &x
2:
        adcq   %rcx,    (%rdi)          # re-inject the carry
        leaq   8(%rdi), %rdi
        jc     2b
L(x_ok):
        
        # a <- a + (-1)^s*x, b <- a - (-1)^s*x
        movq   _a_,     %rsi            # rsi <- &a
        movq   %rsp,    %rbx            # rbx <- &x
        movq   _b_,     %rdi            # rdi <- &b
        leaq   1(_n_),  %rcx            # rcx <- n+1

	leaq .Lsn_fadd_1(%rip), %r14    # r14,r15 <- call targets
	leaq .Lsn_fsub_1(%rip), %r15
	bt    $0,       _s_             # when bit 0 of s is set, swap them
	jnc   1f
	xchgq %r14,      %r15
1:
	call   *%r14                    # b <- a + (-1)^s*x
        movq   _a_,     %rsi            # rsi <- &a
	movq   _a_,     %rdi            # rdi <- &a
        movq   %rsp,    %rbx            # rbx <- &x
        leaq   1(_n_),  %rcx            # rcx <- n+1
        call   *%r15                    # a <- a - (-1)^s*x
	
	leaq 8(%rsp,_n_,8),%rsp         # clean up the stack (n+1 digits of x)
        RETURN_WITH_SP
	

#endif /* assembly_sn_butterfly */

