// file kernel/n/alpha/shift.S: shift of natural integers
/*-----------------------------------------------------------------------+
 |  Copyright 2005-2006, Michel Quercia (michel.quercia@prepas.org)      |
 |                                                                       |
 |  This file is part of Numerix. Numerix is free software; you can      |
 |  redistribute it and/or modify it under the terms of the GNU Lesser   |
 |  General Public License as published by the Free Software Foundation; |
 |  either version 2.1 of the License, or (at your option) any later     |
 |  version.                                                             |
 |                                                                       |
 |  The Numerix Library is distributed in the hope that it will be       |
 |  useful, but WITHOUT ANY WARRANTY; without even the implied warranty  |
 |  of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU  |
 |  Lesser General Public License for more details.                      |
 |                                                                       |
 |  You should have received a copy of the GNU Lesser General Public     |
 |  License along with the GNU MP Library; see the file COPYING. If not, |
 |  write to the Free Software Foundation, Inc., 59 Temple Place -       |
 |  Suite 330, Boston, MA 02111-1307, USA.                               |
 +-----------------------------------------------------------------------+
 |                                                                       |
 |                                Shifts                                 |
 |                                                                       |
 +-----------------------------------------------------------------------*/

                            # +------------------+
                            # |  Unrolled copy   |
                            # +------------------+

   # Unrolled copy of a digit vector by increasing addresses.
   #
   # entry, possibly in the middle of the loop (at the start of any BODY):
   #   r2  = -(operand length in digits); the caller in this file passes it
   #         rounded to a multiple of 32, namely -32*ceil(length/32)
   #   r16 = source address, moved back so that the first digit to copy
   #         sits at the offset used by the BODY where the loop is entered
   #   r20 = destination address, adjusted the same way
   #   r27 = return address
   #
   # exit:
   #   (r20) <- (r16), copied by increasing addresses
   #
   # registers modified:
   #   r2  <- increased by 32 per pass until it is no longer negative
   #   r3  <- undefined (scratch, holds the last digit copied)
   #   r16 <- next address for the source
   #   r20 <- next address for the destination

        .align 5
        .globl sn_cpuploop
        .ent   sn_cpuploop
        .frame $30,0,$27,0
        .prologue 0

        # loop body to unroll: 2 instructions, to be entered at the first one
        # (moves the digit at byte offset x from source to destination via r3)
#define BODY(x) \
        ldq    $3,   x($16)     ;\
        stq    $3,   x($20)

	# unrolled loop handling 32 digits per pass
sn_cpuploop:
        BODY(0);   BODY(8);   BODY(16);  BODY(24)
        BODY(32);  BODY(40);  BODY(48);  BODY(56)
        BODY(64);  BODY(72);  BODY(80);  BODY(88)
        BODY(96);  BODY(104); BODY(112); BODY(120)
        BODY(128); BODY(136); BODY(144); BODY(152)
        BODY(160); BODY(168); BODY(176); BODY(184)
        BODY(192); BODY(200); BODY(208); BODY(216)
        BODY(224); BODY(232); BODY(240); BODY(248)
#undef BODY

	# count 32 digits as done, advance both pointers by 32*8 bytes,
	# loop while r2 is still negative, then return through r27
	lda    $2,   32($2)
	lda    $16,  256($16)
	lda    $20,  256($20)
	blt    $2,   sn_cpuploop
	ret    $31,  ($27),1

	.end   sn_cpuploop

   # Unrolled copy of a digit vector by decreasing addresses.
   #
   # entry, possibly in the middle of the loop (at the start of any BODY):
   #   r2  = -(operand length in digits); the caller in this file passes it
   #         rounded to a multiple of 32, namely -32*ceil(length/32)
   #   r16 = source address, moved forward so that the first (highest) digit
   #         to copy sits at the negative offset used by the BODY where the
   #         loop is entered
   #   r20 = destination address, adjusted the same way
   #   r27 = return address
   #
   # exit:
   #   (r20) <- (r16), copied by decreasing addresses
   #
   # registers modified:
   #   r2  <- increased by 32 per pass until it is no longer negative
   #   r3  <- undefined (scratch, holds the last digit copied)
   #   r16 <- decreased by 256 bytes per pass
   #   r20 <- decreased by 256 bytes per pass

        .align 5
        .globl sn_cpdnloop
        .ent   sn_cpdnloop
        .frame $30,0,$27,0
        .prologue 0

        # loop body to unroll: 2 instructions, to be entered at the first one
        # (moves the digit at byte offset -x from source to destination via r3)
#define BODY(x) \
        ldq    $3,  -x($16)    ;\
        stq    $3,  -x($20)

	# unrolled loop handling 32 digits per pass
sn_cpdnloop:
	BODY(8);   BODY(16);  BODY(24);  BODY(32)
	BODY(40);  BODY(48);  BODY(56);  BODY(64)
	BODY(72);  BODY(80);  BODY(88);  BODY(96)
	BODY(104); BODY(112); BODY(120); BODY(128)
	BODY(136); BODY(144); BODY(152); BODY(160)
	BODY(168); BODY(176); BODY(184); BODY(192)
	BODY(200); BODY(208); BODY(216); BODY(224)
	BODY(232); BODY(240); BODY(248); BODY(256)
#undef BODY

	# count 32 digits as done, move both pointers down by 32*8 bytes,
	# loop while r2 is still negative, then return through r27
	lda    $2,   32($2)
	lda    $16,  -256($16)
	lda    $20,  -256($20)
	blt    $2,   sn_cpdnloop
	ret    $31,  ($27),1

	.end   sn_cpdnloop

                           # +--------------------+
                           # |   Unrolled shift   |
                           # +--------------------+

   # Unrolled left shift of a digit vector, by increasing addresses.
   #
   # entry, possibly in the middle of the loop (at the start of any BODY):
   #   r0  = incoming carry (bits to insert at the bottom of the first digit)
   #   r2  = -(operand length in digits); the caller in this file passes it
   #         rounded to a multiple of 32, namely -32*ceil(length/32)
   #   r7  = 64 - shift count
   #   r8  = shift count
   #   r16 = source address, moved back so that the first digit to shift
   #         sits at the offset used by the BODY where the loop is entered
   #   r20 = destination address, adjusted the same way
   #   r27 = return address
   #
   # exit:
   #   (r20) <- (r16) << r8, processed by increasing addresses
   #   r0 <- outgoing carry, that is (last digit read) >> r7
   #
   # registers modified:
   #   r1  <- undefined (scratch, last digit read)
   #   r2  <- increased by 32 per pass until it is no longer negative
   #   r3  <- undefined (scratch)
   #   r16 <- next address for the source
   #   r20 <- next address for the destination

        .align 5
        .globl sn_shuploop
        .ent   sn_shuploop
        .frame $30,0,$27,0
        .prologue 0

        # loop body to unroll: 5 instructions, to be entered at the first one
        #   r1 <- digit at byte offset x of the source
        #   r3 <- r1 << r8
        #   r0 <- r0 | r3, stored at byte offset x of the destination
        #   r0 <- r1 >> r7, the carry passed on to the next digit
#define BODY(x) \
        ldq    $1,   x($16)    ;\
        sll    $1,   $8,   $3  ;\
        bis    $0,   $3,   $0  ;\
        stq    $0,   x($20)    ;\
        srl    $1,   $7,   $0

	# unrolled loop handling 32 digits per pass
sn_shuploop:
        BODY(0);   BODY(8);   BODY(16);  BODY(24)
        BODY(32);  BODY(40);  BODY(48);  BODY(56)
        BODY(64);  BODY(72);  BODY(80);  BODY(88)
        BODY(96);  BODY(104); BODY(112); BODY(120)
        BODY(128); BODY(136); BODY(144); BODY(152)
        BODY(160); BODY(168); BODY(176); BODY(184)
        BODY(192); BODY(200); BODY(208); BODY(216)
        BODY(224); BODY(232); BODY(240); BODY(248)
#undef BODY

	# count 32 digits as done, advance both pointers by 32*8 bytes,
	# loop while r2 is still negative, then return through r27
	lda    $2,   32($2)
	lda    $16,  256($16)
	lda    $20,  256($20)
	blt    $2,   sn_shuploop
	ret    $31,  ($27),1

	.end   sn_shuploop

   # Unrolled right shift of a digit vector, by decreasing addresses.
   #
   # entry, possibly in the middle of the loop (at the start of any BODY):
   #   r0  = incoming carry (bits to insert at the top of the first digit)
   #   r2  = -(operand length in digits); the caller in this file passes it
   #         rounded to a multiple of 32, namely -32*ceil(length/32)
   #   r7  = 64 - shift count
   #   r8  = shift count
   #   r16 = source address, moved forward so that the first (highest) digit
   #         to shift sits at the negative offset used by the BODY where the
   #         loop is entered
   #   r20 = destination address, adjusted the same way
   #   r27 = return address
   #
   # exit:
   #   (r20) <- (r16) >> r8, processed by decreasing addresses
   #   r0 <- outgoing carry, left-justified: (last digit read) << r7
   #         (sn_shift_down shifts it right by r7 after the loop returns)
   #
   # registers modified:
   #   r1  <- undefined (scratch, last digit read)
   #   r2  <- increased by 32 per pass until it is no longer negative
   #   r3  <- undefined (scratch)
   #   r16 <- decreased by 256 bytes per pass
   #   r20 <- decreased by 256 bytes per pass

        .align 5
        .globl sn_shdnloop
        .ent   sn_shdnloop
        .frame $30,0,$27,0
        .prologue 0

        # loop body to unroll: 5 instructions, to be entered at the first one
        #   r1 <- digit at byte offset -x of the source
        #   r3 <- r1 >> r8
        #   r0 <- r0 | r3, stored at byte offset -x of the destination
        #   r0 <- r1 << r7, the carry passed on to the next lower digit
#define BODY(x) \
        ldq    $1,  -x($16)    ;\
        srl    $1,   $8,   $3  ;\
        bis    $0,   $3,   $0  ;\
        stq    $0,  -x($20)    ;\
        sll    $1,   $7,   $0

	# unrolled loop handling 32 digits per pass
sn_shdnloop:
	BODY(8);   BODY(16);  BODY(24);  BODY(32)
	BODY(40);  BODY(48);  BODY(56);  BODY(64)
	BODY(72);  BODY(80);  BODY(88);  BODY(96)
	BODY(104); BODY(112); BODY(120); BODY(128)
	BODY(136); BODY(144); BODY(152); BODY(160)
	BODY(168); BODY(176); BODY(184); BODY(192)
	BODY(200); BODY(208); BODY(216); BODY(224)
	BODY(232); BODY(240); BODY(248); BODY(256)
#undef BODY

	# count 32 digits as done, move both pointers down by 32*8 bytes,
	# loop while r2 is still negative, then return through r27
	lda    $2,   32($2)
	lda    $16,  -256($16)
	lda    $20,  -256($20)
	blt    $2,   sn_shdnloop
	ret    $31,  ($27),1

	.end   sn_shdnloop

                  # +-------------------------------------+
                  # |    Shift by increasing addresses    |
                  # +-------------------------------------+


   # chiffre xn(shift_up)(chiffre *a, long la, chiffre *b, int k)
   #
   # Left shift of a natural number by k bits.
   #
   # input:
   #   a  ($16) = natural number of length la > 0
   #   la ($17) = length in digits
   #   b  ($18) = natural number of length la, may coincide with a
   #   k  ($19) = integer such that 0 <= k < HW
   #              (HW is presumably the digit width in bits, 64 here since
   #              the complementary shift count is computed as 64-k)
   #
   # output:
   #   b <- a << k
   #   returns the k high-order bits of a (0 when k = 0)
   #
   # method:
   #   tail-jumps into sn_shuploop (k nonzero) or sn_cpuploop (k = 0) at the
   #   BODY number (-la) mod 32, so that exactly la digits are processed;
   #   the loop then returns straight to the caller through r27.

#ifdef assembly_sn_shift_up
#define L(x) .Lsn_shift_up_##x
#define _src_ $16
#define _len_ $17
#define _dst_ $18
#define _cnt_ $19
#define _out_ $20

        .align 5
        .globl sn_shift_up
        .ent   sn_shift_up
sn_shift_up:
        .frame $30,0,$26,0
        .prologue 1
        ldgp   $gp,  0($27)

        # setup shared by both paths
        clr    $0                       # r0 <- 0 (incoming carry)
        mov    $26,  $27                # r27 <- return address for the loop
        negq   _len_, $2                # r2 <- -la
        and    $2,   31,   $3           # r3 <- (-la) mod 32 = BODY to enter
        bic    $2,   31,   $2           # r2 <- -32*ceil(la/32)
        s8addq $3,   $31,  $4           # r4 <- 8*r3 = bytes to step back
        subq   _src_, $4,  _src_        # frame source on the previous block
        subq   _dst_, $4,  _out_        # same for the destination, into r20
        beq    _cnt_, 1f                # k = 0: plain copy

        # shift: each BODY of sn_shuploop is 5 instructions = 20 bytes
        mov    _cnt_, $8                # r8 <- k
        mov    64,   $7
        subq   $7,   _cnt_, $7          # r7 <- 64-k
        lda    $1,   sn_shuploop
        s4addq $3,   $3,   $3           # r3 <- 5*r3
        s4addq $3,   $1,   $1           # r1 <- sn_shuploop + 20*BODY number
        jmp    $31,  ($1)

        # copy: each BODY of sn_cpuploop is 2 instructions = 8 bytes
        .align 5
1:
        lda    $1,   sn_cpuploop
        s8addq $3,   $1,   $1           # r1 <- sn_cpuploop + 8*BODY number
        jmp    $31,  ($1)

        .end sn_shift_up

#undef L
#undef _src_
#undef _len_
#undef _dst_
#undef _cnt_
#undef _out_
#endif /* assembly_sn_shift_up */

                 # +---------------------------------------+
                 # |     Shift by decreasing addresses     |
                 # +---------------------------------------+
        
   # chiffre xn(shift_down)(chiffre *a, long la, chiffre *b, int k)
   #
   # Right shift of a natural number by k bits.
   #
   # input:
   #   a  ($16) = natural number of length la > 0
   #   la ($17) = length in digits
   #   b  ($18) = natural number of length la, may coincide with a
   #   k  ($19) = integer such that 0 <= k < HW
   #              (HW is presumably the digit width in bits, 64 here since
   #              the complementary shift count is computed as 64-k)
   #
   # output:
   #   b <- a >> k
   #   returns a mod 2^k (0 when k = 0)
   #
   # method:
   #   enters sn_shdnloop (k nonzero) or sn_cpdnloop (k = 0) at the BODY
   #   number (-la) mod 32, so that exactly la digits are processed.  The
   #   shift loop is called, because its carry must still be right-justified
   #   afterwards; the copy loop is tail-jumped and returns to the caller.

#ifdef assembly_sn_shift_down
#define L(x) .Lsn_shift_down_##x
#define _src_ $16
#define _len_ $17
#define _dst_ $18
#define _cnt_ $19
#define _out_ $20

        .align 5
        .globl sn_shift_down
        .ent   sn_shift_down
sn_shift_down:
        .frame $30,0,$26,0
        .prologue 1
        ldgp   $gp,  0($27)

        # setup shared by both paths
        clr    $0                       # r0 <- 0 (incoming carry)
        negq   _len_, $2                # r2 <- -la
        and    $2,   31,   $3           # r3 <- (-la) mod 32 = BODY to enter
        bic    $2,   31,   $2           # r2 <- -32*ceil(la/32)
        s8addq $2,   $31,  $4           # r4 <- 8*r2 = -(bytes to step forward)
        subq   _src_, $4,  _src_        # frame source on the next block
        subq   _dst_, $4,  _out_        # same for the destination, into r20
        beq    _cnt_, 1f                # k = 0: plain copy

        # shift: each BODY of sn_shdnloop is 5 instructions = 20 bytes
        mov    _cnt_, $8                # r8 <- k
        mov    64,   $7
        subq   $7,   _cnt_, $7          # r7 <- 64-k
        lda    $1,   sn_shdnloop
        s4addq $3,   $3,   $3           # r3 <- 5*r3
        s4addq $3,   $1,   $1           # r1 <- sn_shdnloop + 20*BODY number
        jsr    $27,  ($1)               # loop returns here through r27
        srl    $0,   $7,   $0           # right-justify the carry: a mod 2^k
        ret    $31,  ($26),1

        # copy: each BODY of sn_cpdnloop is 2 instructions = 8 bytes
        .align 5
1:
        mov    $26,  $27                # r27 <- return address for the loop
        lda    $1,   sn_cpdnloop
        s8addq $3,   $1,   $1           # r1 <- sn_cpdnloop + 8*BODY number
        jmp    $31,  ($1)

        .end sn_shift_down

#undef L
#undef _src_
#undef _len_
#undef _dst_
#undef _cnt_
#undef _out_
#endif /* assembly_sn_shift_down */

